from datetime import datetime

from dateutil import parser
from dateutil.relativedelta import relativedelta


def get_dxp(start, end, news_url, portables_url):
    """Calculate the time until the start or end of a Double XP Weekend."""
    # All comparisons use naive UTC datetimes.
    utc_time = datetime.utcnow()
    start_date = parser.parse(start)
    end_date = parser.parse(end)
    if utc_time < start_date:
        delta = relativedelta(start_date, utc_time)
        if delta.days >= 1:
            return ('1. [DXP Weekend starts in: **%(days)d day, %(hours)d hours**]('
                    % delta.__dict__ + news_url + '#dxp) \n \n'
                    '2. [Portables & Boxes FC Information](' + portables_url + ') \n')
        return ('1. [DXP Weekend starts in: **%(hours)d hours**]('
                % delta.__dict__ + news_url + '#dxp) \n \n'
                '2. [Portables & Boxes FC Information](' + portables_url + ') \n')
    elif utc_time > end_date:
        return '1. DXP Weekend has ended.'
    else:
        delta = relativedelta(end_date, utc_time)
        if delta.days > 1:
            return ('1. [DXP Weekend is LIVE: **%(days)d days, %(hours)d hours to go**]('
                    % delta.__dict__ + news_url + '#dxp) \n \n'
                    '2. [Portables & Boxes FC Information](' + portables_url + ') \n')
        elif delta.days == 1:
            return ('1. [DXP Weekend is LIVE: **%(days)d day, %(hours)d hours to go**]('
                    % delta.__dict__ + news_url + '#dxp) \n \n'
                    '2. [Portables & Boxes FC Information](' + portables_url + ') \n')
        return ('1. [DXP Weekend is LIVE: **%(hours)d hours to go**]('
                % delta.__dict__ + news_url + '#dxp) \n \n'
                '2. [Portables & Boxes FC Information](' + portables_url + ') \n')
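# A hypothetical usage sketch for get_dxp; the dates and URLs below are
# made-up placeholders, not values from the original project.
print(get_dxp('2021-02-19 12:00', '2021-03-01 12:00',
              'https://example.com/news', 'https://example.com/portables'))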
import codecs
import os


def add_document(self, writer, file_path, config):
    # Flatten the path into a space-separated string so every path component
    # becomes a searchable term.
    file_name = str(file_path.replace(".", " ").replace("/", " ")
                    .replace("\\", " ").replace("_", " ").replace("-", " "))
    # Read the file content.
    with codecs.open(file_path, 'r', encoding='utf-8') as f:
        content = f.read()
    path = str(file_path)
    # Parse the markdown fields (MarkdownParser is defined elsewhere in the
    # project).
    parser = MarkdownParser()
    parser.parse(content, config)
    modtime = os.path.getmtime(path)
    print("adding to index: path: %s size:%d tags:'%s' headlines:'%s' modtime=%d"
          % (path, len(content), parser.tags, parser.headlines, modtime))
    writer.add_document(path=path,
                        filename=file_name,
                        headlines=parser.headlines,
                        tags=parser.tags,
                        content=content,
                        doubleemphasiswords=parser.doubleemphasiswords,
                        emphasiswords=parser.emphasiswords,
                        time=modtime)
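# A hypothetical schema sketch: the keyword arguments passed to
# writer.add_document above match Whoosh's IndexWriter API, so a schema along
# these lines (the field types are assumptions) would accept such documents.
from whoosh.fields import ID, NUMERIC, TEXT, Schema

schema = Schema(path=ID(stored=True, unique=True), filename=TEXT,
                headlines=TEXT, tags=TEXT, content=TEXT,
                doubleemphasiswords=TEXT, emphasiswords=TEXT,
                time=NUMERIC(stored=True))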
import os


def analyse(rootdir):
    files = os.listdir(rootdir)  # renamed from `list` to avoid shadowing the built-in
    new = True
    for filename in files:
        if not filename.endswith('.xml'):
            continue
        # The first XML file is fed to parser_new; every later file goes to
        # parser (both are module-level parser objects defined elsewhere).
        if new:
            parser_new.parse(os.path.join(rootdir, filename))
            new = False
        else:
            parser.parse(os.path.join(rootdir, filename))
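# A hypothetical sketch of the module-level `parser_new` and `parser` objects
# the function above relies on, assuming they are xml.sax parsers; the
# handler classes named here are made up.
import xml.sax

parser_new = xml.sax.make_parser()
parser_new.setContentHandler(NewFormatHandler())  # hypothetical handler
parser = xml.sax.make_parser()
parser.setContentHandler(LegacyFormatHandler())   # hypothetical handler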
from dateutil import parser


def select_date(tree, element):
    try:
        text = tree.select(element)[0].get_text().replace('\"', '')
        year = parser.parse(text).year
    except Exception as err:
        # Fall back to an empty string when the element is missing or its
        # text cannot be parsed as a date.
        year = ''
        writelog(' PARSE ERROR', element, err)
    return year
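# A hypothetical usage sketch: `tree.select()` takes a CSS selector, so the
# tree is assumed to be a BeautifulSoup object; the markup and selector are
# made up, and writelog (project-specific) is only needed on the error path.
from bs4 import BeautifulSoup

tree = BeautifulSoup('<span class="release-date">12 May 1999</span>',
                     'html.parser')
year = select_date(tree, 'span.release-date')  # -> 1999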
from nltk import pos_tag, word_tokenize


def extract_np(c, data, parser, mode):
    text = word_tokenize(data)
    sentence = pos_tag(text)
    result = []
    parsed_sentence = parser.parse(sentence)
    # Clearer visuals for debugging:
    # print(parsed_sentence)
    # parsed_sentence.draw()
    for np in clean_np(parsed_sentence, mode):
        result.append(np)
    # Count how often each NP appears in the input data (review + summary).
    c.update(lower_and_lemma(result))
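# A minimal sketch of the clean_np helper used above, assuming parser.parse
# returns a chunked nltk.Tree (e.g. from nltk.RegexpParser) and that clean_np
# yields the words of each NP subtree; `mode` is ignored here because its
# semantics are project-specific.
def clean_np(tree, mode):
    for subtree in tree.subtrees(lambda t: t.label() == "NP"):
        yield " ".join(word for word, tag in subtree.leaves())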
import urllib.parse

import html5lib


def crawl_domain(self, domain, depth, debug=False, limit=None, visited=None):
    """
    Fetches a domain, and then crawls its internal pages until the given depth.
    Returns a dictionary of url -> html code.
    """
    # A mutable default argument (visited=set()) would be shared across
    # top-level calls, so the set is created inside the function instead.
    if visited is None:
        visited = set()
    pages = {}
    base_domain = urllib.parse.urlparse(domain).netloc
    html = self.fetch(domain, debug)
    if html is not None:
        pages[domain] = html
        visited.add(domain)
    else:
        if debug is True:
            print("Impossible to crawl %s" % domain)
        return {}
    if depth > 0 and (limit is None or limit > 0):
        parser = html5lib.HTMLParser(
            tree=html5lib.treebuilders.getTreeBuilder("dom"))
        try:
            dom = parser.parse(html)
        except Exception as e:
            if debug is True:
                print(e)
            return {}
        # html5wrapper is a project-specific helper module.
        links = html5wrapper.extract_doc_links(dom)
        for key in links:
            # We do not want anchors to be crawled.
            if len(key) < 1 or key[0] == '#':
                continue
            url = urllib.parse.urljoin(domain, key)
            # Try to get an eventual file extension, and check its validity.
            ext = url.split(".")[-1].strip().lower()
            if ext in self.badextensions:
                continue
            # Follow only internal links, not outgoing ones.
            if base_domain == urllib.parse.urlparse(url).netloc and \
                    url not in visited and (limit is None or limit > 0):
                visited.add(url)
                pages.update(
                    self.crawl_domain(url, depth - 1, debug, limit, visited))
                if limit is not None:
                    limit -= 1
    return pages
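# Why the visited=None fix above matters: a default such as visited=set() is
# evaluated once at function definition time and then shared by every call,
# so a second top-level crawl would silently skip all URLs seen by the first.
# A minimal, self-contained demonstration of that pitfall:
def buggy(item, seen=set()):
    seen.add(item)
    return seen

print(buggy("a"))  # {'a'}
print(buggy("b"))  # {'a', 'b'}  <- state leaked from the first call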
import commonmark
import commonmark_extensions.plaintext


def replace_images(msg, addr):
    # Parse the raw markdown source into a CommonMark AST.
    parser = commonmark.Parser()
    ast = parser.parse(msg["data"]["raw"])
    # Collect the image URLs from the rendered ("cooked") HTML.
    ripper = html_image_ripper()
    ripper.feed(msg["data"]["cooked"])
    # Rewrite every image node as a plain link, absolutizing relative URLs.
    for cur, entering in ast.walker():
        if cur.t == "image" and entering:
            cur.t = "link"
            dest = ripper.images.pop(0)
            if dest.startswith("/"):
                dest = addr + dest
            cur.destination = dest
    renderer = commonmark_extensions.plaintext.CommonMarkToCommonMarkRenderer()
    return renderer.render(ast)
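# A minimal sketch of the html_image_ripper dependency used above, assuming
# it is an html.parser.HTMLParser subclass that records <img> sources in
# document order (the `images` attribute name comes from the code above).
from html.parser import HTMLParser


class html_image_ripper(HTMLParser):
    def __init__(self):
        super().__init__()
        self.images = []

    def handle_starttag(self, tag, attrs):
        # Record every <img src=...> so replace_images can pop them in order.
        if tag == "img":
            src = dict(attrs).get("src")
            if src is not None:
                self.images.append(src)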
import urllib.error


def work(self):
    while True:
        try:
            url = self._master.getTask()
            if not url:
                break
            page = load(url)
            text, names = parser.parse(page)
            name = extractName(url)
            store(page, HTML_PATH + name, url)
            store(text, TEXT_PATH + name, url)
            self._master.report(url)
            self._master.pushUrls(namesToUrls(names))
        except (urllib.error.URLError, urllib.error.HTTPError) as exception:
            # URLError has no `code` attribute (only its HTTPError subclass
            # does), so fall back instead of raising AttributeError here.
            log.error('Load "%s"... %s %s', url, exception.reason,
                      getattr(exception, 'code', ''))
    log.debug('Shutdown thread "%s"...', self.name)
import os
import tarfile
import urllib.request
from glob import glob

from sklearn.datasets import get_data_home


def stream_reuters_documents(data_path='/Users/newuser/Downloads/reuters21578'):
    """Iterate over documents of the Reuters dataset.

    The Reuters archive will automatically be downloaded and uncompressed if
    the `data_path` directory does not exist.

    Documents are represented as dictionaries with 'body' (str),
    'title' (str), 'topics' (list(str)) keys.
    """
    DOWNLOAD_URL = ('http://archive.ics.uci.edu/ml/machine-learning-databases/'
                    'reuters21578-mld/reuters21578.tar.gz')
    ARCHIVE_FILENAME = 'reuters21578.tar.gz'

    if data_path is None:
        data_path = os.path.join(get_data_home(), "reuters")
    if not os.path.exists(data_path):
        # Download the dataset.
        print("downloading dataset (once and for all) into %s" % data_path)
        os.mkdir(data_path)

        def progress(blocknum, bs, size):
            total_sz_mb = '%.2f MB' % (size / 1e6)
            current_sz_mb = '%.2f MB' % ((blocknum * bs) / 1e6)
            if _not_in_sphinx():
                print('\rdownloaded %s / %s' % (current_sz_mb, total_sz_mb),
                      end='')

        archive_path = os.path.join(data_path, ARCHIVE_FILENAME)
        urllib.request.urlretrieve(DOWNLOAD_URL, filename=archive_path,
                                   reporthook=progress)
        if _not_in_sphinx():
            print('\r', end='')
        print("untarring Reuters dataset...")
        with tarfile.open(archive_path, 'r:gz') as archive:
            archive.extractall(data_path)
        print("done.")

    # ReutersParser and _not_in_sphinx are defined elsewhere in the script.
    parser = ReutersParser()
    for filename in glob(os.path.join(data_path, "*.sgm")):
        for doc in parser.parse(open(filename, 'rb')):
            yield doc
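# A hypothetical usage sketch: iterate the generator and inspect the first
# few documents. It assumes the script's other helpers (ReutersParser,
# get_data_home, _not_in_sphinx) are available, as in the scikit-learn
# out-of-core classification example this function mirrors.
for i, doc in enumerate(stream_reuters_documents()):
    print(doc['title'], doc['topics'])
    if i >= 4:
        break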
import nltk
from textblob_de import TextBlobDE


def nltk_parser(txt):
    myblob = TextBlobDE(txt)
    # The grammar's terminals are POS tags such as 'DT' and 'NN', so the
    # parser runs over the tag sequence rather than the words themselves.
    sent = [x[1] for x in myblob.tags]
    sent_text = [x[0] for x in myblob.tags]
    cfg_grammar = nltk.CFG.fromstring("""
        S -> NP VP | S CC S
        NP -> 'DT' N | 'DT' N PP | 'PRP' | N | 'PRP$'
        VP -> V NP | V NP PP | V ADJP
        ADJP -> 'RB' 'JJ' | 'JJ'
        PP -> P NP
        N -> 'NN' | 'NNP' | 'NNS' | 'FW'
        V -> 'VBN' | 'VB' | 'MD'
        P -> 'IN' | 'TO'
        CC -> 'CC'
        O -> 'RP' | 'WDT' | 'TRUNC' | 'CD'
    """)
    parser = nltk.parse.ChartParser(cfg_grammar)
    for tree in parser.parse(sent):
        print(tree)
        tree.draw()
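# A hypothetical usage sketch; the sentence is made up. Note that ChartParser
# raises a ValueError when the tag sequence contains tags the grammar does
# not cover (punctuation, for instance), so real input may need a broader
# grammar.
nltk_parser("Der Hund sieht die Katze")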
"""Extract syntax highlighting choices out of the paste page.""" def handle_starttag(self, tag, attrs): """The parser enters a tag.""" if tag == 'option': self._current_tag = dict(attrs)['value'] def handle_data(self, data): """The parser processes data inside a tag.""" if self._current_tag is not None: self._resultdict[data] = self._current_tag def handle_endtag(self, tag): """The parser leaves a tag.""" if tag == 'option': self._current_tag = None def parse(self, data): """Main method.""" self._current_tag = None self._resultdict = {} self.feed(data) return self._resultdict parser = PasteOfCodeParser() data = parser.parse(requests.get('http://paste.ofcode.org/').text) json.dump(data, sys.stdout, indent=4) print()
import json
import os

from dateutil import parser
from django.conf import settings
from django.core.management.base import CommandError


def handle(self, *args, **options):
    if not options.get('input_json') or not options.get('app'):
        raise CommandError('Both --input_json and --app arguments are mandatory')

    with open(options.get('input_json'), 'rb') as f:
        groups, articles = json.loads(f.read().decode('UTF-8'))

    fixtures_path = os.path.join(settings.BASE_DIR, options.get('app'), 'fixtures')
    if not os.path.exists(fixtures_path):
        os.makedirs(fixtures_path)

    # fill_groups, all_groups, old_to_new_groups and html are defined on the
    # command class elsewhere.
    self.fill_groups(groups)
    with open(os.path.join(fixtures_path, "categories.json"), "wb") as f:
        f.write(bytes(json.dumps(self.all_groups, ensure_ascii=False, indent=4), 'UTF-8'))
    self.stdout.write('Successfully built categories.json fixture with {0} records'.format(len(self.all_groups)))

    all_articles = []
    article_id = 1
    for article in articles:
        parsedDate = parser.parse(article['properties']['60']['value'],
                                  dayfirst=True, yearfirst=False)
        preview = article['properties']['CML2_PREVIEW_TEXT']['value']
        if preview and len(preview) >= 252:
            preview = preview[:252] + '...'
        all_articles.append({
            'model': 'news.article',
            'pk': article_id,
            'fields': {
                'enabled': True,
                'publish_date': parsedDate.strftime('%Y-%m-%dT%H:%M:%SZ'),
                'slug': article['properties']['CML2_CODE']['value'],
                'title': article['title'],
                'announcement': preview,
                'description': self.html.unescape(article['properties']['CML2_DETAIL_TEXT']['value'] or ''),
                'source': article['properties']['62']['value'],
                'author': article['properties']['64']['value'],
                'translated_by': article['properties']['65']['value'],
                'photo': article['properties']['54']['value'],
                'photo_copyrights': article['properties']['66']['value'],
                'views': 0,
                'order': article['properties']['CML2_SORT']['value'],
                #'tags': article['tags'],
                'categories': [self.old_to_new_groups[gid] for gid in article['groups']]
            },
        })
        article_id += 1

    with open(os.path.join(fixtures_path, "articles.json"), "wb") as f:
        f.write(bytes(json.dumps(all_articles, ensure_ascii=False, indent=4), 'UTF-8'))
    self.stdout.write('Successfully built articles.json fixture with {0} records'.format(len(articles)))
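# A hypothetical invocation sketch for this management command; the command
# name `import_articles` is made up, as are the file and app names:
#
#   python manage.py import_articles --input_json export.json --app news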