import logging
import multiprocessing
from functools import partial

from gensim import utils
from gensim.corpora.wikicorpus import IGNORED_NAMESPACES
from gensim.scripts.segment_wiki import extract_page_xmls, segment

logger = logging.getLogger(__name__)


# NOTE: get_texts() relies on the module-level names `processes`, `pool` and
# `process_article`; one possible setup is sketched after the function.
def get_texts(texts):
    articles, articles_all = 0, 0
    positions, positions_all = 0, 0
    try:
        # process the corpus in smaller chunks of docs, because multiprocessing.Pool
        # is dumb and would load the entire input into RAM at once...
        for group in utils.chunkize(texts, chunksize=10 * processes, maxsize=1):
            for title, text, pageid in pool.imap(process_article, group):
                articles_all += 1
                positions_all += len(text)  # count positions before pruning, for the summary log
                # article redirects and short stubs are pruned here
                if any(title.startswith(ignore + ':') for ignore in IGNORED_NAMESPACES):
                    continue
                if len(text) < 500:
                    continue
                articles += 1
                positions += len(text)
                yield text
    except KeyboardInterrupt:
        logger.warning(
            "user terminated iteration over Wikipedia corpus after %i documents with %i positions "
            "(total %i articles, %i positions before pruning articles shorter than %i words)",
            articles, positions, articles_all, positions_all, 500)
    else:
        logger.info(
            "finished iterating over Wikipedia corpus of %i documents with %i positions "
            "(total %i articles, %i positions before pruning articles shorter than %i words)",
            articles, positions, articles_all, positions_all, 500)
    finally:
        pool.terminate()
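
# A minimal, hypothetical driver for get_texts() above. The module-level
# `processes`, `pool` and `process_article` names it relies on are not defined
# in this excerpt, so one plausible setup is sketched here: extract_pages()
# and filter_wiki() are real gensim helpers, while process_article() and the
# dump filename are assumptions for illustration only.
import bz2

from gensim.corpora.wikicorpus import extract_pages, filter_wiki

processes = max(1, multiprocessing.cpu_count() - 1)


def process_article(page):
    """Hypothetical worker: strip wiki markup from one (title, text, pageid) tuple."""
    title, text, pageid = page
    return title, filter_wiki(text), pageid


pool = multiprocessing.Pool(processes)

if __name__ == '__main__':
    with bz2.open('enwiki-latest-pages-articles.xml.bz2', 'rb') as dump:
        # namespace '0' keeps only main-namespace articles
        for text in get_texts(extract_pages(dump, filter_namespaces=('0',))):
            pass  # hand `text` to downstream processing here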
def get_texts_with_sections(self):
    """Iterate over the dump, returning titles and text versions of all sections of articles.

    Notes
    -----
    Only articles of sufficient length are returned (short articles & redirects
    etc are ignored).

    Note that this iterates over the **texts**; if you want vectors, just use
    the standard corpus interface instead of this function:

    .. sourcecode:: pycon

        >>> for vec in wiki_corpus:
        >>>     print(vec)

    Yields
    ------
    (str, list of (str, str), list of (str, str))
        Structure contains (title, [(section_heading, section_content), ...],
        (Optionally) [(interlink_article, interlink_text), ...]).

    """
    skipped_namespace, skipped_length, skipped_redirect = 0, 0, 0
    total_articles, total_sections = 0, 0
    page_xmls = extract_page_xmls(self.fileobj)
    pool = multiprocessing.Pool(self.processes)
    # process the corpus in smaller chunks of docs, because multiprocessing.Pool
    # is dumb and would load the entire input into RAM at once...
    for group in utils.chunkize(page_xmls, chunksize=10 * self.processes, maxsize=1):
        for article in pool.imap(partial(segment, include_interlinks=self.include_interlinks), group):
            article_title, sections = article[0], article[1]

            # article redirects are pruned here
            if any(article_title.startswith(ignore + ':') for ignore in IGNORED_NAMESPACES):
                # filter non-articles in ignored namespaces
                skipped_namespace += 1
                continue
            if not sections or sections[0][1].lstrip().lower().startswith("#redirect"):
                # filter redirects
                skipped_redirect += 1
                continue
            if sum(len(body.strip()) for (_, body) in sections) < self.min_article_character:
                # filter stubs (incomplete, very short articles)
                skipped_length += 1
                continue
            total_articles += 1
            total_sections += len(sections)

            if self.include_interlinks:
                interlinks = article[2]
                yield (article_title, sections, interlinks)
            else:
                yield (article_title, sections)
    logger.info(
        "finished processing %i articles with %i sections "
        "(skipped %i redirects, %i stubs, %i ignored namespaces)",
        total_articles, total_sections, skipped_redirect, skipped_length, skipped_namespace)
    pool.terminate()
    self.length = total_articles  # cache corpus length
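
# The method above is normally reached through gensim's public helper rather
# than called directly. A minimal usage sketch, assuming a gensim version that
# exposes segment_all_articles() in gensim.scripts.segment_wiki; the helper
# name below and the dump filename are illustrative assumptions. The same
# pipeline also ships as a CLI:
#   python -m gensim.scripts.segment_wiki -f <dump.xml.bz2> -o <out.json.gz>
from gensim.scripts.segment_wiki import segment_all_articles


def print_first_article(dump_path='enwiki-latest-pages-articles.xml.bz2'):
    """Hypothetical helper: show the title, lead heading and interlink count of one article."""
    for title, sections, interlinks in segment_all_articles(
            dump_path, min_article_character=200, include_interlinks=True):
        heading, body = sections[0]  # the article's lead section
        print(title, heading, len(interlinks))
        break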
# A simpler variant of get_texts_with_sections(), without interlink
# extraction or skip statistics:
def get_texts_with_sections(self):
    """Iterate over the dump, returning titles and text versions of all sections of articles.

    Notes
    -----
    Only articles of sufficient length are returned (short articles & redirects
    etc are ignored).

    Note that this iterates over the **texts**; if you want vectors, just use
    the standard corpus interface instead of this function::

        >>> for vec in wiki_corpus:
        >>>     print(vec)

    Yields
    ------
    (str, list of (str, str))
        Structure contains (title, [(section_heading, section_content), ...]).

    """
    articles = 0
    page_xmls = extract_page_xmls(self.fileobj)
    pool = multiprocessing.Pool(self.processes)
    # process the corpus in smaller chunks of docs, because multiprocessing.Pool
    # is dumb and would load the entire input into RAM at once...
    for group in utils.chunkize(page_xmls, chunksize=10 * self.processes, maxsize=1):
        for article_title, sections in pool.imap(segment, group):
            # article redirects are pruned here
            if any(article_title.startswith(ignore + ':') for ignore in IGNORED_NAMESPACES):
                # filter non-articles in ignored namespaces
                continue
            if not sections or sections[0][1].lstrip().lower().startswith("#redirect"):
                # filter redirects
                continue
            if sum(len(body.strip()) for (_, body) in sections) < self.min_article_character:
                # filter very short articles (trash)
                continue
            articles += 1
            yield (article_title, sections)
    pool.terminate()
    self.length = articles  # cache corpus length
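
# A small illustrative helper (not part of the original module) showing how
# the (title, sections) pairs yielded above might be flattened back into one
# plain-text document per article:
def sections_to_plaintext(title, sections):
    """Join a (title, [(heading, body), ...]) pair into a single text blob."""
    parts = [title]
    for heading, body in sections:
        parts.append(heading)
        parts.append(body)
    return '\n\n'.join(parts)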