"""Select a bounded subset of Simple English Wikipedia articles.

Loads a bz2 Wikipedia dump via gensim's WikiCorpus, pulls every article
text (with titles, via the metadata flag), and prepares word-count limits
for a selection pass whose output file is opened at the end of this chunk.
NOTE(review): this chunk is truncated — the body of the final `with` block
continues beyond the visible source.
"""
import sys
import json
from os import path
from gensim.corpora.wikicorpus import WikiCorpus

# Resolve paths relative to the parent directory of this script's directory.
base_dir = path.join(path.dirname(path.realpath(__file__)), path.pardir)
wiki_filename = 'simplewiki-20171103-pages-articles-multistream.xml.bz2'
wiki_path = path.join(base_dir, 'corpora', wiki_filename)
# Output basename; ".txt" (and presumably other extensions) are appended below.
outname = path.join(base_dir, 'corpora', 'simplewikiselect')

index = []  # Save information about articles as they've been processed.

# Passing dictionary=True is a known gensim trick: supplying a non-empty
# "dictionary" argument skips the expensive vocabulary-building scan.
wiki = WikiCorpus(wiki_path, dictionary=True)  # dict=True avoids making vocab
wiki.metadata = True  # Want article titles

# Materializing every article in memory; for Simple English Wikipedia this
# is feasible, though it may take minutes and significant RAM.
print("Loading Wikipedia archive (this may take a few minutes)... ", end="")
articles = list(wiki.get_texts())
print("Done.")

num_articles = len(articles)
print("Total Number of Articles:", num_articles)

# Selection limits: stop after MAX_WC total words; accept only articles
# whose word count lies in [ARTICLE_MIN_WC, ARTICLE_MAX_WC].
MAX_WC = 20_000_000
ARTICLE_MIN_WC = 200
ARTICLE_MAX_WC = 10000

ac = 0        # running count of accepted articles
wc = 0        # running total word count of accepted articles
selected = []  # presumably indices/records of accepted articles — body not visible

# NOTE(review): the body of this `with` block lies beyond this chunk.
with open(outname + ".txt", "w") as f:
# NOTE(review): this chunk begins mid-statement — the `)` below closes a
# print(...) call (usage message) started before the visible source, inside
# the `if` branch of an argument-count check whose header is also not visible.
    )
    print('outputs a bag-of-words representation.')
else:
    # Command-line arguments: dump path, saved Dictionary path, output
    # MatrixMarket path, and an optional vector format (default 'tfidf').
    dump_fname = sys.argv[1]
    dict_fname = sys.argv[2]
    mm_fname = sys.argv[3]
    vector_format = sys.argv[4] if len(sys.argv) >= 5 else 'tfidf'

    # Dictionary / TfidfModel / WikiCorpus are imported above this chunk
    # (presumably from gensim) — not visible here.
    freq_dict = Dictionary.load(dict_fname)
    wiki_corpus = WikiCorpus(dump_fname, dictionary=freq_dict)
    tfidf = TfidfModel(dictionary=freq_dict)

    # Since metadata doesn't normally stay with a document when it's
    # transformed into tf-idf values, we have to implement it ourselves
    wiki_corpus.metadata = True
    metadata_queue = []  # FIFO of (title, ...) metadata stripped off below

    class MetadataRemovedCorpus:
        """Wrap a metadata-yielding corpus, yielding bare documents.

        Each (doc, metadata) pair from the wrapped corpus has its metadata
        pushed onto the shared metadata_queue so it can be re-attached
        after the tf-idf transformation.
        """

        def __init__(self, corpus):
            self.corpus = corpus

        def __iter__(self):
            for doc, metadata in self.corpus:
                metadata_queue.append(metadata)
                yield doc

    # Applying the model lazily; metadata is captured as iteration proceeds.
    tfidf_corpus = tfidf[MetadataRemovedCorpus(wiki_corpus)]

    # NOTE(review): chunk is truncated here — the body of this class (which
    # presumably re-attaches queued metadata) continues beyond the visible source.
    class MetadataAddedCorpus:
        def __init__(self, corpus):