import sys
import json
from os import path
from gensim.corpora.wikicorpus import WikiCorpus

base_dir = path.join(path.dirname(path.realpath(__file__)), path.pardir)
wiki_filename = 'simplewiki-20171103-pages-articles-multistream.xml.bz2'
wiki_path = path.join(base_dir, 'corpora', wiki_filename)
outname = path.join(base_dir, 'corpora', 'simplewikiselect')

index = []  # Save information about articles as they've been processed.

wiki = WikiCorpus(wiki_path, dictionary=True)  # Any non-None `dictionary` skips the slow vocabulary build
wiki.metadata = True  # Have get_texts() return article titles as well
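# With metadata enabled, get_texts() yields (tokens, (page_id, title))
# pairs instead of bare token lists.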
print("Loading Wikipedia archive (this may take a few minutes)... ", end="")
articles = list(wiki.get_texts())
print("Done.")

num_articles = len(articles)

print("Total Number of Articles:", num_articles)

MAX_WC = 20_000_000      # Total word-count budget across selected articles
ARTICLE_MIN_WC = 200     # Skip stub articles shorter than this
ARTICLE_MAX_WC = 10_000  # Skip unusually long articles

ac = 0         # Number of articles accepted so far
wc = 0         # Running word count over accepted articles
selected = []  # Titles of accepted articles

with open(outname + ".txt", "w") as f:
    # The original listing is truncated here; the loop below is a plausible
    # completion based on the variables defined above (the index field names
    # are assumptions).
    for tokens, (page_id, title) in articles:
        art_wc = len(tokens)
        if not ARTICLE_MIN_WC <= art_wc <= ARTICLE_MAX_WC:
            continue  # Skip stubs and very long articles
        begin = f.tell()
        f.write(" ".join(tokens) + "\n")
        index.append({"title": title, "id": page_id,
                      "begin": begin, "length": art_wc})
        selected.append(title)
        ac += 1
        wc += art_wc
        if wc >= MAX_WC:
            break  # Stop once the word-count budget is exhausted

print("Selected", ac, "articles totaling", wc, "words.")
with open(outname + ".index.json", "w") as f:
    json.dump(index, f)
Example #2
import sys

from gensim.corpora import Dictionary
from gensim.corpora.wikicorpus import WikiCorpus
from gensim.models import TfidfModel

if __name__ == '__main__':
    if len(sys.argv) < 4:
        # The top of this example is truncated; the usage message below is a
        # reconstruction (exact wording is an assumption).
        print(
            'Usage: %s <wiki_dump> <dictionary_file> <output_matrix> '
            '[vector_format]' % sys.argv[0]
        )
        print('outputs a bag-of-words representation.')
    else:
        dump_fname = sys.argv[1]
        dict_fname = sys.argv[2]
        mm_fname = sys.argv[3]
        vector_format = sys.argv[4] if len(sys.argv) >= 5 else 'tfidf'

        freq_dict = Dictionary.load(dict_fname)
        wiki_corpus = WikiCorpus(dump_fname, dictionary=freq_dict)
        tfidf = TfidfModel(dictionary=freq_dict)
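        # Building the TfidfModel from the Dictionary alone uses its stored
        # document frequencies, so no extra pass over the corpus is needed.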

        # Metadata doesn't normally stay with a document when it's
        # transformed into tf-idf values, so we have to carry it through
        # ourselves.

        wiki_corpus.metadata = True
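        # With .metadata set, iterating wiki_corpus yields
        # (bag_of_words, (page_id, title)) pairs, but the tf-idf transform
        # expects bare bag-of-words documents. The two wrapper classes below
        # strip the metadata on the way in and re-attach it on the way out.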
        metadata_queue = []

        class MetadataRemovedCorpus:
            def __init__(self, corpus):
                self.corpus = corpus

            def __iter__(self):
                for doc, metadata in self.corpus:
                    metadata_queue.append(metadata)
                    yield doc

        tfidf_corpus = tfidf[MetadataRemovedCorpus(wiki_corpus)]

        class MetadataAddedCorpus:
            def __init__(self, corpus):
                self.corpus = corpus

            def __iter__(self):
                # Mirror of MetadataRemovedCorpus: re-attach the buffered
                # metadata to each transformed document. (The class body is
                # truncated in the original; this completion is an assumption.)
                for doc in self.corpus:
                    yield doc, metadata_queue.pop(0)
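
        # The listing ends above. A sketch of how it might conclude, assuming
        # the transformed corpus is serialized in MatrixMarket format (this
        # final step is an assumption, not part of the original example):
        from gensim.corpora import MmCorpus

        if vector_format == 'tfidf':
            # Re-attach titles to the tf-idf vectors before writing them out;
            # metadata=True stores the (page_id, title) pairs alongside.
            MmCorpus.serialize(mm_fname, MetadataAddedCorpus(tfidf_corpus),
                               metadata=True)
        else:
            # Plain bag-of-words: wiki_corpus already yields (doc, metadata).
            MmCorpus.serialize(mm_fname, wiki_corpus, metadata=True)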