else:
    keep_words = DEFAULT_DICT_SIZE

if os.path.exists(outp + '_wordids.txt.bz2') and os.path.exists(outp + '_corpus.pkl.bz2'):
    # reuse the dictionary and corpus produced by a previous run
    dictionary = Dictionary.load_from_text(outp + '_wordids.txt.bz2')
    wiki = TextCorpus.load(outp + '_corpus.pkl.bz2')
else:
    wiki = TextCorpus(inp)
    # only keep the most frequent words
    wiki.dictionary.filter_extremes(no_below=20, no_above=0.1, keep_n=keep_words)
    wiki.dictionary.save_as_text(outp + '_wordids.txt.bz2')
    wiki.save(outp + '_corpus.pkl.bz2')
    # load back the id->word mapping directly from file;
    # this seems to save more memory, compared to keeping the wiki.dictionary object from above
    dictionary = Dictionary.load_from_text(outp + '_wordids.txt.bz2')

# build tfidf
if os.path.exists(outp + '_tfidf.mm'):
    mm = MmCorpus(outp + '_tfidf.mm')
else:
    tfidf = TfidfModel(wiki, id2word=dictionary, normalize=True)
    # tfidf.save(outp + '.tfidf_model')
    # save tfidf vectors in matrix market format
    mm = tfidf[wiki]
    MmCorpus.serialize(outp + '_tfidf.mm', mm, progress_cnt=10000)
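
# The block above persists three artifacts: the filtered dictionary
# (outp + '_wordids.txt.bz2'), the pickled corpus (outp + '_corpus.pkl.bz2'),
# and the tf-idf vectors in Matrix Market format (outp + '_tfidf.mm').
# Below is a minimal sketch of how a downstream script might load the
# dictionary and tf-idf matrix back for further processing; 'wiki_en' is a
# hypothetical placeholder and must match the outp prefix used above.
#
# from gensim.corpora import Dictionary, MmCorpus
#
# outp = 'wiki_en'  # hypothetical prefix; use the same one as above
#
# # restore the id -> word mapping from the bzip2-compressed text dump
# dictionary = Dictionary.load_from_text(outp + '_wordids.txt.bz2')
#
# # stream the serialized tf-idf vectors from disk (memory-friendly)
# mm = MmCorpus(outp + '_tfidf.mm')
# print(mm)  # reports document count, vocabulary size and non-zero entries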