def hashdictionary_corpus(dataframe, id_range=32000):
    """Return a HashDictionary mapping words to ids.

    Precomputed HashDictionaries are read from file if previously cached,
    or generated then cached otherwise.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        The DataFrame containing the documents to process.
    id_range : int
        The maximum number of ids available.

    Returns
    -------
    gensim.corpora.HashDictionary
        HashDictionary mapping words to ids.
    """
    filename = 'caches/models/dictionary_{}.model'.format(id_range)
    if os.path.isfile(filename):
        dictionary = HashDictionary.load(filename)
    else:
        # Ensure the cache directory exists; save() would otherwise raise
        # FileNotFoundError on a fresh checkout with no caches/ tree.
        os.makedirs(os.path.dirname(filename), exist_ok=True)
        corpus = text_corpus(dataframe)
        dictionary = HashDictionary(corpus, id_range=id_range)
        dictionary.save(filename)
    return dictionary
tokens = list(filter(None, tokens)) return tokens class Corpus(object): def __iter__(self): for file in glob.glob("*.txt"): print(file) paper = Path(file).read_text(encoding='utf8') yield paper corpus_memory_friendly = Corpus() papers = list(corpus_memory_friendly) texts = [list(preprocess(t)) for t in papers] # define the dictionary: dictionary = Dictionary(texts) dictionary.save('reasoning_corpura.dict') corpus = [dictionary.doc2bow(text) for text in texts] MmCorpus.serialize('reasoning_bow.mm', corpus) hash_dictionary = HashDictionary(texts ) hash_dictionary.filter_extremes(no_below=20, no_above=0.1, keep_n=DEFAULT_DICT_SIZE) hash_dictionary.save_as_text('reasoning_wordids.txt.bz2') hash_dictionary.save('reasoning_corpura_hash.dict')
Download it from: https://www.kaggle.com/benhamner/nips-papers/downloads/papers.csv/2 """ papers = pd.read_csv('papers.csv') corpus = list(papers['paper_text']) print("corpus size: ", len(corpus)) # ToDo: check performance with lemmatization: gensim.utils.lemmatize tokenized_corpus = [[ utils.to_unicode(token) for token in utils.tokenize(corpus_item, lower=True, errors='ignore') ] for corpus_item in corpus] hash_dictionary = HashDictionary(tokenized_corpus) bow_corpus = [hash_dictionary.doc2bow(text) for text in corpus] MmCorpus.serialize('nips_bow.mm', bow_corpus, progress_cnt=10000) hash_dictionary.filter_extremes(no_below=20, no_above=0.1, keep_n=DEFAULT_DICT_SIZE) hash_dictionary.save_as_text('nips_wordids.txt.bz2') hash_dictionary.save('nips_corpura_hash.dict') dictionary = Dictionary(tokenized_corpus) dictionary.save('nips_corpura.dict')