from gensim.corpora import HashDictionary
from gensim.models import TfidfModel
from gensim.utils import tokenize


def apply_tfidf(reviews_text: list) -> dict:
    """Accumulate each word's TF-IDF weight across all reviews."""
    dictionary = HashDictionary()
    review_tokens = []
    for result in reviews_text:
        # Tokenize the reviews (materialize the generator so the tokens
        # survive both the dictionary build and the BOW conversion)
        review_tokens.append(list(tokenize(result)))
    # Build the dictionary
    dictionary.add_documents(review_tokens)
    # Convert to vector corpus
    vectors = [dictionary.doc2bow(tokens) for tokens in review_tokens]
    # Build TF-IDF model
    tfidf = TfidfModel(vectors)
    # Get TF-IDF weights
    weights = tfidf[vectors]
    # Map ids back to terms and accumulate their weights
    freq = {}
    for doc in weights:
        for word_id, weight in doc:
            # A HashDictionary id maps to the *set* of words that hash to it
            for word in dictionary[word_id]:
                freq[word] = freq.get(word, 0.0) + weight
    return freq
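# A quick usage sketch for apply_tfidf; the two sample reviews below are
# invented purely for illustration.
if __name__ == '__main__':
    sample_reviews = [
        "The battery life is great and the screen is sharp.",
        "Terrible battery, but the screen is great.",
    ]
    weights_by_word = apply_tfidf(sample_reviews)
    # Print the terms with the highest accumulated TF-IDF weight
    top_terms = sorted(weights_by_word.items(), key=lambda kv: kv[1], reverse=True)
    for word, weight in top_terms[:5]:
        print(word, round(weight, 3))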
Download it from: https://www.kaggle.com/benhamner/nips-papers/downloads/papers.csv/2
"""
import pandas as pd

from gensim import utils
from gensim.corpora import Dictionary, HashDictionary, MmCorpus

# Same cap gensim's make_wikicorpus script uses for its dictionary size
DEFAULT_DICT_SIZE = 100000

papers = pd.read_csv('papers.csv')
corpus = list(papers['paper_text'])
print("corpus size: ", len(corpus))

# ToDo: check performance with lemmatization: gensim.utils.lemmatize
tokenized_corpus = [
    [
        utils.to_unicode(token)
        for token in utils.tokenize(corpus_item, lower=True, errors='ignore')
    ]
    for corpus_item in corpus
]

hash_dictionary = HashDictionary(tokenized_corpus)
# doc2bow expects token lists, not the raw paper strings
bow_corpus = [hash_dictionary.doc2bow(tokens) for tokens in tokenized_corpus]
MmCorpus.serialize('nips_bow.mm', bow_corpus, progress_cnt=10000)

hash_dictionary.filter_extremes(no_below=20, no_above=0.1, keep_n=DEFAULT_DICT_SIZE)
hash_dictionary.save_as_text('nips_wordids.txt.bz2')
hash_dictionary.save('nips_corpura_hash.dict')

dictionary = Dictionary(tokenized_corpus)
dictionary.save('nips_corpura.dict')
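# Hypothetical follow-up, assuming the files above were written successfully:
# reload the streamed BOW corpus and the hash dictionary, fit a TF-IDF model,
# and inspect the first paper's top-weighted ids.
from gensim.models import TfidfModel

bow = MmCorpus('nips_bow.mm')  # random-access, disk-backed corpus
loaded_hash_dictionary = HashDictionary.load('nips_corpura_hash.dict')
tfidf = TfidfModel(bow)
first_doc = tfidf[bow[0]]
# HashDictionary maps each id back to the set of words that hashed to it
print([(loaded_hash_dictionary[word_id], round(weight, 3))
       for word_id, weight in first_doc[:10]])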