Example #1
from gensim.corpora import HashDictionary
from gensim.models import TfidfModel
from gensim.utils import tokenize


def apply_tfidf(reviews_text: list):
    dictionary = HashDictionary()
    review_tokens = []
    for result in reviews_text:
        # Tokenize the review; materialize the generator so the tokens
        # survive being consumed once by add_documents() below
        review_tokens.append(list(tokenize(result)))

    # Build the dictionary
    dictionary.add_documents(review_tokens)
    # Convert to vector corpus
    vectors = [dictionary.doc2bow(token) for token in review_tokens]
    # Build TF-IDF model
    tfidf = TfidfModel(vectors)
    # Get TF-IDF weights
    weights = tfidf[vectors]
    # Aggregate TF-IDF weights per word across all documents.
    # HashDictionary can map several words to one hash id, so
    # dictionary[token_id] returns the set of words seen for that id
    # (available because debug mode is on by default).
    freq = dict()
    for doc in weights:
        for token_id, weight in doc:
            for word in dictionary[token_id]:
                if word in freq:
                    freq[word] += weight
                else:
                    freq[word] = weight
    return freq
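
A minimal usage sketch (the review strings below are invented for illustration):

# Hypothetical input: two toy review strings
reviews = [
    "The battery life is great and the screen is sharp.",
    "Terrible battery life, but a great screen.",
]
weights_by_word = apply_tfidf(reviews)
# Print words with the highest accumulated TF-IDF weight first
for word, weight in sorted(weights_by_word.items(), key=lambda kv: -kv[1]):
    print(word, round(weight, 3))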
Example #2
"""Build a bag-of-words corpus and dictionaries from the NIPS papers dataset.

Download it from:

    https://www.kaggle.com/benhamner/nips-papers/downloads/papers.csv/2

"""
import pandas as pd

from gensim import utils
from gensim.corpora import Dictionary, HashDictionary, MmCorpus

# Assumed cap on dictionary size (100k, the value gensim's wiki scripts use)
DEFAULT_DICT_SIZE = 100000

papers = pd.read_csv('papers.csv')
corpus = list(papers['paper_text'])

print("corpus size: ", len(corpus))

# ToDo: check performance with lemmatization: gensim.utils.lemmatize

tokenized_corpus = [[
    utils.to_unicode(token)
    for token in utils.tokenize(corpus_item, lower=True, errors='ignore')
] for corpus_item in corpus]

hash_dictionary = HashDictionary(tokenized_corpus)

# doc2bow expects token lists, not raw strings
bow_corpus = [hash_dictionary.doc2bow(tokens) for tokens in tokenized_corpus]
MmCorpus.serialize('nips_bow.mm', bow_corpus, progress_cnt=10000)

hash_dictionary.filter_extremes(no_below=20,
                                no_above=0.1,
                                keep_n=DEFAULT_DICT_SIZE)
hash_dictionary.save_as_text('nips_wordids.txt.bz2')
hash_dictionary.save('nips_corpura_hash.dict')

dictionary = Dictionary(tokenized_corpus)
dictionary.save('nips_corpura.dict')
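
The saved artifacts can be reloaded later without re-tokenizing. A minimal sketch, assuming the file names written above:

from gensim.corpora import Dictionary, HashDictionary, MmCorpus
from gensim.models import TfidfModel

# Reload the serialized bag-of-words corpus and both dictionaries
bow_corpus = MmCorpus('nips_bow.mm')
hash_dictionary = HashDictionary.load('nips_corpura_hash.dict')
dictionary = Dictionary.load('nips_corpura.dict')

# E.g., fit a TF-IDF model on the reloaded corpus
tfidf = TfidfModel(bow_corpus)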