/
tf-idf.py
92 lines (71 loc) · 2.72 KB
/
tf-idf.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
import pandas as pd
from lda_util import clean_tsv
from nltk.text import TextCollection
import math
class TF_IDF:
    """Compute corpus-level TF-IDF scores for the words of a review TSV file.

    The file is loaded once on construction. Each ``get_tf_idf_dict*`` method
    cleans one text column with ``clean_tsv``, scores every distinct word and
    writes a two-column (``word``, ``tf-idf``) table to a CSV file.
    """

    def __init__(self, path="data/hair_dryer.tsv"):
        """Load the tab-separated, UTF-8 review file at *path* into ``self.raw_df``."""
        self.raw_df = pd.read_csv(path, sep='\t', encoding='utf-8')

    @staticmethod
    def _tf_idf_scores(documents):
        """Return ``{word: tf-idf}`` for an iterable of token sequences.

        * tf  = occurrences of the word in the whole corpus / total tokens
        * idf = log(number of documents / (documents containing the word + 1))

        Keys appear in order of first occurrence. An empty corpus yields an
        empty dict.

        NOTE: because of the ``+ 1`` in the denominator, a word present in
        every document gets a negative idf, and a word present in all but one
        document gets exactly 0. This is kept as-is so scores stay comparable
        with previously generated files.
        """
        term_count = {}  # word -> occurrences across the whole corpus
        doc_count = {}   # word -> number of documents containing it
        total_docs = 0

        # Single pass over the corpus: collecting the distinct tokens of each
        # document avoids re-scanning every document once per vocabulary word.
        for tokens in documents:
            total_docs += 1
            seen = set()
            for token in tokens:
                term_count[token] = term_count.get(token, 0) + 1
                seen.add(token)
            for token in seen:
                doc_count[token] = doc_count.get(token, 0) + 1

        total_terms = sum(term_count.values())
        return {
            word: (count / total_terms) * math.log(total_docs / (doc_count[word] + 1))
            for word, count in term_count.items()
        }

    def get_tf_idf_dict(
        self,
        column_type="review_body",
        save_path="tf_idf_value/hair_dryer_tf_idf_dict.csv",
    ):
        """Score every word of one review column and write the table to CSV.

        Args:
            column_type: Column of ``self.raw_df`` to analyse, e.g.
                ``"review_body"`` or ``"review_headline"``.
            save_path: Destination CSV path; the file is written with pandas'
                default index column plus ``word`` and ``tf-idf``.

        Returns:
            The DataFrame that was written (columns ``word`` and ``tf-idf``).
        """
        raw_reviews = self.raw_df[column_type].tolist()
        # Presumably one sequence of cleaned tokens per review; confirm
        # against lda_util.clean_tsv.
        documents = clean_tsv(raw_reviews)

        scores = self._tf_idf_scores(documents)
        df = pd.DataFrame({"word": list(scores.keys()), "tf-idf": list(scores.values())})
        df.to_csv(save_path, encoding='utf-8')
        return df

    def get_tf_idf_dict_nltk(
        self,
        column_type="review_body",
        save_path="tf_idf_value/hair_dryer_tf_idf_dict.csv",
    ):
        """NLTK-based variant of :meth:`get_tf_idf_dict`.

        Each distinct word is scored with ``TextCollection.tf_idf``, passing
        the whole collection as the text. The original author reports it as
        very slow and advises against using it.

        Args:
            column_type: Column of ``self.raw_df`` to analyse.
            save_path: Destination CSV path. The default is the same file
                that :meth:`get_tf_idf_dict` writes to by default.

        Returns:
            The DataFrame that was written (columns ``word`` and ``tf-idf``).
        """
        raw_reviews = self.raw_df[column_type].tolist()
        # Presumably one sequence of cleaned tokens per review; confirm
        # against lda_util.clean_tsv.
        documents = clean_tsv(raw_reviews)

        # dict.fromkeys de-duplicates while keeping first-occurrence order, so
        # the row order of the output is the same on every run (a plain set
        # would make it depend on string hashing).
        words = list(dict.fromkeys(token for tokens in documents for token in tokens))

        corpus = TextCollection(documents)
        tf_idf = [corpus.tf_idf(word, corpus) for word in words]

        df = pd.DataFrame({"word": words, "tf-idf": tf_idf})
        df.to_csv(save_path, encoding='utf-8')
        return df
if __name__ == "__main__":
    # Script entry: load the filtered pacifier reviews and write their
    # TF-IDF table (default column) to the tf_idf_value directory.
    TF_IDF(path="data/pacifier_filtered.tsv").get_tf_idf_dict(
        save_path="tf_idf_value/pacifier_filtered_tf_idf_dict.csv"
    )