# forked from shellyjang/gchat_analysis
# words_that_divide_tfidf.py
'''
Rank the words that most distinguish each Google Chat conversation.

Every chat file in a directory (files with '@' in the name) is turned into
a text blob, TF-IDF scores are computed across the whole corpus, and the
top-scoring terms for each conversation are written to a report file.
'''
from sklearn.feature_extraction.text import TfidfVectorizer
from corpus_analyzer import word_bag
import os, sys
def tfidf_scores(work_dir, output_file, ngram_range=(1, 1), num_results=5):
    """Write the top TF-IDF terms of every chat file in work_dir to output_file.

    Chat logs are identified by an '@' in the filename (one file per
    conversation partner).  Each file is reduced to a text blob by
    corpus_analyzer.word_bag, the corpus is vectorized, and for each file
    the num_results highest-scoring terms are written out.

    Parameters:
        work_dir     -- directory containing the chat log files
        output_file  -- path of the report file to (over)write
        ngram_range  -- (min_n, max_n) passed through to TfidfVectorizer
        num_results  -- how many top-scoring terms to report per file
    """
    tf = TfidfVectorizer(analyzer='word', ngram_range=ngram_range,
                         min_df=0, stop_words='english')
    # os.path.join replaces the manual trailing-slash bookkeeping.
    files = [os.path.join(work_dir, f) for f in os.listdir(work_dir) if '@' in f]
    corpus = [word_bag(f) for f in files]
    print('-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-')
    print('fitting the corpus to generate TfIdf matrix')
    tfidf_matrix = tf.fit_transform(corpus)
    print('finished!')
    print('-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-')
    # get_feature_names() was removed in scikit-learn 1.2; prefer the new
    # API and fall back for older versions.
    try:
        feature_names = tf.get_feature_names_out()
    except AttributeError:
        feature_names = tf.get_feature_names()
    dense = tfidf_matrix.todense()
    # BUG FIX: the output file handle was opened but never closed; a context
    # manager guarantees it is flushed and closed even on error.
    with open(output_file, 'w') as g:
        for ii, f in enumerate(files):
            chat = dense[ii].tolist()[0]
            # Keep only terms that actually occur, highest score first.
            phrase_scores = [pair for pair in enumerate(chat) if pair[1] > 0]
            phrase_scores.sort(key=lambda pair: pair[1], reverse=True)
            g.write('words most used in %s\n' % (os.path.basename(f)))
            for word_id, score in phrase_scores[:num_results]:
                g.write('{0: <20} {1}\n'.format(feature_names[word_id], score))
if __name__ == '__main__':
    # ast.literal_eval safely parses the ngram_range tuple, e.g. "(1,2)",
    # without the arbitrary-code-execution risk of eval() on argv.
    import ast
    if len(sys.argv) == 5:
        tfidf_scores(sys.argv[1], sys.argv[2],
                     ast.literal_eval(sys.argv[3]), int(sys.argv[4]))
    elif len(sys.argv) == 4:
        tfidf_scores(sys.argv[1], sys.argv[2], ast.literal_eval(sys.argv[3]))
    elif len(sys.argv) == 3:
        tfidf_scores(sys.argv[1], sys.argv[2])
    else:
        # BUG FIX: the original fallback called tfidf_scores() with only a
        # work_dir, which raised TypeError because output_file has no
        # default.  Print usage instead of crashing.
        sys.stderr.write(
            'usage: %s work_dir output_file [ngram_range] [num_results]\n'
            % sys.argv[0])
        sys.exit(1)