/
test_extract_term_frequency.py
91 lines (68 loc) · 2.41 KB
/
test_extract_term_frequency.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
from load_tweets import load_tweets
from remove_stop_words import remove_stop_words
from remove_punctuation import remove_punctuation
from remove_non_english import remove_non_english
from remove_digits import remove_digits
from remove_links import remove_links
from remove_empty import remove_empty
from feature_extractors import *
from clustering import *
from trend_extraction import *
from stats_and_plots import *
import json
from scipy import sparse
import matplotlib.pyplot as plt
import numpy as np
import re
'''
Experimental side project: the idea is to look for spikes in term
frequency over time. Still sort of buggy.
-James
'''
def trend_extract_term_frequency(samples: list):
    '''
    Rank words by how sharply their count spikes in one sample compared
    with the other samples, and print the five strongest spikes.

    For every word the spike score is its highest per-sample count minus
    the mean of its counts in the remaining samples. A word that does not
    occur in a sample counts as 0 there, so every word is scored over the
    same number of samples.

    :param samples: list of samples; each sample is passed unchanged to
        get_word_counts (presumably the samples are ordered in time --
        confirm against the data files)
    :return: list of (word, spike score) tuples, highest score first;
        empty when there are no samples or no words
    '''
    # One {word: count} mapping per sample.
    per_sample = [dict(get_word_counts(sample)) for sample in samples]

    # word -> list of counts, one slot per sample, zero where the word is
    # absent. A plain dict keeps first-seen order, so ties rank stably.
    series = {}
    for index, counts in enumerate(per_sample):
        for word, count in counts.items():
            series.setdefault(word, [0] * len(per_sample))[index] = count

    scored = []
    for word, counts in series.items():
        peak = max(counts)
        others = len(counts) - 1
        # Drop exactly one occurrence of the peak and average the rest.
        # With a single sample there is nothing to compare against, so the
        # baseline is 0 instead of the NaN an empty mean would give.
        baseline = (sum(counts) - peak) / others if others else 0.0
        scored.append((word, peak - baseline))

    ranked = sorted(scored, key=lambda pair: pair[1], reverse=True)
    print(ranked[:5])
    return ranked
def get_word_counts(tweets: list) -> list:
    '''
    Takes in a list of tweets and returns each word with its total count.

    :param tweets: list of tweets, where each tweet is a dictionary with a
        'text' entry
    :return: list of (word, count) tuples, one per term in the
        vectorizer's vocabulary; empty when no tweets are given
    '''
    text = [t['text'] for t in tweets]
    if not text:
        # Nothing to vectorize, so there are no words to count.
        return []

    cv = CountVectorizer()
    tdm = cv.fit_transform(text)
    # Sum over axis 0 and take the first row of the nested list to get a
    # flat list of totals (assumes rows are tweets and columns are terms
    # -- TODO confirm against the CountVectorizer in use).
    word_counts = tdm.sum(axis=0).tolist()[0]

    # NOTE(review): CountVectorizer arrives through a star import and is
    # presumably scikit-learn's, where get_feature_names() was replaced by
    # get_feature_names_out(). Prefer the new name, fall back to the old.
    if hasattr(cv, 'get_feature_names_out'):
        words = list(cv.get_feature_names_out())
    else:
        words = list(cv.get_feature_names())

    # zip() would silently truncate on a length mismatch, so check first.
    assert len(words) == len(word_counts)
    return list(zip(words, word_counts))
if __name__ == '__main__':
    # Script entry point: load three tweet samples, clean them, then print
    # the words whose frequency spikes the most.
    print('Loading...')
    samples = [load_tweets('data/sample%d.txt' % i) for i in range(1, 4)]
    print('Loaded.')

    print('Cleaning...')
    # Cleaning steps, applied in this order; each receives the previous
    # step's result.
    cleaners = (
        remove_stop_words,
        remove_punctuation,
        remove_non_english,
        remove_links,
        remove_digits,
        remove_empty,
    )
    cleaned_samples = []
    for tweets in samples:
        for clean in cleaners:
            tweets = clean(tweets)
        # Keep the cleaned result: rebinding the loop variable alone would
        # leave `samples` holding the uncleaned tweets.
        cleaned_samples.append(tweets)
    samples = cleaned_samples
    print('Cleaned.')

    print('Extract Trends')
    trend_extract_term_frequency(samples)
    print('Extracted')