hw4_collect_tweets.py
# -*- coding: UTF-8 -*-
import os
import pickle
import cPickle
import numpy as np
import nltk
import twitter
from sklearn.feature_extraction import DictVectorizer
import main
from main import log_info, log_warn
class TweetsGetter(object):
PICKLED_TWEETS = "tweets.pkl"
BLACK_LISTED_IDS = "black_listed_ids.pkl"
    NUMBER_OF_TWEETS_TO_GET = 200  # per user; 200 is the maximum count GetUserTimeline accepts
def __init__(self):
self.api = main.get_twitter_api()
self.tweets = {}
self.black_list_ids = []
if os.path.isfile(self.PICKLED_TWEETS):
with open(self.PICKLED_TWEETS, 'rb') as f:
self.tweets = cPickle.load(f)
if os.path.isfile(self.BLACK_LISTED_IDS):
with open(self.BLACK_LISTED_IDS, 'rb') as f:
self.black_list_ids = cPickle.load(f)
self.tweets_cached = len(self.tweets)
log_info("number of users with tweets cached: %s " % self.tweets_cached)
log_info("number of black listed users cached: %s " % len(self.black_list_ids))
def __del__(self):
if len(self.tweets) > self.tweets_cached:
log_info("Saving downloaded tweets and blacklist ids")
with open(self.PICKLED_TWEETS, 'wb') as f:
cPickle.dump(self.tweets, f, pickle.HIGHEST_PROTOCOL)
with open(self.BLACK_LISTED_IDS, 'wb') as f:
cPickle.dump(self.black_list_ids, f, pickle.HIGHEST_PROTOCOL)
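
    # NOTE: relying on __del__ for persistence is best-effort -- it is not
    # guaranteed to run on a hard crash. get_tweets() also calls it explicitly
    # every 50 users as a checkpoint; an explicit save() method would be sturdier.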
# returns None in case of problems
def get_tweets(self, user_id):
if user_id in self.tweets:
return self.tweets[user_id]
elif user_id in self.black_list_ids:
return None
else:
log_info("Retrieving tweets of user with id %s..." % user_id)
tweets = None
while True:
try:
tweets = self.api.GetUserTimeline(user_id=user_id, count=self.NUMBER_OF_TWEETS_TO_GET,
include_rts=False, exclude_replies=True)
                except twitter.TwitterError as e:
# skip private and deleted accounts
if e.message == "Not authorized." or e.message == [{u'message': u'Sorry, that page does not exist.', u'code': 34}]:
self.black_list_ids.append(user_id)
log_info("User %s added to black list" % user_id)
return None
elif e.message == [{u'message': u'Rate limit exceeded', u'code': 88}]:
log_warn("Rate limit exceeded exception was thrown! Trying to retrieve tweets again")
continue
else:
                        self.__del__()  # persist progress before propagating
                        raise  # bare raise preserves the original traceback
except IOError:
log_warn("IOError exception was thrown! No idea that it means, but trying to retreive tweets again")
continue
                finally:
                    # checkpoint: every 50 users, spill accumulated tweets to disk
                    users_loaded = len(self.tweets)
                    if users_loaded % 50 == 0:
                        log_info("%s users loaded, spilling them to disk" % users_loaded)
                        self.__del__()
                break
self.tweets[user_id] = map(lambda tweetStatus: tweetStatus.AsDict(), tweets)
return self.tweets[user_id]
# Returns None in case of problems, otherwise a list of tweets, each as a dict.
# Unused stub: get_all_users_tweets below is used instead.
def get_user_tweets(user_id):
    pass
def get_all_users_tweets(uids):
tweets_getter = TweetsGetter()
uids_with_tweets = map(lambda uid: (uid, tweets_getter.get_tweets(uid)), uids)
log_info("All tweets are loaded")
return uids_with_tweets
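
# Usage sketch (hypothetical ids): each entry pairs a uid with its tweet dicts,
# or with None for a blacklisted/unreachable account:
#   pairs = get_all_users_tweets([123, 456])
#   # -> [(123, [{...}, ...]), (456, None)]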
# Parses a string into a list of words: lowercasing, stripping references to other
# users (@username), collapsing elongated character runs ("sooooo" -> "sooo"),
# and filtering out http(s) links.
class Wordenizer(object):
tokenizer = nltk.tokenize.TweetTokenizer(preserve_case=False, strip_handles=True, reduce_len=True)
@staticmethod
def wordenize(text):
tokenized = Wordenizer.tokenizer.tokenize(text)
http_filtered = filter(lambda word: not word.startswith("http"), tokenized)
return http_filtered
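
    # Example of the flags above (lowercased, handle stripped, long run shortened,
    # then the URL token is filtered out):
    #   Wordenizer.wordenize("Sooooo cool @SomeUser http://t.co/xyz") -> ["sooo", "cool"]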
def get_words(text):
words = Wordenizer.wordenize(text)
words = filter(lambda word: len(word) > 2, words) # filter out short strings, including punctuation
return words
class Normalizer(object):
lemmatizer = nltk.WordNetLemmatizer()
stopwords = nltk.corpus.stopwords.words('english')
stopwords += [u'every', u'wa', u'going', u"don't", u"i'm", u"via", u"really", u"time", u'would']
@staticmethod
    def tokenize(word):
        # despite the name, this lemmatizes a single word (e.g. "cats" -> "cat")
        lemmatized = Normalizer.lemmatizer.lemmatize(word)
        return lemmatized
def get_tokens(words):
# normalize
words_normalized = map(lambda w: Normalizer.tokenize(w), words)
stop_words_filtered = filter(lambda word: word not in Normalizer.stopwords, words_normalized)
return stop_words_filtered
# accepts tweet as Status.AsDict() result dict
def get_tweet_tokens(tweet):
    assert tweet is not None
words = get_words(tweet["text"])
return get_tokens(words)
# accepts list of tweets, each as a dict, turns them into tokens and calculates frequency of each
def tweets_to_grouped_tokens(tweets):
if tweets is None:
return {}
tokens = map(lambda tweet: get_tweet_tokens(tweet), tweets)
tokens_flattened = [token for tokenlist in tokens for token in tokenlist]
tokens_freq_dict = {}
for token in tokens_flattened:
if token in tokens_freq_dict:
tokens_freq_dict[token] += 1
else:
tokens_freq_dict[token] = 1
    # A groupby-based dict comprehension only works on *sorted* input, because
    # itertools.groupby groups consecutive equal elements:
    #   tokens_freq_dict = {key: len(list(group)) for key, group in groupby(sorted(tokens_flattened))}
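    # collections.Counter would also do the whole counting loop above in one call:
    #   tokens_freq_dict = dict(Counter(tokens_flattened))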
return tokens_freq_dict
def collect_users_tokens(df_users):
uids = df_users["uid"].values
uids_with_tweets = get_all_users_tweets(uids)
    uids_with_token_freqs = map(lambda uid_tweets: (uid_tweets[0],
                                                    tweets_to_grouped_tokens(uid_tweets[1])),
                                uids_with_tweets)
    # parallel lists: dict_freqs[i] is the token -> count map for uids[i]
    uids = map(lambda pair: pair[0], uids_with_token_freqs)
    dict_freqs = map(lambda pair: pair[1], uids_with_token_freqs)
    return (uids, dict_freqs)
def get_full_token_set(users_tokens):
tokens = set()
for d in users_tokens:
dtokens = d.keys()
tokens |= set(dtokens)
return tokens
def get_full_frequencies(users_tokens):
freqs = {}
for d in users_tokens:
for token, freq in d.items():
if token in freqs:
freqs[token] = freqs[token] + freq
else:
freqs[token] = freq
return freqs
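
# A minimal Counter-based equivalent, assuming the same list-of-dicts input:
#   from collections import Counter
#   freqs = sum((Counter(d) for d in users_tokens), Counter())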
def draw_tag_cloud(users_tokens):
from PIL import Image
import matplotlib.pyplot as plt
from wordcloud import WordCloud, ImageColorGenerator
trump_coloring = np.array(Image.open("pics/trump.png"))
freqs = get_full_frequencies(users_tokens)
freq_pairs = freqs.items()
wc = WordCloud(max_words=2000, mask=trump_coloring,
max_font_size=40, random_state=42)
wc.generate_from_frequencies(freq_pairs)
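    # NOTE: generate_from_frequencies took a list of (word, freq) pairs in older
    # wordcloud releases; newer ones expect a dict, in which case passing `freqs`
    # directly should work.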
image_colors = ImageColorGenerator(trump_coloring)
    # recolor the word cloud with colors sampled from the mask image and show it;
    # we could also pass color_func=image_colors directly in the constructor
    plt.imshow(wc.recolor(color_func=image_colors))
plt.axis("off")
plt.show()
if __name__ == "__main__":
df_users = main.get_users()
    # df_users = df_users[:1000]  # uncomment to debug on a subsample
users, users_tokens = collect_users_tokens(df_users)
token_set = get_full_token_set(users_tokens)
v = DictVectorizer()
vs = v.fit_transform(users_tokens)
np.savez("files/out_4.dat", data=vs, users=users, users_tokens=list(token_set))
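    # NOTE: np.savez appends ".npz" to names that lack it, so this writes
    # files/out_4.dat.npz; vs is a scipy sparse matrix and is stored pickled
    # inside an object array -- scipy.sparse.save_npz (where available) is the
    # dedicated format for sparse data.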
draw_tag_cloud(users_tokens)