/
tweets.py
382 lines (322 loc) · 9.38 KB
/
tweets.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
#!/usr/bin/env python
import collections
import datetime
import json
import os
import re
import time
import requests
from common import redis
try:
if os.environ.get('DEBUG'):
import settings_local as settings
else:
import settings_prod as settings
except ImportError:
import settings
def strip_whitespace(x):
    """Collapse every run of whitespace in *x* to a single space and trim.

    Replaces the original lambda assignment (PEP 8 discourages it) and uses
    a raw string for the regex so ``\s`` is not an invalid string escape.
    """
    return re.sub(r'\s+', ' ', x).strip()
class Thesaurus(object):
    """Lazy, cached access to the Moby Thesaurus word list file."""

    # Moby's Thesaurus
    filename = 'mthesaur.UTF-8.txt'

    @property
    def words(self):
        """Return the full thesaurus text, read from disk once and cached.

        Fixes two defects in the original: the file handle was never
        closed (now a context manager), and ``'\\n'.join(readlines())``
        accidentally doubled every newline because ``readlines`` keeps the
        trailing ``'\\n'`` on each line -- ``read()`` returns the text as-is.
        """
        if hasattr(self, 'mthesaur_words'):
            return self.mthesaur_words
        with open(self.filename) as f:
            self.mthesaur_words = f.read()
        return self.mthesaur_words


# Module-level singleton used by get_synonyms().
thesaurus = Thesaurus()
def get_synonyms(word, source=None):
    """Look up *word* in the thesaurus text and return its synonym list.

    Thesaurus lines have the form ``<word>,<syn1>,<syn2>,...``; the
    returned list therefore starts with the looked-up word itself.
    Returns ``[]`` when the word has no entry (the original fell through
    ``find() == -1`` and produced garbage from ``lines[-1:]``).

    ``source`` optionally supplies the thesaurus text directly (handy for
    testing); by default the module-level ``thesaurus`` is consulted.
    """
    lines = thesaurus.words if source is None else source
    # The leading '\n' anchors the match to the start of a line, so e.g.
    # "happy" does not match inside "unhappy,...".
    start = lines.find('\n%s,' % word)
    if start == -1:
        return []
    lines = lines[start:].strip()
    end = lines.find('\n')
    if end == -1:
        # Entry is the last line of the file; the original chopped the
        # final character here by slicing with end == -1.
        end = len(lines)
    synonyms = lines[:end]
    return synonyms.strip().split(',')
# Sentiment phrase templates; `{mood}` is substituted with a mood word or
# one of its synonyms when scoring tweets (see get_mood_counts).
sentiments = '''
feel {mood}
feel so {mood}
i'm {mood}
i am {mood}
im {mood}
makes me {mood}
made me {mood}
making me {mood}
in a {mood} mood
im so {mood}
i'm so {mood}
i am so {mood}
'''.strip().split('\n')
# Twitter search terms: the same templates with the {mood} slot emptied
# and the leftover whitespace collapsed (e.g. "feel so", "in a mood").
search_terms = [strip_whitespace(x.format(mood='')) for x in sentiments]
# Bookkeeping counter names stored alongside the per-mood counts in redis.
totals = ('total_saw', 'total_analyzed', 'total_rejected')
# The mood categories tracked per run.  They read as opposing pairs
# (e.g. composed/anxious, elated/depressed, energetic/tired).
moods = '''
composed
elated
unsure
clearheaded
tired
depressed
guilty
confused
anxious
confident
hostile
agreeable
energetic
'''.strip().split('\n')
# Seed words for each mood: build_synonyms() merges the thesaurus synonyms
# of every root word into that mood's synonym set.  Multi-word roots
# (e.g. "ready to fight") are matched as whole phrases.
mood_roots = {
    'elated': '''
happy
satisfied
pleased
cheerful
overjoyed
'''.strip().split('\n'),
    'depressed': '''
unhappy
sad
blue
hopeless
discouraged
lonely
miserable
gloomy
refreshed
'''.strip().split('\n'),
    # NOTE(review): 'refreshed' under 'depressed' above looks misplaced --
    # confirm it wasn't meant for an energized/positive mood.
    'agreeable': '''
friendly
agreeable
helpful
forgiving
kindly
good-natured
warm-hearted
good-tempered
'''.strip().split('\n'),
    'hostile': '''
angry
peeved
grouchy
spiteful
annoyed
resentful
bitter
ready to fight
rebellious
furious
bad-tempered
'''.strip().split('\n'),
    'energetic': '''
lively
full of pep
vigorous
energetic
'''.strip().split('\n'),
    'tired': '''
worn out
listless
fatigued
exhausted
sluggish
weary
bushed
'''.strip().split('\n'),
    'confused': '''
confused
unable to concentrate
muddled
bewildered
forgetful
uncertain about things
'''.strip().split('\n'),
    'clearheaded': '''
efficient
alert
'''.strip().split('\n'),
    'composed': '''
relaxed
'''.strip().split('\n'),
    'anxious': '''
uneasy
restless
nervous
anxious
terrified
tense
shaky
on edge
panicky
'''.strip().split('\n'),
    'confident': '''
strong
bold
powerful
secure
confident
self-assured
forceful
'''.strip().split('\n'),
    'unsure': '''
weak
timid
unsure
self-doubting
uncertain
feeble
unassertive
'''.strip().split('\n'),
    'guilty': '''
sorry for things done
unworthy
desperate
helpless
worthless
guilty
'''.strip().split('\n'),
}
# mood name -> iterable of synonym words; populated by build_synonyms()
# (from redis when cached there, otherwise from the thesaurus).
mood_synonyms = {}
# Uncomment to print mood synonyms.
# import pprint
# pprint.pprint(mood_synonyms)
# import sys
# sys.exit(1)
def run():
    """Perform one collection pass and persist results to redis.

    Records the run's timestamp in the ``runs`` sorted set, gathers mood
    counts (from Twitter search, or canned data when ``settings.MOCK``),
    then stores one redis key per (precision, mood) and per
    (precision, total) scoped to this run's timestamp.
    """
    # When the run started.
    timestamp = datetime.datetime.today().strftime('%Y-%m-%d %H:%M:%S')
    # Compact timestamp form used inside key names and as the set score.
    redis_timestamp = datetime.datetime.today().strftime('%Y%m%d%H%M%S')
    print timestamp
    # Sorted Set.
    # key: runs
    # score: redis_timestamp
    # value: timestamp
    # NOTE(review): the (value, score) argument order matches old redis-py
    # zadd signatures; newer clients require a mapping -- confirm against
    # the installed client version.
    redis.zadd('runs', timestamp, redis_timestamp)
    if settings.MOCK:
        # Canned counts so the pipeline can be exercised offline.
        counts = {}
        counts['exact'] = {'composed': 1, 'elated': 8, 'energetic': 2, 'tired': 1, 'depressed': 6, 'anxious': 4, 'confident': 1, 'agreeable': 5}
        counts['fuzzy'] = {'composed': 10, 'elated': 80, 'energetic': 20, 'tired': 10, 'depressed': 60, 'anxious': 40, 'confident': 10, 'agreeable': 50}
        #
        # TODO: Add total analyzed, etc.
        #
    else:
        counts = process_tweets(search_terms)
    print counts
    for mood in moods:
        for precision, sub_counts in counts.iteritems():
            # Set.
            # key: runs:<redis_timestamp>:<precision [exact or fuzzy]>:<mood>
            # value: <count>
            redis.set('runs:%s:moods:%s:%s' % (redis_timestamp, precision, mood),
                      sub_counts.get(mood, 0))
    for total in totals:
        for precision, sub_counts in counts.iteritems():
            # Set.
            # key: runs:<redis_timestamp>:<precision [exact or fuzzy]>:<total>
            # value: <count>
            redis.set('runs:%s:totals:%s:%s' % (redis_timestamp, precision, total),
                      sub_counts.get(total, 0))
def process_tweets(terms):
    """Search Twitter for each term, scoring every unseen tweet's moods.

    Returns ``{'exact': Counter, 'fuzzy': Counter}`` accumulated over all
    tweets: per-mood hit counts plus the ``total_*`` bookkeeping counters
    produced by get_mood_counts.

    Bug fix: the original updated ``counts['fuzzy']`` from
    ``mood_counts['exact']`` (copy-paste), so the fuzzy counters silently
    duplicated the exact ones.
    """
    # All totals default to 0.
    counts = {'exact': collections.Counter(), 'fuzzy': collections.Counter()}
    # Tweet ids already processed; a set gives O(1) membership tests
    # (the original used a list, O(n) per tweet).
    tweets_seen = set()
    base_url = 'http://search.twitter.com/search.json?q=%s&rpp=99&page=%s&result_type=recent'
    for term in terms:
        if settings.DEBUG:
            print(term)
        proceed = True
        # Keep iterating until there are no more pages.
        page = 1
        while proceed:
            url = base_url % (term.replace(' ', '+'), page)
            if not settings.MOCK:
                res = requests.get(url, timeout=3)
            try:
                if settings.MOCK:
                    data = {'results': [{'id': 1, 'text': 'feel happy'}], 'next_page': '1'}
                    proceed = False
                else:
                    data = json.loads(res.content)
            except ValueError:
                # Response body was not JSON; stop paging this term.
                proceed = False
            else:
                if 'next_page' not in data:
                    proceed = False
                try:
                    results = data['results']
                except KeyError:
                    # No more pages.
                    proceed = False
                else:
                    for tweet in results:
                        if tweet['id'] not in tweets_seen:
                            tweets_seen.add(tweet['id'])
                            text = strip_whitespace(tweet['text'].lower())
                            # Keep track of the mood counts per tweet.
                            mood_counts = get_mood_counts(text)
                            counts['exact'].update(mood_counts['exact'])
                            counts['fuzzy'].update(mood_counts['fuzzy'])
                            if settings.DEBUG:
                                print('exact: ' + str(dict(counts['exact'])))
                                print('fuzzy: ' + str(dict(counts['fuzzy'])))
                    if settings.DEBUG:
                        print('\t ' + '-' * 69)
                        print('\t ' + url)
                        print('\t ' + str(len(results)))
            page += 1
            # Let Twitter catch its breath.
            if page % 5 == 0:
                time.sleep(1)
        time.sleep(1)
    return counts
def get_mood_counts(tweet):
    """Score one (lowercased, whitespace-normalized) tweet against all moods.

    Returns ``{'exact': {...}, 'fuzzy': {...}}`` where each sub-dict maps
    a detected mood name to 1, plus ``total_saw`` (always 1) and either
    ``total_analyzed`` (something matched) or ``total_rejected`` (nothing
    did).  'exact' requires a full sentiment phrase such as "feel happy";
    'fuzzy' only requires the mood word to appear anywhere in the tweet.
    """
    tweet_counts = {'exact': {}, 'fuzzy': {}}
    # Go through all the sentiment phrases (e.g., "I feel so {mood}").
    for sentiment in sentiments:
        # Go through all the moods (e.g., "depressed").
        for mood in moods:
            # All synonyms for this mood; fall back to the mood word itself.
            # Bug fix: the original fallback was set(mood), which is the
            # set of the word's *characters*, not the word.
            words = mood_synonyms.get(mood, [mood])
            for word in words:
                # Skip blank entries (the original used filter(None, ...)).
                if not word:
                    continue
                # See if we find this phrase in the tweet.
                if sentiment.format(mood=word) in tweet:
                    tweet_counts['exact'][mood] = 1
                # See if the word appears anywhere in the tweet.
                if word in tweet:
                    tweet_counts['fuzzy'][mood] = 1
    # Bookkeeping counters for both precisions.
    for precision in ('exact', 'fuzzy'):
        sub_counts = tweet_counts[precision]
        sub_counts['total_analyzed' if sub_counts else 'total_rejected'] = 1
        sub_counts['total_saw'] = 1
    return tweet_counts
def build_synonyms():
    """Populate the module-level ``mood_synonyms`` dict, one entry per mood.

    Synonyms come from the redis cache when present; otherwise they are
    looked up in the thesaurus for the mood word and each of its
    ``mood_roots`` seed words, deduplicated, sorted, and written back to
    redis for next time.
    """
    for mood in moods:
        # get_synonyms returns the looked-up word itself as the first list
        # element, so each mood's own name ends up in its synonym set.
        cached = redis.smembers('synonyms:%s' % mood)
        if cached:
            # This is already in redis DB, so we're good.
            # NOTE(review): smembers may return bytes depending on the
            # client's decode settings -- confirm before string-matching.
            mood_synonyms[mood] = cached
        else:
            synonyms = get_synonyms(mood)
            if mood in mood_roots:
                for root in mood_roots[mood]:
                    synonyms += get_synonyms(root)
            # Make this a sorted alphabetical list of unique synonyms.
            synonyms = sorted(list(set(synonyms)))
            # Remove any blank values.
            synonyms = filter(None, synonyms)
            # Store it in redis DB, so we don't have to do a lookup next time.
            for synonym in synonyms:
                redis.sadd('synonyms:%s' % mood, synonym)
            mood_synonyms[mood] = synonyms
if __name__ == '__main__':
    # Build the mood -> synonyms table once, then collect forever,
    # pausing half an hour between passes.
    build_synonyms()
    half_hour = 60 * 30
    while True:
        run()
        time.sleep(half_hour)