/
main.py
162 lines (131 loc) · 5 KB
/
main.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
# -*- coding: utf-8 -*-
import os
import sys
from time import sleep
import logging
from pymongo import MongoClient
from delorean import parse
from utils.secret.twitconfig import T # hide access keys
# provide project name as a command line argument
# use lowercase ascii-only no-spaces
# (the program does not check for these yet -- so take care)
try:
PROJECT = sys.argv[1]
except:
raise KeyError('No project name provided.')
# put hashtags in a text file called tags_<PROJECT>.txt next to this script
SIGINT_RCVD = False # Converted to True when Ctrl-C is pressed
CLIENT = MongoClient('localhost', 27017)
DB = CLIENT.tweets_db
TWEETS = DB[PROJECT]
TAGS_FILE = os.path.join('tags_{}.txt'.format(PROJECT))
LOG_FILE = os.path.join('logs', 'log_{}.txt'.format(PROJECT))
TAG_DICT = {}
def get_tweets(query, count=100, max_id=-1):
if max_id != -1:
retriever = T.search.tweets(q=query,
count=count,
max_id=max_id)
else:
# on first call, only send count, to get oldest tweets for query
retriever = T.search.tweets(q=query,
count=count)
statuses = retriever[u'statuses']
# sleep_int = get_sleep_interval()
sleep(2.3) # comply with API level rate-limiting: 450 calls in every 15 min window
if len(statuses) > 0:
msg = 'Query: {}. Loaded {} statuses from id {} to id {}.'
msg = msg.format(query,
len(statuses),
statuses[0]['id'],
statuses[-1]['id'])
logging.info(msg)
new_max = insert_tweets(statuses, query)
return new_max
else:
logging.info('No tweets for hashtag: {}.'.format(query))
return False
def make_hash_field(hashtags):
tag_list = []
for item in hashtags:
tag_lc = item['text'].lower()
tag_list.append(tag_lc)
return tag_list
def insert_tweets(tweets, query):
# TODO clean up the logic re the max_id -- the current tweet_id is not
# necessarily the max id for all hastags being monitored! We need to keep
# separate max_id for each tag...
max_id = -1
results = []
feedback = ''
for tweet in tweets:
tweet['_id'] = tweet['id']
max_id = int(tweet['_id'])
tweet.pop('id', None) # delete the old id key
tweet['hashtags'] = make_hash_field(tweet['entities']['hashtags'])
tweet['created_at'] = parse(tweet['created_at']).datetime
tweet['user']['created_at'] = parse(tweet['user']['created_at']).datetime
try:
result = TWEETS.update({'_id': tweet['_id']}, tweet, upsert=True)
# print '{} inserted with tags {}'.format(result, tweet['hashtags'])
results.append(result)
except:
logging.info('ERROR in updating document id: {}.'.format(tweet['_id']))
feedback = ['+' for result in results if not result['updatedExisting']]
feedback = '{:3} / {:3} {}'.format(len(feedback), len(tweets), query)
print feedback
return max_id
def get_tags(fyle):
with open(fyle, 'rb') as f:
lines = f.readlines()
hash_tags = [ln.strip()
for ln in lines
if not ln.startswith('#') and ln.strip()]
return hash_tags
def process_tags(sigint):
# NOTE that hashtags in tag_dict have no leading # sign but this is needed for Twitter API queries
# foreach item in the queue get max id and
# send a call to get_tweets, go to next in the queue, cycle
queue = TAG_DICT.keys()
try:
while not sigint:
tag = queue.pop(0)
# query = tag
query = '#' + tag
maxid = TAG_DICT[tag]
# print '{}: Getting query {} from maxid {} in queue {}'.format(cycles, query, maxid, str(queue))
maxid = get_tweets(query, max_id=maxid)
if maxid:
TAG_DICT[tag] = maxid
queue.append(tag)
except KeyboardInterrupt:
sigint = True
logging.info('SIGINT received. Capture terminated by user.')
def get_latest_id(spec):
"""Find the first and last document given a spec dict"""
try:
# latest = TWEETS.find_one(spec)
# latest = TWEETS.find(spec).sort([('_id', -1)]).limit(1).next()
latest = TWEETS.find(spec).sort([('_id', -1)]).limit(1).next()
latest = latest['_id']
except:
latest = -1
return latest
def main():
tags = get_tags(TAGS_FILE)
for tag in tags:
doc_max_id = get_latest_id(spec={'hashtags': tag})
if doc_max_id:
TAG_DICT[tag] = doc_max_id
else:
TAG_DICT[tag] = 0
for k, v in TAG_DICT.iteritems():
print k, v
logging.basicConfig(filename=LOG_FILE,
level=logging.DEBUG,
format='%(asctime)s %(message)s',
datefmt='%m/%d/%Y %I:%M:%S %p')
logging.info('Started loading tweets.')
process_tags(sigint=SIGINT_RCVD)
if __name__ == '__main__':
main()