#!/usr/bin/python
# twfollow.py - forked from scraperwiki/twitter-follows-tool
import os
import json
import urllib
import sys
import collections
import subprocess
import httplib
import sqlite3
import datetime
import random

import dateutil.parser
import scraperwiki

# Supplies CONSUMER_KEY, CONSUMER_SECRET, CREDS_PRE_VERIFIY and CREDS_VERIFIED
from secrets import *

MAX_TO_GET = 100000
# Horrendous hack to work around some Twitter / Python incompatibility
# http://bobrochel.blogspot.co.nz/2010/11/bad-servers-chunked-encoding-and.html
def patch_http_response_read(func):
    def inner(*args):
        try:
            return func(*args)
        except httplib.IncompleteRead, e:
            return e.partial
    return inner
httplib.HTTPResponse.read = patch_http_response_read(httplib.HTTPResponse.read)
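# (httplib.IncompleteRead exposes whatever bytes did arrive as e.partial, so
# the patched read() returns the truncated body instead of blowing up when a
# server breaks chunked encoding mid-response.)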
# Make sure you install this version of "twitter":
# http://pypi.python.org/pypi/twitter
# http://mike.verdone.ca/twitter/
# https://github.com/sixohsix/twitter
import twitter
#########################################################################
# Authentication to Twitter
# This is designed, once polished, to be submitted as a patch to twitter.oauth_dance (which
# currently only has a function for PIN authentication, not redirect-based authentication)
from twitter.api import Twitter
from twitter.oauth import OAuth, write_token_file, read_token_file
from twitter.oauth_dance import parse_oauth_tokens

def oauth_url_dance(consumer_key, consumer_secret, callback_url, oauth_verifier, pre_verify_token_filename, verified_token_filename):
    # Verification happens in two stages...
    # 1) If we haven't done a pre-verification yet, get temporary credentials from
    # Twitter that will be used to sign our redirect to them, build the redirect
    # URL, and return it so the Javascript that called us can do the redirect.
    if not os.path.exists(pre_verify_token_filename):
        twitter = Twitter(auth=OAuth('', '', consumer_key, consumer_secret), format='', api_version=None)
        oauth_token, oauth_token_secret = parse_oauth_tokens(twitter.oauth.request_token(oauth_callback=callback_url))
        write_token_file(pre_verify_token_filename, oauth_token, oauth_token_secret)
        oauth_url = 'https://api.twitter.com/oauth/authorize?' + urllib.urlencode({'oauth_token': oauth_token})
        return oauth_url

    # 2) We've done pre-verification, and (hopefully) the user has authorised us
    # on Twitter and been redirected back to us. Exchange the verifier for the
    # permanent tokens.
    oauth_token, oauth_token_secret = read_token_file(pre_verify_token_filename)
    twitter = Twitter(auth=OAuth(oauth_token, oauth_token_secret, consumer_key, consumer_secret), format='', api_version=None)
    oauth_token, oauth_token_secret = parse_oauth_tokens(twitter.oauth.access_token(oauth_verifier=oauth_verifier))
    write_token_file(verified_token_filename, oauth_token, oauth_token_secret)
    return oauth_token, oauth_token_secret
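# In OAuth 1.0a terms: stage 1 above is oauth/request_token followed by a
# redirect to oauth/authorize; stage 2 trades the verifier from the redirect
# for permanent credentials at oauth/access_token.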

def do_tool_oauth():
    if not os.path.exists(CREDS_VERIFIED):
        if len(sys.argv) < 3:
            result = "need-oauth"
        else:
            (callback_url, oauth_verifier) = (sys.argv[1], sys.argv[2])
            result = oauth_url_dance(CONSUMER_KEY, CONSUMER_SECRET, callback_url, oauth_verifier, CREDS_PRE_VERIFIY, CREDS_VERIFIED)
        # a string means a URL for a redirect (otherwise we get a tuple back
        # with the auth tokens in it)
        if isinstance(result, str):
            set_status_and_exit('auth-redirect', 'error', 'Permission needed from Twitter', {'url': result})
    oauth_token, oauth_token_secret = read_token_file(CREDS_VERIFIED)
    tw = twitter.Twitter(auth=twitter.OAuth(oauth_token, oauth_token_secret, CONSUMER_KEY, CONSUMER_SECRET))
    return tw
#########################################################################
# Helper functions

# Converts one Twitter user into the row we store in the ScraperWiki database
def convert_user(batch, user):
    data = collections.OrderedDict()
    data['id'] = user['id']
    data['name'] = user['name']
    data['screen_name'] = user['screen_name']
    data['profile_url'] = "https://twitter.com/" + user['screen_name']
    data['profile_image'] = user['profile_image_url_https'] # shorten name to avoid wasting horizontal space
    data['description'] = user['description']
    data['location'] = user['location']
    data['url'] = user['url']
    data['followers_count'] = user['followers_count']
    data['following_count'] = user['friends_count'] # rename, as "friends" is confusing to end users
    data['statuses_count'] = user['statuses_count']
    data['created_at'] = dateutil.parser.parse(user['created_at'])
    data['batch'] = batch # needed internally to track progress of getting all the followers
    return data
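# A converted row looks roughly like this (values made up for illustration):
# {'id': 42, 'name': 'Jane Doe', 'screen_name': 'jdoe',
#  'profile_url': 'https://twitter.com/jdoe', 'profile_image': '...',
#  'description': '...', 'location': '...', 'url': '...',
#  'followers_count': 150, 'following_count': 90, 'statuses_count': 1200,
#  'created_at': <datetime>, 'batch': 3}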

# After detecting an auth-failed error mid-work, call this
def clear_auth_and_restart():
    # remove the auth files and respawn ourselves
    try:
        os.remove(CREDS_PRE_VERIFIY)
        os.remove(CREDS_VERIFIED)
    except OSError:
        # don't worry if the files aren't there
        pass
    subprocess.call(sys.argv)
    sys.exit()

# Signal our status back to the calling Javascript, to the database, and to
# custard's status API
def set_status_and_exit(status, typ, message, extra=None):
    global current_status
    if extra is None: # avoid a mutable default argument
        extra = {}
    extra['status'] = status
    print json.dumps(extra)
    scraperwiki.status(typ, message)
    current_status = status
    save_status()
    sys.exit()

# Store all our progress variables
def save_status():
    global current_batch, next_cursor, batch_got, batch_expected, current_status
    # Update progress indicators...
    # For the number of users got, we count the total of:
    # 1) all followers in the last full batch
    # 2) all followers transferred into the new batch so far
    # i.e. all those for whom batch >= (current_batch - 1)
    try:
        batch_got = scraperwiki.sql.select("count(*) as c from twitter_followers where batch >= %d" % (current_batch - 1))[0]['c']
    except sqlite3.OperationalError:
        # the followers table doesn't exist yet
        batch_got = 0
    data = {
        'id': 'followers',
        'current_batch': current_batch,
        'next_cursor': next_cursor,
        'batch_got': batch_got,
        'batch_expected': batch_expected,
        'current_status': current_status,
        'when': datetime.datetime.now().isoformat()
    }
    scraperwiki.sql.save(['id'], data, table_name='__status')
# Load in all our progress variables
current_batch = 1
next_cursor = -1
batch_got = 0
batch_expected = 0
current_status = 'clean-slate'

def get_status():
    global current_batch, next_cursor, batch_got, batch_expected, current_status
    try:
        data = scraperwiki.sql.select("* from __status where id='followers'")
    except sqlite3.OperationalError, e:
        if str(e) == "no such table: __status":
            return
        raise
    if len(data) == 0:
        return
    assert len(data) == 1
    data = data[0]
    current_batch = data['current_batch']
    next_cursor = data['next_cursor']
    batch_got = data['batch_got']
    batch_expected = data['batch_expected']
    current_status = data['current_status']

# http://stackoverflow.com/questions/312443/how-do-you-split-a-list-into-evenly-sized-chunks-in-python
def chunks(l, n):
    return [l[i:i + n] for i in range(0, len(l), n)]
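# e.g. chunks([1, 2, 3, 4, 5], 2) == [[1, 2], [3, 4], [5]]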

#########################################################################
# Main code

pages_got = 0
try:
    # Rename the old status table to the new __status name.
    # This can be removed once it has been active long enough to
    # update all existing tools.
    try:
        scraperwiki.sql.execute("SELECT 1 FROM status")
    except sqlite3.OperationalError:
        pass
    else:
        scraperwiki.sql.execute("ALTER TABLE status RENAME TO __status")

    # Parameters to this command vary:
    # a. None: try and scrape Twitter followers
    # b. callback_url oauth_verifier: have just come back from Twitter with these oauth tokens
    # c. "clean-slate": wipe the database and start again
    if len(sys.argv) > 1 and sys.argv[1] == 'clean-slate':
        scraperwiki.sql.execute("drop table if exists twitter_followers")
        scraperwiki.sql.execute("drop table if exists __status")
        os.system("crontab -r >/dev/null 2>&1")
        set_status_and_exit('clean-slate', 'error', 'No user set') # never returns

    # Make the followers table *first* with dumb data, calling DumpTruck directly,
    # so it appears before the status one in the list
    scraperwiki.sql.dt.create_table({'id': 1, 'batch': 1}, 'twitter_followers')
    scraperwiki.sql.execute("CREATE INDEX IF NOT EXISTS batch_index "
                            "ON twitter_followers (batch)")
    # Get the user we're working on from the file we store it in
    screen_name = open("user.txt").read().strip()

    # Connect to Twitter
    tw = do_tool_oauth()

    # A batch is one scan through the whole list of followers - we have to
    # scan in stages, as our API calls are rate limited. The cursor is
    # Twitter's identifier of where in the current batch we are.
    get_status()
    # Note that each user is only in the most recent batch they've been found in
    # (we don't keep all the history)
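    # Cursor lifecycle, per Twitter's cursoring docs: -1 asks for the first
    # page, each response's 'next_cursor' names the following page, and 0
    # means there are no more pages - which is how the loop below detects the
    # end of a batch.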
    # Look up the latest followers count
    profile = tw.users.lookup(screen_name=screen_name)
    batch_expected = profile[0]['followers_count']
    # Things are basically working, so make sure we run again by writing a crontab.
    if not os.path.isfile("crontab"):
        crontab = open("tool/crontab.template").read()
        # ... run at a random minute to distribute load XXX the platform should do this for us
        crontab = crontab.replace("RANDOM", str(random.randint(0, 59)))
        open("crontab", "w").write(crontab)
        os.system("crontab crontab")

    # Get as many pages in the batch as we can (most likely 15!)
    onetime = 'ONETIME' in os.environ
    live_dataset = 'LIVE_DATASET' in os.environ
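    # Rough arithmetic (assuming the API v1.1 limits documented at the time:
    # 15 followers/ids calls per 15-minute window, up to 5000 ids per page):
    # one run can cover at most 15 * 5000 = 75,000 followers before being
    # rate limited.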
    while True:
        # raise httplib.IncompleteRead('hi')  # for testing
        # print "getting", next_cursor
        # Get the identifiers of followers - one page's worth (up to 5000 people)
        if next_cursor == -1:
            result = tw.followers.ids(screen_name=screen_name)
        else:
            result = tw.followers.ids(screen_name=screen_name, cursor=next_cursor)
        ids = result['ids']

        # ... and then the user details for all those ids, 100 at a time
        double_break = False
        for chunk in chunks(ids, 100):
            users = tw.users.lookup(user_id=",".join(map(str, chunk)))
            data = []
            for user in users:
                datum = convert_user(current_batch, user)
                data.append(datum)
            scraperwiki.sql.save(['id'], data, table_name="twitter_followers")
            save_status()

            # Don't allow more than a certain number
            if batch_got >= MAX_TO_GET:
                os.system("crontab -r >/dev/null 2>&1")
                set_status_and_exit("ok-limit", 'ok', "Reached %d follower limit" % MAX_TO_GET)

            # If we're being run from the user interface, return quickly once
            # we're sure we've got *something* (the Javascript will then spawn
            # us again in the background to slowly get the rest)
            if onetime:
                double_break = True
                break
        if double_break:
            break
        # We have all the info for one page - record that and save it
        pages_got += 1
        next_cursor = result['next_cursor']
        # While debugging, uncomment this to only do one page and avoid rate limits:
        # break

        if next_cursor == 0:
            # We've finished a batch
            next_cursor = -1
            current_batch += 1
            if not live_dataset:
                # Disable the cron job, we're done
                os.system("crontab -r >/dev/null 2>&1")
                set_status_and_exit("ok-done", 'ok', "Finished")
            break
except twitter.api.TwitterHTTPError, e:
    if "Twitter sent status 401 for URL" in str(e):
        clear_auth_and_restart()
    # https://dev.twitter.com/docs/error-codes-responses
    obj = json.loads(e.response_data)
    code = obj['errors'][0]['code']
    if code in (32, 89):
        # authentication failure
        clear_auth_and_restart()
    elif code == 34:
        # page not found
        set_status_and_exit('not-there', 'error', 'User not on Twitter')
    elif code == 88:
        # rate limit exceeded - provided we got at least one page, that isn't
        # an error but expected, and we just report progress below
        if pages_got == 0:
            set_status_and_exit('rate-limit', 'error', 'Twitter is rate limiting you')
    else:
        # anything else is an unexpected error - if one occurs a lot, handle it above instead
        raise
except httplib.IncompleteRead, e:
    # I think this is effectively a rate limit error - so only count it if it was the first error
    if pages_got == 0:
        set_status_and_exit('rate-limit', 'error', 'Twitter broke the connection')

# Save a progress message
set_status_and_exit("ok-updating", 'ok', "Running... %d/%d" % (batch_got, batch_expected))