#!/usr/bin/env python
# -*- coding: UTF-8 -*-
"""This module will handle fetching the JSON and parsing it to do recursive fetches of messages and their resources (images etc)"""
import json, urllib_cached, hashlib, os, re
import traceback
# some global config values
debug = True
debug_tracebacks = False
fetch_profile_images = True
import storage, screenscraper
apikey = storage.read_api_key()
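# Cache of reply lists already fetched during this run, keyed by message id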
replycache = {}
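# Message ids visited during the current recursive fetch, used to break reply/parent cycles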
recursion_loop_detector = {}
def json_parse_url(url):
"""Trivial helper to avoid copy-pasting same code all over"""
if debug:
print "Fetching (JSON) %s" % url
try:
fp = urllib_cached.urlopen(url)
parsed = json.load(fp)
fp.close()
except Exception,e:
print "Got exception %s" % e
return None
return parsed
# compile this just once, it's used by fetch_resource()
getparams_re = re.compile(r'\?.*$')
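# matches paths that already point into the local resources/ tree (two-hex-digit shard dirs)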
local_resource_re = re.compile('^' + os.path.join('resources', '').replace('\\', '\\\\') + '[0-9a-f]{2}')
def fetch_resource(url):
"""Fetches and stores locally remote resources and returns the local filepath"""
if local_resource_re.match(url):
# This is already a local resource
return url
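    # Use the md5 of the URL as a stable local identifier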
local_id = hashlib.md5(url).hexdigest()
extension = ""
# Try to figure out a file extension just to make things nicer to file browsers
try:
filename = getparams_re.sub('', os.path.basename(url))
extension = filename.rsplit('.', 1)[1] # get the last extension.
except Exception,e:
print "Got exception %s when trying to figure out file extension for %s" % (e, url)
local_path = os.path.join('resources', local_id[0:2], local_id + "." + extension)
# If we already have the file just return it
if os.path.isfile(local_path):
        # Make sure the file has a sane amount of data...
if (os.stat(local_path).st_size < 16):
print "ERR: Local file %s is empty, removing" % local_path
os.unlink(local_path)
else:
return local_path
# Create the container dir if it's not there
if not os.path.isdir(os.path.dirname(local_path)):
os.makedirs(os.path.dirname(local_path))
if debug:
print "Fetching (BIN) %s to %s" % (url, local_path)
fp_from = None
fp_to = None
try:
fp_from = urllib_cached.urlopen(url)
fp_to = open(local_path, 'wb')
# TODO: use a sensibly sized buffer ?
fp_to.write(fp_from.read())
fp_from.close()
fp_to.close()
except Exception,e:
print "Got exception %s" % e
if fp_from:
fp_from.close()
if fp_to:
fp_to.close()
if os.path.isfile(local_path):
os.unlink(local_path)
return None
return local_path
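# Example (hypothetical URL): fetch_resource("http://example.com/img/photo.jpg?s=large")
# would store the data as resources/<first two hex chars>/<md5 of the URL>.jpg and return that path.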
def fetch_message(object_id):
"""Returns a message, from local cache if available, otherwise loads via REST API, you probably should be calling recursive_fetch_message first"""
if debug:
print "fetch_message(%s) called" % repr(object_id)
if debug_tracebacks:
traceback.print_stack()
object_id = str(object_id) # cast to normal str
if storage.in_cache_byid(object_id):
obj = storage.get_byid(object_id)
if ( ( obj.has_key('truncated')
and obj['truncated'])
or ( obj.has_key('QaikuBackup_stale')
and obj['QaikuBackup_stale'])
):
# Object is stale, do not return from cache
if debug:
print "storage.objectcache[%s] is stale" % repr(object_id)
if debug_tracebacks:
print json.dumps(storage.get_byid(object_id), sort_keys=True, indent=4)
pass
else:
if debug:
print "message %s returned from cache" % object_id
return storage.get_byid(object_id)
else:
#print "objectcache has no key %s" % repr(object_id)
#print json.dumps(storage.objectcache, sort_keys=True, indent=4)
pass
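    # Not cached (or the cached copy is stale): fetch a fresh copy from the REST API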
url = "http://www.qaiku.com/api/statuses/show/%s.json?apikey=%s" % (object_id, apikey)
parsed = json_parse_url(url)
if not parsed:
# parse failed, return stale object if we have one
if storage.in_cache_byid(object_id):
return storage.get_byid(object_id)
else:
return None
storage.update(parsed)
return storage.get_byid(object_id)
def clear_recursion_loop_detector():
    recursion_loop_detector.clear()
# TODO: rethink this and fetch_message
def recursive_fetch_message(object_id, recursion_level = 0):
"""Fetches a message and all it's dependendies/replies/etc"""
object_id = str(object_id) # cast to normal str
if debug:
print "recursive_fetch_message(%s, %d)" % (repr(object_id), recursion_level)
#print "recursion_loop_detector=%s" % repr(recursion_loop_detector)
if recursion_loop_detector.has_key(object_id):
return False
obj = fetch_message(object_id)
if not obj:
return False
# Keep track of recursion so we do not get trapped in an infinite loop
recursion_loop_detector[object_id] = True
# Cache and rewrite profile image url
if ( fetch_profile_images
and obj.has_key('user')
and obj['user'].has_key('profile_image_url')):
res = fetch_resource(obj['user']['profile_image_url'])
if res:
obj['user']['profile_image_url'] = res
    # Fetch the message image if any; sadly this is only the tiny thumbnail :..(
if ( obj.has_key('image_url')
and obj['image_url']):
res = fetch_resource(obj['image_url'])
if res:
obj['image_url'] = res
# Force objectcache update to make sure we don't have funky COW issues
storage.update(obj)
    # Fetch the real image; this will take some screen-scraping unless Rohea is kind enough to add the URL to the API in these last times...
screenscraper.fill_and_fetch_image_urls(obj['id'])
    # Retrieve the parent message if any (and other link properties ?)
for k in ['in_reply_to_status_id',]:
if ( obj.has_key(k)
and obj[k]):
if debug:
print "Recursing %s->%s(=%s)" % (obj['id'], k, obj[k])
recursive_fetch_message(obj[k], recursion_level+1)
# Get replies
if debug:
print "Checking replies for %s" % (obj['id'])
replies = fetch_replies(obj['id'], recursion_level+1)
# Clear the recursion tracker
if recursion_level == 0:
clear_recursion_loop_detector()
return True
def fetch_paged(urlbase, pagelimit=None):
"""This will loop through page numbers until no more results are returned"""
resultlist = []
page = 0
loop = True
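    # Request consecutive pages until one comes back empty (or the page limit is hit)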
while (loop):
url = "%s?apikey=%s&page=%d" % (urlbase, apikey, page)
parsed = json_parse_url(url)
if ( not parsed
or len(parsed) == 0):
loop = False
continue
resultlist = resultlist+parsed
page = page+1
if ( pagelimit
and page > pagelimit):
loop = False
return resultlist
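# Example usage (hypothetical message id), mirroring the call in fetch_replies below:
#   replies = fetch_paged("http://www.qaiku.com/api/statuses/replies/12345.json")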
def fetch_replies(object_id, recursion_level = 0):
"""Get full list of replies to a message (and insert them to cache, recursing)"""
object_id = str(object_id) # normalize the id
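    # Return memoized replies if we already fetched them during this run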
if replycache.has_key(object_id):
return replycache[object_id]
replies = fetch_paged("http://www.qaiku.com/api/statuses/replies/%s.json" % object_id)
if not replies:
replies = []
# Cache all the messages while at it
mass_insert_and_recurse(replies, recursion_level)
replycache[object_id] = map(lambda o: fetch_message(str(o['id'])), replies) # refresh the objects before placing them as pointers to the replycache
# And put a list of the replies to the object we fetched them for
obj = fetch_message(object_id)
    if storage.in_cache(obj): # this should not fail at this point anymore
        obj['QaikuBackup_replies'] = [ o['id'] for o in replies ] # Store a list of the reply IDs on the object (with Python references we could point straight at the list of objects, but that would cause no end of headaches for the JSON serialization we plan to do)
# Force objectcache update to make sure we don't have funky COW issues
storage.update(obj)
return replycache[object_id]
def insert_and_recurse(qaiku_message, recursion_level = 0):
"""Insert a message to cache and get all it's resources/replies/etc"""
if not storage.in_cache(qaiku_message):
storage.update(qaiku_message)
return recursive_fetch_message(qaiku_message['id'], recursion_level)
def mass_insert_and_recurse(list_of_messages, recursion_level = 0):
for qaiku_message in list_of_messages:
if not storage.in_cache(qaiku_message):
storage.update(qaiku_message)
# And then handle the recursions
for qaiku_message in list_of_messages:
recursive_fetch_message(qaiku_message['id'], recursion_level)
if __name__ == '__main__':
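    # Usage sketch: ./fetcherparser.py <message_id>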
    import sys
print "*** STARTING ***"
recursive_fetch_message(sys.argv[1])
urllib_cached.clean()
print "*** DONE ***"
print "storage.objectcache contents:"
print json.dumps(storage.objectcache, sort_keys=True, indent=4)