subreddit-link-crawler.py · executable file · 352 lines (305 loc) · 14.8 KB
#!/usr/bin/env python3
'''
A script which crawls every link listed in a given subreddit and saves each linked page to a given directory.
'''
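
# Example invocation (illustrative only; "YOUR_APP_SECRET" is a placeholder for the
# reddit application secret, not a real credential):
#
#     ./subreddit-link-crawler.py nottheonion YOUR_APP_SECRET ./crawled-pages --limit 500
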
__author__ = "Todd Shore"
__copyright__ = "Copyright 2016 Todd Shore"
__license__ = "GPL"
__reddit_author_username__ = "errantlinguist"
__reddit_app_name__ = "subreddit-link-crawler"
__reddit_redirect_uri__ = "https://github.com/errantlinguist/subreddit-link-crawler"
__version__ = "0.0.1"
__website__ = "https://github.com/errantlinguist/subreddit-link-crawler"
import argparse
import datetime
import mimetypes
import os
import sys
import time
from cgi import parse_header  # NOTE: the cgi module is deprecated and was removed in Python 3.13
from collections import deque
from urllib.parse import urlsplit

import requests
DEFAULT_EXPECTED_CONTENT_TYPE = "text/html"
DEFAULT_OUTPATH_SUFFIX = mimetypes.guess_extension(DEFAULT_EXPECTED_CONTENT_TYPE)
DEFAULT_REQUEST_CHARSET = "UTF-8"
URL_FILENAME_TRANSLATION_TABLE = {ord(':') : '-', ord('/') : os.path.sep, ord('\\') : '-', ord('*') : '-', ord('?') : '-', ord('"') : '\'', ord('<') : '-', ord('>') : '-', ord('|') : '-', ord('\0') : '0', ord('.') : os.path.sep}
__CRAWLING_REQUEST_HEADERS = {
"Accept" : DEFAULT_EXPECTED_CONTENT_TYPE + ";application/xhtml+xml",
"Accept-Charset" : DEFAULT_REQUEST_CHARSET,
"User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/32.0.1700.102 Safari/537.36"
}
'''
See: https://github.com/reddit/reddit/wiki/API#user-content-rules
'''
__REDDIT_USER_AGENT_STR = "%(platform)s:%(app_id)s:%(version)s (by /u/%(reddit_username)s)" % {"platform" : sys.platform, "app_id" : __reddit_app_name__, "version" : __version__, "reddit_username" : __reddit_author_username__}
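# On a Linux host the pattern above yields a User-Agent string along the lines of
# "linux:subreddit-link-crawler:0.0.1 (by /u/errantlinguist)" (example value only;
# the first component follows sys.platform).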
__AUTH_TOKEN_REQUEST_HEADERS = {
"User-Agent" : __REDDIT_USER_AGENT_STR
}
class AuthTokenData(object):
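    '''
    Wraps the JSON body of a reddit OAuth2 token response. For the client-credentials
    flow used by this script the body is expected to look roughly like
    {"access_token": "...", "token_type": "bearer", "expires_in": 3600} -- a sketch of
    the standard OAuth2 token response, not an exhaustive schema.
    '''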
def __init__(self, auth_token_response):
json = auth_token_response.json()
self.token_type = json["token_type"]
self.access_token = json["access_token"]
self.auth_expiration_time = time.time() + int(json["expires_in"])
class CrawlStatistics(object):
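    '''
    Tracks which URLs have been attempted and which failed, along with the
    minimum/maximum link creation timestamps seen during the crawl.
    '''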
def __init__(self):
self.attempted = set()
self.failed = set()
self.time_range = (float("inf"), float("-inf"))
def notify_attempt(self, key, time):
# Calculate new range even for possible duplicate attempts
if time < self.time_range[0]:
self.time_range = (time, self.time_range[1])
if time > self.time_range[1]:
self.time_range = (self.time_range[0], time)
return self.__add_attempt(key, time)
def __add_attempt(self, key, time):
old_attempted_len = len(self.attempted)
self.attempted.add(key)
return len(self.attempted) > old_attempted_len
class SubredditLinkCrawler(object):
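    '''
    Pages through a subreddit's listing via the reddit OAuth API and saves every
    linked page below the given output directory, retrying failed downloads up to
    max_retries times and stopping after the configured limit (if any).
    '''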
def __init__(self, auth, subreddit, outdir, limit, batch_size, max_retries, user_agent):
self.auth = auth
self.get_method = "https://oauth.reddit.com/r/" + subreddit + "/.json"
if limit:
self.limit = limit
else:
self.limit = float("inf")
self.max_retries = max_retries
self.outdir = outdir
self.user_agent = user_agent
self.parsed_url_count = 0
self.last_parsed_thing = None
self.stats = CrawlStatistics()
self.processed_url_count = 0
self.__params = {"count" : self.parsed_url_count, "limit" : batch_size}
@property
def params(self):
self.__params["after"] = self.last_parsed_thing
self.__params["count"] = self.parsed_url_count
if self.remaining_url_count < self.__params["limit"]:
self.__params["limit"] = self.remaining_url_count
return self.__params
@property
def remaining_url_count(self):
return self.limit - self.processed_url_count
def crawl(self):
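        '''
        Runs the crawl loop: requests listing pages in batches, downloads each newly
        seen link via save_pages() and returns the resulting CrawlStatistics.
        '''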
auth_token_data = self.retrieve_auth_token()
while self.remaining_url_count > 0:
batch_urls = []
if auth_token_data.auth_expiration_time <= time.time():
print("Refreshing authentication token.", file=sys.stderr)
auth_token_data = self.refresh_auth_token(auth_token_data)
try:
next_page_response = self.__request_next_listing(auth_token_data)
except requests.HTTPError as e:
                code = e.response.status_code
if code == requests.status_codes.codes.unauthorized or code == requests.status_codes.codes.forbidden:
print("Refreshing authentication token.", file=sys.stderr)
auth_token_data = self.refresh_auth_token(auth_token_data)
# Try again after having refreshed the auth token but don't catch the next exception if one is raised
next_page_response = self.__request_next_listing(auth_token_data)
else:
                    raise
reddit_thing_urls, last_thing_name = parse_reddit_thing_urls_from_response(next_page_response)
for name, creation_timestamp, url in reddit_thing_urls:
self.parsed_url_count += 1
#print("Adding URL \"%s\" to batch." % url, file=sys.stderr)
if self.stats.notify_attempt(url, creation_timestamp):
batch_urls.append(url)
else:
print("URL \"%s\" has already been processed; Skipping." % url, file=sys.stderr)
#print("Processing %d crawled URLs." % len(batch_urls), file=sys.stderr)
failed_urls = save_pages(batch_urls, self.outdir, self.max_retries)
self.stats.failed.update(failed_urls)
successful_url_count = len(batch_urls) - len(failed_urls)
self.processed_url_count += successful_url_count
if last_thing_name:
self.last_parsed_thing = last_thing_name
else:
if self.limit == float("inf"):
print("End of subreddit encountered; Aborting after having processed %d URLs." %self.processed_url_count, file=sys.stderr)
else:
print("End of subreddit encountered; Aborting after having processed %d of an intended %d URLs." %(self.processed_url_count, self.limit), file=sys.stderr)
break
print("Retrieved %d out of %d unique pages returned by reddit." % (self.processed_url_count, len(self.stats.attempted)), file=sys.stderr)
return self.stats
    def refresh_auth_token(self, auth_token_data):
        # Access tokens obtained via the client-credentials grant cannot be refreshed,
        # so simply request a new token instead of attempting a refresh-token grant.
        return self.retrieve_auth_token()
def retrieve_auth_token(self):
auth_token_response = retrieve_auth_token(self.auth)
auth_token_response.raise_for_status()
return AuthTokenData(auth_token_response)
def __request_next_listing(self, auth_token_data):
        # (Re-)create the header dictionary on each request in case the auth token has been refreshed
headers = {
"Accept" : "application/json",
"Accept-Charset" : DEFAULT_REQUEST_CHARSET,
"Authorization": auth_token_data.token_type + " " + auth_token_data.access_token,
"User-Agent": self.user_agent}
result = requests.get(self.get_method, headers=headers, params=self.params)
result.raise_for_status()
return result
def create_url_filename(url_str, content_type):
# See also: http://stackoverflow.com/a/7406369/1391325
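    # For illustration (assumed example input, not taken from a real crawl): on a
    # POSIX system a URL such as "http://www.example.com/news/story.html" with
    # content type "text/html" maps to the relative path
    # "com/example/www/news/story.html".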
split_url = urlsplit(url_str)
netloc = split_url[1]
netloc_dirname = os.path.sep.join(reversed(netloc.split('.')))
path = split_url[2]
stripped_url_str = "".join((netloc_dirname, path))
url_without_ext, existing_ext = os.path.splitext(stripped_url_str)
filename_without_ext = url_without_ext.translate(URL_FILENAME_TRANSLATION_TABLE)
if filename_without_ext.endswith(os.path.sep):
filename_without_ext = filename_without_ext[:-len(os.path.sep)]
if existing_ext:
acceptable_filename_exts = mimetypes.guess_all_extensions(content_type)
if existing_ext in acceptable_filename_exts:
# Re-concatenate the now-normalized filename base with the original extension
result = filename_without_ext + existing_ext
else:
canonical_ext = mimetypes.guess_extension(content_type)
if canonical_ext:
# If a canonical extension was found for the given content type, concatenate it to the now-normalized filename base
result = filename_without_ext + canonical_ext
else:
# If no canonical extension was found, re-concatenate the original extension after normalizing it
normalized_existing_ext = normalize_url_component(existing_ext, ".")
result = filename_without_ext + normalized_existing_ext
else:
# Concatenate the canonical extension for the given content type to the result filename in order to avoid potential clashes with other URLs
canonical_ext = mimetypes.guess_extension(content_type)
if canonical_ext:
result = filename_without_ext + canonical_ext
else:
            # No canonical extension is known for this content type; fall back to the default suffix
result = filename_without_ext + DEFAULT_OUTPATH_SUFFIX
return result
def format_timestamp(posix_time):
    '''
    See: http://stackoverflow.com/a/37188257/1391325
    '''
    return datetime.datetime.fromtimestamp(posix_time, tz=datetime.timezone.utc).strftime('%Y-%m-%dT%H:%M:%SZ')
def normalize_url_component(component, ignored_prefix=None):
if ignored_prefix is not None and component.startswith(ignored_prefix):
substr_to_normalize = component[len(ignored_prefix):]
normalized_substr = substr_to_normalize.translate(URL_FILENAME_TRANSLATION_TABLE)
result = ignored_prefix + normalized_substr
else:
result = component.translate(URL_FILENAME_TRANSLATION_TABLE)
return result
def print_stats(stats, outfile):
    print("Oldest listed link date: " + format_timestamp(stats.time_range[0]), file=outfile)
    print("Newest listed link date: " + format_timestamp(stats.time_range[1]), file=outfile)
    print("Failed URLs:", file=outfile)
for failed_url in stats.failed:
print(failed_url, file=outfile)
def refresh_auth_token(refresh_token, auth):
post_data = {"grant_type": "refresh_token", "refresh_token" : refresh_token}
return requests.post("https://www.reddit.com/api/v1/access_token", auth=auth, data=post_data, headers=__AUTH_TOKEN_REQUEST_HEADERS)
def retrieve_auth_token(auth):
'''
Oauth2 authentication <https://github.com/reddit/reddit/wiki/OAuth2#user-content-authorization>
See: https://github.com/reddit/reddit/wiki/OAuth2-Quick-Start-Example#user-content-python-example
'''
post_data = {"grant_type": "client_credentials"}
return requests.post("https://www.reddit.com/api/v1/access_token", auth=auth, data=post_data, headers=__AUTH_TOKEN_REQUEST_HEADERS)
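# Note (clarifying detail, not in the original script comments): requests sends the
# "auth" tuple above as HTTP Basic credentials, so callers should pass the
# (client_id, client_secret) pair of the registered reddit app, as done in the
# __main__ block below.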
def save_pages(urls, outpath_prefix, max_retries):
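    '''
    Downloads each URL in "urls" and saves it beneath "outpath_prefix", re-queueing
    URLs that fail with an HTTP error until "max_retries" is exceeded. Returns the
    set of URLs which could not be retrieved.
    '''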
result = set()
url_attempt_queue = deque((url, 0) for url in urls)
while url_attempt_queue:
url, attempts = url_attempt_queue.popleft()
        # First try to guess the MIME type returned by the server for the given URL in order to guess the result filename
        guessed_content_type = mimetypes.guess_type(url)[0]
        if not guessed_content_type:
            guessed_content_type = DEFAULT_EXPECTED_CONTENT_TYPE
guessed_outpath_filename = create_url_filename(url, guessed_content_type)
outpath = os.path.join(outpath_prefix, guessed_outpath_filename)
if os.path.exists(outpath):
print("File path \"%s\" already exists; Skipping." % outpath, file=sys.stderr)
else:
#print("Downloading article \"%s\"." % url, file=sys.stderr)
try:
crawling_response = requests.get(url, headers=__CRAWLING_REQUEST_HEADERS)
try:
attempts += 1
crawling_response.raise_for_status()
                    response_content_type = crawling_response.headers.get("Content-Type")
                    if response_content_type:
                        # Strip any possible encoding data
                        response_content_type = parse_header(response_content_type)[0]
                        if response_content_type != guessed_content_type:
                            # Re-calculate the output path for the content type the server actually returned
                            outpath_filename = create_url_filename(url, response_content_type)
                            outpath = os.path.join(outpath_prefix, outpath_filename)
                    if os.path.exists(outpath):
                        print("File path \"%s\" already exists; Skipping." % outpath, file=sys.stderr)
                    else:
                        # After getting the response data, write it to file
                        write_to_unknown_dir(outpath, crawling_response.text)
                        print("%s > %s" % (url, outpath), file=sys.stderr)
except requests.HTTPError as e:
code = crawling_response.status_code
if attempts > max_retries:
print("Received HTTP status %d while requesting URL \"%s\" (attempt %d); Giving up." %(code, url, attempts), file=sys.stderr)
result.add(url)
else:
print("Received HTTP status %d while requesting URL \"%s\" (attempt %d); Will try again later." %(code, url, attempts), file=sys.stderr)
url_attempt_queue.append((url, attempts))
except requests.RequestException as e:
print("An irrecoverable error occurred while requesting URL \"%s\" (attempt %d); Giving up: %s" %(url, attempts, e), file=sys.stderr)
result.add(url)
return result
def parse_reddit_thing_urls_from_response(response):
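    '''
    Extracts the link-bearing "things" and the "after" pagination token from a
    listing response. The payload is assumed to follow reddit's Listing format,
    roughly {"data": {"after": "t3_...", "children": [{"data": {...}}, ...]}}
    (a sketch, not a complete schema).
    '''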
data = response.json()["data"]
reddit_thing_urls = parse_reddit_thing_urls(data)
last_thing_name = data["after"]
return reddit_thing_urls, last_thing_name
def parse_reddit_thing_urls(data):
children = data["children"]
#print("Processing %d child(ren)." % len(children))
for child in children:
child_data = child["data"]
child_name = child_data["name"]
child_creation_timestamp = child_data["created_utc"]
url_attr = "url"
url = child_data.get(url_attr)
if url:
yield (child_name, child_creation_timestamp, url)
else:
print("Reddit thing named \"%s\" has no \"%s\" attribute." %(child_name, url_attr), file=sys.stderr)
def write_to_unknown_dir(outpath, content):
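    '''
    Writes "content" to the file at "outpath", creating any missing parent
    directories first (hence "unknown dir").
    '''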
outdir = os.path.dirname(outpath)
if not os.path.exists(outdir):
os.makedirs(outdir)
with open(outpath, 'w') as outf:
outf.write(content)
if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Scrape all links from a given subreddit and then crawl and save each linked page.")
parser.add_argument("subreddit", help="The name of the subreddit to crawl.")
parser.add_argument("secret", help="The reddit application secret.")
parser.add_argument("outdir", help="The directory path under which the crawled pages will be saved.")
parser.add_argument("-l", "--limit", default=None, help="The total number of reddit links to process.", metavar="COUNT", type=int)
parser.add_argument("-b", "--batch-size", default=100, help="The maximum number of reddit things to request at once.", metavar="COUNT", type=int)
parser.add_argument("-r", "--max-retries", default=3, help="The number of times to re-try requesting a given URL if a non-successful HTTP code is returned on the first attempt.", metavar="COUNT", type=int)
args = parser.parse_args()
print("Scraping links from subreddit \"%s\" and saving to \"%s\"." % (args.subreddit, args.outdir), file=sys.stderr)
auth = ("_JNFnqor9ZT4mQ", args.secret)
crawler = SubredditLinkCrawler(auth, args.subreddit, args.outdir, args.limit, args.batch_size, args.max_retries, __REDDIT_USER_AGENT_STR)
stats = crawler.crawl()
print_stats(stats, sys.stdout)