-
Notifications
You must be signed in to change notification settings - Fork 2
/
subreddit_fetcher.py
107 lines (87 loc) · 4.22 KB
/
subreddit_fetcher.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
"""
Fetches all fields for the specified submissions on reddit and stores it into a CSV file.
Example Usage:
python reddit_fetcher.py --start=0 --end=200 --columns='internal_id,id,subreddit' --output_prefix="output"
Note there is a special field added, "internal_id", that is the integer value of base36 encoded reddit id.
"""
# Explaining this black magic:
# http://stackoverflow.com/questions/3828723/why-we-need-sys-setdefaultencodingutf-8-in-a-py-script
import sys
reload(sys)
sys.setdefaultencoding("utf-8")
# import codecs
import csv
import gflags
import praw
import time
from secret import REDDIT_USER_AGENT
from reddit_common import Submission
from reddit_common import submission_to_unicode
FLAGS = gflags.FLAGS
gflags.DEFINE_string('start_ts', None, 'Start fetching from timestamp.')
gflags.DEFINE_string('end_ts', None, 'End fetching at timestamp')
gflags.DEFINE_integer('step_days', 15, 'Fetch submissions in batches of step_days')
gflags.DEFINE_string('subreddits', '', 'CSV list of subreddits to fetch')
gflags.DEFINE_string('output_prefix', None, 'Location and file prefix of output CSV')
gflags.MarkFlagAsRequired('start_ts')
gflags.MarkFlagAsRequired('end_ts')
gflags.MarkFlagAsRequired('output_prefix')
gflags.MarkFlagAsRequired('subreddits')
# Constants
MAX_SUBMISSIONS_PER_QUERY = 100
# If the number of results > RESULTS_CORRECTNESS_CHECK, you should probably
# reduce --step_days. Reddit limits the number of results to ~1000, so to not
# truncate our results, we have this correctness check in place
RESULTS_CORRECTNESS_CHECK = 900
SECONDS_IN_A_DAY = 60 * 60 * 24
def epoch_to_date_string(epoch):
# pattern = '%Y-%m-%d %H:%M:%S'
pattern = '%Y-%m-%d'
return time.strftime(pattern, time.localtime(epoch))
def date_to_epoch(date_str):
pattern = '%Y-%m-%d'
return int(time.mktime(time.strptime(date_str, pattern)))
def fetch_submissions(start_epoch, end_epoch, subreddits, csv_writer):
r = praw.Reddit(user_agent=REDDIT_USER_AGENT)
seconds_to_increment = SECONDS_IN_A_DAY * FLAGS.step_days
total_records_fetched = 0
for subreddit in subreddits:
for i in xrange(start_epoch, end_epoch, seconds_to_increment):
segment_start = i
segment_end = segment_start + min(seconds_to_increment, end_epoch - i)
query = 'timestamp:%d..%d' % (segment_start, segment_end)
results = list(r.search(query, subreddit=subreddit,
sort='new', limit=None,
syntax='cloudsearch'))
if len(results) > RESULTS_CORRECTNESS_CHECK:
print ("WARNING: received %i results. This is dangerously close"
"to the max number of allowed results (1000)." % (
len(results)))
for result in results:
# We store the score as a string because we will have to write
# it.
submission = Submission(
result.id, result.title, result.selftext, result.url,
result.permalink, unicode(result.score), subreddit)
# submission = submission_fields_as_strings(submission)
# unicode_row = [s.encode('utf-8') for s in submission]
unicode_row = submission_to_unicode(submission)
csv_writer.writerow(unicode_row)
total_records_fetched += len(results)
segment_start_string = epoch_to_date_string(segment_start)
segment_end_string = epoch_to_date_string(segment_end)
print "[%s] - from %s to %s fetched %i results. Total submissions saved: %i" % (
subreddit, segment_start_string, segment_end_string, len(results),
total_records_fetched)
def main(argv):
argv = FLAGS(argv) # parse flags
output_file = FLAGS.output_prefix + '-' + FLAGS.start_ts + '-to-' + FLAGS.end_ts + '.csv'
start_epoch = date_to_epoch(FLAGS.start_ts)
end_epoch = date_to_epoch(FLAGS.end_ts)
subreddits = FLAGS.subreddits.split(',')
with open(output_file, 'w') as f:
csv_writer = csv.writer(f)
csv_writer.writerow(Submission._fields)
fetch_submissions(start_epoch, end_epoch, subreddits, csv_writer)
if __name__ == '__main__':
main(sys.argv)