forked from pfroud/ucsc-class-info-bot
-
Notifications
You must be signed in to change notification settings - Fork 0
/
scrape_reddit.py
323 lines (239 loc) · 10.8 KB
/
scrape_reddit.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
"""
Scrapes the self text and comments of a reddit submission for mentions of courses.
"""
from datetime import datetime
import praw # python wrapper for reddit api
import re # regular expressions
import pickle # serializer
import os.path
from pprint import pprint
from dept_names import dept_names
import database # used for pad_course_num() and load_database()
from database import CourseDatabase, Department, Course # need this to de-pickle these classes
# http://praw.readthedocs.org/en/stable/pages/writing_a_bot.html
# http://praw.readthedocs.org/en/stable/pages/comment_parsing.html
# previously " ?[0-9]+[A-Za-z]?"
regex = re.compile(" [0-9]+[A-Za-z]?")
def _get_mentions_in_string(source_):
"""Finds mentions of courses (department and number) in a string.
:param source_: string to look for courses in.
:type source_: str
:return: array of strings of course names
:rtype: list
"""
# TODO need to tweak this so it can see:
# 3zor5k: 'Math19B'
# 3zmpwg: 'HAVC40 or HAVC45'
str_in = source_.lower()
courses_found = []
# for subj in database.all_departments: # iterate subjects
for subj in ['havc']: # iterate subjects
# set start of search to beginning of string
start_of_next_search = 0
# search until reached end of sring
while start_of_next_search < len(str_in):
# trim away part of string already searched through
trimmed_str = str_in[start_of_next_search:]
# run string search
subj_start_index = trimmed_str.find(subj)
if subj_start_index >= 0: # if found a subject in body
# set string index where subject ends
subj_end_index = subj_start_index + len(subj)
# slice string to send to regex matcher. maximum of 5 extra chars needed
regex_substr = trimmed_str[subj_end_index: subj_end_index + 5]
# set next search to start after this one ends
start_of_next_search += subj_end_index
# search for course number
regex_result = regex.match(regex_substr)
if regex_result is not None: # if found a class number
# string with subject and course number
subj_with_number = trimmed_str[subj_start_index: subj_end_index + regex_result.end()]
courses_found.append(subj_with_number)
# print("matched string \"" + subj_with_number + "\".")
else:
break
return courses_found
def _remove_list_duplicates_preserve_order(input_list):
"""Removes duplicates from a list, while preserving order.
To do this easily without preserving order, do list(set(input_list)).
:param input_list:
:type input_list: list
:return:
:rtype: list
"""
new_list = []
for i in input_list:
if i not in new_list:
new_list.append(i)
return new_list
def get_mentions_in_submission(submission_):
    """Finds mentions of a course in a submission's title, selftext, and comments.

    Comments written by this bot ('ucsc-class-info-bot') are ignored so the bot does not
    pick up the courses listed in its own comment.

    :param submission_: a praw submission object
    :type submission_: praw.objects.Submission
    :return: an array of strings of course names, without duplicates, in order of first appearance
    :rtype: list
    """
    course_names = []
    course_names.extend(_get_mentions_in_string(submission_.title))
    course_names.extend(_get_mentions_in_string(submission_.selftext))

    flat_comments = praw.helpers.flatten_tree(submission_.comments)
    for comment in flat_comments:
        # The flattened tree can contain "load more comments" placeholders, which have no body.
        body = getattr(comment, 'body', None)
        if body is None:
            continue

        # author is None when the comment's account has been deleted
        author = getattr(comment, 'author', None)
        if author is not None and author.name == 'ucsc-class-info-bot':
            continue

        course_names.extend(_get_mentions_in_string(body))

    return _remove_list_duplicates_preserve_order(course_names)
def auth_reddit():
    """Load access information and return PRAW reddit api context.

    Reads 'access_information.pickle' (path relative to the current working directory) and
    passes its contents as keyword arguments to set_access_credentials().

    :return: praw instance
    :rtype: praw.__init__.AuthenticatedReddit
    """
    red = praw.Reddit(user_agent='desktop:ucsc-class-info-bot:v0.0.1 (by /u/ucsc-class-info-bot)',
                      site_name='ucsc_bot')

    # NOTE: unpickling can execute arbitrary code; this file must be locally generated and trusted.
    # The with-statement closes the file, so no explicit close() is needed.
    with open('access_information.pickle', 'rb') as file:
        access_information = pickle.load(file)

    red.set_access_credentials(**access_information)
    return red
def _get_course_obj_from_mention(db_, mention_):
"""Converts mention of course to course object
:param db_: course database with course info
:type db_: CourseDatabase
:param mention_: string of course mention, like 'econ 1'
:type mention_: str
:return: course database from the mention
:rtype: Course
"""
split = mention_.split(' ')
dept = split[0].lower()
# if dept == 'cs':
# dept = 'cmps'
# if dept == 'ce':
# dept = 'cmpe'
num = database.pad_course_num(split[1].upper())
# num = split[1].upper()
try:
course_obj = db_.depts[dept].courses[num]
except KeyError:
return None
return course_obj
def get_markdown(db_, mention_list_):
    """Builds the markdown comment describing every class mentioned in the list.

    Mentions that cannot be resolved to a course in the database are left out.

    :param db_: course database with info
    :type db_: CourseDatabase
    :param mention_list_: list of mentions, like ['econ 1', 'cmps 5j']
    :type mention_list_: list
    :return: string of markdown comment, or None when the list is empty
    :rtype: str
    """
    if not mention_list_:  # nothing mentioned, nothing to say
        return None

    # header first, then one section per resolvable course, then the footer
    pieces = ['Classes mentioned in this thread:\n\n \n\n']

    for mention in mention_list_:
        course = _get_course_obj_from_mention(db_, mention)
        if course is not None:  # None means the lookup hit a KeyError
            pieces.append(_course_to_markdown(course) + ' \n\n')

    pieces.append('---------------\n\n \n\n'
                  '*I am a bot. If I screw up, please comment or message me. '
                  '[I\'m open source!](https://github.com/pfroud/ucsc-class-info-bot)*')

    return ''.join(pieces)
# Pickle file, kept in the same directory as this script, holding the posts already commented on.
posts_with_comments_pickle_path = os.path.join(
    os.path.dirname(__file__),
    'posts_with_comments.pickle',
)
def _course_to_markdown(course_):
"""Returns a markdown representation of a course for use in reddit comments. Example:
'**ECON 1: Into to Stuff**
>We learn about econ and things.'
:param course_: Course to get markdown of
:type course_: Course
:return: string of markdown of the course
:rtype: str
"""
# TODO add the department name?
# dept_name = dept_names[course_.dept]
markdown_string = '**{} {}: {}**\n'.format(course_.dept.upper(), course_.number.strip('0'), course_.name)
markdown_string += '>{}\n\n'.format(course_.description)
return markdown_string
def save_posts_with_comments():
    """Saves to disk the dict of posts that have already been commented on.

    Pickles the module-level posts_with_comments to posts_with_comments_pickle_path.
    """
    # the with-statement closes the file, so no explicit close() is needed
    with open(posts_with_comments_pickle_path, 'wb') as file:
        pickle.dump(posts_with_comments, file)
def load_posts_with_comments():
    """Loads from disk the dict of posts that have already been commented on.

    :return: dict of posts that have already been commented on; an empty dict if the
        pickle file does not exist yet (e.g. on the first run)
    :rtype: dict
    """
    try:
        # the with-statement closes the file, so no explicit close() is needed
        with open(posts_with_comments_pickle_path, 'rb') as file:
            # NOTE: unpickling can execute arbitrary code; this file must only ever be
            # written by save_posts_with_comments().
            return pickle.load(file)
    except FileNotFoundError:
        # nothing has been saved yet, so no post has been commented on
        return {}
def post_comment(submission_, actually_do_it = True):
    """Posts, or updates, a comment on the submission with info about the courses mentioned.

    Reads the module-level posts_with_comments, reddit and db. Depending on what is already
    recorded for the submission, it adds a new comment, edits the existing one, or does nothing.
    In every case one log row describing the action is printed through _print_csv_row().

    :param submission_: submission object to post the comment to
    :type submission_: praw.objects.Submission
    :param actually_do_it: whether to actually post a comment to reddit.com
    :type actually_do_it: bool
    :return: nothing; every path returns None and the action taken is only reported in the printed row
    :rtype: None
    """
    submission_id = submission_.id

    mentions_current = get_mentions_in_submission(submission_)
    if not mentions_current:  # no mentions in the submission
        _print_csv_row(submission_, 'No mentions in thread.', [], [])
        return

    if submission_id in posts_with_comments.keys():  # already have a comment with class info
        already_commented_obj = posts_with_comments[submission_id]
        mentions_previous = already_commented_obj.mentions_list

        if mentions_current == mentions_previous:  # already commented, but no new classes have been mentioned
            _print_csv_row(submission_, 'No new mentions.', mentions_current, mentions_previous)
            return

        if actually_do_it:
            # 't1_' + comment id is the fullname used to fetch the bot's earlier comment
            existing_comment = reddit.get_info(thing_id = 't1_' + already_commented_obj.comment_id)
            existing_comment.edit(get_markdown(db, mentions_current))
            # remember what the edited comment now covers
            posts_with_comments[submission_id].mentions_list = mentions_current
        # NOTE(review): the row is printed on dry runs too - confirm that is intended
        _print_csv_row(submission_, 'Edited comment.', mentions_current, mentions_previous)

    else:  # first time the bot sees mentions in this submission
        if actually_do_it:
            new_comment = submission_.add_comment(get_markdown(db, mentions_current))
            # record the new comment so later runs edit it instead of posting again
            posts_with_comments[submission_id] = ExistingComment(new_comment.id, mentions_current)
        _print_csv_row(submission_, 'Comment added.', mentions_current, [])
def _print_csv_row(submission_, action, mentions_current, mentions_previous):
"""Prints a CSV row to stdout to be used as a log about what happened with a comment.
:param submission_: Submission object that you are commenting on
:type submission_: praw.objects.Submission
:param action: string describing the action taken
:type action: str
:param mentions_current: list of current class mentions
:type mentions_current: list
:param mentions_previous: list of class mentions last known about
:type mentions_previous: list
"""
print( # I have put the string on it's own line b/c PyCharm's formatter and PEP inspector want different things
'{id}{_}{author}{_}{title}{_}{action}{_}{mentions_current}{_}{mentions_previous}'
.format(
id = submission_.id,
author = submission_.author,
title = submission_.title,
action = action,
mentions_current = mentions_current,
mentions_previous = mentions_previous,
_ = '\t'))
class ExistingComment:
    """Record of a class-info comment that has already been posted.

    Holds the id of the posted comment and the list of mentions that comment covers.
    """

    def __init__(self, comment_id_, mentions_):
        # id of the comment, as returned when the comment was posted
        self.comment_id = comment_id_
        # mentions the comment currently lists
        self.mentions_list = mentions_

    def __str__(self):
        template = '"{}"->"{}"'
        return template.format(self.comment_id, self.mentions_list)
# Entry point. The leftover debugging lines (print of a test mention followed by exit())
# made everything below unreachable and have been removed. The guard keeps the bot from
# running when this file is imported.
if __name__ == '__main__':
    # print('Started {}.'.format(datetime.now()))

    # Module-level state read by post_comment() and save_posts_with_comments().
    posts_with_comments = load_posts_with_comments()
    db = database.load_database()
    reddit = auth_reddit()

    # header row for the tab-separated log printed by _print_csv_row()
    print('id{_}author{_}title{_}action{_}current mentions{_}previous mentions'.format(_='\t'))

    # post_comment(reddit.get_submission(submission_id = '3yw5sz'))  # on /r/bottesting

    subreddit = reddit.get_subreddit('ucsc')
    try:
        for submission in subreddit.get_new():
            # print(submission)
            post_comment(submission)
    finally:
        # Save even if a submission fails part-way through, so comments already posted
        # in this run are remembered and not posted again next time.
        save_posts_with_comments()