def main(subreddit_list):
    """Dump the longer self-posts of each subreddit to CSV, one sentence per row.

    For every subreddit named in ``subreddit_list`` a file
    ``../acl/<subreddit>_user_perline.csv`` is written with the columns
    ``SeqId, InstNo, Author, Text``.  Posts are skipped when they have no
    self-text, when the text is ``[removed]``, or when the author is
    ``[deleted]`` or ``AutoModerator``.  Only posts that split into more
    than four sentences are written.

    Args:
        subreddit_list: Iterable of subreddit names or paths/URLs; only the
            part after the last ``/`` of each stripped entry is used.
    """
    reddit = Reddit(config.data_location)
    subreddits = {entry.strip().split("/")[-1] for entry in subreddit_list}
    ignored_authors = ('[deleted]', 'AutoModerator')

    for subreddit in subreddits:
        sub = reddit.get_subreddit(subreddit)
        # newline='' is what the csv module expects (otherwise rows can be
        # separated by blank lines on Windows); an explicit encoding keeps
        # non-ASCII post text from failing under a narrow locale default.
        with open(f'../acl/{subreddit}_user_perline.csv', 'w',
                  newline='', encoding='utf-8') as fp:
            csv_file = csv.writer(fp)
            csv_file.writerow(['SeqId', 'InstNo', 'Author', 'Text'])
            for post in sub.posts:
                selftext = post.get('selftext')
                if not selftext or selftext == '[removed]':
                    continue
                if post['author'] in ignored_authors:
                    continue

                flattened = selftext.replace('\n', ' ').lower()
                sentences = nltk.tokenize.sent_tokenize(flattened)
                if len(sentences) <= 4:
                    continue

                for index, sentence in enumerate(sentences):
                    tokens = nltk.tokenize.word_tokenize(clean(sentence))
                    # NOTE(review): the 'Author' column receives the
                    # subreddit name, not post['author'] -- kept as in the
                    # original; confirm this is intended.
                    csv_file.writerow([
                        post.get('id'), index, post['subreddit'],
                        ' '.join(tokens)
                    ])
def main(user_list, subreddit_list, output_file):
    """Write per-user, per-month post counts for each subreddit as CSV.

    Output rows are ``username, month, subreddit, count`` where ``month`` is
    the ``%Y-%m`` rendering of the post's ``created_utc`` timestamp.
    Subreddits are processed in sorted order; within a subreddit the rows
    are sorted by user and then by month.

    Args:
        user_list: Iterable of lines; the text after the last ``,`` of each
            stripped line is taken as a user name.
        subreddit_list: Iterable of lines; the text after the last ``/`` of
            each stripped line is taken as a subreddit name.
        output_file: Writable text file object that receives the CSV.
    """
    reddit = Reddit(config.data_location)
    subreddits = {forum.strip().split("/")[-1] for forum in subreddit_list}
    users = {useritem.strip().split(",")[-1] for useritem in user_list}
    # Blank input lines produce empty names; discard() drops them without
    # needing a try/except around a missing element.
    subreddits.discard("")
    users.discard("")

    csvf = csv.writer(output_file)
    csvf.writerow(["username", "month", "subreddit", "count"])
    for s in sorted(subreddits):
        print(s)
        subcount = defaultdict(
            lambda: defaultdict(int))  # author -> month -> count
        for post in reddit.get_subreddit(s).posts:
            author = post.get("author", "")
            # `users` stays a set so this test is O(1) for every post.
            if author in users:
                month = datetime.utcfromtimestamp(
                    post["created_utc"]).strftime('%Y-%m')
                subcount[author][month] += 1
        for u in sorted(subcount):
            for t in sorted(subcount[u]):
                csvf.writerow([u, t, s, subcount[u][t]])
def main(subreddit_file, database):
    """Report post ids found in the subreddit data but missing from the database.

    Reads all ``entity_source_id`` values tagged ``reddit#id#POST`` through
    the cursor ``cur``, collects the ``id`` of every post of every listed
    subreddit, prints both totals and then up to 1000 ids that are present
    in the subreddit data but not in the database.

    Args:
        subreddit_file: Iterable of lines, one subreddit name per line.
        database: Database name; only used in the summary line.
    """
    cur.execute(
        "select entity_source_id from data_source_instance "
        "where entity_source_descriptor = 'reddit#id#POST';")
    keys = {row["entity_source_id"] for row in cur.fetchall()}

    subreddits = {s.strip() for s in subreddit_file}
    keys2 = set()
    for subreddit in subreddits:
        # NOTE(review): get_subreddit is called on `Reddit` itself rather
        # than on an instance -- kept as in the original; confirm.
        s = Reddit.get_subreddit(subreddit)
        # A set has no extend(); update() is the set equivalent.
        keys2.update(p["id"] for p in s.posts)

    # The original printed the undefined name `db`; the otherwise unused
    # `database` parameter is what the message refers to.  The spacing
    # matches what the comma-separated print statement produced.
    print("Found  %d keys in %s  versus %d keys in directory"
          % (len(keys), database, len(keys2)))
    print("\n".join(list(keys2.difference(keys))[:1000]))
def main(subreddits, subreddit_list, top_n):
    """Print one ``<subreddit>,<author>`` line per top author.

    Args:
        subreddits: Iterable of subreddit names given directly.
        subreddit_list: Optional iterable of lines; the text after the last
            ``/`` of each stripped line is appended to the names above.
            May be ``None``.
        top_n: Passed through to each subreddit's ``top_authors``.
    """
    reddit = Reddit(config.data_location)

    names = list(subreddits)
    if subreddit_list is not None:
        from_file = [line.strip().split("/")[-1] for line in subreddit_list]
        names += from_file

    for name in names:
        forum = reddit.get_subreddit(name)
        for author in forum.top_authors(top_n):
            print(name + "," + author)
def main(subreddit, top_n):
    """Pull posts for the most frequent authors of one subreddit.

    Counts posts per author in ``subreddit``, ignores the ``[deleted]``
    placeholder, prints the ``top_n`` most frequent authors and passes them
    to ``pull_posts``.

    Args:
        subreddit: Name of the subreddit to inspect.
        top_n: Number of authors to keep.
    """
    reddit = Reddit(config.data_location)
    subr = reddit.get_subreddit(subreddit)
    users = Counter(post['author'] for post in subr.posts)

    # remove deleted account (Counter tolerates a missing key here)
    del users['[deleted]']

    # Built directly from most_common() so that an empty result is an empty
    # tuple instead of a StopIteration out of next(zip(...)).
    top_authors = tuple(author for author, _ in users.most_common(top_n))
    if not top_authors:
        print('No authors found; nothing to pull.')
        return

    print('Pulling the following authors...')
    print('\n'.join(top_authors))
    print(top_authors)

    # pull 1000 posts for each user each time
    pull_posts(1000, authors=top_authors)
def _post_row(post):
    """Build the CSV row for one self-post, in header column order."""
    return [
        post["id"], None, post["author"], post["author_flair_text"],
        str(post["over_18"]), str(post["score"]), post["subreddit"],
        "Reddit", post["title"],
        datetime.fromtimestamp(post["created"], tz).isoformat(),
        # Callers only pass posts with a non-empty selftext, so no fallback
        # to post["url"] is needed (and a missing "url" cannot raise).
        "reddit", post["selftext"],
    ]


def _write_posts(csvp, source, seen, limit):
    """Write the unseen self-posts of one subreddit or user.

    Returns ``(remaining_limit, limit_reached)``.
    """
    postids = set(source.post_ids) - seen
    for i, post_id in enumerate(postids):
        post = source.post(post_id)
        if i % 1000 == 999:
            print("post", i, "of", len(postids), limit, "to go")
        if "selftext" not in post or post["selftext"] == "":
            continue  # Skip URL-only posts
        if "subreddit" not in post:
            print("No subreddit in post " + post["id"])
            continue
        if post["id"] in seen:
            continue
        csvp.writerow(_post_row(post))
        limit -= 1
        if limit == 0:
            return limit, True
    return limit, False


def posts2csv(post_f, authors=None, subreddits=None, seen_posts=None,
              verbose=True, limit=1000):
    """Export self-posts of the given subreddits and users as CSV.

    Subreddits are processed first, then authors.  Posts without self-text,
    posts without a ``subreddit`` field and posts whose id is in
    ``seen_posts`` are skipped.  Writing stops once ``limit`` rows have been
    written; a ``limit`` of zero or less never reaches the stop condition.

    Args:
        post_f: Writable text file object that receives the CSV.
        authors: Iterable of user names, or ``None`` for no users.
        subreddits: Iterable of subreddit names, or ``None`` for none.
        seen_posts: Iterable of post ids to skip, or ``None``.
        verbose: Accepted for compatibility; not used by this function.
        limit: Maximum number of rows to write.
    """
    reddit = Reddit(MongoClient('mongodb://127.0.0.1:27017')["reddit"])
    # `or ()` lets the documented None defaults mean "nothing to export"
    # instead of failing when iterated.
    subreddits = [reddit.get_subreddit(s) for s in subreddits or ()]
    authors = [reddit.get_user(a) for a in authors or ()]
    seen = set(seen_posts) if seen_posts is not None else set()

    # subreddit info doesn't seem to have the "subreddit_id".
    # To do: get that with r/subreddit/<name>/about
    # for now, use subreddit name as forum identifier
    csvp = csv.writer(post_f)
    csvp.writerow("id,replyto,username,user_annotation_flairtext,annotation_over18,annotation_score,forum,discourse,title,when,dataset_file,post".split(","))

    for source in subreddits + authors:
        print(source.name)
        limit, limit_reached = _write_posts(csvp, source, seen, limit)
        if limit_reached:
            return
def test_report(self):
    """Report a visible submission as another user and check it is listed."""
    # Log in as a separate user to file the report.
    reporter = Reddit(USER_AGENT)
    reporter.login('PyApiTestUser3', '1111')

    newest = reporter.get_subreddit(self.sr).get_new_by_date()
    submission = next((item for item in newest if not item.hidden), None)
    if not submission:
        self.fail('Could not find a non-reported submission.')
    submission.report()

    # The report must now show up in the subreddit's report listing.
    reports = self.r.get_subreddit(self.sr).get_reports()
    if not any(report.id == submission.id for report in reports):
        self.fail('Could not find reported submission.')
def main(subreddit_list, keyword_list, topic_list):
    """Export self-posts that match any keyword or topic, one CSV per subreddit.

    For every subreddit a file ``../lgbtq/data/<subreddit>.csv`` is written
    with the columns ``PostId, PostTime, author, PostContent, MatchingWord,
    MatchTopic``.  Posts are skipped when they have no self-text, when the
    text is ``[removed]``, or when the author is ``[deleted]`` or
    ``AutoModerator``.  A post is written only if ``match`` finds at least
    one keyword or topic in its cleaned, lower-cased text.

    Args:
        subreddit_list: Iterable of subreddit names or paths/URLs; only the
            part after the last ``/`` of each stripped entry is used.
        keyword_list: Iterable of keywords (stripped and lower-cased).
        topic_list: Iterable of topics (stripped and lower-cased).
    """
    reddit = Reddit(config.data_location)
    subreddits = {entry.strip().split("/")[-1] for entry in subreddit_list}
    keywords = {keyword.strip().lower() for keyword in keyword_list}
    print(keywords)
    topics = {topic.strip().lower() for topic in topic_list}
    print(topics)
    ignored_authors = ('[deleted]', 'AutoModerator')

    for subreddit in subreddits:
        sub = reddit.get_subreddit(subreddit)
        # newline='' is what the csv module expects (otherwise rows can be
        # separated by blank lines on Windows); an explicit encoding keeps
        # non-ASCII post text from failing under a narrow locale default.
        with open(f'../lgbtq/data/{subreddit}.csv', 'w',
                  newline='', encoding='utf-8') as fp:
            csv_file = csv.writer(fp)
            csv_file.writerow([
                'PostId', 'PostTime', 'author', 'PostContent',
                'MatchingWord', 'MatchTopic'
            ])
            for post in sub.posts:
                selftext = post.get('selftext')
                if not selftext or selftext == '[removed]':
                    continue
                if post['author'] in ignored_authors:
                    continue

                clean_text = clean(selftext.replace('\n', ' ').lower())
                matched_keywords = set(match(keywords, clean_text))
                matched_topics = set(match(topics, clean_text))
                if not matched_keywords and not matched_topics:
                    continue

                csv_file.writerow([
                    post.get('id'),
                    time.ctime(post.get('created_utc')),
                    post['author'],
                    clean_text,
                    # An empty match set is written as an empty cell.
                    matched_keywords or None,
                    matched_topics or None,
                ])
class ModUtils(object): VERSION = '0.1.dev' def __init__(self, subreddit, site=None, verbose=None): self.reddit = Reddit(str(self), site) self.sub = self.reddit.get_subreddit(subreddit) self.verbose = verbose self._current_flair = None def __str__(self): return 'BBoe\'s ModUtils %s' % self.VERSION def add_users(self, category): mapping = {'banned': 'ban', 'contributors': 'make_contributor', 'moderators': 'make_moderator'} if category not in mapping: print '%r is not a valid option for --add' % category return func = getattr(self.sub, mapping[category]) print 'Enter user names (any separation should suffice):' data = sys.stdin.read().strip() for name in re.split('[^A-Za-z_]+', data): func(name) print 'Added %r to %s' % (name, category) def current_flair(self): if self._current_flair is None: self._current_flair = [] if self.verbose: print 'Fetching flair list for %s' % self.sub for flair in self.sub.flair_list(): self._current_flair.append(flair) yield flair else: for item in self._current_flair: yield item def flair_template_sync(self, editable, limit, # pylint: disable-msg=R0912 static, sort, use_css, use_text): # Parameter verification if not use_text and not use_css: raise Exception('At least one of use_text or use_css must be True') sorts = ('alpha', 'size') if sort not in sorts: raise Exception('Sort must be one of: %s' % ', '.join(sorts)) # Build current flair list along with static values if static: counter = dict((x, limit) for x in static) else: counter = {} if self.verbose: sys.stdout.write('Retrieving current flair') sys.stdout.flush() for flair in self.current_flair(): if self.verbose: sys.stdout.write('.') sys.stdout.flush() if use_text and use_css: key = (flair['flair_text'], flair['flair_css_class']) elif use_text: key = flair['flair_text'] else: key = flair['flair_css_class'] if key in counter: counter[key] += 1 else: counter[key] = 1 if self.verbose: print # Sort flair list items according to the specified sort if sort == 'alpha': items = 
sorted(counter.items()) else: items = sorted(counter.items(), key=lambda x: x[1], reverse=True) # Clear current templates and store flair according to the sort if self.verbose: print 'Clearing current flair templates' self.sub.clear_flair_templates() for key, count in items: if not key or count < limit: continue if use_text and use_css: text, css = key elif use_text: text, css = key, '' else: text, css = '', key if self.verbose: print 'Adding template: text: "%s" css: "%s"' % (text, css) self.sub.add_flair_template(text, css, editable) def login(self, user, pswd): if self.verbose: print 'Logging in' self.reddit.login(user, pswd) if self.verbose: print 'Fetching moderator list for %s' % self.sub if str(self.sub).lower() not in [str(x).lower() for x in self.reddit.user.my_moderation()]: raise Exception('You do not moderate %s' % self.sub) def message(self, category, subject, msg_file): users = getattr(self.sub, 'get_%s' % category)() if not users: print 'There are no %s on %s.' % (category, str(self.sub)) return if msg_file: try: msg = open(msg_file).read() except IOError, error: print str(error) return else:
class SubRedditStats(object):
    """Collect submission and comment statistics for one subreddit.

    Typical use: construct, optionally ``login``, fetch submissions with
    ``fetch_recent_submissions`` or ``fetch_top_submissions``, run
    ``process_submitters`` and ``process_commenters``, then call
    ``publish_results`` to build (and optionally submit) the report.
    """

    VERSION = '0.2.0'

    post_prefix = 'Subreddit Stats:'
    post_header = '---\n###%s\n'
    post_footer = ('>Generated with [BBoe](/user/bboe)\'s [Subreddit Stats]'
                   '(https://github.com/bboe/subreddit_stats) \n%s'
                   'SRS Marker: %d')
    # Matches the marker written by post_footer so a later run can find the
    # end of the previously reported period.
    re_marker = re.compile(r'SRS Marker: (\d+)')

    @staticmethod
    def _previous_max(submission):
        """Return the last SRS marker in ``submission.selftext`` as a float.

        Prints a message and exits the process with status 1 when no marker
        can be read from the text.
        """
        try:
            val = SubRedditStats.re_marker.findall(submission.selftext)[-1]
            return float(val)
        except (IndexError, TypeError):
            print('End marker not found in previous submission. Aborting')
            sys.exit(1)

    @staticmethod
    def _permalink(permalink):
        """Shorten a permalink to a ``/comments/<id>/_/...`` relative link.

        The submission id is the 7th ``/``-separated token; a non-empty 9th
        token is treated as a comment id.
        """
        tokens = permalink.split('/')
        # A submission permalink without a trailing slash has no 9th token
        # at all; treat it like the empty-token (submission) case.
        if len(tokens) < 9 or tokens[8] == '':  # submission
            return '/comments/%s/_/' % (tokens[6])
        # comment
        return '/comments/%s/_/%s?context=1' % (tokens[6], tokens[8])

    @staticmethod
    def _user(user):
        """Return a markdown link to ``user`` with underscores escaped."""
        return '[%s](/user/%s)' % (user.replace('_', '\\_'), user)

    @staticmethod
    def _comment_score(comment):
        """Return a comment's net score (ups minus downs)."""
        return comment.ups - comment.downs

    def __init__(self, subreddit, site, verbosity):
        """Create the reddit session and empty result containers.

        Args:
            subreddit: Name of the subreddit to analyse.
            site: Passed to ``Reddit`` as its second argument.
            verbosity: Messages with a level up to this value are printed.
        """
        self.reddit = Reddit(str(self), site)
        self.subreddit = self.reddit.get_subreddit(subreddit)
        self.verbosity = verbosity
        self.submissions = []
        self.comments = []
        self.submitters = defaultdict(list)
        self.commenters = defaultdict(list)
        self.min_date = 0
        # Posts from the last three days are excluded by default.
        self.max_date = time.time() - DAYS_IN_SECONDS * 3
        self.prev_srs = None
        # Config
        self.reddit.config.comment_limit = -1  # Fetch max comments possible
        self.reddit.config.comment_sort = 'top'

    def __str__(self):
        return 'BBoe\'s SubRedditStats %s' % self.VERSION

    def login(self, user, pswd):
        """Log the reddit session in as ``user``."""
        if self.verbosity > 0:
            print('Logging in')
        self.reddit.login(user, pswd)

    def msg(self, msg, level, overwrite=False):
        """Write ``msg`` to stdout if the verbosity is at least ``level``.

        With ``overwrite`` the line ends in a carriage return (and stdout is
        flushed) so the next message replaces it; otherwise in a newline.
        """
        if self.verbosity >= level:
            sys.stdout.write(msg)
            if overwrite:
                sys.stdout.write('\r')
                sys.stdout.flush()
            else:
                sys.stdout.write('\n')

    def prev_stat(self, prev_url):
        """Use the stats post at ``prev_url`` as the start of this period."""
        submission = self.reddit.get_submission(prev_url)
        self.min_date = self._previous_max(submission)
        self.prev_srs = prev_url

    def _finalize_submissions(self):
        """Sort collected submissions by date and tighten the date range.

        Returns False when no submissions were collected, True otherwise.
        """
        self.msg('DEBUG: Found %d submissions' % len(self.submissions), 1)
        if not self.submissions:
            return False

        # Update real min and max dates
        self.submissions.sort(key=lambda x: x.created_utc)
        self.min_date = self.submissions[0].created_utc
        self.max_date = self.submissions[-1].created_utc
        return True

    def fetch_recent_submissions(self, max_duration, after, exclude_self,
                                 since_last=True):
        """Fetch recent submissions in the subreddit within date boundaries.

        Does not include posts within the last three days as their scores
        may not be representative.

        Keyword arguments:
        max_duration -- When set, specifies the number of days to include
        after -- When set, fetch all submission after this submission id.
        exclude_self -- When true, don't include self posts.
        since_last -- When true use info from last submission to determine
                      the stop point

        Returns True if at least one submission was collected.
        """
        if max_duration:
            self.min_date = self.max_date - DAYS_IN_SECONDS * max_duration
        url_data = {'after': after} if after else None
        self.msg('DEBUG: Fetching submissions', 1)
        for submission in self.subreddit.get_new_by_date(limit=None,
                                                         url_data=url_data):
            if submission.created_utc > self.max_date:
                continue
            if submission.created_utc <= self.min_date:
                break
            if (since_last and
                    str(submission.author) == str(self.reddit.user) and
                    submission.title.startswith(self.post_prefix)):
                # Use info in this post to update the min_date
                # And don't include this post
                self.msg('Found previous: %s' % submission.title, 2)
                if self.prev_srs is None:  # Only use the most recent
                    self.min_date = max(self.min_date,
                                        self._previous_max(submission))
                    self.prev_srs = submission.permalink
                continue
            if exclude_self and submission.is_self:
                continue
            self.submissions.append(submission)
        return self._finalize_submissions()

    def fetch_top_submissions(self, top, exclude_self):
        """Fetch the subreddit's top submissions for a time window.

        Keyword arguments:
        top -- One of day, week, month, year, all
        exclude_self -- When true, don't include self posts.

        Raises TypeError for any other ``top`` value.  Returns True if at
        least one submission was collected.
        """
        if top not in ('day', 'week', 'month', 'year', 'all'):
            raise TypeError('%r is not a valid top value' % top)
        self.msg('DEBUG: Fetching submissions', 1)
        url_data = {'t': top}
        for submission in self.subreddit.get_top(limit=None,
                                                 url_data=url_data):
            if exclude_self and submission.is_self:
                continue
            self.submissions.append(submission)
        return self._finalize_submissions()

    def process_submitters(self):
        """Group the collected submissions by author name."""
        self.msg('DEBUG: Processing Submitters', 1)
        for submission in self.submissions:
            if submission.author:
                self.submitters[str(submission.author)].append(submission)

    def process_commenters(self):
        """Collect all comments of the submissions and group them by author."""
        num = len(self.submissions)
        self.msg('DEBUG: Processing Commenters on %d submissions' % num, 1)
        for i, submission in enumerate(self.submissions):
            self.msg('%d/%d submissions' % (i + 1, num), 2, overwrite=True)
            if submission.num_comments == 0:
                continue
            try:
                self.comments.extend(submission.all_comments_flat)
            except Exception as exception:
                # Best effort: report the failure and keep going with
                # whatever comments are still reachable below.
                print('Exception fetching comments on %r: %s'
                      % (submission.content_id, str(exception)))
            for orphans in submission._orphaned.values():
                self.comments.extend(orphans)
        for comment in self.comments:
            if comment.author:
                self.commenters[str(comment.author)].append(comment)

    def basic_stats(self):
        """Return the markdown summary table of totals and vote percentages."""
        sub_ups = sum(x.ups for x in self.submissions)
        sub_downs = sum(x.downs for x in self.submissions)
        comm_ups = sum(x.ups for x in self.comments)
        comm_downs = sum(x.downs for x in self.comments)

        def percentages(ups, downs):
            # With no votes at all both shares are reported as 0 instead of
            # dividing by zero.  Floor division keeps integer percentages.
            total = ups + downs
            if not total:
                return 0, 0
            up_perc = ups * 100 // total
            return up_perc, 100 - up_perc

        sub_up_perc, sub_down_perc = percentages(sub_ups, sub_downs)
        comm_up_perc, comm_down_perc = percentages(comm_ups, comm_downs)

        values = [('Total', len(self.submissions), '',
                   len(self.comments), ''),
                  ('Unique Redditors', len(self.submitters), '',
                   len(self.commenters), ''),
                  ('Upvotes', sub_ups, '%d%%' % sub_up_perc,
                   comm_ups, '%d%%' % comm_up_perc),
                  ('Downvotes', sub_downs, '%d%%' % sub_down_perc,
                   comm_downs, '%d%%' % comm_down_perc)]

        retval = '||Submissions|%|Comments|%|\n:-:|--:|--:|--:|--:\n'
        retval += ''.join('__%s__|%d|%s|%d|%s\n' % quad for quad in values)
        return '%s\n' % retval

    def top_submitters(self, num, num_submissions):
        """Return markdown for the ``num`` best submitters.

        Submitters are ranked by total score, then by submission count; up
        to ``num_submissions`` of each one's best submissions are listed.
        Returns an empty string when there is nothing to list.
        """
        num = min(num, len(self.submitters))
        if num <= 0:
            return ''

        top_submitters = sorted(self.submitters.items(), reverse=True,
                                key=lambda x: (sum(y.score for y in x[1]),
                                               len(x[1])))[:num]

        parts = [self.post_header % 'Top Submitters\' Top Submissions']
        for (author, submissions) in top_submitters:
            parts.append('0. %d pts, %d submissions: %s\n' % (
                sum(x.score for x in submissions), len(submissions),
                self._user(author)))
            for sub in sorted(submissions, reverse=True,
                              key=lambda x: x.score)[:num_submissions]:
                title = sub.title.replace('\n', ' ').strip()
                # Link the title only when the post points somewhere other
                # than its own comments page.
                if sub.permalink != sub.url:
                    parts.append(' 0. [%s](%s)' % (title, sub.url))
                else:
                    parts.append(' 0. %s' % title)
                parts.append(' (%d pts, [%d comments](%s))\n' % (
                    sub.score, sub.num_comments,
                    self._permalink(sub.permalink)))
            parts.append('\n')
        return ''.join(parts)

    def top_commenters(self, num):
        """Return markdown for the ``num`` commenters with the best net score."""
        score = self._comment_score

        num = min(num, len(self.commenters))
        if num <= 0:
            return ''

        top_commenters = sorted(self.commenters.items(), reverse=True,
                                key=lambda x: (sum(score(y) for y in x[1]),
                                               len(x[1])))[:num]

        retval = self.post_header % 'Top Commenters'
        for author, comments in top_commenters:
            retval += '0. %s (%d pts, %d comments)\n' % (
                self._user(author), sum(score(x) for x in comments),
                len(comments))
        return '%s\n' % retval

    def top_submissions(self, num):
        """Return markdown for the ``num`` highest-scoring submissions."""
        num = min(num, len(self.submissions))
        if num <= 0:
            return ''

        top_submissions = sorted(self.submissions, reverse=True,
                                 key=lambda x: x.score)[:num]

        retval = self.post_header % 'Top Submissions'
        for sub in top_submissions:
            author = str(sub.author)
            title = sub.title.replace('\n', ' ').strip()
            if sub.permalink != sub.url:
                retval += '0. [%s](%s)' % (title, sub.url)
            else:
                retval += '0. %s' % title
            retval += ' by %s (%d pts, [%d comments](%s))\n' % (
                self._user(author), sub.score, sub.num_comments,
                self._permalink(sub.permalink))
        return '%s\n' % retval

    def top_comments(self, num):
        """Return markdown for the ``num`` comments with the best net score."""
        score = self._comment_score

        num = min(num, len(self.comments))
        if num <= 0:
            return ''

        top_comments = sorted(self.comments, reverse=True, key=score)[:num]
        retval = self.post_header % 'Top Comments'
        for comment in top_comments:
            author = str(comment.author)
            title = comment.submission.title.replace('\n', ' ').strip()
            retval += ('0. %d pts: %s\'s [comment](%s) in %s\n'
                       % (score(comment), self._user(author),
                          self._permalink(comment.permalink), title))
        return '%s\n' % retval

    def publish_results(self, subreddit, submitters, commenters, submissions,
                        comments, top, debug=False):
        """Build the report and submit it to ``subreddit`` after confirmation.

        The per-submitter listing is shortened step by step while the body
        exceeds ``MAX_BODY_SIZE``.  If the body is still too large, or
        ``debug`` is true, or the user declines, or submitting fails, the
        title and body are printed instead.

        Args:
            subreddit: Subreddit to submit the report to.
            submitters: Number of top submitters to include.
            commenters: Number of top commenters to include.
            submissions: Number of top submissions to include.
            comments: Number of top comments to include.
            top: Truthy when the data came from a "top" listing (affects
                the title only).
            debug: When true, never submit; only print.
        """
        def timef(timestamp):
            # NOTE(review): the time is rendered in the local timezone but
            # always labelled 'PDT' -- kept as in the original.
            dtime = datetime.fromtimestamp(timestamp)
            return dtime.strftime('%Y-%m-%d %H:%M PDT')

        title = '%s %s %ssubmissions from %s to %s' % (
            self.post_prefix, str(self.subreddit), 'top ' if top else '',
            timef(self.min_date), timef(self.max_date))
        if self.prev_srs:
            prev = '[Previous Stat](%s) \n' % self._permalink(self.prev_srs)
        else:
            prev = ''

        basic = self.basic_stats()
        t_commenters = self.top_commenters(commenters)
        t_submissions = self.top_submissions(submissions)
        t_comments = self.top_comments(comments)
        footer = self.post_footer % (prev, self.max_date)

        body = ''
        num_submissions = 10
        # Always build the body once, then retry with fewer submissions per
        # submitter while it is too large.
        while body == '' or (len(body) > MAX_BODY_SIZE and
                             num_submissions > 2):
            t_submitters = self.top_submitters(submitters, num_submissions)
            body = (basic + t_submitters + t_commenters + t_submissions +
                    t_comments + footer)
            num_submissions -= 1

        if len(body) > MAX_BODY_SIZE:
            print('The resulting message is too big. Not submitting.')
            debug = True

        if not debug:
            try:
                ask = raw_input  # Python 2
            except NameError:
                ask = input  # Python 3
            msg = ('You are about to submit to subreddit %s as %s.\n'
                   'Are you sure? yes/[no]: '
                   % (subreddit, str(self.reddit.user)))
            if ask(msg).lower() not in ['y', 'yes']:
                print('Submission aborted')
            else:
                try:
                    self.reddit.submit(subreddit, title, text=body)
                    return
                except Exception as error:
                    print('The submission failed: %s' % error)

        # We made it here either to debug=True or an error.
        print(title)
        print(body)