Example #1
0
def main(subreddit_list):
    """Write one CSV of cleaned, tokenized sentences per subreddit.

    For every subreddit named in ``subreddit_list`` a file
    ``../acl/<subreddit>_user_perline.csv`` is written.  Each self-text post
    that is not removed, not by a deleted account, not by AutoModerator and
    that has more than four sentences contributes one row per sentence.

    Args:
        subreddit_list: Iterable of subreddit names or paths; only the text
            after the last "/" of each stripped entry is used.
    """
    reddit = Reddit(config.data_location)
    subreddits = {entry.strip().split("/")[-1] for entry in subreddit_list}
    # Blank lines (or entries ending in "/") produce an empty name, which
    # would otherwise be looked up and written as "_user_perline.csv".
    subreddits.discard("")

    for subreddit in subreddits:
        sub = reddit.get_subreddit(subreddit)
        # The csv module expects files opened with newline=''; the explicit
        # encoding keeps non-ASCII post text from failing on platforms whose
        # default encoding is not UTF-8.
        with open(f'../acl/{subreddit}_user_perline.csv', 'w',
                  newline='', encoding='utf-8') as fp:
            csv_file = csv.writer(fp)
            csv_file.writerow(['SeqId', 'InstNo', 'Author', 'Text'])
            for post in sub.posts:
                body = post.get('selftext')
                if not body or body == '[removed]':
                    continue
                if post['author'] in ('[deleted]', 'AutoModerator'):
                    continue

                sentences = nltk.tokenize.sent_tokenize(
                    body.replace('\n', ' ').lower())
                # Only posts with more than four sentences are kept.
                if len(sentences) <= 4:
                    continue

                for inst_no, sentence in enumerate(sentences):
                    tokens = nltk.tokenize.word_tokenize(clean(sentence))
                    # NOTE(review): the header names this column 'Author'
                    # but the subreddit is written -- confirm which one is
                    # intended before changing either.
                    csv_file.writerow([
                        post.get('id'), inst_no, post['subreddit'],
                        ' '.join(tokens)
                    ])
Example #2
0
def main(user_list, subreddit_list, output_file):
    """Write per-user, per-month post counts for each subreddit as CSV.

    Args:
        user_list: Iterable of lines; the text after the last "," of each
            stripped line is taken as a user name.
        subreddit_list: Iterable of lines; the text after the last "/" of
            each stripped line is taken as a subreddit name.
        output_file: Writable text file object that receives the rows
            ``username, month, subreddit, count`` (month is ``YYYY-MM``).
    """
    reddit = Reddit(config.data_location)
    subreddits = {forum.strip().split("/")[-1] for forum in subreddit_list}
    users = {useritem.strip().split(",")[-1] for useritem in user_list}
    # discard() is a no-op when "" is absent, so no try/except is needed.
    subreddits.discard("")
    users.discard("")
    # `users` stays a set: it is only used for membership tests, once per
    # post, so a sorted list would make every test O(len(users)).

    csvf = csv.writer(output_file)
    csvf.writerow(["username", "month", "subreddit", "count"])

    for s in sorted(subreddits):
        print(s)
        counts = defaultdict(int)  # (author, month) -> count
        for post in reddit.get_subreddit(s).posts:
            author = post.get("author", "")
            if author in users:
                month = datetime.utcfromtimestamp(
                    post["created_utc"]).strftime('%Y-%m')
                counts[(author, month)] += 1
        # Tuples sort by author first, then by month.
        for author, month in sorted(counts):
            csvf.writerow([author, month, s, counts[(author, month)]])
def main(subreddit_file, database):
    """Compare post ids stored in the database with those in the data directory.

    Prints how many ids each side has, followed by up to 1000 ids that are
    present in the directory but missing from the database.

    Args:
        subreddit_file: Iterable of lines, one subreddit name per line.
        database: Name of the database, used in the summary message.
    """
    # NOTE(review): `cur` is not defined in this function; presumably a
    # module-level database cursor -- confirm.
    cur.execute("""select entity_source_id from data_source_instance where entity_source_descriptor = 'reddit#id#POST';""")
    keys = {row["entity_source_id"] for row in cur.fetchall()}

    subreddits = {s.strip() for s in subreddit_file}
    keys2 = set()
    for subreddit in subreddits:
        # NOTE(review): get_subreddit is called on the Reddit class itself,
        # not on an instance -- confirm this is intended.
        s = Reddit.get_subreddit(subreddit)
        # Sets have no extend(); update() adds every id in place.
        keys2.update(p["id"] for p in s.posts)

    # Single-argument print calls behave the same on Python 2 and 3.
    print("Found  %s keys in %s  versus %s keys in directory"
          % (len(keys), database, len(keys2)))

    print("\n".join(list(keys2.difference(keys))[:1000]))
Example #4
0
def main(subreddits, subreddit_list, top_n):
    """Print a ``subreddit,author`` line for each top author of each subreddit.

    Args:
        subreddits: Iterable of subreddit names.
        subreddit_list: Optional iterable of lines; the text after the last
            "/" of each stripped line is appended as a further name.
        top_n: Passed through to ``top_authors`` of each subreddit.
    """
    reddit = Reddit(config.data_location)
    names = list(subreddits)

    if subreddit_list is not None:
        names += [line.strip().split("/")[-1] for line in subreddit_list]

    for name in names:
        for author in reddit.get_subreddit(name).top_authors(top_n):
            print(name + "," + author)
Example #5
0
def main(subreddit, top_n):
    """Pull posts for the ``top_n`` most frequent authors of a subreddit.

    Args:
        subreddit: Name of the subreddit whose posts are counted.
        top_n: Number of most frequent authors to pull posts for.
    """
    reddit = Reddit(config.data_location)

    subr = reddit.get_subreddit(subreddit)
    users = Counter(post['author'] for post in subr.posts)
    # remove deleted account (Counter deletion is a no-op if the key is absent)
    del users['[deleted]']
    # Built directly from most_common() so that an empty result yields an
    # empty tuple instead of StopIteration from next(zip(*[])).
    top_authors = tuple(author for author, _ in users.most_common(top_n))
    if not top_authors:
        print('No authors found; nothing to pull.')
        return

    print('Pulling the following authors...')
    print('\n'.join(top_authors))
    print(top_authors)

    # pull 1000 posts for each user each time
    pull_posts(1000, authors=top_authors)
Example #6
0
def _post_row(post):
    """Build the CSV row for one self-text post."""
    # NOTE(review): `tz` is not defined in this block; presumably a
    # module-level timezone object -- confirm.
    return [post["id"], None, post["author"], post["author_flair_text"],
            str(post["over_18"]), str(post["score"]),
            post["subreddit"], "Reddit", post["title"],
            datetime.fromtimestamp(post["created"], tz).isoformat(),
            # Callers only pass posts that have "selftext", so no fallback
            # to post["url"] (which may be absent) is evaluated.
            "reddit", post["selftext"]]


def posts2csv(post_f, authors=None, subreddits=None, seen_posts=None, verbose=True, limit=1000):
    """Write self-text posts of the given subreddits and authors as CSV.

    Subreddits are processed first, then authors.  Posts without self text,
    posts without a "subreddit" field and posts whose id is in ``seen_posts``
    are skipped.  Writing stops once ``limit`` rows have been written.

    Args:
        post_f: Writable text file object that receives the CSV.
        authors: Optional iterable of user names; ``None`` means none.
        subreddits: Optional iterable of subreddit names; ``None`` means none.
        seen_posts: Optional collection of post ids to skip.  It is only
            read, never modified.
        verbose: When true, print the name of each source and progress lines.
        limit: Number of rows after which writing stops.  A value that never
            reaches exactly 0 while counting down (e.g. 0 or negative) does
            not stop the export.
    """
    if seen_posts is None:
        seen_posts = set()
    reddit = Reddit(MongoClient('mongodb://127.0.0.1:27017')["reddit"])

    # Subreddits first, then authors, in one list of sources: both expose
    # .name, .post_ids and .post(id) and were handled by identical loops.
    sources = [reddit.get_subreddit(s) for s in subreddits or ()]
    sources.extend(reddit.get_user(a) for a in authors or ())

    # subreddit info doesn't seem to have the "subreddit_id".   To do : get that with r/subreddit/<name>/about
    # for now, use subreddit name as forum identifier
    csvp = csv.writer(post_f)
    csvp.writerow("id,replyto,username,user_annotation_flairtext,annotation_over18,annotation_score,forum,discourse,title,when,dataset_file,post".split(","))

    for source in sources:
        if verbose:
            print(source.name)
        postids = set(source.post_ids).difference(seen_posts)
        for i, post_id in enumerate(postids):
            post = source.post(post_id)
            if verbose and i % 1000 == 999:
                print("post", i, "of", len(postids), limit, "to go")
            # Skip URL-only posts
            if "selftext" not in post or post["selftext"] == "":
                continue
            if "subreddit" not in post:
                print("No subreddit in post " + post["id"])
                continue
            if post["id"] in seen_posts:
                continue
            csvp.writerow(_post_row(post))
            limit -= 1
            if limit == 0:
                return
Example #7
0
 def test_report(self):
     """Report a non-hidden submission as a second user, then check that it
     appears in the subreddit's reports."""
     # login as new user to report submission
     reporter = Reddit(USER_AGENT)
     reporter.login('PyApiTestUser3', '1111')
     listing = reporter.get_subreddit(self.sr).get_new_by_date()
     # First submission that is not hidden, or None when there is none.
     submission = next((item for item in listing if not item.hidden), None)
     if not submission:
         self.fail('Could not find a non-reported submission.')
     submission.report()
     # check if submission was reported
     reports = self.r.get_subreddit(self.sr).get_reports()
     if not any(report.id == submission.id for report in reports):
         self.fail('Could not find reported submission.')
Example #8
0
 def test_report(self):
     """Report a non-hidden submission as a second user, then check that it
     appears in the subreddit's reports."""
     # login as new user to report submission
     other_session = Reddit(USER_AGENT)
     other_session.login('PyApiTestUser3', '1111')
     newest = other_session.get_subreddit(self.sr).get_new_by_date()
     # First submission that is not hidden, or None when there is none.
     submission = next((entry for entry in newest if not entry.hidden), None)
     if not submission:
         self.fail('Could not find a non-reported submission.')
     submission.report()
     # check if submission was reported
     reported = self.r.get_subreddit(self.sr).get_reports()
     if not any(entry.id == submission.id for entry in reported):
         self.fail('Could not find reported submission.')
Example #9
0
def main(subreddit_list, keyword_list, topic_list):
    """Write, per subreddit, a CSV of posts that match a keyword or a topic.

    For every subreddit a file ``../lgbtq/data/<subreddit>.csv`` is written
    containing each self-text post (not removed, not by a deleted account,
    not by AutoModerator) for which ``match`` finds at least one keyword or
    topic in the cleaned text.

    Args:
        subreddit_list: Iterable of lines; the text after the last "/" of
            each stripped line is taken as a subreddit name.
        keyword_list: Iterable of keywords (stripped and lower-cased).
        topic_list: Iterable of topics (stripped and lower-cased).
    """
    reddit = Reddit(config.data_location)
    subreddits = {entry.strip().split("/")[-1] for entry in subreddit_list}
    # Blank lines produce an empty name, which is not a subreddit.
    subreddits.discard("")

    keywords = {keyword.strip().lower() for keyword in keyword_list}
    print(keywords)
    topics = {topic.strip().lower() for topic in topic_list}
    print(topics)

    for subreddit in subreddits:
        sub = reddit.get_subreddit(subreddit)
        # The csv module expects files opened with newline=''; the explicit
        # encoding keeps non-ASCII post text from failing on platforms whose
        # default encoding is not UTF-8.
        with open(f'../lgbtq/data/{subreddit}.csv', 'w',
                  newline='', encoding='utf-8') as fp:
            csv_file = csv.writer(fp)
            csv_file.writerow([
                'PostId', 'PostTime', 'author', 'PostContent', 'MatchingWord',
                'MatchTopic'
            ])
            for post in sub.posts:
                body = post.get('selftext')
                if not body or body == '[removed]':
                    continue
                if post['author'] in ('[deleted]', 'AutoModerator'):
                    continue

                clean_text = clean(body.replace('\n', ' ').lower())
                keyword_hits = set(match(keywords, clean_text))
                topic_hits = set(match(topics, clean_text))
                if not (keyword_hits or topic_hits):
                    continue

                csv_file.writerow([
                    post.get('id'),
                    time.ctime(post.get('created_utc')),
                    post['author'], clean_text,
                    # An empty set is written as None (an empty cell).
                    keyword_hits or None,
                    topic_hits or None
                ])
Example #10
0
class ModUtils(object):
    VERSION = '0.1.dev'

    def __init__(self, subreddit, site=None, verbose=None):
        """Create the Reddit session and look up the subreddit to work on.

        Arguments:
        subreddit -- name handed to ``get_subreddit``
        site -- second argument handed to ``Reddit``
        verbose -- when true, methods print progress messages
        """
        self.verbose = verbose
        # Filled by current_flair() on first use.
        self._current_flair = None
        session = Reddit(str(self), site)
        self.reddit = session
        self.sub = session.get_subreddit(subreddit)

    def __str__(self):
        return 'BBoe\'s ModUtils %s' % self.VERSION

    def add_users(self, category):
        mapping = {'banned': 'ban',
                   'contributors': 'make_contributor',
                   'moderators': 'make_moderator'}

        if category not in mapping:
            print '%r is not a valid option for --add' % category
            return

        func = getattr(self.sub, mapping[category])
        print 'Enter user names (any separation should suffice):'
        data = sys.stdin.read().strip()
        for name in re.split('[^A-Za-z_]+', data):
            func(name)
            print 'Added %r to %s' % (name, category)

    def current_flair(self):
        if self._current_flair is None:
            self._current_flair = []
            if self.verbose:
                print 'Fetching flair list for %s' % self.sub
            for flair in self.sub.flair_list():
                self._current_flair.append(flair)
                yield flair
        else:
            for item in self._current_flair:
                yield item

    def flair_template_sync(self, editable, limit,  # pylint: disable-msg=R0912
                            static, sort, use_css, use_text):
        # Parameter verification
        if not use_text and not use_css:
            raise Exception('At least one of use_text or use_css must be True')
        sorts = ('alpha', 'size')
        if sort not in sorts:
            raise Exception('Sort must be one of: %s' % ', '.join(sorts))

        # Build current flair list along with static values
        if static:
            counter = dict((x, limit) for x in static)
        else:
            counter = {}
        if self.verbose:
            sys.stdout.write('Retrieving current flair')
            sys.stdout.flush()
        for flair in self.current_flair():
            if self.verbose:
                sys.stdout.write('.')
                sys.stdout.flush()
            if use_text and use_css:
                key = (flair['flair_text'], flair['flair_css_class'])
            elif use_text:
                key = flair['flair_text']
            else:
                key = flair['flair_css_class']
            if key in counter:
                counter[key] += 1
            else:
                counter[key] = 1
        if self.verbose:
            print

        # Sort flair list items according to the specified sort
        if sort == 'alpha':
            items = sorted(counter.items())
        else:
            items = sorted(counter.items(), key=lambda x: x[1], reverse=True)

        # Clear current templates and store flair according to the sort
        if self.verbose:
            print 'Clearing current flair templates'
        self.sub.clear_flair_templates()
        for key, count in items:
            if not key or count < limit:
                continue
            if use_text and use_css:
                text, css = key
            elif use_text:
                text, css = key, ''
            else:
                text, css = '', key
            if self.verbose:
                print 'Adding template: text: "%s" css: "%s"' % (text, css)
            self.sub.add_flair_template(text, css, editable)

    def login(self, user, pswd):
        if self.verbose:
            print 'Logging in'
        self.reddit.login(user, pswd)
        if self.verbose:
            print 'Fetching moderator list for %s' % self.sub
        if str(self.sub).lower() not in [str(x).lower() for x in
                                         self.reddit.user.my_moderation()]:
            raise Exception('You do not moderate %s' % self.sub)

    def message(self, category, subject, msg_file):
        users = getattr(self.sub, 'get_%s' % category)()
        if not users:
            print 'There are no %s on %s.' % (category, str(self.sub))
            return

        if msg_file:
            try:
                msg = open(msg_file).read()
            except IOError, error:
                print str(error)
                return
        else:
class SubRedditStats(object):
    """Collect submission and comment statistics for a subreddit.

    Typical use: fetch submissions (``fetch_recent_submissions`` or
    ``fetch_top_submissions``), run ``process_submitters`` and
    ``process_commenters``, then ``publish_results`` to build the report and
    optionally submit it.
    """
    VERSION = '0.2.0'

    post_prefix = 'Subreddit Stats:'
    post_header = '---\n###%s\n'
    post_footer = ('>Generated with [BBoe](/user/bboe)\'s [Subreddit Stats]'
                   '(https://github.com/bboe/subreddit_stats)  \n%s'
                   'SRS Marker: %d')
    # Raw string so that \d is a regex escape, not a string escape.
    re_marker = re.compile(r'SRS Marker: (\d+)')

    @staticmethod
    def _previous_max(submission):
        """Return the last 'SRS Marker' value in the submission's self text.

        Prints a message and exits the process with status 1 when no marker
        is found.
        """
        try:
            val = SubRedditStats.re_marker.findall(submission.selftext)[-1]
            return float(val)
        except (IndexError, TypeError):
            print('End marker not found in previous submission. Aborting')
            sys.exit(1)

    @staticmethod
    def _permalink(permalink):
        """Shorten a full permalink URL to its '/comments/<id>/_/' form.

        The submission id is read from the 7th '/'-separated token and an
        optional comment id from the 9th.
        """
        tokens = permalink.split('/')
        # A submission permalink without a trailing slash has no 9th token;
        # treat that the same as an empty one instead of raising IndexError.
        if len(tokens) <= 8 or tokens[8] == '':  # submission
            return '/comments/%s/_/' % (tokens[6])
        else:  # comment
            return '/comments/%s/_/%s?context=1' % (tokens[6], tokens[8])

    @staticmethod
    def _user(user):
        """Return a markdown link to the user with underscores escaped."""
        return '[%s](/user/%s)' % (user.replace('_', '\\_'), user)

    def __init__(self, subreddit, site, verbosity):
        """Create the Reddit session and initialise empty result containers.

        Arguments:
        subreddit -- name handed to ``get_subreddit``
        site -- second argument handed to ``Reddit``
        verbosity -- messages with a level up to this value are printed
        """
        self.reddit = Reddit(str(self), site)
        self.subreddit = self.reddit.get_subreddit(subreddit)
        self.verbosity = verbosity
        self.submissions = []
        self.comments = []
        self.submitters = defaultdict(list)
        self.commenters = defaultdict(list)
        self.min_date = 0
        # Submissions newer than three days are excluded by default.
        self.max_date = time.time() - DAYS_IN_SECONDS * 3
        self.prev_srs = None
        # Config
        self.reddit.config.comment_limit = -1  # Fetch max comments possible
        self.reddit.config.comment_sort = 'top'

    def __str__(self):
        """Return the tool name followed by its VERSION."""
        return 'BBoe\'s SubRedditStats %s' % self.VERSION

    def login(self, user, pswd):
        """Log in to reddit with the given credentials."""
        if self.verbosity > 0:
            print('Logging in')
        self.reddit.login(user, pswd)

    def msg(self, msg, level, overwrite=False):
        """Write ``msg`` to stdout when verbosity is at least ``level``.

        With ``overwrite`` the line ends in a carriage return so that the
        next message replaces it; otherwise it ends in a newline.
        """
        if self.verbosity >= level:
            sys.stdout.write(msg)
            if overwrite:
                sys.stdout.write('\r')
                sys.stdout.flush()
            else:
                sys.stdout.write('\n')

    def prev_stat(self, prev_url):
        """Use the marker of the submission at ``prev_url`` as the min date."""
        submission = self.reddit.get_submission(prev_url)
        self.min_date = self._previous_max(submission)
        self.prev_srs = prev_url

    def _update_date_range(self):
        """Sort the collected submissions and record their real date range.

        Returns False when no submissions were collected, True otherwise.
        """
        self.msg('DEBUG: Found %d submissions' % len(self.submissions), 1)
        if not self.submissions:
            return False

        # Update real min and max dates
        self.submissions.sort(key=lambda x: x.created_utc)
        self.min_date = self.submissions[0].created_utc
        self.max_date = self.submissions[-1].created_utc
        return True

    def fetch_recent_submissions(self, max_duration, after, exclude_self,
                                 since_last=True):
        '''Fetches recent submissions in subreddit with boundaries.

        Does not include posts within the last three days as their scores may
        not be representative.

        Keyword arguments:
        max_duration -- When set, specifies the number of days to include
        after -- When set, fetch all submission after this submission id.
        exclude_self -- When true, don't include self posts.
        since_last -- When true use info from last submission to determine the
                      stop point

        Returns True when at least one submission was collected.
        '''
        if max_duration:
            self.min_date = self.max_date - DAYS_IN_SECONDS * max_duration
        url_data = {'after': after} if after else None
        self.msg('DEBUG: Fetching submissions', 1)
        for submission in self.subreddit.get_new_by_date(limit=None,
                                                         url_data=url_data):
            if submission.created_utc > self.max_date:
                continue
            if submission.created_utc <= self.min_date:
                break
            if (since_last and str(submission.author) == str(self.reddit.user)
                    and submission.title.startswith(self.post_prefix)):
                # Use info in this post to update the min_date
                # And don't include this post
                self.msg('Found previous: %s' % submission.title, 2)
                if self.prev_srs is None:  # Only use the most recent
                    self.min_date = max(self.min_date,
                                        self._previous_max(submission))
                    self.prev_srs = submission.permalink
                continue
            if exclude_self and submission.is_self:
                continue
            self.submissions.append(submission)
        return self._update_date_range()

    def fetch_top_submissions(self, top, exclude_self):
        '''Fetches top submissions by some top value.

        Keyword arguments:
        top -- One of day, week, month, year, all
        exclude_self -- When true, don't include self posts.

        Raises TypeError when ``top`` is not one of the accepted values.
        Returns True when at least one submission was collected.
        '''
        if top not in ('day', 'week', 'month', 'year', 'all'):
            raise TypeError('%r is not a valid top value' % top)
        self.msg('DEBUG: Fetching submissions', 1)
        url_data = {'t': top}
        for submission in self.subreddit.get_top(limit=None,
                                                 url_data=url_data):
            if exclude_self and submission.is_self:
                continue
            self.submissions.append(submission)
        return self._update_date_range()

    def process_submitters(self):
        """Group the collected submissions by author name."""
        self.msg('DEBUG: Processing Submitters', 1)
        for submission in self.submissions:
            if submission.author:
                self.submitters[str(submission.author)].append(submission)

    def process_commenters(self):
        """Collect the comments of every submission and group them by author."""
        num = len(self.submissions)
        self.msg('DEBUG: Processing Commenters on %d submissions' % num, 1)
        for i, submission in enumerate(self.submissions):
            self.msg('%d/%d submissions' % (i + 1, num), 2, overwrite=True)
            if submission.num_comments == 0:
                continue
            try:
                self.comments.extend(submission.all_comments_flat)
            except Exception as exception:
                # Best effort: report the failure and keep whatever orphaned
                # comments the submission still holds.
                print('Exception fetching comments on %r: %s'
                      % (submission.content_id, str(exception)))
            for orphans in submission._orphaned.values():
                self.comments.extend(orphans)
        for comment in self.comments:
            if comment.author:
                self.commenters[str(comment.author)].append(comment)

    def basic_stats(self):
        """Return the markdown table with totals and up/down vote shares."""
        sub_ups = sum(x.ups for x in self.submissions)
        sub_downs = sum(x.downs for x in self.submissions)
        comm_ups = sum(x.ups for x in self.comments)
        comm_downs = sum(x.downs for x in self.comments)

        def shares(ups, downs):
            """Return (up %, down %); both 0 when there are no votes."""
            total = ups + downs
            if not total:
                # No votes (e.g. no comments were fetched): avoid dividing
                # by zero and report 0% for both directions.
                return 0, 0
            up_perc = ups * 100 // total
            return up_perc, 100 - up_perc

        sub_up_perc, sub_down_perc = shares(sub_ups, sub_downs)
        comm_up_perc, comm_down_perc = shares(comm_ups, comm_downs)

        values = [('Total', len(self.submissions), '', len(self.comments), ''),
                  ('Unique Redditors', len(self.submitters), '',
                   len(self.commenters), ''),
                  ('Upvotes', sub_ups, '%d%%' % sub_up_perc,
                   comm_ups, '%d%%' % comm_up_perc),
                  ('Downvotes', sub_downs, '%d%%' % sub_down_perc,
                   comm_downs, '%d%%' % comm_down_perc)]

        retval = '||Submissions|%|Comments|%|\n:-:|--:|--:|--:|--:\n'
        for quad in values:
            retval += '__%s__|%d|%s|%d|%s\n' % quad
        return '%s\n' % retval

    def top_submitters(self, num, num_submissions):
        """Return markdown listing the top ``num`` submitters.

        Submitters are ranked by total score, then by number of submissions;
        up to ``num_submissions`` of each one's best submissions are listed.
        Returns '' when there is nothing to list.
        """
        num = min(num, len(self.submitters))
        if num <= 0:
            return ''

        top_submitters = sorted(self.submitters.items(), reverse=True,
                                key=lambda x: (sum(y.score for y in x[1]),
                                               len(x[1])))[:num]

        retval = self.post_header % 'Top Submitters\' Top Submissions'
        for (author, submissions) in top_submitters:
            retval += '0. %d pts, %d submissions: %s\n' % (
                sum(x.score for x in submissions), len(submissions),
                self._user(author))
            for sub in sorted(submissions, reverse=True,
                              key=lambda x: x.score)[:num_submissions]:
                title = sub.title.replace('\n', ' ').strip()
                if sub.permalink != sub.url:
                    retval += '  0. [%s](%s)' % (title, sub.url)
                else:
                    retval += '  0. %s' % title
                retval += ' (%d pts, [%d comments](%s))\n' % (
                    sub.score, sub.num_comments,
                    self._permalink(sub.permalink))
            retval += '\n'
        return retval

    def top_commenters(self, num):
        """Return markdown listing the top ``num`` commenters.

        Commenters are ranked by total (ups - downs), then by number of
        comments.  Returns '' when there is nothing to list.
        """
        score = lambda x: x.ups - x.downs

        num = min(num, len(self.commenters))
        if num <= 0:
            return ''

        top_commenters = sorted(self.commenters.items(), reverse=True,
                                key=lambda x: (sum(score(y) for y in x[1]),
                                               len(x[1])))[:num]

        retval = self.post_header % 'Top Commenters'
        for author, comments in top_commenters:
            retval += '0. %s (%d pts, %d comments)\n' % (
                self._user(author), sum(score(x) for x in comments),
                len(comments))
        return '%s\n' % retval

    def top_submissions(self, num):
        """Return markdown listing the ``num`` highest scoring submissions.

        Returns '' when there is nothing to list.
        """
        num = min(num, len(self.submissions))
        if num <= 0:
            return ''

        top_submissions = sorted(self.submissions, reverse=True,
                                 key=lambda x: x.score)[:num]

        retval = self.post_header % 'Top Submissions'
        for sub in top_submissions:
            author = str(sub.author)
            title = sub.title.replace('\n', ' ').strip()
            if sub.permalink != sub.url:
                retval += '0. [%s](%s)' % (title, sub.url)
            else:
                retval += '0. %s' % title
            retval += ' by %s (%d pts, [%d comments](%s))\n' % (
                self._user(author), sub.score, sub.num_comments,
                self._permalink(sub.permalink))
        return '%s\n' % retval

    def top_comments(self, num):
        """Return markdown listing the ``num`` comments with the best
        (ups - downs).  Returns '' when there is nothing to list."""
        score = lambda x: x.ups - x.downs

        num = min(num, len(self.comments))
        if num <= 0:
            return ''

        top_comments = sorted(self.comments, reverse=True,
                              key=score)[:num]
        retval = self.post_header % 'Top Comments'
        for comment in top_comments:
            author = str(comment.author)
            title = comment.submission.title.replace('\n', ' ').strip()
            retval += ('0. %d pts: %s\'s [comment](%s) in %s\n'
                       % (score(comment), self._user(author),
                          self._permalink(comment.permalink), title))
        return '%s\n' % retval

    def publish_results(self, subreddit, submitters, commenters, submissions,
                        comments, top, debug=False):
        """Build the report and submit it, or print it instead.

        Arguments:
        subreddit -- subreddit to submit the report to
        submitters -- number of top submitters to list
        commenters -- number of top commenters to list
        submissions -- number of top submissions to list
        comments -- number of top comments to list
        top -- when true the title says 'top submissions'
        debug -- when true, print the report instead of submitting it

        The report is also printed when it is too big, when the user does
        not confirm, or when the submission fails.
        """
        def timef(timestamp):
            # NOTE(review): fromtimestamp() gives local time but the label
            # is hard-coded to PDT -- confirm the intended time zone.
            dtime = datetime.fromtimestamp(timestamp)
            return dtime.strftime('%Y-%m-%d %H:%M PDT')

        title = '%s %s %ssubmissions from %s to %s' % (
            self.post_prefix, str(self.subreddit), 'top ' if top else '',
            timef(self.min_date), timef(self.max_date))
        if self.prev_srs:
            prev = '[Previous Stat](%s)  \n' % self._permalink(self.prev_srs)
        else:
            prev = ''

        basic = self.basic_stats()
        t_commenters = self.top_commenters(commenters)
        t_submissions = self.top_submissions(submissions)
        t_comments = self.top_comments(comments)
        footer = self.post_footer % (prev, self.max_date)

        # Shrink the per-submitter listing until the body fits.
        body = ''
        num_submissions = 10
        while body == '' or len(body) > MAX_BODY_SIZE and num_submissions > 2:
            t_submitters = self.top_submitters(submitters, num_submissions)
            body = (basic + t_submitters + t_commenters + t_submissions +
                    t_comments + footer)
            num_submissions -= 1

        if len(body) > MAX_BODY_SIZE:
            print('The resulting message is too big. Not submitting.')
            debug = True

        if not debug:
            msg = ('You are about to submit to subreddit %s as %s.\n'
                   'Are you sure? yes/[no]: ' % (subreddit,
                                                 str(self.reddit.user)))
            try:
                read_line = raw_input  # Python 2
            except NameError:
                read_line = input  # Python 3
            if read_line(msg).lower() not in ['y', 'yes']:
                print('Submission aborted')
            else:
                try:
                    self.reddit.submit(subreddit, title, text=body)
                    return
                except Exception as error:
                    print('The submission failed: %s' % (error,))

        # We made it here either to debug=True or an error.
        print(title)
        print(body)