Example no. 1
# assumed setup: a default psaw client; start_epoch and subreddit are
# expected to be set to the desired query window and target subreddit
from psaw import PushshiftAPI

api = PushshiftAPI()

# dictionary format with the filter attributes as keys
sub_results = [
    post.d_ for post in api.search_submissions(after=start_epoch,
                                               subreddit=subreddit,
                                               filter=['title', 'selftext', 'id'],
                                               limit=1000)
]
#print(sub_results)

# map posts to comments
post_to_comment_dict = dict()

for post in sub_results:
    sub_id = post['id']
    # fetch comment ids
    comment_id_results = list(api._get_submission_comment_ids(sub_id))
    #print("sub_id: ", sub_id, " comments: ", comment_id_results)
    post_to_comment_dict[sub_id] = comment_id_results

# fetch comment data
for comment_list in post_to_comment_dict.values():
    # parent ids: t3 points to the original submission, t1 to another comment
    # is_submitter = True means the comment is a reply from the OP
    comment_results = [
        comment.d_ for comment in api.search_comments(
            ids=comment_list,
            filter=['id', 'body', 'is_submitter', 'score'])
    ]
    #print(comment_results)
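
The loop above fetches each thread's comment data but only holds it in comment_results. A minimal sketch of one way to keep submissions and comments together, assuming the same api, sub_results and post_to_comment_dict built above; the threads dictionary and its layout are introduced here purely for illustration:

# illustrative only: join each submission's fields with its fetched comments
threads = {}
for post in sub_results:
    sub_id = post['id']
    comment_ids = post_to_comment_dict.get(sub_id, [])
    comment_data = []
    if comment_ids:
        comment_data = [
            c.d_ for c in api.search_comments(
                ids=comment_ids,
                filter=['id', 'body', 'is_submitter', 'score'])
        ]
    threads[sub_id] = {
        'title': post.get('title', ''),
        'selftext': post.get('selftext', ''),
        'comments': comment_data,
    }
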
def fetch_links(subreddit=None,
                date_start=None,
                date_stop=None,
                limit=None,
                score=None,
                self_only=False):
    if subreddit is None or date_start is None or date_stop is None:
        print('ERROR: missing required arguments')
        exit()

    api = PushshiftAPI(rate_limit_per_minute=pushshift_rate_limit_per_minute,
                       detect_local_tz=False)

    # get links
    links = []
    print('fetching submissions %s to %s...' % (time.strftime(
        '%Y-%m-%d', date_start), time.strftime('%Y-%m-%d', date_stop)))
    params = {
        'after':
        int(mktime(date_start)) - 86400,  # make date inclusive, adjust for UTC
        'before': int(mktime(date_stop)) + 86400,
        'subreddit': subreddit,
        'filter': link_fields,
        'sort': 'asc',
        'sort_type': 'created_utc',
    }
    if limit:
        params['limit'] = int(limit)
    if score:
        params['score'] = score
    if self_only:
        params['is_self'] = True
    link_results = list(api.search_submissions(**params))
    print('processing %s links' % len(link_results))
    for s in link_results:
        # print('%s %s' % (datetime.utcfromtimestamp(int(s.d_['created_utc'])), s.d_['title']))
        # pprint(s)

        # get comment ids
        comments = []
        if s.d_['num_comments'] > 0 and not comment_data_exists(
                subreddit, s.d_['created_utc'], s.d_['id']):
            comment_ids = list(api._get_submission_comment_ids(s.d_['id']))
            # print('%s comment_ids: %s' % (data['id'], comment_ids))

            # get comments
            if comment_ids:
                if len(comment_ids) > max_comments_per_query:
                    mychunks = chunks(comment_ids, max_comments_per_query)
                else:
                    mychunks = [comment_ids]
                for chunk in mychunks:
                    comment_params = {
                        'filter': comment_fields,
                        'ids': ','.join(chunk),
                        'limit': max_comments_per_query,
                    }
                    comments_results = list(
                        api.search_comments(**comment_params))
                    print(
                        '%s fetch link %s comments %s/%s' %
                        (datetime.utcfromtimestamp(int(s.d_['created_utc'])),
                         s.d_['id'], len(comments_results), len(comment_ids)))
                    for c in comments_results:
                        comments.append(c.d_)

        s.d_['comments'] = comments
        links.append(s.d_)

        # write results
        if len(links) >= write_every:
            success = write_links(subreddit, links)
            if success:
                links = []

    # write remaining results
    if len(links):
        write_links(subreddit, links)
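
fetch_links depends on several module-level names that this snippet does not show: link_fields, comment_fields, max_comments_per_query, write_every, pushshift_rate_limit_per_minute, the chunks helper, and the write_links / comment_data_exists persistence helpers. Below is a minimal sketch of the kind of setup it assumes, with placeholder values, one common way to write chunks, stub persistence helpers so the example runs, and an example call; all of it is illustrative rather than the original script's configuration.

import time
from datetime import datetime
from time import mktime

# placeholder configuration assumed by fetch_links (values are illustrative)
pushshift_rate_limit_per_minute = 60
max_comments_per_query = 1000
write_every = 100
link_fields = ['id', 'title', 'selftext', 'created_utc', 'num_comments']
comment_fields = ['id', 'body', 'is_submitter', 'score', 'created_utc']

def chunks(items, n):
    """Yield successive n-sized chunks from a list of comment ids."""
    for i in range(0, len(items), n):
        yield items[i:i + n]

def comment_data_exists(subreddit, created_utc, link_id):
    # stub: report nothing cached so every link's comments are fetched
    return False

def write_links(subreddit, links):
    # stub: the original script persists the batch here; just report success
    print('would write %d links for r/%s' % (len(links), subreddit))
    return True

# example call: fetch one month of self posts from a placeholder subreddit
fetch_links(subreddit='learnpython',
            date_start=time.strptime('2021-01-01', '%Y-%m-%d'),
            date_stop=time.strptime('2021-01-31', '%Y-%m-%d'),
            self_only=True)
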
                logger.warning(f"Found more than 1000 ({bin_submissions}) submissions in one bin, try lowering the bin size: {before}-{after}")

            out_dir = data_dir.joinpath(subreddit)
            os.makedirs(out_dir, exist_ok=True)
            if len(list(out_dir.glob("*.text"))) > args.number_of_threads:
                print(f"stopping at {args.number_of_threads} threads")
                break
            for submission in submissions:
                submission = psaw_to_dict(submission)
                submission_id = get_id_for_comments(submission)
                out_file = out_dir.joinpath(submission_id + ".pickle")

                if not out_file.is_file():
                    # Get comments
                    submission_comment_ids = api._get_submission_comment_ids(
                        submission["id"]
                    )
                    if len(submission_comment_ids) > 3000:
                        logger.debug(f"Skipping thread with large amount of commments {submission['id']}")
                        continue  # because it's too slow to parse these large trees with the current code
                    comment_dict = collections.defaultdict(list)

                    # Batch the ids to avoid "414 Request-URI Too Long" errors
                    batch_size = 400  # up to 1000 ids fit in one request, but shorter URLs are safer
                    for i in range(0, len(submission_comment_ids), batch_size):
                        batch_ids = submission_comment_ids[i : i + batch_size]

                        # Use psaw
                        try:
                            comments = api.search_comments(ids=batch_ids)
                            # It will just repeat unless we set a limit