Code example #1
File: subreddit.py Project: pl77/redditPostArchiver
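# Context assumed from the rest of the module (not shown in this excerpt):
# `Comment` is a peewee model, `r` is a praw.Reddit instance, `chunks` is a
# list-batching helper (sketched after this example), and `arrow`, `tqdm`,
# and `RequestException` are imported at module level.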
def reddit_comment_update(appcfg, update_length=604800):
    print('     ---UPDATING COMMENTS WITH DATA FROM THE REDDIT API')
    # Re-fetch any comment that was archived less than update_length seconds
    # (default 604800 = one week) after it was created.
    needs_update = Comment.select().where(
        (Comment.retrieved_on - Comment.created_utc) < update_length)
    totalnumber = needs_update.count()
    needs_update_list = list()
    print(
        '         ---Building Task List.  This could take a while for large subreddits'
    )

    with tqdm(total=totalnumber, ncols=100, dynamic_ncols=False) as nbar:
        for dbcomment in needs_update:
            fullname = "t1_{}".format(dbcomment.comment_id)
            needs_update_list.append(fullname)
            nbar.update(1)
    # reddit's /api/info endpoint accepts at most 100 fullnames per request
    needs_update_list = list(chunks(needs_update_list, 100))
    print(
        '         ---Accessing data from Reddit API and entering into database'
    )
    with tqdm(total=totalnumber, ncols=100, dynamic_ncols=False) as pbar:
        for nlist in needs_update_list:
            try:
                rd_comments = list(r.info(nlist))
            except RequestException:
                print("Connection Error to Reddit API. Exiting...")
                # quit()
                return
            with appcfg.database.atomic():
                for rdcomment in rd_comments:
                    # arrow < 1.0 exposes `timestamp` as a property; on
                    # arrow >= 1.0 this would need arrow.now().int_timestamp
                    updatedtime = arrow.now().timestamp
                    if rdcomment.author is None and rdcomment.body == '[deleted]':
                        Comment.update(
                            score=rdcomment.score,
                            retrieved_on=updatedtime,
                            deleted=True).where(
                                Comment.comment_id == rdcomment.id).execute()
                        """
                    elif rdcomment.body == '[deleted]':
                        Comment.update(score=rdcomment.score,
                                       retrieved_on=updatedtime,
                                       deleted=False).where(Comment.comment_id == rdcomment.id).execute()
                    elif rdcomment.author is None:
                        Comment.update(score=rdcomment.score,
                                       # body=rdcomment.body_html,
                                       retrieved_on=updatedtime,
                                       deleted=True).where(Comment.comment_id == rdcomment.id).execute()
                        """
                    else:
                        Comment.update(
                            score=rdcomment.score,
                            # body=rdcomment.body_html,
                            retrieved_on=updatedtime,
                            deleted=False).where(
                                Comment.comment_id == rdcomment.id).execute()
                    pbar.update(1)
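The chunks helper is not part of this excerpt. Judging by its use above (batching fullnames into groups of 100 per r.info call), a minimal sketch of the assumed helper:

def chunks(lst, n):
    # Yield successive n-sized slices of lst.
    for i in range(0, len(lst), n):
        yield lst[i:i + n]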
Code example #2
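# Context assumed from the rest of the module (not shown in this excerpt):
# `Comment`, `Url`, and `CommentLinks` are peewee models, `Queue` and
# `Process` come from multiprocessing, and `url_worker` is the worker
# function (a sketch of it follows this example).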
def process_comment_urls(udb, ulimit=100000, number_of_processes=4):
    print('---EXTRACTING COMMENT URLS')
    totalcompleted = 0
    if ulimit == 0:
        ulimit = None
    total_to_process = Comment.select().where(
        Comment.number_urls.is_null()).count()
    if ulimit is not None and total_to_process > ulimit:
        total_to_process = ulimit
    with tqdm(total=total_to_process) as pbar:
        while totalcompleted < total_to_process:
            with udb.atomic():
                queue_tasks = [(comment.id, comment.body)
                               for comment in Comment.select().where(
                                   Comment.number_urls.is_null()).limit(ulimit)
                               ]
            # Create queues
            task_queue = Queue()  # ctx.Queue()  #
            done_queue = Queue()  # ctx.Queue()  #

            # Submit tasks
            for task in queue_tasks:
                task_queue.put(task)

            # Start worker processes
            for i in range(number_of_processes):
                Process(target=url_worker,
                        args=(task_queue, done_queue)).start()

            for i in range(len(queue_tasks)):
                comment_id, url_set = done_queue.get()
                try:
                    with udb.atomic():
                        Comment.update(number_urls=len(url_set)).where(
                            Comment.id == comment_id).execute()
                        for link in url_set:
                            # avoid shadowing the loop variable: `link` is the
                            # raw URL string, `url_row` the matching Url row
                            url_row, _ = Url.get_or_create(link=link)
                            try:
                                CommentLinks.insert(
                                    comment=comment_id,
                                    url=url_row.id).on_conflict_ignore().execute()
                            except SQLError:
                                print(comment_id, url_row.id)
                                raise
                except KeyboardInterrupt:
                    quit()

                pbar.update(1)
                totalcompleted += 1

            # Tell child processes to stop
            for i in range(number_of_processes):
                task_queue.put('STOP')
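url_worker is defined elsewhere in the project. Based on how it is driven here (tuples of (comment_id, body) in, (comment_id, url_set) out, 'STOP' as the shutdown sentinel), a plausible minimal sketch, with a naive URL regex as an assumption:

import re

URL_RE = re.compile(r'https?://\S+')  # assumed pattern; the real one may differ

def url_worker(task_queue, done_queue):
    # Consume tasks until the 'STOP' sentinel arrives, extract URLs from each
    # comment body, and hand the results back on done_queue.
    for comment_id, body in iter(task_queue.get, 'STOP'):
        url_set = set(URL_RE.findall(body or ''))
        done_queue.put((comment_id, url_set))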
    """
Code example #3
File: subreddit.py Project: pl77/redditPostArchiver
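# Context assumed from the rest of the module (not shown in this excerpt):
# `Comment` and `Author` are peewee models, `fn` is peewee's SQL-function
# helper, `get_push_comments` fetches comment IDs from the Pushshift API, and
# the caught exceptions come from the requests library.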
def process_comments(appcfg):
    # Get newest comments (the two-week overlap subtraction is currently disabled)
    print('   PROCESSING NEWEST PUSHSHIFT.IO COMMENTS FOR', appcfg.subreddit)

    try:
        newest_utc = int(
            Comment.select(fn.MAX(Comment.created_utc)).scalar().timestamp())
    except (TypeError, AttributeError):
        newest_utc = None
    if newest_utc is not None:
        oldestdate = newest_utc  # - 1209600  # two weeks overlap, in seconds
    else:
        oldestdate = appcfg.oldestdate

    try:
        comment_id_set = get_push_comments(appcfg, appcfg.newestdate,
                                           oldestdate)
    except (ConnectionError, SSLError, ChunkedEncodingError):
        comment_id_set = None
        print("     Connection Error for Pushshift API.  Quitting...")
        # quit()
        return comment_id_set

    # Get oldest comments in case progress was interrupted (two-week overlap likewise disabled)
    try:
        oldest_utc = int(
            Comment.select(fn.MIN(Comment.created_utc)).scalar().timestamp())
    except (TypeError, AttributeError):
        oldest_utc = None
    if oldest_utc is not None:
        newestdate = oldest_utc  # + 1209600  # two weeks overlap, in seconds
    else:
        newestdate = appcfg.newestdate
    print('   PROCESSING OLDEST PUSHSHIFT.IO COMMENTS FOR', appcfg.subreddit)

    try:
        old_comment_id_set = get_push_comments(appcfg, newestdate,
                                               appcfg.oldestdate)
    except (ConnectionError, SSLError, ChunkedEncodingError):
        old_comment_id_set = None
        print("     Connection Error for Pushshift API.  Quitting...")
        # quit()
        return old_comment_id_set
    comment_id_set |= old_comment_id_set
    # `timestamp` is a property on arrow < 1.0; arrow >= 1.0 needs int_timestamp
    filedate = arrow.now().timestamp
    basedir = "/rpa" if os.environ.get('DOCKER', '0') == '1' else '.'
    coutput_file_path = "{basedir}/{subreddit}_comments_{timestamp}.txt".format(
        basedir=basedir, subreddit=appcfg.subreddit, timestamp=filedate)

    # with open(coutput_file_path, 'w', encoding='UTF-8') as comment_file:
    #     comment_file.writelines(comment_id_set)
    print("     Total comments submitted to", appcfg.subreddit, "in set:",
          len(comment_id_set))
    deleted = Author.get_or_none(name='[deleted]')
    if deleted is not None:
        cupdatet = Comment.update(deleted=True).where(
            (Comment.author == deleted.id)
            # peewee needs `|`, not Python's `or`, to build a SQL OR (see note below)
            & (Comment.deleted.is_null() | (Comment.deleted == 0))).execute()
        print(
            '     Updated deleted field in comments.  Set deleted = True for',
            cupdatet, 'records.')
        cupdatef = Comment.update(
            deleted=False).where((Comment.author != deleted.id)
                                 & (Comment.deleted.is_null())).execute()
        print(
            '     Updated deleted field in comments.  Set deleted = False for',
            cupdatef, 'records.')
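Note on the predicates above: peewee expressions must be combined with the bitwise operators & and |, each side parenthesized, because Python's and/or cannot be overloaded and would silently keep only one operand. A minimal illustration:

# Correct: compiles to "... WHERE (deleted IS NULL) OR (deleted = 0)".
predicate = Comment.deleted.is_null() | (Comment.deleted == 0)
# Wrong: `or` just returns the left-hand expression; the second test is lost.
# predicate = Comment.deleted.is_null() or (Comment.deleted == 0)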