def reddit_comment_update(appcfg, update_length=604800):
    print(' ---UPDATING COMMENTS WITH DATA FROM THE REDDIT API')
    totalnumber = Comment.select().where(
        (Comment.retrieved_on - Comment.created_utc) < update_length).count()
    needs_update_list = list()
    needs_update = Comment.select().where(
        (Comment.retrieved_on - Comment.created_utc) < update_length)
    print(' ---Building Task List. This could take a while for large subreddits')
    # Collect the Reddit fullname (t1_<id>) of every comment that still needs a refresh.
    with tqdm(total=totalnumber, ncols=100, dynamic_ncols=False) as nbar:
        for dbcomment in needs_update:
            fullname = "t1_{}".format(dbcomment.comment_id)
            needs_update_list.append(fullname)
            nbar.update(1)
    # r.info() accepts batches of fullnames, so split the task list into chunks of 100.
    needs_update_list = list(chunks(needs_update_list, 100))
    print(' ---Accessing data from Reddit API and entering into database')
    with tqdm(total=totalnumber, ncols=100, dynamic_ncols=False) as pbar:
        for nlist in needs_update_list:
            try:
                rd_comments = list(r.info(nlist))
            except RequestException:
                print("Connection Error to Reddit API. Exiting...")
                # quit()
                return
            with appcfg.database.atomic():
                for rdcomment in rd_comments:
                    updatedtime = arrow.now().timestamp
                    if rdcomment.author is None and rdcomment.body == '[deleted]':
                        Comment.update(
                            score=rdcomment.score,
                            retrieved_on=updatedtime,
                            deleted=True).where(
                                Comment.comment_id == rdcomment.id).execute()
                    # elif rdcomment.body == '[deleted]':
                    #     Comment.update(score=rdcomment.score,
                    #                    retrieved_on=updatedtime,
                    #                    deleted=False).where(
                    #                        Comment.comment_id == rdcomment.id).execute()
                    # elif rdcomment.author is None:
                    #     Comment.update(score=rdcomment.score,
                    #                    # body=rdcomment.body_html,
                    #                    retrieved_on=updatedtime,
                    #                    deleted=True).where(
                    #                        Comment.comment_id == rdcomment.id).execute()
                    else:
                        Comment.update(
                            score=rdcomment.score,
                            # body=rdcomment.body_html,
                            retrieved_on=updatedtime,
                            deleted=False).where(
                                Comment.comment_id == rdcomment.id).execute()
                    pbar.update(1)
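
# chunks() is called above but is not defined in this section. A minimal sketch,
# assuming it simply yields successive slices of at most `size` items so that
# r.info() receives batches of 100 fullnames; the project's own helper may differ.
def chunks(items, size):
    """Yield successive `size`-length slices from `items`."""
    for start in range(0, len(items), size):
        yield items[start:start + size]
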
def process_comment_urls(udb, ulimit=100000, number_of_processes=4):
    print('---EXTRACTING COMMENT URLS')
    totalcompleted = 0
    if ulimit == 0:
        ulimit = None
    total_to_process = Comment.select().where(
        Comment.number_urls.is_null()).count()
    if ulimit is not None and total_to_process > ulimit:
        total_to_process = ulimit
    with tqdm(total=total_to_process) as pbar:
        while totalcompleted < total_to_process:
            # Grab the next batch of comments that have not had their URLs counted yet.
            with udb.atomic():
                queue_tasks = [(comment.id, comment.body)
                               for comment in Comment.select().where(
                                   Comment.number_urls.is_null()).limit(ulimit)]
            # Create queues
            task_queue = Queue()  # ctx.Queue()
            done_queue = Queue()  # ctx.Queue()
            # Submit tasks
            for task in queue_tasks:
                task_queue.put(task)
            # Start worker processes
            for i in range(number_of_processes):
                Process(target=url_worker, args=(task_queue, done_queue)).start()
            # Collect results and store every extracted URL for each comment.
            for i in range(len(queue_tasks)):
                comment_id, url_set = done_queue.get()
                try:
                    with udb.atomic():
                        Comment.update(number_urls=len(url_set)).where(
                            Comment.id == comment_id).execute()
                        for url in url_set:
                            url, urlcreated = Url.get_or_create(link=url)
                            try:
                                CommentLinks.insert(
                                    comment=comment_id,
                                    url=url.id).on_conflict_ignore().execute()
                            except SQLError:
                                print(comment_id, url.id)
                                raise
                except KeyboardInterrupt:
                    quit()
                pbar.update(1)
                totalcompleted += 1
            # Tell child processes to stop
            for i in range(number_of_processes):
                task_queue.put('STOP')
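
# url_worker() is the multiprocessing target used above but is not defined in this
# section. A minimal sketch, assuming each worker pulls (comment_id, body) tasks off
# the queue until it sees the 'STOP' sentinel and reports back the set of URLs it
# found; the regex here is illustrative and not necessarily the project's own
# extraction logic.
import re


def url_worker(task_queue, done_queue):
    url_pattern = re.compile(r'https?://[^\s)\]]+')
    for comment_id, body in iter(task_queue.get, 'STOP'):
        url_set = set(url_pattern.findall(body or ''))
        done_queue.put((comment_id, url_set))
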
def process_comments(appcfg):
    # Get newest comments with two week overlap
    print(' PROCESSING NEWEST PUSHSHIFT.IO COMMENTS FOR', appcfg.subreddit)
    try:
        newest_utc = int(
            Comment.select(fn.MAX(Comment.created_utc)).scalar().timestamp())
    except (TypeError, AttributeError):
        newest_utc = None
    if newest_utc is not None:
        oldestdate = newest_utc  # - 1209600  # two weeks overlap, in seconds
    else:
        oldestdate = appcfg.oldestdate

    try:
        comment_id_set = get_push_comments(appcfg, appcfg.newestdate, oldestdate)
    except (ConnectionError, SSLError, ChunkedEncodingError):
        comment_id_set = None
        print(" Connection Error for Pushshift API. Quitting...")
        # quit()
        return comment_id_set

    # Get oldest comments in case progress was interrupted, with two week overlap
    try:
        oldest_utc = int(
            Comment.select(fn.MIN(Comment.created_utc)).scalar().timestamp())
    except (TypeError, AttributeError):
        oldest_utc = None
    if oldest_utc is not None:
        newestdate = oldest_utc  # + 1209600  # two weeks overlap, in seconds
    else:
        newestdate = appcfg.newestdate

    print(' PROCESSING OLDEST PUSHSHIFT.IO COMMENTS FOR', appcfg.subreddit)
    try:
        old_comment_id_set = get_push_comments(appcfg, newestdate, appcfg.oldestdate)
    except (ConnectionError, SSLError, ChunkedEncodingError):
        old_comment_id_set = None
        print(" Connection Error for Pushshift API. Quitting...")
        # quit()
        return old_comment_id_set

    comment_id_set |= old_comment_id_set
    filedate = arrow.now().timestamp
    basedir = "/rpa" if os.environ.get('DOCKER', '0') == '1' else '.'
    coutput_file_path = "{basedir}/{subreddit}_comments_{timestamp}.txt".format(
        basedir=basedir, subreddit=appcfg.subreddit, timestamp=filedate)
    # with open(coutput_file_path, 'w', encoding='UTF-8') as comment_file:
    #     comment_file.writelines(comment_id_set)
    print(" Total comments submitted to", appcfg.subreddit, "in set:",
          len(comment_id_set))

    deleted = Author.get_or_none(name='[deleted]')
    if deleted is not None:
        # Flag comments attributed to the [deleted] author. Note the peewee `|`
        # operator: Python's `or` would short-circuit and silently drop the
        # `Comment.deleted == 0` condition from the generated SQL.
        cupdatet = Comment.update(deleted=True).where(
            (Comment.author == deleted.id) &
            (Comment.deleted.is_null() | (Comment.deleted == 0))).execute()
        print(' Updated deleted field in comments. Set deleted = True for',
              cupdatet, 'records.')
        cupdatef = Comment.update(deleted=False).where(
            (Comment.author != deleted.id) &
            (Comment.deleted.is_null())).execute()
        print(' Updated deleted field in comments. Set deleted = False for',
              cupdatef, 'records.')
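
# A hypothetical driver sketching how the three steps above are assumed to chain
# together: backfill the archive from Pushshift, refresh scores and deletion flags
# from the Reddit API, then extract URLs from comment bodies. run_comment_pipeline()
# is not part of the original module; `appcfg` is assumed to expose the database
# handle plus the subreddit/date settings used throughout this file.
def run_comment_pipeline(appcfg):
    process_comments(appcfg)               # Pushshift backfill (newest, then oldest)
    reddit_comment_update(appcfg)          # refresh comments retrieved within a week of creation
    process_comment_urls(appcfg.database)  # extract and store URLs found in comment bodies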