done_counter = -1

# memory management
mem_counter = -1
mem_counter_limit = 1000

# variables - text processing
bs = None
bs_div_docBody = None
bs_div_mainText = None
my_newsbank_helper = None
cleaned_article_body = ""
original_text = ""

# initialize summary helper
my_summary_helper = SummaryHelper()

# First, get all Articles that have archive_source = "NewsBank".
articles_to_process_qs = Article.objects.filter( archive_source = "NewsBank" )

# get article count.
article_count = articles_to_process_qs.count()

# loop over the articles.  First, use QuerySetHelper to get an iterator over the
#    records that won't load them all into memory at once.
#article_qs = articles_to_process_qs
article_qs = QuerySetHelper.queryset_generator( articles_to_process_qs )

# set up loop variables.
article_counter = 0
mem_counter = 0
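# For reference, a sketch of what a memory-friendly queryset generator like
#    QuerySetHelper.queryset_generator() typically does (this follows the
#    common Django recipe; the actual implementation may differ): walk the
#    QuerySet in primary-key order, one chunk at a time, so the full result
#    set is never materialized in memory at once.
def queryset_generator_sketch( queryset_IN, chunk_size_IN = 1000 ):

    # order by primary key so chunk boundaries are stable.
    ordered_qs = queryset_IN.order_by( "pk" )
    last_pk = 0
    while True:

        # pull the next chunk of records after the last pk we saw.
        chunk = list( ordered_qs.filter( pk__gt = last_pk )[ : chunk_size_IN ] )
        if not chunk:
            break
        for instance in chunk:
            yield instance
        #-- END loop over current chunk --#
        last_pk = chunk[ -1 ].pk

    #-- END loop over chunks --#

#-- END function queryset_generator_sketch() --#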
article_author_rs = None
author_counter = 0
current_article_author = None
current_article_data = None
current_article = None
current_author_string = ""
author_parts = None
author_parts_length = -1

# auditing
org_string_counter = 0
article_source_counter = 0
bad_author_string_counter = 0

# initialize summary helper
my_summary_helper = SummaryHelper()

# retrieve all article author rows where organization_string is empty.
article_author_rs = Article_Author.objects.filter( organization_string__isnull = True )

# loop
for current_article_author in article_author_rs:

    author_counter += 1

    # get article data
    current_article_data = current_article_author.article_data

    # get article
    current_article = current_article_data.article
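    # A sketch of the rest of the loop body that the variables and counters
    #    above imply (the author_string field on Article and the
    #    "<name> / <organization>" byline format are assumptions, not
    #    confirmed from this code):
    current_author_string = current_article.author_string
    author_parts = current_author_string.split( "/" )
    author_parts_length = len( author_parts )
    if ( author_parts_length == 2 ):

        # "<name> / <organization>" - store the organization.
        current_article_author.organization_string = author_parts[ 1 ].strip()
        current_article_author.save()
        org_string_counter += 1

    elif ( author_parts_length == 1 ):

        # no organization in byline - could fall back to the article's
        #    source publication here.
        article_source_counter += 1

    else:

        # unexpected byline format - count it for auditing.
        bad_author_string_counter += 1

    #-- END check of author string format --#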
# make an instance
reddit_collector = RedditCollector()

# initialize connection parameters.
reddit_collector.user_agent = my_user_agent
reddit_collector.username = my_username
reddit_collector.password = my_password

# set collector to output details
reddit_collector.do_output_details = False

# set bulk collection flag (defaults to True)
#reddit_collector.do_bulk_create = False

# initialize summary helper
my_summary_helper = SummaryHelper()

# get post QuerySet
#post_qs = reddit_collect.models.Post.objects.filter( reddit_id = reddit_post_id )
post_qs = reddit_collect.models.Post.objects.filter( reddit_id__in = [ '1cp0i3', '1d67nv' ] )

# num_comments?
django_post = post_qs[ 0 ]
print( "==> num_comments: " + str( django_post.num_comments ) ) # 115, at time of collection
my_summary_helper.set_prop_value( "num_comments", django_post.num_comments )
my_summary_helper.set_prop_desc( "num_comments", "num_comments (post)" )

# pass the QuerySet to the collect_comments() method.
reddit_collector.collect_comments( post_qs )
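# One way to sanity-check the collection afterwards, comparing stored rows
#    against the post's num_comments (a sketch - the name of the
#    Comment-to-Post relation, "post", is an assumption):
collected_count = reddit_collect.models.Comment.objects.filter( post = django_post ).count()
print( "==> comments collected for post " + str( django_post.reddit_id ) + ": " + str( collected_count ) )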
post = None
comments = None
flat_comments = None
test_comment = None
comment_prop_map = None
summary_string = ""

# set variables for interacting with reddit.
my_user_agent = "<user_agent>"
my_username = "******"
my_password = "******"
#reddit_post_id = "1cp0i3"
reddit_post_id = "1bvkol"

# init summary helper.
my_summary_helper = SummaryHelper()
print( "Starting PRAW test at " + str( start_dt ) )

# set user agent.
r = praw.Reddit( user_agent = my_user_agent )

# got login set?
if ( ( ( my_username ) and ( my_username != "" ) ) and ( ( my_password ) and ( my_password != "" ) ) ):

    # yes.  Login.
    r.login( my_username, my_password )
    print( "==> Logged in." )

#-- END check to see if we log in. --#
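# A sketch of the next steps the variables above suggest (this assumes the
#    legacy PRAW 2.x/3.x API that r.login() implies; get_submission(),
#    replace_more_comments(), and praw.helpers.flatten_tree() are part of
#    that API, but their use here is an assumption about what this test did):
post = r.get_submission( submission_id = reddit_post_id )

# resolve "MoreComments" placeholders, then flatten the comment forest.
post.replace_more_comments( limit = None, threshold = 0 )
comments = post.comments
flat_comments = praw.helpers.flatten_tree( comments )
print( "==> flattened comment count: " + str( len( flat_comments ) ) )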
query_set_helper = None
comment_qs_iterator = None
total_comment_count = -1
comment_count = -1
comment_counter = -1
aggregate_counter = -1
overall_progress_summary = ""
output_details = True
comment = None
related_post = None
related_subreddit = None
summary_string = ""

# init summary helper
my_summary_helper = SummaryHelper()

# set slice size
slice_size = 10000
# 10000 - want to maximize work done between queries, but not do so much that you
#    cause slowdown, or have the result set so large that python and django have
#    trouble with loading all the objects into memory.  100 or 10 were too few
#    (too little work between queries).  1000 was OK, but I think still not
#    enough work between query calls.

# retrieve comments that don't have a related subreddit.
comment_qs = reddit_collect.models.Comment.objects.filter( subreddit = None )

# get count of comments.
comment_count = comment_qs.count()
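# A sketch of how the slice-based loop these variables set up might look
#    (the Comment-to-Post and Post-to-Subreddit relations, "post" and
#    "subreddit", are assumptions based on the variables declared above):
comment_counter = 0
keep_going = True
while ( keep_going == True ):

    # re-run the filter and take the first slice each pass; updated comments
    #    drop out of the subreddit = None filter, so we always slice from the
    #    front rather than advancing an offset past rows that have moved.
    #    (Assumes every post has a subreddit; otherwise add an escape hatch.)
    comment_slice = list( comment_qs[ : slice_size ] )
    if ( len( comment_slice ) > 0 ):

        for comment in comment_slice:

            comment_counter += 1

            # fill in the missing subreddit from the comment's related post.
            related_post = comment.post
            related_subreddit = related_post.subreddit
            comment.subreddit = related_subreddit
            comment.save()

        #-- END loop over comments in current slice --#

    else:

        # nothing left to update.
        keep_going = False

    #-- END check to see if anything in slice --#

#-- END loop over slices --#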
def code_article_data( self, query_set_IN ):

    """
    Accepts query set of Articles.  Creates a new instance of the
        ArticleCoder class for the coder_type passed in, places the query set
        in it, sets up its instance variables appropriately according to the
        request, then codes the attribution in the articles using the coder
        class.  Returns status message.

    Preconditions: assumes that we have a query set of Articles passed in
        that we can store in the instance.  If not, does nothing, returns
        empty string.

    Postconditions: Returns status message.  Results in Article_Data for
        each attribution detected by the coder in each article.  Checks for
        the attribution to already have been detected using article,
        paragraph number, etc.  If so, does not create an additional
        Article_Data instance (could add a flag for this later if
        needed...).

    Parameters:
    - query_set_IN - django QuerySet of Article instances to be coded.

    Returns:
    - String - Status message.
    """

    # return reference
    status_OUT = ''

    # declare variables
    me = "code_article_data"
    logging_message = ""
    my_logger = None
    do_i_print_updates = False
    my_summary_helper = None
    summary_string = ""
    article_coder = None
    param_dict = {}
    current_status = ""
    my_exception_helper = None
    exception_message = ""

    # rate-limiting variables
    am_i_rate_limited = False
    continue_work = True

    # auditing variables
    article_counter = -1
    exception_counter = -1
    error_counter = -1

    # grab a logger.
    my_logger = self.get_logger()

    # do I print some status?
    do_i_print_updates = self.do_print_updates

    # initialize summary helper
    my_summary_helper = SummaryHelper()

    # init rate-limiting
    am_i_rate_limited = self.do_manage_time

    # do we have a query set?
    if ( query_set_IN ):

        # create instance of ArticleCoder.
        article_coder = self.get_coder_instance()

        # initialize ArticleCoder instance from params.

        # Get parent parameter container.
        my_params = self.get_param_container()

        # retrieve the inner dictionary.
        param_dict = my_params.get_parameters()

        # use the dictionary from the param container to initialize.
        article_coder.initialize_from_params( param_dict )

        # loop on the article list, passing each to the ArticleCoder for
        #    processing.
        article_counter = 0
        exception_counter = 0
        error_counter = 0
        continue_work = True
        for current_article in query_set_IN:

            # OK to continue work?
            if ( continue_work == True ):

                # increment article counter
                article_counter += 1

                # rate-limited?
                if ( am_i_rate_limited == True ):

                    # yes - start timer.
                    self.start_request()

                #-- END pre-request check for rate-limiting --#

                # a little debugging to start
                logging_message = "\n\n============================================================\n==> article " + str( article_counter ) + ": " + str( current_article.id ) + " - " + current_article.headline
                my_logger.info( logging_message )

                # print?
                if ( do_i_print_updates == True ):
                    print( logging_message )
                #-- END check to see if we print a message. --#

                # add per-article exception handling, so we can get an idea of
                #    how many articles cause problems.
                try:

                    # code the article.
                    current_status = article_coder.code_article( current_article )

                    # record status
                    self.record_article_status( current_article.id, current_status )

                    # success?
                    if ( current_status != ArticleCoder.STATUS_SUCCESS ):

                        # nope.  Error.
                        error_counter += 1
                        logging_message = "======> In " + me + "(): ERROR - " + current_status + "; article = " + str( current_article )
                        my_logger.debug( logging_message )

                        # print?
                        if ( do_i_print_updates == True ):
                            print( logging_message )
                        #-- END check to see if we print a message. --#

                    #-- END check to see if success --#

                except Exception as e:

                    # increment exception_counter
                    exception_counter += 1

                    # get exception helper.
                    my_exception_helper = self.get_exception_helper()

                    # log exception, no email or anything.
                    exception_message = "Exception caught for article " + str( current_article.id )
                    my_exception_helper.process_exception( e, exception_message )
                    logging_message = "======> " + exception_message + " - " + str( e )
                    my_logger.debug( logging_message )

                    # print?
                    if ( do_i_print_updates == True ):
                        print( logging_message )
                    #-- END check to see if we print a message. --#

                    # record status
                    self.record_article_status( current_article.id, logging_message )

                #-- END exception handling around individual article processing. --#

                # rate-limited?
                if ( am_i_rate_limited == True ):

                    # yes - check if we may continue.
                    continue_work = self.may_i_continue()

                #-- END post-request check for rate-limiting --#

            else:

                # not OK to continue work.  Break?
                #break
                pass

            #-- END check to see if OK to continue.  If not... --#

        #-- END loop over articles --#

        # add some debug?
        if ( ArticleCoder.DEBUG_FLAG == True ):

            # yup.
            status_OUT += "\n\n" + article_coder.debug + "\n\n"

        #-- END check to see if we have debug to output. --#

    #-- END check to make sure we have a query set. --#

    # add stuff to summary and print the results.

    # set stop time
    my_summary_helper.set_stop_time()

    # add stuff to summary
    my_summary_helper.set_prop_value( "article_counter", article_counter )
    my_summary_helper.set_prop_desc( "article_counter", "Articles processed" )
    my_summary_helper.set_prop_value( "error_counter", error_counter )
    my_summary_helper.set_prop_desc( "error_counter", "Error count" )
    my_summary_helper.set_prop_value( "exception_counter", exception_counter )
    my_summary_helper.set_prop_desc( "exception_counter", "Exception count" )

    # output - set prefix if you want.
    summary_string += my_summary_helper.create_summary_string( item_prefix_IN = "==> " )
    my_logger.info( summary_string )

    # output summary string as status.
    status_OUT += summary_string

    return status_OUT

#-- END method code_article_data() --#
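# For context, a minimal sketch of the rate-limiting contract used above (an
#    assumption about the general pattern, not the actual implementation of
#    start_request() and may_i_continue()): start_request() notes when a
#    request began, and may_i_continue() sleeps off whatever remains of a
#    minimum interval before returning True, so the loop never issues
#    requests faster than one per interval.
import time

class RateLimitedSketch( object ):

    # minimum seconds between requests (hypothetical value).
    min_interval = 2.0
    request_start_time = None

    def start_request( self ):

        # remember when this request started.
        self.request_start_time = time.time()

    #-- END method start_request() --#

    def may_i_continue( self ):

        # how much of the interval is left?
        elapsed = time.time() - self.request_start_time
        if ( elapsed < self.min_interval ):

            # finished early - sleep off the remainder.
            time.sleep( self.min_interval - elapsed )

        #-- END check to see if we need to wait --#

        return True

    #-- END method may_i_continue() --#

#-- END class RateLimitedSketch --#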
current_pub_date = "" current_text = "" is_matched = False matching_article_list = None match_count = -1 match_article = None match_counter = -1 # auditing single_match_counter = 0 multi_match_counter = 0 no_match_counter = 0 archive_match_counter = 0 # initialize summary helper my_summary_helper = SummaryHelper() # get a list of all the articles to migrate that don't have an article ID. article_list = Articles_To_Migrate.objects.filter(article=None).order_by("id") # get number of articles to process articles_to_process = article_list.count() print(articles_to_process) # loop over articles. for current_article in article_list: # look for plain old articles that have same headline. current_headline = current_article.headline print("") print("==> article " + str(current_article.id) + ": " + str(current_article))