# ---------------------------------------------------------------------------
# Script cell: audit Article_Author rows belonging to NewsBank articles.
# Depends on the Django models Article / Article_Author and on the project
# helpers QuerySetHelper and SummaryHelper being imported earlier in the file.
# ---------------------------------------------------------------------------

done_counter = -1

# memory management
mem_counter = -1
mem_counter_limit = 1000

# variables - text processing
bs = None
bs_div_docBody = None
bs_div_mainText = None
my_newsbank_helper = None
cleaned_article_body = ""
original_text = ""

# First, get all Articles that have archive_source = "NewsBank"
articles_to_process_qs = Article.objects.filter( archive_source = "NewsBank" )

# get article count (COUNT(*) query - does not evaluate the QuerySet).
article_count = articles_to_process_qs.count()

# loop over the articles.  First, use QuerySetHelper to get an iterator over the
#    records that won't load them all into memory at once.
article_qs = QuerySetHelper.queryset_generator( articles_to_process_qs )

# set up loop variables.
article_counter = 0
mem_counter = 0
article_author_rs = None
author_counter = 0
current_article_author = None
current_article_data = None
current_article = None
current_author_string = ""
author_parts = None
author_parts_length = -1

# auditing
org_string_counter = 0
article_source_counter = 0
bad_author_string_counter = 0

# initialize summary helper.
# FIX: the original cell created two SummaryHelper instances (one here, one
#    earlier), silently discarding the first; a single instance suffices.
my_summary_helper = SummaryHelper()

# retrieve all article author rows where organization_string is empty.
article_author_rs = Article_Author.objects.filter( organization_string__isnull = True )

# loop over author rows, resolving each back to its parent Article.
for current_article_author in article_author_rs:

    author_counter += 1

    # get article data
    current_article_data = current_article_author.article_data

    # get article
    current_article = current_article_data.article

# ---------------------------------------------------------------------------
# Script cell: collect comments for a set of reddit posts.
# NOTE(review): depends on RedditCollector, SummaryHelper, and the
#    reddit_collect Django app being imported earlier in the file, and on
#    my_user_agent / my_username / my_password being assigned BEFORE this cell
#    runs - in this file those assignments appear in a LATER cell, so confirm
#    the intended execution order.
# ---------------------------------------------------------------------------

# make an instance
reddit_collector = RedditCollector()

# initialize connection parameters.
reddit_collector.user_agent = my_user_agent
reddit_collector.username = my_username
reddit_collector.password = my_password

# set collector to output details
reddit_collector.do_output_details = False

# set bulk collection flag (defaults to True)
#reddit_collector.do_bulk_create = False

# initialize summary helper
my_summary_helper = SummaryHelper()

# get post QuerySet for the two posts of interest (by reddit ID).
#post_qs = reddit_collect.models.Post.objects.filter( reddit_id = reddit_post_id )
post_qs = reddit_collect.models.Post.objects.filter( reddit_id__in = [ '1cp0i3', '1d67nv' ] )

# num_comments?
# NOTE(review): indexing raises IndexError if no Post rows match the IDs above.
django_post = post_qs[ 0 ]
print( "==> num_comments: " + str( django_post.num_comments ) ) # 115, at time of collection

# record the post's comment count in the run summary.
my_summary_helper.set_prop_value( "num_comments", django_post.num_comments )
my_summary_helper.set_prop_desc( "num_comments", "num_comments (post)" )
    
# pass the QuerySet to the collect_comments() method.
reddit_collector.collect_comments( post_qs )

# declare variables for subsequent inspection of the collected comments.
post = None
comments = None
flat_comments = None
test_comment = None
comment_prop_map = None
summary_string = ""

# ---------------------------------------------------------------------------
# Script cell: connect to reddit via PRAW and log in.
# ---------------------------------------------------------------------------

# set variables for interacting with reddit.
# NOTE(review): placeholder credentials - fill in real values before running.
my_user_agent = "<user_agent>"
my_username = "******"
my_password = "******"
#reddit_post_id = "1cp0i3"
reddit_post_id = "1bvkol"

# init summary helper.
my_summary_helper = SummaryHelper()

# NOTE(review): start_dt is not defined in this cell - presumably assigned in
#    an earlier cell (e.g. a datetime captured at script start); confirm.
print( "Starting PRAW test at " + str( start_dt ) )

# set user agent.
r = praw.Reddit( user_agent = my_user_agent )

# got login set?  (both username and password must be non-empty)
if ( ( ( my_username ) and ( my_username != "" ) ) and ( ( my_password ) and ( my_password != "" ) ) ):

    # yes.  Login.
    r.login( my_username, my_password )

    print( "==> Logged in." )

#-- END check to see if we log in. --#
# ---------------------------------------------------------------------------
# Script cell: set up a sliced pass over Comment rows that lack a subreddit
# reference.  The variables declared here are consumed by the processing loop
# that follows (not visible in this chunk).
# ---------------------------------------------------------------------------

query_set_helper = None
comment_qs_iterator = None
total_comment_count = -1
comment_count = -1
comment_counter = -1
aggregate_counter = -1
overall_progress_summary = ""
output_details = True

comment = None
related_post = None
related_subreddit = None
summary_string = ""

# init summary helper
my_summary_helper = SummaryHelper()

# set slice size
slice_size = 10000

# 10000 - want to maximize work done between queries, but not do so much that you
#    cause slowdown, or have the result set so large that python and django have
#    trouble with loading all the objects into memory.  100 or 10 were too few
#    (too little work between queries).  1000 was OK, but I think still not
#    enough work between query calls.

# retrieve comments that don't have a related subreddit.
comment_qs = reddit_collect.models.Comment.objects.filter( subreddit = None )

# get count of comments.
# NOTE: .count() issues a COUNT(*) query; the QuerySet is not evaluated here.
comment_count = comment_qs.count()
# Example #6 (score: 0)
    def code_article_data( self, query_set_IN ):

        """
            Accepts query set of Articles.  Creates a new instance of the
               ArticleCoder class for the configured coder type, initializes it
               from this instance's parameter container, then codes the
               attribution in the articles using the coder class.  Returns
               status message.  Results in Article_Data for each attribution
               detected by the coder in each article.  Checks for the
               attribution to already have been detected using article,
               paragraph number, etc.  If so, does not create an additional
               Article_Data instance (could add a flag for this later if
               needed...).
            Preconditions: assumes that we have a query set of Articles passed
               in that we can process.  If not, does nothing beyond emitting
               the summary.
            Postconditions: Returns status message.  Article_Data rows are
               created as a side effect, and per-article status is recorded
               via self.record_article_status().

            Parameters:
            - query_set_IN - Django QuerySet of Article instances whose attribution should be coded.

            Returns:
            - String - Status message, including a processing summary (and coder debug output when ArticleCoder.DEBUG_FLAG is True).
        """

        # return reference
        status_OUT = ''

        # declare variables
        me = "code_article_data"
        logging_message = ""
        my_logger = None
        do_i_print_updates = False
        my_summary_helper = None
        summary_string = ""
        article_coder = None
        param_dict = {}
        current_status = ""
        my_exception_helper = None
        exception_message = ""

        # rate-limiting variables
        am_i_rate_limited = False
        continue_work = True

        # auditing variables
        article_counter = -1
        exception_counter = -1
        error_counter = -1

        # grab a logger.
        my_logger = self.get_logger()

        # do I print some status?
        do_i_print_updates = self.do_print_updates

        # initialize summary helper (captures start time and summary props).
        my_summary_helper = SummaryHelper()

        # init rate-limiting - when True, each request is wrapped in
        #    start_request() / may_i_continue() calls.
        am_i_rate_limited = self.do_manage_time

        # do we have a query set?
        if ( query_set_IN ):

            # create instance of ArticleCoder.
            article_coder = self.get_coder_instance()

            # initialize ArticleCoder instance from params.

            # Get parent parameter container.
            my_params = self.get_param_container()

            # retrieve the inner dictionary.
            param_dict = my_params.get_parameters()

            # use the dictionary from the param container to initialize.
            article_coder.initialize_from_params( param_dict )

            # loop on the article list, passing each to the ArticleCoder for
            #    processing.
            article_counter = 0
            exception_counter = 0
            error_counter = 0
            continue_work = True
            for current_article in query_set_IN:

                # rate-limiter told us to stop?
                # FIX: the original kept looping with a no-op body once
                #    continue_work went False, fetching the remaining rows
                #    for nothing; break instead.  Counters and output are
                #    unchanged.
                if ( continue_work == False ):

                    break

                #-- END check to see if OK to continue. --#

                # increment article counter
                article_counter += 1

                # rate-limited?
                if ( am_i_rate_limited == True ):

                    # yes - start timer.
                    self.start_request()

                #-- END pre-request check for rate-limiting --#

                # a little debugging to start
                logging_message = "\n\n============================================================\n==> article " + str( article_counter ) + ": " + str( current_article.id ) + " - " + current_article.headline
                my_logger.info( logging_message )

                # print?
                if ( do_i_print_updates == True ):

                    print( logging_message )

                #-- END check to see if we print a message.

                # add per-article exception handling, so we can get an idea of how
                #    many articles cause problems.
                try:

                    # code the article.
                    current_status = article_coder.code_article( current_article )

                    # record status
                    self.record_article_status( current_article.id, current_status )

                    # success?
                    if ( current_status != ArticleCoder.STATUS_SUCCESS ):

                        # nope.  Error.
                        error_counter += 1

                        logging_message = "======> In " + me + "(): ERROR - " + current_status + "; article = " + str( current_article )
                        my_logger.debug( logging_message )

                        # print?
                        if ( do_i_print_updates == True ):

                            print( logging_message )

                        #-- END check to see if we print a message.

                    #-- END check to see if success --#

                except Exception as e:

                    # increment exception_counter
                    exception_counter += 1

                    # get exception helper.
                    my_exception_helper = self.get_exception_helper()

                    # log exception, no email or anything.
                    exception_message = "Exception caught for article " + str( current_article.id )
                    my_exception_helper.process_exception( e, exception_message )

                    logging_message = "======> " + exception_message + " - " + str( e )
                    my_logger.debug( logging_message )

                    # print?
                    if ( do_i_print_updates == True ):

                        print( logging_message )

                    #-- END check to see if we print a message.

                    # record status (the exception message stands in for a
                    #    normal coder status here).
                    self.record_article_status( current_article.id, logging_message )

                #-- END exception handling around individual article processing. --#

                # rate-limited?
                if ( am_i_rate_limited == True ):

                    # yes - check if we may continue.
                    continue_work = self.may_i_continue()

                #-- END post-request check for rate-limiting --#

            #-- END loop over articles --#

            # add some debug?
            if ( ArticleCoder.DEBUG_FLAG == True ):

                # yup.
                status_OUT += "\n\n" + article_coder.debug + "\n\n"

            #-- END check to see if we have debug to output. --#

        #-- END check to make sure we have a query set. --#

        # add stuff to summary and print the results.

        # set stop time
        my_summary_helper.set_stop_time()

        # add stuff to summary
        my_summary_helper.set_prop_value( "article_counter", article_counter )
        my_summary_helper.set_prop_desc( "article_counter", "Articles processed" )

        my_summary_helper.set_prop_value( "error_counter", error_counter )
        my_summary_helper.set_prop_desc( "error_counter", "Error count" )

        my_summary_helper.set_prop_value( "exception_counter", exception_counter )
        my_summary_helper.set_prop_desc( "exception_counter", "Exception count" )

        # output - set prefix if you want.
        summary_string += my_summary_helper.create_summary_string( item_prefix_IN = "==> " )
        my_logger.info( summary_string )

        # output summary string as status.
        status_OUT += summary_string

        return status_OUT
# Example #7 (score: 0)
# ---------------------------------------------------------------------------
# Script cell: match Articles_To_Migrate rows to existing Article records.
# The variables declared here are consumed by the matching loop that follows.
# ---------------------------------------------------------------------------

current_pub_date = ""
current_text = ""
is_matched = False
matching_article_list = None
match_count = -1
match_article = None
match_counter = -1

# auditing - how many migration rows matched exactly one / multiple / zero
#    existing articles, and how many matched via the archive.
single_match_counter = 0
multi_match_counter = 0
no_match_counter = 0
archive_match_counter = 0

# initialize summary helper
my_summary_helper = SummaryHelper()

# get a list of all the articles to migrate that don't have an article ID.
article_list = Articles_To_Migrate.objects.filter(article=None).order_by("id")

# get number of articles to process
# NOTE: .count() runs a COUNT(*) query separate from the later iteration.
articles_to_process = article_list.count()
print(articles_to_process)

# loop over articles.
for current_article in article_list:

    # look for plain old articles that have same headline.
    current_headline = current_article.headline
    print("")
    print("==> article " + str(current_article.id) + ": " + str(current_article))