import csv
import re

import praw
import yaml


def fetch_submissions(start_epoch, end_epoch, subreddits, csv_writer):
    r = praw.Reddit(user_agent=REDDIT_USER_AGENT)
    seconds_to_increment = SECONDS_IN_A_DAY * FLAGS.step_days
    total_records_fetched = 0
    for subreddit in subreddits:
        for i in xrange(start_epoch, end_epoch, seconds_to_increment):
            segment_start = i
            segment_end = segment_start + min(seconds_to_increment, end_epoch - i)
            query = 'timestamp:%d..%d' % (segment_start, segment_end)
            results = list(r.search(query, subreddit=subreddit,
                                    sort='new', limit=None,
                                    syntax='cloudsearch'))
            if len(results) > RESULTS_CORRECTNESS_CHECK:
                print ("WARNING: received %i results. This is dangerously close "
                       "to the max number of allowed results (1000)." % (
                           len(results)))
            for result in results:
                # Store the score as a string because it will be written to
                # the CSV as text.
                submission = Submission(
                    result.id, result.title, result.selftext, result.url,
                    result.permalink, unicode(result.score), subreddit)

                # submission = submission_fields_as_strings(submission)
                # unicode_row = [s.encode('utf-8') for s in submission]
                unicode_row = submission_to_unicode(submission)
                csv_writer.writerow(unicode_row)
            total_records_fetched += len(results)
            segment_start_string = epoch_to_date_string(segment_start)
            segment_end_string = epoch_to_date_string(segment_end)
            print "[%s] - from %s to %s fetched %i results. Total submissions saved: %i" % (
                subreddit, segment_start_string, segment_end_string, len(results),
                total_records_fetched)
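

# ---------------------------------------------------------------------------
# Assumed module-level definitions.
#
# fetch_submissions() above (and run() below) reference REDDIT_USER_AGENT,
# SECONDS_IN_A_DAY, RESULTS_CORRECTNESS_CHECK, FLAGS, Submission,
# submission_to_unicode and epoch_to_date_string without defining them.
# The sketch below is one plausible shape for those names, inferred from how
# they are used in this listing; the concrete values, flag defaults, the
# Submission field order and the helper bodies are assumptions, not the
# project's actual definitions.
# ---------------------------------------------------------------------------
import collections
import datetime

import gflags

FLAGS = gflags.FLAGS
# Flag names are taken from their uses in this listing; defaults and help
# strings are assumed.
gflags.DEFINE_integer('step_days', 1, 'Days covered by one search segment.')
gflags.DEFINE_string('input', None, 'Input CSV of fetched submissions.')
gflags.DEFINE_string('output', None, 'Output CSV path.')
gflags.DEFINE_string('debug_id', '', 'Submission id to debug-print.')
gflags.DEFINE_boolean('output_only_complete', False,
                      'Only write rows whose fields were all extracted.')

REDDIT_USER_AGENT = 'submission-fetcher (assumed example value)'
SECONDS_IN_A_DAY = 60 * 60 * 24
# Reddit's cloudsearch queries return at most 1000 results per query, so the
# fetcher warns when a segment gets close to that cap (threshold assumed).
RESULTS_CORRECTNESS_CHECK = 900

# Field order assumed from the Submission(...) constructor call above.
Submission = collections.namedtuple(
    'Submission',
    ['id', 'title', 'selftext', 'url', 'permalink', 'score', 'subreddit'])


def submission_to_unicode(submission):
    # Stringify and UTF-8-encode each field so the Python 2 csv module can
    # write the row; this mirrors the commented-out encoding line inside
    # fetch_submissions above.
    return [unicode(field).encode('utf-8') for field in submission]


def epoch_to_date_string(epoch_seconds):
    # Human-readable date for the progress log lines.
    return datetime.datetime.utcfromtimestamp(
        epoch_seconds).strftime('%Y-%m-%d')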
def run():
    # Test apparatus:
    # Open a file, read the titles, try to extract content, output CSV with
    # extracted content.

    # TODO: I would really like to move this logic into a well-designed class
    # for general-purpose .yaml based extractors.

    with open('rules.yaml', 'r') as rules_file:
        rules = yaml.safe_load(rules_file)
    variables = rules['variables']
    feature_extractors_single = get_feature_extractor(
        'feature_extractors_single', rules, variables)
    feature_extractors_list = get_feature_extractor('feature_extractors_list',
                                                    rules, variables)
    feature_extractors_boolean = get_feature_extractor(
        'feature_extractors_boolean', rules, variables)

    input_file = open(FLAGS.input, 'r')
    output_file = open(FLAGS.output, 'w')
    csv_writer = csv.writer(output_file)
    csv_writer.writerow(ProcessedSubmission._fields)
    csv_reader = csv.reader(input_file)
    csv_reader.next()  # skip the first row because of headers
    number_completed_rows = 0
    for row in csv_reader:
        submission = submission_from_csv_row(row)
        all_extracted_variables = {}

        text_to_process = {
            'title': submission.title,
            'selftext': submission.selftext,
            'url': submission.url
        }
        debug_mode = False
        if FLAGS.debug_id == submission.id:
            print "Debug printing for --debug_id=%s" % (FLAGS.debug_id)
            debug_mode = True
        for location, text in text_to_process.iteritems():
            # Extract single features

            # To support unicode characters in the text we are trying to parse:
            # UGH...can't wait till gflags supports Python3...
            text = text.decode('utf-8')

            for extractor_name, extractor_dict in feature_extractors_in_order(
                    feature_extractors_single):
                order = extractor_dict['order']
                if location not in extractor_dict['locations']:
                    continue
                pattern = extractor_dict['pattern']
                if debug_mode:
                    print "extractor [%i, %s]" % (order, extractor_name)
                    print "  text: ", text
                    print "  pattern:", pattern
                pattern = pattern.decode('utf-8')
                m = re.search(pattern, text, re.UNICODE)
                if m is not None:
                    extracted_variables = m.groupdict()
                    # We do this funky thing here because we want the existing
                    # values in all_extracted_variables to have priority over
                    # the new values we are adding. This allows us to order
                    # our feature extractors in order from most confident to
                    # least confident without worries of adding features from
                    # less confident feature extractors.
                    if debug_mode:
                        print "  extractor returned:", extracted_variables
                    extracted_variables.update(all_extracted_variables)
                    all_extracted_variables = extracted_variables
                else:
                    if debug_mode:
                        print "  extractor returned no matches."
                if debug_mode:
                    print "----------------------------------------------"
                    # print "Extracted!", all_extracted_variables
                #print "text: ", text
                #print "pattern: ", pattern
            # Extract feature lists:
            for extractor_name, extractor_dict in feature_extractors_in_order(
                    feature_extractors_list):
                if location not in extractor_dict['locations']:
                    continue
                pattern = extractor_dict['pattern']
                pattern = pattern.decode('utf-8')
                matches = re.findall(pattern, text, re.UNICODE)
                assert isinstance(matches, list)
                if extractor_name not in all_extracted_variables:
                    all_extracted_variables[extractor_name] = set()
                for match in matches:
                    all_extracted_variables[extractor_name].add(match)
            # Set Boolean features
            for extractor_name, extractor_dict in feature_extractors_in_order(
                    feature_extractors_boolean):
                if location not in extractor_dict['locations']:
                    continue
                pattern = extractor_dict['pattern']
                # By default, we set all features to False
                if extractor_name not in all_extracted_variables:
                    all_extracted_variables[extractor_name] = False
                pattern = pattern.decode('utf-8')
                m = re.search(pattern, text, re.UNICODE)
                if m is not None:
                    all_extracted_variables[extractor_name] = True
        # print "all_extracted_variables: ", all_extracted_variables

        (complete, height_in, start_weight_lbs, end_weight_lbs,
         gender_is_female, age, imgur_images,
         imgur_albums) = process_extracted_variables(all_extracted_variables,
                                                     debug_mode)
        if debug_mode:
            print all_extracted_variables

        if complete:
            number_completed_rows += 1
        if FLAGS.output_only_complete and not complete:
            continue
        processed_submission = ProcessedSubmission(
            complete, height_in, start_weight_lbs, end_weight_lbs,
            gender_is_female, age, imgur_images, imgur_albums, *submission)
        # Output to csv
        csv_writer.writerow(submission_to_unicode(processed_submission))
    input_file.close()
    output_file.close()
    print "number_completed_rows: ", number_completed_rows