コード例 #1
0
def _write_stories(stories_dict):
    """Write every story, sorted by key, to PROCESSED_STORIES_FILE_PATH.

    Each story is written as one newline-terminated line whose fields are
    joined by DELIMITER, with the story's timestamp as the last field.

    stories_dict, a dict, is keyed by story tuples.  When FETCH_FULL_STORIES
    is truthy, each value is a (timestamp, contents) pair and the contents are
    appended to the story title (separated by a space) in the output;
    otherwise each value is just the timestamp and the key's fields are
    written as-is.

    Side effect: every value in stories_dict is replaced by the 0-based row
    number of that story in the output file.
    """
    start_time = time.time()
    sorted_keys = sorted(stories_dict)
    output_stream = open_safely(PROCESSED_STORIES_FILE_PATH, "w")
    # Close the stream even if formatting or writing a story raises, so a
    # failure part-way through does not leak the file handle.
    try:
        for row_num, story_key in enumerate(sorted_keys):
            if FETCH_FULL_STORIES:
                story_timestamp, story_contents = stories_dict[story_key]
                title_with_contents = (story_key[NEW_STORIES_TITLE_INDEX] +
                                       " " + story_contents)
                fields = (story_key[NEW_STORIES_FEED_URL_INDEX],
                          story_key[NEW_STORIES_FEED_TITLE_INDEX],
                          story_key[NEW_STORIES_URL_INDEX],
                          title_with_contents)
                story_sans_timestamp = DELIMITER.join(fields)
            else:
                story_timestamp = stories_dict[story_key]
                story_sans_timestamp = DELIMITER.join(story_key)
            output_stream.write(story_sans_timestamp + DELIMITER +
                                str(story_timestamp) + "\n")
            # Reassigning an existing key is safe here: iteration is over the
            # pre-sorted key list, not over the dict itself.
            stories_dict[story_key] = row_num
    finally:
        output_stream.close()
    print("Wrote %d cleaned and sorted %s to %s" %
          (len(sorted_keys), STORIES_DESCRIPTOR, PROCESSED_STORIES_FILE_PATH))
    report_time_elapsed(start_time)
コード例 #2
0
def _write_events(events_list, output_file_path, story_id_dict, user_id_offset):
    """Write the given events to the given output file using new story IDs.
    
    Maintain the ordering of events_list in the output file.  Write events in
    newline-delimited raw text format.  Within each event, delimit fields by
    DELIMITER.  Write events with fields (new_user_id, new_story_id,
    time_occurred), where new user IDs start from 0.  Assume the first
    element in events_list belongs to the user with the smallest ID of those in
    the list.
    
    events_list, a list, contains all the events of a given type (reads or
    clickthroughs) for a range of users.  Each element of events_list is in the
    form (old_user_id, old_story_id, time_occurred).
    output_file_path, a str, is the file path to which to output events.
    story_id_dict, a dict, maps from old story IDs to new story IDs.
    user_id_offset, an int, is the value that must be subtracted from an old
    user ID to produce the corresponding new user ID.

    Raise KeyError if an event refers to a story ID missing from
    story_id_dict; the output stream is closed before the error propagates.
    """
    output_stream = open_safely(output_file_path, "w")
    # Guarantee the stream is closed even when a lookup or write fails
    # part-way through the list.
    try:
        for old_event in events_list:
            new_user_id = old_event[EVENTS_USER_ID_INDEX] - user_id_offset
            new_story_id = story_id_dict[old_event[EVENTS_STORY_ID_INDEX]]
            time_occurred = old_event[NEW_EVENTS_TIMESTAMP_INDEX]
            new_event = (new_user_id, new_story_id, time_occurred)
            output_stream.write(DELIMITER.join(map(str, new_event)) + "\n")
    finally:
        output_stream.close()
コード例 #3
0
def _write_events(events_list, output_file_path, event_descriptor):
    """Write the given events to the given output file and report progress.

    Maintain the ordering of events_list in the output file.  Write one event
    per line, with each field converted via str and the fields joined by
    DELIMITER.  Afterwards, print how many events were written and pass the
    start time to report_time_elapsed.

    events_list, a list, contains the events to write; each element is an
    iterable of fields.
    output_file_path, a str, is the file path to which to output events.
    event_descriptor is a label for the kind of event, used only in the
    printed summary message.
    """
    start_time = time.time()
    output_stream = open_safely(output_file_path, "w")
    # Close the stream even if a write fails so the handle is not leaked.
    try:
        for event in events_list:
            output_stream.write(DELIMITER.join(map(str, event)) + "\n")
    finally:
        output_stream.close()
    print("Wrote %d cleaned and sorted %s to %s" %
          (len(events_list), event_descriptor, output_file_path))
    report_time_elapsed(start_time)