def _read_events(story_id_dict, input_file_path, event_descriptor):
    """"Return a list of the events in the given file with the given story IDs.

    Generate list elements in [user_id, new_story_id, time_occurred] form, and
    maintain the ordering of the input file.
    
    input_file_path, a str, is the file path to the Pulse event log file to
    read in.
    story_id_dict, a dict, maps from old story IDs to new story IDs.  Only
    events with story IDs that are keys in story_id_dict are retained in the
    output, but these old story IDs are replaced with the corresponding new
    story IDs.
    event_descriptor, a str, briefly describes the events in the plural form and
    is used to notify the user upon completion.
    """
    events_list = []
    num_events = 0
    num_events_kept = 0
    input_stream = open_safely(input_file_path)
    for event_as_str in input_stream:
        event_as_list = list(map(int, event_as_str[:-1].split(DELIMITER)))
        old_story_id = event_as_list[EVENTS_STORY_ID_INDEX]
        if old_story_id in story_id_dict:
            event_as_list[EVENTS_STORY_ID_INDEX] = story_id_dict[old_story_id]
            events_list.append(event_as_list)
            num_events_kept += 1
        num_events += 1
    input_stream.close()
    num_events_discarded = num_events - num_events_kept
    discard_rate = 100.0 * num_events_discarded / num_events
    print(("Read a total of %d %s, %d (%.2f%%) of which were discarded " + \
           "because the full contents of the associated story could not be " + \
           "fetched.") % (num_events, event_descriptor, num_events_discarded,
                          discard_rate))
    return events_list
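
# A minimal usage sketch for _read_events above (the file name and ID mapping
# here are hypothetical; DELIMITER and EVENTS_STORY_ID_INDEX are assumed to be
# module-level constants defined elsewhere in this repository):
#
#     story_id_dict = {7: 0, 12: 1}  # old story ID -> new story ID
#     reads = _read_events(story_id_dict, "read_events.log", "read events")
#     # reads might be [[3, 0, 1310512345], [5, 1, 1310512399], ...]
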
def _write_events(events_list, output_file_path, story_id_dict, user_id_offset):
    """Write the given events to the given output file using new story IDs.
    
    Maintain the ordering of events_list in the output file.  Write events in
    newline-delimited raw text format.  Within each event, delimit fields by
    DELIMITER.  Write events with fields (new_user_id, new_story_id,
    time_occurred), where new user IDs start from 0.  Assume that the first
    element in events_list belongs to the user with the smallest ID of those in
    the list.
    
    events_list, a list, contains all the events of a given type (reads or
    clickthroughs) for a range of users.  Each element of events_list is in the
    form (old_user_id, old_story_id, time_occurred).
    output_file_path, a str, is the file path to which to output events.
    story_id_dict, a dict, maps from old story IDs to new story IDs.
    user_id_offset, an int, is the value that must be subtracted from an old
    user ID to produce the corresponding new user ID.
    """
    output_stream = open_safely(output_file_path, "w")

    for old_event in events_list:
        old_user_id = old_event[EVENTS_USER_ID_INDEX]
        new_user_id = old_user_id - user_id_offset
        old_story_id = old_event[EVENTS_STORY_ID_INDEX]
        new_story_id = story_id_dict[old_story_id]
        time_occurred = old_event[NEW_EVENTS_TIMESTAMP_INDEX]
        new_event = (new_user_id, new_story_id, time_occurred)
        output_stream.write(DELIMITER.join(map(str, new_event)) + "\n")

    output_stream.close()
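
# A minimal usage sketch for _write_events above (the file name and event data
# here are hypothetical; events are assumed to still carry their old user and
# story IDs, and the offset is the smallest old user ID, per the docstring):
#
#     events = [(100, 7, 1310512345), (101, 12, 1310512399)]
#     story_id_dict = {7: 0, 12: 1}
#     _write_events(events, "new_read_events.log", story_id_dict, 100)
#     # The output file then holds users 0 and 1 with new story IDs 0 and 1.
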
def _read_stories(input_file_path):
    """Return a list of stories with full contents and a dict with new IDs.

    Generate list elements in tuple form, where each element corresponds to a
    single line of the given processed stories log file.  Do not trim the
    newline off the end of the last element of the tuple.  Append a space
    followed by the full story contents to the title of each story.  Omit
    stories for which the full story contents could not be fetched.  Generate
    dict entries mapping from story IDs in the input file to story IDs in the
    output file.  Maintain the ordering of the input file in the list.  This
    ordering is equivalent to ascending order of both old story IDs and new
    story IDs.  Be warned that fetching full story contents is quite slow and
    consumes a great deal of bandwidth, so you or other users on your network
    may experience connectivity problems while executing this function.

    input_file_path, a str, is the file path to the processed Pulse stories log
    file that contains story URLs and titles but not the full contents of the
    stories themselves.
    """
    start_time = time.time()
    old_story_id = 0
    new_story_id = 0
    stories_list = []
    story_id_dict = {}
    story_contents_dict = {}
    socket.setdefaulttimeout(TIMEOUT_LENGTH)
    input_stream = open_safely(input_file_path)
    
    for story_as_str in input_stream:
        story_as_list = story_as_str.split(DELIMITER)
        story_url = story_as_list[NEW_STORIES_URL_INDEX]
        if story_url in story_contents_dict:
            story_contents = story_contents_dict[story_url]
        else:
            story_contents = html2text.extractFromURL(story_url)
            # Treat an extraction at or below the minimum length as a failed
            # fetch so that near-empty pages are discarded as well.
            if (story_contents is not None) and \
                    (len(story_contents) <= MIN_STORY_LENGTH):
                story_contents = None
            # Cache the result, including failures, so that each URL is
            # fetched at most once.
            story_contents_dict[story_url] = story_contents
        if story_contents is not None:
            story_as_list[NEW_STORIES_TITLE_INDEX] += " " + story_contents
            stories_list.append(tuple(story_as_list))
            story_id_dict[old_story_id] = new_story_id
            new_story_id += 1
        old_story_id += 1
        
    input_stream.close()
    num_stories_discarded = old_story_id - new_story_id
    discard_rate = 100.0 * num_stories_discarded / old_story_id
    print("Read a total of %d %s, %d (%.2f%%) of which were discarded "
          "because their full contents could not be fetched." %
          (old_story_id, STORIES_DESCRIPTOR, num_stories_discarded,
           discard_rate))
    report_time_elapsed(start_time)
    return (stories_list, story_id_dict)
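
# A minimal usage sketch for the fetching variant of _read_stories above (the
# file name is hypothetical; fetching is slow and network-bound, per the
# warning in the docstring):
#
#     stories, story_id_dict = _read_stories("processed_stories.log")
#     # stories[i] is the tuple for the ith story whose contents were fetched,
#     # and story_id_dict maps each retained old story ID to its new ID i.
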
def _read_stories(story_ids):
    """Return a list of stories with the given IDs and a dict with new IDs.

    Generate list elements in newline-terminated str form, where each element is
    a single line of the processed stories log file.  Generate dict entries
    mapping from story IDs in the input file to story IDs in the output file.
    Maintain the ordering of the input file in the list.  This ordering is
    equivalent to ascending order of both old story IDs and new story IDs.

    story_ids, a set, contains the IDs of the stories to include in the output.
    """
    old_story_id = 0
    new_story_id = 0
    stories_list = []
    story_id_dict = {}
    input_stream = open_safely(PROCESSED_STORIES_FILE_PATH)
    for story in input_stream:
        if old_story_id in story_ids:
            stories_list.append(story)
            story_id_dict[old_story_id] = new_story_id
            new_story_id += 1
        old_story_id += 1
    input_stream.close()
    return (stories_list, story_id_dict)
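
# A minimal usage sketch for this ID-filtering variant of _read_stories
# (PROCESSED_STORIES_FILE_PATH is assumed to be defined elsewhere in this
# repository; the ID set here is hypothetical):
#
#     stories, story_id_dict = _read_stories({0, 2, 5})
#     # stories holds the raw lines for old story IDs 0, 2, and 5, and
#     # story_id_dict == {0: 0, 2: 1, 5: 2}.
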
def _read_events(min_user_id, max_user_id, input_file_path):
    """Return a list of the events in the given file for the given users.

    Generate list elements of the form (user_id, story_id, time_occurred), and
    maintain the ordering of the input file.
    
    min_user_id, an int, is the smallest user ID that will be included, and is
    in processed form (i.e., 0, 1, 2) rather than the original 38-character
    hexadecimal format.
    max_user_id, an int, is the largest user ID that will be included, and is
    in processed form (i.e., 0, 1, 2) rather than the original 38-character
    hexadecimal format.
    input_file_path, a str, is the file path to the Pulse event log file to
    read in.
    """
    events_list = []
    input_stream = open_safely(input_file_path)
    for event_as_str in input_stream:
        event_as_tuple = tuple(map(int, event_as_str[:-1].split(DELIMITER)))
        curr_user_id = event_as_tuple[EVENTS_USER_ID_INDEX]
        if (min_user_id <= curr_user_id) and (curr_user_id <= max_user_id):
            events_list.append(event_as_tuple)
    input_stream.close()
    return events_list
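
# A minimal usage sketch for this user-range variant of _read_events (the file
# name and ID range here are hypothetical):
#
#     events = _read_events(0, 999, "clickthrough_events.log")
#     # events holds (user_id, story_id, time_occurred) tuples for users 0
#     # through 999, in the order they appear in the log.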