def _read_events(story_id_dict, input_file_path, event_descriptor):
    """Return a list of the events in the given file with the given story IDs.

    Generate list elements in [user_id, new_story_id, time_occurred] form,
    and maintain the ordering of the input file.

    story_id_dict, a dict, maps from old story IDs to new story IDs.  Only
    events with story IDs that are keys in story_id_dict are retained in the
    output, but these old story IDs are replaced with the corresponding new
    story IDs.

    input_file_path, a str, is the file path to the Pulse event log file to
    read in.

    event_descriptor, a str, briefly describes the events in the plural form
    and is used to notify the user upon completion.
    """
    events_list = []
    num_events = 0
    num_events_kept = 0
    input_stream = open_safely(input_file_path)
    for event_as_str in input_stream:
        # Wrap map() in list() so the parsed event can be indexed and mutated
        # below (on Python 3, map() returns a lazy iterator, not a list).
        event_as_list = list(map(int, event_as_str[:-1].split(DELIMITER)))
        old_story_id = event_as_list[EVENTS_STORY_ID_INDEX]
        if old_story_id in story_id_dict:
            event_as_list[EVENTS_STORY_ID_INDEX] = story_id_dict[old_story_id]
            events_list.append(event_as_list)
            num_events_kept += 1
        num_events += 1
    input_stream.close()
    num_events_discarded = num_events - num_events_kept
    # Guard against an empty input file so we do not divide by zero.
    if num_events > 0:
        discard_rate = float(100 * num_events_discarded) / float(num_events)
    else:
        discard_rate = 0.0
    print(("Read a total of %d %s, %d (%.2f%%) of which were discarded " + \
           "because the full contents of the associated story could not be " + \
           "fetched.") % (num_events, event_descriptor, num_events_discarded,
                          discard_rate))
    return events_list
def _write_events(events_list, output_file_path, story_id_dict, user_id_offset):
    """Write the given events to the given output file using new story IDs.

    Maintain the ordering of events_list in the output file.  Write events in
    newline-delimited raw text format, with fields within each event separated
    by DELIMITER.  Each written event has the fields
    (new_user_id, new_story_id, time_occurred), where new user IDs start from
    0.  Assume the first element in events_list belongs to the user with the
    smallest ID of those in the list.

    events_list, a list, contains all the events of a given type (reads or
    clickthroughs) for a range of users.  Each element of events_list is in
    the form (old_user_id, old_story_id, time_occurred).

    output_file_path, a str, is the file path to which to output events.

    story_id_dict, a dict, maps from old story IDs to new story IDs.

    user_id_offset, an int, is the value that must be subtracted from an old
    user ID to produce the corresponding new user ID.
    """
    output_stream = open_safely(output_file_path, "w")
    for event in events_list:
        # Remap the user ID into the 0-based range and the story ID into its
        # new numbering; the timestamp passes through unchanged.
        remapped_event = (
            event[EVENTS_USER_ID_INDEX] - user_id_offset,
            story_id_dict[event[EVENTS_STORY_ID_INDEX]],
            event[NEW_EVENTS_TIMESTAMP_INDEX],
        )
        output_stream.write(DELIMITER.join(map(str, remapped_event)) + "\n")
    output_stream.close()
def _read_stories(input_file_path):
    """Return a list of stories with full contents and a dict with new IDs.

    Generate list elements in tuple form, where each element corresponds to a
    single line of the given processed stories log file.  Do not trim the
    newline off the end of the last element of the tuple.  Append a space
    followed by the full story contents to the title of each story.  Omit
    stories for which the full story contents could not be fetched.

    Generate dict entries mapping from story IDs in the input file to story
    IDs in the output file.

    Maintain the ordering of the input file in the list.  This ordering is
    equivalent to ascending order of both old story IDs and new story IDs.

    Be warned that fetching full story contents is quite slow and consumes a
    great deal of bandwidth, so you or other users on your network may
    experience connectivity problems while executing this function.

    input_file_path, a str, is the file path to the processed Pulse stories
    log file that contains story URLs and titles but not the full contents of
    the stories themselves.
    """
    start_time = time.time()
    old_story_id = 0
    new_story_id = 0
    stories_list = []
    story_id_dict = {}
    # Cache of URL -> fetched contents (or None on a failed/too-short fetch),
    # so each distinct URL is fetched at most once.
    story_contents_dict = {}
    socket.setdefaulttimeout(TIMEOUT_LENGTH)
    input_stream = open_safely(input_file_path)
    for story_as_str in input_stream:
        story_as_list = story_as_str.split(DELIMITER)
        story_url = story_as_list[NEW_STORIES_URL_INDEX]
        if story_url in story_contents_dict:
            story_contents = story_contents_dict[story_url]
        else:
            story_contents = html2text.extractFromURL(story_url)
            # NOTE(review): contents whose length is exactly MIN_STORY_LENGTH
            # are discarded too ("<=") — confirm this boundary is intended.
            if (story_contents is not None) and \
                    (len(story_contents) <= MIN_STORY_LENGTH):
                story_contents = None
            story_contents_dict[story_url] = story_contents
        if story_contents is not None:
            story_as_list[NEW_STORIES_TITLE_INDEX] += " " + story_contents
            stories_list.append(tuple(story_as_list))
            story_id_dict[old_story_id] = new_story_id
            new_story_id += 1
        old_story_id += 1
    input_stream.close()
    num_stories_discarded = old_story_id - new_story_id
    # Guard against an empty input file so we do not divide by zero.
    if old_story_id > 0:
        discard_rate = float(100 * num_stories_discarded) / float(old_story_id)
    else:
        discard_rate = 0.0
    print(("Read a total of %d %s, %d (%.2f%%) of which were discarded " + \
           "because their full contents could not be fetched.") % \
          (old_story_id, STORIES_DESCRIPTOR, num_stories_discarded,
           discard_rate))
    report_time_elapsed(start_time)
    return (stories_list, story_id_dict)
def _read_stories(story_ids):
    """Return a list of stories with the given IDs and a dict with new IDs.

    Generate list elements in newline-terminated str form, where each element
    is a single line of the processed stories log file.  Generate dict entries
    mapping from story IDs in the input file to story IDs in the output file.

    Maintain the ordering of the input file in the list.  This ordering is
    equivalent to ascending order of both old story IDs and new story IDs.

    story_ids, a set, contains the IDs of the stories to include in the
    output.

    NOTE(review): another function named _read_stories (taking a file path)
    appears earlier in this source — confirm both are meant to coexist.
    """
    stories_list = []
    story_id_dict = {}
    next_new_id = 0
    input_stream = open_safely(PROCESSED_STORIES_FILE_PATH)
    # A story's old ID is simply its 0-based line number in the input file.
    for old_id, story in enumerate(input_stream):
        if old_id in story_ids:
            stories_list.append(story)
            story_id_dict[old_id] = next_new_id
            next_new_id += 1
    input_stream.close()
    return (stories_list, story_id_dict)
def _read_events(min_user_id, max_user_id, input_file_path):
    """Return a list of the events in the given file for the given users.

    Generate list elements of the form (user_id, story_id, time_occurred),
    and maintain the ordering of the input file.

    min_user_id, an int, is the smallest user ID that will be included, and
    is in processed form (i.e., 0, 1, 2) rather than the original
    38-character hexadecimal format.

    max_user_id, an int, is the largest user ID that will be included, and
    is in processed form (i.e., 0, 1, 2) rather than the original
    38-character hexadecimal format.

    input_file_path, a str, is the file path to the Pulse event log file to
    read in.

    NOTE(review): another function named _read_events (filtering by story ID)
    appears earlier in this source — confirm both are meant to coexist.
    """
    events_list = []
    input_stream = open_safely(input_file_path)
    for event_as_str in input_stream:
        # tuple(map(...)) — not map(int, tuple(...)) — so the parsed event is
        # an indexable tuple even on Python 3, where map() returns a lazy
        # iterator; this also matches the documented tuple element form.
        event_as_tuple = tuple(map(int, event_as_str[:-1].split(DELIMITER)))
        curr_user_id = event_as_tuple[EVENTS_USER_ID_INDEX]
        if min_user_id <= curr_user_id <= max_user_id:
            events_list.append(event_as_tuple)
    input_stream.close()
    return events_list