def create_fixtures(min_user_id, max_user_id):
    """Write a reduced set of processed Pulse log files for a range of users.

    Read the processed reads and clickthroughs logs for the users whose
    processed IDs lie between min_user_id and max_user_id, collect the IDs
    of the stories those events reference, and write reads, clickthroughs
    and stories files into a directory named
    "Fixtures for Users min_user_id-max_user_id" inside
    PROCESSED_DATA_DIRECTORY. Create that directory if it does not already
    exist. Finish by printing the output directory and reporting the
    elapsed time.

    Selecting the events and stories, and renumbering story and user IDs so
    that they stay contiguous once other users and stories are omitted, is
    delegated to the _read_events, _read_stories and _write_events helpers.

    min_user_id, an int, is the smallest user ID to include in the output,
    in processed form (i.e., 0, 1, 2) rather than the original hexadecimal
    format.

    max_user_id, an int, is the largest user ID to include in the output,
    in the same processed form.

    Raise TypeError if either argument is not an int. Raise ValueError if
    min_user_id exceeds max_user_id or is negative. Raise LookupError if no
    user ID was found for the requested range, or if the largest user ID
    found is smaller than max_user_id.
    """
    # Start the clock first so the reported duration covers the whole run.
    began_at = time.time()

    # Argument validation, in the same order as the checks were written
    # originally: types, then ordering, then sign.
    if not (isinstance(min_user_id, int) and isinstance(max_user_id, int)):
        raise TypeError("min_user_id and max_user_id must both be of type int.")
    if max_user_id < min_user_id:
        raise ValueError(
            "min_user_id is %d but must be less than or equal to "
            "max_user_id, which is %d." % (min_user_id, max_user_id)
        )
    if min_user_id < 0:
        raise ValueError(
            "min_user_id is %d, but user IDs must be non-negative."
            % min_user_id
        )

    # Load the events for the requested users: reads first, then
    # clickthroughs.
    reads = _read_events(min_user_id, max_user_id, PROCESSED_READS_FILE_PATH)
    clickthroughs = _read_events(min_user_id, max_user_id,
                                 PROCESSED_CLICKTHROUGHS_FILE_PATH)

    # Make sure the processed data actually reaches the requested upper
    # bound before writing anything.
    largest_found = _get_largest_user_id(reads, clickthroughs)
    if largest_found is None:
        raise LookupError(
            "No User IDs in the range [%d, %d] were found in the "
            "processed data." % (min_user_id, max_user_id)
        )
    if largest_found < max_user_id:
        raise LookupError(
            "max_user_id is %d, but the largest user ID in the processed "
            "data is %d." % (max_user_id, largest_found)
        )

    # Only stories referenced by at least one retained event are kept.
    referenced_story_ids = frozenset(
        event[EVENTS_STORY_ID_INDEX] for event in reads + clickthroughs
    )
    stories, story_id_map = _read_stories(referenced_story_ids)

    # PROCESSED_DATA_DIRECTORY is joined without a separator, so it must
    # already end with one.
    fixtures_directory = "%sFixtures for Users %d-%d/" % (
        PROCESSED_DATA_DIRECTORY, min_user_id, max_user_id)
    if not os.path.exists(fixtures_directory):
        os.mkdir(fixtures_directory)

    # Write reads, clickthroughs and stories, in that order.
    _write_events(reads, fixtures_directory + READS_FILENAME,
                  story_id_map, min_user_id)
    _write_events(clickthroughs, fixtures_directory + CLICKTHROUGHS_FILENAME,
                  story_id_map, min_user_id)
    write_iterable(stories, fixtures_directory + STORIES_FILENAME, "")

    print("Output fixtures in directory: %s" % fixtures_directory)
    report_time_elapsed(began_at)
def fetch_story_contents(input_directory):
    """Write processed Pulse log files that include full story contents.

    Read the stories, reads and clickthroughs files found in the given
    directory and write files by the same names into a sub-directory of it
    named SUB_DIRECTORY_NAME, creating that sub-directory if it does not
    already exist. Also write a user IDs file, named USER_IDS_FILENAME,
    built from the result of get_user_ids.

    The intended contract, carried out by the _read_stories, _read_events
    and get_user_ids helpers, is: append a space followed by the full story
    contents to each story title; remove stories for which no content could
    be fetched, together with the events involving them; reassign story and
    user IDs to 0, 1, 2, etc. to fill the resulting gaps; and output a user
    IDs file in which row numbers correspond to the new user IDs and row
    values to the old ones.

    input_directory, a str, is the file path to a directory containing
    processed Pulse log files lacking full story contents.

    Raise TypeError if input_directory is not a str. Raise ValueError if
    input_directory is not an existing directory.
    """
    if not isinstance(input_directory, str):
        raise TypeError("Expected input_directory to be of type str.")
    if not path.isdir(input_directory):
        raise ValueError("Could not find given directory: %s" % input_directory)

    # Load stories first: the story ID mapping they yield is needed to read
    # both event logs.
    stories, story_id_map = _read_stories(
        path.join(input_directory, STORIES_FILENAME))
    reads = _read_events(
        story_id_map,
        path.join(input_directory, READS_FILENAME),
        READS_DESCRIPTOR)
    clickthroughs = _read_events(
        story_id_map,
        path.join(input_directory, CLICKTHROUGHS_FILENAME),
        CLICKTHROUGHS_DESCRIPTOR)
    user_ids = get_user_ids(reads, clickthroughs)

    # Convert every field to str before handing the rows to the writers.
    read_rows = [map(str, row) for row in reads]
    clickthrough_rows = [map(str, row) for row in clickthroughs]
    user_id_rows = map(str, user_ids)

    destination = path.join(input_directory, SUB_DIRECTORY_NAME)
    if not path.exists(destination):
        os.mkdir(destination)

    # Write stories, reads, clickthroughs and user IDs, in that order.
    write_2d_iterable(stories, path.join(destination, STORIES_FILENAME), "")
    write_2d_iterable(read_rows, path.join(destination, READS_FILENAME))
    write_2d_iterable(clickthrough_rows,
                      path.join(destination, CLICKTHROUGHS_FILENAME))
    write_iterable(user_id_rows, path.join(destination, USER_IDS_FILENAME))