def _write_stories(stories_dict):
    """Write the cleaned, sorted stories to PROCESSED_STORIES_FILE_PATH.

    As a side effect, replaces each value in stories_dict with the row
    number at which that story was written.
    """
    start_time = time.time()
    sorted_stories = sorted(stories_dict.keys())
    row_num = 0
    output_stream = open_safely(PROCESSED_STORIES_FILE_PATH, "w")
    for story_key in sorted_stories:
        if FETCH_FULL_STORIES:
            # The dict value is a (timestamp, contents) pair; fold the
            # fetched contents into the title field.
            story_timestamp, story_contents = stories_dict[story_key]
            story_title_with_contents = \
                story_key[NEW_STORIES_TITLE_INDEX] + " " + story_contents
            story_sans_timestamp_as_tuple = \
                (story_key[NEW_STORIES_FEED_URL_INDEX],
                 story_key[NEW_STORIES_FEED_TITLE_INDEX],
                 story_key[NEW_STORIES_URL_INDEX],
                 story_title_with_contents)
            story_sans_timestamp_as_str = \
                DELIMITER.join(story_sans_timestamp_as_tuple)
        else:
            # The dict value is just the story's timestamp.
            story_timestamp = stories_dict[story_key]
            story_sans_timestamp_as_str = DELIMITER.join(story_key)
        story_timestamp_as_str = DELIMITER + str(story_timestamp)
        story_as_str = story_sans_timestamp_as_str + story_timestamp_as_str
        output_stream.write(story_as_str + "\n")
        # Record where this story landed so callers can look up its row
        # number in stories_dict.
        stories_dict[story_key] = row_num
        row_num += 1
    output_stream.close()
    print("Wrote %d cleaned and sorted %s to %s" %
          (row_num, STORIES_DESCRIPTOR, PROCESSED_STORIES_FILE_PATH))
    report_time_elapsed(start_time)
def stem_processed_stories(input_file_path):
    """Stem the title field of each story in the given processed stories
    file, writing the result to a parallel file whose name is the input
    path with STEMMED_STORIES_EXTENSION appended.

    input_file_path, a str, is the path to a processed stories file.
    """
    start_time = time.time()
    if not isinstance(input_file_path, str):
        raise TypeError("Expected input_file_path to be of type str.")
    stemmer = PorterStemmer()
    tokenizer = WordPunctTokenizer()
    stories_list = []
    # Matches tokens that begin with a non-word character (i.e.,
    # punctuation), which are dropped before stemming.
    prog = re.compile(r'\W+')
    story_stream = open_safely(input_file_path)
    for story_as_str in story_stream:
        story_as_list = story_as_str[:-1].lower().split(DELIMITER)
        story_title = story_as_list[NEW_STORIES_TITLE_INDEX]
        tok_contents = tokenizer.tokenize(story_title)
        stem_contents = [stemmer.stem(word) for word in tok_contents
                         if prog.match(word) is None]
        story_as_list[NEW_STORIES_TITLE_INDEX] = " ".join(stem_contents)
        stories_list.append(story_as_list)
    story_stream.close()
    output_file_path = input_file_path + STEMMED_STORIES_EXTENSION
    write_2d_iterable(stories_list, output_file_path)
    print("Output stemmed stories to %s" % output_file_path)
    report_time_elapsed(start_time)
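# A minimal, self-contained sketch (not part of the pipeline) of the
# tokenize/filter/stem step used above, applied to one hypothetical title.
# Assumes NLTK is installed; the sample title is illustrative only.
def _example_stem_title():
    import re
    from nltk.stem.porter import PorterStemmer
    from nltk.tokenize import WordPunctTokenizer
    title = "Scientists Discover Running Improves Memory"
    non_word = re.compile(r'\W+')
    stemmer = PorterStemmer()
    tokens = WordPunctTokenizer().tokenize(title.lower())
    # Drop pure-punctuation tokens, then reduce each word to its stem.
    stems = [stemmer.stem(word) for word in tokens
             if non_word.match(word) is None]
    return " ".join(stems)  # roughly: "scientist discov run improv memori"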
def _clean_data(input_file_path, num_fields, timestamp_index,
                data_descriptor, insert_data_fn, stories_dict,
                callback_data=None):
    start_time = time.time()
    stories_dict_already_built = (len(stories_dict) > 0)
    num_rows = 0
    input_stream = open_safely(input_file_path)
    for row in input_stream:
        num_rows += 1
        row_without_newline = row[:-1]
        _clean_row(row_without_newline, num_fields, timestamp_index,
                   insert_data_fn, stories_dict, callback_data)
    input_stream.close()
    if stories_dict_already_built:
        # We just cleaned user reads or clickthroughs.
        num_valid_rows = len(callback_data)
    else:
        # We just cleaned stories.
        num_valid_rows = len(stories_dict)
    num_invalid_rows = num_rows - num_valid_rows
    discard_rate = 100.0 * num_invalid_rows / num_rows
    print("Read a total of %d %s, %d (%.2f%%) of which were discarded." %
          (num_rows, data_descriptor, num_invalid_rows, discard_rate))
    report_time_elapsed(start_time)
def _write_user_ids(user_ids_list):
    start_time = time.time()
    output_stream = open_safely(USER_IDS_FILE_PATH, "w")
    for user_id in user_ids_list:
        output_stream.write(user_id + "\n")
    output_stream.close()
    num_users = len(user_ids_list)
    print(("Wrote %d cleaned and sorted original 38-character hexadecimal "
           "%s to %s") %
          (num_users, USER_IDS_DESCRIPTOR, USER_IDS_FILE_PATH))
    report_time_elapsed(start_time)
def _write_events(events_list, output_file_path, event_descriptor):
    start_time = time.time()
    output_stream = open_safely(output_file_path, "w")
    for event in events_list:
        output_stream.write(DELIMITER.join(map(str, event)) + "\n")
    output_stream.close()
    num_events = len(events_list)
    print("Wrote %d cleaned and sorted %s to %s" %
          (num_events, event_descriptor, output_file_path))
    report_time_elapsed(start_time)
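# Illustration (hypothetical event fields and delimiter) of the row format
# produced by _write_events: each event's fields are stringified and
# joined with DELIMITER. With a tab delimiter, an event such as
# (3, 17, 1310000000) becomes the line "3\t17\t1310000000".
def _example_event_row(event=(3, 17, 1310000000), delimiter="\t"):
    return delimiter.join(map(str, event))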
def create_fixtures(min_user_id, max_user_id):
    """Create processed Pulse log files with data only for the given users.

    Assume processed data is available in PROCESSED_DATA_DIRECTORY.
    Include only events performed by the given users and stories
    referenced in such events. Reassign story IDs to account for the
    omission of other stories. Reassign the given user IDs to 0, 1, 2,
    etc. to account for the omission of other users. Place output in a
    directory named Fixtures for Users min_user_id-max_user_id within
    PROCESSED_DATA_DIRECTORY, creating such a directory if it does not
    already exist.

    min_user_id, an int, is the smallest user ID to include in the
    output, and is in processed form (i.e., 0, 1, 2) rather than the
    original 38-character hexadecimal format.

    max_user_id, an int, is the largest user ID to include in the
    output, and is in processed form (i.e., 0, 1, 2) rather than the
    original 38-character hexadecimal format.
    """
    start_time = time.time()
    if not isinstance(min_user_id, int) or not isinstance(max_user_id, int):
        raise TypeError("min_user_id and max_user_id must both be of type "
                        "int.")
    if min_user_id > max_user_id:
        raise ValueError(("min_user_id is %d but must be less than or "
                          "equal to max_user_id, which is %d.") %
                         (min_user_id, max_user_id))
    if min_user_id < 0:
        raise ValueError(("min_user_id is %d, but user IDs must be "
                          "non-negative.") % min_user_id)
    reads_list = _read_events(min_user_id, max_user_id,
                              PROCESSED_READS_FILE_PATH)
    clickthroughs_list = _read_events(min_user_id, max_user_id,
                                      PROCESSED_CLICKTHROUGHS_FILE_PATH)
    max_user_id_found = _get_largest_user_id(reads_list, clickthroughs_list)
    if max_user_id_found is None:
        raise LookupError(("No user IDs in the range [%d, %d] were found "
                           "in the processed data.") %
                          (min_user_id, max_user_id))
    if max_user_id_found < max_user_id:
        raise LookupError(("max_user_id is %d, but the largest user ID in "
                           "the processed data is %d.") %
                          (max_user_id, max_user_id_found))
    story_ids = frozenset([event[EVENTS_STORY_ID_INDEX]
                           for event in reads_list + clickthroughs_list])
    stories_list, story_id_dict = _read_stories(story_ids)
    output_directory = "%sFixtures for Users %d-%d/" % \
        (PROCESSED_DATA_DIRECTORY, min_user_id, max_user_id)
    if not os.path.exists(output_directory):
        os.mkdir(output_directory)
    output_reads_path = output_directory + READS_FILENAME
    _write_events(reads_list, output_reads_path, story_id_dict, min_user_id)
    output_clickthroughs_path = output_directory + CLICKTHROUGHS_FILENAME
    _write_events(clickthroughs_list, output_clickthroughs_path,
                  story_id_dict, min_user_id)
    output_stories_path = output_directory + STORIES_FILENAME
    write_iterable(stories_list, output_stories_path, "")
    print("Output fixtures in directory: %s" % output_directory)
    report_time_elapsed(start_time)
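# Hypothetical usage sketch for create_fixtures. The ID range below is
# illustrative, and the call assumes the processed reads, clickthroughs,
# and stories files already exist under PROCESSED_DATA_DIRECTORY.
def _example_create_fixtures():
    # Produces Fixtures for Users 0-99/ containing only events by
    # processed user IDs 0 through 99 and the stories those events
    # reference, with user and story IDs densely reassigned.
    create_fixtures(0, 99)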
def get_user_ids(reads_list, clickthroughs_list):
    start_time = time.time()
    user_ids_set = set()
    for read in reads_list:
        user_ids_set.add(read[EVENTS_USER_ID_INDEX])
    for clickthrough in clickthroughs_list:
        user_ids_set.add(clickthrough[EVENTS_USER_ID_INDEX])
    user_ids_list = sorted(user_ids_set)
    user_ids_dict = dict((original_user_id, new_user_id)
                         for (new_user_id, original_user_id)
                         in enumerate(user_ids_list))
    _reassign_user_ids(user_ids_dict, reads_list)
    _reassign_user_ids(user_ids_dict, clickthroughs_list)
    print("Reassigned %s from original values to 0, 1, 2, etc." %
          USER_IDS_DESCRIPTOR)
    report_time_elapsed(start_time)
    return user_ids_list
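# Illustration (hypothetical IDs) of the reassignment built in
# get_user_ids: original user IDs are sorted, then each is mapped to its
# dense index. Real IDs are 38-character hexadecimal strings.
def _example_user_id_mapping():
    originals = ["ffab", "01cd", "9e77"]
    mapping = {original: new for new, original in
               enumerate(sorted(originals))}
    assert mapping == {"01cd": 0, "9e77": 1, "ffab": 2}
    return mapping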