def stem_processed_stories(input_file_path):
    """Stem the title field of each processed story and write the result.

    Reads delimiter-separated stories from ``input_file_path``, lower-cases
    each row, tokenizes the title field and Porter-stems each token
    (dropping punctuation-only tokens), then writes the rows to
    ``input_file_path + STEMMED_STORIES_EXTENSION``.

    Args:
        input_file_path: Path (str) to the processed stories file.

    Raises:
        TypeError: If ``input_file_path`` is not a str.
    """
    start_time = time.time()
    if not isinstance(input_file_path, str):
        raise TypeError("Expected input_file_path to be of type str.")
    stemmer = PorterStemmer()
    # Hoisted out of the loop: the original built a fresh WordPunctTokenizer
    # for every story, which is pure per-row overhead.
    tokenizer = WordPunctTokenizer()
    # Raw string: '\W' in a plain literal is an invalid escape sequence
    # (DeprecationWarning on Python 3).
    prog = re.compile(r'\W+')
    stories_list = []
    story_stream = open_safely(input_file_path)
    for story_as_str in story_stream:
        # Drop the trailing newline, normalize case, split into fields.
        story_as_list = story_as_str[:-1].lower().split(DELIMITER)
        story_title = story_as_list[NEW_STORIES_TITLE_INDEX]
        tok_contents = tokenizer.tokenize(story_title)
        # Keep only tokens that are not pure punctuation, stemmed.
        stem_contents = [stemmer.stem(word) for word in tok_contents
                         if prog.match(word) is None]
        story_as_list[NEW_STORIES_TITLE_INDEX] = " ".join(stem_contents)
        stories_list.append(story_as_list)
    story_stream.close()
    output_file_path = input_file_path + STEMMED_STORIES_EXTENSION
    write_2d_iterable(stories_list, output_file_path)
    print("Output stemmed stories to %s" % output_file_path)
    report_time_elapsed(start_time)
def _write_stories(stories_dict):
    """Write the stories in ``stories_dict`` to PROCESSED_STORIES_FILE_PATH
    in sorted-key order, replacing each dict value in place with the row
    number at which that story was written.
    """
    start_time = time.time()
    output_stream = open_safely(PROCESSED_STORIES_FILE_PATH, "w")
    sorted_keys = sorted(stories_dict.keys())
    for row_num, story_key in enumerate(sorted_keys):
        if FETCH_FULL_STORIES:
            # Value holds (timestamp, full contents); fold the contents into
            # the title field before serializing.
            story_timestamp, story_contents = stories_dict[story_key]
            title_plus_contents = (story_key[NEW_STORIES_TITLE_INDEX] +
                                   " " + story_contents)
            fields = (story_key[NEW_STORIES_FEED_URL_INDEX],
                      story_key[NEW_STORIES_FEED_TITLE_INDEX],
                      story_key[NEW_STORIES_URL_INDEX],
                      title_plus_contents)
            story_sans_timestamp_as_str = DELIMITER.join(fields)
        else:
            # Value is just the timestamp; the key tuple already holds every
            # serializable field.
            story_timestamp = stories_dict[story_key]
            story_sans_timestamp_as_str = DELIMITER.join(story_key)
        output_stream.write(story_sans_timestamp_as_str + DELIMITER +
                            str(story_timestamp) + "\n")
        # Remember where this story landed so later passes can look it up.
        stories_dict[story_key] = row_num
    output_stream.close()
    print("Wrote %d cleaned and sorted %s to %s"
          % (len(sorted_keys), STORIES_DESCRIPTOR,
             PROCESSED_STORIES_FILE_PATH))
    report_time_elapsed(start_time)
def _clean_data(input_file_path, num_fields, timestamp_index, data_descriptor,
                insert_data_fn, stories_dict, callback_data=None):
    """Clean every row of ``input_file_path`` and report the discard rate.

    Each newline-terminated row is passed to ``_clean_row`` along with
    ``insert_data_fn``, ``stories_dict``, and ``callback_data``.

    Args:
        input_file_path: Path to the raw data file.
        num_fields: Expected number of fields per row.
        timestamp_index: Index of the timestamp field within a row.
        data_descriptor: Human-readable name of the data (for the report).
        insert_data_fn: Callback that records a valid cleaned row.
        stories_dict: Story lookup; non-empty iff stories were already
            cleaned, in which case this call is cleaning reads/clickthroughs.
        callback_data: Accumulator for cleaned reads/clickthroughs
            (unused when cleaning stories themselves).
    """
    start_time = time.time()
    # Non-empty stories_dict means stories were already processed, so this
    # pass is cleaning user reads or clickthroughs instead.
    stories_dict_already_built = (len(stories_dict) > 0)
    num_rows = 0
    input_stream = open_safely(input_file_path)
    for row in input_stream:
        num_rows += 1
        row_without_newline = row[:-1]
        _clean_row(row_without_newline, num_fields, timestamp_index,
                   insert_data_fn, stories_dict, callback_data)
    input_stream.close()
    if stories_dict_already_built:
        # We just cleaned user reads or clickthroughs.
        num_valid_rows = len(callback_data)
    else:
        # We just cleaned stories.
        num_valid_rows = len(stories_dict)
    num_invalid_rows = num_rows - num_valid_rows
    # Guard against an empty input file: the original divided by num_rows
    # unconditionally and raised ZeroDivisionError on zero rows.
    if num_rows > 0:
        discard_rate = float(100 * num_invalid_rows) / float(num_rows)
    else:
        discard_rate = 0.0
    print("Read a total of %d %s, %d (%.2f%%) of which were discarded."
          % (num_rows, data_descriptor, num_invalid_rows, discard_rate))
    report_time_elapsed(start_time)
def _read_stories():
    """Load all stories from STORIES_FILE_PATH.

    Returns a list of tuples, one per story, lower-cased and split on
    DELIMITER, with the timestamp field converted to int.
    """
    story_stream = open_safely(STORIES_FILE_PATH)
    stories = []
    for line in story_stream:
        fields = line[:-1].lower().split(DELIMITER)
        # The timestamp is the only numeric field; store it as an int.
        fields[STORIES_TIMESTAMP_INDEX] = int(fields[STORIES_TIMESTAMP_INDEX])
        stories.append(tuple(fields))
    story_stream.close()
    return stories
def _write_user_ids(user_ids_list):
    """Write one user id per line to USER_IDS_FILE_PATH and report the count."""
    start_time = time.time()
    output_stream = open_safely(USER_IDS_FILE_PATH, "w")
    # Batch the writes instead of one write() call per id.
    output_stream.writelines(user_id + "\n" for user_id in user_ids_list)
    output_stream.close()
    print(("Wrote %d cleaned and sorted original 38-character hexadecimal %s "
           + "to %s")
          % (len(user_ids_list), USER_IDS_DESCRIPTOR, USER_IDS_FILE_PATH))
    report_time_elapsed(start_time)
def _write_events(events_list, output_file_path, event_descriptor):
    """Serialize each event as DELIMITER-joined fields, one event per line."""
    start_time = time.time()
    output_stream = open_safely(output_file_path, "w")
    for event in events_list:
        fields = [str(field) for field in event]
        output_stream.write(DELIMITER.join(fields) + "\n")
    output_stream.close()
    print("Wrote %d cleaned and sorted %s to %s"
          % (len(events_list), event_descriptor, output_file_path))
    report_time_elapsed(start_time)
def classify(version):
    """Run the day-by-day TF-IDF + classifier story-selection experiment.

    For each of 30 simulated days, builds a gensim dictionary and tf-idf
    model over all stories visible so far, trains a per-user model, predicts
    on held-out stories, carries positively-predicted recent stories forward
    via ``reselect_by_user``, and finally writes per-user and averaged
    precision/recall/F1 scores to a file in OUTPUT_DIRECTORY.

    Args:
        version: Experiment version tag, recorded in the output file name and
            forwarded to ``_train_and_predict``.
    """
    if not os.path.exists(OUTPUT_DIRECTORY):
        os.mkdir(OUTPUT_DIRECTORY)
    ignore = False
    logging.basicConfig(format="%(asctime)s : %(levelname)s : %(message)s",
                        level=logging.INFO)
    random.seed()
    stories = _read_stories()
    events = _read_events()
    user_list = [[] for i in range(NUM_USERS_TO_ANALYZE)]
    day = 0
    max_day = 30
    curr_day = EARLIEST_ACCEPTABLE_TIMESTAMP
    curr_day += SECONDS_IN_DAY
    reselect_by_user = [[] for i in range(NUM_USERS_TO_ANALYZE)]
    while day < max_day:
        # Vocabulary is built only from stories visible up to the current day.
        corpus_dict = corpora.Dictionary(
            story[NEW_STORIES_TITLE_INDEX].split()
            for story in stories
            if story[STORIES_TIMESTAMP_INDEX] <= curr_day)
        stop_ids = [corpus_dict.token2id[stopword] for stopword in STOPLIST
                    if stopword in corpus_dict.token2id]
        # BUG FIX: dict.iteritems() is Python 2 only; .items() behaves
        # identically here and works on Python 3 as well.
        once_ids = [tokenid for tokenid, docfreq in corpus_dict.dfs.items()
                    if docfreq == 1]
        # Remove stop words and words that appear only once, then close the
        # resulting gaps in the id sequence.
        corpus_dict.filter_tokens(stop_ids + once_ids)
        corpus_dict.compactify()
        tfidf = _build_tfidf_model(corpus_dict, stories, curr_day)
        for user_id in range(NUM_USERS_TO_ANALYZE):
            (user_tfidf, pos_tfidf, num_pos_train, num_neg_train,
             to_ignore) = _tfidf(tfidf, corpus_dict, stories, events, user_id,
                                 curr_day, reselect_by_user[user_id], False)
            if user_tfidf != []:
                (to_predict, other_tfidf, num_pos_predict, num_neg_predict,
                 chosen_stories) = _tfidf(tfidf, corpus_dict, stories, events,
                                          user_id, curr_day, [], True)
                if to_predict != []:
                    p_labs, p_vals, labels_predict = _train_and_predict(
                        user_tfidf, pos_tfidf, to_predict, num_pos_train,
                        num_neg_train, num_pos_predict, num_neg_predict,
                        version, ignore)
                    reselect = []
                    num_bool = True
                    for i in range(len(p_labs)):
                        if labels_predict[i] == -1:
                            if num_bool:
                                # First true-negative index marks where the
                                # positive block ends in the prediction set.
                                num_pos_predict = i
                                num_bool = False
                            if p_labs[i] == 1:
                                # Predicted positive: carry the story forward
                                # if it is still recent on the next day.
                                next_day = curr_day + SECONDS_IN_DAY
                                if chosen_stories[i - num_pos_predict][2] <= next_day:
                                    reselect += [chosen_stories[i - num_pos_predict]]
                    reselect_by_user[user_id] += reselect
                    p, r, f = _p_r_f_one(labels_predict, p_labs)
                    user_list[user_id].append((p, r, f, day))
        curr_day += SECONDS_IN_DAY
        day += 1
    user_a_p = 0
    user_a_r = 0
    user_a_f = 0
    skipped = 0
    print("Read stories from %s" % STORIES_FILE_PATH)
    print("Read events from %s" % EVENTS_FILE_PATH)
    print("%d users were analyzed" % NUM_USERS_TO_ANALYZE)
    output_file_name = ("reselect.py %s %s %d %d output written at %d.txt"
                        % (version, sys.argv[2], KERNEL_NUMBER,
                           NUM_USERS_TO_ANALYZE, time.time()))
    output_file_path = OUTPUT_DIRECTORY + output_file_name
    print("Outputting precision, recall, and f_1 scores to %s"
          % output_file_path)
    user_id = 0
    output_stream = open_safely(output_file_path, "w")
    for user in user_list:
        av_p = 0
        av_r = 0
        for results in user:
            av_p += results[0]
            av_r += results[1]
            f_1 = results[2]
            day = results[3]
            output_stream.write("%.3f\t%.3f\t%.3f\t%d\t%d\n"
                                % (results[0], results[1], f_1, day, user_id))
        if len(user) > 0:
            av_p = av_p / float(len(user))
            av_r = av_r / float(len(user))
            denominator = av_p + av_r
            if denominator == 0.0:
                av_f = 0.0
            else:
                av_f = (2 * av_p * av_r) / float(av_p + av_r)
            user_a_p += av_p
            user_a_r += av_r
        else:
            # User had no days with predictions; exclude from the average.
            skipped += 1
        user_id += 1
    user_a_p = user_a_p / float(NUM_USERS_TO_ANALYZE - skipped)
    user_a_r = user_a_r / float(NUM_USERS_TO_ANALYZE - skipped)
    denominator = user_a_p + user_a_r
    if denominator == 0.0:
        user_a_f = 0.0
    else:
        user_a_f = (2 * user_a_p * user_a_r) / float(user_a_p + user_a_r)
    # Final summary row: averaged scores flagged with day = user = -1.
    output_stream.write("%.3f\t%.3f\t%.3f\t-1\t-1\n"
                        % (user_a_p, user_a_r, user_a_f))
    output_stream.close()
def _read_events():
    """Read all events from EVENTS_FILE_PATH as tuples of ints."""
    event_stream = open_safely(EVENTS_FILE_PATH)
    events = []
    for line in event_stream:
        # Strip the trailing newline, split on DELIMITER, convert to ints.
        events.append(tuple(int(field)
                            for field in line[:-1].split(DELIMITER)))
    event_stream.close()
    return events