def calculate_metrics(df):
    """Calculate metrics at different cutoffs k (1 to 10, then 20, 30, 40, 50, 100, 200, 300, 500)."""
    #print(df.columns)
    klist = list(range(1, 11))
    klist.extend([20, 30, 40, 50, 100, 200, 300, 500])
    print(klist)
    # 8 columns (4 metrics x 2 models: p2v, d2v) are added for each cutoff k
    for k in tqdm(klist):
        df['average_precision_p2v_{}'.format(k)] = df['p2v_binary'].progress_apply(lambda x: average_precision(x, k))
        df['average_precision_d2v_{}'.format(k)] = df['d2v_binary'].progress_apply(lambda x: average_precision(x, k))
        df['recall_p2v_{}'.format(k)] = df[['p2v_binary', 'ground_truth']].apply(
            lambda x: recall_at_k(x.p2v_binary, x.ground_truth, k), axis=1)
        df['recall_d2v_{}'.format(k)] = df[['d2v_binary', 'ground_truth']].progress_apply(
            lambda x: recall_at_k(x.d2v_binary, x.ground_truth, k), axis=1)
        df['reciprocal_rank_p2v_{}'.format(k)] = df['p2v_binary'].progress_apply(lambda x: reciprocal_rank(x, k))
        df['reciprocal_rank_d2v_{}'.format(k)] = df['d2v_binary'].progress_apply(lambda x: reciprocal_rank(x, k))
        df['ndcg_p2v_{}'.format(k)] = df['p2v_binary'].progress_apply(lambda x: ndcg(x, k))
        df['ndcg_d2v_{}'.format(k)] = df['d2v_binary'].progress_apply(lambda x: ndcg(x, k))

    #df.to_csv('/home/ashwath/Programs/UnpaywallMAG/Evaluation/paper2vec_unpaywall_500.tsv', sep='\t')
    df.to_pickle('/home/ashwath/Programs/UnpaywallMAG/Pickles/paperwisemetrics_unpaywall_p2v_d2v_df_may21.pickle')
    print("METRICS CALCULATED, time to calculate the means")

    # Drop the non-metric columns and take the mean of every metric column
    df = df.drop(['p2v_recommendations', 'p2v_binary', 'd2v_recommendations', 'd2v_binary', 'ground_truth'], axis=1)
    mean_series = df.mean()
    mean_series.to_csv('/home/ashwath/Programs/UnpaywallMAG/Evaluation/meanmetrics_p2v_d2v_may21.tsv',
                       sep='\t', index=True, header=False)
    print("C'est fini.")
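# The helpers used above (average_precision, recall_at_k, reciprocal_rank, ndcg) are not shown in
# this snippet. Below is a minimal sketch of what the ndcg helper might look like, assuming it takes
# a binary relevance list (1 = relevant recommendation, 0 = not) in ranked order plus a cutoff k and
# normalises against the ideal reordering of that list; the real implementation may differ.
import numpy as np

def ndcg(binary_relevance, k):
    """Sketch (assumed signature): NDCG@k for a binary relevance list in ranked order."""
    rel = np.asarray(binary_relevance, dtype=float)[:k]
    if rel.size == 0 or rel.sum() == 0:
        return 0.0
    discounts = 1.0 / np.log2(np.arange(2, rel.size + 2))   # 1/log2(rank+1), ranks start at 1
    dcg = float((rel * discounts).sum())
    ideal = np.sort(rel)[::-1]                               # ideal ranking puts the hits first
    idcg = float((ideal * discounts).sum())
    return dcg / idcg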
def evaluate(self, X):
    qid_unique = np.unique(X["qid"])
    n = len(qid_unique)
    losses = np.zeros(n)
    ndcgs = np.zeros(n)
    ndcgs_all = np.zeros(n)
    errs = np.zeros(n)
    for e, qid in enumerate(qid_unique):
        ind = np.where(X["qid"] == qid)[0]
        feed_dict = self._get_feed_dict(X, ind, training=False)
        loss, score = self.sess.run((self.loss, self.score), feed_dict=feed_dict)
        df = pd.DataFrame({
            "label": X["label"][ind].flatten(),
            "score": score.flatten(),
        })
        df.sort_values("score", ascending=False, inplace=True)
        losses[e] = loss
        ndcgs[e] = ndcg(df["label"])
        ndcgs_all[e] = ndcg(df["label"], top_ten=False)
        #errs[e] = calc_err(df["label"])
    losses_mean = np.mean(losses)
    ndcgs_mean = np.mean(ndcgs)
    ndcgs_all_mean = np.mean(ndcgs_all)
    errs_mean = np.mean(errs)
    return losses_mean, errs_mean, ndcgs_mean, ndcgs_all_mean
def evaluate(path):
    cad = read_dataset('cad.csv')
    rgbd = read_dataset('rgbd.csv')
    freqs = freq_count(cad)
    results = load_results(path, rgbd, cad)

    mP = 0.0
    mR = 0.0
    mF = 0.0
    mAP = 0.0
    mNDCG = 0.0
    mNNT1 = 0.0
    mNNT2 = 0.0
    for (queried, retrieved) in results:
        f = freqs[queried[0]]
        x = categories_to_rel(queried, retrieved)[:f]
        # Sum up the retrieval scores
        mP += precision(x)
        mR += recall(x, f)
        mF += f1score(x, f)
        mNDCG += ndcg(x)
        mAP += average_precision(x, f)
        mNNT1 += nnt1(x, f)
        mNNT2 += nnt2(x, f)

    n = len(results)
    print('num queries:', n)
    print('mean precision:', mP / n)
    print('mean recall:', mR / n)
    print('mean F1:', mF / n)
    print('mean AP:', mAP / n)
    print('mean NDCG:', mNDCG / n)
    print('mean NNT1:', mNNT1 / n)
    print('mean NNT2:', mNNT2 / n)

    # Plot PR-curve
    cutoff = 1000
    mean_precisions = np.zeros(cutoff, np.float64)
    mean_recalls = np.zeros(cutoff, np.float64)
    for (queried, retrieved) in results:
        x = categories_to_rel(queried, retrieved)[:cutoff]
        x = np.pad(x, (0, cutoff - len(x)), 'constant', constant_values=(0))
        precisions = []
        recalls = []
        for k, _ in enumerate(x):
            p = precision(x[:k + 1])
            r = recall(x[:k + 1], freqs[queried[0]])
            precisions.append(p)
            recalls.append(r)
        mean_precisions += precisions
        mean_recalls += recalls
    mean_precisions /= len(results)
    mean_recalls /= len(results)

    plt.plot(mean_recalls, mean_precisions)
    plt.xlabel('Recall')
    plt.ylabel('Precision')
    plt.axis([0, 1, 0, 1.05])
    plt.show()
def evaluate(path):
    queries = read_dataset('queries.csv')
    targets = read_dataset('targets.csv')
    freqs = freq_count(targets)
    results = load_results(path, queries, targets)

    cutoff = 1000
    precisions = []
    recalls = []
    f1scores = []
    aps = []
    gains = []
    nnt1s = []
    nnt2s = []
    for (queried, retrieved) in results:
        x = categories_to_rel(queried, retrieved)[:cutoff]
        p = precision(x)
        r = recall(x, freqs[queried[0]])
        f = f1score(x, freqs[queried[0]])
        g = ndcg(x)
        ap = average_precision(x, freqs[queried[0]])
        t1 = nnt1(x, freqs[queried[0]])
        t2 = nnt2(x, freqs[queried[0]])
        precisions.append(p)
        recalls.append(r)
        f1scores.append(f)
        gains.append(g)
        aps.append(ap)
        nnt1s.append(t1)
        nnt2s.append(t2)
        print('precision:', p)
        print('recall:', r)
        print('F1 score:', f)
        print('average precision:', ap)
        print('NDCG:', g)
        print('nearest neighbor:', t1, t2)
def evaluate_ndcg(args):
    sum_ndcg = 0
    for idx, playlist in enumerate(playlists):
        recommended_songs_scores = [
            (idx_to_track_all[k], score)
            for k, score in enumerate(user_item_recs.getrow(idx).toarray()[0])
            if score > 0
        ]
        existing_songs = {
            track['track_uri']: 1
            for track in playlists[idx]["tracks"]
        }
        existing_albums = set([
            track_to_data[track]["album_uri"]
            for track in existing_songs.keys()
        ])
        existing_artists = defaultdict(int)

        if len(existing_albums) > 0 and len(existing_songs) / len(existing_albums) > args['song_to_album_ratio']:
            if "random" not in sample_group:
                potential_recs = [
                    track for track, vals in track_to_data.items()
                    if vals["album_uri"] == track_to_data[playlist["tracks"][-1]["track_uri"]]["album_uri"]
                ]
            else:
                potential_recs = []
                for album in existing_albums:
                    potential_recs += [
                        track for track, vals in track_to_data.items()
                        if vals["album_uri"] == album
                    ]
        else:
            potential_recs = []

        potential_recs += [
            song for song, score in sorted(recommended_songs_scores, key=itemgetter(1), reverse=True)
        ]
        potential_recs += overall_top_songs

        recs = []
        for song in potential_recs:
            if len(recs) == 500:
                break
            if (song not in existing_songs and song not in recs
                    and existing_artists[track_to_data[song]["artist_uri"]] < args['max_songs_by_same_artist']):
                recs.append(song)
                existing_artists[track_to_data[song]["artist_uri"]] += 1

        holdout_tracks = [
            track['track_uri']
            for track in val_holdout_tracks[playlist["pid"]]["tracks"]
        ]
        sum_ndcg += metrics.ndcg(holdout_tracks, recs, 500)

    return {'ndcg': sum_ndcg / len(playlists), 'status': "OK", 'model': None}
def evaluate(path):
    queries = read_dataset('queries.csv')
    targets = read_dataset('targets.csv')
    freqs = freq_count(targets)
    results = load_results(path, queries, targets)

    cutoff = 1000
    precisions = []
    recalls = []
    f1scores = []
    aps = []
    gains = []
    nnt1s = []
    nnt2s = []
    for (queried, retrieved) in results:
        x = categories_to_rel(queried, retrieved)[:cutoff]
        p = precision(x)
        r = recall(x, freqs[queried[0]])
        f = f1score(x, freqs[queried[0]])
        g = ndcg(x)
        ap = average_precision(x, freqs[queried[0]])
        t1 = nnt1(x, freqs[queried[0]])
        t2 = nnt2(x, freqs[queried[0]])
        precisions.append(p)
        recalls.append(r)
        f1scores.append(f)
        gains.append(g)
        aps.append(ap)
        nnt1s.append(t1)
        nnt2s.append(t2)

    print('mean precision:', numpy.mean(precisions))
    print('mean recall:', numpy.mean(recalls))
    print('mean F1 score:', numpy.mean(f1scores))
    print('mAP:', numpy.mean(aps))
    print('mean NDCG:', numpy.mean(gains))
    print('mean nearest neighbor:', numpy.mean(nnt1s), numpy.mean(nnt2s))

    # plot precision-recall curve
    mean_precisions = numpy.zeros(cutoff, numpy.float64)
    mean_recalls = numpy.zeros(cutoff, numpy.float64)
    for (queried, retrieved) in results:
        x = categories_to_rel(queried, retrieved)[:cutoff]
        x = numpy.pad(x, (0, cutoff - len(x)), 'constant', constant_values=(0))
        precisions = []
        recalls = []
        for k, _ in enumerate(x):
            p = precision(x[:k + 1])
            r = recall(x[:k + 1], freqs[queried[0]])
            precisions.append(p)
            recalls.append(r)
        mean_precisions += precisions
        mean_recalls += recalls
    mean_precisions /= len(results)
    mean_recalls /= len(results)

    plt.plot(mean_recalls, mean_precisions)
    plt.xlabel('Recall')
    plt.ylabel('Precision')
    plt.axis([0, 1, 0, 1.05])
    plt.show()
def infer(train_data, test_data, user_size, item_size):
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True

    with tf.Session(config=config) as sess:
        ############################### CREATE MODEL #############################
        iterator = tf.data.Iterator.from_structure(train_data.output_types,
                                                   train_data.output_shapes)
        model = NCF.NCF(FLAGS.embedding_size, user_size, item_size, FLAGS.lr,
                        FLAGS.optim, FLAGS.initializer, FLAGS.loss_func,
                        FLAGS.activation, FLAGS.regularizer, iterator,
                        FLAGS.topK, FLAGS.dropout, is_training=True)
        model.build()
        # train_init_op = iterator.make_initializer(train_data)

        ckpt = tf.train.get_checkpoint_state(FLAGS.model_dir)
        if ckpt:
            print("Reading model parameters from %s" % ckpt.model_checkpoint_path)
            model.saver.restore(sess, ckpt.model_checkpoint_path)
        else:
            print("model files do not exist")
            exit(1)

        ############################### INFERENCE ####################################
        total_time = 0
        count = 0
        for epoch in range(FLAGS.epochs):
            ################################ EVALUATION ##################################
            sess.run(model.iterator.make_initializer(test_data))
            model.is_training = False
            HR, MRR, NDCG = [], [], []
            start_time = time.time()
            try:
                while True:
                    prediction, label = model.step(sess, None)
                    count = count + 1
                    label = int(label[0])
                    HR.append(metrics.hit(label, prediction))
                    MRR.append(metrics.mrr(label, prediction))
                    NDCG.append(metrics.ndcg(label, prediction))
            except tf.errors.OutOfRangeError:
                hr = np.array(HR).mean()
                mrr = np.array(MRR).mean()
                ndcg = np.array(NDCG).mean()
                print("Epoch %d testing " % epoch + "Took: " +
                      time.strftime("%H: %M: %S", time.gmtime(time.time() - start_time)))
                print("HR is %.3f, MRR is %.3f, NDCG is %.3f" % (hr, mrr, ndcg))
                total_time += time.time() - start_time

        print("Total Epochs: %d on inference " % (epoch + 1))
        print("Total recommendations: %d" % (count * FLAGS.batch_size))
        print("Approximate accelerator time in seconds is: %.2f" % total_time)
        print("Approximate accelerator performance in recommendations/second is: %.2f" %
              (float(count * FLAGS.batch_size) / float(total_time)))
def calc_metrics(file):
    all_predictions = []
    all_labels = []
    impressions = read_impressions_file(file)
    if sample_size > 0:
        impressions = random.sample(impressions, sample_size)
    for i, impression in enumerate(impressions):
        preds, labels = calc_impression(impression)
        all_predictions.append(preds)
        all_labels.append(labels)
        if i % 100 == 99:
            print("Completed {} / {}".format(i + 1, len(impressions)))
    metrics = {
        "auc": group_auc(all_predictions, all_labels),
        "mrr": mrr(all_predictions, all_labels),
        "ndcg@5": ndcg(all_predictions, all_labels, 5),
        "ndcg@10": ndcg(all_predictions, all_labels, 10)
    }
    return metrics
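# calc_metrics above relies on a grouped ndcg(all_predictions, all_labels, k) helper that is not
# shown here. A minimal sketch under the assumption that the two arguments are parallel lists with
# one score array and one 0/1 label array per impression, and that the per-impression values are
# averaged; the actual helper may differ.
import numpy as np

def ndcg(all_predictions, all_labels, k):
    """Sketch (assumed signature): mean NDCG@k over impressions, ranking items by predicted score."""
    scores = []
    for preds, labels in zip(all_predictions, all_labels):
        labels = np.asarray(labels, dtype=float)
        order = np.argsort(preds)[::-1]                      # rank items by descending score
        ranked = labels[order][:k]
        discounts = 1.0 / np.log2(np.arange(2, ranked.size + 2))
        dcg = float((ranked * discounts).sum())
        ideal = np.sort(labels)[::-1][:k]                    # best possible ordering of the labels
        idcg = float((ideal * discounts).sum())
        scores.append(dcg / idcg if idcg > 0 else 0.0)
    return float(np.mean(scores))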
def eval_model(model, data_loader, sample_prob=1.0, train=False):
    sample_data = data_loader.sample_valid_data(sample_prob, train=train)
    with torch.no_grad():
        all_predictions = []
        all_labels = []
        for impression in sample_data:
            user_ids, news_ids, _, _, _, labels = impression
            prediction = model(user_ids, news_ids).view(-1)
            all_predictions.append(prediction.detach().numpy())
            all_labels.append(labels.detach().numpy())
        metrics = {
            "auc": group_auc(all_predictions, all_labels),
            "mrr": mrr(all_predictions, all_labels),
            "ndcg@5": ndcg(all_predictions, all_labels, 5),
            "ndcg@10": ndcg(all_predictions, all_labels, 10)
        }
        print(metrics)
def main():
    # dataset has format like [user_id, song_id, play_count]
    file = 'train_triplets.txt'
    print("Loading data...")
    load_data(file)
    print("Starting evaluation...")
    calc_neighbours()
    print("Finished evaluations.")
    print_top_songs_for_user(1)
    print("Starting cross validation...")
    print("RMSE result: ", str(rmse(train_set, test_set)))
    print("MAE result: ", str(mae(train_set, test_set)))
    print("NDCG result: ", str(ndcg(train_set, test_set)))
def test(model, sess, test_data, all_items_idx, user_bought):
    model.is_training = False
    model.test_first = True
    all_items_embed = []
    HR, MRR, NDCG = [], [], []

    ########################## GET ALL ITEM EMBEDDINGS ONCE ######################
    for sample in test_data.get_all_test():
        item_embed = model.step(sess, sample, None, None)
        all_items_embed.append(item_embed[0][0])

    model.test_first = False
    all_items_embed = np.array(all_items_embed)

    ########################## TEST FOR EACH USER-QUERY PAIR #####################
    for sample in test_data.get_instance():
        item_indices = model.step(sess, sample, all_items_embed, None)[0]
        itemID = sample[3]
        reviewerID = sample[4]

        ranking_list = all_items_idx[item_indices].tolist()
        top_idx = []
        u_bought = user_bought[reviewerID] if reviewerID in user_bought else []
        while len(top_idx) < FLAGS.topK:
            # skip items already bought by the user, except the ground-truth item
            candidate_item = ranking_list.pop()
            if candidate_item not in u_bought or candidate_item == itemID:
                top_idx.append(candidate_item)
        top_idx = np.array(top_idx)

        HR.append(metrics.hit(itemID, top_idx))
        MRR.append(metrics.mrr(itemID, top_idx))
        NDCG.append(metrics.ndcg(itemID, top_idx))

    hr = np.array(HR).mean()
    mrr = np.array(MRR).mean()
    ndcg = np.array(NDCG).mean()
    print("HR is %.3f, MRR is %.3f, NDCG is %.3f" % (hr, mrr, ndcg))
def test(data_set, model, data_loader, show_auc=False, use_dummy_gcn=False, use_struc=None):
    with torch.no_grad():
        logging.info('----- start_test -----')
        model.eval()
        precision = []
        recall = []
        ndcg_score = []
        auc_score = []
        for user_ids, _, __ in data_loader:
            user_ids = user_ids.to(device)
            ratings = model.get_users_ratings(user_ids, use_dummy_gcn, use_struc)
            ground_truths = []
            for i, user_id_t in enumerate(user_ids):
                user_id = user_id_t.item()
                ground_truths.append(data_set.test_user_dict[user_id])
                train_pos = data_set.train_user_dict[user_id]
                for pos_item in train_pos:
                    ratings[i][pos_item] = -1  # mask items already seen in training

            # Precision, Recall, NDCG
            ___, index_k = torch.topk(ratings, k=TOPK)
            # index_k.shape = (batch_size, TOPK), dtype=torch.int
            batch_predict_items = index_k.cpu().tolist()
            batch_precision, batch_recall = precision_and_recall(batch_predict_items, ground_truths)
            batch_ndcg = ndcg(batch_predict_items, ground_truths)

            # AUC
            if show_auc:
                ratings = ratings.cpu().numpy()
                batch_auc = auc(ratings, data_set.get_item_num(), ground_truths)
                auc_score.append(batch_auc)

            precision.append(batch_precision)
            recall.append(batch_recall)
            ndcg_score.append(batch_ndcg)

        precision = np.mean(precision)
        recall = np.mean(recall)
        ndcg_score = np.mean(ndcg_score)
        if show_auc:
            # Calculating AUC scores takes a long time
            auc_score = np.mean(auc_score)
            logging.info('test result: precision ' + str(precision) + '; recall ' + str(recall) +
                         '; ndcg ' + str(ndcg_score) + '; auc ' + str(auc_score))
        else:
            logging.info('test result: precision ' + str(precision) + '; recall ' + str(recall) +
                         '; ndcg ' + str(ndcg_score))
def _run_evaluation_on_selected_users(self, recommender_object, usersToEvaluate):

    start_time = time.time()
    start_time_print = time.time()

    results_dict = {}

    for cutoff in self.cutoff_list:
        results_dict[cutoff] = create_empty_metrics_dict(
            self.n_items, self.n_users, recommender_object.URM_train,
            self.ignore_items_ID, self.ignore_users_ID, cutoff,
            self.diversity_object)

    n_users_evaluated = 0

    for test_user in usersToEvaluate:

        # Being the URM CSR, the indices are the non-zero column indexes
        relevant_items = self.get_user_relevant_items(test_user)

        n_users_evaluated += 1

        recommended_items = recommender_object.recommend(
            test_user,
            remove_seen_flag=self.exclude_seen,
            cutoff=self.max_cutoff,
            remove_top_pop_flag=False,
            remove_CustomItems_flag=self.ignore_items_flag)

        is_relevant = np.in1d(recommended_items, relevant_items, assume_unique=True)

        for cutoff in self.cutoff_list:

            results_current_cutoff = results_dict[cutoff]

            is_relevant_current_cutoff = is_relevant[0:cutoff]
            recommended_items_current_cutoff = recommended_items[0:cutoff]

            results_current_cutoff[EvaluatorMetrics.ROC_AUC.value] += roc_auc(is_relevant_current_cutoff)
            results_current_cutoff[EvaluatorMetrics.PRECISION.value] += precision(is_relevant_current_cutoff, len(relevant_items))
            results_current_cutoff[EvaluatorMetrics.RECALL.value] += recall(is_relevant_current_cutoff, relevant_items)
            results_current_cutoff[EvaluatorMetrics.RECALL_TEST_LEN.value] += recall_min_test_len(is_relevant_current_cutoff, relevant_items)
            results_current_cutoff[EvaluatorMetrics.MAP.value] += map(is_relevant_current_cutoff, relevant_items)
            results_current_cutoff[EvaluatorMetrics.MRR.value] += rr(is_relevant_current_cutoff)
            results_current_cutoff[EvaluatorMetrics.NDCG.value] += ndcg(
                recommended_items_current_cutoff, relevant_items,
                relevance=self.get_user_test_ratings(test_user), at=cutoff)
            results_current_cutoff[EvaluatorMetrics.HIT_RATE.value] += is_relevant_current_cutoff.sum()
            results_current_cutoff[EvaluatorMetrics.ARHR.value] += arhr(is_relevant_current_cutoff)

            results_current_cutoff[EvaluatorMetrics.NOVELTY.value].add_recommendations(recommended_items_current_cutoff)
            results_current_cutoff[EvaluatorMetrics.DIVERSITY_GINI.value].add_recommendations(recommended_items_current_cutoff)
            results_current_cutoff[EvaluatorMetrics.SHANNON_ENTROPY.value].add_recommendations(recommended_items_current_cutoff)
            results_current_cutoff[EvaluatorMetrics.COVERAGE_ITEM.value].add_recommendations(recommended_items_current_cutoff)
            results_current_cutoff[EvaluatorMetrics.COVERAGE_USER.value].add_recommendations(recommended_items_current_cutoff, test_user)
            results_current_cutoff[EvaluatorMetrics.DIVERSITY_MEAN_INTER_LIST.value].add_recommendations(recommended_items_current_cutoff)
            results_current_cutoff[EvaluatorMetrics.DIVERSITY_HERFINDAHL.value].add_recommendations(recommended_items_current_cutoff)

            if EvaluatorMetrics.DIVERSITY_SIMILARITY.value in results_current_cutoff:
                results_current_cutoff[EvaluatorMetrics.DIVERSITY_SIMILARITY.value].add_recommendations(recommended_items_current_cutoff)

        if time.time() - start_time_print > 30 or n_users_evaluated == len(self.usersToEvaluate):
            print("SequentialEvaluator: Processed {} ( {:.2f}% ) in {:.2f} seconds. Users per second: {:.0f}".format(
                n_users_evaluated,
                100.0 * float(n_users_evaluated) / len(self.usersToEvaluate),
                time.time() - start_time,
                float(n_users_evaluated) / (time.time() - start_time)))

            sys.stdout.flush()
            sys.stderr.flush()

            start_time_print = time.time()

    return results_dict, n_users_evaluated
def train(train_data, test_data, user_size, item_size):
    with tf.Session() as sess:
        iterator = tf.data.Iterator.from_structure(train_data.output_types,
                                                   train_data.output_shapes)
        model = NCF.NCF(FLAGS.embedding_size, user_size, item_size, FLAGS.lr,
                        FLAGS.optim, FLAGS.initializer, FLAGS.loss_func,
                        FLAGS.activation, FLAGS.regularizer, iterator,
                        FLAGS.topK, FLAGS.dropout, is_training=True)
        model.build()

        ckpt = tf.train.get_checkpoint_state(FLAGS.model_dir)
        if ckpt:
            print("Reading model parameters from %s" % ckpt.model_checkpoint_path)
            model.saver.restore(sess, ckpt.model_checkpoint_path)
        else:
            print("Creating model with fresh parameters.")
            sess.run(tf.global_variables_initializer())

        count = 0
        for epoch in range(FLAGS.epochs):
            sess.run(model.iterator.make_initializer(train_data))
            model.is_training = True
            model.get_data()
            start_time = time.time()

            try:
                while True:
                    model.step(sess, count)
                    count += 1
            except tf.errors.OutOfRangeError:
                print("Epoch %d training " % epoch + "Took: " +
                      time.strftime("%H: %M: %S", time.gmtime(time.time() - start_time)))

            sess.run(model.iterator.make_initializer(test_data))
            model.is_training = False
            model.get_data()
            start_time = time.time()
            HR, MRR, NDCG = [], [], []
            prediction, label = model.step(sess, None)
            try:
                while True:
                    prediction, label = model.step(sess, None)
                    label = int(label[0])
                    HR.append(metrics.hit(label, prediction))
                    MRR.append(metrics.mrr(label, prediction))
                    NDCG.append(metrics.ndcg(label, prediction))
            except tf.errors.OutOfRangeError:
                hr = np.array(HR).mean()
                mrr = np.array(MRR).mean()
                ndcg = np.array(NDCG).mean()
                print("Epoch %d testing " % epoch + "Took: " +
                      time.strftime("%H: %M: %S", time.gmtime(time.time() - start_time)))
                print("HR is %.3f, MRR is %.3f, NDCG is %.3f" % (hr, mrr, ndcg))

        ################################## SAVE MODEL ################################
        checkpoint_path = os.path.join(FLAGS.model_dir, "NCF.ckpt")
        model.saver.save(sess, checkpoint_path)
ndcg_orig = []
ndcg_pred = []
ndcg_rnd = []
gts = []
res_orig = []
res_pred = []
res_rnd = []

pool = Pool(40)
for i in range(int(len(playlists_ids) / 1000)):
    results = pool.map(evaluate, range(i * 1000, (i + 1) * 1000))
    for rets_orig, rets_pred, gt in results:
        if len(gt) > 0:
            res_orig.append(rets_orig)
            res_pred.append(rets_pred)
            rets_rnd = [int(tr) for tr in rnd.sample(dict_test_ids.keys(), N)]
            res_rnd.append(rets_rnd)
            gts.append(gt)
            ndcg_pred.append(metrics.ndcg(gt, rets_pred, N))
            ndcg_orig.append(metrics.ndcg(gt, rets_orig, N))
            ndcg_rnd.append(metrics.ndcg(gt, rets_rnd, N))

print("MAP")
print("PRED MAP@", N, ": ", metrics.mapk(gts, res_pred, N))
print("ORIG MAP@", N, ": ", metrics.mapk(gts, res_orig, N))
print("RND MAP@", N, ": ", metrics.mapk(gts, res_rnd, N))
print("NDCG:")
print("PRED: ", np.mean(ndcg_pred))
print("ORIG: ", np.mean(ndcg_orig))
print("RND: ", np.mean(ndcg_rnd))
sum_rprec = 0
sum_ndcg = 0
sum_clicks = 0
sum_rprec_arti = 0
rprec_by_group = defaultdict(float)
ndcg_by_group = defaultdict(float)
clicks_by_group = defaultdict(float)
rprec_arti_by_group = defaultdict(float)
group_counts = defaultdict(int)
total_playlists = len(pid_recs_map)

for pid in pid_recs_map:
    recs = pid_recs_map[pid]
    holdout_tracks = [track['track_uri'] for track in val_holdout_tracks[pid]["tracks"]]

    rprec = metrics.r_precision(holdout_tracks, recs)
    ndcg = metrics.ndcg(holdout_tracks, recs, 500)
    clicks = metrics.playlist_extender_clicks(holdout_tracks, recs, 500)
    rprec_arti = metrics.r_precision_with_artist_fallback(holdout_tracks, recs, track_to_data)

    sum_rprec += rprec
    sum_ndcg += ndcg
    sum_clicks += clicks
    sum_rprec_arti += rprec_arti

    sample_type = val_holdout_tracks[pid]["sample_type"]
    rprec_by_group[sample_type] += rprec
    ndcg_by_group[sample_type] += ndcg
    clicks_by_group[sample_type] += clicks
    rprec_arti_by_group[sample_type] += rprec_arti
    group_counts[sample_type] += 1
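# The playlist evaluation above calls metrics.ndcg(holdout_tracks, recs, k) with a list of held-out
# ground-truth track URIs, a ranked list of recommended track URIs, and a cutoff. A minimal sketch
# under that assumed signature, with a binary gain of 1 whenever a recommended track is in the
# holdout set; the actual metrics module may compute it differently.
import math

def ndcg(ground_truth, recommendations, k):
    """Sketch (assumed signature): NDCG@k with relevance = membership in the ground-truth set."""
    relevant = set(ground_truth)
    dcg = sum(1.0 / math.log2(rank + 2)
              for rank, item in enumerate(recommendations[:k]) if item in relevant)
    ideal_hits = min(len(relevant), k)                       # best case: all hits ranked first
    idcg = sum(1.0 / math.log2(rank + 2) for rank in range(ideal_hits))
    return dcg / idcg if idcg > 0 else 0.0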
#
# TODO: Here you can write to file the recommendations for each user in the test split.
# WARNING: there is a catch with the item idx!
#

# this will rank *all* items
recommended_items = recommender.recommend(user_profile, exclude_seen=True)
# use this to have the *top-k* recommended items (warning: this can underestimate ROC-AUC for small k)
# recommended_items = recommender.recommend(user_profile, k=at, exclude_seen=True)

roc_auc_ += roc_auc(recommended_items, relevant_items)
precision_ += precision(recommended_items, relevant_items, at=at)
recall_ += recall(recommended_items, relevant_items, at=at)
map_ += map(recommended_items, relevant_items, at=at)
mrr_ += rr(recommended_items, relevant_items, at=at)
ndcg_ += ndcg(recommended_items, relevant_items, relevance=test[test_user].data, at=at)

roc_auc_ /= neval
precision_ /= neval
recall_ /= neval
map_ /= neval
mrr_ /= neval
ndcg_ /= neval

logger.info('Ranking quality')
logger.info('ROC-AUC: {:.4f}'.format(roc_auc_))
logger.info('Precision@{}: {:.4f}'.format(at, precision_))
logger.info('Recall@{}: {:.4f}'.format(at, recall_))
logger.info('MAP@{}: {:.4f}'.format(at, map_))
logger.info('MRR@{}: {:.4f}'.format(at, mrr_))
logger.info('NDCG@{}: {:.4f}'.format(at, ndcg_))
import time
import datetime

from data import load_data, get_data, get_song_sets
from model import learning
from metrics import rmse, mae, ndcg

max_users = 1000
iterations = 5

print("Data loading...")
load_data(max_users)
print("Finished.")

# Cross validation
for i in range(0, iterations):
    print("Iteration", i + 1, "/", iterations)
    print("Learning is in process...")
    learning_set, testing_set = get_song_sets()
    data = get_data()
    start_time = time.time()
    learning(data, learning_set, 100)
    finish_time = time.time()
    print("Learning finished. Time:", datetime.timedelta(seconds=finish_time - start_time))
    print("RMSE:", rmse(data, testing_set))
    print("MAE: ", mae(data, testing_set))
    print("NDCG:", ndcg(data, testing_set))
    print("=====")
def evaluate_ndcg(args):
    item_weight = args.get("item_weight", 0)
    word_weight = args.get("word_weight", 0)
    albu_weight = args.get("albu_weight", 0)
    arti_weight = args.get("arti_weight", 0)
    song_to_album_ratio = args.get("song_to_album_ratio", 500)

    # Create recommendations for the challenge playlists
    user_item_recs = (item_weight * user_ibcf_recs + word_weight * user_word_recs +
                      albu_weight * user_albu_recs + arti_weight * user_arti_recs)

    start = time.time()
    sum_ndcg = 0
    for idx, playlist in enumerate(playlists):
        recommended_songs_scores = [
            (idx_to_track_all[k], score)
            for k, score in enumerate(user_item_recs.getrow(idx).toarray()[0])
            if score > 0
        ]
        existing_songs = {
            track['track_uri']: 1
            for track in playlists[idx]["tracks"]
        }
        existing_albums = set([
            track_to_data[track]["album_uri"]
            for track in existing_songs.keys()
        ])
        existing_artists = defaultdict(int)

        if "random" not in sample_group and len(existing_albums) > 0 and \
                len(existing_songs) / len(existing_albums) > song_to_album_ratio:
            potential_recs = [
                track for track, vals in track_to_data.items()
                if vals["album_uri"] == track_to_data[playlist["tracks"][-1]["track_uri"]]["album_uri"]
            ]
        else:
            potential_recs = []

        potential_recs += [
            song for song, score in sorted(recommended_songs_scores, key=itemgetter(1), reverse=True)
        ]
        potential_recs += overall_top_songs

        recs = []
        for song in potential_recs:
            if len(recs) == 500:
                break
            if (song not in existing_songs and song not in recs
                    and existing_artists[track_to_data[song]["artist_uri"]] < args['max_songs_by_same_artist']):
                recs.append(song)
                existing_artists[track_to_data[song]["artist_uri"]] += 1

        holdout_tracks = [
            track['track_uri']
            for track in val_holdout_tracks[playlist["pid"]]["tracks"]
        ]
        sum_ndcg += metrics.ndcg(holdout_tracks, recs, 500)

    return {
        'loss': -sum_ndcg / len(playlists),
        'status': STATUS_OK,
        'model': None
    }
def _run_evaluation_on_selected_users(self, recommender_object, usersToEvaluate, block_size=1000):

    start_time = time.time()
    start_time_print = time.time()

    results_dict = {}

    for cutoff in self.cutoff_list:
        results_dict[cutoff] = create_empty_metrics_dict(
            self.n_items, self.n_users, recommender_object.get_URM_train(),
            self.ignore_items_ID, self.ignore_users_ID, cutoff,
            self.diversity_object)

    n_users_evaluated = 0

    user_batch_start = 0
    user_batch_end = 0

    while user_batch_start < len(self.usersToEvaluate):

        user_batch_end = user_batch_start + block_size
        user_batch_end = min(user_batch_end, len(usersToEvaluate))

        test_user_batch_array = np.array(usersToEvaluate[user_batch_start:user_batch_end])
        user_batch_start = user_batch_end

        # Compute predictions for a batch of users using vectorization,
        # much more efficient than computing them one at a time
        recommended_items_batch_list = recommender_object.recommend(
            test_user_batch_array,
            remove_seen_flag=self.exclude_seen,
            cutoff=self.max_cutoff,
            remove_top_pop_flag=False,
            remove_CustomItems_flag=self.ignore_items_flag)

        # Compute recommendation quality for each user in the batch
        for batch_user_index in range(len(recommended_items_batch_list)):

            user_id = test_user_batch_array[batch_user_index]
            recommended_items = recommended_items_batch_list[batch_user_index]

            # Being the URM CSR, the indices are the non-zero column indexes
            relevant_items = self.get_user_relevant_items(user_id)
            is_relevant = np.in1d(recommended_items, relevant_items, assume_unique=True)

            n_users_evaluated += 1

            for cutoff in self.cutoff_list:

                results_current_cutoff = results_dict[cutoff]

                is_relevant_current_cutoff = is_relevant[0:cutoff]
                recommended_items_current_cutoff = recommended_items[0:cutoff]

                results_current_cutoff[EvaluatorMetrics.ROC_AUC.value] += roc_auc(is_relevant_current_cutoff)
                results_current_cutoff[EvaluatorMetrics.PRECISION.value] += precision(is_relevant_current_cutoff, len(relevant_items))
                results_current_cutoff[EvaluatorMetrics.RECALL.value] += recall(is_relevant_current_cutoff, relevant_items)
                results_current_cutoff[EvaluatorMetrics.RECALL_TEST_LEN.value] += recall_min_test_len(is_relevant_current_cutoff, relevant_items)
                results_current_cutoff[EvaluatorMetrics.MAP.value] += map(is_relevant_current_cutoff, relevant_items)
                results_current_cutoff[EvaluatorMetrics.MRR.value] += rr(is_relevant_current_cutoff)
                results_current_cutoff[EvaluatorMetrics.NDCG.value] += ndcg(
                    recommended_items_current_cutoff, relevant_items,
                    relevance=self.get_user_test_ratings(user_id), at=cutoff)
                results_current_cutoff[EvaluatorMetrics.HIT_RATE.value] += is_relevant_current_cutoff.sum()
                results_current_cutoff[EvaluatorMetrics.ARHR.value] += arhr(is_relevant_current_cutoff)

                results_current_cutoff[EvaluatorMetrics.NOVELTY.value].add_recommendations(recommended_items_current_cutoff)
                results_current_cutoff[EvaluatorMetrics.DIVERSITY_GINI.value].add_recommendations(recommended_items_current_cutoff)
                results_current_cutoff[EvaluatorMetrics.SHANNON_ENTROPY.value].add_recommendations(recommended_items_current_cutoff)
                results_current_cutoff[EvaluatorMetrics.COVERAGE_ITEM.value].add_recommendations(recommended_items_current_cutoff)
                results_current_cutoff[EvaluatorMetrics.COVERAGE_USER.value].add_recommendations(recommended_items_current_cutoff, user_id)
                results_current_cutoff[EvaluatorMetrics.DIVERSITY_MEAN_INTER_LIST.value].add_recommendations(recommended_items_current_cutoff)
                results_current_cutoff[EvaluatorMetrics.DIVERSITY_HERFINDAHL.value].add_recommendations(recommended_items_current_cutoff)

                if EvaluatorMetrics.DIVERSITY_SIMILARITY.value in results_current_cutoff:
                    results_current_cutoff[EvaluatorMetrics.DIVERSITY_SIMILARITY.value].add_recommendations(recommended_items_current_cutoff)

            if time.time() - start_time_print > 30 or n_users_evaluated == len(self.usersToEvaluate):
                print("SequentialEvaluator: Processed {} ( {:.2f}% ) in {:.2f} seconds. Users per second: {:.0f}".format(
                    n_users_evaluated,
                    100.0 * float(n_users_evaluated) / len(self.usersToEvaluate),
                    time.time() - start_time,
                    float(n_users_evaluated) / (time.time() - start_time)))

                sys.stdout.flush()
                sys.stderr.flush()

                start_time_print = time.time()

    return results_dict, n_users_evaluated
def train(train_data, test_data, n_user, n_item):
    with tf.Session() as sess:
        iterator = tf.data.Iterator.from_structure(train_data.output_types,
                                                   train_data.output_shapes)
        model = NCF.NCF(FLAGS.embedding_size, n_user, n_item, FLAGS.lr,
                        FLAGS.optim, FLAGS.initializer, FLAGS.loss_func,
                        FLAGS.activation, FLAGS.regularizer, iterator,
                        FLAGS.topK, FLAGS.dropout, is_training=True)
        model.build()

        # Restore saved parameters if a checkpoint exists, otherwise train from scratch
        ckpt = tf.train.get_checkpoint_state(FLAGS.model_dir)
        if ckpt and ckpt.model_checkpoint_path:
            print("Reading model parameters from %s" % ckpt.model_checkpoint_path)
            # Load the model parameters
            model.saver.restore(sess, ckpt.model_checkpoint_path)
        else:
            print("Creating model with fresh parameters.")
            sess.run(tf.global_variables_initializer())

        count = 0
        # Train for the configured number of epochs on the training set
        for epoch in range(FLAGS.epochs):
            # Initialize the training-set iterator
            sess.run(model.iterator.make_initializer(train_data))
            model.is_training = True
            model.get_data()
            start_time = time.time()

            try:
                while True:
                    # Step until the iterator is exhausted, i.e. one full pass over the training data
                    model.step(sess, count)
                    count += 1
            except tf.errors.OutOfRangeError:
                # Print the time taken by one training epoch
                print("Epoch %d training " % epoch + "Took: " +
                      time.strftime("%H: %M: %S", time.gmtime(time.time() - start_time)))

            # Initialize the test-set iterator
            sess.run(model.iterator.make_initializer(test_data))
            model.is_training = False
            model.get_data()
            start_time = time.time()
            HR, MRR, NDCG = [], [], []
            pred_item, gt_item = model.step(sess, None)
            try:
                while True:
                    # Step until the iterator is exhausted, i.e. one full pass over the test data
                    pred_item, gt_item = model.step(sess, None)
                    # Within a test batch the ground-truth item is the same for every row, so take the first
                    gt_item = int(gt_item[0])
                    HR.append(metrics.hit(gt_item, pred_item))
                    MRR.append(metrics.mrr(gt_item, pred_item))
                    NDCG.append(metrics.ndcg(gt_item, pred_item))
            except tf.errors.OutOfRangeError:
                # Average the evaluation metrics
                hr = np.array(HR).mean()
                mrr = np.array(MRR).mean()
                ndcg = np.array(NDCG).mean()
                print("Epoch %d testing " % epoch + "Took: " +
                      time.strftime("%H: %M: %S", time.gmtime(time.time() - start_time)))
                print("HR is %.3f, MRR is %.3f, NDCG is %.3f" % (hr, mrr, ndcg))

        # Save the model parameters
        checkpoint_path = os.path.join(FLAGS.model_dir, "NCF.ckpt")
        model.saver.save(sess, checkpoint_path)
def evaluateRecommender(self, recommender_object):
    """
    :param recommender_object: the trained recommender object, a Recommender subclass
    :param URM_test_list: list of URMs to test the recommender against, or a single URM object
    :param cutoff_list: list of cutoffs to be used to report the scores, or a single cutoff
    """

    results_dict = {}

    for cutoff in self.cutoff_list:
        results_dict[cutoff] = create_empty_metrics_dict(
            self.n_items, self.n_users, recommender_object.URM_train,
            self.ignore_items_ID, self.ignore_users_ID, cutoff,
            self.diversity_object)

    start_time = time.time()
    start_time_print = time.time()

    n_eval = 0

    self.__all_items = np.arange(0, self.n_items, dtype=np.int)
    self.__all_items = set(self.__all_items)

    if self.ignore_items_flag:
        recommender_object.set_items_to_ignore(self.ignore_items_ID)

    for test_user in self.usersToEvaluate:

        # Being the URM CSR, the indices are the non-zero column indexes
        relevant_items = self.get_user_relevant_items(test_user)

        n_eval += 1

        self.user_specific_remove_items(recommender_object, test_user)

        # recommended_items = recommender_object.recommend(np.array(test_user), remove_seen_flag=self.exclude_seen,
        #                                                  cutoff=self.max_cutoff, remove_top_pop_flag=False,
        #                                                  remove_CustomItems_flag=self.ignore_items_flag)
        recommended_items = recommender_object.recommend(
            np.atleast_1d(test_user),
            remove_seen_flag=self.exclude_seen,
            cutoff=self.max_cutoff,
            remove_top_pop_flag=False,
            remove_CustomItems_flag=self.ignore_items_flag)

        recommended_items = np.array(recommended_items[0])

        recommender_object.reset_items_to_ignore()

        is_relevant = np.in1d(recommended_items, relevant_items, assume_unique=True)

        for cutoff in self.cutoff_list:

            results_current_cutoff = results_dict[cutoff]

            is_relevant_current_cutoff = is_relevant[0:cutoff]
            recommended_items_current_cutoff = recommended_items[0:cutoff]

            results_current_cutoff[EvaluatorMetrics.ROC_AUC.value] += roc_auc(is_relevant_current_cutoff)
            results_current_cutoff[EvaluatorMetrics.PRECISION.value] += precision(is_relevant_current_cutoff, len(relevant_items))
            results_current_cutoff[EvaluatorMetrics.RECALL.value] += recall(is_relevant_current_cutoff, relevant_items)
            results_current_cutoff[EvaluatorMetrics.RECALL_TEST_LEN.value] += recall_min_test_len(is_relevant_current_cutoff, relevant_items)
            results_current_cutoff[EvaluatorMetrics.MAP.value] += map(is_relevant_current_cutoff, relevant_items)
            results_current_cutoff[EvaluatorMetrics.MRR.value] += rr(is_relevant_current_cutoff)
            results_current_cutoff[EvaluatorMetrics.NDCG.value] += ndcg(
                recommended_items_current_cutoff, relevant_items,
                relevance=self.get_user_test_ratings(test_user), at=cutoff)
            results_current_cutoff[EvaluatorMetrics.HIT_RATE.value] += is_relevant_current_cutoff.sum()
            results_current_cutoff[EvaluatorMetrics.ARHR.value] += arhr(is_relevant_current_cutoff)

            results_current_cutoff[EvaluatorMetrics.NOVELTY.value].add_recommendations(recommended_items_current_cutoff)
            results_current_cutoff[EvaluatorMetrics.DIVERSITY_GINI.value].add_recommendations(recommended_items_current_cutoff)
            results_current_cutoff[EvaluatorMetrics.SHANNON_ENTROPY.value].add_recommendations(recommended_items_current_cutoff)
            results_current_cutoff[EvaluatorMetrics.COVERAGE_ITEM.value].add_recommendations(recommended_items_current_cutoff)
            results_current_cutoff[EvaluatorMetrics.COVERAGE_USER.value].add_recommendations(recommended_items_current_cutoff, test_user)
            results_current_cutoff[EvaluatorMetrics.DIVERSITY_MEAN_INTER_LIST.value].add_recommendations(recommended_items_current_cutoff)
            results_current_cutoff[EvaluatorMetrics.DIVERSITY_HERFINDAHL.value].add_recommendations(recommended_items_current_cutoff)

            if EvaluatorMetrics.DIVERSITY_SIMILARITY.value in results_current_cutoff:
                results_current_cutoff[EvaluatorMetrics.DIVERSITY_SIMILARITY.value].add_recommendations(recommended_items_current_cutoff)

        if time.time() - start_time_print > 30 or n_eval == len(self.usersToEvaluate):
            print("SequentialEvaluator: Processed {} ( {:.2f}% ) in {:.2f} seconds. Users per second: {:.0f}".format(
                n_eval,
                100.0 * float(n_eval) / len(self.usersToEvaluate),
                time.time() - start_time,
                float(n_eval) / (time.time() - start_time)))

            sys.stdout.flush()
            sys.stderr.flush()

            start_time_print = time.time()

    if n_eval > 0:
        for cutoff in self.cutoff_list:

            results_current_cutoff = results_dict[cutoff]

            for key in results_current_cutoff.keys():
                value = results_current_cutoff[key]
                if isinstance(value, Metrics_Object):
                    results_current_cutoff[key] = value.get_metric_value()
                else:
                    results_current_cutoff[key] = value / n_eval

            precision_ = results_current_cutoff[EvaluatorMetrics.PRECISION.value]
            recall_ = results_current_cutoff[EvaluatorMetrics.RECALL.value]

            if precision_ + recall_ != 0:
                results_current_cutoff[EvaluatorMetrics.F1.value] = 2 * (precision_ * recall_) / (precision_ + recall_)

    else:
        print("WARNING: No users had a sufficient number of relevant items")

    if self.ignore_items_flag:
        recommender_object.reset_items_to_ignore()

    results_run_string = self.get_result_string(results_dict)

    return (results_dict, results_run_string)
def train(train_data, test_data, user_size, item_size):
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True

    with tf.Session(config=config) as sess:
        ############################### CREATE MODEL #############################
        iterator = tf.data.Iterator.from_structure(train_data.output_types,
                                                   train_data.output_shapes)
        model = NCF.NCF(FLAGS.embedding_size, user_size, item_size, FLAGS.lr,
                        FLAGS.optim, FLAGS.initializer, FLAGS.loss_func,
                        FLAGS.activation, FLAGS.regularizer, iterator,
                        FLAGS.topK, FLAGS.dropout, is_training=True)
        model.build()
        # train_init_op = iterator.make_initializer(train_data)

        ckpt = tf.train.get_checkpoint_state(FLAGS.model_dir)
        if ckpt:
            print("Reading model parameters from %s" % ckpt.model_checkpoint_path)
            model.saver.restore(sess, ckpt.model_checkpoint_path)
        else:
            print("Creating model with fresh parameters.")
            sess.run(tf.global_variables_initializer())

        ############################### Training ####################################
        count = 0
        for epoch in range(FLAGS.epochs):
            sess.run(model.iterator.make_initializer(train_data))
            model.is_training = True
            start_time = time.time()

            try:
                while True:
                    model.step(sess, count)
                    count += 1
            except tf.errors.OutOfRangeError:
                print("Epoch %d training " % epoch + "Took: " +
                      time.strftime("%H: %M: %S", time.gmtime(time.time() - start_time)))

            ################################ EVALUATION ##################################
            sess.run(model.iterator.make_initializer(test_data))
            model.is_training = False
            start_time = time.time()
            HR, MRR, NDCG = [], [], []

            try:
                while True:
                    prediction, label = model.step(sess, None)
                    label = int(label[0])
                    HR.append(metrics.hit(label, prediction))
                    MRR.append(metrics.mrr(label, prediction))
                    NDCG.append(metrics.ndcg(label, prediction))
            except tf.errors.OutOfRangeError:
                hr = np.array(HR).mean()
                mrr = np.array(MRR).mean()
                ndcg = np.array(NDCG).mean()
                print("Epoch %d testing " % epoch + "Took: " +
                      time.strftime("%H: %M: %S", time.gmtime(time.time() - start_time)))
                print("HR is %.3f, MRR is %.3f, NDCG is %.3f" % (hr, mrr, ndcg))

        ################################## SAVE MODEL ################################
        checkpoint_path = os.path.join(FLAGS.model_dir, "NCF.ckpt")
        model.saver.save(sess, checkpoint_path)
def compute_score(predictions):
    from metrics import ndcg
    return ndcg(predictions.relevance_grade, 38)
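# compute_score above passes graded relevance labels (predictions.relevance_grade, assumed to be in
# the order the model ranked the items) and a fixed cutoff of 38 to metrics.ndcg. Below is a minimal
# sketch of a graded-relevance NDCG using the common 2^grade - 1 gain; it is named ndcg_graded to
# make clear it is an illustrative assumption, not the metrics module's implementation.
import numpy as np

def ndcg_graded(relevance_grades, k):
    """Sketch: NDCG@k for graded relevance grades listed in ranked order."""
    rel = np.asarray(relevance_grades, dtype=float)[:k]
    if rel.size == 0:
        return 0.0
    gains = 2.0 ** rel - 1.0
    discounts = 1.0 / np.log2(np.arange(2, rel.size + 2))
    dcg = float((gains * discounts).sum())
    ideal_gains = np.sort(gains)[::-1]                       # ideal ordering: highest grades first
    idcg = float((ideal_gains * discounts).sum())
    return dcg / idcg if idcg > 0 else 0.0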