def generate_reward(gold_index_list, answer_index_list):
    """Score an answer ranking against gold relevances.

    Returns a list: [reward, AP, MRR, NDCG@10, DCG@5,
    P@1, P@3, P@5, P@10, NDCG@1, NDCG@3, NDCG@5, NDCG@10].
    """
    ap = 0
    reciprocal_rank = 0
    answers = list(answer_index_list)
    size = len(answer_index_list)
    num_gold = sum(gold_index_list > 0)
    # Relevance vector in answer order: gold grade of each returned item.
    relevances = np.zeros(size)
    for gold_rank, grade in enumerate(gold_index_list):
        if grade and gold_rank in answers:
            relevances[answers.index(gold_rank)] = grade
    retrieved_gold = sum(relevances > 0)
    if num_gold:
        # Scale AP by the fraction of gold items actually retrieved.
        ap = average_precision(relevances) * (retrieved_gold / num_gold)
        reciprocal_rank = mean_reciprocal_rank([relevances])
    ndcg = ndcg_at_k(relevances, min(10, size))
    dcg_five = dcg_at_k(relevances, 5)
    reward = (ap + reciprocal_rank + ndcg + dcg_five) / 4
    cutoffs = [1, 3, 5, 10]
    metrics = [reward, ap, reciprocal_rank, ndcg, dcg_five]
    metrics.extend(
        precision_at_k(relevances, min(c, len(relevances))) for c in cutoffs)
    metrics.extend(
        ndcg_at_k(relevances, min(c, len(relevances))) for c in cutoffs)
    return metrics
def evalResults(results, trueRelevance, noveltyList, trainModelIDs, rev_dict,
                uid, alg, params, rec, outFile, diversity, novelty):
    """Evaluate one user's recommendations, append a CSV row to `outFile`.

    :param results: predicted scores (numpy array), rescaled to [0, 1] here.
    :param trueRelevance: ground-truth relevances aligned with `results`.
    :param noveltyList: per-item novelty values aligned with `results`.
    :param diversity: "yes" enables MMR re-ranking of the top of the list.
    :return: tuple of rating- and ranking-quality metrics.
    """
    params = [str(i) for i in params]
    # Rating-prediction quality: rescale scores before comparing.
    mmScaler = MinMaxScaler(copy=True)
    results = mmScaler.fit_transform(results.reshape(-1, 1))
    results = results.reshape((-1, ))
    r2Sc = r2_score(trueRelevance, results)
    mae = mean_absolute_error(trueRelevance, results)
    # Ranking: order items by descending predicted score.
    idx = (-results).argsort()
    if diversity == "yes":
        # MMR re-rank (lambda=0.8, depth 10); keep remaining items in
        # original score order after the re-ranked head.
        reranked = mmr_sorted(range(len(results)), 0.8, results, rev_dict, 10)
        idx1 = [k for k, v in reranked.items()]
        idx2 = [i for i in idx if i not in idx1]
        idx1.extend(idx2)
        idx = idx1
    rankedRelevance = trueRelevance[idx]
    rankedNovelty = noveltyList[idx]
    # Renamed from `map` to stop shadowing the builtin.
    map_score = rank_metrics.average_precision(rankedRelevance)
    aucSc = roc_auc_score(trueRelevance, results)
    nDCG10 = rank_metrics.ndcg_at_k(rankedRelevance, 10)
    nDCG100 = rank_metrics.ndcg_at_k(rankedRelevance, 100)
    nDCG = rank_metrics.ndcg_at_k(rankedRelevance, len(rankedRelevance))
    p5 = prec_at_n(rankedRelevance, 5)
    r5 = rec_at_n(rankedRelevance, 5)
    n5 = meanNovelty_at_n(rankedNovelty, 5)
    un5 = user_novelty_at_n(idx, trainModelIDs, 5)
    ild5 = ild_at_n(idx, rev_dict, 5)
    p10 = prec_at_n(rankedRelevance, 10)
    r10 = rec_at_n(rankedRelevance, 10)
    n10 = meanNovelty_at_n(rankedNovelty, 10)
    ild10 = ild_at_n(idx, rev_dict, 10)
    un10 = user_novelty_at_n(idx, trainModelIDs, 10)
    mrr = rank_metrics.mean_reciprocal_rank([rankedRelevance])
    txt = "%s;%s;%s;%s;%s;%s;%.6f;%.6f;%.6f;%.6f;%.6f;%.6f;%.6f;%.6f;%.6f;%.6f;%.6f;%.6f;%.6f;%.6f;%.6f;%.6f;%.6f;%.6f\n" % (
        uid, alg, ",".join(params), rec, diversity, novelty, r2Sc, mae,
        map_score, aucSc, mrr, p5, p10, r5, r10, nDCG10, nDCG100, nDCG, n5,
        n10, un5, un10, ild5, ild10)
    outFile.write(txt)
    # Note: un5/un10 are written to the file but not returned (as before).
    return (r2Sc, mae, map_score, aucSc, mrr, p5, p10, r5, r10, nDCG10,
            nDCG100, nDCG, n5, n10, ild5, ild10)
def test_ndcg(self):
    """NDCG@k is bounded by 1.0 and matches a known value for method=0."""
    graded = [3, 2, 3, 0, 1, 2]
    best_ranked = [5, 5, 5, 4, 4, 3]  # already in ideal order
    self.assertLessEqual(ndcg_at_k(graded, 1), 1.0)
    self.assertAlmostEqual(ndcg_at_k(graded, 6, method=0), 0.961, places=3)
    self.assertLessEqual(ndcg_at_k(best_ranked, 4), 1.0)
def per_user_rankings(test_data, test_label, scores):
    """Per-user NDCG, AUPR and AUC.

    NDCG is computed only for users with at least one positive label,
    so the three result lists may have different lengths.
    """
    user_col = test_data[:, 0]
    ndcg_list = []
    aupr_list = []
    auc_list = []
    for user in np.unique(user_col):
        mask = np.in1d(user_col, [user])
        labels_u = test_label[mask].astype(float)
        scores_u = scores[mask].astype(float)
        # NDCG only for users with some positive examples.
        if any(lab > 0.001 for lab in labels_u):
            order = scores_u.argsort()[::-1]
            ordered_labels = labels_u[order]
            ndcg_list.append(
                rank.ndcg_at_k(ordered_labels, ordered_labels.shape[0], 1))
        prec, rec, _ = precision_recall_curve(labels_u, scores_u)
        aupr_list.append(auc(rec, prec))
        fpr, tpr, _ = roc_curve(labels_u, scores_u)
        auc_list.append(auc(fpr, tpr))
    return np.array([ndcg_list, aupr_list, auc_list])
def evals(self, x_test, y_test, top_n):
    """Evaluate top-N recommendations; prints NDCG, precision and recall.

    :param x_test: array of (user, item) rows.
    :param y_test: ratings aligned with x_test rows.
    :param top_n: recommendation list length.
    """
    # Build user -> {item: rating} lookup from the test set.
    # Fix: the original reused the loop index `i` as the item id inside
    # the loop body, which only worked because `range` rebinds it.
    user_item = {}
    for row in range(x_test.shape[0]):
        user, item, rating = x_test[row][0], x_test[row][1], y_test[row]
        user_item.setdefault(user, {})
        user_item[user][item] = rating
    recommend_dict = self.rec_top(top_n)
    # NDCG over each user's recommended list (unrated items count as 0).
    ndcg = []
    for u in recommend_dict:
        rels = [user_item[u].get(i, 0) for i in recommend_dict[u]]
        ndcg.append(rank_metrics.ndcg_at_k(rels, top_n))
    print('ndcg:%f' % np.mean(ndcg))
    p, r = 0., 0.
    for u in recommend_dict:
        cm_users = set(user_item[u]) & set(recommend_dict[u])
        p += len(cm_users) / top_n
        r += len(cm_users) / len(user_item[u])
    precision = p / len(recommend_dict)
    recall = r / len(recommend_dict)
    print("precision=%f\nrecall=%f" % (precision, recall))
def compute_metrics(model, criterion, loader, k=5):
    """Average loss, P@1, P@k and NDCG@k of `model` over `loader`.

    Averages are per dataset example (divided by len(loader.dataset)).
    """
    global GPU_AVAILABLE
    loss = 0.
    p_at_1 = 0
    p_at_k = 0
    ndcg = 0
    for X, Y in loader:
        X = Variable(X)
        Y = [Variable(y) for y in Y]
        if GPU_AVAILABLE:
            X = X.cuda()
            Y = [y.cuda() for y in Y]
        # NOTE(review): X is prepared and moved to the GPU but never passed
        # to the model — only Y is. Confirm `model(Y)` is intended.
        outputs = model(Y)
        loss += criterion(outputs).data.item()
        if GPU_AVAILABLE:
            outputs = [out.cpu() for out in outputs]
        outputs = [out.data.numpy().squeeze() for out in outputs]
        # Rank positions of each output, highest score first.
        idxs = [np.argsort(out)[::-1] for out in outputs]
        # Assumes relevant items sit at the first positions (index < k)
        # of each output — TODO confirm against the data layout.
        p_at_1 += sum([np.mean(idx[:1] < 1) for idx in idxs])
        p_at_k += sum([np.mean(idx[:k] < k) for idx in idxs])
        ndcg += sum(
            [ndcg_at_k(out.tolist(), k=k, method=0) for out in outputs])
    N = len(loader.dataset)
    return loss / N, p_at_1 / N, p_at_k / N, ndcg / N
def compute_metrics(ranked_judgements, pr_atk, threshold_grade):
    """
    Given the ranked judgements compute the metrics for a query.
    :param ranked_judgements: list(int); graded or binary relevances in rank order.
    :param pr_atk: int; the @K value to use for computing precision and recall.
    :param threshold_grade: int; Assuming 0-3 graded relevances, threshold at
        some point and convert graded to binary relevance.
    :return: dict mapping metric name -> value.
    """
    graded = ranked_judgements
    # Binarize the graded judgements at the threshold.
    binary = [int(rel >= threshold_grade) for rel in graded]
    num_candidates = len(binary)
    # NDCG over the full candidate set, the top 20%, and the top 20.
    ndcg = rm.ndcg_at_k(graded, num_candidates)
    ndcg_pr = rm.ndcg_at_k(graded, int(0.20 * num_candidates))
    ndcg_20 = rm.ndcg_at_k(graded, 20)
    max_total_relevant = sum(binary)
    recall = recall_at_k(ranked_rel=binary,
                         atk=pr_atk,
                         max_total_relevant=max_total_relevant)
    precision = rm.precision_at_k(r=binary, k=pr_atk)
    r_precision = rm.r_precision(r=binary)
    if (precision + recall) > 0:
        f1 = 2 * precision * recall / (precision + recall)
    else:
        f1 = 0.0
    av_precision = rm.average_precision(r=binary)
    reciprocal_rank = rm.mean_reciprocal_rank(rs=[binary])
    return {
        'recall': float(recall),
        'precision': float(precision),
        'f1': float(f1),
        'r_precision': float(r_precision),
        'av_precision': float(av_precision),
        'reciprocal_rank': float(reciprocal_rank),
        'ndcg': ndcg,
        'ndcg@20': ndcg_20,
        'ndcg%20': ndcg_pr,
    }
def compute_metrics(model, loader, k=5, mode='bilinear'):
    """P@1, P@k and NDCG@k over `loader` under one of several scoring modes.

    Modes: "bilinear" (full model), "project_x"/"project_y" (dot product in
    one projected space), "random" (baseline).
    """
    global GPU_AVAILABLE
    p_at_1 = 0
    p_at_k = 0
    ndcg = 0
    for X, Y in loader:
        X = Variable(X)
        Y = [Variable(y) for y in Y]
        if GPU_AVAILABLE:
            X = X.cuda()
            Y = [y.cuda() for y in Y]
        if mode == "bilinear":
            scores = model(X, Y)
            if GPU_AVAILABLE:
                scores = [out.cpu() for out in scores]
            scores = [out.data.numpy().squeeze() for out in scores]
        elif mode == "project_x":
            # Project X, normalize, and score against raw Y by dot product.
            X_proj = normalize_rows(model.project_x(X).data.numpy())
            Y = [y.data.numpy() for y in Y]
            scores = [
                x.reshape(1, -1).dot(np.atleast_2d(y).T).squeeze()
                for x, y in zip(X_proj, Y)
            ]
        elif mode == "project_y":
            # Project each Y, normalize, and score against raw X.
            Y_proj = [
                normalize_rows(model.project_y(y).data.numpy()) for y in Y
            ]
            X = X.data.numpy()
            scores = [
                x.reshape(1, -1).dot(np.atleast_2d(y).T).squeeze()
                for x, y in zip(X, Y_proj)
            ]
        elif mode == "random":
            scores = [np.random.random(len(y)) for y in Y]
        else:
            raise ValueError("not a valid mode")
        rankings = [np.argsort(out)[::-1] for out in scores]
        p_at_1 += sum([np.mean(rk[:1] < 1) for rk in rankings])
        p_at_k += sum([np.mean(rk[:k] < k) for rk in rankings])
        ndcg += sum([ndcg_at_k(out.tolist(), k=k, method=0) for out in scores])
    N = len(loader.dataset)
    return p_at_1 / N, p_at_k / N, ndcg / N
def calc_NDCG(prediction, target, k=5):
    """Mean NDCG@k across queries.

    Relevance is the elementwise comparison prediction == target; each
    row's NDCG is summed and divided by the number of rows.
    All inputs need to be numpy.ndarray.
    """
    relevances = prediction == target
    per_query = np.zeros(target.shape)
    for row, rel in enumerate(relevances):
        per_query[row, 0] = ndcg_at_k(rel, k, 1)
    return per_query.sum() / len(per_query)
def calc_NDCG(prediction, target, k=5):
    """Mean NDCG@k over all queries (duplicate of the variant above).

    Relevance for each query is prediction == target elementwise.
    All inputs need to be numpy.ndarray.
    """
    rel_rows = prediction == target
    # Column 0 holds each row's NDCG; remaining entries stay zero.
    ndcg_scores = np.zeros(target.shape)
    for idx, relevance in enumerate(rel_rows):
        ndcg_scores[idx, 0] = ndcg_at_k(relevance, k, 1)
    return ndcg_scores.sum() / len(ndcg_scores)
def normalized_discounted_cummulative_gain(test_data, test_label, scores):
    """Mean per-user NDCG, over users with at least one positive label."""
    users = test_data[:, 0]
    per_user = []
    for u in np.unique(users):
        mask = np.in1d(users, [u])
        labels_u = test_label[mask].astype(float)
        scores_u = scores[mask].astype(float)
        # NDCG is calculated only for users with some positive examples.
        if all(lab <= 0.001 for lab in labels_u):
            continue
        order = scores_u.argsort()[::-1]
        ordered_labels = labels_u[order]
        per_user.append(rank.ndcg_at_k(ordered_labels,
                                       ordered_labels.shape[0], 1))
    return np.mean(per_user)
def generate_reward(gold_index_list, answer_index_list, reward_type):
    """Compute a configurable reward plus (AP, MRR, NDCG@10, DCG@5).

    `reward_type` is a 1-based index into the module-level `rewards`
    table of reward functions.
    """
    ap = 0.
    reciprocal_rank = 0
    answers = list(answer_index_list)
    size = len(answer_index_list)
    num_gold = sum(gold_index_list > 0)
    # Relevance vector in answer order: gold grade of each returned item.
    relevances = np.zeros(size)
    for gold_rank, grade in enumerate(gold_index_list):
        if grade and gold_rank in answers:
            relevances[answers.index(gold_rank)] = grade
    retrieved_gold = sum(relevances > 0)
    if num_gold:
        # Scale AP by the fraction of gold items actually retrieved.
        ap = average_precision(relevances) * (retrieved_gold / num_gold)
        reciprocal_rank = mean_reciprocal_rank([relevances])
    ndcg = ndcg_at_k(relevances, min(10, size))
    dcg_five = dcg_at_k(relevances, 5)
    reward = rewards[reward_type - 1](relevances, ap, reciprocal_rank, ndcg,
                                      dcg_five)
    return reward, ap, reciprocal_rank, ndcg, dcg_five
def evaluate(model, data, logdir, epoch, out_f, gpu):
    """Embed the test split and return the retrieval precision score.

    Python 2 code (uses print statements below).
    """
    # First call embeds the non-test split; its result is discarded.
    get_embedding(model, data, logdir, gpu, test=False)
    img_embeddings, img_fns, gel_embeddings, gel_fns = get_embedding(model, data, logdir, gpu, test=True)
    precision = get_score(img_embeddings, img_fns, gel_embeddings, gel_fns)
    return precision
    # NOTE(review): everything below this return is unreachable dead code —
    # an older manual NDCG/precision computation kept in place. Consider
    # deleting it or removing the early return if it should run.
    nb_img = len(img_embeddings)
    nb_gel = len(gel_embeddings)
    distance_matrix = np.zeros((nb_gel, nb_img))
    img_embeddings = np.array(img_embeddings)
    gel_embeddings = np.array(gel_embeddings)
    dim_embedding = img_embeddings.shape[-1]
    img_embeddings = img_embeddings.reshape((nb_img, dim_embedding))
    gel_embeddings = gel_embeddings.reshape((nb_gel, dim_embedding))
    scores = []
    for i in range(nb_gel):
        # Squared-L2 distance from this gel embedding to every image.
        distance_matrix[i, :] = np.mean(np.square(img_embeddings - gel_embeddings[i, :]), axis=1).T
        r = []
        for j in range(nb_img):
            # Relevance 1 iff the image belongs to the same gel id.
            if (get_gel_id(img_fns[j]) == get_gel_id(gel_fns[i])):
                r.append(1)
            else:
                r.append(0)
        d = distance_matrix[i, :].tolist()
        # Sort relevances by ascending distance (closest first).
        a = zip(d, r)
        a = sorted(a, key=lambda d: d[0])
        r = [x[1] for x in a]
        ndcg = [rank_metrics.ndcg_at_k(r, k) for k in [10, 20, 30]]
        precision = [rank_metrics.precision_at_k(r, k) for k in [10, 20, 30]]
        scores.append(ndcg + precision)
    scores = np.array(scores)
    scores = np.mean(scores, axis=0)
    print "ndcg & precision", scores
    print >> out_f, "ndcg & precision", scores
def evaluate_retrieval(query_dct, corpus_dct, inverted_index, method_type):
    '''
    Given a query dictionary and a corpus dictionary, go through each query
    and determine the NDCG for its retrieval with the disease labels as
    relevance measures.
    '''
    metric_dct = {}
    for query_key in query_dct:
        q_disease_list, q_symptom_list, q_herb_list = query_dct[query_key]
        doc_score_dct = {}
        for doc_key in corpus_dct:
            d_disease_list, d_symptom_list, d_herb_list = corpus_dct[doc_key]
            # Without query expansion the document is just its symptom list.
            document = d_symptom_list[:]
            if 'mixed' in method_type or 'synonym' in method_type:
                document += d_herb_list
            # BM25 score between the (possibly expanded) query and document.
            score = okapi_bm25(q_symptom_list, document, inverted_index,
                               len(corpus_dct))
            # Relevance judgement from the disease labels.
            relevance = get_rel_score(q_disease_list, d_disease_list)
            doc_score_dct[(doc_key, relevance)] = score
        ranked = sorted(doc_score_dct.items(),
                        key=operator.itemgetter(1),
                        reverse=True)
        # Relevance values in retrieval order.
        rel_list = [key[1] for key, _ in ranked]
        # Accumulate the chosen rank metric at each cutoff.
        for k in k_list:
            bucket = metric_dct.setdefault(k, [])
            if rank_metric == 'ndcg':
                bucket.append(ndcg_at_k(rel_list, k))
            elif rank_metric == 'precision':
                bucket.append(precision_at_k(rel_list, k))
    return metric_dct
def summarize(self):
    """Give summary statistics about the tournament.

    Returns a one-row DataFrame with champion/rank-correlation/retrieval
    metrics computed from the simulated results.
    """
    res = self.run()
    # champ should be undefeated
    champ = list(np.where(res.strength == max(res.strength))[0])
    copeland = (res.wins[champ] == self.n_rounds)
    # top-k: rank players by true strength and by observed wins.
    ranks = pd.DataFrame(data=np.transpose([
        res.strength.rank(ascending=False),
        res.wins.rank(ascending=False), res.wins
    ]),
                         columns=["str_rank", "win_rank", "wins"])
    ranks['relevant'] = ranks['str_rank'] <= self.k
    borda = (ranks.win_rank[champ] == ranks.win_rank.min())
    top_k_df = ranks.loc[ranks['str_rank'] <= self.k]
    top_k = sum(top_k_df['wins'] >= self.n_rounds - 2) / self.k
    tau, k_p = scipy.stats.kendalltau(ranks.str_rank, ranks.win_rank)
    rho, sp_p = scipy.stats.spearmanr(ranks.str_rank, ranks.win_rank)
    # BUG FIX: sort_values returns a new frame; the result was previously
    # discarded, so rel_vec below was NOT in win-rank order.
    ranks = ranks.sort_values(by="win_rank")
    # using rank_metrics
    rel_vec = ranks.relevant.values
    prec = rank_metrics.r_precision(rel_vec)
    prec_at_k = rank_metrics.precision_at_k(rel_vec, self.k)
    avg_prec = rank_metrics.average_precision(rel_vec)
    dcg = rank_metrics.dcg_at_k(rel_vec, self.k)
    ndcg = rank_metrics.ndcg_at_k(rel_vec, self.k)
    df = pd.DataFrame(data=[
        list([
            int(copeland),
            int(borda),
            float(top_k), prec, prec_at_k, avg_prec, dcg, ndcg,
            float(tau),
            float(rho)
        ])
    ],
                      columns=[
                          'undef_champ', 'top_champ', 'top_k_found',
                          'precision', 'precision_at_k', 'avg_prec', 'dcg',
                          'ndcg', 'tau', 'rho'
                      ])
    return df
def evaluate(self, ratings: Dict[int, List[int]],
             negatives: Dict[int, List[int]], topN: int):
    """
    evaluate performance of models
    :param ratings: key: user, value: list of positive items
    :param negatives: key: user, value: list of negative items
    :param topN: int
    :return: dict with mean NDCG / MAP / recall and their per-user lists
    """
    ndcgs, apks, recalls = [], [], []
    for user in sorted(ratings.keys()):
        pos = ratings[user]
        neg = negatives[user]
        assert type(pos) == list and type(neg) == list
        # Score negatives and positives together; negatives come first.
        candidates = np.asarray(neg + pos)
        user_ids = np.full(len(candidates), user, dtype=np.int64)
        predictions = self.predict(user_ids, candidates)
        labels = np.array([0.0] * len(neg) + [1.0] * len(pos))
        # Indices of the topN highest-scoring items.
        top_indices = np.argsort(-predictions)[:topN]
        ranklist = labels[top_indices]
        ndcgs.append(rank_metrics.ndcg_at_k(ranklist, topN))
        _, recall = rank_metrics._compute_precision_recall(ranklist, topN)
        recalls.append(recall)
        apks.append(rank_metrics.average_precision(ranklist[:topN]))
    results = {
        "ndcg": np.nanmean(ndcgs),
        "ndcg_list": ndcgs,
        "map": np.nanmean(apks),
        "maps_list": apks,
        "recall": np.nanmean(recalls),
        "recalls_list": recalls,
    }
    return results
def evaluate_ndcg_at_k(testing_data, k=3):
    '''
    Average NDCG@k of the precomputed similarity rankings.
    :param: testing_data, dict mapping item name -> relevance mapping dict
    :param: k, top k results accounted for calculating NDCG
    :return: mean NDCG@k over all entries of testing_data
    '''
    ndcg_total = 0
    for iir_name, mapping_dict in testing_data.items():
        # Skip items with no precomputed similarity ranking.
        if iir_name not in similarity_matrix:
            continue
        # Rank candidates by similarity, highest first.
        weighted_similarities = sorted(similarity_matrix[iir_name],
                                       key=lambda item: item[1],
                                       reverse=True)
        # Relevance vector in rank order; unmapped candidates count as 0.
        r_array = [
            mapping_dict.get(entry[0], 0) for entry in weighted_similarities
        ]
        ndcg_total += ndcg_at_k(r_array, k)
    # NOTE(review): skipped items still count in the denominator, i.e. a
    # missing similarity row contributes NDCG 0 — confirm this is intended.
    # (Removed the unused `query_number` counter.)
    return float(ndcg_total) / len(testing_data)
def main(args):
    """Read per-fact scores, compute NDCG@k per English fact, report/return.

    Optionally writes per-fact NDCG to args.path_output and prints
    per-relation averages when args.flag_by_relation is set.
    """
    global verbose
    verbose = args.verbose
    scores = read_scores(args.path_input, col=args.col,
                         reverse=args.flag_reverse)
    rel2facts = {}
    ndcgs = {}
    for fact_en in scores.index.unique():
        ss = scores.loc[fact_en]
        try:
            rel = ss['rel'].values[0]
        except Exception:
            # Narrowed from a bare `except:` (which also swallowed
            # SystemExit/KeyboardInterrupt). A single-row selection has no
            # .values on the scalar field.
            logger.warning('Only one Japanese fact')
            continue
        rel2facts.setdefault(rel, []).append(fact_en)
        ndcgs[fact_en] = ndcg_at_k(ss['label'].values, args.k, method=0)
    if args.path_output:
        with open(args.path_output, 'w') as f:
            for k, v in sorted(ndcgs.items(), key=lambda t: t[0]):
                f.write('{}\t{}\n'.format(k, v))
    if args.flag_by_relation:
        # Per-relation mean NDCG and fact count.
        for rel, facts in sorted(rel2facts.items(), key=lambda t: t[0]):
            l = [ndcgs[fact_en] for fact_en in facts]
            print('{}\t{}\t{}'.format(rel, sum(l) / len(l), len(l)))
    ndcg = sum(ndcgs.values()) / len(ndcgs)
    if verbose:
        logger.info('nDCG@{}: {}'.format(args.k, ndcg))
    return ndcg
def rew6(inp, ap, reciprocal_rank, ndcg, dcg_five):
    """Average of AP, P@3, P@5, NDCG@3 and NDCG@5.

    The reciprocal_rank, ndcg and dcg_five arguments are unused; they are
    accepted to match the shared reward-function signature.
    """
    components = [
        ap,
        precision_at_k(inp, 3),
        precision_at_k(inp, 5),
        ndcg_at_k(inp, 3),
        ndcg_at_k(inp, 5),
    ]
    return sum(components) / 5
# In[36]:

import numpy as np
import rank_metrics
import sys

# Load per-query relevance vectors (one row per query). Pass the path
# directly so numpy opens and closes the file itself — the original passed
# an open() handle that was never closed.
relevanceVector = np.loadtxt(sys.argv[1] + "/rv/relevanceVector_" +
                             sys.argv[2],
                             delimiter=" ")

# Write summed (not averaged) precision/DCG/NDCG at each cutoff k, then
# MRR and MAP over all queries. `with` guarantees the file is closed.
with open(sys.argv[1] + '/em/evalMetrics_' + sys.argv[2], 'w') as f:
    for k in range(1, 16):
        total_precision_k = 0
        total_dcg_k = 0
        total_ndcg_k = 0
        for row in relevanceVector:
            total_precision_k += rank_metrics.precision_at_k(row, k)
            total_dcg_k += rank_metrics.dcg_at_k(row, k, 0)
            total_ndcg_k += rank_metrics.ndcg_at_k(row, k, 0)
        f.write("precision@" + str(k) + ": " + str(total_precision_k) + "\n")
        f.write("dcg@" + str(k) + ": " + str(total_dcg_k) + "\n")
        f.write("ndcg@" + str(k) + ": " + str(total_ndcg_k) + "\n")
    mrr = rank_metrics.mean_reciprocal_rank(relevanceVector)
    f.write("Mean Reciprocal Rank: " + str(mrr) + "\n")
    maP = rank_metrics.mean_average_precision(relevanceVector)
    f.write("Mean Average Precision: " + str(maP) + "\n")
def test_limitation_1(self):
    """Trailing zero relevances do not change NDCG under method=1."""
    unpadded = [1, 1, 1]
    padded = [1, 1, 1, 0]
    self.assertEqual(ndcg_at_k(unpadded, 3, method=1),
                     ndcg_at_k(padded, 4, method=1))
# Fragment of a larger evaluation loop: `ent`, `ent_vec`, `eval_query`,
# `sim` and the *_sum / ent_skip_count accumulators come from the
# enclosing (unseen) scope.
if ent not in ent_vec:
    # Entity has no embedding; count the skip.
    ent_skip_count += 1
else:
    tmp_can_count = 0
    # Score each candidate by dot product with the entity embedding.
    for can in eval_query[ent]:
        if can in ent_vec:
            tmp_can_count += 1
            a = ent_vec[ent] * ent_vec[can]
            sim.append((can, a.sum()))
    if tmp_can_count > 1:
        # Rank candidates by similarity, highest first.
        # NOTE(review): `sim` appears to accumulate outside this fragment;
        # confirm it is reset per entity by the enclosing loop.
        sim_rank = sorted(sim, key=lambda sim: sim[1], reverse=True)
        # Relevance grades in ranked order.
        r = []
        for item in sim_rank:
            r.append(eval_query[ent][item[0]])
        # NDCG@1/@5/@10 (method=1), truncated when fewer candidates exist.
        if len(r) > 1:
            tmp_n1 = rm.ndcg_at_k(r, 1, 1)
        else:
            tmp_n1 = rm.ndcg_at_k(r, len(r), 1)
        if len(r) > 5:
            tmp_n5 = rm.ndcg_at_k(r, 5, 1)
        else:
            tmp_n5 = rm.ndcg_at_k(r, len(r), 1)
        if len(r) > 10:
            tmp_n10 = rm.ndcg_at_k(r, 10, 1)
        else:
            tmp_n10 = rm.ndcg_at_k(r, len(r), 1)
        tmp_ap = rm.average_precision(r)
        # Accumulate into the outer-scope running sums.
        ndcg1_sum += tmp_n1
        ndcg5_sum += tmp_n5
        ndcg10_sum += tmp_n10
        map_sum += tmp_ap
def rec_net(train_loader, test_loader, node_emb, sequence_tensor):
    """Train a Recommendation net and evaluate HR@N / NDCG@N every 50 epochs.

    NOTE(review): reads the globals `test_data`, `device`, `N` and
    `Recommendation`; the `test_loader` parameter is never used — confirm.
    """
    # Best-so-far metrics across evaluation epochs.
    best_hit_1 = 0.0
    best_hit_5 = 0.0
    best_hit_10 = 0.0
    best_hit_20 = 0.0
    best_hit_50 = 0.0
    best_ndcg_1 = 0.0
    best_ndcg_5 = 0.0
    best_ndcg_10 = 0.0
    best_ndcg_20 = 0.0
    best_ndcg_50 = 0.0
    all_pos = []
    all_neg = []
    # NOTE(review): the return value of .numpy() is discarded; the loop
    # below still indexes the original tensor.
    test_data.numpy()
    # Split test rows into positive (link == 1) and negative examples.
    for index in range(test_data.shape[0]):
        user = test_data[index][0].item()
        item = test_data[index][1].item()
        link = test_data[index][2].item()
        if link == 1:
            all_pos.append((index, user, item))
        else:
            all_neg.append((index, user, item))
    recommendation = Recommendation(100).to(device)
    optimizer = torch.optim.Adam(recommendation.parameters(), lr=1e-3)
    for epoch in range(100):
        train_start_time = time.time()
        running_loss = 0.0
        for step, batch in enumerate(train_loader):
            # batch columns: [user, item, label]; embeddings are 100-dim.
            batch_item_emb = node_emb[batch[:, 1]].reshape(
                (batch.shape[0], 1, 100)).to(device)
            batch_labels = batch[:, 2].to(device)
            batch_sequence_tensor = sequence_tensor[batch[:, 0]].reshape(
                (batch.shape[0], 9, 100)).to(device)
            optimizer.zero_grad()
            prediction = recommendation(batch_item_emb,
                                        batch_sequence_tensor).to(device)
            loss_train = torch.nn.functional.cross_entropy(
                prediction, batch_labels).to(device)
            loss_train.backward()
            optimizer.step()
            running_loss += loss_train.item()
        train_time = time.time() - train_start_time
        print(
            f'epoch: {epoch}, training loss: {running_loss}, train time: {train_time}'
        )
        # Evaluate only every 50th epoch.
        if (epoch + 1) % 50 != 0:
            continue
        testing_start_time = time.time()
        hit_num_1 = 0
        hit_num_5 = 0
        hit_num_10 = 0
        hit_num_20 = 0
        hit_num_50 = 0
        all_ndcg_1 = 0
        all_ndcg_5 = 0
        all_ndcg_10 = 0
        all_ndcg_20 = 0
        all_ndcg_50 = 0
        for i, u_v_p in enumerate(all_pos):
            # Pair each positive with its block of N sampled negatives.
            start = N * i
            end = N * i + N
            p_and_n_seq = all_neg[start:end]
            p_and_n_seq.append(tuple(u_v_p))  # N+1 items
            # Look up embeddings and score every candidate.
            scores = []
            for index, userid, itemid in p_and_n_seq:
                # calculate score of user and item
                # NOTE(review): user_emb is computed but never used.
                user_emb = node_emb[userid].reshape((1, 1, 100)).to(device)
                this_item_emb = node_emb[itemid].reshape(
                    (1, 1, 100)).to(device)
                this_sequence_tensor = sequence_tensor[userid].reshape(
                    (1, 9, 100)).to(device)
                score = recommendation(this_item_emb,
                                       this_sequence_tensor)[:, -1].to(device)
                scores.append(score.item())
            # Min-max normalize. NOTE(review): divides by (max - min),
            # which is zero if all scores are equal — confirm acceptable.
            normalized_scores = [
                ((u_i_score - min(scores)) / (max(scores) - min(scores)))
                for u_i_score in scores
            ]
            # The positive item was appended last.
            pos_id = len(scores) - 1
            s = np.array(scores)
            sorted_s = np.argsort(-s)
            # Hit@N: does the positive land in the top-N by raw score?
            if sorted_s[0] == pos_id:
                hit_num_1 += 1
                hit_num_5 += 1
                hit_num_10 += 1
                hit_num_20 += 1
                hit_num_50 += 1
            elif pos_id in sorted_s[1:5]:
                hit_num_5 += 1
                hit_num_10 += 1
                hit_num_20 += 1
                hit_num_50 += 1
            elif pos_id in sorted_s[5:10]:
                hit_num_10 += 1
                hit_num_20 += 1
                hit_num_50 += 1
            elif pos_id in sorted_s[10:20]:
                hit_num_20 += 1
                hit_num_50 += 1
            elif pos_id in sorted_s[20:50]:
                hit_num_50 += 1
            # NOTE(review): NDCG is computed over the normalized scores in
            # candidate order, not over binary relevances in rank order —
            # confirm this is intended.
            ndcg_1 = ndcg_at_k(normalized_scores, 1, 0)
            ndcg_5 = ndcg_at_k(normalized_scores, 5, 0)
            ndcg_10 = ndcg_at_k(normalized_scores, 10, 0)
            ndcg_20 = ndcg_at_k(normalized_scores, 20, 0)
            ndcg_50 = ndcg_at_k(normalized_scores, 50, 0)
            all_ndcg_1 += ndcg_1
            all_ndcg_5 += ndcg_5
            all_ndcg_10 += ndcg_10
            all_ndcg_20 += ndcg_20
            all_ndcg_50 += ndcg_50
        # Average over positives and update the best-so-far trackers.
        all_pos_num = len(all_pos)
        hit_rate_1 = hit_num_1 / all_pos_num
        hit_rate_5 = hit_num_5 / all_pos_num
        hit_rate_10 = hit_num_10 / all_pos_num
        hit_rate_20 = hit_num_20 / all_pos_num
        hit_rate_50 = hit_num_50 / all_pos_num
        all_ndcg_1 = all_ndcg_1 / all_pos_num
        all_ndcg_5 = all_ndcg_5 / all_pos_num
        all_ndcg_10 = all_ndcg_10 / all_pos_num
        all_ndcg_20 = all_ndcg_20 / all_pos_num
        all_ndcg_50 = all_ndcg_50 / all_pos_num
        if best_hit_1 < hit_rate_1:
            best_hit_1 = hit_rate_1
        if best_hit_5 < hit_rate_5:
            best_hit_5 = hit_rate_5
        if best_ndcg_1 < all_ndcg_1:
            best_ndcg_1 = all_ndcg_1
        if best_hit_10 < hit_rate_10:
            best_hit_10 = hit_rate_10
        if best_hit_20 < hit_rate_20:
            best_hit_20 = hit_rate_20
        if best_hit_50 < hit_rate_50:
            best_hit_50 = hit_rate_50
        if best_ndcg_5 < all_ndcg_5:
            best_ndcg_5 = all_ndcg_5
        if best_ndcg_10 < all_ndcg_10:
            best_ndcg_10 = all_ndcg_10
        if best_ndcg_20 < all_ndcg_20:
            best_ndcg_20 = all_ndcg_20
        if best_ndcg_50 < all_ndcg_50:
            best_ndcg_50 = all_ndcg_50
        testing_time = time.time() - testing_start_time
        print(
            f"epo:{epoch}|"
            f"HR@1:{hit_rate_1:.4f} | HR@5:{hit_rate_5:.4f} | HR@10:{hit_rate_10:.4f} | HR@20:{hit_rate_20:.4f} | HR@50:{hit_rate_50:.4f} |"
            f" NDCG@1:{all_ndcg_1:.4f} | NDCG@5:{all_ndcg_5:.4f} | NDCG@10:{all_ndcg_10:.4f}| NDCG@20:{all_ndcg_20:.4f}| NDCG@50:{all_ndcg_50:.4f}|"
            f" best_HR@1:{best_hit_1:.4f} | best_HR@5:{best_hit_5:.4f} | best_HR@10:{best_hit_10:.4f} | best_HR@20:{best_hit_20:.4f} | best_HR@50:{best_hit_50:.4f} |"
            f" best_NDCG@1:{best_ndcg_1:.4f} | best_NDCG@5:{best_ndcg_5:.4f} | best_NDCG@10:{best_ndcg_10:.4f} | best_NDCG@20:{best_ndcg_20:.4f} | best_NDCG@50:{best_ndcg_50:.4f} |"
            f" train_time:{train_time:.2f} | test_time:{testing_time:.2f}")
    print('training finish')
def evaluate_retrieval(query_dct, corpus_dct):
    '''
    Given a query dictionary and a corpus dictionary, go through each query
    and determine the NDCG for its retrieval with the disease labels as
    relevance measures.
    '''
    # Map each symptom and herb to the number of patient visits it appears in.
    inverted_index, avg_doc_len = get_inverted_index(corpus_dct)
    corpus_size = len(corpus_dct)
    metric_dct = {}
    for query_key in query_dct:
        doc_score_dct = {}
        # Ignore the query herb set. q_disease is label, q_symptom is query.
        q_disease_set, q_symptom_set, q_herb_set = query_dct[query_key]
        for doc_key in corpus_dct:
            d_disease_set, d_symptom_set, d_herb_set = corpus_dct[doc_key]
            # With no query expansion, our document is just the set of symptoms.
            document = d_symptom_set
            # If synonym or herbs/mixed expansions, add herb list into document.
            if args.method == 'synonym' or args.term_type in [
                    'herbs', 'mixed'
            ]:
                document = document.union(d_herb_set)
            # Get the score between the query and the document.
            doc_score = okapi_bm25(q_symptom_set, document, inverted_index,
                                   corpus_size, avg_doc_len)
            # Compute the relevance judgement.
            relevance = get_rel_score(q_disease_set, d_disease_set)
            doc_score_dct[(doc_key, relevance)] = doc_score
        sorted_scores = sorted(doc_score_dct.items(),
                               key=operator.itemgetter(1),
                               reverse=True)
        # Get the relevance rankings.
        rel_list = [pair[0][1] for pair in sorted_scores]
        # Compute different rank metrics for different values of k.
        for k in k_list:
            if k not in metric_dct:
                metric_dct[k] = []
            if args.rank_metric == 'ndcg':
                metric_dct[k] += [ndcg_at_k(rel_list, k)]
            elif args.rank_metric == 'precision':
                # metric_dct[k] += [precision_at_k(rel_list, k)]
                metric_dct[k] += [sum(rel_list[:k]) / float(k)]
            elif args.rank_metric == 'recall':
                metric_dct[k] += [sum(rel_list[:k]) / float(sum(rel_list))]
            elif args.rank_metric == 'f1':
                precision = sum(rel_list[:k]) / float(k)
                recall = sum(rel_list[:k]) / float(sum(rel_list))
                if precision == 0:
                    metric_dct[k] += [0]
                else:
                    metric_dct[k] += [
                        2 * precision * recall / (precision + recall)
                    ]
            elif args.rank_metric == 'map':
                # Precision at each relevant rank within the top k.
                r = np.asarray(rel_list[:k]) != 0
                out = [precision_at_k(r, i + 1) for i in range(r.size) if r[i]]
                if not out:
                    metric_dct[k] += [0.0]
                else:
                    # NOTE(review): denominator is the total relevance sum of
                    # the FULL list, not len(out) — confirm this is the
                    # intended MAP variant.
                    metric_dct[k] += [sum(out) / sum(rel_list)]
    return metric_dct