def group_ndcg_score(truth, prediction, k=None, group_indices=None):
    if group_indices is None:
        return ndcg_score(np.expand_dims(truth, axis=0),
                          np.expand_dims(prediction, axis=0), k=k)
    avg_ndcg = 0
    cnt = 0
    for sel in group_indices:
        sel_truth = truth[sel]
        sel_prediction = prediction[sel]
        if len(sel) == 1:
            # NDCG is not defined for a single-document group, so skip it.
            continue
        try:
            group_ndcg = ndcg_score(np.expand_dims(sel_truth, axis=0),
                                    np.expand_dims(sel_prediction, axis=0), k=k)
            avg_ndcg += group_ndcg
            cnt += 1
        except Exception:
            print(sel_truth)
            print(sel_prediction)
            raise
    avg_ndcg /= cnt
    return avg_ndcg
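# A minimal usage sketch for group_ndcg_score above (toy data, not from the
# original repository): per-group NDCG averaged over groups of at least two items.
import numpy as np
from sklearn.metrics import ndcg_score

truth = np.array([3.0, 2.0, 0.0, 1.0, 0.0, 2.0])
prediction = np.array([2.5, 0.1, 0.2, 1.3, 0.0, 1.9])
groups = [np.array([0, 1, 2]), np.array([3, 4, 5])]
print(group_ndcg_score(truth, prediction, k=2, group_indices=groups))
print(group_ndcg_score(truth, prediction))  # single-query NDCG over all items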
def main():
    movielens = fetch_movielens()
    train = movielens['train']
    test = movielens['test']
    print(train.shape)
    print(test.shape)

    model = LightFM(learning_rate=0.05, loss='bpr')
    model.fit(train, epochs=10)

    k = 10
    train_precision = precision_at_k(model, train, k=k).mean()
    test_precision = precision_at_k(model, test, k=k).mean()
    print(f'precision_at_{k}(train): {train_precision}')
    print(f'precision_at_{k}(test) : {test_precision}')

    train_auc = auc_score(model, train).mean()
    test_auc = auc_score(model, test).mean()
    print(f'auc_score(train): {train_auc}')
    print(f'auc_score(test) : {test_auc}')

    # Note: predict_rank returns ranks where 0 is the most recommended item, so
    # passing the ranks directly to ndcg_score treats top-ranked items as least relevant.
    y_train_preds = model.predict_rank(train)
    y_test_preds = model.predict_rank(test)
    train_ndcg = ndcg_score(train.toarray(), y_train_preds.toarray())
    test_ndcg = ndcg_score(test.toarray(), y_test_preds.toarray())
    print(f'ndcg_score(train): {train_ndcg}')
    print(f'ndcg_score(test) : {test_ndcg}')

    print('DONE')
    return 0
def show_ndcg():
    print("This  {0:.4f}".format(ndcg_score(r1, r2, k=4)))
    print("Best  {0:.4f}".format(ndcg_score(r1, r2_best, k=4)))
    print("Worst {0:.4f}".format(ndcg_score(r1, r2_worst, k=4)))
    print(dcg_score(r1, r2))
    print(dcg_score(r1, r2_best))
    print(dcg_score(r1, r2_worst))
def test_ndcg_score():
    # Check perfect ranking
    y_true = [1, 0, 2]
    y_score = [
        [0.15, 0.55, 0.2],
        [0.7, 0.2, 0.1],
        [0.06, 0.04, 0.9]
    ]
    perfect = ndcg_score(y_true, y_score)
    assert_equal(perfect, 1.0)

    # Check bad ranking with a small K
    y_true = [0, 2, 1]
    y_score = [
        [0.15, 0.55, 0.2],
        [0.7, 0.2, 0.1],
        [0.06, 0.04, 0.9]
    ]
    short_k = ndcg_score(y_true, y_score, k=1)
    assert_equal(short_k, 0.0)

    # Check a random scoring
    y_true = [2, 1, 0]
    y_score = [
        [0.15, 0.55, 0.2],
        [0.7, 0.2, 0.1],
        [0.06, 0.04, 0.9]
    ]
    average_ranking = ndcg_score(y_true, y_score, k=2)
    assert_almost_equal(average_ranking, 0.63092975)
def evaluate_metric(args, net, dataset, segment='valid', debug=False):
    # The input is the valid/test data: each user has one purchase record. We sort
    # the predicted scores and measure NDCG on the valid/test set.
    # Users could be processed in batches of 128.
    # pred holds predictions for every user, so this inevitably takes a while.
    # We use each user's score over all items; the test data also contains the
    # corresponding items, and we check whether they are recovered.
    # input: predicted values / test data
    # output: NDCG, recall
    possible_rating_values = dataset.possible_rating_values
    nd_possible_rating_values = th.FloatTensor(possible_rating_values).to(args.device)
    if segment == "test":
        rating_matrix = dataset.test_rating_matrix
        enc_graph = dataset.test_enc_graph
        dec_graph = dataset.test_recall_dec_graph
        user_len = len(list(pd.unique(dataset.test_rating_info["user_id"])))
    elif segment == "valid":
        rating_matrix = dataset.valid_rating_matrix
        enc_graph = dataset.valid_enc_graph
        dec_graph = dataset.valid_recall_dec_graph
        user_len = len(list(pd.unique(dataset.valid_rating_info["user_id"])))
    else:
        raise NotImplementedError

    # Evaluate RMSE
    net.eval()
    with th.no_grad():
        pred_ratings, reg_loss, user_out, movie_out, W = net(
            enc_graph, dec_graph, dataset.user_feature, dataset.movie_feature)
    if args.loss_func == "CE":
        max_rating, max_indices = th.max(pred_ratings, dim=1)
        pred = nd_possible_rating_values[max_indices]
        real_pred_ratings = (th.softmax(pred_ratings, dim=1) *
                             nd_possible_rating_values.view(1, -1)).sum(dim=1)
    elif args.loss_func == "MLP":
        real_pred_ratings = pred_ratings[:, 0]

    pred = real_pred_ratings.cpu().numpy()
    prediction = np.reshape(pred, (user_len, movie_out.shape[0]))
    print("pred:", prediction[0:2], prediction.shape)
    # pred = real_pred_ratings.cpu().numpy()[0:movie_out.shape[0]]
    rating_matrix = rating_matrix.cpu().numpy()

    metric_ndcg = []
    ndcg_20 = ndcg_score(rating_matrix, prediction, k=20)
    ndcg_40 = ndcg_score(rating_matrix, prediction, k=40)
    ndcg_80 = ndcg_score(rating_matrix, prediction, k=80)
    metric_ndcg.append(ndcg_20)
    metric_ndcg.append(ndcg_40)
    metric_ndcg.append(ndcg_80)
    if segment == "test":
        print("ndcg@20:", ndcg_20)
        print("ndcg@40:", ndcg_40)
        print("ndcg@80:", ndcg_80)

    return metric_ndcg
def grid_search(relevance, docs, summarize_class, parameter_space):
    scores = {}
    with Live(console=console, screen=True, auto_refresh=False) as live:
        for i, param in enumerate(parameter_space, 1):
            summarizer = summarize_class(**param)
            score_10, score_50, score_80 = [], [], []
            for y_true, d in zip(relevance, docs):
                y_pred = [summarizer.predict(d)]
                score_10.append(ndcg_score(y_true, y_pred, k=ceil(len(d) * 0.1)))
                score_50.append(ndcg_score(y_true, y_pred, k=ceil(len(d) * 0.5)))
                score_80.append(ndcg_score(y_true, y_pred, k=ceil(len(d) * 0.8)))
            scores[(summarizer.__class__.__name__, json.dumps(param))] = (
                np.array(score_10).mean(),
                np.array(score_50).mean(),
                np.array(score_80).mean(),
            )
            live.update(topk_table(scores, i, len(parameter_space)), refresh=True)
    return scores
def test_perform_w_new_items(self):
    metric = self.metric
    result_w_new_items = metric.perform(split_w_new_items)

    u1_actual = [[2, 3, 1, 0, 2, 3, 0, 0]]
    u1_ideal = [[3, 3, 2, 2, 1, 0, 0, 0]]
    u1_expected_ndcg = ndcg_score(u1_ideal, u1_actual)
    u1_result_ndcg = float(
        result_w_new_items.query('from_id == "u1"')[str(metric)])
    self.assertAlmostEqual(u1_expected_ndcg, u1_result_ndcg)

    u2_actual = [[3, 0, 4, 3]]
    u2_ideal = [[4, 3, 3, 0]]
    u2_expected_ndcg = ndcg_score(u2_ideal, u2_actual)
    u2_result_ndcg = float(
        result_w_new_items.query('from_id == "u2"')[str(metric)])
    self.assertAlmostEqual(u2_expected_ndcg, u2_result_ndcg)

    sys_expected_ndcg = (u1_expected_ndcg + u2_expected_ndcg) / 2
    sys_result_ndcg = float(
        result_w_new_items.query('from_id == "sys"')[str(metric)])
    self.assertAlmostEqual(sys_expected_ndcg, sys_result_ndcg)
def evaluate(filename, k, relevance_field_name=None):
    if relevance_field_name:
        evals = list(
            load_json(
                filename,
                lambda x: {'name': x['name'], 'score': x[relevance_field_name]}))
        predicted = [
            thing['name'] for thing in sorted(evals, key=lambda x: -x['score'])
        ]
    else:
        predicted = [
            thing['name']
            for thing in load_json(filename, lambda x: {'name': x['name']})
        ]
    random_predicted = copy.copy(predicted)
    random.shuffle(random_predicted)

    if len(seeds.intersection(predicted)) < 30:
        logging.warning(
            "Not enough seeds included in the list to be evaluated. "
            "This evaluation may not be accurate.")

    results = {}
    results[f'mAP@{k}'] = ap_at_k(seeds, predicted, k)
    results[f'p@{k}'] = precision(seeds, predicted, k)
    results[f'random_mAP@{k}'] = ap_at_k(seeds, random_predicted, k)
    results[f'random_p@{k}'] = precision(seeds, random_predicted, k)

    if args.ndcg:
        from sklearn.metrics import ndcg_score
        # Requires relevance_field_name, since `evals` is only built in that branch.
        scores = np.array([thing['score'] for thing in evals])
        targets = np.array([thing['name'] in seeds for thing in evals])
        results[f'nDCG@{k}'] = ndcg_score([targets[:k]], [scores[:k]])
        random.shuffle(scores)
        results[f'random_nDCG@{k}'] = ndcg_score([targets[:k]], [scores[:k]])

    return results
def create_candidate_svm(embedding, term, quants, classifier, plot_svm=False, descriptions=None,
                         quant_name=None, pgbar=None, **kwargs):
    # !! term is only used for visualization, and it must stay that way for
    # CLUSTER_DIRECTION_ALGO = "reclassify" !
    # Ensure that regardless of quant_measure these are correct binary classification labels.
    bin_labels = np.array(quants, dtype=bool)
    # (tmp := len(quants)/(2*np.bincount(bin_labels)))[0]/tmp[1] is roughly equal to
    # bin_labels.mean(), so the class balancing is fine.
    if classifier == "SVM":
        svm = sklearn.svm.LinearSVC(class_weight="balanced", loss="hinge", max_iter=20000)
    elif classifier == "SVM_square":
        # squared hinge instead of hinge (but fastest!)
        svm = sklearn.svm.LinearSVC(dual=False, class_weight="balanced")
    elif classifier == "SVM2":
        warnings.warn("Using an SVM implementation that's slower for this kind of data!")
        # Slower than LinearSVC, don't use!
        # See https://stackoverflow.com/q/33843981/5122790, https://stackoverflow.com/q/35076586/5122790
        svm = sklearn.svm.SVC(kernel="linear", class_weight="balanced", decision_function_shape="ovo")
    else:
        raise NotImplementedError(f"Demanded classifier {classifier} not implemented!")

    with warnings.catch_warnings(record=True) as w:
        warnings.simplefilter("always")
        svm.fit(embedding, bin_labels)
        if w:
            assert issubclass(w[0].category, (sklearn.exceptions.ConvergenceWarning, DeprecationWarning))
        no_converge = bool(w) and issubclass(w[0].category, sklearn.exceptions.ConvergenceWarning)

    tn, fp, fn, tp = confusion_matrix(bin_labels, svm.predict(embedding)).ravel()
    res = {"accuracy": (tp + tn) / len(quants),
           "precision": tp / (tp + fp),
           "recall": tp / (tp + fn),
           "did_converge": not no_converge}
    res["f_one"] = 2 * (res["precision"] * res["recall"]) / (res["precision"] + res["recall"])

    # Now, in [DESC15:4.2.1], they compare the "ranking induced by \vec{v_t} with the number of
    # times the term occurs in the entity's documents" with Cohen's Kappa.
    # See notebooks/proof_of_concept/get_svm_decisionboundary.ipynb#Checking-projection-methods-&-distance-measures-from-point-to-projection
    # for the ranking.
    decision_plane = NDPlane(svm.coef_[0], svm.intercept_[0])  # don't even need the plane class here
    dist = lambda x, plane: np.dot(plane.normal, x) + plane.intercept
    distances = [dist(point, decision_plane) for point in embedding]
    # See https://scikit-learn.org/stable/modules/generated/sklearn.svm.SVC.html#sklearn.svm.SVC.decision_function
    # and https://stats.stackexchange.com/a/14881
    assert np.allclose(distances, svm.decision_function(embedding))
    distances /= np.linalg.norm(svm.coef_[0])  # TODO: add the links and this normalization to the distances notebook
    # Sanity check: do most of the points with label=0 have the same sign?
    #   np.count_nonzero(np.sign(np.array(distances)[bin_labels]) + 1)
    #   bin_labels, np.array((np.sign(np.array(distances)) + 1) / 2, dtype=bool)
    # quant_ranking = np.zeros(quants.shape); quant_ranking[np.where(quants > 0)] = np.argsort(quants[quants > 0])

    # TODO: Cohen's kappa has a sample_weight parameter! DESC15 write they select Kappa
    # "due to its tolerance to class imbalance." -> Does that mean I have to set the weight?!
    kappa_weights = get_setting("KAPPA_WEIGHTS") if get_setting("KAPPA_WEIGHTS") != "None" else None
    # "dense": if there are 14,900 zeros, the next rank is a 1; "min": the next one is a 14,901.
    res["kappa_rank2rank_dense"] = cohen_kappa(rankdata(quants, method="dense"),
                                               rankdata(distances, method="dense"), weights=kappa_weights)
    res["kappa_rank2rank_min"] = cohen_kappa(rankdata(quants, method="min"),
                                             rankdata(distances, method="dense"), weights=kappa_weights)
    res["kappa_bin2bin"] = cohen_kappa(bin_labels, [i > 0 for i in distances], weights=kappa_weights)
    res["kappa_digitized"] = cohen_kappa(np.digitize(quants, np.histogram_bin_edges(quants)[1:]),
                                         np.digitize(distances, np.histogram_bin_edges(distances)[1:]),
                                         weights=kappa_weights)
    res["ndcg_all"] = ndcg_score(np.array([quants]), np.expand_dims(distances, 0))
    res["ndcg_onlypos"] = ndcg_score(np.array([quants]), np.expand_dims(distances, 0),
                                     k=np.count_nonzero(np.array(quants)))

    nonzero_indices = np.where(np.array(quants) > 0)[0]
    q2, d2 = np.array(quants)[nonzero_indices], np.array(distances)[nonzero_indices]
    with nullcontext():  # warnings.catch_warnings():  # TODO get rid of what causes the NaNs here!
        # warnings.filterwarnings('ignore', r'invalid value encountered in true_divide')
        if quant_name == "count":
            # In DESC15 they write "measure the correlation between the ranking induced by \vec{vt}
            # and the number of times t appears in the documents associated with each entity",
            # so maybe compare ranking to count?!
            # res["kappa_count2rank"] = cohen_kappa(quants, rankdata(distances, method="dense"), weights=kappa_weights)
            res["kappa_count2rank_onlypos"] = cohen_kappa(q2, rankdata(d2, method="dense"), weights=kappa_weights)
        res["kappa_rank2rank_onlypos_dense"] = cohen_kappa(rankdata(q2, method="dense"),
                                                           rankdata(d2, method="dense"), weights=kappa_weights)
        res["kappa_rank2rank_onlypos_min"] = cohen_kappa(rankdata(q2, method="min"),
                                                         rankdata(d2, method="min"), weights=kappa_weights)
        res["kappa_rank2rank_onlypos_max"] = cohen_kappa(rankdata(q2, method="max"),
                                                         rankdata(d2, method="max"), weights=kappa_weights)
        # res["kappa_digitized_onlypos_1"] = cohen_kappa(np.digitize(q2, np.histogram_bin_edges(quants)[1:]),
        #                                                np.digitize(d2, np.histogram_bin_edges(distances)[1:]),
        #                                                weights=kappa_weights)
        # ^ the first uses the histogram bins of ALL data, the second only those of the nonzero values.
        res["kappa_digitized_onlypos_2"] = cohen_kappa(np.digitize(q2, np.histogram_bin_edges(q2)[1:]),
                                                       np.digitize(d2, np.histogram_bin_edges(d2)[1:]),
                                                       weights=kappa_weights)

    if plot_svm and descriptions is not None:
        display_svm(embedding, np.array(bin_labels, dtype=int), svm, term=term, descriptions=descriptions,
                    name=term + " " + (", ".join(f"{k}: {round(v, 3)}" for k, v in res.items())),
                    quants=quants, distances=distances, **kwargs)
    if pgbar is not None:
        pgbar.update(1)
    return res, decision_plane, term
def evaluate(table_format, tag, debug):
    """Evaluate all summarizers in sadedeGel"""
    if not debug:
        warnings.filterwarnings("ignore")

    anno = load_annotated_corpus(False)
    relevance = [[doc['relevance']] for doc in anno]

    summarizers = [
        summ for summ in SUMMARIZERS if any(_tag in summ[1] for _tag in tag)
    ]

    scores = defaultdict(list)
    for word_tokenizer in ['simple', 'bert']:
        click.echo("Word Tokenizer: " + click.style(f"{word_tokenizer}", fg="blue"))
        docs = [Doc.from_sentences(doc['sentences']) for doc in anno]  # Reset documents because of memoization

        with tokenizer_context(word_tokenizer):
            for name, summarizer in summarizers:
                click.echo(click.style(f" {name} ", fg="magenta"), nl=False)

                # Skip the simple tokenizer for clustering models
                if ("cluster" in summarizer or "rank" in summarizer or name == "TFIDF Summarizer") and \
                        word_tokenizer == "simple":
                    click.echo(click.style("SKIP", fg="yellow"))
                    continue

                for i, (y_true, d) in enumerate(zip(relevance, docs)):
                    dot_progress(i, len(relevance))

                    y_pred = [summarizer.predict(d.sents)]

                    score_10 = ndcg_score(y_true, y_pred, k=ceil(len(d) * 0.1))
                    score_50 = ndcg_score(y_true, y_pred, k=ceil(len(d) * 0.5))
                    score_80 = ndcg_score(y_true, y_pred, k=ceil(len(d) * 0.8))

                    scores[f"{name} - {word_tokenizer}"].append(
                        (score_10, score_50, score_80))

    table = [[
        algo,
        np.array([s[0] for s in scores]).mean(),
        np.array([s[1] for s in scores]).mean(),
        np.array([s[2] for s in scores]).mean()
    ] for algo, scores in scores.items()]

    # TODO: Sample weight of instances.
    print(
        tabulate(table,
                 headers=[
                     'Method & Tokenizer', 'ndcg(k=0.1)', 'ndcg(k=0.5)', 'ndcg(k=0.8)'
                 ],
                 tablefmt=table_format,
                 floatfmt=".4f"))

    if debug:
        click.echo(np.array(table).shape)
def eval():
    true_relevance = np.asarray([[10, 0, 0, 1, 5]])
    scores = np.asarray([[1, 0, 0, 0, 1]])
    score = ndcg_score(true_relevance, scores, k=5)
    return score
def evaluate_metric(args, net, dataset, segment='valid', debug=False):
    possible_rating_values = dataset.possible_rating_values
    nd_possible_rating_values = th.FloatTensor(possible_rating_values).to(args.device)
    if segment == "test":
        rating_matrix = dataset.test_rating_matrix
        enc_graph = dataset.test_enc_graph
        dec_graph = dataset.test_recall_dec_graph
        user_len = len(list(pd.unique(dataset.test_rating_info["user_id"])))
    elif segment == "valid":
        rating_matrix = dataset.valid_rating_matrix
        enc_graph = dataset.valid_enc_graph
        dec_graph = dataset.valid_recall_dec_graph
        user_len = len(list(pd.unique(dataset.valid_rating_info["user_id"])))
    else:
        raise NotImplementedError

    # Evaluate RMSE
    net.eval()
    with th.no_grad():
        pred_ratings, reg_loss, user_out, movie_out, W = net(
            enc_graph, dec_graph, dataset.user_feature, dataset.movie_feature)
    if args.loss_func == "CE":
        max_rating, max_indices = th.max(pred_ratings, dim=1)
        pred = nd_possible_rating_values[max_indices]
        real_pred_ratings = (th.softmax(pred_ratings, dim=1) *
                             nd_possible_rating_values.view(1, -1)).sum(dim=1)
    elif args.loss_func == "MLP":
        real_pred_ratings = pred_ratings[:, 0]

    pred = real_pred_ratings.cpu().numpy()
    prediction = np.reshape(pred, (user_len, movie_out.shape[0]))
    # pred = real_pred_ratings.cpu().numpy()[0:movie_out.shape[0]]
    rating_matrix = rating_matrix.cpu().numpy()

    metric_ndcg = []
    ndcg_20 = ndcg_score(rating_matrix, prediction, k=20)
    ndcg_40 = ndcg_score(rating_matrix, prediction, k=40)
    ndcg_80 = ndcg_score(rating_matrix, prediction, k=80)
    metric_ndcg.append(ndcg_20)
    metric_ndcg.append(ndcg_40)
    metric_ndcg.append(ndcg_80)
    if segment == "test":
        print("NDCG test")
        print("ndcg@20:", ndcg_20)
        print("ndcg@40:", ndcg_40)
        print("ndcg@80:", ndcg_80)

    return metric_ndcg
def compute_ndcg_scores(groundTruthRanks, integerValuedQueryRanks):
    ndcg_scores_5 = []
    ndcg_scores_10 = []
    for x, y in zip(groundTruthRanks, integerValuedQueryRanks):
        if len(x) > 1:
            true_relevance = np.asarray([x])
            relevance_score = np.asarray([y])
            ndcg_scores_5.append(ndcg_score(true_relevance, relevance_score, k=5))
            ndcg_scores_10.append(ndcg_score(true_relevance, relevance_score, k=10))
    return (sum(ndcg_scores_5) / len(ndcg_scores_5)), (sum(ndcg_scores_10) / len(ndcg_scores_10))
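# Hypothetical sanity check for compute_ndcg_scores above: two queries with
# graded ground-truth relevance and integer-valued predicted relevance (made-up data).
import numpy as np
from sklearn.metrics import ndcg_score

gt_ranks = [[3, 2, 3, 0, 1, 2], [2, 1, 0, 0]]
query_ranks = [[3, 3, 2, 0, 0, 1], [1, 2, 0, 0]]
mean_ndcg_5, mean_ndcg_10 = compute_ndcg_scores(gt_ranks, query_ranks)
print(mean_ndcg_5, mean_ndcg_10)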
def calc_rank_scores_at_k(y_true, y_pred, top_k_pctiles=[10, 20]):
    # Given arrays of scalar true and predicted scores, calculate metrics for the
    # top-K-percentile elements: precision@K, support@K and ndcg@K.
    precisions = []
    ndcg_scores = []
    supports = []
    for top_pctile in top_k_pctiles:
        pctile = 100 - top_pctile                   # top 10th percentile means 90th-percentile CDF
        thr_true = np.percentile(y_true, pctile)    # find threshold for true labels
        thr_pred = np.percentile(y_pred, pctile)    # find threshold for predicted labels
        labels_true = y_true >= thr_true            # label +ve class in true scores
        labels_pred = y_pred >= thr_pred            # label +ve class in predicted scores
        f1_metrics = f1_help(labels_true,           # calculate f1 for top-K
                             labels_pred,           # binary classification
                             average='binary',
                             pos_label=1)
        num_top_rank = sum(labels_true)
        ndcg = ndcg_score(y_true.reshape((1, -1)),  # calculate ndcg score at K;
                          y_pred.reshape((1, -1)),  # the rank scores must be axis 1,
                          k=num_top_rank)           # each 'query' is axis 0
        precisions.append(f1_metrics[0])
        supports.append(sum(labels_true))
        ndcg_scores.append(ndcg)
    return precisions, supports, ndcg_scores
def evaluate_ground_truth(args):
    """Evaluate the quality of the ground-truth training set."""
    author_rank_val = load_author_rank('val')
    author_rank_train = load_author_rank('train_original')
    fields = list(set(author_rank_val) & set(author_rank_train))
    author_rank_val = {k: v for k, v in author_rank_val.items() if k in fields}
    author_rank_train = {k: v for k, v in author_rank_train.items() if k in fields}

    num_authors = OAGCSDataset()[0].num_nodes('author')
    true_relevance = np.zeros((len(fields), num_authors), dtype=np.int32)
    scores = np.zeros_like(true_relevance)
    for i, f in enumerate(fields):
        for r, a in enumerate(author_rank_val[f]):
            if a != -1:
                true_relevance[i, a] = math.ceil((100 - r) / 10)
        author_rank_val[f] = [a for a in author_rank_val[f] if a != -1]
        for r, a in enumerate(author_rank_train[f]):
            scores[i, a] = len(author_rank_train[f]) - r

    for k in (100, 50, 20, 10, 5):
        print('nDCG@{0}={1:.4f}\tPrecision@{0}={2:.4f}\tRecall@{0}={3:.4f}'.format(
            k,
            ndcg_score(true_relevance, scores, k=k, ignore_ties=True),
            sum(precision_at_k(author_rank_val[f], author_rank_train[f], k) for f in fields) / len(fields),
            sum(recall_at_k(author_rank_val[f], author_rank_train[f], k) for f in fields) / len(fields)))
def compute_ndcg_between_vislists(l1: lux.vis.VisList, l2: lux.vis.VisList, k: int) -> float:
    if len(l1) == len(l2) == 1:
        return 1
    l1_scores = [vis.score for vis in l1]
    map1 = convert_vlist_to_hashmap(l1)
    l2_scores = [vis.score for vis in l2]
    map2 = convert_vlist_to_hashmap(l2)

    # Combine the two dictionaries map1, map2 into a single global_map
    global_map = set(map1.keys())
    global_map.update(set(map2.keys()))
    global_map = list(global_map)

    # Somehow our own NDCG calculation always leads to > 1
    # aligned_score1 = list(get_aligned_dict(map1, global_map).values())
    # aligned_score2 = list(get_aligned_dict(map2, global_map).values())
    # return ndcg(aligned_score1, aligned_score2, 5)
    # from scipy.stats import stats
    # rank1 = stats.rankdata(aligned_score1)
    # rank2 = stats.rankdata(aligned_score2)
    # return ndcg(rank1, rank2, 3)
    aligned_score1 = np.asarray([list(get_aligned_dict(map1, global_map).values())])
    aligned_score2 = np.asarray([list(get_aligned_dict(map2, global_map).values())])
    from sklearn.metrics import ndcg_score
    return ndcg_score(aligned_score1, aligned_score2, k=k)
def get_ndcg_score(model: RecommenderSystem, test_ratings: pd.DataFrame) -> List[float]:
    test_users = set(test_ratings['userId'].values)
    ndcg_scores = []
    for user_id in tqdm(test_users, desc='Testing predictions'):
        pred_movies, pred_scores = model.predict_scores(user_id)
        pred_movies = {
            movie_id: score
            for movie_id, score in zip(pred_movies, pred_scores)
        }

        test_user_ratings = test_ratings.loc[test_ratings['userId'] == user_id]
        test_user_ratings = test_user_ratings.sort_values(by='rating', ascending=False)

        # True relevance is the user's rating; the prediction is the model's score
        # for the same movie, aligned by movieId.
        true_relevance = test_user_ratings['rating'].values
        pred_movies_scores = [
            pred_movies[movie_id] for movie_id in test_user_ratings['movieId'].values
        ]

        ndcg = ndcg_score([true_relevance], [pred_movies_scores])
        ndcg_scores.append(ndcg)
    return ndcg_scores
def evaluate_ndcg(k, pred_list, label_list, batch_size, list_length):
    preds = np.array_split(pred_list.flatten(), pred_list.shape[0] / list_length)
    labels = np.array_split(label_list.flatten(), pred_list.shape[0] / list_length)
    NDCG = ndcg_score(y_true=labels, y_score=preds, k=k)
    '''
    ndcg = []
    for pred, label in zip(preds, labels):
        idx = np.argsort(-pred)
        accumulation = 0.0
        normalization = 0.0
        sorted_label = label[np.argsort(-label)]
        for i in range(0, k):
            accumulation += float(label[idx[i]]) / np.log2(i + 2.0)
            normalization += float(sorted_label[i]) / np.log2(i + 2.0)
        if normalization == 0:
            ndcg.append(0)
        else:
            ndcg.append(accumulation / normalization)
    NDCG = np.mean(ndcg)
    '''
    return NDCG
def main():
    # Evaluate model performance.
    args = parse.get_test_args()
    X_train, X_test, X_val, y_train, y_test, y_val, group_vali, group_train = get_data(
        args["data_path"])
    gbm = lgb.Booster(model_file=args["model_path"])

    # Get the "ideal" order of y_test by sorting in descending order.
    true_relevance = y_test.sort_values(ascending=False)

    # Get the actual order of y_test by sorting it according to our model's predictions.
    test_pred = gbm.predict(X_test)
    y_test = pd.DataFrame({
        "relevance_score": y_test,
        "predicted_ranking": test_pred
    })
    relevance_score = y_test.sort_values("predicted_ranking", ascending=False)

    # Use the computed variables to calculate the nDCG score.
    print(
        "nDCG score: ",
        ndcg_score([true_relevance.to_numpy()],
                   [relevance_score["relevance_score"].to_numpy()]),
    )
def test_restriction_local(U: int, I: int) -> None:
    try:
        from sklearn.metrics import ndcg_score
    except ImportError:
        pytest.skip()
    rns = np.random.RandomState(42)
    recommendables: List[np.ndarray] = []
    for _ in range(U):
        recommendables.append(
            rns.choice(np.arange(I), replace=False, size=rns.randint(2, I))
        )
    scores = rns.randn(U, I)
    X_gt = (rns.rand(U, I) >= 0.3).astype(np.float64)
    eval = Evaluator(
        sps.csr_matrix(X_gt),
        offset=0,
        cutoff=I,
        n_threads=1,
        per_user_recommendable_items=recommendables,
    )
    # empty mask
    mock_rec = MockRecommender(sps.csr_matrix(X_gt.shape), scores)
    my_score = eval.get_score(mock_rec)
    sklearn_metrics = defaultdict(list)
    for i in range(scores.shape[0]):
        if X_gt[i, recommendables[i]].sum() == 0:
            continue
        ndcg = ndcg_score(
            X_gt[i, recommendables[i]][None, :], scores[i, recommendables[i]][None, :]
        )
        sklearn_metrics["ndcg"].append(ndcg)

    assert my_score["ndcg"] == pytest.approx(np.mean(sklearn_metrics["ndcg"]), abs=1e-8)
def get_ndcg(surprise_predictions, k_highest_scores=None):
    """
    Calculate NDCG (normalized discounted cumulative gain) from surprise predictions,
    using sklearn.metrics.ndcg_score and scipy.sparse.

    Parameters:
        surprise_predictions (list of surprise.prediction_algorithms.predictions.Prediction):
            list of predictions
        k_highest_scores (positive integer): only consider the k highest scores in the
            ranking; if None, use all.

    Returns:
        float in [0., 1.]: the NDCG score averaged over all recommendations
    """
    uids = [int(p.uid) for p in surprise_predictions]
    iids = [int(p.iid) for p in surprise_predictions]
    r_uis = [p.r_ui for p in surprise_predictions]
    ests = [p.est for p in surprise_predictions]

    assert len(uids) == len(iids) == len(r_uis) == len(ests)

    sparse_preds = sparse.coo_matrix((ests, (uids, iids)))
    sparse_vals = sparse.coo_matrix((r_uis, (uids, iids)))

    dense_preds = sparse_preds.toarray()
    dense_vals = sparse_vals.toarray()

    return ndcg_score(y_true=dense_vals, y_score=dense_preds, k=k_highest_scores)
def test_restriction_global(U: int, I: int, R: int) -> None:
    rns = np.random.RandomState(42)
    recommendable = rns.choice(np.arange(I), replace=False, size=R)
    scores = rns.randn(U, I)
    X_gt = (rns.rand(U, I) >= 0.3).astype(np.float64)
    eval = Evaluator(
        sps.csr_matrix(X_gt),
        offset=0,
        cutoff=I,
        n_threads=1,
        recommendable_items=recommendable,
    )
    # empty mask
    mock_rec = MockRecommender(sps.csr_matrix(X_gt.shape), scores)
    my_score = eval.get_score(mock_rec)
    sklearn_metrics = defaultdict(list)
    for i in range(scores.shape[0]):
        if X_gt[i, recommendable].sum() == 0:
            continue
        ndcg = ndcg_score(X_gt[i, recommendable][None, :],
                          scores[i, recommendable][None, :])
        sklearn_metrics["ndcg"].append(ndcg)

    assert my_score["ndcg"] == pytest.approx(np.mean(sklearn_metrics["ndcg"]), abs=1e-8)
def ndcg(
    target_index: int,
    scraps: np.ndarray,
    interactions: np.ndarray,
    benchmark: Benchmark,
    exp_base: int = 1,
    k: int = None,
) -> List[float]:
    _assert_scrap_size(scraps, benchmark)
    if k:
        scraps = scraps[:, :k]
    assert isinstance(exp_base, int)
    assert scraps.shape == interactions.shape
    exp_base = float(exp_base)

    levels = benchmark.dsm.levels[target_index]
    relevance = levels[scraps].astype(float)
    relevance[relevance <= 0] = np.inf
    relevance -= 1
    relevance = np.power(exp_base, -relevance)

    scores = []
    for interaction, ranking in zip(interactions, relevance):
        score = ndcg_score([interaction], [ranking])
        scores.append(score)
    return scores
def mean_ndcg(self, res, qrels):
    from sklearn.metrics import ndcg_score
    ndcgs = []
    joined = res.merge(qrels, how='left', on=['qid', 'docno'])
    for qid, qid_group in joined.fillna(0).groupby('qid'):
        ndcgs.append(ndcg_score([qid_group["label"].values], [qid_group["score"].values]))
    return sum(ndcgs) / len(ndcgs)
def evaluate_model(model, valid_data_loader, k, device):
    scores = []
    for inputs, targets in valid_data_loader:
        out = model(inputs.to(device))
        scores.append(ndcg_score(targets.numpy(), out[0].T.cpu().numpy(), k=k))
    return np.mean(scores)
def __call__(
    self,
    y: "npt.ArrayLike",
    preds: "npt.ArrayLike",
) -> "StatsOutType":
    from ..sk.metrics import (
        spearmans_correlation,
        pearsons_correlation,
        tau_correlation,
    )
    from sklearn.metrics import ndcg_score

    y_ = clip(np.asarray(y))
    preds_ = clip(np.asarray(preds))
    results: "StatsOutType" = {
        "pearsons": pearsons_correlation(self.as_1d(y_), self.as_1d(preds_)),
        "spearmans": spearmans_correlation(self.as_1d(y_), self.as_1d(preds_)),
        "tau": tau_correlation(self.as_1d(y_), self.as_1d(preds_)),
        "ndcg": ndcg_score(self.as_2d(y_).T, self.as_2d(preds_).T),
    }
    return results
def ndcg_at_k(expected_order, actual_scores, k):
    """
    NDCG score provided by the sklearn ndcg_score method:
    https://scikit-learn.org/stable/modules/generated/sklearn.metrics.ndcg_score.html

    The expected order is the array of places (integers > 0): lower values mean more
    important items. The actual score is the score computed for each element: a higher
    score means a more important item.

    :param expected_order: real ranking from ground truth
    :param actual_scores: scores to test
    :param k: number of places in the rank to take into account
    :return: float NDCG value between 0 and 1
    """
    if expected_order.shape != actual_scores.shape:
        raise Exception("Shapes must match")
    if len(expected_order.shape) != 1:
        raise Exception("Not tested on higher dimensions")

    expected_order = np.expand_dims(expected_order, axis=0)
    actual_scores = np.expand_dims(actual_scores, axis=0)

    # Convert rank places into relevance gains: the best place gets the highest gain.
    ndcg_true_rel = np.max(expected_order) - expected_order
    ndcg_real_scores = actual_scores

    return ndcg_score(ndcg_true_rel, ndcg_real_scores, k=k)
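# Hypothetical usage for ndcg_at_k above: expected_order gives rank places
# (1 = most important), actual_scores gives model scores (higher = better).
import numpy as np
from sklearn.metrics import ndcg_score

expected_order = np.array([3, 1, 2, 4])
actual_scores = np.array([0.1, 0.9, 0.5, 0.05])
print(ndcg_at_k(expected_order, actual_scores, k=3))  # 1.0, since the score order matches the places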
def ndcg(gt, preds):
    print('.ndcg')
    gt = torch.from_numpy(gt)
    preds = torch.from_numpy(preds)
    K = [5, 10, 20]
    # Check how this is actually computed; the concrete implementation here should be revised.
    return [ndcg_score(gt, preds, k=k) for k in K]
def test_metrics_with_cutoff(U: int, I: int, C: int) -> None:
    rns = np.random.RandomState(42)
    scores = rns.randn(U, I)
    X_gt = (rns.rand(U, I) >= 0.3).astype(np.float64)
    eval = Evaluator(sps.csr_matrix(X_gt), offset=0, cutoff=C, n_threads=2)
    eval_finer_chunk = Evaluator(
        sps.csr_matrix(X_gt), offset=0, cutoff=C, n_threads=2, mb_size=1
    )
    # empty mask
    mock_rec = MockRecommender(sps.csr_matrix(X_gt.shape), scores)
    my_score = eval.get_score(mock_rec)
    my_score_finer = eval_finer_chunk.get_score(mock_rec)
    for key in my_score:
        assert my_score_finer[key] == pytest.approx(my_score[key])

    ndcg = 0.0
    valid_users = 0
    map = 0.0
    precision = 0.0
    recall = 0.0
    item_appearance_count = np.zeros((I,), dtype=np.float64)
    for i in range(U):
        nzs = set(X_gt[i].nonzero()[0])
        if len(nzs) == 0:
            continue
        valid_users += 1
        ndcg += ndcg_score(X_gt[[i]], scores[[i]], k=C)
        recommended = scores[i].argsort()[::-1][:C]
        recall_denom = min(C, len(nzs))
        ap = 0.0
        current_hit = 0
        for pos, rec in enumerate(recommended):
            item_appearance_count[rec] += 1.0
            if rec in nzs:
                current_hit += 1
                ap += current_hit / float(pos + 1)
        ap /= recall_denom
        map += ap
        recall += current_hit / recall_denom
        precision += current_hit / C

    entropy = (lambda p: -p.dot(np.log(p)))(
        item_appearance_count / item_appearance_count.sum()
    )
    item_appearance_sorted_normalized = (
        np.sort(item_appearance_count) / item_appearance_count.sum()
    )
    lorentz_curve = np.cumsum(item_appearance_sorted_normalized)
    gini_index = 0
    delta = 1 / I
    for i in range(I):
        f = 2 * (((i + 1) / I) - lorentz_curve[i])
        gini_index += delta * f

    assert my_score["ndcg"] == pytest.approx(ndcg / valid_users)
    assert my_score["map"] == pytest.approx(map / valid_users, abs=1e-8)
    assert my_score["precision"] == pytest.approx(precision / valid_users, abs=1e-8)
    assert my_score["recall"] == pytest.approx(recall / valid_users, abs=1e-8)
    assert my_score["entropy"] == pytest.approx(entropy)
    assert my_score["gini_index"] == pytest.approx(gini_index)
def calculate_ndcg_k(b_hat, c_ui_test, k):
    """
    Calculate NDCG of predicted relevance scores.

    Args:
        b_hat (np.array): Array of predicted relevance scores for user-item combinations.
        c_ui_test (sp.csr_matrix): Sparse user-item matrix with test interactions.
        k (int): Length of the recommended lists.

    Returns:
        float: NDCG score averaged over all users.
    """
    n_users, n_items = c_ui_test.shape
    ndcgs = np.zeros(n_users)
    for u in range(n_users):
        # Predicted relevance scores for this user.
        scores = b_hat[u, :]

        # True relevance is whether or not the user bought the item in the test period.
        true_relevance = np.array(sp.csr_matrix.todense(c_ui_test[u, :])).flatten()
        true_relevance_binary = (true_relevance > 0).astype(float)

        # Reshape to the (n_queries, n_items) layout expected by sklearn.
        scores = np.reshape(scores, (1, len(scores)))
        true_relevance_binary = np.reshape(
            true_relevance_binary, (1, len(true_relevance_binary))
        )

        ndcg_u = ndcg_score(true_relevance_binary, scores, k=k)
        ndcgs[u] = ndcg_u

    ndcg = np.mean(ndcgs)
    return ndcg
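# A small hedged example for calculate_ndcg_k above: 2 users x 4 items with
# made-up predicted scores and a sparse test interaction matrix.
import numpy as np
import scipy.sparse as sp
from sklearn.metrics import ndcg_score

b_hat = np.array([[0.9, 0.1, 0.4, 0.3],
                  [0.2, 0.8, 0.1, 0.7]])
c_ui_test = sp.csr_matrix(np.array([[1, 0, 1, 0],
                                    [0, 1, 0, 0]]))
print(calculate_ndcg_k(b_hat, c_ui_test, k=2))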