def generate_reward(gold_index_list, answer_index_list):
    reward = 0
    ap = 0
    reciprocal_rank = 0
    answer_list = list(answer_index_list)
    size = len(answer_index_list)
    true = sum(gold_index_list > 0)  # number of relevant candidates in the gold list
    inp = np.zeros(size)
    # place each gold relevance grade at the position the answer ranked it
    for rank, val in enumerate(gold_index_list):
        if val and rank in answer_list:
            inp[answer_list.index(rank)] = val
    maxk = sum(inp > 0)  # relevant candidates the answer actually retrieved
    if true:
        ap = average_precision(inp) * (maxk / true)
        reciprocal_rank = mean_reciprocal_rank([inp])
    ndcg = ndcg_at_k(inp, min(10, size))
    dcg_five = dcg_at_k(inp, 5)
    reward = (ap + reciprocal_rank + ndcg + dcg_five) / 4
    ranks = [1, 3, 5, 10]
    reward_tuple = [reward, ap, reciprocal_rank, ndcg, dcg_five]
    for r in ranks:
        reward_tuple.append(precision_at_k(inp, min(r, len(inp))))
    for r in ranks:
        reward_tuple.append(ndcg_at_k(inp, min(r, len(inp))))
    return reward_tuple
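# A minimal usage sketch (not part of the original source). It assumes numpy and
# the rank_metrics-style helpers used above (average_precision, mean_reciprocal_rank,
# ndcg_at_k, dcg_at_k, precision_at_k) are in scope; the example data is illustrative.
if __name__ == "__main__":
    import numpy as np

    gold = np.array([0, 2, 0, 1, 0])  # graded relevance for candidates 0..4
    answer = [1, 3, 0, 2, 4]          # model's ranking of those candidate indices
    scores = generate_reward(gold, answer)
    # scores = [reward, AP, MRR, nDCG@min(10, n), DCG@5, P@{1,3,5,10}, nDCG@{1,3,5,10}]
    print(scores)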
def evalResults(results, trueRelevance, noveltyList, trainModelIDs, rev_dict, uid, alg,
                params, rec, outFile, diversity, novelty):
    params = [str(i) for i in params]
    # calculate rating precision
    mmScaler = MinMaxScaler(copy=True)
    results = mmScaler.fit_transform(results.reshape(-1, 1))
    results = results.reshape((-1, ))
    r2Sc = r2_score(trueRelevance, results)
    mae = mean_absolute_error(trueRelevance, results)
    # calculate ranking scores
    idx = (-results).argsort()
    if diversity == "yes":
        reranked = mmr_sorted(range(len(results)), 0.8, results, rev_dict, 10)
        idx1 = [k for k, v in reranked.items()]
        idx2 = [i for i in idx if i not in idx1]
        idx1.extend(idx2)
        idx = idx1
    rankedRelevance = trueRelevance[idx]
    rankedNovelty = noveltyList[idx]
    # print(rankedRelevance)
    map = rank_metrics.average_precision(rankedRelevance)
    aucSc = roc_auc_score(trueRelevance, results)
    nDCG10 = rank_metrics.ndcg_at_k(rankedRelevance, 10)
    nDCG100 = rank_metrics.ndcg_at_k(rankedRelevance, 100)
    nDCG = rank_metrics.ndcg_at_k(rankedRelevance, len(rankedRelevance))
    p5 = prec_at_n(rankedRelevance, 5)
    r5 = rec_at_n(rankedRelevance, 5)
    n5 = meanNovelty_at_n(rankedNovelty, 5)
    un5 = user_novelty_at_n(idx, trainModelIDs, 5)
    ild5 = ild_at_n(idx, rev_dict, 5)
    p10 = prec_at_n(rankedRelevance, 10)
    r10 = rec_at_n(rankedRelevance, 10)
    n10 = meanNovelty_at_n(rankedNovelty, 10)
    ild10 = ild_at_n(idx, rev_dict, 10)
    un10 = user_novelty_at_n(idx, trainModelIDs, 10)
    mrr = rank_metrics.mean_reciprocal_rank([rankedRelevance])
    # print((uid, alg, ",".join(params), rec, r2Sc, mae, map, aucSc, mrr, p5, p10, r5, r10, nDCG10, nDCG100, nDCG))
    txt = "%s;%s;%s;%s;%s;%s;%.6f;%.6f;%.6f;%.6f;%.6f;%.6f;%.6f;%.6f;%.6f;%.6f;%.6f;%.6f;%.6f;%.6f;%.6f;%.6f;%.6f;%.6f\n" % (
        uid, alg, ",".join(params), rec, diversity, novelty, r2Sc, mae, map, aucSc, mrr,
        p5, p10, r5, r10, nDCG10, nDCG100, nDCG, n5, n10, un5, un10, ild5, ild10)
    outFile.write(txt)
    return (r2Sc, mae, map, aucSc, mrr, p5, p10, r5, r10, nDCG10, nDCG100, nDCG, n5, n10, ild5, ild10)
def generate_reward(gold_index_list, answer_index_list, reward_type):
    reward = 0
    ap = 0.
    reciprocal_rank = 0
    answer_list = list(answer_index_list)
    size = len(answer_index_list)
    true = sum(gold_index_list > 0)
    inp = np.zeros(size)
    for rank, val in enumerate(gold_index_list):
        if val and rank in answer_list:
            inp[answer_list.index(rank)] = val
    maxk = sum(inp > 0)
    if true:
        ap = average_precision(inp) * (maxk / true)
        reciprocal_rank = mean_reciprocal_rank([inp])
    ndcg = ndcg_at_k(inp, min(10, size))
    dcg_five = dcg_at_k(inp, 5)
    reward = rewards[reward_type - 1](inp, ap, reciprocal_rank, ndcg, dcg_five)
    return reward, ap, reciprocal_rank, ndcg, dcg_five
def evaluate_results(qids_rs, Y, k):
    values = defaultdict(list)
    for qid, r in qids_rs:
        gold = harvest(Y, qid)
        gold_topk = gold[argtopk(gold, k)]
        R = np.count_nonzero(gold_topk)
        # real ndcg
        idcg = rm.dcg_at_k(gold_topk, k)
        ndcg = rm.dcg_at_k(r, k) / idcg
        values["ndcg"].append(ndcg)  # Verified
        # MAP@k
        ap = rm.average_precision(r)
        values["MAP"].append(ap)
        # MRR - compute by hand
        ind = np.asarray(r).nonzero()[0]
        mrr = (1. / (ind[0] + 1)) if ind.size else 0.
        values["MRR"].append(mrr)
        # R precision
        # R = min(R, k)  # ok, let's be fair: you can't get more than k.
        # We don't need that anymore, since we chop off the remainder
        # before computing R.
        recall = rm.recall(r, R)
        values["recall"].append(recall)
        # precision = rm.precision_at_k(pad(scored_result, k), k)
        precision = rm.precision(r)
        values["precision"].append(precision)
        f1 = f1_score(precision, recall)
        values["f1_score"].append(f1)
        # Safe variant does not fail if len(r) < k
        p_at_5 = rm.safe_precision_at_k(r, 5)
        values["precision@5"].append(p_at_5)
        p_at_10 = rm.safe_precision_at_k(r, 10)
        values["precision@10"].append(p_at_10)
    return values
def summarize(self):
    """Give summary statistics about the tournament."""
    res = self.run()
    # res = self.results
    # champ should be undefeated
    champ = list(np.where(res.strength == max(res.strength))[0])
    copeland = (res.wins[champ] == self.n_rounds)
    # top-k
    ranks = pd.DataFrame(
        data=np.transpose([
            res.strength.rank(ascending=False),
            res.wins.rank(ascending=False),
            res.wins
        ]),
        columns=["str_rank", "win_rank", "wins"])
    ranks['relevant'] = ranks['str_rank'] <= self.k
    borda = (ranks.win_rank[champ] == ranks.win_rank.min())
    top_k_df = ranks.loc[ranks['str_rank'] <= self.k]
    top_k = sum(top_k_df['wins'] >= self.n_rounds - 2) / self.k
    tau, k_p = scipy.stats.kendalltau(ranks.str_rank, ranks.win_rank)
    rho, sp_p = scipy.stats.spearmanr(ranks.str_rank, ranks.win_rank)
    # sort by tournament outcome so the relevance vector is in rank order;
    # sort_values returns a copy, so the result must be assigned back
    ranks = ranks.sort_values(by="win_rank")
    # using rank_metrics
    rel_vec = ranks.relevant.values
    prec = rank_metrics.r_precision(rel_vec)
    prec_at_k = rank_metrics.precision_at_k(rel_vec, self.k)
    avg_prec = rank_metrics.average_precision(rel_vec)
    dcg = rank_metrics.dcg_at_k(rel_vec, self.k)
    ndcg = rank_metrics.ndcg_at_k(rel_vec, self.k)
    df = pd.DataFrame(
        data=[[
            int(copeland), int(borda), float(top_k), prec, prec_at_k,
            avg_prec, dcg, ndcg, float(tau), float(rho)
        ]],
        columns=[
            'undef_champ', 'top_champ', 'top_k_found', 'precision',
            'precision_at_k', 'avg_prec', 'dcg', 'ndcg', 'tau', 'rho'
        ])
    return df
def generate_reward(gold_index_list, answer_index_list, reward_type=1):
    reward = 0
    ap = 0
    reciprocal_rank = 0
    answer_list = list(deepcopy(answer_index_list))
    size = len(answer_index_list)
    true = sum(gold_index_list)
    inp = np.zeros(size)
    for rank, val in enumerate(gold_index_list):
        if val and rank in answer_list:
            inp[answer_list.index(rank)] = 2
    if true:
        ap = average_precision(inp) * (sum(inp > 0) / true)
        reciprocal_rank = mean_reciprocal_rank([inp])
    # ndcg = ndcg_at_k(inp, size)
    # if reward_type == 1:
    #     reward = (ap + reciprocal_rank) / 2
    # elif reward_type == 2:
    #     reward = dcg_at_k(inp, size)
    rewards = [(ap + reciprocal_rank) / 2, dcg_at_k(inp, size)]
    return rewards[reward_type - 1], ap, reciprocal_rank, (inp[0] > 0)
def compute_metrics(ranked_judgements, pr_atk, threshold_grade):
    """
    Given the ranked judgements compute the metrics for a query.
    :param ranked_judgements: list(int); graded or binary relevances in rank order.
    :param pr_atk: int; the @K value to use for computing precision and recall.
    :param threshold_grade: int; Assuming 0-3 graded relevances, threshold at some
        point and convert graded to binary relevance.
    :return:
    """
    graded_judgements = ranked_judgements
    ranked_judgements = [1 if rel >= threshold_grade else 0 for rel in graded_judgements]
    # Use the full set of candidates, not the pr_atk.
    ndcg = rm.ndcg_at_k(graded_judgements, len(ranked_judgements))
    ndcg_pr = rm.ndcg_at_k(graded_judgements, int(0.20 * len(ranked_judgements)))
    ndcg_20 = rm.ndcg_at_k(graded_judgements, 20)
    max_total_relevant = sum(ranked_judgements)
    recall = recall_at_k(ranked_rel=ranked_judgements, atk=pr_atk,
                         max_total_relevant=max_total_relevant)
    precision = rm.precision_at_k(r=ranked_judgements, k=pr_atk)
    r_precision = rm.r_precision(r=ranked_judgements)
    f1 = 2 * precision * recall / (precision + recall) if (precision + recall) > 0 else 0.0
    av_precision = rm.average_precision(r=ranked_judgements)
    reciprocal_rank = rm.mean_reciprocal_rank(rs=[ranked_judgements])
    metrics = {
        'recall': float(recall),
        'precision': float(precision),
        'f1': float(f1),
        'r_precision': float(r_precision),
        'av_precision': float(av_precision),
        'reciprocal_rank': float(reciprocal_rank),
        'ndcg': ndcg,
        'ndcg@20': ndcg_20,
        'ndcg%20': ndcg_pr
    }
    return metrics
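# Hypothetical call sketch (not from the original source), assuming the rank_metrics
# module is imported as `rm` and the external recall_at_k helper used above is available.
# The judgements are graded 0-3 in rank order; threshold_grade=2 maps grades >= 2 to
# relevant for the binary metrics, while nDCG is computed on the graded values.
example_judgements = [3, 0, 2, 1, 0, 2, 0, 0, 1, 0]
metrics = compute_metrics(ranked_judgements=example_judgements,
                          pr_atk=5, threshold_grade=2)
print(metrics['precision'], metrics['recall'], metrics['ndcg@20'])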
def evaluate(self, ratings: Dict[int, List[int]], negatives: Dict[int, List[int]], topN: int):
    """
    evaluate performance of models
    :param ratings: key: user, value: list of positive items
    :param negatives: key: user, value: list of negative items
    :param topN: int
    :return:
    """
    ndcgs, apks, recalls = [], [], []
    for user in sorted(ratings.keys()):
        pos_items = ratings[user]
        neg_items = negatives[user]
        assert type(pos_items) == list and type(neg_items) == list
        items = neg_items + pos_items
        users = np.full(len(items), user, dtype=np.int64)
        items = np.asarray(items)
        predictions = self.predict(users, items)
        labels = [0.0] * len(neg_items) + [1.0] * len(pos_items)
        labels = np.array(labels)
        # compute metric here
        indices = np.argsort(-predictions)[:topN]  # indices of items with highest scores
        ranklist = labels[indices]
        ndcg = rank_metrics.ndcg_at_k(ranklist, topN)
        _, recall = rank_metrics._compute_precision_recall(ranklist, topN)
        apk = rank_metrics.average_precision(ranklist[:topN])
        ndcgs.append(ndcg)
        apks.append(apk)
        recalls.append(recall)
    results = {}
    results["ndcg"] = np.nanmean(ndcgs)
    results["ndcg_list"] = ndcgs
    results["map"] = np.nanmean(apks)
    results["maps_list"] = apks
    results["recall"] = np.nanmean(recalls)
    results["recalls_list"] = recalls
    return results
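# Illustrative driver (not from the original source): the recommender object `model`
# and the example item ids are assumptions. Each user's held-out positives are ranked
# against sampled negatives, and the top-N list is scored with nDCG, AP, and recall
# as implemented above.
ratings = {0: [101, 205], 1: [42]}                # user -> held-out positive items
negatives = {0: [7, 8, 9, 10], 1: [3, 4, 5, 6]}   # user -> sampled negative items
results = model.evaluate(ratings, negatives, topN=10)
print(results["ndcg"], results["map"], results["recall"])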
# a = a[0:np.int(cutter/10)]
# score = score[0:np.int(cutter/10)]
list1 = next(r)
filter_object = filter(lambda x: x != "", list1)
list1 = list(filter_object)
list1 = np.unique(np.array(list1, dtype=int))
binary = np.isin(a, list1).astype(int)  # 1 where a ranked item is a gold item
den = np.argwhere(binary == 1)
if np.array(den).size < 1:
    mrr = 0
else:
    mrr = (1 / (den[0] + 1))  # reciprocal rank of the first relevant hit
map = rm.average_precision(binary)
# hit@k flags for k in {1, 5, 10, 20, 100}
top1 = 0
top5 = 0
top10 = 0
top20 = 0
top100 = 0
if 1 in binary[:100]:
    top100 = 1
if 1 in binary[:20]:
    top20 = 1
if 1 in binary[:10]:
    top10 = 1
if 1 in binary[:5]:
    top5 = 1
if 1 in binary[:1]:
    top1 = 1
def evaluate(self, X, Y, k=20, verbose=0, replacement=0, n_jobs=1):
    """
    :X: [(qid, str)] query id, query string pairs
    :Y: pandas dataseries with qid,docid index or [dict]
    :k: Limit the result for all metrics to this value, the models are
        also given a hint of how many they should return.
    :replacement: 0 means that (query, doc) pairs not prevalent in Y will
        not be considered relevant, None means that those are not
        considered (skipped).
    """
    # rs = []
    # if n_jobs > 1:
    #     return process_and_evaluate(self, X, Y, k, n_jobs)
    values = defaultdict(list)
    for qid, query in X:
        # execute query
        if verbose > 0:
            print(qid, ":", query)
        t0 = timer()
        # if replacement is None, we need to drop after querying
        result = self.query(query, k=(None if replacement is None else k))
        values["time_per_query"].append(timer() - t0)
        # if verbose > 0:
        #     print(result[:k])
        # result = result[:k]  # TRIM HERE
        # soak the generator
        scored_result = [harvest(Y, qid, docid, replacement) for docid in result]
        if replacement is None:
            scored_result, notfound = filter_none(scored_result)
            values["gold_not_found"].append(notfound)
        if k is not None:
            # don't let the models cheat by returning more than k
            r = scored_result[:k]
        else:
            # if k is None, consider all
            r = scored_result
        # if verbose > 0:
        #     print(r)
        # gold = np.array(list(Y[qid].values()))
        gold = harvest(Y, qid)
        import sys
        # print(gold, file=sys.stderr)
        topk_indices = argtopk(gold, k)
        print(topk_indices, file=sys.stderr)
        gold_topk = gold[topk_indices]
        # print('Top k in gold standard:', gold_topk, file=sys.stderr)
        R = np.count_nonzero(gold_topk)
        if verbose > 0:
            print("Retrieved {} relevant out of {} possible.".format(
                np.count_nonzero(r), R))
        # real ndcg
        idcg = rm.dcg_at_k(gold_topk, k)
        ndcg = rm.dcg_at_k(scored_result, k) / idcg
        values["ndcg"].append(ndcg)  # Verified
        # MAP@k
        ap = rm.average_precision(r)
        values["MAP"].append(ap)
        # MRR - compute by hand
        ind = np.asarray(r).nonzero()[0]
        mrr = (1. / (ind[0] + 1)) if ind.size else 0.
        values["MRR"].append(mrr)
        # R precision
        # R = min(R, k)  # ok, let's be fair: you can't get more than k.
        # We don't need that anymore, since we chop off the remainder
        # before computing R.
        recall = rm.recall(r, R)
        values["recall"].append(recall)
        # precision = rm.precision_at_k(pad(scored_result, k), k)
        precision = rm.precision(r)
        values["precision"].append(precision)
        f1 = f1_score(precision, recall)
        values["f1_score"].append(f1)
        # Safe variant does not fail if len(r) < k
        p_at_5 = rm.safe_precision_at_k(r, 5)
        values["precision@5"].append(p_at_5)
        p_at_10 = rm.safe_precision_at_k(r, 10)
        values["precision@10"].append(p_at_10)
        # rs.append(r)
        if verbose > 0:
            # print("Precision: {:.4f}".format(precision))
            # print("Recall: {:.4f}".format(recall))
            # print("F1-Score: {:.4f}".format(f1))
            print("AP: {:.4f}".format(ap))
            print("RR: {:.4f}".format(mrr))
            print("NDCG: {:.4f}".format(ndcg))
    return values
        # inside the per-entity loop: this branch runs when the entity has
        # candidate entities to rank (the opening `if` lies above this excerpt)
        r = []
        for item in sim_rank:
            r.append(eval_query[ent][item[0]])
        if len(r) > 1:
            tmp_n1 = rm.ndcg_at_k(r, 1, 1)
        else:
            tmp_n1 = rm.ndcg_at_k(r, len(r), 1)
        if len(r) > 5:
            tmp_n5 = rm.ndcg_at_k(r, 5, 1)
        else:
            tmp_n5 = rm.ndcg_at_k(r, len(r), 1)
        if len(r) > 10:
            tmp_n10 = rm.ndcg_at_k(r, 10, 1)
        else:
            tmp_n10 = rm.ndcg_at_k(r, len(r), 1)
        tmp_ap = rm.average_precision(r)
        ndcg1_sum += tmp_n1
        ndcg5_sum += tmp_n5
        ndcg10_sum += tmp_n10
        map_sum += tmp_ap
        can_count += tmp_can_count
    else:
        ent_skip_count += 1

act_ent_count = len(eval_query) - ent_skip_count
with codecs.open(log_file, 'a', encoding='UTF-8') as fout_log:
    fout_log.write("**********************************\n")
    fout_log.write("eval %d(%d) entities with %d(%d) candidate entities for %s!\n" %
                   (act_ent_count, len(eval_query), can_count / act_ent_count,
                    relatedness_pair_num / len(eval_query), entity_vec_file))
    fout_log.write("ndcg1 : %f, ndcg5 : %f, ndcg10 : %f, map : %f\n" %
                   (float(ndcg1_sum / act_ent_count), float(ndcg5_sum / act_ent_count),
                    float(ndcg10_sum / act_ent_count), float(map_sum / act_ent_count)))
    fout_log.write("**********************************\n")
# scr[i, :] = 0.5 * (np.maximum(scr1, 0) ** 0.5 + np.maximum(scr2, 0) ** 0.5)
# scr[i, :] = 0.5 * (np.maximum(scr1, 0) + np.maximum(scr2, 0))
scr[i, :] = 0.5 * (scr1 + scr2)

# --------------------------------------------------------------------------
print("computing tag-centric scores ..")
ap = []
for i, tag in enumerate(tags):
    # rank images
    idxs = np.argsort(scr[:, i])[::-1]
    # compute AP(tag)
    relevant = [im for im in tag2im[tag] if im in images_test]
    r = [int(images_test[j] in relevant) for j in idxs]
    ap.append(average_precision(r))
    print("  {} {:.2f}".format(tag, 100 * ap[-1]))
print("done")

# --------------------------------------------------------------------------
print("computing image-centric scores ..")
iap = []
for i, im in enumerate(images_test):
    # rank tags
    idxs = np.argsort(scr[i, :])[::-1]
    # compute AP(image)
    relevant = list(im2tag[im])