def test_ndcg_cut(self):
    qrel = {
        'q1': {
            'd1': 0,
            'd2': 1,
            'd3': 0,
        },
        'q2': {
            'd2': 1,
            'd3': 1,
        },
    }

    evaluator = pytrec_eval.RelevanceEvaluator(qrel, {'ndcg_cut.3'})
    self.assertAlmostEqual(
        evaluator.evaluate({
            'q1': {
                'd1': 1.0,
                'd2': 0.0,  # rank 3
                'd3': 1.5,
            },
            'q2': {},
        })['q1']['ndcg_cut_3'],
        0.5)

    evaluator = pytrec_eval.RelevanceEvaluator(qrel, {'ndcg_cut.1'})
    self.assertAlmostEqual(
        evaluator.evaluate({
            'q1': {
                'd1': 1.0,
                'd2': 2.0,  # rank 1
                'd3': 1.5,
            },
            'q2': {},
        })['q1']['ndcg_cut_1'],
        1.0)

    evaluator = pytrec_eval.RelevanceEvaluator(qrel, {'ndcg_cut.1,2,3,1000'})
    result = evaluator.evaluate({
        'q1': {
            'd1': 1.0,
            'd2': 0.0,  # rank 3
            'd3': 1.5,
        },
        'q2': {},
    })['q1']
    self.assertAlmostEqual(result['ndcg_cut_3'], 0.5)
    self.assertAlmostEqual(result['ndcg_cut_2'], 0.0)
    self.assertAlmostEqual(result['ndcg_cut_1'], 0.0)
    self.assertAlmostEqual(result['ndcg_cut_1000'], 0.5)
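# Illustrative sketch (not part of the test suite above): pytrec_eval expands a
# parameterised measure such as 'ndcg_cut.5,10' into one result key per cutoff
# ('ndcg_cut_5', 'ndcg_cut_10'). The tiny qrel/run below are invented for the demo.
import pytrec_eval

demo_qrel = {'q1': {'d1': 1, 'd2': 0}}
demo_run = {'q1': {'d1': 0.9, 'd2': 0.8}}

demo_evaluator = pytrec_eval.RelevanceEvaluator(demo_qrel, {'ndcg_cut.5,10'})
print(sorted(demo_evaluator.evaluate(demo_run)['q1'].keys()))
# expected: ['ndcg_cut_10', 'ndcg_cut_5']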
def benchmark(model, model_name, docs, idx2key):
    qrels, queries = read_ap.read_qrels()

    overall_ser = {}

    # Adapted from the TF-IDF benchmark test
    print("Running GENSIM Benchmark")

    # collect results
    for qid in tqdm(qrels):
        query_text = queries[qid]
        results = rank(model, docs, query_text)
        # print(results)
        overall_ser[qid] = dict([(idx2key[idx], score) for idx, score in results])

    print(overall_ser)
    # print(overall_ser[100])

    evaluator = pytrec_eval.RelevanceEvaluator(qrels, {'map', 'ndcg'})
    metrics = evaluator.evaluate(overall_ser)

    json_filename = f"./json_files/benchmark_{model_name}.json"
    # dump to JSON
    with open(json_filename, "w") as writer:
        json.dump(metrics, writer, indent=1)

    return json_filename
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('qrel')
    parser.add_argument('run')
    parser.add_argument('measure')

    args = parser.parse_args()

    assert os.path.exists(args.qrel)
    assert os.path.exists(args.run)

    with open(args.qrel, 'r') as f_qrel:
        qrel = pytrec_eval.parse_qrel(f_qrel)

    with open(args.run, 'r') as f_run:
        run = pytrec_eval.parse_run(f_run)

    evaluator = pytrec_eval.RelevanceEvaluator(qrel, {args.measure})

    results = evaluator.evaluate(run)

    def print_line(measure, scope, value):
        # scope = query_id = topic_id
        print('{:25s}{:8s}{:.22f}'.format(measure, scope, value))

    # collect the per-query values of the requested measure (note: the label
    # below says "nDCG", but this averages whichever measure was requested)
    avg_DCG = []
    for query_id, query_measures in results.items():
        for measure, value in sorted(query_measures.items()):
            avg_DCG.append(value)
            print_line(measure, query_id, value)

    print(avg_DCG)
    print(mean(avg_DCG))
    print(' avg of nDCG {:f}'.format(mean(avg_DCG)))
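# Hedged sketch of the file formats the CLI above expects: parse_qrel reads TREC
# qrels lines ("<qid> <iteration> <docno> <relevance>") and parse_run reads TREC
# run lines ("<qid> Q0 <docno> <rank> <score> <tag>"). The in-memory "files" and
# their contents below are invented for illustration only.
import io
import pytrec_eval

qrel_text = "q1 0 d1 1\nq1 0 d2 0\n"
run_text = "q1 Q0 d1 1 12.5 demo\nq1 Q0 d2 2 11.0 demo\n"

demo_qrel = pytrec_eval.parse_qrel(io.StringIO(qrel_text))
demo_run = pytrec_eval.parse_run(io.StringIO(run_text))

demo_evaluator = pytrec_eval.RelevanceEvaluator(demo_qrel, {'map'})
print(demo_evaluator.evaluate(demo_run))  # expected: {'q1': {'map': 1.0}}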
def get_metric(self, qrels: str, trec: str, metric: str = 'ndcg_cut_10',
               split: dict = None, split_idx: int = -1) -> float:
    with open(qrels, 'r') as f_qrel:
        qrel = pytrec_eval.parse_qrel(f_qrel)
    with open(trec, 'r') as f_run:
        run = pytrec_eval.parse_run(f_run)

    # partial evaluation: keep only the queries assigned to this split
    if split is not None and split_idx >= 0:
        for qid in copy.deepcopy(run):
            if qid not in split[split_idx]:
                _ = run.pop(qid)

    evaluator = pytrec_eval.RelevanceEvaluator(qrel, pytrec_eval.supported_measures)
    results = evaluator.evaluate(run)

    # grab the measure names from the last query's results ...
    for query_id, query_measures in sorted(results.items()):
        pass

    # ... then aggregate each measure over all queries
    mes = {}
    for measure in sorted(query_measures.keys()):
        mes[measure] = pytrec_eval.compute_aggregated_measure(
            measure,
            [query_measures[measure] for query_measures in results.values()])

    return mes[metric]
def crossvalidated_interpolation(dev, test, metric):
    """Return an interpolated ranking."""
    # TODO refactor out (shared with crossvalidated_ranking)
    valid_metrics = {"P", "map", "map_cut", "ndcg_cut", "Rprec", "recip_rank"}
    cut_points = [5, 10, 15, 20, 30, 100, 200, 500, 1000]
    # the metrics we expect pytrec_eval to output (after expanding _cut)
    expected_metrics = {m for m in valid_metrics if not m.endswith("_cut") and m != "P"} | {
        m + "_" + str(cutoff)
        for cutoff in cut_points
        for m in valid_metrics
        if m.endswith("_cut") or m == "P"
    }

    if metric in ["ndcg", "ndcg_cut"]:
        mkey = "ndcg_cut_20"
    elif metric in expected_metrics:
        mkey = metric
    else:
        raise RuntimeError(
            "requested metric %s is not one of the supported metrics: %s"
            % (metric, sorted(expected_metrics)))

    avg_metric = lambda run_metrics: np.mean([qid[mkey] for qid in run_metrics.values()])

    assert len(set(dev["qrels"].keys()).intersection(test["qrels"].keys())) == 0

    dev_eval = pytrec_eval.RelevanceEvaluator(dev["qrels"], valid_metrics)
    best_metric, best_alpha = -np.inf, None
    for alpha in np.arange(0, 1.001, 0.05):
        run_metrics = dev_eval.evaluate(
            Searcher.interpolate_runs(dev["reranker"], dev["searcher"], dev["qrels"].keys(), alpha)
        )
        mavgp = avg_metric(run_metrics)
        if mavgp > best_metric:
            best_metric = mavgp
            best_alpha = alpha

    test_run = Searcher.interpolate_runs(test["reranker"], test["searcher"], test["qrels"].keys(), best_alpha)
    dev_run = Searcher.interpolate_runs(dev["reranker"], dev["searcher"], dev["qrels"].keys(), best_alpha)
    return (best_alpha, test_run, dev_run)
def evaluate(eval_path, qrel_path, res_path):
    measures = {"map", "ndcg_cut", "recall", "P"}

    with open(qrel_path, 'r') as f_qrel:
        qrel = pytrec_eval.parse_qrel(f_qrel)
    evaluator = pytrec_eval.RelevanceEvaluator(qrel, measures)

    with open(res_path, 'r') as f_run:
        run = pytrec_eval.parse_run(f_run)
    all_metrics = evaluator.evaluate(run)

    metrics = {
        'P_5': 0,
        'P_10': 0,
        'P_20': 0,
        'ndcg_cut_5': 0,
        'ndcg_cut_10': 0,
        'ndcg_cut_20': 0,
        'ndcg_cut_100': 0,
        'map': 0,
        'recall_100': 0,
    }

    nb_queries = len(all_metrics)
    for key, values in all_metrics.items():
        for metric in metrics:
            metrics[metric] += values[metric] / nb_queries

    with open(eval_path, 'w') as f:
        json.dump(metrics, f)
def summary(self):
    qrel = {}
    run = {}
    assert len(self.qrel_list) == len(self.run_list)
    for i in range(len(self.qrel_list)):
        assert len(self.qrel_list[i]) == len(self.run_list[i])
        qid = 'q{}'.format(i + 1)
        qrel[qid] = {}
        run[qid] = {}
        for j in range(len(self.run_list[i])):
            did = 'd{}'.format(j + 1)
            qrel[qid][did] = int(self.qrel_list[i][j])
            run[qid][did] = float(self.run_list[i][j])

    evaluator = pytrec_eval.RelevanceEvaluator(
        qrel, {'map', 'map_cut', 'ndcg', 'ndcg_cut', 'recall'})
    trec = evaluator.evaluate(run)

    results = {
        'mrr': np.mean(self.rr_list),
        'mrr10': np.mean(self.rr10_list),
        'err': np.mean(self.err_list),
        'err20': np.mean(self.err20_list),
        'map': np.mean([trec[d]['map'] for d in trec]),
        'map10': np.mean([trec[d]['map_cut_10'] for d in trec]),
        'map20': np.mean([trec[d]['map_cut_20'] for d in trec]),
        'ndcg': np.mean([trec[d]['ndcg'] for d in trec]),
        'ndcg10': np.mean([trec[d]['ndcg_cut_10'] for d in trec]),
        'ndcg20': np.mean([trec[d]['ndcg_cut_20'] for d in trec]),
        'recall100': np.mean([trec[d]['recall_100'] for d in trec]),
    }
    return results
def evaluate(res, qrels, metrics=['map', 'ndcg'], perquery=False):
    """
    Evaluate the result dataframe with the given qrels

    Args:
        res: Either a dataframe with columns=['qid', 'docno', 'score'] or a dict {qid: {docno: score, }, }
        qrels: Either a dataframe with columns=['qid', 'docno', 'label'] or a dict {qid: {docno: label, }, }
        metrics(list): A list of strings specifying which evaluation metrics to use. Default=['map', 'ndcg']
        perquery(bool): If true, return each metric for each query; else return mean metrics. Default=False
    """
    if isinstance(res, pd.DataFrame):
        batch_retrieve_results_dict = Utils.convert_res_to_dict(res)
    else:
        batch_retrieve_results_dict = res

    if isinstance(qrels, pd.DataFrame):
        qrels_dic = Utils.convert_qrels_to_dict(qrels)
    else:
        qrels_dic = qrels

    evaluator = pytrec_eval.RelevanceEvaluator(qrels_dic, set(metrics))
    result = evaluator.evaluate(batch_retrieve_results_dict)

    if perquery:
        return result

    measures_sum = {}
    mean_dict = {}
    for val in result.values():
        for measure, measure_val in val.items():
            measures_sum[measure] = measures_sum.get(measure, 0.0) + measure_val
    for measure, value in measures_sum.items():
        mean_dict[measure] = value / len(result.values())
    return mean_dict
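# Hedged usage sketch for the dict form of the arguments described in the
# docstring above (the qids/docnos and scores below are made up): per-query
# scores come back as {qid: {measure: value}}, which the function then averages.
import pytrec_eval

demo_qrels = {'1': {'docA': 1, 'docB': 0}, '2': {'docC': 1}}
demo_res = {'1': {'docA': 2.0, 'docB': 1.0}, '2': {'docC': 0.5, 'docD': 0.4}}

demo_evaluator = pytrec_eval.RelevanceEvaluator(demo_qrels, {'map', 'ndcg'})
per_query = demo_evaluator.evaluate(demo_res)
mean_map = sum(v['map'] for v in per_query.values()) / len(per_query)
print(per_query, mean_map)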
def evaluate_doc2vec(doc2vec_model, description, test_subset=False):
    qrels, queries = read_ap.read_qrels()

    if test_subset:
        queries = {
            qid: q
            for qid, q in queries.items()
            if int(qid) < 101 and int(qid) > 75
        }

    overall_ser = {}

    # collect results
    for qid in queries:
        results = rank_query_given_document(queries[qid], doc2vec_model)
        overall_ser[qid] = dict(results)
        if int(qid) not in np.arange(76, 101):
            evaluate.write_trec_results(qid, results, f"./doc2vec/results/")

    evaluator = pytrec_eval.RelevanceEvaluator(qrels, {'map', 'ndcg'})
    metrics = evaluator.evaluate(overall_ser)

    if not test_subset:
        with open(f"./doc2vec/results/doc2vec_{description}.json", "w") as writer:
            json.dump(metrics, writer, indent=1)

    return metrics
def qrel_metrics(qrel_file, run_file, metrics=('ndcg', 'map')):
    """Get metrics (ndcg and map by default) for a run compared to a qrel file.

    Arguments:
        qrel_file -- qrel file with ground truth data
        run_file -- predictions from the run
        metrics -- which metrics to evaluate on, can use any valid metrics
                   that the trec_eval tool accepts

    Returns:
        metric_values -- dictionary of metric values (out of 100), rounded to two decimal places
    """
    with open(qrel_file, 'r') as f_qrel:
        qrel = pytrec_eval.parse_qrel(f_qrel)
    with open(run_file, 'r') as f_run:
        run = pytrec_eval.parse_run(f_run)

    evaluator = pytrec_eval.RelevanceEvaluator(qrel, set(metrics))
    results = evaluator.evaluate(run)

    metric_values = {}
    for measure in sorted(metrics):
        res = pytrec_eval.compute_aggregated_measure(
            measure,
            [query_measures[measure] for query_measures in results.values()]
        )
        metric_values[measure] = np.round(100 * res, 2)
    return metric_values
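# Hedged sketch of the aggregation step used above: for measures such as map or
# ndcg, pytrec_eval.compute_aggregated_measure reduces the per-query values to
# their arithmetic mean, so it should agree with numpy's mean here. The values
# below are invented.
import numpy as np
import pytrec_eval

per_query_ndcg = [0.2, 0.5, 0.8]
agg = pytrec_eval.compute_aggregated_measure('ndcg', per_query_ndcg)
assert abs(agg - np.mean(per_query_ndcg)) < 1e-9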
def test_measure_params(self):
    qrel = {
        'q1': {
            'd1': 0,
            'd2': 1,
            'd3': 0,
        },
        'q2': {
            'd2': 1,
            'd3': 1,
        },
    }

    run = {
        'q1': {
            'd1': 1.0,
            'd2': 0.0,
            'd3': 1.5,
        },
        'q2': {},
    }  # empty run

    evaluator = pytrec_eval.RelevanceEvaluator(qrel, [
        'ndcg_cut', 'ndcg_cut.1,4', 'ndcg_cut_20,4', 'ndcg_cut_15',
        'recall.1000', 'P'
    ])

    self.assertEqual(
        set(evaluator.evaluate(run)['q1'].keys()),
        {
            'ndcg_cut_1', 'ndcg_cut_4', 'ndcg_cut_15', 'ndcg_cut_20',
            'recall_1000',
            'P_200', 'P_15', 'P_10', 'P_5', 'P_30', 'P_100', 'P_20', 'P_500', 'P_1000',
        })
def compute_metrics(dictionary, model, index, corpus_type, num_topics, doc_ids):
    """
    Compute MAP and nDCG scores and save to json file.
    """
    metric_path = ("./LSI_results/LSI_{}_and_{}_topics.json".format(
        corpus_type, num_topics))

    # check whether metrics for corpus type and num_topics were already generated
    if not os.path.exists(metric_path):
        # Get the ranking of documents for every query and compute the MAP and NDCG score.
        qrels, queries = ra.read_qrels()
        overall_ser = {}

        # ranking per query
        for qid in tqdm(qrels):
            query = queries[qid]
            ranking = query_similarity(query, dictionary, model, index, doc_ids)
            overall_ser[qid] = ranking

        # Compute model evaluation scores per query
        evaluator = pytrec_eval.RelevanceEvaluator(qrels, {'map', 'ndcg'})
        metrics = evaluator.evaluate(overall_ser)

        with open(metric_path, "w") as writer:
            json.dump(metrics, writer, indent=1)
    else:
        print('metrics for LSI_{} with {} topics were already computed'.format(
            corpus_type, num_topics))
def EvalDevQuery(args, query_embedding2id, passage_embedding2id,
                 dev_query_positive_id, I_nearest_neighbor):
    # prediction[qid][docid] = docscore; here we use -rank as the score, so the
    # higher the rank (1 > 2), the higher the score (-1 > -2)
    prediction = {}

    for query_idx in range(I_nearest_neighbor.shape[0]):
        query_id = query_embedding2id[query_idx]
        prediction[query_id] = {}

        top_ann_pid = I_nearest_neighbor[query_idx, :].copy()
        selected_ann_idx = top_ann_pid[:50]
        rank = 0
        for idx in selected_ann_idx:
            pred_pid = passage_embedding2id[idx]
            rank += 1
            prediction[query_id][pred_pid] = -rank

    # use out-of-the-box evaluation script
    evaluator = pytrec_eval.RelevanceEvaluator(
        convert_to_string_id(dev_query_positive_id), {'map_cut', 'ndcg_cut'})

    eval_query_cnt = 0
    result = evaluator.evaluate(convert_to_string_id(prediction))

    ndcg = 0
    for k in result.keys():
        eval_query_cnt += 1
        ndcg += result[k]["ndcg_cut_10"]

    final_ndcg = ndcg / eval_query_cnt
    print("Rank:" + str(args.rank) + " --- ANN NDCG@10:" + str(final_ndcg))

    return final_ndcg, eval_query_cnt
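# Hedged sketch of why storing -rank as the "score" works in the function above:
# pytrec_eval ranks documents by descending score, so any order-preserving
# transform of the scores (raw similarity vs. -rank) should give the same nDCG.
# The toy qrel and runs below are invented.
import pytrec_eval

toy_qrel = {'q1': {'d1': 1, 'd2': 0, 'd3': 1}}
run_scores = {'q1': {'d1': 12.3, 'd2': 7.1, 'd3': 5.9}}
run_neg_rank = {'q1': {'d1': -1, 'd2': -2, 'd3': -3}}

toy_evaluator = pytrec_eval.RelevanceEvaluator(toy_qrel, {'ndcg_cut.10'})
a = toy_evaluator.evaluate(run_scores)['q1']['ndcg_cut_10']
b = toy_evaluator.evaluate(run_neg_rank)['q1']['ndcg_cut_10']
assert abs(a - b) < 1e-9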
def evaluate_models(results):
    """
    Calculate METRICS for each model in the results dict
    ----
    Input example:

        results = {
            'model_1': {
                'preds': [[1, 2], [1, 2]],
                'labels': [[1, 2], [1, 2]]
            }
        }
    """
    for model in results.keys():
        preds = results[model]['preds']
        labels = results[model]['labels']
        run = {}
        qrel = {}
        for i, p in enumerate(preds):
            run['q{}'.format(i + 1)] = {}
            qrel['q{}'.format(i + 1)] = {}
            for j in range(len(p)):
                run['q{}'.format(i + 1)]['d{}'.format(j + 1)] = float(preds[i][j])
                qrel['q{}'.format(i + 1)]['d{}'.format(j + 1)] = int(labels[i][j])

        evaluator = pytrec_eval.RelevanceEvaluator(qrel, METRICS)
        results[model]['eval'] = evaluator.evaluate(run)
    return results
def calc_metrics(self, qrels_dict, run_dict, metrics, verbose=False):
    # group the requested metrics by (relevance_level, gain mapping) so that
    # each group can be computed with a single pytrec_eval evaluator
    rel_args = {}
    for metric in metrics:
        for exp in PTE_METRIC_MAP:
            match = re.match(exp, str(metric))
            if match:
                params = match.groupdict()
                rel, gains = int(params.get('rel') or '1'), params.get('gain')
                rel_args.setdefault((rel, gains), {})
                metric_name = PTE_METRIC_MAP[exp].format(**params)
                rel_args[rel, gains][metric_name] = str(metric)
                break

    result = {}
    for (rel, gains), measures in rel_args.items():
        these_qrels = self._apply_gains(qrels_dict, gains)
        evaluator = pytrec_eval.RelevanceEvaluator(these_qrels, measures.keys(), relevance_level=rel)
        pte_results = evaluator.evaluate(run_dict)
        # translate and filter this to the output format
        for pte_name, onir_name in measures.items():
            result[onir_name] = {}
            for qid in pte_results:
                result[onir_name][qid] = pte_results[qid][pte_name]
    return result
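# Hedged sketch of the relevance_level argument used above (toy data invented):
# with the default relevance_level=1, a judgment of 1 already counts as relevant,
# while relevance_level=2 should treat only judgments >= 2 as relevant for binary
# measures such as recip_rank.
import pytrec_eval

toy_qrel = {'q1': {'d1': 1, 'd2': 2}}
toy_run = {'q1': {'d1': 2.0, 'd2': 1.0}}  # d1 ranked first, d2 second

lenient = pytrec_eval.RelevanceEvaluator(toy_qrel, {'recip_rank'}, relevance_level=1)
strict = pytrec_eval.RelevanceEvaluator(toy_qrel, {'recip_rank'}, relevance_level=2)
print(lenient.evaluate(toy_run)['q1']['recip_rank'])  # expected 1.0 (d1 counts as relevant)
print(strict.evaluate(toy_run)['q1']['recip_rank'])   # expected 0.5 (only d2 counts)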
def evaluate_queries(self, qrels, queries, save_path):
    save_path = os.path.join(self.ARGS.save_dir, save_path)
    overall_ser = {}  # TODO: save necessary info for result file => trec_results = []

    doc_ids, doc_embeddings = self.build_doc_embeddings()

    print(f"Running Word2Vec Evaluation, ww-size: {self.ARGS.ww_size}, vocab-size: {len(self.vocab['id2token'])}")
    for qid in tqdm(qrels):
        query_text = queries[qid]
        results = self.match_query_against_docs(query_text, doc_ids, doc_embeddings)
        overall_ser[qid] = dict(results)
        if int(qid) not in np.arange(76, 101):
            evaluate.write_trec_results(qid, results, save_path)

    evaluator = pytrec_eval.RelevanceEvaluator(qrels, {'map', 'ndcg'})
    metrics = evaluator.evaluate(overall_ser)
    evaluate.calc_mean_metrics(metrics)

    # dump this to JSON - *not* optional - this is submitted in the assignment!
    with open(os.path.join(save_path, "word2vec_metrics.json"), "w") as writer:
        json.dump(metrics, writer, indent=1)
def cal_ndcg(qrels, trec, k):
    with open(qrels, 'r') as f_qrel:
        qrel = pytrec_eval.parse_qrel(f_qrel)
    with open(trec, 'r') as f_run:
        run = pytrec_eval.parse_run(f_run)

    evaluator = pytrec_eval.RelevanceEvaluator(qrel, pytrec_eval.supported_measures)
    results = evaluator.evaluate(run)

    # grab the measure names from the last query's results, then aggregate
    # each measure over all queries
    for query_id, query_measures in sorted(results.items()):
        pass

    mes = {}
    for measure in sorted(query_measures.keys()):
        mes[measure] = pytrec_eval.compute_aggregated_measure(
            measure,
            [query_measures[measure] for query_measures in results.values()])

    metric = 'ndcg_cut_%d' % k
    if metric not in mes:
        print('Depth of NDCG not available.')
        exit()

    ndcg = mes[metric]
    return ndcg
def evaluate_retrieval(ground_truth, run, eval_missing_truth):
    print("Evaluate: Passage Retrieval")
    result = {}
    retrieval_run = get_retrieval_run(run)
    retrieval_ground_truth_for_type = get_retrieval_ground_truth(ground_truth, eval_missing_truth)
    retrieval_run_for_type = {
        turn_id: passages
        for (turn_id, passages) in retrieval_run.items()
        if turn_id in retrieval_ground_truth_for_type
    }
    if retrieval_run_for_type:  # at least one turn for this type => evaluate
        metric = pytrec_eval.RelevanceEvaluator(retrieval_ground_truth_for_type, {'recip_rank'})
        mrrs = [
            score["recip_rank"]
            for score in metric.evaluate(retrieval_run_for_type).values()
        ]
        average_mrr = sum(mrrs) / len(mrrs)
        result["MRR"] = average_mrr
        print("    used retrieved passages for %d questions" % len(retrieval_run_for_type))
    else:
        print("    skipped for no retrieved passages")
    return result
def evaluate(res, qrels, metrics=['map', 'ndcg'], perquery=False):
    """
    Evaluate the result dataframe with the given qrels

    Args:
        res: Either a dataframe with columns=['qid', 'docno', 'score'] or a dict {qid: {docno: score, }, }
        qrels: Either a dataframe with columns=['qid', 'docno', 'label'] or a dict {qid: {docno: label, }, }
        metrics(list): A list of strings specifying which evaluation metrics to use. Default=['map', 'ndcg']
        perquery(bool): If true, return each metric for each query; else return mean metrics. Default=False
    """
    from .io import coerce_dataframe
    if not isinstance(res, dict):
        res = coerce_dataframe(res)

    if isinstance(res, pd.DataFrame):
        batch_retrieve_results_dict = Utils.convert_res_to_dict(res)
    else:
        batch_retrieve_results_dict = res

    if isinstance(qrels, pd.DataFrame):
        qrels_dic = Utils.convert_qrels_to_dict(qrels)
    else:
        qrels_dic = qrels

    if len(batch_retrieve_results_dict) == 0:
        raise ValueError("No results for evaluation")

    req_metrics = set()
    cutdown = False
    for m in metrics:
        if m.startswith("ndcg_cut_"):
            req_metrics.add("ndcg_cut")
            cutdown = True
        elif m.startswith("P_"):
            req_metrics.add("P")
            cutdown = True
        else:
            req_metrics.add(m)

    evaluator = pytrec_eval.RelevanceEvaluator(qrels_dic, req_metrics)
    result = evaluator.evaluate(batch_retrieve_results_dict)

    if perquery:
        if not cutdown:
            return result
        # the user wanted metrics like ndcg_cut_5, but we had to request ndcg_cut,
        # so cut out the metrics they didn't want
        # get any arbitrary query
        q = next(iter(result.keys()))
        todel = []
        for m in result[q]:
            if m not in metrics:
                todel.append(m)
        for q in result:
            for m in todel:
                del result[q][m]
        return result

    means = Utils.mean_of_measures(result)
    if cutdown:
        means = {m: means[m] for m in metrics}
    return means
def evaluate(res, qrels, metrics=['map', 'ndcg'], perquery=False):
    """
    Evaluate the result dataframe with the given qrels

    Args:
        res: Either a dataframe with columns=['qid', 'docno', 'score'] or a dict {qid: {docno: score, }, }
        qrels: Either a dataframe with columns=['qid', 'docno', 'label'] or a dict {qid: {docno: label, }, }
        metrics(list): A list of strings specifying which evaluation metrics to use. Default=['map', 'ndcg']
        perquery(bool): If true, return each metric for each query; else return mean metrics. Default=False
    """
    def now():
        from datetime import datetime
        return datetime.now().strftime("%H:%M:%S.%f")

    # print(now() + " evaluate started")
    if isinstance(res, pd.DataFrame):
        batch_retrieve_results_dict = Utils.convert_res_to_dict(res)
    else:
        batch_retrieve_results_dict = res
    # print(now() + " res ready")

    if isinstance(qrels, pd.DataFrame):
        qrels_dic = Utils.convert_qrels_to_dict(qrels)
    else:
        qrels_dic = qrels
    # print(now() + " qrels ready")

    req_metrics = set()
    cutdown = False
    for m in metrics:
        if m.startswith("ndcg_cut_"):
            req_metrics.add("ndcg_cut")
            cutdown = True
        # elif m.startswith("P_"):
        #     req_metrics.add("P")
        else:
            req_metrics.add(m)

    evaluator = pytrec_eval.RelevanceEvaluator(qrels_dic, req_metrics)
    # print(now() + " evaluating")
    result = evaluator.evaluate(batch_retrieve_results_dict)
    # print(now() + " evaluation done")

    if perquery:
        if not cutdown:
            return result
        # the user wanted metrics like ndcg_cut_5, but we had to request ndcg_cut,
        # so cut out the metrics they didn't want
        # get any arbitrary query
        q = next(iter(result.keys()))
        todel = []
        for m in result[q]:
            if m not in metrics:
                todel.append(m)
        for q in result:
            for m in todel:
                del result[q][m]
        return result

    return Utils.mean_of_measures(result, metrics)
def setup_evaluator_from_relevance_file(qrel_path, measures={"map", "ndcg_cut", "recall", "P"}):
    with open(qrel_path, 'r') as f_qrel:
        qrel = pytrec_eval.parse_qrel(f_qrel)
    return pytrec_eval.RelevanceEvaluator(qrel, measures)
def validate(model, dataset, run, valid_qrels, epoch):
    run_scores = run_model(model, dataset, run)

    # request the "P" family when a specific precision cutoff (e.g. P_20) is
    # asked for, then read that cutoff from the per-query results below
    metric = VALIDATION_METRIC
    if metric.startswith("P_"):
        metric = "P"

    trec_eval = pytrec_eval.RelevanceEvaluator(valid_qrels, {metric})
    eval_scores = trec_eval.evaluate(run_scores)
    return mean([d[VALIDATION_METRIC] for d in eval_scores.values()])
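# Hedged sketch of the "P_" handling above (toy data invented): requesting the
# "P" family yields precision at trec_eval's standard cutoffs (P_5, P_10, ...),
# and the specific key named by VALIDATION_METRIC (e.g. "P_20") is then read
# from each query's result dict.
import pytrec_eval

toy_qrel = {'q1': {'d1': 1, 'd2': 0}}
toy_run = {'q1': {'d1': 1.0, 'd2': 0.5}}

toy_evaluator = pytrec_eval.RelevanceEvaluator(toy_qrel, {'P'})
per_query = toy_evaluator.evaluate(toy_run)['q1']
print(sorted(k for k in per_query if k.startswith('P_')))
# e.g. ['P_10', 'P_100', 'P_1000', 'P_15', 'P_20', 'P_200', 'P_30', 'P_5', 'P_500']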
def evaluate(qrel_file, run_file, out_path, measures=("map_cut", "map", "ndcg", "ndcg_cut", "P")):
    evaluator = pytrec_eval.RelevanceEvaluator(qrel_file, set(measures))
    metrics = evaluator.evaluate(run_file)
    out = agg_metrics_queries(metrics)

    with open(out_path, "w") as handler:
        json.dump(out, handler)
    return out
def sig_test_from_runs(qrels, runs1, runs2, metric="map"):
    if set(runs1) != set(runs2):
        raise ValueError("Expect same keys from two run objects.")

    evaluator = pytrec_eval.RelevanceEvaluator(qrels, {metric})
    scores1 = _calc_scores(runs1, metric=metric, evaluator=evaluator)
    scores2 = _calc_scores(runs2, metric=metric, evaluator=evaluator)
    t, p = stats.ttest_rel(scores1, scores2)
    return t, p
def compute_mrr_last(D, I, qids, ref_dict, dev_query_positive_id):
    knn_pkl = {"D": D, "I": I}
    all_knn_list = all_gather(knn_pkl)
    mrr = 0.0
    final_recall = 0.0
    if is_first_worker():
        prediction = {}
        D_merged = concat_key(all_knn_list, "D", axis=1)
        I_merged = concat_key(all_knn_list, "I", axis=1)
        print(D_merged.shape, I_merged.shape)

        # we pad with negative pids and distance -128 - if they make it to the top we have a problem
        idx = np.argsort(D_merged, axis=1)[:, ::-1][:, :1000]
        sorted_I = np.take_along_axis(I_merged, idx, axis=1)

        candidate_dict = {}
        for i, qid in enumerate(qids):
            seen_pids = set()
            if qid not in candidate_dict:
                prediction[qid] = {}
                candidate_dict[qid] = [0] * 1000
            j = 0
            for pid in sorted_I[i]:
                if pid >= 0 and pid not in seen_pids:
                    candidate_dict[qid][j] = pid
                    prediction[qid][pid] = -(j + 1)  # -rank
                    j += 1
                    seen_pids.add(pid)

        allowed, message = quality_checks_qids(ref_dict, candidate_dict)
        if message != '':
            print(message)

        mrr_metrics = compute_metrics(ref_dict, candidate_dict)
        mrr = mrr_metrics["MRR @10"]
        print(mrr)

        evaluator = pytrec_eval.RelevanceEvaluator(
            convert_to_string_id(dev_query_positive_id), {'recall'})

        eval_query_cnt = 0
        recall = 0
        topN = 1000

        result = evaluator.evaluate(convert_to_string_id(prediction))
        for k in result.keys():
            eval_query_cnt += 1
            recall += result[k]["recall_" + str(topN)]

        final_recall = recall / eval_query_cnt
        print('final_recall: ', final_recall)

    return mrr, final_recall
def __init__(self, q_rels, save_dir=None, save_name="rerank_eval.run"):
    '''
    q_rels: dict: {'q_id': [d_id, d_id, ...], ...}
    '''
    pytrec_q_rels = {}
    for q_id, d_ids in q_rels.items():
        pytrec_q_rels[q_id] = {d_id: 1 for d_id in d_ids}
    self.evaluator = pytrec_eval.RelevanceEvaluator(
        pytrec_q_rels, {'map', 'ndcg_cut_3', 'set_recall', 'recip_rank'})
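# Hedged usage sketch for the constructor above: q_rels lists only the relevant
# doc ids per query, which become binary judgments. The query and document ids
# and scores below are invented for illustration.
import pytrec_eval

q_rels = {'q1': ['d2', 'd7']}
pytrec_q_rels = {qid: {did: 1 for did in dids} for qid, dids in q_rels.items()}

demo_evaluator = pytrec_eval.RelevanceEvaluator(
    pytrec_q_rels, {'map', 'ndcg_cut_3', 'set_recall', 'recip_rank'})
demo_run = {'q1': {'d1': 0.9, 'd2': 0.8, 'd7': 0.1}}
print(demo_evaluator.evaluate(demo_run)['q1'])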
def compute_metrics(docs, vocab_embs, word2id, id2word):
    """
    For a trained model, compute the MAP and NDCG based on a set of queries
    and all documents in the corpus.

    Returns:
        metrics: a nested dict of queries and their MAP and NDCG scores.
    """
    # Create document embeddings
    if not os.path.exists("./pickles/word2vec_doc_embs.pkl"):
        print("constructing document embeddings")
        doc_embs = {}
        keys = list(docs.keys())
        for d in tqdm(keys):
            doc = docs[d]
            doc_emb = create_doc_emb(vocab_embs, doc, word2id, id2word)
            doc_embs[d] = doc_emb
        with open("./pickles/word2vec_doc_embs.pkl", "wb") as writer:
            pkl.dump(doc_embs, writer)
    else:
        with open("./pickles/word2vec_doc_embs.pkl", "rb") as reader:
            doc_embs = pkl.load(reader)

    # Create a query embedding and compare it to every document embedding
    qrels, queries = ra.read_qrels()
    overall_ser = {}

    # ranking per query
    for qid in tqdm(qrels):
        query = queries[qid]
        query = ra.process_text(query)
        query_emb = create_doc_emb(vocab_embs, query, word2id, id2word)
        ranking, trec_results = get_ranking(qid, query_emb, doc_embs, vocab_embs)
        overall_ser[qid] = ranking
        if int(qid) not in range(76, 100):
            with open("./results/word2vec_trec.csv", "a+") as f:
                f.write("\n".join("{},{},{},{},{},{}".format(
                    x[0], x[1], x[2], x[3], x[4], x[5]) for x in trec_results))
                f.write("\n")

    # Compute the MAP and NDCG per query
    evaluator = pytrec_eval.RelevanceEvaluator(qrels, {'map', 'ndcg'})
    metrics = evaluator.evaluate(overall_ser)

    # Get the average model evaluation scores over all queries
    average = {'map': 0, 'ndcg': 0}
    for q in list(metrics.values()):
        average['map'] += q['map']
        average['ndcg'] += q['ndcg']
    average['map'] = average['map'] / len(queries)
    average['ndcg'] = average['ndcg'] / len(queries)
    print('average model evaluation scores over all queries {}'.format(average))

    return metrics
def ranking_measure(qrel, pred):
    evaluator = pytrec_eval.RelevanceEvaluator(qrel, {'map', 'ndcg'})
    results = evaluator.evaluate(pred)

    ndcg_score = np.mean([i['ndcg'] for i in results.values()])
    map_score = np.mean([i['map'] for i in results.values()])
    return ndcg_score, map_score
def test_search_run_metrics(tmpdir):
    qrels_dict = {"q1": {"d1": 1, "d2": 0, "d3": 2}, "q2": {"d5": 0, "d6": 1}}
    run_dict = {
        "q1": {"d1": 1.1, "d2": 1.0},
        "q2": {"d5": 9.0, "d6": 8.0},
        "q3": {"d7": 1.0, "d8": 2.0},
    }
    valid_metrics = {"P", "map", "map_cut", "ndcg_cut", "Rprec", "recip_rank"}

    fn = tmpdir / "searcher"
    Searcher.write_trec_run(run_dict, fn)

    # calculate results with q1 and q2
    searcher = Searcher(None, None, None, None)
    qids = set(qrels_dict.keys())
    evaluator = pytrec_eval.RelevanceEvaluator(qrels_dict, valid_metrics)
    partial_metrics = searcher.search_run_metrics(fn, evaluator, qids)

    # cache file exists?
    assert os.path.exists(fn + ".metrics")

    # add q3 and re-run to update cache
    qrels_dict["q3"] = {"d7": 0, "d8": 2}
    qids = set(qrels_dict.keys())
    evaluator = pytrec_eval.RelevanceEvaluator(qrels_dict, valid_metrics)
    metrics = searcher.search_run_metrics(fn, evaluator, qids)
    assert "q3" in metrics
    assert "q2" in metrics

    # remove original file to ensure results loaded from cache,
    # then make sure metrics haven't changed (and include the new q3)
    os.remove(fn)
    cached_metrics = searcher.search_run_metrics(fn, evaluator, qids)
    assert metrics == cached_metrics
def eval_preds(test_qrels, target_preds):
    dev_eval = pytrec_eval.RelevanceEvaluator(test_qrels, {"ndcg_cut", "P", "map"})
    result = dev_eval.evaluate(target_preds)

    fold_metrics = defaultdict(list)
    for qid, metrics in result.items():
        for metric, val in metrics.items():
            fold_metrics[metric].append(val)

    return {key: np.mean(vals) for key, vals in fold_metrics.items()}