Example #1
    def test_ndcg_cut(self):
        qrel = {
            'q1': {
                'd1': 0,
                'd2': 1,
                'd3': 0,
            },
            'q2': {
                'd2': 1,
                'd3': 1,
            },
        }

        evaluator = pytrec_eval.RelevanceEvaluator(qrel, {'ndcg_cut.3'})

        self.assertAlmostEqual(
            evaluator.evaluate({
                'q1': {
                    'd1': 1.0,
                    'd2': 0.0,  # rank 3
                    'd3': 1.5,
                },
                'q2': {},
            })['q1']['ndcg_cut_3'],
            0.5)

        evaluator = pytrec_eval.RelevanceEvaluator(qrel, {'ndcg_cut.1'})

        self.assertAlmostEqual(
            evaluator.evaluate({
                'q1': {
                    'd1': 1.0,
                    'd2': 2.0,  # rank 1
                    'd3': 1.5,
                },
                'q2': {},
            })['q1']['ndcg_cut_1'],
            1.0)

        evaluator = pytrec_eval.RelevanceEvaluator(qrel,
                                                   {'ndcg_cut.1,2,3,1000'})
        result = evaluator.evaluate({
            'q1': {
                'd1': 1.0,
                'd2': 0.0,  # rank 3
                'd3': 1.5,
            },
            'q2': {},
        })['q1']
        self.assertAlmostEqual(result['ndcg_cut_3'], 0.5)
        self.assertAlmostEqual(result['ndcg_cut_2'], 0.0)
        self.assertAlmostEqual(result['ndcg_cut_1'], 0.0)
        self.assertAlmostEqual(result['ndcg_cut_1000'], 0.5)
Example #2
def benchmark(model, model_name, docs, idx2key):
    qrels, queries = read_ap.read_qrels()

    overall_ser = {}

    # Adopted version from the TFIDF benchmark test
    print("Running GENSIM Benchmark")
    # collect results
    for qid in tqdm(qrels):
        query_text = queries[qid]
        results = rank(model, docs, query_text)
        #print(results)
        overall_ser[qid] = dict([(idx2key[idx], score)
                                 for idx, score in results])

    print(overall_ser)

    #print(overall_ser[100])
    evaluator = pytrec_eval.RelevanceEvaluator(qrels, {'map', 'ndcg'})
    metrics = evaluator.evaluate(overall_ser)

    json_filename = f"./json_files/benchmark_{model_name}.json"

    # dump to JSON
    with open(json_filename, "w") as writer:
        json.dump(metrics, writer, indent=1)

    return json_filename
Example #3
def main():
    parser = argparse.ArgumentParser()

    parser.add_argument('qrel')
    parser.add_argument('run')
    parser.add_argument('measure')

    args = parser.parse_args()

    assert os.path.exists(args.qrel)
    assert os.path.exists(args.run)

    with open(args.qrel, 'r') as f_qrel:
        qrel = pytrec_eval.parse_qrel(f_qrel)

    with open(args.run, 'r') as f_run:
        run = pytrec_eval.parse_run(f_run)
    
    evaluator = pytrec_eval.RelevanceEvaluator(
        qrel, {args.measure})

    results = evaluator.evaluate(run)
    
    def print_line(measure, scope, value):
        #scope = query_id = topic_id
        print('{:25s}{:8s}{:.22f}'.format(measure, scope, value))
    avg_DCG = []
    for query_id, query_measures in results.items():
   
        for measure, value in sorted(query_measures.items()):
            avg_DCG.append(value)
            print_line(measure, query_id, value)
    print(avg_DCG)
    print(mean(avg_DCG))
    print(' avg of nDCG {:f}'.format(mean(avg_DCG)))
Example #4
    def get_metric(self,
                   qrels: str,
                   trec: str,
                   metric: str = 'ndcg_cut_10',
                   split: dict = None,
                   split_idx: int = -1) -> float:
        with open(qrels, 'r') as f_qrel:
            qrel = pytrec_eval.parse_qrel(f_qrel)
        with open(trec, 'r') as f_run:
            run = pytrec_eval.parse_run(f_run)

        # partial evaluation
        if split is not None and split_idx >= 0:
            for qid in copy.deepcopy(run):
                if qid not in split[split_idx]:
                    _ = run.pop(qid)

        evaluator = pytrec_eval.RelevanceEvaluator(
            qrel, pytrec_eval.supported_measures)
        results = evaluator.evaluate(run)
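        # iterate once so that query_measures ends up holding the last query's
        # measures; its keys enumerate the measure names aggregated below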
        for query_id, query_measures in sorted(results.items()):
            pass
        mes = {}
        for measure in sorted(query_measures.keys()):
            mes[measure] = pytrec_eval.compute_aggregated_measure(
                measure, [
                    query_measures[measure]
                    for query_measures in results.values()
                ])
        return mes[metric]
Example #5
    def crossvalidated_interpolation(dev, test, metric):
        """ Return an interpolated ranking """

        # TODO refactor out (shared with crossvalidated_ranking)
        valid_metrics = {"P", "map", "map_cut", "ndcg_cut", "Rprec", "recip_rank"}
        cut_points = [5, 10, 15, 20, 30, 100, 200, 500, 1000]
        # the metrics we expect pytrec_eval to output (after expanding _cut)
        expected_metrics = {m for m in valid_metrics if not m.endswith("_cut") and m != "P"} | {
            m + "_" + str(cutoff) for cutoff in cut_points for m in valid_metrics if m.endswith("_cut") or m == "P"
        }

        if metric in ["ndcg", "ndcg_cut"]:
            mkey = "ndcg_cut_20"
        elif metric in expected_metrics:
            mkey = metric
        else:
            raise RuntimeError("requested metric %s is not one of the supported metrics: %s" % (metric, sorted(expected_metrics)))
        avg_metric = lambda run_metrics: np.mean([qid[mkey] for qid in run_metrics.values()])

        assert len(set(dev["qrels"].keys()).intersection(test["qrels"].keys())) == 0
        dev_eval = pytrec_eval.RelevanceEvaluator(dev["qrels"], valid_metrics)
        best_metric, best_alpha = -np.inf, None
        for alpha in np.arange(0, 1.001, 0.05):
            run_metrics = dev_eval.evaluate(
                Searcher.interpolate_runs(dev["reranker"], dev["searcher"], dev["qrels"].keys(), alpha)
            )
            mavgp = avg_metric(run_metrics)
            if mavgp > best_metric:
                best_metric = mavgp
                best_alpha = alpha

        test_run = Searcher.interpolate_runs(test["reranker"], test["searcher"], test["qrels"].keys(), best_alpha)
        dev_run = Searcher.interpolate_runs(dev["reranker"], dev["searcher"], dev["qrels"].keys(), best_alpha)
        return (best_alpha, test_run, dev_run)
Example #6
def evaluate(eval_path, qrel_path, res_path):

    measures = {"map", "ndcg_cut", "recall", "P"}

    with open(qrel_path, 'r') as f_qrel:
        qrel = pytrec_eval.parse_qrel(f_qrel)

    evaluator = pytrec_eval.RelevanceEvaluator(qrel, measures)

    with open(res_path, 'r') as f_run:
        run = pytrec_eval.parse_run(f_run)

    all_metrics = evaluator.evaluate(run)

    metrics = {
        'P_5': 0,
        'P_10': 0,
        'P_20': 0,
        'ndcg_cut_5': 0,
        'ndcg_cut_10': 0,
        'ndcg_cut_20': 0,
        'ndcg_cut_100': 0,
        'map': 0,
        'recall_100': 0
    }

    nb_queries = len(all_metrics)
    for key, values in all_metrics.items():
        for metric in metrics:
            metrics[metric] += values[metric] / nb_queries

    with open(eval_path, 'w') as f:
        json.dump(metrics, f)
Example #7
    def summary(self):
        qrel = {}
        run = {}
        assert (len(self.qrel_list) == len(self.run_list))
        for i in range(len(self.qrel_list)):
            assert (len(self.qrel_list[i]) == len(self.run_list[i]))
            qid = 'q{}'.format(i + 1)
            qrel[qid] = {}
            run[qid] = {}
            for j in range(len(self.run_list[i])):
                did = 'd{}'.format(j + 1)
                qrel[qid][did] = int(self.qrel_list[i][j])
                run[qid][did] = float(self.run_list[i][j])

        evaluator = pytrec_eval.RelevanceEvaluator(
            qrel, {'map', 'map_cut', 'ndcg', 'ndcg_cut', 'recall'})
        trec = evaluator.evaluate(run)
        results = {
            'mrr': np.mean(self.rr_list),
            'mrr10': np.mean(self.rr10_list),
            'err': np.mean(self.err_list),
            'err20': np.mean(self.err20_list),
            'map': np.mean([trec[d]['map'] for d in trec]),
            'map10': np.mean([trec[d]['map_cut_10'] for d in trec]),
            'map20': np.mean([trec[d]['map_cut_20'] for d in trec]),
            'ndcg': np.mean([trec[d]['ndcg'] for d in trec]),
            'ndcg10': np.mean([trec[d]['ndcg_cut_10'] for d in trec]),
            'ndcg20': np.mean([trec[d]['ndcg_cut_20'] for d in trec]),
            'recall100': np.mean([trec[d]['recall_100'] for d in trec])
        }
        return results
Example #8
    def evaluate(res, qrels, metrics=['map', 'ndcg'], perquery=False):
        """
        Evaluate the result dataframe with the given qrels

        Args:
            res: Either a dataframe with columns=['qid', 'docno', 'score'] or a dict {qid:{docno:score,},}
            qrels: Either a dataframe with columns=['qid','docno', 'label'] or a dict {qid:{docno:label,},}
            metrics(list): A list of strings specifying which evaluation metrics to use. Default=['map', 'ndcg']
            perquery(bool): If true return each metric for each query, else return mean metrics. Default=False
        """

        if isinstance(res, pd.DataFrame):
            batch_retrieve_results_dict = Utils.convert_res_to_dict(res)
        else:
            batch_retrieve_results_dict = res

        if isinstance(qrels, pd.DataFrame):
            qrels_dic = Utils.convert_qrels_to_dict(qrels)
        else:
            qrels_dic = qrels

        evaluator = pytrec_eval.RelevanceEvaluator(qrels_dic, set(metrics))
        result = evaluator.evaluate(batch_retrieve_results_dict)
        if perquery:
            return result
        else:
            measures_sum = {}
            mean_dict = {}
            for val in result.values():
                for measure, measure_val in val.items():
                    measures_sum[measure] = measures_sum.get(measure, 0.0) + measure_val
            for measure, value in measures_sum.items():
                mean_dict[measure] = value / len(result.values())
            return mean_dict
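A minimal usage sketch for this helper; it assumes the method is exposed as a static helper on the surrounding Utils class (the `Utils.evaluate` call path is an assumption, not confirmed by the snippet). Plain dicts are passed, so no dataframe conversion is triggered:

qrels = {'q1': {'d1': 1, 'd2': 0}}
res = {'q1': {'d1': 0.9, 'd2': 0.4}}

# Utils.evaluate is an assumed entry point for the evaluate() method above
print(Utils.evaluate(res, qrels, metrics=['map', 'ndcg']))                 # mean over queries
print(Utils.evaluate(res, qrels, metrics=['map', 'ndcg'], perquery=True))  # per-query dict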
Example #9
def evaluate_doc2vec(doc2vec_model, description, test_subset=False):

    qrels, queries = read_ap.read_qrels()

    if test_subset:
        queries = {
            qid: q
            for qid, q in queries.items() if int(qid) < 101 and int(qid) > 75
        }

    overall_ser = {}
    # collect results
    for qid in queries:
        results = rank_query_given_document(queries[qid], doc2vec_model)
        overall_ser[qid] = dict(results)

        if int(qid) not in np.arange(76, 101):
            evaluate.write_trec_results(qid, results, f"./doc2vec/results/")

    evaluator = pytrec_eval.RelevanceEvaluator(qrels, {'map', 'ndcg'})
    metrics = evaluator.evaluate(overall_ser)

    if not test_subset:
        with open(f"./doc2vec/results/doc2vec_{description}.json",
                  "w") as writer:
            json.dump(metrics, writer, indent=1)

    return metrics
Example #10
def qrel_metrics(qrel_file, run_file, metrics=('ndcg', 'map')):
    """Get metrics (ndcg and map by default) for a run compared to a qrel file.

    Arguments:
        qrel_file -- qrel file with ground truth data
        run_file -- predictions from the run
        metrics -- which metrics to evaluate on,
                   can use any valid metrics that the trec_eval tool accepts

    Returns:
        metric_values -- dictionary of metric values (out of 100), rounded to two decimal places
    """
    with open(qrel_file, 'r') as f_qrel:
        qrel = pytrec_eval.parse_qrel(f_qrel)

    with open(run_file, 'r') as f_run:
        run = pytrec_eval.parse_run(f_run)
        
    evaluator = pytrec_eval.RelevanceEvaluator(qrel, set(metrics))
    results = evaluator.evaluate(run)

    metric_values = {}
    for measure in sorted(metrics):
        res = pytrec_eval.compute_aggregated_measure(
                measure, 
                [query_measures[measure]  for query_measures in results.values()]
            )
        metric_values[measure] = np.round(100 * res, 2)
    return metric_values
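A hedged usage sketch; the file names are hypothetical TREC-format qrel and run files:

scores = qrel_metrics('data/test.qrel', 'runs/bm25.run', metrics=('ndcg', 'map'))
print(scores)  # {'map': ..., 'ndcg': ...} as percentages rounded to two decimals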
Example #11
    def test_measure_params(self):
        qrel = {
            'q1': {
                'd1': 0,
                'd2': 1,
                'd3': 0,
            },
            'q2': {
                'd2': 1,
                'd3': 1,
            },
        }
        run = {
            'q1': {
                'd1': 1.0,
                'd2': 0.0,
                'd3': 1.5,
            },
            'q2': {},
        }

        # empty run
        evaluator = pytrec_eval.RelevanceEvaluator(qrel, [
            'ndcg_cut', 'ndcg_cut.1,4', 'ndcg_cut_20,4', 'ndcg_cut_15',
            'recall.1000', 'P'
        ])
        self.assertEqual(
            set(evaluator.evaluate(run)['q1'].keys()), {
                'ndcg_cut_1', 'ndcg_cut_4', 'ndcg_cut_15', 'ndcg_cut_20',
                'recall_1000', 'P_200', 'P_15', 'P_10', 'P_5', 'P_30', 'P_100',
                'P_20', 'P_500', 'P_1000'
            })
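To average one of these parameterized measures over all queries, the per-query values can be passed to pytrec_eval.compute_aggregated_measure, as several later examples do; a minimal sketch reusing the qrel and run above:

results = evaluator.evaluate(run)
mean_p5 = pytrec_eval.compute_aggregated_measure(
    'P_5', [measures['P_5'] for measures in results.values()])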
Example #12
def compute_metrics(dictionary, model, index, corpus_type, num_topics,
                    doc_ids):
    """
    Compute MAP and nDCG scores and save to json file.
    """
    metric_path = ("./LSI_results/LSI_{}_and_{}_topics.json".format(
        corpus_type, num_topics))
    #check whether metrics for corpus type and num_topics were already generated
    if not os.path.exists(metric_path):

        # Get ranking of document for every query and compute the MAP and NDCG score.
        qrels, queries = ra.read_qrels()
        overall_ser = {}  #ranking per query
        for qid in tqdm(qrels):
            query = queries[qid]
            ranking = query_similarity(query, dictionary, model, index,
                                       doc_ids)
            overall_ser[qid] = ranking

        # Compute model evaluation scores per query
        evaluator = pytrec_eval.RelevanceEvaluator(qrels, {'map', 'ndcg'})
        metrics = evaluator.evaluate(overall_ser)

        with open(metric_path, "w") as writer:
            json.dump(metrics, writer, indent=1)
    else:
        print('metrics for LSI_{} with {} topics were already computed'.format(
            corpus_type, num_topics))
Example #13
def EvalDevQuery(args, query_embedding2id, passage_embedding2id,
                 dev_query_positive_id, I_nearest_neighbor):
    # prediction[qid][docid] = docscore; -rank is used as the score, so a higher
    # rank (1 > 2) yields a higher score (-1 > -2)
    prediction = {}

    for query_idx in range(I_nearest_neighbor.shape[0]):
        query_id = query_embedding2id[query_idx]
        prediction[query_id] = {}

        top_ann_pid = I_nearest_neighbor[query_idx, :].copy()
        selected_ann_idx = top_ann_pid[:50]
        rank = 0
        for idx in selected_ann_idx:
            pred_pid = passage_embedding2id[idx]
            rank += 1
            prediction[query_id][pred_pid] = -rank

    # use out of the box evaluation script
    evaluator = pytrec_eval.RelevanceEvaluator(
        convert_to_string_id(dev_query_positive_id), {'map_cut', 'ndcg_cut'})

    eval_query_cnt = 0
    result = evaluator.evaluate(convert_to_string_id(prediction))
    ndcg = 0

    for k in result.keys():
        eval_query_cnt += 1
        ndcg += result[k]["ndcg_cut_10"]

    final_ndcg = ndcg / eval_query_cnt
    print("Rank:" + str(args.rank) + " --- ANN NDCG@10:" + str(final_ndcg))

    return final_ndcg, eval_query_cnt
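The -rank trick above is general: any ranked list can be turned into a pytrec_eval run dict whose scores preserve the original ordering. A minimal sketch:

ranked_docs = ['d7', 'd2', 'd9']  # best first
run = {'q1': {doc_id: -(rank + 1) for rank, doc_id in enumerate(ranked_docs)}}
# d7 -> -1, d2 -> -2, d9 -> -3, so higher-ranked documents keep higher scores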
Example #14
def evaluate_models(results):
    """
    Calculate METRICS for each model in the results dict
    ----
    Input example:
    # results = {
    #  'model_1': {
    #     'preds': [[1,2],[1,2]],
    #     'labels': [[1,2],[1,2]]
    #   }
    #}
    """

    for model in results.keys():
        preds = results[model]['preds']
        labels = results[model]['labels']
        run = {}
        qrel = {}
        for i, p in enumerate(preds):
            run['q{}'.format(i+1)] = {}
            qrel['q{}'.format(i+1)] = {}
            for j in range(len(p)):
                run['q{}'.format(i+1)]['d{}'.format(j+1)] = float(preds[i][j])
                qrel['q{}'.format(i + 1)]['d{}'.format(j + 1)] = int(labels[i][j])
        evaluator = pytrec_eval.RelevanceEvaluator(qrel, METRICS)
        results[model]['eval'] = evaluator.evaluate(run)
    return results
Example #15
 def calc_metrics(self, qrels_dict, run_dict, metrics, verbose=False):
     rel_args = {}
     for metric in metrics:
         for exp in PTE_METRIC_MAP:
             match = re.match(exp, str(metric))
             if match:
                 params = match.groupdict()
                 rel, gains = int(params.get('rel')
                                  or '1'), params.get('gain')
                 rel_args.setdefault((rel, gains), {})
                 metric_name = PTE_METRIC_MAP[exp].format(**params)
                 rel_args[rel, gains][metric_name] = str(metric)
                 break
     result = {}
     for (rel, gains), measures in rel_args.items():
         these_qrels = self._apply_gains(qrels_dict, gains)
         evaluator = pytrec_eval.RelevanceEvaluator(these_qrels,
                                                    measures.keys(),
                                                    relevance_level=rel)
         pte_results = evaluator.evaluate(run_dict)
         # translate and filter this to the output format
         for pte_name, onir_name in measures.items():
             result[onir_name] = {}
             for qid in pte_results:
                 result[onir_name][qid] = pte_results[qid][pte_name]
     return result
Example #16
    def evaluate_queries(self, qrels, queries, save_path):

        save_path = os.path.join(self.ARGS.save_dir, save_path)

        overall_ser = {}
        # TODO: save necessary info for result file => trec_results = []

        doc_ids, doc_embeddings = self.build_doc_embeddings()

        print(f"Running Word2Vec Evaluation, ww-size: {self.ARGS.ww_size}, vocab-size: {len(self.vocab['id2token'])}")
        for qid in tqdm(qrels):
            query_text = queries[qid]

            results = self.match_query_against_docs(query_text, doc_ids, doc_embeddings)
            overall_ser[qid] = dict(results)

            if int(qid) not in np.arange(76, 101):
                evaluate.write_trec_results(qid, results, save_path)

        evaluator = pytrec_eval.RelevanceEvaluator(qrels, {'map', 'ndcg'})
        metrics = evaluator.evaluate(overall_ser)

        evaluate.calc_mean_metrics(metrics)

        # dump this to JSON - *Not* Optional - This is submitted in the assignment!
        with open(os.path.join(save_path, "word2vec_metrics.json"), "w") as writer:
            json.dump(metrics, writer, indent=1)
Example #17
def cal_ndcg(qrels, trec, k):
    with open(qrels, 'r') as f_qrel:
        qrel = pytrec_eval.parse_qrel(f_qrel)

    with open(trec, 'r') as f_run:
        run = pytrec_eval.parse_run(f_run)

    evaluator = pytrec_eval.RelevanceEvaluator(qrel,
                                               pytrec_eval.supported_measures)
    results = evaluator.evaluate(run)
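    # iterate once so that query_measures ends up holding the last query's
    # measures; its keys enumerate the measure names aggregated below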
    for query_id, query_measures in sorted(results.items()):
        pass

    mes = {}
    for measure in sorted(query_measures.keys()):
        mes[measure] = pytrec_eval.compute_aggregated_measure(
            measure,
            [query_measures[measure] for query_measures in results.values()])

    metric = 'ndcg_cut_%d' % k
    if metric not in mes:
        print('Depth of NDCG not available.')
        exit()
    ndcg = mes[metric]

    return ndcg
Example #18
def evaluate_retrieval(ground_truth, run, eval_missing_truth):
    print("Evaluate: Passage Retrieval")
    result = {}
    retrieval_run = get_retrieval_run(run)
    retrieval_ground_truth_for_type = get_retrieval_ground_truth(
        ground_truth, eval_missing_truth)
    retrieval_run_for_type = {
        turn_id: passages
        for (turn_id, passages) in retrieval_run.items()
        if turn_id in retrieval_ground_truth_for_type
    }
    if retrieval_run_for_type:  # at least one turn for this type => evaluate
        metric = pytrec_eval.RelevanceEvaluator(
            retrieval_ground_truth_for_type, {'recip_rank'})
        mrrs = [
            score["recip_rank"]
            for score in metric.evaluate(retrieval_run_for_type).values()
        ]
        average_mrr = sum(mrrs) / len(mrrs)
        result["MRR"] = average_mrr
        print("    used retrieved passages for %d questions" %
              len(retrieval_run_for_type))
    else:
        print("    skipped for no retrieved passages")
    return result
Example #19
    def evaluate(res, qrels, metrics=['map', 'ndcg'], perquery=False):
        """
        Evaluate the result dataframe with the given qrels

        Args:
            res: Either a dataframe with columns=['qid', 'docno', 'score'] or a dict {qid:{docno:score,},}
            qrels: Either a dataframe with columns=['qid','docno', 'label'] or a dict {qid:{docno:label,},}
            metrics(list): A list of strings specifying which evaluation metrics to use. Default=['map', 'ndcg']
            perquery(bool): If true return each metric for each query, else return mean metrics. Default=False
        """
        from .io import coerce_dataframe
        if not isinstance(res, dict):
            res = coerce_dataframe(res)
        if isinstance(res, pd.DataFrame):
            batch_retrieve_results_dict = Utils.convert_res_to_dict(res)
        else:
            batch_retrieve_results_dict = res
        if isinstance(qrels, pd.DataFrame):
            qrels_dic = Utils.convert_qrels_to_dict(qrels)
        else:
            qrels_dic = qrels
        if len(batch_retrieve_results_dict) == 0:
            raise ValueError("No results for evaluation")
        req_metrics = set()
        cutdown = False
        for m in metrics:
            if m.startswith("ndcg_cut_"):
                req_metrics.add("ndcg_cut")
                cutdown = True
            elif m.startswith("P_"):
                req_metrics.add("P")
                cutdown = True
            else:
                req_metrics.add(m)

        evaluator = pytrec_eval.RelevanceEvaluator(qrels_dic, req_metrics)

        result = evaluator.evaluate(batch_retrieve_results_dict)
        if perquery:
            if not cutdown:
                return result
            # user wanted metrics like ndcg_cut_5, but we had to request ndcg_cut
            # let's cut out the metrics they didn't want

            # get any arbitrary query
            q = next(iter(result.keys()))
            todel = []
            for m in result[q]:
                if m not in metrics:
                    todel.append(m)
            for q in result:
                for m in todel:
                    del result[q][m]
            return result

        means = Utils.mean_of_measures(result)
        if cutdown:
            means = {m: means[m] for m in metrics}
        return means
Example #20
    def evaluate(res, qrels, metrics=['map', 'ndcg'], perquery=False):
        """
        Evaluate the result dataframe with the given qrels

        Args:
            res: Either a dataframe with columns=['qid', 'docno', 'score'] or a dict {qid:{docno:score,},}
            qrels: Either a dataframe with columns=['qid','docno', 'label'] or a dict {qid:{docno:label,},}
            metrics(list): A list of strings specifying which evaluation metrics to use. Default=['map', 'ndcg']
            perquery(bool): If true return each metric for each query, else return mean metrics. Default=False
        """
        def now():
            from datetime import datetime
            return datetime.now().strftime("%H:%M:%S.%f")

        #print(now() + " evaluate started")
        if isinstance(res, pd.DataFrame):
            batch_retrieve_results_dict = Utils.convert_res_to_dict(res)
        else:
            batch_retrieve_results_dict = res
        #print(now() + " res ready")
        if isinstance(qrels, pd.DataFrame):
            qrels_dic = Utils.convert_qrels_to_dict(qrels)
        else:
            qrels_dic = qrels
        #print(now() + " qrels ready")
        req_metrics = set()
        cutdown = False
        for m in metrics:
            if m.startswith("ndcg_cut_"):
                req_metrics.add("ndcg_cut")
                cutdown = True
            # elif m.startswith("P_"):
            #     req_metrics.add("P")
            else:
                req_metrics.add(m)

        evaluator = pytrec_eval.RelevanceEvaluator(qrels_dic, req_metrics)

        #print(now() + " evaluating")
        result = evaluator.evaluate(batch_retrieve_results_dict)
        #print(now() + " evaluation done")
        if perquery:
            if not cutdown:
                return result
            # user wanted metrics like ndcg_cut_5, but we had to request ndcg_cut
            # let's cut out the metrics they didn't want

            # get any arbitrary query
            q = next(iter(result.keys()))
            todel = []
            for m in result[q]:
                if m not in metrics:
                    todel.append(m)
            for q in result:
                for m in todel:
                    del result[q][m]
            return result

        return Utils.mean_of_measures(result, metrics)
Example #21
def setup_evaluator_from_relevance_file(qrel_path,
                                        measures={
                                            "map", "ndcg_cut", "recall", "P"
                                        }):
    with open(qrel_path, 'r') as f_qrel:
        qrel = pytrec_eval.parse_qrel(f_qrel)

    return pytrec_eval.RelevanceEvaluator(qrel, measures)
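pytrec_eval.parse_qrel expects standard TREC qrel lines (query_id, iteration, doc_id, relevance, whitespace-separated). A minimal sketch with a hypothetical toy file:

with open('toy.qrel', 'w') as f:
    f.write('q1 0 d1 1\n'
            'q1 0 d2 0\n'
            'q2 0 d3 1\n')

evaluator = setup_evaluator_from_relevance_file('toy.qrel')
run = {'q1': {'d1': 1.2, 'd2': 0.4}, 'q2': {'d3': 0.9}}
print(evaluator.evaluate(run)['q1']['ndcg_cut_10'])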
Example #22
def validate(model, dataset, run, valid_qrels, epoch):
    run_scores = run_model(model, dataset, run)
    metric = VALIDATION_METRIC
    if metric.startswith("P_"):
        metric = "P"
    trec_eval = pytrec_eval.RelevanceEvaluator(valid_qrels, {metric})
    eval_scores = trec_eval.evaluate(run_scores)
    return mean([d[VALIDATION_METRIC] for d in eval_scores.values()])
Example #23
def evaluate(qrel_file, run_file, out_path, measures=("map_cut", "map", "ndcg", "ndcg_cut", "P")):
    evaluator = pytrec_eval.RelevanceEvaluator(
        qrel_file, set(measures))
    metrics = evaluator.evaluate(run_file)
    out = agg_metrics_queries(metrics)
    with open(out_path, "w") as handler:
        json.dump(out, handler)
    return out
Example #24
def sig_test_from_runs(qrels, runs1, runs2, metric="map"):
    if set(runs1) != set(runs2):
        raise ValueError("Expected the same keys from the two run objects.")

    evaluator = pytrec_eval.RelevanceEvaluator(qrels, {metric})
    scores1 = _calc_scores(runs1, metric=metric, evaluator=evaluator)
    scores2 = _calc_scores(runs2, metric=metric, evaluator=evaluator)
    t, p = stats.ttest_rel(scores1, scores2)
    return t, p
Example #25
def compute_mrr_last(D, I, qids, ref_dict, dev_query_positive_id):
    knn_pkl = {"D": D, "I": I}
    all_knn_list = all_gather(knn_pkl)
    mrr = 0.0
    final_recall = 0.0
    if is_first_worker():
        prediction = {}
        D_merged = concat_key(all_knn_list, "D", axis=1)
        I_merged = concat_key(all_knn_list, "I", axis=1)
        print(D_merged.shape, I_merged.shape)
        # we pad with negative pids and distance -128 - if they make it to the top we have a problem
        idx = np.argsort(D_merged, axis=1)[:, ::-1][:, :1000]
        sorted_I = np.take_along_axis(I_merged, idx, axis=1)
        candidate_dict = {}
        for i, qid in enumerate(qids):
            seen_pids = set()
            if qid not in candidate_dict:
                prediction[qid] = {}
                candidate_dict[qid] = [0] * 1000
            j = 0
            for pid in sorted_I[i]:
                if pid >= 0 and pid not in seen_pids:
                    candidate_dict[qid][j] = pid
                    prediction[qid][pid] = -(j + 1)  #-rank
                    j += 1
                    seen_pids.add(pid)

        allowed, message = quality_checks_qids(ref_dict, candidate_dict)
        if message != '':
            print(message)

        mrr_metrics = compute_metrics(ref_dict, candidate_dict)
        mrr = mrr_metrics["MRR @10"]
        print(mrr)

        evaluator = pytrec_eval.RelevanceEvaluator(
            convert_to_string_id(dev_query_positive_id), {'recall'})

        eval_query_cnt = 0
        recall = 0
        topN = 1000
        result = evaluator.evaluate(convert_to_string_id(prediction))
        for k in result.keys():
            eval_query_cnt += 1
            recall += result[k]["recall_" + str(topN)]

        final_recall = recall / eval_query_cnt
        print('final_recall: ', final_recall)

    return mrr, final_recall
Example #26
 def __init__(self, q_rels, save_dir=None, save_name="rerank_eval.run"):
     '''
     q_rels: dict: {'q_id':[d_id, d_id,...],...}
     '''
     pytrec_q_rels = {}
     for q_id, d_ids in q_rels.items():
         pytrec_q_rels[q_id] = {d_id: 1 for d_id in d_ids}
     self.evaluator = pytrec_eval.RelevanceEvaluator(
         pytrec_q_rels, {'map', 'ndcg_cut_3', 'set_recall', 'recip_rank'})
Example #27
def compute_metrics(docs, vocab_embs, word2id, id2word):
    """
    For a trained model, compute the MAP and NDCG based on a set of queries and
    all documents in the corpus.

    Returns:
        metrics: a nested dict of queries and their MAP and NDCG scores.
    """
    # Create document embeddings
    if not os.path.exists("./pickles/word2vec_doc_embs.pkl"):
        print("constructing document embeddings")
        doc_embs = {}
        keys = list(docs.keys())
        for d in tqdm(keys):
            doc = docs[d]
            doc_emb = create_doc_emb(vocab_embs, doc, word2id, id2word)
            doc_embs[d] = doc_emb

        with open("./pickles/word2vec_doc_embs.pkl", "wb") as writer:
            pkl.dump(doc_embs, writer)
    else:
        with open("./pickles/word2vec_doc_embs.pkl", "rb") as reader:
            doc_embs = pkl.load(reader)

    # Create query embedding and compare to every document embedding
    qrels, queries = ra.read_qrels()
    overall_ser = {}  #ranking per query
    for qid in tqdm(qrels):
        query = queries[qid]
        query = ra.process_text(query)
        query_emb = create_doc_emb(vocab_embs, query, word2id, id2word)
        ranking, trec_results = get_ranking(qid, query_emb, doc_embs,
                                            vocab_embs)
        overall_ser[qid] = ranking

        if int(qid) not in range(76, 100):
            with open("./results/word2vec_trec.csv", "a+") as f:
                f.write("\n".join("{},{},{},{},{},{}".format(
                    x[0], x[1], x[2], x[3], x[4], x[5]) for x in trec_results))
                f.write("\n")

    # Compute the MAP and NDCG per query
    evaluator = pytrec_eval.RelevanceEvaluator(qrels, {'map', 'ndcg'})
    metrics = evaluator.evaluate(overall_ser)

    # Get the average model evaluation scores over all queries
    average = {'map': 0, 'ndcg': 0}
    for q in list(metrics.values()):
        average['map'] += q['map']
        average['ndcg'] += q['ndcg']
    average['map'] = average['map'] / len(queries)
    average['ndcg'] = average['ndcg'] / len(queries)
    print(
        'average model evaluation scores over all queries {}'.format(average))

    return (metrics)
Example #28
def ranking_measure(qrel, pred):

    evaluator = pytrec_eval.RelevanceEvaluator(
        qrel, {'map', 'ndcg'})

    results = evaluator.evaluate(pred)
    ndcg_score = np.mean([i['ndcg'] for i in results.values()])
    map_score = np.mean([i['map'] for i in results.values()])

    return ndcg_score, map_score
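A minimal sketch with toy dicts in the same nested format used throughout these examples:

qrel = {'q1': {'d1': 1, 'd2': 0}, 'q2': {'d3': 1}}
pred = {'q1': {'d1': 0.8, 'd2': 0.3}, 'q2': {'d3': 0.5}}
ndcg_score, map_score = ranking_measure(qrel, pred)
print(ndcg_score, map_score)  # both are means over the two queries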
Example #29
def test_search_run_metrics(tmpdir):
    qrels_dict = {"q1": {"d1": 1, "d2": 0, "d3": 2}, "q2": {"d5": 0, "d6": 1}}
    run_dict = {
        "q1": {
            "d1": 1.1,
            "d2": 1.0
        },
        "q2": {
            "d5": 9.0,
            "d6": 8.0
        },
        "q3": {
            "d7": 1.0,
            "d8": 2.0
        }
    }
    valid_metrics = {"P", "map", "map_cut", "ndcg_cut", "Rprec", "recip_rank"}

    fn = tmpdir / "searcher"
    Searcher.write_trec_run(run_dict, fn)

    # calculate results with q1 and q2
    searcher = Searcher(None, None, None, None)
    qids = set(qrels_dict.keys())
    evaluator = pytrec_eval.RelevanceEvaluator(qrels_dict, valid_metrics)
    partial_metrics = searcher.search_run_metrics(fn, evaluator, qids)

    # cache file exists?
    assert os.path.exists(fn + ".metrics")

    # add q3 and re-run to update cache
    qrels_dict["q3"] = {"d7": 0, "d8": 2}
    qids = set(qrels_dict.keys())
    evaluator = pytrec_eval.RelevanceEvaluator(qrels_dict, valid_metrics)
    metrics = searcher.search_run_metrics(fn, evaluator, qids)
    assert "q3" in metrics
    assert "q2" in metrics

    # remove original file to ensure results loaded from cache,
    # then make sure metrics haven't changed (and include the new q3)
    os.remove(fn)
    cached_metrics = searcher.search_run_metrics(fn, evaluator, qids)
    assert metrics == cached_metrics
Example #30
def eval_preds(test_qrels, target_preds):
    dev_eval = pytrec_eval.RelevanceEvaluator(test_qrels,
                                              {"ndcg_cut", "P", "map"})
    result = dev_eval.evaluate(target_preds)
    fold_metrics = defaultdict(list)

    for qid, metrics in result.items():
        for metric, val in metrics.items():
            fold_metrics[metric].append(val)

    return {key: np.mean(vals) for key, vals in fold_metrics.items()}
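A minimal usage sketch with toy dicts, assuming the surrounding imports (numpy, defaultdict, pytrec_eval) are in scope:

qrels = {'q1': {'d1': 2, 'd2': 0}, 'q2': {'d3': 1}}
preds = {'q1': {'d1': 0.9, 'd2': 0.2}, 'q2': {'d3': 0.7}}
fold = eval_preds(qrels, preds)
print(fold['map'], fold['ndcg_cut_10'], fold['P_5'])  # fold averages across queries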