Example no. 1
 def _iter_calc(self, run):
     with tempfile.NamedTemporaryFile() as perlf, \
          ir_measures.util.QrelsConverter(self.qrels).as_tmp_file() as qrelsf, \
          ir_measures.util.RunConverter(run).as_tmp_file() as runf:
         perlf_contents = pkgutil.get_data('ir_measures', 'bin/gdeval.pl')
         perlf.write(perlf_contents)
         perlf.flush()
         for cutoff, nDCG_measure, ERR_measure in self.invocations:
             cmd = ['perl', perlf.name, qrelsf.name, runf.name, str(cutoff)]
             output = subprocess.check_output(cmd)
             output = output.decode().replace('\t', ' ').split('\n')
             for i, s in enumerate(output):
                 if s == '' or i == 0:
                     continue
                 arr = s.split(',')
                 assert len(arr) == 4
                 _, qid, ndcg, err = arr
                 if nDCG_measure is not None:
                     yield Metric(query_id=qid,
                                  measure=nDCG_measure,
                                  value=float(ndcg))
                 if ERR_measure is not None:
                     yield Metric(query_id=qid,
                                  measure=ERR_measure,
                                  value=float(err))
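
The pattern above (unpack a script bundled with the package via pkgutil, write it to a temporary file, invoke it with subprocess) generalizes. A minimal self-contained sketch; the package and resource names here are hypothetical placeholders:

 import pkgutil
 import subprocess
 import tempfile

 def run_bundled_perl(package, resource, *args):
     # Extract the bundled script bytes and run them through perl,
     # returning the decoded stdout.
     data = pkgutil.get_data(package, resource)
     with tempfile.NamedTemporaryFile() as script:
         script.write(data)
         script.flush()  # make the bytes visible to the child process
         return subprocess.check_output(['perl', script.name, *args]).decode()
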
Example no. 2
    def iter_calc(self, run) -> Iterator['Metric']:
        """Compute the metrics for the run, discarding topics with no relevant documents"""
        run = ir_measures.util.RunConverter(run).as_sorteddict()

        for measure, cutoff, rel in self.invocations:
            for qid, documents in run.items():
                # Get the relevance assessments for this query
                qrel = self.qrels.get(qid, {})
                if len(qrel) == 0:
                    continue

                # Count the number of non-relevant documents ranked
                # above each relevant one
                _cutoff = cutoff or len(documents)
                nonrels = [0]
                for _, document in zip(range(_cutoff), documents):
                    if qrel.get(document.doc_id, 0) >= rel:
                        nonrels.append(nonrels[-1])
                    else:
                        nonrels[-1] += 1

                # Only report if at least one relevant document was retrieved
                if len(nonrels) >= 2:
                    value = 1. - sum(nonrels[:-1]) / (float(nonrels[-1]) * (len(nonrels) - 1))
                    yield Metric(query_id=qid, measure=measure, value=value)
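
A worked toy run of the nonrels bookkeeping above (assumed relevance threshold rel=1, labels given in rank order):

 labels = [0, 1, 0, 1]  # toy relevance labels, rank 1 first
 nonrels = [0]
 for label in labels:
     if label >= 1:
         nonrels.append(nonrels[-1])  # snapshot the count above this relevant doc
     else:
         nonrels[-1] += 1
 assert nonrels == [1, 2, 2]
 # Two relevant documents were retrieved, with 1 and 2 non-relevant
 # documents above them, out of 2 non-relevant documents total:
 value = 1. - sum(nonrels[:-1]) / (float(nonrels[-1]) * (len(nonrels) - 1))
 assert value == 0.25
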
Example no. 3
 def _iter_calc(self, run):
     import pandas as pd
     available_qids = set(self.qrels.qrels_data['query'].unique())
     tmp_run = ir_measures.util.RunConverter(run).as_namedtuple_iter()
     tmp_run = pd.DataFrame(tmp_run)
     if len(tmp_run) == 0:
         tmp_run = pd.DataFrame(columns=['query', 'docid', 'score'],
                                dtype='object')
     else:
         tmp_run = tmp_run.rename(columns={
             'query_id': 'query',
             'doc_id': 'docid',
             'score': 'score'
         })
     tmp_run.sort_values(['query', 'score'],
                         ascending=[True, False],
                         inplace=True)
     run = self.trectools.TrecRun()
     run.run_data = tmp_run
     evaluator = self.trectools.TrecEval(run, self.qrels)
     for invocation, measure in self.invocations:
         for query_id, value in invocation(evaluator).itertuples():
             if query_id in available_qids:
                 yield Metric(query_id=query_id,
                              measure=measure,
                              value=value)
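
For reference, this provider can be driven through the same public entry points the tests below use; a small usage sketch (the printed value is not asserted here):

 import ir_measures
 from ir_measures import P

 qrels = list(ir_measures.read_trec_qrels('''
0 0 D0 1
0 0 D1 0
'''))
 run = list(ir_measures.read_trec_run('''
0 0 D1 1 0.9 run
0 0 D0 2 0.8 run
'''))
 for metric in ir_measures.trectools.iter_calc([P@5], qrels, run):
     print(metric)
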
Example no. 4
 def iter_calc(self, run):
     result = self.evaluator.evaluate(run)
     for query_id, measures in result.items():
         for measure_str, value in measures.items():
             yield Metric(query_id=query_id,
                          measure=self.measure_map[measure_str][0],
                          value=value)
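
For context, the shape of result consumed above matches what pytrec_eval returns directly; a minimal sketch with toy data:

 import pytrec_eval

 qrels = {'0': {'D0': 1, 'D1': 0}}
 run = {'0': {'D0': 0.9, 'D1': 0.8}}
 evaluator = pytrec_eval.RelevanceEvaluator(qrels, {'ndcg'})
 print(evaluator.evaluate(run))  # {'0': {'ndcg': 1.0}}
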
Example no. 5
 def _iter_calc(self, run):
     run = ir_measures.util.RunConverter(run).as_dict_of_dict()
     for measure, p, normalize in self.invocations:
         for qid in run:
             if qid in self.qrels:
                 value = compatibility(self.qrels[qid], run[qid], p,
                                       normalize)
                 yield Metric(query_id=qid, measure=measure, value=value)
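
Background, not the library's implementation: the Compat measure is based on rank-biased overlap (RBO) between the run and an ideal ranking of the judged documents. A truncated RBO sketch:

 def rbo(s, t, p, depth=100):
     # Truncated rank-biased overlap between two ranked lists; p is the
     # persistence parameter (higher p gives deeper ranks more weight).
     total = 0.0
     for d in range(1, depth + 1):
         overlap = len(set(s[:d]) & set(t[:d]))
         total += p ** (d - 1) * overlap / d
     return (1 - p) * total

 print(rbo(['D0', 'D1', 'D2'], ['D0', 'D2', 'D1'], p=0.8))
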
Example no. 6
 def iter_calc(self, run) -> Iterator['Metric']:
     """
     Yields per-topic metrics for this run.
     """
     expected_measure_qids = set(
         itertools.product(self.measures, self.qrel_qids))
     for metric in self._iter_calc(run):
         expected_measure_qids.discard((metric.measure, metric.query_id))
         yield metric
     for measure, query_id in sorted(expected_measure_qids,
                                     key=lambda x: (str(x[0]), x[1])):
         yield Metric(query_id=query_id,
                      measure=measure,
                      value=measure.DEFAULT)
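
A toy illustration of the backfill step above: any (measure, query_id) pair that _iter_calc did not produce is emitted afterwards with the measure's default value.

 import itertools

 expected = set(itertools.product(['P@5'], ['0', '1']))
 for produced in [('P@5', '0')]:
     expected.discard(produced)
 print(sorted(expected))  # [('P@5', '1')] -> backfilled with measure.DEFAULT
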
Example no. 7
 def _iter_calc(self, run):
     run = ir_measures.util.RunConverter(run).as_dict_of_dict()
     # Sort each query's documents by descending score, breaking score
     # ties by doc_id for determinism
     sorted_run = {
         q: list(sorted(run[q].items(), key=lambda x: (-x[1], x[0])))
         for q in run
     }
     for qid in run:
         qid_qrels = self.qrels.get(qid)
         if qid_qrels:
             for cutoff, measure in self.cutoffs:
                 judged_c = sum(
                     (did in qid_qrels)
                     for did, _ in sorted_run.get(qid, [])[:cutoff])
                 value = judged_c / cutoff
                 yield Metric(query_id=qid, measure=measure, value=value)
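
A worked example of the Judged@k computation above, for one query:

 qid_qrels = {'D0': 1, 'D3': 2}                     # judged documents
 ranking = [('D0', 0.9), ('D2', 0.8), ('D1', 0.7)]  # (doc_id, score), sorted
 cutoff = 3
 judged_c = sum((did in qid_qrels) for did, _ in ranking[:cutoff])
 print(judged_c / cutoff)  # 0.333..., one of the top three documents is judged
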
Example no. 8
 def _iter_calc(self, run):
     run = ir_measures.util.RunConverter(run).as_dict_of_dict()
     sorted_run = {
         q: list(sorted(run[q].items(), key=lambda x: (-x[1], x[0])))
         for q in run
     }
     sorted_run = {q: [did for did, _ in v] for q, v in sorted_run.items()}
     for measure, rel, cutoff in self.invocations:
         if cutoff is NOT_PROVIDED:
             cutoff = sys.maxsize  # no explicit cutoff: effectively unbounded
         msmarco_result = msmarco_eval.compute_metrics(
             self.qrels_by_rel[rel], sorted_run, max_rank=cutoff)
         for qid, value in msmarco_result[f'MRR @{cutoff} by query'].items():
             yield Metric(query_id=qid, measure=measure, value=value)
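
The two-step flattening at the top of this provider, shown on a toy run (descending score, doc_id as tie-breaker):

 run = {'0': {'D1': 0.3, 'D0': 0.9, 'D2': 0.9}}
 sorted_run = {q: sorted(run[q].items(), key=lambda x: (-x[1], x[0]))
               for q in run}
 sorted_run = {q: [did for did, _ in v] for q, v in sorted_run.items()}
 print(sorted_run)  # {'0': ['D0', 'D2', 'D1']}
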
Example no. 9
    def test_nDCG(self):
        qrels = list(
            ir_measures.read_trec_qrels('''
0 0 D0 1
0 0 D1 -1
0 0 D2 0
0 0 D3 2
0 0 D4 0
1 0 D0 1
1 0 D3 2
1 0 D4 -1
1 0 D5 0
'''))
        run = list(
            ir_measures.read_trec_run('''
0 0 D0 1 0.8 run
0 0 D2 2 0.7 run
0 0 D1 3 0.3 run
0 0 D3 4 0.4 run
0 0 D4 5 0.1 run
1 0 D1 1 0.8 run
1 0 D4 2 0.7 run
1 0 D3 3 0.3 run
1 0 D2 4 0.4 run
'''))
        provider = ir_measures.pytrec_eval
        measure = ir_measures.nDCG
        self.assertMetrics(provider.iter_calc([measure], qrels, run), [
            Metric(query_id='0', measure=measure, value=0.76018),
            Metric(query_id='1', measure=measure, value=0.32739)
        ])

        measure = ir_measures.nDCG @ 3
        self.assertMetrics(provider.iter_calc([measure], qrels, run), [
            Metric(query_id='0', measure=measure, value=0.76018),
            Metric(query_id='1', measure=measure, value=0.0)
        ])

        measure = ir_measures.nDCG(gains={0: 1, 1: 4})
        self.assertMetrics(provider.iter_calc([measure], qrels, run), [
            Metric(query_id='0', measure=measure, value=0.97177),
            Metric(query_id='1', measure=measure, value=0.14949)
        ])
Example no. 10
    def test_P(self):
        qrels = list(
            ir_measures.read_trec_qrels('''
0 0 D0 0
0 0 D1 1
0 0 D2 1
0 0 D3 2
0 0 D4 0
'''))
        run = list(
            ir_measures.read_trec_run('''
0 0 D0 1 0.8 run
0 0 D2 2 0.7 run
0 0 D1 3 0.3 run
0 0 D3 4 0.4 run
0 0 D4 5 0.1 run
'''))
        measure = ir_measures.P @ 5
        result = list(ir_measures.ranx.iter_calc([measure], qrels, run))
        self.assertEqual(result[0], Metric('0', measure, 0.6))
        measure = ir_measures.P(rel=2) @ 5
        result = list(ir_measures.ranx.iter_calc([measure], qrels, run))
        self.assertEqual(result[0], Metric('0', measure, 0.2))

        measure = ir_measures.SetP
        result = list(ir_measures.ranx.iter_calc([measure], qrels, run))
        self.assertEqual(result[0], Metric('0', measure, 0.6))
        measure = ir_measures.SetP(rel=2) @ 5
        result = list(ir_measures.ranx.iter_calc([measure], qrels, run))
        self.assertEqual(result[0], Metric('0', measure, 0.2))

        measure = ir_measures.R @ 5
        result = list(ir_measures.ranx.iter_calc([measure], qrels, run))
        self.assertEqual(result[0], Metric('0', measure, 1.0))
        measure = ir_measures.R(rel=2) @ 5
        result = list(ir_measures.ranx.iter_calc([measure], qrels, run))
        self.assertEqual(result[0], Metric('0', measure, 1.0))
        measure = ir_measures.R @ 2
        result = list(ir_measures.ranx.iter_calc([measure], qrels, run))
        self.assertEqual(result[0], Metric('0', measure, 0.3333333333333333))
        measure = ir_measures.R(rel=2) @ 2
        result = list(ir_measures.ranx.iter_calc([measure], qrels, run))
        self.assertEqual(result[0], Metric('0', measure, 0.0))

        measure = ir_measures.SetR
        result = list(ir_measures.ranx.iter_calc([measure], qrels, run))
        self.assertEqual(result[0], Metric('0', measure, 1.0))
        measure = ir_measures.SetR(rel=2) @ 5
        result = list(ir_measures.ranx.iter_calc([measure], qrels, run))
        self.assertEqual(result[0], Metric('0', measure, 1.0))

        measure = ir_measures.RR
        result = list(ir_measures.ranx.iter_calc([measure], qrels, run))
        self.assertEqual(result[0], Metric('0', measure, 0.5))
        measure = ir_measures.RR(rel=2)
        result = list(ir_measures.ranx.iter_calc([measure], qrels, run))
        self.assertEqual(result[0], Metric('0', measure, 0.3333333333333333))
        measure = ir_measures.RR @ 10
        result = list(ir_measures.ranx.iter_calc([measure], qrels, run))
        self.assertEqual(result[0], Metric('0', measure, 0.5))
        measure = ir_measures.RR(rel=2) @ 10
        result = list(ir_measures.ranx.iter_calc([measure], qrels, run))
        self.assertEqual(result[0], Metric('0', measure, 0.3333333333333333))
        measure = ir_measures.RR @ 2
        result = list(ir_measures.ranx.iter_calc([measure], qrels, run))
        self.assertEqual(result[0], Metric('0', measure, 0.5))
        measure = ir_measures.RR(rel=2) @ 2
        result = list(ir_measures.ranx.iter_calc([measure], qrels, run))
        self.assertEqual(result[0], Metric('0', measure, 0.))

        measure = ir_measures.AP
        result = list(ir_measures.ranx.iter_calc([measure], qrels, run))
        self.assertEqual(result[0], Metric('0', measure, 0.6388888888888888))
        measure = ir_measures.AP(rel=2)
        result = list(ir_measures.ranx.iter_calc([measure], qrels, run))
        self.assertEqual(result[0], Metric('0', measure, 0.3333333333333333))
        measure = ir_measures.AP @ 10
        result = list(ir_measures.ranx.iter_calc([measure], qrels, run))
        self.assertEqual(result[0], Metric('0', measure, 0.6388888888888888))
        measure = ir_measures.AP(rel=2) @ 10
        result = list(ir_measures.ranx.iter_calc([measure], qrels, run))
        self.assertEqual(result[0], Metric('0', measure, 0.3333333333333333))
        measure = ir_measures.AP @ 2
        result = list(ir_measures.ranx.iter_calc([measure], qrels, run))
        self.assertEqual(result[0], Metric('0', measure, 0.16666666666666666))
        measure = ir_measures.AP(rel=2) @ 2
        result = list(ir_measures.ranx.iter_calc([measure], qrels, run))
        self.assertEqual(result[0], Metric('0', measure, 0.0))

        measure = ir_measures.Success @ 10
        result = list(ir_measures.ranx.iter_calc([measure], qrels, run))
        self.assertEqual(result[0], Metric('0', measure, 1.))
        measure = ir_measures.Success(rel=2) @ 10
        result = list(ir_measures.ranx.iter_calc([measure], qrels, run))
        self.assertEqual(result[0], Metric('0', measure, 1.))
        measure = ir_measures.Success @ 2
        result = list(ir_measures.ranx.iter_calc([measure], qrels, run))
        self.assertEqual(result[0], Metric('0', measure, 1.))
        measure = ir_measures.Success(rel=2) @ 2
        result = list(ir_measures.ranx.iter_calc([measure], qrels, run))
        self.assertEqual(result[0], Metric('0', measure, 0.))

        measure = ir_measures.NumRet(rel=1)
        result = list(ir_measures.ranx.iter_calc([measure], qrels, run))
        self.assertEqual(result[0], Metric('0', measure, 3.))
        measure = ir_measures.NumRet(rel=2)
        result = list(ir_measures.ranx.iter_calc([measure], qrels, run))
        self.assertEqual(result[0], Metric('0', measure, 1.))

        measure = ir_measures.nDCG
        result = list(ir_measures.ranx.iter_calc([measure], qrels, run))
        self.assertEqual(result[0], Metric('0', measure, 0.6584645692843067))
        measure = ir_measures.nDCG @ 5
        result = list(ir_measures.ranx.iter_calc([measure], qrels, run))
        self.assertEqual(result[0], Metric('0', measure, 0.6584645692843067))
        measure = ir_measures.nDCG @ 2
        result = list(ir_measures.ranx.iter_calc([measure], qrels, run))
        self.assertEqual(result[0], Metric('0', measure, 0.23981246656813146))

        measure = ir_measures.nDCG(dcg='exp-log2')
        result = list(ir_measures.ranx.iter_calc([measure], qrels, run))
        self.assertEqual(result[0], Metric('0', measure, 0.6201040599710453))
        measure = ir_measures.nDCG(dcg='exp-log2') @ 5
        result = list(ir_measures.ranx.iter_calc([measure], qrels, run))
        self.assertEqual(result[0], Metric('0', measure, 0.6201040599710453))
        measure = ir_measures.nDCG(dcg='exp-log2') @ 2
        result = list(ir_measures.ranx.iter_calc([measure], qrels, run))
        self.assertEqual(result[0], Metric('0', measure, 0.17376534287144002))

        measure = ir_measures.Rprec
        result = list(ir_measures.ranx.iter_calc([measure], qrels, run))
        self.assertEqual(result[0], Metric('0', measure, 0.6666666666666666))
        measure = ir_measures.Rprec(rel=2)
        result = list(ir_measures.ranx.iter_calc([measure], qrels, run))
        self.assertEqual(result[0], Metric('0', measure, 0.))
Example no. 11
    def test_empty(self):
        qrels = list(ir_measures.read_trec_qrels('''
0 0 D0 0
0 0 D1 1
0 0 D2 1
0 0 D3 2
0 0 D4 0
1 0 D0 1
1 0 D3 2
1 0 D5 2
'''))
        partial_qrels = [q for q in qrels if q.query_id == '0']
        run = list(ir_measures.read_trec_run('''
0 0 D0 1 0.8 run
0 0 D2 2 0.7 run
0 0 D1 3 0.3 run
0 0 D3 4 0.4 run
0 0 D4 5 0.1 run
1 0 D1 1 0.8 run
1 0 D3 2 0.7 run
1 0 D4 3 0.3 run
1 0 D2 4 0.4 run
'''))
        partial_run = [r for r in run if r.query_id == '0']
        empty = []

        # qrels but no run
        self.assertEqual(set(ir_measures.iter_calc([P@5], qrels, empty)), {Metric('0', P@5, 0.), Metric('1', P@5, 0.)})
        self.assertEqual(set(ir_measures.gdeval.iter_calc([ERR@5], qrels, empty)), {Metric('0', ERR@5, 0.), Metric('1', ERR@5, 0.)})
        self.assertEqual(set(ir_measures.judged.iter_calc([Judged@5], qrels, empty)), {Metric('0', Judged@5, 0.), Metric('1', Judged@5, 0.)})
        self.assertEqual(set(ir_measures.msmarco.iter_calc([RR@5], qrels, empty)), {Metric('0', RR@5, 0.), Metric('1', RR@5, 0.)})
        self.assertEqual(set(ir_measures.pytrec_eval.iter_calc([P@5], qrels, empty)), {Metric('0', P@5, 0.), Metric('1', P@5, 0.)})
        self.assertEqual(set(ir_measures.trectools.iter_calc([P@5], qrels, empty)), {Metric('0', P@5, 0.), Metric('1', P@5, 0.)})
        self.assertEqual(set(ir_measures.cwl_eval.iter_calc([P@5], qrels, empty)), {Metric('0', P@5, 0.0), Metric('1', P@5, 0.0)})
        self.assertEqual(set(ir_measures.compat.iter_calc([Compat(p=0.8)], qrels, empty)), {Metric('0', Compat(p=0.8), 0.0), Metric('1', Compat(p=0.8), 0.0)})
        self.assertEqual(set(ir_measures.accuracy.iter_calc([Accuracy()], qrels, empty)), set())

        # qrels but partial run
        self.assertEqual(set(ir_measures.iter_calc([P@5], qrels, partial_run)), {Metric('0', P@5, 0.6), Metric('1', P@5, 0.)})
        self.assertEqual(set(ir_measures.gdeval.iter_calc([ERR@5], qrels, partial_run)), {Metric('0', ERR@5, 0.10175), Metric('1', ERR@5, 0.)})
        self.assertEqual(set(ir_measures.judged.iter_calc([Judged@5], qrels, partial_run)), {Metric('0', Judged@5, 1.), Metric('1', Judged@5, 0.)})
        self.assertEqual(set(ir_measures.msmarco.iter_calc([RR@5], qrels, partial_run)), {Metric('0', RR@5, 0.5), Metric('1', RR@5, 0.)})
        self.assertEqual(set(ir_measures.pytrec_eval.iter_calc([P@5], qrels, partial_run)), {Metric('0', P@5, 0.6), Metric('1', P@5, 0.)})
        self.assertEqual(set(ir_measures.trectools.iter_calc([P@5], qrels, partial_run)), {Metric('0', P@5, 0.6), Metric('1', P@5, 0.)})
        self.assertEqual(set(ir_measures.cwl_eval.iter_calc([P@5], qrels, partial_run)), {CwlMetric('0', P@5, 0.6000000000000001, 3.0, 1.0, 5.0, 5.0), Metric('1', P@5, 0.0)})
        self.assertEqual(set(ir_measures.compat.iter_calc([Compat(p=0.8)], qrels, partial_run)), {Metric('0', Compat(p=0.8), 0.4744431703672816), Metric('1', Compat(p=0.8), 0.0)})
        self.assertEqual(set(ir_measures.accuracy.iter_calc([Accuracy()], qrels, partial_run)), {Metric('0', Accuracy(), 0.5)})

        # run but no qrels
        self.assertEqual(list(ir_measures.iter_calc([P@5], empty, run)), [])
        self.assertEqual(list(ir_measures.gdeval.iter_calc([ERR@5], empty, run)), [])
        self.assertEqual(list(ir_measures.judged.iter_calc([Judged@5], empty, run)), [])
        self.assertEqual(list(ir_measures.msmarco.iter_calc([RR@5], empty, run)), [])
        self.assertEqual(list(ir_measures.pytrec_eval.iter_calc([P@5], empty, run)), [])
        self.assertEqual(list(ir_measures.trectools.iter_calc([P@5], empty, run)), [])
        self.assertEqual(list(ir_measures.cwl_eval.iter_calc([P@5], empty, run)), [])
        self.assertEqual(list(ir_measures.compat.iter_calc([Compat(p=0.8)], empty, run)), [])
        self.assertEqual(list(ir_measures.accuracy.iter_calc([Accuracy()], empty, run)), [])

        # run but partial qrels
        self.assertEqual(set(ir_measures.iter_calc([P@5], partial_qrels, run)), {Metric('0', P@5, 0.6)})
        self.assertEqual(set(ir_measures.gdeval.iter_calc([ERR@5], partial_qrels, run)), {Metric('0', ERR@5, 0.10175)})
        self.assertEqual(set(ir_measures.judged.iter_calc([Judged@5], partial_qrels, run)), {Metric('0', Judged@5, 1.)})
        self.assertEqual(set(ir_measures.msmarco.iter_calc([RR@5], partial_qrels, run)), {Metric('0', RR@5, 0.5)})
        self.assertEqual(set(ir_measures.pytrec_eval.iter_calc([P@5], partial_qrels, run)), {Metric('0', P@5, 0.6)})
        self.assertEqual(set(ir_measures.trectools.iter_calc([P@5], partial_qrels, run)), {Metric('0', P@5, 0.6)})
        self.assertEqual(set(ir_measures.cwl_eval.iter_calc([P@5], partial_qrels, run)), {CwlMetric('0', P@5, 0.6000000000000001, 3.0, 1.0, 5.0, 5.0)})
        self.assertEqual(set(ir_measures.compat.iter_calc([Compat(p=0.8)], partial_qrels, run)), {Metric('0', Compat(p=0.8), 0.4744431703672816)})
        self.assertEqual(set(ir_measures.accuracy.iter_calc([Accuracy()], partial_qrels, run)), {Metric('0', Accuracy(), 0.5)})

        # both no run and no qrels
        self.assertEqual(list(ir_measures.iter_calc([P@5], empty, empty)), [])
        self.assertEqual(list(ir_measures.gdeval.iter_calc([ERR@5], empty, empty)), [])
        self.assertEqual(list(ir_measures.judged.iter_calc([Judged@5], empty, empty)), [])
        self.assertEqual(list(ir_measures.msmarco.iter_calc([RR@5], empty, empty)), [])
        self.assertEqual(list(ir_measures.pytrec_eval.iter_calc([P@5], empty, empty)), [])
        self.assertEqual(list(ir_measures.trectools.iter_calc([P@5], empty, empty)), [])
        self.assertEqual(list(ir_measures.cwl_eval.iter_calc([P@5], empty, empty)), [])
        self.assertEqual(list(ir_measures.compat.iter_calc([Compat(p=0.8)], empty, empty)), [])
        self.assertEqual(list(ir_measures.accuracy.iter_calc([Accuracy()], empty, empty)), [])

        # qrels but no run
        numpy.testing.assert_equal(ir_measures.calc_aggregate([P@5], qrels, empty), {P@5: 0.})
        numpy.testing.assert_equal(ir_measures.gdeval.calc_aggregate([ERR@5], qrels, empty), {ERR@5: 0.})
        numpy.testing.assert_equal(ir_measures.judged.calc_aggregate([Judged@5], qrels, empty), {Judged@5: 0.})
        numpy.testing.assert_equal(ir_measures.msmarco.calc_aggregate([RR@5], qrels, empty), {RR@5: 0.})
        numpy.testing.assert_equal(ir_measures.pytrec_eval.calc_aggregate([P@5], qrels, empty), {P@5: 0.})
        numpy.testing.assert_equal(ir_measures.trectools.calc_aggregate([P@5], qrels, empty), {P@5: 0.})
        numpy.testing.assert_equal(ir_measures.cwl_eval.calc_aggregate([P@5], qrels, empty), {P@5: 0.})
        numpy.testing.assert_equal(ir_measures.compat.calc_aggregate([Compat(p=0.8)], qrels, empty), {Compat(p=0.8): 0.})
        numpy.testing.assert_equal(ir_measures.accuracy.calc_aggregate([Accuracy()], qrels, empty), {Accuracy(): float('NaN')})

        # qrels but partial run
        numpy.testing.assert_equal(ir_measures.calc_aggregate([P@5], qrels, partial_run), {P@5: 0.3})
        numpy.testing.assert_equal(ir_measures.gdeval.calc_aggregate([ERR@5], qrels, partial_run), {ERR@5: 0.050875})
        numpy.testing.assert_equal(ir_measures.judged.calc_aggregate([Judged@5], qrels, partial_run), {Judged@5: 0.5})
        numpy.testing.assert_equal(ir_measures.msmarco.calc_aggregate([RR@5], qrels, partial_run), {RR@5: 0.25})
        numpy.testing.assert_equal(ir_measures.pytrec_eval.calc_aggregate([P@5], qrels, partial_run), {P@5: 0.3})
        numpy.testing.assert_equal(ir_measures.trectools.calc_aggregate([P@5], qrels, partial_run), {P@5: 0.3})
        numpy.testing.assert_equal(ir_measures.cwl_eval.calc_aggregate([P@5], qrels, partial_run), {P@5: 0.30000000000000004})
        numpy.testing.assert_equal(ir_measures.compat.calc_aggregate([Compat(p=0.8)], qrels, partial_run), {Compat(p=0.8): 0.2372215851836408})
        numpy.testing.assert_equal(ir_measures.accuracy.calc_aggregate([Accuracy()], qrels, partial_run), {Accuracy(): 0.5})

        # run but no qrels
        numpy.testing.assert_equal(ir_measures.calc_aggregate([P@5], empty, run), {P@5: float('NaN')})
        numpy.testing.assert_equal(ir_measures.gdeval.calc_aggregate([ERR@5], empty, run), {ERR@5: float('NaN')})
        numpy.testing.assert_equal(ir_measures.judged.calc_aggregate([Judged@5], empty, run), {Judged@5: float('NaN')})
        numpy.testing.assert_equal(ir_measures.msmarco.calc_aggregate([RR@5], empty, run), {RR@5: float('NaN')})
        numpy.testing.assert_equal(ir_measures.pytrec_eval.calc_aggregate([P@5], empty, run), {P@5: float('NaN')})
        numpy.testing.assert_equal(ir_measures.trectools.calc_aggregate([P@5], empty, run), {P@5: float('NaN')})
        numpy.testing.assert_equal(ir_measures.cwl_eval.calc_aggregate([P@5], empty, run), {P@5: float('NaN')})
        numpy.testing.assert_equal(ir_measures.compat.calc_aggregate([Compat(p=0.8)], empty, run), {Compat(p=0.8): float('NaN')})
        numpy.testing.assert_equal(ir_measures.accuracy.calc_aggregate([Accuracy()], empty, run), {Accuracy(): float('NaN')})

        # run but partial qrels
        numpy.testing.assert_equal(ir_measures.calc_aggregate([P@5], partial_qrels, run), {P@5: 0.6})
        numpy.testing.assert_equal(ir_measures.gdeval.calc_aggregate([ERR@5], partial_qrels, run), {ERR@5: 0.10175})
        numpy.testing.assert_equal(ir_measures.judged.calc_aggregate([Judged@5], partial_qrels, run), {Judged@5: 1.0})
        numpy.testing.assert_equal(ir_measures.msmarco.calc_aggregate([RR@5], partial_qrels, run), {RR@5: 0.5})
        numpy.testing.assert_equal(ir_measures.pytrec_eval.calc_aggregate([P@5], partial_qrels, run), {P@5: 0.6})
        numpy.testing.assert_equal(ir_measures.trectools.calc_aggregate([P@5], partial_qrels, run), {P@5: 0.6})
        numpy.testing.assert_equal(ir_measures.cwl_eval.calc_aggregate([P@5], partial_qrels, run), {P@5: 0.6000000000000001})
        numpy.testing.assert_equal(ir_measures.compat.calc_aggregate([Compat(p=0.8)], partial_qrels, run), {Compat(p=0.8): 0.4744431703672816})
        numpy.testing.assert_equal(ir_measures.accuracy.calc_aggregate([Accuracy()], partial_qrels, run), {Accuracy(): 0.5})

        # both no run and no qrels
        numpy.testing.assert_equal(ir_measures.calc_aggregate([P@5], empty, empty), {P@5: float('NaN')})
        numpy.testing.assert_equal(ir_measures.gdeval.calc_aggregate([ERR@5], empty, empty), {ERR@5: float('NaN')})
        numpy.testing.assert_equal(ir_measures.judged.calc_aggregate([Judged@5], empty, empty), {Judged@5: float('NaN')})
        numpy.testing.assert_equal(ir_measures.msmarco.calc_aggregate([RR@5], empty, empty), {RR@5: float('NaN')})
        numpy.testing.assert_equal(ir_measures.pytrec_eval.calc_aggregate([P@5], empty, empty), {P@5: float('NaN')})
        numpy.testing.assert_equal(ir_measures.trectools.calc_aggregate([P@5], empty, empty), {P@5: float('NaN')})
        numpy.testing.assert_equal(ir_measures.cwl_eval.calc_aggregate([P@5], empty, empty), {P@5: float('NaN')})
        numpy.testing.assert_equal(ir_measures.compat.calc_aggregate([Compat(p=0.8)], empty, empty), {Compat(p=0.8): float('NaN')})
        numpy.testing.assert_equal(ir_measures.accuracy.calc_aggregate([Accuracy()], empty, empty), {Accuracy(): float('NaN')})
Example no. 12
 def iter_calc(self, run):
     # ranx stores per-query scores on the Run object during evaluate()
     self.ranx.evaluate(self.qrels, run, list(self.measure_map))
     for measure, qid_value_map in run.scores.items():
         for query_id, value in qid_value_map.items():
             yield Metric(query_id=query_id, measure=self.measure_map[measure][0], value=value)
     run.scores.clear()  # drop cached scores so later evaluations start clean
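
For context, a direct ranx call populates the same per-query run.scores mapping that iter_calc reads back above; a small sketch (printed value illustrative):

 from ranx import Qrels, Run, evaluate

 qrels = Qrels({'0': {'D0': 1}})
 run = Run({'0': {'D0': 0.9, 'D1': 0.8}})
 evaluate(qrels, run, ['ndcg@5'])
 print(run.scores)  # e.g. {'ndcg@5': {'0': 1.0}}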