def _iter_calc(self, run):
    with tempfile.NamedTemporaryFile() as perlf, \
         ir_measures.util.QrelsConverter(self.qrels).as_tmp_file() as qrelsf, \
         ir_measures.util.RunConverter(run).as_tmp_file() as runf:
        perlf_contents = pkgutil.get_data('ir_measures', 'bin/gdeval.pl')
        perlf.write(perlf_contents)
        perlf.flush()
        for cutoff, nDCG_measure, ERR_measure in self.invocations:
            cmd = ['perl', perlf.name, qrelsf.name, runf.name, str(cutoff)]
            output = subprocess.check_output(cmd)
            output = output.decode().replace('\t', ' ').split('\n')
            for i, s in enumerate(output):
                if s == '' or i == 0:
                    continue
                arr = s.split(',')
                assert len(arr) == 4
                _, qid, ndcg, err = arr
                if nDCG_measure is not None:
                    yield Metric(query_id=qid, measure=nDCG_measure, value=float(ndcg))
                if ERR_measure is not None:
                    yield Metric(query_id=qid, measure=ERR_measure, value=float(err))
def iter_calc(self, run) -> Iterator['Metric']:
    """Compute the metrics for the run, discarding topics with no relevant documents"""
    run = ir_measures.util.RunConverter(run).as_sorteddict()
    for measure, cutoff, rel in self.invocations:
        for qid, documents in run.items():
            # Get the relevance assessments
            qrel = self.qrels.get(qid, {})
            if len(qrel) == 0:
                continue
            # Count the number of non-relevant documents above each relevant one
            _cutoff = cutoff or len(documents)
            nonrels = [0]
            for _, document in zip(range(_cutoff), documents):
                if qrel.get(document.doc_id, 0) >= rel:
                    nonrels.append(nonrels[-1])
                else:
                    nonrels[-1] += 1
            # Only report if at least one relevant document was retrieved
            if len(nonrels) >= 2:
                value = 1. - sum(nonrels[:-1]) / (float(nonrels[-1]) * (len(nonrels) - 1))
                yield Metric(query_id=qid, measure=measure, value=value)
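# Illustrative only (not part of the provider): a hand trace of the `nonrels`
# bookkeeping above for a ranking N R N R (N = non-relevant, R = relevant).
# nonrels[:-1] holds the number of non-relevant documents above each relevant
# one, and nonrels[-1] is the total number of non-relevant documents retrieved,
# so the yielded value would be 1 - (1 + 2) / (2 * 2) = 0.25.
def _trace_nonrels(rels):
    """rels: booleans, True where the ranked document is relevant."""
    nonrels = [0]
    for is_rel in rels:
        if is_rel:
            nonrels.append(nonrels[-1])
        else:
            nonrels[-1] += 1
    return nonrels

assert _trace_nonrels([False, True, False, True]) == [1, 2, 2]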
def _iter_calc(self, run):
    import pandas as pd
    available_qids = set(self.qrels.qrels_data['query'].unique())
    tmp_run = ir_measures.util.RunConverter(run).as_namedtuple_iter()
    tmp_run = pd.DataFrame(tmp_run)
    if len(tmp_run) == 0:
        tmp_run = pd.DataFrame(columns=['query', 'docid', 'score'], dtype='object')
    else:
        tmp_run = tmp_run.rename(columns={
            'query_id': 'query',
            'doc_id': 'docid',
            'score': 'score',
        })
    tmp_run.sort_values(['query', 'score'], ascending=[True, False], inplace=True)
    run = self.trectools.TrecRun()
    run.run_data = tmp_run
    evaluator = self.trectools.TrecEval(run, self.qrels)
    for invocation, measure in self.invocations:
        for query_id, value in invocation(evaluator).itertuples():
            if query_id in available_qids:
                yield Metric(query_id=query_id, measure=measure, value=value)
def iter_calc(self, run):
    result = self.evaluator.evaluate(run)
    for query_id, measures in result.items():
        for measure_str, value in measures.items():
            yield Metric(query_id=query_id, measure=self.measure_map[measure_str][0], value=value)
def _iter_calc(self, run):
    run = ir_measures.util.RunConverter(run).as_dict_of_dict()
    for measure, p, normalize in self.invocations:
        for qid in run:
            if qid in self.qrels:
                value = compatibility(self.qrels[qid], run[qid], p, normalize)
                yield Metric(query_id=qid, measure=measure, value=value)
def iter_calc(self, run) -> Iterator['Metric']:
    """
    Yields per-topic metrics for this run.
    """
    expected_measure_qids = set(itertools.product(self.measures, self.qrel_qids))
    for metric in self._iter_calc(run):
        expected_measure_qids.discard((metric.measure, metric.query_id))
        yield metric
    # Any (measure, query_id) pair the provider never reported is backfilled
    # with the measure's default value, so every topic in the qrels appears
    # in the output.
    for measure, query_id in sorted(expected_measure_qids, key=lambda x: (str(x[0]), x[1])):
        yield Metric(query_id=query_id, measure=measure, value=measure.DEFAULT)
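# Illustrative only (not the library's API): the backfill pattern above in
# isolation. Any (measure, query_id) pair the provider never yields is emitted
# with the measure's DEFAULT, so every topic in the qrels appears in the output.
# The measure names and default values below are made up for the example.
import itertools

measures = ['M1', 'M2']                        # hypothetical measure names
qrel_qids = {'0', '1'}                         # topics present in the qrels
produced = {('M1', '0'), ('M2', '0')}          # what _iter_calc actually yielded
defaults = {'M1': 0.0, 'M2': 0.0}              # stand-ins for measure.DEFAULT

expected = set(itertools.product(measures, qrel_qids))
backfilled = [(qid, m, defaults[m]) for m, qid in sorted(expected - produced)]
assert backfilled == [('1', 'M1', 0.0), ('1', 'M2', 0.0)]   # topic '1' gets defaults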
def _iter_calc(self, run):
    run = ir_measures.util.RunConverter(run).as_dict_of_dict()
    # Sort each topic's documents by descending score, breaking ties by doc_id
    sorted_run = {q: list(sorted(run[q].items(), key=lambda x: (-x[1], x[0]))) for q in run}
    for qid in run:
        qid_qrels = self.qrels.get(qid)
        if qid_qrels:
            for cutoff, measure in self.cutoffs:
                judged_c = sum((did in qid_qrels) for did, _ in sorted_run.get(qid, [])[:cutoff])
                value = judged_c / cutoff
                yield Metric(query_id=qid, measure=measure, value=value)
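# Illustrative only: Judged@cutoff as computed above is the fraction of the
# top-`cutoff` retrieved documents that have *any* judgment in the qrels,
# including grade 0. Document IDs below are made up for the example.
qid_qrels = {'D0': 1, 'D2': 0}        # D2 is judged non-relevant, but still judged
ranking = ['D0', 'D2', 'D1', 'D3']    # documents sorted by descending score
assert sum(did in qid_qrels for did in ranking[:2]) / 2 == 1.0   # Judged@2
assert sum(did in qid_qrels for did in ranking[:4]) / 4 == 0.5   # Judged@4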
def _iter_calc(self, run):
    run = ir_measures.util.RunConverter(run).as_dict_of_dict()
    sorted_run = {q: list(sorted(run[q].items(), key=lambda x: (-x[1], x[0]))) for q in run}
    sorted_run = {q: [did for did, _ in v] for q, v in sorted_run.items()}
    for measure, rel, cutoff in self.invocations:
        if cutoff is NOT_PROVIDED:
            # No cutoff given: use an effectively unbounded rank limit
            cutoff = sys.maxsize
        msmarco_result = msmarco_eval.compute_metrics(self.qrels_by_rel[rel], sorted_run, max_rank=cutoff)
        for qid, value in msmarco_result[f'MRR @{cutoff} by query'].items():
            yield Metric(query_id=qid, measure=measure, value=value)
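# Illustrative only: with no explicit cutoff, `sys.maxsize` serves as an
# effectively unbounded rank limit, since no real ranking exceeds it.
import sys
assert len(['D0', 'D1', 'D2']) < sys.maxsize   # every document stays within the limit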
def test_nDCG(self):
    qrels = list(ir_measures.read_trec_qrels('''
        0 0 D0 1
        0 0 D1 -1
        0 0 D2 0
        0 0 D3 2
        0 0 D4 0
        1 0 D0 1
        1 0 D3 2
        1 0 D4 -1
        1 0 D5 0
    '''))
    run = list(ir_measures.read_trec_run('''
        0 0 D0 1 0.8 run
        0 0 D2 2 0.7 run
        0 0 D1 3 0.3 run
        0 0 D3 4 0.4 run
        0 0 D4 5 0.1 run
        1 0 D1 1 0.8 run
        1 0 D4 2 0.7 run
        1 0 D3 3 0.3 run
        1 0 D2 4 0.4 run
    '''))
    provider = ir_measures.pytrec_eval

    measure = ir_measures.nDCG
    self.assertMetrics(provider.iter_calc([measure], qrels, run), [
        Metric(query_id='0', measure=measure, value=0.76018),
        Metric(query_id='1', measure=measure, value=0.32739)
    ])
    measure = ir_measures.nDCG @ 3
    self.assertMetrics(provider.iter_calc([measure], qrels, run), [
        Metric(query_id='0', measure=measure, value=0.76018),
        Metric(query_id='1', measure=measure, value=0.0)
    ])
    measure = ir_measures.nDCG(gains={0: 1, 1: 4})
    self.assertMetrics(provider.iter_calc([measure], qrels, run), [
        Metric(query_id='0', measure=measure, value=0.97177),
        Metric(query_id='1', measure=measure, value=0.14949)
    ])
def test_P(self):
    qrels = list(ir_measures.read_trec_qrels('''
        0 0 D0 0
        0 0 D1 1
        0 0 D2 1
        0 0 D3 2
        0 0 D4 0
    '''))
    run = list(ir_measures.read_trec_run('''
        0 0 D0 1 0.8 run
        0 0 D2 2 0.7 run
        0 0 D1 3 0.3 run
        0 0 D3 4 0.4 run
        0 0 D4 5 0.1 run
    '''))

    measure = ir_measures.P @ 5
    result = list(ir_measures.ranx.iter_calc([measure], qrels, run))
    self.assertEqual(result[0], Metric('0', measure, 0.6))
    measure = ir_measures.P(rel=2) @ 5
    result = list(ir_measures.ranx.iter_calc([measure], qrels, run))
    self.assertEqual(result[0], Metric('0', measure, 0.2))

    measure = ir_measures.SetP
    result = list(ir_measures.ranx.iter_calc([measure], qrels, run))
    self.assertEqual(result[0], Metric('0', measure, 0.6))
    measure = ir_measures.SetP(rel=2) @ 5
    result = list(ir_measures.ranx.iter_calc([measure], qrels, run))
    self.assertEqual(result[0], Metric('0', measure, 0.2))

    measure = ir_measures.R @ 5
    result = list(ir_measures.ranx.iter_calc([measure], qrels, run))
    self.assertEqual(result[0], Metric('0', measure, 1.0))
    measure = ir_measures.R(rel=2) @ 5
    result = list(ir_measures.ranx.iter_calc([measure], qrels, run))
    self.assertEqual(result[0], Metric('0', measure, 1.0))
    measure = ir_measures.R @ 2
    result = list(ir_measures.ranx.iter_calc([measure], qrels, run))
    self.assertEqual(result[0], Metric('0', measure, 0.3333333333333333))
    measure = ir_measures.R(rel=2) @ 2
    result = list(ir_measures.ranx.iter_calc([measure], qrels, run))
    self.assertEqual(result[0], Metric('0', measure, 0.0))

    measure = ir_measures.SetR
    result = list(ir_measures.ranx.iter_calc([measure], qrels, run))
    self.assertEqual(result[0], Metric('0', measure, 1.0))
    measure = ir_measures.SetR(rel=2) @ 5
    result = list(ir_measures.ranx.iter_calc([measure], qrels, run))
    self.assertEqual(result[0], Metric('0', measure, 1.0))

    measure = ir_measures.RR
    result = list(ir_measures.ranx.iter_calc([measure], qrels, run))
    self.assertEqual(result[0], Metric('0', measure, 0.5))
    measure = ir_measures.RR(rel=2)
    result = list(ir_measures.ranx.iter_calc([measure], qrels, run))
    self.assertEqual(result[0], Metric('0', measure, 0.3333333333333333))
    measure = ir_measures.RR @ 10
    result = list(ir_measures.ranx.iter_calc([measure], qrels, run))
    self.assertEqual(result[0], Metric('0', measure, 0.5))
    measure = ir_measures.RR(rel=2) @ 10
    result = list(ir_measures.ranx.iter_calc([measure], qrels, run))
    self.assertEqual(result[0], Metric('0', measure, 0.3333333333333333))
    measure = ir_measures.RR @ 2
    result = list(ir_measures.ranx.iter_calc([measure], qrels, run))
    self.assertEqual(result[0], Metric('0', measure, 0.5))
    measure = ir_measures.RR(rel=2) @ 2
    result = list(ir_measures.ranx.iter_calc([measure], qrels, run))
    self.assertEqual(result[0], Metric('0', measure, 0.))

    measure = ir_measures.AP
    result = list(ir_measures.ranx.iter_calc([measure], qrels, run))
    self.assertEqual(result[0], Metric('0', measure, 0.6388888888888888))
    measure = ir_measures.AP(rel=2)
    result = list(ir_measures.ranx.iter_calc([measure], qrels, run))
    self.assertEqual(result[0], Metric('0', measure, 0.3333333333333333))
    measure = ir_measures.AP @ 10
    result = list(ir_measures.ranx.iter_calc([measure], qrels, run))
    self.assertEqual(result[0], Metric('0', measure, 0.6388888888888888))
    measure = ir_measures.AP(rel=2) @ 10
    result = list(ir_measures.ranx.iter_calc([measure], qrels, run))
    self.assertEqual(result[0], Metric('0', measure, 0.3333333333333333))
    measure = ir_measures.AP @ 2
    result = list(ir_measures.ranx.iter_calc([measure], qrels, run))
    self.assertEqual(result[0], Metric('0', measure, 0.16666666666666666))
    measure = ir_measures.AP(rel=2) @ 2
    result = list(ir_measures.ranx.iter_calc([measure], qrels, run))
    self.assertEqual(result[0], Metric('0', measure, 0.0))

    measure = ir_measures.Success @ 10
    result = list(ir_measures.ranx.iter_calc([measure], qrels, run))
    self.assertEqual(result[0], Metric('0', measure, 1.))
    measure = ir_measures.Success(rel=2) @ 10
    result = list(ir_measures.ranx.iter_calc([measure], qrels, run))
    self.assertEqual(result[0], Metric('0', measure, 1.))
    measure = ir_measures.Success @ 2
    result = list(ir_measures.ranx.iter_calc([measure], qrels, run))
    self.assertEqual(result[0], Metric('0', measure, 1.))
    measure = ir_measures.Success(rel=2) @ 2
    result = list(ir_measures.ranx.iter_calc([measure], qrels, run))
    self.assertEqual(result[0], Metric('0', measure, 0.))

    measure = ir_measures.NumRet(rel=1)
    result = list(ir_measures.ranx.iter_calc([measure], qrels, run))
    self.assertEqual(result[0], Metric('0', measure, 3.))
    measure = ir_measures.NumRet(rel=2)
    result = list(ir_measures.ranx.iter_calc([measure], qrels, run))

    measure = ir_measures.nDCG
    result = list(ir_measures.ranx.iter_calc([measure], qrels, run))
    self.assertEqual(result[0], Metric('0', measure, 0.6584645692843067))
    measure = ir_measures.nDCG @ 5
    result = list(ir_measures.ranx.iter_calc([measure], qrels, run))
    self.assertEqual(result[0], Metric('0', measure, 0.6584645692843067))
    measure = ir_measures.nDCG @ 2
    result = list(ir_measures.ranx.iter_calc([measure], qrels, run))
    self.assertEqual(result[0], Metric('0', measure, 0.23981246656813146))
    measure = ir_measures.nDCG(dcg='exp-log2')
    result = list(ir_measures.ranx.iter_calc([measure], qrels, run))
    self.assertEqual(result[0], Metric('0', measure, 0.6201040599710453))
    measure = ir_measures.nDCG(dcg='exp-log2') @ 5
    result = list(ir_measures.ranx.iter_calc([measure], qrels, run))
    self.assertEqual(result[0], Metric('0', measure, 0.6201040599710453))
    measure = ir_measures.nDCG(dcg='exp-log2') @ 2
    result = list(ir_measures.ranx.iter_calc([measure], qrels, run))
    self.assertEqual(result[0], Metric('0', measure, 0.17376534287144002))

    measure = ir_measures.Rprec
    result = list(ir_measures.ranx.iter_calc([measure], qrels, run))
    self.assertEqual(result[0], Metric('0', measure, 0.6666666666666666))
    measure = ir_measures.Rprec(rel=2)
    result = list(ir_measures.ranx.iter_calc([measure], qrels, run))
    self.assertEqual(result[0], Metric('0', measure, 0.))
def test_empty(self):
    qrels = list(ir_measures.read_trec_qrels('''
        0 0 D0 0
        0 0 D1 1
        0 0 D2 1
        0 0 D3 2
        0 0 D4 0
        1 0 D0 1
        1 0 D3 2
        1 0 D5 2
    '''))
    partial_qrels = [q for q in qrels if q.query_id == '0']
    run = list(ir_measures.read_trec_run('''
        0 0 D0 1 0.8 run
        0 0 D2 2 0.7 run
        0 0 D1 3 0.3 run
        0 0 D3 4 0.4 run
        0 0 D4 5 0.1 run
        1 0 D1 1 0.8 run
        1 0 D3 2 0.7 run
        1 0 D4 3 0.3 run
        1 0 D2 4 0.4 run
    '''))
    partial_run = [r for r in run if r.query_id == '0']
    empty = []

    # qrels but no run
    self.assertEqual(set(ir_measures.iter_calc([P@5], qrels, empty)), {Metric('0', P@5, 0.), Metric('1', P@5, 0.)})
    self.assertEqual(set(ir_measures.gdeval.iter_calc([ERR@5], qrels, empty)), {Metric('0', ERR@5, 0.), Metric('1', ERR@5, 0.)})
    self.assertEqual(set(ir_measures.judged.iter_calc([Judged@5], qrels, empty)), {Metric('0', Judged@5, 0.), Metric('1', Judged@5, 0.)})
    self.assertEqual(set(ir_measures.msmarco.iter_calc([RR@5], qrels, empty)), {Metric('0', RR@5, 0.), Metric('1', RR@5, 0.)})
    self.assertEqual(set(ir_measures.pytrec_eval.iter_calc([P@5], qrels, empty)), {Metric('0', P@5, 0.), Metric('1', P@5, 0.)})
    self.assertEqual(set(ir_measures.trectools.iter_calc([P@5], qrels, empty)), {Metric('0', P@5, 0.), Metric('1', P@5, 0.)})
    self.assertEqual(set(ir_measures.cwl_eval.iter_calc([P@5], qrels, empty)), {Metric('0', P@5, 0.0), Metric('1', P@5, 0.0)})
    self.assertEqual(set(ir_measures.compat.iter_calc([Compat(p=0.8)], qrels, empty)), {Metric('0', Compat(p=0.8), 0.0), Metric('1', Compat(p=0.8), 0.0)})
    self.assertEqual(set(ir_measures.accuracy.iter_calc([Accuracy()], qrels, empty)), set())

    # qrels but partial run
    self.assertEqual(set(ir_measures.iter_calc([P@5], qrels, partial_run)), {Metric('0', P@5, 0.6), Metric('1', P@5, 0.)})
    self.assertEqual(set(ir_measures.gdeval.iter_calc([ERR@5], qrels, partial_run)), {Metric('0', ERR@5, 0.10175), Metric('1', ERR@5, 0.)})
    self.assertEqual(set(ir_measures.judged.iter_calc([Judged@5], qrels, partial_run)), {Metric('0', Judged@5, 1.), Metric('1', Judged@5, 0.)})
    self.assertEqual(set(ir_measures.msmarco.iter_calc([RR@5], qrels, partial_run)), {Metric('0', RR@5, 0.5), Metric('1', RR@5, 0.)})
    self.assertEqual(set(ir_measures.pytrec_eval.iter_calc([P@5], qrels, partial_run)), {Metric('0', P@5, 0.6), Metric('1', P@5, 0.)})
    self.assertEqual(set(ir_measures.trectools.iter_calc([P@5], qrels, partial_run)), {Metric('0', P@5, 0.6), Metric('1', P@5, 0.)})
    self.assertEqual(set(ir_measures.cwl_eval.iter_calc([P@5], qrels, partial_run)), {CwlMetric('0', P@5, 0.6000000000000001, 3.0, 1.0, 5.0, 5.0), Metric('1', P@5, 0.0)})
    self.assertEqual(set(ir_measures.compat.iter_calc([Compat(p=0.8)], qrels, partial_run)), {Metric('0', Compat(p=0.8), 0.4744431703672816), Metric('1', Compat(p=0.8), 0.0)})
    self.assertEqual(set(ir_measures.accuracy.iter_calc([Accuracy()], qrels, partial_run)), {Metric('0', Accuracy(), 0.5)})

    # run but no qrels
    self.assertEqual(list(ir_measures.iter_calc([P@5], empty, run)), [])
    self.assertEqual(list(ir_measures.gdeval.iter_calc([ERR@5], empty, run)), [])
    self.assertEqual(list(ir_measures.judged.iter_calc([Judged@5], empty, run)), [])
    self.assertEqual(list(ir_measures.msmarco.iter_calc([RR@5], empty, run)), [])
    self.assertEqual(list(ir_measures.pytrec_eval.iter_calc([P@5], empty, run)), [])
    self.assertEqual(list(ir_measures.trectools.iter_calc([P@5], empty, run)), [])
    self.assertEqual(list(ir_measures.cwl_eval.iter_calc([P@5], empty, run)), [])
    self.assertEqual(list(ir_measures.compat.iter_calc([Compat(p=0.8)], empty, run)), [])
    self.assertEqual(list(ir_measures.accuracy.iter_calc([Accuracy()], empty, run)), [])

    # run but partial qrels
    self.assertEqual(set(ir_measures.iter_calc([P@5], partial_qrels, run)), {Metric('0', P@5, 0.6)})
    self.assertEqual(set(ir_measures.gdeval.iter_calc([ERR@5], partial_qrels, run)), {Metric('0', ERR@5, 0.10175)})
    self.assertEqual(set(ir_measures.judged.iter_calc([Judged@5], partial_qrels, run)), {Metric('0', Judged@5, 1.)})
    self.assertEqual(set(ir_measures.msmarco.iter_calc([RR@5], partial_qrels, run)), {Metric('0', RR@5, 0.5)})
    self.assertEqual(set(ir_measures.pytrec_eval.iter_calc([P@5], partial_qrels, run)), {Metric('0', P@5, 0.6)})
    self.assertEqual(set(ir_measures.trectools.iter_calc([P@5], partial_qrels, run)), {Metric('0', P@5, 0.6)})
    self.assertEqual(set(ir_measures.cwl_eval.iter_calc([P@5], partial_qrels, run)), {CwlMetric('0', P@5, 0.6000000000000001, 3.0, 1.0, 5.0, 5.0)})
    self.assertEqual(set(ir_measures.compat.iter_calc([Compat(p=0.8)], partial_qrels, run)), {Metric('0', Compat(p=0.8), 0.4744431703672816)})
    self.assertEqual(set(ir_measures.accuracy.iter_calc([Accuracy()], partial_qrels, run)), {Metric('0', Accuracy(), 0.5)})

    # both no run and no qrels
    self.assertEqual(list(ir_measures.iter_calc([P@5], empty, empty)), [])
    self.assertEqual(list(ir_measures.gdeval.iter_calc([ERR@5], empty, empty)), [])
    self.assertEqual(list(ir_measures.judged.iter_calc([Judged@5], empty, empty)), [])
    self.assertEqual(list(ir_measures.msmarco.iter_calc([RR@5], empty, empty)), [])
    self.assertEqual(list(ir_measures.pytrec_eval.iter_calc([P@5], empty, empty)), [])
    self.assertEqual(list(ir_measures.trectools.iter_calc([P@5], empty, empty)), [])
    self.assertEqual(list(ir_measures.cwl_eval.iter_calc([P@5], empty, empty)), [])
    self.assertEqual(list(ir_measures.compat.iter_calc([Compat(p=0.8)], empty, empty)), [])
    self.assertEqual(list(ir_measures.accuracy.iter_calc([Accuracy()], empty, empty)), [])

    # qrels but no run
    numpy.testing.assert_equal(ir_measures.calc_aggregate([P@5], qrels, empty), {P@5: 0.})
    numpy.testing.assert_equal(ir_measures.gdeval.calc_aggregate([ERR@5], qrels, empty), {ERR@5: 0.})
    numpy.testing.assert_equal(ir_measures.judged.calc_aggregate([Judged@5], qrels, empty), {Judged@5: 0.})
    numpy.testing.assert_equal(ir_measures.msmarco.calc_aggregate([RR@5], qrels, empty), {RR@5: 0.})
    numpy.testing.assert_equal(ir_measures.pytrec_eval.calc_aggregate([P@5], qrels, empty), {P@5: 0.})
    numpy.testing.assert_equal(ir_measures.trectools.calc_aggregate([P@5], qrels, empty), {P@5: 0.})
    numpy.testing.assert_equal(ir_measures.cwl_eval.calc_aggregate([P@5], qrels, empty), {P@5: 0.})
    numpy.testing.assert_equal(ir_measures.compat.calc_aggregate([Compat(p=0.8)], qrels, empty), {Compat(p=0.8): 0.})
    numpy.testing.assert_equal(ir_measures.accuracy.calc_aggregate([Accuracy()], qrels, empty), {Accuracy(): float('NaN')})

    # qrels but partial run
    numpy.testing.assert_equal(ir_measures.calc_aggregate([P@5], qrels, partial_run), {P@5: 0.3})
    numpy.testing.assert_equal(ir_measures.gdeval.calc_aggregate([ERR@5], qrels, partial_run), {ERR@5: 0.050875})
    numpy.testing.assert_equal(ir_measures.judged.calc_aggregate([Judged@5], qrels, partial_run), {Judged@5: 0.5})
    numpy.testing.assert_equal(ir_measures.msmarco.calc_aggregate([RR@5], qrels, partial_run), {RR@5: 0.25})
    numpy.testing.assert_equal(ir_measures.pytrec_eval.calc_aggregate([P@5], qrels, partial_run), {P@5: 0.3})
    numpy.testing.assert_equal(ir_measures.trectools.calc_aggregate([P@5], qrels, partial_run), {P@5: 0.3})
    numpy.testing.assert_equal(ir_measures.cwl_eval.calc_aggregate([P@5], qrels, partial_run), {P@5: 0.30000000000000004})
    numpy.testing.assert_equal(ir_measures.compat.calc_aggregate([Compat(p=0.8)], qrels, partial_run), {Compat(p=0.8): 0.2372215851836408})
    numpy.testing.assert_equal(ir_measures.accuracy.calc_aggregate([Accuracy()], qrels, partial_run), {Accuracy(): 0.5})

    # run but no qrels
    numpy.testing.assert_equal(ir_measures.calc_aggregate([P@5], empty, run), {P@5: float('NaN')})
    numpy.testing.assert_equal(ir_measures.gdeval.calc_aggregate([ERR@5], empty, run), {ERR@5: float('NaN')})
    numpy.testing.assert_equal(ir_measures.judged.calc_aggregate([Judged@5], empty, run), {Judged@5: float('NaN')})
    numpy.testing.assert_equal(ir_measures.msmarco.calc_aggregate([RR@5], empty, run), {RR@5: float('NaN')})
    numpy.testing.assert_equal(ir_measures.pytrec_eval.calc_aggregate([P@5], empty, run), {P@5: float('NaN')})
    numpy.testing.assert_equal(ir_measures.trectools.calc_aggregate([P@5], empty, run), {P@5: float('NaN')})
    numpy.testing.assert_equal(ir_measures.cwl_eval.calc_aggregate([P@5], empty, run), {P@5: float('NaN')})
    numpy.testing.assert_equal(ir_measures.compat.calc_aggregate([Compat(p=0.8)], empty, run), {Compat(p=0.8): float('NaN')})
    numpy.testing.assert_equal(ir_measures.accuracy.calc_aggregate([Accuracy()], empty, run), {Accuracy(): float('NaN')})

    # run but partial qrels
    numpy.testing.assert_equal(ir_measures.calc_aggregate([P@5], partial_qrels, run), {P@5: 0.6})
    numpy.testing.assert_equal(ir_measures.gdeval.calc_aggregate([ERR@5], partial_qrels, run), {ERR@5: 0.10175})
    numpy.testing.assert_equal(ir_measures.judged.calc_aggregate([Judged@5], partial_qrels, run), {Judged@5: 1.0})
    numpy.testing.assert_equal(ir_measures.msmarco.calc_aggregate([RR@5], partial_qrels, run), {RR@5: 0.5})
    numpy.testing.assert_equal(ir_measures.pytrec_eval.calc_aggregate([P@5], partial_qrels, run), {P@5: 0.6})
    numpy.testing.assert_equal(ir_measures.trectools.calc_aggregate([P@5], partial_qrels, run), {P@5: 0.6})
    numpy.testing.assert_equal(ir_measures.cwl_eval.calc_aggregate([P@5], partial_qrels, run), {P@5: 0.6000000000000001})
    numpy.testing.assert_equal(ir_measures.compat.calc_aggregate([Compat(p=0.8)], partial_qrels, run), {Compat(p=0.8): 0.4744431703672816})
    numpy.testing.assert_equal(ir_measures.accuracy.calc_aggregate([Accuracy()], partial_qrels, run), {Accuracy(): 0.5})

    # both no run and no qrels
    numpy.testing.assert_equal(ir_measures.calc_aggregate([P@5], empty, empty), {P@5: float('NaN')})
    numpy.testing.assert_equal(ir_measures.gdeval.calc_aggregate([ERR@5], empty, empty), {ERR@5: float('NaN')})
    numpy.testing.assert_equal(ir_measures.judged.calc_aggregate([Judged@5], empty, empty), {Judged@5: float('NaN')})
    numpy.testing.assert_equal(ir_measures.msmarco.calc_aggregate([RR@5], empty, empty), {RR@5: float('NaN')})
    numpy.testing.assert_equal(ir_measures.pytrec_eval.calc_aggregate([P@5], empty, empty), {P@5: float('NaN')})
    numpy.testing.assert_equal(ir_measures.trectools.calc_aggregate([P@5], empty, empty), {P@5: float('NaN')})
    numpy.testing.assert_equal(ir_measures.cwl_eval.calc_aggregate([P@5], empty, empty), {P@5: float('NaN')})
    numpy.testing.assert_equal(ir_measures.compat.calc_aggregate([Compat(p=0.8)], empty, empty), {Compat(p=0.8): float('NaN')})
    numpy.testing.assert_equal(ir_measures.accuracy.calc_aggregate([Accuracy()], empty, empty), {Accuracy(): float('NaN')})
def iter_calc(self, run):
    # ranx writes per-query scores onto the Run object itself; evaluate,
    # translate those scores into Metric tuples, then clear them so the
    # Run object can be reused.
    self.ranx.evaluate(self.qrels, run, list(self.measure_map))
    for measure, qid_value_map in run.scores.items():
        for query_id, value in qid_value_map.items():
            yield Metric(query_id=query_id, measure=self.measure_map[measure][0], value=value)
    run.scores.clear()