                        nonrels.append(nonrels[-1])
                    else:
                        nonrels[-1] += 1
                # Only report if at least one relevant document was retrieved
                if len(nonrels) >= 2:
                    value = 1. - sum(nonrels[:-1]) / (float(nonrels[-1]) * (len(nonrels) - 1))
                    yield Metric(query_id=qid, measure=measure, value=value)


class AccuracyProvider(providers.Provider):
    """Accuracy provider"""
    NAME = "accuracy"
    SUPPORTED_MEASURES = [
        _Accuracy(cutoff=Any(), rel=Any()),
    ]

    def _evaluator(self, measures, qrels) -> providers.Evaluator:
        invocations = []
        for measure in ir_measures.util.flatten_measures(measures):
            if measure.NAME == _Accuracy.NAME:
                cutoff = 0 if measure['cutoff'] is NOT_PROVIDED else measure['cutoff']
                invocations.append((measure, cutoff, measure['rel']))
            else:
                raise ValueError(f'unsupported measure {measure}')
        qrels = ir_measures.util.QrelsConverter(qrels).as_dict_of_dict()
        return AccuracyEvaluator(measures, qrels, invocations)


providers.register(AccuracyProvider())
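# Hedged usage sketch (not part of the provider itself): the registration above makes the
# Accuracy measure reachable through the generic ir_measures API. It is assumed here that the
# measure is registered under the name 'Accuracy'; resolving it via parse_measure avoids relying
# on a top-level export. The cutoff of 10 is purely illustrative.
if __name__ == '__main__':
    import ir_measures

    accuracy = ir_measures.parse_measure('Accuracy@10')  # assumed measure name
    qrels = {'q1': {'d1': 1, 'd2': 0, 'd3': 0}}
    run = {'q1': {'d1': 1.5, 'd2': 1.2, 'd3': 0.3}}
    print(ir_measures.calc_aggregate([accuracy], qrels, run))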
        self.invocations = invocations

    def _iter_calc(self, run):
        with tempfile.NamedTemporaryFile() as perlf, \
             ir_measures.util.QrelsConverter(self.qrels).as_tmp_file() as qrelsf, \
             ir_measures.util.RunConverter(run).as_tmp_file() as runf:
            # Write the bundled gdeval.pl script to a temporary file so perl can run it
            perlf_contents = pkgutil.get_data('ir_measures', 'bin/gdeval.pl')
            perlf.write(perlf_contents)
            perlf.flush()
            for cutoff, nDCG_measure, ERR_measure in self.invocations:
                cmd = ['perl', perlf.name, qrelsf.name, runf.name, str(cutoff)]
                output = subprocess.check_output(cmd)
                output = output.decode().replace('\t', ' ').split('\n')
                for i, s in enumerate(output):
                    # Skip blank lines and the CSV header row
                    if s == '' or i == 0:
                        continue
                    arr = s.split(',')
                    assert len(arr) == 4
                    _, qid, ndcg, err = arr
                    if nDCG_measure is not None:
                        yield Metric(query_id=qid, measure=nDCG_measure, value=float(ndcg))
                    if ERR_measure is not None:
                        yield Metric(query_id=qid, measure=ERR_measure, value=float(err))


providers.register(GdevalProvider())
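# Hedged usage sketch (not part of the provider): nDCG@k and ERR@k, the two measures gdeval
# computes, can be requested through the generic ir_measures API. Which registered provider
# actually services them is decided by ir_measures, so the routing is an assumption.
if __name__ == '__main__':
    import ir_measures
    from ir_measures import nDCG, ERR

    qrels = {'q1': {'d1': 2, 'd2': 0}}
    run = {'q1': {'d2': 1.4, 'd1': 1.1}}
    print(ir_measures.calc_aggregate([nDCG@20, ERR@20], qrels, run))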
            else:
                raise ValueError(f'unsupported measure {measure}')
        qrels = ir_measures.util.QrelsConverter(qrels).as_dict_of_dict()
        return JudgedEvaluator(measures, qrels, cutoffs)


class JudgedEvaluator(providers.Evaluator):
    def __init__(self, measures, qrels, cutoffs):
        super().__init__(measures, set(qrels.keys()))
        self.qrels = qrels
        self.cutoffs = cutoffs

    def _iter_calc(self, run):
        run = ir_measures.util.RunConverter(run).as_dict_of_dict()
        # Rank documents by descending score, breaking ties by doc_id
        sorted_run = {q: list(sorted(run[q].items(), key=lambda x: (-x[1], x[0]))) for q in run}
        for qid in run:
            qid_qrels = self.qrels.get(qid)
            if qid_qrels:
                for cutoff, measure in self.cutoffs:
                    judged_c = sum((did in qid_qrels) for did, _ in sorted_run.get(qid, [])[:cutoff])
                    value = judged_c / cutoff
                    yield Metric(query_id=qid, measure=measure, value=value)


providers.register(JudgedProvider())
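# Hedged usage sketch (not part of the provider): Judged@k reports the fraction of the top-k
# results that appear in the qrels at all, which helps spot runs evaluated against shallow
# judgment pools. The dict-of-dict qrels/run formats below are one of the accepted input forms.
if __name__ == '__main__':
    import ir_measures
    from ir_measures import Judged

    qrels = {'q1': {'d1': 1, 'd2': 0}}               # d3 is unjudged
    run = {'q1': {'d1': 0.9, 'd2': 0.5, 'd3': 0.4}}
    print(ir_measures.calc_aggregate([Judged@2, Judged@3], qrels, run))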
    def _irm_convert_to_measure(self, measure):
        if measure.NAME == 'P':
            return PrecisionCWLMetric(measure['cutoff'])
        if measure.NAME == 'RR':
            return RRCWLMetric()
        if measure.NAME == 'AP':
            return APCWLMetric()
        if measure.NAME == 'RBP':
            return RBPCWLMetric(measure['p'])
        if measure.NAME == 'BPM':
            return BPMCWLMetric(measure['T'], measure['cutoff'])
        if measure.NAME == 'NERR8':
            return NERReq8CWLMetric(measure['cutoff'])
        if measure.NAME == 'NERR9':
            return NERReq9CWLMetric(measure['cutoff'])
        if measure.NAME == 'NERR10':
            return NERReq10CWLMetric(measure['p'])
        if measure.NAME == 'NERR11':
            return NERReq11CWLMetric(measure['T'])
        if measure.NAME == 'SDCG':
            return NDCGCWLMetric(measure['cutoff'])
        if measure.NAME == 'INST':
            return INSTCWLMetric(measure['T'])
        if measure.NAME == 'INSQ':
            return INSQCWLMetric(measure['T'])
        raise KeyError(f'measure {measure} not supported')


providers.register(CwlEvalProvider())
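# Hedged usage sketch (not part of the provider): the mapping above translates ir_measures
# measure objects into cwl_eval metrics. A few of those measures requested through the generic
# API; the particular parameters (SDCG@10, RBP(p=0.8), INST(T=2)) are illustrative only, and the
# parameter names follow the lookups in _irm_convert_to_measure.
if __name__ == '__main__':
    import ir_measures
    from ir_measures import SDCG, RBP, INST

    qrels = {'q1': {'d1': 2, 'd2': 1, 'd3': 0}}
    run = {'q1': {'d1': 2.1, 'd3': 1.7, 'd2': 0.4}}
    print(ir_measures.calc_aggregate([SDCG@10, RBP(p=0.8), INST(T=2)], qrels, run))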
    def _iter_calc(self, run):
        import pandas as pd
        available_qids = set(self.qrels.qrels_data['query'].unique())
        tmp_run = ir_measures.util.RunConverter(run).as_namedtuple_iter()
        tmp_run = pd.DataFrame(tmp_run)
        if len(tmp_run) == 0:
            tmp_run = pd.DataFrame(columns=['query', 'docid', 'score'], dtype='object')
        else:
            tmp_run = tmp_run.rename(columns={'query_id': 'query', 'doc_id': 'docid', 'score': 'score'})
        tmp_run.sort_values(['query', 'score'], ascending=[True, False], inplace=True)
        run = self.trectools.TrecRun()
        run.run_data = tmp_run
        evaluator = self.trectools.TrecEval(run, self.qrels)
        for invocation, measure in self.invocations:
            for query_id, value in invocation(evaluator).itertuples():
                if query_id in available_qids:
                    yield Metric(query_id=query_id, measure=measure, value=value)


providers.register(TrectoolsProvider())
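# Hedged usage sketch (not part of the provider): the evaluators above consume qrels and runs
# through ir_measures' converters, so TREC-formatted files can be fed to the same generic API.
# The file paths below are placeholders, and Rprec/P are assumed to resolve to some registered
# provider rather than specifically this one.
if __name__ == '__main__':
    import ir_measures
    from ir_measures import P, Rprec

    qrels = ir_measures.read_trec_qrels('qrels.txt')   # placeholder path
    run = ir_measures.read_trec_run('run.txt')         # placeholder path
    print(ir_measures.calc_aggregate([P@10, Rprec], qrels, run))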
        self.evaluator = pnd.RelevanceEvaluator(
            qrels, measure_map.keys(), relevance_level=rel_level, alpha=alpha, beta=beta)
        self.measure_map = measure_map
        self.qid_did_filter = None
        if judged_only:
            self.qid_did_filter = set((qrel.query_id, qrel.doc_id) for qrel in qrels)

    def iter_calc(self, run):
        if self.qid_did_filter is not None:
            # judged_only: drop unjudged documents from the run before evaluating
            filtered_run = {}
            for qid in run:
                filtered_run[qid] = {}
                for did, score in run[qid].items():
                    if (qid, did) in self.qid_did_filter:
                        filtered_run[qid][did] = score
            run = filtered_run
        for record in self.evaluator.evaluate_iter(run):
            query_id = record['query_id']
            del record['query_id']
            for measure_str, value in record.items():
                yield Metric(query_id=query_id, measure=self.measure_map[measure_str], value=value)


providers.register(PyNdEvalProvider())
class PytrecEvalEvaluator(providers.Evaluator):
    def __init__(self, measures, invokers, qrels):
        super().__init__(measures, set(qrels.keys()))
        self.invokers = invokers

    def _iter_calc(self, run):
        # Convert the run to dict_of_dict (the input format used by pytrec_eval)
        run = ir_measures.util.RunConverter(run).as_dict_of_dict()
        for invoker in self.invokers:
            yield from invoker.iter_calc(run)


class PytrecEvalInvoker:
    def __init__(self, pte, qrels, measure_map, rel_level):
        self.evaluator = pte.RelevanceEvaluator(
            qrels, [m for _, m in measure_map.values()], relevance_level=rel_level)
        self.measure_map = measure_map

    def iter_calc(self, run):
        result = self.evaluator.evaluate(run)
        for query_id, measures in result.items():
            for measure_str, value in measures.items():
                yield Metric(query_id=query_id, measure=self.measure_map[measure_str][0], value=value)


providers.register(PytrecEvalProvider())
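# Hedged usage sketch (not part of the provider): pytrec_eval backs many of the standard
# measures. calc_aggregate returns one value per measure over all queries, while iter_calc
# yields per-query Metric objects like the evaluators above. Routing to this specific provider
# is assumed, not forced.
if __name__ == '__main__':
    import ir_measures
    from ir_measures import AP, nDCG, P

    qrels = {'q1': {'d1': 2, 'd2': 0}, 'q2': {'d3': 1}}
    run = {'q1': {'d1': 1.3, 'd2': 0.9}, 'q2': {'d3': 0.4, 'd4': 0.2}}
    print(ir_measures.calc_aggregate([AP, nDCG@10, P(rel=2)@5], qrels, run))
    for metric in ir_measures.iter_calc([AP], qrels, run):
        print(metric.query_id, metric.measure, metric.value)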
    ideal = [docno for docno in qrels if qrels[docno] > 0]
    # Build the ideal ranking: order by relevance, breaking ties by the run's own scores
    # (Python's sort is stable, so the second sort keeps the first as a tie-breaker)
    ideal.sort(key=lambda docno: run[docno] if docno in run else 0.0, reverse=True)
    ideal.sort(key=lambda docno: qrels[docno], reverse=True)
    depth = max(len(ranking), len(ideal))
    score = rbo(ranking, ideal, p, depth)
    if normalize:
        best = rbo(ideal, ideal, p, depth)
        if best > 0.0:
            score = rbo(ranking, ideal, p, depth) / best
    return score


class CompatEvaluator(providers.Evaluator):
    def __init__(self, measures, qrels, invocations):
        super().__init__(measures, set(qrels.keys()))
        self.qrels = qrels
        self.invocations = invocations

    def _iter_calc(self, run):
        run = ir_measures.util.RunConverter(run).as_dict_of_dict()
        for measure, p, normalize in self.invocations:
            for qid in run:
                if qid in self.qrels:
                    value = compatibility(self.qrels[qid], run[qid], p, normalize)
                    yield Metric(query_id=qid, measure=measure, value=value)


providers.register(CompatProvider())
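# Hedged usage sketch (not part of the provider): Compat is the compatibility measure computed
# above (RBO between the run and an ideal ranking). That it is exposed as ir_measures.Compat
# with a p parameter is assumed from the (measure, p, normalize) invocations unpacked in
# _iter_calc; the value p=0.95 is illustrative.
if __name__ == '__main__':
    import ir_measures
    from ir_measures import Compat

    qrels = {'q1': {'d1': 2, 'd2': 1, 'd3': 0}}
    run = {'q1': {'d2': 0.8, 'd1': 0.7, 'd3': 0.1}}
    print(ir_measures.calc_aggregate([Compat(p=0.95)], qrels, run))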
            raise RuntimeError('ranx not available (do you need to `pip install ranx`?)', ex)


class RanxEvaluator(providers.Evaluator):
    def __init__(self, ranx, measures, invokers, qrels, qids):
        super().__init__(measures, qids)
        self.ranx = ranx
        self.invokers = invokers

    def _iter_calc(self, run):
        run = self.ranx.Run.from_df(
            ir_measures.util.RunConverter(run).as_pd_dataframe(),
            q_id_col='query_id',
            doc_id_col='doc_id',
            score_col='score')
        for invoker in self.invokers:
            yield from invoker.iter_calc(run)


class RanxInvoker:
    def __init__(self, ranx, qrels, measure_map):
        self.ranx = ranx
        self.qrels = qrels
        self.measure_map = measure_map

    def iter_calc(self, run):
        # ranx stores per-query results on the Run object; evaluate, read them off, then clear
        self.ranx.evaluate(self.qrels, run, list(self.measure_map))
        for measure, qid_value_map in run.scores.items():
            for query_id, value in qid_value_map.items():
                yield Metric(query_id=query_id, measure=self.measure_map[measure][0], value=value)
        run.scores.clear()


providers.register(RanxProvider())
        query_ids = set()
        # Binarize the qrels at each requested relevance level so MS MARCO-style MRR can be
        # computed against different thresholds
        self.qrels_by_rel = {rel: {} for _, rel, _ in invocations}
        for qrel in ir_measures.util.QrelsConverter(qrels).as_namedtuple_iter():
            query_ids.add(qrel.query_id)
            for rel in self.qrels_by_rel:
                if qrel.relevance >= rel:
                    self.qrels_by_rel[rel].setdefault(qrel.query_id, {})[qrel.doc_id] = 1
        super().__init__(measures, query_ids)
        self.invocations = invocations

    def _iter_calc(self, run):
        run = ir_measures.util.RunConverter(run).as_dict_of_dict()
        # Rank documents by descending score, breaking ties by doc_id
        sorted_run = {q: list(sorted(run[q].items(), key=lambda x: (-x[1], x[0]))) for q in run}
        sorted_run = {q: [did for did, _ in v] for q, v in sorted_run.items()}
        for measure, rel, cutoff in self.invocations:
            if cutoff is NOT_PROVIDED:
                cutoff = sys.maxsize
            msmarco_result = msmarco_eval.compute_metrics(self.qrels_by_rel[rel], sorted_run, max_rank=cutoff)
            for qid, value in msmarco_result[f'MRR @{cutoff} by query'].items():
                yield Metric(query_id=qid, measure=measure, value=value)


providers.register(MsMarcoProvider())
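# Hedged usage sketch (not part of the provider): MS MARCO-style MRR is RR with a cutoff and,
# optionally, a minimum relevance level. Per-query values come from iter_calc; which registered
# provider services RR is chosen by ir_measures, so the routing here is an assumption.
if __name__ == '__main__':
    import ir_measures
    from ir_measures import RR

    qrels = {'q1': {'d1': 2, 'd2': 1}}
    run = {'q1': {'d2': 0.9, 'd1': 0.8}}
    print(ir_measures.calc_aggregate([RR@10, RR(rel=2)@10], qrels, run))
    for metric in ir_measures.iter_calc([RR(rel=2)@10], qrels, run):
        print(metric.query_id, metric.value)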