Code example #1
                        nonrels.append(nonrels[-1])
                    else:
                        nonrels[-1] += 1

                # Only report if at least one relevant document was retrieved
                if len(nonrels) >= 2:
                    value = 1. - sum(nonrels[:-1]) / (float(nonrels[-1]) * (len(nonrels) - 1))
                    yield Metric(query_id=qid, measure=measure, value=value)
 

class AccuracyProvider(providers.Provider):
    """Accuracy provider"""
    NAME = "accuracy"
    SUPPORTED_MEASURES = [
        _Accuracy(cutoff=Any(), rel=Any()),
    ]

    def _evaluator(self, measures, qrels) -> providers.Evaluator:
        invocations = []
        for measure in ir_measures.util.flatten_measures(measures):
            if measure.NAME == _Accuracy.NAME:
                cutoff = 0 if measure['cutoff'] is NOT_PROVIDED else measure['cutoff']
                invocations.append((measure, cutoff, measure['rel']))
            else:
                raise ValueError(f'unsupported measure {measure}')
        qrels = ir_measures.util.QrelsConverter(qrels).as_dict_of_dict()
        
        return AccuracyEvaluator(measures, qrels, invocations)

providers.register(AccuracyProvider())
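
As a usage sketch: a registered provider is normally driven through the top-level ir_measures helpers rather than instantiated directly. The snippet below assumes the Accuracy measure handled by this provider is exported at the package top level like the other measures; treat that name and the toy qrels/run dicts as illustrative assumptions.

import ir_measures

qrels = {'q1': {'d1': 1, 'd2': 0}}        # query -> doc -> relevance
run   = {'q1': {'d2': 1.3, 'd1': 0.4}}    # query -> doc -> score

# per-query values come from the provider's _iter_calc shown above
for metric in ir_measures.iter_calc([ir_measures.Accuracy], qrels, run):
    print(metric.query_id, metric.measure, metric.value)
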
Code example #2
        self.invocations = invocations

    def _iter_calc(self, run):
        with tempfile.NamedTemporaryFile() as perlf, \
             ir_measures.util.QrelsConverter(self.qrels).as_tmp_file() as qrelsf, \
             ir_measures.util.RunConverter(run).as_tmp_file() as runf:
            perlf_contents = pkgutil.get_data('ir_measures', 'bin/gdeval.pl')
            perlf.write(perlf_contents)
            perlf.flush()
            for cutoff, nDCG_measure, ERR_measure in self.invocations:
                cmd = ['perl', perlf.name, qrelsf.name, runf.name, str(cutoff)]
                output = subprocess.check_output(cmd)
                output = output.decode().replace('\t', ' ').split('\n')
                for i, s in enumerate(output):
                    if s == '' or i == 0:
                        continue
                    arr = s.split(',')
                    assert len(arr) == 4
                    _, qid, ndcg, err = arr
                    if nDCG_measure is not None:
                        yield Metric(query_id=qid,
                                     measure=nDCG_measure,
                                     value=float(ndcg))
                    if ERR_measure is not None:
                        yield Metric(query_id=qid,
                                     measure=ERR_measure,
                                     value=float(err))


providers.register(GdevalProvider())
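
A hedged usage sketch: as far as I know, ERR is only supplied by this gdeval backend, so requesting ERR@20 through the top-level API should route through the subprocess logic above (gdeval.pl needs perl on the PATH; the file names are placeholders).

import ir_measures
from ir_measures import ERR

qrels = ir_measures.read_trec_qrels('qrels.txt')   # placeholder TREC-format path
run = ir_measures.read_trec_run('run.txt')         # placeholder TREC-format path

# the ERR@20 values are parsed out of gdeval.pl's CSV output, as in _iter_calc above
print(ir_measures.calc_aggregate([ERR@20], qrels, run))
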
Code example #3
            else:
                raise ValueError(f'unsupported measure {measure}')
        qrels = ir_measures.util.QrelsConverter(qrels).as_dict_of_dict()
        return JudgedEvaluator(measures, qrels, cutoffs)


class JudgedEvaluator(providers.Evaluator):
    def __init__(self, measures, qrels, cutoffs):
        super().__init__(measures, set(qrels.keys()))
        self.qrels = qrels
        self.cutoffs = cutoffs

    def _iter_calc(self, run):
        run = ir_measures.util.RunConverter(run).as_dict_of_dict()
        sorted_run = {
            q: list(sorted(run[q].items(), key=lambda x: (-x[1], x[0])))
            for q in run
        }
        for qid in run:
            qid_qrels = self.qrels.get(qid)
            if qid_qrels:
                for cutoff, measure in self.cutoffs:
                    judged_c = sum(
                        (did in qid_qrels)
                        for did, _ in sorted_run.get(qid, [])[:cutoff])
                    value = judged_c / cutoff
                    yield Metric(query_id=qid, measure=measure, value=value)


providers.register(JudgedProvider())
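
A small usage sketch for this provider (the toy data is made up): with three retrieved documents of which two appear in the qrels, Judged@3 should come out to 2/3 via the judged_c / cutoff computation above.

import ir_measures
from ir_measures import Judged

qrels = {'q1': {'d1': 1, 'd2': 0}}                  # d3 has no judgment
run   = {'q1': {'d1': 0.9, 'd2': 0.8, 'd3': 0.7}}

print(ir_measures.calc_aggregate([Judged@3], qrels, run))   # expected: roughly 0.667
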
Code example #4
File: cwl_eval.py  Project: terrierteam/ir_measures
    def _irm_convert_to_measure(self, measure):
        if measure.NAME == 'P':
            return PrecisionCWLMetric(measure['cutoff'])
        if measure.NAME == 'RR':
            return RRCWLMetric()
        if measure.NAME == 'AP':
            return APCWLMetric()
        if measure.NAME == 'RBP':
            return RBPCWLMetric(measure['p'])
        if measure.NAME == 'BPM':
            return BPMCWLMetric(measure['T'], measure['cutoff'])
        if measure.NAME == 'NERR8':
            return NERReq8CWLMetric(measure['cutoff'])
        if measure.NAME == 'NERR9':
            return NERReq9CWLMetric(measure['cutoff'])
        if measure.NAME == 'NERR10':
            return NERReq10CWLMetric(measure['p'])
        if measure.NAME == 'NERR11':
            return NERReq11CWLMetric(measure['T'])
        if measure.NAME == 'SDCG':
            return NDCGCWLMetric(measure['cutoff'])
        if measure.NAME == 'INST':
            return INSTCWLMetric(measure['T'])
        if measure.NAME == 'INSQ':
            return INSQCWLMetric(measure['T'])
        raise KeyError(f'measure {measure} not supported')


providers.register(CwlEvalProvider())
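
As a hedged usage sketch of the mapping above: RBP and SDCG are, to my knowledge, only backed by this cwl_eval provider, so requesting them through the top-level API exercises _irm_convert_to_measure (the cwl package must be installed; the parameter values are arbitrary).

import ir_measures
from ir_measures import RBP, SDCG

qrels = {'q1': {'d1': 2, 'd2': 0, 'd3': 1}}
run   = {'q1': {'d1': 1.0, 'd2': 0.9, 'd3': 0.8}}

# RBP(p=0.8) -> RBPCWLMetric(0.8), SDCG@5 -> NDCGCWLMetric(5) per the mapping above
print(ir_measures.calc_aggregate([RBP(p=0.8), SDCG@5], qrels, run))
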
Code example #5
    def _iter_calc(self, run):
        import pandas as pd
        available_qids = set(self.qrels.qrels_data['query'].unique())
        tmp_run = ir_measures.util.RunConverter(run).as_namedtuple_iter()
        tmp_run = pd.DataFrame(tmp_run)
        if len(tmp_run) == 0:
            tmp_run = pd.DataFrame(columns=['query', 'docid', 'score'],
                                   dtype='object')
        else:
            tmp_run = tmp_run.rename(columns={
                'query_id': 'query',
                'doc_id': 'docid',
                'score': 'score'
            })
        tmp_run.sort_values(['query', 'score'],
                            ascending=[True, False],
                            inplace=True)
        run = self.trectools.TrecRun()
        run.run_data = tmp_run
        evaluator = self.trectools.TrecEval(run, self.qrels)
        for invocation, measure in self.invocations:
            for query_id, value in invocation(evaluator).itertuples():
                if query_id in available_qids:
                    yield Metric(query_id=query_id,
                                 measure=measure,
                                 value=value)


providers.register(TrectoolsProvider())
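
For clarity, a sketch of what one entry in self.invocations presumably looks like for this provider: a callable that turns a trectools.TrecEval into a per-query result, paired with the ir_measures measure it maps to. The get_ndcg name and its keyword arguments are recalled from the trectools API and should be treated as assumptions.

from ir_measures import nDCG

# (callable over a trectools.TrecEval, ir_measures measure) -- the callable's
# per-query DataFrame is what .itertuples() iterates over in _iter_calc above
invocation = (lambda ev: ev.get_ndcg(depth=10, per_query=True), nDCG@10)
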
Code example #6
        self.evaluator = pnd.RelevanceEvaluator(qrels,
                                                measure_map.keys(),
                                                relevance_level=rel_level,
                                                alpha=alpha,
                                                beta=beta)
        self.measure_map = measure_map
        self.qid_did_filter = None
        if judged_only:
            self.qid_did_filter = set(
                (qrel.query_id, qrel.doc_id) for qrel in qrels)

    def iter_calc(self, run):
        if self.qid_did_filter is not None:  # used when judged_only
            filtered_run = {}
            for qid in run:
                filtered_run[qid] = {}
                for did, score in run[qid].items():
                    if (qid, did) in self.qid_did_filter:
                        filtered_run[qid][did] = score
            run = filtered_run
        for record in self.evaluator.evaluate_iter(run):
            query_id = record['query_id']
            del record['query_id']
            for measure_str, value in record.items():
                yield Metric(query_id=query_id,
                             measure=self.measure_map[measure_str],
                             value=value)


providers.register(PyNdEvalProvider())
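
A minimal, self-contained illustration of the judged_only filter above: any (query, document) pair that has no qrel entry is dropped from the run before pyndeval sees it.

qid_did_filter = {('q1', 'd1'), ('q1', 'd2')}       # judged pairs
run = {'q1': {'d1': 0.9, 'd3': 0.8}}                # d3 is unjudged

filtered_run = {
    qid: {did: score for did, score in docs.items() if (qid, did) in qid_did_filter}
    for qid, docs in run.items()
}
assert filtered_run == {'q1': {'d1': 0.9}}
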
Code example #7
class PytrecEvalEvaluator(providers.Evaluator):
    def __init__(self, measures, invokers, qrels):
        super().__init__(measures, set(qrels.keys()))
        self.invokers = invokers

    def _iter_calc(self, run):
        # Convert qrels to dict_of_dict (input format used by pytrec_eval)
        run = ir_measures.util.RunConverter(run).as_dict_of_dict()
        for invoker in self.invokers:
            yield from invoker.iter_calc(run)


class PytrecEvalInvoker:
    def __init__(self, pte, qrels, measure_map, rel_level):
        self.evaluator = pte.RelevanceEvaluator(
            qrels, [m for _, m in measure_map.values()],
            relevance_level=rel_level)
        self.measure_map = measure_map

    def iter_calc(self, run):
        result = self.evaluator.evaluate(run)
        for query_id, measures in result.items():
            for measure_str, value in measures.items():
                yield Metric(query_id=query_id,
                             measure=self.measure_map[measure_str][0],
                             value=value)


providers.register(PytrecEvalProvider())
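
The invoker above is a thin wrapper around pytrec_eval; used on its own it looks roughly like this (measure names follow trec_eval's conventions, e.g. map and ndcg_cut_10).

import pytrec_eval

qrels = {'q1': {'d1': 1, 'd2': 0}}
run   = {'q1': {'d1': 0.9, 'd2': 0.8}}

evaluator = pytrec_eval.RelevanceEvaluator(qrels, {'map', 'ndcg_cut_10'})
print(evaluator.evaluate(run))   # {'q1': {'map': ..., 'ndcg_cut_10': ...}}
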
Code example #8
    ideal = [docno for docno in qrels if qrels[docno] > 0]
    ideal.sort(key=lambda docno: run[docno] if docno in run else 0.0,
               reverse=True)
    ideal.sort(key=lambda docno: qrels[docno], reverse=True)
    depth = max(len(ranking), len(ideal))
    score = rbo(ranking, ideal, p, depth)
    if normalize:
        best = rbo(ideal, ideal, p, depth)
        if best > 0.0:
            score = score / best
    return score


class CompatEvaluator(providers.Evaluator):
    def __init__(self, measures, qrels, invocations):
        super().__init__(measures, set(qrels.keys()))
        self.qrels = qrels
        self.invocations = invocations

    def _iter_calc(self, run):
        run = ir_measures.util.RunConverter(run).as_dict_of_dict()
        for measure, p, normalize in self.invocations:
            for qid in run:
                if qid in self.qrels:
                    value = compatibility(self.qrels[qid], run[qid], p,
                                          normalize)
                    yield Metric(query_id=qid, measure=measure, value=value)


providers.register(CompatProvider())
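
The rbo helper is not shown in this excerpt; below is a rough sketch of a truncated rank-biased overlap with the same signature, included only to make the compatibility computation above easier to follow. The project's actual implementation may differ.

def rbo(ranking, ideal, p, depth):
    """Truncated rank-biased overlap of two ranked lists (Webber et al.)."""
    seen_a, seen_b, score = set(), set(), 0.0
    for d in range(depth):
        if d < len(ranking):
            seen_a.add(ranking[d])
        if d < len(ideal):
            seen_b.add(ideal[d])
        # prefix agreement at depth d+1, weighted geometrically by p
        score += (p ** d) * len(seen_a & seen_b) / (d + 1)
    return (1.0 - p) * score
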
Code example #9
            raise RuntimeError('ranx not available (do you need to `pip install ranx`?)', ex)


class RanxEvaluator(providers.Evaluator):
    def __init__(self, ranx, measures, invokers, qrels, qids):
        super().__init__(measures, qids)
        self.ranx = ranx
        self.invokers = invokers

    def _iter_calc(self, run):
        run = self.ranx.Run.from_df(
            ir_measures.util.RunConverter(run).as_pd_dataframe(),
            q_id_col='query_id', doc_id_col='doc_id', score_col='score')
        for invoker in self.invokers:
            yield from invoker.iter_calc(run)


class RanxInvoker:
    def __init__(self, ranx, qrels, measure_map):
        self.ranx = ranx
        self.qrels = qrels
        self.measure_map = measure_map

    def iter_calc(self, run):
        self.ranx.evaluate(self.qrels, run, list(self.measure_map))
        for measure, qid_value_map in run.scores.items():
            for query_id, value in qid_value_map.items():
                yield Metric(query_id=query_id,
                             measure=self.measure_map[measure][0],
                             value=value)
        run.scores.clear()


providers.register(RanxProvider())
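
For reference, a hedged sketch of ranx used directly, independent of this wrapper (the from_dict constructors and metric-name strings are from the ranx documentation as I recall it; verify against the installed version).

from ranx import Qrels, Run, evaluate

qrels = Qrels.from_dict({'q1': {'d1': 1, 'd2': 0}})
run = Run.from_dict({'q1': {'d1': 0.9, 'd2': 0.8}})

print(evaluate(qrels, run, ['ndcg@10', 'mrr']))
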
Code example #10
        query_ids = set()
        self.qrels_by_rel = {rel: {} for _, rel, _ in invocations}
        for qrel in ir_measures.util.QrelsConverter(
                qrels).as_namedtuple_iter():
            query_ids.add(qrel.query_id)
            for rel in self.qrels_by_rel:
                if qrel.relevance >= rel:
                    self.qrels_by_rel[rel].setdefault(qrel.query_id,
                                                      {})[qrel.doc_id] = 1
        super().__init__(measures, query_ids)
        self.invocations = invocations

    def _iter_calc(self, run):
        run = ir_measures.util.RunConverter(run).as_dict_of_dict()
        sorted_run = {
            q: list(sorted(run[q].items(), key=lambda x: (-x[1], x[0])))
            for q in run
        }
        sorted_run = {q: [did for did, _ in v] for q, v in sorted_run.items()}
        for measure, rel, cutoff in self.invocations:
            if cutoff is NOT_PROVIDED:
                cutoff = sys.maxsize
            msmarco_result = msmarco_eval.compute_metrics(
                self.qrels_by_rel[rel], sorted_run, max_rank=cutoff)
            for qid, value in msmarco_result[f'MRR @{cutoff} by query'].items():
                yield Metric(query_id=qid, measure=measure, value=value)


providers.register(MsMarcoProvider())
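
A tiny, self-contained illustration of the qrels_by_rel binarisation done in the constructor above: for a minimum relevance level of 2, only judgments at or above that level are kept, and each surviving entry is stored with a value of 1, matching the dict built above.

qrels = [('q1', 'd1', 2), ('q1', 'd2', 1)]    # (query_id, doc_id, relevance)
rel = 2
qrels_by_rel = {rel: {}}
for qid, did, relevance in qrels:
    if relevance >= rel:
        qrels_by_rel[rel].setdefault(qid, {})[did] = 1
assert qrels_by_rel == {2: {'q1': {'d1': 1}}}
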