Example 1
class GdevalProvider(providers.Provider):
    """
    gdeval
    """
    NAME = 'gdeval'
    SUPPORTED_MEASURES = [
        measures._nDCG(cutoff=Any(required=True),
                       dcg=Choices('exp-log2'),
                       gains=Choices(NOT_PROVIDED)),
        measures._ERR(cutoff=Any(required=True)),
    ]

    def _evaluator(self, measures, qrels):
        MEASURES = ('nDCG', 'ERR')
        cutoffs = {}
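        # group the requested measures by cutoff so that nDCG@k and ERR@k at the
        # same k share a single gdeval invocation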
        for measure in ir_measures.util.flatten_measures(measures):
            if measure.NAME in MEASURES:
                cutoff = measure['cutoff']
                if cutoff not in cutoffs:
                    cutoffs[cutoff] = [None] * len(MEASURES)
                cutoffs[cutoff][MEASURES.index(measure.NAME)] = measure
            else:
                raise ValueError(f'unsupported measure {measure}')
        invocations = [(cutoff, ndcg, err)
                       for cutoff, (ndcg, err) in cutoffs.items()]
        qrels = list(
            ir_measures.util.QrelsConverter(qrels).as_namedtuple_iter())
        return GdevalEvaluator(measures, qrels, invocations)

    def initialize(self):
        try:
            subprocess.check_output(['perl', '--version'])
        except (FileNotFoundError, subprocess.CalledProcessError) as ex:
            raise RuntimeError('perl not available', ex)
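
A provider like this is normally not instantiated directly: measures are parsed and evaluated through the top-level ir_measures API, which routes each measure to a backend that supports it (measures such as ERR may be routed to this gdeval provider). A minimal sketch, with qrels and run literals invented for illustration::

    import ir_measures
    from ir_measures import nDCG, ERR

    qrels = {'q1': {'d1': 2, 'd2': 0}}    # query_id -> doc_id -> relevance
    run = {'q1': {'d1': 1.2, 'd2': 0.8}}  # query_id -> doc_id -> score

    # gdeval requires an explicit cutoff on both measures (see SUPPORTED_MEASURES)
    print(ir_measures.calc_aggregate([nDCG@20, ERR@20], qrels, run))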
Example 2
class PytrecEvalProvider(providers.Provider):
    """
    pytrec_eval

    https://github.com/cvangysel/pytrec_eval

::

    @inproceedings{VanGysel2018pytreceval,
        title={Pytrec\\_eval: An Extremely Fast Python Interface to trec\\_eval},
        author={Van Gysel, Christophe and de Rijke, Maarten},
        publisher={ACM},
        booktitle={SIGIR},
        year={2018},
    }

    """
    NAME = 'pytrec_eval'
    SUPPORTED_MEASURES = [
        measures._P(cutoff=Any(), rel=Any()),
        measures._RR(cutoff=Choices(NOT_PROVIDED), rel=Any()),
        measures._Rprec(rel=Any()),
        measures._AP(cutoff=Any(), rel=Any()),
        measures._nDCG(cutoff=Any(), dcg=Choices('log2'), gains=Any()),
        measures._R(cutoff=Any()),
        measures._Bpref(rel=Any()),
        measures._NumRet(rel=Any()),
        measures._NumQ(),
        measures._NumRel(
            rel=Choices(1)
        ),  # for some reason, relevance_level doesn't flow through to num_rel, so can only support rel=1
        measures._SetAP(rel=Any()),
        measures._SetF(rel=Any(), beta=Any()),
        measures._SetP(rel=Any(), relative=Any()),
        measures._SetR(rel=Any()),
        measures._Success(rel=Any(), cutoff=Any()),
        measures._IPrec(recall=Any()),
        measures._infAP(rel=Any()),
        # Cannot support Judged because software doesn't support negative relevance levels: <https://github.com/cvangysel/pytrec_eval/blob/2362660e02c324df281932cc23ad7efd31cd3957/src/pytrec_eval.cpp#L354>
    ]

    def __init__(self):
        super().__init__()
        self.pytrec_eval = None

    def _evaluator(self, measures, qrels):
        measures = ir_measures.util.flatten_measures(measures)
        # Convert qrels to dict_of_dict (input format used by pytrec_eval)
        qrels = ir_measures.util.QrelsConverter(qrels).as_dict_of_dict()

        # Depending on the measure params, we may need multiple invocations of pytrec_eval
        # (e.g., with different rel_level, since it only supports running with 1 rel_level at a time)
        invokers = self._build_invokers(measures, qrels)
        return PytrecEvalEvaluator(measures, invokers, qrels)

    def _build_invokers(self, measures, qrels):
        invocations = {}
        setf_count = 0
        for measure in measures:
            match_str = None
            if measure.NAME == 'P':
                invocation_key = (measure['rel'], 0, None)
                measure_str = f'P_{measure["cutoff"]}'
            elif measure.NAME == 'RR':
                invocation_key = (measure['rel'], 0, None)
                measure_str = 'recip_rank'
            elif measure.NAME == 'Rprec':
                invocation_key = (measure['rel'], 0, None)
                measure_str = 'Rprec'
            elif measure.NAME == 'AP':
                invocation_key = (measure['rel'], 0, None)
                if measure['cutoff'] is NOT_PROVIDED:
                    measure_str = 'map'
                else:
                    measure_str = f'map_cut_{measure["cutoff"]}'
            elif measure.NAME == 'infAP':
                invocation_key = (measure['rel'], 0, None)
                measure_str = 'infAP'
            elif measure.NAME == 'nDCG':
                if measure['gains'] is NOT_PROVIDED:
                    # Doesn't matter where this goes... Put it in an existing invocation, or just (1, 0, None) if none yet exist
                    if invocations:
                        invocation_key = next(iter(invocations))
                    else:
                        invocation_key = (1, 0, None)
                else:
                    invocation_key = (1, 0, hashabledict(measure['gains']))
                if measure['cutoff'] is NOT_PROVIDED:
                    measure_str = 'ndcg'
                else:
                    measure_str = f'ndcg_cut_{measure["cutoff"]}'
            elif measure.NAME == 'R':
                invocation_key = (measure['rel'], 0, None)
                measure_str = f'recall_{measure["cutoff"]}'
            elif measure.NAME == 'Bpref':
                invocation_key = (measure['rel'], 0, None)
                measure_str = 'bpref'
            elif measure.NAME == 'NumRet':
                if measure['rel'] is NOT_PROVIDED:
                    # Doesn't matter where this goes... Put it in an existing invocation, or just (1, 0, None) if none yet exist
                    if invocations:
                        invocation_key = next(iter(invocations))
                    else:
                        invocation_key = (1, 0, None)
                    measure_str = 'num_ret'
                else:
                    invocation_key = (measure['rel'], 0, None)
                    measure_str = 'num_rel_ret'
            elif measure.NAME == 'NumQ':
                # Doesn't matter where this goes... Put it in an existing invocation, or just (1, 0, None) if none yet exist
                if invocations:
                    invocation_key = next(iter(invocations))
                else:
                    invocation_key = (1, 0, None)
                measure_str = 'num_q'
            elif measure.NAME == 'NumRel':
                invocation_key = (measure['rel'], 0, None)
                measure_str = 'num_rel'
            elif measure.NAME == 'SetAP':
                invocation_key = (measure['rel'], 0, None)
                measure_str = 'set_map'
            elif measure.NAME == 'SetF':
                # set_F is strange (or buggy?) in both trec_eval and pytrec_eval: it only honors
                # the first beta argument it's given, which is why we use the setf_count approach
                # to force a separate invocation per beta. pytrec_eval also always reports the
                # result under the name set_F, so measure_str and match_str differ here.
                invocation_key = (measure['rel'], setf_count, None)
                setf_count += 1
                match_str = 'set_F'
                if measure['beta'] == 1.:
                    measure_str = 'set_F'
                else:
                    measure_str = f'set_F_{measure["beta"]}'
            elif measure.NAME == 'SetP':
                invocation_key = (measure['rel'], 0, None)
                if measure['relative']:
                    measure_str = 'set_relative_P'
                else:
                    measure_str = 'set_P'
            elif measure.NAME == 'SetR':
                invocation_key = (measure['rel'], 0, None)
                measure_str = 'set_recall'
            elif measure.NAME == 'Success':
                invocation_key = (measure['rel'], 0, None)
                measure_str = f'success_{measure["cutoff"]}'
            elif measure.NAME == 'IPrec':
                invocation_key = (measure['rel'], 0, None)
                measure_str = f'iprec_at_recall_{measure["recall"]:.2f}'
            else:
                raise ValueError(f'unsupported measure {measure}')

            if match_str is None:
                match_str = measure_str

            if invocation_key not in invocations:
                invocations[invocation_key] = {}
            invocations[invocation_key][match_str] = (measure, measure_str)

        invokers = []
        for (rel_level, it, gains), measure_map in invocations.items():
            these_qrels = qrels
            if gains is not None:
                # Map the gains
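                # e.g., gains == {0: 0, 1: 0, 2: 1} rewrites each relevance score to its
                # gain; scores missing from the mapping pass through unchanged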
                these_qrels = {
                    qid: {
                        did: gains.get(score, score)
                        for did, score in vals.items()
                    }
                    for qid, vals in these_qrels.items()
                }
            invokers.append(
                PytrecEvalInvoker(self.pytrec_eval, these_qrels, measure_map,
                                  rel_level))

        return invokers

    def initialize(self):
        try:
            import pytrec_eval
            self.pytrec_eval = pytrec_eval
        except ImportError as ex:
            raise RuntimeError('pytrec_eval not available', ex)
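
The invocation grouping above exists because pytrec_eval evaluates with one relevance level (and one gain mapping) at a time. Roughly, each grouped invocation corresponds to a direct pytrec_eval call like the following sketch (the relevance_level keyword is assumed to be available in the installed pytrec_eval version)::

    import pytrec_eval

    qrels = {'q1': {'d1': 2, 'd2': 1, 'd3': 0}}
    run = {'q1': {'d1': 0.9, 'd2': 0.5, 'd3': 0.1}}

    # one evaluator per (rel_level, gains) combination; relevance_level=2 means
    # only documents with relevance >= 2 count as relevant
    evaluator = pytrec_eval.RelevanceEvaluator(qrels, {'map', 'ndcg'},
                                               relevance_level=2)
    print(evaluator.evaluate(run))  # {'q1': {'map': ..., 'ndcg': ...}}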
Example 3
class TrectoolsProvider(providers.Provider):
    """
    trectools

    https://github.com/joaopalotti/trectools

::

    @inproceedings{palotti2019,
       author = {Palotti, Joao and Scells, Harrisen and Zuccon, Guido},
       title = {TrecTools: an open-source Python library for Information Retrieval practitioners involved in TREC-like campaigns},
       series = {SIGIR'19},
       year = {2019},
       location = {Paris, France},
       publisher = {ACM}
    }

    """
    NAME = 'trectools'
    SUPPORTED_MEASURES = [
        measures._P(cutoff=Any(), rel=Choices(1)),
        measures._RR(cutoff=Choices(NOT_PROVIDED), rel=Choices(1)),
        measures._Rprec(rel=Choices(1)),
        measures._AP(cutoff=Any(), rel=Choices(1)),
        measures._nDCG(cutoff=Any(), dcg=Any(), gains=Choices(NOT_PROVIDED)),
        measures._Bpref(rel=Choices(1)),
        measures._RBP(cutoff=Any(), p=Any(), rel=Any()),
        # Other supported metrics: urbp, ubpref, alpha_urbp, geometric_map, unjudged
    ]

    def __init__(self):
        super().__init__()
        self.trectools = None

    def _evaluator(self, measures, qrels):
        import pandas as pd
        measures = ir_measures.util.flatten_measures(measures)
        # Convert qrels to a pandas DataFrame (the input format used by trectools)
        tmp_qrels = ir_measures.util.QrelsConverter(qrels).as_namedtuple_iter()
        tmp_qrels = pd.DataFrame(tmp_qrels)
        if len(tmp_qrels) == 0:
            tmp_qrels = pd.DataFrame(columns=['query', 'docid', 'rel'],
                                     dtype='object')
        else:
            tmp_qrels = tmp_qrels.rename(columns={
                'query_id': 'query',
                'doc_id': 'docid',
                'relevance': 'rel'
            })
        qrels = self.trectools.TrecQrel()
        qrels.qrels_data = tmp_qrels

        invocations = self._build_invocations(measures)

        return TrectoolsEvaluator(measures, qrels, invocations, self.trectools)

    def _build_invocations(self, measures):
        invocations = []
        for measure in measures:

            def depth():
                try:
                    cutoff = measure['cutoff']
                except KeyError:
                    cutoff = NOT_PROVIDED
                if cutoff is NOT_PROVIDED:
                    cutoff = sys.maxsize
                return cutoff

            if measure.NAME == 'P':
                fn = functools.partial(self.trectools.TrecEval.get_precision,
                                       depth=depth(),
                                       per_query=True,
                                       trec_eval=False,
                                       removeUnjudged=False)
            elif measure.NAME == 'RR':
                fn = functools.partial(
                    self.trectools.TrecEval.get_reciprocal_rank,
                    depth=depth(),
                    per_query=True,
                    trec_eval=False,
                    removeUnjudged=False)
            elif measure.NAME == 'Rprec':
                fn = functools.partial(self.trectools.TrecEval.get_rprec,
                                       depth=depth(),
                                       per_query=True,
                                       trec_eval=False,
                                       removeUnjudged=False)
            elif measure.NAME == 'AP':
                fn = functools.partial(self.trectools.TrecEval.get_map,
                                       depth=depth(),
                                       per_query=True,
                                       trec_eval=False)
            elif measure.NAME == 'nDCG':
                te_mode = {'log2': True, 'exp-log2': False}[measure['dcg']]
                # The trec_eval flag has other side effects, namely ordering by score
                # instead of rank; in our setting those are always the same, so it
                # makes no difference here.
                fn = functools.partial(self.trectools.TrecEval.get_ndcg,
                                       depth=depth(),
                                       per_query=True,
                                       trec_eval=te_mode,
                                       removeUnjudged=False)
            elif measure.NAME == 'Bpref':
                fn = functools.partial(self.trectools.TrecEval.get_bpref,
                                       depth=depth(),
                                       per_query=True,
                                       trec_eval=False)
            elif measure.NAME == 'RBP':
                rel = measure['rel']
                if rel is not NOT_PROVIDED:
                    # TODO: how to handle different relevance levels (which would use
                    # binary_topical_relevance=True)? The only apparent way is to modify
                    # the qrels dataframe, so this case is unsupported for now.
                    raise RuntimeError('unsupported')
                fn = lambda ev: self.trectools.TrecEval.get_rbp(
                    ev,
                    p=measure['p'],
                    depth=depth(),
                    per_query=True,
                    binary_topical_relevance=False,
                    average_ties=True,
                    removeUnjudged=False)[0]
            else:
                raise ValueError(f'unsupported measure {measure}')

            invocations.append((fn, measure))

        return invocations

    def initialize(self):
        try:
            import trectools
            self.trectools = trectools
        except ImportError as ex:
            raise RuntimeError('trectools not available', ex)
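
Each invocation built above is just a partially applied trectools.TrecEval getter. Outside the provider, the equivalent direct usage looks roughly like this (file paths are placeholders)::

    from trectools import TrecQrel, TrecRun, TrecEval

    qrels = TrecQrel('qrels.txt')  # placeholder path, TREC qrels format
    run = TrecRun('run.txt')       # placeholder path, TREC run format

    te = TrecEval(run, qrels)
    # per_query=True yields a per-query dataframe, mirroring the partials above
    print(te.get_ndcg(depth=10, per_query=True, trec_eval=False))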
Example 4
class RanxProvider(providers.Provider):
    """
    ranx

    https://amenra.github.io/ranx/

::

    @misc{ranx2021,
      title = {ranx: A Blazing-Fast Python Library for Ranking Evaluation and Comparison},
      author = {Bassani, Elias},
      year = {2021},
      publisher = {GitHub},
      howpublished = {\\url{https://github.com/AmenRa/ranx}},
    }

    """
    NAME = 'ranx'
    SUPPORTED_MEASURES = [
        measures._P(cutoff=Any(), rel=Any()),
        measures._SetP(rel=Any()),
        measures._RR(cutoff=Choices(NOT_PROVIDED), rel=Any()),
        measures._Rprec(rel=Any()),
        measures._AP(cutoff=Any(), rel=Any()),
        measures._nDCG(cutoff=Any(), dcg=Choices('log2', 'exp-log2'), gains=Choices(NOT_PROVIDED)),
        measures._R(cutoff=Any()),
        measures._SetR(rel=Any()),
        measures._NumRet(rel=Any(required=True)),
        measures._Success(cutoff=Any(required=True), rel=Any()),
    ]

    def __init__(self):
        super().__init__()
        self.ranx = None

    def _evaluator(self, measures, qrels):
        measures = ir_measures.util.flatten_measures(measures)
        # Convert qrels to a pandas DataFrame (the input format used by ranx)
        qrels = ir_measures.util.QrelsConverter(qrels).as_pd_dataframe()
        qids = set(qrels['query_id'].unique())

        # Depending on the measure params, we may need multiple invocations of ranx
        # (e.g., with different rel_level, since it only supports running with 1 rel_level at a time)
        invokers = self._build_invokers(measures, qrels)
        return RanxEvaluator(self.ranx, measures, invokers, qrels, qids=qids)

    def _build_invokers(self, measures, qrels):
        invocations = {}
        for measure in measures:
            match_str = None
            if measure.NAME == 'P':
                invocation_key = (measure['rel'], 0)
                measure_str = f'precision@{measure["cutoff"]}'
            elif measure.NAME == 'SetP':
                invocation_key = (measure['rel'], 0)
                measure_str = 'precision'
            elif measure.NAME == 'R':
                invocation_key = (measure['rel'], 0)
                measure_str = f'recall@{measure["cutoff"]}'
            elif measure.NAME == 'SetR':
                invocation_key = (measure['rel'], 0)
                measure_str = 'recall'
            elif measure.NAME == 'RR':
                invocation_key = (measure['rel'], 0)
                if 'cutoff' in measure.params:
                    measure_str = f'mrr@{measure["cutoff"]}'
                else:
                    measure_str = 'mrr'
            elif measure.NAME == 'AP':
                invocation_key = (measure['rel'], 0)
                if 'cutoff' in measure.params:
                    measure_str = f'map@{measure["cutoff"]}'
                else:
                    measure_str = 'map'
            elif measure.NAME == 'Success':
                invocation_key = (measure['rel'], 0)
                measure_str = f'hit_rate@{measure["cutoff"]}'
            elif measure.NAME == 'NumRet':
                invocation_key = (measure['rel'], 0)
                measure_str = 'hits'
            elif measure.NAME == 'nDCG':
                invocation_key = (None, 0)
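                # ranx calls the exponential-gain ('exp-log2') variant ndcg_burges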
                dcg = measure.params.get('dcg', measure.SUPPORTED_PARAMS['dcg'].default)
                name = 'ndcg_burges' if dcg == 'exp-log2' else 'ndcg'
                if 'cutoff' in measure.params:
                    measure_str = f'{name}@{measure["cutoff"]}'
                else:
                    measure_str = name
            elif measure.NAME == 'Rprec':
                invocation_key = (measure['rel'], 0)
                measure_str = 'r-precision'
            else:
                raise ValueError(f'unsupported measure {measure}')

            if match_str is None:
                match_str = measure_str

            if invocation_key not in invocations:
                invocations[invocation_key] = {}
            invocations[invocation_key][match_str] = (measure, measure_str)

        invokers = []
        for (rel_level, it), measure_map in invocations.items():
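            # ranx has no relevance_level parameter, so rel is applied by binarizing
            # the qrels before handing them to ranx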
            if rel_level is not None:
                these_qrels = qrels.assign(relevance=(qrels['relevance'] >= rel_level).astype(int))
            else:
                these_qrels = qrels
            these_qrels = self.ranx.Qrels.from_df(these_qrels, q_id_col='query_id', doc_id_col='doc_id', score_col='relevance')
            invokers.append(RanxInvoker(self.ranx, these_qrels, measure_map))

        return invokers

    def initialize(self):
        try:
            import ranx
            self.ranx = ranx
        except ImportError as ex:
            raise RuntimeError('ranx not available (do you need to `pip install ranx`?)', ex)
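
As with the other backends, each grouped invocation ultimately delegates to ranx's own evaluate function, using the metric strings built in _build_invokers. A minimal direct-usage sketch::

    from ranx import Qrels, Run, evaluate

    qrels = Qrels.from_dict({'q1': {'d1': 2, 'd2': 0}})
    run = Run.from_dict({'q1': {'d1': 0.9, 'd2': 0.4}})

    # metric names follow the same scheme the provider constructs above
    print(evaluate(qrels, run, ['ndcg@10', 'map', 'precision@5']))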