class MsMarcoProvider(providers.Provider):
    """
    MS MARCO's implementation of RR
    """
    NAME = 'msmarco'
    SUPPORTED_MEASURES = [
        measures._RR(cutoff=Any(), rel=Any()),
    ]

    def _evaluator(self, measures, qrels):
        measures = ir_measures.util.flatten_measures(measures)
        invocations = []
        for measure in measures:
            if measure.NAME == 'RR':
                invocations.append((measure, measure['rel'], measure['cutoff']))
            else:
                raise ValueError(f'unsupported measure {measure}')
        return MsMarcoEvaluator(measures, qrels, invocations)
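# A usage sketch (illustrative, not part of the module): providers registered with
# ir_measures are normally reached through the top-level API rather than instantiated
# directly. Assuming TREC-formatted qrels/run files, this provider would be used as:
#
#   import ir_measures
#   from ir_measures import RR
#   qrels = ir_measures.read_trec_qrels('qrels.txt')
#   run = ir_measures.read_trec_run('run.txt')
#   ir_measures.msmarco.calc_aggregate([RR@10], qrels, run)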
class TrectoolsProvider(providers.Provider):
    """
    trectools

    https://github.com/joaopalotti/trectools

    ::

        @inproceedings{palotti2019,
            author = {Palotti, Joao and Scells, Harrisen and Zuccon, Guido},
            title = {TrecTools: an open-source Python library for Information Retrieval practitioners involved in TREC-like campaigns},
            series = {SIGIR'19},
            year = {2019},
            location = {Paris, France},
            publisher = {ACM}
        }
    """
    NAME = 'trectools'
    SUPPORTED_MEASURES = [
        measures._P(cutoff=Any(), rel=Choices(1)),
        measures._RR(cutoff=Choices(NOT_PROVIDED), rel=Choices(1)),
        measures._Rprec(rel=Choices(1)),
        measures._AP(cutoff=Any(), rel=Choices(1)),
        measures._nDCG(cutoff=Any(), dcg=Any(), gains=Choices(NOT_PROVIDED)),
        measures._Bpref(rel=Choices(1)),
        measures._RBP(cutoff=Any(), p=Any(), rel=Any()),
        # Other supported metrics: urbp, ubpref, alpha_urbp, geometric_map, unjudged
    ]

    def __init__(self):
        super().__init__()
        self.trectools = None

    def _evaluator(self, measures, qrels):
        import pandas as pd
        measures = ir_measures.util.flatten_measures(measures)
        # Convert qrels to a pandas DataFrame (the input format used by trectools)
        tmp_qrels = ir_measures.util.QrelsConverter(qrels).as_namedtuple_iter()
        tmp_qrels = pd.DataFrame(tmp_qrels)
        if len(tmp_qrels) == 0:
            tmp_qrels = pd.DataFrame(columns=['query', 'docid', 'rel'], dtype='object')
        else:
            tmp_qrels = tmp_qrels.rename(columns={'query_id': 'query', 'doc_id': 'docid', 'relevance': 'rel'})
        qrels = self.trectools.TrecQrel()
        qrels.qrels_data = tmp_qrels
        invocations = self._build_invocations(measures)
        return TrectoolsEvaluator(measures, qrels, invocations, self.trectools)

    def _build_invocations(self, measures):
        invocations = []
        for measure in measures:
            def depth():
                try:
                    cutoff = measure['cutoff']
                except KeyError:
                    cutoff = NOT_PROVIDED
                if cutoff is NOT_PROVIDED:
                    cutoff = sys.maxsize
                return cutoff
            if measure.NAME == 'P':
                fn = functools.partial(self.trectools.TrecEval.get_precision, depth=depth(), per_query=True, trec_eval=False, removeUnjudged=False)
            elif measure.NAME == 'RR':
                fn = functools.partial(self.trectools.TrecEval.get_reciprocal_rank, depth=depth(), per_query=True, trec_eval=False, removeUnjudged=False)
            elif measure.NAME == 'Rprec':
                fn = functools.partial(self.trectools.TrecEval.get_rprec, depth=depth(), per_query=True, trec_eval=False, removeUnjudged=False)
            elif measure.NAME == 'AP':
                fn = functools.partial(self.trectools.TrecEval.get_map, depth=depth(), per_query=True, trec_eval=False)
            elif measure.NAME == 'nDCG':
                te_mode = {'log2': True, 'exp-log2': False}[measure['dcg']]
                # trec_eval mode has other side effects, namely ordering by score instead of
                # rank. In our setting those are always the same, so there is no difference.
                fn = functools.partial(self.trectools.TrecEval.get_ndcg, depth=depth(), per_query=True, trec_eval=te_mode, removeUnjudged=False)
            elif measure.NAME == 'Bpref':
                fn = functools.partial(self.trectools.TrecEval.get_bpref, depth=depth(), per_query=True, trec_eval=False)
            elif measure.NAME == 'RBP':
                rel = measure['rel']
                if rel is not NOT_PROVIDED:
                    # TODO: how to handle different relevance levels? The only way appears to be
                    # modifying the dataframe. The assignment below is unreachable until then.
                    raise RuntimeError('unsupported')
                    fn = lambda ev, p=measure['p'], depth=depth(): self.trectools.TrecEval.get_rbp(ev, p=p, depth=depth, per_query=True, binary_topical_relevance=True, average_ties=True, removeUnjudged=False)[0]
                else:
                    # Bind p and depth via default arguments so the closure does not pick up
                    # values from a later loop iteration.
                    fn = lambda ev, p=measure['p'], depth=depth(): self.trectools.TrecEval.get_rbp(ev, p=p, depth=depth, per_query=True, binary_topical_relevance=False, average_ties=True, removeUnjudged=False)[0]
            else:
                raise ValueError(f'unsupported measure {measure}')
            invocations.append((fn, measure))
        return invocations

    def initialize(self):
        try:
            import trectools
            self.trectools = trectools
        except ImportError as ex:
            raise RuntimeError('trectools not available', ex)
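# A usage sketch (illustrative, assuming `qrels` and `run` iterables in ir_measures
# formats). Note that this provider only binarises at rel=1 for most measures (see
# SUPPORTED_MEASURES above):
#
#   import ir_measures
#   from ir_measures import P, nDCG
#   ir_measures.trectools.calc_aggregate([P@10, nDCG@20], qrels, run)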
class CwlEvalProvider(providers.Provider):
    """
    cwl_eval, providing C/W/L ("cool") framework measures.

    https://github.com/ireval/cwl

    ::

        @inproceedings{azzopardi2019cwl,
            author = {Azzopardi, Leif and Thomas, Paul and Moffat, Alistair},
            title = {cwl\\_eval: An Evaluation Tool for Information Retrieval},
            booktitle = {SIGIR},
            year = {2019}
        }
    """
    NAME = 'cwl_eval'
    SUPPORTED_MEASURES = [
        measures._P(cutoff=Any(), rel=Any()),
        measures._RR(cutoff=Choices(NOT_PROVIDED), rel=Any()),
        measures._AP(cutoff=Choices(NOT_PROVIDED), rel=Any()),
        measures._RBP(cutoff=Choices(NOT_PROVIDED), rel=Any(required=True), p=Any()),
        measures._BPM(cutoff=Any(), T=Any(), min_rel=Any(), max_rel=Any(required=True)),
        measures._SDCG(cutoff=Any(required=True), dcg=Choices('log2'), min_rel=Any(), max_rel=Any(required=True)),
        measures._NERR8(cutoff=Any(required=True), min_rel=Any(), max_rel=Any(required=True)),
        measures._NERR9(cutoff=Any(required=True), min_rel=Any(), max_rel=Any(required=True)),
        measures._NERR10(p=Any(), min_rel=Any(), max_rel=Any(required=True)),
        measures._NERR11(T=Any(), min_rel=Any(), max_rel=Any(required=True)),
        measures._INST(T=Any(), min_rel=Any(), max_rel=Any(required=True)),
        measures._INSQ(T=Any(), min_rel=Any(), max_rel=Any(required=True)),
    ]

    def _evaluator(self, measures, qrels):
        invocations = {}
        measures = ir_measures.util.flatten_measures(measures)
        for measure in measures:
            # Group measures by how they interpret the qrels: binarised at a relevance
            # cutoff (P, RR, AP, RBP) or graded between min_rel and max_rel (the rest).
            if measure.NAME in ('P', 'RR', 'AP', 'RBP'):
                inv_key = (measure['rel'], None, None)
            elif measure.NAME in ('BPM', 'SDCG', 'NERR8', 'NERR9', 'NERR10', 'NERR11', 'INST', 'INSQ'):
                inv_key = (None, measure['min_rel'], measure['max_rel'])
            else:
                raise ValueError(f'unsupported measure {measure}')
            if inv_key not in invocations:
                invocations[inv_key] = []
            invocations[inv_key].append(measure)
        return CwlEvaluator(measures, qrels, invocations)

    def initialize(self):
        # disable the cwl logger (which writes to cwl.log)
        cwl_logger = logging.getLogger('cwl')
        cwl_logger.disabled = True
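# A usage sketch (illustrative): C/W/L measures generally need the gain range declared
# up front (max_rel is required above). Assuming `qrels` and `run` in ir_measures
# formats, and that the measure parameter syntax below parses as expected:
#
#   import ir_measures
#   from ir_measures import SDCG, INST
#   ir_measures.cwl_eval.calc_aggregate([SDCG(max_rel=2)@10, INST(T=1, max_rel=2)], qrels, run)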
class PytrecEvalProvider(providers.Provider):
    """
    pytrec_eval

    https://github.com/cvangysel/pytrec_eval

    ::

        @inproceedings{VanGysel2018pytreceval,
            title={Pytrec\\_eval: An Extremely Fast Python Interface to trec\\_eval},
            author={Van Gysel, Christophe and de Rijke, Maarten},
            publisher={ACM},
            booktitle={SIGIR},
            year={2018},
        }
    """
    NAME = 'pytrec_eval'
    SUPPORTED_MEASURES = [
        measures._P(cutoff=Any(), rel=Any()),
        measures._RR(cutoff=Choices(NOT_PROVIDED), rel=Any()),
        measures._Rprec(rel=Any()),
        measures._AP(cutoff=Any(), rel=Any()),
        measures._nDCG(cutoff=Any(), dcg=Choices('log2'), gains=Any()),
        measures._R(cutoff=Any()),
        measures._Bpref(rel=Any()),
        measures._NumRet(rel=Any()),
        measures._NumQ(),
        # For some reason, relevance_level doesn't flow through to num_rel, so we can only support rel=1
        measures._NumRel(rel=Choices(1)),
        measures._SetAP(rel=Any()),
        measures._SetF(rel=Any(), beta=Any()),
        measures._SetP(rel=Any(), relative=Any()),
        measures._SetR(rel=Any()),
        measures._Success(rel=Any(), cutoff=Any()),
        measures._IPrec(recall=Any()),
        measures._infAP(rel=Any()),
        # Cannot support Judged because the software doesn't support negative relevance levels:
        # <https://github.com/cvangysel/pytrec_eval/blob/2362660e02c324df281932cc23ad7efd31cd3957/src/pytrec_eval.cpp#L354>
    ]

    def __init__(self):
        super().__init__()
        self.pytrec_eval = None

    def _evaluator(self, measures, qrels):
        measures = ir_measures.util.flatten_measures(measures)
        # Convert qrels to dict_of_dict (the input format used by pytrec_eval)
        qrels = ir_measures.util.QrelsConverter(qrels).as_dict_of_dict()
        # Depending on the measure params, we may need multiple invocations of pytrec_eval
        # (e.g., with different rel_level, since it only supports one rel_level at a time)
        invokers = self._build_invokers(measures, qrels)
        return PytrecEvalEvaluator(measures, invokers, qrels)

    def _build_invokers(self, measures, qrels):
        # Invocation keys are (rel_level, iteration, gains) tuples; measures that share a
        # key can be computed in the same pytrec_eval invocation.
        invocations = {}
        setf_count = 0
        for measure in measures:
            match_str = None
            if measure.NAME == 'P':
                invocation_key = (measure['rel'], 0, None)
                measure_str = f'P_{measure["cutoff"]}'
            elif measure.NAME == 'RR':
                invocation_key = (measure['rel'], 0, None)
                measure_str = 'recip_rank'
            elif measure.NAME == 'Rprec':
                invocation_key = (measure['rel'], 0, None)
                measure_str = 'Rprec'
            elif measure.NAME == 'AP':
                invocation_key = (measure['rel'], 0, None)
                if measure['cutoff'] is NOT_PROVIDED:
                    measure_str = 'map'
                else:
                    measure_str = f'map_cut_{measure["cutoff"]}'
            elif measure.NAME == 'infAP':
                invocation_key = (measure['rel'], 0, None)
                measure_str = 'infAP'
            elif measure.NAME == 'nDCG':
                if measure['gains'] is NOT_PROVIDED:
                    # Doesn't matter where this goes... Put it in an existing invocation, or just (1, 0, None) if none yet exist
                    if invocations:
                        invocation_key = next(iter(invocations))
                    else:
                        invocation_key = (1, 0, None)
                else:
                    invocation_key = (1, 0, hashabledict(measure['gains']))
                if measure['cutoff'] is NOT_PROVIDED:
                    measure_str = 'ndcg'
                else:
                    measure_str = f'ndcg_cut_{measure["cutoff"]}'
            elif measure.NAME == 'R':
                invocation_key = (measure['rel'], 0, None)
                measure_str = f'recall_{measure["cutoff"]}'
            elif measure.NAME == 'Bpref':
                invocation_key = (measure['rel'], 0, None)
                measure_str = 'bpref'
            elif measure.NAME == 'NumRet':
                if measure['rel'] is NOT_PROVIDED:
                    # Doesn't matter where this goes... Put it in an existing invocation, or just (1, 0, None) if none yet exist
                    if invocations:
                        invocation_key = next(iter(invocations))
                    else:
                        invocation_key = (1, 0, None)
                    measure_str = 'num_ret'
                else:
                    invocation_key = (measure['rel'], 0, None)
                    measure_str = 'num_rel_ret'
            elif measure.NAME == 'NumQ':
                # Doesn't matter where this goes... Put it in an existing invocation, or just (1, 0, None) if none yet exist
                if invocations:
                    invocation_key = next(iter(invocations))
                else:
                    invocation_key = (1, 0, None)
                measure_str = 'num_q'
            elif measure.NAME == 'NumRel':
                invocation_key = (measure['rel'], 0, None)
                measure_str = 'num_rel'
            elif measure.NAME == 'SetAP':
                invocation_key = (measure['rel'], 0, None)
                measure_str = 'set_map'
            elif measure.NAME == 'SetF':
                # set_F is strange (or buggy?) in both trec_eval and pytrec_eval. It only accepts
                # the first beta argument it's given, which is why we use the setf_count approach
                # to handle multiple invocations. It is also always reported under the name set_F
                # by pytrec_eval, so we need different measure_str and match_str here.
                invocation_key = (measure['rel'], setf_count, None)
                setf_count += 1
                match_str = 'set_F'
                if measure['beta'] == 1.:
                    measure_str = 'set_F'
                else:
                    measure_str = f'set_F_{measure["beta"]}'
            elif measure.NAME == 'SetP':
                invocation_key = (measure['rel'], 0, None)
                if measure['relative']:
                    measure_str = 'set_relative_P'
                else:
                    measure_str = 'set_P'
            elif measure.NAME == 'SetR':
                invocation_key = (measure['rel'], 0, None)
                measure_str = 'set_recall'
            elif measure.NAME == 'Success':
                invocation_key = (measure['rel'], 0, None)
                measure_str = f'success_{measure["cutoff"]}'
            elif measure.NAME == 'IPrec':
                invocation_key = (measure['rel'], 0, None)
                measure_str = f'iprec_at_recall_{measure["recall"]:.2f}'
            else:
                raise ValueError(f'unsupported measure {measure}')
            if match_str is None:
                match_str = measure_str
            if invocation_key not in invocations:
                invocations[invocation_key] = {}
            invocations[invocation_key][match_str] = (measure, measure_str)
        invokers = []
        for (rel_level, it, gains), measure_map in invocations.items():
            these_qrels = qrels
            if gains is not None:
                # Map the gains
                these_qrels = {
                    qid: {did: gains.get(score, score) for did, score in vals.items()}
                    for qid, vals in these_qrels.items()
                }
            invokers.append(PytrecEvalInvoker(self.pytrec_eval, these_qrels, measure_map, rel_level))
        return invokers

    def initialize(self):
        try:
            import pytrec_eval
            self.pytrec_eval = pytrec_eval
        except ImportError as ex:
            raise RuntimeError('pytrec_eval not available', ex)
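# A usage sketch (illustrative): an evaluator can be built once from the qrels and then
# reused across runs, which amortises the qrels conversion. Assuming `qrels` and several
# `runs` in ir_measures formats:
#
#   import ir_measures
#   from ir_measures import AP, nDCG
#   evaluator = ir_measures.pytrec_eval.evaluator([AP, nDCG@10], qrels)
#   for run in runs:
#       print(evaluator.calc_aggregate(run))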
class RanxProvider(providers.Provider):
    """
    ranx

    https://amenra.github.io/ranx/

    ::

        @misc{ranx2021,
            title = {ranx: A Blazing-Fast Python Library for Ranking Evaluation and Comparison},
            author = {Bassani, Elias},
            year = {2021},
            publisher = {GitHub},
            howpublished = {\\url{https://github.com/AmenRa/ranx}},
        }
    """
    NAME = 'ranx'
    SUPPORTED_MEASURES = [
        measures._P(cutoff=Any(), rel=Any()),
        measures._SetP(rel=Any()),
        measures._RR(cutoff=Choices(NOT_PROVIDED), rel=Any()),
        measures._Rprec(rel=Any()),
        measures._AP(cutoff=Any(), rel=Any()),
        measures._nDCG(cutoff=Any(), dcg=Choices('log2', 'exp-log2'), gains=Choices(NOT_PROVIDED)),
        measures._R(cutoff=Any()),
        measures._SetR(rel=Any()),
        measures._NumRet(rel=Any(required=True)),
        measures._Success(cutoff=Any(required=True), rel=Any()),
    ]

    def __init__(self):
        super().__init__()
        self.ranx = None

    def _evaluator(self, measures, qrels):
        measures = ir_measures.util.flatten_measures(measures)
        # Convert qrels to a pandas DataFrame (the input format used by ranx)
        qrels = ir_measures.util.QrelsConverter(qrels).as_pd_dataframe()
        qids = set(qrels['query_id'].unique())
        # Depending on the measure params, we may need multiple invocations of ranx
        # (e.g., with different rel_level, since it only supports one rel_level at a time)
        invokers = self._build_invokers(measures, qrels)
        return RanxEvaluator(self.ranx, measures, invokers, qrels, qids=qids)

    def _build_invokers(self, measures, qrels):
        invocations = {}
        for measure in measures:
            match_str = None
            if measure.NAME == 'P':
                invocation_key = (measure['rel'], 0)
                measure_str = f'precision@{measure["cutoff"]}'
            elif measure.NAME == 'SetP':
                invocation_key = (measure['rel'], 0)
                measure_str = 'precision'
            elif measure.NAME == 'R':
                invocation_key = (measure['rel'], 0)
                measure_str = f'recall@{measure["cutoff"]}'
            elif measure.NAME == 'SetR':
                invocation_key = (measure['rel'], 0)
                measure_str = 'recall'
            elif measure.NAME == 'RR':
                invocation_key = (measure['rel'], 0)
                if 'cutoff' in measure.params:
                    measure_str = f'mrr@{measure["cutoff"]}'
                else:
                    measure_str = 'mrr'
            elif measure.NAME == 'AP':
                invocation_key = (measure['rel'], 0)
                if 'cutoff' in measure.params:
                    measure_str = f'map@{measure["cutoff"]}'
                else:
                    measure_str = 'map'
            elif measure.NAME == 'Success':
                invocation_key = (measure['rel'], 0)
                measure_str = f'hit_rate@{measure["cutoff"]}'
            elif measure.NAME == 'NumRet':
                invocation_key = (measure['rel'], 0)
                measure_str = 'hits'
            elif measure.NAME == 'nDCG':
                invocation_key = (None, 0)
                name = 'ndcg_burges' if measure.params.get('dcg', measure.SUPPORTED_PARAMS['dcg'].default) == 'exp-log2' else 'ndcg'
                if 'cutoff' in measure.params:
                    measure_str = f'{name}@{measure["cutoff"]}'
                else:
                    measure_str = name
            elif measure.NAME == 'Rprec':
                invocation_key = (measure['rel'], 0)
                measure_str = 'r-precision'
            else:
                raise ValueError(f'unsupported measure {measure}')
            if match_str is None:
                match_str = measure_str
            if invocation_key not in invocations:
                invocations[invocation_key] = {}
            invocations[invocation_key][match_str] = (measure, measure_str)
        invokers = []
        for (rel_level, it), measure_map in invocations.items():
            if rel_level is not None:
                # Binarise the qrels at this relevance level, since ranx only supports one
                # relevance threshold per invocation.
                these_qrels = qrels.assign(relevance=(qrels['relevance'] >= rel_level).astype(int))
            else:
                these_qrels = qrels
            these_qrels = self.ranx.Qrels.from_df(these_qrels, q_id_col='query_id', doc_id_col='doc_id', score_col='relevance')
            invokers.append(RanxInvoker(self.ranx, these_qrels, measure_map))
        return invokers

    def initialize(self):
        try:
            import ranx
            self.ranx = ranx
        except ImportError as ex:
            raise RuntimeError('ranx not available (do you need to `pip install ranx`?)', ex)
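# A usage sketch (illustrative, assuming `qrels` and `run` in ir_measures formats and
# that ranx has been installed via `pip install ranx`):
#
#   import ir_measures
#   from ir_measures import P, RR, Success
#   ir_measures.ranx.calc_aggregate([P@10, RR, Success@5], qrels, run)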