from ir_measures import measures
from .base import Measure, ParamInfo, SumAgg


class _NumRel(measures.Measure):
    """
    The number of relevant documents the query has (independent of what the system retrieved).
    """
    __name__ = 'NumRel'
    NAME = __name__
    PRETTY_NAME = 'Number of Relevant Documents'
    SHORT_DESC = 'The number of relevant documents present in the qrels'
    SUPPORTED_PARAMS = {
        'rel': measures.ParamInfo(dtype=int, default=1, desc='minimum relevance score to be counted (inclusive)'),
    }

    def aggregator(self):
        return SumAgg()


NumRel = _NumRel()
measures.register(NumRel)
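# A minimal usage sketch (assumptions: TREC-formatted files named 'qrels.txt' and
# 'run.txt'; the file names are illustrative). Because NumRel uses a SumAgg aggregator,
# the aggregate value is the sum across queries rather than the mean:
#
#     import ir_measures
#     from ir_measures import NumRel
#     qrels = ir_measures.read_trec_qrels('qrels.txt')
#     run = ir_measures.read_trec_run('run.txt')
#     ir_measures.calc_aggregate([NumRel], qrels, run)  # {NumRel: <total relevant docs over all queries>}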
      year = {2017},
      url = {http://doi.acm.org/10.1145/3077136.3080841}
    }
    """
    __name__ = 'BPM'
    NAME = __name__
    PRETTY_NAME = 'Bejeweled Player Model'
    SHORT_DESC = 'A measure that balances both gain and user patience to determine when they stop traversing search results.'
    SUPPORTED_PARAMS = {
        'cutoff': measures.ParamInfo(dtype=int, required=True, desc='ranking cutoff threshold'),
        'T': measures.ParamInfo(dtype=float, default=1., desc='total desired gain (normalized)'),
        'min_rel': measures.ParamInfo(dtype=int, default=0, desc='minimum relevance score'),
        'max_rel': measures.ParamInfo(dtype=int, required=True, desc='maximum relevance score'),
    }


BPM = _BPM()
measures.register(BPM)
from ir_measures import measures
from .base import Measure, ParamInfo


class _ERR(measures.Measure):
    """
    The Expected Reciprocal Rank (ERR) is a precision-focused measure.
    In essence, it extends reciprocal rank to encapsulate both graded relevance and a more
    realistic cascade-based user model of how users browse a ranking.
    """
    __name__ = 'ERR'
    NAME = __name__
    PRETTY_NAME = 'Expected Reciprocal Rank'
    SHORT_DESC = 'An extension of Reciprocal Rank that accounts for both graded relevance and a more realistic user model.'
    SUPPORTED_PARAMS = {
        'cutoff': measures.ParamInfo(dtype=int, required=False, desc='ranking cutoff threshold'),
    }


ERR = _ERR()
measures.register(ERR)
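# For intuition, a self-contained sketch of the cascade formulation commonly used for ERR
# with an exponential gain mapping (illustrative only -- the registered measure above is
# computed by whichever provider supports it, and details such as the gain mapping may differ):
#
#     def err_sketch(rels, max_rel, cutoff=None):
#         """rels: graded relevance of the returned docs in rank order; max_rel: highest grade."""
#         err, p_continue = 0.0, 1.0
#         for rank, rel in enumerate(rels[:cutoff], start=1):
#             p_stop = (2 ** rel - 1) / (2 ** max_rel)  # probability the user is satisfied at this rank
#             err += p_continue * p_stop / rank
#             p_continue *= 1 - p_stop
#         return err
#
#     err_sketch([3, 0, 1], max_rel=3)  # ~0.88: dominated by the highly relevant doc at rank 1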
    @inproceedings{Moffat:2015:IAM:2838931.2838938,
      author = {Moffat, Alistair and Bailey, Peter and Scholer, Falk and Thomas, Paul},
      title = {INST: An Adaptive Metric for Information Retrieval Evaluation},
      booktitle = {Proceedings of the 20th Australasian Document Computing Symposium},
      year = {2015},
      url = {http://doi.acm.org/10.1145/2838931.2838938}
    }
    """
    __name__ = 'INSQ'
    NAME = __name__
    SUPPORTED_PARAMS = {
        'T': measures.ParamInfo(dtype=float, default=1.0, desc='total desired gain (normalized)'),
        'min_rel': measures.ParamInfo(dtype=int, default=0, desc='minimum relevance score'),
        'max_rel': measures.ParamInfo(dtype=int, required=True, desc='maximum relevance score'),
    }


INST = _INST()
measures.register(INST)
INSQ = _INSQ()
measures.register(INSQ)
class _R(measures.Measure):
    """
    Recall@k (R@k). The fraction of relevant documents for a query that have been retrieved by rank k.

    NOTE: Some tasks define Recall@k as whether any relevant documents are found in the top k
    results. This software follows the TREC convention and refers to that measure as Success@k.
    """
    __name__ = 'R'
    NAME = __name__
    PRETTY_NAME = 'Recall at k'
    SHORT_DESC = 'The percentage of relevant documents retrieved in the top k results.'
    SUPPORTED_PARAMS = {
        'cutoff': measures.ParamInfo(dtype=int, required=True, desc='ranking cutoff threshold'),
        'rel': measures.ParamInfo(dtype=int, default=1, desc='minimum relevance score to be considered relevant (inclusive)'),
    }


R = _R()
Recall = R
measures.register(R, ['Recall'])
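# Per-query values (rather than the aggregate mean) can be obtained with iter_calc; a
# minimal sketch, again assuming illustrative TREC-format files:
#
#     import ir_measures
#     from ir_measures import R
#     qrels = ir_measures.read_trec_qrels('qrels.txt')
#     run = ir_measures.read_trec_run('run.txt')
#     for metric in ir_measures.iter_calc([R@100], qrels, run):
#         print(metric.query_id, metric.measure, metric.value)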
class _Bpref(measures.Measure):
    """
    Binary Preference (Bpref). This measure examines the relative ranks of judged relevant and
    non-relevant documents. Non-judged documents are not considered.

    ::

        @inproceedings{Buckley2004RetrievalEW,
          title={Retrieval evaluation with incomplete information},
          author={Chris Buckley and Ellen M. Voorhees},
          booktitle={SIGIR},
          year={2004}
        }
    """
    __name__ = 'Bpref'
    NAME = __name__
    SUPPORTED_PARAMS = {
        'rel': measures.ParamInfo(dtype=int, default=1, desc='minimum relevance score to be considered relevant (inclusive)'),
    }


Bpref = _Bpref()
BPref = Bpref
measures.register(Bpref, ['BPref'])
from ir_measures import measures
from .base import Measure, ParamInfo


class _Success(measures.Measure):
    """
    1 if a document with at least rel relevance is found in the first cutoff documents, else 0.

    NOTE: Some refer to this measure as Recall@k. This software follows the TREC convention,
    where Recall@k is defined as the proportion of known relevant documents retrieved in the
    top k results.
    """
    __name__ = 'Success'
    NAME = __name__
    PRETTY_NAME = 'Success at k'
    SHORT_DESC = 'An indicator if any relevant document is retrieved in the top k results.'
    SUPPORTED_PARAMS = {
        'cutoff': measures.ParamInfo(dtype=int, required=True, desc='ranking cutoff threshold'),
        'rel': measures.ParamInfo(dtype=int, default=1, desc='minimum relevance score to be considered relevant (inclusive)'),
    }


Success = _Success()
measures.register(Success)
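# Several cutoffs of the same measure can be evaluated in one pass; a minimal sketch
# (file names are illustrative):
#
#     import ir_measures
#     from ir_measures import Success
#     qrels = ir_measures.read_trec_qrels('qrels.txt')
#     run = ir_measures.read_trec_run('run.txt')
#     ir_measures.calc_aggregate([Success@1, Success@5, Success@10], qrels, run)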
class _SetP(measures.Measure):
    """
    The Set Precision (SetP); i.e., the number of relevant docs divided by the total number retrieved.
    """
    __name__ = 'SetP'
    NAME = __name__
    SUPPORTED_PARAMS = {
        'rel': measures.ParamInfo(dtype=int, default=1, desc='minimum relevance score to be considered relevant (inclusive)'),
        'relative': measures.ParamInfo(dtype=bool, default=False, desc='calculate the measure using the maximum possible SetP for the provided result size'),
    }


SetP = _SetP()
SetRelP = _SetP(relative=True)
measures.register(SetP)
measures.register(SetRelP, name='SetRelP')


class _SetR(measures.Measure):
    """
    The Set Recall (SetR); i.e., the number of relevant docs divided by the total number of relevant documents.
    """
    __name__ = 'SetR'
    NAME = __name__
    SUPPORTED_PARAMS = {
        'rel': measures.ParamInfo(dtype=int, default=1, desc='minimum relevance score to be considered relevant (inclusive)'),
    }


SetR = _SetR()
measures.register(SetR)
    @article{Moffat:2008:RPM:1416950.1416952,
      author = {Moffat, Alistair and Zobel, Justin},
      title = {Rank-biased Precision for Measurement of Retrieval Effectiveness},
      journal = {ACM Trans. Inf. Syst.},
      year = {2008},
      url = {http://doi.acm.org/10.1145/1416950.1416952}
    }
    """
    __name__ = 'RBP'
    NAME = __name__
    SUPPORTED_PARAMS = {
        'cutoff': measures.ParamInfo(dtype=int, required=False, desc='ranking cutoff threshold'),
        'p': measures.ParamInfo(dtype=float, default=0.8, desc='persistence'),
        'rel': measures.ParamInfo(dtype=int, required=False, desc='minimum relevance score to be considered relevant (inclusive), or NOT_PROVIDED to use graded relevance'),
    }


RBP = _RBP()
measures.register(RBP)
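# Parameters are supplied by calling the measure object; a construction-only sketch
# (no evaluation here; support for particular parameter combinations depends on the
# active provider):
#
#     from ir_measures import RBP
#     RBP(p=0.5)       # lower persistence: an impatient user, so top ranks dominate the score
#     RBP(rel=1)@20    # binary relevance (judgments >= 1), evaluated with a depth-20 cutoff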
    ::

        @misc{rijsbergen:1979:ir,
          title={Information Retrieval.},
          author={Van Rijsbergen, Cornelis J},
          year={1979},
          publisher={USA: Butterworth-Heinemann}
        }
    """
    __name__ = 'P'
    NAME = __name__
    PRETTY_NAME = 'Precision at k'
    SHORT_DESC = 'The percentage of documents in the top k results that are relevant.'
    SUPPORTED_PARAMS = {
        'cutoff': measures.ParamInfo(dtype=int, required=True, desc='ranking cutoff threshold'),
        'rel': measures.ParamInfo(dtype=int, default=1, desc='minimum relevance score to be considered relevant (inclusive)'),
    }


P = _P()
Precision = P
measures.register(P, ['Precision'])
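# Measures can also be parsed from their string form, which is convenient on the command
# line or in configuration files; a minimal sketch:
#
#     import ir_measures
#     ir_measures.parse_measure('P@10')         # equivalent to P@10
#     ir_measures.parse_measure('P(rel=2)@10')  # only documents judged >= 2 count as relevant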
class _NumRet(measures.Measure):
    """
    The number of results returned. When rel is provided, counts the number of documents
    returned with at least that relevance score (inclusive).
    """
    __name__ = 'NumRet'
    NAME = __name__
    PRETTY_NAME = 'Number of Retrieved Documents'
    SHORT_DESC = 'The number of documents present in the result set'
    SUPPORTED_PARAMS = {
        'rel': measures.ParamInfo(dtype=int, required=False, desc='minimum relevance score to be counted (inclusive), or all documents returned if NOT_PROVIDED'),
    }

    def aggregator(self):
        return SumAgg()


NumRet = _NumRet()
NumRelRet = NumRet(rel=1)
measures.register(NumRet)
measures.register(NumRelRet, name='NumRelRet')
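# The rel parameter switches NumRet from counting everything returned to counting only
# judged-relevant returned documents; like NumRel, the aggregate is a sum over queries:
#
#     from ir_measures import NumRet
#     NumRet          # all returned documents
#     NumRet(rel=1)   # returned documents judged >= 1 (the NumRelRet registered above)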
from ir_measures import measures
from .base import Measure, ParamInfo, SumAgg


class _NumQ(measures.Measure):
    """
    The total number of queries.
    """
    __name__ = 'NumQ'
    NAME = __name__
    SUPPORTED_PARAMS = {}

    def aggregator(self):
        return SumAgg()


NumQ = _NumQ()
measures.register(NumQ)
from ir_measures import measures
from .base import Measure, ParamInfo


class _Accuracy(Measure):
    """
    Accuracy metric.

    Reports the probability that a relevant document is ranked before a non-relevant one.
    This metric is intended for diagnosis (e.g. checking that train/test/validation accuracy
    match). As such, it only considers relevant documents that appear among the returned results.
    """
    __name__ = 'Accuracy'
    NAME = __name__
    SUPPORTED_PARAMS = {
        'cutoff': ParamInfo(dtype=int, required=False, desc='ranking cutoff threshold'),
        'rel': ParamInfo(dtype=int, default=1, desc='minimum relevance score to be considered relevant (inclusive)'),
    }


Accuracy = _Accuracy()
measures.register(Accuracy)
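# One way to picture the pairwise probability described above (illustrative only; the
# registered measure is computed by a provider, and details such as tie handling and the
# treatment of unretrieved relevant documents may differ):
#
#     def pairwise_accuracy(rels, rel_threshold=1):
#         """rels: relevance judgments of the returned documents, in rank order."""
#         pos = [i for i, r in enumerate(rels) if r >= rel_threshold]
#         neg = [i for i, r in enumerate(rels) if r < rel_threshold]
#         pairs = [(p, n) for p in pos for n in neg]
#         if not pairs:
#             return None
#         return sum(1 for p, n in pairs if p < n) / len(pairs)
#
#     pairwise_accuracy([1, 0, 1, 0])  # 0.75: three of four (relevant, non-relevant) pairs are in order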
from ir_measures import measures
from .base import Measure, ParamInfo


class _Judged(measures.Measure):
    """
    Percentage of results in the top k (cutoff) results that have relevance judgments.
    Equivalent to P@k with a rel lower than any judgment.
    """
    __name__ = 'Judged'
    NAME = __name__
    SUPPORTED_PARAMS = {
        'cutoff': measures.ParamInfo(dtype=int, required=True, desc='ranking cutoff threshold'),
    }


Judged = _Judged()
measures.register(Judged)
""" The normalized Discounted Cumulative Gain (nDCG). Uses graded labels - systems that put the highest graded documents at the top of the ranking. It is normalized wrt. the Ideal NDCG, i.e. documents ranked in descending order of graded label. :: @article{Jarvelin:2002:CGE:582415.582418, author = {J\"{a}rvelin, Kalervo and Kek\"{a}l\"{a}inen, Jaana}, title = {Cumulated Gain-based Evaluation of IR Techniques}, journal = {ACM Trans. Inf. Syst.}, volume = {20}, number = {4}, year = {2002}, pages = {422--446}, numpages = {25}, url = {http://doi.acm.org/10.1145/582415.582418}, } """ __name__ = 'nDCG' NAME = __name__ SUPPORTED_PARAMS = { 'cutoff': measures.ParamInfo(dtype=int, required=False, desc='ranking cutoff threshold'), 'dcg': measures.ParamInfo(dtype=str, choices=['log2', 'exp-log2'], default='log2', desc='DCG formulation') } nDCG = _nDCG() NDCG = nDCG measures.register(nDCG, ['NDCG'])
      author = {Donna Harman},
      title = {Evaluation Issues in Information Retrieval},
      journal = {Information Processing and Management},
      volume = {28},
      number = {4},
      pages = {439--440},
      year = {1992},
    }
    """
    __name__ = 'AP'
    NAME = __name__
    PRETTY_NAME = '(Mean) Average Precision'
    SHORT_DESC = 'The mean of the precision scores at each relevant item retrieved.'
    SUPPORTED_PARAMS = {
        'cutoff': measures.ParamInfo(dtype=int, required=False, desc='ranking cutoff threshold'),
        'rel': measures.ParamInfo(dtype=int, default=1, desc='minimum relevance score to be considered relevant (inclusive)'),
    }


AP = _AP()
MAP = AP
measures.register(AP, ['MAP'])
    The precision at R, where R is the number of relevant documents for a given query.
    Has the cute property that it is also the recall at R.

    ::

        @misc{Buckley2005RetrievalSE,
          title={Retrieval System Evaluation},
          author={Chris Buckley and Ellen M. Voorhees},
          annote={Chapter 3 in TREC: Experiment and Evaluation in Information Retrieval},
          howpublished={MIT Press},
          year={2005}
        }
    """
    __name__ = 'Rprec'
    NAME = __name__
    PRETTY_NAME = 'Precision at R'
    SHORT_DESC = 'Precision at R, where R is the number of relevant documents for a given query.'
    SUPPORTED_PARAMS = {
        'rel': measures.ParamInfo(dtype=int, default=1, desc='minimum relevance score to be considered relevant (inclusive)'),
    }


Rprec = _Rprec()
RPrec = Rprec
measures.register(Rprec, ['RPrec'])
class _SDCG(measures.Measure):
    """
    The Scaled Discounted Cumulative Gain (SDCG), a variant of nDCG that assumes more
    fully-relevant documents exist but are not labeled.
    """
    __name__ = 'SDCG'
    NAME = __name__
    SUPPORTED_PARAMS = {
        'cutoff': measures.ParamInfo(dtype=int, required=True, desc='ranking cutoff threshold'),
        'dcg': measures.ParamInfo(dtype=str, choices=['log2'], default='log2', desc='DCG formulation'),
        'min_rel': measures.ParamInfo(dtype=int, default=0, desc='minimum relevance score'),
        'max_rel': measures.ParamInfo(dtype=int, required=True, desc='maximum relevance score'),
    }


SDCG = _SDCG()
measures.register(SDCG)
from ir_measures import measures
from .base import Measure, ParamInfo


class _IPrec(measures.Measure):
    """
    Interpolated Precision at a given recall cutoff. Used for building precision-recall graphs.
    Unlike most measures, where @ indicates an absolute cutoff threshold, here @ sets the
    recall cutoff.
    """
    __name__ = 'IPrec'
    NAME = __name__
    PRETTY_NAME = 'Interpolated Precision@recall'
    SHORT_DESC = 'The interpolated precision at a given recall cutoff.'
    AT_PARAM = 'recall'
    SUPPORTED_PARAMS = {
        'recall': measures.ParamInfo(dtype=float, required=True, desc='recall threshold'),
        'rel': measures.ParamInfo(dtype=int, default=1, desc='minimum relevance score to be considered relevant (inclusive)'),
    }


IPrec = _IPrec()
measures.register(IPrec)
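# Because AT_PARAM is 'recall', the value after @ is a recall level rather than a rank.
# A sketch for an 11-point precision-recall curve (assuming the active provider supports
# these recall levels, and qrels/run loaded as in the earlier sketches):
#
#     import ir_measures
#     from ir_measures import IPrec
#     eleven_points = [IPrec@(i / 10) for i in range(11)]  # IPrec@0.0 ... IPrec@1.0
#     ir_measures.calc_aggregate(eleven_points, qrels, run)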
      year = {2021},
      url = {https://doi.org/10.1145/3471158.3472239}
    }
    """
    __name__ = 'NERR11'
    NAME = __name__
    SUPPORTED_PARAMS = {
        'T': measures.ParamInfo(dtype=float, default=1.0, desc='total desired gain (normalized)'),
        'min_rel': measures.ParamInfo(dtype=int, default=0, desc='minimum relevance score'),
        'max_rel': measures.ParamInfo(dtype=int, required=True, desc='maximum relevance score'),
    }


NERR8 = _NERR8()
measures.register(NERR8)
NERR9 = _NERR9()
measures.register(NERR9)
NERR10 = _NERR10()
measures.register(NERR10)
NERR11 = _NERR11()
measures.register(NERR11)
    @article{kantor2000trec,
      title={The TREC-5 Confusion Track},
      author={Kantor, Paul and Voorhees, Ellen},
      journal={Information Retrieval},
      volume={2},
      number={2-3},
      pages={165--176},
      year={2000}
    }
    """
    __name__ = 'RR'
    NAME = __name__
    SUPPORTED_PARAMS = {
        'cutoff': measures.ParamInfo(dtype=int, required=False, desc='ranking cutoff threshold'),
        'rel': measures.ParamInfo(dtype=int, default=1, desc='minimum relevance score to be considered relevant (inclusive)'),
    }


RR = _RR()
MRR = RR
measures.register(RR, ['MRR'])
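# Averaged over queries this is the familiar Mean Reciprocal Rank, hence the MRR alias
# registered above. A construction-only sketch (the rel=2 threshold is a dataset-specific
# choice, e.g. for graded qrels, not a library default):
#
#     from ir_measures import RR
#     RR@10          # reciprocal rank, considering only the top 10 results
#     RR(rel=2)@10   # treat only judgments >= 2 as relevant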
    @article{10.1145/3451161,
      author = {Clarke, Charles L. A. and Vtyurina, Alexandra and Smucker, Mark D.},
      title = {Assessing Top-k Preferences},
      journal = {ACM Transactions on Information Systems},
      volume = {39},
      number = {3},
      articleno = {33},
      numpages = {21},
      year = {2021},
      url = {https://doi.org/10.1145/3451161},
    }
    """
    __name__ = 'Compat'
    NAME = __name__
    PRETTY_NAME = 'Compatibility'
    SHORT_DESC = 'The Rank Biased Overlap between the results and an ideal ranking.'
    SUPPORTED_PARAMS = {
        'p': measures.ParamInfo(dtype=float, default=0.95, desc='persistence'),
        'normalize': measures.ParamInfo(dtype=bool, default=True, desc='apply normalization for finite ideal rankings'),
    }


Compat = _Compat()
measures.register(Compat)
from ir_measures import measures
from .base import Measure, ParamInfo


class _infAP(measures.Measure):
    """
    Inferred AP. An AP implementation that accounts for pooled-but-unjudged documents by
    assuming that they are relevant at the same proportion as other judged documents.
    Essentially, it skips documents that were pooled-but-not-judged and assumes unjudged
    documents are non-relevant.

    Pooled-but-unjudged documents are indicated by a relevance score of -1, by convention.
    Note that not all qrels use this convention.
    """
    __name__ = 'infAP'
    NAME = __name__
    SUPPORTED_PARAMS = {
        'rel': measures.ParamInfo(dtype=int, default=1, desc='minimum relevance score to be considered relevant (inclusive)'),
    }


infAP = _infAP()
measures.register(infAP)
    SHORT_DESC = 'The percentage of subtopics covered by the top k documents.'
    SUPPORTED_PARAMS = {
        'cutoff': measures.ParamInfo(dtype=int, required=False, desc='ranking cutoff threshold'),
        'rel': measures.ParamInfo(dtype=int, default=1, desc='minimum relevance score to be considered relevant (inclusive)'),
    }


ERR_IA = _ERR_IA()
nERR_IA = _nERR_IA()
alpha_DCG = _alpha_DCG()
α_DCG = alpha_DCG
alpha_nDCG = _alpha_nDCG()
α_nDCG = alpha_nDCG
NRBP = _NRBP()
nNRBP = _nNRBP()
AP_IA = _AP_IA()
MAP_IA = AP_IA
P_IA = _P_IA()
StRecall = _StRecall()

measures.register(ERR_IA)
measures.register(nERR_IA)
measures.register(alpha_DCG, aliases=['α_DCG'])
measures.register(alpha_nDCG, aliases=['α_nDCG'])
measures.register(NRBP)
measures.register(nNRBP)
measures.register(AP_IA, aliases=['MAP_IA'])
measures.register(P_IA)
measures.register(StRecall)