Example #1
from ir_measures import measures
from .base import Measure, ParamInfo, SumAgg


class _NumRel(measures.Measure):
    """
    The number of relevant documents the query has (independent of what the system retrieved).
    """
    __name__ = 'NumRel'
    NAME = __name__
    PRETTY_NAME = 'Number of Relevant Documents'
    SHORT_DESC = 'The number of relevant documents present in the qrels'
    SUPPORTED_PARAMS = {
        'rel': measures.ParamInfo(dtype=int, default=1, desc='minimum relevance score to be counted (inclusive)')
    }

    def aggregator(self):
        return SumAgg()


NumRel = _NumRel()
measures.register(NumRel)
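
For context, a hedged usage sketch of how a registered measure such as NumRel might be evaluated. It assumes ir_measures' top-level calc_aggregate, Qrel, and ScoredDoc helpers and that NumRel is exported at the package level (none of which is shown in the snippet above):

# Minimal sketch; the top-level API calls below are assumptions about ir_measures, not taken from the snippet.
import ir_measures
from ir_measures import NumRel

# Toy relevance judgments (qrels) and a toy run for a single query.
qrels = [
    ir_measures.Qrel('q1', 'd1', 1),
    ir_measures.Qrel('q1', 'd2', 0),
    ir_measures.Qrel('q1', 'd3', 2),
]
run = [
    ir_measures.ScoredDoc('q1', 'd1', 1.2),
    ir_measures.ScoredDoc('q1', 'd3', 0.9),
]

# NumRel counts qrels with relevance >= rel (default 1), regardless of what was retrieved.
print(ir_measures.calc_aggregate([NumRel, NumRel(rel=2)], qrels, run))
# expected along the lines of: {NumRel: 2, NumRel(rel=2): 1}
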
Example #2
       year = {2017},
       url = {http://doi.acm.org/10.1145/3077136.3080841}
     }
    """
    __name__ = 'BPM'
    NAME = __name__
    PRETTY_NAME = 'Bejeweled Player Model'
    SHORT_DESC = 'A measure that balances both gain and user patience to determine when they stop traversing search results.'
    SUPPORTED_PARAMS = {
        'cutoff':
        measures.ParamInfo(dtype=int,
                           required=True,
                           desc='ranking cutoff threshold'),
        'T':
        measures.ParamInfo(dtype=float,
                           default=1.,
                           desc='total desired gain (normalized)'),
        'min_rel':
        measures.ParamInfo(dtype=int,
                           default=0,
                           desc='minimum relevance score'),
        'max_rel':
        measures.ParamInfo(dtype=int,
                           required=True,
                           desc='maximum relevance score'),
    }


BPM = _BPM()
measures.register(BPM)
Example #3
from ir_measures import measures
from .base import Measure, ParamInfo


class _ERR(measures.Measure):
    """
    The Expected Reciprocal Rank (ERR) is a precision-focused measure.
    In essence, an extension of reciprocal rank that encapsulates both graded relevance and
    a more realistic cascade-based user model of how users browse a ranking.
    """
    __name__ = 'ERR'
    NAME = __name__
    PRETTY_NAME = 'Expected Reciprocal Rank'
    SHORT_DESC = 'An extension of Reciprocal Rank that accounts for both graded relevance and a more realistic user model.'
    SUPPORTED_PARAMS = {
        'cutoff': measures.ParamInfo(dtype=int, required=False, desc='ranking cutoff threshold'),
    }


ERR = _ERR()
measures.register(ERR)
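
For reference, the cascade-model formulation behind ERR (Chapelle et al., 2009), written with notation introduced here: g_i is the graded relevance at rank i, g_max the maximum grade, and R_i the probability that the user is satisfied by the document at rank i.

\[
  R_i = \frac{2^{g_i} - 1}{2^{g_{\max}}}, \qquad
  \mathrm{ERR} = \sum_{r=1}^{k} \frac{1}{r}\, R_r \prod_{i=1}^{r-1} (1 - R_i)
\]
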
Example #4
     @inproceedings{Moffat:2015:IAM:2838931.2838938,
       author = {Moffat, Alistair and Bailey, Peter and Scholer, Falk and Thomas, Paul},
       title = {INST: An Adaptive Metric for Information Retrieval Evaluation},
       booktitle = {Proceedings of the 20th Australasian Document Computing Symposium},
       year = {2015},
       url = {http://doi.acm.org/10.1145/2838931.2838938}
     }
    """
    __name__ = 'INSQ'
    NAME = __name__
    SUPPORTED_PARAMS = {
        'T':
        measures.ParamInfo(dtype=float, default=1.0, desc='total desired gain (normalized)'),
        'min_rel':
        measures.ParamInfo(dtype=int,
                           default=0,
                           desc='minimum relevance score'),
        'max_rel':
        measures.ParamInfo(dtype=int,
                           required=True,
                           desc='maximum relevance score'),
    }


INST = _INST()
measures.register(INST)

INSQ = _INSQ()
measures.register(INSQ)
Example #5

class _R(measures.Measure):
    """
    Recall@k (R@k). The fraction of relevant documents for a query that have been retrieved by rank k.

    NOTE: Some tasks define Recall@k as whether any relevant documents are found in the top k results.
    This software follows the TREC convention and refers to that measure as Success@k.
    """
    __name__ = 'R'
    NAME = __name__
    PRETTY_NAME = 'Recall at k'
    SHORT_DESC = 'The percentage of relevant documents retrieved in the top k results.'
    SUPPORTED_PARAMS = {
        'cutoff':
        measures.ParamInfo(dtype=int,
                           required=True,
                           desc='ranking cutoff threshold'),
        'rel':
        measures.ParamInfo(
            dtype=int,
            default=1,
            desc='minimum relevance score to be considered relevant (inclusive)'
        )
    }


R = _R()
Recall = R
measures.register(R, ['Recall'])
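
In symbols (notation ours): with Rel the set of documents judged relevant at or above rel, and Ret_k the top-k retrieved documents,

\[
  \mathrm{R}@k = \frac{|\mathrm{Rel} \cap \mathrm{Ret}_k|}{|\mathrm{Rel}|}
\]
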
Example #6
class _Bpref(measures.Measure):
    """
    Binary Preference (Bpref).
    This measure examines the relative ranks of judged relevant and non-relevant documents. Non-judged documents are not considered. 

::

    @inproceedings{Buckley2004RetrievalEW,
      title={Retrieval evaluation with incomplete information},
      author={Chris Buckley and Ellen M. Voorhees},
      booktitle={SIGIR},
      year={2004}
    }
    """
    __name__ = 'Bpref'
    NAME = __name__
    SUPPORTED_PARAMS = {
        'rel':
        measures.ParamInfo(
            dtype=int,
            default=1,
            desc='minimum relevance score to be considered relevant (inclusive)'
        )
    }


Bpref = _Bpref()
BPref = Bpref
measures.register(Bpref, ['BPref'])
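
A sketch of one common formulation (following Buckley & Voorhees, 2004; the exact trec_eval details may differ): R is the number of judged relevant documents, N the number of judged non-relevant documents, and n_r the number of judged non-relevant documents ranked above the retrieved relevant document r (capped at min(R, N)).

\[
  \mathrm{Bpref} = \frac{1}{R} \sum_{r} \left(1 - \frac{n_r}{\min(R, N)}\right)
\]
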
Example #7
from .base import Measure, ParamInfo


class _Success(measures.Measure):
    """
    1 if a document with at least rel relevance is found in the first cutoff documents, else 0.

    NOTE: Some refer to this measure as Recall@k. This software follows the TREC convention, where
    Recall@k is defined as the proportion of known relevant documents retrieved in the top k results.
    """
    __name__ = 'Success'
    NAME = __name__
    PRETTY_NAME = 'Success at k'
    SHORT_DESC = 'An indicator if any relevant document is retrieved in the top k results.'
    SUPPORTED_PARAMS = {
        'cutoff':
        measures.ParamInfo(dtype=int,
                           required=True,
                           desc='ranking cutoff threshold'),
        'rel':
        measures.ParamInfo(
            dtype=int,
            default=1,
            desc='minimum relevance score to be considered relevant (inclusive)'
        )
    }


Success = _Success()
measures.register(Success)
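
Equivalently, as an indicator over the top-k results (same notation as in the Recall sketch above):

\[
  \mathrm{Success}@k = \mathbb{1}\!\left[\, |\mathrm{Rel} \cap \mathrm{Ret}_k| \ge 1 \,\right]
\]
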
Example #8

class _SetP(measures.Measure):
    """
    The Set Precision (SetP); i.e., the number of relevant documents retrieved divided by the total number of documents retrieved
    """
    __name__ = 'SetP'
    NAME = __name__
    SUPPORTED_PARAMS = {
        'rel': measures.ParamInfo(dtype=int, default=1, desc='minimum relevance score to be considered relevant (inclusive)'),
        'relative': measures.ParamInfo(dtype=bool, default=False, desc='calculate the measure using the maximum possible SetP for the provided result size'),
    }

SetP = _SetP()
SetRelP = _SetP(relative=True)
measures.register(SetP)
measures.register(SetRelP, name='SetRelP')


class _SetR(measures.Measure):
    """
    The Set Recall (SetR); i.e., the number of relevant documents retrieved divided by the total number of relevant documents
    """
    __name__ = 'SetR'
    NAME = __name__
    SUPPORTED_PARAMS = {
        'rel': measures.ParamInfo(dtype=int, default=1, desc='minimum relevance score to be considered relevant (inclusive)')
    }

SetR = _SetR()
measures.register(SetR)
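
In set notation (ours): with Rel the judged-relevant documents and Ret the full, unranked result set,

\[
  \mathrm{SetP} = \frac{|\mathrm{Rel} \cap \mathrm{Ret}|}{|\mathrm{Ret}|}, \qquad
  \mathrm{SetR} = \frac{|\mathrm{Rel} \cap \mathrm{Ret}|}{|\mathrm{Rel}|}
\]
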
Example #9
     @article{Moffat:2008:RPM:1416950.1416952,
       author = {Moffat, Alistair and Zobel, Justin},
       title = {Rank-biased Precision for Measurement of Retrieval Effectiveness},
       journal = {ACM Trans. Inf. Syst.},
       year = {2008},
       url = {http://doi.acm.org/10.1145/1416950.1416952}
     }
    """
    __name__ = 'RBP'
    NAME = __name__
    SUPPORTED_PARAMS = {
        'cutoff':
        measures.ParamInfo(dtype=int,
                           required=False,
                           desc='ranking cutoff threshold'),
        'p':
        measures.ParamInfo(dtype=float, default=0.8, desc='persistence'),
        'rel':
        measures.ParamInfo(
            dtype=int,
            required=False,
            desc=
            'minimum relevance score to be considered relevant (inclusive), or NOT_PROVIDED to use graded relevance'
        )
    }


RBP = _RBP()
measures.register(RBP)
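
For reference, the model behind RBP (Moffat & Zobel, 2008): the user advances from one rank to the next with persistence probability p, and g_i is the gain at rank i (binary when rel is given, graded otherwise). Notation is ours:

\[
  \mathrm{RBP} = (1 - p) \sum_{i \ge 1} g_i \, p^{\,i-1}
\]
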
Example #10
::

    @misc{rijsbergen:1979:ir,
      title={Information Retrieval.},
      author={Van Rijsbergen, Cornelis J},
      year={1979},
      publisher={USA: Butterworth-Heinemann}
    }
    """
    __name__ = 'P'
    NAME = __name__
    PRETTY_NAME = 'Precision at k'
    SHORT_DESC = 'The percentage of documents in the top k results that are relevant.'
    SUPPORTED_PARAMS = {
        'cutoff':
        measures.ParamInfo(dtype=int,
                           required=True,
                           desc='ranking cutoff threshold'),
        'rel':
        measures.ParamInfo(
            dtype=int,
            default=1,
            desc='minimum relevance score to be considered relevant (inclusive)'
        )
    }


P = _P()
Precision = P
measures.register(P, ['Precision'])
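
In symbols, the counterpart of the Recall sketch earlier:

\[
  \mathrm{P}@k = \frac{|\mathrm{Rel} \cap \mathrm{Ret}_k|}{k}
\]
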
Example #11

class _NumRet(measures.Measure):
    """
    The number of results returned. When rel is provided, counts the number of documents
    returned with at least that relevance score (inclusive).
    """
    __name__ = 'NumRet'
    NAME = __name__
    PRETTY_NAME = 'Number of Retrieved Documents'
    SHORT_DESC = 'The number of documents present in the result set'

    SUPPORTED_PARAMS = {
        'rel':
        measures.ParamInfo(
            dtype=int,
            required=False,
            desc=
            'minimum relevance score to be counted (inclusive), or all documents returned if NOT_PROVIDED'
        )
    }

    def aggregator(self):
        return SumAgg()


NumRet = _NumRet()
NumRelRet = NumRet(rel=1)
measures.register(NumRet)
measures.register(NumRelRet, name='NumRelRet')
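
A short, hedged sketch of the parameterization pattern used above (calling a measure with keyword arguments yields a derived measure). The parse_measure call and the exact string forms are assumptions about ir_measures' conventions, not taken from the snippet:

# Hedged sketch of measure parameterization.
import ir_measures
from ir_measures import NumRet

NumRelRet = NumRet(rel=1)    # derived measure: retrieved docs with relevance >= 1
print(NumRet, NumRelRet)     # presumably renders as: NumRet NumRet(rel=1)

# A parameterized measure should round-trip through its string form.
assert ir_measures.parse_measure('NumRet(rel=1)') == NumRelRet
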
Example #12
from ir_measures import measures
from .base import Measure, ParamInfo, SumAgg


class _NumQ(measures.Measure):
    """
    The total number of queries.
    """
    __name__ = 'NumQ'
    NAME = __name__
    SUPPORTED_PARAMS = {}

    def aggregator(self):
        return SumAgg()


NumQ = _NumQ()
measures.register(NumQ)
Example #13
from ir_measures import measures
from .base import Measure, ParamInfo


class _Accuracy(Measure):
    """Accuracy metric

    Reports the probability that a relevant document is ranked before a non-relevant one.
    This metric is intended for diagnosis (e.g., checking that train/test/validation accuracy match).
    As such, it only considers relevant documents that appear among the returned results.
    """
    __name__ = 'Accuracy'
    NAME = __name__
    SUPPORTED_PARAMS = {
        'cutoff':
        ParamInfo(dtype=int, required=False, desc='ranking cutoff threshold'),
        'rel':
        ParamInfo(
            dtype=int,
            default=1,
            desc='minimum relevance score to be considered relevant (inclusive)'
        )
    }


Accuracy = _Accuracy()
measures.register(Accuracy)
Example #14
from ir_measures import measures
from .base import Measure, ParamInfo


class _Judged(measures.Measure):
    """
    The percentage of documents in the top k (cutoff) results that have relevance judgments. Equivalent to P@k
    with rel set lower than any judgment level.
    """
    __name__ = 'Judged'
    NAME = __name__
    SUPPORTED_PARAMS = {
        'cutoff':
        measures.ParamInfo(dtype=int,
                           required=True,
                           desc='ranking cutoff threshold'),
    }


Judged = _Judged()
measures.register(Judged)
Example #15
    """
    The normalized Discounted Cumulative Gain (nDCG).
    Uses graded labels: systems that put the highest-graded documents at the top of the ranking score higher.
    It is normalized with respect to the ideal DCG, i.e., the DCG of the documents ranked in descending order of graded label.

::

    @article{Jarvelin:2002:CGE:582415.582418,
      author = {J\"{a}rvelin, Kalervo and Kek\"{a}l\"{a}inen, Jaana},
      title = {Cumulated Gain-based Evaluation of IR Techniques},
      journal = {ACM Trans. Inf. Syst.},
      volume = {20},
      number = {4},
      year = {2002},
      pages = {422--446},
      numpages = {25},
      url = {http://doi.acm.org/10.1145/582415.582418},
    }
    """
    __name__ = 'nDCG'
    NAME = __name__
    SUPPORTED_PARAMS = {
        'cutoff': measures.ParamInfo(dtype=int, required=False, desc='ranking cutoff threshold'),
        'dcg': measures.ParamInfo(dtype=str, choices=['log2', 'exp-log2'], default='log2', desc='DCG formulation')
    }


nDCG = _nDCG()
NDCG = nDCG
measures.register(nDCG, ['NDCG'])
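
For reference, the two DCG formulations selected by the dcg parameter, with g_i the graded label at rank i (standard definitions, not taken from the snippet):

\[
  \mathrm{DCG}_{\text{log2}}@k = \sum_{i=1}^{k} \frac{g_i}{\log_2(i+1)}, \qquad
  \mathrm{DCG}_{\text{exp-log2}}@k = \sum_{i=1}^{k} \frac{2^{g_i} - 1}{\log_2(i+1)}, \qquad
  \mathrm{nDCG}@k = \frac{\mathrm{DCG}@k}{\mathrm{IDCG}@k}
\]
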
Example #16
      author = {Donna Harman},
      title = {Evaluation Issues in Information Retrieval},
      journal = {Information Processing and Management},
      volume = {28},
      number = {4},
      pages = {439--440},
      year = {1992},
    }
    """
    __name__ = 'AP'
    NAME = __name__
    PRETTY_NAME = '(Mean) Average Precision'
    SHORT_DESC = 'The mean of the precision scores at each relevant item retrieved.'
    SUPPORTED_PARAMS = {
        'cutoff':
        measures.ParamInfo(dtype=int,
                           required=False,
                           desc='ranking cutoff threshold'),
        'rel':
        measures.ParamInfo(
            dtype=int,
            default=1,
            desc='minimum relevance score to be considered relevant (inclusive)'
        )
    }


AP = _AP()
MAP = AP
measures.register(AP, ['MAP'])
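
In symbols (notation ours): with R the number of relevant documents for the query and rel(i) an indicator of whether the document at rank i is relevant,

\[
  \mathrm{AP} = \frac{1}{R} \sum_{i=1}^{k} \mathrm{P}@i \cdot \mathrm{rel}(i)
\]
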
Example #17
    The precision at R, where R is the number of relevant documents for a given query. Has the cute property that
    it is also the recall at R.

::

    @misc{Buckley2005RetrievalSE,
      title={Retrieval System Evaluation},
      author={Chris Buckley and Ellen M. Voorhees},
      annote={Chapter 3 in TREC: Experiment and Evaluation in Information Retrieval},
      howpublished={MIT Press},
      year={2005}
    }
    """
    __name__ = 'Rprec'
    NAME = __name__
    PRETTY_NAME = 'Precision at R'
    SHORT_DESC = 'Precision at R, where R is the number of relevant documents for a given query.'
    SUPPORTED_PARAMS = {
        'rel':
        measures.ParamInfo(
            dtype=int,
            default=1,
            desc='minimum relevance score to be considered relevant (inclusive)'
        )
    }


Rprec = _Rprec()
RPrec = Rprec
measures.register(Rprec, ['RPrec'])
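
In symbols (same notation as above):

\[
  \mathrm{Rprec} = \frac{|\mathrm{Rel} \cap \mathrm{Ret}_R|}{R}, \qquad R = |\mathrm{Rel}|
\]
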
Example #18
class _SDCG(measures.Measure):
    """
    The Scaled Discounted Cumulative Gain (SDCG), a variant of nDCG that assumes more
    fully-relevant documents exist but are not labeled.
    """
    __name__ = 'SDCG'
    NAME = __name__
    SUPPORTED_PARAMS = {
        'cutoff':
        measures.ParamInfo(dtype=int,
                           required=True,
                           desc='ranking cutoff threshold'),
        'dcg':
        measures.ParamInfo(dtype=str,
                           choices=['log2'],
                           default='log2',
                           desc='DCG formulation'),
        'min_rel':
        measures.ParamInfo(dtype=int,
                           default=0,
                           desc='minimum relevance score'),
        'max_rel':
        measures.ParamInfo(dtype=int,
                           required=True,
                           desc='maximum relevance score'),
    }


SDCG = _SDCG()
measures.register(SDCG)
Example #19
from ir_measures import measures
from .base import Measure, ParamInfo


class _IPrec(measures.Measure):
    """
    Interpolated Precision at a given recall cutoff. Used for building precision-recall graphs.
    Unlike most measures, where @ indicates an absolute cutoff threshold, here @ sets the recall
    cutoff.
    """
    __name__ = 'IPrec'
    NAME = __name__
    PRETTY_NAME = 'Interpolated Precision@recall'
    SHORT_DESC = 'The interpolated precision at a given recall cutoff.'
    AT_PARAM = 'recall'
    SUPPORTED_PARAMS = {
        'recall':
        measures.ParamInfo(dtype=float, required=True,
                           desc='recall threshold'),
        'rel':
        measures.ParamInfo(
            dtype=int,
            default=1,
            desc='minimum relevance score to be considered relevant (inclusive)'
        )
    }


IPrec = _IPrec()
measures.register(IPrec)
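
For reference, interpolated precision at recall level r is the maximum precision achieved at any rank whose recall is at least r (standard definition, notation ours):

\[
  \mathrm{IPrec}(r) = \max_{r' \ge r} \mathrm{P}(r')
\]

where P(r') is the precision at the first rank reaching recall r'.
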
Example #20
       year = {2021},
       url = {https://doi.org/10.1145/3471158.3472239}
     }
    """
    __name__ = 'NERR11'
    NAME = __name__
    SUPPORTED_PARAMS = {
        'T':
        measures.ParamInfo(dtype=float,
                           default=1.0,
                           desc='total desired gain (normalized)'),
        'min_rel':
        measures.ParamInfo(dtype=int,
                           default=0,
                           desc='minimum relevance score'),
        'max_rel':
        measures.ParamInfo(dtype=int,
                           required=True,
                           desc='maximum relevance score'),
    }


NERR8 = _NERR8()
measures.register(NERR8)
NERR9 = _NERR9()
measures.register(NERR9)
NERR10 = _NERR10()
measures.register(NERR10)
NERR11 = _NERR11()
measures.register(NERR11)
Example #21
    @article{kantor2000trec,
      title={The TREC-5 Confusion Track},
      author={Kantor, Paul and Voorhees, Ellen},
      journal={Information Retrieval},
      volume={2},
      number={2-3},
      pages={165--176},
      year={2000}
    }
    """
    __name__ = 'RR'
    NAME = __name__
    SUPPORTED_PARAMS = {
        'cutoff':
        measures.ParamInfo(dtype=int,
                           required=False,
                           desc='ranking cutoff threshold'),
        'rel':
        measures.ParamInfo(
            dtype=int,
            default=1,
            desc='minimum relevance score to be considered relevant (inclusive)'
        )
    }


RR = _RR()
MRR = RR
measures.register(RR, ['MRR'])
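
In symbols, with rank_1 the position of the highest-ranked relevant document (within the cutoff, if one is given):

\[
  \mathrm{RR} = \frac{1}{\mathrm{rank}_1}
\]

and RR = 0 when no relevant document is retrieved.
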
Example #22
    @article{10.1145/3451161,
      author = {Clarke, Charles L. A. and Vtyurina, Alexandra and Smucker, Mark D.},
      title = {Assessing Top-k Preferences},
      journal = {ACM Transactions on Information Systems},
      volume = {39},
      number = {3},
      articleno = {33},
      numpages = {21},
      year = {2021},
      url = {https://doi.org/10.1145/3451161},
    }
    """
    __name__ = 'Compat'
    NAME = __name__
    PRETTY_NAME = 'Compatibility'
    SHORT_DESC = 'The Rank Biased Overlap between the results and an ideal ranking.'
    SUPPORTED_PARAMS = {
        'p':
        measures.ParamInfo(dtype=float, default=0.95, desc='persistence'),
        'normalize':
        measures.ParamInfo(
            dtype=bool,
            default=True,
            desc='apply normalization for finite ideal rankings'),
    }


Compat = _Compat()
measures.register(Compat)
Example #23
from ir_measures import measures
from .base import Measure, ParamInfo


class _infAP(measures.Measure):
    """
    Inferred AP. An AP implementation that accounts for pooled-but-unjudged documents by assuming
    they are relevant at the same rate as the judged documents. Essentially, it skips documents that
    were pooled but not judged, and assumes any other unjudged documents are non-relevant.

    Pooled-but-unjudged documents are indicated by a relevance score of -1, by convention. Note that
    not all qrels use this convention.
    """
    __name__ = 'infAP'
    NAME = __name__
    SUPPORTED_PARAMS = {
        'rel':
        measures.ParamInfo(
            dtype=int,
            default=1,
            desc='minimum relevance score to be considered relevant (inclusive)'
        )
    }


infAP = _infAP()
measures.register(infAP)
Example #24
    SHORT_DESC = 'The percentage of subtopics covered by the top k documents.'
    SUPPORTED_PARAMS = {
        'cutoff': measures.ParamInfo(dtype=int, required=False, desc='ranking cutoff threshold'),
        'rel': measures.ParamInfo(dtype=int, default=1, desc='minimum relevance score to be considered relevant (inclusive)'),
    }



ERR_IA = _ERR_IA()
nERR_IA = _nERR_IA()
alpha_DCG = _alpha_DCG()
α_DCG = alpha_DCG
alpha_nDCG = _alpha_nDCG()
α_nDCG = alpha_nDCG
NRBP = _NRBP()
nNRBP = _nNRBP()
AP_IA = _AP_IA()
MAP_IA = AP_IA
P_IA = _P_IA()
StRecall = _StRecall()

measures.register(ERR_IA)
measures.register(nERR_IA)
measures.register(alpha_DCG, aliases=['α_DCG'])
measures.register(alpha_nDCG, aliases=['α_nDCG'])
measures.register(NRBP)
measures.register(nNRBP)
measures.register(AP_IA, aliases=['MAP_IA'])
measures.register(P_IA)
measures.register(StRecall)