def test_parse_measure(self):
    tests = {
        'AP': AP,
        AP: AP,
        'MAP': AP,
        MAP: MAP,
        'P@10': P @ 10,
        P @ 10: P @ 10,
        'nDCG@10': nDCG @ 10,
        'P(rel=2)@10': P(rel=2) @ 10,
        'nDCG(dcg="exp-log2")@10': nDCG(dcg='exp-log2') @ 10,
        'nDCG(dcg="exp-log2", cutoff=20)@10': nDCG(dcg='exp-log2') @ 10,
        'nDCG(dcg="exp-log2", cutoff=20)': nDCG(dcg='exp-log2') @ 20,
        'nDCG(gains={0:1,1:2})': nDCG(gains={0: 1, 1: 2}),
        'nDCG(gains={1: 2, 0: 1})': nDCG(gains={0: 1, 1: 2}),
        'nDCG(gains={0:1,1:2})@5': nDCG(gains={0: 1, 1: 2}) @ 5,
        '[email protected]': IPrec @ 0.2,
        'IPrec(rel=2)@0.2': IPrec(rel=2) @ 0.2,
        'IPrec(rel=2, recall=0.4)@0.2': IPrec(rel=2) @ 0.2,
        'IPrec(rel=2, recall=0.4)': IPrec(rel=2) @ 0.4,
        IPrec(rel=2) @ 0.4: IPrec(rel=2) @ 0.4,
    }
    for key, value in tests.items():
        with self.subTest(key):
            self.assertEqual(ir_measures.parse_measure(key), value)
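# The table above leans on ir_measures' overloaded `@` operator: `P @ 10` constructs the
# same Measure object that parse_measure('P@10') returns. A minimal standalone sketch,
# assuming only that ir_measures is installed (all names are from its public API):
import ir_measures
from ir_measures import P, nDCG

assert ir_measures.parse_measure('P@10') == P @ 10
assert ir_measures.parse_measure('nDCG(dcg="exp-log2")@10') == nDCG(dcg='exp-log2') @ 10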
def supports(self, metric):
    # A metric is supported iff ir_measures can parse its string form:
    # parse_measure raises ValueError on a syntax error and NameError on
    # an unknown measure name.
    try:
        ir_measures.parse_measure(str(metric))
        return True
    except (ValueError, NameError):
        return False
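# Illustrative behavior of `supports` (hedged sketch; `evaluator` is a stand-in for
# whatever object carries this method):
#
#   evaluator.supports('nDCG@10')       # True: valid ir_measures spelling
#   evaluator.supports('not_a_metric')  # False: parse_measure raises NameError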
def calc_metrics(self, qrels, run, metrics, verbose=False):
    # Map each parsed Measure object back to the caller's original string so the
    # results are keyed by the spelling the caller asked for.
    measures = {ir_measures.parse_measure(str(m)): str(m) for m in metrics}
    results = {}
    for metric in ir_measures.iter_calc(list(measures), qrels, run):
        measure = measures[metric.measure]
        if measure not in results:
            results[measure] = {}
        results[measure][metric.query_id] = metric.value
    return results
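# Usage sketch (illustrative data; ir_measures accepts plain dict-of-dict qrels and
# runs like these, and `evaluator` is a stand-in for the owning object):
#
#   qrels = {'q1': {'d1': 1, 'd2': 0}}
#   run = {'q1': {'d1': 2.5, 'd2': 1.2}}
#   evaluator.calc_metrics(qrels, run, ['AP', 'nDCG@10'])
#   # -> {'AP': {'q1': 1.0}, 'nDCG@10': {'q1': 1.0}}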
def _query_differences(self, run1, run2, *args, **kwargs):
    """
    :param run1: TREC run with the format {qid: {docid: score}, ...}
    :param run2: Same format as run1
    :param args: Unused
    :param kwargs: Expects a 'dataset' parameter: an ir_datasets dataset instance
    :return: A list of qids that differ the most in the metric
    """
    assert "dataset" in kwargs, "Dataset object not supplied for qrel measure"
    dataset = kwargs["dataset"]
    assert dataset.has_qrels(), "Dataset object does not have the qrels files"

    # Restrict both runs to the queries they have in common.
    overlapping_keys = set(run1.keys()).intersection(set(run2.keys()))
    run1 = {qid: doc_id_to_score for qid, doc_id_to_score in run1.items() if qid in overlapping_keys}
    run2 = {qid: doc_id_to_score for qid, doc_id_to_score in run2.items() if qid in overlapping_keys}

    qrels = dataset.qrels_dict()
    try:
        metric = parse_measure(self.metric)
    except NameError:
        print("Unknown measure: {}. Please provide a measure supported by https://ir-measur.es/".format(self.metric))
        sys.exit(1)

    topk = self.topk
    eval_run_1 = self.convert_to_nested_dict(iter_calc([metric], qrels, run1))
    eval_run_2 = self.convert_to_nested_dict(iter_calc([metric], qrels, run2))

    # Rank the shared queries by the absolute per-query difference in the metric,
    # keeping the topk largest differences.
    query_ids = eval_run_1.keys() & eval_run_2.keys()
    query_ids = sorted(query_ids, key=lambda x: abs(eval_run_1[x][metric] - eval_run_2[x][metric]), reverse=True)
    query_ids = query_ids[:topk]
    id2diff = {x: abs(eval_run_1[x][metric] - eval_run_2[x][metric]) for x in query_ids}
    id2qrelscores = {x: [eval_run_1[x][metric], eval_run_2[x][metric]] for x in query_ids}
    return query_ids, id2diff, self.metric, id2qrelscores
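# `convert_to_nested_dict` is not shown here; a minimal sketch consistent with how it is
# used above (eval_run[qid][metric] lookups over ir_measures.iter_calc output, which
# yields Metric tuples with query_id, measure, and value fields) would be:
def convert_to_nested_dict(self, metrics_iter):
    # Group per-query metric values as {qid: {Measure: value}}.
    nested = {}
    for metric in metrics_iter:
        nested.setdefault(metric.query_id, {})[metric.measure] = metric.value
    return nested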
def _get_measures(args):
    measures, errors = [], []
    for mstr in args.measures:
        # Each argument may itself contain several space-separated measure strings.
        for m in mstr.split():
            try:
                measure = ir_measures.parse_measure(m)
                if measure not in measures:
                    measures.append(measure)
            except ValueError:
                errors.append(f'syntax error: {m}')
            except NameError:
                errors.append(f'unknown measure: {m}')
    if errors:
        sys.stderr.write('\n'.join(['error parsing measures'] + errors + ['']))
        sys.exit(-1)
    return measures
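# Sketch of the expected CLI input (illustrative; mirrors an argparse setup where each
# positional argument may bundle multiple space-separated measures):
#
#   import argparse
#   args = argparse.Namespace(measures=['nDCG@10 P@5', 'AP'])
#   _get_measures(args)  # -> [nDCG@10, P@5, AP] as Measure objects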