Example #1
    def _query_differences(self, run1, run2, *args, **kwargs):
        """
        :param run1: TREC run in the format {qid: {docid: score}, ...}
        :param run2: Same as above
        :param args: Unused
        :param kwargs: Expects a 'dataset' entry: an instance of an ir_datasets dataset
        :return: A tuple of (qids that differ the most in the metric, a qid-to-difference
                 dict, the metric name, and a qid-to-[run1 score, run2 score] dict)
        """
        assert "dataset" in kwargs, "Dataset object not supplied for qrel measure"
        dataset = kwargs["dataset"]
        assert dataset.has_qrels(), "Dataset object does not have the qrels files"
        overlapping_keys = set(run1.keys()).intersection(set(run2.keys()))
        run1 = {
            qid: doc_id_to_score
            for qid, doc_id_to_score in run1.items() if qid in overlapping_keys
        }
        run2 = {
            qid: doc_id_to_score
            for qid, doc_id_to_score in run2.items() if qid in overlapping_keys
        }

        qrels = dataset.qrels_dict()
        try:
            metric = parse_measure(self.metric)
        except NameError:
            print(
                "Unknown measure: {}. Please provide a measure supported by https://ir-measur.es/"
                .format(self.metric))
            sys.exit(1)

        topk = self.topk
        eval_run_1 = self.convert_to_nested_dict(
            iter_calc([metric], qrels, run1))
        eval_run_2 = self.convert_to_nested_dict(
            iter_calc([metric], qrels, run2))

        query_ids = eval_run_1.keys() & eval_run_2.keys()
        query_ids = sorted(
            query_ids,
            key=lambda x: abs(eval_run_1[x][metric] - eval_run_2[x][metric]),
            reverse=True)
        query_ids = query_ids[:topk]
        id2diff = {
            x: abs(eval_run_1[x][metric] - eval_run_2[x][metric])
            for x in query_ids
        }
        id2qrelscores = {
            x: [eval_run_1[x][metric], eval_run_2[x][metric]]
            for x in query_ids
        }
        return query_ids, id2diff, self.metric, id2qrelscores
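
The convert_to_nested_dict helper used above is not shown in this example. Based on how its result is indexed (eval_run_1[qid][metric]) and on the Metric tuples yielded by iter_calc, a minimal sketch of what it is assumed to do, written here as a standalone function:

def convert_to_nested_dict(metrics_iter):
    # Each item yielded by ir_measures.iter_calc is a Metric with
    # query_id, measure and value attributes.
    nested = {}
    for m in metrics_iter:
        nested.setdefault(m.query_id, {})[m.measure] = m.value
    return nested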
Example #2
    def calc_metrics(self, qrels, run, metrics, verbose=False):
        measures = {ir_measures.parse_measure(str(m)): str(m) for m in metrics}
        results = {}
        for metric in ir_measures.iter_calc(list(measures), qrels, run):
            measure = measures[metric.measure]
            if measure not in results:
                results[measure] = {}
            results[measure][metric.query_id] = metric.value
        return results
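
For context, a hypothetical call to calc_metrics, assuming qrels and run are given in the dict formats ir_measures accepts ({qid: {doc_id: relevance}} and {qid: {doc_id: score}}) and that evaluator is an instance of the class defining this method:

qrels = {'q1': {'d1': 1, 'd2': 0}}
run = {'q1': {'d1': 1.2, 'd2': 0.8}}
per_query = evaluator.calc_metrics(qrels, run, metrics=['P@1', 'nDCG@2'])
# per_query maps each measure string to a {query_id: value} dict,
# e.g. {'P@1': {'q1': ...}, 'nDCG@2': {'q1': ...}}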
Example #3
    def iter_calc(self, qrels, run):
        self.validate_params()
        return ir_measures.iter_calc([self], qrels, run)
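
Here self is itself an ir_measures Measure object, so the wrapper is roughly equivalent to calling the module-level API directly (a sketch, assuming measure, qrels and run are already defined):

for metric in ir_measures.iter_calc([measure], qrels, run):
    print(metric.query_id, metric.measure, metric.value)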
Example #4
    def test_empty(self):
        qrels = list(ir_measures.read_trec_qrels('''
0 0 D0 0
0 0 D1 1
0 0 D2 1
0 0 D3 2
0 0 D4 0
1 0 D0 1
1 0 D3 2
1 0 D5 2
'''))
        partial_qrels = [q for q in qrels if q.query_id == '0']
        run = list(ir_measures.read_trec_run('''
0 0 D0 1 0.8 run
0 0 D2 2 0.7 run
0 0 D1 3 0.3 run
0 0 D3 4 0.4 run
0 0 D4 5 0.1 run
1 0 D1 1 0.8 run
1 0 D3 2 0.7 run
1 0 D4 3 0.3 run
1 0 D2 4 0.4 run
'''))
        partial_run = [r for r in run if r.query_id == '0']
        empty = []

        # qrels but no run
        self.assertEqual(set(ir_measures.iter_calc([P@5], qrels, empty)), {Metric('0', P@5, 0.), Metric('1', P@5, 0.)})
        self.assertEqual(set(ir_measures.gdeval.iter_calc([ERR@5], qrels, empty)), {Metric('0', ERR@5, 0.), Metric('1', ERR@5, 0.)})
        self.assertEqual(set(ir_measures.judged.iter_calc([Judged@5], qrels, empty)), {Metric('0', Judged@5, 0.), Metric('1', Judged@5, 0.)})
        self.assertEqual(set(ir_measures.msmarco.iter_calc([RR@5], qrels, empty)), {Metric('0', RR@5, 0.), Metric('1', RR@5, 0.)})
        self.assertEqual(set(ir_measures.pytrec_eval.iter_calc([P@5], qrels, empty)), {Metric('0', P@5, 0.), Metric('1', P@5, 0.)})
        self.assertEqual(set(ir_measures.trectools.iter_calc([P@5], qrels, empty)), {Metric('0', P@5, 0.), Metric('1', P@5, 0.)})
        self.assertEqual(set(ir_measures.cwl_eval.iter_calc([P@5], qrels, empty)), {Metric('0', P@5, 0.0), Metric('1', P@5, 0.0)})
        self.assertEqual(set(ir_measures.compat.iter_calc([Compat(p=0.8)], qrels, empty)), {Metric('0', Compat(p=0.8), 0.0), Metric('1', Compat(p=0.8), 0.0)})
        self.assertEqual(set(ir_measures.accuracy.iter_calc([Accuracy()], qrels, empty)), set())

        # qrels but partial run
        self.assertEqual(set(ir_measures.iter_calc([P@5], qrels, partial_run)), {Metric('0', P@5, 0.6), Metric('1', P@5, 0.)})
        self.assertEqual(set(ir_measures.gdeval.iter_calc([ERR@5], qrels, partial_run)), {Metric('0', ERR@5, 0.10175), Metric('1', ERR@5, 0.)})
        self.assertEqual(set(ir_measures.judged.iter_calc([Judged@5], qrels, partial_run)), {Metric('0', Judged@5, 1.), Metric('1', Judged@5, 0.)})
        self.assertEqual(set(ir_measures.msmarco.iter_calc([RR@5], qrels, partial_run)), {Metric('0', RR@5, 0.5), Metric('1', RR@5, 0.)})
        self.assertEqual(set(ir_measures.pytrec_eval.iter_calc([P@5], qrels, partial_run)), {Metric('0', P@5, 0.6), Metric('1', P@5, 0.)})
        self.assertEqual(set(ir_measures.trectools.iter_calc([P@5], qrels, partial_run)), {Metric('0', P@5, 0.6), Metric('1', P@5, 0.)})
        self.assertEqual(set(ir_measures.cwl_eval.iter_calc([P@5], qrels, partial_run)), {CwlMetric('0', P@5, 0.6000000000000001, 3.0, 1.0, 5.0, 5.0), Metric('1', P@5, 0.0)})
        self.assertEqual(set(ir_measures.compat.iter_calc([Compat(p=0.8)], qrels, partial_run)), {Metric('0', Compat(p=0.8), 0.4744431703672816), Metric('1', Compat(p=0.8), 0.0)})
        self.assertEqual(set(ir_measures.accuracy.iter_calc([Accuracy()], qrels, partial_run)), {Metric('0', Accuracy(), 0.5)})

        # run but no qrels
        self.assertEqual(list(ir_measures.iter_calc([P@5], empty, run)), [])
        self.assertEqual(list(ir_measures.gdeval.iter_calc([ERR@5], empty, run)), [])
        self.assertEqual(list(ir_measures.judged.iter_calc([Judged@5], empty, run)), [])
        self.assertEqual(list(ir_measures.msmarco.iter_calc([RR@5], empty, run)), [])
        self.assertEqual(list(ir_measures.pytrec_eval.iter_calc([P@5], empty, run)), [])
        self.assertEqual(list(ir_measures.trectools.iter_calc([P@5], empty, run)), [])
        self.assertEqual(list(ir_measures.cwl_eval.iter_calc([P@5], empty, run)), [])
        self.assertEqual(list(ir_measures.compat.iter_calc([Compat(p=0.8)], empty, run)), [])
        self.assertEqual(list(ir_measures.accuracy.iter_calc([Accuracy()], empty, run)), [])

        # run but partial qrels
        self.assertEqual(set(ir_measures.iter_calc([P@5], partial_qrels, run)), {Metric('0', P@5, 0.6)})
        self.assertEqual(set(ir_measures.gdeval.iter_calc([ERR@5], partial_qrels, run)), {Metric('0', ERR@5, 0.10175)})
        self.assertEqual(set(ir_measures.judged.iter_calc([Judged@5], partial_qrels, run)), {Metric('0', Judged@5, 1.)})
        self.assertEqual(set(ir_measures.msmarco.iter_calc([RR@5], partial_qrels, run)), {Metric('0', RR@5, 0.5)})
        self.assertEqual(set(ir_measures.pytrec_eval.iter_calc([P@5], partial_qrels, run)), {Metric('0', P@5, 0.6)})
        self.assertEqual(set(ir_measures.trectools.iter_calc([P@5], partial_qrels, run)), {Metric('0', P@5, 0.6)})
        self.assertEqual(set(ir_measures.cwl_eval.iter_calc([P@5], partial_qrels, run)), {CwlMetric('0', P@5, 0.6000000000000001, 3.0, 1.0, 5.0, 5.0)})
        self.assertEqual(set(ir_measures.compat.iter_calc([Compat(p=0.8)], partial_qrels, run)), {Metric('0', Compat(p=0.8), 0.4744431703672816)})
        self.assertEqual(set(ir_measures.accuracy.iter_calc([Accuracy()], partial_qrels, run)), {Metric('0', Accuracy(), 0.5)})

        # both no run and no qrels
        self.assertEqual(list(ir_measures.iter_calc([P@5], empty, empty)), [])
        self.assertEqual(list(ir_measures.gdeval.iter_calc([ERR@5], empty, empty)), [])
        self.assertEqual(list(ir_measures.judged.iter_calc([Judged@5], empty, empty)), [])
        self.assertEqual(list(ir_measures.msmarco.iter_calc([RR@5], empty, empty)), [])
        self.assertEqual(list(ir_measures.pytrec_eval.iter_calc([P@5], empty, empty)), [])
        self.assertEqual(list(ir_measures.trectools.iter_calc([P@5], empty, empty)), [])
        self.assertEqual(list(ir_measures.cwl_eval.iter_calc([P@5], empty, empty)), [])
        self.assertEqual(list(ir_measures.compat.iter_calc([Compat(p=0.8)], empty, empty)), [])
        self.assertEqual(list(ir_measures.accuracy.iter_calc([Accuracy()], empty, empty)), [])

        # qrels but no run
        numpy.testing.assert_equal(ir_measures.calc_aggregate([P@5], qrels, empty), {P@5: 0.})
        numpy.testing.assert_equal(ir_measures.gdeval.calc_aggregate([ERR@5], qrels, empty), {ERR@5: 0.})
        numpy.testing.assert_equal(ir_measures.judged.calc_aggregate([Judged@5], qrels, empty), {Judged@5: 0.})
        numpy.testing.assert_equal(ir_measures.msmarco.calc_aggregate([RR@5], qrels, empty), {RR@5: 0.})
        numpy.testing.assert_equal(ir_measures.pytrec_eval.calc_aggregate([P@5], qrels, empty), {P@5: 0.})
        numpy.testing.assert_equal(ir_measures.trectools.calc_aggregate([P@5], qrels, empty), {P@5: 0.})
        numpy.testing.assert_equal(ir_measures.cwl_eval.calc_aggregate([P@5], qrels, empty), {P@5: 0.})
        numpy.testing.assert_equal(ir_measures.compat.calc_aggregate([Compat(p=0.8)], qrels, empty), {Compat(p=0.8): 0.})
        numpy.testing.assert_equal(ir_measures.accuracy.calc_aggregate([Accuracy()], qrels, empty), {Accuracy(): float('NaN')})

        # qrels but partial run
        numpy.testing.assert_equal(ir_measures.calc_aggregate([P@5], qrels, partial_run), {P@5: 0.3})
        numpy.testing.assert_equal(ir_measures.gdeval.calc_aggregate([ERR@5], qrels, partial_run), {ERR@5: 0.050875})
        numpy.testing.assert_equal(ir_measures.judged.calc_aggregate([Judged@5], qrels, partial_run), {Judged@5: 0.5})
        numpy.testing.assert_equal(ir_measures.msmarco.calc_aggregate([RR@5], qrels, partial_run), {RR@5: 0.25})
        numpy.testing.assert_equal(ir_measures.pytrec_eval.calc_aggregate([P@5], qrels, partial_run), {P@5: 0.3})
        numpy.testing.assert_equal(ir_measures.trectools.calc_aggregate([P@5], qrels, partial_run), {P@5: 0.3})
        numpy.testing.assert_equal(ir_measures.cwl_eval.calc_aggregate([P@5], qrels, partial_run), {P@5: 0.30000000000000004})
        numpy.testing.assert_equal(ir_measures.compat.calc_aggregate([Compat(p=0.8)], qrels, partial_run), {Compat(p=0.8): 0.2372215851836408})
        numpy.testing.assert_equal(ir_measures.accuracy.calc_aggregate([Accuracy()], qrels, partial_run), {Accuracy(): 0.5})

        # run but no qrels
        numpy.testing.assert_equal(ir_measures.calc_aggregate([P@5], empty, run), {P@5: float('NaN')})
        numpy.testing.assert_equal(ir_measures.gdeval.calc_aggregate([ERR@5], empty, run), {ERR@5: float('NaN')})
        numpy.testing.assert_equal(ir_measures.judged.calc_aggregate([Judged@5], empty, run), {Judged@5: float('NaN')})
        numpy.testing.assert_equal(ir_measures.msmarco.calc_aggregate([RR@5], empty, run), {RR@5: float('NaN')})
        numpy.testing.assert_equal(ir_measures.pytrec_eval.calc_aggregate([P@5], empty, run), {P@5: float('NaN')})
        numpy.testing.assert_equal(ir_measures.trectools.calc_aggregate([P@5], empty, run), {P@5: float('NaN')})
        numpy.testing.assert_equal(ir_measures.cwl_eval.calc_aggregate([P@5], empty, run), {P@5: float('NaN')})
        numpy.testing.assert_equal(ir_measures.compat.calc_aggregate([Compat(p=0.8)], empty, run), {Compat(p=0.8): float('NaN')})
        numpy.testing.assert_equal(ir_measures.accuracy.calc_aggregate([Accuracy()], empty, run), {Accuracy(): float('NaN')})

        # run but partial qrels
        numpy.testing.assert_equal(ir_measures.calc_aggregate([P@5], partial_qrels, run), {P@5: 0.6})
        numpy.testing.assert_equal(ir_measures.gdeval.calc_aggregate([ERR@5], partial_qrels, run), {ERR@5: 0.10175})
        numpy.testing.assert_equal(ir_measures.judged.calc_aggregate([Judged@5], partial_qrels, run), {Judged@5: 1.0})
        numpy.testing.assert_equal(ir_measures.msmarco.calc_aggregate([RR@5], partial_qrels, run), {RR@5: 0.5})
        numpy.testing.assert_equal(ir_measures.pytrec_eval.calc_aggregate([P@5], partial_qrels, run), {P@5: 0.6})
        numpy.testing.assert_equal(ir_measures.trectools.calc_aggregate([P@5], partial_qrels, run), {P@5: 0.6})
        numpy.testing.assert_equal(ir_measures.cwl_eval.calc_aggregate([P@5], partial_qrels, run), {P@5: 0.6000000000000001})
        numpy.testing.assert_equal(ir_measures.compat.calc_aggregate([Compat(p=0.8)], partial_qrels, run), {Compat(p=0.8): 0.4744431703672816})
        numpy.testing.assert_equal(ir_measures.accuracy.calc_aggregate([Accuracy()], partial_qrels, run), {Accuracy(): 0.5})

        # both no run and no qrels
        numpy.testing.assert_equal(ir_measures.calc_aggregate([P@5], empty, empty), {P@5: float('NaN')})
        numpy.testing.assert_equal(ir_measures.gdeval.calc_aggregate([ERR@5], empty, empty), {ERR@5: float('NaN')})
        numpy.testing.assert_equal(ir_measures.judged.calc_aggregate([Judged@5], empty, empty), {Judged@5: float('NaN')})
        numpy.testing.assert_equal(ir_measures.msmarco.calc_aggregate([RR@5], empty, empty), {RR@5: float('NaN')})
        numpy.testing.assert_equal(ir_measures.pytrec_eval.calc_aggregate([P@5], empty, empty), {P@5: float('NaN')})
        numpy.testing.assert_equal(ir_measures.trectools.calc_aggregate([P@5], empty, empty), {P@5: float('NaN')})
        numpy.testing.assert_equal(ir_measures.cwl_eval.calc_aggregate([P@5], empty, empty), {P@5: float('NaN')})
        numpy.testing.assert_equal(ir_measures.compat.calc_aggregate([Compat(p=0.8)], empty, empty), {Compat(p=0.8): float('NaN')})
        numpy.testing.assert_equal(ir_measures.accuracy.calc_aggregate([Accuracy()], empty, empty), {Accuracy(): float('NaN')})
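
The test above relies on the TREC text formats parsed by read_trec_qrels (query_id, iteration, doc_id, relevance) and read_trec_run (query_id, iteration, doc_id, rank, score, run tag). A minimal standalone sketch of the same parse-then-evaluate flow, with made-up data:

import ir_measures
from ir_measures import P

qrels = list(ir_measures.read_trec_qrels('0 0 D0 1\n0 0 D1 0'))
run = list(ir_measures.read_trec_run('0 0 D0 1 0.9 run\n0 0 D1 2 0.5 run'))
print(ir_measures.calc_aggregate([P@1], qrels, run))  # expected: {P@1: 1.0}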
Example #5
def _run_and_evaluate(system: SYSTEM_OR_RESULTS_TYPE,
                      topics: pd.DataFrame,
                      qrels: pd.DataFrame,
                      metrics: MEASURES_TYPE,
                      pbar=None,
                      save_mode=None,
                      save_file=None,
                      perquery: bool = False,
                      batch_size=None,
                      backfill_qids: Sequence[str] = None):

    from .io import read_results, write_results

    if pbar is None:
        from . import tqdm
        pbar = tqdm(disable=True)

    metrics, rev_mapping = _convert_measures(metrics)
    qrels = qrels.rename(columns={
        'qid': 'query_id',
        'docno': 'doc_id',
        'label': 'relevance'
    })
    from timeit import default_timer as timer
    runtime = 0
    num_q = qrels['query_id'].nunique()
    if save_file is not None and os.path.exists(save_file):
        if save_mode == "reuse":
            system = read_results(save_file)
        elif save_mode == "overwrite":
            os.remove(save_file)
        else:
            raise ValueError(
                "Unknown save_mode argument '%s', valid options are 'reuse' or 'overwrite'"
                % save_mode)

    # if it's a DataFrame, use it as the results
    if isinstance(system, pd.DataFrame):
        res = system
        res = coerce_dataframe_types(res)
        if len(res) == 0:
            raise ValueError("%d topics, but no results in dataframe" %
                             len(topics))
        evalMeasuresDict = _ir_measures_to_dict(
            ir_measures.iter_calc(metrics, qrels,
                                  res.rename(columns=_irmeasures_columns)),
            metrics, rev_mapping, num_q, perquery, backfill_qids)
        pbar.update()

    elif batch_size is None:
        #transformer, evaluate all queries at once

        starttime = timer()
        res = system.transform(topics)
        endtime = timer()
        runtime = (endtime - starttime) * 1000.

        # write results to save_file; we can be sure this file does not exist
        if save_file is not None:
            write_results(res, save_file)

        res = coerce_dataframe_types(res)

        if len(res) == 0:
            raise ValueError("%d topics, but no results received from %s" %
                             (len(topics), str(system)))

        evalMeasuresDict = _ir_measures_to_dict(
            ir_measures.iter_calc(metrics, qrels,
                                  res.rename(columns=_irmeasures_columns)),
            metrics, rev_mapping, num_q, perquery, backfill_qids)
        pbar.update()
    else:
        #transformer, evaluate queries in batches
        assert batch_size > 0
        starttime = timer()
        evalMeasuresDict = {}
        remaining_qrel_qids = set(qrels.query_id)
        try:
            for i, (res, batch_topics) in enumerate(
                    system.transform_gen(topics,
                                         batch_size=batch_size,
                                         output_topics=True)):
                if len(res) == 0:
                    raise ValueError(
                        "batch of %d topics, but no results received in batch %d from %s"
                        % (len(batch_topics), i, str(system)))
                endtime = timer()
                runtime += (endtime - starttime) * 1000.

                # write results to save_file; we will append for subsequent batches
                if save_file is not None:
                    write_results(res, save_file, append=True)

                res = coerce_dataframe_types(res)
                batch_qids = set(batch_topics.qid)
                batch_qrels = qrels[qrels.query_id.isin(
                    batch_qids
                )]  # filter qrels down to just the qids that appear in this batch
                remaining_qrel_qids.difference_update(batch_qids)
                batch_backfill = [
                    qid for qid in backfill_qids if qid in batch_qids
                ] if backfill_qids is not None else None
                evalMeasuresDict.update(
                    _ir_measures_to_dict(
                        ir_measures.iter_calc(
                            metrics, batch_qrels,
                            res.rename(columns=_irmeasures_columns)),
                        metrics,
                        rev_mapping,
                        num_q,
                        perquery=True,
                        backfill_qids=batch_backfill))
                pbar.update()
                starttime = timer()
        except:
            # if an error is thrown, we need to clean up our existing file
            if save_file is not None and os.path.exists(save_file):
                os.remove(save_file)
            raise
        if remaining_qrel_qids:
            # there are some qids in the qrels that were not in the topics. Get the default values for these and update evalMeasuresDict
            missing_qrels = qrels[qrels.query_id.isin(remaining_qrel_qids)]
            empty_res = pd.DataFrame([],
                                     columns=['query_id', 'doc_id', 'score'])
            evalMeasuresDict.update(
                _ir_measures_to_dict(
                    ir_measures.iter_calc(metrics, missing_qrels, empty_res),
                    metrics,
                    rev_mapping,
                    num_q,
                    perquery=True))
        if not perquery:
            # aggregate measures if not in per query mode
            aggregators = {
                rev_mapping.get(m, str(m)): m.aggregator()
                for m in metrics
            }
            for q in evalMeasuresDict:
                for metric in metrics:
                    s_metric = rev_mapping.get(metric, str(metric))
                    aggregators[s_metric].add(evalMeasuresDict[q][s_metric])
            evalMeasuresDict = {
                m: agg.result()
                for m, agg in aggregators.items()
            }
    return (runtime, evalMeasuresDict)
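
The _irmeasures_columns mapping referenced above is not defined in this example; it renames PyTerrier's result columns to the names ir_measures expects in a run DataFrame. A plausible definition (an assumption, not taken from the source):

# Hypothetical mapping from PyTerrier result columns to ir_measures column names.
_irmeasures_columns = {'qid': 'query_id', 'docno': 'doc_id', 'score': 'score'}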
Example #6
def _run_and_evaluate(system: SYSTEM_OR_RESULTS_TYPE,
                      topics: pd.DataFrame,
                      qrels: pd.DataFrame,
                      metrics: MEASURES_TYPE,
                      perquery: bool = False,
                      batch_size=None):

    metrics, rev_mapping = _convert_measures(metrics)
    qrels = qrels.rename(columns={
        'qid': 'query_id',
        'docno': 'doc_id',
        'label': 'relevance'
    })
    from timeit import default_timer as timer
    runtime = 0
    num_q = qrels['query_id'].nunique()
    # if it's a DataFrame, use it as the results
    if isinstance(system, pd.DataFrame):
        res = system
        if len(res) == 0:
            raise ValueError("%d topics, but no results in dataframe" %
                             len(topics))
        evalMeasuresDict = _ir_measures_to_dict(
            ir_measures.iter_calc(metrics, qrels,
                                  res.rename(columns=_irmeasures_columns)),
            metrics, rev_mapping, num_q, perquery)

    elif batch_size is None:
        #transformer, evaluate all queries at once

        starttime = timer()
        res = system.transform(topics)
        endtime = timer()
        runtime = (endtime - starttime) * 1000.

        if len(res) == 0:
            raise ValueError("%d topics, but no results received from %s" %
                             (len(topics), str(system)))

        evalMeasuresDict = _ir_measures_to_dict(
            ir_measures.iter_calc(metrics, qrels,
                                  res.rename(columns=_irmeasures_columns)),
            metrics, rev_mapping, num_q, perquery)
    else:
        #transformer, evaluate queries in batches
        assert batch_size > 0
        starttime = timer()
        results = []
        evalMeasuresDict = {}
        for i, res in enumerate(
                system.transform_gen(topics, batch_size=batch_size)):
            if len(res) == 0:
                raise ValueError(
                    "batch of %d topics, but no results received in batch %d from %s"
                    % (batch_size, i, str(system)))
            endtime = timer()
            runtime += (endtime - starttime) * 1000.
            localEvalDict = _ir_measures_to_dict(
                ir_measures.iter_calc(metrics, qrels,
                                      res.rename(columns=_irmeasures_columns)),
                metrics, rev_mapping, num_q, True)
            evalMeasuresDict.update(localEvalDict)
            starttime = timer()
        if not perquery:
            aggregators = {
                rev_mapping.get(m, str(m)): m.aggregator()
                for m in metrics
            }
            for q in evalMeasuresDict:
                for metric in metrics:
                    s_metric = rev_mapping.get(metric, str(metric))
                    aggregators[s_metric].add(evalMeasuresDict[q][s_metric])
            evalMeasuresDict = {
                m: agg.result()
                for m, agg in aggregators.items()
            }
    return (runtime, evalMeasuresDict)
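
The aggregation step at the end uses ir_measures' aggregator API: each Measure provides an aggregator whose add/result pair reduces per-query values to a single score. The same pattern in isolation (a sketch, assuming measure, qrels and run are already defined):

agg = measure.aggregator()
for metric in ir_measures.iter_calc([measure], qrels, run):
    agg.add(metric.value)
aggregate_value = agg.result()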
Example #7
    def create_query_objects(self,
                             run_1,
                             run_2,
                             qids,
                             qid2diff,
                             metric_name,
                             dataset,
                             qid2qrelscores=None):
        """
        TODO: Need a better name
        This method takes in 2 runs and a set of qids, and constructs a dict for each qid (format specified below)
        :param run_1: TREC run of the format {qid: {docid: score}, ...}
        :param run_2: TREC run of the format {qid: {docid: score}, ...}
        :param qids: A list of qids (strings)
        :param qid2diff: Dict mapping each qid to the difference in the metric between the two runs
        :param metric_name: Name of the metric used to compute qid2diff
        :param dataset: Instance of an ir-datasets object
        :param qid2qrelscores: Optional dict mapping each qid to [run_1 score, run_2 score]
        :return: A list of dicts. Each dict has the following format:
        {
            "fields": {"query_id": "qid", "title": "Title query", "desc": "Can be empty", ... everything else in ir-dataset query},
            "run_1": [
                {
                    "doc_id": "id of the doc",
                    "score": <score>,
                    "relevance": <comes from qrels>,
                    "weights": [
                        [field, start, stop, weight]
                        ^ Need more clarity. Return an empty list for now
                    ]

                }
            ],
            "run_2": <same format as run 1>
        }
        """
        assert dataset.has_qrels(), "Cannot determine whether the doc is relevant - need qrels"
        qrels = dataset.qrels_dict()
        run1_metrics = defaultdict(lambda: defaultdict(lambda: None))
        for metrics in iter_calc(
                [P@1, P@3, P@5, P@10, nDCG@1, nDCG@3, nDCG@5, nDCG@10],
                qrels, run_1):
            run1_metrics[metrics.query_id][str(metrics.measure)] = metrics.value
        if run_2:
            run2_metrics = defaultdict(lambda: defaultdict(lambda: None))
            for metrics in iter_calc(
                    [P@1, P@3, P@5, P@10, nDCG@1, nDCG@3, nDCG@5, nDCG@10],
                    qrels, run_2):
                run2_metrics[metrics.query_id][str(metrics.measure)] = metrics.value
        docstore = dataset.docs_store()
        qids_set = set(qids)  # Sets do O(1) lookups
        qid2object = {}
        for query in tqdm(dataset.queries_iter(), desc="analyzing queries"):
            if query.query_id not in qids_set:
                continue

            RESULT_COUNT = 10
            doc_ids = (
                set(list(run_1[query.query_id])[:RESULT_COUNT] +
                    list(run_2[query.query_id])[:RESULT_COUNT])
                if run_2 else list(run_1[query.query_id])[:RESULT_COUNT])

            fields = query._asdict()
            fields["contrast"] = {
                "name": metric_name,
                "value": qid2diff[query.query_id]
            }
            if qid2qrelscores:
                fields[f"Run1 {metric_name}"] = qid2qrelscores[
                    query.query_id][0]
                fields[f"Run2 {metric_name}"] = qid2qrelscores[
                    query.query_id][1]
            qrels_for_query = qrels.get(query.query_id, {})
            run_1_for_query = []
            for rank, (doc_id, score) in enumerate(run_1[query.query_id].items()):
                if doc_id not in doc_ids:
                    continue
                doc = docstore.get(doc_id)
                weights = self.weight.score_document_regions(query, doc, 0)
                run_1_for_query.append({
                    "doc_id": doc_id,
                    "score": score,
                    "relevance": qrels_for_query.get(doc_id),
                    "rank": rank + 1,
                    "weights": weights,
                    "snippet": self.find_snippet(weights, doc),
                })

            run_2_for_query = []

            if run_2 is not None:
                for rank, (doc_id, score) in enumerate(run_2[query.query_id].items()):
                    if doc_id not in doc_ids:
                        continue
                    doc = docstore.get(doc_id)
                    weights = self.weight.score_document_regions(query, doc, 1)
                    run_2_for_query.append({
                        "doc_id": doc_id,
                        "score": score,
                        "relevance": qrels_for_query.get(doc_id),
                        "rank": rank + 1,
                        "weights": weights,
                        "snippet": self.find_snippet(weights, doc),
                    })

            qid2object[query.query_id] = {
                "fields": fields,
                "metrics": {
                    metric: ([run1_metrics[query.query_id][metric],
                              run2_metrics[query.query_id][metric]]
                             if run_2 else [run1_metrics[query.query_id][metric]])
                    for metric in ["P@1", "P@3", "P@5", "P@10",
                                   "nDCG@1", "nDCG@3", "nDCG@5", "nDCG@10"]
                },
                "run_1": run_1_for_query,
                "run_2": run_2_for_query,
                "summary": self.create_summary(run_1_for_query, run_2_for_query),
                "mergedWeights": self.merge_weights(run_1_for_query, run_2_for_query),
            }

        return [qid2object[id] for id in qids]
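
The "metrics" block above is keyed by the string form of each measure (str(P@10) == 'P@10', str(nDCG@10) == 'nDCG@10'), which is why run1_metrics and run2_metrics are filled with str(...) of the measure as the key. The same pattern in isolation (a sketch, assuming qrels and run_1 are defined as in the method above):

from collections import defaultdict
from ir_measures import iter_calc, P, nDCG

per_query = defaultdict(dict)
for m in iter_calc([P@10, nDCG@10], qrels, run_1):
    per_query[m.query_id][str(m.measure)] = m.value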