Example #1
    def find_crossvalidated_results(self):
        searcher_runs = {}
        rank_results = self.rank.evaluate()
        for fold in self.benchmark.folds:
            searcher_runs[fold] = {
                "dev": Searcher.load_trec_run(rank_results["path"][fold])
            }
            searcher_runs[fold]["test"] = searcher_runs[fold]["dev"]

        reranker_runs = {}
        train_output_path = self.get_results_path()
        test_output_path = train_output_path / "pred" / "test" / "best"
        dev_output_path = train_output_path / "pred" / "dev" / "best"
        for fold in self.benchmark.folds:
            # TODO fix by using multiple Tasks
            test_path = Path(test_output_path.as_posix().replace(
                "fold-" + self.config["fold"], "fold-" + fold))
            if os.path.exists(test_path):
                reranker_runs.setdefault(
                    fold, {})["test"] = Searcher.load_trec_run(test_path)

                dev_path = Path(dev_output_path.as_posix().replace(
                    "fold-" + self.config["fold"], "fold-" + fold))
                reranker_runs.setdefault(
                    fold, {})["dev"] = Searcher.load_trec_run(dev_path)

        return searcher_runs, reranker_runs
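
The two dicts built here mirror the arguments consumed by evaluator.interpolated_eval later in this collection (Example #13). A minimal, hedged usage sketch; `task` is a placeholder for the object that owns find_crossvalidated_results and is not defined in this snippet:

# Hedged sketch: `task` and its attributes are placeholders.
searcher_runs, reranker_runs = task.find_crossvalidated_results()
# interpolated_eval blends searcher and reranker scores per fold, presumably tuning the
# interpolation weight on dev and reporting metrics on test (see Example #13 for the call site).
interpolated = evaluator.interpolated_eval(
    searcher_runs, reranker_runs, task.benchmark, task.config["optimize"], evaluator.DEFAULT_METRICS
)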
Example #2
def search_best_run(runfile_dirs, benchmark, primary_metric, metrics=None, folds=None):
    """
    Select the best runfile for each fold with respect to the specified metric

    Args:
        runfile_dirs: a directory (or list of directories) containing the runfiles to select from
        benchmark: Benchmark class
        primary_metric: str, the metric used to select the best runfile, e.g. ndcg_cut_20
        metrics: str or list, additional metrics to calculate on the best runs
        folds: str, the name of a single fold to select from (defaults to all folds)

    Returns:
       a dict storing the metric scores and the path to the best runfile for each fold
    """

    if not isinstance(runfile_dirs, (list, tuple)):
        runfile_dirs = [runfile_dirs]

    metrics = [] if not metrics else ([metrics] if isinstance(metrics, str) else list(metrics))
    if primary_metric not in metrics:
        metrics = [primary_metric] + metrics

    folds = {s: benchmark.folds[s] for s in [folds]} if folds else benchmark.folds
    runfiles = [
        os.path.join(runfile_dir, f)
        for runfile_dir in runfile_dirs
        for f in os.listdir(runfile_dir)
        if (f != "done" and not os.path.isdir(os.path.join(runfile_dir, f)))
    ]

    best_scores = {s: {primary_metric: 0, "path": None} for s in folds}
    for runfile in runfiles:
        runs = Searcher.load_trec_run(runfile)
        for s, v in folds.items():
            score = _eval_runs(
                runs,
                benchmark.qrels,
                [primary_metric],
                (set(v["train_qids"]) | set(v["predict"]["dev"])),
                benchmark.relevance_level,
            )[primary_metric]
            if score > best_scores[s][primary_metric]:
                best_scores[s] = {primary_metric: score, "path": runfile}

    test_runs = {}
    for s, score_dict in best_scores.items():
        test_qids = folds[s]["predict"]["test"]
        # any empty (no results) queries need to be added so they contribute zeros to the average
        test_runs.update({qid: {} for qid in test_qids})
        test_runs.update({qid: v for qid, v in Searcher.load_trec_run(score_dict["path"]).items() if qid in test_qids})

    scores = eval_runs(test_runs, benchmark.qrels, metrics, benchmark.relevance_level)
    return {"score": scores, "path": {s: v["path"] for s, v in best_scores.items()}}
Example #3
def evaluate(config, modules):
    metric = "map"
    fold = config["fold"]
    train_output_path = _pipeline_path(config, modules)
    test_output_path = train_output_path / "pred" / "test" / "best"

    searcher = modules["searcher"]
    benchmark = modules["benchmark"]
    reranker = modules["reranker"]

    if os.path.exists(test_output_path):
        test_preds = Searcher.load_trec_run(test_output_path)
    else:
        topics_fn = benchmark.topic_file
        searcher_cache_dir = os.path.join(searcher.get_cache_path(), benchmark.name)
        searcher_run_dir = searcher.query_from_file(topics_fn, searcher_cache_dir)

        best_search_run_path = evaluator.search_best_run(searcher_run_dir, benchmark, metric)["path"][fold]
        best_search_run = searcher.load_trec_run(best_search_run_path)

        docids = set(docid for querydocs in best_search_run.values() for docid in querydocs)
        reranker["extractor"].create(qids=best_search_run.keys(), docids=docids, topics=benchmark.topics[benchmark.query_type])
        reranker.build()

        reranker["trainer"].load_best_model(reranker, train_output_path)

        test_run = {qid: docs for qid, docs in best_search_run.items() if qid in benchmark.folds[fold]["predict"]["test"]}
        test_dataset = PredDataset(qid_docid_to_rank=test_run, extractor=reranker["extractor"], mode="test")

        test_preds = reranker["trainer"].predict(reranker, test_dataset, test_output_path)

    metrics = evaluator.eval_runs(test_preds, benchmark.qrels, ["ndcg_cut_20", "ndcg_cut_10", "map", "P_20", "P_10"])
    print("test metrics for fold=%s:" % fold, metrics)

    print("\ncomputing metrics across all folds")
    avg = {}
    found = 0
    for fold in benchmark.folds:
        pred_path = _pipeline_path(config, modules, fold=fold) / "pred" / "test" / "best"
        if not os.path.exists(pred_path):
            print("\tfold=%s results are missing and will not be included" % fold)
            continue

        found += 1
        preds = Searcher.load_trec_run(pred_path)
        metrics = evaluator.eval_runs(preds, benchmark.qrels, ["ndcg_cut_20", "ndcg_cut_10", "map", "P_20", "P_10"])
        for metric, val in metrics.items():
            avg.setdefault(metric, []).append(val)

    avg = {k: np.mean(v) for k, v in avg.items()}
    print(f"average metrics across {found}/{len(benchmark.folds)} folds:", avg)
Example #4
    def predict(self):
        fold = self.config["fold"]
        self.rank.search()
        threshold = self.config["threshold"]
        rank_results = self.rank.evaluate()
        best_search_run_path = rank_results["path"][fold]
        best_search_run = Searcher.load_trec_run(best_search_run_path)

        docids = set(docid for querydocs in best_search_run.values() for docid in querydocs)
        self.reranker.extractor.preprocess(
            qids=best_search_run.keys(), docids=docids, topics=self.benchmark.topics[self.benchmark.query_type]
        )
        train_output_path = self.get_results_path()
        self.reranker.build_model()
        self.reranker.trainer.load_best_model(self.reranker, train_output_path)

        test_run = defaultdict(dict)
        # This is possible because best_search_run is an OrderedDict
        for qid, docs in best_search_run.items():
            if qid in self.benchmark.folds[fold]["predict"]["test"]:
                for idx, (docid, score) in enumerate(docs.items()):
                    if idx >= threshold:
                        break
                    test_run[qid][docid] = score

        test_dataset = PredSampler()
        test_dataset.prepare(
            test_run, self.benchmark.qrels, self.reranker.extractor, relevance_level=self.benchmark.relevance_level
        )
        test_output_path = train_output_path / "pred" / "test" / "best"
        test_preds = self.reranker.trainer.predict(self.reranker, test_dataset, test_output_path)

        preds = {"test": test_preds}

        return preds
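
The threshold loop above keeps only the first `threshold` documents of each test query, relying on the run's insertion order; the same idea as a standalone helper (the name is mine, not part of the codebase):

def truncate_run(run, k):
    """Keep the first k documents of each query's ranked list (dicts preserve insertion order)."""
    return {qid: dict(list(docs.items())[:k]) for qid, docs in run.items()}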
Example #5
def test_write_run(tmpdir):
    """ write a TREC searcher file """
    fn = tmpdir / "searcher"
    run_dict = {"q1": {"d1": 1.1, "d2": 1.0}, "q2": {"d5": 9.0}}

    Searcher.write_trec_run(run_dict, fn)
    run = Searcher.load_trec_run(fn)
    assert sorted(run.items()) == sorted(run_dict.items())
Example #6
    def train(self):
        fold = self.config["fold"]

        self.rank.search()
        rank_results = self.rank.evaluate()
        best_search_run_path = rank_results["path"][fold]
        best_search_run = Searcher.load_trec_run(best_search_run_path)

        return self.rerank_run(best_search_run, self.get_results_path())
Example #7
    def train(self, init_path=""):
        fold = self.config["fold"]

        self.rank.search()
        rank_results = self.rank.evaluate()
        best_search_run_path = rank_results["path"][fold]
        best_search_run = Searcher.load_trec_run(best_search_run_path)
        wandb.save(str(best_search_run_path))
        return self.rerank_run(best_search_run,
                               self.get_results_path(),
                               init_path=init_path)
Example #8
def eval_runfile(runfile, qrels, metrics, relevance_level):
    """
    Evaluate a single runfile produced by the ranker or reranker

    Args:
        runfile: str, path to the runfile
        qrels: dict, the relevance judgments provided by the benchmark
        metrics: str or list, the metrics to calculate, e.g. ndcg_cut_20
        relevance_level: the minimum judgment value treated as relevant during evaluation

    Returns:
        a dict in the format {metric: score} containing the evaluation score for each specified metric
    """
    metrics = [metrics] if isinstance(metrics, str) else list(metrics)
    runs = Searcher.load_trec_run(runfile)
    return _eval_runs(runs, qrels, metrics, list(qrels.keys()), relevance_level)
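
A short usage sketch, assuming `benchmark` is a built Benchmark instance; the runfile path is a placeholder:

scores = eval_runfile(
    "path/to/searcher/runfile",   # placeholder path
    benchmark.qrels,
    ["map", "ndcg_cut_20"],
    benchmark.relevance_level,
)
# scores has the form {"map": <float>, "ndcg_cut_20": <float>}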
Example #9
def test_load_run(tmpdir):
    """ load a TREC searcher file """

    run_txt = """
             q1 Q0 d1 1 1.1 tag
             q1 Q0 d2 2 1.0 tag
             q2 Q0 d5 1 9.0 tag
    """
    run_dict = {"q1": {"d1": 1.1, "d2": 1.0}, "q2": {"d5": 9.0}}

    fn = tmpdir / "searcher"
    with open(fn, "wt", encoding="utf-8") as outf:
        outf.write(run_txt)

    run = Searcher.load_trec_run(fn)
    assert sorted(run.items()) == sorted(run_dict.items())
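
For reference, the file written and read in these tests follows the standard TREC run format (qid Q0 docid rank score tag). Below is a self-contained sketch of a parser producing the same {qid: {docid: score}} mapping; it is an illustration, not the library's load_trec_run implementation:

def parse_trec_run(path):
    """Parse a TREC-format run file into {qid: {docid: score}}."""
    run = {}
    with open(path, "rt", encoding="utf-8") as f:
        for line in f:
            fields = line.split()
            if not fields:
                continue
            qid, _q0, docid, _rank, score, _tag = fields
            run.setdefault(qid, {})[docid] = float(score)
    return run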
Example #10
    def train(self):
        fold = self.config["fold"]
        logger.debug("results path: %s", self.get_results_path())

        self.rank.search()
        rank_results = self.rank.evaluate()
        best_search_run_path = rank_results["path"][fold]
        best_search_run = Searcher.load_trec_run(best_search_run_path)

        second_stage_results = self.rerank1.rerank_run(
            best_search_run,
            self.rerank1.get_results_path(),
            include_train=True)
        second_stage_topn = {
            qid: dict(
                sorted(docids.items(), key=lambda x: x[1],
                       reverse=True)[:self.config["topn"]])
            for split in ("train", "dev", "test")
            for qid, docids in second_stage_results[split].items()
        }

        third_stage_results = self.rerank2.rerank_run(second_stage_topn,
                                                      self.get_results_path())
        return third_stage_results
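
The dict comprehension above passes only the topn highest-scored documents per query to the third-stage reranker; the same pattern in isolation (the helper name is mine):

def top_n_per_query(run, n):
    """Keep the n highest-scored documents for each query, ordered by descending score."""
    return {
        qid: dict(sorted(docs.items(), key=lambda x: x[1], reverse=True)[:n])
        for qid, docs in run.items()
    }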
Example #11
    def evaluate(self):
        fold = self.config["fold"]
        train_output_path = self.get_results_path()
        test_output_path = train_output_path / "pred" / "test" / "best"
        logger.debug("results path: %s", train_output_path)

        if os.path.exists(test_output_path):
            test_preds = Searcher.load_trec_run(test_output_path)
        else:
            self.rank.search()
            rank_results = self.rank.evaluate()
            best_search_run_path = rank_results["path"][fold]
            best_search_run = Searcher.load_trec_run(best_search_run_path)

            docids = set(docid for querydocs in best_search_run.values()
                         for docid in querydocs)
            self.reranker.extractor.preprocess(
                qids=best_search_run.keys(),
                docids=docids,
                topics=self.benchmark.topics[self.benchmark.query_type])
            self.reranker.build_model()
            self.reranker.searcher_scores = best_search_run

            self.reranker.trainer.load_best_model(self.reranker,
                                                  train_output_path)

            test_run = {
                qid: docs
                for qid, docs in best_search_run.items()
                if qid in self.benchmark.folds[fold]["predict"]["test"]
            }
            test_dataset = PredSampler()
            test_dataset.prepare(test_run, self.benchmark.qrels,
                                 self.reranker.extractor)

            test_preds = self.reranker.trainer.predict(self.reranker,
                                                       test_dataset,
                                                       test_output_path)

        metrics = evaluator.eval_runs(test_preds, self.benchmark.qrels,
                                      evaluator.DEFAULT_METRICS,
                                      self.benchmark.relevance_level)
        logger.info("rerank: fold=%s test metrics: %s", fold, metrics)

        print("\ncomputing metrics across all folds")
        avg = {}
        found = 0
        from pathlib import Path

        for fold in self.benchmark.folds:
            # TODO fix by using multiple Tasks
            pred_path = Path(test_output_path.as_posix().replace(
                "fold-" + self.config["fold"], "fold-" + fold))
            if not os.path.exists(pred_path):
                print(
                    "\tfold=%s results are missing and will not be included" %
                    fold)
                continue

            found += 1
            preds = Searcher.load_trec_run(pred_path)
            metrics = evaluator.eval_runs(preds, self.benchmark.qrels,
                                          evaluator.DEFAULT_METRICS,
                                          self.benchmark.relevance_level)
            for metric, val in metrics.items():
                avg.setdefault(metric, []).append(val)

        avg = {k: np.mean(v) for k, v in avg.items()}
        logger.info(
            "rerank: average cross-validated metrics when choosing iteration based on '%s':",
            self.config["optimize"])
        for metric, score in sorted(avg.items()):
            logger.info("%25s: %0.4f", metric, score)
Example #12
def search_best_run(runfile_dir,
                    benchmark,
                    primary_metric,
                    metrics=None,
                    folds=None):
    """
    Select the best runfile for each fold with respect to the specified metric

    Args:
        runfile_dir: the directory containing the runfiles to select from
        benchmark: Benchmark class
        primary_metric: str, the metric used to select the best runfile, e.g. ndcg_cut_20
        metrics: str or list, additional metrics to calculate on the best runs
        folds: str, the name of a single fold to select from (defaults to all folds)

    Returns:
       a dict storing the metric scores and the path to the best runfile for each fold
    """
    metrics = [] if not metrics else (
        [metrics] if isinstance(metrics, str) else list(metrics))
    if primary_metric not in metrics:
        metrics = [primary_metric] + metrics
    _verify_metric(metrics)

    folds = {s: benchmark.folds[s]
             for s in [folds]} if folds else benchmark.folds
    runfiles = [
        os.path.join(runfile_dir, f) for f in os.listdir(runfile_dir)
        if (f != "done" and not os.path.isdir(os.path.join(runfile_dir, f)))
    ]

    if len(runfiles) == 1:
        return {
            "score": eval_runfile(runfiles[0], benchmark.qrels, metrics),
            "path": {s: runfiles[0]
                     for s in folds}
        }

    best_scores = {s: {primary_metric: 0, "path": None} for s in folds}
    for runfile in runfiles:
        runs = Searcher.load_trec_run(runfile)
        for s, v in folds.items():
            score = _eval_runs(
                runs,
                benchmark.qrels, [primary_metric],
                dev_qids=(set(v["train_qids"])
                          | set(v["predict"]["dev"])))[primary_metric]
            if score > best_scores[s][primary_metric]:
                best_scores[s] = {primary_metric: score, "path": runfile}

    test_runs, test_qrels = {}, {}
    for s, score_dict in best_scores.items():
        test_qids = folds[s]["predict"]["test"]
        test_runs.update({
            qid: v
            for qid, v in Searcher.load_trec_run(score_dict["path"]).items()
            if qid in test_qids
        })
        test_qrels.update(
            {qid: v
             for qid, v in benchmark.qrels.items() if qid in test_qids})

    scores = eval_runs(test_runs, benchmark.qrels, metrics)
    return {
        "score": scores,
        "path": {s: v["path"]
                 for s, v in best_scores.items()}
    }
Example #13
    def predict_and_eval(self, init_path=None):
        fold = self.config["fold"]
        self.reranker.build_model()
        if not init_path or init_path == "none":
            logger.info(f"Loading self best ckpt: {init_path}")
            logger.info("No init path given, using default parameters")
            self.reranker.build_model()
        else:
            logger.info(f"Load from {init_path}")
            init_path = Path(
                init_path) if not init_path.startswith("gs:") else init_path
            self.reranker.trainer.load_best_model(self.reranker,
                                                  init_path,
                                                  do_not_hash=True)

        dirname = str(init_path).split("/")[-1] if init_path else "noinitpath"
        savedir = Path(
            __file__).parent.absolute() / "downloaded_runfiles" / dirname
        dev_output_path = savedir / fold / "dev"
        test_output_path = savedir / fold / "test"
        test_output_path.parent.mkdir(exist_ok=True, parents=True)

        self.rank.search()
        threshold = self.config["threshold"]
        rank_results = self.rank.evaluate()
        best_search_run_path = rank_results["path"][fold]
        best_search_run = Searcher.load_trec_run(best_search_run_path)

        docids = set(docid for querydocs in best_search_run.values()
                     for docid in querydocs)
        self.reranker.extractor.preprocess(
            qids=best_search_run.keys(),
            docids=docids,
            topics=self.benchmark.topics[self.benchmark.query_type])

        # dev run
        dev_run = defaultdict(dict)
        for qid, docs in best_search_run.items():
            if qid in self.benchmark.folds[fold]["predict"]["dev"]:
                for idx, (docid, score) in enumerate(docs.items()):
                    if idx >= threshold:
                        assert len(
                            dev_run[qid]
                        ) == threshold, f"Expect {threshold} on each qid, got {len(dev_run[qid])} for query {qid}"
                        break
                    dev_run[qid][docid] = score
        dev_dataset = PredSampler()
        dev_dataset.prepare(dev_run,
                            self.benchmark.qrels,
                            self.reranker.extractor,
                            relevance_level=self.benchmark.relevance_level)

        # test_run
        test_run = defaultdict(dict)
        # This is possible because best_search_run is an OrderedDict
        for qid, docs in best_search_run.items():
            if qid in self.benchmark.folds[fold]["predict"]["test"]:
                for idx, (docid, score) in enumerate(docs.items()):
                    if idx >= threshold:
                        assert len(
                            test_run[qid]
                        ) == threshold, f"Expect {threshold} on each qid, got {len(test_run[qid])} for query {qid}"
                        break
                    test_run[qid][docid] = score

        unsampled_qrels = self.benchmark.unsampled_qrels if hasattr(
            self.benchmark, "unsampled_qrels") else self.benchmark.qrels
        test_dataset = PredSampler()
        test_dataset.prepare(test_run,
                             unsampled_qrels,
                             self.reranker.extractor,
                             relevance_level=self.benchmark.relevance_level)
        logger.info("test prepared")

        # prediction
        dev_preds = self.reranker.trainer.predict(self.reranker, dev_dataset,
                                                  dev_output_path)
        fold_dev_metrics = evaluator.eval_runs(dev_preds, unsampled_qrels,
                                               self.metrics,
                                               self.benchmark.relevance_level)
        logger.info("rerank: fold=%s dev metrics: %s", fold, fold_dev_metrics)

        test_preds = self.reranker.trainer.predict(self.reranker, test_dataset,
                                                   test_output_path)
        fold_test_metrics = evaluator.eval_runs(test_preds, unsampled_qrels,
                                                self.metrics,
                                                self.benchmark.relevance_level)
        logger.info("rerank: fold=%s test metrics: %s", fold,
                    fold_test_metrics)
        wandb.save(str(dev_output_path))
        wandb.save(str(test_output_path))

        # add cross validate results:
        n_folds = len(self.benchmark.folds)
        folds_fn = {
            f"s{i}": savedir / f"s{i}" / "test"
            for i in range(1, n_folds + 1)
        }
        if not all([fn.exists() for fn in folds_fn.values()]):
            return {"fold_test_metrics": fold_test_metrics, "cv_metrics": None}

        all_preds = {}
        reranker_runs = {
            fold: {
                "dev": Searcher.load_trec_run(fn.parent / "dev"),
                "test": Searcher.load_trec_run(fn)
            }
            for fold, fn in folds_fn.items()
        }

        for fold, dev_test in reranker_runs.items():
            preds = dev_test["test"]
            qids = self.benchmark.folds[fold]["predict"]["test"]
            for qid, docscores in preds.items():
                if qid not in qids:
                    continue
                all_preds.setdefault(qid, {})
                for docid, score in docscores.items():
                    all_preds[qid][docid] = score

        cv_metrics = evaluator.eval_runs(all_preds, unsampled_qrels,
                                         self.metrics,
                                         self.benchmark.relevance_level)
        for metric, score in sorted(cv_metrics.items()):
            logger.info("%25s: %0.4f", metric, score)

        searcher_runs = {}
        rank_results = self.rank.evaluate()
        for fold in self.benchmark.folds:
            searcher_runs[fold] = {
                "dev": Searcher.load_trec_run(rank_results["path"][fold])
            }
            searcher_runs[fold]["test"] = searcher_runs[fold]["dev"]

        interpolated_results = evaluator.interpolated_eval(
            searcher_runs, reranker_runs, self.benchmark,
            self.config["optimize"], self.metrics)

        return {
            "fold_test_metrics": fold_test_metrics,
            "cv_metrics": cv_metrics,
            "interpolated_results": interpolated_results,
        }
Example #14
def interpolate(_config):
    from capreolus.searcher import Searcher
    import pytrec_eval

    pipeline.initialize(_config)
    logger.info("initialized pipeline with results path: %s",
                pipeline.reranker_path)

    benchmark = pipeline.benchmark
    benchmark.build()  # TODO move this to pipeline.initialize?

    test_metrics = {}
    for foldname, fold in sorted(benchmark.folds.items()):
        if not (len(fold["predict"]) == 2 and "dev" in fold["predict"]
                and "test" in fold["predict"]):
            raise RuntimeError(
                "this evaluation command is only supported for benchmarks with 'dev' and 'test' folds"
            )

        logger.debug("evaluating fold: %s", foldname)
        predict_path = os.path.join(pipeline.reranker_path, foldname,
                                    "predict")

        dev_qids = set(fold["predict"]["dev"])
        dev_qrels = {
            qid: labels
            for qid, labels in pipeline.collection.qrels.items()
            if qid in dev_qids
        }
        dev_eval = pytrec_eval.RelevanceEvaluator(dev_qrels,
                                                  {"ndcg_cut", "P", "map"})

        test_qids = set(fold["predict"]["test"])
        test_qrels = {
            qid: labels
            for qid, labels in pipeline.collection.qrels.items()
            if qid in test_qids
        }
        searcher_dev = {
            qid: docscores
            for qid, docscores in benchmark.reranking_runs[foldname].items()
            if qid in dev_qids
        }
        searcher_test = {
            qid: docscores
            for qid, docscores in benchmark.reranking_runs[foldname].items()
            if qid in test_qids
        }

        best_metric, best_iter, dev_run = -np.inf, None, None
        target_metric = "ndcg_cut_20"
        # target_metric = "map"
        devpath = os.path.join(predict_path, "dev")
        for iterfn in os.listdir(devpath):
            dev_run = Searcher.load_trec_run(os.path.join(devpath, iterfn))
            test_run = Searcher.load_trec_run(
                os.path.join(predict_path, "test", iterfn))
            alpha, interpolated_test_run, interpolated_dev_run = Searcher.crossvalidated_interpolation(
                dev={
                    "reranker": dev_run,
                    "searcher": searcher_dev,
                    "qrels": dev_qrels
                },
                test={
                    "reranker": test_run,
                    "searcher": searcher_test,
                    "qrels": test_qrels
                },
                metric=target_metric,
            )

            this_metric = np.mean([
                q[target_metric]
                for q in dev_eval.evaluate(interpolated_dev_run).values()
            ])
            if this_metric > best_metric:
                best_metric = this_metric
                best_iter = iterfn
                use_run = interpolated_test_run
                print(foldname, iterfn, best_metric, alpha)
        logger.debug("best dev %s was on iteration #%s", target_metric,
                     best_iter)

        # test_run = Searcher.load_trec_run(os.path.join(predict_path, "test", best_iter))
        test_run = use_run
        test_eval = pytrec_eval.RelevanceEvaluator(test_qrels,
                                                   {"ndcg_cut", "P", "map"})
        for qid, metrics in test_eval.evaluate(test_run).items():
            assert qid in test_qids
            for metric, value in metrics.items():
                test_metrics.setdefault(metric, {})
                assert qid not in test_metrics[metric], "fold testqid overlap"
                test_metrics[metric][qid] = value

        # output files for Anserini interpolation script
        Searcher.write_trec_run(
            Searcher.load_trec_run(os.path.join(predict_path, "dev",
                                                best_iter)),
            f"runs.rerankerIES.{foldname}.dev")
        Searcher.write_trec_run(
            Searcher.load_trec_run(
                os.path.join(predict_path, "test", best_iter)),
            f"runs.rerankerIES.{foldname}.test")

    logger.info(f"optimized for {target_metric}")
    logger.info(f"results on {len(test_metrics[metric])} aggregated test qids")
    for metric in ["ndcg_cut_20", "map", "P_5", "P_20"]:
        interpolated_avg = np.mean([*test_metrics[metric].values()])
        logger.info(f"[interpolated] avg {metric}: {interpolated_avg:0.3f}")
Example #15
def evaluate(_config):
    from capreolus.searcher import Searcher
    import pytrec_eval

    pipeline.initialize(_config)
    logger.debug("initialized pipeline with results path: %s",
                 pipeline.reranker_path)

    benchmark = pipeline.benchmark
    benchmark.build()  # TODO move this to pipeline.initialize?

    test_metrics = {}
    searcher_test_metrics = {}
    interpolated_test_metrics = {}
    for foldname, fold in sorted(benchmark.folds.items()):
        if not (len(fold["predict"]) == 2 and "dev" in fold["predict"]
                and "test" in fold["predict"]):
            raise RuntimeError(
                "this evaluation command is only supported for benchmarks with 'dev' and 'test' folds"
            )

        logger.debug("evaluating fold: %s", foldname)
        predict_path = os.path.join(pipeline.reranker_path, foldname,
                                    "predict")

        dev_qids = set(fold["predict"]["dev"])
        dev_qrels = {
            qid: labels
            for qid, labels in pipeline.collection.qrels.items()
            if qid in dev_qids
        }
        dev_eval = pytrec_eval.RelevanceEvaluator(dev_qrels,
                                                  {"ndcg_cut", "P", "map"})

        best_metric, best_iter, dev_run = -np.inf, None, None
        target_metric = "ndcg_cut_20"
        # target_metric = "map"
        devpath = os.path.join(predict_path, "dev")
        for iterfn in os.listdir(devpath):
            run = Searcher.load_trec_run(os.path.join(devpath, iterfn))
            this_metric = np.mean(
                [q[target_metric] for q in dev_eval.evaluate(run).values()])
            if this_metric > best_metric:
                best_metric = this_metric
                best_iter = iterfn
                dev_run = run
        logger.debug("best dev %s=%0.3f was on iteration #%s", target_metric,
                     best_metric, best_iter)

        test_run = Searcher.load_trec_run(
            os.path.join(predict_path, "test", best_iter))
        test_qids = set(fold["predict"]["test"])
        test_qrels = {
            qid: labels
            for qid, labels in pipeline.collection.qrels.items()
            if qid in test_qids
        }
        test_eval = pytrec_eval.RelevanceEvaluator(test_qrels,
                                                   {"ndcg_cut", "P", "map"})
        for qid, metrics in test_eval.evaluate(test_run).items():
            assert qid in test_qids
            for metric, value in metrics.items():
                test_metrics.setdefault(metric, {})
                assert qid not in test_metrics[metric], "fold testqid overlap"
                test_metrics[metric][qid] = value

        # compute metrics for the run being reranked
        for qid, metrics in test_eval.evaluate(
                benchmark.reranking_runs[foldname]).items():
            assert qid in test_qids
            for metric, value in metrics.items():
                searcher_test_metrics.setdefault(metric, {})
                assert qid not in searcher_test_metrics[
                    metric], "fold testqid overlap"
                searcher_test_metrics[metric][qid] = value

        # choose an alpha for interpolation using the dev_qids,
        # then create a run by interpolating the searcher and reranker scores
        searcher_dev = {
            qid: docscores
            for qid, docscores in benchmark.reranking_runs[foldname].items()
            if qid in dev_qids
        }
        searcher_test = {
            qid: docscores
            for qid, docscores in benchmark.reranking_runs[foldname].items()
            if qid in test_qids
        }
        alpha, interpolated_test_run, _ = Searcher.crossvalidated_interpolation(
            dev={
                "reranker": dev_run,
                "searcher": searcher_dev,
                "qrels": dev_qrels
            },
            test={
                "reranker": test_run,
                "searcher": searcher_test,
                "qrels": test_qrels
            },
            metric=target_metric,
        )

        # output files for Anserini interpolation script
        Searcher.write_trec_run(dev_run, f"runs.reranker.{foldname}.dev")
        Searcher.write_trec_run(test_run, f"runs.reranker.{foldname}.test")
        Searcher.write_trec_run(searcher_dev, f"runs.searcher.{foldname}.dev")
        Searcher.write_trec_run(searcher_test,
                                f"runs.searcher.{foldname}.test")

        logger.debug(f"interpolation alpha={alpha}")
        for qid, metrics in test_eval.evaluate(interpolated_test_run).items():
            assert qid in test_qids
            for metric, value in metrics.items():
                interpolated_test_metrics.setdefault(metric, {})
                assert qid not in interpolated_test_metrics[
                    metric], "fold testqid overlap"
                interpolated_test_metrics[metric][qid] = value

    logger.info(f"optimized for {target_metric}")
    logger.info(f"results on {len(test_metrics[metric])} aggregated test qids")
    for metric in ["map", "P_20", "ndcg_cut_20"]:
        assert len(test_metrics[metric]) == len(searcher_test_metrics[metric])
        assert len(test_metrics[metric]) == len(
            interpolated_test_metrics[metric])

        searcher_avg = np.mean([*searcher_test_metrics[metric].values()])
        logger.info(f"[searcher] avg {metric}: {searcher_avg:0.3f}")

        sigtest_qids = sorted(test_metrics[metric].keys())
        sigtest = ttest_rel(
            [searcher_test_metrics[metric][qid] for qid in sigtest_qids],
            [test_metrics[metric][qid] for qid in sigtest_qids])

        avg = np.mean([*test_metrics[metric].values()])
        logger.info(
            f"[reranker] avg {metric}: {avg:0.3f}\tp={sigtest.pvalue:0.3f} (vs. searcher)"
        )

        interpolated_avg = np.mean(
            [*interpolated_test_metrics[metric].values()])
        logger.info(f"[interpolated] avg {metric}: {interpolated_avg:0.3f}")

    with open(os.path.join(predict_path, "results.json"), "wt") as outf:
        json.dump(
            (test_metrics, searcher_test_metrics, interpolated_test_metrics),
            outf)
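
The significance test above is a paired t-test over per-query metric values; a self-contained sketch of the call shape with placeholder numbers (illustrative only, not real results):

from scipy.stats import ttest_rel

# Per-query scores for two systems on the same four queries (placeholder values).
searcher_scores = [0.21, 0.35, 0.18, 0.40]
reranker_scores = [0.25, 0.33, 0.22, 0.47]

result = ttest_rel(searcher_scores, reranker_scores)
print(f"p={result.pvalue:0.3f}")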