Example #1
def interpolate(_config):
    import os

    import numpy as np
    import pytrec_eval
    from capreolus.searcher import Searcher

    # `pipeline` and `logger` are module-level objects in the surrounding
    # capreolus code and are not defined in this snippet

    pipeline.initialize(_config)
    logger.info("initialized pipeline with results path: %s",
                pipeline.reranker_path)

    benchmark = pipeline.benchmark
    benchmark.build()  # TODO move this to pipeline.initialize?

    test_metrics = {}
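    # for each cross-validation fold: pick the training iteration whose
    # interpolated run scores best on the dev split, then score it on test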
    for foldname, fold in sorted(benchmark.folds.items()):
        if not (len(fold["predict"]) == 2 and "dev" in fold["predict"]
                and "test" in fold["predict"]):
            raise RuntimeError(
                "this evaluation command is only supported for benchmarks with 'dev' and 'test' folds"
            )

        logger.debug("evaluating fold: %s", foldname)
        predict_path = os.path.join(pipeline.reranker_path, foldname,
                                    "predict")

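        # restrict qrels and the first-stage searcher run to this fold's
        # dev and test qids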
        dev_qids = set(fold["predict"]["dev"])
        dev_qrels = {
            qid: labels
            for qid, labels in pipeline.collection.qrels.items()
            if qid in dev_qids
        }
        dev_eval = pytrec_eval.RelevanceEvaluator(dev_qrels,
                                                  {"ndcg_cut", "P", "map"})

        test_qids = set(fold["predict"]["test"])
        test_qrels = {
            qid: labels
            for qid, labels in pipeline.collection.qrels.items()
            if qid in test_qids
        }
        searcher_dev = {
            qid: docscores
            for qid, docscores in benchmark.reranking_runs[foldname].items()
            if qid in dev_qids
        }
        searcher_test = {
            qid: docscores
            for qid, docscores in benchmark.reranking_runs[foldname].items()
            if qid in test_qids
        }

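        # sweep the saved per-iteration predictions, interpolating each dev/test
        # pair with the searcher run and keeping the iteration that scores best
        # on the interpolated dev run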
        best_metric, best_iter, use_run = -np.inf, None, None
        target_metric = "ndcg_cut_20"
        # target_metric = "map"
        devpath = os.path.join(predict_path, "dev")
        for iterfn in os.listdir(devpath):
            dev_run = Searcher.load_trec_run(os.path.join(devpath, iterfn))
            test_run = Searcher.load_trec_run(
                os.path.join(predict_path, "test", iterfn))
            alpha, interpolated_test_run, interpolated_dev_run = Searcher.crossvalidated_interpolation(
                dev={
                    "reranker": dev_run,
                    "searcher": searcher_dev,
                    "qrels": dev_qrels
                },
                test={
                    "reranker": test_run,
                    "searcher": searcher_test,
                    "qrels": test_qrels
                },
                metric=target_metric,
            )

            this_metric = np.mean([
                q[target_metric]
                for q in dev_eval.evaluate(interpolated_dev_run).values()
            ])
            if this_metric > best_metric:
                best_metric = this_metric
                best_iter = iterfn
                use_run = interpolated_test_run
                logger.debug("fold %s iteration %s: dev %s=%0.3f, alpha=%s",
                             foldname, iterfn, target_metric, best_metric, alpha)
        logger.debug("best dev %s was on iteration #%s", target_metric,
                     best_iter)

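        # evaluate the interpolated test run from the best dev iteration and
        # aggregate its per-query metrics across folds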
        # test_run = Searcher.load_trec_run(os.path.join(predict_path, "test", best_iter))
        test_run = use_run
        test_eval = pytrec_eval.RelevanceEvaluator(test_qrels,
                                                   {"ndcg_cut", "P", "map"})
        for qid, metrics in test_eval.evaluate(test_run).items():
            assert qid in test_qids
            for metric, value in metrics.items():
                test_metrics.setdefault(metric, {})
                assert qid not in test_metrics[metric], "fold testqid overlap"
                test_metrics[metric][qid] = value

        # output files for Anserini interpolation script
        Searcher.write_trec_run(
            Searcher.load_trec_run(os.path.join(predict_path, "dev",
                                                best_iter)),
            f"runs.rerankerIES.{foldname}.dev")
        Searcher.write_trec_run(
            Searcher.load_trec_run(
                os.path.join(predict_path, "test", best_iter)),
            f"runs.rerankerIES.{foldname}.test")

    logger.info(f"optimized for {target_metric}")
    logger.info(f"results on {len(test_metrics[metric])} aggregated test qids")
    for metric in ["ndcg_cut_20", "map", "P_5", "P_20"]:
        interpolated_avg = np.mean([*test_metrics[metric].values()])
        logger.info(f"[interpolated] avg {metric}: {interpolated_avg:0.3f}")
Example #2
def evaluate(_config):
    import json
    import os

    import numpy as np
    import pytrec_eval
    from scipy.stats import ttest_rel
    from capreolus.searcher import Searcher

    # `pipeline` and `logger` are module-level objects in the surrounding
    # capreolus code and are not defined in this snippet

    pipeline.initialize(_config)
    logger.debug("initialized pipeline with results path: %s",
                 pipeline.reranker_path)

    benchmark = pipeline.benchmark
    benchmark.build()  # TODO move this to pipeline.initialize?

    test_metrics = {}
    searcher_test_metrics = {}
    interpolated_test_metrics = {}
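    # per-query metrics are collected separately for the reranker run, the
    # first-stage searcher run, and the interpolated run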
    for foldname, fold in sorted(benchmark.folds.items()):
        if not (len(fold["predict"]) == 2 and "dev" in fold["predict"]
                and "test" in fold["predict"]):
            raise RuntimeError(
                "this evaluation command is only supported for benchmarks with 'dev' and 'test' folds"
            )

        logger.debug("evaluating fold: %s", foldname)
        predict_path = os.path.join(pipeline.reranker_path, foldname,
                                    "predict")

        dev_qids = set(fold["predict"]["dev"])
        dev_qrels = {
            qid: labels
            for qid, labels in pipeline.collection.qrels.items()
            if qid in dev_qids
        }
        dev_eval = pytrec_eval.RelevanceEvaluator(dev_qrels,
                                                  {"ndcg_cut", "P", "map"})

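        # pick the training iteration whose dev run scores best on the target metric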
        best_metric, best_iter, dev_run = -np.inf, None, None
        target_metric = "ndcg_cut_20"
        # target_metric = "map"
        devpath = os.path.join(predict_path, "dev")
        for iterfn in os.listdir(devpath):
            run = Searcher.load_trec_run(os.path.join(devpath, iterfn))
            this_metric = np.mean(
                [q[target_metric] for q in dev_eval.evaluate(run).values()])
            if this_metric > best_metric:
                best_metric = this_metric
                best_iter = iterfn
                dev_run = run
        logger.debug("best dev %s=%0.3f was on iteration #%s", target_metric,
                     best_metric, best_iter)

        test_run = Searcher.load_trec_run(
            os.path.join(predict_path, "test", best_iter))
        test_qids = set(fold["predict"]["test"])
        test_qrels = {
            qid: labels
            for qid, labels in pipeline.collection.qrels.items()
            if qid in test_qids
        }
        test_eval = pytrec_eval.RelevanceEvaluator(test_qrels,
                                                   {"ndcg_cut", "P", "map"})
        for qid, metrics in test_eval.evaluate(test_run).items():
            assert qid in test_qids
            for metric, value in metrics.items():
                test_metrics.setdefault(metric, {})
                assert qid not in test_metrics[metric], "fold testqid overlap"
                test_metrics[metric][qid] = value

        # compute metrics for the run being reranked
        for qid, metrics in test_eval.evaluate(
                benchmark.reranking_runs[foldname]).items():
            assert qid in test_qids
            for metric, value in metrics.items():
                searcher_test_metrics.setdefault(metric, {})
                assert qid not in searcher_test_metrics[
                    metric], "fold testqid overlap"
                searcher_test_metrics[metric][qid] = value

        # choose an alpha for interpolation using the dev_qids,
        # then create a run by interpolating the searcher and reranker scores
        searcher_dev = {
            qid: docscores
            for qid, docscores in benchmark.reranking_runs[foldname].items()
            if qid in dev_qids
        }
        searcher_test = {
            qid: docscores
            for qid, docscores in benchmark.reranking_runs[foldname].items()
            if qid in test_qids
        }
        alpha, interpolated_test_run, _ = Searcher.crossvalidated_interpolation(
            dev={
                "reranker": dev_run,
                "searcher": searcher_dev,
                "qrels": dev_qrels
            },
            test={
                "reranker": test_run,
                "searcher": searcher_test,
                "qrels": test_qrels
            },
            metric=target_metric,
        )

        # output files for Anserini interpolation script
        Searcher.write_trec_run(dev_run, f"runs.reranker.{foldname}.dev")
        Searcher.write_trec_run(test_run, f"runs.reranker.{foldname}.test")
        Searcher.write_trec_run(searcher_dev, f"runs.searcher.{foldname}.dev")
        Searcher.write_trec_run(searcher_test,
                                f"runs.searcher.{foldname}.test")

        logger.debug(f"interpolation alpha={alpha}")
        for qid, metrics in test_eval.evaluate(interpolated_test_run).items():
            assert qid in test_qids
            for metric, value in metrics.items():
                interpolated_test_metrics.setdefault(metric, {})
                assert qid not in interpolated_test_metrics[
                    metric], "fold testqid overlap"
                interpolated_test_metrics[metric][qid] = value

    logger.info(f"optimized for {target_metric}")
    logger.info(f"results on {len(test_metrics[metric])} aggregated test qids")
    for metric in ["map", "P_20", "ndcg_cut_20"]:
        assert len(test_metrics[metric]) == len(searcher_test_metrics[metric])
        assert len(test_metrics[metric]) == len(
            interpolated_test_metrics[metric])

        searcher_avg = np.mean([*searcher_test_metrics[metric].values()])
        logger.info(f"[searcher] avg {metric}: {searcher_avg:0.3f}")

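        # paired t-test over per-query scores: reranker vs. first-stage searcher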
        sigtest_qids = sorted(test_metrics[metric].keys())
        sigtest = ttest_rel(
            [searcher_test_metrics[metric][qid] for qid in sigtest_qids],
            [test_metrics[metric][qid] for qid in sigtest_qids])

        avg = np.mean([*test_metrics[metric].values()])
        logger.info(
            f"[reranker] avg {metric}: {avg:0.3f}\tp={sigtest.pvalue:0.3f} (vs. searcher)"
        )

        interpolated_avg = np.mean(
            [*interpolated_test_metrics[metric].values()])
        logger.info(f"[interpolated] avg {metric}: {interpolated_avg:0.3f}")

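    # dump all per-query metrics; note that predict_path still points at the
    # last fold's predict directory here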
    with open(os.path.join(predict_path, "results.json"), "wt") as outf:
        json.dump(
            (test_metrics, searcher_test_metrics, interpolated_test_metrics),
            outf)
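
The runs.* files written above are inputs to an Anserini interpolation script, and Searcher.write_trec_run presumably emits the standard six-column TREC run format (qid Q0 docid rank score tag). A minimal sketch of such a writer, under that assumption and with a hypothetical name:

def write_trec_run_sketch(run, path, tag="capreolus"):
    # run is a {qid: {docid: score}} dict; write one
    # "qid Q0 docid rank score tag" line per document, ranked by descending score
    with open(path, "wt") as outf:
        for qid in sorted(run):
            ranked = sorted(run[qid].items(), key=lambda kv: kv[1], reverse=True)
            for rank, (docid, score) in enumerate(ranked, start=1):
                print(f"{qid} Q0 {docid} {rank} {score} {tag}", file=outf)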