def interpolate(_config):
    from capreolus.searcher import Searcher
    import pytrec_eval

    pipeline.initialize(_config)
    logger.info("initialized pipeline with results path: %s", pipeline.reranker_path)

    benchmark = pipeline.benchmark
    benchmark.build()  # TODO move this to pipeline.initialize?

    test_metrics = {}
    for foldname, fold in sorted(benchmark.folds.items()):
        if not (len(fold["predict"]) == 2 and "dev" in fold["predict"] and "test" in fold["predict"]):
            raise RuntimeError("this evaluation command is only supported for benchmarks with 'dev' and 'test' folds")

        logger.debug("evaluating fold: %s", foldname)
        predict_path = os.path.join(pipeline.reranker_path, foldname, "predict")

        dev_qids = set(fold["predict"]["dev"])
        dev_qrels = {qid: labels for qid, labels in pipeline.collection.qrels.items() if qid in dev_qids}
        dev_eval = pytrec_eval.RelevanceEvaluator(dev_qrels, {"ndcg_cut", "P", "map"})

        test_qids = set(fold["predict"]["test"])
        test_qrels = {qid: labels for qid, labels in pipeline.collection.qrels.items() if qid in test_qids}

        searcher_dev = {qid: docscores for qid, docscores in benchmark.reranking_runs[foldname].items() if qid in dev_qids}
        searcher_test = {qid: docscores for qid, docscores in benchmark.reranking_runs[foldname].items() if qid in test_qids}

        best_metric, best_iter, dev_run = -np.inf, None, None
        target_metric = "ndcg_cut_20"
        # target_metric = "map"

        devpath = os.path.join(predict_path, "dev")
        for iterfn in os.listdir(devpath):
            dev_run = Searcher.load_trec_run(os.path.join(devpath, iterfn))
            test_run = Searcher.load_trec_run(os.path.join(predict_path, "test", iterfn))

            alpha, interpolated_test_run, interpolated_dev_run = Searcher.crossvalidated_interpolation(
                dev={"reranker": dev_run, "searcher": searcher_dev, "qrels": dev_qrels},
                test={"reranker": test_run, "searcher": searcher_test, "qrels": test_qrels},
                metric=target_metric,
            )

            this_metric = np.mean([q[target_metric] for q in dev_eval.evaluate(interpolated_dev_run).values()])
            if this_metric > best_metric:
                best_metric = this_metric
                best_iter = iterfn
                use_run = interpolated_test_run
                print(foldname, iterfn, best_metric, alpha)

        logger.debug("best dev %s was on iteration #%s", target_metric, best_iter)
        # test_run = Searcher.load_trec_run(os.path.join(predict_path, "test", best_iter))
        test_run = use_run
        test_eval = pytrec_eval.RelevanceEvaluator(test_qrels, {"ndcg_cut", "P", "map"})
        for qid, metrics in test_eval.evaluate(test_run).items():
            assert qid in test_qids
            for metric, value in metrics.items():
                test_metrics.setdefault(metric, {})
                assert qid not in test_metrics[metric], "fold testqid overlap"
                test_metrics[metric][qid] = value

        # output files for Anserini interpolation script
        Searcher.write_trec_run(
            Searcher.load_trec_run(os.path.join(predict_path, "dev", best_iter)),
            f"runs.rerankerIES.{foldname}.dev",
        )
        Searcher.write_trec_run(
            Searcher.load_trec_run(os.path.join(predict_path, "test", best_iter)),
            f"runs.rerankerIES.{foldname}.test",
        )

    logger.info(f"optimized for {target_metric}")
    logger.info(f"results on {len(test_metrics[metric])} aggregated test qids")
    for metric in ["ndcg_cut_20", "map", "P_5", "P_20"]:
        interpolated_avg = np.mean([*test_metrics[metric].values()])
        logger.info(f"[interpolated] avg {metric}: {interpolated_avg:0.3f}")
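
# Illustrative sketch, not part of Capreolus: Searcher.crossvalidated_interpolation (used above
# and in evaluate below) is assumed to pick a weight alpha on the dev queries and then linearly
# combine searcher and reranker scores. The helper name _interpolate_runs, its signature, and the
# unnormalized weighting are assumptions for illustration only; real implementations typically
# normalize scores (e.g., min-max per query) before combining them.
def _interpolate_runs(searcher_run, reranker_run, alpha):
    """Linearly combine two runs of the form {qid: {docid: score}} with weight alpha."""
    interpolated = {}
    for qid, searcher_scores in searcher_run.items():
        reranker_scores = reranker_run.get(qid, {})
        interpolated[qid] = {
            docid: alpha * score + (1 - alpha) * reranker_scores.get(docid, 0.0)
            for docid, score in searcher_scores.items()
        }
    return interpolated
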
def evaluate(_config):
    from capreolus.searcher import Searcher
    import pytrec_eval

    pipeline.initialize(_config)
    logger.debug("initialized pipeline with results path: %s", pipeline.reranker_path)

    benchmark = pipeline.benchmark
    benchmark.build()  # TODO move this to pipeline.initialize?

    test_metrics = {}
    searcher_test_metrics = {}
    interpolated_test_metrics = {}
    for foldname, fold in sorted(benchmark.folds.items()):
        if not (len(fold["predict"]) == 2 and "dev" in fold["predict"] and "test" in fold["predict"]):
            raise RuntimeError("this evaluation command is only supported for benchmarks with 'dev' and 'test' folds")

        logger.debug("evaluating fold: %s", foldname)
        predict_path = os.path.join(pipeline.reranker_path, foldname, "predict")

        dev_qids = set(fold["predict"]["dev"])
        dev_qrels = {qid: labels for qid, labels in pipeline.collection.qrels.items() if qid in dev_qids}
        dev_eval = pytrec_eval.RelevanceEvaluator(dev_qrels, {"ndcg_cut", "P", "map"})

        best_metric, best_iter, dev_run = -np.inf, None, None
        target_metric = "ndcg_cut_20"
        # target_metric = "map"

        devpath = os.path.join(predict_path, "dev")
        for iterfn in os.listdir(devpath):
            run = Searcher.load_trec_run(os.path.join(devpath, iterfn))
            this_metric = np.mean([q[target_metric] for q in dev_eval.evaluate(run).values()])
            if this_metric > best_metric:
                best_metric = this_metric
                best_iter = iterfn
                dev_run = run
        logger.debug("best dev %s=%0.3f was on iteration #%s", target_metric, best_metric, best_iter)

        test_run = Searcher.load_trec_run(os.path.join(predict_path, "test", best_iter))
        test_qids = set(fold["predict"]["test"])
        test_qrels = {qid: labels for qid, labels in pipeline.collection.qrels.items() if qid in test_qids}
        test_eval = pytrec_eval.RelevanceEvaluator(test_qrels, {"ndcg_cut", "P", "map"})
        for qid, metrics in test_eval.evaluate(test_run).items():
            assert qid in test_qids
            for metric, value in metrics.items():
                test_metrics.setdefault(metric, {})
                assert qid not in test_metrics[metric], "fold testqid overlap"
                test_metrics[metric][qid] = value

        # compute metrics for the run being reranked
        for qid, metrics in test_eval.evaluate(benchmark.reranking_runs[foldname]).items():
            assert qid in test_qids
            for metric, value in metrics.items():
                searcher_test_metrics.setdefault(metric, {})
                assert qid not in searcher_test_metrics[metric], "fold testqid overlap"
                searcher_test_metrics[metric][qid] = value

        # choose an alpha for interpolation using the dev_qids,
        # then create a run by interpolating the searcher and reranker scores
        searcher_dev = {qid: docscores for qid, docscores in benchmark.reranking_runs[foldname].items() if qid in dev_qids}
        searcher_test = {qid: docscores for qid, docscores in benchmark.reranking_runs[foldname].items() if qid in test_qids}
        alpha, interpolated_test_run, _ = Searcher.crossvalidated_interpolation(
            dev={"reranker": dev_run, "searcher": searcher_dev, "qrels": dev_qrels},
            test={"reranker": test_run, "searcher": searcher_test, "qrels": test_qrels},
            metric=target_metric,
        )

        # output files for Anserini interpolation script
        Searcher.write_trec_run(dev_run, f"runs.reranker.{foldname}.dev")
        Searcher.write_trec_run(test_run, f"runs.reranker.{foldname}.test")
        Searcher.write_trec_run(searcher_dev, f"runs.searcher.{foldname}.dev")
        Searcher.write_trec_run(searcher_test, f"runs.searcher.{foldname}.test")

        logger.debug(f"interpolation alpha={alpha}")
        for qid, metrics in test_eval.evaluate(interpolated_test_run).items():
            assert qid in test_qids
            for metric, value in metrics.items():
                interpolated_test_metrics.setdefault(metric, {})
                assert qid not in interpolated_test_metrics[metric], "fold testqid overlap"
                interpolated_test_metrics[metric][qid] = value

    logger.info(f"optimized for {target_metric}")
    logger.info(f"results on {len(test_metrics[metric])} aggregated test qids")
    for metric in ["map", "P_20", "ndcg_cut_20"]:
        assert len(test_metrics[metric]) == len(searcher_test_metrics[metric])
        assert len(test_metrics[metric]) == len(interpolated_test_metrics[metric])

        searcher_avg = np.mean([*searcher_test_metrics[metric].values()])
        logger.info(f"[searcher] avg {metric}: {searcher_avg:0.3f}")

        sigtest_qids = sorted(test_metrics[metric].keys())
        sigtest = ttest_rel(
            [searcher_test_metrics[metric][qid] for qid in sigtest_qids],
            [test_metrics[metric][qid] for qid in sigtest_qids],
        )

        avg = np.mean([*test_metrics[metric].values()])
        logger.info(f"[reranker] avg {metric}: {avg:0.3f}\tp={sigtest.pvalue:0.3f} (vs. searcher)")

        interpolated_avg = np.mean([*interpolated_test_metrics[metric].values()])
        logger.info(f"[interpolated] avg {metric}: {interpolated_avg:0.3f}")

    with open(os.path.join(predict_path, "results.json"), "wt") as outf:
        json.dump((test_metrics, searcher_test_metrics, interpolated_test_metrics), outf)
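
# Usage sketch, not part of the original module: both commands above rely on pytrec_eval's
# nested-dict formats. The toy qrels/run values below are invented and only show the shapes
# RelevanceEvaluator consumes and the per-query metric dict it returns.
def _pytrec_eval_shapes_example():
    import pytrec_eval

    qrels = {"q1": {"d1": 1, "d2": 0}}    # {qid: {docid: graded relevance label}}
    run = {"q1": {"d1": 2.5, "d2": 1.3}}  # {qid: {docid: retrieval score}}

    # "ndcg_cut" and "P" expand to cutoff variants such as ndcg_cut_20 and P_5
    evaluator = pytrec_eval.RelevanceEvaluator(qrels, {"map", "ndcg_cut", "P"})
    per_query = evaluator.evaluate(run)   # {qid: {"map": ..., "ndcg_cut_20": ..., ...}}
    return per_query["q1"]["map"]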