def find_crossvalidated_results(self):
    searcher_runs = {}
    rank_results = self.rank.evaluate()
    for fold in self.benchmark.folds:
        searcher_runs[fold] = {"dev": Searcher.load_trec_run(rank_results["path"][fold])}
        searcher_runs[fold]["test"] = searcher_runs[fold]["dev"]

    reranker_runs = {}
    train_output_path = self.get_results_path()
    test_output_path = train_output_path / "pred" / "test" / "best"
    dev_output_path = train_output_path / "pred" / "dev" / "best"
    for fold in self.benchmark.folds:
        # TODO fix by using multiple Tasks
        test_path = Path(test_output_path.as_posix().replace("fold-" + self.config["fold"], "fold-" + fold))
        if os.path.exists(test_path):
            reranker_runs.setdefault(fold, {})["test"] = Searcher.load_trec_run(test_path)

            dev_path = Path(dev_output_path.as_posix().replace("fold-" + self.config["fold"], "fold-" + fold))
            reranker_runs.setdefault(fold, {})["dev"] = Searcher.load_trec_run(dev_path)

    return searcher_runs, reranker_runs
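# Illustration of the path rewriting used above (the "TODO fix by using multiple Tasks" hack):
# results for the other folds are located by substituting the fold name inside the current
# fold's output path. A minimal, self-contained sketch with a hypothetical path layout:
from pathlib import Path

current_fold, other_fold = "s1", "s2"
test_output_path = Path("/results/task-rerank_fold-s1/pred/test/best")  # hypothetical layout
other_path = Path(test_output_path.as_posix().replace("fold-" + current_fold, "fold-" + other_fold))
assert other_path == Path("/results/task-rerank_fold-s2/pred/test/best")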
def search_best_run(runfile_dirs, benchmark, primary_metric, metrics=None, folds=None):
    """
    Select the best runfile with respect to the specified primary metric.

    Args:
        runfile_dirs: one or more directory paths containing the runfiles to select from
        benchmark: Benchmark class
        primary_metric: str, metric used to select the best runfile, e.g. ndcg_cut_20
        metrics: str or list, additional metrics to calculate on the best runs
        folds: str, the name of the fold to select for

    Returns:
        a dict storing the specified metric scores and the path to the corresponding runfile
    """
    if not isinstance(runfile_dirs, (list, tuple)):
        runfile_dirs = [runfile_dirs]

    metrics = [] if not metrics else ([metrics] if isinstance(metrics, str) else list(metrics))
    if primary_metric not in metrics:
        metrics = [primary_metric] + metrics

    folds = {s: benchmark.folds[s] for s in [folds]} if folds else benchmark.folds
    runfiles = [
        os.path.join(runfile_dir, f)
        for runfile_dir in runfile_dirs
        for f in os.listdir(runfile_dir)
        if (f != "done" and not os.path.isdir(os.path.join(runfile_dir, f)))
    ]

    best_scores = {s: {primary_metric: 0, "path": None} for s in folds}
    for runfile in runfiles:
        runs = Searcher.load_trec_run(runfile)
        for s, v in folds.items():
            score = _eval_runs(
                runs,
                benchmark.qrels,
                [primary_metric],
                (set(v["train_qids"]) | set(v["predict"]["dev"])),
                benchmark.relevance_level,
            )[primary_metric]
            if score > best_scores[s][primary_metric]:
                best_scores[s] = {primary_metric: score, "path": runfile}

    test_runs = {}
    for s, score_dict in best_scores.items():
        test_qids = folds[s]["predict"]["test"]
        # any empty (no results) queries need to be added so they contribute zeros to the average
        test_runs.update({qid: {} for qid in test_qids})
        test_runs.update({qid: v for qid, v in Searcher.load_trec_run(score_dict["path"]).items() if qid in test_qids})

    scores = eval_runs(test_runs, benchmark.qrels, metrics, benchmark.relevance_level)
    return {"score": scores, "path": {s: v["path"] for s, v in best_scores.items()}}
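# The "empty queries contribute zeros" trick used above, shown in isolation: test queries
# with no results are inserted as empty dicts so that per-query averaging does not silently
# skip them. A minimal sketch with hypothetical qids and docids:
test_qids = ["q1", "q2", "q3"]
loaded_run = {"q1": {"d1": 2.0}, "q2": {"d7": 1.5}}  # q3 returned no results

test_runs = {qid: {} for qid in test_qids}  # placeholders for every test qid
test_runs.update({qid: docs for qid, docs in loaded_run.items() if qid in test_qids})
assert test_runs["q3"] == {}  # q3 is still present, so it counts as zero in the averaged metrics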
def evaluate(config, modules):
    metric = "map"
    fold = config["fold"]
    train_output_path = _pipeline_path(config, modules)
    test_output_path = train_output_path / "pred" / "test" / "best"
    searcher = modules["searcher"]
    benchmark = modules["benchmark"]
    reranker = modules["reranker"]

    if os.path.exists(test_output_path):
        test_preds = Searcher.load_trec_run(test_output_path)
    else:
        topics_fn = benchmark.topic_file
        searcher_cache_dir = os.path.join(searcher.get_cache_path(), benchmark.name)
        searcher_run_dir = searcher.query_from_file(topics_fn, searcher_cache_dir)

        best_search_run_path = evaluator.search_best_run(searcher_run_dir, benchmark, metric)["path"][fold]
        best_search_run = searcher.load_trec_run(best_search_run_path)

        docids = set(docid for querydocs in best_search_run.values() for docid in querydocs)
        reranker["extractor"].create(qids=best_search_run.keys(), docids=docids, topics=benchmark.topics[benchmark.query_type])
        reranker.build()

        reranker["trainer"].load_best_model(reranker, train_output_path)

        test_run = {qid: docs for qid, docs in best_search_run.items() if qid in benchmark.folds[fold]["predict"]["test"]}
        test_dataset = PredDataset(qid_docid_to_rank=test_run, extractor=reranker["extractor"], mode="test")

        test_preds = reranker["trainer"].predict(reranker, test_dataset, test_output_path)

    metrics = evaluator.eval_runs(test_preds, benchmark.qrels, ["ndcg_cut_20", "ndcg_cut_10", "map", "P_20", "P_10"])
    print("test metrics for fold=%s:" % fold, metrics)

    print("\ncomputing metrics across all folds")
    avg = {}
    found = 0
    for fold in benchmark.folds:
        pred_path = _pipeline_path(config, modules, fold=fold) / "pred" / "test" / "best"
        if not os.path.exists(pred_path):
            print("\tfold=%s results are missing and will not be included" % fold)
            continue

        found += 1
        preds = Searcher.load_trec_run(pred_path)
        metrics = evaluator.eval_runs(preds, benchmark.qrels, ["ndcg_cut_20", "ndcg_cut_10", "map", "P_20", "P_10"])
        for metric, val in metrics.items():
            avg.setdefault(metric, []).append(val)

    avg = {k: np.mean(v) for k, v in avg.items()}
    print(f"average metrics across {found}/{len(benchmark.folds)} folds:", avg)
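# The per-fold averaging above, shown in isolation: the metric dict from each completed fold
# is accumulated per metric name and then averaged with np.mean. A minimal sketch with
# made-up numbers for two folds:
import numpy as np

fold_metrics = [{"map": 0.30, "P_20": 0.40}, {"map": 0.20, "P_20": 0.50}]
avg = {}
for metrics in fold_metrics:
    for metric, val in metrics.items():
        avg.setdefault(metric, []).append(val)
avg = {k: np.mean(v) for k, v in avg.items()}
assert avg == {"map": 0.25, "P_20": 0.45}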
def predict(self):
    fold = self.config["fold"]
    self.rank.search()
    threshold = self.config["threshold"]
    rank_results = self.rank.evaluate()
    best_search_run_path = rank_results["path"][fold]
    best_search_run = Searcher.load_trec_run(best_search_run_path)

    docids = set(docid for querydocs in best_search_run.values() for docid in querydocs)
    self.reranker.extractor.preprocess(
        qids=best_search_run.keys(), docids=docids, topics=self.benchmark.topics[self.benchmark.query_type]
    )
    train_output_path = self.get_results_path()
    self.reranker.build_model()
    self.reranker.trainer.load_best_model(self.reranker, train_output_path)

    test_run = defaultdict(dict)
    # This is possible because best_search_run is an OrderedDict
    for qid, docs in best_search_run.items():
        if qid in self.benchmark.folds[fold]["predict"]["test"]:
            for idx, (docid, score) in enumerate(docs.items()):
                if idx >= threshold:
                    break
                test_run[qid][docid] = score

    test_dataset = PredSampler()
    test_dataset.prepare(
        test_run, self.benchmark.qrels, self.reranker.extractor, relevance_level=self.benchmark.relevance_level
    )

    test_output_path = train_output_path / "pred" / "test" / "best"
    test_preds = self.reranker.trainer.predict(self.reranker, test_dataset, test_output_path)
    preds = {"test": test_preds}

    return preds
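# The threshold truncation above relies on best_search_run preserving rank order (docids are
# keyed best-first), so keeping the first `threshold` items per query keeps the top-ranked
# documents. A standalone sketch of that step with toy scores:
from collections import defaultdict

threshold = 2
best_search_run = {"q1": {"d1": 9.0, "d2": 8.5, "d3": 7.0}}  # already in rank order

test_run = defaultdict(dict)
for qid, docs in best_search_run.items():
    for idx, (docid, score) in enumerate(docs.items()):
        if idx >= threshold:
            break
        test_run[qid][docid] = score
assert dict(test_run) == {"q1": {"d1": 9.0, "d2": 8.5}}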
def test_write_run(tmpdir):
    """ write a TREC searcher file """
    fn = tmpdir / "searcher"
    run_dict = {"q1": {"d1": 1.1, "d2": 1.0}, "q2": {"d5": 9.0}}
    Searcher.write_trec_run(run_dict, fn)

    run = Searcher.load_trec_run(fn)
    assert sorted(run.items()) == sorted(run_dict.items())
def train(self):
    fold = self.config["fold"]
    self.rank.search()
    rank_results = self.rank.evaluate()
    best_search_run_path = rank_results["path"][fold]
    best_search_run = Searcher.load_trec_run(best_search_run_path)

    return self.rerank_run(best_search_run, self.get_results_path())
def train(self, init_path=""): fold = self.config["fold"] self.rank.search() rank_results = self.rank.evaluate() best_search_run_path = rank_results["path"][fold] best_search_run = Searcher.load_trec_run(best_search_run_path) wandb.save(str(best_search_run_path)) return self.rerank_run(best_search_run, self.get_results_path(), init_path=init_path)
def eval_runfile(runfile, qrels, metrics, relevance_level):
    """
    Evaluate a single runfile produced by a ranker or reranker.

    Args:
        runfile: str, path to the runfile
        qrels: dict, containing the judgements provided by the benchmark
        metrics: str or list, metrics expected to be calculated, e.g. ndcg_cut_20
        relevance_level: int, the minimum judgment level considered relevant

    Returns:
        a dict with format {metric: score}, containing the evaluation scores of the specified metrics
    """
    metrics = [metrics] if isinstance(metrics, str) else list(metrics)
    runs = Searcher.load_trec_run(runfile)

    return _eval_runs(runs, qrels, metrics, list(qrels.keys()), relevance_level)
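# Hedged sketch of what an evaluation like this boils down to: pytrec_eval (used elsewhere in
# this codebase) scores a run dict against qrels per query, and the per-query values are then
# averaged. _eval_runs is not shown here, so this is the general pattern, not its exact code;
# the relevance_level argument is presumably forwarded to the evaluator and is omitted here.
import numpy as np
import pytrec_eval

qrels = {"q1": {"d1": 1, "d2": 0}, "q2": {"d5": 1}}
runs = {"q1": {"d1": 1.1, "d2": 1.0}, "q2": {"d5": 9.0}}

evaluator_obj = pytrec_eval.RelevanceEvaluator(qrels, {"map", "ndcg_cut"})
per_query = evaluator_obj.evaluate(runs)  # {qid: {metric: score}}
scores = {"map": np.mean([q["map"] for q in per_query.values()])}
assert scores["map"] == 1.0  # both relevant docs are ranked first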
def test_load_run(tmpdir):
    """ load a TREC searcher file """
    run_txt = """
q1 Q0 d1 1 1.1 tag
q1 Q0 d2 2 1.0 tag
q2 Q0 d5 1 9.0 tag
"""
    run_dict = {"q1": {"d1": 1.1, "d2": 1.0}, "q2": {"d5": 9.0}}

    fn = tmpdir / "searcher"
    with open(fn, "wt", encoding="utf-8") as outf:
        outf.write(run_txt)

    run = Searcher.load_trec_run(fn)
    assert sorted(run.items()) == sorted(run_dict.items())
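# The TREC run format written and read by the tests above has six whitespace-separated
# columns per line: qid, the literal "Q0", docid, rank, score, and a run tag. A minimal
# standalone parser (an illustration, not the library's load_trec_run implementation):
def parse_trec_run_line(line):
    qid, _q0, docid, rank, score, tag = line.split()
    return qid, docid, int(rank), float(score), tag

assert parse_trec_run_line("q1 Q0 d1 1 1.1 tag") == ("q1", "d1", 1, 1.1, "tag")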
def train(self):
    fold = self.config["fold"]
    logger.debug("results path: %s", self.get_results_path())

    self.rank.search()
    rank_results = self.rank.evaluate()
    best_search_run_path = rank_results["path"][fold]
    best_search_run = Searcher.load_trec_run(best_search_run_path)

    second_stage_results = self.rerank1.rerank_run(best_search_run, self.rerank1.get_results_path(), include_train=True)
    second_stage_topn = {
        qid: dict(sorted(docids.items(), key=lambda x: x[1], reverse=True)[: self.config["topn"]])
        for split in ("train", "dev", "test")
        for qid, docids in second_stage_results[split].items()
    }

    third_stage_results = self.rerank2.rerank_run(second_stage_topn, self.get_results_path())
    return third_stage_results
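# The second_stage_topn step above keeps only the topn highest-scoring documents per query
# before the third-stage reranker sees them. The same idiom in isolation:
topn = 2
run = {"q1": {"d1": 0.2, "d2": 0.9, "d3": 0.5}}
truncated = {qid: dict(sorted(docs.items(), key=lambda x: x[1], reverse=True)[:topn]) for qid, docs in run.items()}
assert truncated == {"q1": {"d2": 0.9, "d3": 0.5}}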
def evaluate(self):
    fold = self.config["fold"]
    train_output_path = self.get_results_path()
    test_output_path = train_output_path / "pred" / "test" / "best"
    logger.debug("results path: %s", train_output_path)

    if os.path.exists(test_output_path):
        test_preds = Searcher.load_trec_run(test_output_path)
    else:
        self.rank.search()
        rank_results = self.rank.evaluate()
        best_search_run_path = rank_results["path"][fold]
        best_search_run = Searcher.load_trec_run(best_search_run_path)

        docids = set(docid for querydocs in best_search_run.values() for docid in querydocs)
        self.reranker.extractor.preprocess(
            qids=best_search_run.keys(), docids=docids, topics=self.benchmark.topics[self.benchmark.query_type]
        )
        self.reranker.build_model()
        self.reranker.searcher_scores = best_search_run
        self.reranker.trainer.load_best_model(self.reranker, train_output_path)

        test_run = {qid: docs for qid, docs in best_search_run.items() if qid in self.benchmark.folds[fold]["predict"]["test"]}
        test_dataset = PredSampler()
        test_dataset.prepare(test_run, self.benchmark.qrels, self.reranker.extractor)

        test_preds = self.reranker.trainer.predict(self.reranker, test_dataset, test_output_path)

    metrics = evaluator.eval_runs(test_preds, self.benchmark.qrels, evaluator.DEFAULT_METRICS, self.benchmark.relevance_level)
    logger.info("rerank: fold=%s test metrics: %s", fold, metrics)

    print("\ncomputing metrics across all folds")
    avg = {}
    found = 0
    for fold in self.benchmark.folds:
        # TODO fix by using multiple Tasks
        from pathlib import Path

        pred_path = Path(test_output_path.as_posix().replace("fold-" + self.config["fold"], "fold-" + fold))
        if not os.path.exists(pred_path):
            print("\tfold=%s results are missing and will not be included" % fold)
            continue

        found += 1
        preds = Searcher.load_trec_run(pred_path)
        metrics = evaluator.eval_runs(preds, self.benchmark.qrels, evaluator.DEFAULT_METRICS, self.benchmark.relevance_level)
        for metric, val in metrics.items():
            avg.setdefault(metric, []).append(val)

    avg = {k: np.mean(v) for k, v in avg.items()}
    logger.info("rerank: average cross-validated metrics when choosing iteration based on '%s':", self.config["optimize"])
    for metric, score in sorted(avg.items()):
        logger.info("%25s: %0.4f", metric, score)
def search_best_run(runfile_dir, benchmark, primary_metric, metrics=None, folds=None):
    """
    Select the best runfile with respect to the specified primary metric.

    Args:
        runfile_dir: the directory path containing the runfiles to select from
        benchmark: Benchmark class
        primary_metric: str, metric used to select the best runfile, e.g. ndcg_cut_20
        metrics: str or list, additional metrics to calculate on the best runs
        folds: str, the name of the fold to select for

    Returns:
        a dict storing the specified metric scores and the path to the corresponding runfile
    """
    metrics = [] if not metrics else ([metrics] if isinstance(metrics, str) else list(metrics))
    if primary_metric not in metrics:
        metrics = [primary_metric] + metrics
    _verify_metric(metrics)

    folds = {s: benchmark.folds[s] for s in [folds]} if folds else benchmark.folds
    runfiles = [
        os.path.join(runfile_dir, f)
        for f in os.listdir(runfile_dir)
        if (f != "done" and not os.path.isdir(os.path.join(runfile_dir, f)))
    ]

    if len(runfiles) == 1:
        return {"score": eval_runfile(runfiles[0], benchmark.qrels, metrics), "path": {s: runfiles[0] for s in folds}}

    best_scores = {s: {primary_metric: 0, "path": None} for s in folds}
    for runfile in runfiles:
        runs = Searcher.load_trec_run(runfile)
        for s, v in folds.items():
            score = _eval_runs(
                runs, benchmark.qrels, [primary_metric], dev_qids=(set(v["train_qids"]) | set(v["predict"]["dev"]))
            )[primary_metric]
            if score > best_scores[s][primary_metric]:
                best_scores[s] = {primary_metric: score, "path": runfile}

    test_runs, test_qrels = {}, {}
    for s, score_dict in best_scores.items():
        test_qids = folds[s]["predict"]["test"]
        test_runs.update({qid: v for qid, v in Searcher.load_trec_run(score_dict["path"]).items() if qid in test_qids})
        test_qrels.update({qid: v for qid, v in benchmark.qrels.items() if qid in test_qids})

    scores = eval_runs(test_runs, benchmark.qrels, metrics)
    return {"score": scores, "path": {s: v["path"] for s, v in best_scores.items()}}
def predict_and_eval(self, init_path=None):
    fold = self.config["fold"]
    self.reranker.build_model()
    if not init_path or init_path == "none":
        logger.info(f"Loading self best ckpt: {init_path}")
        logger.info("No init path given, using default parameters")
        self.reranker.build_model()
    else:
        logger.info(f"Load from {init_path}")
        init_path = Path(init_path) if not init_path.startswith("gs:") else init_path
        self.reranker.trainer.load_best_model(self.reranker, init_path, do_not_hash=True)

    dirname = str(init_path).split("/")[-1] if init_path else "noinitpath"
    savedir = Path(__file__).parent.absolute() / "downloaded_runfiles" / dirname
    dev_output_path = savedir / fold / "dev"
    test_output_path = savedir / fold / "test"
    test_output_path.parent.mkdir(exist_ok=True, parents=True)

    self.rank.search()
    threshold = self.config["threshold"]
    rank_results = self.rank.evaluate()
    best_search_run_path = rank_results["path"][fold]
    best_search_run = Searcher.load_trec_run(best_search_run_path)

    docids = set(docid for querydocs in best_search_run.values() for docid in querydocs)
    self.reranker.extractor.preprocess(
        qids=best_search_run.keys(), docids=docids, topics=self.benchmark.topics[self.benchmark.query_type]
    )

    # dev run
    dev_run = defaultdict(dict)
    for qid, docs in best_search_run.items():
        if qid in self.benchmark.folds[fold]["predict"]["dev"]:
            for idx, (docid, score) in enumerate(docs.items()):
                if idx >= threshold:
                    assert len(dev_run[qid]) == threshold, f"Expect {threshold} on each qid, got {len(dev_run[qid])} for query {qid}"
                    break
                dev_run[qid][docid] = score

    dev_dataset = PredSampler()
    dev_dataset.prepare(dev_run, self.benchmark.qrels, self.reranker.extractor, relevance_level=self.benchmark.relevance_level)

    # test run
    test_run = defaultdict(dict)
    # This is possible because best_search_run is an OrderedDict
    for qid, docs in best_search_run.items():
        if qid in self.benchmark.folds[fold]["predict"]["test"]:
            for idx, (docid, score) in enumerate(docs.items()):
                if idx >= threshold:
                    assert len(test_run[qid]) == threshold, f"Expect {threshold} on each qid, got {len(test_run[qid])} for query {qid}"
                    break
                test_run[qid][docid] = score

    unsampled_qrels = self.benchmark.unsampled_qrels if hasattr(self.benchmark, "unsampled_qrels") else self.benchmark.qrels
    test_dataset = PredSampler()
    test_dataset.prepare(test_run, unsampled_qrels, self.reranker.extractor, relevance_level=self.benchmark.relevance_level)
    logger.info("test prepared")

    # prediction
    dev_preds = self.reranker.trainer.predict(self.reranker, dev_dataset, dev_output_path)
    fold_dev_metrics = evaluator.eval_runs(dev_preds, unsampled_qrels, self.metrics, self.benchmark.relevance_level)
    logger.info("rerank: fold=%s dev metrics: %s", fold, fold_dev_metrics)

    test_preds = self.reranker.trainer.predict(self.reranker, test_dataset, test_output_path)
    fold_test_metrics = evaluator.eval_runs(test_preds, unsampled_qrels, self.metrics, self.benchmark.relevance_level)
    logger.info("rerank: fold=%s test metrics: %s", fold, fold_test_metrics)

    wandb.save(str(dev_output_path))
    wandb.save(str(test_output_path))

    # add cross-validated results:
    n_folds = len(self.benchmark.folds)
    folds_fn = {f"s{i}": savedir / f"s{i}" / "test" for i in range(1, n_folds + 1)}
    if not all([fn.exists() for fn in folds_fn.values()]):
        return {"fold_test_metrics": fold_test_metrics, "cv_metrics": None}

    all_preds = {}
    reranker_runs = {
        fold: {"dev": Searcher.load_trec_run(fn.parent / "dev"), "test": Searcher.load_trec_run(fn)}
        for fold, fn in folds_fn.items()
    }
    for fold, dev_test in reranker_runs.items():
        preds = dev_test["test"]
        qids = self.benchmark.folds[fold]["predict"]["test"]
        for qid, docscores in preds.items():
            if qid not in qids:
                continue
            all_preds.setdefault(qid, {})
            for docid, score in docscores.items():
                all_preds[qid][docid] = score

    cv_metrics = evaluator.eval_runs(all_preds, unsampled_qrels, self.metrics, self.benchmark.relevance_level)
    for metric, score in sorted(cv_metrics.items()):
        logger.info("%25s: %0.4f", metric, score)

    searcher_runs = {}
    rank_results = self.rank.evaluate()
    for fold in self.benchmark.folds:
        searcher_runs[fold] = {"dev": Searcher.load_trec_run(rank_results["path"][fold])}
        searcher_runs[fold]["test"] = searcher_runs[fold]["dev"]

    interpolated_results = evaluator.interpolated_eval(
        searcher_runs, reranker_runs, self.benchmark, self.config["optimize"], self.metrics
    )

    return {
        "fold_test_metrics": fold_test_metrics,
        "cv_metrics": cv_metrics,
        "interpolated_results": interpolated_results,
    }
def interpolate(_config):
    from capreolus.searcher import Searcher
    import pytrec_eval

    pipeline.initialize(_config)
    logger.info("initialized pipeline with results path: %s", pipeline.reranker_path)

    benchmark = pipeline.benchmark
    benchmark.build()  # TODO move this to pipeline.initialize?

    test_metrics = {}
    for foldname, fold in sorted(benchmark.folds.items()):
        if not (len(fold["predict"]) == 2 and "dev" in fold["predict"] and "test" in fold["predict"]):
            raise RuntimeError("this evaluation command is only supported for benchmarks with 'dev' and 'test' folds")

        logger.debug("evaluating fold: %s", foldname)
        predict_path = os.path.join(pipeline.reranker_path, foldname, "predict")

        dev_qids = set(fold["predict"]["dev"])
        dev_qrels = {qid: labels for qid, labels in pipeline.collection.qrels.items() if qid in dev_qids}
        dev_eval = pytrec_eval.RelevanceEvaluator(dev_qrels, {"ndcg_cut", "P", "map"})

        test_qids = set(fold["predict"]["test"])
        test_qrels = {qid: labels for qid, labels in pipeline.collection.qrels.items() if qid in test_qids}

        searcher_dev = {qid: docscores for qid, docscores in benchmark.reranking_runs[foldname].items() if qid in dev_qids}
        searcher_test = {qid: docscores for qid, docscores in benchmark.reranking_runs[foldname].items() if qid in test_qids}

        best_metric, best_iter, dev_run = -np.inf, None, None
        target_metric = "ndcg_cut_20"
        # target_metric = "map"

        devpath = os.path.join(predict_path, "dev")
        for iterfn in os.listdir(devpath):
            dev_run = Searcher.load_trec_run(os.path.join(devpath, iterfn))
            test_run = Searcher.load_trec_run(os.path.join(predict_path, "test", iterfn))
            alpha, interpolated_test_run, interpolated_dev_run = Searcher.crossvalidated_interpolation(
                dev={"reranker": dev_run, "searcher": searcher_dev, "qrels": dev_qrels},
                test={"reranker": test_run, "searcher": searcher_test, "qrels": test_qrels},
                metric=target_metric,
            )
            this_metric = np.mean([q[target_metric] for q in dev_eval.evaluate(interpolated_dev_run).values()])
            if this_metric > best_metric:
                best_metric = this_metric
                best_iter = iterfn
                use_run = interpolated_test_run
            print(foldname, iterfn, best_metric, alpha)

        logger.debug("best dev %s was on iteration #%s", target_metric, best_iter)
        # test_run = Searcher.load_trec_run(os.path.join(predict_path, "test", best_iter))
        test_run = use_run
        test_eval = pytrec_eval.RelevanceEvaluator(test_qrels, {"ndcg_cut", "P", "map"})
        for qid, metrics in test_eval.evaluate(test_run).items():
            assert qid in test_qids
            for metric, value in metrics.items():
                test_metrics.setdefault(metric, {})
                assert qid not in test_metrics[metric], "fold testqid overlap"
                test_metrics[metric][qid] = value

        # output files for Anserini interpolation script
        Searcher.write_trec_run(
            Searcher.load_trec_run(os.path.join(predict_path, "dev", best_iter)), f"runs.rerankerIES.{foldname}.dev"
        )
        Searcher.write_trec_run(
            Searcher.load_trec_run(os.path.join(predict_path, "test", best_iter)), f"runs.rerankerIES.{foldname}.test"
        )

    logger.info(f"optimized for {target_metric}")
    logger.info(f"results on {len(test_metrics[metric])} aggregated test qids")
    for metric in ["ndcg_cut_20", "map", "P_5", "P_20"]:
        interpolated_avg = np.mean([*test_metrics[metric].values()])
        logger.info(f"[interpolated] avg {metric}: {interpolated_avg:0.3f}")
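# Searcher.crossvalidated_interpolation is used above as a black box. The general idea of
# score interpolation is to combine first-stage (searcher) and reranker scores per document
# as alpha * searcher + (1 - alpha) * reranker, with alpha chosen on dev. This sketch shows
# only that combination step; the library's exact normalization and alpha search may differ:
def interpolate_runs(searcher_run, reranker_run, alpha):
    out = {}
    for qid, searcher_docs in searcher_run.items():
        reranker_docs = reranker_run.get(qid, {})
        out[qid] = {
            docid: alpha * searcher_docs.get(docid, 0.0) + (1 - alpha) * reranker_docs.get(docid, 0.0)
            for docid in set(searcher_docs) | set(reranker_docs)
        }
    return out

combined = interpolate_runs({"q1": {"d1": 10.0, "d2": 4.0}}, {"q1": {"d1": 0.0, "d2": 8.0}}, alpha=0.5)
assert combined == {"q1": {"d1": 5.0, "d2": 6.0}}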
def evaluate(_config):
    from capreolus.searcher import Searcher
    import pytrec_eval

    pipeline.initialize(_config)
    logger.debug("initialized pipeline with results path: %s", pipeline.reranker_path)

    benchmark = pipeline.benchmark
    benchmark.build()  # TODO move this to pipeline.initialize?

    test_metrics = {}
    searcher_test_metrics = {}
    interpolated_test_metrics = {}
    for foldname, fold in sorted(benchmark.folds.items()):
        if not (len(fold["predict"]) == 2 and "dev" in fold["predict"] and "test" in fold["predict"]):
            raise RuntimeError("this evaluation command is only supported for benchmarks with 'dev' and 'test' folds")

        logger.debug("evaluating fold: %s", foldname)
        predict_path = os.path.join(pipeline.reranker_path, foldname, "predict")

        dev_qids = set(fold["predict"]["dev"])
        dev_qrels = {qid: labels for qid, labels in pipeline.collection.qrels.items() if qid in dev_qids}
        dev_eval = pytrec_eval.RelevanceEvaluator(dev_qrels, {"ndcg_cut", "P", "map"})

        best_metric, best_iter, dev_run = -np.inf, None, None
        target_metric = "ndcg_cut_20"
        # target_metric = "map"

        devpath = os.path.join(predict_path, "dev")
        for iterfn in os.listdir(devpath):
            run = Searcher.load_trec_run(os.path.join(devpath, iterfn))
            this_metric = np.mean([q[target_metric] for q in dev_eval.evaluate(run).values()])
            if this_metric > best_metric:
                best_metric = this_metric
                best_iter = iterfn
                dev_run = run

        logger.debug("best dev %s=%0.3f was on iteration #%s", target_metric, best_metric, best_iter)

        test_run = Searcher.load_trec_run(os.path.join(predict_path, "test", best_iter))
        test_qids = set(fold["predict"]["test"])
        test_qrels = {qid: labels for qid, labels in pipeline.collection.qrels.items() if qid in test_qids}
        test_eval = pytrec_eval.RelevanceEvaluator(test_qrels, {"ndcg_cut", "P", "map"})
        for qid, metrics in test_eval.evaluate(test_run).items():
            assert qid in test_qids
            for metric, value in metrics.items():
                test_metrics.setdefault(metric, {})
                assert qid not in test_metrics[metric], "fold testqid overlap"
                test_metrics[metric][qid] = value

        # compute metrics for the run being reranked
        for qid, metrics in test_eval.evaluate(benchmark.reranking_runs[foldname]).items():
            assert qid in test_qids
            for metric, value in metrics.items():
                searcher_test_metrics.setdefault(metric, {})
                assert qid not in searcher_test_metrics[metric], "fold testqid overlap"
                searcher_test_metrics[metric][qid] = value

        # choose an alpha for interpolation using the dev_qids,
        # then create a run by interpolating the searcher and reranker scores
        searcher_dev = {qid: docscores for qid, docscores in benchmark.reranking_runs[foldname].items() if qid in dev_qids}
        searcher_test = {qid: docscores for qid, docscores in benchmark.reranking_runs[foldname].items() if qid in test_qids}
        alpha, interpolated_test_run, _ = Searcher.crossvalidated_interpolation(
            dev={"reranker": dev_run, "searcher": searcher_dev, "qrels": dev_qrels},
            test={"reranker": test_run, "searcher": searcher_test, "qrels": test_qrels},
            metric=target_metric,
        )

        # output files for Anserini interpolation script
        Searcher.write_trec_run(dev_run, f"runs.reranker.{foldname}.dev")
        Searcher.write_trec_run(test_run, f"runs.reranker.{foldname}.test")
        Searcher.write_trec_run(searcher_dev, f"runs.searcher.{foldname}.dev")
        Searcher.write_trec_run(searcher_test, f"runs.searcher.{foldname}.test")

        logger.debug(f"interpolation alpha={alpha}")
        for qid, metrics in test_eval.evaluate(interpolated_test_run).items():
            assert qid in test_qids
            for metric, value in metrics.items():
                interpolated_test_metrics.setdefault(metric, {})
                assert qid not in interpolated_test_metrics[metric], "fold testqid overlap"
                interpolated_test_metrics[metric][qid] = value

    logger.info(f"optimized for {target_metric}")
    logger.info(f"results on {len(test_metrics[metric])} aggregated test qids")
    for metric in ["map", "P_20", "ndcg_cut_20"]:
        assert len(test_metrics[metric]) == len(searcher_test_metrics[metric])
        assert len(test_metrics[metric]) == len(interpolated_test_metrics[metric])

        searcher_avg = np.mean([*searcher_test_metrics[metric].values()])
        logger.info(f"[searcher] avg {metric}: {searcher_avg:0.3f}")

        sigtest_qids = sorted(test_metrics[metric].keys())
        sigtest = ttest_rel(
            [searcher_test_metrics[metric][qid] for qid in sigtest_qids],
            [test_metrics[metric][qid] for qid in sigtest_qids],
        )

        avg = np.mean([*test_metrics[metric].values()])
        logger.info(f"[reranker] avg {metric}: {avg:0.3f}\tp={sigtest.pvalue:0.3f} (vs. searcher)")

        interpolated_avg = np.mean([*interpolated_test_metrics[metric].values()])
        logger.info(f"[interpolated] avg {metric}: {interpolated_avg:0.3f}")

    with open(os.path.join(predict_path, "results.json"), "wt") as outf:
        json.dump((test_metrics, searcher_test_metrics, interpolated_test_metrics), outf)
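# The paired significance test above compares per-query metric values of the searcher and the
# reranker over the same sorted qid list; scipy's ttest_rel returns a result whose .pvalue is
# logged. A standalone sketch with made-up per-query scores:
from scipy.stats import ttest_rel

searcher_scores = {"q1": 0.20, "q2": 0.30, "q3": 0.25}
reranker_scores = {"q1": 0.35, "q2": 0.40, "q3": 0.30}

qids = sorted(searcher_scores.keys())
sigtest = ttest_rel([searcher_scores[q] for q in qids], [reranker_scores[q] for q in qids])
print(f"p={sigtest.pvalue:0.3f}")  # a small p-value suggests a consistent per-query difference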