def run(self):
    output_dir = self.get_results_path()

    # read the title queries from the chosen benchmark's topic file
    results1 = self.searcher1.query_from_file(self.benchmark.get_topics_file(), output_dir / "searcher1")
    results2 = self.searcher2.query_from_file(self.benchmark.get_topics_file(), output_dir / "searcher2")
    searcher_results = [results1, results2]

    # using the benchmark's folds, which each contain train/validation/test queries,
    # choose the best run in `output_dir` for the fold based on the validation queries
    # and return metrics calculated on the test queries
    best_results = evaluator.search_best_run(
        searcher_results, self.benchmark, primary_metric=self.config["optimize"], metrics=evaluator.DEFAULT_METRICS
    )

    for fold, path in best_results["path"].items():
        shortpath = "..." + path[-40:]
        logger.info("fold=%s best run: %s", fold, shortpath)

    logger.info("cross-validated results when optimizing for '%s':", self.config["optimize"])
    for metric, score in sorted(best_results["score"].items()):
        logger.info("%15s: %0.4f", metric, score)

    return best_results
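# For reference, a sketch of the structure `evaluator.search_best_run` returns,
# inferred from how `run` consumes it above: one best run file per fold under
# "path", and the cross-validated scores under "score". The fold name, file path,
# and numbers below are illustrative only, and any keys beyond these two are not
# shown because the code above does not reveal them.
example_best_results = {
    "path": {"s1": "/cache/searcher1/some_run_file"},  # fold -> best run file for that fold
    "score": {"map": 0.2876, "P_20": 0.3311},  # metric -> score computed on the folds' test queries
}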
def evaluate(config, modules):
    metric = "map"
    fold = config["fold"]
    train_output_path = _pipeline_path(config, modules)
    test_output_path = train_output_path / "pred" / "test" / "best"
    searcher = modules["searcher"]
    benchmark = modules["benchmark"]
    reranker = modules["reranker"]

    if os.path.exists(test_output_path):
        test_preds = Searcher.load_trec_run(test_output_path)
    else:
        topics_fn = benchmark.topic_file
        searcher_cache_dir = os.path.join(searcher.get_cache_path(), benchmark.name)
        searcher_run_dir = searcher.query_from_file(topics_fn, searcher_cache_dir)

        best_search_run_path = evaluator.search_best_run(searcher_run_dir, benchmark, metric)["path"][fold]
        best_search_run = searcher.load_trec_run(best_search_run_path)

        docids = set(docid for querydocs in best_search_run.values() for docid in querydocs)
        reranker["extractor"].create(qids=best_search_run.keys(), docids=docids, topics=benchmark.topics[benchmark.query_type])
        reranker.build()
        reranker["trainer"].load_best_model(reranker, train_output_path)

        test_run = {qid: docs for qid, docs in best_search_run.items() if qid in benchmark.folds[fold]["predict"]["test"]}
        test_dataset = PredDataset(qid_docid_to_rank=test_run, extractor=reranker["extractor"], mode="test")
        test_preds = reranker["trainer"].predict(reranker, test_dataset, test_output_path)

    metrics = evaluator.eval_runs(test_preds, benchmark.qrels, ["ndcg_cut_20", "ndcg_cut_10", "map", "P_20", "P_10"])
    print("test metrics for fold=%s:" % fold, metrics)

    print("\ncomputing metrics across all folds")
    avg = {}
    found = 0
    for fold in benchmark.folds:
        pred_path = _pipeline_path(config, modules, fold=fold) / "pred" / "test" / "best"
        if not os.path.exists(pred_path):
            print("\tfold=%s results are missing and will not be included" % fold)
            continue

        found += 1
        preds = Searcher.load_trec_run(pred_path)
        metrics = evaluator.eval_runs(preds, benchmark.qrels, ["ndcg_cut_20", "ndcg_cut_10", "map", "P_20", "P_10"])
        for metric, val in metrics.items():
            avg.setdefault(metric, []).append(val)

    avg = {k: np.mean(v) for k, v in avg.items()}
    print(f"average metrics across {found}/{len(benchmark.folds)} folds:", avg)
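# A sketch of the in-memory run format that `Searcher.load_trec_run` appears to
# produce, inferred from the comprehensions above: a dict mapping each query id to
# the doc ids retrieved for it. The qids, docids, and values below are illustrative;
# treating the inner value as a retrieval score is an assumption based on TREC run
# files, since the code above only ever iterates the inner keys.
example_run = {
    "301": {"FBIS3-10082": 14.21, "FBIS3-10169": 13.87},
    "302": {"FBIS3-10634": 11.02},
}
docids = set(docid for querydocs in example_run.values() for docid in querydocs)
assert docids == {"FBIS3-10082", "FBIS3-10169", "FBIS3-10634"}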
def evaluate(config, modules):
    # output_path = _pipeline_path(config, modules)
    searcher = modules["searcher"]
    benchmark = modules["benchmark"]
    metric = config["optimize"]
    all_metrics = ["ndcg_cut_20", "ndcg_cut_10", "map", "P_20", "P_10", "set_recall"]

    output_dir = searcher.get_cache_path() / benchmark.name
    best_results = evaluator.search_best_run(output_dir, benchmark, primary_metric=metric, metrics=all_metrics)

    paths = [f"\t{s}: {path}" for s, path in best_results["path"].items()]
    print("path for each split:\n", "\n".join(paths))

    scores = [f"\t{s}: {score}" for s, score in best_results["score"].items()]
    print(f"cross-validated results when optimizing for {metric}:\n", "\n".join(scores))
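# `_pipeline_path` is called by the functions above and below but never defined in
# this excerpt. A minimal sketch of the idea, assuming it derives a per-fold output
# directory from the configured modules; `get_cache_path()` and `benchmark.name`
# are calls the surrounding code actually makes, but the directory layout here is
# hypothetical, not the toolkit's real scheme.
from pathlib import Path

def _pipeline_path(config, modules, fold=None):
    fold = config["fold"] if fold is None else fold
    base = Path(modules["searcher"].get_cache_path())
    return base / modules["benchmark"].name / "rerank" / fold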
def train(config, modules):
    # seed all RNGs so training runs are reproducible
    random.seed(config["seed"])
    np.random.seed(config["seed"])
    torch.manual_seed(config["seed"])
    torch.cuda.manual_seed_all(config["seed"])

    metric = "map"
    fold = config["fold"]
    searcher = modules["searcher"]
    benchmark = modules["benchmark"]
    reranker = modules["reranker"]

    if "index" in searcher.modules:
        searcher["index"].create_index()

    topics_fn = benchmark.topic_file
    searcher_cache_dir = os.path.join(searcher.get_cache_path(), benchmark.name)
    searcher_run_dir = searcher.query_from_file(topics_fn, searcher_cache_dir)

    results = evaluator.search_best_run(searcher_run_dir, benchmark, metric)
    best_search_run_path = results["path"][fold]
    best_search_run = searcher.load_trec_run(best_search_run_path)

    if config["rundocsonly"]:
        # train and validate on only the documents in the searcher's best run
        docids = set(docid for querydocs in best_search_run.values() for docid in querydocs)
        reranker["extractor"].create(qids=best_search_run.keys(), docids=docids, topics=benchmark.topics[benchmark.query_type])
        train_run = {qid: docs for qid, docs in best_search_run.items() if qid in benchmark.folds[fold]["train_qids"]}
        dev_run = {qid: docs for qid, docs in best_search_run.items() if qid in benchmark.folds[fold]["predict"]["dev"]}
    else:
        # train and validate on all documents judged in the qrels
        docids = set(docid for querydocs in benchmark.qrels.values() for docid in querydocs)
        reranker["extractor"].create(qids=benchmark.qrels.keys(), docids=docids, topics=benchmark.topics[benchmark.query_type])
        train_run = {qid: docs for qid, docs in benchmark.qrels.items() if qid in benchmark.folds[fold]["train_qids"]}
        dev_run = {qid: docs for qid, docs in benchmark.qrels.items() if qid in benchmark.folds[fold]["predict"]["dev"]}

    reranker.build()
    train_dataset = TrainDataset(qid_docid_to_rank=train_run, qrels=benchmark.qrels, extractor=reranker["extractor"])
    dev_dataset = PredDataset(qid_docid_to_rank=dev_run, qrels=benchmark.qrels, extractor=reranker["extractor"], mode="val")

    train_output_path = _pipeline_path(config, modules)
    dev_output_path = train_output_path / "pred" / "dev"
    reranker["trainer"].train(reranker, train_dataset, train_output_path, dev_dataset, dev_output_path, benchmark.qrels, metric)
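# A sketch of the fold structure `train` and `evaluate` expect, inferred from the
# keys accessed above: each fold names its training qids plus the qids to predict
# on for validation ("dev") and testing ("test"). The fold name and qids below are
# illustrative only.
example_folds = {
    "s1": {
        "train_qids": ["301", "302"],
        "predict": {"dev": ["303"], "test": ["304"]},
    },
}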
def evaluate(self):
    metrics = self.config["metrics"] if list(self.config["metrics"]) != ["default"] else evaluator.DEFAULT_METRICS
    best_results = evaluator.search_best_run(
        self.get_results_path(), self.benchmark, primary_metric=self.config["optimize"], metrics=metrics
    )

    for fold, path in best_results["path"].items():
        logger.info("rank: fold=%s best run: %s", fold, path)

    logger.info("rank: cross-validated results when optimizing for '%s':", self.config["optimize"])
    for metric, score in sorted(best_results["score"].items()):
        logger.info("%25s: %0.4f", metric, score)

    return best_results