def test_trec_run_read(self):
    input_path = 'tests/resources/simple_trec_run_read.txt'
    verify_path = 'tests/resources/simple_trec_run_read_verify.txt'

    run = TrecRun(filepath=input_path)
    run.save_to_txt(self.output_path)
    self.assertTrue(filecmp.cmp(verify_path, self.output_path))
def bm25(qid, query, docs, index_path):
    # Retrieve passages with BM25 for this query.
    s = SimpleSearcher(index_path)
    hits = s.search(query, 1000)

    # Write the passage-level run, keeping only the first (highest-scoring)
    # hit for each docid.
    n = 1
    seen_docids = {}
    with open(f'run-passage-{qid}.txt', 'w') as writer:
        for i in range(len(hits)):
            if hits[i].docid in seen_docids:
                continue
            writer.write(f'{qid} Q0 {hits[i].docid} {n} {hits[i].score:.5f} pyserini\n')
            n = n + 1
            seen_docids[hits[i].docid] = 1

    # Write the document-level run from the base results; ranks and scores
    # come directly from the base run, so no counter is needed here.
    with open(f'run-doc-{qid}.txt', 'w') as writer:
        for doc in docs:
            writer.write(f'{qid} Q0 {doc["docid"]} {doc["rank"]} {doc["score"]} base\n')

    # Fuse the passage-level and document-level runs with reciprocal rank fusion (RRF).
    os.system(f'python -m pyserini.fusion --method rrf --runs run-passage-{qid}.txt run-doc-{qid}.txt ' +
              f'--output run-rrf-{qid}.txt --runtag test')

    # Read back the fused run and return (qid, docid, rank) triples.
    fused_run = TrecRun(f'run-rrf-{qid}.txt')
    output = []
    for idx, r in fused_run.get_docs_by_topic(qid).iterrows():
        output.append([qid, r["docid"], r["rank"]])

    return output
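# Illustrative sketch only, not Pyserini's internal implementation: this shows the
# reciprocal rank fusion (RRF) formula that the `python -m pyserini.fusion --method rrf`
# call above applies. The helper name `rrf_fuse` and the constant k=60 (the common
# default from the RRF literature) are assumptions made here for illustration.
def rrf_fuse(runs, k=60):
    """Fuse rankings, where each run is a dict mapping docid -> rank (1-based)."""
    scores = {}
    for run in runs:
        for docid, rank in run.items():
            # Each run contributes 1 / (k + rank) to the document's fused score.
            scores[docid] = scores.get(docid, 0.0) + 1.0 / (k + rank)
    # Higher fused score is better.
    return sorted(scores.items(), key=lambda item: item[1], reverse=True)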
def main(args):
    if args.cache and not os.path.exists(args.cache):
        os.mkdir(args.cache)

    # Load queries:
    queries = load_queries(args.queries)
    # Load base run to rerank:
    base_run = TrecRun(args.input)

    # SimpleSearcher to fetch document texts.
    searcher = SimpleSearcher.from_prebuilt_index('msmarco-doc')

    output = []

    if args.bm25:
        reranker = 'bm25'
    elif args.ance:
        reranker = 'ance'
    elif not args.identity:
        sys.exit('Unknown reranking method!')

    cnt = 1
    for row in queries:
        qid = int(row[0])
        query = row[1]
        print(f'{cnt} {qid} {query}')
        qid_results = base_run.get_docs_by_topic(qid)

        # Don't actually do reranking, just pass along the base run:
        if args.identity:
            rank = 1
            for docid in qid_results['docid'].tolist():
                output.append([qid, docid, rank])
                rank = rank + 1
            cnt = cnt + 1
            continue

        # Gather results for reranking:
        results_to_rerank = []
        for index, result in qid_results.iterrows():
            raw_doc = searcher.doc(
                result['docid']).raw().lstrip('<TEXT>').rstrip('</TEXT>')
            results_to_rerank.append({
                'docid': result['docid'],
                'rank': result['rank'],
                'score': result['score'],
                'text': raw_doc
            })

        # Perform the actual reranking:
        output.extend(
            rerank(args.cache, qid, query, results_to_rerank, reranker))
        cnt = cnt + 1

    # Write the output run file:
    with open(args.output, 'w') as writer:
        for r in output:
            writer.write(f'{r[0]}\t{r[1]}\t{r[2]}\n')
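# Hypothetical usage sketch (the script name and the argparse wiring are not shown
# above, so both are assumptions); the flags listed are only those that main() reads:
#
#   python rerank_msmarco_doc.py --queries queries.tsv \
#       --input run.msmarco-doc.base.txt \
#       --output run.msmarco-doc.reranked.txt \
#       --cache cache/ --bm25
#
# One of --bm25, --ance, or --identity selects the behavior; --identity simply
# copies the base run's ranking to the output without reranking.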
def test_retain_qrels(self):
    run = TrecRun('tests/resources/simple_trec_run_filter.txt')
    qrels = Qrels('tools/topics-and-qrels/qrels.covid-round1.txt')

    run.retain_qrels(qrels, clone=True).save_to_txt(output_path=self.output_path)
    self.assertTrue(
        filecmp.cmp('tests/resources/simple_trec_run_keep_verify.txt',
                    self.output_path))
def test_trec_run_topics(self):
    input_path = os.path.join(
        self.root, 'tests/resources/simple_trec_run_msmarco_doc1.txt')
    run = TrecRun(filepath=input_path)

    self.assertEqual(run.topics(), {320792, 174249, 1090270, 1101279})
    for topic in run.topics():
        self.assertEqual(len(run.get_docs_by_topic(topic)), 5)
def test_normalize_scores(self):
    run = TrecRun(
        os.path.join(self.root, 'tests/resources/simple_trec_run_fusion_1.txt'))
    run.rescore(RescoreMethod.NORMALIZE).save_to_txt(self.output_path)

    self.assertTrue(
        filecmp.cmp(
            os.path.join(
                self.root,
                'tests/resources/simple_trec_run_normalize_verify.txt'),
            self.output_path))
def test_discard_qrels(self):
    run = TrecRun(
        os.path.join(self.root, 'tests/resources/simple_trec_run_filter.txt'))
    qrels = Qrels(
        os.path.join(self.root, 'tools/topics-and-qrels/qrels.covid-round1.txt'))

    run.discard_qrels(
        qrels, clone=False).save_to_txt(output_path=self.output_path)
    self.assertTrue(
        filecmp.cmp(
            os.path.join(
                self.root,
                'tests/resources/simple_trec_run_remove_verify.txt'),
            self.output_path))