def test_simple_fusion_searcher(self):
    index_dirs = ['indexes/lucene-index-cord19-abstract-2020-05-01/',
                  'indexes/lucene-index-cord19-full-text-2020-05-01/',
                  'indexes/lucene-index-cord19-paragraph-2020-05-01/']
    searcher = SimpleFusionSearcher(index_dirs, method=FusionMethod.RRF)

    runs, topics = [], get_topics('covid-round2')
    for topic in tqdm(sorted(topics.keys())):
        query = topics[topic]['question'] + ' ' + topics[topic]['query']
        hits = searcher.search(query, k=10000, query_generator=None, strip_segment_id=True, remove_dups=True)
        docid_score_pair = [(hit.docid, hit.score) for hit in hits]
        run = TrecRun.from_search_results(docid_score_pair, topic=topic)
        runs.append(run)

    all_topics_run = TrecRun.concat(runs)
    all_topics_run.save_to_txt(output_path='runs/fused.txt', tag='reciprocal_rank_fusion_k=60')

    # Only keep topic, docid, and rank: scores may differ slightly due to floating-point precision
    # and underlying library versions.
    # TODO: We should probably do this in Python rather than shelling out, for better portability.
    # This has also proven to be a somewhat brittle test, see https://github.com/castorini/pyserini/issues/947
    # As a stopgap for the above issue, we restrict the comparison to the top 100 ranks only.
    os.system("""awk '$4 <= 100 {print $1" "$3" "$4}' runs/fused.txt > runs/this.txt""")
    os.system("""awk '$4 <= 100 {print $1" "$3" "$4}' runs/anserini.covid-r2.fusion1.txt > runs/that.txt""")

    self.assertTrue(filecmp.cmp('runs/this.txt', 'runs/that.txt'))
def test_trec_run_read(self):
    input_path = 'tests/resources/simple_trec_run_read.txt'
    verify_path = 'tests/resources/simple_trec_run_read_verify.txt'

    run = TrecRun(filepath=input_path)
    run.save_to_txt(self.output_path)
    self.assertTrue(filecmp.cmp(verify_path, self.output_path))
def bm25(qid, query, docs, index_path):
    s = SimpleSearcher(index_path)
    hits = s.search(query, 1000)

    n = 1
    seen_docids = {}
    # Write the passage-level BM25 run, skipping duplicate docids.
    with open(f'run-passage-{qid}.txt', 'w') as writer:
        for i in range(0, len(hits)):
            if hits[i].docid in seen_docids:
                continue
            writer.write(f'{qid} Q0 {hits[i].docid} {n} {hits[i].score:.5f} pyserini\n')
            n = n + 1
            seen_docids[hits[i].docid] = 1

    # Write the document-level base run passed in via `docs`.
    with open(f'run-doc-{qid}.txt', 'w') as writer:
        for doc in docs:
            writer.write(f'{qid} Q0 {doc["docid"]} {doc["rank"]} {doc["score"]} base\n')
            n = n + 1

    # Fuse the passage-level and document-level runs with reciprocal rank fusion.
    os.system(f'python -m pyserini.fusion --method rrf --runs run-passage-{qid}.txt run-doc-{qid}.txt ' +
              f'--output run-rrf-{qid}.txt --runtag test')

    fused_run = TrecRun(f'run-rrf-{qid}.txt')
    output = []
    for idx, r in fused_run.get_docs_by_topic(qid).iterrows():
        output.append([qid, r["docid"], r["rank"]])

    return output
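# Usage sketch for bm25() above (not from the original source): the qid, query, document
# entries, and index path are made-up placeholders. `docs` must be a list of dicts with
# 'docid', 'rank', and 'score' keys, matching what the function reads.
base_docs = [
    {'docid': 'D1555982', 'rank': 1, 'score': 14.89},
    {'docid': 'D2286511', 'rank': 2, 'score': 13.12},
]
fused = bm25(1037798, 'who is robert gray', base_docs, 'indexes/msmarco-passage/')
# Each entry of `fused` is [qid, docid, rank] in fused (RRF) order.
print(fused[:3])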
def main(args):
    if args.cache and not os.path.exists(args.cache):
        os.mkdir(args.cache)

    # Load queries:
    queries = load_queries(args.queries)
    # Load base run to rerank:
    base_run = TrecRun(args.input)

    # SimpleSearcher to fetch document texts.
    searcher = SimpleSearcher.from_prebuilt_index('msmarco-doc')

    output = []

    if args.bm25:
        reranker = 'bm25'
    elif args.ance:
        reranker = 'ance'
    elif not args.identity:
        sys.exit('Unknown reranking method!')

    cnt = 1
    for row in queries:
        qid = int(row[0])
        query = row[1]
        print(f'{cnt} {qid} {query}')
        qid_results = base_run.get_docs_by_topic(qid)

        # Don't actually do reranking, just pass along the base run:
        if args.identity:
            rank = 1
            for docid in qid_results['docid'].tolist():
                output.append([qid, docid, rank])
                rank = rank + 1
            cnt = cnt + 1
            continue

        # Gather results for reranking:
        results_to_rerank = []
        for index, result in qid_results.iterrows():
            raw_doc = searcher.doc(result['docid']).raw()
            # Strip the literal <TEXT>...</TEXT> wrapper. Note that str.lstrip/rstrip treat
            # their argument as a character set and could eat document text, so use
            # removeprefix/removesuffix (Python 3.9+) instead.
            raw_doc = raw_doc.removeprefix('<TEXT>').removesuffix('</TEXT>')
            results_to_rerank.append({'docid': result['docid'],
                                      'rank': result['rank'],
                                      'score': result['score'],
                                      'text': raw_doc})

        # Perform the actual reranking:
        output.extend(rerank(args.cache, qid, query, results_to_rerank, reranker))
        cnt = cnt + 1

    # Write the output run file:
    with open(args.output, 'w') as writer:
        for r in output:
            writer.write(f'{r[0]}\t{r[1]}\t{r[2]}\n')
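# A minimal argparse sketch consistent with the attributes that main() reads above
# (args.cache, args.queries, args.input, args.output, args.bm25, args.ance, args.identity).
# The help strings, defaults, and mutually-exclusive grouping are assumptions, not taken
# from the original script.
if __name__ == '__main__':
    import argparse

    parser = argparse.ArgumentParser(description='Rerank a base MS MARCO doc run.')
    parser.add_argument('--queries', required=True, help='File with queries to rerank.')
    parser.add_argument('--input', required=True, help='Base TREC run to rerank.')
    parser.add_argument('--output', required=True, help='Path to the output run file.')
    parser.add_argument('--cache', default='', help='Optional cache directory.')
    group = parser.add_mutually_exclusive_group(required=True)
    group.add_argument('--bm25', action='store_true', help='Rerank with BM25.')
    group.add_argument('--ance', action='store_true', help='Rerank with ANCE.')
    group.add_argument('--identity', action='store_true', help='Pass the base run through unchanged.')

    main(parser.parse_args())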
def test_retain_qrels(self):
    run = TrecRun('tests/resources/simple_trec_run_filter.txt')
    qrels = Qrels('tools/topics-and-qrels/qrels.covid-round1.txt')

    run.retain_qrels(qrels, clone=True).save_to_txt(output_path=self.output_path)
    self.assertTrue(filecmp.cmp('tests/resources/simple_trec_run_keep_verify.txt', self.output_path))
def test_trec_run_topics(self):
    input_path = os.path.join(self.root, 'tests/resources/simple_trec_run_msmarco_doc1.txt')

    run = TrecRun(filepath=input_path)
    self.assertEqual(run.topics(), {320792, 174249, 1090270, 1101279})

    for topic in run.topics():
        self.assertEqual(len(run.get_docs_by_topic(topic)), 5)
def test_normalize_scores(self):
    run = TrecRun(os.path.join(self.root, 'tests/resources/simple_trec_run_fusion_1.txt'))

    run.rescore(RescoreMethod.NORMALIZE).save_to_txt(self.output_path)
    self.assertTrue(filecmp.cmp(os.path.join(self.root, 'tests/resources/simple_trec_run_normalize_verify.txt'),
                                self.output_path))
def search(self, q: Union[str, JQuery], k: int = 10,
           query_generator: JQueryGenerator = None,
           strip_segment_id=False, remove_dups=False) -> List[JLuceneSearcherResult]:
    trec_runs, docid_to_search_result = list(), dict()

    # Search each underlying index, remembering every hit so the original result objects
    # can be reattached to the fused ranking afterwards.
    for searcher in self.searchers:
        docid_score_pair = list()
        hits = searcher.search(q, k=k, query_generator=query_generator,
                               strip_segment_id=strip_segment_id, remove_dups=remove_dups)
        for hit in hits:
            docid_to_search_result[hit.docid] = hit
            docid_score_pair.append((hit.docid, hit.score))

        run = TrecRun.from_search_results(docid_score_pair)
        trec_runs.append(run)

    if self.method == FusionMethod.RRF:
        fused_run = reciprocal_rank_fusion(trec_runs, rrf_k=60, depth=1000, k=1000)
    else:
        raise NotImplementedError()

    return self.convert_to_search_result(fused_run, docid_to_search_result)
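# Usage sketch for the search() method above. The index directories and query are
# placeholders; any set of compatible Lucene indexes will do, and only RRF is currently
# supported as the fusion method.
fusion_searcher = SimpleFusionSearcher(['indexes/index-a/', 'indexes/index-b/'],
                                       method=FusionMethod.RRF)
for hit in fusion_searcher.search('incubation period of COVID-19', k=10):
    print(hit.docid, hit.score)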
def reciprocal_rank_fusion(runs: List[TrecRun], rrf_k: int = 60, depth: int = None, k: int = None):
    """Perform reciprocal rank fusion on a list of ``TrecRun`` objects.
    Implementation follows the Cormack et al. (SIGIR 2009) paper titled
    "Reciprocal Rank Fusion Outperforms Condorcet and Individual Rank Learning Methods."

    Parameters
    ----------
    runs : List[TrecRun]
        List of ``TrecRun`` objects.
    rrf_k : int
        Parameter to avoid vanishing importance of lower-ranked documents. Note that this is different from the
        *k* in top *k* retrieval; set to 60 by default, per Cormack et al.
    depth : int
        Maximum number of results from each input run to consider. Set to ``None`` by default, which indicates that
        the complete list of results is considered.
    k : int
        Length of final results list. Set to ``None`` by default, which indicates that the union of all input
        documents is ranked.

    Returns
    -------
    TrecRun
        Output ``TrecRun`` that combines input runs via reciprocal rank fusion.
    """
    # TODO: Add option to *not* clone runs, thus making the method destructive, but also more efficient.
    rrf_runs = [run.clone().rescore(method=RescoreMethod.RRF, rrf_k=rrf_k) for run in runs]

    return TrecRun.merge(rrf_runs, AggregationMethod.SUM, depth=depth, k=k)
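# Worked illustration (not from the original source) of the score that
# reciprocal_rank_fusion() produces: each run contributes 1 / (rrf_k + rank) for a
# document, and the per-run contributions are summed. A document ranked 3rd in one run
# and 10th in another, with the default rrf_k = 60, therefore scores:
rrf_k = 60
fused_score = 1 / (rrf_k + 3) + 1 / (rrf_k + 10)
print(round(fused_score, 4))  # ~0.0302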
def average(runs: List[TrecRun], depth: int = None, k: int = None):
    """Perform fusion by averaging on a list of ``TrecRun`` objects.

    Parameters
    ----------
    runs : List[TrecRun]
        List of ``TrecRun`` objects.
    depth : int
        Maximum number of results from each input run to consider. Set to ``None`` by default, which indicates that
        the complete list of results is considered.
    k : int
        Length of final results list. Set to ``None`` by default, which indicates that the union of all input
        documents is ranked.

    Returns
    -------
    TrecRun
        Output ``TrecRun`` that combines input runs via averaging.
    """
    scaled_runs = [run.clone().rescore(method=RescoreMethod.SCALE, scale=(1 / len(runs))) for run in runs]

    return TrecRun.merge(scaled_runs, AggregationMethod.SUM, depth=depth, k=k)
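# Usage sketch for average() above; the run file paths are placeholders. Each run's scores
# are scaled by 1/len(runs) and then summed, so the fused score is the arithmetic mean of
# the per-run scores (a document missing from a run contributes 0 for that run).
run_a = TrecRun('runs/run.bm25.txt')
run_b = TrecRun('runs/run.dense.txt')
averaged = average([run_a, run_b], depth=1000, k=1000)
averaged.save_to_txt('runs/run.average.txt', tag='average')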
def test_discard_qrels(self):
    run = TrecRun(os.path.join(self.root, 'tests/resources/simple_trec_run_filter.txt'))
    qrels = Qrels(os.path.join(self.root, 'tools/topics-and-qrels/qrels.covid-round1.txt'))

    run.discard_qrels(qrels, clone=False).save_to_txt(output_path=self.output_path)
    self.assertTrue(filecmp.cmp(os.path.join(self.root, 'tests/resources/simple_trec_run_remove_verify.txt'),
                                self.output_path))
def convert_to_search_result(run: TrecRun,
                             docid_to_search_result: Dict[str, JSimpleSearcherResult]) -> List[JSimpleSearcherResult]:
    search_results = []

    for _, _, docid, _, score, _ in run.to_numpy():
        search_result = docid_to_search_result[docid]
        search_result.score = score
        search_results.append(search_result)

    return search_results
def interpolation(runs: List[TrecRun], alpha: float = 0.5, depth: int = None, k: int = None):
    """Perform fusion by interpolation on a list of exactly two ``TrecRun`` objects.
    new_score = first_run_score * alpha + (1 - alpha) * second_run_score.

    Parameters
    ----------
    runs : List[TrecRun]
        List of ``TrecRun`` objects. Exactly two runs.
    alpha : float
        Weight applied to the first run; (1 - alpha) is applied to the second run.
    depth : int
        Maximum number of results from each input run to consider. Set to ``None`` by default, which indicates that
        the complete list of results is considered.
    k : int
        Length of final results list. Set to ``None`` by default, which indicates that the union of all input
        documents is ranked.

    Returns
    -------
    TrecRun
        Output ``TrecRun`` that combines input runs via interpolation.
    """
    if len(runs) != 2:
        raise Exception('Interpolation must be performed on exactly two runs.')

    scaled_runs = []
    scaled_runs.append(runs[0].clone().rescore(method=RescoreMethod.SCALE, scale=alpha))
    scaled_runs.append(runs[1].clone().rescore(method=RescoreMethod.SCALE, scale=(1 - alpha)))

    return TrecRun.merge(scaled_runs, AggregationMethod.SUM, depth=depth, k=k)
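# Usage sketch for interpolation() above: the run paths and alpha value are placeholders.
# With alpha = 0.4, a document scoring 12.0 in the first run and 3.0 in the second fuses
# to 0.4 * 12.0 + 0.6 * 3.0 = 6.6.
sparse_run = TrecRun('runs/run.bm25.txt')
dense_run = TrecRun('runs/run.dense.txt')
fused = interpolation([sparse_run, dense_run], alpha=0.4, depth=1000, k=1000)
fused.save_to_txt('runs/run.interpolated.txt', tag='interpolation_alpha=0.4')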