def test_simple_fusion_searcher(self):
    """Compare an RRF-fused run over three CORD-19 indexes to a stored run.

    Every ``covid_round2`` topic is searched with a ``SimpleFusionSearcher``
    built over the abstract, full-text and paragraph indexes. The
    concatenated results are saved to ``fused.txt`` and then compared with
    ``anserini.covid-r2.fusion1.txt`` on the topic, docid and rank columns.
    """

    def project_run(src_path, dst_path):
        """Write columns 1, 3 and 4 of each line in src_path to dst_path.

        Blank lines are skipped. A missing input file or a line with fewer
        than four columns raises, so a broken input fails the test loudly
        instead of producing an empty file that compares equal.
        """
        with open(src_path, encoding='utf-8') as src, \
                open(dst_path, 'w', encoding='utf-8') as dst:
            for line_no, line in enumerate(src, start=1):
                columns = line.split()
                if not columns:
                    continue
                if len(columns) < 4:
                    raise ValueError(
                        '{}:{}: expected at least 4 columns, got {}'.format(
                            src_path, line_no, len(columns)))
                dst.write(' '.join((columns[0], columns[2], columns[3])) + '\n')

    index_dirs = [
        'lucene-index-cord19-abstract-2020-05-01/',
        'lucene-index-cord19-full-text-2020-05-01/',
        'lucene-index-cord19-paragraph-2020-05-01/',
    ]
    searcher = SimpleFusionSearcher(index_dirs, method=FusionMethod.RRF)

    runs, topics = [], get_topics('covid_round2')
    for topic in tqdm(sorted(topics.keys())):
        # The query text is the topic's question followed by its keyword query.
        query = topics[topic]['question'] + ' ' + topics[topic]['query']
        hits = searcher.search(query, k=10000, query_generator=None,
                               strip_segment_id=True, remove_dups=True)
        docid_score_pair = [(hit.docid, hit.score) for hit in hits]
        run = TrecRun.from_search_results(docid_score_pair, topic=topic)
        runs.append(run)

    all_topics_run = TrecRun.concat(runs)
    all_topics_run.save_to_txt(output_path='fused.txt', tag='reciprocal_rank_fusion_k=60')

    # Only keep topic, docid and rank. Scores have different floating point
    # precisions. Done in Python rather than by shelling out to awk so that
    # a missing tool or input file cannot go unnoticed.
    project_run('fused.txt', 'this.txt')
    project_run('anserini.covid-r2.fusion1.txt', 'that.txt')

    # shallow=False forces a content comparison; the default may accept two
    # files on matching size and modification time alone.
    self.assertTrue(
        filecmp.cmp('this.txt', 'that.txt', shallow=False),
        'fused.txt and anserini.covid-r2.fusion1.txt differ on topic/docid/rank')
def test_simple_fusion_searcher(self):
    """Compare an RRF-fused run over three CORD-19 indexes to a stored run.

    Every ``covid-round2`` topic is searched with a ``SimpleFusionSearcher``
    built over the abstract, full-text and paragraph indexes under
    ``indexes/``. The concatenated results are saved to ``runs/fused.txt``
    and then compared with ``runs/anserini.covid-r2.fusion1.txt`` on the
    topic, docid and rank columns of the top-100 ranks.
    """

    def project_run(src_path, dst_path, max_rank):
        """Write columns 1, 3 and 4 of lines ranked <= max_rank to dst_path.

        Blank lines are skipped. A missing input file, a line with fewer
        than four columns, or a non-integer rank raises, so a broken input
        fails the test loudly instead of producing an empty file that
        compares equal.
        """
        with open(src_path, encoding='utf-8') as src, \
                open(dst_path, 'w', encoding='utf-8') as dst:
            for line_no, line in enumerate(src, start=1):
                columns = line.split()
                if not columns:
                    continue
                if len(columns) < 4:
                    raise ValueError(
                        '{}:{}: expected at least 4 columns, got {}'.format(
                            src_path, line_no, len(columns)))
                if int(columns[3]) > max_rank:
                    continue
                dst.write(' '.join((columns[0], columns[2], columns[3])) + '\n')

    index_dirs = [
        'indexes/lucene-index-cord19-abstract-2020-05-01/',
        'indexes/lucene-index-cord19-full-text-2020-05-01/',
        'indexes/lucene-index-cord19-paragraph-2020-05-01/',
    ]
    searcher = SimpleFusionSearcher(index_dirs, method=FusionMethod.RRF)

    runs, topics = [], get_topics('covid-round2')
    for topic in tqdm(sorted(topics.keys())):
        # The query text is the topic's question followed by its keyword query.
        query = topics[topic]['question'] + ' ' + topics[topic]['query']
        hits = searcher.search(query, k=10000, query_generator=None,
                               strip_segment_id=True, remove_dups=True)
        docid_score_pair = [(hit.docid, hit.score) for hit in hits]
        run = TrecRun.from_search_results(docid_score_pair, topic=topic)
        runs.append(run)

    all_topics_run = TrecRun.concat(runs)
    all_topics_run.save_to_txt(output_path='runs/fused.txt', tag='reciprocal_rank_fusion_k=60')

    # Only keep topic, docid, and rank. Scores may be slightly different due
    # to floating point precision issues and underlying lib versions.
    # This has proven to be a somewhat brittle test, see
    # https://github.com/castorini/pyserini/issues/947
    # As a stopgap for that issue, the comparison is restricted to the
    # top-100 ranks. The projection is done in Python rather than by
    # shelling out to awk, for portability and so that a missing tool or
    # input file cannot go unnoticed.
    project_run('runs/fused.txt', 'runs/this.txt', max_rank=100)
    project_run('runs/anserini.covid-r2.fusion1.txt', 'runs/that.txt', max_rank=100)

    # shallow=False forces a content comparison; the default may accept two
    # files on matching size and modification time alone.
    self.assertTrue(
        filecmp.cmp('runs/this.txt', 'runs/that.txt', shallow=False),
        'runs/fused.txt and runs/anserini.covid-r2.fusion1.txt differ on '
        'topic/docid/rank within the top 100')