def from_topics(cls, topics_path: str):
    """Build an instance from a topics file or from a topic-set identifier.

    If ``topics_path`` exists on disk, the parser is picked from the path:

    * ``.json`` -- loaded with :func:`json.load`;
    * ``.tsv``  -- read with Anserini's ``TsvIntTopicReader``;
    * ``.trec`` -- read with Anserini's ``TrecTopicReader``;
    * any other path containing ``cacm`` -- read with ``CacmTopicReader``.

    If the path does not exist, the string is handed to ``get_topics``
    unchanged.

    Args:
        topics_path: Path to a topics file, or a string accepted by
            ``get_topics``.

    Returns:
        ``cls(topics, order)``, where ``order`` comes from
        ``QueryIterator.get_predefined_order(topics_path)``.

    Raises:
        NotImplementedError: The file exists but none of the rules above
            match its path.
        FileNotFoundError: The loaded topics are empty/falsy.
    """
    if os.path.exists(topics_path):
        if topics_path.endswith('.json'):
            # JSON text is UTF-8 by specification; decode explicitly rather
            # than relying on the platform's default locale encoding.
            with open(topics_path, 'r', encoding='utf-8') as f:
                topics = json.load(f)
        elif topics_path.endswith('.tsv'):
            topics = get_topics_with_reader(
                'io.anserini.search.topicreader.TsvIntTopicReader', topics_path)
        elif topics_path.endswith('.trec'):
            topics = get_topics_with_reader(
                'io.anserini.search.topicreader.TrecTopicReader', topics_path)
        elif 'cacm' in topics_path:
            # Substring match on the whole path, checked only after the
            # extension-based rules have failed.
            topics = get_topics_with_reader(
                'io.anserini.search.topicreader.CacmTopicReader', topics_path)
        else:
            raise NotImplementedError(
                f"Not sure how to parse {topics_path}. Please specify the file extension."
            )
    else:
        topics = get_topics(topics_path)

    if not topics:
        raise FileNotFoundError(f'Topic {topics_path} Not Found')

    order = QueryIterator.get_predefined_order(topics_path)
    return cls(topics, order)
def test_trec_topicreader(self):
    """Load the Robust04 topics file with TrecTopicReader and verify it."""
    # Working directory is the repo root when run from the command line,
    # but tests/ when run from an IDE, so fall back to the parent directory.
    relative = 'tools/topics-and-qrels/topics.robust04.txt'
    topics_file = relative if os.path.exists(relative) else f'../{relative}'
    self.assertTrue(os.path.exists(topics_file))

    reader_class = 'io.anserini.search.topicreader.TrecTopicReader'
    loaded = search.get_topics_with_reader(reader_class, topics_file)

    self.assertEqual(len(loaded), 250)
    first_key = next(iter(loaded.keys()))
    self.assertTrue(isinstance(first_key, int))
    # Reading the file directly must agree with get_topics('robust04').
    self.assertEqual(search.get_topics('robust04'), loaded)
def test_tsv_int_topicreader(self):
    """Load the MS MARCO doc dev topics with TsvIntTopicReader and verify it."""
    # Working directory is the repo root when run from the command line,
    # but tests/ when run from an IDE, so fall back to the parent directory.
    relative = 'tools/topics-and-qrels/topics.msmarco-doc.dev.txt'
    topics_file = relative if os.path.exists(relative) else f'../{relative}'
    self.assertTrue(os.path.exists(topics_file))

    reader_class = 'io.anserini.search.topicreader.TsvIntTopicReader'
    loaded = search.get_topics_with_reader(reader_class, topics_file)

    self.assertEqual(len(loaded), 5193)
    first_key = next(iter(loaded.keys()))
    self.assertTrue(isinstance(first_key, int))
    # Reading the file directly must agree with get_topics('msmarco_doc_dev').
    self.assertEqual(search.get_topics('msmarco_doc_dev'), loaded)