def __init__(self, config, logger, vocab):
    """Set up MS MARCO and Microblog indices, doc stores and wrapped datasets.

    Resources are read from the "msmarco" and "microblog" directories that
    sit next to this dataset's own path (its last path component is dropped).

    Args:
        config: forwarded to the parent initializer and to
            ``msmarco_config`` / ``microblog_config``.
        logger: forwarded to the parent initializer and both wrapped datasets.
        vocab: forwarded to the parent initializer and both wrapped datasets.
    """
    super().__init__(config, logger, vocab)

    own_path = util.path_dataset(self)
    # Strip the final "/"-separated component to reach the shared parent.
    datasets_root = "/".join(own_path.split("/")[:-1])

    # MS MARCO: porter index, doctttttquery porter index, doc store.
    # NOTE: 'docs.sqllite' is the literal filename used here; keep the spelling.
    msmarco_dir = datasets_root + "/msmarco"
    ms_porter_path = os.path.join(msmarco_dir, 'anserini.porter')
    self.ms_index_stem = indices.AnseriniIndex(ms_porter_path, stemmer='porter')
    ms_d2q_path = os.path.join(msmarco_dir, 'anserini.doctttttquery.porter')
    self.ms_index_doctttttquery_stem = indices.AnseriniIndex(ms_d2q_path, stemmer='porter')
    ms_docs_path = os.path.join(msmarco_dir, 'docs.sqllite')
    self.ms_doc_store = indices.SqliteDocstore(ms_docs_path)

    # Microblog: porter index, unstemmed index, doc store.
    microblog_dir = datasets_root + "/microblog"
    mb_porter_path = os.path.join(microblog_dir, 'anserini.porter')
    self.mb_index_stem = indices.AnseriniIndex(mb_porter_path, stemmer='porter')
    mb_plain_path = os.path.join(microblog_dir, 'anserini')
    self.mb_index = indices.AnseriniIndex(mb_plain_path, stemmer='none')
    mb_docs_path = os.path.join(microblog_dir, 'docs.sqllite')
    self.mb_doc_store = indices.SqliteDocstore(mb_docs_path)

    # Wrapped datasets, each configured for the same subset as this one.
    ms_config = self.msmarco_config(self.config['subset'], config)
    self.msds = msmarco.MsmarcoDataset(ms_config, logger, vocab)
    mb_config = self.microblog_config(self.config['subset'], config)
    self.mbds = microblog.MicroblogDataset(mb_config, logger, vocab)
def __init__(self, config, vocab, logger):
    """Create the Spanish Anserini index and the SQLite doc store.

    Note the parameter order here is (config, vocab, logger), while the parent
    initializer is called with (config, logger, vocab).

    Args:
        config: forwarded to the parent initializer.
        vocab: forwarded to the parent initializer (as its third argument).
        logger: forwarded to the parent initializer (as its second argument).
    """
    super().__init__(config, logger, vocab)

    # The index language comes from this instance's _lang().
    spanish_index_path = os.path.join(util.path_dataset(self), 'anserini.es')
    self.index_spanish = indices.AnseriniIndex(spanish_index_path, lang=self._lang())

    docstore_path = os.path.join(util.path_dataset(self), 'docs.sqlite')
    self.doc_store = indices.SqliteDocstore(docstore_path)
def __init__(self, config, logger, vocab):
    """Set up MS MARCO and CORD indices, doc stores and wrapped datasets.

    Resources are read from the "msmarco" and "covid/2020-07-16" directories
    that sit next to this dataset's own path (its last component is dropped).

    Args:
        config: forwarded to the parent initializer and to ``msmarco_config``
            / ``cord_config``; its 'bs_field' and 'rr_field' entries select
            the primary field of the CORD indices and doc store respectively.
        logger: forwarded to the parent initializer and both wrapped datasets.
        vocab: forwarded to the parent initializer and both wrapped datasets.
    """
    super().__init__(config, logger, vocab)

    own_path = util.path_dataset(self)
    # Strip the final "/"-separated component to reach the shared parent.
    datasets_root = "/".join(own_path.split("/")[:-1])

    # MS MARCO: porter index, doctttttquery porter index, doc store.
    # NOTE: 'docs.sqllite' is the literal filename used here; keep the spelling.
    msmarco_dir = datasets_root + "/msmarco"
    ms_porter_path = os.path.join(msmarco_dir, 'anserini.porter')
    self.ms_index_stem = indices.AnseriniIndex(ms_porter_path, stemmer='porter')
    ms_d2q_path = os.path.join(msmarco_dir, 'anserini.doctttttquery.porter')
    self.ms_index_doctttttquery_stem = indices.AnseriniIndex(ms_d2q_path, stemmer='porter')
    ms_docs_path = os.path.join(msmarco_dir, 'docs.sqllite')
    self.ms_doc_store = indices.SqliteDocstore(ms_docs_path)

    # CORD: two multifield porter indices and a multifield doc store.
    cord_dir = datasets_root + "/covid/2020-07-16"
    cord_index_path = os.path.join(cord_dir, 'anserini_multifield')
    index_field = config['bs_field']
    self.cord_index_stem = indices.MultifieldAnseriniIndex(
        cord_index_path, stemmer='porter', primary_field=index_field)
    cord_2020_path = os.path.join(cord_dir, 'anserini_multifield_2020')
    self.cord_index_stem_2020 = indices.MultifieldAnseriniIndex(
        cord_2020_path, stemmer='porter', primary_field=index_field)
    cord_docs_path = os.path.join(cord_dir, 'docs_multifield.sqlite')
    self.cord_doc_store = indices.MultifieldSqliteDocstore(
        cord_docs_path, primary_field=config['rr_field'])

    # Wrapped datasets, each configured for the same subset as this one.
    ms_config = self.msmarco_config(self.config['subset'], config)
    self.msds = msmarco.MsmarcoDataset(ms_config, logger, vocab)
    cord_config = self.cord_config(self.config['subset'], config)
    self.cordds = covid.CovidDataset(cord_config, logger, vocab)
def __init__(self):
    """Replace the index and doc-store attributes with concrete index objects.

    Reads ``.path`` from ``self.index``, ``self.index_stem`` and
    ``self.docstore``, which must already be available on the instance once
    the parent initializer has run (where they come from is not visible here).
    """
    super().__init__()

    # Unstemmed index; overwrites the attribute the path was read from.
    full_index_path = self.index.path
    self.index = indices.AnseriniIndex(full_index_path, stemmer="none", name="fullindex")

    # Stemmed index; no explicit stemmer is passed for this one.
    stem_index_path = self.index_stem.path
    self.index_stem = indices.AnseriniIndex(stem_index_path, name="stemindex")

    # Path is read from `docstore`, the result is stored on `doc_store`.
    docstore_path = self.docstore.path
    self.doc_store = indices.SqliteDocstore(docstore_path)
def __init__(self, config, logger, vocab):
    """Create the porter-stemmed index and the SQLite doc store.

    Both live directly under this dataset's own path.

    Args:
        config: forwarded to the parent initializer.
        logger: forwarded to the parent initializer.
        vocab: forwarded to the parent initializer.
    """
    super().__init__(config, logger, vocab)

    dataset_dir = util.path_dataset(self)

    porter_index_path = os.path.join(dataset_dir, 'anserini.porter')
    self.index_stem = indices.AnseriniIndex(porter_index_path, stemmer='porter')

    # NOTE: 'docs.sqllite' is the literal filename used here; keep the spelling.
    docstore_path = os.path.join(dataset_dir, 'docs.sqllite')
    self.doc_store = indices.SqliteDocstore(docstore_path)
def __init__(self, config, logger, vocab):
    """Create the per-subset indices and doc store, making the directory if needed.

    All resources live in a sub-directory of this dataset's path named after
    ``config['subset']``.

    Args:
        config: forwarded to the parent initializer; its 'subset' entry names
            the sub-directory that holds the resources.
        logger: forwarded to the parent initializer.
        vocab: forwarded to the parent initializer.
    """
    super().__init__(config, logger, vocab)

    subset_dir = os.path.join(util.path_dataset(self), config['subset'])
    # Ensure the directory exists before any index is pointed at it.
    os.makedirs(subset_dir, exist_ok=True)

    plain_index_path = os.path.join(subset_dir, 'anserini')
    self.index = indices.AnseriniIndex(plain_index_path, stemmer='none')

    porter_index_path = os.path.join(subset_dir, 'anserini.porter')
    self.index_stem = indices.AnseriniIndex(porter_index_path, stemmer='porter')

    # NOTE: 'docs.sqllite' is the literal filename used here; keep the spelling.
    docstore_path = os.path.join(subset_dir, 'docs.sqllite')
    self.doc_store = indices.SqliteDocstore(docstore_path)
def test_build(self):
    """Build each index type from the dummy data file and verify the result.

    For every index: it must report not built before ``build`` and built
    afterwards, and ``num_docs()`` must equal the number of input documents.
    For indices flagged as storing raw documents, ``get_raw`` must return each
    document's original text.
    """
    rows = plaintext.read_tsv('etc/dummy_datafile.tsv')
    # Only rows whose first column is 'doc' become documents.
    docs = [indices.RawDoc(doc_id, text) for kind, doc_id, text in rows if kind == 'doc']

    with tempfile.TemporaryDirectory() as workdir:
        # All three indices are created up front, in this order.
        plain_index = indices.AnseriniIndex(os.path.join(workdir, 'anserini'))
        raw_index = indices.AnseriniIndex(
            os.path.join(workdir, 'anserini.rawdocs'), store_raw_docs=True)
        docstore = indices.SqliteDocstore(os.path.join(workdir, 'sqlite'))

        # (index under test, whether raw document text should be retrievable)
        cases = [
            (plain_index, False),
            (raw_index, True),
            (docstore, True),
        ]

        for candidate, expects_raw in cases:
            with self.subTest(index=candidate):
                self.assertFalse(candidate.built())
                candidate.build(iter(docs))
                self.assertTrue(candidate.built())
                self.assertEqual(candidate.num_docs(), len(docs))
                if not expects_raw:
                    continue
                for doc in docs:
                    self.assertEqual(candidate.get_raw(doc.did), doc.data['text'])
def execute(self):
    """Initialize the SQLite doc store at ``self.path`` from the collection.

    The documents come from ``_iter_collection`` applied to
    ``self.collection.path``.
    """
    docstore = indices.SqliteDocstore(self.path)
    documents = _iter_collection(self.collection.path)
    # NOTE(review): the meaning of the trailing True flag is defined by
    # _init_indices_parallel, which is not visible here.
    _init_indices_parallel([docstore], documents, True)