Exemple #1
0
    def __init__(self, config, logger, vocab):
        """Set up the MS MARCO and Microblog resources used by this dataset.

        Both collections are looked up in sibling directories of this
        dataset's own directory, i.e. ``<parent>/msmarco`` and
        ``<parent>/microblog``.

        Args:
            config: configuration mapping; passed to the parent constructor
                and to the ``msmarco_config`` / ``microblog_config`` helpers.
            logger: logger passed to the parent constructor and to both
                wrapped datasets.
            vocab: vocabulary passed to the parent constructor and to both
                wrapped datasets.
        """
        super().__init__(config, logger, vocab)

        # os.path.dirname/os.path.join replace the manual
        # "/".join(path.split("/")[:-1]) + "/name" construction, which only
        # worked with "/" separators and produced an absolute "/name" path
        # when the dataset path had a single relative component.
        datasets_root = os.path.dirname(util.path_dataset(self))

        # MS MARCO resources.
        # NOTE: 'docs.sqllite' (sic) is the on-disk file name; keep it as is.
        msmarco_dir = os.path.join(datasets_root, 'msmarco')
        self.ms_index_stem = indices.AnseriniIndex(
            os.path.join(msmarco_dir, 'anserini.porter'),
            stemmer='porter')
        self.ms_index_doctttttquery_stem = indices.AnseriniIndex(
            os.path.join(msmarco_dir, 'anserini.doctttttquery.porter'),
            stemmer='porter')
        self.ms_doc_store = indices.SqliteDocstore(
            os.path.join(msmarco_dir, 'docs.sqllite'))

        # Microblog resources: one stemmed and one unstemmed index.
        microblog_dir = os.path.join(datasets_root, 'microblog')
        self.mb_index_stem = indices.AnseriniIndex(
            os.path.join(microblog_dir, 'anserini.porter'),
            stemmer='porter')
        self.mb_index = indices.AnseriniIndex(
            os.path.join(microblog_dir, 'anserini'),
            stemmer='none')
        self.mb_doc_store = indices.SqliteDocstore(
            os.path.join(microblog_dir, 'docs.sqllite'))

        # Wrapped datasets; each gets a config derived from this dataset's
        # 'subset' setting (self.config is presumably set by the parent
        # constructor -- confirm against the base class).
        self.msds = msmarco.MsmarcoDataset(
            self.msmarco_config(self.config['subset'], config), logger, vocab)
        self.mbds = microblog.MicroblogDataset(
            self.microblog_config(self.config['subset'], config), logger,
            vocab)
Exemple #2
0
 def __init__(self, config, vocab, logger):
     """Create the Spanish Anserini index and the document store.

     Note the parameter order here is (config, vocab, logger), while the
     parent constructor is called with (config, logger, vocab).

     Args:
         config: configuration passed to the parent constructor.
         vocab: vocabulary passed to the parent constructor.
         logger: logger passed to the parent constructor.
     """
     super().__init__(config, logger, vocab)

     # Index located at <dataset dir>/anserini.es, using the language
     # reported by self._lang().
     spanish_index_dir = os.path.join(util.path_dataset(self), 'anserini.es')
     self.index_spanish = indices.AnseriniIndex(spanish_index_dir,
                                                lang=self._lang())

     # Document store located at <dataset dir>/docs.sqlite.
     docstore_file = os.path.join(util.path_dataset(self), 'docs.sqlite')
     self.doc_store = indices.SqliteDocstore(docstore_file)
Exemple #3
0
    def __init__(self, config, logger, vocab):
        """Set up the MS MARCO and CORD-19 resources used by this dataset.

        Both collections are looked up relative to the parent of this
        dataset's own directory: ``<parent>/msmarco`` and
        ``<parent>/covid/2020-07-16``.

        Args:
            config: configuration mapping; passed to the parent constructor
                and to the ``msmarco_config`` / ``cord_config`` helpers. Its
                'bs_field' and 'rr_field' entries select the primary field
                of the multifield indices and of the multifield document
                store respectively.
            logger: logger passed to the parent constructor and to both
                wrapped datasets.
            vocab: vocabulary passed to the parent constructor and to both
                wrapped datasets.
        """
        super().__init__(config, logger, vocab)

        # os.path.dirname/os.path.join replace the manual
        # "/".join(path.split("/")[:-1]) + "/name" construction, which only
        # worked with "/" separators and produced an absolute "/name" path
        # when the dataset path had a single relative component.
        datasets_root = os.path.dirname(util.path_dataset(self))

        # MS MARCO resources.
        # NOTE: 'docs.sqllite' (sic) is the on-disk file name; keep it as is.
        msmarco_dir = os.path.join(datasets_root, 'msmarco')
        self.ms_index_stem = indices.AnseriniIndex(
            os.path.join(msmarco_dir, 'anserini.porter'),
            stemmer='porter')
        self.ms_index_doctttttquery_stem = indices.AnseriniIndex(
            os.path.join(msmarco_dir, 'anserini.doctttttquery.porter'),
            stemmer='porter')
        self.ms_doc_store = indices.SqliteDocstore(
            os.path.join(msmarco_dir, 'docs.sqllite'))

        # CORD-19 resources from the 2020-07-16 directory.
        cord_dir = os.path.join(datasets_root, 'covid', '2020-07-16')
        self.cord_index_stem = indices.MultifieldAnseriniIndex(
            os.path.join(cord_dir, 'anserini_multifield'),
            stemmer='porter',
            primary_field=config['bs_field'])
        self.cord_index_stem_2020 = indices.MultifieldAnseriniIndex(
            os.path.join(cord_dir, 'anserini_multifield_2020'),
            stemmer='porter',
            primary_field=config['bs_field'])
        self.cord_doc_store = indices.MultifieldSqliteDocstore(
            os.path.join(cord_dir, 'docs_multifield.sqlite'),
            primary_field=config['rr_field'])

        # Wrapped datasets; each gets a config derived from this dataset's
        # 'subset' setting (self.config is presumably set by the parent
        # constructor -- confirm against the base class).
        self.msds = msmarco.MsmarcoDataset(
            self.msmarco_config(self.config['subset'], config), logger, vocab)
        self.cordds = covid.CovidDataset(
            self.cord_config(self.config['subset'], config), logger, vocab)
Exemple #4
0
 def __init__(self):
     """Build index and docstore objects from paths found on the instance.

     After the parent constructor runs, ``self.index``, ``self.index_stem``
     and ``self.docstore`` are each read for a ``.path`` attribute. The
     first two attributes are then rebound to ``AnseriniIndex`` objects,
     and ``self.doc_store`` (note the different spelling from
     ``self.docstore``) is bound to a ``SqliteDocstore``.
     """
     super().__init__()

     # Unstemmed index.
     full_index_path = self.index.path
     self.index = indices.AnseriniIndex(
         full_index_path, stemmer="none", name="fullindex")

     # Second index; no stemmer argument is passed for this one.
     stem_index_path = self.index_stem.path
     self.index_stem = indices.AnseriniIndex(
         stem_index_path, name="stemindex")

     docstore_path = self.docstore.path
     self.doc_store = indices.SqliteDocstore(docstore_path)
Exemple #5
0
 def __init__(self, config, logger, vocab):
     """Create the Porter-stemmed index and the document store.

     Both live directly under the directory returned by
     ``util.path_dataset(self)``.

     Args:
         config: configuration passed to the parent constructor.
         logger: logger passed to the parent constructor.
         vocab: vocabulary passed to the parent constructor.
     """
     super().__init__(config, logger, vocab)
     dataset_dir = util.path_dataset(self)
     stem_index_dir = os.path.join(dataset_dir, 'anserini.porter')
     # NOTE: 'docs.sqllite' (sic) is the on-disk file name.
     docstore_file = os.path.join(dataset_dir, 'docs.sqllite')

     self.index_stem = indices.AnseriniIndex(stem_index_dir, stemmer='porter')
     self.doc_store = indices.SqliteDocstore(docstore_file)
Exemple #6
0
 def __init__(self, config, logger, vocab):
     """Create per-subset indices and the document store.

     Resources live in ``<dataset dir>/<config['subset']>``; that directory
     is created if it does not exist yet.

     Args:
         config: configuration mapping; passed to the parent constructor,
             and its 'subset' entry names the sub-directory to use.
         logger: logger passed to the parent constructor.
         vocab: vocabulary passed to the parent constructor.
     """
     super().__init__(config, logger, vocab)

     subset_dir = os.path.join(util.path_dataset(self), config['subset'])
     os.makedirs(subset_dir, exist_ok=True)

     plain_index_dir = os.path.join(subset_dir, 'anserini')
     stem_index_dir = os.path.join(subset_dir, 'anserini.porter')
     # NOTE: 'docs.sqllite' (sic) is the on-disk file name.
     docstore_file = os.path.join(subset_dir, 'docs.sqllite')

     self.index = indices.AnseriniIndex(plain_index_dir, stemmer='none')
     self.index_stem = indices.AnseriniIndex(stem_index_dir, stemmer='porter')
     self.doc_store = indices.SqliteDocstore(docstore_file)
Exemple #7
0
 def test_build(self):
     """Check that each index type can be built from the dummy data file.

     For every index under test: it must report ``built()`` as false before
     building and true afterwards, and ``num_docs()`` must equal the number
     of input documents. For indices flagged as keeping raw documents,
     ``get_raw`` must return each document's original text.
     """
     rows = plaintext.read_tsv('etc/dummy_datafile.tsv')
     # Keep only the rows whose first column marks them as documents.
     docs = []
     for kind, did, dtext in rows:
         if kind == 'doc':
             docs.append(indices.RawDoc(did, dtext))

     with tempfile.TemporaryDirectory() as tmpdir:
         plain_index = indices.AnseriniIndex(os.path.join(tmpdir, 'anserini'))
         rawdocs_index = indices.AnseriniIndex(
             os.path.join(tmpdir, 'anserini.rawdocs'), store_raw_docs=True)
         docstore = indices.SqliteDocstore(os.path.join(tmpdir, 'sqlite'))

         # (index under test, whether raw text retrieval is also verified)
         cases = (
             (plain_index, False),
             (rawdocs_index, True),
             (docstore, True),
         )
         for index, check_raw_docs in cases:
             with self.subTest(index=index):
                 self.assertFalse(index.built())
                 index.build(iter(docs))
                 self.assertTrue(index.built())
                 self.assertEqual(index.num_docs(), len(docs))
                 if not check_raw_docs:
                     continue
                 for doc in docs:
                     self.assertEqual(index.get_raw(doc.did), doc.data['text'])
Exemple #8
0
 def execute(self):
     """Initialise a SQLite document store from this object's collection.

     A ``SqliteDocstore`` at ``self.path`` is handed, as a one-element list,
     to ``_init_indices_parallel`` together with the documents produced by
     ``_iter_collection(self.collection.path)``; the third argument is
     passed as ``True``.
     """
     docstore = indices.SqliteDocstore(self.path)
     documents = _iter_collection(self.collection.path)
     _init_indices_parallel([docstore], documents, True)