Example #1
    def __init__(self, config, logger, vocab):
        super().__init__(config, logger, vocab)
        base_path = util.path_dataset(self)

        global_base_path = "/".join(base_path.split("/")[:-1])
        # set up the MS MARCO indices and document store
        _base_path = global_base_path + "/msmarco"
        self.ms_index_stem = indices.AnseriniIndex(
            os.path.join(_base_path, 'anserini.porter'), stemmer='porter')
        self.ms_index_doctttttquery_stem = indices.AnseriniIndex(
            os.path.join(_base_path, 'anserini.doctttttquery.porter'),
            stemmer='porter')
        self.ms_doc_store = indices.SqliteDocstore(
            os.path.join(_base_path, 'docs.sqllite'))

        # set up the CORD-19 indices and document store
        _base_path = global_base_path + "/covid/2020-07-16"
        self.cord_index_stem = indices.MultifieldAnseriniIndex(
            os.path.join(_base_path, 'anserini_multifield'),
            stemmer='porter',
            primary_field=config['bs_field'])
        self.cord_index_stem_2020 = indices.MultifieldAnseriniIndex(
            os.path.join(_base_path, 'anserini_multifield_2020'),
            stemmer='porter',
            primary_field=config['bs_field'])
        self.cord_doc_store = indices.MultifieldSqliteDocstore(
            os.path.join(_base_path, 'docs_multifield.sqlite'),
            primary_field=config['rr_field'])

        self.msds = msmarco.MsmarcoDataset(
            self.msmarco_config(self.config['subset'], config), logger, vocab)
        self.cordds = covid.CovidDataset(
            self.cord_config(self.config['subset'], config), logger, vocab)
Example #2
    def __init__(self, config, logger, vocab):
        super().__init__(config, logger, vocab)
        base_path = util.path_dataset(self)

        global_base_path = "/".join(base_path.split("/")[:-1])
        # set up the MS MARCO indices and document store
        _base_path = global_base_path + "/msmarco"
        self.ms_index_stem = indices.AnseriniIndex(
            os.path.join(_base_path, 'anserini.porter'), stemmer='porter')
        self.ms_index_doctttttquery_stem = indices.AnseriniIndex(
            os.path.join(_base_path, 'anserini.doctttttquery.porter'),
            stemmer='porter')
        self.ms_doc_store = indices.SqliteDocstore(
            os.path.join(_base_path, 'docs.sqllite'))

        # set up the microblog indices and document store
        _base_path = global_base_path + "/microblog"
        self.mb_index_stem = indices.AnseriniIndex(
            os.path.join(_base_path, 'anserini.porter'), stemmer='porter')
        self.mb_index = indices.AnseriniIndex(
            os.path.join(_base_path, 'anserini'), stemmer='none')
        self.mb_doc_store = indices.SqliteDocstore(
            os.path.join(_base_path, 'docs.sqllite'))

        self.msds = msmarco.MsmarcoDataset(
            self.msmarco_config(self.config['subset'], config), logger, vocab)
        self.mbds = microblog.MicroblogDataset(
            self.microblog_config(self.config['subset'], config), logger,
            vocab)
Example #3
    def __init__(self):
        super().__init__()
        # Re-open the indices configured by the parent class, giving the
        # unstemmed and stemmed indices explicit names.
        self.index = indices.AnseriniIndex(self.index.path,
                                           stemmer="none",
                                           name="fullindex")
        self.index_stem = indices.AnseriniIndex(self.index_stem.path,
                                                name="stemindex")
        self.doc_store = indices.SqliteDocstore(self.docstore.path)
Example #4
    def __init__(self, config, logger, vocab):
        super().__init__(config, logger, vocab)
        base_path = util.path_dataset(self)
        self.index = indices.AnseriniIndex(
            os.path.join(base_path, 'anserini'), stemmer='none')
        self.index_stem = indices.AnseriniIndex(
            os.path.join(base_path, 'anserini.porter'), stemmer='porter')
        self.doc_store = indices.SqliteDocstore(
            os.path.join(base_path, 'docs.sqllite'))
Example #5
    def __init__(self, config, logger, vocab):
        super().__init__(config, logger, vocab)
        self.index_spanish = indices.AnseriniIndex(
            os.path.join(util.path_dataset(self), 'anserini.es'),
            lang=self._lang())
        self.doc_store = indices.SqliteDocstore(
            os.path.join(util.path_dataset(self), 'docs.sqlite'))
Example #6
    def test_build(self):
        df = plaintext.read_tsv('etc/dummy_datafile.tsv')
        docs = [indices.RawDoc(did, dtext) for t, did, dtext in df if t == 'doc']
        with tempfile.TemporaryDirectory() as tmpdir:
            idxs = [
                (indices.AnseriniIndex(os.path.join(tmpdir, 'anserini')), False),
                (indices.AnseriniIndex(os.path.join(tmpdir, 'anserini.rawdocs'),
                                       store_raw_docs=True), True),
                (indices.SqliteDocstore(os.path.join(tmpdir, 'sqlite')), True),
            ]
            for index, check_raw_docs in idxs:
                with self.subTest(index=index):
                    self.assertFalse(index.built())
                    index.build(iter(docs))
                    self.assertTrue(index.built())
                    self.assertEqual(index.num_docs(), len(docs))
                    if check_raw_docs:
                        for doc in docs:
                            self.assertEqual(index.get_raw(doc.did),
                                             doc.data['text'])
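
The same build-and-lookup API that this test exercises can be used directly. A minimal sketch, assuming the indices module ships with OpenNIR (onir.indices) and using made-up documents in place of the dummy data file:

    import os
    import tempfile

    from onir import indices  # assumption: the module used throughout these examples

    docs = [indices.RawDoc('d1', 'hello world'),
            indices.RawDoc('d2', 'anserini builds lucene indices')]
    with tempfile.TemporaryDirectory() as tmpdir:
        index = indices.AnseriniIndex(os.path.join(tmpdir, 'anserini'),
                                      store_raw_docs=True)
        if not index.built():        # built() reports whether build() already ran
            index.build(iter(docs))  # build() consumes an iterable of RawDoc
        print(index.num_docs())      # -> 2
        print(index.get_raw('d2'))   # per the test, raw text is only kept
                                     # when store_raw_docs=True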
Example #7
    def _init_iter_collection(self):
        # Use the trick from capreolus: pull document content out of an
        # existing public index rather than re-crawling the collection.
        # <https://github.com/capreolus-ir/capreolus/blob/d6ae210b24c32ff817f615370a9af37b06d2da89/capreolus/collection/robust04.yaml#L15>
        index = indices.AnseriniIndex('../Tweets2013')
        for did in self.logger.pbar(index.docids(), desc='documents'):
            raw_doc = index.get_raw(did)
            # Extract the tweet text from the raw JSON with a lightweight
            # regex rather than parsing the full document with json.loads.
            pattern = '"text":"(.*?)","source":'
            raw_txt = re.search(pattern, raw_doc).group(1)
            yield indices.RawDoc(did, raw_txt)
Example #8
    def _init_iter_collection(self):
        # Use the trick from capreolus: pull document content out of an
        # existing public index rather than re-crawling the collection.
        # <https://github.com/capreolus-ir/capreolus/blob/d6ae210b24c32ff817f615370a9af37b06d2da89/capreolus/collection/robust04.yaml#L15>
        with util.download_tmp(**_FILES['index']) as f:
            fd = f'{f.name}.d'
            util.extract_tarball(f.name, fd, self.logger,
                                 reset_permissions=True)
            index = indices.AnseriniIndex(f'{fd}/index-robust04-20191213')
            for did in self.logger.pbar(index.docids(), desc='documents'):
                raw_doc = index.get_raw(did)
                yield indices.RawDoc(did, raw_doc)
Example #9
    def test_batch_query(self):
        df = list(plaintext.read_tsv('etc/dummy_datafile.tsv'))
        docs = [indices.RawDoc(did, dtext) for t, did, dtext in df if t == 'doc']
        queries = [(qid, qtext) for t, qid, qtext in df if t == 'query']
        with tempfile.TemporaryDirectory() as tmpdir:
            idxs = [
                indices.AnseriniIndex(os.path.join(tmpdir, 'anserini')),
            ]
            # Model strings encode retrieval parameters as suffixes,
            # e.g. 'bm25_k1-1.6_b-0.8' runs BM25 with k1=1.6 and b=0.8.
            models = [
                'bm25', 'bm25_k1-1.5', 'bm25_b-0.2', 'bm25_k1-1.6_b-0.8',
                'bm25_rm3', 'bm25_rm3_k1-1.5', 'bm25_rm3_b-0.2',
                'bm25_rm3_k1-1.6_b-0.8',
                'bm25_rm3_rm3.fbTerms-2_rm3.fbDocs-2',
                'bm25_rm3_rm3.fbTerms-2_rm3.fbDocs-2_k1-1.5',
                'bm25_rm3_rm3.fbTerms-2_rm3.fbDocs-2_b-0.2',
                'bm25_rm3_rm3.fbTerms-2_rm3.fbDocs-2_k1-1.6_b-0.8',
                'ql', 'ql_mu-0.4',
                'sdm', 'sdm_uw-0.3_ow-0.2_tw-0.5',
            ]
            for index in idxs:
                index.build(docs)
                for model in models:
                    with self.subTest(index=index, model=model):
                        index.batch_query(queries, model, topk=10)
                        index.batch_query(queries, model, topk=10, quiet=True)
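
Outside of a unittest, the retrieval path is the same: build, then batch_query. A minimal sketch, again assuming onir.indices; the test above discards the results, so result handling is omitted here as well:

    import os
    import tempfile

    from onir import indices  # assumption, as above

    docs = [indices.RawDoc('d1', 'the quick brown fox'),
            indices.RawDoc('d2', 'a lazy dog sleeps')]
    queries = [('q1', 'fox'), ('q2', 'dog')]
    with tempfile.TemporaryDirectory() as tmpdir:
        index = indices.AnseriniIndex(os.path.join(tmpdir, 'anserini'))
        index.build(docs)
        # 'bm25_k1-1.6_b-0.8' selects BM25 with k1=1.6 and b=0.8;
        # 'bm25_rm3' would add RM3 query expansion on top.
        index.batch_query(queries, 'bm25_k1-1.6_b-0.8', topk=10)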
Example #10
    def __init__(self, config, logger, vocab):
        super().__init__(config, logger, vocab)
        if config['ds']:
            ds = ir_datasets.load(config['ds'])
            if not config['docs_ds']:
                # HACK: find "parent" dataset that contains same docs handler so we don't re-build the index for the same collection
                segments = config['ds'].split('/')
                docs_handler = ds.docs_handler()
                parent_docs_ds = config['ds']
                while len(segments) > 1:
                    segments = segments[:-1]
                    parent_ds = ir_datasets.load('/'.join(segments))
                    if (parent_ds.has_docs()
                            and parent_ds.docs_handler() == docs_handler):
                        parent_docs_ds = '/'.join(segments)
                config['docs_ds'] = parent_docs_ds
            if not config['queries_ds']:
                config['queries_ds'] = config['ds']

        if config['doc_fields']:
            if not config['docs_index_fields']:
                config['docs_index_fields'] = config['doc_fields']
            if not config['docs_rerank_fields']:
                config['docs_rerank_fields'] = config['doc_fields']

        if config['query_fields']:
            if not config['queries_index_fields']:
                config['queries_index_fields'] = config['query_fields']
            if not config['queries_rerank_fields']:
                config['queries_rerank_fields'] = config['query_fields']

        self.docs_ds = ir_datasets.load(config['docs_ds'])
        self.queries_ds = ir_datasets.load(config['queries_ds'])

        assert self.docs_ds.has_docs()
        assert self.queries_ds.has_queries()

        if not config['docs_index_fields']:
            config['docs_index_fields'] = ','.join(
                self.docs_ds.docs_cls()._fields[1:])
            self.logger.info('auto-filled docs_index_fields as '
                             '{docs_index_fields}'.format(**config))
        if not config['docs_rerank_fields']:
            config['docs_rerank_fields'] = ','.join(
                self.docs_ds.docs_cls()._fields[1:])
            self.logger.info('auto-filled docs_rerank_fields as '
                             '{docs_rerank_fields}'.format(**config))
        if not config['queries_index_fields']:
            config['queries_index_fields'] = ','.join(
                self.queries_ds.queries_cls()._fields[1:])
            self.logger.info('auto-filled queries_index_fields as '
                             '{queries_index_fields}'.format(**config))
        if not config['queries_rerank_fields']:
            config['queries_rerank_fields'] = ','.join(
                self.queries_ds.queries_cls()._fields[1:])
            self.logger.info('auto-filled queries_rerank_fields as '
                             '{queries_rerank_fields}'.format(**config))

        base_path = os.path.join(util.path_dataset(self),
                                 sanitize_path(self.config['docs_ds']))
        os.makedirs(base_path, exist_ok=True)
        real_anserini_path = os.path.join(
            base_path,
            'anserini.porter.{docs_index_fields}'.format(**self.config))
        os.makedirs(real_anserini_path, exist_ok=True)
        virtual_anserini_path = '{}.{}'.format(
            real_anserini_path, sanitize_path(config['queries_ds']))
        if not os.path.exists(virtual_anserini_path):
            os.symlink(real_anserini_path,
                       virtual_anserini_path,
                       target_is_directory=True)
        self.index = indices.AnseriniIndex(virtual_anserini_path,
                                           stemmer='porter')
        self.doc_store = indices.IrdsDocstore(self.docs_ds.docs_store(),
                                              config['docs_rerank_fields'])
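
To make the fallback logic concrete: when only ds is given, the loop walks up the dataset ID until it finds the shallowest ancestor that shares the same docs handler, so the index is built once per collection rather than once per subset. A hypothetical configuration sketch (msmarco-passage/dev is a real ir_datasets ID; empty strings stand for unset values, as the falsiness checks above suggest):

    config = {
        'ds': 'msmarco-passage/dev',  # queries and qrels come from here
        'docs_ds': '',       # auto-resolved to 'msmarco-passage', the parent
                             # dataset that shares the same docs handler
        'queries_ds': '',    # defaults to config['ds']
        'doc_fields': '',
        'docs_index_fields': '',     # auto-filled from docs_cls()._fields[1:]
        'docs_rerank_fields': '',
        'query_fields': '',
        'queries_index_fields': '',  # auto-filled from queries_cls()._fields[1:]
        'queries_rerank_fields': '',
    }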
Example #11
    def execute(self):
        idxs = [indices.AnseriniIndex(self.path, stemmer=self.stemmer)]
        _init_indices_parallel(idxs, _iter_collection(self.collection.path),
                               True)