Example #1
 def __init__(self, config, logger, vocab):
     super().__init__(config, logger, vocab)
     base_path = util.path_dataset(self)
     self.index_spanish = indices.AnseriniIndex(
         os.path.join(base_path, 'anserini.es'), lang=self._lang())
     self.doc_store = indices.SqliteDocstore(
         os.path.join(base_path, 'docs.sqlite'))
Example #2
    def __init__(self, config, logger, vocab):
        super().__init__(config, logger, vocab)
        base_path = util.path_dataset(self)

        global_base_path = os.path.dirname(base_path)
        # set up msmarco
        _base_path = os.path.join(global_base_path, 'msmarco')
        self.ms_index_stem = indices.AnseriniIndex(
            os.path.join(_base_path, 'anserini.porter'), stemmer='porter')
        self.ms_index_doctttttquery_stem = indices.AnseriniIndex(
            os.path.join(_base_path, 'anserini.doctttttquery.porter'),
            stemmer='porter')
        self.ms_doc_store = indices.SqliteDocstore(
            os.path.join(_base_path, 'docs.sqllite'))

        # set up cord
        _base_path = os.path.join(global_base_path, 'covid', '2020-07-16')
        self.cord_index_stem = indices.MultifieldAnseriniIndex(
            os.path.join(_base_path, 'anserini_multifield'),
            stemmer='porter',
            primary_field=config['bs_field'])
        self.cord_index_stem_2020 = indices.MultifieldAnseriniIndex(
            os.path.join(_base_path, 'anserini_multifield_2020'),
            stemmer='porter',
            primary_field=config['bs_field'])
        self.cord_doc_store = indices.MultifieldSqliteDocstore(
            os.path.join(_base_path, 'docs_multifield.sqlite'),
            primary_field=config['rr_field'])

        self.msds = msmarco.MsmarcoDataset(
            self.msmarco_config(self.config['subset'], config), logger, vocab)
        self.cordds = covid.CovidDataset(
            self.cord_config(self.config['subset'], config), logger, vocab)
Example #3
 def _confirm_dua(self):
     if self._has_confirmed_dua is None and self.DUA is not None:
         self._has_confirmed_dua = util.confirm(
             self.DUA.format(ds_path=util.path_dataset(self)))
     return self._has_confirmed_dua
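The pattern here is "ask once, cache the answer": the DUA prompt is shown at most once per run, and every later call reuses the stored result. A minimal standalone sketch of the same idea (the prompt handling below is made up for illustration, not the onir API):

_confirmed = None

def confirm_once(prompt):
    # Ask the user only on the first call; reuse the cached answer afterwards.
    global _confirmed
    if _confirmed is None:
        _confirmed = input(f'{prompt} [y/n] ').strip().lower() == 'y'
    return _confirmed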
Example #4
 def _load_topics(self):
     result = {}
     for item, qid, text in plaintext.read_tsv(
             os.path.join(util.path_dataset(self), 'topics.txt')):
         if item == 'topic':
             result[qid] = text
     return result
Example #5
    def __init__(self, config, logger, vocab):
        super().__init__(config, logger, vocab)
        base_path = util.path_dataset(self)

        global_base_path = os.path.dirname(base_path)
        # set up msmarco
        _base_path = os.path.join(global_base_path, 'msmarco')
        self.ms_index_stem = indices.AnseriniIndex(
            os.path.join(_base_path, 'anserini.porter'), stemmer='porter')
        self.ms_index_doctttttquery_stem = indices.AnseriniIndex(
            os.path.join(_base_path, 'anserini.doctttttquery.porter'),
            stemmer='porter')
        self.ms_doc_store = indices.SqliteDocstore(
            os.path.join(_base_path, 'docs.sqllite'))

        # set up microblog
        _base_path = os.path.join(global_base_path, 'microblog')
        self.mb_index_stem = indices.AnseriniIndex(
            os.path.join(_base_path, 'anserini.porter'), stemmer='porter')
        self.mb_index = indices.AnseriniIndex(
            os.path.join(_base_path, 'anserini'), stemmer='none')
        self.mb_doc_store = indices.SqliteDocstore(
            os.path.join(_base_path, 'docs.sqllite'))

        self.msds = msmarco.MsmarcoDataset(
            self.msmarco_config(self.config['subset'], config), logger, vocab)
        self.mbds = microblog.MicroblogDataset(
            self.microblog_config(self.config['subset'], config), logger,
            vocab)
Example #6
 def _load_qrels(self, subset, fmt):
     with self.logger.duration('loading qrels'):
         base_path = util.path_dataset(self)
         path = os.path.join(base_path,
                             f'{subset}.{self.config["collection"]}.qrels')
         self.logger.info(f'loading qrels from {path}')
         return trec.read_qrels_fmt(path, fmt)
Example #7
 def _init_qrels(self, subset, qrels_files, force=False, expected_md5=None):
     qrelsf = os.path.join(util.path_dataset(self), f'{subset}.qrels')
     if (force or not os.path.exists(qrelsf)) and self._confirm_dua():
         qrels = itertools.chain(*(trec.read_qrels(
             util.download_stream(f, 'utf8', expected_md5=expected_md5))
                                   for f in qrels_files))
         trec.write_qrels(qrelsf, qrels)
Example #8
    def init(self, force=False):
        path = util.path_dataset(self)
        needs_collection = []
        for index in [self.index, self.index_stem, self.doc_store]:
            if force or not index.built():
                needs_collection.append(index.build)

        for subset in ['main', 'heldout']:
            is_heldout = (subset == 'heldout')
            query_file = os.path.join(path, f'{subset}.queries')
            if force or not os.path.exists(query_file):
                needs_collection.append(
                    self._init_build_queryfile(query_file, is_heldout))

            qrels_file = os.path.join(path, f'{subset}.qrels')
            if force or not os.path.exists(qrels_file):
                needs_collection.append(
                    self._init_build_qrels(qrels_file, is_heldout))

        if needs_collection and self._confirm_dua():
            with contextlib.ExitStack() as stack:
                collection_iter = logger.pbar(self._init_iter_corpus(),
                                              desc='collection')
                sub_iters = util.blocking_tee(collection_iter,
                                              len(needs_collection))
                for fn, it in zip(needs_collection, sub_iters):
                    stack.enter_context(
                        util.CtxtThread(functools.partial(fn, it)))
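util.blocking_tee and util.CtxtThread let a single pass over the collection feed several builders at once. A rough standalone sketch of that fan-out pattern using only the standard library (an illustration of the idea, not onir's actual implementation):

import threading
import queue

def blocking_tee(source, n):
    # Fan one iterator out to n consumers, each reading from its own queue.
    # Bounded queues apply backpressure, so the producer cannot run far
    # ahead of the slowest consumer.
    queues = [queue.Queue(maxsize=16) for _ in range(n)]
    end = object()

    def produce():
        for item in source:
            for q in queues:
                q.put(item)
        for q in queues:
            q.put(end)

    threading.Thread(target=produce, daemon=True).start()
    return [iter(q.get, end) for q in queues]

# Each consumer sees every item exactly once:
a, b = blocking_tee(range(5), 2)
t = threading.Thread(target=lambda: print('b:', list(b)))
t.start()
print('a:', list(a))
t.join()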
Example #9
 def _init_collection_iter(self, doc_paths, encoding):
     doc_paths = (os.path.join(util.path_dataset(self), p)
                  for p in doc_paths)
     doc_iter = itertools.chain(*(trec.parse_doc_format(p, encoding)
                                  for p in doc_paths))
     doc_iter = self.logger.pbar(doc_iter, desc='documents')
     return doc_iter
Example #10
 def pair_iter(self, fields, pos_source='intersect', neg_source='run', sampling='query', pos_minrel=1, unjudged_rel=0, num_neg=1, random=None, inf=False):
     special = self.config['special']
     if special == '':
         raise NotImplementedError
     assert pos_minrel == 1, f"{special} only supports pos_minrel=1"
     assert unjudged_rel == 0, f"{special} only supports unjudged_rel=0"
     assert num_neg == 1, f"{special} only supports num_neg=1"
     assert self.config['subset'] in ('train', 'train10'), f"{special} only supported with subset=train[10]"
     self.logger.warn(f'Using {special}; ignoring pair_iter arguments pos_source={pos_source} neg_source={neg_source} sampling={sampling}')
     first = True
     while first or inf:
         first = False
         if special == 'mspairs':
             f = gzip.open(os.path.join(util.path_dataset(self), '{subset}.mspairs.gz'.format(**self.config)), 'rt')
         else:
             raise ValueError(f'unsupported special={special}')
         with f:
             for qid, pos_did, neg_did in plaintext.read_tsv(f):
                 if qid in MINI_DEV:
                     continue
                 result = {fld: [] for fld in fields}
                 for did in [pos_did, neg_did]:
                     record = self.build_record(fields, query_id=qid, doc_id=did)
                     for fld in fields:
                         result[fld].append(record[fld])
                 yield result
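For reference, {subset}.mspairs.gz is consumed as a gzipped TSV with one (qid, positive doc id, negative doc id) triple per line. A self-contained miniature of that layout (the ids below are invented):

import gzip
import io

raw = 'q1\tD100\tD200\nq2\tD300\tD400\n'
buf = io.BytesIO()
with gzip.open(buf, 'wt') as f:
    f.write(raw)

buf.seek(0)
with gzip.open(buf, 'rt') as f:
    for qid, pos_did, neg_did in (line.rstrip('\n').split('\t') for line in f):
        print(qid, pos_did, neg_did)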
Example #11
    def init(self, force=False):
        base_path = util.path_dataset(self)
        idxs = [self.index, self.index_stem, self.doc_store]
        self._init_indices_parallel(idxs, self._init_iter_collection(), force)

        qrels_file = os.path.join(base_path, 'qrels.robust2004.txt')
        if (force or not os.path.exists(qrels_file)) and self._confirm_dua():
            util.download(**_FILES['qrels'], file_name=qrels_file)

        for fold in FOLDS:
            fold_qrels_file = os.path.join(base_path, f'{fold}.qrels')
            if force or not os.path.exists(fold_qrels_file):
                all_qrels = trec.read_qrels_dict(qrels_file)
                fold_qrels = {
                    qid: dids
                    for qid, dids in all_qrels.items() if qid in FOLDS[fold]
                }
                trec.write_qrels_dict(fold_qrels_file, fold_qrels)

        query_file = os.path.join(base_path, 'topics.txt')
        if (force or not os.path.exists(query_file)) and self._confirm_dua():
            query_file_stream = util.download_stream(**_FILES['queries'],
                                                     encoding='utf8')
            with util.finialized_file(query_file, 'wt') as f:
                plaintext.write_tsv(f,
                                    trec.parse_query_format(query_file_stream))
Example #12
 def _base_qrels(self, subset):
     rnd, _ = subset.split('-', 1)
     path = os.path.join(util.path_dataset(self), f'{rnd}.qrels')
     if os.path.exists(path):
         return trec.read_qrels_dict(path)
     self.logger.info(f'missing qrels for {rnd} -- returning empty qrels')
     return {}
Example #13
    def init(self, force=False):
        base_dir = os.path.join(util.path_dataset(self), self.subset)

        if self.subset == 'dummy':
            datafile = os.path.join(base_dir, 'datafile.tsv')
            qrels = os.path.join(base_dir, 'qrels.txt')
            if not os.path.exists(datafile):
                os.symlink(os.path.abspath('etc/dummy_datafile.tsv'), datafile)
            if not os.path.exists(qrels):
                os.symlink(os.path.abspath('etc/dummy_qrels.txt'), qrels)

        needs_datafile = []
        if force or not self.index.built():
            needs_datafile.append(lambda it: self.index.build(
                indices.RawDoc(did, txt) for t, did, txt in it if t == 'doc'))

        if force or not self.index_stem.built():
            needs_datafile.append(lambda it: self.index_stem.build(
                indices.RawDoc(did, txt) for t, did, txt in it if t == 'doc'))

        if force or not self.doc_store.built():
            needs_datafile.append(lambda it: self.doc_store.build(
                indices.RawDoc(did, txt) for t, did, txt in it if t == 'doc'))

        query_file = os.path.join(base_dir, 'queries.tsv')
        if force or not os.path.exists(query_file):
            needs_datafile.append(
                lambda it: plaintext.write_tsv(query_file, (
                    (qid, txt) for t, qid, txt in it if t == 'query')))

        if needs_datafile:
            df_glob = os.path.join(base_dir, 'datafile*.tsv')
            datafiles = glob(df_glob)
            while not datafiles:
                c = util.confirm(
                    f'No data files found. Please move/link data files to {df_glob}.\n'
                    'Data files should contain both queries and documents in the '
                    'following format (one per line):\n'
                    '[query|doc] [TAB] [qid/did] [TAB] [text]')
                if not c:
                    sys.exit(1)
                datafiles = glob(df_glob)
            main_iter = itertools.chain(*(plaintext.read_tsv(df)
                                          for df in datafiles))
            main_iter = tqdm(main_iter, desc='reading datafiles')
            iters = util.blocking_tee(main_iter, len(needs_datafile))
            with contextlib.ExitStack() as stack:
                for fn, it in zip(needs_datafile, iters):
                    stack.enter_context(
                        util.CtxtThread(functools.partial(fn, it)))

        qrels_file = os.path.join(base_dir, 'qrels.txt')
        while not os.path.exists(qrels_file):
            c = util.confirm(
                f'No qrels file found. Please move/link qrels file to {qrels_file}.\n'
                'Qrels file should be in the TREC format:\n'
                '[qid] [SPACE] Q0 [SPACE] [did] [SPACE] [score]')
            if not c:
                sys.exit(1)
Example #14
    def init(self, force=False):
        base_path = util.path_dataset(self)
        base = Path(base_path)

        # DOCUMENT COLLECTION
        idx = [self.index, self.index_stem, self.doc_store]
        self._init_indices_parallel(idx, self._init_doc_iter(), force)

        # TRAIN

        files = {}
        files.update({
            base / f'train-f{f}.auto.qrels': f'train/train.fold{f}.cbor.hierarchical.qrels' for f in range(5)
        })
        files.update({
            base / f'train-f{f}.queries.tsv': f'train/train.fold{f}.cbor.outlines' for f in range(5)
        })
        if (force or not all(f.exists() for f in files)) and self._confirm_dua():
            with util.download_tmp(_SOURCES['train'], tarf=True) as f:
                for member in f:
                    for f_out, f_in in files.items():
                        if member.name == f_in:
                            if f_out.suffix == '.qrels':
                                self._init_file_copy(f.extractfile(member), f_out, force)
                            elif f_out.suffix == '.tsv':
                                self._init_queryfile(f.extractfile(member), f_out, force)

        # TEST

        files = {
            base / 'test.queries.tsv': 'benchmarkY1test.public/test.benchmarkY1test.cbor.outlines'
        }
        if (force or not all(f.exists() for f in files)) and self._confirm_dua():
            with util.download_tmp(_SOURCES['test'], tarf=True) as f:
                for f_out, f_in in files.items():
                    self._init_queryfile(f.extractfile(f_in), f_out, force)

        files = {
            base / 'test.auto.qrels': 'TREC_CAR_2017_qrels/automatic.benchmarkY1test.cbor.hierarchical.qrels',
            base / 'test.manual.qrels': 'TREC_CAR_2017_qrels/manual.benchmarkY1test.cbor.hierarchical.qrels',
        }
        if (force or not all(f.exists() for f in files)) and self._confirm_dua():
            with util.download_tmp(_SOURCES['test-qrels'], tarf=True) as f:
                for f_out, f_in in files.items():
                    self._init_file_copy(f.extractfile(f_in), f_out, force)

        # TEST200

        files = {
            base / 'test200.auto.qrels': 'test200/train.test200.cbor.hierarchical.qrels',
            base / 'test200.queries.tsv': 'test200/train.test200.cbor.outlines',
        }
        if (force or not all(f.exists() for f in files)) and self._confirm_dua():
            with util.download_tmp(_SOURCES['test200'], tarf=True) as f:
                for f_out, f_in in files.items():
                    if f_out.suffix == '.qrels':
                        self._init_file_copy(f.extractfile(f_in), f_out, force)
                    elif f_out.suffix == '.tsv':
                        self._init_queryfile(f.extractfile(f_in), f_out, force)
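The parentheses added to the download conditions above matter because Python binds `and` more tightly than `or`; without them, force=True would bypass self._confirm_dua() entirely. A two-line check of the precedence:

force, have_files = True, False

print(force or not have_files and False)    # True: the right operand never gates `force`
print((force or not have_files) and False)  # False: the confirmation gates the download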
Example #15
 def _base_qrels(self, subset, fmt):
     fold, rnd, _ = subset.split('-', 2)
     path = os.path.join(util.path_dataset(self), f'{fold}-{rnd}.qrels')
     if os.path.exists(path):
         return trec.read_qrels_fmt(path, fmt)
     self.logger.info(f'missing qrels for {fold}-{rnd} -- returning empty qrels')
     return {}
Example #16
 def __init__(self, config, logger, vocab):
     super().__init__(config, logger, vocab)
     base_path = util.path_dataset(self)
     self.index_stem = indices.AnseriniIndex(
         os.path.join(base_path, 'anserini.porter'), stemmer='porter')
     self.doc_store = indices.SqliteDocstore(
         os.path.join(base_path, 'docs.sqllite'))
Example #17
 def _init_iter_corpus(self):
     nyt_corpus_dir = os.path.join(util.path_dataset(self), 'nyt_corpus')
     with Pool(onir.util.safe_thread_count()) as pool:
         for tgz_file in sorted(glob(f'{nyt_corpus_dir}/*/*.tgz')):
             logger.debug(f'reading file {tgz_file}')
             for doc in pool.imap(_parse_file, _read_tarfile(tgz_file)):
                 if doc:
                     yield doc
Example #18
 def _load_queries_base(self, subset):
     querysource = self.querysource
     query_path = os.path.join(util.path_dataset(self), f'{subset}.topics')
     return {
         qid: text
         for t, qid, text in plaintext.read_tsv(query_path)
         if t == querysource
     }
Example #19
 def record_iter(self, fields, source, minrel=None, shuf=True, random=None, inf=False, run_threshold=None):
     special = self.config['special']
     if special == '':
         raise NotImplementedError
     assert minrel is None or minrel < 1
     if source != 'run':
         self.logger.warn(f'Using special={special}; ignoring record_iter argument source={source}')
     if run_threshold is not None:
         self.logger.warn(f'Using special={special}; ignoring record_iter argument run_threshold={run_threshold}')
     first = True
     while first or inf:
         first = False
         if special == 'mspairs':
             f = gzip.open(os.path.join(util.path_dataset(self), '{subset}.mspairs.gz'.format(**self.config)), 'rt')
             it = plaintext.read_tsv(f)
             fields = fields - {'relscore'} # don't request relscore from the usual channels (i.e., qrels); we already know it from the pair file, and this is faster
         elif special == 'msrun':
             f = os.path.join(util.path_dataset(self), '{subset}.msrun'.format(**self.config))
             it = ((qid, did) for qid, did, rank, score in trec.read_run(f))
         elif special == 'validrun':
             f = os.path.join(util.path_dataset(self), '{subset}.validrun'.format(**self.config))
             it = plaintext.read_sv(f, ' ')
         else:
             raise ValueError(f'unsupported special={special}')
         if shuf:
             if special in ('msrun', 'mspairs'):
                 self.logger.warn(f'ignoring shuf=True with special={special}')
             else:
                 it = list(it)
                 random.shuffle(it)
         for cols in it:
             if len(cols) == 3:
                 qid, pos_did, neg_did = cols
                 dids = [pos_did, neg_did] if (minrel is None or minrel <= 0) else [pos_did]
                 if qid in MINI_DEV:
                     continue
             elif len(cols) == 2:
                 qid, did = cols
                 dids = [did]
             for did in dids:
                 record = self.build_record(fields, query_id=qid, doc_id=did)
                 result = {f: record[f] for f in fields}
                 if len(cols) == 3:
                     result['relscore'] = (1 if did == pos_did else 0)
                 yield result
Example #20
 def _load_queries_base(self, subset):
     rnd, fields = subset.split('-', 1)
     fields = fields.split('-')
     path = os.path.join(util.path_dataset(self), f'{rnd}.tsv')
     return {
         qid: qtext
         for qid, qtype, qtext in plaintext.read_tsv(path)
         if qtype in fields
     }
Example #21
 def _load_queries_base(self, subset):
     fold, rnd, fields = subset.split('-', 2)
     fields = fields.split('-')
     path = os.path.join(util.path_dataset(self), 'rnd5.tsv')
     filter_queries = {
         str(qid): qtext
         for qid, qtype, qtext in plaintext.read_tsv(path)
         if qtype in fields and str(qid) in FOLDS[fold]
     }
     return filter_queries
Example #22
 def __init__(self, config, logger, vocab):
     super().__init__(config, logger, vocab)
     base_path = os.path.join(util.path_dataset(self), config['subset'])
     os.makedirs(base_path, exist_ok=True)
     self.index = indices.AnseriniIndex(
         os.path.join(base_path, 'anserini'), stemmer='none')
     self.index_stem = indices.AnseriniIndex(
         os.path.join(base_path, 'anserini.porter'), stemmer='porter')
     self.doc_store = indices.SqliteDocstore(
         os.path.join(base_path, 'docs.sqllite'))
Example #23
 def _load_topics(self, subset):
     result = {}
     for qid, text in plaintext.read_tsv(
             os.path.join(util.path_dataset(self), 'topics.txt')):
         if subset == 'valid' and (int(qid) in VALIDATION_QIDS):
             result[qid] = text
         elif subset == 'test' and (int(qid) in TEST_QIDS):
             result[qid] = text
         elif subset == 'train' and (int(qid) not in VALIDATION_QIDS) and (
                 int(qid) not in TEST_QIDS):
             result[qid] = text
     return result
Example #24
    def init(self, force=False):
        base_path = util.path_dataset(self)
        idxs = [self.index, self.index_stem, self.doc_store]
        self._init_indices_parallel(idxs, self._init_iter_collection(), force)
        train_qrels = os.path.join(base_path, 'train.qrels.txt')
        valid_qrels = os.path.join(base_path, 'valid.qrels.txt')
        test_qrels = os.path.join(base_path, 'test.qrels.txt')

        if (force or not os.path.exists(train_qrels)
                or not os.path.exists(valid_qrels)) and self._confirm_dua():
            source_stream = util.download_stream(**_FILES['qrels_2013'],
                                                 encoding='utf8')
            source_stream2 = util.download_stream(**_FILES['qrels_2014'],
                                                  encoding='utf8')
            with util.finialized_file(train_qrels, 'wt') as tf, \
                 util.finialized_file(valid_qrels, 'wt') as vf, \
                 util.finialized_file(test_qrels, 'wt') as Tf:
                for line in source_stream:
                    cols = line.strip().split()
                    if int(cols[0]) in VALIDATION_QIDS:
                        vf.write(' '.join(cols) + '\n')
                    elif int(cols[0]) in TEST_QIDS:
                        Tf.write(' '.join(cols) + '\n')
                    else:
                        tf.write(' '.join(cols) + '\n')
                for line in source_stream2:
                    cols = line.strip().split()
                    if int(cols[0]) in VALIDATION_QIDS:
                        vf.write(' '.join(cols) + '\n')
                    elif int(cols[0]) in TEST_QIDS:
                        Tf.write(' '.join(cols) + '\n')
                    else:
                        tf.write(' '.join(cols) + '\n')

        all_queries = os.path.join(base_path, 'topics.txt')

        if (force or not os.path.exists(all_queries)) and self._confirm_dua():
            source_stream = util.download_stream(**_FILES['queries_2013'],
                                                 encoding='utf8')
            source_stream2 = util.download_stream(**_FILES['queries_2014'],
                                                  encoding='utf8')
            queries = []
            for _id, _query in trec.parse_query_mbformat(source_stream):
                nid = _id.replace('MB', '').strip()
                queries.append([nid, _query])

            for _id, _query in trec.parse_query_mbformat(source_stream2):
                nid = _id.replace('MB', '').strip()
                queries.append([nid, _query])

            plaintext.write_tsv(all_queries, queries)
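The int() conversions in the qrels split above are load-bearing: the columns read from the stream are strings, while VALIDATION_QIDS and TEST_QIDS hold ints, so a plain string membership test silently routes everything to train. A quick demonstration (the qid set here is made up):

VALIDATION_QIDS = {111}

print('111' in VALIDATION_QIDS)       # False: a string never matches the int set
print(int('111') in VALIDATION_QIDS)  # True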
Example #25
 def __init__(self, config, logger, vocab):
     super().__init__(config, logger, vocab)
     base_path = os.path.join(util.path_dataset(self), config['date'])
     os.makedirs(base_path, exist_ok=True)
     self.index_stem = indices.MultifieldAnseriniIndex(
         os.path.join(base_path, 'anserini_multifield'),
         stemmer='porter',
         primary_field=config['bs_field'])
     self.index_stem_2020 = indices.MultifieldAnseriniIndex(
         os.path.join(base_path, 'anserini_multifield_2020'),
         stemmer='porter',
         primary_field=config['bs_field'])
     self.doc_store = indices.MultifieldSqliteDocstore(
         os.path.join(base_path, 'docs_multifield.sqlite'),
         primary_field=config['rr_field'])
Example #26
    def init(self, force=False):
        base_path = util.path_dataset(self)
        idxs = [self.index, self.index_stem, self.doc_store]
        self._init_indices_parallel(idxs, self._init_iter_collection(), force)

        train_qrels = os.path.join(base_path, 'train.qrels.txt')
        valid_qrels = os.path.join(base_path, 'valid.qrels.txt')
        if (force or not os.path.exists(train_qrels)
                or not os.path.exists(valid_qrels)) and self._confirm_dua():
            source_stream = util.download_stream(
                'https://ciir.cs.umass.edu/downloads/Antique/antique-train.qrel',
                encoding='utf8')
            with util.finialized_file(train_qrels, 'wt') as tf, \
                 util.finialized_file(valid_qrels, 'wt') as vf:
                for line in source_stream:
                    cols = line.strip().split()
                    if cols[0] in VALIDATION_QIDS:
                        vf.write(' '.join(cols) + '\n')
                    else:
                        tf.write(' '.join(cols) + '\n')

        train_queries = os.path.join(base_path, 'train.queries.txt')
        valid_queries = os.path.join(base_path, 'valid.queries.txt')
        if (force or not os.path.exists(train_queries)
                or not os.path.exists(valid_queries)) and self._confirm_dua():
            source_stream = util.download_stream(
                'https://ciir.cs.umass.edu/downloads/Antique/antique-train-queries.txt',
                encoding='utf8')
            train, valid = [], []
            for cols in plaintext.read_tsv(source_stream):
                if cols[0] in VALIDATION_QIDS:
                    valid.append(cols)
                else:
                    train.append(cols)
            plaintext.write_tsv(train_queries, train)
            plaintext.write_tsv(valid_queries, valid)

        test_qrels = os.path.join(base_path, 'test.qrels.txt')
        if (force or not os.path.exists(test_qrels)) and self._confirm_dua():
            util.download(
                'https://ciir.cs.umass.edu/downloads/Antique/antique-test.qrel',
                test_qrels)

        test_queries = os.path.join(base_path, 'test.queries.txt')
        if (force or not os.path.exists(test_queries)) and self._confirm_dua():
            util.download(
                'https://ciir.cs.umass.edu/downloads/Antique/antique-test-queries.txt',
                test_queries)
Example #27
 def _init_topics(self,
                  subset,
                  topic_files,
                  qid_prefix=None,
                  encoding=None,
                  xml_prefix=None,
                  force=False,
                  expected_md5=None):
     topicf = os.path.join(util.path_dataset(self), f'{subset}.topics')
     if (force or not os.path.exists(topicf)) and self._confirm_dua():
         topics = []
         for topic_file in topic_files:
             topic_file_stream = util.download_stream(
                 topic_file, encoding, expected_md5=expected_md5)
             for t, qid, text in trec.parse_query_format(
                     topic_file_stream, xml_prefix):
                 if qid_prefix is not None:
                     qid = qid.replace(qid_prefix, '')
                 topics.append((t, qid, text))
         plaintext.write_tsv(topicf, topics)
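The {subset}.topics files written here are read back by _load_queries_base (Example #18) as three-column TSV: record type, qid, text. A hypothetical miniature of a file in that layout:

import csv
import io

topics_tsv = 'topic\t301\tInternational Organized Crime\ndesc\t301\tlonger description text\n'
for t, qid, text in csv.reader(io.StringIO(topics_tsv), delimiter='\t'):
    if t == 'topic':
        print(qid, text)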
Example #28
    def init(self, force=False):
        needs_docs = []
        for index in [self.index_stem, self.index_stem_2020, self.doc_store]:
            if force or not index.built():
                needs_docs.append(index)

        if needs_docs and self._confirm_dua():
            with contextlib.ExitStack() as stack:
                doc_iter = self._init_iter_collection()
                doc_iter = self.logger.pbar(doc_iter, desc='articles')
                doc_iters = util.blocking_tee(doc_iter, len(needs_docs))
                for idx, it in zip(needs_docs, doc_iters):
                    if idx is self.index_stem_2020:
                        it = (d for d in it if '2020' in d.data['date'])
                    stack.enter_context(
                        util.CtxtThread(functools.partial(idx.build, it)))

        path = os.path.join(util.path_dataset(self), 'rnd1.tsv')
        if not os.path.exists(path) and self._confirm_dua():
            with util.download_tmp('https://ir.nist.gov/covidSubmit/data/topics-rnd1.xml', expected_md5="cf1b605222f45f7dbc90ca8e4d9b2c31") as f, \
                 util.finialized_file(path, 'wt') as fout:
                soup = BeautifulSoup(f.read(), 'lxml-xml')
                for topic in soup.find_all('topic'):
                    qid = topic['number']
                    plaintext.write_tsv(fout, [
                        (qid, 'query', topic.find('query').get_text()),
                        (qid, 'quest', topic.find('question').get_text()),
                        (qid, 'narr', topic.find('narrative').get_text()),
                    ])

        udel_flag = path + '.includes_udel'
        if not os.path.exists(udel_flag):
            with open(path, 'at') as fout, \
                 util.finialized_file(udel_flag, 'wt'):
                with util.download_tmp(
                        'https://raw.githubusercontent.com/castorini/anserini/master/src/main/resources/topics-and-qrels/topics.covid-round1-udel.xml',
                        expected_md5="2915cf59ae222f0aa20b2a671f67fd7a") as f:
                    soup = BeautifulSoup(f.read(), 'lxml-xml')
                    for topic in soup.find_all('topic'):
                        qid = topic['number']
                        plaintext.write_tsv(fout, [
                            (qid, 'udel', topic.find('query').get_text()),
                        ])

        path = os.path.join(util.path_dataset(self), 'rnd2.tsv')
        if not os.path.exists(path) and self._confirm_dua():
            with util.download_tmp('https://ir.nist.gov/covidSubmit/data/topics-rnd2.xml', expected_md5="550129e71c83de3fb4d6d29a172c5842") as f, \
                 util.finialized_file(path, 'wt') as fout:
                soup = BeautifulSoup(f.read(), 'lxml-xml')
                for topic in soup.find_all('topic'):
                    qid = topic['number']
                    plaintext.write_tsv(fout, [
                        (qid, 'query', topic.find('query').get_text()),
                        (qid, 'quest', topic.find('question').get_text()),
                        (qid, 'narr', topic.find('narrative').get_text()),
                    ])

        udel_flag = path + '.includes_udel'
        if not os.path.exists(udel_flag):
            with open(path, 'at') as fout, \
                 util.finialized_file(udel_flag, 'wt'):
                with util.download_tmp(
                        'https://raw.githubusercontent.com/castorini/anserini/master/src/main/resources/topics-and-qrels/topics.covid-round2-udel.xml',
                        expected_md5="a8988734e6f812921d5125249c197985") as f:
                    soup = BeautifulSoup(f.read(), 'lxml-xml')
                    for topic in soup.find_all('topic'):
                        qid = topic['number']
                        plaintext.write_tsv(fout, [
                            (qid, 'udel', topic.find('query').get_text()),
                        ])

        path = os.path.join(util.path_dataset(self), 'rnd1.qrels')
        if not os.path.exists(path) and self._confirm_dua():
            util.download(
                'https://ir.nist.gov/covidSubmit/data/qrels-rnd1.txt',
                path,
                expected_md5="d58586df5823e7d1d0b3619a73b31518")
Example #29
 def _load_qrels(self, subset, fmt):
     with logger.duration('loading qrels'):
         base_path = util.path_dataset(self)
         path = os.path.join(base_path, f'{subset}.qrels')
         return trec.read_qrels_fmt(path, fmt)
Example #30
 def _load_queries_base(self, subset):
     with logger.duration('loading queries'):
         base_path = util.path_dataset(self)
         path = os.path.join(base_path, f'{subset}.queries')
         return dict(plaintext.read_tsv(path))