def __init__(self, config, logger, vocab):
    # NOTE: parameter order fixed to (config, logger, vocab) to match the
    # super().__init__ call and the other dataset constructors
    super().__init__(config, logger, vocab)
    self.index_spanish = indices.AnseriniIndex(
        os.path.join(util.path_dataset(self), 'anserini.es'), lang=self._lang())
    self.doc_store = indices.SqliteDocstore(
        os.path.join(util.path_dataset(self), 'docs.sqlite'))
def __init__(self, config, logger, vocab):
    super().__init__(config, logger, vocab)
    base_path = util.path_dataset(self)
    global_base_path = "/".join(base_path.split("/")[:-1])

    # set up msmarco
    _base_path = global_base_path + "/msmarco"
    self.ms_index_stem = indices.AnseriniIndex(
        os.path.join(_base_path, 'anserini.porter'), stemmer='porter')
    self.ms_index_doctttttquery_stem = indices.AnseriniIndex(
        os.path.join(_base_path, 'anserini.doctttttquery.porter'), stemmer='porter')
    self.ms_doc_store = indices.SqliteDocstore(
        os.path.join(_base_path, 'docs.sqllite'))

    # set up cord
    _base_path = global_base_path + "/covid/2020-07-16"
    self.cord_index_stem = indices.MultifieldAnseriniIndex(
        os.path.join(_base_path, 'anserini_multifield'),
        stemmer='porter', primary_field=config['bs_field'])
    self.cord_index_stem_2020 = indices.MultifieldAnseriniIndex(
        os.path.join(_base_path, 'anserini_multifield_2020'),
        stemmer='porter', primary_field=config['bs_field'])
    self.cord_doc_store = indices.MultifieldSqliteDocstore(
        os.path.join(_base_path, 'docs_multifield.sqlite'),
        primary_field=config['rr_field'])

    self.msds = msmarco.MsmarcoDataset(
        self.msmarco_config(self.config['subset'], config), logger, vocab)
    self.cordds = covid.CovidDataset(
        self.cord_config(self.config['subset'], config), logger, vocab)
def _confirm_dua(self):
    # removed an unconditional `return True` that made the code below unreachable
    if self._has_confirmed_dua is None and self.DUA is not None:
        self._has_confirmed_dua = util.confirm(
            self.DUA.format(ds_path=util.path_dataset(self)))
    return self._has_confirmed_dua
def _load_topics(self):
    result = {}
    for item, qid, text in plaintext.read_tsv(
            os.path.join(util.path_dataset(self), 'topics.txt')):
        if item == 'topic':
            result[qid] = text
    return result
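# A sketch of the expected topics.txt layout (tab-separated). The 'desc' row
# type is illustrative -- only rows tagged 'topic' are kept above:
#
#   topic   101     hurricane relief efforts
#   desc    101     documents describing relief efforts following hurricanes
#
# With this input, _load_topics() would return {'101': 'hurricane relief efforts'}.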
def __init__(self, config, logger, vocab):
    super().__init__(config, logger, vocab)
    base_path = util.path_dataset(self)
    global_base_path = "/".join(base_path.split("/")[:-1])

    # set up msmarco
    _base_path = global_base_path + "/msmarco"
    self.ms_index_stem = indices.AnseriniIndex(
        os.path.join(_base_path, 'anserini.porter'), stemmer='porter')
    self.ms_index_doctttttquery_stem = indices.AnseriniIndex(
        os.path.join(_base_path, 'anserini.doctttttquery.porter'), stemmer='porter')
    self.ms_doc_store = indices.SqliteDocstore(
        os.path.join(_base_path, 'docs.sqllite'))

    # set up microblog
    _base_path = global_base_path + "/microblog"
    self.mb_index_stem = indices.AnseriniIndex(
        os.path.join(_base_path, 'anserini.porter'), stemmer='porter')
    self.mb_index = indices.AnseriniIndex(
        os.path.join(_base_path, 'anserini'), stemmer='none')
    self.mb_doc_store = indices.SqliteDocstore(
        os.path.join(_base_path, 'docs.sqllite'))

    self.msds = msmarco.MsmarcoDataset(
        self.msmarco_config(self.config['subset'], config), logger, vocab)
    self.mbds = microblog.MicroblogDataset(
        self.microblog_config(self.config['subset'], config), logger, vocab)
def _load_qrels(self, subset, fmt):
    with self.logger.duration('loading qrels'):
        base_path = util.path_dataset(self)
        path = os.path.join(base_path, f'{subset}.{self.config["collection"]}.qrels')
        self.logger.info(path)
        return trec.read_qrels_fmt(path, fmt)
def _init_qrels(self, subset, qrels_files, force=False, expected_md5=None):
    qrelsf = os.path.join(util.path_dataset(self), f'{subset}.qrels')
    if (force or not os.path.exists(qrelsf)) and self._confirm_dua():
        qrels = itertools.chain(*(
            trec.read_qrels(util.download_stream(f, 'utf8', expected_md5=expected_md5))
            for f in qrels_files))
        trec.write_qrels(qrelsf, qrels)
def init(self, force=False):
    path = util.path_dataset(self)
    needs_collection = []
    for index in [self.index, self.index_stem, self.doc_store]:
        if force or not index.built():
            needs_collection.append(index.build)
    for subset in ['main', 'heldout']:
        is_heldout = (subset == 'heldout')
        query_file = os.path.join(path, f'{subset}.queries')
        if force or not os.path.exists(query_file):
            needs_collection.append(
                self._init_build_queryfile(query_file, is_heldout))
        qrels_file = os.path.join(path, f'{subset}.qrels')
        # fixed: this previously re-checked query_file instead of qrels_file
        if force or not os.path.exists(qrels_file):
            needs_collection.append(
                self._init_build_qrels(qrels_file, is_heldout))
    if needs_collection and self._confirm_dua():
        with contextlib.ExitStack() as stack:
            collection_iter = logger.pbar(self._init_iter_corpus(), desc='collection')
            sub_iters = util.blocking_tee(collection_iter, len(needs_collection))
            for fn, it in zip(needs_collection, sub_iters):
                stack.enter_context(util.CtxtThread(functools.partial(fn, it)))
def _init_collection_iter(self, doc_paths, encoding):
    doc_paths = (os.path.join(util.path_dataset(self), p) for p in doc_paths)
    doc_iter = itertools.chain(*(trec.parse_doc_format(p, encoding) for p in doc_paths))
    doc_iter = self.logger.pbar(doc_iter, desc='documents')
    return doc_iter
def pair_iter(self, fields, pos_source='intersect', neg_source='run', sampling='query',
              pos_minrel=1, unjudged_rel=0, num_neg=1, random=None, inf=False):
    special = self.config['special']
    if special == '':
        raise NotImplementedError
    assert pos_minrel == 1, f"{special} only supports pos_minrel=1"
    assert unjudged_rel == 0, f"{special} only supports unjudged_rel=0"
    assert num_neg == 1, f"{special} only supports num_neg=1"
    assert self.config['subset'] in ('train', 'train10'), \
        f"{special} only supported with subset=train[10]"
    self.logger.warn(f'Using {special}; ignoring pair_iter arguments '
                     f'pos_source={pos_source} neg_source={neg_source} sampling={sampling}')
    first = True
    while first or inf:
        first = False
        if special == 'mspairs':
            # renamed the file handle from `f` to avoid shadowing by the
            # field loop variable below
            file = gzip.open(os.path.join(
                util.path_dataset(self),
                '{subset}.mspairs.gz'.format(**self.config)), 'rt')
        else:
            raise ValueError(f'unsupported special={special}')
        with file:
            for qid, pos_did, neg_did in plaintext.read_tsv(file):
                if qid in MINI_DEV:
                    continue
                result = {f: [] for f in fields}
                for did in [pos_did, neg_did]:
                    record = self.build_record(fields, query_id=qid, doc_id=did)
                    for f in fields:
                        result[f].append(record[f])
                yield result
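# A hypothetical usage sketch for pair_iter (not part of the original module).
# `ds` is an instance of this dataset; the field names are assumptions and
# depend on the configured vocab/encoder:
def _example_pair_iter_usage(ds):
    for pair in ds.pair_iter({'query_tok', 'doc_tok'}, inf=False):
        # each value is a 2-element list: index 0 comes from the positive
        # document, index 1 from the sampled negative document
        pos_doc, neg_doc = pair['doc_tok']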
def init(self, force=False):
    base_path = util.path_dataset(self)
    idxs = [self.index, self.index_stem, self.doc_store]
    self._init_indices_parallel(idxs, self._init_iter_collection(), force)

    qrels_file = os.path.join(base_path, 'qrels.robust2004.txt')
    if (force or not os.path.exists(qrels_file)) and self._confirm_dua():
        util.download(**_FILES['qrels'], file_name=qrels_file)
    for fold in FOLDS:
        fold_qrels_file = os.path.join(base_path, f'{fold}.qrels')
        if force or not os.path.exists(fold_qrels_file):
            all_qrels = trec.read_qrels_dict(qrels_file)
            fold_qrels = {qid: dids for qid, dids in all_qrels.items() if qid in FOLDS[fold]}
            trec.write_qrels_dict(fold_qrels_file, fold_qrels)

    query_file = os.path.join(base_path, 'topics.txt')
    if (force or not os.path.exists(query_file)) and self._confirm_dua():
        query_file_stream = util.download_stream(**_FILES['queries'], encoding='utf8')
        with util.finialized_file(query_file, 'wt') as f:
            plaintext.write_tsv(f, trec.parse_query_format(query_file_stream))
def _base_qrels(self, subset):
    rnd, _ = subset.split('-', 1)
    path = os.path.join(util.path_dataset(self), f'{rnd}.qrels')
    if os.path.exists(path):
        return trec.read_qrels_dict(path)
    self.logger.info(f'missing qrels for {rnd} -- returning empty qrels')
    return {}
def init(self, force=False):
    base_dir = os.path.join(util.path_dataset(self), self.subset)
    if self.subset == 'dummy':
        datafile = os.path.join(base_dir, 'datafile.tsv')
        qrels = os.path.join(base_dir, 'qrels.txt')
        if not os.path.exists(datafile):
            os.symlink(os.path.abspath('etc/dummy_datafile.tsv'), datafile)
        if not os.path.exists(qrels):
            os.symlink(os.path.abspath('etc/dummy_qrels.txt'), qrels)
    needs_datafile = []
    if force or not self.index.built():
        needs_datafile.append(lambda it: self.index.build(
            indices.RawDoc(did, txt) for t, did, txt in it if t == 'doc'))
    if force or not self.index_stem.built():
        needs_datafile.append(lambda it: self.index_stem.build(
            indices.RawDoc(did, txt) for t, did, txt in it if t == 'doc'))
    if force or not self.doc_store.built():
        needs_datafile.append(lambda it: self.doc_store.build(
            indices.RawDoc(did, txt) for t, did, txt in it if t == 'doc'))
    query_file = os.path.join(base_dir, 'queries.tsv')
    if force or not os.path.exists(query_file):
        needs_datafile.append(lambda it: plaintext.write_tsv(query_file, (
            (qid, txt) for t, qid, txt in it if t == 'query')))
    if needs_datafile:
        df_glob = os.path.join(base_dir, 'datafile*.tsv')
        datafiles = glob(df_glob)
        while not datafiles:
            c = util.confirm(
                f'No data files found. Please move/link data files to {df_glob}.\n'
                'Data files should contain both queries and documents in the '
                'following format (one per line):\n'
                '[query|doc] [TAB] [qid/did] [TAB] [text]')
            if not c:
                sys.exit(1)
            datafiles = glob(df_glob)
        main_iter = itertools.chain(*(plaintext.read_tsv(df) for df in datafiles))
        main_iter = tqdm(main_iter, desc='reading datafiles')
        iters = util.blocking_tee(main_iter, len(needs_datafile))
        with contextlib.ExitStack() as stack:
            for fn, it in zip(needs_datafile, iters):
                stack.enter_context(util.CtxtThread(functools.partial(fn, it)))
    qrels_file = os.path.join(base_dir, 'qrels.txt')
    while not os.path.exists(qrels_file):
        c = util.confirm(
            f'No qrels file found. Please move/link qrels file to {qrels_file}.\n'
            'Qrels file should be in the TREC format:\n'
            '[qid] [SPACE] Q0 [SPACE] [did] [SPACE] [score]')
        if not c:
            sys.exit(1)
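# A sketch of the two inputs this init() waits for, following the formats
# stated in the prompts above (contents illustrative).
# datafile.tsv mixes queries and documents, one tab-separated record per line:
#
#   query   q1      what is information retrieval
#   doc     d1      information retrieval is the activity of obtaining ...
#
# qrels.txt uses the standard TREC format ([qid] Q0 [did] [score]):
#
#   q1 Q0 d1 1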
def init(self, force=False):
    base_path = util.path_dataset(self)
    base = Path(base_path)

    # DOCUMENT COLLECTION
    idx = [self.index, self.index_stem, self.doc_store]
    self._init_indices_parallel(idx, self._init_doc_iter(), force)

    # TRAIN
    # note: parentheses added around `force or not all(...)` throughout;
    # `and` binds tighter than `or`, so the original skipped the DUA check
    # whenever force was set
    files = {}
    files.update({
        base / f'train-f{f}.auto.qrels': f'train/train.fold{f}.cbor.hierarchical.qrels'
        for f in range(5)
    })
    files.update({
        base / f'train-f{f}.queries.tsv': f'train/train.fold{f}.cbor.outlines'
        for f in range(5)
    })
    if (force or not all(f.exists() for f in files)) and self._confirm_dua():
        with util.download_tmp(_SOURCES['train'], tarf=True) as f:
            for member in f:
                for f_out, f_in in files.items():
                    if member.name == f_in:
                        if f_out.suffix == '.qrels':
                            self._init_file_copy(f.extractfile(member), f_out, force)
                        elif f_out.suffix == '.tsv':
                            self._init_queryfile(f.extractfile(member), f_out, force)

    # TEST
    files = {
        base / 'test.queries.tsv': 'benchmarkY1test.public/test.benchmarkY1test.cbor.outlines'
    }
    if (force or not all(f.exists() for f in files)) and self._confirm_dua():
        with util.download_tmp(_SOURCES['test'], tarf=True) as f:
            for f_out, f_in in files.items():
                self._init_queryfile(f.extractfile(f_in), f_out, force)
    files = {
        base / 'test.auto.qrels': 'TREC_CAR_2017_qrels/automatic.benchmarkY1test.cbor.hierarchical.qrels',
        base / 'test.manual.qrels': 'TREC_CAR_2017_qrels/manual.benchmarkY1test.cbor.hierarchical.qrels',
    }
    if (force or not all(f.exists() for f in files)) and self._confirm_dua():
        with util.download_tmp(_SOURCES['test-qrels'], tarf=True) as f:
            for f_out, f_in in files.items():
                self._init_file_copy(f.extractfile(f_in), f_out, force)

    # TEST200
    files = {
        base / 'test200.auto.qrels': 'test200/train.test200.cbor.hierarchical.qrels',
        base / 'test200.queries.tsv': 'test200/train.test200.cbor.outlines',
    }
    if (force or not all(f.exists() for f in files)) and self._confirm_dua():
        with util.download_tmp(_SOURCES['test200'], tarf=True) as f:
            for f_out, f_in in files.items():
                if f_out.suffix == '.qrels':
                    self._init_file_copy(f.extractfile(f_in), f_out, force)
                elif f_out.suffix == '.tsv':
                    self._init_queryfile(f.extractfile(f_in), f_out, force)
def _base_qrels(self, subset, fmt):
    fold, rnd, _ = subset.split('-', 2)
    path = os.path.join(util.path_dataset(self), f'{fold}-{rnd}.qrels')
    if os.path.exists(path):
        return trec.read_qrels_fmt(path, fmt)
    self.logger.info(f'missing qrels for {fold}-{rnd} -- returning empty qrels')
    return {}
def __init__(self, config, logger, vocab):
    super().__init__(config, logger, vocab)
    base_path = util.path_dataset(self)
    self.index_stem = indices.AnseriniIndex(
        os.path.join(base_path, 'anserini.porter'), stemmer='porter')
    self.doc_store = indices.SqliteDocstore(
        os.path.join(base_path, 'docs.sqllite'))
def _init_iter_corpus(self):
    nyt_corpus_dir = os.path.join(util.path_dataset(self), 'nyt_corpus')
    with Pool(onir.util.safe_thread_count()) as pool:
        for tgz_file in sorted(glob(f'{nyt_corpus_dir}/*/*.tgz')):
            logger.debug(f'reading file {tgz_file}')
            for doc in pool.imap(_parse_file, _read_tarfile(tgz_file)):
                if doc:
                    yield doc
def _load_queries_base(self, subset):
    querysource = self.querysource
    query_path = os.path.join(util.path_dataset(self), f'{subset}.topics')
    return {
        qid: text
        for t, qid, text in plaintext.read_tsv(query_path)
        if t == querysource
    }
def record_iter(self, fields, source, minrel=None, shuf=True, random=None, inf=False,
                run_threshold=None):
    special = self.config['special']
    if special == '':
        raise NotImplementedError
    assert minrel is None or minrel < 1
    if source != 'run':
        self.logger.warn(f'Using special={special}; ignoring record_iter argument source={source}')
    if run_threshold is not None:
        self.logger.warn(f'Using special={special}; ignoring record_iter argument run_threshold={run_threshold}')
    first = True
    while first or inf:
        first = False
        if special == 'mspairs':
            f = gzip.open(os.path.join(
                util.path_dataset(self),
                '{subset}.mspairs.gz'.format(**self.config)), 'rt')
            it = plaintext.read_tsv(f)
            # don't request relscore from the typical channels (i.e., qrels)
            # because we already know it, and this is faster
            fields = fields - {'relscore'}
        elif special == 'msrun':
            f = os.path.join(util.path_dataset(self), '{subset}.msrun'.format(**self.config))
            it = ((qid, did) for qid, did, rank, score in trec.read_run(f))
        elif special == 'validrun':
            f = os.path.join(util.path_dataset(self), '{subset}.validrun'.format(**self.config))
            it = plaintext.read_sv(f, ' ')
        else:
            raise ValueError(f'unsupported special={special}')
        if shuf:
            if special in ('msrun', 'mspairs'):
                self.logger.warn(f'ignoring shuf=True with special={special}')
            else:
                it = list(it)
                random.shuffle(it)
        for cols in it:
            if len(cols) == 3:
                qid, pos_did, neg_did = cols
                dids = [pos_did, neg_did] if (minrel is None or minrel <= 0) else [pos_did]
                if qid in MINI_DEV:
                    continue
            elif len(cols) == 2:
                qid, did = cols
                dids = [did]
            for did in dids:
                record = self.build_record(fields, query_id=qid, doc_id=did)
                result = {f: record[f] for f in fields}
                if len(cols) == 3:
                    result['relscore'] = (1 if did == pos_did else 0)
                yield result
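# A hypothetical usage sketch for record_iter (not part of the original
# module). With special='msrun' it yields one record per (query, document)
# pair in {subset}.msrun; `ds` and `score_fn` are assumed to be supplied by
# the caller:
def _example_record_iter_usage(ds, score_fn):
    for record in ds.record_iter({'query_tok', 'doc_tok'}, source='run', shuf=False):
        score_fn(record)  # e.g., score each run entry for re-ranking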
def _load_queries_base(self, subset):
    rnd, fields = subset.split('-', 1)
    fields = fields.split('-')
    path = os.path.join(util.path_dataset(self), f'{rnd}.tsv')
    return {
        qid: qtext
        for qid, qtype, qtext in plaintext.read_tsv(path)
        if qtype in fields
    }
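# For example, subset 'rnd1-query-udel' reads rnd1.tsv and keeps rows whose
# type column is 'query' or 'udel'. (The row types produced for these files
# elsewhere in this module are 'query', 'quest', 'narr', and 'udel'.)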
def _load_queries_base(self, subset):
    fold, _rnd, fields = subset.split('-', 2)
    fields = fields.split('-')
    # topics are cumulative across TREC-COVID rounds, so rnd5.tsv covers all
    # rounds regardless of the round named in the subset
    path = os.path.join(util.path_dataset(self), 'rnd5.tsv')
    filter_queries = {
        str(qid): qtext
        for qid, qtype, qtext in plaintext.read_tsv(path)
        if qtype in fields and str(qid) in FOLDS[fold]
    }
    return filter_queries
def __init__(self, config, logger, vocab):
    super().__init__(config, logger, vocab)
    base_path = os.path.join(util.path_dataset(self), config['subset'])
    os.makedirs(base_path, exist_ok=True)
    self.index = indices.AnseriniIndex(
        os.path.join(base_path, 'anserini'), stemmer='none')
    self.index_stem = indices.AnseriniIndex(
        os.path.join(base_path, 'anserini.porter'), stemmer='porter')
    self.doc_store = indices.SqliteDocstore(
        os.path.join(base_path, 'docs.sqllite'))
def _load_topics(self, subset):
    result = {}
    for qid, text in plaintext.read_tsv(
            os.path.join(util.path_dataset(self), 'topics.txt')):
        # nqid = int(qid.replace('MB', '').strip())
        if subset == 'valid' and int(qid) in VALIDATION_QIDS:
            result[qid] = text
        elif subset == 'test' and int(qid) in TEST_QIDS:
            result[qid] = text
        elif subset == 'train' and int(qid) not in VALIDATION_QIDS \
                and int(qid) not in TEST_QIDS:
            result[qid] = text
    return result
def init(self, force=False):
    base_path = util.path_dataset(self)
    idxs = [self.index, self.index_stem, self.doc_store]
    self._init_indices_parallel(idxs, self._init_iter_collection(), force)

    train_qrels = os.path.join(base_path, 'train.qrels.txt')
    valid_qrels = os.path.join(base_path, 'valid.qrels.txt')
    test_qrels = os.path.join(base_path, 'test.qrels.txt')
    if (force or not os.path.exists(train_qrels)
            or not os.path.exists(valid_qrels)) and self._confirm_dua():
        source_stream = util.download_stream(**_FILES['qrels_2013'], encoding='utf8')
        source_stream2 = util.download_stream(**_FILES['qrels_2014'], encoding='utf8')
        with util.finialized_file(train_qrels, 'wt') as tf, \
             util.finialized_file(valid_qrels, 'wt') as vf, \
             util.finialized_file(test_qrels, 'wt') as Tf:
            for line in source_stream:
                cols = line.strip().split()
                if int(cols[0]) in VALIDATION_QIDS:
                    vf.write(' '.join(cols) + '\n')
                elif int(cols[0]) in TEST_QIDS:
                    Tf.write(' '.join(cols) + '\n')
                else:
                    tf.write(' '.join(cols) + '\n')
            for line in source_stream2:
                cols = line.strip().split()
                # fixed: the 2014 branch compared the raw string against
                # VALIDATION_QIDS while every other branch uses int(...)
                if int(cols[0]) in VALIDATION_QIDS:
                    vf.write(' '.join(cols) + '\n')
                elif int(cols[0]) in TEST_QIDS:
                    Tf.write(' '.join(cols) + '\n')
                else:
                    tf.write(' '.join(cols) + '\n')

    all_queries = os.path.join(base_path, 'topics.txt')
    if (force or not os.path.exists(all_queries)) and self._confirm_dua():
        source_stream = util.download_stream(**_FILES['queries_2013'], encoding='utf8')
        source_stream2 = util.download_stream(**_FILES['queries_2014'], encoding='utf8')
        queries = []
        for _id, _query in trec.parse_query_mbformat(source_stream):
            nid = _id.replace('MB', '').strip()
            queries.append([nid, _query])
        for _id, _query in trec.parse_query_mbformat(source_stream2):
            nid = _id.replace('MB', '').strip()
            queries.append([nid, _query])
        plaintext.write_tsv(all_queries, queries)
def __init__(self, config, logger, vocab):
    super().__init__(config, logger, vocab)
    base_path = os.path.join(util.path_dataset(self), config['date'])
    os.makedirs(base_path, exist_ok=True)
    self.index_stem = indices.MultifieldAnseriniIndex(
        os.path.join(base_path, 'anserini_multifield'),
        stemmer='porter', primary_field=config['bs_field'])
    self.index_stem_2020 = indices.MultifieldAnseriniIndex(
        os.path.join(base_path, 'anserini_multifield_2020'),
        stemmer='porter', primary_field=config['bs_field'])
    self.doc_store = indices.MultifieldSqliteDocstore(
        os.path.join(base_path, 'docs_multifield.sqlite'),
        primary_field=config['rr_field'])
def init(self, force=False):
    idxs = [self.index, self.index_stem, self.doc_store]
    self._init_indices_parallel(idxs, self._init_iter_collection(), force)

    train_qrels = os.path.join(util.path_dataset(self), 'train.qrels.txt')
    valid_qrels = os.path.join(util.path_dataset(self), 'valid.qrels.txt')
    if (force or not os.path.exists(train_qrels)
            or not os.path.exists(valid_qrels)) and self._confirm_dua():
        source_stream = util.download_stream(
            'https://ciir.cs.umass.edu/downloads/Antique/antique-train.qrel',
            encoding='utf8')
        with util.finialized_file(train_qrels, 'wt') as tf, \
             util.finialized_file(valid_qrels, 'wt') as vf:
            for line in source_stream:
                cols = line.strip().split()
                if cols[0] in VALIDATION_QIDS:
                    vf.write(' '.join(cols) + '\n')
                else:
                    tf.write(' '.join(cols) + '\n')

    train_queries = os.path.join(util.path_dataset(self), 'train.queries.txt')
    valid_queries = os.path.join(util.path_dataset(self), 'valid.queries.txt')
    if (force or not os.path.exists(train_queries)
            or not os.path.exists(valid_queries)) and self._confirm_dua():
        source_stream = util.download_stream(
            'https://ciir.cs.umass.edu/downloads/Antique/antique-train-queries.txt',
            encoding='utf8')
        train, valid = [], []
        for cols in plaintext.read_tsv(source_stream):
            if cols[0] in VALIDATION_QIDS:
                valid.append(cols)
            else:
                train.append(cols)
        plaintext.write_tsv(train_queries, train)
        plaintext.write_tsv(valid_queries, valid)

    test_qrels = os.path.join(util.path_dataset(self), 'test.qrels.txt')
    if (force or not os.path.exists(test_qrels)) and self._confirm_dua():
        util.download(
            'https://ciir.cs.umass.edu/downloads/Antique/antique-test.qrel',
            test_qrels)

    test_queries = os.path.join(util.path_dataset(self), 'test.queries.txt')
    if (force or not os.path.exists(test_queries)) and self._confirm_dua():
        util.download(
            'https://ciir.cs.umass.edu/downloads/Antique/antique-test-queries.txt',
            test_queries)
def _init_topics(self, subset, topic_files, qid_prefix=None, encoding=None,
                 xml_prefix=None, force=False, expected_md5=None):
    topicf = os.path.join(util.path_dataset(self), f'{subset}.topics')
    if (force or not os.path.exists(topicf)) and self._confirm_dua():
        topics = []
        for topic_file in topic_files:
            topic_file_stream = util.download_stream(
                topic_file, encoding, expected_md5=expected_md5)
            for t, qid, text in trec.parse_query_format(topic_file_stream, xml_prefix):
                if qid_prefix is not None:
                    qid = qid.replace(qid_prefix, '')
                topics.append((t, qid, text))
        plaintext.write_tsv(topicf, topics)
def init(self, force=False):
    needs_docs = []
    for index in [self.index_stem, self.index_stem_2020, self.doc_store]:
        if force or not index.built():
            needs_docs.append(index)
    if needs_docs and self._confirm_dua():
        with contextlib.ExitStack() as stack:
            doc_iter = self._init_iter_collection()
            doc_iter = self.logger.pbar(doc_iter, desc='articles')
            doc_iters = util.blocking_tee(doc_iter, len(needs_docs))
            for idx, it in zip(needs_docs, doc_iters):
                if idx is self.index_stem_2020:
                    it = (d for d in it if '2020' in d.data['date'])
                stack.enter_context(util.CtxtThread(functools.partial(idx.build, it)))

    path = os.path.join(util.path_dataset(self), 'rnd1.tsv')
    if not os.path.exists(path) and self._confirm_dua():
        with util.download_tmp('https://ir.nist.gov/covidSubmit/data/topics-rnd1.xml',
                               expected_md5="cf1b605222f45f7dbc90ca8e4d9b2c31") as f, \
             util.finialized_file(path, 'wt') as fout:
            soup = BeautifulSoup(f.read(), 'lxml-xml')
            for topic in soup.find_all('topic'):
                qid = topic['number']
                plaintext.write_tsv(fout, [
                    (qid, 'query', topic.find('query').get_text()),
                    (qid, 'quest', topic.find('question').get_text()),
                    (qid, 'narr', topic.find('narrative').get_text()),
                ])
    udel_flag = path + '.includes_udel'
    if not os.path.exists(udel_flag):
        with open(path, 'at') as fout, util.finialized_file(udel_flag, 'wt'):
            with util.download_tmp(
                    'https://raw.githubusercontent.com/castorini/anserini/master/src/main/resources/topics-and-qrels/topics.covid-round1-udel.xml',
                    expected_md5="2915cf59ae222f0aa20b2a671f67fd7a") as f:
                soup = BeautifulSoup(f.read(), 'lxml-xml')
                for topic in soup.find_all('topic'):
                    qid = topic['number']
                    plaintext.write_tsv(fout, [
                        (qid, 'udel', topic.find('query').get_text()),
                    ])

    path = os.path.join(util.path_dataset(self), 'rnd2.tsv')
    if not os.path.exists(path) and self._confirm_dua():
        with util.download_tmp('https://ir.nist.gov/covidSubmit/data/topics-rnd2.xml',
                               expected_md5="550129e71c83de3fb4d6d29a172c5842") as f, \
             util.finialized_file(path, 'wt') as fout:
            soup = BeautifulSoup(f.read(), 'lxml-xml')
            for topic in soup.find_all('topic'):
                qid = topic['number']
                plaintext.write_tsv(fout, [
                    (qid, 'query', topic.find('query').get_text()),
                    (qid, 'quest', topic.find('question').get_text()),
                    (qid, 'narr', topic.find('narrative').get_text()),
                ])
    udel_flag = path + '.includes_udel'
    if not os.path.exists(udel_flag):
        with open(path, 'at') as fout, util.finialized_file(udel_flag, 'wt'):
            with util.download_tmp(
                    'https://raw.githubusercontent.com/castorini/anserini/master/src/main/resources/topics-and-qrels/topics.covid-round2-udel.xml',
                    expected_md5="a8988734e6f812921d5125249c197985") as f:
                soup = BeautifulSoup(f.read(), 'lxml-xml')
                for topic in soup.find_all('topic'):
                    qid = topic['number']
                    plaintext.write_tsv(fout, [
                        (qid, 'udel', topic.find('query').get_text()),
                    ])

    path = os.path.join(util.path_dataset(self), 'rnd1.qrels')
    if not os.path.exists(path) and self._confirm_dua():
        util.download('https://ir.nist.gov/covidSubmit/data/qrels-rnd1.txt', path,
                      expected_md5="d58586df5823e7d1d0b3619a73b31518")
def _load_qrels(self, subset, fmt):
    with logger.duration('loading qrels'):
        base_path = util.path_dataset(self)
        path = os.path.join(base_path, f'{subset}.qrels')
        return trec.read_qrels_fmt(path, fmt)
def _load_queries_base(self, subset):
    with logger.duration('loading queries'):
        base_path = util.path_dataset(self)
        path = os.path.join(base_path, f'{subset}.queries')
        return dict(plaintext.read_tsv(path))