Beispiel #1
0
    def init(self, force=False):
        """Build the indices and prepare the qrels and topic files.

        Steps, in order:

        1. Build ``self.index``, ``self.index_stem`` and ``self.doc_store``
           via ``self._init_indices_parallel``.
        2. Download the full qrels file if it is missing (or ``force``).
        3. Write one ``<fold>.qrels`` file per entry in ``FOLDS``, keeping
           only the query ids that belong to that fold.
        4. Download the topics and store them as TSV in ``topics.txt``.

        Downloads only happen when ``self._confirm_dua()`` returns a truthy
        value.

        Args:
            force: When true, regenerate files even if they already exist.
        """
        base_path = util.path_dataset(self)
        idxs = [self.index, self.index_stem, self.doc_store]
        self._init_indices_parallel(idxs, self._init_iter_collection(), force)

        qrels_file = os.path.join(base_path, 'qrels.robust2004.txt')
        if (force or not os.path.exists(qrels_file)) and self._confirm_dua():
            util.download(**_FILES['qrels'], file_name=qrels_file)

        # The full qrels are the same for every fold, so parse them lazily
        # and at most once instead of once per fold that must be written.
        all_qrels = None
        for fold in FOLDS:
            fold_qrels_file = os.path.join(base_path, f'{fold}.qrels')
            if force or not os.path.exists(fold_qrels_file):
                if all_qrels is None:
                    all_qrels = trec.read_qrels_dict(qrels_file)
                fold_qrels = {
                    qid: dids
                    for qid, dids in all_qrels.items() if qid in FOLDS[fold]
                }
                trec.write_qrels_dict(fold_qrels_file, fold_qrels)

        query_file = os.path.join(base_path, 'topics.txt')
        if (force or not os.path.exists(query_file)) and self._confirm_dua():
            query_file_stream = util.download_stream(**_FILES['queries'],
                                                     encoding='utf8')
            # finialized_file is entered as a context manager around the
            # write; its exact commit semantics live in util (not shown here).
            with util.finialized_file(query_file, 'wt') as f:
                plaintext.write_tsv(f,
                                    trec.parse_query_format(query_file_stream))
Beispiel #2
0
 def qrels_path(self, fold_qrels_file):
     """Return ``fold_qrels_file``, creating it first if it is not a file.

     When the file is missing, the full qrels are read from
     ``self.assessments.path``, restricted to the query ids contained in
     ``self.qids``, and written to ``fold_qrels_file``.
     """
     # Nothing to do when the fold file has already been materialized.
     if fold_qrels_file.is_file():
         return fold_qrels_file

     with self.assessments.path.open("r") as qrels_stream:
         complete_qrels = trec.read_qrels_dict(qrels_stream)

     # Keep only the judgments for queries that belong to this fold.
     selected = {}
     for qid, dids in complete_qrels.items():
         if qid in self.qids:
             selected[qid] = dids

     trec.write_qrels_dict(fold_qrels_file, selected)
     return fold_qrels_file
Beispiel #3
0
 def wrapped(it):
     """Write a qrels entry for each document on the requested side.

     ``file`` and ``is_heldout`` are taken from the enclosing scope (not
     visible in this excerpt). A document is written when its membership
     in ``_HELD_OUT_IDS`` equals ``is_heldout``; each entry maps the
     document id to itself with a value of 1.
     """
     with util.finialized_file(file, 'wt') as out:
         for doc in it:
             on_requested_side = is_heldout == (doc.did in _HELD_OUT_IDS)
             if not on_requested_side:
                 continue
             trec.write_qrels_dict(out, {doc.did: {doc.did: 1}})
Beispiel #4
0
    def init(self, force=False):
        """Build the indices and prepare topic and qrels files.

        Steps, in order:

        1. Build every index / doc store that is not built yet (or all of
           them when ``force``), feeding each from a shared document
           iterator in its own ``util.CtxtThread``. ``index_stem_2020``
           only receives documents whose ``data['date']`` contains '2020'.
        2. For rounds 1, 2 and 5: download the official topics XML into
           ``rnd<N>.tsv`` (query / question / narrative rows), then append
           the "udel" query variants once, using a
           ``rnd<N>.tsv.includes_udel`` flag file to remember that this was
           done.
        3. Download the qrels into ``rnd5.qrels``.
        4. Write one ``<fold>-rnd5.qrels`` file per entry in ``FOLDS``.

        Index builds and the topic/qrels downloads only happen when
        ``self._confirm_dua()`` returns a truthy value.

        Args:
            force: When true, rebuild the indices and regenerate the qrels
                files even if they exist. Topic files are only created when
                missing; ``force`` does not apply to them.
        """
        needs_docs = [
            index
            for index in [self.index_stem, self.index_stem_2020, self.doc_store]
            if force or not index.built()
        ]

        if needs_docs and self._confirm_dua():
            with contextlib.ExitStack() as stack:
                doc_iter = self._init_iter_collection()
                doc_iter = self.logger.pbar(doc_iter, desc='articles')
                # One iterator per consumer, all fed from the same source.
                doc_iters = util.blocking_tee(doc_iter, len(needs_docs))
                for idx, it in zip(needs_docs, doc_iters):
                    if idx is self.index_stem_2020:
                        it = (d for d in it if '2020' in d.data['date'])
                    stack.enter_context(
                        util.CtxtThread(functools.partial(idx.build, it)))

        base_path = util.path_dataset(self)

        # (round number, md5 of the official topics, md5 of the udel topics)
        topic_rounds = [
            (1, "cf1b605222f45f7dbc90ca8e4d9b2c31",
             "2915cf59ae222f0aa20b2a671f67fd7a"),
            (2, "550129e71c83de3fb4d6d29a172c5842",
             "a8988734e6f812921d5125249c197985"),
            (5, "0307a37b6b9f1a5f233340a769d538ea",
             "966a49487348dc853634bcdd0829fd26"),
        ]
        for rnd, topics_md5, udel_md5 in topic_rounds:
            path = os.path.join(base_path, f'rnd{rnd}.tsv')
            if not os.path.exists(path) and self._confirm_dua():
                topics_url = f'https://ir.nist.gov/covidSubmit/data/topics-rnd{rnd}.xml'
                with util.download_tmp(topics_url, expected_md5=topics_md5) as f, \
                     util.finialized_file(path, 'wt') as fout:
                    soup = BeautifulSoup(f.read(), 'lxml-xml')
                    for topic in soup.find_all('topic'):
                        qid = topic['number']
                        plaintext.write_tsv(fout, [
                            (qid, 'query', topic.find('query').get_text()),
                            (qid, 'quest', topic.find('question').get_text()),
                            (qid, 'narr', topic.find('narrative').get_text()),
                        ])

            udel_flag = path + '.includes_udel'
            # Only append to a topics file that actually exists. Opening a
            # missing file in append mode would create a udel-only file,
            # which would then stop the official topics from ever being
            # downloaded (the download above is skipped once `path` exists).
            if os.path.exists(path) and not os.path.exists(udel_flag):
                udel_url = f'https://raw.githubusercontent.com/castorini/anserini/master/src/main/resources/topics-and-qrels/topics.covid-round{rnd}-udel.xml'
                with open(path,
                          'at') as fout, util.finialized_file(udel_flag, 'wt'):
                    with util.download_tmp(udel_url,
                                           expected_md5=udel_md5) as f:
                        soup = BeautifulSoup(f.read(), 'lxml-xml')
                        for topic in soup.find_all('topic'):
                            qid = topic['number']
                            plaintext.write_tsv(fout, [
                                (qid, 'udel', topic.find('query').get_text()),
                            ])

        qrels_file = os.path.join(base_path, 'rnd5.qrels')
        if (force or not os.path.exists(qrels_file)) and self._confirm_dua():
            # NOTE(review): the local file is named rnd5 but the URL points
            # at the round-4 cumulative qrels -- confirm this is intended.
            util.download(
                'https://raw.githubusercontent.com/castorini/anserini/master/src/main/resources/topics-and-qrels/qrels.covid-round4-cumulative.txt',
                qrels_file)

        # The full qrels are the same for every fold, so parse them lazily
        # and at most once instead of once per fold that must be written.
        all_qrels = None
        for fold in FOLDS:
            fold_qrels_file = os.path.join(base_path, f'{fold}-rnd5.qrels')
            if force or not os.path.exists(fold_qrels_file):
                if all_qrels is None:
                    all_qrels = trec.read_qrels_dict(qrels_file)
                # Query ids are compared as strings against the fold's ids.
                fold_qrels = {
                    qid: dids
                    for qid, dids in all_qrels.items()
                    if str(qid) in FOLDS[fold]
                }
                trec.write_qrels_dict(fold_qrels_file, fold_qrels)