Example #1
    def init(self, force=False):
        base_dir = os.path.join(util.path_dataset(self), self.subset)

        if self.subset == 'dummy':
            datafile = os.path.join(base_dir, 'datafile.tsv')
            qrels = os.path.join(base_dir, 'qrels.txt')
            if not os.path.exists(datafile):
                os.symlink(os.path.abspath('etc/dummy_datafile.tsv'), datafile)
            if not os.path.exists(qrels):
                os.symlink(os.path.abspath('etc/dummy_qrels.txt'), qrels)

        # Collect build steps that each need a single pass over the datafiles.
        needs_datafile = []
        if force or not self.index.built():
            needs_datafile.append(lambda it: self.index.build(
                indices.RawDoc(did, txt) for t, did, txt in it if t == 'doc'))

        if force or not self.index_stem.built():
            needs_datafile.append(lambda it: self.index_stem.build(
                indices.RawDoc(did, txt) for t, did, txt in it if t == 'doc'))

        if force or not self.doc_store.built():
            needs_datafile.append(lambda it: self.doc_store.build(
                indices.RawDoc(did, txt) for t, did, txt in it if t == 'doc'))

        query_file = os.path.join(base_dir, 'queries.tsv')
        if force or not os.path.exists(query_file):
            needs_datafile.append(
                lambda it: plaintext.write_tsv(query_file, (
                    (qid, txt) for t, qid, txt in it if t == 'query')))

        if needs_datafile:
            df_glob = os.path.join(base_dir, 'datafile*.tsv')
            datafiles = glob(df_glob)
            while not datafiles:
                c = util.confirm(
                    f'No data files found. Please move/link data files to {df_glob}.\n'
                    'Data files should contain both queries and documents in the '
                    'following format (one per line):\n'
                    '[query|doc] [TAB] [qid/did] [TAB] [text]')
                if not c:
                    sys.exit(1)
                datafiles = glob(df_glob)
            main_iter = itertools.chain(*(plaintext.read_tsv(df)
                                          for df in datafiles))
            main_iter = tqdm(main_iter, desc='reading datafiles')
            # Fan the single pass over the datafiles out to all pending build steps,
            # each consuming its own copy of the iterator in a separate thread.
            iters = util.blocking_tee(main_iter, len(needs_datafile))
            with contextlib.ExitStack() as stack:
                for fn, it in zip(needs_datafile, iters):
                    stack.enter_context(
                        util.CtxtThread(functools.partial(fn, it)))

        qrels_file = os.path.join(base_dir, 'qrels.txt')
        while not os.path.exists(qrels_file):
            c = util.confirm(
                f'No qrels file found. Please move/link qrels file to {qrels_file}.\n'
                'Qrels file should be in the TREC format:\n'
                '[qid] [SPACE] Q0 [SPACE] [did] [SPACE] [score]')
            if not c:
                sys.exit(1)
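For reference, a minimal sketch of the files this init flow expects under the dataset's base directory, using only the standard library (the path and record contents are illustrative only; the real code resolves the directory via util.path_dataset):

import os

base_dir = '/tmp/dummy_dataset'  # illustrative location
os.makedirs(base_dir, exist_ok=True)

# datafile.tsv: one record per line, [query|doc] TAB [qid/did] TAB [text]
with open(os.path.join(base_dir, 'datafile.tsv'), 'wt') as f:
    f.write('doc\tD1\ta short document about neural ranking\n')
    f.write('query\tQ1\tneural ranking\n')

# qrels.txt: TREC format, [qid] [SPACE] Q0 [SPACE] [did] [SPACE] [score]
with open(os.path.join(base_dir, 'qrels.txt'), 'wt') as f:
    f.write('Q1 Q0 D1 1\n')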
Example #2
 def _init_iter_collection(self, zipf, collection):
     with zipf.open(f'wikIR{collection}/documents.csv') as f:
         f = io.TextIOWrapper(f)
         f.readline()  # skip the header row
         for did, text in self.logger.pbar(plaintext.read_sv(f, ','),
                                           desc='documents'):
             yield indices.RawDoc(did, text)
Example #3
 def _init_iter_collection(self):
     docs_cls = self.docs_ds.docs_cls()
     fields = self.config['docs_index_fields'].split(',')
     assert all(f in docs_cls._fields for f in fields)
     # Map the configured field names to their positions in the docs namedtuple.
     field_idxs = [docs_cls._fields.index(f) for f in fields]
     for doc in self.docs_ds.docs_iter():
         yield indices.RawDoc(doc.doc_id,
                              '\n'.join(str(doc[i]) for i in field_idxs))
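A self-contained illustration of the field selection above, using a hypothetical namedtuple in place of docs_cls (all names here are invented for the sketch):

from collections import namedtuple

FakeDoc = namedtuple('FakeDoc', ['doc_id', 'title', 'body'])  # stand-in for docs_cls
fields = 'title,body'.split(',')                              # e.g. config['docs_index_fields']
assert all(f in FakeDoc._fields for f in fields)
field_idxs = [FakeDoc._fields.index(f) for f in fields]

doc = FakeDoc('D1', 'A title', 'Some body text')
print('\n'.join(str(doc[i]) for i in field_idxs))  # "A title" then "Some body text"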
Example #4
 def _init_doctttttquery_iter(self):
     with util.download_tmp(_SOURCES['doctttttquery-predictions'], expected_md5=_HASHES['doctttttquery-predictions']) as f1, \
          util.download_tmp(_SOURCES['collection'], expected_md5=_HASHES['collection']) as f2:
         with zipfile.ZipFile(f1) as zipf, tarfile.open(fileobj=f2) as tarf:
             collection_stream = io.TextIOWrapper(tarf.extractfile('collection.tsv'))
             d5_iter = self._init_doctttttquery_zipf_iter(zipf)
             # Append each passage's docTTTTTquery expansion text to the original passage text.
             for (did, text), d5text in self.logger.pbar(zip(plaintext.read_tsv(collection_stream), d5_iter), desc='documents'):
                 yield indices.RawDoc(did, f'{text} {d5text}')
Example #5
 def _init_iter_collection(self):
     with util.download_tmp(_SOURCES['collection']) as f:
         with tarfile.open(fileobj=f) as tarf:
             collection_stream = io.TextIOWrapper(
                 tarf.extractfile('collection.tsv'))
             for did, text in self.logger.pbar(
                     plaintext.read_tsv(collection_stream),
                     desc='documents'):
                 yield indices.RawDoc(did, text)
Example #6
 def _init_iter_collection(self):
     # Using the trick here from capreolus, pulling document content out of public index:
     # <https://github.com/capreolus-ir/capreolus/blob/d6ae210b24c32ff817f615370a9af37b06d2da89/capreolus/collection/robust04.yaml#L15>
     index = indices.AnseriniIndex('../Tweets2013')
     for did in self.logger.pbar(index.docids(), desc='documents'):
         raw_doc = index.get_raw(did)
         # Pull the tweet text straight out of the stored JSON with a regex
         # rather than parsing the whole record with json.loads.
         pattern = '"text":"(.*?)","source":'
         raw_txt = re.search(pattern, raw_doc).group(1)
         yield indices.RawDoc(did, raw_txt)
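A standalone sketch of the same regex extraction on a made-up record (the sample JSON is illustrative only):

import re

raw_doc = '{"id":123,"text":"an example tweet about #trec","source":"web"}'
pattern = '"text":"(.*?)","source":'
match = re.search(pattern, raw_doc)
print(match.group(1) if match else '')  # an example tweet about #trec

json.loads would handle escaped quotes and reordered fields more robustly; the regex is a shortcut that assumes the "text" field directly precedes "source" in the stored record.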
Example #7
 def _init_iter_collection(self):
     # Using the trick here from capreolus, pulling document content out of public index:
     # <https://github.com/capreolus-ir/capreolus/blob/d6ae210b24c32ff817f615370a9af37b06d2da89/capreolus/collection/robust04.yaml#L15>
     with util.download_tmp(**_FILES['index']) as f:
         fd = f'{f.name}.d'
         util.extract_tarball(f.name,
                              fd,
                              self.logger,
                              reset_permissions=True)
         index = indices.AnseriniIndex(f'{fd}/index-robust04-20191213')
         for did in self.logger.pbar(index.docids(), desc='documents'):
             raw_doc = index.get_raw(did)
             yield indices.RawDoc(did, raw_doc)
Example #8
 def test_build(self):
     df = plaintext.read_tsv('etc/dummy_datafile.tsv')
     docs = [indices.RawDoc(did, dtext) for t, did, dtext in df if t == 'doc']
     with tempfile.TemporaryDirectory() as tmpdir:
         idxs = [
             (indices.AnseriniIndex(os.path.join(tmpdir, 'anserini')), False),
             (indices.AnseriniIndex(os.path.join(tmpdir, 'anserini.rawdocs'), store_raw_docs=True), True),
             (indices.SqliteDocstore(os.path.join(tmpdir, 'sqlite')), True),
         ]
         for index, check_raw_docs in idxs:
             with self.subTest(index=index):
                 self.assertFalse(index.built())
                 index.build(iter(docs))
                 self.assertTrue(index.built())
                 self.assertEqual(index.num_docs(), len(docs))
                 if check_raw_docs:
                     for doc in docs:
                         self.assertEqual(index.get_raw(doc.did), doc.data['text'])
Example #9
def _parse_doc_file(args):
    # Parse one TREC-style SGML file into a list of RawDocs.
    path, encoding = args  # packed as a single tuple (e.g. for use with a multiprocessing map)
    docs = []
    if path.endswith('.gz'):
        open_fn = gzip.open
    else:
        open_fn = open
    with open_fn(path, 'rt', encoding=encoding, errors='replace') as file:
        docid = None
        doc_text = ''
        tag_no = None
        while True:  # read until EOF (signalled by the StopIteration sentinel below)
            line = next(file, StopIteration)
            if line is StopIteration:
                break
            if line.startswith('<DOC ') or line.startswith('<DOC>'):
                match = re.match(r".*id=\"([^\"]+)\".*", line)
                if match:
                    docid = match.group(1)
            elif line.startswith('<DOCNO>'):
                while '</DOCNO>' not in line:
                    l = next(file, StopIteration)
                    if l is StopIteration:
                        break
                    line += l
                docid = line.replace('<DOCNO>', '').replace('</DOCNO>',
                                                            '').strip()
            elif line.startswith('</DOC>'):
                assert docid is not None
                docs.append(indices.RawDoc(docid, _strip_html(doc_text)))
                docid = None
                doc_text = ''
                tag_no = None
            elif tag_no is not None:
                doc_text += line
                if line.startswith(DOC_TEXT_END_TAGS[tag_no]):
                    tag_no = None
            else:
                # Look for the start of a text section; DOC_TEXT_TAGS and DOC_TEXT_END_TAGS
                # are parallel module-level lists of opening/closing tags.
                for i, tag in enumerate(DOC_TEXT_TAGS):
                    if line.startswith(tag):
                        tag_no = i
                        doc_text += line
                        break
    return docs
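A sketch of the kind of TREC-style record _parse_doc_file expects, exercised on a temporary file. The record and doc id are invented, and the <TEXT> tags are assumed to be listed in this module's DOC_TEXT_TAGS / DOC_TEXT_END_TAGS:

import tempfile

sample = (
    '<DOC>\n'
    '<DOCNO> EXAMPLE-0001 </DOCNO>\n'
    '<TEXT>\n'
    'Body text of the example document.\n'
    '</TEXT>\n'
    '</DOC>\n'
)
with tempfile.NamedTemporaryFile('wt', suffix='.sgml', delete=False) as f:
    f.write(sample)
docs = _parse_doc_file((f.name, 'utf8'))
# roughly: [RawDoc('EXAMPLE-0001', 'Body text of the example document.')] after HTML stripping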
Example #10
 def test_batch_query(self):
     df = list(plaintext.read_tsv('etc/dummy_datafile.tsv'))
     docs = [indices.RawDoc(did, dtext) for t, did, dtext in df if t == 'doc']
     queries = [(qid, qtext) for t, qid, qtext in df if t == 'query']
     with tempfile.TemporaryDirectory() as tmpdir:
         idxs = [
             indices.AnseriniIndex(os.path.join(tmpdir, 'anserini')),
         ]
         models = [
             'bm25', 'bm25_k1-1.5', 'bm25_b-0.2', 'bm25_k1-1.6_b-0.8',
             'bm25_rm3', 'bm25_rm3_k1-1.5', 'bm25_rm3_b-0.2', 'bm25_rm3_k1-1.6_b-0.8',
             'bm25_rm3_rm3.fbTerms-2_rm3.fbDocs-2', 'bm25_rm3_rm3.fbTerms-2_rm3.fbDocs-2_k1-1.5',
             'bm25_rm3_rm3.fbTerms-2_rm3.fbDocs-2_b-0.2', 'bm25_rm3_rm3.fbTerms-2_rm3.fbDocs-2_k1-1.6_b-0.8',
             'ql', 'ql_mu-0.4',
             'sdm', 'sdm_uw-0.3_ow-0.2_tw-0.5',
         ]
         for index in idxs:
             index.build(docs)
             for model in models:
                 with self.subTest(index=index, model=model):
                     index.batch_query(queries, model, topk=10)
                     index.batch_query(queries, model, topk=10, quiet=True)
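The model strings appear to follow a name-then-parameters convention, e.g. bm25_rm3_k1-1.6_b-0.8 reads as BM25 with RM3 expansion and k1=1.6, b=0.8. A purely illustrative parse of that convention (not the project's actual parser):

def parse_model_string(s):
    # 'bm25_rm3_k1-1.6_b-0.8' -> ('bm25_rm3', {'k1': '1.6', 'b': '0.8'})
    names, params = [], {}
    for part in s.split('_'):
        if '-' in part:
            key, _, value = part.partition('-')
            params[key] = value
        else:
            names.append(part)
    return '_'.join(names), params

print(parse_model_string('bm25_rm3_rm3.fbTerms-2_rm3.fbDocs-2_k1-1.5'))
# ('bm25_rm3', {'rm3.fbTerms': '2', 'rm3.fbDocs': '2', 'k1': '1.5'})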
Example #11
    def _init_iter_collection(self):
        files = {
            '2020-04-10': {
                'comm_use_subset':
                ('https://ai2-semanticscholar-cord-19.s3-us-west-2.amazonaws.com/2020-04-10/comm_use_subset.tar.gz',
                 "253cecb4fee2582a611fb77a4d537dc5"),
                'noncomm_use_subset':
                ('https://ai2-semanticscholar-cord-19.s3-us-west-2.amazonaws.com/2020-04-10/noncomm_use_subset.tar.gz',
                 "734b462133b3c00da578a909f945f4ae"),
                'custom_license':
                ('https://ai2-semanticscholar-cord-19.s3-us-west-2.amazonaws.com/2020-04-10/custom_license.tar.gz',
                 "2f1c9864348025987523b86d6236c40b"),
                'biorxiv_medrxiv':
                ('https://ai2-semanticscholar-cord-19.s3-us-west-2.amazonaws.com/2020-04-10/biorxiv_medrxiv.tar.gz',
                 "c12acdec8b3ad31918d752ba3db36121"),
            },
            '2020-05-01': {
                'comm_use_subset':
                ('https://ai2-semanticscholar-cord-19.s3-us-west-2.amazonaws.com/2020-05-01/comm_use_subset.tar.gz',
                 "af4202340182209881d3d8cba2d58a24"),
                'noncomm_use_subset':
                ('https://ai2-semanticscholar-cord-19.s3-us-west-2.amazonaws.com/2020-05-01/noncomm_use_subset.tar.gz',
                 "9cc25b9e8674197446e7cbd4381f643b"),
                'custom_license':
                ('https://ai2-semanticscholar-cord-19.s3-us-west-2.amazonaws.com/2020-05-01/custom_license.tar.gz',
                 "1cb6936a7300a31344cd8a5ecc9ca778"),
                'biorxiv_medrxiv':
                ('https://ai2-semanticscholar-cord-19.s3-us-west-2.amazonaws.com/2020-05-01/biorxiv_medrxiv.tar.gz',
                 "9d6c6dc5d64b01e528086f6652b3ccb7"),
                'arxiv':
                ('https://ai2-semanticscholar-cord-19.s3-us-west-2.amazonaws.com/2020-05-01/arxiv.tar.gz',
                 "f10890174d6f864f306800d4b02233bc"),
            }
        }
        metadata = {
            '2020-04-10':
            ('https://ai2-semanticscholar-cord-19.s3-us-west-2.amazonaws.com/2020-04-10/metadata.csv',
             "42a21f386be86c24647a41bedde34046"),
            '2020-05-01':
            ('https://ai2-semanticscholar-cord-19.s3-us-west-2.amazonaws.com/2020-05-01/metadata.csv',
             "b1d2e409026494e0c8034278bacd1248"),
        }
        meta_url, meta_md5 = metadata[self.config['date']]

        fulltexts = {}
        with contextlib.ExitStack() as stack:
            for fid, (file, md5) in files[self.config['date']].items():
                fulltexts[fid] = stack.enter_context(
                    util.download_tmp(file, tarf=True, expected_md5=md5))
            meta = pd.read_csv(
                util.download_stream(meta_url, expected_md5=meta_md5))
            for _, row in meta.iterrows():
                did = str(row['cord_uid'])
                title = str(row['title'])
                doi = str(row['doi'])
                abstract = str(row['abstract'])
                date = str(row['publish_time'])
                body = ''
                heads = ''
                # Prefer the PMC XML parse; fall back to the PDF parse when it is missing.
                if row['has_pmc_xml_parse']:
                    path = os.path.join(row['full_text_file'], 'pmc_json',
                                        row['pmcid'] + '.xml.json')
                    data = json.load(
                        fulltexts[row['full_text_file']].extractfile(path))
                    if 'body_text' in data:
                        body = '\n'.join(b['text'] for b in data['body_text'])
                        heads = '\n'.join(
                            set(b['section'] for b in data['body_text']))
                elif row['has_pdf_parse']:
                    path = os.path.join(
                        row['full_text_file'], 'pdf_json',
                        row['sha'].split(';')[0].strip() + '.json')
                    data = json.load(
                        fulltexts[row['full_text_file']].extractfile(path))
                    if 'body_text' in data:
                        body = '\n'.join(b['text'] for b in data['body_text'])
                        heads = '\n'.join(
                            set(b['section'] for b in data['body_text']))
                contents = f'{title}\n\n{abstract}\n\n{body}\n\n{heads}'
                doc = indices.RawDoc(did,
                                     text=contents,
                                     title=title,
                                     abstract=abstract,
                                     title_abs=f'{title}\n\n{abstract}',
                                     body=body,
                                     doi=doi,
                                     date=date)
                yield doc
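For each metadata row, the loader prefers the PMC XML parse, falls back to the PDF parse, and otherwise indexes only the title and abstract. A condensed, standalone sketch of that selection logic (the helper names are invented; the column names are the ones read above):

import os

def cord19_fulltext_path(row):
    # Path of the full-text JSON inside the per-subset tarball, or None if there is no parse.
    if row['has_pmc_xml_parse']:
        return os.path.join(row['full_text_file'], 'pmc_json', row['pmcid'] + '.xml.json')
    if row['has_pdf_parse']:
        return os.path.join(row['full_text_file'], 'pdf_json',
                            row['sha'].split(';')[0].strip() + '.json')
    return None

def cord19_body_and_heads(data):
    # `data` is a parsed full-text JSON record with an optional 'body_text' list.
    if 'body_text' not in data:
        return '', ''
    body = '\n'.join(b['text'] for b in data['body_text'])
    heads = '\n'.join(set(b['section'] for b in data['body_text']))
    return body, heads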
Example #12
 def _init_doc_iter(self):
     with util.download_tmp(_SOURCES['corpus'], tarf=True) as f:
         cbor_file = f.extract('paragraphcorpus/paragraphcorpus.cbor')
         for did, text in self.logger.pbar(car.iter_paras(cbor_file), desc='documents'):
             yield indices.RawDoc(did, text)
Example #13
def _iter_collection(path):
    logger = log.easy()
    with path.open("rt") as collection_stream:
        for did, text in logger.pbar(plaintext.read_tsv(collection_stream),
                                     desc='documents'):
            yield indices.RawDoc(did, text)
Example #14
 def _init_iter_collection(self):
     strm = util.download_stream(
         'https://ciir.cs.umass.edu/downloads/Antique/antique-collection.txt',
         'utf8')
     for did, text in plaintext.read_tsv(strm):
         yield indices.RawDoc(did, text)