Example #1
def _init():
    subsets = {}
    base_path = ir_datasets.util.home_path()/NAME
    dlc = DownloadConfig.context(NAME, base_path)
    documentation = YamlDocumentation(f'docs/{NAME}.yaml')
    collection = NytDocs(dlc['source'])

    base = Dataset(collection, documentation('_'))

    all_queries = NytQueries(collection)
    all_qrels = NytQrels(collection)

    match_qids = Lazy(lambda: VALID_IDS)
    subsets['train'] = Dataset(
        FilteredQueries(all_queries, match_qids, mode='exclude'),
        FilteredQrels(all_qrels, match_qids, mode='exclude'),
        collection,
        documentation('train'))
    subsets['valid'] = Dataset(
        FilteredQueries(all_queries, match_qids, mode='include'),
        FilteredQrels(all_qrels, match_qids, mode='include'),
        collection,
        documentation('valid'))

    ir_datasets.registry.register('nyt', base)
    for s in sorted(subsets):
        ir_datasets.registry.register(f'nyt/{s}', subsets[s])

    return base, subsets
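The registered names become loadable like any built-in dataset. A minimal consumption sketch, assuming `_init()` has already run (it is normally invoked when the package registers its datasets):

import ir_datasets

dataset = ir_datasets.load('nyt/train')   # resolved through the registry populated above
for query in dataset.queries_iter():      # yields the filtered training queries
    print(query.query_id)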
Example #2
def _init():
    documentation = YamlDocumentation(f'docs/{NAME}.yaml')
    base_path = ir_datasets.util.home_path() / NAME
    dlc = DownloadConfig.context(NAME, base_path, dua=DUA)
    subsets = {}

    collection = TrecDocs(dlc['docs'],
                          path_globs=[
                              '**/FBIS/FB*', '**/FR94/??/FR*', '**/FT/*/FT*',
                              '**/LATIMES/LA*'
                          ],
                          namespace=NAME,
                          lang='en',
                          expected_file_count=2295,
                          count_hint=ir_datasets.util.count_hint(NAME))

    queries = TrecQueries(GzipExtract(dlc['queries']),
                          namespace=NAME,
                          lang='en')
    qrels = TrecQrels(dlc['qrels'], QREL_DEFS)

    base = Dataset(collection, queries, qrels, documentation('_'))

    for fold in FOLDS:
        qid_filter = make_filter(fold)
        subsets[fold] = Dataset(FilteredQueries(queries, qid_filter),
                                FilteredQrels(qrels, qid_filter), collection,
                                documentation(fold))

    ir_datasets.registry.register(NAME, base)
    for s in sorted(subsets):
        ir_datasets.registry.register(f'{NAME}/{s}', subsets[s])

    return base, subsets
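`make_filter` is defined elsewhere in the module; one plausible shape, assuming `FOLDS` maps fold names to query-ID collections (a hypothetical sketch, not the actual helper):

def make_filter(fold):
    # Lazy defers building the ID set until the fold's queries/qrels are first iterated
    return Lazy(lambda: set(FOLDS[fold]))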
Example #3
def _init():
    subsets = {}
    base_path = ir_datasets.util.home_path()/NAME
    dlc = DownloadConfig.context(NAME, base_path)
    documentation = YamlDocumentation(f'docs/{NAME}.yaml')
    migrator = Migrator(base_path/'irds_version.txt', 'v2',
        affected_files=[base_path/'nyt.tgz.pklz4'],
        message='Migrating nyt (extracting body text)')

    collection = migrator(NytDocs(dlc['source']))

    base = Dataset(collection, documentation('_'))

    # core17
    subsets['trec-core-2017'] = Dataset(
        TrecQueries(dlc['trec-core-2017/queries'], namespace='trec-core-2017', lang='en'),
        TrecQrels(dlc['trec-core-2017/qrels'], CORE_QREL_DEFS),
        collection,
        documentation('trec-core-2017'))

    # wksup
    all_queries = NytQueries(collection)
    all_qrels = NytQrels(collection)
    match_qids = Lazy(lambda: VALID_IDS)
    subsets['wksup'] = Dataset(
        all_queries,
        all_qrels,
        collection,
        documentation('wksup'))
    subsets['wksup/train'] = Dataset(
        FilteredQueries(all_queries, match_qids, mode='exclude'),
        FilteredQrels(all_qrels, match_qids, mode='exclude'),
        collection,
        documentation('wksup/train'))
    subsets['wksup/valid'] = Dataset(
        FilteredQueries(all_queries, match_qids, mode='include'),
        FilteredQrels(all_qrels, match_qids, mode='include'),
        collection,
        documentation('wksup/valid'))

    ir_datasets.registry.register('nyt', base)
    for s in sorted(subsets):
        ir_datasets.registry.register(f'nyt/{s}', subsets[s])

    return base, subsets
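Because 'wksup/train' excludes `VALID_IDS` while 'wksup/valid' includes them, the two splits partition the weak-supervision queries. A quick sanity check one could run after registration (a sketch; iterating triggers the NYT download and processing):

train_ids = {q.query_id for q in subsets['wksup/train'].queries_iter()}
valid_ids = {q.query_id for q in subsets['wksup/valid'].queries_iter()}
assert train_ids.isdisjoint(valid_ids)   # exclude/include over the same ID set never overlap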
Example #4
def _init():
    base_path = ir_datasets.util.home_path()/NAME
    documentation = YamlDocumentation(f'docs/{NAME}.yaml')
    dlc = DownloadConfig.context(NAME, base_path, dua=DUA)
    subsets = {}
    migrator = Migrator(base_path/'irds_version.txt', 'v2',
        affected_files=[base_path/'msmarco_v2_passage.tar.pklz4'],
        message='Cleaning up pklz4 lookup structure in favor of ID-based lookups')
    collection = MsMarcoV2Passages(dlc['passages'])
    collection = migrator(collection)

    qrels_migrator = Migrator(base_path/'qrels_version.txt', 'v2',
        affected_files=[base_path/'train'/'qrels.tsv', base_path/'dev1'/'qrels.tsv', base_path/'dev2'/'qrels.tsv'],
        message='Updating qrels (task organizers removed duplicates)')

    subsets['train'] = Dataset(
        collection,
        TsvQueries(dlc['train/queries'], namespace='msmarco', lang='en'),
        qrels_migrator(TrecQrels(dlc['train/qrels'], QRELS_DEFS)),
        TrecScoredDocs(GzipExtract(dlc['train/scoreddocs'])),
    )
    subsets['dev1'] = Dataset(
        collection,
        TsvQueries(dlc['dev1/queries'], namespace='msmarco', lang='en'),
        qrels_migrator(TrecQrels(dlc['dev1/qrels'], QRELS_DEFS)),
        TrecScoredDocs(GzipExtract(dlc['dev1/scoreddocs'])),
    )
    subsets['dev2'] = Dataset(
        collection,
        TsvQueries(dlc['dev2/queries'], namespace='msmarco', lang='en'),
        qrels_migrator(TrecQrels(dlc['dev2/qrels'], QRELS_DEFS)),
        TrecScoredDocs(GzipExtract(dlc['dev2/scoreddocs'])),
    )
    subsets['trec-dl-2021'] = Dataset(
        collection,
        TsvQueries(dlc['trec-dl-2021/queries'], namespace='msmarco', lang='en'),
        TrecQrels(dlc['trec-dl-2021/qrels'], TREC_DL_QRELS_DEFS),
        TrecScoredDocs(GzipExtract(dlc['trec-dl-2021/scoreddocs'])),
    )
    dl21_judged = Lazy(lambda: {q.query_id for q in subsets['trec-dl-2021'].qrels_iter()})
    subsets['trec-dl-2021/judged'] = Dataset(
        FilteredQueries(subsets['trec-dl-2021'].queries_handler(), dl21_judged),
        FilteredScoredDocs(subsets['trec-dl-2021'].scoreddocs_handler(), dl21_judged),
        subsets['trec-dl-2021'],
    )

    ir_datasets.registry.register(NAME, Dataset(collection, documentation("_")))
    for s in sorted(subsets):
        ir_datasets.registry.register(f'{NAME}/{s}', Dataset(subsets[s], documentation(s)))

    return collection, subsets
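The 'judged' recipe above generalizes: derive the kept query IDs lazily from a split's qrels, so nothing is parsed until first use. A hedged sketch for an arbitrary split (`some_subset` is illustrative):

judged_qids = Lazy(lambda: {qrel.query_id for qrel in some_subset.qrels_iter()})
judged = Dataset(
    FilteredQueries(some_subset.queries_handler(), judged_qids),
    some_subset,   # the parent supplies docs/qrels unchanged
)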
Example #5
def _init():
    documentation = YamlDocumentation(f'docs/{NAME}.yaml')
    base_path = ir_datasets.util.home_path() / NAME
    dlc = DownloadConfig.context(NAME, base_path, dua=DUA)
    migrator = Migrator(base_path / 'irds_version.txt',
                        'v2',
                        affected_files=[
                            base_path / 'collection.tsv',
                            base_path / 'collection.tsv.pklz4'
                        ],
                        message=f'Migrating {NAME} (fixing passage encoding)')

    collection = TsvDocs(Cache(
        FixEncoding(TarExtract(dlc['collectionandqueries'], 'collection.tsv')),
        base_path / 'collection.tsv'),
                         namespace='msmarco',
                         lang='en',
                         docstore_size_hint=14373971970,
                         count_hint=ir_datasets.util.count_hint(NAME))
    collection = migrator(collection)
    subsets = {}

    subsets['train'] = Dataset(
        collection,
        TsvQueries(Cache(TarExtract(dlc['queries'], 'queries.train.tsv'),
                         base_path / 'train/queries.tsv'),
                   namespace='msmarco',
                   lang='en'),
        TrecQrels(dlc['train/qrels'], QRELS_DEFS),
        TsvDocPairs(GzipExtract(dlc['train/docpairs'])),
        TrecScoredDocs(
            Cache(
                ExtractQidPid(
                    TarExtract(dlc['train/scoreddocs'], 'top1000.train.txt')),
                base_path / 'train/ms.run')),
    )

    subsets['train/triples-v2'] = Dataset(
        collection,
        subsets['train'].queries_handler(),
        subsets['train'].qrels_handler(),
        TsvDocPairs(GzipExtract(dlc['train/docpairs/v2'])),
        subsets['train'].scoreddocs_handler(),
    )

    subsets['train/triples-small'] = Dataset(
        collection,
        subsets['train'].queries_handler(),
        subsets['train'].qrels_handler(),
        TsvDocPairs(
            Cache(
                MapSmallTriplesQidPid(
                    TarExtract(dlc['train/docpairs/small'],
                               'triples.train.small.tsv'),
                    TarExtract(dlc['collectionandqueries'], 'collection.tsv'),
                    subsets['train'].queries_handler()),
                base_path / 'train/small.triples.qidpid.tsv')),
        subsets['train'].scoreddocs_handler(),
    )

    subsets['dev'] = Dataset(
        collection,
        TsvQueries(Cache(TarExtract(dlc['queries'], 'queries.dev.tsv'),
                         base_path / 'dev/queries.tsv'),
                   namespace='msmarco',
                   lang='en'),
        TrecQrels(dlc['dev/qrels'], QRELS_DEFS),
    )

    subsets['dev/small'] = Dataset(
        collection,
        TsvQueries(Cache(
            TarExtract(dlc['collectionandqueries'], 'queries.dev.small.tsv'),
            base_path / 'dev/small/queries.tsv'),
                   namespace='msmarco',
                   lang='en'),
        TrecQrels(
            Cache(
                TarExtract(dlc['collectionandqueries'], 'qrels.dev.small.tsv'),
                base_path / 'dev/small/qrels'), QRELS_DEFS),
        TrecScoredDocs(
            Cache(
                ExtractQidPid(TarExtract(dlc['dev/scoreddocs'],
                                         'top1000.dev')),
                base_path / 'dev/ms.run')),
    )

    subsets['eval'] = Dataset(
        collection,
        TsvQueries(Cache(TarExtract(dlc['queries'], 'queries.eval.tsv'),
                         base_path / 'eval/queries.tsv'),
                   namespace='msmarco',
                   lang='en'),
    )

    subsets['eval/small'] = Dataset(
        collection,
        TsvQueries(Cache(
            TarExtract(dlc['collectionandqueries'], 'queries.eval.small.tsv'),
            base_path / 'eval/small/queries.tsv'),
                   namespace='msmarco',
                   lang='en'),
        TrecScoredDocs(
            Cache(
                ExtractQidPid(
                    TarExtract(dlc['eval/scoreddocs'], 'top1000.eval')),
                base_path / 'eval/ms.run')),
    )

    subsets['trec-dl-2019'] = Dataset(
        collection,
        TrecQrels(dlc['trec-dl-2019/qrels'], TREC_DL_QRELS_DEFS),
        TsvQueries(Cache(GzipExtract(dlc['trec-dl-2019/queries']),
                         base_path / 'trec-dl-2019/queries.tsv'),
                   namespace='msmarco',
                   lang='en'),
        TrecScoredDocs(
            Cache(ExtractQidPid(GzipExtract(dlc['trec-dl-2019/scoreddocs'])),
                  base_path / 'trec-dl-2019/ms.run')),
    )

    subsets['trec-dl-2020'] = Dataset(
        collection,
        TsvQueries(GzipExtract(dlc['trec-dl-2020/queries']),
                   namespace='msmarco',
                   lang='en'),
        TrecQrels(dlc['trec-dl-2020/qrels'], TREC_DL_QRELS_DEFS),
        TrecScoredDocs(
            Cache(ExtractQidPid(GzipExtract(dlc['trec-dl-2020/scoreddocs'])),
                  base_path / 'trec-dl-2020/ms.run')),
    )

    # A few subsets constrained to just the queries/qrels/docpairs that have at least
    # one relevance assessment
    train_judged = Lazy(
        lambda: {q.query_id
                 for q in subsets['train'].qrels_iter()})
    subsets['train/judged'] = Dataset(
        FilteredQueries(subsets['train'].queries_handler(), train_judged),
        FilteredScoredDocs(subsets['train'].scoreddocs_handler(),
                           train_judged),
        subsets['train'],
    )

    dev_judged = Lazy(
        lambda: {q.query_id
                 for q in subsets['dev'].qrels_iter()})
    subsets['dev/judged'] = Dataset(
        FilteredQueries(subsets['dev'].queries_handler(), dev_judged),
        subsets['dev'],
    )

    dl19_judged = Lazy(
        lambda: {q.query_id
                 for q in subsets['trec-dl-2019'].qrels_iter()})
    subsets['trec-dl-2019/judged'] = Dataset(
        FilteredQueries(subsets['trec-dl-2019'].queries_handler(),
                        dl19_judged),
        FilteredScoredDocs(subsets['trec-dl-2019'].scoreddocs_handler(),
                           dl19_judged),
        subsets['trec-dl-2019'],
    )

    dl20_judged = Lazy(
        lambda: {q.query_id
                 for q in subsets['trec-dl-2020'].qrels_iter()})
    subsets['trec-dl-2020/judged'] = Dataset(
        FilteredQueries(subsets['trec-dl-2020'].queries_handler(),
                        dl20_judged),
        FilteredScoredDocs(subsets['trec-dl-2020'].scoreddocs_handler(),
                           dl20_judged),
        subsets['trec-dl-2020'],
    )

    # split200 -- 200 queries held out from the training data for validation
    split200 = Lazy(lambda: SPLIT200_QIDS)
    subsets['train/split200-train'] = Dataset(
        FilteredQueries(subsets['train'].queries_handler(),
                        split200,
                        mode='exclude'),
        FilteredScoredDocs(subsets['train'].scoreddocs_handler(),
                           split200,
                           mode='exclude'),
        FilteredQrels(subsets['train'].qrels_handler(),
                      split200,
                      mode='exclude'),
        FilteredDocPairs(subsets['train'].docpairs_handler(),
                         split200,
                         mode='exclude'),
        subsets['train'],
    )
    subsets['train/split200-valid'] = Dataset(
        FilteredQueries(subsets['train'].queries_handler(),
                        split200,
                        mode='include'),
        FilteredScoredDocs(subsets['train'].scoreddocs_handler(),
                           split200,
                           mode='include'),
        FilteredQrels(subsets['train'].qrels_handler(),
                      split200,
                      mode='include'),
        FilteredDocPairs(subsets['train'].docpairs_handler(),
                         split200,
                         mode='include'),
        subsets['train'],
    )

    # Medical subset
    def train_med():
        with dlc['medmarco_ids'].stream() as stream:
            stream = codecs.getreader('utf8')(stream)
            return {l.rstrip() for l in stream}

    train_med = Lazy(train_med)
    subsets['train/medical'] = Dataset(
        FilteredQueries(subsets['train'].queries_handler(), train_med),
        FilteredScoredDocs(subsets['train'].scoreddocs_handler(), train_med),
        FilteredDocPairs(subsets['train'].docpairs_handler(), train_med),
        FilteredQrels(subsets['train'].qrels_handler(), train_med),
        subsets['train'],
    )

    # DL-Hard
    dl_hard_qrels_migrator = Migrator(
        base_path / 'trec-dl-hard' / 'irds_version.txt',
        'v3',
        affected_files=[base_path / 'trec-dl-hard' / 'qrels'],
        message='Updating trec-dl-hard qrels')
    hard_qids = Lazy(lambda: DL_HARD_QIDS)
    dl_hard_base_queries = TsvQueries([
        Cache(GzipExtract(dlc['trec-dl-2019/queries']),
              base_path / 'trec-dl-2019/queries.tsv'),
        Cache(GzipExtract(dlc['trec-dl-2020/queries']),
              base_path / 'trec-dl-2020/queries.tsv')
    ],
                                      namespace='msmarco',
                                      lang='en')
    subsets['trec-dl-hard'] = Dataset(
        collection, FilteredQueries(dl_hard_base_queries, hard_qids),
        dl_hard_qrels_migrator(
            TrecQrels(dlc['trec-dl-hard/qrels'], TREC_DL_QRELS_DEFS)),
        documentation('trec-dl-hard'))
    hard_qids = Lazy(lambda: DL_HARD_QIDS_BYFOLD['1'])
    subsets['trec-dl-hard/fold1'] = Dataset(
        collection, FilteredQueries(dl_hard_base_queries, hard_qids),
        FilteredQrels(subsets['trec-dl-hard'], hard_qids),
        documentation('trec-dl-hard/fold1'))
    hard_qids = Lazy(lambda: DL_HARD_QIDS_BYFOLD['2'])
    subsets['trec-dl-hard/fold2'] = Dataset(
        collection, FilteredQueries(dl_hard_base_queries, hard_qids),
        FilteredQrels(subsets['trec-dl-hard'], hard_qids),
        documentation('trec-dl-hard/fold2'))
    hard_qids = Lazy(lambda: DL_HARD_QIDS_BYFOLD['3'])
    subsets['trec-dl-hard/fold3'] = Dataset(
        collection, FilteredQueries(dl_hard_base_queries, hard_qids),
        FilteredQrels(subsets['trec-dl-hard'], hard_qids),
        documentation('trec-dl-hard/fold3'))
    hard_qids = Lazy(lambda: DL_HARD_QIDS_BYFOLD['4'])
    subsets['trec-dl-hard/fold4'] = Dataset(
        collection, FilteredQueries(dl_hard_base_queries, hard_qids),
        FilteredQrels(subsets['trec-dl-hard'], hard_qids),
        documentation('trec-dl-hard/fold4'))
    hard_qids = Lazy(lambda: DL_HARD_QIDS_BYFOLD['5'])
    subsets['trec-dl-hard/fold5'] = Dataset(
        collection, FilteredQueries(dl_hard_base_queries, hard_qids),
        FilteredQrels(subsets['trec-dl-hard'], hard_qids),
        documentation('trec-dl-hard/fold5'))

    ir_datasets.registry.register(NAME, Dataset(collection,
                                                documentation('_')))
    for s in sorted(subsets):
        ir_datasets.registry.register(f'{NAME}/{s}',
                                      Dataset(subsets[s], documentation(s)))

    return collection, subsets
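Once registered, these handlers are reusable from other modules through the registry, which is exactly how the mMARCO example below (Example #8) shares the English train qrels. A minimal sketch:

train = ir_datasets.registry['msmarco-passage/train']   # the same object registered above
shared_qrels = train.qrels_handler()                    # reusable by other dataset definitions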
Example #6
def _init():
    base_path = ir_datasets.util.home_path() / 'msmarco-document'
    documentation = YamlDocumentation('docs/msmarco-document.yaml')
    dlc = DownloadConfig.context('msmarco-document', base_path, dua=DUA)
    subsets = {}
    collection = MsMarcoTrecDocs(GzipExtract(dlc['docs']))

    subsets['train'] = Dataset(
        collection,
        TsvQueries(GzipExtract(dlc['train/queries']), namespace='msmarco'),
        TrecQrels(GzipExtract(dlc['train/qrels']), QRELS_DEFS),
        TrecScoredDocs(GzipExtract(dlc['train/scoreddocs'])),
    )

    subsets['dev'] = Dataset(
        collection,
        TsvQueries(GzipExtract(dlc['dev/queries']), namespace='msmarco'),
        TrecQrels(GzipExtract(dlc['dev/qrels']), QRELS_DEFS),
        TrecScoredDocs(GzipExtract(dlc['dev/scoreddocs'])),
    )

    subsets['eval'] = Dataset(
        collection,
        TsvQueries(GzipExtract(dlc['eval/queries']), namespace='msmarco'),
        TrecScoredDocs(GzipExtract(dlc['eval/scoreddocs'])),
    )

    subsets['trec-dl-2019'] = Dataset(
        collection,
        TsvQueries(GzipExtract(dlc['trec-dl-2019/queries']),
                   namespace='msmarco'),
        TrecQrels(dlc['trec-dl-2019/qrels'], TREC_DL_QRELS_DEFS),
        TrecScoredDocs(GzipExtract(dlc['trec-dl-2019/scoreddocs'])),
    )

    subsets['trec-dl-2020'] = Dataset(
        collection,
        TsvQueries(GzipExtract(dlc['trec-dl-2020/queries']),
                   namespace='msmarco'),
        TrecScoredDocs(GzipExtract(dlc['trec-dl-2020/scoreddocs'])),
    )

    subsets['orcas'] = Dataset(
        collection,
        TsvQueries(GzipExtract(dlc['orcas/queries']), namespace='orcas'),
        TrecQrels(GzipExtract(dlc['orcas/qrels']), ORCAS_QLRES_DEFS),
        TrecScoredDocs(GzipExtract(dlc['orcas/scoreddocs'])),
    )

    dl19_judged = Lazy(
        lambda: {q.query_id
                 for q in subsets['trec-dl-2019'].qrels_iter()})
    subsets['trec-dl-2019/judged'] = Dataset(
        FilteredQueries(subsets['trec-dl-2019'].queries_handler(),
                        dl19_judged),
        FilteredScoredDocs(subsets['trec-dl-2019'].scoreddocs_handler(),
                           dl19_judged),
        subsets['trec-dl-2019'],
    )

    ir_datasets.registry.register('msmarco-document',
                                  Dataset(collection, documentation("_")))
    for s in sorted(subsets):
        ir_datasets.registry.register(f'msmarco-document/{s}',
                                      Dataset(subsets[s], documentation(s)))

    return collection, subsets
Example #7
def _init():
    base_path = ir_datasets.util.home_path() / NAME
    documentation = YamlDocumentation(f'docs/{NAME}.yaml')
    dlc = DownloadConfig.context(NAME, base_path, dua=DUA)
    subsets = {}
    collection = MsMarcoV2Docs(dlc['docs'])

    subsets['train'] = Dataset(
        collection,
        TsvQueries(dlc['train_queries'], namespace='msmarco', lang='en'),
        TrecQrels(dlc['train_qrels'], QRELS_DEFS),
        TrecScoredDocs(GzipExtract(dlc['train_scoreddocs'])),
    )
    subsets['dev1'] = Dataset(
        collection,
        TsvQueries(dlc['dev1_queries'], namespace='msmarco', lang='en'),
        TrecQrels(dlc['dev1_qrels'], QRELS_DEFS),
        TrecScoredDocs(GzipExtract(dlc['dev1_scoreddocs'])),
    )
    subsets['dev2'] = Dataset(
        collection,
        TsvQueries(dlc['dev2_queries'], namespace='msmarco', lang='en'),
        TrecQrels(dlc['dev2_qrels'], QRELS_DEFS),
        TrecScoredDocs(GzipExtract(dlc['dev2_scoreddocs'])),
    )
    subsets['trec-dl-2019'] = Dataset(
        collection,
        TsvQueries(GzipExtract(dlc['trec-dl-2019/queries']),
                   namespace='msmarco',
                   lang='en'),
        TrecQrels(GzipExtract(dlc['trec_dl_2019_qrels']), TREC_DL_QRELS_DEFS),
    )
    subsets['trec-dl-2020'] = Dataset(
        collection,
        TsvQueries(GzipExtract(dlc['trec-dl-2020/queries']),
                   namespace='msmarco',
                   lang='en'),
        TrecQrels(GzipExtract(dlc['trec_dl_2020_qrels']), TREC_DL_QRELS_DEFS),
    )
    dl19_v2_judged = Lazy(
        lambda: {q.query_id
                 for q in subsets['trec-dl-2019'].qrels_iter()})
    subsets['trec-dl-2019/judged'] = Dataset(
        FilteredQueries(subsets['trec-dl-2019'].queries_handler(),
                        dl19_v2_judged),
        subsets['trec-dl-2019'],
    )
    dl20_v2_judged = Lazy(
        lambda: {q.query_id
                 for q in subsets['trec-dl-2020'].qrels_iter()})
    subsets['trec-dl-2020/judged'] = Dataset(
        FilteredQueries(subsets['trec-dl-2020'].queries_handler(),
                        dl20_v2_judged),
        subsets['trec-dl-2020'],
    )
    subsets['trec-dl-2021'] = Dataset(
        collection,
        TsvQueries(dlc['trec-dl-2021/queries'], namespace='msmarco',
                   lang='en'),
        TrecQrels(dlc['trec-dl-2021/qrels'], TREC_DL_QRELS_DEFS),
        TrecScoredDocs(GzipExtract(dlc['trec-dl-2021/scoreddocs'])),
    )
    dl21_judged = Lazy(
        lambda: {q.query_id
                 for q in subsets['trec-dl-2021'].qrels_iter()})
    subsets['trec-dl-2021/judged'] = Dataset(
        FilteredQueries(subsets['trec-dl-2021'].queries_handler(),
                        dl21_judged),
        FilteredScoredDocs(subsets['trec-dl-2021'].scoreddocs_handler(),
                           dl21_judged),
        subsets['trec-dl-2021'],
    )

    subsets['anchor-text'] = Dataset(
        MsMarcoV2AnchorTextDocs(Cache(GzipExtract(dlc['anchor-text']),
                                      base_path / "anchor-text.json"),
                                count_hint=4821244),
        documentation('anchor-text'))

    ir_datasets.registry.register(NAME, Dataset(collection,
                                                documentation("_")))
    for s in sorted(subsets):
        ir_datasets.registry.register(f'{NAME}/{s}',
                                      Dataset(subsets[s], documentation(s)))

    return collection, subsets
Example #8
def _init():
    documentation = YamlDocumentation(f'docs/{NAME}.yaml')
    base_path = ir_datasets.util.home_path() / NAME
    dlc = DownloadConfig.context(NAME, base_path)

    subsets = {}

    train_qrels = ir_datasets.registry['msmarco-passage/train'].qrels_handler()
    train_docpairs = TsvDocPairs(dlc['train/triples'])
    dev_qrels = TrecQrels(dlc['dev/qrels'], QRELS_DEFS)
    dev_small_qrels = TrecQrels(dlc['dev/qrels-small'], QRELS_DEFS)
    small_dev_qids = Lazy(
        lambda: {q.query_id
                 for q in dev_small_qrels.qrels_iter()})

    for lang in ['es', 'fr', 'pt', 'it', 'id', 'de', 'ru', 'zh']:
        collection = TsvDocs(
            dlc[f'{lang}/docs'],
            namespace=f'mmarco/{lang}',
            lang=lang,
            count_hint=ir_datasets.util.count_hint(f'{NAME}/{lang}'))
        subsets[f'{lang}'] = Dataset(collection, documentation(f'{lang}'))
        subsets[f'{lang}/train'] = Dataset(
            collection,
            TsvQueries(dlc[f'{lang}/queries/train'],
                       namespace=f'mmarco/{lang}',
                       lang=lang), train_qrels, train_docpairs,
            documentation(f'{lang}/train'))
        subsets[f'{lang}/dev'] = Dataset(
            collection,
            TsvQueries(dlc[f'{lang}/queries/dev'],
                       namespace=f'mmarco/{lang}',
                       lang=lang), dev_qrels, documentation(f'{lang}/dev'))
        subsets[f'{lang}/dev/small'] = Dataset(
            collection,
            FilteredQueries(subsets[f'{lang}/dev'].queries_handler(),
                            small_dev_qids,
                            mode='include'), dev_small_qrels,
            TrecScoredDocs(dlc[f'{lang}/scoreddocs/dev'])
            if lang not in ('zh', 'pt') else None,
            documentation(f'{lang}/dev/small'))
        if lang in ('zh', 'pt'):
            subsets[f'{lang}/dev/v1.1'] = Dataset(
                collection,
                TsvQueries(dlc[f'{lang}/queries/dev/v1.1'],
                           namespace=f'mmarco/{lang}',
                           lang=lang), dev_qrels,
                documentation(f'{lang}/dev/v1.1'))
            subsets[f'{lang}/dev/small/v1.1'] = Dataset(
                collection,
                FilteredQueries(subsets[f'{lang}/dev/v1.1'].queries_handler(),
                                small_dev_qids,
                                mode='include'), dev_small_qrels,
                TrecScoredDocs(dlc[f'{lang}/scoreddocs/dev/v1.1']),
                documentation(f'{lang}/dev/small/v1.1'))
        if lang in ('pt', ):
            subsets[f'{lang}/train/v1.1'] = Dataset(
                collection,
                TsvQueries(dlc[f'{lang}/queries/train/v1.1'],
                           namespace=f'mmarco/{lang}',
                           lang=lang), train_qrels, train_docpairs,
                documentation(f'{lang}/train/v1.1'))

    for lang in [
            'ar', 'zh', 'dt', 'fr', 'de', 'hi', 'id', 'it', 'ja', 'pt', 'ru',
            'es', 'vi'
    ]:
        collection = TsvDocs(
            dlc[f'v2/{lang}/docs'],
            namespace=f'mmarco/{lang}',
            lang=lang,
            count_hint=ir_datasets.util.count_hint(f'{NAME}/v2/{lang}'))
        subsets[f'v2/{lang}'] = Dataset(collection,
                                        documentation(f'v2/{lang}'))
        subsets[f'v2/{lang}/train'] = Dataset(
            collection,
            TsvQueries(dlc[f'v2/{lang}/queries/train'],
                       namespace=f'mmarco/v2/{lang}',
                       lang=lang), train_qrels, train_docpairs,
            documentation(f'v2/{lang}/train'))
        subsets[f'v2/{lang}/dev'] = Dataset(
            collection,
            TsvQueries(dlc[f'v2/{lang}/queries/dev'],
                       namespace=f'v2/mmarco/{lang}',
                       lang=lang), dev_qrels, documentation(f'v2/{lang}/dev'))
        subsets[f'v2/{lang}/dev/small'] = Dataset(
            collection,
            FilteredQueries(subsets[f'v2/{lang}/dev'].queries_handler(),
                            small_dev_qids,
                            mode='include'), dev_small_qrels,
            TrecScoredDocs(dlc[f'v2/{lang}/scoreddocs/dev'],
                           negate_score=True),
            documentation(f'v2/{lang}/dev/small'))

    ir_datasets.registry.register(NAME, Dataset(documentation('_')))
    for s in sorted(subsets):
        ir_datasets.registry.register(f'{NAME}/{s}', subsets[s])

    return collection, subsets
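After the loops run, every language/split combination is addressable by name. A small usage sketch, assuming `NAME` is 'mmarco' (consistent with the `mmarco/{lang}` namespace strings above):

import ir_datasets

ds = ir_datasets.load('mmarco/es/dev/small')   # 'mmarco' assumed from the namespaces
for query in ds.queries_iter():
    ...   # Spanish queries filtered down to the small dev set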
Example #9
def _init():
    base_path = ir_datasets.util.home_path()/NAME
    documentation = YamlDocumentation(f'docs/{NAME}.yaml')
    dlc = DownloadConfig.context(NAME, base_path, dua=DUA)
    subsets = {}
    collection = MsMarcoTrecDocs(GzipExtract(dlc['docs']))

    subsets['train'] = Dataset(
        collection,
        TsvQueries(GzipExtract(dlc['train/queries']), namespace='msmarco', lang='en'),
        TrecQrels(GzipExtract(dlc['train/qrels']), QRELS_DEFS),
        TrecScoredDocs(GzipExtract(dlc['train/scoreddocs'])),
    )

    subsets['dev'] = Dataset(
        collection,
        TsvQueries(GzipExtract(dlc['dev/queries']), namespace='msmarco', lang='en'),
        TrecQrels(GzipExtract(dlc['dev/qrels']), QRELS_DEFS),
        TrecScoredDocs(GzipExtract(dlc['dev/scoreddocs'])),
    )

    subsets['eval'] = Dataset(
        collection,
        TsvQueries(GzipExtract(dlc['eval/queries']), namespace='msmarco', lang='en'),
        TrecScoredDocs(GzipExtract(dlc['eval/scoreddocs'])),
    )

    subsets['trec-dl-2019'] = Dataset(
        collection,
        TsvQueries(GzipExtract(dlc['trec-dl-2019/queries']), namespace='msmarco', lang='en'),
        TrecQrels(dlc['trec-dl-2019/qrels'], TREC_DL_QRELS_DEFS),
        TrecScoredDocs(GzipExtract(dlc['trec-dl-2019/scoreddocs'])),
    )

    subsets['trec-dl-2020'] = Dataset(
        collection,
        TsvQueries(GzipExtract(dlc['trec-dl-2020/queries']), namespace='msmarco', lang='en'),
        TrecQrels(dlc['trec-dl-2020/qrels'], TREC_DL_QRELS_DEFS),
        TrecScoredDocs(GzipExtract(dlc['trec-dl-2020/scoreddocs'])),
    )

    subsets['orcas'] = Dataset(
        collection,
        TsvQueries(GzipExtract(dlc['orcas/queries']), namespace='orcas', lang='en'),
        TrecQrels(GzipExtract(dlc['orcas/qrels']), ORCAS_QLRES_DEFS),
        TrecScoredDocs(GzipExtract(dlc['orcas/scoreddocs'])),
    )

    dl19_judged = Lazy(lambda: {q.query_id for q in subsets['trec-dl-2019'].qrels_iter()})
    subsets['trec-dl-2019/judged'] = Dataset(
        FilteredQueries(subsets['trec-dl-2019'].queries_handler(), dl19_judged),
        FilteredScoredDocs(subsets['trec-dl-2019'].scoreddocs_handler(), dl19_judged),
        subsets['trec-dl-2019'],
    )

    dl20_judged = Lazy(lambda: {q.query_id for q in subsets['trec-dl-2020'].qrels_iter()})
    subsets['trec-dl-2020/judged'] = Dataset(
        FilteredQueries(subsets['trec-dl-2020'].queries_handler(), dl20_judged),
        FilteredScoredDocs(subsets['trec-dl-2020'].scoreddocs_handler(), dl20_judged),
        subsets['trec-dl-2020'],
    )

    # DL-Hard
    dl_hard_qrels_migrator = Migrator(base_path/'trec-dl-hard'/'irds_version.txt', 'v2',
        affected_files=[base_path/'trec-dl-hard'/'qrels'],
        message='Updating trec-dl-hard qrels')
    hard_qids = Lazy(lambda: DL_HARD_QIDS)
    dl_hard_base_queries = TsvQueries([
            Cache(GzipExtract(dlc['trec-dl-2019/queries']), base_path/'trec-dl-2019/queries.tsv'),
            Cache(GzipExtract(dlc['trec-dl-2020/queries']), base_path/'trec-dl-2020/queries.tsv')], namespace='msmarco', lang='en')
    subsets['trec-dl-hard'] = Dataset(
        collection,
        FilteredQueries(dl_hard_base_queries, hard_qids),
        dl_hard_qrels_migrator(TrecQrels(dlc['trec-dl-hard/qrels'], TREC_DL_QRELS_DEFS)),
        documentation('trec-dl-hard')
    )
    hard_qids = Lazy(lambda: DL_HARD_QIDS_BYFOLD['1'])
    subsets['trec-dl-hard/fold1'] = Dataset(
        collection,
        FilteredQueries(dl_hard_base_queries, hard_qids),
        FilteredQrels(subsets['trec-dl-hard'], hard_qids),
        documentation('trec-dl-hard/fold1')
    )
    hard_qids = Lazy(lambda: DL_HARD_QIDS_BYFOLD['2'])
    subsets['trec-dl-hard/fold2'] = Dataset(
        collection,
        FilteredQueries(dl_hard_base_queries, hard_qids),
        FilteredQrels(subsets['trec-dl-hard'], hard_qids),
        documentation('trec-dl-hard/fold2')
    )
    hard_qids = Lazy(lambda: DL_HARD_QIDS_BYFOLD['3'])
    subsets['trec-dl-hard/fold3'] = Dataset(
        collection,
        FilteredQueries(dl_hard_base_queries, hard_qids),
        FilteredQrels(subsets['trec-dl-hard'], hard_qids),
        documentation('trec-dl-hard/fold3')
    )
    hard_qids = Lazy(lambda: DL_HARD_QIDS_BYFOLD['4'])
    subsets['trec-dl-hard/fold4'] = Dataset(
        collection,
        FilteredQueries(dl_hard_base_queries, hard_qids),
        FilteredQrels(subsets['trec-dl-hard'], hard_qids),
        documentation('trec-dl-hard/fold4')
    )
    hard_qids = Lazy(lambda: DL_HARD_QIDS_BYFOLD['5'])
    subsets['trec-dl-hard/fold5'] = Dataset(
        collection,
        FilteredQueries(dl_hard_base_queries, hard_qids),
        FilteredQrels(subsets['trec-dl-hard'], hard_qids),
        documentation('trec-dl-hard/fold5')
    )
    
    subsets['anchor-text'] = Dataset(
        MsMarcoAnchorTextDocs(
            Cache(GzipExtract(dlc['anchor-text']), base_path / "anchor-text.json"),
            count_hint=1703834
        ),
        documentation('anchor-text')
    )

    ir_datasets.registry.register(NAME, Dataset(collection, documentation("_")))
    for s in sorted(subsets):
        ir_datasets.registry.register(f'{NAME}/{s}', Dataset(subsets[s], documentation(s)))

    return collection, subsets
Example #10
def _init():
    documentation = YamlDocumentation('docs/msmarco-passage.yaml')
    base_path = ir_datasets.util.home_path() / 'msmarco-passage'
    dlc = DownloadConfig.context('msmarco-passage', base_path, dua=DUA)
    collection = TsvDocs(Cache(
        FixEncoding(TarExtract(dlc['collectionandqueries'], 'collection.tsv')),
        base_path / 'collection.tsv'),
                         namespace='msmarco')
    subsets = {}

    subsets['train'] = Dataset(
        collection,
        TsvQueries(Cache(TarExtract(dlc['queries'], 'queries.train.tsv'),
                         base_path / 'train/queries.tsv'),
                   namespace='msmarco'),
        TrecQrels(dlc['train/qrels'], QRELS_DEFS),
        TsvDocPairs(GzipExtract(dlc['train/docpairs'])),
        TrecScoredDocs(
            Cache(
                ExtractQidPid(
                    TarExtract(dlc['train/scoreddocs'], 'top1000.train.txt')),
                base_path / 'train/ms.run')),
    )

    subsets['dev'] = Dataset(
        collection,
        TsvQueries(Cache(TarExtract(dlc['queries'], 'queries.dev.tsv'),
                         base_path / 'dev/queries.tsv'),
                   namespace='msmarco'),
        TrecQrels(dlc['dev/qrels'], QRELS_DEFS),
        TrecScoredDocs(
            Cache(
                ExtractQidPid(TarExtract(dlc['dev/scoreddocs'],
                                         'top1000.dev')),
                base_path / 'dev/ms.run')),
    )

    subsets['dev/small'] = Dataset(
        collection,
        TsvQueries(Cache(
            TarExtract(dlc['collectionandqueries'], 'queries.dev.small.tsv'),
            base_path / 'dev/small/queries.tsv'),
                   namespace='msmarco'),
        TrecQrels(
            Cache(
                TarExtract(dlc['collectionandqueries'], 'qrels.dev.small.tsv'),
                base_path / 'dev/small/qrels'), QRELS_DEFS),
    )

    subsets['eval'] = Dataset(
        collection,
        TsvQueries(Cache(TarExtract(dlc['queries'], 'queries.eval.tsv'),
                         base_path / 'eval/queries.tsv'),
                   namespace='msmarco'),
        TrecScoredDocs(
            Cache(
                ExtractQidPid(
                    TarExtract(dlc['eval/scoreddocs'], 'top1000.eval')),
                base_path / 'eval/ms.run')),
    )

    subsets['eval/small'] = Dataset(
        collection,
        TsvQueries(Cache(
            TarExtract(dlc['collectionandqueries'], 'queries.eval.small.tsv'),
            base_path / 'eval/small/queries.tsv'),
                   namespace='msmarco'),
    )

    subsets['trec-dl-2019'] = Dataset(
        collection,
        TrecQrels(dlc['trec-dl-2019/qrels'], TREC_DL_QRELS_DEFS),
        TsvQueries(Cache(GzipExtract(dlc['trec-dl-2019/queries']),
                         base_path / 'trec-dl-2019/queries.tsv'),
                   namespace='msmarco'),
        TrecScoredDocs(
            Cache(ExtractQidPid(GzipExtract(dlc['trec-dl-2019/scoreddocs'])),
                  base_path / 'trec-dl-2019/ms.run')),
    )

    subsets['trec-dl-2020'] = Dataset(
        collection,
        TsvQueries(GzipExtract(dlc['trec-dl-2020/queries']),
                   namespace='msmarco'),
        TrecScoredDocs(
            Cache(ExtractQidPid(GzipExtract(dlc['trec-dl-2020/scoreddocs'])),
                  base_path / 'trec-dl-2020/ms.run')),
    )

    # A few subsets constrained to just the queries/qrels/docpairs that have at least
    # one relevance assessment
    train_judged = Lazy(
        lambda: {q.query_id
                 for q in subsets['train'].qrels_iter()})
    subsets['train/judged'] = Dataset(
        FilteredQueries(subsets['train'].queries_handler(), train_judged),
        FilteredScoredDocs(subsets['train'].scoreddocs_handler(),
                           train_judged),
        subsets['train'],
    )

    dev_judged = Lazy(
        lambda: {q.query_id
                 for q in subsets['dev'].qrels_iter()})
    subsets['dev/judged'] = Dataset(
        FilteredQueries(subsets['dev'].queries_handler(), dev_judged),
        FilteredScoredDocs(subsets['dev'].scoreddocs_handler(), dev_judged),
        subsets['dev'],
    )

    dl19_judged = Lazy(
        lambda: {q.query_id
                 for q in subsets['trec-dl-2019'].qrels_iter()})
    subsets['trec-dl-2019/judged'] = Dataset(
        FilteredQueries(subsets['trec-dl-2019'].queries_handler(),
                        dl19_judged),
        FilteredScoredDocs(subsets['trec-dl-2019'].scoreddocs_handler(),
                           dl19_judged),
        subsets['trec-dl-2019'],
    )

    # split200 -- 200 queries held out from the training data for validation
    split200 = Lazy(lambda: SPLIT200_QIDS)
    subsets['train/split200-train'] = Dataset(
        FilteredQueries(subsets['train'].queries_handler(),
                        split200,
                        mode='exclude'),
        FilteredScoredDocs(subsets['train'].scoreddocs_handler(),
                           split200,
                           mode='exclude'),
        FilteredQrels(subsets['train'].qrels_handler(),
                      split200,
                      mode='exclude'),
        FilteredDocPairs(subsets['train'].docpairs_handler(),
                         split200,
                         mode='exclude'),
        subsets['train'],
    )
    subsets['train/split200-valid'] = Dataset(
        FilteredQueries(subsets['train'].queries_handler(),
                        split200,
                        mode='include'),
        FilteredScoredDocs(subsets['train'].scoreddocs_handler(),
                           split200,
                           mode='include'),
        FilteredQrels(subsets['train'].qrels_handler(),
                      split200,
                      mode='include'),
        FilteredDocPairs(subsets['train'].docpairs_handler(),
                         split200,
                         mode='include'),
        subsets['train'],
    )

    # Medical subset
    def train_med():
        with dlc['medmarco_ids'].stream() as stream:
            stream = codecs.getreader('utf8')(stream)
            return {l.rstrip() for l in stream}

    train_med = Lazy(train_med)
    subsets['train/medical'] = Dataset(
        FilteredQueries(subsets['train'].queries_handler(), train_med),
        FilteredScoredDocs(subsets['train'].scoreddocs_handler(), train_med),
        FilteredDocPairs(subsets['train'].docpairs_handler(), train_med),
        FilteredQrels(subsets['train'].qrels_handler(), train_med),
        subsets['train'],
    )

    ir_datasets.registry.register('msmarco-passage',
                                  Dataset(collection, documentation('_')))
    for s in sorted(subsets):
        ir_datasets.registry.register(f'msmarco-passage/{s}',
                                      Dataset(subsets[s], documentation(s)))

    return collection, subsets
Example #11
def _init():
    subsets = {}
    base_path = ir_datasets.util.home_path() / NAME
    dlc = DownloadConfig.context(NAME, base_path)
    documentation = YamlDocumentation(f'docs/{NAME}.yaml')

    def wapo_converter(dsid):
        def wrapped():
            BeautifulSoup = ir_datasets.lazy_libs.bs4().BeautifulSoup
            # NOTE: These rules are very specific in order to replicate the behaviour present in the official script
            # here: <https://github.com/grill-lab/trec-cast-tools/blob/8fa243a7e058ce4b1b378c99768c53546460c0fe/src/main/python/wapo_trecweb.py>
            # Specifically, things like skipping empty documents, filtering by "paragraph" subtype, and starting the
            # paragraph index at 1 are all needed to perfectly match the above script.
            # Note that the script does NOT strip HTML markup, which is meant to be removed in a later stage (e.g., indexing).
            # We do that here for user simplicity, as it will allow the text to be consumed directly by various models
            # without the need for further pre-processing. (Though a bit of information is lost.)
            for wapo_doc in ir_datasets.load(dsid).docs_handler().docs_wapo_raw_iter():
                doc_id = wapo_doc['id']
                pid = itertools.count(1)  # paragraph index starts at 1
                for paragraph in wapo_doc['contents']:
                    if (paragraph is not None
                            and paragraph.get('subtype') == 'paragraph'
                            and paragraph['content'] != ''):
                        text = paragraph['content']
                        if paragraph.get('mime') == 'text/html':
                            text = BeautifulSoup(f'<OUTER>{text}</OUTER>',
                                                 'lxml-xml').get_text()
                        yield GenericDoc(f'WAPO_{doc_id}-{next(pid)}', text)

        return wrapped

    def prefixer(dsid, prefix):
        def wrapped():
            for doc in ir_datasets.load(dsid).docs_iter():
                yield GenericDoc(f'{prefix}_{doc.doc_id}', doc.text)

        return wrapped

    WAPO_v2 = wapo_converter('wapo/v2')
    MARCO = prefixer('msmarco-passage', 'MARCO')
    CAR = prefixer('car/v2.0', 'CAR')

    docs_v0 = CastDocs('docs_v0', [
        ('WAPO', WAPO_v2, dlc['wapo_dupes']),
        ('MARCO', MARCO, dlc['marco_dupes']),
        ('CAR', CAR, None),
    ])

    docs_v1 = CastDocs('docs_v1', [
        ('MARCO', MARCO, dlc['marco_dupes']),
        ('CAR', CAR, None),
    ])

    base = Dataset(documentation('_'))

    subsets['v0'] = Dataset(docs_v0)

    subsets['v0/train'] = Dataset(
        docs_v0, CastQueries(dlc['2019/train/queries'], Cast2019Query),
        TrecQrels(dlc['2019/train/qrels'], QRELS_DEFS_TRAIN),
        TrecScoredDocs(dlc['2019/train/scoreddocs']))
    qids_train_v0 = Lazy(
        lambda: {q.query_id
                 for q in subsets['v0/train'].qrels_iter()})
    subsets['v0/train/judged'] = Dataset(
        docs_v0,
        FilteredQueries(subsets['v0/train'].queries_handler(), qids_train_v0),
        subsets['v0/train'].qrels_handler(),
        FilteredScoredDocs(subsets['v0/train'].scoreddocs_handler(),
                           qids_train_v0),
    )

    subsets['v1'] = Dataset(docs_v1)

    subsets['v1/2019'] = Dataset(
        docs_v1, CastQueries(dlc['2019/eval/queries'], Cast2019Query),
        TrecQrels(dlc['2019/eval/qrels'], QRELS_DEFS),
        TrecScoredDocs(dlc['2019/eval/scoreddocs']))
    qids_2019 = Lazy(
        lambda: {q.query_id
                 for q in subsets['v1/2019'].qrels_iter()})
    subsets['v1/2019/judged'] = Dataset(
        docs_v1,
        FilteredQueries(subsets['v1/2019'].queries_handler(), qids_2019),
        subsets['v1/2019'].qrels_handler(),
        FilteredScoredDocs(subsets['v1/2019'].scoreddocs_handler(), qids_2019),
    )

    subsets['v1/2020'] = Dataset(
        docs_v1,
        CastQueries(dlc['2020/queries'], Cast2020Query),
        TrecQrels(dlc['2020/qrels'], QRELS_DEFS),
    )
    qids_2020 = Lazy(
        lambda: {q.query_id
                 for q in subsets['v1/2020'].qrels_iter()})
    subsets['v1/2020/judged'] = Dataset(
        docs_v1,
        FilteredQueries(subsets['v1/2020'].queries_handler(), qids_2020),
        subsets['v1/2020'].qrels_handler(),
    )

    ir_datasets.registry.register(NAME, base)
    for s in sorted(subsets):
        ir_datasets.registry.register(f'{NAME}/{s}',
                                      Dataset(subsets[s], documentation(s)))

    return base, subsets
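`wapo_converter` and `prefixer` return zero-argument generator factories, so `CastDocs` can re-run them on demand without holding documents in memory. A hedged illustration of the factory contract:

car_docs = prefixer('car/v2.0', 'CAR')   # a factory; nothing is loaded yet
first = next(car_docs())                 # calling it yields GenericDoc('CAR_...', text) items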
Example #12
def _init():
    base_path = ir_datasets.util.home_path() / NAME
    dlc = ir_datasets.util.DownloadConfig.context(NAME, base_path)
    documentation = YamlDocumentation(f'docs/{NAME}.yaml')

    base = Dataset(documentation('_'))

    subsets = {}

    benchmarks = {
        'msmarco': (['train', 'dev', 'test'], GenericDoc, GenericQuery),
        'trec-covid': (['test'], BeirCordDoc, BeirCovidQuery),
        'nfcorpus': (['train', 'dev', 'test'], BeirTitleUrlDoc, BeirUrlQuery),
        'nq': (['test'], BeirTitleDoc, GenericQuery),
        'hotpotqa': (['train', 'dev', 'test'], BeirTitleUrlDoc, GenericQuery),
        'fiqa': (['train', 'dev', 'test'], GenericDoc, GenericQuery),
        'arguana': (['test'], BeirTitleDoc, GenericQuery),
        'webis-touche2020': (['test'], BeirToucheDoc, BeirToucheQuery),
        'webis-touche2020/v2': (['test'], BeirToucheDoc, BeirToucheQuery),
        'quora': (['dev', 'test'], GenericDoc, GenericQuery),
        'dbpedia-entity': (['dev', 'test'], BeirTitleUrlDoc, GenericQuery),
        'scidocs': (['test'], BeirSciDoc, BeirSciQuery),
        'fever': (['train', 'dev', 'test'], BeirTitleDoc, GenericQuery),
        'climate-fever': (['test'], BeirTitleDoc, GenericQuery),
        'scifact': (['train', 'test'], BeirTitleDoc, GenericQuery),
    }

    for ds, (qrels, doc_type, query_type) in benchmarks.items():
        dlc_ds = dlc[ds]
        ds_zip = ds.split('/')[0]
        docs_migrator = Migrator(
            base_path / ds / 'irds_version.txt',
            'v2',
            affected_files=[f'{base_path/ds}/docs.pklz4'],
            message=f'Migrating {NAME}/{ds} (structuring fields)')
        docs = docs_migrator(
            BeirDocs(ds, ZipExtract(dlc_ds, f'{ds_zip}/corpus.jsonl'),
                     doc_type))
        queries = BeirQueries(
            ds,
            Cache(ZipExtract(dlc_ds, f'{ds_zip}/queries.jsonl'),
                  base_path / ds / 'queries.json'), query_type)
        if len(qrels) == 1:
            subsets[ds] = Dataset(
                docs, queries,
                BeirQrels(Cache(
                    ZipExtract(dlc_ds, f'{ds_zip}/qrels/{qrels[0]}.tsv'),
                    base_path / ds / f'{qrels[0]}.qrels'),
                          qrels_defs={}), documentation(ds))
        else:
            subsets[ds] = Dataset(docs, queries, documentation(ds))
            for qrel in qrels:
                subset_qrels = BeirQrels(Cache(
                    ZipExtract(dlc_ds, f'{ds_zip}/qrels/{qrel}.tsv'),
                    base_path / ds / f'{qrel}.qrels'),
                                         qrels_defs={})
                subset_qids = qid_filter(subset_qrels)
                subsets[f'{ds}/{qrel}'] = Dataset(
                    docs, FilteredQueries(queries, subset_qids,
                                          mode='include'), subset_qrels,
                    documentation(f'{ds}/{qrel}'))

    cqa = [
        'android', 'english', 'gaming', 'gis', 'mathematica', 'physics',
        'programmers', 'stats', 'tex', 'unix', 'webmasters', 'wordpress'
    ]
    cqa_dlc = dlc['cqadupstack']
    for ds in cqa:
        docs_migrator = Migrator(
            base_path / 'cqadupstack' / ds / 'irds_version.txt',
            'v2',
            affected_files=[f'{base_path/"cqadupstack"/ds}/docs.pklz4'],
            message=f'Migrating {NAME}/cqadupstack/{ds} (structuring fields)')
        subsets[f'cqadupstack/{ds}'] = Dataset(
            docs_migrator(
                BeirDocs(f'cqadupstack/{ds}',
                         ZipExtract(cqa_dlc, f'cqadupstack/{ds}/corpus.jsonl'),
                         BeirCqaDoc)),
            BeirQueries(
                f'cqadupstack/{ds}',
                Cache(ZipExtract(cqa_dlc, f'cqadupstack/{ds}/queries.jsonl'),
                      base_path / 'cqadupstack' / ds / 'queries.json'),
                BeirCqaQuery),
            BeirQrels(Cache(
                ZipExtract(cqa_dlc, f'cqadupstack/{ds}/qrels/test.tsv'),
                base_path / 'cqadupstack' / ds / 'test.qrels'),
                      qrels_defs={}), documentation(f'cqadupstack/{ds}'))

    ir_datasets.registry.register(NAME, base)
    for s in sorted(subsets):
        ir_datasets.registry.register(f'{NAME}/{s}', subsets[s])

    return base, subsets
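`qid_filter` is used above but defined elsewhere in the module; one plausible shape, mirroring the Lazy-set pattern from the other examples (a hypothetical sketch, not the actual helper):

def qid_filter(qrels):
    # defer reading the qrels file until the filtered queries are first iterated
    return Lazy(lambda: {qrel.query_id for qrel in qrels.qrels_iter()})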