コード例 #1
0
ファイル: nyt.py プロジェクト: prateekchandrajha/ir_datasets
def _init():
    """Build and register the NYT dataset and its train/valid splits.

    Returns:
        (base, subsets): the top-level Dataset and a dict of named subsets.
    """
    base_path = ir_datasets.util.home_path() / NAME
    dlc = DownloadConfig.context(NAME, base_path)
    documentation = YamlDocumentation(f'docs/{NAME}.yaml')

    collection = NytDocs(dlc['source'])
    base = Dataset(collection, documentation('_'))

    # Queries/qrels are derived from the collection itself.
    queries = NytQueries(collection)
    qrels = NytQrels(collection)
    valid_qids = Lazy(lambda: VALID_IDS)

    # 'valid' keeps only VALID_IDS; 'train' keeps everything else.
    subsets = {}
    for split, mode in (('train', 'exclude'), ('valid', 'include')):
        subsets[split] = Dataset(
            FilteredQueries(queries, valid_qids, mode=mode),
            FilteredQrels(qrels, valid_qids, mode=mode),
            collection,
            documentation(split))

    ir_datasets.registry.register('nyt', base)
    for key in sorted(subsets):
        ir_datasets.registry.register(f'nyt/{key}', subsets[key])

    return base, subsets
コード例 #2
0
def _init():
    """Build and register the AQUAINT collection and its TREC Robust 2005 subset.

    Returns:
        (base, subsets): the top-level Dataset and a dict of named subsets.
    """
    base_path = ir_datasets.util.home_path() / NAME
    dlc = DownloadConfig.context(NAME, base_path)
    documentation = YamlDocumentation(f'docs/{NAME}.yaml')

    docs = TrecDocs(
        dlc['docs'],
        encoding='utf8',
        # The corpus ships as three newswire sub-collections.
        path_globs=[
            'aquaint_comp/apw/*/*.gz',
            'aquaint_comp/nyt/*/*.gz',
            'aquaint_comp/xie/*/*.gz'
        ],
        namespace=NAME,
        lang='en',
        count_hint=ir_datasets.util.count_hint(NAME))

    base = Dataset(docs, documentation('_'))

    robust05_queries = TrecQueries(
        dlc['trec-robust-2005/queries'],
        qtype_map=QTYPE_MAP,
        namespace='trec-robust',
        lang='en')
    robust05_qrels = TrecQrels(dlc['trec-robust-2005/qrels'], QREL_DEFS)
    subsets = {
        'trec-robust-2005': Dataset(
            robust05_queries,
            robust05_qrels,
            docs,
            documentation('trec-robust-2005')),
    }

    ir_datasets.registry.register(NAME, base)
    for key in sorted(subsets):
        ir_datasets.registry.register(f'{NAME}/{key}', subsets[key])

    return base, subsets
コード例 #3
0
def _init():
    """Build and register the Arabic Newswire collection and its TREC 2001/2002 subsets.

    Returns:
        (base, subsets): the top-level Dataset and a dict of named subsets.
    """
    base_path = ir_datasets.util.home_path() / NAME
    dlc = DownloadConfig.context(NAME, base_path)
    documentation = YamlDocumentation(f'docs/{NAME}.yaml')

    docs = TrecDocs(
        dlc['docs'],
        encoding='utf8',
        path_globs=['arabic_newswire_a/transcripts/*/*.sgm.gz'],
        namespace=NAME)

    base = Dataset(docs, documentation('_'))

    # The 2001 and 2002 tracks are wired identically; only the key differs.
    subsets = {}
    for track in ('ar2001', 'ar2002'):
        subsets[track] = Dataset(
            TrecQueries(
                dlc[f'{track}/queries'],
                qtype_map=QTYPE_MAP,
                encoding='ISO-8859-6',  # topic files use this Arabic encoding
                namespace=NAME),
            TrecQrels(dlc[f'{track}/qrels'], QREL_DEFS),
            docs,
            documentation(track))

    ir_datasets.registry.register(NAME, base)
    for key in sorted(subsets):
        ir_datasets.registry.register(f'{NAME}/{key}', subsets[key])

    return base, subsets
コード例 #4
0
ファイル: c4.py プロジェクト: allenai/ir_datasets
def _init():
    """Build and register the C4 (en-noclean, train split) datasets.

    Returns:
        (base, subsets): the top-level Dataset and a dict of named subsets.
    """
    subsets = {}
    base_path = ir_datasets.util.home_path() / NAME
    dlc = DownloadConfig.context(NAME, base_path)
    documentation = YamlDocumentation(f'docs/{NAME}.yaml')

    en_noclean_tr_collection = C4Docs(
        GzipExtract(dlc['en-noclean/sources']),
        TarExtractAll(dlc['en-noclean/checkpoints'],
                      base_path / 'en.noclean.checkpoints'),
        base_path,
        source_name_filter=r'en\.noclean\.c4-train',
        filter_name='train')  # exclude validation files (only include train)
    # Top-level entry carries only documentation (no docs collection).
    base = Dataset(documentation('_'))

    subsets['en-noclean-tr'] = Dataset(en_noclean_tr_collection,
                                       documentation('en-noclean-tr'))

    subsets['en-noclean-tr/trec-misinfo-2021'] = Dataset(
        en_noclean_tr_collection,
        TrecXmlQueries(dlc['trec-misinfo-2021/queries'],
                       qtype=MisinfoQuery,
                       qtype_map=misinfo_map,
                       namespace='trec-misinfo',
                       lang='en'),
        documentation('en-noclean-tr/trec-misinfo-2021'))

    ir_datasets.registry.register(NAME, base)
    # Register in sorted order, consistent with the other _init() variants.
    for subset in sorted(subsets):
        ir_datasets.registry.register(f'{NAME}/{subset}', subsets[subset])

    return base, subsets
コード例 #5
0
def _init():
    """Build and register the TREC disks 4/5 dataset and its cross-validation folds.

    Returns:
        (base, subsets): the top-level Dataset and a dict of per-fold subsets.
    """
    documentation = YamlDocumentation(f'docs/{NAME}.yaml')
    base_path = ir_datasets.util.home_path() / NAME
    dlc = DownloadConfig.context(NAME, base_path, dua=DUA)

    docs = TrecDocs(
        dlc['docs'],
        path_globs=[
            '**/FBIS/FB*', '**/FR94/??/FR*', '**/FT/*/FT*',
            '**/LATIMES/LA*'
        ],
        namespace=NAME,
        lang='en',
        expected_file_count=2295,
        count_hint=ir_datasets.util.count_hint(NAME))

    all_queries = TrecQueries(GzipExtract(dlc['queries']),
                              namespace=NAME,
                              lang='en')
    all_qrels = TrecQrels(dlc['qrels'], QREL_DEFS)

    base = Dataset(docs, all_queries, all_qrels, documentation('_'))

    # One subset per cross-validation fold, restricted to that fold's query IDs.
    subsets = {}
    for fold in FOLDS:
        fold_filter = make_filter(fold)
        subsets[fold] = Dataset(
            FilteredQueries(all_queries, fold_filter),
            FilteredQrels(all_qrels, fold_filter),
            docs,
            documentation(fold))

    ir_datasets.registry.register(NAME, base)
    for key in sorted(subsets):
        ir_datasets.registry.register(f'{NAME}/{key}', subsets[key])

    return base, subsets
コード例 #6
0
def _init():
    """Build and register the Highwire collection and TREC Genomics 2006/2007 subsets.

    Returns:
        (base, subsets): the top-level Dataset and a dict of named subsets.
    """
    documentation = YamlDocumentation(f'docs/{NAME}.yaml')
    base_path = ir_datasets.util.home_path() / NAME
    dlc = DownloadConfig.context(NAME, base_path)

    docs = HighwireDocs(dlc, dlc['legalspans'])
    base = Dataset(docs, documentation('_'))

    # Both years share the same wiring; only the qrel definitions differ.
    subsets = {}
    for track, qrel_defs in (('trec-genomics-2006', QREL_DEFS_06),
                             ('trec-genomics-2007', QREL_DEFS_07)):
        subsets[track] = Dataset(
            docs,
            TrecGenomicsQueries(dlc[f'{track}/queries']),
            HighwireQrels(dlc[f'{track}/qrels'], qrel_defs),
            documentation(track),
        )

    ir_datasets.registry.register(NAME, base)
    for key in sorted(subsets):
        ir_datasets.registry.register(f'{NAME}/{key}', subsets[key])

    return base, subsets
コード例 #7
0
ファイル: hc4.py プロジェクト: allenai/ir_datasets
def _init():
    """Build and register the HC4 datasets (per-language docs with train/dev/test splits).

    Returns:
        (base, subsets): the top-level Dataset and a dict of named subsets.
    """
    base_path = ir_datasets.util.home_path() / NAME
    dlc = DownloadConfig.context(NAME, base_path)
    documentation = YamlDocumentation(f'docs/{NAME}.yaml')

    base = Dataset(documentation('_'))  # dummy top level ds

    subsets = {}
    for language in ('zh', 'fa', 'ru'):
        docs = HC4Docs(dlc[f'{language}/docs'], subset_lang=language)
        subsets[language] = Dataset(docs, documentation(language))
        for split in ('train', 'dev', 'test'):
            key = f'{language}/{split}'
            subsets[key] = Dataset(
                docs,
                # Topic download is keyed per split (no language component);
                # subset_lang selects this language's queries.
                HC4Queries(dlc[f'{split}/topics'], subset_lang=language),
                TrecQrels(dlc[f'{key}/qrels'], QREL_DEFS),
                documentation(key),
            )

    ir_datasets.registry.register(NAME, base)
    for key in sorted(subsets):
        ir_datasets.registry.register(f'{NAME}/{key}', subsets[key])

    return base, subsets
コード例 #8
0
def _init():
    """Build and register the TREC Mandarin collection and the TREC-5/TREC-6 subsets.

    Returns:
        (base, subsets): the top-level Dataset and a dict of named subsets.
    """
    documentation = YamlDocumentation(f'docs/{NAME}.yaml')
    base_path = ir_datasets.util.home_path() / NAME
    dlc = DownloadConfig.context(NAME, base_path)

    docs = TrecDocs(
        dlc['docs'],
        encoding='GB18030',
        path_globs=['**/xinhua/x*', '**/peoples-daily/pd*'],
        namespace=NAME,
        lang='zh',
        count_hint=ir_datasets.util.count_hint(NAME))

    base = Dataset(docs, documentation('_'))

    # TREC-5 and TREC-6 share identical wiring; only the key differs.
    subsets = {}
    for track in ('trec5', 'trec6'):
        subsets[track] = Dataset(
            TrecQueries(
                GzipExtract(dlc[f'{track}/queries']),
                qtype=TrecMandarinQuery,
                qtype_map=QTYPE_MAP,
                encoding='GBK',
                namespace=NAME,
                lang=None),  # queries have multiple languages
            TrecQrels(GzipExtract(dlc[f'{track}/qrels']), QREL_DEFS),
            docs,
            documentation(track))

    ir_datasets.registry.register(NAME, base)
    for key in sorted(subsets):
        ir_datasets.registry.register(f'{NAME}/{key}', subsets[key])

    return base, subsets
コード例 #9
0
def _init():
    """Build and register the ANTIQUE dataset and its subsets.

    Subsets: train/test, a 200-query validation split carved out of train,
    and a test variant with "offensive (and noisy)" questions removed.

    Returns:
        (collection, subsets): the docs collection and a dict of named subsets.
    """
    documentation = YamlDocumentation('docs/antique.yaml')
    base_path = ir_datasets.util.home_path() / NAME
    dlc = DownloadConfig.context(NAME, base_path, dua=DUA)
    collection = TsvDocs(dlc['docs'],
                         namespace=NAME,
                         lang='en',
                         count_hint=ir_datasets.util.count_hint(NAME))

    subsets = {}
    for subset in ('train', 'test'):
        qrels = TrecQrels(dlc[f'{subset}/qrels'], QREL_DEFS)
        queries = TsvQueries(dlc[f'{subset}/queries'],
                             namespace=NAME,
                             lang='en')
        subsets[subset] = Dataset(collection, queries, qrels)

    # Split the training data into training and validation data
    validation_qids = Lazy(lambda: VALIDATION_QIDS)
    subsets['train/split200-train'] = Dataset(
        FilteredQueries(subsets['train'].queries_handler(),
                        validation_qids,
                        mode='exclude'),
        FilteredQrels(subsets['train'].qrels_handler(),
                      validation_qids,
                      mode='exclude'), subsets['train'])
    subsets['train/split200-valid'] = Dataset(
        FilteredQueries(subsets['train'].queries_handler(),
                        validation_qids,
                        mode='include'),
        FilteredQrels(subsets['train'].qrels_handler(),
                      validation_qids,
                      mode='include'), subsets['train'])

    # Separate test set removing the "offensive (and noisy)" questions
    disallow_list = dlc['disallow_list']

    def _load_disallow_qids():
        # One query ID per line in the disallow-list file.
        with disallow_list.stream() as stream:
            stream = io.TextIOWrapper(stream)
            return {line.rstrip() for line in stream}

    # Fixed misspelling ("disllow") and the loader-function name being
    # shadowed by its own Lazy wrapper.
    disallow_qids = Lazy(_load_disallow_qids)
    subsets['test/non-offensive'] = Dataset(
        FilteredQueries(subsets['test'].queries_handler(),
                        disallow_qids,
                        mode='exclude'),
        FilteredQrels(subsets['test'].qrels_handler(),
                      disallow_qids,
                      mode='exclude'), subsets['test'])

    ir_datasets.registry.register(NAME, Dataset(collection,
                                                documentation('_')))

    for s in sorted(subsets):
        ir_datasets.registry.register(f'{NAME}/{s}',
                                      Dataset(subsets[s], documentation(s)))

    return collection, subsets
コード例 #10
0
ファイル: wapo.py プロジェクト: allenai/ir_datasets
def _init():
    """Build and register the Washington Post datasets (v2 docs and track subsets).

    Returns:
        (base, subsets): the top-level Dataset and a dict of named subsets.
    """
    subsets = {}
    base_path = ir_datasets.util.home_path() / NAME
    dlc = DownloadConfig.context(NAME, base_path)
    documentation = YamlDocumentation(f'docs/{NAME}.yaml')

    collection_v2 = WapoDocs(dlc['v2'])

    base = Dataset(documentation('_'))

    subsets['v2'] = Dataset(collection_v2, documentation('v2'))

    subsets['v2/trec-core-2018'] = Dataset(
        collection_v2,
        TrecQueries(dlc['trec-core-2018/queries'],
                    namespace='trec-core-2018',
                    lang='en',
                    remove_tags=RM_TAGS),
        TrecQrels(dlc['trec-core-2018/qrels'], CORE_QREL_DEFS),
        documentation('v2/trec-core-2018'))

    def _news_queries(year):
        # Background-linking topics use the same query type every year.
        return TrecQueries(dlc[f'trec-news-{year}/queries'],
                           namespace=f'trec-news-{year}',
                           lang='en',
                           qtype=TrecBackgroundLinkingQuery,
                           qtype_map=BL_MAP,
                           remove_tags=RM_TAGS)

    # 2018/2019 News Track subsets run against the v2 collection.
    for year in ('2018', '2019'):
        subsets[f'v2/trec-news-{year}'] = Dataset(
            collection_v2,
            _news_queries(year),
            TrecQrels(dlc[f'trec-news-{year}/qrels'], BL_QREL_DEFS),
            documentation(f'v2/trec-news-{year}'))

    # 2020 targets v3; no v3 docs collection is attached here.
    subsets['v3/trec-news-2020'] = Dataset(
        _news_queries('2020'),
        TrecQrels(dlc['trec-news-2020/qrels'], BL_QREL_DEFS),
        documentation('v3/trec-news-2020'))

    ir_datasets.registry.register(NAME, base)
    for key in sorted(subsets):
        ir_datasets.registry.register(f'{NAME}/{key}', subsets[key])

    return base, subsets
コード例 #11
0
def _init():
    """Build and register the ClueWeb12 datasets (full corpus, B13 subset, tracks).

    Returns:
        (base, subsets): the top-level Dataset and a dict of named subsets.
    """
    documentation = YamlDocumentation(f'docs/{NAME}.yaml')
    base_path = ir_datasets.util.home_path() / NAME
    dlc = DownloadConfig.context(NAME, base_path)

    docs_dlc = dlc['docs']
    docs_chk_dlc = TarExtractAll(dlc['docs.chk'], base_path / 'corpus.chk')
    # Tool jar used to derive the B13 subset from the full corpus.
    b13_dlc = Bz2Extract(
        Cache(
            TarExtract(dlc['cw12b-info'],
                       'ClueWeb12-CreateB13/software/CreateClueWeb12B13Dataset.jar'),
            base_path / 'CreateClueWeb12B13Dataset.jar'))

    collection = ClueWeb12Docs(docs_dlc, docs_chk_dlc)
    collection_b13 = ClueWeb12Docs(ClueWeb12b13Extractor(docs_dlc, b13_dlc))

    base = Dataset(collection, documentation('_'))

    subsets = {'b13': Dataset(collection_b13, documentation('b13'))}

    # Web Track 2013/2014 run against the full collection.
    for year in ('2013', '2014'):
        subsets[f'trec-web-{year}'] = Dataset(
            collection,
            TrecXmlQueries(dlc[f'trec-web-{year}/queries'],
                           qtype=TrecWebTrackQuery,
                           namespace=NAME),
            TrecQrels(dlc[f'trec-web-{year}/qrels.adhoc'], QREL_DEFS),
            documentation(f'trec-web-{year}'))

    subsets['b13/ntcir-www-1'] = Dataset(
        collection_b13,
        TrecXmlQueries(
            Cache(ZipExtract(dlc['ntcir-www-1/queries'], 'eng.queries.xml'),
                  base_path / 'ntcir-www-1' / 'queries.xml'),
            qtype=GenericQuery,
            qtype_map={'qid': 'query_id', 'content': 'text'},
            namespace=NAME),
        NtcirQrels(dlc['ntcir-www-1/qrels'], NTCIR_QREL_DEFS),
        documentation('ntcir-www-1'))

    subsets['b13/ntcir-www-2'] = Dataset(
        collection_b13,
        TrecXmlQueries(
            Cache(ZipExtract(dlc['ntcir-www-2/queries'], 'qEng.xml'),
                  base_path / 'ntcir-www-2' / 'queries.xml'),
            qtype=NtcirQuery,
            qtype_map=ntcir_map,
            namespace=NAME),
        NtcirQrels(dlc['ntcir-www-2/qrels'], NTCIR_QREL_DEFS),
        documentation('ntcir-www-2'))

    # WWW-3 has queries only (no qrels wired here).
    subsets['b13/ntcir-www-3'] = Dataset(
        collection_b13,
        TrecXmlQueries(dlc['ntcir-www-3/queries'],
                       qtype=NtcirQuery,
                       qtype_map=ntcir_map,
                       namespace=NAME),
        documentation('ntcir-www-3'))

    subsets['b13/trec-misinfo-2019'] = Dataset(
        collection_b13,
        TrecXmlQueries(dlc['trec-misinfo-2019/queries'],
                       qtype=MisinfoQuery,
                       qtype_map=misinfo_map,
                       namespace=NAME),
        MsinfoQrels(dlc['trec-misinfo-2019/qrels'], MISINFO_QREL_DEFS),
        documentation('trec-misinfo-2019'))

    ir_datasets.registry.register(NAME, base)
    for key in sorted(subsets):
        ir_datasets.registry.register(f'{NAME}/{key}', subsets[key])

    return base, subsets
コード例 #12
0
def _init():
    """Build and register the MS MARCO v2 passage datasets.

    Returns:
        (collection, subsets): the docs collection and a dict of named subsets.
    """
    base_path = ir_datasets.util.home_path() / NAME
    documentation = YamlDocumentation(f'docs/{NAME}.yaml')
    dlc = DownloadConfig.context(NAME, base_path, dua=DUA)

    docs_migrator = Migrator(
        base_path / 'irds_version.txt', 'v2',
        affected_files=[base_path / 'msmarco_v2_passage.tar.pklz4'],
        message='Cleaning up pklz4 lookup structure in favor of ID-based lookups')
    collection = docs_migrator(MsMarcoV2Passages(dlc['passages']))

    qrels_migrator = Migrator(
        base_path / 'qrels_version.txt', 'v2',
        affected_files=[base_path / 'train' / 'qrels.tsv',
                        base_path / 'dev1' / 'qrels.tsv',
                        base_path / 'dev2' / 'qrels.tsv'],
        message='Updating qrels (task organizers removed duplicates)')

    # train/dev1/dev2 share identical wiring.
    subsets = {}
    for split in ('train', 'dev1', 'dev2'):
        subsets[split] = Dataset(
            collection,
            TsvQueries(dlc[f'{split}/queries'], namespace='msmarco', lang='en'),
            qrels_migrator(TrecQrels(dlc[f'{split}/qrels'], QRELS_DEFS)),
            TrecScoredDocs(GzipExtract(dlc[f'{split}/scoreddocs'])),
        )

    subsets['trec-dl-2021'] = Dataset(
        collection,
        TsvQueries(dlc['trec-dl-2021/queries'], namespace='msmarco', lang='en'),
        TrecQrels(dlc['trec-dl-2021/qrels'], TREC_DL_QRELS_DEFS),
        TrecScoredDocs(GzipExtract(dlc['trec-dl-2021/scoreddocs'])),
    )
    # 'judged' keeps only queries that actually have relevance judgments.
    dl21_judged = Lazy(
        lambda: {q.query_id for q in subsets['trec-dl-2021'].qrels_iter()})
    subsets['trec-dl-2021/judged'] = Dataset(
        FilteredQueries(subsets['trec-dl-2021'].queries_handler(), dl21_judged),
        FilteredScoredDocs(subsets['trec-dl-2021'].scoreddocs_handler(),
                           dl21_judged),
        subsets['trec-dl-2021'],
    )

    ir_datasets.registry.register(NAME, Dataset(collection, documentation("_")))
    for s in sorted(subsets):
        ir_datasets.registry.register(f'{NAME}/{s}',
                                      Dataset(subsets[s], documentation(s)))

    return collection, subsets
コード例 #13
0
def _init():
    """Build and register the args.me corpora and their combined variants.

    Returns:
        (base, datasets): the top-level Dataset and a dict of named datasets.
    """
    base_path = home_path() / NAME

    documentation = YamlDocumentation(f"docs/{NAME}.yaml")
    download_config = DownloadConfig.context(NAME, base_path)

    base = Dataset(documentation('_'))

    # Arguments that can be loaded from Zenodo.
    arguments: Dict[str, ArgsMeDocs] = {
        name: ArgsMeDocs(Cache(ZipExtract(download_config[name], zip_path),
                               base_path / f"{name}.json"),
                         namespace=f"{NAME}/{name}",
                         language=language,
                         count_hint=count_hint)
        for name, (count_hint, language, zip_path) in SUBSETS.items()
    }

    # Arguments that are combined versions of other subsets.
    combined_arguments: Dict[str, ArgsMeCombinedArguments] = {
        name: ArgsMeCombinedArguments(
            base_path / f"{name}.json",
            [arguments[subset_name] for subset_name in subset_names],
            namespace=f"{NAME}/{name}",
            language=language,
            count_hint=count_hint)
        for name, (subset_names, count_hint,
                   language) in COMBINED_SUBSETS.items()
    }

    # Wrap in datasets with documentation. The loop variable is named `docs`
    # so it no longer shadows the `arguments` dict it iterates over.
    datasets = {
        name: Dataset(docs, documentation(name))
        for name, docs in chain(arguments.items(),
                                combined_arguments.items())
    }

    # NOTE: the following datasets are defined in touche.py:
    #  - argsme/1.0/touche-2020-task-1/uncorrected
    #  - argsme/2020-04-01/touche-2020-task-1
    #  - argsme/2020-04-01/touche-2020-task-1/uncorrected
    #  - argsme/2020-04-01/touche-2021-task-1

    # Register datasets.
    registry.register(NAME, base)
    for name, dataset in datasets.items():
        registry.register(f'{NAME}/{name}', dataset)

    return base, datasets
コード例 #14
0
ファイル: medline.py プロジェクト: allenai/ir_datasets
def _init():
    """Build and register the Medline 2004/2017 collections and their track subsets.

    Returns:
        (base, subsets): the top-level Dataset and a dict of named subsets.
    """
    documentation = YamlDocumentation(f'docs/{NAME}.yaml')
    base_path = ir_datasets.util.home_path() / NAME
    dlc = DownloadConfig.context(NAME, base_path)

    base = Dataset(documentation('_'))
    subsets = {}

    # 2004 snapshot: four gzipped parts (a-d).
    docs04 = MedlineDocs(
        '2004',
        [GzipExtract(dlc[f'2004/{part}']) for part in 'abcd'],
        count_hint=ir_datasets.util.count_hint(f'{NAME}/2004'))
    subsets['2004'] = Dataset(docs04, documentation('2004'))

    subsets['2004/trec-genomics-2004'] = Dataset(
        docs04,
        TrecXmlQueries(
            ZipExtract(dlc['trec-genomics-2004/queries'], 'Official.xml'),
            qtype=TrecGenomicsQuery,
            qtype_map=TREC04_XML_MAP,
            namespace='trec-genomics',
            lang='en'),
        TrecQrels(dlc['trec-genomics-2004/qrels'], QREL_DEFS),
        documentation('trec-genomics-2004'),
    )
    subsets['2004/trec-genomics-2005'] = Dataset(
        docs04,
        TrecGenomicsQueries(dlc['trec-genomics-2005/queries']),
        TrecQrels(dlc['trec-genomics-2005/qrels'], QREL_DEFS),
        documentation('trec-genomics-2005'),
    )

    # 2017 snapshot: five parts plus the AACR/ASCO extra docs.
    docs17 = ConcatDocs([
        AacrAscoDocs(dlc['2017/aacr_asco_extra']),
        MedlineDocs('2017', [dlc[f'2017/part{i}'] for i in range(1, 6)]),
    ], count_hint=ir_datasets.util.count_hint(f'{NAME}/2017'))
    subsets['2017'] = Dataset(docs17, documentation('2017'))

    # The two precision-medicine tracks differ only in their query type.
    for track, qtype in (('trec-pm-2017', TrecPm2017Query),
                         ('trec-pm-2018', TrecPmQuery)):
        subsets[f'2017/{track}'] = Dataset(
            docs17,
            TrecXmlQueries(dlc[f'{track}/queries'],
                           qtype=qtype,
                           namespace=track,
                           lang='en'),
            TrecQrels(dlc[f'{track}/qrels'], QREL_DEFS),
            documentation(track),
        )

    ir_datasets.registry.register(NAME, base)
    for key in sorted(subsets):
        ir_datasets.registry.register(f'{NAME}/{key}', subsets[key])

    return base, subsets
コード例 #15
0
def _init():
    """Build and register the CodeSearchNet datasets.

    Returns:
        (base, subsets): the top-level Dataset and a dict of named subsets.
    """
    documentation = YamlDocumentation(f'docs/{NAME}.yaml')
    base_path = ir_datasets.util.home_path() / NAME
    dlc = DownloadConfig.context(NAME, base_path)

    languages = ['python', 'java', 'go', 'php', 'ruby', 'javascript']
    # One extracted archive per programming language, in a fixed order.
    all_dlcs = [ZipExtractCache(dlc[language], base_path / language)
                for language in languages]

    base = Dataset(
        CodeSearchNetDocs(all_dlcs),
        documentation('_'),
    )

    # train/valid/test all share the same docs/queries/qrels wiring.
    subsets = {}
    for split in ('train', 'valid', 'test'):
        subsets[split] = Dataset(
            CodeSearchNetDocs(all_dlcs),
            CodeSearchNetQueries(all_dlcs, split),
            CodeSearchNetQrels(all_dlcs, split),
            documentation(split),
        )

    # The challenge set uses its own query/qrel formats.
    challenge_queries = CodeSearchNetChallengeQueries(dlc['challenge/queries'])
    subsets['challenge'] = Dataset(
        CodeSearchNetDocs(all_dlcs),
        challenge_queries,
        CodeSearchNetChallengeQrels(dlc['challenge/qrels'], challenge_queries),
        documentation('challenge'),
    )

    ir_datasets.registry.register(NAME, base)
    for key in sorted(subsets):
        ir_datasets.registry.register(f'{NAME}/{key}', subsets[key])

    return base, subsets
コード例 #16
0
def _init():
    """Build and register the wikIR datasets (one corpus per language/size).

    Returns:
        (base, subsets): the top-level Dataset and a dict of named subsets.
    """
    base_path = ir_datasets.util.home_path() / NAME
    dlc = DownloadConfig.context(NAME, base_path)
    documentation = YamlDocumentation(f'docs/{NAME}.yaml')

    # (registry key, directory name inside the downloaded zip)
    sources = [
        ('en1k', 'wikIR1k'),
        ('en59k', 'wikIR59k'),
        ('en78k', 'enwikIR'),
        ('ens78k', 'enwikIRS'),
        ('fr14k', 'FRwikIR14k'),
        ('es13k', 'ESwikIR13k'),
        ('it16k', 'ITwikIR16k'),
    ]

    subsets = {}
    for key, zip_dir in sources:
        source_dlc = ZipExtractCache(dlc[key], base_path / key)
        lang = key[:2]  # the key's first two characters are the language code
        docs = CsvDocs(
            RelativePath(source_dlc, f"{zip_dir}/documents.csv"),
            namespace=key,
            lang=lang,
            count_hint=ir_datasets.util.count_hint(f'{NAME}/{key}'),
            docstore_path=base_path / f'{key}.pklz4')
        subsets[key] = Dataset(docs, documentation(key))
        for split in ('training', 'validation', 'test'):
            subsets[f'{key}/{split}'] = Dataset(
                docs,
                CsvQueries(
                    RelativePath(source_dlc, f"{zip_dir}/{split}/queries.csv"),
                    lang=lang),
                TrecQrels(
                    RelativePath(source_dlc, f"{zip_dir}/{split}/qrels"),
                    qrels_defs=QRELS_DEFS),
                TrecScoredDocs(
                    RelativePath(source_dlc, f"{zip_dir}/{split}/BM25.res")),
                documentation(f'{key}/{split}'))

    base = Dataset(documentation('_'))

    ir_datasets.registry.register(NAME, base)
    for s in sorted(subsets):
        ir_datasets.registry.register(f'{NAME}/{s}', subsets[s])

    return base, subsets
コード例 #17
0
ファイル: clean.py プロジェクト: allenai/ir_datasets
def clean(dataset, yes=False, list=False, human=True):
    """Remove cached/downloaded files for `dataset`, then prune empty dirs.

    Args:
        dataset: registry name of the dataset whose cache directory is cleaned.
        yes: if True, skip the interactive confirmation prompt.
        list: if True, only print total size / file count and return without
            deleting. (Parameter name shadows the builtin; kept for
            backward compatibility with existing callers.)
        human: if True, print human-readable sizes (over 1GB shown in red);
            otherwise print raw byte counts.
    """
    def _fmt_size(num_bytes):
        # Human-readable size, highlighted red above 1GB; raw bytes otherwise.
        if human:
            fmt = ir_datasets.util.format_file_size(num_bytes)
            if num_bytes > 1_000_000_000:  # sizes over 1GB: list in red
                fmt = f'{RED}{fmt}{RES}'
            return fmt
        return str(num_bytes)

    base_path = os.path.join(ir_datasets.util.home_path() / dataset)
    dlc = DownloadConfig.context(dataset, base_path)
    skips = []
    for dl_item in dlc.contents().values():
        if 'instructions' in dl_item and 'cache_path' in dl_item:  # non-downloadble item
            skips.append(os.path.join(base_path, dl_item['cache_path']))
    size, files = walk_path(base_path, skips)
    files_fmt = f'{len(files)} files'
    size_fmt = _fmt_size(size)
    if list:
        if size > 0:
            print(f'{size_fmt}\t{files_fmt}\t{dataset}')
        return
    if not yes:
        inp = None
        while inp not in ('y', 'yes'):
            inp = input(
                f'clean up {size_fmt} from {dataset} ({files_fmt})?\n[y(es) / n(o) / l(ist files)] '
            ).lower()
            if inp in ('l', 'list', 'list files'):
                for file in files:
                    f_size = os.path.getsize(file)
                    # Bug fix: the non-human branch previously printed the
                    # grand total (str(size)) for every file instead of the
                    # file's own size.
                    print(f'{_fmt_size(f_size)}\t{file}')
            if inp in ('n', 'no'):
                return

    # remove identified files
    for file in files:
        os.remove(file)

    # remove empty directories (bottom-up, so newly-emptied parents qualify)
    for dirpath, dirnames, filenames in os.walk(base_path, topdown=False):
        if not dirnames and not filenames:
            os.rmdir(dirpath)
コード例 #18
0
ファイル: gov.py プロジェクト: allenai/ir_datasets
def _init():
    """Build and register the GOV collection and its TREC Web Track subsets.

    Returns:
        (base, subsets): the top-level Dataset and a dict of named subsets.
    """
    documentation = YamlDocumentation(f'docs/{NAME}.yaml')
    base_path = ir_datasets.util.home_path() / NAME
    dlc = DownloadConfig.context(NAME, base_path)

    docs = GovDocs(dlc['docs'])
    base = Dataset(docs, documentation('_'))

    subsets = {
        # 2002 topics/qrels are distributed gzipped.
        'trec-web-2002': Dataset(
            docs,
            TrecQueries(GzipExtract(dlc['trec-web-2002/queries']),
                        namespace='gov/trec-web-2002',
                        lang='en'),
            TrecQrels(GzipExtract(dlc['trec-web-2002/qrels']), QREL_DEFS),
            documentation('trec-web-2002')),
        'trec-web-2002/named-page': Dataset(
            docs,
            TrecQueries(GzipExtract(dlc['trec-web-2002/named-page/queries']),
                        qtype=GenericQuery,
                        qtype_map=NAMED_PAGE_QTYPE_MAP,
                        namespace='gov/trec-web-2002/named-page',
                        lang='en'),
            TrecQrels(GzipExtract(dlc['trec-web-2002/named-page/qrels']),
                      NAMED_PAGE_QREL_DEFS),
            documentation('trec-web-2002/named-page')),
        'trec-web-2003': Dataset(
            docs,
            TrecQueries(dlc['trec-web-2003/queries'],
                        qtype=GovWeb02Query,
                        qtype_map=WEB03_QTYPE_MAP,
                        namespace='gov/trec-web-2003',
                        lang='en'),
            TrecQrels(dlc['trec-web-2003/qrels'], QREL_DEFS),
            documentation('trec-web-2003')),
        'trec-web-2003/named-page': Dataset(
            docs,
            TrecQueries(dlc['trec-web-2003/named-page/queries'],
                        qtype=GenericQuery,
                        qtype_map=NAMED_PAGE_QTYPE_MAP,
                        namespace='gov/trec-web-2003/named-page',
                        lang='en'),
            TrecQrels(dlc['trec-web-2003/named-page/qrels'],
                      NAMED_PAGE_QREL_DEFS),
            documentation('trec-web-2003/named-page')),
        'trec-web-2004': Dataset(
            docs,
            TrecQueries(dlc['trec-web-2004/queries'],
                        qtype=GenericQuery,
                        qtype_map=WEB04_QTYPE_MAP,
                        namespace='gov/trec-web-2004',
                        lang='en'),
            TrecQrels(dlc['trec-web-2004/qrels'], QREL_DEFS),
            documentation('trec-web-2004')),
    }

    ir_datasets.registry.register(NAME, base)
    for key in sorted(subsets):
        ir_datasets.registry.register(f'{NAME}/{key}', subsets[key])

    return base, subsets
コード例 #19
0
def _init():
    """Build and register the NYT collection, TREC Core 2017, and wksup splits.

    Returns:
        (base, subsets): the top-level Dataset and a dict of named subsets.
    """
    base_path = ir_datasets.util.home_path() / NAME
    dlc = DownloadConfig.context(NAME, base_path)
    documentation = YamlDocumentation(f'docs/{NAME}.yaml')
    migrator = Migrator(base_path / 'irds_version.txt', 'v2',
                        affected_files=[base_path / 'nyt.tgz.pklz4'],
                        message='Migrating nyt (extracting body text)')

    collection = migrator(NytDocs(dlc['source']))

    base = Dataset(collection, documentation('_'))

    subsets = {}

    # core17
    subsets['trec-core-2017'] = Dataset(
        TrecQueries(dlc['trec-core-2017/queries'],
                    namespace='trec-core-2017',
                    lang='en'),
        TrecQrels(dlc['trec-core-2017/qrels'], CORE_QREL_DEFS),
        collection,
        documentation('trec-core-2017'))

    # wksup: queries/qrels are derived from the collection itself
    queries = NytQueries(collection)
    qrels = NytQrels(collection)
    valid_qids = Lazy(lambda: VALID_IDS)
    # NOTE(review): 'wksup' reuses documentation('wksup/train'); looks like a
    # copy-paste — confirm whether a dedicated 'wksup' doc entry was intended.
    subsets['wksup'] = Dataset(
        queries,
        qrels,
        collection,
        documentation('wksup/train'))
    # 'wksup/valid' keeps only VALID_IDS; 'wksup/train' keeps everything else.
    for split, mode in (('train', 'exclude'), ('valid', 'include')):
        subsets[f'wksup/{split}'] = Dataset(
            FilteredQueries(queries, valid_qids, mode=mode),
            FilteredQrels(qrels, valid_qids, mode=mode),
            collection,
            documentation(f'wksup/{split}'))

    ir_datasets.registry.register('nyt', base)
    for key in sorted(subsets):
        ir_datasets.registry.register(f'nyt/{key}', subsets[key])

    return base, subsets
コード例 #20
0
ファイル: msmarco_qna.py プロジェクト: allenai/ir_datasets
def _init():
    """Build the MS MARCO QnA datasets and register them with ir_datasets.

    Returns:
        (collection, subsets): the docs collection and a dict of subset
        Datasets keyed by split name ('train', 'dev', 'eval').
    """
    base_path = ir_datasets.util.home_path()/NAME
    dlc = DownloadConfig.context(NAME, base_path, dua=DUA)
    documentation = YamlDocumentation(f'docs/{NAME}.yaml')
    manager = MsMarcoQnAManager(GzipExtract(dlc['train']), GzipExtract(dlc['dev']), GzipExtract(dlc['eval']), base_path)
    # v2 migration corrects doc_ids in the docstore and the derived run/qrels files.
    migrator = Migrator(base_path/'irds_version.txt', 'v2',
        affected_files=[
            base_path/'docs.pklz4',
            base_path/'train.run', base_path/'train.qrels',
            base_path/'dev.run', base_path/'dev.qrels',
            base_path/'eval.run',
        ],
        message='Migrating msmarco-qna (correcting doc_ids)')

    collection = migrator(DocstoreBackedDocs(manager.docs_store, docs_cls=MsMarcoQnADoc, namespace=NAME, lang='en'))

    subsets = {}

    # train and dev share the same structure: queries + qrels + scored docs
    for split in ('train', 'dev'):
        subsets[split] = Dataset(
            collection,
            TsvQueries(manager.file_ref(f'{split}.queries.tsv'), query_cls=MsMarcoQnAQuery, namespace='msmarco', lang='en'),
            migrator(TrecQrels(manager.file_ref(f'{split}.qrels'), QRELS_DEFS)),
            migrator(TrecScoredDocs(manager.file_ref(f'{split}.run'))),
        )

    # eval has no qrels and uses the eval-specific query type
    subsets['eval'] = Dataset(
        collection,
        TsvQueries(manager.file_ref('eval.queries.tsv'), query_cls=MsMarcoQnAEvalQuery, namespace='msmarco', lang='en'),
        migrator(TrecScoredDocs(manager.file_ref('eval.run'))),
    )

    ir_datasets.registry.register(NAME, Dataset(collection, documentation('_')))
    for subset_id in sorted(subsets):
        ir_datasets.registry.register(f'{NAME}/{subset_id}', Dataset(subsets[subset_id], documentation(subset_id)))

    return collection, subsets
コード例 #21
0
def _init():
    """Register the TREC Spanish collection and its TREC-3/TREC-4 subsets.

    Returns:
        (base, subsets): the base Dataset and a dict of subset Datasets.
    """
    base_path = ir_datasets.util.home_path() / NAME
    dlc = DownloadConfig.context(NAME, base_path)
    documentation = YamlDocumentation(f'docs/{NAME}.yaml')

    docs = TrecDocs(
        dlc['docs'],
        encoding='ISO-8859-1',
        path_globs=['**/afp_text/af*', '**/infosel_data/ism_*'],
        namespace=NAME,
        lang='es',
        count_hint=ir_datasets.util.count_hint(NAME))

    base = Dataset(docs, documentation('_'))

    def _translated_queries(key, query_cls, **trec_kwargs):
        # Wrap the raw TREC topic file in the Spanish translation adapter.
        return TrecSpanishTranslateQueries(
            TrecQueries(GzipExtract(dlc[f'{key}/queries']),
                        encoding='ISO-8859-1',
                        namespace=NAME,
                        lang=None,
                        **trec_kwargs),
            query_cls)

    subsets = {
        'trec3': Dataset(
            _translated_queries('trec3', TrecSpanish3Query, qtype_map=QTYPE_MAP_3),
            TrecQrels(GzipExtract(dlc['trec3/qrels']), QREL_DEFS),
            docs,
            documentation('trec3')),
        # TREC-4 topics are description-only, hence the different qtype.
        'trec4': Dataset(
            _translated_queries('trec4', TrecSpanish4Query,
                                qtype=TrecDescOnlyQuery, qtype_map=QTYPE_MAP_4),
            TrecQrels(GzipExtract(dlc['trec4/qrels']), QREL_DEFS),
            docs,
            documentation('trec4')),
    }

    ir_datasets.registry.register(NAME, base)
    for subset_id in sorted(subsets):
        ir_datasets.registry.register(f'{NAME}/{subset_id}', subsets[subset_id])

    return base, subsets
コード例 #22
0
def _init():
    """Register the CORD-19 collection (2020-07-16 snapshot) and TREC-COVID.

    Returns:
        (base, subsets): the base Dataset and a dict of subset Datasets.
    """
    base_path = ir_datasets.util.home_path()/NAME
    dlc = DownloadConfig.context(NAME, base_path)
    documentation = YamlDocumentation(f'docs/{NAME}.yaml')
    docs = Cord19Docs(dlc['docs/2020-07-16'], base_path/'2020-07-16', '2020-07-16')

    base = Dataset(docs, documentation('_'))

    # Map the TREC-COVID topic XML tags onto the standard query fields.
    topic_tag_map = {'query': 'title', 'question': 'description', 'narrative': 'narrative'}
    subsets = {
        'trec-covid': Dataset(
            TrecXmlQueries(dlc['trec-covid/queries'], qtype_map=topic_tag_map, namespace=NAME),
            TrecQrels(dlc['trec-covid/qrels'], QRELS_DEFS),
            docs,
            documentation('trec-covid')),
    }

    ir_datasets.registry.register(NAME, base)
    for subset_id in sorted(subsets):
        ir_datasets.registry.register(f'{NAME}/{subset_id}', subsets[subset_id])

    return base, subsets
コード例 #23
0
def _init():
    """Register the Vaswani collection (docs, queries, qrels in one tar).

    Returns:
        (base, subsets): the base Dataset and a (currently empty) subset dict.
    """
    documentation = YamlDocumentation(f'docs/{NAME}.yaml')
    base_path = ir_datasets.util.home_path() / NAME
    dlc = DownloadConfig.context(NAME, base_path)
    subsets = {}

    main_dlc = dlc['main']

    def _cached_member(member, local_name):
        # Extract one member of the main archive and cache it under base_path.
        return Cache(TarExtract(main_dlc, member), base_path / local_name)

    base = Dataset(
        VaswaniDocs(_cached_member('doc-text', 'docs.txt')),
        VaswaniQueries(_cached_member('query-text', 'queries.txt')),
        VaswaniQrels(_cached_member('rlv-ass', 'qrels.txt')),
        documentation('_'),
    )

    ir_datasets.registry.register(NAME, base)
    # subsets is empty today; the loop is kept for parity with other datasets.
    for subset_id in sorted(subsets):
        ir_datasets.registry.register(f'{NAME}/{subset_id}', subsets[subset_id])

    return base, subsets
コード例 #24
0
def _init():
    """Register TREC CAR v1.5 and its benchmark subsets.

    Returns:
        (base, subsets): the base Dataset (documentation only) and a dict
        of subset Datasets keyed by id (e.g. 'v1.5/trec-y1/manual').
    """
    subsets = {}
    base_path = ir_datasets.util.home_path()/NAME
    dlc = DownloadConfig.context(NAME, base_path)
    documentation = YamlDocumentation(f'docs/{NAME}.yaml')

    docs_v15 = CarDocs(TarExtract(dlc['docs'], 'paragraphcorpus/paragraphcorpus.cbor', compression='xz'))
    base = Dataset(documentation('_'))

    subsets['v1.5'] = Dataset(docs_v15, documentation('v1.5'))

    # TREC 2017 (Y1) test topics; the manual and automatic qrels variants
    # share the same query set.
    subsets['v1.5/trec-y1'] = Dataset(
        docs_v15,
        CarQueries(TarExtract(dlc['trec-y1/queries'], 'benchmarkY1test.public/test.benchmarkY1test.cbor.outlines', compression='xz')))
    for assess_kind, member, qrel_defs in (
            ('manual', 'TREC_CAR_2017_qrels/manual.benchmarkY1test.cbor.hierarchical.qrels', MANUAL_QRELS),
            ('auto', 'TREC_CAR_2017_qrels/automatic.benchmarkY1test.cbor.hierarchical.qrels', AUTO_QRELS)):
        subsets[f'v1.5/trec-y1/{assess_kind}'] = Dataset(
            subsets['v1.5/trec-y1'],
            TrecQrels(TarExtract(dlc['trec-y1/qrels'], member), qrel_defs))

    subsets['v1.5/test200'] = Dataset(
        docs_v15,
        CarQueries(TarExtract(dlc['test200'], 'test200/train.test200.cbor.outlines', compression='xz')),
        TrecQrels(TarExtract(dlc['test200'], 'test200/train.test200.cbor.hierarchical.qrels', compression='xz'), AUTO_QRELS))

    # Only fold0 of the training data is exposed; ReTar narrows the source
    # archive down to the outline/qrels members first.
    train_data = ReTar(dlc['train'], base_path/'train.smaller.tar.xz', ['train/train.fold?.cbor.outlines', 'train/train.fold?.cbor.hierarchical.qrels'], compression='xz')
    subsets['v1.5/train/fold0'] = Dataset(
        docs_v15,
        CarQueries(TarExtract(train_data, 'train/train.fold0.cbor.outlines', compression='xz')),
        TrecQrels(TarExtract(train_data, 'train/train.fold0.cbor.hierarchical.qrels', compression='xz'), AUTO_QRELS))

    ir_datasets.registry.register(NAME, base)
    for subset_id in sorted(subsets):
        ir_datasets.registry.register(f'{NAME}/{subset_id}', Dataset(subsets[subset_id], documentation(subset_id)))

    return base, subsets
コード例 #25
0
def _init():
    """Register the Medline collection and the TREC Genomics 2004/2005 subsets.

    Returns:
        (base, subsets): the base Dataset and a dict of subset Datasets.
    """
    documentation = YamlDocumentation(f'docs/{NAME}.yaml')
    base_path = ir_datasets.util.home_path() / NAME
    dlc = DownloadConfig.context(NAME, base_path)
    subsets = {}

    # The corpus ships as four gzipped parts (docs/a .. docs/d).
    collection = MedlineDocs([GzipExtract(dlc[f'docs/{part}']) for part in 'abcd'])
    base = Dataset(collection, documentation('_'))

    subsets['trec-genomics-2004'] = Dataset(
        collection,
        TrecXmlQueries(
            ZipExtract(dlc['trec-genomics-2004/queries'], 'Official.xml'),
            qtype=TrecGenomicsQuery,
            qtype_map=TREC04_XML_MAP,
            namespace='trec-genomics'),
        TrecQrels(dlc['trec-genomics-2004/qrels'], QREL_DEFS),
        documentation('trec-genomics-2004'),
    )
    # The 2005 topics use the genomics plain-text format rather than XML.
    subsets['trec-genomics-2005'] = Dataset(
        collection,
        TrecGenomicsQueries(dlc['trec-genomics-2005/queries']),
        TrecQrels(dlc['trec-genomics-2005/qrels'], QREL_DEFS),
        documentation('trec-genomics-2005'),
    )

    ir_datasets.registry.register(NAME, base)
    for subset_id in sorted(subsets):
        ir_datasets.registry.register(f'{NAME}/{subset_id}', subsets[subset_id])

    return base, subsets
コード例 #26
0
def _init():
    """Register the Cranfield collection (docs, queries, qrels in one tar).

    Returns:
        (base, subsets): the base Dataset and a (currently empty) subset dict.
    """
    documentation = YamlDocumentation(f'docs/{NAME}.yaml')
    base_path = ir_datasets.util.home_path() / NAME
    dlc = DownloadConfig.context(NAME, base_path)
    subsets = {}

    main_dlc = dlc['main']

    def _member(archive_name, cache_name):
        # Pull one file out of the main tar, caching it locally.
        return Cache(TarExtract(main_dlc, archive_name), base_path / cache_name)

    base = Dataset(
        CranfieldDocs(_member('cran.all.1400', 'docs.txt')),
        CranfieldQueries(_member('cran.qry', 'queries.txt')),
        CranfieldQrels(_member('cranqrel', 'qrels.txt')),
        documentation('_'),
    )

    ir_datasets.registry.register(NAME, base)
    # subsets is empty today; the loop is kept for parity with other datasets.
    for subset_id in sorted(subsets):
        ir_datasets.registry.register(f'{NAME}/{subset_id}', subsets[subset_id])

    return base, subsets
コード例 #27
0
def _init():
    """Build the MS MARCO (passage) dataset family and register all subsets.

    Wires up the shared passage collection, the train/dev/eval splits, the
    TREC DL 2019/2020 and DL-Hard subsets, and several filtered views
    (judged-only, split200, medical).

    Returns:
        (collection, subsets): the docs collection and a dict mapping subset
        ids (e.g. 'train', 'trec-dl-2019/judged') to Dataset objects.
    """
    documentation = YamlDocumentation(f'docs/{NAME}.yaml')
    base_path = ir_datasets.util.home_path() / NAME
    dlc = DownloadConfig.context(NAME, base_path, dua=DUA)
    # v2 migration re-extracts the collection with corrected text encoding.
    migrator = Migrator(base_path / 'irds_version.txt',
                        'v2',
                        affected_files=[
                            base_path / 'collection.tsv',
                            base_path / 'collection.tsv.pklz4'
                        ],
                        message=f'Migrating {NAME} (fixing passage encoding)')

    collection = TsvDocs(Cache(
        FixEncoding(TarExtract(dlc['collectionandqueries'], 'collection.tsv')),
        base_path / 'collection.tsv'),
                         namespace='msmarco',
                         lang='en',
                         docstore_size_hint=14373971970,
                         count_hint=ir_datasets.util.count_hint(NAME))
    collection = migrator(collection)
    subsets = {}

    # Core splits: train (queries, qrels, docpairs, scored docs)
    subsets['train'] = Dataset(
        collection,
        TsvQueries(Cache(TarExtract(dlc['queries'], 'queries.train.tsv'),
                         base_path / 'train/queries.tsv'),
                   namespace='msmarco',
                   lang='en'),
        TrecQrels(dlc['train/qrels'], QRELS_DEFS),
        TsvDocPairs(GzipExtract(dlc['train/docpairs'])),
        TrecScoredDocs(
            Cache(
                ExtractQidPid(
                    TarExtract(dlc['train/scoreddocs'], 'top1000.train.txt')),
                base_path / 'train/ms.run')),
    )

    # Same as train but with the v2 triples file for docpairs.
    subsets['train/triples-v2'] = Dataset(
        collection,
        subsets['train'].queries_handler(),
        subsets['train'].qrels_handler(),
        TsvDocPairs(GzipExtract(dlc['train/docpairs/v2'])),
        subsets['train'].scoreddocs_handler(),
    )

    # "Small" triples: passage texts in the file are mapped back to qid/pid.
    subsets['train/triples-small'] = Dataset(
        collection,
        subsets['train'].queries_handler(),
        subsets['train'].qrels_handler(),
        TsvDocPairs(
            Cache(
                MapSmallTriplesQidPid(
                    TarExtract(dlc['train/docpairs/small'],
                               'triples.train.small.tsv'),
                    TarExtract(dlc['collectionandqueries'], 'collection.tsv'),
                    subsets['train'].queries_handler()),
                base_path / 'train/small.triples.qidpid.tsv')),
        subsets['train'].scoreddocs_handler(),
    )

    subsets['dev'] = Dataset(
        collection,
        TsvQueries(Cache(TarExtract(dlc['queries'], 'queries.dev.tsv'),
                         base_path / 'dev/queries.tsv'),
                   namespace='msmarco',
                   lang='en'),
        TrecQrels(dlc['dev/qrels'], QRELS_DEFS),
    )

    subsets['dev/small'] = Dataset(
        collection,
        TsvQueries(Cache(
            TarExtract(dlc['collectionandqueries'], 'queries.dev.small.tsv'),
            base_path / 'dev/small/queries.tsv'),
                   namespace='msmarco',
                   lang='en'),
        TrecQrels(
            Cache(
                TarExtract(dlc['collectionandqueries'], 'qrels.dev.small.tsv'),
                base_path / 'dev/small/qrels'), QRELS_DEFS),
        TrecScoredDocs(
            Cache(
                ExtractQidPid(TarExtract(dlc['dev/scoreddocs'],
                                         'top1000.dev')),
                base_path / 'dev/ms.run')),
    )

    # eval has queries and scored docs only (no public qrels).
    subsets['eval'] = Dataset(
        collection,
        TsvQueries(Cache(TarExtract(dlc['queries'], 'queries.eval.tsv'),
                         base_path / 'eval/queries.tsv'),
                   namespace='msmarco',
                   lang='en'),
    )

    subsets['eval/small'] = Dataset(
        collection,
        TsvQueries(Cache(
            TarExtract(dlc['collectionandqueries'], 'queries.eval.small.tsv'),
            base_path / 'eval/small/queries.tsv'),
                   namespace='msmarco',
                   lang='en'),
        TrecScoredDocs(
            Cache(
                ExtractQidPid(
                    TarExtract(dlc['eval/scoreddocs'], 'top1000.eval')),
                base_path / 'eval/ms.run')),
    )

    # TREC Deep Learning track subsets (graded qrels).
    subsets['trec-dl-2019'] = Dataset(
        collection,
        TrecQrels(dlc['trec-dl-2019/qrels'], TREC_DL_QRELS_DEFS),
        TsvQueries(Cache(GzipExtract(dlc['trec-dl-2019/queries']),
                         base_path / 'trec-dl-2019/queries.tsv'),
                   namespace='msmarco',
                   lang='en'),
        TrecScoredDocs(
            Cache(ExtractQidPid(GzipExtract(dlc['trec-dl-2019/scoreddocs'])),
                  base_path / 'trec-dl-2019/ms.run')),
    )

    subsets['trec-dl-2020'] = Dataset(
        collection,
        TsvQueries(GzipExtract(dlc['trec-dl-2020/queries']),
                   namespace='msmarco',
                   lang='en'),
        TrecQrels(dlc['trec-dl-2020/qrels'], TREC_DL_QRELS_DEFS),
        TrecScoredDocs(
            Cache(ExtractQidPid(GzipExtract(dlc['trec-dl-2020/scoreddocs'])),
                  base_path / 'trec-dl-2020/ms.run')),
    )

    # A few subsets that are constrained to just the queries/qrels/docpairs that have at least
    # 1 relevance assessment
    train_judged = Lazy(
        lambda: {q.query_id
                 for q in subsets['train'].qrels_iter()})
    subsets['train/judged'] = Dataset(
        FilteredQueries(subsets['train'].queries_handler(), train_judged),
        FilteredScoredDocs(subsets['train'].scoreddocs_handler(),
                           train_judged),
        subsets['train'],
    )

    dev_judged = Lazy(
        lambda: {q.query_id
                 for q in subsets['dev'].qrels_iter()})
    subsets['dev/judged'] = Dataset(
        FilteredQueries(subsets['dev'].queries_handler(), dev_judged),
        subsets['dev'],
    )

    dl19_judged = Lazy(
        lambda: {q.query_id
                 for q in subsets['trec-dl-2019'].qrels_iter()})
    subsets['trec-dl-2019/judged'] = Dataset(
        FilteredQueries(subsets['trec-dl-2019'].queries_handler(),
                        dl19_judged),
        FilteredScoredDocs(subsets['trec-dl-2019'].scoreddocs_handler(),
                           dl19_judged),
        subsets['trec-dl-2019'],
    )

    dl20_judged = Lazy(
        lambda: {q.query_id
                 for q in subsets['trec-dl-2020'].qrels_iter()})
    subsets['trec-dl-2020/judged'] = Dataset(
        FilteredQueries(subsets['trec-dl-2020'].queries_handler(),
                        dl20_judged),
        FilteredScoredDocs(subsets['trec-dl-2020'].scoreddocs_handler(),
                           dl20_judged),
        subsets['trec-dl-2020'],
    )

    # split200 -- 200 queries held out from the training data for validation
    split200 = Lazy(lambda: SPLIT200_QIDS)
    subsets['train/split200-train'] = Dataset(
        FilteredQueries(subsets['train'].queries_handler(),
                        split200,
                        mode='exclude'),
        FilteredScoredDocs(subsets['train'].scoreddocs_handler(),
                           split200,
                           mode='exclude'),
        FilteredQrels(subsets['train'].qrels_handler(),
                      split200,
                      mode='exclude'),
        FilteredDocPairs(subsets['train'].docpairs_handler(),
                         split200,
                         mode='exclude'),
        subsets['train'],
    )
    subsets['train/split200-valid'] = Dataset(
        FilteredQueries(subsets['train'].queries_handler(),
                        split200,
                        mode='include'),
        FilteredScoredDocs(subsets['train'].scoreddocs_handler(),
                           split200,
                           mode='include'),
        FilteredQrels(subsets['train'].qrels_handler(),
                      split200,
                      mode='include'),
        FilteredDocPairs(subsets['train'].docpairs_handler(),
                         split200,
                         mode='include'),
        subsets['train'],
    )

    # Medical subset
    def train_med():
        # Read the medical query-id list (one id per line, utf8).
        with dlc['medmarco_ids'].stream() as stream:
            stream = codecs.getreader('utf8')(stream)
            return {l.rstrip() for l in stream}

    train_med = Lazy(train_med)
    subsets['train/medical'] = Dataset(
        FilteredQueries(subsets['train'].queries_handler(), train_med),
        FilteredScoredDocs(subsets['train'].scoreddocs_handler(), train_med),
        FilteredDocPairs(subsets['train'].docpairs_handler(), train_med),
        FilteredQrels(subsets['train'].qrels_handler(), train_med),
        subsets['train'],
    )

    # DL-Hard
    dl_hard_qrels_migrator = Migrator(
        base_path / 'trec-dl-hard' / 'irds_version.txt',
        'v3',
        affected_files=[base_path / 'trec-dl-hard' / 'qrels'],
        message='Updating trec-dl-hard qrels')
    hard_qids = Lazy(lambda: DL_HARD_QIDS)
    # DL-Hard draws its queries from the DL 2019 and 2020 topic files.
    dl_hard_base_queries = TsvQueries([
        Cache(GzipExtract(dlc['trec-dl-2019/queries']),
              base_path / 'trec-dl-2019/queries.tsv'),
        Cache(GzipExtract(dlc['trec-dl-2020/queries']),
              base_path / 'trec-dl-2020/queries.tsv')
    ],
                                      namespace='msmarco',
                                      lang='en')
    subsets['trec-dl-hard'] = Dataset(
        collection, FilteredQueries(dl_hard_base_queries, hard_qids),
        dl_hard_qrels_migrator(
            TrecQrels(dlc['trec-dl-hard/qrels'], TREC_DL_QRELS_DEFS)),
        documentation('trec-dl-hard'))
    # Five query folds; hard_qids is rebound per fold below.
    hard_qids = Lazy(lambda: DL_HARD_QIDS_BYFOLD['1'])
    subsets['trec-dl-hard/fold1'] = Dataset(
        collection, FilteredQueries(dl_hard_base_queries, hard_qids),
        FilteredQrels(subsets['trec-dl-hard'], hard_qids),
        documentation('trec-dl-hard/fold1'))
    hard_qids = Lazy(lambda: DL_HARD_QIDS_BYFOLD['2'])
    subsets['trec-dl-hard/fold2'] = Dataset(
        collection, FilteredQueries(dl_hard_base_queries, hard_qids),
        FilteredQrels(subsets['trec-dl-hard'], hard_qids),
        documentation('trec-dl-hard/fold2'))
    hard_qids = Lazy(lambda: DL_HARD_QIDS_BYFOLD['3'])
    subsets['trec-dl-hard/fold3'] = Dataset(
        collection, FilteredQueries(dl_hard_base_queries, hard_qids),
        FilteredQrels(subsets['trec-dl-hard'], hard_qids),
        documentation('trec-dl-hard/fold3'))
    hard_qids = Lazy(lambda: DL_HARD_QIDS_BYFOLD['4'])
    subsets['trec-dl-hard/fold4'] = Dataset(
        collection, FilteredQueries(dl_hard_base_queries, hard_qids),
        FilteredQrels(subsets['trec-dl-hard'], hard_qids),
        documentation('trec-dl-hard/fold4'))
    hard_qids = Lazy(lambda: DL_HARD_QIDS_BYFOLD['5'])
    subsets['trec-dl-hard/fold5'] = Dataset(
        collection, FilteredQueries(dl_hard_base_queries, hard_qids),
        FilteredQrels(subsets['trec-dl-hard'], hard_qids),
        documentation('trec-dl-hard/fold5'))

    ir_datasets.registry.register(NAME, Dataset(collection,
                                                documentation('_')))
    for s in sorted(subsets):
        ir_datasets.registry.register(f'{NAME}/{s}',
                                      Dataset(subsets[s], documentation(s)))

    return collection, subsets
コード例 #28
0
ファイル: clueweb12.py プロジェクト: allenai/ir_datasets
def _init():
    """Build the ClueWeb12 dataset family and register all subsets.

    Wires up the full corpus, the B13 sample, the TREC Web 2013/2014,
    NTCIR WWW, TREC Misinformation 2019, and CLEF eHealth subsets
    (the latter in several query languages over the B13 sample).

    Returns:
        (base, subsets): the base Dataset and a dict of subset Datasets.
    """
    documentation = YamlDocumentation(f'docs/{NAME}.yaml')
    base_path = ir_datasets.util.home_path() / NAME
    dlc = DownloadConfig.context(NAME, base_path)
    subsets = {}

    docs_dlc = dlc['docs']
    docs_chk_dlc = TarExtractAll(dlc['docs.chk'], base_path / 'corpus.chk')
    # The B13 extractor tool ships as a jar inside the cw12b-info archive.
    b13_dlc = Bz2Extract(
        Cache(
            TarExtract(
                dlc['cw12b-info'],
                'ClueWeb12-CreateB13/software/CreateClueWeb12B13Dataset.jar'),
            base_path / 'CreateClueWeb12B13Dataset.jar'))

    collection = ClueWeb12Docs(docs_dlc, docs_chk_dlc)
    collection_b13 = ClueWeb12Docs(ClueWeb12b13Extractor(docs_dlc, b13_dlc))

    base = Dataset(collection, documentation('_'))

    subsets['b13'] = Dataset(collection_b13, documentation('b13'))

    # TREC Web track ad-hoc subsets (full corpus).
    subsets['trec-web-2013'] = Dataset(
        collection,
        TrecXmlQueries(dlc['trec-web-2013/queries'],
                       qtype=TrecWebTrackQuery,
                       namespace='trec-web',
                       lang='en'),
        TrecQrels(dlc['trec-web-2013/qrels.adhoc'], QREL_DEFS),
        documentation('trec-web-2013'))

    subsets['trec-web-2014'] = Dataset(
        collection,
        TrecXmlQueries(dlc['trec-web-2014/queries'],
                       qtype=TrecWebTrackQuery,
                       namespace='trec-web',
                       lang='en'),
        TrecQrels(dlc['trec-web-2014/qrels.adhoc'], QREL_DEFS),
        documentation('trec-web-2014'))

    # NTCIR WWW subsets (B13 sample). WWW-1 topics use a simpler tag layout
    # than WWW-2/WWW-3, hence the ad-hoc qtype_map here.
    subsets['b13/ntcir-www-1'] = Dataset(
        collection_b13,
        TrecXmlQueries(Cache(
            ZipExtract(dlc['ntcir-www-1/queries'], 'eng.queries.xml'),
            base_path / 'ntcir-www-1' / 'queries.xml'),
                       qtype=GenericQuery,
                       qtype_map={
                           'qid': 'query_id',
                           'content': 'text'
                       },
                       namespace='ntcir-www',
                       lang='en'),
        NtcirQrels(dlc['ntcir-www-1/qrels'], NTCIR_QREL_DEFS),
        documentation('ntcir-www-1'))

    subsets['b13/ntcir-www-2'] = Dataset(
        collection_b13,
        TrecXmlQueries(Cache(
            ZipExtract(dlc['ntcir-www-2/queries'], 'qEng.xml'),
            base_path / 'ntcir-www-2' / 'queries.xml'),
                       qtype=NtcirQuery,
                       qtype_map=ntcir_map,
                       namespace='ntcir-www',
                       lang='en'),
        NtcirQrels(dlc['ntcir-www-2/qrels'], NTCIR_QREL_DEFS),
        documentation('ntcir-www-2'))

    # WWW-3 has queries only (no qrels here).
    subsets['b13/ntcir-www-3'] = Dataset(
        collection_b13,
        TrecXmlQueries(dlc['ntcir-www-3/queries'],
                       qtype=NtcirQuery,
                       qtype_map=ntcir_map,
                       namespace='ntcir-www',
                       lang='en'), documentation('ntcir-www-3'))

    subsets['b13/trec-misinfo-2019'] = Dataset(
        collection_b13,
        TrecXmlQueries(dlc['trec-misinfo-2019/queries'],
                       qtype=MisinfoQuery,
                       qtype_map=misinfo_map,
                       namespace='trec-misinfo-2019',
                       lang='en'),
        MsinfoQrels(dlc['trec-misinfo-2019/qrels'], MISINFO_QREL_DEFS),
        documentation('trec-misinfo-2019'))

    # CLEF eHealth: English topics plus translated variants below. Qrels
    # combine the 2016/2017 relevance, trust, and understandability/
    # readability assessments.
    subsets['b13/clef-ehealth'] = Dataset(
        collection_b13,
        TrecXmlQueries(FixAmp(dlc['clef-ehealth/queries']),
                       qtype=GenericQuery,
                       qtype_map=ehealth_map,
                       namespace='clef-ehealth',
                       lang='en'),
        EhealthQrels(
            [dlc['clef-ehealth/2016.qrels'], dlc['clef-ehealth/2017.qrels']],
            [dlc['clef-ehealth/2016.qtrust'], dlc['clef-ehealth/2017.qtrust']],
            [dlc['clef-ehealth/2016.qunder'], dlc['clef-ehealth/2017.qreads']],
            EHEALTH_QREL_DEFS), documentation('clef-ehealth'))

    # Per-language CLEF eHealth variants: same qrels sources, with a
    # language-specific query file and query_id suffix.
    subsets['b13/clef-ehealth/cs'] = Dataset(
        collection_b13,
        TrecXmlQueries(FixAmp(dlc['clef-ehealth/queries/cs']),
                       qtype=GenericQuery,
                       qtype_map=ehealth_map,
                       namespace='clef-ehealth',
                       lang='cs'),
        EhealthQrels(
            [dlc['clef-ehealth/2016.qrels'], dlc['clef-ehealth/2017.qrels']],
            [dlc['clef-ehealth/2016.qtrust'], dlc['clef-ehealth/2017.qtrust']],
            [dlc['clef-ehealth/2016.qunder'], dlc['clef-ehealth/2017.qreads']],
            EHEALTH_QREL_DEFS,
            query_id_suffix='-cs'), documentation('clef-ehealth/cs'))

    subsets['b13/clef-ehealth/de'] = Dataset(
        collection_b13,
        TrecXmlQueries(FixAmp(dlc['clef-ehealth/queries/de']),
                       qtype=GenericQuery,
                       qtype_map=ehealth_map,
                       namespace='clef-ehealth',
                       lang='de'),
        EhealthQrels(
            [dlc['clef-ehealth/2016.qrels'], dlc['clef-ehealth/2017.qrels']],
            [dlc['clef-ehealth/2016.qtrust'], dlc['clef-ehealth/2017.qtrust']],
            [dlc['clef-ehealth/2016.qunder'], dlc['clef-ehealth/2017.qreads']],
            EHEALTH_QREL_DEFS,
            query_id_suffix='-de'), documentation('clef-ehealth/de'))

    subsets['b13/clef-ehealth/fr'] = Dataset(
        collection_b13,
        TrecXmlQueries(FixAmp(dlc['clef-ehealth/queries/fr']),
                       qtype=GenericQuery,
                       qtype_map=ehealth_map,
                       namespace='clef-ehealth',
                       lang='fr'),
        EhealthQrels(
            [dlc['clef-ehealth/2016.qrels'], dlc['clef-ehealth/2017.qrels']],
            [dlc['clef-ehealth/2016.qtrust'], dlc['clef-ehealth/2017.qtrust']],
            [dlc['clef-ehealth/2016.qunder'], dlc['clef-ehealth/2017.qreads']],
            EHEALTH_QREL_DEFS,
            query_id_suffix='-fr'), documentation('clef-ehealth/fr'))

    subsets['b13/clef-ehealth/hu'] = Dataset(
        collection_b13,
        TrecXmlQueries(FixAmp(dlc['clef-ehealth/queries/hu']),
                       qtype=GenericQuery,
                       qtype_map=ehealth_map,
                       namespace='clef-ehealth',
                       lang='hu'),
        EhealthQrels(
            [dlc['clef-ehealth/2016.qrels'], dlc['clef-ehealth/2017.qrels']],
            [dlc['clef-ehealth/2016.qtrust'], dlc['clef-ehealth/2017.qtrust']],
            [dlc['clef-ehealth/2016.qunder'], dlc['clef-ehealth/2017.qreads']],
            EHEALTH_QREL_DEFS,
            query_id_suffix='-hu'), documentation('clef-ehealth/hu'))

    subsets['b13/clef-ehealth/pl'] = Dataset(
        collection_b13,
        TrecXmlQueries(FixAmp(dlc['clef-ehealth/queries/pl']),
                       qtype=GenericQuery,
                       qtype_map=ehealth_map,
                       namespace='clef-ehealth',
                       lang='pl'),
        EhealthQrels(
            [dlc['clef-ehealth/2016.qrels'], dlc['clef-ehealth/2017.qrels']],
            [dlc['clef-ehealth/2016.qtrust'], dlc['clef-ehealth/2017.qtrust']],
            [dlc['clef-ehealth/2016.qunder'], dlc['clef-ehealth/2017.qreads']],
            EHEALTH_QREL_DEFS,
            query_id_suffix='-pl'), documentation('clef-ehealth/pl'))

    subsets['b13/clef-ehealth/sv'] = Dataset(
        collection_b13,
        TrecXmlQueries(FixAmp(dlc['clef-ehealth/queries/sv']),
                       qtype=GenericQuery,
                       qtype_map=ehealth_map,
                       namespace='clef-ehealth',
                       lang='sv'),
        EhealthQrels(
            [dlc['clef-ehealth/2016.qrels'], dlc['clef-ehealth/2017.qrels']],
            [dlc['clef-ehealth/2016.qtrust'], dlc['clef-ehealth/2017.qtrust']],
            [dlc['clef-ehealth/2016.qunder'], dlc['clef-ehealth/2017.qreads']],
            EHEALTH_QREL_DEFS,
            query_id_suffix='-sv'), documentation('clef-ehealth/sv'))

    # NOTE: the following datasets are defined in touche.py:
    # - clueweb12/touche-2020-task-2
    # - clueweb12/touche-2021-task-2

    ir_datasets.registry.register(NAME, base)
    for s in sorted(subsets):
        ir_datasets.registry.register(f'{NAME}/{s}', subsets[s])

    return base, subsets
コード例 #29
0
def _init():
    """Build and register the ClueWeb09 dataset family.

    Creates the full multilingual collection, one sub-collection per
    language, the Category B subset (``catb``, first English segment only),
    and the TREC Web Track 2009-2012 / Million Query 2009 benchmark
    subsets, then registers everything in the ir_datasets registry.

    Returns:
        tuple: ``(base, subsets)`` — the base ``Dataset`` and a dict
        mapping subset name to ``Dataset``.
    """
    documentation = YamlDocumentation(f'docs/{NAME}.yaml')
    base_path = ir_datasets.util.home_path() / NAME
    dlc = DownloadConfig.context(NAME, base_path)
    subsets = {}

    docs_dlc = dlc['docs']
    # Checksum files, extracted alongside the corpus.
    chk_dlc = TarExtractAll(dlc['docs.chk'], base_path / 'corpus.chk')

    # The full collection spans all languages, so no single lang tag applies.
    collection = ClueWeb09Docs(docs_dlc, chk_dlc, lang=None)

    # Per-language sub-collections: each language maps to its corpus dirs.
    lang_dirs = {
        'ar': ['ClueWeb09_Arabic_1'],
        'zh': [f'ClueWeb09_Chinese_{i}' for i in range(1, 5)],
        'en': [f'ClueWeb09_English_{i}' for i in range(1, 11)],
        'fr': ['ClueWeb09_French_1'],
        'de': ['ClueWeb09_German_1'],
        'it': ['ClueWeb09_Italian_1'],
        'ja': ['ClueWeb09_Japanese_1', 'ClueWeb09_Japanese_2'],
        'ko': ['ClueWeb09_Korean_1'],
        'pt': ['ClueWeb09_Portuguese_1'],
        'es': ['ClueWeb09_Spanish_1', 'ClueWeb09_Spanish_2'],
    }
    lang_collections = {
        lang: ClueWeb09Docs(docs_dlc, chk_dlc, dirs=dirs, lang=lang)
        for lang, dirs in lang_dirs.items()
    }
    collection_en = lang_collections['en']
    # Category B ("catb") is the first English segment only.
    collection_catb = ClueWeb09Docs(docs_dlc,
                                    chk_dlc,
                                    dirs=['ClueWeb09_English_1'],
                                    lang='en')

    base = Dataset(collection, documentation('_'))

    for lang, coll in lang_collections.items():
        subsets[lang] = Dataset(coll, documentation(lang))
    subsets['catb'] = Dataset(collection_catb, documentation('catb'))

    def _web_queries(year):
        # TREC Web Track topics share the same XML format across years.
        return TrecXmlQueries(dlc[f'trec-web-{year}/queries'],
                              qtype=TrecWebTrackQuery,
                              namespace=NAME,
                              lang='en')

    # 2009 used gzip-compressed prels (QREL_DEFS_09); 2010-2012 use
    # standard qrels (QREL_DEFS).
    qrels_2009 = TrecPrels(GzipExtract(dlc['trec-web-2009/qrels.adhoc']),
                           QREL_DEFS_09)
    subsets['en/trec-web-2009'] = Dataset(
        collection_en,
        _web_queries(2009),
        qrels_2009,
        documentation('trec-web-2009'))
    subsets['catb/trec-web-2009'] = Dataset(
        collection_catb,
        _web_queries(2009),
        # Restrict judgments to documents present in Category B.
        CatBQrelFilter(qrels_2009),
        documentation('trec-web-2009'))

    for year in (2010, 2011, 2012):
        qrels = TrecQrels(dlc[f'trec-web-{year}/qrels.adhoc'], QREL_DEFS)
        subsets[f'en/trec-web-{year}'] = Dataset(
            collection_en,
            _web_queries(year),
            qrels,
            documentation(f'trec-web-{year}'))
        subsets[f'catb/trec-web-{year}'] = Dataset(
            collection_catb,
            _web_queries(year),
            CatBQrelFilter(qrels),
            documentation(f'trec-web-{year}'))

    subsets['trec-mq-2009'] = Dataset(
        collection,
        # Million Query topics are "qid:text" lines; latin1-encoded file.
        TrecColonQueries(GzipExtract(dlc['trec-mq-2009/queries']),
                         encoding='latin1',
                         lang='en'),
        TrecPrels(GzipExtract(dlc['trec-mq-2009/qrels']), QREL_DEFS_09),
        documentation('trec-mq-2009'))

    ir_datasets.registry.register(NAME, base)
    for s in sorted(subsets):
        ir_datasets.registry.register(f'{NAME}/{s}', subsets[s])

    return base, subsets
コード例 #30
0
def _init():
    """Build and register the NFCorpus dataset and its subsets.

    Builds the document collection plus the train/dev/test splits, each
    with ``nontopic`` and ``video`` variants whose qrels are filtered by
    query-id lists shipped in the corpus tarball, then registers them in
    the ir_datasets registry.

    Returns:
        tuple: ``(collection, subsets)`` — the docs handler and a dict
        mapping subset name to ``Dataset``.
    """
    base_path = ir_datasets.util.home_path() / NAME
    dlc = DownloadConfig.context(NAME, base_path)
    documentation = YamlDocumentation(f'docs/{NAME}.yaml')
    main_dlc = dlc['main']
    subsets = {}

    def _cached(member, local_path):
        # Extract one member of the main tarball and cache it under
        # base_path so subsequent reads skip the tarball.
        return Cache(TarExtract(main_dlc, f'nfcorpus/{member}'),
                     base_path / local_path)

    collection = TsvDocs(_cached('raw/doc_dump.txt', 'collection.tsv'),
                         doc_cls=NfCorpusDoc,
                         namespace=NAME)

    def read_lines(file):
        # Return the set of stripped lines (query ids) from a raw id list.
        source = _cached(f'raw/{file}', file)
        with source.stream() as stream:
            stream = codecs.getreader('utf8')(stream)
            return {line.rstrip() for line in stream}

    # Lazy so the tarball is only fetched when the filters are first used.
    nontopic_qid_filter = Lazy(lambda: read_lines('nontopics.ids'))
    video_qid_filter = Lazy(lambda: read_lines('all_videos.ids'))

    def _main_queries(split):
        # Merge the titles-only and "all" query files into NfCorpusQuery
        # (id, title, all) records; the id column comes from the first file.
        return ZipQueries([
            TsvQueries(_cached(f'{split}.titles.queries',
                               f'{split}/queries.titles.tsv'),
                       namespace=NAME),
            TsvQueries(_cached(f'{split}.all.queries',
                               f'{split}/queries.all.tsv'),
                       namespace=NAME),
        ], [(0, 0), (0, 1), (1, 1)], NfCorpusQuery)

    def _video_queries(split):
        # Merge video titles and descriptions into NfCorpusVideoQuery
        # (id, title, desc) records.
        return ZipQueries([
            TsvQueries(_cached(f'{split}.vid-titles.queries',
                               f'{split}/video/queries.titles.tsv'),
                       namespace=NAME),
            TsvQueries(_cached(f'{split}.vid-desc.queries',
                               f'{split}/video/queries.desc.tsv'),
                       namespace=NAME),
        ], [(0, 0), (0, 1), (1, 1)], NfCorpusVideoQuery)

    for split in ('train', 'dev', 'test'):
        subsets[split] = Dataset(
            collection,
            _main_queries(split),
            TrecQrels(_cached(f'{split}.3-2-1.qrel', f'{split}/qrels'),
                      QRELS_DEFS),
            documentation(split),
        )

        subsets[f'{split}/nontopic'] = Dataset(
            collection,
            TsvQueries(_cached(f'{split}.nontopic-titles.queries',
                               f'{split}/nontopic/queries.tsv'),
                       namespace=NAME),
            # Reuse the parent split's qrels, keeping only nontopic qids.
            FilteredQrels(subsets[split].qrels_handler(),
                          nontopic_qid_filter,
                          mode='include'),
            documentation(f'{split}/nontopic'),
        )

        subsets[f'{split}/video'] = Dataset(
            collection,
            _video_queries(split),
            # NOTE(review): this second query source (nontopic-titles) looks
            # redundant with the video ZipQueries above — possibly a
            # copy/paste from the nontopic subset. Preserved as-is to keep
            # behavior identical; confirm before removing.
            TsvQueries(_cached(f'{split}.nontopic-titles.queries',
                               f'{split}/video/queries.tsv'),
                       NfCorpusVideoQuery,
                       namespace=NAME),
            # Reuse the parent split's qrels, keeping only video qids.
            FilteredQrels(subsets[split].qrels_handler(),
                          video_qid_filter,
                          mode='include'),
            documentation(f'{split}/video'),
        )

    ir_datasets.registry.register(NAME, Dataset(collection,
                                                documentation('_')))
    for s in sorted(subsets):
        ir_datasets.registry.register(f'{NAME}/{s}', subsets[s])

    return collection, subsets