Example #1
def _init():
    documentation = YamlDocumentation(f'docs/{NAME}.yaml')
    base_path = ir_datasets.util.home_path()/NAME
    dlc = DownloadConfig.context(NAME, base_path)
    subsets = {}

    docs_dlc = dlc['docs']
    docs_chk_dlc = TarExtractAll(dlc['docs.chk'], base_path/'corpus.chk')
    b13_dlc = Bz2Extract(
        Cache(
            TarExtract(
                dlc['cw12b-info'],
                'ClueWeb12-CreateB13/software/CreateClueWeb12B13Dataset.jar'),
            base_path/'CreateClueWeb12B13Dataset.jar'))

    collection = ClueWeb12Docs(docs_dlc, docs_chk_dlc)
    collection_b13 = ClueWeb12Docs(ClueWeb12b13Extractor(docs_dlc, b13_dlc))

    base = Dataset(collection, documentation('_'))

    subsets['b13'] = Dataset(collection_b13, documentation('b13'))

    subsets['trec-web-2013'] = Dataset(
        collection,
        TrecXmlQueries(dlc['trec-web-2013/queries'], qtype=TrecWebTrackQuery, namespace=NAME),
        TrecQrels(dlc['trec-web-2013/qrels.adhoc'], QREL_DEFS),
        documentation('trec-web-2013'))

    subsets['trec-web-2014'] = Dataset(
        collection,
        TrecXmlQueries(dlc['trec-web-2014/queries'], qtype=TrecWebTrackQuery, namespace=NAME),
        TrecQrels(dlc['trec-web-2014/qrels.adhoc'], QREL_DEFS),
        documentation('trec-web-2014'))

    subsets['b13/ntcir-www-1'] = Dataset(
        collection_b13,
        TrecXmlQueries(Cache(
            ZipExtract(dlc['ntcir-www-1/queries'], 'eng.queries.xml'),
            base_path/'ntcir-www-1'/'queries.xml'),
                       qtype=GenericQuery,
                       qtype_map={'qid': 'query_id', 'content': 'text'},
                       namespace=NAME),
        NtcirQrels(dlc['ntcir-www-1/qrels'], NTCIR_QREL_DEFS),
        documentation('ntcir-www-1'))

    subsets['b13/ntcir-www-2'] = Dataset(
        collection_b13,
        TrecXmlQueries(Cache(
            ZipExtract(dlc['ntcir-www-2/queries'], 'qEng.xml'),
            base_path/'ntcir-www-2'/'queries.xml'),
                       qtype=NtcirQuery,
                       qtype_map=ntcir_map,
                       namespace=NAME),
        NtcirQrels(dlc['ntcir-www-2/qrels'], NTCIR_QREL_DEFS),
        documentation('ntcir-www-2'))

    subsets['b13/ntcir-www-3'] = Dataset(
        collection_b13,
        TrecXmlQueries(dlc['ntcir-www-3/queries'], qtype=NtcirQuery, qtype_map=ntcir_map, namespace=NAME),
        documentation('ntcir-www-3'))

    subsets['b13/trec-misinfo-2019'] = Dataset(
        collection_b13,
        TrecXmlQueries(dlc['trec-misinfo-2019/queries'], qtype=MisinfoQuery, qtype_map=misinfo_map, namespace=NAME),
        MsinfoQrels(dlc['trec-misinfo-2019/qrels'], MISINFO_QREL_DEFS),
        documentation('trec-misinfo-2019'))

    ir_datasets.registry.register(NAME, base)
    for s in sorted(subsets):
        ir_datasets.registry.register(f'{NAME}/{s}', subsets[s])

    return base, subsets
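
Once _init() has run, the registered datasets can be loaded by name. A minimal usage sketch (assuming NAME == 'clueweb12' and that the ir_datasets package is installed; the printed fields follow the TrecWebTrackQuery type named above):

import ir_datasets

dataset = ir_datasets.load('clueweb12/trec-web-2013')
for query in dataset.queries_iter():
    print(query.query_id, query.query)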
Example #2
def _init():
    documentation = YamlDocumentation(f'docs/{NAME}.yaml')
    base_path = ir_datasets.util.home_path() / NAME
    dlc = DownloadConfig.context(NAME, base_path)
    subsets = {}

    main_dlc = dlc['main']
    base = Dataset(
        VaswaniDocs(
            Cache(TarExtract(main_dlc, 'doc-text'), base_path / 'docs.txt')),
        VaswaniQueries(
            Cache(TarExtract(main_dlc, 'query-text'),
                  base_path / 'queries.txt')),
        VaswaniQrels(
            Cache(TarExtract(main_dlc, 'rlv-ass'), base_path / 'qrels.txt')),
        documentation('_'),
    )

    ir_datasets.registry.register(NAME, base)
    for s in sorted(subsets):
        ir_datasets.registry.register(f'{NAME}/{s}', subsets[s])

    return base, subsets
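
The Cache(TarExtract(...), path) composition above extracts a single member from the downloaded tarball and memoizes it on disk. A toy sketch of that idea (an assumption for illustration, not the library's actual implementation):

import shutil
import tarfile
from pathlib import Path

def cached_tar_member(tar_path, member, cache_path):
    cache_path = Path(cache_path)
    if not cache_path.exists():  # extract once; later calls reuse the cached file
        cache_path.parent.mkdir(parents=True, exist_ok=True)
        with tarfile.open(tar_path) as tar, \
             tar.extractfile(member) as src, \
             open(cache_path, 'wb') as dst:
            shutil.copyfileobj(src, dst)
    return cache_path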
Example #3
def _init():
    base_path = home_path() / NAME

    documentation = YamlDocumentation(f"docs/{NAME}.yaml")
    download_config = DownloadConfig.context(NAME, base_path)

    base = Dataset(documentation('_'))

    # Arguments that can be loaded from Zenodo.
    arguments: Dict[str, ArgsMeDocs] = {
        name: ArgsMeDocs(Cache(ZipExtract(download_config[name], zip_path),
                               base_path / f"{name}.json"),
                         namespace=f"{NAME}/{name}",
                         language=language,
                         count_hint=count_hint)
        for name, (count_hint, language, zip_path) in SUBSETS.items()
    }

    # Arguments that are combined versions of other subsets.
    combined_arguments: Dict[str, ArgsMeCombinedArguments] = {
        name: ArgsMeCombinedArguments(
            base_path / f"{name}.json",
            [arguments[subset_name] for subset_name in subset_names],
            namespace=f"{NAME}/{name}",
            language=language,
            count_hint=count_hint)
        for name, (subset_names, count_hint,
                   language) in COMBINED_SUBSETS.items()
    }

    # Wrap in datasets with documentation.
    datasets = {
        name: Dataset(arguments, documentation(name))
        for name, arguments in chain(arguments.items(),
                                     combined_arguments.items())
    }

    # NOTE: the following datasets are defined in touche.py:
    #  - argsme/1.0/touche-2020-task-1/uncorrected
    #  - argsme/2020-04-01/touche-2020-task-1
    #  - argsme/2020-04-01/touche-2020-task-1/uncorrected
    #  - argsme/2020-04-01/touche-2021-task-1

    # Register datasets.
    registry.register(NAME, base)
    for name, arguments in datasets.items():
        registry.register(f'{NAME}/{name}', arguments)

    return base, datasets
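
registry.register(...) above maps dataset identifiers to Dataset objects. A minimal sketch of what such a registry could provide (an assumption for illustration, not the actual ir_datasets implementation):

class Registry:
    def __init__(self):
        self._datasets = {}

    def register(self, name, dataset):
        if name in self._datasets:
            raise KeyError(f'{name} is already registered')
        self._datasets[name] = dataset

    def __getitem__(self, name):
        return self._datasets[name]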
Example #4
def _init():
    documentation = YamlDocumentation(f'docs/{NAME}.yaml')
    base_path = ir_datasets.util.home_path() / NAME
    dlc = DownloadConfig.context(NAME, base_path)
    subsets = {}

    main_dlc = dlc['main']
    base = Dataset(
        CranfieldDocs(
            Cache(TarExtract(main_dlc, 'cran.all.1400'),
                  base_path / 'docs.txt')),
        CranfieldQueries(
            Cache(TarExtract(main_dlc, 'cran.qry'),
                  base_path / 'queries.txt')),
        CranfieldQrels(
            Cache(TarExtract(main_dlc, 'cranqrel'), base_path / 'qrels.txt')),
        documentation('_'),
    )

    ir_datasets.registry.register(NAME, base)
    for s in sorted(subsets):
        ir_datasets.registry.register(f'{NAME}/{s}', subsets[s])

    return base, subsets
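
Registered datasets are also reachable from the ir_datasets command line. For example (assuming NAME == 'cranfield'), something like the following should dump the parsed documents and relevance judgments:

ir_datasets export cranfield docs
ir_datasets export cranfield qrels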
Example #5
def _init():
    documentation = YamlDocumentation(f'docs/{NAME}.yaml')
    base_path = ir_datasets.util.home_path() / NAME
    dlc = DownloadConfig.context(NAME, base_path, dua=DUA)
    migrator = Migrator(base_path / 'irds_version.txt',
                        'v2',
                        affected_files=[
                            base_path / 'collection.tsv',
                            base_path / 'collection.tsv.pklz4'
                        ],
                        message=f'Migrating {NAME} (fixing passage encoding)')

    collection = TsvDocs(Cache(
        FixEncoding(TarExtract(dlc['collectionandqueries'], 'collection.tsv')),
        base_path / 'collection.tsv'),
                         namespace='msmarco',
                         lang='en',
                         docstore_size_hint=14373971970,
                         count_hint=ir_datasets.util.count_hint(NAME))
    collection = migrator(collection)
    subsets = {}

    subsets['train'] = Dataset(
        collection,
        TsvQueries(Cache(TarExtract(dlc['queries'], 'queries.train.tsv'),
                         base_path / 'train/queries.tsv'),
                   namespace='msmarco',
                   lang='en'),
        TrecQrels(dlc['train/qrels'], QRELS_DEFS),
        TsvDocPairs(GzipExtract(dlc['train/docpairs'])),
        TrecScoredDocs(
            Cache(
                ExtractQidPid(
                    TarExtract(dlc['train/scoreddocs'], 'top1000.train.txt')),
                base_path / 'train/ms.run')),
    )

    subsets['train/triples-v2'] = Dataset(
        collection,
        subsets['train'].queries_handler(),
        subsets['train'].qrels_handler(),
        TsvDocPairs(GzipExtract(dlc['train/docpairs/v2'])),
        subsets['train'].scoreddocs_handler(),
    )

    subsets['train/triples-small'] = Dataset(
        collection,
        subsets['train'].queries_handler(),
        subsets['train'].qrels_handler(),
        TsvDocPairs(
            Cache(
                MapSmallTriplesQidPid(
                    TarExtract(dlc['train/docpairs/small'],
                               'triples.train.small.tsv'),
                    TarExtract(dlc['collectionandqueries'], 'collection.tsv'),
                    subsets['train'].queries_handler()),
                base_path / 'train/small.triples.qidpid.tsv')),
        subsets['train'].scoreddocs_handler(),
    )

    subsets['dev'] = Dataset(
        collection,
        TsvQueries(Cache(TarExtract(dlc['queries'], 'queries.dev.tsv'),
                         base_path / 'dev/queries.tsv'),
                   namespace='msmarco',
                   lang='en'),
        TrecQrels(dlc['dev/qrels'], QRELS_DEFS),
    )

    subsets['dev/small'] = Dataset(
        collection,
        TsvQueries(Cache(
            TarExtract(dlc['collectionandqueries'], 'queries.dev.small.tsv'),
            base_path / 'dev/small/queries.tsv'),
                   namespace='msmarco',
                   lang='en'),
        TrecQrels(
            Cache(
                TarExtract(dlc['collectionandqueries'], 'qrels.dev.small.tsv'),
                base_path / 'dev/small/qrels'), QRELS_DEFS),
        TrecScoredDocs(
            Cache(
                ExtractQidPid(TarExtract(dlc['dev/scoreddocs'],
                                         'top1000.dev')),
                base_path / 'dev/ms.run')),
    )

    subsets['eval'] = Dataset(
        collection,
        TsvQueries(Cache(TarExtract(dlc['queries'], 'queries.eval.tsv'),
                         base_path / 'eval/queries.tsv'),
                   namespace='msmarco',
                   lang='en'),
    )

    subsets['eval/small'] = Dataset(
        collection,
        TsvQueries(Cache(
            TarExtract(dlc['collectionandqueries'], 'queries.eval.small.tsv'),
            base_path / 'eval/small/queries.tsv'),
                   namespace='msmarco',
                   lang='en'),
        TrecScoredDocs(
            Cache(
                ExtractQidPid(
                    TarExtract(dlc['eval/scoreddocs'], 'top1000.eval')),
                base_path / 'eval/ms.run')),
    )

    subsets['trec-dl-2019'] = Dataset(
        collection,
        TrecQrels(dlc['trec-dl-2019/qrels'], TREC_DL_QRELS_DEFS),
        TsvQueries(Cache(GzipExtract(dlc['trec-dl-2019/queries']),
                         base_path / 'trec-dl-2019/queries.tsv'),
                   namespace='msmarco',
                   lang='en'),
        TrecScoredDocs(
            Cache(ExtractQidPid(GzipExtract(dlc['trec-dl-2019/scoreddocs'])),
                  base_path / 'trec-dl-2019/ms.run')),
    )

    subsets['trec-dl-2020'] = Dataset(
        collection,
        TsvQueries(GzipExtract(dlc['trec-dl-2020/queries']),
                   namespace='msmarco',
                   lang='en'),
        TrecQrels(dlc['trec-dl-2020/qrels'], TREC_DL_QRELS_DEFS),
        TrecScoredDocs(
            Cache(ExtractQidPid(GzipExtract(dlc['trec-dl-2020/scoreddocs'])),
                  base_path / 'trec-dl-2020/ms.run')),
    )

    # A few subsets that are constrained to just the queries/qrels/docpairs
    # that have at least 1 relevance assessment
    train_judged = Lazy(
        lambda: {q.query_id
                 for q in subsets['train'].qrels_iter()})
    subsets['train/judged'] = Dataset(
        FilteredQueries(subsets['train'].queries_handler(), train_judged),
        FilteredScoredDocs(subsets['train'].scoreddocs_handler(),
                           train_judged),
        subsets['train'],
    )

    dev_judged = Lazy(
        lambda: {q.query_id
                 for q in subsets['dev'].qrels_iter()})
    subsets['dev/judged'] = Dataset(
        FilteredQueries(subsets['dev'].queries_handler(), dev_judged),
        subsets['dev'],
    )

    dl19_judged = Lazy(
        lambda: {q.query_id
                 for q in subsets['trec-dl-2019'].qrels_iter()})
    subsets['trec-dl-2019/judged'] = Dataset(
        FilteredQueries(subsets['trec-dl-2019'].queries_handler(),
                        dl19_judged),
        FilteredScoredDocs(subsets['trec-dl-2019'].scoreddocs_handler(),
                           dl19_judged),
        subsets['trec-dl-2019'],
    )

    dl20_judged = Lazy(
        lambda: {q.query_id
                 for q in subsets['trec-dl-2020'].qrels_iter()})
    subsets['trec-dl-2020/judged'] = Dataset(
        FilteredQueries(subsets['trec-dl-2020'].queries_handler(),
                        dl20_judged),
        FilteredScoredDocs(subsets['trec-dl-2020'].scoreddocs_handler(),
                           dl20_judged),
        subsets['trec-dl-2020'],
    )

    # split200 -- 200 queries held out from the training data for validation
    split200 = Lazy(lambda: SPLIT200_QIDS)
    subsets['train/split200-train'] = Dataset(
        FilteredQueries(subsets['train'].queries_handler(),
                        split200,
                        mode='exclude'),
        FilteredScoredDocs(subsets['train'].scoreddocs_handler(),
                           split200,
                           mode='exclude'),
        FilteredQrels(subsets['train'].qrels_handler(),
                      split200,
                      mode='exclude'),
        FilteredDocPairs(subsets['train'].docpairs_handler(),
                         split200,
                         mode='exclude'),
        subsets['train'],
    )
    subsets['train/split200-valid'] = Dataset(
        FilteredQueries(subsets['train'].queries_handler(),
                        split200,
                        mode='include'),
        FilteredScoredDocs(subsets['train'].scoreddocs_handler(),
                           split200,
                           mode='include'),
        FilteredQrels(subsets['train'].qrels_handler(),
                      split200,
                      mode='include'),
        FilteredDocPairs(subsets['train'].docpairs_handler(),
                         split200,
                         mode='include'),
        subsets['train'],
    )

    # Medical subset
    def train_med():
        with dlc['medmarco_ids'].stream() as stream:
            stream = codecs.getreader('utf8')(stream)
            return {l.rstrip() for l in stream}

    train_med = Lazy(train_med)
    subsets['train/medical'] = Dataset(
        FilteredQueries(subsets['train'].queries_handler(), train_med),
        FilteredScoredDocs(subsets['train'].scoreddocs_handler(), train_med),
        FilteredDocPairs(subsets['train'].docpairs_handler(), train_med),
        FilteredQrels(subsets['train'].qrels_handler(), train_med),
        subsets['train'],
    )

    # DL-Hard
    dl_hard_qrels_migrator = Migrator(
        base_path / 'trec-dl-hard' / 'irds_version.txt',
        'v3',
        affected_files=[base_path / 'trec-dl-hard' / 'qrels'],
        message='Updating trec-dl-hard qrels')
    hard_qids = Lazy(lambda: DL_HARD_QIDS)
    dl_hard_base_queries = TsvQueries([
        Cache(GzipExtract(dlc['trec-dl-2019/queries']),
              base_path / 'trec-dl-2019/queries.tsv'),
        Cache(GzipExtract(dlc['trec-dl-2020/queries']),
              base_path / 'trec-dl-2020/queries.tsv')
    ],
                                      namespace='msmarco',
                                      lang='en')
    subsets['trec-dl-hard'] = Dataset(
        collection, FilteredQueries(dl_hard_base_queries, hard_qids),
        dl_hard_qrels_migrator(
            TrecQrels(dlc['trec-dl-hard/qrels'], TREC_DL_QRELS_DEFS)),
        documentation('trec-dl-hard'))
    hard_qids = Lazy(lambda: DL_HARD_QIDS_BYFOLD['1'])
    subsets['trec-dl-hard/fold1'] = Dataset(
        collection, FilteredQueries(dl_hard_base_queries, hard_qids),
        FilteredQrels(subsets['trec-dl-hard'], hard_qids),
        documentation('trec-dl-hard/fold1'))
    hard_qids = Lazy(lambda: DL_HARD_QIDS_BYFOLD['2'])
    subsets['trec-dl-hard/fold2'] = Dataset(
        collection, FilteredQueries(dl_hard_base_queries, hard_qids),
        FilteredQrels(subsets['trec-dl-hard'], hard_qids),
        documentation('trec-dl-hard/fold2'))
    hard_qids = Lazy(lambda: DL_HARD_QIDS_BYFOLD['3'])
    subsets['trec-dl-hard/fold3'] = Dataset(
        collection, FilteredQueries(dl_hard_base_queries, hard_qids),
        FilteredQrels(subsets['trec-dl-hard'], hard_qids),
        documentation('trec-dl-hard/fold3'))
    hard_qids = Lazy(lambda: DL_HARD_QIDS_BYFOLD['4'])
    subsets['trec-dl-hard/fold4'] = Dataset(
        collection, FilteredQueries(dl_hard_base_queries, hard_qids),
        FilteredQrels(subsets['trec-dl-hard'], hard_qids),
        documentation('trec-dl-hard/fold4'))
    hard_qids = Lazy(lambda: DL_HARD_QIDS_BYFOLD['5'])
    subsets['trec-dl-hard/fold5'] = Dataset(
        collection, FilteredQueries(dl_hard_base_queries, hard_qids),
        FilteredQrels(subsets['trec-dl-hard'], hard_qids),
        documentation('trec-dl-hard/fold5'))

    ir_datasets.registry.register(NAME, Dataset(collection,
                                                documentation('_')))
    for s in sorted(subsets):
        ir_datasets.registry.register(f'{NAME}/{s}',
                                      Dataset(subsets[s], documentation(s)))

    return collection, subsets
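
The Lazy(...) wrappers above defer building the judged query-id sets until a filter actually needs them, so merely constructing the datasets stays cheap. A toy sketch of the idea (an assumption, not the library's code):

class Lazy:
    def __init__(self, fn):
        self._fn = fn
        self._value = None
        self._ready = False

    def __call__(self):
        if not self._ready:  # compute on first use, then memoize
            self._value = self._fn()
            self._ready = True
        return self._value

judged = Lazy(lambda: {'q1', 'q2'})  # nothing computed yet
qids = judged()                      # set built here and reused afterwards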
Example #6
def _init():
    documentation = YamlDocumentation(f'docs/{NAME}.yaml')
    base_path = ir_datasets.util.home_path() / NAME
    dlc = DownloadConfig.context(NAME, base_path)
    subsets = {}

    docs_dlc = dlc['docs']
    docs_chk_dlc = TarExtractAll(dlc['docs.chk'], base_path / 'corpus.chk')
    b13_dlc = Bz2Extract(
        Cache(
            TarExtract(
                dlc['cw12b-info'],
                'ClueWeb12-CreateB13/software/CreateClueWeb12B13Dataset.jar'),
            base_path / 'CreateClueWeb12B13Dataset.jar'))

    collection = ClueWeb12Docs(docs_dlc, docs_chk_dlc)
    collection_b13 = ClueWeb12Docs(ClueWeb12b13Extractor(docs_dlc, b13_dlc))

    base = Dataset(collection, documentation('_'))

    subsets['b13'] = Dataset(collection_b13, documentation('b13'))

    subsets['trec-web-2013'] = Dataset(
        collection,
        TrecXmlQueries(dlc['trec-web-2013/queries'],
                       qtype=TrecWebTrackQuery,
                       namespace='trec-web',
                       lang='en'),
        TrecQrels(dlc['trec-web-2013/qrels.adhoc'], QREL_DEFS),
        documentation('trec-web-2013'))

    subsets['trec-web-2014'] = Dataset(
        collection,
        TrecXmlQueries(dlc['trec-web-2014/queries'],
                       qtype=TrecWebTrackQuery,
                       namespace='trec-web',
                       lang='en'),
        TrecQrels(dlc['trec-web-2014/qrels.adhoc'], QREL_DEFS),
        documentation('trec-web-2014'))

    subsets['b13/ntcir-www-1'] = Dataset(
        collection_b13,
        TrecXmlQueries(Cache(
            ZipExtract(dlc['ntcir-www-1/queries'], 'eng.queries.xml'),
            base_path / 'ntcir-www-1' / 'queries.xml'),
                       qtype=GenericQuery,
                       qtype_map={
                           'qid': 'query_id',
                           'content': 'text'
                       },
                       namespace='ntcir-www',
                       lang='en'),
        NtcirQrels(dlc['ntcir-www-1/qrels'], NTCIR_QREL_DEFS),
        documentation('ntcir-www-1'))

    subsets['b13/ntcir-www-2'] = Dataset(
        collection_b13,
        TrecXmlQueries(Cache(
            ZipExtract(dlc['ntcir-www-2/queries'], 'qEng.xml'),
            base_path / 'ntcir-www-2' / 'queries.xml'),
                       qtype=NtcirQuery,
                       qtype_map=ntcir_map,
                       namespace='ntcir-www',
                       lang='en'),
        NtcirQrels(dlc['ntcir-www-2/qrels'], NTCIR_QREL_DEFS),
        documentation('ntcir-www-2'))

    subsets['b13/ntcir-www-3'] = Dataset(
        collection_b13,
        TrecXmlQueries(dlc['ntcir-www-3/queries'],
                       qtype=NtcirQuery,
                       qtype_map=ntcir_map,
                       namespace='ntcir-www',
                       lang='en'), documentation('ntcir-www-3'))

    subsets['b13/trec-misinfo-2019'] = Dataset(
        collection_b13,
        TrecXmlQueries(dlc['trec-misinfo-2019/queries'],
                       qtype=MisinfoQuery,
                       qtype_map=misinfo_map,
                       namespace='trec-misinfo-2019',
                       lang='en'),
        MsinfoQrels(dlc['trec-misinfo-2019/qrels'], MISINFO_QREL_DEFS),
        documentation('trec-misinfo-2019'))

    subsets['b13/clef-ehealth'] = Dataset(
        collection_b13,
        TrecXmlQueries(FixAmp(dlc['clef-ehealth/queries']),
                       qtype=GenericQuery,
                       qtype_map=ehealth_map,
                       namespace='clef-ehealth',
                       lang='en'),
        EhealthQrels(
            [dlc['clef-ehealth/2016.qrels'], dlc['clef-ehealth/2017.qrels']],
            [dlc['clef-ehealth/2016.qtrust'], dlc['clef-ehealth/2017.qtrust']],
            [dlc['clef-ehealth/2016.qunder'], dlc['clef-ehealth/2017.qreads']],
            EHEALTH_QREL_DEFS), documentation('clef-ehealth'))

    subsets['b13/clef-ehealth/cs'] = Dataset(
        collection_b13,
        TrecXmlQueries(FixAmp(dlc['clef-ehealth/queries/cs']),
                       qtype=GenericQuery,
                       qtype_map=ehealth_map,
                       namespace='clef-ehealth',
                       lang='cs'),
        EhealthQrels(
            [dlc['clef-ehealth/2016.qrels'], dlc['clef-ehealth/2017.qrels']],
            [dlc['clef-ehealth/2016.qtrust'], dlc['clef-ehealth/2017.qtrust']],
            [dlc['clef-ehealth/2016.qunder'], dlc['clef-ehealth/2017.qreads']],
            EHEALTH_QREL_DEFS,
            query_id_suffix='-cs'), documentation('clef-ehealth/cs'))

    subsets['b13/clef-ehealth/de'] = Dataset(
        collection_b13,
        TrecXmlQueries(FixAmp(dlc['clef-ehealth/queries/de']),
                       qtype=GenericQuery,
                       qtype_map=ehealth_map,
                       namespace='clef-ehealth',
                       lang='de'),
        EhealthQrels(
            [dlc['clef-ehealth/2016.qrels'], dlc['clef-ehealth/2017.qrels']],
            [dlc['clef-ehealth/2016.qtrust'], dlc['clef-ehealth/2017.qtrust']],
            [dlc['clef-ehealth/2016.qunder'], dlc['clef-ehealth/2017.qreads']],
            EHEALTH_QREL_DEFS,
            query_id_suffix='-de'), documentation('clef-ehealth/de'))

    subsets['b13/clef-ehealth/fr'] = Dataset(
        collection_b13,
        TrecXmlQueries(FixAmp(dlc['clef-ehealth/queries/fr']),
                       qtype=GenericQuery,
                       qtype_map=ehealth_map,
                       namespace='clef-ehealth',
                       lang='fr'),
        EhealthQrels(
            [dlc['clef-ehealth/2016.qrels'], dlc['clef-ehealth/2017.qrels']],
            [dlc['clef-ehealth/2016.qtrust'], dlc['clef-ehealth/2017.qtrust']],
            [dlc['clef-ehealth/2016.qunder'], dlc['clef-ehealth/2017.qreads']],
            EHEALTH_QREL_DEFS,
            query_id_suffix='-fr'), documentation('clef-ehealth/fr'))

    subsets['b13/clef-ehealth/hu'] = Dataset(
        collection_b13,
        TrecXmlQueries(FixAmp(dlc['clef-ehealth/queries/hu']),
                       qtype=GenericQuery,
                       qtype_map=ehealth_map,
                       namespace='clef-ehealth',
                       lang='hu'),
        EhealthQrels(
            [dlc['clef-ehealth/2016.qrels'], dlc['clef-ehealth/2017.qrels']],
            [dlc['clef-ehealth/2016.qtrust'], dlc['clef-ehealth/2017.qtrust']],
            [dlc['clef-ehealth/2016.qunder'], dlc['clef-ehealth/2017.qreads']],
            EHEALTH_QREL_DEFS,
            query_id_suffix='-hu'), documentation('clef-ehealth/hu'))

    subsets['b13/clef-ehealth/pl'] = Dataset(
        collection_b13,
        TrecXmlQueries(FixAmp(dlc['clef-ehealth/queries/pl']),
                       qtype=GenericQuery,
                       qtype_map=ehealth_map,
                       namespace='clef-ehealth',
                       lang='pl'),
        EhealthQrels(
            [dlc['clef-ehealth/2016.qrels'], dlc['clef-ehealth/2017.qrels']],
            [dlc['clef-ehealth/2016.qtrust'], dlc['clef-ehealth/2017.qtrust']],
            [dlc['clef-ehealth/2016.qunder'], dlc['clef-ehealth/2017.qreads']],
            EHEALTH_QREL_DEFS,
            query_id_suffix='-pl'), documentation('clef-ehealth/pl'))

    subsets['b13/clef-ehealth/sv'] = Dataset(
        collection_b13,
        TrecXmlQueries(FixAmp(dlc['clef-ehealth/queries/sv']),
                       qtype=GenericQuery,
                       qtype_map=ehealth_map,
                       namespace='clef-ehealth',
                       lang='sv'),
        EhealthQrels(
            [dlc['clef-ehealth/2016.qrels'], dlc['clef-ehealth/2017.qrels']],
            [dlc['clef-ehealth/2016.qtrust'], dlc['clef-ehealth/2017.qtrust']],
            [dlc['clef-ehealth/2016.qunder'], dlc['clef-ehealth/2017.qreads']],
            EHEALTH_QREL_DEFS,
            query_id_suffix='-sv'), documentation('clef-ehealth/sv'))

    # NOTE: the following datasets are defined in touche.py:
    # - clueweb12/touche-2020-task-2
    # - clueweb12/touche-2021-task-2

    ir_datasets.registry.register(NAME, base)
    for s in sorted(subsets):
        ir_datasets.registry.register(f'{NAME}/{s}', subsets[s])

    return base, subsets
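
Each CLEF eHealth language variant shares the qrels (with query_id_suffix aligning the ids), so the per-language subsets load just like the English one. A small usage sketch (assuming NAME == 'clueweb12'):

import ir_datasets

ds = ir_datasets.load('clueweb12/b13/clef-ehealth/de')  # German queries
for query in ds.queries_iter():
    ...  # GenericQuery(query_id, text)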
Example #7
def read_lines(file):
    file = Cache(TarExtract(main_dlc, f'nfcorpus/raw/{file}'),
                 base_path / file)
    with file.stream() as stream:
        stream = codecs.getreader('utf8')(stream)
        return {l.rstrip() for l in stream}
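
A hypothetical call, mirroring how this helper is used in Example #8 below to build query-id filter sets:

nontopic_ids = read_lines('nontopics.ids')  # set of query ids, one per line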
Example #8
def _init():
    base_path = ir_datasets.util.home_path() / NAME
    dlc = DownloadConfig.context(NAME, base_path)
    documentation = YamlDocumentation(f'docs/{NAME}.yaml')
    main_dlc = dlc['main']

    collection = TsvDocs(Cache(
        TarExtract(main_dlc, 'nfcorpus/raw/doc_dump.txt'),
        base_path / 'collection.tsv'),
                         doc_cls=NfCorpusDoc,
                         namespace=NAME)
    subsets = {}

    def read_lines(file):
        file = Cache(TarExtract(main_dlc, f'nfcorpus/raw/{file}'),
                     base_path / file)
        with file.stream() as stream:
            stream = codecs.getreader('utf8')(stream)
            return {l.rstrip() for l in stream}

    nontopic_qid_filter = Lazy(lambda: read_lines('nontopics.ids'))
    video_qid_filter = Lazy(lambda: read_lines('all_videos.ids'))

    subsets['train'] = Dataset(
        collection,
        ZipQueries([
            TsvQueries(Cache(
                TarExtract(main_dlc, 'nfcorpus/train.titles.queries'),
                base_path / 'train/queries.titles.tsv'),
                       namespace=NAME),
            TsvQueries(Cache(
                TarExtract(main_dlc, 'nfcorpus/train.all.queries'),
                base_path / 'train/queries.all.tsv'),
                       namespace=NAME),
        ], [(0, 0), (0, 1), (1, 1)], NfCorpusQuery),
        TrecQrels(
            Cache(TarExtract(main_dlc, 'nfcorpus/train.3-2-1.qrel'),
                  base_path / 'train/qrels'), QRELS_DEFS),
        documentation('train'),
    )

    subsets['train/nontopic'] = Dataset(
        collection,
        TsvQueries(Cache(
            TarExtract(main_dlc, 'nfcorpus/train.nontopic-titles.queries'),
            base_path / 'train/nontopic/queries.tsv'),
                   namespace=NAME),
        FilteredQrels(subsets['train'].qrels_handler(),
                      nontopic_qid_filter,
                      mode='include'),
        documentation('train/nontopic'),
    )

    subsets['train/video'] = Dataset(
        collection,
        ZipQueries([
            TsvQueries(Cache(
                TarExtract(main_dlc, 'nfcorpus/train.vid-titles.queries'),
                base_path / 'train/video/queries.titles.tsv'),
                       namespace=NAME),
            TsvQueries(Cache(
                TarExtract(main_dlc, 'nfcorpus/train.vid-desc.queries'),
                base_path / 'train/video/queries.desc.tsv'),
                       namespace=NAME),
        ], [(0, 0), (0, 1), (1, 1)], NfCorpusVideoQuery),
        TsvQueries(Cache(
            TarExtract(main_dlc, 'nfcorpus/train.nontopic-titles.queries'),
            base_path / 'train/video/queries.tsv'),
                   NfCorpusVideoQuery,
                   namespace=NAME),
        FilteredQrels(subsets['train'].qrels_handler(),
                      video_qid_filter,
                      mode='include'),
        documentation('train/video'),
    )

    subsets['dev'] = Dataset(
        collection,
        ZipQueries([
            TsvQueries(Cache(
                TarExtract(main_dlc, 'nfcorpus/dev.titles.queries'),
                base_path / 'dev/queries.titles.tsv'),
                       namespace=NAME),
            TsvQueries(Cache(TarExtract(main_dlc, 'nfcorpus/dev.all.queries'),
                             base_path / 'dev/queries.all.tsv'),
                       namespace=NAME),
        ], [(0, 0), (0, 1), (1, 1)], NfCorpusQuery),
        TrecQrels(
            Cache(TarExtract(main_dlc, 'nfcorpus/dev.3-2-1.qrel'),
                  base_path / 'dev/qrels'), QRELS_DEFS),
        documentation('dev'),
    )

    subsets['dev/nontopic'] = Dataset(
        collection,
        TsvQueries(Cache(
            TarExtract(main_dlc, 'nfcorpus/dev.nontopic-titles.queries'),
            base_path / 'dev/nontopic/queries.tsv'),
                   namespace=NAME),
        FilteredQrels(subsets['dev'].qrels_handler(),
                      nontopic_qid_filter,
                      mode='include'),
        documentation('dev/nontopic'),
    )

    subsets['dev/video'] = Dataset(
        collection,
        ZipQueries([
            TsvQueries(Cache(
                TarExtract(main_dlc, 'nfcorpus/dev.vid-titles.queries'),
                base_path / 'dev/video/queries.titles.tsv'),
                       namespace=NAME),
            TsvQueries(Cache(
                TarExtract(main_dlc, 'nfcorpus/dev.vid-desc.queries'),
                base_path / 'dev/video/queries.desc.tsv'),
                       namespace=NAME),
        ], [(0, 0), (0, 1), (1, 1)], NfCorpusVideoQuery),
        TsvQueries(Cache(
            TarExtract(main_dlc, 'nfcorpus/dev.nontopic-titles.queries'),
            base_path / 'dev/video/queries.tsv'),
                   NfCorpusVideoQuery,
                   namespace=NAME),
        FilteredQrels(subsets['dev'].qrels_handler(),
                      video_qid_filter,
                      mode='include'),
        documentation('dev/video'),
    )

    subsets['test'] = Dataset(
        collection,
        ZipQueries([
            TsvQueries(Cache(
                TarExtract(main_dlc, 'nfcorpus/test.titles.queries'),
                base_path / 'test/queries.titles.tsv'),
                       namespace=NAME),
            TsvQueries(Cache(TarExtract(main_dlc, 'nfcorpus/test.all.queries'),
                             base_path / 'test/queries.all.tsv'),
                       namespace=NAME),
        ], [(0, 0), (0, 1), (1, 1)], NfCorpusQuery),
        TrecQrels(
            Cache(TarExtract(main_dlc, 'nfcorpus/test.3-2-1.qrel'),
                  base_path / 'test/qrels'), QRELS_DEFS),
        documentation('test'),
    )

    subsets['test/nontopic'] = Dataset(
        collection,
        TsvQueries(Cache(
            TarExtract(main_dlc, 'nfcorpus/test.nontopic-titles.queries'),
            base_path / 'test/nontopic/queries.tsv'),
                   namespace=NAME),
        FilteredQrels(subsets['test'].qrels_handler(),
                      nontopic_qid_filter,
                      mode='include'),
        documentation('test/nontopic'),
    )

    subsets['test/video'] = Dataset(
        collection,
        ZipQueries([
            TsvQueries(Cache(
                TarExtract(main_dlc, 'nfcorpus/test.vid-titles.queries'),
                base_path / 'test/video/queries.titles.tsv'),
                       namespace=NAME),
            TsvQueries(Cache(
                TarExtract(main_dlc, 'nfcorpus/test.vid-desc.queries'),
                base_path / 'test/video/queries.desc.tsv'),
                       namespace=NAME),
        ], [(0, 0), (0, 1), (1, 1)], NfCorpusVideoQuery),
        TsvQueries(Cache(
            TarExtract(main_dlc, 'nfcorpus/test.nontopic-titles.queries'),
            base_path / 'test/video/queries.tsv'),
                   NfCorpusVideoQuery,
                   namespace=NAME),
        FilteredQrels(subsets['test'].qrels_handler(),
                      video_qid_filter,
                      mode='include'),
        documentation('test/video'),
    )

    ir_datasets.registry.register(NAME, Dataset(collection,
                                                documentation('_')))
    for s in sorted(subsets):
        ir_datasets.registry.register(f'{NAME}/{s}', subsets[s])

    return collection, subsets
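
The [(0, 0), (0, 1), (1, 1)] spec passed to ZipQueries presumably selects (source, field) pairs from the two parallel query files: the id from the titles file, the title text, and the full text from the all-queries file. A toy sketch under that assumption:

def zip_title_and_all(titles, alls):
    # titles/alls: parallel iterables of (query_id, text) rows from the two TSVs
    for (qid, title), (qid2, all_text) in zip(titles, alls):
        assert qid == qid2, 'files are assumed to be aligned by query id'
        yield (qid, title, all_text)  # roughly NfCorpusQuery(query_id, title, all)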
Example #9
        return 'trec-robust04'

    def docs_lang(self):
        return 'en'


DL_ANSERINI_ROBUST04 = ir_datasets.util.Download(
    [
        ir_datasets.util.RequestsDownload(
            'https://git.uwaterloo.ca/jimmylin/anserini-indexes/raw/master/index-robust04-20191213.tar.gz'
        )
    ],
    expected_md5='15f3d001489c97849a010b0a4734d018')

DL_ANSERINI_ROBUST04 = Cache(
    TarExtract(DL_ANSERINI_ROBUST04, 'index-robust04-20191213/_h.fdt'),
    base_path / 'lucene_source.fdt')

collection = AnseriniRobustDocs(DL_ANSERINI_ROBUST04)

for ds_name in [
        'trec-robust04', 'trec-robust04/fold1', 'trec-robust04/fold2',
        'trec-robust04/fold3', 'trec-robust04/fold4', 'trec-robust04/fold5'
]:
    main_ds = ir_datasets.load(ds_name)
    dataset = ir_datasets.Dataset(
        collection,
        main_ds.queries_handler(),
        main_ds.qrels_handler(),
    )
Example #10
def _init():
    subsets = {}
    base_path = ir_datasets.util.home_path() / NAME
    dlc = DownloadConfig.context(NAME, base_path)
    documentation = YamlDocumentation(f'docs/{NAME}.yaml')

    collection = TrecDocs(dlc['benchmark'],
                          parser='tut',
                          path_globs=['**/docs_grp_*.txt'],
                          namespace=NAME,
                          lang='en',
                          count_hint=ir_datasets.util.count_hint(NAME))
    topics_and_qrels = TarExtractAll(
        dlc['benchmark'],
        base_path / "topics_and_qrels",
        path_globs=['**/topics.*.txt', '**/qrels.*.txt'])
    val_runs = TarExtractAll(dlc['dlfiles'],
                             base_path / "val_runs",
                             path_globs=['**/run.trip.BM25.*.val.txt'])
    test_runs = TarExtractAll(dlc['dlfiles_runs_test'],
                              base_path / "test_runs",
                              path_globs=['**/run.trip.BM25.*.test.txt'])

    base = Dataset(collection, documentation('_'))

    subsets['logs'] = Dataset(
        TsvDocs(Cache(
            FixAllarticles(TarExtract(dlc['logs'], 'logs/allarticles.txt')),
            base_path / 'allarticles-fixed.tsv'),
                doc_cls=TripClickPartialDoc,
                lang='en',
                count_hint=ir_datasets.util.count_hint(f'{NAME}/logs')),
        TripClickQlogs(
            TarExtractAll(dlc['logs'],
                          base_path / 'logs',
                          path_globs=['**/*.json'])), documentation('logs'))

    ### Train

    subsets['train/head'] = Dataset(
        collection,
        TrecQueries(RelativePath(topics_and_qrels,
                                 'benchmark/topics/topics.head.train.txt'),
                    qtype=GenericQuery,
                    qtype_map=QTYPE_MAP,
                    namespace=NAME,
                    lang='en'),
        TrecQrels(
            RelativePath(topics_and_qrels,
                         'benchmark/qrels/qrels.raw.head.train.txt'),
            QREL_DEFS), documentation('train/head'))

    subsets['train/head/dctr'] = Dataset(
        TrecQrels(
            RelativePath(topics_and_qrels,
                         'benchmark/qrels/qrels.dctr.head.train.txt'),
            QREL_DCTR_DEFS), subsets['train/head'],
        documentation('train/head/dctr'))

    subsets['train/torso'] = Dataset(
        collection,
        TrecQueries(RelativePath(topics_and_qrels,
                                 'benchmark/topics/topics.torso.train.txt'),
                    qtype=GenericQuery,
                    qtype_map=QTYPE_MAP,
                    namespace=NAME,
                    lang='en'),
        TrecQrels(
            RelativePath(topics_and_qrels,
                         'benchmark/qrels/qrels.raw.torso.train.txt'),
            QREL_DEFS), documentation('train/torso'))

    subsets['train/tail'] = Dataset(
        collection,
        TrecQueries(RelativePath(topics_and_qrels,
                                 'benchmark/topics/topics.tail.train.txt'),
                    qtype=GenericQuery,
                    qtype_map=QTYPE_MAP,
                    namespace=NAME,
                    lang='en'),
        TrecQrels(
            RelativePath(topics_and_qrels,
                         'benchmark/qrels/qrels.raw.tail.train.txt'),
            QREL_DEFS), documentation('train/tail'))

    train_queries = ConcatQueries([
        subsets['train/head'].queries_handler(),
        subsets['train/torso'].queries_handler(),
        subsets['train/tail'].queries_handler(),
    ])
    train_docpairs = DocPairGenerator(
        TarExtract(dlc['dlfiles'], 'dlfiles/triples.train.tsv'), collection,
        train_queries, base_path / 'train.docpairs')
    subsets['train'] = Dataset(
        collection, train_queries,
        ConcatQrels([
            subsets['train/head'].qrels_handler(),
            subsets['train/torso'].qrels_handler(),
            subsets['train/tail'].qrels_handler(),
        ]), TsvDocPairs(train_docpairs), documentation('train'))
    subsets['train/hofstaetter-triples'] = Dataset(
        collection, train_queries, subsets['train'].qrels_handler(),
        TsvDocPairs(dlc['hofstaetter-triples']),
        documentation('train/hofstaetter-triples'))

    ### Val

    subsets['val/head'] = Dataset(
        collection,
        TrecQueries(RelativePath(topics_and_qrels,
                                 'benchmark/topics/topics.head.val.txt'),
                    qtype=GenericQuery,
                    qtype_map=QTYPE_MAP,
                    namespace=NAME,
                    lang='en'),
        TrecQrels(
            RelativePath(topics_and_qrels,
                         'benchmark/qrels/qrels.raw.head.val.txt'), QREL_DEFS),
        TrecScoredDocs(
            RelativePath(val_runs, 'dlfiles/run.trip.BM25.head.val.txt')),
        documentation('val/head'))

    subsets['val/head/dctr'] = Dataset(
        TrecQrels(
            RelativePath(topics_and_qrels,
                         'benchmark/qrels/qrels.dctr.head.val.txt'),
            QREL_DCTR_DEFS), subsets['val/head'],
        documentation('val/head/dctr'))

    subsets['val/torso'] = Dataset(
        collection,
        TrecQueries(RelativePath(topics_and_qrels,
                                 'benchmark/topics/topics.torso.val.txt'),
                    qtype=GenericQuery,
                    qtype_map=QTYPE_MAP,
                    namespace=NAME,
                    lang='en'),
        TrecQrels(
            RelativePath(topics_and_qrels,
                         'benchmark/qrels/qrels.raw.torso.val.txt'),
            QREL_DEFS),
        TrecScoredDocs(
            RelativePath(val_runs, 'dlfiles/run.trip.BM25.torso.val.txt')),
        documentation('val/torso'))

    subsets['val/tail'] = Dataset(
        collection,
        TrecQueries(RelativePath(topics_and_qrels,
                                 'benchmark/topics/topics.tail.val.txt'),
                    qtype=GenericQuery,
                    qtype_map=QTYPE_MAP,
                    namespace=NAME,
                    lang='en'),
        TrecQrels(
            RelativePath(topics_and_qrels,
                         'benchmark/qrels/qrels.raw.tail.val.txt'), QREL_DEFS),
        TrecScoredDocs(
            RelativePath(val_runs, 'dlfiles/run.trip.BM25.tail.val.txt')),
        documentation('val/tail'))

    subsets['val'] = Dataset(
        collection,
        ConcatQueries([
            subsets['val/head'].queries_handler(),
            subsets['val/torso'].queries_handler(),
            subsets['val/tail'].queries_handler(),
        ]),
        ConcatQrels([
            subsets['val/head'].qrels_handler(),
            subsets['val/torso'].qrels_handler(),
            subsets['val/tail'].qrels_handler(),
        ]),
        ConcatScoreddocs([
            subsets['val/head'].scoreddocs_handler(),
            subsets['val/torso'].scoreddocs_handler(),
            subsets['val/tail'].scoreddocs_handler(),
        ]), documentation('val'))

    ### Test

    subsets['test/head'] = Dataset(
        collection,
        TrecQueries(RelativePath(topics_and_qrels,
                                 'benchmark/topics/topics.head.test.txt'),
                    qtype=GenericQuery,
                    qtype_map=QTYPE_MAP,
                    namespace=NAME,
                    lang='en'),
        TrecScoredDocs(
            RelativePath(test_runs, 'runs_test/run.trip.BM25.head.test.txt')),
        documentation('test/head'))

    subsets['test/torso'] = Dataset(
        collection,
        TrecQueries(RelativePath(topics_and_qrels,
                                 'benchmark/topics/topics.torso.test.txt'),
                    qtype=GenericQuery,
                    qtype_map=QTYPE_MAP,
                    namespace=NAME,
                    lang='en'),
        TrecScoredDocs(
            RelativePath(test_runs, 'runs_test/run.trip.BM25.torso.test.txt')),
        documentation('test/torso'))

    subsets['test/tail'] = Dataset(
        collection,
        TrecQueries(RelativePath(topics_and_qrels,
                                 'benchmark/topics/topics.tail.test.txt'),
                    qtype=GenericQuery,
                    qtype_map=QTYPE_MAP,
                    namespace=NAME,
                    lang='en'),
        TrecScoredDocs(
            RelativePath(test_runs, 'runs_test/run.trip.BM25.tail.test.txt')),
        documentation('test/tail'))

    subsets['test'] = Dataset(
        collection,
        ConcatQueries([
            subsets['test/head'].queries_handler(),
            subsets['test/torso'].queries_handler(),
            subsets['test/tail'].queries_handler(),
        ]),
        ConcatScoreddocs([
            subsets['test/head'].scoreddocs_handler(),
            subsets['test/torso'].scoreddocs_handler(),
            subsets['test/tail'].scoreddocs_handler(),
        ]), documentation('test'))

    ir_datasets.registry.register(NAME, base)
    for s in sorted(subsets):
        ir_datasets.registry.register(f'{NAME}/{s}', subsets[s])

    return base, subsets
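
ConcatQueries/ConcatQrels/ConcatScoreddocs above stitch the head, torso, and tail splits back into single streams. A toy sketch of the concatenation idea (assuming handlers expose *_iter() methods, as the library's datasets do):

import itertools

def concat_queries(handlers):
    # yield queries from each split's handler, in order
    return itertools.chain.from_iterable(h.queries_iter() for h in handlers)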
Example #11
def _init():
    base_path = ir_datasets.util.home_path() / NAME
    documentation = YamlDocumentation(f'docs/{NAME}.yaml')
    dlc = DownloadConfig.context(NAME, base_path, dua=DUA)
    subsets = {}
    collection = MsMarcoV2Docs(dlc['docs'])

    subsets['train'] = Dataset(
        collection,
        TsvQueries(dlc['train_queries'], namespace='msmarco', lang='en'),
        TrecQrels(dlc['train_qrels'], QRELS_DEFS),
        TrecScoredDocs(GzipExtract(dlc['train_scoreddocs'])),
    )
    subsets['dev1'] = Dataset(
        collection,
        TsvQueries(dlc['dev1_queries'], namespace='msmarco', lang='en'),
        TrecQrels(dlc['dev1_qrels'], QRELS_DEFS),
        TrecScoredDocs(GzipExtract(dlc['dev1_scoreddocs'])),
    )
    subsets['dev2'] = Dataset(
        collection,
        TsvQueries(dlc['dev2_queries'], namespace='msmarco', lang='en'),
        TrecQrels(dlc['dev2_qrels'], QRELS_DEFS),
        TrecScoredDocs(GzipExtract(dlc['dev2_scoreddocs'])),
    )
    subsets['trec-dl-2019'] = Dataset(
        collection,
        TsvQueries(GzipExtract(dlc['trec-dl-2019/queries']),
                   namespace='msmarco',
                   lang='en'),
        TrecQrels(GzipExtract(dlc['trec_dl_2019_qrels']), TREC_DL_QRELS_DEFS),
    )
    subsets['trec-dl-2020'] = Dataset(
        collection,
        TsvQueries(GzipExtract(dlc['trec-dl-2020/queries']),
                   namespace='msmarco',
                   lang='en'),
        TrecQrels(GzipExtract(dlc['trec_dl_2020_qrels']), TREC_DL_QRELS_DEFS),
    )
    dl19_v2_judged = Lazy(
        lambda: {q.query_id
                 for q in subsets['trec-dl-2019'].qrels_iter()})
    subsets['trec-dl-2019/judged'] = Dataset(
        FilteredQueries(subsets['trec-dl-2019'].queries_handler(),
                        dl19_v2_judged),
        subsets['trec-dl-2019'],
    )
    dl20_v2_judged = Lazy(
        lambda: {q.query_id
                 for q in subsets['trec-dl-2020'].qrels_iter()})
    subsets['trec-dl-2020/judged'] = Dataset(
        FilteredQueries(subsets['trec-dl-2020'].queries_handler(),
                        dl20_v2_judged),
        subsets['trec-dl-2020'],
    )
    subsets['trec-dl-2021'] = Dataset(
        collection,
        TsvQueries(dlc['trec-dl-2021/queries'], namespace='msmarco',
                   lang='en'),
        TrecQrels(dlc['trec-dl-2021/qrels'], TREC_DL_QRELS_DEFS),
        TrecScoredDocs(GzipExtract(dlc['trec-dl-2021/scoreddocs'])),
    )
    dl21_judged = Lazy(
        lambda: {q.query_id
                 for q in subsets['trec-dl-2021'].qrels_iter()})
    subsets['trec-dl-2021/judged'] = Dataset(
        FilteredQueries(subsets['trec-dl-2021'].queries_handler(),
                        dl21_judged),
        FilteredScoredDocs(subsets['trec-dl-2021'].scoreddocs_handler(),
                           dl21_judged),
        subsets['trec-dl-2021'],
    )

    subsets['anchor-text'] = Dataset(
        MsMarcoV2AnchorTextDocs(Cache(GzipExtract(dlc['anchor-text']),
                                      base_path / "anchor-text.json"),
                                count_hint=4821244),
        documentation('anchor-text'))

    ir_datasets.registry.register(NAME, Dataset(collection,
                                                documentation("_")))
    for s in sorted(subsets):
        ir_datasets.registry.register(f'{NAME}/{s}',
                                      Dataset(subsets[s], documentation(s)))

    return collection, subsets
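
A small usage sketch of the judged variants (assuming NAME == 'msmarco-document-v2'); the /judged dataset should report fewer queries than the full one, since unjudged queries are filtered out:

import ir_datasets

full = ir_datasets.load('msmarco-document-v2/trec-dl-2019')
judged = ir_datasets.load('msmarco-document-v2/trec-dl-2019/judged')
print(full.queries_count(), judged.queries_count())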
Example #12
def cached_zip_download(name: str, zip_path: str, extension: str) -> Cache:
    return Cache(ZipExtract(download_config[name], zip_path),
                 base_path / f"{name}.{extension}")
Example #13
def cached_download(name: str, extension: str) -> Cache:
    return Cache(download_config[name], base_path / f"{name}.{extension}")
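
Hypothetical calls to the two helpers above (the names, archive paths, and extensions are made up for illustration):

queries_file = cached_download('queries', 'tsv')
corpus_file = cached_zip_download('corpus', 'corpus/data.json', 'json')
with queries_file.stream() as f:  # Cache objects expose stream(), as in Example #7
    data = f.read()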
Example #14
def _init():
    base_path = ir_datasets.util.home_path()/NAME
    documentation = YamlDocumentation(f'docs/{NAME}.yaml')
    dlc = DownloadConfig.context(NAME, base_path, dua=DUA)
    subsets = {}
    collection = MsMarcoTrecDocs(GzipExtract(dlc['docs']))

    subsets['train'] = Dataset(
        collection,
        TsvQueries(GzipExtract(dlc['train/queries']), namespace='msmarco', lang='en'),
        TrecQrels(GzipExtract(dlc['train/qrels']), QRELS_DEFS),
        TrecScoredDocs(GzipExtract(dlc['train/scoreddocs'])),
    )

    subsets['dev'] = Dataset(
        collection,
        TsvQueries(GzipExtract(dlc['dev/queries']), namespace='msmarco', lang='en'),
        TrecQrels(GzipExtract(dlc['dev/qrels']), QRELS_DEFS),
        TrecScoredDocs(GzipExtract(dlc['dev/scoreddocs'])),
    )

    subsets['eval'] = Dataset(
        collection,
        TsvQueries(GzipExtract(dlc['eval/queries']), namespace='msmarco', lang='en'),
        TrecScoredDocs(GzipExtract(dlc['eval/scoreddocs'])),
    )

    subsets['trec-dl-2019'] = Dataset(
        collection,
        TsvQueries(GzipExtract(dlc['trec-dl-2019/queries']), namespace='msmarco', lang='en'),
        TrecQrels(dlc['trec-dl-2019/qrels'], TREC_DL_QRELS_DEFS),
        TrecScoredDocs(GzipExtract(dlc['trec-dl-2019/scoreddocs'])),
    )

    subsets['trec-dl-2020'] = Dataset(
        collection,
        TsvQueries(GzipExtract(dlc['trec-dl-2020/queries']), namespace='msmarco', lang='en'),
        TrecQrels(dlc['trec-dl-2020/qrels'], TREC_DL_QRELS_DEFS),
        TrecScoredDocs(GzipExtract(dlc['trec-dl-2020/scoreddocs'])),
    )

    subsets['orcas'] = Dataset(
        collection,
        TsvQueries(GzipExtract(dlc['orcas/queries']), namespace='orcas', lang='en'),
        TrecQrels(GzipExtract(dlc['orcas/qrels']), ORCAS_QLRES_DEFS),
        TrecScoredDocs(GzipExtract(dlc['orcas/scoreddocs'])),
    )

    dl19_judged = Lazy(lambda: {q.query_id for q in subsets['trec-dl-2019'].qrels_iter()})
    subsets['trec-dl-2019/judged'] = Dataset(
        FilteredQueries(subsets['trec-dl-2019'].queries_handler(), dl19_judged),
        FilteredScoredDocs(subsets['trec-dl-2019'].scoreddocs_handler(), dl19_judged),
        subsets['trec-dl-2019'],
    )

    dl20_judged = Lazy(lambda: {q.query_id for q in subsets['trec-dl-2020'].qrels_iter()})
    subsets['trec-dl-2020/judged'] = Dataset(
        FilteredQueries(subsets['trec-dl-2020'].queries_handler(), dl20_judged),
        FilteredScoredDocs(subsets['trec-dl-2020'].scoreddocs_handler(), dl20_judged),
        subsets['trec-dl-2020'],
    )

    # DL-Hard
    dl_hard_qrels_migrator = Migrator(base_path/'trec-dl-hard'/'irds_version.txt', 'v2',
        affected_files=[base_path/'trec-dl-hard'/'qrels'],
        message='Updating trec-dl-hard qrels')
    hard_qids = Lazy(lambda: DL_HARD_QIDS)
    dl_hard_base_queries = TsvQueries([
            Cache(GzipExtract(dlc['trec-dl-2019/queries']), base_path/'trec-dl-2019/queries.tsv'),
            Cache(GzipExtract(dlc['trec-dl-2020/queries']), base_path/'trec-dl-2020/queries.tsv')], namespace='msmarco', lang='en')
    subsets['trec-dl-hard'] = Dataset(
        collection,
        FilteredQueries(dl_hard_base_queries, hard_qids),
        dl_hard_qrels_migrator(TrecQrels(dlc['trec-dl-hard/qrels'], TREC_DL_QRELS_DEFS)),
        documentation('trec-dl-hard')
    )
    hard_qids = Lazy(lambda: DL_HARD_QIDS_BYFOLD['1'])
    subsets['trec-dl-hard/fold1'] = Dataset(
        collection,
        FilteredQueries(dl_hard_base_queries, hard_qids),
        FilteredQrels(subsets['trec-dl-hard'], hard_qids),
        documentation('trec-dl-hard/fold1')
    )
    hard_qids = Lazy(lambda: DL_HARD_QIDS_BYFOLD['2'])
    subsets['trec-dl-hard/fold2'] = Dataset(
        collection,
        FilteredQueries(dl_hard_base_queries, hard_qids),
        FilteredQrels(subsets['trec-dl-hard'], hard_qids),
        documentation('trec-dl-hard/fold2')
    )
    hard_qids = Lazy(lambda: DL_HARD_QIDS_BYFOLD['3'])
    subsets['trec-dl-hard/fold3'] = Dataset(
        collection,
        FilteredQueries(dl_hard_base_queries, hard_qids),
        FilteredQrels(subsets['trec-dl-hard'], hard_qids),
        documentation('trec-dl-hard/fold3')
    )
    hard_qids = Lazy(lambda: DL_HARD_QIDS_BYFOLD['4'])
    subsets['trec-dl-hard/fold4'] = Dataset(
        collection,
        FilteredQueries(dl_hard_base_queries, hard_qids),
        FilteredQrels(subsets['trec-dl-hard'], hard_qids),
        documentation('trec-dl-hard/fold4')
    )
    hard_qids = Lazy(lambda: DL_HARD_QIDS_BYFOLD['5'])
    subsets['trec-dl-hard/fold5'] = Dataset(
        collection,
        FilteredQueries(dl_hard_base_queries, hard_qids),
        FilteredQrels(subsets['trec-dl-hard'], hard_qids),
        documentation('trec-dl-hard/fold5')
    )
    
    subsets['anchor-text'] = Dataset(
        MsMarcoAnchorTextDocs(
            Cache(GzipExtract(dlc['anchor-text']), base_path / "anchor-text.json"),
            count_hint=1703834
        ),
        documentation('anchor-text')
    )

    ir_datasets.registry.register(NAME, Dataset(collection, documentation("_")))
    for s in sorted(subsets):
        ir_datasets.registry.register(f'{NAME}/{s}', Dataset(subsets[s], documentation(s)))

    return collection, subsets
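
The train subset above exposes docpairs for training. A brief usage sketch (assuming NAME == 'msmarco-document'; docpairs_iter() yields (query_id, doc_id_a, doc_id_b)-style tuples):

import itertools
import ir_datasets

train = ir_datasets.load('msmarco-document/train')
for pair in itertools.islice(train.docpairs_iter(), 3):
    print(pair.query_id, pair.doc_id_a, pair.doc_id_b)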
Example #15
def _init():
    documentation = YamlDocumentation('docs/msmarco-passage.yaml')
    base_path = ir_datasets.util.home_path() / 'msmarco-passage'
    dlc = DownloadConfig.context('msmarco-passage', base_path, dua=DUA)
    collection = TsvDocs(Cache(
        FixEncoding(TarExtract(dlc['collectionandqueries'], 'collection.tsv')),
        base_path / 'collection.tsv'),
                         namespace='msmarco')
    subsets = {}

    subsets['train'] = Dataset(
        collection,
        TsvQueries(Cache(TarExtract(dlc['queries'], 'queries.train.tsv'),
                         base_path / 'train/queries.tsv'),
                   namespace='msmarco'),
        TrecQrels(dlc['train/qrels'], QRELS_DEFS),
        TsvDocPairs(GzipExtract(dlc['train/docpairs'])),
        TrecScoredDocs(
            Cache(
                ExtractQidPid(
                    TarExtract(dlc['train/scoreddocs'], 'top1000.train.txt')),
                base_path / 'train/ms.run')),
    )

    subsets['dev'] = Dataset(
        collection,
        TsvQueries(Cache(TarExtract(dlc['queries'], 'queries.dev.tsv'),
                         base_path / 'dev/queries.tsv'),
                   namespace='msmarco'),
        TrecQrels(dlc['dev/qrels'], QRELS_DEFS),
        TrecScoredDocs(
            Cache(
                ExtractQidPid(TarExtract(dlc['dev/scoreddocs'],
                                         'top1000.dev')),
                base_path / 'dev/ms.run')),
    )

    subsets['dev/small'] = Dataset(
        collection,
        TsvQueries(Cache(
            TarExtract(dlc['collectionandqueries'], 'queries.dev.small.tsv'),
            base_path / 'dev/small/queries.tsv'),
                   namespace='msmarco'),
        TrecQrels(
            Cache(
                TarExtract(dlc['collectionandqueries'], 'qrels.dev.small.tsv'),
                base_path / 'dev/small/qrels'), QRELS_DEFS),
    )

    subsets['eval'] = Dataset(
        collection,
        TsvQueries(Cache(TarExtract(dlc['queries'], 'queries.eval.tsv'),
                         base_path / 'eval/queries.tsv'),
                   namespace='msmarco'),
        TrecScoredDocs(
            Cache(
                ExtractQidPid(
                    TarExtract(dlc['eval/scoreddocs'], 'top1000.eval')),
                base_path / 'eval/ms.run')),
    )

    subsets['eval/small'] = Dataset(
        collection,
        TsvQueries(Cache(
            TarExtract(dlc['collectionandqueries'], 'queries.eval.small.tsv'),
            base_path / 'eval/small/queries.tsv'),
                   namespace='msmarco'),
    )

    subsets['trec-dl-2019'] = Dataset(
        collection,
        TrecQrels(dlc['trec-dl-2019/qrels'], TREC_DL_QRELS_DEFS),
        TsvQueries(Cache(GzipExtract(dlc['trec-dl-2019/queries']),
                         base_path / 'trec-dl-2019/queries.tsv'),
                   namespace='msmarco'),
        TrecScoredDocs(
            Cache(ExtractQidPid(GzipExtract(dlc['trec-dl-2019/scoreddocs'])),
                  base_path / 'trec-dl-2019/ms.run')),
    )

    subsets['trec-dl-2020'] = Dataset(
        collection,
        TsvQueries(GzipExtract(dlc['trec-dl-2020/queries']),
                   namespace='msmarco'),
        TrecScoredDocs(
            Cache(ExtractQidPid(GzipExtract(dlc['trec-dl-2020/scoreddocs'])),
                  base_path / 'trec-dl-2020/ms.run')),
    )

    # A few subsets that are constrained to just the queries/qrels/docpairs
    # that have at least 1 relevance assessment
    train_judged = Lazy(
        lambda: {q.query_id
                 for q in subsets['train'].qrels_iter()})
    subsets['train/judged'] = Dataset(
        FilteredQueries(subsets['train'].queries_handler(), train_judged),
        FilteredScoredDocs(subsets['train'].scoreddocs_handler(),
                           train_judged),
        subsets['train'],
    )

    dev_judged = Lazy(
        lambda: {q.query_id
                 for q in subsets['dev'].qrels_iter()})
    subsets['dev/judged'] = Dataset(
        FilteredQueries(subsets['dev'].queries_handler(), dev_judged),
        FilteredScoredDocs(subsets['dev'].scoreddocs_handler(), dev_judged),
        subsets['dev'],
    )

    dl19_judged = Lazy(
        lambda: {q.query_id
                 for q in subsets['trec-dl-2019'].qrels_iter()})
    subsets['trec-dl-2019/judged'] = Dataset(
        FilteredQueries(subsets['trec-dl-2019'].queries_handler(),
                        dl19_judged),
        FilteredScoredDocs(subsets['trec-dl-2019'].scoreddocs_handler(),
                           dl19_judged),
        subsets['trec-dl-2019'],
    )

    # split200 -- 200 queries held out from the training data for validation
    split200 = Lazy(lambda: SPLIT200_QIDS)
    subsets['train/split200-train'] = Dataset(
        FilteredQueries(subsets['train'].queries_handler(),
                        split200,
                        mode='exclude'),
        FilteredScoredDocs(subsets['train'].scoreddocs_handler(),
                           split200,
                           mode='exclude'),
        FilteredQrels(subsets['train'].qrels_handler(),
                      split200,
                      mode='exclude'),
        FilteredDocPairs(subsets['train'].docpairs_handler(),
                         split200,
                         mode='exclude'),
        subsets['train'],
    )
    subsets['train/split200-valid'] = Dataset(
        FilteredQueries(subsets['train'].queries_handler(),
                        split200,
                        mode='include'),
        FilteredScoredDocs(subsets['train'].scoreddocs_handler(),
                           split200,
                           mode='include'),
        FilteredQrels(subsets['train'].qrels_handler(),
                      split200,
                      mode='include'),
        FilteredDocPairs(subsets['train'].docpairs_handler(),
                         split200,
                         mode='include'),
        subsets['train'],
    )

    # Medical subset
    def train_med():
        with dlc['medmarco_ids'].stream() as stream:
            stream = codecs.getreader('utf8')(stream)
            return {l.rstrip() for l in stream}

    train_med = Lazy(train_med)
    subsets['train/medical'] = Dataset(
        FilteredQueries(subsets['train'].queries_handler(), train_med),
        FilteredScoredDocs(subsets['train'].scoreddocs_handler(), train_med),
        FilteredDocPairs(subsets['train'].docpairs_handler(), train_med),
        FilteredQrels(subsets['train'].qrels_handler(), train_med),
        subsets['train'],
    )

    ir_datasets.registry.register('msmarco-passage',
                                  Dataset(collection, documentation('_')))
    for s in sorted(subsets):
        ir_datasets.registry.register(f'msmarco-passage/{s}',
                                      Dataset(subsets[s], documentation(s)))

    return collection, subsets
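
The FilteredQueries/FilteredQrels/FilteredDocPairs wrappers with mode='include'/'exclude' implement the split200 holdout above. A toy sketch of the filtering idea (an assumption, not the library's code):

def filter_queries(queries, qid_set, mode='include'):
    if mode == 'include':
        return (q for q in queries if q.query_id in qid_set)
    return (q for q in queries if q.query_id not in qid_set)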
Example #16
def _init():
    base_path = ir_datasets.util.home_path() / NAME
    dlc = ir_datasets.util.DownloadConfig.context(NAME, base_path)
    documentation = YamlDocumentation(f'docs/{NAME}.yaml')

    base = Dataset(documentation('_'))

    subsets = {}

    benchmarks = {
        'msmarco': (['train', 'dev', 'test'], GenericDoc, GenericQuery),
        'trec-covid': (['test'], BeirCordDoc, BeirCovidQuery),
        'nfcorpus': (['train', 'dev', 'test'], BeirTitleUrlDoc, BeirUrlQuery),
        'nq': (['test'], BeirTitleDoc, GenericQuery),
        'hotpotqa': (['train', 'dev', 'test'], BeirTitleUrlDoc, GenericQuery),
        'fiqa': (['train', 'dev', 'test'], GenericDoc, GenericQuery),
        'arguana': (['test'], BeirTitleDoc, GenericQuery),
        'webis-touche2020': (['test'], BeirToucheDoc, BeirToucheQuery),
        'webis-touche2020/v2': (['test'], BeirToucheDoc, BeirToucheQuery),
        'quora': (['dev', 'test'], GenericDoc, GenericQuery),
        'dbpedia-entity': (['dev', 'test'], BeirTitleUrlDoc, GenericQuery),
        'scidocs': (['test'], BeirSciDoc, BeirSciQuery),
        'fever': (['train', 'dev', 'test'], BeirTitleDoc, GenericQuery),
        'climate-fever': (['test'], BeirTitleDoc, GenericQuery),
        'scifact': (['train', 'test'], BeirTitleDoc, GenericQuery),
    }

    for ds, (qrels, doc_type, query_type) in benchmarks.items():
        dlc_ds = dlc[ds]
        ds_zip = ds.split('/')[0]
        docs_migrator = Migrator(
            base_path / ds / 'irds_version.txt',
            'v2',
            affected_files=[f'{base_path/ds}/docs.pklz4'],
            message=f'Migrating {NAME}/{ds} (structuring fields)')
        docs = docs_migrator(
            BeirDocs(ds, ZipExtract(dlc_ds, f'{ds_zip}/corpus.jsonl'),
                     doc_type))
        queries = BeirQueries(
            ds,
            Cache(ZipExtract(dlc_ds, f'{ds_zip}/queries.jsonl'),
                  base_path / ds / 'queries.json'), query_type)
        if len(qrels) == 1:
            subsets[ds] = Dataset(
                docs, queries,
                BeirQrels(Cache(
                    ZipExtract(dlc_ds, f'{ds_zip}/qrels/{qrels[0]}.tsv'),
                    base_path / ds / f'{qrels[0]}.qrels'),
                          qrels_defs={}), documentation(ds))
        else:
            subsets[ds] = Dataset(docs, queries, documentation(ds))
            for qrel in qrels:
                subset_qrels = BeirQrels(Cache(
                    ZipExtract(dlc_ds, f'{ds_zip}/qrels/{qrel}.tsv'),
                    base_path / ds / f'{qrel}.qrels'),
                                         qrels_defs={})
                subset_qids = qid_filter(subset_qrels)
                subsets[f'{ds}/{qrel}'] = Dataset(
                    docs, FilteredQueries(queries, subset_qids,
                                          mode='include'), subset_qrels,
                    documentation(f'{ds}/{qrel}'))

    cqa = [
        'android', 'english', 'gaming', 'gis', 'mathematica', 'physics',
        'programmers', 'stats', 'tex', 'unix', 'webmasters', 'wordpress'
    ]
    cqa_dlc = dlc['cqadupstack']
    for ds in cqa:
        docs_migrator = Migrator(
            base_path / 'cqadupstack' / ds / 'irds_version.txt',
            'v2',
            affected_files=[f'{base_path/"cqadupstack"/ds}/docs.pklz4'],
            message=f'Migrating {NAME}/cqadupstack/{ds} (structuring fields)')
        subsets[f'cqadupstack/{ds}'] = Dataset(
            docs_migrator(
                BeirDocs(f'cqadupstack/{ds}',
                         ZipExtract(cqa_dlc, f'cqadupstack/{ds}/corpus.jsonl'),
                         BeirCqaDoc)),
            BeirQueries(
                f'cqadupstack/{ds}',
                Cache(ZipExtract(cqa_dlc, f'cqadupstack/{ds}/queries.jsonl'),
                      base_path / 'cqadupstack' / ds / 'queries.json'),
                BeirCqaQuery),
            BeirQrels(Cache(
                ZipExtract(cqa_dlc, f'cqadupstack/{ds}/qrels/test.tsv'),
                base_path / 'cqadupstack' / ds / 'test.qrels'),
                      qrels_defs={}), documentation(f'cqadupstack/{ds}'))

    ir_datasets.registry.register(NAME, base)
    for s in sorted(subsets):
        ir_datasets.registry.register(f'{NAME}/{s}', subsets[s])

    return base, subsets
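
Once registered, any BEIR subset supports random access to its corpus through a docstore. A small usage sketch (assuming NAME == 'beir'; the doc_id is hypothetical):

import ir_datasets

ds = ir_datasets.load('beir/scifact')
store = ds.docs_store()   # random access over the corpus by doc_id
doc = store.get('4983')   # hypothetical doc_id
print(doc.title)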