Example #1
0
def _init():
    subsets = {}
    documentation = YamlDocumentation(f'docs/{NAME}.yaml')
    base_path = ir_datasets.util.home_path()/NAME
    dlc = DownloadConfig.context(NAME, base_path)

    collection = TrecDocs(dlc['docs'], encoding='GB18030', path_globs=['**/xinhua/x*', '**/peoples-daily/pd*'], namespace=NAME, lang='zh', count_hint=ir_datasets.util.count_hint(NAME))

    base = Dataset(collection, documentation('_'))

    subsets['trec5'] = Dataset(
        TrecQueries(GzipExtract(dlc['trec5/queries']), qtype=TrecMandarinQuery, qtype_map=QTYPE_MAP, encoding='GBK', namespace=NAME, lang=None), # queries have multiple languages
        TrecQrels(GzipExtract(dlc['trec5/qrels']), QREL_DEFS),
        collection,
        documentation('trec5'))

    subsets['trec6'] = Dataset(
        TrecQueries(GzipExtract(dlc['trec6/queries']), qtype=TrecMandarinQuery, qtype_map=QTYPE_MAP, encoding='GBK', namespace=NAME, lang=None), # queries have multiple languages
        TrecQrels(GzipExtract(dlc['trec6/qrels']), QREL_DEFS),
        collection,
        documentation('trec6'))

    ir_datasets.registry.register(NAME, base)
    for s in sorted(subsets):
        ir_datasets.registry.register(f'{NAME}/{s}', subsets[s])

    return base, subsets
Example #2
0
def _init():
    base_path = ir_datasets.util.home_path()/NAME
    dlc = ir_datasets.util.DownloadConfig.context(NAME, base_path)
    documentation = YamlDocumentation(f'docs/{NAME}.yaml')
    collection = FairTrecDocs(GzipExtract(dlc["docs"]), GzipExtract(dlc["metadata"]))

    base = Dataset(
        collection,
        documentation('_'))

    subsets = {}

    train_topics = GzipExtract(dlc["train/topics"])
    subsets['train'] = Dataset(
        collection,
        FairTrecQueries(train_topics, FairTrecQuery),
        FairTrecQrels(train_topics),
        documentation('train'))

    subsets['eval'] = Dataset(
        collection,
        FairTrecQueries(GzipExtract(dlc['eval/topics']), FairTrecEvalQuery),
        documentation('eval'))

    ir_datasets.registry.register(NAME, base)
    for s in subsets:
        ir_datasets.registry.register(f'{NAME}/{s}', subsets[s])

    return base, subsets
Example #3
0
def _init():
    base_path = ir_datasets.util.home_path() / NAME
    dlc = ir_datasets.util.DownloadConfig.context(NAME, base_path)
    documentation = YamlDocumentation(f'docs/{NAME}.yaml')

    collection = TsvDocs(GzipExtract(dlc['docs']),
                         doc_cls=DprW100Doc,
                         namespace=NAME,
                         lang='en',
                         skip_first_line=True,
                         docstore_size_hint=12827215492,
                         count_hint=ir_datasets.util.count_hint(NAME))
    base = Dataset(collection, documentation('_'))

    subsets = {}

    nq_dev_manager = DprW100Manager(GzipExtract(dlc['nq-dev']),
                                    base_path / 'nq-dev')
    subsets['natural-questions/dev'] = Dataset(
        collection, DprW100Queries(nq_dev_manager.file_ref('queries.tsv')),
        TrecQrels(nq_dev_manager.file_ref('qrels'), QREL_DEFS),
        documentation('natural-questions/dev'))

    nq_train_manager = DprW100Manager(GzipExtract(dlc['nq-train']),
                                      base_path / 'nq-train')
    subsets['natural-questions/train'] = Dataset(
        collection, DprW100Queries(nq_train_manager.file_ref('queries.tsv')),
        TrecQrels(nq_train_manager.file_ref('qrels'), QREL_DEFS),
        documentation('natural-questions/train'))

    tqa_dev_manager = DprW100Manager(GzipExtract(dlc['tqa-dev']),
                                     base_path / 'tqa-dev',
                                     passage_id_key='psg_id')
    subsets['trivia-qa/dev'] = Dataset(
        collection, DprW100Queries(tqa_dev_manager.file_ref('queries.tsv')),
        TrecQrels(tqa_dev_manager.file_ref('qrels'), QREL_DEFS),
        documentation('trivia-qa/dev'))

    tqa_train_manager = DprW100Manager(GzipExtract(dlc['tqa-train']),
                                       base_path / 'tqa-train',
                                       passage_id_key='psg_id')
    subsets['trivia-qa/train'] = Dataset(
        collection, DprW100Queries(tqa_train_manager.file_ref('queries.tsv')),
        TrecQrels(tqa_train_manager.file_ref('qrels'), QREL_DEFS),
        documentation('trivia-qa/train'))

    ir_datasets.registry.register(NAME, base)
    for s in sorted(subsets):
        ir_datasets.registry.register(f'{NAME}/{s}', subsets[s])

    return base, subsets
Example #4
0
def _init():
    base_path = ir_datasets.util.home_path()/NAME
    documentation = YamlDocumentation(f'docs/{NAME}.yaml')
    dlc = DownloadConfig.context(NAME, base_path, dua=DUA)
    subsets = {}
    migrator = Migrator(base_path/'irds_version.txt', 'v2',
        affected_files=[base_path/'msmarco_v2_passage.tar.pklz4'],
        message='Cleaning up pklz4 lookup structure in favor of ID-based lookups')
    collection = MsMarcoV2Passages(dlc['passages'])
    collection = migrator(collection)

    qrels_migrator = Migrator(base_path/'qrels_version.txt', 'v2',
        affected_files=[base_path/'train'/'qrels.tsv', base_path/'dev1'/'qrels.tsv', base_path/'dev2'/'qrels.tsv'],
        message='Updating qrels (task organizers removed duplicates)')

    subsets['train'] = Dataset(
        collection,
        TsvQueries(dlc['train/queries'], namespace='msmarco', lang='en'),
        qrels_migrator(TrecQrels(dlc['train/qrels'], QRELS_DEFS)),
        TrecScoredDocs(GzipExtract(dlc['train/scoreddocs'])),
    )
    subsets['dev1'] = Dataset(
        collection,
        TsvQueries(dlc['dev1/queries'], namespace='msmarco', lang='en'),
        qrels_migrator(TrecQrels(dlc['dev1/qrels'], QRELS_DEFS)),
        TrecScoredDocs(GzipExtract(dlc['dev1/scoreddocs'])),
    )
    subsets['dev2'] = Dataset(
        collection,
        TsvQueries(dlc['dev2/queries'], namespace='msmarco', lang='en'),
        qrels_migrator(TrecQrels(dlc['dev2/qrels'], QRELS_DEFS)),
        TrecScoredDocs(GzipExtract(dlc['dev2/scoreddocs'])),
    )
    subsets['trec-dl-2021'] = Dataset(
        collection,
        TsvQueries(dlc['trec-dl-2021/queries'], namespace='msmarco', lang='en'),
        TrecQrels(dlc['trec-dl-2021/qrels'], TREC_DL_QRELS_DEFS),
        TrecScoredDocs(GzipExtract(dlc['trec-dl-2021/scoreddocs'])),
    )
    dl21_judged = Lazy(lambda: {q.query_id for q in subsets['trec-dl-2021'].qrels_iter()})
    subsets['trec-dl-2021/judged'] = Dataset(
        FilteredQueries(subsets['trec-dl-2021'].queries_handler(), dl21_judged),
        FilteredScoredDocs(subsets['trec-dl-2021'].scoreddocs_handler(), dl21_judged),
        subsets['trec-dl-2021'],
    )

    ir_datasets.registry.register(NAME, Dataset(collection, documentation("_")))
    for s in sorted(subsets):
        ir_datasets.registry.register(f'{NAME}/{s}', Dataset(subsets[s], documentation(s)))

    return collection, subsets
Example #5
0
def _init():
    documentation = YamlDocumentation(f'docs/{NAME}.yaml')
    base_path = ir_datasets.util.home_path()/NAME
    dlc = DownloadConfig.context(NAME, base_path)
    subsets = {}

    base = Dataset(documentation('_'))

    collection04 = MedlineDocs('2004', [GzipExtract(dlc['2004/a']), GzipExtract(dlc['2004/b']), GzipExtract(dlc['2004/c']), GzipExtract(dlc['2004/d'])], count_hint=ir_datasets.util.count_hint(f'{NAME}/2004'))

    subsets['2004'] = Dataset(collection04, documentation('2004'))

    subsets['2004/trec-genomics-2004'] = Dataset(
        collection04,
        TrecXmlQueries(ZipExtract(dlc['trec-genomics-2004/queries'], 'Official.xml'), qtype=TrecGenomicsQuery, qtype_map=TREC04_XML_MAP, namespace='trec-genomics', lang='en'),
        TrecQrels(dlc['trec-genomics-2004/qrels'], QREL_DEFS),
        documentation('trec-genomics-2004'),
    )
    subsets['2004/trec-genomics-2005'] = Dataset(
        collection04,
        TrecGenomicsQueries(dlc['trec-genomics-2005/queries']),
        TrecQrels(dlc['trec-genomics-2005/qrels'], QREL_DEFS),
        documentation('trec-genomics-2005'),
    )

    collection17 = ConcatDocs([
        AacrAscoDocs(dlc['2017/aacr_asco_extra']),
        MedlineDocs('2017', [dlc['2017/part1'], dlc['2017/part2'], dlc['2017/part3'], dlc['2017/part4'], dlc['2017/part5']]),
    ], count_hint=ir_datasets.util.count_hint(f'{NAME}/2017'))
    subsets['2017'] = Dataset(collection17, documentation('2017'))

    subsets['2017/trec-pm-2017'] = Dataset(
        collection17,
        TrecXmlQueries(dlc['trec-pm-2017/queries'], qtype=TrecPm2017Query, namespace='trec-pm-2017', lang='en'),
        TrecQrels(dlc['trec-pm-2017/qrels'], QREL_DEFS),
        documentation('trec-pm-2017'),
    )
    subsets['2017/trec-pm-2018'] = Dataset(
        collection17,
        TrecXmlQueries(dlc['trec-pm-2018/queries'], qtype=TrecPmQuery, namespace='trec-pm-2018', lang='en'),
        TrecQrels(dlc['trec-pm-2018/qrels'], QREL_DEFS),
        documentation('trec-pm-2018'),
    )

    ir_datasets.registry.register(NAME, base)
    for s in sorted(subsets):
        ir_datasets.registry.register(f'{NAME}/{s}', subsets[s])

    return base, subsets
Example #6
0
def _init():
    subsets = {}
    base_path = ir_datasets.util.home_path() / NAME
    dlc = DownloadConfig.context(NAME, base_path)
    documentation = YamlDocumentation(f'docs/{NAME}.yaml')

    en_noclean_tr_collection = C4Docs(
        GzipExtract(dlc['en-noclean/sources']),
        TarExtractAll(dlc['en-noclean/checkpoints'],
                      base_path / 'en.noclean.checkpoints'),
        base_path,
        source_name_filter=r'en\.noclean\.c4-train',
        filter_name='train')  # exclude validation files (only include train)
    base = Dataset(documentation('_'))

    subsets['en-noclean-tr'] = Dataset(en_noclean_tr_collection,
                                       documentation('en-noclean-tr'))

    subsets['en-noclean-tr/trec-misinfo-2021'] = Dataset(
        en_noclean_tr_collection,
        TrecXmlQueries(dlc['trec-misinfo-2021/queries'],
                       qtype=MisinfoQuery,
                       qtype_map=misinfo_map,
                       namespace='trec-misinfo',
                       lang='en'),
        documentation('en-noclean-tr/trec-misinfo-2021'))

    ir_datasets.registry.register(NAME, base)
    for subset in subsets:
        ir_datasets.registry.register(f'{NAME}/{subset}', subsets[subset])

    return base, subsets
Example #7
0
def _init():
    documentation = YamlDocumentation(f'docs/{NAME}.yaml')
    base_path = ir_datasets.util.home_path() / NAME
    dlc = DownloadConfig.context(NAME, base_path, dua=DUA)
    subsets = {}

    collection = TrecDocs(dlc['docs'],
                          path_globs=[
                              '**/FBIS/FB*', '**/FR94/??/FR*', '**/FT/*/FT*',
                              '**/LATIMES/LA*'
                          ],
                          namespace=NAME,
                          lang='en',
                          expected_file_count=2295,
                          count_hint=ir_datasets.util.count_hint(NAME))

    queries = TrecQueries(GzipExtract(dlc['queries']),
                          namespace=NAME,
                          lang='en')
    qrels = TrecQrels(dlc['qrels'], QREL_DEFS)

    base = Dataset(collection, queries, qrels, documentation('_'))

    for fold in FOLDS:
        qid_filter = make_filter(fold)
        subsets[fold] = Dataset(FilteredQueries(queries, qid_filter),
                                FilteredQrels(qrels, qid_filter), collection,
                                documentation(fold))

    ir_datasets.registry.register(NAME, base)
    for s in sorted(subsets):
        ir_datasets.registry.register(f'{NAME}/{s}', subsets[s])

    return base, subsets
Example #8
0
 def _docs_initializer(lang_code):
     if lang_code not in _docs_cache:
         dlc = _dlc().context("clirmatrix_docs", base_path)
         docs = TsvDocs(GzipExtract(dlc[f'docs/{lang_code}']),
                        namespace=f'{NAME}/{lang_code}',
                        lang=lang_code)
         _docs_cache[lang_code] = docs
     return _docs_cache[lang_code]
Example #9
0
def _init():
    documentation = YamlDocumentation(f'docs/{NAME}.yaml')
    base_path = ir_datasets.util.home_path()/NAME
    dlc = DownloadConfig.context(NAME, base_path)
    subsets = {}

    collection = GovDocs(dlc['docs'])
    base = Dataset(collection, documentation('_'))

    subsets['trec-web-2002'] = Dataset(
        collection,
        TrecQueries(GzipExtract(dlc['trec-web-2002/queries']), namespace='gov/trec-web-2002', lang='en'),
        TrecQrels(GzipExtract(dlc['trec-web-2002/qrels']), QREL_DEFS),
        documentation('trec-web-2002')
    )
    subsets['trec-web-2002/named-page'] = Dataset(
        collection,
        TrecQueries(GzipExtract(dlc['trec-web-2002/named-page/queries']), qtype=GenericQuery, qtype_map=NAMED_PAGE_QTYPE_MAP, namespace='gov/trec-web-2002/named-page', lang='en'),
        TrecQrels(GzipExtract(dlc['trec-web-2002/named-page/qrels']), NAMED_PAGE_QREL_DEFS),
        documentation('trec-web-2002/named-page')
    )
    subsets['trec-web-2003'] = Dataset(
        collection,
        TrecQueries(dlc['trec-web-2003/queries'], qtype=GovWeb02Query, qtype_map=WEB03_QTYPE_MAP, namespace='gov/trec-web-2003', lang='en'),
        TrecQrels(dlc['trec-web-2003/qrels'], QREL_DEFS),
        documentation('trec-web-2003')
    )
    subsets['trec-web-2003/named-page'] = Dataset(
        collection,
        TrecQueries(dlc['trec-web-2003/named-page/queries'], qtype=GenericQuery, qtype_map=NAMED_PAGE_QTYPE_MAP, namespace='gov/trec-web-2003/named-page', lang='en'),
        TrecQrels(dlc['trec-web-2003/named-page/qrels'], NAMED_PAGE_QREL_DEFS),
        documentation('trec-web-2003/named-page')
    )
    subsets['trec-web-2004'] = Dataset(
        collection,
        TrecQueries(dlc['trec-web-2004/queries'], qtype=GenericQuery, qtype_map=WEB04_QTYPE_MAP, namespace='gov/trec-web-2004', lang='en'),
        TrecQrels(dlc['trec-web-2004/qrels'], QREL_DEFS),
        documentation('trec-web-2004')
    )

    ir_datasets.registry.register(NAME, base)
    for s in sorted(subsets):
        ir_datasets.registry.register(f'{NAME}/{s}', subsets[s])

    return base, subsets
Example #10
0
def _init():
    base_path = ir_datasets.util.home_path()/NAME
    dlc = DownloadConfig.context(NAME, base_path, dua=DUA)
    documentation = YamlDocumentation(f'docs/{NAME}.yaml')
    manager = MsMarcoQnAManager(GzipExtract(dlc['train']), GzipExtract(dlc['dev']), GzipExtract(dlc['eval']), base_path)
    migrator = Migrator(base_path/'irds_version.txt', 'v2',
        affected_files=[
            base_path/'docs.pklz4',
            base_path/'train.run', base_path/'train.qrels',
            base_path/'dev.run', base_path/'dev.qrels',
            base_path/'eval.run',
        ],
        message='Migrating msmarco-qna (correcting doc_ids)')

    collection = DocstoreBackedDocs(manager.docs_store, docs_cls=MsMarcoQnADoc, namespace=NAME, lang='en')
    collection = migrator(collection)

    subsets = {}

    subsets['train'] = Dataset(
        collection,
        TsvQueries(manager.file_ref('train.queries.tsv'), query_cls=MsMarcoQnAQuery, namespace='msmarco', lang='en'),
        migrator(TrecQrels(manager.file_ref('train.qrels'), QRELS_DEFS)),
        migrator(TrecScoredDocs(manager.file_ref('train.run'))),
    )

    subsets['dev'] = Dataset(
        collection,
        TsvQueries(manager.file_ref('dev.queries.tsv'), query_cls=MsMarcoQnAQuery, namespace='msmarco', lang='en'),
        migrator(TrecQrels(manager.file_ref('dev.qrels'), QRELS_DEFS)),
        migrator(TrecScoredDocs(manager.file_ref('dev.run'))),
    )

    subsets['eval'] = Dataset(
        collection,
        TsvQueries(manager.file_ref('eval.queries.tsv'), query_cls=MsMarcoQnAEvalQuery, namespace='msmarco', lang='en'),
        migrator(TrecScoredDocs(manager.file_ref('eval.run'))),
    )

    ir_datasets.registry.register(NAME, Dataset(collection, documentation('_')))
    for s in sorted(subsets):
        ir_datasets.registry.register(f'{NAME}/{s}', Dataset(subsets[s], documentation(s)))

    return collection, subsets
Example #11
0
def _init():
    subsets = {}
    base_path = ir_datasets.util.home_path() / NAME
    dlc = DownloadConfig.context(NAME, base_path)
    documentation = YamlDocumentation(f'docs/{NAME}.yaml')

    collection = TrecDocs(
        dlc['docs'],
        encoding='ISO-8859-1',
        path_globs=['**/afp_text/af*', '**/infosel_data/ism_*'],
        namespace=NAME,
        lang='es',
        count_hint=ir_datasets.util.count_hint(NAME))

    base = Dataset(collection, documentation('_'))

    subsets['trec3'] = Dataset(
        TrecSpanishTranslateQueries(
            TrecQueries(GzipExtract(dlc['trec3/queries']),
                        qtype_map=QTYPE_MAP_3,
                        encoding='ISO-8859-1',
                        namespace=NAME,
                        lang=None), TrecSpanish3Query),
        TrecQrels(GzipExtract(dlc['trec3/qrels']), QREL_DEFS), collection,
        documentation('trec3'))

    subsets['trec4'] = Dataset(
        TrecSpanishTranslateQueries(
            TrecQueries(GzipExtract(dlc['trec4/queries']),
                        qtype=TrecDescOnlyQuery,
                        qtype_map=QTYPE_MAP_4,
                        encoding='ISO-8859-1',
                        namespace=NAME,
                        lang=None), TrecSpanish4Query),
        TrecQrels(GzipExtract(dlc['trec4/qrels']), QREL_DEFS), collection,
        documentation('trec4'))

    ir_datasets.registry.register(NAME, base)
    for s in sorted(subsets):
        ir_datasets.registry.register(f'{NAME}/{s}', subsets[s])

    return base, subsets
Example #12
0
 def _initializer(dsid, args, dlc_context=None):
     docs_lang, queries_lang, split = args
     docs = _docs_initializer(docs_lang)
     components = [docs]
     if queries_lang:  # queries & split are optional
         dlc = _dlc().context(dlc_context, base_path)
         dlc_key = f'queries/{queries_lang}_{docs_lang}/{split}'
         qrel_dlc = GzipExtract(dlc[dlc_key])
         qrels = CLIRMatrixQrels(qrel_dlc, QRELS_DEFS)
         queries = CLIRMatrixQueries(qrel_dlc, queries_lang)
         components += [queries, qrels]
     result = Dataset(*components)
     result = Dataset(MetadataComponent(dsid, result, metadata), result)
     return result
Example #13
0
def _init():
    documentation = YamlDocumentation(f'docs/{NAME}.yaml')
    base_path = ir_datasets.util.home_path() / NAME
    dlc = DownloadConfig.context(NAME, base_path)
    subsets = {}

    collection = MedlineDocs([
        GzipExtract(dlc['docs/a']),
        GzipExtract(dlc['docs/b']),
        GzipExtract(dlc['docs/c']),
        GzipExtract(dlc['docs/d'])
    ])
    base = Dataset(collection, documentation('_'))

    subsets['trec-genomics-2004'] = Dataset(
        collection,
        TrecXmlQueries(ZipExtract(dlc['trec-genomics-2004/queries'],
                                  'Official.xml'),
                       qtype=TrecGenomicsQuery,
                       qtype_map=TREC04_XML_MAP,
                       namespace='trec-genomics'),
        TrecQrels(dlc['trec-genomics-2004/qrels'], QREL_DEFS),
        documentation('trec-genomics-2004'),
    )
    subsets['trec-genomics-2005'] = Dataset(
        collection,
        TrecGenomicsQueries(dlc['trec-genomics-2005/queries']),
        TrecQrels(dlc['trec-genomics-2005/qrels'], QREL_DEFS),
        documentation('trec-genomics-2005'),
    )

    ir_datasets.registry.register(NAME, base)
    for s in sorted(subsets):
        ir_datasets.registry.register(f'{NAME}/{s}', subsets[s])

    return base, subsets
Example #14
0
def _init():
    documentation = YamlDocumentation(f'docs/{NAME}.yaml')
    base_path = ir_datasets.util.home_path() / NAME
    dlc = DownloadConfig.context(NAME, base_path)
    subsets = {}

    docs_dlc = dlc['docs']
    chk_dlc = TarExtractAll(dlc['docs.chk'], base_path / 'corpus.chk')
    collection = ClueWeb09Docs(docs_dlc, chk_dlc, lang=None)  # multiple langs
    collection_ar = ClueWeb09Docs(docs_dlc,
                                  chk_dlc,
                                  dirs=['ClueWeb09_Arabic_1'],
                                  lang='ar')
    collection_zh = ClueWeb09Docs(docs_dlc,
                                  chk_dlc,
                                  dirs=[
                                      'ClueWeb09_Chinese_1',
                                      'ClueWeb09_Chinese_2',
                                      'ClueWeb09_Chinese_3',
                                      'ClueWeb09_Chinese_4'
                                  ],
                                  lang='zh')
    collection_en = ClueWeb09Docs(
        docs_dlc,
        chk_dlc,
        dirs=[
            'ClueWeb09_English_1', 'ClueWeb09_English_2',
            'ClueWeb09_English_3', 'ClueWeb09_English_4',
            'ClueWeb09_English_5', 'ClueWeb09_English_6',
            'ClueWeb09_English_7', 'ClueWeb09_English_8',
            'ClueWeb09_English_9', 'ClueWeb09_English_10'
        ],
        lang='en')
    collection_fr = ClueWeb09Docs(docs_dlc,
                                  chk_dlc,
                                  dirs=['ClueWeb09_French_1'],
                                  lang='fr')
    collection_de = ClueWeb09Docs(docs_dlc,
                                  chk_dlc,
                                  dirs=['ClueWeb09_German_1'],
                                  lang='de')
    collection_it = ClueWeb09Docs(docs_dlc,
                                  chk_dlc,
                                  dirs=['ClueWeb09_Italian_1'],
                                  lang='it')
    collection_ja = ClueWeb09Docs(
        docs_dlc,
        chk_dlc,
        dirs=['ClueWeb09_Japanese_1', 'ClueWeb09_Japanese_2'],
        lang='ja')
    collection_ko = ClueWeb09Docs(docs_dlc,
                                  chk_dlc,
                                  dirs=['ClueWeb09_Korean_1'],
                                  lang='ko')
    collection_pt = ClueWeb09Docs(docs_dlc,
                                  chk_dlc,
                                  dirs=['ClueWeb09_Portuguese_1'],
                                  lang='pt')
    collection_es = ClueWeb09Docs(
        docs_dlc,
        chk_dlc,
        dirs=['ClueWeb09_Spanish_1', 'ClueWeb09_Spanish_2'],
        lang='es')
    collection_catb = ClueWeb09Docs(docs_dlc,
                                    chk_dlc,
                                    dirs=['ClueWeb09_English_1'],
                                    lang='en')
    base = Dataset(collection, documentation('_'))

    subsets['ar'] = Dataset(collection_ar, documentation('ar'))
    subsets['zh'] = Dataset(collection_zh, documentation('zh'))
    subsets['en'] = Dataset(collection_en, documentation('en'))
    subsets['fr'] = Dataset(collection_fr, documentation('fr'))
    subsets['de'] = Dataset(collection_de, documentation('de'))
    subsets['it'] = Dataset(collection_it, documentation('it'))
    subsets['ja'] = Dataset(collection_ja, documentation('ja'))
    subsets['ko'] = Dataset(collection_ko, documentation('ko'))
    subsets['pt'] = Dataset(collection_pt, documentation('pt'))
    subsets['es'] = Dataset(collection_es, documentation('es'))
    subsets['catb'] = Dataset(collection_catb, documentation('catb'))

    subsets['en/trec-web-2009'] = Dataset(
        collection_en,
        TrecXmlQueries(dlc['trec-web-2009/queries'],
                       qtype=TrecWebTrackQuery,
                       namespace=NAME,
                       lang='en'),
        TrecPrels(GzipExtract(dlc['trec-web-2009/qrels.adhoc']), QREL_DEFS_09),
        documentation('trec-web-2009'))

    subsets['en/trec-web-2010'] = Dataset(
        collection_en,
        TrecXmlQueries(dlc['trec-web-2010/queries'],
                       qtype=TrecWebTrackQuery,
                       namespace=NAME,
                       lang='en'),
        TrecQrels(dlc['trec-web-2010/qrels.adhoc'], QREL_DEFS),
        documentation('trec-web-2010'))

    subsets['en/trec-web-2011'] = Dataset(
        collection_en,
        TrecXmlQueries(dlc['trec-web-2011/queries'],
                       qtype=TrecWebTrackQuery,
                       namespace=NAME,
                       lang='en'),
        TrecQrels(dlc['trec-web-2011/qrels.adhoc'], QREL_DEFS),
        documentation('trec-web-2011'))

    subsets['en/trec-web-2012'] = Dataset(
        collection_en,
        TrecXmlQueries(dlc['trec-web-2012/queries'],
                       qtype=TrecWebTrackQuery,
                       namespace=NAME,
                       lang='en'),
        TrecQrels(dlc['trec-web-2012/qrels.adhoc'], QREL_DEFS),
        documentation('trec-web-2012'))

    subsets['catb/trec-web-2009'] = Dataset(
        collection_catb,
        TrecXmlQueries(dlc['trec-web-2009/queries'],
                       qtype=TrecWebTrackQuery,
                       namespace=NAME,
                       lang='en'),
        CatBQrelFilter(
            TrecPrels(GzipExtract(dlc['trec-web-2009/qrels.adhoc']),
                      QREL_DEFS_09)), documentation('trec-web-2009'))

    subsets['catb/trec-web-2010'] = Dataset(
        collection_catb,
        TrecXmlQueries(dlc['trec-web-2010/queries'],
                       qtype=TrecWebTrackQuery,
                       namespace=NAME,
                       lang='en'),
        CatBQrelFilter(TrecQrels(dlc['trec-web-2010/qrels.adhoc'], QREL_DEFS)),
        documentation('trec-web-2010'))

    subsets['catb/trec-web-2011'] = Dataset(
        collection_catb,
        TrecXmlQueries(dlc['trec-web-2011/queries'],
                       qtype=TrecWebTrackQuery,
                       namespace=NAME,
                       lang='en'),
        CatBQrelFilter(TrecQrels(dlc['trec-web-2011/qrels.adhoc'], QREL_DEFS)),
        documentation('trec-web-2011'))

    subsets['catb/trec-web-2012'] = Dataset(
        collection_catb,
        TrecXmlQueries(dlc['trec-web-2012/queries'],
                       qtype=TrecWebTrackQuery,
                       namespace=NAME,
                       lang='en'),
        CatBQrelFilter(TrecQrels(dlc['trec-web-2012/qrels.adhoc'], QREL_DEFS)),
        documentation('trec-web-2012'))

    subsets['trec-mq-2009'] = Dataset(
        collection,
        TrecColonQueries(GzipExtract(dlc['trec-mq-2009/queries']),
                         encoding='latin1',
                         lang='en'),
        TrecPrels(GzipExtract(dlc['trec-mq-2009/qrels']), QREL_DEFS_09),
        documentation('trec-mq-2009'))

    ir_datasets.registry.register(NAME, base)
    for s in sorted(subsets):
        ir_datasets.registry.register(f'{NAME}/{s}', subsets[s])

    return base, subsets
def _init():
    base_path = ir_datasets.util.home_path() / 'msmarco-document'
    documentation = YamlDocumentation('docs/msmarco-document.yaml')
    dlc = DownloadConfig.context('msmarco-document', base_path, dua=DUA)
    subsets = {}
    collection = MsMarcoTrecDocs(GzipExtract(dlc['docs']))

    subsets['train'] = Dataset(
        collection,
        TsvQueries(GzipExtract(dlc['train/queries']), namespace='msmarco'),
        TrecQrels(GzipExtract(dlc['train/qrels']), QRELS_DEFS),
        TrecScoredDocs(GzipExtract(dlc['train/scoreddocs'])),
    )

    subsets['dev'] = Dataset(
        collection,
        TsvQueries(GzipExtract(dlc['dev/queries']), namespace='msmarco'),
        TrecQrels(GzipExtract(dlc['dev/qrels']), QRELS_DEFS),
        TrecScoredDocs(GzipExtract(dlc['dev/scoreddocs'])),
    )

    subsets['eval'] = Dataset(
        collection,
        TsvQueries(GzipExtract(dlc['eval/queries']), namespace='msmarco'),
        TrecScoredDocs(GzipExtract(dlc['eval/scoreddocs'])),
    )

    subsets['trec-dl-2019'] = Dataset(
        collection,
        TsvQueries(GzipExtract(dlc['trec-dl-2019/queries']),
                   namespace='msmarco'),
        TrecQrels(dlc['trec-dl-2019/qrels'], TREC_DL_QRELS_DEFS),
        TrecScoredDocs(GzipExtract(dlc['trec-dl-2019/scoreddocs'])),
    )

    subsets['trec-dl-2020'] = Dataset(
        collection,
        TsvQueries(GzipExtract(dlc['trec-dl-2020/queries']),
                   namespace='msmarco'),
        TrecScoredDocs(GzipExtract(dlc['trec-dl-2020/scoreddocs'])),
    )

    subsets['orcas'] = Dataset(
        collection,
        TsvQueries(GzipExtract(dlc['orcas/queries']), namespace='orcas'),
        TrecQrels(GzipExtract(dlc['orcas/qrels']), ORCAS_QLRES_DEFS),
        TrecScoredDocs(GzipExtract(dlc['orcas/scoreddocs'])),
    )

    dl19_judged = Lazy(
        lambda: {q.query_id
                 for q in subsets['trec-dl-2019'].qrels_iter()})
    subsets['trec-dl-2019/judged'] = Dataset(
        FilteredQueries(subsets['trec-dl-2019'].queries_handler(),
                        dl19_judged),
        FilteredScoredDocs(subsets['trec-dl-2019'].scoreddocs_handler(),
                           dl19_judged),
        subsets['trec-dl-2019'],
    )

    ir_datasets.registry.register('msmarco-document',
                                  Dataset(collection, documentation("_")))
    for s in sorted(subsets):
        ir_datasets.registry.register(f'msmarco-document/{s}',
                                      Dataset(subsets[s], documentation(s)))

    return collection, subsets
Example #16
0
def _init():
    documentation = YamlDocumentation(f'docs/{NAME}.yaml')
    base_path = ir_datasets.util.home_path()/NAME
    dlc = DownloadConfig.context(NAME, base_path)
    subsets = {}

    docs_dlc = dlc['docs']
    doccount_dlc = Gov2DocCountFile(os.path.join(base_path, 'corpus.doccounts'), docs_dlc)
    collection = Gov2Docs(docs_dlc, doccount_dlc)
    base = Dataset(collection, documentation('_'))

    subsets['trec-tb-2004'] = Dataset(
        collection,
        TrecQueries(dlc['trec-tb-2004/queries'], namespace=NAME, lang='en'),
        TrecQrels(dlc['trec-tb-2004/qrels'], QREL_DEFS),
        documentation('trec-tb-2004')
    )
    subsets['trec-tb-2005'] = Dataset(
        collection,
        TrecQueries(dlc['trec-tb-2005/queries'], namespace=NAME, lang='en'),
        TrecQrels(dlc['trec-tb-2005/qrels'], QREL_DEFS),
        documentation('trec-tb-2005')
    )
    subsets['trec-tb-2005/named-page'] = Dataset(
        collection,
        TrecQueries(dlc['trec-tb-2005/named-page/queries'], qtype=GenericQuery, qtype_map=NAMED_PAGE_QTYPE_MAP, namespace=NAME, lang='en'),
        TrecQrels(dlc['trec-tb-2005/named-page/qrels'], NAMED_PAGE_QREL_DEFS),
        documentation('trec-tb-2005/named-page')
    )
    subsets['trec-tb-2005/efficiency'] = Dataset(
        collection,
        TrecColonQueries(GzipExtract(dlc['trec-tb-2005/efficiency/queries']), encoding='latin1', namespace=NAME, lang='en'),
        RewriteQids(TrecQrels(dlc['trec-tb-2005/qrels'], QREL_DEFS), EFF_MAP_05),
        documentation('trec-tb-2005/efficiency')
    )
    subsets['trec-tb-2006'] = Dataset(
        collection,
        TrecQueries(dlc['trec-tb-2006/queries'], namespace=NAME, lang='en'),
        TrecQrels(dlc['trec-tb-2006/qrels'], QREL_DEFS),
        documentation('trec-tb-2006')
    )
    subsets['trec-tb-2006/named-page'] = Dataset(
        collection,
        TrecQueries(dlc['trec-tb-2006/named-page/queries'], qtype=GenericQuery, qtype_map=NAMED_PAGE_QTYPE_MAP, namespace=NAME, lang='en'),
        TrecQrels(dlc['trec-tb-2006/named-page/qrels'], NAMED_PAGE_QREL_DEFS),
        documentation('trec-tb-2006/named-page')
    )
    subsets['trec-tb-2006/efficiency'] = Dataset(
        collection,
        TrecColonQueries(TarExtract(dlc['trec-tb-2006/efficiency/queries'], '06.efficiency_topics.all'), encoding='latin1', namespace=NAME, lang='en'),
        RewriteQids(TrecQrels(dlc['trec-tb-2006/qrels'], QREL_DEFS), EFF_MAP_06),
        documentation('trec-tb-2006/efficiency')
    )
    subsets['trec-tb-2006/efficiency/10k'] = Dataset(
        collection,
        TrecColonQueries(TarExtract(dlc['trec-tb-2006/efficiency/queries'], '06.efficiency_topics.10k'), encoding='latin1', namespace=NAME, lang='en'),
        documentation('trec-tb-2006/efficiency/10k')
    )
    subsets['trec-tb-2006/efficiency/stream1'] = Dataset(
        collection,
        TrecColonQueries(TarExtract(dlc['trec-tb-2006/efficiency/queries'], '06.efficiency_topics.stream-1'), encoding='latin1', namespace=NAME, lang='en'),
        documentation('trec-tb-2006/efficiency/stream1')
    )
    subsets['trec-tb-2006/efficiency/stream2'] = Dataset(
        collection,
        TrecColonQueries(TarExtract(dlc['trec-tb-2006/efficiency/queries'], '06.efficiency_topics.stream-2'), encoding='latin1', namespace=NAME, lang='en'),
        documentation('trec-tb-2006/efficiency/stream2')
    )
    subsets['trec-tb-2006/efficiency/stream3'] = Dataset(
        collection,
        TrecColonQueries(TarExtract(dlc['trec-tb-2006/efficiency/queries'], '06.efficiency_topics.stream-3'), encoding='latin1', namespace=NAME, lang='en'),
        RewriteQids(TrecQrels(dlc['trec-tb-2006/qrels'], QREL_DEFS), EFF_MAP_06),
        documentation('trec-tb-2006/efficiency/stream3')
    )
    subsets['trec-tb-2006/efficiency/stream4'] = Dataset(
        collection,
        TrecColonQueries(TarExtract(dlc['trec-tb-2006/efficiency/queries'], '06.efficiency_topics.stream-4'), encoding='latin1', namespace=NAME, lang='en'),
        documentation('trec-tb-2006/efficiency/stream4')
    )

    subsets['trec-mq-2007'] = Dataset(
        collection,
        TrecColonQueries(GzipExtract(dlc['trec-mq-2007/queries']), encoding='latin1'),
        TrecPrels(dlc['trec-mq-2007/qrels'], QREL_DEFS),
        documentation('trec-mq-2007')
    )
    subsets['trec-mq-2008'] = Dataset(
        collection,
        TrecColonQueries(GzipExtract(dlc['trec-mq-2008/queries']), encoding='latin1', namespace='trec-mq', lang='en'),
        TrecPrels(TarExtract(dlc['trec-mq-2008/qrels'], '2008.RC1/prels'), QREL_DEFS),
        documentation('trec-mq-2008')
    )

    ir_datasets.registry.register(NAME, base)
    for s in sorted(subsets):
        ir_datasets.registry.register(f'{NAME}/{s}', subsets[s])

    return base, subsets
Example #17
0
def _init():
    subsets = {}
    base_path = ir_datasets.util.home_path() / NAME
    dlc = DownloadConfig.context(NAME, base_path)
    documentation = YamlDocumentation(f'docs/{NAME}.yaml')

    manager = AolManager([
        GzipExtract(
            TarExtract(
                dlc['logs'],
                'AOL-user-ct-collection/user-ct-test-collection-01.txt.gz')),
        GzipExtract(
            TarExtract(
                dlc['logs'],
                'AOL-user-ct-collection/user-ct-test-collection-02.txt.gz')),
        GzipExtract(
            TarExtract(
                dlc['logs'],
                'AOL-user-ct-collection/user-ct-test-collection-03.txt.gz')),
        GzipExtract(
            TarExtract(
                dlc['logs'],
                'AOL-user-ct-collection/user-ct-test-collection-04.txt.gz')),
        GzipExtract(
            TarExtract(
                dlc['logs'],
                'AOL-user-ct-collection/user-ct-test-collection-05.txt.gz')),
        GzipExtract(
            TarExtract(
                dlc['logs'],
                'AOL-user-ct-collection/user-ct-test-collection-06.txt.gz')),
        GzipExtract(
            TarExtract(
                dlc['logs'],
                'AOL-user-ct-collection/user-ct-test-collection-07.txt.gz')),
        GzipExtract(
            TarExtract(
                dlc['logs'],
                'AOL-user-ct-collection/user-ct-test-collection-08.txt.gz')),
        GzipExtract(
            TarExtract(
                dlc['logs'],
                'AOL-user-ct-collection/user-ct-test-collection-09.txt.gz')),
        GzipExtract(
            TarExtract(
                dlc['logs'],
                'AOL-user-ct-collection/user-ct-test-collection-10.txt.gz')),
    ], GzipExtract(dlc['id2wb']), base_path)

    base = Dataset(
        DocstoreBackedDocs(manager.docs_store,
                           docs_cls=AolIaDoc,
                           namespace=NAME,
                           lang=None),
        TsvQueries(manager.file_ref('queries.tsv'), lang=None),
        TrecQrels(manager.file_ref('qrels'), QREL_DEFS),
        AolQlogs(manager.file_ref('log.pkl.lz4')), documentation('_'))

    ir_datasets.registry.register(NAME, base)
    for s in sorted(subsets):
        ir_datasets.registry.register(f'{NAME}/{s}', subsets[s])

    return base, subsets, manager, base_path
Example #18
0
def _init():
    base_path = ir_datasets.util.home_path() / NAME
    dlc = ir_datasets.util.DownloadConfig.context(NAME, base_path)
    documentation = YamlDocumentation(f'docs/{NAME}.yaml')

    base = Dataset(documentation('_'))

    subsets = {}

    langs = {
        'ar': 'mrtydi-v1.0-arabic',
        'bn': 'mrtydi-v1.0-bengali',
        'en': 'mrtydi-v1.0-english',
        'fi': 'mrtydi-v1.0-finnish',
        'id': 'mrtydi-v1.0-indonesian',
        'ja': 'mrtydi-v1.0-japanese',
        'ko': 'mrtydi-v1.0-korean',
        'ru': 'mrtydi-v1.0-russian',
        'sw': 'mrtydi-v1.0-swahili',
        'te': 'mrtydi-v1.0-telugu',
        'th': 'mrtydi-v1.0-thai',
    }

    migrator = Migrator(base_path / 'irds_version.txt',
                        'v2',
                        affected_files=[base_path / lang for lang in langs],
                        message='Migrating mr-tydi (restructuring directory)')

    for lang, file_name in langs.items():
        dlc_ds = TarExtractAll(dlc[lang], f'{base_path/lang}.data')
        docs = MrTydiDocs(
            GzipExtract(
                RelativePath(dlc_ds, f'{file_name}/collection/docs.jsonl.gz')),
            lang,
            count_hint=ir_datasets.util.count_hint(f'{NAME}/{lang}'))
        docs = migrator(docs)
        subsets[lang] = Dataset(
            docs,
            TsvQueries(RelativePath(dlc_ds, f'{file_name}/topic.tsv'),
                       lang=lang),
            TrecQrels(RelativePath(dlc_ds, f'{file_name}/qrels.txt'),
                      QREL_DEFS), documentation(lang))
        subsets[f'{lang}/train'] = Dataset(
            docs,
            TsvQueries(RelativePath(dlc_ds, f'{file_name}/topic.train.tsv'),
                       lang=lang),
            TrecQrels(RelativePath(dlc_ds, f'{file_name}/qrels.train.txt'),
                      QREL_DEFS), documentation(f'{lang}/train'))
        subsets[f'{lang}/dev'] = Dataset(
            docs,
            TsvQueries(RelativePath(dlc_ds, f'{file_name}/topic.dev.tsv'),
                       lang=lang),
            TrecQrels(RelativePath(dlc_ds, f'{file_name}/qrels.dev.txt'),
                      QREL_DEFS), documentation(f'{lang}/dev'))
        subsets[f'{lang}/test'] = Dataset(
            docs,
            TsvQueries(RelativePath(dlc_ds, f'{file_name}/topic.test.tsv'),
                       lang=lang),
            TrecQrels(RelativePath(dlc_ds, f'{file_name}/qrels.test.txt'),
                      QREL_DEFS), documentation(f'{lang}/test'))

    ir_datasets.registry.register(NAME, base)
    for s in sorted(subsets):
        ir_datasets.registry.register(f'{NAME}/{s}', subsets[s])

    return base, subsets
Example #19
0
def _init():
    base_path = ir_datasets.util.home_path()/NAME
    documentation = YamlDocumentation(f'docs/{NAME}.yaml')
    dlc = DownloadConfig.context(NAME, base_path, dua=DUA)
    subsets = {}
    collection = MsMarcoTrecDocs(GzipExtract(dlc['docs']))

    subsets['train'] = Dataset(
        collection,
        TsvQueries(GzipExtract(dlc['train/queries']), namespace='msmarco', lang='en'),
        TrecQrels(GzipExtract(dlc['train/qrels']), QRELS_DEFS),
        TrecScoredDocs(GzipExtract(dlc['train/scoreddocs'])),
    )

    subsets['dev'] = Dataset(
        collection,
        TsvQueries(GzipExtract(dlc['dev/queries']), namespace='msmarco', lang='en'),
        TrecQrels(GzipExtract(dlc['dev/qrels']), QRELS_DEFS),
        TrecScoredDocs(GzipExtract(dlc['dev/scoreddocs'])),
    )

    subsets['eval'] = Dataset(
        collection,
        TsvQueries(GzipExtract(dlc['eval/queries']), namespace='msmarco', lang='en'),
        TrecScoredDocs(GzipExtract(dlc['eval/scoreddocs'])),
    )

    subsets['trec-dl-2019'] = Dataset(
        collection,
        TsvQueries(GzipExtract(dlc['trec-dl-2019/queries']), namespace='msmarco', lang='en'),
        TrecQrels(dlc['trec-dl-2019/qrels'], TREC_DL_QRELS_DEFS),
        TrecScoredDocs(GzipExtract(dlc['trec-dl-2019/scoreddocs'])),
    )

    subsets['trec-dl-2020'] = Dataset(
        collection,
        TsvQueries(GzipExtract(dlc['trec-dl-2020/queries']), namespace='msmarco', lang='en'),
        TrecQrels(dlc['trec-dl-2020/qrels'], TREC_DL_QRELS_DEFS),
        TrecScoredDocs(GzipExtract(dlc['trec-dl-2020/scoreddocs'])),
    )

    subsets['orcas'] = Dataset(
        collection,
        TsvQueries(GzipExtract(dlc['orcas/queries']), namespace='orcas', lang='en'),
        TrecQrels(GzipExtract(dlc['orcas/qrels']), ORCAS_QLRES_DEFS),
        TrecScoredDocs(GzipExtract(dlc['orcas/scoreddocs'])),
    )

    dl19_judged = Lazy(lambda: {q.query_id for q in subsets['trec-dl-2019'].qrels_iter()})
    subsets['trec-dl-2019/judged'] = Dataset(
        FilteredQueries(subsets['trec-dl-2019'].queries_handler(), dl19_judged),
        FilteredScoredDocs(subsets['trec-dl-2019'].scoreddocs_handler(), dl19_judged),
        subsets['trec-dl-2019'],
    )

    dl20_judged = Lazy(lambda: {q.query_id for q in subsets['trec-dl-2020'].qrels_iter()})
    subsets['trec-dl-2020/judged'] = Dataset(
        FilteredQueries(subsets['trec-dl-2020'].queries_handler(), dl20_judged),
        FilteredScoredDocs(subsets['trec-dl-2020'].scoreddocs_handler(), dl20_judged),
        subsets['trec-dl-2020'],
    )

    # DL-Hard
    dl_hard_qrels_migrator = Migrator(base_path/'trec-dl-hard'/'irds_version.txt', 'v2',
        affected_files=[base_path/'trec-dl-hard'/'qrels'],
        message='Updating trec-dl-hard qrels')
    hard_qids = Lazy(lambda: DL_HARD_QIDS)
    dl_hard_base_queries = TsvQueries([
            Cache(GzipExtract(dlc['trec-dl-2019/queries']), base_path/'trec-dl-2019/queries.tsv'),
            Cache(GzipExtract(dlc['trec-dl-2020/queries']), base_path/'trec-dl-2020/queries.tsv')], namespace='msmarco', lang='en')
    subsets['trec-dl-hard'] = Dataset(
        collection,
        FilteredQueries(dl_hard_base_queries, hard_qids),
        dl_hard_qrels_migrator(TrecQrels(dlc['trec-dl-hard/qrels'], TREC_DL_QRELS_DEFS)),
        documentation('trec-dl-hard')
    )
    hard_qids = Lazy(lambda: DL_HARD_QIDS_BYFOLD['1'])
    subsets['trec-dl-hard/fold1'] = Dataset(
        collection,
        FilteredQueries(dl_hard_base_queries, hard_qids),
        FilteredQrels(subsets['trec-dl-hard'], hard_qids),
        documentation('trec-dl-hard/fold1')
    )
    hard_qids = Lazy(lambda: DL_HARD_QIDS_BYFOLD['2'])
    subsets['trec-dl-hard/fold2'] = Dataset(
        collection,
        FilteredQueries(dl_hard_base_queries, hard_qids),
        FilteredQrels(subsets['trec-dl-hard'], hard_qids),
        documentation('trec-dl-hard/fold2')
    )
    hard_qids = Lazy(lambda: DL_HARD_QIDS_BYFOLD['3'])
    subsets['trec-dl-hard/fold3'] = Dataset(
        collection,
        FilteredQueries(dl_hard_base_queries, hard_qids),
        FilteredQrels(subsets['trec-dl-hard'], hard_qids),
        documentation('trec-dl-hard/fold3')
    )
    hard_qids = Lazy(lambda: DL_HARD_QIDS_BYFOLD['4'])
    subsets['trec-dl-hard/fold4'] = Dataset(
        collection,
        FilteredQueries(dl_hard_base_queries, hard_qids),
        FilteredQrels(subsets['trec-dl-hard'], hard_qids),
        documentation('trec-dl-hard/fold4')
    )
    hard_qids = Lazy(lambda: DL_HARD_QIDS_BYFOLD['5'])
    subsets['trec-dl-hard/fold5'] = Dataset(
        collection,
        FilteredQueries(dl_hard_base_queries, hard_qids),
        FilteredQrels(subsets['trec-dl-hard'], hard_qids),
        documentation('trec-dl-hard/fold5')
    )
    
    subsets['anchor-text'] = Dataset(
        MsMarcoAnchorTextDocs(
            Cache(GzipExtract(dlc['anchor-text']), base_path / "anchor-text.json"),
            count_hint=1703834
        ),
        documentation('anchor-text')
    )

    ir_datasets.registry.register(NAME, Dataset(collection, documentation("_")))
    for s in sorted(subsets):
        ir_datasets.registry.register(f'{NAME}/{s}', Dataset(subsets[s], documentation(s)))

    return collection, subsets
Example #20
0
def _init():
    documentation = YamlDocumentation('docs/msmarco-passage.yaml')
    base_path = ir_datasets.util.home_path() / 'msmarco-passage'
    dlc = DownloadConfig.context('msmarco-passage', base_path, dua=DUA)
    collection = TsvDocs(Cache(
        FixEncoding(TarExtract(dlc['collectionandqueries'], 'collection.tsv')),
        base_path / 'collection.tsv'),
                         namespace='msmarco')
    subsets = {}

    subsets['train'] = Dataset(
        collection,
        TsvQueries(Cache(TarExtract(dlc['queries'], 'queries.train.tsv'),
                         base_path / 'train/queries.tsv'),
                   namespace='msmarco'),
        TrecQrels(dlc['train/qrels'], QRELS_DEFS),
        TsvDocPairs(GzipExtract(dlc['train/docpairs'])),
        TrecScoredDocs(
            Cache(
                ExtractQidPid(
                    TarExtract(dlc['train/scoreddocs'], 'top1000.train.txt')),
                base_path / 'train/ms.run')),
    )

    subsets['dev'] = Dataset(
        collection,
        TsvQueries(Cache(TarExtract(dlc['queries'], 'queries.dev.tsv'),
                         base_path / 'dev/queries.tsv'),
                   namespace='msmarco'),
        TrecQrels(dlc['dev/qrels'], QRELS_DEFS),
        TrecScoredDocs(
            Cache(
                ExtractQidPid(TarExtract(dlc['dev/scoreddocs'],
                                         'top1000.dev')),
                base_path / 'dev/ms.run')),
    )

    subsets['dev/small'] = Dataset(
        collection,
        TsvQueries(Cache(
            TarExtract(dlc['collectionandqueries'], 'queries.dev.small.tsv'),
            base_path / 'dev/small/queries.tsv'),
                   namespace='msmarco'),
        TrecQrels(
            Cache(
                TarExtract(dlc['collectionandqueries'], 'qrels.dev.small.tsv'),
                base_path / 'dev/small/qrels'), QRELS_DEFS),
    )

    subsets['eval'] = Dataset(
        collection,
        TsvQueries(Cache(TarExtract(dlc['queries'], 'queries.eval.tsv'),
                         base_path / 'eval/queries.tsv'),
                   namespace='msmarco'),
        TrecScoredDocs(
            Cache(
                ExtractQidPid(
                    TarExtract(dlc['eval/scoreddocs'], 'top1000.eval')),
                base_path / 'eval/ms.run')),
    )

    subsets['eval/small'] = Dataset(
        collection,
        TsvQueries(Cache(
            TarExtract(dlc['collectionandqueries'], 'queries.eval.small.tsv'),
            base_path / 'eval/small/queries.tsv'),
                   namespace='msmarco'),
    )

    subsets['trec-dl-2019'] = Dataset(
        collection,
        TrecQrels(dlc['trec-dl-2019/qrels'], TREC_DL_QRELS_DEFS),
        TsvQueries(Cache(GzipExtract(dlc['trec-dl-2019/queries']),
                         base_path / 'trec-dl-2019/queries.tsv'),
                   namespace='msmarco'),
        TrecScoredDocs(
            Cache(ExtractQidPid(GzipExtract(dlc['trec-dl-2019/scoreddocs'])),
                  base_path / 'trec-dl-2019/ms.run')),
    )

    subsets['trec-dl-2020'] = Dataset(
        collection,
        TsvQueries(GzipExtract(dlc['trec-dl-2020/queries']),
                   namespace='msmarco'),
        TrecScoredDocs(
            Cache(ExtractQidPid(GzipExtract(dlc['trec-dl-2020/scoreddocs'])),
                  base_path / 'trec-dl-2020/ms.run')),
    )

    # A few subsets that are contrainted to just the queries/qrels/docpairs that have at least
    # 1 relevance assessment
    train_judged = Lazy(
        lambda: {q.query_id
                 for q in subsets['train'].qrels_iter()})
    subsets['train/judged'] = Dataset(
        FilteredQueries(subsets['train'].queries_handler(), train_judged),
        FilteredScoredDocs(subsets['train'].scoreddocs_handler(),
                           train_judged),
        subsets['train'],
    )

    dev_judged = Lazy(
        lambda: {q.query_id
                 for q in subsets['dev'].qrels_iter()})
    subsets['dev/judged'] = Dataset(
        FilteredQueries(subsets['dev'].queries_handler(), dev_judged),
        FilteredScoredDocs(subsets['dev'].scoreddocs_handler(), dev_judged),
        subsets['dev'],
    )

    dl19_judged = Lazy(
        lambda: {q.query_id
                 for q in subsets['trec-dl-2019'].qrels_iter()})
    subsets['trec-dl-2019/judged'] = Dataset(
        FilteredQueries(subsets['trec-dl-2019'].queries_handler(),
                        dl19_judged),
        FilteredScoredDocs(subsets['trec-dl-2019'].scoreddocs_handler(),
                           dl19_judged),
        subsets['trec-dl-2019'],
    )

    # split200 -- 200 queries held out from the training data for validation
    split200 = Lazy(lambda: SPLIT200_QIDS)
    subsets['train/split200-train'] = Dataset(
        FilteredQueries(subsets['train'].queries_handler(),
                        split200,
                        mode='exclude'),
        FilteredScoredDocs(subsets['train'].scoreddocs_handler(),
                           split200,
                           mode='exclude'),
        FilteredQrels(subsets['train'].qrels_handler(),
                      split200,
                      mode='exclude'),
        FilteredDocPairs(subsets['train'].docpairs_handler(),
                         split200,
                         mode='exclude'),
        subsets['train'],
    )
    subsets['train/split200-valid'] = Dataset(
        FilteredQueries(subsets['train'].queries_handler(),
                        split200,
                        mode='include'),
        FilteredScoredDocs(subsets['train'].scoreddocs_handler(),
                           split200,
                           mode='include'),
        FilteredQrels(subsets['train'].qrels_handler(),
                      split200,
                      mode='include'),
        FilteredDocPairs(subsets['train'].docpairs_handler(),
                         split200,
                         mode='include'),
        subsets['train'],
    )

    # Medical subset
    def train_med():
        with dlc['medmarco_ids'].stream() as stream:
            stream = codecs.getreader('utf8')(stream)
            return {l.rstrip() for l in stream}

    train_med = Lazy(train_med)
    subsets['train/medical'] = Dataset(
        FilteredQueries(subsets['train'].queries_handler(), train_med),
        FilteredScoredDocs(subsets['train'].scoreddocs_handler(), train_med),
        FilteredDocPairs(subsets['train'].docpairs_handler(), train_med),
        FilteredQrels(subsets['train'].qrels_handler(), train_med),
        subsets['train'],
    )

    ir_datasets.registry.register('msmarco-passage',
                                  Dataset(collection, documentation('_')))
    for s in sorted(subsets):
        ir_datasets.registry.register(f'msmarco-passage/{s}',
                                      Dataset(subsets[s], documentation(s)))

    return collection, subsets
Example #21
0
 def _dlc_init():
     with GzipExtract(base_dlc['downloads']).stream() as f:
         clirmatrix_dlc = _DownloadConfig(contents=json.load(f))
     return clirmatrix_dlc
Example #22
0
def _init():
    documentation = YamlDocumentation(f'docs/{NAME}.yaml')
    base_path = ir_datasets.util.home_path() / NAME
    dlc = DownloadConfig.context(NAME, base_path, dua=DUA)
    migrator = Migrator(base_path / 'irds_version.txt',
                        'v2',
                        affected_files=[
                            base_path / 'collection.tsv',
                            base_path / 'collection.tsv.pklz4'
                        ],
                        message=f'Migrating {NAME} (fixing passage encoding)')

    collection = TsvDocs(Cache(
        FixEncoding(TarExtract(dlc['collectionandqueries'], 'collection.tsv')),
        base_path / 'collection.tsv'),
                         namespace='msmarco',
                         lang='en',
                         docstore_size_hint=14373971970,
                         count_hint=ir_datasets.util.count_hint(NAME))
    collection = migrator(collection)
    subsets = {}

    subsets['train'] = Dataset(
        collection,
        TsvQueries(Cache(TarExtract(dlc['queries'], 'queries.train.tsv'),
                         base_path / 'train/queries.tsv'),
                   namespace='msmarco',
                   lang='en'),
        TrecQrels(dlc['train/qrels'], QRELS_DEFS),
        TsvDocPairs(GzipExtract(dlc['train/docpairs'])),
        TrecScoredDocs(
            Cache(
                ExtractQidPid(
                    TarExtract(dlc['train/scoreddocs'], 'top1000.train.txt')),
                base_path / 'train/ms.run')),
    )

    subsets['train/triples-v2'] = Dataset(
        collection,
        subsets['train'].queries_handler(),
        subsets['train'].qrels_handler(),
        TsvDocPairs(GzipExtract(dlc['train/docpairs/v2'])),
        subsets['train'].scoreddocs_handler(),
    )

    subsets['train/triples-small'] = Dataset(
        collection,
        subsets['train'].queries_handler(),
        subsets['train'].qrels_handler(),
        TsvDocPairs(
            Cache(
                MapSmallTriplesQidPid(
                    TarExtract(dlc['train/docpairs/small'],
                               'triples.train.small.tsv'),
                    TarExtract(dlc['collectionandqueries'], 'collection.tsv'),
                    subsets['train'].queries_handler()),
                base_path / 'train/small.triples.qidpid.tsv')),
        subsets['train'].scoreddocs_handler(),
    )

    subsets['dev'] = Dataset(
        collection,
        TsvQueries(Cache(TarExtract(dlc['queries'], 'queries.dev.tsv'),
                         base_path / 'dev/queries.tsv'),
                   namespace='msmarco',
                   lang='en'),
        TrecQrels(dlc['dev/qrels'], QRELS_DEFS),
    )

    subsets['dev/small'] = Dataset(
        collection,
        TsvQueries(Cache(
            TarExtract(dlc['collectionandqueries'], 'queries.dev.small.tsv'),
            base_path / 'dev/small/queries.tsv'),
                   namespace='msmarco',
                   lang='en'),
        TrecQrels(
            Cache(
                TarExtract(dlc['collectionandqueries'], 'qrels.dev.small.tsv'),
                base_path / 'dev/small/qrels'), QRELS_DEFS),
        TrecScoredDocs(
            Cache(
                ExtractQidPid(TarExtract(dlc['dev/scoreddocs'],
                                         'top1000.dev')),
                base_path / 'dev/ms.run')),
    )

    subsets['eval'] = Dataset(
        collection,
        TsvQueries(Cache(TarExtract(dlc['queries'], 'queries.eval.tsv'),
                         base_path / 'eval/queries.tsv'),
                   namespace='msmarco',
                   lang='en'),
    )

    subsets['eval/small'] = Dataset(
        collection,
        TsvQueries(Cache(
            TarExtract(dlc['collectionandqueries'], 'queries.eval.small.tsv'),
            base_path / 'eval/small/queries.tsv'),
                   namespace='msmarco',
                   lang='en'),
        TrecScoredDocs(
            Cache(
                ExtractQidPid(
                    TarExtract(dlc['eval/scoreddocs'], 'top1000.eval')),
                base_path / 'eval/ms.run')),
    )

    subsets['trec-dl-2019'] = Dataset(
        collection,
        TrecQrels(dlc['trec-dl-2019/qrels'], TREC_DL_QRELS_DEFS),
        TsvQueries(Cache(GzipExtract(dlc['trec-dl-2019/queries']),
                         base_path / 'trec-dl-2019/queries.tsv'),
                   namespace='msmarco',
                   lang='en'),
        TrecScoredDocs(
            Cache(ExtractQidPid(GzipExtract(dlc['trec-dl-2019/scoreddocs'])),
                  base_path / 'trec-dl-2019/ms.run')),
    )

    subsets['trec-dl-2020'] = Dataset(
        collection,
        TsvQueries(GzipExtract(dlc['trec-dl-2020/queries']),
                   namespace='msmarco',
                   lang='en'),
        TrecQrels(dlc['trec-dl-2020/qrels'], TREC_DL_QRELS_DEFS),
        TrecScoredDocs(
            Cache(ExtractQidPid(GzipExtract(dlc['trec-dl-2020/scoreddocs'])),
                  base_path / 'trec-dl-2020/ms.run')),
    )

    # A few subsets that are contrainted to just the queries/qrels/docpairs that have at least
    # 1 relevance assessment
    train_judged = Lazy(
        lambda: {q.query_id
                 for q in subsets['train'].qrels_iter()})
    subsets['train/judged'] = Dataset(
        FilteredQueries(subsets['train'].queries_handler(), train_judged),
        FilteredScoredDocs(subsets['train'].scoreddocs_handler(),
                           train_judged),
        subsets['train'],
    )

    dev_judged = Lazy(
        lambda: {q.query_id
                 for q in subsets['dev'].qrels_iter()})
    subsets['dev/judged'] = Dataset(
        FilteredQueries(subsets['dev'].queries_handler(), dev_judged),
        subsets['dev'],
    )

    dl19_judged = Lazy(
        lambda: {q.query_id
                 for q in subsets['trec-dl-2019'].qrels_iter()})
    subsets['trec-dl-2019/judged'] = Dataset(
        FilteredQueries(subsets['trec-dl-2019'].queries_handler(),
                        dl19_judged),
        FilteredScoredDocs(subsets['trec-dl-2019'].scoreddocs_handler(),
                           dl19_judged),
        subsets['trec-dl-2019'],
    )

    dl20_judged = Lazy(
        lambda: {q.query_id
                 for q in subsets['trec-dl-2020'].qrels_iter()})
    subsets['trec-dl-2020/judged'] = Dataset(
        FilteredQueries(subsets['trec-dl-2020'].queries_handler(),
                        dl20_judged),
        FilteredScoredDocs(subsets['trec-dl-2020'].scoreddocs_handler(),
                           dl20_judged),
        subsets['trec-dl-2020'],
    )

    # split200 -- 200 queries held out from the training data for validation
    split200 = Lazy(lambda: SPLIT200_QIDS)
    subsets['train/split200-train'] = Dataset(
        FilteredQueries(subsets['train'].queries_handler(),
                        split200,
                        mode='exclude'),
        FilteredScoredDocs(subsets['train'].scoreddocs_handler(),
                           split200,
                           mode='exclude'),
        FilteredQrels(subsets['train'].qrels_handler(),
                      split200,
                      mode='exclude'),
        FilteredDocPairs(subsets['train'].docpairs_handler(),
                         split200,
                         mode='exclude'),
        subsets['train'],
    )
    subsets['train/split200-valid'] = Dataset(
        FilteredQueries(subsets['train'].queries_handler(),
                        split200,
                        mode='include'),
        FilteredScoredDocs(subsets['train'].scoreddocs_handler(),
                           split200,
                           mode='include'),
        FilteredQrels(subsets['train'].qrels_handler(),
                      split200,
                      mode='include'),
        FilteredDocPairs(subsets['train'].docpairs_handler(),
                         split200,
                         mode='include'),
        subsets['train'],
    )

    # Medical subset
    def train_med():
        with dlc['medmarco_ids'].stream() as stream:
            stream = codecs.getreader('utf8')(stream)
            return {l.rstrip() for l in stream}

    train_med = Lazy(train_med)
    subsets['train/medical'] = Dataset(
        FilteredQueries(subsets['train'].queries_handler(), train_med),
        FilteredScoredDocs(subsets['train'].scoreddocs_handler(), train_med),
        FilteredDocPairs(subsets['train'].docpairs_handler(), train_med),
        FilteredQrels(subsets['train'].qrels_handler(), train_med),
        subsets['train'],
    )

    # DL-Hard
    dl_hard_qrels_migrator = Migrator(
        base_path / 'trec-dl-hard' / 'irds_version.txt',
        'v3',
        affected_files=[base_path / 'trec-dl-hard' / 'qrels'],
        message='Updating trec-dl-hard qrels')
    hard_qids = Lazy(lambda: DL_HARD_QIDS)
    dl_hard_base_queries = TsvQueries([
        Cache(GzipExtract(dlc['trec-dl-2019/queries']),
              base_path / 'trec-dl-2019/queries.tsv'),
        Cache(GzipExtract(dlc['trec-dl-2020/queries']),
              base_path / 'trec-dl-2020/queries.tsv')
    ],
                                      namespace='msmarco',
                                      lang='en')
    subsets['trec-dl-hard'] = Dataset(
        collection, FilteredQueries(dl_hard_base_queries, hard_qids),
        dl_hard_qrels_migrator(
            TrecQrels(dlc['trec-dl-hard/qrels'], TREC_DL_QRELS_DEFS)),
        documentation('trec-dl-hard'))
    hard_qids = Lazy(lambda: DL_HARD_QIDS_BYFOLD['1'])
    subsets['trec-dl-hard/fold1'] = Dataset(
        collection, FilteredQueries(dl_hard_base_queries, hard_qids),
        FilteredQrels(subsets['trec-dl-hard'], hard_qids),
        documentation('trec-dl-hard/fold1'))
    hard_qids = Lazy(lambda: DL_HARD_QIDS_BYFOLD['2'])
    subsets['trec-dl-hard/fold2'] = Dataset(
        collection, FilteredQueries(dl_hard_base_queries, hard_qids),
        FilteredQrels(subsets['trec-dl-hard'], hard_qids),
        documentation('trec-dl-hard/fold2'))
    hard_qids = Lazy(lambda: DL_HARD_QIDS_BYFOLD['3'])
    subsets['trec-dl-hard/fold3'] = Dataset(
        collection, FilteredQueries(dl_hard_base_queries, hard_qids),
        FilteredQrels(subsets['trec-dl-hard'], hard_qids),
        documentation('trec-dl-hard/fold3'))
    hard_qids = Lazy(lambda: DL_HARD_QIDS_BYFOLD['4'])
    subsets['trec-dl-hard/fold4'] = Dataset(
        collection, FilteredQueries(dl_hard_base_queries, hard_qids),
        FilteredQrels(subsets['trec-dl-hard'], hard_qids),
        documentation('trec-dl-hard/fold4'))
    hard_qids = Lazy(lambda: DL_HARD_QIDS_BYFOLD['5'])
    subsets['trec-dl-hard/fold5'] = Dataset(
        collection, FilteredQueries(dl_hard_base_queries, hard_qids),
        FilteredQrels(subsets['trec-dl-hard'], hard_qids),
        documentation('trec-dl-hard/fold5'))

    ir_datasets.registry.register(NAME, Dataset(collection,
                                                documentation('_')))
    for s in sorted(subsets):
        ir_datasets.registry.register(f'{NAME}/{s}',
                                      Dataset(subsets[s], documentation(s)))

    return collection, subsets
Example #23
0
def _init():
    base_path = ir_datasets.util.home_path() / NAME
    documentation = YamlDocumentation(f'docs/{NAME}.yaml')
    dlc = DownloadConfig.context(NAME, base_path, dua=DUA)
    subsets = {}
    collection = MsMarcoV2Docs(dlc['docs'])

    subsets['train'] = Dataset(
        collection,
        TsvQueries(dlc['train_queries'], namespace='msmarco', lang='en'),
        TrecQrels(dlc['train_qrels'], QRELS_DEFS),
        TrecScoredDocs(GzipExtract(dlc['train_scoreddocs'])),
    )
    subsets['dev1'] = Dataset(
        collection,
        TsvQueries(dlc['dev1_queries'], namespace='msmarco', lang='en'),
        TrecQrels(dlc['dev1_qrels'], QRELS_DEFS),
        TrecScoredDocs(GzipExtract(dlc['dev1_scoreddocs'])),
    )
    subsets['dev2'] = Dataset(
        collection,
        TsvQueries(dlc['dev2_queries'], namespace='msmarco', lang='en'),
        TrecQrels(dlc['dev2_qrels'], QRELS_DEFS),
        TrecScoredDocs(GzipExtract(dlc['dev2_scoreddocs'])),
    )
    subsets['trec-dl-2019'] = Dataset(
        collection,
        TsvQueries(GzipExtract(dlc['trec-dl-2019/queries']),
                   namespace='msmarco',
                   lang='en'),
        TrecQrels(GzipExtract(dlc['trec_dl_2019_qrels']), TREC_DL_QRELS_DEFS),
    )
    subsets['trec-dl-2020'] = Dataset(
        collection,
        TsvQueries(GzipExtract(dlc['trec-dl-2020/queries']),
                   namespace='msmarco',
                   lang='en'),
        TrecQrels(GzipExtract(dlc['trec_dl_2020_qrels']), TREC_DL_QRELS_DEFS),
    )
    dl19_v2_judged = Lazy(
        lambda: {q.query_id
                 for q in subsets['trec-dl-2019'].qrels_iter()})
    subsets['trec-dl-2019/judged'] = Dataset(
        FilteredQueries(subsets['trec-dl-2019'].queries_handler(),
                        dl19_v2_judged),
        subsets['trec-dl-2019'],
    )
    dl20_v2_judged = Lazy(
        lambda: {q.query_id
                 for q in subsets['trec-dl-2020'].qrels_iter()})
    subsets['trec-dl-2020/judged'] = Dataset(
        FilteredQueries(subsets['trec-dl-2020'].queries_handler(),
                        dl20_v2_judged),
        subsets['trec-dl-2020'],
    )
    subsets['trec-dl-2021'] = Dataset(
        collection,
        TsvQueries(dlc['trec-dl-2021/queries'], namespace='msmarco',
                   lang='en'),
        TrecQrels(dlc['trec-dl-2021/qrels'], TREC_DL_QRELS_DEFS),
        TrecScoredDocs(GzipExtract(dlc['trec-dl-2021/scoreddocs'])),
    )
    dl21_judged = Lazy(
        lambda: {q.query_id
                 for q in subsets['trec-dl-2021'].qrels_iter()})
    subsets['trec-dl-2021/judged'] = Dataset(
        FilteredQueries(subsets['trec-dl-2021'].queries_handler(),
                        dl21_judged),
        FilteredScoredDocs(subsets['trec-dl-2021'].scoreddocs_handler(),
                           dl21_judged),
        subsets['trec-dl-2021'],
    )

    subsets['anchor-text'] = Dataset(
        MsMarcoV2AnchorTextDocs(Cache(GzipExtract(dlc['anchor-text']),
                                      base_path / "anchor-text.json"),
                                count_hint=4821244),
        documentation('anchor-text'))

    ir_datasets.registry.register(NAME, Dataset(collection,
                                                documentation("_")))
    for s in sorted(subsets):
        ir_datasets.registry.register(f'{NAME}/{s}',
                                      Dataset(subsets[s], documentation(s)))

    return collection, subsets