def _init():
    """Construct and register the base dataset and its 'trec5'/'trec6' subsets.

    Returns:
        tuple: ``(base, subsets)`` where ``base`` wraps the document
        collection and ``subsets`` maps each subset name to its ``Dataset``.
    """
    subsets = {}
    documentation = YamlDocumentation(f'docs/{NAME}.yaml')
    root = ir_datasets.util.home_path() / NAME
    dlc = DownloadConfig.context(NAME, root)

    docs = TrecDocs(
        dlc['docs'],
        encoding='GB18030',
        path_globs=['**/xinhua/x*', '**/peoples-daily/pd*'],
        namespace=NAME,
        lang='zh',
        count_hint=ir_datasets.util.count_hint(NAME),
    )
    base = Dataset(docs, documentation('_'))

    # Options shared by both query sets; lang=None because the queries have
    # multiple languages.
    query_options = {
        'qtype': TrecMandarinQuery,
        'qtype_map': QTYPE_MAP,
        'encoding': 'GBK',
        'namespace': NAME,
        'lang': None,
    }

    trec5_queries = TrecQueries(GzipExtract(dlc['trec5/queries']), **query_options)
    trec5_qrels = TrecQrels(GzipExtract(dlc['trec5/qrels']), QREL_DEFS)
    subsets['trec5'] = Dataset(trec5_queries, trec5_qrels, docs, documentation('trec5'))

    trec6_queries = TrecQueries(GzipExtract(dlc['trec6/queries']), **query_options)
    trec6_qrels = TrecQrels(GzipExtract(dlc['trec6/qrels']), QREL_DEFS)
    subsets['trec6'] = Dataset(trec6_queries, trec6_qrels, docs, documentation('trec6'))

    ir_datasets.registry.register(NAME, base)
    for name in sorted(subsets.keys()):
        ir_datasets.registry.register(f'{NAME}/{name}', subsets[name])

    return base, subsets
def _init():
    """Construct and register the base dataset and its 'train'/'eval' subsets.

    Returns:
        tuple: ``(base, subsets)`` where ``base`` wraps the document
        collection and ``subsets`` maps each subset name to its ``Dataset``.
    """
    root = ir_datasets.util.home_path() / NAME
    dlc = ir_datasets.util.DownloadConfig.context(NAME, root)
    documentation = YamlDocumentation(f'docs/{NAME}.yaml')

    docs_source = GzipExtract(dlc["docs"])
    metadata_source = GzipExtract(dlc["metadata"])
    docs = FairTrecDocs(docs_source, metadata_source)
    base = Dataset(docs, documentation('_'))

    subsets = {}

    # The training topics file feeds both the queries and the qrels.
    train_topics = GzipExtract(dlc["train/topics"])
    train_queries = FairTrecQueries(train_topics, FairTrecQuery)
    train_qrels = FairTrecQrels(train_topics)
    subsets['train'] = Dataset(docs, train_queries, train_qrels, documentation('train'))

    eval_queries = FairTrecQueries(GzipExtract(dlc['eval/topics']), FairTrecEvalQuery)
    subsets['eval'] = Dataset(docs, eval_queries, documentation('eval'))

    ir_datasets.registry.register(NAME, base)
    # Subsets are registered in insertion order (no sorting here).
    for name, dataset in subsets.items():
        ir_datasets.registry.register(f'{NAME}/{name}', dataset)

    return base, subsets
def _init():
    """Construct and register the base dataset and its four QA subsets.

    Each subset gets its own ``DprW100Manager`` that supplies the
    ``queries.tsv`` and ``qrels`` file references.

    Returns:
        tuple: ``(base, subsets)`` where ``base`` wraps the document
        collection and ``subsets`` maps each subset name to its ``Dataset``.
    """
    home = ir_datasets.util.home_path() / NAME
    dlc = ir_datasets.util.DownloadConfig.context(NAME, home)
    documentation = YamlDocumentation(f'docs/{NAME}.yaml')

    docs = TsvDocs(
        GzipExtract(dlc['docs']),
        doc_cls=DprW100Doc,
        namespace=NAME,
        lang='en',
        skip_first_line=True,
        docstore_size_hint=12827215492,
        count_hint=ir_datasets.util.count_hint(NAME),
    )
    base = Dataset(docs, documentation('_'))

    def _qa_dataset(manager, subset_name):
        # Queries and qrels are both file references handed out by the manager.
        queries = DprW100Queries(manager.file_ref('queries.tsv'))
        qrels = TrecQrels(manager.file_ref('qrels'), QREL_DEFS)
        return Dataset(docs, queries, qrels, documentation(subset_name))

    subsets = {}

    nq_dev = DprW100Manager(GzipExtract(dlc['nq-dev']), home / 'nq-dev')
    subsets['natural-questions/dev'] = _qa_dataset(nq_dev, 'natural-questions/dev')

    nq_train = DprW100Manager(GzipExtract(dlc['nq-train']), home / 'nq-train')
    subsets['natural-questions/train'] = _qa_dataset(nq_train, 'natural-questions/train')

    # The trivia-qa managers are given a different passage id key ('psg_id').
    tqa_dev = DprW100Manager(
        GzipExtract(dlc['tqa-dev']), home / 'tqa-dev', passage_id_key='psg_id')
    subsets['trivia-qa/dev'] = _qa_dataset(tqa_dev, 'trivia-qa/dev')

    tqa_train = DprW100Manager(
        GzipExtract(dlc['tqa-train']), home / 'tqa-train', passage_id_key='psg_id')
    subsets['trivia-qa/train'] = _qa_dataset(tqa_train, 'trivia-qa/train')

    ir_datasets.registry.register(NAME, base)
    for name in sorted(subsets.keys()):
        ir_datasets.registry.register(f'{NAME}/{name}', subsets[name])

    return base, subsets
def _init():
    """Construct and register the passage collection and its subsets.

    The collection and the train/dev1/dev2 qrels are wrapped in ``Migrator``
    objects. Documentation is attached at registration time rather than
    stored in the returned subsets.

    Returns:
        tuple: ``(collection, subsets)`` where ``subsets`` maps each subset
        name to its (undocumented) ``Dataset``.
    """
    root = ir_datasets.util.home_path() / NAME
    documentation = YamlDocumentation(f'docs/{NAME}.yaml')
    dlc = DownloadConfig.context(NAME, root, dua=DUA)
    subsets = {}

    migrator = Migrator(
        root / 'irds_version.txt',
        'v2',
        affected_files=[root / 'msmarco_v2_passage.tar.pklz4'],
        message='Cleaning up pklz4 lookup structure in favor of ID-based lookups',
    )
    collection = migrator(MsMarcoV2Passages(dlc['passages']))

    qrels_migrator = Migrator(
        root / 'qrels_version.txt',
        'v2',
        affected_files=[
            root / 'train' / 'qrels.tsv',
            root / 'dev1' / 'qrels.tsv',
            root / 'dev2' / 'qrels.tsv',
        ],
        message='Updating qrels (task organizers removed duplicates)',
    )

    def _queries(key):
        return TsvQueries(dlc[key], namespace='msmarco', lang='en')

    def _scoreddocs(key):
        return TrecScoredDocs(GzipExtract(dlc[key]))

    def _migrated_qrels(key):
        return qrels_migrator(TrecQrels(dlc[key], QRELS_DEFS))

    subsets['train'] = Dataset(
        collection,
        _queries('train/queries'),
        _migrated_qrels('train/qrels'),
        _scoreddocs('train/scoreddocs'),
    )
    subsets['dev1'] = Dataset(
        collection,
        _queries('dev1/queries'),
        _migrated_qrels('dev1/qrels'),
        _scoreddocs('dev1/scoreddocs'),
    )
    subsets['dev2'] = Dataset(
        collection,
        _queries('dev2/queries'),
        _migrated_qrels('dev2/qrels'),
        _scoreddocs('dev2/scoreddocs'),
    )
    # The TREC DL 2021 qrels are not passed through the qrels migrator.
    subsets['trec-dl-2021'] = Dataset(
        collection,
        _queries('trec-dl-2021/queries'),
        TrecQrels(dlc['trec-dl-2021/qrels'], TREC_DL_QRELS_DEFS),
        _scoreddocs('trec-dl-2021/scoreddocs'),
    )

    def _dl21_judged_ids():
        # Looked up through `subsets` when the Lazy value is first needed.
        return {qrel.query_id for qrel in subsets['trec-dl-2021'].qrels_iter()}

    dl21_judged = Lazy(_dl21_judged_ids)
    subsets['trec-dl-2021/judged'] = Dataset(
        FilteredQueries(subsets['trec-dl-2021'].queries_handler(), dl21_judged),
        FilteredScoredDocs(subsets['trec-dl-2021'].scoreddocs_handler(), dl21_judged),
        subsets['trec-dl-2021'],
    )

    ir_datasets.registry.register(NAME, Dataset(collection, documentation("_")))
    for name in sorted(subsets.keys()):
        documented = Dataset(subsets[name], documentation(name))
        ir_datasets.registry.register(f'{NAME}/{name}', documented)

    return collection, subsets
def _init():
    """Construct and register the 2004 and 2017 collections and their subsets.

    The base dataset carries documentation only; documents live on the
    '2004' and '2017' subsets.

    Returns:
        tuple: ``(base, subsets)`` where ``subsets`` maps each subset name to
        its ``Dataset``.
    """
    documentation = YamlDocumentation(f'docs/{NAME}.yaml')
    root = ir_datasets.util.home_path() / NAME
    dlc = DownloadConfig.context(NAME, root)
    subsets = {}
    base = Dataset(documentation('_'))

    # --- 2004 collection -------------------------------------------------
    sources_2004 = [
        GzipExtract(dlc[key])
        for key in ('2004/a', '2004/b', '2004/c', '2004/d')
    ]
    docs_2004 = MedlineDocs(
        '2004', sources_2004, count_hint=ir_datasets.util.count_hint(f'{NAME}/2004'))
    subsets['2004'] = Dataset(docs_2004, documentation('2004'))

    genomics04_queries = TrecXmlQueries(
        ZipExtract(dlc['trec-genomics-2004/queries'], 'Official.xml'),
        qtype=TrecGenomicsQuery,
        qtype_map=TREC04_XML_MAP,
        namespace='trec-genomics',
        lang='en',
    )
    genomics04_qrels = TrecQrels(dlc['trec-genomics-2004/qrels'], QREL_DEFS)
    subsets['2004/trec-genomics-2004'] = Dataset(
        docs_2004, genomics04_queries, genomics04_qrels,
        documentation('trec-genomics-2004'))

    genomics05_queries = TrecGenomicsQueries(dlc['trec-genomics-2005/queries'])
    genomics05_qrels = TrecQrels(dlc['trec-genomics-2005/qrels'], QREL_DEFS)
    subsets['2004/trec-genomics-2005'] = Dataset(
        docs_2004, genomics05_queries, genomics05_qrels,
        documentation('trec-genomics-2005'))

    # --- 2017 collection -------------------------------------------------
    extra_2017 = AacrAscoDocs(dlc['2017/aacr_asco_extra'])
    parts_2017 = [
        dlc[key]
        for key in ('2017/part1', '2017/part2', '2017/part3', '2017/part4', '2017/part5')
    ]
    medline_2017 = MedlineDocs('2017', parts_2017)
    docs_2017 = ConcatDocs(
        [extra_2017, medline_2017],
        count_hint=ir_datasets.util.count_hint(f'{NAME}/2017'))
    subsets['2017'] = Dataset(docs_2017, documentation('2017'))

    pm17_queries = TrecXmlQueries(
        dlc['trec-pm-2017/queries'],
        qtype=TrecPm2017Query, namespace='trec-pm-2017', lang='en')
    pm17_qrels = TrecQrels(dlc['trec-pm-2017/qrels'], QREL_DEFS)
    subsets['2017/trec-pm-2017'] = Dataset(
        docs_2017, pm17_queries, pm17_qrels, documentation('trec-pm-2017'))

    pm18_queries = TrecXmlQueries(
        dlc['trec-pm-2018/queries'],
        qtype=TrecPmQuery, namespace='trec-pm-2018', lang='en')
    pm18_qrels = TrecQrels(dlc['trec-pm-2018/qrels'], QREL_DEFS)
    subsets['2017/trec-pm-2018'] = Dataset(
        docs_2017, pm18_queries, pm18_qrels, documentation('trec-pm-2018'))

    ir_datasets.registry.register(NAME, base)
    for name in sorted(subsets.keys()):
        ir_datasets.registry.register(f'{NAME}/{name}', subsets[name])

    return base, subsets
def _init():
    """Construct and register the 'en-noclean-tr' collection and its subsets.

    The base dataset carries documentation only.

    Returns:
        tuple: ``(base, subsets)`` where ``subsets`` maps each subset name to
        its ``Dataset``.
    """
    subsets = {}
    root = ir_datasets.util.home_path() / NAME
    dlc = DownloadConfig.context(NAME, root)
    documentation = YamlDocumentation(f'docs/{NAME}.yaml')

    sources = GzipExtract(dlc['en-noclean/sources'])
    checkpoints = TarExtractAll(dlc['en-noclean/checkpoints'], root / 'en.noclean.checkpoints')
    # The source-name filter excludes validation files (only train is included).
    train_docs = C4Docs(
        sources,
        checkpoints,
        root,
        source_name_filter=r'en\.noclean\.c4-train',
        filter_name='train',
    )

    base = Dataset(documentation('_'))
    subsets['en-noclean-tr'] = Dataset(train_docs, documentation('en-noclean-tr'))

    misinfo_queries = TrecXmlQueries(
        dlc['trec-misinfo-2021/queries'],
        qtype=MisinfoQuery,
        qtype_map=misinfo_map,
        namespace='trec-misinfo',
        lang='en',
    )
    subsets['en-noclean-tr/trec-misinfo-2021'] = Dataset(
        train_docs, misinfo_queries, documentation('en-noclean-tr/trec-misinfo-2021'))

    ir_datasets.registry.register(NAME, base)
    # Subsets are registered in insertion order (no sorting here).
    for name, dataset in subsets.items():
        ir_datasets.registry.register(f'{NAME}/{name}', dataset)

    return base, subsets
def _init():
    """Construct and register the base dataset plus one subset per fold.

    Each fold in ``FOLDS`` yields a subset whose queries and qrels are the
    base ones restricted by ``make_filter(fold)``.

    Returns:
        tuple: ``(base, subsets)`` where ``subsets`` maps each fold to its
        ``Dataset``.
    """
    documentation = YamlDocumentation(f'docs/{NAME}.yaml')
    root = ir_datasets.util.home_path() / NAME
    dlc = DownloadConfig.context(NAME, root, dua=DUA)

    docs = TrecDocs(
        dlc['docs'],
        path_globs=['**/FBIS/FB*', '**/FR94/??/FR*', '**/FT/*/FT*', '**/LATIMES/LA*'],
        namespace=NAME,
        lang='en',
        expected_file_count=2295,
        count_hint=ir_datasets.util.count_hint(NAME),
    )
    queries = TrecQueries(GzipExtract(dlc['queries']), namespace=NAME, lang='en')
    qrels = TrecQrels(dlc['qrels'], QREL_DEFS)
    base = Dataset(docs, queries, qrels, documentation('_'))

    def _fold_dataset(fold):
        # The same filter object is shared by the fold's queries and qrels.
        keep = make_filter(fold)
        return Dataset(
            FilteredQueries(queries, keep),
            FilteredQrels(qrels, keep),
            docs,
            documentation(fold),
        )

    subsets = {fold: _fold_dataset(fold) for fold in FOLDS}

    ir_datasets.registry.register(NAME, base)
    for name in sorted(subsets.keys()):
        ir_datasets.registry.register(f'{NAME}/{name}', subsets[name])

    return base, subsets
def _docs_initializer(lang_code):
    """Return the docs handler for ``lang_code``, creating it on first use.

    Handlers are memoized in ``_docs_cache`` so repeated requests for the same
    language share one ``TsvDocs`` instance.
    """
    if lang_code in _docs_cache:
        return _docs_cache[lang_code]

    context = _dlc().context("clirmatrix_docs", base_path)
    source = GzipExtract(context[f'docs/{lang_code}'])
    _docs_cache[lang_code] = TsvDocs(source, namespace=f'{NAME}/{lang_code}', lang=lang_code)
    return _docs_cache[lang_code]
def _init():
    """Construct and register the base dataset and its TREC Web subsets.

    Returns:
        tuple: ``(base, subsets)`` where ``base`` wraps the document
        collection and ``subsets`` maps each subset name to its ``Dataset``.
    """
    documentation = YamlDocumentation(f'docs/{NAME}.yaml')
    root = ir_datasets.util.home_path() / NAME
    dlc = DownloadConfig.context(NAME, root)
    subsets = {}

    docs = GovDocs(dlc['docs'])
    base = Dataset(docs, documentation('_'))

    # 2002 files are gzip-compressed; the 2003/2004 files are passed as-is.
    web02_queries = TrecQueries(
        GzipExtract(dlc['trec-web-2002/queries']),
        namespace='gov/trec-web-2002', lang='en')
    web02_qrels = TrecQrels(GzipExtract(dlc['trec-web-2002/qrels']), QREL_DEFS)
    subsets['trec-web-2002'] = Dataset(
        docs, web02_queries, web02_qrels, documentation('trec-web-2002'))

    web02_np_queries = TrecQueries(
        GzipExtract(dlc['trec-web-2002/named-page/queries']),
        qtype=GenericQuery,
        qtype_map=NAMED_PAGE_QTYPE_MAP,
        namespace='gov/trec-web-2002/named-page',
        lang='en',
    )
    web02_np_qrels = TrecQrels(
        GzipExtract(dlc['trec-web-2002/named-page/qrels']), NAMED_PAGE_QREL_DEFS)
    subsets['trec-web-2002/named-page'] = Dataset(
        docs, web02_np_queries, web02_np_qrels, documentation('trec-web-2002/named-page'))

    web03_queries = TrecQueries(
        dlc['trec-web-2003/queries'],
        qtype=GovWeb02Query,
        qtype_map=WEB03_QTYPE_MAP,
        namespace='gov/trec-web-2003',
        lang='en',
    )
    web03_qrels = TrecQrels(dlc['trec-web-2003/qrels'], QREL_DEFS)
    subsets['trec-web-2003'] = Dataset(
        docs, web03_queries, web03_qrels, documentation('trec-web-2003'))

    web03_np_queries = TrecQueries(
        dlc['trec-web-2003/named-page/queries'],
        qtype=GenericQuery,
        qtype_map=NAMED_PAGE_QTYPE_MAP,
        namespace='gov/trec-web-2003/named-page',
        lang='en',
    )
    web03_np_qrels = TrecQrels(dlc['trec-web-2003/named-page/qrels'], NAMED_PAGE_QREL_DEFS)
    subsets['trec-web-2003/named-page'] = Dataset(
        docs, web03_np_queries, web03_np_qrels, documentation('trec-web-2003/named-page'))

    web04_queries = TrecQueries(
        dlc['trec-web-2004/queries'],
        qtype=GenericQuery,
        qtype_map=WEB04_QTYPE_MAP,
        namespace='gov/trec-web-2004',
        lang='en',
    )
    web04_qrels = TrecQrels(dlc['trec-web-2004/qrels'], QREL_DEFS)
    subsets['trec-web-2004'] = Dataset(
        docs, web04_queries, web04_qrels, documentation('trec-web-2004'))

    ir_datasets.registry.register(NAME, base)
    for name in sorted(subsets.keys()):
        ir_datasets.registry.register(f'{NAME}/{name}', subsets[name])

    return base, subsets
def _init():
    """Construct and register the QnA collection and its train/dev/eval subsets.

    A single ``Migrator`` wraps the collection as well as every qrels and
    scored-docs component. Documentation is attached at registration time
    rather than stored in the returned subsets.

    Returns:
        tuple: ``(collection, subsets)`` where ``subsets`` maps each subset
        name to its (undocumented) ``Dataset``.
    """
    root = ir_datasets.util.home_path() / NAME
    dlc = DownloadConfig.context(NAME, root, dua=DUA)
    documentation = YamlDocumentation(f'docs/{NAME}.yaml')

    manager = MsMarcoQnAManager(
        GzipExtract(dlc['train']),
        GzipExtract(dlc['dev']),
        GzipExtract(dlc['eval']),
        root,
    )
    migrator = Migrator(
        root / 'irds_version.txt',
        'v2',
        affected_files=[
            root / 'docs.pklz4',
            root / 'train.run',
            root / 'train.qrels',
            root / 'dev.run',
            root / 'dev.qrels',
            root / 'eval.run',
        ],
        message='Migrating msmarco-qna (correcting doc_ids)',
    )

    collection = DocstoreBackedDocs(
        manager.docs_store, docs_cls=MsMarcoQnADoc, namespace=NAME, lang='en')
    collection = migrator(collection)

    def _queries(file_name, query_cls):
        return TsvQueries(
            manager.file_ref(file_name), query_cls=query_cls, namespace='msmarco', lang='en')

    def _qrels(file_name):
        return migrator(TrecQrels(manager.file_ref(file_name), QRELS_DEFS))

    def _scoreddocs(file_name):
        return migrator(TrecScoredDocs(manager.file_ref(file_name)))

    subsets = {}
    subsets['train'] = Dataset(
        collection,
        _queries('train.queries.tsv', MsMarcoQnAQuery),
        _qrels('train.qrels'),
        _scoreddocs('train.run'),
    )
    subsets['dev'] = Dataset(
        collection,
        _queries('dev.queries.tsv', MsMarcoQnAQuery),
        _qrels('dev.qrels'),
        _scoreddocs('dev.run'),
    )
    # The eval subset has no qrels component and uses a different query class.
    subsets['eval'] = Dataset(
        collection,
        _queries('eval.queries.tsv', MsMarcoQnAEvalQuery),
        _scoreddocs('eval.run'),
    )

    ir_datasets.registry.register(NAME, Dataset(collection, documentation('_')))
    for name in sorted(subsets.keys()):
        documented = Dataset(subsets[name], documentation(name))
        ir_datasets.registry.register(f'{NAME}/{name}', documented)

    return collection, subsets
def _init():
    """Construct and register the base dataset and its 'trec3'/'trec4' subsets.

    Both query sets are read with ``TrecQueries`` and then wrapped in
    ``TrecSpanishTranslateQueries``.

    Returns:
        tuple: ``(base, subsets)`` where ``base`` wraps the document
        collection and ``subsets`` maps each subset name to its ``Dataset``.
    """
    subsets = {}
    root = ir_datasets.util.home_path() / NAME
    dlc = DownloadConfig.context(NAME, root)
    documentation = YamlDocumentation(f'docs/{NAME}.yaml')

    docs = TrecDocs(
        dlc['docs'],
        encoding='ISO-8859-1',
        path_globs=['**/afp_text/af*', '**/infosel_data/ism_*'],
        namespace=NAME,
        lang='es',
        count_hint=ir_datasets.util.count_hint(NAME),
    )
    base = Dataset(docs, documentation('_'))

    trec3_raw = TrecQueries(
        GzipExtract(dlc['trec3/queries']),
        qtype_map=QTYPE_MAP_3,
        encoding='ISO-8859-1',
        namespace=NAME,
        lang=None,
    )
    trec3_queries = TrecSpanishTranslateQueries(trec3_raw, TrecSpanish3Query)
    trec3_qrels = TrecQrels(GzipExtract(dlc['trec3/qrels']), QREL_DEFS)
    subsets['trec3'] = Dataset(trec3_queries, trec3_qrels, docs, documentation('trec3'))

    # Unlike trec3, the trec4 topics are read with an explicit query type.
    trec4_raw = TrecQueries(
        GzipExtract(dlc['trec4/queries']),
        qtype=TrecDescOnlyQuery,
        qtype_map=QTYPE_MAP_4,
        encoding='ISO-8859-1',
        namespace=NAME,
        lang=None,
    )
    trec4_queries = TrecSpanishTranslateQueries(trec4_raw, TrecSpanish4Query)
    trec4_qrels = TrecQrels(GzipExtract(dlc['trec4/qrels']), QREL_DEFS)
    subsets['trec4'] = Dataset(trec4_queries, trec4_qrels, docs, documentation('trec4'))

    ir_datasets.registry.register(NAME, base)
    for name in sorted(subsets.keys()):
        ir_datasets.registry.register(f'{NAME}/{name}', subsets[name])

    return base, subsets
def _initializer(dsid, args, dlc_context=None):
    """Assemble the dataset identified by ``dsid`` from its parsed arguments.

    Args:
        dsid: Dataset identifier, forwarded to ``MetadataComponent``.
        args: A 3-item sequence ``(docs_lang, queries_lang, split)``. When
            ``queries_lang`` is falsy, only the docs component is included.
        dlc_context: Name passed to ``_dlc().context`` when locating the
            query/qrel download.

    Returns:
        A ``Dataset`` made of a ``MetadataComponent`` followed by the
        assembled components.
    """
    docs_lang, queries_lang, split = args
    parts = [_docs_initializer(docs_lang)]

    # Queries & split are optional.
    if queries_lang:
        context = _dlc().context(dlc_context, base_path)
        key = f'queries/{queries_lang}_{docs_lang}/{split}'
        # A single extracted file backs both the qrels and the queries.
        source = GzipExtract(context[key])
        qrels = CLIRMatrixQrels(source, QRELS_DEFS)
        queries = CLIRMatrixQueries(source, queries_lang)
        parts.extend([queries, qrels])

    inner = Dataset(*parts)
    return Dataset(MetadataComponent(dsid, inner, metadata), inner)
def _init():
    """Construct and register the base dataset and its two genomics subsets.

    Returns:
        tuple: ``(base, subsets)`` where ``base`` wraps the document
        collection and ``subsets`` maps each subset name to its ``Dataset``.
    """
    documentation = YamlDocumentation(f'docs/{NAME}.yaml')
    root = ir_datasets.util.home_path() / NAME
    dlc = DownloadConfig.context(NAME, root)
    subsets = {}

    doc_sources = [
        GzipExtract(dlc[key])
        for key in ('docs/a', 'docs/b', 'docs/c', 'docs/d')
    ]
    docs = MedlineDocs(doc_sources)
    base = Dataset(docs, documentation('_'))

    queries_2004 = TrecXmlQueries(
        ZipExtract(dlc['trec-genomics-2004/queries'], 'Official.xml'),
        qtype=TrecGenomicsQuery,
        qtype_map=TREC04_XML_MAP,
        namespace='trec-genomics',
    )
    qrels_2004 = TrecQrels(dlc['trec-genomics-2004/qrels'], QREL_DEFS)
    subsets['trec-genomics-2004'] = Dataset(
        docs, queries_2004, qrels_2004, documentation('trec-genomics-2004'))

    queries_2005 = TrecGenomicsQueries(dlc['trec-genomics-2005/queries'])
    qrels_2005 = TrecQrels(dlc['trec-genomics-2005/qrels'], QREL_DEFS)
    subsets['trec-genomics-2005'] = Dataset(
        docs, queries_2005, qrels_2005, documentation('trec-genomics-2005'))

    ir_datasets.registry.register(NAME, base)
    for name in sorted(subsets.keys()):
        ir_datasets.registry.register(f'{NAME}/{name}', subsets[name])

    return base, subsets
def _init():
    """Construct and register the full collection, per-language collections,
    and the TREC Web / Million Query subsets.

    Returns:
        tuple: ``(base, subsets)`` where ``base`` wraps the unrestricted
        collection and ``subsets`` maps each subset name to its ``Dataset``.
    """
    documentation = YamlDocumentation(f'docs/{NAME}.yaml')
    root = ir_datasets.util.home_path() / NAME
    dlc = DownloadConfig.context(NAME, root)
    subsets = {}

    docs_dlc = dlc['docs']
    chk_dlc = TarExtractAll(dlc['docs.chk'], root / 'corpus.chk')

    def _docs_for(dirs, lang):
        # A view of the corpus restricted to the given directories.
        return ClueWeb09Docs(docs_dlc, chk_dlc, dirs=dirs, lang=lang)

    # No `dirs` restriction; lang=None because it spans multiple langs.
    docs_all = ClueWeb09Docs(docs_dlc, chk_dlc, lang=None)
    docs_ar = _docs_for(['ClueWeb09_Arabic_1'], 'ar')
    docs_zh = _docs_for(
        ['ClueWeb09_Chinese_1', 'ClueWeb09_Chinese_2', 'ClueWeb09_Chinese_3', 'ClueWeb09_Chinese_4'],
        'zh')
    docs_en = _docs_for(
        [
            'ClueWeb09_English_1', 'ClueWeb09_English_2', 'ClueWeb09_English_3',
            'ClueWeb09_English_4', 'ClueWeb09_English_5', 'ClueWeb09_English_6',
            'ClueWeb09_English_7', 'ClueWeb09_English_8', 'ClueWeb09_English_9',
            'ClueWeb09_English_10',
        ],
        'en')
    docs_fr = _docs_for(['ClueWeb09_French_1'], 'fr')
    docs_de = _docs_for(['ClueWeb09_German_1'], 'de')
    docs_it = _docs_for(['ClueWeb09_Italian_1'], 'it')
    docs_ja = _docs_for(['ClueWeb09_Japanese_1', 'ClueWeb09_Japanese_2'], 'ja')
    docs_ko = _docs_for(['ClueWeb09_Korean_1'], 'ko')
    docs_pt = _docs_for(['ClueWeb09_Portuguese_1'], 'pt')
    docs_es = _docs_for(['ClueWeb09_Spanish_1', 'ClueWeb09_Spanish_2'], 'es')
    # 'catb' is limited to the first English directory.
    docs_catb = _docs_for(['ClueWeb09_English_1'], 'en')

    base = Dataset(docs_all, documentation('_'))

    subsets['ar'] = Dataset(docs_ar, documentation('ar'))
    subsets['zh'] = Dataset(docs_zh, documentation('zh'))
    subsets['en'] = Dataset(docs_en, documentation('en'))
    subsets['fr'] = Dataset(docs_fr, documentation('fr'))
    subsets['de'] = Dataset(docs_de, documentation('de'))
    subsets['it'] = Dataset(docs_it, documentation('it'))
    subsets['ja'] = Dataset(docs_ja, documentation('ja'))
    subsets['ko'] = Dataset(docs_ko, documentation('ko'))
    subsets['pt'] = Dataset(docs_pt, documentation('pt'))
    subsets['es'] = Dataset(docs_es, documentation('es'))
    subsets['catb'] = Dataset(docs_catb, documentation('catb'))

    def _web_queries(key):
        return TrecXmlQueries(dlc[key], qtype=TrecWebTrackQuery, namespace=NAME, lang='en')

    # English subsets. 2009 uses gzip-compressed prels with its own defs;
    # 2010-2012 use plain qrels.
    subsets['en/trec-web-2009'] = Dataset(
        docs_en,
        _web_queries('trec-web-2009/queries'),
        TrecPrels(GzipExtract(dlc['trec-web-2009/qrels.adhoc']), QREL_DEFS_09),
        documentation('trec-web-2009'))
    subsets['en/trec-web-2010'] = Dataset(
        docs_en,
        _web_queries('trec-web-2010/queries'),
        TrecQrels(dlc['trec-web-2010/qrels.adhoc'], QREL_DEFS),
        documentation('trec-web-2010'))
    subsets['en/trec-web-2011'] = Dataset(
        docs_en,
        _web_queries('trec-web-2011/queries'),
        TrecQrels(dlc['trec-web-2011/qrels.adhoc'], QREL_DEFS),
        documentation('trec-web-2011'))
    subsets['en/trec-web-2012'] = Dataset(
        docs_en,
        _web_queries('trec-web-2012/queries'),
        TrecQrels(dlc['trec-web-2012/qrels.adhoc'], QREL_DEFS),
        documentation('trec-web-2012'))

    # 'catb' subsets reuse the same files, with qrels wrapped in CatBQrelFilter.
    subsets['catb/trec-web-2009'] = Dataset(
        docs_catb,
        _web_queries('trec-web-2009/queries'),
        CatBQrelFilter(
            TrecPrels(GzipExtract(dlc['trec-web-2009/qrels.adhoc']), QREL_DEFS_09)),
        documentation('trec-web-2009'))
    subsets['catb/trec-web-2010'] = Dataset(
        docs_catb,
        _web_queries('trec-web-2010/queries'),
        CatBQrelFilter(TrecQrels(dlc['trec-web-2010/qrels.adhoc'], QREL_DEFS)),
        documentation('trec-web-2010'))
    subsets['catb/trec-web-2011'] = Dataset(
        docs_catb,
        _web_queries('trec-web-2011/queries'),
        CatBQrelFilter(TrecQrels(dlc['trec-web-2011/qrels.adhoc'], QREL_DEFS)),
        documentation('trec-web-2011'))
    subsets['catb/trec-web-2012'] = Dataset(
        docs_catb,
        _web_queries('trec-web-2012/queries'),
        CatBQrelFilter(TrecQrels(dlc['trec-web-2012/qrels.adhoc'], QREL_DEFS)),
        documentation('trec-web-2012'))

    mq09_queries = TrecColonQueries(
        GzipExtract(dlc['trec-mq-2009/queries']), encoding='latin1', lang='en')
    mq09_prels = TrecPrels(GzipExtract(dlc['trec-mq-2009/qrels']), QREL_DEFS_09)
    subsets['trec-mq-2009'] = Dataset(
        docs_all, mq09_queries, mq09_prels, documentation('trec-mq-2009'))

    ir_datasets.registry.register(NAME, base)
    for name in sorted(subsets.keys()):
        ir_datasets.registry.register(f'{NAME}/{name}', subsets[name])

    return base, subsets
def _init():
    """Construct and register the 'msmarco-document' collection and subsets.

    Documentation is attached at registration time rather than stored in the
    returned subsets.

    Returns:
        tuple: ``(collection, subsets)`` where ``subsets`` maps each subset
        name to its (undocumented) ``Dataset``.
    """
    root = ir_datasets.util.home_path() / 'msmarco-document'
    documentation = YamlDocumentation('docs/msmarco-document.yaml')
    dlc = DownloadConfig.context('msmarco-document', root, dua=DUA)
    subsets = {}

    collection = MsMarcoTrecDocs(GzipExtract(dlc['docs']))

    def _queries(key, namespace):
        return TsvQueries(GzipExtract(dlc[key]), namespace=namespace)

    def _gz_qrels(key, defs):
        return TrecQrels(GzipExtract(dlc[key]), defs)

    def _scoreddocs(key):
        return TrecScoredDocs(GzipExtract(dlc[key]))

    subsets['train'] = Dataset(
        collection,
        _queries('train/queries', 'msmarco'),
        _gz_qrels('train/qrels', QRELS_DEFS),
        _scoreddocs('train/scoreddocs'),
    )
    subsets['dev'] = Dataset(
        collection,
        _queries('dev/queries', 'msmarco'),
        _gz_qrels('dev/qrels', QRELS_DEFS),
        _scoreddocs('dev/scoreddocs'),
    )
    subsets['eval'] = Dataset(
        collection,
        _queries('eval/queries', 'msmarco'),
        _scoreddocs('eval/scoreddocs'),
    )
    # The TREC DL 2019 qrels are passed without gzip extraction.
    subsets['trec-dl-2019'] = Dataset(
        collection,
        _queries('trec-dl-2019/queries', 'msmarco'),
        TrecQrels(dlc['trec-dl-2019/qrels'], TREC_DL_QRELS_DEFS),
        _scoreddocs('trec-dl-2019/scoreddocs'),
    )
    subsets['trec-dl-2020'] = Dataset(
        collection,
        _queries('trec-dl-2020/queries', 'msmarco'),
        _scoreddocs('trec-dl-2020/scoreddocs'),
    )
    subsets['orcas'] = Dataset(
        collection,
        _queries('orcas/queries', 'orcas'),
        _gz_qrels('orcas/qrels', ORCAS_QLRES_DEFS),
        _scoreddocs('orcas/scoreddocs'),
    )

    def _dl19_judged_ids():
        # Looked up through `subsets` when the Lazy value is first needed.
        return {qrel.query_id for qrel in subsets['trec-dl-2019'].qrels_iter()}

    dl19_judged = Lazy(_dl19_judged_ids)
    subsets['trec-dl-2019/judged'] = Dataset(
        FilteredQueries(subsets['trec-dl-2019'].queries_handler(), dl19_judged),
        FilteredScoredDocs(subsets['trec-dl-2019'].scoreddocs_handler(), dl19_judged),
        subsets['trec-dl-2019'],
    )

    ir_datasets.registry.register('msmarco-document', Dataset(collection, documentation("_")))
    for name in sorted(subsets.keys()):
        documented = Dataset(subsets[name], documentation(name))
        ir_datasets.registry.register(f'msmarco-document/{name}', documented)

    return collection, subsets
def _init():
    """Construct and register the base dataset and its TREC Terabyte and
    Million Query subsets.

    Returns:
        tuple: ``(base, subsets)`` where ``base`` wraps the document
        collection and ``subsets`` maps each subset name to its ``Dataset``.
    """
    documentation = YamlDocumentation(f'docs/{NAME}.yaml')
    root = ir_datasets.util.home_path() / NAME
    dlc = DownloadConfig.context(NAME, root)
    subsets = {}

    docs_dlc = dlc['docs']
    doccount_dlc = Gov2DocCountFile(os.path.join(root, 'corpus.doccounts'), docs_dlc)
    docs = Gov2Docs(docs_dlc, doccount_dlc)
    base = Dataset(docs, documentation('_'))

    def _adhoc_queries(key):
        return TrecQueries(dlc[key], namespace=NAME, lang='en')

    def _named_page_queries(key):
        return TrecQueries(
            dlc[key],
            qtype=GenericQuery,
            qtype_map=NAMED_PAGE_QTYPE_MAP,
            namespace=NAME,
            lang='en',
        )

    def _eff06_queries(member):
        # All 2006 efficiency topic files are members of the same tar download.
        return TrecColonQueries(
            TarExtract(dlc['trec-tb-2006/efficiency/queries'], member),
            encoding='latin1',
            namespace=NAME,
            lang='en',
        )

    def _eff06_qrels():
        # The 2006 ad-hoc qrels with query ids rewritten via EFF_MAP_06.
        return RewriteQids(TrecQrels(dlc['trec-tb-2006/qrels'], QREL_DEFS), EFF_MAP_06)

    # --- TREC Terabyte 2004 ----------------------------------------------
    subsets['trec-tb-2004'] = Dataset(
        docs,
        _adhoc_queries('trec-tb-2004/queries'),
        TrecQrels(dlc['trec-tb-2004/qrels'], QREL_DEFS),
        documentation('trec-tb-2004'),
    )

    # --- TREC Terabyte 2005 ----------------------------------------------
    subsets['trec-tb-2005'] = Dataset(
        docs,
        _adhoc_queries('trec-tb-2005/queries'),
        TrecQrels(dlc['trec-tb-2005/qrels'], QREL_DEFS),
        documentation('trec-tb-2005'),
    )
    subsets['trec-tb-2005/named-page'] = Dataset(
        docs,
        _named_page_queries('trec-tb-2005/named-page/queries'),
        TrecQrels(dlc['trec-tb-2005/named-page/qrels'], NAMED_PAGE_QREL_DEFS),
        documentation('trec-tb-2005/named-page'),
    )
    subsets['trec-tb-2005/efficiency'] = Dataset(
        docs,
        TrecColonQueries(
            GzipExtract(dlc['trec-tb-2005/efficiency/queries']),
            encoding='latin1',
            namespace=NAME,
            lang='en',
        ),
        RewriteQids(TrecQrels(dlc['trec-tb-2005/qrels'], QREL_DEFS), EFF_MAP_05),
        documentation('trec-tb-2005/efficiency'),
    )

    # --- TREC Terabyte 2006 ----------------------------------------------
    subsets['trec-tb-2006'] = Dataset(
        docs,
        _adhoc_queries('trec-tb-2006/queries'),
        TrecQrels(dlc['trec-tb-2006/qrels'], QREL_DEFS),
        documentation('trec-tb-2006'),
    )
    subsets['trec-tb-2006/named-page'] = Dataset(
        docs,
        _named_page_queries('trec-tb-2006/named-page/queries'),
        TrecQrels(dlc['trec-tb-2006/named-page/qrels'], NAMED_PAGE_QREL_DEFS),
        documentation('trec-tb-2006/named-page'),
    )
    subsets['trec-tb-2006/efficiency'] = Dataset(
        docs,
        _eff06_queries('06.efficiency_topics.all'),
        _eff06_qrels(),
        documentation('trec-tb-2006/efficiency'),
    )
    subsets['trec-tb-2006/efficiency/10k'] = Dataset(
        docs,
        _eff06_queries('06.efficiency_topics.10k'),
        documentation('trec-tb-2006/efficiency/10k'),
    )
    subsets['trec-tb-2006/efficiency/stream1'] = Dataset(
        docs,
        _eff06_queries('06.efficiency_topics.stream-1'),
        documentation('trec-tb-2006/efficiency/stream1'),
    )
    subsets['trec-tb-2006/efficiency/stream2'] = Dataset(
        docs,
        _eff06_queries('06.efficiency_topics.stream-2'),
        documentation('trec-tb-2006/efficiency/stream2'),
    )
    # stream3 is the only stream that also carries (rewritten) qrels.
    subsets['trec-tb-2006/efficiency/stream3'] = Dataset(
        docs,
        _eff06_queries('06.efficiency_topics.stream-3'),
        _eff06_qrels(),
        documentation('trec-tb-2006/efficiency/stream3'),
    )
    subsets['trec-tb-2006/efficiency/stream4'] = Dataset(
        docs,
        _eff06_queries('06.efficiency_topics.stream-4'),
        documentation('trec-tb-2006/efficiency/stream4'),
    )

    # --- Million Query tracks --------------------------------------------
    # NOTE(review): the 2007 queries are built without namespace/lang, unlike
    # the 2008 ones below -- confirm whether that is intentional.
    mq07_queries = TrecColonQueries(
        GzipExtract(dlc['trec-mq-2007/queries']), encoding='latin1')
    mq07_prels = TrecPrels(dlc['trec-mq-2007/qrels'], QREL_DEFS)
    subsets['trec-mq-2007'] = Dataset(
        docs, mq07_queries, mq07_prels, documentation('trec-mq-2007'))

    mq08_queries = TrecColonQueries(
        GzipExtract(dlc['trec-mq-2008/queries']),
        encoding='latin1',
        namespace='trec-mq',
        lang='en',
    )
    mq08_prels = TrecPrels(TarExtract(dlc['trec-mq-2008/qrels'], '2008.RC1/prels'), QREL_DEFS)
    subsets['trec-mq-2008'] = Dataset(
        docs, mq08_queries, mq08_prels, documentation('trec-mq-2008'))

    ir_datasets.registry.register(NAME, base)
    for name in sorted(subsets.keys()):
        ir_datasets.registry.register(f'{NAME}/{name}', subsets[name])

    return base, subsets
def _init():
    """Construct and register the base dataset built from the ten log files.

    No subsets are defined here, so the registration loop over ``subsets``
    does nothing as written.

    Returns:
        tuple: ``(base, subsets, manager, base_path)``.
    """
    subsets = {}
    base_path = ir_datasets.util.home_path() / NAME
    dlc = DownloadConfig.context(NAME, base_path)
    documentation = YamlDocumentation(f'docs/{NAME}.yaml')

    # Members of the 'logs' tar archive; each one is itself gzip-compressed.
    log_members = (
        'AOL-user-ct-collection/user-ct-test-collection-01.txt.gz',
        'AOL-user-ct-collection/user-ct-test-collection-02.txt.gz',
        'AOL-user-ct-collection/user-ct-test-collection-03.txt.gz',
        'AOL-user-ct-collection/user-ct-test-collection-04.txt.gz',
        'AOL-user-ct-collection/user-ct-test-collection-05.txt.gz',
        'AOL-user-ct-collection/user-ct-test-collection-06.txt.gz',
        'AOL-user-ct-collection/user-ct-test-collection-07.txt.gz',
        'AOL-user-ct-collection/user-ct-test-collection-08.txt.gz',
        'AOL-user-ct-collection/user-ct-test-collection-09.txt.gz',
        'AOL-user-ct-collection/user-ct-test-collection-10.txt.gz',
    )
    log_sources = [GzipExtract(TarExtract(dlc['logs'], member)) for member in log_members]
    manager = AolManager(log_sources, GzipExtract(dlc['id2wb']), base_path)

    docs = DocstoreBackedDocs(manager.docs_store, docs_cls=AolIaDoc, namespace=NAME, lang=None)
    queries = TsvQueries(manager.file_ref('queries.tsv'), lang=None)
    qrels = TrecQrels(manager.file_ref('qrels'), QREL_DEFS)
    qlogs = AolQlogs(manager.file_ref('log.pkl.lz4'))
    base = Dataset(docs, queries, qrels, qlogs, documentation('_'))

    ir_datasets.registry.register(NAME, base)
    for name in sorted(subsets.keys()):
        ir_datasets.registry.register(f'{NAME}/{name}', subsets[name])

    return base, subsets, manager, base_path
def _init():
    """Construct and register one dataset per language plus its
    train/dev/test splits.

    The base dataset carries documentation only. Each language's docs handler
    is wrapped by a shared ``Migrator``.

    Returns:
        tuple: ``(base, subsets)`` where ``subsets`` maps ``'<lang>'`` and
        ``'<lang>/<split>'`` to their ``Dataset``.
    """
    root = ir_datasets.util.home_path() / NAME
    dlc = ir_datasets.util.DownloadConfig.context(NAME, root)
    documentation = YamlDocumentation(f'docs/{NAME}.yaml')
    base = Dataset(documentation('_'))
    subsets = {}

    # Language code -> top-level directory name inside that language's archive.
    langs = {
        'ar': 'mrtydi-v1.0-arabic',
        'bn': 'mrtydi-v1.0-bengali',
        'en': 'mrtydi-v1.0-english',
        'fi': 'mrtydi-v1.0-finnish',
        'id': 'mrtydi-v1.0-indonesian',
        'ja': 'mrtydi-v1.0-japanese',
        'ko': 'mrtydi-v1.0-korean',
        'ru': 'mrtydi-v1.0-russian',
        'sw': 'mrtydi-v1.0-swahili',
        'te': 'mrtydi-v1.0-telugu',
        'th': 'mrtydi-v1.0-thai',
    }

    migrator = Migrator(
        root / 'irds_version.txt',
        'v2',
        affected_files=[root / code for code in langs],
        message='Migrating mr-tydi (restructuring directory)',
    )

    def _dataset(docs, archive, lang, topics_path, qrels_path, doc_key):
        # Queries and qrels are files located relative to the extracted archive.
        queries = TsvQueries(RelativePath(archive, topics_path), lang=lang)
        qrels = TrecQrels(RelativePath(archive, qrels_path), QREL_DEFS)
        return Dataset(docs, queries, qrels, documentation(doc_key))

    for lang, file_name in langs.items():
        archive = TarExtractAll(dlc[lang], f'{root/lang}.data')
        docs_source = GzipExtract(
            RelativePath(archive, f'{file_name}/collection/docs.jsonl.gz'))
        docs = MrTydiDocs(
            docs_source, lang, count_hint=ir_datasets.util.count_hint(f'{NAME}/{lang}'))
        docs = migrator(docs)

        subsets[lang] = _dataset(
            docs, archive, lang,
            f'{file_name}/topic.tsv', f'{file_name}/qrels.txt',
            lang)
        subsets[f'{lang}/train'] = _dataset(
            docs, archive, lang,
            f'{file_name}/topic.train.tsv', f'{file_name}/qrels.train.txt',
            f'{lang}/train')
        subsets[f'{lang}/dev'] = _dataset(
            docs, archive, lang,
            f'{file_name}/topic.dev.tsv', f'{file_name}/qrels.dev.txt',
            f'{lang}/dev')
        subsets[f'{lang}/test'] = _dataset(
            docs, archive, lang,
            f'{file_name}/topic.test.tsv', f'{file_name}/qrels.test.txt',
            f'{lang}/test')

    ir_datasets.registry.register(NAME, base)
    for name in sorted(subsets.keys()):
        ir_datasets.registry.register(f'{NAME}/{name}', subsets[name])

    return base, subsets
def _init():
    """Assemble the document collection and its subsets, then register them.

    The bare collection is registered under ``NAME``; each subset is
    registered as ``{NAME}/<subset>`` wrapped together with its documentation
    entry.

    Returns:
        tuple: ``(collection, subsets)`` where ``subsets`` maps subset name to
        the ``Dataset`` built here (without the registration-time wrapper).
    """
    base_path = ir_datasets.util.home_path()/NAME
    documentation = YamlDocumentation(f'docs/{NAME}.yaml')
    dlc = DownloadConfig.context(NAME, base_path, dua=DUA)
    subsets = {}
    collection = MsMarcoTrecDocs(GzipExtract(dlc['docs']))

    def gz(key):
        # Wrap the download registered under `key` in GzipExtract.
        return GzipExtract(dlc[key])

    def msmarco_queries(source):
        return TsvQueries(source, namespace='msmarco', lang='en')

    def judged_qids(subset_name):
        # Query ids that occur in the subset's qrels. The subset is looked up
        # only when the lambda runs, not when this helper is called.
        return Lazy(lambda: {qrel.query_id for qrel in subsets[subset_name].qrels_iter()})

    def fold_qids(fold):
        # `fold` is a function parameter, so each lambda keeps its own value.
        return Lazy(lambda: DL_HARD_QIDS_BYFOLD[fold])

    for split in ('train', 'dev'):
        subsets[split] = Dataset(
            collection,
            msmarco_queries(gz(f'{split}/queries')),
            TrecQrels(gz(f'{split}/qrels'), QRELS_DEFS),
            TrecScoredDocs(gz(f'{split}/scoreddocs')),
        )

    # eval has no qrels
    subsets['eval'] = Dataset(
        collection,
        msmarco_queries(gz('eval/queries')),
        TrecScoredDocs(gz('eval/scoreddocs')),
    )

    # The TREC DL qrels are passed to TrecQrels without GzipExtract.
    for year in ('2019', '2020'):
        subsets[f'trec-dl-{year}'] = Dataset(
            collection,
            msmarco_queries(gz(f'trec-dl-{year}/queries')),
            TrecQrels(dlc[f'trec-dl-{year}/qrels'], TREC_DL_QRELS_DEFS),
            TrecScoredDocs(gz(f'trec-dl-{year}/scoreddocs')),
        )

    subsets['orcas'] = Dataset(
        collection,
        TsvQueries(gz('orcas/queries'), namespace='orcas', lang='en'),
        TrecQrels(gz('orcas/qrels'), ORCAS_QLRES_DEFS),
        TrecScoredDocs(gz('orcas/scoreddocs')),
    )

    # Variants restricted to queries that appear in the qrels
    for year in ('2019', '2020'):
        parent_name = f'trec-dl-{year}'
        judged = judged_qids(parent_name)
        parent = subsets[parent_name]
        subsets[f'{parent_name}/judged'] = Dataset(
            FilteredQueries(parent.queries_handler(), judged),
            FilteredScoredDocs(parent.scoreddocs_handler(), judged),
            parent,
        )

    # DL-Hard
    dl_hard_dir = base_path/'trec-dl-hard'
    dl_hard_qrels_migrator = Migrator(
        dl_hard_dir/'irds_version.txt',
        'v2',
        affected_files=[dl_hard_dir/'qrels'],
        message='Updating trec-dl-hard qrels')
    all_hard_qids = Lazy(lambda: DL_HARD_QIDS)
    # DL-Hard queries are filtered from the 2019 and 2020 query files together.
    dl_hard_base_queries = TsvQueries(
        [
            Cache(gz('trec-dl-2019/queries'), base_path/'trec-dl-2019/queries.tsv'),
            Cache(gz('trec-dl-2020/queries'), base_path/'trec-dl-2020/queries.tsv'),
        ],
        namespace='msmarco',
        lang='en')
    subsets['trec-dl-hard'] = Dataset(
        collection,
        FilteredQueries(dl_hard_base_queries, all_hard_qids),
        dl_hard_qrels_migrator(TrecQrels(dlc['trec-dl-hard/qrels'], TREC_DL_QRELS_DEFS)),
        documentation('trec-dl-hard')
    )

    for fold in ('1', '2', '3', '4', '5'):
        qids = fold_qids(fold)
        fold_name = f'trec-dl-hard/fold{fold}'
        subsets[fold_name] = Dataset(
            collection,
            FilteredQueries(dl_hard_base_queries, qids),
            FilteredQrels(subsets['trec-dl-hard'], qids),
            documentation(fold_name)
        )

    anchor_docs = MsMarcoAnchorTextDocs(
        Cache(gz('anchor-text'), base_path / "anchor-text.json"),
        count_hint=1703834
    )
    subsets['anchor-text'] = Dataset(anchor_docs, documentation('anchor-text'))

    ir_datasets.registry.register(NAME, Dataset(collection, documentation("_")))
    for subset_name in sorted(subsets):
        ir_datasets.registry.register(
            f'{NAME}/{subset_name}',
            Dataset(subsets[subset_name], documentation(subset_name)))

    return collection, subsets
def _init():
    """Build the 'msmarco-passage' collection and subsets and register them.

    The collection is registered as ``msmarco-passage``; each subset is
    registered as ``msmarco-passage/<subset>`` wrapped together with its
    documentation entry.

    Returns:
        tuple: ``(collection, subsets)`` where ``subsets`` maps subset name to
        the ``Dataset`` built here (without the registration-time wrapper).
    """
    documentation = YamlDocumentation('docs/msmarco-passage.yaml')
    base_path = ir_datasets.util.home_path() / 'msmarco-passage'
    dlc = DownloadConfig.context('msmarco-passage', base_path, dua=DUA)

    def cached(source, relative):
        # Cache `source` at a location relative to this dataset's home directory.
        return Cache(source, base_path / relative)

    def msmarco_queries(source):
        return TsvQueries(source, namespace='msmarco')

    def qid_pid_run(source, relative):
        # Scored docs read from `source` via ExtractQidPid, cached at `relative`.
        return TrecScoredDocs(cached(ExtractQidPid(source), relative))

    def judged_qids(subset_name):
        # Query ids that occur in the subset's qrels. The subset is looked up
        # only when the lambda runs, not when this helper is called.
        return Lazy(lambda: {qrel.query_id for qrel in subsets[subset_name].qrels_iter()})

    collection = TsvDocs(
        cached(
            FixEncoding(TarExtract(dlc['collectionandqueries'], 'collection.tsv')),
            'collection.tsv'),
        namespace='msmarco')
    subsets = {}

    subsets['train'] = Dataset(
        collection,
        msmarco_queries(
            cached(TarExtract(dlc['queries'], 'queries.train.tsv'), 'train/queries.tsv')),
        TrecQrels(dlc['train/qrels'], QRELS_DEFS),
        TsvDocPairs(GzipExtract(dlc['train/docpairs'])),
        qid_pid_run(
            TarExtract(dlc['train/scoreddocs'], 'top1000.train.txt'), 'train/ms.run'),
    )

    subsets['dev'] = Dataset(
        collection,
        msmarco_queries(
            cached(TarExtract(dlc['queries'], 'queries.dev.tsv'), 'dev/queries.tsv')),
        TrecQrels(dlc['dev/qrels'], QRELS_DEFS),
        qid_pid_run(TarExtract(dlc['dev/scoreddocs'], 'top1000.dev'), 'dev/ms.run'),
    )

    subsets['dev/small'] = Dataset(
        collection,
        msmarco_queries(
            cached(
                TarExtract(dlc['collectionandqueries'], 'queries.dev.small.tsv'),
                'dev/small/queries.tsv')),
        TrecQrels(
            cached(
                TarExtract(dlc['collectionandqueries'], 'qrels.dev.small.tsv'),
                'dev/small/qrels'),
            QRELS_DEFS),
    )

    subsets['eval'] = Dataset(
        collection,
        msmarco_queries(
            cached(TarExtract(dlc['queries'], 'queries.eval.tsv'), 'eval/queries.tsv')),
        qid_pid_run(TarExtract(dlc['eval/scoreddocs'], 'top1000.eval'), 'eval/ms.run'),
    )

    subsets['eval/small'] = Dataset(
        collection,
        msmarco_queries(
            cached(
                TarExtract(dlc['collectionandqueries'], 'queries.eval.small.tsv'),
                'eval/small/queries.tsv')),
    )

    subsets['trec-dl-2019'] = Dataset(
        collection,
        TrecQrels(dlc['trec-dl-2019/qrels'], TREC_DL_QRELS_DEFS),
        msmarco_queries(
            cached(GzipExtract(dlc['trec-dl-2019/queries']), 'trec-dl-2019/queries.tsv')),
        qid_pid_run(GzipExtract(dlc['trec-dl-2019/scoreddocs']), 'trec-dl-2019/ms.run'),
    )

    # No qrels are attached to trec-dl-2020 in this function.
    subsets['trec-dl-2020'] = Dataset(
        collection,
        msmarco_queries(GzipExtract(dlc['trec-dl-2020/queries'])),
        qid_pid_run(GzipExtract(dlc['trec-dl-2020/scoreddocs']), 'trec-dl-2020/ms.run'),
    )

    # A few subsets that are constrained to just the queries/scoreddocs that
    # have at least 1 relevance assessment
    for parent_name in ('train', 'dev', 'trec-dl-2019'):
        judged = judged_qids(parent_name)
        parent = subsets[parent_name]
        subsets[f'{parent_name}/judged'] = Dataset(
            FilteredQueries(parent.queries_handler(), judged),
            FilteredScoredDocs(parent.scoreddocs_handler(), judged),
            parent,
        )

    train = subsets['train']

    # split200 -- 200 queries held out from the training data for validation
    split200 = Lazy(lambda: SPLIT200_QIDS)
    for suffix, mode in (('train', 'exclude'), ('valid', 'include')):
        subsets[f'train/split200-{suffix}'] = Dataset(
            FilteredQueries(train.queries_handler(), split200, mode=mode),
            FilteredScoredDocs(train.scoreddocs_handler(), split200, mode=mode),
            FilteredQrels(train.qrels_handler(), split200, mode=mode),
            FilteredDocPairs(train.docpairs_handler(), split200, mode=mode),
            train,
        )

    # Medical subset
    def load_medical_qids():
        # One id per line, decoded as UTF-8; trailing whitespace is stripped.
        with dlc['medmarco_ids'].stream() as raw:
            reader = codecs.getreader('utf8')(raw)
            return {line.rstrip() for line in reader}

    train_med = Lazy(load_medical_qids)
    subsets['train/medical'] = Dataset(
        FilteredQueries(train.queries_handler(), train_med),
        FilteredScoredDocs(train.scoreddocs_handler(), train_med),
        FilteredDocPairs(train.docpairs_handler(), train_med),
        FilteredQrels(train.qrels_handler(), train_med),
        train,
    )

    ir_datasets.registry.register(
        'msmarco-passage', Dataset(collection, documentation('_')))
    for subset_name in sorted(subsets):
        ir_datasets.registry.register(
            f'msmarco-passage/{subset_name}',
            Dataset(subsets[subset_name], documentation(subset_name)))

    return collection, subsets
def _dlc_init():
    """Build a download config from the gzipped JSON 'downloads' resource.

    Returns:
        A ``_DownloadConfig`` whose ``contents`` is the object decoded by
        ``json.load`` from the ``base_dlc['downloads']`` stream.
    """
    downloads = GzipExtract(base_dlc['downloads'])
    with downloads.stream() as stream:
        # Construct the config before the stream is closed, then return it.
        return _DownloadConfig(contents=json.load(stream))
def _init():
    """Build the passage collection and all of its subsets, then register them.

    The collection is registered under ``NAME``; each subset is registered as
    ``{NAME}/<subset>`` wrapped together with its documentation entry.

    Returns:
        tuple: ``(collection, subsets)`` where ``subsets`` maps subset name to
        the ``Dataset`` built here (without the registration-time wrapper).
    """
    documentation = YamlDocumentation(f'docs/{NAME}.yaml')
    base_path = ir_datasets.util.home_path() / NAME
    dlc = DownloadConfig.context(NAME, base_path, dua=DUA)

    def cached(source, relative):
        # Cache `source` at a location relative to this dataset's home directory.
        return Cache(source, base_path / relative)

    def msmarco_queries(source):
        return TsvQueries(source, namespace='msmarco', lang='en')

    def qid_pid_run(source, relative):
        # Scored docs read from `source` via ExtractQidPid, cached at `relative`.
        return TrecScoredDocs(cached(ExtractQidPid(source), relative))

    def judged_qids(subset_name):
        # Query ids that occur in the subset's qrels. The subset is looked up
        # only when the lambda runs, not when this helper is called.
        return Lazy(lambda: {qrel.query_id for qrel in subsets[subset_name].qrels_iter()})

    def fold_qids(fold):
        # `fold` is a function parameter, so each lambda keeps its own value.
        return Lazy(lambda: DL_HARD_QIDS_BYFOLD[fold])

    migrator = Migrator(
        base_path / 'irds_version.txt',
        'v2',
        affected_files=[
            base_path / 'collection.tsv',
            base_path / 'collection.tsv.pklz4',
        ],
        message=f'Migrating {NAME} (fixing passage encoding)')

    collection = TsvDocs(
        cached(
            FixEncoding(TarExtract(dlc['collectionandqueries'], 'collection.tsv')),
            'collection.tsv'),
        namespace='msmarco',
        lang='en',
        docstore_size_hint=14373971970,
        count_hint=ir_datasets.util.count_hint(NAME))
    collection = migrator(collection)
    subsets = {}

    subsets['train'] = Dataset(
        collection,
        msmarco_queries(
            cached(TarExtract(dlc['queries'], 'queries.train.tsv'), 'train/queries.tsv')),
        TrecQrels(dlc['train/qrels'], QRELS_DEFS),
        TsvDocPairs(GzipExtract(dlc['train/docpairs'])),
        qid_pid_run(
            TarExtract(dlc['train/scoreddocs'], 'top1000.train.txt'), 'train/ms.run'),
    )
    train = subsets['train']

    # Same queries/qrels/scoreddocs as train, with a different docpairs source
    subsets['train/triples-v2'] = Dataset(
        collection,
        train.queries_handler(),
        train.qrels_handler(),
        TsvDocPairs(GzipExtract(dlc['train/docpairs/v2'])),
        train.scoreddocs_handler(),
    )

    subsets['train/triples-small'] = Dataset(
        collection,
        train.queries_handler(),
        train.qrels_handler(),
        TsvDocPairs(
            cached(
                MapSmallTriplesQidPid(
                    TarExtract(dlc['train/docpairs/small'], 'triples.train.small.tsv'),
                    TarExtract(dlc['collectionandqueries'], 'collection.tsv'),
                    train.queries_handler()),
                'train/small.triples.qidpid.tsv')),
        train.scoreddocs_handler(),
    )

    subsets['dev'] = Dataset(
        collection,
        msmarco_queries(
            cached(TarExtract(dlc['queries'], 'queries.dev.tsv'), 'dev/queries.tsv')),
        TrecQrels(dlc['dev/qrels'], QRELS_DEFS),
    )

    # The 'dev/scoreddocs' run is attached to dev/small, not to dev.
    subsets['dev/small'] = Dataset(
        collection,
        msmarco_queries(
            cached(
                TarExtract(dlc['collectionandqueries'], 'queries.dev.small.tsv'),
                'dev/small/queries.tsv')),
        TrecQrels(
            cached(
                TarExtract(dlc['collectionandqueries'], 'qrels.dev.small.tsv'),
                'dev/small/qrels'),
            QRELS_DEFS),
        qid_pid_run(TarExtract(dlc['dev/scoreddocs'], 'top1000.dev'), 'dev/ms.run'),
    )

    subsets['eval'] = Dataset(
        collection,
        msmarco_queries(
            cached(TarExtract(dlc['queries'], 'queries.eval.tsv'), 'eval/queries.tsv')),
    )

    # The 'eval/scoreddocs' run is attached to eval/small, not to eval.
    subsets['eval/small'] = Dataset(
        collection,
        msmarco_queries(
            cached(
                TarExtract(dlc['collectionandqueries'], 'queries.eval.small.tsv'),
                'eval/small/queries.tsv')),
        qid_pid_run(TarExtract(dlc['eval/scoreddocs'], 'top1000.eval'), 'eval/ms.run'),
    )

    subsets['trec-dl-2019'] = Dataset(
        collection,
        TrecQrels(dlc['trec-dl-2019/qrels'], TREC_DL_QRELS_DEFS),
        msmarco_queries(
            cached(GzipExtract(dlc['trec-dl-2019/queries']), 'trec-dl-2019/queries.tsv')),
        qid_pid_run(GzipExtract(dlc['trec-dl-2019/scoreddocs']), 'trec-dl-2019/ms.run'),
    )

    subsets['trec-dl-2020'] = Dataset(
        collection,
        msmarco_queries(GzipExtract(dlc['trec-dl-2020/queries'])),
        TrecQrels(dlc['trec-dl-2020/qrels'], TREC_DL_QRELS_DEFS),
        qid_pid_run(GzipExtract(dlc['trec-dl-2020/scoreddocs']), 'trec-dl-2020/ms.run'),
    )

    # A few subsets that are constrained to just the queries/scoreddocs that
    # have at least 1 relevance assessment
    train_judged = judged_qids('train')
    subsets['train/judged'] = Dataset(
        FilteredQueries(train.queries_handler(), train_judged),
        FilteredScoredDocs(train.scoreddocs_handler(), train_judged),
        train,
    )

    # dev carries no scoreddocs here, so only its queries are filtered
    dev_judged = judged_qids('dev')
    subsets['dev/judged'] = Dataset(
        FilteredQueries(subsets['dev'].queries_handler(), dev_judged),
        subsets['dev'],
    )

    for year in ('2019', '2020'):
        parent_name = f'trec-dl-{year}'
        judged = judged_qids(parent_name)
        parent = subsets[parent_name]
        subsets[f'{parent_name}/judged'] = Dataset(
            FilteredQueries(parent.queries_handler(), judged),
            FilteredScoredDocs(parent.scoreddocs_handler(), judged),
            parent,
        )

    # split200 -- 200 queries held out from the training data for validation
    split200 = Lazy(lambda: SPLIT200_QIDS)
    for suffix, mode in (('train', 'exclude'), ('valid', 'include')):
        subsets[f'train/split200-{suffix}'] = Dataset(
            FilteredQueries(train.queries_handler(), split200, mode=mode),
            FilteredScoredDocs(train.scoreddocs_handler(), split200, mode=mode),
            FilteredQrels(train.qrels_handler(), split200, mode=mode),
            FilteredDocPairs(train.docpairs_handler(), split200, mode=mode),
            train,
        )

    # Medical subset
    def load_medical_qids():
        # One id per line, decoded as UTF-8; trailing whitespace is stripped.
        with dlc['medmarco_ids'].stream() as raw:
            reader = codecs.getreader('utf8')(raw)
            return {line.rstrip() for line in reader}

    train_med = Lazy(load_medical_qids)
    subsets['train/medical'] = Dataset(
        FilteredQueries(train.queries_handler(), train_med),
        FilteredScoredDocs(train.scoreddocs_handler(), train_med),
        FilteredDocPairs(train.docpairs_handler(), train_med),
        FilteredQrels(train.qrels_handler(), train_med),
        train,
    )

    # DL-Hard
    dl_hard_dir = base_path / 'trec-dl-hard'
    dl_hard_qrels_migrator = Migrator(
        dl_hard_dir / 'irds_version.txt',
        'v3',
        affected_files=[dl_hard_dir / 'qrels'],
        message='Updating trec-dl-hard qrels')
    all_hard_qids = Lazy(lambda: DL_HARD_QIDS)
    # DL-Hard queries are filtered from the 2019 and 2020 query files together.
    dl_hard_base_queries = TsvQueries(
        [
            cached(GzipExtract(dlc['trec-dl-2019/queries']), 'trec-dl-2019/queries.tsv'),
            cached(GzipExtract(dlc['trec-dl-2020/queries']), 'trec-dl-2020/queries.tsv'),
        ],
        namespace='msmarco',
        lang='en')
    subsets['trec-dl-hard'] = Dataset(
        collection,
        FilteredQueries(dl_hard_base_queries, all_hard_qids),
        dl_hard_qrels_migrator(
            TrecQrels(dlc['trec-dl-hard/qrels'], TREC_DL_QRELS_DEFS)),
        documentation('trec-dl-hard'))

    for fold in ('1', '2', '3', '4', '5'):
        qids = fold_qids(fold)
        fold_name = f'trec-dl-hard/fold{fold}'
        subsets[fold_name] = Dataset(
            collection,
            FilteredQueries(dl_hard_base_queries, qids),
            FilteredQrels(subsets['trec-dl-hard'], qids),
            documentation(fold_name))

    ir_datasets.registry.register(NAME, Dataset(collection, documentation('_')))
    for subset_name in sorted(subsets):
        ir_datasets.registry.register(
            f'{NAME}/{subset_name}',
            Dataset(subsets[subset_name], documentation(subset_name)))

    return collection, subsets
def _init():
    """Build the v2 document collection and its subsets, then register them.

    The collection is registered under ``NAME``; each subset is registered as
    ``{NAME}/<subset>`` wrapped together with its documentation entry.

    Returns:
        tuple: ``(collection, subsets)`` where ``subsets`` maps subset name to
        the ``Dataset`` built here (without the registration-time wrapper).
    """
    base_path = ir_datasets.util.home_path() / NAME
    documentation = YamlDocumentation(f'docs/{NAME}.yaml')
    dlc = DownloadConfig.context(NAME, base_path, dua=DUA)
    subsets = {}
    collection = MsMarcoV2Docs(dlc['docs'])

    def msmarco_queries(source):
        return TsvQueries(source, namespace='msmarco', lang='en')

    def judged_qids(subset_name):
        # Query ids that occur in the subset's qrels. The subset is looked up
        # only when the lambda runs, not when this helper is called.
        return Lazy(lambda: {qrel.query_id for qrel in subsets[subset_name].qrels_iter()})

    # These three splits use "<split>_<kind>" download keys.
    for split in ('train', 'dev1', 'dev2'):
        subsets[split] = Dataset(
            collection,
            msmarco_queries(dlc[f'{split}_queries']),
            TrecQrels(dlc[f'{split}_qrels'], QRELS_DEFS),
            TrecScoredDocs(GzipExtract(dlc[f'{split}_scoreddocs'])),
        )

    # Note the two key styles: "trec-dl-<year>/queries" for queries and
    # "trec_dl_<year>_qrels" for qrels.
    for year in ('2019', '2020'):
        subsets[f'trec-dl-{year}'] = Dataset(
            collection,
            msmarco_queries(GzipExtract(dlc[f'trec-dl-{year}/queries'])),
            TrecQrels(GzipExtract(dlc[f'trec_dl_{year}_qrels']), TREC_DL_QRELS_DEFS),
        )

    # Judged variants: no scoreddocs on the parents, so only queries are filtered
    for year in ('2019', '2020'):
        parent_name = f'trec-dl-{year}'
        judged = judged_qids(parent_name)
        parent = subsets[parent_name]
        subsets[f'{parent_name}/judged'] = Dataset(
            FilteredQueries(parent.queries_handler(), judged),
            parent,
        )

    subsets['trec-dl-2021'] = Dataset(
        collection,
        msmarco_queries(dlc['trec-dl-2021/queries']),
        TrecQrels(dlc['trec-dl-2021/qrels'], TREC_DL_QRELS_DEFS),
        TrecScoredDocs(GzipExtract(dlc['trec-dl-2021/scoreddocs'])),
    )

    dl21_judged = judged_qids('trec-dl-2021')
    dl21 = subsets['trec-dl-2021']
    subsets['trec-dl-2021/judged'] = Dataset(
        FilteredQueries(dl21.queries_handler(), dl21_judged),
        FilteredScoredDocs(dl21.scoreddocs_handler(), dl21_judged),
        dl21,
    )

    anchor_docs = MsMarcoV2AnchorTextDocs(
        Cache(GzipExtract(dlc['anchor-text']), base_path / "anchor-text.json"),
        count_hint=4821244)
    subsets['anchor-text'] = Dataset(anchor_docs, documentation('anchor-text'))

    ir_datasets.registry.register(NAME, Dataset(collection, documentation("_")))
    for subset_name in sorted(subsets):
        ir_datasets.registry.register(
            f'{NAME}/{subset_name}',
            Dataset(subsets[subset_name], documentation(subset_name)))

    return collection, subsets