def _init():
    documentation = YamlDocumentation(f'docs/{NAME}.yaml')
    base_path = ir_datasets.util.home_path() / NAME
    dlc = DownloadConfig.context(NAME, base_path)
    subsets = {}
    docs_dlc = dlc['docs']
    docs_chk_dlc = TarExtractAll(dlc['docs.chk'], base_path / 'corpus.chk')
    b13_dlc = Bz2Extract(
        Cache(
            TarExtract(dlc['cw12b-info'], 'ClueWeb12-CreateB13/software/CreateClueWeb12B13Dataset.jar'),
            base_path / 'CreateClueWeb12B13Dataset.jar'))
    collection = ClueWeb12Docs(docs_dlc, docs_chk_dlc)
    collection_b13 = ClueWeb12Docs(ClueWeb12b13Extractor(docs_dlc, b13_dlc))
    base = Dataset(collection, documentation('_'))
    subsets['b13'] = Dataset(collection_b13, documentation('b13'))
    subsets['trec-web-2013'] = Dataset(
        collection,
        TrecXmlQueries(dlc['trec-web-2013/queries'], qtype=TrecWebTrackQuery, namespace=NAME),
        TrecQrels(dlc['trec-web-2013/qrels.adhoc'], QREL_DEFS),
        documentation('trec-web-2013'))
    subsets['trec-web-2014'] = Dataset(
        collection,
        TrecXmlQueries(dlc['trec-web-2014/queries'], qtype=TrecWebTrackQuery, namespace=NAME),
        TrecQrels(dlc['trec-web-2014/qrels.adhoc'], QREL_DEFS),
        documentation('trec-web-2014'))
    subsets['b13/ntcir-www-1'] = Dataset(
        collection_b13,
        TrecXmlQueries(
            Cache(ZipExtract(dlc['ntcir-www-1/queries'], 'eng.queries.xml'), base_path / 'ntcir-www-1' / 'queries.xml'),
            qtype=GenericQuery, qtype_map={'qid': 'query_id', 'content': 'text'}, namespace=NAME),
        NtcirQrels(dlc['ntcir-www-1/qrels'], NTCIR_QREL_DEFS),
        documentation('ntcir-www-1'))
    subsets['b13/ntcir-www-2'] = Dataset(
        collection_b13,
        TrecXmlQueries(
            Cache(ZipExtract(dlc['ntcir-www-2/queries'], 'qEng.xml'), base_path / 'ntcir-www-2' / 'queries.xml'),
            qtype=NtcirQuery, qtype_map=ntcir_map, namespace=NAME),
        NtcirQrels(dlc['ntcir-www-2/qrels'], NTCIR_QREL_DEFS),
        documentation('ntcir-www-2'))
    subsets['b13/ntcir-www-3'] = Dataset(
        collection_b13,
        TrecXmlQueries(dlc['ntcir-www-3/queries'], qtype=NtcirQuery, qtype_map=ntcir_map, namespace=NAME),
        documentation('ntcir-www-3'))
    subsets['b13/trec-misinfo-2019'] = Dataset(
        collection_b13,
        TrecXmlQueries(dlc['trec-misinfo-2019/queries'], qtype=MisinfoQuery, qtype_map=misinfo_map, namespace=NAME),
        MsinfoQrels(dlc['trec-misinfo-2019/qrels'], MISINFO_QREL_DEFS),
        documentation('trec-misinfo-2019'))
    ir_datasets.registry.register(NAME, base)
    for s in sorted(subsets):
        ir_datasets.registry.register(f'{NAME}/{s}', subsets[s])
    return base, subsets
def _init():
    documentation = YamlDocumentation(f'docs/{NAME}.yaml')
    base_path = ir_datasets.util.home_path() / NAME
    dlc = DownloadConfig.context(NAME, base_path)
    subsets = {}
    main_dlc = dlc['main']
    base = Dataset(
        VaswaniDocs(
            Cache(TarExtract(main_dlc, 'doc-text'), base_path / 'docs.txt')),
        VaswaniQueries(
            Cache(TarExtract(main_dlc, 'query-text'), base_path / 'queries.txt')),
        VaswaniQrels(
            Cache(TarExtract(main_dlc, 'rlv-ass'), base_path / 'qrels.txt')),
        documentation('_'),
    )
    ir_datasets.registry.register(NAME, base)
    for s in sorted(subsets):
        ir_datasets.registry.register(f'{NAME}/{s}', subsets[s])
    return base, subsets
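# A minimal usage sketch (illustration only, not part of the original module):
# after _init() has run at import time, the registered dataset can be loaded
# through the public API. The id below assumes NAME == 'vaswani'.
def _example_vaswani_usage():
    import ir_datasets
    dataset = ir_datasets.load('vaswani')
    for query in dataset.queries_iter():
        print(query.query_id, query.text)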
def _init():
    base_path = home_path() / NAME
    documentation = YamlDocumentation(f"docs/{NAME}.yaml")
    download_config = DownloadConfig.context(NAME, base_path)
    base = Dataset(documentation('_'))

    # Arguments that can be loaded from Zenodo.
    arguments: Dict[str, ArgsMeDocs] = {
        name: ArgsMeDocs(
            Cache(ZipExtract(download_config[name], zip_path), base_path / f"{name}.json"),
            namespace=f"{NAME}/{name}", language=language, count_hint=count_hint)
        for name, (count_hint, language, zip_path) in SUBSETS.items()
    }

    # Arguments that are combined versions of other subsets.
    combined_arguments: Dict[str, ArgsMeCombinedArguments] = {
        name: ArgsMeCombinedArguments(
            base_path / f"{name}.json",
            [arguments[subset_name] for subset_name in subset_names],
            namespace=f"{NAME}/{name}", language=language, count_hint=count_hint)
        for name, (subset_names, count_hint, language) in COMBINED_SUBSETS.items()
    }

    # Wrap in datasets with documentation.
    datasets = {
        name: Dataset(arguments, documentation(name))
        for name, arguments in chain(arguments.items(), combined_arguments.items())
    }

    # NOTE: the following datasets are defined in touche.py:
    # - argsme/1.0/touche-2020-task-1/uncorrected
    # - argsme/2020-04-01/touche-2020-task-1
    # - argsme/2020-04-01/touche-2020-task-1/uncorrected
    # - argsme/2020-04-01/touche-2021-task-1

    # Register datasets.
    registry.register(NAME, base)
    for name, arguments in datasets.items():
        registry.register(f'{NAME}/{name}', arguments)

    return base, datasets
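# A hedged sketch of loading one of the subsets registered above. The id
# 'argsme/2020-04-01' assumes that key appears in SUBSETS or COMBINED_SUBSETS
# (defined elsewhere in this module); substitute any registered name.
def _example_argsme_usage():
    import ir_datasets
    dataset = ir_datasets.load('argsme/2020-04-01')
    for argument in dataset.docs_iter()[:3]:  # docs_iter() supports slicing
        print(argument.doc_id)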
def _init():
    documentation = YamlDocumentation(f'docs/{NAME}.yaml')
    base_path = ir_datasets.util.home_path() / NAME
    dlc = DownloadConfig.context(NAME, base_path)
    subsets = {}
    main_dlc = dlc['main']
    base = Dataset(
        CranfieldDocs(
            Cache(TarExtract(main_dlc, 'cran.all.1400'), base_path / 'docs.txt')),
        CranfieldQueries(
            Cache(TarExtract(main_dlc, 'cran.qry'), base_path / 'queries.txt')),
        CranfieldQrels(
            Cache(TarExtract(main_dlc, 'cranqrel'), base_path / 'qrels.txt')),
        documentation('_'),
    )
    ir_datasets.registry.register(NAME, base)
    for s in sorted(subsets):
        ir_datasets.registry.register(f'{NAME}/{s}', subsets[s])
    return base, subsets
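# A short sketch of reading the relevance judgments wired up above (assuming
# NAME == 'cranfield'); qrels_iter() yields query_id/doc_id/relevance records
# parsed from the cached qrels.txt.
def _example_cranfield_qrels():
    import ir_datasets
    for qrel in ir_datasets.load('cranfield').qrels_iter():
        print(qrel.query_id, qrel.doc_id, qrel.relevance)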
def _init():
    documentation = YamlDocumentation(f'docs/{NAME}.yaml')
    base_path = ir_datasets.util.home_path() / NAME
    dlc = DownloadConfig.context(NAME, base_path, dua=DUA)
    migrator = Migrator(base_path / 'irds_version.txt', 'v2',
        affected_files=[
            base_path / 'collection.tsv',
            base_path / 'collection.tsv.pklz4',
        ],
        message=f'Migrating {NAME} (fixing passage encoding)')
    collection = TsvDocs(
        Cache(
            FixEncoding(TarExtract(dlc['collectionandqueries'], 'collection.tsv')),
            base_path / 'collection.tsv'),
        namespace='msmarco', lang='en',
        docstore_size_hint=14373971970,
        count_hint=ir_datasets.util.count_hint(NAME))
    collection = migrator(collection)
    subsets = {}
    subsets['train'] = Dataset(
        collection,
        TsvQueries(
            Cache(TarExtract(dlc['queries'], 'queries.train.tsv'), base_path / 'train/queries.tsv'),
            namespace='msmarco', lang='en'),
        TrecQrels(dlc['train/qrels'], QRELS_DEFS),
        TsvDocPairs(GzipExtract(dlc['train/docpairs'])),
        TrecScoredDocs(
            Cache(
                ExtractQidPid(TarExtract(dlc['train/scoreddocs'], 'top1000.train.txt')),
                base_path / 'train/ms.run')),
    )
    subsets['train/triples-v2'] = Dataset(
        collection,
        subsets['train'].queries_handler(),
        subsets['train'].qrels_handler(),
        TsvDocPairs(GzipExtract(dlc['train/docpairs/v2'])),
        subsets['train'].scoreddocs_handler(),
    )
    subsets['train/triples-small'] = Dataset(
        collection,
        subsets['train'].queries_handler(),
        subsets['train'].qrels_handler(),
        TsvDocPairs(
            Cache(
                MapSmallTriplesQidPid(
                    TarExtract(dlc['train/docpairs/small'], 'triples.train.small.tsv'),
                    TarExtract(dlc['collectionandqueries'], 'collection.tsv'),
                    subsets['train'].queries_handler()),
                base_path / 'train/small.triples.qidpid.tsv')),
        subsets['train'].scoreddocs_handler(),
    )
    subsets['dev'] = Dataset(
        collection,
        TsvQueries(
            Cache(TarExtract(dlc['queries'], 'queries.dev.tsv'), base_path / 'dev/queries.tsv'),
            namespace='msmarco', lang='en'),
        TrecQrels(dlc['dev/qrels'], QRELS_DEFS),
    )
    subsets['dev/small'] = Dataset(
        collection,
        TsvQueries(
            Cache(
                TarExtract(dlc['collectionandqueries'], 'queries.dev.small.tsv'),
                base_path / 'dev/small/queries.tsv'),
            namespace='msmarco', lang='en'),
        TrecQrels(
            Cache(
                TarExtract(dlc['collectionandqueries'], 'qrels.dev.small.tsv'),
                base_path / 'dev/small/qrels'),
            QRELS_DEFS),
        TrecScoredDocs(
            Cache(
                ExtractQidPid(TarExtract(dlc['dev/scoreddocs'], 'top1000.dev')),
                base_path / 'dev/ms.run')),
    )
    subsets['eval'] = Dataset(
        collection,
        TsvQueries(
            Cache(TarExtract(dlc['queries'], 'queries.eval.tsv'), base_path / 'eval/queries.tsv'),
            namespace='msmarco', lang='en'),
    )
    subsets['eval/small'] = Dataset(
        collection,
        TsvQueries(
            Cache(
                TarExtract(dlc['collectionandqueries'], 'queries.eval.small.tsv'),
                base_path / 'eval/small/queries.tsv'),
            namespace='msmarco', lang='en'),
        TrecScoredDocs(
            Cache(
                ExtractQidPid(TarExtract(dlc['eval/scoreddocs'], 'top1000.eval')),
                base_path / 'eval/ms.run')),
    )
    subsets['trec-dl-2019'] = Dataset(
        collection,
        TrecQrels(dlc['trec-dl-2019/qrels'], TREC_DL_QRELS_DEFS),
        TsvQueries(
            Cache(GzipExtract(dlc['trec-dl-2019/queries']), base_path / 'trec-dl-2019/queries.tsv'),
            namespace='msmarco', lang='en'),
        TrecScoredDocs(
            Cache(
                ExtractQidPid(GzipExtract(dlc['trec-dl-2019/scoreddocs'])),
                base_path / 'trec-dl-2019/ms.run')),
    )
    subsets['trec-dl-2020'] = Dataset(
        collection,
        TsvQueries(GzipExtract(dlc['trec-dl-2020/queries']), namespace='msmarco', lang='en'),
        TrecQrels(dlc['trec-dl-2020/qrels'], TREC_DL_QRELS_DEFS),
        TrecScoredDocs(
            Cache(
                ExtractQidPid(GzipExtract(dlc['trec-dl-2020/scoreddocs'])),
                base_path / 'trec-dl-2020/ms.run')),
    )

    # A few subsets that are constrained to just the queries/qrels/docpairs
    # that have at least 1 relevance assessment
    train_judged = Lazy(lambda: {q.query_id for q in subsets['train'].qrels_iter()})
    subsets['train/judged'] = Dataset(
        FilteredQueries(subsets['train'].queries_handler(), train_judged),
        FilteredScoredDocs(subsets['train'].scoreddocs_handler(), train_judged),
        subsets['train'],
    )
    dev_judged = Lazy(lambda: {q.query_id for q in subsets['dev'].qrels_iter()})
    subsets['dev/judged'] = Dataset(
        FilteredQueries(subsets['dev'].queries_handler(), dev_judged),
        subsets['dev'],
    )
    dl19_judged = Lazy(lambda: {q.query_id for q in subsets['trec-dl-2019'].qrels_iter()})
    subsets['trec-dl-2019/judged'] = Dataset(
        FilteredQueries(subsets['trec-dl-2019'].queries_handler(), dl19_judged),
        FilteredScoredDocs(subsets['trec-dl-2019'].scoreddocs_handler(), dl19_judged),
        subsets['trec-dl-2019'],
    )
    dl20_judged = Lazy(lambda: {q.query_id for q in subsets['trec-dl-2020'].qrels_iter()})
    subsets['trec-dl-2020/judged'] = Dataset(
        FilteredQueries(subsets['trec-dl-2020'].queries_handler(), dl20_judged),
        FilteredScoredDocs(subsets['trec-dl-2020'].scoreddocs_handler(), dl20_judged),
        subsets['trec-dl-2020'],
    )

    # split200 -- 200 queries held out from the training data for validation
    split200 = Lazy(lambda: SPLIT200_QIDS)
    subsets['train/split200-train'] = Dataset(
        FilteredQueries(subsets['train'].queries_handler(), split200, mode='exclude'),
        FilteredScoredDocs(subsets['train'].scoreddocs_handler(), split200, mode='exclude'),
        FilteredQrels(subsets['train'].qrels_handler(), split200, mode='exclude'),
        FilteredDocPairs(subsets['train'].docpairs_handler(), split200, mode='exclude'),
        subsets['train'],
    )
    subsets['train/split200-valid'] = Dataset(
        FilteredQueries(subsets['train'].queries_handler(), split200, mode='include'),
        FilteredScoredDocs(subsets['train'].scoreddocs_handler(), split200, mode='include'),
        FilteredQrels(subsets['train'].qrels_handler(), split200, mode='include'),
        FilteredDocPairs(subsets['train'].docpairs_handler(), split200, mode='include'),
        subsets['train'],
    )

    # Medical subset
    def train_med():
        with dlc['medmarco_ids'].stream() as stream:
            stream = codecs.getreader('utf8')(stream)
            return {l.rstrip() for l in stream}
    train_med = Lazy(train_med)
    subsets['train/medical'] = Dataset(
        FilteredQueries(subsets['train'].queries_handler(), train_med),
        FilteredScoredDocs(subsets['train'].scoreddocs_handler(), train_med),
        FilteredDocPairs(subsets['train'].docpairs_handler(), train_med),
        FilteredQrels(subsets['train'].qrels_handler(), train_med),
        subsets['train'],
    )

    # DL-Hard
    dl_hard_qrels_migrator = Migrator(
        base_path / 'trec-dl-hard' / 'irds_version.txt', 'v3',
        affected_files=[base_path / 'trec-dl-hard' / 'qrels'],
        message='Updating trec-dl-hard qrels')
    hard_qids = Lazy(lambda: DL_HARD_QIDS)
    dl_hard_base_queries = TsvQueries([
            Cache(GzipExtract(dlc['trec-dl-2019/queries']), base_path / 'trec-dl-2019/queries.tsv'),
            Cache(GzipExtract(dlc['trec-dl-2020/queries']), base_path / 'trec-dl-2020/queries.tsv'),
        ], namespace='msmarco', lang='en')
    subsets['trec-dl-hard'] = Dataset(
        collection,
        FilteredQueries(dl_hard_base_queries, hard_qids),
        dl_hard_qrels_migrator(TrecQrels(dlc['trec-dl-hard/qrels'], TREC_DL_QRELS_DEFS)),
        documentation('trec-dl-hard'))
    hard_qids = Lazy(lambda: DL_HARD_QIDS_BYFOLD['1'])
    subsets['trec-dl-hard/fold1'] = Dataset(
        collection,
        FilteredQueries(dl_hard_base_queries, hard_qids),
        FilteredQrels(subsets['trec-dl-hard'], hard_qids),
        documentation('trec-dl-hard/fold1'))
    hard_qids = Lazy(lambda: DL_HARD_QIDS_BYFOLD['2'])
    subsets['trec-dl-hard/fold2'] = Dataset(
        collection,
        FilteredQueries(dl_hard_base_queries, hard_qids),
        FilteredQrels(subsets['trec-dl-hard'], hard_qids),
        documentation('trec-dl-hard/fold2'))
    hard_qids = Lazy(lambda: DL_HARD_QIDS_BYFOLD['3'])
    subsets['trec-dl-hard/fold3'] = Dataset(
        collection,
        FilteredQueries(dl_hard_base_queries, hard_qids),
        FilteredQrels(subsets['trec-dl-hard'], hard_qids),
        documentation('trec-dl-hard/fold3'))
    hard_qids = Lazy(lambda: DL_HARD_QIDS_BYFOLD['4'])
    subsets['trec-dl-hard/fold4'] = Dataset(
        collection,
        FilteredQueries(dl_hard_base_queries, hard_qids),
        FilteredQrels(subsets['trec-dl-hard'], hard_qids),
        documentation('trec-dl-hard/fold4'))
    hard_qids = Lazy(lambda: DL_HARD_QIDS_BYFOLD['5'])
    subsets['trec-dl-hard/fold5'] = Dataset(
        collection,
        FilteredQueries(dl_hard_base_queries, hard_qids),
        FilteredQrels(subsets['trec-dl-hard'], hard_qids),
        documentation('trec-dl-hard/fold5'))

    ir_datasets.registry.register(NAME, Dataset(collection, documentation('_')))
    for s in sorted(subsets):
        ir_datasets.registry.register(f'{NAME}/{s}', Dataset(subsets[s], documentation(s)))
    return collection, subsets
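# The */judged subsets above combine FilteredQueries/FilteredScoredDocs with a
# Lazy qid set, so the qrels file is only read the first time the filter is
# actually needed. A hedged sketch of consuming one of them:
def _example_judged_usage():
    import ir_datasets
    dataset = ir_datasets.load('msmarco-passage/trec-dl-2019/judged')
    for scoreddoc in dataset.scoreddocs_iter():
        pass  # every scoreddoc.query_id here has at least one relevance judgment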
def _init():
    documentation = YamlDocumentation(f'docs/{NAME}.yaml')
    base_path = ir_datasets.util.home_path() / NAME
    dlc = DownloadConfig.context(NAME, base_path)
    subsets = {}
    docs_dlc = dlc['docs']
    docs_chk_dlc = TarExtractAll(dlc['docs.chk'], base_path / 'corpus.chk')
    b13_dlc = Bz2Extract(
        Cache(
            TarExtract(dlc['cw12b-info'], 'ClueWeb12-CreateB13/software/CreateClueWeb12B13Dataset.jar'),
            base_path / 'CreateClueWeb12B13Dataset.jar'))
    collection = ClueWeb12Docs(docs_dlc, docs_chk_dlc)
    collection_b13 = ClueWeb12Docs(ClueWeb12b13Extractor(docs_dlc, b13_dlc))
    base = Dataset(collection, documentation('_'))
    subsets['b13'] = Dataset(collection_b13, documentation('b13'))
    subsets['trec-web-2013'] = Dataset(
        collection,
        TrecXmlQueries(dlc['trec-web-2013/queries'], qtype=TrecWebTrackQuery, namespace='trec-web', lang='en'),
        TrecQrels(dlc['trec-web-2013/qrels.adhoc'], QREL_DEFS),
        documentation('trec-web-2013'))
    subsets['trec-web-2014'] = Dataset(
        collection,
        TrecXmlQueries(dlc['trec-web-2014/queries'], qtype=TrecWebTrackQuery, namespace='trec-web', lang='en'),
        TrecQrels(dlc['trec-web-2014/qrels.adhoc'], QREL_DEFS),
        documentation('trec-web-2014'))
    subsets['b13/ntcir-www-1'] = Dataset(
        collection_b13,
        TrecXmlQueries(
            Cache(ZipExtract(dlc['ntcir-www-1/queries'], 'eng.queries.xml'), base_path / 'ntcir-www-1' / 'queries.xml'),
            qtype=GenericQuery, qtype_map={'qid': 'query_id', 'content': 'text'}, namespace='ntcir-www', lang='en'),
        NtcirQrels(dlc['ntcir-www-1/qrels'], NTCIR_QREL_DEFS),
        documentation('ntcir-www-1'))
    subsets['b13/ntcir-www-2'] = Dataset(
        collection_b13,
        TrecXmlQueries(
            Cache(ZipExtract(dlc['ntcir-www-2/queries'], 'qEng.xml'), base_path / 'ntcir-www-2' / 'queries.xml'),
            qtype=NtcirQuery, qtype_map=ntcir_map, namespace='ntcir-www', lang='en'),
        NtcirQrels(dlc['ntcir-www-2/qrels'], NTCIR_QREL_DEFS),
        documentation('ntcir-www-2'))
    subsets['b13/ntcir-www-3'] = Dataset(
        collection_b13,
        TrecXmlQueries(dlc['ntcir-www-3/queries'], qtype=NtcirQuery, qtype_map=ntcir_map, namespace='ntcir-www', lang='en'),
        documentation('ntcir-www-3'))
    subsets['b13/trec-misinfo-2019'] = Dataset(
        collection_b13,
        TrecXmlQueries(dlc['trec-misinfo-2019/queries'], qtype=MisinfoQuery, qtype_map=misinfo_map, namespace='trec-misinfo-2019', lang='en'),
        MsinfoQrels(dlc['trec-misinfo-2019/qrels'], MISINFO_QREL_DEFS),
        documentation('trec-misinfo-2019'))
    subsets['b13/clef-ehealth'] = Dataset(
        collection_b13,
        TrecXmlQueries(FixAmp(dlc['clef-ehealth/queries']), qtype=GenericQuery, qtype_map=ehealth_map, namespace='clef-ehealth', lang='en'),
        EhealthQrels(
            [dlc['clef-ehealth/2016.qrels'], dlc['clef-ehealth/2017.qrels']],
            [dlc['clef-ehealth/2016.qtrust'], dlc['clef-ehealth/2017.qtrust']],
            [dlc['clef-ehealth/2016.qunder'], dlc['clef-ehealth/2017.qreads']],
            EHEALTH_QREL_DEFS),
        documentation('clef-ehealth'))
    # Translated query variants of the CLEF eHealth topics; each shares the
    # same qrels, with a per-language query_id suffix.
    for lang in ['cs', 'de', 'fr', 'hu', 'pl', 'sv']:
        subsets[f'b13/clef-ehealth/{lang}'] = Dataset(
            collection_b13,
            TrecXmlQueries(FixAmp(dlc[f'clef-ehealth/queries/{lang}']), qtype=GenericQuery, qtype_map=ehealth_map, namespace='clef-ehealth', lang=lang),
            EhealthQrels(
                [dlc['clef-ehealth/2016.qrels'], dlc['clef-ehealth/2017.qrels']],
                [dlc['clef-ehealth/2016.qtrust'], dlc['clef-ehealth/2017.qtrust']],
                [dlc['clef-ehealth/2016.qunder'], dlc['clef-ehealth/2017.qreads']],
                EHEALTH_QREL_DEFS, query_id_suffix=f'-{lang}'),
            documentation(f'clef-ehealth/{lang}'))

    # NOTE: the following datasets are defined in touche.py:
    # - clueweb12/touche-2020-task-2
    # - clueweb12/touche-2021-task-2

    ir_datasets.registry.register(NAME, base)
    for s in sorted(subsets):
        ir_datasets.registry.register(f'{NAME}/{s}', subsets[s])
    return base, subsets
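# ClueWeb12 itself is not freely downloadable (docs_dlc points at a local copy
# of the corpus), so a common access pattern is random lookup rather than a
# full scan. A hedged sketch using the public docs_store API; the doc_id is
# illustrative only.
def _example_clueweb12_lookup():
    import ir_datasets
    docstore = ir_datasets.load('clueweb12').docs_store()
    return docstore.get('clueweb12-0000tw-00-00000')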
def _init():
    base_path = ir_datasets.util.home_path() / NAME
    dlc = DownloadConfig.context(NAME, base_path)
    documentation = YamlDocumentation(f'docs/{NAME}.yaml')
    main_dlc = dlc['main']
    collection = TsvDocs(
        Cache(TarExtract(main_dlc, 'nfcorpus/raw/doc_dump.txt'), base_path / 'collection.tsv'),
        doc_cls=NfCorpusDoc, namespace=NAME)
    subsets = {}

    def read_lines(file):
        file = Cache(TarExtract(main_dlc, f'nfcorpus/raw/{file}'), base_path / file)
        with file.stream() as stream:
            stream = codecs.getreader('utf8')(stream)
            return {l.rstrip() for l in stream}

    nontopic_qid_filter = Lazy(lambda: read_lines('nontopics.ids'))
    video_qid_filter = Lazy(lambda: read_lines('all_videos.ids'))
    subsets['train'] = Dataset(
        collection,
        ZipQueries([
            TsvQueries(Cache(TarExtract(main_dlc, 'nfcorpus/train.titles.queries'), base_path / 'train/queries.titles.tsv'), namespace=NAME),
            TsvQueries(Cache(TarExtract(main_dlc, 'nfcorpus/train.all.queries'), base_path / 'train/queries.all.tsv'), namespace=NAME),
        ], [(0, 0), (0, 1), (1, 1)], NfCorpusQuery),
        TrecQrels(Cache(TarExtract(main_dlc, 'nfcorpus/train.3-2-1.qrel'), base_path / 'train/qrels'), QRELS_DEFS),
        documentation('train'),
    )
    subsets['train/nontopic'] = Dataset(
        collection,
        TsvQueries(Cache(TarExtract(main_dlc, 'nfcorpus/train.nontopic-titles.queries'), base_path / 'train/nontopic/queries.tsv'), namespace=NAME),
        FilteredQrels(subsets['train'].qrels_handler(), nontopic_qid_filter, mode='include'),
        documentation('train/nontopic'),
    )
    subsets['train/video'] = Dataset(
        collection,
        ZipQueries([
            TsvQueries(Cache(TarExtract(main_dlc, 'nfcorpus/train.vid-titles.queries'), base_path / 'train/video/queries.titles.tsv'), namespace=NAME),
            TsvQueries(Cache(TarExtract(main_dlc, 'nfcorpus/train.vid-desc.queries'), base_path / 'train/video/queries.desc.tsv'), namespace=NAME),
        ], [(0, 0), (0, 1), (1, 1)], NfCorpusVideoQuery),
        TsvQueries(Cache(TarExtract(main_dlc, 'nfcorpus/train.nontopic-titles.queries'), base_path / 'train/video/queries.tsv'), NfCorpusVideoQuery, namespace=NAME),
        FilteredQrels(subsets['train'].qrels_handler(), video_qid_filter, mode='include'),
        documentation('train/video'),
    )
    subsets['dev'] = Dataset(
        collection,
        ZipQueries([
            TsvQueries(Cache(TarExtract(main_dlc, 'nfcorpus/dev.titles.queries'), base_path / 'dev/queries.titles.tsv'), namespace=NAME),
            TsvQueries(Cache(TarExtract(main_dlc, 'nfcorpus/dev.all.queries'), base_path / 'dev/queries.all.tsv'), namespace=NAME),
        ], [(0, 0), (0, 1), (1, 1)], NfCorpusQuery),
        TrecQrels(Cache(TarExtract(main_dlc, 'nfcorpus/dev.3-2-1.qrel'), base_path / 'dev/qrels'), QRELS_DEFS),
        documentation('dev'),
    )
    subsets['dev/nontopic'] = Dataset(
        collection,
        TsvQueries(Cache(TarExtract(main_dlc, 'nfcorpus/dev.nontopic-titles.queries'), base_path / 'dev/nontopic/queries.tsv'), namespace=NAME),
        FilteredQrels(subsets['dev'].qrels_handler(), nontopic_qid_filter, mode='include'),
        documentation('dev/nontopic'),
    )
    subsets['dev/video'] = Dataset(
        collection,
        ZipQueries([
            TsvQueries(Cache(TarExtract(main_dlc, 'nfcorpus/dev.vid-titles.queries'), base_path / 'dev/video/queries.titles.tsv'), namespace=NAME),
            TsvQueries(Cache(TarExtract(main_dlc, 'nfcorpus/dev.vid-desc.queries'), base_path / 'dev/video/queries.desc.tsv'), namespace=NAME),
        ], [(0, 0), (0, 1), (1, 1)], NfCorpusVideoQuery),
        TsvQueries(Cache(TarExtract(main_dlc, 'nfcorpus/dev.nontopic-titles.queries'), base_path / 'dev/video/queries.tsv'), NfCorpusVideoQuery, namespace=NAME),
        FilteredQrels(subsets['dev'].qrels_handler(), video_qid_filter, mode='include'),
        documentation('dev/video'),
    )
    subsets['test'] = Dataset(
        collection,
        ZipQueries([
            TsvQueries(Cache(TarExtract(main_dlc, 'nfcorpus/test.titles.queries'), base_path / 'test/queries.titles.tsv'), namespace=NAME),
            TsvQueries(Cache(TarExtract(main_dlc, 'nfcorpus/test.all.queries'), base_path / 'test/queries.all.tsv'), namespace=NAME),
        ], [(0, 0), (0, 1), (1, 1)], NfCorpusQuery),
        TrecQrels(Cache(TarExtract(main_dlc, 'nfcorpus/test.3-2-1.qrel'), base_path / 'test/qrels'), QRELS_DEFS),
        documentation('test'),
    )
    subsets['test/nontopic'] = Dataset(
        collection,
        TsvQueries(Cache(TarExtract(main_dlc, 'nfcorpus/test.nontopic-titles.queries'), base_path / 'test/nontopic/queries.tsv'), namespace=NAME),
        FilteredQrels(subsets['test'].qrels_handler(), nontopic_qid_filter, mode='include'),
        documentation('test/nontopic'),
    )
    subsets['test/video'] = Dataset(
        collection,
        ZipQueries([
            TsvQueries(Cache(TarExtract(main_dlc, 'nfcorpus/test.vid-titles.queries'), base_path / 'test/video/queries.titles.tsv'), namespace=NAME),
            TsvQueries(Cache(TarExtract(main_dlc, 'nfcorpus/test.vid-desc.queries'), base_path / 'test/video/queries.desc.tsv'), namespace=NAME),
        ], [(0, 0), (0, 1), (1, 1)], NfCorpusVideoQuery),
        TsvQueries(Cache(TarExtract(main_dlc, 'nfcorpus/test.nontopic-titles.queries'), base_path / 'test/video/queries.tsv'), NfCorpusVideoQuery, namespace=NAME),
        FilteredQrels(subsets['test'].qrels_handler(), video_qid_filter, mode='include'),
        documentation('test/video'),
    )
    ir_datasets.registry.register(NAME, Dataset(collection, documentation('_')))
    for s in sorted(subsets):
        ir_datasets.registry.register(f'{NAME}/{s}', subsets[s])
    return collection, subsets
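# ZipQueries above stitches one query record out of two aligned TSV files:
# (0, 0) takes query_id from file 0, (0, 1) the title from file 0, and (1, 1)
# the full-text field from file 1. A hedged sketch of reading the merged
# fields (assuming NfCorpusQuery exposes query_id/title/all, as the
# (file, column) spec implies):
def _example_nfcorpus_queries():
    import ir_datasets
    for query in ir_datasets.load('nfcorpus/train').queries_iter():
        print(query.query_id, query.title, query.all)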
    def docs_namespace(self):  # NOTE: method name assumed; the fragment begins mid-class
        return 'trec-robust04'

    def docs_lang(self):
        return 'en'


DL_ANSERINI_ROBUST04 = ir_datasets.util.Download(
    [ir_datasets.util.RequestsDownload(
        'https://git.uwaterloo.ca/jimmylin/anserini-indexes/raw/master/index-robust04-20191213.tar.gz')],
    expected_md5='15f3d001489c97849a010b0a4734d018')
# Re-bind: extract the Lucene stored-fields (.fdt) file from the archive.
DL_ANSERINI_ROBUST04 = Cache(
    TarExtract(DL_ANSERINI_ROBUST04, 'index-robust04-20191213/_h.fdt'),
    base_path / 'lucene_source.fdt')
collection = AnseriniRobustDocs(DL_ANSERINI_ROBUST04)
for ds_name in ['trec-robust04', 'trec-robust04/fold1', 'trec-robust04/fold2',
                'trec-robust04/fold3', 'trec-robust04/fold4', 'trec-robust04/fold5']:
    main_ds = ir_datasets.load(ds_name)
    dataset = ir_datasets.Dataset(
        collection,
        main_ds.queries_handler(),
        main_ds.qrels_handler(),
    )
def _init():
    subsets = {}
    base_path = ir_datasets.util.home_path() / NAME
    dlc = DownloadConfig.context(NAME, base_path)
    documentation = YamlDocumentation(f'docs/{NAME}.yaml')
    collection = TrecDocs(
        dlc['benchmark'], parser='tut', path_globs=['**/docs_grp_*.txt'],
        namespace=NAME, lang='en', count_hint=ir_datasets.util.count_hint(NAME))
    topics_and_qrels = TarExtractAll(
        dlc['benchmark'], base_path / "topics_and_qrels",
        path_globs=['**/topics.*.txt', '**/qrels.*.txt'])
    val_runs = TarExtractAll(
        dlc['dlfiles'], base_path / "val_runs",
        path_globs=['**/run.trip.BM25.*.val.txt'])
    test_runs = TarExtractAll(
        dlc['dlfiles_runs_test'], base_path / "test_runs",
        path_globs=['**/run.trip.BM25.*.test.txt'])
    base = Dataset(collection, documentation('_'))
    subsets['logs'] = Dataset(
        TsvDocs(
            Cache(
                FixAllarticles(TarExtract(dlc['logs'], 'logs/allarticles.txt')),
                base_path / 'allarticles-fixed.tsv'),
            doc_cls=TripClickPartialDoc, lang='en',
            count_hint=ir_datasets.util.count_hint(f'{NAME}/logs')),
        TripClickQlogs(TarExtractAll(dlc['logs'], base_path / 'logs', path_globs=['**/*.json'])),
        documentation('logs'))

    ### Train
    subsets['train/head'] = Dataset(
        collection,
        TrecQueries(
            RelativePath(topics_and_qrels, 'benchmark/topics/topics.head.train.txt'),
            qtype=GenericQuery, qtype_map=QTYPE_MAP, namespace=NAME, lang='en'),
        TrecQrels(RelativePath(topics_and_qrels, 'benchmark/qrels/qrels.raw.head.train.txt'), QREL_DEFS),
        documentation('train/head'))
    subsets['train/head/dctr'] = Dataset(
        TrecQrels(RelativePath(topics_and_qrels, 'benchmark/qrels/qrels.dctr.head.train.txt'), QREL_DCTR_DEFS),
        subsets['train/head'],
        documentation('train/head/dctr'))
    subsets['train/torso'] = Dataset(
        collection,
        TrecQueries(
            RelativePath(topics_and_qrels, 'benchmark/topics/topics.torso.train.txt'),
            qtype=GenericQuery, qtype_map=QTYPE_MAP, namespace=NAME, lang='en'),
        TrecQrels(RelativePath(topics_and_qrels, 'benchmark/qrels/qrels.raw.torso.train.txt'), QREL_DEFS),
        documentation('train/torso'))
    subsets['train/tail'] = Dataset(
        collection,
        TrecQueries(
            RelativePath(topics_and_qrels, 'benchmark/topics/topics.tail.train.txt'),
            qtype=GenericQuery, qtype_map=QTYPE_MAP, namespace=NAME, lang='en'),
        TrecQrels(RelativePath(topics_and_qrels, 'benchmark/qrels/qrels.raw.tail.train.txt'), QREL_DEFS),
        documentation('train/tail'))
    train_queries = ConcatQueries([
        subsets['train/head'].queries_handler(),
        subsets['train/torso'].queries_handler(),
        subsets['train/tail'].queries_handler(),
    ])
    train_docpairs = DocPairGenerator(
        TarExtract(dlc['dlfiles'], 'dlfiles/triples.train.tsv'),
        collection, train_queries, base_path / 'train.docpairs')
    subsets['train'] = Dataset(
        collection,
        train_queries,
        ConcatQrels([
            subsets['train/head'].qrels_handler(),
            subsets['train/torso'].qrels_handler(),
            subsets['train/tail'].qrels_handler(),
        ]),
        TsvDocPairs(train_docpairs),
        documentation('train'))
    subsets['train/hofstaetter-triples'] = Dataset(
        collection,
        train_queries,
        subsets['train'].qrels_handler(),
        TsvDocPairs(dlc['hofstaetter-triples']),
        documentation('train/hofstaetter-triples'))

    ### Val
    subsets['val/head'] = Dataset(
        collection,
        TrecQueries(
            RelativePath(topics_and_qrels, 'benchmark/topics/topics.head.val.txt'),
            qtype=GenericQuery, qtype_map=QTYPE_MAP, namespace=NAME, lang='en'),
        TrecQrels(RelativePath(topics_and_qrels, 'benchmark/qrels/qrels.raw.head.val.txt'), QREL_DEFS),
        TrecScoredDocs(RelativePath(val_runs, 'dlfiles/run.trip.BM25.head.val.txt')),
        documentation('val/head'))
    subsets['val/head/dctr'] = Dataset(
        TrecQrels(RelativePath(topics_and_qrels, 'benchmark/qrels/qrels.dctr.head.val.txt'), QREL_DCTR_DEFS),
        subsets['val/head'],
        documentation('val/head/dctr'))
    subsets['val/torso'] = Dataset(
        collection,
        TrecQueries(
            RelativePath(topics_and_qrels, 'benchmark/topics/topics.torso.val.txt'),
            qtype=GenericQuery, qtype_map=QTYPE_MAP, namespace=NAME, lang='en'),
        TrecQrels(RelativePath(topics_and_qrels, 'benchmark/qrels/qrels.raw.torso.val.txt'), QREL_DEFS),
        TrecScoredDocs(RelativePath(val_runs, 'dlfiles/run.trip.BM25.torso.val.txt')),
        documentation('val/torso'))
    subsets['val/tail'] = Dataset(
        collection,
        TrecQueries(
            RelativePath(topics_and_qrels, 'benchmark/topics/topics.tail.val.txt'),
            qtype=GenericQuery, qtype_map=QTYPE_MAP, namespace=NAME, lang='en'),
        TrecQrels(RelativePath(topics_and_qrels, 'benchmark/qrels/qrels.raw.tail.val.txt'), QREL_DEFS),
        TrecScoredDocs(RelativePath(val_runs, 'dlfiles/run.trip.BM25.tail.val.txt')),
        documentation('val/tail'))
    subsets['val'] = Dataset(
        collection,
        ConcatQueries([
            subsets['val/head'].queries_handler(),
            subsets['val/torso'].queries_handler(),
            subsets['val/tail'].queries_handler(),
        ]),
        ConcatQrels([
            subsets['val/head'].qrels_handler(),
            subsets['val/torso'].qrels_handler(),
            subsets['val/tail'].qrels_handler(),
        ]),
        ConcatScoreddocs([
            subsets['val/head'].scoreddocs_handler(),
            subsets['val/torso'].scoreddocs_handler(),
            subsets['val/tail'].scoreddocs_handler(),
        ]),
        documentation('val'))

    ### Test
    subsets['test/head'] = Dataset(
        collection,
        TrecQueries(
            RelativePath(topics_and_qrels, 'benchmark/topics/topics.head.test.txt'),
            qtype=GenericQuery, qtype_map=QTYPE_MAP, namespace=NAME, lang='en'),
        TrecScoredDocs(RelativePath(test_runs, 'runs_test/run.trip.BM25.head.test.txt')),
        documentation('test/head'))
    subsets['test/torso'] = Dataset(
        collection,
        TrecQueries(
            RelativePath(topics_and_qrels, 'benchmark/topics/topics.torso.test.txt'),
            qtype=GenericQuery, qtype_map=QTYPE_MAP, namespace=NAME, lang='en'),
        TrecScoredDocs(RelativePath(test_runs, 'runs_test/run.trip.BM25.torso.test.txt')),
        documentation('test/torso'))
    subsets['test/tail'] = Dataset(
        collection,
        TrecQueries(
            RelativePath(topics_and_qrels, 'benchmark/topics/topics.tail.test.txt'),
            qtype=GenericQuery, qtype_map=QTYPE_MAP, namespace=NAME, lang='en'),
        TrecScoredDocs(RelativePath(test_runs, 'runs_test/run.trip.BM25.tail.test.txt')),
        documentation('test/tail'))
    subsets['test'] = Dataset(
        collection,
        ConcatQueries([
            subsets['test/head'].queries_handler(),
            subsets['test/torso'].queries_handler(),
            subsets['test/tail'].queries_handler(),
        ]),
        ConcatScoreddocs([
            subsets['test/head'].scoreddocs_handler(),
            subsets['test/torso'].scoreddocs_handler(),
            subsets['test/tail'].scoreddocs_handler(),
        ]),
        documentation('test'))

    ir_datasets.registry.register(NAME, base)
    for s in sorted(subsets):
        ir_datasets.registry.register(f'{NAME}/{s}', subsets[s])
    return base, subsets
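# A hedged sketch of consuming the training triples assembled above (assuming
# NAME == 'tripclick'); docpairs_iter() yields query/positive/negative id
# triples suitable for pairwise training.
def _example_tripclick_triples():
    import ir_datasets
    for docpair in ir_datasets.load('tripclick/train').docpairs_iter():
        print(docpair.query_id, docpair.doc_id_a, docpair.doc_id_b)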
def _init():
    base_path = ir_datasets.util.home_path() / NAME
    documentation = YamlDocumentation(f'docs/{NAME}.yaml')
    dlc = DownloadConfig.context(NAME, base_path, dua=DUA)
    subsets = {}
    collection = MsMarcoV2Docs(dlc['docs'])
    subsets['train'] = Dataset(
        collection,
        TsvQueries(dlc['train_queries'], namespace='msmarco', lang='en'),
        TrecQrels(dlc['train_qrels'], QRELS_DEFS),
        TrecScoredDocs(GzipExtract(dlc['train_scoreddocs'])),
    )
    subsets['dev1'] = Dataset(
        collection,
        TsvQueries(dlc['dev1_queries'], namespace='msmarco', lang='en'),
        TrecQrels(dlc['dev1_qrels'], QRELS_DEFS),
        TrecScoredDocs(GzipExtract(dlc['dev1_scoreddocs'])),
    )
    subsets['dev2'] = Dataset(
        collection,
        TsvQueries(dlc['dev2_queries'], namespace='msmarco', lang='en'),
        TrecQrels(dlc['dev2_qrels'], QRELS_DEFS),
        TrecScoredDocs(GzipExtract(dlc['dev2_scoreddocs'])),
    )
    subsets['trec-dl-2019'] = Dataset(
        collection,
        TsvQueries(GzipExtract(dlc['trec-dl-2019/queries']), namespace='msmarco', lang='en'),
        TrecQrels(GzipExtract(dlc['trec_dl_2019_qrels']), TREC_DL_QRELS_DEFS),
    )
    subsets['trec-dl-2020'] = Dataset(
        collection,
        TsvQueries(GzipExtract(dlc['trec-dl-2020/queries']), namespace='msmarco', lang='en'),
        TrecQrels(GzipExtract(dlc['trec_dl_2020_qrels']), TREC_DL_QRELS_DEFS),
    )
    dl19_v2_judged = Lazy(lambda: {q.query_id for q in subsets['trec-dl-2019'].qrels_iter()})
    subsets['trec-dl-2019/judged'] = Dataset(
        FilteredQueries(subsets['trec-dl-2019'].queries_handler(), dl19_v2_judged),
        subsets['trec-dl-2019'],
    )
    dl20_v2_judged = Lazy(lambda: {q.query_id for q in subsets['trec-dl-2020'].qrels_iter()})
    subsets['trec-dl-2020/judged'] = Dataset(
        FilteredQueries(subsets['trec-dl-2020'].queries_handler(), dl20_v2_judged),
        subsets['trec-dl-2020'],
    )
    subsets['trec-dl-2021'] = Dataset(
        collection,
        TsvQueries(dlc['trec-dl-2021/queries'], namespace='msmarco', lang='en'),
        TrecQrels(dlc['trec-dl-2021/qrels'], TREC_DL_QRELS_DEFS),
        TrecScoredDocs(GzipExtract(dlc['trec-dl-2021/scoreddocs'])),
    )
    dl21_judged = Lazy(lambda: {q.query_id for q in subsets['trec-dl-2021'].qrels_iter()})
    subsets['trec-dl-2021/judged'] = Dataset(
        FilteredQueries(subsets['trec-dl-2021'].queries_handler(), dl21_judged),
        FilteredScoredDocs(subsets['trec-dl-2021'].scoreddocs_handler(), dl21_judged),
        subsets['trec-dl-2021'],
    )
    subsets['anchor-text'] = Dataset(
        MsMarcoV2AnchorTextDocs(
            Cache(GzipExtract(dlc['anchor-text']), base_path / "anchor-text.json"),
            count_hint=4821244),
        documentation('anchor-text'))
    ir_datasets.registry.register(NAME, Dataset(collection, documentation("_")))
    for s in sorted(subsets):
        ir_datasets.registry.register(f'{NAME}/{s}', Dataset(subsets[s], documentation(s)))
    return collection, subsets
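# A hedged sketch (assuming NAME == 'msmarco-document-v2'): the anchor-text
# subset registered above exposes only docs, each aggregating the anchor texts
# that point at one document.
def _example_anchor_text():
    import ir_datasets
    for doc in ir_datasets.load('msmarco-document-v2/anchor-text').docs_iter()[:3]:
        print(doc.doc_id)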
def cached_zip_download(name: str, zip_path: str, extension: str) -> Cache:
    return Cache(ZipExtract(download_config[name], zip_path),
                 base_path / f"{name}.{extension}")


def cached_download(name: str, extension: str) -> Cache:
    return Cache(download_config[name], base_path / f"{name}.{extension}")
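# A hedged sketch of how these helpers would be called, with hypothetical
# config keys ('corpus') and archive member names for illustration. Both
# return Cache objects whose .stream() downloads/extracts on first use and
# reads the cached file thereafter; cached_download works the same way for
# files that need no zip extraction.
def _example_cached_helpers():
    docs_file = cached_zip_download('corpus', 'corpus.jsonl', 'jsonl')
    with docs_file.stream() as stream:
        return stream.readline()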
def _init():
    base_path = ir_datasets.util.home_path() / NAME
    documentation = YamlDocumentation(f'docs/{NAME}.yaml')
    dlc = DownloadConfig.context(NAME, base_path, dua=DUA)
    subsets = {}
    collection = MsMarcoTrecDocs(GzipExtract(dlc['docs']))
    subsets['train'] = Dataset(
        collection,
        TsvQueries(GzipExtract(dlc['train/queries']), namespace='msmarco', lang='en'),
        TrecQrels(GzipExtract(dlc['train/qrels']), QRELS_DEFS),
        TrecScoredDocs(GzipExtract(dlc['train/scoreddocs'])),
    )
    subsets['dev'] = Dataset(
        collection,
        TsvQueries(GzipExtract(dlc['dev/queries']), namespace='msmarco', lang='en'),
        TrecQrels(GzipExtract(dlc['dev/qrels']), QRELS_DEFS),
        TrecScoredDocs(GzipExtract(dlc['dev/scoreddocs'])),
    )
    subsets['eval'] = Dataset(
        collection,
        TsvQueries(GzipExtract(dlc['eval/queries']), namespace='msmarco', lang='en'),
        TrecScoredDocs(GzipExtract(dlc['eval/scoreddocs'])),
    )
    subsets['trec-dl-2019'] = Dataset(
        collection,
        TsvQueries(GzipExtract(dlc['trec-dl-2019/queries']), namespace='msmarco', lang='en'),
        TrecQrels(dlc['trec-dl-2019/qrels'], TREC_DL_QRELS_DEFS),
        TrecScoredDocs(GzipExtract(dlc['trec-dl-2019/scoreddocs'])),
    )
    subsets['trec-dl-2020'] = Dataset(
        collection,
        TsvQueries(GzipExtract(dlc['trec-dl-2020/queries']), namespace='msmarco', lang='en'),
        TrecQrels(dlc['trec-dl-2020/qrels'], TREC_DL_QRELS_DEFS),
        TrecScoredDocs(GzipExtract(dlc['trec-dl-2020/scoreddocs'])),
    )
    subsets['orcas'] = Dataset(
        collection,
        TsvQueries(GzipExtract(dlc['orcas/queries']), namespace='orcas', lang='en'),
        TrecQrels(GzipExtract(dlc['orcas/qrels']), ORCAS_QLRES_DEFS),
        TrecScoredDocs(GzipExtract(dlc['orcas/scoreddocs'])),
    )
    dl19_judged = Lazy(lambda: {q.query_id for q in subsets['trec-dl-2019'].qrels_iter()})
    subsets['trec-dl-2019/judged'] = Dataset(
        FilteredQueries(subsets['trec-dl-2019'].queries_handler(), dl19_judged),
        FilteredScoredDocs(subsets['trec-dl-2019'].scoreddocs_handler(), dl19_judged),
        subsets['trec-dl-2019'],
    )
    dl20_judged = Lazy(lambda: {q.query_id for q in subsets['trec-dl-2020'].qrels_iter()})
    subsets['trec-dl-2020/judged'] = Dataset(
        FilteredQueries(subsets['trec-dl-2020'].queries_handler(), dl20_judged),
        FilteredScoredDocs(subsets['trec-dl-2020'].scoreddocs_handler(), dl20_judged),
        subsets['trec-dl-2020'],
    )

    # DL-Hard
    dl_hard_qrels_migrator = Migrator(
        base_path / 'trec-dl-hard' / 'irds_version.txt', 'v2',
        affected_files=[base_path / 'trec-dl-hard' / 'qrels'],
        message='Updating trec-dl-hard qrels')
    hard_qids = Lazy(lambda: DL_HARD_QIDS)
    dl_hard_base_queries = TsvQueries([
            Cache(GzipExtract(dlc['trec-dl-2019/queries']), base_path / 'trec-dl-2019/queries.tsv'),
            Cache(GzipExtract(dlc['trec-dl-2020/queries']), base_path / 'trec-dl-2020/queries.tsv'),
        ], namespace='msmarco', lang='en')
    subsets['trec-dl-hard'] = Dataset(
        collection,
        FilteredQueries(dl_hard_base_queries, hard_qids),
        dl_hard_qrels_migrator(TrecQrels(dlc['trec-dl-hard/qrels'], TREC_DL_QRELS_DEFS)),
        documentation('trec-dl-hard'))
    hard_qids = Lazy(lambda: DL_HARD_QIDS_BYFOLD['1'])
    subsets['trec-dl-hard/fold1'] = Dataset(
        collection,
        FilteredQueries(dl_hard_base_queries, hard_qids),
        FilteredQrels(subsets['trec-dl-hard'], hard_qids),
        documentation('trec-dl-hard/fold1'))
    hard_qids = Lazy(lambda: DL_HARD_QIDS_BYFOLD['2'])
    subsets['trec-dl-hard/fold2'] = Dataset(
        collection,
        FilteredQueries(dl_hard_base_queries, hard_qids),
        FilteredQrels(subsets['trec-dl-hard'], hard_qids),
        documentation('trec-dl-hard/fold2'))
    hard_qids = Lazy(lambda: DL_HARD_QIDS_BYFOLD['3'])
    subsets['trec-dl-hard/fold3'] = Dataset(
        collection,
        FilteredQueries(dl_hard_base_queries, hard_qids),
        FilteredQrels(subsets['trec-dl-hard'], hard_qids),
        documentation('trec-dl-hard/fold3'))
    hard_qids = Lazy(lambda: DL_HARD_QIDS_BYFOLD['4'])
    subsets['trec-dl-hard/fold4'] = Dataset(
        collection,
        FilteredQueries(dl_hard_base_queries, hard_qids),
        FilteredQrels(subsets['trec-dl-hard'], hard_qids),
        documentation('trec-dl-hard/fold4'))
    hard_qids = Lazy(lambda: DL_HARD_QIDS_BYFOLD['5'])
    subsets['trec-dl-hard/fold5'] = Dataset(
        collection,
        FilteredQueries(dl_hard_base_queries, hard_qids),
        FilteredQrels(subsets['trec-dl-hard'], hard_qids),
        documentation('trec-dl-hard/fold5'))

    subsets['anchor-text'] = Dataset(
        MsMarcoAnchorTextDocs(
            Cache(GzipExtract(dlc['anchor-text']), base_path / "anchor-text.json"),
            count_hint=1703834),
        documentation('anchor-text'))
    ir_datasets.registry.register(NAME, Dataset(collection, documentation("_")))
    for s in sorted(subsets):
        ir_datasets.registry.register(f'{NAME}/{s}', Dataset(subsets[s], documentation(s)))
    return collection, subsets
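# ORCAS wires click-derived labels onto the same document collection. A hedged
# usage sketch (assuming NAME == 'msmarco-document'):
def _example_orcas_usage():
    import ir_datasets
    for qrel in ir_datasets.load('msmarco-document/orcas').qrels_iter():
        pass  # click-based relevance labels over the msmarco-document corpus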
def _init():
    documentation = YamlDocumentation('docs/msmarco-passage.yaml')
    base_path = ir_datasets.util.home_path() / 'msmarco-passage'
    dlc = DownloadConfig.context('msmarco-passage', base_path, dua=DUA)
    collection = TsvDocs(
        Cache(
            FixEncoding(TarExtract(dlc['collectionandqueries'], 'collection.tsv')),
            base_path / 'collection.tsv'),
        namespace='msmarco')
    subsets = {}
    subsets['train'] = Dataset(
        collection,
        TsvQueries(
            Cache(TarExtract(dlc['queries'], 'queries.train.tsv'), base_path / 'train/queries.tsv'),
            namespace='msmarco'),
        TrecQrels(dlc['train/qrels'], QRELS_DEFS),
        TsvDocPairs(GzipExtract(dlc['train/docpairs'])),
        TrecScoredDocs(
            Cache(
                ExtractQidPid(TarExtract(dlc['train/scoreddocs'], 'top1000.train.txt')),
                base_path / 'train/ms.run')),
    )
    subsets['dev'] = Dataset(
        collection,
        TsvQueries(
            Cache(TarExtract(dlc['queries'], 'queries.dev.tsv'), base_path / 'dev/queries.tsv'),
            namespace='msmarco'),
        TrecQrels(dlc['dev/qrels'], QRELS_DEFS),
        TrecScoredDocs(
            Cache(
                ExtractQidPid(TarExtract(dlc['dev/scoreddocs'], 'top1000.dev')),
                base_path / 'dev/ms.run')),
    )
    subsets['dev/small'] = Dataset(
        collection,
        TsvQueries(
            Cache(
                TarExtract(dlc['collectionandqueries'], 'queries.dev.small.tsv'),
                base_path / 'dev/small/queries.tsv'),
            namespace='msmarco'),
        TrecQrels(
            Cache(
                TarExtract(dlc['collectionandqueries'], 'qrels.dev.small.tsv'),
                base_path / 'dev/small/qrels'),
            QRELS_DEFS),
    )
    subsets['eval'] = Dataset(
        collection,
        TsvQueries(
            Cache(TarExtract(dlc['queries'], 'queries.eval.tsv'), base_path / 'eval/queries.tsv'),
            namespace='msmarco'),
        TrecScoredDocs(
            Cache(
                ExtractQidPid(TarExtract(dlc['eval/scoreddocs'], 'top1000.eval')),
                base_path / 'eval/ms.run')),
    )
    subsets['eval/small'] = Dataset(
        collection,
        TsvQueries(
            Cache(
                TarExtract(dlc['collectionandqueries'], 'queries.eval.small.tsv'),
                base_path / 'eval/small/queries.tsv'),
            namespace='msmarco'),
    )
    subsets['trec-dl-2019'] = Dataset(
        collection,
        TrecQrels(dlc['trec-dl-2019/qrels'], TREC_DL_QRELS_DEFS),
        TsvQueries(
            Cache(GzipExtract(dlc['trec-dl-2019/queries']), base_path / 'trec-dl-2019/queries.tsv'),
            namespace='msmarco'),
        TrecScoredDocs(
            Cache(
                ExtractQidPid(GzipExtract(dlc['trec-dl-2019/scoreddocs'])),
                base_path / 'trec-dl-2019/ms.run')),
    )
    subsets['trec-dl-2020'] = Dataset(
        collection,
        TsvQueries(GzipExtract(dlc['trec-dl-2020/queries']), namespace='msmarco'),
        TrecScoredDocs(
            Cache(
                ExtractQidPid(GzipExtract(dlc['trec-dl-2020/scoreddocs'])),
                base_path / 'trec-dl-2020/ms.run')),
    )

    # A few subsets that are constrained to just the queries/qrels/docpairs
    # that have at least 1 relevance assessment
    train_judged = Lazy(lambda: {q.query_id for q in subsets['train'].qrels_iter()})
    subsets['train/judged'] = Dataset(
        FilteredQueries(subsets['train'].queries_handler(), train_judged),
        FilteredScoredDocs(subsets['train'].scoreddocs_handler(), train_judged),
        subsets['train'],
    )
    dev_judged = Lazy(lambda: {q.query_id for q in subsets['dev'].qrels_iter()})
    subsets['dev/judged'] = Dataset(
        FilteredQueries(subsets['dev'].queries_handler(), dev_judged),
        FilteredScoredDocs(subsets['dev'].scoreddocs_handler(), dev_judged),
        subsets['dev'],
    )
    dl19_judged = Lazy(lambda: {q.query_id for q in subsets['trec-dl-2019'].qrels_iter()})
    subsets['trec-dl-2019/judged'] = Dataset(
        FilteredQueries(subsets['trec-dl-2019'].queries_handler(), dl19_judged),
        FilteredScoredDocs(subsets['trec-dl-2019'].scoreddocs_handler(), dl19_judged),
        subsets['trec-dl-2019'],
    )

    # split200 -- 200 queries held out from the training data for validation
    split200 = Lazy(lambda: SPLIT200_QIDS)
    subsets['train/split200-train'] = Dataset(
        FilteredQueries(subsets['train'].queries_handler(), split200, mode='exclude'),
        FilteredScoredDocs(subsets['train'].scoreddocs_handler(), split200, mode='exclude'),
        FilteredQrels(subsets['train'].qrels_handler(), split200, mode='exclude'),
        FilteredDocPairs(subsets['train'].docpairs_handler(), split200, mode='exclude'),
        subsets['train'],
    )
    subsets['train/split200-valid'] = Dataset(
        FilteredQueries(subsets['train'].queries_handler(), split200, mode='include'),
        FilteredScoredDocs(subsets['train'].scoreddocs_handler(), split200, mode='include'),
        FilteredQrels(subsets['train'].qrels_handler(), split200, mode='include'),
        FilteredDocPairs(subsets['train'].docpairs_handler(), split200, mode='include'),
        subsets['train'],
    )

    # Medical subset
    def train_med():
        with dlc['medmarco_ids'].stream() as stream:
            stream = codecs.getreader('utf8')(stream)
            return {l.rstrip() for l in stream}
    train_med = Lazy(train_med)
    subsets['train/medical'] = Dataset(
        FilteredQueries(subsets['train'].queries_handler(), train_med),
        FilteredScoredDocs(subsets['train'].scoreddocs_handler(), train_med),
        FilteredDocPairs(subsets['train'].docpairs_handler(), train_med),
        FilteredQrels(subsets['train'].qrels_handler(), train_med),
        subsets['train'],
    )

    ir_datasets.registry.register('msmarco-passage', Dataset(collection, documentation('_')))
    for s in sorted(subsets):
        ir_datasets.registry.register(f'msmarco-passage/{s}', Dataset(subsets[s], documentation(s)))
    return collection, subsets
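# The split200 subsets carve a fixed 200-query validation split out of train
# by excluding/including SPLIT200_QIDS. A hedged usage sketch:
def _example_split200_usage():
    import ir_datasets
    valid = ir_datasets.load('msmarco-passage/train/split200-valid')
    print(sum(1 for _ in valid.queries_iter()))  # expected: 200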
def _init():
    base_path = ir_datasets.util.home_path() / NAME
    dlc = ir_datasets.util.DownloadConfig.context(NAME, base_path)
    documentation = YamlDocumentation(f'docs/{NAME}.yaml')
    base = Dataset(documentation('_'))
    subsets = {}
    benchmarks = {
        'msmarco': (['train', 'dev', 'test'], GenericDoc, GenericQuery),
        'trec-covid': (['test'], BeirCordDoc, BeirCovidQuery),
        'nfcorpus': (['train', 'dev', 'test'], BeirTitleUrlDoc, BeirUrlQuery),
        'nq': (['test'], BeirTitleDoc, GenericQuery),
        'hotpotqa': (['train', 'dev', 'test'], BeirTitleUrlDoc, GenericQuery),
        'fiqa': (['train', 'dev', 'test'], GenericDoc, GenericQuery),
        'arguana': (['test'], BeirTitleDoc, GenericQuery),
        'webis-touche2020': (['test'], BeirToucheDoc, BeirToucheQuery),
        'webis-touche2020/v2': (['test'], BeirToucheDoc, BeirToucheQuery),
        'quora': (['dev', 'test'], GenericDoc, GenericQuery),
        'dbpedia-entity': (['dev', 'test'], BeirTitleUrlDoc, GenericQuery),
        'scidocs': (['test'], BeirSciDoc, BeirSciQuery),
        'fever': (['train', 'dev', 'test'], BeirTitleDoc, GenericQuery),
        'climate-fever': (['test'], BeirTitleDoc, GenericQuery),
        'scifact': (['train', 'test'], BeirTitleDoc, GenericQuery),
    }
    for ds, (qrels, doc_type, query_type) in benchmarks.items():
        dlc_ds = dlc[ds]
        ds_zip = ds.split('/')[0]
        docs_migrator = Migrator(
            base_path / ds / 'irds_version.txt', 'v2',
            affected_files=[f'{base_path/ds}/docs.pklz4'],
            message=f'Migrating {NAME}/{ds} (structuring fields)')
        docs = docs_migrator(BeirDocs(ds, ZipExtract(dlc_ds, f'{ds_zip}/corpus.jsonl'), doc_type))
        queries = BeirQueries(
            ds,
            Cache(ZipExtract(dlc_ds, f'{ds_zip}/queries.jsonl'), base_path / ds / 'queries.json'),
            query_type)
        if len(qrels) == 1:
            subsets[ds] = Dataset(
                docs,
                queries,
                BeirQrels(
                    Cache(ZipExtract(dlc_ds, f'{ds_zip}/qrels/{qrels[0]}.tsv'), base_path / ds / f'{qrels[0]}.qrels'),
                    qrels_defs={}),
                documentation(ds))
        else:
            subsets[ds] = Dataset(docs, queries, documentation(ds))
            for qrel in qrels:
                subset_qrels = BeirQrels(
                    Cache(ZipExtract(dlc_ds, f'{ds_zip}/qrels/{qrel}.tsv'), base_path / ds / f'{qrel}.qrels'),
                    qrels_defs={})
                subset_qids = qid_filter(subset_qrels)
                subsets[f'{ds}/{qrel}'] = Dataset(
                    docs,
                    FilteredQueries(queries, subset_qids, mode='include'),
                    subset_qrels,
                    documentation(f'{ds}/{qrel}'))
    cqa = [
        'android', 'english', 'gaming', 'gis', 'mathematica', 'physics',
        'programmers', 'stats', 'tex', 'unix', 'webmasters', 'wordpress',
    ]
    cqa_dlc = dlc['cqadupstack']
    for ds in cqa:
        docs_migrator = Migrator(
            base_path / 'cqadupstack' / ds / 'irds_version.txt', 'v2',
            affected_files=[f'{base_path/"cqadupstack"/ds}/docs.pklz4'],
            message=f'Migrating {NAME}/cqadupstack/{ds} (structuring fields)')
        subsets[f'cqadupstack/{ds}'] = Dataset(
            docs_migrator(
                BeirDocs(f'cqadupstack/{ds}', ZipExtract(cqa_dlc, f'cqadupstack/{ds}/corpus.jsonl'), BeirCqaDoc)),
            BeirQueries(
                f'cqadupstack/{ds}',
                Cache(
                    ZipExtract(cqa_dlc, f'cqadupstack/{ds}/queries.jsonl'),
                    base_path / 'cqadupstack' / ds / 'queries.json'),
                BeirCqaQuery),
            BeirQrels(
                Cache(
                    ZipExtract(cqa_dlc, f'cqadupstack/{ds}/qrels/test.tsv'),
                    base_path / 'cqadupstack' / ds / 'test.qrels'),
                qrels_defs={}),
            documentation(f'cqadupstack/{ds}'))
    ir_datasets.registry.register(NAME, base)
    for s in sorted(subsets):
        ir_datasets.registry.register(f'{NAME}/{s}', subsets[s])
    return base, subsets
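# A hedged sketch of loading one of the per-split BEIR datasets registered
# above; 'beir/scifact/test' assumes NAME == 'beir' and follows the
# f'{ds}/{qrel}' naming used when a benchmark ships multiple qrels splits.
def _example_beir_usage():
    import ir_datasets
    dataset = ir_datasets.load('beir/scifact/test')
    for query in dataset.queries_iter():
        print(query.query_id, query.text)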