def _init():
    subsets = {}
    base_path = ir_datasets.util.home_path()/NAME
    dlc = DownloadConfig.context(NAME, base_path)
    documentation = YamlDocumentation(f'docs/{NAME}.yaml')

    collection = NytDocs(dlc['source'])
    base = Dataset(collection, documentation('_'))

    all_queries = NytQueries(collection)
    all_qrels = NytQrels(collection)
    match_qids = Lazy(lambda: VALID_IDS)
    subsets['train'] = Dataset(
        FilteredQueries(all_queries, match_qids, mode='exclude'),
        FilteredQrels(all_qrels, match_qids, mode='exclude'),
        collection,
        documentation('train'))
    subsets['valid'] = Dataset(
        FilteredQueries(all_queries, match_qids, mode='include'),
        FilteredQrels(all_qrels, match_qids, mode='include'),
        collection,
        documentation('valid'))

    ir_datasets.registry.register('nyt', base)
    for s in sorted(subsets):
        ir_datasets.registry.register(f'nyt/{s}', subsets[s])

    return base, subsets
def _init():
    subsets = {}
    base_path = ir_datasets.util.home_path() / NAME
    dlc = DownloadConfig.context(NAME, base_path)
    documentation = YamlDocumentation(f'docs/{NAME}.yaml')

    collection = TrecDocs(dlc['docs'], encoding='utf8', path_globs=[
        'aquaint_comp/apw/*/*.gz',
        'aquaint_comp/nyt/*/*.gz',
        'aquaint_comp/xie/*/*.gz',
    ], namespace=NAME, lang='en', count_hint=ir_datasets.util.count_hint(NAME))
    base = Dataset(collection, documentation('_'))

    subsets['trec-robust-2005'] = Dataset(
        TrecQueries(dlc['trec-robust-2005/queries'], qtype_map=QTYPE_MAP, namespace='trec-robust', lang='en'),
        TrecQrels(dlc['trec-robust-2005/qrels'], QREL_DEFS),
        collection,
        documentation('trec-robust-2005'))

    ir_datasets.registry.register(NAME, base)
    for s in sorted(subsets):
        ir_datasets.registry.register(f'{NAME}/{s}', subsets[s])

    return base, subsets
def _init():
    subsets = {}
    base_path = ir_datasets.util.home_path() / NAME
    dlc = DownloadConfig.context(NAME, base_path)
    documentation = YamlDocumentation(f'docs/{NAME}.yaml')

    collection = TrecDocs(
        dlc['docs'],
        encoding='utf8',
        path_globs=['arabic_newswire_a/transcripts/*/*.sgm.gz'],
        namespace=NAME)
    base = Dataset(collection, documentation('_'))

    subsets['ar2001'] = Dataset(
        TrecQueries(dlc['ar2001/queries'], qtype_map=QTYPE_MAP, encoding='ISO-8859-6', namespace=NAME),
        TrecQrels(dlc['ar2001/qrels'], QREL_DEFS),
        collection,
        documentation('ar2001'))
    subsets['ar2002'] = Dataset(
        TrecQueries(dlc['ar2002/queries'], qtype_map=QTYPE_MAP, encoding='ISO-8859-6', namespace=NAME),
        TrecQrels(dlc['ar2002/qrels'], QREL_DEFS),
        collection,
        documentation('ar2002'))

    ir_datasets.registry.register(NAME, base)
    for s in sorted(subsets):
        ir_datasets.registry.register(f'{NAME}/{s}', subsets[s])

    return base, subsets
def _init():
    subsets = {}
    base_path = ir_datasets.util.home_path() / NAME
    dlc = DownloadConfig.context(NAME, base_path)
    documentation = YamlDocumentation(f'docs/{NAME}.yaml')

    en_noclean_tr_collection = C4Docs(
        GzipExtract(dlc['en-noclean/sources']),
        TarExtractAll(dlc['en-noclean/checkpoints'], base_path / 'en.noclean.checkpoints'),
        base_path,
        source_name_filter=r'en\.noclean\.c4-train',
        filter_name='train')  # exclude validation files (only include train)
    base = Dataset(documentation('_'))

    subsets['en-noclean-tr'] = Dataset(en_noclean_tr_collection, documentation('en-noclean-tr'))
    subsets['en-noclean-tr/trec-misinfo-2021'] = Dataset(
        en_noclean_tr_collection,
        TrecXmlQueries(dlc['trec-misinfo-2021/queries'], qtype=MisinfoQuery, qtype_map=misinfo_map, namespace='trec-misinfo', lang='en'),
        documentation('en-noclean-tr/trec-misinfo-2021'))

    ir_datasets.registry.register(NAME, base)
    for subset in subsets:
        ir_datasets.registry.register(f'{NAME}/{subset}', subsets[subset])

    return base, subsets
def _init():
    documentation = YamlDocumentation(f'docs/{NAME}.yaml')
    base_path = ir_datasets.util.home_path() / NAME
    dlc = DownloadConfig.context(NAME, base_path, dua=DUA)
    subsets = {}

    collection = TrecDocs(dlc['docs'], path_globs=[
        '**/FBIS/FB*',
        '**/FR94/??/FR*',
        '**/FT/*/FT*',
        '**/LATIMES/LA*',
    ], namespace=NAME, lang='en', expected_file_count=2295, count_hint=ir_datasets.util.count_hint(NAME))
    queries = TrecQueries(GzipExtract(dlc['queries']), namespace=NAME, lang='en')
    qrels = TrecQrels(dlc['qrels'], QREL_DEFS)

    base = Dataset(collection, queries, qrels, documentation('_'))

    for fold in FOLDS:
        qid_filter = make_filter(fold)
        subsets[fold] = Dataset(
            FilteredQueries(queries, qid_filter),
            FilteredQrels(qrels, qid_filter),
            collection,
            documentation(fold))

    ir_datasets.registry.register(NAME, base)
    for s in sorted(subsets):
        ir_datasets.registry.register(f'{NAME}/{s}', subsets[s])

    return base, subsets
def _init():
    documentation = YamlDocumentation(f'docs/{NAME}.yaml')
    base_path = ir_datasets.util.home_path() / NAME
    dlc = DownloadConfig.context(NAME, base_path)
    subsets = {}

    collection = HighwireDocs(dlc, dlc['legalspans'])
    base = Dataset(collection, documentation('_'))

    subsets['trec-genomics-2006'] = Dataset(
        collection,
        TrecGenomicsQueries(dlc['trec-genomics-2006/queries']),
        HighwireQrels(dlc['trec-genomics-2006/qrels'], QREL_DEFS_06),
        documentation('trec-genomics-2006'),
    )
    subsets['trec-genomics-2007'] = Dataset(
        collection,
        TrecGenomicsQueries(dlc['trec-genomics-2007/queries']),
        HighwireQrels(dlc['trec-genomics-2007/qrels'], QREL_DEFS_07),
        documentation('trec-genomics-2007'),
    )

    ir_datasets.registry.register(NAME, base)
    for s in sorted(subsets):
        ir_datasets.registry.register(f'{NAME}/{s}', subsets[s])

    return base, subsets
def _init():
    subsets = {}
    base_path = ir_datasets.util.home_path()/NAME
    dlc = DownloadConfig.context(NAME, base_path)
    documentation = YamlDocumentation(f'docs/{NAME}.yaml')
    base = Dataset(documentation('_'))  # dummy top level ds

    for lang in ['zh', 'fa', 'ru']:
        lang_docs = HC4Docs(dlc[f'{lang}/docs'], subset_lang=lang)
        subsets[lang] = Dataset(
            lang_docs,
            documentation(lang)
        )
        for sep in ['train', 'dev', 'test']:
            subsets[f'{lang}/{sep}'] = Dataset(
                lang_docs,
                HC4Queries(dlc[f'{sep}/topics'], subset_lang=lang),
                TrecQrels(dlc[f'{lang}/{sep}/qrels'], QREL_DEFS),
                documentation(f'{lang}/{sep}'),
            )

    ir_datasets.registry.register(NAME, base)
    for s in sorted(subsets):
        ir_datasets.registry.register(f'{NAME}/{s}', subsets[s])

    return base, subsets
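# Illustrative usage sketch (not part of the original module): the nested loops
# above register ids of the form f'{NAME}/{lang}/{sep}'. Assuming NAME is
# 'hc4', a single split can then be loaded through the public ir_datasets API:
def _example_hc4_usage():
    import ir_datasets
    dataset = ir_datasets.load('hc4/zh/dev')  # zh docs + dev topics/qrels
    for query in dataset.queries_iter():
        print(query.query_id)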
def _init():
    subsets = {}
    documentation = YamlDocumentation(f'docs/{NAME}.yaml')
    base_path = ir_datasets.util.home_path()/NAME
    dlc = DownloadConfig.context(NAME, base_path)

    collection = TrecDocs(dlc['docs'], encoding='GB18030', path_globs=['**/xinhua/x*', '**/peoples-daily/pd*'], namespace=NAME, lang='zh', count_hint=ir_datasets.util.count_hint(NAME))

    base = Dataset(collection, documentation('_'))

    subsets['trec5'] = Dataset(
        TrecQueries(GzipExtract(dlc['trec5/queries']), qtype=TrecMandarinQuery, qtype_map=QTYPE_MAP, encoding='GBK', namespace=NAME, lang=None),  # queries have multiple languages
        TrecQrels(GzipExtract(dlc['trec5/qrels']), QREL_DEFS),
        collection,
        documentation('trec5'))
    subsets['trec6'] = Dataset(
        TrecQueries(GzipExtract(dlc['trec6/queries']), qtype=TrecMandarinQuery, qtype_map=QTYPE_MAP, encoding='GBK', namespace=NAME, lang=None),  # queries have multiple languages
        TrecQrels(GzipExtract(dlc['trec6/qrels']), QREL_DEFS),
        collection,
        documentation('trec6'))

    ir_datasets.registry.register(NAME, base)
    for s in sorted(subsets):
        ir_datasets.registry.register(f'{NAME}/{s}', subsets[s])

    return base, subsets
def _init():
    documentation = YamlDocumentation('docs/antique.yaml')
    base_path = ir_datasets.util.home_path() / NAME
    dlc = DownloadConfig.context(NAME, base_path, dua=DUA)
    collection = TsvDocs(dlc['docs'], namespace=NAME, lang='en', count_hint=ir_datasets.util.count_hint(NAME))

    subsets = {}
    for subset in ('train', 'test'):
        qrels = TrecQrels(dlc[f'{subset}/qrels'], QREL_DEFS)
        queries = TsvQueries(dlc[f'{subset}/queries'], namespace=NAME, lang='en')
        subsets[subset] = Dataset(collection, queries, qrels)

    # Split the training data into training and validation data
    validation_qids = Lazy(lambda: VALIDATION_QIDS)
    subsets['train/split200-train'] = Dataset(
        FilteredQueries(subsets['train'].queries_handler(), validation_qids, mode='exclude'),
        FilteredQrels(subsets['train'].qrels_handler(), validation_qids, mode='exclude'),
        subsets['train'])
    subsets['train/split200-valid'] = Dataset(
        FilteredQueries(subsets['train'].queries_handler(), validation_qids, mode='include'),
        FilteredQrels(subsets['train'].qrels_handler(), validation_qids, mode='include'),
        subsets['train'])

    # Separate test set removing the "offensive (and noisy)" questions
    disallow_list = dlc['disallow_list']
    def disallow_qids():
        with disallow_list.stream() as stream:
            stream = io.TextIOWrapper(stream)
            return {l.rstrip() for l in stream}
    disallow_qids = Lazy(disallow_qids)
    subsets['test/non-offensive'] = Dataset(
        FilteredQueries(subsets['test'].queries_handler(), disallow_qids, mode='exclude'),
        FilteredQrels(subsets['test'].qrels_handler(), disallow_qids, mode='exclude'),
        subsets['test'])

    ir_datasets.registry.register(NAME, Dataset(collection, documentation('_')))
    for s in sorted(subsets):
        ir_datasets.registry.register(f'{NAME}/{s}', Dataset(subsets[s], documentation(s)))

    return collection, subsets
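# Sketch of what the Lazy/Filtered* wiring above buys (assumes NAME is
# 'antique'): the qid sets are only materialized on first use, and the
# mode='exclude' vs. mode='include' pair makes the two split200 subsets
# disjoint by construction.
def _example_antique_split200():
    import ir_datasets
    train = ir_datasets.load('antique/train/split200-train')
    valid = ir_datasets.load('antique/train/split200-valid')
    train_qids = {q.query_id for q in train.queries_iter()}
    valid_qids = {q.query_id for q in valid.queries_iter()}
    assert not train_qids & valid_qids  # complementary filters over the same queries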
def _init():
    subsets = {}
    base_path = ir_datasets.util.home_path() / NAME
    dlc = DownloadConfig.context(NAME, base_path)
    documentation = YamlDocumentation(f'docs/{NAME}.yaml')

    collection_v2 = WapoDocs(dlc['v2'])

    base = Dataset(documentation('_'))

    subsets['v2'] = Dataset(collection_v2, documentation('v2'))
    subsets['v2/trec-core-2018'] = Dataset(
        collection_v2,
        TrecQueries(dlc['trec-core-2018/queries'], namespace='trec-core-2018', lang='en', remove_tags=RM_TAGS),
        TrecQrels(dlc['trec-core-2018/qrels'], CORE_QREL_DEFS),
        documentation('v2/trec-core-2018'))
    subsets['v2/trec-news-2018'] = Dataset(
        collection_v2,
        TrecQueries(dlc['trec-news-2018/queries'], namespace='trec-news-2018', lang='en', qtype=TrecBackgroundLinkingQuery, qtype_map=BL_MAP, remove_tags=RM_TAGS),
        TrecQrels(dlc['trec-news-2018/qrels'], BL_QREL_DEFS),
        documentation('v2/trec-news-2018'))
    subsets['v2/trec-news-2019'] = Dataset(
        collection_v2,
        TrecQueries(dlc['trec-news-2019/queries'], namespace='trec-news-2019', lang='en', qtype=TrecBackgroundLinkingQuery, qtype_map=BL_MAP, remove_tags=RM_TAGS),
        TrecQrels(dlc['trec-news-2019/qrels'], BL_QREL_DEFS),
        documentation('v2/trec-news-2019'))
    subsets['v3/trec-news-2020'] = Dataset(
        TrecQueries(dlc['trec-news-2020/queries'], namespace='trec-news-2020', lang='en', qtype=TrecBackgroundLinkingQuery, qtype_map=BL_MAP, remove_tags=RM_TAGS),
        TrecQrels(dlc['trec-news-2020/qrels'], BL_QREL_DEFS),
        documentation('v3/trec-news-2020'))

    ir_datasets.registry.register(NAME, base)
    for s in sorted(subsets):
        ir_datasets.registry.register(f'{NAME}/{s}', subsets[s])

    return base, subsets
def _init():
    documentation = YamlDocumentation(f'docs/{NAME}.yaml')
    base_path = ir_datasets.util.home_path()/NAME
    dlc = DownloadConfig.context(NAME, base_path)
    subsets = {}

    docs_dlc = dlc['docs']
    docs_chk_dlc = TarExtractAll(dlc['docs.chk'], base_path/'corpus.chk')
    b13_dlc = Bz2Extract(Cache(
        TarExtract(dlc['cw12b-info'], 'ClueWeb12-CreateB13/software/CreateClueWeb12B13Dataset.jar'),
        base_path/'CreateClueWeb12B13Dataset.jar'))

    collection = ClueWeb12Docs(docs_dlc, docs_chk_dlc)
    collection_b13 = ClueWeb12Docs(ClueWeb12b13Extractor(docs_dlc, b13_dlc))

    base = Dataset(collection, documentation('_'))

    subsets['b13'] = Dataset(collection_b13, documentation('b13'))

    subsets['trec-web-2013'] = Dataset(
        collection,
        TrecXmlQueries(dlc['trec-web-2013/queries'], qtype=TrecWebTrackQuery, namespace=NAME),
        TrecQrels(dlc['trec-web-2013/qrels.adhoc'], QREL_DEFS),
        documentation('trec-web-2013'))
    subsets['trec-web-2014'] = Dataset(
        collection,
        TrecXmlQueries(dlc['trec-web-2014/queries'], qtype=TrecWebTrackQuery, namespace=NAME),
        TrecQrels(dlc['trec-web-2014/qrels.adhoc'], QREL_DEFS),
        documentation('trec-web-2014'))
    subsets['b13/ntcir-www-1'] = Dataset(
        collection_b13,
        TrecXmlQueries(
            Cache(ZipExtract(dlc['ntcir-www-1/queries'], 'eng.queries.xml'), base_path/'ntcir-www-1'/'queries.xml'),
            qtype=GenericQuery, qtype_map={'qid': 'query_id', 'content': 'text'}, namespace=NAME),
        NtcirQrels(dlc['ntcir-www-1/qrels'], NTCIR_QREL_DEFS),
        documentation('ntcir-www-1'))
    subsets['b13/ntcir-www-2'] = Dataset(
        collection_b13,
        TrecXmlQueries(
            Cache(ZipExtract(dlc['ntcir-www-2/queries'], 'qEng.xml'), base_path/'ntcir-www-2'/'queries.xml'),
            qtype=NtcirQuery, qtype_map=ntcir_map, namespace=NAME),
        NtcirQrels(dlc['ntcir-www-2/qrels'], NTCIR_QREL_DEFS),
        documentation('ntcir-www-2'))
    subsets['b13/ntcir-www-3'] = Dataset(
        collection_b13,
        TrecXmlQueries(dlc['ntcir-www-3/queries'], qtype=NtcirQuery, qtype_map=ntcir_map, namespace=NAME),
        documentation('ntcir-www-3'))
    subsets['b13/trec-misinfo-2019'] = Dataset(
        collection_b13,
        TrecXmlQueries(dlc['trec-misinfo-2019/queries'], qtype=MisinfoQuery, qtype_map=misinfo_map, namespace=NAME),
        MsinfoQrels(dlc['trec-misinfo-2019/qrels'], MISINFO_QREL_DEFS),
        documentation('trec-misinfo-2019'))

    ir_datasets.registry.register(NAME, base)
    for s in sorted(subsets):
        ir_datasets.registry.register(f'{NAME}/{s}', subsets[s])

    return base, subsets
def _init():
    base_path = ir_datasets.util.home_path()/NAME
    documentation = YamlDocumentation(f'docs/{NAME}.yaml')
    dlc = DownloadConfig.context(NAME, base_path, dua=DUA)
    subsets = {}

    migrator = Migrator(base_path/'irds_version.txt', 'v2',
        affected_files=[base_path/'msmarco_v2_passage.tar.pklz4'],
        message='Cleaning up pklz4 lookup structure in favor of ID-based lookups')
    collection = MsMarcoV2Passages(dlc['passages'])
    collection = migrator(collection)

    qrels_migrator = Migrator(base_path/'qrels_version.txt', 'v2',
        affected_files=[base_path/'train'/'qrels.tsv', base_path/'dev1'/'qrels.tsv', base_path/'dev2'/'qrels.tsv'],
        message='Updating qrels (task organizers removed duplicates)')

    subsets['train'] = Dataset(
        collection,
        TsvQueries(dlc['train/queries'], namespace='msmarco', lang='en'),
        qrels_migrator(TrecQrels(dlc['train/qrels'], QRELS_DEFS)),
        TrecScoredDocs(GzipExtract(dlc['train/scoreddocs'])),
    )
    subsets['dev1'] = Dataset(
        collection,
        TsvQueries(dlc['dev1/queries'], namespace='msmarco', lang='en'),
        qrels_migrator(TrecQrels(dlc['dev1/qrels'], QRELS_DEFS)),
        TrecScoredDocs(GzipExtract(dlc['dev1/scoreddocs'])),
    )
    subsets['dev2'] = Dataset(
        collection,
        TsvQueries(dlc['dev2/queries'], namespace='msmarco', lang='en'),
        qrels_migrator(TrecQrels(dlc['dev2/qrels'], QRELS_DEFS)),
        TrecScoredDocs(GzipExtract(dlc['dev2/scoreddocs'])),
    )
    subsets['trec-dl-2021'] = Dataset(
        collection,
        TsvQueries(dlc['trec-dl-2021/queries'], namespace='msmarco', lang='en'),
        TrecQrels(dlc['trec-dl-2021/qrels'], TREC_DL_QRELS_DEFS),
        TrecScoredDocs(GzipExtract(dlc['trec-dl-2021/scoreddocs'])),
    )
    dl21_judged = Lazy(lambda: {q.query_id for q in subsets['trec-dl-2021'].qrels_iter()})
    subsets['trec-dl-2021/judged'] = Dataset(
        FilteredQueries(subsets['trec-dl-2021'].queries_handler(), dl21_judged),
        FilteredScoredDocs(subsets['trec-dl-2021'].scoreddocs_handler(), dl21_judged),
        subsets['trec-dl-2021'],
    )

    ir_datasets.registry.register(NAME, Dataset(collection, documentation("_")))
    for s in sorted(subsets):
        ir_datasets.registry.register(f'{NAME}/{s}', Dataset(subsets[s], documentation(s)))

    return collection, subsets
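# Sketch: the 'judged' subset above filters both queries and scoreddocs down to
# the qids that appear in the qrels (assumes NAME is 'msmarco-passage-v2').
def _example_v2_judged():
    import ir_datasets
    judged = ir_datasets.load('msmarco-passage-v2/trec-dl-2021/judged')
    for scoreddoc in judged.scoreddocs_iter():
        scoreddoc.query_id, scoreddoc.doc_id, scoreddoc.score  # initial-ranking triple
        break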
def _init():
    base_path = home_path() / NAME
    documentation = YamlDocumentation(f"docs/{NAME}.yaml")
    download_config = DownloadConfig.context(NAME, base_path)
    base = Dataset(documentation('_'))

    # Arguments that can be loaded from Zenodo.
    arguments: Dict[str, ArgsMeDocs] = {
        name: ArgsMeDocs(
            Cache(ZipExtract(download_config[name], zip_path), base_path / f"{name}.json"),
            namespace=f"{NAME}/{name}",
            language=language,
            count_hint=count_hint)
        for name, (count_hint, language, zip_path) in SUBSETS.items()
    }
    # Arguments that are combined versions of other subsets.
    combined_arguments: Dict[str, ArgsMeCombinedArguments] = {
        name: ArgsMeCombinedArguments(
            base_path / f"{name}.json",
            [arguments[subset_name] for subset_name in subset_names],
            namespace=f"{NAME}/{name}",
            language=language,
            count_hint=count_hint)
        for name, (subset_names, count_hint, language) in COMBINED_SUBSETS.items()
    }
    # Wrap in datasets with documentation.
    datasets = {
        name: Dataset(arguments, documentation(name))
        for name, arguments in chain(arguments.items(), combined_arguments.items())
    }
    # NOTE: the following datasets are defined in touche.py:
    #  - argsme/1.0/touche-2020-task-1/uncorrected
    #  - argsme/2020-04-01/touche-2020-task-1
    #  - argsme/2020-04-01/touche-2020-task-1/uncorrected
    #  - argsme/2020-04-01/touche-2021-task-1

    # Register datasets.
    registry.register(NAME, base)
    for name, arguments in datasets.items():
        registry.register(f'{NAME}/{name}', arguments)

    return base, datasets
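# Sketch: every name from SUBSETS and COMBINED_SUBSETS (defined elsewhere in
# this module) lands in the registry under f'{NAME}/{name}', so lookups are
# uniform whether a subset is Zenodo-backed or a combined view. The subset
# name below is illustrative and must be one of the registered names.
def _example_argsme_lookup():
    import ir_datasets
    dataset = ir_datasets.load('argsme/2020-04-01')  # hypothetical: any name in SUBSETS works
    for doc in dataset.docs_iter():
        print(doc.doc_id)
        break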
def _init():
    documentation = YamlDocumentation(f'docs/{NAME}.yaml')
    base_path = ir_datasets.util.home_path()/NAME
    dlc = DownloadConfig.context(NAME, base_path)
    subsets = {}
    base = Dataset(documentation('_'))

    collection04 = MedlineDocs('2004', [GzipExtract(dlc['2004/a']), GzipExtract(dlc['2004/b']), GzipExtract(dlc['2004/c']), GzipExtract(dlc['2004/d'])], count_hint=ir_datasets.util.count_hint(f'{NAME}/2004'))
    subsets['2004'] = Dataset(collection04, documentation('2004'))
    subsets['2004/trec-genomics-2004'] = Dataset(
        collection04,
        TrecXmlQueries(ZipExtract(dlc['trec-genomics-2004/queries'], 'Official.xml'), qtype=TrecGenomicsQuery, qtype_map=TREC04_XML_MAP, namespace='trec-genomics', lang='en'),
        TrecQrels(dlc['trec-genomics-2004/qrels'], QREL_DEFS),
        documentation('trec-genomics-2004'),
    )
    subsets['2004/trec-genomics-2005'] = Dataset(
        collection04,
        TrecGenomicsQueries(dlc['trec-genomics-2005/queries']),
        TrecQrels(dlc['trec-genomics-2005/qrels'], QREL_DEFS),
        documentation('trec-genomics-2005'),
    )

    collection17 = ConcatDocs([
        AacrAscoDocs(dlc['2017/aacr_asco_extra']),
        MedlineDocs('2017', [dlc['2017/part1'], dlc['2017/part2'], dlc['2017/part3'], dlc['2017/part4'], dlc['2017/part5']]),
    ], count_hint=ir_datasets.util.count_hint(f'{NAME}/2017'))
    subsets['2017'] = Dataset(collection17, documentation('2017'))
    subsets['2017/trec-pm-2017'] = Dataset(
        collection17,
        TrecXmlQueries(dlc['trec-pm-2017/queries'], qtype=TrecPm2017Query, namespace='trec-pm-2017', lang='en'),
        TrecQrels(dlc['trec-pm-2017/qrels'], QREL_DEFS),
        documentation('trec-pm-2017'),
    )
    subsets['2017/trec-pm-2018'] = Dataset(
        collection17,
        TrecXmlQueries(dlc['trec-pm-2018/queries'], qtype=TrecPmQuery, namespace='trec-pm-2018', lang='en'),
        TrecQrels(dlc['trec-pm-2018/qrels'], QREL_DEFS),
        documentation('trec-pm-2018'),
    )

    ir_datasets.registry.register(NAME, base)
    for s in sorted(subsets):
        ir_datasets.registry.register(f'{NAME}/{s}', subsets[s])

    return base, subsets
def _init():
    documentation = YamlDocumentation(f'docs/{NAME}.yaml')
    base_path = ir_datasets.util.home_path() / NAME
    dlc = DownloadConfig.context(NAME, base_path)
    subsets = {}

    langs = ['python', 'java', 'go', 'php', 'ruby', 'javascript']
    dlcs = {lang: ZipExtractCache(dlc[lang], base_path / lang) for lang in langs}
    all_dlcs = [dlcs[lang] for lang in langs]

    base = Dataset(
        CodeSearchNetDocs(all_dlcs),
        documentation('_'),
    )
    subsets['train'] = Dataset(
        CodeSearchNetDocs(all_dlcs),
        CodeSearchNetQueries(all_dlcs, 'train'),
        CodeSearchNetQrels(all_dlcs, 'train'),
        documentation('train'),
    )
    subsets['valid'] = Dataset(
        CodeSearchNetDocs(all_dlcs),
        CodeSearchNetQueries(all_dlcs, 'valid'),
        CodeSearchNetQrels(all_dlcs, 'valid'),
        documentation('valid'),
    )
    subsets['test'] = Dataset(
        CodeSearchNetDocs(all_dlcs),
        CodeSearchNetQueries(all_dlcs, 'test'),
        CodeSearchNetQrels(all_dlcs, 'test'),
        documentation('test'),
    )
    challenge_queries = CodeSearchNetChallengeQueries(dlc['challenge/queries'])
    subsets['challenge'] = Dataset(
        CodeSearchNetDocs(all_dlcs),
        challenge_queries,
        CodeSearchNetChallengeQrels(dlc['challenge/qrels'], challenge_queries),
        documentation('challenge'),
    )

    ir_datasets.registry.register(NAME, base)
    for s in sorted(subsets):
        ir_datasets.registry.register(f'{NAME}/{s}', subsets[s])

    return base, subsets
def _init():
    base_path = ir_datasets.util.home_path() / NAME
    dlc = DownloadConfig.context(NAME, base_path)
    documentation = YamlDocumentation(f'docs/{NAME}.yaml')
    subsets = {}
    sources = [
        ('en1k', 'wikIR1k'),
        ('en59k', 'wikIR59k'),
        ('en78k', 'enwikIR'),
        ('ens78k', 'enwikIRS'),
        ('fr14k', 'FRwikIR14k'),
        ('es13k', 'ESwikIR13k'),
        ('it16k', 'ITwikIR16k'),
    ]
    for source, zip_dir_name in sources:
        source_dlc = ZipExtractCache(dlc[source], base_path / source)
        docs = CsvDocs(
            RelativePath(source_dlc, f"{zip_dir_name}/documents.csv"),
            namespace=source,
            lang=source[:2],
            count_hint=ir_datasets.util.count_hint(f'{NAME}/{source}'),
            docstore_path=ir_datasets.util.home_path() / NAME / f'{source}.pklz4')
        subsets[source] = Dataset(docs, documentation(source))
        for split in ['training', 'validation', 'test']:
            subsets[f'{source}/{split}'] = Dataset(
                docs,
                CsvQueries(RelativePath(source_dlc, f"{zip_dir_name}/{split}/queries.csv"), lang=source[:2]),
                TrecQrels(RelativePath(source_dlc, f"{zip_dir_name}/{split}/qrels"), qrels_defs=QRELS_DEFS),
                TrecScoredDocs(RelativePath(source_dlc, f"{zip_dir_name}/{split}/BM25.res")),
                documentation(f'{source}/{split}'))

    base = Dataset(documentation('_'))

    ir_datasets.registry.register(NAME, base)
    for s in sorted(subsets):
        ir_datasets.registry.register(f'{NAME}/{s}', subsets[s])

    return base, subsets
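# Sketch: because CsvDocs above is given an explicit docstore_path, documents
# support random access by doc_id in addition to sequential iteration (assumes
# NAME is 'wikir'; the doc_id below is hypothetical).
def _example_wikir_docstore():
    import ir_datasets
    docs_store = ir_datasets.load('wikir/en1k').docs_store()
    doc = docs_store.get('17')  # random-access lookup backed by the pklz4 store
    print(doc.text)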
def clean(dataset, yes=False, list=False, human=True):
    base_path = os.path.join(ir_datasets.util.home_path(), dataset)
    dlc = DownloadConfig.context(dataset, base_path)
    skips = []
    for dl_item in dlc.contents().values():
        if 'instructions' in dl_item and 'cache_path' in dl_item:
            # non-downloadable item
            skips.append(os.path.join(base_path, dl_item['cache_path']))
    size, files = walk_path(base_path, skips)
    files_fmt = f'{len(files)} files'
    if human:
        size_fmt = ir_datasets.util.format_file_size(size)
        if size > 1_000_000_000:  # sizes over 1GB: list in red
            size_fmt = f'{RED}{size_fmt}{RES}'
    else:
        size_fmt = str(size)
    if list:
        if size > 0:
            print(f'{size_fmt}\t{files_fmt}\t{dataset}')
        return
    if not yes:
        inp = None
        while inp not in ('y', 'yes'):
            inp = input(f'clean up {size_fmt} from {dataset} ({files_fmt})?\n[y(es) / n(o) / l(ist files)] ').lower()
            if inp in ('l', 'list', 'list files'):
                for file in files:
                    f_size = os.path.getsize(file)
                    if human:
                        fsize_fmt = ir_datasets.util.format_file_size(f_size)
                        if f_size > 1_000_000_000:  # sizes over 1GB: list in red
                            fsize_fmt = f'{RED}{fsize_fmt}{RES}'
                    else:
                        fsize_fmt = str(f_size)
                    print(f'{fsize_fmt}\t{file}')
            if inp in ('n', 'no'):
                return
    # remove identified files
    for file in files:
        os.remove(file)
    # remove empty directories
    for dirpath, dirnames, filenames in os.walk(base_path, topdown=False):
        if not dirnames and not filenames:
            os.rmdir(dirpath)
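# Sketch of how clean() above might be driven (hypothetical calls; the real CLI
# wires these flags up from its argument parser):
def _example_clean_usage():
    clean('msmarco-passage', list=True)  # only print "<size>\t<n> files\t<dataset>"
    clean('msmarco-passage', yes=True)   # delete without the interactive prompt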
def _init():
    documentation = YamlDocumentation(f'docs/{NAME}.yaml')
    base_path = ir_datasets.util.home_path()/NAME
    dlc = DownloadConfig.context(NAME, base_path)
    subsets = {}

    collection = GovDocs(dlc['docs'])
    base = Dataset(collection, documentation('_'))

    subsets['trec-web-2002'] = Dataset(
        collection,
        TrecQueries(GzipExtract(dlc['trec-web-2002/queries']), namespace='gov/trec-web-2002', lang='en'),
        TrecQrels(GzipExtract(dlc['trec-web-2002/qrels']), QREL_DEFS),
        documentation('trec-web-2002')
    )
    subsets['trec-web-2002/named-page'] = Dataset(
        collection,
        TrecQueries(GzipExtract(dlc['trec-web-2002/named-page/queries']), qtype=GenericQuery, qtype_map=NAMED_PAGE_QTYPE_MAP, namespace='gov/trec-web-2002/named-page', lang='en'),
        TrecQrels(GzipExtract(dlc['trec-web-2002/named-page/qrels']), NAMED_PAGE_QREL_DEFS),
        documentation('trec-web-2002/named-page')
    )
    subsets['trec-web-2003'] = Dataset(
        collection,
        TrecQueries(dlc['trec-web-2003/queries'], qtype=GovWeb02Query, qtype_map=WEB03_QTYPE_MAP, namespace='gov/trec-web-2003', lang='en'),
        TrecQrels(dlc['trec-web-2003/qrels'], QREL_DEFS),
        documentation('trec-web-2003')
    )
    subsets['trec-web-2003/named-page'] = Dataset(
        collection,
        TrecQueries(dlc['trec-web-2003/named-page/queries'], qtype=GenericQuery, qtype_map=NAMED_PAGE_QTYPE_MAP, namespace='gov/trec-web-2003/named-page', lang='en'),
        TrecQrels(dlc['trec-web-2003/named-page/qrels'], NAMED_PAGE_QREL_DEFS),
        documentation('trec-web-2003/named-page')
    )
    subsets['trec-web-2004'] = Dataset(
        collection,
        TrecQueries(dlc['trec-web-2004/queries'], qtype=GenericQuery, qtype_map=WEB04_QTYPE_MAP, namespace='gov/trec-web-2004', lang='en'),
        TrecQrels(dlc['trec-web-2004/qrels'], QREL_DEFS),
        documentation('trec-web-2004')
    )

    ir_datasets.registry.register(NAME, base)
    for s in sorted(subsets):
        ir_datasets.registry.register(f'{NAME}/{s}', subsets[s])

    return base, subsets
def _init():
    subsets = {}
    base_path = ir_datasets.util.home_path()/NAME
    dlc = DownloadConfig.context(NAME, base_path)
    documentation = YamlDocumentation(f'docs/{NAME}.yaml')
    migrator = Migrator(base_path/'irds_version.txt', 'v2',
        affected_files=[base_path/'nyt.tgz.pklz4'],
        message='Migrating nyt (extracting body text)')
    collection = migrator(NytDocs(dlc['source']))

    base = Dataset(collection, documentation('_'))

    # core17
    subsets['trec-core-2017'] = Dataset(
        TrecQueries(dlc['trec-core-2017/queries'], namespace='trec-core-2017', lang='en'),
        TrecQrels(dlc['trec-core-2017/qrels'], CORE_QREL_DEFS),
        collection,
        documentation('trec-core-2017'))

    # wksup
    all_queries = NytQueries(collection)
    all_qrels = NytQrels(collection)
    match_qids = Lazy(lambda: VALID_IDS)
    subsets['wksup'] = Dataset(
        all_queries,
        all_qrels,
        collection,
        documentation('wksup'))
    subsets['wksup/train'] = Dataset(
        FilteredQueries(all_queries, match_qids, mode='exclude'),
        FilteredQrels(all_qrels, match_qids, mode='exclude'),
        collection,
        documentation('wksup/train'))
    subsets['wksup/valid'] = Dataset(
        FilteredQueries(all_queries, match_qids, mode='include'),
        FilteredQrels(all_qrels, match_qids, mode='include'),
        collection,
        documentation('wksup/valid'))

    ir_datasets.registry.register('nyt', base)
    for s in sorted(subsets):
        ir_datasets.registry.register(f'nyt/{s}', subsets[s])

    return base, subsets
def _init():
    base_path = ir_datasets.util.home_path()/NAME
    dlc = DownloadConfig.context(NAME, base_path, dua=DUA)
    documentation = YamlDocumentation(f'docs/{NAME}.yaml')

    manager = MsMarcoQnAManager(GzipExtract(dlc['train']), GzipExtract(dlc['dev']), GzipExtract(dlc['eval']), base_path)
    migrator = Migrator(base_path/'irds_version.txt', 'v2',
        affected_files=[
            base_path/'docs.pklz4',
            base_path/'train.run', base_path/'train.qrels',
            base_path/'dev.run', base_path/'dev.qrels',
            base_path/'eval.run',
        ],
        message='Migrating msmarco-qna (correcting doc_ids)')

    collection = DocstoreBackedDocs(manager.docs_store, docs_cls=MsMarcoQnADoc, namespace=NAME, lang='en')
    collection = migrator(collection)

    subsets = {}
    subsets['train'] = Dataset(
        collection,
        TsvQueries(manager.file_ref('train.queries.tsv'), query_cls=MsMarcoQnAQuery, namespace='msmarco', lang='en'),
        migrator(TrecQrels(manager.file_ref('train.qrels'), QRELS_DEFS)),
        migrator(TrecScoredDocs(manager.file_ref('train.run'))),
    )
    subsets['dev'] = Dataset(
        collection,
        TsvQueries(manager.file_ref('dev.queries.tsv'), query_cls=MsMarcoQnAQuery, namespace='msmarco', lang='en'),
        migrator(TrecQrels(manager.file_ref('dev.qrels'), QRELS_DEFS)),
        migrator(TrecScoredDocs(manager.file_ref('dev.run'))),
    )
    subsets['eval'] = Dataset(
        collection,
        TsvQueries(manager.file_ref('eval.queries.tsv'), query_cls=MsMarcoQnAEvalQuery, namespace='msmarco', lang='en'),
        migrator(TrecScoredDocs(manager.file_ref('eval.run'))),
    )

    ir_datasets.registry.register(NAME, Dataset(collection, documentation('_')))
    for s in sorted(subsets):
        ir_datasets.registry.register(f'{NAME}/{s}', Dataset(subsets[s], documentation(s)))

    return collection, subsets
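# Sketch: the queries above are typed (query_cls=MsMarcoQnAQuery), so iteration
# yields that tuple type rather than a generic one (assumes NAME is
# 'msmarco-qna'; field access below follows the MsMarcoQnAQuery definition
# elsewhere in this module).
def _example_qna_queries():
    import ir_datasets
    dataset = ir_datasets.load('msmarco-qna/dev')
    for query in dataset.queries_iter():
        print(query.query_id, query.text)  # plus any extra MsMarcoQnAQuery fields
        break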
def _init():
    subsets = {}
    base_path = ir_datasets.util.home_path() / NAME
    dlc = DownloadConfig.context(NAME, base_path)
    documentation = YamlDocumentation(f'docs/{NAME}.yaml')

    collection = TrecDocs(
        dlc['docs'],
        encoding='ISO-8859-1',
        path_globs=['**/afp_text/af*', '**/infosel_data/ism_*'],
        namespace=NAME,
        lang='es',
        count_hint=ir_datasets.util.count_hint(NAME))
    base = Dataset(collection, documentation('_'))

    subsets['trec3'] = Dataset(
        TrecSpanishTranslateQueries(
            TrecQueries(GzipExtract(dlc['trec3/queries']), qtype_map=QTYPE_MAP_3, encoding='ISO-8859-1', namespace=NAME, lang=None),
            TrecSpanish3Query),
        TrecQrels(GzipExtract(dlc['trec3/qrels']), QREL_DEFS),
        collection,
        documentation('trec3'))
    subsets['trec4'] = Dataset(
        TrecSpanishTranslateQueries(
            TrecQueries(GzipExtract(dlc['trec4/queries']), qtype=TrecDescOnlyQuery, qtype_map=QTYPE_MAP_4, encoding='ISO-8859-1', namespace=NAME, lang=None),
            TrecSpanish4Query),
        TrecQrels(GzipExtract(dlc['trec4/qrels']), QREL_DEFS),
        collection,
        documentation('trec4'))

    ir_datasets.registry.register(NAME, base)
    for s in sorted(subsets):
        ir_datasets.registry.register(f'{NAME}/{s}', subsets[s])

    return base, subsets
def _init():
    subsets = {}
    base_path = ir_datasets.util.home_path()/NAME
    dlc = DownloadConfig.context(NAME, base_path)
    documentation = YamlDocumentation(f'docs/{NAME}.yaml')

    collection = Cord19Docs(dlc['docs/2020-07-16'], base_path/'2020-07-16', '2020-07-16')
    base = Dataset(collection, documentation('_'))

    subsets['trec-covid'] = Dataset(
        TrecXmlQueries(dlc['trec-covid/queries'], qtype_map={'query': 'title', 'question': 'description', 'narrative': 'narrative'}, namespace=NAME),
        TrecQrels(dlc['trec-covid/qrels'], QRELS_DEFS),
        collection,
        documentation('trec-covid'))

    ir_datasets.registry.register(NAME, base)
    for s in sorted(subsets):
        ir_datasets.registry.register(f'{NAME}/{s}', subsets[s])

    return base, subsets
def _init():
    documentation = YamlDocumentation(f'docs/{NAME}.yaml')
    base_path = ir_datasets.util.home_path() / NAME
    dlc = DownloadConfig.context(NAME, base_path)
    subsets = {}

    main_dlc = dlc['main']
    base = Dataset(
        VaswaniDocs(Cache(TarExtract(main_dlc, 'doc-text'), base_path / 'docs.txt')),
        VaswaniQueries(Cache(TarExtract(main_dlc, 'query-text'), base_path / 'queries.txt')),
        VaswaniQrels(Cache(TarExtract(main_dlc, 'rlv-ass'), base_path / 'qrels.txt')),
        documentation('_'),
    )

    ir_datasets.registry.register(NAME, base)
    for s in sorted(subsets):
        ir_datasets.registry.register(f'{NAME}/{s}', subsets[s])

    return base, subsets
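# Sketch: this module attaches docs, queries, and qrels directly to the base
# dataset (no subsets), so a full walk needs no subset id (assumes NAME is
# 'vaswani'):
def _example_vaswani_walk():
    import ir_datasets
    dataset = ir_datasets.load('vaswani')
    n_docs = sum(1 for _ in dataset.docs_iter())
    n_queries = sum(1 for _ in dataset.queries_iter())
    n_qrels = sum(1 for _ in dataset.qrels_iter())
    print(n_docs, n_queries, n_qrels)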
def _init():
    subsets = {}
    base_path = ir_datasets.util.home_path()/NAME
    dlc = DownloadConfig.context(NAME, base_path)
    documentation = YamlDocumentation(f'docs/{NAME}.yaml')

    docs_v15 = CarDocs(TarExtract(dlc['docs'], 'paragraphcorpus/paragraphcorpus.cbor', compression='xz'))
    base = Dataset(documentation('_'))

    subsets['v1.5'] = Dataset(docs_v15, documentation('v1.5'))
    subsets['v1.5/trec-y1'] = Dataset(
        docs_v15,
        CarQueries(TarExtract(dlc['trec-y1/queries'], 'benchmarkY1test.public/test.benchmarkY1test.cbor.outlines', compression='xz')),
    )
    subsets['v1.5/trec-y1/manual'] = Dataset(
        subsets['v1.5/trec-y1'],
        TrecQrels(TarExtract(dlc['trec-y1/qrels'], 'TREC_CAR_2017_qrels/manual.benchmarkY1test.cbor.hierarchical.qrels'), MANUAL_QRELS))
    subsets['v1.5/trec-y1/auto'] = Dataset(
        subsets['v1.5/trec-y1'],
        TrecQrels(TarExtract(dlc['trec-y1/qrels'], 'TREC_CAR_2017_qrels/automatic.benchmarkY1test.cbor.hierarchical.qrels'), AUTO_QRELS))
    subsets['v1.5/test200'] = Dataset(
        docs_v15,
        CarQueries(TarExtract(dlc['test200'], 'test200/train.test200.cbor.outlines', compression='xz')),
        TrecQrels(TarExtract(dlc['test200'], 'test200/train.test200.cbor.hierarchical.qrels', compression='xz'), AUTO_QRELS))

    train_data = ReTar(dlc['train'], base_path/'train.smaller.tar.xz', [
        'train/train.fold?.cbor.outlines',
        'train/train.fold?.cbor.hierarchical.qrels',
    ], compression='xz')
    subsets['v1.5/train/fold0'] = Dataset(
        docs_v15,
        CarQueries(TarExtract(train_data, 'train/train.fold0.cbor.outlines', compression='xz')),
        TrecQrels(TarExtract(train_data, 'train/train.fold0.cbor.hierarchical.qrels', compression='xz'), AUTO_QRELS))

    ir_datasets.registry.register(NAME, base)
    for s in sorted(subsets):
        ir_datasets.registry.register(f'{NAME}/{s}', Dataset(subsets[s], documentation(s)))

    return base, subsets
def _init():
    documentation = YamlDocumentation(f'docs/{NAME}.yaml')
    base_path = ir_datasets.util.home_path() / NAME
    dlc = DownloadConfig.context(NAME, base_path)
    subsets = {}

    collection = MedlineDocs([GzipExtract(dlc['docs/a']), GzipExtract(dlc['docs/b']), GzipExtract(dlc['docs/c']), GzipExtract(dlc['docs/d'])])
    base = Dataset(collection, documentation('_'))

    subsets['trec-genomics-2004'] = Dataset(
        collection,
        TrecXmlQueries(ZipExtract(dlc['trec-genomics-2004/queries'], 'Official.xml'), qtype=TrecGenomicsQuery, qtype_map=TREC04_XML_MAP, namespace='trec-genomics'),
        TrecQrels(dlc['trec-genomics-2004/qrels'], QREL_DEFS),
        documentation('trec-genomics-2004'),
    )
    subsets['trec-genomics-2005'] = Dataset(
        collection,
        TrecGenomicsQueries(dlc['trec-genomics-2005/queries']),
        TrecQrels(dlc['trec-genomics-2005/qrels'], QREL_DEFS),
        documentation('trec-genomics-2005'),
    )

    ir_datasets.registry.register(NAME, base)
    for s in sorted(subsets):
        ir_datasets.registry.register(f'{NAME}/{s}', subsets[s])

    return base, subsets
def _init():
    documentation = YamlDocumentation(f'docs/{NAME}.yaml')
    base_path = ir_datasets.util.home_path() / NAME
    dlc = DownloadConfig.context(NAME, base_path)
    subsets = {}

    main_dlc = dlc['main']
    base = Dataset(
        CranfieldDocs(Cache(TarExtract(main_dlc, 'cran.all.1400'), base_path / 'docs.txt')),
        CranfieldQueries(Cache(TarExtract(main_dlc, 'cran.qry'), base_path / 'queries.txt')),
        CranfieldQrels(Cache(TarExtract(main_dlc, 'cranqrel'), base_path / 'qrels.txt')),
        documentation('_'),
    )

    ir_datasets.registry.register(NAME, base)
    for s in sorted(subsets):
        ir_datasets.registry.register(f'{NAME}/{s}', subsets[s])

    return base, subsets
def _init():
    documentation = YamlDocumentation(f'docs/{NAME}.yaml')
    base_path = ir_datasets.util.home_path() / NAME
    dlc = DownloadConfig.context(NAME, base_path, dua=DUA)
    migrator = Migrator(base_path / 'irds_version.txt', 'v2',
        affected_files=[base_path / 'collection.tsv', base_path / 'collection.tsv.pklz4'],
        message=f'Migrating {NAME} (fixing passage encoding)')
    collection = TsvDocs(
        Cache(FixEncoding(TarExtract(dlc['collectionandqueries'], 'collection.tsv')), base_path / 'collection.tsv'),
        namespace='msmarco', lang='en', docstore_size_hint=14373971970, count_hint=ir_datasets.util.count_hint(NAME))
    collection = migrator(collection)

    subsets = {}
    subsets['train'] = Dataset(
        collection,
        TsvQueries(Cache(TarExtract(dlc['queries'], 'queries.train.tsv'), base_path / 'train/queries.tsv'), namespace='msmarco', lang='en'),
        TrecQrels(dlc['train/qrels'], QRELS_DEFS),
        TsvDocPairs(GzipExtract(dlc['train/docpairs'])),
        TrecScoredDocs(Cache(ExtractQidPid(TarExtract(dlc['train/scoreddocs'], 'top1000.train.txt')), base_path / 'train/ms.run')),
    )
    subsets['train/triples-v2'] = Dataset(
        collection,
        subsets['train'].queries_handler(),
        subsets['train'].qrels_handler(),
        TsvDocPairs(GzipExtract(dlc['train/docpairs/v2'])),
        subsets['train'].scoreddocs_handler(),
    )
    subsets['train/triples-small'] = Dataset(
        collection,
        subsets['train'].queries_handler(),
        subsets['train'].qrels_handler(),
        TsvDocPairs(Cache(
            MapSmallTriplesQidPid(
                TarExtract(dlc['train/docpairs/small'], 'triples.train.small.tsv'),
                TarExtract(dlc['collectionandqueries'], 'collection.tsv'),
                subsets['train'].queries_handler()),
            base_path / 'train/small.triples.qidpid.tsv')),
        subsets['train'].scoreddocs_handler(),
    )
    subsets['dev'] = Dataset(
        collection,
        TsvQueries(Cache(TarExtract(dlc['queries'], 'queries.dev.tsv'), base_path / 'dev/queries.tsv'), namespace='msmarco', lang='en'),
        TrecQrels(dlc['dev/qrels'], QRELS_DEFS),
    )
    subsets['dev/small'] = Dataset(
        collection,
        TsvQueries(Cache(TarExtract(dlc['collectionandqueries'], 'queries.dev.small.tsv'), base_path / 'dev/small/queries.tsv'), namespace='msmarco', lang='en'),
        TrecQrels(Cache(TarExtract(dlc['collectionandqueries'], 'qrels.dev.small.tsv'), base_path / 'dev/small/qrels'), QRELS_DEFS),
        TrecScoredDocs(Cache(ExtractQidPid(TarExtract(dlc['dev/scoreddocs'], 'top1000.dev')), base_path / 'dev/ms.run')),
    )
    subsets['eval'] = Dataset(
        collection,
        TsvQueries(Cache(TarExtract(dlc['queries'], 'queries.eval.tsv'), base_path / 'eval/queries.tsv'), namespace='msmarco', lang='en'),
    )
    subsets['eval/small'] = Dataset(
        collection,
        TsvQueries(Cache(TarExtract(dlc['collectionandqueries'], 'queries.eval.small.tsv'), base_path / 'eval/small/queries.tsv'), namespace='msmarco', lang='en'),
        TrecScoredDocs(Cache(ExtractQidPid(TarExtract(dlc['eval/scoreddocs'], 'top1000.eval')), base_path / 'eval/ms.run')),
    )
    subsets['trec-dl-2019'] = Dataset(
        collection,
        TrecQrels(dlc['trec-dl-2019/qrels'], TREC_DL_QRELS_DEFS),
        TsvQueries(Cache(GzipExtract(dlc['trec-dl-2019/queries']), base_path / 'trec-dl-2019/queries.tsv'), namespace='msmarco', lang='en'),
        TrecScoredDocs(Cache(ExtractQidPid(GzipExtract(dlc['trec-dl-2019/scoreddocs'])), base_path / 'trec-dl-2019/ms.run')),
    )
    subsets['trec-dl-2020'] = Dataset(
        collection,
        TsvQueries(GzipExtract(dlc['trec-dl-2020/queries']), namespace='msmarco', lang='en'),
        TrecQrels(dlc['trec-dl-2020/qrels'], TREC_DL_QRELS_DEFS),
        TrecScoredDocs(Cache(ExtractQidPid(GzipExtract(dlc['trec-dl-2020/scoreddocs'])), base_path / 'trec-dl-2020/ms.run')),
    )

    # A few subsets that are constrained to just the queries/qrels/docpairs
    # that have at least 1 relevance assessment
    train_judged = Lazy(lambda: {q.query_id for q in subsets['train'].qrels_iter()})
    subsets['train/judged'] = Dataset(
        FilteredQueries(subsets['train'].queries_handler(), train_judged),
        FilteredScoredDocs(subsets['train'].scoreddocs_handler(), train_judged),
        subsets['train'],
    )
    dev_judged = Lazy(lambda: {q.query_id for q in subsets['dev'].qrels_iter()})
    subsets['dev/judged'] = Dataset(
        FilteredQueries(subsets['dev'].queries_handler(), dev_judged),
        subsets['dev'],
    )
    dl19_judged = Lazy(lambda: {q.query_id for q in subsets['trec-dl-2019'].qrels_iter()})
    subsets['trec-dl-2019/judged'] = Dataset(
        FilteredQueries(subsets['trec-dl-2019'].queries_handler(), dl19_judged),
        FilteredScoredDocs(subsets['trec-dl-2019'].scoreddocs_handler(), dl19_judged),
        subsets['trec-dl-2019'],
    )
    dl20_judged = Lazy(lambda: {q.query_id for q in subsets['trec-dl-2020'].qrels_iter()})
    subsets['trec-dl-2020/judged'] = Dataset(
        FilteredQueries(subsets['trec-dl-2020'].queries_handler(), dl20_judged),
        FilteredScoredDocs(subsets['trec-dl-2020'].scoreddocs_handler(), dl20_judged),
        subsets['trec-dl-2020'],
    )

    # split200 -- 200 queries held out from the training data for validation
    split200 = Lazy(lambda: SPLIT200_QIDS)
    subsets['train/split200-train'] = Dataset(
        FilteredQueries(subsets['train'].queries_handler(), split200, mode='exclude'),
        FilteredScoredDocs(subsets['train'].scoreddocs_handler(), split200, mode='exclude'),
        FilteredQrels(subsets['train'].qrels_handler(), split200, mode='exclude'),
        FilteredDocPairs(subsets['train'].docpairs_handler(), split200, mode='exclude'),
        subsets['train'],
    )
    subsets['train/split200-valid'] = Dataset(
        FilteredQueries(subsets['train'].queries_handler(), split200, mode='include'),
        FilteredScoredDocs(subsets['train'].scoreddocs_handler(), split200, mode='include'),
        FilteredQrels(subsets['train'].qrels_handler(), split200, mode='include'),
        FilteredDocPairs(subsets['train'].docpairs_handler(), split200, mode='include'),
        subsets['train'],
    )

    # Medical subset
    def train_med():
        with dlc['medmarco_ids'].stream() as stream:
            stream = codecs.getreader('utf8')(stream)
            return {l.rstrip() for l in stream}
    train_med = Lazy(train_med)
    subsets['train/medical'] = Dataset(
        FilteredQueries(subsets['train'].queries_handler(), train_med),
        FilteredScoredDocs(subsets['train'].scoreddocs_handler(), train_med),
        FilteredDocPairs(subsets['train'].docpairs_handler(), train_med),
        FilteredQrels(subsets['train'].qrels_handler(), train_med),
        subsets['train'],
    )

    # DL-Hard
    dl_hard_qrels_migrator = Migrator(base_path / 'trec-dl-hard' / 'irds_version.txt', 'v3',
        affected_files=[base_path / 'trec-dl-hard' / 'qrels'],
        message='Updating trec-dl-hard qrels')
    hard_qids = Lazy(lambda: DL_HARD_QIDS)
    dl_hard_base_queries = TsvQueries([
        Cache(GzipExtract(dlc['trec-dl-2019/queries']), base_path / 'trec-dl-2019/queries.tsv'),
        Cache(GzipExtract(dlc['trec-dl-2020/queries']), base_path / 'trec-dl-2020/queries.tsv'),
    ], namespace='msmarco', lang='en')
    subsets['trec-dl-hard'] = Dataset(
        collection,
        FilteredQueries(dl_hard_base_queries, hard_qids),
        dl_hard_qrels_migrator(TrecQrels(dlc['trec-dl-hard/qrels'], TREC_DL_QRELS_DEFS)),
        documentation('trec-dl-hard'))
    hard_qids = Lazy(lambda: DL_HARD_QIDS_BYFOLD['1'])
    subsets['trec-dl-hard/fold1'] = Dataset(
        collection,
        FilteredQueries(dl_hard_base_queries, hard_qids),
        FilteredQrels(subsets['trec-dl-hard'], hard_qids),
        documentation('trec-dl-hard/fold1'))
    hard_qids = Lazy(lambda: DL_HARD_QIDS_BYFOLD['2'])
    subsets['trec-dl-hard/fold2'] = Dataset(
        collection,
        FilteredQueries(dl_hard_base_queries, hard_qids),
        FilteredQrels(subsets['trec-dl-hard'], hard_qids),
        documentation('trec-dl-hard/fold2'))
    hard_qids = Lazy(lambda: DL_HARD_QIDS_BYFOLD['3'])
    subsets['trec-dl-hard/fold3'] = Dataset(
        collection,
        FilteredQueries(dl_hard_base_queries, hard_qids),
        FilteredQrels(subsets['trec-dl-hard'], hard_qids),
        documentation('trec-dl-hard/fold3'))
    hard_qids = Lazy(lambda: DL_HARD_QIDS_BYFOLD['4'])
    subsets['trec-dl-hard/fold4'] = Dataset(
        collection,
        FilteredQueries(dl_hard_base_queries, hard_qids),
        FilteredQrels(subsets['trec-dl-hard'], hard_qids),
        documentation('trec-dl-hard/fold4'))
    hard_qids = Lazy(lambda: DL_HARD_QIDS_BYFOLD['5'])
    subsets['trec-dl-hard/fold5'] = Dataset(
        collection,
        FilteredQueries(dl_hard_base_queries, hard_qids),
        FilteredQrels(subsets['trec-dl-hard'], hard_qids),
        documentation('trec-dl-hard/fold5'))

    ir_datasets.registry.register(NAME, Dataset(collection, documentation('_')))
    for s in sorted(subsets):
        ir_datasets.registry.register(f'{NAME}/{s}', Dataset(subsets[s], documentation(s)))

    return collection, subsets
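# Sketch: the DL-Hard folds above share one qrels source, filtered per fold, so
# evaluating fold-by-fold is just a matter of the subset id (assumes NAME is
# 'msmarco-passage'):
def _example_dl_hard_fold():
    import ir_datasets
    fold1 = ir_datasets.load('msmarco-passage/trec-dl-hard/fold1')
    for qrel in fold1.qrels_iter():
        qrel.query_id, qrel.doc_id, qrel.relevance  # standard TREC qrel fields
        break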
def _init():
    documentation = YamlDocumentation(f'docs/{NAME}.yaml')
    base_path = ir_datasets.util.home_path() / NAME
    dlc = DownloadConfig.context(NAME, base_path)
    subsets = {}

    docs_dlc = dlc['docs']
    docs_chk_dlc = TarExtractAll(dlc['docs.chk'], base_path / 'corpus.chk')
    b13_dlc = Bz2Extract(Cache(
        TarExtract(dlc['cw12b-info'], 'ClueWeb12-CreateB13/software/CreateClueWeb12B13Dataset.jar'),
        base_path / 'CreateClueWeb12B13Dataset.jar'))

    collection = ClueWeb12Docs(docs_dlc, docs_chk_dlc)
    collection_b13 = ClueWeb12Docs(ClueWeb12b13Extractor(docs_dlc, b13_dlc))

    base = Dataset(collection, documentation('_'))

    subsets['b13'] = Dataset(collection_b13, documentation('b13'))

    subsets['trec-web-2013'] = Dataset(
        collection,
        TrecXmlQueries(dlc['trec-web-2013/queries'], qtype=TrecWebTrackQuery, namespace='trec-web', lang='en'),
        TrecQrels(dlc['trec-web-2013/qrels.adhoc'], QREL_DEFS),
        documentation('trec-web-2013'))
    subsets['trec-web-2014'] = Dataset(
        collection,
        TrecXmlQueries(dlc['trec-web-2014/queries'], qtype=TrecWebTrackQuery, namespace='trec-web', lang='en'),
        TrecQrels(dlc['trec-web-2014/qrels.adhoc'], QREL_DEFS),
        documentation('trec-web-2014'))
    subsets['b13/ntcir-www-1'] = Dataset(
        collection_b13,
        TrecXmlQueries(
            Cache(ZipExtract(dlc['ntcir-www-1/queries'], 'eng.queries.xml'), base_path / 'ntcir-www-1' / 'queries.xml'),
            qtype=GenericQuery, qtype_map={'qid': 'query_id', 'content': 'text'}, namespace='ntcir-www', lang='en'),
        NtcirQrels(dlc['ntcir-www-1/qrels'], NTCIR_QREL_DEFS),
        documentation('ntcir-www-1'))
    subsets['b13/ntcir-www-2'] = Dataset(
        collection_b13,
        TrecXmlQueries(
            Cache(ZipExtract(dlc['ntcir-www-2/queries'], 'qEng.xml'), base_path / 'ntcir-www-2' / 'queries.xml'),
            qtype=NtcirQuery, qtype_map=ntcir_map, namespace='ntcir-www', lang='en'),
        NtcirQrels(dlc['ntcir-www-2/qrels'], NTCIR_QREL_DEFS),
        documentation('ntcir-www-2'))
    subsets['b13/ntcir-www-3'] = Dataset(
        collection_b13,
        TrecXmlQueries(dlc['ntcir-www-3/queries'], qtype=NtcirQuery, qtype_map=ntcir_map, namespace='ntcir-www', lang='en'),
        documentation('ntcir-www-3'))
    subsets['b13/trec-misinfo-2019'] = Dataset(
        collection_b13,
        TrecXmlQueries(dlc['trec-misinfo-2019/queries'], qtype=MisinfoQuery, qtype_map=misinfo_map, namespace='trec-misinfo-2019', lang='en'),
        MsinfoQrels(dlc['trec-misinfo-2019/qrels'], MISINFO_QREL_DEFS),
        documentation('trec-misinfo-2019'))
    subsets['b13/clef-ehealth'] = Dataset(
        collection_b13,
        TrecXmlQueries(FixAmp(dlc['clef-ehealth/queries']), qtype=GenericQuery, qtype_map=ehealth_map, namespace='clef-ehealth', lang='en'),
        EhealthQrels(
            [dlc['clef-ehealth/2016.qrels'], dlc['clef-ehealth/2017.qrels']],
            [dlc['clef-ehealth/2016.qtrust'], dlc['clef-ehealth/2017.qtrust']],
            [dlc['clef-ehealth/2016.qunder'], dlc['clef-ehealth/2017.qreads']],
            EHEALTH_QREL_DEFS),
        documentation('clef-ehealth'))
    subsets['b13/clef-ehealth/cs'] = Dataset(
        collection_b13,
        TrecXmlQueries(FixAmp(dlc['clef-ehealth/queries/cs']), qtype=GenericQuery, qtype_map=ehealth_map, namespace='clef-ehealth', lang='cs'),
        EhealthQrels(
            [dlc['clef-ehealth/2016.qrels'], dlc['clef-ehealth/2017.qrels']],
            [dlc['clef-ehealth/2016.qtrust'], dlc['clef-ehealth/2017.qtrust']],
            [dlc['clef-ehealth/2016.qunder'], dlc['clef-ehealth/2017.qreads']],
            EHEALTH_QREL_DEFS, query_id_suffix='-cs'),
        documentation('clef-ehealth/cs'))
    subsets['b13/clef-ehealth/de'] = Dataset(
        collection_b13,
        TrecXmlQueries(FixAmp(dlc['clef-ehealth/queries/de']), qtype=GenericQuery, qtype_map=ehealth_map, namespace='clef-ehealth', lang='de'),
        EhealthQrels(
            [dlc['clef-ehealth/2016.qrels'], dlc['clef-ehealth/2017.qrels']],
            [dlc['clef-ehealth/2016.qtrust'], dlc['clef-ehealth/2017.qtrust']],
            [dlc['clef-ehealth/2016.qunder'], dlc['clef-ehealth/2017.qreads']],
            EHEALTH_QREL_DEFS, query_id_suffix='-de'),
        documentation('clef-ehealth/de'))
    subsets['b13/clef-ehealth/fr'] = Dataset(
        collection_b13,
        TrecXmlQueries(FixAmp(dlc['clef-ehealth/queries/fr']), qtype=GenericQuery, qtype_map=ehealth_map, namespace='clef-ehealth', lang='fr'),
        EhealthQrels(
            [dlc['clef-ehealth/2016.qrels'], dlc['clef-ehealth/2017.qrels']],
            [dlc['clef-ehealth/2016.qtrust'], dlc['clef-ehealth/2017.qtrust']],
            [dlc['clef-ehealth/2016.qunder'], dlc['clef-ehealth/2017.qreads']],
            EHEALTH_QREL_DEFS, query_id_suffix='-fr'),
        documentation('clef-ehealth/fr'))
    subsets['b13/clef-ehealth/hu'] = Dataset(
        collection_b13,
        TrecXmlQueries(FixAmp(dlc['clef-ehealth/queries/hu']), qtype=GenericQuery, qtype_map=ehealth_map, namespace='clef-ehealth', lang='hu'),
        EhealthQrels(
            [dlc['clef-ehealth/2016.qrels'], dlc['clef-ehealth/2017.qrels']],
            [dlc['clef-ehealth/2016.qtrust'], dlc['clef-ehealth/2017.qtrust']],
            [dlc['clef-ehealth/2016.qunder'], dlc['clef-ehealth/2017.qreads']],
            EHEALTH_QREL_DEFS, query_id_suffix='-hu'),
        documentation('clef-ehealth/hu'))
    subsets['b13/clef-ehealth/pl'] = Dataset(
        collection_b13,
        TrecXmlQueries(FixAmp(dlc['clef-ehealth/queries/pl']), qtype=GenericQuery, qtype_map=ehealth_map, namespace='clef-ehealth', lang='pl'),
        EhealthQrels(
            [dlc['clef-ehealth/2016.qrels'], dlc['clef-ehealth/2017.qrels']],
            [dlc['clef-ehealth/2016.qtrust'], dlc['clef-ehealth/2017.qtrust']],
            [dlc['clef-ehealth/2016.qunder'], dlc['clef-ehealth/2017.qreads']],
            EHEALTH_QREL_DEFS, query_id_suffix='-pl'),
        documentation('clef-ehealth/pl'))
    subsets['b13/clef-ehealth/sv'] = Dataset(
        collection_b13,
        TrecXmlQueries(FixAmp(dlc['clef-ehealth/queries/sv']), qtype=GenericQuery, qtype_map=ehealth_map, namespace='clef-ehealth', lang='sv'),
        EhealthQrels(
            [dlc['clef-ehealth/2016.qrels'], dlc['clef-ehealth/2017.qrels']],
            [dlc['clef-ehealth/2016.qtrust'], dlc['clef-ehealth/2017.qtrust']],
            [dlc['clef-ehealth/2016.qunder'], dlc['clef-ehealth/2017.qreads']],
            EHEALTH_QREL_DEFS, query_id_suffix='-sv'),
        documentation('clef-ehealth/sv'))

    # NOTE: the following datasets are defined in touche.py:
    #  - clueweb12/touche-2020-task-2
    #  - clueweb12/touche-2021-task-2

    ir_datasets.registry.register(NAME, base)
    for s in sorted(subsets):
        ir_datasets.registry.register(f'{NAME}/{s}', subsets[s])

    return base, subsets
def _init():
    documentation = YamlDocumentation(f'docs/{NAME}.yaml')
    base_path = ir_datasets.util.home_path() / NAME
    dlc = DownloadConfig.context(NAME, base_path)
    subsets = {}

    docs_dlc = dlc['docs']
    chk_dlc = TarExtractAll(dlc['docs.chk'], base_path / 'corpus.chk')
    collection = ClueWeb09Docs(docs_dlc, chk_dlc, lang=None)  # multiple langs
    collection_ar = ClueWeb09Docs(docs_dlc, chk_dlc, dirs=['ClueWeb09_Arabic_1'], lang='ar')
    collection_zh = ClueWeb09Docs(docs_dlc, chk_dlc, dirs=['ClueWeb09_Chinese_1', 'ClueWeb09_Chinese_2', 'ClueWeb09_Chinese_3', 'ClueWeb09_Chinese_4'], lang='zh')
    collection_en = ClueWeb09Docs(docs_dlc, chk_dlc, dirs=['ClueWeb09_English_1', 'ClueWeb09_English_2', 'ClueWeb09_English_3', 'ClueWeb09_English_4', 'ClueWeb09_English_5', 'ClueWeb09_English_6', 'ClueWeb09_English_7', 'ClueWeb09_English_8', 'ClueWeb09_English_9', 'ClueWeb09_English_10'], lang='en')
    collection_fr = ClueWeb09Docs(docs_dlc, chk_dlc, dirs=['ClueWeb09_French_1'], lang='fr')
    collection_de = ClueWeb09Docs(docs_dlc, chk_dlc, dirs=['ClueWeb09_German_1'], lang='de')
    collection_it = ClueWeb09Docs(docs_dlc, chk_dlc, dirs=['ClueWeb09_Italian_1'], lang='it')
    collection_ja = ClueWeb09Docs(docs_dlc, chk_dlc, dirs=['ClueWeb09_Japanese_1', 'ClueWeb09_Japanese_2'], lang='ja')
    collection_ko = ClueWeb09Docs(docs_dlc, chk_dlc, dirs=['ClueWeb09_Korean_1'], lang='ko')
    collection_pt = ClueWeb09Docs(docs_dlc, chk_dlc, dirs=['ClueWeb09_Portuguese_1'], lang='pt')
    collection_es = ClueWeb09Docs(docs_dlc, chk_dlc, dirs=['ClueWeb09_Spanish_1', 'ClueWeb09_Spanish_2'], lang='es')
    collection_catb = ClueWeb09Docs(docs_dlc, chk_dlc, dirs=['ClueWeb09_English_1'], lang='en')
    base = Dataset(collection, documentation('_'))

    subsets['ar'] = Dataset(collection_ar, documentation('ar'))
    subsets['zh'] = Dataset(collection_zh, documentation('zh'))
    subsets['en'] = Dataset(collection_en, documentation('en'))
    subsets['fr'] = Dataset(collection_fr, documentation('fr'))
    subsets['de'] = Dataset(collection_de, documentation('de'))
    subsets['it'] = Dataset(collection_it, documentation('it'))
    subsets['ja'] = Dataset(collection_ja, documentation('ja'))
    subsets['ko'] = Dataset(collection_ko, documentation('ko'))
    subsets['pt'] = Dataset(collection_pt, documentation('pt'))
    subsets['es'] = Dataset(collection_es, documentation('es'))
    subsets['catb'] = Dataset(collection_catb, documentation('catb'))

    subsets['en/trec-web-2009'] = Dataset(
        collection_en,
        TrecXmlQueries(dlc['trec-web-2009/queries'], qtype=TrecWebTrackQuery, namespace=NAME, lang='en'),
        TrecPrels(GzipExtract(dlc['trec-web-2009/qrels.adhoc']), QREL_DEFS_09),
        documentation('trec-web-2009'))
    subsets['en/trec-web-2010'] = Dataset(
        collection_en,
        TrecXmlQueries(dlc['trec-web-2010/queries'], qtype=TrecWebTrackQuery, namespace=NAME, lang='en'),
        TrecQrels(dlc['trec-web-2010/qrels.adhoc'], QREL_DEFS),
        documentation('trec-web-2010'))
    subsets['en/trec-web-2011'] = Dataset(
        collection_en,
        TrecXmlQueries(dlc['trec-web-2011/queries'], qtype=TrecWebTrackQuery, namespace=NAME, lang='en'),
        TrecQrels(dlc['trec-web-2011/qrels.adhoc'], QREL_DEFS),
        documentation('trec-web-2011'))
    subsets['en/trec-web-2012'] = Dataset(
        collection_en,
        TrecXmlQueries(dlc['trec-web-2012/queries'], qtype=TrecWebTrackQuery, namespace=NAME, lang='en'),
        TrecQrels(dlc['trec-web-2012/qrels.adhoc'], QREL_DEFS),
        documentation('trec-web-2012'))
    subsets['catb/trec-web-2009'] = Dataset(
        collection_catb,
        TrecXmlQueries(dlc['trec-web-2009/queries'], qtype=TrecWebTrackQuery, namespace=NAME, lang='en'),
        CatBQrelFilter(TrecPrels(GzipExtract(dlc['trec-web-2009/qrels.adhoc']), QREL_DEFS_09)),
        documentation('trec-web-2009'))
    subsets['catb/trec-web-2010'] = Dataset(
        collection_catb,
        TrecXmlQueries(dlc['trec-web-2010/queries'], qtype=TrecWebTrackQuery, namespace=NAME, lang='en'),
        CatBQrelFilter(TrecQrels(dlc['trec-web-2010/qrels.adhoc'], QREL_DEFS)),
        documentation('trec-web-2010'))
    subsets['catb/trec-web-2011'] = Dataset(
        collection_catb,
        TrecXmlQueries(dlc['trec-web-2011/queries'], qtype=TrecWebTrackQuery, namespace=NAME, lang='en'),
        CatBQrelFilter(TrecQrels(dlc['trec-web-2011/qrels.adhoc'], QREL_DEFS)),
        documentation('trec-web-2011'))
    subsets['catb/trec-web-2012'] = Dataset(
        collection_catb,
        TrecXmlQueries(dlc['trec-web-2012/queries'], qtype=TrecWebTrackQuery, namespace=NAME, lang='en'),
        CatBQrelFilter(TrecQrels(dlc['trec-web-2012/qrels.adhoc'], QREL_DEFS)),
        documentation('trec-web-2012'))
    subsets['trec-mq-2009'] = Dataset(
        collection,
        TrecColonQueries(GzipExtract(dlc['trec-mq-2009/queries']), encoding='latin1', lang='en'),
        TrecPrels(GzipExtract(dlc['trec-mq-2009/qrels']), QREL_DEFS_09),
        documentation('trec-mq-2009'))

    ir_datasets.registry.register(NAME, base)
    for s in sorted(subsets):
        ir_datasets.registry.register(f'{NAME}/{s}', subsets[s])

    return base, subsets
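# Sketch: each track subset above carries its own qrels definitions, which map
# graded relevance values to human-readable labels (assumes NAME is
# 'clueweb09'; the printed dict reflects QREL_DEFS defined elsewhere in this
# module):
def _example_clueweb09_qrels_defs():
    import ir_datasets
    dataset = ir_datasets.load('clueweb09/en/trec-web-2012')
    print(dataset.qrels_defs())  # e.g. relevance level -> description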
def _init():
    base_path = ir_datasets.util.home_path() / NAME
    dlc = DownloadConfig.context(NAME, base_path)
    documentation = YamlDocumentation(f'docs/{NAME}.yaml')
    main_dlc = dlc['main']

    collection = TsvDocs(
        Cache(TarExtract(main_dlc, 'nfcorpus/raw/doc_dump.txt'), base_path / 'collection.tsv'),
        doc_cls=NfCorpusDoc,
        namespace=NAME)

    subsets = {}

    def read_lines(file):
        file = Cache(TarExtract(main_dlc, f'nfcorpus/raw/{file}'), base_path / file)
        with file.stream() as stream:
            stream = codecs.getreader('utf8')(stream)
            return {l.rstrip() for l in stream}
    nontopic_qid_filter = Lazy(lambda: read_lines('nontopics.ids'))
    video_qid_filter = Lazy(lambda: read_lines('all_videos.ids'))

    subsets['train'] = Dataset(
        collection,
        ZipQueries([
            TsvQueries(Cache(TarExtract(main_dlc, 'nfcorpus/train.titles.queries'), base_path / 'train/queries.titles.tsv'), namespace=NAME),
            TsvQueries(Cache(TarExtract(main_dlc, 'nfcorpus/train.all.queries'), base_path / 'train/queries.all.tsv'), namespace=NAME),
        ], [(0, 0), (0, 1), (1, 1)], NfCorpusQuery),
        TrecQrels(Cache(TarExtract(main_dlc, 'nfcorpus/train.3-2-1.qrel'), base_path / 'train/qrels'), QRELS_DEFS),
        documentation('train'),
    )
    subsets['train/nontopic'] = Dataset(
        collection,
        TsvQueries(Cache(TarExtract(main_dlc, 'nfcorpus/train.nontopic-titles.queries'), base_path / 'train/nontopic/queries.tsv'), namespace=NAME),
        FilteredQrels(subsets['train'].qrels_handler(), nontopic_qid_filter, mode='include'),
        documentation('train/nontopic'),
    )
    subsets['train/video'] = Dataset(
        collection,
        ZipQueries([
            TsvQueries(Cache(TarExtract(main_dlc, 'nfcorpus/train.vid-titles.queries'), base_path / 'train/video/queries.titles.tsv'), namespace=NAME),
            TsvQueries(Cache(TarExtract(main_dlc, 'nfcorpus/train.vid-desc.queries'), base_path / 'train/video/queries.desc.tsv'), namespace=NAME),
        ], [(0, 0), (0, 1), (1, 1)], NfCorpusVideoQuery),
        TsvQueries(Cache(TarExtract(main_dlc, 'nfcorpus/train.nontopic-titles.queries'), base_path / 'train/video/queries.tsv'), NfCorpusVideoQuery, namespace=NAME),
        FilteredQrels(subsets['train'].qrels_handler(), video_qid_filter, mode='include'),
        documentation('train/video'),
    )
    subsets['dev'] = Dataset(
        collection,
        ZipQueries([
            TsvQueries(Cache(TarExtract(main_dlc, 'nfcorpus/dev.titles.queries'), base_path / 'dev/queries.titles.tsv'), namespace=NAME),
            TsvQueries(Cache(TarExtract(main_dlc, 'nfcorpus/dev.all.queries'), base_path / 'dev/queries.all.tsv'), namespace=NAME),
        ], [(0, 0), (0, 1), (1, 1)], NfCorpusQuery),
        TrecQrels(Cache(TarExtract(main_dlc, 'nfcorpus/dev.3-2-1.qrel'), base_path / 'dev/qrels'), QRELS_DEFS),
        documentation('dev'),
    )
    subsets['dev/nontopic'] = Dataset(
        collection,
        TsvQueries(Cache(TarExtract(main_dlc, 'nfcorpus/dev.nontopic-titles.queries'), base_path / 'dev/nontopic/queries.tsv'), namespace=NAME),
        FilteredQrels(subsets['dev'].qrels_handler(), nontopic_qid_filter, mode='include'),
        documentation('dev/nontopic'),
    )
    subsets['dev/video'] = Dataset(
        collection,
        ZipQueries([
            TsvQueries(Cache(TarExtract(main_dlc, 'nfcorpus/dev.vid-titles.queries'), base_path / 'dev/video/queries.titles.tsv'), namespace=NAME),
            TsvQueries(Cache(TarExtract(main_dlc, 'nfcorpus/dev.vid-desc.queries'), base_path / 'dev/video/queries.desc.tsv'), namespace=NAME),
        ], [(0, 0), (0, 1), (1, 1)], NfCorpusVideoQuery),
        TsvQueries(Cache(TarExtract(main_dlc, 'nfcorpus/dev.nontopic-titles.queries'), base_path / 'dev/video/queries.tsv'), NfCorpusVideoQuery, namespace=NAME),
        FilteredQrels(subsets['dev'].qrels_handler(), video_qid_filter, mode='include'),
        documentation('dev/video'),
    )
    subsets['test'] = Dataset(
        collection,
        ZipQueries([
            TsvQueries(Cache(TarExtract(main_dlc, 'nfcorpus/test.titles.queries'), base_path / 'test/queries.titles.tsv'), namespace=NAME),
            TsvQueries(Cache(TarExtract(main_dlc, 'nfcorpus/test.all.queries'), base_path / 'test/queries.all.tsv'), namespace=NAME),
        ], [(0, 0), (0, 1), (1, 1)], NfCorpusQuery),
        TrecQrels(Cache(TarExtract(main_dlc, 'nfcorpus/test.3-2-1.qrel'), base_path / 'test/qrels'), QRELS_DEFS),
        documentation('test'),
    )
    subsets['test/nontopic'] = Dataset(
        collection,
        TsvQueries(Cache(TarExtract(main_dlc, 'nfcorpus/test.nontopic-titles.queries'), base_path / 'test/nontopic/queries.tsv'), namespace=NAME),
        FilteredQrels(subsets['test'].qrels_handler(), nontopic_qid_filter, mode='include'),
        documentation('test/nontopic'),
    )
    subsets['test/video'] = Dataset(
        collection,
        ZipQueries([
            TsvQueries(Cache(TarExtract(main_dlc, 'nfcorpus/test.vid-titles.queries'), base_path / 'test/video/queries.titles.tsv'), namespace=NAME),
            TsvQueries(Cache(TarExtract(main_dlc, 'nfcorpus/test.vid-desc.queries'), base_path / 'test/video/queries.desc.tsv'), namespace=NAME),
        ], [(0, 0), (0, 1), (1, 1)], NfCorpusVideoQuery),
        TsvQueries(Cache(TarExtract(main_dlc, 'nfcorpus/test.nontopic-titles.queries'), base_path / 'test/video/queries.tsv'), NfCorpusVideoQuery, namespace=NAME),
        FilteredQrels(subsets['test'].qrels_handler(), video_qid_filter, mode='include'),
        documentation('test/video'),
    )

    ir_datasets.registry.register(NAME, Dataset(collection, documentation('_')))
    for s in sorted(subsets):
        ir_datasets.registry.register(f'{NAME}/{s}', subsets[s])

    return collection, subsets
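# Sketch: ZipQueries above zips fields from two TSV files into a single typed
# query, so each query carries both the title and the combined text (field
# names follow the NfCorpusQuery definition elsewhere in this module; assumes
# NAME is 'nfcorpus'):
def _example_nfcorpus_queries():
    import ir_datasets
    for query in ir_datasets.load('nfcorpus/dev').queries_iter():
        print(query.query_id, query.title)  # plus the zipped full-text field
        break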