def _init():
    """Build and register the TREC Mandarin datasets (base collection plus trec5/trec6 subsets)."""
    documentation = YamlDocumentation(f'docs/{NAME}.yaml')
    base_path = ir_datasets.util.home_path() / NAME
    dlc = DownloadConfig.context(NAME, base_path)

    # GB18030-encoded Xinhua and People's Daily document collections.
    docs = TrecDocs(
        dlc['docs'],
        encoding='GB18030',
        path_globs=['**/xinhua/x*', '**/peoples-daily/pd*'],
        namespace=NAME,
        lang='zh',
        count_hint=ir_datasets.util.count_hint(NAME))
    base = Dataset(docs, documentation('_'))

    subsets = {}
    for track in ('trec5', 'trec6'):
        subsets[track] = Dataset(
            # queries have multiple languages, so lang is left unset
            TrecQueries(GzipExtract(dlc[f'{track}/queries']), qtype=TrecMandarinQuery,
                        qtype_map=QTYPE_MAP, encoding='GBK', namespace=NAME, lang=None),
            TrecQrels(GzipExtract(dlc[f'{track}/qrels']), QREL_DEFS),
            docs,
            documentation(track))

    ir_datasets.registry.register(NAME, base)
    for subset_id in sorted(subsets):
        ir_datasets.registry.register(f'{NAME}/{subset_id}', subsets[subset_id])

    return base, subsets
def _init():
    """Build and register the Arabic newswire datasets (base collection plus ar2001/ar2002 subsets)."""
    base_path = ir_datasets.util.home_path() / NAME
    dlc = DownloadConfig.context(NAME, base_path)
    documentation = YamlDocumentation(f'docs/{NAME}.yaml')

    collection = TrecDocs(
        dlc['docs'],
        encoding='utf8',
        path_globs=['arabic_newswire_a/transcripts/*/*.sgm.gz'],
        namespace=NAME,
        # lang and count_hint added for consistency with the sibling dataset modules
        lang='ar',
        count_hint=ir_datasets.util.count_hint(NAME))
    base = Dataset(collection, documentation('_'))

    subsets = {}
    for year in ('ar2001', 'ar2002'):
        subsets[year] = Dataset(
            # topics are distributed in ISO-8859-6 (Arabic) encoding
            TrecQueries(dlc[f'{year}/queries'], qtype_map=QTYPE_MAP,
                        encoding='ISO-8859-6', namespace=NAME, lang='ar'),
            TrecQrels(dlc[f'{year}/qrels'], QREL_DEFS),
            collection,
            documentation(year))

    ir_datasets.registry.register(NAME, base)
    for s in sorted(subsets):
        ir_datasets.registry.register(f'{NAME}/{s}', subsets[s])
    return base, subsets
def _init():
    """Build and register the Washington Post (WaPo) datasets."""
    base_path = ir_datasets.util.home_path() / NAME
    dlc = DownloadConfig.context(NAME, base_path)
    documentation = YamlDocumentation(f'docs/{NAME}.yaml')

    v2_docs = WapoDocs(dlc['v2'])

    # The top-level dataset carries only documentation; docs are attached per version.
    base = Dataset(documentation('_'))

    subsets = {}
    subsets['v2'] = Dataset(v2_docs, documentation('v2'))

    subsets['v2/trec-core-2018'] = Dataset(
        v2_docs,
        TrecQueries(dlc['trec-core-2018/queries'], namespace='trec-core-2018', lang='en', remove_tags=RM_TAGS),
        TrecQrels(dlc['trec-core-2018/qrels'], CORE_QREL_DEFS),
        documentation('v2/trec-core-2018'))

    # TREC News background-linking tracks share query type, qtype map, and qrel definitions.
    for year in ('2018', '2019'):
        key = f'v2/trec-news-{year}'
        subsets[key] = Dataset(
            v2_docs,
            TrecQueries(dlc[f'trec-news-{year}/queries'], namespace=f'trec-news-{year}', lang='en',
                        qtype=TrecBackgroundLinkingQuery, qtype_map=BL_MAP, remove_tags=RM_TAGS),
            TrecQrels(dlc[f'trec-news-{year}/qrels'], BL_QREL_DEFS),
            documentation(key))

    # NOTE: no docs component here — only v2 documents are wired up in this module.
    subsets['v3/trec-news-2020'] = Dataset(
        TrecQueries(dlc['trec-news-2020/queries'], namespace='trec-news-2020', lang='en',
                    qtype=TrecBackgroundLinkingQuery, qtype_map=BL_MAP, remove_tags=RM_TAGS),
        TrecQrels(dlc['trec-news-2020/qrels'], BL_QREL_DEFS),
        documentation('v3/trec-news-2020'))

    ir_datasets.registry.register(NAME, base)
    for subset_id in sorted(subsets):
        ir_datasets.registry.register(f'{NAME}/{subset_id}', subsets[subset_id])
    return base, subsets
def _init():
    """Build and register the AQUAINT dataset (base collection plus trec-robust-2005 subset)."""
    base_path = ir_datasets.util.home_path() / NAME
    dlc = DownloadConfig.context(NAME, base_path)
    documentation = YamlDocumentation(f'docs/{NAME}.yaml')

    # APW, NYT, and Xinhua newswire portions of the AQUAINT corpus.
    docs = TrecDocs(
        dlc['docs'],
        encoding='utf8',
        path_globs=['aquaint_comp/apw/*/*.gz', 'aquaint_comp/nyt/*/*.gz', 'aquaint_comp/xie/*/*.gz'],
        namespace=NAME,
        lang='en',
        count_hint=ir_datasets.util.count_hint(NAME))
    base = Dataset(docs, documentation('_'))

    robust05 = Dataset(
        TrecQueries(dlc['trec-robust-2005/queries'], qtype_map=QTYPE_MAP, namespace='trec-robust', lang='en'),
        TrecQrels(dlc['trec-robust-2005/qrels'], QREL_DEFS),
        docs,
        documentation('trec-robust-2005'))
    subsets = {'trec-robust-2005': robust05}

    ir_datasets.registry.register(NAME, base)
    for subset_id in sorted(subsets):
        ir_datasets.registry.register(f'{NAME}/{subset_id}', subsets[subset_id])
    return base, subsets
def _init():
    """Build and register the TREC Robust 2004 dataset and its per-fold subsets."""
    documentation = YamlDocumentation(f'docs/{NAME}.yaml')
    base_path = ir_datasets.util.home_path() / NAME
    dlc = DownloadConfig.context(NAME, base_path, dua=DUA)

    # FBIS, FR94, FT, and LA Times document files; 2295 source files expected.
    collection = TrecDocs(
        dlc['docs'],
        path_globs=['**/FBIS/FB*', '**/FR94/??/FR*', '**/FT/*/FT*', '**/LATIMES/LA*'],
        namespace=NAME,
        lang='en',
        expected_file_count=2295,
        count_hint=ir_datasets.util.count_hint(NAME))
    queries = TrecQueries(GzipExtract(dlc['queries']), namespace=NAME, lang='en')
    qrels = TrecQrels(dlc['qrels'], QREL_DEFS)
    base = Dataset(collection, queries, qrels, documentation('_'))

    # Each fold restricts the shared queries/qrels to its query-id subset.
    subsets = {}
    for fold in FOLDS:
        fold_filter = make_filter(fold)
        subsets[fold] = Dataset(
            FilteredQueries(queries, fold_filter),
            FilteredQrels(qrels, fold_filter),
            collection,
            documentation(fold))

    ir_datasets.registry.register(NAME, base)
    for subset_id in sorted(subsets):
        ir_datasets.registry.register(f'{NAME}/{subset_id}', subsets[subset_id])
    return base, subsets
def _init():
    """Build and register the GOV datasets (base collection plus TREC Web 2002-2004 subsets)."""
    documentation = YamlDocumentation(f'docs/{NAME}.yaml')
    base_path = ir_datasets.util.home_path() / NAME
    dlc = DownloadConfig.context(NAME, base_path)

    docs = GovDocs(dlc['docs'])
    base = Dataset(docs, documentation('_'))

    subsets = {}
    subsets['trec-web-2002'] = Dataset(
        docs,
        TrecQueries(GzipExtract(dlc['trec-web-2002/queries']), namespace='gov/trec-web-2002', lang='en'),
        TrecQrels(GzipExtract(dlc['trec-web-2002/qrels']), QREL_DEFS),
        documentation('trec-web-2002'))
    # Named-page finding: generic queries with their own qtype/qrel definitions.
    subsets['trec-web-2002/named-page'] = Dataset(
        docs,
        TrecQueries(GzipExtract(dlc['trec-web-2002/named-page/queries']), qtype=GenericQuery,
                    qtype_map=NAMED_PAGE_QTYPE_MAP, namespace='gov/trec-web-2002/named-page', lang='en'),
        TrecQrels(GzipExtract(dlc['trec-web-2002/named-page/qrels']), NAMED_PAGE_QREL_DEFS),
        documentation('trec-web-2002/named-page'))
    subsets['trec-web-2003'] = Dataset(
        docs,
        TrecQueries(dlc['trec-web-2003/queries'], qtype=GovWeb02Query,
                    qtype_map=WEB03_QTYPE_MAP, namespace='gov/trec-web-2003', lang='en'),
        TrecQrels(dlc['trec-web-2003/qrels'], QREL_DEFS),
        documentation('trec-web-2003'))
    subsets['trec-web-2003/named-page'] = Dataset(
        docs,
        TrecQueries(dlc['trec-web-2003/named-page/queries'], qtype=GenericQuery,
                    qtype_map=NAMED_PAGE_QTYPE_MAP, namespace='gov/trec-web-2003/named-page', lang='en'),
        TrecQrels(dlc['trec-web-2003/named-page/qrels'], NAMED_PAGE_QREL_DEFS),
        documentation('trec-web-2003/named-page'))
    subsets['trec-web-2004'] = Dataset(
        docs,
        TrecQueries(dlc['trec-web-2004/queries'], qtype=GenericQuery,
                    qtype_map=WEB04_QTYPE_MAP, namespace='gov/trec-web-2004', lang='en'),
        TrecQrels(dlc['trec-web-2004/qrels'], QREL_DEFS),
        documentation('trec-web-2004'))

    ir_datasets.registry.register(NAME, base)
    for subset_id in sorted(subsets):
        ir_datasets.registry.register(f'{NAME}/{subset_id}', subsets[subset_id])
    return base, subsets
def _init():
    """Build and register the TREC Spanish datasets (base collection plus trec3/trec4 subsets)."""
    base_path = ir_datasets.util.home_path() / NAME
    dlc = DownloadConfig.context(NAME, base_path)
    documentation = YamlDocumentation(f'docs/{NAME}.yaml')

    # AFP and Infosel newswire text, Latin-1 encoded.
    docs = TrecDocs(
        dlc['docs'],
        encoding='ISO-8859-1',
        path_globs=['**/afp_text/af*', '**/infosel_data/ism_*'],
        namespace=NAME,
        lang='es',
        count_hint=ir_datasets.util.count_hint(NAME))
    base = Dataset(docs, documentation('_'))

    subsets = {}
    # Topics are multilingual (lang=None); each track's queries are wrapped in
    # TrecSpanishTranslateQueries with the track-specific query type.
    trec3_queries = TrecSpanishTranslateQueries(
        TrecQueries(GzipExtract(dlc['trec3/queries']), qtype_map=QTYPE_MAP_3,
                    encoding='ISO-8859-1', namespace=NAME, lang=None),
        TrecSpanish3Query)
    subsets['trec3'] = Dataset(
        trec3_queries,
        TrecQrels(GzipExtract(dlc['trec3/qrels']), QREL_DEFS),
        docs,
        documentation('trec3'))

    trec4_queries = TrecSpanishTranslateQueries(
        TrecQueries(GzipExtract(dlc['trec4/queries']), qtype=TrecDescOnlyQuery,
                    qtype_map=QTYPE_MAP_4, encoding='ISO-8859-1', namespace=NAME, lang=None),
        TrecSpanish4Query)
    subsets['trec4'] = Dataset(
        trec4_queries,
        TrecQrels(GzipExtract(dlc['trec4/qrels']), QREL_DEFS),
        docs,
        documentation('trec4'))

    ir_datasets.registry.register(NAME, base)
    for subset_id in sorted(subsets):
        ir_datasets.registry.register(f'{NAME}/{subset_id}', subsets[subset_id])
    return base, subsets
# Parses a two-topic TREC-format topic file through TrecQueries and checks that:
# (1) the 'Number:'/'Topic:' prefixes are stripped from <num>/<title>,
# (2) multi-line <desc>/<narr> fields are joined with newlines, and
# (3) queries_path() reports the mock file's 'MOCK' path.
# NOTE(review): this block appears collapsed onto a single line in this view
# (the triple-quoted fixture's original line breaks are not recoverable), so
# the code is kept byte-identical — confirm formatting against the original file.
def test_queries(self): mock_file = StringFile(''' <top> <num> Number: Q100A <title> Topic: Some title <desc> Description: Descriptive text split on multiple lines <narr> Narrative: Further elaboration of the query intent split on multiple lines </top> <top> <num> 102 <title> Query 2 <desc> Q2 description <narr> Narrative: Q2 narrative </top> '''.lstrip()) expected_results = [ TrecQuery( 'Q100A', 'Some title', "Descriptive text\nsplit on multiple lines", 'Further elaboration of the query intent\nsplit on multiple lines' ), TrecQuery('102', 'Query 2', "Q2 description", 'Q2 narrative'), ] queries = TrecQueries(mock_file) self.assertEqual(queries.queries_path(), 'MOCK') self.assertEqual(list(queries.queries_iter()), expected_results)
def _init():
    """Build and register the NYT datasets (base, trec-core-2017, and weak-supervision splits)."""
    subsets = {}
    base_path = ir_datasets.util.home_path() / NAME
    dlc = DownloadConfig.context(NAME, base_path)
    documentation = YamlDocumentation(f'docs/{NAME}.yaml')

    # v2 re-extraction of body text invalidates the previously cached docs pickle.
    migrator = Migrator(base_path / 'irds_version.txt', 'v2',
                        affected_files=[base_path / 'nyt.tgz.pklz4'],
                        message='Migrating nyt (extracting body text)')
    collection = migrator(NytDocs(dlc['source']))
    base = Dataset(collection, documentation('_'))

    # core17
    subsets['trec-core-2017'] = Dataset(
        TrecQueries(dlc['trec-core-2017/queries'], namespace='trec-core-2017', lang='en'),
        TrecQrels(dlc['trec-core-2017/qrels'], CORE_QREL_DEFS),
        collection,
        documentation('trec-core-2017'))

    # wksup: queries/qrels derived from the collection; VALID_IDS carves out the validation split
    all_queries = NytQueries(collection)
    all_qrels = NytQrels(collection)
    match_qids = Lazy(lambda: VALID_IDS)
    subsets['wksup'] = Dataset(
        all_queries,
        all_qrels,
        collection,
        documentation('wksup'))  # fixed: previously pointed at the 'wksup/train' documentation
    subsets['wksup/train'] = Dataset(
        FilteredQueries(all_queries, match_qids, mode='exclude'),
        FilteredQrels(all_qrels, match_qids, mode='exclude'),
        collection,
        documentation('wksup/train'))
    subsets['wksup/valid'] = Dataset(
        FilteredQueries(all_queries, match_qids, mode='include'),
        FilteredQrels(all_qrels, match_qids, mode='include'),
        collection,
        documentation('wksup/valid'))

    # fixed: register via NAME (previously hard-coded 'nyt'), matching the sibling modules
    ir_datasets.registry.register(NAME, base)
    for s in sorted(subsets):
        ir_datasets.registry.register(f'{NAME}/{s}', subsets[s])
    return base, subsets
def _init():
    """Build and register the GOV2 datasets (TREC Terabyte 2004-2006 and Million Query 2007-2008)."""
    documentation = YamlDocumentation(f'docs/{NAME}.yaml')
    base_path = ir_datasets.util.home_path() / NAME
    dlc = DownloadConfig.context(NAME, base_path)
    subsets = {}

    docs_dlc = dlc['docs']
    # Document counts are kept in a side file ('corpus.doccounts') built from the corpus itself.
    doccount_dlc = Gov2DocCountFile(os.path.join(base_path, 'corpus.doccounts'), docs_dlc)
    collection = Gov2Docs(docs_dlc, doccount_dlc)
    base = Dataset(collection, documentation('_'))

    subsets['trec-tb-2004'] = Dataset(
        collection,
        TrecQueries(dlc['trec-tb-2004/queries'], namespace=NAME, lang='en'),
        TrecQrels(dlc['trec-tb-2004/qrels'], QREL_DEFS),
        documentation('trec-tb-2004'))
    subsets['trec-tb-2005'] = Dataset(
        collection,
        TrecQueries(dlc['trec-tb-2005/queries'], namespace=NAME, lang='en'),
        TrecQrels(dlc['trec-tb-2005/qrels'], QREL_DEFS),
        documentation('trec-tb-2005'))
    subsets['trec-tb-2005/named-page'] = Dataset(
        collection,
        TrecQueries(dlc['trec-tb-2005/named-page/queries'], qtype=GenericQuery,
                    qtype_map=NAMED_PAGE_QTYPE_MAP, namespace=NAME, lang='en'),
        TrecQrels(dlc['trec-tb-2005/named-page/qrels'], NAMED_PAGE_QREL_DEFS),
        documentation('trec-tb-2005/named-page'))
    # Efficiency task reuses the 2005 ad-hoc qrels with query IDs rewritten via EFF_MAP_05.
    subsets['trec-tb-2005/efficiency'] = Dataset(
        collection,
        TrecColonQueries(GzipExtract(dlc['trec-tb-2005/efficiency/queries']),
                         encoding='latin1', namespace=NAME, lang='en'),
        RewriteQids(TrecQrels(dlc['trec-tb-2005/qrels'], QREL_DEFS), EFF_MAP_05),
        documentation('trec-tb-2005/efficiency'))
    subsets['trec-tb-2006'] = Dataset(
        collection,
        TrecQueries(dlc['trec-tb-2006/queries'], namespace=NAME, lang='en'),
        TrecQrels(dlc['trec-tb-2006/qrels'], QREL_DEFS),
        documentation('trec-tb-2006'))
    subsets['trec-tb-2006/named-page'] = Dataset(
        collection,
        TrecQueries(dlc['trec-tb-2006/named-page/queries'], qtype=GenericQuery,
                    qtype_map=NAMED_PAGE_QTYPE_MAP, namespace=NAME, lang='en'),
        TrecQrels(dlc['trec-tb-2006/named-page/qrels'], NAMED_PAGE_QREL_DEFS),
        documentation('trec-tb-2006/named-page'))
    subsets['trec-tb-2006/efficiency'] = Dataset(
        collection,
        TrecColonQueries(TarExtract(dlc['trec-tb-2006/efficiency/queries'], '06.efficiency_topics.all'),
                         encoding='latin1', namespace=NAME, lang='en'),
        RewriteQids(TrecQrels(dlc['trec-tb-2006/qrels'], QREL_DEFS), EFF_MAP_06),
        documentation('trec-tb-2006/efficiency'))
    # The 10k sample and streams 1/2/4 carry no qrels here; only stream 3 maps onto the 2006 qrels.
    subsets['trec-tb-2006/efficiency/10k'] = Dataset(
        collection,
        TrecColonQueries(TarExtract(dlc['trec-tb-2006/efficiency/queries'], '06.efficiency_topics.10k'),
                         encoding='latin1', namespace=NAME, lang='en'),
        documentation('trec-tb-2006/efficiency/10k'))
    subsets['trec-tb-2006/efficiency/stream1'] = Dataset(
        collection,
        TrecColonQueries(TarExtract(dlc['trec-tb-2006/efficiency/queries'], '06.efficiency_topics.stream-1'),
                         encoding='latin1', namespace=NAME, lang='en'),
        documentation('trec-tb-2006/efficiency/stream1'))
    subsets['trec-tb-2006/efficiency/stream2'] = Dataset(
        collection,
        TrecColonQueries(TarExtract(dlc['trec-tb-2006/efficiency/queries'], '06.efficiency_topics.stream-2'),
                         encoding='latin1', namespace=NAME, lang='en'),
        documentation('trec-tb-2006/efficiency/stream2'))
    subsets['trec-tb-2006/efficiency/stream3'] = Dataset(
        collection,
        TrecColonQueries(TarExtract(dlc['trec-tb-2006/efficiency/queries'], '06.efficiency_topics.stream-3'),
                         encoding='latin1', namespace=NAME, lang='en'),
        RewriteQids(TrecQrels(dlc['trec-tb-2006/qrels'], QREL_DEFS), EFF_MAP_06),
        documentation('trec-tb-2006/efficiency/stream3'))
    subsets['trec-tb-2006/efficiency/stream4'] = Dataset(
        collection,
        TrecColonQueries(TarExtract(dlc['trec-tb-2006/efficiency/queries'], '06.efficiency_topics.stream-4'),
                         encoding='latin1', namespace=NAME, lang='en'),
        documentation('trec-tb-2006/efficiency/stream4'))
    subsets['trec-mq-2007'] = Dataset(
        collection,
        # fixed: namespace/lang were missing here, inconsistent with trec-mq-2008 below
        TrecColonQueries(GzipExtract(dlc['trec-mq-2007/queries']),
                         encoding='latin1', namespace='trec-mq', lang='en'),
        TrecPrels(dlc['trec-mq-2007/qrels'], QREL_DEFS),
        documentation('trec-mq-2007'))
    subsets['trec-mq-2008'] = Dataset(
        collection,
        TrecColonQueries(GzipExtract(dlc['trec-mq-2008/queries']),
                         encoding='latin1', namespace='trec-mq', lang='en'),
        TrecPrels(TarExtract(dlc['trec-mq-2008/qrels'], '2008.RC1/prels'), QREL_DEFS),
        documentation('trec-mq-2008'))

    ir_datasets.registry.register(NAME, base)
    for s in sorted(subsets):
        ir_datasets.registry.register(f'{NAME}/{s}', subsets[s])
    return base, subsets
def _init():
    """Build and register the TripClick datasets (logs plus train/val/test benchmark splits)."""
    subsets = {}
    base_path = ir_datasets.util.home_path() / NAME
    dlc = DownloadConfig.context(NAME, base_path)
    documentation = YamlDocumentation(f'docs/{NAME}.yaml')

    collection = TrecDocs(dlc['benchmark'], parser='tut', path_globs=['**/docs_grp_*.txt'],
                          namespace=NAME, lang='en', count_hint=ir_datasets.util.count_hint(NAME))
    # Topic/qrel files and the BM25 run files are extracted from the distribution tarballs.
    topics_and_qrels = TarExtractAll(dlc['benchmark'], base_path / "topics_and_qrels",
                                     path_globs=['**/topics.*.txt', '**/qrels.*.txt'])
    val_runs = TarExtractAll(dlc['dlfiles'], base_path / "val_runs",
                             path_globs=['**/run.trip.BM25.*.val.txt'])
    test_runs = TarExtractAll(dlc['dlfiles_runs_test'], base_path / "test_runs",
                              path_globs=['**/run.trip.BM25.*.test.txt'])
    base = Dataset(collection, documentation('_'))

    subsets['logs'] = Dataset(
        TsvDocs(Cache(FixAllarticles(TarExtract(dlc['logs'], 'logs/allarticles.txt')),
                      base_path / 'allarticles-fixed.tsv'),
                doc_cls=TripClickPartialDoc, lang='en',
                count_hint=ir_datasets.util.count_hint(f'{NAME}/logs')),
        TripClickQlogs(TarExtractAll(dlc['logs'], base_path / 'logs', path_globs=['**/*.json'])),
        documentation('logs'))

    ### Train
    subsets['train/head'] = Dataset(
        collection,
        TrecQueries(RelativePath(topics_and_qrels, 'benchmark/topics/topics.head.train.txt'),
                    qtype=GenericQuery, qtype_map=QTYPE_MAP, namespace=NAME, lang='en'),
        TrecQrels(RelativePath(topics_and_qrels, 'benchmark/qrels/qrels.raw.head.train.txt'), QREL_DEFS),
        documentation('train/head'))
    # dctr variants overlay click-model qrels on the corresponding raw-qrels subset.
    subsets['train/head/dctr'] = Dataset(
        TrecQrels(RelativePath(topics_and_qrels, 'benchmark/qrels/qrels.dctr.head.train.txt'), QREL_DCTR_DEFS),
        subsets['train/head'],
        documentation('train/head/dctr'))
    subsets['train/torso'] = Dataset(
        collection,
        TrecQueries(RelativePath(topics_and_qrels, 'benchmark/topics/topics.torso.train.txt'),
                    qtype=GenericQuery, qtype_map=QTYPE_MAP, namespace=NAME, lang='en'),
        TrecQrels(RelativePath(topics_and_qrels, 'benchmark/qrels/qrels.raw.torso.train.txt'), QREL_DEFS),
        documentation('train/torso'))
    subsets['train/tail'] = Dataset(
        collection,
        TrecQueries(RelativePath(topics_and_qrels, 'benchmark/topics/topics.tail.train.txt'),
                    qtype=GenericQuery, qtype_map=QTYPE_MAP, namespace=NAME, lang='en'),
        TrecQrels(RelativePath(topics_and_qrels, 'benchmark/qrels/qrels.raw.tail.train.txt'), QREL_DEFS),
        documentation('train/tail'))
    train_queries = ConcatQueries([
        subsets['train/head'].queries_handler(),
        subsets['train/torso'].queries_handler(),
        subsets['train/tail'].queries_handler(),
    ])
    train_docpairs = DocPairGenerator(TarExtract(dlc['dlfiles'], 'dlfiles/triples.train.tsv'),
                                      collection, train_queries, base_path / 'train.docpairs')
    subsets['train'] = Dataset(
        collection,
        train_queries,
        ConcatQrels([
            subsets['train/head'].qrels_handler(),
            subsets['train/torso'].qrels_handler(),
            subsets['train/tail'].qrels_handler(),
        ]),
        TsvDocPairs(train_docpairs),
        documentation('train'))
    subsets['train/hofstaetter-triples'] = Dataset(
        collection,
        train_queries,
        subsets['train'].qrels_handler(),
        TsvDocPairs(dlc['hofstaetter-triples']),
        documentation('train/hofstaetter-triples'))

    ### Val
    subsets['val/head'] = Dataset(
        collection,
        TrecQueries(RelativePath(topics_and_qrels, 'benchmark/topics/topics.head.val.txt'),
                    qtype=GenericQuery, qtype_map=QTYPE_MAP, namespace=NAME, lang='en'),
        TrecQrels(RelativePath(topics_and_qrels, 'benchmark/qrels/qrels.raw.head.val.txt'), QREL_DEFS),
        TrecScoredDocs(RelativePath(val_runs, 'dlfiles/run.trip.BM25.head.val.txt')),
        documentation('val/head'))
    subsets['val/head/dctr'] = Dataset(
        TrecQrels(RelativePath(topics_and_qrels, 'benchmark/qrels/qrels.dctr.head.val.txt'), QREL_DCTR_DEFS),
        subsets['val/head'],
        documentation('val/head/dctr'))
    subsets['val/torso'] = Dataset(
        collection,
        TrecQueries(RelativePath(topics_and_qrels, 'benchmark/topics/topics.torso.val.txt'),
                    qtype=GenericQuery, qtype_map=QTYPE_MAP, namespace=NAME, lang='en'),
        TrecQrels(RelativePath(topics_and_qrels, 'benchmark/qrels/qrels.raw.torso.val.txt'), QREL_DEFS),
        TrecScoredDocs(RelativePath(val_runs, 'dlfiles/run.trip.BM25.torso.val.txt')),
        documentation('val/torso'))
    subsets['val/tail'] = Dataset(
        collection,
        TrecQueries(RelativePath(topics_and_qrels, 'benchmark/topics/topics.tail.val.txt'),
                    qtype=GenericQuery, qtype_map=QTYPE_MAP, namespace=NAME, lang='en'),
        TrecQrels(RelativePath(topics_and_qrels, 'benchmark/qrels/qrels.raw.tail.val.txt'), QREL_DEFS),
        TrecScoredDocs(RelativePath(val_runs, 'dlfiles/run.trip.BM25.tail.val.txt')),
        documentation('val/tail'))
    subsets['val'] = Dataset(
        collection,
        ConcatQueries([
            subsets['val/head'].queries_handler(),
            subsets['val/torso'].queries_handler(),
            subsets['val/tail'].queries_handler(),
        ]),
        ConcatQrels([
            subsets['val/head'].qrels_handler(),
            subsets['val/torso'].qrels_handler(),
            subsets['val/tail'].qrels_handler(),
        ]),
        ConcatScoreddocs([
            subsets['val/head'].scoreddocs_handler(),
            subsets['val/torso'].scoreddocs_handler(),
            subsets['val/tail'].scoreddocs_handler(),
        ]),
        documentation('val'))

    ### Test (no qrels released; only queries and BM25 scored docs)
    subsets['test/head'] = Dataset(
        collection,
        TrecQueries(RelativePath(topics_and_qrels, 'benchmark/topics/topics.head.test.txt'),
                    qtype=GenericQuery, qtype_map=QTYPE_MAP, namespace=NAME, lang='en'),
        TrecScoredDocs(RelativePath(test_runs, 'runs_test/run.trip.BM25.head.test.txt')),
        documentation('test/head'))  # fixed: previously pointed at the 'val/head' documentation
    subsets['test/torso'] = Dataset(
        collection,
        TrecQueries(RelativePath(topics_and_qrels, 'benchmark/topics/topics.torso.test.txt'),
                    qtype=GenericQuery, qtype_map=QTYPE_MAP, namespace=NAME, lang='en'),
        TrecScoredDocs(RelativePath(test_runs, 'runs_test/run.trip.BM25.torso.test.txt')),
        documentation('test/torso'))
    subsets['test/tail'] = Dataset(
        collection,
        TrecQueries(RelativePath(topics_and_qrels, 'benchmark/topics/topics.tail.test.txt'),
                    qtype=GenericQuery, qtype_map=QTYPE_MAP, namespace=NAME, lang='en'),
        TrecScoredDocs(RelativePath(test_runs, 'runs_test/run.trip.BM25.tail.test.txt')),
        documentation('test/tail'))
    subsets['test'] = Dataset(
        collection,
        ConcatQueries([
            subsets['test/head'].queries_handler(),
            subsets['test/torso'].queries_handler(),
            subsets['test/tail'].queries_handler(),
        ]),
        ConcatScoreddocs([
            subsets['test/head'].scoreddocs_handler(),
            subsets['test/torso'].scoreddocs_handler(),
            subsets['test/tail'].scoreddocs_handler(),
        ]),
        documentation('test'))

    ir_datasets.registry.register(NAME, base)
    for s in sorted(subsets):
        ir_datasets.registry.register(f'{NAME}/{s}', subsets[s])
    return base, subsets