Example #1
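# Registers a Chinese (Mandarin) TREC collection: GB18030-encoded Xinhua and People's Daily
# documents, with 'trec5' and 'trec6' topic/qrel subsets whose GBK-encoded topics are parsed
# as TrecMandarinQuery.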
def _init():
    subsets = {}
    documentation = YamlDocumentation(f'docs/{NAME}.yaml')
    base_path = ir_datasets.util.home_path()/NAME
    dlc = DownloadConfig.context(NAME, base_path)

    collection = TrecDocs(dlc['docs'], encoding='GB18030', path_globs=['**/xinhua/x*', '**/peoples-daily/pd*'], namespace=NAME, lang='zh', count_hint=ir_datasets.util.count_hint(NAME))

    base = Dataset(collection, documentation('_'))

    subsets['trec5'] = Dataset(
        TrecQueries(GzipExtract(dlc['trec5/queries']), qtype=TrecMandarinQuery, qtype_map=QTYPE_MAP, encoding='GBK', namespace=NAME, lang=None), # queries have multiple languages
        TrecQrels(GzipExtract(dlc['trec5/qrels']), QREL_DEFS),
        collection,
        documentation('trec5'))

    subsets['trec6'] = Dataset(
        TrecQueries(GzipExtract(dlc['trec6/queries']), qtype=TrecMandarinQuery, qtype_map=QTYPE_MAP, encoding='GBK', namespace=NAME, lang=None), # queries have multiple languages
        TrecQrels(GzipExtract(dlc['trec6/qrels']), QREL_DEFS),
        collection,
        documentation('trec6'))

    ir_datasets.registry.register(NAME, base)
    for s in sorted(subsets):
        ir_datasets.registry.register(f'{NAME}/{s}', subsets[s])

    return base, subsets
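Example #2
# Registers an Arabic Newswire collection ('arabic_newswire_a' transcripts) with 'ar2001' and
# 'ar2002' topic/qrel subsets whose topics are decoded as ISO-8859-6.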
def _init():
    subsets = {}
    base_path = ir_datasets.util.home_path() / NAME
    dlc = DownloadConfig.context(NAME, base_path)
    documentation = YamlDocumentation(f'docs/{NAME}.yaml')

    collection = TrecDocs(
        dlc['docs'],
        encoding='utf8',
        path_globs=['arabic_newswire_a/transcripts/*/*.sgm.gz'],
        namespace=NAME)

    base = Dataset(collection, documentation('_'))

    subsets['ar2001'] = Dataset(
        TrecQueries(dlc['ar2001/queries'],
                    qtype_map=QTYPE_MAP,
                    encoding='ISO-8859-6',
                    namespace=NAME), TrecQrels(dlc['ar2001/qrels'], QREL_DEFS),
        collection, documentation('ar2001'))

    subsets['ar2002'] = Dataset(
        TrecQueries(dlc['ar2002/queries'],
                    qtype_map=QTYPE_MAP,
                    encoding='ISO-8859-6',
                    namespace=NAME), TrecQrels(dlc['ar2002/qrels'], QREL_DEFS),
        collection, documentation('ar2002'))

    ir_datasets.registry.register(NAME, base)
    for s in sorted(subsets):
        ir_datasets.registry.register(f'{NAME}/{s}', subsets[s])

    return base, subsets
Example #3
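# Registers the Washington Post (WapoDocs) v2 collection with TREC Core 2018 and TREC News
# 2018/2019 background-linking subsets; 'v3/trec-news-2020' is registered with queries and
# qrels only, since no v3 document collection is attached here.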
def _init():
    subsets = {}
    base_path = ir_datasets.util.home_path() / NAME
    dlc = DownloadConfig.context(NAME, base_path)
    documentation = YamlDocumentation(f'docs/{NAME}.yaml')

    collection_v2 = WapoDocs(dlc['v2'])

    base = Dataset(documentation('_'))

    subsets['v2'] = Dataset(collection_v2, documentation('v2'))

    subsets['v2/trec-core-2018'] = Dataset(
        collection_v2,
        TrecQueries(dlc['trec-core-2018/queries'],
                    namespace='trec-core-2018',
                    lang='en',
                    remove_tags=RM_TAGS),
        TrecQrels(dlc['trec-core-2018/qrels'], CORE_QREL_DEFS),
        documentation('v2/trec-core-2018'))

    subsets['v2/trec-news-2018'] = Dataset(
        collection_v2,
        TrecQueries(dlc['trec-news-2018/queries'],
                    namespace='trec-news-2018',
                    lang='en',
                    qtype=TrecBackgroundLinkingQuery,
                    qtype_map=BL_MAP,
                    remove_tags=RM_TAGS),
        TrecQrels(dlc['trec-news-2018/qrels'], BL_QREL_DEFS),
        documentation('v2/trec-news-2018'))

    subsets['v2/trec-news-2019'] = Dataset(
        collection_v2,
        TrecQueries(dlc['trec-news-2019/queries'],
                    namespace='trec-news-2019',
                    lang='en',
                    qtype=TrecBackgroundLinkingQuery,
                    qtype_map=BL_MAP,
                    remove_tags=RM_TAGS),
        TrecQrels(dlc['trec-news-2019/qrels'], BL_QREL_DEFS),
        documentation('v2/trec-news-2019'))

    subsets['v3/trec-news-2020'] = Dataset(
        TrecQueries(dlc['trec-news-2020/queries'],
                    namespace='trec-news-2020',
                    lang='en',
                    qtype=TrecBackgroundLinkingQuery,
                    qtype_map=BL_MAP,
                    remove_tags=RM_TAGS),
        TrecQrels(dlc['trec-news-2020/qrels'], BL_QREL_DEFS),
        documentation('v3/trec-news-2020'))

    ir_datasets.registry.register(NAME, base)
    for s in sorted(subsets):
        ir_datasets.registry.register(f'{NAME}/{s}', subsets[s])

    return base, subsets
Example #4
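# Registers an AQUAINT-style collection (APW, NYT, and XIE newswire files) with a single
# 'trec-robust-2005' topic/qrel subset.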
def _init():
    subsets = {}
    base_path = ir_datasets.util.home_path() / NAME
    dlc = DownloadConfig.context(NAME, base_path)
    documentation = YamlDocumentation(f'docs/{NAME}.yaml')

    collection = TrecDocs(dlc['docs'],
                          encoding='utf8',
                          path_globs=[
                              'aquaint_comp/apw/*/*.gz',
                              'aquaint_comp/nyt/*/*.gz',
                              'aquaint_comp/xie/*/*.gz'
                          ],
                          namespace=NAME,
                          lang='en',
                          count_hint=ir_datasets.util.count_hint(NAME))

    base = Dataset(collection, documentation('_'))

    subsets['trec-robust-2005'] = Dataset(
        TrecQueries(dlc['trec-robust-2005/queries'],
                    qtype_map=QTYPE_MAP,
                    namespace='trec-robust',
                    lang='en'),
        TrecQrels(dlc['trec-robust-2005/qrels'], QREL_DEFS), collection,
        documentation('trec-robust-2005'))

    ir_datasets.registry.register(NAME, base)
    for s in sorted(subsets):
        ir_datasets.registry.register(f'{NAME}/{s}', subsets[s])

    return base, subsets
Example #5
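# Registers a collection built from FBIS, FR94, FT, and LA Times files; the queries and qrels
# are attached to the base dataset, and per-fold subsets are derived with
# FilteredQueries/FilteredQrels.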
def _init():
    documentation = YamlDocumentation(f'docs/{NAME}.yaml')
    base_path = ir_datasets.util.home_path() / NAME
    dlc = DownloadConfig.context(NAME, base_path, dua=DUA)
    subsets = {}

    collection = TrecDocs(dlc['docs'],
                          path_globs=[
                              '**/FBIS/FB*', '**/FR94/??/FR*', '**/FT/*/FT*',
                              '**/LATIMES/LA*'
                          ],
                          namespace=NAME,
                          lang='en',
                          expected_file_count=2295,
                          count_hint=ir_datasets.util.count_hint(NAME))

    queries = TrecQueries(GzipExtract(dlc['queries']),
                          namespace=NAME,
                          lang='en')
    qrels = TrecQrels(dlc['qrels'], QREL_DEFS)

    base = Dataset(collection, queries, qrels, documentation('_'))

    for fold in FOLDS:
        qid_filter = make_filter(fold)
        subsets[fold] = Dataset(FilteredQueries(queries, qid_filter),
                                FilteredQrels(qrels, qid_filter), collection,
                                documentation(fold))

    ir_datasets.registry.register(NAME, base)
    for s in sorted(subsets):
        ir_datasets.registry.register(f'{NAME}/{s}', subsets[s])

    return base, subsets
Example #6
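# Registers the GOV web collection (GovDocs) with TREC Web 2002-2004 ad-hoc and named-page
# subsets.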
def _init():
    documentation = YamlDocumentation(f'docs/{NAME}.yaml')
    base_path = ir_datasets.util.home_path()/NAME
    dlc = DownloadConfig.context(NAME, base_path)
    subsets = {}

    collection = GovDocs(dlc['docs'])
    base = Dataset(collection, documentation('_'))

    subsets['trec-web-2002'] = Dataset(
        collection,
        TrecQueries(GzipExtract(dlc['trec-web-2002/queries']), namespace='gov/trec-web-2002', lang='en'),
        TrecQrels(GzipExtract(dlc['trec-web-2002/qrels']), QREL_DEFS),
        documentation('trec-web-2002')
    )
    subsets['trec-web-2002/named-page'] = Dataset(
        collection,
        TrecQueries(GzipExtract(dlc['trec-web-2002/named-page/queries']), qtype=GenericQuery, qtype_map=NAMED_PAGE_QTYPE_MAP, namespace='gov/trec-web-2002/named-page', lang='en'),
        TrecQrels(GzipExtract(dlc['trec-web-2002/named-page/qrels']), NAMED_PAGE_QREL_DEFS),
        documentation('trec-web-2002/named-page')
    )
    subsets['trec-web-2003'] = Dataset(
        collection,
        TrecQueries(dlc['trec-web-2003/queries'], qtype=GovWeb02Query, qtype_map=WEB03_QTYPE_MAP, namespace='gov/trec-web-2003', lang='en'),
        TrecQrels(dlc['trec-web-2003/qrels'], QREL_DEFS),
        documentation('trec-web-2003')
    )
    subsets['trec-web-2003/named-page'] = Dataset(
        collection,
        TrecQueries(dlc['trec-web-2003/named-page/queries'], qtype=GenericQuery, qtype_map=NAMED_PAGE_QTYPE_MAP, namespace='gov/trec-web-2003/named-page', lang='en'),
        TrecQrels(dlc['trec-web-2003/named-page/qrels'], NAMED_PAGE_QREL_DEFS),
        documentation('trec-web-2003/named-page')
    )
    subsets['trec-web-2004'] = Dataset(
        collection,
        TrecQueries(dlc['trec-web-2004/queries'], qtype=GenericQuery, qtype_map=WEB04_QTYPE_MAP, namespace='gov/trec-web-2004', lang='en'),
        TrecQrels(dlc['trec-web-2004/qrels'], QREL_DEFS),
        documentation('trec-web-2004')
    )

    ir_datasets.registry.register(NAME, base)
    for s in sorted(subsets):
        ir_datasets.registry.register(f'{NAME}/{s}', subsets[s])

    return base, subsets
Example #7
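# Registers a Spanish-language collection (ISO-8859-1 AFP and Infosel documents) with 'trec3'
# and 'trec4' subsets whose topics are wrapped by TrecSpanishTranslateQueries.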
def _init():
    subsets = {}
    base_path = ir_datasets.util.home_path() / NAME
    dlc = DownloadConfig.context(NAME, base_path)
    documentation = YamlDocumentation(f'docs/{NAME}.yaml')

    collection = TrecDocs(
        dlc['docs'],
        encoding='ISO-8859-1',
        path_globs=['**/afp_text/af*', '**/infosel_data/ism_*'],
        namespace=NAME,
        lang='es',
        count_hint=ir_datasets.util.count_hint(NAME))

    base = Dataset(collection, documentation('_'))

    subsets['trec3'] = Dataset(
        TrecSpanishTranslateQueries(
            TrecQueries(GzipExtract(dlc['trec3/queries']),
                        qtype_map=QTYPE_MAP_3,
                        encoding='ISO-8859-1',
                        namespace=NAME,
                        lang=None), TrecSpanish3Query),
        TrecQrels(GzipExtract(dlc['trec3/qrels']), QREL_DEFS), collection,
        documentation('trec3'))

    subsets['trec4'] = Dataset(
        TrecSpanishTranslateQueries(
            TrecQueries(GzipExtract(dlc['trec4/queries']),
                        qtype=TrecDescOnlyQuery,
                        qtype_map=QTYPE_MAP_4,
                        encoding='ISO-8859-1',
                        namespace=NAME,
                        lang=None), TrecSpanish4Query),
        TrecQrels(GzipExtract(dlc['trec4/qrels']), QREL_DEFS), collection,
        documentation('trec4'))

    ir_datasets.registry.register(NAME, base)
    for s in sorted(subsets):
        ir_datasets.registry.register(f'{NAME}/{s}', subsets[s])

    return base, subsets
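Example #8
    # Unit-test method (from a TestCase class whose surrounding context is not shown) that feeds
    # a mocked TREC topic file to TrecQueries and checks the parsed TrecQuery tuples and the
    # reported queries_path().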
    def test_queries(self):
        mock_file = StringFile('''
<top>

<num> Number: Q100A 
<title>    Topic: Some title

<desc>  Description:  
Descriptive text
split on multiple lines

<narr> Narrative: 
Further elaboration of the query intent
split on multiple lines

</top>

<top>

<num> 102 
<title> Query 2

<desc>
Q2 description

<narr> Narrative: 
Q2 narrative

</top>
'''.lstrip())
        expected_results = [
            TrecQuery(
                'Q100A', 'Some title',
                "Descriptive text\nsplit on multiple lines",
                'Further elaboration of the query intent\nsplit on multiple lines'
            ),
            TrecQuery('102', 'Query 2', "Q2 description", 'Q2 narrative'),
        ]

        queries = TrecQueries(mock_file)
        self.assertEqual(queries.queries_path(), 'MOCK')
        self.assertEqual(list(queries.queries_iter()), expected_results)
Example #9
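# Registers the NYT collection (run through a Migrator that extracts body text) with the
# TREC Core 2017 subset and weakly-supervised 'wksup' train/valid splits filtered by VALID_IDS.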
def _init():
    subsets = {}
    base_path = ir_datasets.util.home_path()/NAME
    dlc = DownloadConfig.context(NAME, base_path)
    documentation = YamlDocumentation(f'docs/{NAME}.yaml')
    migrator = Migrator(base_path/'irds_version.txt', 'v2',
        affected_files=[base_path/'nyt.tgz.pklz4'],
        message='Migrating nyt (extracting body text)')

    collection = migrator(NytDocs(dlc['source']))

    base = Dataset(collection, documentation('_'))

    # core17
    subsets['trec-core-2017'] = Dataset(
        TrecQueries(dlc['trec-core-2017/queries'], namespace='trec-core-2017', lang='en'),
        TrecQrels(dlc['trec-core-2017/qrels'], CORE_QREL_DEFS),
        collection,
        documentation('trec-core-2017'))

    # wksup
    all_queries = NytQueries(collection)
    all_qrels = NytQrels(collection)
    match_qids = Lazy(lambda: VALID_IDS)
    subsets['wksup'] = Dataset(
        all_queries,
        all_qrels,
        collection,
        documentation('wksup/train'))
    subsets['wksup/train'] = Dataset(
        FilteredQueries(all_queries, match_qids, mode='exclude'),
        FilteredQrels(all_qrels, match_qids, mode='exclude'),
        collection,
        documentation('wksup/train'))
    subsets['wksup/valid'] = Dataset(
        FilteredQueries(all_queries, match_qids, mode='include'),
        FilteredQrels(all_qrels, match_qids, mode='include'),
        collection,
        documentation('wksup/valid'))

    ir_datasets.registry.register('nyt', base)
    for s in sorted(subsets):
        ir_datasets.registry.register(f'nyt/{s}', subsets[s])

    return base, subsets
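For reference, once a module like the one above has run its _init() and populated the registry, the registered identifiers can be loaded through the public ir_datasets API. A minimal usage sketch follows; the dataset id comes from the registration call in Example #9, and the field names match the TrecQuery/TrecQrel tuples used by the handlers above.

import ir_datasets

# 'nyt/trec-core-2017' is the key registered above ('nyt' + '/' + subset name).
dataset = ir_datasets.load('nyt/trec-core-2017')

# TREC topics parsed by TrecQueries: query_id, title, description, narrative.
for query in dataset.queries_iter():
    print(query.query_id, query.title)

# Relevance judgments parsed by TrecQrels: query_id, doc_id, relevance, iteration.
for qrel in dataset.qrels_iter():
    print(qrel.query_id, qrel.doc_id, qrel.relevance)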
Example #10
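# Registers the GOV2 collection (with a cached document-count file) plus the TREC Terabyte
# 2004-2006 ad-hoc, named-page, and efficiency subsets (including the 2006 efficiency query
# streams) and the Million Query 2007/2008 subsets, whose judgments are prels (TrecPrels).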
def _init():
    documentation = YamlDocumentation(f'docs/{NAME}.yaml')
    base_path = ir_datasets.util.home_path()/NAME
    dlc = DownloadConfig.context(NAME, base_path)
    subsets = {}

    docs_dlc = dlc['docs']
    doccount_dlc = Gov2DocCountFile(os.path.join(base_path, 'corpus.doccounts'), docs_dlc)
    collection = Gov2Docs(docs_dlc, doccount_dlc)
    base = Dataset(collection, documentation('_'))

    subsets['trec-tb-2004'] = Dataset(
        collection,
        TrecQueries(dlc['trec-tb-2004/queries'], namespace=NAME, lang='en'),
        TrecQrels(dlc['trec-tb-2004/qrels'], QREL_DEFS),
        documentation('trec-tb-2004')
    )
    subsets['trec-tb-2005'] = Dataset(
        collection,
        TrecQueries(dlc['trec-tb-2005/queries'], namespace=NAME, lang='en'),
        TrecQrels(dlc['trec-tb-2005/qrels'], QREL_DEFS),
        documentation('trec-tb-2005')
    )
    subsets['trec-tb-2005/named-page'] = Dataset(
        collection,
        TrecQueries(dlc['trec-tb-2005/named-page/queries'], qtype=GenericQuery, qtype_map=NAMED_PAGE_QTYPE_MAP, namespace=NAME, lang='en'),
        TrecQrels(dlc['trec-tb-2005/named-page/qrels'], NAMED_PAGE_QREL_DEFS),
        documentation('trec-tb-2005/named-page')
    )
    subsets['trec-tb-2005/efficiency'] = Dataset(
        collection,
        TrecColonQueries(GzipExtract(dlc['trec-tb-2005/efficiency/queries']), encoding='latin1', namespace=NAME, lang='en'),
        RewriteQids(TrecQrels(dlc['trec-tb-2005/qrels'], QREL_DEFS), EFF_MAP_05),
        documentation('trec-tb-2005/efficiency')
    )
    subsets['trec-tb-2006'] = Dataset(
        collection,
        TrecQueries(dlc['trec-tb-2006/queries'], namespace=NAME, lang='en'),
        TrecQrels(dlc['trec-tb-2006/qrels'], QREL_DEFS),
        documentation('trec-tb-2006')
    )
    subsets['trec-tb-2006/named-page'] = Dataset(
        collection,
        TrecQueries(dlc['trec-tb-2006/named-page/queries'], qtype=GenericQuery, qtype_map=NAMED_PAGE_QTYPE_MAP, namespace=NAME, lang='en'),
        TrecQrels(dlc['trec-tb-2006/named-page/qrels'], NAMED_PAGE_QREL_DEFS),
        documentation('trec-tb-2006/named-page')
    )
    subsets['trec-tb-2006/efficiency'] = Dataset(
        collection,
        TrecColonQueries(TarExtract(dlc['trec-tb-2006/efficiency/queries'], '06.efficiency_topics.all'), encoding='latin1', namespace=NAME, lang='en'),
        RewriteQids(TrecQrels(dlc['trec-tb-2006/qrels'], QREL_DEFS), EFF_MAP_06),
        documentation('trec-tb-2006/efficiency')
    )
    subsets['trec-tb-2006/efficiency/10k'] = Dataset(
        collection,
        TrecColonQueries(TarExtract(dlc['trec-tb-2006/efficiency/queries'], '06.efficiency_topics.10k'), encoding='latin1', namespace=NAME, lang='en'),
        documentation('trec-tb-2006/efficiency/10k')
    )
    subsets['trec-tb-2006/efficiency/stream1'] = Dataset(
        collection,
        TrecColonQueries(TarExtract(dlc['trec-tb-2006/efficiency/queries'], '06.efficiency_topics.stream-1'), encoding='latin1', namespace=NAME, lang='en'),
        documentation('trec-tb-2006/efficiency/stream1')
    )
    subsets['trec-tb-2006/efficiency/stream2'] = Dataset(
        collection,
        TrecColonQueries(TarExtract(dlc['trec-tb-2006/efficiency/queries'], '06.efficiency_topics.stream-2'), encoding='latin1', namespace=NAME, lang='en'),
        documentation('trec-tb-2006/efficiency/stream2')
    )
    subsets['trec-tb-2006/efficiency/stream3'] = Dataset(
        collection,
        TrecColonQueries(TarExtract(dlc['trec-tb-2006/efficiency/queries'], '06.efficiency_topics.stream-3'), encoding='latin1', namespace=NAME, lang='en'),
        RewriteQids(TrecQrels(dlc['trec-tb-2006/qrels'], QREL_DEFS), EFF_MAP_06),
        documentation('trec-tb-2006/efficiency/stream3')
    )
    subsets['trec-tb-2006/efficiency/stream4'] = Dataset(
        collection,
        TrecColonQueries(TarExtract(dlc['trec-tb-2006/efficiency/queries'], '06.efficiency_topics.stream-4'), encoding='latin1', namespace=NAME, lang='en'),
        documentation('trec-tb-2006/efficiency/stream4')
    )

    subsets['trec-mq-2007'] = Dataset(
        collection,
        TrecColonQueries(GzipExtract(dlc['trec-mq-2007/queries']), encoding='latin1'),
        TrecPrels(dlc['trec-mq-2007/qrels'], QREL_DEFS),
        documentation('trec-mq-2007')
    )
    subsets['trec-mq-2008'] = Dataset(
        collection,
        TrecColonQueries(GzipExtract(dlc['trec-mq-2008/queries']), encoding='latin1', namespace='trec-mq', lang='en'),
        TrecPrels(TarExtract(dlc['trec-mq-2008/qrels'], '2008.RC1/prels'), QREL_DEFS),
        documentation('trec-mq-2008')
    )

    ir_datasets.registry.register(NAME, base)
    for s in sorted(subsets):
        ir_datasets.registry.register(f'{NAME}/{s}', subsets[s])

    return base, subsets
Example #11
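# Registers the TripClick benchmark: the document collection, a 'logs' subset of click-log
# articles and query logs, head/torso/tail train/val/test splits with raw and DCTR qrels,
# BM25 run files exposed as scored docs, training doc-pair triples, and concatenated 'train',
# 'val', and 'test' roll-ups.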
def _init():
    subsets = {}
    base_path = ir_datasets.util.home_path() / NAME
    dlc = DownloadConfig.context(NAME, base_path)
    documentation = YamlDocumentation(f'docs/{NAME}.yaml')

    collection = TrecDocs(dlc['benchmark'],
                          parser='tut',
                          path_globs=['**/docs_grp_*.txt'],
                          namespace=NAME,
                          lang='en',
                          count_hint=ir_datasets.util.count_hint(NAME))
    topics_and_qrels = TarExtractAll(
        dlc['benchmark'],
        base_path / "topics_and_qrels",
        path_globs=['**/topics.*.txt', '**/qrels.*.txt'])
    val_runs = TarExtractAll(dlc['dlfiles'],
                             base_path / "val_runs",
                             path_globs=['**/run.trip.BM25.*.val.txt'])
    test_runs = TarExtractAll(dlc['dlfiles_runs_test'],
                              base_path / "test_runs",
                              path_globs=['**/run.trip.BM25.*.test.txt'])

    base = Dataset(collection, documentation('_'))

    subsets['logs'] = Dataset(
        TsvDocs(Cache(
            FixAllarticles(TarExtract(dlc['logs'], 'logs/allarticles.txt')),
            base_path / 'allarticles-fixed.tsv'),
                doc_cls=TripClickPartialDoc,
                lang='en',
                count_hint=ir_datasets.util.count_hint(f'{NAME}/logs')),
        TripClickQlogs(
            TarExtractAll(dlc['logs'],
                          base_path / 'logs',
                          path_globs=['**/*.json'])), documentation('logs'))

    ### Train

    subsets['train/head'] = Dataset(
        collection,
        TrecQueries(RelativePath(topics_and_qrels,
                                 'benchmark/topics/topics.head.train.txt'),
                    qtype=GenericQuery,
                    qtype_map=QTYPE_MAP,
                    namespace=NAME,
                    lang='en'),
        TrecQrels(
            RelativePath(topics_and_qrels,
                         'benchmark/qrels/qrels.raw.head.train.txt'),
            QREL_DEFS), documentation('train/head'))

    subsets['train/head/dctr'] = Dataset(
        TrecQrels(
            RelativePath(topics_and_qrels,
                         'benchmark/qrels/qrels.dctr.head.train.txt'),
            QREL_DCTR_DEFS), subsets['train/head'],
        documentation('train/head/dctr'))

    subsets['train/torso'] = Dataset(
        collection,
        TrecQueries(RelativePath(topics_and_qrels,
                                 'benchmark/topics/topics.torso.train.txt'),
                    qtype=GenericQuery,
                    qtype_map=QTYPE_MAP,
                    namespace=NAME,
                    lang='en'),
        TrecQrels(
            RelativePath(topics_and_qrels,
                         'benchmark/qrels/qrels.raw.torso.train.txt'),
            QREL_DEFS), documentation('train/torso'))

    subsets['train/tail'] = Dataset(
        collection,
        TrecQueries(RelativePath(topics_and_qrels,
                                 'benchmark/topics/topics.tail.train.txt'),
                    qtype=GenericQuery,
                    qtype_map=QTYPE_MAP,
                    namespace=NAME,
                    lang='en'),
        TrecQrels(
            RelativePath(topics_and_qrels,
                         'benchmark/qrels/qrels.raw.tail.train.txt'),
            QREL_DEFS), documentation('train/tail'))

    train_queries = ConcatQueries([
        subsets['train/head'].queries_handler(),
        subsets['train/torso'].queries_handler(),
        subsets['train/tail'].queries_handler(),
    ])
    train_docpairs = DocPairGenerator(
        TarExtract(dlc['dlfiles'], 'dlfiles/triples.train.tsv'), collection,
        train_queries, base_path / 'train.docpairs')
    subsets['train'] = Dataset(
        collection, train_queries,
        ConcatQrels([
            subsets['train/head'].qrels_handler(),
            subsets['train/torso'].qrels_handler(),
            subsets['train/tail'].qrels_handler(),
        ]), TsvDocPairs(train_docpairs), documentation('train'))
    subsets['train/hofstaetter-triples'] = Dataset(
        collection, train_queries, subsets['train'].qrels_handler(),
        TsvDocPairs(dlc['hofstaetter-triples']),
        documentation('train/hofstaetter-triples'))

    ### Val

    subsets['val/head'] = Dataset(
        collection,
        TrecQueries(RelativePath(topics_and_qrels,
                                 'benchmark/topics/topics.head.val.txt'),
                    qtype=GenericQuery,
                    qtype_map=QTYPE_MAP,
                    namespace=NAME,
                    lang='en'),
        TrecQrels(
            RelativePath(topics_and_qrels,
                         'benchmark/qrels/qrels.raw.head.val.txt'), QREL_DEFS),
        TrecScoredDocs(
            RelativePath(val_runs, 'dlfiles/run.trip.BM25.head.val.txt')),
        documentation('val/head'))

    subsets['val/head/dctr'] = Dataset(
        TrecQrels(
            RelativePath(topics_and_qrels,
                         'benchmark/qrels/qrels.dctr.head.val.txt'),
            QREL_DCTR_DEFS), subsets['val/head'],
        documentation('val/head/dctr'))

    subsets['val/torso'] = Dataset(
        collection,
        TrecQueries(RelativePath(topics_and_qrels,
                                 'benchmark/topics/topics.torso.val.txt'),
                    qtype=GenericQuery,
                    qtype_map=QTYPE_MAP,
                    namespace=NAME,
                    lang='en'),
        TrecQrels(
            RelativePath(topics_and_qrels,
                         'benchmark/qrels/qrels.raw.torso.val.txt'),
            QREL_DEFS),
        TrecScoredDocs(
            RelativePath(val_runs, 'dlfiles/run.trip.BM25.torso.val.txt')),
        documentation('val/torso'))

    subsets['val/tail'] = Dataset(
        collection,
        TrecQueries(RelativePath(topics_and_qrels,
                                 'benchmark/topics/topics.tail.val.txt'),
                    qtype=GenericQuery,
                    qtype_map=QTYPE_MAP,
                    namespace=NAME,
                    lang='en'),
        TrecQrels(
            RelativePath(topics_and_qrels,
                         'benchmark/qrels/qrels.raw.tail.val.txt'), QREL_DEFS),
        TrecScoredDocs(
            RelativePath(val_runs, 'dlfiles/run.trip.BM25.tail.val.txt')),
        documentation('val/tail'))

    subsets['val'] = Dataset(
        collection,
        ConcatQueries([
            subsets['val/head'].queries_handler(),
            subsets['val/torso'].queries_handler(),
            subsets['val/tail'].queries_handler(),
        ]),
        ConcatQrels([
            subsets['val/head'].qrels_handler(),
            subsets['val/torso'].qrels_handler(),
            subsets['val/tail'].qrels_handler(),
        ]),
        ConcatScoreddocs([
            subsets['val/head'].scoreddocs_handler(),
            subsets['val/torso'].scoreddocs_handler(),
            subsets['val/tail'].scoreddocs_handler(),
        ]), documentation('val'))

    ### Test

    subsets['test/head'] = Dataset(
        collection,
        TrecQueries(RelativePath(topics_and_qrels,
                                 'benchmark/topics/topics.head.test.txt'),
                    qtype=GenericQuery,
                    qtype_map=QTYPE_MAP,
                    namespace=NAME,
                    lang='en'),
        TrecScoredDocs(
            RelativePath(test_runs, 'runs_test/run.trip.BM25.head.test.txt')),
        documentation('test/head'))

    subsets['test/torso'] = Dataset(
        collection,
        TrecQueries(RelativePath(topics_and_qrels,
                                 'benchmark/topics/topics.torso.test.txt'),
                    qtype=GenericQuery,
                    qtype_map=QTYPE_MAP,
                    namespace=NAME,
                    lang='en'),
        TrecScoredDocs(
            RelativePath(test_runs, 'runs_test/run.trip.BM25.torso.test.txt')),
        documentation('test/torso'))

    subsets['test/tail'] = Dataset(
        collection,
        TrecQueries(RelativePath(topics_and_qrels,
                                 'benchmark/topics/topics.tail.test.txt'),
                    qtype=GenericQuery,
                    qtype_map=QTYPE_MAP,
                    namespace=NAME,
                    lang='en'),
        TrecScoredDocs(
            RelativePath(test_runs, 'runs_test/run.trip.BM25.tail.test.txt')),
        documentation('test/tail'))

    subsets['test'] = Dataset(
        collection,
        ConcatQueries([
            subsets['test/head'].queries_handler(),
            subsets['test/torso'].queries_handler(),
            subsets['test/tail'].queries_handler(),
        ]),
        ConcatScoreddocs([
            subsets['test/head'].scoreddocs_handler(),
            subsets['test/torso'].scoreddocs_handler(),
            subsets['test/tail'].scoreddocs_handler(),
        ]), documentation('test'))

    ir_datasets.registry.register(NAME, base)
    for s in sorted(subsets):
        ir_datasets.registry.register(f'{NAME}/{s}', subsets[s])

    return base, subsets