Ejemplo n.º 1
0
def _init():
    """Build the datasets for ``NAME`` and register them.

    Two document collections are created (labelled ``'2004'`` and
    ``'2017'``); each is exposed as a subset of its own and shared by the
    query/qrels subsets defined on top of it.

    Returns:
        tuple: ``(base, subsets)``, where ``base`` is the documentation-only
        root ``Dataset`` and ``subsets`` maps subset keys (relative to
        ``NAME``) to their ``Dataset`` objects.
    """
    docs = YamlDocumentation(f'docs/{NAME}.yaml')
    home = ir_datasets.util.home_path()/NAME
    downloads = DownloadConfig.context(NAME, home)

    root = Dataset(docs('_'))

    # 2004 collection: four gzip-wrapped sources.
    corpus04 = MedlineDocs(
        '2004',
        [
            GzipExtract(downloads[key])
            for key in ('2004/a', '2004/b', '2004/c', '2004/d')
        ],
        count_hint=ir_datasets.util.count_hint(f'{NAME}/2004'),
    )

    # 2017 collection: the AacrAscoDocs source concatenated with five parts.
    corpus17 = ConcatDocs(
        [
            AacrAscoDocs(downloads['2017/aacr_asco_extra']),
            MedlineDocs('2017', [
                downloads[key]
                for key in ('2017/part1', '2017/part2', '2017/part3',
                            '2017/part4', '2017/part5')
            ]),
        ],
        count_hint=ir_datasets.util.count_hint(f'{NAME}/2017'),
    )

    subsets = {
        '2004': Dataset(corpus04, docs('2004')),
        '2004/trec-genomics-2004': Dataset(
            corpus04,
            TrecXmlQueries(
                ZipExtract(downloads['trec-genomics-2004/queries'], 'Official.xml'),
                qtype=TrecGenomicsQuery,
                qtype_map=TREC04_XML_MAP,
                namespace='trec-genomics',
                lang='en',
            ),
            TrecQrels(downloads['trec-genomics-2004/qrels'], QREL_DEFS),
            docs('trec-genomics-2004'),
        ),
        '2004/trec-genomics-2005': Dataset(
            corpus04,
            TrecGenomicsQueries(downloads['trec-genomics-2005/queries']),
            TrecQrels(downloads['trec-genomics-2005/qrels'], QREL_DEFS),
            docs('trec-genomics-2005'),
        ),
        '2017': Dataset(corpus17, docs('2017')),
        '2017/trec-pm-2017': Dataset(
            corpus17,
            TrecXmlQueries(
                downloads['trec-pm-2017/queries'],
                qtype=TrecPm2017Query,
                namespace='trec-pm-2017',
                lang='en',
            ),
            TrecQrels(downloads['trec-pm-2017/qrels'], QREL_DEFS),
            docs('trec-pm-2017'),
        ),
        '2017/trec-pm-2018': Dataset(
            corpus17,
            TrecXmlQueries(
                downloads['trec-pm-2018/queries'],
                qtype=TrecPmQuery,
                namespace='trec-pm-2018',
                lang='en',
            ),
            TrecQrels(downloads['trec-pm-2018/qrels'], QREL_DEFS),
            docs('trec-pm-2018'),
        ),
    }

    # Register the root first, then every subset in sorted key order.
    ir_datasets.registry.register(NAME, root)
    for key in sorted(subsets):
        ir_datasets.registry.register(f'{NAME}/{key}', subsets[key])

    return root, subsets
Ejemplo n.º 2
0
def _init():
    """Build and register the ``en-noclean-tr`` collection and its subsets.

    Returns:
        tuple: ``(base, subsets)``, where ``base`` is the documentation-only
        root ``Dataset`` and ``subsets`` maps subset keys (relative to
        ``NAME``) to their ``Dataset`` objects.
    """
    home = ir_datasets.util.home_path() / NAME
    downloads = DownloadConfig.context(NAME, home)
    docs = YamlDocumentation(f'docs/{NAME}.yaml')

    sources = GzipExtract(downloads['en-noclean/sources'])
    checkpoints = TarExtractAll(
        downloads['en-noclean/checkpoints'],
        home / 'en.noclean.checkpoints',
    )
    # exclude validation files (only include train)
    train_corpus = C4Docs(
        sources,
        checkpoints,
        home,
        source_name_filter=r'en\.noclean\.c4-train',
        filter_name='train',
    )

    root = Dataset(docs('_'))

    misinfo_queries = TrecXmlQueries(
        downloads['trec-misinfo-2021/queries'],
        qtype=MisinfoQuery,
        qtype_map=misinfo_map,
        namespace='trec-misinfo',
        lang='en',
    )
    subsets = {
        'en-noclean-tr': Dataset(train_corpus, docs('en-noclean-tr')),
        'en-noclean-tr/trec-misinfo-2021': Dataset(
            train_corpus,
            misinfo_queries,
            docs('en-noclean-tr/trec-misinfo-2021'),
        ),
    }

    # Subsets are registered in insertion order.
    ir_datasets.registry.register(NAME, root)
    for key, dataset in subsets.items():
        ir_datasets.registry.register(f'{NAME}/{key}', dataset)

    return root, subsets
Ejemplo n.º 3
0
def _init():
    """Build and register the full collection, its ``b13`` variant and subsets.

    The full collection backs the root dataset and the ``trec-web-*``
    subsets; the ``b13`` collection (produced through
    ``ClueWeb12b13Extractor``) backs every ``b13/*`` subset.

    Returns:
        tuple: ``(base, subsets)``, where ``base`` wraps the full collection
        and ``subsets`` maps subset keys (relative to ``NAME``) to their
        ``Dataset`` objects.
    """
    docs = YamlDocumentation(f'docs/{NAME}.yaml')
    home = ir_datasets.util.home_path()/NAME
    downloads = DownloadConfig.context(NAME, home)

    corpus_src = downloads['docs']
    checkpoints = TarExtractAll(downloads['docs.chk'], home/'corpus.chk')
    jar_in_tar = TarExtract(
        downloads['cw12b-info'],
        'ClueWeb12-CreateB13/software/CreateClueWeb12B13Dataset.jar',
    )
    b13_info = Bz2Extract(Cache(jar_in_tar, home/'CreateClueWeb12B13Dataset.jar'))

    corpus_full = ClueWeb12Docs(corpus_src, checkpoints)
    corpus_b13 = ClueWeb12Docs(ClueWeb12b13Extractor(corpus_src, b13_info))

    def xml_queries(source, **kwargs):
        # All query sets in this function share the same namespace.
        return TrecXmlQueries(source, namespace=NAME, **kwargs)

    root = Dataset(corpus_full, docs('_'))

    subsets = {
        'b13': Dataset(corpus_b13, docs('b13')),
        'trec-web-2013': Dataset(
            corpus_full,
            xml_queries(downloads['trec-web-2013/queries'], qtype=TrecWebTrackQuery),
            TrecQrels(downloads['trec-web-2013/qrels.adhoc'], QREL_DEFS),
            docs('trec-web-2013'),
        ),
        'trec-web-2014': Dataset(
            corpus_full,
            xml_queries(downloads['trec-web-2014/queries'], qtype=TrecWebTrackQuery),
            TrecQrels(downloads['trec-web-2014/qrels.adhoc'], QREL_DEFS),
            docs('trec-web-2014'),
        ),
        'b13/ntcir-www-1': Dataset(
            corpus_b13,
            xml_queries(
                Cache(
                    ZipExtract(downloads['ntcir-www-1/queries'], 'eng.queries.xml'),
                    home/'ntcir-www-1'/'queries.xml',
                ),
                qtype=GenericQuery,
                qtype_map={'qid': 'query_id', 'content': 'text'},
            ),
            NtcirQrels(downloads['ntcir-www-1/qrels'], NTCIR_QREL_DEFS),
            docs('ntcir-www-1'),
        ),
        'b13/ntcir-www-2': Dataset(
            corpus_b13,
            xml_queries(
                Cache(
                    ZipExtract(downloads['ntcir-www-2/queries'], 'qEng.xml'),
                    home/'ntcir-www-2'/'queries.xml',
                ),
                qtype=NtcirQuery,
                qtype_map=ntcir_map,
            ),
            NtcirQrels(downloads['ntcir-www-2/qrels'], NTCIR_QREL_DEFS),
            docs('ntcir-www-2'),
        ),
        # No qrels are attached to this subset.
        'b13/ntcir-www-3': Dataset(
            corpus_b13,
            xml_queries(
                downloads['ntcir-www-3/queries'],
                qtype=NtcirQuery,
                qtype_map=ntcir_map,
            ),
            docs('ntcir-www-3'),
        ),
        'b13/trec-misinfo-2019': Dataset(
            corpus_b13,
            xml_queries(
                downloads['trec-misinfo-2019/queries'],
                qtype=MisinfoQuery,
                qtype_map=misinfo_map,
            ),
            MsinfoQrels(downloads['trec-misinfo-2019/qrels'], MISINFO_QREL_DEFS),
            docs('trec-misinfo-2019'),
        ),
    }

    # Register the root first, then every subset in sorted key order.
    ir_datasets.registry.register(NAME, root)
    for key in sorted(subsets):
        ir_datasets.registry.register(f'{NAME}/{key}', subsets[key])

    return root, subsets
Ejemplo n.º 4
0
def _init():
    """Build and register the ``2020-07-16`` collection and ``trec-covid``.

    Returns:
        tuple: ``(base, subsets)``, where ``base`` wraps the document
        collection and ``subsets`` maps subset keys (relative to ``NAME``)
        to their ``Dataset`` objects.
    """
    home = ir_datasets.util.home_path()/NAME
    downloads = DownloadConfig.context(NAME, home)
    docs = YamlDocumentation(f'docs/{NAME}.yaml')

    corpus = Cord19Docs(
        downloads['docs/2020-07-16'],
        home/'2020-07-16',
        '2020-07-16',
    )
    root = Dataset(corpus, docs('_'))

    field_map = {
        'query': 'title',
        'question': 'description',
        'narrative': 'narrative',
    }
    covid_queries = TrecXmlQueries(
        downloads['trec-covid/queries'],
        qtype_map=field_map,
        namespace=NAME,
    )
    covid_qrels = TrecQrels(downloads['trec-covid/qrels'], QRELS_DEFS)

    subsets = {
        'trec-covid': Dataset(
            covid_queries,
            covid_qrels,
            corpus,
            docs('trec-covid'),
        ),
    }

    # Register the root first, then every subset in sorted key order.
    ir_datasets.registry.register(NAME, root)
    for key in sorted(subsets):
        ir_datasets.registry.register(f'{NAME}/{key}', subsets[key])

    return root, subsets
Ejemplo n.º 5
0
def _init():
    """Build and register one document collection and two query subsets.

    Returns:
        tuple: ``(base, subsets)``, where ``base`` wraps the document
        collection and ``subsets`` maps subset keys (relative to ``NAME``)
        to their ``Dataset`` objects.
    """
    docs = YamlDocumentation(f'docs/{NAME}.yaml')
    home = ir_datasets.util.home_path() / NAME
    downloads = DownloadConfig.context(NAME, home)

    # The collection is read from four gzip-wrapped sources.
    corpus = MedlineDocs([
        GzipExtract(downloads[key])
        for key in ('docs/a', 'docs/b', 'docs/c', 'docs/d')
    ])
    root = Dataset(corpus, docs('_'))

    queries_2004 = TrecXmlQueries(
        ZipExtract(downloads['trec-genomics-2004/queries'], 'Official.xml'),
        qtype=TrecGenomicsQuery,
        qtype_map=TREC04_XML_MAP,
        namespace='trec-genomics',
    )
    subsets = {
        'trec-genomics-2004': Dataset(
            corpus,
            queries_2004,
            TrecQrels(downloads['trec-genomics-2004/qrels'], QREL_DEFS),
            docs('trec-genomics-2004'),
        ),
        'trec-genomics-2005': Dataset(
            corpus,
            TrecGenomicsQueries(downloads['trec-genomics-2005/queries']),
            TrecQrels(downloads['trec-genomics-2005/qrels'], QREL_DEFS),
            docs('trec-genomics-2005'),
        ),
    }

    # Register the root first, then every subset in sorted key order.
    ir_datasets.registry.register(NAME, root)
    for key in sorted(subsets):
        ir_datasets.registry.register(f'{NAME}/{key}', subsets[key])

    return root, subsets
Ejemplo n.º 6
0
def _init():
    """Build and register the full collection, its ``b13`` variant and subsets.

    The full collection backs the root dataset and the ``trec-web-*``
    subsets; the ``b13`` collection (produced through
    ``ClueWeb12b13Extractor``) backs every ``b13/*`` subset, including the
    English and translated ``clef-ehealth`` query sets.

    Returns:
        tuple: ``(base, subsets)``, where ``base`` wraps the full collection
        and ``subsets`` maps subset keys (relative to ``NAME``) to their
        ``Dataset`` objects.
    """
    docs = YamlDocumentation(f'docs/{NAME}.yaml')
    home = ir_datasets.util.home_path() / NAME
    downloads = DownloadConfig.context(NAME, home)

    corpus_src = downloads['docs']
    checkpoints = TarExtractAll(downloads['docs.chk'], home / 'corpus.chk')
    jar_in_tar = TarExtract(
        downloads['cw12b-info'],
        'ClueWeb12-CreateB13/software/CreateClueWeb12B13Dataset.jar')
    b13_info = Bz2Extract(
        Cache(jar_in_tar, home / 'CreateClueWeb12B13Dataset.jar'))

    corpus_full = ClueWeb12Docs(corpus_src, checkpoints)
    corpus_b13 = ClueWeb12Docs(ClueWeb12b13Extractor(corpus_src, b13_info))

    def web_track_queries(key):
        # Both web-track years share the same query type and namespace.
        return TrecXmlQueries(downloads[key],
                              qtype=TrecWebTrackQuery,
                              namespace='trec-web',
                              lang='en')

    def ehealth_queries(key, lang):
        return TrecXmlQueries(FixAmp(downloads[key]),
                              qtype=GenericQuery,
                              qtype_map=ehealth_map,
                              namespace='clef-ehealth',
                              lang=lang)

    def ehealth_qrels(**extra):
        # Every clef-ehealth subset reads the same 2016/2017 files; the
        # translated subsets additionally pass a query_id_suffix.
        return EhealthQrels(
            [downloads['clef-ehealth/2016.qrels'],
             downloads['clef-ehealth/2017.qrels']],
            [downloads['clef-ehealth/2016.qtrust'],
             downloads['clef-ehealth/2017.qtrust']],
            [downloads['clef-ehealth/2016.qunder'],
             downloads['clef-ehealth/2017.qreads']],
            EHEALTH_QREL_DEFS,
            **extra)

    root = Dataset(corpus_full, docs('_'))

    subsets = {
        'b13': Dataset(corpus_b13, docs('b13')),
        'trec-web-2013': Dataset(
            corpus_full,
            web_track_queries('trec-web-2013/queries'),
            TrecQrels(downloads['trec-web-2013/qrels.adhoc'], QREL_DEFS),
            docs('trec-web-2013')),
        'trec-web-2014': Dataset(
            corpus_full,
            web_track_queries('trec-web-2014/queries'),
            TrecQrels(downloads['trec-web-2014/qrels.adhoc'], QREL_DEFS),
            docs('trec-web-2014')),
        'b13/ntcir-www-1': Dataset(
            corpus_b13,
            TrecXmlQueries(
                Cache(
                    ZipExtract(downloads['ntcir-www-1/queries'],
                               'eng.queries.xml'),
                    home / 'ntcir-www-1' / 'queries.xml'),
                qtype=GenericQuery,
                qtype_map={'qid': 'query_id', 'content': 'text'},
                namespace='ntcir-www',
                lang='en'),
            NtcirQrels(downloads['ntcir-www-1/qrels'], NTCIR_QREL_DEFS),
            docs('ntcir-www-1')),
        'b13/ntcir-www-2': Dataset(
            corpus_b13,
            TrecXmlQueries(
                Cache(
                    ZipExtract(downloads['ntcir-www-2/queries'], 'qEng.xml'),
                    home / 'ntcir-www-2' / 'queries.xml'),
                qtype=NtcirQuery,
                qtype_map=ntcir_map,
                namespace='ntcir-www',
                lang='en'),
            NtcirQrels(downloads['ntcir-www-2/qrels'], NTCIR_QREL_DEFS),
            docs('ntcir-www-2')),
        # No qrels are attached to this subset.
        'b13/ntcir-www-3': Dataset(
            corpus_b13,
            TrecXmlQueries(downloads['ntcir-www-3/queries'],
                           qtype=NtcirQuery,
                           qtype_map=ntcir_map,
                           namespace='ntcir-www',
                           lang='en'),
            docs('ntcir-www-3')),
        'b13/trec-misinfo-2019': Dataset(
            corpus_b13,
            TrecXmlQueries(downloads['trec-misinfo-2019/queries'],
                           qtype=MisinfoQuery,
                           qtype_map=misinfo_map,
                           namespace='trec-misinfo-2019',
                           lang='en'),
            MsinfoQrels(downloads['trec-misinfo-2019/qrels'],
                        MISINFO_QREL_DEFS),
            docs('trec-misinfo-2019')),
        'b13/clef-ehealth': Dataset(
            corpus_b13,
            ehealth_queries('clef-ehealth/queries', 'en'),
            ehealth_qrels(),
            docs('clef-ehealth')),
    }

    # Translated clef-ehealth query sets: same qrels files, with the
    # language code used as the query-id suffix.
    for lang in ('cs', 'de', 'fr', 'hu', 'pl', 'sv'):
        subsets[f'b13/clef-ehealth/{lang}'] = Dataset(
            corpus_b13,
            ehealth_queries(f'clef-ehealth/queries/{lang}', lang),
            ehealth_qrels(query_id_suffix=f'-{lang}'),
            docs(f'clef-ehealth/{lang}'))

    # NOTE: the following datasets are defined in touche.py:
    # - clueweb12/touche-2020-task-2
    # - clueweb12/touche-2021-task-2

    # Register the root first, then every subset in sorted key order.
    ir_datasets.registry.register(NAME, root)
    for key in sorted(subsets):
        ir_datasets.registry.register(f'{NAME}/{key}', subsets[key])

    return root, subsets
Ejemplo n.º 7
0
def _init():
    """Build and register the multi-language collection and its subsets.

    One collection spans everything (``lang=None``); further collections
    are restricted to specific directories and tagged with a language. The
    ``en`` and ``catb`` collections each get ``trec-web-2009`` through
    ``trec-web-2012`` subsets, with the ``catb`` qrels wrapped in
    ``CatBQrelFilter``.

    Returns:
        tuple: ``(base, subsets)``, where ``base`` wraps the unrestricted
        collection and ``subsets`` maps subset keys (relative to ``NAME``)
        to their ``Dataset`` objects.
    """
    docs = YamlDocumentation(f'docs/{NAME}.yaml')
    home = ir_datasets.util.home_path() / NAME
    downloads = DownloadConfig.context(NAME, home)

    corpus_src = downloads['docs']
    checkpoints = TarExtractAll(downloads['docs.chk'], home / 'corpus.chk')
    corpus_all = ClueWeb09Docs(corpus_src, checkpoints, lang=None)  # multiple langs

    # (subset key, language tag, directories) for each restricted collection.
    layout = [
        ('ar', 'ar', ['ClueWeb09_Arabic_1']),
        ('zh', 'zh', [
            'ClueWeb09_Chinese_1',
            'ClueWeb09_Chinese_2',
            'ClueWeb09_Chinese_3',
            'ClueWeb09_Chinese_4',
        ]),
        ('en', 'en', [
            'ClueWeb09_English_1', 'ClueWeb09_English_2',
            'ClueWeb09_English_3', 'ClueWeb09_English_4',
            'ClueWeb09_English_5', 'ClueWeb09_English_6',
            'ClueWeb09_English_7', 'ClueWeb09_English_8',
            'ClueWeb09_English_9', 'ClueWeb09_English_10',
        ]),
        ('fr', 'fr', ['ClueWeb09_French_1']),
        ('de', 'de', ['ClueWeb09_German_1']),
        ('it', 'it', ['ClueWeb09_Italian_1']),
        ('ja', 'ja', ['ClueWeb09_Japanese_1', 'ClueWeb09_Japanese_2']),
        ('ko', 'ko', ['ClueWeb09_Korean_1']),
        ('pt', 'pt', ['ClueWeb09_Portuguese_1']),
        ('es', 'es', ['ClueWeb09_Spanish_1', 'ClueWeb09_Spanish_2']),
        ('catb', 'en', ['ClueWeb09_English_1']),
    ]
    corpora = {
        key: ClueWeb09Docs(corpus_src, checkpoints, dirs=dirs, lang=lang)
        for key, lang, dirs in layout
    }

    root = Dataset(corpus_all, docs('_'))

    subsets = {
        key: Dataset(corpus, docs(key)) for key, corpus in corpora.items()
    }

    years = ('2009', '2010', '2011', '2012')

    def web_track_queries(year):
        return TrecXmlQueries(downloads[f'trec-web-{year}/queries'],
                              qtype=TrecWebTrackQuery,
                              namespace=NAME,
                              lang='en')

    def adhoc_qrels(year):
        source = downloads[f'trec-web-{year}/qrels.adhoc']
        # 2009 is the only year read as gzip-wrapped prels with its own defs.
        if year == '2009':
            return TrecPrels(GzipExtract(source), QREL_DEFS_09)
        return TrecQrels(source, QREL_DEFS)

    for year in years:
        subsets[f'en/trec-web-{year}'] = Dataset(
            corpora['en'],
            web_track_queries(year),
            adhoc_qrels(year),
            docs(f'trec-web-{year}'))

    for year in years:
        subsets[f'catb/trec-web-{year}'] = Dataset(
            corpora['catb'],
            web_track_queries(year),
            CatBQrelFilter(adhoc_qrels(year)),
            docs(f'trec-web-{year}'))

    subsets['trec-mq-2009'] = Dataset(
        corpus_all,
        TrecColonQueries(GzipExtract(downloads['trec-mq-2009/queries']),
                         encoding='latin1',
                         lang='en'),
        TrecPrels(GzipExtract(downloads['trec-mq-2009/qrels']), QREL_DEFS_09),
        docs('trec-mq-2009'))

    # Register the root first, then every subset in sorted key order.
    ir_datasets.registry.register(NAME, root)
    for key in sorted(subsets):
        ir_datasets.registry.register(f'{NAME}/{key}', subsets[key])

    return root, subsets
Ejemplo n.º 8
0
def _init():
    """Build and register the ``v1``/``v2`` collections and their subsets.

    ``v1`` backs the 2014 and 2015 ``trec-cds`` subsets and ``v2`` backs the
    2016 one.

    Returns:
        tuple: ``(base, subsets)``, where ``base`` is the documentation-only
        root ``Dataset`` and ``subsets`` maps subset keys (relative to
        ``NAME``) to their ``Dataset`` objects.
    """
    docs = YamlDocumentation(f'docs/{NAME}.yaml')
    home = ir_datasets.util.home_path() / NAME
    downloads = DownloadConfig.context(NAME, home)

    # Only v1 is given duplicate_dlcs.
    corpus_v1 = PmcDocs(
        [
            downloads[key]
            for key in ('v1/source0', 'v1/source1', 'v1/source2', 'v1/source3')
        ],
        home / 'v1' / 'corpus',
        duplicate_dlcs=[downloads['v1/dup1'], downloads['v1/dup2']],
        count_hint=ir_datasets.util.count_hint(f'{NAME}/v1'))
    corpus_v2 = PmcDocs(
        [
            downloads[key]
            for key in ('v2/source0', 'v2/source1', 'v2/source2', 'v2/source3')
        ],
        home / 'v2' / 'corpus',
        count_hint=ir_datasets.util.count_hint(f'{NAME}/v2'))

    root = Dataset(docs('_'))

    def cds_queries(key, query_type, namespace):
        # Every track year shares QUERY_FILE_MAP and is tagged as English.
        return TrecXmlQueries(downloads[key],
                              query_type,
                              QUERY_FILE_MAP,
                              namespace=namespace,
                              lang='en')

    subsets = {
        'v1': Dataset(corpus_v1, docs('v1')),
        'v2': Dataset(corpus_v2, docs('v2')),
        'v1/trec-cds-2014': Dataset(
            corpus_v1,
            cds_queries('trec-cds-2014/queries', TrecCdsQuery, 'trec-cds-2014'),
            TrecQrels(downloads['trec-cds-2014/qrels'], QREL_DEFS),
            docs('v1/trec-cds-2014'),
        ),
        'v1/trec-cds-2015': Dataset(
            corpus_v1,
            cds_queries('trec-cds-2015/queries', TrecCdsQuery, 'trec-cds-2015'),
            TrecQrels(downloads['trec-cds-2015/qrels'], QREL_DEFS),
            docs('v1/trec-cds-2015'),
        ),
        'v2/trec-cds-2016': Dataset(
            corpus_v2,
            cds_queries('trec-cds-2016/queries', TrecCds2016Query, 'trec-cds-2016'),
            TrecQrels(downloads['trec-cds-2016/qrels'], QREL_DEFS),
            docs('v2/trec-cds-2016'),
        ),
    }

    # Register the root first, then every subset in sorted key order.
    ir_datasets.registry.register(NAME, root)
    for key in sorted(subsets):
        ir_datasets.registry.register(f'{NAME}/{key}', subsets[key])

    return root, subsets
Ejemplo n.º 9
0
def _init():
    """Build and register the 2017/2019/2021 collections and their subsets.

    The two ``2017/trec-pm-*`` subsets reuse the query handlers of the
    identically keyed ``medline`` subsets rather than loading the topics
    again. The ``2021/trec-ct-2021`` subset has queries only; no qrels are
    attached here.

    Returns:
        tuple: ``(base, subsets)``, where ``base`` is the documentation-only
        root ``Dataset`` and ``subsets`` maps subset keys (relative to
        ``NAME``) to their ``Dataset`` objects.
    """
    documentation = YamlDocumentation(f'docs/{NAME}.yaml')
    base_path = ir_datasets.util.home_path() / NAME
    dlc = DownloadConfig.context(NAME, base_path)
    subsets = {}

    base = Dataset(documentation('_'))

    collection17 = ClinicalTrialsDocs(
        '2017', [dlc['docs/2017']],
        count_hint=ir_datasets.util.count_hint(f'{NAME}/2017'))
    collection19 = ClinicalTrialsDocs(
        '2019', [
            dlc['docs/2019/0'], dlc['docs/2019/1'], dlc['docs/2019/2'],
            dlc['docs/2019/3']
        ],
        count_hint=ir_datasets.util.count_hint(f'{NAME}/2019'))
    # The 2021 collection is the only one passed compress_format='zip'.
    collection21 = ClinicalTrialsDocs(
        '2021', [
            dlc['docs/2021/1'], dlc['docs/2021/2'], dlc['docs/2021/3'],
            dlc['docs/2021/4'], dlc['docs/2021/5']
        ],
        compress_format='zip',
        count_hint=ir_datasets.util.count_hint(f'{NAME}/2021'))

    subsets['2017'] = Dataset(collection17, documentation('2017'))

    subsets['2019'] = Dataset(collection19, documentation('2019'))

    subsets['2021'] = Dataset(collection21, documentation('2021'))

    # Topics come from the medline subsets of the same track and year; only
    # the qrels are specific to this collection.
    subsets['2017/trec-pm-2017'] = Dataset(
        collection17, medline.subsets['2017/trec-pm-2017'].queries_handler(),
        TrecQrels(dlc['trec-pm-2017/qrels'], QREL_DEFS),
        documentation('trec-pm-2017'))

    subsets['2017/trec-pm-2018'] = Dataset(
        collection17, medline.subsets['2017/trec-pm-2018'].queries_handler(),
        TrecQrels(dlc['trec-pm-2018/qrels'], QREL_DEFS),
        documentation('trec-pm-2018'))

    subsets['2019/trec-pm-2019'] = Dataset(
        collection19,
        TrecXmlQueries(dlc['trec-pm-2019/queries'],
                       qtype=medline.TrecPmQuery,
                       namespace='trec-pm-2019',
                       lang='en'),
        TrecQrels(dlc['trec-pm-2019/qrels'], QREL_DEFS),
        documentation('trec-pm-2019'))

    # The namespace must identify this track's own topics. It previously
    # read 'trec-pm-2019' (copied from the subset above), which tagged the
    # 2021 topics with the 2019 track's namespace.
    subsets['2021/trec-ct-2021'] = Dataset(
        collection21,
        TrecXmlQueries(dlc['trec-ct-2021/queries'],
                       qtype=GenericQuery,
                       qtype_map=ct_qmap,
                       namespace='trec-ct-2021',
                       lang='en'), documentation('trec-ct-2021'))

    # Register the root first, then every subset in sorted key order.
    ir_datasets.registry.register(NAME, base)
    for s in sorted(subsets):
        ir_datasets.registry.register(f'{NAME}/{s}', subsets[s])

    return base, subsets
Ejemplo n.º 10
0
def _init():
    """Build and register the collections and ``trec-covid`` subsets.

    The ``2020-07-16`` snapshot is loaded twice: once from the metadata
    download (root dataset) and once with ``include_fulltext=True``
    (``fulltext``). Rounds 1-4 each get their own dated collection, queries
    and qrels; round 5 reuses the main collection and queries with its own
    qrels.

    Returns:
        tuple: ``(base, subsets)``, where ``base`` wraps the metadata
        collection and ``subsets`` maps subset keys (relative to ``NAME``)
        to their ``Dataset`` objects.
    """
    home = ir_datasets.util.home_path() / NAME
    downloads = DownloadConfig.context(NAME, home)
    docs = YamlDocumentation(f'docs/{NAME}.yaml')

    corpus = Cord19Docs(
        downloads['docs/2020-07-16/metadata'],
        home / '2020-07-16',
        '2020-07-16',
        count_hint=ir_datasets.util.count_hint(f'{NAME}'))
    corpus_fulltext = Cord19Docs(
        downloads['docs/2020-07-16'],
        home / '2020-07-16.fulltext',
        '2020-07-16',
        include_fulltext=True,
        count_hint=ir_datasets.util.count_hint(f'{NAME}/fulltext'))

    def covid_queries(key):
        return TrecXmlQueries(downloads[key],
                              qtype_map=QTYPE_MAP,
                              namespace=NAME,
                              lang='en')

    # Shared by the main, fulltext and round5 subsets.
    main_queries = covid_queries('trec-covid/queries')
    main_qrels = TrecQrels(downloads['trec-covid/qrels'], QRELS_DEFS)

    root = Dataset(corpus, docs('_'))

    subsets = {
        'trec-covid': Dataset(main_queries, main_qrels, corpus,
                              docs('trec-covid')),
        'fulltext': Dataset(corpus_fulltext, docs('fulltext')),
        'fulltext/trec-covid': Dataset(main_queries, main_qrels,
                                       corpus_fulltext,
                                       docs('fulltext/trec-covid')),
    }

    # Rounds 1-4: each round uses the collection dated as listed here.
    round_dates = (
        ('round1', '2020-04-10'),
        ('round2', '2020-05-01'),
        ('round3', '2020-05-19'),
        ('round4', '2020-06-19'),
    )
    for round_name, date in round_dates:
        subsets[f'trec-covid/{round_name}'] = Dataset(
            Cord19Docs(
                downloads[f'docs/{date}/metadata'],
                home / date,
                date,
                count_hint=ir_datasets.util.count_hint(f'{NAME}/{round_name}')),
            covid_queries(f'trec-covid/{round_name}/queries'),
            TrecQrels(downloads[f'trec-covid/{round_name}/qrels'], QRELS_DEFS),
            docs(f'trec-covid/{round_name}'))

    subsets['trec-covid/round5'] = Dataset(
        corpus, main_queries,
        TrecQrels(downloads['trec-covid/round5/qrels'], QRELS_DEFS),
        docs('trec-covid/round5'))

    # Register the root first, then every subset in sorted key order.
    ir_datasets.registry.register(NAME, root)
    for key in sorted(subsets):
        ir_datasets.registry.register(f'{NAME}/{key}', subsets[key])

    return root, subsets