Ejemplo n.º 1
0
    def test_core(self):
        data_type = namedtuple('data_type', ['doc_id', 'field1', 'field2'])
        mock_file = StringFile('''
123\tsome field\tanother field
123\t  repeated  entry \tshouldn't filter

456\tanother query\tsomething
'''.lstrip())
        expected_results = [
            data_type('123', 'some field', 'another field'),
            data_type('123', '  repeated  entry ', 'shouldn\'t filter'),
            data_type('456', 'another query', 'something'),
        ]

        queries = TsvQueries(mock_file, data_type)
        self.assertEqual(queries.queries_path(), 'MOCK')
        self.assertEqual(list(queries.queries_iter()), expected_results)

        docs = TsvDocs(mock_file, data_type)
        self.assertEqual(docs.docs_path(), 'MOCK')
        self.assertEqual(list(docs.docs_iter()), expected_results)

        docpairs = TsvDocPairs(mock_file, data_type)
        self.assertEqual(docpairs.docpairs_path(), 'MOCK')
        self.assertEqual(list(docpairs.docpairs_iter()), expected_results)
Ejemplo n.º 2
0
    def test_flex_columns(self):
        class data_type(NamedTuple):
            doc_id: str
            field1: str
            field2: Tuple[str, ...]

        mock_file = StringFile('''
123\tsome field\tanother field
123\ttoo few fields

456\tanother query\tsomething
456\tanother query\tsomething\ttoo many fields\teven more
'''.strip())

        expected_results = [
            data_type('123', 'some field', ('another field', )),
            data_type('123', 'too few fields', ()),
            data_type('456', 'another query', ('something', )),
            data_type('456', 'another query',
                      ('something', 'too many fields', 'even more')),
        ]

        queries = TsvQueries(mock_file, data_type)
        self.assertEqual(queries.queries_path(), 'MOCK')
        self.assertEqual(list(queries.queries_iter()), expected_results)

        docs = TsvDocs(mock_file, data_type)
        self.assertEqual(docs.docs_path(), 'MOCK')
        self.assertEqual(list(docs.docs_iter()), expected_results)

        docpairs = TsvDocPairs(mock_file, data_type)
        self.assertEqual(docpairs.docpairs_path(), 'MOCK')
        self.assertEqual(list(docpairs.docpairs_iter()), expected_results)
Ejemplo n.º 3
0
def _init():
    """Build and register the NQ passage dataset plus train/dev subsets.

    Returns the base dataset and the dict of subsets; each is also
    registered under NAME and NAME/<subset>.
    """
    base_path = ir_datasets.util.home_path()/NAME
    dlc = ir_datasets.util.DownloadConfig.context(NAME, base_path)
    manager = NqManager(dlc, base_path)
    documentation = YamlDocumentation(f'docs/{NAME}.yaml')

    collection = DocstoreBackedDocs(manager.docs_store, docs_cls=NqPassageDoc, namespace=NAME, lang='en')
    base = Dataset(
        collection,
        documentation('_'))

    # train and dev share an identical file layout, so build both in a loop.
    subsets = {}
    for split in ('train', 'dev'):
        subsets[split] = Dataset(
            collection,
            TsvQueries(manager.file_ref(f'{split}.queries.tsv'), namespace=NAME, lang='en'),
            NqQrels(manager.file_ref(f'{split}.qrels.jsonl')),
            NqScoredDocs(manager.file_ref(f'{split}.scoreddocs.tsv')),
            documentation(split),
            )

    ir_datasets.registry.register(NAME, base)
    for subset_name in sorted(subsets):
        ir_datasets.registry.register(f'{NAME}/{subset_name}', subsets[subset_name])

    return base, subsets
Ejemplo n.º 4
0
def _init():
    """Build and register the MS MARCO v2 passage dataset and its subsets.

    Returns the collection and the dict of subsets; each is registered
    under NAME and NAME/<subset>.
    """
    base_path = ir_datasets.util.home_path()/NAME
    documentation = YamlDocumentation(f'docs/{NAME}.yaml')
    dlc = DownloadConfig.context(NAME, base_path, dua=DUA)
    subsets = {}

    # One-time migrations for users upgrading cached data from earlier
    # ir_datasets versions.
    migrator = Migrator(base_path/'irds_version.txt', 'v2',
        affected_files=[base_path/'msmarco_v2_passage.tar.pklz4'],
        message='Cleaning up pklz4 lookup structure in favor of ID-based lookups')
    collection = migrator(MsMarcoV2Passages(dlc['passages']))

    qrels_migrator = Migrator(base_path/'qrels_version.txt', 'v2',
        affected_files=[base_path/'train'/'qrels.tsv', base_path/'dev1'/'qrels.tsv', base_path/'dev2'/'qrels.tsv'],
        message='Updating qrels (task organizers removed duplicates)')

    # train/dev1/dev2 share an identical layout, so build them in one loop.
    for split in ('train', 'dev1', 'dev2'):
        subsets[split] = Dataset(
            collection,
            TsvQueries(dlc[f'{split}/queries'], namespace='msmarco', lang='en'),
            qrels_migrator(TrecQrels(dlc[f'{split}/qrels'], QRELS_DEFS)),
            TrecScoredDocs(GzipExtract(dlc[f'{split}/scoreddocs'])),
        )

    subsets['trec-dl-2021'] = Dataset(
        collection,
        TsvQueries(dlc['trec-dl-2021/queries'], namespace='msmarco', lang='en'),
        TrecQrels(dlc['trec-dl-2021/qrels'], TREC_DL_QRELS_DEFS),
        TrecScoredDocs(GzipExtract(dlc['trec-dl-2021/scoreddocs'])),
    )
    # "judged" variant: restrict to queries with at least one qrel.
    dl21_judged = Lazy(lambda: {q.query_id for q in subsets['trec-dl-2021'].qrels_iter()})
    subsets['trec-dl-2021/judged'] = Dataset(
        FilteredQueries(subsets['trec-dl-2021'].queries_handler(), dl21_judged),
        FilteredScoredDocs(subsets['trec-dl-2021'].scoreddocs_handler(), dl21_judged),
        subsets['trec-dl-2021'],
    )

    ir_datasets.registry.register(NAME, Dataset(collection, documentation("_")))
    for subset_name in sorted(subsets):
        ir_datasets.registry.register(f'{NAME}/{subset_name}', Dataset(subsets[subset_name], documentation(subset_name)))

    return collection, subsets
Ejemplo n.º 5
0
def _init():
    """Build and register the ANTIQUE dataset and its subsets.

    Returns the collection and the dict of subsets; each is registered
    under NAME and NAME/<subset>.
    """
    documentation = YamlDocumentation('docs/antique.yaml')
    base_path = ir_datasets.util.home_path() / NAME
    dlc = DownloadConfig.context(NAME, base_path, dua=DUA)
    collection = TsvDocs(dlc['docs'],
                         namespace=NAME,
                         lang='en',
                         count_hint=ir_datasets.util.count_hint(NAME))

    subsets = {}
    for split in ('train', 'test'):
        split_qrels = TrecQrels(dlc[f'{split}/qrels'], QREL_DEFS)
        split_queries = TsvQueries(dlc[f'{split}/queries'],
                                   namespace=NAME,
                                   lang='en')
        subsets[split] = Dataset(collection, split_queries, split_qrels)

    # Hold out 200 training queries as a validation split; the two variants
    # differ only in whether the held-out IDs are excluded or included.
    validation_qids = Lazy(lambda: VALIDATION_QIDS)
    for subset_name, mode in (('train/split200-train', 'exclude'),
                              ('train/split200-valid', 'include')):
        subsets[subset_name] = Dataset(
            FilteredQueries(subsets['train'].queries_handler(),
                            validation_qids,
                            mode=mode),
            FilteredQrels(subsets['train'].qrels_handler(),
                          validation_qids,
                          mode=mode), subsets['train'])

    # Separate test set removing the "offensive (and noisy)" questions
    disallow_list = dlc['disallow_list']

    def disallow_qids():
        with disallow_list.stream() as stream:
            reader = io.TextIOWrapper(stream)
            return {line.rstrip() for line in reader}

    disallow_qids = Lazy(disallow_qids)
    subsets['test/non-offensive'] = Dataset(
        FilteredQueries(subsets['test'].queries_handler(),
                        disallow_qids,
                        mode='exclude'),
        FilteredQrels(subsets['test'].qrels_handler(),
                      disallow_qids,
                      mode='exclude'), subsets['test'])

    ir_datasets.registry.register(NAME, Dataset(collection,
                                                documentation('_')))

    for s in sorted(subsets):
        ir_datasets.registry.register(f'{NAME}/{s}',
                                      Dataset(subsets[s], documentation(s)))

    return collection, subsets
Ejemplo n.º 6
0
def _init():
    """Build and register the MS MARCO QnA dataset and its subsets.

    Returns the collection and the dict of subsets; each is registered
    under NAME and NAME/<subset>.
    """
    base_path = ir_datasets.util.home_path()/NAME
    dlc = DownloadConfig.context(NAME, base_path, dua=DUA)
    documentation = YamlDocumentation(f'docs/{NAME}.yaml')
    manager = MsMarcoQnAManager(GzipExtract(dlc['train']), GzipExtract(dlc['dev']), GzipExtract(dlc['eval']), base_path)
    # One-time migration for users upgrading cached data from earlier
    # ir_datasets versions (doc_ids were corrected).
    migrator = Migrator(base_path/'irds_version.txt', 'v2',
        affected_files=[
            base_path/'docs.pklz4',
            base_path/'train.run', base_path/'train.qrels',
            base_path/'dev.run', base_path/'dev.qrels',
            base_path/'eval.run',
        ],
        message='Migrating msmarco-qna (correcting doc_ids)')

    collection = migrator(DocstoreBackedDocs(manager.docs_store, docs_cls=MsMarcoQnADoc, namespace=NAME, lang='en'))

    subsets = {}

    # train and dev share the same layout (queries + qrels + run).
    for split in ('train', 'dev'):
        subsets[split] = Dataset(
            collection,
            TsvQueries(manager.file_ref(f'{split}.queries.tsv'), query_cls=MsMarcoQnAQuery, namespace='msmarco', lang='en'),
            migrator(TrecQrels(manager.file_ref(f'{split}.qrels'), QRELS_DEFS)),
            migrator(TrecScoredDocs(manager.file_ref(f'{split}.run'))),
        )

    # eval has no qrels and uses a different query class.
    subsets['eval'] = Dataset(
        collection,
        TsvQueries(manager.file_ref('eval.queries.tsv'), query_cls=MsMarcoQnAEvalQuery, namespace='msmarco', lang='en'),
        migrator(TrecScoredDocs(manager.file_ref('eval.run'))),
    )

    ir_datasets.registry.register(NAME, Dataset(collection, documentation('_')))
    for subset_name in sorted(subsets):
        ir_datasets.registry.register(f'{NAME}/{subset_name}', Dataset(subsets[subset_name], documentation(subset_name)))

    return collection, subsets
Ejemplo n.º 7
0
    def test_too_few_columns(self):
        data_type = namedtuple('data_type', ['doc_id', 'field1', 'field2'])
        mock_file = StringFile('''
123\tsome field\tanother field
123\ttoo few fields

456\tanother query\tsomething
'''.strip())

        queries = TsvQueries(mock_file, data_type)
        with self.assertRaises(RuntimeError):
            list(queries.queries_iter())

        docs = TsvDocs(mock_file, data_type)
        with self.assertRaises(RuntimeError):
            list(docs.docs_iter())

        docpairs = TsvDocPairs(mock_file, data_type)
        with self.assertRaises(RuntimeError):
            list(docpairs.docpairs_iter())
Ejemplo n.º 8
0
def _init():
    """Build and register the LoTTE datasets.

    One corpus per (domain, split), with 'search' and 'forum' question sets
    layered on top. Returns the base dataset and the dict of subsets.
    """
    base_path = ir_datasets.util.home_path() / NAME
    dlc = ir_datasets.util.DownloadConfig.context(NAME, base_path)
    documentation = YamlDocumentation(f'docs/{NAME}.yaml')

    base_dlc = TarExtractAll(dlc['source'], base_path / 'lotte_extracted')

    base = Dataset(documentation('_'))

    subsets = {}

    domains = ('lifestyle', 'recreation', 'science', 'technology', 'writing',
               'pooled')

    for domain in domains:
        for split in ('dev', 'test'):
            # All files for this domain/split live under one tar prefix.
            prefix = f'lotte/{domain}/{split}'
            corpus = TsvDocs(RelativePath(base_dlc, f'{prefix}/collection.tsv'),
                             lang='en')
            subsets[f'{domain}/{split}'] = Dataset(
                corpus, documentation(f'{domain}/{split}'))
            for qtype in ('search', 'forum'):
                subsets[f'{domain}/{split}/{qtype}'] = Dataset(
                    corpus,
                    TsvQueries(
                        RelativePath(base_dlc,
                                     f'{prefix}/questions.{qtype}.tsv'),
                        lang='en'),
                    LotteQrels(
                        RelativePath(base_dlc,
                                     f'{prefix}/qas.{qtype}.jsonl')),
                    documentation(f'{domain}/{split}/{qtype}'))

    ir_datasets.registry.register(NAME, base)
    for s in sorted(subsets):
        ir_datasets.registry.register(f'{NAME}/{s}', subsets[s])

    return base, subsets
Ejemplo n.º 9
0
def _init():
    """Build and register the msmarco-passage dataset family.

    Builds the passage collection plus all derived subsets (train/dev/eval,
    TREC DL 2019/2020, judged/split200/medical variants, and DL-Hard) and
    registers each under NAME and NAME/<subset>.

    Returns:
        (collection, subsets): the docs handler and the dict of subsets.
    """
    documentation = YamlDocumentation(f'docs/{NAME}.yaml')
    base_path = ir_datasets.util.home_path() / NAME
    dlc = DownloadConfig.context(NAME, base_path, dua=DUA)
    # One-time migration for users upgrading cached data from earlier
    # ir_datasets versions (passage encoding was fixed in v2).
    migrator = Migrator(base_path / 'irds_version.txt',
                        'v2',
                        affected_files=[
                            base_path / 'collection.tsv',
                            base_path / 'collection.tsv.pklz4'
                        ],
                        message=f'Migrating {NAME} (fixing passage encoding)')

    collection = TsvDocs(Cache(
        FixEncoding(TarExtract(dlc['collectionandqueries'], 'collection.tsv')),
        base_path / 'collection.tsv'),
                         namespace='msmarco',
                         lang='en',
                         docstore_size_hint=14373971970,
                         count_hint=ir_datasets.util.count_hint(NAME))
    collection = migrator(collection)
    subsets = {}

    subsets['train'] = Dataset(
        collection,
        TsvQueries(Cache(TarExtract(dlc['queries'], 'queries.train.tsv'),
                         base_path / 'train/queries.tsv'),
                   namespace='msmarco',
                   lang='en'),
        TrecQrels(dlc['train/qrels'], QRELS_DEFS),
        TsvDocPairs(GzipExtract(dlc['train/docpairs'])),
        TrecScoredDocs(
            Cache(
                ExtractQidPid(
                    TarExtract(dlc['train/scoreddocs'], 'top1000.train.txt')),
                base_path / 'train/ms.run')),
    )

    # Alternate training triples; share the other train handlers.
    subsets['train/triples-v2'] = Dataset(
        collection,
        subsets['train'].queries_handler(),
        subsets['train'].qrels_handler(),
        TsvDocPairs(GzipExtract(dlc['train/docpairs/v2'])),
        subsets['train'].scoreddocs_handler(),
    )

    # "Small" triples ship with raw text; map them back to qid/pid here.
    subsets['train/triples-small'] = Dataset(
        collection,
        subsets['train'].queries_handler(),
        subsets['train'].qrels_handler(),
        TsvDocPairs(
            Cache(
                MapSmallTriplesQidPid(
                    TarExtract(dlc['train/docpairs/small'],
                               'triples.train.small.tsv'),
                    TarExtract(dlc['collectionandqueries'], 'collection.tsv'),
                    subsets['train'].queries_handler()),
                base_path / 'train/small.triples.qidpid.tsv')),
        subsets['train'].scoreddocs_handler(),
    )

    subsets['dev'] = Dataset(
        collection,
        TsvQueries(Cache(TarExtract(dlc['queries'], 'queries.dev.tsv'),
                         base_path / 'dev/queries.tsv'),
                   namespace='msmarco',
                   lang='en'),
        TrecQrels(dlc['dev/qrels'], QRELS_DEFS),
    )

    subsets['dev/small'] = Dataset(
        collection,
        TsvQueries(Cache(
            TarExtract(dlc['collectionandqueries'], 'queries.dev.small.tsv'),
            base_path / 'dev/small/queries.tsv'),
                   namespace='msmarco',
                   lang='en'),
        TrecQrels(
            Cache(
                TarExtract(dlc['collectionandqueries'], 'qrels.dev.small.tsv'),
                base_path / 'dev/small/qrels'), QRELS_DEFS),
        TrecScoredDocs(
            Cache(
                ExtractQidPid(TarExtract(dlc['dev/scoreddocs'],
                                         'top1000.dev')),
                base_path / 'dev/ms.run')),
    )

    subsets['eval'] = Dataset(
        collection,
        TsvQueries(Cache(TarExtract(dlc['queries'], 'queries.eval.tsv'),
                         base_path / 'eval/queries.tsv'),
                   namespace='msmarco',
                   lang='en'),
    )

    subsets['eval/small'] = Dataset(
        collection,
        TsvQueries(Cache(
            TarExtract(dlc['collectionandqueries'], 'queries.eval.small.tsv'),
            base_path / 'eval/small/queries.tsv'),
                   namespace='msmarco',
                   lang='en'),
        TrecScoredDocs(
            Cache(
                ExtractQidPid(
                    TarExtract(dlc['eval/scoreddocs'], 'top1000.eval')),
                base_path / 'eval/ms.run')),
    )

    subsets['trec-dl-2019'] = Dataset(
        collection,
        TrecQrels(dlc['trec-dl-2019/qrels'], TREC_DL_QRELS_DEFS),
        TsvQueries(Cache(GzipExtract(dlc['trec-dl-2019/queries']),
                         base_path / 'trec-dl-2019/queries.tsv'),
                   namespace='msmarco',
                   lang='en'),
        TrecScoredDocs(
            Cache(ExtractQidPid(GzipExtract(dlc['trec-dl-2019/scoreddocs'])),
                  base_path / 'trec-dl-2019/ms.run')),
    )

    subsets['trec-dl-2020'] = Dataset(
        collection,
        TsvQueries(GzipExtract(dlc['trec-dl-2020/queries']),
                   namespace='msmarco',
                   lang='en'),
        TrecQrels(dlc['trec-dl-2020/qrels'], TREC_DL_QRELS_DEFS),
        TrecScoredDocs(
            Cache(ExtractQidPid(GzipExtract(dlc['trec-dl-2020/scoreddocs'])),
                  base_path / 'trec-dl-2020/ms.run')),
    )

    # A few subsets that are constrained to just the queries/qrels/docpairs
    # that have at least 1 relevance assessment
    train_judged = Lazy(
        lambda: {q.query_id
                 for q in subsets['train'].qrels_iter()})
    subsets['train/judged'] = Dataset(
        FilteredQueries(subsets['train'].queries_handler(), train_judged),
        FilteredScoredDocs(subsets['train'].scoreddocs_handler(),
                           train_judged),
        subsets['train'],
    )

    dev_judged = Lazy(
        lambda: {q.query_id
                 for q in subsets['dev'].qrels_iter()})
    subsets['dev/judged'] = Dataset(
        FilteredQueries(subsets['dev'].queries_handler(), dev_judged),
        subsets['dev'],
    )

    dl19_judged = Lazy(
        lambda: {q.query_id
                 for q in subsets['trec-dl-2019'].qrels_iter()})
    subsets['trec-dl-2019/judged'] = Dataset(
        FilteredQueries(subsets['trec-dl-2019'].queries_handler(),
                        dl19_judged),
        FilteredScoredDocs(subsets['trec-dl-2019'].scoreddocs_handler(),
                           dl19_judged),
        subsets['trec-dl-2019'],
    )

    dl20_judged = Lazy(
        lambda: {q.query_id
                 for q in subsets['trec-dl-2020'].qrels_iter()})
    subsets['trec-dl-2020/judged'] = Dataset(
        FilteredQueries(subsets['trec-dl-2020'].queries_handler(),
                        dl20_judged),
        FilteredScoredDocs(subsets['trec-dl-2020'].scoreddocs_handler(),
                           dl20_judged),
        subsets['trec-dl-2020'],
    )

    # split200 -- 200 queries held out from the training data for validation
    split200 = Lazy(lambda: SPLIT200_QIDS)
    subsets['train/split200-train'] = Dataset(
        FilteredQueries(subsets['train'].queries_handler(),
                        split200,
                        mode='exclude'),
        FilteredScoredDocs(subsets['train'].scoreddocs_handler(),
                           split200,
                           mode='exclude'),
        FilteredQrels(subsets['train'].qrels_handler(),
                      split200,
                      mode='exclude'),
        FilteredDocPairs(subsets['train'].docpairs_handler(),
                         split200,
                         mode='exclude'),
        subsets['train'],
    )
    subsets['train/split200-valid'] = Dataset(
        FilteredQueries(subsets['train'].queries_handler(),
                        split200,
                        mode='include'),
        FilteredScoredDocs(subsets['train'].scoreddocs_handler(),
                           split200,
                           mode='include'),
        FilteredQrels(subsets['train'].qrels_handler(),
                      split200,
                      mode='include'),
        FilteredDocPairs(subsets['train'].docpairs_handler(),
                         split200,
                         mode='include'),
        subsets['train'],
    )

    # Medical subset
    def train_med():
        with dlc['medmarco_ids'].stream() as stream:
            stream = codecs.getreader('utf8')(stream)
            return {l.rstrip() for l in stream}

    train_med = Lazy(train_med)
    subsets['train/medical'] = Dataset(
        FilteredQueries(subsets['train'].queries_handler(), train_med),
        FilteredScoredDocs(subsets['train'].scoreddocs_handler(), train_med),
        FilteredDocPairs(subsets['train'].docpairs_handler(), train_med),
        FilteredQrels(subsets['train'].qrels_handler(), train_med),
        subsets['train'],
    )

    # DL-Hard
    dl_hard_qrels_migrator = Migrator(
        base_path / 'trec-dl-hard' / 'irds_version.txt',
        'v3',
        affected_files=[base_path / 'trec-dl-hard' / 'qrels'],
        message='Updating trec-dl-hard qrels')
    hard_qids = Lazy(lambda: DL_HARD_QIDS)
    dl_hard_base_queries = TsvQueries([
        Cache(GzipExtract(dlc['trec-dl-2019/queries']),
              base_path / 'trec-dl-2019/queries.tsv'),
        Cache(GzipExtract(dlc['trec-dl-2020/queries']),
              base_path / 'trec-dl-2020/queries.tsv')
    ],
                                      namespace='msmarco',
                                      lang='en')
    subsets['trec-dl-hard'] = Dataset(
        collection, FilteredQueries(dl_hard_base_queries, hard_qids),
        dl_hard_qrels_migrator(
            TrecQrels(dlc['trec-dl-hard/qrels'], TREC_DL_QRELS_DEFS)),
        documentation('trec-dl-hard'))
    # The five DL-Hard folds differ only in their query-ID subset; bind the
    # fold as a default argument so each Lazy captures its own fold (avoids
    # Python's late-binding-closure pitfall).
    for fold in ('1', '2', '3', '4', '5'):
        fold_qids = Lazy(lambda fold=fold: DL_HARD_QIDS_BYFOLD[fold])
        subsets[f'trec-dl-hard/fold{fold}'] = Dataset(
            collection, FilteredQueries(dl_hard_base_queries, fold_qids),
            # NOTE(review): passes the Dataset itself, not .qrels_handler();
            # presumably FilteredQrels only needs qrels_iter() -- confirm.
            FilteredQrels(subsets['trec-dl-hard'], fold_qids),
            documentation(f'trec-dl-hard/fold{fold}'))

    ir_datasets.registry.register(NAME, Dataset(collection,
                                                documentation('_')))
    for s in sorted(subsets):
        ir_datasets.registry.register(f'{NAME}/{s}',
                                      Dataset(subsets[s], documentation(s)))

    return collection, subsets
Ejemplo n.º 10
0
def _init():
    """Build and register the AOL-IA dataset.

    Returns:
        (base, subsets, manager, base_path): the base dataset, the (empty)
        subsets dict, the AolManager, and the dataset's home path.
    """
    subsets = {}
    base_path = ir_datasets.util.home_path() / NAME
    dlc = DownloadConfig.context(NAME, base_path)
    documentation = YamlDocumentation(f'docs/{NAME}.yaml')

    # The query log ships as ten gzip'd shards inside one tarball; build the
    # per-shard extraction chain programmatically instead of repeating the
    # same GzipExtract(TarExtract(...)) boilerplate ten times.
    log_shards = [
        GzipExtract(
            TarExtract(
                dlc['logs'],
                f'AOL-user-ct-collection/user-ct-test-collection-{i:02d}.txt.gz'))
        for i in range(1, 11)
    ]
    manager = AolManager(log_shards, GzipExtract(dlc['id2wb']), base_path)

    base = Dataset(
        DocstoreBackedDocs(manager.docs_store,
                           docs_cls=AolIaDoc,
                           namespace=NAME,
                           lang=None),  # lang=None: content not language-tagged
        TsvQueries(manager.file_ref('queries.tsv'), lang=None),
        TrecQrels(manager.file_ref('qrels'), QREL_DEFS),
        AolQlogs(manager.file_ref('log.pkl.lz4')), documentation('_'))

    ir_datasets.registry.register(NAME, base)
    for s in sorted(subsets):
        ir_datasets.registry.register(f'{NAME}/{s}', subsets[s])

    return base, subsets, manager, base_path
Ejemplo n.º 11
0
# What do the relevance levels in qrels mean?
QREL_DEFS = {
    1: 'relevant',
    0: 'not relevant',
}

# Specify where to find the content. Here it's just fetched from the
# repository over HTTPS, but it could be hosted anywhere.
DL_DOCS = ir_datasets.util.RequestsDownload(
    'https://raw.githubusercontent.com/seanmacavaney/dummy-irds-ext/master/data/docs.tsv'
)
DL_QUERIES = ir_datasets.util.RequestsDownload(
    'https://raw.githubusercontent.com/seanmacavaney/dummy-irds-ext/master/data/queries.tsv'
)
DL_QRELS = ir_datasets.util.RequestsDownload(
    'https://raw.githubusercontent.com/seanmacavaney/dummy-irds-ext/master/data/qrels'
)

# Local directory where the downloaded content is cached.
base_path = ir_datasets.util.home_path() / NAME

# Dataset definition: it provides docs, queries, and qrels. Each download
# is wrapped in a Cache so it is only fetched once.
dataset = ir_datasets.Dataset(
    TsvDocs(ir_datasets.util.Cache(DL_DOCS, base_path / 'docs.tsv')),
    TsvQueries(ir_datasets.util.Cache(DL_QUERIES, base_path / 'queries.tsv')),
    TrecQrels(ir_datasets.util.Cache(DL_QRELS, base_path / 'qrels'),
              QREL_DEFS),
)

# Register the dataset with ir_datasets so it is available by NAME.
ir_datasets.registry.register(NAME, dataset)
Ejemplo n.º 12
0
def _init():
    """Build and register the msmarco-document dataset family.

    Builds the doc collection plus derived subsets (train/dev/eval, TREC DL
    2019/2020 and their judged variants, ORCAS, DL-Hard, anchor-text) and
    registers each under NAME and NAME/<subset>.

    Returns:
        (collection, subsets): the docs handler and the dict of subsets.
    """
    base_path = ir_datasets.util.home_path()/NAME
    documentation = YamlDocumentation(f'docs/{NAME}.yaml')
    dlc = DownloadConfig.context(NAME, base_path, dua=DUA)
    subsets = {}
    collection = MsMarcoTrecDocs(GzipExtract(dlc['docs']))

    subsets['train'] = Dataset(
        collection,
        TsvQueries(GzipExtract(dlc['train/queries']), namespace='msmarco', lang='en'),
        TrecQrels(GzipExtract(dlc['train/qrels']), QRELS_DEFS),
        TrecScoredDocs(GzipExtract(dlc['train/scoreddocs'])),
    )

    subsets['dev'] = Dataset(
        collection,
        TsvQueries(GzipExtract(dlc['dev/queries']), namespace='msmarco', lang='en'),
        TrecQrels(GzipExtract(dlc['dev/qrels']), QRELS_DEFS),
        TrecScoredDocs(GzipExtract(dlc['dev/scoreddocs'])),
    )

    # eval has held-out judgments, so no qrels here.
    subsets['eval'] = Dataset(
        collection,
        TsvQueries(GzipExtract(dlc['eval/queries']), namespace='msmarco', lang='en'),
        TrecScoredDocs(GzipExtract(dlc['eval/scoreddocs'])),
    )

    subsets['trec-dl-2019'] = Dataset(
        collection,
        TsvQueries(GzipExtract(dlc['trec-dl-2019/queries']), namespace='msmarco', lang='en'),
        TrecQrels(dlc['trec-dl-2019/qrels'], TREC_DL_QRELS_DEFS),
        TrecScoredDocs(GzipExtract(dlc['trec-dl-2019/scoreddocs'])),
    )

    subsets['trec-dl-2020'] = Dataset(
        collection,
        TsvQueries(GzipExtract(dlc['trec-dl-2020/queries']), namespace='msmarco', lang='en'),
        TrecQrels(dlc['trec-dl-2020/qrels'], TREC_DL_QRELS_DEFS),
        TrecScoredDocs(GzipExtract(dlc['trec-dl-2020/scoreddocs'])),
    )

    subsets['orcas'] = Dataset(
        collection,
        TsvQueries(GzipExtract(dlc['orcas/queries']), namespace='orcas', lang='en'),
        TrecQrels(GzipExtract(dlc['orcas/qrels']), ORCAS_QLRES_DEFS),
        TrecScoredDocs(GzipExtract(dlc['orcas/scoreddocs'])),
    )

    # "judged" variants: restrict to queries with at least one qrel.
    dl19_judged = Lazy(lambda: {q.query_id for q in subsets['trec-dl-2019'].qrels_iter()})
    subsets['trec-dl-2019/judged'] = Dataset(
        FilteredQueries(subsets['trec-dl-2019'].queries_handler(), dl19_judged),
        FilteredScoredDocs(subsets['trec-dl-2019'].scoreddocs_handler(), dl19_judged),
        subsets['trec-dl-2019'],
    )

    dl20_judged = Lazy(lambda: {q.query_id for q in subsets['trec-dl-2020'].qrels_iter()})
    subsets['trec-dl-2020/judged'] = Dataset(
        FilteredQueries(subsets['trec-dl-2020'].queries_handler(), dl20_judged),
        FilteredScoredDocs(subsets['trec-dl-2020'].scoreddocs_handler(), dl20_judged),
        subsets['trec-dl-2020'],
    )

    # DL-Hard
    dl_hard_qrels_migrator = Migrator(base_path/'trec-dl-hard'/'irds_version.txt', 'v2',
        affected_files=[base_path/'trec-dl-hard'/'qrels'],
        message='Updating trec-dl-hard qrels')
    hard_qids = Lazy(lambda: DL_HARD_QIDS)
    dl_hard_base_queries = TsvQueries([
            Cache(GzipExtract(dlc['trec-dl-2019/queries']), base_path/'trec-dl-2019/queries.tsv'),
            Cache(GzipExtract(dlc['trec-dl-2020/queries']), base_path/'trec-dl-2020/queries.tsv')], namespace='msmarco', lang='en')
    subsets['trec-dl-hard'] = Dataset(
        collection,
        FilteredQueries(dl_hard_base_queries, hard_qids),
        dl_hard_qrels_migrator(TrecQrels(dlc['trec-dl-hard/qrels'], TREC_DL_QRELS_DEFS)),
        documentation('trec-dl-hard')
    )
    # The five DL-Hard folds differ only in their query-ID subset; bind the
    # fold as a default argument so each Lazy captures its own fold (avoids
    # Python's late-binding-closure pitfall).
    for fold in ('1', '2', '3', '4', '5'):
        fold_qids = Lazy(lambda fold=fold: DL_HARD_QIDS_BYFOLD[fold])
        subsets[f'trec-dl-hard/fold{fold}'] = Dataset(
            collection,
            FilteredQueries(dl_hard_base_queries, fold_qids),
            # NOTE(review): passes the Dataset itself, not .qrels_handler();
            # presumably FilteredQrels only needs qrels_iter() -- confirm.
            FilteredQrels(subsets['trec-dl-hard'], fold_qids),
            documentation(f'trec-dl-hard/fold{fold}')
        )

    subsets['anchor-text'] = Dataset(
        MsMarcoAnchorTextDocs(
            Cache(GzipExtract(dlc['anchor-text']), base_path / "anchor-text.json"),
            count_hint=1703834
        ),
        documentation('anchor-text')
    )

    ir_datasets.registry.register(NAME, Dataset(collection, documentation("_")))
    for s in sorted(subsets):
        ir_datasets.registry.register(f'{NAME}/{s}', Dataset(subsets[s], documentation(s)))

    return collection, subsets
Ejemplo n.º 13
0
def _init():
    """Build the msmarco-passage collection and subsets and register them.

    Registration happens as a side effect via ``ir_datasets.registry``;
    the ``(collection, subsets)`` pair is also returned for the caller.
    """
    documentation = YamlDocumentation('docs/msmarco-passage.yaml')
    base_path = ir_datasets.util.home_path() / 'msmarco-passage'
    dlc = DownloadConfig.context('msmarco-passage', base_path, dua=DUA)
    # FixEncoding repairs the text encoding of collection.tsv before it is
    # cached locally under base_path.
    collection = TsvDocs(Cache(
        FixEncoding(TarExtract(dlc['collectionandqueries'], 'collection.tsv')),
        base_path / 'collection.tsv'),
                         namespace='msmarco')
    subsets = {}

    # Official train split: queries, qrels, doc pairs, and the top-1000 run
    # (ExtractQidPid — presumably keeps only the qid/pid columns; confirm).
    subsets['train'] = Dataset(
        collection,
        TsvQueries(Cache(TarExtract(dlc['queries'], 'queries.train.tsv'),
                         base_path / 'train/queries.tsv'),
                   namespace='msmarco'),
        TrecQrels(dlc['train/qrels'], QRELS_DEFS),
        TsvDocPairs(GzipExtract(dlc['train/docpairs'])),
        TrecScoredDocs(
            Cache(
                ExtractQidPid(
                    TarExtract(dlc['train/scoreddocs'], 'top1000.train.txt')),
                base_path / 'train/ms.run')),
    )

    # Official dev split (full); no doc pairs are distributed for dev.
    subsets['dev'] = Dataset(
        collection,
        TsvQueries(Cache(TarExtract(dlc['queries'], 'queries.dev.tsv'),
                         base_path / 'dev/queries.tsv'),
                   namespace='msmarco'),
        TrecQrels(dlc['dev/qrels'], QRELS_DEFS),
        TrecScoredDocs(
            Cache(
                ExtractQidPid(TarExtract(dlc['dev/scoreddocs'],
                                         'top1000.dev')),
                base_path / 'dev/ms.run')),
    )

    # "Small" dev subset, as distributed in the collectionandqueries bundle.
    subsets['dev/small'] = Dataset(
        collection,
        TsvQueries(Cache(
            TarExtract(dlc['collectionandqueries'], 'queries.dev.small.tsv'),
            base_path / 'dev/small/queries.tsv'),
                   namespace='msmarco'),
        TrecQrels(
            Cache(
                TarExtract(dlc['collectionandqueries'], 'qrels.dev.small.tsv'),
                base_path / 'dev/small/qrels'), QRELS_DEFS),
    )

    # Eval split: queries and runs only (qrels are held out).
    subsets['eval'] = Dataset(
        collection,
        TsvQueries(Cache(TarExtract(dlc['queries'], 'queries.eval.tsv'),
                         base_path / 'eval/queries.tsv'),
                   namespace='msmarco'),
        TrecScoredDocs(
            Cache(
                ExtractQidPid(
                    TarExtract(dlc['eval/scoreddocs'], 'top1000.eval')),
                base_path / 'eval/ms.run')),
    )

    subsets['eval/small'] = Dataset(
        collection,
        TsvQueries(Cache(
            TarExtract(dlc['collectionandqueries'], 'queries.eval.small.tsv'),
            base_path / 'eval/small/queries.tsv'),
                   namespace='msmarco'),
    )

    # TREC Deep Learning 2019: graded qrels plus the official run file.
    subsets['trec-dl-2019'] = Dataset(
        collection,
        TrecQrels(dlc['trec-dl-2019/qrels'], TREC_DL_QRELS_DEFS),
        TsvQueries(Cache(GzipExtract(dlc['trec-dl-2019/queries']),
                         base_path / 'trec-dl-2019/queries.tsv'),
                   namespace='msmarco'),
        TrecScoredDocs(
            Cache(ExtractQidPid(GzipExtract(dlc['trec-dl-2019/scoreddocs'])),
                  base_path / 'trec-dl-2019/ms.run')),
    )

    # TREC Deep Learning 2020: no qrels are attached here.
    subsets['trec-dl-2020'] = Dataset(
        collection,
        TsvQueries(GzipExtract(dlc['trec-dl-2020/queries']),
                   namespace='msmarco'),
        TrecScoredDocs(
            Cache(ExtractQidPid(GzipExtract(dlc['trec-dl-2020/scoreddocs'])),
                  base_path / 'trec-dl-2020/ms.run')),
    )

    # A few subsets that are constrained to just the queries/qrels/docpairs that have at least
    # 1 relevance assessment
    train_judged = Lazy(
        lambda: {q.query_id
                 for q in subsets['train'].qrels_iter()})
    subsets['train/judged'] = Dataset(
        FilteredQueries(subsets['train'].queries_handler(), train_judged),
        FilteredScoredDocs(subsets['train'].scoreddocs_handler(),
                           train_judged),
        subsets['train'],
    )

    dev_judged = Lazy(
        lambda: {q.query_id
                 for q in subsets['dev'].qrels_iter()})
    subsets['dev/judged'] = Dataset(
        FilteredQueries(subsets['dev'].queries_handler(), dev_judged),
        FilteredScoredDocs(subsets['dev'].scoreddocs_handler(), dev_judged),
        subsets['dev'],
    )

    dl19_judged = Lazy(
        lambda: {q.query_id
                 for q in subsets['trec-dl-2019'].qrels_iter()})
    subsets['trec-dl-2019/judged'] = Dataset(
        FilteredQueries(subsets['trec-dl-2019'].queries_handler(),
                        dl19_judged),
        FilteredScoredDocs(subsets['trec-dl-2019'].scoreddocs_handler(),
                           dl19_judged),
        subsets['trec-dl-2019'],
    )

    # split200 -- 200 queries held out from the training data for validation
    split200 = Lazy(lambda: SPLIT200_QIDS)
    subsets['train/split200-train'] = Dataset(
        FilteredQueries(subsets['train'].queries_handler(),
                        split200,
                        mode='exclude'),
        FilteredScoredDocs(subsets['train'].scoreddocs_handler(),
                           split200,
                           mode='exclude'),
        FilteredQrels(subsets['train'].qrels_handler(),
                      split200,
                      mode='exclude'),
        FilteredDocPairs(subsets['train'].docpairs_handler(),
                         split200,
                         mode='exclude'),
        subsets['train'],
    )
    subsets['train/split200-valid'] = Dataset(
        FilteredQueries(subsets['train'].queries_handler(),
                        split200,
                        mode='include'),
        FilteredScoredDocs(subsets['train'].scoreddocs_handler(),
                           split200,
                           mode='include'),
        FilteredQrels(subsets['train'].qrels_handler(),
                      split200,
                      mode='include'),
        FilteredDocPairs(subsets['train'].docpairs_handler(),
                         split200,
                         mode='include'),
        subsets['train'],
    )

    # Medical subset
    def train_med():
        # Read the medmarco id file into a set of query ids (one id per line).
        with dlc['medmarco_ids'].stream() as stream:
            stream = codecs.getreader('utf8')(stream)
            return {l.rstrip() for l in stream}

    # Rebind the function name to a Lazy wrapper so the file is only read on
    # first use of the filter.
    train_med = Lazy(train_med)
    subsets['train/medical'] = Dataset(
        FilteredQueries(subsets['train'].queries_handler(), train_med),
        FilteredScoredDocs(subsets['train'].scoreddocs_handler(), train_med),
        FilteredDocPairs(subsets['train'].docpairs_handler(), train_med),
        FilteredQrels(subsets['train'].qrels_handler(), train_med),
        subsets['train'],
    )

    ir_datasets.registry.register('msmarco-passage',
                                  Dataset(collection, documentation('_')))
    for s in sorted(subsets):
        ir_datasets.registry.register(f'msmarco-passage/{s}',
                                      Dataset(subsets[s], documentation(s)))

    return collection, subsets
# Ejemplo n.º 14 (example separator; stray "0" artifact folded into this comment)
def _init():
    """Build and register the NFCorpus datasets: train/dev/test splits plus
    their nontopic and video query variants."""
    base_path = ir_datasets.util.home_path() / NAME
    dlc = DownloadConfig.context(NAME, base_path)
    documentation = YamlDocumentation(f'docs/{NAME}.yaml')
    main_dlc = dlc['main']

    # All files below come out of the single 'main' tarball.
    collection = TsvDocs(Cache(
        TarExtract(main_dlc, 'nfcorpus/raw/doc_dump.txt'),
        base_path / 'collection.tsv'),
                         doc_cls=NfCorpusDoc,
                         namespace=NAME)
    subsets = {}

    def read_lines(file):
        # Extract a line-per-id file from the tarball (cached locally) and
        # return the set of ids it contains.
        file = Cache(TarExtract(main_dlc, f'nfcorpus/raw/{file}'),
                     base_path / file)
        with file.stream() as stream:
            stream = codecs.getreader('utf8')(stream)
            return {l.rstrip() for l in stream}

    # Query-id filters used to carve the nontopic/video variants out of the
    # full qrels; Lazy defers the extraction until first use.
    nontopic_qid_filter = Lazy(lambda: read_lines('nontopics.ids'))
    video_qid_filter = Lazy(lambda: read_lines('all_videos.ids'))

    # ZipQueries merges two query files into one record type; the
    # [(0, 0), (0, 1), (1, 1)] spec presumably maps (source, column) pairs
    # onto the output fields -- confirm against ZipQueries.
    subsets['train'] = Dataset(
        collection,
        ZipQueries([
            TsvQueries(Cache(
                TarExtract(main_dlc, 'nfcorpus/train.titles.queries'),
                base_path / 'train/queries.titles.tsv'),
                       namespace=NAME),
            TsvQueries(Cache(
                TarExtract(main_dlc, 'nfcorpus/train.all.queries'),
                base_path / 'train/queries.all.tsv'),
                       namespace=NAME),
        ], [(0, 0), (0, 1), (1, 1)], NfCorpusQuery),
        TrecQrels(
            Cache(TarExtract(main_dlc, 'nfcorpus/train.3-2-1.qrel'),
                  base_path / 'train/qrels'), QRELS_DEFS),
        documentation('train'),
    )

    # Nontopic variant: its own query file, qrels filtered to nontopic qids.
    subsets['train/nontopic'] = Dataset(
        collection,
        TsvQueries(Cache(
            TarExtract(main_dlc, 'nfcorpus/train.nontopic-titles.queries'),
            base_path / 'train/nontopic/queries.tsv'),
                   namespace=NAME),
        FilteredQrels(subsets['train'].qrels_handler(),
                      nontopic_qid_filter,
                      mode='include'),
        documentation('train/nontopic'),
    )

    subsets['train/video'] = Dataset(
        collection,
        ZipQueries([
            TsvQueries(Cache(
                TarExtract(main_dlc, 'nfcorpus/train.vid-titles.queries'),
                base_path / 'train/video/queries.titles.tsv'),
                       namespace=NAME),
            TsvQueries(Cache(
                TarExtract(main_dlc, 'nfcorpus/train.vid-desc.queries'),
                base_path / 'train/video/queries.desc.tsv'),
                       namespace=NAME),
        ], [(0, 0), (0, 1), (1, 1)], NfCorpusVideoQuery),
        # NOTE(review): this second queries source reads the *nontopic*
        # titles file into the video cache path, giving the Dataset two
        # queries handlers. Looks like a leftover -- confirm which handler
        # Dataset resolves and whether this entry should be removed.
        TsvQueries(Cache(
            TarExtract(main_dlc, 'nfcorpus/train.nontopic-titles.queries'),
            base_path / 'train/video/queries.tsv'),
                   NfCorpusVideoQuery,
                   namespace=NAME),
        FilteredQrels(subsets['train'].qrels_handler(),
                      video_qid_filter,
                      mode='include'),
        documentation('train/video'),
    )

    subsets['dev'] = Dataset(
        collection,
        ZipQueries([
            TsvQueries(Cache(
                TarExtract(main_dlc, 'nfcorpus/dev.titles.queries'),
                base_path / 'dev/queries.titles.tsv'),
                       namespace=NAME),
            TsvQueries(Cache(TarExtract(main_dlc, 'nfcorpus/dev.all.queries'),
                             base_path / 'dev/queries.all.tsv'),
                       namespace=NAME),
        ], [(0, 0), (0, 1), (1, 1)], NfCorpusQuery),
        TrecQrels(
            Cache(TarExtract(main_dlc, 'nfcorpus/dev.3-2-1.qrel'),
                  base_path / 'dev/qrels'), QRELS_DEFS),
        documentation('dev'),
    )

    subsets['dev/nontopic'] = Dataset(
        collection,
        TsvQueries(Cache(
            TarExtract(main_dlc, 'nfcorpus/dev.nontopic-titles.queries'),
            base_path / 'dev/nontopic/queries.tsv'),
                   namespace=NAME),
        FilteredQrels(subsets['dev'].qrels_handler(),
                      nontopic_qid_filter,
                      mode='include'),
        documentation('dev/nontopic'),
    )

    subsets['dev/video'] = Dataset(
        collection,
        ZipQueries([
            TsvQueries(Cache(
                TarExtract(main_dlc, 'nfcorpus/dev.vid-titles.queries'),
                base_path / 'dev/video/queries.titles.tsv'),
                       namespace=NAME),
            TsvQueries(Cache(
                TarExtract(main_dlc, 'nfcorpus/dev.vid-desc.queries'),
                base_path / 'dev/video/queries.desc.tsv'),
                       namespace=NAME),
        ], [(0, 0), (0, 1), (1, 1)], NfCorpusVideoQuery),
        # NOTE(review): duplicate queries handler (nontopic titles file) --
        # same concern as in train/video above; confirm.
        TsvQueries(Cache(
            TarExtract(main_dlc, 'nfcorpus/dev.nontopic-titles.queries'),
            base_path / 'dev/video/queries.tsv'),
                   NfCorpusVideoQuery,
                   namespace=NAME),
        FilteredQrels(subsets['dev'].qrels_handler(),
                      video_qid_filter,
                      mode='include'),
        documentation('dev/video'),
    )

    subsets['test'] = Dataset(
        collection,
        ZipQueries([
            TsvQueries(Cache(
                TarExtract(main_dlc, 'nfcorpus/test.titles.queries'),
                base_path / 'test/queries.titles.tsv'),
                       namespace=NAME),
            TsvQueries(Cache(TarExtract(main_dlc, 'nfcorpus/test.all.queries'),
                             base_path / 'test/queries.all.tsv'),
                       namespace=NAME),
        ], [(0, 0), (0, 1), (1, 1)], NfCorpusQuery),
        TrecQrels(
            Cache(TarExtract(main_dlc, 'nfcorpus/test.3-2-1.qrel'),
                  base_path / 'test/qrels'), QRELS_DEFS),
        documentation('test'),
    )

    subsets['test/nontopic'] = Dataset(
        collection,
        TsvQueries(Cache(
            TarExtract(main_dlc, 'nfcorpus/test.nontopic-titles.queries'),
            base_path / 'test/nontopic/queries.tsv'),
                   namespace=NAME),
        FilteredQrels(subsets['test'].qrels_handler(),
                      nontopic_qid_filter,
                      mode='include'),
        documentation('test/nontopic'),
    )

    subsets['test/video'] = Dataset(
        collection,
        ZipQueries([
            TsvQueries(Cache(
                TarExtract(main_dlc, 'nfcorpus/test.vid-titles.queries'),
                base_path / 'test/video/queries.titles.tsv'),
                       namespace=NAME),
            TsvQueries(Cache(
                TarExtract(main_dlc, 'nfcorpus/test.vid-desc.queries'),
                base_path / 'test/video/queries.desc.tsv'),
                       namespace=NAME),
        ], [(0, 0), (0, 1), (1, 1)], NfCorpusVideoQuery),
        # NOTE(review): duplicate queries handler (nontopic titles file) --
        # same concern as in train/video above; confirm.
        TsvQueries(Cache(
            TarExtract(main_dlc, 'nfcorpus/test.nontopic-titles.queries'),
            base_path / 'test/video/queries.tsv'),
                   NfCorpusVideoQuery,
                   namespace=NAME),
        FilteredQrels(subsets['test'].qrels_handler(),
                      video_qid_filter,
                      mode='include'),
        documentation('test/video'),
    )

    ir_datasets.registry.register(NAME, Dataset(collection,
                                                documentation('_')))
    for s in sorted(subsets):
        ir_datasets.registry.register(f'{NAME}/{s}', subsets[s])

    return collection, subsets
def _init():
    """Wire up the msmarco-document collection and its subsets, then
    register everything under the 'msmarco-document' prefix.

    Returns the ``(collection, subsets)`` pair.
    """
    base_path = ir_datasets.util.home_path() / 'msmarco-document'
    documentation = YamlDocumentation('docs/msmarco-document.yaml')
    dlc = DownloadConfig.context('msmarco-document', base_path, dua=DUA)
    subsets = {}
    collection = MsMarcoTrecDocs(GzipExtract(dlc['docs']))

    def queries_for(key, namespace='msmarco'):
        # Shared shape: gzip'd TSV query file under the given namespace.
        return TsvQueries(GzipExtract(dlc[key]), namespace=namespace)

    def scoreddocs_for(key):
        # Shared shape: gzip'd TREC-style run file.
        return TrecScoredDocs(GzipExtract(dlc[key]))

    subsets['train'] = Dataset(
        collection,
        queries_for('train/queries'),
        TrecQrels(GzipExtract(dlc['train/qrels']), QRELS_DEFS),
        scoreddocs_for('train/scoreddocs'),
    )

    subsets['dev'] = Dataset(
        collection,
        queries_for('dev/queries'),
        TrecQrels(GzipExtract(dlc['dev/qrels']), QRELS_DEFS),
        scoreddocs_for('dev/scoreddocs'),
    )

    # Eval split has no qrels (held out).
    subsets['eval'] = Dataset(
        collection,
        queries_for('eval/queries'),
        scoreddocs_for('eval/scoreddocs'),
    )

    # Note: the 2019 qrels file is *not* gzip-compressed.
    subsets['trec-dl-2019'] = Dataset(
        collection,
        queries_for('trec-dl-2019/queries'),
        TrecQrels(dlc['trec-dl-2019/qrels'], TREC_DL_QRELS_DEFS),
        scoreddocs_for('trec-dl-2019/scoreddocs'),
    )

    subsets['trec-dl-2020'] = Dataset(
        collection,
        queries_for('trec-dl-2020/queries'),
        scoreddocs_for('trec-dl-2020/scoreddocs'),
    )

    subsets['orcas'] = Dataset(
        collection,
        queries_for('orcas/queries', namespace='orcas'),
        TrecQrels(GzipExtract(dlc['orcas/qrels']), ORCAS_QLRES_DEFS),
        scoreddocs_for('orcas/scoreddocs'),
    )

    # 'judged' variant: restrict dl-2019 to query ids with >= 1 qrel.
    judged_qids = Lazy(
        lambda: {q.query_id for q in subsets['trec-dl-2019'].qrels_iter()})
    subsets['trec-dl-2019/judged'] = Dataset(
        FilteredQueries(subsets['trec-dl-2019'].queries_handler(),
                        judged_qids),
        FilteredScoredDocs(subsets['trec-dl-2019'].scoreddocs_handler(),
                           judged_qids),
        subsets['trec-dl-2019'],
    )

    ir_datasets.registry.register('msmarco-document',
                                  Dataset(collection, documentation("_")))
    for name in sorted(subsets):
        ir_datasets.registry.register(f'msmarco-document/{name}',
                                      Dataset(subsets[name],
                                              documentation(name)))

    return collection, subsets
# Ejemplo n.º 16 (example separator; stray "0" artifact folded into this comment)
def _init():
    """Register the mMARCO (translated MS MARCO passage) datasets.

    Queries and documents are translations of msmarco-passage, so the
    English qrels and training doc pairs are reused for every language.
    Returns ``(collection, subsets)``; note ``collection`` is whichever
    language's docs handler was created last in the loops below.
    """
    documentation = YamlDocumentation(f'docs/{NAME}.yaml')
    base_path = ir_datasets.util.home_path() / NAME
    dlc = DownloadConfig.context(NAME, base_path)

    subsets = {}

    # Shared (English) relevance data, reused across all languages.
    train_qrels = ir_datasets.registry['msmarco-passage/train'].qrels_handler()
    train_docpairs = TsvDocPairs(dlc['train/triples'])  # fixed typo: was train_docparis
    dev_qrels = TrecQrels(dlc['dev/qrels'], QRELS_DEFS)
    dev_small_qrels = TrecQrels(dlc['dev/qrels-small'], QRELS_DEFS)
    small_dev_qids = Lazy(
        lambda: {q.query_id
                 for q in dev_small_qrels.qrels_iter()})

    # --- v1 translations ---
    for lang in ['es', 'fr', 'pt', 'it', 'id', 'de', 'ru', 'zh']:
        collection = TsvDocs(
            dlc[f'{lang}/docs'],
            namespace=f'mmarco/{lang}',
            lang=lang,
            count_hint=ir_datasets.util.count_hint(f'{NAME}/{lang}'))
        subsets[f'{lang}'] = Dataset(collection, documentation(f'{lang}'))
        subsets[f'{lang}/train'] = Dataset(
            collection,
            TsvQueries(dlc[f'{lang}/queries/train'],
                       namespace=f'mmarco/{lang}',
                       lang=lang), train_qrels, train_docpairs,
            documentation(f'{lang}/train'))
        subsets[f'{lang}/dev'] = Dataset(
            collection,
            TsvQueries(dlc[f'{lang}/queries/dev'],
                       namespace=f'mmarco/{lang}',
                       lang=lang), dev_qrels, documentation(f'{lang}/dev'))
        subsets[f'{lang}/dev/small'] = Dataset(
            collection,
            FilteredQueries(subsets[f'{lang}/dev'].queries_handler(),
                            small_dev_qids,
                            mode='include'), dev_small_qrels,
            # No v1 dev scoreddocs are available for zh/pt.
            TrecScoredDocs(dlc[f'{lang}/scoreddocs/dev'])
            if lang not in ('zh', 'pt') else None,
            documentation(f'{lang}/dev/small'))
        # zh/pt received corrected v1.1 query translations.
        if lang in ('zh', 'pt'):
            subsets[f'{lang}/dev/v1.1'] = Dataset(
                collection,
                TsvQueries(dlc[f'{lang}/queries/dev/v1.1'],
                           namespace=f'mmarco/{lang}',
                           lang=lang), dev_qrels,
                documentation(f'{lang}/dev/v1.1'))
            subsets[f'{lang}/dev/small/v1.1'] = Dataset(
                collection,
                FilteredQueries(subsets[f'{lang}/dev/v1.1'].queries_handler(),
                                small_dev_qids,
                                mode='include'), dev_small_qrels,
                TrecScoredDocs(dlc[f'{lang}/scoreddocs/dev/v1.1']),
                # fixed copy-paste: was documentation(f'{lang}/dev/v1.1')
                documentation(f'{lang}/dev/small/v1.1'))
        if lang in ('pt', ):
            subsets[f'{lang}/train/v1.1'] = Dataset(
                collection,
                TsvQueries(dlc[f'{lang}/queries/train/v1.1'],
                           namespace=f'mmarco/{lang}',
                           lang=lang), train_qrels, train_docpairs,
                documentation(f'{lang}/train/v1.1'))

    # --- v2 translations ---
    # NOTE(review): 'dt' is presumably the upstream code for Dutch -- confirm
    # against the download config before renaming.
    for lang in [
            'ar', 'zh', 'dt', 'fr', 'de', 'hi', 'id', 'it', 'ja', 'pt', 'ru',
            'es', 'vi'
    ]:
        collection = TsvDocs(
            dlc[f'v2/{lang}/docs'],
            namespace=f'mmarco/{lang}',
            lang=lang,
            count_hint=ir_datasets.util.count_hint(f'{NAME}/v2/{lang}'))
        subsets[f'v2/{lang}'] = Dataset(collection,
                                        documentation(f'v2/{lang}'))
        subsets[f'v2/{lang}/train'] = Dataset(
            collection,
            TsvQueries(dlc[f'v2/{lang}/queries/train'],
                       namespace=f'mmarco/v2/{lang}',
                       lang=lang), train_qrels, train_docpairs,
            documentation(f'v2/{lang}/train'))
        subsets[f'v2/{lang}/dev'] = Dataset(
            collection,
            # fixed inconsistent namespace: was f'v2/mmarco/{lang}'
            TsvQueries(dlc[f'v2/{lang}/queries/dev'],
                       namespace=f'mmarco/v2/{lang}',
                       lang=lang), dev_qrels, documentation(f'v2/{lang}/dev'))
        subsets[f'v2/{lang}/dev/small'] = Dataset(
            collection,
            FilteredQueries(subsets[f'v2/{lang}/dev'].queries_handler(),
                            small_dev_qids,
                            mode='include'), dev_small_qrels,
            # negate_score=True flips the sign of the distributed scores;
            # presumably they sort ascending upstream -- confirm.
            TrecScoredDocs(dlc[f'v2/{lang}/scoreddocs/dev'],
                           negate_score=True),
            documentation(f'v2/{lang}/dev/small'))

    # The top-level entry carries only documentation (no single collection
    # spans all languages).
    ir_datasets.registry.register(NAME, Dataset(documentation('_')))
    for s in sorted(subsets):
        ir_datasets.registry.register(f'{NAME}/{s}', subsets[s])

    return collection, subsets
# Ejemplo n.º 17 (example separator; stray "0" artifact folded into this comment)
def _init():
    """Register the Mr. TyDi datasets: one base subset per language plus
    train/dev/test splits.

    The per-split wiring was four copy-pasted blocks differing only in the
    file-name suffix and subset key; they are collapsed into a single loop.
    Returns the ``(base, subsets)`` pair.
    """
    base_path = ir_datasets.util.home_path() / NAME
    dlc = ir_datasets.util.DownloadConfig.context(NAME, base_path)
    documentation = YamlDocumentation(f'docs/{NAME}.yaml')

    base = Dataset(documentation('_'))

    subsets = {}

    # Language code -> directory name inside each per-language download.
    langs = {
        'ar': 'mrtydi-v1.0-arabic',
        'bn': 'mrtydi-v1.0-bengali',
        'en': 'mrtydi-v1.0-english',
        'fi': 'mrtydi-v1.0-finnish',
        'id': 'mrtydi-v1.0-indonesian',
        'ja': 'mrtydi-v1.0-japanese',
        'ko': 'mrtydi-v1.0-korean',
        'ru': 'mrtydi-v1.0-russian',
        'sw': 'mrtydi-v1.0-swahili',
        'te': 'mrtydi-v1.0-telugu',
        'th': 'mrtydi-v1.0-thai',
    }

    migrator = Migrator(base_path / 'irds_version.txt',
                        'v2',
                        affected_files=[base_path / lang for lang in langs],
                        message='Migrating mr-tydi (restructuring directory)')

    for lang, file_name in langs.items():
        dlc_ds = TarExtractAll(dlc[lang], f'{base_path/lang}.data')
        docs = MrTydiDocs(
            GzipExtract(
                RelativePath(dlc_ds, f'{file_name}/collection/docs.jsonl.gz')),
            lang,
            count_hint=ir_datasets.util.count_hint(f'{NAME}/{lang}'))
        docs = migrator(docs)
        # The base subset (split=None) uses topic.tsv / qrels.txt; each named
        # split inserts its name into the file stems (e.g. topic.train.tsv).
        for split in (None, 'train', 'dev', 'test'):
            sfx = '' if split is None else f'.{split}'
            key = lang if split is None else f'{lang}/{split}'
            subsets[key] = Dataset(
                docs,
                TsvQueries(RelativePath(dlc_ds, f'{file_name}/topic{sfx}.tsv'),
                           lang=lang),
                TrecQrels(RelativePath(dlc_ds, f'{file_name}/qrels{sfx}.txt'),
                          QREL_DEFS), documentation(key))

    ir_datasets.registry.register(NAME, base)
    for s in sorted(subsets):
        ir_datasets.registry.register(f'{NAME}/{s}', subsets[s])

    return base, subsets
# Ejemplo n.º 18 (example separator; stray "0" artifact folded into this comment)
def _init():
    """Build and register the MS MARCO v2 datasets and their subsets.

    Returns the ``(collection, subsets)`` pair; registration happens as a
    side effect via ``ir_datasets.registry``.
    """
    base_path = ir_datasets.util.home_path() / NAME
    documentation = YamlDocumentation(f'docs/{NAME}.yaml')
    dlc = DownloadConfig.context(NAME, base_path, dua=DUA)
    subsets = {}
    collection = MsMarcoV2Docs(dlc['docs'])

    subsets['train'] = Dataset(
        collection,
        TsvQueries(dlc['train_queries'], namespace='msmarco', lang='en'),
        TrecQrels(dlc['train_qrels'], QRELS_DEFS),
        TrecScoredDocs(GzipExtract(dlc['train_scoreddocs'])),
    )
    subsets['dev1'] = Dataset(
        collection,
        TsvQueries(dlc['dev1_queries'], namespace='msmarco', lang='en'),
        TrecQrels(dlc['dev1_qrels'], QRELS_DEFS),
        TrecScoredDocs(GzipExtract(dlc['dev1_scoreddocs'])),
    )
    subsets['dev2'] = Dataset(
        collection,
        TsvQueries(dlc['dev2_queries'], namespace='msmarco', lang='en'),
        TrecQrels(dlc['dev2_qrels'], QRELS_DEFS),
        TrecScoredDocs(GzipExtract(dlc['dev2_scoreddocs'])),
    )
    # NOTE: download keys mix styles here ('trec-dl-2019/queries' vs
    # 'trec_dl_2019_qrels'); both must match the download config as-is.
    subsets['trec-dl-2019'] = Dataset(
        collection,
        TsvQueries(GzipExtract(dlc['trec-dl-2019/queries']),
                   namespace='msmarco',
                   lang='en'),
        TrecQrels(GzipExtract(dlc['trec_dl_2019_qrels']), TREC_DL_QRELS_DEFS),
    )
    subsets['trec-dl-2020'] = Dataset(
        collection,
        TsvQueries(GzipExtract(dlc['trec-dl-2020/queries']),
                   namespace='msmarco',
                   lang='en'),
        TrecQrels(GzipExtract(dlc['trec_dl_2020_qrels']), TREC_DL_QRELS_DEFS),
    )
    # 'judged' variants: restrict to query ids with >= 1 qrel. The 2019/2020
    # subsets above carry no scoreddocs handler, so only queries are filtered.
    dl19_v2_judged = Lazy(
        lambda: {q.query_id
                 for q in subsets['trec-dl-2019'].qrels_iter()})
    subsets['trec-dl-2019/judged'] = Dataset(
        FilteredQueries(subsets['trec-dl-2019'].queries_handler(),
                        dl19_v2_judged),
        subsets['trec-dl-2019'],
    )
    dl20_v2_judged = Lazy(
        lambda: {q.query_id
                 for q in subsets['trec-dl-2020'].qrels_iter()})
    subsets['trec-dl-2020/judged'] = Dataset(
        FilteredQueries(subsets['trec-dl-2020'].queries_handler(),
                        dl20_v2_judged),
        subsets['trec-dl-2020'],
    )
    subsets['trec-dl-2021'] = Dataset(
        collection,
        TsvQueries(dlc['trec-dl-2021/queries'], namespace='msmarco',
                   lang='en'),
        TrecQrels(dlc['trec-dl-2021/qrels'], TREC_DL_QRELS_DEFS),
        TrecScoredDocs(GzipExtract(dlc['trec-dl-2021/scoreddocs'])),
    )
    dl21_judged = Lazy(
        lambda: {q.query_id
                 for q in subsets['trec-dl-2021'].qrels_iter()})
    subsets['trec-dl-2021/judged'] = Dataset(
        FilteredQueries(subsets['trec-dl-2021'].queries_handler(),
                        dl21_judged),
        FilteredScoredDocs(subsets['trec-dl-2021'].scoreddocs_handler(),
                           dl21_judged),
        subsets['trec-dl-2021'],
    )

    # Anchor-text pseudo-documents, cached locally after gunzip.
    subsets['anchor-text'] = Dataset(
        MsMarcoV2AnchorTextDocs(Cache(GzipExtract(dlc['anchor-text']),
                                      base_path / "anchor-text.json"),
                                count_hint=4821244),
        documentation('anchor-text'))

    ir_datasets.registry.register(NAME, Dataset(collection,
                                                documentation("_")))
    for s in sorted(subsets):
        ir_datasets.registry.register(f'{NAME}/{s}',
                                      Dataset(subsets[s], documentation(s)))

    return collection, subsets