Beispiel #1
0
def test_from_dataset(data, fields):
    """Check that ``DiskBackedDataset.from_dataset`` preserves example contents.

    A ``Dataset`` is built from ``data`` using ``ExampleFactory.from_list`` and
    then converted with ``DiskBackedDataset.from_dataset``. The converted
    dataset must have the same length as the source and every example pair
    must agree on the ``number`` and ``tokens`` attributes.
    """
    example_factory = ExampleFactory(fields)
    examples = [example_factory.from_list(raw_example) for raw_example in data]
    dataset = Dataset(examples, fields)
    pyarrow_dataset = DiskBackedDataset.from_dataset(dataset)

    try:
        # zip() stops at the shorter input, so without this check a converted
        # dataset that lost examples would still pass the loop below.
        assert len(pyarrow_dataset) == len(dataset)

        for ds_ex, arrow_ex in zip(dataset, pyarrow_dataset):
            assert ds_ex.number == arrow_ex.number
            assert ds_ex.tokens == arrow_ex.tokens
    finally:
        # Run the cleanup even when an assertion fails
        # (presumably this removes the on-disk cache -- confirm).
        pyarrow_dataset.delete_cache()
Beispiel #2
0
def test_slice_view_to_dataset(dataset, tmp_path):
    """Check that a ``DatasetSlicedView`` converts to concrete dataset types.

    A view over ``slice(3, 8, 2)`` of ``dataset`` is converted first to a
    ``Dataset`` and then to a ``DiskBackedDataset`` (cached under
    ``tmp_path``). Each result must be of the requested type, have the same
    length as the view, and match the view on every field of ``dataset``.
    """
    view = DatasetSlicedView(dataset, s=slice(3, 8, 2))

    def check_matches_view(converted, expected_type):
        # Shared checks for both conversions: type, length, per-field equality.
        assert isinstance(converted, expected_type)
        assert len(converted) == len(view)
        for view_example, converted_example in zip(view, converted):
            for field in dataset.fields:
                assert view_example[field.name] == converted_example[field.name]

    # cast to Dataset
    check_matches_view(Dataset.from_dataset(view), Dataset)

    # cast to DiskBackedDataset
    disk_backed = DiskBackedDataset.from_dataset(view, cache_path=tmp_path)
    check_matches_view(disk_backed, DiskBackedDataset)