def test_dataset_map(with_none):
    """Check that a batched ``map`` producing ``Array3D`` data reads back as numpy.

    A two-row dataset is mapped to a single ``image`` column declared as
    ``Array3D(dtype="int32", shape=(3, 3, 3))``. When ``with_none`` is truthy,
    the first image of the batch is replaced by ``None`` and the test expects
    the numpy-formatted value for that row to be all NaN.
    """
    pixel_values = [
        [[1, 2, 3], [4, 5, 6], [7, 8, 9]],
        [[10, 20, 30], [40, 50, 60], [70, 80, 90]],
        [[100, 200, 300], [400, 500, 600], [700, 800, 900]],
    ]

    def process_data(batch):
        # One fresh array per input row; the input values themselves are unused.
        images = [np.array(pixel_values) for _ in batch["path"]]
        if with_none:
            images[0] = None
        return {"image": images}

    source_ds = datasets.Dataset.from_dict({"path": ["path1", "path2"]})
    image_features = datasets.Features({"image": Array3D(dtype="int32", shape=(3, 3, 3))})
    processed_ds = source_ds.map(
        process_data,
        batched=True,
        remove_columns=source_ds.column_names,
        features=image_features,
    )

    # Two rows, and only the new "image" column remains.
    assert processed_ds.shape == (2, 1)

    with processed_ds.with_format("numpy") as numpy_ds:
        for index, example in enumerate(numpy_ds):
            assert "image" in example
            image = example["image"]
            assert isinstance(image, np.ndarray)
            assert image.shape == (3, 3, 3)
            if with_none and index == 0:
                assert np.all(np.isnan(image))
def test_dataset_map():
    """Check that a batched ``map`` producing ``Array3D`` data reads back as numpy.

    A two-row dataset is mapped to a single ``image`` column declared as
    ``Array3D(dtype="int32", shape=(3, 3, 3))``. Each example read through the
    numpy format must expose ``image`` as an ``np.ndarray`` with the declared
    shape.
    """
    ds = datasets.Dataset.from_dict({"path": ["path1", "path2"]})

    def process_data(batch):
        # One fresh 3x3x3 array per input row; the input values are unused.
        return {
            "image": [
                np.array(
                    [
                        [[1, 2, 3], [4, 5, 6], [7, 8, 9]],
                        [[10, 20, 30], [40, 50, 60], [70, 80, 90]],
                        [[100, 200, 300], [400, 500, 600], [700, 800, 900]],
                    ]
                )
                for _ in batch["path"]
            ]
        }

    features = datasets.Features({"image": Array3D(dtype="int32", shape=(3, 3, 3))})
    processed_ds = ds.map(
        process_data,
        batched=True,
        remove_columns=ds.column_names,
        features=features,
    )

    # Two rows, and only the new "image" column remains.
    assert processed_ds.shape == (2, 1)

    with processed_ds.with_format("numpy") as pds:
        for example in pds:
            assert "image" in example
            assert isinstance(example["image"], np.ndarray)
            # Guard against the array coming back flattened or otherwise
            # reshaped: it must match the shape declared in the features.
            assert example["image"].shape == (3, 3, 3)
def get_two_col_datasset(self, first_dim_list, fixed_shape):
    """Build a two-column dataset of random images and constant text.

    Args:
        first_dim_list: Iterable of first-axis lengths; one row is created
            per entry.
        fixed_shape: Trailing dimensions shared by every image.

    Returns:
        A ``datasets.Dataset`` with an ``image`` column typed as
        ``Array3D(shape=(None, *fixed_shape), dtype="float32")`` and a
        ``text`` string column where every value is ``"text"``.
    """
    image_feature = Array3D(shape=(None, *fixed_shape), dtype="float32")
    schema = datasets.Features({"image": image_feature, "text": Value("string")})

    # The first axis varies per row; the remaining axes are fixed.
    images = [
        np.random.rand(first_dim, *fixed_shape).astype("float32")
        for first_dim in first_dim_list
    ]
    texts = ["text" for _ in first_dim_list]

    return datasets.Dataset.from_dict({"image": images, "text": texts}, features=schema)