def test_flatten(self):
    """Flattening a nested Sequence feature exposes dot-separated column names."""
    dset = Dataset.from_dict(
        {"a": [{"b": {"c": ["text"]}}] * 10, "foo": [1] * 10},
        features=Features({"a": {"b": Sequence({"c": Value("string")})}, "foo": Value("int64")}),
    )
    dset.flatten()
    # The nested column "a" is replaced by the flattened "a.b.c"; "foo" is untouched.
    self.assertListEqual(dset.column_names, ["a.b.c", "foo"])
    self.assertListEqual(list(dset.features.keys()), ["a.b.c", "foo"])
    self.assertDictEqual(
        dset.features,
        Features({"a.b.c": Sequence(Value("string")), "foo": Value("int64")}),
    )
def test_keep_features_with_new_features(self):
    """map() keeps declared features and infers types only for new columns."""
    features = Features(
        {
            "tokens": Sequence(Value("string")),
            "labels": Sequence(ClassLabel(names=["negative", "positive"])),
        }
    )
    dset = Dataset.from_dict(
        {"tokens": [["foo"] * 5] * 10, "labels": [[1] * 5] * 10},
        features=features,
    )

    def invert_labels(x):
        # Flip each label and also emit the originals under a new key.
        return {"labels": [(1 - label) for label in x["labels"]], "labels2": x["labels"]}

    # "labels" keeps its ClassLabel feature; "labels2" gets an inferred int64 type.
    expected_features = Features(
        {
            "tokens": Sequence(Value("string")),
            "labels": Sequence(ClassLabel(names=["negative", "positive"])),
            "labels2": Sequence(Value("int64")),
        }
    )

    with tempfile.TemporaryDirectory() as tmp_dir:
        tmp_file = os.path.join(tmp_dir, "test.arrow")
        inverted_dset = dset.map(invert_labels, cache_file_name=tmp_file)
        self.assertEqual(inverted_dset.features.type, expected_features.type)
        self.assertDictEqual(inverted_dset.features, expected_features)
def test_keep_features_after_transform_in_memory(self):
    """An in-memory map() must preserve the declared feature types."""
    features = Features(
        {
            "tokens": Sequence(Value("string")),
            "labels": Sequence(ClassLabel(names=["negative", "positive"])),
        }
    )
    dset = Dataset.from_dict(
        {"tokens": [["foo"] * 5] * 10, "labels": [[1] * 5] * 10},
        features=features,
    )

    def invert_labels(x):
        # Flip 0 <-> 1 without introducing any new columns.
        return {"labels": [(1 - label) for label in x["labels"]]}

    inverted_dset = dset.map(invert_labels, keep_in_memory=True)
    # Features (including the ClassLabel) survive the transform unchanged.
    self.assertEqual(inverted_dset.features.type, features.type)
    self.assertDictEqual(inverted_dset.features, features)
def test_format_ragged_vectors(self):
    """Check how each output format handles vectors of varying lengths."""
    dset = self._create_dummy_dataset()
    import numpy as np
    import tensorflow as tf
    import torch

    with tempfile.TemporaryDirectory() as tmp_dir:
        tmp_file = os.path.join(tmp_dir, "test.arrow")
        # Each row gets a vector of a different length (3 + i), making "vec" ragged.
        dset = dset.map(
            lambda ex, i: {"vec": np.ones(3 + i) * i},
            with_indices=True,
            cache_file_name=tmp_file,
        )
        columns = dset.column_names

        # Default (python) format: plain str / list values.
        self.assertIsNotNone(dset[0])
        self.assertIsNotNone(dset[:2])
        for col in columns:
            self.assertIsInstance(dset[0][col], (str, list))
            self.assertIsInstance(dset[:2][col], list)
        self.assertDictEqual(
            dset.features,
            Features({"filename": Value("string"), "vec": Sequence(Value("float64"))}),
        )

        dset.set_format("tensorflow")
        self.assertIsNotNone(dset[0])
        self.assertIsNotNone(dset[:2])
        for col in columns:
            self.assertIsInstance(dset[0][col], (tf.Tensor, tf.RaggedTensor))
            self.assertIsInstance(dset[:2][col], (tf.Tensor, tf.RaggedTensor))
            self.assertIsInstance(dset[col], (tf.Tensor, tf.RaggedTensor))
        # The ragged dimension shows up as None in tensorflow shapes.
        self.assertListEqual(dset[:2]["vec"].shape.as_list(), [2, None])
        self.assertListEqual(dset["vec"][:2].shape.as_list(), [2, None])

        dset.set_format("numpy")
        self.assertIsNotNone(dset[0])
        self.assertIsNotNone(dset[:2])
        for col in columns:
            self.assertIsInstance(dset[0][col], np.ndarray)
            self.assertIsInstance(dset[:2][col], np.ndarray)
            self.assertIsInstance(dset[col], np.ndarray)
        # numpy packs ragged vectors into a flat object-style array of shape (n,).
        self.assertEqual(dset[:2]["vec"].shape, (2,))
        self.assertEqual(dset["vec"][:2].shape, (2,))

        dset.set_format("torch", columns=["vec"])
        self.assertIsNotNone(dset[0])
        self.assertIsNotNone(dset[:2])
        # torch.Tensor is only produced for numerical columns.
        self.assertIsInstance(dset[0]["vec"], torch.Tensor)
        self.assertIsInstance(dset[:2]["vec"][0], torch.Tensor)
        self.assertIsInstance(dset["vec"][0], torch.Tensor)
        # pytorch has no ragged tensors, so batched access yields lists of tensors.
        self.assertIsInstance(dset[:2]["vec"], list)
        self.assertIsInstance(dset["vec"][:2], list)
def test_format_vectors(self):
    """Check how each output format handles fixed-length vector columns."""
    dset = self._create_dummy_dataset()
    import numpy as np
    import tensorflow as tf
    import torch

    with tempfile.TemporaryDirectory() as tmp_dir:
        tmp_file = os.path.join(tmp_dir, "test.arrow")
        # Every row gets the same-length vector, so "vec" is rectangular.
        dset = dset.map(
            lambda ex, i: {"vec": np.ones(3) * i},
            with_indices=True,
            cache_file_name=tmp_file,
        )
        columns = dset.column_names

        # Default (python) format: plain str / list values.
        self.assertIsNotNone(dset[0])
        self.assertIsNotNone(dset[:2])
        for col in columns:
            self.assertIsInstance(dset[0][col], (str, list))
            self.assertIsInstance(dset[:2][col], list)
        self.assertDictEqual(
            dset.features,
            Features({"filename": Value("string"), "vec": Sequence(Value("float64"))}),
        )

        # Don't test whether torch and tensorflow stack across examples:
        # that would require the features definition to know at what depth
        # to do the conversion.
        dset.set_format("tensorflow")
        self.assertIsNotNone(dset[0])
        self.assertIsNotNone(dset[:2])
        for col in columns:
            self.assertIsInstance(dset[0][col], (tf.Tensor, tf.RaggedTensor))
            self.assertIsInstance(dset[:2][col][0], (tf.Tensor, tf.RaggedTensor))  # not stacked

        dset.set_format("numpy")
        self.assertIsNotNone(dset[0])
        self.assertIsNotNone(dset[:2])
        for col in columns:
            self.assertIsInstance(dset[0][col], np.ndarray)
            self.assertIsInstance(dset[:2][col], np.ndarray)  # stacked
        self.assertEqual(dset[:2]["vec"].shape, (2, 3))  # stacked

        dset.set_format("torch", columns=["vec"])
        self.assertIsNotNone(dset[0])
        self.assertIsNotNone(dset[:2])
        # torch.Tensor is only produced for numerical columns.
        self.assertIsInstance(dset[0]["vec"], torch.Tensor)
        self.assertIsInstance(dset[:2]["vec"][0], torch.Tensor)  # not stacked
def test_from_arrow_schema_with_sequence(self):
    """Round-tripping nested Sequence features through an Arrow schema keeps the data equal."""
    data = {"a": [{"b": {"c": ["text"]}}] * 10, "foo": [1] * 10}
    original_features = Features(
        {"a": {"b": Sequence({"c": Value("string")})}, "foo": Value("int64")}
    )
    dset = Dataset.from_dict(data, features=original_features)
    # Rebuild the features from the dataset's Arrow schema and re-create the dataset.
    new_features = Features.from_arrow_schema(dset.schema)
    new_dset = Dataset.from_dict(data, features=new_features)
    # Both datasets must yield identical rows.
    self.assertDictEqual(dset[0], new_dset[0])
    self.assertDictEqual(dset[:], new_dset[:])
def test_format_nested(self):
    """Check that output formatting reaches values nested inside dict columns."""
    dset = self._create_dummy_dataset()
    import numpy as np
    import tensorflow as tf
    import torch

    with tempfile.TemporaryDirectory() as tmp_dir:
        tmp_file = os.path.join(tmp_dir, "test.arrow")
        # Add a nested dict column: each example gets {"foo": [1.0, 1.0, 1.0]}.
        dset = dset.map(
            lambda ex: {"nested": [{"foo": np.ones(3)}] * len(ex["filename"])},
            cache_file_name=tmp_file,
            batched=True,
        )
        self.assertDictEqual(
            dset.features,
            Features({"filename": Value("string"), "nested": {"foo": Sequence(Value("float64"))}}),
        )

        dset.set_format("tensorflow")
        self.assertIsNotNone(dset[0])
        self.assertIsInstance(dset[0]["nested"]["foo"], (tf.Tensor, tf.RaggedTensor))
        self.assertIsNotNone(dset[:2])
        self.assertIsInstance(dset[:2]["nested"][0]["foo"], (tf.Tensor, tf.RaggedTensor))
        self.assertIsInstance(dset["nested"][0]["foo"], (tf.Tensor, tf.RaggedTensor))

        dset.set_format("numpy")
        self.assertIsNotNone(dset[0])
        self.assertIsInstance(dset[0]["nested"]["foo"], np.ndarray)
        self.assertIsNotNone(dset[:2])
        self.assertIsInstance(dset[:2]["nested"][0]["foo"], np.ndarray)
        self.assertIsInstance(dset["nested"][0]["foo"], np.ndarray)

        dset.set_format("torch", columns="nested")
        self.assertIsNotNone(dset[0])
        self.assertIsInstance(dset[0]["nested"]["foo"], torch.Tensor)
        self.assertIsNotNone(dset[:2])
        self.assertIsInstance(dset[:2]["nested"][0]["foo"], torch.Tensor)
        self.assertIsInstance(dset["nested"][0]["foo"], torch.Tensor)
def test_map_numpy(self):
    """map() should convert numpy array outputs into Sequence(float64) columns."""
    dset = self._create_dummy_dataset()

    def func(example):
        # Return a numpy array; the mapping machinery must serialize it to a list.
        return {"tensor": np.array([1.0, 2, 3])}

    with tempfile.TemporaryDirectory() as tmp_dir:
        tmp_file = os.path.join(tmp_dir, "test.arrow")
        dset_test = dset.map(func, cache_file_name=tmp_file)
        self.assertEqual(len(dset_test), 30)
        self.assertDictEqual(
            dset_test.features,
            Features({"filename": Value("string"), "tensor": Sequence(Value("float64"))}),
        )
        # In the default python format the array comes back as a plain list.
        self.assertListEqual(dset_test[0]["tensor"], [1, 2, 3])