def test_remove_columns(self):
    dset = self._create_dummy_dataset()
    with tempfile.TemporaryDirectory() as tmp_dir:
        tmp_file = os.path.join(tmp_dir, "test.arrow")
        dset = dset.map(
            lambda x, i: {"name": x["filename"][:-2], "id": i},
            with_indices=True,
            cache_file_name=tmp_file,
        )
        self.assertTrue("id" in dset[0])
        self.assertDictEqual(
            dset.features,
            Features({"filename": Value("string"), "name": Value("string"), "id": Value("int64")}),
        )
    with tempfile.TemporaryDirectory() as tmp_dir:
        tmp_file = os.path.join(tmp_dir, "test.arrow")
        dset = dset.map(lambda x: x, remove_columns=["id"], cache_file_name=tmp_file)
        self.assertTrue("id" not in dset[0])
        self.assertDictEqual(
            dset.features,
            Features({"filename": Value("string"), "name": Value("string")}),
        )

def test_flatten(self):
    dset = Dataset.from_dict(
        {"a": [{"b": {"c": ["text"]}}] * 10, "foo": [1] * 10},
        features=Features({"a": {"b": Sequence({"c": Value("string")})}, "foo": Value("int64")}),
    )
    dset.flatten()
    self.assertListEqual(dset.column_names, ["a.b.c", "foo"])
    self.assertListEqual(list(dset.features.keys()), ["a.b.c", "foo"])
    self.assertDictEqual(
        dset.features, Features({"a.b.c": Sequence(Value("string")), "foo": Value("int64")})
    )

def test_keep_features_with_new_features(self):
    features = Features(
        {"tokens": Sequence(Value("string")), "labels": Sequence(ClassLabel(names=["negative", "positive"]))}
    )
    dset = Dataset.from_dict({"tokens": [["foo"] * 5] * 10, "labels": [[1] * 5] * 10}, features=features)

    def invert_labels(x):
        return {"labels": [(1 - label) for label in x["labels"]], "labels2": x["labels"]}

    expected_features = Features(
        {
            "tokens": Sequence(Value("string")),
            "labels": Sequence(ClassLabel(names=["negative", "positive"])),
            "labels2": Sequence(Value("int64")),
        }
    )
    with tempfile.TemporaryDirectory() as tmp_dir:
        tmp_file = os.path.join(tmp_dir, "test.arrow")
        inverted_dset = dset.map(invert_labels, cache_file_name=tmp_file)
        self.assertEqual(inverted_dset.features.type, expected_features.type)
        self.assertDictEqual(inverted_dset.features, expected_features)

def test_filter(self):
    dset = self._create_dummy_dataset()
    # keep only the first five examples
    with tempfile.TemporaryDirectory() as tmp_dir:
        tmp_file = os.path.join(tmp_dir, "test.arrow")
        dset_filter_first_five = dset.filter(lambda x, i: i < 5, with_indices=True, cache_file_name=tmp_file)
        self.assertEqual(len(dset_filter_first_five), 5)
        self.assertDictEqual(dset.features, Features({"filename": Value("string")}))
        self.assertDictEqual(dset_filter_first_five.features, Features({"filename": Value("string")}))
    # keep only filenames ending with an even id
    with tempfile.TemporaryDirectory() as tmp_dir:
        tmp_file = os.path.join(tmp_dir, "test.arrow")
        dset_filter_even_num = dset.filter(lambda x: (int(x["filename"][-1]) % 2 == 0), cache_file_name=tmp_file)
        self.assertEqual(len(dset_filter_even_num), 15)
        self.assertDictEqual(dset.features, Features({"filename": Value("string")}))
        self.assertDictEqual(dset_filter_even_num.features, Features({"filename": Value("string")}))

def test_sort(self):
    dset = self._create_dummy_dataset()
    with tempfile.TemporaryDirectory() as tmp_dir:
        # Keep only 10 examples
        tmp_file = os.path.join(tmp_dir, "test.arrow")
        dset = dset.select(range(10), cache_file_name=tmp_file)
        tmp_file = os.path.join(tmp_dir, "test_2.arrow")
        dset = dset.shuffle(seed=1234, cache_file_name=tmp_file)
        self.assertEqual(len(dset), 10)
        self.assertEqual(dset[0]["filename"], "my_name-train_8")
        self.assertEqual(dset[1]["filename"], "my_name-train_9")
        # Sort
        tmp_file = os.path.join(tmp_dir, "test_3.arrow")
        dset_sorted = dset.sort("filename", cache_file_name=tmp_file)
        for i, row in enumerate(dset_sorted):
            self.assertEqual(int(row["filename"][-1]), i)
        self.assertDictEqual(dset.features, Features({"filename": Value("string")}))
        self.assertDictEqual(dset_sorted.features, Features({"filename": Value("string")}))
        # Sort reversed
        tmp_file = os.path.join(tmp_dir, "test_4.arrow")
        dset_sorted = dset.sort("filename", cache_file_name=tmp_file, reverse=True)
        for i, row in enumerate(dset_sorted):
            self.assertEqual(int(row["filename"][-1]), len(dset_sorted) - 1 - i)
        self.assertDictEqual(dset.features, Features({"filename": Value("string")}))
        self.assertDictEqual(dset_sorted.features, Features({"filename": Value("string")}))

def test_from_pandas(self):
    data = {"col_1": [3, 2, 1, 0], "col_2": ["a", "b", "c", "d"]}
    df = pd.DataFrame.from_dict(data)
    dset = Dataset.from_pandas(df)
    self.assertListEqual(dset["col_1"], data["col_1"])
    self.assertListEqual(dset["col_2"], data["col_2"])
    self.assertListEqual(list(dset.features.keys()), ["col_1", "col_2"])

    features = Features({"col_1": Value("int64"), "col_2": Value("string")})
    dset = Dataset.from_pandas(df, features=features)
    self.assertListEqual(dset["col_1"], data["col_1"])
    self.assertListEqual(dset["col_2"], data["col_2"])
    self.assertListEqual(list(dset.features.keys()), ["col_1", "col_2"])

    features = Features({"col_1": Value("string"), "col_2": Value("string")})
    self.assertRaises(pa.ArrowTypeError, Dataset.from_pandas, df, features=features)

def test_shard(self):
    dset = self._create_dummy_dataset()
    with tempfile.TemporaryDirectory() as tmp_dir:
        tmp_file = os.path.join(tmp_dir, "test.arrow")
        dset = dset.select(range(10), cache_file_name=tmp_file)
        self.assertEqual(len(dset), 10)
        # Shard
        dset_sharded = dset.shard(num_shards=8, index=1)
        self.assertEqual(2, len(dset_sharded))
        self.assertEqual(["my_name-train_1", "my_name-train_9"], dset_sharded["filename"])
        self.assertDictEqual(dset.features, Features({"filename": Value("string")}))
        self.assertDictEqual(dset_sharded.features, Features({"filename": Value("string")}))
        # Shard contiguous
        dset_sharded_contiguous = dset.shard(num_shards=3, index=0, contiguous=True)
        self.assertEqual([f"my_name-train_{i}" for i in (0, 1, 2, 3)], dset_sharded_contiguous["filename"])
        self.assertDictEqual(dset.features, Features({"filename": Value("string")}))
        self.assertDictEqual(dset_sharded_contiguous.features, Features({"filename": Value("string")}))
        # Test lengths of sharded contiguous
        self.assertEqual([4, 3, 3], [len(dset.shard(3, index=i, contiguous=True)) for i in range(3)])

def test_map_batched(self):
    dset = self._create_dummy_dataset()

    def map_batched(example):
        return {"filename_new": [x + "_extension" for x in example["filename"]]}

    with tempfile.TemporaryDirectory() as tmp_dir:
        tmp_file = os.path.join(tmp_dir, "test.arrow")
        dset_test_batched = dset.map(map_batched, batched=True, cache_file_name=tmp_file)
        self.assertEqual(len(dset_test_batched), 30)
        self.assertDictEqual(dset.features, Features({"filename": Value("string")}))
        self.assertDictEqual(
            dset_test_batched.features,
            Features({"filename": Value("string"), "filename_new": Value("string")}),
        )

    def map_batched_with_indices(example, idx):
        return {"filename_new": [x + "_extension_" + str(idx) for x in example["filename"]]}

    with tempfile.TemporaryDirectory() as tmp_dir:
        tmp_file = os.path.join(tmp_dir, "test.arrow")
        dset_test_with_indices_batched = dset.map(
            map_batched_with_indices, batched=True, with_indices=True, cache_file_name=tmp_file
        )
        self.assertEqual(len(dset_test_with_indices_batched), 30)
        self.assertDictEqual(dset.features, Features({"filename": Value("string")}))
        self.assertDictEqual(
            dset_test_with_indices_batched.features,
            Features({"filename": Value("string"), "filename_new": Value("string")}),
        )

def test_format_ragged_vectors(self):
    dset = self._create_dummy_dataset()
    import numpy as np
    import tensorflow as tf
    import torch

    with tempfile.TemporaryDirectory() as tmp_dir:
        tmp_file = os.path.join(tmp_dir, "test.arrow")
        dset = dset.map(lambda ex, i: {"vec": np.ones(3 + i) * i}, with_indices=True, cache_file_name=tmp_file)
        columns = dset.column_names

        self.assertIsNotNone(dset[0])
        self.assertIsNotNone(dset[:2])
        for col in columns:
            self.assertIsInstance(dset[0][col], (str, list))
            self.assertIsInstance(dset[:2][col], list)
        self.assertDictEqual(
            dset.features, Features({"filename": Value("string"), "vec": Sequence(Value("float64"))})
        )

        dset.set_format("tensorflow")
        self.assertIsNotNone(dset[0])
        self.assertIsNotNone(dset[:2])
        for col in columns:
            self.assertIsInstance(dset[0][col], (tf.Tensor, tf.RaggedTensor))
            self.assertIsInstance(dset[:2][col], (tf.Tensor, tf.RaggedTensor))
            self.assertIsInstance(dset[col], (tf.Tensor, tf.RaggedTensor))
        # dim is None for ragged vectors in tensorflow
        self.assertListEqual(dset[:2]["vec"].shape.as_list(), [2, None])
        self.assertListEqual(dset["vec"][:2].shape.as_list(), [2, None])

        dset.set_format("numpy")
        self.assertIsNotNone(dset[0])
        self.assertIsNotNone(dset[:2])
        for col in columns:
            self.assertIsInstance(dset[0][col], np.ndarray)
            self.assertIsInstance(dset[:2][col], np.ndarray)
            self.assertIsInstance(dset[col], np.ndarray)
        # array is flat for ragged vectors in numpy
        self.assertEqual(dset[:2]["vec"].shape, (2,))
        self.assertEqual(dset["vec"][:2].shape, (2,))

        dset.set_format("torch", columns=["vec"])
        self.assertIsNotNone(dset[0])
        self.assertIsNotNone(dset[:2])
        # torch.Tensor is only used for numerical columns
        self.assertIsInstance(dset[0]["vec"], torch.Tensor)
        self.assertIsInstance(dset[:2]["vec"][0], torch.Tensor)
        self.assertIsInstance(dset["vec"][0], torch.Tensor)
        # pytorch doesn't support ragged tensors, so we should have lists
        self.assertIsInstance(dset[:2]["vec"], list)
        self.assertIsInstance(dset["vec"][:2], list)

def test_cast_(self):
    dset = self._create_dummy_dataset(multiple_columns=True)
    features = dset.features
    features["col_1"] = Value("float64")
    dset.cast_(features)
    self.assertEqual(dset.num_columns, 2)
    self.assertEqual(dset.features["col_1"], Value("float64"))
    self.assertIsInstance(dset[0]["col_1"], float)

def _info(self):
    return MetricInfo(
        description="dummy metric for tests",
        citation="insert citation here",
        features=Features({"predictions": Value("int64"), "references": Value("int64")}),
    )

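# A Metric subclass needs a _compute alongside _info, so here is a minimal
# sketch to make the dummy metric complete. The exact-match logic is an
# assumption for illustration; the real test helper may compute anything.
def _compute(self, predictions, references):
    # fraction of predictions that exactly match their reference
    matches = sum(p == r for p, r in zip(predictions, references))
    return {"accuracy": matches / len(predictions) if predictions else 0.0}
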
def test_map(self):
    dset = self._create_dummy_dataset()

    with tempfile.TemporaryDirectory() as tmp_dir:
        tmp_file = os.path.join(tmp_dir, "test.arrow")
        self.assertDictEqual(dset.features, Features({"filename": Value("string")}))
        dset_test = dset.map(
            lambda x: {"name": x["filename"][:-2], "id": int(x["filename"][-1])}, cache_file_name=tmp_file
        )
        self.assertEqual(len(dset_test), 30)
        self.assertDictEqual(dset.features, Features({"filename": Value("string")}))
        self.assertDictEqual(
            dset_test.features,
            Features({"filename": Value("string"), "name": Value("string"), "id": Value("int64")}),
        )

    with tempfile.TemporaryDirectory() as tmp_dir:
        tmp_file = os.path.join(tmp_dir, "test.arrow")
        dset_test_with_indices = dset.map(
            lambda x, i: {"name": x["filename"][:-2], "id": i}, with_indices=True, cache_file_name=tmp_file
        )
        self.assertEqual(len(dset_test_with_indices), 30)
        self.assertDictEqual(dset.features, Features({"filename": Value("string")}))
        self.assertDictEqual(
            dset_test_with_indices.features,
            Features({"filename": Value("string"), "name": Value("string"), "id": Value("int64")}),
        )

    with tempfile.TemporaryDirectory() as tmp_dir:

        def func(x, i):
            if i == 4:
                raise KeyboardInterrupt()
            return {"name": x["filename"][:-2], "id": i}

        tmp_file = os.path.join(tmp_dir, "test.arrow")
        # an interrupted map should not leave a partial cache file behind
        self.assertRaises(
            KeyboardInterrupt,
            dset.map,
            function=func,
            with_indices=True,
            cache_file_name=tmp_file,
            writer_batch_size=2,
        )
        self.assertFalse(os.path.exists(tmp_file))
        dset_test_with_indices = dset.map(
            lambda x, i: {"name": x["filename"][:-2], "id": i},
            with_indices=True,
            cache_file_name=tmp_file,
            writer_batch_size=2,
        )
        self.assertTrue(os.path.exists(tmp_file))
        self.assertEqual(len(dset_test_with_indices), 30)
        self.assertDictEqual(dset.features, Features({"filename": Value("string")}))
        self.assertDictEqual(
            dset_test_with_indices.features,
            Features({"filename": Value("string"), "name": Value("string"), "id": Value("int64")}),
        )

def test_cast_(self):
    dset = self._create_dummy_dataset_dict(multiple_columns=True)
    features = dset["train"].features
    features["col_1"] = Value("float64")
    dset.cast_(features)
    for dset_split in dset.values():
        self.assertEqual(dset_split.num_columns, 2)
        self.assertEqual(dset_split.features["col_1"], Value("float64"))
        self.assertIsInstance(dset_split[0]["col_1"], float)

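# Hedged sketch of the dataset-dict fixture assumed by the test above; the real
# helper may differ, but the assertions only require splits whose columns match
# _create_dummy_dataset (sketched after test_dummy_dataset below).
def _create_dummy_dataset_dict(self, multiple_columns=False):
    return DatasetDict(
        {
            "train": self._create_dummy_dataset(multiple_columns=multiple_columns),
            "test": self._create_dummy_dataset(multiple_columns=multiple_columns),
        }
    )
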
def test_dummy_dataset(self):
    dset = self._create_dummy_dataset()
    self.assertDictEqual(dset.features, Features({"filename": Value("string")}))
    self.assertEqual(dset[0]["filename"], "my_name-train_0")
    self.assertEqual(dset["filename"][0], "my_name-train_0")

    dset = self._create_dummy_dataset(multiple_columns=True)
    self.assertDictEqual(dset.features, Features({"col_1": Value("int64"), "col_2": Value("string")}))
    self.assertEqual(dset[0]["col_1"], 3)
    self.assertEqual(dset["col_1"][0], 3)

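# Hedged sketch of the fixture the tests in this file rely on; the real helper
# may differ, but test_dummy_dataset pins down 30 "my_name-train_{i}" rows and,
# for multiple_columns, an int64/string dataset whose first "col_1" value is 3.
def _create_dummy_dataset(self, multiple_columns=False):
    if multiple_columns:
        data = {"col_1": [3, 2, 1, 0], "col_2": ["a", "b", "c", "d"]}
        return Dataset.from_dict(data)
    return Dataset.from_dict({"filename": ["my_name-train_" + str(i) for i in range(30)]})
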
def test_format_vectors(self):
    dset = self._create_dummy_dataset()
    import numpy as np
    import tensorflow as tf
    import torch

    with tempfile.TemporaryDirectory() as tmp_dir:
        tmp_file = os.path.join(tmp_dir, "test.arrow")
        dset = dset.map(lambda ex, i: {"vec": np.ones(3) * i}, with_indices=True, cache_file_name=tmp_file)
        columns = dset.column_names

        self.assertIsNotNone(dset[0])
        self.assertIsNotNone(dset[:2])
        for col in columns:
            self.assertIsInstance(dset[0][col], (str, list))
            self.assertIsInstance(dset[:2][col], list)
        self.assertDictEqual(
            dset.features, Features({"filename": Value("string"), "vec": Sequence(Value("float64"))})
        )

        # don't test whether torch and tensorflow tensors are stacked across examples:
        # we would need the features definition to know at what depth to do the conversion
        dset.set_format("tensorflow")
        self.assertIsNotNone(dset[0])
        self.assertIsNotNone(dset[:2])
        for col in columns:
            self.assertIsInstance(dset[0][col], (tf.Tensor, tf.RaggedTensor))
            self.assertIsInstance(dset[:2][col][0], (tf.Tensor, tf.RaggedTensor))  # not stacked

        dset.set_format("numpy")
        self.assertIsNotNone(dset[0])
        self.assertIsNotNone(dset[:2])
        for col in columns:
            self.assertIsInstance(dset[0][col], np.ndarray)
            self.assertIsInstance(dset[:2][col], np.ndarray)  # stacked
        self.assertEqual(dset[:2]["vec"].shape, (2, 3))  # stacked

        dset.set_format("torch", columns=["vec"])
        self.assertIsNotNone(dset[0])
        self.assertIsNotNone(dset[:2])
        # torch.Tensor is only used for numerical columns
        self.assertIsInstance(dset[0]["vec"], torch.Tensor)
        self.assertIsInstance(dset[:2]["vec"][0], torch.Tensor)  # not stacked

def test_from_dict(self):
    data = {"col_1": [3, 2, 1, 0], "col_2": ["a", "b", "c", "d"]}
    dset = Dataset.from_dict(data)
    self.assertListEqual(dset["col_1"], data["col_1"])
    self.assertListEqual(dset["col_2"], data["col_2"])

    features = Features({"col_1": Value("int64"), "col_2": Value("string")})
    dset = Dataset.from_dict(data, features=features)
    self.assertListEqual(dset["col_1"], data["col_1"])
    self.assertListEqual(dset["col_2"], data["col_2"])

    features = Features({"col_1": Value("string"), "col_2": Value("string")})
    self.assertRaises(pa.ArrowTypeError, Dataset.from_dict, data, features=features)

def test_from_arrow_schema_with_sequence(self):
    data = {"a": [{"b": {"c": ["text"]}}] * 10, "foo": [1] * 10}
    original_features = Features({"a": {"b": Sequence({"c": Value("string")})}, "foo": Value("int64")})
    dset = Dataset.from_dict(data, features=original_features)
    new_features = Features.from_arrow_schema(dset.schema)
    new_dset = Dataset.from_dict(data, features=new_features)
    self.assertDictEqual(dset[0], new_dset[0])
    self.assertDictEqual(dset[:], new_dset[:])

def test_map_not_cached(self):
    dset = self._create_dummy_dataset()

    self.assertDictEqual(dset.features, Features({"filename": Value("string")}))
    dset_test = dset.map(
        lambda x: {"name": x["filename"][:-2], "id": int(x["filename"][-1])}, cache_file_name=None
    )
    self.assertEqual(len(dset_test), 30)
    self.assertDictEqual(dset.features, Features({"filename": Value("string")}))
    self.assertDictEqual(
        dset_test.features,
        Features({"filename": Value("string"), "name": Value("string"), "id": Value("int64")}),
    )

    self.assertDictEqual(dset.features, Features({"filename": Value("string")}))
    dset_test = dset.map(lambda x: None, cache_file_name=None)
    self.assertEqual(len(dset_test), 30)
    self.assertDictEqual(dset.features, Features({"filename": Value("string")}))
    self.assertDictEqual(
        dset_test.features,
        Features({"filename": Value("string")}),
    )

def test_format_nested(self):
    dset = self._create_dummy_dataset()
    import numpy as np
    import tensorflow as tf
    import torch

    with tempfile.TemporaryDirectory() as tmp_dir:
        tmp_file = os.path.join(tmp_dir, "test.arrow")
        dset = dset.map(
            lambda ex: {"nested": [{"foo": np.ones(3)}] * len(ex["filename"])},
            cache_file_name=tmp_file,
            batched=True,
        )
        self.assertDictEqual(
            dset.features, Features({"filename": Value("string"), "nested": {"foo": Sequence(Value("float64"))}})
        )

        dset.set_format("tensorflow")
        self.assertIsNotNone(dset[0])
        self.assertIsInstance(dset[0]["nested"]["foo"], (tf.Tensor, tf.RaggedTensor))
        self.assertIsNotNone(dset[:2])
        self.assertIsInstance(dset[:2]["nested"][0]["foo"], (tf.Tensor, tf.RaggedTensor))
        self.assertIsInstance(dset["nested"][0]["foo"], (tf.Tensor, tf.RaggedTensor))

        dset.set_format("numpy")
        self.assertIsNotNone(dset[0])
        self.assertIsInstance(dset[0]["nested"]["foo"], np.ndarray)
        self.assertIsNotNone(dset[:2])
        self.assertIsInstance(dset[:2]["nested"][0]["foo"], np.ndarray)
        self.assertIsInstance(dset["nested"][0]["foo"], np.ndarray)

        dset.set_format("torch", columns="nested")
        self.assertIsNotNone(dset[0])
        self.assertIsInstance(dset[0]["nested"]["foo"], torch.Tensor)
        self.assertIsNotNone(dset[:2])
        self.assertIsInstance(dset[:2]["nested"][0]["foo"], torch.Tensor)
        self.assertIsInstance(dset["nested"][0]["foo"], torch.Tensor)

def test_as_dataset(self):
    with tempfile.TemporaryDirectory() as tmp_dir:
        dummy_builder = DummyBuilder(cache_dir=tmp_dir, name="dummy")
        os.makedirs(dummy_builder.cache_dir)

        dummy_builder.info.splits = SplitDict()
        dummy_builder.info.splits.add(SplitInfo("train", num_examples=10))
        dummy_builder.info.splits.add(SplitInfo("test", num_examples=10))

        for split in dummy_builder.info.splits:
            writer = ArrowWriter(
                path=os.path.join(dummy_builder.cache_dir, f"dummy_builder-{split}.arrow"),
                features=Features({"text": Value("string")}),
            )
            writer.write_batch({"text": ["foo"] * 10})
            writer.finalize()

        dsets = dummy_builder.as_dataset()
        self.assertIsInstance(dsets, DatasetDict)
        self.assertListEqual(list(dsets.keys()), ["train", "test"])
        self.assertEqual(len(dsets["train"]), 10)
        self.assertEqual(len(dsets["test"]), 10)

        dset = dummy_builder.as_dataset("train")
        self.assertIsInstance(dset, Dataset)
        self.assertEqual(dset.split, "train")
        self.assertEqual(len(dset), 10)

        dset = dummy_builder.as_dataset("train+test[:30%]")
        self.assertIsInstance(dset, Dataset)
        self.assertEqual(dset.split, "train+test[:30%]")
        self.assertEqual(len(dset), 13)

def test_shuffle(self):
    dset = self._create_dummy_dataset()
    with tempfile.TemporaryDirectory() as tmp_dir:
        tmp_file = os.path.join(tmp_dir, "test.arrow")
        dset_shuffled = dset.shuffle(seed=1234, cache_file_name=tmp_file)
        self.assertEqual(len(dset_shuffled), 30)
        self.assertEqual(dset_shuffled[0]["filename"], "my_name-train_28")
        self.assertEqual(dset_shuffled[2]["filename"], "my_name-train_10")
        self.assertDictEqual(dset.features, Features({"filename": Value("string")}))
        self.assertDictEqual(dset_shuffled.features, Features({"filename": Value("string")}))
        # Reproducibility
        tmp_file = os.path.join(tmp_dir, "test_2.arrow")
        dset_shuffled_2 = dset.shuffle(seed=1234, cache_file_name=tmp_file)
        self.assertListEqual(dset_shuffled["filename"], dset_shuffled_2["filename"])

def test_from_arrow_schema_simple(self):
    data = {"a": [{"b": {"c": "text"}}] * 10, "foo": [1] * 10}
    original_features = Features({"a": {"b": {"c": Value("string")}}, "foo": Value("int64")})
    dset = Dataset.from_dict(data, features=original_features)
    new_features = dset.features
    new_dset = Dataset.from_dict(data, features=new_features)
    self.assertEqual(original_features.type, new_features.type)
    self.assertDictEqual(dset[0], new_dset[0])
    self.assertDictEqual(dset[:], new_dset[:])

def test_map_numpy(self):
    dset = self._create_dummy_dataset()

    def func(example):
        return {"tensor": np.array([1.0, 2, 3])}

    with tempfile.TemporaryDirectory() as tmp_dir:
        tmp_file = os.path.join(tmp_dir, "test.arrow")
        dset_test = dset.map(func, cache_file_name=tmp_file)
        self.assertEqual(len(dset_test), 30)
        self.assertDictEqual(
            dset_test.features, Features({"filename": Value("string"), "tensor": Sequence(Value("float64"))})
        )
        self.assertListEqual(dset_test[0]["tensor"], [1, 2, 3])

def test_select(self):
    dset = self._create_dummy_dataset()
    # select every other example
    indices = list(range(0, len(dset), 2))
    with tempfile.TemporaryDirectory() as tmp_dir:
        tmp_file = os.path.join(tmp_dir, "test.arrow")
        dset_select_even = dset.select(indices, cache_file_name=tmp_file)
        self.assertEqual(len(dset_select_even), 15)
        for row in dset_select_even:
            self.assertEqual(int(row["filename"][-1]) % 2, 0)
        self.assertDictEqual(dset.features, Features({"filename": Value("string")}))
        self.assertDictEqual(dset_select_even.features, Features({"filename": Value("string")}))

    with tempfile.TemporaryDirectory() as tmp_dir:
        bad_indices = list(range(5))
        bad_indices[3] = "foo"  # wrong type, so the select should fail
        tmp_file = os.path.join(tmp_dir, "test.arrow")
        # a failed select should not leave a partial cache file behind
        self.assertRaises(
            Exception,
            dset.select,
            indices=bad_indices,
            cache_file_name=tmp_file,
            writer_batch_size=2,
            reader_batch_size=2,
        )
        self.assertFalse(os.path.exists(tmp_file))
        dset_select_five = dset.select(
            list(range(5)), cache_file_name=tmp_file, writer_batch_size=2, reader_batch_size=2
        )
        self.assertTrue(os.path.exists(tmp_file))
        self.assertEqual(len(dset_select_five), 5)
        for i, row in enumerate(dset_select_five):
            self.assertEqual(int(row["filename"][-1]), i)
        self.assertDictEqual(dset.features, Features({"filename": Value("string")}))
        self.assertDictEqual(dset_select_five.features, Features({"filename": Value("string")}))

def test_keep_features_after_transform_in_memory(self):
    features = Features(
        {"tokens": Sequence(Value("string")), "labels": Sequence(ClassLabel(names=["negative", "positive"]))}
    )
    dset = Dataset.from_dict({"tokens": [["foo"] * 5] * 10, "labels": [[1] * 5] * 10}, features=features)

    def invert_labels(x):
        return {"labels": [(1 - label) for label in x["labels"]]}

    inverted_dset = dset.map(invert_labels, keep_in_memory=True)
    self.assertEqual(inverted_dset.features.type, features.type)
    self.assertDictEqual(inverted_dset.features, features)

def test_new_features(self):
    dset = self._create_dummy_dataset()
    with tempfile.TemporaryDirectory() as tmp_dir:
        tmp_file = os.path.join(tmp_dir, "test.arrow")
        features = Features({"filename": Value("string"), "label": ClassLabel(names=["positive", "negative"])})
        dset_test_with_indices = dset.map(
            lambda x, i: {"label": i % 2}, with_indices=True, cache_file_name=tmp_file, features=features
        )
        self.assertEqual(len(dset_test_with_indices), 30)
        self.assertDictEqual(
            dset_test_with_indices.features,
            features,
        )

def test_download_and_prepare(self):
    with tempfile.TemporaryDirectory() as tmp_dir:
        dummy_builder = DummyBuilder(cache_dir=tmp_dir, name="dummy")
        dummy_builder.download_and_prepare(try_from_hf_gcs=False, download_mode=FORCE_REDOWNLOAD)
        self.assertTrue(
            os.path.exists(os.path.join(tmp_dir, "dummy_builder", "dummy", "0.0.0", "dummy_builder-train.arrow"))
        )
        self.assertDictEqual(dummy_builder.info.features, Features({"text": Value("string")}))
        self.assertEqual(dummy_builder.info.splits["train"].num_examples, 100)
        self.assertTrue(
            os.path.exists(os.path.join(tmp_dir, "dummy_builder", "dummy", "0.0.0", "dataset_info.json"))
        )

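# Hedged sketch of the DummyBuilder fixture assumed by test_as_dataset and
# test_download_and_prepare; the real fixture may be defined differently (e.g.
# directly over DatasetBuilder with a custom _prepare_split). The assertions
# only require a builder whose single feature is "text" and whose train split
# yields 100 examples; GeneratorBasedBuilder, DatasetInfo, SplitGenerator, and
# Split are the library's standard builder APIs.
class DummyBuilder(GeneratorBasedBuilder):
    def _info(self):
        return DatasetInfo(features=Features({"text": Value("string")}))

    def _split_generators(self, dl_manager):
        return [SplitGenerator(name=Split.TRAIN)]

    def _generate_examples(self):
        # 100 identical examples, matching the num_examples assertion above
        for i in range(100):
            yield i, {"text": "foo"}
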
def test_train_test_split(self):
    dset = self._create_dummy_dataset()
    with tempfile.TemporaryDirectory() as tmp_dir:
        tmp_file = os.path.join(tmp_dir, "test.arrow")
        tmp_file_2 = os.path.join(tmp_dir, "test_2.arrow")
        dset_dict = dset.train_test_split(
            test_size=10, shuffle=False, train_cache_file_name=tmp_file, test_cache_file_name=tmp_file_2
        )
        self.assertListEqual(list(dset_dict.keys()), ["train", "test"])
        dset_train = dset_dict["train"]
        dset_test = dset_dict["test"]

        self.assertEqual(len(dset_train), 20)
        self.assertEqual(len(dset_test), 10)
        self.assertEqual(dset_train[0]["filename"], "my_name-train_0")
        self.assertEqual(dset_train[-1]["filename"], "my_name-train_19")
        self.assertEqual(dset_test[0]["filename"], "my_name-train_20")
        self.assertEqual(dset_test[-1]["filename"], "my_name-train_29")
        self.assertDictEqual(dset.features, Features({"filename": Value("string")}))
        self.assertDictEqual(dset_train.features, Features({"filename": Value("string")}))
        self.assertDictEqual(dset_test.features, Features({"filename": Value("string")}))

        tmp_file = os.path.join(tmp_dir, "test_3.arrow")
        tmp_file_2 = os.path.join(tmp_dir, "test_4.arrow")
        dset_dict = dset.train_test_split(
            test_size=0.5, shuffle=False, train_cache_file_name=tmp_file, test_cache_file_name=tmp_file_2
        )
        self.assertListEqual(list(dset_dict.keys()), ["train", "test"])
        dset_train = dset_dict["train"]
        dset_test = dset_dict["test"]

        self.assertEqual(len(dset_train), 15)
        self.assertEqual(len(dset_test), 15)
        self.assertEqual(dset_train[0]["filename"], "my_name-train_0")
        self.assertEqual(dset_train[-1]["filename"], "my_name-train_14")
        self.assertEqual(dset_test[0]["filename"], "my_name-train_15")
        self.assertEqual(dset_test[-1]["filename"], "my_name-train_29")
        self.assertDictEqual(dset.features, Features({"filename": Value("string")}))
        self.assertDictEqual(dset_train.features, Features({"filename": Value("string")}))
        self.assertDictEqual(dset_test.features, Features({"filename": Value("string")}))

        tmp_file = os.path.join(tmp_dir, "test_5.arrow")
        tmp_file_2 = os.path.join(tmp_dir, "test_6.arrow")
        dset_dict = dset.train_test_split(
            train_size=10, shuffle=False, train_cache_file_name=tmp_file, test_cache_file_name=tmp_file_2
        )
        self.assertListEqual(list(dset_dict.keys()), ["train", "test"])
        dset_train = dset_dict["train"]
        dset_test = dset_dict["test"]

        self.assertEqual(len(dset_train), 10)
        self.assertEqual(len(dset_test), 20)
        self.assertEqual(dset_train[0]["filename"], "my_name-train_0")
        self.assertEqual(dset_train[-1]["filename"], "my_name-train_9")
        self.assertEqual(dset_test[0]["filename"], "my_name-train_10")
        self.assertEqual(dset_test[-1]["filename"], "my_name-train_29")
        self.assertDictEqual(dset.features, Features({"filename": Value("string")}))
        self.assertDictEqual(dset_train.features, Features({"filename": Value("string")}))
        self.assertDictEqual(dset_test.features, Features({"filename": Value("string")}))

        tmp_file = os.path.join(tmp_dir, "test_7.arrow")
        tmp_file_2 = os.path.join(tmp_dir, "test_8.arrow")
        dset_dict = dset.train_test_split(
            train_size=10, train_cache_file_name=tmp_file, test_cache_file_name=tmp_file_2, seed=42
        )
        self.assertListEqual(list(dset_dict.keys()), ["train", "test"])
        dset_train = dset_dict["train"]
        dset_test = dset_dict["test"]

        self.assertEqual(len(dset_train), 10)
        self.assertEqual(len(dset_test), 20)
        # shuffled split, so the examples should no longer be in order
        self.assertNotEqual(dset_train[0]["filename"], "my_name-train_0")
        self.assertNotEqual(dset_train[-1]["filename"], "my_name-train_9")
        self.assertNotEqual(dset_test[0]["filename"], "my_name-train_10")
        self.assertNotEqual(dset_test[-1]["filename"], "my_name-train_29")
        self.assertDictEqual(dset.features, Features({"filename": Value("string")}))
        self.assertDictEqual(dset_train.features, Features({"filename": Value("string")}))
        self.assertDictEqual(dset_test.features, Features({"filename": Value("string")}))