Esempio n. 1
0
 def test_flatten(self):
     """Flattening a nested struct column yields dotted column names and flat features."""
     nested_row = {"b": {"c": ["text"]}}
     nested_features = Features(
         {
             "a": {"b": Sequence({"c": Value("string")})},
             "foo": Value("int64"),
         }
     )
     dset = Dataset.from_dict(
         {"a": [nested_row] * 10, "foo": [1] * 10},
         features=nested_features,
     )

     # The return value is ignored, so the assertions below rely on
     # flatten() updating `dset` itself.
     dset.flatten()

     flat_names = ["a.b.c", "foo"]
     self.assertListEqual(dset.column_names, flat_names)
     self.assertListEqual(list(dset.features.keys()), flat_names)

     flat_features = Features(
         {
             "a.b.c": Sequence(Value("string")),
             "foo": Value("int64"),
         }
     )
     self.assertDictEqual(dset.features, flat_features)
Esempio n. 2
0
    def test_keep_features_with_new_features(self):
        """map() keeps the declared features and adds an int64 sequence for a new column."""

        def base_schema():
            # Fresh feature objects on every call, so the input and expected
            # Features never share instances.
            return {
                "tokens": Sequence(Value("string")),
                "labels": Sequence(ClassLabel(names=["negative", "positive"])),
            }

        features = Features(base_schema())
        columns = {"tokens": [["foo"] * 5] * 10, "labels": [[1] * 5] * 10}
        dset = Dataset.from_dict(columns, features=features)

        def invert_labels(x):
            original = x["labels"]
            flipped = [1 - lbl for lbl in original]
            # "labels2" is a new column carrying the untouched label ids.
            return {"labels": flipped, "labels2": original}

        expected_schema = base_schema()
        expected_schema["labels2"] = Sequence(Value("int64"))
        expected_features = Features(expected_schema)

        with tempfile.TemporaryDirectory() as tmp_dir:
            cache_path = os.path.join(tmp_dir, "test.arrow")
            inverted_dset = dset.map(invert_labels, cache_file_name=cache_path)

            self.assertEqual(inverted_dset.features.type, expected_features.type)
            self.assertDictEqual(inverted_dset.features, expected_features)
Esempio n. 3
0
    def test_keep_features_after_transform_in_memory(self):
        """An in-memory map() over the labels column leaves the declared features unchanged."""
        features = Features(
            {
                "tokens": Sequence(Value("string")),
                "labels": Sequence(ClassLabel(names=["negative", "positive"])),
            }
        )
        columns = {
            "tokens": [["foo"] * 5] * 10,
            "labels": [[1] * 5] * 10,
        }
        dset = Dataset.from_dict(columns, features=features)

        def invert_labels(x):
            flipped = [1 - lbl for lbl in x["labels"]]
            return {"labels": flipped}

        inverted_dset = dset.map(invert_labels, keep_in_memory=True)

        self.assertEqual(inverted_dset.features.type, features.type)
        self.assertDictEqual(inverted_dset.features, features)
Esempio n. 4
0
    def test_format_ragged_vectors(self):
        """Check per-format output types and shapes when "vec" rows differ in length."""
        import numpy as np
        import tensorflow as tf
        import torch

        def add_ragged_vec(example, idx):
            # Row `idx` gets a vector of length 3 + idx, so lengths differ per row.
            return {"vec": np.ones(3 + idx) * idx}

        tf_types = (tf.Tensor, tf.RaggedTensor)
        dset = self._create_dummy_dataset()

        with tempfile.TemporaryDirectory() as tmp_dir:
            cache_path = os.path.join(tmp_dir, "test.arrow")
            dset = dset.map(
                add_ragged_vec,
                with_indices=True,
                cache_file_name=cache_path,
            )
            columns = dset.column_names

            # Default format: plain Python objects.
            self.assertIsNotNone(dset[0])
            self.assertIsNotNone(dset[:2])
            for name in columns:
                self.assertIsInstance(dset[0][name], (str, list))
                self.assertIsInstance(dset[:2][name], list)
            expected_features = Features(
                {
                    "filename": Value("string"),
                    "vec": Sequence(Value("float64")),
                }
            )
            self.assertDictEqual(dset.features, expected_features)

            # TensorFlow format.
            dset.set_format("tensorflow")
            self.assertIsNotNone(dset[0])
            self.assertIsNotNone(dset[:2])
            for name in columns:
                self.assertIsInstance(dset[0][name], tf_types)
                self.assertIsInstance(dset[:2][name], tf_types)
                self.assertIsInstance(dset[name], tf_types)
            # The inner dimension is None for ragged vectors in TensorFlow.
            ragged_shape = [2, None]
            self.assertListEqual(dset[:2]["vec"].shape.as_list(), ragged_shape)
            self.assertListEqual(dset["vec"][:2].shape.as_list(), ragged_shape)

            # NumPy format.
            dset.set_format("numpy")
            self.assertIsNotNone(dset[0])
            self.assertIsNotNone(dset[:2])
            for name in columns:
                self.assertIsInstance(dset[0][name], np.ndarray)
                self.assertIsInstance(dset[:2][name], np.ndarray)
                self.assertIsInstance(dset[name], np.ndarray)
            # The outer array is flat for ragged vectors in NumPy.
            self.assertEqual(dset[:2]["vec"].shape, (2,))
            self.assertEqual(dset["vec"][:2].shape, (2,))

            # PyTorch format, restricted to the numerical column because
            # torch.Tensor is only for numerical columns.
            dset.set_format("torch", columns=["vec"])
            self.assertIsNotNone(dset[0])
            self.assertIsNotNone(dset[:2])
            self.assertIsInstance(dset[0]["vec"], torch.Tensor)
            self.assertIsInstance(dset[:2]["vec"][0], torch.Tensor)
            self.assertIsInstance(dset["vec"][0], torch.Tensor)
            # PyTorch doesn't support ragged tensors, so batches come back as lists.
            self.assertIsInstance(dset[:2]["vec"], list)
            self.assertIsInstance(dset["vec"][:2], list)
Esempio n. 5
0
    def test_format_vectors(self):
        """Check per-format output types when every "vec" row has the same length (3)."""
        import numpy as np
        import tensorflow as tf
        import torch

        def add_fixed_vec(example, idx):
            # Every row gets a length-3 vector filled with its index.
            return {"vec": np.ones(3) * idx}

        tf_types = (tf.Tensor, tf.RaggedTensor)
        dset = self._create_dummy_dataset()

        with tempfile.TemporaryDirectory() as tmp_dir:
            cache_path = os.path.join(tmp_dir, "test.arrow")
            dset = dset.map(
                add_fixed_vec,
                with_indices=True,
                cache_file_name=cache_path,
            )
            columns = dset.column_names

            # Default format: plain Python objects.
            self.assertIsNotNone(dset[0])
            self.assertIsNotNone(dset[:2])
            for name in columns:
                self.assertIsInstance(dset[0][name], (str, list))
                self.assertIsInstance(dset[:2][name], list)
            expected_features = Features(
                {
                    "filename": Value("string"),
                    "vec": Sequence(Value("float64")),
                }
            )
            self.assertDictEqual(dset.features, expected_features)

            # Don't test whether torch and tensorflow outputs are stacked across
            # examples: we need to use the features definition to know at what
            # depth we have to do the conversion.

            # TensorFlow format.
            dset.set_format("tensorflow")
            self.assertIsNotNone(dset[0])
            self.assertIsNotNone(dset[:2])
            for name in columns:
                self.assertIsInstance(dset[0][name], tf_types)
                # Not stacked: inspect the first element of the batch.
                self.assertIsInstance(dset[:2][name][0], tf_types)

            # NumPy format: batches are stacked into a single array.
            dset.set_format("numpy")
            self.assertIsNotNone(dset[0])
            self.assertIsNotNone(dset[:2])
            for name in columns:
                self.assertIsInstance(dset[0][name], np.ndarray)
                self.assertIsInstance(dset[:2][name], np.ndarray)
            self.assertEqual(dset[:2]["vec"].shape, (2, 3))

            # PyTorch format, restricted to the numerical column because
            # torch.Tensor is only for numerical columns.
            dset.set_format("torch", columns=["vec"])
            self.assertIsNotNone(dset[0])
            self.assertIsNotNone(dset[:2])
            self.assertIsInstance(dset[0]["vec"], torch.Tensor)
            # Not stacked: inspect the first element of the batch.
            self.assertIsInstance(dset[:2]["vec"][0], torch.Tensor)
Esempio n. 6
0
 def test_from_arrow_schema_with_sequence(self):
     """Features rebuilt from a dataset's Arrow schema load the same data identically."""
     nested_row = {"b": {"c": ["text"]}}
     data = {"a": [nested_row] * 10, "foo": [1] * 10}
     original_features = Features(
         {
             "a": {"b": Sequence({"c": Value("string")})},
             "foo": Value("int64"),
         }
     )
     dset = Dataset.from_dict(data, features=original_features)

     # Round-trip: derive features from the Arrow schema, then reload the same data.
     new_features = Features.from_arrow_schema(dset.schema)
     new_dset = Dataset.from_dict(data, features=new_features)

     # Compare a single row first, then the whole dataset.
     self.assertDictEqual(dset[0], new_dset[0])
     self.assertDictEqual(dset[:], new_dset[:])
Esempio n. 7
0
    def test_format_nested(self):
        """Check that each format converts the array inside a nested dict column."""
        import numpy as np
        import tensorflow as tf
        import torch

        def add_nested(batch):
            # Batched map: emit one {"foo": ones(3)} entry per row of the batch.
            return {"nested": [{"foo": np.ones(3)}] * len(batch["filename"])}

        tf_types = (tf.Tensor, tf.RaggedTensor)
        dset = self._create_dummy_dataset()

        with tempfile.TemporaryDirectory() as tmp_dir:
            cache_path = os.path.join(tmp_dir, "test.arrow")
            dset = dset.map(
                add_nested,
                cache_file_name=cache_path,
                batched=True,
            )
            expected_features = Features(
                {
                    "filename": Value("string"),
                    "nested": {"foo": Sequence(Value("float64"))},
                }
            )
            self.assertDictEqual(dset.features, expected_features)

            # TensorFlow format.
            dset.set_format("tensorflow")
            self.assertIsNotNone(dset[0])
            self.assertIsInstance(dset[0]["nested"]["foo"], tf_types)
            self.assertIsNotNone(dset[:2])
            self.assertIsInstance(dset[:2]["nested"][0]["foo"], tf_types)
            self.assertIsInstance(dset["nested"][0]["foo"], tf_types)

            # NumPy format.
            dset.set_format("numpy")
            self.assertIsNotNone(dset[0])
            self.assertIsInstance(dset[0]["nested"]["foo"], np.ndarray)
            self.assertIsNotNone(dset[:2])
            self.assertIsInstance(dset[:2]["nested"][0]["foo"], np.ndarray)
            self.assertIsInstance(dset["nested"][0]["foo"], np.ndarray)

            # PyTorch format, restricted to the nested column.
            dset.set_format("torch", columns="nested")
            self.assertIsNotNone(dset[0])
            self.assertIsInstance(dset[0]["nested"]["foo"], torch.Tensor)
            self.assertIsNotNone(dset[:2])
            self.assertIsInstance(dset[:2]["nested"][0]["foo"], torch.Tensor)
            self.assertIsInstance(dset["nested"][0]["foo"], torch.Tensor)
Esempio n. 8
0
    def test_map_numpy(self):
        """A NumPy array returned from map() is stored as a float64 sequence column."""
        dset = self._create_dummy_dataset()

        def add_tensor(example):
            # The leading 1.0 makes the whole array float64.
            return {"tensor": np.array([1.0, 2, 3])}

        expected_features = Features(
            {
                "filename": Value("string"),
                "tensor": Sequence(Value("float64")),
            }
        )

        with tempfile.TemporaryDirectory() as tmp_dir:
            cache_path = os.path.join(tmp_dir, "test.arrow")
            dset_test = dset.map(add_tensor, cache_file_name=cache_path)

            self.assertEqual(len(dset_test), 30)
            self.assertDictEqual(dset_test.features, expected_features)
            first_row = dset_test[0]
            self.assertListEqual(first_row["tensor"], [1, 2, 3])