Example #1
    def _create_dummy_dataset(self, multiple_columns=False):
        if multiple_columns:
            data = {"col_1": [3, 2, 1, 0], "col_2": ["a", "b", "c", "d"]}
            dset = Dataset.from_dict(data)
        else:
            dset = Dataset.from_dict({"filename": ["my_name-train" + "_" + str(x) for x in np.arange(30).tolist()]})
        return dset
Example #2
    def test_from_dict(self):
        data = {"col_1": [3, 2, 1, 0], "col_2": ["a", "b", "c", "d"]}
        dset = Dataset.from_dict(data)
        self.assertListEqual(dset["col_1"], data["col_1"])
        self.assertListEqual(dset["col_2"], data["col_2"])
        self.assertListEqual(list(dset.features.keys()), ["col_1", "col_2"])
        self.assertDictEqual(dset.features, Features({"col_1": Value("int64"), "col_2": Value("string")}))

        features = Features({"col_1": Value("int64"), "col_2": Value("string")})
        dset = Dataset.from_dict(data, features=features)
        self.assertListEqual(dset["col_1"], data["col_1"])
        self.assertListEqual(dset["col_2"], data["col_2"])
        self.assertListEqual(list(dset.features.keys()), ["col_1", "col_2"])
        self.assertDictEqual(dset.features, Features({"col_1": Value("int64"), "col_2": Value("string")}))

        features = Features({"col_1": Value("int64"), "col_2": Value("string")})
        dset = Dataset.from_dict(data, features=features, info=DatasetInfo(features=features))
        self.assertListEqual(dset["col_1"], data["col_1"])
        self.assertListEqual(dset["col_2"], data["col_2"])
        self.assertListEqual(list(dset.features.keys()), ["col_1", "col_2"])
        self.assertDictEqual(dset.features, Features({"col_1": Value("int64"), "col_2": Value("string")}))

        features = Features({"col_1": Value("string"), "col_2": Value("string")})
        self.assertRaises(pa.ArrowTypeError, Dataset.from_dict, data, features=features)
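For reference, here is a minimal standalone sketch of the API this test exercises. It assumes the Hugging Face datasets package (earlier versions of this project shipped as nlp), and the variable names are purely illustrative:

from datasets import Dataset, Features, Value

# Build an in-memory dataset from plain Python lists, with an explicit schema.
features = Features({"col_1": Value("int64"), "col_2": Value("string")})
dset = Dataset.from_dict({"col_1": [3, 2, 1, 0], "col_2": ["a", "b", "c", "d"]}, features=features)

print(dset["col_1"])               # [3, 2, 1, 0]
print(list(dset.features.keys()))  # ['col_1', 'col_2']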
Example #3
    def test_concatenate(self):
        data1, data2, data3 = {"id": [0, 1, 2]}, {"id": [3, 4, 5]}, {"id": [6, 7]}
        info1 = DatasetInfo(description="Dataset1")
        info2 = DatasetInfo(description="Dataset2")
        dset1, dset2, dset3 = (
            Dataset.from_dict(data1, info=info1),
            Dataset.from_dict(data2, info=info2),
            Dataset.from_dict(data3),
        )

        dset_concat = concatenate_datasets([dset1, dset2, dset3])
        self.assertEqual(len(dset_concat), len(dset1) + len(dset2) + len(dset3))
        self.assertEqual(dset_concat.info.description, "Dataset1\n\nDataset2\n\n")
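As a rough standalone sketch (again assuming the datasets package; how descriptions are merged differs between releases), concatenation looks like this:

from datasets import Dataset, concatenate_datasets

dset_a = Dataset.from_dict({"id": [0, 1, 2]})
dset_b = Dataset.from_dict({"id": [3, 4]})

# Rows are appended in order and the result is a single Dataset.
combined = concatenate_datasets([dset_a, dset_b])
print(len(combined))    # 5
print(combined["id"])   # [0, 1, 2, 3, 4]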
Example #4
    def test_from_arrow_schema_with_sequence(self):
        data = {"a": [{"b": {"c": ["text"]}}] * 10, "foo": [1] * 10}
        original_features = Features({"a": {"b": Sequence({"c": Value("string")})}, "foo": Value("int64")})
        dset = Dataset.from_dict(data, features=original_features)
        new_features = Features.from_arrow_schema(dset.schema)
        new_dset = Dataset.from_dict(data, features=new_features)
        self.assertDictEqual(dset[0], new_dset[0])
        self.assertDictEqual(dset[:], new_dset[:])
Example #5
    def test_keep_features_with_new_features(self):
        features = Features(
            {"tokens": Sequence(Value("string")), "labels": Sequence(ClassLabel(names=["negative", "positive"]))}
        )
        dset = Dataset.from_dict({"tokens": [["foo"] * 5] * 10, "labels": [[1] * 5] * 10}, features=features)

        def invert_labels(x):
            return {"labels": [(1 - label) for label in x["labels"]], "labels2": x["labels"]}

        expected_features = Features(
            {
                "tokens": Sequence(Value("string")),
                "labels": Sequence(ClassLabel(names=["negative", "positive"])),
                "labels2": Sequence(Value("int64")),
            }
        )

        with tempfile.TemporaryDirectory() as tmp_dir:
            tmp_file = os.path.join(tmp_dir, "test.arrow")
            inverted_dset = dset.map(invert_labels, cache_file_name=tmp_file)
            self.assertEqual(inverted_dset.features.type, expected_features.type)
            self.assertDictEqual(inverted_dset.features, expected_features)
Example #6
    def test_flatten(self):
        dset = Dataset.from_dict(
            {"a": [{"b": {"c": ["text"]}}] * 10, "foo": [1] * 10},
            features=Features({"a": {"b": Sequence({"c": Value("string")})}, "foo": Value("int64")}),
        )
        dset.flatten()
        self.assertListEqual(dset.column_names, ["a.b.c", "foo"])
        self.assertListEqual(list(dset.features.keys()), ["a.b.c", "foo"])
        self.assertDictEqual(dset.features, Features({"a.b.c": Sequence(Value("string")), "foo": Value("int64")}))
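A hedged standalone sketch of the same flattening behaviour, assuming the datasets package; note that recent releases return a new dataset from flatten() instead of mutating in place, so the sketch rebinds the result:

from datasets import Dataset, Features, Sequence, Value

dset = Dataset.from_dict(
    {"a": [{"b": {"c": ["text"]}}] * 10, "foo": [1] * 10},
    features=Features({"a": {"b": Sequence({"c": Value("string")})}, "foo": Value("int64")}),
)

# Nested struct columns are expanded into dotted top-level columns.
flat = dset.flatten()
print(flat.column_names)  # expected: ['a.b.c', 'foo']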
Example #7
    def test_from_arrow_schema_simple(self):
        data = {"a": [{"b": {"c": "text"}}] * 10, "foo": [1] * 10}
        original_features = Features({"a": {"b": {"c": Value("string")}}, "foo": Value("int64")})
        dset = Dataset.from_dict(data, features=original_features)
        new_features = dset.features
        new_dset = Dataset.from_dict(data, features=new_features)
        self.assertEqual(original_features.type, new_features.type)
        self.assertDictEqual(dset[0], new_dset[0])
        self.assertDictEqual(dset[:], new_dset[:])
Example #8
    def test_concatenate(self):
        data1, data2, data3 = {"id": [0, 1, 2]}, {"id": [3, 4, 5]}, {"id": [6, 7]}
        dset1, dset2, dset3 = Dataset.from_dict(data1), Dataset.from_dict(data2), Dataset.from_dict(data3)
        dset1._info = DatasetInfo(description="Dataset1")
        dset2._info = DatasetInfo(description="Dataset2")
        dset3._info = None

        dset_concat = concatenate_datasets([dset1, dset2, dset3])
        self.assertEqual(len(dset_concat), len(dset1) + len(dset2) + len(dset3))
        self.assertEqual(dset_concat.info.description, "Dataset1\n\nDataset2")
Example #9
    def test_keep_features_after_transform_in_memory(self):
        features = Features(
            {"tokens": Sequence(Value("string")), "labels": Sequence(ClassLabel(names=["negative", "positive"]))}
        )
        dset = Dataset.from_dict({"tokens": [["foo"] * 5] * 10, "labels": [[1] * 5] * 10}, features=features)

        def invert_labels(x):
            return {"labels": [(1 - label) for label in x["labels"]]}

        inverted_dset = dset.map(invert_labels, keep_in_memory=True)
        self.assertEqual(inverted_dset.features.type, features.type)
        self.assertDictEqual(inverted_dset.features, features)
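For context, a minimal sketch of the map call this test covers, assuming the datasets package; invert_example is an illustrative name, not something from the test suite:

from datasets import ClassLabel, Dataset, Features, Sequence, Value

features = Features({
    "tokens": Sequence(Value("string")),
    "labels": Sequence(ClassLabel(names=["negative", "positive"])),
})
dset = Dataset.from_dict({"tokens": [["foo"] * 5] * 10, "labels": [[1] * 5] * 10}, features=features)

def invert_example(example):
    # Flip each 0/1 label; as the test above asserts, the column keeps its ClassLabel feature.
    return {"labels": [1 - label for label in example["labels"]]}

inverted = dset.map(invert_example, keep_in_memory=True)
print(inverted.features["labels"])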
Example #10
    def _create_dummy_dataset(self):
        dset = Dataset.from_dict({"filename": ["my_name-train" + "_" + str(x) for x in np.arange(30).tolist()]})
        return dset