def _create_dummy_dataset(self, multiple_columns=False):
    """Build a small in-memory Dataset fixture.

    With ``multiple_columns=True``, returns a two-column dataset
    ("col_1" ints, "col_2" strings); otherwise a single "filename"
    column holding 30 generated names.
    """
    if not multiple_columns:
        filenames = ["my_name-train" + "_" + str(x) for x in np.arange(30).tolist()]
        return Dataset.from_dict({"filename": filenames})
    return Dataset.from_dict({"col_1": [3, 2, 1, 0], "col_2": ["a", "b", "c", "d"]})
def test_from_dict(self):
    """Dataset.from_dict round-trips data with inferred, explicit, and info-supplied features.

    The original repeated the same four assertions verbatim for each
    construction variant; they are factored into a local helper so each
    variant is checked identically.
    """
    data = {"col_1": [3, 2, 1, 0], "col_2": ["a", "b", "c", "d"]}

    def check(dset):
        # Content is preserved and features resolve to int64/string.
        self.assertListEqual(dset["col_1"], data["col_1"])
        self.assertListEqual(dset["col_2"], data["col_2"])
        self.assertListEqual(list(dset.features.keys()), ["col_1", "col_2"])
        self.assertDictEqual(dset.features, Features({"col_1": Value("int64"), "col_2": Value("string")}))

    # 1) features inferred from the data
    check(Dataset.from_dict(data))

    # 2) features passed explicitly
    features = Features({"col_1": Value("int64"), "col_2": Value("string")})
    check(Dataset.from_dict(data, features=features))

    # 3) features passed both directly and via DatasetInfo
    features = Features({"col_1": Value("int64"), "col_2": Value("string")})
    check(Dataset.from_dict(data, features=features, info=DatasetInfo(features=features)))

    # Mismatched feature type (string declared for an int column) must raise.
    features = Features({"col_1": Value("string"), "col_2": Value("string")})
    self.assertRaises(pa.ArrowTypeError, Dataset.from_dict, data, features=features)
def test_concatenate(self):
    """concatenate_datasets joins all rows and concatenates the info descriptions."""
    data1, data2, data3 = {"id": [0, 1, 2]}, {"id": [3, 4, 5]}, {"id": [6, 7]}
    info1 = DatasetInfo(description="Dataset1")
    info2 = DatasetInfo(description="Dataset2")
    dset1, dset2, dset3 = (
        Dataset.from_dict(data1, info=info1),
        Dataset.from_dict(data2, info=info2),
        Dataset.from_dict(data3),
    )
    dset_concat = concatenate_datasets([dset1, dset2, dset3])
    # assertEquals is a deprecated alias (removed in Python 3.12); use assertEqual.
    self.assertEqual(len(dset_concat), len(dset1) + len(dset2) + len(dset3))
    # dset3 has no description, hence the trailing separator newlines.
    self.assertEqual(dset_concat.info.description, "Dataset1\n\nDataset2\n\n")
def test_from_arrow_schema_with_sequence(self):
    """Features recovered from an Arrow schema containing a nested Sequence rebuild the same dataset."""
    data = {"a": [{"b": {"c": ["text"]}}] * 10, "foo": [1] * 10}
    original_features = Features({"a": {"b": Sequence({"c": Value("string")})}, "foo": Value("int64")})
    dset = Dataset.from_dict(data, features=original_features)
    roundtripped = Features.from_arrow_schema(dset.schema)
    rebuilt = Dataset.from_dict(data, features=roundtripped)
    self.assertDictEqual(dset[0], rebuilt[0])
    self.assertDictEqual(dset[:], rebuilt[:])
def test_keep_features_with_new_features(self):
    """map() keeps declared feature types and infers types for newly added columns."""
    features = Features(
        {"tokens": Sequence(Value("string")), "labels": Sequence(ClassLabel(names=["negative", "positive"]))}
    )
    dset = Dataset.from_dict({"tokens": [["foo"] * 5] * 10, "labels": [[1] * 5] * 10}, features=features)

    def invert_labels(x):
        # Flip each label, and also emit the originals under a brand-new key.
        return {"labels": [(1 - label) for label in x["labels"]], "labels2": x["labels"]}

    expected_features = Features(
        {
            "tokens": Sequence(Value("string")),
            "labels": Sequence(ClassLabel(names=["negative", "positive"])),
            "labels2": Sequence(Value("int64")),
        }
    )
    with tempfile.TemporaryDirectory() as tmp_dir:
        cache_path = os.path.join(tmp_dir, "test.arrow")
        mapped = dset.map(invert_labels, cache_file_name=cache_path)
        self.assertEqual(mapped.features.type, expected_features.type)
        self.assertDictEqual(mapped.features, expected_features)
def test_flatten(self):
    """flatten() expands nested struct columns into dotted top-level columns."""
    nested = Features({"a": {"b": Sequence({"c": Value("string")})}, "foo": Value("int64")})
    dset = Dataset.from_dict({"a": [{"b": {"c": ["text"]}}] * 10, "foo": [1] * 10}, features=nested)
    dset.flatten()
    expected_columns = ["a.b.c", "foo"]
    self.assertListEqual(dset.column_names, expected_columns)
    self.assertListEqual(list(dset.features.keys()), expected_columns)
    self.assertDictEqual(dset.features, Features({"a.b.c": Sequence(Value("string")), "foo": Value("int64")}))
def test_from_arrow_schema_simple(self):
    """Features taken from a dataset with plain nested dicts rebuild an identical dataset."""
    data = {"a": [{"b": {"c": "text"}}] * 10, "foo": [1] * 10}
    original_features = Features({"a": {"b": {"c": Value("string")}}, "foo": Value("int64")})
    dset = Dataset.from_dict(data, features=original_features)
    derived = dset.features
    rebuilt = Dataset.from_dict(data, features=derived)
    self.assertEqual(original_features.type, derived.type)
    self.assertDictEqual(dset[0], rebuilt[0])
    self.assertDictEqual(dset[:], rebuilt[:])
def test_concatenate(self):
    """concatenate_datasets joins rows; a dataset whose _info is None contributes no description."""
    # NOTE(review): another test_concatenate appears earlier in this file (the
    # info=... constructor variant). If both live in the same class, the later
    # definition silently shadows the earlier one — confirm and rename one.
    data1, data2, data3 = {"id": [0, 1, 2]}, {"id": [3, 4, 5]}, {"id": [6, 7]}
    dset1, dset2, dset3 = Dataset.from_dict(data1), Dataset.from_dict(data2), Dataset.from_dict(data3)
    dset1._info = DatasetInfo(description="Dataset1")
    dset2._info = DatasetInfo(description="Dataset2")
    dset3._info = None
    dset_concat = concatenate_datasets([dset1, dset2, dset3])
    # assertEquals is a deprecated alias (removed in Python 3.12); use assertEqual.
    self.assertEqual(len(dset_concat), len(dset1) + len(dset2) + len(dset3))
    # No trailing separator here because dset3 carries no info at all.
    self.assertEqual(dset_concat.info.description, "Dataset1\n\nDataset2")
def test_keep_features_after_transform_in_memory(self):
    """An in-memory map() that rewrites an existing column preserves the declared features."""
    features = Features(
        {"tokens": Sequence(Value("string")), "labels": Sequence(ClassLabel(names=["negative", "positive"]))}
    )
    dset = Dataset.from_dict({"tokens": [["foo"] * 5] * 10, "labels": [[1] * 5] * 10}, features=features)

    def invert_labels(x):
        # Flip every label in place; no new columns are introduced.
        return {"labels": [(1 - label) for label in x["labels"]]}

    mapped = dset.map(invert_labels, keep_in_memory=True)
    self.assertEqual(mapped.features.type, features.type)
    self.assertDictEqual(mapped.features, features)
def _create_dummy_dataset(self):
    """Return a 30-row, single-column Dataset of generated file names."""
    names = ["my_name-train" + "_" + str(x) for x in np.arange(30).tolist()]
    return Dataset.from_dict({"filename": names})