Ejemplo n.º 1
0
    def test_concatenate(self):
        """Check the length and description of three concatenated datasets.

        The first two datasets carry an explicit ``DatasetInfo`` description;
        the third is built without one.
        """
        data1, data2, data3 = {"id": [0, 1, 2]}, {"id": [3, 4, 5]}, {"id": [6, 7]}
        info1 = DatasetInfo(description="Dataset1")
        info2 = DatasetInfo(description="Dataset2")
        dset1, dset2, dset3 = (
            Dataset.from_dict(data1, info=info1),
            Dataset.from_dict(data2, info=info2),
            Dataset.from_dict(data3),  # no info argument for the third dataset
        )

        dset_concat = concatenate_datasets([dset1, dset2, dset3])
        # assertEqual is used instead of the deprecated assertEquals alias,
        # which no longer exists on TestCase as of Python 3.12.
        self.assertEqual(len(dset_concat), len(dset1) + len(dset2) + len(dset3))
        self.assertEqual(dset_concat.info.description, "Dataset1\n\nDataset2\n\n")
    def test_read(self):
        """Read one sliced split, then a list of splits, and check every result.

        Each returned dataset is checked for the first "filename" value, its
        row count and its column count.
        """
        name = "my_name"
        train_split = SplitInfo(name="train", num_examples=100)
        test_split = SplitInfo(name="test", num_examples=100)
        all_splits = [train_split, test_split]
        registry = SplitDict()
        for split in all_splits:
            registry.add(split)
        reader = ReaderTest("", DatasetInfo(splits=registry))

        # A single instruction string gives back a single dataset.
        sliced = reader.read(name, "test[:33%]", all_splits)
        self.assertEqual(sliced["filename"][0], f"{name}-test")
        self.assertEqual(sliced.num_rows, 33)
        self.assertEqual(sliced.num_columns, 1)

        # A list of two instructions is unpacked into two datasets.
        train_part, test_part = reader.read(name, ["train", "test[:33%]"], all_splits)
        self.assertEqual(train_part["filename"][0], f"{name}-train")
        self.assertEqual(train_part.num_rows, 100)
        self.assertEqual(train_part.num_columns, 1)
        self.assertEqual(test_part["filename"][0], f"{name}-test")
        self.assertEqual(test_part.num_rows, 33)
        self.assertEqual(test_part.num_columns, 1)
Ejemplo n.º 3
0
    def test_from_pandas(self):
        """Build datasets from a DataFrame and compare columns and features.

        Three construction variants are exercised: no explicit features,
        explicit features, and explicit features plus a ``DatasetInfo``.
        A final case passes features that declare ``col_1`` as a string and
        expects ``pa.ArrowTypeError``.
        """
        data = {"col_1": [3, 2, 1, 0], "col_2": ["a", "b", "c", "d"]}
        frame = pd.DataFrame.from_dict(data)

        def check_matches_source(dset):
            # Column contents, column order and feature types must all match.
            self.assertListEqual(dset["col_1"], data["col_1"])
            self.assertListEqual(dset["col_2"], data["col_2"])
            self.assertListEqual(list(dset.features.keys()), ["col_1", "col_2"])
            self.assertDictEqual(dset.features, Features({"col_1": Value("int64"), "col_2": Value("string")}))

        # Variant 1: no features argument.
        check_matches_source(Dataset.from_pandas(frame))

        # Variant 2: explicit features.
        schema = Features({"col_1": Value("int64"), "col_2": Value("string")})
        check_matches_source(Dataset.from_pandas(frame, features=schema))

        # Variant 3: explicit features together with a DatasetInfo built from them.
        schema = Features({"col_1": Value("int64"), "col_2": Value("string")})
        check_matches_source(Dataset.from_pandas(frame, features=schema, info=DatasetInfo(features=schema)))

        # Features that declare col_1 as a string are expected to raise.
        mismatched = Features({"col_1": Value("string"), "col_2": Value("string")})
        with self.assertRaises(pa.ArrowTypeError):
            Dataset.from_pandas(frame, features=mismatched)
Ejemplo n.º 4
0
    def test_concatenate(self):
        """Check the length and description of three concatenated datasets.

        Descriptions are attached by assigning ``_info`` directly; the third
        dataset has its ``_info`` set to ``None``.
        """
        data1, data2, data3 = {"id": [0, 1, 2]}, {"id": [3, 4, 5]}, {"id": [6, 7]}
        dset1 = Dataset.from_dict(data1)
        dset2 = Dataset.from_dict(data2)
        dset3 = Dataset.from_dict(data3)
        dset1._info = DatasetInfo(description="Dataset1")
        dset2._info = DatasetInfo(description="Dataset2")
        dset3._info = None

        dset_concat = concatenate_datasets([dset1, dset2, dset3])
        # assertEqual is used instead of the deprecated assertEquals alias,
        # which no longer exists on TestCase as of Python 3.12.
        self.assertEqual(len(dset_concat), len(dset1) + len(dset2) + len(dset3))
        self.assertEqual(dset_concat.info.description, "Dataset1\n\nDataset2")
Ejemplo n.º 5
0
 def _create_dummy_dataset(self):
     """Return the result of reading the "train" split through a ReaderTester.

     The reader is set up with two splits, "train" and "test", each declared
     with 30 examples.
     """
     train_split = SplitInfo(name="train", num_examples=30)
     test_split = SplitInfo(name="test", num_examples=30)
     all_splits = [train_split, test_split]
     registry = SplitDict()
     for split in all_splits:
         registry.add(split)
     reader = ReaderTester("", DatasetInfo(splits=registry))
     return reader.read("my_name", "train", all_splits)
Ejemplo n.º 6
0
    def test_read_files(self):
        """Read two file instructions and check the row, column and file bookkeeping."""
        registry = SplitDict()
        # Both SplitInfo objects are created before either is registered.
        for split in (
            SplitInfo(name="train", num_examples=100),
            SplitInfo(name="test", num_examples=100),
        ):
            registry.add(split)
        reader = ReaderTest("", DatasetInfo(splits=registry))

        file_instructions = [
            {"filename": "train"},
            {"filename": "test", "skip": 10, "take": 10},
        ]
        dset = reader.read_files(file_instructions, original_instructions="")
        # Presumably 100 rows from "train" plus the 10 taken from "test" -- confirm against ReaderTest.
        self.assertEqual(dset.num_rows, 110)
        self.assertEqual(dset.num_columns, 1)
        self.assertEqual(dset._data_files, file_instructions)
Ejemplo n.º 7
0
 def _info(self):
     """Return a DatasetInfo with a single "text" feature declared as Value("string")."""
     text_features = Features({"text": Value("string")})
     return DatasetInfo(features=text_features)