コード例 #1
0
ファイル: test_arrow_reader.py プロジェクト: yngtodd/datasets
    def test_read(self):
        """Read splits by string instruction and check the resulting datasets.

        Covers a single percent-sliced instruction (``"test[:33%]"``) and a
        list of instructions that are read one at a time. Every resulting
        dataset is checked for its first ``filename`` value, its row count
        and its column count.
        """
        name = "my_name"
        train_info = SplitInfo(name="train", num_examples=100)
        test_info = SplitInfo(name="test", num_examples=100)
        split_infos = [train_info, test_info]
        split_dict = SplitDict()
        split_dict.add(train_info)
        split_dict.add(test_info)
        info = DatasetInfo(splits=split_dict)

        with tempfile.TemporaryDirectory() as tmp_dir:
            reader = ReaderTest(tmp_dir, info)

            # A 33% slice of the 100-example test split is expected to
            # produce 33 rows.
            instructions = "test[:33%]"
            dset = Dataset(**reader.read(name, instructions, split_infos))
            self.assertEqual(dset["filename"][0], f"{name}-test")
            self.assertEqual(dset.num_rows, 33)
            self.assertEqual(dset.num_columns, 1)

            # Each instruction is read separately, yielding one dataset each.
            instructions = ["train", "test[:33%]"]
            datasets_kwargs = [
                reader.read(name, instr, split_infos) for instr in instructions
            ]
            train_dset, test_dset = [
                Dataset(**dataset_kwargs) for dataset_kwargs in datasets_kwargs
            ]
            self.assertEqual(train_dset["filename"][0], f"{name}-train")
            self.assertEqual(train_dset.num_rows, 100)
            self.assertEqual(train_dset.num_columns, 1)
            self.assertEqual(test_dset["filename"][0], f"{name}-test")
            self.assertEqual(test_dset.num_rows, 33)
            self.assertEqual(test_dset.num_columns, 1)

            # Release every dataset reference while tmp_dir still exists.
            # NOTE(review): presumably the datasets can keep files under
            # tmp_dir open, which would block directory removal on some
            # platforms -- confirm against Dataset.
            del dset, train_dset, test_dset
コード例 #2
0
ファイル: test_arrow_reader.py プロジェクト: yngtodd/datasets
    def test_read_files(self):
        """Read an explicit list of file descriptors into a single dataset.

        Two descriptors are passed to ``read_files``: a plain one for the
        train file and one for the test file carrying ``skip``/``take``
        values of 10. The test expects 110 rows, one column, and the
        descriptors to be stored unchanged on ``_data_files``.
        """
        splits = SplitDict()
        for split_name in ("train", "test"):
            splits.add(SplitInfo(name=split_name, num_examples=100))
        dataset_info = DatasetInfo(splits=splits)

        with tempfile.TemporaryDirectory() as workdir:
            file_reader = ReaderTest(workdir, dataset_info)

            train_entry = {"filename": os.path.join(workdir, "train")}
            test_entry = {
                "filename": os.path.join(workdir, "test"),
                "skip": 10,
                "take": 10,
            }
            file_specs = [train_entry, test_entry]

            # The reader's kwargs are expanded inline so that no extra
            # reference to them outlives the dataset deleted below.
            loaded = Dataset(
                **file_reader.read_files(file_specs, original_instructions="")
            )
            # NOTE(review): 110 is presumably 100 rows from the train file
            # plus the 10 taken from the test file; ReaderTest is defined
            # elsewhere, so confirm there.
            self.assertEqual(loaded.num_rows, 110)
            self.assertEqual(loaded.num_columns, 1)
            self.assertEqual(loaded._data_files, file_specs)
            del loaded
コード例 #3
0
    def test_read(self):
        """Read splits from string and object instructions and check the results.

        First reads a single percent-sliced string instruction
        (``"test[:33%]"``). Then reads the train split and the sliced test
        split twice: once from plain strings and once from ``Split.TRAIN``
        plus a ``ReadInstruction`` built with ``from_spec``. Both forms must
        yield the same filenames, row/column counts and ``NamedSplit`` values.
        """
        name = "my_name"
        train_info = SplitInfo(name="train", num_examples=100)
        test_info = SplitInfo(name="test", num_examples=100)
        split_infos = [train_info, test_info]
        split_dict = SplitDict()
        split_dict.add(train_info)
        split_dict.add(test_info)
        info = DatasetInfo(splits=split_dict)

        with tempfile.TemporaryDirectory() as tmp_dir:
            reader = ReaderTest(tmp_dir, info)

            # A 33% slice of the 100-example test split is expected to
            # produce 33 rows.
            instructions = "test[:33%]"
            dset = Dataset(**reader.read(name, instructions, split_infos))
            self.assertEqual(dset["filename"][0], f"{name}-test")
            self.assertEqual(dset.num_rows, 33)
            self.assertEqual(dset.num_columns, 1)

            # The same two reads expressed as strings and as objects.
            instructions1 = ["train", "test[:33%]"]
            instructions2 = [
                Split.TRAIN,
                ReadInstruction.from_spec("test[:33%]")
            ]
            for instructions in [instructions1, instructions2]:
                datasets_kwargs = [
                    reader.read(name, instr, split_infos)
                    for instr in instructions
                ]
                train_dset, test_dset = (Dataset(**dataset_kwargs)
                                         for dataset_kwargs in datasets_kwargs)
                self.assertEqual(train_dset["filename"][0], f"{name}-train")
                self.assertEqual(train_dset.num_rows, 100)
                self.assertEqual(train_dset.num_columns, 1)
                self.assertIsInstance(train_dset.split, NamedSplit)
                self.assertEqual(str(train_dset.split), "train")
                self.assertEqual(test_dset["filename"][0], f"{name}-test")
                self.assertEqual(test_dset.num_rows, 33)
                self.assertEqual(test_dset.num_columns, 1)
                self.assertIsInstance(test_dset.split, NamedSplit)
                self.assertEqual(str(test_dset.split), "test[:33%]")
                del train_dset, test_dset

            # Release the first dataset as well while tmp_dir still exists.
            # NOTE(review): presumably a dataset can keep files under tmp_dir
            # open, which would block directory removal on some platforms --
            # confirm against Dataset.
            del dset