def test_read(self):
    """Check ``reader.read`` with a single instruction and with a list of them.

    Two 100-example splits ("train" and "test") are registered, then the
    reader is queried first with one instruction string and afterwards with
    a list of two instructions. For every returned dataset the first
    "filename" value, the row count and the column count are verified.
    """
    name = "my_name"
    split_infos = [
        SplitInfo(name="train", num_examples=100),
        SplitInfo(name="test", num_examples=100),
    ]
    splits = SplitDict()
    for split_info in split_infos:
        splits.add(split_info)
    reader = ReaderTest("", DatasetInfo(splits=splits))

    # A single instruction string is expected to yield a single dataset.
    subset = reader.read(name, "test[:33%]", split_infos)
    self.assertEqual(subset["filename"][0], f"{name}-test")
    self.assertEqual(subset.num_rows, 33)
    self.assertEqual(subset.num_columns, 1)

    # A list of instructions is unpacked into one dataset per instruction.
    train_part, test_part = reader.read(
        name, ["train", "test[:33%]"], split_infos
    )
    expectations = (
        (train_part, f"{name}-train", 100),
        (test_part, f"{name}-test", 33),
    )
    for part, expected_filename, expected_rows in expectations:
        self.assertEqual(part["filename"][0], expected_filename)
        self.assertEqual(part.num_rows, expected_rows)
        self.assertEqual(part.num_columns, 1)
def _create_dummy_dataset(self):
    """Return the dataset obtained by reading the "train" instruction.

    Registers two 30-example splits ("train" and "test") in a
    ``DatasetInfo`` and reads "train" for the dataset name "my_name".
    """
    split_infos = [
        SplitInfo(name=split_name, num_examples=30)
        for split_name in ("train", "test")
    ]
    splits = SplitDict()
    for split_info in split_infos:
        splits.add(split_info)

    # NOTE(review): the sibling tests in this file build their reader with
    # ``ReaderTest``; confirm ``ReaderTester`` is the intended class here.
    reader = ReaderTester("", DatasetInfo(splits=splits))
    return reader.read("my_name", "train", split_infos)
def test_read_files(self):
    """Check ``reader.read_files`` on an explicit list of file specs.

    Two 100-example splits are registered, then two file specs are read:
    one for "train" without bounds and one for "test" with ``skip`` and
    ``take`` both set to 10. The resulting dataset must report 110 rows,
    one column, and expose the very same specs through ``_data_files``.
    """
    registered_splits = (
        SplitInfo(name="train", num_examples=100),
        SplitInfo(name="test", num_examples=100),
    )
    splits = SplitDict()
    for split_info in registered_splits:
        splits.add(split_info)
    reader = ReaderTest("", DatasetInfo(splits=splits))

    whole_train = {"filename": "train"}
    test_window = {"filename": "test", "skip": 10, "take": 10}
    files = [whole_train, test_window]

    dset = reader.read_files(files, original_instructions="")

    # Presumably 100 rows from "train" plus the 10 taken from "test"; the
    # row production itself lives in ReaderTest, which is not shown here.
    self.assertEqual(dset.num_rows, 110)
    self.assertEqual(dset.num_columns, 1)
    self.assertEqual(dset._data_files, files)