def test_builder_as_dataset(split, expected_dataset_class, expected_dataset_length, in_memory, tmp_path):
    """Check what ``DummyBuilder.as_dataset`` returns for a given split spec.

    A ``DummyBuilder`` is created under ``tmp_path`` and one 10-row arrow file
    (a single ``"text"`` string column) is written per declared split
    ("train" and "test"). The dataset is then loaded and its type, split
    name(s), length, features and column names are verified.

    Args:
        split: Split specification forwarded to ``as_dataset``.
        expected_dataset_class: Class the returned object must be an instance of.
        expected_dataset_length: Expected number of rows of every returned dataset.
        in_memory: Forwarded to ``as_dataset``; also selects which arrow-memory
            assertion context wraps the load.
        tmp_path: Directory used as the builder cache
            (presumably the pytest ``tmp_path`` fixture -- confirm against the
            parametrization, which is not visible here).
    """
    cache_dir = str(tmp_path)
    dummy_builder = DummyBuilder(cache_dir=cache_dir, name="dummy")
    os.makedirs(dummy_builder.cache_dir)

    dummy_builder.info.splits = SplitDict()
    dummy_builder.info.splits.add(SplitInfo("train", num_examples=10))
    dummy_builder.info.splits.add(SplitInfo("test", num_examples=10))

    # Materialize one arrow file per declared split so as_dataset has data to read.
    for info_split in dummy_builder.info.splits:
        with ArrowWriter(
            path=os.path.join(dummy_builder.cache_dir, f"dummy_builder-{info_split}.arrow"),
            features=Features({"text": Value("string")}),
        ) as writer:
            writer.write_batch({"text": ["foo"] * 10})
            writer.finalize()

    with assert_arrow_memory_increases() if in_memory else assert_arrow_memory_doesnt_increase():
        dataset = dummy_builder.as_dataset(split=split, in_memory=in_memory)
        assert isinstance(dataset, expected_dataset_class)
        if isinstance(dataset, DatasetDict):
            assert list(dataset.keys()) == ["train", "test"]
            datasets = dataset.values()
            expected_splits = ["train", "test"]
        elif isinstance(dataset, Dataset):
            datasets = [dataset]
            expected_splits = [split]
        # A distinct loop name avoids rebinding the outer `dataset` result.
        for single_dataset, expected_split in zip(datasets, expected_splits):
            assert single_dataset.split == expected_split
            assert len(single_dataset) == expected_dataset_length
            assert single_dataset.features == Features({"text": Value("string")})
            # This comparison was previously a bare expression and never checked.
            assert single_dataset.column_names == ["text"]
def test_read_files(self):
    """Read an explicit list of file instructions and check the resulting dataset.

    Two file instructions are passed to ``reader.read_files``: the full
    "train" file and the "test" file with ``skip=10`` / ``take=10``. The
    test asserts that the dataset built from them has 110 rows, one column,
    and keeps the instruction list as its ``_data_files``.
    """
    split_dict = SplitDict()
    for split_name in ("train", "test"):
        split_dict.add(SplitInfo(name=split_name, num_examples=100))
    info = DatasetInfo(splits=split_dict)

    with tempfile.TemporaryDirectory() as tmp_dir:
        reader = ReaderTest(tmp_dir, info)

        whole_train_file = {"filename": os.path.join(tmp_dir, "train")}
        partial_test_file = {
            "filename": os.path.join(tmp_dir, "test"),
            "skip": 10,
            "take": 10,
        }
        files = [whole_train_file, partial_test_file]

        dataset_kwargs = reader.read_files(files, original_instructions="")
        dset = Dataset(**dataset_kwargs)
        # Drop the extra reference so `del dset` below still releases the data.
        del dataset_kwargs

        self.assertEqual(dset.num_rows, 110)
        self.assertEqual(dset.num_columns, 1)
        self.assertEqual(dset._data_files, files)
        # Release the dataset before the temporary directory is removed.
        del dset
def test_read(self):
    """Read datasets through string instructions and check their contents.

    First reads ``"test[:33%]"`` alone, then reads ``"train"`` and
    ``"test[:33%]"`` separately, asserting the first ``"filename"`` value,
    the row count and the column count of every resulting dataset.
    """
    name = "my_name"
    train_info = SplitInfo(name="train", num_examples=100)
    test_info = SplitInfo(name="test", num_examples=100)
    split_infos = [train_info, test_info]
    split_dict = SplitDict()
    for split_info in split_infos:
        split_dict.add(split_info)
    info = DatasetInfo(splits=split_dict)

    def check_dataset(dataset, first_filename, row_count):
        # Shared assertions: first filename, number of rows, single column.
        self.assertEqual(dataset["filename"][0], first_filename)
        self.assertEqual(dataset.num_rows, row_count)
        self.assertEqual(dataset.num_columns, 1)

    with tempfile.TemporaryDirectory() as tmp_dir:
        reader = ReaderTest(tmp_dir, info)

        # Single instruction.
        instructions = "test[:33%]"
        dset = Dataset(**reader.read(name, instructions, split_infos))
        check_dataset(dset, f"{name}-test", 33)

        # Several instructions, each read independently.
        instructions = ["train", "test[:33%]"]
        datasets_kwargs = []
        for instr in instructions:
            datasets_kwargs.append(reader.read(name, instr, split_infos))
        train_dset, test_dset = (Dataset(**kwargs) for kwargs in datasets_kwargs)
        check_dataset(train_dset, f"{name}-train", 100)
        check_dataset(test_dset, f"{name}-test", 33)
        # Release the datasets before the temporary directory is removed.
        del train_dset, test_dset
def test_as_dataset(self):
    """Check ``DummyBuilder.as_dataset`` for all splits, one split and a combined spec.

    One 10-row arrow file with a single ``"text"`` column is written per
    split ("train", "test"). The test then asserts on the result of
    ``as_dataset()``, ``as_dataset("train")`` and
    ``as_dataset("train+test[:30%]")``.
    """

    def text_features():
        # Build a fresh Features object for each use, as the inline code did.
        return Features({"text": Value("string")})

    with tempfile.TemporaryDirectory() as tmp_dir:
        dummy_builder = DummyBuilder(cache_dir=tmp_dir, name="dummy")
        os.makedirs(dummy_builder.cache_dir)

        dummy_builder.info.splits = SplitDict()
        for split_name in ("train", "test"):
            dummy_builder.info.splits.add(SplitInfo(split_name, num_examples=10))

        for split in dummy_builder.info.splits:
            arrow_path = os.path.join(dummy_builder.cache_dir, f"dummy_builder-{split}.arrow")
            writer = ArrowWriter(path=arrow_path, features=text_features())
            writer.write_batch({"text": ["foo"] * 10})
            writer.finalize()

        # No split argument: a DatasetDict holding every split.
        dsets = dummy_builder.as_dataset()
        self.assertIsInstance(dsets, DatasetDict)
        self.assertListEqual(list(dsets.keys()), ["train", "test"])
        for split_name in ("train", "test"):
            self.assertEqual(len(dsets[split_name]), 10)
        for split_name in ("train", "test"):
            self.assertDictEqual(dsets[split_name].features, text_features())
        for split_name in ("train", "test"):
            self.assertListEqual(dsets[split_name].column_names, ["text"])
        del dsets

        # Explicit split specs: a single Dataset with the expected row count.
        for split_spec, row_count in (("train", 10), ("train+test[:30%]", 13)):
            dset = dummy_builder.as_dataset(split_spec)
            self.assertIsInstance(dset, Dataset)
            self.assertEqual(dset.split, split_spec)
            self.assertEqual(len(dset), row_count)
            self.assertDictEqual(dset.features, text_features())
            self.assertListEqual(dset.column_names, ["text"])
            del dset
def test_read(self):
    """Read datasets through string and object instructions and check them.

    ``"test[:33%]"`` is read alone first. Then the pair "train" /
    "test[:33%]" is read twice: once as plain strings, and once as
    ``Split.TRAIN`` / ``ReadInstruction.from_spec("test[:33%]")``. Both forms
    must give the same filenames, row counts, column counts and
    ``NamedSplit`` values.
    """
    name = "my_name"
    train_info = SplitInfo(name="train", num_examples=100)
    test_info = SplitInfo(name="test", num_examples=100)
    split_infos = [train_info, test_info]
    split_dict = SplitDict()
    for split_info in split_infos:
        split_dict.add(split_info)
    info = DatasetInfo(splits=split_dict)

    def check_dataset(dataset, first_filename, row_count, split_text=None):
        # Content checks first, then (optionally) the split attached to the dataset.
        self.assertEqual(dataset["filename"][0], first_filename)
        self.assertEqual(dataset.num_rows, row_count)
        self.assertEqual(dataset.num_columns, 1)
        if split_text is not None:
            self.assertIsInstance(dataset.split, NamedSplit)
            self.assertEqual(str(dataset.split), split_text)

    with tempfile.TemporaryDirectory() as tmp_dir:
        reader = ReaderTest(tmp_dir, info)

        instructions = "test[:33%]"
        dset = Dataset(**reader.read(name, instructions, split_infos))
        check_dataset(dset, f"{name}-test", 33)

        instructions1 = ["train", "test[:33%]"]
        instructions2 = [Split.TRAIN, ReadInstruction.from_spec("test[:33%]")]
        for instructions in (instructions1, instructions2):
            datasets_kwargs = []
            for instr in instructions:
                datasets_kwargs.append(reader.read(name, instr, split_infos))
            train_dset, test_dset = [Dataset(**kwargs) for kwargs in datasets_kwargs]
            check_dataset(train_dset, f"{name}-train", 100, split_text="train")
            check_dataset(test_dset, f"{name}-test", 33, split_text="test[:33%]")
            # Release the datasets before the next round / directory cleanup.
            del train_dset, test_dset
def test_as_dataset_with_post_process_with_index(self):
    """Check ``as_dataset`` when post-processing attaches a faiss index.

    The builder's ``_post_process`` is replaced by a function that loads the
    index ``"my_index"`` from the resource path when that file exists, and
    otherwise builds it from an all-ones array and saves it. The test asserts
    that every returned dataset lists ``"my_index"`` and that the builder
    info records a positive post-processing size.
    """

    def _post_process(self, dataset, resources_paths):
        index_path = resources_paths["index"]
        if not os.path.exists(index_path):
            # No saved index yet: build one and persist it for later calls.
            dataset.add_faiss_index_from_external_arrays(
                external_arrays=np.ones((len(dataset), 8)),
                string_factory="Flat",
                index_name="my_index",
            )
            dataset.save_faiss_index("my_index", index_path)
        else:
            dataset.load_faiss_index("my_index", index_path)
        return dataset

    def _post_processing_resources(self, split):
        resources = {"index": "Flat-{split}.faiss".format(split=split)}
        return resources

    def text_features():
        # Build a fresh Features object for each use, as the inline code did.
        return Features({"text": Value("string")})

    with tempfile.TemporaryDirectory() as tmp_dir:
        dummy_builder = DummyBuilder(cache_dir=tmp_dir, name="dummy")
        dummy_builder._post_process = types.MethodType(_post_process, dummy_builder)
        dummy_builder._post_processing_resources = types.MethodType(
            _post_processing_resources, dummy_builder
        )
        os.makedirs(dummy_builder.cache_dir)

        dummy_builder.info.splits = SplitDict()
        for split_name in ("train", "test"):
            dummy_builder.info.splits.add(SplitInfo(split_name, num_examples=10))

        for split in dummy_builder.info.splits:
            # Main 10-row file for the split.
            main_path = os.path.join(dummy_builder.cache_dir, f"dummy_builder-{split}.arrow")
            writer = ArrowWriter(path=main_path, features=text_features())
            writer.write_batch({"text": ["foo"] * 10})
            writer.finalize()

            # Additional 2-row file written next to it.
            small_path = os.path.join(dummy_builder.cache_dir, f"small_dataset-{split}.arrow")
            writer = ArrowWriter(path=small_path, features=text_features())
            writer.write_batch({"text": ["foo"] * 2})
            writer.finalize()

        # No split argument: a DatasetDict holding every split.
        dsets = dummy_builder.as_dataset()
        self.assertIsInstance(dsets, DatasetDict)
        self.assertListEqual(list(dsets.keys()), ["train", "test"])
        for split_name in ("train", "test"):
            self.assertEqual(len(dsets[split_name]), 10)
        for split_name in ("train", "test"):
            self.assertDictEqual(dsets[split_name].features, text_features())
        for split_name in ("train", "test"):
            self.assertListEqual(dsets[split_name].column_names, ["text"])
        for split_name in ("train", "test"):
            self.assertListEqual(dsets[split_name].list_indexes(), ["my_index"])
        self.assertGreater(dummy_builder.info.post_processing_size, 0)
        train_checksums = dummy_builder.info.post_processed.resources_checksums["train"]
        self.assertGreater(train_checksums["index"]["num_bytes"], 0)
        del dsets

        # Explicit split specs: a single Dataset that also carries the index.
        for split_spec, row_count in (("train", 10), ("train+test[:30%]", 13)):
            dset = dummy_builder.as_dataset(split_spec)
            self.assertIsInstance(dset, Dataset)
            self.assertEqual(dset.split, split_spec)
            self.assertEqual(len(dset), row_count)
            self.assertDictEqual(dset.features, text_features())
            self.assertListEqual(dset.column_names, ["text"])
            self.assertListEqual(dset.list_indexes(), ["my_index"])
            del dset
def test_as_dataset_with_post_process(self):
    """Check ``as_dataset`` with two different ``_post_process`` overrides.

    Scenario 1: post-processing maps a character tokenizer over the dataset,
    caching to the ``"tokenized_dataset"`` resource path; results must expose
    both ``"text"`` and ``"tokens"`` columns.

    Scenario 2: post-processing selects rows 0 and 1 in memory; every
    returned dataset must have exactly 2 rows and only the ``"text"`` column.
    """

    def text_features():
        # Build a fresh Features object for each use, as the inline code did.
        return Features({"text": Value("string")})

    def tokenized_features():
        return Features({"text": Value("string"), "tokens": [Value("string")]})

    # ---- Scenario 1: tokenizing post-process backed by a resource file ----

    def _post_process(self, dataset, resources_paths):
        def char_tokenize(example):
            return {"tokens": list(example["text"])}

        cache_file = resources_paths["tokenized_dataset"]
        return dataset.map(char_tokenize, cache_file_name=cache_file)

    def _post_processing_resources(self, split):
        resources = {"tokenized_dataset": "tokenized_dataset-{split}.arrow".format(split=split)}
        return resources

    with tempfile.TemporaryDirectory() as tmp_dir:
        dummy_builder = DummyBuilder(cache_dir=tmp_dir, name="dummy")
        dummy_builder.info.post_processed = PostProcessedInfo(features=tokenized_features())
        dummy_builder._post_process = types.MethodType(_post_process, dummy_builder)
        dummy_builder._post_processing_resources = types.MethodType(
            _post_processing_resources, dummy_builder
        )
        os.makedirs(dummy_builder.cache_dir)

        dummy_builder.info.splits = SplitDict()
        for split_name in ("train", "test"):
            dummy_builder.info.splits.add(SplitInfo(split_name, num_examples=10))

        for split in dummy_builder.info.splits:
            # Raw 10-row file for the split.
            raw_path = os.path.join(dummy_builder.cache_dir, f"dummy_builder-{split}.arrow")
            writer = ArrowWriter(path=raw_path, features=text_features())
            writer.write_batch({"text": ["foo"] * 10})
            writer.finalize()

            # Pre-tokenized file at the post-processing resource location.
            tokenized_path = os.path.join(dummy_builder.cache_dir, f"tokenized_dataset-{split}.arrow")
            writer = ArrowWriter(path=tokenized_path, features=tokenized_features())
            writer.write_batch({"text": ["foo"] * 10, "tokens": [list("foo")] * 10})
            writer.finalize()

        dsets = dummy_builder.as_dataset()
        self.assertIsInstance(dsets, DatasetDict)
        self.assertListEqual(list(dsets.keys()), ["train", "test"])
        for split_name in ("train", "test"):
            self.assertEqual(len(dsets[split_name]), 10)
        for split_name in ("train", "test"):
            self.assertDictEqual(dsets[split_name].features, tokenized_features())
        for split_name in ("train", "test"):
            self.assertListEqual(dsets[split_name].column_names, ["text", "tokens"])
        del dsets

        dset = dummy_builder.as_dataset("train")
        self.assertIsInstance(dset, Dataset)
        self.assertEqual(dset.split, "train")
        self.assertEqual(len(dset), 10)
        self.assertDictEqual(dset.features, tokenized_features())
        self.assertListEqual(dset.column_names, ["text", "tokens"])
        self.assertGreater(dummy_builder.info.post_processing_size, 0)
        train_checksums = dummy_builder.info.post_processed.resources_checksums["train"]
        self.assertGreater(train_checksums["tokenized_dataset"]["num_bytes"], 0)
        del dset

        dset = dummy_builder.as_dataset("train+test[:30%]")
        self.assertIsInstance(dset, Dataset)
        self.assertEqual(dset.split, "train+test[:30%]")
        self.assertEqual(len(dset), 13)
        self.assertDictEqual(dset.features, tokenized_features())
        self.assertListEqual(dset.column_names, ["text", "tokens"])
        del dset

    # ---- Scenario 2: post-process that keeps only the first two rows ----

    def _post_process(self, dataset, resources_paths):
        first_two_rows = [0, 1]
        return dataset.select(first_two_rows, keep_in_memory=True)

    with tempfile.TemporaryDirectory() as tmp_dir:
        dummy_builder = DummyBuilder(cache_dir=tmp_dir, name="dummy")
        dummy_builder._post_process = types.MethodType(_post_process, dummy_builder)
        os.makedirs(dummy_builder.cache_dir)

        dummy_builder.info.splits = SplitDict()
        for split_name in ("train", "test"):
            dummy_builder.info.splits.add(SplitInfo(split_name, num_examples=10))

        for split in dummy_builder.info.splits:
            raw_path = os.path.join(dummy_builder.cache_dir, f"dummy_builder-{split}.arrow")
            writer = ArrowWriter(path=raw_path, features=text_features())
            writer.write_batch({"text": ["foo"] * 10})
            writer.finalize()

            small_path = os.path.join(dummy_builder.cache_dir, f"small_dataset-{split}.arrow")
            writer = ArrowWriter(path=small_path, features=text_features())
            writer.write_batch({"text": ["foo"] * 2})
            writer.finalize()

        dsets = dummy_builder.as_dataset()
        self.assertIsInstance(dsets, DatasetDict)
        self.assertListEqual(list(dsets.keys()), ["train", "test"])
        for split_name in ("train", "test"):
            self.assertEqual(len(dsets[split_name]), 2)
        for split_name in ("train", "test"):
            self.assertDictEqual(dsets[split_name].features, text_features())
        for split_name in ("train", "test"):
            self.assertListEqual(dsets[split_name].column_names, ["text"])
        del dsets

        # Every split spec ends up with the two selected rows.
        for split_spec in ("train", "train+test[:30%]"):
            dset = dummy_builder.as_dataset(split_spec)
            self.assertIsInstance(dset, Dataset)
            self.assertEqual(dset.split, split_spec)
            self.assertEqual(len(dset), 2)
            self.assertDictEqual(dset.features, text_features())
            self.assertListEqual(dset.column_names, ["text"])
            del dset