def test_load(self):
  """load() builds the named dataset, forwarding kwargs and the download flag.

  Removes leftover debug `print()` statements that polluted test output.
  """
  name = "empty_dataset_builder/k1=1"
  data_dir = "foo"
  as_dataset_kwargs = dict(a=1, b=2)

  # EmptyDatasetBuilder returns self from as_dataset
  builder = registered.load(
      name=name,
      split=dataset_builder.Split.TEST,
      data_dir=data_dir,
      download=False,
      as_dataset_kwargs=as_dataset_kwargs)
  self.assertTrue(builder.as_dataset_called)
  self.assertFalse(builder.download_called)
  # load() injects "split" into the as_dataset kwargs; pop it so the
  # remainder can be compared against what the caller passed in.
  self.assertEqual(dataset_builder.Split.TEST,
                   builder.as_dataset_kwargs.pop("split"))
  self.assertEqual(builder.as_dataset_kwargs, as_dataset_kwargs)
  self.assertEqual(dict(data_dir=data_dir, k1=1), builder.kwargs)

  # With download=True the builder's download step must be invoked.
  builder = registered.load(
      name=name,
      split=dataset_builder.Split.TRAIN,
      data_dir=data_dir,
      download=True,
      as_dataset_kwargs=as_dataset_kwargs)
  self.assertTrue(builder.as_dataset_called)
  self.assertTrue(builder.download_called)
def test_load_from_gcs(self):
  """When local generation is impossible, load() falls back to GCS files."""
  from tensorflow_datasets.image import mnist  # pylint:disable=g-import-not-at-top
  with testing.tmp_dir(self.get_temp_dir()) as tmp_dir:
    with absltest.mock.patch.object(
        mnist.MNIST, "_download_and_prepare",
        side_effect=NotImplementedError):
      # Make sure the dataset cannot be generated.
      with self.assertRaises(NotImplementedError):
        registered.load(name="mnist", data_dir=tmp_dir)
      # Enable GCS access so that dataset will be loaded from GCS.
      with self.gcs_access():
        _, info = registered.load(
            name="mnist", data_dir=tmp_dir, with_info=True)
    # The exact set of files copied from GCS into the local data dir.
    expected_files = set(
        ["dataset_info.json",
         "image.image.json",
         "mnist-test.counts.txt-00000-of-00001",
         "mnist-test.tfrecord-00000-of-00001",
         "mnist-train.counts.txt-00000-of-00001"] +
        ["mnist-train.tfrecord-0000%d-of-00010" % i for i in range(10)])
    self.assertSetEqual(
        expected_files,
        set(tf.io.gfile.listdir(os.path.join(tmp_dir, "mnist/1.0.0"))))
    self.assertEqual(set(info.splits.keys()), set(["train", "test"]))
def test_load(self):
  """load() injects default as_dataset kwargs and forwards the rest."""
  name = "empty_dataset_builder/k1=1"
  data_dir = "foo"
  as_dataset_kwargs = dict(a=1, b=2)

  # EmptyDatasetBuilder returns self from as_dataset
  builder = registered.load(
      name=name,
      split=splits.Split.TEST,
      data_dir=data_dir,
      download=False,
      as_dataset_kwargs=as_dataset_kwargs)
  self.assertTrue(builder.as_dataset_called)
  self.assertFalse(builder.download_called)
  # Pop the kwargs load() adds itself so the remainder can be compared
  # against the caller-supplied dict.
  self.assertEqual(splits.Split.TEST, builder.as_dataset_kwargs.pop("split"))
  self.assertEqual(1, builder.as_dataset_kwargs.pop("batch_size"))
  self.assertFalse(builder.as_dataset_kwargs.pop("as_supervised"))
  self.assertEqual(builder.as_dataset_kwargs, as_dataset_kwargs)
  self.assertEqual(dict(data_dir=data_dir, k1=1, config=None), builder.kwargs)

  # Second call with download=True must trigger the download step.
  builder = registered.load(
      name,
      split=splits.Split.TRAIN,
      data_dir=data_dir,
      download=True,
      as_dataset_kwargs=as_dataset_kwargs)
  self.assertTrue(builder.as_dataset_called)
  self.assertTrue(builder.download_called)
def test_invalid_split_dataset(self):
  """A dataset declaring the reserved ALL split must fail to generate."""
  with test_utils.tmp_dir(self.get_temp_dir()) as tmp_dir:
    # Raise error during .download_and_prepare()
    with self.assertRaisesWithPredicateMatch(ValueError, "ALL is a special"):
      registered.load(name="invalid_split_dataset", data_dir=tmp_dir)
def test_show_examples(self, mock_fig):
  """show_examples() runs on both an image and an audio dataset."""
  with testing.mock_data(num_examples=20):
    # Image dataset.
    ds, ds_info = registered.load('imagenet2012', split='train',
                                  with_info=True)
    visualization.show_examples(ds_info, ds)
    # Audio dataset.
    ds, ds_info = registered.load('crema_d', split='validation',
                                  with_info=True)
    visualization.show_examples(ds_info, ds)
def test_load_with_config(self):
  """A config embedded in the name string reaches the builder kwargs."""
  data_dir = "foo"
  name = "empty_dataset_builder/bar/k1=1"
  # EmptyDatasetBuilder returns self from as_dataset
  builder = registered.load(
      name=name, split=splits.Split.TEST, data_dir=data_dir)
  expected_kwargs = dict(data_dir=data_dir, k1=1, config="bar")
  self.assertEqual(expected_kwargs, builder.kwargs)

  # Config without extra builder kwargs.
  name = "empty_dataset_builder/bar"
  builder = registered.load(
      name=name, split=splits.Split.TEST, data_dir=data_dir)
  self.assertEqual(dict(data_dir=data_dir, config="bar"), builder.kwargs)
def test_mocking_imagenet(self):
  """Mocked imagenet examples expose the expected keys and image rank."""
  with mocking.mock_data():
    ds = registered.load('imagenet2012', split='train')
    for example in ds.take(10):
      self.assertCountEqual(
          list(example.keys()), ['file_name', 'image', 'label'])
      example['image'].shape.assert_is_compatible_with((None, None, 3))
def test_read_config(self):
  """A custom ReadConfig's options and sort fn are honored by load()."""
  is_called = []  # Records each invocation of the sort fn below.

  def interleave_sort(lists):
    is_called.append(True)
    return lists

  with testing.tmp_dir(self.get_temp_dir()) as tmp_dir:
    read_config = read_config_lib.ReadConfig(
        experimental_interleave_sort_fn=interleave_sort,
    )
    read_config.options.experimental_stats.prefix = "tfds_prefix"
    ds = registered.load(
        name="dummy_dataset_shared_generator",
        data_dir=tmp_dir,
        split="train",
        read_config=read_config,
        shuffle_files=True,
    )

    # Check that the ReadConfig options are properly set
    self.assertEqual(ds.options().experimental_stats.prefix, "tfds_prefix")

    # The instruction function should have been called
    self.assertEqual(is_called, [True])
def test_determinism(self):
  """Example order must be stable across runs, versions and platforms."""
  with testing.tmp_dir(self.get_temp_dir()) as tmp_dir:
    ds = registered.load(
        name="dummy_dataset_shared_generator",
        data_dir=tmp_dir,
        split=splits_lib.Split.TRAIN,
        as_dataset_kwargs=dict(shuffle_files=False))
    ds_values = list(dataset_utils.as_numpy(ds))

    # Ensure determinism. If this test fails, it means that the numpy random
    # module isn't always deterministic (maybe between versions,
    # architectures, ...), and so our datasets aren't guaranteed either.
    l = list(range(20))
    np.random.RandomState(42).shuffle(l)
    self.assertEqual(l, [
        0, 17, 15, 1, 8, 5, 11, 3, 18, 16, 13, 2, 9, 19, 4, 12, 7, 10, 14, 6
    ])

    # Ensure determinism. If this test fails, it means the datasets are not
    # deterministically generated.
    self.assertEqual(
        [e["x"] for e in ds_values],
        [16, 1, 2, 3, 10, 17, 0, 11, 14, 7, 4, 9, 18, 15, 8, 19, 6, 13, 12, 5],
    )
def test_load(self):
  """Generates the shared dummy dataset and checks the train split size."""
  with test_utils.tmp_dir(self.get_temp_dir()) as tmp_dir:
    dataset = registered.load(
        name="dummy_dataset_shared_generator",
        data_dir=tmp_dir,
        download=True,
        split=dataset_builder.Split.TRAIN)
    self.assertEqual(20, len(list(dataset)))
def test_load(self):
  """Extra **kwargs to load() reach the builder's as_dataset untouched."""
  name = "empty_dataset_builder/k1=1"
  data_dir = "foo"
  as_dataset_kwargs = dict(a=1, b=2)

  # EmptyDatasetBuilder returns self from as_dataset
  builder = registered.load(
      name=name, data_dir=data_dir, download=False, **as_dataset_kwargs)
  self.assertTrue(builder.as_dataset_called)
  self.assertFalse(builder.download_called)
  self.assertEqual(builder.as_dataset_kwargs, as_dataset_kwargs)
  self.assertEqual(dict(data_dir=data_dir, k1=1), builder.kwargs)

  # Same call with download=True must trigger the download step.
  builder = registered.load(
      name=name, data_dir=data_dir, download=True, **as_dataset_kwargs)
  self.assertTrue(builder.as_dataset_called)
  self.assertTrue(builder.download_called)
def test_load(self):
  """Generates the configured dummy dataset and reads back its examples."""
  with testing.tmp_dir(self.get_temp_dir()) as tmp_dir:
    dataset = registered.load(
        name="dummy_dataset_with_configs",
        data_dir=tmp_dir,
        download=True,
        split=splits_lib.Split.TRAIN)
    examples = list(dataset_utils.as_numpy(dataset))
    self.assertEqual(20, len(examples))
    self.assertLess(examples[0]["x"], 30)
def test_mocking_lm1b(self):
  """Mocked lm1b/bytes yields variable-length 1-D int64 text tensors."""
  with mocking.mock_data():
    ds = registered.load('lm1b/bytes', split='train')
    expected_spec = {
        'text': tf.TensorSpec(shape=(None,), dtype=tf.int64),
    }
    self.assertEqual(ds.element_spec, expected_spec)
    for example in ds.take(10):
      self.assertEqual(example['text'].dtype, tf.int64)
      example['text'].shape.assert_is_compatible_with((None,))
def test_mocking_imagenet(self):
  """Mocked imagenet has the canonical element spec and is iterable."""
  with mocking.mock_data():
    ds = registered.load('imagenet2012', split='train')
    expected_spec = {
        'file_name': tf.TensorSpec(shape=(), dtype=tf.string),
        'image': tf.TensorSpec(shape=(None, None, 3), dtype=tf.uint8),
        'label': tf.TensorSpec(shape=(), dtype=tf.int64),
    }
    self.assertEqual(ds.element_spec, expected_spec)
    # Iteration should work
    list(ds.take(3))
def test_load(self):
  """load() fills in every default as_dataset kwarg and forwards the rest."""
  name = "empty_dataset_builder/k1=1"
  data_dir = "foo"
  as_dataset_kwargs = dict(a=1, b=2)

  # EmptyDatasetBuilder returns self from as_dataset
  builder = registered.load(
      name=name,
      split=splits.Split.TEST,
      data_dir=data_dir,
      download=False,
      as_dataset_kwargs=as_dataset_kwargs)
  self.assertTrue(builder.as_dataset_called)
  self.assertFalse(builder.download_called)
  # Pop each kwarg that load() injects itself, checking its default, so the
  # remainder can be compared against the caller-supplied dict.
  self.assertEqual(splits.Split.TEST, builder.as_dataset_kwargs.pop("split"))
  self.assertEqual(None, builder.as_dataset_kwargs.pop("batch_size"))
  self.assertFalse(builder.as_dataset_kwargs.pop("as_supervised"))
  self.assertFalse(builder.as_dataset_kwargs.pop("decoders"))
  self.assertIsNone(builder.as_dataset_kwargs.pop("in_memory"))
  self.assertIsNone(builder.as_dataset_kwargs.pop("read_config"))
  self.assertFalse(builder.as_dataset_kwargs.pop("shuffle_files"))
  self.assertEqual(builder.as_dataset_kwargs, as_dataset_kwargs)
  self.assertEqual(dict(data_dir=data_dir, k1=1), builder.kwargs)

  # download=True must trigger the download step.
  builder = registered.load(
      name,
      split=splits.Split.TRAIN,
      data_dir=data_dir,
      download=True,
      as_dataset_kwargs=as_dataset_kwargs)
  self.assertTrue(builder.as_dataset_called)
  self.assertTrue(builder.download_called)

  # Tests for different batch_size
  # By default batch_size=None
  builder = registered.load(
      name=name, split=splits.Split.TEST, data_dir=data_dir)
  self.assertEqual(None, builder.as_dataset_kwargs.pop("batch_size"))
  # Setting batch_size=1
  builder = registered.load(
      name=name, split=splits.Split.TEST, data_dir=data_dir, batch_size=1)
  self.assertEqual(1, builder.as_dataset_kwargs.pop("batch_size"))
def test_max_values(self):
  """Mocked labels stay in range and the mocked stream is deterministic."""
  with mocking.mock_data(num_examples=50):
    ds = registered.load('mnist', split='train')
    for example in ds.take(50):
      self.assertLessEqual(tf.math.reduce_max(example['label']).numpy(), 10)
    # Test determinism
    self.assertEqual(
        [example['label'].numpy() for example in ds.take(5)],
        [1, 9, 2, 5, 3],
    )
    # Iterating twice should yield the same samples
    self.assertEqual(
        [example['label'].numpy() for example in ds.take(5)],
        [1, 9, 2, 5, 3],
    )
def test_multi_split(self):
  """Passing a list of splits returns one dataset per split, in order."""
  with testing.tmp_dir(self.get_temp_dir()) as tmp_dir:
    ds_train, ds_test = registered.load(
        name="dummy_dataset_shared_generator",
        data_dir=tmp_dir,
        split=["train", "test"],
        shuffle_files=False)
    self.assertEqual(20, len(list(dataset_utils.as_numpy(ds_train))))
    self.assertEqual(10, len(list(dataset_utils.as_numpy(ds_test))))
def test_multi_split(self):
  """A list of Split values yields one dataset per split, in order."""
  with testing.tmp_dir(self.get_temp_dir()) as tmp_dir:
    ds_train, ds_test = registered.load(
        name="dummy_dataset_shared_generator",
        data_dir=tmp_dir,
        split=[splits_lib.Split.TRAIN, splits_lib.Split.TEST],
        as_dataset_kwargs=dict(shuffle_files=False))
    self.assertEqual(20, len(list(dataset_utils.as_numpy(ds_train))))
    self.assertEqual(10, len(list(dataset_utils.as_numpy(ds_test))))
def test_custom_as_dataset(self):
  """A user-supplied as_dataset_fn replaces the mocked example stream."""

  def _as_dataset(self, *args, **kwargs):  # pylint: disable=unused-argument
    # Build a fixed two-example dataset matching the builder's feature spec.
    return tf.data.Dataset.from_generator(
        lambda: ({  # pylint: disable=g-long-lambda
            'text': t,
        } for t in ['some sentence', 'some other sentence']),
        output_types=self.info.features.dtype,
        output_shapes=self.info.features.shape,
    )

  with mocking.mock_data(as_dataset_fn=_as_dataset):
    ds = registered.load('lm1b', split='train')
    # The custom examples come back (as bytes once converted to numpy).
    out = [ex['text'] for ex in dataset_utils.as_numpy(ds)]
    self.assertEqual(out, [b'some sentence', b'some other sentence'])
def test_max_values(self):
  """Mocked mnist: canonical spec, bounded labels, deterministic stream."""
  with mocking.mock_data(num_examples=50):
    ds = registered.load('mnist', split='train')
    expected_spec = {
        'image': tf.TensorSpec(shape=(28, 28, 1), dtype=tf.uint8),
        'label': tf.TensorSpec(shape=(), dtype=tf.int64),
    }
    self.assertEqual(ds.element_spec, expected_spec)
    for example in ds.take(50):
      self.assertLessEqual(tf.math.reduce_max(example['label']).numpy(), 10)
    # Test determinism
    self.assertEqual(
        [example['label'].numpy() for example in ds.take(5)],
        [1, 9, 2, 5, 3],
    )
    # Iterating twice should yield the same samples
    self.assertEqual(
        [example['label'].numpy() for example in ds.take(5)],
        [1, 9, 2, 5, 3],
    )
def test_nested_sequence(self):
  """Nested Sequence features round-trip through generation as ragged data."""
  with testing.tmp_dir(self.get_temp_dir()) as tmp_dir:
    ds_train, ds_info = registered.load(
        name="nested_sequence_builder",
        data_dir=tmp_dir,
        split="train",
        with_info=True,
        shuffle_files=False)
    ex0, ex1, ex2 = [
        ex["frames"]["coordinates"]
        for ex in dataset_utils.as_numpy(ds_train)
    ]
    self.assertAllEqual(
        ex0,
        tf.ragged.constant([
            [[0, 1], [2, 3], [4, 5]],
            [],
            [[6, 7]],
        ], inner_shape=(2,)))
    # An example with an empty outer sequence still yields a ragged value.
    self.assertAllEqual(ex1, tf.ragged.constant([], ragged_rank=1))
    self.assertAllEqual(
        ex2,
        tf.ragged.constant([
            [[10, 11]],
            [[12, 13], [14, 15]],
        ], inner_shape=(2,)))
    # The feature metadata reflects the nested (None, None, 2) structure.
    self.assertEqual(
        ds_info.features.dtype,
        {"frames": {"coordinates": tf.int32}},
    )
    self.assertEqual(
        ds_info.features.shape,
        {"frames": {"coordinates": (None, None, 2)}},
    )
    nested_tensor_info = ds_info.features.get_tensor_info()
    self.assertEqual(
        nested_tensor_info["frames"]["coordinates"].sequence_rank,
        2,
    )
def test_mocking_imagenet_decoders(self):
  """With SkipDecoding, images stay encoded but can be decoded manually."""
  with mocking.mock_data():
    ds, ds_info = registered.load(
        'imagenet2012',
        split='train',
        decoders={'image': decode.SkipDecoding()},
        with_info=True,
    )
    expected_spec = {
        'file_name': tf.TensorSpec(shape=(), dtype=tf.string),
        'image': tf.TensorSpec(shape=(), dtype=tf.string),  # Encoded images
        'label': tf.TensorSpec(shape=(), dtype=tf.int64),
    }
    self.assertEqual(ds.element_spec, expected_spec)
    for example in ds.take(10):
      # Decoding through the feature connector should still work.
      image = ds_info.features['image'].decode_example(example['image'])
      image.shape.assert_is_compatible_with((None, None, 3))
      self.assertEqual(image.dtype, tf.uint8)
def test_load_all_splits(self):
  """Omitting `split` asks the builder for everything (split=None)."""
  name = "empty_dataset_builder"
  # EmptyDatasetBuilder returns self from as_dataset
  builder = registered.load(name=name, data_dir="foo")
  self.assertTrue(builder.as_dataset_called)
  self.assertEqual(None, builder.as_dataset_kwargs.pop("split"))
def test_max_values(self):
  """Mocked mnist labels never exceed the number of classes."""
  with mocking.mock_data(num_examples=50):
    ds = registered.load('mnist', split='train')
    for example in ds.take(50):
      self.assertLessEqual(tf.math.reduce_max(example['label']).numpy(), 10)
def _as_df(ds_name: str) -> pandas.DataFrame:
  """Loads the dataset as `pandas.DataFrame`."""
  with testing.mock_data(num_examples=3):
    ds, ds_info = registered.load(ds_name, split='train', with_info=True)
    # Conversion iterates the dataset, so it stays inside the mocked context.
    return as_dataframe.as_dataframe(ds, ds_info)
def test_mocking_lm1b(self):
  """Each mocked lm1b/bytes example is a 1-D int64 tensor."""
  with mocking.mock_data():
    ds = registered.load('lm1b/bytes', split='train')
    for example in ds.take(10):
      self.assertEqual(example['text'].dtype, tf.int64)
      example['text'].shape.assert_is_compatible_with((None,))