def dataset_is_infinite(dataset):
  """True if the passed dataset is infinite."""
  if ops.executing_eagerly_outside_functions():
    return math_ops.equal(
        cardinality.cardinality(dataset), cardinality.INFINITE)
  else:
    dataset_size = K.get_session().run(cardinality.cardinality(dataset))
    return dataset_size == cardinality.INFINITE

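# A minimal standalone sketch (not part of the helper above) of the public
# tf.data API it relies on; assumes TF 2.x with eager execution. repeat()
# yields INFINITE_CARDINALITY (-1), while plain range() has a known size.
import tensorflow as tf

finite_ds = tf.data.Dataset.range(10)
infinite_ds = tf.data.Dataset.range(10).repeat()

print(tf.data.experimental.cardinality(finite_ds).numpy())    # 10
print(tf.data.experimental.cardinality(infinite_ds).numpy())  # -1 (INFINITE_CARDINALITY)
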
def testCorrectCardinality(self):
  dataset = dataset_ops.Dataset.range(10).filter(lambda x: True)
  self.assertEqual(
      self.evaluate(cardinality.cardinality(dataset)), cardinality.UNKNOWN)
  self.assertDatasetProduces(dataset, expected_output=range(10))
  dataset = dataset.apply(cardinality.assert_cardinality(10))
  self.assertEqual(self.evaluate(cardinality.cardinality(dataset)), 10)
  self.assertDatasetProduces(dataset, expected_output=range(10))

def test_on_dataset_with_unknown_cardinality_without_steps(
    self, distribution):
  with self.cached_session():
    with distribution.scope():
      model = get_model()
      optimizer = gradient_descent.GradientDescentOptimizer(0.001)
      loss = 'mse'
      metrics = ['mae', keras.metrics.CategoricalAccuracy()]
      model.compile(optimizer, loss, metrics=metrics)

      inputs = np.zeros((1000, 3), dtype=np.float32)
      targets = np.zeros((1000, 4), dtype=np.float32)
      # steps/steps_per_epoch are calculated when using numpy arrays as
      # input data.
      fit_with_numpy = model.fit(
          inputs, targets, epochs=1, batch_size=10).history
      fit_with_numpy_multiple_epochs = model.fit(
          inputs, targets, epochs=2, batch_size=10).history
      eval_with_numpy = model.evaluate(inputs, targets, batch_size=10)
      predict_with_numpy = model.predict(inputs, batch_size=10)

      dataset = convert_numpy_to_dataset_with_unknown_cardinality(
          inputs, targets)
      predict_dataset = convert_numpy_to_dataset_with_unknown_cardinality(
          inputs)

      self.assertEqual(
          keras.backend.get_value(cardinality.cardinality(dataset)),
          cardinality.UNKNOWN)
      self.assertEqual(
          keras.backend.get_value(cardinality.cardinality(predict_dataset)),
          cardinality.UNKNOWN)

      eval_with_ds = model.evaluate(dataset)
      predict_with_ds = model.predict(predict_dataset)
      self.assertAllClose(eval_with_numpy, eval_with_ds, atol=1e-4, rtol=1e-4)
      self.assertAllClose(
          predict_with_numpy, predict_with_ds, atol=1e-4, rtol=1e-4)

      if (distributed_training_utils.is_tpu_strategy(distribution) and
          distribution.extended.steps_per_run != 1):
        with self.assertRaisesRegexp(ValueError,
                                     '`steps_per_epoch` should be specified'):
          fit_with_ds = model.fit(dataset, epochs=1)
      else:
        fit_with_ds = model.fit(dataset, epochs=1).history
        fit_with_ds_multiple_epochs = model.fit(dataset, epochs=2).history
        self.assertAllClose(
            fit_with_numpy, fit_with_ds, atol=1e-4, rtol=1e-4)
        self.assertAllClose(
            fit_with_numpy_multiple_epochs, fit_with_ds_multiple_epochs,
            atol=1e-4, rtol=1e-4)

def _infer_steps(self, steps, dataset):
  """Infers steps_per_epoch needed to loop through a dataset."""
  if steps is not None:
    return steps

  adapter_steps = self._adapter.get_size()
  if adapter_steps is not None:
    return adapter_steps

  if (ds_context.get_strategy().extended._in_multi_worker_mode() and  # pylint: disable=protected-access
      (dataset.options().experimental_distribute.auto_shard_policy !=
       distribute_options.AutoShardPolicy.OFF)):
    # If the dataset would be auto-sharded, we should not infer a local
    # steps_per_epoch due to the possible imbalanced sharding between workers.
    raise ValueError("When dataset is sharded across workers, please "
                     "specify a reasonable `steps_per_epoch` such that all "
                     "workers will train the same number of steps and each "
                     "step can get data from dataset without EOF. This is "
                     "required for allreduce to succeed. We will handle the "
                     "last partial batch in the future.")

  size = cardinality.cardinality(dataset)
  if size == cardinality.INFINITE and steps is None:
    raise ValueError("When passing an infinitely repeating dataset, you "
                     "must specify how many steps to draw.")
  if size >= 0:
    return size.numpy().item()
  return None

def testRoundtripMap(self):
  dataset = dataset_ops.Dataset.range(10).map(lambda x: x * x)
  variant = dataset_ops.to_variant(dataset)
  dataset = dataset_ops.from_variant(variant,
                                     dataset_ops.get_structure(dataset))
  self.assertDatasetProduces(dataset, [x * x for x in range(10)])
  self.assertEqual(self.evaluate(cardinality.cardinality(dataset)), 10)

def _infer_steps(self, steps):
  """Infers steps_per_epoch needed to loop through a dataset."""
  if steps is not None:
    return steps

  adapter_steps = self._train_adapter.get_size()
  if adapter_steps is not None:
    return adapter_steps

  dataset = self._train_dataset
  if (ds_context.get_strategy().extended._in_multi_worker_mode() and  # pylint: disable=protected-access
      (dataset.options().experimental_distribute.auto_shard_policy !=
       distribute_options.AutoShardPolicy.OFF)):
    # If the dataset would be auto-sharded, we should not infer a local
    # steps_per_epoch due to the possible imbalanced sharding between workers.
    return None

  size = cardinality.cardinality(dataset)
  if size == cardinality.INFINITE and steps is None:
    raise ValueError("When passing an infinitely repeating dataset, you "
                     "must specify how many steps to draw.")
  if size >= 0:
    return size
  return None

def testRoundtripRange(self):
  dataset = dataset_ops.Dataset.range(10)
  variant = dataset_ops.to_variant(dataset)
  dataset = dataset_ops.from_variant(variant,
                                     dataset_ops.get_structure(dataset))
  self.assertDatasetProduces(dataset, range(10))
  self.assertEqual(self.evaluate(cardinality.cardinality(dataset)), 10)

def test_finite_dataset_unknown_cardinality_no_steps_arg(self):
  model = testing_utils.get_small_mlp(1, 4, input_dim=3)
  model.compile(
      'rmsprop',
      'mse',
      run_eagerly=testing_utils.should_run_eagerly(),
      experimental_run_tf_function=testing_utils.should_run_tf_function())

  inputs = np.zeros((100, 3), dtype=np.float32)
  targets = np.random.randint(0, 4, size=100, dtype=np.int32)
  dataset = dataset_ops.Dataset.from_tensor_slices((inputs, targets))
  dataset = dataset.filter(lambda x, y: True).batch(10)
  self.assertEqual(
      keras.backend.get_value(cardinality.cardinality(dataset)),
      cardinality.UNKNOWN)

  batch_counter = BatchCounterCallback()
  history = model.fit(dataset, epochs=2, verbose=1, callbacks=[batch_counter])

  self.assertLen(history.history['loss'], 2)
  self.assertEqual(batch_counter.batch_end_count, 20)
  model.evaluate(dataset)
  out = model.predict(dataset)
  self.assertEqual(out.shape[0], 100)

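# Hedged public-API sketch of the behavior the test above exercises with
# internal helpers: a dataset whose cardinality becomes UNKNOWN after filter()
# can be passed to fit() without steps_per_epoch, and Keras runs each epoch
# until the data is exhausted. Model and shapes here are illustrative only.
import numpy as np
import tensorflow as tf

model = tf.keras.Sequential([tf.keras.layers.Dense(4, input_shape=(3,))])
model.compile('rmsprop', 'mse')

x = np.zeros((100, 3), dtype=np.float32)
y = np.zeros((100, 4), dtype=np.float32)
ds = tf.data.Dataset.from_tensor_slices((x, y)).filter(lambda a, b: True).batch(10)

model.fit(ds, epochs=2)         # steps per epoch discovered at runtime
print(model.predict(ds).shape)  # (100, 4)
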
def test_finite_dataset_unknown_cardinality_out_of_data(self):
  model = testing_utils.get_small_mlp(1, 4, input_dim=3)
  model.compile(
      'rmsprop', 'mse', run_eagerly=testing_utils.should_run_eagerly())

  inputs = np.zeros((100, 3), dtype=np.float32)
  targets = np.random.randint(0, 4, size=100, dtype=np.int32)
  dataset = dataset_ops.Dataset.from_tensor_slices((inputs, targets))
  dataset = dataset.filter(lambda x, y: True).batch(10)
  self.assertEqual(
      keras.backend.get_value(cardinality.cardinality(dataset)),
      cardinality.UNKNOWN)

  batch_counter = BatchCounterCallback()
  with test.mock.patch.object(logging, 'warning') as mock_log:
    # steps_per_epoch (200) is greater than the dataset size (100). As this
    # is unexpected, training will stop and not make it to the second epoch.
    history = model.fit(
        dataset,
        epochs=2,
        verbose=1,
        callbacks=[batch_counter],
        steps_per_epoch=200)
    self.assertIn(
        'Your dataset ran out of data; interrupting training. '
        'Make sure that your dataset can generate at least '
        '`steps_per_epoch * epochs` batches (in this case, 400 batches). '
        'You may need to use the repeat() function when '
        'building your dataset.', str(mock_log.call_args))

  self.assertLen(history.history['loss'], 1)
  self.assertEqual(batch_counter.batch_count, 10)
  model.evaluate(dataset)
  out = model.predict(dataset)
  self.assertEqual(out.shape[0], 100)

def test_finite_dataset_unknown_cardinality_no_step_with_train_and_val(self):

  class CaptureStdout(object):

    def __enter__(self):
      self._stdout = sys.stdout
      string_io = six.StringIO()
      sys.stdout = string_io
      self._stringio = string_io
      return self

    def __exit__(self, *args):
      self.output = self._stringio.getvalue()
      sys.stdout = self._stdout

  model = testing_utils.get_small_mlp(1, 4, input_dim=3)
  model.compile(
      'rmsprop',
      'mse',
      run_eagerly=testing_utils.should_run_eagerly(),
      experimental_run_tf_function=testing_utils.should_run_tf_function())

  inputs = np.zeros((100, 3), dtype=np.float32)
  targets = np.random.randint(0, 4, size=100, dtype=np.int32)
  dataset = dataset_ops.Dataset.from_tensor_slices((inputs, targets))
  dataset = dataset.filter(lambda x, y: True).batch(10)
  self.assertEqual(
      keras.backend.get_value(cardinality.cardinality(dataset)),
      cardinality.UNKNOWN)

  batch_counter = BatchCounterCallback()
  with CaptureStdout() as capture:
    history = model.fit(
        dataset,
        epochs=2,
        callbacks=[batch_counter],
        validation_data=dataset.take(3))

  lines = capture.output.splitlines()
  self.assertIn('10/10', lines[-1])

  self.assertLen(history.history['loss'], 2)
  # The first epoch will invoke batch begin 11 times, since it doesn't know
  # the cardinality. The second epoch should just invoke 10 times.
  if (testing_utils.should_run_eagerly() or
      testing_utils.should_run_tf_function()):
    expected_batch_begin_count = 21
  else:
    expected_batch_begin_count = 20
  self.assertEqual(batch_counter.batch_begin_count, expected_batch_begin_count)
  self.assertEqual(batch_counter.batch_end_count, 20)
  model.evaluate(dataset)
  out = model.predict(dataset)
  self.assertEqual(out.shape[0], 100)

def get_size(self):
  size = cardinality.cardinality(self._dataset)
  if size == cardinality.INFINITE and self._user_steps is None:
    raise ValueError(
        "When passing an infinitely repeating tf.data.Dataset, you "
        "must specify how many steps to draw.")
  elif size == cardinality.INFINITE:
    return self._user_steps
  elif size >= 0:
    return size.numpy().item()
  # Unknown cardinality: signal that the size could not be inferred.
  return None

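# Hedged illustration, via the public Keras API, of the error path above: an
# infinitely repeating dataset passed to fit() without steps_per_epoch is
# expected to raise a ValueError (assuming TF 2.x / tf.keras; not taken from
# the snippet itself).
import numpy as np
import tensorflow as tf

model = tf.keras.Sequential([tf.keras.layers.Dense(4, input_shape=(3,))])
model.compile('rmsprop', 'mse')

x = np.zeros((10, 3), dtype=np.float32)
y = np.zeros((10, 4), dtype=np.float32)
infinite_ds = tf.data.Dataset.from_tensor_slices((x, y)).batch(5).repeat()

try:
  model.fit(infinite_ds, epochs=1)  # no steps_per_epoch given
except ValueError as e:
  print(e)
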
def should_recreate_iterator(self):
  # Since DistributedDatasets have no cardinality, the user must provide
  # all steps that need to be run, calling `.repeat()` as needed.
  if _is_distributed_dataset(self._dataset):
    return False
  # If user doesn't supply `steps`, or if they supply `steps` that
  # exactly equals the size of the `Dataset`, create a new iterator
  # each epoch.
  return (self._user_steps is None or
          cardinality.cardinality(self._dataset).numpy() == self._user_steps)

def _validate_args(self, y, sample_weights, steps):
  """Validates `__init__` arguments."""
  # Arguments that shouldn't be passed.
  if not is_none_or_empty(y):
    raise ValueError("`y` argument is not supported when using "
                     "dataset as input.")
  if not is_none_or_empty(sample_weights):
    raise ValueError("`sample_weight` argument is not supported when using "
                     "dataset as input.")

  size = cardinality.cardinality(self._dataset).numpy()
  if size == cardinality.INFINITE and steps is None:
    raise ValueError("When providing an infinite dataset, you must specify "
                     "the number of steps to run.")

def test_finite_dataset_unknown_cardinality_no_step_with_train_and_val(self):
  if testing_utils.should_run_distributed():
    self.skipTest('b/137397816')

  class CaptureStdout(object):

    def __enter__(self):
      self._stdout = sys.stdout
      string_io = six.StringIO()
      sys.stdout = string_io
      self._stringio = string_io
      return self

    def __exit__(self, *args):
      self.output = self._stringio.getvalue()
      sys.stdout = self._stdout

  model = testing_utils.get_small_mlp(1, 4, input_dim=3)
  model.compile(
      'rmsprop',
      'mse',
      run_eagerly=testing_utils.should_run_eagerly(),
      run_distributed=testing_utils.should_run_distributed())

  inputs = np.zeros((100, 3), dtype=np.float32)
  targets = np.random.randint(0, 4, size=100, dtype=np.int32)
  dataset = dataset_ops.Dataset.from_tensor_slices((inputs, targets))
  dataset = dataset.filter(lambda x, y: True).batch(10)
  self.assertEqual(
      keras.backend.get_value(cardinality.cardinality(dataset)),
      cardinality.UNKNOWN)

  batch_counter = BatchCounterCallback()
  with CaptureStdout() as capture:
    history = model.fit(
        dataset,
        epochs=2,
        callbacks=[batch_counter],
        validation_data=dataset.take(3))

  lines = capture.output.splitlines()
  self.assertIn('1/Unknown', lines[2])
  self.assertIn('10/10', lines[-1])

  self.assertLen(history.history['loss'], 2)
  self.assertEqual(batch_counter.batch_count, 20)
  model.evaluate(dataset)
  out = model.predict(dataset)
  self.assertEqual(out.shape[0], 100)

def test_finite_dataset_unknown_cardinality_no_step_with_train_and_val(self):

  class CaptureStdout(object):

    def __enter__(self):
      self._stdout = sys.stdout
      string_io = six.StringIO()
      sys.stdout = string_io
      self._stringio = string_io
      return self

    def __exit__(self, *args):
      self.output = self._stringio.getvalue()
      sys.stdout = self._stdout

  model = testing_utils.get_small_mlp(1, 4, input_dim=3)
  model.compile(
      'rmsprop', 'mse', run_eagerly=testing_utils.should_run_eagerly())

  inputs = np.zeros((100, 3), dtype=np.float32)
  targets = np.random.randint(0, 4, size=100, dtype=np.int32)
  dataset = dataset_ops.Dataset.from_tensor_slices((inputs, targets))
  dataset = dataset.filter(lambda x, y: True).batch(10)
  self.assertEqual(
      keras.backend.get_value(cardinality.cardinality(dataset)),
      cardinality.UNKNOWN)

  batch_counter = BatchCounterCallback()
  with CaptureStdout() as capture:
    history = model.fit(
        dataset,
        epochs=2,
        callbacks=[batch_counter],
        validation_data=dataset.take(3))

  lines = capture.output.splitlines()
  self.assertIn('1/Unknown', lines[2])
  self.assertIn('10/10', lines[-1])

  self.assertLen(history.history['loss'], 2)
  self.assertEqual(batch_counter.batch_count, 20)
  model.evaluate(dataset)
  out = model.predict(dataset)
  self.assertEqual(out.shape[0], 100)

def _tf_dataset_len(s):
  l = cardinality.cardinality(s)
  msg = gen_string_ops.string_join([
      'len requires dataset with definitive cardinality, got ',
      gen_string_ops.as_string(l)
  ])
  # TODO(yongtang): UNKNOWN is treated as an error.
  # In case there are more UNKNOWN cases for dataset, we could
  # use dataset.reduce() to find out the length (in an expensive way).
  with ops.control_dependencies([
      control_flow_ops.Assert(
          math_ops.logical_and(
              math_ops.not_equal(l, cardinality.INFINITE),
              math_ops.not_equal(l, cardinality.UNKNOWN)), [msg])
  ]):
    l = array_ops.identity(l)

  return l

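# Hedged sketch of the expensive fallback mentioned in the TODO above: when
# cardinality is UNKNOWN (e.g. after filter()), the length can still be
# computed eagerly by iterating the dataset once with reduce(). Names and
# values are illustrative; assumes TF 2.x eager execution.
import tensorflow as tf

ds = tf.data.Dataset.range(10).filter(lambda x: x % 2 == 0)
assert (tf.data.experimental.cardinality(ds).numpy()
        == tf.data.experimental.UNKNOWN_CARDINALITY)

length = ds.reduce(tf.constant(0, dtype=tf.int64), lambda count, _: count + 1)
print(int(length))  # 5
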
def test_finite_dataset_unknown_cardinality_no_steps_arg(self):
  model = testing_utils.get_small_mlp(1, 4, input_dim=3)
  optimizer = RMSPropOptimizer(learning_rate=0.001)
  model.compile(
      optimizer, 'mse', run_eagerly=testing_utils.should_run_eagerly())

  inputs = np.zeros((100, 3), dtype=np.float32)
  targets = np.random.randint(0, 4, size=100, dtype=np.int32)
  dataset = dataset_ops.Dataset.from_tensor_slices((inputs, targets))
  dataset = dataset.filter(lambda x, y: True).batch(10)
  self.assertEqual(
      keras.backend.get_value(cardinality.cardinality(dataset)),
      cardinality.UNKNOWN)

  history = model.fit(dataset, epochs=2, verbose=1)
  self.assertEqual(len(history.history['loss']), 2)
  model.evaluate(dataset)
  out = model.predict(dataset)
  self.assertEqual(out.shape[0], 100)

def test_unknown_cardinality_dataset_with_steps_per_epoch(self):
  ds = dataset_ops.DatasetV2.from_tensor_slices([0, 1, 2, 3, 4, 5, 6])
  filtered_ds = ds.filter(lambda x: x < 4)
  self.assertEqual(
      cardinality.cardinality(filtered_ds).numpy(), cardinality.UNKNOWN)

  # User can choose to only partially consume `Dataset`.
  data_handler = data_adapter.DataHandler(
      filtered_ds, initial_epoch=0, epochs=2, steps_per_epoch=2)
  self.assertFalse(data_handler._adapter.should_recreate_iterator())
  returned_data = []
  for _, iterator in data_handler.enumerate_epochs():
    epoch_data = []
    for _ in data_handler.steps():
      epoch_data.append(next(iterator))
    returned_data.append(epoch_data)
  returned_data = self.evaluate(returned_data)
  self.assertEqual(returned_data, [[0, 1], [2, 3]])

def test_finite_dataset_unknown_cardinality_no_steps_arg(self):
  model = testing_utils.get_small_mlp(1, 4, input_dim=3)
  optimizer = 'rmsprop'
  model.compile(
      optimizer, 'mse', run_eagerly=testing_utils.should_run_eagerly())

  inputs = np.zeros((100, 3), dtype=np.float32)
  targets = np.random.randint(0, 4, size=100, dtype=np.int32)
  dataset = dataset_ops.Dataset.from_tensor_slices((inputs, targets))
  dataset = dataset.filter(lambda x, y: True).batch(10)
  self.assertEqual(
      keras.backend.get_value(cardinality.cardinality(dataset)),
      cardinality.UNKNOWN)

  history = model.fit(dataset, epochs=2, verbose=1)
  self.assertEqual(len(history.history['loss']), 2)
  model.evaluate(dataset)
  out = model.predict(dataset)
  self.assertEqual(out.shape[0], 100)

def test_unknown_cardinality_dataset_without_steps_per_epoch(self):
  ds = dataset_ops.DatasetV2.from_tensor_slices([0, 1, 2, 3, 4, 5, 6])
  filtered_ds = ds.filter(lambda x: x < 4)
  self.assertEqual(
      cardinality.cardinality(filtered_ds).numpy(), cardinality.UNKNOWN)

  data_handler = data_adapter.DataHandler(
      filtered_ds, initial_epoch=0, epochs=2)
  self.assertTrue(data_handler._adapter.should_recreate_iterator())
  returned_data = []
  for _, iterator in data_handler.enumerate_epochs():
    epoch_data = []
    with data_handler.catch_stop_iteration():
      for _ in data_handler.steps():
        epoch_data.append(next(iterator))
    returned_data.append(epoch_data)
  returned_data = self.evaluate(returned_data)
  self.assertEqual(returned_data, [[0, 1, 2, 3], [0, 1, 2, 3]])
  self.assertEqual(data_handler._steps_per_epoch, 4)

def _validate_args(self, y, sample_weights, steps):
  """Validates `__init__` arguments."""
  # Arguments that shouldn't be passed.
  if not is_none_or_empty(y):
    raise ValueError("`y` argument is not supported when using "
                     "dataset as input.")
  if not is_none_or_empty(sample_weights):
    raise ValueError("`sample_weight` argument is not supported when using "
                     "dataset as input.")

  if steps is None:
    if _is_distributed_dataset(self._dataset):
      raise ValueError("When providing a distributed dataset, you must "
                       "specify the number of steps to run.")

    size = cardinality.cardinality(self._dataset).numpy()
    if size == cardinality.INFINITE and steps is None:
      raise ValueError(
          "When providing an infinite dataset, you must specify "
          "the number of steps to run (if you did not intend to "
          "create an infinite dataset, make sure to not call "
          "`repeat()` on the dataset).")

def _dataset_is_infinite(self, dataset):
  """True if the passed dataset is infinite."""
  dataset_size = K.get_session().run(cardinality.cardinality(dataset))
  return dataset_size == cardinality.INFINITE

def _dataset_is_infinite(self, dataset):
  """True if the passed dataset is infinite."""
  return math_ops.equal(
      cardinality.cardinality(dataset), cardinality.INFINITE)

def should_recreate_iterator(self):
  # If user doesn't supply `steps`, or if they supply `steps` that
  # exactly equals the size of the `Dataset`, create a new iterator
  # each epoch.
  return (self._user_steps is None or
          cardinality.cardinality(self._dataset).numpy() == self._user_steps)

def testCardinality(self, dataset_fn, expected_result):
  self.assertEqual(
      self.evaluate(cardinality.cardinality(dataset_fn())), expected_result)

def testNumElements(self, dataset_fn, expected_result):
  with self.cached_session() as sess:
    self.assertEqual(
        sess.run(cardinality.cardinality(dataset_fn())), expected_result)

from __future__ import absolute_import, division, print_function

# Import TensorFlow v2.
import tensorflow as tf
from tensorflow.python.data.experimental.ops import cardinality

tf.autograph.set_verbosity(10, alsologtostdout=True)
tf.config.run_functions_eagerly(True)


def add(x):
  print("debug test test")
  return x + 1


dataset = tf.data.Dataset.range(10)  # ==> [0, 1, 2, ..., 9]
dataset = dataset.map(add)
card = cardinality.cardinality(dataset)

for item in dataset:
  print(item)

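# Hedged follow-up to the script above: map() preserves cardinality, so the
# `card` tensor computed there holds 10, whereas UNKNOWN (-2) appears only for
# transformations like filter() whose output size cannot be derived. A small
# self-contained illustration (public API, names are illustrative):
import tensorflow as tf

mapped = tf.data.Dataset.range(10).map(lambda x: x + 1)
filtered = mapped.filter(lambda x: x > 5)
print(tf.data.experimental.cardinality(mapped).numpy())    # 10
print(tf.data.experimental.cardinality(filtered).numpy())  # -2 (UNKNOWN_CARDINALITY)
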