def testOneShotIteratorInitializerFails(self): # Define a dataset whose initialization will always fail. dataset = dataset_ops.Dataset.from_tensors( array_ops.check_numerics( constant_op.constant(1.0) / constant_op.constant(0.0), "oops")) iterator = dataset.make_one_shot_iterator() next_element = iterator.get_next() with self.test_session() as sess: with self.assertRaisesRegexp(errors.InvalidArgumentError, "oops"): sess.run(next_element) # Test that subsequent attempts to use the iterator also fail. with self.assertRaisesRegexp(errors.InvalidArgumentError, "oops"): sess.run(next_element) with self.test_session() as sess: def consumer_thread(): with self.assertRaisesRegexp(errors.InvalidArgumentError, "oops"): sess.run(next_element) num_threads = 8 threads = [ self.checkedThread(consumer_thread) for _ in range(num_threads) ] for t in threads: t.start() for t in threads: t.join()
def add_check_numerics_ops(): """Connect a `check_numerics` to every floating point tensor. `check_numerics` operations themselves are added for each `half`, `float`, or `double` tensor in the graph. For all ops in the graph, the `check_numerics` op for all of its (`half`, `float`, or `double`) inputs is guaranteed to run before the `check_numerics` op on any of its outputs. Returns: A `group` op depending on all `check_numerics` ops added. """ check_op = [] # This code relies on the ordering of ops in get_operations(). # The producer of a tensor always comes before that tensor's consumer in # this list. This is true because get_operations() returns ops in the order # added, and an op can only be added after its inputs are added. for op in ops.get_default_graph().get_operations(): for output in op.outputs: if output.dtype in [ dtypes.float32, dtypes.float64 ] and output.op._get_control_flow_context( ) == ops.get_default_graph()._get_control_flow_context(): message = op.name + ":" + str(output.value_index) with ops.control_dependencies(check_op): check_op = [ array_ops.check_numerics(output, message=message) ] return control_flow_ops.group(*check_op)
def testPassThrough(self): with self.session(graph=ops.Graph()): t1 = constant_op.constant([1.0, 2.0, 3.0, 4.0, 5.0, 6.0], shape=[2, 3]) checked = array_ops.check_numerics(t1, message="pass through test") value = self.evaluate(checked) self.assertAllEqual(np.array([[1.0, 2.0, 3.0], [4.0, 5.0, 6.0]]), value) self.assertEqual([2, 3], checked.get_shape())
def testOneShotIteratorInitializerFails(self): # Define a dataset whose initialization will always fail. dataset = dataset_ops.Dataset.from_tensors( array_ops.check_numerics( constant_op.constant(1.0) / constant_op.constant(0.0), "oops")) iterator = dataset.make_one_shot_iterator() next_element = iterator.get_next() with self.test_session() as sess: with self.assertRaisesRegexp(errors.InvalidArgumentError, "oops"): sess.run(next_element) # Test that subsequent attempts to use the iterator also fail. with self.assertRaisesRegexp(errors.InvalidArgumentError, "oops"): sess.run(next_element) with self.test_session() as sess: def consumer_thread(): with self.assertRaisesRegexp(errors.InvalidArgumentError, "oops"): sess.run(next_element) num_threads = 8 threads = [ self.checkedThread(consumer_thread) for _ in range(num_threads)] for t in threads: t.start() for t in threads: t.join()
def testPassThrough(self): with self.session(graph=ops.Graph()): t1 = constant_op.constant([1.0, 2.0, 3.0, 4.0, 5.0, 6.0], shape=[2, 3]) checked = array_ops.check_numerics(t1, message="pass through test") value = checked.eval() self.assertAllEqual(np.array([[1.0, 2.0, 3.0], [4.0, 5.0, 6.0]]), value) self.assertEqual([2, 3], checked.get_shape())
def testWindowIgnoreErrors(self): input_values = np.float32([1., np.nan, 2., np.nan, 3.]) dataset = dataset_ops.Dataset.from_tensor_slices(input_values).map( lambda x: array_ops.check_numerics(x, "message")).window( size=2, shift=2, stride=2, drop_remainder=True).flat_map(lambda x: x.batch(batch_size=2)) self.assertDatasetProduces( dataset, expected_output=[np.float32([1., 2.]), np.float32([2., 3.])])
def testParallelMapUnspecifiedOutputSize(self): components = np.array([1., 2., 3., np.nan, 5.]).astype(np.float32) dataset = (dataset_ops.Dataset.from_tensor_slices(components).map( lambda x: array_ops.check_numerics(x, "message"), num_parallel_calls=2)) get_next = self.getNext(dataset) for _ in range(3): self.evaluate(get_next())
def testParallelMapUnspecifiedOutputSize(self): components = np.array([1., 2., 3., np.nan, 5.]).astype(np.float32) dataset = (dataset_ops.Dataset.from_tensor_slices(components) .map(lambda x: array_ops.check_numerics(x, "message"), num_parallel_calls=2)) get_next = self.getNext(dataset) for _ in range(3): self.evaluate(get_next())
def _build_ds(self): components = np.array([1., 2., 3., np.nan, 5.]).astype(np.float32) dataset = dataset_ops.Dataset.from_tensor_slices(components) dataset = dataset.map(lambda x: array_ops.check_numerics(x, "message")) dataset = dataset.apply(error_ops.ignore_errors()) options = options_lib.Options() options.experimental_external_state_policy = ( options_lib.ExternalStatePolicy.IGNORE) return dataset.with_options(options)
def testMapAndBatchFails(self): """Test a dataset that maps a TF function across its input elements.""" with self.assertRaisesRegex(errors.InvalidArgumentError, "oops"): dataset = dataset_ops.Dataset.from_tensors( array_ops.check_numerics( constant_op.constant(1.0) / constant_op.constant(0.0), "oops")) dataset = dataset.apply(batching.map_and_batch(lambda x: x, 14)) get_next = self.getNext(dataset, requires_initialization=True) self.evaluate(get_next())
def add_check_numerics_ops(): """Connect a `check_numerics` to every floating point tensor. `check_numerics` operations themselves are added for each `half`, `float`, or `double` tensor in the graph. For all ops in the graph, the `check_numerics` op for all of its (`half`, `float`, or `double`) inputs is guaranteed to run before the `check_numerics` op on any of its outputs. Note: This API is not compatible with the use of `tf.cond` or `tf.while_loop`, and will raise a `ValueError` if you attempt to call it in such a graph. Returns: A `group` op depending on all `check_numerics` ops added. Raises: ValueError: If the graph contains any numeric operations in a control flow structure. RuntimeError: If called with eager execution enabled. @compatibility(eager) Not compatible with eager execution. To check for `Inf`s and `NaN`s under eager execution, call tfe.seterr(inf_or_nan='raise') once before executing the checked operations. @enc_compatibility """ if context.executing_eagerly(): raise RuntimeError( "add_check_numerics_ops() is not compatible with eager execution. " "To check for Inf's and NaN's under eager execution, call " "tfe.seterr(inf_or_nan='raise') once before executing the " "checked operations.") check_op = [] # This code relies on the ordering of ops in get_operations(). # The producer of a tensor always comes before that tensor's consumer in # this list. This is true because get_operations() returns ops in the order # added, and an op can only be added after its inputs are added. for op in ops.get_default_graph().get_operations(): for output in op.outputs: if output.dtype in [ dtypes.float16, dtypes.float32, dtypes.float64 ]: if op._get_control_flow_context() is not None: # pylint: disable=protected-access raise ValueError( "`tf.add_check_numerics_ops() is not compatible " "with TensorFlow control flow operations such as " "`tf.cond()` or `tf.while_loop()`.") message = op.name + ":" + str(output.value_index) with ops.control_dependencies(check_op): check_op = [ array_ops.check_numerics(output, message=message) ] return control_flow_ops.group(*check_op)
def testZipIgnoreError(self): a = dataset_ops.Dataset.from_tensor_slices([1., 2., 0., 4.]) b = a.map(lambda x: array_ops.check_numerics(1. / x, "error")) dataset = dataset_ops.Dataset.zip((b, a)).apply(error_ops.ignore_errors()) get_next = self.getNext(dataset) for x in [1., 2., 4.]: self.assertEqual((1. / x, x), self.evaluate(get_next())) with self.assertRaises(errors.OutOfRangeError): self.evaluate(get_next())
def testParallelMapIgnoreError(self): components = np.array([1., 2., 3., np.nan, 5.]).astype(np.float32) dataset = (dataset_ops.Dataset.from_tensor_slices(components).map( lambda x: array_ops.check_numerics(x, "message"), num_parallel_calls=2).prefetch(2).apply(error_ops.ignore_errors())) get_next = self.getNext(dataset) for x in [1., 2., 3., 5.]: self.assertEqual(x, self.evaluate(get_next())) with self.assertRaises(errors.OutOfRangeError): self.evaluate(get_next())
def testBatchAndMapDatasetFails(self): """Test a dataset that maps a TF function across its input elements.""" dataset = dataset_ops.Dataset.from_tensors( array_ops.check_numerics( constant_op.constant(1.0) / constant_op.constant(0.0), "oops")) batch_size = array_ops.placeholder(dtypes.int64, shape=[]) iterator = (dataset.apply(batching.map_and_batch(lambda x: x, batch_size)) .make_initializable_iterator()) init_op = iterator.initializer with self.test_session() as sess: with self.assertRaisesRegexp(errors.InvalidArgumentError, "oops"): sess.run(init_op, feed_dict={batch_size: 14})
def testParallelMapUnspecifiedOutputSize(self): components = np.array([1., 2., 3., np.nan, 5.]).astype(np.float32) dataset = (dataset_ops.Dataset.from_tensor_slices(components).map( lambda x: array_ops.check_numerics(x, "message"), num_threads=2)) iterator = dataset.make_initializable_iterator() init_op = iterator.initializer get_next = iterator.get_next() with self.test_session() as sess: sess.run(init_op) for _ in range(3): sess.run(get_next)
def testWindowIgnoreErrors(self): input_values = np.float32([1., np.nan, 2., np.nan, 3.]) dataset = dataset_ops.Dataset.from_tensor_slices(input_values).map( lambda x: array_ops.check_numerics(x, "message")).window( size=2, shift=2, stride=2, drop_remainder=True).flat_map(lambda x: x.batch(batch_size=2)) get_next = dataset.make_one_shot_iterator().get_next() with self.cached_session() as sess: self.assertAllEqual(np.float32([1., 2.]), sess.run(get_next)) self.assertAllEqual(np.float32([2., 3.]), sess.run(get_next)) with self.assertRaises(errors.OutOfRangeError): sess.run(get_next)
def testParallelMapIgnoreError(self): components = np.array([1., 2., 3., np.nan, 5.]).astype(np.float32) dataset = ( dataset_ops.Dataset.from_tensor_slices(components).map( lambda x: array_ops.check_numerics(x, "message"), num_parallel_calls=2).prefetch(2).apply(error_ops.ignore_errors())) get_next = self.getNext(dataset) for x in [1., 2., 3., 5.]: self.assertEqual(x, self.evaluate(get_next())) with self.assertRaises(errors.OutOfRangeError): self.evaluate(get_next())
def testWindowIgnoreErrors(self): input_values = np.float32([1., np.nan, 2., np.nan, 3.]) dataset = dataset_ops.Dataset.from_tensor_slices(input_values).map( lambda x: array_ops.check_numerics(x, "message")).window( size=2, shift=2, stride=2, drop_remainder=True).flat_map(lambda x: x.batch(batch_size=2)) get_next = dataset.make_one_shot_iterator().get_next() with self.cached_session() as sess: self.assertAllEqual(np.float32([1., 2.]), self.evaluate(get_next)) self.assertAllEqual(np.float32([2., 3.]), self.evaluate(get_next)) with self.assertRaises(errors.OutOfRangeError): sess.run(get_next)
def testMapAndBatchFails(self, numa_aware): """Test a dataset that maps a TF function across its input elements.""" with self.assertRaisesRegexp(errors.InvalidArgumentError, "oops"): dataset = dataset_ops.Dataset.from_tensors( array_ops.check_numerics( constant_op.constant(1.0) / constant_op.constant(0.0), "oops")) dataset = dataset.apply(batching.map_and_batch(lambda x: x, 14)) if numa_aware: options = dataset_ops.Options() options.experimental_numa_aware = True dataset = dataset.with_options(options) get_next = self.getNext(dataset) self.evaluate(get_next())
def testIgnoreError_withLogWarning(self): components = np.array([1., 2., 3., np.nan, 5.]).astype(np.float32) dataset = (dataset_ops.Dataset.from_tensor_slices(components).map( lambda x: array_ops.check_numerics(x, "message")).apply( error_ops.ignore_errors(log_warning=True))) get_next = self.getNext(dataset) for x in [1., 2., 3.]: self.assertEqual(x, self.evaluate(get_next())) with self.captureWritesToStream(sys.stderr) as logged: self.assertEqual(5., self.evaluate(get_next())) expected = "Tensor had NaN values" self.assertIn((expected), logged.contents()) with self.assertRaises(errors.OutOfRangeError): self.evaluate(get_next())
def testParallelMapUnspecifiedOutputSize(self): components = np.array([1., 2., 3., np.nan, 5.]).astype(np.float32) dataset = (dataset_ops.Dataset.from_tensor_slices(components) .map(lambda x: array_ops.check_numerics(x, "message"), num_parallel_calls=2)) iterator = dataset.make_initializable_iterator() init_op = iterator.initializer get_next = iterator.get_next() with self.cached_session() as sess: sess.run(init_op) for _ in range(3): sess.run(get_next)
def add_check_numerics_ops(): """Connect a `tf.debugging.check_numerics` to every floating point tensor. `check_numerics` operations themselves are added for each `half`, `float`, or `double` tensor in the current default graph. For all ops in the graph, the `check_numerics` op for all of its (`half`, `float`, or `double`) inputs is guaranteed to run before the `check_numerics` op on any of its outputs. Note: This API is not compatible with the use of `tf.cond` or `tf.while_loop`, and will raise a `ValueError` if you attempt to call it in such a graph. Returns: A `group` op depending on all `check_numerics` ops added. Raises: ValueError: If the graph contains any numeric operations in a control flow structure. RuntimeError: If called with eager execution enabled. @compatibility(eager) Not compatible with eager execution. To check for `Inf`s and `NaN`s under eager execution, call `tfe.seterr(inf_or_nan='raise')` once before executing the checked operations. @end_compatibility """ if context.executing_eagerly(): raise RuntimeError( "add_check_numerics_ops() is not compatible with eager execution. " "To check for Inf's and NaN's under eager execution, call " "tfe.seterr(inf_or_nan='raise') once before executing the " "checked operations.") check_op = [] # This code relies on the ordering of ops in get_operations(). # The producer of a tensor always comes before that tensor's consumer in # this list. This is true because get_operations() returns ops in the order # added, and an op can only be added after its inputs are added. for op in ops.get_default_graph().get_operations(): for output in op.outputs: if output.dtype in [dtypes.float16, dtypes.float32, dtypes.float64]: if op._get_control_flow_context() is not None: # pylint: disable=protected-access raise ValueError("`tf.add_check_numerics_ops() is not compatible " "with TensorFlow control flow operations such as " "`tf.cond()` or `tf.while_loop()`.") message = op.name + ":" + str(output.value_index) with ops.control_dependencies(check_op): check_op = [array_ops.check_numerics(output, message=message)] return control_flow_ops.group(*check_op)
def _model_fn(features, labels, mode, config, params=None): """model_fn.""" # TODO(jhseu): Move to EVAL and PREDICT to TPU. if mode != model_fn_lib.ModeKeys.TRAIN: return _call_model_fn_without_tpu(model_fn, features, labels, mode, config, params) # Now for TPU training. if params is not None and _BATCH_SIZE_KEY in params: params[_BATCH_SIZE_KEY] //= config.tpu_config.num_shards assert isinstance(features, _PerShardOutput) features = features.as_list() if labels is not None: assert isinstance(labels, _PerShardOutput) labels = labels.as_list() dequeue_fn, enqueue_fn = (_create_infeed_enqueue_ops_and_dequeue_fn( config, features, labels)) loss = _train_on_tpu_shards(config, train_step=_convert_model_fn_to_train_step( model_fn, dequeue_fn, mode, config, params)) # Gets the variables back from TPU nodes. This means the variables updated # by TPU will now be *synced* to host memory. update_ops = [ array_ops.check_numerics(v.read_value(), 'Gradient for %s is NaN' % v.name).op for v in variables.trainable_variables() ] hooks = [ TpuInfeedSessionHook(config, enqueue_fn), training.LoggingTensorHook( { 'loss': array_ops.identity(loss), 'step': training.get_global_step() }, every_n_secs=30) ] return model_fn_lib.EstimatorSpec( mode, loss=array_ops.identity(loss), training_hooks=hooks, train_op=control_flow_ops.group(*update_ops))
def testMapIgnoreError(self): components = np.array([1., 2., 3., np.nan, 5.]).astype(np.float32) dataset = (dataset_ops.Dataset.from_tensor_slices(components).map( lambda x: array_ops.check_numerics(x, "message")).ignore_errors()) iterator = dataset.make_initializable_iterator() init_op = iterator.initializer get_next = iterator.get_next() with self.test_session() as sess: sess.run(init_op) for x in [1., 2., 3., 5.]: self.assertEqual(x, sess.run(get_next)) with self.assertRaises(errors.OutOfRangeError): sess.run(get_next)
def _VerifyTensor(t, name, msg): """Assert that the tensor does not contain any NaN's. Args: t: Tensor name: name msg: message to log Returns: Tensor, but verified """ with ops.name_scope(name): with ops.device(t.device or ops.get_default_graph().get_default_device()): verify_input = array_ops.check_numerics(t, message=msg) out = control_flow_ops.with_dependencies([verify_input], t) return out
def testParallelMapIgnoreError(self): components = np.array([1., 2., 3., np.nan, 5.]).astype(np.float32) dataset = (dataset_ops.Dataset.from_tensor_slices(components).map( lambda x: array_ops.check_numerics(x, "message"), num_parallel_calls=2).prefetch(2).apply(error_ops.ignore_errors())) iterator = dataset.make_initializable_iterator() init_op = iterator.initializer get_next = iterator.get_next() with self.cached_session() as sess: self.evaluate(init_op) for x in [1., 2., 3., 5.]: self.assertEqual(x, self.evaluate(get_next)) with self.assertRaises(errors.OutOfRangeError): self.evaluate(get_next)
def testParallelMapError(self): components = np.array([1., 2., 3., np.nan, 5.]).astype(np.float32) dataset = (dataset_ops.Dataset.from_tensor_slices(components) .map(lambda x: array_ops.check_numerics(x, "message"), num_parallel_calls=2)) get_next = self.getNext(dataset) for _ in range(3): self.evaluate(get_next()) # The 4th element is NaN, so `array_ops.check_numerics()` should fail. with self.assertRaises(errors.InvalidArgumentError): self.evaluate(get_next()) self.evaluate(get_next()) with self.assertRaises(errors.OutOfRangeError): self.evaluate(get_next())
def testPrefetchError(self): components = np.array([1., 2., 3., np.nan, 5.]).astype(np.float32) dataset = (dataset_ops.Dataset.from_tensor_slices(components).map( lambda x: array_ops.check_numerics(x, "message")).prefetch(2)) get_next = self.getNext(dataset) for _ in range(3): self.evaluate(get_next()) # The 4th element is NaN, so `array_ops.check_numerics()` should fail. with self.assertRaises(errors.InvalidArgumentError): self.evaluate(get_next()) self.evaluate(get_next()) with self.assertRaises(errors.OutOfRangeError): self.evaluate(get_next())
def testInterleaveDatasetError(self, input_values, cycle_length, block_length, num_parallel_calls): dataset = dataset_ops.Dataset.from_tensor_slices(input_values).map( lambda x: array_ops.check_numerics(x, "message")).interleave( dataset_ops.Dataset.from_tensors, cycle_length, block_length, num_parallel_calls) get_next = self.getNext(dataset) for value in input_values: if np.isnan(value): with self.assertRaises(errors.InvalidArgumentError): self.evaluate(get_next()) else: self.assertEqual(value, self.evaluate(get_next())) with self.assertRaises(errors.OutOfRangeError): self.evaluate(get_next())
def testMapIgnoreError(self): components = np.array([1., 2., 3., np.nan, 5.]).astype(np.float32) dataset = (dataset_ops.Dataset.from_tensor_slices(components) .map(lambda x: array_ops.check_numerics(x, "message")) .ignore_errors()) iterator = dataset.make_initializable_iterator() init_op = iterator.initializer get_next = iterator.get_next() with self.test_session() as sess: sess.run(init_op) for x in [1., 2., 3., 5.]: self.assertEqual(x, sess.run(get_next)) with self.assertRaises(errors.OutOfRangeError): sess.run(get_next)
def _model_fn(features, labels, mode, config, params=None): """model_fn.""" # TODO(jhseu): Move to EVAL and PREDICT to TPU. if mode != model_fn_lib.ModeKeys.TRAIN: return _call_model_fn_without_tpu( model_fn, features, labels, mode, config, params) # Now for TPU training. if params is not None and _BATCH_SIZE_KEY in params: params[_BATCH_SIZE_KEY] //= config.tpu_config.num_shards assert isinstance(features, _PerShardOutput) features = features.as_list() if labels is not None: assert isinstance(labels, _PerShardOutput) labels = labels.as_list() dequeue_fn, enqueue_fn = ( _create_infeed_enqueue_ops_and_dequeue_fn(config, features, labels)) loss = _train_on_tpu_shards( config, train_step=_convert_model_fn_to_train_step( model_fn, dequeue_fn, mode, config, params)) # Gets the variables back from TPU nodes. This means the variables updated # by TPU will now be *synced* to host memory. update_ops = [ array_ops.check_numerics(v.read_value(), 'Gradient for %s is NaN' % v.name).op for v in variables.trainable_variables() ] hooks = [ TpuInfeedSessionHook(config, enqueue_fn), training.LoggingTensorHook( {'loss': array_ops.identity(loss), 'step': training.get_global_step()}, every_n_secs=30) ] return model_fn_lib.EstimatorSpec( mode, loss=array_ops.identity(loss), training_hooks=hooks, train_op=control_flow_ops.group(*update_ops))
def verify_tensor_all_finite_v2(x, message, name=None): """Assert that the tensor does not contain any NaN's or Inf's. Args: x: Tensor to check. message: Message to log on failure. name: A name for this operation (optional). Returns: Same tensor as `x`. """ with ops.name_scope(name, "VerifyFinite", [x]) as name: x = ops.convert_to_tensor(x, name="x") with ops.colocate_with(x): verify_input = array_ops.check_numerics(x, message=message) out = control_flow_ops.with_dependencies([verify_input], x) return out
def testInterleaveDatasetError(self, input_values, cycle_length, block_length, num_parallel_calls): dataset = dataset_ops.Dataset.from_tensor_slices(input_values).map( lambda x: array_ops.check_numerics(x, "message")).interleave( dataset_ops.Dataset.from_tensors, cycle_length, block_length, num_parallel_calls) get_next = dataset.make_one_shot_iterator().get_next() with self.cached_session() as sess: for value in input_values: if np.isnan(value): with self.assertRaises(errors.InvalidArgumentError): sess.run(get_next) else: self.assertEqual(value, sess.run(get_next)) with self.assertRaises(errors.OutOfRangeError): sess.run(get_next)
def testParallelMapIgnoreError(self): components = np.array([1., 2., 3., np.nan, 5.]).astype(np.float32) dataset = ( dataset_ops.Dataset.from_tensor_slices(components).map( lambda x: array_ops.check_numerics(x, "message"), num_parallel_calls=2).prefetch(2).apply(error_ops.ignore_errors())) iterator = dataset_ops.make_initializable_iterator(dataset) init_op = iterator.initializer get_next = iterator.get_next() with self.cached_session() as sess: self.evaluate(init_op) for x in [1., 2., 3., 5.]: self.assertEqual(x, self.evaluate(get_next)) with self.assertRaises(errors.OutOfRangeError): self.evaluate(get_next)
def testMapAndBatchFails(self, numa_aware): """Test a dataset that maps a TF function across its input elements.""" dataset = dataset_ops.Dataset.from_tensors( array_ops.check_numerics( constant_op.constant(1.0) / constant_op.constant(0.0), "oops")) batch_size = array_ops.placeholder(dtypes.int64, shape=[]) dataset = dataset.apply(batching.map_and_batch(lambda x: x, batch_size)) if numa_aware: options = dataset_ops.Options() options.experimental_numa_aware = True dataset = dataset.with_options(options) iterator = dataset.make_initializable_iterator() init_op = iterator.initializer with self.cached_session() as sess: with self.assertRaisesRegexp(errors.InvalidArgumentError, "oops"): sess.run(init_op, feed_dict={batch_size: 14})
def verify_tensor_all_finite(t, msg, name=None): """Assert that the tensor does not contain any NaN's or Inf's. Args: t: Tensor to check. msg: Message to log on failure. name: A name for this operation (optional). Returns: Same tensor as `t`. """ with ops.op_scope([t], name, "VerifyFinite") as name: t = ops.convert_to_tensor(t, name="t") with ops.device(t.device): verify_input = array_ops.check_numerics(t, message=msg) out = control_flow_ops.with_dependencies([verify_input], t) return out
def testMapAndBatchFails(self, numa_aware): """Test a dataset that maps a TF function across its input elements.""" dataset = dataset_ops.Dataset.from_tensors( array_ops.check_numerics( constant_op.constant(1.0) / constant_op.constant(0.0), "oops")) batch_size = array_ops.placeholder(dtypes.int64, shape=[]) dataset = dataset.apply(batching.map_and_batch(lambda x: x, batch_size)) if numa_aware: options = dataset_ops.Options() options.experimental_numa_aware = True dataset = dataset.with_options(options) iterator = dataset_ops.make_initializable_iterator(dataset) init_op = iterator.initializer with self.cached_session() as sess: with self.assertRaisesRegexp(errors.InvalidArgumentError, "oops"): sess.run(init_op, feed_dict={batch_size: 14})
def verify_tensor_all_finite(t, msg, name=None): """Assert that the tensor does not contain any NaN's or Inf's. Args: t: Tensor to check. msg: Message to log on failure. name: A name for this operation (optional). Returns: Same tensor as `t`. """ with ops.op_scope([t], name, "VerifyFinite") as name: t = ops.convert_to_tensor(t, name="t") with ops.device(t.device or t.graph.get_default_device()): verify_input = array_ops.check_numerics(t, message=msg) out = control_flow_ops.with_dependencies([verify_input], t) return out
def testPrefetchError(self): components = np.array([1., 2., 3., np.nan, 5.]).astype(np.float32) dataset = (dataset_ops.Dataset.from_tensor_slices(components).map( lambda x: array_ops.check_numerics(x, "message")).prefetch(2)) iterator = dataset.make_initializable_iterator() init_op = iterator.initializer get_next = iterator.get_next() with self.test_session() as sess: sess.run(init_op) for _ in range(3): sess.run(get_next) # The 4th element is NaN, so `array_ops.check_numerics()` should fail. with self.assertRaises(errors.InvalidArgumentError): sess.run(get_next) sess.run(get_next) with self.assertRaises(errors.OutOfRangeError): sess.run(get_next)
def testParallelMapError(self): components = [np.array([1., 2., 3., np.nan, 5.]).astype(np.float32)] dataset = (dataset_ops.Dataset.from_tensor_slices(components) .map(lambda x: array_ops.check_numerics(x, "message"))) iterator = dataset.make_initializable_iterator() init_op = iterator.initializer get_next = iterator.get_next() with self.test_session() as sess: sess.run(init_op) for _ in range(3): sess.run(get_next) # The 4th element is NaN, so `array_ops.check_numerics()` should fail. with self.assertRaises(errors.InvalidArgumentError): sess.run(get_next) sess.run(get_next) with self.assertRaises(errors.OutOfRangeError): sess.run(get_next)
def _model_fn(features, labels, mode, config, params): """A Estimator `model_fn` for TPUEstimator.""" model_fn_wrapper = _ModelFnWrapper(model_fn, config, params, mode, train_batch_size) # TODO(jhseu): Move to EVAL and PREDICT to TPU. if not use_tpu or mode != model_fn_lib.ModeKeys.TRAIN: return model_fn_wrapper.call_without_tpu(features, labels) inputs = _InputsHolder(features=features, labels=labels, num_shards=config.tpu_config.num_shards) dequeue_fn, enqueue_fn = _create_infeed_enqueue_ops_and_dequeue_fn( inputs, config) loss = _train_on_tpu_system(model_fn_wrapper, dequeue_fn) # Gets the variables back from TPU nodes. This means the variables updated # by TPU will now be *synced* to host memory. update_ops = [ array_ops.check_numerics(v.read_value(), 'Gradient for %s is NaN' % v.name).op for v in variables.trainable_variables() ] hooks = [ TPUInfeedSessionHook(config, enqueue_fn), training.LoggingTensorHook( { 'loss': array_ops.identity(loss), 'step': training.get_global_step() }, every_n_secs=30) ] return model_fn_lib.EstimatorSpec( mode, loss=array_ops.identity(loss), training_hooks=hooks, train_op=control_flow_ops.group(*update_ops))
def testParallelMapUnspecifiedThreads(self): components = np.array([1., 2., 3., np.nan, 5.]).astype(np.float32) with warnings.catch_warnings(record=True) as w: dataset = (dataset_ops.Dataset.from_tensor_slices(components).map( lambda x: array_ops.check_numerics(x, "message"), output_buffer_size=2)) self.assertTrue(len(w) >= 1) self.assertTrue(( "Dataset.map() is ignoring output_buffer_size since the argument " "num_threads was not set. To buffer elements, set num_threads >= 1" ) in [str(x.message) for x in w]) iterator = dataset.make_initializable_iterator() init_op = iterator.initializer get_next = iterator.get_next() with self.test_session() as sess: sess.run(init_op) for _ in range(3): sess.run(get_next)
def _model_fn(features, labels, mode, config, params): """A Estimator `model_fn` for TPUEstimator.""" model_fn_wrapper = _ModelFnWrapper(model_fn, config, params, mode, train_batch_size) # TODO(jhseu): Move to EVAL and PREDICT to TPU. if not use_tpu or mode != model_fn_lib.ModeKeys.TRAIN: return model_fn_wrapper.call_without_tpu(features, labels) inputs = _InputsHolder(features=features, labels=labels, num_shards=config.tpu_config.num_shards) dequeue_fn, enqueue_fn = _create_infeed_enqueue_ops_and_dequeue_fn( inputs, config) loss = _train_on_tpu_system(model_fn_wrapper, dequeue_fn) # Gets the variables back from TPU nodes. This means the variables updated # by TPU will now be *synced* to host memory. update_ops = [ array_ops.check_numerics(v.read_value(), 'Gradient for %s is NaN' % v.name).op for v in variables.trainable_variables() ] hooks = [ TPUInfeedSessionHook(config, enqueue_fn), training.LoggingTensorHook( {'loss': array_ops.identity(loss), 'step': training.get_global_step()}, every_n_secs=30) ] return model_fn_lib.EstimatorSpec( mode, loss=array_ops.identity(loss), training_hooks=hooks, train_op=control_flow_ops.group(*update_ops))
def add_check_numerics_ops(): """Connect a `check_numerics` to every floating point tensor. `check_numerics` operations themselves are added for each `float` or `double` tensor in the graph. For all ops in the graph, the `check_numerics` op for all of its (`float` or `double`) inputs is guaranteed to run before the `check_numerics` op on any of its outputs. Returns: A `group` op depending on all `check_numerics` ops added. """ check_op = [] # This code relies on the ordering of ops in get_operations(). # The consumer of a tensor always comes before that tensor's producer in # this list. This is true because get_operations() returns ops in the order # added, and ops can only be added once its inputs are added. for op in ops.get_default_graph().get_operations(): for output in op.outputs: if output.dtype in [dtypes.float32, dtypes.float64]: message = op.name + ":" + str(output.value_index) with ops.control_dependencies(check_op): check_op = [array_ops.check_numerics(output, message=message)] return control_flow_ops.group(*check_op)
def _model_fn(features, labels, mode): """model_fn.""" # TODO(jhseu): Move to EVAL and PREDICT to TPU. if mode != model_fn_lib.ModeKeys.TRAIN: return model_fn(features, labels, mode) dequeue_fn, enqueue_fn = (_create_infeed_enqueue_ops_and_dequeue_fn( run_config, features, labels)) loss = _train_on_tpu_shards(run_config, train_step=_convert_model_fn_to_train_step( model_fn, dequeue_fn, mode, run_config)) # Gets the variables back from TPU nodes. This means the variables updated # by TPU will now be *synced* to host memory. update_ops = [ array_ops.check_numerics(v.read_value(), 'Gradient for %s is NaN' % v.name).op for v in variables.trainable_variables() ] hooks = [ TpuInfeedSessionHook(run_config, enqueue_fn), training.LoggingTensorHook( { 'loss': array_ops.identity(loss), 'step': training.get_global_step() }, every_n_secs=30) ] return model_fn_lib.EstimatorSpec( mode, loss=array_ops.identity(loss), training_hooks=hooks, train_op=control_flow_ops.group(*update_ops))
def add_check_numerics_ops(): """Connect a `check_numerics` to every floating point tensor. `check_numerics` operations themselves are added for each `half`, `float`, or `double` tensor in the graph. For all ops in the graph, the `check_numerics` op for all of its (`half`, `float`, or `double`) inputs is guaranteed to run before the `check_numerics` op on any of its outputs. Note: This API is not compatible with the use of @{tf.cond} or @{tf.while_loop}, and will raise a `ValueError` if you attempt to call it in such a graph. Returns: A `group` op depending on all `check_numerics` ops added. Raises: ValueError: If the graph contains any numeric operations in a control flow structure. """ check_op = [] # This code relies on the ordering of ops in get_operations(). # The producer of a tensor always comes before that tensor's consumer in # this list. This is true because get_operations() returns ops in the order # added, and an op can only be added after its inputs are added. for op in ops.get_default_graph().get_operations(): for output in op.outputs: if output.dtype in [dtypes.float16, dtypes.float32, dtypes.float64]: if op._get_control_flow_context() is not None: # pylint: disable=protected-access raise ValueError("`tf.add_check_numerics_ops() is not compatible " "with TensorFlow control flow operations such as " "`tf.cond()` or `tf.while_loop()`.") message = op.name + ":" + str(output.value_index) with ops.control_dependencies(check_op): check_op = [array_ops.check_numerics(output, message=message)] return control_flow_ops.group(*check_op)
def optimize_loss(loss, global_step, learning_rate, optimizer, clip_gradients=None, moving_average_decay=0.9, learning_rate_decay_fn=None, variables=None): """Given loss and parameters for optimizer, returns a training op. Args: loss: Tensor, 0 dimensional. global_step: Tensor, step counter for each update. learning_rate: float or Tensor, magnitude of update per each training step. optimizer: string, class or optimizer instance, used as trainer. string should be name of optimizer, like 'SGD', 'Adam', 'Adagrad'. Full list in OPTIMIZER_CLS_NAMES constant. class should be sub-class of tf.Optimizer that implements `compute_gradients` and `apply_gradients` functions. optimizer instance should be instantion of tf.Optimizer sub-class and have `compute_gradients` and `apply_gradients` functions. clip_gradients: float or None, clips gradients by this value. moving_average_decay: float or None, takes into account previous loss to make learning smoother due to outliers. learning_rate_decay_fn: function, takes learning_rate and global_step Tensors, returns Tensor. Can be used to implement any learning rate decay funcitons. For example: tf.train.exponential_decay. variables: list of variables to optimizer or none. Returns: Training op. Raises: ValueError: if optimizer is wrong type. """ # Moving average of the loss with decay. if moving_average_decay is not None: # Generate moving averages of the loss. loss_averages = train.ExponentialMovingAverage(moving_average_decay, name="avg") loss_averages_op = loss_averages.apply([loss]) logging_ops.scalar_summary("loss/mean", loss_averages.average(loss)) loss = control_flow_ops.with_dependencies([loss_averages_op], loss) # Learning rate variable, with possible decay. if isinstance(learning_rate, ops.Tensor) and len(learning_rate.get_shape()) == 0: lr = learning_rate elif isinstance(learning_rate, float): lr = vs.get_variable("learning_rate", [], trainable=False, initializer=init_ops.constant_initializer(learning_rate)) else: raise ValueError("Learning rate should be 0d Tensor or float. Got %s" % str(learning_rate)) if learning_rate_decay_fn is not None: lr = learning_rate_decay_fn(lr, global_step) # Create optimizer, given specified parameters. if isinstance(optimizer, six.string_types): if optimizer not in OPTIMIZER_CLS_NAMES: raise ValueError("Optimizer name should be one of [%s], you provided %s." % (", ".join(OPTIMIZER_CLS_NAMES), optimizer)) opt = OPTIMIZER_CLS_NAMES[optimizer](learning_rate=lr) elif isinstance(optimizer, type) and issubclass(optimizer, optimizer_.Optimizer): opt = optimizer(learning_rate=lr) elif isinstance(optimizer, optimizer_.Optimizer): opt = optimizer else: raise ValueError("Unrecognized optimizer: should be string, " "subclass of Optimizer or instance of " "subclass of Optimizer. Got %s." % str(optimizer)) # All trainable variables, if specific variables are not specified. if variables is None: variables = vars_.trainable_variables() # Compute gradients and clip them if provided. gradients = opt.compute_gradients(loss, variables) if clip_gradients is not None: gradients, variables = zip(*gradients) clipped_gradients, _ = clip_ops.clip_by_global_norm(gradients, clip_gradients) gradients = list(zip(clipped_gradients, variables)) # Add scalar summary for loss. logging_ops.scalar_summary("loss", loss) # Add histograms for variables, gradients and gradient norms. for gradient, variable in gradients: if isinstance(gradient, ops.IndexedSlices): grad_values = gradient.values else: grad_values = gradient if grad_values is not None: logging_ops.histogram_summary(variable.name, variable) logging_ops.histogram_summary(variable.name + "/gradients", grad_values) logging_ops.histogram_summary(variable.name + "/gradient_norm", clip_ops.global_norm([grad_values])) # Create gradient updates. grad_updates = opt.apply_gradients(gradients, global_step=global_step, name="train") # Make sure total_loss is valid. final_loss = array_ops.check_numerics(loss, "Loss is inf or nan") # Ensure the train_tensor computes grad_updates. train_tensor = control_flow_ops.with_dependencies([grad_updates], final_loss) return train_tensor
def _CheckNumericsGrad(op, grad): """Gradient for check_numerics op.""" return array_ops.check_numerics( grad, "Not a number (NaN) or infinity (Inf) values detected in gradient. %s" % op.get_attr("message"))
def _CheckNumericsGrad(_, grad): """Gradient for check_numerics op.""" return array_ops.check_numerics( grad, "Not a number (NaN) or infinity (Inf) values detected in gradient.")
def create_train_op(total_loss, optimizer, global_step=_USE_GLOBAL_STEP, update_ops=None, variables_to_train=None, transform_grads_fn=None, summarize_gradients=False, gate_gradients=tf_optimizer.Optimizer.GATE_OP, aggregation_method=None, colocate_gradients_with_ops=False, check_numerics=True): """Creates an `Operation` that evaluates the gradients and returns the loss. Args: total_loss: A `Tensor` representing the total loss. optimizer: A tf.Optimizer to use for computing the gradients. global_step: A `Tensor` representing the global step variable. If left as `_USE_GLOBAL_STEP`, then tf.contrib.framework.global_step() is used. update_ops: An optional list of updates to execute. If `update_ops` is `None`, then the update ops are set to the contents of the `tf.GraphKeys.UPDATE_OPS` collection. If `update_ops` is not `None`, but it doesn't contain all of the update ops in `tf.GraphKeys.UPDATE_OPS`, a warning will be displayed. variables_to_train: an optional list of variables to train. If None, it will default to all tf.trainable_variables(). transform_grads_fn: A function which takes a single argument, a list of gradient to variable pairs (tuples), performs any requested gradient updates, such as gradient clipping or multipliers, and returns the updated list. summarize_gradients: Whether or not add summaries for each gradient. gate_gradients: How to gate the computation of gradients. See tf.Optimizer. aggregation_method: Specifies the method used to combine gradient terms. Valid values are defined in the class `AggregationMethod`. colocate_gradients_with_ops: Whether or not to try colocating the gradients with the ops that generated them. check_numerics: Whether or not we apply check_numerics. Returns: A `Tensor` that when evaluated, computes the gradients and returns the total loss value. """ if global_step is _USE_GLOBAL_STEP: global_step = training_util.get_or_create_global_step() # Update ops use GraphKeys.UPDATE_OPS collection if update_ops is None. global_update_ops = set(ops.get_collection(ops.GraphKeys.UPDATE_OPS)) if update_ops is None: update_ops = global_update_ops else: update_ops = set(update_ops) if not global_update_ops.issubset(update_ops): logging.warning('update_ops in create_train_op does not contain all the ' 'update_ops in GraphKeys.UPDATE_OPS') # Make sure update_ops are computed before total_loss. if update_ops: with ops.control_dependencies(update_ops): barrier = control_flow_ops.no_op(name='update_barrier') total_loss = control_flow_ops.with_dependencies([barrier], total_loss) if variables_to_train is None: # Default to tf.trainable_variables() variables_to_train = tf_variables.trainable_variables() else: # Make sure that variables_to_train are in tf.trainable_variables() for v in variables_to_train: assert v.trainable or v in tf_variables.trainable_variables() assert variables_to_train # Create the gradients. Note that apply_gradients adds the gradient # computation to the current graph. grads = optimizer.compute_gradients( total_loss, variables_to_train, gate_gradients=gate_gradients, aggregation_method=aggregation_method, colocate_gradients_with_ops=colocate_gradients_with_ops) if transform_grads_fn: grads = transform_grads_fn(grads) # Summarize gradients. if summarize_gradients: with ops.name_scope('summarize_grads'): add_gradients_summaries(grads) # Create gradient updates. grad_updates = optimizer.apply_gradients(grads, global_step=global_step) with ops.name_scope('train_op'): # Make sure total_loss is valid. if check_numerics: total_loss = array_ops.check_numerics(total_loss, 'LossTensor is inf or nan') # Ensure the train_tensor computes grad_updates. train_op = control_flow_ops.with_dependencies([grad_updates], total_loss) # Add the operation used for training to the 'train_op' collection train_ops = ops.get_collection_ref(ops.GraphKeys.TRAIN_OP) if train_op not in train_ops: train_ops.append(train_op) return train_op
def testExecuteStringAttr(self): three = tensor.Tensor(3.0) checked_three = array_ops.check_numerics(three, message='just checking') self.assertEqual([[3]], checked_three.numpy())
def create_train_op( total_loss, optimizer, global_step=None, update_ops=None, variables_to_train=None, clip_gradient_norm=0, summarize_gradients=False, gate_gradients=tf_optimizer.Optimizer.GATE_OP, aggregation_method=None, colocate_gradients_with_ops=False, gradient_multipliers=None): """Creates an `Operation` that evaluates the gradients and returns the loss. Args: total_loss: A `Tensor` representing the total loss. optimizer: A tf.Optimizer to use for computing the gradients. global_step: A `Tensor` representing the global step variable. If left as `None`, then slim.variables.global_step() is used. update_ops: an optional list of updates to execute. Note that the update_ops that are used are the union of those update_ops passed to the function and the value of slim.ops.GetUpdateOps(). Therefore, if `update_ops` is None, then the value of slim.ops.GetUpdateOps() is still used. variables_to_train: an optional list of variables to train. If None, it will default to all tf.trainable_variables(). clip_gradient_norm: If greater than 0 then the gradients would be clipped by it. summarize_gradients: Whether or not add summaries for each gradient. gate_gradients: How to gate the computation of gradients. See tf.Optimizer. aggregation_method: Specifies the method used to combine gradient terms. Valid values are defined in the class `AggregationMethod`. colocate_gradients_with_ops: Whether or not to try colocating the gradients with the ops that generated them. gradient_multipliers: A dictionary of either `Variables` or `Variable` op names to the coefficient by which the associated gradient should be scaled. Returns: A `Tensor` that when evaluated, computes the gradients and returns the total loss value. """ if global_step is None: global_step = variables.get_or_create_global_step() # Update ops use GraphKeys.UPDATE_OPS collection if update_ops is None. global_update_ops = set(ops.get_collection(ops.GraphKeys.UPDATE_OPS)) if update_ops is None: update_ops = global_update_ops else: update_ops = set(update_ops) if not global_update_ops.issubset(update_ops): logging.warning('update_ops in create_train_op does not contain all the ' ' update_ops in GraphKeys.UPDATE_OPS') # Make sure update_ops are computed before total_loss. if update_ops: with ops.control_dependencies(update_ops): barrier = control_flow_ops.no_op(name='update_barrier') total_loss = control_flow_ops.with_dependencies([barrier], total_loss) if variables_to_train is None: # Default to tf.trainable_variables() variables_to_train = tf_variables.trainable_variables() else: # Make sure that variables_to_train are in tf.trainable_variables() for v in variables_to_train: assert v in tf_variables.trainable_variables() assert variables_to_train # Create the gradients. Note that apply_gradients adds the gradient # computation to the current graph. grads = optimizer.compute_gradients( total_loss, variables_to_train, gate_gradients=gate_gradients, aggregation_method=aggregation_method, colocate_gradients_with_ops=colocate_gradients_with_ops) # Scale gradients. if gradient_multipliers: grads = multiply_gradients(grads, gradient_multipliers) # Clip gradients. if clip_gradient_norm > 0: grads = clip_gradient_norms(grads, clip_gradient_norm) # Summarize gradients. if summarize_gradients: add_gradients_summaries(grads) # Create gradient updates. grad_updates = optimizer.apply_gradients(grads, global_step=global_step) # Make sure total_loss is valid. total_loss = array_ops.check_numerics(total_loss, 'LossTensor is inf or nan') # Ensure the train_tensor computes grad_updates. return control_flow_ops.with_dependencies([grad_updates], total_loss)