Example #1
    def testOneShotIteratorInitializerFails(self):
        # Define a dataset whose initialization will always fail.
        dataset = dataset_ops.Dataset.from_tensors(
            array_ops.check_numerics(
                constant_op.constant(1.0) / constant_op.constant(0.0), "oops"))
        iterator = dataset.make_one_shot_iterator()
        next_element = iterator.get_next()

        with self.test_session() as sess:
            with self.assertRaisesRegexp(errors.InvalidArgumentError, "oops"):
                sess.run(next_element)

            # Test that subsequent attempts to use the iterator also fail.
            with self.assertRaisesRegexp(errors.InvalidArgumentError, "oops"):
                sess.run(next_element)

        with self.test_session() as sess:

            def consumer_thread():
                with self.assertRaisesRegexp(errors.InvalidArgumentError,
                                             "oops"):
                    sess.run(next_element)

            num_threads = 8
            threads = [
                self.checkedThread(consumer_thread) for _ in range(num_threads)
            ]
            for t in threads:
                t.start()
            for t in threads:
                t.join()
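
The same failure reproduces directly in eager mode, where `check_numerics` runs as soon as the tensor is built. A minimal sketch, assuming TF 2.x eager defaults and the public `tf.debugging.check_numerics` alias:

import tensorflow as tf

try:
    # 1.0 / 0.0 yields inf, so check_numerics raises immediately under eager execution.
    tf.debugging.check_numerics(tf.constant(1.0) / tf.constant(0.0), "oops")
except tf.errors.InvalidArgumentError as e:
    print("caught:", e.message)  # the error message contains "oops"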
Example #2
def add_check_numerics_ops():
    """Connect a `check_numerics` to every floating point tensor.

  `check_numerics` operations themselves are added for each `half`, `float`,
  or `double` tensor in the graph. For all ops in the graph, the
  `check_numerics` op for all of its (`half`, `float`, or `double`) inputs
  is guaranteed to run before the `check_numerics` op on any of its outputs.

  Returns:
    A `group` op depending on all `check_numerics` ops added.
  """
    check_op = []
    # This code relies on the ordering of ops in get_operations().
    # The producer of a tensor always comes before that tensor's consumer in
    # this list. This is true because get_operations() returns ops in the order
    # added, and an op can only be added after its inputs are added.
    for op in ops.get_default_graph().get_operations():
        for output in op.outputs:
            if (output.dtype in [dtypes.float32, dtypes.float64] and
                    output.op._get_control_flow_context() ==
                    ops.get_default_graph()._get_control_flow_context()):
                message = op.name + ":" + str(output.value_index)
                with ops.control_dependencies(check_op):
                    check_op = [
                        array_ops.check_numerics(output, message=message)
                    ]
    return control_flow_ops.group(*check_op)
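
A minimal usage sketch for the helper above, assuming graph mode through `tf.compat.v1`, where it is exposed as `tf.compat.v1.add_check_numerics_ops`:

import tensorflow.compat.v1 as tf

tf.disable_eager_execution()

x = tf.constant(1.0)
y = tf.math.log(x - 1.0)             # log(0.0) = -inf, so the check should trip
check = tf.add_check_numerics_ops()  # one group op guarding every float tensor

with tf.Session() as sess:
    try:
        sess.run([y, check])
    except tf.errors.InvalidArgumentError as e:
        print("caught:", e.message)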
 def testPassThrough(self):
   with self.session(graph=ops.Graph()):
     t1 = constant_op.constant([1.0, 2.0, 3.0, 4.0, 5.0, 6.0], shape=[2, 3])
     checked = array_ops.check_numerics(t1, message="pass through test")
     value = self.evaluate(checked)
     self.assertAllEqual(np.array([[1.0, 2.0, 3.0], [4.0, 5.0, 6.0]]), value)
     self.assertEqual([2, 3], checked.get_shape())
  def testOneShotIteratorInitializerFails(self):
    # Define a dataset whose initialization will always fail.
    dataset = dataset_ops.Dataset.from_tensors(
        array_ops.check_numerics(
            constant_op.constant(1.0) / constant_op.constant(0.0), "oops"))
    iterator = dataset.make_one_shot_iterator()
    next_element = iterator.get_next()

    with self.test_session() as sess:
      with self.assertRaisesRegexp(errors.InvalidArgumentError, "oops"):
        sess.run(next_element)

      # Test that subsequent attempts to use the iterator also fail.
      with self.assertRaisesRegexp(errors.InvalidArgumentError, "oops"):
        sess.run(next_element)

    with self.test_session() as sess:
      def consumer_thread():
        with self.assertRaisesRegexp(errors.InvalidArgumentError, "oops"):
          sess.run(next_element)

      num_threads = 8
      threads = [
          self.checkedThread(consumer_thread) for _ in range(num_threads)]
      for t in threads:
        t.start()
      for t in threads:
        t.join()
 def testPassThrough(self):
   with self.session(graph=ops.Graph()):
     t1 = constant_op.constant([1.0, 2.0, 3.0, 4.0, 5.0, 6.0], shape=[2, 3])
     checked = array_ops.check_numerics(t1, message="pass through test")
     value = checked.eval()
     self.assertAllEqual(np.array([[1.0, 2.0, 3.0], [4.0, 5.0, 6.0]]), value)
     self.assertEqual([2, 3], checked.get_shape())
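
For comparison, the same pass-through behavior in eager mode; a short sketch assuming TF 2.x defaults, where `check_numerics` acts as an identity op on finite input:

import tensorflow as tf

t = tf.constant([[1.0, 2.0, 3.0], [4.0, 5.0, 6.0]])
checked = tf.debugging.check_numerics(t, message="pass through test")
print(checked.numpy())  # identical to t; shape and values are unchanged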
Example #6
 def testWindowIgnoreErrors(self):
   input_values = np.float32([1., np.nan, 2., np.nan, 3.])
   dataset = dataset_ops.Dataset.from_tensor_slices(input_values).map(
       lambda x: array_ops.check_numerics(x, "message")).window(
           size=2, shift=2, stride=2,
           drop_remainder=True).flat_map(lambda x: x.batch(batch_size=2))
   self.assertDatasetProduces(
       dataset, expected_output=[np.float32([1., 2.]),
                                 np.float32([2., 3.])])
Example #8
    def testParallelMapUnspecifiedOutputSize(self):
        components = np.array([1., 2., 3., np.nan, 5.]).astype(np.float32)

        dataset = (dataset_ops.Dataset.from_tensor_slices(components).map(
            lambda x: array_ops.check_numerics(x, "message"),
            num_parallel_calls=2))
        get_next = self.getNext(dataset)

        for _ in range(3):
            self.evaluate(get_next())
Example #9
  def testParallelMapUnspecifiedOutputSize(self):
    components = np.array([1., 2., 3., np.nan, 5.]).astype(np.float32)

    dataset = (dataset_ops.Dataset.from_tensor_slices(components)
               .map(lambda x: array_ops.check_numerics(x, "message"),
                    num_parallel_calls=2))
    get_next = self.getNext(dataset)

    for _ in range(3):
      self.evaluate(get_next())
    def _build_ds(self):
        components = np.array([1., 2., 3., np.nan, 5.]).astype(np.float32)

        dataset = dataset_ops.Dataset.from_tensor_slices(components)
        dataset = dataset.map(lambda x: array_ops.check_numerics(x, "message"))
        dataset = dataset.apply(error_ops.ignore_errors())
        options = options_lib.Options()
        options.experimental_external_state_policy = (
            options_lib.ExternalStatePolicy.IGNORE)
        return dataset.with_options(options)
Example #11
  def testMapAndBatchFails(self):
    """Test a dataset that maps a TF function across its input elements."""

    with self.assertRaisesRegex(errors.InvalidArgumentError, "oops"):
      dataset = dataset_ops.Dataset.from_tensors(
          array_ops.check_numerics(
              constant_op.constant(1.0) / constant_op.constant(0.0), "oops"))
      dataset = dataset.apply(batching.map_and_batch(lambda x: x, 14))
      get_next = self.getNext(dataset, requires_initialization=True)
      self.evaluate(get_next())
def add_check_numerics_ops():
    """Connect a `check_numerics` to every floating point tensor.

  `check_numerics` operations themselves are added for each `half`, `float`,
  or `double` tensor in the graph. For all ops in the graph, the
  `check_numerics` op for all of its (`half`, `float`, or `double`) inputs
  is guaranteed to run before the `check_numerics` op on any of its outputs.

  Note: This API is not compatible with the use of `tf.cond` or
  `tf.while_loop`, and will raise a `ValueError` if you attempt to call it
  in such a graph.

  Returns:
    A `group` op depending on all `check_numerics` ops added.

  Raises:
    ValueError: If the graph contains any numeric operations in a control flow
      structure.
    RuntimeError: If called with eager execution enabled.

  @compatibility(eager)
  Not compatible with eager execution. To check for `Inf`s and `NaN`s under
  eager execution, call tfe.seterr(inf_or_nan='raise') once before executing
  the checked operations.
  @end_compatibility
  """
    if context.executing_eagerly():
        raise RuntimeError(
            "add_check_numerics_ops() is not compatible with eager execution. "
            "To check for Inf's and NaN's under eager execution, call "
            "tfe.seterr(inf_or_nan='raise') once before executing the "
            "checked operations.")

    check_op = []
    # This code relies on the ordering of ops in get_operations().
    # The producer of a tensor always comes before that tensor's consumer in
    # this list. This is true because get_operations() returns ops in the order
    # added, and an op can only be added after its inputs are added.
    for op in ops.get_default_graph().get_operations():
        for output in op.outputs:
            if output.dtype in [
                    dtypes.float16, dtypes.float32, dtypes.float64
            ]:
                if op._get_control_flow_context() is not None:  # pylint: disable=protected-access
                    raise ValueError(
                        "`tf.add_check_numerics_ops() is not compatible "
                        "with TensorFlow control flow operations such as "
                        "`tf.cond()` or `tf.while_loop()`.")

                message = op.name + ":" + str(output.value_index)
                with ops.control_dependencies(check_op):
                    check_op = [
                        array_ops.check_numerics(output, message=message)
                    ]
    return control_flow_ops.group(*check_op)
Example #13
  def testZipIgnoreError(self):
    a = dataset_ops.Dataset.from_tensor_slices([1., 2., 0., 4.])
    b = a.map(lambda x: array_ops.check_numerics(1. / x, "error"))

    dataset = dataset_ops.Dataset.zip((b, a)).apply(error_ops.ignore_errors())
    get_next = self.getNext(dataset)

    for x in [1., 2., 4.]:
      self.assertEqual((1. / x, x), self.evaluate(get_next()))
    with self.assertRaises(errors.OutOfRangeError):
      self.evaluate(get_next())
    def testParallelMapIgnoreError(self):
        components = np.array([1., 2., 3., np.nan, 5.]).astype(np.float32)

        dataset = (dataset_ops.Dataset.from_tensor_slices(components).map(
            lambda x: array_ops.check_numerics(x, "message"),
            num_parallel_calls=2).prefetch(2).apply(error_ops.ignore_errors()))
        get_next = self.getNext(dataset)

        for x in [1., 2., 3., 5.]:
            self.assertEqual(x, self.evaluate(get_next()))
        with self.assertRaises(errors.OutOfRangeError):
            self.evaluate(get_next())
Example #15
 def testBatchAndMapDatasetFails(self):
   """Test a dataset that maps a TF function across its input elements."""
   dataset = dataset_ops.Dataset.from_tensors(
       array_ops.check_numerics(
           constant_op.constant(1.0) / constant_op.constant(0.0), "oops"))
   batch_size = array_ops.placeholder(dtypes.int64, shape=[])
   iterator = (dataset.apply(batching.map_and_batch(lambda x: x, batch_size))
               .make_initializable_iterator())
   init_op = iterator.initializer
   with self.test_session() as sess:
     with self.assertRaisesRegexp(errors.InvalidArgumentError, "oops"):
       sess.run(init_op, feed_dict={batch_size: 14})
Example #17
    def testParallelMapUnspecifiedOutputSize(self):
        components = np.array([1., 2., 3., np.nan, 5.]).astype(np.float32)

        dataset = (dataset_ops.Dataset.from_tensor_slices(components).map(
            lambda x: array_ops.check_numerics(x, "message"), num_threads=2))
        iterator = dataset.make_initializable_iterator()
        init_op = iterator.initializer
        get_next = iterator.get_next()

        with self.test_session() as sess:
            sess.run(init_op)
            for _ in range(3):
                sess.run(get_next)
  def testWindowIgnoreErrors(self):
    input_values = np.float32([1., np.nan, 2., np.nan, 3.])
    dataset = dataset_ops.Dataset.from_tensor_slices(input_values).map(
        lambda x: array_ops.check_numerics(x, "message")).window(
            size=2, shift=2, stride=2,
            drop_remainder=True).flat_map(lambda x: x.batch(batch_size=2))
    get_next = dataset.make_one_shot_iterator().get_next()

    with self.cached_session() as sess:
      self.assertAllEqual(np.float32([1., 2.]), sess.run(get_next))
      self.assertAllEqual(np.float32([2., 3.]), sess.run(get_next))
      with self.assertRaises(errors.OutOfRangeError):
        sess.run(get_next)
  def testParallelMapIgnoreError(self):
    components = np.array([1., 2., 3., np.nan, 5.]).astype(np.float32)

    dataset = (
        dataset_ops.Dataset.from_tensor_slices(components).map(
            lambda x: array_ops.check_numerics(x, "message"),
            num_parallel_calls=2).prefetch(2).apply(error_ops.ignore_errors()))
    get_next = self.getNext(dataset)

    for x in [1., 2., 3., 5.]:
      self.assertEqual(x, self.evaluate(get_next()))
    with self.assertRaises(errors.OutOfRangeError):
      self.evaluate(get_next())
Example #20
    def testWindowIgnoreErrors(self):
        input_values = np.float32([1., np.nan, 2., np.nan, 3.])
        dataset = dataset_ops.Dataset.from_tensor_slices(input_values).map(
            lambda x: array_ops.check_numerics(x, "message")).window(
                size=2, shift=2, stride=2,
                drop_remainder=True).flat_map(lambda x: x.batch(batch_size=2))
        get_next = dataset.make_one_shot_iterator().get_next()

        with self.cached_session() as sess:
            self.assertAllEqual(np.float32([1., 2.]), self.evaluate(get_next))
            self.assertAllEqual(np.float32([2., 3.]), self.evaluate(get_next))
            with self.assertRaises(errors.OutOfRangeError):
                sess.run(get_next)
  def testMapAndBatchFails(self, numa_aware):
    """Test a dataset that maps a TF function across its input elements."""

    with self.assertRaisesRegexp(errors.InvalidArgumentError, "oops"):
      dataset = dataset_ops.Dataset.from_tensors(
          array_ops.check_numerics(
              constant_op.constant(1.0) / constant_op.constant(0.0), "oops"))
      dataset = dataset.apply(batching.map_and_batch(lambda x: x, 14))
      if numa_aware:
        options = dataset_ops.Options()
        options.experimental_numa_aware = True
        dataset = dataset.with_options(options)
      get_next = self.getNext(dataset)
      self.evaluate(get_next())
 def testIgnoreError_withLogWarning(self):
     components = np.array([1., 2., 3., np.nan, 5.]).astype(np.float32)
     dataset = (dataset_ops.Dataset.from_tensor_slices(components).map(
         lambda x: array_ops.check_numerics(x, "message")).apply(
             error_ops.ignore_errors(log_warning=True)))
     get_next = self.getNext(dataset)
     for x in [1., 2., 3.]:
         self.assertEqual(x, self.evaluate(get_next()))
     with self.captureWritesToStream(sys.stderr) as logged:
         self.assertEqual(5., self.evaluate(get_next()))
     expected = "Tensor had NaN values"
     self.assertIn((expected), logged.contents())
     with self.assertRaises(errors.OutOfRangeError):
         self.evaluate(get_next())
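
The same ignore-errors pattern is reachable through the public tf.data API. A short sketch, assuming TF 2.x eager defaults and the `tf.data.experimental.ignore_errors` transformation:

import numpy as np
import tensorflow as tf

components = np.array([1., 2., 3., np.nan, 5.], dtype=np.float32)
dataset = (tf.data.Dataset.from_tensor_slices(components)
           .map(lambda x: tf.debugging.check_numerics(x, "message"))
           .apply(tf.data.experimental.ignore_errors()))

# The NaN element is dropped instead of raising InvalidArgumentError.
print(list(dataset.as_numpy_iterator()))  # [1.0, 2.0, 3.0, 5.0]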
Example #23
  def testParallelMapUnspecifiedOutputSize(self):
    components = np.array([1., 2., 3., np.nan, 5.]).astype(np.float32)

    dataset = (dataset_ops.Dataset.from_tensor_slices(components)
               .map(lambda x: array_ops.check_numerics(x, "message"),
                    num_parallel_calls=2))
    iterator = dataset.make_initializable_iterator()
    init_op = iterator.initializer
    get_next = iterator.get_next()

    with self.cached_session() as sess:
      sess.run(init_op)
      for _ in range(3):
        sess.run(get_next)
Example #24
def add_check_numerics_ops():
  """Connect a `tf.debugging.check_numerics` to every floating point tensor.

  `check_numerics` operations themselves are added for each `half`, `float`,
  or `double` tensor in the current default graph. For all ops in the graph, the
  `check_numerics` op for all of its (`half`, `float`, or `double`) inputs
  is guaranteed to run before the `check_numerics` op on any of its outputs.

  Note: This API is not compatible with the use of `tf.cond` or
  `tf.while_loop`, and will raise a `ValueError` if you attempt to call it
  in such a graph.

  Returns:
    A `group` op depending on all `check_numerics` ops added.

  Raises:
    ValueError: If the graph contains any numeric operations in a control flow
      structure.
    RuntimeError: If called with eager execution enabled.

  @compatibility(eager)
  Not compatible with eager execution. To check for `Inf`s and `NaN`s under
  eager execution, call `tfe.seterr(inf_or_nan='raise')` once before executing
  the checked operations.
  @end_compatibility
  """
  if context.executing_eagerly():
    raise RuntimeError(
        "add_check_numerics_ops() is not compatible with eager execution. "
        "To check for Inf's and NaN's under eager execution, call "
        "tfe.seterr(inf_or_nan='raise') once before executing the "
        "checked operations.")

  check_op = []
  # This code relies on the ordering of ops in get_operations().
  # The producer of a tensor always comes before that tensor's consumer in
  # this list. This is true because get_operations() returns ops in the order
  # added, and an op can only be added after its inputs are added.
  for op in ops.get_default_graph().get_operations():
    for output in op.outputs:
      if output.dtype in [dtypes.float16, dtypes.float32, dtypes.float64]:
        if op._get_control_flow_context() is not None:  # pylint: disable=protected-access
          raise ValueError("`tf.add_check_numerics_ops() is not compatible "
                           "with TensorFlow control flow operations such as "
                           "`tf.cond()` or `tf.while_loop()`.")

        message = op.name + ":" + str(output.value_index)
        with ops.control_dependencies(check_op):
          check_op = [array_ops.check_numerics(output, message=message)]
  return control_flow_ops.group(*check_op)
  def testMapAndBatchFails(self, numa_aware):
    """Test a dataset that maps a TF function across its input elements."""

    with self.assertRaisesRegexp(errors.InvalidArgumentError, "oops"):
      dataset = dataset_ops.Dataset.from_tensors(
          array_ops.check_numerics(
              constant_op.constant(1.0) / constant_op.constant(0.0), "oops"))
      dataset = dataset.apply(batching.map_and_batch(lambda x: x, 14))
      if numa_aware:
        options = dataset_ops.Options()
        options.experimental_numa_aware = True
        dataset = dataset.with_options(options)
      get_next = self.getNext(dataset)
      self.evaluate(get_next())
    def _model_fn(features, labels, mode, config, params=None):
        """model_fn."""

        # TODO(jhseu): Move EVAL and PREDICT to TPU.
        if mode != model_fn_lib.ModeKeys.TRAIN:
            return _call_model_fn_without_tpu(model_fn, features, labels, mode,
                                              config, params)

        # Now for TPU training.
        if params is not None and _BATCH_SIZE_KEY in params:
            params[_BATCH_SIZE_KEY] //= config.tpu_config.num_shards

        assert isinstance(features, _PerShardOutput)
        features = features.as_list()
        if labels is not None:
            assert isinstance(labels, _PerShardOutput)
            labels = labels.as_list()

        dequeue_fn, enqueue_fn = (_create_infeed_enqueue_ops_and_dequeue_fn(
            config, features, labels))

        loss = _train_on_tpu_shards(config,
                                    train_step=_convert_model_fn_to_train_step(
                                        model_fn, dequeue_fn, mode, config,
                                        params))

        # Gets the variables back from TPU nodes. This means the variables updated
        # by TPU will now be *synced* to host memory.
        update_ops = [
            array_ops.check_numerics(v.read_value(),
                                     'Gradient for %s is NaN' % v.name).op
            for v in variables.trainable_variables()
        ]

        hooks = [
            TpuInfeedSessionHook(config, enqueue_fn),
            training.LoggingTensorHook(
                {
                    'loss': array_ops.identity(loss),
                    'step': training.get_global_step()
                },
                every_n_secs=30)
        ]

        return model_fn_lib.EstimatorSpec(
            mode,
            loss=array_ops.identity(loss),
            training_hooks=hooks,
            train_op=control_flow_ops.group(*update_ops))
Example #27
    def testMapIgnoreError(self):
        components = np.array([1., 2., 3., np.nan, 5.]).astype(np.float32)

        dataset = (dataset_ops.Dataset.from_tensor_slices(components).map(
            lambda x: array_ops.check_numerics(x, "message")).ignore_errors())
        iterator = dataset.make_initializable_iterator()
        init_op = iterator.initializer
        get_next = iterator.get_next()

        with self.test_session() as sess:
            sess.run(init_op)
            for x in [1., 2., 3., 5.]:
                self.assertEqual(x, sess.run(get_next))
            with self.assertRaises(errors.OutOfRangeError):
                sess.run(get_next)
Example #28
def _VerifyTensor(t, name, msg):
    """Assert that the tensor does not contain any NaN's.

  Args:
    t: Tensor
    name: name
    msg: message to log
  Returns:
    Tensor, but verified
  """
    with ops.name_scope(name):
        with ops.device(t.device or ops.get_default_graph().get_default_device()):
            verify_input = array_ops.check_numerics(t, message=msg)
            out = control_flow_ops.with_dependencies([verify_input], t)
    return out
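
`_VerifyTensor` is module-private, but the same guard can be written with public ops only. A sketch of the pattern (the `verify_tensor` helper name is hypothetical; assumes graph mode via `tf.compat.v1`):

import tensorflow.compat.v1 as tf

tf.disable_eager_execution()

def verify_tensor(t, name, msg):
    # Hypothetical helper mirroring _VerifyTensor: run check_numerics first,
    # then return the original tensor behind a control dependency.
    with tf.name_scope(name):
        check = tf.debugging.check_numerics(t, message=msg)
        with tf.control_dependencies([check]):
            return tf.identity(t)

x = tf.placeholder(tf.float32, shape=[2])
y = verify_tensor(x, "verify_x", "x has NaN or Inf")
with tf.Session() as sess:
    print(sess.run(y, feed_dict={x: [1.0, 2.0]}))  # passes through unchanged
    # feeding [1.0, float("nan")] would raise InvalidArgumentError instead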
Example #29
def _VerifyTensor(t, name, msg):
    """Assert that the tensor does not contain any NaN's.

  Args:
    t: Tensor
    name: name
    msg: message to log
  Returns:
    Tensor, but verified
  """
    with ops.name_scope(name):
        with ops.device(t.device
                        or ops.get_default_graph().get_default_device()):
            verify_input = array_ops.check_numerics(t, message=msg)
            out = control_flow_ops.with_dependencies([verify_input], t)
    return out
    def testParallelMapIgnoreError(self):
        components = np.array([1., 2., 3., np.nan, 5.]).astype(np.float32)

        dataset = (dataset_ops.Dataset.from_tensor_slices(components).map(
            lambda x: array_ops.check_numerics(x, "message"),
            num_parallel_calls=2).prefetch(2).apply(error_ops.ignore_errors()))
        iterator = dataset.make_initializable_iterator()
        init_op = iterator.initializer
        get_next = iterator.get_next()

        with self.cached_session() as sess:
            self.evaluate(init_op)
            for x in [1., 2., 3., 5.]:
                self.assertEqual(x, self.evaluate(get_next))
            with self.assertRaises(errors.OutOfRangeError):
                self.evaluate(get_next)
Example #31
  def testParallelMapError(self):
    components = np.array([1., 2., 3., np.nan, 5.]).astype(np.float32)

    dataset = (dataset_ops.Dataset.from_tensor_slices(components)
               .map(lambda x: array_ops.check_numerics(x, "message"),
                    num_parallel_calls=2))
    get_next = self.getNext(dataset)

    for _ in range(3):
      self.evaluate(get_next())
    # The 4th element is NaN, so `array_ops.check_numerics()` should fail.
    with self.assertRaises(errors.InvalidArgumentError):
      self.evaluate(get_next())
    self.evaluate(get_next())
    with self.assertRaises(errors.OutOfRangeError):
      self.evaluate(get_next())
Example #32
    def testPrefetchError(self):
        components = np.array([1., 2., 3., np.nan, 5.]).astype(np.float32)

        dataset = (dataset_ops.Dataset.from_tensor_slices(components).map(
            lambda x: array_ops.check_numerics(x, "message")).prefetch(2))

        get_next = self.getNext(dataset)

        for _ in range(3):
            self.evaluate(get_next())
        # The 4th element is NaN, so `array_ops.check_numerics()` should fail.
        with self.assertRaises(errors.InvalidArgumentError):
            self.evaluate(get_next())
        self.evaluate(get_next())
        with self.assertRaises(errors.OutOfRangeError):
            self.evaluate(get_next())
  def testInterleaveDatasetError(self, input_values, cycle_length, block_length,
                                 num_parallel_calls):
    dataset = dataset_ops.Dataset.from_tensor_slices(input_values).map(
        lambda x: array_ops.check_numerics(x, "message")).interleave(
            dataset_ops.Dataset.from_tensors, cycle_length, block_length,
            num_parallel_calls)
    get_next = self.getNext(dataset)

    for value in input_values:
      if np.isnan(value):
        with self.assertRaises(errors.InvalidArgumentError):
          self.evaluate(get_next())
      else:
        self.assertEqual(value, self.evaluate(get_next()))
    with self.assertRaises(errors.OutOfRangeError):
      self.evaluate(get_next())
    def testInterleaveDatasetError(self, input_values, cycle_length,
                                   block_length, num_parallel_calls):
        dataset = dataset_ops.Dataset.from_tensor_slices(input_values).map(
            lambda x: array_ops.check_numerics(x, "message")).interleave(
                dataset_ops.Dataset.from_tensors, cycle_length, block_length,
                num_parallel_calls)
        get_next = self.getNext(dataset)

        for value in input_values:
            if np.isnan(value):
                with self.assertRaises(errors.InvalidArgumentError):
                    self.evaluate(get_next())
            else:
                self.assertEqual(value, self.evaluate(get_next()))
        with self.assertRaises(errors.OutOfRangeError):
            self.evaluate(get_next())
  def testMapIgnoreError(self):
    components = np.array([1., 2., 3., np.nan, 5.]).astype(np.float32)

    dataset = (dataset_ops.Dataset.from_tensor_slices(components)
               .map(lambda x: array_ops.check_numerics(x, "message"))
               .ignore_errors())
    iterator = dataset.make_initializable_iterator()
    init_op = iterator.initializer
    get_next = iterator.get_next()

    with self.test_session() as sess:
      sess.run(init_op)
      for x in [1., 2., 3., 5.]:
        self.assertEqual(x, sess.run(get_next))
      with self.assertRaises(errors.OutOfRangeError):
        sess.run(get_next)
Example #36
  def _model_fn(features, labels, mode, config, params=None):
    """model_fn."""

    # TODO(jhseu): Move EVAL and PREDICT to TPU.
    if mode != model_fn_lib.ModeKeys.TRAIN:
      return _call_model_fn_without_tpu(
          model_fn, features, labels, mode, config, params)

    # Now for TPU training.
    if params is not None and _BATCH_SIZE_KEY in params:
      params[_BATCH_SIZE_KEY] //= config.tpu_config.num_shards

    assert isinstance(features, _PerShardOutput)
    features = features.as_list()
    if labels is not None:
      assert isinstance(labels, _PerShardOutput)
      labels = labels.as_list()

    dequeue_fn, enqueue_fn = (
        _create_infeed_enqueue_ops_and_dequeue_fn(config, features, labels))

    loss = _train_on_tpu_shards(
        config,
        train_step=_convert_model_fn_to_train_step(
            model_fn, dequeue_fn, mode, config, params))

    # Gets the variables back from TPU nodes. This means the variables updated
    # by TPU will now be *synced* to host memory.
    update_ops = [
        array_ops.check_numerics(v.read_value(),
                                 'Gradient for %s is NaN' % v.name).op
        for v in variables.trainable_variables()
    ]

    hooks = [
        TpuInfeedSessionHook(config, enqueue_fn),
        training.LoggingTensorHook(
            {'loss': array_ops.identity(loss),
             'step': training.get_global_step()},
            every_n_secs=30)
    ]

    return model_fn_lib.EstimatorSpec(
        mode,
        loss=array_ops.identity(loss),
        training_hooks=hooks,
        train_op=control_flow_ops.group(*update_ops))
Example #37
def verify_tensor_all_finite_v2(x, message, name=None):
  """Assert that the tensor does not contain any NaN's or Inf's.

  Args:
    x: Tensor to check.
    message: Message to log on failure.
    name: A name for this operation (optional).

  Returns:
    Same tensor as `x`.
  """
  with ops.name_scope(name, "VerifyFinite", [x]) as name:
    x = ops.convert_to_tensor(x, name="x")
    with ops.colocate_with(x):
      verify_input = array_ops.check_numerics(x, message=message)
      out = control_flow_ops.with_dependencies([verify_input], x)
  return out
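
A usage sketch for the v2 helper, assuming it is reachable through the public alias `tf.debugging.assert_all_finite` in TF 2.x:

import tensorflow as tf

x = tf.constant([1.0, 2.0, 3.0])
y = tf.debugging.assert_all_finite(x, message="x must be finite")  # returns x unchanged
print(y.numpy())

try:
    tf.debugging.assert_all_finite(tf.constant([1.0, float("inf")]), message="bad input")
except tf.errors.InvalidArgumentError as e:
    print("caught:", e.message)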
  def testInterleaveDatasetError(self, input_values, cycle_length, block_length,
                                 num_parallel_calls):
    dataset = dataset_ops.Dataset.from_tensor_slices(input_values).map(
        lambda x: array_ops.check_numerics(x, "message")).interleave(
            dataset_ops.Dataset.from_tensors, cycle_length, block_length,
            num_parallel_calls)
    get_next = dataset.make_one_shot_iterator().get_next()

    with self.cached_session() as sess:
      for value in input_values:
        if np.isnan(value):
          with self.assertRaises(errors.InvalidArgumentError):
            sess.run(get_next)
        else:
          self.assertEqual(value, sess.run(get_next))
      with self.assertRaises(errors.OutOfRangeError):
        sess.run(get_next)
  def testParallelMapIgnoreError(self):
    components = np.array([1., 2., 3., np.nan, 5.]).astype(np.float32)

    dataset = (
        dataset_ops.Dataset.from_tensor_slices(components).map(
            lambda x: array_ops.check_numerics(x, "message"),
            num_parallel_calls=2).prefetch(2).apply(error_ops.ignore_errors()))
    iterator = dataset_ops.make_initializable_iterator(dataset)
    init_op = iterator.initializer
    get_next = iterator.get_next()

    with self.cached_session() as sess:
      self.evaluate(init_op)
      for x in [1., 2., 3., 5.]:
        self.assertEqual(x, self.evaluate(get_next))
      with self.assertRaises(errors.OutOfRangeError):
        self.evaluate(get_next)
  def testMapAndBatchFails(self, numa_aware):
    """Test a dataset that maps a TF function across its input elements."""
    dataset = dataset_ops.Dataset.from_tensors(
        array_ops.check_numerics(
            constant_op.constant(1.0) / constant_op.constant(0.0), "oops"))
    batch_size = array_ops.placeholder(dtypes.int64, shape=[])
    dataset = dataset.apply(batching.map_and_batch(lambda x: x, batch_size))
    if numa_aware:
      options = dataset_ops.Options()
      options.experimental_numa_aware = True
      dataset = dataset.with_options(options)
    iterator = dataset.make_initializable_iterator()

    init_op = iterator.initializer
    with self.cached_session() as sess:
      with self.assertRaisesRegexp(errors.InvalidArgumentError, "oops"):
        sess.run(init_op, feed_dict={batch_size: 14})
Example #41
def verify_tensor_all_finite(t, msg, name=None):
  """Assert that the tensor does not contain any NaN's or Inf's.

  Args:
    t: Tensor to check.
    msg: Message to log on failure.
    name: A name for this operation (optional).

  Returns:
    Same tensor as `t`.
  """
  with ops.op_scope([t], name, "VerifyFinite") as name:
    t = ops.convert_to_tensor(t, name="t")
    with ops.device(t.device):
      verify_input = array_ops.check_numerics(t, message=msg)
      out = control_flow_ops.with_dependencies([verify_input], t)
  return out
Example #42
def verify_tensor_all_finite_v2(x, message, name=None):
    """Assert that the tensor does not contain any NaN's or Inf's.

  Args:
    x: Tensor to check.
    message: Message to log on failure.
    name: A name for this operation (optional).

  Returns:
    Same tensor as `x`.
  """
    with ops.name_scope(name, "VerifyFinite", [x]) as name:
        x = ops.convert_to_tensor(x, name="x")
        with ops.colocate_with(x):
            verify_input = array_ops.check_numerics(x, message=message)
            out = control_flow_ops.with_dependencies([verify_input], x)
    return out
Example #43
    def testInterleaveDatasetError(self, input_values, cycle_length,
                                   block_length, num_parallel_calls):
        dataset = dataset_ops.Dataset.from_tensor_slices(input_values).map(
            lambda x: array_ops.check_numerics(x, "message")).interleave(
                dataset_ops.Dataset.from_tensors, cycle_length, block_length,
                num_parallel_calls)
        get_next = dataset.make_one_shot_iterator().get_next()

        with self.cached_session() as sess:
            for value in input_values:
                if np.isnan(value):
                    with self.assertRaises(errors.InvalidArgumentError):
                        sess.run(get_next)
                else:
                    self.assertEqual(value, sess.run(get_next))
            with self.assertRaises(errors.OutOfRangeError):
                sess.run(get_next)
  def testMapAndBatchFails(self, numa_aware):
    """Test a dataset that maps a TF function across its input elements."""
    dataset = dataset_ops.Dataset.from_tensors(
        array_ops.check_numerics(
            constant_op.constant(1.0) / constant_op.constant(0.0), "oops"))
    batch_size = array_ops.placeholder(dtypes.int64, shape=[])
    dataset = dataset.apply(batching.map_and_batch(lambda x: x, batch_size))
    if numa_aware:
      options = dataset_ops.Options()
      options.experimental_numa_aware = True
      dataset = dataset.with_options(options)
    iterator = dataset_ops.make_initializable_iterator(dataset)

    init_op = iterator.initializer
    with self.cached_session() as sess:
      with self.assertRaisesRegexp(errors.InvalidArgumentError, "oops"):
        sess.run(init_op, feed_dict={batch_size: 14})
Example #45
def verify_tensor_all_finite(t, msg, name=None):
  """Assert that the tensor does not contain any NaN's or Inf's.

  Args:
    t: Tensor to check.
    msg: Message to log on failure.
    name: A name for this operation (optional).

  Returns:
    Same tensor as `t`.
  """
  with ops.op_scope([t], name, "VerifyFinite") as name:
    t = ops.convert_to_tensor(t, name="t")
    with ops.device(t.device or t.graph.get_default_device()):
      verify_input = array_ops.check_numerics(t, message=msg)
      out = control_flow_ops.with_dependencies([verify_input], t)
  return out
Example #46
    def testPrefetchError(self):
        components = np.array([1., 2., 3., np.nan, 5.]).astype(np.float32)

        dataset = (dataset_ops.Dataset.from_tensor_slices(components).map(
            lambda x: array_ops.check_numerics(x, "message")).prefetch(2))
        iterator = dataset.make_initializable_iterator()
        init_op = iterator.initializer
        get_next = iterator.get_next()

        with self.test_session() as sess:
            sess.run(init_op)
            for _ in range(3):
                sess.run(get_next)
            # The 4th element is NaN, so `array_ops.check_numerics()` should fail.
            with self.assertRaises(errors.InvalidArgumentError):
                sess.run(get_next)
            sess.run(get_next)
            with self.assertRaises(errors.OutOfRangeError):
                sess.run(get_next)
  def testParallelMapError(self):
    components = [np.array([1., 2., 3., np.nan, 5.]).astype(np.float32)]

    dataset = (dataset_ops.Dataset.from_tensor_slices(components)
               .map(lambda x: array_ops.check_numerics(x, "message")))
    iterator = dataset.make_initializable_iterator()
    init_op = iterator.initializer
    get_next = iterator.get_next()

    with self.test_session() as sess:
      sess.run(init_op)
      for _ in range(3):
        sess.run(get_next)
      # The 4th element is NaN, so `array_ops.check_numerics()` should fail.
      with self.assertRaises(errors.InvalidArgumentError):
        sess.run(get_next)
      sess.run(get_next)
      with self.assertRaises(errors.OutOfRangeError):
        sess.run(get_next)
Example #48
    def _model_fn(features, labels, mode, config, params):
        """A Estimator `model_fn` for TPUEstimator."""
        model_fn_wrapper = _ModelFnWrapper(model_fn, config, params, mode,
                                           train_batch_size)

        # TODO(jhseu): Move EVAL and PREDICT to TPU.
        if not use_tpu or mode != model_fn_lib.ModeKeys.TRAIN:
            return model_fn_wrapper.call_without_tpu(features, labels)

        inputs = _InputsHolder(features=features,
                               labels=labels,
                               num_shards=config.tpu_config.num_shards)

        dequeue_fn, enqueue_fn = _create_infeed_enqueue_ops_and_dequeue_fn(
            inputs, config)

        loss = _train_on_tpu_system(model_fn_wrapper, dequeue_fn)

        # Gets the variables back from TPU nodes. This means the variables updated
        # by TPU will now be *synced* to host memory.
        update_ops = [
            array_ops.check_numerics(v.read_value(),
                                     'Gradient for %s is NaN' % v.name).op
            for v in variables.trainable_variables()
        ]

        hooks = [
            TPUInfeedSessionHook(config, enqueue_fn),
            training.LoggingTensorHook(
                {
                    'loss': array_ops.identity(loss),
                    'step': training.get_global_step()
                },
                every_n_secs=30)
        ]

        return model_fn_lib.EstimatorSpec(
            mode,
            loss=array_ops.identity(loss),
            training_hooks=hooks,
            train_op=control_flow_ops.group(*update_ops))
Example #49
    def testParallelMapUnspecifiedThreads(self):
        components = np.array([1., 2., 3., np.nan, 5.]).astype(np.float32)

        with warnings.catch_warnings(record=True) as w:
            dataset = (dataset_ops.Dataset.from_tensor_slices(components).map(
                lambda x: array_ops.check_numerics(x, "message"),
                output_buffer_size=2))
            self.assertTrue(len(w) >= 1)
            self.assertTrue((
                "Dataset.map() is ignoring output_buffer_size since the argument "
                "num_threads was not set. To buffer elements, set num_threads >= 1"
            ) in [str(x.message) for x in w])

        iterator = dataset.make_initializable_iterator()
        init_op = iterator.initializer
        get_next = iterator.get_next()

        with self.test_session() as sess:
            sess.run(init_op)
            for _ in range(3):
                sess.run(get_next)
Example #50
  def _model_fn(features, labels, mode, config, params):
    """A Estimator `model_fn` for TPUEstimator."""
    model_fn_wrapper = _ModelFnWrapper(model_fn, config, params, mode,
                                       train_batch_size)

    # TODO(jhseu): Move EVAL and PREDICT to TPU.
    if not use_tpu or mode != model_fn_lib.ModeKeys.TRAIN:
      return model_fn_wrapper.call_without_tpu(features, labels)

    inputs = _InputsHolder(features=features, labels=labels,
                           num_shards=config.tpu_config.num_shards)

    dequeue_fn, enqueue_fn = _create_infeed_enqueue_ops_and_dequeue_fn(
        inputs, config)

    loss = _train_on_tpu_system(model_fn_wrapper, dequeue_fn)

    # Gets the variables back from TPU nodes. This means the variables updated
    # by TPU will now be *synced* to host memory.
    update_ops = [
        array_ops.check_numerics(v.read_value(),
                                 'Gradient for %s is NaN' % v.name).op
        for v in variables.trainable_variables()
    ]

    hooks = [
        TPUInfeedSessionHook(config, enqueue_fn),
        training.LoggingTensorHook(
            {'loss': array_ops.identity(loss),
             'step': training.get_global_step()},
            every_n_secs=30)
    ]

    return model_fn_lib.EstimatorSpec(
        mode,
        loss=array_ops.identity(loss),
        training_hooks=hooks,
        train_op=control_flow_ops.group(*update_ops))
Example #51
def add_check_numerics_ops():
  """Connect a `check_numerics` to every floating point tensor.

  `check_numerics` operations themselves are added for each `float` or `double`
  tensor in the graph. For all ops in the graph, the `check_numerics` op for
  all of its (`float` or `double`) inputs is guaranteed to run before the
  `check_numerics` op on any of its outputs.

  Returns:
    A `group` op depending on all `check_numerics` ops added.
  """
  check_op = []
  # This code relies on the ordering of ops in get_operations().
  # The producer of a tensor always comes before that tensor's consumer in
  # this list. This is true because get_operations() returns ops in the order
  # added, and an op can only be added after its inputs are added.
  for op in ops.get_default_graph().get_operations():
    for output in op.outputs:
      if output.dtype in [dtypes.float32, dtypes.float64]:
        message = op.name + ":" + str(output.value_index)
        with ops.control_dependencies(check_op):
          check_op = [array_ops.check_numerics(output, message=message)]
  return control_flow_ops.group(*check_op)
Example #52
    def _model_fn(features, labels, mode):
        """model_fn."""
        # TODO(jhseu): Move EVAL and PREDICT to TPU.
        if mode != model_fn_lib.ModeKeys.TRAIN:
            return model_fn(features, labels, mode)

        dequeue_fn, enqueue_fn = (_create_infeed_enqueue_ops_and_dequeue_fn(
            run_config, features, labels))

        loss = _train_on_tpu_shards(run_config,
                                    train_step=_convert_model_fn_to_train_step(
                                        model_fn, dequeue_fn, mode,
                                        run_config))

        # Gets the variables back from TPU nodes. This means the variables updated
        # by TPU will now be *synced* to host memory.
        update_ops = [
            array_ops.check_numerics(v.read_value(),
                                     'Gradient for %s is NaN' % v.name).op
            for v in variables.trainable_variables()
        ]

        hooks = [
            TpuInfeedSessionHook(run_config, enqueue_fn),
            training.LoggingTensorHook(
                {
                    'loss': array_ops.identity(loss),
                    'step': training.get_global_step()
                },
                every_n_secs=30)
        ]

        return model_fn_lib.EstimatorSpec(
            mode,
            loss=array_ops.identity(loss),
            training_hooks=hooks,
            train_op=control_flow_ops.group(*update_ops))
Example #53
def add_check_numerics_ops():
  """Connect a `check_numerics` to every floating point tensor.

  `check_numerics` operations themselves are added for each `half`, `float`,
  or `double` tensor in the graph. For all ops in the graph, the
  `check_numerics` op for all of its (`half`, `float`, or `double`) inputs
  is guaranteed to run before the `check_numerics` op on any of its outputs.

  Note: This API is not compatible with the use of @{tf.cond} or
  @{tf.while_loop}, and will raise a `ValueError` if you attempt to call it
  in such a graph.

  Returns:
    A `group` op depending on all `check_numerics` ops added.

  Raises:
    ValueError: If the graph contains any numeric operations in a control flow
      structure.
  """
  check_op = []
  # This code relies on the ordering of ops in get_operations().
  # The producer of a tensor always comes before that tensor's consumer in
  # this list. This is true because get_operations() returns ops in the order
  # added, and an op can only be added after its inputs are added.
  for op in ops.get_default_graph().get_operations():
    for output in op.outputs:
      if output.dtype in [dtypes.float16, dtypes.float32, dtypes.float64]:
        if op._get_control_flow_context() is not None:  # pylint: disable=protected-access
          raise ValueError("`tf.add_check_numerics_ops() is not compatible "
                           "with TensorFlow control flow operations such as "
                           "`tf.cond()` or `tf.while_loop()`.")

        message = op.name + ":" + str(output.value_index)
        with ops.control_dependencies(check_op):
          check_op = [array_ops.check_numerics(output, message=message)]
  return control_flow_ops.group(*check_op)
Example #54
def optimize_loss(loss,
                  global_step,
                  learning_rate,
                  optimizer,
                  clip_gradients=None,
                  moving_average_decay=0.9,
                  learning_rate_decay_fn=None,
                  variables=None):
  """Given loss and parameters for optimizer, returns a training op.

  Args:
    loss: Tensor, 0 dimensional.
    global_step: Tensor, step counter for each update.
    learning_rate: float or Tensor, magnitude of update per each training step.
    optimizer: string, class or optimizer instance, used as trainer.
               string should be name of optimizer, like 'SGD',
                 'Adam', 'Adagrad'. Full list in OPTIMIZER_CLS_NAMES constant.
               class should be sub-class of tf.Optimizer that implements
                 `compute_gradients` and `apply_gradients` functions.
               optimizer instance should be instantiation of tf.Optimizer sub-class
                 and have `compute_gradients` and `apply_gradients` functions.
    clip_gradients: float or None, clips gradients by this value.
    moving_average_decay: float or None, takes into account previous loss
                          to make learning smoother due to outliers.
    learning_rate_decay_fn: function, takes learning_rate and global_step
                            Tensors, returns Tensor. Can be used to implement
                            any learning rate decay functions.
                            For example: tf.train.exponential_decay.
    variables: list of variables to optimize, or None.

  Returns:
    Training op.

  Raises:
    ValueError: if optimizer is wrong type.
  """
  # Moving average of the loss with decay.
  if moving_average_decay is not None:
    # Generate moving averages of the loss.
    loss_averages = train.ExponentialMovingAverage(moving_average_decay,
                                                   name="avg")
    loss_averages_op = loss_averages.apply([loss])
    logging_ops.scalar_summary("loss/mean", loss_averages.average(loss))
    loss = control_flow_ops.with_dependencies([loss_averages_op], loss)

  # Learning rate variable, with possible decay.
  if isinstance(learning_rate, ops.Tensor) and len(learning_rate.get_shape()) == 0:
    lr = learning_rate
  elif isinstance(learning_rate, float):
    lr = vs.get_variable("learning_rate",
                         [],
                         trainable=False,
                         initializer=init_ops.constant_initializer(learning_rate))
  else:
    raise ValueError("Learning rate should be 0d Tensor or float. Got %s" %
        str(learning_rate))
  if learning_rate_decay_fn is not None:
    lr = learning_rate_decay_fn(lr, global_step)

  # Create optimizer, given specified parameters.
  if isinstance(optimizer, six.string_types):
    if optimizer not in OPTIMIZER_CLS_NAMES:
      raise ValueError("Optimizer name should be one of [%s], you provided %s."
                       % (", ".join(OPTIMIZER_CLS_NAMES), optimizer))
    opt = OPTIMIZER_CLS_NAMES[optimizer](learning_rate=lr)
  elif isinstance(optimizer, type) and issubclass(optimizer,
                                                  optimizer_.Optimizer):
    opt = optimizer(learning_rate=lr)
  elif isinstance(optimizer, optimizer_.Optimizer):
    opt = optimizer
  else:
    raise ValueError("Unrecognized optimizer: should be string, "
                     "subclass of Optimizer or instance of "
                     "subclass of Optimizer. Got %s." % str(optimizer))

  # All trainable variables, if specific variables are not specified.
  if variables is None:
    variables = vars_.trainable_variables()

  # Compute gradients and clip them if provided.
  gradients = opt.compute_gradients(loss, variables)
  if clip_gradients is not None:
    gradients, variables = zip(*gradients)
    clipped_gradients, _ = clip_ops.clip_by_global_norm(gradients,
                                                        clip_gradients)
    gradients = list(zip(clipped_gradients, variables))

  # Add scalar summary for loss.
  logging_ops.scalar_summary("loss", loss)

  # Add histograms for variables, gradients and gradient norms.
  for gradient, variable in gradients:
    if isinstance(gradient, ops.IndexedSlices):
      grad_values = gradient.values
    else:
      grad_values = gradient

    if grad_values is not None:
      logging_ops.histogram_summary(variable.name, variable)
      logging_ops.histogram_summary(variable.name + "/gradients", grad_values)
      logging_ops.histogram_summary(variable.name + "/gradient_norm",
                                    clip_ops.global_norm([grad_values]))

  # Create gradient updates.
  grad_updates = opt.apply_gradients(gradients,
                                     global_step=global_step,
                                     name="train")
  # Make sure total_loss is valid.
  final_loss = array_ops.check_numerics(loss, "Loss is inf or nan")

  # Ensure the train_tensor computes grad_updates.
  train_tensor = control_flow_ops.with_dependencies([grad_updates], final_loss)

  return train_tensor
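
A usage sketch for `optimize_loss`, assuming a TensorFlow 1.x install where the helper was importable as `tf.contrib.layers.optimize_loss` (that import path is an assumption for illustration). Because the returned train tensor is wrapped in `check_numerics`, a non-finite loss fails with "Loss is inf or nan" instead of silently corrupting training:

import tensorflow as tf  # assumes TF 1.x

x = tf.placeholder(tf.float32, shape=[None, 3])
y = tf.placeholder(tf.float32, shape=[None, 1])
w = tf.get_variable("w", [3, 1])
loss = tf.reduce_mean(tf.square(tf.matmul(x, w) - y))
global_step = tf.train.get_or_create_global_step()

# Import path is an assumption; the arguments follow the definition above.
train_op = tf.contrib.layers.optimize_loss(
    loss, global_step, learning_rate=0.1, optimizer="SGD", clip_gradients=5.0)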
Example #55
def _CheckNumericsGrad(op, grad):
  """Gradient for check_numerics op."""
  return array_ops.check_numerics(
      grad,
      "Not a number (NaN) or infinity (Inf) values detected in gradient. %s" %
      op.get_attr("message"))
Example #56
def _CheckNumericsGrad(_, grad):
  """Gradient for check_numerics op."""
  return array_ops.check_numerics(
      grad, "Not a number (NaN) or infinity (Inf) values detected in gradient.")
Example #57
def create_train_op(total_loss,
                    optimizer,
                    global_step=_USE_GLOBAL_STEP,
                    update_ops=None,
                    variables_to_train=None,
                    transform_grads_fn=None,
                    summarize_gradients=False,
                    gate_gradients=tf_optimizer.Optimizer.GATE_OP,
                    aggregation_method=None,
                    colocate_gradients_with_ops=False,
                    check_numerics=True):
  """Creates an `Operation` that evaluates the gradients and returns the loss.

  Args:
    total_loss: A `Tensor` representing the total loss.
    optimizer: A tf.Optimizer to use for computing the gradients.
    global_step: A `Tensor` representing the global step variable. If left as
      `_USE_GLOBAL_STEP`, then tf.contrib.framework.global_step() is used.
    update_ops: An optional list of updates to execute. If `update_ops` is
      `None`, then the update ops are set to the contents of the
      `tf.GraphKeys.UPDATE_OPS` collection. If `update_ops` is not `None`, but
      it doesn't contain all of the update ops in `tf.GraphKeys.UPDATE_OPS`,
      a warning will be displayed.
    variables_to_train: an optional list of variables to train. If None, it will
      default to all tf.trainable_variables().
    transform_grads_fn: A function which takes a single argument, a list of
      gradient to variable pairs (tuples), performs any requested gradient
      updates, such as gradient clipping or multipliers, and returns the updated
      list.
    summarize_gradients: Whether or not add summaries for each gradient.
    gate_gradients: How to gate the computation of gradients. See tf.Optimizer.
    aggregation_method: Specifies the method used to combine gradient terms.
      Valid values are defined in the class `AggregationMethod`.
    colocate_gradients_with_ops: Whether or not to try colocating the gradients
      with the ops that generated them.
    check_numerics: Whether or not we apply check_numerics.

  Returns:
    A `Tensor` that when evaluated, computes the gradients and returns the total
      loss value.
  """
  if global_step is _USE_GLOBAL_STEP:
    global_step = training_util.get_or_create_global_step()

  # Update ops use GraphKeys.UPDATE_OPS collection if update_ops is None.
  global_update_ops = set(ops.get_collection(ops.GraphKeys.UPDATE_OPS))
  if update_ops is None:
    update_ops = global_update_ops
  else:
    update_ops = set(update_ops)
  if not global_update_ops.issubset(update_ops):
    logging.warning('update_ops in create_train_op does not contain all the '
                    'update_ops in GraphKeys.UPDATE_OPS')

  # Make sure update_ops are computed before total_loss.
  if update_ops:
    with ops.control_dependencies(update_ops):
      barrier = control_flow_ops.no_op(name='update_barrier')
    total_loss = control_flow_ops.with_dependencies([barrier], total_loss)

  if variables_to_train is None:
    # Default to tf.trainable_variables()
    variables_to_train = tf_variables.trainable_variables()
  else:
    # Make sure that variables_to_train are in tf.trainable_variables()
    for v in variables_to_train:
      assert v.trainable or v in tf_variables.trainable_variables()

  assert variables_to_train

  # Create the gradients. Note that apply_gradients adds the gradient
  # computation to the current graph.
  grads = optimizer.compute_gradients(
      total_loss,
      variables_to_train,
      gate_gradients=gate_gradients,
      aggregation_method=aggregation_method,
      colocate_gradients_with_ops=colocate_gradients_with_ops)

  if transform_grads_fn:
    grads = transform_grads_fn(grads)

  # Summarize gradients.
  if summarize_gradients:
    with ops.name_scope('summarize_grads'):
      add_gradients_summaries(grads)

  # Create gradient updates.
  grad_updates = optimizer.apply_gradients(grads, global_step=global_step)

  with ops.name_scope('train_op'):
    # Make sure total_loss is valid.
    if check_numerics:
      total_loss = array_ops.check_numerics(total_loss,
                                            'LossTensor is inf or nan')

    # Ensure the train_tensor computes grad_updates.
    train_op = control_flow_ops.with_dependencies([grad_updates], total_loss)

  # Add the operation used for training to the 'train_op' collection
  train_ops = ops.get_collection_ref(ops.GraphKeys.TRAIN_OP)
  if train_op not in train_ops:
    train_ops.append(train_op)

  return train_op
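
A usage sketch for `create_train_op`, assuming the tf-slim style import path (`tf_slim`), which is an assumption for illustration. With `check_numerics=True` (the default), the returned tensor raises "LossTensor is inf or nan" as soon as the loss becomes non-finite:

import tensorflow.compat.v1 as tf
import tf_slim as slim  # import path is an assumption

tf.disable_eager_execution()

x = tf.placeholder(tf.float32, shape=[None, 3])
y = tf.placeholder(tf.float32, shape=[None, 1])
pred = tf.layers.dense(x, 1)
loss = tf.losses.mean_squared_error(y, pred)

optimizer = tf.train.GradientDescentOptimizer(0.1)
train_op = slim.learning.create_train_op(loss, optimizer, check_numerics=True)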
Example #58
 def testExecuteStringAttr(self):
   three = tensor.Tensor(3.0)
   checked_three = array_ops.check_numerics(three,
                                            message='just checking')
   self.assertEqual([[3]], checked_three.numpy())
Example #59
def _CheckNumericsGrad(op, grad):
    """Gradient for check_numerics op."""
    return array_ops.check_numerics(
        grad,
        "Not a number (NaN) or infinity (Inf) values detected in gradient. %s"
        % op.get_attr("message"))
Example #60
def create_train_op(
    total_loss,
    optimizer,
    global_step=None,
    update_ops=None,
    variables_to_train=None,
    clip_gradient_norm=0,
    summarize_gradients=False,
    gate_gradients=tf_optimizer.Optimizer.GATE_OP,
    aggregation_method=None,
    colocate_gradients_with_ops=False,
    gradient_multipliers=None):
  """Creates an `Operation` that evaluates the gradients and returns the loss.

  Args:
    total_loss: A `Tensor` representing the total loss.
    optimizer: A tf.Optimizer to use for computing the gradients.
    global_step: A `Tensor` representing the global step variable. If left as
      `None`, then slim.variables.global_step() is used.
    update_ops: an optional list of updates to execute. Note that the update_ops
      that are used are the union of those update_ops passed to the function and
      the value of slim.ops.GetUpdateOps(). Therefore, if `update_ops` is None,
      then the value of slim.ops.GetUpdateOps() is still used.
    variables_to_train: an optional list of variables to train. If None, it will
      default to all tf.trainable_variables().
    clip_gradient_norm: If greater than 0 then the gradients would be clipped
      by it.
    summarize_gradients: Whether or not add summaries for each gradient.
    gate_gradients: How to gate the computation of gradients. See tf.Optimizer.
    aggregation_method: Specifies the method used to combine gradient terms.
      Valid values are defined in the class `AggregationMethod`.
    colocate_gradients_with_ops: Whether or not to try colocating the gradients
      with the ops that generated them.
    gradient_multipliers: A dictionary of either `Variables` or `Variable` op
      names to the coefficient by which the associated gradient should be
      scaled.
  Returns:
    A `Tensor` that when evaluated, computes the gradients and returns the total
      loss value.
  """
  if global_step is None:
    global_step = variables.get_or_create_global_step()

  # Update ops use GraphKeys.UPDATE_OPS collection if update_ops is None.
  global_update_ops = set(ops.get_collection(ops.GraphKeys.UPDATE_OPS))
  if update_ops is None:
    update_ops = global_update_ops
  else:
    update_ops = set(update_ops)
  if not global_update_ops.issubset(update_ops):
    logging.warning('update_ops in create_train_op does not contain all the '
                    'update_ops in GraphKeys.UPDATE_OPS')

  # Make sure update_ops are computed before total_loss.
  if update_ops:
    with ops.control_dependencies(update_ops):
      barrier = control_flow_ops.no_op(name='update_barrier')
    total_loss = control_flow_ops.with_dependencies([barrier], total_loss)

  if variables_to_train is None:
    # Default to tf.trainable_variables()
    variables_to_train = tf_variables.trainable_variables()
  else:
    # Make sure that variables_to_train are in tf.trainable_variables()
    for v in variables_to_train:
      assert v in tf_variables.trainable_variables()

  assert variables_to_train

  # Create the gradients. Note that apply_gradients adds the gradient
  # computation to the current graph.
  grads = optimizer.compute_gradients(
      total_loss, variables_to_train, gate_gradients=gate_gradients,
      aggregation_method=aggregation_method,
      colocate_gradients_with_ops=colocate_gradients_with_ops)

  # Scale gradients.
  if gradient_multipliers:
    grads = multiply_gradients(grads, gradient_multipliers)

  # Clip gradients.
  if clip_gradient_norm > 0:
    grads = clip_gradient_norms(grads, clip_gradient_norm)

  # Summarize gradients.
  if summarize_gradients:
    add_gradients_summaries(grads)

  # Create gradient updates.
  grad_updates = optimizer.apply_gradients(grads, global_step=global_step)

  # Make sure total_loss is valid.
  total_loss = array_ops.check_numerics(total_loss, 'LossTensor is inf or nan')

  # Ensure the train_tensor computes grad_updates.
  return control_flow_ops.with_dependencies([grad_updates], total_loss)