    def testCatchFunctionOpInfFloat64(self):
        """Test catching infinites generated in a FuncGraph."""

        check_numerics_callback.enable_check_numerics()

        @def_function.function
        def divide_sum_with_diff(x, y):
            w1 = x + y
            w2 = x - y
            u = w1 / w2
            return u * 2.0

        x = constant_op.constant(2.0, dtype=dtypes.float64)
        y = constant_op.constant(2.0, dtype=dtypes.float64)
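        # With x == y, w2 = x - y is 0, so u = w1 / w2 divides by zero and
        # produces the Inf that the enabled callback should catch.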
        message = self._assertRaisesInvalidArgumentErrorAndGetMessage(
            lambda: self.evaluate(divide_sum_with_diff(x, y)))

        # Check the content of the error message.
        self.assertTrue(re.search(r"graph op.*\"RealDiv\"", message))
        self.assertTrue(re.search(r"dtype.*float64", message))
        self.assertIn("shape: ()\n", message)
        self.assertIn("Input tensors (2):", message)
        # Check that the correct input ops are printed.
        self.assertTrue(re.search(r"0:.*Tensor.*add:0", message))
        self.assertTrue(re.search(r"1:.*Tensor.*sub:0", message))
        # Check that the correct line for op creation is printed.
        self.assertTrue(re.search(r"Stack trace of op's creation", message))
        self.assertIn("u = w1 / w2", message)
    def testKerasModelHealthyPredictAndFitCalls(self):
        """Test a simple healthy keras model runs fine under the callback."""
        check_numerics_callback.enable_check_numerics()

        model = models.Sequential()
        model.add(
            layers.Dense(units=100,
                         input_shape=(5, ),
                         use_bias=False,
                         activation="relu",
                         kernel_initializer="ones"))
        model.add(layers.BatchNormalization())
        model.add(layers.Dropout(0.5))
        model.add(
            layers.Dense(units=1,
                         activation="linear",
                         kernel_initializer="ones"))

        model.compile(loss="mse",
                      optimizer=optimizer_v2.gradient_descent.SGD(1e-3))

        batch_size = 16
        xs = np.zeros([batch_size, 5])
        ys = np.ones([batch_size, 1])

        outputs = model.predict(xs)
        self.assertEqual(outputs.shape, (batch_size, 1))

        epochs = 100
        history = model.fit(xs, ys, epochs=epochs, verbose=0)
        self.assertEqual(len(history.history["loss"]), epochs)
    def testControlFlowGraphWithNaNBFloat16(self):
        """Test catching bfloat16 NaNs in a control-flow-v2 FuncGraph."""
        check_numerics_callback.enable_check_numerics()

        @def_function.function
        def my_conditional(x):
            if math_ops.less(math_ops.reduce_sum(x), 0.0):
                return math_ops.log(x)
            else:
                return math_ops.log(-x)

        x = constant_op.constant([1.0, 2.0, 3.0], dtype=dtypes.bfloat16)
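        # reduce_sum(x) == 6.0, so the else branch runs and log(-x) takes the
        # log of negative values, yielding NaNs.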
        message = self._assertRaisesInvalidArgumentErrorAndGetMessage(
            lambda: self.evaluate(my_conditional(x)))
        # Check the content of the error message.
        self.assertTrue(re.search(r"graph op.*\"Log\"", message))
        self.assertTrue(re.search(r"dtype.*bfloat16", message))
        self.assertIn("shape: (3,)\n", message)
        # Check that the correct input op is printed.
        self.assertTrue(re.search(r"Input tensor.*Tensor.*Neg", message))
        # Check that the correct line for op creation is printed.
        self.assertTrue(re.search(r"Stack trace of op's creation", message))
        self.assertIn("return math_ops.log(-x)", message)
        if context.executing_eagerly():
            # The code path for raising the error is slightly different under graph mode.
            self.assertTrue(message.endswith("\n"))
    def testCustomGradientWithNaNWithTfFunction(self):
        """Test that the callback catches NaN in a gradient function during backprop."""
        check_numerics_callback.enable_check_numerics()

        @custom_gradient.custom_gradient
        def func_with_bad_grad(x):
            output = math_ops.sin(x)

            @def_function.function
            def grad(dy):
                # `dy` will come in as 1.0. Taking log of -1.0 leads to NaN.
                return math_ops.log(-dy)

            return output, grad

        x = constant_op.constant(-2.0, dtype=dtypes.float16)
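        # compute_gradient() backpropagates dy == 1.0 through grad(), where
        # log(-1.0) evaluates to NaN inside the tf.function.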

        def f(x):
            return func_with_bad_grad(x)

        message = self._assertRaisesInvalidArgumentErrorAndGetMessage(
            lambda: gradient_checker_v2.compute_gradient(f, [x]))

        # Check the content of the error message.
        self.assertTrue(re.search(r"graph op.*\"Log\"", message))
        self.assertTrue(re.search(r"dtype.*float16", message))
        if context.executing_eagerly():
            self.assertIn("shape: ()\n", message)
        self.assertTrue(re.search(r"Input tensor.*Tensor.*Neg:0", message))
        self.assertIn("-> |   return math_ops.log(-dy)", message)
    def testCheckingInfinityInMiniModelOnOneOrTwoDevices(
            self, distribution, inside_scope):
        if not inside_scope:
            check_numerics_callback.enable_check_numerics()
        with distribution.scope():
            if inside_scope:
                check_numerics_callback.enable_check_numerics()

            mini_model = MiniModel(generate_infinity=True)
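            # With generate_infinity=True, MiniModel's forward pass divides by
            # zeros_like(y), producing the Inf that the callback should report.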

            def train_step():
                with backprop.GradientTape() as tape:
                    loss = mini_model(array_ops.ones([1, 10]))
                    return tape.gradient(loss, mini_model.weights)

            caught_error = None
            try:
                distribution.experimental_run_v2(train_step)
            except errors.InvalidArgumentError as error:
                caught_error = error
            self.assertTrue(caught_error)
            self.assertTrue(
                re.search(r"Detected Infinity or NaN.*\"RealDiv\"",
                          caught_error.message))
            self.assertIn(
                "-> |   y = math_ops.divide(y, array_ops.zeros_like(y))",
                caught_error.message)
    def testNestedFunctionGradientCall(self):
        """Catching inf in the inner nested tf.function during backprop."""
        check_numerics_callback.enable_check_numerics()

        x = constant_op.constant(1.0 - 1e-8, dtype=dtypes.float32)

        @def_function.function
        def asinp1(x):
            # asin()'s gradient, 1 / sqrt(1 - x**2), overflows at values close to 1.0.
            return math_ops.asin(x) + 1.0

        @def_function.function
        def loss(x):
            return math_ops.square(asinp1(x))

        with backprop.GradientTape() as tape:
            tape.watch(x)
            y = loss(x)
            message = self._assertRaisesInvalidArgumentErrorAndGetMessage(
                lambda: self.evaluate(tape.gradient(y, x)))
            # Check the content of the error message.
            # Assume the op Reciprocal or Xdivy is used in the gradient function for
            # asin().
            self.assertTrue((re.search(r"graph op.*\"Reciprocal\"", message)
                             or re.search(r"graph op.*\"Xdivy\"", message)))
            self.assertTrue(re.search(r"dtype.*float32", message))
 def test_zero_grad_tape(self):
     try:
         check_numerics_callback.enable_check_numerics()
         x = constant_op.constant([-1, 0., 1.])
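         # The gradient of x**2 is 2 * x, which is exactly 0 at x == 0; the
         # callback should not flag benign zeros in the gradient.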
         with backprop.GradientTape() as tape:
             tape.watch(x)
             g = tape.gradient(math_ops.pow(x, 2), x)
         g = self.evaluate(g)
         self.assertAllClose([-2., 0., 2.], g)
     finally:
         check_numerics_callback.disable_check_numerics()
    def testMobileNetV2Fit(self):
        """Test training Keras MobileNetV2 application works w/ check numerics."""
        check_numerics_callback.enable_check_numerics()
        model = mobilenet_v2.MobileNetV2(alpha=0.1, weights=None)

        xs = np.zeros([2] + list(model.input_shape[1:]))
        ys = np.zeros([2] + list(model.output_shape[1:]))
        model.compile(optimizer="sgd", loss="categorical_crossentropy")
        epochs = 1
        history = model.fit(xs, ys, epochs=epochs, verbose=0)
        self.assertEqual(len(history.history["loss"]), epochs)
 def testEagerModeUsesCorrectPathLengthAndStackHeightLimits(self):
     check_numerics_callback.enable_check_numerics(stack_height_limit=123,
                                                   path_length_limit=1200)
     fake_get_check_numerics_error_message = test.mock.MagicMock(
         return_value="dummy_message")
     with test.mock.patch.object(check_numerics_callback,
                                 "get_check_numerics_error_message",
                                 fake_get_check_numerics_error_message):
         x = constant_op.constant(2.0)
         y = constant_op.constant(0.0)
         self._assertRaisesInvalidArgumentErrorAndGetMessage(
             lambda: x / y)  # Expected to generate an inf.
         (_, call_kwargs) = fake_get_check_numerics_error_message.call_args
         self.assertEqual(call_kwargs["stack_height_limit"], 123)
         self.assertEqual(call_kwargs["path_length_limit"], 1200)
 def testNanInConstIsCaptured(self):
   check_numerics_callback.enable_check_numerics()
   v = variables.Variable(3.0, dtype=dtypes.float32)
   @def_function.function
   def add_a_bad_constant(x):
     c = constant_op.constant(np.nan)
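      # The NaN is baked into a Const node of the FuncGraph; the callback
      # should flag it when the function runs.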
     return x + c
   if not context.executing_eagerly():
     self.evaluate(v.initializer)
   message = self._assertRaisesInvalidArgumentErrorAndGetMessage(
       lambda: self.evaluate(add_a_bad_constant(v)))
   self.assertTrue(re.search(r"graph op.*\"Const\"", message))
   self.assertTrue(re.search(r"dtype:.*float32", message))
   self.assertTrue(re.search(r"shape:.*\(\)", message))
   self.assertTrue(re.search(r"Graph name:.*add_a_bad_constant", message))
  def testDatasetMapHealthyResults(self):
    check_numerics_callback.enable_check_numerics()

    tensor = constant_op.constant(
        [0.5, 1.0, 1.5, 2.0, 2.5, 3.0, 3.5, 4.0, 4.5, 5.0])

    def map_fn(x):
      return math_ops.log(math_ops.square(x) + 1)

    dataset = dataset_ops.Dataset.from_tensor_slices(tensor).batch(2).map(
        map_fn)
    iterator = dataset_ops.make_one_shot_iterator(dataset)
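    # map_fn computes log(x**2 + 1), e.g. log(1.25) for x == 0.5 and log(2.0)
    # for x == 1.0, so every batch is finite and no error should be raised.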

    self.assertAllClose(self.evaluate(iterator.get_next()), np.log([1.25, 2]))
    self.assertAllClose(self.evaluate(iterator.get_next()), np.log([3.25, 5]))
  def testCatchEagerOpFloat32Inf(self):
    """Test catching Infinity in eager op execution: float32."""
    check_numerics_callback.enable_check_numerics()

    x = constant_op.constant([2.0, 3.0])
    y = constant_op.constant([1.0, 0.0])
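    # The second element computes 3.0 / 0.0, which yields +Inf eagerly.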
    message = self._assertRaisesInvalidArgumentErrorAndGetMessage(
        lambda: x / y)

    # Check the content of the error message.
    self.assertTrue(re.search(r"eagerly-executing op.*\"RealDiv\"", message))
    self.assertTrue(re.search(r"dtype.*float32", message))
    self.assertIn("shape: (2,)\n", message)
    self.assertIn("# of +Inf elements: 1\n", message)
    self.assertIn("0: %s" % x, message)
    self.assertIn("1: %s" % y, message)
  def testCatchEagerOpFloat16NaN(self):
    """Test catching Infinity in eager op execution: float16."""
    check_numerics_callback.enable_check_numerics()
    def log1p(x):
      y = 1.0 + x
      return math_ops.log(y)
    x = constant_op.constant([[-1.0]], dtype=dtypes.float16)
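    # y = 1.0 + (-1.0) == 0.0, and log(0.0) evaluates to -Inf.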
    message = self._assertRaisesInvalidArgumentErrorAndGetMessage(
        lambda: log1p(x))

    # Check the content of the error message.
    self.assertTrue(re.search(r"eagerly-executing op.*\"Log\"", message))
    self.assertTrue(re.search(r"dtype.*float16", message))
    self.assertIn("shape: (1, 1)\n", message)
    self.assertIn("# of -Inf elements: 1\n", message)
    self.assertTrue(re.search(r"Input tensor.*0\.", message))
    def testKerasModelWithRNNHealthyPredictAndFitCalls(self):
        """Test a simple healthy keras recurrent model works under the callback."""
        check_numerics_callback.enable_check_numerics()

        model = models.Sequential()
        model.add(layers.LSTM(1, input_shape=(2, 4)))
        model.compile(loss="mse", optimizer="rmsprop")

        xs = np.zeros([8, 2, 4], dtype=np.float32)
        ys = np.zeros([8, 1], dtype=np.float32)

        model.predict(xs)

        epochs = 3
        history = model.fit(xs, ys, epochs=epochs, verbose=0)
        self.assertEqual(len(history.history["loss"]), epochs)
    def testKerasModelUnhealthyPredictAndFitCallsWithLargeLearningRate(self):
        """Test keras model training crashes with Infinity is caught by callback."""
        check_numerics_callback.enable_check_numerics()

        model = models.Sequential()
        # Use weight initializers for deterministic behavior during test.
        model.add(
            layers.Dense(units=100,
                         input_shape=(5, ),
                         activation="relu",
                         kernel_initializer="ones"))
        model.add(
            layers.Dense(units=1,
                         activation="linear",
                         kernel_initializer="ones"))

        lr = 1e3  # Intentionally huge learning rate.
        model.compile(loss="mse",
                      optimizer=optimizer_v2.gradient_descent.SGD(lr))
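        # With this learning rate, training quickly diverges and produces an
        # Inf that the callback is expected to report during fit().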

        batch_size = 16
        xs = np.zeros([batch_size, 5])
        ys = np.ones([batch_size, 1])

        outputs = model.predict(xs)
        self.assertEqual(outputs.shape, (batch_size, 1))

        epochs = 100
        message = self._assertRaisesInvalidArgumentErrorAndGetMessage(
            lambda: model.fit(xs, ys, epochs=epochs, verbose=0))

        # Check the content of the error message.
        # Don't hardcode the op name, to keep the test future-proof.
        self.assertTrue(re.search(r"graph op.*\".*\"", message))
        self.assertTrue(re.search(r"dtype:.*float32", message))
        self.assertTrue(re.search(r"shape:.*\(.*\)", message))
        # Check that the correct input op is printed.
        self.assertTrue(re.search(r"Input tensor.*", message))
        # Check that the correct line for op creation is printed.
        self.assertTrue(re.search(r"Stack trace of op's creation", message))
        # The stacks are different between when eager execution is enabled and
        # when it's not (i.e., v1 graph). TODO(cais): Investigate if we can improve
        # this.
        if context.executing_eagerly():
            self.assertIn("lambda: model.fit(xs, ys,", message)
        else:
            self.assertIn("model.compile(", message)
  def testEnableCheckNumericsIsIdempotent(self):
    """Two calls to enable_check_numerics() have same effect as one call."""
    check_numerics_callback.enable_check_numerics()
    check_numerics_callback.enable_check_numerics()

    x = constant_op.constant([2.0, 3.0])
    y = constant_op.constant([1.0, 0.0])
    message = self._assertRaisesInvalidArgumentErrorAndGetMessage(
        lambda: x / y)

    # Check the content of the error message.
    self.assertTrue(re.search(r"eagerly-executing op.*\"RealDiv\"", message))
    self.assertTrue(re.search(r"dtype.*float32", message))
    self.assertIn("shape: (2,)\n", message)
    self.assertIn("# of +Inf elements: 1\n", message)
    self.assertIn("0: %s" % x, message)
    self.assertIn("1: %s" % y, message)
    def testInfInCustomKerasLayerWithoutTfFunctionPredictCall(self):
        """Test catching Infinity in a custom layer, w/o tf.function."""
        check_numerics_callback.enable_check_numerics()

        class DivByXLayer(layers.Layer):

            # Not using the tf.function decorator here.
            def call(self, x):
                """The computation performed by the for-test custom layer.

        Generates Infinity by intention.

        Args:
          x: Input tensor of scalar shape.

        Returns:
          A scalar tensor.
        """
                one_over_x = 1.0 / x
                return one_over_x

        model = models.Sequential()
        model.add(DivByXLayer(input_shape=[5]))

        # TODO(b/140245224): Currently the model must be compiled prior to
        # predict() being called. Otherwise Keras will fall back to V1
        # behavior. Remove this after the bug is fixed.
        model.compile(loss="mse", optimizer="sgd")

        xs = np.ones([1, 5])
        # Calling the model with non-zero inputs should be fine.
        self.assertAllClose(model.predict(xs), [[1.0, 1.0, 1.0, 1.0, 1.0]])

        xs = np.zeros([1, 5])
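        # All-zero inputs make the custom layer compute 1.0 / 0.0 -> Inf.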
        message = self._assertRaisesInvalidArgumentErrorAndGetMessage(
            lambda: model.predict(xs))

        # Check the content of the error message.
        self.assertTrue(re.search(r"graph op.*\"RealDiv\"", message))
        self.assertTrue(re.search(r"dtype.*float32", message))
        self.assertTrue(re.search(r"shape: \(.*, 5\)", message))
        # Check that the correct input op is printed.
        self.assertIn("Input tensors (2):", message)
        # Check that the correct line for op creation is printed.
        self.assertTrue(re.search(r"Stack trace of op's creation", message))
        self.assertIn("one_over_x = 1.0 / x", message)
  def testExpectedNaNOpOutputs(self):
    """Test calling operations with benign NaN output."""
    check_numerics_callback.enable_check_numerics()

    # Empty input tensor
    x = constant_op.constant(1, dtype=dtypes.float32, shape=[0, 1, 1, 1])
    scale = constant_op.constant([1], dtype=dtypes.float32)
    offset = constant_op.constant([1], dtype=dtypes.float32)

    # Calling fused_batch_norm with an empty input should output NaN in the
    # latter four outputs without triggering the check_numerics callback.
    batch_norm_res = gen_nn_ops._fused_batch_norm(
        x=x, scale=scale, offset=offset, mean=[], variance=[])

    _, batch_mean, batch_variance, _, _ = self.evaluate(batch_norm_res)

    self.assertTrue(np.isnan(batch_mean.squeeze()))
    self.assertTrue(np.isnan(batch_variance.squeeze()))
    def testMobileNetV2Fit(self):
        """Test training Keras MobileNetV2 application works w/ check numerics."""

        if test_lib.is_built_with_rocm():
            # This test passes with MIOpen Find Mode (which is the default).
            # The bug is tracked as MIOpen issue #2379; re-enable this test
            # once the fix for that issue is available in a ROCm release.
            self.skipTest("MIOpen bug results in test failure")

        check_numerics_callback.enable_check_numerics()
        model = mobilenet_v2.MobileNetV2(alpha=0.1, weights=None)

        xs = np.zeros([2] + list(model.input_shape[1:]))
        ys = np.zeros([2] + list(model.output_shape[1:]))
        model.compile(optimizer="sgd", loss="categorical_crossentropy")
        epochs = 1
        history = model.fit(xs, ys, epochs=epochs, verbose=0)
        self.assertEqual(len(history.history["loss"]), epochs)
    def testGraphModeUsesCorrectPathLengthAndStackHeightLimits(self):
        check_numerics_callback.enable_check_numerics(stack_height_limit=123,
                                                      path_length_limit=1200)

        @def_function.function
        def add_fn(x, y):
            return x + y

        fake_get_check_numerics_error_message = test.mock.MagicMock(
            return_value="dummy_message")
        with test.mock.patch.object(check_numerics_callback,
                                    "get_check_numerics_error_message",
                                    fake_get_check_numerics_error_message):
            x = constant_op.constant(2.0)
            y = constant_op.constant(3.0)
            self.assertAllClose(self.evaluate(add_fn(x, y)), 5.0)
            (_, call_kwargs) = fake_get_check_numerics_error_message.call_args
            self.assertEqual(call_kwargs["stack_height_limit"], 123)
            self.assertEqual(call_kwargs["path_length_limit"], 1200)
    def testDatasetMapHealthyResults(self):
        check_numerics_callback.enable_check_numerics()

        tensor = constant_op.constant(
            [0.5, 1.0, 1.5, 2.0, 2.5, 3.0, 3.5, 4.0, 4.5, 5.0])

        def map_fn(x):
            return math_ops.log(math_ops.square(x) + 1)

        dataset = dataset_ops.Dataset.from_tensor_slices(tensor).batch(2).map(
            map_fn)

        @def_function.function
        def get_batches():
            iterator = iter(dataset)
            return [next(iterator), next(iterator)]

        batches = self.evaluate(get_batches())
        self.assertLen(batches, 2)
        self.assertAllClose(batches[0], np.log([1.25, 2]))
        self.assertAllClose(batches[1], np.log([3.25, 5]))
    def testCatchInfinityInDatasetMapFunction(self):
        """Test that callback catches NaN in a tf.dataset map function."""
        check_numerics_callback.enable_check_numerics()

        def generate_nan(x):
            """Intetionally generates NaNs by taking log of negative number."""
            casted_x = math_ops.cast(x, dtypes.float32)
            return math_ops.log([[-1.0, 1.0], [3.0, 5.0]]) + casted_x

        dataset = dataset_ops.Dataset.range(10).map(generate_nan)
        iterator = dataset_ops.make_one_shot_iterator(dataset)
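        # Each mapped element contains log(-1.0) == NaN at position [0, 0], so
        # the first get_next() call should trigger the callback.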

        message = self._assertRaisesInvalidArgumentErrorAndGetMessage(
            lambda: self.evaluate(iterator.get_next()))

        # Check the content of the error message.
        self.assertTrue(re.search(r"graph op.*\"Log\"", message))
        self.assertTrue(re.search(r"dtype.*float32", message))
        self.assertIn("shape: (2, 2)\n", message)
        self.assertTrue(re.search(r"Input tensor.*Tensor.*Log/x:0", message))
        self.assertIn(
            "-> |   return math_ops.log([[-1.0, 1.0], [3.0, 5.0]]) + casted_x",
            message)
    def testOverflowInTfFunction(self):
        """Test catching Infinity caused by overflow in a tf.function with while."""
        check_numerics_callback.enable_check_numerics()

        @def_function.function
        def accumulation_function(counter, lim, accum):
            while math_ops.less(counter, lim):
                accum.assign(accum * 2.0)
                counter.assign_add(1)

        counter = variables.Variable(0, dtype=dtypes.int32)
        # Repeated `* 2.0` overflows a float32 tensor in 128 steps. So the
        # 1000-step limit is sufficient.
        lim = constant_op.constant(1000, dtype=dtypes.int32)
        accum = variables.Variable(1.0)
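        # float32 max is ~3.4e38, so repeated doubling of 1.0 overflows to
        # +Inf on the 128th multiplication (2.0**128 exceeds float32 max).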

        if not context.executing_eagerly():
            self.evaluate([counter.initializer, accum.initializer])

        message = self._assertRaisesInvalidArgumentErrorAndGetMessage(
            lambda: self.evaluate(accumulation_function(counter, lim, accum)))

        self.assertAllClose(self.evaluate(counter), 128)
        # Check the content of the error message.
        # The overflow to +Infinity happens during the `* 2.0` operation.
        self.assertTrue(re.search(r"graph op.*\"Mul\"", message))
        self.assertTrue(re.search(r"dtype.*float32", message))
        self.assertIn("shape: ()\n", message)
        # Check that the correct input op is printed.
        self.assertIn("Input tensors (2):", message)
        # Check that the correct input ops are printed.
        self.assertTrue(re.search(r"0:.*Tensor.*ReadVariableOp:0", message))
        self.assertTrue(re.search(r"1:.*Tensor.*mul/y:0", message))
        # Check that the correct line for op creation is printed.
        self.assertTrue(re.search(r"Stack trace of op's creation", message))
        self.assertIn("accum.assign(accum * 2.0)", message)
 def testNoCatchEagerOpExecution(self):
     """Test running multiple steps of eager execution without Inf/NaN."""
     check_numerics_callback.enable_check_numerics()
     x = constant_op.constant([2.0, 3.0])
     y = constant_op.constant([1.0, 0.0])
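      # (x + y) * (x - y) == x**2 - y**2 == [3.0, 9.0]; no Inf/NaN arises, so
      # the callback stays silent.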
     self.assertAllClose((x + y) * (x - y), [3.0, 9.0])