Example #1
    def test_advanced_model(self, strategy_fn, use_loss_scaling=False):
        # The advanced model tests mixed-precision-related features that would occur
        # in a resnet50 model. It tests a model that has:
        #  * Multiple layers, some of which use auto-cast variables and some of
        #    which do not.
        #  * Regularization on some variables and not others.
        #  * A fixed loss scale (if use_loss_scaling is True).

        strategy = strategy_fn()
        if use_loss_scaling:
            loss_scale = 8.
        else:
            loss_scale = None
        learning_rate = 2**-14

        with strategy.scope():
            with policy.policy_scope(
                    policy.Policy('mixed_float16', loss_scale=loss_scale)):
                x = layers.Input(shape=(1, ), batch_size=2)
                layer1 = mp_test_util.MultiplyLayer(
                    assert_type=dtypes.float16,
                    regularizer=mp_test_util.IdentityRegularizer(),
                    use_operator=True)
                layer2 = MultiplyLayerWithoutAutoCast(
                    assert_type=dtypes.float16, use_operator=True)
                layer3 = mp_test_util.MultiplyLayer(assert_type=dtypes.float16,
                                                    use_operator=False)
                layer4 = MultiplyLayerWithoutAutoCast(
                    assert_type=dtypes.float16,
                    regularizer=mp_test_util.IdentityRegularizer(),
                    use_operator=False)
                y = layer1(x)
                y = layer2(y)
                y = layer3(y)
                y = layer4(y)
                if use_loss_scaling:
                    # The gradient of 'y' at this point is 1. With loss scaling, the
                    # gradient is 'loss_scale'. We divide by the batch size of 2 since the
                    # loss is averaged across batch elements.
                    expected_gradient = loss_scale / 2
                    identity_with_grad_check_fn = (
                        mp_test_util.create_identity_with_grad_check_fn(
                            expected_dtype=dtypes.float16,
                            expected_gradient=[expected_gradient]))
                    y = core.Lambda(identity_with_grad_check_fn)(y)
                model = models.Model(inputs=x, outputs=y)

                def loss_fn(y_true, y_pred):
                    del y_true
                    return math_ops.reduce_mean(y_pred)

                opt = gradient_descent.SGD(learning_rate)
                model.compile(opt,
                              loss=loss_fn,
                              run_eagerly=testing_utils.should_run_eagerly())

        x = np.ones((2, 1))
        y = np.ones((2, 1))
        dataset = dataset_ops.Dataset.from_tensor_slices((x, y)).batch(2)
        model.fit(dataset)
        for layer in (layer1, layer2, layer3, layer4):
            if layer.losses:
                # Layer has weight regularizer
                self.assertEqual(backend.eval(layer.v), 1 - 2 * learning_rate)
            else:
                # Layer does not have weight regularizer
                self.assertEqual(backend.eval(layer.v), 1 - learning_rate)
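The assertions at the end of this example reduce to a small amount of arithmetic. Below is a minimal sketch in plain Python (no TensorFlow; the helper name is illustrative) of how the expected variable values follow from the learning rate, the regularizer, and the fixed loss scale:

learning_rate = 2 ** -14

def expected_value(has_regularizer, initial=1.0):
    # loss = mean(v1 * v2 * v3 * v4 * x) with x = 1 and every v starting at 1,
    # so d(loss)/d(v_i) = 1. The IdentityRegularizer adds v_i to the loss,
    # contributing an extra gradient of 1 for the regularized layers.
    gradient = 2.0 if has_regularizer else 1.0
    return initial - learning_rate * gradient

assert expected_value(has_regularizer=True) == 1 - 2 * learning_rate   # layer1, layer4
assert expected_value(has_regularizer=False) == 1 - learning_rate      # layer2, layer3

# With use_loss_scaling=True the loss is multiplied by the fixed loss scale of 8
# before backprop, so the gradient reaching the Lambda layer is
# loss_scale / batch_size = 8 / 2 = 4; the optimizer divides the scale back out
# before applying updates, so the expected variable values are unchanged.
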
Example #2
    def test_model(self,
                   strategy_fn,
                   use_operator=False,
                   use_regularizer=False,
                   policy_name='mixed_float16',
                   get_config=False,
                   save_format=None,
                   use_input_spec=False):
        self._skip_if_strategy_unsupported(strategy_fn)
        self._skip_if_save_format_unsupported(save_format)
        regularizer = (mp_test_util.IdentityRegularizer()
                       if use_regularizer else None)
        with strategy_fn().scope():
            # Pass loss_scale=None, as this test will fail if the DynamicLossScale
            # skips applying gradients for a step
            with policy.policy_scope(
                    policy.Policy(policy_name, loss_scale=None)):
                layer = mp_test_util.MultiplyLayer(assert_type=dtypes.float16,
                                                   use_operator=use_operator,
                                                   regularizer=regularizer,
                                                   input_shape=(1, ))
                if use_input_spec:
                    layer.input_spec = input_spec.InputSpec(shape=(2, 1))
                model = testing_utils.get_model_from_layers(
                    [layer], input_shape=(1, ), input_dtype=dtypes.float16)
                if get_config:
                    config = model.get_config()
                    model = model.__class__.from_config(
                        config,
                        custom_objects={
                            'MultiplyLayer': mp_test_util.MultiplyLayer
                        })
                    (layer, ) = (
                        layer for layer in model.layers
                        if isinstance(layer, mp_test_util.MultiplyLayer))

                def loss_fn(y_true, y_pred):
                    del y_true
                    return math_ops.reduce_mean(y_pred)

                # The learning rate is small enough that, if applied to a float16
                # variable, the variable would not change. So this tests that the
                # update is applied to the float32 variable rather than to a
                # float16 value.
                opt = gradient_descent.SGD(2**-14)
                model.compile(opt,
                              loss=loss_fn,
                              run_eagerly=testing_utils.should_run_eagerly())

        x = np.ones((2, 1))
        y = np.ones((2, 1))
        dataset = dataset_ops.Dataset.from_tensor_slices((x, y)).batch(2)
        model.fit(dataset)
        # The variable starts at 1, and the update of 2 ** -14 (the learning rate
        # times a gradient of 1) should be subtracted from it.
        expected = 1 - 2**-14
        if use_regularizer:
            # The regularizer adds 1 to the gradient, so another 2 ** -14 is
            # subtracted.
            expected -= 2**-14
        self.assertEqual(backend.eval(layer.v), expected)

        if save_format:
            with generic_utils.CustomObjectScope({
                    'MultiplyLayer': mp_test_util.MultiplyLayer,
                    'loss_fn': loss_fn,
            }):
                self._test_saving(model, dataset, save_format, use_regularizer)
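The key idea behind the comment about the 2 ** -14 learning rate can be checked without Keras at all. A minimal sketch (plain NumPy, illustrative only): a float16 variable cannot represent an update this small near 1.0, while a float32 variable can, which is what makes the final assertEqual meaningful:

import numpy as np

lr = 2 ** -14
assert np.float16(1.0 - lr) == np.float16(1.0)   # the update vanishes in float16
assert np.float32(1.0 - lr) != np.float32(1.0)   # but survives in float32

# Because mixed_float16 keeps variables in float32, the test expects the
# update to stick:
expected = 1 - lr                       # without the regularizer
expected_with_regularizer = 1 - 2 * lr  # IdentityRegularizer adds 1 to the gradient
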
Example #3
    def _test_mixed_precision(self, task_type, task_id, num_gpus):
        """Tests mixed precision works with the CollectiveAllReduceStrategy.

    This tests:
      1. Variables are in float32, by running with a small enough learning rate
         that if the variables are float16, their values wouldn't change when
         gradients are applied.
      2. The loss scale is doubled if there are no NaNs.
      3. The loss scale is halved if the first worker has a NaN, even if the
         other works do not have NaNs.

    Args:
      task_type: A string, such as "worker", indicating the type of the replica.
      task_id: Zero-indexed ID of the task.
      num_gpus: The number of GPUs to use.
    """
        d, master_target, config = self._get_test_object(
            task_type, task_id, num_gpus)
        # Should be set to mixed_float16 by caller.
        self.assertEqual(policy.global_policy().name, 'mixed_float16')

        with ops.Graph().as_default(), \
             self.cached_session(config=config,
                                 target=master_target) as sess:
            # The loss on the first worker is multiplied by this value. Allows
            # testing the first worker having NaN loss and gradients while keeping the
            # other workers' losses and gradients finite.
            loss_multiplier_for_first_worker = variables.Variable(
                1., dtype='float16', trainable=False)
            with d.scope():
                model = keras.Sequential([
                    mp_test_util.MultiplyLayer(assert_type=dtypes.float16,
                                               input_shape=(1, )),
                ])
                loss_scale = loss_scale_module.DynamicLossScale(
                    2**10, increment_period=1)

                def model_fn():
                    """Simple model to test mixed precision."""
                    x = np.ones((1, 1))
                    loss = model(x, training=True)

                    if ((task_type == 'worker' and task_id == 0)
                            or task_type is task_id is None):
                        loss *= loss_multiplier_for_first_worker
                    # The learning rate is small enough that, if applied to a float16
                    # variable, the variable would not change. So this tests that the
                    # update is applied to the float32 variable rather than to a
                    # float16 value.
                    optimizer = gradient_descent.GradientDescentOptimizer(
                        2**-14)
                    optimizer = loss_scale_optimizer.MixedPrecisionLossScaleOptimizer(
                        optimizer, loss_scale)
                    train_op = optimizer.minimize(
                        loss, training_util.get_or_create_global_step())
                    return train_op

                train_op = d.extended.call_for_each_replica(model_fn)
                train_op = d.group(d.experimental_local_results(train_op))

            sess.run(variables.global_variables_initializer())
            sess.run(train_op)

            (var, ) = model.trainable_weights
            # The variable starts at 1. Each worker contributes an update of 2 ** -14
            # (the learning rate times a gradient of 1), and each worker's update is
            # subtracted from the variable.
            expected = 1 - d.num_replicas_in_sync * 2**-14
            self.assertEqual(sess.run(var), expected)
            # The loss scale should double, as all gradients are finite.
            self.assertEqual(sess.run(loss_scale()), 2**11)

            # Set the first worker to have NaN loss and gradients.
            sess.run(loss_multiplier_for_first_worker.assign(float('NaN')))
            sess.run(train_op)
            # The variable should not change, since the first worker had NaN gradients.
            self.assertEqual(sess.run(var), expected)
            # Loss scale should halve due to NaN
            self.assertEqual(sess.run(loss_scale()), 2**10)
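The doubling and halving checked by the last four assertions can be restated as a tiny state machine. The sketch below is a simplified restatement in plain Python of the dynamic loss-scaling rule this test depends on, not the real DynamicLossScale implementation:

def next_loss_scale(scale, grads_finite, good_steps, increment_period=1):
    # Double after `increment_period` consecutive finite steps; halve and skip
    # the variable update as soon as a non-finite gradient appears.
    if grads_finite:
        good_steps += 1
        if good_steps >= increment_period:
            return scale * 2, 0, True      # apply the update, scale doubles
        return scale, good_steps, True     # apply the update, scale unchanged
    return scale / 2, 0, False             # skip the update, scale halves

scale, steps = 2 ** 10, 0
scale, steps, applied = next_loss_scale(scale, True, steps)
assert (scale, applied) == (2 ** 11, True)    # matches the 2 ** 11 assertion
scale, steps, applied = next_loss_scale(scale, False, steps)
assert (scale, applied) == (2 ** 10, False)   # halved back; the update is skipped
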
Example #4
    def test_config(self, strategy_fn):
        x = constant_op.constant([1.], dtype=dtypes.float16)
        with strategy_fn().scope():
            for layer, dtype in (
                    (mp_test_util.MultiplyLayer(), 'float32'),
                    (mp_test_util.MultiplyLayer(dtype='float64'), 'float64'),
                    (mp_test_util.MultiplyLayer(dtype=policy.Policy('float64')),
                     'float64')):
                config = layer.get_config()
                self.assertEqual(config['dtype'], dtype)
                self.assertIsInstance(config['dtype'], str)
                layer = mp_test_util.MultiplyLayer.from_config(config)
                self.assertEqual(layer.dtype, dtype)
                self.assertEqual(layer(x).dtype, dtype)
                self.assertEqual(layer.v.dtype, dtype)

            layer = mp_test_util.MultiplyLayer(
                dtype=policy.Policy('mixed_float16'))
            config = layer.get_config()
            self.assertEqual(config['dtype'], {
                'class_name': 'Policy',
                'config': {
                    'name': 'mixed_float16'
                }
            })
            layer = mp_test_util.MultiplyLayer.from_config(config)
            self.assertEqual(layer.dtype, 'float32')
            self.assertEqual(layer(x).dtype, 'float16')
            self.assertEqual(layer.v.dtype, 'float32')

            layer = mp_test_util.MultiplyLayer(
                dtype=policy.Policy('mixed_float16', loss_scale=None))
            config = layer.get_config()
            self.assertEqual(
                config['dtype'], {
                    'class_name': 'Policy',
                    'config': {
                        'name': 'mixed_float16',
                        'loss_scale': None
                    }
                })
            layer = mp_test_util.MultiplyLayer.from_config(config)
            self.assertEqual(layer.dtype, 'float32')
            self.assertEqual(layer(x).dtype, 'float16')
            self.assertEqual(layer.v.dtype, 'float32')

            layer = mp_test_util.MultiplyLayer(
                dtype=policy.Policy('float64', loss_scale=2.))
            config = layer.get_config()
            self.assertEqual(
                config['dtype'], {
                    'class_name': 'Policy',
                    'config': {
                        'name': 'float64',
                        'loss_scale': {
                            'class_name': 'FixedLossScale',
                            'config': {
                                'loss_scale_value': 2.0
                            }
                        }
                    }
                })
            layer = mp_test_util.MultiplyLayer.from_config(config)
            self.assertEqual(layer.dtype, 'float64')
            self.assertEqual(layer(x).dtype, 'float64')
            self.assertEqual(layer.v.dtype, 'float64')

            layer = mp_test_util.MultiplyLayer(dtype=policy.Policy('infer'))
            config = layer.get_config()
            self.assertIsNone(config['dtype'])
            layer = mp_test_util.MultiplyLayer.from_config(config)
            # If a layer is serialized with the "infer" policy, when deserialized into
            # TF 2 it will have the global policy instead of "infer". This is because
            # "infer" is serialized into None, and passing dtype=None in TensorFlow 2
            # indicates to use the global policy.
            self.assertEqual(layer.dtype, 'float32')
            self.assertEqual(layer(x).dtype, 'float32')
            self.assertEqual(layer.v.dtype, 'float32')

            layer = mp_test_util.MultiplyLayer(
                dtype=policy.Policy('infer', loss_scale=2.))
            config = layer.get_config()
            self.assertEqual(
                config['dtype'], {
                    'class_name': 'Policy',
                    'config': {
                        'name': 'infer',
                        'loss_scale': {
                            'class_name': 'FixedLossScale',
                            'config': {
                                'loss_scale_value': 2.0
                            }
                        }
                    }
                })
            layer = mp_test_util.MultiplyLayer.from_config(config)
            self.assertEqual(layer.dtype, None)
            self.assertEqual(layer(x).dtype, 'float16')
            self.assertEqual(layer.v.dtype, 'float16')
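For quick reference, the serialized dtype entry exercised above takes one of a few shapes. The dict below is an illustrative summary of those shapes as plain Python data, not an API:

dtype_config_forms = {
    'plain dtype string': 'float64',
    'policy without an explicit loss scale': {
        'class_name': 'Policy',
        'config': {'name': 'mixed_float16'},
    },
    'policy with a fixed loss scale': {
        'class_name': 'Policy',
        'config': {
            'name': 'float64',
            'loss_scale': {
                'class_name': 'FixedLossScale',
                'config': {'loss_scale_value': 2.0},
            },
        },
    },
    # The 'infer' policy serializes to None; in TF 2, deserializing None falls
    # back to the global policy rather than restoring 'infer'.
    "'infer' policy": None,
}
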
Example #5
    def test_dynamic_loss_scaling(self,
                                  strategy_fn,
                                  pass_loss_scale_to_policy=False,
                                  get_config=False):
        strategy = strategy_fn()
        initial_loss_scale = 2.
        batch_size = 4
        loss_scale = loss_scale_module.DynamicLossScale(
            initial_loss_scale=initial_loss_scale, increment_period=2)
        expected_gradient = backend.variable([initial_loss_scale / batch_size],
                                             dtype=dtypes.float16)
        # If this variable is set to True, the model below will have NaN gradients
        have_nan_gradients = backend.variable(False, dtype=dtypes.bool)
        with strategy.scope():
            opt = gradient_descent.SGD(1.)
            if pass_loss_scale_to_policy:
                p = policy.Policy('mixed_float16', loss_scale=loss_scale)
            else:
                p = policy.Policy('mixed_float16', loss_scale=None)
                opt = loss_scale_optimizer.LossScaleOptimizer(opt, loss_scale)
            with policy.policy_scope(p):
                x = layers.Input(shape=(1, ),
                                 batch_size=batch_size,
                                 dtype=dtypes.float16)
                layer = mp_test_util.MultiplyLayer(assert_type=dtypes.float16)
                y = layer(x)
                identity_with_nan_grads = (
                    mp_test_util.create_identity_with_nan_gradients_fn(
                        have_nan_gradients))
                y = core.Lambda(identity_with_nan_grads)(y)
                identity_with_grad_check_fn = (
                    mp_test_util.create_identity_with_grad_check_fn(
                        expected_dtype=dtypes.float16,
                        expected_gradient=expected_gradient))
                y = core.Lambda(identity_with_grad_check_fn)(y)
                model = models.Model(inputs=x, outputs=y)
                if get_config:
                    config = model.get_config()
                    model = model.__class__.from_config(
                        config,
                        custom_objects={
                            'MultiplyLayer': mp_test_util.MultiplyLayer
                        })
                    (layer, ) = (
                        layer for layer in model.layers
                        if isinstance(layer, mp_test_util.MultiplyLayer))

                def loss_fn(y_true, y_pred):
                    del y_true
                    return math_ops.reduce_mean(y_pred)

                model.compile(opt,
                              loss=loss_fn,
                              run_eagerly=testing_utils.should_run_eagerly())

        self.assertEqual(backend.eval(layer.v), 1)
        x = np.ones((batch_size, 1))
        y = np.ones((batch_size, 1))
        dataset = dataset_ops.Dataset.from_tensor_slices(
            (x, y)).batch(batch_size)
        model.fit(dataset)
        # The variable starts at 1 and has a gradient of 1, so it will go down by 1
        # each step.
        self.assertEqual(backend.eval(layer.v), 0)

        model.fit(dataset)
        self.assertEqual(backend.eval(layer.v), -1)

        # There have been two steps without NaNs, so the loss scale will double
        backend.set_value(expected_gradient,
                          backend.get_value(expected_gradient * 2))
        model.fit(dataset)
        self.assertEqual(backend.eval(layer.v), -2)

        # Next test with NaN gradients.
        backend.set_value(have_nan_gradients, True)
        model.fit(dataset)
        # Variable should not be updated
        self.assertEqual(backend.eval(layer.v), -2)

        # Test with finite gradients again
        backend.set_value(have_nan_gradients, False)
        # The loss scale will be halved due to the NaNs, so the gradient will also
        # be halved
        backend.set_value(expected_gradient,
                          backend.get_value(expected_gradient / 2))
        model.fit(dataset)
        self.assertEqual(backend.eval(layer.v), -3)
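The sequence of fits above follows a simple pattern that can be replayed without TensorFlow. A minimal sketch (plain Python, illustrative only) of the bookkeeping behind the assertions: the variable drops by 1 per finite step (gradient 1, SGD learning rate 1), the loss scale doubles after increment_period=2 finite steps and halves on a NaN step, and a NaN step skips the update:

var, scale, good_steps = 1.0, 2.0, 0   # initial_loss_scale = 2, increment_period = 2
history = []
for nan_step in (False, False, False, True, False):   # the five model.fit calls
    if nan_step:
        scale, good_steps = scale / 2, 0       # halve the scale, skip the update
    else:
        var -= 1.0                             # SGD(1.) with a gradient of 1
        good_steps += 1
        if good_steps >= 2:                    # increment_period finite steps in a row
            scale, good_steps = scale * 2, 0
    history.append(var)

assert history == [0.0, -1.0, -2.0, -2.0, -3.0]   # matches the assertEqual calls above
# expected_gradient in the test is always scale / batch_size at the time of the step.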