def from_config(cls, config, custom_objects=None):
    """Builds an instance of `cls` from a serialized optimizer config.

    Two config layouts are handled. A config that contains a 'loss_scale'
    entry is assumed to come from TF 2.3 or below. Any other config is assumed
    to be the TF 2.4+ layout generated by LossScaleOptimizer.get_config, and
    is converted into arguments that LossScaleOptimizerV1.__init__ accepts.

    Args:
      config: Dict holding the serialized optimizer. A shallow copy is taken
        before any entry is replaced or removed.
      custom_objects: Forwarded to `optimizers.deserialize` when rebuilding
        the wrapped optimizer.

    Returns:
      The result of calling `cls` with the converted keyword arguments.

    Raises:
      ValueError: If a 'loss_scale' entry deserializes to a DynamicLossScale
        whose multiplier is not 2.
    """
    kwargs = config.copy()  # Entries are replaced/removed below.
    kwargs['optimizer'] = optimizers.deserialize(
        kwargs['optimizer'], custom_objects=custom_objects)

    if 'loss_scale' in kwargs:
        # Presence of 'loss_scale' marks a config from TF 2.3 or below.
        legacy_scale = keras_loss_scale_module.deserialize(
            kwargs['loss_scale'])
        kwargs['loss_scale'] = legacy_scale
        is_dynamic = isinstance(legacy_scale,
                                loss_scale_module.DynamicLossScale)
        if is_dynamic and legacy_scale.multiplier != 2:
            raise ValueError('Cannot deserialize LossScaleOptimizer with a '
                             'DynamicLossScale whose multiplier is not 2. Got '
                             'DynamicLossScale: %s' % (legacy_scale,))
        return cls(**kwargs)

    # TF 2.4+ layout: build the loss scale object from the flat
    # 'dynamic' / 'initial_scale' / 'dynamic_growth_steps' entries.
    if kwargs['dynamic']:
        converted_scale = loss_scale_module.DynamicLossScale(
            kwargs['initial_scale'], kwargs['dynamic_growth_steps'],
            multiplier=2)
    else:
        converted_scale = loss_scale_module.FixedLossScale(
            kwargs['initial_scale'])
    kwargs['loss_scale'] = converted_scale

    # These keys are not forwarded to the constructor; drop them.
    for consumed_key in ('dynamic', 'initial_scale', 'dynamic_growth_steps'):
        del kwargs[consumed_key]
    return cls(**kwargs)
Example #2
0
 def test_non_persistent_tapes_error(self):
     """Expects a second gradient() call on a non-persistent tape to fail."""
     x = variables.Variable(3.0)
     with lsgt.LossScaleGradientTape(loss_scale_module.FixedLossScale(32),
                                     persistent=False) as g:
         y = x * x
         z = y * y
     # The first gradient() call sits outside the assertion, so it is expected
     # to succeed.
     g.gradient(z, x)
     # assertRaisesRegex is used instead of the deprecated assertRaisesRegexp
     # alias, which was removed from unittest in Python 3.12.
     with self.assertRaisesRegex(RuntimeError, 'persistent'):
         g.gradient(y, x)
Example #3
0
 def test_fixed_scaling_no_change_non_finite_gradient(
         self, non_finite_term, is_non_finite):
     """Checks a fixed loss scale stays at 32 after a non-finite gradient.

     Args:
       non_finite_term: Value multiplied into the watched constant to produce
         the output being differentiated.
       is_non_finite: Predicate applied to the evaluated gradient; the test
         asserts that it returns a truthy value.
     """
     fixed_scale = loss_scale_module.FixedLossScale(32)
     x = constant_op.constant(1.0)
     with lsgt.LossScalingGradientTape(fixed_scale) as tape:
         tape.watch(x)
         y = x * non_finite_term
     grad_value = self.evaluate(tape.gradient(y, x))
     self.assertTrue(is_non_finite(grad_value))
     # The loss scale must still evaluate to its initial value.
     scale_value = self.evaluate(fixed_scale())
     self.assertEqual(scale_value, 32.0)
Example #4
0
 def test_non_persistent_tapes_error(self):
     """Expects a second gradient() call on a non-persistent tape to fail."""
     x = constant_op.constant(3.0)
     fixed_scale = loss_scale_module.FixedLossScale(32)
     with lsgt.LossScaleGradientTape(fixed_scale, persistent=False) as tape:
         tape.watch(x)
         squared = x * x
         fourth_power = squared * squared
     # First use of the tape: expected to succeed.
     tape.gradient(fourth_power, x)
     # Second use of the same tape: expected to raise.
     with self.assertRaisesRegex(RuntimeError, 'persistent'):
         tape.gradient(squared, x)
Example #5
0
def get(identifier):
    """Get a loss scale object.

    Args:
      identifier: One of the following:
        * a dict, which is passed to `deserialize`;
        * an int or float, which is wrapped in a `FixedLossScale`;
        * the string 'dynamic', which yields a `DynamicLossScale` built with
          no arguments;
        * a `LossScale` instance, which is returned unchanged;
        * None, which is returned as None.

    Returns:
      The loss scale object described above, or None if `identifier` is None.

    Raises:
      ValueError: If `identifier` is none of the supported forms.
    """
    if isinstance(identifier, dict):
        return deserialize(identifier)

    if isinstance(identifier, (int, float)):
        return loss_scale_module.FixedLossScale(identifier)
    if identifier == 'dynamic':
        return loss_scale_module.DynamicLossScale()
    if isinstance(identifier, loss_scale_module.LossScale):
        return identifier
    if identifier is None:
        return None
    # Wrap the operand in a 1-tuple: with a bare `% identifier`, a tuple
    # identifier would be unpacked as the format arguments and raise TypeError
    # instead of the intended ValueError.
    raise ValueError('Could not interpret loss scale identifier: %s' %
                     (identifier,))
Example #6
0
    def test_fixed_scaling_no_change_non_finite_gradient(
            self, strategy_fn, non_finite_term):
        """Checks a fixed loss scale stays at 32 after a non-finite gradient.

        Args:
          strategy_fn: Zero-argument callable; its result is handed to
            `self._run_with_strategy` together with the gradient function.
          non_finite_term: Value multiplied into the watched constant. When it
            equals `np.inf` the gradients are checked with `np.isposinf`,
            otherwise with `np.isnan`.
        """
        fixed_scale = loss_scale_module.FixedLossScale(32)

        def run_fn():
            x = constant_op.constant(1.0)
            with lsgt.LossScaleGradientTape(fixed_scale) as tape:
                tape.watch(x)
                y = x * non_finite_term
            return tape.gradient(y, x)

        grads = self._run_with_strategy(run_fn, strategy_fn())
        if non_finite_term == np.inf:
            is_expected_value = np.isposinf
        else:
            is_expected_value = np.isnan
        for grad in grads:
            self.assertTrue(is_expected_value(self.evaluate(grad)))
        # The loss scale must still evaluate to its initial value.
        self.assertEqual(self.evaluate(fixed_scale()), 32.0)
Example #7
0
    def test_jacobian_raises_error(self):
        """Expects jacobian() and batch_jacobian() to raise NotImplementedError."""
        loss_scale = loss_scale_module.FixedLossScale(2.)

        # assertRaisesRegex is used instead of the deprecated
        # assertRaisesRegexp alias, which was removed from unittest in
        # Python 3.12.
        x = variables.Variable([1.0, 2.0])
        with lsgt.LossScaleGradientTape(loss_scale) as g:
            y = x * 2
        with self.assertRaisesRegex(
                NotImplementedError,
                'LossScaleGradientTape.jacobian is not yet implemented'):
            g.jacobian(y, x)

        # Same check for batch_jacobian, using a fresh tape and a 2x2 variable.
        x = variables.Variable([[1.0, 2.0], [3.0, 4.0]])
        with lsgt.LossScaleGradientTape(loss_scale) as g:
            y = x * 2
        with self.assertRaisesRegex(
                NotImplementedError,
                'LossScaleGradientTape.batch_jacobian is not yet implemented'):
            g.batch_jacobian(y, x)
Example #8
0
  def test_basic(self):
    """Checks FixedLossScale.update for a zero gradient and a NaN gradient.

    In both cases the test expects `should_apply` to be the Python bool True
    and the loss scale to still evaluate to its initial value.
    """
    expected_scale = 1000
    fixed_scale = loss_scale_module.FixedLossScale(expected_scale)

    # The same sequence of checks runs first for 0. and then for NaN.
    for grad_value in (0., float('NaN')):
      update_op, should_apply = fixed_scale.update(
          [constant_op.constant(grad_value)])
      self.evaluate(update_op)
      # should_apply should be a bool instead of a tensor, so that a tf.cond
      # does not have to be built in the graph by the caller.
      self.assertIsInstance(should_apply, bool)
      self.assertTrue(should_apply)
      self.assertEqual(expected_scale, self.evaluate(fixed_scale()))
Example #9
0
    def testPassingV1LossScale(self, strategy_fn):
        """Checks LossScaleOptimizerV1 with tf_loss_scale_module loss scales.

        A FixedLossScale and then a DynamicLossScale are each passed to
        LossScaleOptimizerV1 around an SGD optimizer. For each, the exposed
        loss scale is checked and one step is run, after which the variable is
        expected to have gone from 5 to 3.

        Args:
          strategy_fn: Zero-argument callable returning the strategy whose
            scope the test runs in.
        """
        strategy = strategy_fn()
        learning_rate = 2.
        with strategy.scope():
            # Test FixedLossScale
            var = variables.Variable([5.0])
            opt = gradient_descent.SGD(learning_rate)
            loss_scale = tf_loss_scale_module.FixedLossScale(2.)
            opt = loss_scale_optimizer.LossScaleOptimizerV1(opt, loss_scale)
            # The wrapper is expected to expose the loss scale as a tensor.
            self.assertIsInstance(opt.loss_scale, ops.Tensor)
            self.evaluate(variables.global_variables_initializer())
            self.assertEqual(self.evaluate(opt.loss_scale), 2)
            # NOTE(review): the last argument is presumably the expected
            # per-replica gradient; _run_fn_with_grad_check is not visible
            # here -- confirm.
            run_fn = self._run_fn_with_grad_check(
                strategy, var, opt, 2 / strategy.num_replicas_in_sync)
            run_op = strategy.experimental_run(run_fn)
            # Initializers are run again after run_op has been built.
            self.evaluate(variables.global_variables_initializer())
            self._run_if_in_graph_mode(run_op)
            # The loss is the identity of the variable. Therefore the gradient is 1,
            # and so the variable will be init_val - grad * lr == 5 - 1 * 2 == 3
            self.assertAllClose([3.], self.evaluate(var))

            # Test DynamicLossScale
            var = variables.Variable([5.0])
            opt = gradient_descent.SGD(learning_rate)
            loss_scale = tf_loss_scale_module.DynamicLossScale(
                initial_loss_scale=4, increment_period=1, multiplier=2)
            # Change the current scale so it differs from the initial scale of
            # 4; the assertions below expect the wrapper to start from 4.
            loss_scale._current_loss_scale.assign(2)
            opt = loss_scale_optimizer.LossScaleOptimizerV1(opt, loss_scale)
            self.assertEqual(opt.initial_scale, 4)
            self.assertEqual(opt.dynamic_growth_steps, 1)
            self.evaluate(variables.global_variables_initializer())
            # Current loss scale is not copied so loss scale is reinitialized to 4
            self.assertEqual(self.evaluate(opt.loss_scale), 4)
            # Every local result of the dynamic counter is expected to be 0.
            for s in strategy.experimental_local_results(opt.dynamic_counter):
                self.assertEqual(self.evaluate(s), 0)

            run_fn = self._run_fn_with_grad_check(
                strategy, var, opt, 4 / strategy.num_replicas_in_sync)
            run_op = strategy.experimental_run(run_fn)
            self.evaluate(variables.global_variables_initializer())
            self._run_if_in_graph_mode(run_op)
            # Same expected result as the fixed case: 5 - 1 * 2 == 3.
            self.assertAllClose([3.], self.evaluate(var))
    def testHyperParametersExposed(self):
        """Checks hyperparameter access through a LossScaleOptimizer.

        Reads and writes of `beta_1`, `lr` and `learning_rate` on the wrapper
        are expected to be reflected on the wrapped Adam optimizer and vice
        versa. Assigning `loss_scale` on the wrapper is expected to raise
        AttributeError, even when the wrapped optimizer has a hyperparameter
        of that name.
        """
        with self.cached_session():
            inner = adam.Adam(learning_rate=1.0, beta_1=0.5, beta_2=0.9)
            wrapper = loss_scale_optimizer.LossScaleOptimizer(inner, 'dynamic')
            # Force hyperparameters to be created
            inner.lr  # pylint: disable=pointless-statement
            self.evaluate(variables.global_variables_initializer())

            # Initial values are readable through the wrapper.
            self.assertEqual(self.evaluate(wrapper.beta_1), 0.5)
            self.assertIsInstance(wrapper.beta_1, variables.Variable)
            self.assertEqual(self.evaluate(wrapper.lr), 1.0)
            self.assertIs(wrapper.lr, inner.lr)
            self.assertIs(wrapper.lr, wrapper.learning_rate)

            # Setting beta_1 on the wrapper is visible on both objects.
            wrapper.beta_1 = 0.25
            for holder in (wrapper, inner):
                self.assertEqual(self.evaluate(holder.beta_1), 0.25)
            self.assertIs(wrapper.beta_1, inner.beta_1)

            # Setting beta_1 on the wrapped optimizer is visible on both too.
            inner.beta_1 = 0.75
            for holder in (wrapper, inner):
                self.assertEqual(self.evaluate(holder.beta_1), 0.75)
            self.assertIs(wrapper.beta_1, inner.beta_1)

            # Setting lr on the wrapper updates lr and learning_rate on both.
            wrapper.lr = 2.0
            for holder in (wrapper, inner):
                self.assertEqual(self.evaluate(holder.lr), 2.0)
                self.assertEqual(self.evaluate(holder.learning_rate), 2.0)
            self.assertIs(wrapper.lr, inner.lr)

            # Test setting attribute that is both attribute on LossScaleOptimizer and
            # hyperparameter on wrapped optimizer.
            class MyOpt(gradient_descent.SGD):
                def __init__(self):
                    super().__init__()
                    self._set_hyper('loss_scale', 123.)

            custom_inner = MyOpt()
            custom_wrapper = loss_scale_optimizer.LossScaleOptimizer(
                custom_inner, 'dynamic')
            with self.assertRaises(AttributeError):
                custom_wrapper.loss_scale = loss_scale_module.FixedLossScale(2.)
Example #11
0
    def _benchmark(self, gradient_type, num_gpus, mode, loss_scaling):
        """Benchmarks loss scaling.

        We run a simple model with several scalar variables. The loss is the
        sum of all variables. The model is simple because we want to measure
        only the performance of loss scaling, not the performance of the model
        itself.

        The result is passed to `self.report_benchmark` under the name
        '<gradient_type>_<num_gpus>_GPU_<mode>_<loss_scaling>', with the wall
        time averaged over the timed iterations.

        Args:
          gradient_type: "optimizer" or "gradient_tape". How gradients are
            computed. "optimizer" uses Optimizer.minimize. "gradient_tape"
            uses GradientTape.gradient along with
            LossScaleOptimizer.get_scaled_loss and
            LossScaleOptimizer.get_unscaled_gradients.
          num_gpus: The number of GPUs to use. Must be at least 1.
          mode: "eager" or "tf_function". "tf_function" causes all
            computations to be wrapped in a tf.function, while "eager" runs
            computations eagerly.
          loss_scaling: "fixed", "dynamic", or None. The type of loss scaling
            to use. None means use no loss scaling, which is useful as a
            baseline to see how much slower loss scaling is in comparison.
        """
        # None is reported as 'no_loss_scaling' in the benchmark name.
        ls_str = loss_scaling or 'no_loss_scaling'
        name = '%s_%d_GPU_%s_%s' % (gradient_type, num_gpus, mode, ls_str)
        with context.eager_mode(), _get_strategy(num_gpus).scope() as strategy:
            opt = adam.Adam()
            if loss_scaling == 'fixed':
                loss_scale = loss_scale_module.FixedLossScale(2.)
            elif loss_scaling == 'dynamic':
                # Make increment_period so high that it's effectively infinite. This
                # means the loss scale will never change. Any performance overhead
                # from increasing/decreasing the loss scale is typically negligible
                # since it happens infrequently, so we only benchmark the common case
                # of the loss scale not changing.
                increment_period = 1000000
                loss_scale = loss_scale_module.DynamicLossScale(
                    initial_loss_scale=2., increment_period=increment_period)
            else:
                assert loss_scaling is None
                loss_scale = None
            # Only wrap the optimizer when a loss scale object was created.
            if loss_scale:
                opt = loss_scale_optimizer.LossScaleOptimizer(opt, loss_scale)

            num_vars = 200
            num_warmup_iters = 1
            num_iters = 20
            # By using scalar variables, we reduce overhead of the actual GPU work of
            # multiplying variables, dividing gradients, and checking gradients for
            # NaNs. Measuring these overheads isn't very useful as there is little we
            # can do to reduce them (one such way would be to fuse dividing gradients
            # and checking them for NaNs). We still have all other overheads, such as
            # all-reducing the `is_finite` values and having a tf.cond or
            # tf.while_loop based on whether gradients are NaNs. Currently, these
            # other overheads are much more significant than the GPU work.
            var_list = [
                variables.Variable(i, dtype='float32') for i in range(num_vars)
            ]

            def get_loss():
                return math_ops.add_n(var_list)

            # Pick the single-step function according to gradient_type. The
            # gradient-tape variant additionally depends on whether loss
            # scaling is enabled.
            if gradient_type == 'gradient_tape':
                if loss_scale is None:

                    def minimize_fn():
                        with backprop.GradientTape() as tape:
                            loss = get_loss()
                        grads = tape.gradient(loss, var_list)
                        return opt.apply_gradients(zip(grads, var_list))
                else:

                    def minimize_fn():
                        # Scale the loss inside the tape, then unscale the
                        # resulting gradients before applying them.
                        with backprop.GradientTape() as tape:
                            loss = get_loss()
                            scaled_loss = opt.get_scaled_loss(loss)
                        scaled_grads = tape.gradient(scaled_loss, var_list)
                        grads = opt.get_unscaled_gradients(scaled_grads)
                        return opt.apply_gradients(zip(grads, var_list))
            else:
                assert gradient_type == 'optimizer'

                def minimize_fn():
                    return opt.minimize(get_loss, var_list)

            def run_fn():
                strategy.run(minimize_fn)

            if mode == 'tf_function':
                run_fn = def_function.function(run_fn)

            # Warm-up iterations run before the timer starts, so they are not
            # included in the reported wall time.
            for _ in range(num_warmup_iters):
                run_fn()

            start = time.time()
            for _ in range(num_iters):
                run_fn()
            end = time.time()
            # Report the mean wall time per timed iteration.
            self.report_benchmark(iters=num_iters,
                                  wall_time=(end - start) / num_iters,
                                  name=name)
Example #12
0
 def test_repr(self):
   """Checks that repr() of FixedLossScale(123) is 'FixedLossScale(123.0)'."""
   actual_repr = repr(loss_scale_module.FixedLossScale(123))
   self.assertEqual(actual_repr, 'FixedLossScale(123.0)')
Example #13
0
 def test_call_type(self):
   """Checks that calling a FixedLossScale returns an ops.Tensor."""
   scale_value = loss_scale_module.FixedLossScale(123)()
   self.assertIsInstance(scale_value, ops.Tensor)