Ejemplo n.º 1
0
    def test_batchnorm_correctness(self, distribution):
        """BatchNormalization should learn to whiten its input under a distribution strategy."""
        with self.cached_session():
            bn_layer = keras.layers.BatchNormalization(input_shape=(10,),
                                                       momentum=0.8)
            model = keras.models.Sequential([bn_layer])
            model.compile(
                loss='mse',
                optimizer=gradient_descent.GradientDescentOptimizer(0.01),
                distribute=distribution)

            # Inputs drawn from N(5, 10^2); after fitting, the layer should
            # map them to roughly zero mean / unit variance.
            samples = np.random.normal(loc=5.0, scale=10.0,
                                       size=(1000, 10)).astype('float32')
            dataset = dataset_ops.Dataset.from_tensor_slices((samples, samples))
            dataset = batch_wrapper(dataset.repeat(100), 32, distribution)

            model.fit(dataset, epochs=4, verbose=0, steps_per_epoch=10)
            predictions = model.predict(dataset, steps=2)
            # Undo the learned affine part to expose the normalized activations.
            predictions -= keras.backend.eval(bn_layer.beta)
            predictions /= keras.backend.eval(bn_layer.gamma)
            np.testing.assert_allclose(predictions.mean(), 0.0, atol=1e-1)
            np.testing.assert_allclose(predictions.std(), 1.0, atol=1e-1)
Ejemplo n.º 2
0
  def test_specify_initial_state_non_keras_tensor(self):
    """An LSTM layer should accept raw backend variables as initial_state."""
    state_count = 2
    timesteps = 3
    embedding_dim = 4
    units = 3
    num_samples = 2

    # Build the initial state from plain backend variables instead of
    # symbolic Keras Input tensors.
    inputs = keras.Input((timesteps, embedding_dim))
    initial_state = [
        keras.backend.random_normal_variable((num_samples, units), 0, 1)
        for _ in range(state_count)
    ]
    output = rnn.LSTM(units)(inputs, initial_state=initial_state)

    model = keras.models.Model(inputs, output)
    model.compile(
        loss='categorical_crossentropy',
        optimizer=gradient_descent.GradientDescentOptimizer(0.01))

    # One gradient step just to prove the graph is trainable end-to-end.
    batch_inputs = np.random.random((num_samples, timesteps, embedding_dim))
    batch_targets = np.random.random((num_samples, units))
    model.train_on_batch(batch_inputs, batch_targets)
Ejemplo n.º 3
0
def multi_inputs_multi_outputs_model():
  """Build and compile a functional model with three inputs and two outputs.

  The string input is parsed to numbers, multiplied elementwise with the
  shared dense projection of `input_a`, concatenated with the shared dense
  projection of `input_b`, and fed to two independent softmax heads.
  """
  input_a = keras.layers.Input(shape=(16,), name='input_a')
  input_b = keras.layers.Input(shape=(16,), name='input_b')
  input_m = keras.layers.Input(shape=(8,), dtype='string', name='input_m')
  shared_dense = keras.layers.Dense(8, name='dense_1')

  interm_a = shared_dense(input_a)
  # Parse the string input into numeric values.
  interm_m = keras.layers.Lambda(gen_parsing_ops.string_to_number)(input_m)
  interm_s = keras.layers.Lambda(lambda k: k[0] * k[1])([interm_m, interm_a])
  interm_b = shared_dense(input_b)

  merged = keras.layers.concatenate([interm_s, interm_b], name='merge')
  output_c = keras.layers.Dense(3, activation='softmax', name='dense_2')(merged)
  output_d = keras.layers.Dense(2, activation='softmax', name='dense_3')(merged)

  model = keras.models.Model(
      inputs=[input_a, input_b, input_m], outputs=[output_c, output_d])
  model.compile(
      loss='categorical_crossentropy',
      optimizer=gradient_descent.GradientDescentOptimizer(0.001),
      metrics={head: 'categorical_accuracy'
               for head in ('dense_2', 'dense_3')})
  return model
Ejemplo n.º 4
0
  def test_calling_with_unsupported_predefined_callbacks(self, distribution):
    """LR-mutating callbacks should be rejected for a V1 optimizer under a strategy.

    LearningRateScheduler and ReduceLROnPlateau both modify the optimizer's
    learning rate; the error message indicates a Keras Optimizer V2 is
    required for that when distributing, so fit() must raise ValueError.
    """
    with self.cached_session():
      model = get_model()

      optimizer = gradient_descent.GradientDescentOptimizer(0.001)
      loss = 'mse'
      metrics = ['mae']
      model.compile(optimizer, loss, metrics=metrics, distribute=distribution)

      dataset = get_dataset(distribution)

      def schedule(_):
        # Constant schedule; only the callback type matters for this test.
        return 0.001

      # NOTE: assertRaisesRegexp is a deprecated alias (removed in
      # Python 3.12); use assertRaisesRegex.
      with self.assertRaisesRegex(ValueError,
                                  'You must specify a Keras Optimizer V2 when '
                                  'using'):
        model.fit(dataset, epochs=1, steps_per_epoch=2, verbose=0,
                  callbacks=[keras.callbacks.LearningRateScheduler(schedule)])

      with self.assertRaisesRegex(ValueError,
                                  'You must specify a Keras Optimizer V2 when '
                                  'using'):
        model.fit(dataset, epochs=1, steps_per_epoch=2, verbose=0,
                  callbacks=[keras.callbacks.ReduceLROnPlateau()])
  def test_multi_paths_2(self):
    """Test graph with multiple paths.

    Builds two parallel matmul+activation paths from the same input, sums
    them, and checks the auto-mixed-precision rewrite converted the matmul
    and relu nodes to fp16 while preserving numerical results.
    """
    if test.is_gpu_available(cuda_only=True):
      random_seed.set_random_seed(0)
      x = _input([8, 8])
      y1 = _matmul_act(x)
      y2 = _matmul_act(x)
      y = y1 + y2 + x
      optimizer = gradient_descent.GradientDescentOptimizer(learning_rate=0.01)
      g = optimizer.compute_gradients(y, [x])
      output = (g, y)

      output_val_ref, output_val, cost_graph = self._run(output)
      node_map = _build_node_map(cost_graph.node)

      self._assert_output_fp16(node_map, 'MatMul')
      self._assert_output_fp16(node_map, 'Relu')
      self._assert_output_fp16(node_map, 'MatMul_1')
      self._assert_output_fp16(node_map, 'Relu_1')
      # Bump up the tolerance for the ROCm platform
      # The default tolerance (1e-3) results in a tiny fraction (<1%) of
      # miscompares on ROCm platform, and hence the tolerance bump
      # BUG FIX: is_built_with_rocm is a function; referencing it without
      # calling it is always truthy, which silently loosened the tolerance
      # on every platform. Call it.
      tol = 2e-3 if test.is_built_with_rocm() else 1e-3
      self.assertAllClose(output_val_ref, output_val, atol=tol, rtol=tol)
  def test_recurrent_lstm(self, mode):
    """Test graph with recurrent lstm."""
    self._maybe_skip(mode)
    with ops.device(_get_device(mode)):
      random_seed.set_random_seed(0)
      init_c = _input([8, 4])
      init_h = _input([8, 4])
      _, _, h, _ = _recurrent_lstm(init_c, init_h)
      optimizer = gradient_descent.GradientDescentOptimizer(learning_rate=0.01)
      g = optimizer.compute_gradients(h, [init_c, init_h])
      output = (h, g)

    output_val_ref, output_val, cost_graph = self._run(mode, output)
    node_map = _build_node_map(cost_graph.node)

    # Every op inside the while loop should have been converted to f16.
    for node_name in ('while/concat', 'while/MatMul', 'while/split',
                      'while/Sigmoid', 'while/Sigmoid_1', 'while/Sigmoid_2',
                      'while/Tanh', 'while/Tanh_1'):
      self._assert_output_f16(mode, node_map, node_name)
    self.assertAllClose(output_val_ref, output_val, atol=1e-3, rtol=1e-3)
Ejemplo n.º 7
0
    def testTrainWithNoneAsInitWhenUsingVarsRaisesError(self):
        """train() must fail when variables exist but init_op is None."""
        logdir = os.path.join(tempfile.mkdtemp(prefix=self.get_temp_dir()),
                              'tmp_logs')
        with ops.Graph().as_default():
            random_seed.set_random_seed(0)
            inputs = constant_op.constant(self._inputs, dtype=dtypes.float32)
            labels = constant_op.constant(self._labels, dtype=dtypes.float32)

            predictions = LogisticClassifier(inputs)
            loss_ops.log_loss(predictions, labels)
            total_loss = loss_ops.get_total_loss()

            sgd = gradient_descent.GradientDescentOptimizer(learning_rate=1.0)
            train_op = learning.create_train_op(total_loss, sgd)

            # Passing init_op=None while the graph holds uninitialized
            # variables should surface a RuntimeError.
            with self.assertRaises(RuntimeError):
                learning.train(train_op,
                               logdir,
                               init_op=None,
                               number_of_steps=300)
Ejemplo n.º 8
0
  def setUp(self):
    """Test setup.

    Structure of the forward graph:
              f
             | |
        -----   -----
        |           |
        d           e
       | |         | |
    ---   ---------  ---
    |         |        |
    a         b        c

    Construct a backward graph using the GradientDescentOptimizer.
    """

    self.a = variables.Variable(1.0, name="a")
    self.b = variables.Variable(2.0, name="b")
    self.c = variables.Variable(4.0, name="c")
    self.d = math_ops.multiply(self.a, self.b, name="d")
    self.e = math_ops.multiply(self.b, self.c, name="e")
    self.f = math_ops.multiply(self.d, self.e, name="f")

    # Gradient descent optimizer that minimizes f.
    gradient_descent.GradientDescentOptimizer(0.01).minimize(
        self.f, name="optim")

    # Disable pruning / arithmetic / constant-folding rewrites so the graph
    # the session runs matches the graph built above.
    rewrites = rewriter_config_pb2.RewriterConfig(
        disable_model_pruning=True,
        arithmetic_optimization=rewriter_config_pb2.RewriterConfig.OFF,
        constant_folding=rewriter_config_pb2.RewriterConfig.OFF)
    session_config = config_pb2.ConfigProto(
        graph_options=config_pb2.GraphOptions(rewrite_options=rewrites))
    self.sess = session.Session(config=session_config)
    self.sess.run(variables.global_variables_initializer())
Ejemplo n.º 9
0
    def testTrainWithNoneAsLogdirWhenUsingSaverRaisesError(self):
        """Supplying a saver without a logdir must raise a ValueError."""
        with ops.Graph().as_default():
            random_seed.set_random_seed(0)
            inputs = constant_op.constant(self._inputs, dtype=dtypes.float32)
            labels = constant_op.constant(self._labels, dtype=dtypes.float32)

            predictions = LogisticClassifier(inputs)
            loss_ops.log_loss(predictions, labels)
            total_loss = loss_ops.get_total_loss()

            sgd = gradient_descent.GradientDescentOptimizer(learning_rate=1.0)
            train_op = learning.create_train_op(total_loss, sgd)
            saver = saver_lib.Saver()

            # A saver needs somewhere to write checkpoints, so logdir=None
            # is invalid input.
            with self.assertRaises(ValueError):
                learning.train(train_op,
                               None,
                               init_op=None,
                               number_of_steps=300,
                               saver=saver)
    def testTrainWithTrace(self):
        """Training with trace_every_n_steps should write trace JSON files."""
        logdir = os.path.join(tempfile.mkdtemp(prefix=self.get_temp_dir()),
                              'tmp_logs')
        with ops.Graph().as_default():
            random_seed.set_random_seed(0)
            tf_inputs = tf.constant(self._inputs, dtype=tf.float32)
            tf_labels = tf.constant(self._labels, dtype=tf.float32)

            tf_predictions = LogisticClassifier(tf_inputs)
            loss_ops.log_loss(tf_labels, tf_predictions)
            total_loss = loss_ops.get_total_loss()
            summary.scalar('total_loss', total_loss)

            sgd = gradient_descent.GradientDescentOptimizer(learning_rate=1.0)
            train_op = learning.create_train_op(total_loss, sgd)

            loss = learning.train(train_op,
                                  logdir,
                                  number_of_steps=300,
                                  log_every_n_steps=10,
                                  trace_every_n_steps=100)
        self.assertIsNotNone(loss)
        for trace_step in (1, 101, 201):
            # Note: with resource variables the traces are created at
            # 0/100/200; with legacy variables they are created at 1/101/201,
            # so accept either filename.
            candidate_names = ('tf_trace-%d.json' % (trace_step - 1),
                               'tf_trace-%d.json' % trace_step)
            trace_paths = [os.path.join(logdir, name)
                           for name in candidate_names]
            self.assertTrue(any(os.path.isfile(path) for path in trace_paths),
                            trace_paths)
    def testEmptyUpdateOps(self):
        """update_ops=[] must keep the batch-norm moving statistics frozen."""
        with ops.Graph().as_default():
            random_seed.set_random_seed(0)
            tf_inputs = tf.constant(self._inputs, dtype=tf.float32)
            tf_labels = tf.constant(self._labels, dtype=tf.float32)

            tf_predictions = BatchNormClassifier(tf_inputs)
            loss_ops.log_loss(tf_labels, tf_predictions)
            total_loss = loss_ops.get_total_loss()
            sgd = gradient_descent.GradientDescentOptimizer(learning_rate=1.0)

            # An explicitly empty update_ops list suppresses the batch-norm
            # moving-average update ops.
            train_op = learning.create_train_op(total_loss,
                                                sgd,
                                                update_ops=[])

            moving_mean = variables_lib2.get_variables_by_name(
                'moving_mean')[0]
            moving_variance = variables_lib2.get_variables_by_name(
                'moving_variance')[0]

            with tf.Session() as sess:
                # Initialize all variables
                sess.run(variables_lib.global_variables_initializer())
                # After initialization moving_mean == 0 and moving_variance == 1.
                mean, variance = sess.run([moving_mean, moving_variance])
                self.assertAllClose(mean, [0] * 4)
                self.assertAllClose(variance, [1] * 4)

                for _ in range(10):
                    sess.run([train_op])
                # Since we skipped update_ops the moving stats must be
                # unchanged after training.
                self.assertAllClose(moving_mean.eval(), [0] * 4)
                self.assertAllClose(moving_variance.eval(), [1] * 4)
Ejemplo n.º 12
0
  def test_specify_state_with_masking(self):
    """LSTM with explicit initial_state should train alongside a Masking layer."""
    state_count = 2
    timesteps = 3
    embedding_dim = 4
    units = 3
    num_samples = 2

    inputs = keras.Input((timesteps, embedding_dim))
    _ = keras.layers.Masking()(inputs)
    # Initial states provided as extra symbolic model inputs.
    initial_state = [keras.Input((units,)) for _ in range(state_count)]
    output = rnn.LSTM(units)(inputs, initial_state=initial_state)

    model = keras.models.Model([inputs] + initial_state, output)
    model.compile(
        loss='categorical_crossentropy',
        optimizer=gradient_descent.GradientDescentOptimizer(0.01))

    # One gradient step proves the wiring is trainable end-to-end.
    inputs = np.random.random((num_samples, timesteps, embedding_dim))
    initial_state = [
        np.random.random((num_samples, units)) for _ in range(state_count)
    ]
    targets = np.random.random((num_samples, units))
    model.train_on_batch([inputs] + initial_state, targets)
    def test_loop_with_vars_intertwined(self):
        """Test graph with intertwined while loops."""
        if test.is_gpu_available(cuda_only=True):
            random_seed.set_random_seed(0)
            x = _input([8, 8])
            _, _, k, l = _loop_vars_intertwined(
                array_ops.ones(array_ops.shape(x)), x, _matmul_act,
                _matmul_act)
            optimizer = gradient_descent.GradientDescentOptimizer(
                learning_rate=0.01)
            g = optimizer.compute_gradients(k, [x])
            output = (k, l, g)

            output_val_ref, output_val, cost_graph = self._run(output)
            node_map = _build_node_map(cost_graph.node)

            # All matmul/relu nodes inside the loops should be fp16.
            for node_name in ('while/MatMul', 'while/Relu',
                              'while/MatMul_1', 'while/Relu_1'):
                self._assert_output_fp16(node_map, node_name)
            self.assertAllClose(output_val_ref,
                                output_val,
                                atol=1e-3,
                                rtol=1e-3)
Ejemplo n.º 14
0
    def test_calling_model_with_numpy_arrays(self, distribution):
        """fit/evaluate/predict should accept numpy arrays under a strategy."""
        with self.cached_session():
            model = get_model()

            optimizer = gradient_descent.GradientDescentOptimizer(0.001)
            model.compile(optimizer,
                          'mse',
                          metrics=['mae'],
                          distribute=distribution)

            inputs = np.zeros((64, 3), dtype=np.float32)
            targets = np.zeros((64, 4), dtype=np.float32)

            # Call fit with validation data
            model.fit(inputs,
                      targets,
                      epochs=1,
                      batch_size=2,
                      verbose=0,
                      validation_data=(inputs, targets))

            # TODO(anjalisridhar): We need tests for when the batch size and steps are
            # smaller and results in a 0 batch_size and steps value.
            # Exercise evaluate with defaults, explicit steps, and batch_size.
            model.evaluate(inputs, targets)
            model.evaluate(inputs, targets, steps=2)
            model.evaluate(inputs, targets, batch_size=8)

            # Same three variants for predict.
            model.predict(inputs)
            model.predict(inputs, steps=2)
            model.predict(inputs, batch_size=8)
Ejemplo n.º 15
0
 def testSparseBasic(self):
     """One SGD step on IndexedSlices grads updates only the touched rows."""
     for dtype in [dtypes.half, dtypes.float32, dtypes.float64]:
         with self.test_session():
             var0 = variables.Variable([[1.0], [2.0]], dtype=dtype)
             var1 = variables.Variable([[3.0], [4.0]], dtype=dtype)
             # Sparse gradients: touch row 0 of var0 and row 1 of var1.
             grads0 = ops.IndexedSlices(
                 constant_op.constant([0.1], shape=[1, 1], dtype=dtype),
                 constant_op.constant([0]), constant_op.constant([2, 1]))
             grads1 = ops.IndexedSlices(
                 constant_op.constant([0.01], shape=[1, 1], dtype=dtype),
                 constant_op.constant([1]), constant_op.constant([2, 1]))
             sgd_op = gradient_descent.GradientDescentOptimizer(
                 3.0).apply_gradients(zip([grads0, grads1], [var0, var1]))
             variables.global_variables_initializer().run()
             # Params still hold their initial values before the step.
             self.assertAllCloseAccordingToType([[1.0], [2.0]], var0.eval())
             self.assertAllCloseAccordingToType([[3.0], [4.0]], var1.eval())
             # Run 1 step of sgd.
             sgd_op.run()
             # Only the indexed rows moved, each by lr * grad.
             self.assertAllCloseAccordingToType([[1.0 - 3.0 * 0.1], [2.0]],
                                                var0.eval())
             self.assertAllCloseAccordingToType([[3.0], [4.0 - 3.0 * 0.01]],
                                                var1.eval())
Ejemplo n.º 16
0
def get_sync_optimizer():
    """Return a SyncReplicasOptimizer wrapping plain SGD (lr=1.0, 1 replica)."""
    base_optimizer = gradient_descent.GradientDescentOptimizer(
        learning_rate=1.0)
    return sync_replicas_optimizer.SyncReplicasOptimizer(
        base_optimizer, replicas_to_aggregate=1)
Ejemplo n.º 17
0
  def _train_phase(self, logdir, seed, num_steps, var_names=None):
    """Run one training phase in a fresh graph and return the final loss.

    Args:
      logdir: directory used for checkpointing; reused across phases so each
        phase resumes from the previous one's checkpoint.
      seed: graph-level random seed for reproducibility.
      num_steps: global step at which StopAtStepHook halts training.
      var_names: optional list of variable names to optimize; None trains all
        trainable variables.

    Returns:
      The final loss reported by training.train.
    """
    with ops.Graph().as_default():
      random_seed.set_random_seed(seed)
      total_loss = self.ModelLoss()
      optimizer = gradient_descent.GradientDescentOptimizer(learning_rate=1.0)

      # Name lookup must happen inside the graph context, after ModelLoss()
      # has created the variables.
      if var_names:
        variables_to_train = []
        for name in var_names:
          variables_to_train.extend(variables_lib.get_variables_by_name(name))
      else:
        variables_to_train = None

      train_op = training.create_train_op(
          total_loss, optimizer, variables_to_train=variables_to_train)

      saver = saver_lib.Saver()
      return training.train(
          train_op,
          logdir,
          hooks=[
              basic_session_run_hooks.CheckpointSaverHook(
                  logdir, save_steps=1, saver=saver),
              basic_session_run_hooks.StopAtStepHook(num_steps=num_steps),
          ])

  def testTrainAllVarsHasLowerLossThanTrainSubsetOfVars(self):
    """Training all variables should reach lower loss than training subsets.

    The three phases were near-identical copies; they are factored into
    _train_phase, which preserves the original per-phase seeds, step budgets
    and variable subsets.
    """
    logdir = os.path.join(self.get_temp_dir(), 'tmp_logs3/')
    if gfile.Exists(logdir):  # For running on jenkins.
      gfile.DeleteRecursively(logdir)

    # First, train only the weights of the model.
    loss = self._train_phase(logdir, seed=0, num_steps=200,
                             var_names=['weights'])
    self.assertGreater(loss, .015)
    self.assertLess(loss, .05)

    # Next, train the biases of the model.
    loss = self._train_phase(logdir, seed=1, num_steps=300,
                             var_names=['biases'])
    self.assertGreater(loss, .015)
    self.assertLess(loss, .05)

    # Finally, train both weights and bias to get lower loss.
    loss = self._train_phase(logdir, seed=2, num_steps=400)
    self.assertIsNotNone(loss)
    self.assertLess(loss, .015)
Ejemplo n.º 18
0
    def _helpTestRun(self, use_resource=False, use_partitioned_vars=False):
        """Exercises MovingAverageOptimizer and its swapping_saver end to end.

        Runs two SGD steps over the cross product of sequential/parallel EMA
        updates, three dtypes and (optionally) variable partitions, checking
        variable and EMA values after each step, that swapping-saver
        save/restore round-trips are identities, that a subset swapping saver
        persists the paired EMA variable, and that a plain Saver restores the
        averaged (EMA) values.

        Args:
            use_resource: if True, create resource variables.
            use_partitioned_vars: if True, use shape-[4] variables split into
                two shape-[2] partitions and run the checks per partition.
        """
        # Partitioned variables are represented as a "collection" of partitions.
        # To simplify the test and reuse as much code as possible we employ
        # following test strategy for partitioned variables.
        #
        # In the case of non-partitioned variables test runs on variables with
        # shape [2].
        #
        # In the case of partitioned variables we use shape [4] with two partitions,
        # thus each partition has shape [2].
        # For partitioned variables the test is run twice (for loop over
        # variable_part_names), first time on the first partition of each variable,
        # second time on the second partition of each variable.
        variable_part_names = ['part_0', 'part_1'
                               ] if use_partitioned_vars else ['']
        for sequential_update in [True, False]:
            for dtype in [dtypes.half, dtypes.float32, dtypes.float64]:
                for var_part_name in variable_part_names:
                    with self.session(graph=ops.Graph()) as sess:
                        orig_val0 = [1.0, 2.0]
                        orig_val1 = [3.0, 4.0]
                        grads0 = [0.1, 0.1]
                        grads1 = [0.01, 0.01]
                        if use_partitioned_vars:
                            # Use partitioned variables.
                            # Create partitioned and duplicate each value used as initial
                            # value of variables.
                            partitioner = partitioned_variables.fixed_size_partitioner(
                                num_shards=2)
                            orig_val0 = orig_val0 * 2
                            orig_val1 = orig_val1 * 2
                            grads0 = grads0 * 2
                            grads1 = grads1 * 2
                        else:
                            # Regular (non-partitioned) variables.
                            partitioner = None
                        var0 = variable_scope.get_variable(
                            'var0',
                            initializer=constant_op.constant(orig_val0,
                                                             dtype=dtype),
                            use_resource=use_resource,
                            partitioner=partitioner)
                        var1 = variable_scope.get_variable(
                            'var1',
                            initializer=constant_op.constant(orig_val1,
                                                             dtype=dtype),
                            use_resource=use_resource,
                            partitioner=partitioner)
                        # Make a fake loss, such that gradient(loss, var0) == grads0
                        # and gradient(loss, var1) == grads1
                        grads0 = constant_op.constant(grads0, dtype=dtype)
                        grads1 = constant_op.constant(grads1, dtype=dtype)
                        loss = (math_ops.reduce_sum(grads0 * var0) +
                                math_ops.reduce_sum(grads1 * var1))

                        # SGD with lr=2.0 wrapped so EMAs decay at 0.5.
                        opt = moving_average_optimizer.MovingAverageOptimizer(
                            gradient_descent.GradientDescentOptimizer(
                                learning_rate=2.0),
                            average_decay=0.5,
                            sequential_update=sequential_update)
                        save_dir = tempfile.mkdtemp(
                            prefix=os.path.join(self.get_temp_dir(), 'run_1'))
                        save_path = os.path.join(save_dir, 'model')

                        update = opt.minimize(loss)

                        # Get variables and their EMAs. In case of partitioned variables
                        # get proper part of each variable.
                        def _get_variable(var_name, part_name, ema):
                            """Returns variable of it's moving average by name."""
                            matches = [
                                v for v in variables.global_variables()
                                if ((var_name in v.op.name) and (
                                    part_name in v.op.name) and (
                                        ('ExponentialMovingAverage' in
                                         v.op.name) == ema))
                            ]
                            self.assertEqual(len(matches), 1)
                            return matches[0]

                        var0 = _get_variable('var0', var_part_name, ema=False)
                        var1 = _get_variable('var1', var_part_name, ema=False)
                        ema_var0 = _get_variable('var0',
                                                 var_part_name,
                                                 ema=True)
                        ema_var1 = _get_variable('var1',
                                                 var_part_name,
                                                 ema=True)

                        # Op that knocks every variable and EMA off its
                        # trained value; used to verify what restore brings back.
                        perturb = control_flow_ops.group([
                            state_ops.assign_add(var0, [1.0, 1.0]),
                            state_ops.assign_add(var1, [2.0, 2.0]),
                            state_ops.assign_add(ema_var0, [3.0, 3.0]),
                            state_ops.assign_add(ema_var1, [4.0, 4.0])
                        ])

                        # Test that saver with missing ema variables will fail.
                        with self.assertRaisesRegexp(ValueError,
                                                     r'Variable to swap'):
                            opt.swapping_saver(var_list=[var0])

                        train_saver = opt.swapping_saver()
                        train_saver_subset = opt.swapping_saver(
                            var_list=[var0, ema_var0])
                        inference_saver = saver.Saver()
                        variables.global_variables_initializer().run()
                        # Step 1.
                        update.run()
                        self.assertAllCloseAccordingToType([0.8, 1.8],
                                                           var0.eval())
                        self.assertAllCloseAccordingToType([2.98, 3.98],
                                                           var1.eval())
                        if sequential_update:
                            # EMA values are only deterministic here when the
                            # EMA update runs after the SGD step.
                            self.assertAllCloseAccordingToType([0.9, 1.9],
                                                               ema_var0.eval())
                            self.assertAllCloseAccordingToType([2.99, 3.99],
                                                               ema_var1.eval())
                        # Test that the swapping saver save/restore operation is identity.
                        train_saver.save(sess, save_path)
                        train_saver.restore(sess, save_path)
                        self.assertAllCloseAccordingToType([0.8, 1.8],
                                                           var0.eval())
                        self.assertAllCloseAccordingToType([2.98, 3.98],
                                                           var1.eval())
                        if sequential_update:
                            self.assertAllCloseAccordingToType([0.9, 1.9],
                                                               ema_var0.eval())
                            self.assertAllCloseAccordingToType([2.99, 3.99],
                                                               ema_var1.eval())
                        # Test that the subset saver saves the EMA variable as well.
                        if sequential_update:
                            subset_save_path = save_path + '_subset'
                            train_saver_subset.save(sess, subset_save_path)
                            perturb.run()
                            self.assertAllCloseAccordingToType([1.8, 2.8],
                                                               var0.eval())
                            self.assertAllCloseAccordingToType([3.9, 4.9],
                                                               ema_var0.eval())
                            self.assertAllCloseAccordingToType([4.98, 5.98],
                                                               var1.eval())
                            self.assertAllCloseAccordingToType([6.99, 7.99],
                                                               ema_var1.eval())
                            # Restoring should only restore var0 and ema_var0.
                            train_saver_subset.restore(sess, subset_save_path)
                            self.assertAllCloseAccordingToType([0.8, 1.8],
                                                               var0.eval())
                            self.assertAllCloseAccordingToType([0.9, 1.9],
                                                               ema_var0.eval())
                            self.assertAllCloseAccordingToType([4.98, 5.98],
                                                               var1.eval())
                            self.assertAllCloseAccordingToType([6.99, 7.99],
                                                               ema_var1.eval())
                            # Restore back to previous state.
                            train_saver.restore(sess, save_path)

                        # If updates are parallel,
                        # this is not always true after the 1st step.
                        if sequential_update:
                            # Test that the normal saver will have the averaged variables.
                            # We test that the average values are between the original value
                            # and the most recent variable values (since they are an average
                            # of the two).
                            val0 = var0.eval()
                            val1 = var1.eval()
                            train_saver.save(sess, save_path)
                            inference_saver.restore(sess, save_path)
                            avg_val0 = var0.eval()
                            avg_val1 = var1.eval()
                            for i in six.moves.range(len(val0)):
                                self.assertLess(val0[i], avg_val0[i])
                                self.assertLess(avg_val0[i], orig_val0[i])
                                self.assertLess(val1[i], avg_val1[i])
                                self.assertLess(avg_val1[i], orig_val1[i])
                            train_saver.restore(sess, save_path)
                        # Step 2.
                        update.run()
                        # Test that the normal saver will have the averaged variables.
                        # We test that the average values are between the original value and
                        # the most recent variable values (since they are an average of the
                        # two).
                        val0 = var0.eval()
                        val1 = var1.eval()
                        self.assertAllCloseAccordingToType([0.6, 1.6], val0)
                        self.assertAllCloseAccordingToType([2.96, 3.96], val1)
                        train_saver.save(sess, save_path)
                        inference_saver.restore(sess, save_path)
                        avg_val0 = var0.eval()
                        avg_val1 = var1.eval()
                        for i in six.moves.range(len(val0)):
                            self.assertLess(val0[i], avg_val0[i])
                            self.assertLess(avg_val0[i], orig_val0[i])
                            self.assertLess(val1[i], avg_val1[i])
                            self.assertLess(avg_val1[i], orig_val1[i])
Ejemplo n.º 19
0
    def _testCudnnCompatibleRnnCells(self, num_layers, seq_length, num_units,
                                     input_size, batch_size, rnn_mode,
                                     use_block_cell):
        """Checks cuDNN-compatible canonical cells reproduce cuDNN RNN results.

        Builds and briefly trains a cuDNN RNN, checkpoints it, then restores
        the same checkpoint into (a) a fresh cuDNN inference graph and (b) a
        canonical RNN graph built from cuDNN-compatible cells, asserting that
        both produce the same outputs and final states for the same input.

        Args:
          num_layers: Number of stacked RNN layers.
          seq_length: Time dimension of the input.
          num_units: Hidden size per layer.
          input_size: Feature dimension of the input.
          batch_size: Batch dimension of the input.
          rnn_mode: cuDNN RNN mode constant (e.g. cudnn_rnn_ops.CUDNN_LSTM).
          use_block_cell: Whether the canonical graph uses block cells.
        """
        # Only LSTM carries a cell state `c` in addition to the hidden state
        # `h`; this controls which state assertions run at the end.
        has_state_c = rnn_mode == cudnn_rnn_ops.CUDNN_LSTM
        np.random.seed(0)
        # Train graph
        with ops.Graph().as_default():
            # Fixed graph seed so the three graphs below are comparable.
            random_seed.set_random_seed(299)
            input_data = array_ops.placeholder(
                dtypes.float32, shape=[seq_length, batch_size, input_size])
            output_tuple, cudnn_model, cudnn_params = self._build_forward_cudnn_model(
                rnn_mode, num_layers, num_units, input_data, is_training=True)
            target_output = array_ops.placeholder(dtype=dtypes.float32,
                                                  shape=None)
            # Collapse all outputs/states to one scalar so a simple scalar
            # regression loss can be used; the exact loss is irrelevant, it
            # only has to move the weights off their initial values.
            total_sum = sum(map(math_ops.reduce_sum, output_tuple))

            loss_op = losses.log_loss(labels=target_output,
                                      predictions=total_sum)
            optimizer = gradient_descent.GradientDescentOptimizer(
                learning_rate=1e-2)
            train_op = optimizer.minimize(loss_op)

            saver = saver_lib.Saver(write_version=saver_pb2.SaverDef.V2)

            # Train Cudnn model
            with self.test_session(use_gpu=True,
                                   graph=ops.get_default_graph()) as sess:
                sess.run(variables.global_variables_initializer())
                # Train 128 steps
                num_steps = 128
                for _ in range(num_steps):
                    inputs = np.random.rand(seq_length, batch_size,
                                            input_size).astype(np.float32)
                    targets = np.random.rand()
                    sess.run(train_op,
                             feed_dict={
                                 input_data: inputs,
                                 target_output: targets
                             })

                save_path = os.path.join(self.get_temp_dir(),
                                         ("cudnn-rnn-%s-test" % rnn_mode))
                save_v = saver.save(sess, save_path)
                self.assertEqual(save_path, save_v)
                # Snapshot the trained opaque cuDNN parameter blob for
                # comparison after restore.
                cudnn_params_v = sess.run(cudnn_params)

        # cuDNN inference graph
        with ops.Graph().as_default():
            random_seed.set_random_seed(299)
            cudnn_inputs = array_ops.placeholder(
                dtypes.float32, shape=[seq_length, batch_size, input_size])
            (cudnn_output_tuple, cudnn_model,
             cudnn_params) = self._build_forward_cudnn_model(rnn_mode,
                                                             num_layers,
                                                             num_units,
                                                             cudnn_inputs,
                                                             is_training=False)
            saver = saver_lib.Saver(write_version=saver_pb2.SaverDef.V2)

            # One fixed random batch reused by both inference graphs so their
            # outputs are directly comparable.
            inference_input = np.random.rand(seq_length, batch_size,
                                             input_size).astype(np.float32)
            with self.test_session(use_gpu=True,
                                   graph=ops.get_default_graph()) as sess:
                sess.run(variables.global_variables_initializer())
                saver.restore(sess, save_path)
                # The restored parameter blob must match the trained one
                # exactly (checkpoint round-trip is lossless).
                restored_cudnn_params_v = sess.run(cudnn_params)
                self.assertAllEqual(cudnn_params_v, restored_cudnn_params_v)

                # Cudnn inference
                cudnn_output = sess.run(
                    cudnn_output_tuple,
                    feed_dict={cudnn_inputs: inference_input})

        # Canonical RNN inference graph
        with ops.Graph().as_default():
            random_seed.set_random_seed(299)
            cell_inputs = array_ops.placeholder(
                dtypes.float32, shape=[seq_length, batch_size, input_size])
            (output, states) = _create_cudnn_compatible_canonical_rnn(
                cudnn_model, cell_inputs, use_block_cell)
            saver = saver_lib.Saver(write_version=saver_pb2.SaverDef.V2)

            with self.test_session(use_gpu=True,
                                   graph=ops.get_default_graph()) as sess:
                # Restore the cuDNN checkpoint into the canonical cells; the
                # saver translates between the two parameter layouts.
                saver.restore(sess, save_path)

                # BlockCell inference
                output_v, states_v = sess.run(
                    [output, states], feed_dict={cell_inputs: inference_input})

                # output across timestamps are packed into one tensor.
                self.assertAllClose(cudnn_output[0],
                                    output_v,
                                    atol=1e-6,
                                    rtol=1e-6)

                # Per-layer final states must also agree with cuDNN's.
                for i in range(num_layers):
                    if has_state_c:
                        # output_h
                        self.assertAllClose(cudnn_output[1][i, :],
                                            states_v[i].h,
                                            atol=1e-6,
                                            rtol=1e-6)
                        # output_c
                        self.assertAllClose(cudnn_output[2][i, :],
                                            states_v[i].c,
                                            atol=1e-6,
                                            rtol=1e-6)
                    else:
                        self.assertAllClose(cudnn_output[1][i, :],
                                            states_v[i],
                                            atol=1e-6,
                                            rtol=1e-6)
 def optimizer_fn(self):
     """Build a plain SGD optimizer with a fixed learning rate of 1.0."""
     learning_rate = 1.0
     return gradient_descent.GradientDescentOptimizer(learning_rate)
 def optimizer_fn_without_params():
     """Return an SGD optimizer (learning rate 1.0); takes no arguments."""
     rate = 1.0
     return gradient_descent.GradientDescentOptimizer(learning_rate=rate)
 def optimizer_fn(self, params):
     """Build an SGD optimizer whose learning rate is read from ``params``.

     Args:
       params: Mapping that must contain a ``'learning_rate'`` entry.
     """
     lr = params['learning_rate']
     return gradient_descent.GradientDescentOptimizer(lr)
Ejemplo n.º 23
0
    def test_statefulness_GRU(self):
        """Verifies stateful GRU state carry-over, resets, and mask handling.

        Builds a tiny stateful GRU on top of a masking Embedding, then checks
        that predictions change as internal state accumulates, that both
        layer-level and model-level ``reset_states()`` work, and that masked
        (zero) timesteps do not affect the output regardless of where the
        padding appears in the sequence.
        """
        if test.is_built_with_rocm():
            self.skipTest('Skipping the test as ROCm MIOpen does not '
                          'support padded input yet.')

        num_samples = 2
        timesteps = 3
        embedding_dim = 4
        units = 2
        layer_class = rnn.GRU
        model = keras.models.Sequential()
        # mask_zero=True makes input value 0 a masked (skipped) timestep,
        # which the masking assertions at the bottom rely on.
        model.add(
            keras.layers.Embedding(4,
                                   embedding_dim,
                                   mask_zero=True,
                                   input_length=timesteps,
                                   batch_input_shape=(num_samples, timesteps)))
        # stateful=True: the final state of each batch seeds the next call.
        layer = layer_class(units,
                            return_sequences=False,
                            stateful=True,
                            weights=None)
        model.add(layer)
        model.compile(
            optimizer=gradient_descent.GradientDescentOptimizer(0.01),
            loss='mse',
            run_eagerly=testing_utils.should_run_eagerly(),
            experimental_run_tf_function=testing_utils.should_run_tf_function(
            ))
        out1 = model.predict(np.ones((num_samples, timesteps)))
        self.assertEqual(out1.shape, (num_samples, units))

        # train once so that the states change
        model.train_on_batch(np.ones((num_samples, timesteps)),
                             np.ones((num_samples, units)))
        out2 = model.predict(np.ones((num_samples, timesteps)))

        # if the state is not reset, output should be different
        self.assertNotEqual(out1.max(), out2.max())

        # check that output changes after states are reset
        # (even though the model itself didn't change)
        layer.reset_states()
        out3 = model.predict(np.ones((num_samples, timesteps)))
        self.assertNotEqual(out2.max(), out3.max())

        # check that container-level reset_states() works
        model.reset_states()
        out4 = model.predict(np.ones((num_samples, timesteps)))
        np.testing.assert_allclose(out3, out4, atol=1e-5)

        # check that the call to `predict` updated the states
        out5 = model.predict(np.ones((num_samples, timesteps)))
        self.assertNotEqual(out4.max(), out5.max())

        # Check masking
        layer.reset_states()

        # Zeros at the start of each sequence are masked out.
        left_padded_input = np.ones((num_samples, timesteps))
        left_padded_input[0, :1] = 0
        left_padded_input[1, :2] = 0
        out6 = model.predict(left_padded_input)

        layer.reset_states()

        # Same amount of padding, but at the end of each sequence.
        right_padded_input = np.ones((num_samples, timesteps))
        right_padded_input[0, -1:] = 0
        right_padded_input[1, -2:] = 0
        out7 = model.predict(right_padded_input)

        layer.reset_states()

        # Padding interleaved with real timesteps.
        mix_padded_input = np.ones((num_samples, timesteps))
        mix_padded_input[0, 1] = 0
        mix_padded_input[1, 0] = 0
        mix_padded_input[1, 2] = 0
        out8 = model.predict(mix_padded_input)

        # Masked steps must not influence the result, so all three padded
        # variants yield (numerically) the same output.
        self.assertAllClose(out7, out6, atol=1e-5)
        self.assertAllClose(out8, out7, atol=1e-5)
Ejemplo n.º 24
0
    def testBatchNormsMatchFwdBwdSomeOnShard0SomeOnShard1(self):
        """Checks batch-norm compute-set reuse across IPU shards.

        Builds conv+batch-norm stacks split across two IPU shards (conv1/conv2
        + two BNs on shard 0, conv3 + one BN on shard 1), runs one train step,
        and asserts the compiled Poplar report contains exactly the expected
        compute sets — in particular that the two BNs on shard 0 share one
        forward compute set while the shard-1 BN gets its own.
        """
        with self.session() as sess:
            with ops.device("/device:IPU:0"):
                x = array_ops.placeholder(np.float32, shape=[1, 4, 4, 2])

                with variable_scope.variable_scope("vs", use_resource=True):
                    with ipu.scopes.ipu_shard(0):
                        y = convolutional.conv2d(
                            x,
                            2,
                            1,
                            use_bias=False,
                            kernel_initializer=init_ops.ones_initializer(),
                            name='conv1')
                        y = layers_norm.batch_normalization(y,
                                                            fused=True,
                                                            training=True)
                        y = convolutional.conv2d(
                            y,
                            2,
                            1,
                            use_bias=False,
                            kernel_initializer=init_ops.ones_initializer(),
                            name='conv2')
                        y = layers_norm.batch_normalization(y,
                                                            fused=True,
                                                            training=True)

                    with ipu.scopes.ipu_shard(1):
                        y = convolutional.conv2d(
                            y,
                            2,
                            1,
                            use_bias=False,
                            kernel_initializer=init_ops.ones_initializer(),
                            name='conv3')
                        y = layers_norm.batch_normalization(y,
                                                            fused=True,
                                                            training=True)

                loss = math_ops.reduce_sum(y)
                optimizer = gradient_descent.GradientDescentOptimizer(0.1)
                train = optimizer.minimize(loss)

            # sharded=True so the report covers both shards.
            report = tu.ReportJSON(self, sess, sharded=True)
            tu.move_variable_initialization_to_cpu()

            sess.run(variables.global_variables_initializer())

            # Discard compilation/initialization events; only the train step
            # below should appear in the parsed log.
            report.reset()

            sess.run([train, loss], {x: np.zeros([1, 4, 4, 2])})

            report.parse_log()

            # Two BN for forwards (on shards 0 and 1) and two BN for grad
            # (note that we don't cache gradient application)
            # pylint: disable=line-too-long
            ok = [
                '__seed*',
                '*OnTileCopy*',
                'Copy_',
                'vs/conv1/Conv2D/convolution.*/Conv_1x1',
                'vs/conv3/Conv2D/convolution.*/Conv_1x1',
                'vs/batch_normalization/FusedBatchNorm*/batch-norm-training.*/',
                'vs/batch_normalization_2/FusedBatchNorm*/batch-norm-training.*/',
                'Sum/reduce.*/ReduceOnTile/InToIntermediateNoExchange/Reduce',
                'Sum/reduce.*/ReduceFinalStage/IntermediateToOutput/Reduce',
                'gradients/vs/batch_normalization_2/FusedBatchNorm*_grad/FusedBatchNormGrad*/batch-norm-grad.*/',
                'gradients/vs/batch_normalization_1/FusedBatchNorm*_grad/FusedBatchNormGrad*/batch-norm-grad.*/',
                'GradientDescent/update_vs/batch_normalization/',
                'GradientDescent/update_vs/batch_normalization_1/',
                'GradientDescent/update_vs/batch_normalization_2/',
                'gradients/vs/conv3/Conv2D_grad/Conv2DBackpropFilter/fusion.*/AddTo',
                'gradients/vs/conv3/Conv2D_grad/Conv2DBackpropFilter/fusion.*/Conv_4x4',
                'gradients/vs/conv3/Conv2D_grad/Conv2DBackpropFilter/fusion.*/Transpose',
                'gradients/vs/conv3/Conv2D_grad/Conv2DBackpropInput/fusion/*Transpose',
                'gradients/vs/conv2/Conv2D_grad/Conv2DBackpropInput/fusion.*/*Transpose',
                'gradients/vs/conv1/Conv2D_grad/Conv2DBackpropFilter/fusion.*/Conv_4x4',
                'gradients/vs/conv1/Conv2D_grad/Conv2DBackpropFilter/fusion.*/AddTo',
                'gradients/vs/conv1/Conv2D_grad/Conv2DBackpropFilter/fusion.*/Transpose',
            ]
            # pylint: enable=line-too-long
            report.assert_all_compute_sets_and_list(ok)
Ejemplo n.º 25
0
 def optimizer_fn():
     """Return a fresh SGD optimizer with a fixed learning rate of 0.1."""
     lr = 0.1
     return gradient_descent.GradientDescentOptimizer(learning_rate=lr)
Ejemplo n.º 26
0
    def testPipelineCompare1(self):
        """Compares a 4-stage sequential pipeline against a CPU reference.

        Builds a small conv/dense network split into four pipeline stages
        (with a scalar side-input and label threaded through every stage) and
        delegates to PipelineTester.compare_pipeline_to_cpu to check that the
        pipelined IPU execution matches the plain CPU computation.
        """
        def dataset_fn():
            # Values 0..6 repeated, shaped [4, 4, 2], batched in pairs.
            dataset = tu.create_single_increasing_dataset(7, shape=[4, 4, 2])
            dataset = dataset.batch(batch_size=2, drop_remainder=True)

            def dataset_parser(value):
                # Normalise the image into [0, 1); use one corner element as
                # a scalar label.
                img = value / 7
                label = value[0][0][0][0]
                return img, label

            return dataset.map(dataset_parser)

        gradient_accumulation_count = 16
        repeat_count = 2
        optimizer = gradient_descent.GradientDescentOptimizer(0.01)

        # Each stage passes the scalar input `c` and the label through so the
        # final stage can fold them into the loss.
        def stage1(c, img, label):
            with variable_scope.variable_scope("stage1", use_resource=True):
                y = layers.Conv2D(
                    2,
                    1,
                    use_bias=True,
                    kernel_initializer=init_ops.constant_initializer(0.5),
                    bias_initializer=init_ops.constant_initializer(0.5),
                    name='conv1')(img)
                return y, c, label

        def stage2(x, c, label):
            with variable_scope.variable_scope("stage2", use_resource=True):
                return x * 20, c, label

        def stage3(x, c, label):
            with variable_scope.variable_scope("stage3", use_resource=True):
                return layers.Dense(
                    2,
                    kernel_initializer=init_ops.constant_initializer(0.5),
                    bias_initializer=init_ops.constant_initializer(0.5))(
                        x), c, label

        def stage4(x, c, label):
            # Final stage: reduce to a scalar loss and mix in c and label.
            with variable_scope.variable_scope("stage4", use_resource=True):
                return math_ops.reduce_sum(
                    layers.Dense(
                        2,
                        kernel_initializer=init_ops.constant_initializer(0.5),
                        bias_initializer=init_ops.constant_initializer(0.5))
                    (x)) + c + label

        def inputs_fn():
            # The scalar side-input `c` fed to stage1 (value 10.01 below).
            with ops.device('cpu'):
                return [array_ops.placeholder(np.float32, shape=[])]

        # 13936 is the expected cycle budget for this configuration
        # (NOTE(review): presumably asserted inside the tester — confirm).
        pipelining_test_util.PipelineTester.compare_pipeline_to_cpu(
            [stage1, stage2, stage3, stage4],
            inputs_fn, [10.01],
            repeat_count,
            gradient_accumulation_count,
            dataset_fn,
            optimizer,
            self,
            13936,
            True,
            pipelining_ops.PipelineSchedule.Sequential,
            batch_serialization_iterations=4)
Ejemplo n.º 27
0
    def testPipelineCompare2(self):
        """Compares a 3-stage ResNet-like pipeline against a sharded build.

        Stage 1 is the conv stem, stage 2 a residual block, stage 3 the
        classifier head with a softmax cross-entropy loss. Delegates to
        PipelineTester.compare_pipeline_to_sharding to verify the pipelined
        execution matches plain sharded execution.
        """
        # Resnet like network.
        def dataset_fn():
            dataset = tu.create_single_increasing_dataset(100, shape=[4])
            # Triple batching builds rank-4 "images": [4] -> [32, 4]
            # -> [32, 32, 4] -> [2, 32, 32, 4].
            dataset = dataset.batch(batch_size=32, drop_remainder=True)
            dataset = dataset.batch(batch_size=32, drop_remainder=True)
            dataset = dataset.batch(batch_size=2, drop_remainder=True)

            def dataset_parser(value):
                img = value
                # Integer class label derived from the mean pixel value.
                label = math_ops.reduce_mean(img, axis=[1, 2, 3])
                return img, math_ops.cast(label, np.int32)

            return dataset.map(dataset_parser)

        gradient_accumulation_count = 18
        repeat_count = 2
        optimizer = gradient_descent.GradientDescentOptimizer(0.01)

        def fixed_padding(inputs, kernel_size):
            # Explicit symmetric spatial padding used before strided convs.
            pad_total = kernel_size - 1
            pad_beg = pad_total // 2
            pad_end = pad_total - pad_beg
            padded_inputs = array_ops.pad(
                inputs,
                [[0, 0], [pad_beg, pad_end], [pad_beg, pad_end], [0, 0]])
            return padded_inputs

        def block(name, first_stride, out_filters, count, x):
            # A chain of `count` residual units; only the first may stride.

            for i in range(count):
                shape_in = x.shape
                stride = first_stride if (i == 0) else 1
                if stride > 1:
                    x = fixed_padding(x, 3)
                sc = x

                with variable_scope.variable_scope(name + "/" + str(i) + "/1"):
                    x = conv(x, 3, stride, out_filters)
                    x = nn.relu(x)

                with variable_scope.variable_scope(name + "/" + str(i) + "/2"):
                    x = conv(x, 3, 1, out_filters)

                    # shortcut
                    if stride != 1:
                        # Downsample the shortcut to match the strided main
                        # path spatially.
                        sc = array_ops.strided_slice(
                            sc, [0, 0, 0, 0],
                            sc.shape,
                            strides=[1, stride, stride, 1])
                    # Zero-pad channels when the unit widens the feature map.
                    pad = int(x.shape[3] - shape_in[3])
                    if pad != 0:
                        sc = array_ops.pad(sc,
                                           paddings=[[0, 0], [0, 0], [0, 0],
                                                     [0, pad]])

                    x = nn.relu(x + sc)

            return x

        def fc(x, num_units_out):
            return layers.Dense(
                num_units_out,
                kernel_initializer=init_ops.constant_initializer(0.1),
                bias_initializer=init_ops.constant_initializer(0.0))(x)

        def max_pool(x, ksize=3, stride=2):
            return layers.MaxPooling2D(ksize, stride, padding='SAME')(x)

        def conv(x, ksize, stride, filters_out):
            return layers.Conv2D(
                filters_out,
                ksize,
                stride,
                'SAME',
                kernel_initializer=init_ops.constant_initializer(0.1),
                bias_initializer=init_ops.constant_initializer(0.0))(x)

        def stage1(img, label):
            # Stem: 7x7/2 conv + relu + 3x3/2 max-pool.
            with variable_scope.variable_scope("stage1", use_resource=True):
                x = conv(img, 7, 2, 16)
                x = nn.relu(x)
                x = max_pool(x, ksize=3, stride=2)
                return x, label

        def stage2(x, label):
            # One residual block, stride 2, 64 filters.
            with variable_scope.variable_scope("stage2", use_resource=True):
                x = block("b", 2, 64, 1, x)
                return x, label

        def stage3(x, label):
            # Head: global average pool, 100-way FC, cross-entropy loss.
            with variable_scope.variable_scope("stage3", use_resource=True):
                x = math_ops.reduce_mean(x, axis=[1, 2])
                x = fc(x, 100)
                loss = math_ops.reduce_mean(
                    nn.sparse_softmax_cross_entropy_with_logits(logits=x,
                                                                labels=label))
                return loss

        # 57095 is the expected cycle budget for this configuration
        # (NOTE(review): presumably asserted inside the tester — confirm).
        pipelining_test_util.PipelineTester.compare_pipeline_to_sharding(
            [stage1, stage2, stage3],
            lambda: [], [],
            repeat_count,
            gradient_accumulation_count,
            dataset_fn,
            optimizer,
            self,
            57095,
            True,
            pipelining_ops.PipelineSchedule.Sequential,
            batch_serialization_iterations=5)
# Named distribution strategies used to parameterize tests; the lambdas defer
# strategy construction until a test actually selects the combination.
mirrored_strategy_with_cpu_1_and_2 = combinations.NamedDistribution(
    "Mirrored2CPU",
    lambda: mirrored_lib.MirroredStrategy(["/cpu:1", "/cpu:2"]))
central_storage_strategy_with_two_gpus = combinations.NamedDistribution(
    "CentralStorage2GPUs",
    lambda: central_storage_strategy.CentralStorageStrategy._from_num_gpus(2),  # pylint: disable=protected-access
    required_gpus=2)
central_storage_strategy_with_gpu_and_cpu = combinations.NamedDistribution(
    "CentralStorageCPUAndGPU",
    lambda: central_storage_strategy.CentralStorageStrategy(
        ["/gpu:0", "/cpu:0"]),
    required_gpus=1)

# V1 (tf.train) optimizer factories for test combinations. Wrapped in lambdas
# so each test run gets a fresh optimizer instance.
gradient_descent_optimizer_v1_fn = combinations.NamedObject(
    "GradientDescentV1",
    lambda: gradient_descent.GradientDescentOptimizer(0.2))
adagrad_optimizer_v1_fn = combinations.NamedObject(
    "AdagradV1", lambda: adagrad.AdagradOptimizer(0.001))
adam_optimizer_v1_fn = combinations.NamedObject(
    "AdamV1", lambda: adam.AdamOptimizer(0.001, epsilon=1))
rmsprop_optimizer_v1_fn = combinations.NamedObject(
    "RmsPropV1", lambda: rmsprop.RMSPropOptimizer(0.001))

# TODO(shiningsun): consider adding the other v1 optimizers
optimizers_v1 = [gradient_descent_optimizer_v1_fn, adagrad_optimizer_v1_fn]

# Keras V2 optimizer factories.
adadelta_optimizer_keras_v2_fn = combinations.NamedObject(
    "AdadeltaKerasV2", lambda: adadelta_keras_v2.Adadelta(0.001))
adagrad_optimizer_keras_v2_fn = combinations.NamedObject(
    "AdagradKerasV2", lambda: adagrad_keras_v2.Adagrad(0.001))
adam_optimizer_keras_v2_fn = combinations.NamedObject(
Ejemplo n.º 29
0
    def test_statefulness_LSTM(self):
        """Verifies stateful UnifiedLSTM state carry-over, resets and masking.

        Builds a tiny stateful LSTM on top of a masking Embedding, then checks
        that predictions change as internal state accumulates, that both
        layer-level and model-level ``reset_states()`` work, and that masked
        (zero) timesteps do not affect the output whether the padding is at
        the start or the end of the sequence.
        """
        num_samples = 2
        timesteps = 3
        embedding_dim = 4
        units = 2
        layer_class = keras.layers.UnifiedLSTM
        model = keras.models.Sequential()
        # mask_zero=True makes input value 0 a masked (skipped) timestep,
        # which the masking assertions at the bottom rely on.
        model.add(
            keras.layers.Embedding(4,
                                   embedding_dim,
                                   mask_zero=True,
                                   input_length=timesteps,
                                   batch_input_shape=(num_samples, timesteps)))
        # stateful=True: the final state of each batch seeds the next call.
        layer = layer_class(units,
                            return_sequences=False,
                            stateful=True,
                            weights=None)
        model.add(layer)
        model.compile(
            optimizer=gradient_descent.GradientDescentOptimizer(0.01),
            loss='mse',
            run_eagerly=testing_utils.should_run_eagerly())
        out1 = model.predict(np.ones((num_samples, timesteps)))
        self.assertEqual(out1.shape, (num_samples, units))

        # train once so that the states change
        model.train_on_batch(np.ones((num_samples, timesteps)),
                             np.ones((num_samples, units)))
        out2 = model.predict(np.ones((num_samples, timesteps)))

        # if the state is not reset, output should be different
        self.assertNotEqual(out1.max(), out2.max())

        # check that output changes after states are reset
        # (even though the model itself didn't change)
        layer.reset_states()
        out3 = model.predict(np.ones((num_samples, timesteps)))
        self.assertNotEqual(out2.max(), out3.max())

        # check that container-level reset_states() works
        model.reset_states()
        out4 = model.predict(np.ones((num_samples, timesteps)))
        self.assertAllClose(out3, out4, atol=1e-5)

        # check that the call to `predict` updated the states
        out5 = model.predict(np.ones((num_samples, timesteps)))
        self.assertNotEqual(out4.max(), out5.max())

        # Check masking
        layer.reset_states()

        # Zeros at the start of each sequence are masked out.
        left_padded_input = np.ones((num_samples, timesteps))
        left_padded_input[0, :1] = 0
        left_padded_input[1, :2] = 0
        out6 = model.predict(left_padded_input)

        layer.reset_states()

        # Same amount of padding at the end; masked steps must not influence
        # the result, so both variants yield (numerically) the same output.
        right_padded_input = np.ones((num_samples, timesteps))
        right_padded_input[0, -1:] = 0
        right_padded_input[1, -2:] = 0
        out7 = model.predict(right_padded_input)

        self.assertAllClose(out7, out6, atol=1e-5)
Ejemplo n.º 30
0
 def test_wrap_optimizer(self):
     """The mixed-precision graph rewrite wraps a V1 optimizer in a
     MixedPrecisionLossScaleOptimizer carrying the requested loss scale."""
     sgd = gradient_descent_v1.GradientDescentOptimizer(1.0)
     wrapped = enable_mixed_precision_graph_rewrite(sgd, 123.)
     self.assertIsInstance(
         wrapped, loss_scale_optimizer_v1.MixedPrecisionLossScaleOptimizer)
     self.assertEqual(self.evaluate(wrapped._loss_scale()), 123.)