Example #1
    def test_convergence(self):
        for dtype in ['mixed', tf.float32]:
            with tf.Graph().as_default() as g:
                n_samples = 10
                n_hid = 10
                var_dtype = tf.float32 if dtype == tf.float32 else tf.float16

                np.random.seed(0)
                X = np.random.rand(n_samples, n_hid)
                y = np.random.rand(n_samples, 1)
                w = np.linalg.solve(X.T.dot(X), X.T.dot(y))

                x_ph = tf.placeholder(var_dtype, [n_samples, n_hid])
                y_ph = tf.placeholder(var_dtype, [n_samples, 1])

                y_pred = tf.layers.dense(x_ph, 1, use_bias=False)
                loss = tf.losses.mean_squared_error(y_ph, y_pred)
                loss += tf.losses.get_regularization_loss()
                train_op = optimize_loss(loss,
                                         "Adam", {},
                                         lambda gs: fixed_lr(gs, 0.05),
                                         dtype=dtype)

                with self.test_session(g, use_gpu=True) as sess:
                    sess.run(tf.global_variables_initializer())
                    for i in range(6000):
                        sess.run(train_op, {x_ph: X, y_ph: y})
                    w_learned = sess.run(tf.trainable_variables()[0])

                npt.assert_allclose(w_learned, w, atol=0.01)
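
The reference weights w in this test come from the normal equations for linear least squares, so the Adam loop should converge to them. A standalone NumPy sanity check of that closed-form solution (independent of the test class above):

import numpy as np

np.random.seed(0)
X = np.random.rand(10, 10)
y = np.random.rand(10, 1)

# Closed-form least-squares solution: w = (X^T X)^{-1} X^T y
w = np.linalg.solve(X.T.dot(X), X.T.dot(y))

# At the optimum the MSE gradient 2/n * X^T (X w - y) vanishes, which is why
# the training loop above can be compared against w with a small tolerance.
grad_at_w = 2 * X.T.dot(X.dot(w) - y) / X.shape[0]
assert np.allclose(grad_at_w, 0, atol=1e-8)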
Example #2
  def test_updates(self):
    try:
      import horovod.tensorflow as hvd
      hvd.init()
    except ImportError:
      print("Horovod not installed skipping test_updates")
      return

    dtype = tf.float32
    with tf.Graph().as_default() as g:
      n_samples = 10
      n_hid = 10
      var_dtype = tf.float32 if dtype == tf.float32 else tf.float16

      np.random.seed(0)
      X = np.random.rand(n_samples, n_hid)
      y = np.random.rand(n_samples, 1)
      w = np.linalg.solve(X.T.dot(X), X.T.dot(y))

      x_ph = tf.placeholder(var_dtype, [n_samples, n_hid])
      y_ph = tf.placeholder(var_dtype, [n_samples, 1])

      y_pred = tf.layers.dense(x_ph, 1, use_bias=False)
      loss = tf.losses.mean_squared_error(y_ph, y_pred)
      loss += tf.losses.get_regularization_loss()
      skip_update_ph = tf.placeholder(tf.bool)
      iter_size = 8
      train_op = optimize_loss(loss, "SGD", {},
                               lambda gs: fixed_lr(gs, 0.1), dtype=dtype,
                               iter_size=iter_size, on_horovod=True,
                               skip_update_ph=skip_update_ph)
      grad_accum = [var for var in tf.global_variables() if 'accum' in var.name][0]
      var = tf.trainable_variables()[0]
      with self.test_session(g, use_gpu=True) as sess:
        sess.run(tf.global_variables_initializer())
        for _ in range(3):
          g, v = sess.run([grad_accum, var])
          npt.assert_allclose(g, np.zeros(g.shape))

          true_g = 2 * (X.T.dot(X).dot(v) - X.T.dot(y)) / X.shape[0] / iter_size

          sess.run(train_op, {x_ph: X, y_ph: y, skip_update_ph: True})
          g_new, v_new = sess.run([grad_accum, var])
          npt.assert_allclose(g_new, true_g, atol=1e-7)
          npt.assert_allclose(v_new, v)

          sess.run(train_op, {x_ph: X, y_ph: y, skip_update_ph: True})
          g_new, v_new = sess.run([grad_accum, var])
          npt.assert_allclose(g_new, true_g * 2, atol=1e-7)
          npt.assert_allclose(v_new, v)

          sess.run(train_op, {x_ph: X, y_ph: y, skip_update_ph: True})
          g_new, v_new = sess.run([grad_accum, var])
          npt.assert_allclose(g_new, true_g * 3, atol=1e-7)
          npt.assert_allclose(v_new, v)

          sess.run(train_op, {x_ph: X, y_ph: y, skip_update_ph: False})
          g_new, v_new = sess.run([grad_accum, var])
          npt.assert_allclose(g_new, np.zeros(g.shape))
          npt.assert_allclose(v_new, v - 0.1 * true_g * 4, atol=1e-7)
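
The accumulator assertions rely on the analytic gradient of the mean squared error, 2 * X^T (X v - y) / n, scaled by 1 / iter_size before it is added to the accumulator. A standalone finite-difference check of that formula (NumPy only; mse_loss and mse_grad are helper names introduced here, not library functions):

import numpy as np

def mse_loss(X, y, v):
    return np.mean((X.dot(v) - y) ** 2)

def mse_grad(X, y, v, iter_size=1):
    # Analytic gradient of mse_loss w.r.t. v; the test above divides by
    # iter_size=8 because gradients are accumulated at that scale.
    return 2 * X.T.dot(X.dot(v) - y) / X.shape[0] / iter_size

np.random.seed(0)
X = np.random.rand(10, 10)
y = np.random.rand(10, 1)
v = np.random.rand(10, 1)

# Central finite differences confirm the analytic formula.
eps = 1e-6
num_grad = np.zeros_like(v)
for i in range(v.shape[0]):
    v_plus, v_minus = v.copy(), v.copy()
    v_plus[i] += eps
    v_minus[i] -= eps
    num_grad[i] = (mse_loss(X, y, v_plus) - mse_loss(X, y, v_minus)) / (2 * eps)

np.testing.assert_allclose(num_grad, mse_grad(X, y, v), atol=1e-5)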
Example #3
  def test_convergence(self):
    for dtype in ['mixed', tf.float32]:
      with tf.Graph().as_default() as g:
        n_samples = 10
        n_hid = 10
        var_dtype = tf.float32 if dtype == tf.float32 else tf.float16

        np.random.seed(0)
        X = np.random.rand(n_samples, n_hid)
        y = np.random.rand(n_samples, 1)
        w = np.linalg.solve(X.T.dot(X), X.T.dot(y))

        x_ph = tf.placeholder(var_dtype, [n_samples, n_hid])
        y_ph = tf.placeholder(var_dtype, [n_samples, 1])

        y_pred = tf.layers.dense(x_ph, 1, use_bias=False)
        loss = tf.losses.mean_squared_error(y_ph, y_pred)
        loss += tf.losses.get_regularization_loss()
        train_op = optimize_loss(loss, "Adam", {},
                                 lambda gs: fixed_lr(gs, 0.05), dtype=dtype)

        with self.test_session(g, use_gpu=True) as sess:
          sess.run(tf.global_variables_initializer())
          for i in range(6000):
            sess.run(train_op, {x_ph: X, y_ph: y})
          w_learned = sess.run(tf.trainable_variables()[0])

        npt.assert_allclose(w_learned, w, atol=0.01)
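
Both convergence tests pass the learning rate through fixed_lr. As a rough sketch only (not necessarily the library's actual source), a constant learning-rate policy with that call shape could look like:

def fixed_lr(global_step, learning_rate):
    # Constant policy: ignore the global step and return the same rate.
    del global_step
    return learning_rate

# used above as: lambda gs: fixed_lr(gs, 0.05)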
Example #4
    def compile(self, force_var_reuse=False):
        """TensorFlow graph is built here."""
        if 'initializer' not in self.params:
            initializer = None
        else:
            init_dict = self.params.get('initializer_params', {})
            initializer = self.params['initializer'](**init_dict)

        if not self.on_horovod:  # not using Horovod
            # below we follow data parallelism for multi-GPU training
            losses = []
            for gpu_cnt, gpu_id in enumerate(self._gpu_ids):
                with tf.device("/gpu:{}".format(gpu_id)), tf.variable_scope(
                        name_or_scope=tf.get_variable_scope(),
                        # re-using variables across GPUs.
                        reuse=force_var_reuse or (gpu_cnt > 0),
                        initializer=initializer,
                        dtype=self.get_tf_dtype(),
                ):
                    deco_print("Building graph on GPU:{}".format(gpu_id))

                    self.get_data_layer(gpu_cnt).build_graph()
                    input_tensors = self.get_data_layer(gpu_cnt).input_tensors

                    loss, self._outputs[gpu_cnt] = \
                        self._build_forward_pass_graph(
                            input_tensors,
                            gpu_id=gpu_cnt,
                        )
                    if self._outputs[gpu_cnt] is not None and \
                       not isinstance(self._outputs[gpu_cnt], list):
                        raise ValueError(
                            'Decoder outputs have to be either None or list')
                    if self._mode == "train" or self._mode == "eval":
                        losses.append(loss)
            # end of for gpu_cnt loop
            if self._mode == "train":
                self.loss = tf.reduce_mean(losses)
            if self._mode == "eval":
                self.eval_losses = losses
        else:  # is using Horovod
            # gpu_id should always be zero, since Horovod pins each process
            # to a single GPU
            with tf.device("/gpu:0"), tf.variable_scope(
                    name_or_scope=tf.get_variable_scope(),
                    reuse=force_var_reuse,
                    initializer=initializer,
                    dtype=self.get_tf_dtype(),
            ):
                deco_print("Building graph in Horovod rank: {}".format(
                    self._hvd.rank()))
                self.get_data_layer().build_graph()
                input_tensors = self.get_data_layer().input_tensors

                loss, self._output = self._build_forward_pass_graph(
                    input_tensors, gpu_id=0)
                if self._output is not None and not isinstance(
                        self._output, list):
                    raise ValueError(
                        'Decoder outputs have to be either None or list')

                if self._mode == "train":
                    self.loss = loss
                if self._mode == "eval":
                    self.eval_losses = [loss]

        try:
            self._num_objects_per_step = [
                self._get_num_objects_per_step(worker_id)
                for worker_id in range(self.num_gpus)
            ]
        except NotImplementedError:
            pass

        if self._mode == "train":
            if 'lr_policy' not in self.params:
                lr_policy = None
            else:
                lr_params = self.params.get('lr_policy_params', {})
                # adding default decay_steps = max_steps if the lr_policy
                # supports it and a different value is not provided
                func_params = signature(self.params['lr_policy']).parameters
                if 'decay_steps' in func_params and 'decay_steps' not in lr_params:
                    lr_params['decay_steps'] = self._last_step
                if 'steps_per_epoch' in func_params and \
                   'steps_per_epoch' not in lr_params and 'num_epochs' in self.params:
                    lr_params['steps_per_epoch'] = self.steps_in_epoch
                lr_policy = lambda gs: self.params['lr_policy'](global_step=gs,
                                                                **lr_params)

            if self.params.get('iter_size', 1) > 1:
                self.skip_update_ph = tf.placeholder(tf.bool)

            self.train_op = optimize_loss(
                loss=tf.cast(self.loss, tf.float32) + get_regularization_loss(),
                dtype=self.params['dtype'],
                optimizer=self.params['optimizer'],
                optimizer_params=self.params.get('optimizer_params', {}),
                clip_gradients=self.params.get('max_grad_norm', None),
                learning_rate_decay_fn=lr_policy,
                summaries=self.params.get('summaries', None),
                larc_params=self.params.get('larc_params', None),
                loss_scaling=self.params.get('loss_scaling', 1.0),
                on_horovod=self.on_horovod,
                iter_size=self.params.get('iter_size', 1),
                skip_update_ph=self.skip_update_ph,
            )
            tf.summary.scalar(name="train_loss", tensor=self.loss)
            if self.steps_in_epoch:
                tf.summary.scalar(
                    name="epoch",
                    tensor=tf.floor(
                        tf.train.get_global_step() /
                        tf.constant(self.steps_in_epoch, dtype=tf.int64)),
                )

            if not self.on_horovod or self._hvd.rank() == 0:
                deco_print("Trainable variables:")
                total_params = 0
                unknown_shape = False
                for var in tf.trainable_variables():
                    var_params = 1
                    deco_print('{}'.format(var.name), offset=2)
                    deco_print('shape: {}, {}'.format(var.get_shape(),
                                                      var.dtype),
                               offset=4)
                    if var.get_shape():
                        for dim in var.get_shape():
                            var_params *= dim.value
                        total_params += var_params
                    else:
                        unknown_shape = True
                if unknown_shape:
                    deco_print(
                        "Encountered unknown variable shape, can't compute total "
                        "number of parameters.")
                else:
                    deco_print(
                        'Total trainable parameters: {}'.format(total_params))
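
The lr_policy block above uses inspect.signature to check whether the configured policy accepts decay_steps (and steps_per_epoch) before injecting defaults. A minimal standalone version of that pattern, with a hypothetical poly_decay policy standing in for self.params['lr_policy']:

from inspect import signature

def poly_decay(global_step, learning_rate, decay_steps, power=1.0):
    # Hypothetical policy, used only to illustrate the signature check.
    return learning_rate * (1 - global_step / decay_steps) ** power

lr_params = {'learning_rate': 0.05}
last_step = 10000

func_params = signature(poly_decay).parameters
if 'decay_steps' in func_params and 'decay_steps' not in lr_params:
    lr_params['decay_steps'] = last_step  # default to the training length

lr_policy = lambda gs: poly_decay(global_step=gs, **lr_params)
print(lr_policy(5000))  # 0.025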
Example #5
    def compile(self, force_var_reuse=False):
        """TensorFlow graph is built here."""
        if 'initializer' not in self.params:
            initializer = None
        else:
            init_dict = self.params.get('initializer_params', {})
            initializer = self.params['initializer'](**init_dict)

        self.data_layer.build_graph()
        input_tensors = self.data_layer.get_input_tensors()

        if not self.on_horovod:  # not using Horovod
            # below we follow data parallelism for multi-GPU training
            losses = []
            for gpu_cnt, gpu_id in enumerate(self._gpu_ids):
                with tf.device("/gpu:{}".format(gpu_id)), tf.variable_scope(
                        name_or_scope=tf.get_variable_scope(),
                        # re-using variables across GPUs.
                        reuse=force_var_reuse or (gpu_cnt > 0),
                        initializer=initializer,
                        dtype=self.get_tf_dtype(),
                ):
                    deco_print("Building graph on GPU:{}".format(gpu_id))

                    loss, self._outputs[gpu_cnt] = \
                        self._build_forward_pass_graph(
                            [input_tensor[gpu_cnt]
                             for input_tensor in input_tensors],
                            gpu_id=gpu_cnt,
                        )
                    if self._mode == "train" or self._mode == "eval":
                        losses.append(loss)
            # end of for gpu_cnt loop
            if self._mode == "train" or self._mode == "eval":
                self.loss = tf.reduce_mean(losses)
        else:  # is using Horovod
            # gpu_id should always be zero, since Horovod pins each process
            # to a single GPU
            with tf.device("/gpu:0"), tf.variable_scope(
                    name_or_scope=tf.get_variable_scope(),
                    reuse=force_var_reuse,
                    initializer=initializer,
                    dtype=self.get_tf_dtype(),
            ):
                deco_print("Building graph in Horovod rank: {}".format(
                    self._hvd.rank()))
                loss, self._outputs[0] = self._build_forward_pass_graph(
                    input_tensors, gpu_id=0)
                if self._mode == "train" or self._mode == "eval":
                    self.loss = loss

        if self._mode == "train":
            if 'lr_policy' not in self.params:
                lr_policy = None
            else:
                lr_params = self.params.get('lr_policy_params', {})
                # adding default decay_steps = max_steps if the lr_policy
                # supports it and a different value is not provided
                if 'decay_steps' in self.params['lr_policy'].__code__.co_varnames and \
                   'decay_steps' not in lr_params:
                    lr_params['decay_steps'] = self._last_step
                lr_policy = lambda lr, gs: self.params['lr_policy'](
                    lr, gs, **lr_params)

            self.train_op = optimize_loss(
                loss=self.loss + get_regularization_loss(),
                dtype=self.params['dtype'],
                learning_rate=self.params['learning_rate'],
                optimizer=self.params['optimizer'],
                optimizer_params=self.params.get('optimizer_params', {}),
                gradient_noise_scale=None,
                gradient_multipliers=None,
                clip_gradients=self.params.get('max_grad_norm', None),
                learning_rate_decay_fn=lr_policy,
                update_ops=None,
                variables=None,
                name="Loss_Optimization",
                summaries=self.params.get('summaries', None),
                colocate_gradients_with_ops=True,
                increment_global_step=True,
                LARC_nu=self.params.get('larc_nu', None),
                LARC_mode=self.params.get('larc_mode', 'clip'),
                loss_scale=self.params.get('loss_scale', 1.0),
                automatic_loss_scaling=self.params.get(
                    'automatic_loss_scaling', None),
                on_horovod=self.on_horovod,
            )
            tf.summary.scalar(name="train_loss", tensor=self.loss)

            if not self.on_horovod or self._hvd.rank() == 0:
                deco_print("Trainable variables:")
                total_params = 0
                unknown_shape = False
                for var in tf.trainable_variables():
                    var_params = 1
                    deco_print('{}'.format(var.name), offset=2)
                    deco_print('shape: {}, {}'.format(var.get_shape(),
                                                      var.dtype),
                               offset=4)
                    if var.get_shape():
                        for dim in var.get_shape():
                            var_params *= dim.value
                        total_params += var_params
                    else:
                        unknown_shape = True
                if unknown_shape:
                    deco_print(
                        "Encountered unknown variable shape, can't compute total "
                        "number of parameters.")
                else:
                    deco_print(
                        'Total trainable parameters: {}'.format(total_params))
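
Unlike the previous example, this older revision tests membership in lr_policy.__code__.co_varnames, which lists the function's local variables as well as its parameters, so a policy that merely uses a local named decay_steps would also trigger the default. A small standalone comparison with inspect.signature (the policy function below is made up for illustration):

from inspect import signature

def policy(global_step, learning_rate):
    # 'decay_steps' is only a local variable here, not a parameter.
    decay_steps = 1000
    return learning_rate * global_step / decay_steps

print('decay_steps' in policy.__code__.co_varnames)   # True (includes locals)
print('decay_steps' in signature(policy).parameters)  # False (parameters only)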
Example #6
  def compile(self, force_var_reuse=False):
    """TensorFlow graph is built here."""
    if 'initializer' not in self.params:
      initializer = None
    else:
      init_dict = self.params.get('initializer_params', {})
      initializer = self.params['initializer'](**init_dict)

    if not self.on_horovod:  # not using Horovod
      # below we follow data parallelism for multi-GPU training
      losses = []
      for gpu_cnt, gpu_id in enumerate(self._gpu_ids):
        with tf.device("/gpu:{}".format(gpu_id)), tf.variable_scope(
          name_or_scope=tf.get_variable_scope(),
          # re-using variables across GPUs.
          reuse=force_var_reuse or (gpu_cnt > 0),
          initializer=initializer,
          dtype=self.get_tf_dtype(),
        ):
          deco_print("Building graph on GPU:{}".format(gpu_id))

          self.get_data_layer(gpu_cnt).build_graph()
          input_tensors = self.get_data_layer(gpu_cnt).input_tensors

          loss, self._outputs[gpu_cnt] = self._build_forward_pass_graph(
            input_tensors,
            gpu_id=gpu_cnt,
          )
          if self._outputs[gpu_cnt] is not None and \
             not isinstance(self._outputs[gpu_cnt], list):
            raise ValueError('Decoder samples have to be either None or list')
          if self._mode == "train" or self._mode == "eval":
            losses.append(loss)
      # end of for gpu_cnt loop
      if self._mode == "train":
        self.loss = tf.reduce_mean(losses)
      if self._mode == "eval":
        self.eval_losses = losses
    else:  # is using Horovod
      # gpu_id should always be zero, since Horovod pins each process
      # to a single GPU
      with tf.device("/gpu:0"), tf.variable_scope(
          name_or_scope=tf.get_variable_scope(),
          reuse=force_var_reuse,
          initializer=initializer,
          dtype=self.get_tf_dtype(),
      ):
        deco_print(
          "Building graph in Horovod rank: {}".format(self._hvd.rank())
        )
        self.get_data_layer().build_graph()
        input_tensors = self.get_data_layer().input_tensors

        loss, self._output = self._build_forward_pass_graph(input_tensors,
                                                            gpu_id=0)
        if self._output is not None and not isinstance(self._output, list):
          raise ValueError('Decoder samples have to be either None or list')

        if self._mode == "train":
          self.loss = loss
        if self._mode == "eval":
          self.eval_losses = [loss]

    if self._mode == "train":
      if 'lr_policy' not in self.params:
        lr_policy = None
      else:
        lr_params = self.params.get('lr_policy_params', {})
        # adding default decay_steps = max_steps if the lr_policy
        # supports it and a different value is not provided
        if 'decay_steps' in self.params['lr_policy'].__code__.co_varnames and \
           'decay_steps' not in lr_params:
          lr_params['decay_steps'] = self._last_step
        if 'steps_per_epoch' in self.params['lr_policy'].__code__.co_varnames and \
           'steps_per_epoch' not in lr_params and 'num_epochs' in self.params:
          lr_params['steps_per_epoch'] = self.steps_in_epoch
        lr_policy = lambda gs: self.params['lr_policy'](global_step=gs,
                                                        **lr_params)

      self.train_op = optimize_loss(
        loss=tf.cast(self.loss, tf.float32) + get_regularization_loss(),
        dtype=self.params['dtype'],
        optimizer=self.params['optimizer'],
        optimizer_params=self.params.get('optimizer_params', {}),
        gradient_noise_scale=None,
        gradient_multipliers=None,
        clip_gradients=self.params.get('max_grad_norm', None),
        learning_rate_decay_fn=lr_policy,
        update_ops=None,
        variables=None,
        name="Loss_Optimization",
        summaries=self.params.get('summaries', None),
        colocate_gradients_with_ops=True,
        increment_global_step=True,
        larc_params=self.params.get('larc_params', None),
        loss_scale=self.params.get('loss_scale', 1.0),
        automatic_loss_scaling=self.params.get('automatic_loss_scaling', None),
        on_horovod=self.on_horovod,
      )
      tf.summary.scalar(name="train_loss", tensor=self.loss)
      if self.steps_in_epoch:
        tf.summary.scalar(
          name="epoch",
          tensor=tf.floor(tf.train.get_global_step() /
                          tf.constant(self.steps_in_epoch, dtype=tf.int64)),
        )

      if not self.on_horovod or self._hvd.rank() == 0:
        deco_print("Trainable variables:")
        total_params = 0
        unknown_shape = False
        for var in tf.trainable_variables():
          var_params = 1
          deco_print('{}'.format(var.name), offset=2)
          deco_print('shape: {}, {}'.format(var.get_shape(), var.dtype),
                     offset=4)
          if var.get_shape():
            for dim in var.get_shape():
              var_params *= dim.value
            total_params += var_params
          else:
            unknown_shape = True
        if unknown_shape:
          deco_print("Encountered unknown variable shape, can't compute total "
                     "number of parameters.")
        else:
          deco_print('Total trainable parameters: {}'.format(total_params))
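
Every compile() variant ends with the same trainable-parameter report: multiply out each variable's known dimensions and skip the total if any shape is unknown. The same counting logic as a standalone helper on plain shape tuples (a sketch, with None standing in for an unknown dimension):

def count_trainable_params(shapes):
    # Plain-Python mirror of the reporting loop above.
    total_params = 0
    for shape in shapes:
        if any(dim is None for dim in shape):
            return None  # unknown shape, no total reported
        var_params = 1
        for dim in shape:
            var_params *= dim
        total_params += var_params
    return total_params

# The dense layer in the convergence tests has a single (10, 1) kernel:
print(count_trainable_params([(10, 1)]))             # 10
print(count_trainable_params([(10, 1), (None, 5)]))  # None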
Example #7
    def compile(self, force_var_reuse=False, checkpoint=None):
        """TensorFlow graph is built here."""
        if 'initializer' not in self.params:
            initializer = None
        else:
            init_dict = self.params.get('initializer_params', {})
            initializer = self.params['initializer'](**init_dict)

        if not self.on_horovod:  # not using Horovod
            # below we follow data parallelism for multi-GPU training
            losses = []
            for gpu_cnt, gpu_id in enumerate(self._gpu_ids):
                with tf.device("/gpu:{}".format(gpu_id)), tf.variable_scope(
                        name_or_scope=tf.get_variable_scope(),
                        # re-using variables across GPUs.
                        reuse=force_var_reuse or (gpu_cnt > 0),
                        initializer=initializer,
                        dtype=self.get_tf_dtype(),
                ):
                    deco_print("Building graph on GPU:{}".format(gpu_id))

                    self.get_data_layer(gpu_cnt).build_graph()
                    input_tensors = self.get_data_layer(gpu_cnt).input_tensors
                    
                    if self.params.get("use_trt", False):
                        # Build TF-TRT graph
                        loss, self._outputs[gpu_cnt] = self.build_trt_forward_pass_graph(
                            input_tensors,
                            gpu_id=gpu_cnt,
                            checkpoint=checkpoint
                        )
                    else:
                        # Build regular TF graph
                        loss, self._outputs[gpu_cnt] = self._build_forward_pass_graph(
                            input_tensors,
                            gpu_id=gpu_cnt
                        )
                    
                    if self._outputs[gpu_cnt] is not None and \
                            not isinstance(self._outputs[gpu_cnt], list):
                        raise ValueError(
                            'Decoder outputs have to be either None or list')
                    if self._mode == "train" or self._mode == "eval":
                        losses.append(loss)

            # end of for gpu_cnt loop
            if self._mode == "train":
                self.loss = tf.reduce_mean(losses)
            if self._mode == "eval":
                self.eval_losses = losses
        else:  # is using Horovod
            # gpu_id should always be zero, since Horovod pins each process
            # to a single GPU
            with tf.device("/gpu:0"), tf.variable_scope(
                    name_or_scope=tf.get_variable_scope(),
                    reuse=force_var_reuse,
                    initializer=initializer,
                    dtype=self.get_tf_dtype(),
            ):
                deco_print(
                    "Building graph in Horovod rank: {}".format(
                        self._hvd.rank())
                )
                self.get_data_layer().build_graph()
                input_tensors = self.get_data_layer().input_tensors

                if self.params.get("use_trt", False):
                    # Build TF-TRT graph
                    all_loss, self._output = self.build_trt_forward_pass_graph(
                        input_tensors,
                        gpu_id=0,
                        checkpoint=checkpoint
                    )
                else:
                    # Build regular TF graph
                    all_loss, self._output = self._build_forward_pass_graph(
                        input_tensors,
                        gpu_id=0
                    )
                if isinstance(all_loss, (dict,)):
                    loss = all_loss['loss']
                else:
                    loss = all_loss

                if self._output is not None and not isinstance(self._output, list):
                    raise ValueError(
                        'Decoder outputs have to be either None or list')

                if self._mode == "train":
                    self.loss = loss
                if self._mode == "eval":
                    self.eval_losses = [loss]

        try:
            self._num_objects_per_step = [self._get_num_objects_per_step(worker_id)
                                          for worker_id in range(self.num_gpus)]
        except NotImplementedError:
            pass

        if self._mode == "train":
            if 'lr_policy' not in self.params:
                lr_policy = None
            else:
                lr_params = self.params.get('lr_policy_params', {})
                # adding default decay_steps = max_steps if the lr_policy
                # supports it and a different value is not provided
                func_params = signature(self.params['lr_policy']).parameters
                if 'decay_steps' in func_params and 'decay_steps' not in lr_params:
                    lr_params['decay_steps'] = self._last_step
                    if 'begin_decay_at' in func_params:
                        if 'warmup_steps' in func_params:
                            lr_params['begin_decay_at'] = max(
                                lr_params.get('begin_decay_at', 0),
                                lr_params.get('warmup_steps', 0)
                            )
                        lr_params['decay_steps'] -= lr_params.get(
                            'begin_decay_at', 0)

                if 'steps_per_epoch' in func_params and \
                   'steps_per_epoch' not in lr_params and \
                   'num_epochs' in self.params:
                    lr_params['steps_per_epoch'] = self.steps_in_epoch

                def lr_policy(gs):
                    return self.params['lr_policy'](global_step=gs, **lr_params)

            if self.params.get('iter_size', 1) > 1:
                self.skip_update_ph = tf.placeholder(tf.bool)

            var_list = tf.trainable_variables()
            freeze_variables_regex = self.params.get(
                'freeze_variables_regex', None)
            if freeze_variables_regex is not None:
                pattern = re.compile(freeze_variables_regex)
                var_list = [var for var in tf.trainable_variables()
                            if not pattern.match(var.name)]

            self.train_op = optimize_loss(
                loss=tf.cast(self.loss, tf.float32) + get_regularization_loss(),
                dtype=self.params['dtype'],
                optimizer=self.params['optimizer'],
                optimizer_params=self.params.get('optimizer_params', {}),
                var_list=var_list,
                clip_gradients=self.params.get('max_grad_norm', None),
                learning_rate_decay_fn=lr_policy,
                summaries=self.params.get('summaries', None),
                larc_params=self.params.get('larc_params', None),
                loss_scaling=self.params.get('loss_scaling', 1.0),
                loss_scaling_params=self.params.get(
                    'loss_scaling_params', None),
                on_horovod=self.on_horovod,
                iter_size=self.params.get('iter_size', 1),
                skip_update_ph=self.skip_update_ph,
                model=self
            )
            tf.summary.scalar(name="train_loss", tensor=self.loss)
            if self.steps_in_epoch:
                tf.summary.scalar(
                    name="epoch",
                    tensor=tf.floor(
                        tf.train.get_global_step() /
                        tf.constant(self.steps_in_epoch, dtype=tf.int64)),
                )
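
The freeze_variables_regex option above drops matching variables from var_list before optimize_loss ever sees them. Note that re.Pattern.match anchors at the start of the variable name. A quick standalone illustration with made-up variable names:

import re

var_names = [
    'encoder/conv1/kernel:0',
    'encoder/conv1/bias:0',
    'decoder/dense/kernel:0',
]

# Freeze everything under the encoder scope; a pattern like 'conv1' would not
# match (and therefore not freeze) these names, because match() anchors at
# the beginning of the string.
pattern = re.compile('encoder')
trainable = [name for name in var_names if not pattern.match(name)]
print(trainable)  # ['decoder/dense/kernel:0']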