Example #1
  def _get_apply_grads_op(self, loss, trainable_vars_for_gradients):
    """
    :param tf.Tensor loss:
    :param list[tf.Variable] trainable_vars_for_gradients:
    :return: op with all variable updates combined, using the optimizer
    :rtype: tf.Operation
    """
    if not trainable_vars_for_gradients:
      return tf.no_op(name="no_grad_vars_no_op")
    # AccumulateN might not be deterministic but should be faster and should require less memory.
    # We might want to make this configurable.
    aggregation_method = tf.AggregationMethod.EXPERIMENTAL_ACCUMULATE_N
    grad_noise = self.config.float("gradient_noise", 0.0)
    grad_clip = self.config.float("gradient_clip", 0.0)
    grad_clip_global_norm = self.config.float("gradient_clip_global_norm", 0.0)
    # E.g. https://github.com/openai/baselines/blob/master/baselines/deepq/simple.py: grad_norm_clipping=10 -> tf.clip_by_norm

    # Extended self.optimizer.minimize() to optionally modify gradients.
    grads_and_vars = self.optimizer.compute_gradients(
      loss, var_list=trainable_vars_for_gradients,
      aggregation_method=aggregation_method)
    if not [v for g, v in grads_and_vars if g is not None]:
      raise Exception("no single variable to train")
    if self.config.bool("debug_grad_summaries", False):
      from TFUtil import variable_summaries, get_base_name, reuse_name_scope_of_tensor
      for grad, var in grads_and_vars:
        with reuse_name_scope_of_tensor(grad, prefix="grads/"):
          variable_summaries(grad, name="grad_of_%s" % get_base_name(var))
        with reuse_name_scope_of_tensor(var, prefix="vars/"):
          variable_summaries(var, name=get_base_name(var))
    # Also see tf.contrib.layers.optimizers.optimize_loss() for reference.
    if self.config.bool("gradient_nan_inf_filter", False):
      from TFUtil import nan_to_num
      grads_and_vars = [(nan_to_num(grad, nan_num=0.0, inf_num=0.0), var) for (grad, var) in grads_and_vars]
    if grad_noise:
      assert grad_noise > 0
      from TFUtil import add_scaled_noise_to_gradients
      grads_and_vars = add_scaled_noise_to_gradients(grads_and_vars, grad_noise)
    if grad_clip:
      assert grad_clip > 0
      grads_and_vars = [(tf.clip_by_value(grad, -grad_clip, grad_clip), var) for grad, var in grads_and_vars]
    if grad_clip_global_norm:
      assert grad_clip_global_norm > 0
      grads_clipped, _ = tf.clip_by_global_norm([grad for (grad, _) in grads_and_vars], grad_clip_global_norm)
      grads_and_vars = zip(grads_clipped, [var for (_, var) in grads_and_vars])
    apply_grads = self.optimizer.apply_gradients(grads_and_vars)
    return apply_grads
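
The example above follows the standard TF1 pattern of compute_gradients, optional gradient transforms, then apply_gradients. The following is a minimal, self-contained sketch of that same pattern outside the class (TF1.x assumed, as in the examples; the variable, loss, and clipping values are illustrative and not taken from the original module):

import tensorflow as tf

x = tf.get_variable("x", shape=[3], initializer=tf.zeros_initializer())
loss = tf.reduce_sum(tf.square(x - 1.0))
optimizer = tf.train.GradientDescentOptimizer(learning_rate=0.1)

grad_clip = 1.0              # per-element clipping bound ("gradient_clip")
grad_clip_global_norm = 5.0  # joint norm bound ("gradient_clip_global_norm")

grads_and_vars = optimizer.compute_gradients(loss, var_list=[x])
# Clip each gradient element-wise, as the "gradient_clip" option does above.
grads_and_vars = [(tf.clip_by_value(g, -grad_clip, grad_clip), v)
                  for (g, v) in grads_and_vars]
# Clip all gradients jointly by their global norm ("gradient_clip_global_norm").
# tf.clip_by_global_norm returns a plain list, so re-pair with the variables.
grads_clipped, _ = tf.clip_by_global_norm(
  [g for (g, _) in grads_and_vars], grad_clip_global_norm)
grads_and_vars = list(zip(grads_clipped, [v for (_, v) in grads_and_vars]))
apply_grads = optimizer.apply_gradients(grads_and_vars)

with tf.Session() as session:
  session.run(tf.global_variables_initializer())
  for _ in range(10):
    session.run(apply_grads)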
Example #2
def accum_grad_multiple_step(grad, var, train_step, num_accum_steps):
  """
  :param tf.Tensor|tf.IndexedSlices grad:
  :param tf.Variable var:
  :param tf.Tensor train_step: int, scalar
  :param int num_accum_steps:
  :return: modified grad
  :rtype: tf.Tensor
  """
  from TFUtil import reuse_name_scope_of_tensor, get_base_name
  with reuse_name_scope_of_tensor(grad, postfix="/%s_accum_grad" % get_base_name(grad)):
    shape = var.get_shape().as_list()
    v = tf.get_variable(
      name="var_accum_grad", shape=shape, dtype=grad.dtype,
      initializer=tf.zeros_initializer(), trainable=False)
    return tf.cond(
      tf.less_equal(tf.mod(train_step, num_accum_steps), 0),
      lambda: tf.assign(v, grad),
      lambda: tf.assign_add(v, grad))
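
For context, here is a simplified, self-contained sketch of the same accumulate-then-apply idea without the TFUtil name-scope helper (TF1.x assumed; all names below are illustrative). The accumulator is overwritten on the first step of each window and added to afterwards, and the optimizer update is gated so it only fires on the last step of the window, as Example #5 below does:

import tensorflow as tf

num_accum_steps = 4
global_step = tf.get_variable(
  "global_step", shape=(), dtype=tf.int32,
  initializer=tf.zeros_initializer(), trainable=False)

x = tf.get_variable("x", shape=[2], initializer=tf.ones_initializer())
loss = tf.reduce_sum(tf.square(x))
optimizer = tf.train.GradientDescentOptimizer(0.1)
(grad, var), = optimizer.compute_gradients(loss, var_list=[x])

# Accumulator variable, analogous to "var_accum_grad" above.
accum = tf.get_variable(
  "x_accum_grad", shape=var.get_shape(), dtype=grad.dtype,
  initializer=tf.zeros_initializer(), trainable=False)
accum_grad = tf.cond(
  tf.equal(tf.mod(global_step, num_accum_steps), 0),
  lambda: tf.assign(accum, grad),      # first step of the window: overwrite
  lambda: tf.assign_add(accum, grad))  # later steps: accumulate

# Apply only on the last step of each window; otherwise just advance the step.
apply_op = tf.cond(
  tf.equal(tf.mod(global_step, num_accum_steps), num_accum_steps - 1),
  lambda: optimizer.apply_gradients([(accum_grad, var)]),
  lambda: tf.no_op())
train_op = tf.group(apply_op, tf.assign_add(global_step, 1))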
Example #3
def accum_grad_multiple_step(grad, var, train_step, num_accum_steps):
    """
  :param tf.Tensor|tf.IndexedSlices grad:
  :param tf.Variable var:
  :param tf.Tensor train_step: int, scalar
  :param int num_accum_steps:
  :return: modified grad
  :rtype: tf.Tensor
  """
    from TFUtil import reuse_name_scope_of_tensor, get_base_name
    with reuse_name_scope_of_tensor(grad,
                                    postfix="/%s_accum_grad" %
                                    get_base_name(grad)):
        shape = var.get_shape().as_list()
        v = tf.get_variable(name="var_accum_grad",
                            shape=shape,
                            dtype=grad.dtype,
                            initializer=tf.zeros_initializer(),
                            trainable=False)
        return tf.cond(tf.less_equal(tf.mod(train_step, num_accum_steps),
                                     0), lambda: tf.assign(v, grad),
                       lambda: tf.assign_add(v, grad))
Example #4
  def _post_process_grad(self, grad, var, global_info):
    """
    :param tf.Tensor grad:
    :param tf.Variable var:
    :param WrapOptimizer._GetGlobalInfo global_info:
    :return: new grad, apply grad opts
    :rtype: tf.Tensor, dict[str]
    """
    updater_opts = self._get_updater_opts_from_var(var)

    accum_grad_multiple_num_steps = updater_opts.get(
      "accum_grad_multiple_step", self.config.int("accum_grad_multiple_step", 0))
    grad_noise = updater_opts.get("gradient_noise", self.config.float("gradient_noise", 0.0))
    grad_clip = updater_opts.get("gradient_clip", self.config.float("gradient_clip", 0.0))
    # E.g. https://github.com/openai/baselines/blob/master/baselines/deepq/simple.py:
    #   grad_norm_clipping=10 -> tf.clip_by_norm
    grad_clip_norm = updater_opts.get("gradient_clip_norm", self.config.float("gradient_clip_norm", 0.0))
    grad_clip_avg_norm = updater_opts.get("gradient_clip_avg_norm", self.config.float("gradient_clip_avg_norm", 0.0))
    grad_clip_global_norm = updater_opts.get(
      "gradient_clip_global_norm", self.config.float("gradient_clip_global_norm", 0.0))
    global_norm_tag = updater_opts.get(
      "global_norm_tag", self.config.value("global_norm_tag", None))
    grad_clip_global_norm_tag = updater_opts.get(
      "gradient_clip_global_norm_tag", self.config.value("gradient_clip_global_norm_tag", global_norm_tag))
    grad_norm_to_clip_to_zero = updater_opts.get(
      "grad_norm_to_clip_to_zero", self.config.float("grad_norm_to_clip_to_zero", 0.0))
    maximize_grad_norm = updater_opts.get("maximize_grad_norm", self.config.float("maximize_grad_norm", 0))

    if maximize_grad_norm:
      grad_ext = global_info.get_maximize_grad_norm_grad(maximize_grad_norm, var)
      if grad_ext is not None:
        grad += grad_ext

    if accum_grad_multiple_num_steps >= 1:
      grad = accum_grad_multiple_step(
        grad, var, train_step=self.global_train_step, num_accum_steps=accum_grad_multiple_num_steps)

    if updater_opts.get("debug_grad_summaries", self.config.bool_or_other("debug_grad_summaries", False)):
      from TFUtil import variable_summaries, get_base_name, reuse_name_scope_of_tensor
      with reuse_name_scope_of_tensor(grad, prefix="grads/"):
        variable_summaries(grad, name="grad_of_%s" % get_base_name(var))
      with reuse_name_scope_of_tensor(var, prefix="vars/"):
        variable_summaries(var, name=get_base_name(var))

    # Also see tf.contrib.layers.optimizers.optimize_loss() for reference.
    if grad_noise:
      assert grad_noise > 0
      from TFUtil import add_scaled_noise_to_gradients
      with tf.name_scope("grad_noise"):
        (grad, var), = add_scaled_noise_to_gradients([(grad, var)], grad_noise)
    if grad_clip:
      assert grad_clip > 0
      with tf.name_scope("grad_clip"):
        grad = tf.clip_by_value(grad, -grad_clip, grad_clip)
    if grad_clip_norm:
      assert grad_clip_norm > 0
      with tf.name_scope("grad_clip_norm"):
        grad = tf.clip_by_norm(grad, grad_clip_norm)
    if grad_clip_avg_norm:
      assert grad_clip_avg_norm > 0
      with tf.name_scope("grad_clip_avg_norm"):
        grad = tf.clip_by_average_norm(grad, grad_clip_avg_norm)
    if grad_clip_global_norm:
      assert grad_clip_global_norm > 0
      with tf.name_scope("grad_clip_global_norm"):
        grad = global_info.clip_by_global_norm(
          grad, clip_norm=grad_clip_global_norm, global_norm_tag=grad_clip_global_norm_tag)
    if updater_opts.get("gradient_nan_inf_filter", self.config.bool("gradient_nan_inf_filter", False)):
      from TFUtil import nan_to_num
      grad = nan_to_num(grad, nan_num=0.0, inf_num=0.0)
    if grad_norm_to_clip_to_zero:
      with tf.name_scope("grad_norm_to_clip_to_zero"):
        grad = global_info.set_zero_on_high_global_norm(
          grad, grad_norm_threshold=grad_norm_to_clip_to_zero, global_norm_tag=global_norm_tag)

    updater_opts.assert_all_read()

    opt_key, _ = self._get_optimizer_item_for_variable(var)
    apply_grad_opts = {
      "opt_key": opt_key, "accum_grad_multiple_num_steps": accum_grad_multiple_num_steps}
    return grad, apply_grad_opts
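
The option names read here are RETURNN config entries (looked up via self.config, with optional per-variable overrides via updater_opts). A hypothetical config excerpt, with values chosen purely for illustration, that would activate several of the transformations above:

# Hypothetical RETURNN config excerpt; values are illustrative only.
accum_grad_multiple_step = 2     # accumulate gradients over 2 steps before applying
gradient_noise = 0.3             # add scaled noise to each gradient
gradient_clip_global_norm = 5.0  # clip all gradients jointly by global norm
gradient_nan_inf_filter = True   # replace NaN/Inf gradient entries by 0
debug_grad_summaries = False     # per-gradient / per-variable TF summaries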
Example #5
    def _get_apply_grads_op(self, loss, trainable_vars_for_gradients):
        """
    :param tf.Tensor loss:
    :param list[tf.Variable] trainable_vars_for_gradients:
    :return: op with all variable updates combined, using the optimizer
    :rtype: tf.Operation
    """
        if not trainable_vars_for_gradients:
            return tf.no_op(name="no_grad_vars_no_op")
        # AccumulateN might not be deterministic but should be faster and should require less memory.
        # We might want to make this configurable.
        if self.config.is_true("deterministic_train"):
            aggregation_method = tf.AggregationMethod.ADD_N
        else:
            aggregation_method = tf.AggregationMethod.EXPERIMENTAL_ACCUMULATE_N
        accum_grad_multiple_num_steps = self.config.int(
            "accum_grad_multiple_step", 0)
        grad_noise = self.config.float("gradient_noise", 0.0)
        grad_clip = self.config.float("gradient_clip", 0.0)
        grad_clip_norm = self.config.float("gradient_clip_norm", 0.0)
        grad_clip_avg_norm = self.config.float("gradient_clip_avg_norm", 0.0)
        grad_clip_global_norm = self.config.float("gradient_clip_global_norm",
                                                  0.0)
        # E.g. https://github.com/openai/baselines/blob/master/baselines/deepq/simple.py: grad_norm_clipping=10 -> tf.clip_by_norm

        # Extended self.optimizer.minimize() to optionally modify gradients.
        grads_and_vars = self.optimizer.compute_gradients(
            loss,
            var_list=trainable_vars_for_gradients,
            aggregation_method=aggregation_method)
        if self.config.is_true("use_horovod") and self.config.value(
                "horovod_reduce_type", "") == "grad":
            import horovod.tensorflow as hvd
            grads_and_vars = [(hvd.allreduce(
                grad, average=self.config.is_true("horovod_avg_grad"))
                               if grad is not None else None, var)
                              for (grad, var) in grads_and_vars]
        var_grads = {
            var: grad
            for (grad, var) in grads_and_vars if grad is not None
        }
        if not var_grads:
            raise Exception("no single variable to train")
        if self.config.float("maximize_grad_norm", 0):
            f = self.config.float("maximize_grad_norm", 0)
            grad_norm = tf.add_n(
                [tf.nn.l2_loss(g) for g in var_grads.values()],
                name="grad_norm_half") * 2.0
            loss_ext = grad_norm * (-f)
            grads_and_vars_ext = self.optimizer.compute_gradients(
                loss_ext,
                var_list=list(var_grads.keys()),
                aggregation_method=aggregation_method)
            var_grads_ext = {
                var: grad
                for (grad, var) in grads_and_vars_ext if grad is not None
            }
            grads_and_vars = [(grad + var_grads_ext.get(var, 0.0), var)
                              for (grad, var) in grads_and_vars]
        if accum_grad_multiple_num_steps >= 1:
            grads_and_vars = [(accum_grad_multiple_step(
                grad,
                var,
                train_step=self.network.global_train_step,
                num_accum_steps=accum_grad_multiple_num_steps), var)
                              for (grad, var) in grads_and_vars]
        if self.config.bool("debug_grad_summaries", False):
            from TFUtil import variable_summaries, get_base_name, reuse_name_scope_of_tensor
            for grad, var in grads_and_vars:
                with reuse_name_scope_of_tensor(grad, prefix="grads/"):
                    variable_summaries(grad,
                                       name="grad_of_%s" % get_base_name(var))
                with reuse_name_scope_of_tensor(var, prefix="vars/"):
                    variable_summaries(var, name=get_base_name(var))
        # Also see tf.contrib.layers.optimizers.optimize_loss() for reference.
        if self.config.bool("gradient_nan_inf_filter", False):
            from TFUtil import nan_to_num
            grads_and_vars = [(nan_to_num(grad, nan_num=0.0, inf_num=0.0), var)
                              for (grad, var) in grads_and_vars]
        if grad_noise:
            assert grad_noise > 0
            from TFUtil import add_scaled_noise_to_gradients
            with tf.name_scope("grad_noise"):
                grads_and_vars = add_scaled_noise_to_gradients(
                    grads_and_vars, grad_noise)
        if grad_clip:
            assert grad_clip > 0
            with tf.name_scope("grad_clip"):
                grads_and_vars = [(tf.clip_by_value(grad, -grad_clip,
                                                    grad_clip), var)
                                  for grad, var in grads_and_vars]
        if grad_clip_norm:
            assert grad_clip_norm > 0
            with tf.name_scope("grad_clip_norm"):
                grads_and_vars = [(tf.clip_by_norm(grad, grad_clip_norm), var)
                                  for grad, var in grads_and_vars]
        if grad_clip_avg_norm:
            assert grad_clip_avg_norm > 0
            with tf.name_scope("grad_clip_avg_norm"):
                grads_and_vars = [
                    (tf.clip_by_average_norm(grad, grad_clip_avg_norm), var)
                    for grad, var in grads_and_vars
                ]
        if grad_clip_global_norm:
            assert grad_clip_global_norm > 0
            with tf.name_scope("grad_clip_global_norm"):
                grads_clipped, _ = tf.clip_by_global_norm(
                    [grad for (grad, _) in grads_and_vars],
                    grad_clip_global_norm)
                grads_and_vars = zip(grads_clipped,
                                     [var for (_, var) in grads_and_vars])
        if accum_grad_multiple_num_steps >= 1:
            apply_grads = tf.cond(
                tf.equal(
                    tf.mod(self.network.global_train_step,
                           accum_grad_multiple_num_steps),
                    accum_grad_multiple_num_steps - 1),
                true_fn=lambda: self.optimizer.apply_gradients(grads_and_vars),
                false_fn=lambda: tf.no_op(),
                name="apply_grads/accum_grad_multiple_step")
        else:
            apply_grads = self.optimizer.apply_gradients(grads_and_vars)
        return apply_grads
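
Compared with Example #1, this variant adds Horovod gradient all-reduce, gradient accumulation, and the maximize_grad_norm option, which builds an auxiliary loss loss_ext = -f * (sum of squared gradient norms) and adds its gradient to the original gradients (tf.nn.l2_loss(g) is sum(g**2)/2, hence the factor 2.0). A minimal standalone sketch of that double-backprop step (TF1.x assumed; names and values are illustrative):

import tensorflow as tf

x = tf.get_variable("x", shape=[3], initializer=tf.ones_initializer())
loss = tf.reduce_sum(tf.square(x))
optimizer = tf.train.GradientDescentOptimizer(0.1)
f = 0.01  # weight of the gradient-norm maximization term ("maximize_grad_norm")

grads_and_vars = optimizer.compute_gradients(loss, var_list=[x])
var_grads = {v: g for (g, v) in grads_and_vars if g is not None}

# Squared gradient norm; tf.nn.l2_loss(g) == sum(g**2) / 2, so multiply by 2.
grad_norm = tf.add_n([tf.nn.l2_loss(g) for g in var_grads.values()]) * 2.0
loss_ext = grad_norm * (-f)  # minimizing loss_ext maximizes the gradient norm

# Second backprop pass, now through the gradient graph itself.
grads_and_vars_ext = optimizer.compute_gradients(
  loss_ext, var_list=list(var_grads.keys()))
var_grads_ext = {v: g for (g, v) in grads_and_vars_ext if g is not None}
grads_and_vars = [(g + var_grads_ext.get(v, 0.0), v) for (g, v) in grads_and_vars]
apply_grads = optimizer.apply_gradients(grads_and_vars)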