def testClipByNormClipped(self):
    # Norm clipping when clip_norm < 5
    with self.session(use_gpu=True):
      x = constant_op.constant([-3.0, 0.0, 0.0, 4.0, 0.0, 0.0], shape=[2, 3])
      # Norm of x = sqrt(3^2 + 4^2) = 5
      np_ans = [[-2.4, 0.0, 0.0], [3.2, 0.0, 0.0]]
      clip_norm = 4.0
      ans = clip_ops.clip_by_norm(x, clip_norm)
      tf_ans = self.evaluate(ans)

      # Check that a tensor-valued clip_norm gives the same result.
      ans = clip_ops.clip_by_norm(x, constant_op.constant(clip_norm))
      tf_ans_tensor = self.evaluate(ans)

    self.assertAllClose(np_ans, tf_ans)
    self.assertAllClose(np_ans, tf_ans_tensor)
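For reference, the expected values above follow directly from the clipping rule output = t * clip_norm / max(||t||_2, clip_norm). A quick NumPy check (an editorial sketch, not part of the original test):

import numpy as np

x = np.array([[-3.0, 0.0, 0.0], [4.0, 0.0, 0.0]])
clip_norm = 4.0
scale = clip_norm / max(np.linalg.norm(x), clip_norm)  # 4 / 5 = 0.8
print(x * scale)  # [[-2.4  0.   0. ]
                  #  [ 3.2  0.   0. ]]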
Example #2
 def testClipByNormBadShape(self):
   with self.test_session(use_gpu=True):
     x = constant_op.constant([-3.0, 0.0, 0.0, 4.0, 0.0, 0.0], shape=[2, 3, 1])
     # Use a nonsensical shape.
     clip = constant_op.constant([1.0, 2.0])
     with self.assertRaises(ValueError):
       _ = clip_ops.clip_by_norm(x, clip)
 def _testClipByNorm(self, inputs, max_norm, expected):
   with self.test_session() as sess:
     input_op = constant_op.constant(inputs)
     clipped = clip_ops.clip_by_norm(input_op, max_norm)
     check_op = numerics.add_check_numerics_ops()
     result, _ = sess.run([clipped, check_op])
   self.assertAllClose(result, expected)
Example #4
  def get_gradients(self, loss, params):
    """Returns gradients of `loss` with respect to `params`.

    Arguments:
      loss: Loss tensor.
      params: List of variables.

    Returns:
      List of gradient tensors.

    Raises:
      ValueError: In case any gradient cannot be computed (e.g. if gradient
        function not implemented).
    """
    params = nest.flatten(params)
    with backend.get_graph().as_default():
      grads = gradients.gradients(loss, params)
    for grad, param in zip(grads, params):
      if grad is None:
        raise ValueError("Variable {} has `None` for gradient. "
                         "Please make sure that all of your ops have a "
                         "gradient defined (i.e. are differentiable). "
                         "Common ops without gradient: "
                         "K.argmax, K.round, K.eval.".format(param))
    if hasattr(self, "clipnorm"):
      grads = [clip_ops.clip_by_norm(g, self.clipnorm) for g in grads]
    if hasattr(self, "clipvalue"):
      grads = [
          clip_ops.clip_by_value(g, -self.clipvalue, self.clipvalue)
          for g in grads
      ]
    return grads
Example #5
  def get_gradients(self, loss, params):
    """Returns gradients of `loss` with respect to `params`.

    Arguments:
      loss: Loss tensor.
      params: List of variables.

    Returns:
      List of gradient tensors.

    Raises:
      ValueError: In case any gradient cannot be computed (e.g. if gradient
        function not implemented).
    """
    loss = self._scale_loss(loss)
    grads = gradients.gradients(loss, params)
    if None in grads:
      raise ValueError("An operation has `None` for gradient. "
                       "Please make sure that all of your ops have a "
                       "gradient defined (i.e. are differentiable). "
                       "Common ops without gradient: "
                       "K.argmax, K.round, K.eval.")
    if hasattr(self, "clipnorm"):
      grads = [clip_ops.clip_by_norm(g, self.clipnorm) for g in grads]
    if hasattr(self, "clipvalue"):
      grads = [
          clip_ops.clip_by_value(g, -self.clipvalue, self.clipvalue)
          for g in grads
      ]
    return grads
 def _clip_dense(self, var):
   with self._maybe_colocate_with(var):
     updated_var_value = array_ops.identity(var.ref())
     normalized_var = clip_ops.clip_by_norm(
         updated_var_value, self._max_norm, self._vars_to_clip_dims[var])
     delta = updated_var_value - normalized_var
   with ops.colocate_with(var):
     return var.assign_sub(delta, use_locking=self._use_locking)
Example #7
 def maybe_normalize(x):
   if max_norm is not None:
     if x.get_shape().ndims is not None:
       ndims = x.get_shape().ndims
     else:
       ndims = array_ops.size(array_ops.shape(x))
     return clip_ops.clip_by_norm(x, max_norm, axes=list(range(1, ndims)))
   return x
  def _testClipIndexedSlicesByNorm(self, values, indices, shape, max_norm,
                                   axes):
    with self.cached_session() as sess:
      values = constant_op.constant(values)
      indices = constant_op.constant(indices)
      shape = constant_op.constant(shape)
      # IndexedSlices mode
      indexed_slices = ops.IndexedSlices(values, indices, shape)
      clipped = clip_ops.clip_by_norm(indexed_slices, max_norm, axes)
      # clipped should be IndexedSlices
      self.assertIsInstance(clipped, ops.IndexedSlices)
      clipped = ops.convert_to_tensor(clipped)

      # Tensor mode
      dense_tensor = ops.convert_to_tensor(indexed_slices)
      dense_clipped = clip_ops.clip_by_norm(dense_tensor, max_norm, axes)
      result, expected = sess.run([clipped, dense_clipped])
    self.assertAllClose(result, expected)
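The same behaviour can be sketched through the public TF 2.x API (illustrative values, not from the original test): clip_by_norm only rescales the stored values of an IndexedSlices, so the result stays sparse.

import tensorflow as tf

slices = tf.IndexedSlices(
    values=tf.constant([[3.0, 4.0], [6.0, 8.0]]),
    indices=tf.constant([0, 2]),
    dense_shape=tf.constant([4, 2]))
clipped = tf.clip_by_norm(slices, 5.0, axes=[1])  # clip each row to norm <= 5
print(isinstance(clipped, tf.IndexedSlices))      # True: sparsity is preserved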
Example #9
  def testClipByNormNotClipped(self):
    # No norm clipping when clip_norm >= 5
    with self.test_session():
      x = constant_op.constant([-3.0, 0.0, 0.0, 4.0, 0.0, 0.0], shape=[2, 3])
      # Norm of x = sqrt(3^2 + 4^2) = 5
      np_ans = [[-3.0, 0.0, 0.0], [4.0, 0.0, 0.0]]
      clip_norm = 6.0
      ans = clip_ops.clip_by_norm(x, clip_norm)
      tf_ans = ans.eval()

    self.assertAllClose(np_ans, tf_ans)
  def testClipByNormClippedWithDim1(self):
    # Norm clipping when clip_norm < 5
    with self.session(use_gpu=True):
      x = constant_op.constant([-3.0, 0.0, 0.0, 4.0, 0.0, 3.0], shape=[2, 3])
      # Norm of x[0, :] = 3, x[1, :] = sqrt(3^2 + 4^2) = 5
      np_ans = [[-3.0, 0.0, 0.0], [3.2, 0.0, 2.4]]
      clip_norm = 4.0
      ans = clip_ops.clip_by_norm(x, clip_norm, [1])
      tf_ans = self.evaluate(ans)

    self.assertAllClose(np_ans, tf_ans)
Example #11
  def testClipByNormZero(self):
    # No norm clipping when norm = 0
    with self.test_session(use_gpu=True):
      x = constant_op.constant([0.0, 0.0, 0.0, 0.0, 0.0, 0.0], shape=[2, 3])
      # Norm = 0, no changes
      np_ans = [[0.0, 0.0, 0.0], [0.0, 0.0, 0.0]]
      clip_norm = 6.0
      ans = clip_ops.clip_by_norm(x, clip_norm)
      tf_ans = ans.eval()

    self.assertAllClose(np_ans, tf_ans)
Example #12
def clip_gradient_norms(gradients_to_variables, max_norm):
  """Clips the gradients by the given value.

  Args:
    gradients_to_variables: A list of gradient to variable pairs (tuples).
    max_norm: the maximum norm value.

  Returns:
    A list of clipped gradient to variable pairs.
  """
  clipped_grads_and_vars = []
  for grad, var in gradients_to_variables:
    if grad is not None:
      if isinstance(grad, ops.IndexedSlices):
        tmp = clip_ops.clip_by_norm(grad.values, max_norm)
        grad = ops.IndexedSlices(tmp, grad.indices, grad.dense_shape)
      else:
        grad = clip_ops.clip_by_norm(grad, max_norm)
    clipped_grads_and_vars.append((grad, var))
  return clipped_grads_and_vars
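A hedged usage sketch (not from the original source; TF1-style graph mode, a toy quadratic loss, and the imports used by the example above are assumed): the helper slots in between compute_gradients and apply_gradients.

import tensorflow.compat.v1 as tf1
tf1.disable_eager_execution()

w = tf1.get_variable("w", initializer=[3.0, 4.0])
loss = tf1.reduce_sum(tf1.square(w))
opt = tf1.train.GradientDescentOptimizer(0.1)
grads_and_vars = opt.compute_gradients(loss)            # gradient of loss is 2*w = [6, 8]
train_op = opt.apply_gradients(
    clip_gradient_norms(grads_and_vars, max_norm=1.0))  # clipped to norm 1 before the update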
Example #13
  def testClipByNormNotClippedWithAxes(self):
    # No norm clipping when clip_norm >= 5
    with self.test_session(use_gpu=True):
      x = constant_op.constant([-3.0, 0.0, 0.0, 4.0, 0.0, 3.0], shape=[2, 3])
      # Norm of x[0, :] = 3, x[1, :] = sqrt(3^2 + 4^2) = 5
      np_ans = [[-3.0, 0.0, 0.0], [4.0, 0.0, 3.0]]
      clip_norm = 6.0
      ans = clip_ops.clip_by_norm(x, clip_norm, [1])
      tf_ans = ans.eval()

    self.assertAllClose(np_ans, tf_ans)
Example #14
  def testClipByNormClippedWithDim0(self):
    # Norm clipping when clip_norm < 5
    with self.test_session(use_gpu=True):
      x = constant_op.constant([-3.0, 0.0, 0.0, 4.0, 0.0, 3.0], shape=[2, 3])
      # Norm of x[:, 0] = sqrt(3^2 + 4^2) = 5, x[:, 2] = 3
      np_ans = [[-2.4, 0.0, 0.0], [3.2, 0.0, 3.0]]
      clip_norm = 4.0
      ans = clip_ops.clip_by_norm(x, clip_norm, [0])
      tf_ans = ans.eval()

    self.assertAllClose(np_ans, tf_ans)
Example #15
  def _compute_gradients(self, loss, var_list, grad_loss=None):
    """Compute gradients of `loss` for the variables in `var_list`.

    This is the first part of `minimize()`.  It returns a list
    of (gradient, variable) pairs where "gradient" is the gradient
    for "variable".  Note that "gradient" can be a `Tensor`, an
    `IndexedSlices`, or `None` if there is no gradient for the
    given variable.

    Args:
      loss: A callable taking no arguments which returns the value to minimize.
      var_list: list or tuple of `Variable` objects to update to minimize
        `loss`, or a callable returning the list or tuple of `Variable` objects.
        Use callable when the variable list would otherwise be incomplete before
        `minimize` and the variables are created at the first time when `loss`
        is called.
      grad_loss: Optional. A `Tensor` holding the gradient computed for `loss`.

    Returns:
      A list of (gradient, variable) pairs. Variable is always present, but
      gradient can be `None`.

    Raises:
      TypeError: If `var_list` contains anything else than `Variable` objects.
      ValueError: If some arguments are invalid, or var_list is None.
    """
    # TODO(josh11b): Test that we handle weight decay in a reasonable way.
    with backprop.GradientTape() as tape:
      if not callable(var_list):
        tape.watch(var_list)
      loss_value = loss()
    if callable(var_list):
      var_list = var_list()
    var_list = nest.flatten(var_list)
    grads = tape.gradient(loss_value, var_list, grad_loss)

    if hasattr(self, "clipnorm"):
      grads = [clip_ops.clip_by_norm(g, self.clipnorm) for g in grads]
    if hasattr(self, "clipvalue"):
      grads = [
          clip_ops.clip_by_value(g, -self.clipvalue, self.clipvalue)
          for g in grads
      ]

    grads_and_vars = list(zip(grads, var_list))
    self._assert_valid_dtypes([
        v for g, v in grads_and_vars
        if g is not None and v.dtype != dtypes.resource
    ])

    return grads_and_vars
Example #16
 def testClipByAverageNormReplacedWithClipByNorm(self):
   # Check clip_by_average_norm(t) is the same as
   # clip_by_norm(t, clip_norm * tf.to_float(tf.size(t)))
   with self.session(use_gpu=True):
     x = constant_op.constant([-3.0, 0.0, 0.0, 4.0, 0.0, 0.0], shape=[2, 3])
     # Average norm of x = sqrt(3^2 + 4^2) / 6 = 0.83333333
     # expected answer [[-2.88, 0.0, 0.0], [3.84, 0.0, 0.0]]
     clip_norm = constant_op.constant(0.8)
     with_norm = clip_ops.clip_by_average_norm(x, clip_norm)
     without_norm = clip_ops.clip_by_norm(
         x, clip_norm * math_ops.to_float(array_ops.size(x)))
     clip_by_average_norm_ans = self.evaluate(with_norm)
     clip_by_norm_ans = self.evaluate(without_norm)
     self.assertAllClose(clip_by_average_norm_ans, clip_by_norm_ans)
Example #17
 def maybe_normalize(x):
   """Normalizes the embeddings in x if max_norm is not None."""
   if max_norm is None:
     return x
   static = True
   ids_rank = ops.convert_to_tensor(ids).get_shape().ndims
   if ids_rank is None:
     ids_rank = array_ops.rank(ids)
     static = False
   x_rank = x.get_shape().ndims
   if x_rank is None:
     x_rank = array_ops.rank(x)
     static = False
   return clip_ops.clip_by_norm(
       x, max_norm,
       axes=list(range(ids_rank, x_rank)) if static
       else math_ops.range(ids_rank, x_rank))
  def _clip_sparse(self, grad, var):
    assert isinstance(grad, ops.IndexedSlices)
    clip_dims = self._vars_to_clip_dims[var]
    if 0 in clip_dims:
      logging.warning("Clipping norm across dims %s for %s is inefficient "
                      "when including sparse dimension 0.", clip_dims,
                      var.op.name)
      return self._clip_dense(var)

    with ops.colocate_with(var):
      var_subset = array_ops.gather(var.ref(), grad.indices)
    with self._maybe_colocate_with(var):
      normalized_var_subset = clip_ops.clip_by_norm(
          var_subset, self._max_norm, clip_dims)
      delta = ops.IndexedSlices(
          var_subset - normalized_var_subset, grad.indices, grad.dense_shape)
    with ops.colocate_with(var):
      return var.scatter_sub(delta, use_locking=self._use_locking)
Example #19
 def _clip_gradients(self, grads):
   """Clip gradients according to the clipnorm and clipvalue attributes."""
   if self.clipnorm is not None:
     if distribute_ctx.has_strategy():
       raise ValueError("Gradient clipping in the optimizer "
                        "(by setting clipnorm or clipvalue) is currently "
                        "unsupported when using a distribution strategy.")
     grads = [None if g is None else clip_ops.clip_by_norm(g, self.clipnorm)
              for g in grads]
   if self.clipvalue is not None:
     if distribute_ctx.has_strategy():
       raise ValueError("Gradient clipping in the optimizer "
                        "(by setting clipnorm or clipvalue) is currently "
                        "unsupported when using a distribution strategy.")
     v = self.clipvalue
     grads = [
         None if g is None else clip_ops.clip_by_value(g, -v, v) for g in grads
     ]
   return grads
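A small illustration of the None-preserving pattern above (an editorial sketch using the public tf.clip_by_norm, which is the same op as clip_ops.clip_by_norm):

import tensorflow as tf

grads = [tf.constant([3.0, 4.0]), None, tf.constant([0.3, 0.4])]
clipnorm = 1.0
clipped = [None if g is None else tf.clip_by_norm(g, clipnorm) for g in grads]
# clipped[0] is rescaled to norm 1.0, clipped[1] stays None,
# clipped[2] (norm 0.5 < 1.0) passes through unchanged.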
Example #20
    def _clip(params, ids, max_norm):
        def _rank(x):
            rank = ops.convert_to_tensor(x).get_shape().ndims
            if rank:
                return rank, True
            else:
                return array_ops.rank(x), False

        if max_norm is None:
            return params
        ids_rank, ids_static = _rank(ids)
        params_rank, params_static = _rank(params)
        return clip_ops.clip_by_norm(
            params,
            max_norm,
            axes=(list(range(ids_rank, params_rank))
                  if ids_static and params_static else math_ops.range(
                      ids_rank, params_rank)),
        )
Example #21
def _clip(params, ids, max_norm):
  """Helper function for _embedding_lookup_and_transform.

  This function optionally clips embeddings to an l2-norm of max_norm.

  Args:
    params: A `Tensor` of embeddings retrieved by `gather`.
    ids: The `ids` argument that was passed to `gather`.
    max_norm: If provided, the embeddings are l2-normalized to the value of
      max_norm.

  Returns:
    A `Tensor` with the same type as `params`.
  """

  def _rank(x):
    """Helper function to retrieve the rank of a tensor.

    Args:
      x: Something convertible to `Tensor`.

    Returns:
      Either a pair `(rank, True)` where `rank` is an integer or a pair
      `(rank, False)` where `rank` is an integer `Tensor`. In either case,
      `rank` is the rank of `x`.
    """
    rank = ops.convert_to_tensor(x).get_shape().ndims
    if rank:
      return rank, True
    else:
      return array_ops.rank(x), False

  if max_norm is None:
    return params
  ids_rank, ids_static = _rank(ids)
  params_rank, params_static = _rank(params)
  return clip_ops.clip_by_norm(
      params,
      max_norm,
      axes=(list(range(ids_rank, params_rank))
            if ids_static and params_static
            else math_ops.range(ids_rank, params_rank)))
def _clip(params, ids, max_norm):
  """Helper function for _embedding_lookup_and_transform.

  This function optionally clips embeddings to an l2-norm of max_norm.

  Args:
    params: A `Tensor` of embeddings retrieved by `gather`.
    ids: The `ids` argument that was passed to `gather`.
    max_norm: If not `None`, each embedding is clipped if its l2-norm is
      larger than this value.

  Returns:
    A `Tensor` with the same type as `params`.
  """

  def _rank(x):
    """Helper function to retrieve the rank of a tensor.

    Args:
      x: Something convertible to `Tensor`.

    Returns:
      Either a pair `(rank, True)` where `rank` is an integer or a pair
      `(rank, False)` where `rank` is an integer `Tensor`. In either case,
      `rank` is the rank of `x`.
    """
    rank = ops.convert_to_tensor(x).get_shape().ndims
    if rank:
      return rank, True
    else:
      return array_ops.rank(x), False

  if max_norm is None:
    return params
  ids_rank, ids_static = _rank(ids)
  params_rank, params_static = _rank(params)
  return clip_ops.clip_by_norm(
      params,
      max_norm,
      axes=(list(range(ids_rank, params_rank))
            if ids_static and params_static
            else math_ops.range(ids_rank, params_rank)))
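To make the axes computation concrete (an editorial sketch with made-up values): for ids of rank 1 and gathered params of rank 2, axes becomes [1], so each looked-up embedding is clipped to max_norm independently.

import tensorflow as tf

params = tf.constant([[3.0, 4.0], [6.0, 8.0], [0.0, 1.0]])
ids = tf.constant([0, 1])
gathered = tf.gather(params, ids)                  # ids rank 1, result rank 2
clipped = tf.clip_by_norm(gathered, 5.0, axes=[1])
# row [3, 4] (norm 5) is unchanged; row [6, 8] (norm 10) becomes [3, 4].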
Example #23
def get_gradients_for_keras(optimizer, loss, params):
    import tensorflow as tf
    from tensorflow.python.util import nest
    from tensorflow.python.keras import backend
    from tensorflow.python.ops import gradients
    from tensorflow.python.ops import clip_ops
    from tensorflow.python.keras.optimizers import TFOptimizer

    params = nest.flatten(params)
    if isinstance(optimizer, TFOptimizer):
        scope_name = optimizer.optimizer._name
    else:
        scope_name = optimizer._name

    with backend.get_graph().as_default(), backend.name_scope(scope_name + "/gradients"):
        grads = gradients.gradients(loss, params)

        all_reduced_grads = []
        for grad, param in zip(grads, params):
            if grad is None:
                raise ValueError("Variable {} has `None` for gradient. "
                                 "Please make sure that all of your ops have a "
                                 "gradient defined (i.e. are differentiable). "
                                 "Common ops without gradient: "
                                 "K.argmax, K.round, K.eval.".format(param))
            grad = process_grad(grad)

            with tf.control_dependencies([param]):
                grad_i = tf.identity(grad, name="zoo_identity_op_for_grad")

            all_reduced_grads.append(grad_i)

        grads = all_reduced_grads

        if hasattr(optimizer, "clipnorm"):
            grads = [clip_ops.clip_by_norm(g, optimizer.clipnorm) for g in grads]
        if hasattr(optimizer, "clipvalue"):
            grads = [
                clip_ops.clip_by_value(g, -optimizer.clipvalue, optimizer.clipvalue)
                for g in grads
            ]
    return grads
    def train_one_sample(self, space_sample, reward):
        self.reset()
        with tf.GradientTape() as tape:
            self.reset()
            self.calc_log_prob(space_sample)
            if self.entropy_weight is not None:
                self.reward += self.entropy_weight * self.entropy
            self.baseline = self.baseline * self.baseline_decay + reward * (
                1 - self.baseline_decay)
            loss = self.log_prob * (reward - self.baseline)
            print(f'Reward: {reward}, Loss: {loss}')
            # loss += skip_weight * self.sample_skip_penalty
        grads = tape.gradient(loss, self.trainable_variables)

        if hasattr(self, "clipnorm"):
            grads = [clip_ops.clip_by_norm(g, self.clipnorm) for g in grads]
        if hasattr(self, "clipvalue"):
            grads = [
                clip_ops.clip_by_value(g, -self.clipvalue, self.clipvalue)
                for g in grads
            ]
        self.optimizer.apply_gradients(zip(grads, self.trainable_variables))
        return loss
Example #25
def _gather_and_clip(params, ids, max_norm, name=None):
  """Helper function for _embedding_lookup_and_transform.

  This function gathers embeddings from a single tensor. The gather deals with
  resource variables specially. The embeddings are clipped to an l2-norm of
  max_norm if provided.

  Args:
    params: A `Tensor` of embeddings.
    ids: A `Tensor` indexing the embeddings to be retrieved from `params`.
    max_norm: If provided, embedding values are l2-normalized to the value of
      max_norm.
    name: A name for the operation (optional).

  Returns:
    A `Tensor` with the same type as `params`.
  """
  if isinstance(params, resource_variable_ops.ResourceVariable):
    embs = params.sparse_read(ids, name=name)
  else:
    embs = array_ops.gather(params, ids, name=name)
  if max_norm is None:
    return embs
  static = True
  ids_rank = ops.convert_to_tensor(ids).get_shape().ndims
  if ids_rank is None:
    ids_rank = array_ops.rank(ids)
    static = False
  embs_rank = embs.get_shape().ndims
  if embs_rank is None:
    embs_rank = array_ops.rank(embs)
    static = False
  return clip_ops.clip_by_norm(
      embs,
      max_norm,
      axes=list(range(ids_rank, embs_rank))
      if static else math_ops.range(ids_rank, embs_rank))
Example #27
    def get_gradients(self, loss, params):
        """Returns gradients of `loss` with respect to `params`.

    Arguments:
      loss: Loss tensor.
      params: List of variables.

    Returns:
      List of gradient tensors.

    Raises:
      ValueError: In case any gradient cannot be computed (e.g. if gradient
        function not implemented).
    """
        params = nest.flatten(params)
        with backend.get_graph().as_default(), backend.name_scope(
                self._name + "/gradients"):
            grads = gradients.gradients(loss, params)
            for grad, param in zip(grads, params):
                if grad is None:
                    raise ValueError(
                        "Variable {} has `None` for gradient. "
                        "Please make sure that all of your ops have a "
                        "gradient defined (i.e. are differentiable). "
                        "Common ops without gradient: "
                        "K.argmax, K.round, K.eval.".format(param))
            if hasattr(self, "clipnorm"):
                grads = [
                    clip_ops.clip_by_norm(g, self.clipnorm) for g in grads
                ]
            if hasattr(self, "clipvalue"):
                grads = [
                    clip_ops.clip_by_value(g, -self.clipvalue, self.clipvalue)
                    for g in grads
                ]
        return grads
Example #28
    def _resource_apply_dense(self, grad, var):
        step, beta1_power, beta2_power = self._get_beta_accumulators()
        beta1_power = math_ops.cast(beta1_power, var.dtype.base_dtype)
        beta2_power = math_ops.cast(beta2_power, var.dtype.base_dtype)
        lr_t = math_ops.cast(self._lr_t, var.dtype.base_dtype)

        if self._initial_total_steps > 0:
            total_steps = math_ops.cast(self._total_steps_t,
                                        var.dtype.base_dtype)
            warmup_proportion = math_ops.cast(self._warmup_proportion_t,
                                              var.dtype.base_dtype)
            min_lr = math_ops.cast(self._min_lr_t, var.dtype.base_dtype)
            warmup_steps = total_steps * warmup_proportion
            decay_steps = math_ops.maximum(total_steps - warmup_steps, 1)
            decay_rate = (min_lr - lr_t) / decay_steps
            lr_t = tf.compat.v1.where(
                step <= warmup_steps,
                lr_t * (step / warmup_steps),
                lr_t + decay_rate *
                math_ops.minimum(step - warmup_steps, decay_steps),
            )

        beta1_t = math_ops.cast(self._beta1_t, var.dtype.base_dtype)
        beta2_t = math_ops.cast(self._beta2_t, var.dtype.base_dtype)
        epsilon_t = math_ops.cast(self._epsilon_t, var.dtype.base_dtype)

        v = self.get_slot(var, "v")

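        # Editor's note: the clipping threshold below is adaptive; it is the
        # square root of the bias-corrected sum of the second-moment
        # accumulator `v`, scaled by clip_multiplier_t and offset by
        # clip_epsilon_t.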
        if self.clip_gradients:
            clipVal = math_ops.sqrt(
                tf.reduce_sum(input_tensor=v) /
                (1.0 -
                 beta2_power)) * self.clip_multiplier_t + self.clip_epsilon_t
            grad = clip_ops.clip_by_norm(grad, clipVal)

        sma_inf = 2.0 / (1.0 - beta2_t) - 1.0
        sma_t = sma_inf - 2.0 * step * beta2_power / (1.0 - beta2_power)

        m = self.get_slot(var, "m")

        v_t = state_ops.assign(v,
                               beta2_t * v +
                               (1.0 - beta2_t) * math_ops.square(grad),
                               use_locking=self._use_locking)
        v_corr_t = math_ops.sqrt(v_t / (1.0 - beta2_power)) + epsilon_t
        grad_corr = grad / v_corr_t

        m_t = state_ops.assign(m,
                               beta1_t * m + (1.0 - beta1_t) * grad_corr,
                               use_locking=self._use_locking)
        m_corr_t = m_t / (1.0 - beta1_power)

        r_t = math_ops.sqrt((sma_t - 4.0) / (sma_inf - 4.0) * (sma_t - 2.0) /
                            (sma_inf - 2.0) * sma_inf / sma_t)

        var_t = tf.compat.v1.where(sma_t >= 5.0, r_t * m_corr_t, m_corr_t)

        if var in self.reg_vars:
            if self._initial_weight_decay > 0.0:
                var_t += math_ops.cast(self._weight_decay_t,
                                       var.dtype.base_dtype) * var
            if self._L1_decay > 0.0:
                var_t += math_ops.cast(
                    self._L1_decay, var.dtype.base_dtype) * math_ops.sign(var)

        with tf.control_dependencies([var_t]):
            var_update = state_ops.assign_sub(var,
                                              lr_t * var_t,
                                              use_locking=self._use_locking)

        updates = [var_update, m_t, v_t]
        return control_flow_ops.group(*updates)
Example #29
 def testClipByNormGradientZeros(self):
   with self.session(use_gpu=True):
     x = array_ops.zeros([3])
     b = clip_ops.clip_by_norm(x, 1.)
     grad, = gradients_impl.gradients(b, x)
     self.assertAllEqual(grad, [1., 1., 1.])
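The same check in eager mode (a sketch assuming TF 2.x, not the original graph-mode test): a zero input has norm 0 < clip_norm, so the op acts as the identity and the gradient of the summed output is a vector of ones, as asserted above.

import tensorflow as tf

x = tf.zeros([3])
with tf.GradientTape() as tape:
  tape.watch(x)
  y = tf.clip_by_norm(x, 1.0)
print(tape.gradient(y, x))  # tf.Tensor([1. 1. 1.], shape=(3,), dtype=float32)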
Example #30
  def compute_gradients(self, loss, var_list=None,
                        gate_gradients=GATE_OP,
                        aggregation_method=None,
                        colocate_gradients_with_ops=False,
                        grad_loss=None, stop_gradients=None,
                        scale_loss_by_num_towers=None):
    """Compute gradients of `loss` for the variables in `var_list`.

    This is the first part of `minimize()`.  It returns a list
    of (gradient, variable) pairs where "gradient" is the gradient
    for "variable".  Note that "gradient" can be a `Tensor`, an
    `IndexedSlices`, or `None` if there is no gradient for the
    given variable.

    Args:
      loss: A callable taking no arguments which returns the value to minimize.
      var_list: list or tuple of `Variable` objects to update to minimize
        `loss`, or a callable returning the list or tuple of `Variable` objects.
        Use callable when the variable list would otherwise be incomplete before
        `minimize` and the variables are created at the first time when `loss`
        is called.
      grad_loss: Optional. A `Tensor` holding the gradient computed for `loss`.
      stop_gradients: Optional. A Tensor or list of tensors not to differentiate
        through.
      scale_loss_by_num_towers: Optional boolean. If true, scale the loss
        down by the number of towers. By default, auto-detects whether this
        is needed.

    Returns:
      A list of (gradient, variable) pairs. Variable is always present, but
      gradient can be `None`.

    Raises:
      TypeError: If `var_list` contains anything else than `Variable` objects.
      ValueError: If some arguments are invalid.
      RuntimeError: If called with eager execution enabled and `loss` is
        not callable.

    @compatibility(eager)
    When eager execution is enabled, `gate_gradients`, `aggregation_method`,
    and `colocate_gradients_with_ops` are ignored.
    @end_compatibility
    """
    # TODO(josh11b): Test that we handle weight decay in a reasonable way.
    with backprop.GradientTape() as tape:
      if not callable(var_list):
        tape.watch(var_list)
      loss_value = loss()
    if callable(var_list):
      var_list = var_list()
    var_list = nest.flatten(var_list)
    grads = tape.gradient(loss_value, var_list, grad_loss)

    if hasattr(self, "clipnorm"):
      grads = [clip_ops.clip_by_norm(g, self.clipnorm) for g in grads]
    if hasattr(self, "clipvalue"):
      grads = [
          clip_ops.clip_by_value(g, -self.clipvalue, self.clipvalue)
          for g in grads
      ]

    grads_and_vars = list(zip(grads, var_list))
    self._assert_valid_dtypes(
        [v for g, v in grads_and_vars
         if g is not None and v.dtype != dtypes.resource])
    return grads_and_vars
Example #31
 def _testClipTensorByNorm(self, inputs, max_norm, expected):
     input_op = constant_op.constant(inputs)
     clipped = clip_ops.clip_by_norm(input_op, max_norm)
     check_op = numerics.add_check_numerics_ops()
     result, _ = self.evaluate([clipped, check_op])
     self.assertAllClose(result, expected)
Example #32
    def _apply_sparse_shared(self, grad, var, indices, scatter_add):
        step, beta1_power, beta2_power = self._get_beta_accumulators()
        beta1_power = math_ops.cast(beta1_power, var.dtype.base_dtype)
        beta2_power = math_ops.cast(beta2_power, var.dtype.base_dtype)
        lr_t = math_ops.cast(self._lr_t, var.dtype.base_dtype)

        if self._initial_total_steps > 0:
            total_steps = math_ops.cast(self._total_steps_t,
                                        var.dtype.base_dtype)
            warmup_proportion = math_ops.cast(self._warmup_proportion_t,
                                              var.dtype.base_dtype)
            min_lr = math_ops.cast(self._min_lr_t, var.dtype.base_dtype)
            warmup_steps = total_steps * warmup_proportion
            decay_steps = math_ops.maximum(total_steps - warmup_steps, 1)
            decay_rate = (min_lr - lr_t) / decay_steps
            lr_t = tf.compat.v1.where(
                step <= warmup_steps,
                lr_t * (step / warmup_steps),
                lr_t + decay_rate *
                math_ops.minimum(step - warmup_steps, decay_steps),
            )

        beta1_t = math_ops.cast(self._beta1_t, var.dtype.base_dtype)
        beta2_t = math_ops.cast(self._beta2_t, var.dtype.base_dtype)
        epsilon_t = math_ops.cast(self._epsilon_t, var.dtype.base_dtype)
        v = self.get_slot(var, "v")

        if self.clip_gradients:
            clipVal = math_ops.sqrt(
                tf.reduce_sum(input_tensor=v) /
                (1.0 -
                 beta2_power)) * self.clip_multiplier_t + self.clip_epsilon_t
            grad = clip_ops.clip_by_norm(grad, clipVal)

        sma_inf = 2.0 / (1.0 - beta2_t) - 1.0
        sma_t = sma_inf - 2.0 * step * beta2_power / (1.0 - beta2_power)

        m = self.get_slot(var, "m")
        m_scaled_g_values = grad * (1 - beta1_t)
        m_t = state_ops.assign(m, m * beta1_t, use_locking=self._use_locking)
        with ops.control_dependencies([m_t]):
            m_t = scatter_add(m, indices, m_scaled_g_values)
        m_corr_t = m_t / (1.0 - beta1_power)

        v_scaled_g_values = (grad * grad) * (1 - beta2_t)
        v_t = state_ops.assign(v, v * beta2_t, use_locking=self._use_locking)
        with ops.control_dependencies([v_t]):
            v_t = scatter_add(v, indices, v_scaled_g_values)
        if self._amsgrad:
            vhat = self.get_slot(var, 'vhat')
            vhat_t = state_ops.assign(vhat,
                                      math_ops.maximum(vhat, v_t),
                                      use_locking=self._use_locking)
            v_corr_t = math_ops.sqrt(vhat_t / (1.0 - beta2_power)) + epsilon_t
        else:
            v_corr_t = math_ops.sqrt(v_t / (1.0 - beta2_power)) + epsilon_t

        r_t = math_ops.sqrt((sma_t - 4.0) / (sma_inf - 4.0) * (sma_t - 2.0) /
                            (sma_inf - 2.0) * sma_inf / sma_t)

        var_t = tf.compat.v1.where(sma_t >= 5.0, r_t * m_corr_t / v_corr_t,
                                   m_corr_t)

        if var in self.reg_vars:
            if self._initial_weight_decay > 0.0:
                var_t += math_ops.cast(self._weight_decay_t,
                                       var.dtype.base_dtype) * var
            if self._L1_decay > 0.0:
                var_t += math_ops.cast(
                    self._L1_decay, var.dtype.base_dtype) * math_ops.sign(var)

        var_update = state_ops.assign_sub(var,
                                          lr_t * var_t,
                                          use_locking=self._use_locking)

        updates = [var_update, m_t, v_t]
        if self._amsgrad:
            updates.append(vhat_t)
        return control_flow_ops.group(*updates)
Example #33
def _clip_gradients_seperate_norm(grads_and_vars, clip_gradients):
  """Clips each gradient's norm to `clip_gradients` separately."""
  gradients, variables = zip(*grads_and_vars)
  clipped_gradients = [clip_ops.clip_by_norm(grad, clip_gradients) for grad in gradients]
  return list(zip(clipped_gradients, variables))
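
# Editor's note (hedged sketch, not from the original source): despite the
# "seperate_norm" name, the helper above clips each gradient's norm
# independently; clip_ops.clip_by_global_norm would instead rescale all
# gradients jointly by a single factor.
def _norm_clipping_demo():
    grads = [tf.constant([3.0, 4.0]), tf.constant([6.0, 8.0])]   # norms 5 and 10
    per_tensor = [clip_ops.clip_by_norm(g, 5.0) for g in grads]  # each norm <= 5
    joint, _ = clip_ops.clip_by_global_norm(grads, 5.0)          # scaled by 5/sqrt(125)
    return per_tensor, joint
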
def build_multi_tower_graph(images,
                            sketches,
                            images_d,
                            image_paired_class_ids,
                            image_paired_class_ids_d,
                            text_vocab_indiceses,
                            LSTM_hybrid,
                            vocab_size,
                            batch_size,
                            num_gpu,
                            batch_portion,
                            training,
                            learning_rates,
                            counter,
                            max_iter_step,
                            ld=10,
                            data_format='NCHW',
                            distance_map=True,
                            optimizer='Adam',
                            block_type='MRU'):
    """
    :param images: [batch_size, 3, H, W]
    :param sketches:  [batch_size, 3, H, W]
    :param images_d:  [batch_size, 3, H, W]
    :param image_paired_class_ids: [batch_size, ], class_number
    :param image_paired_class_ids_d: [batch_size, ]
    :param text_vocab_indiceses: [batch_size, 15]
    :return:
    """
    models.set_param(data_format=data_format)

    with tf.device('/cpu:0'):
        images_list = split_inputs(images, batch_size, batch_portion,
                                   num_gpu)  # [num_gpu, [N, C, H, W]]
        images_d_list = split_inputs(images_d, batch_size, batch_portion,
                                     num_gpu)
        sketches_list = split_inputs(sketches, batch_size, batch_portion,
                                     num_gpu)
        image_paired_class_ids_list = split_inputs(image_paired_class_ids,
                                                   batch_size, batch_portion,
                                                   num_gpu)
        image_paired_class_ids_d_list = split_inputs(image_paired_class_ids_d,
                                                     batch_size, batch_portion,
                                                     num_gpu)
        text_vocab_indiceses_list = split_inputs(text_vocab_indiceses,
                                                 batch_size, batch_portion,
                                                 num_gpu)

    lr_g = learning_rates['generator']
    lr_d = learning_rates['discriminator']
    optimizer = get_optimizer(optimizer)
    decay = tf.maximum(
        0.2, 1. - (tf.cast(counter, tf.float32) / max_iter_step * 0.9))
    tf.summary.scalar('learning_rate_g', lr_g * decay)
    optim_g = optimizer(learning_rate=lr_g * decay)
    optim_d = optimizer(learning_rate=lr_d * decay)

    tower_grads_g = []
    tower_grads_d = []
    for i in range(num_gpu):
        with tf.name_scope('%s_%d' % ('GPU', i)) as scope:
            loss_g, loss_d, grad_g, grad_d \
                = build_single_graph(images_list[i],
                                     sketches_list[i],
                                     images_d_list[i],
                                     image_paired_class_ids_list[i],
                                     image_paired_class_ids_d_list[i],
                                     text_vocab_indiceses_list[i],
                                     batch_size * batch_portion[i],
                                     training,
                                     LSTM_hybrid=LSTM_hybrid,
                                     vocab_size=vocab_size,
                                     ld=ld, data_format=data_format,
                                     distance_map=distance_map,
                                     optim_g=optim_g,
                                     optim_d=optim_d,
                                     block_type=block_type)

            tower_grads_g.append(grad_g)
            tower_grads_d.append(grad_d)

    assert len(tower_grads_g) == len(tower_grads_d)
    if len(tower_grads_d) == 1:
        ave_grad_g = grad_g
        ave_grad_d = grad_d
    else:
        ave_grad_g, ave_grad_d = average_gradients(
            (tower_grads_g, tower_grads_d))

    # Apply gradients
    # Hack to force initialization of optimizer variables.
    tf.get_variable_scope()._reuse = False

    if Config.sn:
        # Get the update ops
        spectral_norm_update_ops = tf.get_collection(
            Config.SPECTRAL_NORM_UPDATE_OPS)
    else:
        spectral_norm_update_ops = [tf.no_op()]
        assign_ops = tf.no_op()

    # Clip gradients if using WGAN/DRAGAN
    global_grad_norm_G = None
    global_grad_norm_G_clipped = None
    global_grad_norm_D = None
    global_grad_norm_D_clipped = None

    if not Config.sn:
        max_grad_norm_G = 50.
        max_grad_norm_D = 100.
        hard_clip_norm_G = 5.
        hard_clip_norm_D = 10.

        ave_grad_g_tensors, ave_grad_g_vars = list(zip(*ave_grad_g))
        global_grad_norm_G = clip_ops.global_norm(ave_grad_g_tensors)
        ave_grad_g_tensors, _ = clip_ops.clip_by_global_norm(
            ave_grad_g_tensors, max_grad_norm_G, global_grad_norm_G)
        ave_grad_g_tensors = [
            clip_ops.clip_by_norm(t, hard_clip_norm_G)
            for t in ave_grad_g_tensors
        ]
        ave_grad_g = list(zip(ave_grad_g_tensors, ave_grad_g_vars))

        ave_grad_d_tensors, ave_grad_d_vars = list(zip(*ave_grad_d))
        global_grad_norm_D = clip_ops.global_norm(ave_grad_d_tensors)
        ave_grad_d_tensors, _ = clip_ops.clip_by_global_norm(
            ave_grad_d_tensors, max_grad_norm_D, global_grad_norm_D)
        ave_grad_d_tensors = [
            clip_ops.clip_by_norm(t, hard_clip_norm_D)
            for t in ave_grad_d_tensors
        ]
        ave_grad_d = list(zip(ave_grad_d_tensors, ave_grad_d_vars))
    with tf.control_dependencies(spectral_norm_update_ops):
        opt_g = optimize(ave_grad_g,
                         optim_g,
                         None,
                         'gradient_norm',
                         global_norm=global_grad_norm_G,
                         global_norm_clipped=global_grad_norm_G_clipped,
                         appendix='_G')
    opt_d = optimize(ave_grad_d,
                     optim_d,
                     None,
                     'gradient_norm',
                     global_norm=global_grad_norm_D,
                     global_norm_clipped=global_grad_norm_D_clipped,
                     appendix='_D')

    summaries = gather_summaries()
    loss_g, loss_d = gather_losses()

    # Generator output from last tower
    return opt_g, opt_d, loss_g, loss_d, summaries