Example 1
    def test_stable_global_norm_avoids_overflow(self):
        tensors = [array_ops.ones([4]), array_ops.ones([4, 4]) * 1e19, None]
        gnorm_is_inf = math_ops.is_inf(clip_ops.global_norm(tensors))
        stable_gnorm_is_inf = math_ops.is_inf(
            tfgan_losses._numerically_stable_global_norm(tensors))

        with self.test_session(use_gpu=True):
            self.assertTrue(gnorm_is_inf.eval())
            self.assertFalse(stable_gnorm_is_inf.eval())
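
Note: `_numerically_stable_global_norm` is a private TF-GAN helper. The rescaling idea the test above exercises can be sketched against the public TF 2 eager API roughly as follows (illustrative only, not the TF-GAN implementation):

import tensorflow as tf

def stable_global_norm(tensors):
    # Rescale by the largest absolute value before squaring, so the squares
    # cannot overflow float32, then multiply the scale back in afterwards.
    tensors = [t for t in tensors if t is not None]
    max_abs = tf.reduce_max([tf.reduce_max(tf.abs(t)) for t in tensors])
    max_abs = tf.maximum(max_abs, 1e-30)  # guard against all-zero inputs
    return max_abs * tf.linalg.global_norm([t / max_abs for t in tensors])

tensors = [tf.ones([4]), tf.ones([4, 4]) * 1e19]
print(tf.math.is_inf(tf.linalg.global_norm(tensors)).numpy())  # True: the sum of squares overflows float32
print(tf.math.is_inf(stable_global_norm(tensors)).numpy())     # False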
Example 2
  def test_stable_global_norm_avoids_overflow(self):
    tensors = [array_ops.ones([4]), array_ops.ones([4, 4]) * 1e19, None]
    gnorm_is_inf = math_ops.is_inf(clip_ops.global_norm(tensors))
    stable_gnorm_is_inf = math_ops.is_inf(
        tfgan_losses._numerically_stable_global_norm(tensors))

    with self.test_session(use_gpu=True):
      self.assertTrue(gnorm_is_inf.eval())
      self.assertFalse(stable_gnorm_is_inf.eval())
Example 3
  def update_op(self, has_nan, amax):
    """Operation to update the scaling factor"""
    def overflow_case():
      new_scale_val = clip_ops.clip_by_value(
          self.scale / self.step_factor, self.scale_min, self.scale_max)
      scale_assign = self.scale.assign(new_scale_val)
      overflow_iter_assign = self.last_overflow_iteration.assign(self.iteration)
      with ops.control_dependencies([scale_assign, overflow_iter_assign]):
        return array_ops.identity(self.scale)

    def scale_case():
      since_overflow = self.iteration - self.last_overflow_iteration
      should_update = math_ops.equal(since_overflow % self.step_window, 0)
      def scale_update_fn():
        new_scale_val = clip_ops.clip_by_value(
            self.scale * self.step_factor, self.scale_min, self.scale_max)
        return self.scale.assign(new_scale_val)
      return control_flow_ops.cond(should_update,
                                   scale_update_fn,
                                   lambda: self.scale)

    iter_update = self.iteration.assign_add(1)
    overflow = math_ops.logical_or(has_nan, math_ops.is_inf(amax))

    update_op = control_flow_ops.cond(overflow,
                                      overflow_case,
                                      scale_case)
    with ops.control_dependencies([update_op]):
      return array_ops.identity(iter_update)
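
The policy above shrinks the scale (divides it by `step_factor`) whenever an overflow is detected and raises it again after `step_window` clean iterations, clipping to `[scale_min, scale_max]`. A plain-Python sketch of the same policy (the default values here are illustrative) may be easier to follow than the graph-mode `cond`s:

class LossScaler:
    """Illustrative stand-alone version of the dynamic loss-scaling policy."""

    def __init__(self, scale=2.0 ** 15, step_factor=2.0, step_window=200,
                 scale_min=1.0, scale_max=2.0 ** 24):
        self.scale = scale
        self.step_factor = step_factor
        self.step_window = step_window
        self.scale_min = scale_min
        self.scale_max = scale_max
        self.iteration = 0
        self.last_overflow_iteration = -1

    def _clip(self, value):
        return min(max(value, self.scale_min), self.scale_max)

    def update(self, has_nan_or_inf):
        if has_nan_or_inf:
            # Overflow: shrink the scale and remember when it happened.
            self.scale = self._clip(self.scale / self.step_factor)
            self.last_overflow_iteration = self.iteration
        elif (self.iteration - self.last_overflow_iteration) % self.step_window == 0:
            # A full window without overflow: try a larger scale again.
            self.scale = self._clip(self.scale * self.step_factor)
        self.iteration += 1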
Example 4
 def _compare(self, x, use_gpu):
   np_finite, np_inf, np_nan = np.isfinite(x), np.isinf(x), np.isnan(x)
   with test_util.device(use_gpu=use_gpu):
     inx = ops.convert_to_tensor(x)
     ofinite, oinf, onan = math_ops.is_finite(inx), math_ops.is_inf(
         inx), math_ops.is_nan(inx)
     tf_finite, tf_inf, tf_nan = self.evaluate([ofinite, oinf, onan])
   self.assertAllEqual(np_inf, tf_inf)
   self.assertAllEqual(np_nan, tf_nan)
   self.assertAllEqual(np_finite, tf_finite)
   self.assertShapeEqual(np_inf, oinf)
   self.assertShapeEqual(np_nan, onan)
   self.assertShapeEqual(np_finite, ofinite)
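
The same agreement with NumPy can be checked directly with the public TF 2 eager API; a minimal sketch, not part of the test suite:

import numpy as np
import tensorflow as tf

x = np.array([0.5, np.inf, -np.inf, np.nan], dtype=np.float32)
print(tf.math.is_finite(x).numpy())  # [ True False False False]
print(tf.math.is_inf(x).numpy())     # [False  True  True False]
print(tf.math.is_nan(x).numpy())     # [False False False  True]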
Example 5
def sparsemax_loss(logits, sparsemax, labels, name=None):
    """Computes sparsemax loss function [1].

  [1]: https://arxiv.org/abs/1602.02068

  Args:
    logits: A `Tensor`. Must be one of the following types: `half`, `float32`,
      `float64`.
    sparsemax: A `Tensor`. Must have the same type as `logits`.
    labels: A `Tensor`. Must have the same type as `logits`.
    name: A name for the operation (optional).

  Returns:
    A `Tensor`. Has the same type as `logits`.
  """

    with ops.name_scope(name, "sparsemax_loss",
                        [logits, sparsemax, labels]) as name:
        logits = ops.convert_to_tensor(logits, name="logits")
        sparsemax = ops.convert_to_tensor(sparsemax, name="sparsemax")
        labels = ops.convert_to_tensor(labels, name="labels")

        # In the paper, they call the logits z.
        # A constant can be subtracted from logits to make the algorithm
        # more numerically stable in theory. However, there is no major source
        # of numerical instability in this algorithm.
        z = logits

        # sum over support
        # Use a conditional where instead of a multiplication to support z = -inf.
        # If z = -inf, and there is no support (sparsemax = 0), a multiplication
        # would cause 0 * -inf = nan, which is not correct in this case.
        sum_s = array_ops.where(
            math_ops.logical_or(sparsemax > 0, math_ops.is_nan(sparsemax)),
            sparsemax * (z - 0.5 * sparsemax), array_ops.zeros_like(sparsemax))

        # - z_k + ||q||^2
        q_part = labels * (0.5 * labels - z)
        # Fix the case where labels = 0 and z = -inf, where q_part would
        # otherwise be 0 * -inf = nan. But since labels = 0, no cost for
        # z = -inf should be considered.
        # The code below also covers the case where z = inf. However, in this
        # case the sparsemax will be nan, which means sum_s will also be nan,
        # therefore this case doesn't need additional special treatment.
        q_part_safe = array_ops.where(
            math_ops.logical_and(math_ops.equal(labels,
                                                0), math_ops.is_inf(z)),
            array_ops.zeros_like(z), q_part)

        return math_ops.reduce_sum(sum_s + q_part_safe, axis=1)
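
A NumPy re-derivation of the same loss is handy for sanity-checking on small inputs. `sparsemax_np` below is an assumed helper implementing the simplex projection from the paper; the sketch ignores the `-inf`/`inf` edge cases handled above:

import numpy as np

def sparsemax_np(z):
    # Euclidean projection of z onto the probability simplex.
    z_sorted = np.sort(z)[::-1]
    k = np.arange(1, len(z) + 1)
    cumsum = np.cumsum(z_sorted)
    support = 1 + k * z_sorted > cumsum
    k_z = k[support][-1]
    tau = (cumsum[support][-1] - 1) / k_z
    return np.maximum(z - tau, 0.0)

def sparsemax_loss_np(z, q):
    p = sparsemax_np(z)
    sum_s = np.sum(np.where(p > 0, p * (z - 0.5 * p), 0.0))
    return sum_s + np.sum(q * (0.5 * q - z))

z = np.array([2.0, 1.0, -1.0])
q = np.array([1.0, 0.0, 0.0])
print(sparsemax_loss_np(z, q))  # 0.0 -- sparsemax(z) already equals the labels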
Example 6
def sparsemax_loss(logits, sparsemax, labels, name=None):
  """Computes sparsemax loss function [1].

  [1]: https://arxiv.org/abs/1602.02068

  Args:
    logits: A `Tensor`. Must be one of the following types: `half`, `float32`,
      `float64`.
    sparsemax: A `Tensor`. Must have the same type as `logits`.
    labels: A `Tensor`. Must have the same type as `logits`.
    name: A name for the operation (optional).

  Returns:
    A `Tensor`. Has the same type as `logits`.
  """

  with ops.name_scope(name, "sparsemax_loss",
                      [logits, sparsemax, labels]) as name:
    logits = ops.convert_to_tensor(logits, name="logits")
    sparsemax = ops.convert_to_tensor(sparsemax, name="sparsemax")
    labels = ops.convert_to_tensor(labels, name="labels")

    # In the paper, they call the logits z.
    # A constant can be subtracted from logits to make the algorithm
    # more numerically stable in theory. However, there is no major source
    # of numerical instability in this algorithm.
    z = logits

    # sum over support
    # Use a conditional where instead of a multiplication to support z = -inf.
    # If z = -inf, and there is no support (sparsemax = 0), a multiplication
    # would cause 0 * -inf = nan, which is not correct in this case.
    sum_s = array_ops.where(
        math_ops.logical_or(sparsemax > 0, math_ops.is_nan(sparsemax)),
        sparsemax * (z - 0.5 * sparsemax), array_ops.zeros_like(sparsemax))

    # - z_k + ||q||^2
    q_part = labels * (0.5 * labels - z)
    # Fix the case where labels = 0 and z = -inf, where q_part would
    # otherwise be 0 * -inf = nan. But since labels = 0, no cost for
    # z = -inf should be considered.
    # The code below also covers the case where z = inf. However, in this
    # case the sparsemax will be nan, which means sum_s will also be nan,
    # therefore this case doesn't need additional special treatment.
    q_part_safe = array_ops.where(
        math_ops.logical_and(math_ops.equal(labels, 0), math_ops.is_inf(z)),
        array_ops.zeros_like(z), q_part)

    return math_ops.reduce_sum(sum_s + q_part_safe, axis=1)
Example 7
 def _compare(self, x, use_gpu):
   np_finite, np_inf, np_nan = np.isfinite(x), np.isinf(x), np.isnan(x)
   with self.test_session(
       use_gpu=use_gpu,
       force_gpu=use_gpu and test_util.is_gpu_available()) as sess:
     inx = ops.convert_to_tensor(x)
     ofinite, oinf, onan = math_ops.is_finite(inx), math_ops.is_inf(
         inx), math_ops.is_nan(inx)
     tf_finite, tf_inf, tf_nan = sess.run([ofinite, oinf, onan])
   self.assertAllEqual(np_inf, tf_inf)
   self.assertAllEqual(np_nan, tf_nan)
   self.assertAllEqual(np_finite, tf_finite)
   self.assertShapeEqual(np_inf, oinf)
   self.assertShapeEqual(np_nan, onan)
   self.assertShapeEqual(np_finite, ofinite)
Example 8
    def convert_nan_or_inf_to_zero(self, grad):
        """Replace grad tensor with zero tensor if grad is NaN or Inf.

     This is mainly for improving training stability. We skip updating the
     variable by setting the grad to zero when there is NaN or Inf.

    Args:
      grad: Input gradient.

    Returns:
      a Tensor with the dtype equal to grad dtype.
    """
        return array_ops.where(
            math_ops.reduce_any(
                math_ops.logical_or(math_ops.is_nan(grad),
                                    math_ops.is_inf(grad))),
            array_ops.zeros_like(grad, dtype=grad.dtype), grad)
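
The same guard, written against the public TF 2 API with a hypothetical helper name (a sketch, not the class above):

import tensorflow as tf

def zero_if_nonfinite(grad):
    # Drop the whole gradient if any element is NaN or Inf.
    bad = tf.reduce_any(tf.math.logical_or(tf.math.is_nan(grad),
                                           tf.math.is_inf(grad)))
    return tf.cond(bad, lambda: tf.zeros_like(grad), lambda: grad)

g = tf.constant([1.0, float("inf"), 3.0])
print(zero_if_nonfinite(g).numpy())  # [0. 0. 0.]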
Example 9
 def _compare(self, x, use_gpu):
   with test_util.device(use_gpu=use_gpu):
     inx = ops.convert_to_tensor(x)
     ofinite, oinf, onan = math_ops.is_finite(inx), math_ops.is_inf(
         inx), math_ops.is_nan(inx)
     tf_finite, tf_inf, tf_nan = self.evaluate([ofinite, oinf, onan])
   if x.dtype == dtypes_lib.bfloat16.as_numpy_dtype:
     # Numpy will implicitly convert bfloat16 value to float16, so we cast to
     # float32 to avoid this.
     x = x.astype(np.float32)
   np_finite, np_inf, np_nan = np.isfinite(x), np.isinf(x), np.isnan(x)
   self.assertAllEqual(np_inf, tf_inf)
   self.assertAllEqual(np_nan, tf_nan)
   self.assertAllEqual(np_finite, tf_finite)
   self.assertShapeEqual(np_inf, oinf)
   self.assertShapeEqual(np_nan, onan)
   self.assertShapeEqual(np_finite, ofinite)
Example 10
def reduce_weighted_logsumexp(
    logx,
    w=None,
    axis=None,
    keep_dims=False,
    return_sign=False,
    name=None):
  """Computes `log(abs(sum(weight * exp(elements across tensor dimensions))))`.

  If all weights `w` are known to be positive, it is more efficient to directly
  use `reduce_logsumexp`, i.e., `tf.reduce_logsumexp(logx + tf.log(w))` is more
  efficient than `du.reduce_weighted_logsumexp(logx, w)`.

  Reduces `input_tensor` along the dimensions given in `axis`.
  Unless `keep_dims` is true, the rank of the tensor is reduced by 1 for each
  entry in `axis`. If `keep_dims` is true, the reduced dimensions
  are retained with length 1.

  If `axis` has no entries, all dimensions are reduced, and a
  tensor with a single element is returned.

  This function is more numerically stable than log(sum(w * exp(input))). It
  avoids overflows caused by taking the exp of large inputs and underflows
  caused by taking the log of small inputs.

  For example:

  ```python
  x = tf.constant([[0., 0, 0],
                   [0, 0, 0]])

  w = tf.constant([[-1., 1, 1],
                   [1, 1, 1]])

  du.reduce_weighted_logsumexp(x, w)
  # ==> log(-1*1 + 1*1 + 1*1 + 1*1 + 1*1 + 1*1) = log(4)

  du.reduce_weighted_logsumexp(x, w, axis=0)
  # ==> [log(-1+1), log(1+1), log(1+1)]

  du.reduce_weighted_logsumexp(x, w, axis=1)
  # ==> [log(-1+1+1), log(1+1+1)]

  du.reduce_weighted_logsumexp(x, w, axis=1, keep_dims=True)
  # ==> [[log(-1+1+1)], [log(1+1+1)]]

  du.reduce_weighted_logsumexp(x, w, axis=[0, 1])
  # ==> log(-1+5)
  ```

  Args:
    logx: The tensor to reduce. Should have numeric type.
    w: The weight tensor. Should have numeric type identical to `logx`.
    axis: The dimensions to reduce. If `None` (the default),
      reduces all dimensions. Must be in the range
      `[-rank(input_tensor), rank(input_tensor))`.
    keep_dims: If true, retains reduced dimensions with length 1.
    return_sign: If `True`, returns the sign of the result.
    name: A name for the operation (optional).

  Returns:
    lswe: The `log(abs(sum(weight * exp(x))))` reduced tensor.
    sign: (Optional) The sign of `sum(weight * exp(x))`.
  """
  with ops.name_scope(name, "reduce_weighted_logsumexp", [logx, w]):
    logx = ops.convert_to_tensor(logx, name="logx")
    if w is None:
      lswe = math_ops.reduce_logsumexp(logx, axis=axis, keep_dims=keep_dims)
      if return_sign:
        sgn = array_ops.ones_like(lswe)
        return lswe, sgn
      return lswe
    w = ops.convert_to_tensor(w, dtype=logx.dtype, name="w")
    log_absw_x = logx + math_ops.log(math_ops.abs(w))
    max_log_absw_x = math_ops.reduce_max(log_absw_x, axis=axis, keep_dims=True)
    # If the largest element is `-inf` or `inf` then we don't bother subtracting
    # off the max. We do this because otherwise we'd get `inf - inf = NaN`. That
    # this is ok follows from the fact that we're actually free to subtract any
    # value we like, so long as we add it back after taking the `log(sum(...))`.
    max_log_absw_x = array_ops.where(
        math_ops.is_inf(max_log_absw_x),
        array_ops.zeros_like(max_log_absw_x),
        max_log_absw_x)
    wx_over_max_absw_x = (
        math_ops.sign(w) * math_ops.exp(log_absw_x - max_log_absw_x))
    sum_wx_over_max_absw_x = math_ops.reduce_sum(
        wx_over_max_absw_x,
        axis=axis,
        keep_dims=keep_dims)
    if not keep_dims:
      max_log_absw_x = array_ops.squeeze(max_log_absw_x, axis)
    sgn = math_ops.sign(sum_wx_over_max_absw_x)
    lswe = max_log_absw_x + math_ops.log(sgn * sum_wx_over_max_absw_x)
    if return_sign:
      return lswe, sgn
    return lswe
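
The stabilization is the usual log-sum-exp max-subtraction, extended with signed weights plus the `is_inf` guard above. A NumPy sketch of the full reduction, using the same `x` and `w` as the docstring example:

import numpy as np

def reduce_weighted_logsumexp_np(logx, w):
    log_absw_x = logx + np.log(np.abs(w))
    m = np.max(log_absw_x)
    # Skip the max-subtraction when the max is +/-inf; otherwise we would
    # compute inf - inf = nan below.
    m = 0.0 if np.isinf(m) else m
    s = np.sum(np.sign(w) * np.exp(log_absw_x - m))
    return m + np.log(np.sign(s) * s), np.sign(s)

x = np.zeros((2, 3))
w = np.array([[-1., 1, 1], [1, 1, 1]])
lswe, sgn = reduce_weighted_logsumexp_np(x, w)
print(lswe, sgn)  # 1.386... (= log 4), 1.0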
Example 11
def reduce_weighted_logsumexp(logx,
                              w=None,
                              axis=None,
                              keep_dims=False,
                              return_sign=False,
                              name=None):
    """Computes `log(abs(sum(weight * exp(elements across tensor dimensions))))`.

  If all weights `w` are known to be positive, it is more efficient to directly
  use `reduce_logsumexp`, i.e., `tf.reduce_logsumexp(logx + tf.log(w))` is more
  efficient than `du.reduce_weighted_logsumexp(logx, w)`.

  Reduces `input_tensor` along the dimensions given in `axis`.
  Unless `keep_dims` is true, the rank of the tensor is reduced by 1 for each
  entry in `axis`. If `keep_dims` is true, the reduced dimensions
  are retained with length 1.

  If `axis` has no entries, all dimensions are reduced, and a
  tensor with a single element is returned.

  This function is more numerically stable than log(sum(w * exp(input))). It
  avoids overflows caused by taking the exp of large inputs and underflows
  caused by taking the log of small inputs.

  For example:

  ```python
  x = tf.constant([[0., 0, 0],
                   [0, 0, 0]])

  w = tf.constant([[-1., 1, 1],
                   [1, 1, 1]])

  du.reduce_weighted_logsumexp(x, w)
  # ==> log(-1*1 + 1*1 + 1*1 + 1*1 + 1*1 + 1*1) = log(4)

  du.reduce_weighted_logsumexp(x, w, axis=0)
  # ==> [log(-1+1), log(1+1), log(1+1)]

  du.reduce_weighted_logsumexp(x, w, axis=1)
  # ==> [log(-1+1+1), log(1+1+1)]

  du.reduce_weighted_logsumexp(x, w, axis=1, keep_dims=True)
  # ==> [[log(-1+1+1)], [log(1+1+1)]]

  du.reduce_weighted_logsumexp(x, w, axis=[0, 1])
  # ==> log(-1+5)
  ```

  Args:
    logx: The tensor to reduce. Should have numeric type.
    w: The weight tensor. Should have numeric type identical to `logx`.
    axis: The dimensions to reduce. If `None` (the default),
      reduces all dimensions. Must be in the range
      `[-rank(input_tensor), rank(input_tensor))`.
    keep_dims: If true, retains reduced dimensions with length 1.
    return_sign: If `True`, returns the sign of the result.
    name: A name for the operation (optional).

  Returns:
    lswe: The `log(abs(sum(weight * exp(x))))` reduced tensor.
    sign: (Optional) The sign of `sum(weight * exp(x))`.
  """
    with ops.name_scope(name, "reduce_weighted_logsumexp", [logx, w]):
        logx = ops.convert_to_tensor(logx, name="logx")
        if w is None:
            lswe = math_ops.reduce_logsumexp(logx,
                                             axis=axis,
                                             keep_dims=keep_dims)
            if return_sign:
                sgn = array_ops.ones_like(lswe)
                return lswe, sgn
            return lswe
        w = ops.convert_to_tensor(w, dtype=logx.dtype, name="w")
        log_absw_x = logx + math_ops.log(math_ops.abs(w))
        max_log_absw_x = math_ops.reduce_max(log_absw_x,
                                             axis=axis,
                                             keep_dims=True)
        # If the largest element is `-inf` or `inf` then we don't bother subtracting
        # off the max. We do this because otherwise we'd get `inf - inf = NaN`. That
        # this is ok follows from the fact that we're actually free to subtract any
        # value we like, so long as we add it back after taking the `log(sum(...))`.
        max_log_absw_x = array_ops.where(math_ops.is_inf(max_log_absw_x),
                                         array_ops.zeros_like(max_log_absw_x),
                                         max_log_absw_x)
        wx_over_max_absw_x = (math_ops.sign(w) *
                              math_ops.exp(log_absw_x - max_log_absw_x))
        sum_wx_over_max_absw_x = math_ops.reduce_sum(wx_over_max_absw_x,
                                                     axis=axis,
                                                     keep_dims=keep_dims)
        if not keep_dims:
            max_log_absw_x = array_ops.squeeze(max_log_absw_x, axis)
        sgn = math_ops.sign(sum_wx_over_max_absw_x)
        lswe = max_log_absw_x + math_ops.log(sgn * sum_wx_over_max_absw_x)
        if return_sign:
            return lswe, sgn
        return lswe
Example 12
  def gradients_with_scaling(ys,
                             xs,
                             grad_ys=None,
                             name="gradients",
                             colocate_gradients_with_ops=False,
                             gate_gradients=False,
                             aggregation_method=None,
                             stop_gradients=None,
                             unconnected_gradients=UnconnectedGradients.NONE):
    # gradients() with loss scaling (constant or automatic) applied to ys.
    ys = _AsList(ys)
    mp_config = _current_mp_config()
    # Fall back to the plain gradients call when loss scaling is not
    # configured or not applicable.
    if not mp_config or len(ys) == 0 or ys[0].dtype == dtypes.variant:
      grads = gradients(ys, xs, grad_ys, name,
                        colocate_gradients_with_ops,
                        gate_gradients,
                        aggregation_method,
                        stop_gradients,
                        unconnected_gradients)
      return grads

    scale = 1.0
    if mp_config.get('auto'):
      scale = mp_config['auto'].loss_scale
    elif mp_config.get('constant'):
      scale = mp_config['constant']
    if isinstance(scale, ops.Tensor) or scale != 1.0:
      with ops.name_scope(name, "gradients"):
        gradient_uid = ops.get_default_graph().unique_name("uid",
                                                           mark_as_used=False)
        scaled_ys = []
        scale_ts = ops.convert_to_tensor(scale)
        for y in ys:
          with _maybe_colocate_with(y.op,
                                    gradient_uid,
                                    colocate_gradients_with_ops):
            y = math_ops.scalar_mul(math_ops.cast(scale_ts, dtype=y.dtype), y)
          scaled_ys.append(y)
        ys = scaled_ys
    grads_scaled = gradients(ys, xs, grad_ys,
                             name,
                             colocate_gradients_with_ops,
                             gate_gradients,
                             aggregation_method,
                             stop_gradients,
                             unconnected_gradients)
    if isinstance(scale, ops.Tensor) or scale != 1.0:
      with ops.name_scope(name, "gradients"):
        unscale = 1.0 / scale
        unscale_ts = ops.convert_to_tensor(unscale)
        grads = []
        for grad in grads_scaled:
          if grad is not None:
            with _maybe_colocate_with(grad.op,
                                      gradient_uid,
                                      colocate_gradients_with_ops):
              grad = math_ops.scalar_mul(
                  math_ops.cast(unscale_ts, dtype=grad.dtype), grad)
          grads.append(grad)
    else:
      grads = grads_scaled

    # With automatic scaling, check the gradients for NaN and Inf.
    if mp_config.get('auto'):
      # check the grads
      grad_has_nans, grad_amax = AutomaticLossScaler.check_grads(grads)
      # The gradients will be ignored in the following two cases:
      #   1) there is a NaN in the gradients;
      #   2) the maximum value is infinity.
      should_skip_update = math_ops.logical_or(math_ops.is_inf(grad_amax),
                                               grad_has_nans)
      loss_scale_update_op = mp_config['auto'].update_op(grad_has_nans,
                                                         grad_amax)
      grads_update = []
      with ops.control_dependencies([loss_scale_update_op]):
        for grad in grads:
          if grad is not None:
            with _maybe_colocate_with(grad.op,
                                      gradient_uid,
                                      colocate_gradients_with_ops):
              grad_zero = _zero_grad(grad)
              grad = control_flow_ops.cond(should_skip_update,
                                           lambda: grad_zero,
                                           lambda: grad)
          grads_update.append(grad)
      return grads_update
    return grads
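
Stripped of the mixed-precision plumbing, the core pattern is: multiply the loss by the scale before differentiating, then divide the gradients by the same scale afterwards. A minimal TF 2 sketch of that constant-scaling path (the scale value is illustrative):

import tensorflow as tf

scale = 128.0
x = tf.Variable(3.0)
with tf.GradientTape() as tape:
    loss = x * x
    scaled_loss = loss * scale  # scale the loss up before differentiation
scaled_grad = tape.gradient(scaled_loss, x)
grad = scaled_grad / scale      # undo the scaling on the gradient
print(grad.numpy())             # 6.0 = d(x**2)/dx at x = 3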