Example #1
0
 def _forward(self, x):
   x = self._maybe_assert_valid_x(x)
   if self.power == 0.:
     return math_ops.exp(x)
   # If large x accuracy is an issue, consider using:
   # (1. + x * self.power)**(1. / self.power) when x >> 1.
   return math_ops.exp(math_ops.log1p(x * self.power) / self.power)
 def test_one_dimensional_arg(self):
   # Should evaluate to 1 and 1/2.
   x_one = [1, 1.]
   x_one_half = [2, 1.]
   with self.test_session(use_gpu=self._use_gpu):
     self.assertAllClose(1, math_ops.exp(special_math_ops.lbeta(x_one)).eval())
     self.assertAllClose(
         0.5, math_ops.exp(special_math_ops.lbeta(x_one_half)).eval())
     self.assertEqual([], special_math_ops.lbeta(x_one).get_shape())
 def test_length_1_last_dimension_results_in_one(self):
   # If there is only one coefficient, the formula still works, and we get one
   # as the answer, always.
   x_a = [5.5]
   x_b = [0.1]
   with self.test_session(use_gpu=True):
     self.assertAllClose(1, math_ops.exp(special_math_ops.lbeta(x_a)).eval())
     self.assertAllClose(1, math_ops.exp(special_math_ops.lbeta(x_b)).eval())
     self.assertEqual((), special_math_ops.lbeta(x_a).get_shape())
def jensen_shannon(logu, self_normalized=False, name=None):
  """The Jensen-Shannon Csiszar-function in log-space.

  A Csiszar-function is a member of,

  ```none
  F = { f:R_+ to R : f convex }.
  ```

  When `self_normalized = True`, the Jensen-Shannon Csiszar-function is:

  ```none
  f(u) = u log(u) - (1 + u) log(1 + u) + (u + 1) log(2)
  ```

  When `self_normalized = False` the `(u + 1) log(2)` term is omitted.

  Observe that as an f-Divergence, this Csiszar-function implies:

  ```none
  D_f[p, q] = KL[p, m] + KL[q, m]
  m(x) = 0.5 p(x) + 0.5 q(x)
  ```

  In a sense, this divergence is the "reverse" of the Arithmetic-Geometric
  f-Divergence.

  This Csiszar-function induces a symmetric f-Divergence, i.e.,
  `D_f[p, q] = D_f[q, p]`.

  Warning: this function makes non-log-space calculations and may therefore be
  numerically unstable for `|logu| >> 0`.

  For more information, see:
    Lin, J. "Divergence measures based on the Shannon entropy." IEEE Trans.
    Inf. Th., 37, 145-151, 1991.

  Args:
    logu: Floating-type `Tensor` representing `log(u)` from above.
    self_normalized: Python `bool` indicating whether `f'(u=1)=0`. When
      `f'(u=1)=0` the implied Csiszar f-Divergence remains non-negative even
      when `p, q` are unnormalized measures.
    name: Python `str` name prefixed to Ops created by this function.

  Returns:
    jensen_shannon_of_u: Floating-type `Tensor` of the Csiszar-function
      evaluated at `u = exp(logu)`.
  """

  with ops.name_scope(name, "jensen_shannon", [logu]):
    logu = ops.convert_to_tensor(logu, name="logu")
    npdt = logu.dtype.as_numpy_dtype
    y = nn_ops.softplus(logu)
    if self_normalized:
      y -= np.log(2).astype(npdt)
    return math_ops.exp(logu) * logu - (1. + math_ops.exp(logu)) * y
Example #5
0
def _SoftplusGradGrad(op, grad):
  # Let:
  #   y = tf.nn.softplus(x)
  #   dx = gen_nn_ops.softplus_grad(dy, x) = dy / (1 + exp(-x))
  # This op computes (ddy, d2x) from op.inputs == [dy, x] and grad == ddx.
  dy, x = op.inputs
  with ops.control_dependencies([grad]):
    ddy = gen_nn_ops.softplus_grad(grad, x)
    d2x = grad * dy / (math_ops.exp(-x) + 2.0 + math_ops.exp(x))
    return (ddy, d2x)
Example #6
0
def _compute_energy_change(current_target_log_prob,
                           current_momentums,
                           proposed_target_log_prob,
                           proposed_momentums,
                           independent_chain_ndims,
                           name=None):
  """Helper to `kernel` which computes the energy change."""
  with ops.name_scope(
      name, "compute_energy_change",
      ([current_target_log_prob, proposed_target_log_prob,
        independent_chain_ndims] +
       current_momentums + proposed_momentums)):
    # Abbreviate lk0=log_kinetic_energy and lk1=proposed_log_kinetic_energy
    # since they're a mouthful and lets us inline more.
    lk0, lk1 = [], []
    for current_momentum, proposed_momentum in zip(current_momentums,
                                                   proposed_momentums):
      axis = math_ops.range(independent_chain_ndims,
                            array_ops.rank(current_momentum))
      lk0.append(_log_sum_sq(current_momentum, axis))
      lk1.append(_log_sum_sq(proposed_momentum, axis))

    lk0 = -np.log(2.) + math_ops.reduce_logsumexp(array_ops.stack(lk0, axis=-1),
                                                  axis=-1)
    lk1 = -np.log(2.) + math_ops.reduce_logsumexp(array_ops.stack(lk1, axis=-1),
                                                  axis=-1)
    lp0 = -current_target_log_prob   # log_potential
    lp1 = -proposed_target_log_prob  # proposed_log_potential
    x = array_ops.stack([lp1, math_ops.exp(lk1), -lp0, -math_ops.exp(lk0)],
                        axis=-1)

    # The sum is NaN if any element is NaN or we see both +Inf and -Inf.
    # Thus we will replace such rows with infinite energy change which implies
    # rejection. Recall that float-comparisons with NaN are always False.
    is_sum_determinate = (
        math_ops.reduce_all(math_ops.is_finite(x) | (x >= 0.), axis=-1) &
        math_ops.reduce_all(math_ops.is_finite(x) | (x <= 0.), axis=-1))
    is_sum_determinate = array_ops.tile(
        is_sum_determinate[..., array_ops.newaxis],
        multiples=array_ops.concat([
            array_ops.ones(array_ops.rank(is_sum_determinate),
                           dtype=dtypes.int32),
            [4],
        ], axis=0))
    x = array_ops.where(is_sum_determinate,
                        x,
                        array_ops.fill(array_ops.shape(x),
                                       value=x.dtype.as_numpy_dtype(np.inf)))

    return math_ops.reduce_sum(x, axis=-1)
Example #7
0
def cosh(x, name="cosh"):
  """Hyperbolic cosine:  `cosh(x) = (e**x + e**-x) / 2`.

  For `x in (-inf, inf)`, `arccosh(cosh(x)) = cosh(arccosh(x)) = x.`

  Args:
    x:  Numeric `Tensor`.
    name:  A string name to prepend to created Ops.

  Returns:
    Numeric `Tensor` of same `shape` and `dtype` as `x`.
  """
  with ops.name_scope(name):
    x = ops.convert_to_tensor(x, name="x")
    return 0.5 * (math_ops.exp(x) + math_ops.exp(-x))
Example #8
0
def ctc_loss_and_grad(logits, labels, label_length, logit_length, unique=None):
  """Computes the CTC loss and gradients.

  Most users will want fwd_bwd.ctc_loss

  This function returns the computed gradient, it does not have a gradient
  of its own defined.

  Args:
    logits: tensor of shape [frames, batch_size, num_labels]
    labels: tensor of shape [batch_size, max_label_seq_length]
    label_length: tensor of shape [batch_size]
      Length of reference label sequence in labels.
    logit_length: tensor of shape [batch_size]
      Length of input sequence in logits.
    unique: (optional) unique label indices as computed by unique(labels)
      If supplied, enables an implementation that is faster and more memory
      efficient on TPU.

  Returns:
    loss: tensor of shape [batch_size]
    gradient: tensor of shape [frames, batch_size, num_labels]
  """

  num_labels = _get_dim(logits, 2)
  max_label_seq_length = _get_dim(labels, 1)

  ilabel_log_probs = nn_ops.log_softmax(logits)
  state_log_probs = _ilabel_to_state(labels, num_labels, ilabel_log_probs)
  state_trans_probs = _ctc_state_trans(labels)
  initial_state_log_probs, final_state_log_probs = ctc_state_log_probs(
      label_length, max_label_seq_length)
  fwd_bwd_log_probs, log_likelihood = _forward_backward_log(
      state_trans_log_probs=math_ops.log(state_trans_probs),
      initial_state_log_probs=initial_state_log_probs,
      final_state_log_probs=final_state_log_probs,
      observed_log_probs=state_log_probs,
      sequence_length=logit_length)

  if unique:
    olabel_log_probs = _state_to_olabel_unique(
        labels, num_labels, fwd_bwd_log_probs, unique)
  else:
    olabel_log_probs = _state_to_olabel(labels, num_labels, fwd_bwd_log_probs)

  grad = math_ops.exp(ilabel_log_probs) - math_ops.exp(olabel_log_probs)
  loss = -log_likelihood
  return loss, grad
Example #9
0
  def _log_gamma_log_prob_grad(self, x, event_dims=()):
    """Computes log-pdf and gradient of a log-gamma random variable.

    Args:
      x: Value of the random variable.
      event_dims: Dimensions not to treat as independent. Default is (),
        i.e., all dimensions are independent.

    Returns:
      log_prob: The log-pdf up to a normalizing constant.
      grad: The gradient of the log-pdf with respect to x.
    """
    return (math_ops.reduce_sum(self._shape_param * x -
                                self._rate_param * math_ops.exp(x),
                                event_dims),
            self._shape_param - self._rate_param * math_ops.exp(x))
Example #10
0
def _BetaincGrad(op, grad):
  """Returns gradient of betainc(a, b, x) with respect to x."""
  # TODO(ebrevdo): Perhaps add the derivative w.r.t. a, b
  a, b, x = op.inputs

  # two cases: x is a scalar and a/b are same-shaped tensors, or vice
  # versa; so its sufficient to check against shape(a).
  sa = array_ops.shape(a)
  sx = array_ops.shape(x)
  # pylint: disable=protected-access
  _, rx = gen_array_ops._broadcast_gradient_args(sa, sx)
  # pylint: enable=protected-access

  # Perform operations in log space before summing, because terms
  # can grow large.
  log_beta = (
      gen_math_ops.lgamma(a) + gen_math_ops.lgamma(b) -
      gen_math_ops.lgamma(a + b))
  partial_x = math_ops.exp((b - 1) * math_ops.log(1 - x) +
                           (a - 1) * math_ops.log(x) - log_beta)

  # TODO(b/36815900): Mark None return values as NotImplemented
  return (
      None,  # da
      None,  # db
      array_ops.reshape(math_ops.reduce_sum(partial_x * grad, rx), sx))
 def test_two_dimensional_arg(self):
   # Should evaluate to 1/2.
   x_one_half = [[2, 1.], [2, 1.]]
   with self.test_session(use_gpu=self._use_gpu):
     self.assertAllClose(
         [0.5, 0.5], math_ops.exp(special_math_ops.lbeta(x_one_half)).eval())
     self.assertEqual((2,), special_math_ops.lbeta(x_one_half).get_shape())
Example #12
0
def _Expm1Grad(op, grad):
  """Returns grad * exp(x)."""
  x = op.inputs[0]
  with ops.control_dependencies([grad]):
    x = math_ops.conj(x)
    y = math_ops.exp(x)
    return grad * y
Example #13
0
def _ErfGrad(op, grad):
  """Returns grad * 2/sqrt(pi) * exp(-x**2)."""
  x = op.inputs[0]
  two_over_root_pi = constant_op.constant(2 / np.sqrt(np.pi), dtype=grad.dtype)
  with ops.control_dependencies([grad]):
    x = math_ops.conj(x)
    return grad * two_over_root_pi * math_ops.exp(-math_ops.square(x))
def triangular(logu, name=None):
  """The Amari-alpha Csiszar-function in log-space.

  A Csiszar-function is a member of,

  ```none
  F = { f:R_+ to R : f convex }.
  ```

  The Triangular Csiszar-function is:

  ```none
  f(u) = (u - 1)**2 / (1 + u)
  ```

  This Csiszar-function induces a symmetric f-Divergence, i.e.,
  `D_f[p, q] = D_f[q, p]`.

  Warning: this function makes non-log-space calculations and may therefore be
  numerically unstable for `|logu| >> 0`.

  Args:
    logu: Floating-type `Tensor` representing `log(u)` from above.
    name: Python `str` name prefixed to Ops created by this function.

  Returns:
    triangular_of_u: Floating-type `Tensor` of the Csiszar-function evaluated
      at `u = exp(logu)`.
  """

  with ops.name_scope(name, "triangular", [logu]):
    logu = ops.convert_to_tensor(logu, name="logu")
    return pearson(logu) / (1. + math_ops.exp(logu))
Example #15
0
 def monte_carlo_hypersphere_volume(dist, num_samples, radius, center):
   # https://en.wikipedia.org/wiki/Importance_sampling
   x = dist.sample(num_samples, seed=seed)
   x = array_ops.identity(x)  # Invalidate bijector cacheing.
   return math_ops.reduce_mean(
       math_ops.exp(-dist.log_prob(x)) * is_in_ball(x, radius, center),
       axis=0)
Example #16
0
 def _determinant(self):
   logging.warn(
       "Using (possibly slow) default implementation of determinant."
       "  Requires conversion to a dense matrix and O(N^3) operations.")
   if self._can_use_cholesky():
     return math_ops.exp(self.log_abs_determinant())
   return linalg_ops.matrix_determinant(self._matrix)
 def _prob(self, y):
   x, ildj = self.bijector.inverse_and_inverse_log_det_jacobian(y)
   x = self._maybe_rotate_dims(x, rotate_right=True)
   prob = self.distribution.prob(x)
   if self._is_maybe_event_override:
     prob = math_ops.reduce_prod(prob, self._reduce_event_indices)
   return math_ops.exp(ildj) * prob
 def _finish_prob_for_one_fiber(self, y, x, ildj, distribution_kwargs):
   """Finish computation of prob on one element of the inverse image."""
   x = self._maybe_rotate_dims(x, rotate_right=True)
   prob = self.distribution.prob(x, **distribution_kwargs)
   if self._is_maybe_event_override:
     prob = math_ops.reduce_prod(prob, self._reduce_event_indices)
   return math_ops.exp(math_ops.cast(ildj, prob.dtype)) * prob
def _logspace_mean(log_values):
  """Evaluate `Log[E[values]]` in a stable manner.

  Args:
    log_values:  `Tensor` holding `Log[values]`.

  Returns:
    `Tensor` of same `dtype` as `log_values`, reduced across dim 0.
      `Log[Mean[values]]`.
  """
  # center = Max[Log[values]],  with stop-gradient
  # The center hopefully keep the exponentiated term small.  It is cancelled
  # from the final result, so putting stop gradient on it will not change the
  # final result.  We put stop gradient on to eliminate unnecessary computation.
  center = array_ops.stop_gradient(_sample_max(log_values))

  # centered_values = exp{Log[values] - E[Log[values]]}
  centered_values = math_ops.exp(log_values - center)

  # log_mean_of_values = Log[ E[centered_values] ] + center
  #                    = Log[ E[exp{log_values - E[log_values]}] ] + center
  #                    = Log[E[values]] - E[log_values] + center
  #                    = Log[E[values]]
  log_mean_of_values = math_ops.log(_sample_mean(centered_values)) + center

  return log_mean_of_values
Example #20
0
def _ErfcGrad(op, grad):
  """Returns -grad * 2/sqrt(pi) * exp(-x**2)."""
  x = op.inputs[0]
  minus_two_over_root_pi = constant_op.constant(-2 / np.sqrt(np.pi),
                                                dtype=grad.dtype)
  with ops.control_dependencies([grad.op]):
    return  grad * minus_two_over_root_pi * math_ops.exp(-math_ops.square(x))
Example #21
0
  def cdf(self, value, name="cdf", **condition_kwargs):
    """Cumulative distribution function.

    Given random variable `X`, the cumulative distribution function `cdf` is:

    ```
    cdf(x) := P[X <= x]
    ```

    Args:
      value: `float` or `double` `Tensor`.
      name: The name to give this op.
      **condition_kwargs: Named arguments forwarded to subclass implementation.

    Returns:
      cdf: a `Tensor` of shape `sample_shape(x) + self.batch_shape` with
        values of type `self.dtype`.
    """
    with self._name_scope(name, values=[value]):
      value = ops.convert_to_tensor(value, name="value")
      try:
        return self._cdf(value, **condition_kwargs)
      except NotImplementedError as original_exception:
        try:
          return math_ops.exp(self._log_cdf(value, **condition_kwargs))
        except NotImplementedError:
          raise original_exception
Example #22
0
def _adaptive_max_norm(norm, std_factor, decay, global_step, epsilon, name):
  """Find max_norm given norm and previous average."""
  with vs.variable_scope(name, "AdaptiveMaxNorm", [norm]):
    log_norm = math_ops.log(norm + epsilon)

    def moving_average(name, value, decay):
      moving_average_variable = vs.get_variable(
          name,
          shape=value.get_shape(),
          dtype=value.dtype,
          initializer=init_ops.zeros_initializer(),
          trainable=False)
      return moving_averages.assign_moving_average(
          moving_average_variable, value, decay, zero_debias=False)

    # quicker adaptation at the beginning
    if global_step is not None:
      n = math_ops.to_float(global_step)
      decay = math_ops.minimum(decay, n / (n + 1.))

    # update averages
    mean = moving_average("mean", log_norm, decay)
    sq_mean = moving_average("sq_mean", math_ops.square(log_norm), decay)

    variance = sq_mean - math_ops.square(mean)
    std = math_ops.sqrt(math_ops.maximum(epsilon, variance))
    max_norms = math_ops.exp(mean + std_factor * std)
    return max_norms, mean
Example #23
0
  def gradient_clipping(grads_and_vars):
    """Internal function for adaptive clipping."""
    grads, variables = zip(*grads_and_vars)

    norm = clip_ops.global_norm(grads)

    max_norm, log_mean = _adaptive_max_norm(norm, std_factor, decay,
                                            global_step, epsilon, name)

    # reports the max gradient norm for debugging
    if report_summary:
      summary.scalar("global_norm/adaptive_max_gradient_norm", max_norm)

    # factor will be 1. if norm is smaller than max_norm
    factor = array_ops.where(norm < max_norm,
                             array_ops.ones_like(norm),
                             math_ops.exp(log_mean) / norm)

    if static_max_norm is not None:
      factor = math_ops.minimum(static_max_norm / norm, factor)

    # apply factor
    clipped_grads = []
    for grad in grads:
      if grad is None:
        clipped_grads.append(None)
      elif isinstance(grad, ops.IndexedSlices):
        clipped_grads.append(
            ops.IndexedSlices(grad.values * factor, grad.indices,
                              grad.dense_shape))
      else:
        clipped_grads.append(grad * factor)

    return list(zip(clipped_grads, variables))
Example #24
0
  def full_exp_with_logits(name, labels=None, logits=None):
    """Computes exponential loss given `logits`.

    Args:
      name: A name for the operation (optional).
      labels: A `Tensor` of the same type and shape as `logits`.
      logits: A `Tensor` of type `float32` or `float64`.

    Returns:
      A `Tensor` of the same shape as `logits` with the componentwise
      exponential losses.

    Raises:
      ValueError: If `logits` and `labels` do not have the same shape.
    """
    with ops.name_scope(name, "exp_loss", [logits, labels]) as name:
      logits = ops.convert_to_tensor(logits, name="logits")
      labels = ops.convert_to_tensor(labels, name="labels")
      try:
        labels.get_shape().merge_with(logits.get_shape())
      except ValueError:
        raise ValueError("logits and labels must have the same shape (%s vs %s)"
                         % (logits.get_shape(), labels.get_shape()))

    # Default threshold of 0 to switch between classes
    zeros = array_ops.zeros_like(logits, dtype=logits.dtype)
    ones = array_ops.ones_like(logits, dtype=logits.dtype)
    neg_ones = -array_ops.ones_like(logits, dtype=logits.dtype)

    # Convert labels to 1 and -1
    cond_labels = (labels > zeros)
    labels_converted = array_ops.where(cond_labels, ones, neg_ones)

    return math_ops.exp(-1.0 * logits * labels_converted)
  def test_normal_distribution_second_moment_estimated_correctly(self):
    # Test the importance sampled estimate against an analytical result.
    n = int(1e6)
    with self.test_session():
      mu_p = constant_op.constant([0.0, 0.0], dtype=dtypes.float64)
      mu_q = constant_op.constant([-1.0, 1.0], dtype=dtypes.float64)
      sigma_p = constant_op.constant([1.0, 2 / 3.], dtype=dtypes.float64)
      sigma_q = constant_op.constant([1.0, 1.0], dtype=dtypes.float64)
      p = distributions.Normal(loc=mu_p, scale=sigma_p)
      q = distributions.Normal(loc=mu_q, scale=sigma_q)

      # Compute E_p[X^2].
      # Should equal [1, (2/3)^2]
      log_e_x2 = monte_carlo.expectation_importance_sampler_logspace(
          log_f=lambda x: math_ops.log(math_ops.square(x)),
          log_p=p.log_prob,
          sampling_dist_q=q,
          n=n,
          seed=42)
      e_x2 = math_ops.exp(log_e_x2)

      # Relative tolerance (rtol) chosen 2 times as large as minimim needed to
      # pass.
      self.assertEqual(p.batch_shape, e_x2.get_shape())
      self.assertAllClose([1., (2 / 3.)**2], e_x2.eval(), rtol=0.02)
Example #26
0
 def _prob(self, x):
   y = (x - self.mu) / self.sigma
   half_df = 0.5 * self.df
   return (math_ops.exp(math_ops.lgamma(0.5 + half_df) -
                        math_ops.lgamma(half_df)) /
           (math_ops.sqrt(self.df) * math.sqrt(math.pi) * self.sigma) *
           math_ops.pow(1. + math_ops.square(y) / self.df, -(0.5 + half_df)))
 def test_two_dimensional_arg_dynamic(self):
   # Should evaluate to 1/2.
   x_one_half = [[2, 1.], [2, 1.]]
   with self.test_session(use_gpu=True):
     ph = array_ops.placeholder(dtypes.float32)
     beta_ph = math_ops.exp(special_math_ops.lbeta(ph))
     self.assertAllClose([0.5, 0.5], beta_ph.eval(feed_dict={ph: x_one_half}))
Example #28
0
def exact_laplacian_kernel(x, y, stddev):
  """Computes exact Laplacian kernel value(s) for tensors x and y using stddev.

  The Laplacian kernel for vectors u, v is defined as follows:
       K(u, v) = exp(-||u-v|| / stddev)
  where the norm is the l1-norm. x, y can be either vectors or matrices. If they
  are vectors, they must have the same dimension. If they are matrices, they
  must have the same number of columns. In the latter case, the method returns
  (as a matrix) K(u, v) values for all pairs (u, v) where u is a row from x and
  v is a row from y.

  Args:
    x: a tensor of rank 1 or 2. It's shape should be either [dim] or [m, dim].
    y: a tensor of rank 1 or 2. It's shape should be either [dim] or [n, dim].
    stddev: The width of the Gaussian kernel.

  Returns:
    A single value (scalar) with shape (1, 1)  if x, y are vectors or a matrix
    of shape (m, n) with entries K(u, v) (where K is the Laplacian kernel) for
    all (u,v) pairs where u, v are rows from x and y respectively.

  Raises:
    InvalidShapeError: if the shapes of x, y are not compatible.
  """
  x_aligned, y_aligned = _align_matrices(x, y)
  diff_l1_norm = math_ops.reduce_sum(
      math_ops.abs(math_ops.subtract(x_aligned, y_aligned)), 2)
  return math_ops.exp(-diff_l1_norm / stddev)
Example #29
0
  def predictions(self, examples):
    """Add operations to compute predictions by the model.

    If logistic_loss is being used, predicted probabilities are returned.
    If poisson_loss is being used, predictions are exponentiated.
    Otherwise, (raw) linear predictions (w*x) are returned.

    Args:
      examples: Examples to compute predictions on.

    Returns:
      An Operation that computes the predictions for examples.

    Raises:
      ValueError: if examples are not well defined.
    """
    self._assertSpecified(
        ['example_weights', 'sparse_features', 'dense_features'], examples)
    self._assertList(['sparse_features', 'dense_features'], examples)

    result = self._linear_predictions(examples)
    if self._options['loss_type'] == 'logistic_loss':
      # Convert logits to probability for logistic loss predictions.
      with name_scope('sdca/logistic_prediction'):
        result = math_ops.sigmoid(result)
    elif self._options['loss_type'] == 'poisson_loss':
      # Exponeniate the prediction for poisson loss predictions.
      with name_scope('sdca/poisson_prediction'):
        result = math_ops.exp(result)
    return result
Example #30
0
 def _call_cdf(self, value, name, **kwargs):
   with self._name_scope(name, values=[value]):
     value = ops.convert_to_tensor(value, name="value")
     try:
       return self._cdf(value, **kwargs)
     except NotImplementedError:
       return math_ops.exp(self._log_cdf(value, **kwargs))
Example #31
0
def weighted_cross_entropy_with_logits(logits, targets, pos_weight, name=None):
    """Computes a weighted cross entropy.

  This is like `sigmoid_cross_entropy_with_logits()` except that `pos_weight`,
  allows one to trade off recall and precision by up- or down-weighting the
  cost of a positive error relative to a negative error.

  The usual cross-entropy cost is defined as:

    targets * -log(sigmoid(logits)) + (1 - targets) * -log(1 - sigmoid(logits))

  The argument `pos_weight` is used as a multiplier for the positive targets:

    targets * -log(sigmoid(logits)) * pos_weight +
        (1 - targets) * -log(1 - sigmoid(logits))

  For brevity, let `x = logits`, `z = targets`, `q = pos_weight`.
  The loss is:

        qz * -log(sigmoid(x)) + (1 - z) * -log(1 - sigmoid(x))
      = qz * -log(1 / (1 + exp(-x))) + (1 - z) * -log(exp(-x) / (1 + exp(-x)))
      = qz * log(1 + exp(-x)) + (1 - z) * (-log(exp(-x)) + log(1 + exp(-x)))
      = qz * log(1 + exp(-x)) + (1 - z) * (x + log(1 + exp(-x))
      = (1 - z) * x + (qz +  1 - z) * log(1 + exp(-x))
      = (1 - z) * x + (1 + (q - 1) * z) * log(1 + exp(-x))

  Setting `l = (1 + (q - 1) * z)`, to ensure stability and avoid overflow,
  the implementation uses

      (1 - z) * x + l * (log(1 + exp(-abs(x))) + max(-x, 0))

  `logits` and `targets` must have the same type and shape.

  Args:
    logits: A `Tensor` of type `float32` or `float64`.
    targets: A `Tensor` of the same type and shape as `logits`.
    pos_weight: A coefficient to use on the positive examples.
    name: A name for the operation (optional).

  Returns:
    A `Tensor` of the same shape as `logits` with the componentwise
    weightedlogistic losses.

  Raises:
    ValueError: If `logits` and `targets` do not have the same shape.
  """
    with ops.op_scope([logits, targets], name, "logistic_loss") as name:
        logits = ops.convert_to_tensor(logits, name="logits")
        targets = ops.convert_to_tensor(targets, name="targets")
        try:
            targets.get_shape().merge_with(logits.get_shape())
        except ValueError:
            raise ValueError(
                "logits and targets must have the same shape (%s vs %s)" %
                (logits.get_shape(), targets.get_shape()))

        # The logistic loss formula from above is
        #   (1 - z) * x + (1 + (q - 1) * z) * log(1 + exp(-x))
        # For x < 0, a more numerically stable formula is
        #   (1 - z) * x + (1 + (q - 1) * z) * log(1 + exp(x)) - l * x
        # To avoid branching, we use the combined version
        #   (1 - z) * x + l * (log(1 + exp(-abs(x))) + max(-x, 0))
        log_weight = 1 + (pos_weight - 1) * targets
        return math_ops.add(
            (1 - targets) * logits,
            log_weight *
            (math_ops.log(1 + math_ops.exp(-math_ops.abs(logits))) +
             nn_ops.relu(-logits)),
            name=name)
Example #32
0
 def _mode(self):
     adjusted_count = array_ops.where(
         1. < self.total_count, self.total_count - 1.,
         array_ops.zeros_like(self.total_count))
     return math_ops.floor(adjusted_count * math_ops.exp(self.logits))
Example #33
0
 def _prob(self, x):
   coeff = np.sqrt(2) / self.scale / np.sqrt(np.pi)
   pdf = coeff * math_ops.exp(- 0.5 * (x / self.scale) ** 2)
   return pdf * math_ops.cast(x >= 0, self.dtype)
Example #34
0
def natural_exp_decay(exploration_rate,
                      timestep,
                      decay_steps,
                      decay_rate,
                      staircase=False,
                      name=None):
    """Applies natural exponential decay to the initial exploration rate.

    When training a model, it is often recommended to lower the exploration rate as
    the training progresses.  This function applies an exponential decay function
    to a provided initial exploration rate.  It requires an `timestep` value to
    compute the decayed exploration rate.  You can just pass a TensorFlow variable
    that you increment at each training step.

    The function returns the decayed exploration rate.  It is computed as:

    ```python
    >>> decayed_exploration_rate = exploration_rate * exp(-decay_rate * timestep)
    ```

    Example: decay exponentially with a base of 0.96:

    ```python
    >>> timestep = tf.Variable(0, trainable=False)
    >>> exploration_rate = 0.1
    >>> k = 0.5
    >>> exploration_rate = tf.train.exponential_time_decay(exploration_rate, timestep, k)

    >>> # Passing timestep to minimize() will increment it at each step.
    >>> learning_step = (
    ...     tf.train.GradientDescentOptimizer(exploration_rate)
    ...     .minimize(...my loss..., timestep=timestep)
    ... )
    ```

    Args:
        exploration_rate: A scalar `float32` or `float64` `Tensor` or a
            Python number.  The initial exploration rate.
        timestep: A Python number.
            Global step to use for the decay computation.  Must not be negative.
        decay_steps: How often to apply decay.
        decay_rate: A Python number.  The decay rate.
        staircase: Whether to apply decay in a discrete staircase,
            as opposed to continuous, fashion.
        name: String.  Optional name of the operation.  Defaults to 'ExponentialTimeDecay'.

    Returns:
        A scalar `Tensor` of the same type as `exploration_rate`.  The decayed exploration rate.

    Raises:
        ValueError: if `timestep` is not supplied.
    """
    if timestep is None:
        raise ValueError("timestep is required for natural_exp_decay.")
    with get_name_scope(name=name,
                        scope="NaturalExpDecay",
                        values=[exploration_rate, timestep,
                                decay_rate]) as name:
        exploration_rate = ops.convert_to_tensor(exploration_rate,
                                                 name="exploration_rate")
        dtype = exploration_rate.dtype
        timestep = math_ops.cast(timestep, dtype)
        decay_steps = math_ops.cast(decay_steps, dtype)
        decay_rate = math_ops.cast(decay_rate, dtype)
        p = timestep / decay_steps
        if staircase:
            p = math_ops.floor(p)
        exponent = math_ops.exp(
            math_ops.multiply(math_ops.negative(decay_rate), p))
        return math_ops.multiply(exploration_rate, exponent, name=name)
Example #35
0
 def _prob(self, counts):
   return math_ops.exp(self._log_prob(counts))
 def _variance(self):
     return math_ops.exp(self._log_variance())
Example #37
0
def renyi_alpha(step,
                decay_time,
                alpha_min,
                alpha_max=0.99999,
                name='renyi_alpha'):
  r"""Exponentially decaying `Tensor` appropriate for Renyi ratios.

  When minimizing the Renyi divergence for `0 <= alpha < 1` (or maximizing the
  Renyi equivalent of elbo) in high dimensions, it is not uncommon to experience
  `NaN` and `inf` values when `alpha` is far from `1`.

  For that reason, it is often desirable to start the optimization with `alpha`
  very close to 1, and reduce it to a final `alpha_min` according to some
  schedule.  The user may even want to optimize using `elbo_ratio` for
  some fixed time before switching to Renyi based methods.

  This `Op` returns an `alpha` decaying exponentially with step:

  ```
  s(step) = (exp{step / decay_time} - 1) / (e - 1)
  t(s) = max(0, min(s, 1)),  (smooth growth from 0 to 1)
  alpha(t) = (1 - t) alpha_min + t alpha_max
  ```

  Args:
    step:  Non-negative scalar `Tensor`.  Typically the global step or an
      offset version thereof.
    decay_time:  Positive scalar `Tensor`.
    alpha_min:  `float` or `double` `Tensor`.
      The minimal, final value of `alpha`, achieved when `step >= decay_time`
    alpha_max:  `Tensor` of same `dtype` as `alpha_min`.
      The maximal, beginning value of `alpha`, achieved when `step == 0`
    name:  A name to give this `Op`.

  Returns:
    alpha:  A `Tensor` of same `dtype` as `alpha_min`.
  """
  with ops.name_scope(name, values=[step, decay_time, alpha_min, alpha_max]):
    alpha_min = ops.convert_to_tensor(alpha_min, name='alpha_min')
    dtype = alpha_min.dtype

    alpha_max = ops.convert_to_tensor(alpha_max, dtype=dtype, name='alpha_max')
    decay_time = math_ops.cast(decay_time, dtype)
    step = math_ops.cast(step, dtype)

    check_scalars = [
        check_ops.assert_rank(step, 0, message='step must be scalar'),
        check_ops.assert_rank(
            decay_time, 0, message='decay_time must be scalar'),
        check_ops.assert_rank(alpha_min, 0, message='alpha_min must be scalar'),
        check_ops.assert_rank(alpha_max, 0, message='alpha_max must be scalar'),
    ]
    check_sign = [
        check_ops.assert_non_negative(
            step, message='step must be non-negative'),
        check_ops.assert_positive(
            decay_time, message='decay_time must be positive'),
    ]

    with ops.control_dependencies(check_scalars + check_sign):
      theta = (math_ops.exp(step / decay_time) - 1.) / (math.e - 1.)
      theta = math_ops.minimum(math_ops.maximum(theta, 0.), 1.)
      return alpha_max * (1. - theta) + alpha_min * theta
 def _cdf(self, x):
     y = x - self.loc
     return (0.5 + 0.5 * math_ops.sign(y) *
             (1. - math_ops.exp(-math_ops.abs(y) / self.scale)))
 def _det(self):
   return math_ops.exp(self.log_det())
Example #40
0
 def _prob(self, event):
     return math_ops.exp(self._log_prob(event))
Example #41
0
 def _prob(self, x):
     return math_ops.exp(self._log_prob(x))
 def _stddev(self):
     return math_ops.exp(0.5 * self._log_variance())
Example #43
0
def natural_exp_decay(learning_rate,
                      global_step,
                      decay_steps,
                      decay_rate,
                      k_decay=1.0,
                      staircase=False,
                      name=None):
    """Applies natural exponential decay to the initial learning rate.

  When training a model, it is often recommended to lower the learning rate as
  the training progresses.  This function applies an exponential decay function
  to a provided initial learning rate.  It requires an `global_step` value to
  compute the decayed learning rate.  You can just pass a TensorFlow variable
  that you increment at each training step.

 
  The function returns the decayed learning rate.  It is computed as:

  ```python
  decayed_learning_rate = learning_rate * exp(-decay_rate * (global_step /
  decay_step) ^ k_decay )
  ```

  or, if `staircase` is `True`, as:

  ```python
  decayed_learning_rate = learning_rate * exp(-decay_rate * floor((global_step /
  decay_step) ^ k_decay))
  ```

  Example: decay exponentially with a base of 0.96:

  ```python
  ...
  global_step = tf.Variable(0, trainable=False)
  learning_rate = 0.1
  decay_steps = 5
  k = 0.5
  learning_rate = tf.compat.v1.train.natural_exp_decay(learning_rate,
  global_step,
                                             decay_steps, k)

  # Passing global_step to minimize() will increment it at each step.
  learning_step = (
      tf.compat.v1.train.GradientDescentOptimizer(learning_rate)
      .minimize(...my loss..., global_step=global_step)
  )
  ```

  Args:
    learning_rate: A scalar `float32` or `float64` `Tensor` or a Python number.
      The initial learning rate.
    global_step: A Python number. Global step to use for the decay computation.
      Must not be negative.
    decay_steps: How often to apply decay.
    decay_rate: A Python number.  The decay rate.
    k_decay: A scalar `float32` or `float64` `Tensor` or a Python number. The k values of
      the polynomial of k-decay method. Defaults to 1.0.
    staircase: Whether to apply decay in a discrete staircase, as opposed to
      continuous, fashion.
    name: String.  Optional name of the operation.  Defaults to
      'ExponentialTimeDecay'.
  Returns:
    A scalar `Tensor` of the same type as `learning_rate`.  The decayed
    learning rate.

  Raises:
    ValueError: if `global_step` is not supplied.

  References:
    k-decay: A New Method For Learning Rate Schedule:
      [Tao Zhang, Wei Li., 2020]
      ([pdf])(https://arxiv.org/abs/2004.05909)


  @compatibility(eager)
  When eager execution is enabled, this function returns a function which in
  turn returns the decayed learning rate Tensor. This can be useful for changing
  the learning rate value across different invocations of optimizer functions.
  @end_compatibility
  """
    natural_exp_rate = math_ops.exp(math_ops.negative(decay_rate))
    decayed_lr = learning_rate_schedule.ExponentialDecay(learning_rate,
                                                         decay_steps,
                                                         k_decay=k_decay,
                                                         natural_exp_rate,
                                                         staircase=staircase,
                                                         name=name)

    if not context.executing_eagerly():
        decayed_lr = decayed_lr(global_step)
    else:
        decayed_lr = functools.partial(decayed_lr, global_step)
    return decayed_lr
 def _mean(self):
     return math_ops.exp(
         math_ops.reduce_logsumexp(self.mixture_distribution.logits +
                                   self.distribution.log_rate,
                                   axis=-1))
Example #45
0
def natural_exp_decay(learning_rate,
                      global_step,
                      decay_steps,
                      decay_rate,
                      staircase=False,
                      name=None):
    """Applies natural exponential decay to the initial learning rate.

  When training a model, it is often recommended to lower the learning rate as
  the training progresses.  This function applies an exponential decay function
  to a provided initial learning rate.  It requires an `global_step` value to
  compute the decayed learning rate.  You can just pass a TensorFlow variable
  that you increment at each training step.

  The function returns the decayed learning rate.  It is computed as:

  ```python
  decayed_learning_rate = learning_rate * exp(-decay_rate * global_step)
  ```

  Example: decay exponetially with a base of 0.96:

  ```python
  ...
  global_step = tf.Variable(0, trainable=False)
  learning_rate = 0.1
  k = 0.5
  learning_rate = tf.train.exponential_time_decay(learning_rate, global_step, k)

  # Passing global_step to minimize() will increment it at each step.
  learning_step = (
      tf.GradientDescentOptimizer(learning_rate)
      .minimize(...my loss..., global_step=global_step)
  )
  ```

  Args:
    learning_rate: A scalar `float32` or `float64` `Tensor` or a
      Python number.  The initial learning rate.
    global_step: A Python number.
      Global step to use for the decay computation.  Must not be negative.
    decay_rate: A Python number.  The decay rate.
    name: String.  Optional name of the operation.  Defaults to
      'ExponentialTimeDecay'

  Returns:
    A scalar `Tensor` of the same type as `learning_rate`.  The decayed
    learning rate.
  """
    with ops.op_scope([learning_rate, global_step, decay_rate], name,
                      "NaturalExpDecay") as name:
        learning_rate = ops.convert_to_tensor(learning_rate,
                                              name="learning_rate")
        dtype = learning_rate.dtype
        global_step = math_ops.cast(global_step, dtype)
        decay_steps = math_ops.cast(decay_steps, dtype)
        decay_rate = math_ops.cast(decay_rate, dtype)
        p = global_step / decay_steps
        if staircase:
            p = math_ops.floor(p)
        exponent = math_ops.exp(math_ops.mul(math_ops.neg(decay_rate), p))
        return math_ops.mul(learning_rate, exponent, name=name)
Example #46
0
 def _prob(self, k):
     return math_ops.exp(self._log_prob(k))
Example #47
0
 def _mean(self):
     return self.total_count * math_ops.exp(self.logits)
Example #48
0
 def _cdf(self, x):
     z = self._z(x)
     return (0.5 + 0.5 * math_ops.sign(z) *
             (1. - math_ops.exp(-math_ops.abs(z))))
Example #49
0
 def _inverse(self, y):
     y = self._maybe_assert_valid(y)
     return math_ops.exp(
         math_ops.log1p(-(1 - y**self.concentration1)**self.concentration0))
Example #50
0
 def _mean(self):
     return math_ops.exp(-self.logits)
Example #51
0
def _ErfGrad(op, grad):
  """Returns grad * 2/sqrt(pi) * exp(-x**2)."""
  x = op.inputs[0]
  two_over_root_pi = constant_op.constant(2 / np.sqrt(np.pi), dtype=grad.dtype)
  with ops.control_dependencies([grad.op]):
    return  grad * two_over_root_pi * math_ops.exp(-math_ops.square(x))
Example #52
0
def sample(mu, log_var):
    noise_shape = mu.get_shape().as_list()
    noise = tf.random_normal(shape=noise_shape)
    samples = math_ops.add(math_ops.mul(math_ops.exp(log_var / 2.0), noise),
                           mu)
    return samples
Example #53
0
def sigmoid_cross_entropy_with_logits(logits, targets, name=None):
  """Computes sigmoid cross entropy given `logits`.

  Measures the probability error in discrete classification tasks in which each
  class is independent and not mutually exclusive.  For instance, one could
  perform multilabel classification where a picture can contain both an elephant
  and a dog at the same time.

  For brevity, let `x = logits`, `z = targets`.  The logistic loss is

        z * -log(sigmoid(x)) + (1 - z) * -log(1 - sigmoid(x))
      = z * -log(1 / (1 + exp(-x))) + (1 - z) * -log(exp(-x) / (1 + exp(-x)))
      = z * log(1 + exp(-x)) + (1 - z) * (-log(exp(-x)) + log(1 + exp(-x)))
      = z * log(1 + exp(-x)) + (1 - z) * (x + log(1 + exp(-x))
      = (1 - z) * x + log(1 + exp(-x))
      = x - x * z + log(1 + exp(-x))

  For x < 0, to avoid overflow in exp(-x), we reformulate the above

        x - x * z + log(1 + exp(-x))
      = log(exp(x)) - x * z + log(1 + exp(-x))
      = - x * z + log(1 + exp(x))

  Hence, to ensure stability and avoid overflow, the implementation uses this
  equivalent formulation

      max(x, 0) - x * z + log(1 + exp(-abs(x)))

  `logits` and `targets` must have the same type and shape.

  Args:
    logits: A `Tensor` of type `float32` or `float64`.
    targets: A `Tensor` of the same type and shape as `logits`.
    name: A name for the operation (optional).

  Returns:
    A `Tensor` of the same shape as `logits` with the componentwise
    logistic losses.

  Raises:
    ValueError: If `logits` and `targets` do not have the same shape.
  """
  with ops.name_scope(name, "logistic_loss", [logits, targets]) as name:
    logits = ops.convert_to_tensor(logits, name="logits")
    targets = ops.convert_to_tensor(targets, name="targets")
    try:
      targets.get_shape().merge_with(logits.get_shape())
    except ValueError:
      raise ValueError("logits and targets must have the same shape (%s vs %s)"
                       % (logits.get_shape(), targets.get_shape()))

    # The logistic loss formula from above is
    #   x - x * z + log(1 + exp(-x))
    # For x < 0, a more numerically stable formula is
    #   -x * z + log(1 + exp(x))
    # Note that these two expressions can be combined into the following:
    #   max(x, 0) - x * z + log(1 + exp(-abs(x)))
    # To allow computing gradients at zero, we define custom versions of max and
    # abs functions.
    zeros = array_ops.zeros_like(logits, dtype=logits.dtype)
    cond = (logits >= zeros)
    relu_logits = array_ops.where(cond, logits, zeros)
    neg_abs_logits = array_ops.where(cond, -logits, logits)
    return math_ops.add(relu_logits - logits * targets,
                        math_ops.log1p(math_ops.exp(neg_abs_logits)),
                        name=name)
Example #54
0
def expectation_importance_sampler(f,
                                   log_p,
                                   sampling_dist_q,
                                   z=None,
                                   n=None,
                                   seed=None,
                                   name='expectation_importance_sampler'):
    r"""Monte Carlo estimate of `E_p[f(Z)] = E_q[f(Z) p(Z) / q(Z)]`.

  With `p(z) := exp{log_p(z)}`, this `Op` returns

  ```
  n^{-1} sum_{i=1}^n [ f(z_i) p(z_i) / q(z_i) ],  z_i ~ q,
  \approx E_q[ f(Z) p(Z) / q(Z) ]
  =       E_p[f(Z)]
  ```

  This integral is done in log-space with max-subtraction to better handle the
  often extreme values that `f(z) p(z) / q(z)` can take on.

  If `f >= 0`, it is up to 2x more efficient to exponentiate the result of
  `expectation_importance_sampler_logspace` applied to `Log[f]`.

  User supplies either `Tensor` of samples `z`, or number of samples to draw `n`

  Args:
    f: Callable mapping samples from `sampling_dist_q` to `Tensors` with shape
      broadcastable to `q.batch_shape`.
      For example, `f` works "just like" `q.log_prob`.
    log_p:  Callable mapping samples from `sampling_dist_q` to `Tensors` with
      shape broadcastable to `q.batch_shape`.
      For example, `log_p` works "just like" `sampling_dist_q.log_prob`.
    sampling_dist_q:  The sampling distribution.
      `tf.contrib.distributions.Distribution`.
      `float64` `dtype` recommended.
      `log_p` and `q` should be supported on the same set.
    z:  `Tensor` of samples from `q`, produced by `q.sample_n`.
    n:  Integer `Tensor`.  Number of samples to generate if `z` is not provided.
    seed:  Python integer to seed the random number generator.
    name:  A name to give this `Op`.

  Returns:
    The importance sampling estimate.  `Tensor` with `shape` equal
      to batch shape of `q`, and `dtype` = `q.dtype`.
  """
    q = sampling_dist_q
    with ops.name_scope(name, values=[z, n]):
        z = _get_samples(q, z, n, seed)

        log_p_z = log_p(z)
        q_log_prob_z = q.log_prob(z)

        def _importance_sampler_positive_f(log_f_z):
            # Same as expectation_importance_sampler_logspace, but using Tensors
            # rather than samples and functions.  Allows us to sample once.
            log_values = log_f_z + log_p_z - q_log_prob_z
            return _logspace_mean(log_values)

        # With f_plus(z) = max(0, f(z)), f_minus(z) = max(0, -f(z)),
        # E_p[f(Z)] = E_p[f_plus(Z)] - E_p[f_minus(Z)]
        #           = E_p[f_plus(Z) + 1] - E_p[f_minus(Z) + 1]
        # Without incurring bias, 1 is added to each to prevent zeros in logspace.
        # The logarithm is approximately linear around 1 + epsilon, so this is good
        # for small values of 'z' as well.
        f_z = f(z)
        log_f_plus_z = math_ops.log(nn.relu(f_z) + 1.)
        log_f_minus_z = math_ops.log(nn.relu(-1. * f_z) + 1.)

        log_f_plus_integral = _importance_sampler_positive_f(log_f_plus_z)
        log_f_minus_integral = _importance_sampler_positive_f(log_f_minus_z)

    return math_ops.exp(log_f_plus_integral) - math_ops.exp(
        log_f_minus_integral)
Example #55
0
 def _forward(self, x):
     z = (x - self.loc) / self.scale
     return math_ops.exp(-math_ops.exp(-z))
Example #56
0
  def run_test_sample_consistent_log_prob(
      self, sess_run_fn, dist,
      num_samples=int(1e5), num_threshold=int(1e3), seed=42,
      batch_size=None,
      rtol=1e-2, atol=0.):
    """Tests that sample/log_prob are consistent with each other.

    "Consistency" means that `sample` and `log_prob` correspond to the same
    distribution.

    Note: this test only verifies a necessary condition for consistency--it does
    does not verify sufficiency hence does not prove `sample`, `log_prob` truly
    are consistent.

    Args:
      sess_run_fn: Python `callable` taking `list`-like of `Tensor`s and
        returning a list of results after running one "step" of TensorFlow
        computation, typically set to `sess.run`.
      dist: Distribution instance or object which implements `sample`,
        `log_prob`, `event_shape_tensor` and `batch_shape_tensor`.
      num_samples: Python `int` scalar indicating the number of Monte-Carlo
        samples to draw from `dist`.
      num_threshold: Python `int` scalar indicating the number of samples a
        bucket must contain before being compared to the probability.
        Default value: 1e3; must be at least 1.
        Warning, set too high will cause test to falsely pass but setting too
        low will cause the test to falsely fail.
      seed: Python `int` indicating the seed to use when sampling from `dist`.
        In general it is not recommended to use `None` during a test as this
        increases the likelihood of spurious test failure.
      batch_size: Hint for unpacking result of samples. Default: `None` means
        batch_size is inferred.
      rtol: Python `float`-type indicating the admissible relative error between
        analytical and sample statistics.
      atol: Python `float`-type indicating the admissible absolute error between
        analytical and sample statistics.

    Raises:
      ValueError: if `num_threshold < 1`.
    """
    if num_threshold < 1:
      raise ValueError("num_threshold({}) must be at least 1.".format(
          num_threshold))
    # Histogram only supports vectors so we call it once per batch coordinate.
    y = dist.sample(num_samples, seed=seed)
    y = array_ops.reshape(y, shape=[num_samples, -1])
    if batch_size is None:
      batch_size = math_ops.reduce_prod(dist.batch_shape_tensor())
    batch_dims = array_ops.shape(dist.batch_shape_tensor())[0]
    edges_expanded_shape = 1 + array_ops.pad([-2], paddings=[[0, batch_dims]])
    for b, x in enumerate(array_ops.unstack(y, num=batch_size, axis=1)):
      counts, edges = self.histogram(x)
      edges = array_ops.reshape(edges, edges_expanded_shape)
      probs = math_ops.exp(dist.log_prob(edges))
      probs = array_ops.reshape(probs, shape=[-1, batch_size])[:, b]

      [counts_, probs_] = sess_run_fn([counts, probs])
      valid = counts_ > num_threshold
      probs_ = probs_[valid]
      counts_ = counts_[valid]
      self.assertAllClose(probs_, counts_ / num_samples,
                          rtol=rtol, atol=atol)
Example #57
0
 def _forward(self, x):
     x = self._maybe_assert_valid(x)
     return math_ops.exp(
         math_ops.log1p(
             -math_ops.exp(math_ops.log1p(-x) / self.concentration0)) /
         self.concentration1)
Example #58
0
def log_poisson_loss(targets, log_input, compute_full_loss=False, name=None):
  """Computes log Poisson loss given `log_input`.

  Gives the log-likelihood loss between the prediction and the target under the
  assumption that the target has a Poisson distribution.
  Caveat: By default, this is not the exact loss, but the loss minus a
    constant term [log(z!)]. That has no effect for optimization, but
    does not play well with relative loss comparisons. To compute an
    approximation of the log factorial term, specify
    compute_full_loss=True to enable Stirling's Approximation.

  For brevity, let `c = log(x) = log_input`, `z = targets`.  The log Poisson
  loss is

        -log(exp(-x) * (x^z) / z!)
      = -log(exp(-x) * (x^z)) + log(z!)
      ~ -log(exp(-x)) - log(x^z) [+ z * log(z) - z + 0.5 * log(2 * pi * z)]
          [ Note the second term is the Stirling's Approximation for log(z!).
            It is invariant to x and does not affect optimization, though
            important for correct relative loss comparisons. It is only
            computed when compute_full_loss == True. ]
      = x - z * log(x) [+ z * log(z) - z + 0.5 * log(2 * pi * z)]
      = exp(c) - z * c [+ z * log(z) - z + 0.5 * log(2 * pi * z)]

  Args:
    targets: A `Tensor` of the same type and shape as `log_input`.
    log_input: A `Tensor` of type `float32` or `float64`.
    compute_full_loss: whether to compute the full loss. If false, a constant
      term is dropped in favor of more efficient optimization.
    name: A name for the operation (optional).

  Returns:
    A `Tensor` of the same shape as `log_input` with the componentwise
    logistic losses.

  Raises:
    ValueError: If `log_input` and `targets` do not have the same shape.
  """
  with ops.name_scope(name, "log_poisson_loss", [log_input, targets]) as name:
    log_input = ops.convert_to_tensor(log_input, name="log_input")
    targets = ops.convert_to_tensor(targets, name="targets")
    try:
      targets.get_shape().merge_with(log_input.get_shape())
    except ValueError:
      raise ValueError(
          "log_input and targets must have the same shape (%s vs %s)" %
          (log_input.get_shape(), targets.get_shape()))

    result = math_ops.exp(log_input) - log_input * targets
    if compute_full_loss:
      # need to create constant tensors here so that their dtypes can be matched
      # to that of the targets.
      point_five = constant_op.constant(0.5, dtype=targets.dtype)
      two_pi = constant_op.constant(2 * math.pi, dtype=targets.dtype)

      stirling_approx = (targets * math_ops.log(targets)) - targets + (
          point_five * math_ops.log(two_pi * targets))
      zeros = array_ops.zeros_like(targets, dtype=targets.dtype)
      ones = array_ops.ones_like(targets, dtype=targets.dtype)
      cond = math_ops.logical_and(targets >= zeros, targets <= ones)
      result += array_ops.where(cond, zeros, stirling_approx)
    return result
 def _prob(self, x):
     return 0.5 / self.scale * math_ops.exp(
         -math_ops.abs(x - self.loc) / self.scale)
Example #60
0
 def _forward_log_det_jacobian(self, x):
     z = (x - self.loc) / self.scale
     return -z - math_ops.exp(-z) - math_ops.log(self.scale)