Esempio n. 1
0
 def f(a, b):  # pylint: disable=missing-docstring
     # Element-wise closeness test. `rtol`, `atol` and `equal_nan` are free
     # variables taken from the enclosing scope.
     value_dtype = a.dtype
     if not np.issubdtype(value_dtype.as_numpy_dtype, np.inexact):
         # Exact (non floating/complex) dtypes are compared for plain equality.
         return a == b

     rel_tol = ops.convert_to_tensor(rtol, value_dtype.real_dtype)
     abs_tol = ops.convert_to_tensor(atol, value_dtype.real_dtype)

     # Close means |a - b| <= atol + rtol * |b|.
     difference = math_ops.abs(a - b)
     allowance = abs_tol + rel_tol * math_ops.abs(b)
     is_close = difference <= allowance
     if equal_nan:
         # Positions where both operands are NaN also count as close.
         both_nan = math_ops.is_nan(a) & math_ops.is_nan(b)
         is_close = is_close | both_nan
     return is_close
Esempio n. 2
0
  def testUniformNans(self):
    """Checks that `prob` is NaN exactly where its input is NaN."""
    lower = 10.0
    upper = [11.0, 100.0]
    dist = uniform_lib.Uniform(low=lower, high=upper)

    finite_input = constant_op.constant(1.0)
    # 0 / 0 is used to manufacture a NaN tensor; verified just below.
    nan_input = constant_op.constant(0.0) / constant_op.constant(0.0)
    self.assertTrue(self.evaluate(math_ops.is_nan(nan_input)))
    inputs = array_ops.stack([finite_input, nan_input])

    density = dist.prob(inputs)

    nan_flags = self.evaluate(math_ops.is_nan(density))
    self.assertFalse(nan_flags[0])
    self.assertTrue(nan_flags[1])
Esempio n. 3
0
    def testUniformNans(self):
        """Verifies NaN inputs yield NaN probabilities and finite ones do not."""
        low_bound = 10.0
        high_bounds = [11.0, 100.0]
        distribution = uniform_lib.Uniform(low=low_bound, high=high_bounds)

        regular_value = constant_op.constant(1.0)
        # Dividing zero by zero gives the NaN sample (asserted right after).
        nan_value = constant_op.constant(0.0) / constant_op.constant(0.0)
        self.assertTrue(self.evaluate(math_ops.is_nan(nan_value)))
        samples = array_ops.stack([regular_value, nan_value])

        probabilities = distribution.prob(samples)

        nan_mask = self.evaluate(math_ops.is_nan(probabilities))
        self.assertFalse(nan_mask[0])
        self.assertTrue(nan_mask[1])
Esempio n. 4
0
  def check_grads(grads_and_vars):
    """Builds ops reporting NaN presence and the peak magnitude of gradients.

    Args:
        grads_and_vars:
            iterable whose items are either gradients or tuples whose first
            element is the gradient (e.g. `(grad, var)` pairs, normally the
            output of opt.compute_gradients). `None` gradients are skipped.
    Output:
        has_nan: boolean tensor, true if any gradient contains a NaN
        amax: tensor denoting the maximum absolute value in the gradients
    """
    nan_flags = []
    peak_magnitudes = []

    for entry in grads_and_vars:
      gradient = entry[0] if isinstance(entry, tuple) else entry
      if gradient is None:
        continue

      # Sparse gradients carry their numbers in `.values`.
      is_sparse = isinstance(gradient, ops.IndexedSlices)
      values = gradient.values if is_sparse else gradient

      # Both reductions below are done in float32.
      if values.dtype != dtypes.float32:
        values = math_ops.cast(values, dtypes.float32)
      nan_flags.append(math_ops.reduce_any(math_ops.is_nan(values)))
      peak_magnitudes.append(math_ops.reduce_max(math_ops.abs(values)))

    has_nan = math_ops.reduce_any(nan_flags)
    amax = math_ops.reduce_max(peak_magnitudes)
    return has_nan, amax
Esempio n. 5
0
  def pdf(self, x, name="pdf"):
    """Evaluates the density of these Uniform distribution(s) at `x`.

    Args:
      x: tensor of dtype `dtype`, must be broadcastable with `a` and `b`.
      name: The name to give this op.

    Returns:
      pdf: tensor of dtype `dtype` holding the density at each entry of `x`:
          `nan` where `x` is `nan`, zero where `x < a` or `x > b`, and
          `1 / range()` elsewhere.

    Raises:
      TypeError: if the dtype of `x` differs from `self.dtype`.
    """
    with ops.name_scope(self.name), ops.op_scope([self.a, self.b, x], name):
      x = ops.convert_to_tensor(x, name="x")
      if x.dtype != self.dtype:
        message = "Input x dtype does not match dtype: %s vs. %s" % (
            x.dtype, self.dtype)
        raise TypeError(message)

      # Multiplying by `self._ones()` presumably expands `x` to the
      # distribution's broadcast shape -- confirm against `_ones`.
      broadcasted_x = x * self._ones()

      nan_mask = math_ops.is_nan(broadcasted_x)
      outside_support = math_ops.logical_or(broadcasted_x < self.a,
                                            broadcasted_x > self.b)
      zero_density = array_ops.zeros_like(broadcasted_x)
      inverse_range = 1.0 / self.range()
      flat_density = inverse_range * array_ops.ones_like(broadcasted_x)
      finite_pdf = math_ops.select(outside_support, zero_density, flat_density)

      # NaN inputs are passed through untouched.
      return math_ops.select(nan_mask, broadcasted_x, finite_pdf)
Esempio n. 6
0
  def pdf(self, x, name="pdf"):
    """Computes the Uniform probability density for observations `x`.

    Args:
      x: tensor of dtype `dtype`, must be broadcastable with `a` and `b`.
      name: The name to give this op.

    Returns:
      pdf: tensor of dtype `dtype`. Entries are `nan` where `x` is `nan`,
          zero where `x` falls below `a` or above `b`, and `1 / range()`
          otherwise.

    Raises:
      TypeError: if `x.dtype` does not equal `self.dtype`.
    """
    with ops.name_scope(self.name), ops.op_scope([self.a, self.b, x], name):
      x = ops.convert_to_tensor(x, name="x")
      dtype_matches = not (x.dtype != self.dtype)
      if not dtype_matches:
        raise TypeError(
            "Input x dtype does not match dtype: %s vs. %s"
            % (x.dtype, self.dtype))

      # `self._ones()` is presumably a ones tensor of the broadcast shape;
      # TODO confirm against its definition.
      broadcasted_x = x * self._ones()

      is_nan_input = math_ops.is_nan(broadcasted_x)
      below_or_above = math_ops.logical_or(broadcasted_x < self.a,
                                           broadcasted_x > self.b)
      zeros = array_ops.zeros_like(broadcasted_x)
      reciprocal_range = 1.0 / self.range()
      uniform_density = reciprocal_range * array_ops.ones_like(broadcasted_x)
      density = math_ops.select(below_or_above, zeros, uniform_density)

      # Where the input is NaN, return the (NaN) input itself.
      return math_ops.select(is_nan_input, broadcasted_x, density)
Esempio n. 7
0
  def testBasic(self):
    """Checks `nextafter` step size, direction, NaN handling and fixed point."""
    for dtype in (dtypes.float32, dtypes.float64):
      unit = constant_op.constant([1], dtype=dtype)
      double = constant_op.constant([2], dtype=dtype)
      origin = constant_op.constant([0], dtype=dtype)
      not_a_number = constant_op.constant([np.nan], dtype=dtype)

      machine_eps = np.finfo(dtype.as_numpy_dtype).eps
      eps = constant_op.constant([machine_eps], dtype=dtype)

      # Stepping up from 1 moves by exactly one machine epsilon.
      step_up = math_ops.nextafter(unit, double) - unit
      self.assertAllEqual(step_up, eps)

      # Stepping towards 0 yields a value strictly below 1.
      step_down = math_ops.nextafter(unit, origin) - unit
      self.assertAllLess(step_down, 0)

      # A NaN in either argument yields NaN.
      nan_from = math_ops.is_nan(math_ops.nextafter(not_a_number, unit))
      self.assertAllEqual(nan_from, [True])
      nan_towards = math_ops.is_nan(math_ops.nextafter(unit, not_a_number))
      self.assertAllEqual(nan_towards, [True])

      # Moving towards itself leaves the value unchanged.
      self.assertAllEqual(math_ops.nextafter(unit, unit), unit)
Esempio n. 8
0
  def _apply_transform(self, input_tensors, **kwargs):
    """Applies the transformation to the `transform_input`.

    Entries of the first input tensor equal to `self.strip_value` are dropped
    and the remaining entries are returned as a `SparseTensor`. A NaN
    `strip_value` is matched with `is_nan`, whichever NaN object was supplied
    (`np.nan`, `float("nan")`, a NumPy floating scalar, ...).

    Args:
      input_tensors: a list of Tensors representing the input to
        the Transform. Only the first element is used.
      **kwargs: Additional keyword arguments, unused here.

    Returns:
        A namedtuple of Tensors representing the transformed output.
    """
    d = input_tensors[0]
    strip_value = self.strip_value

    # `is np.nan` only recognises that one singleton object, and NaN never
    # compares equal to itself, so any other NaN would silently strip nothing.
    # Detect NaN by value instead.
    strip_is_nan = (isinstance(strip_value, (float, np.floating)) and
                    np.isnan(strip_value))
    if strip_is_nan:
      strip_hot = math_ops.is_nan(d)
    else:
      strip_hot = math_ops.equal(d,
                                 array_ops.constant([strip_value],
                                                    dtype=d.dtype))
    keep_hot = math_ops.logical_not(strip_hot)

    # Reshaping the shape vector to a scalar only succeeds for a rank-1 `d`.
    length = array_ops.reshape(array_ops.shape(d), [])
    indices = array_ops.boolean_mask(math_ops.range(length), keep_hot)
    values = array_ops.boolean_mask(d, keep_hot)

    # The kept positions become an int64 column of shape [-1, 1].
    sparse_indices = array_ops.reshape(
        math_ops.cast(indices, dtypes.int64), [-1, 1])
    shape = math_ops.cast(array_ops.shape(d), dtypes.int64)

    # pylint: disable=not-callable
    return self.return_type(ops.SparseTensor(sparse_indices, values, shape))
Esempio n. 9
0
  def testUniformNans(self):
    """Checks that `pdf` is NaN for NaN input and not NaN for finite input."""
    with self.test_session():
      lower = 10.0
      upper = [11.0, 100.0]
      dist = uniform_lib.Uniform(a=lower, b=upper)

      finite_input = constant_op.constant(1.0)
      # 0 / 0 supplies the NaN sample; the next assertion confirms it.
      nan_input = constant_op.constant(0.0) / constant_op.constant(0.0)
      self.assertTrue(math_ops.is_nan(nan_input).eval())
      inputs = array_ops.stack([finite_input, nan_input])

      density = dist.pdf(inputs)

      nan_flags = math_ops.is_nan(density).eval()
      self.assertFalse(nan_flags[0])
      self.assertTrue(nan_flags[1])
Esempio n. 10
0
    def _apply_transform(self, input_tensors):
        """Applies the transformation to the `transform_input`.

        Entries of the first input tensor equal to `self.strip_value` are
        dropped and the remaining entries are returned as a `SparseTensor`.
        A NaN `strip_value` is matched with `is_nan`, whichever NaN object was
        supplied (`np.nan`, `float("nan")`, a NumPy floating scalar, ...).

        Args:
            input_tensors: a list of Tensors representing the input to
                the Transform. Only the first element is used.

        Returns:
            A namedtuple of Tensors representing the transformed output.
        """
        d = input_tensors[0]
        strip_value = self.strip_value

        # `is np.nan` only recognises that one singleton object, and NaN never
        # compares equal to itself, so any other NaN would silently strip
        # nothing. Detect NaN by value instead.
        strip_is_nan = (isinstance(strip_value, (float, np.floating))
                        and np.isnan(strip_value))
        if strip_is_nan:
            strip_hot = math_ops.is_nan(d)
        else:
            strip_hot = math_ops.equal(
                d, array_ops.constant([strip_value], dtype=d.dtype))
        keep_hot = math_ops.logical_not(strip_hot)

        # Reshaping the shape vector to a scalar only succeeds for rank-1 `d`.
        length = array_ops.reshape(array_ops.shape(d), [])
        indices = array_ops.boolean_mask(math_ops.range(length), keep_hot)
        values = array_ops.boolean_mask(d, keep_hot)

        # The kept positions become an int64 column of shape [-1, 1].
        sparse_indices = array_ops.reshape(
            math_ops.cast(indices, dtypes.int64), [-1, 1])
        shape = math_ops.cast(array_ops.shape(d), dtypes.int64)

        # pylint: disable=not-callable
        return self.return_type(ops.SparseTensor(sparse_indices, values,
                                                 shape))
Esempio n. 11
0
def kl_divergence(distribution_a, distribution_b,
                  allow_nan_stats=True, name=None):
  """Computes KL(distribution_a || distribution_b) with a registered method.

  The KL method is looked up by the types of the two distributions. When no
  method is registered for exactly `type(distribution_a)` and
  `type(distribution_b)`, the class hierarchies of both types are searched:

  * a single method registered for any pair of classes in the two parent
    hierarchies is used as is;
  * among several candidates, the one whose registered classes have the
    shortest summed MRO distance to the input types wins;
  * remaining ties go to the first method found by the search, which favors
    a shorter MRO distance to `type(distribution_a)`.

  Args:
    distribution_a: The first distribution.
    distribution_b: The second distribution.
    allow_nan_stats: Python `bool`, default `True`. When `True`, the KL
      tensor is returned as computed, NaN entries included. When `False`,
      the returned tensor depends on an assertion that fails if any entry
      of the KL is NaN.
    name: Python `str` name forwarded to the registered KL method.

  Returns:
    A Tensor with the batchwise KL-divergence between `distribution_a`
    and `distribution_b`.

  Raises:
    NotImplementedError: If no KL method is defined for distribution types
      of `distribution_a` and `distribution_b`.
  """
  type_a = type(distribution_a)
  type_b = type(distribution_b)
  kl_fn = _registered_kl(type_a, type_b)
  if kl_fn is None:
    raise NotImplementedError(
        "No KL(distribution_a || distribution_b) registered for distribution_a "
        "type %s and distribution_b type %s"
        % (type_a.__name__, type_b.__name__))

  with ops.name_scope("KullbackLeibler"):
    kl_t = kl_fn(distribution_a, distribution_b, name=name)
    if allow_nan_stats:
      return kl_t

    # NaNs are not allowed: gate the result on a NaN-free assertion.
    kl_t = array_ops.identity(kl_t, name="kl")
    nan_free = math_ops.logical_not(
        math_ops.reduce_any(math_ops.is_nan(kl_t)))
    failure_message = (
        "KL calculation between %s and %s returned NaN values "
        "(and was called with allow_nan_stats=False). Values:"
        % (distribution_a.name, distribution_b.name))
    nan_check = control_flow_ops.Assert(nan_free, [failure_message, kl_t])

    with ops.control_dependencies([nan_check]):
      return array_ops.identity(kl_t, name="checked_kl")
Esempio n. 12
0
 def _prob(self, x):
     """Density at `x`: NaN stays NaN, 0 outside `[low, high)`, else 1/range."""
     # Multiplying by ones of the batch shape broadcasts `x` to that shape.
     batch_ones = array_ops.ones(self.batch_shape_tensor())
     broadcasted_x = x * batch_ones

     nan_mask = math_ops.is_nan(broadcasted_x)
     outside_support = math_ops.logical_or(broadcasted_x < self.low,
                                           broadcasted_x >= self.high)
     zero_density = array_ops.zeros_like(broadcasted_x)
     flat_density = array_ops.ones_like(broadcasted_x) / self.range()
     finite_prob = array_ops.where(outside_support, zero_density, flat_density)

     # NaN inputs are returned unchanged.
     return array_ops.where(nan_mask, broadcasted_x, finite_prob)
Esempio n. 13
0
 def _prob(self, x):
     """Density at `x`: NaN stays NaN, 0 outside `[a, b]`, else 1/range."""
     # Multiplying by ones of the batch shape broadcasts `x` to that shape.
     batch_ones = array_ops.ones(self.batch_shape())
     broadcasted_x = x * batch_ones

     nan_mask = math_ops.is_nan(broadcasted_x)
     outside_support = math_ops.logical_or(broadcasted_x < self.a,
                                           broadcasted_x > self.b)
     zero_density = array_ops.zeros_like(broadcasted_x)
     inverse_range = 1. / self.range()
     flat_density = inverse_range * array_ops.ones_like(broadcasted_x)
     finite_prob = math_ops.select(outside_support, zero_density, flat_density)

     # NaN inputs are returned unchanged.
     return math_ops.select(nan_mask, broadcasted_x, finite_prob)
Esempio n. 14
0
  def testBasic(self):
    """Exercises `nextafter` for float32 and float64 single-element inputs."""
    for dtype in (dtypes.float32, dtypes.float64):
      one = constant_op.constant([1], dtype=dtype)
      two = constant_op.constant([2], dtype=dtype)
      zero = constant_op.constant([0], dtype=dtype)
      nan = constant_op.constant([np.nan], dtype=dtype)

      numpy_eps = np.finfo(dtype.as_numpy_dtype).eps
      eps = constant_op.constant([numpy_eps], dtype=dtype)

      # The next float above 1 is exactly 1 + eps.
      upward_gap = math_ops.nextafter(one, two) - one
      self.assertAllEqual(upward_gap, eps)

      # The next float towards 0 lies strictly below 1.
      downward_gap = math_ops.nextafter(one, zero) - one
      self.assertAllLess(downward_gap, 0)

      # NaN in either position propagates to the result.
      for start, target in ((nan, one), (one, nan)):
        result_is_nan = math_ops.is_nan(math_ops.nextafter(start, target))
        self.assertAllEqual(result_is_nan, [True])

      # A value is its own successor when the target equals it.
      self.assertAllEqual(math_ops.nextafter(one, one), one)
Esempio n. 15
0
def kl(dist_a, dist_b, allow_nan=False, name=None):
    """Computes KL(dist_a || dist_b) with a registered KL method.

    The method is looked up by the types of both distributions. When nothing
    is registered for exactly `type(dist_a)` and `type(dist_b)`, the class
    hierarchies of both types are searched:

    * a single method registered for any pair of classes in the two parent
      hierarchies is used as is;
    * among several candidates, the one whose registered classes have the
      shortest summed MRO distance to the input types wins;
    * remaining ties go to the first method found by the search, which
      favors a shorter MRO distance to `type(dist_a)`.

    Args:
      dist_a: The first distribution.
      dist_b: The second distribution.
      allow_nan: If `False` (default), a runtime error is raised
        if the KL returns NaN values for any batch entry of the given
        distributions. If `True`, the KL may return a NaN for the given
        entry.
      name: (optional) Name scope to use for created operations.

    Returns:
      A Tensor with the batchwise KL-divergence between dist_a and dist_b.

    Raises:
      NotImplementedError: If no KL method is defined for distribution types
        of dist_a and dist_b.
    """
    type_a = type(dist_a)
    type_b = type(dist_b)
    kl_fn = _registered_kl(type_a, type_b)
    if kl_fn is None:
        raise NotImplementedError(
            "No KL(dist_a || dist_b) registered for dist_a type %s and dist_b "
            "type %s" % (type_a.__name__, type_b.__name__))

    with ops.name_scope("KullbackLeibler"):
        kl_t = kl_fn(dist_a, dist_b, name=name)
        if allow_nan:
            return kl_t

        # NaNs are not allowed: gate the result on a NaN-free assertion.
        kl_t = array_ops.identity(kl_t, name="kl")
        nan_free = math_ops.logical_not(
            math_ops.reduce_any(math_ops.is_nan(kl_t)))
        failure_message = (
            "KL calculation between %s and %s returned NaN values "
            "(and was called with allow_nan=False). Values:" %
            (dist_a.name, dist_b.name))
        nan_check = control_flow_ops.Assert(nan_free,
                                            [failure_message, kl_t])

        with ops.control_dependencies([nan_check]):
            return array_ops.identity(kl_t, name="checked_kl")
Esempio n. 16
0
 def _prob(self, x):
   """Density at `x`: NaN stays NaN, 0 outside `[a, b]`, else 1/range."""
   # Multiplying by ones of the batch shape broadcasts `x` to that shape.
   batch_ones = array_ops.ones(self.batch_shape())
   broadcasted_x = x * batch_ones

   is_nan_input = math_ops.is_nan(broadcasted_x)
   out_of_bounds = math_ops.logical_or(broadcasted_x < self.a,
                                       broadcasted_x > self.b)
   zeros = array_ops.zeros_like(broadcasted_x)
   reciprocal_range = 1. / self.range()
   uniform_density = reciprocal_range * array_ops.ones_like(broadcasted_x)
   density = array_ops.where(out_of_bounds, zeros, uniform_density)

   # NaN inputs are returned unchanged.
   return array_ops.where(is_nan_input, broadcasted_x, density)
Esempio n. 17
0
 def _prob(self, x):
   """Density at `x`: NaN stays NaN, 0 outside `[low, high)`, else 1/range."""
   # Multiplying by ones of the batch shape broadcasts `x` to that shape.
   batch_ones = array_ops.ones(self.batch_shape_tensor())
   broadcasted_x = x * batch_ones

   is_nan_input = math_ops.is_nan(broadcasted_x)
   out_of_bounds = math_ops.logical_or(broadcasted_x < self.low,
                                       broadcasted_x >= self.high)
   zeros = array_ops.zeros_like(broadcasted_x)
   uniform_density = array_ops.ones_like(broadcasted_x) / self.range()
   density = array_ops.where(out_of_bounds, zeros, uniform_density)

   # NaN inputs are returned unchanged.
   return array_ops.where(is_nan_input, broadcasted_x, density)
Esempio n. 18
0
def _calculate_acceptance_probabilities(init_probs, target_probs):
  """Computes per-class acceptance rates for rejection sampling.

  Args:
    init_probs: The class probabilities of the data.
    target_probs: The desired class proportion in minibatches.
  Returns:
    The per-class acceptance probabilities: `target_probs / init_probs`
    with NaN entries replaced by 0, divided by its maximum.

  Derivation:

  Let F be the probability of a rejection (on any example).
  Let p_i be the proportion of examples in the data in class i (init_probs).
  Let a_i be the rate at which the rejection sampler should *accept* class i.
  Let t_i be the target proportion in the minibatches for class i
  (target_probs).

  ```
  F = sum_i(p_i * (1-a_i))
    = 1 - sum_i(p_i * a_i)     using sum_i(p_i) = 1
  ```

  An example of class `i` ends up in the output when `k` rejections occur,
  then an example of class `i` is seen by the rejector and accepted. Summing
  over `k`:

  ```
  t_i = sum_k=0^inf(F^k * p_i * a_i)
      = p_i * a_i / (1 - F)    using geometric series identity, since 0 <= F < 1
      = p_i * a_i / sum_j(p_j * a_j)        using F from above
  ```

  The following constraints hold:
  ```
  0 <= p_i <= 1, sum_i(p_i) = 1
  0 <= a_i <= 1
  0 <= t_i <= 1, sum_i(t_i) = 1
  ```

  A solution for a_i in terms of the other variables is:
    ```a_i = (t_i / p_i) / max_i[t_i / p_i]```
  """
  # t_i / p_i for every class.
  raw_ratios = target_probs / init_probs

  # NaN ratios (e.g. from 0 / 0) are treated as 0.
  nan_mask = math_ops.is_nan(raw_ratios)
  ratios = math_ops.select(nan_mask,
                           array_ops.zeros_like(raw_ratios),
                           raw_ratios)

  # Normalize by the largest ratio so that class is always accepted.
  largest_ratio = math_ops.reduce_max(ratios)
  return ratios / largest_ratio
Esempio n. 19
0
def kl(dist_a, dist_b, allow_nan=False, name=None):
  """Computes KL(dist_a || dist_b) with a registered KL method.

  The method is looked up by the types of both distributions. When nothing is
  registered for exactly `type(dist_a)` and `type(dist_b)`, the class
  hierarchies of both types are searched:

  * a single method registered for any pair of classes in the two parent
    hierarchies is used as is;
  * among several candidates, the one whose registered classes have the
    shortest summed MRO distance to the input types wins;
  * remaining ties go to the first method found by the search, which favors
    a shorter MRO distance to `type(dist_a)`.

  Args:
    dist_a: The first distribution.
    dist_b: The second distribution.
    allow_nan: If `False` (default), a runtime error is raised
      if the KL returns NaN values for any batch entry of the given
      distributions.  If `True`, the KL may return a NaN for the given entry.
    name: (optional) Name scope to use for created operations.

  Returns:
    A Tensor with the batchwise KL-divergence between dist_a and dist_b.

  Raises:
    NotImplementedError: If no KL method is defined for distribution types
      of dist_a and dist_b.
  """
  type_a = type(dist_a)
  type_b = type(dist_b)
  kl_fn = _registered_kl(type_a, type_b)
  if kl_fn is None:
    raise NotImplementedError(
        "No KL(dist_a || dist_b) registered for dist_a type %s and dist_b "
        "type %s" % (type_a.__name__, type_b.__name__))

  with ops.name_scope("KullbackLeibler"):
    kl_t = kl_fn(dist_a, dist_b, name=name)
    if allow_nan:
      return kl_t

    # NaNs are not allowed: gate the result on a NaN-free assertion.
    kl_t = array_ops.identity(kl_t, name="kl")
    nan_free = math_ops.logical_not(
        math_ops.reduce_any(math_ops.is_nan(kl_t)))
    failure_message = (
        "KL calculation between %s and %s returned NaN values "
        "(and was called with allow_nan=False).  Values:"
        % (dist_a.name, dist_b.name))
    nan_check = control_flow_ops.Assert(nan_free, [failure_message, kl_t])

    with ops.control_dependencies([nan_check]):
      return array_ops.identity(kl_t, name="checked_kl")
Esempio n. 20
0
 def testNotInvertibleCpu(self):
   """Checks Cholesky of a non-invertible input has an all-NaN lower triangle."""
   # Non-invertible inputs result in lower-triangular NaNs.
   singular = constant_op.constant(
       [[1., -1., 0.], [-1., 1., -1.], [0., -1., 1.]])
   factor = linalg_ops.cholesky(singular)

   # Boolean mask selecting the diagonal and everything below it.
   all_true = constant_op.constant(True, shape=singular.shape)
   lower_mask = array_ops.matrix_band_part(all_true, -1, 0)
   lower_entries = array_ops.boolean_mask(factor, lower_mask)

   # Every selected entry must be NaN.
   nan_flags = math_ops.is_nan(lower_entries)
   all_nan = self.evaluate(
       math_ops.reduce_all(math_ops.reduce_all(nan_flags)))
   self.assertTrue(all_nan)
Esempio n. 21
0
 def _compare(self, x, use_gpu):
   """Checks `is_finite`/`is_inf`/`is_nan` against NumPy on array `x`."""
   np_finite = np.isfinite(x)
   np_inf = np.isinf(x)
   np_nan = np.isnan(x)

   with test_util.device(use_gpu=use_gpu):
     inx = ops.convert_to_tensor(x)
     ofinite = math_ops.is_finite(inx)
     oinf = math_ops.is_inf(inx)
     onan = math_ops.is_nan(inx)
     tf_finite, tf_inf, tf_nan = self.evaluate([ofinite, oinf, onan])

   # Values first, then static shapes, each in inf / nan / finite order.
   for expected, actual in ((np_inf, tf_inf), (np_nan, tf_nan),
                            (np_finite, tf_finite)):
     self.assertAllEqual(expected, actual)
   for expected, tensor in ((np_inf, oinf), (np_nan, onan),
                            (np_finite, ofinite)):
     self.assertShapeEqual(expected, tensor)
Esempio n. 22
0
def kl(dist_a, dist_b, allow_nan=False, name=None):
    """Computes KL(dist_a || dist_b) using the method registered for the pair.

    The KL method is looked up in `_DIVERGENCES` under the exact pair
    `(type(dist_a), type(dist_b))`.

    Args:
      dist_a: instance of distributions.Distribution.
      dist_b: instance of distributions.Distribution.
      allow_nan: If False (default), a runtime error is raised
        if the KL returns NaN values for any batch entry of the given
        distributions.  If True, the KL may return a NaN for the given entry.
      name: (optional) Name scope to use for created operations.

    Returns:
      A Tensor with the batchwise KL-divergence between dist_a and dist_b.

    Raises:
      TypeError: If dist_a or dist_b is not an instance of Distribution.
      NotImplementedError: If no KL method is defined for distribution types
        of dist_a and dist_b.
    """
    type_a = type(dist_a)
    type_b = type(dist_b)

    if not isinstance(dist_a, distribution.Distribution):
        raise TypeError(
            "dist_a is not an instance of Distribution, received type: %s"
            % type_a)
    if not isinstance(dist_b, distribution.Distribution):
        raise TypeError(
            "dist_b is not an instance of Distribution, received type: %s"
            % type_b)

    kl_fn = _DIVERGENCES.get((type_a, type_b), None)
    if kl_fn is None:
        raise NotImplementedError(
            "No KL(dist_a || dist_b) registered for dist_a type %s and dist_b "
            "type %s" % (type_a.__name__, type_b.__name__))

    with ops.name_scope("KullbackLeibler"):
        kl_t = kl_fn(dist_a, dist_b, name=name)
        if allow_nan:
            return kl_t

        # NaNs are not allowed: gate the result on a NaN-free assertion.
        kl_t = array_ops.identity(kl_t, name="kl")
        nan_free = math_ops.logical_not(
            math_ops.reduce_any(math_ops.is_nan(kl_t)))
        failure_message = (
            "KL calculation between %s and %s returned NaN values "
            "(and was called with allow_nan=False).  Values:" %
            (dist_a.name, dist_b.name))
        nan_check = logging_ops.Assert(nan_free, [failure_message, kl_t])

        with ops.control_dependencies([nan_check]):
            return array_ops.identity(kl_t, name="checked_kl")
Esempio n. 23
0
def sparsemax_loss(logits, sparsemax, labels, name=None):
    """Computes the sparsemax loss function [1].

    [1]: https://arxiv.org/abs/1602.02068

    Args:
      logits: A `Tensor`. Must be one of the following types: `half`,
        `float32`, `float64`.
      sparsemax: A `Tensor`. Must have the same type as `logits`.
      labels: A `Tensor`. Must have the same type as `logits`.
      name: A name for the operation (optional).

    Returns:
      A `Tensor`. Has the same type as `logits`. The per-element terms are
      summed over axis 1.
    """

    with ops.name_scope(name, "sparsemax_loss",
                        [logits, sparsemax, labels]) as name:
        logits = ops.convert_to_tensor(logits, name="logits")
        sparsemax = ops.convert_to_tensor(sparsemax, name="sparsemax")
        labels = ops.convert_to_tensor(labels, name="labels")

        # The paper calls the logits z. In theory a constant could be
        # subtracted from them for numerical stability, but this algorithm
        # has no major source of numerical instability.
        z = logits

        # Sum over the support. A conditional `where` is used instead of a
        # multiplication so that z = -inf is supported: with no support
        # (sparsemax = 0) a product would give 0 * -inf = nan, which is not
        # the correct value here. NaN sparsemax entries are deliberately kept.
        in_support = math_ops.logical_or(sparsemax > 0,
                                         math_ops.is_nan(sparsemax))
        support_terms = sparsemax * (z - 0.5 * sparsemax)
        sum_s = array_ops.where(in_support, support_terms,
                                array_ops.zeros_like(sparsemax))

        # - z_k + ||q||^2
        q_part = labels * (0.5 * labels - z)

        # Where labels = 0 and z = -inf, q_part would be 0 * -inf = nan, yet
        # a zero label should incur no cost for z = -inf. The mask also
        # matches z = +inf; in that case sparsemax is nan, so sum_s is nan
        # too and no further special treatment is needed.
        zero_label_inf_logit = math_ops.logical_and(
            math_ops.equal(labels, 0), math_ops.is_inf(z))
        q_part_safe = array_ops.where(zero_label_inf_logit,
                                      array_ops.zeros_like(z), q_part)

        return math_ops.reduce_sum(sum_s + q_part_safe, axis=1)
def kl(dist_a, dist_b, allow_nan=False, name=None):
    """Returns the KL-divergence KL(dist_a || dist_b).

    The KL method is looked up in `_DIVERGENCES` under the exact pair
    `(type(dist_a), type(dist_b))`.

    Args:
      dist_a: instance of distributions.Distribution.
      dist_b: instance of distributions.Distribution.
      allow_nan: If False (default), a runtime error is raised
        if the KL returns NaN values for any batch entry of the given
        distributions.  If True, the KL may return a NaN for the given entry.
      name: (optional) Name scope to use for created operations.

    Returns:
      A Tensor with the batchwise KL-divergence between dist_a and dist_b.

    Raises:
      TypeError: If dist_a or dist_b is not an instance of Distribution.
      NotImplementedError: If no KL method is defined for distribution types
        of dist_a and dist_b.
    """
    # Validate both arguments before any lookup.
    if not isinstance(dist_a, distribution.Distribution):
        message = "dist_a is not an instance of Distribution, received type: %s" % type(dist_a)
        raise TypeError(message)
    if not isinstance(dist_b, distribution.Distribution):
        message = "dist_b is not an instance of Distribution, received type: %s" % type(dist_b)
        raise TypeError(message)

    registry_key = (type(dist_a), type(dist_b))
    kl_fn = _DIVERGENCES.get(registry_key, None)
    if kl_fn is None:
        name_a, name_b = (cls.__name__ for cls in registry_key)
        raise NotImplementedError(
            "No KL(dist_a || dist_b) registered for dist_a type %s and dist_b "
            "type %s" % (name_a, name_b)
        )

    with ops.name_scope("KullbackLeibler"):
        kl_t = kl_fn(dist_a, dist_b, name=name)
        if allow_nan:
            return kl_t

        # NaNs are not allowed: gate the result on a NaN-free assertion.
        kl_t = array_ops.identity(kl_t, name="kl")
        has_nan = math_ops.reduce_any(math_ops.is_nan(kl_t))
        nan_free = math_ops.logical_not(has_nan)
        failure_message = (
            "KL calculation between %s and %s returned NaN values "
            "(and was called with allow_nan=False).  Values:" % (dist_a.name, dist_b.name)
        )
        nan_check = logging_ops.Assert(nan_free, [failure_message, kl_t])

        with ops.control_dependencies([nan_check]):
            return array_ops.identity(kl_t, name="checked_kl")
Esempio n. 25
0
def sparsemax_loss(logits, sparsemax, labels, name=None):
  """Computes the sparsemax loss function [1].

  [1]: https://arxiv.org/abs/1602.02068

  Args:
    logits: A `Tensor`. Must be one of the following types: `half`, `float32`,
      `float64`.
    sparsemax: A `Tensor`. Must have the same type as `logits`.
    labels: A `Tensor`. Must have the same type as `logits`.
    name: A name for the operation (optional).

  Returns:
    A `Tensor`. Has the same type as `logits`. The per-element terms are
    summed over axis 1.
  """

  with ops.name_scope(name, "sparsemax_loss",
                      [logits, sparsemax, labels]) as name:
    logits = ops.convert_to_tensor(logits, name="logits")
    sparsemax = ops.convert_to_tensor(sparsemax, name="sparsemax")
    labels = ops.convert_to_tensor(labels, name="labels")

    # The paper calls the logits z. In theory a constant could be subtracted
    # from them for numerical stability, but this algorithm has no major
    # source of numerical instability.
    z = logits

    # Sum over the support. A conditional `where` is used instead of a
    # multiplication so that z = -inf is supported: with no support
    # (sparsemax = 0) a product would give 0 * -inf = nan, which is not the
    # correct value here. NaN sparsemax entries are deliberately kept.
    in_support = math_ops.logical_or(sparsemax > 0,
                                     math_ops.is_nan(sparsemax))
    support_terms = sparsemax * (z - 0.5 * sparsemax)
    sum_s = array_ops.where(in_support, support_terms,
                            array_ops.zeros_like(sparsemax))

    # - z_k + ||q||^2
    q_part = labels * (0.5 * labels - z)

    # Where labels = 0 and z = -inf, q_part would be 0 * -inf = nan, yet a
    # zero label should incur no cost for z = -inf. The mask also matches
    # z = +inf; in that case sparsemax is nan, so sum_s is nan too and no
    # further special treatment is needed.
    zero_label_inf_logit = math_ops.logical_and(
        math_ops.equal(labels, 0), math_ops.is_inf(z))
    q_part_safe = array_ops.where(zero_label_inf_logit,
                                  array_ops.zeros_like(z), q_part)

    return math_ops.reduce_sum(sum_s + q_part_safe, axis=1)
Esempio n. 26
0
 def _compare(self, x, use_gpu):
   """Checks `is_finite`/`is_inf`/`is_nan` against NumPy on array `x`."""
   np_finite = np.isfinite(x)
   np_inf = np.isinf(x)
   np_nan = np.isnan(x)

   with self.test_session(
       use_gpu=use_gpu,
       force_gpu=use_gpu and test_util.is_gpu_available()) as sess:
     inx = ops.convert_to_tensor(x)
     ofinite = math_ops.is_finite(inx)
     oinf = math_ops.is_inf(inx)
     onan = math_ops.is_nan(inx)
     tf_finite, tf_inf, tf_nan = sess.run([ofinite, oinf, onan])

   # Values first, then static shapes, each in inf / nan / finite order.
   for expected, actual in ((np_inf, tf_inf), (np_nan, tf_nan),
                            (np_finite, tf_finite)):
     self.assertAllEqual(expected, actual)
   for expected, tensor in ((np_inf, oinf), (np_nan, onan),
                            (np_finite, ofinite)):
     self.assertShapeEqual(expected, tensor)
Esempio n. 27
0
    def convert_nan_or_inf_to_zero(self, grad):
        """Returns zeros in place of `grad` if any element is NaN or Inf.

        This is mainly for improving training stability: a gradient with a
        NaN or Inf anywhere is replaced wholesale by zeros, so the variable
        update is skipped.

        Args:
          grad: Input gradient.

        Returns:
          a Tensor with the dtype equal to grad dtype.
        """
        non_finite = math_ops.logical_or(math_ops.is_nan(grad),
                                         math_ops.is_inf(grad))
        # Scalar condition: a single bad element zeroes the whole gradient.
        any_non_finite = math_ops.reduce_any(non_finite)
        zero_grad = array_ops.zeros_like(grad, dtype=grad.dtype)
        return array_ops.where(any_non_finite, zero_grad, grad)
Esempio n. 28
0
 def _compare(self, x, use_gpu):
   """Checks `is_finite`/`is_inf`/`is_nan` against NumPy on array `x`."""
   with test_util.device(use_gpu=use_gpu):
     inx = ops.convert_to_tensor(x)
     ofinite = math_ops.is_finite(inx)
     oinf = math_ops.is_inf(inx)
     onan = math_ops.is_nan(inx)
     tf_finite, tf_inf, tf_nan = self.evaluate([ofinite, oinf, onan])

   is_bfloat16 = x.dtype == dtypes_lib.bfloat16.as_numpy_dtype
   if is_bfloat16:
     # Numpy will implicitly convert bfloat16 value to float16, so we cast to
     # float32 to avoid this.
     x = x.astype(np.float32)
   np_finite = np.isfinite(x)
   np_inf = np.isinf(x)
   np_nan = np.isnan(x)

   # Values first, then static shapes, each in inf / nan / finite order.
   for expected, actual in ((np_inf, tf_inf), (np_nan, tf_nan),
                            (np_finite, tf_finite)):
     self.assertAllEqual(expected, actual)
   for expected, tensor in ((np_inf, oinf), (np_nan, onan),
                            (np_finite, ofinite)):
     self.assertShapeEqual(expected, tensor)
Esempio n. 29
0
 def testSqrt(self):
   """Compares `math_ops.sqrt` with `np.sqrt` over dtypes, sizes and values."""
   sizes = [1, 3, 4, 7, 8, 63, 64, 65]
   for dtype in [np.float16, np.float32, np.float64]:
     fi = np.finfo(dtype)
     # For float32 Eigen uses Carmack's fast vectorized sqrt algorithm.
     # It is not accurate for very large arguments, so we test for
     # fi.max/100 instead of fi.max here.
     values = [fi.min, -2, -1, 0, fi.tiny, 1, 2, 1000, fi.max / 100]
     for size in sizes:
       for value in values:
         x = np.full((size,), value, dtype=dtype)
         expected = np.sqrt(x)
         expected_nan = np.isnan(expected)
         with test_util.use_gpu():
           actual = math_ops.sqrt(x)
           actual_nan = math_ops.is_nan(actual)
           if value < 0:
             # Negative inputs: only the NaN pattern is compared.
             self.assertAllEqual(expected_nan, self.evaluate(actual_nan))
           else:
             self.assertAllCloseAccordingToType(expected,
                                                self.evaluate(actual))
Esempio n. 30
0
def kernel(step_size, n_leapfrog_steps, x, target_log_prob_fn, event_dims=(),
           x_log_prob=None, x_grad=None, skip_metropolis_step=False, name=None):
  """Runs one iteration of Hamiltonian Monte Carlo.

  Hamiltonian Monte Carlo (HMC) is a Markov chain Monte Carlo (MCMC)
  algorithm that takes a series of gradient-informed steps to produce
  a Metropolis proposal. This function applies one step of HMC to
  randomly update the variable `x`.

  This function can update multiple chains in parallel. It assumes
  that all dimensions of `x` not specified in `event_dims` are
  independent, and should therefore be updated independently. The
  output of `target_log_prob_fn()` should sum log-probabilities across
  all event dimensions. Slices along dimensions not in `event_dims`
  may have different target distributions; for example, if
  `event_dims == (1,)`, then `x[0, :]` could have a different target
  distribution from `x[1, :]`. This is up to `target_log_prob_fn()`.

  Args:
    step_size: Scalar step size or array of step sizes for the
      leapfrog integrator. Broadcasts to the shape of
      `x`. Larger step sizes lead to faster progress, but
      too-large step sizes make rejection exponentially more likely.
      When possible, it's often helpful to match per-variable step
      sizes to the standard deviations of the target distribution in
      each variable.
    n_leapfrog_steps: Integer number of steps to run the leapfrog
      integrator for. Total progress per HMC step is roughly
      proportional to step_size * n_leapfrog_steps.
    x: Tensor containing the value(s) of the random variable(s) to update.
    target_log_prob_fn: Python callable which takes an argument like `x`
      and returns its (possibly unnormalized) log-density under the target
      distribution.
    event_dims: List of dimensions that should not be treated as
      independent. This allows for multiple chains to be run independently
      in parallel. Default is (), i.e., all dimensions are independent.
    x_log_prob (optional): Tensor containing the cached output of a previous
      call to `target_log_prob_fn()` evaluated at `x` (such as that provided by
      a previous call to `kernel()`). Providing `x_log_prob` and
      `x_grad` saves one gradient computation per call to `kernel()`.
      It is only used when `x_grad` is provided as well; otherwise a warning
      is logged and the value is recomputed.
    x_grad (optional): Tensor containing the cached gradient of
      `target_log_prob_fn()` evaluated at `x` (such as that provided by
      a previous call to `kernel()`). Providing `x_log_prob` and
      `x_grad` saves one gradient computation per call to `kernel()`.
      It is only used when `x_log_prob` is provided as well; otherwise a
      warning is logged and the value is recomputed.
    skip_metropolis_step (optional): boolean specifying whether to skip the
      Metropolis-Hastings step and directly return the newly proposed values
      by the integrator. The acceptance probabilities returned remain unchanged.
      In that case `updated_x`, `new_log_prob` and `new_grad` all describe the
      proposal, whether or not it would have been accepted.
    name: Python `str` name prefixed to Ops created by this function.

  Returns:
    updated_x: The updated variable(s) x. Has shape matching `x`.
    acceptance_probs: Tensor with the acceptance probabilities for the final
      iteration. This is useful for diagnosing step size problems etc. Has
      shape matching `target_log_prob_fn(x)`.
    new_log_prob: The value of `target_log_prob_fn()` evaluated at `updated_x`.
    new_grad: The value of the gradient of `target_log_prob_fn()` evaluated at
      `updated_x`.

  #### Examples:

  ```python
  # Tuning acceptance rates:
  target_accept_rate = 0.631
  def target_log_prob(x):
    # Standard normal
    return tf.reduce_sum(-0.5 * tf.square(x))
  initial_x = tf.zeros([10])
  initial_log_prob = target_log_prob(initial_x)
  initial_grad = tf.gradients(initial_log_prob, initial_x)[0]
  # Algorithm state
  x = tf.Variable(initial_x, name='x')
  step_size = tf.Variable(1., name='step_size')
  last_log_prob = tf.Variable(initial_log_prob, name='last_log_prob')
  last_grad = tf.Variable(initial_grad, name='last_grad')
  # Compute updates. Both cached values must be passed for either to be used.
  new_x, acceptance_prob, log_prob, grad = hmc.kernel(step_size, 3, x,
                                                      target_log_prob,
                                                      event_dims=[0],
                                                      x_log_prob=last_log_prob,
                                                      x_grad=last_grad)
  x_update = tf.assign(x, new_x)
  log_prob_update = tf.assign(last_log_prob, log_prob)
  grad_update = tf.assign(last_grad, grad)
  step_size_update = tf.assign(step_size,
                               tf.where(acceptance_prob > target_accept_rate,
                                        step_size * 1.01, step_size / 1.01))
  adaptive_updates = [x_update, log_prob_update, grad_update, step_size_update]
  sampling_updates = [x_update, log_prob_update, grad_update]

  sess = tf.Session()
  sess.run(tf.global_variables_initializer())
  # Warm up the sampler and adapt the step size
  for i in xrange(500):
    sess.run(adaptive_updates)
  # Collect samples without adapting step size
  samples = np.zeros([500, 10])
  for i in xrange(500):
    x_val, _ = sess.run([new_x, sampling_updates])
    samples[i] = x_val
  ```

  ```python
  # Empirical-Bayes estimation of a hyperparameter by MCMC-EM:

  # Problem setup
  N = 150
  D = 10
  x = np.random.randn(N, D).astype(np.float32)
  true_sigma = 0.5
  true_beta = true_sigma * np.random.randn(D).astype(np.float32)
  y = x.dot(true_beta) + np.random.randn(N).astype(np.float32)

  def log_prior(beta, log_sigma):
    return tf.reduce_sum(-0.5 / tf.exp(2 * log_sigma) * tf.square(beta) -
                         log_sigma)
  def regression_log_joint(beta, log_sigma, x, y):
    # This function returns log p(beta | log_sigma) + log p(y | x, beta).
    means = tf.matmul(tf.expand_dims(beta, 0), x, transpose_b=True)
    means = tf.squeeze(means)
    log_likelihood = tf.reduce_sum(-0.5 * tf.square(y - means))
    return log_prior(beta, log_sigma) + log_likelihood
  def log_joint_partial(beta):
    return regression_log_joint(beta, log_sigma, x, y)
  # Our estimate of log(sigma)
  log_sigma = tf.Variable(0., name='log_sigma')
  # The state of the Markov chain
  beta = tf.Variable(tf.random_normal([x.shape[1]]), name='beta')
  new_beta, _, _, _ = hmc.kernel(0.1, 5, beta, log_joint_partial,
                                 event_dims=[0])
  beta_update = tf.assign(beta, new_beta)
  optimizer = tf.train.GradientDescentOptimizer(learning_rate=0.01)
  with tf.control_dependencies([beta_update]):
    log_sigma_update = optimizer.minimize(-log_prior(beta, log_sigma),
                                          var_list=[log_sigma])

  sess = tf.Session()
  sess.run(tf.global_variables_initializer())
  log_sigma_history = np.zeros(1000)
  for i in xrange(1000):
    log_sigma_val, _ = sess.run([log_sigma, log_sigma_update])
    log_sigma_history[i] = log_sigma_val
  # Should converge to something close to true_sigma
  plt.plot(np.exp(log_sigma_history))
  ```
  """
  with ops.name_scope(name, 'hmc_kernel', [step_size, n_leapfrog_steps, x]):
    # NOTE(review): presumably wraps `target_log_prob_fn` so that it returns
    # the negated log-density ("potential") and its gradient, matching the
    # sign flips applied to the cached and returned values below -- confirm
    # against `_make_potential_and_grad`.
    potential_and_grad = _make_potential_and_grad(target_log_prob_fn)
    x = ops.convert_to_tensor(x, name='x')

    x_shape = array_ops.shape(x)
    # Auxiliary momentum drawn from a normal with the shape and dtype of `x`.
    m = random_ops.random_normal(x_shape, dtype=x.dtype)

    # Kinetic energy of the initial momentum, summed over `event_dims`.
    kinetic_0 = 0.5 * math_ops.reduce_sum(math_ops.square(m), event_dims)

    # The cached values are only usable as a pair. They are negated because
    # the rest of the computation works with the potential, not the log-prob.
    if (x_log_prob is not None) and (x_grad is not None):
      log_potential_0, grad_0 = -x_log_prob, -x_grad  # pylint: disable=invalid-unary-operand-type
    else:
      # At most one cached value was given: warn that it is ignored and
      # recompute both the potential and its gradient at `x`.
      if x_log_prob is not None:
        logging.warn('x_log_prob was provided, but x_grad was not,'
                     ' so x_log_prob was not used.')
      if x_grad is not None:
        logging.warn('x_grad was provided, but x_log_prob was not,'
                     ' so x_grad was not used.')
      log_potential_0, grad_0 = potential_and_grad(x)

    # Integrate from (x, m) to obtain the proposal and the potential and
    # gradient evaluated there.
    new_x, new_m, log_potential_1, grad_1 = leapfrog_integrator(
        step_size, n_leapfrog_steps, x, m, potential_and_grad, grad_0)

    kinetic_1 = 0.5 * math_ops.reduce_sum(math_ops.square(new_m), event_dims)

    # Change in total (potential + kinetic) energy between proposal and start.
    energy_change = log_potential_1 - log_potential_0 + kinetic_1 - kinetic_0
    # Treat NaN as infinite energy (and therefore guaranteed rejection).
    energy_change = array_ops.where(
        math_ops.is_nan(energy_change),
        array_ops.fill(array_ops.shape(energy_change),
                       energy_change.dtype.as_numpy_dtype(np.inf)),
        energy_change)
    # exp(min(-energy_change, 0)) == min(1, exp(-energy_change)).
    acceptance_probs = math_ops.exp(math_ops.minimum(-energy_change, 0.))
    
    # If we are skipping the MH step directly return the proposal; the signs
    # are flipped back so that a log-prob and its gradient are returned.
    if skip_metropolis_step:
      return new_x, acceptance_probs, -log_potential_1, -grad_1
    
    # Accept where a uniform draw falls below the acceptance probability.
    accepted = (
        random_ops.random_uniform(
            array_ops.shape(acceptance_probs), dtype=x.dtype)
        < acceptance_probs)
    new_log_prob = -array_ops.where(accepted, log_potential_1, log_potential_0)

    # `accepted` has the shape of `acceptance_probs`; the steps below expand it
    # to the shape of `x` so it can select between `new_x` and `x`.
    # TODO(b/65738010): This should work, but it doesn't for now.
    # reduced_shape = math_ops.reduced_shape(x_shape, event_dims)
    reduced_shape = array_ops.shape(math_ops.reduce_sum(x, event_dims,
                                                        keep_dims=True))
    # Shape of `x` with the event dimensions collapsed to size 1.
    accepted = array_ops.reshape(accepted, reduced_shape)
    # OR-ing with an all-False tensor shaped like `x` broadcasts `accepted` to
    # the full shape of `x` without changing any of its values.
    accepted = math_ops.logical_or(
        accepted, math_ops.cast(array_ops.zeros_like(x), dtypes.bool))
    new_x = array_ops.where(accepted, new_x, x)
    new_grad = -array_ops.where(accepted, grad_1, grad_0)

  # TODO(langmore) Gradients of acceptance_probs and new_log_prob with respect
  # to x will propagate NaNs (see testNanFromGradsDontPropagate).  This
  # should be fixed.
  return new_x, acceptance_probs, new_log_prob, new_grad
Esempio n. 31
0
def sparsemax(logits, name=None):
    """Applies the sparsemax activation [1] to a batch of logits.

    Entry `j` of row `i` of the result is
      $$sparsemax[i, j] = max(logits[i, j] - tau(logits[i, :]), 0)$$

    Rows for which no valid support exists, or that contain NaN, come back
    filled with NaN.

    [1]: https://arxiv.org/abs/1602.02068

    Args:
      logits: A `Tensor`. Must be one of the following types: `half`,
        `float32`, `float64`. Indexed as `[batch, class]`.
      name: A name for the operation (optional).

    Returns:
      A `Tensor`. Has the same type as `logits`.
    """
    with ops.name_scope(name, "sparsemax", [logits]):
        logits = ops.convert_to_tensor(logits, name="logits")
        num_rows = array_ops.shape(logits)[0]
        num_classes = array_ops.shape(logits)[1]
        dtype = logits.dtype

        # The paper calls the logits z. Subtracting mean(logits) first would
        # keep the cumulative sum close to zero, which is where most of the
        # numerical instability of this algorithm comes from. In practice the
        # instability is very minor, and subtracting the mean causes extra
        # issues with inf and nan input, so the logits are used unchanged.

        # Sort each row (top_k over the full row width).
        sorted_logits, _ = nn.top_k(logits, k=num_classes)

        # k(z): the check vector always looks like [1, 1, ..., 1, 0, ..., 0],
        # so (index + 1) of its last 1 equals the number of 1s in it.
        running_sum = math_ops.cumsum(sorted_logits, axis=1)
        upper = math_ops.cast(num_classes, dtype) + 1
        ranks = math_ops.range(1, upper, dtype=dtype)
        in_support = 1 + ranks * sorted_logits > running_sum
        support_size = math_ops.reduce_sum(
            math_ops.cast(in_support, dtypes.int32), axis=1)

        # tau(z): with inf values, or when every value is -inf, the support
        # size is zero. That is mathematically invalid and would also make
        # gather_nd fail, so it is clamped to 1 for the lookup only; such rows
        # are replaced by nan further down, the same behavior as softmax.
        safe_size = math_ops.maximum(support_size, 1)
        row_ids = math_ops.range(0, num_rows)
        last_ids = safe_size - 1
        gather_ids = array_ops.stack([row_ids, last_ids], axis=1)
        support_sum = array_ops.gather_nd(running_sum, gather_ids)
        tau = (support_sum - 1) / math_ops.cast(support_size, dtype)

        # p = max(0, z - tau(z)), with tau broadcast across each row.
        zero = math_ops.cast(0, dtype)
        shifted = logits - tau[:, array_ops.newaxis]
        p = math_ops.maximum(zero, shifted)

        # A row is invalid if its support size is 0 or its total sum is nan.
        is_invalid = math_ops.logical_or(
            math_ops.equal(support_size, 0),
            math_ops.is_nan(running_sum[:, -1]))
        nan_value = math_ops.cast(float("nan"), dtype)
        all_nan = array_ops.fill([num_rows, num_classes], nan_value)
        return array_ops.where(is_invalid, all_nan, p)
Esempio n. 32
0
def sparsemax(logits, name=None):
  """Computes the sparsemax activation [1] of each row of `logits`.

  For batch entry `i` and class `j` the output is
    $$sparsemax[i, j] = max(logits[i, j] - tau(logits[i, :]), 0)$$

  A row is returned as all-NaN when its threshold cannot be computed (support
  size of zero) or when it contains NaN.

  [1]: https://arxiv.org/abs/1602.02068

  Args:
    logits: A `Tensor`. Must be one of the following types: `half`, `float32`,
      `float64`. Indexed as `[batch, class]`.
    name: A name for the operation (optional).

  Returns:
    A `Tensor`. Has the same type as `logits`.
  """
  with ops.name_scope(name, "sparsemax", [logits]):
    logits = ops.convert_to_tensor(logits, name="logits")
    n_obs = array_ops.shape(logits)[0]
    n_dims = array_ops.shape(logits)[1]
    float_type = logits.dtype

    # In the paper the logits are called z. mean(logits) could be subtracted
    # to make the algorithm more numerically stable, since the instability
    # mostly comes from the cumulative sum and subtracting the mean keeps that
    # sum close to zero. In practice the instability is very minor, while
    # subtracting the mean causes extra issues with inf and nan input, so the
    # logits are used as they are.

    # Sorted copy of every row.
    z_ordered, _ = nn.top_k(logits, k=n_dims)

    # k(z): count how many sorted entries pass the support test. The test
    # vector is always [1, 1, ..., 1, 0, ..., 0], so summing it yields
    # (index + 1) of its last 1.
    partial_sums = math_ops.cumsum(z_ordered, axis=1)
    range_limit = math_ops.cast(n_dims, float_type) + 1
    position = math_ops.range(1, range_limit, dtype=float_type)
    passes = 1 + position * z_ordered > partial_sums
    passes_as_int = math_ops.cast(passes, dtypes.int32)
    count = math_ops.reduce_sum(passes_as_int, axis=1)

    # tau(z): the count is zero when there are inf values or when all values
    # are -inf. That is mathematically invalid and would make gather_nd fail,
    # so the lookup uses a count clamped to at least 1. The affected rows are
    # overwritten with nan at the end, which matches the behavior of softmax.
    clamped_count = math_ops.maximum(count, 1)
    lookup = array_ops.stack(
        [math_ops.range(0, n_obs), clamped_count - 1], axis=1)
    sum_over_support = array_ops.gather_nd(partial_sums, lookup)
    numerator = sum_over_support - 1
    threshold = numerator / math_ops.cast(count, float_type)

    # p = max(0, z - tau(z)); the threshold is broadcast along each row.
    floor = math_ops.cast(0, float_type)
    activations = math_ops.maximum(
        floor, logits - threshold[:, array_ops.newaxis])

    # Rows with a zero count or a nan total are invalid input.
    zero_count = math_ops.equal(count, 0)
    nan_total = math_ops.is_nan(partial_sums[:, -1])
    bad_rows = math_ops.logical_or(zero_count, nan_total)
    nan_rows = array_ops.fill(
        [n_obs, n_dims], math_ops.cast(float("nan"), float_type))

    return array_ops.where(bad_rows, nan_rows, activations)
Esempio n. 33
0
def kernel(step_size, n_leapfrog_steps, x, target_log_prob_fn, event_dims=(),
           x_log_prob=None, x_grad=None, name=None):
  """Runs one iteration of Hamiltonian Monte Carlo.

  Hamiltonian Monte Carlo (HMC) is a Markov chain Monte Carlo (MCMC)
  algorithm that takes a series of gradient-informed steps to produce
  a Metropolis proposal. This function applies one step of HMC to
  randomly update the variable `x`.

  This function can update multiple chains in parallel. It assumes
  that all dimensions of `x` not specified in `event_dims` are
  independent, and should therefore be updated independently. The
  output of `target_log_prob_fn()` should sum log-probabilities across
  all event dimensions. Slices along dimensions not in `event_dims`
  may have different target distributions; for example, if
  `event_dims == (1,)`, then `x[0, :]` could have a different target
  distribution from `x[1, :]`. This is up to `target_log_prob_fn()`.

  Args:
    step_size: Scalar step size or array of step sizes for the
      leapfrog integrator. Broadcasts to the shape of
      `x`. Larger step sizes lead to faster progress, but
      too-large step sizes make rejection exponentially more likely.
      When possible, it's often helpful to match per-variable step
      sizes to the standard deviations of the target distribution in
      each variable.
    n_leapfrog_steps: Integer number of steps to run the leapfrog
      integrator for. Total progress per HMC step is roughly
      proportional to step_size * n_leapfrog_steps.
    x: Tensor containing the value(s) of the random variable(s) to update.
    target_log_prob_fn: Python callable which takes an argument like `x`
      and returns its (possibly unnormalized) log-density under the target
      distribution.
    event_dims: List of dimensions that should not be treated as
      independent. This allows for multiple chains to be run independently
      in parallel. Default is (), i.e., all dimensions are independent.
    x_log_prob (optional): Tensor containing the cached output of a previous
      call to `target_log_prob_fn()` evaluated at `x` (such as that provided by
      a previous call to `kernel()`). Providing `x_log_prob` and
      `x_grad` saves one gradient computation per call to `kernel()`.
      It is only used when `x_grad` is provided as well; otherwise a warning
      is logged and the value is recomputed.
    x_grad (optional): Tensor containing the cached gradient of
      `target_log_prob_fn()` evaluated at `x` (such as that provided by
      a previous call to `kernel()`). Providing `x_log_prob` and
      `x_grad` saves one gradient computation per call to `kernel()`.
      It is only used when `x_log_prob` is provided as well; otherwise a
      warning is logged and the value is recomputed.
    name: Python `str` name prefixed to Ops created by this function.

  Returns:
    updated_x: The updated variable(s) x. Has shape matching `x`.
    acceptance_probs: Tensor with the acceptance probabilities for the final
      iteration. This is useful for diagnosing step size problems etc. Has
      shape matching `target_log_prob_fn(x)`.
    new_log_prob: The value of `target_log_prob_fn()` evaluated at `updated_x`.
    new_grad: The value of the gradient of `target_log_prob_fn()` evaluated at
      `updated_x`.

  #### Examples:

  ```python
  # Tuning acceptance rates:
  target_accept_rate = 0.631
  def target_log_prob(x):
    # Standard normal
    return tf.reduce_sum(-0.5 * tf.square(x))
  initial_x = tf.zeros([10])
  initial_log_prob = target_log_prob(initial_x)
  initial_grad = tf.gradients(initial_log_prob, initial_x)[0]
  # Algorithm state
  x = tf.Variable(initial_x, name='x')
  step_size = tf.Variable(1., name='step_size')
  last_log_prob = tf.Variable(initial_log_prob, name='last_log_prob')
  last_grad = tf.Variable(initial_grad, name='last_grad')
  # Compute updates. Both cached values must be passed for either to be used.
  new_x, acceptance_prob, log_prob, grad = hmc.kernel(step_size, 3, x,
                                                      target_log_prob,
                                                      event_dims=[0],
                                                      x_log_prob=last_log_prob,
                                                      x_grad=last_grad)
  x_update = tf.assign(x, new_x)
  log_prob_update = tf.assign(last_log_prob, log_prob)
  grad_update = tf.assign(last_grad, grad)
  step_size_update = tf.assign(step_size,
                               tf.where(acceptance_prob > target_accept_rate,
                                        step_size * 1.01, step_size / 1.01))
  adaptive_updates = [x_update, log_prob_update, grad_update, step_size_update]
  sampling_updates = [x_update, log_prob_update, grad_update]

  sess = tf.Session()
  sess.run(tf.global_variables_initializer())
  # Warm up the sampler and adapt the step size
  for i in xrange(500):
    sess.run(adaptive_updates)
  # Collect samples without adapting step size
  samples = np.zeros([500, 10])
  for i in xrange(500):
    x_val, _ = sess.run([new_x, sampling_updates])
    samples[i] = x_val
  ```

  ```python
  # Empirical-Bayes estimation of a hyperparameter by MCMC-EM:

  # Problem setup
  N = 150
  D = 10
  x = np.random.randn(N, D).astype(np.float32)
  true_sigma = 0.5
  true_beta = true_sigma * np.random.randn(D).astype(np.float32)
  y = x.dot(true_beta) + np.random.randn(N).astype(np.float32)

  def log_prior(beta, log_sigma):
    return tf.reduce_sum(-0.5 / tf.exp(2 * log_sigma) * tf.square(beta) -
                         log_sigma)
  def regression_log_joint(beta, log_sigma, x, y):
    # This function returns log p(beta | log_sigma) + log p(y | x, beta).
    means = tf.matmul(tf.expand_dims(beta, 0), x, transpose_b=True)
    means = tf.squeeze(means)
    log_likelihood = tf.reduce_sum(-0.5 * tf.square(y - means))
    return log_prior(beta, log_sigma) + log_likelihood
  def log_joint_partial(beta):
    return regression_log_joint(beta, log_sigma, x, y)
  # Our estimate of log(sigma)
  log_sigma = tf.Variable(0., name='log_sigma')
  # The state of the Markov chain
  beta = tf.Variable(tf.random_normal([x.shape[1]]), name='beta')
  new_beta, _, _, _ = hmc.kernel(0.1, 5, beta, log_joint_partial,
                                 event_dims=[0])
  beta_update = tf.assign(beta, new_beta)
  optimizer = tf.train.GradientDescentOptimizer(learning_rate=0.01)
  with tf.control_dependencies([beta_update]):
    log_sigma_update = optimizer.minimize(-log_prior(beta, log_sigma),
                                          var_list=[log_sigma])

  sess = tf.Session()
  sess.run(tf.global_variables_initializer())
  log_sigma_history = np.zeros(1000)
  for i in xrange(1000):
    log_sigma_val, _ = sess.run([log_sigma, log_sigma_update])
    log_sigma_history[i] = log_sigma_val
  # Should converge to something close to true_sigma
  plt.plot(np.exp(log_sigma_history))
  ```
  """
  with ops.name_scope(name, 'hmc_kernel', [step_size, n_leapfrog_steps, x]):
    # NOTE(review): presumably wraps `target_log_prob_fn` so that it returns
    # the negated log-density ("potential") and its gradient, matching the
    # sign flips applied to the cached and returned values below -- confirm
    # against `_make_potential_and_grad`.
    potential_and_grad = _make_potential_and_grad(target_log_prob_fn)
    x = ops.convert_to_tensor(x, name='x')

    x_shape = array_ops.shape(x)
    # Auxiliary momentum drawn from a normal with the shape and dtype of `x`.
    m = random_ops.random_normal(x_shape, dtype=x.dtype)

    # Kinetic energy of the initial momentum, summed over `event_dims`.
    kinetic_0 = 0.5 * math_ops.reduce_sum(math_ops.square(m), event_dims)

    # The cached values are only usable as a pair. They are negated because
    # the rest of the computation works with the potential, not the log-prob.
    if (x_log_prob is not None) and (x_grad is not None):
      log_potential_0, grad_0 = -x_log_prob, -x_grad  # pylint: disable=invalid-unary-operand-type
    else:
      # At most one cached value was given: warn that it is ignored and
      # recompute both the potential and its gradient at `x`.
      if x_log_prob is not None:
        logging.warn('x_log_prob was provided, but x_grad was not,'
                     ' so x_log_prob was not used.')
      if x_grad is not None:
        logging.warn('x_grad was provided, but x_log_prob was not,'
                     ' so x_grad was not used.')
      log_potential_0, grad_0 = potential_and_grad(x)

    # Integrate from (x, m) to obtain the proposal and the potential and
    # gradient evaluated there.
    new_x, new_m, log_potential_1, grad_1 = leapfrog_integrator(
        step_size, n_leapfrog_steps, x, m, potential_and_grad, grad_0)

    kinetic_1 = 0.5 * math_ops.reduce_sum(math_ops.square(new_m), event_dims)

    # Change in total (potential + kinetic) energy between proposal and start.
    energy_change = log_potential_1 - log_potential_0 + kinetic_1 - kinetic_0
    # Treat NaN as infinite energy (and therefore guaranteed rejection).
    energy_change = array_ops.where(
        math_ops.is_nan(energy_change),
        array_ops.fill(array_ops.shape(energy_change),
                       energy_change.dtype.as_numpy_dtype(np.inf)),
        energy_change)
    # exp(min(-energy_change, 0)) == min(1, exp(-energy_change)).
    acceptance_probs = math_ops.exp(math_ops.minimum(-energy_change, 0.))
    # Accept where a uniform draw falls below the acceptance probability.
    accepted = (
        random_ops.random_uniform(
            array_ops.shape(acceptance_probs), dtype=x.dtype)
        < acceptance_probs)
    new_log_prob = -array_ops.where(accepted, log_potential_1, log_potential_0)

    # `accepted` has the shape of `acceptance_probs`; the steps below expand it
    # to the shape of `x` so it can select between `new_x` and `x`.
    # TODO(b/65738010): This should work, but it doesn't for now.
    # reduced_shape = math_ops.reduced_shape(x_shape, event_dims)
    reduced_shape = array_ops.shape(math_ops.reduce_sum(x, event_dims,
                                                        keep_dims=True))
    # Shape of `x` with the event dimensions collapsed to size 1.
    accepted = array_ops.reshape(accepted, reduced_shape)
    # OR-ing with an all-False tensor shaped like `x` broadcasts `accepted` to
    # the full shape of `x` without changing any of its values.
    accepted = math_ops.logical_or(
        accepted, math_ops.cast(array_ops.zeros_like(x), dtypes.bool))
    new_x = array_ops.where(accepted, new_x, x)
    new_grad = -array_ops.where(accepted, grad_1, grad_0)

  # TODO(langmore) Gradients of acceptance_probs and new_log_prob with respect
  # to x will propagate NaNs (see testNanFromGradsDontPropagate).  This
  # should be fixed.
  return new_x, acceptance_probs, new_log_prob, new_grad