def _second_order_terms(*args):
        """Computes entries of the (Hessian of `fn`) == (Jacobian of `_grad_fn`)."""
        # Partial derivatives of _grad_fn's first output (dy/dx) wrt `(x, *args)`.
        _, (d2y_dx2, *d2y_dx_dargs) = tfp_math.value_and_gradient(
            lambda x_and_args: _grad_fn(*x_and_args)[0], (x, ) + args,
            auto_unpack_single_arg=False)

        # Partial derivatives of additional outputs (dy/da, etc) wrt the input
        # *args (if any). Note that we don't need derivatives of these outputs wrt
        # `x`, since these are equal to the values we computed above in
        # `d2y_dx_dargs` by the [symmetry of partial derivatives](
        #   https://en.wikipedia.org/wiki/Symmetry_of_second_derivatives). This
        # could also in principle be applied to optimize redundant partial
        # derivatives computed in this loop, although this would be incompatible
        # with parallelizing the loop (which is probably a bigger win).
        d2y_dargs2 = []
        for i in range(len(args)):
            # It may be possible to run this loop in parallel with `vectorized_map`,
            # although this would only matter in cases with >> 1 arguments.
            _, d2y_dargs2_row = tfp_math.value_and_gradient(
                lambda args, i=i: _grad_fn(x, *args)[1 + i],
                args,
                auto_unpack_single_arg=False)
            d2y_dargs2.append(d2y_dargs2_row)

        return d2y_dx2, d2y_dx_dargs, d2y_dargs2
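For context, here is a minimal standalone sketch of the same pattern (the two-argument `fn` and all names are hypothetical): one Hessian row comes from differentiating the first output of a gradient function, and by the symmetry argument above only the `args`-by-`args` block still needs its own pass.

import tensorflow as tf
import tensorflow_probability as tfp

tfp_math = tfp.math

def fn(x, a):
    return x**2 * a + tf.sin(a)

def grad_fn(x, a):
    # First-order partials (dy/dx, dy/da).
    return tfp_math.value_and_gradient(fn, (x, a))[1]

x, a = tf.constant(1.5), tf.constant(0.5)
# One Hessian row: (d2y/dx2, d2y/dxda).
_, (d2y_dx2, d2y_dxda) = tfp_math.value_and_gradient(
    lambda x_and_a: grad_fn(*x_and_a)[0], (x, a),
    auto_unpack_single_arg=False)
# By symmetry of second derivatives, d2y/dadx == d2y/dxda, so only the
# diagonal block d2y/da2 remains.
_, (_, d2y_da2) = tfp_math.value_and_gradient(
    lambda x_and_a: grad_fn(*x_and_a)[1], (x, a),
    auto_unpack_single_arg=False)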
Example #2
 def optimizer_step(parameters, optimizer_state, seed=None):
     """Runs a single optimization step."""
     try:
         loss, grads = value_and_gradient(
             functools.partial(loss_fn, seed=seed), parameters)
     except TypeError:
         loss, grads = value_and_gradient(loss_fn, parameters)
     updates, optimizer_state = optimizer.update(grads, optimizer_state,
                                                 parameters)
     # Apply updates.
     parameters = tf.nest.map_structure(lambda a, b: a + b, parameters,
                                        updates)
     return loss, grads, parameters, optimizer_state
Example #3
 def optimizer_step(parameters, optimizer_state, seed=None):
     """Runs a single optimization step."""
     try:
         loss, grads = value_and_gradient(
             functools.partial(loss_fn, seed=seed), parameters)
     except TypeError:
         loss, grads = value_and_gradient(loss_fn, parameters)
     # Coerce grads to the same sequence type (e.g., namedtuple) as parameters.
     grads = tf.nest.pack_sequence_as(parameters, tf.nest.flatten(grads))
     updates, optimizer_state = optimizer.update(grads, optimizer_state,
                                                 parameters)
     # Apply updates.
     parameters = tf.nest.map_structure(lambda a, b: a + b, parameters,
                                        updates)
     return loss, grads, parameters, optimizer_state
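To make the step function above runnable end to end, here is a hypothetical driver. The optax-style `optimizer` (an `init`/`update` pair) and the quadratic `loss_fn` are toy stand-ins, not part of the original snippet, which presumably closes over real versions of both.

import collections
import functools
import tensorflow as tf
import tensorflow_probability as tfp

value_and_gradient = tfp.math.value_and_gradient

# Stateless SGD stand-in exposing the `init`/`update` interface assumed by
# `optimizer_step`: updates are -lr * grads and the state is empty.
SGD = collections.namedtuple('SGD', ['init', 'update'])
optimizer = SGD(
    init=lambda params: (),
    update=lambda grads, state, params: (
        tf.nest.map_structure(lambda g: -0.1 * g, grads), state))

def loss_fn(params, seed=None):
    del seed  # The toy loss is deterministic.
    return tf.reduce_sum(params**2)

params = tf.constant([1.0, -2.0])
state = optimizer.init(params)
for _ in range(100):
    loss, grads, params, state = optimizer_step(params, state)
# `params` should now be close to the minimizer at zero.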
Example #4
  def testGradientOnSupportInterior(self, dtype):
    # round_exponential_bump_function(x) = 0 for x right at the edge of the
    # support, e.g. x = -0.999.  This is expected, due to the exponential and
    # division.
    x = tf.convert_to_tensor([
        -0.9925,
        -0.5,
        0.,
        0.5,
        0.9925
    ], dtype=dtype)

    _, dy_dx = tfp_math.value_and_gradient(
        tfp_math.round_exponential_bump_function, x)

    self.assertDTypeEqual(dy_dx, dtype)

    dy_dx_ = self.evaluate(dy_dx)

    # grad[round_exponential_bump_function](0) = 0
    self.assertEqual(0., dy_dx_[2])
    self.assertAllFinite(dy_dx_)

    # Increasing on (-1, 0), decreasing on (0, 1).
    self.assertAllGreater(dy_dx_[:2], 0)
    self.assertAllLess(dy_dx_[-2:], 0)
Example #5
def gradients(f,
              xs,
              output_gradients=None,
              use_gradient_tape=False,
              name=None):
    """Computes the gradients of `f` wrt to `*xs`.

  Args:
    f: Python `callable` to be differentiated.
    xs: Python list of parameters of `f` with respect to which to
      differentiate. (Can also be a single `Tensor`.)
    output_gradients: A `Tensor` or list of `Tensor`s the same size as the
      result `ys = f(*xs)` and holding the gradients computed for each `y` in
      `ys`. This argument is forwarded to the underlying gradient implementation
      (i.e., either the `grad_ys` argument of `tf.gradients` or the
      `output_gradients` argument of `tf.GradientTape.gradient`).
    use_gradient_tape: Python `bool` indicating that `tf.GradientTape` should be
      used regardless of `tf.executing_eagerly()` status.
      Default value: `False`.
    name: Python `str` name prefixed to ops created by this function.
      Default value: `None` (i.e., `'gradients'`).

  Returns:
    A `Tensor` with the gradient of `y` wrt each of `xs`.
  """
    _, grad = value_and_gradient(f,
                                 xs,
                                 output_gradients=output_gradients,
                                 use_gradient_tape=use_gradient_tape,
                                 name=name or 'gradients')
    return grad
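A hypothetical usage sketch of this wrapper (assuming `value_and_gradient` from `tfp.math` is in scope, as the body above requires):

import tensorflow as tf

x = tf.constant(2.0)
y = tf.constant(3.0)
# For f(a, b) = a * b: df/da = b = 3, df/db = a = 2.
df_da, df_db = gradients(lambda a, b: a * b, [x, y])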
Example #6
    def testGradientOutsideAndOnEdgeOfSupport(self, dtype):
        finfo = np.finfo(dtype)
        x = tf.convert_to_tensor(
            [
                # Sqrt(finfo.max)**2 = finfo.max < Inf, so
                # round_exponential_bump_function == 0 here.
                -np.sqrt(finfo.max),
                # -2 is just outside the support, so round_exponential_bump_function
                # should == 0.
                -2.,
                # -1 is on boundary of support, so round_exponential_bump_function
                # should == 0.
                # The gradient should also equal 0.
                -1.,
                1.,
                2.0,
                np.sqrt(finfo.max),
            ],
            dtype=dtype)
        _, dy_dx = tfp_math.value_and_gradient(
            tfp_math.round_exponential_bump_function, x)

        self.assertDTypeEqual(dy_dx, dtype)

        dy_dx_ = self.evaluate(dy_dx)

        # Since each x is outside or on the boundary of the support, the
        # gradient is zero.
        self.assertAllEqual(dy_dx_, np.zeros((6, )))
Example #7
 def test_can_take_loop_gradient_inside_xla(self):
   def loss_fn(v):
     return loop_util.trace_scan(lambda x, t: x + v,
                                 0.,
                                 tf.range(10),
                                 trace_fn=lambda x: x)[0]
   xla_grad = tf.function(lambda v: tfp_math.value_and_gradient(loss_fn, v)[1],
                          jit_compile=True)(0.)
   self.assertAllClose(xla_grad, 10.)
Example #8
 def testInverseGaussianFullyReparameterized(self):
     concentration = tf.constant(4.0)
     loc = tf.constant(3.0)
     _, [grad_concentration, grad_loc] = tfm.value_and_gradient(
         lambda a, b: tfd.InverseGaussian(a, b, validate_args=True).  # pylint: disable=g-long-lambda
         sample(100, seed=test_util.test_seed()),
         [concentration, loc])
     self.assertIsNotNone(grad_concentration)
     self.assertIsNotNone(grad_loc)
Example #9
    def testLeftTailGrad(self, dtype, do_compile):
        x = np.linspace(-50., -8., 1000).astype(dtype)

        @tf.function(autograph=False, jit_compile=do_compile)
        def fn(x):
            return tf.math.log(tfb.Softplus().forward(x))

        _, grad = tfp_math.value_and_gradient(fn, x)

        true_grad = 1 / (1 + np.exp(-x)) / np.log1p(np.exp(x))
        self.assertAllClose(true_grad, self.evaluate(grad), atol=1e-3)
Example #10
    def testGradients(self):
        maf = tfb.MaskedAutoregressiveFlow(validate_args=True,
                                           **self._autoregressive_flow_kwargs)

        def _transform(x):
            y = maf.forward(x)
            return maf.inverse(tf.identity(y))

        self.evaluate(tf1.global_variables_initializer())
        _, gradient = tfp_math.value_and_gradient(_transform,
                                                  tf.zeros(self.event_shape))
        self.assertIsNotNone(gradient)
Example #11
def gradients(func_or_y,
              xs,
              output_gradients=None,
              use_gradient_tape=False,
              name=None):
    """Computes the gradients of `func_or_y` wrt to `*xs`.

  Args:
    func_or_y: Either a `Tensor` connected to the input `x` or a Python callable
      accepting one `Tensor` of shape of `x` and returning a `Tensor` of any
      shape. The function whose gradient is to be computed. If eagerly
      executing, can only be a callable, i.e., one should not supply a Tensor
      in eager mode.
    xs: Python list of parameters of `func_or_y` with respect to which to
      differentiate. (Can also be a single `Tensor`.)
    output_gradients: A `Tensor` or list of `Tensor`s the same size as the
      result `ys = f(*xs)` and holding the gradients computed for each `y` in
      `ys`. This argument is forwarded to the underlying gradient implementation
      (i.e., either the `grad_ys` argument of `tf.gradients` or the
      `output_gradients` argument of `tf.GradientTape.gradient`).
      Default value: `None` which maps to a ones-like `Tensor` of `ys`.
    use_gradient_tape: Python `bool` indicating that `tf.GradientTape` should be
      used regardless of `tf.executing_eagerly()` status.
      Default value: `False`.
    name: Python `str` name prefixed to ops created by this function.
      Default value: `None` (i.e., 'gradients').

  Returns:
    A `Tensor` with the gradient of `y` wrt each of `xs` or a list of `Tensor`s
    if `xs` is a list.
  """
    f = _prepare_func(func_or_y)
    if not tf.executing_eagerly() and not use_gradient_tape:
        with tf.name_scope(name or "gradients"):
            xs, is_xs_list_like = _prepare_args(xs)
            y = f(*xs)
            grad = tf.gradients(y, xs, grad_ys=output_gradients)
            if is_xs_list_like:
                return grad
            else:
                return grad[0]
    if not callable(func_or_y):
        raise ValueError(
            "`func_or_y` should be a callable in eager mode or when "
            "`tf.GradientTape` is used.")
    _, grad = value_and_gradient(f,
                                 xs,
                                 output_gradients=output_gradients,
                                 use_gradient_tape=use_gradient_tape,
                                 name=name or "gradients")
    return grad
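Unlike the version in Example #5, this wrapper also accepts a `Tensor` already connected to `xs` when running in graph mode. A hypothetical sketch of that path, assuming `_prepare_func` wraps a `Tensor` into a callable as the graph branch suggests:

import tensorflow.compat.v1 as tf1

with tf1.Graph().as_default():
    x = tf1.constant([1.0, 2.0])
    y = tf1.reduce_sum(x**2)  # A Tensor connected to `x`.
    dy_dx = gradients(y, x)   # Dispatches to `tf.gradients(y, x)`.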
Example #12
    def testDistribution(self, dist_name, data):
        dist = data.draw(
            dhps.base_distributions(
                dist_name=dist_name,
                enable_vars=False,
                # Unregularized MLEs can be numerically problematic, e.g., empirical
                # (co)variances can be singular. To avoid such numerical issues, we
                # sanity-check the MLE only for a fixed sample with assumed-sane
                # parameter values (zeros constrained to the parameter support).
                param_strategy_fn=_constrained_zeros_fn,
                batch_shape=data.draw(
                    tfp_hps.shapes(min_ndims=0, max_ndims=2, max_side=5))))
        x, lp = self.evaluate(
            dist.experimental_sample_and_log_prob(
                10, seed=test_util.test_seed(sampler_type='stateless')))

        try:
            parameters = self.evaluate(
                type(dist)._maximum_likelihood_parameters(x))
        except NotImplementedError:
            self.skipTest('Fitting not implemented.')

        flat_params = tf.nest.flatten(parameters)
        lp_fn = lambda *flat_params: type(dist)(  # pylint: disable=g-long-lambda
            validate_args=True,
            **tf.nest.pack_sequence_as(parameters, flat_params)).log_prob(x)
        lp_mle, grads = self.evaluate(
            tfp_math.value_and_gradient(lp_fn, flat_params))

        # Likelihood of MLE params should be higher than of the original params.
        self.assertAllGreaterEqual(
            tf.reduce_sum(lp_mle, axis=0) - tf.reduce_sum(lp, axis=0), -1e-4)

        if dist_name not in MLE_AT_CONSTRAINT_BOUNDARY:
            # MLE parameters should be a critical point of the log prob.
            for g in grads:
                if np.any(np.isnan(g)):
                    # Skip parameters with undefined or unstable gradients (e.g.,
                    # Categorical `num_classes`).
                    continue
                self.assertAllClose(tf.zeros_like(g), g, atol=1e-2)
Example #13
    def testCompareToExplicitGradient(self):
        """Compare to the explicit reparameterization derivative."""
        concentration_np = np.arange(4)[..., np.newaxis] + 1.
        concentration = tf.constant(concentration_np, self.dtype)
        loc_np = np.arange(3) + 1.
        loc = tf.constant(loc_np, self.dtype)

        def gen_samples(l, c):
            return tfd.InverseGaussian(l, c).sample(2,
                                                    seed=test_util.test_seed())

        samples, [loc_grad, concentration_grad] = self.evaluate(
            tfm.value_and_gradient(gen_samples, [loc, concentration]))
        self.assertEqual(samples.shape, (2, 4, 3))
        self.assertEqual(concentration_grad.shape, concentration.shape)
        self.assertEqual(loc_grad.shape, loc.shape)

        # Compute the gradient by computing the derivative of gammaincinv
        # over each entry and summing.
        def expected_grad(s, l, c):
            u = _scipy_invgauss(l, c).cdf(s)
            delta = 1e-4
            return (sp_misc.derivative(lambda x: _scipy_invgauss(x, c).ppf(u),
                                       l,
                                       dx=delta * l),
                    sp_misc.derivative(lambda x: _scipy_invgauss(l, x).ppf(u),
                                       c,
                                       dx=delta * c))

        expected_loc_grad, expected_concentration_grad = expected_grad(
            samples, loc_np, concentration_np)

        self.assertAllClose(concentration_grad,
                            np.sum(expected_concentration_grad,
                                   axis=(0, 2))[..., np.newaxis],
                            rtol=1e-3)

        self.assertAllClose(loc_grad,
                            np.sum(expected_loc_grad, axis=(0, 1)),
                            rtol=1e-3)
Example #14
def _scatter_nd_batch(indices, updates, shape, batch_dims=0):
    """A partial implementation of `scatter_nd` supporting `batch_dims`."""

    # `tf.scatter_nd` does not support a `batch_dims` argument.
    # Instead we use the gradient of `tf.gather_nd`.
    # From a purely mathematical perspective this works because
    # (if `tf.scatter_nd` supported `batch_dims`)
    # `gather_nd` and `scatter_nd` (with matching `indices`) are
    # adjoint linear operators and
    # the gradient w.r.t `x` of `dot(y, A(x))` is `adjoint(A)(y)`.
    #
    # Another perspective: back propagating through a "neural" network
    # containing a gather operation carries derivatives backwards through the
    # network, accumulating the derivatives in the locations that
    # were gathered from, i.e., they are scattered.
    # If the network multiplies each gathered element by
    # some quantity, then the backwardly propagating derivatives are scaled
    # by this quantity before being scattered.
    # Combining this with the fact that `GradientTape.gradient`
    # starts back-propagation with derivatives equal to `1`, this allows us
    # to use the multipliers to determine the quantities scattered.
    #
    # However, derivatives are only supported for floating point types
    # so we 'tunnel' our types through the `float64` type.
    # So the implementation is "partial" in the sense that it supports
    # data that can be losslessly converted to `tf.float64` and back.
    dtype = updates.dtype
    internal_dtype = tf.float64
    multipliers = ps.cast(updates, internal_dtype)

    def weighted_gathered(zeros):
        return multipliers * tf.gather_nd(
            zeros, indices, batch_dims=batch_dims)

    zeros = tf.zeros(shape, dtype=internal_dtype)
    _, grad = value_and_gradient(weighted_gathered, zeros)
    return ps.cast(grad, dtype=dtype)
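A hypothetical sketch of the adjoint trick on a tiny batched example; the expected result can be checked by hand:

import tensorflow as tf

# Batch of 2; scatter one integer update into a length-3 row per batch member.
indices = tf.constant([[[1]], [[2]]])           # shape [2, 1, 1]
updates = tf.constant([[10], [20]], tf.int32)   # shape [2, 1]
out = _scatter_nd_batch(indices, updates, shape=(2, 3), batch_dims=1)
# out == [[0, 10, 0], [0, 0, 20]]; the int32 values round-trip through
# float64 losslessly, as the comment above requires.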
Example #15
 def _dy_dx_fwd(unused_y):
     first_order = lambda x: tfp_math.value_and_gradient(fn, x)[1]
     dy_dx, d2y_dx2 = tfp_math.value_and_gradient(first_order, x)
     return dy_dx, (dy_dx, d2y_dx2)  # Auxiliary values for the second-order pass.
Example #16
 def _dy_dx_jvp(primals, tangents):
     unused_y, = primals
     dy, = tangents
     first_order = lambda x: tfp_math.value_and_gradient(fn, x)[1]
     dy_dx, ddy_dx2 = tfp_math.value_and_gradient(first_order, x)
     return dy_dx, (dy / dy_dx) * ddy_dx2
Example #17
 def _dy_dx_fn(y):
     del y  # Unused.
     _, dy_dx = tfp_math.value_and_gradient(fn, x)
     return dy_dx
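Examples #15 to #17 all rely on the same nesting pattern: `value_and_gradient` is itself differentiable, so applying it to a first-order gradient function yields second derivatives. A minimal hypothetical check:

import tensorflow as tf
import tensorflow_probability as tfp

tfp_math = tfp.math

fn = lambda x: x**3
x = tf.constant(2.0)

first_order = lambda x: tfp_math.value_and_gradient(fn, x)[1]
dy_dx, d2y_dx2 = tfp_math.value_and_gradient(first_order, x)
# dy_dx == 3 * x**2 == 12; d2y_dx2 == 6 * x == 12.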
Example #18
 def testBijectorForwardGradient(self):
     x_np = np.array([0.1, 2.23, 4.1], dtype=self.dtype)
     x = tf.constant(x_np)
     grad = value_and_gradient(tfb.Softfloor(self.dtype(1.2)).forward, x)[1]
     self.assertAllClose(_softfloor_grad_np(x_np, 1.2), grad)
Example #19
    def __init__(self,
                 target_log_prob_fn,
                 step_size,
                 max_tree_depth=10,
                 unrolled_leapfrog_steps=1,
                 use_auto_batching=True,
                 stackless=False,
                 backend=None,
                 seed=None,
                 name=None):
        """Initializes this transition kernel.

    Args:
      target_log_prob_fn: Python callable which takes an argument like
        `current_state` (or `*current_state` if it's a list) and returns its
        (possibly unnormalized) log-density under the target distribution.  Due
        to limitations of the underlying auto-batching system,
        target_log_prob_fn may be invoked with junk data at some batch indexes,
        which it must process without crashing.  (The results at those indexes
        are ignored).
      step_size: `Tensor` or Python `list` of `Tensor`s representing the step
        size for the leapfrog integrator. Must broadcast with the shape of
        `current_state`. Larger step sizes lead to faster progress, but
        too-large step sizes make rejection exponentially more likely. When
        possible, it's often helpful to match per-variable step sizes to the
        standard deviations of the target distribution in each variable.
      max_tree_depth: Maximum depth of the tree implicitly built by NUTS. The
        maximum number of leapfrog steps is bounded by `2**max_tree_depth - 1`,
        i.e., the number of nodes in a binary tree `max_tree_depth` levels deep.
        The default setting of 10 takes up to 1023 leapfrog steps.
      unrolled_leapfrog_steps: The number of leapfrogs to unroll per tree
        expansion step. Applies a direct linear multiplier to the maximum
        trajectory length implied by max_tree_depth. Defaults to 1. This
        parameter can be useful for amortizing the auto-batching control flow
        overhead.
      use_auto_batching: Boolean.  If `False`, do not invoke the auto-batching
        system; operate on batch size 1 only.
      stackless: Boolean.  If `True`, invoke the stackless version of
        the auto-batching system.  Only works in Eager mode.
      backend: Auto-batching backend object. Falls back to a default
        TensorFlowBackend().
      seed: Python integer to seed the random number generator.
      name: Python `str` name prefixed to Ops created by this function.
        Default value: `None` (i.e., 'nuts_kernel').
    """
        self._parameters = dict(locals())
        del self._parameters["self"]
        self.target_log_prob_fn = target_log_prob_fn
        self.step_size = step_size
        if max_tree_depth < 1:
            raise ValueError("max_tree_depth must be >= 1 but was {}".format(
                max_tree_depth))
        self.max_tree_depth = max_tree_depth
        self.unrolled_leapfrog_steps = unrolled_leapfrog_steps
        self.use_auto_batching = use_auto_batching
        self.stackless = stackless
        self.backend = backend
        self._seed_stream = distributions.SeedStream(seed, "nuts_one_step")
        self.name = "nuts_kernel" if name is None else name
        # TODO(b/125544625): Identify why we need `use_gradient_tape=True`, i.e.,
        # what's different between `tape.gradient` and `tf.gradients`.
        value_and_gradients_fn = lambda *args: tfp_math.value_and_gradient(  # pylint: disable=g-long-lambda
            self.target_log_prob_fn,
            args,
            use_gradient_tape=True)
        self.value_and_gradients_fn = _embed_no_none_gradient_check(
            value_and_gradients_fn)
        max_tree_edges = max_tree_depth - 1
        self.evolve_trajectory, self.autobatch_context = _make_evolve_trajectory(
            self.value_and_gradients_fn, max_tree_edges,
            unrolled_leapfrog_steps, self._seed_stream)
        self._block_code_cache = {}
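The docstring's bound on trajectory length is simple arithmetic; a hypothetical illustration:

# Each tree of depth `max_tree_depth` contributes at most 2**max_tree_depth - 1
# leapfrog steps, each unrolled `unrolled_leapfrog_steps` times.
max_tree_depth = 10
unrolled_leapfrog_steps = 1
max_leapfrog_steps = (2**max_tree_depth - 1) * unrolled_leapfrog_steps
assert max_leapfrog_steps == 1023  # Matches the default noted above.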
Example #20
 def val_and_grad(x):
     return value_and_gradient(value_fn, x)
Example #21
 def _grad_fn(x, *args):
     _, grads = tfp_math.value_and_gradient(fn, x, *args)
     return grads if args else [grads]  # Always return a list.
Example #22
 def _vjp_bwd(x, grad_x):
     _, grads = tfp_math.value_and_gradient(self.fn, x)
     return (grad_x / grads, )
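The `_vjp_bwd` snippet reads as the backward rule for an inverse function: if `y = fn(x)`, then `d(fn^-1)/dy = 1 / fn'(x)`, so the incoming cotangent is divided by the forward derivative. A hypothetical numeric check with `fn = tf.exp`, whose inverse is `tf.math.log`:

import tensorflow as tf
import tensorflow_probability as tfp

tfp_math = tfp.math

x = tf.constant(2.0)
_, grads = tfp_math.value_and_gradient(tf.exp, x)
cotangent = tf.constant(1.0)
inverse_grad = cotangent / grads  # == exp(-x), i.e. d(log)/dy at y = exp(x).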