Example #1
    def closure(self, x):
        """Evaluate the function, gradient, and hessian/hessian-product

        This method represents the core function call. It is used for
        computing newton/quasi newton directions, etc.
        """
        x = x.detach().requires_grad_(True)
        with torch.enable_grad():
            f = self.fun(x)
            grad = autograd.grad(f, x, create_graph=self._hessp
                                 or self._hess)[0]
        hessp = None
        hess = None
        if self._hessp:
            # Lazy Hessian-vector products: wrap the autograd graph of `grad`
            # in a linear operator instead of materializing the Hessian.
            hessp = JacobianLinearOperator(x,
                                           grad,
                                           symmetric=self._twice_diffable)
        if self._hess:
            if self._I is None:
                self._I = torch.eye(x.numel(), dtype=x.dtype, device=x.device)
            # Dense Hessian: vmap a Hessian-vector product over the rows of the identity.
            hvp = lambda v: autograd.grad(grad, x, v, retain_graph=True)[0]
            hess = _vmap(hvp)(self._I)

        return sf_value(f=f.detach(),
                        grad=grad.detach(),
                        hessp=hessp,
                        hess=hess)
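
The dense branch above materializes the full Hessian by vmapping a Hessian-vector product over the rows of an identity matrix. A minimal standalone sketch of the same idea, using the same private `_vmap` helper these snippets rely on (the name `dense_hessian` and the test function are illustrative, not part of the original code):

import torch
from torch import autograd
from torch._vmap_internals import _vmap  # private prototype vmap used throughout these examples


def dense_hessian(fun, x):
    # Differentiate once with create_graph=True so the gradient itself is differentiable.
    x = x.detach().requires_grad_(True)
    with torch.enable_grad():
        f = fun(x)
        grad = autograd.grad(f, x, create_graph=True)[0]
    I = torch.eye(x.numel(), dtype=x.dtype, device=x.device)
    # Each basis vector gives one Hessian-vector product; vmapping over I
    # stacks them into the full (symmetric) Hessian.
    hvp = lambda v: autograd.grad(grad, x, v, retain_graph=True)[0]
    return _vmap(hvp)(I)


# f(x) = sum(x**3) has Hessian diag(6 * x)
x = torch.tensor([1.0, 2.0, 3.0])
print(dense_hessian(lambda t: (t ** 3).sum(), x))
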
Example #2
 def fun_with_jac(x):
     x = x.view_as(x0).detach().requires_grad_(True)
     with torch.enable_grad():
         f = fun(x).view(output_size)
     # One reverse-mode VJP per output component: vmapping over the rows of I
     # (the output-space identity) yields the full Jacobian in a single call.
     J = _vmap(lambda v: autograd.grad(f, x, v)[0])(I)
     J = J.view(output_size, input_size)
     return f.detach(), J
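
`fun_with_jac` closes over `fun`, `x0`, `I`, `output_size` and `input_size` from its enclosing scope, with `I` the identity of the output space (its rows are used as `grad_outputs`). A hedged sketch of that setup, with the same computation written out inline and checked against `torch.autograd.functional.jacobian` (the particular `fun` is illustrative):

import torch
from torch import autograd
from torch._vmap_internals import _vmap
from torch.autograd.functional import jacobian

fun = lambda x: torch.stack([x.sum(), (x ** 2).sum()])  # illustrative map R^3 -> R^2
x0 = torch.randn(3)
input_size, output_size = x0.numel(), fun(x0).numel()
I = torch.eye(output_size, dtype=x0.dtype)  # one basis vector per output component

# The same computation as fun_with_jac, written out inline:
x = x0.detach().requires_grad_(True)
with torch.enable_grad():
    f = fun(x).view(output_size)
J = _vmap(lambda v: autograd.grad(f, x, v)[0])(I).view(output_size, input_size)

assert torch.allclose(J, jacobian(fun, x0))
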
Example #3
def _jacfwd(func, inputs, strict=False, vectorize=False):
    if strict:
        raise RuntimeError('torch.autograd.functional.jacobian: `strict=True` '
                           'and `strategy="forward-mode"` are not supported together (yet). '
                           'Please either set `strict=False` or '
                           '`strategy="reverse-mode"`.')
    is_inputs_tuple, inputs = _as_tuple(inputs, "inputs", "jacobian")
    output_info = []

    if vectorize:
        # See NOTE: [Computing jacobian with vmap and grad for multiple outputs]
        input_numels = tuple(input.numel() for input in inputs)

        # Step 1: Prepare tangents
        tangents = _construct_standard_basis_for(inputs, input_numels)

        # Step 2: Compute vmap over computation with dual tensors
        def jvp(tangents):
            with fwAD.dual_level():
                dual_inputs = tuple(
                    fwAD.make_dual(input, tangent.view_as(input)) for input, tangent in zip(inputs, tangents))
                _is_outputs_tuple, dual_outputs = _as_tuple(func(*dual_inputs), "outputs")
                output_info.append(_is_outputs_tuple)
                jv = []
                primal_outs = []
                for dual_out in dual_outputs:
                    primal, tangent = fwAD.unpack_dual(dual_out)
                    primal_outs.append(primal)
                    if tangent is not None:
                        jv.append(tangent)
                    else:
                        jv.append(torch.zeros_like(primal))
                output_info.append(primal_outs)
                return tuple(jv)

        outputs_before_split = _vmap(jvp)(tangents)
        is_outputs_tuple, outputs = output_info
        # Step 3: for each of the output tangents, split along dim 0
        jacobian_input_output = []
        for jac_output_i, output_i in zip(outputs_before_split, outputs):
            jacobian_output_i_output = []
            for jac, input_j in zip(jac_output_i.split(input_numels, dim=0), inputs):
                # We need to transpose the Jacobian because in forward AD, the
                # batch dimension represents that of the inputs
                jacobian_input_i_output_j = jac.permute(*range(1, jac.ndim), 0) \
                    .reshape(tuple([*output_i.shape, *input_j.shape]))  # noqa: C409

                jacobian_output_i_output.append(jacobian_input_i_output_j)
            jacobian_input_output.append(jacobian_output_i_output)

        # Omit [Step 4] because everything is already transposed w/ forward AD
        return _tuple_postprocess(jacobian_input_output, (is_outputs_tuple, is_inputs_tuple))
    else:
        raise NotImplementedError("Computing Jacobian using forward-AD or forward-over-reverse Hessian is "
                                  "only implemented for `vectorize=True`.")
Example #4
    def closure(self, x):
        x = x.detach().requires_grad_(True)
        with torch.enable_grad():
            f = self.fun(x)
        jacp = None
        jac = None
        if self._jacp:
            # Lazy Jacobian-vector products via a linear operator over the graph of f.
            jacp = JacobianLinearOperator(x, f)
        if self._jac:
            if self._I is None:
                self._I = torch.eye(x.numel(), dtype=x.dtype, device=x.device)
            # Dense Jacobian: vmap vector-Jacobian products (grad_outputs = rows of
            # the identity); note this assumes f and x have the same numel.
            vjp = lambda v: autograd.grad(f, x, v, retain_graph=True)[0]
            jac = _vmap(vjp)(self._I)

        return vf_value(f=f.detach(), jacp=jacp, jac=jac)
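
Because the identity passed as `grad_outputs` is `eye(x.numel())`, the dense branch implicitly assumes `f` and `x` have the same number of elements (a square Jacobian), as in root-finding problems. Under that assumption the result matches `torch.autograd.functional.jacobian`; a hedged check with an illustrative `fun`:

import torch
from torch import autograd
from torch._vmap_internals import _vmap
from torch.autograd.functional import jacobian

fun = lambda x: torch.stack([x[0] * x[1], x[1] ** 2, x[2].exp()])  # illustrative R^3 -> R^3
x = torch.randn(3).requires_grad_(True)

with torch.enable_grad():
    f = fun(x)
I = torch.eye(x.numel(), dtype=x.dtype, device=x.device)
# Each row e_i of I gives e_i^T J, i.e. one row of the Jacobian.
jac = _vmap(lambda v: autograd.grad(f, x, v, retain_graph=True)[0])(I)

assert torch.allclose(jac, jacobian(fun, x.detach()))
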
Example #5
def _jacobian(inputs, outputs):
    """A modified variant of torch.autograd.functional.jacobian for
    pre-computed outputs.

    This is only used for nonlinear parameter constraints (if provided).
    """
    is_inputs_tuple, inputs = _as_tuple(inputs, "inputs", "jacobian")
    is_outputs_tuple, outputs = _as_tuple(outputs, "outputs", "jacobian")

    output_numels = tuple(output.numel() for output in outputs)
    grad_outputs = _construct_standard_basis_for(outputs, output_numels)
    with torch.enable_grad():
        flat_outputs = tuple(output.reshape(-1) for output in outputs)

    def vjp(grad_output):
        vj = list(
            torch.autograd.grad(flat_outputs,
                                inputs,
                                grad_output,
                                allow_unused=True))
        for el_idx, vj_el in enumerate(vj):
            if vj_el is not None:
                continue
            vj[el_idx] = torch.zeros_like(inputs[el_idx])
        return tuple(vj)

    jacobians_of_flat_output = _vmap(vjp)(grad_outputs)

    jacobian_input_output = []
    for jac, input_i in zip(jacobians_of_flat_output, inputs):
        jacobian_input_i_output = []
        for jac, output_j in zip(jac.split(output_numels, dim=0), outputs):
            jacobian_input_i_output_j = jac.view(output_j.shape +
                                                 input_i.shape)
            jacobian_input_i_output.append(jacobian_input_i_output_j)
        jacobian_input_output.append(jacobian_input_i_output)

    jacobian_output_input = tuple(zip(*jacobian_input_output))

    jacobian_output_input = _grad_postprocess(jacobian_output_input,
                                              create_graph=False)
    return _tuple_postprocess(jacobian_output_input,
                              (is_outputs_tuple, is_inputs_tuple))
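
A hedged usage sketch: the point of this variant is that `outputs` already carries an autograd graph, so no extra forward pass through the constraint function is needed. This assumes `_jacobian` and the private `torch.autograd.functional` helpers it uses are in scope; the constraint expressions are illustrative:

import torch

x = torch.randn(4, requires_grad=True)
with torch.enable_grad():
    # Pre-computed nonlinear constraint values, still attached to the graph.
    constraints = torch.stack([x.sum() - 1.0, (x ** 2).sum() - 1.0])

J = _jacobian(x, constraints)  # single input and output -> a (2, 4) Tensor
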
Example #6
def jacobian(func, inputs, create_graph=False, strict=False, vectorize=False):
    r"""Function that computes the Jacobian of a given function.

    Args:
        func (function): a Python function that takes Tensor inputs and returns
            a tuple of Tensors or a Tensor.
        inputs (tuple of Tensors or Tensor): inputs to the function ``func``.
        create_graph (bool, optional): If ``True``, the Jacobian will be
            computed in a differentiable manner. Note that when ``strict`` is
            ``False``, the result may not require gradients or may be
            disconnected from the inputs. Defaults to ``False``.
        strict (bool, optional): If ``True``, an error will be raised when we
            detect that there exists an input such that all the outputs are
            independent of it. If ``False``, we return a Tensor of zeros as the
            jacobian for said inputs, which is the expected mathematical value.
            Defaults to ``False``.
        vectorize (bool, optional): This feature is experimental; please use it
            at your own risk. When computing the jacobian, we usually invoke
            ``autograd.grad`` once per row of the jacobian. If this flag is
            ``True``, we use the vmap prototype feature as the backend to
            vectorize calls to ``autograd.grad`` so we only invoke it once
            instead of once per row. This should lead to performance
            improvements in many use cases, however, due to this feature
            being incomplete, there may be performance cliffs. Please
            use `torch._C._debug_only_display_vmap_fallback_warnings(True)`
            to show any performance warnings and file us issues if
            warnings exist for your use case. Defaults to ``False``.

    Returns:
        Jacobian (Tensor or nested tuple of Tensors): if there is a single
        input and output, this will be a single Tensor containing the
        Jacobian for the linearized inputs and output. If one of the two is
        a tuple, then the Jacobian will be a tuple of Tensors. If both of
        them are tuples, then the Jacobian will be a tuple of tuples of
        Tensors where ``Jacobian[i][j]`` will contain the Jacobian of the
        ``i``\th output and ``j``\th input. Its size will be the
        concatenation of the sizes of the corresponding output and the
        corresponding input, and it will have the same dtype and device as
        the corresponding input.

    Example:

        >>> def exp_reducer(x):
        ...   return x.exp().sum(dim=1)
        >>> inputs = torch.rand(2, 2)
        >>> jacobian(exp_reducer, inputs)
        tensor([[[1.4917, 2.4352],
                 [0.0000, 0.0000]],
                [[0.0000, 0.0000],
                 [2.4369, 2.3799]]])

        >>> jacobian(exp_reducer, inputs, create_graph=True)
        tensor([[[1.4917, 2.4352],
                 [0.0000, 0.0000]],
                [[0.0000, 0.0000],
                 [2.4369, 2.3799]]], grad_fn=<ViewBackward>)

        >>> def exp_adder(x, y):
        ...   return 2 * x.exp() + 3 * y
        >>> inputs = (torch.rand(2), torch.rand(2))
        >>> jacobian(exp_adder, inputs)
        (tensor([[2.8052, 0.0000],
                [0.0000, 3.3963]]),
         tensor([[3., 0.],
                 [0., 3.]]))
    """

    with torch.enable_grad():
        is_inputs_tuple, inputs = _as_tuple(inputs, "inputs", "jacobian")
        inputs = _grad_preprocess(inputs,
                                  create_graph=create_graph,
                                  need_graph=True)

        outputs = func(*inputs)
        is_outputs_tuple, outputs = _as_tuple(
            outputs, "outputs of the user-provided function", "jacobian")
        _check_requires_grad(outputs, "outputs", strict=strict)

        if vectorize:
            if strict:
                raise RuntimeError(
                    'torch.autograd.functional.jacobian: `strict=True` '
                    'and `vectorize=True` are not supported together. '
                    'Please either set `strict=False` or '
                    '`vectorize=False`.')
            # NOTE: [Computing jacobian with vmap and grad for multiple outputs]
            #
            # Let's consider f(x) = (x**2, x.sum()) and let x = torch.randn(3).
            # It turns out we can compute the jacobian of this function with a single
            # call to autograd.grad by using vmap over the correct grad_outputs.
            #
            # Firstly, one way to compute the jacobian is to flatten and
            # concatenate x**2 and x.sum() into a vector in R^4, e.g. use
            # g(x) = torch.cat([x**2, x.sum().reshape(1)])
            #
            # To get the first row of the jacobian, we call
            # >>> autograd.grad(g(x), x, grad_outputs=torch.tensor([1, 0, 0, 0]))
            # To get the 2nd row of the jacobian, we call
            # >>> autograd.grad(g(x), x, grad_outputs=torch.tensor([0, 1, 0, 0]))
            # and so on.
            #
            # Using vmap, we can vectorize all 4 of these computations into one by
            # passing the standard basis for R^4 as the grad_output.
            # vmap(partial(autograd.grad, g(x), x))(torch.eye(4)).
            #
            # Now, how do we compute the jacobian *without stacking the output*?
            # We can just split the standard basis across the outputs. So to
            # compute the jacobian of f(x), we'd use
            # >>> autograd.grad(f(x), x, grad_outputs=_construct_standard_basis_for(...))
            # The grad_outputs looks like the following:
            # ( torch.tensor([[1, 0, 0],
            #                 [0, 1, 0],
            #                 [0, 0, 1],
            #                 [0, 0, 0]]),
            #   torch.tensor([[0],
            #                 [0],
            #                 [0],
            #                 [1]]) )
            #
            # But we're not done yet!
            # >>> vmap(partial(autograd.grad, f(x), x))(grad_outputs)
            # returns a Tensor of shape [4, 3]. We have to remember to split the
            # jacobian of shape [4, 3] into two:
            # - one of shape [3, 3] for the first output
            # - one of shape [   3] for the second output

            # Step 1: Construct grad_outputs by splitting the standard basis
            output_numels = tuple(output.numel() for output in outputs)
            grad_outputs = _construct_standard_basis_for(
                outputs, output_numels)
            flat_outputs = tuple(output.reshape(-1) for output in outputs)

            # Step 2: Call vmap + autograd.grad
            def vjp(grad_output):
                vj = list(
                    _autograd_grad(flat_outputs,
                                   inputs,
                                   grad_output,
                                   create_graph=create_graph))
                for el_idx, vj_el in enumerate(vj):
                    if vj_el is not None:
                        continue
                    vj[el_idx] = torch.zeros_like(inputs[el_idx])
                return tuple(vj)

            jacobians_of_flat_output = _vmap(vjp)(grad_outputs)

            # Step 3: The returned jacobian is one big tensor per input. In this step,
            # we split each Tensor by output.
            jacobian_input_output = []
            for jac, input_i in zip(jacobians_of_flat_output, inputs):
                jacobian_input_i_output = []
                for jac, output_j in zip(jac.split(output_numels, dim=0),
                                         outputs):
                    jacobian_input_i_output_j = jac.view(output_j.shape +
                                                         input_i.shape)
                    jacobian_input_i_output.append(jacobian_input_i_output_j)
                jacobian_input_output.append(jacobian_input_i_output)

            # Step 4: Right now, `jacobian` is a List[List[Tensor]].
            # The outer List corresponds to the number of inputs,
            # the inner List corresponds to the number of outputs.
            # We need to exchange the order of these and convert to tuples
            # before returning.
            jacobian_output_input = tuple(zip(*jacobian_input_output))

            jacobian_output_input = _grad_postprocess(jacobian_output_input,
                                                      create_graph)
            return _tuple_postprocess(jacobian_output_input,
                                      (is_outputs_tuple, is_inputs_tuple))

        jacobian: Tuple[torch.Tensor, ...] = tuple()

        for i, out in enumerate(outputs):

            # mypy complains that expression and variable have different types due to the empty list
            jac_i: Tuple[List[torch.Tensor], ...] = tuple(
                [] for _ in range(len(inputs)))  # type: ignore[assignment]
            for j in range(out.nelement()):
                vj = _autograd_grad((out.reshape(-1)[j], ),
                                    inputs,
                                    retain_graph=True,
                                    create_graph=create_graph)

                for el_idx, (jac_i_el, vj_el,
                             inp_el) in enumerate(zip(jac_i, vj, inputs)):
                    if vj_el is not None:
                        if strict and create_graph and not vj_el.requires_grad:
                            msg = (
                                "The jacobian of the user-provided function is "
                                "independent of input {}. This is not allowed in "
                                "strict mode when create_graph=True.".format(i)
                            )
                            raise RuntimeError(msg)
                        jac_i_el.append(vj_el)
                    else:
                        if strict:
                            msg = (
                                "Output {} of the user-provided function is "
                                "independent of input {}. This is not allowed in "
                                "strict mode.".format(i, el_idx))
                            raise RuntimeError(msg)
                        jac_i_el.append(torch.zeros_like(inp_el))

            jacobian += (tuple(
                torch.stack(jac_i_el, dim=0).view(out.size() +
                                                  inputs[el_idx].size())
                for (el_idx, jac_i_el) in enumerate(jac_i)), )

        jacobian = _grad_postprocess(jacobian, create_graph)

        return _tuple_postprocess(jacobian,
                                  (is_outputs_tuple, is_inputs_tuple))
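
The long NOTE in the vectorized branch can be made concrete with a standalone version of Steps 1-3 for its example f(x) = (x**2, x.sum()) with x in R^3, reusing the same private `_vmap` that this module imports:

import torch
from torch import autograd
from torch._vmap_internals import _vmap

x = torch.randn(3, requires_grad=True)
outputs = (x ** 2, x.sum())                       # f(x) from the NOTE
flat_outputs = tuple(o.reshape(-1) for o in outputs)

# Step 1: split the standard basis of R^4 across the two outputs
# (exactly the pair of grad_outputs tensors drawn in the NOTE).
basis = torch.eye(4, dtype=x.dtype)
grad_outputs = (basis[:, :3], basis[:, 3:])

# Step 2: a single vmapped autograd.grad call covering all 4 basis vectors.
vjp = lambda gs: autograd.grad(flat_outputs, x, gs)[0]
jac = _vmap(vjp)(grad_outputs)                    # shape (4, 3)

# Step 3: split the (4, 3) block back into one Jacobian per output:
# (3, 3) for x**2 and (1, 3) -> reshaped to (3,) for x.sum().
jac_out0, jac_out1 = jac.split([3, 1], dim=0)
jac_out1 = jac_out1.view(3)
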
Example #7
 def matmat(self, X):
     try:
         # Batch the matvec over the columns of X with a single vmapped call.
         return _vmap(self.matvec)(X.T).T
     except Exception:
         # Fall back to an explicit column-by-column loop if vmap cannot
         # handle this matvec.
         return torch.hstack([self.matvec(col).view(-1, 1) for col in X.T])
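
`matmat` batches a LinearOperator-style `matvec` over the columns of `X`, falling back to an explicit loop when vmap cannot handle the operator. The same pattern outside the class, with an illustrative dense `matvec`:

import torch
from torch._vmap_internals import _vmap

A = torch.randn(4, 4)
matvec = lambda v: A @ v  # illustrative matvec

X = torch.randn(4, 3)
try:
    # vmap over the rows of X.T, i.e. over the columns of X.
    Y = _vmap(matvec)(X.T).T
except Exception:
    # Column-by-column fallback, mirroring the except branch above.
    Y = torch.hstack([matvec(col).view(-1, 1) for col in X.T])

assert torch.allclose(Y, A @ X)
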