def rearrange_dict_grad(fun):
    """
    Decorator that saves memory on the forward pass by precomputing the gradient.
    """
    @primitive
    def wrapped_fun_helper(xdict, dummy):
        # ag.value_and_grad() avoids a second forward pass
        # ag.checkpoint() ensures the hessian gets properly checkpointed
        val, grad = ag.checkpoint(ag.value_and_grad(fun))(xdict)
        assert len(val.shape) == 0
        dummy.cache = grad
        return val

    def wrapped_fun_helper_grad(ans, xdict, dummy):
        def grad(g):
            return {k: g * v for k, v in dummy.cache.items()}
        return grad

    defvjp(wrapped_fun_helper, wrapped_fun_helper_grad, None)

    @functools.wraps(fun)
    def wrapped_fun(xdict):
        return wrapped_fun_helper(ag.dict(xdict), lambda: None)
    return wrapped_fun
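# Sketch of intended usage (assumes this fragment's imports are in scope:
# `import autograd as ag`, `functools`, and primitive/defvjp from
# autograd.extend); the loss below is a hypothetical scalar function of a dict.
import autograd.numpy as np

@rearrange_dict_grad
def neg_log_lik(params):
    return np.sum(params['mu'] ** 2) + np.sum(params['sigma'] ** 2)

g = ag.grad(neg_log_lik)({'mu': np.ones(3), 'sigma': np.ones(2)})
# g is a dict with the same keys, holding the per-key gradients computed once
# on the forward pass and replayed by the cached VJP.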
def make_stan_log_density(fitobj):
    @primitive
    def log_density(x):
        return _vectorize_if_needed(fitobj.log_prob, x)

    def log_density_vjp(ans, x):
        return lambda g: _ensure_2d(g) * _vectorize_if_needed(fitobj.grad_log_prob, x)

    defvjp(log_density, log_density_vjp)
    return log_density
def test_check_vjp_1st_order_fail():
    @primitive
    def foo(x):
        return x * 2.0
    defvjp(foo, lambda ans, x: lambda g: g * 2.001)

    assert_raises_regexp(AssertionError, r"\(VJP\) check of foo failed",
                         lambda: check_grads(foo, modes=['rev'])(1.0))
def test_check_vjp_1st_order_fail():
    @primitive
    def foo(x):
        return x * 2.0
    defvjp(foo, lambda ans, x: lambda g: g * 2.001)

    with raises(AssertionError, match=r"\(VJP\) check of foo failed"):
        check_grads(foo, modes=['rev'])(1.0)
def __new__(self, name, base, dic):
    cls = type.__new__(container_mateclass, name, base, dic)
    cls.register(_np.ndarray)
    for type_ in [float, _np.float64, _np.float32, _np.float16,
                  complex, _np.complex64, _np.complex128]:
        cls.register(type_)
    for method_name in nondiff_methods + diff_methods:
        setattr(cls, method_name, anp.__dict__[method_name])
    setattr(cls, 'flatten', anp.__dict__['ravel'])

    defvjp(func(cls.__getitem__),
           lambda ans, A, idx: lambda g: untake(g, idx, vspace(A)))
    defjvp(func(cls.__getitem__), 'same')
    defjvp(untake, 'same')

    setattr(cls, 'reshape', wrapped_reshape)
    return cls
def get_model(self, *parameters, frame=None):
    """Get the model of the entire blend

    Parameters
    ----------
    parameters: tuple of optimization parameters
    frame: `scarlet.Frame`
        Alternative Frame to project the model into

    Returns
    -------
    model: array
        (Bands, Height, Width) data cube
    """
    # boxed models of every source
    models = self.get_models_of_children(*parameters, frame=None)

    if frame is None:
        frame = self.frame

    # if this is the model frame then the slices are already cached
    if frame == self.frame:
        slices = tuple(
            (src._model_frame_slices, src._model_slices) for src in self.sources
        )
    else:
        slices = tuple(
            overlapped_slices(frame.bbox, src.bbox) for src in self.sources
        )

    # We have to declare the function that inserts sources into the blend
    # with autograd. This has to be done each time we fit a blend, because
    # the number of sources determines the number of arguments to the
    # primitive, and each argument needs its own VJP (see the standalone
    # sketch after this function).
    defvjp(
        _add_models,
        *([partial(_grad_add_models, index=k) for k in range(len(self.sources))])
    )

    full_model = np.zeros(frame.shape, dtype=frame.dtype)
    full_model = _add_models(*models, full_model=full_model, slices=slices)

    return full_model
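# Standalone sketch of the pattern used above: register one VJP per positional
# argument of a variadic primitive via functools.partial. The names here
# (stack_sum, _grad_kth) are illustrative, not scarlet's _add_models API.
import autograd.numpy as anp
from autograd import grad
from autograd.extend import primitive, defvjp
from functools import partial

@primitive
def stack_sum(*xs):
    return anp.sum(anp.stack(xs))

def _grad_kth(ans, *xs, index):
    # gradient of the sum w.r.t. the index-th input
    return lambda g: g * anp.ones_like(xs[index])

defvjp(stack_sum, *[partial(_grad_kth, index=k) for k in range(3)])
print(grad(stack_sum, argnum=1)(anp.ones(2), 2 * anp.ones(2), 3 * anp.ones(2)))  # [1. 1.]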
def decorator(func):
    """Decorate a function to define its custom gradient(s).

    Parameters
    ----------
    func : callable
        Function whose gradients will be assigned by grad_funcs.

    Returns
    -------
    wrapped_function : callable
        Function func with gradients specified by grad_funcs.
    """
    wrapped_function = primitive(func)

    def wrapped_grad_func(i, ans, *args, **kwargs):
        grads = grad_funcs[i](*args, **kwargs)
        if isinstance(grads, float):
            return lambda g: g * grads
        if grads.ndim == 2:
            return lambda g: g[..., None] * grads
        if grads.ndim == 3:
            return lambda g: g[..., None, None] * grads
        return lambda g: g * grads

    if len(grad_funcs) == 1:
        defvjp(
            wrapped_function,
            lambda ans, *args, **kwargs: wrapped_grad_func(0, ans, *args, **kwargs),
        )
    elif len(grad_funcs) == 2:
        defvjp(
            wrapped_function,
            lambda ans, *args, **kwargs: wrapped_grad_func(0, ans, *args, **kwargs),
            lambda ans, *args, **kwargs: wrapped_grad_func(1, ans, *args, **kwargs),
        )
    elif len(grad_funcs) == 3:
        defvjp(
            wrapped_function,
            lambda ans, *args, **kwargs: wrapped_grad_func(0, ans, *args, **kwargs),
            lambda ans, *args, **kwargs: wrapped_grad_func(1, ans, *args, **kwargs),
            lambda ans, *args, **kwargs: wrapped_grad_func(2, ans, *args, **kwargs),
        )
    else:
        raise NotImplementedError(
            "custom_gradient is not yet implemented "
            "for more than 3 gradients."
        )

    return wrapped_function
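# Minimal standalone sketch of the same pattern (illustrative names, not the
# library's API): the gradient of a primitive is supplied by a separate plain
# function, and defvjp wires it in as g * grads.
import autograd.numpy as anp
from autograd import grad
from autograd.extend import primitive, defvjp

@primitive
def square(x):
    return x ** 2

def square_grad(x):  # plays the role of one entry of grad_funcs
    return 2.0 * x

defvjp(square, lambda ans, x: lambda g: g * square_grad(x))
print(grad(square)(3.0))  # 6.0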
def decorator(func):
    wrapped_function = primitive(func)

    def wrapped_grad_func(i, ans, *args, **kwargs):
        grads = grad_funcs[i](*args, **kwargs)
        if isinstance(grads, float):
            return lambda g: g * grads
        if grads.ndim == 2:
            return lambda g: g[..., None] * grads
        if grads.ndim == 3:
            return lambda g: g[..., None, None] * grads
        return lambda g: g * grads

    if len(grad_funcs) == 1:
        defvjp(
            wrapped_function,
            lambda ans, *args, **kwargs: wrapped_grad_func(0, ans, *args, **kwargs),
        )
    elif len(grad_funcs) == 2:
        defvjp(
            wrapped_function,
            lambda ans, *args, **kwargs: wrapped_grad_func(0, ans, *args, **kwargs),
            lambda ans, *args, **kwargs: wrapped_grad_func(1, ans, *args, **kwargs),
        )
    elif len(grad_funcs) == 3:
        defvjp(
            wrapped_function,
            lambda ans, *args, **kwargs: wrapped_grad_func(0, ans, *args, **kwargs),
            lambda ans, *args, **kwargs: wrapped_grad_func(1, ans, *args, **kwargs),
            lambda ans, *args, **kwargs: wrapped_grad_func(2, ans, *args, **kwargs),
        )
    else:
        raise NotImplementedError(
            "custom_gradient is not yet implemented "
            "for more than 3 gradients."
        )

    return wrapped_function
    if debug_log == 'true':
        logger.info("sigsq_final = {}".format(sigsq_init + jitter))
    return x_plus_constant


def AddJitterOp_vjp(ans: anp.ndarray, inputs: anp.ndarray,
                    initial_jitter_factor=INITIAL_JITTER_FACTOR,
                    jitter_growth=JITTER_GROWTH, debug_log='false'):
    return lambda g: anp.append(anp.reshape(g, (-1,)), anp.sum(anp.diag(g)))


defvjp(AddJitterOp, AddJitterOp_vjp)


@primitive
def cholesky_factorization(a):
    """
    Replacement for autograd.numpy.linalg.cholesky. Our backward (vjp) is
    faster and simpler, while somewhat less general (it only works if
    a.ndim == 2).

    See https://arxiv.org/abs/1710.08717 for the derivation of the backward
    (vjp) expression.

    :param a: Symmetric positive definite matrix A
    :return: Lower-triangular Cholesky factor L of A
    """
from __future__ import absolute_import
import scipy.misc

from autograd.extend import primitive, defvjp
import autograd.numpy as anp
from autograd.numpy.numpy_vjps import repeat_to_match_shape

logsumexp = primitive(scipy.misc.logsumexp)

def make_grad_logsumexp(ans, x, axis=None, b=1.0, keepdims=False):
    shape, dtype = anp.shape(x), anp.result_type(x)
    def vjp(g):
        g_repeated, _ = repeat_to_match_shape(g, shape, dtype, axis, keepdims)
        ans_repeated, _ = repeat_to_match_shape(ans, shape, dtype, axis, keepdims)
        return g_repeated * b * anp.exp(x - ans_repeated)
    return vjp

defvjp(logsumexp, make_grad_logsumexp)
from __future__ import division
import scipy.linalg

import autograd.numpy as anp
from autograd.numpy.numpy_wrapper import wrap_namespace
from autograd.extend import defvjp

wrap_namespace(scipy.linalg.__dict__, globals())  # populates module namespace

defvjp(sqrtm, lambda ans, A, **kwargs: lambda g: solve_lyapunov(ans, g))

def _flip(a, trans):
    if anp.iscomplexobj(a):
        return 'H' if trans in ('N', 0) else 'N'
    else:
        return 'T' if trans in ('N', 0) else 'N'

def grad_solve_triangular(ans, a, b, trans=0, lower=False, **kwargs):
    tri = anp.tril if (lower ^ (_flip(a, trans) == 'N')) else anp.triu
    transpose = lambda x: x if _flip(a, trans) != 'N' else x.T
    al2d = lambda x: x if x.ndim > 1 else x[..., None]
    def vjp(g):
        v = al2d(solve_triangular(a, g, trans=_flip(a, trans), lower=lower))
        return -transpose(tri(anp.dot(v, al2d(ans).T)))
    return vjp
# Hotfix since _np.asarray doesn't have a gradient rule defined.
@primitive
def asarray(vals, *args, **kwargs):
    """Gradient supporting autograd asarray"""
    if isinstance(vals, (onp.ndarray, _np.ndarray)):
        return _np.asarray(vals, *args, **kwargs)
    return _np.array(vals, *args, **kwargs)


def asarray_gradmaker(ans, *args, **kwargs):
    """Gradient maker for asarray"""
    del ans, args, kwargs
    return lambda g: g


defvjp(asarray, asarray_gradmaker, argnums=(0,))


class tensor(_np.ndarray):
    """Constructs a PennyLane tensor for use with Autograd QNodes.

    The ``tensor`` class is a subclass of ``numpy.ndarray``, providing the same
    multidimensional, homogeneous data structure of fixed-size items, with an
    additional flag to indicate to PennyLane whether the contained data is
    differentiable or not.

    .. warning::

        PennyLane ``tensor`` objects are only used as part of the Autograd QNode
        interface. If using another machine learning library such as PyTorch or
        TensorFlow, use their built-in ``tf.Variable`` and ``torch.tensor`` classes
def vjp_maker_spdot(b, A, x):
    """ Gives vjp for b = spdot(A, x) w.r.t. x"""
    def vjp(v):
        return spdot(A.T, v)
    return vjp

def jvp_spdot(g, b, A, x):
    """ Gives jvp for b = spdot(A, x) w.r.t. x"""
    return spdot(A, g)

defvjp(spdot, None, vjp_maker_spdot)
defjvp(spdot, None, jvp_spdot)


""" =================== PLOTTING AND MEASUREMENT =================== """

import matplotlib.pylab as plt

def aniplot(F, source, steps, component='Ez', num_panels=10):
    """ Animate an FDTD simulation (F) driven by `source` for `steps` time steps,
    displaying the `component` field at `num_panels` equally spaced times.
    """
    F.initialize_fields()

    # initialize the plot
    f, ax_list = plt.subplots(1, num_panels, figsize=(20 * num_panels, 20))
    Nx, Ny, _ = F.eps_r.shape
# -*- coding: utf-8 -*-
from __future__ import division

from scipy.stats import norm as _scipy_norm
import autograd.numpy as np
from autograd.scipy.stats import norm
from autograd.extend import primitive, defvjp
from autograd.numpy.numpy_vjps import unbroadcast_f

# TODO: next release of autograd will have this built in.
logsf = primitive(_scipy_norm.logsf)

defvjp(
    logsf,
    lambda ans, x, loc=0.0, scale=1.0: unbroadcast_f(
        x, lambda g: -g * np.exp(norm.logpdf(x, loc, scale) - logsf(x, loc, scale))),
    lambda ans, x, loc=0.0, scale=1.0: unbroadcast_f(
        loc, lambda g: g * np.exp(norm.logpdf(x, loc, scale) - logsf(x, loc, scale))),
    lambda ans, x, loc=0.0, scale=1.0: unbroadcast_f(
        scale, lambda g: g * np.exp(norm.logpdf(x, loc, scale) - logsf(x, loc, scale))
                           * (x - loc) / scale),
)
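# Numerical sanity check of the VJPs registered above (a sketch, not part of
# the original module; it uses autograd's test helper and only exercises
# reverse mode, since no JVP is defined here):
from autograd.test_util import check_grads
check_grads(logsf, modes=['rev'])(0.3, 0.0, 1.0)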
    for i, j in enumerate(range(offset, m - 1, 2)):
        d[j, 0] = np.exp(1j * phis[i])
    return d

def build_phi_layer_vjp(ans, phis, m, offset):
    def _build_phi_layer_vjp(g):
        out = np.zeros(phis.shape)
        for i, j in enumerate(range(offset, m - 1, 2)):
            out[i] += np.real(ans[j, 0] * 1j * g[j, 0])
        return out
    return _build_phi_layer_vjp

defvjp(build_phi_layer, build_phi_layer_vjp, None, None)

def clements_build(phis, m):
    U = np.eye(m, dtype=complex)
    ptr = 0
    bss = [build_bs_layer(m, 0), build_bs_layer(m, 1)]
    for i in range(m):
        offset = i % 2
        # Phis per layer
        ppl = (m - offset) // 2
        bs = bss[offset]
        phi1 = build_phi_layer(phis[ptr:ptr + ppl], m, offset)
        phi2 = build_phi_layer(phis[ptr + ppl:ptr + 2 * ppl], m, offset)
        ptr += 2 * ppl
    axes, shapes = parse_axes(A.shape, B.shape, axes, dot_axes, mode)
    if argnum == 0:
        X, Y = A, B
        _X_, _Y_ = 'A', 'B'
        ignore_Y = 'ignore_B'
    elif argnum == 1:
        X, Y = B, A
        _X_, _Y_ = 'B', 'A'
        ignore_Y = 'ignore_A'
    else:
        raise NotImplementedError("Can't take grad of convolve w.r.t. arg {0}".format(argnum))

    if mode == 'full':
        new_mode = 'valid'
    else:
        if any([x_size > y_size for x_size, y_size
                in zip(shapes[_X_]['conv'], shapes[_Y_]['conv'])]):
            new_mode = 'full'
        else:
            new_mode = 'valid'

    def vjp(g):
        result = convolve(g, Y[flipped_idxs(Y.ndim, axes[_Y_]['conv'])],
                          axes=[axes['out']['conv'], axes[_Y_]['conv']],
                          dot_axes=[axes['out'][ignore_Y], axes[_Y_]['ignore']],
                          mode=new_mode)
        new_order = npo.argsort(axes[_X_]['ignore'] + axes[_X_]['dot'] + axes[_X_]['conv'])
        return np.transpose(result, new_order)
    return vjp

defvjp(convolve, partial(grad_convolve, 0), partial(grad_convolve, 1))
class RKHSFun(object):
    def __init__(self, kernel, alphas={}):
        self.alphas = alphas
        self.kernel = kernel
        self.vs = RKHSFunVSpace(self)

    @primitive
    def __call__(self, x):
        return sum([a * self.kernel(x, x_repr)
                    for x_repr, a in self.alphas.items()], 0.0)

    def __add__(self, f):
        return self.vs.add(self, f)

    def __mul__(self, a):
        return self.vs.scalar_mul(self, a)

# TODO: add vjp of __call__ wrt x (and show it in action)
defvjp(func(RKHSFun.__call__),
       lambda ans, f, x: lambda g: RKHSFun(f.kernel, {x: 1}) * g)

class RKHSFunBox(Box, RKHSFun):
    @property
    def kernel(self):
        return self._value.kernel
RKHSFunBox.register(RKHSFun)

class RKHSFunVSpace(VSpace):
    def __init__(self, value):
        self.kernel = value.kernel

    def zeros(self):
        return RKHSFun(self.kernel)

    def randn(self):
        # These arbitrary vectors are not analogous to randn in any meaningful way
        N = npr.randint(1, 3)
        return RKHSFun(self.kernel, dict(zip(npr.randn(N), npr.randn(N))))
import scipy.stats

import autograd.numpy as anp
from autograd.extend import primitive, defvjp
from autograd.numpy.numpy_vjps import unbroadcast_f

pdf = primitive(scipy.stats.norm.pdf)
cdf = primitive(scipy.stats.norm.cdf)
sf = primitive(scipy.stats.norm.sf)
logpdf = primitive(scipy.stats.norm.logpdf)
logcdf = primitive(scipy.stats.norm.logcdf)
logsf = primitive(scipy.stats.norm.logsf)

defvjp(pdf,
       lambda ans, x, loc=0.0, scale=1.0:
           unbroadcast_f(x, lambda g: -g * ans * (x - loc) / scale**2),
       lambda ans, x, loc=0.0, scale=1.0:
           unbroadcast_f(loc, lambda g: g * ans * (x - loc) / scale**2),
       lambda ans, x, loc=0.0, scale=1.0:
           unbroadcast_f(scale, lambda g: g * ans * (((x - loc)/scale)**2 - 1.0)/scale))

defvjp(cdf,
       lambda ans, x, loc=0.0, scale=1.0:
           unbroadcast_f(x, lambda g: g * pdf(x, loc, scale)),
       lambda ans, x, loc=0.0, scale=1.0:
           unbroadcast_f(loc, lambda g: -g * pdf(x, loc, scale)),
       lambda ans, x, loc=0.0, scale=1.0:
           unbroadcast_f(scale, lambda g: -g * pdf(x, loc, scale) * (x - loc) / scale))

defvjp(logpdf,
       lambda ans, x, loc=0.0, scale=1.0:
           unbroadcast_f(x, lambda g: -g * (x - loc) / scale**2),
from __future__ import absolute_import
import autograd.numpy as np
import scipy.stats

from autograd.extend import primitive, defvjp
from autograd.numpy.numpy_vjps import unbroadcast_f

cdf = primitive(scipy.stats.poisson.cdf)
logpmf = primitive(scipy.stats.poisson.logpmf)
pmf = primitive(scipy.stats.poisson.pmf)

def grad_poisson_logpmf(k, mu):
    return np.where(k % 1 == 0, k / mu - 1, 0)

defvjp(cdf,
       lambda ans, k, mu: unbroadcast_f(mu, lambda g: g * -pmf(np.floor(k), mu)),
       argnums=[1])
defvjp(logpmf,
       lambda ans, k, mu: unbroadcast_f(mu, lambda g: g * grad_poisson_logpmf(k, mu)),
       argnums=[1])
defvjp(pmf,
       lambda ans, k, mu: unbroadcast_f(mu, lambda g: g * ans * grad_poisson_logpmf(k, mu)),
       argnums=[1])
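# Quick check (a sketch, not part of the original module): the rule registered
# above gives d/dmu logpmf(k, mu) = k/mu - 1 for integer k.
from autograd import grad
print(grad(lambda mu: logpmf(3, mu))(2.0))  # 0.5 == 3/2 - 1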
# Some formulas are from
# "An extended collection of matrix derivative results
#  for forward and reverse mode algorithmic differentiation"
# by Mike Giles
# https://people.maths.ox.ac.uk/gilesm/files/NA-08-01.pdf

# transpose by swapping last two dimensions
def T(x):
    return anp.swapaxes(x, -1, -2)

_dot = partial(anp.einsum, '...ij,...jk->...ik')

# add two dimensions to the end of x
def add2d(x):
    return anp.reshape(x, anp.shape(x) + (1, 1))

defvjp(det, lambda ans, x: lambda g: add2d(g) * add2d(ans) * T(inv(x)))
defvjp(slogdet, lambda ans, x: lambda g: add2d(g[1]) * T(inv(x)))

def grad_inv(ans, x):
    return lambda g: -_dot(_dot(T(ans), g), T(ans))
defvjp(inv, grad_inv)

def grad_pinv(ans, x):
    # https://mathoverflow.net/questions/25778/analytical-formula-for-numerical-derivative-of-the-matrix-pseudo-inverse
    return lambda g: T(
        -_dot(_dot(ans, T(g)), ans)
        + _dot(_dot(_dot(ans, T(ans)), g), anp.eye(x.shape[-2]) - _dot(x, ans))
        + _dot(_dot(_dot(anp.eye(ans.shape[-2]) - _dot(ans, x), g), T(ans)), ans)
    )
defvjp(pinv, grad_pinv)
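# Quick check of the slogdet rule above (a sketch, not part of the module,
# assuming the wrapped names det/inv/slogdet are in scope as in this file):
# the gradient of log|det(X)| is inv(X)^T.
from autograd import grad
_X = anp.array([[2.0, 0.3], [0.1, 1.5]])
print(grad(lambda A: slogdet(A)[1])(_X))
print(T(inv(_X)))  # should match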
    return -diff * (1.0 + df) / (diff**2 + df)

def grad_tlogpdf_x(x, df, loc, scale):
    return grad_tlogpdf_diff((x - loc) / scale, df) / scale

def grad_tlogpdf_loc(x, df, loc, scale):
    return -grad_tlogpdf_diff((x - loc) / scale, df) / scale

def grad_tlogpdf_scale(x, df, loc, scale):
    diff = x - loc
    return -(df * (scale**2 - diff**2)) / (scale * (df * scale**2 + diff**2))

def grad_tlogpdf_df(x, df, loc, scale):
    y = (x - loc) / scale
    return 0.5 * ((y**2 * (df + 1)) / (df * (y**2 + df)) - np.log(y**2 / df + 1)
                  - 1.0 / df - psi(df / 2.0) + psi((df + 1) / 2.0))

defvjp(pdf,
       lambda ans, x, df, loc=0.0, scale=1.0:
           unbroadcast_f(x, lambda g: g * ans * grad_tlogpdf_x(x, df, loc, scale)),
       lambda ans, x, df, loc=0.0, scale=1.0:
           unbroadcast_f(df, lambda g: g * ans * grad_tlogpdf_df(x, df, loc, scale)),
       lambda ans, x, df, loc=0.0, scale=1.0:
           unbroadcast_f(loc, lambda g: g * ans * grad_tlogpdf_loc(x, df, loc, scale)),
       lambda ans, x, df, loc=0.0, scale=1.0:
           unbroadcast_f(scale, lambda g: g * ans * grad_tlogpdf_scale(x, df, loc, scale)))

defvjp(cdf,
       lambda ans, x, df, loc=0.0, scale=1.0:
           unbroadcast_f(x, lambda g: g * pdf(x, df, loc, scale)),
       lambda ans, x, df, loc=0.0, scale=1.0:
           unbroadcast_f(loc, lambda g: -g * pdf(x, df, loc, scale)),
       argnums=(0, 2))

defvjp(logpdf,
       lambda ans, x, df, loc=0.0, scale=1.0:
           unbroadcast_f(x, lambda g: g * grad_tlogpdf_x(x, df, loc, scale)),
       lambda ans, x, df, loc=0.0, scale=1.0:
           unbroadcast_f(df, lambda g: g * grad_tlogpdf_df(x, df, loc, scale)),
nograd_functions = [
    anp.floor, anp.ceil, anp.round, anp.rint, anp.around, anp.fix, anp.trunc,
    anp.all, anp.any, anp.argmax, anp.argmin, anp.argpartition, anp.argsort,
    anp.argwhere, anp.nonzero, anp.flatnonzero, anp.count_nonzero,
    anp.searchsorted, anp.sign, anp.ndim, anp.shape, anp.floor_divide,
    anp.logical_and, anp.logical_or, anp.logical_not, anp.logical_xor,
    anp.isfinite, anp.isinf, anp.isnan, anp.isneginf, anp.isposinf,
    anp.allclose, anp.isclose, anp.array_equal, anp.array_equiv,
    anp.greater, anp.greater_equal, anp.less, anp.less_equal,
    anp.equal, anp.not_equal, anp.iscomplexobj, anp.iscomplex,
    anp.size, anp.isscalar, anp.isreal, anp.zeros_like, anp.ones_like,
    anp.result_type]

for fun in nograd_functions:
    register_notrace(VJPNode, fun)

# ----- Functions that are constant w.r.t. continuous inputs -----

defvjp(anp.nan_to_num, lambda ans, x: lambda g: anp.where(anp.isfinite(x), g, 0.))

# ----- Binary ufuncs -----

defvjp(anp.add,      lambda ans, x, y: unbroadcast_f(x, lambda g: g),
                     lambda ans, x, y: unbroadcast_f(y, lambda g: g))
defvjp(anp.multiply, lambda ans, x, y: unbroadcast_f(x, lambda g: y * g),
                     lambda ans, x, y: unbroadcast_f(y, lambda g: x * g))
defvjp(anp.subtract, lambda ans, x, y: unbroadcast_f(x, lambda g: g),
                     lambda ans, x, y: unbroadcast_f(y, lambda g: -g))
defvjp(anp.divide,   lambda ans, x, y: unbroadcast_f(x, lambda g: g / y),
                     lambda ans, x, y: unbroadcast_f(y, lambda g: -g * x / y**2))
defvjp(anp.maximum,  lambda ans, x, y: unbroadcast_f(x, lambda g: g * balanced_eq(x, ans, y)),
                     lambda ans, x, y: unbroadcast_f(y, lambda g: g * balanced_eq(y, ans, x)))
defvjp(anp.minimum,  lambda ans, x, y: unbroadcast_f(x, lambda g: g * balanced_eq(x, ans, y)),
                     lambda ans, x, y: unbroadcast_f(y, lambda g: g * balanced_eq(y, ans, x)))
from __future__ import absolute_import
import autograd.numpy as np
import scipy.stats

from autograd.extend import primitive, defvjp
from autograd.numpy.numpy_vjps import unbroadcast_f
from autograd.scipy.special import gamma, psi

cdf = primitive(scipy.stats.gamma.cdf)
logpdf = primitive(scipy.stats.gamma.logpdf)
pdf = primitive(scipy.stats.gamma.pdf)

def grad_gamma_logpdf_arg0(x, a):
    return (a - x - 1) / x

def grad_gamma_logpdf_arg1(x, a):
    return np.log(x) - psi(a)

defvjp(cdf,
       lambda ans, x, a: unbroadcast_f(x, lambda g: g * np.exp(-x) * np.power(x, a - 1) / gamma(a)),
       argnums=[0])
defvjp(logpdf,
       lambda ans, x, a: unbroadcast_f(x, lambda g: g * grad_gamma_logpdf_arg0(x, a)),
       lambda ans, x, a: unbroadcast_f(a, lambda g: g * grad_gamma_logpdf_arg1(x, a)))
defvjp(pdf,
       lambda ans, x, a: unbroadcast_f(x, lambda g: g * ans * grad_gamma_logpdf_arg0(x, a)),
       lambda ans, x, a: unbroadcast_f(a, lambda g: g * ans * grad_gamma_logpdf_arg1(x, a)))
from .numpy_wrapper import wrap_namespace
from .numpy_vjps import match_complex
from . import numpy_wrapper as anp
from autograd.extend import primitive, defvjp, vspace

wrap_namespace(ffto.__dict__, globals())

# TODO: make fft gradient work for a repeated axis,
# e.g. by replacing fftn with repeated calls to 1d fft along each axis
def fft_grad(get_args, fft_fun, ans, x, *args, **kwargs):
    axes, s, norm = get_args(x, *args, **kwargs)
    check_no_repeated_axes(axes)
    vs = vspace(x)
    return lambda g: match_complex(x, truncate_pad(fft_fun(g, *args, **kwargs), vs.shape))

defvjp(fft,   lambda *args, **kwargs: fft_grad(get_fft_args, fft,   *args, **kwargs))
defvjp(ifft,  lambda *args, **kwargs: fft_grad(get_fft_args, ifft,  *args, **kwargs))
defvjp(fft2,  lambda *args, **kwargs: fft_grad(get_fft_args, fft2,  *args, **kwargs))
defvjp(ifft2, lambda *args, **kwargs: fft_grad(get_fft_args, ifft2, *args, **kwargs))
defvjp(fftn,  lambda *args, **kwargs: fft_grad(get_fft_args, fftn,  *args, **kwargs))
defvjp(ifftn, lambda *args, **kwargs: fft_grad(get_fft_args, ifftn, *args, **kwargs))

def rfft_grad(get_args, irfft_fun, ans, x, *args, **kwargs):
    axes, s, norm = get_args(x, *args, **kwargs)
# on both the input to the original function (x), and the output of the
# original function (ans).
def logsumexp_vjp(ans, x):
    # If you want to be able to take higher-order derivatives, then all the
    # code inside this function must be itself differentiable by Autograd.
    # This closure multiplies g with the Jacobian of logsumexp (d_ans/d_x).
    # Because Autograd uses reverse-mode differentiation, g contains
    # the gradient of the objective w.r.t. ans, the output of logsumexp.
    # The returned VJP closes over `x`, `ans`, and `x_shape`, so those values
    # are kept alive until the backward pass uses them.
    x_shape = x.shape
    return lambda g: np.full(x_shape, g) * np.exp(x - np.full(x_shape, ans))

# Now we tell Autograd that logsumexp has a gradient-making function.
defvjp(logsumexp, logsumexp_vjp)

if __name__ == '__main__':
    # Now we can use logsumexp() inside a larger function that we want
    # to differentiate.
    def example_func(y):
        z = y**2
        lse = logsumexp(z)
        return np.sum(lse)

    grad_of_example = grad(example_func)
    print("Gradient: \n", grad_of_example(npr.randn(10)))

    # Check the gradients numerically, just to be safe.
    check_grads(example_func, modes=['rev'])(npr.randn(10))
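    # The comments in logsumexp_vjp stress that the VJP body must itself be
    # autograd-differentiable. As a quick illustration (a sketch, not part of
    # the original example), that is also what makes second derivatives work:
    from autograd import hessian
    print("Hessian diagonal:\n", np.diag(hessian(example_func)(npr.randn(10))))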
    This function is required for integration with Autograd.
    """
    # pylint: disable=unused-argument
    def gradient_product(g):
        """Vector Jacobian product operator.

        Args:
            g (array): scalar or vector multiplying the Jacobian
                from the left (output side).

        Returns:
            nested Sequence[float]: vector-Jacobian product, arranged
            into the nested structure of the QNode input arguments.
        """
        # Jacobian matrix of the circuit
        jac = self.jacobian(args, **kwargs)

        if not g.shape:
            temp = g * jac  # numpy treats 0d arrays as scalars, hence @ cannot be used
        else:
            temp = g @ jac

        # restore the nested structure of the input args
        temp = unflatten(temp.flat, args)
        return temp

    return gradient_product

# define the vector-Jacobian product function for QNode.__call__()
ae.defvjp(QNode.evaluate, QNode_vjp, argnums=[1])
""" Compute the expected error of A on W, under the following assumptions: 1. A is a sensitivity 1 strategy 2. A supports W """ AtA1 = np.linalg.pinv(np.dot(A.T, A)) return np.trace(np.dot(AtA1, WtW)) def grad_error(A, WtW): AtA1 = np.linalg.pinv(np.dot(A.T, A)) X = -np.dot(AtA1, np.dot(WtW, AtA1)) return 2 * np.dot(A, X) defvjp(mm_error, lambda ans, A, WtW: lambda g: g * grad_error(A, WtW), argnums=[0]) class CustomTemplate(templates.TemplateStrategy): """ The CustomTemplate strategy is specified by a function mapping parameters theta to a strategy A(theta). Gradients + Optimization are handled automatically as long as the passed function is compatible with autograd. """ def __init__(self, strategy, theta0, normalize=True, seed=None): """ :param strategy: a function mapping parameters theta to strategies A(theta) :param theta0: the initial parameters :param normalize: flag to determine if A(theta) should be normalized Note: if normalize=False, A(theta) must always have bounded sensitivity for any theta
        g_repeated = np.zeros(shape)
        for I, (ist, ind) in enumerate(zip(Xstrides[:-1], Xstrides[1:])):
            for J, (jst, jnd) in enumerate(zip(Ystrides[:-1], Ystrides[1:])):
                if is_square is True:
                    if I < J:
                        g_repeated[ist:ind, jst:jnd] = g_repeated[jst:jnd, ist:ind].T
                        continue
                g_repeated[ist:ind, jst:jnd] = g[I, J] / ((ind - ist) * (jnd - jst))
        return g_repeated
    return vjp

defvjp(average_kernel, grad_average_kernel, None, None)

def symmetrize(p):
    Nsoap, Ncomp, _, nn = p.shape
    # integer division so the shape entry stays an int
    p2 = np.empty((Nsoap, Ncomp * (Ncomp + 1) // 2, nn))
    stride = [0] + list(range(Ncomp, 0, -1))
    stride = np.cumsum(stride)
    for i, st, nd in zip(range(Ncomp - 1), stride[:-1], stride[1:]):
        p2[:, st] = p[:, i, i]
        p2[:, st + 1:nd] = p[:, i, (i + 1):Ncomp] * np.sqrt(2.0)
    p2[:, -1] = p[:, Ncomp - 1, Ncomp - 1]
    return p2

def get_unlin_soap(rawsoap, params, global_species):
    @primitive
    def __call__(self, x):
        return sum([a * self.kernel(x, x_repr)
                    for x_repr, a in self.alphas.items()], 0.0)

    def __add__(self, f):
        return self.vs.add(self, f)

    def __mul__(self, a):
        return self.vs.scalar_mul(self, a)

# TODO: add vjp of __call__ wrt x (and show it in action)
defvjp(func(RKHSFun.__call__),
       lambda ans, f, x: lambda g: RKHSFun(f.kernel, {x: 1}) * g)

class RKHSFunBox(Box, RKHSFun):
    @property
    def kernel(self):
        return self._value.kernel
RKHSFunBox.register(RKHSFun)

class RKHSFunVSpace(VSpace):
    def __init__(self, value):
        self.kernel = value.kernel
from __future__ import absolute_import
import scipy.special

import autograd.numpy as np
from autograd.extend import primitive, defvjp
from autograd.numpy.numpy_vjps import unbroadcast_f

### Beta function ###
beta = primitive(scipy.special.beta)
betainc = primitive(scipy.special.betainc)
betaln = primitive(scipy.special.betaln)

defvjp(beta,
       lambda ans, a, b: unbroadcast_f(a, lambda g: g * ans * (psi(a) - psi(a + b))),
       lambda ans, a, b: unbroadcast_f(b, lambda g: g * ans * (psi(b) - psi(a + b))))
defvjp(betainc,
       lambda ans, a, b, x: unbroadcast_f(x, lambda g: g * np.power(x, a - 1) * np.power(1 - x, b - 1) / beta(a, b)),
       argnums=[2])
defvjp(betaln,
       lambda ans, a, b: unbroadcast_f(a, lambda g: g * (psi(a) - psi(a + b))),
       lambda ans, a, b: unbroadcast_f(b, lambda g: g * (psi(b) - psi(a + b))))

### Gamma functions ###
polygamma = primitive(scipy.special.polygamma)
psi = primitive(scipy.special.psi)          # psi(x) is just polygamma(0, x)
digamma = primitive(scipy.special.digamma)  # digamma is another name for psi.
gamma = primitive(scipy.special.gamma)
gammaln = primitive(scipy.special.gammaln)
gammainc = primitive(scipy.special.gammainc)
gammaincc = primitive(scipy.special.gammaincc)
gammasgn = primitive(scipy.special.gammasgn)
rgamma = primitive(scipy.special.rgamma)
    Returns
    -------
    np.ndarray
        shape=(2,2)

    """
    return pu2r(*tlist) + 1j * pu2i(*tlist)

defvjp(
    pu2r,
    # defines the vector-Jacobian product of pu2r; g.shape == pu2r.shape
    lambda ans, *tlist: lambda g: np.sum(g * np.real(d_u2(0, *tlist))),
    lambda ans, *tlist: lambda g: np.sum(g * np.real(d_u2(1, *tlist))),
    lambda ans, *tlist: lambda g: np.sum(g * np.real(d_u2(2, *tlist))),
    lambda ans, *tlist: lambda g: np.sum(g * np.real(d_u2(3, *tlist))),
    argnums=range(4))

defvjp(
    pu2i,
    # defines the vector-Jacobian product of pu2i; g.shape == pu2i.shape
    lambda ans, *tlist: lambda g: np.sum(g * np.imag(d_u2(0, *tlist))),
    lambda ans, *tlist: lambda g: np.sum(g * np.imag(d_u2(1, *tlist))),
    lambda ans, *tlist: lambda g: np.sum(g * np.imag(d_u2(2, *tlist))),
    lambda ans, *tlist: lambda g: np.sum(g * np.imag(d_u2(3, *tlist))),
    argnums=range(4))
from __future__ import absolute_import, division
import autograd.numpy as np
import scipy.stats

from autograd.extend import primitive, defvjp
from autograd.numpy.numpy_vjps import unbroadcast_f
from autograd.scipy.special import gamma

cdf = primitive(scipy.stats.chi2.cdf)
logpdf = primitive(scipy.stats.chi2.logpdf)
pdf = primitive(scipy.stats.chi2.pdf)

def grad_chi2_logpdf(x, df):
    return np.where(df % 1 == 0, (df - x - 2) / (2 * x), 0)

defvjp(cdf,
       lambda ans, x, df: unbroadcast_f(x, lambda g: g * np.power(2., -df/2) * np.exp(-x/2)
                                                       * np.power(x, df/2 - 1) / gamma(df/2)),
       argnums=[0])
defvjp(logpdf,
       lambda ans, x, df: unbroadcast_f(x, lambda g: g * grad_chi2_logpdf(x, df)),
       argnums=[0])
defvjp(pdf,
       lambda ans, x, df: unbroadcast_f(x, lambda g: g * ans * grad_chi2_logpdf(x, df)),
       argnums=[0])
from __future__ import absolute_import
import scipy.special

import autograd.numpy as np
from autograd.extend import primitive, defvjp, defjvp
from autograd.numpy.numpy_vjps import unbroadcast_f, repeat_to_match_shape

### Beta function ###
beta = primitive(scipy.special.beta)
betainc = primitive(scipy.special.betainc)
betaln = primitive(scipy.special.betaln)

defvjp(beta,
       lambda ans, a, b: unbroadcast_f(a, lambda g: g * ans * (psi(a) - psi(a + b))),
       lambda ans, a, b: unbroadcast_f(b, lambda g: g * ans * (psi(b) - psi(a + b))))
defvjp(betainc,
       lambda ans, a, b, x: unbroadcast_f(x, lambda g: g * np.power(x, a - 1) * np.power(1 - x, b - 1) / beta(a, b)),
       argnums=[2])
defvjp(betaln,
       lambda ans, a, b: unbroadcast_f(a, lambda g: g * (psi(a) - psi(a + b))),
       lambda ans, a, b: unbroadcast_f(b, lambda g: g * (psi(b) - psi(a + b))))

### Gamma functions ###
polygamma = primitive(scipy.special.polygamma)
psi = primitive(scipy.special.psi)          # psi(x) is just polygamma(0, x)
digamma = primitive(scipy.special.digamma)  # digamma is another name for psi.
gamma = primitive(scipy.special.gamma)
gammaln = primitive(scipy.special.gammaln)
gammainc = primitive(scipy.special.gammainc)
gammaincc = primitive(scipy.special.gammaincc)
gammasgn = primitive(scipy.special.gammasgn)
rgamma = primitive(scipy.special.rgamma)
raise NotImplementedError("The multivariate normal pdf is not " "differentiable w.r.t. a singular covariance matix") J = np.linalg.inv(cov) solved = np.matmul(J, np.expand_dims(x - mean, -1)) return 1./2 * (generalized_outer_product(solved) - J) def solve(allow_singular): if allow_singular: return lambda A, x: np.dot(np.linalg.pinv(A), x) else: return np.linalg.solve defvjp(logpdf, lambda ans, x, mean, cov, allow_singular=False: unbroadcast_f(x, lambda g: -np.expand_dims(g, 1) * solve(allow_singular)(cov, (x - mean).T).T), lambda ans, x, mean, cov, allow_singular=False: unbroadcast_f(mean, lambda g: np.expand_dims(g, 1) * solve(allow_singular)(cov, (x - mean).T).T), lambda ans, x, mean, cov, allow_singular=False: unbroadcast_f(cov, lambda g: -np.reshape(g, np.shape(g) + (1, 1)) * covgrad(x, mean, cov, allow_singular))) # Same as log pdf, but multiplied by the pdf (ans). defvjp(pdf, lambda ans, x, mean, cov, allow_singular=False: unbroadcast_f(x, lambda g: -np.expand_dims(ans * g, 1) * solve(allow_singular)(cov, (x - mean).T).T), lambda ans, x, mean, cov, allow_singular=False: unbroadcast_f(mean, lambda g: np.expand_dims(ans * g, 1) * solve(allow_singular)(cov, (x - mean).T).T), lambda ans, x, mean, cov, allow_singular=False: unbroadcast_f(cov, lambda g: -np.reshape(ans * g, np.shape(g) + (1, 1)) * covgrad(x, mean, cov, allow_singular))) defvjp(entropy, None, lambda ans, mean, cov:
    T, K = ll.shape

    # Forward pass to get alphas
    alphas = np.zeros((T, K))
    forward_pass(log_pi0, log_Ps, ll, alphas)
    grad_hmm_normalizer(log_Ps, alphas, dlog_pi0, dlog_Ps, dll)

    if argnum == 0:
        return lambda g: g * dlog_pi0
    if argnum == 1:
        return lambda g: g * dlog_Ps
    if argnum == 2:
        return lambda g: g * dll

defvjp(hmm_normalizer,
       partial(_make_grad_hmm_normalizer, 0),
       partial(_make_grad_hmm_normalizer, 1),
       partial(_make_grad_hmm_normalizer, 2))

def hmm_expected_states(log_pi0, log_Ps, ll):
    T, K = ll.shape

    # Make sure everything is C contiguous
    log_pi0 = to_c(log_pi0)
    log_Ps = to_c(log_Ps)
    ll = to_c(ll)

    alphas = np.zeros((T, K))
    forward_pass(log_pi0, log_Ps, ll, alphas)
    normalizer = logsumexp(alphas[-1])
from __future__ import division
import scipy.linalg

import autograd.numpy as anp
from autograd.numpy.numpy_wrapper import wrap_namespace
from autograd.extend import defvjp, defvjp_argnums, defjvp, defjvp_argnums

wrap_namespace(scipy.linalg.__dict__, globals())  # populates module namespace

def _vjp_sqrtm(ans, A, disp=True, blocksize=64):
    assert disp, "sqrtm vjp not implemented for disp=False"
    ans_transp = anp.transpose(ans)
    def vjp(g):
        return anp.real(solve_sylvester(ans_transp, ans_transp, g))
    return vjp
defvjp(sqrtm, _vjp_sqrtm)

def _flip(a, trans):
    if anp.iscomplexobj(a):
        return 'H' if trans in ('N', 0) else 'N'
    else:
        return 'T' if trans in ('N', 0) else 'N'

def grad_solve_triangular(ans, a, b, trans=0, lower=False, **kwargs):
    tri = anp.tril if (lower ^ (_flip(a, trans) == 'N')) else anp.triu
    transpose = lambda x: x if _flip(a, trans) != 'N' else x.T
    al2d = lambda x: x if x.ndim > 1 else x[..., None]
    def vjp(g):
        v = al2d(solve_triangular(a, g, trans=_flip(a, trans), lower=lower))
        return -transpose(tri(anp.dot(v, al2d(ans).T)))
    return vjp
        raise NotImplementedError(
            "Can't take grad of convolve w.r.t. arg {0}".format(argnum))

    if mode == 'full':
        new_mode = 'valid'
    else:
        if any([x_size > y_size for x_size, y_size in zip(
                shapes[_X_]['conv'], shapes[_Y_]['conv'])]):
            new_mode = 'full'
        else:
            new_mode = 'valid'

    def vjp(g):
        result = convolve(
            g,
            Y[tuple(_autograd_signal.flipped_idxs(Y.ndim, axes[_Y_]['conv']))],
            axes=[axes['out']['conv'], axes[_Y_]['conv']],
            dot_axes=[axes['out'][ignore_Y], axes[_Y_]['ignore']],
            mode=new_mode)
        new_order = npo.argsort(axes[_X_]['ignore'] + axes[_X_]['dot'] + axes[_X_]['conv'])
        return np.transpose(result, new_order)
    return vjp

defvjp(_torch_convolve,
       partial(_torch_grad_convolve, 0),
       partial(_torch_grad_convolve, 1))
def check_probs_matrix(x):
    x = truncate0(x)
    rowsums = np.sum(x, axis=1)
    assert np.allclose(rowsums, 1.0)
    return np.einsum('ij,i->ij', x, 1.0 / rowsums)


@primitive
def set0(x, indices):
    y = np.array(x)
    y[indices] = 0
    return y
defvjp(set0, lambda ans, x, indices: lambda g: set0(g, indices))
#set0.defgrad(lambda ans, x, indices: lambda g: set0(g, indices))
#set0.defvjp(lambda g, ans, vs, gvs, x, indices: set0(g, indices))


def closeleq(x, y):
    return np.logical_or(np.isclose(x, y), x <= y)

def closegeq(x, y):
    return np.logical_or(np.isclose(x, y), x >= y)


@primitive
def make_constant(x):
    return x
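# Quick illustration (a sketch, not part of the original module; it assumes
# `np` is autograd.numpy as above): gradients flow through the set0 primitive
# everywhere except at the zeroed indices.
from autograd import grad
_g = grad(lambda x: np.sum(set0(x, np.array([0])) ** 2))(np.array([1.0, 2.0, 3.0]))
# _g == array([0., 4., 6.])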
from autograd.misc.optimizers import adam, sgd
from matplotlib import pyplot as plt
from scipy.optimize import basinhopping

@primitive
def relu(x):
    return x * (x > 0)

def relu_vjp(ans, x):
    # The VJP must scale the incoming gradient g by the local derivative,
    # which for ReLU is the indicator of x > 0.
    return lambda g: g * (x > 0).astype(float)

defvjp(relu, relu_vjp)

def init_random_params(scale, layer_sizes, rs=npr.RandomState()):
    """Build a list of (weights, biases) tuples, one for each layer in the net."""
    return [
        [scale * rs.randn(m, n),   # weight matrix
         scale * rs.randn(n)]      # bias vector
        for m, n in zip(layer_sizes[:-1], layer_sizes[1:])
    ]

def init_ones_params(scale, layer_sizes, rs=npr.RandomState()):
    T, K = ll.shape

    # Forward pass to get alphas
    alphas = np.zeros((T, K))
    forward_pass(log_pi0, log_Ps, ll, alphas)
    grad_hmm_normalizer(log_Ps, alphas, dlog_pi0, dlog_Ps, dll)

    if argnum == 0:
        return lambda g: g * dlog_pi0
    if argnum == 1:
        return lambda g: g * dlog_Ps
    if argnum == 2:
        return lambda g: g * dll

defvjp(hmm_normalizer,
       partial(_make_grad_hmm_normalizer, 0),
       partial(_make_grad_hmm_normalizer, 1),
       partial(_make_grad_hmm_normalizer, 2))

def hmm_expected_states(log_pi0, log_Ps, ll):
    T, K = ll.shape

    # Make sure everything is C contiguous
    to_c = lambda arr: np.copy(arr, 'C') if not arr.flags['C_CONTIGUOUS'] else arr

    log_pi0 = to_c(getval(log_pi0))
    log_Ps = to_c(getval(log_Ps))
    ll = to_c(getval(ll))

    alphas = np.zeros((T, K))
    forward_pass(log_pi0, log_Ps, ll, alphas)
    normalizer = logsumexp(alphas[-1])
def logsumexp_vjp(ans, x):
    # If you want to be able to take higher-order derivatives, then all the
    # code inside this function must be itself differentiable by Autograd.
    # This closure multiplies g with the Jacobian of logsumexp (d_ans/d_x).
    # Because Autograd uses reverse-mode differentiation, g contains
    # the gradient of the objective w.r.t. ans, the output of logsumexp.
    # The returned VJP closes over `x`, `ans`, and `x_shape`, so those values
    # are kept alive until the backward pass uses them.
    x_shape = x.shape
    return lambda g: np.full(x_shape, g) * np.exp(x - np.full(x_shape, ans))

# Now we tell Autograd that logsumexp has a gradient-making function.
defvjp(logsumexp, logsumexp_vjp)

if __name__ == '__main__':
    # Now we can use logsumexp() inside a larger function that we want
    # to differentiate.
    def example_func(y):
        z = y**2
        lse = logsumexp(z)
        return np.sum(lse)

    grad_of_example = grad(example_func)
    print("Gradient: \n", grad_of_example(npr.randn(10)))

    # Check the gradients numerically, just to be safe.
    check_grads(example_func, modes=['rev'])(npr.randn(10))
# batched diagonal, similar to matrix_diag in tensorflow
def _matrix_diag(a):
    reps = anp.array(a.shape)
    reps[:-1] = 1
    reps[-1] = a.shape[-1]
    newshape = list(a.shape) + [a.shape[-1]]
    return _diag(anp.tile(a, reps).reshape(newshape))

# add two dimensions to the end of x
def add2d(x):
    return anp.reshape(x, anp.shape(x) + (1, 1))

defvjp(det, lambda ans, x: lambda g: add2d(g) * add2d(ans) * T(inv(x)))
defvjp(slogdet, lambda ans, x: lambda g: add2d(g[1]) * T(inv(x)))

def grad_inv(ans, x):
    return lambda g: -_dot(_dot(T(ans), g), T(ans))
defvjp(inv, grad_inv)

def grad_pinv(ans, x):
    # https://mathoverflow.net/questions/25778/analytical-formula-for-numerical-derivative-of-the-matrix-pseudo-inverse
    return lambda g: T(
        -_dot(_dot(ans, T(g)), ans)
        + _dot(_dot(_dot(ans, T(ans)), g), anp.eye(x.shape[-2]) - _dot(x, ans))
        + _dot(
from autograd.extend import primitive, defvjp, vspace
from autograd.builtins import tuple
from autograd import make_vjp

@primitive
def fixed_point(f, a, x0, distance, tol):
    _f = f(a)
    x, x_prev = _f(x0), x0
    while distance(x, x_prev) > tol:
        x, x_prev = _f(x), x
    return x

def fixed_point_vjp(ans, f, a, x0, distance, tol):
    def rev_iter(params):
        a, x_star, x_star_bar = params
        vjp_x, _ = make_vjp(f(a))(x_star)
        vs = vspace(x_star)
        return lambda g: vs.add(vjp_x(g), x_star_bar)
    vjp_a, _ = make_vjp(lambda x, y: f(x)(y))(a, ans)
    return lambda g: vjp_a(fixed_point(rev_iter, tuple((a, ans, g)),
                                       vspace(x0).zeros(), distance, tol))

defvjp(fixed_point, None, fixed_point_vjp, None)
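# Usage sketch (in the spirit of autograd's fixed-point example, not part of
# this module): compute sqrt(a) as the fixed point of a Newton step and
# differentiate through the solve implicitly via the VJP registered above.
import autograd.numpy as np
from autograd import grad

def newton_sqrt_iter(a):
    return lambda x: 0.5 * (x + a / x)

def distance(x, x_prev):
    return np.abs(x - x_prev)

def sqrt(a, guess=1.0):
    return fixed_point(newton_sqrt_iter, a, guess, distance, 1e-10)

print(grad(sqrt)(2.0), 0.5 / np.sqrt(2.0))  # the two values should agree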
        out = ag_np.zeros_like(x)
        out[mask0] = ag_np.log1p(ag_np.exp(x[mask0]))
        out[mask1] = x[mask1] + ag_np.log1p(ag_np.exp(-x[mask1]))
        return out
    if x > 0:
        return x + ag_np.log1p(ag_np.exp(-x))
    else:
        return ag_np.log1p(ag_np.exp(x))

def make_grad__to_common_arr(ans, x):
    x = ag_np.asarray(x)
    def gradient_product(g):
        return ag_np.full(x.shape, g) * ag_np.exp(x - ans)
    return gradient_product

defvjp(to_common_arr, make_grad__to_common_arr)

@primitive
def to_unconstrained_arr(p):
    """ Numerically stable transform from positive reals to real line

    Implements ag_np.log(ag_np.exp(x) - 1.0)

    Autograd friendly and fully vectorized

    Args
    ----
    p : array of values in (0, +\infty)

    Returns
from __future__ import absolute_import
import scipy.stats

import autograd.numpy as np
from autograd.scipy.special import digamma
from autograd.extend import primitive, defvjp

rvs = primitive(scipy.stats.dirichlet.rvs)
pdf = primitive(scipy.stats.dirichlet.pdf)
logpdf = primitive(scipy.stats.dirichlet.logpdf)

defvjp(logpdf,
       lambda ans, x, alpha: lambda g: g * (alpha - 1) / x,
       lambda ans, x, alpha: lambda g: g * (digamma(np.sum(alpha)) - digamma(alpha) + np.log(x)))

# Same as log pdf, but multiplied by the pdf (ans).
defvjp(pdf,
       lambda ans, x, alpha: lambda g: g * ans * (alpha - 1) / x,
       lambda ans, x, alpha: lambda g: g * ans * (digamma(np.sum(alpha)) - digamma(alpha) + np.log(x)))
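# Quick check (a sketch, not part of the original module): the gradient of the
# Dirichlet logpdf w.r.t. alpha should equal the rule registered above.
from autograd import grad
_x = np.array([0.2, 0.3, 0.5])
_alpha = np.array([1.0, 2.0, 3.0])
print(grad(logpdf, argnum=1)(_x, _alpha))
print(digamma(np.sum(_alpha)) - digamma(_alpha) + np.log(_x))  # should match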