return 0.5 * (np.tril(mat) + np.triu(mat, 1).T) elif len(mat.shape) == 3: return 0.5 * (np.tril(mat) + np.swapaxes(np.triu(mat, 1), 1,2)) else: raise ArithmeticError def generalized_outer_product(mat): if len(mat.shape) == 1: return np.outer(mat, mat) elif len(mat.shape) == 2: return np.einsum('ij,ik->ijk', mat, mat) else: raise ArithmeticError def covgrad(x, mean, cov): # I think once we have Cholesky we can make this nicer. solved = np.linalg.solve(cov, (x - mean).T).T return lower_half(np.linalg.inv(cov) - generalized_outer_product(solved)) logpdf.defvjp(lambda g, ans, vs, gvs, x, mean, cov: unbroadcast(vs, gvs, -np.expand_dims(g, 1) * np.linalg.solve(cov, (x - mean).T).T), argnum=0) logpdf.defvjp(lambda g, ans, vs, gvs, x, mean, cov: unbroadcast(vs, gvs, np.expand_dims(g, 1) * np.linalg.solve(cov, (x - mean).T).T), argnum=1) logpdf.defvjp(lambda g, ans, vs, gvs, x, mean, cov: unbroadcast(vs, gvs, -np.reshape(g, np.shape(g) + (1, 1)) * covgrad(x, mean, cov)), argnum=2) # Same as log pdf, but multiplied by the pdf (ans). pdf.defvjp(lambda g, ans, vs, gvs, x, mean, cov: unbroadcast(vs, gvs, -g * ans * np.linalg.solve(cov, x - mean)), argnum=0) pdf.defvjp(lambda g, ans, vs, gvs, x, mean, cov: unbroadcast(vs, gvs, g * ans * np.linalg.solve(cov, x - mean)), argnum=1) pdf.defvjp(lambda g, ans, vs, gvs, x, mean, cov: unbroadcast(vs, gvs, -g * ans * covgrad(x, mean, cov)), argnum=2) entropy.defvjp_is_zero(argnums=(0,)) entropy.defvjp(lambda g, ans, vs, gvs, mean, cov: unbroadcast(vs, gvs, 0.5 * g * np.linalg.inv(cov).T), argnum=1)
if len(mat.shape) == 1: return np.outer(mat, mat) elif len(mat.shape) == 2: return np.einsum('ij,ik->ijk', mat, mat) else: raise ArithmeticError def covgrad(x, mean, cov): # I think once we have Cholesky we can make this nicer. solved = np.linalg.solve(cov, (x - mean).T).T return lower_half(np.linalg.inv(cov) - generalized_outer_product(solved)) logpdf.defgrad(lambda ans, x, mean, cov: unbroadcast( ans, x, lambda g: -np.expand_dims(g, 1) * np.linalg.solve( cov, (x - mean).T).T), argnum=0) logpdf.defgrad(lambda ans, x, mean, cov: unbroadcast( ans, mean, lambda g: np.expand_dims(g, 1) * np.linalg.solve( cov, (x - mean).T).T), argnum=1) logpdf.defgrad(lambda ans, x, mean, cov: unbroadcast( ans, cov, lambda g: -np.reshape(g, np.shape(g) + (1, 1)) * covgrad(x, mean, cov)), argnum=2) # Same as log pdf, but multiplied by the pdf (ans). pdf.defgrad(lambda ans, x, mean, cov: unbroadcast( ans, x, lambda g: -g * ans * np.linalg.solve(cov, x - mean)),
"""Gradients of the normal distribution.""" from __future__ import absolute_import import scipy.stats import autograd.numpy as anp from autograd.core import primitive from autograd.numpy.numpy_grads import unbroadcast pdf = primitive(scipy.stats.norm.pdf) cdf = primitive(scipy.stats.norm.cdf) logpdf = primitive(scipy.stats.norm.logpdf) logcdf = primitive(scipy.stats.norm.logcdf) pdf.defgrad(lambda ans, x, loc=0.0, scale=1.0: unbroadcast(ans, x, lambda g: -g * ans * (x - loc) / scale**2)) pdf.defgrad(lambda ans, x, loc=0.0, scale=1.0: unbroadcast(ans, loc, lambda g: g * ans * (x - loc) / scale**2), argnum=1) pdf.defgrad(lambda ans, x, loc=0.0, scale=1.0: unbroadcast(ans, scale, lambda g: g * ans * (((x - loc)/scale)**2 - 1.0)/scale), argnum=2) cdf.defgrad(lambda ans, x, loc=0.0, scale=1.0: unbroadcast(ans, x, lambda g: g * pdf(x, loc, scale))) cdf.defgrad(lambda ans, x, loc=0.0, scale=1.0: unbroadcast(ans, loc, lambda g: -g * pdf(x, loc, scale)), argnum=1) cdf.defgrad(lambda ans, x, loc=0.0, scale=1.0: unbroadcast(ans, scale, lambda g: -g * pdf(x, loc, scale)*(x-loc)/scale), argnum=2) logpdf.defgrad(lambda ans, x, loc=0.0, scale=1.0: unbroadcast(ans, x, lambda g: -g * (x - loc) / scale**2)) logpdf.defgrad(lambda ans, x, loc=0.0, scale=1.0: unbroadcast(ans, loc, lambda g: g * (x - loc) / scale**2), argnum=1) logpdf.defgrad(lambda ans, x, loc=0.0, scale=1.0: unbroadcast(ans, scale, lambda g: g * (-1.0/scale + (x - loc)**2/scale**3)), argnum=2) logcdf.defgrad(lambda ans, x, loc=0.0, scale=1.0: unbroadcast(ans, x, lambda g: g * anp.exp(logpdf(x, loc, scale) - logcdf(x, loc, scale)))) logcdf.defgrad(lambda ans, x, loc=0.0, scale=1.0: unbroadcast(ans, loc, lambda g: -g * anp.exp(logpdf(x, loc, scale) - logcdf(x, loc, scale))), argnum=1) logcdf.defgrad(lambda ans, x, loc=0.0, scale=1.0: unbroadcast(ans, scale, lambda g: -g * anp.exp(logpdf(x, loc, scale) - logcdf(x, loc, scale))*(x-loc)/scale), argnum=2)
return 0.5 * (np.tril(mat) + np.triu(mat, 1).T) elif len(mat.shape) == 3: return 0.5 * (np.tril(mat) + np.swapaxes(np.triu(mat, 1), 1,2)) else: raise ArithmeticError def generalized_outer_product(mat): if len(mat.shape) == 1: return np.outer(mat, mat) elif len(mat.shape) == 2: return np.einsum('ij,ik->ijk', mat, mat) else: raise ArithmeticError def covgrad(x, mean, cov): # I think once we have Cholesky we can make this nicer. solved = np.linalg.solve(cov, (x - mean).T).T return lower_half(np.linalg.inv(cov) - generalized_outer_product(solved)) logpdf.defgrad(lambda ans, x, mean=None, cov=1, allow_singular=False: unbroadcast(ans, x, lambda g: -np.expand_dims(g, 1) * np.linalg.solve(cov, (x - mean).T).T), argnum=0) logpdf.defgrad(lambda ans, x, mean=None, cov=1, allow_singular=False: unbroadcast(ans, mean, lambda g: np.expand_dims(g, 1) * np.linalg.solve(cov, (x - mean).T).T), argnum=1) logpdf.defgrad(lambda ans, x, mean=None, cov=1, allow_singular=False: unbroadcast(ans, cov, lambda g: -np.reshape(g, np.shape(g) + (1, 1)) * covgrad(x, mean, cov)), argnum=2) # Same as log pdf, but multiplied by the pdf (ans). pdf.defgrad(lambda ans, x, mean=None, cov=1, allow_singular=False: unbroadcast(ans, x, lambda g: -g * ans * np.linalg.solve(cov, x - mean)), argnum=0) pdf.defgrad(lambda ans, x, mean=None, cov=1, allow_singular=False: unbroadcast(ans, mean, lambda g: g * ans * np.linalg.solve(cov, x - mean)), argnum=1) pdf.defgrad(lambda ans, x, mean=None, cov=1, allow_singular=False: unbroadcast(ans, cov, lambda g: -g * ans * covgrad(x, mean, cov)), argnum=2) entropy.defgrad_is_zero(argnums=(0,)) entropy.defgrad(lambda ans, mean, cov: unbroadcast(ans, cov, lambda g: 0.5 * g * np.linalg.inv(cov).T), argnum=1)
def grad_tlogpdf_scale(x, df, loc, scale): diff = x - loc return -(df * (scale**2 - diff**2)) / (scale * (df * scale**2 + diff**2)) def grad_tlogpdf_df(x, df, loc, scale): y = (x - loc) / scale return 0.5 * ( (y**2 * (df + 1)) / (df * (y**2 + df)) - np.log(y**2 / df + 1) - 1.0 / df - psi(df / 2.0) + psi( (df + 1) / 2.0)) pdf.defvjp(lambda g, ans, vs, gvs, x, df, loc=0.0, scale=1.0: unbroadcast( vs, gvs, g * ans * grad_tlogpdf_x(x, df, loc, scale)), argnum=0) pdf.defvjp(lambda g, ans, vs, gvs, x, df, loc=0.0, scale=1.0: unbroadcast( vs, gvs, g * ans * grad_tlogpdf_df(x, df, loc, scale)), argnum=1) pdf.defvjp(lambda g, ans, vs, gvs, x, df, loc=0.0, scale=1.0: unbroadcast( vs, gvs, g * ans * grad_tlogpdf_loc(x, df, loc, scale)), argnum=2) pdf.defvjp(lambda g, ans, vs, gvs, x, df, loc=0.0, scale=1.0: unbroadcast( vs, gvs, g * ans * grad_tlogpdf_scale(x, df, loc, scale)), argnum=3) cdf.defvjp(lambda g, ans, vs, gvs, x, df, loc=0.0, scale=1.0: unbroadcast( vs, gvs, g * pdf(x, df, loc, scale)), argnum=0) cdf.defvjp(lambda g, ans, vs, gvs, x, df, loc=0.0, scale=1.0: unbroadcast(
logcdf = primitive(scipy.stats.t.logcdf)


def grad_tlogpdf_diff(diff, df):
    """Derivative of the standardized t log-density w.r.t. its argument.

    ``diff`` is the standardized residual ``(x - loc) / scale`` and ``df``
    the degrees of freedom.
    """
    return -diff * (1.0 + df) / (diff**2 + df)


def grad_tlogpdf_x(x, df, loc, scale):
    """Derivative of the t log-density w.r.t. ``x`` (chain rule via 1/scale)."""
    return grad_tlogpdf_diff((x - loc) / scale, df) / scale


def grad_tlogpdf_loc(x, df, loc, scale):
    """Derivative of the t log-density w.r.t. ``loc``: minus the x-derivative."""
    return -grad_tlogpdf_diff((x - loc) / scale, df) / scale


def grad_tlogpdf_scale(x, df, loc, scale):
    """Derivative of the t log-density w.r.t. ``scale``."""
    diff = x - loc
    return -(df * (scale**2 - diff**2))/(scale * (df * scale**2 + diff**2))


def grad_tlogpdf_df(x, df, loc, scale):
    """Derivative of the t log-density w.r.t. the degrees of freedom ``df``.

    ``psi`` comes from file scope -- presumably the digamma function; confirm
    against the module imports.
    """
    y = (x - loc)/scale
    return 0.5 * ((y**2 * (df+1))/(df * (y**2 + df))
                  - np.log(y**2 / df + 1)
                  - 1.0/df
                  - psi(df/2.0)
                  + psi((df + 1)/2.0))


# pdf gradients: the log-density gradients multiplied by the pdf value (ans).
pdf.defgrad(
    lambda ans, x, df, loc=0.0, scale=1.0: unbroadcast(
        ans, x, lambda g: g * ans * grad_tlogpdf_x(x, df, loc, scale)),
    argnum=0)
pdf.defgrad(
    lambda ans, x, df, loc=0.0, scale=1.0: unbroadcast(
        ans, df, lambda g: g * ans * grad_tlogpdf_df(x, df, loc, scale)),
    argnum=1)
pdf.defgrad(
    lambda ans, x, df, loc=0.0, scale=1.0: unbroadcast(
        ans, loc, lambda g: g * ans * grad_tlogpdf_loc(x, df, loc, scale)),
    argnum=2)
pdf.defgrad(
    lambda ans, x, df, loc=0.0, scale=1.0: unbroadcast(
        ans, scale, lambda g: g * ans * grad_tlogpdf_scale(x, df, loc, scale)),
    argnum=3)

# cdf gradients.  With y = (x - loc) / scale the cdf is F(y; df), so
#   d/dx     =  pdf(x, df, loc, scale)
#   d/dloc   = -pdf(x, df, loc, scale)
#   d/dscale = -pdf(x, df, loc, scale) * (x - loc) / scale
cdf.defgrad(
    lambda ans, x, df, loc=0.0, scale=1.0: unbroadcast(
        ans, x, lambda g: g * pdf(x, df, loc, scale)),
    argnum=0)
cdf.defgrad(
    lambda ans, x, df, loc=0.0, scale=1.0: unbroadcast(
        ans, loc, lambda g: -g * pdf(x, df, loc, scale)),
    argnum=2)
cdf.defgrad(
    lambda ans, x, df, loc=0.0, scale=1.0: unbroadcast(
        ans, scale, lambda g: -g * pdf(x, df, loc, scale) * (x - loc) / scale),
    argnum=3)
# The gradient of the cdf wrt the degrees of freedom (argnum=1) is not
# defined here.
# logpdf gradients: delegate to the grad_tlogpdf_* helpers, one per argument.
logpdf.defgrad(
    lambda ans, x, df, loc=0.0, scale=1.0: unbroadcast(
        ans, x, lambda g: g * grad_tlogpdf_x(x, df, loc, scale)),
    argnum=0)
logpdf.defgrad(
    lambda ans, x, df, loc=0.0, scale=1.0: unbroadcast(
        ans, df, lambda g: g * grad_tlogpdf_df(x, df, loc, scale)),
    argnum=1)
logpdf.defgrad(
    lambda ans, x, df, loc=0.0, scale=1.0: unbroadcast(
        ans, loc, lambda g: g * grad_tlogpdf_loc(x, df, loc, scale)),
    argnum=2)
logpdf.defgrad(
    lambda ans, x, df, loc=0.0, scale=1.0: unbroadcast(
        ans, scale, lambda g: g * grad_tlogpdf_scale(x, df, loc, scale)),
    argnum=3)

# logcdf gradients.  d log(cdf) = d cdf / cdf, and pdf / cdf is evaluated as
# exp(logpdf - logcdf).  With y = (x - loc) / scale:
#   d/dx     =  pdf / cdf
#   d/dloc   = -pdf / cdf
#   d/dscale = -pdf / cdf * (x - loc) / scale
# No gradient wrt the degrees of freedom (argnum=1) is defined here.
logcdf.defgrad(
    lambda ans, x, df, loc=0.0, scale=1.0: unbroadcast(
        ans, x,
        lambda g: g * np.exp(logpdf(x, df, loc, scale)
                             - logcdf(x, df, loc, scale))),
    argnum=0)
logcdf.defgrad(
    lambda ans, x, df, loc=0.0, scale=1.0: unbroadcast(
        ans, loc,
        lambda g: -g * np.exp(logpdf(x, df, loc, scale)
                              - logcdf(x, df, loc, scale))),
    argnum=2)
logcdf.defgrad(
    lambda ans, x, df, loc=0.0, scale=1.0: unbroadcast(
        ans, scale,
        lambda g: -g * np.exp(logpdf(x, df, loc, scale)
                              - logcdf(x, df, loc, scale)) * (x - loc) / scale),
    argnum=3)
elif len(mat.shape) == 2: return np.einsum('ij,ik->ijk', mat, mat) else: raise ArithmeticError def covgrad(x, mean, cov, allow_singular=False): if allow_singular: raise NotImplementedError("The multivariate normal pdf is not " "differentiable w.r.t. a singular covariance matix") # I think once we have Cholesky we can make this nicer. solved = np.linalg.solve(cov, (x - mean).T).T return lower_half(np.linalg.inv(cov) - generalized_outer_product(solved)) def solve(allow_singular): if allow_singular: return lambda A, x: np.dot(np.linalg.pinv(A), x) else: return np.linalg.solve logpdf.defvjp(lambda g, ans, vs, gvs, x, mean, cov, allow_singular=False: unbroadcast(vs, gvs, -np.expand_dims(g, 1) * solve(allow_singular)(cov, (x - mean).T).T), argnum=0) logpdf.defvjp(lambda g, ans, vs, gvs, x, mean, cov, allow_singular=False: unbroadcast(vs, gvs, np.expand_dims(g, 1) * solve(allow_singular)(cov, (x - mean).T).T), argnum=1) logpdf.defvjp(lambda g, ans, vs, gvs, x, mean, cov, allow_singular=False: unbroadcast(vs, gvs, -np.reshape(g, np.shape(g) + (1, 1)) * covgrad(x, mean, cov, allow_singular)), argnum=2) # Same as log pdf, but multiplied by the pdf (ans). pdf.defvjp(lambda g, ans, vs, gvs, x, mean, cov, allow_singular=False: unbroadcast(vs, gvs, -np.expand_dims(ans * g, 1) * solve(allow_singular)(cov, (x - mean).T).T), argnum=0) pdf.defvjp(lambda g, ans, vs, gvs, x, mean, cov, allow_singular=False: unbroadcast(vs, gvs, np.expand_dims(ans * g, 1) * solve(allow_singular)(cov, (x - mean).T).T), argnum=1) pdf.defvjp(lambda g, ans, vs, gvs, x, mean, cov, allow_singular=False: unbroadcast(vs, gvs, -np.reshape(ans * g, np.shape(g) + (1, 1)) * covgrad(x, mean, cov, allow_singular)), argnum=2) entropy.defvjp_is_zero(argnums=(0,)) entropy.defvjp(lambda g, ans, vs, gvs, mean, cov: unbroadcast(vs, gvs, 0.5 * g * np.linalg.inv(cov).T), argnum=1)
def generalized_outer_product(mat): if len(mat.shape) == 1: return np.outer(mat, mat) elif len(mat.shape) == 2: return np.einsum('ij,ik->ijk', mat, mat) else: raise ArithmeticError def covgrad(x, mean, cov): # I think once we have Cholesky we can make this nicer. solved = np.linalg.solve(cov, (x - mean).T).T return lower_half(np.linalg.inv(cov) - generalized_outer_product(solved)) logpdf.defvjp(lambda g, ans, vs, gvs, x, mean, cov: unbroadcast( vs, gvs, -np.expand_dims(g, 1) * np.linalg.solve(cov, (x - mean).T).T), argnum=0) logpdf.defvjp(lambda g, ans, vs, gvs, x, mean, cov: unbroadcast( vs, gvs, np.expand_dims(g, 1) * np.linalg.solve(cov, (x - mean).T).T), argnum=1) logpdf.defvjp(lambda g, ans, vs, gvs, x, mean, cov: unbroadcast( vs, gvs, -np.reshape(g, np.shape(g) + (1, 1)) * covgrad(x, mean, cov)), argnum=2) # Same as log pdf, but multiplied by the pdf (ans). pdf.defvjp(lambda g, ans, vs, gvs, x, mean, cov: unbroadcast( vs, gvs, -g * ans * np.linalg.solve(cov, x - mean)), argnum=0) pdf.defvjp(lambda g, ans, vs, gvs, x, mean, cov: unbroadcast(
"""Gradients of the normal distribution.""" from __future__ import absolute_import import scipy.stats import autograd.numpy as anp from autograd.core import primitive from autograd.numpy.numpy_grads import unbroadcast pdf = primitive(scipy.stats.norm.pdf) cdf = primitive(scipy.stats.norm.cdf) logpdf = primitive(scipy.stats.norm.logpdf) logcdf = primitive(scipy.stats.norm.logcdf) pdf.defvjp(lambda g, ans, vs, gvs, x, loc=0.0, scale=1.0: unbroadcast(vs, gvs, -g * ans * (x - loc) / scale**2)) pdf.defvjp(lambda g, ans, vs, gvs, x, loc=0.0, scale=1.0: unbroadcast(vs, gvs, g * ans * (x - loc) / scale**2), argnum=1) pdf.defvjp(lambda g, ans, vs, gvs, x, loc=0.0, scale=1.0: unbroadcast(vs, gvs, g * ans * (((x - loc)/scale)**2 - 1.0)/scale), argnum=2) cdf.defvjp(lambda g, ans, vs, gvs, x, loc=0.0, scale=1.0: unbroadcast(vs, gvs, g * pdf(x, loc, scale))) cdf.defvjp(lambda g, ans, vs, gvs, x, loc=0.0, scale=1.0: unbroadcast(vs, gvs, -g * pdf(x, loc, scale)), argnum=1) cdf.defvjp(lambda g, ans, vs, gvs, x, loc=0.0, scale=1.0: unbroadcast(vs, gvs, -g * pdf(x, loc, scale)*(x-loc)/scale), argnum=2) logpdf.defvjp(lambda g, ans, vs, gvs, x, loc=0.0, scale=1.0: unbroadcast(vs, gvs, -g * (x - loc) / scale**2)) logpdf.defvjp(lambda g, ans, vs, gvs, x, loc=0.0, scale=1.0: unbroadcast(vs, gvs, g * (x - loc) / scale**2), argnum=1) logpdf.defvjp(lambda g, ans, vs, gvs, x, loc=0.0, scale=1.0: unbroadcast(vs, gvs, g * (-1.0/scale + (x - loc)**2/scale**3)), argnum=2) logcdf.defvjp(lambda g, ans, vs, gvs, x, loc=0.0, scale=1.0: unbroadcast(vs, gvs, g * anp.exp(logpdf(x, loc, scale) - logcdf(x, loc, scale)))) logcdf.defvjp(lambda g, ans, vs, gvs, x, loc=0.0, scale=1.0: unbroadcast(vs, gvs, -g * anp.exp(logpdf(x, loc, scale) - logcdf(x, loc, scale))), argnum=1) logcdf.defvjp(lambda g, ans, vs, gvs, x, loc=0.0, scale=1.0: unbroadcast(vs, gvs, -g * anp.exp(logpdf(x, loc, scale) - logcdf(x, loc, scale))*(x-loc)/scale), argnum=2)
logcdf = primitive(scipy.stats.t.logcdf)


def grad_tlogpdf_diff(diff, df):
    """Derivative of the standardized t log-density w.r.t. its argument.

    ``diff`` is the standardized residual ``(x - loc) / scale`` and ``df``
    the degrees of freedom.
    """
    return -diff * (1.0 + df) / (diff**2 + df)


def grad_tlogpdf_x(x, df, loc, scale):
    """Derivative of the t log-density w.r.t. ``x`` (chain rule via 1/scale)."""
    return grad_tlogpdf_diff((x - loc) / scale, df) / scale


def grad_tlogpdf_loc(x, df, loc, scale):
    """Derivative of the t log-density w.r.t. ``loc``: minus the x-derivative."""
    return -grad_tlogpdf_diff((x - loc) / scale, df) / scale


def grad_tlogpdf_scale(x, df, loc, scale):
    """Derivative of the t log-density w.r.t. ``scale``."""
    diff = x - loc
    return -(df * (scale**2 - diff**2))/(scale * (df * scale**2 + diff**2))


def grad_tlogpdf_df(x, df, loc, scale):
    """Derivative of the t log-density w.r.t. the degrees of freedom ``df``.

    ``psi`` comes from file scope -- presumably the digamma function; confirm
    against the module imports.
    """
    y = (x - loc)/scale
    return 0.5 * ((y**2 * (df+1))/(df * (y**2 + df))
                  - np.log(y**2 / df + 1)
                  - 1.0/df
                  - psi(df/2.0)
                  + psi((df + 1)/2.0))


# pdf VJPs: the log-density gradients multiplied by the pdf value (ans).
pdf.defvjp(
    lambda g, ans, vs, gvs, x, df, loc=0.0, scale=1.0: unbroadcast(
        vs, gvs, g * ans * grad_tlogpdf_x(x, df, loc, scale)),
    argnum=0)
pdf.defvjp(
    lambda g, ans, vs, gvs, x, df, loc=0.0, scale=1.0: unbroadcast(
        vs, gvs, g * ans * grad_tlogpdf_df(x, df, loc, scale)),
    argnum=1)
pdf.defvjp(
    lambda g, ans, vs, gvs, x, df, loc=0.0, scale=1.0: unbroadcast(
        vs, gvs, g * ans * grad_tlogpdf_loc(x, df, loc, scale)),
    argnum=2)
pdf.defvjp(
    lambda g, ans, vs, gvs, x, df, loc=0.0, scale=1.0: unbroadcast(
        vs, gvs, g * ans * grad_tlogpdf_scale(x, df, loc, scale)),
    argnum=3)

# cdf VJPs.  With y = (x - loc) / scale the cdf is F(y; df), so
#   d/dx     =  pdf(x, df, loc, scale)
#   d/dloc   = -pdf(x, df, loc, scale)
#   d/dscale = -pdf(x, df, loc, scale) * (x - loc) / scale
cdf.defvjp(
    lambda g, ans, vs, gvs, x, df, loc=0.0, scale=1.0: unbroadcast(
        vs, gvs, g * pdf(x, df, loc, scale)),
    argnum=0)
cdf.defvjp(
    lambda g, ans, vs, gvs, x, df, loc=0.0, scale=1.0: unbroadcast(
        vs, gvs, -g * pdf(x, df, loc, scale)),
    argnum=2)
cdf.defvjp(
    lambda g, ans, vs, gvs, x, df, loc=0.0, scale=1.0: unbroadcast(
        vs, gvs, -g * pdf(x, df, loc, scale) * (x - loc) / scale),
    argnum=3)
# The gradient of the cdf wrt the degrees of freedom (argnum=1) is not
# defined here.
# logpdf VJPs: delegate to the grad_tlogpdf_* helpers, one per argument.
logpdf.defvjp(
    lambda g, ans, vs, gvs, x, df, loc=0.0, scale=1.0: unbroadcast(
        vs, gvs, g * grad_tlogpdf_x(x, df, loc, scale)),
    argnum=0)
logpdf.defvjp(
    lambda g, ans, vs, gvs, x, df, loc=0.0, scale=1.0: unbroadcast(
        vs, gvs, g * grad_tlogpdf_df(x, df, loc, scale)),
    argnum=1)
logpdf.defvjp(
    lambda g, ans, vs, gvs, x, df, loc=0.0, scale=1.0: unbroadcast(
        vs, gvs, g * grad_tlogpdf_loc(x, df, loc, scale)),
    argnum=2)
logpdf.defvjp(
    lambda g, ans, vs, gvs, x, df, loc=0.0, scale=1.0: unbroadcast(
        vs, gvs, g * grad_tlogpdf_scale(x, df, loc, scale)),
    argnum=3)

# logcdf VJPs.  d log(cdf) = d cdf / cdf, and pdf / cdf is evaluated as
# exp(logpdf - logcdf).  With y = (x - loc) / scale:
#   d/dx     =  pdf / cdf
#   d/dloc   = -pdf / cdf
#   d/dscale = -pdf / cdf * (x - loc) / scale
# No gradient wrt the degrees of freedom (argnum=1) is defined here.
logcdf.defvjp(
    lambda g, ans, vs, gvs, x, df, loc=0.0, scale=1.0: unbroadcast(
        vs, gvs,
        g * np.exp(logpdf(x, df, loc, scale) - logcdf(x, df, loc, scale))),
    argnum=0)
logcdf.defvjp(
    lambda g, ans, vs, gvs, x, df, loc=0.0, scale=1.0: unbroadcast(
        vs, gvs,
        -g * np.exp(logpdf(x, df, loc, scale) - logcdf(x, df, loc, scale))),
    argnum=2)
logcdf.defvjp(
    lambda g, ans, vs, gvs, x, df, loc=0.0, scale=1.0: unbroadcast(
        vs, gvs,
        -g * np.exp(logpdf(x, df, loc, scale) - logcdf(x, df, loc, scale))
        * (x - loc) / scale),
    argnum=3)