def func(chol_vec, delta):
    chol = at.stack(
        [
            at.stack([at.exp(0.1 * chol_vec[0]), 0]),
            at.stack([chol_vec[1], 2 * at.exp(chol_vec[2])]),
        ]
    )
    cov = at.dot(chol, chol.T)
    return MvNormalLogp()(cov, delta)
def two_gaussians(x):
    """Mixture of Gaussians log-likelihood."""
    log_like1 = (
        -0.5 * n * at.log(2 * np.pi)
        - 0.5 * at.log(dsigma)
        - 0.5 * (x - mu1).T.dot(isigma).dot(x - mu1)
    )
    log_like2 = (
        -0.5 * n * at.log(2 * np.pi)
        - 0.5 * at.log(dsigma)
        - 0.5 * (x - mu2).T.dot(isigma).dot(x - mu2)
    )
    return at.log(w1 * at.exp(log_like1) + w2 * at.exp(log_like2))
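For reference, the expression above computes (assuming `dsigma` and `isigma` hold the determinant and inverse of the shared covariance matrix and that `w1 + w2 = 1`):

$$
\log p(x) = \log\!\big( w_1 \mathcal{N}(x \mid \mu_1, \Sigma) + w_2 \mathcal{N}(x \mid \mu_2, \Sigma) \big),
\qquad
\log \mathcal{N}(x \mid \mu_i, \Sigma) = -\tfrac{n}{2}\log 2\pi - \tfrac12 \log\lvert\Sigma\rvert - \tfrac12 (x - \mu_i)^\top \Sigma^{-1} (x - \mu_i).
$$

Exponentiating the component log-likelihoods directly can underflow when both are very negative; a log-sum-exp formulation (see the `logsumexp` helper further below) would be more robust.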
def test_get_jaxified_logp():
    with pm.Model() as m:
        x = pm.Flat("x")
        y = pm.Flat("y")
        pm.Potential("pot", at.log(at.exp(x) + at.exp(y)))

    jax_fn = get_jaxified_logp(m)
    # This would underflow if not optimized
    assert not np.isinf(jax_fn((np.array(5000.0), np.array(5000.0))))
def backward(self, rv_var, rv_value):
    a, b = self.param_extract_fn(rv_var)

    if a is not None and b is not None:
        # Two-sided interval (a, b): scaled logistic transform
        sigmoid_x = at.sigmoid(rv_value)
        return sigmoid_x * b + (1 - sigmoid_x) * a
    elif a is not None:
        # Lower bound only: exp transform shifted by a
        return at.exp(rv_value) + a
    elif b is not None:
        # Upper bound only: reflected exp transform
        return b - at.exp(rv_value)
    else:
        # Unbounded: identity
        return rv_value
def two_gaussians(x):
    log_like1 = (
        -0.5 * n * aet.log(2 * np.pi)
        - 0.5 * aet.log(dsigma)
        - 0.5 * (x - mu1).T.dot(isigma).dot(x - mu1)
    )
    log_like2 = (
        -0.5 * n * aet.log(2 * np.pi)
        - 0.5 * aet.log(dsigma)
        - 0.5 * (x - mu2).T.dot(isigma).dot(x - mu2)
    )
    return aet.log(w1 * aet.exp(log_like1) + w2 * aet.exp(log_like2))
def test_HSStep_NegativeBinomial():
    np.random.seed(2032)

    M = 5
    N = 50
    X = np.random.normal(size=N * M).reshape((N, M))
    beta_true = np.array([1, 1, 2, 2, 0])
    y_nb = pm.NegativeBinomial.dist(np.exp(X.dot(beta_true)), 1).random()

    N_draws = 500
    with pm.Model():
        beta = HorseShoe("beta", tau=1, shape=M)
        pm.NegativeBinomial("y", mu=at.exp(beta.dot(X.T)), alpha=1, observed=y_nb)
        hsstep = HSStep([beta])
        trace = pm.sample(
            draws=N_draws,
            step=hsstep,
            chains=1,
            return_inferencedata=True,
            compute_convergence_checks=False,
        )

    beta_samples = trace.posterior["beta"][0].values

    assert beta_samples.shape == (N_draws, M)
    np.testing.assert_allclose(beta_samples.mean(0), beta_true, atol=0.5)

    with pm.Model():
        beta = HorseShoe("beta", tau=1, shape=M, testval=beta_true * 0.1)
        pm.NegativeBinomial("y", mu=beta.dot(np.abs(X.T)), alpha=1, observed=y_nb)
        hsstep = HSStep([beta])
        trace = pm.sample(
            draws=N_draws,
            step=hsstep,
            chains=1,
            return_inferencedata=True,
            compute_convergence_checks=False,
        )

    beta_samples = trace.posterior["beta"][0].values

    assert beta_samples.shape == (N_draws, M)

    with pm.Model():
        beta = HorseShoe("beta", tau=1, shape=M, testval=beta_true * 0.1)
        eta = pm.NegativeBinomial("eta", mu=beta.dot(X.T), alpha=1, shape=N)
        pm.Normal("y", mu=at.exp(eta), sigma=1, observed=y_nb)
        with pytest.raises(NotImplementedError):
            HSStep([beta])
def test_hessian(self):
    chol_vec = at.vector("chol_vec")
    chol_vec.tag.test_value = np.array([0.1, 2, 3])
    chol = at.stack(
        [
            at.stack([at.exp(0.1 * chol_vec[0]), 0]),
            at.stack([chol_vec[1], 2 * at.exp(chol_vec[2])]),
        ]
    )
    cov = at.dot(chol, chol.T)
    delta = at.matrix("delta")
    delta.tag.test_value = np.ones((5, 2))
    logp = MvNormalLogp()(cov, delta)
    g_cov, g_delta = at.grad(logp, [cov, delta])
    # No explicit assertion: the test only checks that second-order
    # gradients can be constructed without raising.
    at.grad(g_delta.sum() + g_cov.sum(), [delta, cov])
def test_hessian(self):
    chol_vec = at.vector("chol_vec")
    chol_vec.tag.test_value = floatX(np.array([0.1, 2, 3]))
    chol = at.stack(
        [
            at.stack([at.exp(0.1 * chol_vec[0]), 0]),
            at.stack([chol_vec[1], 2 * at.exp(chol_vec[2])]),
        ]
    )
    cov = at.dot(chol, chol.T)
    delta = at.matrix("delta")
    delta.tag.test_value = floatX(np.ones((5, 2)))
    logp = MvNormalLogp()(cov, delta)
    g_cov, g_delta = at.grad(logp, [cov, delta])
    # TODO: What's the test? Something needs to be asserted.
    at.grad(g_delta.sum() + g_cov.sum(), [delta, cov])
def forward(self, rv_var, rv_value):
    """Inverse operation of softplus.

    y = Log(Exp(x) - 1)
      = Log(1 - Exp(-x)) + x
    """
    return at.log(1.0 - at.exp(-rv_value)) + rv_value
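The identity in the docstring follows from factoring e^x out of e^x - 1 (a one-line derivation, not part of the original code):

$$
\log\!\left(e^{x} - 1\right) = \log\!\left(e^{x}\bigl(1 - e^{-x}\bigr)\right) = x + \log\!\left(1 - e^{-x}\right),
$$

which avoids evaluating e^x directly and therefore cannot overflow for large x.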
def __call__(self, X):
    XY = X.dot(X.T)
    x2 = at.sum(X**2, axis=1).dimshuffle(0, "x")
    X2e = at.repeat(x2, X.shape[0], axis=1)
    H = X2e + X2e.T - 2.0 * XY

    V = at.sort(H.flatten())
    length = V.shape[0]
    # median distance
    m = at.switch(
        at.eq((length % 2), 0),
        # if even vector
        at.mean(V[((length // 2) - 1) : ((length // 2) + 1)]),
        # if odd vector
        V[length // 2],
    )
    h = 0.5 * m / at.log(floatX(H.shape[0]) + floatX(1))

    # RBF
    Kxy = at.exp(-H / h / 2.0)

    # Derivative
    dxkxy = -at.dot(Kxy, X)
    sumkxy = at.sum(Kxy, axis=-1, keepdims=True)
    dxkxy = at.add(dxkxy, at.mul(X, sumkxy)) / h

    return Kxy, dxkxy
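In formulas, the kernel built above is an RBF kernel with a median-distance bandwidth (a transcription of the code, writing n for the number of rows of X):

$$
K_{ij} = \exp\!\left(-\frac{\lVert x_i - x_j \rVert^{2}}{2h}\right),
\qquad
h = \frac{\operatorname{median}_{i,j} \lVert x_i - x_j \rVert^{2}}{2 \log(n + 1)},
$$

where the median is taken over all entries of the squared-distance matrix H (diagonal zeros included). Row $i$ of `dxkxy` works out to $\sum_j K_{ij}(x_i - x_j)/h = \sum_j \nabla_{x_j} K_{ij}$, the kernel-gradient sum that appears in Stein variational updates.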
def test_lop_override(self, cls_ofg):
    x = tt.vector()
    y = 1.0 / (1.0 + tt.exp(-x))

    def lop_ov(inps, outs, grads):
        (y_,) = outs
        (dedy_,) = grads
        return [2.0 * y_ * (1.0 - y_) * dedy_]

    y_, dedy = tt.vector(), tt.vector()
    op_lop_ov = cls_ofg([x, y_, dedy], [2.0 * y_ * (1.0 - y_) * dedy])

    xx = tt.vector()
    yy1 = tt.sum(tt.nnet.sigmoid(xx))
    gyy1 = 2.0 * tt.grad(yy1, xx)

    for ov in [lop_ov, op_lop_ov]:
        op = cls_ofg([x], [y], lop_overrides=ov)
        yy2 = tt.sum(op(xx))
        gyy2 = tt.grad(yy2, xx)
        fn = function([xx], [gyy1, gyy2])

        xval = np.random.rand(32).astype(config.floatX)
        y1val, y2val = fn(xval)
        assert np.allclose(y1val, y2val)
def logsumexp(x, axis=None, keepdims=True):
    # Adapted from https://github.com/Theano/Theano/issues/1563
    x_max = aet.max(x, axis=axis, keepdims=True)
    x_max = aet.switch(aet.isinf(x_max), 0, x_max)
    res = aet.log(aet.sum(aet.exp(x - x_max), axis=axis, keepdims=True)) + x_max
    return res if keepdims else res.squeeze()
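A minimal usage sketch (it assumes Aesara and SciPy are installed; the variable names below are illustrative only):

import aesara
import aesara.tensor as aet
import numpy as np
from scipy.special import logsumexp as scipy_logsumexp

# Compile the helper above and compare it against SciPy's reference implementation.
x = aet.matrix("x")
f = aesara.function([x], logsumexp(x, axis=1))

xval = np.random.default_rng(0).normal(size=(3, 4)).astype(aesara.config.floatX)
np.testing.assert_allclose(f(xval), scipy_logsumexp(xval, axis=1, keepdims=True), rtol=1e-4)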
def test_HSStep_NegativeBinomial_sparse():
    np.random.seed(2032)

    M = 5
    N = 50
    X = np.random.normal(size=N * M).reshape((N, M))
    beta_true = np.array([1, 1, 2, 2, 0])
    y_nb = pm.NegativeBinomial.dist(np.exp(X.dot(beta_true)), 1).random()

    X = sp.sparse.csr_matrix(X)

    N_draws = 500
    with pm.Model():
        beta = HorseShoe("beta", tau=1, shape=M)
        pm.NegativeBinomial(
            "y", mu=at.exp(sp_dot(X, at.shape_padright(beta))), alpha=1, observed=y_nb
        )
        hsstep = HSStep([beta])
        trace = pm.sample(
            draws=N_draws,
            step=hsstep,
            chains=1,
            return_inferencedata=True,
            compute_convergence_checks=False,
        )

    beta_samples = trace.posterior["beta"][0].values

    assert beta_samples.shape == (N_draws, M)
    np.testing.assert_allclose(beta_samples.mean(0), beta_true, atol=0.5)
def incomplete_beta_ps(a, b, value):
    """Power series for incomplete beta.

    Use when b * x is small and value not too close to 1.
    Based on the Cephes library by Steve Moshier (incbet.c).
    """
    one = aet.constant(1, dtype="float64")
    ai = one / a
    u = (one - b) * value
    t1 = u / (a + one)
    t = u
    threshold = np.MachAr().eps * ai
    s = aet.constant(0, dtype="float64")

    def _step(i, t, s):
        t *= (i - b) * value / i
        step = t / (a + i)
        s += step
        return ((t, s), until(aet.abs_(step) < threshold))

    (t, s), _ = scan(
        _step,
        sequences=[aet.arange(2, 302)],
        outputs_info=[e for e in aet.cast((t, s), "float64")],
    )

    s = s[-1] + t1 + ai

    t = gammaln(a + b) - gammaln(a) - gammaln(b) + a * aet.log(value) + aet.log(s)
    return aet.exp(t)
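Written out, the series evaluated above is (a transcription of the code, with the rising factorial $(1-b)_n = (1-b)(2-b)\cdots(n-b)$ and $(1-b)_0 = 1$):

$$
I_x(a, b) \approx \frac{\Gamma(a+b)}{\Gamma(a)\,\Gamma(b)}\, x^{a}
\sum_{n=0}^{N} \frac{(1-b)_n}{n!\,(a+n)}\, x^{n},
$$

where the n = 0 and n = 1 terms correspond to `ai` and `t1`, the `scan` adds at most 300 further terms, and the loop stops early once a term drops below `np.MachAr().eps / a`.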
def forward(self, value, *inputs):
    """Inverse operation of softplus.

    y = Log(Exp(x) - 1)
      = Log(1 - Exp(-x)) + x
    """
    return at.log(1.0 - at.exp(-value)) + value
def test_logp_helper_exceptions():
    with pytest.raises(TypeError, match="When RV is not a pure distribution"):
        logp(at.exp(Normal.dist()), [1, 2])

    with pytest.raises(NotImplementedError, match="PyMC could not infer logp of input variable"):
        logp(at.cos(Normal.dist()), 1)
def backward(self, y_):
    y = y_.T
    y = aet.concatenate([y, -aet.sum(y, 0, keepdims=True)])
    # "softmax" with vector support and no deprecation warning:
    e_y = aet.exp(y - aet.max(y, 0, keepdims=True))
    x = e_y / aet.sum(e_y, 0, keepdims=True)
    return floatX(x.T)
def forward(self, x):
    """Inverse operation of softplus.

    y = Log(Exp(x) - 1)
      = Log(1 - Exp(-x)) + x
    """
    return aet.log(1.0 - aet.exp(-x)) + x
def full(self, X, Xs=None):
    X, Xs = self._slice(X, Xs)
    if Xs is None:
        Xs = X
    f1 = X.dimshuffle(0, "x", 1)
    f2 = Xs.dimshuffle("x", 0, 1)
    r = np.pi * (f1 - f2) / self.period
    r = at.sum(at.square(at.sin(r) / self.ls), 2)
    return at.exp(-0.5 * r)
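For reference, the covariance returned above is (a transcription of the code, writing $T$ for `self.period` and $\ell_d$ for the per-dimension entries of `self.ls`):

$$
k(\mathbf{x}, \mathbf{x}') = \exp\!\left(-\frac{1}{2} \sum_{d}
\left(\frac{\sin\!\big(\pi (x_d - x'_d) / T\big)}{\ell_d}\right)^{2}\right).
$$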
def test_get_jaxified_graph():
    # Check that jaxifying a graph does not emit the Supervisor Warning. This test can
    # be removed once https://github.com/aesara-devs/aesara/issues/637 is sorted.
    x = at.scalar("x")
    y = at.exp(x)
    with pytest.warns(None) as record:
        fn = get_jaxified_graph(inputs=[x], outputs=[y])
    assert not record
    assert np.isclose(fn(0), 1)
def log_diff_normal_cdf(mu, sigma, x, y):
    """
    Compute :math:`\\log(\\Phi(\\frac{x - \\mu}{\\sigma}) - \\Phi(\\frac{y - \\mu}{\\sigma}))` safely in log space.

    Parameters
    ----------
    mu: float
        mean
    sigma: float
        std
    x: float
    y: float
        must be strictly less than x.

    Returns
    -------
    log (\\Phi(x) - \\Phi(y))
    """
    x = (x - mu) / sigma / aet.sqrt(2.0)
    y = (y - mu) / sigma / aet.sqrt(2.0)

    # To stabilize the computation, consider these three regions:
    # 1) x > y > 0 => Use erf(x) = 1 - e^{-x^2} erfcx(x) and erf(y) = 1 - e^{-y^2} erfcx(y)
    # 2) 0 > x > y => Use erf(x) = e^{-x^2} erfcx(-x) - 1 and erf(y) = e^{-y^2} erfcx(-y) - 1
    #    (the constants cancel in the difference)
    # 3) x > 0 > y => Naive formula log( (erf(x) - erf(y)) / 2 ) works fine.
    return aet.log(0.5) + aet.switch(
        aet.gt(y, 0),
        -aet.square(y)
        + aet.log(aet.erfcx(y) - aet.exp(aet.square(y) - aet.square(x)) * aet.erfcx(x)),
        aet.switch(
            aet.lt(x, 0),  # 0 > x > y
            -aet.square(x)
            + aet.log(aet.erfcx(-x) - aet.exp(aet.square(x) - aet.square(y)) * aet.erfcx(-y)),
            aet.log(aet.erf(x) - aet.erf(y)),  # x > 0 > y
        ),
    )
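A quick numerical sanity check against SciPy (a sketch only; it assumes SciPy is available and uses arbitrary values that exercise the x > 0 > y branch):

import aesara
import numpy as np
from scipy.stats import norm

# Evaluate the log-space difference for fixed constants and compare it with the
# naive computation in probability space, which is safe for these values.
mu, sigma, upper, lower = 0.0, 1.0, 1.5, -0.5
f = aesara.function([], log_diff_normal_cdf(mu, sigma, upper, lower))
ref = np.log(norm.cdf(upper, loc=mu, scale=sigma) - norm.cdf(lower, loc=mu, scale=sigma))
np.testing.assert_allclose(f(), ref, rtol=1e-6)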
def full(self, X, Xs=None):
    X, Xs = self._slice(X, Xs)
    rx = self.lfunc(at.as_tensor_variable(X), self.args)
    if Xs is None:
        rz = self.lfunc(at.as_tensor_variable(X), self.args)
        r2 = self.square_dist(X, X)
    else:
        rz = self.lfunc(at.as_tensor_variable(Xs), self.args)
        r2 = self.square_dist(X, Xs)
    rx2 = at.reshape(at.square(rx), (-1, 1))
    rz2 = at.reshape(at.square(rz), (1, -1))
    return at.sqrt((2.0 * at.outer(rx, rz)) / (rx2 + rz2)) * at.exp(-1.0 * r2 / (rx2 + rz2))
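Writing $\ell(\cdot)$ for the length-scale function `self.lfunc` and $r^2$ for the squared distance returned by `self.square_dist`, the covariance above is (a transcription of the code; it matches the Gibbs non-stationary kernel in one input dimension):

$$
k(x, x') = \sqrt{\frac{2\,\ell(x)\,\ell(x')}{\ell(x)^{2} + \ell(x')^{2}}}\;
\exp\!\left(-\frac{r^{2}}{\ell(x)^{2} + \ell(x')^{2}}\right).
$$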
def backward(self, rv_var, rv_value):
    if rv_var.broadcastable[-1]:
        # If this variable is just a bunch of scalars/degenerate
        # Dirichlets, we can't transform it
        return rv_value

    y = rv_value.T
    y = at.concatenate([y, -at.sum(y, 0, keepdims=True)])
    # "softmax" with vector support and no deprecation warning:
    e_y = at.exp(y - at.max(y, 0, keepdims=True))
    x = e_y / at.sum(e_y, 0, keepdims=True)
    return floatX(x.T)
def test_local_sigm_times_exp(self):
    # Test the `local_sigm_times_exp` optimization.
    # exp(x) * sigm(-x) -> sigm(x)
    # exp(-x) * sigm(x) -> sigm(-x)

    def match(func, ops):
        # print [node.op.scalar_op for node in func.maker.fgraph.toposort()]
        assert [node.op for node in func.maker.fgraph.toposort()] == ops

    m = self.get_mode(excluding=["local_elemwise_fusion", "inplace"])
    x, y = tt.vectors("x", "y")

    f = aesara.function([x], sigmoid(-x) * tt.exp(x), mode=m)
    match(f, [sigmoid])
    assert check_stack_trace(f, ops_to_check=sigmoid)

    f = aesara.function([x], sigmoid(x) * tt.exp(-x), mode=m)
    match(f, [tt.neg, sigmoid])
    assert check_stack_trace(f, ops_to_check=sigmoid)

    f = aesara.function([x], -(-(-(sigmoid(x)))) * tt.exp(-x), mode=m)
    match(f, [tt.neg, sigmoid, tt.neg])
    # assert check_stack_trace(f, ops_to_check=sigmoid)

    f = aesara.function(
        [x, y],
        (sigmoid(x) * sigmoid(-y) * -tt.exp(-x) * tt.exp(x * y) * tt.exp(y)),
        mode=m,
    )
    topo = f.maker.fgraph.toposort()
    for op, nb in [(sigmoid, 2), (tt.mul, 2), (tt.neg, 1), (tt.exp, 1)]:
        assert sum([n.op == op for n in topo]) == nb
def grad(self, inputs, cost_grad):
    """
    In defining the gradient, the Finite Fourier Transform is viewed as
    a complex-differentiable function of a complex variable.
    """
    a = inputs[0]
    n = inputs[1]
    axis = inputs[2]
    grad = cost_grad[0]
    if not isinstance(axis, tensor.TensorConstant):
        raise NotImplementedError(
            "%s: gradient is currently implemented"
            " only for axis being an Aesara constant" % self.__class__.__name__
        )
    axis = int(axis.data)
    # notice that the number of actual elements in wrto is independent of
    # possible padding or truncation:
    elem = tensor.arange(0, tensor.shape(a)[axis], 1)
    # accounts for padding:
    freq = tensor.arange(0, n, 1)
    outer = tensor.outer(freq, elem)
    pow_outer = tensor.exp(((-2 * math.pi * 1j) * outer) / (1.0 * n))
    res = tensor.tensordot(grad, pow_outer, (axis, 0))

    # This would be simpler but is not implemented by aesara:
    # res = tensor.switch(tensor.lt(n, tensor.shape(a)[axis]),
    #     tensor.set_subtensor(res[..., n::], 0, False, False), res)

    # Instead we resort to the following to account for truncation:
    flip_shape = list(np.arange(0, a.ndim)[::-1])
    res = res.dimshuffle(flip_shape)
    res = tensor.switch(
        tensor.lt(n, tensor.shape(a)[axis]),
        tensor.set_subtensor(res[n::,], 0, False, False),
        res,
    )
    res = res.dimshuffle(flip_shape)

    # ensures that the gradient shape conforms to the input shape:
    out_shape = (
        list(np.arange(0, axis)) + [a.ndim - 1] + list(np.arange(axis, a.ndim - 1))
    )
    res = res.dimshuffle(*out_shape)
    return [res, None, None]
def test_1msigmoid(self):
    if not register_local_1msigmoid:
        return

    m = self.get_mode()
    x = tt.fmatrix()

    # tests exp_over_1_plus_exp
    f = aesara.function([x], 1 - tt.exp(x) / (1 + tt.exp(x)), mode=m)
    assert check_stack_trace(f, ops_to_check=[tt.neg, sigmoid_inplace])
    assert [node.op for node in f.maker.fgraph.toposort()] == [
        tt.neg,
        sigmoid_inplace,
    ]

    # tests inv_1_plus_exp
    f = aesara.function([x], 1 - tt.fill(x, 1.0) / (1 + tt.exp(-x)), mode=m)
    assert check_stack_trace(f, ops_to_check=[tt.neg, sigmoid_inplace])
    assert [node.op for node in f.maker.fgraph.toposort()] == [
        tt.neg,
        sigmoid_inplace,
    ]
def test_DownsampleFactorMax_hessian(self):
    # Example provided by Frans Cronje, see
    # https://groups.google.com/d/msg/theano-users/qpqUy_3glhw/JMwIvlN5wX4J
    x_vec = vector("x")
    z = aet.dot(x_vec.dimshuffle(0, "x"), x_vec.dimshuffle("x", 0))
    y = pool_2d(input=z, ws=(2, 2), ignore_border=True)
    C = aet.exp(aet_sum(y))
    grad_hess = aesara.gradient.hessian(cost=C, wrt=x_vec)
    fn_hess = function(inputs=[x_vec], outputs=grad_hess)

    # The value has been manually computed from the theoretical gradient,
    # and confirmed by the implementation.
    assert np.allclose(fn_hess([1, 2]), [[0.0, 0.0], [0.0, 982.7667]])
def test_log1pexp_to_softplus(self):
    m = aesara.config.mode
    if m == "FAST_COMPILE":
        m = "FAST_RUN"

    x = tt.vector()

    out = tt.log(1 + tt.exp(x))
    f = aesara.function([x], out, mode=self.m)

    # Fix ticket #4581 first
    # assert check_stack_trace(f, ops_to_check='all')
    topo = f.maker.fgraph.toposort()
    assert len(topo) == 1
    assert isinstance(topo[0].op.scalar_op, ScalarSoftplus)
    f(np.random.rand(54).astype(config.floatX))
def log1mexp(x):
    r"""Return log(1 - exp(-x)).

    This function is numerically more stable than the naive approach.

    For details, see
    https://cran.r-project.org/web/packages/Rmpfr/vignettes/log1mexp-note.pdf

    References
    ----------
    .. [Machler2012] Martin Mächler (2012).
        "Accurately computing `\log(1-\exp(- \mid a \mid))` Assessed by the Rmpfr package"
    """
    return at.switch(at.lt(x, 0.6931471805599453), at.log(-at.expm1(-x)), at.log1p(-at.exp(-x)))
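The cutoff 0.6931471805599453 is log 2, the switch point recommended in the Mächler note cited above. A small usage sketch (it assumes Aesara is importable and `at` is its tensor module, as in the snippet):

import aesara
import aesara.tensor as at
import numpy as np

x = at.dvector("x")
f = aesara.function([x], log1mexp(x))

xval = np.array([1e-20, 0.1, 1.0, 50.0])
# For tiny x the naive formula log(1 - exp(-x)) underflows to log(0) = -inf,
# while log1mexp switches to log(-expm1(-x)) and stays finite and accurate.
print(f(xval))
print(np.log(1.0 - np.exp(-xval)))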
def log_normal(x, mean, **kwargs):
    """
    Calculate the logarithm of the normal density at point `x`
    for a given `mean` and standard deviation.

    Parameters
    ----------
    x: Tensor
        point of evaluation
    mean: Tensor
        mean of normal distribution
    kwargs: one of parameters `{sigma, tau, w, rho}`

    Notes
    -----
    There are four variants for parametrizing the density. They are:
        1) standard deviation - `std`
        2) `w`, logarithm of `std` :math:`w = log(std)`
        3) `rho` that follows this equation :math:`rho = log(exp(std) - 1)`
        4) `tau` that follows this equation :math:`tau = std^{-1}`
    """
    sigma = kwargs.get("sigma")
    w = kwargs.get("w")
    rho = kwargs.get("rho")
    tau = kwargs.get("tau")
    eps = kwargs.get("eps", 0.0)
    check = sum(map(lambda a: a is not None, [sigma, w, rho, tau]))
    if check > 1:
        raise ValueError("more than one of the required kwargs was passed")
    if check == 0:
        raise ValueError("none of the required kwargs was passed")
    if sigma is not None:
        std = sigma
    elif w is not None:
        std = aet.exp(w)
    elif rho is not None:
        std = rho2sigma(rho)
    else:
        std = tau ** (-1)
    std += f(eps)
    return f(c) - aet.log(aet.abs_(std)) - (x - mean) ** 2 / (2.0 * std ** 2)
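This excerpt does not show the module-level helpers `f` and `c`; assuming `f` is a cast to the working float precision and `c` is the constant $-\tfrac12\log 2\pi$ (an assumption consistent with the docstring, not confirmed by the code shown), the return value is the usual Gaussian log-density

$$
\log \mathcal{N}(x \mid \mu, \sigma) = -\tfrac12 \log 2\pi - \log \sigma - \frac{(x - \mu)^{2}}{2\sigma^{2}},
$$

with $\sigma$ recovered from whichever of `sigma`, `w`, `rho`, or `tau` was supplied, plus the optional jitter `eps`.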