def var_expectedstats(natparam):
    # Returns E_{q(v)}[\eta_z(V)] where \eta_z(V)_i = ln(V_i) + \sum_{j < i} ln(1 - V_j).
    # q is truncated to level T with q(v_T) = 1, but p is not truncated; terms beyond T
    # do not feature in the calculations.
    # natparam has shape (T-1) x 2 and refers to \gamma_{t, 0/1}.
    # The returned array has shape (T,).
    return np.append(
        np.array([
            digamma(natparam[i, 0]) - digamma(natparam[i, 0] + natparam[i, 1])
            + np.sum(np.array([digamma(natparam[j, 1]) - digamma(natparam[j, 0] + natparam[j, 1])
                               for j in range(i)]))
            for i in range(natparam.shape[0])]),
        np.sum(digamma(natparam[:, 1]) - digamma(natparam[:, 0] + natparam[:, 1])))
def beta_entropy(params):
    # Entropy of a factorized Beta(alpha, beta) distribution, summed over all factors.
    alpha = np.exp(params["log_alpha"])
    beta = np.exp(params["log_beta"])
    return np.sum(
        betaln(alpha, beta)
        - (alpha - 1.0) * (digamma(alpha) - digamma(alpha + beta))
        - (beta - 1.0) * (digamma(beta) - digamma(alpha + beta)))
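# Hedged usage sketch, not part of the original source: beta_entropy above is the standard
# Beta entropy summed over factors, so it should agree with scipy.stats.beta(a, b).entropy()
# summed elementwise (assuming np / betaln / digamma above come from numpy and scipy.special).
def _check_beta_entropy():
    import numpy as onp
    from scipy.stats import beta as beta_dist
    a = onp.array([0.5, 2.0, 3.3])
    b = onp.array([1.5, 0.7, 4.0])
    reference = sum(beta_dist(ai, bi).entropy() for ai, bi in zip(a, b))
    assert onp.allclose(beta_entropy({"log_alpha": onp.log(a), "log_beta": onp.log(b)}),
                        reference)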
def log_marginal_likelihood(paramslin, x, z, pij, pij_flatten, pij0sum, run_time, taus,
                            gamma, alpha, precomp):
    params = util.unlinearise_params(paramslin, verbose=0)
    d, nz = z.shape
    nx = x.shape[1]
    s = params.L @ params.L.T + __nugget(params.L.shape[1])
    eqn15sum = (params.m.T @ precomp.Kzzinv_psi_sum_Kzzinv @ params.m)[0, 0]
    eqn16a = np.trace(precomp.Kzzinv_psi_sum)
    eqn16b = np.trace(precomp.Kzzinv_psi_sum_Kzzinv @ s)
    eqn16sum = gamma * np.sum((run_time - x[0]) ** d) - eqn16a + eqn16b
    mutilde = (precomp.Kzzinv_kzx.T @ params.m).flatten()
    sigmaa = precomp.sigmas
    sigmab = np.sum(precomp.Kxz * precomp.Kzzinv_kzx.T, axis=1)
    sigmac = np.sum((params.L.T @ precomp.Kzzinv_kzx) ** 2, axis=0)
    sigmatilde = sigmaa - sigmab + sigmac
    eqn19a, eqn19b, eqn19c = expected_log_f2(mutilde, np.sqrt(sigmatilde))
    eqn19sum = -(eqn19c + eqn19a + eqn19b) @ pij_flatten
    ppij = pij[pij > 0]
    total = eqn15sum + eqn16sum + eqn19sum + run_time * params.shape * params.scale - \
        pij0sum * (special.digamma(params.shape) + np.log(params.scale)) + ppij @ np.log(ppij)
    return -total
def _m_step_nu(self, expectations, datas, inputs, masks, tags):
    """
    The shape parameter nu determines a gamma prior.  We have

        tau_n ~ Gamma(nu/2, nu/2)
        y_n ~ N(mu, sigma^2 / tau_n)

    To update nu, we do EM and optimize the expected log likelihood using
    a generalized Newton's method.  See the notebook in doc/students_t for
    complete details.
    """
    K, D = self.K, self.D

    # Compute the precisions w for each data point
    E_taus = np.zeros(K)
    E_logtaus = np.zeros(K)
    weights = np.zeros(K)
    for y, (Ez, _, _) in zip(datas, expectations):
        # nu: (K,)  mus: (K, D)  sigmas: (K, D)  y: (T, D)  ->  alpha/beta: (T, K, D)
        nus = np.exp(self.inv_nus[:, None])
        alpha = nus / 2 + 1 / 2
        beta = nus / 2 + 1 / 2 * (y[:, None, :] - self.mus) ** 2 / np.exp(self.inv_sigmas)

        E_taus += np.sum(Ez[:, :, None] * alpha / beta, axis=(0, 2))
        E_logtaus += np.sum(Ez[:, :, None] * (digamma(alpha) - np.log(beta)), axis=(0, 2))
        weights += np.sum(Ez, axis=0) * D

    E_taus /= weights
    E_logtaus /= weights

    for k in range(K):
        self.inv_nus[k] = np.log(generalized_newton_studentst_dof(E_taus[k], E_logtaus[k]))
def objective(paramslin, x, z, pij_flatten, pij0sum, run_time, taus, gamma, alpha,
              g0_params, precomp):
    params = util.unlinearise_params(paramslin, verbose=0)
    d, nz = z.shape
    nx = x.shape[1]
    kzzinv_m = precomp.Kzzinv @ params.m
    s = params.L @ params.L.T + __nugget(params.L.shape[1])
    eqn15sum = (params.m.T @ precomp.Kzzinv_psi_sum_Kzzinv @ params.m)[0, 0]
    eqn16a = np.trace(precomp.Kzzinv_psi_sum)
    eqn16b = np.trace(precomp.Kzzinv_psi_sum_Kzzinv @ s)
    eqn16sum = gamma * np.sum((run_time - x[0]) ** d) - eqn16a + eqn16b
    mutilde = (precomp.Kzzinv_kzx.T @ params.m).flatten()
    sigmaa = precomp.sigmas
    sigmab = np.sum(precomp.Kxz * precomp.Kzzinv_kzx.T, axis=1)
    sigmac = np.sum((params.L.T @ precomp.Kzzinv_kzx) ** 2, axis=0)
    sigmatilde = sigmaa - sigmab + sigmac
    eqn19a, eqn19b, eqn19c = expected_log_f2(mutilde, np.sqrt(sigmatilde))
    eqn19sum = -(eqn19c + eqn19a + eqn19b) @ pij_flatten
    kl_normal = kl_tril(params.L, params.m, precomp.Lzz, 0)
    kl_g = kl_gamma(params.scale, params.shape, g0_params['scale'], g0_params['shape'])
    total = kl_normal + kl_g + eqn15sum + eqn16sum + eqn19sum + \
        run_time * params.shape * params.scale - \
        pij0sum * (special.digamma(params.shape) + np.log(params.scale))
    return total
def _m_step_nu(self, expectations, datas, inputs, masks, tags, optimizer, num_iters, **kwargs):
    K, D = self.K, self.D
    E_taus = np.zeros(K)
    E_logtaus = np.zeros(K)
    weights = np.zeros(K)
    for (Ez, _, _), data, input, mask, tag in zip(expectations, datas, inputs, masks, tags):
        # nu: (K,)  mus: (K, D)  sigmas: (K, D)  y: (T, D)  ->  w: (T, K, D)
        mus = self._compute_mus(data, input, mask, tag)
        sigmas = self._compute_sigmas(data, input, mask, tag)
        nus = np.exp(self.inv_nus[:, None])
        alpha = nus / 2 + 1 / 2
        beta = nus / 2 + 1 / 2 * (data[:, None, :] - mus) ** 2 / sigmas

        E_taus += np.sum(Ez[:, :, None] * alpha / beta, axis=(0, 2))
        E_logtaus += np.sum(Ez[:, :, None] * (digamma(alpha) - np.log(beta)), axis=(0, 2))
        weights += np.sum(Ez, axis=0) * D

    E_taus /= weights
    E_logtaus /= weights

    for k in range(K):
        self.inv_nus[k] = np.log(generalized_newton_studentst_dof(E_taus[k], E_logtaus[k]))
def gamma_grad_logq(epsilon, alpha):
    """ Gradient of log-Gamma at proposed value. """
    h_val = gamma_h(epsilon, alpha)
    h_der = gamma_grad_h(epsilon, alpha)
    return np.log(h_val) + (alpha - 1.) * h_der / h_val - h_der - sp.digamma(alpha)
def expectedstats(natparam, fudge=1e-8):
    S, m, kappa, nu = natural_to_standard(natparam)
    d = m.shape[-1]

    E_J = nu[..., None, None] * symmetrize(np.linalg.inv(S)) + fudge * np.eye(d)
    E_h = np.matmul(E_J, m[..., None])[..., 0]
    E_hTJinvh = d / kappa + np.matmul(m[..., None, :], E_h[..., None])[..., 0, 0]
    E_logdetJ = (np.sum(digamma((nu[..., None] - np.arange(d)[None, ...]) / 2.), -1)
                 + d * np.log(2.)) - np.linalg.slogdet(S)[1]

    return pack_dense(-1. / 2 * E_J, E_h, -1. / 2 * E_hTJinvh, 1. / 2 * E_logdetJ)
def grad_logQ(sample, alpha, m):
    """ Evaluates the gradient of the log of the variational approximation, vectorized. """
    gradient = np.zeros((alpha.shape[0], 2))
    gradient[:, 0] = np.log(alpha) - np.log(m) + 1. + np.log(sample) - sample / m
    gradient[:, 0] -= sp.digamma(alpha)
    gradient[:, 1] = -alpha / m + alpha * sample / m ** 2
    return gradient
def expectedstats(natparam, fudge=1e-8):
    S, m, kappa, nu = natural_to_standard(natparam)
    d = m.shape[-1]

    E_J = nu[..., None, None] * symmetrize(np.linalg.inv(S)) + fudge * np.eye(d)
    E_h = np.matmul(E_J, m[..., None])[..., 0]
    E_hTJinvh = d / kappa + np.matmul(m[..., None, :], E_h[..., None])[..., 0, 0]
    E_logdetJ = (np.sum(digamma((nu[..., None] - np.arange(d)[None, ...]) / 2.), -1)
                 + d * np.log(2.)) - np.linalg.slogdet(S)[1]

    return pack_dense(-1. / 2 * E_J, E_h, -1. / 2 * E_hTJinvh, 1. / 2 * E_logdetJ)
def expectedstats_standard(nu, S, M, K, fudge=1e-8):
    m = M.shape[0]
    E_Sigmainv = nu * symmetrize(np.linalg.inv(S)) + fudge * np.eye(S.shape[0])
    E_Sigmainv_A = nu * np.linalg.solve(S, M)
    E_AT_Sigmainv_A = m * K + nu * symmetrize(np.dot(M.T, np.linalg.solve(S, M))) \
        + fudge * np.eye(K.shape[0])
    E_logdetSigmainv = digamma((nu - np.arange(m)) / 2.).sum() \
        + m * np.log(2) - np.linalg.slogdet(S)[1]

    assert is_posdef(E_Sigmainv)
    assert is_posdef(E_AT_Sigmainv_A)

    return make_tuple(
        -1. / 2 * E_AT_Sigmainv_A, E_Sigmainv_A.T, -1. / 2 * E_Sigmainv, 1. / 2 * E_logdetSigmainv)
def expectedstats_standard(nu, S, M, K, fudge=1e-8):
    m = M.shape[0]
    E_Sigmainv = nu * symmetrize(np.linalg.inv(S)) + fudge * np.eye(S.shape[0])
    E_Sigmainv_A = nu * np.linalg.solve(S, M)
    E_AT_Sigmainv_A = m * K + nu * symmetrize(np.dot(M.T, np.linalg.solve(S, M))) \
        + fudge * np.eye(K.shape[0])
    E_logdetSigmainv = digamma((nu - np.arange(m)) / 2.).sum() \
        + m * np.log(2) - np.linalg.slogdet(S)[1]

    assert is_posdef(E_Sigmainv)
    assert is_posdef(E_AT_Sigmainv_A)

    return tuple_((-1. / 2 * E_AT_Sigmainv_A, E_Sigmainv_A.T,
                   -1. / 2 * E_Sigmainv, 1. / 2 * E_logdetSigmainv))
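# Hedged sanity sketch, not part of the original source: the E_logdetSigmainv term above is
# the standard Wishart identity E[log det Lambda] = sum_i psi((nu - i)/2) + m*log 2 - log|S|
# for Lambda ~ Wishart(df=nu, scale=inv(S)).  A quick Monte Carlo check with scipy:
def _check_wishart_logdet(nu=10.0, m=3, n_samples=50000, seed=0):
    import numpy as onp
    from scipy.stats import wishart
    from scipy.special import digamma as psi
    rng = onp.random.RandomState(seed)
    A = rng.randn(m, m)
    S = A @ A.T + m * onp.eye(m)  # an arbitrary positive-definite scale matrix
    closed_form = psi((nu - onp.arange(m)) / 2.).sum() + m * onp.log(2) \
        - onp.linalg.slogdet(S)[1]
    samples = wishart.rvs(df=nu, scale=onp.linalg.inv(S), size=n_samples, random_state=rng)
    monte_carlo = onp.mean([onp.linalg.slogdet(lam)[1] for lam in samples])
    assert onp.isclose(closed_form, monte_carlo, atol=0.05)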
def grep_gradient(alpha, m, x, K, alphaz):
    gradient = np.zeros((alpha.shape[0], 2))

    lmbda = npr.gamma(alpha, 1.)
    lmbda[lmbda < 1e-5] = 1e-5

    Tinv_val = fun_Tinv(lmbda, alpha)
    h_val = fun_H(Tinv_val, alpha)
    u_val = fun_U(Tinv_val, alpha)

    zw = m * lmbda / alpha
    zw[zw < 1e-5] = 1e-5

    logp_der = grad_logp(zw, K, x, alphaz)
    logp_val = logp(zw, K, x, alphaz)
    logq_der = grad_logQ_Z(zw, alpha)

    gradient[:, 0] = logp_der * (h_val - lmbda / alpha) * m / alpha
    gradient[:, 1] = logp_der * lmbda / alpha
    gradient[:, 0] += logp_val * (
        np.log(lmbda) + (alpha / lmbda - 1.) * h_val - sp.digamma(alpha)
        + sp.polygamma(2, alpha) / 2. / sp.polygamma(1, alpha))
    gradient += grad_entropy(alpha, m)
    return gradient
def update_pij(paramslin, taus, z, gamma, alpha, pij, kzzinv):
    nx = len(taus) + 1
    params = util.unlinearise_params(paramslin, verbose=0)
    kzzinv_m = kzzinv @ params.m
    expEmu = params.scale * np.exp(special.digamma(params.shape))
    for i in range(nx - 1):
        tau = taus[i]
        Kxz = k(tau, z, gamma, alpha)
        mutilde = (Kxz @ kzzinv_m).flatten()
        sigmaa = kdiag(tau, gamma)
        kzzinv_kzx = kzzinv @ Kxz.T
        sigmab = np.sum(Kxz * kzzinv_kzx.T, axis=1)
        sigmac = np.sum((params.L.T @ kzzinv_kzx) ** 2, axis=0)
        sigmatilde = sigmaa - sigmab + sigmac
        eqn19a, eqn19b, eqn19c = expected_log_f2(mutilde, np.sqrt(sigmatilde))
        eqn19 = eqn19a + eqn19b + eqn19c
        expeqn19 = np.exp(eqn19)
        denom = expEmu + np.sum(expeqn19)
        pij[i + 1][0] = expEmu / denom
        pij[i + 1][1:tau.shape[1] + 1] = expeqn19 / denom
    return pij
def I(a, b, c, d):
    return -c * d / a - b * np.log(a) - special.gammaln(b) \
        + (b - 1) * (special.digamma(d) + np.log(c))
def E_ln_pi_k(k, alpha):
    # E[ln pi_k] under a Dirichlet(alpha) distribution.
    return digamma(alpha[k]) - digamma(np.sum(alpha))
from __future__ import absolute_import
import scipy.stats

import autograd.numpy as np
from autograd.scipy.special import digamma
from autograd.core import primitive

rvs = primitive(scipy.stats.dirichlet.rvs)
pdf = primitive(scipy.stats.dirichlet.pdf)
logpdf = primitive(scipy.stats.dirichlet.logpdf)

logpdf.defvjp(lambda g, ans, vs, gvs, x, alpha: g * (alpha - 1) / x, argnum=0)
logpdf.defvjp(lambda g, ans, vs, gvs, x, alpha:
              g * (digamma(np.sum(alpha)) - digamma(alpha) + np.log(x)), argnum=1)

# Same as log pdf, but multiplied by the pdf (ans).
pdf.defvjp(lambda g, ans, vs, gvs, x, alpha: g * ans * (alpha - 1) / x, argnum=0)
pdf.defvjp(lambda g, ans, vs, gvs, x, alpha:
           g * ans * (digamma(np.sum(alpha)) - digamma(alpha) + np.log(x)), argnum=1)
def ELBO_terms(param, prior, X, S, Ncon, G, M, K):
    eps = 1e-12
    # get sample size and feature size
    [N, D] = np.shape(X)

    # unpack the input parameter vector
    [tau_a1, tau_a2, tau_b1, tau_b2, phi, tau_v1, tau_v2, mu_w, sigma_w,
     mu_b, sigma_b] = unpackParam(param, N, D, G, M, K)

    # compute eta given mu_w and mu_b
    eta = np.zeros((0, K))
    for g in np.arange(G):
        t1 = np.exp(np.dot(X, mu_w[g]) + mu_b[g])
        t2 = np.transpose(np.tile(np.sum(t1, axis=1), (K, 1)))
        eta = np.vstack((eta, t1 / t2))
    eta = np.reshape(eta, (G, N, K))

    # compute the expectation terms to be used later
    E_log_Alpha = digamma(tau_a1) - digamma(tau_a1 + tau_a2)          # len(M)
    E_log_OneMinusAlpha = digamma(tau_a2) - digamma(tau_a1 + tau_a2)  # len(M)
    E_log_Beta = digamma(tau_b1) - digamma(tau_b1 + tau_b2)           # len(M)
    E_log_OneMinusBeta = digamma(tau_b2) - digamma(tau_b1 + tau_b2)   # len(M)
    E_log_Nu = digamma(tau_v1) - digamma(tau_v1 + tau_v2)             # len(G)
    E_log_OneMinusNu = digamma(tau_v2) - digamma(tau_v1 + tau_v2)     # len(G)
    E_C = phi                                                         # shape(M, G)
    E_W = mu_w                                                        # shape(G, D, K)
    E_WMinusMuSqd = sigma_w**2 + (mu_w - prior['mu_w'])**2            # shape(G, D, K)
    E_BMinusMuSqd = sigma_b**2 + (mu_b - prior['mu_b'])**2            # shape(G, K)
    E_ExpB = np.exp(mu_b + 0.5 * sigma_b**2)                          # shape(G, K)

    E_logP_Alpha = ((prior['tau_a1'] - 1) * E_log_Alpha
                    + (prior['tau_a2'] - 1) * E_log_OneMinusAlpha
                    - gammaln(prior['tau_a1'] + eps) - gammaln(prior['tau_a2'] + eps)
                    + gammaln(prior['tau_a1'] + prior['tau_a2'] + eps))
    E_logP_Beta = ((prior['tau_b1'] - 1) * E_log_Beta
                   + (prior['tau_b2'] - 1) * E_log_OneMinusBeta
                   - gammaln(prior['tau_b1'] + eps) - gammaln(prior['tau_b2'] + eps)
                   + gammaln(prior['tau_b1'] + prior['tau_b2'] + eps))
    E_logQ_Alpha = ((tau_a1 - 1) * E_log_Alpha + (tau_a2 - 1) * E_log_OneMinusAlpha
                    - gammaln(tau_a1 + eps) - gammaln(tau_a2 + eps)
                    + gammaln(tau_a1 + tau_a2 + eps))
    E_logQ_Beta = ((tau_b1 - 1) * E_log_Beta + (tau_b2 - 1) * E_log_OneMinusBeta
                   - gammaln(tau_b1 + eps) - gammaln(tau_b2 + eps)
                   + gammaln(tau_b1 + tau_b2 + eps))
    E_logQ_C = np.sum(phi * np.log(phi + eps), axis=1)

    eta_N_GK = np.reshape(np.transpose(eta, (1, 0, 2)), (N, G * K))

    # compute three terms and then add them up
    L_1, L_2, L_3 = [0., 0., 0.]

    # the first term and part of the second term
    for m in np.arange(M):
        idx_S = range(sum(Ncon[:m]), sum(Ncon[:m]) + Ncon[m])
        tp_con = S[idx_S, 3]
        phi_rep = np.reshape(np.transpose(np.tile(phi[m], (K, 1))), G * K)
        E_A = np.dot(eta_N_GK, np.transpose(eta_N_GK * phi_rep))
        E_A_use = E_A[S[idx_S, 1], S[idx_S, 2]]
        tp_Asum = np.sum(E_A_use)
        tp_AdotS = np.sum(E_A_use * tp_con)
        L_1 = (L_1 + Ncon[m] * E_log_Beta[m]
               + np.sum(tp_con) * (E_log_OneMinusBeta[m] - E_log_Beta[m])
               + tp_AdotS * (E_log_Alpha[m] + E_log_Beta[m]
                             - E_log_OneMinusAlpha[m] - E_log_OneMinusBeta[m])
               + tp_Asum * (E_log_OneMinusAlpha[m] - E_log_Beta[m]))
        # stick-breaking weight: sum E[log(1 - nu_l)] over l < g
        fg = lambda g: phi[m, g] * np.sum(E_log_OneMinusNu[0:g])
        # list comprehension rather than map() so the sum also works under Python 3
        L_2 = (L_2 + E_logP_Alpha[m] + E_logP_Beta[m]
               + np.dot(phi[m], E_log_Nu) + np.sum([fg(g) for g in np.arange(G)]))

    # the second term
    for g in np.arange(G):
        tp_Nug = (prior['gamma'] - 1) * E_log_OneMinusNu[g] + np.log(prior['gamma'] + eps)
        t1 = np.dot(X, mu_w[g])
        t2 = 0.5 * np.dot(X**2, sigma_w[g]**2)
        t3 = np.sum(eta[g], axis=1)
        t_mat_i = logsumexp(np.add(mu_b[g] + 0.5 * sigma_b[g]**2, t1 + t2), axis=1)
        tp_Zg = np.sum(eta[g] * np.add(t1, mu_b[g])) - np.dot(t3, t_mat_i)
        t5 = (-np.log(np.sqrt(2 * np.pi) * prior['sigma_w'])
              - 0.5 / (prior['sigma_w']**2) * (sigma_w[g]**2 + (mu_w[g] - prior['mu_w'])**2))
        tp_Wg = np.sum(t5)
        t6 = (-np.log(np.sqrt(2 * np.pi) * prior['sigma_b'] + eps)
              - 0.5 / (prior['sigma_b']**2) * (sigma_b[g]**2 + (mu_b[g] - prior['mu_b'])**2))
        tp_bg = np.sum(t6)
        L_2 = L_2 + tp_Nug + tp_Zg + tp_Wg + tp_bg

    # the third term
    L_3 = np.sum(E_logQ_Alpha + E_logQ_Beta + E_logQ_C)
    for g in np.arange(G):
        tp_Nug3 = ((tau_v1[g] - 1) * E_log_Nu[g] + (tau_v2[g] - 1) * E_log_OneMinusNu[g]
                   - np.log(gamma(tau_v1[g]) + eps) - np.log(gamma(tau_v2[g]) + eps)
                   + np.log(gamma(tau_v1[g] + tau_v2[g]) + eps))
        tp_Zg3 = np.sum(eta[g] * np.log(eta[g] + eps))
        tp_Wg3 = np.sum(-np.log(np.sqrt(2 * np.pi) * sigma_w[g] + eps) - 0.5)
        tp_bg3 = np.sum(-np.log(np.sqrt(2 * np.pi) * sigma_b[g] + eps) - 0.5)
        L_3 = L_3 + tp_Nug3 + tp_Zg3 + tp_Wg3 + tp_bg3

    return (L_1, L_2, L_3)
def expectedstats(natparam):
    # Returns E_{q(v)}[\eta_z(V)] where \eta_z(V)_i = [ln(V_i), ln(1 - V_i)].
    # natparam has shape (T-1) x 2.
    alpha_beta = natparam + 1
    return digamma(alpha_beta) - digamma(np.sum(alpha_beta, axis=1, keepdims=True))
from __future__ import absolute_import
import scipy.stats

import autograd.numpy as np
from autograd.scipy.special import digamma
from autograd.core import primitive

rvs = primitive(scipy.stats.dirichlet.rvs)
pdf = primitive(scipy.stats.dirichlet.pdf)
logpdf = primitive(scipy.stats.dirichlet.logpdf)

logpdf.defgrad(lambda ans, x, alpha: lambda g: g * (alpha - 1) / x, argnum=0)
logpdf.defgrad(lambda ans, x, alpha:
               lambda g: g * (digamma(np.sum(alpha)) - digamma(alpha) + np.log(x)), argnum=1)

# Same as log pdf, but multiplied by the pdf (ans).
pdf.defgrad(lambda ans, x, alpha: lambda g: g * ans * (alpha - 1) / x, argnum=0)
pdf.defgrad(lambda ans, x, alpha:
            lambda g: g * ans * (digamma(np.sum(alpha)) - digamma(alpha) + np.log(x)), argnum=1)
def expectedstats(natparam):
    # E[log pi] under a Dirichlet with natural parameter alpha - 1.
    alpha = natparam + 1
    return digamma(alpha) - digamma(alpha.sum(-1, keepdims=True))
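# Hedged sanity sketch, not part of the original source: digamma(alpha) - digamma(sum(alpha))
# is E[log pi] for pi ~ Dirichlet(alpha), so a Monte Carlo average of log(pi) should match.
def _check_dirichlet_expectedstats(n_samples=200000, seed=0):
    import numpy as onp
    from scipy.special import digamma as psi
    rng = onp.random.RandomState(seed)
    alpha = onp.array([1.5, 2.0, 3.5])
    closed_form = psi(alpha) - psi(alpha.sum())
    monte_carlo = onp.log(rng.dirichlet(alpha, size=n_samples)).mean(axis=0)
    assert onp.allclose(closed_form, monte_carlo, atol=0.02)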
def E_ln_lam_k(k, nu, W):
    # E[ln |Lambda_k|] for Lambda_k ~ Wishart(nu_k, W_k):
    #   sum_{i=1}^{D} digamma((nu_k + 1 - i) / 2) + D ln 2 + ln |W_k|
    return np.sum(digamma((nu[k] + 1 - np.arange(1, D + 1)) / 2)) \
        + D * np.log(2) + np.log(det(W[k]))
def entropy(alpha, m):
    # Entropy of a Gamma distribution with shape alpha and mean m (scale m / alpha).
    return alpha + np.log(m) - np.log(alpha) + sp.gammaln(alpha) \
        + (1. - alpha) * sp.digamma(alpha)
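# Hedged sanity sketch, not part of the original source: with shape alpha and mean m the scale
# is m / alpha, so entropy(alpha, m) should match scipy.stats.gamma(alpha, scale=m/alpha).entropy().
def _check_gamma_entropy():
    from scipy.stats import gamma as gamma_dist
    for a, mean in [(0.7, 2.0), (3.0, 0.5), (10.0, 10.0)]:
        assert np.isclose(entropy(a, mean), gamma_dist(a, scale=mean / a).entropy())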
def grad_logQ_alpha(samp, alpha):
    return np.log(samp) - sp.digamma(alpha)
from __future__ import absolute_import
import autograd.scipy.stats.dirichlet as di
import autograd.numpy as np
from autograd.scipy.special import digamma

di.logpdf.defjvp(lambda g, ans, gvs, vs, x, alpha: np.inner(g, (alpha - 1) / x),
                 argnum=0)
di.logpdf.defjvp(lambda g, ans, gvs, vs, x, alpha:
                 np.inner(g, (digamma(np.sum(alpha)) - digamma(alpha) + np.log(x))),
                 argnum=1)
di.pdf.defjvp(lambda g, ans, gvs, vs, x, alpha: np.inner(g, ans * (alpha - 1) / x),
              argnum=0)
di.pdf.defjvp(lambda g, ans, gvs, vs, x, alpha:
              np.inner(g, ans * (digamma(np.sum(alpha)) - digamma(alpha) + np.log(x))),
              argnum=1)
def NegELBO(param, prior, X, S, Ncon, G, M, K):
    """
    Parameters
    ----------
    param: length (2M + 2M + MG + 2G + GNK + GDK + GDK + GK + GK)
        variational parameters, including:
        1) tau_a1: len(M), first parameter of q(alpha_m)
        2) tau_a2: len(M), second parameter of q(alpha_m)
        3) tau_b1: len(M), first parameter of q(beta_m)
        4) tau_b2: len(M), second parameter of q(beta_m)
        5) phi: shape(M, G), phi[m,:] is the parameter vector of q(c_m)
        6) tau_v1: len(G), first parameter of q(nu_g)
        7) tau_v2: len(G), second parameter of q(nu_g)
        8) mu_w: shape(G, D, K), mu_w[g,d,k] is the mean parameter of q(W^g_{dk})
        9) sigma_w: shape(G, D, K), sigma_w[g,d,k] is the std parameter of q(W^g_{dk})
        10) mu_b: shape(G, K), mu_b[g,k] is the mean parameter of q(b^g_k)
        11) sigma_b: shape(G, K), sigma_b[g,k] is the std parameter of q(b^g_k)
    prior: dictionary
        the naming of keys follows those in param, {'tau_a1': val1, ...}
    X: shape(N, D)
        each row represents a sample and each column represents a feature
    S: shape(n_con, 4)
        each row represents an observed constraint
        (expert_id, sample1_id, sample2_id, constraint_type), where
        1) expert_id: varies between [0, M-1]
        2) sample1_id: varies between [0, N-1]
        3) sample2_id: varies between [0, N-1]
        4) constraint_type: 1 means must-link and 0 means cannot-link
    Ncon: shape(M, 1)
        number of constraints provided by each expert
    G: int
        number of local consensuses in the posterior truncated Dirichlet Process
    M: int
        number of experts
    K: int
        maximal number of clusters among different solutions; due to the use of
        discriminative clustering, some local solutions might have empty clusters

    Returns
    -------
    the negative evidence lower bound, -(L_1 + L_2 - L_3)
    """
    eps = 1e-12
    # get sample size and feature size
    [N, D] = np.shape(X)

    # unpack the input parameter vector
    [tau_a1, tau_a2, tau_b1, tau_b2, phi, tau_v1, tau_v2, mu_w, sigma_w,
     mu_b, sigma_b] = unpackParam(param, N, D, G, M, K)

    # compute eta given mu_w and mu_b
    eta = np.zeros((0, K))
    for g in np.arange(G):
        t1 = np.exp(np.dot(X, mu_w[g]) + mu_b[g])
        t2 = np.transpose(np.tile(np.sum(t1, axis=1), (K, 1)))
        eta = np.vstack((eta, t1 / t2))
    eta = np.reshape(eta, (G, N, K))

    # compute the expectation terms to be used later
    E_log_Alpha = digamma(tau_a1) - digamma(tau_a1 + tau_a2)          # len(M)
    E_log_OneMinusAlpha = digamma(tau_a2) - digamma(tau_a1 + tau_a2)  # len(M)
    E_log_Beta = digamma(tau_b1) - digamma(tau_b1 + tau_b2)           # len(M)
    E_log_OneMinusBeta = digamma(tau_b2) - digamma(tau_b1 + tau_b2)   # len(M)
    E_log_Nu = digamma(tau_v1) - digamma(tau_v1 + tau_v2)             # len(G)
    E_log_OneMinusNu = digamma(tau_v2) - digamma(tau_v1 + tau_v2)     # len(G)
    E_C = phi                                                         # shape(M, G)
    E_W = mu_w                                                        # shape(G, D, K)
    E_WMinusMuSqd = sigma_w**2 + (mu_w - prior['mu_w'])**2            # shape(G, D, K)
    E_BMinusMuSqd = sigma_b**2 + (mu_b - prior['mu_b'])**2            # shape(G, K)
    E_ExpB = np.exp(mu_b + 0.5 * sigma_b**2)                          # shape(G, K)

    E_logP_Alpha = ((prior['tau_a1'] - 1) * E_log_Alpha
                    + (prior['tau_a2'] - 1) * E_log_OneMinusAlpha
                    - gammaln(prior['tau_a1'] + eps) - gammaln(prior['tau_a2'] + eps)
                    + gammaln(prior['tau_a1'] + prior['tau_a2'] + eps))
    E_logP_Beta = ((prior['tau_b1'] - 1) * E_log_Beta
                   + (prior['tau_b2'] - 1) * E_log_OneMinusBeta
                   - gammaln(prior['tau_b1'] + eps) - gammaln(prior['tau_b2'] + eps)
                   + gammaln(prior['tau_b1'] + prior['tau_b2'] + eps))
    E_logQ_Alpha = ((tau_a1 - 1) * E_log_Alpha + (tau_a2 - 1) * E_log_OneMinusAlpha
                    - gammaln(tau_a1 + eps) - gammaln(tau_a2 + eps)
                    + gammaln(tau_a1 + tau_a2 + eps))
    E_logQ_Beta = ((tau_b1 - 1) * E_log_Beta + (tau_b2 - 1) * E_log_OneMinusBeta
                   - gammaln(tau_b1 + eps) - gammaln(tau_b2 + eps)
                   + gammaln(tau_b1 + tau_b2 + eps))
    E_logQ_C = np.sum(phi * np.log(phi + eps), axis=1)

    eta_N_GK = np.reshape(np.transpose(eta, (1, 0, 2)), (N, G * K))

    # compute three terms and then add them up
    L_1, L_2, L_3 = [0., 0., 0.]

    # the first term and part of the second term
    for m in np.arange(M):
        idx_S = range(sum(Ncon[:m]), sum(Ncon[:m]) + Ncon[m])
        tp_con = S[idx_S, 3]
        phi_rep = np.reshape(np.transpose(np.tile(phi[m], (K, 1))), G * K)
        E_A = np.dot(eta_N_GK, np.transpose(eta_N_GK * phi_rep))
        E_A_use = E_A[S[idx_S, 1], S[idx_S, 2]]
        tp_Asum = np.sum(E_A_use)
        tp_AdotS = np.sum(E_A_use * tp_con)
        L_1 = (L_1 + Ncon[m] * E_log_Beta[m]
               + np.sum(tp_con) * (E_log_OneMinusBeta[m] - E_log_Beta[m])
               + tp_AdotS * (E_log_Alpha[m] + E_log_Beta[m]
                             - E_log_OneMinusAlpha[m] - E_log_OneMinusBeta[m])
               + tp_Asum * (E_log_OneMinusAlpha[m] - E_log_Beta[m]))
        # stick-breaking weight: sum E[log(1 - nu_l)] over l < g
        fg = lambda g: phi[m, g] * np.sum(E_log_OneMinusNu[0:g])
        # list comprehension rather than map() so the sum also works under Python 3
        L_2 = (L_2 + E_logP_Alpha[m] + E_logP_Beta[m]
               + np.dot(phi[m], E_log_Nu) + np.sum([fg(g) for g in np.arange(G)]))

    # the second term
    for g in np.arange(G):
        tp_Nug = (prior['gamma'] - 1) * E_log_OneMinusNu[g] + np.log(prior['gamma'] + eps)
        t1 = np.dot(X, mu_w[g])
        t2 = 0.5 * np.dot(X**2, sigma_w[g]**2)
        t3 = np.sum(eta[g], axis=1)
        t_mat_i = logsumexp(np.add(mu_b[g] + 0.5 * sigma_b[g]**2, t1 + t2), axis=1)
        tp_Zg = np.sum(eta[g] * np.add(t1, mu_b[g])) - np.dot(t3, t_mat_i)
        t5 = (-np.log(np.sqrt(2 * np.pi) * prior['sigma_w'])
              - 0.5 / (prior['sigma_w']**2) * (sigma_w[g]**2 + (mu_w[g] - prior['mu_w'])**2))
        tp_Wg = np.sum(t5)
        t6 = (-np.log(np.sqrt(2 * np.pi) * prior['sigma_b'] + eps)
              - 0.5 / (prior['sigma_b']**2) * (sigma_b[g]**2 + (mu_b[g] - prior['mu_b'])**2))
        tp_bg = np.sum(t6)
        L_2 = L_2 + tp_Nug + tp_Zg + tp_Wg + tp_bg

    # the third term
    L_3 = np.sum(E_logQ_Alpha + E_logQ_Beta + E_logQ_C)
    for g in np.arange(G):
        tp_Nug3 = ((tau_v1[g] - 1) * E_log_Nu[g] + (tau_v2[g] - 1) * E_log_OneMinusNu[g]
                   - np.log(gamma(tau_v1[g]) + eps) - np.log(gamma(tau_v2[g]) + eps)
                   + np.log(gamma(tau_v1[g] + tau_v2[g]) + eps))
        tp_Zg3 = np.sum(eta[g] * np.log(eta[g] + eps))
        tp_Wg3 = np.sum(-np.log(np.sqrt(2 * np.pi) * sigma_w[g] + eps) - 0.5)
        tp_bg3 = np.sum(-np.log(np.sqrt(2 * np.pi) * sigma_b[g] + eps) - 0.5)
        L_3 = L_3 + tp_Nug3 + tp_Zg3 + tp_Wg3 + tp_bg3

    # Note the third term should have a minus sign before it
    ELBO = L_1 + L_2 - L_3
    # ELBO = L_1 + L_2
    return -ELBO
from __future__ import absolute_import
import scipy.stats

import autograd.numpy as np
from autograd.scipy.special import digamma
from autograd.extend import primitive, defvjp

rvs = primitive(scipy.stats.dirichlet.rvs)
pdf = primitive(scipy.stats.dirichlet.pdf)
logpdf = primitive(scipy.stats.dirichlet.logpdf)

defvjp(logpdf,
       lambda ans, x, alpha: lambda g: g * (alpha - 1) / x,
       lambda ans, x, alpha: lambda g: g * (digamma(np.sum(alpha)) - digamma(alpha) + np.log(x)))

# Same as log pdf, but multiplied by the pdf (ans).
defvjp(pdf,
       lambda ans, x, alpha: lambda g: g * ans * (alpha - 1) / x,
       lambda ans, x, alpha: lambda g: g * ans * (digamma(np.sum(alpha)) - digamma(alpha) + np.log(x)))
def fun_Tinv(z, alpha):
    return (np.log(z) - sp.digamma(alpha)) / np.sqrt(sp.polygamma(1, alpha))
def ln_lam_tilde_k(k, nu, W, D):
    # E[ln |Lambda_k|] for Lambda_k ~ Wishart(nu_k, W_k):
    #   sum_{i=1}^{D} digamma((nu_k + 1 - i) / 2) + D ln 2 + ln |W_k|
    return anp.sum(digamma((nu[k] + 1 - anp.arange(1, D + 1)) / 2)) \
        + D * anp.log(2) + anp.log(anp.linalg.det(W[k]))
from __future__ import absolute_import
import autograd.numpy as np
import autograd.scipy.special as sp

### Gamma functions ###
sp.polygamma.defjvp(lambda g, ans, gvs, vs, n, x: g * sp.polygamma(n + 1, x), argnum=1)
sp.psi.defjvp(lambda g, ans, gvs, vs, x: g * sp.polygamma(1, x))
sp.digamma.defjvp(lambda g, ans, gvs, vs, x: g * sp.polygamma(1, x))
sp.gamma.defjvp(lambda g, ans, gvs, vs, x: g * ans * sp.psi(x))
sp.gammaln.defjvp(lambda g, ans, gvs, vs, x: g * sp.psi(x))
sp.rgamma.defjvp(lambda g, ans, gvs, vs, x: g * sp.psi(x) / -sp.gamma(x))
sp.multigammaln.defjvp(lambda g, ans, gvs, vs, a, d:
                       g * np.sum(sp.digamma(np.expand_dims(a, -1) - np.arange(d) / 2.), -1))

### Bessel functions ###
sp.j0.defjvp(lambda g, ans, gvs, vs, x: -g * sp.j1(x))
sp.y0.defjvp(lambda g, ans, gvs, vs, x: -g * sp.y1(x))
sp.j1.defjvp(lambda g, ans, gvs, vs, x: g * (sp.j0(x) - sp.jn(2, x)) / 2.0)
sp.y1.defjvp(lambda g, ans, gvs, vs, x: g * (sp.y0(x) - sp.yn(2, x)) / 2.0)
sp.jn.defjvp(lambda g, ans, gvs, vs, n, x: g * (sp.jn(n - 1, x) - sp.jn(n + 1, x)) / 2.0,
             argnum=1)
sp.yn.defjvp(lambda g, ans, gvs, vs, n, x: g * (sp.yn(n - 1, x) - sp.yn(n + 1, x)) / 2.0,
             argnum=1)

### Error Function ###
sp.erf.defjvp(lambda g, ans, gvs, vs, x: 2. * g * sp.inv_root_pi * np.exp(-x**2))
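# Hedged sanity sketch, not part of the original source: the multigammaln rule above uses
# d/da log Gamma_d(a) = sum_{i=0}^{d-1} digamma(a - i/2); a central finite-difference check.
def _check_multigammaln_grad(a=4.2, d=3, h=1e-5):
    import numpy as onp
    from scipy.special import multigammaln, digamma as psi
    analytic = onp.sum(psi(a - onp.arange(d) / 2.))
    numeric = (multigammaln(a + h, d) - multigammaln(a - h, d)) / (2 * h)
    assert onp.isclose(analytic, numeric)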
def fun_T(eps, alpha):
    return np.exp(eps * np.sqrt(sp.polygamma(1, alpha)) + sp.digamma(alpha))
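# Hedged usage sketch, not part of the original source: fun_T and fun_Tinv are a
# standardization transform and its inverse for a Gamma variable, so they should round-trip,
# fun_Tinv(fun_T(eps, alpha), alpha) == eps (assuming np / sp above are numpy and
# scipy.special, and fun_Tinv from earlier in this collection is in scope).
def _check_T_roundtrip():
    for eps_val, alpha_val in [(-1.3, 0.5), (0.0, 2.0), (2.1, 7.5)]:
        assert np.isclose(fun_Tinv(fun_T(eps_val, alpha_val), alpha_val), eps_val)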
from __future__ import absolute_import
import scipy.stats

import autograd.numpy as np
from autograd.scipy.special import digamma
from autograd.extend import primitive, defvjp

rvs = primitive(scipy.stats.dirichlet.rvs)
pdf = primitive(scipy.stats.dirichlet.pdf)
logpdf = primitive(scipy.stats.dirichlet.logpdf)

defvjp(logpdf,
       lambda ans, x, alpha: lambda g: g * (alpha - 1) / x,
       lambda ans, x, alpha: lambda g: g * (digamma(np.sum(alpha)) - digamma(alpha) + np.log(x)))

# Same as log pdf, but multiplied by the pdf (ans).
defvjp(pdf,
       lambda ans, x, alpha: lambda g: g * ans * (alpha - 1) / x,
       lambda ans, x, alpha: lambda g: g * ans * (digamma(np.sum(alpha)) - digamma(alpha) + np.log(x)))