import numpy as np
from scipy.special import logsumexp

# `linalg` and `misc` below refer to the package's own utility modules
# (Cholesky helpers and the Gaussian log-pdf helper), assumed to be
# imported at module level.


def gaussian_mixture_logpdf(x, w, mu, Sigma):
    # Shape(x)      = (N, D)
    # Shape(w)      = (K,)
    # Shape(mu)     = (K, D)
    # Shape(Sigma)  = (K, D, D)
    # Shape(result) = (N,)

    # Dimensionality
    D = np.shape(x)[-1]

    # Cholesky decomposition of the covariance matrices
    U = linalg.chol(Sigma)

    # Reshape x:
    # Shape(x)      = (N, 1, D)
    x = np.expand_dims(x, axis=-2)

    # (x-mu) and (x-mu)'*inv(Sigma)*(x-mu):
    # Shape(v)      = (N, K, D)
    # Shape(z)      = (N, K)
    v = x - mu
    z = np.einsum('...i,...i', v, linalg.chol_solve(U, v))

    # Log-determinant of Sigma:
    # Shape(ldet)   = (K,)
    ldet = linalg.chol_logdet(U)

    # Compute log pdf for each cluster:
    # Shape(lpdf)   = (N, K)
    lpdf = misc.gaussian_logpdf(z, 0, 0, ldet, D)

    # Combine the clusters with the mixture weights using log-sum-exp,
    # giving the documented result shape (N,):
    # log p(x) = log sum_k w_k N(x | mu_k, Sigma_k)
    return logsumexp(lpdf + np.log(w), axis=-1)
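# A minimal usage sketch (not part of the original module; the helper
# name and the random data are made up for illustration): evaluate a
# two-component mixture in 2-D at a batch of N points and check the
# documented result shape.
def _demo_gaussian_mixture_logpdf():
    N, K, D = 5, 2, 2
    rng = np.random.RandomState(42)
    x = rng.randn(N, D)                     # Evaluation points
    w = np.array([0.3, 0.7])                # Mixture weights (sum to 1)
    mu = np.zeros((K, D))                   # Component means
    Sigma = np.tile(np.eye(D), (K, 1, 1))   # Component covariances
    lp = gaussian_mixture_logpdf(x, w, mu, Sigma)
    assert np.shape(lp) == (N,)
    return lp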
def lower_bound_contribution(self, gradient=False):
    # Get moment functions from the parents
    m = self.parents[0].message_to_child(gradient=gradient)
    k = self.parents[1].message_to_child(gradient=gradient)
    if self.parents[2]:
        k_sparse = self.parents[2].message_to_child(gradient=gradient)
    else:
        k_sparse = None
    if self.parents[3]:
        pseudoinputs = self.parents[3].message_to_child(gradient=gradient)
    else:
        pseudoinputs = None

    # Compute the parameters (covariance matrices etc.) using the
    # parents' moment functions
    DKs_xx = []
    DKd_xx = []
    DKd_xp = []
    DKd_pp = []
    Dxp = []
    Dmu = []
    if gradient:
        # FIXME: The covariance of mu is ignored for now
        ((mu, _), Dmu) = m(self.x, gradient=True)
        if pseudoinputs:
            ((Ks_xx,), DKs_xx) = k_sparse(self.x, self.x, gradient=True)
            ((xp,), Dxp) = pseudoinputs
            ((Kd_pp,), DKd_pp) = k(xp, xp, gradient=True)
            ((Kd_xp,), DKd_xp) = k(self.x, xp, gradient=True)
        else:
            ((K_xx,), DKd_xx) = k(self.x, self.x, gradient=True)
            if k_sparse:
                ((Ks_xx,), DKs_xx) = k_sparse(self.x, self.x, gradient=True)
                try:
                    K_xx += Ks_xx
                except Exception:
                    # In-place addition may fail, e.g. for sparse matrices
                    K_xx = K_xx + Ks_xx
    else:
        # FIXME: The covariance of mu is ignored for now
        (mu, _) = m(self.x)
        if pseudoinputs:
            (Ks_xx,) = k_sparse(self.x, self.x)
            (xp,) = pseudoinputs
            (Kd_pp,) = k(xp, xp)
            (Kd_xp,) = k(self.x, xp)
        else:
            (K_xx,) = k(self.x, self.x)
            if k_sparse:
                (Ks_xx,) = k_sparse(self.x, self.x)
                try:
                    K_xx += Ks_xx
                except Exception:
                    # In-place addition may fail, e.g. for sparse matrices
                    K_xx = K_xx + Ks_xx

    mu = mu[0]

    # Log pdf
    if self.observed:

        ## Log pdf for a directly observed GP

        f0 = self.f - mu

        if pseudoinputs:

            ## Pseudo-input approximation

            # Decompose the full-rank sparse/noise covariance matrix
            try:
                Us_xx = utils.cholesky(Ks_xx)
            except linalg.LinAlgError:
                print('Noise/sparse covariance not positive definite')
                return -np.inf

            # Use the Woodbury-Sherman-Morrison formula with the
            # following notation:
            #
            # y2 = f0' * inv(Kd_xp*inv(Kd_pp)*Kd_xp' + Ks_xx) * f0
            #
            # z = Ks_xx \ f0
            # Lambda = Kd_pp + Kd_xp'*inv(Ks_xx)*Kd_xp
            # nu = inv(Lambda) * (Kd_xp' * (Ks_xx \ f0))
            # rho = Kd_xp * inv(Lambda) * (Kd_xp' * (Ks_xx \ f0))
            #
            # y2 = f0' * z - z' * rho
            z = Us_xx.solve(f0)
            Lambda = Kd_pp + np.dot(Kd_xp.T, Us_xx.solve(Kd_xp))
            try:
                U_Lambda = utils.cholesky(Lambda)
            except linalg.LinAlgError:
                print('Lambda not positive definite')
                return -np.inf
            nu = U_Lambda.solve(np.dot(Kd_xp.T, z))
            rho = np.dot(Kd_xp, nu)
            y2 = np.dot(f0, z) - np.dot(z, rho)

            # Use the matrix determinant lemma:
            #
            # det(Kd_xp*inv(Kd_pp)*Kd_xp' + Ks_xx)
            #   = det(Kd_pp + Kd_xp'*inv(Ks_xx)*Kd_xp)
            #     * det(inv(Kd_pp)) * det(Ks_xx)
            #   = det(Lambda) * det(Ks_xx) / det(Kd_pp)
            try:
                Ud_pp = utils.cholesky(Kd_pp)
            except linalg.LinAlgError:
                print('Covariance of pseudo inputs not positive definite')
                return -np.inf
            logdet = (U_Lambda.logdet()
                      + Us_xx.logdet()
                      - Ud_pp.logdet())

            # Compute the log pdf
            L = utils.gaussian_logpdf(y2, 0, 0, logdet, np.size(self.f))

            # TODO: Add the variational cost of the pseudo-input
            # approximation

            # Compute gradients
            for (dmu, func) in Dmu:
                # TODO: Derivative w.r.t. the mean vector not implemented
                d = np.nan
                # Send the derivative message
                func(d)

            for (dKs_xx, func) in DKs_xx:
                # TODO: Derivative w.r.t. the sparse/noise covariance
                # matrix not implemented
                d = np.nan
                # Send the derivative message
                func(d)

            for (dKd_xp, func) in DKd_xp:
                # TODO: Derivative w.r.t. the cross-covariance matrix
                # not implemented
                d = np.nan
                # Send the derivative message
                func(d)

            V = Ud_pp.solve(Kd_xp.T)
            Z = Us_xx.solve(V.T)
            for (dKd_pp, func) in DKd_pp:
                # Compute the derivative w.r.t. the pseudo-input
                # covariance matrix
                d = (0.5 * np.trace(Ud_pp.solve(dKd_pp))
                     - 0.5 * np.trace(U_Lambda.solve(dKd_pp))
                     + np.dot(nu, np.dot(dKd_pp, nu))
                     + np.trace(np.dot(dKd_pp, np.dot(V, Z))))
                # Send the derivative message
                func(d)

            for (dxp, func) in Dxp:
                # TODO: Derivative w.r.t. the pseudo inputs not implemented
                d = np.nan
                # Send the derivative message
                func(d)

        else:

            ## Full exact solution (no pseudo-input approximation)

            try:
                U = utils.cholesky(K_xx)
            except linalg.LinAlgError:
                print('Covariance not positive definite, returning -inf')
                return -np.inf

            z = U.solve(f0)
            L = utils.gaussian_logpdf(np.dot(f0, z),
                                      0,
                                      0,
                                      U.logdet(),
                                      np.size(self.f))

            for (dmu, func) in Dmu:
                # Derivative w.r.t. the mean vector
                d = -np.sum(z)
                # Send the derivative message
                func(d)

            for (dK, func) in DKd_xx:
                # Compute the derivative w.r.t. the covariance matrix
                #
                # TODO: trace+chol_solve should be handled better for
                # sparse matrices. Use sparse-inverse!
                d = 0.5 * (dK.dot(z).dot(z)
                           - U.trace_solve_gradient(dK))
                # Send the derivative message
                func(d)

            for (dK, func) in DKs_xx:
                # Compute the derivative w.r.t. the sparse/noise
                # covariance matrix
                d = 0.5 * (dK.dot(z).dot(z)
                           - U.trace_solve_gradient(dK))
                # Send the derivative message
                func(d)

    else:

        ## Log pdf for a latent GP
        raise NotImplementedError('Log pdf for latent GPs is not '
                                  'implemented yet')

    return L
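# The pseudo-input branch above relies on the Woodbury identity for the
# quadratic form y2 and on the matrix determinant lemma for the
# log-determinant. The following self-contained NumPy sketch (added for
# illustration, with randomly generated SPD matrices; the helper name is
# hypothetical) verifies both identities numerically against dense solves.
def _check_pseudo_input_identities():
    rng = np.random.RandomState(0)
    n, p = 6, 3
    A = rng.randn(n, n)
    Ks_xx = np.dot(A, A.T) + n * np.eye(n)   # SPD noise/sparse covariance
    B = rng.randn(p, p)
    Kd_pp = np.dot(B, B.T) + p * np.eye(p)   # SPD pseudo-input covariance
    Kd_xp = rng.randn(n, p)
    f0 = rng.randn(n)

    # Full covariance: K = Kd_xp*inv(Kd_pp)*Kd_xp' + Ks_xx
    K = np.dot(Kd_xp, np.linalg.solve(Kd_pp, Kd_xp.T)) + Ks_xx
    Lambda = Kd_pp + np.dot(Kd_xp.T, np.linalg.solve(Ks_xx, Kd_xp))

    # Woodbury: f0'*inv(K)*f0 = f0'*z - z'*rho
    z = np.linalg.solve(Ks_xx, f0)
    nu = np.linalg.solve(Lambda, np.dot(Kd_xp.T, z))
    rho = np.dot(Kd_xp, nu)
    assert np.allclose(np.dot(f0, np.linalg.solve(K, f0)),
                       np.dot(f0, z) - np.dot(z, rho))

    # Matrix determinant lemma:
    # logdet(K) = logdet(Lambda) + logdet(Ks_xx) - logdet(Kd_pp)
    logdet_lhs = np.linalg.slogdet(K)[1]
    logdet_rhs = (np.linalg.slogdet(Lambda)[1]
                  + np.linalg.slogdet(Ks_xx)[1]
                  - np.linalg.slogdet(Kd_pp)[1])
    assert np.allclose(logdet_lhs, logdet_rhs)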