def parameters_changed(self):
    N, D = self.Y.shape
    Kss = self.kern.K(self.X)
    Ksu = self.kern.K(self.X, self.Z)
    wv = self.posterior.woodbury_vector
    wi = self.posterior.woodbury_inv

    a = self.Y - Ksu.dot(wv)
    C = Kss + np.eye(N)*self.likelihood.variance - Ksu.dot(wi).dot(Ksu.T)
    Lc = jitchol(C)
    LcInva = dtrtrs(Lc, a)[0]
    LcInv = dtrtri(Lc)
    CInva = dtrtrs(Lc, LcInva, trans=1)[0]

    self._log_marginal_likelihood = -N*D/2.*np.log(2*np.pi) - D*np.log(np.diag(Lc)).sum() \
        - np.square(LcInva).sum()/2.

    dKsu = CInva.dot(wv.T)
    dKss = tdot(CInva)/2. - D*tdot(LcInv.T)/2.
    dKsu += -2. * dKss.dot(Ksu).dot(wi)

    X_grad = self.kern.gradients_X(dKss, self.X)
    X_grad += self.kern.gradients_X(dKsu, self.X, self.Z)
    self.X.gradient = X_grad

    if self.uncertain_input:
        # Update Log-likelihood
        KL_div = self.variational_prior.KL_divergence(self.X)
        # update for the KL divergence
        self.variational_prior.update_gradients_KL(self.X)
        self._log_marginal_likelihood += -KL_div
def test_checkFullRank(self):
    from GPy.util.debug import checkFullRank
    from GPy.util.linalg import tdot
    array = np.random.normal(0, 1, 100).reshape(25, 4)
    self.assertFalse(checkFullRank(tdot(array), name='test'))
    array = np.random.normal(0, 1, (25, 25))
    self.assertTrue(checkFullRank(tdot(array)))
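# A minimal sketch (not part of the test suite above) of why the two assertions hold:
# tdot(A) computes A.dot(A.T), so a 25x4 input yields a 25x25 Gram matrix of rank at
# most 4, while a 25x25 Gaussian input is almost surely full rank. Variable names here
# are illustrative assumptions only.
import numpy as np

tall_thin = np.random.normal(0, 1, (25, 4))
gram = tall_thin.dot(tall_thin.T)                    # what tdot(tall_thin) computes
print(np.linalg.matrix_rank(gram))                   # expected: 4 (rank deficient)

square = np.random.normal(0, 1, (25, 25))
print(np.linalg.matrix_rank(square.dot(square.T)))   # expected: 25 (full rank)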
def inference(self, kern, X, likelihood, Y, mean_function=None, Y_metadata=None,
              K=None, variance=None, Z_tilde=None, A=None):
    """
    Returns a Posterior class containing essential quantities of the posterior.
    The comments below correspond to Alg 2.1 in the GPML textbook.
    """
    # print('ExactGaussianInferenceGroup inference:')
    if mean_function is None:
        m = 0
    else:
        m = mean_function.f(X)

    if variance is None:
        variance = likelihood.gaussian_variance(Y_metadata)

    YYT_factor = Y - m

    # NOTE: change K to AKA^T
    if K is None:
        if A is None:
            A = np.identity(X.shape[0])
        K = A.dot(kern.K(X)).dot(A.T)  # A_t k(X_t, X_t) A_t^T
    else:
        raise NotImplementedError('Need to be extended to group case!')

    Ky = K.copy()
    diag.add(Ky, variance + 1e-8)  # A_t k(X_t, X_t) A_t^T + sigma^2 I

    # pdinv:
    #   Wi: inverse of Ky
    #   LW: the Cholesky decomposition of Ky -> L
    #   LWi: the Cholesky decomposition of Kyi (not used)
    #   W_logdet: the log of the determinant of Ky
    Wi, LW, LWi, W_logdet = pdinv(Ky)

    # LAPACK: DPOTRS solves a system of linear equations A*X = B with a symmetric
    # positive definite matrix A using the Cholesky factorization
    # A = U**T*U or A = L*L**T computed by DPOTRF.
    alpha, _ = dpotrs(LW, YYT_factor, lower=1)
    # so this gives (A_t k(X_t, X_t) A_t^T + sigma^2 I)^{-1} (Y_t - m)

    # Note: 20210827 confirm the log marginal likelihood
    log_marginal = 0.5*(-Y.size * log_2_pi - Y.shape[1] * W_logdet - np.sum(alpha * YYT_factor))

    if Z_tilde is not None:
        # This is a correction term for the log marginal likelihood
        # In EP this is log Z_tilde, which is the difference between the
        # Gaussian marginal and Z_EP
        log_marginal += Z_tilde

    # REVIEW: since log_marginal does not change, the gradient does not need to change as well.
    # FIXME: confirm the gradient update is correct
    # dL_dK = 0.5 * (tdot(alpha) - Y.shape[1] * Wi)
    dL_dK = 0.5 * A.T.dot((tdot(alpha) - Y.shape[1] * Wi)).dot(A)
    # print('dL_dK shape', dL_dK.shape)
    dL_dthetaL = likelihood.exact_inference_gradients(np.diag(dL_dK), Y_metadata)

    return PosteriorExactGroup(woodbury_chol=LW, woodbury_vector=alpha, K=K, A=A), \
        log_marginal, {'dL_dK': dL_dK, 'dL_dthetaL': dL_dthetaL, 'dL_dm': alpha}
def _create_kernel(self, V):
    self._kerns = [RBF(1, ARD=True, active_dims=[i]) for i in range(self.n_dims)]
    self._kernf = Fixed(self.n_dims, tdot(V))
    self._kernb = Bias(self.n_dims)
    self.kernel = np.sum(self._kerns) + self._kernf + self._kernb
def vb_grad_natgrad(self):
    """
    Natural Gradients of the bound with respect to phi, the variational
    parameters controlling assignment of the data to GPs
    """
    grad_Lm = np.zeros_like(self.phi)
    for i, kern in enumerate(self.kern):
        K = kern.K(self.X)
        I = np.eye(self.N)

        B_inv = np.diag(1. / ((self.phi[:, i] + 1e-6) / self.variance))
        K_B_inv = pdinv(K + B_inv)[0]
        alpha = np.dot(K_B_inv, self.Y)
        dL_dB = tdot(alpha) - K_B_inv

        for n in range(self.phi.shape[0]):
            grad_B_inv_nonzero = -self.variance / (self.phi[n, i] ** 2 + 1e-6)
            grad_Lm[n, i] = 0.5 * dL_dB[n, n] * grad_B_inv_nonzero

    grad_phi = grad_Lm + self.mixing_prop_bound_grad() + self.Hgrad
    natgrad = grad_phi - np.sum(self.phi * grad_phi, 1)[:, None]
    grad = natgrad * self.phi

    return grad.flatten(), natgrad.flatten()
def gatherPsiStat(self, kern, X, Z, Y, beta, uncertain_inputs, D, missing_data):
    assert beta.size == 1

    if uncertain_inputs:
        psi0 = kern.psi0(Z, X)
        psi1 = kern.psi1(Z, X)*beta
        psi2 = kern.psi2(Z, X)*beta if not missing_data else kern.psi2n(Z, X)*beta
    else:
        psi0 = kern.Kdiag(X)
        psi1 = kern.K(X, Z)
        if missing_data:
            psi2 = psi1[:, None, :]*psi1[:, :, None]*beta
        else:
            psi2 = tdot(psi1.T)*beta
        psi1 = psi1*beta

    if isinstance(Y, VariationalPosterior):
        m, s = Y.mean, Y.variance
        psi1Y = np.dot(m.T, psi1)  # DxM
        YRY = (np.square(m).sum()+s.sum())*beta
        psi0 = (D*psi0).sum()*beta
    elif missing_data:
        psi1Y = np.dot((Y).T, psi1)  # DxM
        trYYT = self.get_trYYT(Y)
        YRY = trYYT*beta
        psi0 = (psi0*D).sum()*beta
    else:
        psi1Y = np.dot(Y.T, psi1)  # DxM
        trYYT = self.get_trYYT(Y)
        YRY = trYYT*beta
        psi0 = (psi0*D).sum()*beta

    return psi0, psi2, YRY, psi1, psi1Y
def _inference(K: np.ndarray, ga_approx: GaussianApproximation, cav_params: CavityParams,
               Z_tilde: float, y: List[Tuple[int, float]],
               yc: List[List[Tuple[int, int]]]) -> Tuple[Posterior, int, Dict]:
    """
    Compute the posterior approximation

    :param K: prior covariance matrix
    :param ga_approx: Gaussian approximation of the batches
    :param cav_params: Cavity parameters of the posterior
    :param Z_tilde: Log marginal likelihood
    :param y: Direct observations as a list of tuples telling location index (row in X) and observation value.
    :param yc: Batch comparisons in a list of lists of tuples. Each batch is a list and tuples tell the comparisons (winner index, loser index)
    :return: A tuple consisting of the posterior approximation, log marginal likelihood and gradient dictionary
    """
    log_marginal, post_params = _ep_marginal(K, ga_approx, Z_tilde, y, yc)

    tau_tilde_root = sqrtm_block(ga_approx.tau, y, yc)
    Sroot_tilde_K = np.dot(tau_tilde_root, K)
    aux_alpha, _ = dpotrs(post_params.L, np.dot(Sroot_tilde_K, ga_approx.v), lower=1)
    alpha = (ga_approx.v - np.dot(tau_tilde_root, aux_alpha))[:, None]  # (K + Sigma^tilde)^(-1) mu^tilde
    LWi, _ = dtrtrs(post_params.L, tau_tilde_root, lower=1)

    Wi = np.dot(LWi.T, LWi)
    symmetrify(Wi)  # (K + Sigma^tilde)^(-1)

    dL_dK = 0.5 * (tdot(alpha) - Wi)
    dL_dthetaL = 0

    return Posterior(woodbury_inv=np.asfortranarray(Wi), woodbury_vector=alpha, K=K), \
        log_marginal, {'dL_dK': dL_dK, 'dL_dthetaL': dL_dthetaL, 'dL_dm': alpha}
def __init__(self, X, Y, K=2, kernels=None, variance=1., alpha=1., prior_Z='symmetric', name='OMGP'):
    N, self.D = Y.shape
    self.Y = Y
    self.YYT = tdot(self.Y)
    self.X = X

    if kernels is None:
        self.kern = []
        for i in range(K):
            self.kern.append(GPy.kern.RBF(input_dim=1))
    else:
        self.kern = kernels

    CollapsedMixture.__init__(self, N, K, prior_Z, alpha, name)

    self.link_parameter(
        GPy.core.parameterization.param.Param(
            'variance', variance,
            GPy.core.parameterization.transformations.Logexp()))
    self.link_parameters(*self.kern)
def vb_grad_natgrad(self):
    """
    Natural Gradients of the bound with respect to phi, the variational
    parameters controlling assignment of the data to GPs
    """
    grad_Lm = np.zeros_like(self.phi)
    for i, kern in enumerate(self.kern):
        K = kern.K(self.X)
        I = np.eye(self.N)

        B_inv = np.diag(1. / ((self.phi[:, i] + 1e-6) / self.variance))
        alpha = np.linalg.solve(K + B_inv, self.Y)
        K_B_inv = pdinv(K + B_inv)[0]
        dL_dB = tdot(alpha) - K_B_inv

        for n in range(self.phi.shape[0]):
            grad_B_inv = np.zeros_like(B_inv)
            grad_B_inv[n, n] = -self.variance / (self.phi[n, i]**2 + 1e-6)
            grad_Lm[n, i] = 0.5 * np.trace(np.dot(dL_dB, grad_B_inv))

    grad_phi = grad_Lm + self.mixing_prop_bound_grad() + self.Hgrad
    natgrad = grad_phi - np.sum(self.phi * grad_phi, 1)[:, None]
    grad = natgrad * self.phi

    return grad.flatten(), natgrad.flatten()
def inference(self, kern, X, W, likelihood, Y, mean_function=None, Y_metadata=None,
              K=None, variance=None, Z_tilde=None):
    """
    Returns a Posterior class containing essential quantities of the posterior
    """
    if mean_function is None:
        m = 0
    else:
        m = mean_function.f(X)

    if variance is None:
        variance = likelihood.gaussian_variance(Y_metadata)

    YYT_factor = Y - m

    if K is None:
        K = kern.K(X)

    Ky = K.copy()
    diag.add(Ky, variance + 1e-8)

    Wi, LW, LWi, W_logdet = pdinv(Ky)

    alpha, _ = dpotrs(LW, YYT_factor, lower=1)

    log_marginal = 0.5 * (-Y.size * log_2_pi - Y.shape[1] * W_logdet - np.sum(alpha * YYT_factor))

    if Z_tilde is not None:
        # This is a correction term for the log marginal likelihood
        # In EP this is log Z_tilde, which is the difference between the
        # Gaussian marginal and Z_EP
        log_marginal += Z_tilde

    dL_dK = 0.5 * (tdot(alpha) - Y.shape[1] * Wi)

    dL_dthetaL = likelihood.exact_inference_gradients(np.diag(dL_dK), Y_metadata)

    posterior_ = Posterior(woodbury_chol=LW, woodbury_vector=alpha, K=K)
    return posterior_, log_marginal, {'dL_dK': dL_dK, 'dL_dthetaL': dL_dthetaL, 'dL_dm': alpha}, W_logdet
def compute_dL(self):
    # Common computation
    beta = 1./np.fmax(self.likelihood.variance, 1e-6)
    output_dim = self.Y.shape[-1]
    wv = self.posterior.woodbury_vector

    if self.missing_data:
        wv = wv[:, self.valid_dim]
        output_dim = self.valid_dim.sum()
        if self.ninan is not None:
            self.dL_dpsi2 = beta/2.*(self.posterior.woodbury_inv[:, :, self.valid_dim] - tdot(wv)[:, :, None]).sum(-1)
        else:
            self.dL_dpsi2 = beta/2.*(output_dim*self.posterior.woodbury_inv - tdot(wv))
        self.dL_dpsi1 = beta*np.dot(self.Y[:, self.valid_dim], wv.T)
        self.dL_dpsi0 = -beta/2.*np.ones(self.Y.shape[0])
    else:
        self.dL_dpsi2 = beta*(output_dim*self.posterior.woodbury_inv - tdot(wv))/2.  # np.einsum('md,od->mo', wv, wv)
        self.dL_dpsi1 = beta*np.dot(self.Y, wv.T)
        self.dL_dpsi0 = -beta/2.*output_dim*np.ones(self.Y.shape[0])
def K(self, X, X2=None):
    if self.ARD:
        if X2 is None:
            return tdot(X * np.sqrt(self.variance))
        else:
            rv = np.sqrt(self.variance)
            return np.dot(X * rv, (X2 * rv).T)
    else:
        return self._dot_product(X, X2) * self.variance
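# Hedged usage sketch for the ARD branch above: with one variance per input dimension,
# K(X, X) = (X * sqrt(variance)) (X * sqrt(variance))^T, which is exactly what tdot
# computes in the symmetric (X2 is None) case. The data and variance values below are
# assumptions for illustration only.
import numpy as np
from GPy.util.linalg import tdot

X = np.random.randn(10, 3)
variance = np.array([0.5, 1.0, 2.0])          # one variance per input dimension (ARD)
K_fast = tdot(X * np.sqrt(variance))          # symmetric case
K_explicit = (X * variance).dot(X.T)          # sum_q variance_q * x_iq * x_jq
print(np.allclose(K_fast, K_explicit))        # expected: True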
def comp_KL_qU(self, qU_mean, qU_var):
    M, D = qU_mean.shape[0], qU_mean.shape[1]
    qU_L = self.mid['qU_L']
    L = self.mid['L']
    Linvmu = self.mid['Linvmu']
    LinvLu = self.mid['LinvLu']
    KuuInv = dpotri(L, lower=1)[0]

    Lu = qU_L
    LuInv = dtrtri(Lu)

    KL = D*M/-2. - np.log(np.diag(Lu)).sum()*D + np.log(np.diag(L)).sum()*D \
         + np.square(LinvLu).sum()/2.*D + np.square(Linvmu).sum()/2.

    dKL_dqU_mean = dtrtrs(L, Linvmu, trans=True)[0]
    dKL_dqU_var = (tdot(LuInv.T)/-2. + KuuInv/2.)*D
    dKL_dKuu = KuuInv*D/2. - KuuInv.dot(tdot(qU_mean) + qU_var*D).dot(KuuInv)/2.

    return float(KL), dKL_dqU_mean, dKL_dqU_var, dKL_dKuu
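# A hedged numerical check (illustrative assumptions, not from the source) of the KL
# identity used above: for q(u_d) = N(mu_d, S) per output column d and p(u_d) = N(0, Kuu),
# sum_d KL(q||p) = -DM/2 - D*log|Ls| + D*log|Lm| + D/2*||Lm^{-1}Ls||_F^2 + 1/2*||Lm^{-1}mu||_F^2,
# where Kuu = Lm Lm^T and S = Ls Ls^T. It is compared against the textbook Gaussian KL.
import numpy as np
from GPy.util.linalg import jitchol, dtrtrs

M, D = 6, 3
A = np.random.randn(M, M); Kuu = A.dot(A.T) + M*np.eye(M)
B = np.random.randn(M, M); S = B.dot(B.T) + M*np.eye(M)
mu = np.random.randn(M, D)

Lm, Ls = jitchol(Kuu), jitchol(S)
LinvLs = dtrtrs(Lm, Ls)[0]
Linvmu = dtrtrs(Lm, mu)[0]
KL_chol = -D*M/2. - D*np.log(np.diag(Ls)).sum() + D*np.log(np.diag(Lm)).sum() \
          + D*np.square(LinvLs).sum()/2. + np.square(Linvmu).sum()/2.

Kinv = np.linalg.inv(Kuu)
KL_direct = 0.5*(D*np.trace(Kinv.dot(S)) + np.einsum('id,ij,jd->', mu, Kinv, mu)
                 - D*M + D*np.linalg.slogdet(Kuu)[1] - D*np.linalg.slogdet(S)[1])
print(np.allclose(KL_chol, KL_direct))        # expected: True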
def _get_YYTfactor(self, Y):
    """
    find a matrix L which satisfies LL^T = YY^T.
    Note that L may have fewer columns than Y.
    """
    N, D = Y.shape
    if (N >= D):
        return Y.view(np.ndarray)
    else:
        return jitchol(tdot(Y))
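# A small sanity sketch (assumed, not from the source) of the property exploited above:
# when N < D, jitchol(tdot(Y)) returns an N x N lower-triangular factor L with
# L L^T ~= Y Y^T, so downstream code can work with the compact factor instead of the wide Y.
import numpy as np
from GPy.util.linalg import jitchol, tdot

Y = np.random.randn(5, 50)                    # N=5 rows, D=50 columns (N < D)
L = jitchol(tdot(Y))                          # Cholesky factor of Y Y^T, shape (5, 5)
print(L.shape, np.allclose(L.dot(L.T), Y.dot(Y.T), atol=1e-6))   # expected: (5, 5) True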
def gatherPsiStat(self, kern, X, Z, uncertain_inputs):
    if uncertain_inputs:
        psi0 = kern.psi0(Z, X).sum()
        psi1 = kern.psi1(Z, X)
        psi2 = kern.psi2(Z, X)
    else:
        psi0 = kern.Kdiag(X).sum()
        psi1 = kern.K(X, Z)
        psi2 = tdot(psi1.T)

    return psi0, psi1, psi2
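# Hedged note on the deterministic-input branch above: with psi1 = K(X, Z) of shape N x M,
# psi2 = tdot(psi1.T) is the M x M matrix psi1^T psi1, i.e. the second-moment statistic
# the psi2 expectation reduces to when X carries no uncertainty. Shapes below are
# illustrative assumptions.
import numpy as np
from GPy.util.linalg import tdot

psi1 = np.random.randn(100, 10)               # stand-in for kern.K(X, Z), N=100, M=10
print(np.allclose(tdot(psi1.T), psi1.T.dot(psi1)))   # expected: True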
def _raw_predict(self, kern, Xnew, A_ast, pred_var, full_cov=False):
    """
    pred_var: _predictive_variable, X_t (all X up to round t)
    """
    # print('PosteriorExactGroup _raw_predict')
    # NOTE: change Kx to AKx and add .dot(A_ast.T)
    # NOTE: 20210827 confirm mu and var (for self._woodbury_chol.ndim == 2 case)
    Kx = self.A.dot(kern.K(pred_var, Xnew)).dot(A_ast.T)  # A_t k(X_t, X_ast) A_ast
    mu = np.dot(Kx.T, self.woodbury_vector)
    # mu = A_ast k(X_t, X_ast)^T A_t^T (A_t k(X_t, X_t) A_t^T + sigma^2 I)^{-1} (Y_t - m)
    if len(mu.shape) == 1:
        mu = mu.reshape(-1, 1)

    if full_cov:
        Kxx = kern.K(Xnew)  # k(X_ast, X_ast)
        # self._woodbury_chol: Cholesky decomposition of A_t k(X_t, X_t) A_t^T + sigma^2 I
        if self._woodbury_chol.ndim == 2:
            # DTRTRS solves a triangular system of the form A * X = B or A**T * X = B,
            # where A is a triangular matrix of order N, and B is an N-by-NRHS matrix.
            # A check is made to verify that A is nonsingular.
            tmp = dtrtrs(self._woodbury_chol, Kx)[0]
            # (A_t k(X_t, X_t) A_t^T + sigma^2 I)^{-1} k(X_ast, X_ast) -> v
            # tdot: returns np.dot(mat, mat.T), but faster for large 2D arrays of doubles.
            var = A_ast.dot(Kxx - tdot(tmp.T)).dot(A_ast.T)
        elif self._woodbury_chol.ndim == 3:  # Missing data
            raise NotImplementedError('Need to be extended to group case!')
            var = np.empty((Kxx.shape[0], Kxx.shape[1], self._woodbury_chol.shape[2]))
            for i in range(var.shape[2]):
                tmp = dtrtrs(self._woodbury_chol[:, :, i], Kx)[0]
                var[:, :, i] = (Kxx - tdot(tmp.T))
            var = var
    else:
        Kxx = np.diag(A_ast.dot(kern.K(Xnew, Xnew)).dot(A_ast.T))
        if self._woodbury_chol.ndim == 2:
            tmp = dtrtrs(self._woodbury_chol, Kx)[0]
            var = (Kxx - np.square(tmp).sum(0))[:, None]
        elif self._woodbury_chol.ndim == 3:  # Missing data
            raise NotImplementedError('Need to be extended to group case!')
            var = np.empty((Kxx.shape[0], self._woodbury_chol.shape[2]))
            for i in range(var.shape[1]):
                tmp = dtrtrs(self._woodbury_chol[:, :, i], Kx)[0]
                var[:, i] = (Kxx - np.square(tmp).sum(0))
            var = var
    return mu, var
def comp_KL_qU(self, qU_mean, qU_var):
    M, D = qU_mean.shape[0], qU_mean.shape[1]
    qU_L = self.mid['qU_L']
    L = self.mid['L']
    Linvmu = self.mid['Linvmu']
    LinvLu = self.mid['LinvLu']
    KuuInv = dpotri(L, lower=1)[0]

    Lu = qU_L
    LuInv = dtrtri(Lu)

    KL = D * M / -2. - np.log(np.diag(Lu)).sum() * D + np.log(np.diag(L)).sum() * D \
         + np.square(LinvLu).sum() / 2. * D + np.square(Linvmu).sum() / 2.

    dKL_dqU_mean = dtrtrs(L, Linvmu, trans=True)[0]
    dKL_dqU_var = (tdot(LuInv.T) / -2. + KuuInv / 2.) * D
    dKL_dKuu = KuuInv * D / 2. - KuuInv.dot(tdot(qU_mean) + qU_var * D).dot(KuuInv) / 2.

    return float(KL), dKL_dqU_mean, dKL_dqU_var, dKL_dKuu
def compute_dl_dK(posterior, K, eta, theta, prior_mean=0):
    tau, v = theta, eta

    tau_tilde_root = np.sqrt(tau)
    Sroot_tilde_K = tau_tilde_root[:, None] * K
    aux_alpha, _ = dpotrs(posterior.L, np.dot(Sroot_tilde_K, v), lower=1)
    alpha = (v - tau_tilde_root * aux_alpha)[:, None]  # (K + Sigma^tilde)^(-1) mu^tilde
    LWi, _ = dtrtrs(posterior.L, np.diag(tau_tilde_root), lower=1)

    Wi = np.dot(LWi.T, LWi)
    symmetrify(Wi)  # (K + Sigma^tilde)^(-1)

    dL_dK = 0.5 * (tdot(alpha) - Wi)
    return dL_dK
def _inference(K, ga_approx, cav_params, likelihood, Z_tilde, Y_metadata=None):
    log_marginal, post_params = _ep_marginal(K, ga_approx, Z_tilde)

    tau_tilde_root = np.sqrt(ga_approx.tau)
    Sroot_tilde_K = tau_tilde_root[:, None] * K
    aux_alpha, _ = dpotrs(post_params.L, np.dot(Sroot_tilde_K, ga_approx.v), lower=1)
    alpha = (ga_approx.v - tau_tilde_root * aux_alpha)[:, None]  # (K + Sigma^tilde)^(-1) mu^tilde
    LWi, _ = dtrtrs(post_params.L, np.diag(tau_tilde_root), lower=1)

    Wi = np.dot(LWi.T, LWi)
    symmetrify(Wi)  # (K + Sigma^tilde)^(-1)

    dL_dK = 0.5 * (tdot(alpha) - Wi)
    dL_dthetaL = 0
    # likelihood.ep_gradients(Y, cav_params.tau, cav_params.v, np.diag(dL_dK), Y_metadata=Y_metadata, quad_mode='gh')
    # temp2 = likelihood.ep_gradients(Y, cav_params.tau, cav_params.v, np.diag(dL_dK), Y_metadata=Y_metadata, quad_mode='naive')
    # temp = likelihood.exact_inference_gradients(np.diag(dL_dK), Y_metadata=Y_metadata)
    # print("exact: {}, approx: {}, Ztilde: {}, naive: {}".format(temp, dL_dthetaL, Z_tilde, temp2))

    return Posterior(woodbury_inv=Wi, woodbury_vector=alpha, K=K), log_marginal, \
        {'dL_dK': dL_dK, 'dL_dthetaL': dL_dthetaL, 'dL_dm': alpha}
def scaled_dist(self, X, X2=None):
    X = self.input_type.evaluate(X)
    if X2 is None:
        Xsq = np.sum(np.square(X), 1)
        r2 = -2. * tdot(X) + (Xsq[:, None] + Xsq[None, :])
        # force the diagonal to be zero: sometimes it is numerically a little negative
        util.diag.view(r2)[:, ] = 0.
        r2 = np.clip(r2, 0, np.inf)
        return np.sqrt(r2) / self.lengthscale
    else:
        # X2, = self._slice_X(X2)
        X2 = self.input_type.evaluate(X2)
        X1sq = np.sum(np.square(X), 1)
        X2sq = np.sum(np.square(X2), 1)
        r2 = -2. * np.dot(X, X2.T) + (X1sq[:, None] + X2sq[None, :])
        r2 = np.clip(r2, 0, np.inf)
        return np.sqrt(r2) / self.lengthscale
def __init__(self, X, Y, K=2, kernels=None, variance=1., alpha=1., prior_Z='symmetric', name='OMGP'):
    N, self.D = Y.shape
    self.Y = Y
    self.YYT = tdot(self.Y)
    self.X = X

    if kernels is None:
        self.kern = []
        for i in range(K):
            self.kern.append(GPy.kern.RBF(input_dim=1))
    else:
        self.kern = kernels

    CollapsedMixture.__init__(self, N, K, prior_Z, alpha, name)

    self.link_parameter(GPy.core.parameterization.param.Param(
        'variance', variance, GPy.core.parameterization.transformations.Logexp()))
    self.link_parameters(*self.kern)
def _unscaled_dist(self, X, X2=None):
    """
    Compute the Euclidean distance between each row of X and X2, or between
    each pair of rows of X if X2 is None.
    """
    # X, = self._slice_X(X)
    if X2 is None:
        Xsq = np.sum(np.square(X), 1)
        r2 = -2. * tdot(X) + (Xsq[:, None] + Xsq[None, :])
        # force the diagonal to be zero: sometimes it is numerically a little negative
        util.diag.view(r2)[:, ] = 0.
        r2 = np.clip(r2, 0, np.inf)
        return np.sqrt(r2)
    else:
        # X2, = self._slice_X(X2)
        X1sq = np.sum(np.square(X), 1)
        X2sq = np.sum(np.square(X2), 1)
        r2 = -2. * np.dot(X, X2.T) + (X1sq[:, None] + X2sq[None, :])
        r2 = np.clip(r2, 0, np.inf)
        return np.sqrt(r2)
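# A hedged cross-check (assumed, not part of the kernel code) that the tdot-based
# expansion ||x_i - x_j||^2 = x_i.x_i + x_j.x_j - 2 x_i.x_j used above matches a
# direct pairwise distance computation.
import numpy as np
from scipy.spatial.distance import cdist
from GPy.util.linalg import tdot

X = np.random.randn(30, 4)
Xsq = np.sum(np.square(X), 1)
r2 = np.clip(-2. * tdot(X) + (Xsq[:, None] + Xsq[None, :]), 0, np.inf)
print(np.allclose(np.sqrt(r2), cdist(X, X), atol=1e-6))   # expected: True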
def gatherPsiStat(self, kern, X, Z, Y, beta, uncertain_inputs, D, missing_data):
    assert beta.size == 1

    if uncertain_inputs:
        psi0 = kern.psi0(Z, X)
        psi1 = kern.psi1(Z, X) * beta
        psi2 = kern.psi2(Z, X) * beta if not missing_data else kern.psi2n(Z, X) * beta
    else:
        psi0 = kern.Kdiag(X)
        psi1 = kern.K(X, Z)
        if missing_data:
            psi2 = psi1[:, None, :] * psi1[:, :, None] * beta
        else:
            psi2 = tdot(psi1.T) * beta
        psi1 = psi1 * beta

    if isinstance(Y, VariationalPosterior):
        m, s = Y.mean, Y.variance
        psi1Y = np.dot(m.T, psi1)  # DxM
        YRY = (np.square(m).sum() + s.sum()) * beta
        psi0 = (D * psi0).sum() * beta
    elif missing_data:
        psi1Y = np.dot((Y).T, psi1)  # DxM
        trYYT = self.get_trYYT(Y)
        YRY = trYYT * beta
        psi0 = (psi0 * D).sum() * beta
    else:
        psi1Y = np.dot(Y.T, psi1)  # DxM
        trYYT = self.get_trYYT(Y)
        YRY = trYYT * beta
        psi0 = (psi0 * D).sum() * beta

    return psi0, psi2, YRY, psi1, psi1Y
def get_YYTfactor(self, Y):
    N, D = Y.shape
    if (N >= D):
        return Y.view(np.ndarray)
    else:
        return jitchol(tdot(Y))
def _inference_vardtc(self): if self.svi: from GPy.util.linalg import tdot self.qU_var = tdot(self.qU_W)+np.eye(self.Z.shape[0])*self.qU_a self.posterior, self._log_marginal_likelihood, self.grad_dict = self.inference_method.inference(self.kern, self.X, self.Z, self.likelihood, self.Y, self.qU_mean , self.qU_var, Kuu_sigma=self.Kuu_sigma) if self.mpi_comm is None or (self.mpi_comm is not None and self.mpi_comm.rank==self.mpi_root): KL, dKL_dqU_mean, dKL_dqU_var, dKL_dKuu = self.inference_method.comp_KL_qU(self.qU_mean ,self.qU_var) self._log_marginal_likelihood += -KL*self.qU_ratio self.grad_dict['dL_dqU_mean'] += -dKL_dqU_mean*self.qU_ratio self.grad_dict['dL_dqU_var'] += -dKL_dqU_var*self.qU_ratio self.grad_dict['dL_dKmm'] += -dKL_dKuu*self.qU_ratio else: self.posterior, self._log_marginal_likelihood, self.grad_dict = self.inference_method.inference(self.kern, self.X, self.Z, self.likelihood, self.Y, self.Y_metadata, Kuu_sigma=self.Kuu_sigma if hasattr(self, 'Kuu_sigma') else None) self.likelihood.update_gradients(self.grad_dict['dL_dthetaL']) dL_dKmm = self.grad_dict['dL_dKmm'] if (self.mpi_comm is None or (self.mpi_comm is not None and self.mpi_comm.rank==self.mpi_root)) and (hasattr(self, 'Kuu_sigma') and self.Kuu_sigma is not None): self.Kuu_sigma.gradient = np.diag(dL_dKmm) if isinstance(self.X, VariationalPosterior): #gradients wrt kernel if self.psicov: self.kern.update_gradients_expectations_psicov(variational_posterior=self.X, Z=self.Z, dL_dpsi0=self.grad_dict['dL_dpsi0'], dL_dpsi1=self.grad_dict['dL_dpsi1'], dL_dpsicov=self.grad_dict['dL_dpsicov']) else: self.kern.update_gradients_expectations(variational_posterior=self.X, Z=self.Z, dL_dpsi0=self.grad_dict['dL_dpsi0'], dL_dpsi1=self.grad_dict['dL_dpsi1'], dL_dpsi2=self.grad_dict['dL_dpsi2']) kerngrad = self.kern.gradient.copy() if self.mpi_comm is None: self.kern.update_gradients_full(dL_dKmm, self.Z, None) kerngrad += self.kern.gradient.copy() self.kern.gradient = kerngrad else: kerngrad = reduceArrays([kerngrad], self.mpi_comm, self.mpi_root)[0] if self.mpi_comm.rank==self.mpi_root: self.kern.update_gradients_full(dL_dKmm, self.Z, None) kerngrad += self.kern.gradient.copy() self.kern.gradient = kerngrad #gradients wrt Z if self.psicov: self.Z.gradient = self.kern.gradients_Z_expectations_psicov( self.grad_dict['dL_dpsi0'], self.grad_dict['dL_dpsi1'], self.grad_dict['dL_dpsicov'], Z=self.Z, variational_posterior=self.X) else: self.Z.gradient = self.kern.gradients_Z_expectations( self.grad_dict['dL_dpsi0'], self.grad_dict['dL_dpsi1'], self.grad_dict['dL_dpsi2'], Z=self.Z, variational_posterior=self.X) if self.mpi_comm is None: self.Z.gradient += self.kern.gradients_X(dL_dKmm, self.Z) else: self.Z.gradient = reduceArrays([self.Z.gradient], self.mpi_comm, self.mpi_root)[0] if self.mpi_comm.rank == self.mpi_root: self.Z.gradient += self.kern.gradients_X(dL_dKmm, self.Z) else: #gradients wrt kernel self.kern.update_gradients_diag(self.grad_dict['dL_dKdiag'], self.X) kerngrad = self.kern.gradient.copy() self.kern.update_gradients_full(self.grad_dict['dL_dKnm'], self.X, self.Z) kerngrad += self.kern.gradient if self.mpi_comm is None: self.kern.update_gradients_full(dL_dKmm, self.Z, None) self.kern.gradient += kerngrad else: kerngrad = reduceArrays([kerngrad], self.mpi_comm, self.mpi_root)[0] if self.mpi_comm.rank==self.mpi_root: self.kern.update_gradients_full(dL_dKmm, self.Z, None) kerngrad += self.kern.gradient.copy() self.kern.gradient = kerngrad #gradients wrt Z self.Z.gradient = self.kern.gradients_X(self.grad_dict['dL_dKnm'].T, 
self.Z, self.X) if self.mpi_comm is None: self.Z.gradient += self.kern.gradients_X(dL_dKmm, self.Z) else: self.Z.gradient = reduceArrays([self.Z.gradient], self.mpi_comm, self.mpi_root)[0] if self.mpi_comm.rank == self.mpi_root: self.Z.gradient += self.kern.gradients_X(dL_dKmm, self.Z) if self.svi: self.qU_mean.gradient = self.grad_dict['dL_dqU_mean'] self.qU_W.gradient = (self.grad_dict['dL_dqU_var']+self.grad_dict['dL_dqU_var'].T).dot(self.qU_W) self.qU_a.gradient = np.diag(self.grad_dict['dL_dqU_var']).sum()
def inference(self, kern, X, Z, likelihood, Y, qU): """ The SVI-VarDTC inference """ if isinstance(Y, np.ndarray) and np.any(np.isnan(Y)): missing_data = True N, M, Q = Y.shape[0], Z.shape[0], Z.shape[1] Ds = Y.shape[1] - (np.isnan(Y) * 1).sum(1) Ymask = 1 - np.isnan(Y) * 1 Y_masked = np.zeros_like(Y) Y_masked[Ymask == 1] = Y[Ymask == 1] ND = Ymask.sum() else: missing_data = False N, D, M, Q = Y.shape[0], Y.shape[1], Z.shape[0], Z.shape[1] ND = N * D uncertain_inputs = isinstance(X, VariationalPosterior) uncertain_outputs = isinstance(Y, VariationalPosterior) beta = 1. / np.fmax(likelihood.variance, 1e-6) psi0, psi2, YRY, psi1, psi1Y = self.gatherPsiStat( kern, X, Z, Y if not missing_data else Y_masked, beta, uncertain_inputs, D if not missing_data else Ds, missing_data) #====================================================================== # Compute Common Components #====================================================================== mu, S = qU.mean, qU.covariance mupsi1Y = mu.dot(psi1Y) Kmm = kern.K(Z).copy() diag.add(Kmm, self.const_jitter) Lm = jitchol(Kmm) if missing_data: S_mu = S[None, :, :] + mu.T[:, :, None] * mu.T[:, None, :] NS_mu = S_mu.T.dot(Ymask.T).T LmInv = dtrtri(Lm) LmInvPsi2LmInvT = np.swapaxes(psi2.dot(LmInv.T), 1, 2).dot(LmInv.T) LmInvSmuLmInvT = np.swapaxes(NS_mu.dot(LmInv.T), 1, 2).dot(LmInv.T) B = mupsi1Y + mupsi1Y.T + (Ds[:, None, None] * psi2).sum(0) tmp = backsub_both_sides(Lm, B, 'right') logL = -ND*log_2_pi/2. +ND*np.log(beta)/2. - psi0/2. - YRY/2. \ -(LmInvSmuLmInvT*LmInvPsi2LmInvT).sum()/2. +np.trace(tmp)/2. else: S_mu = S * D + tdot(mu) if uncertain_inputs: LmInvPsi2LmInvT = backsub_both_sides(Lm, psi2, 'right') else: LmInvPsi2LmInvT = tdot(dtrtrs( Lm, psi1.T)[0]) / beta #tdot(psi1.dot(LmInv.T).T) /beta LmInvSmuLmInvT = backsub_both_sides(Lm, S_mu, 'right') B = mupsi1Y + mupsi1Y.T + D * psi2 tmp = backsub_both_sides(Lm, B, 'right') logL = -ND*log_2_pi/2. +ND*np.log(beta)/2. - psi0/2. - YRY/2. \ -(LmInvSmuLmInvT*LmInvPsi2LmInvT).sum()/2. +np.trace(tmp)/2. #====================================================================== # Compute dL_dKmm #====================================================================== dL_dKmm = np.eye(M) #====================================================================== # Compute dL_dthetaL for uncertian input and non-heter noise #====================================================================== dL_dthetaL = None #(YRY*beta + beta*output_dim*psi0 - num_data*output_dim*beta)/2. - beta*(dL_dpsi2R*psi2).sum() - beta*np.trace(LLinvPsi1TYYTPsi1LLinvT) #====================================================================== # Compute dL_dpsi #====================================================================== if missing_data: dL_dpsi0 = -Ds * (beta * np.ones((N, ))) / 2. else: dL_dpsi0 = -D * (beta * np.ones((N, ))) / 2. if uncertain_outputs: Ym, Ys = Y.mean, Y.variance dL_dpsi1 = dtrtrs(Lm, dtrtrs(Lm, Ym.dot(mu.T).T)[0], trans=1)[0].T * beta else: if missing_data: dL_dpsi1 = dtrtrs( Lm, dtrtrs(Lm, (Y_masked).dot(mu.T).T)[0], trans=1)[0].T * beta else: dL_dpsi1 = dtrtrs(Lm, dtrtrs(Lm, Y.dot(mu.T).T)[0], trans=1)[0].T * beta if uncertain_inputs: if missing_data: dL_dpsi2 = np.swapaxes( (Ds[:, None, None] * np.eye(M)[None, :, :] - LmInvSmuLmInvT).dot(LmInv), 1, 2).dot(LmInv) * beta / 2. else: dL_dpsi2 = beta * backsub_both_sides( Lm, D * np.eye(M) - LmInvSmuLmInvT, 'left') / 2. 
else: dL_dpsi1 += beta * psi1.dot(dL_dpsi2 + dL_dpsi2.T) dL_dpsi2 = None if uncertain_inputs: grad_dict = { 'dL_dKmm': dL_dKmm, 'dL_dpsi0': dL_dpsi0, 'dL_dpsi1': dL_dpsi1, 'dL_dpsi2': dL_dpsi2, 'dL_dthetaL': dL_dthetaL } else: grad_dict = { 'dL_dKmm': dL_dKmm, 'dL_dKdiag': dL_dpsi0, 'dL_dKnm': dL_dpsi1, 'dL_dthetaL': dL_dthetaL } if uncertain_outputs: Ym = Y.mean grad_dict['dL_dYmean'] = -Ym * beta + dtrtrs(Lm, psi1.T)[0].T.dot( dtrtrs(Lm, mu)[0]) grad_dict['dL_dYvar'] = beta / -2. return logL, grad_dict
def inference(self, kern, X, Z, likelihood, Y, mean_function=None, Y_metadata=None):
    assert mean_function is None, "inference with a mean function not implemented"

    num_inducing, _ = Z.shape
    num_data, output_dim = Y.shape

    # make sure the noise is not hetero
    sigma_n = likelihood.gaussian_variance(Y_metadata)
    if sigma_n.size > 1:
        raise NotImplementedError("no hetero noise with this implementation of PEP")

    Kmm = kern.K(Z)
    Knn = kern.Kdiag(X)
    Knm = kern.K(X, Z)
    U = Knm

    # factor Kmm
    diag.add(Kmm, self.const_jitter)
    Kmmi, L, Li, _ = pdinv(Kmm)

    # compute beta_star, the effective noise precision
    LiUT = np.dot(Li, U.T)
    sigma_star = sigma_n + self.alpha * (Knn - np.sum(np.square(LiUT), 0))
    beta_star = 1. / sigma_star

    # Compute and factor A
    A = tdot(LiUT * np.sqrt(beta_star)) + np.eye(num_inducing)
    LA = jitchol(A)

    # back substitute to get b, P, v
    URiy = np.dot(U.T * beta_star, Y)
    tmp, _ = dtrtrs(L, URiy, lower=1)
    b, _ = dtrtrs(LA, tmp, lower=1)
    tmp, _ = dtrtrs(LA, b, lower=1, trans=1)
    v, _ = dtrtrs(L, tmp, lower=1, trans=1)
    tmp, _ = dtrtrs(LA, Li, lower=1, trans=0)
    P = tdot(tmp.T)

    alpha_const_term = (1.0 - self.alpha) / self.alpha

    # compute log marginal
    log_marginal = -0.5*num_data*output_dim*np.log(2*np.pi) + \
                   -np.sum(np.log(np.diag(LA)))*output_dim + \
                   0.5*output_dim*(1+alpha_const_term)*np.sum(np.log(beta_star)) + \
                   -0.5*np.sum(np.square(Y.T*np.sqrt(beta_star))) + \
                   0.5*np.sum(np.square(b)) + 0.5*alpha_const_term*num_data*np.log(sigma_n)

    # compute dL_dR
    Uv = np.dot(U, v)
    dL_dR = 0.5*(np.sum(U*np.dot(U, P), 1) - (1.0+alpha_const_term)/beta_star
                 + np.sum(np.square(Y), 1) - 2.*np.sum(Uv*Y, 1)
                 + np.sum(np.square(Uv), 1))*beta_star**2

    # Compute dL_dKmm
    vvT_P = tdot(v.reshape(-1, 1)) + P
    dL_dK = 0.5*(Kmmi - vvT_P)
    KiU = np.dot(Kmmi, U.T)
    dL_dK += self.alpha * np.dot(KiU*dL_dR, KiU.T)

    # Compute dL_dU
    vY = np.dot(v.reshape(-1, 1), Y.T)
    dL_dU = vY - np.dot(vvT_P, U.T)
    dL_dU *= beta_star
    dL_dU -= self.alpha * 2.*KiU*dL_dR

    dL_dthetaL = likelihood.exact_inference_gradients(dL_dR)
    dL_dthetaL += 0.5*alpha_const_term*num_data / sigma_n

    grad_dict = {'dL_dKmm': dL_dK, 'dL_dKdiag': dL_dR * self.alpha,
                 'dL_dKnm': dL_dU.T, 'dL_dthetaL': dL_dthetaL}

    # construct a posterior object
    post = Posterior(woodbury_inv=Kmmi-P, woodbury_vector=v, K=Kmm, mean=None, cov=None, K_chol=L)

    return post, log_marginal, grad_dict
def inference(self, kern, X, Z, likelihood, Y, qU_mean, qU_var, Kuu_sigma=None): """ The SVI-VarDTC inference """ N, D, M, Q = Y.shape[0], Y.shape[1], Z.shape[0], Z.shape[1] uncertain_inputs = isinstance(X, VariationalPosterior) uncertain_outputs = isinstance(Y, VariationalPosterior) beta = 1. / likelihood.variance psi0, psi2, YRY, psi1, psi1Y = self.gatherPsiStat( kern, X, Z, Y, beta, uncertain_inputs) #====================================================================== # Compute Common Components #====================================================================== Kuu = kern.K(Z).copy() if Kuu_sigma is not None: diag.add(Kuu, Kuu_sigma) else: diag.add(Kuu, self.const_jitter) Lm = jitchol(Kuu) mu, S = qU_mean, qU_var Ls = jitchol(S) LinvLs = dtrtrs(Lm, Ls)[0] Linvmu = dtrtrs(Lm, mu)[0] psi1YLinvT = dtrtrs(Lm, psi1Y.T)[0].T self.mid = {'qU_L': Ls, 'LinvLu': LinvLs, 'L': Lm, 'Linvmu': Linvmu} if uncertain_inputs: LmInvPsi2LmInvT = backsub_both_sides(Lm, psi2, 'right') else: LmInvPsi2LmInvT = tdot(dtrtrs(Lm, psi1.T)[0]) / beta LmInvSmuLmInvT = tdot(LinvLs) * D + tdot(Linvmu) # logdet_L = np.sum(np.log(np.diag(Lm))) # logdet_S = np.sum(np.log(np.diag(Ls))) #====================================================================== # Compute log-likelihood #====================================================================== logL_R = -N * np.log(beta) logL = -N*D*log_2_pi/2. -D*logL_R/2. - D*psi0/2. - YRY/2. \ -(LmInvSmuLmInvT*LmInvPsi2LmInvT).sum()/2. + np.trace(LmInvPsi2LmInvT)*D/2.+(Linvmu*psi1YLinvT.T).sum() #====================================================================== # Compute dL_dKmm #====================================================================== tmp1 = backsub_both_sides(Lm, LmInvSmuLmInvT.dot(LmInvPsi2LmInvT), 'left') tmp2 = Linvmu.dot(psi1YLinvT) tmp3 = backsub_both_sides(Lm, -D * LmInvPsi2LmInvT - tmp2 - tmp2.T, 'left') / 2. dL_dKmm = (tmp1 + tmp1.T) / 2. + tmp3 #====================================================================== # Compute dL_dthetaL for uncertian input and non-heter noise #====================================================================== dL_dthetaL = -D * N * beta / 2. - ( -D * psi0 / 2. - YRY / 2. - (LmInvSmuLmInvT * LmInvPsi2LmInvT).sum() / 2. + np.trace(LmInvPsi2LmInvT) * D / 2. + (Linvmu * psi1YLinvT.T).sum()) * beta #====================================================================== # Compute dL_dqU #====================================================================== tmp1 = backsub_both_sides(Lm, -LmInvPsi2LmInvT, 'left') dL_dqU_mean = tmp1.dot(mu) + dtrtrs(Lm, psi1YLinvT.T, trans=1)[0] dL_dqU_var = D / 2. * tmp1 #====================================================================== # Compute the Posterior distribution of inducing points p(u|Y) #====================================================================== KuuInvmu = dtrtrs(Lm, Linvmu, trans=1)[0] tmp = backsub_both_sides(Lm, np.eye(M) - tdot(LinvLs), 'left') post = Posterior(woodbury_inv=tmp, woodbury_vector=KuuInvmu, K=Kuu, mean=mu, cov=S, K_chol=Lm) #====================================================================== # Compute dL_dpsi #====================================================================== dL_dpsi0 = -D * (beta * np.ones((N, ))) / 2. if uncertain_outputs: dL_dpsi1 = Y.mean.dot(dtrtrs(Lm, Linvmu, trans=1)[0].T) * beta else: dL_dpsi1 = Y.dot(dtrtrs(Lm, Linvmu, trans=1)[0].T) * beta dL_dpsi2 = beta * backsub_both_sides(Lm, D * np.eye(M) - LmInvSmuLmInvT, 'left') / 2. 
if not uncertain_inputs: dL_dpsi1 += psi1.dot(dL_dpsi2 + dL_dpsi2.T) / beta dL_dpsi2 = None if uncertain_inputs: grad_dict = { 'dL_dKmm': dL_dKmm, 'dL_dpsi0': dL_dpsi0, 'dL_dpsi1': dL_dpsi1, 'dL_dpsi2': dL_dpsi2, 'dL_dthetaL': dL_dthetaL, 'dL_dqU_mean': dL_dqU_mean, 'dL_dqU_var': dL_dqU_var } else: grad_dict = { 'dL_dKmm': dL_dKmm, 'dL_dKdiag': dL_dpsi0, 'dL_dKnm': dL_dpsi1, 'dL_dthetaL': dL_dthetaL, 'dL_dqU_mean': dL_dqU_mean, 'dL_dqU_var': dL_dqU_var } if uncertain_outputs: m, s = Y.mean, Y.variance grad_dict['dL_dYmean'] = -m * beta + dtrtrs(Lm, psi1.T)[0].T.dot( dtrtrs(Lm, mu)[0]) grad_dict['dL_dYvar'] = beta / -2. return post, logL, grad_dict
def _dot_product(self, X, X2=None):
    if X2 is None:
        return tdot(X)
    else:
        return np.dot(X, X2.T)
def inference(self, kern, X, Z, likelihood, Y, Y_metadata=None, Lm=None, dL_dKmm=None, fixed_covs_kerns=None, **kw): _, output_dim = Y.shape uncertain_inputs = isinstance(X, VariationalPosterior) #see whether we've got a different noise variance for each datum beta = 1./np.fmax(likelihood.gaussian_variance(Y_metadata), 1e-6) # VVT_factor is a matrix such that tdot(VVT_factor) = VVT...this is for efficiency! #self.YYTfactor = self.get_YYTfactor(Y) #VVT_factor = self.get_VVTfactor(self.YYTfactor, beta) het_noise = beta.size > 1 if het_noise: raise(NotImplementedError("Heteroscedastic noise not implemented, should be possible though, feel free to try implementing it :)")) if beta.ndim == 1: beta = beta[:, None] # do the inference: num_inducing = Z.shape[0] num_data = Y.shape[0] # kernel computations, using BGPLVM notation Kmm = kern.K(Z).copy() diag.add(Kmm, self.const_jitter) if Lm is None: Lm = jitchol(Kmm) # The rather complex computations of A, and the psi stats if uncertain_inputs: psi0 = kern.psi0(Z, X) psi1 = kern.psi1(Z, X) if het_noise: psi2_beta = np.sum([kern.psi2(Z,X[i:i+1,:]) * beta_i for i,beta_i in enumerate(beta)],0) else: psi2_beta = kern.psi2(Z,X) * beta LmInv = dtrtri(Lm) A = LmInv.dot(psi2_beta.dot(LmInv.T)) else: psi0 = kern.Kdiag(X) psi1 = kern.K(X, Z) if het_noise: tmp = psi1 * (np.sqrt(beta)) else: tmp = psi1 * (np.sqrt(beta)) tmp, _ = dtrtrs(Lm, tmp.T, lower=1) A = tdot(tmp) # factor B B = np.eye(num_inducing) + A LB = jitchol(B) # back substutue C into psi1Vf #tmp, _ = dtrtrs(Lm, psi1.T.dot(VVT_factor), lower=1, trans=0) #_LBi_Lmi_psi1Vf, _ = dtrtrs(LB, tmp, lower=1, trans=0) #tmp, _ = dtrtrs(LB, _LBi_Lmi_psi1Vf, lower=1, trans=1) #Cpsi1Vf, _ = dtrtrs(Lm, tmp, lower=1, trans=1) # data fit and derivative of L w.r.t. Kmm #delit = tdot(_LBi_Lmi_psi1Vf) # Expose YYT to get additional covariates in (YYT + Kgg): tmp, _ = dtrtrs(Lm, psi1.T, lower=1, trans=0) _LBi_Lmi_psi1, _ = dtrtrs(LB, tmp, lower=1, trans=0) tmp, _ = dtrtrs(LB, _LBi_Lmi_psi1, lower=1, trans=1) Cpsi1, _ = dtrtrs(Lm, tmp, lower=1, trans=1) # TODO: cache this: # Compute fixed covariates covariance: if fixed_covs_kerns is not None: K_fixed = 0 for name, [cov, k] in fixed_covs_kerns.iteritems(): K_fixed += k.K(cov) #trYYT = self.get_trYYT(Y) YYT_covs = (tdot(Y) + K_fixed) data_term = beta**2 * YYT_covs trYYT_covs = np.trace(YYT_covs) else: data_term = beta**2 * tdot(Y) trYYT_covs = self.get_trYYT(Y) #trYYT = self.get_trYYT(Y) delit = mdot(_LBi_Lmi_psi1, data_term, _LBi_Lmi_psi1.T) data_fit = np.trace(delit) DBi_plus_BiPBi = backsub_both_sides(LB, output_dim * np.eye(num_inducing) + delit) if dL_dKmm is None: delit = -0.5 * DBi_plus_BiPBi delit += -0.5 * B * output_dim delit += output_dim * np.eye(num_inducing) # Compute dL_dKmm dL_dKmm = backsub_both_sides(Lm, delit) # derivatives of L w.r.t. psi dL_dpsi0, dL_dpsi1, dL_dpsi2 = _compute_dL_dpsi(num_inducing, num_data, output_dim, beta, Lm, data_term, Cpsi1, DBi_plus_BiPBi, psi1, het_noise, uncertain_inputs) # log marginal likelihood log_marginal = _compute_log_marginal_likelihood(likelihood, num_data, output_dim, beta, het_noise, psi0, A, LB, trYYT_covs, data_fit, Y) if self.save_per_dim: self.saved_vals = [psi0, A, LB, _LBi_Lmi_psi1, beta] # No heteroscedastics, so no _LBi_Lmi_psi1Vf: # For the interested reader, try implementing the heteroscedastic version, it should be possible _LBi_Lmi_psi1Vf = None # Is just here for documentation, so you can see, what it was. 
#noise derivatives dL_dR = _compute_dL_dR(likelihood, het_noise, uncertain_inputs, LB, _LBi_Lmi_psi1Vf, DBi_plus_BiPBi, Lm, A, psi0, psi1, beta, data_fit, num_data, output_dim, trYYT_covs, Y, None) dL_dthetaL = likelihood.exact_inference_gradients(dL_dR,Y_metadata) #put the gradients in the right places if uncertain_inputs: grad_dict = {'dL_dKmm': dL_dKmm, 'dL_dpsi0':dL_dpsi0, 'dL_dpsi1':dL_dpsi1, 'dL_dpsi2':dL_dpsi2, 'dL_dthetaL':dL_dthetaL} else: grad_dict = {'dL_dKmm': dL_dKmm, 'dL_dKdiag':dL_dpsi0, 'dL_dKnm':dL_dpsi1, 'dL_dthetaL':dL_dthetaL} if fixed_covs_kerns is not None: # For now, we do not take the gradients, we can compute them, # but the maximum likelihood solution is to switch off the additional covariates.... dL_dcovs = beta * np.eye(K_fixed.shape[0]) - beta**2*tdot(_LBi_Lmi_psi1.T) grad_dict['dL_dcovs'] = -.5 * dL_dcovs #get sufficient things for posterior prediction #TODO: do we really want to do this in the loop? if 1: woodbury_vector = (beta*Cpsi1).dot(Y) else: import ipdb; ipdb.set_trace() psi1V = np.dot(Y.T*beta, psi1).T tmp, _ = dtrtrs(Lm, psi1V, lower=1, trans=0) tmp, _ = dpotrs(LB, tmp, lower=1) woodbury_vector, _ = dtrtrs(Lm, tmp, lower=1, trans=1) Bi, _ = dpotri(LB, lower=1) symmetrify(Bi) Bi = -dpotri(LB, lower=1)[0] diag.add(Bi, 1) woodbury_inv = backsub_both_sides(Lm, Bi) #construct a posterior object post = Posterior(woodbury_inv=woodbury_inv, woodbury_vector=woodbury_vector, K=Kmm, mean=None, cov=None, K_chol=Lm) return post, log_marginal, grad_dict
def inference(self, kern_r, kern_c, Xr, Xc, Zr, Zc, likelihood, Y, qU_mean, qU_var_r, qU_var_c, indexD, output_dim): """ The SVI-VarDTC inference """ N, D, Mr, Mc, Qr, Qc = Y.shape[0], output_dim, Zr.shape[0], Zc.shape[ 0], Zr.shape[1], Zc.shape[1] uncertain_inputs_r = isinstance(Xr, VariationalPosterior) uncertain_inputs_c = isinstance(Xc, VariationalPosterior) uncertain_outputs = isinstance(Y, VariationalPosterior) grad_dict = self._init_grad_dict(N, D, Mr, Mc) beta = 1. / likelihood.variance if len(beta) == 1: beta = np.zeros(D) + beta psi0_r, psi1_r, psi2_r = self.gatherPsiStat(kern_r, Xr, Zr, uncertain_inputs_r) psi0_c, psi1_c, psi2_c = self.gatherPsiStat(kern_c, Xc, Zc, uncertain_inputs_c) #====================================================================== # Compute Common Components #====================================================================== Kuu_r = kern_r.K(Zr).copy() diag.add(Kuu_r, self.const_jitter) Lr = jitchol(Kuu_r) Kuu_c = kern_c.K(Zc).copy() diag.add(Kuu_c, self.const_jitter) Lc = jitchol(Kuu_c) mu, Sr, Sc = qU_mean, qU_var_r, qU_var_c LSr = jitchol(Sr) LSc = jitchol(Sc) LcInvMLrInvT = dtrtrs(Lc, dtrtrs(Lr, mu.T)[0].T)[0] LcInvLSc = dtrtrs(Lc, LSc)[0] LrInvLSr = dtrtrs(Lr, LSr)[0] LcInvScLcInvT = tdot(LcInvLSc) LrInvSrLrInvT = tdot(LrInvLSr) tr_LrInvSrLrInvT = np.square(LrInvLSr).sum() tr_LcInvScLcInvT = np.square(LcInvLSc).sum() mid_res = { 'psi0_r': psi0_r, 'psi1_r': psi1_r, 'psi2_r': psi2_r, 'psi0_c': psi0_c, 'psi1_c': psi1_c, 'psi2_c': psi2_c, 'Lr': Lr, 'Lc': Lc, 'LcInvMLrInvT': LcInvMLrInvT, 'LcInvScLcInvT': LcInvScLcInvT, 'LrInvSrLrInvT': LrInvSrLrInvT, } #====================================================================== # Compute log-likelihood #====================================================================== logL = 0. for d in range(D): logL += self.inference_d(d, beta, Y, indexD, grad_dict, mid_res, uncertain_inputs_r, uncertain_inputs_c, Mr, Mc) logL += -Mc * (np.log(np.diag(Lr)).sum()-np.log(np.diag(LSr)).sum()) -Mr * (np.log(np.diag(Lc)).sum()-np.log(np.diag(LSc)).sum()) \ - np.square(LcInvMLrInvT).sum()/2. - tr_LrInvSrLrInvT * tr_LcInvScLcInvT/2. + Mr*Mc/2. #====================================================================== # Compute dL_dKuu #====================================================================== tmp = tdot( LcInvMLrInvT ) / 2. + tr_LrInvSrLrInvT / 2. * LcInvScLcInvT - Mr / 2. * np.eye(Mc) dL_dKuu_c = backsub_both_sides(Lc, tmp, 'left') dL_dKuu_c += dL_dKuu_c.T dL_dKuu_c *= 0.5 tmp = tdot( LcInvMLrInvT.T ) / 2. + tr_LcInvScLcInvT / 2. * LrInvSrLrInvT - Mc / 2. * np.eye(Mr) dL_dKuu_r = backsub_both_sides(Lr, tmp, 'left') dL_dKuu_r += dL_dKuu_r.T dL_dKuu_r *= 0.5 #====================================================================== # Compute dL_dqU #====================================================================== tmp = -LcInvMLrInvT dL_dqU_mean = dtrtrs(Lc, dtrtrs(Lr, tmp.T, trans=1)[0].T, trans=1)[0] LScInv = dtrtri(LSc) tmp = -tr_LrInvSrLrInvT / 2. * np.eye(Mc) dL_dqU_var_c = backsub_both_sides(Lc, tmp, 'left') + tdot(LScInv.T) * Mr / 2. LSrInv = dtrtri(LSr) tmp = -tr_LcInvScLcInvT / 2. * np.eye(Mr) dL_dqU_var_r = backsub_both_sides(Lr, tmp, 'left') + tdot(LSrInv.T) * Mc / 2. 
#====================================================================== # Compute the Posterior distribution of inducing points p(u|Y) #====================================================================== post = PosteriorMultioutput(LcInvMLrInvT=LcInvMLrInvT, LcInvScLcInvT=LcInvScLcInvT, LrInvSrLrInvT=LrInvSrLrInvT, Lr=Lr, Lc=Lc, kern_r=kern_r, Xr=Xr, Zr=Zr) #====================================================================== # Compute dL_dpsi #====================================================================== grad_dict['dL_dqU_mean'] += dL_dqU_mean grad_dict['dL_dqU_var_c'] += dL_dqU_var_c grad_dict['dL_dqU_var_r'] += dL_dqU_var_r grad_dict['dL_dKuu_c'] += dL_dKuu_c grad_dict['dL_dKuu_r'] += dL_dKuu_r if not uncertain_inputs_c: grad_dict['dL_dKdiag_c'] = grad_dict['dL_dpsi0_c'] grad_dict['dL_dKfu_c'] = grad_dict['dL_dpsi1_c'] if not uncertain_inputs_r: grad_dict['dL_dKdiag_r'] = grad_dict['dL_dpsi0_r'] grad_dict['dL_dKfu_r'] = grad_dict['dL_dpsi1_r'] return post, logL, grad_dict
def _inference_vardtc(self): if self.svi: from GPy.util.linalg import tdot self.qU_var = tdot(self.qU_W)+np.eye(self.Z.shape[0])*self.qU_a self.posterior, self._log_marginal_likelihood, self.grad_dict = self.inference_method.inference(self.kern, self.X, self.Z, self.likelihood, self.Y, self.qU_mean , self.qU_var, Kuu_sigma=self.Kuu_sigma) if self.mpi_comm is None or (self.mpi_comm is not None and self.mpi_comm.rank==self.mpi_root): KL, dKL_dqU_mean, dKL_dqU_var, dKL_dKuu = self.inference_method.comp_KL_qU(self.qU_mean ,self.qU_var) self._log_marginal_likelihood += -KL*self.qU_ratio self.grad_dict['dL_dqU_mean'] += -dKL_dqU_mean*self.qU_ratio self.grad_dict['dL_dqU_var'] += -dKL_dqU_var*self.qU_ratio self.grad_dict['dL_dKmm'] += -dKL_dKuu*self.qU_ratio else: self.posterior, self._log_marginal_likelihood, self.grad_dict = self.inference_method.inference(self.kern, self.X, self.Z, self.likelihood, self.Y, self.Y_metadata, Kuu_sigma=self.Kuu_sigma) self.likelihood.update_gradients(self.grad_dict['dL_dthetaL']) dL_dKmm = self.grad_dict['dL_dKmm'] if self.mpi_comm is None or (self.mpi_comm is not None and self.mpi_comm.rank==self.mpi_root): self.Kuu_sigma.gradient = np.diag(dL_dKmm) if isinstance(self.X, VariationalPosterior): #gradients wrt kernel if self.psicov: self.kern.update_gradients_expectations_psicov(variational_posterior=self.X, Z=self.Z, dL_dpsi0=self.grad_dict['dL_dpsi0'], dL_dpsi1=self.grad_dict['dL_dpsi1'], dL_dpsicov=self.grad_dict['dL_dpsicov']) else: self.kern.update_gradients_expectations(variational_posterior=self.X, Z=self.Z, dL_dpsi0=self.grad_dict['dL_dpsi0'], dL_dpsi1=self.grad_dict['dL_dpsi1'], dL_dpsi2=self.grad_dict['dL_dpsi2']) kerngrad = self.kern.gradient.copy() if self.mpi_comm is None: self.kern.update_gradients_full(dL_dKmm, self.Z, None) kerngrad += self.kern.gradient.copy() self.kern.gradient = kerngrad else: kerngrad = reduceArrays([kerngrad], self.mpi_comm, self.mpi_root)[0] if self.mpi_comm.rank==self.mpi_root: self.kern.update_gradients_full(dL_dKmm, self.Z, None) kerngrad += self.kern.gradient.copy() self.kern.gradient = kerngrad #gradients wrt Z if self.psicov: self.Z.gradient = self.kern.gradients_Z_expectations_psicov( self.grad_dict['dL_dpsi0'], self.grad_dict['dL_dpsi1'], self.grad_dict['dL_dpsicov'], Z=self.Z, variational_posterior=self.X) else: self.Z.gradient = self.kern.gradients_Z_expectations( self.grad_dict['dL_dpsi0'], self.grad_dict['dL_dpsi1'], self.grad_dict['dL_dpsi2'], Z=self.Z, variational_posterior=self.X) if self.mpi_comm is None: self.Z.gradient += self.kern.gradients_X(dL_dKmm, self.Z) else: self.Z.gradient = reduceArrays([self.Z.gradient], self.mpi_comm, self.mpi_root)[0] if self.mpi_comm.rank == self.mpi_root: self.Z.gradient += self.kern.gradients_X(dL_dKmm, self.Z) else: #gradients wrt kernel self.kern.update_gradients_diag(self.grad_dict['dL_dKdiag'], self.X) kerngrad = self.kern.gradient.copy() self.kern.update_gradients_full(self.grad_dict['dL_dKnm'], self.X, self.Z) kerngrad += self.kern.gradient if self.mpi_comm is None: self.kern.update_gradients_full(dL_dKmm, self.Z, None) self.kern.gradient += kerngrad else: kerngrad = reduceArrays([kerngrad], self.mpi_comm, self.mpi_root)[0] if self.mpi_comm.rank==self.mpi_root: self.kern.update_gradients_full(dL_dKmm, self.Z, None) kerngrad += self.kern.gradient.copy() self.kern.gradient = kerngrad #gradients wrt Z self.Z.gradient = self.kern.gradients_X(self.grad_dict['dL_dKnm'].T, self.Z, self.X) if self.mpi_comm is None: self.Z.gradient += self.kern.gradients_X(dL_dKmm, self.Z) else: 
self.Z.gradient = reduceArrays([self.Z.gradient], self.mpi_comm, self.mpi_root)[0] if self.mpi_comm.rank == self.mpi_root: self.Z.gradient += self.kern.gradients_X(dL_dKmm, self.Z) if self.svi: self.qU_mean.gradient = self.grad_dict['dL_dqU_mean'] self.qU_W.gradient = (self.grad_dict['dL_dqU_var']+self.grad_dict['dL_dqU_var'].T).dot(self.qU_W) self.qU_a.gradient = np.diag(self.grad_dict['dL_dqU_var']).sum()
def inference(self, kern, X, Z, likelihood, Y, Y_metadata=None, Lm=None, dL_dKmm=None): """ The first phase of inference: Compute: log-likelihood, dL_dKmm Cached intermediate results: Kmm, KmmInv, """ num_data, output_dim = Y.shape input_dim = Z.shape[0] uncertain_inputs = isinstance(X, VariationalPosterior) uncertain_outputs = isinstance(Y, VariationalPosterior) beta = 1. / np.fmax(likelihood.variance, 1e-6) psi0, psi2, YRY, psi1, psi1Y, Shalf, psi1S = self.gatherPsiStat( kern, X, Z, Y, beta, uncertain_inputs) #====================================================================== # Compute Common Components #====================================================================== Kmm = kern.K(Z).copy() diag.add(Kmm, self.const_jitter) Lm = jitchol(Kmm) #LmInv = dtrtri(Lm) if uncertain_inputs: LmInvPsi2LmInvT = backsub_both_sides(Lm, psi2, 'right') else: LmInvPsi2LmInvT = tdot(dtrtrs( Lm, psi1.T)[0]) / beta #tdot(psi1.dot(LmInv.T).T) /beta Lambda = np.eye(Kmm.shape[0]) + LmInvPsi2LmInvT LL = jitchol(Lambda) LmLL = Lm.dot(LL) # LLInv = dtrtri(LL) # LmLLInv = LLInv.dot(LmInv) logdet_L = 2. * np.sum(np.log(np.diag(LL))) b = dtrtrs(LmLL, psi1Y.T)[0].T #psi1Y.dot(LmLLInv.T) bbt = np.square(b).sum() v = dtrtrs(LmLL, b.T, trans=1)[0].T #b.dot(LmLLInv) LLinvPsi1TYYTPsi1LLinvT = tdot(b.T) if psi1S is not None: psi1SLLinv = dtrtrs(LmLL, psi1S.T)[0].T #psi1S.dot(LmLLInv.T) bbt += np.square(psi1SLLinv).sum() LLinvPsi1TYYTPsi1LLinvT += tdot(psi1SLLinv.T) psi1SP = dtrtrs(LmLL, psi1SLLinv.T, trans=1)[0].T #psi1SLLinv.dot(LmLLInv) tmp = -backsub_both_sides( LL, LLinvPsi1TYYTPsi1LLinvT + output_dim * np.eye(input_dim)) dL_dpsi2R = backsub_both_sides( Lm, tmp + output_dim * np.eye(input_dim)) / 2 #tmp = -LLInv.T.dot(LLinvPsi1TYYTPsi1LLinvT+output_dim*np.eye(input_dim)).dot(LLInv) #dL_dpsi2R = LmInv.T.dot(tmp+output_dim*np.eye(input_dim)).dot(LmInv)/2. #====================================================================== # Compute log-likelihood #====================================================================== logL_R = -num_data * np.log(beta) logL = -( output_dim * (num_data * log_2_pi + logL_R + psi0 - np.trace(LmInvPsi2LmInvT)) + YRY - bbt) / 2. - output_dim * logdet_L / 2. #====================================================================== # Compute dL_dKmm #====================================================================== dL_dKmm = dL_dpsi2R - output_dim * backsub_both_sides( Lm, LmInvPsi2LmInvT) / 2 #LmInv.T.dot(LmInvPsi2LmInvT).dot(LmInv)/2. #====================================================================== # Compute the Posterior distribution of inducing points p(u|Y) #====================================================================== wd_inv = backsub_both_sides( Lm, np.eye(input_dim) - backsub_both_sides(LL, np.identity(input_dim), transpose='left'), transpose='left') post = Posterior(woodbury_inv=wd_inv, woodbury_vector=v.T, K=Kmm, mean=None, cov=None, K_chol=Lm) #====================================================================== # Compute dL_dthetaL for uncertian input and non-heter noise #====================================================================== dL_dthetaL = (YRY * beta + beta * output_dim * psi0 - num_data * output_dim * beta) / 2. - beta * (dL_dpsi2R * psi2).sum( ) - beta * np.trace(LLinvPsi1TYYTPsi1LLinvT) #====================================================================== # Compute dL_dpsi #====================================================================== dL_dpsi0 = -output_dim * (beta * np.ones((num_data, ))) / 2. 
if uncertain_outputs: m, s = Y.mean, Y.variance dL_dpsi1 = beta * (np.dot(m, v) + Shalf[:, None] * psi1SP) else: dL_dpsi1 = beta * np.dot(Y, v) if uncertain_inputs: dL_dpsi2 = beta * dL_dpsi2R else: dL_dpsi1 += np.dot(psi1, dL_dpsi2R) * 2. dL_dpsi2 = None if uncertain_inputs: grad_dict = { 'dL_dKmm': dL_dKmm, 'dL_dpsi0': dL_dpsi0, 'dL_dpsi1': dL_dpsi1, 'dL_dpsi2': dL_dpsi2, 'dL_dthetaL': dL_dthetaL } else: grad_dict = { 'dL_dKmm': dL_dKmm, 'dL_dKdiag': dL_dpsi0, 'dL_dKnm': dL_dpsi1, 'dL_dthetaL': dL_dthetaL } if uncertain_outputs: m, s = Y.mean, Y.variance psi1LmiLLi = dtrtrs(LmLL, psi1.T)[0].T #psi1.dot(LmLLInv.T) LLiLmipsi1Y = b.T grad_dict['dL_dYmean'] = -m * beta + psi1LmiLLi.dot(LLiLmipsi1Y) grad_dict['dL_dYvar'] = beta / -2. + np.square(psi1LmiLLi).sum( axis=1) / 2 return post, logL, grad_dict
def inference(self, kern, X, Z, likelihood, Y, qU_mean ,qU_var, Kuu_sigma=None): """ The SVI-VarDTC inference """ N, D, M, Q = Y.shape[0], Y.shape[1], Z.shape[0], Z.shape[1] uncertain_inputs = isinstance(X, VariationalPosterior) uncertain_outputs = isinstance(Y, VariationalPosterior) beta = 1./likelihood.variance psi0, psi2, YRY, psi1, psi1Y = self.gatherPsiStat(kern, X, Z, Y, beta, uncertain_inputs) #====================================================================== # Compute Common Components #====================================================================== Kuu = kern.K(Z).copy() if Kuu_sigma is not None: diag.add(Kuu, Kuu_sigma) else: diag.add(Kuu, self.const_jitter) Lm = jitchol(Kuu) mu, S = qU_mean, qU_var Ls = jitchol(S) LinvLs = dtrtrs(Lm, Ls)[0] Linvmu = dtrtrs(Lm, mu)[0] psi1YLinvT = dtrtrs(Lm,psi1Y.T)[0].T self.mid = { 'qU_L': Ls, 'LinvLu': LinvLs, 'L':Lm, 'Linvmu': Linvmu} if uncertain_inputs: LmInvPsi2LmInvT = backsub_both_sides(Lm, psi2, 'right') else: LmInvPsi2LmInvT = tdot(dtrtrs(Lm, psi1.T)[0])/beta LmInvSmuLmInvT = tdot(LinvLs)*D+tdot(Linvmu) # logdet_L = np.sum(np.log(np.diag(Lm))) # logdet_S = np.sum(np.log(np.diag(Ls))) #====================================================================== # Compute log-likelihood #====================================================================== logL_R = -N*np.log(beta) logL = -N*D*log_2_pi/2. -D*logL_R/2. - D*psi0/2. - YRY/2. \ -(LmInvSmuLmInvT*LmInvPsi2LmInvT).sum()/2. + np.trace(LmInvPsi2LmInvT)*D/2.+(Linvmu*psi1YLinvT.T).sum() #====================================================================== # Compute dL_dKmm #====================================================================== tmp1 = backsub_both_sides(Lm,LmInvSmuLmInvT.dot(LmInvPsi2LmInvT), 'left') tmp2 = Linvmu.dot(psi1YLinvT) tmp3 = backsub_both_sides(Lm, - D*LmInvPsi2LmInvT -tmp2-tmp2.T, 'left')/2. dL_dKmm = (tmp1+tmp1.T)/2. + tmp3 #====================================================================== # Compute dL_dthetaL for uncertian input and non-heter noise #====================================================================== dL_dthetaL = -D*N*beta/2. -(- D*psi0/2. - YRY/2.-(LmInvSmuLmInvT*LmInvPsi2LmInvT).sum()/2. + np.trace(LmInvPsi2LmInvT)*D/2.+(Linvmu*psi1YLinvT.T).sum())*beta #====================================================================== # Compute dL_dqU #====================================================================== tmp1 = backsub_both_sides(Lm, - LmInvPsi2LmInvT, 'left') dL_dqU_mean = tmp1.dot(mu) + dtrtrs(Lm, psi1YLinvT.T,trans=1)[0] dL_dqU_var = D/2.*tmp1 #====================================================================== # Compute the Posterior distribution of inducing points p(u|Y) #====================================================================== KuuInvmu = dtrtrs(Lm, Linvmu, trans=1)[0] tmp = backsub_both_sides(Lm, np.eye(M) - tdot(LinvLs), 'left') post = Posterior(woodbury_inv=tmp, woodbury_vector=KuuInvmu, K=Kuu, mean=mu, cov=S, K_chol=Lm) #====================================================================== # Compute dL_dpsi #====================================================================== dL_dpsi0 = -D * (beta * np.ones((N,)))/2. if uncertain_outputs: dL_dpsi1 = Y.mean.dot(dtrtrs(Lm,Linvmu,trans=1)[0].T)*beta else: dL_dpsi1 = Y.dot(dtrtrs(Lm,Linvmu,trans=1)[0].T)*beta dL_dpsi2 = beta*backsub_both_sides(Lm, D*np.eye(M)-LmInvSmuLmInvT, 'left')/2. 
if not uncertain_inputs: dL_dpsi1 += psi1.dot(dL_dpsi2+dL_dpsi2.T)/beta dL_dpsi2 = None if uncertain_inputs: grad_dict = {'dL_dKmm': dL_dKmm, 'dL_dpsi0':dL_dpsi0, 'dL_dpsi1':dL_dpsi1, 'dL_dpsi2':dL_dpsi2, 'dL_dthetaL':dL_dthetaL, 'dL_dqU_mean':dL_dqU_mean, 'dL_dqU_var':dL_dqU_var} else: grad_dict = {'dL_dKmm': dL_dKmm, 'dL_dKdiag':dL_dpsi0, 'dL_dKnm':dL_dpsi1, 'dL_dthetaL':dL_dthetaL, 'dL_dqU_mean':dL_dqU_mean, 'dL_dqU_var':dL_dqU_var} if uncertain_outputs: m,s = Y.mean, Y.variance grad_dict['dL_dYmean'] = -m*beta+ dtrtrs(Lm,psi1.T)[0].T.dot(dtrtrs(Lm,mu)[0]) grad_dict['dL_dYvar'] = beta/-2. return post, logL, grad_dict
def inference_root(self, kern, X, Z, likelihood, Y, Kuu_sigma=None, Y_metadata=None, Lm=None, dL_dKmm=None): """ The first phase of inference: Compute: log-likelihood, dL_dKmm Cached intermediate results: Kmm, KmmInv, """ num_data, output_dim = Y.shape input_dim = Z.shape[0] num_data_total = allReduceArrays([np.int32(num_data)], self.mpi_comm)[0] uncertain_inputs = isinstance(X, VariationalPosterior) uncertain_outputs = isinstance(Y, VariationalPosterior) beta = 1./np.fmax(likelihood.variance, 1e-6) psi0, psi2, YRY, psi1, psi1Y, Shalf, psi1S = self.gatherPsiStat(kern, X, Z, Y, beta, uncertain_inputs) #====================================================================== # Compute Common Components #====================================================================== try: Kmm = kern.K(Z).copy() if Kuu_sigma is not None: diag.add(Kmm, Kuu_sigma) else: diag.add(Kmm, self.const_jitter) Lm = jitchol(Kmm) LmInv = dtrtri(Lm) LmInvPsi2LmInvT = LmInv.dot(psi2.dot(LmInv.T)) Lambda = np.eye(Kmm.shape[0])+LmInvPsi2LmInvT LL = jitchol(Lambda) LLInv = dtrtri(LL) flag = np.zeros((1,),dtype=np.int32) self.mpi_comm.Bcast(flag,root=self.root) except LinAlgError as e: flag = np.ones((1,),dtype=np.int32) self.mpi_comm.Bcast(flag,root=self.root) raise e broadcastArrays([LmInv, LLInv],self.mpi_comm, self.root) LmLLInv = LLInv.dot(LmInv) logdet_L = 2.*np.sum(np.log(np.diag(LL))) b = psi1Y.dot(LmLLInv.T) bbt = np.square(b).sum() v = b.dot(LmLLInv) LLinvPsi1TYYTPsi1LLinvT = tdot(b.T) if psi1S is not None: psi1SLLinv = psi1S.dot(LmLLInv.T) bbt_sum = np.square(psi1SLLinv).sum() LLinvPsi1TYYTPsi1LLinvT_sum = tdot(psi1SLLinv.T) bbt_sum, LLinvPsi1TYYTPsi1LLinvT_sum = reduceArrays([bbt_sum, LLinvPsi1TYYTPsi1LLinvT_sum], self.mpi_comm, self.root) bbt += bbt_sum LLinvPsi1TYYTPsi1LLinvT += LLinvPsi1TYYTPsi1LLinvT_sum psi1SP = psi1SLLinv.dot(LmLLInv) tmp = -LLInv.T.dot(LLinvPsi1TYYTPsi1LLinvT+output_dim*np.eye(input_dim)).dot(LLInv) dL_dpsi2R = LmInv.T.dot(tmp+output_dim*np.eye(input_dim)).dot(LmInv)/2. broadcastArrays([dL_dpsi2R], self.mpi_comm, self.root) #====================================================================== # Compute log-likelihood #====================================================================== logL_R = -num_data_total*np.log(beta) logL = -(output_dim*(num_data_total*log_2_pi+logL_R+psi0-np.trace(LmInvPsi2LmInvT))+YRY- bbt)/2.-output_dim*logdet_L/2. #====================================================================== # Compute dL_dKmm #====================================================================== dL_dKmm = dL_dpsi2R - output_dim* LmInv.T.dot(LmInvPsi2LmInvT).dot(LmInv)/2. #====================================================================== # Compute the Posterior distribution of inducing points p(u|Y) #====================================================================== wd_inv = backsub_both_sides(Lm, np.eye(input_dim)- backsub_both_sides(LL, np.identity(input_dim), transpose='left'), transpose='left') post = Posterior(woodbury_inv=wd_inv, woodbury_vector=v.T, K=Kmm, mean=None, cov=None, K_chol=Lm) #====================================================================== # Compute dL_dthetaL for uncertian input and non-heter noise #====================================================================== dL_dthetaL = (YRY*beta + beta*output_dim*psi0 - num_data_total*output_dim*beta)/2. 
- beta*(dL_dpsi2R*psi2).sum() - beta*np.trace(LLinvPsi1TYYTPsi1LLinvT) #====================================================================== # Compute dL_dpsi #====================================================================== dL_dpsi0 = -output_dim * (beta * np.ones((num_data,)))/2. if uncertain_outputs: m,s = Y.mean, Y.variance dL_dpsi1 = beta*(np.dot(m,v)+Shalf[:,None]*psi1SP) else: dL_dpsi1 = beta*np.dot(Y,v) if uncertain_inputs: dL_dpsi2 = beta* dL_dpsi2R else: dL_dpsi1 += np.dot(psi1,dL_dpsi2R)*2. dL_dpsi2 = None if uncertain_inputs: grad_dict = {'dL_dKmm': dL_dKmm, 'dL_dpsi0':dL_dpsi0, 'dL_dpsi1':dL_dpsi1, 'dL_dpsi2':dL_dpsi2, 'dL_dthetaL':dL_dthetaL} else: grad_dict = {'dL_dKmm': dL_dKmm, 'dL_dKdiag':dL_dpsi0, 'dL_dKnm':dL_dpsi1, 'dL_dthetaL':dL_dthetaL} if uncertain_outputs: m,s = Y.mean, Y.variance psi1LmiLLi = psi1.dot(LmLLInv.T) LLiLmipsi1Y = b.T grad_dict['dL_dYmean'] = -m*beta+ psi1LmiLLi.dot(LLiLmipsi1Y) grad_dict['dL_dYvar'] = beta/-2.+ np.square(psi1LmiLLi).sum(axis=1)/2 return post, logL, grad_dict
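# The posterior's Woodbury inverse above is built with backsub_both_sides, which
# sandwiches a symmetric matrix between inverse Cholesky factors. A quick check of the
# semantics assumed here ('left' -> L^-T X L^-1, 'right' -> L^-1 X L^-T), a sketch
# rather than a spec:
import numpy as np
from GPy.util.linalg import jitchol, tdot, backsub_both_sides

rng = np.random.RandomState(1)
K = tdot(rng.randn(5, 5)) + 5 * np.eye(5)
X = tdot(rng.randn(5, 5))                        # symmetric test matrix
L = jitchol(K)
Li = np.linalg.inv(L)
assert np.allclose(backsub_both_sides(L, X, transpose='left'), Li.T.dot(X).dot(Li))
assert np.allclose(backsub_both_sides(L, X, transpose='right'), Li.dot(X).dot(Li.T))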
def inference(self, kern_r, kern_c, Xr, Xc, Zr, Zc, likelihood, Y, qU_mean, qU_var_r, qU_var_c): """ The SVI-VarDTC inference """ N, D, Mr, Mc, Qr, Qc = Y.shape[0], Y.shape[1], Zr.shape[0], Zc.shape[ 0], Zr.shape[1], Zc.shape[1] uncertain_inputs_r = isinstance(Xr, VariationalPosterior) uncertain_inputs_c = isinstance(Xc, VariationalPosterior) uncertain_outputs = isinstance(Y, VariationalPosterior) beta = 1. / likelihood.variance psi0_r, psi1_r, psi2_r = self.gatherPsiStat(kern_r, Xr, Zr, uncertain_inputs_r) psi0_c, psi1_c, psi2_c = self.gatherPsiStat(kern_c, Xc, Zc, uncertain_inputs_c) #====================================================================== # Compute Common Components #====================================================================== Kuu_r = kern_r.K(Zr).copy() diag.add(Kuu_r, self.const_jitter) Lr = jitchol(Kuu_r) Kuu_c = kern_c.K(Zc).copy() diag.add(Kuu_c, self.const_jitter) Lc = jitchol(Kuu_c) mu, Sr, Sc = qU_mean, qU_var_r, qU_var_c LSr = jitchol(Sr) LSc = jitchol(Sc) LcInvMLrInvT = dtrtrs(Lc, dtrtrs(Lr, mu.T)[0].T)[0] LcInvPsi2_cLcInvT = backsub_both_sides(Lc, psi2_c, 'right') LrInvPsi2_rLrInvT = backsub_both_sides(Lr, psi2_r, 'right') LcInvLSc = dtrtrs(Lc, LSc)[0] LrInvLSr = dtrtrs(Lr, LSr)[0] LcInvScLcInvT = tdot(LcInvLSc) LrInvSrLrInvT = tdot(LrInvLSr) LcInvPsi1_cT = dtrtrs(Lc, psi1_c.T)[0] LrInvPsi1_rT = dtrtrs(Lr, psi1_r.T)[0] tr_LrInvPsi2_rLrInvT_LrInvSrLrInvT = (LrInvPsi2_rLrInvT * LrInvSrLrInvT).sum() tr_LcInvPsi2_cLcInvT_LcInvScLcInvT = (LcInvPsi2_cLcInvT * LcInvScLcInvT).sum() tr_LrInvSrLrInvT = np.square(LrInvLSr).sum() tr_LcInvScLcInvT = np.square(LcInvLSc).sum() tr_LrInvPsi2_rLrInvT = np.trace(LrInvPsi2_rLrInvT) tr_LcInvPsi2_cLcInvT = np.trace(LcInvPsi2_cLcInvT) #====================================================================== # Compute log-likelihood #====================================================================== logL_A = - np.square(Y).sum() \ - (LcInvMLrInvT.T.dot(LcInvPsi2_cLcInvT).dot(LcInvMLrInvT)*LrInvPsi2_rLrInvT).sum() \ - tr_LrInvPsi2_rLrInvT_LrInvSrLrInvT* tr_LcInvPsi2_cLcInvT_LcInvScLcInvT \ + 2 * (Y * LcInvPsi1_cT.T.dot(LcInvMLrInvT).dot(LrInvPsi1_rT)).sum() - psi0_c * psi0_r \ + tr_LrInvPsi2_rLrInvT * tr_LcInvPsi2_cLcInvT logL = -N*D/2.*(np.log(2.*np.pi)-np.log(beta)) + beta/2.* logL_A \ -Mc * (np.log(np.diag(Lr)).sum()-np.log(np.diag(LSr)).sum()) -Mr * (np.log(np.diag(Lc)).sum()-np.log(np.diag(LSc)).sum()) \ - np.square(LcInvMLrInvT).sum()/2. - tr_LrInvSrLrInvT * tr_LcInvScLcInvT/2. + Mr*Mc/2. #====================================================================== # Compute dL_dKuu #====================================================================== tmp = beta* LcInvPsi2_cLcInvT.dot(LcInvMLrInvT).dot(LrInvPsi2_rLrInvT).dot(LcInvMLrInvT.T) \ + beta* tr_LrInvPsi2_rLrInvT_LrInvSrLrInvT * LcInvPsi2_cLcInvT.dot(LcInvScLcInvT) \ - beta* LcInvMLrInvT.dot(LrInvPsi1_rT).dot(Y.T).dot(LcInvPsi1_cT.T) \ - beta/2. * tr_LrInvPsi2_rLrInvT* LcInvPsi2_cLcInvT - Mr/2.*np.eye(Mc) \ + tdot(LcInvMLrInvT)/2. + tr_LrInvSrLrInvT/2. * LcInvScLcInvT dL_dKuu_c = backsub_both_sides(Lc, tmp, 'left') dL_dKuu_c += dL_dKuu_c.T dL_dKuu_c *= 0.5 tmp = beta* LcInvMLrInvT.T.dot(LcInvPsi2_cLcInvT).dot(LcInvMLrInvT).dot(LrInvPsi2_rLrInvT) \ + beta* tr_LcInvPsi2_cLcInvT_LcInvScLcInvT * LrInvPsi2_rLrInvT.dot(LrInvSrLrInvT) \ - beta* LrInvPsi1_rT.dot(Y.T).dot(LcInvPsi1_cT.T).dot(LcInvMLrInvT) \ - beta/2. * tr_LcInvPsi2_cLcInvT * LrInvPsi2_rLrInvT - Mc/2.*np.eye(Mr) \ + tdot(LcInvMLrInvT.T)/2. + tr_LcInvScLcInvT/2. 
* LrInvSrLrInvT dL_dKuu_r = backsub_both_sides(Lr, tmp, 'left') dL_dKuu_r += dL_dKuu_r.T dL_dKuu_r *= 0.5 #====================================================================== # Compute dL_dthetaL #====================================================================== dL_dthetaL = -D * N * beta / 2. - logL_A * beta * beta / 2. #====================================================================== # Compute dL_dqU #====================================================================== tmp = -beta * LcInvPsi2_cLcInvT.dot(LcInvMLrInvT).dot(LrInvPsi2_rLrInvT)\ + beta* LcInvPsi1_cT.dot(Y).dot(LrInvPsi1_rT.T) - LcInvMLrInvT dL_dqU_mean = dtrtrs(Lc, dtrtrs(Lr, tmp.T, trans=1)[0].T, trans=1)[0] LScInv = dtrtri(LSc) tmp = -beta / 2. * tr_LrInvPsi2_rLrInvT_LrInvSrLrInvT * LcInvPsi2_cLcInvT - tr_LrInvSrLrInvT / 2. * np.eye( Mc) dL_dqU_var_c = backsub_both_sides(Lc, tmp, 'left') + tdot(LScInv.T) * Mr / 2. LSrInv = dtrtri(LSr) tmp = -beta / 2. * tr_LcInvPsi2_cLcInvT_LcInvScLcInvT * LrInvPsi2_rLrInvT - tr_LcInvScLcInvT / 2. * np.eye( Mr) dL_dqU_var_r = backsub_both_sides(Lr, tmp, 'left') + tdot(LSrInv.T) * Mc / 2. #====================================================================== # Compute the Posterior distribution of inducing points p(u|Y) #====================================================================== post = PosteriorMultioutput(LcInvMLrInvT=LcInvMLrInvT, LcInvScLcInvT=LcInvScLcInvT, LrInvSrLrInvT=LrInvSrLrInvT, Lr=Lr, Lc=Lc, kern_r=kern_r, Xr=Xr, Zr=Zr) #====================================================================== # Compute dL_dpsi #====================================================================== dL_dpsi0_r = -psi0_c * beta / 2. * np.ones((D, )) dL_dpsi0_c = -psi0_r * beta / 2. * np.ones((N, )) dL_dpsi1_c = beta * dtrtrs( Lc, (Y.dot(LrInvPsi1_rT.T).dot(LcInvMLrInvT.T)).T, trans=1)[0].T dL_dpsi1_r = beta * dtrtrs( Lr, (Y.T.dot(LcInvPsi1_cT.T).dot(LcInvMLrInvT)).T, trans=1)[0].T tmp = beta / 2. * ( -LcInvMLrInvT.dot(LrInvPsi2_rLrInvT).dot(LcInvMLrInvT.T) - tr_LrInvPsi2_rLrInvT_LrInvSrLrInvT * LcInvScLcInvT + tr_LrInvPsi2_rLrInvT * np.eye(Mc)) dL_dpsi2_c = backsub_both_sides(Lc, tmp, 'left') tmp = beta / 2. * ( -LcInvMLrInvT.T.dot(LcInvPsi2_cLcInvT).dot(LcInvMLrInvT) - tr_LcInvPsi2_cLcInvT_LcInvScLcInvT * LrInvSrLrInvT + tr_LcInvPsi2_cLcInvT * np.eye(Mr)) dL_dpsi2_r = backsub_both_sides(Lr, tmp, 'left') if not uncertain_inputs_r: dL_dpsi1_r += psi1_r.dot(dL_dpsi2_r + dL_dpsi2_r.T) if not uncertain_inputs_c: dL_dpsi1_c += psi1_c.dot(dL_dpsi2_c + dL_dpsi2_c.T) grad_dict = { 'dL_dthetaL': dL_dthetaL, 'dL_dqU_mean': dL_dqU_mean, 'dL_dqU_var_c': dL_dqU_var_c, 'dL_dqU_var_r': dL_dqU_var_r, 'dL_dKuu_c': dL_dKuu_c, 'dL_dKuu_r': dL_dKuu_r, } if uncertain_inputs_c: grad_dict['dL_dpsi0_c'] = dL_dpsi0_c grad_dict['dL_dpsi1_c'] = dL_dpsi1_c grad_dict['dL_dpsi2_c'] = dL_dpsi2_c else: grad_dict['dL_dKdiag_c'] = dL_dpsi0_c grad_dict['dL_dKfu_c'] = dL_dpsi1_c if uncertain_inputs_r: grad_dict['dL_dpsi0_r'] = dL_dpsi0_r grad_dict['dL_dpsi1_r'] = dL_dpsi1_r grad_dict['dL_dpsi2_r'] = dL_dpsi2_r else: grad_dict['dL_dKdiag_r'] = dL_dpsi0_r grad_dict['dL_dKfu_r'] = dL_dpsi1_r return post, logL, grad_dict
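# The row/column (Kronecker) structure above avoids ever forming the full ND x ND
# matrices: products of traces such as tr_LrInvPsi2_rLrInvT_LrInvSrLrInvT *
# tr_LcInvPsi2_cLcInvT_LcInvScLcInvT rely on tr((A kron B)(C kron D)) = tr(AC) * tr(BD).
# A small numerical confirmation of that identity:
import numpy as np

rng = np.random.RandomState(2)
A, C = rng.randn(3, 3), rng.randn(3, 3)
B, D = rng.randn(4, 4), rng.randn(4, 4)
lhs = np.trace(np.kron(A, B).dot(np.kron(C, D)))
rhs = np.trace(A.dot(C)) * np.trace(B.dot(D))
assert np.allclose(lhs, rhs)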
def incremental_inference(self, kern, X, likelihood, Y, mean_function=None, Y_metadata=None, K=None, variance=None, Z_tilde=None): # do incremental update if mean_function is None: m = 0 else: m = mean_function.f(X) if variance is None: variance = likelihood.gaussian_variance(Y_metadata) YYT_factor = Y - m # K_tmp = kern.K(X, X[-1:]) K_inc = kern._K[:-1, -1] K_inc2 = kern._K[-1:, -1] # self._K = np.block([[self._K, K_inc], [K_inc.T, K_inc2]]) # Ky = K.copy() jitter = variance[ -1] + 1e-8 # variance can be given for each point individually, in which case we just take the last point # diag.add(Ky, jitter) # LW_old = self._old_posterior.woodbury_chol Wi, LW, LWi, W_logdet = pdinv_inc(self._old_LW, K_inc, K_inc2 + jitter, self._old_Wi) alpha, _ = dpotrs(LW, YYT_factor, lower=1) log_marginal = 0.5 * (-Y.size * log_2_pi - Y.shape[1] * W_logdet - np.sum(alpha * YYT_factor)) if Z_tilde is not None: # This is a correction term for the log marginal likelihood # In EP this is log Z_tilde, which is the difference between the # Gaussian marginal and Z_EP log_marginal += Z_tilde dL_dK = 0.5 * (tdot(alpha) - Y.shape[1] * Wi) dL_dthetaL = likelihood.exact_inference_gradients( np.diag(dL_dK), Y_metadata) self._old_LW = LW self._old_Wi = Wi posterior = Posterior(woodbury_chol=LW, woodbury_vector=alpha, K=K) # TODO add logdet to posterior ? return posterior, log_marginal, { 'dL_dK': dL_dK, 'dL_dthetaL': dL_dthetaL, 'dL_dm': alpha }
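# pdinv_inc is not part of stock GPy; judging from its call signature it extends the
# cached Cholesky factor (and inverse) when one new point is appended, instead of
# refactorising the full matrix. A sketch of the underlying block-Cholesky update, with
# a hypothetical helper name chol_append:
import numpy as np
from scipy.linalg import solve_triangular

def chol_append(L_old, k_new, k_nn):
    """Cholesky factor of [[K_old, k_new], [k_new^T, k_nn]], given L_old L_old^T = K_old."""
    c = solve_triangular(L_old, k_new, lower=True)   # L_old c = k_new
    d = np.sqrt(k_nn - c.dot(c))                     # Schur complement of the new point
    L_new = np.zeros((L_old.shape[0] + 1,) * 2)
    L_new[:-1, :-1] = L_old
    L_new[-1, :-1] = c
    L_new[-1, -1] = d
    return L_new

rng = np.random.RandomState(3)
K = rng.randn(6, 6); K = K.dot(K.T) + 6 * np.eye(6)
assert np.allclose(np.linalg.cholesky(K),
                   chol_append(np.linalg.cholesky(K[:-1, :-1]), K[:-1, -1], K[-1, -1]))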
def get_YYTfactor(self, Y):
    N, D = Y.shape
    if N >= D:
        return Y.view(np.ndarray)
    # N < D: an N x N Cholesky factor of Y Y^T carries the same information as Y
    return jitchol(tdot(Y))
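# get_YYTfactor works because the collapsed bound depends on Y only through Y Y^T: with
# fewer points than outputs (N < D), an N x N Cholesky factor of Y Y^T is a cheaper
# stand-in for Y. A quick illustration of the substitution:
import numpy as np
from GPy.util.linalg import jitchol, tdot

rng = np.random.RandomState(4)
Y = rng.randn(10, 50)                 # N = 10 points, D = 50 outputs
V = jitchol(tdot(Y))                  # 10 x 10 factor with V V^T = Y Y^T
assert np.allclose(tdot(V), tdot(Y))  # same Gram matrix, much smaller array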
def inference(self, kern, X, Z, likelihood, Y, qU): """ The SVI-VarDTC inference """ if isinstance(Y, np.ndarray) and np.any(np.isnan(Y)): missing_data = True N, M, Q = Y.shape[0], Z.shape[0], Z.shape[1] Ds = Y.shape[1] - (np.isnan(Y)*1).sum(1) Ymask = 1-np.isnan(Y)*1 Y_masked = np.zeros_like(Y) Y_masked[Ymask==1] = Y[Ymask==1] ND = Ymask.sum() else: missing_data = False N, D, M, Q = Y.shape[0], Y.shape[1], Z.shape[0], Z.shape[1] ND = N*D uncertain_inputs = isinstance(X, VariationalPosterior) uncertain_outputs = isinstance(Y, VariationalPosterior) beta = 1./np.fmax(likelihood.variance, 1e-6) psi0, psi2, YRY, psi1, psi1Y = self.gatherPsiStat(kern, X, Z, Y if not missing_data else Y_masked, beta, uncertain_inputs, D if not missing_data else Ds, missing_data) #====================================================================== # Compute Common Components #====================================================================== mu, S = qU.mean, qU.covariance mupsi1Y = mu.dot(psi1Y) Kmm = kern.K(Z).copy() diag.add(Kmm, self.const_jitter) Lm = jitchol(Kmm) if missing_data: S_mu = S[None,:,:]+mu.T[:,:,None]*mu.T[:,None,:] NS_mu = S_mu.T.dot(Ymask.T).T LmInv = dtrtri(Lm) LmInvPsi2LmInvT = np.swapaxes(psi2.dot(LmInv.T),1,2).dot(LmInv.T) LmInvSmuLmInvT = np.swapaxes(NS_mu.dot(LmInv.T),1,2).dot(LmInv.T) B = mupsi1Y+ mupsi1Y.T +(Ds[:,None,None]*psi2).sum(0) tmp = backsub_both_sides(Lm, B,'right') logL = -ND*log_2_pi/2. +ND*np.log(beta)/2. - psi0/2. - YRY/2. \ -(LmInvSmuLmInvT*LmInvPsi2LmInvT).sum()/2. +np.trace(tmp)/2. else: S_mu = S*D+tdot(mu) if uncertain_inputs: LmInvPsi2LmInvT = backsub_both_sides(Lm, psi2, 'right') else: LmInvPsi2LmInvT = tdot(dtrtrs(Lm, psi1.T)[0])/beta #tdot(psi1.dot(LmInv.T).T) /beta LmInvSmuLmInvT = backsub_both_sides(Lm, S_mu, 'right') B = mupsi1Y+ mupsi1Y.T +D*psi2 tmp = backsub_both_sides(Lm, B,'right') logL = -ND*log_2_pi/2. +ND*np.log(beta)/2. - psi0/2. - YRY/2. \ -(LmInvSmuLmInvT*LmInvPsi2LmInvT).sum()/2. +np.trace(tmp)/2. #====================================================================== # Compute dL_dKmm #====================================================================== dL_dKmm = np.eye(M) #====================================================================== # Compute dL_dthetaL for uncertian input and non-heter noise #====================================================================== dL_dthetaL = None #(YRY*beta + beta*output_dim*psi0 - num_data*output_dim*beta)/2. - beta*(dL_dpsi2R*psi2).sum() - beta*np.trace(LLinvPsi1TYYTPsi1LLinvT) #====================================================================== # Compute dL_dpsi #====================================================================== if missing_data: dL_dpsi0 = -Ds * (beta * np.ones((N,)))/2. else: dL_dpsi0 = -D * (beta * np.ones((N,)))/2. if uncertain_outputs: Ym,Ys = Y.mean, Y.variance dL_dpsi1 = dtrtrs(Lm, dtrtrs(Lm, Ym.dot(mu.T).T)[0], trans=1)[0].T*beta else: if missing_data: dL_dpsi1 = dtrtrs(Lm, dtrtrs(Lm, (Y_masked).dot(mu.T).T)[0], trans=1)[0].T*beta else: dL_dpsi1 = dtrtrs(Lm, dtrtrs(Lm, Y.dot(mu.T).T)[0], trans=1)[0].T*beta if uncertain_inputs: if missing_data: dL_dpsi2 = np.swapaxes((Ds[:,None,None]*np.eye(M)[None,:,:]-LmInvSmuLmInvT).dot(LmInv),1,2).dot(LmInv)*beta/2. else: dL_dpsi2 = beta*backsub_both_sides(Lm, D*np.eye(M)-LmInvSmuLmInvT, 'left')/2. 
else: dL_dpsi1 += beta*psi1.dot(dL_dpsi2+dL_dpsi2.T) dL_dpsi2 = None if uncertain_inputs: grad_dict = {'dL_dKmm': dL_dKmm, 'dL_dpsi0':dL_dpsi0, 'dL_dpsi1':dL_dpsi1, 'dL_dpsi2':dL_dpsi2, 'dL_dthetaL':dL_dthetaL} else: grad_dict = {'dL_dKmm': dL_dKmm, 'dL_dKdiag':dL_dpsi0, 'dL_dKnm':dL_dpsi1, 'dL_dthetaL':dL_dthetaL} if uncertain_outputs: Ym = Y.mean grad_dict['dL_dYmean'] = -Ym*beta+ dtrtrs(Lm,psi1.T)[0].T.dot(dtrtrs(Lm,mu)[0]) grad_dict['dL_dYvar'] = beta/-2. return logL, grad_dict
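# The missing-data branch above keeps a 0/1 mask, a per-point count of observed outputs
# and a zero-filled copy of Y, so later sums over outputs only touch observed entries.
# A small sketch of that bookkeeping with the same variable names:
import numpy as np

Y = np.array([[1.0, np.nan, 2.0],
              [np.nan, np.nan, 3.0]])
Ymask = 1 - np.isnan(Y) * 1                      # 1 = observed, 0 = missing
Ds = Y.shape[1] - (np.isnan(Y) * 1).sum(1)       # observed outputs per point -> [2, 1]
Y_masked = np.zeros_like(Y)
Y_masked[Ymask == 1] = Y[Ymask == 1]             # NaNs replaced by zeros
ND = Ymask.sum()                                 # total observed entries -> 3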
def inference(self, kern, X, Z, likelihood, Y, indexD, output_dim, Y_metadata=None, Lm=None, dL_dKmm=None, Kuu_sigma=None): """ The first phase of inference: Compute: log-likelihood, dL_dKmm Cached intermediate results: Kmm, KmmInv, """ input_dim = Z.shape[0] uncertain_inputs = isinstance(X, VariationalPosterior) beta = 1. / likelihood.variance if len(beta) == 1: beta = np.zeros(output_dim) + beta beta_exp = np.zeros(indexD.shape[0]) for d in range(output_dim): beta_exp[indexD == d] = beta[d] psi0, psi1, psi2 = self.gatherPsiStat(kern, X, Z, Y, beta, uncertain_inputs) psi2_sum = (beta_exp[:, None, None] * psi2).sum(0) / output_dim #====================================================================== # Compute Common Components #====================================================================== Kmm = kern.K(Z).copy() if Kuu_sigma is not None: diag.add(Kmm, Kuu_sigma) else: diag.add(Kmm, self.const_jitter) Lm = jitchol(Kmm) logL = 0. dL_dthetaL = np.zeros(output_dim) dL_dKmm = np.zeros_like(Kmm) dL_dpsi0 = np.zeros_like(psi0) dL_dpsi1 = np.zeros_like(psi1) dL_dpsi2 = np.zeros_like(psi2) wv = np.empty((Kmm.shape[0], output_dim)) for d in range(output_dim): idx_d = indexD == d Y_d = Y[idx_d] N_d = Y_d.shape[0] beta_d = beta[d] psi2_d = psi2[idx_d].sum(0) * beta_d psi1Y = Y_d.T.dot(psi1[idx_d]) * beta_d psi0_d = psi0[idx_d].sum() * beta_d YRY_d = np.square(Y_d).sum() * beta_d LmInvPsi2LmInvT = backsub_both_sides(Lm, psi2_d, 'right') Lambda = np.eye(Kmm.shape[0]) + LmInvPsi2LmInvT LL = jitchol(Lambda) LmLL = Lm.dot(LL) b = dtrtrs(LmLL, psi1Y.T)[0].T bbt = np.square(b).sum() v = dtrtrs(LmLL, b.T, trans=1)[0].T LLinvPsi1TYYTPsi1LLinvT = tdot(b.T) tmp = -backsub_both_sides(LL, LLinvPsi1TYYTPsi1LLinvT) dL_dpsi2R = backsub_both_sides(Lm, tmp + np.eye(input_dim)) / 2 logL_R = -N_d * np.log(beta_d) logL += -((N_d * log_2_pi + logL_R + psi0_d - np.trace(LmInvPsi2LmInvT)) + YRY_d - bbt) / 2. dL_dKmm += dL_dpsi2R - backsub_both_sides(Lm, LmInvPsi2LmInvT) / 2 dL_dthetaL[d:d + 1] = (YRY_d * beta_d + beta_d * psi0_d - N_d * beta_d) / 2. - beta_d * (dL_dpsi2R * psi2_d).sum( ) - beta_d * np.trace(LLinvPsi1TYYTPsi1LLinvT) dL_dpsi0[idx_d] = -beta_d / 2. dL_dpsi1[idx_d] = beta_d * np.dot(Y_d, v) dL_dpsi2[idx_d] = beta_d * dL_dpsi2R wv[:, d] = v LmInvPsi2LmInvT = backsub_both_sides(Lm, psi2_sum, 'right') Lambda = np.eye(Kmm.shape[0]) + LmInvPsi2LmInvT LL = jitchol(Lambda) LmLL = Lm.dot(LL) logdet_L = 2. * np.sum(np.log(np.diag(LL))) dL_dpsi2R_common = dpotri(LmLL)[0] / -2. dL_dpsi2 += dL_dpsi2R_common[None, :, :] * beta_exp[:, None, None] for d in range(output_dim): dL_dthetaL[d] += (dL_dpsi2R_common * psi2[indexD == d].sum(0) ).sum() * -beta[d] * beta[d] dL_dKmm += dL_dpsi2R_common * output_dim logL += -output_dim * logdet_L / 2. #====================================================================== # Compute dL_dKmm #====================================================================== # dL_dKmm = dL_dpsi2R - output_dim* backsub_both_sides(Lm, LmInvPsi2LmInvT)/2 #LmInv.T.dot(LmInvPsi2LmInvT).dot(LmInv)/2. 
#====================================================================== # Compute the Posterior distribution of inducing points p(u|Y) #====================================================================== LLInvLmT = dtrtrs(LL, Lm.T)[0] cov = tdot(LLInvLmT.T) wd_inv = backsub_both_sides( Lm, np.eye(input_dim) - backsub_both_sides(LL, np.identity(input_dim), transpose='left'), transpose='left') post = Posterior(woodbury_inv=wd_inv, woodbury_vector=wv, K=Kmm, mean=None, cov=cov, K_chol=Lm) #====================================================================== # Compute dL_dthetaL for uncertian input and non-heter noise #====================================================================== # for d in range(output_dim): # dL_dthetaL[d:d+1] += - beta[d]*beta[d]*(dL_dpsi2R[None,:,:] * psi2[indexD==d]/output_dim).sum() # dL_dthetaL += - (dL_dpsi2R[None,:,:] * psi2_sum*D beta*(dL_dpsi2R*psi2).sum() #====================================================================== # Compute dL_dpsi #====================================================================== if not uncertain_inputs: dL_dpsi1 += (psi1[:, None, :] * dL_dpsi2).sum(2) * 2. if uncertain_inputs: grad_dict = { 'dL_dKmm': dL_dKmm, 'dL_dpsi0': dL_dpsi0, 'dL_dpsi1': dL_dpsi1, 'dL_dpsi2': dL_dpsi2, 'dL_dthetaL': dL_dthetaL } else: grad_dict = { 'dL_dKmm': dL_dKmm, 'dL_dKdiag': dL_dpsi0, 'dL_dKnm': dL_dpsi1, 'dL_dthetaL': dL_dthetaL } return post, logL, grad_dict
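# In the multi-output inference each data point carries an output index in indexD, and
# the per-output noise precision beta is expanded to one value per data point before the
# weighted psi statistics are formed. A minimal sketch of that expansion:
import numpy as np

output_dim = 3
indexD = np.array([0, 2, 1, 0, 2])          # output index of each data point
beta = np.array([10., 5., 2.])              # per-output precision (1 / noise variance)
beta_exp = np.zeros(indexD.shape[0])
for d in range(output_dim):
    beta_exp[indexD == d] = beta[d]         # -> [10., 2., 5., 10., 2.]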
def inference(self, kern, X, Z, likelihood, Y, Y_metadata=None, Lm=None, dL_dKmm=None, Kuu_sigma=None): """ The first phase of inference: Compute: log-likelihood, dL_dKmm Cached intermediate results: Kmm, KmmInv, """ num_data, output_dim = Y.shape input_dim = Z.shape[0] uncertain_inputs = isinstance(X, VariationalPosterior) uncertain_outputs = isinstance(Y, VariationalPosterior) # from ..models.sslvm import Gaussian_Gamma # if isinstance(likelihood, Gaussian_Gamma): # beta = likelihood.expectation_beta() # logL_R = -num_data*likelihood.expectation_logbeta() # else: beta = 1./np.fmax(likelihood.variance, 1e-6) logL_R = -num_data*np.log(beta) psi0, psi2, YRY, psi1, psi1Y, Shalf, psi1S = self.gatherPsiStat(kern, X, Z, Y, beta, uncertain_inputs) #====================================================================== # Compute Common Components #====================================================================== Kmm = kern.K(Z).copy() if Kuu_sigma is not None: diag.add(Kmm, Kuu_sigma) else: diag.add(Kmm, self.const_jitter) Lm = jitchol(Kmm) #LmInv = dtrtri(Lm) if uncertain_inputs: LmInvPsi2LmInvT = backsub_both_sides(Lm, psi2, 'right') else: LmInvPsi2LmInvT = tdot(dtrtrs(Lm, psi1.T)[0])/beta #tdot(psi1.dot(LmInv.T).T) /beta Lambda = np.eye(Kmm.shape[0])+LmInvPsi2LmInvT LL = jitchol(Lambda) LmLL = Lm.dot(LL) # LLInv = dtrtri(LL) # LmLLInv = LLInv.dot(LmInv) logdet_L = 2.*np.sum(np.log(np.diag(LL))) b = dtrtrs(LmLL, psi1Y.T)[0].T #psi1Y.dot(LmLLInv.T) bbt = np.square(b).sum() v = dtrtrs(LmLL, b.T, trans=1)[0].T #b.dot(LmLLInv) LLinvPsi1TYYTPsi1LLinvT = tdot(b.T) if psi1S is not None: psi1SLLinv = dtrtrs(LmLL, psi1S.T)[0].T #psi1S.dot(LmLLInv.T) bbt += np.square(psi1SLLinv).sum() LLinvPsi1TYYTPsi1LLinvT += tdot(psi1SLLinv.T) psi1SP = dtrtrs(LmLL, psi1SLLinv.T, trans=1)[0].T #psi1SLLinv.dot(LmLLInv) tmp = -backsub_both_sides(LL, LLinvPsi1TYYTPsi1LLinvT+output_dim*np.eye(input_dim)) dL_dpsi2R = backsub_both_sides(Lm, tmp+output_dim*np.eye(input_dim))/2 #tmp = -LLInv.T.dot(LLinvPsi1TYYTPsi1LLinvT+output_dim*np.eye(input_dim)).dot(LLInv) #dL_dpsi2R = LmInv.T.dot(tmp+output_dim*np.eye(input_dim)).dot(LmInv)/2. #====================================================================== # Compute log-likelihood #====================================================================== logL = -(output_dim*(num_data*log_2_pi+logL_R+psi0-np.trace(LmInvPsi2LmInvT))+YRY- bbt)/2.-output_dim*logdet_L/2. #====================================================================== # Compute dL_dKmm #====================================================================== dL_dKmm = dL_dpsi2R - output_dim* backsub_both_sides(Lm, LmInvPsi2LmInvT)/2 #LmInv.T.dot(LmInvPsi2LmInvT).dot(LmInv)/2. #====================================================================== # Compute the Posterior distribution of inducing points p(u|Y) #====================================================================== LLInvLmT = dtrtrs(LL, Lm.T)[0] cov = tdot(LLInvLmT.T) wd_inv = backsub_both_sides(Lm, np.eye(input_dim)- backsub_both_sides(LL, np.identity(input_dim), transpose='left'), transpose='left') post = Posterior(woodbury_inv=wd_inv, woodbury_vector=v.T, K=Kmm, mean=None, cov=cov, K_chol=Lm) #====================================================================== # Compute dL_dthetaL for uncertian input and non-heter noise #====================================================================== # if isinstance(likelihood, Gaussian_Gamma): # from scipy.special import polygamma # dL_dthetaL = ((YRY + output_dim*psi0)/2. 
- (dL_dpsi2R*psi2).sum() - np.trace(LLinvPsi1TYYTPsi1LLinvT))/-beta # likelihood.q_a.gradient = num_data*output_dim/2.*polygamma(1, likelihood.q_a) + dL_dthetaL/likelihood.q_b # likelihood.q_b.gradient = num_data*output_dim/(-2.*likelihood.q_b) +dL_dthetaL*(-likelihood.q_a/(likelihood.q_b*likelihood.q_b)) # else: dL_dthetaL = (YRY*beta + beta*output_dim*psi0 - num_data*output_dim*beta)/2. - beta*(dL_dpsi2R*psi2).sum() - beta*np.trace(LLinvPsi1TYYTPsi1LLinvT) #====================================================================== # Compute dL_dpsi #====================================================================== dL_dpsi0 = -output_dim * (beta * np.ones((num_data,)))/2. if uncertain_outputs: m,s = Y.mean, Y.variance dL_dpsi1 = beta*(np.dot(m,v)+Shalf[:,None]*psi1SP) else: dL_dpsi1 = beta*np.dot(Y,v) if uncertain_inputs: dL_dpsi2 = beta* dL_dpsi2R else: dL_dpsi1 += np.dot(psi1,dL_dpsi2R)*2. dL_dpsi2 = None if uncertain_inputs: grad_dict = {'dL_dKmm': dL_dKmm, 'dL_dpsi0':dL_dpsi0, 'dL_dpsi1':dL_dpsi1, 'dL_dpsi2':dL_dpsi2, 'dL_dthetaL':dL_dthetaL} else: grad_dict = {'dL_dKmm': dL_dKmm, 'dL_dKdiag':dL_dpsi0, 'dL_dKnm':dL_dpsi1, 'dL_dthetaL':dL_dthetaL} if uncertain_outputs: m,s = Y.mean, Y.variance psi1LmiLLi = dtrtrs(LmLL, psi1.T)[0].T LLiLmipsi1Y = b.T grad_dict['dL_dYmean'] = -m*beta+ psi1LmiLLi.dot(LLiLmipsi1Y) grad_dict['dL_dYvar'] = beta/-2.+ np.square(psi1LmiLLi).sum(axis=1)/2 return post, logL, grad_dict
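# The bound uses 2 * sum(log(diag(L))) for log-determinants of positive-definite
# matrices, which follows from det(A) = det(L)^2 when A = L L^T. A quick check against
# numpy's slogdet:
import numpy as np
from GPy.util.linalg import jitchol, tdot

rng = np.random.RandomState(5)
A = tdot(rng.randn(7, 7)) + 7 * np.eye(7)
L = jitchol(A)
sign, logdet_np = np.linalg.slogdet(A)
assert sign > 0 and np.allclose(2. * np.sum(np.log(np.diag(L))), logdet_np)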
def inference_nonroot(self, kern, X, Z, likelihood, Y,Y_metadata=None, Lm=None, dL_dKmm=None): num_data, output_dim = Y.shape num_data_total = allReduceArrays([np.int32(num_data)], self.mpi_comm)[0] input_dim = Z.shape[0] uncertain_inputs = isinstance(X, VariationalPosterior) uncertain_outputs = isinstance(Y, VariationalPosterior) beta = 1./np.fmax(likelihood.variance, 1e-6) psi0, psi2, YRY, psi1, psi1Y, Shalf, psi1S = self.gatherPsiStat(kern, X, Z, Y, beta, uncertain_inputs) flag = np.zeros((1,),dtype=np.int32) self.mpi_comm.Bcast(flag,root=self.root) if flag[0] == 1: raise LinAlgError('Linalg error!') LmInv, LLInv = np.empty((input_dim, input_dim)).T, np.empty((input_dim, input_dim)).T broadcastArrays([LmInv, LLInv], self.mpi_comm, self.root) LmLLInv = LLInv.dot(LmInv) b = psi1Y.dot(LmLLInv.T) v = b.dot(LmLLInv) if psi1S is not None: psi1SLLinv = psi1S.dot(LmLLInv.T) bbt_sum = np.square(psi1SLLinv).sum() LLinvPsi1TYYTPsi1LLinvT_sum = tdot(psi1SLLinv.T) reduceArrays([bbt_sum, LLinvPsi1TYYTPsi1LLinvT_sum], self.mpi_comm, self.root) psi1SP = psi1SLLinv.dot(LmLLInv) dL_dpsi2R = np.empty((input_dim, input_dim)) broadcastArrays([dL_dpsi2R], self.mpi_comm, self.root) dL_dpsi0 = -output_dim * (beta * np.ones((num_data,)))/2. if uncertain_outputs: m,s = Y.mean, Y.variance dL_dpsi1 = beta*(np.dot(m,v)+Shalf[:,None]*psi1SP) else: dL_dpsi1 = beta*np.dot(Y,v) if uncertain_inputs: dL_dpsi2 = beta* dL_dpsi2R else: dL_dpsi1 += np.dot(psi1,dL_dpsi2R)*2. dL_dpsi2 = None if uncertain_inputs: grad_dict = {'dL_dKmm': None, 'dL_dpsi0':dL_dpsi0, 'dL_dpsi1':dL_dpsi1, 'dL_dpsi2':dL_dpsi2, 'dL_dthetaL':None} else: grad_dict = {'dL_dKmm': None, 'dL_dKdiag':dL_dpsi0, 'dL_dKnm':dL_dpsi1, 'dL_dthetaL':None} if uncertain_outputs: m,s = Y.mean, Y.variance psi1LmiLLi = psi1.dot(LmLLInv.T) LLiLmipsi1Y = b.T grad_dict['dL_dYmean'] = -m*beta+ psi1LmiLLi.dot(LLiLmipsi1Y) grad_dict['dL_dYvar'] = beta/-2.+ np.square(psi1LmiLLi).sum(axis=1)/2 return None, 0, grad_dict
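# inference_root and inference_nonroot coordinate through the same MPI communicator: the
# root broadcasts a failure flag right after its Cholesky attempts and every rank raises
# on failure, keeping all processes in lock-step. A stripped-down, illustrative sketch of
# that flag-broadcast pattern with mpi4py (allReduceArrays/broadcastArrays above are the
# project's own wrappers and are not reproduced here):
import numpy as np
from mpi4py import MPI

comm = MPI.COMM_WORLD
root = 0
flag = np.zeros(1, dtype=np.int32)
if comm.Get_rank() == root:
    try:
        pass  # expensive Cholesky factorisations happen only on the root
    except np.linalg.LinAlgError:
        flag[0] = 1
comm.Bcast(flag, root=root)              # every rank learns whether the root failed
if flag[0] == 1:
    raise np.linalg.LinAlgError('Cholesky failed on the root process')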