def inference(self, kern, X, likelihood, Y, mean_function=None, Y_metadata=None,
              K=None, variance=None, Z_tilde=None, A=None):
    """
    Returns a Posterior class containing essential quantities of the posterior.
    The comments below correspond to Alg. 2.1 in the GPML textbook.
    """
    if mean_function is None:
        m = 0
    else:
        m = mean_function.f(X)

    if variance is None:
        variance = likelihood.gaussian_variance(Y_metadata)

    YYT_factor = Y - m

    # NOTE: change K to A K A^T for the group case
    if K is None:
        if A is None:
            A = np.identity(X.shape[0])
        K = A.dot(kern.K(X)).dot(A.T)  # A_t k(X_t, X_t) A_t^T
    else:
        raise NotImplementedError('Needs to be extended to the group case!')

    Ky = K.copy()
    diag.add(Ky, variance + 1e-8)  # A_t k(X_t, X_t) A_t^T + sigma^2 I

    # pdinv returns:
    #   Wi:       inverse of Ky
    #   LW:       lower Cholesky factor of Ky (L)
    #   LWi:      inverse of the Cholesky factor (not used)
    #   W_logdet: log-determinant of Ky
    Wi, LW, LWi, W_logdet = pdinv(Ky)

    # LAPACK: DPOTRS solves a system of linear equations A*X = B with a symmetric
    # positive definite matrix A using the Cholesky factorization
    # A = U**T*U or A = L*L**T computed by DPOTRF.
    alpha, _ = dpotrs(LW, YYT_factor, lower=1)
    # This gives (A_t k(X_t, X_t) A_t^T + sigma^2 I)^{-1} (Y_t - m).

    # NOTE: 20210827 confirmed the log marginal likelihood
    log_marginal = 0.5 * (-Y.size * log_2_pi - Y.shape[1] * W_logdet
                          - np.sum(alpha * YYT_factor))

    if Z_tilde is not None:
        # This is a correction term for the log marginal likelihood.
        # In EP this is log Z_tilde, which is the difference between the
        # Gaussian marginal and Z_EP.
        log_marginal += Z_tilde

    # REVIEW: since log_marginal does not change, the gradient does not need to change either.
    # FIXME: confirm the gradient update is correct
    # dL_dK = 0.5 * (tdot(alpha) - Y.shape[1] * Wi)
    dL_dK = 0.5 * A.T.dot(tdot(alpha) - Y.shape[1] * Wi).dot(A)
    dL_dthetaL = likelihood.exact_inference_gradients(np.diag(dL_dK), Y_metadata)

    return PosteriorExactGroup(woodbury_chol=LW, woodbury_vector=alpha, K=K, A=A), \
        log_marginal, {'dL_dK': dL_dK, 'dL_dthetaL': dL_dthetaL, 'dL_dm': alpha}
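# A minimal sanity check for the exact-GP log marginal computed above: for
# Gaussian noise, log p(Y) should equal the log-density of Y under
# N(m, K + sigma^2 I). This sketch is standalone NumPy/SciPy with a toy
# squared-exponential kernel; it is illustrative only and does not use the
# GPy-style helpers (pdinv, dpotrs) from the code above.
import numpy as np
from scipy.stats import multivariate_normal

def _toy_rbf(X, variance=1.0, lengthscale=1.0):
    # k(x, x') = variance * exp(-0.5 * ||x - x'||^2 / lengthscale^2)
    sq = ((X[:, None, :] - X[None, :, :]) ** 2).sum(-1)
    return variance * np.exp(-0.5 * sq / lengthscale ** 2)

rng = np.random.default_rng(0)
X_toy = rng.normal(size=(5, 1))
y_toy = rng.normal(size=5)
noise = 0.1

Ky_toy = _toy_rbf(X_toy) + noise * np.eye(5)
# Direct evaluation of the Gaussian log-density ...
ref = multivariate_normal(mean=np.zeros(5), cov=Ky_toy).logpdf(y_toy)
# ... versus the Cholesky-based form used in inference():
L_toy = np.linalg.cholesky(Ky_toy)
alpha_toy = np.linalg.solve(Ky_toy, y_toy)
log_marginal_toy = 0.5 * (-5 * np.log(2 * np.pi)
                          - 2 * np.log(np.diag(L_toy)).sum()
                          - y_toy.dot(alpha_toy))
assert np.isclose(ref, log_marginal_toy)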
def inference(self, kern, X, W, likelihood, Y, mean_function=None, Y_metadata=None,
              K=None, variance=None, Z_tilde=None):
    """
    Returns a Posterior class containing essential quantities of the posterior.
    """
    if mean_function is None:
        m = 0
    else:
        m = mean_function.f(X)

    if variance is None:
        variance = likelihood.gaussian_variance(Y_metadata)

    YYT_factor = Y - m

    if K is None:
        K = kern.K(X)

    Ky = K.copy()
    diag.add(Ky, variance + 1e-8)

    Wi, LW, LWi, W_logdet = pdinv(Ky)
    alpha, _ = dpotrs(LW, YYT_factor, lower=1)

    log_marginal = 0.5 * (-Y.size * log_2_pi - Y.shape[1] * W_logdet
                          - np.sum(alpha * YYT_factor))

    if Z_tilde is not None:
        # This is a correction term for the log marginal likelihood.
        # In EP this is log Z_tilde, which is the difference between the
        # Gaussian marginal and Z_EP.
        log_marginal += Z_tilde

    dL_dK = 0.5 * (tdot(alpha) - Y.shape[1] * Wi)
    dL_dthetaL = likelihood.exact_inference_gradients(np.diag(dL_dK), Y_metadata)

    posterior_ = Posterior(woodbury_chol=LW, woodbury_vector=alpha, K=K)
    return posterior_, log_marginal, {
        'dL_dK': dL_dK,
        'dL_dthetaL': dL_dthetaL,
        'dL_dm': alpha
    }, W_logdet
def update_model(self, xvals, zvals, incremental=True):
    assert self.xvals is not None
    assert self.zvals is not None

    Kx = self.kern.K(self.xvals, xvals)

    # Update K matrix
    self._K = np.block([
        [self._K, Kx],
        [Kx.T, self.kern.K(xvals, xvals)]
    ])

    # Update internal data
    self.xvals = np.vstack([self.xvals, xvals])
    self.zvals = np.vstack([self.zvals, zvals])

    # Update the Woodbury inverse, either incrementally or from scratch
    if incremental:
        # Block-matrix inversion: with Pinv the inverse of the old (noisy)
        # kernel matrix and Minv the inverse of the Schur complement of the
        # new block, the updated inverse is assembled without re-inverting
        # the full matrix.
        Pinv = self.woodbury_inv
        Q = Kx
        R = Kx.T
        S = self.kern.K(xvals, xvals)
        M = S - np.dot(np.dot(R, Pinv), Q)
        # Add some noise to keep the matrix well conditioned
        diag.add(M, self.noise + 1e-8)
        Minv, _, _, _ = pdinv(M)

        Pnew = Pinv + np.dot(np.dot(np.dot(np.dot(Pinv, Q), Minv), R), Pinv)
        Qnew = -np.dot(np.dot(Pinv, Q), Minv)
        Rnew = -np.dot(np.dot(Minv, R), Pinv)
        Snew = Minv
        self._woodbury_inv = np.block([
            [Pnew, Qnew],
            [Rnew, Snew]
        ])
    else:
        Ky = self._K.copy()
        # Add some noise to keep the matrix well conditioned
        diag.add(Ky, self.noise + 1e-8)
        Wi, LW, LWi, W_logdet = pdinv(Ky)
        self._woodbury_inv = Wi

    self._woodbury_vector = np.dot(self.woodbury_inv, self.zvals)
    self._woodbury_chol = None
    self._mean = None
    self._covariance = None
    self._prior_mean = 0.
    self._K_chol = None
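# The incremental branch above is the standard block-matrix (Schur-complement)
# inversion identity. A quick standalone NumPy check of that identity on a
# random positive-definite matrix:
import numpy as np

rng = np.random.default_rng(1)
A_chk = rng.normal(size=(6, 6))
full = A_chk @ A_chk.T + 6 * np.eye(6)       # SPD matrix, partitioned below
P_blk, Q_blk = full[:4, :4], full[:4, 4:]
R_blk, S_blk = full[4:, :4], full[4:, 4:]

Pinv = np.linalg.inv(P_blk)
Minv = np.linalg.inv(S_blk - R_blk @ Pinv @ Q_blk)  # inverse of the Schur complement
blockwise = np.block([
    [Pinv + Pinv @ Q_blk @ Minv @ R_blk @ Pinv, -Pinv @ Q_blk @ Minv],
    [-Minv @ R_blk @ Pinv,                       Minv],
])
assert np.allclose(blockwise, np.linalg.inv(full))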
def init_model(self, xvals, zvals):
    # Update internal data
    self.xvals = xvals
    self.zvals = zvals

    self._K = self.kern.K(self.xvals)
    Ky = self._K.copy()
    # Add some noise to keep the matrix well conditioned
    diag.add(Ky, self.noise + 1e-8)
    Wi, LW, LWi, W_logdet = pdinv(Ky)

    self._woodbury_inv = Wi
    self._woodbury_vector = np.dot(self._woodbury_inv, self.zvals)
    self._woodbury_chol = None
    self._mean = None
    self._covariance = None
    self._prior_mean = 0.
    self._K_chol = None
def inference(self, kern, X, Z, likelihood, Y, qU):
    """
    The SVI-VarDTC inference
    """
    if isinstance(Y, np.ndarray) and np.any(np.isnan(Y)):
        missing_data = True
        N, M, Q = Y.shape[0], Z.shape[0], Z.shape[1]
        Ds = Y.shape[1] - (np.isnan(Y) * 1).sum(1)
        Ymask = 1 - np.isnan(Y) * 1
        Y_masked = np.zeros_like(Y)
        Y_masked[Ymask == 1] = Y[Ymask == 1]
        ND = Ymask.sum()
    else:
        missing_data = False
        N, D, M, Q = Y.shape[0], Y.shape[1], Z.shape[0], Z.shape[1]
        ND = N * D

    uncertain_inputs = isinstance(X, VariationalPosterior)
    uncertain_outputs = isinstance(Y, VariationalPosterior)

    beta = 1. / np.fmax(likelihood.variance, 1e-6)

    psi0, psi2, YRY, psi1, psi1Y = self.gatherPsiStat(
        kern, X, Z, Y if not missing_data else Y_masked, beta,
        uncertain_inputs, D if not missing_data else Ds, missing_data)

    #======================================================================
    # Compute Common Components
    #======================================================================

    mu, S = qU.mean, qU.covariance
    mupsi1Y = mu.dot(psi1Y)

    Kmm = kern.K(Z).copy()
    diag.add(Kmm, self.const_jitter)
    Lm = jitchol(Kmm)

    if missing_data:
        S_mu = S[None, :, :] + mu.T[:, :, None] * mu.T[:, None, :]
        NS_mu = S_mu.T.dot(Ymask.T).T
        LmInv = dtrtri(Lm)

        LmInvPsi2LmInvT = np.swapaxes(psi2.dot(LmInv.T), 1, 2).dot(LmInv.T)
        LmInvSmuLmInvT = np.swapaxes(NS_mu.dot(LmInv.T), 1, 2).dot(LmInv.T)

        B = mupsi1Y + mupsi1Y.T + (Ds[:, None, None] * psi2).sum(0)
        tmp = backsub_both_sides(Lm, B, 'right')

        logL = -ND*log_2_pi/2. + ND*np.log(beta)/2. - psi0/2. - YRY/2. \
               - (LmInvSmuLmInvT*LmInvPsi2LmInvT).sum()/2. + np.trace(tmp)/2.
    else:
        S_mu = S * D + tdot(mu)
        if uncertain_inputs:
            LmInvPsi2LmInvT = backsub_both_sides(Lm, psi2, 'right')
        else:
            LmInvPsi2LmInvT = tdot(dtrtrs(Lm, psi1.T)[0]) / beta  # tdot(psi1.dot(LmInv.T).T)/beta
        LmInvSmuLmInvT = backsub_both_sides(Lm, S_mu, 'right')

        B = mupsi1Y + mupsi1Y.T + D * psi2
        tmp = backsub_both_sides(Lm, B, 'right')

        logL = -ND*log_2_pi/2. + ND*np.log(beta)/2. - psi0/2. - YRY/2. \
               - (LmInvSmuLmInvT*LmInvPsi2LmInvT).sum()/2. + np.trace(tmp)/2.

    #======================================================================
    # Compute dL_dKmm
    #======================================================================

    dL_dKmm = np.eye(M)

    #======================================================================
    # Compute dL_dthetaL for uncertain input and non-heteroscedastic noise
    #======================================================================

    dL_dthetaL = None  # (YRY*beta + beta*output_dim*psi0 - num_data*output_dim*beta)/2. - beta*(dL_dpsi2R*psi2).sum() - beta*np.trace(LLinvPsi1TYYTPsi1LLinvT)

    #======================================================================
    # Compute dL_dpsi
    #======================================================================

    if missing_data:
        dL_dpsi0 = -Ds * (beta * np.ones((N, ))) / 2.
    else:
        dL_dpsi0 = -D * (beta * np.ones((N, ))) / 2.

    if uncertain_outputs:
        Ym, Ys = Y.mean, Y.variance
        dL_dpsi1 = dtrtrs(Lm, dtrtrs(Lm, Ym.dot(mu.T).T)[0], trans=1)[0].T * beta
    else:
        if missing_data:
            dL_dpsi1 = dtrtrs(Lm, dtrtrs(Lm, (Y_masked).dot(mu.T).T)[0], trans=1)[0].T * beta
        else:
            dL_dpsi1 = dtrtrs(Lm, dtrtrs(Lm, Y.dot(mu.T).T)[0], trans=1)[0].T * beta

    if uncertain_inputs:
        if missing_data:
            dL_dpsi2 = np.swapaxes(
                (Ds[:, None, None] * np.eye(M)[None, :, :] - LmInvSmuLmInvT).dot(LmInv),
                1, 2).dot(LmInv) * beta / 2.
        else:
            dL_dpsi2 = beta * backsub_both_sides(Lm, D * np.eye(M) - LmInvSmuLmInvT, 'left') / 2.
    else:
        dL_dpsi1 += beta * psi1.dot(dL_dpsi2 + dL_dpsi2.T)
        dL_dpsi2 = None

    if uncertain_inputs:
        grad_dict = {
            'dL_dKmm': dL_dKmm,
            'dL_dpsi0': dL_dpsi0,
            'dL_dpsi1': dL_dpsi1,
            'dL_dpsi2': dL_dpsi2,
            'dL_dthetaL': dL_dthetaL
        }
    else:
        grad_dict = {
            'dL_dKmm': dL_dKmm,
            'dL_dKdiag': dL_dpsi0,
            'dL_dKnm': dL_dpsi1,
            'dL_dthetaL': dL_dthetaL
        }

    if uncertain_outputs:
        Ym = Y.mean
        grad_dict['dL_dYmean'] = -Ym * beta + dtrtrs(Lm, psi1.T)[0].T.dot(dtrtrs(Lm, mu)[0])
        grad_dict['dL_dYvar'] = beta / -2.

    return logL, grad_dict
def inference(self, kern, X, Z, likelihood, Y, mean_function=None, Y_metadata=None):
    assert mean_function is None, "inference with a mean function not implemented"

    num_inducing, _ = Z.shape
    num_data, output_dim = Y.shape

    # make sure the noise is not heteroscedastic
    sigma_n = likelihood.gaussian_variance(Y_metadata)
    if sigma_n.size > 1:
        raise NotImplementedError("no heteroscedastic noise with this implementation of PEP")

    Kmm = kern.K(Z)
    Knn = kern.Kdiag(X)
    Knm = kern.K(X, Z)
    U = Knm

    # factor Kmm
    diag.add(Kmm, self.const_jitter)
    Kmmi, L, Li, _ = pdinv(Kmm)

    # compute beta_star, the effective noise precision
    LiUT = np.dot(Li, U.T)
    sigma_star = sigma_n + self.alpha * (Knn - np.sum(np.square(LiUT), 0))
    beta_star = 1. / sigma_star

    # Compute and factor A
    A = tdot(LiUT * np.sqrt(beta_star)) + np.eye(num_inducing)
    LA = jitchol(A)

    # back substitute to get b, P, v
    URiy = np.dot(U.T * beta_star, Y)
    tmp, _ = dtrtrs(L, URiy, lower=1)
    b, _ = dtrtrs(LA, tmp, lower=1)
    tmp, _ = dtrtrs(LA, b, lower=1, trans=1)
    v, _ = dtrtrs(L, tmp, lower=1, trans=1)
    tmp, _ = dtrtrs(LA, Li, lower=1, trans=0)
    P = tdot(tmp.T)

    alpha_const_term = (1.0 - self.alpha) / self.alpha

    # compute log marginal
    log_marginal = -0.5*num_data*output_dim*np.log(2*np.pi) + \
                   -np.sum(np.log(np.diag(LA)))*output_dim + \
                   0.5*output_dim*(1 + alpha_const_term)*np.sum(np.log(beta_star)) + \
                   -0.5*np.sum(np.square(Y.T*np.sqrt(beta_star))) + \
                   0.5*np.sum(np.square(b)) + 0.5*alpha_const_term*num_data*np.log(sigma_n)

    # compute dL_dR
    Uv = np.dot(U, v)
    dL_dR = 0.5*(np.sum(U*np.dot(U, P), 1) - (1.0 + alpha_const_term)/beta_star
                 + np.sum(np.square(Y), 1) - 2.*np.sum(Uv*Y, 1)
                 + np.sum(np.square(Uv), 1))*beta_star**2

    # Compute dL_dKmm
    vvT_P = tdot(v.reshape(-1, 1)) + P
    dL_dK = 0.5*(Kmmi - vvT_P)
    KiU = np.dot(Kmmi, U.T)
    dL_dK += self.alpha * np.dot(KiU*dL_dR, KiU.T)

    # Compute dL_dU
    vY = np.dot(v.reshape(-1, 1), Y.T)
    dL_dU = vY - np.dot(vvT_P, U.T)
    dL_dU *= beta_star
    dL_dU -= self.alpha * 2.*KiU*dL_dR

    dL_dthetaL = likelihood.exact_inference_gradients(dL_dR)
    dL_dthetaL += 0.5*alpha_const_term*num_data / sigma_n

    grad_dict = {'dL_dKmm': dL_dK,
                 'dL_dKdiag': dL_dR * self.alpha,
                 'dL_dKnm': dL_dU.T,
                 'dL_dthetaL': dL_dthetaL}

    # construct a posterior object
    post = Posterior(woodbury_inv=Kmmi - P, woodbury_vector=v, K=Kmm,
                     mean=None, cov=None, K_chol=L)

    return post, log_marginal, grad_dict
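# In the PEP inference above, the effective noise sigma_star = sigma_n +
# alpha * diag(Knn - Qnn) interpolates between FITC (alpha = 1, full
# correction term) and a VFE/DTC-like limit (alpha -> 0, plain sigma_n);
# see e.g. Bui, Yan & Turner (2017). A standalone NumPy illustration with a
# toy unit-variance squared-exponential kernel (all names below are local to
# this sketch):
import numpy as np

def _rbf_cross(A, B):
    sq = ((A[:, None, :] - B[None, :, :]) ** 2).sum(-1)
    return np.exp(-0.5 * sq)

rng = np.random.default_rng(2)
X_toy, Z_toy = rng.normal(size=(8, 1)), rng.normal(size=(3, 1))
Knn_toy = np.ones(8)                                   # Kdiag for unit variance
Knm_toy = _rbf_cross(X_toy, Z_toy)
Kmm_toy = _rbf_cross(Z_toy, Z_toy) + 1e-8 * np.eye(3)
Qnn_toy = np.sum(Knm_toy @ np.linalg.inv(Kmm_toy) * Knm_toy, 1)  # diag(Knm Kmm^-1 Kmn)
sigma_n_toy = 0.1
for alpha_toy in (1.0, 0.5, 1e-6):
    sigma_star_toy = sigma_n_toy + alpha_toy * (Knn_toy - Qnn_toy)
    print(alpha_toy, sigma_star_toy[:3])  # alpha=1: FITC noise; alpha->0: ~sigma_n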
def inference(self, kern, X, Z, likelihood, Y, qU_mean, qU_var, Kuu_sigma=None):
    """
    The SVI-VarDTC inference
    """
    N, D, M, Q = Y.shape[0], Y.shape[1], Z.shape[0], Z.shape[1]

    uncertain_inputs = isinstance(X, VariationalPosterior)
    uncertain_outputs = isinstance(Y, VariationalPosterior)

    beta = 1. / likelihood.variance

    psi0, psi2, YRY, psi1, psi1Y = self.gatherPsiStat(kern, X, Z, Y, beta, uncertain_inputs)

    #======================================================================
    # Compute Common Components
    #======================================================================

    Kuu = kern.K(Z).copy()
    if Kuu_sigma is not None:
        diag.add(Kuu, Kuu_sigma)
    else:
        diag.add(Kuu, self.const_jitter)
    Lm = jitchol(Kuu)

    mu, S = qU_mean, qU_var
    Ls = jitchol(S)
    LinvLs = dtrtrs(Lm, Ls)[0]
    Linvmu = dtrtrs(Lm, mu)[0]
    psi1YLinvT = dtrtrs(Lm, psi1Y.T)[0].T

    self.mid = {'qU_L': Ls, 'LinvLu': LinvLs, 'L': Lm, 'Linvmu': Linvmu}

    if uncertain_inputs:
        LmInvPsi2LmInvT = backsub_both_sides(Lm, psi2, 'right')
    else:
        LmInvPsi2LmInvT = tdot(dtrtrs(Lm, psi1.T)[0]) / beta

    LmInvSmuLmInvT = tdot(LinvLs) * D + tdot(Linvmu)

    #======================================================================
    # Compute log-likelihood
    #======================================================================

    logL_R = -N * np.log(beta)
    logL = -N*D*log_2_pi/2. - D*logL_R/2. - D*psi0/2. - YRY/2. \
           - (LmInvSmuLmInvT*LmInvPsi2LmInvT).sum()/2. \
           + np.trace(LmInvPsi2LmInvT)*D/2. + (Linvmu*psi1YLinvT.T).sum()

    #======================================================================
    # Compute dL_dKmm
    #======================================================================

    tmp1 = backsub_both_sides(Lm, LmInvSmuLmInvT.dot(LmInvPsi2LmInvT), 'left')
    tmp2 = Linvmu.dot(psi1YLinvT)
    tmp3 = backsub_both_sides(Lm, -D * LmInvPsi2LmInvT - tmp2 - tmp2.T, 'left') / 2.
    dL_dKmm = (tmp1 + tmp1.T) / 2. + tmp3

    #======================================================================
    # Compute dL_dthetaL for uncertain input and non-heteroscedastic noise
    #======================================================================

    dL_dthetaL = -D * N * beta / 2. - (
        -D * psi0 / 2. - YRY / 2.
        - (LmInvSmuLmInvT * LmInvPsi2LmInvT).sum() / 2.
        + np.trace(LmInvPsi2LmInvT) * D / 2.
        + (Linvmu * psi1YLinvT.T).sum()) * beta

    #======================================================================
    # Compute dL_dqU
    #======================================================================

    tmp1 = backsub_both_sides(Lm, -LmInvPsi2LmInvT, 'left')
    dL_dqU_mean = tmp1.dot(mu) + dtrtrs(Lm, psi1YLinvT.T, trans=1)[0]
    dL_dqU_var = D / 2. * tmp1

    #======================================================================
    # Compute the Posterior distribution of inducing points p(u|Y)
    #======================================================================

    KuuInvmu = dtrtrs(Lm, Linvmu, trans=1)[0]
    tmp = backsub_both_sides(Lm, np.eye(M) - tdot(LinvLs), 'left')

    post = Posterior(woodbury_inv=tmp, woodbury_vector=KuuInvmu, K=Kuu,
                     mean=mu, cov=S, K_chol=Lm)

    #======================================================================
    # Compute dL_dpsi
    #======================================================================

    dL_dpsi0 = -D * (beta * np.ones((N, ))) / 2.

    if uncertain_outputs:
        dL_dpsi1 = Y.mean.dot(dtrtrs(Lm, Linvmu, trans=1)[0].T) * beta
    else:
        dL_dpsi1 = Y.dot(dtrtrs(Lm, Linvmu, trans=1)[0].T) * beta

    dL_dpsi2 = beta * backsub_both_sides(Lm, D * np.eye(M) - LmInvSmuLmInvT, 'left') / 2.
    if not uncertain_inputs:
        dL_dpsi1 += psi1.dot(dL_dpsi2 + dL_dpsi2.T) / beta
        dL_dpsi2 = None

    if uncertain_inputs:
        grad_dict = {
            'dL_dKmm': dL_dKmm,
            'dL_dpsi0': dL_dpsi0,
            'dL_dpsi1': dL_dpsi1,
            'dL_dpsi2': dL_dpsi2,
            'dL_dthetaL': dL_dthetaL,
            'dL_dqU_mean': dL_dqU_mean,
            'dL_dqU_var': dL_dqU_var
        }
    else:
        grad_dict = {
            'dL_dKmm': dL_dKmm,
            'dL_dKdiag': dL_dpsi0,
            'dL_dKnm': dL_dpsi1,
            'dL_dthetaL': dL_dthetaL,
            'dL_dqU_mean': dL_dqU_mean,
            'dL_dqU_var': dL_dqU_var
        }

    if uncertain_outputs:
        m, s = Y.mean, Y.variance
        grad_dict['dL_dYmean'] = -m * beta + dtrtrs(Lm, psi1.T)[0].T.dot(dtrtrs(Lm, mu)[0])
        grad_dict['dL_dYvar'] = beta / -2.

    return post, logL, grad_dict
def inference(self, kern, X, Z, likelihood, Y, Y_metadata=None, Lm=None,
              dL_dKmm=None, fixed_covs_kerns=None, **kw):

    _, output_dim = Y.shape

    uncertain_inputs = isinstance(X, VariationalPosterior)

    # see whether we've got a different noise variance for each datum
    beta = 1. / np.fmax(likelihood.gaussian_variance(Y_metadata), 1e-6)
    # VVT_factor is a matrix such that tdot(VVT_factor) = VVT...this is for efficiency!
    # self.YYTfactor = self.get_YYTfactor(Y)
    # VVT_factor = self.get_VVTfactor(self.YYTfactor, beta)
    het_noise = beta.size > 1
    if het_noise:
        raise NotImplementedError("Heteroscedastic noise not implemented, "
                                  "should be possible though, feel free to try implementing it :)")
    if beta.ndim == 1:
        beta = beta[:, None]

    # do the inference:
    num_inducing = Z.shape[0]
    num_data = Y.shape[0]

    # kernel computations, using BGPLVM notation
    Kmm = kern.K(Z).copy()
    diag.add(Kmm, self.const_jitter)
    if Lm is None:
        Lm = jitchol(Kmm)

    # The rather complex computations of A, and the psi stats
    if uncertain_inputs:
        psi0 = kern.psi0(Z, X)
        psi1 = kern.psi1(Z, X)
        if het_noise:
            psi2_beta = np.sum([kern.psi2(Z, X[i:i+1, :]) * beta_i
                                for i, beta_i in enumerate(beta)], 0)
        else:
            psi2_beta = kern.psi2(Z, X) * beta
        LmInv = dtrtri(Lm)
        A = LmInv.dot(psi2_beta.dot(LmInv.T))
    else:
        psi0 = kern.Kdiag(X)
        psi1 = kern.K(X, Z)
        tmp = psi1 * np.sqrt(beta)  # identical for the het_noise and homoscedastic cases
        tmp, _ = dtrtrs(Lm, tmp.T, lower=1)
        A = tdot(tmp)

    # factor B
    B = np.eye(num_inducing) + A
    LB = jitchol(B)

    # back substitute C into psi1Vf
    # tmp, _ = dtrtrs(Lm, psi1.T.dot(VVT_factor), lower=1, trans=0)
    # _LBi_Lmi_psi1Vf, _ = dtrtrs(LB, tmp, lower=1, trans=0)
    # tmp, _ = dtrtrs(LB, _LBi_Lmi_psi1Vf, lower=1, trans=1)
    # Cpsi1Vf, _ = dtrtrs(Lm, tmp, lower=1, trans=1)

    # data fit and derivative of L w.r.t. Kmm
    # delit = tdot(_LBi_Lmi_psi1Vf)

    # Expose YYT to get additional covariates in (YYT + Kgg):
    tmp, _ = dtrtrs(Lm, psi1.T, lower=1, trans=0)
    _LBi_Lmi_psi1, _ = dtrtrs(LB, tmp, lower=1, trans=0)
    tmp, _ = dtrtrs(LB, _LBi_Lmi_psi1, lower=1, trans=1)
    Cpsi1, _ = dtrtrs(Lm, tmp, lower=1, trans=1)

    # TODO: cache this:
    # Compute fixed covariates covariance:
    if fixed_covs_kerns is not None:
        K_fixed = 0
        for name, [cov, k] in fixed_covs_kerns.items():
            K_fixed += k.K(cov)

        YYT_covs = tdot(Y) + K_fixed
        data_term = beta**2 * YYT_covs
        trYYT_covs = np.trace(YYT_covs)
    else:
        data_term = beta**2 * tdot(Y)
        trYYT_covs = self.get_trYYT(Y)

    delit = mdot(_LBi_Lmi_psi1, data_term, _LBi_Lmi_psi1.T)
    data_fit = np.trace(delit)

    DBi_plus_BiPBi = backsub_both_sides(LB, output_dim * np.eye(num_inducing) + delit)
    if dL_dKmm is None:
        delit = -0.5 * DBi_plus_BiPBi
        delit += -0.5 * B * output_dim
        delit += output_dim * np.eye(num_inducing)
        # Compute dL_dKmm
        dL_dKmm = backsub_both_sides(Lm, delit)

    # derivatives of L w.r.t. psi
    dL_dpsi0, dL_dpsi1, dL_dpsi2 = _compute_dL_dpsi(
        num_inducing, num_data, output_dim, beta, Lm,
        data_term, Cpsi1, DBi_plus_BiPBi,
        psi1, het_noise, uncertain_inputs)

    # log marginal likelihood
    log_marginal = _compute_log_marginal_likelihood(
        likelihood, num_data, output_dim, beta, het_noise,
        psi0, A, LB, trYYT_covs, data_fit, Y)

    if self.save_per_dim:
        self.saved_vals = [psi0, A, LB, _LBi_Lmi_psi1, beta]

    # No heteroscedastics, so no _LBi_Lmi_psi1Vf:
    # For the interested reader, try implementing the heteroscedastic version, it should be possible
    _LBi_Lmi_psi1Vf = None  # Is just here for documentation, so you can see what it was.

    # noise derivatives
    dL_dR = _compute_dL_dR(
        likelihood, het_noise, uncertain_inputs, LB,
        _LBi_Lmi_psi1Vf, DBi_plus_BiPBi, Lm, A,
        psi0, psi1, beta, data_fit, num_data, output_dim, trYYT_covs, Y, None)

    dL_dthetaL = likelihood.exact_inference_gradients(dL_dR, Y_metadata)

    # put the gradients in the right places
    if uncertain_inputs:
        grad_dict = {'dL_dKmm': dL_dKmm,
                     'dL_dpsi0': dL_dpsi0,
                     'dL_dpsi1': dL_dpsi1,
                     'dL_dpsi2': dL_dpsi2,
                     'dL_dthetaL': dL_dthetaL}
    else:
        grad_dict = {'dL_dKmm': dL_dKmm,
                     'dL_dKdiag': dL_dpsi0,
                     'dL_dKnm': dL_dpsi1,
                     'dL_dthetaL': dL_dthetaL}

    if fixed_covs_kerns is not None:
        # For now, we do not take the gradients, we can compute them,
        # but the maximum likelihood solution is to switch off the additional covariates...
        dL_dcovs = beta * np.eye(K_fixed.shape[0]) - beta**2 * tdot(_LBi_Lmi_psi1.T)
        grad_dict['dL_dcovs'] = -.5 * dL_dcovs

    # get sufficient things for posterior prediction
    # TODO: do we really want to do this in the loop?
    woodbury_vector = (beta * Cpsi1).dot(Y)
    # Equivalent explicit computation (kept for reference):
    # psi1V = np.dot(Y.T * beta, psi1).T
    # tmp, _ = dtrtrs(Lm, psi1V, lower=1, trans=0)
    # tmp, _ = dpotrs(LB, tmp, lower=1)
    # woodbury_vector, _ = dtrtrs(Lm, tmp, lower=1, trans=1)

    Bi, _ = dpotri(LB, lower=1)
    symmetrify(Bi)
    Bi = -Bi
    diag.add(Bi, 1)

    woodbury_inv = backsub_both_sides(Lm, Bi)

    # construct a posterior object
    post = Posterior(woodbury_inv=woodbury_inv, woodbury_vector=woodbury_vector,
                     K=Kmm, mean=None, cov=None, K_chol=Lm)
    return post, log_marginal, grad_dict
def inference(self, kern_r, kern_c, Xr, Xc, Zr, Zc, likelihood, Y,
              qU_mean, qU_var_r, qU_var_c):
    """
    The SVI-VarDTC inference
    """
    N, D, Mr, Mc, Qr, Qc = (Y.shape[0], Y.shape[1], Zr.shape[0], Zc.shape[0],
                            Zr.shape[1], Zc.shape[1])

    uncertain_inputs_r = isinstance(Xr, VariationalPosterior)
    uncertain_inputs_c = isinstance(Xc, VariationalPosterior)
    uncertain_outputs = isinstance(Y, VariationalPosterior)

    beta = 1. / likelihood.variance

    psi0_r, psi1_r, psi2_r = self.gatherPsiStat(kern_r, Xr, Zr, uncertain_inputs_r)
    psi0_c, psi1_c, psi2_c = self.gatherPsiStat(kern_c, Xc, Zc, uncertain_inputs_c)

    #======================================================================
    # Compute Common Components
    #======================================================================

    Kuu_r = kern_r.K(Zr).copy()
    diag.add(Kuu_r, self.const_jitter)
    Lr = jitchol(Kuu_r)

    Kuu_c = kern_c.K(Zc).copy()
    diag.add(Kuu_c, self.const_jitter)
    Lc = jitchol(Kuu_c)

    mu, Sr, Sc = qU_mean, qU_var_r, qU_var_c
    LSr = jitchol(Sr)
    LSc = jitchol(Sc)

    LcInvMLrInvT = dtrtrs(Lc, dtrtrs(Lr, mu.T)[0].T)[0]
    LcInvPsi2_cLcInvT = backsub_both_sides(Lc, psi2_c, 'right')
    LrInvPsi2_rLrInvT = backsub_both_sides(Lr, psi2_r, 'right')
    LcInvLSc = dtrtrs(Lc, LSc)[0]
    LrInvLSr = dtrtrs(Lr, LSr)[0]
    LcInvScLcInvT = tdot(LcInvLSc)
    LrInvSrLrInvT = tdot(LrInvLSr)
    LcInvPsi1_cT = dtrtrs(Lc, psi1_c.T)[0]
    LrInvPsi1_rT = dtrtrs(Lr, psi1_r.T)[0]

    tr_LrInvPsi2_rLrInvT_LrInvSrLrInvT = (LrInvPsi2_rLrInvT * LrInvSrLrInvT).sum()
    tr_LcInvPsi2_cLcInvT_LcInvScLcInvT = (LcInvPsi2_cLcInvT * LcInvScLcInvT).sum()
    tr_LrInvSrLrInvT = np.square(LrInvLSr).sum()
    tr_LcInvScLcInvT = np.square(LcInvLSc).sum()
    tr_LrInvPsi2_rLrInvT = np.trace(LrInvPsi2_rLrInvT)
    tr_LcInvPsi2_cLcInvT = np.trace(LcInvPsi2_cLcInvT)

    #======================================================================
    # Compute log-likelihood
    #======================================================================

    logL_A = - np.square(Y).sum() \
             - (LcInvMLrInvT.T.dot(LcInvPsi2_cLcInvT).dot(LcInvMLrInvT)*LrInvPsi2_rLrInvT).sum() \
             - tr_LrInvPsi2_rLrInvT_LrInvSrLrInvT * tr_LcInvPsi2_cLcInvT_LcInvScLcInvT \
             + 2 * (Y * LcInvPsi1_cT.T.dot(LcInvMLrInvT).dot(LrInvPsi1_rT)).sum() \
             - psi0_c * psi0_r + tr_LrInvPsi2_rLrInvT * tr_LcInvPsi2_cLcInvT

    logL = -N*D/2.*(np.log(2.*np.pi) - np.log(beta)) + beta/2.*logL_A \
           - Mc * (np.log(np.diag(Lr)).sum() - np.log(np.diag(LSr)).sum()) \
           - Mr * (np.log(np.diag(Lc)).sum() - np.log(np.diag(LSc)).sum()) \
           - np.square(LcInvMLrInvT).sum()/2. - tr_LrInvSrLrInvT * tr_LcInvScLcInvT/2. \
           + Mr*Mc/2.

    #======================================================================
    # Compute dL_dKuu
    #======================================================================

    tmp = beta * LcInvPsi2_cLcInvT.dot(LcInvMLrInvT).dot(LrInvPsi2_rLrInvT).dot(LcInvMLrInvT.T) \
          + beta * tr_LrInvPsi2_rLrInvT_LrInvSrLrInvT * LcInvPsi2_cLcInvT.dot(LcInvScLcInvT) \
          - beta * LcInvMLrInvT.dot(LrInvPsi1_rT).dot(Y.T).dot(LcInvPsi1_cT.T) \
          - beta/2. * tr_LrInvPsi2_rLrInvT * LcInvPsi2_cLcInvT - Mr/2.*np.eye(Mc) \
          + tdot(LcInvMLrInvT)/2. + tr_LrInvSrLrInvT/2. * LcInvScLcInvT

    dL_dKuu_c = backsub_both_sides(Lc, tmp, 'left')
    dL_dKuu_c += dL_dKuu_c.T
    dL_dKuu_c *= 0.5

    tmp = beta * LcInvMLrInvT.T.dot(LcInvPsi2_cLcInvT).dot(LcInvMLrInvT).dot(LrInvPsi2_rLrInvT) \
          + beta * tr_LcInvPsi2_cLcInvT_LcInvScLcInvT * LrInvPsi2_rLrInvT.dot(LrInvSrLrInvT) \
          - beta * LrInvPsi1_rT.dot(Y.T).dot(LcInvPsi1_cT.T).dot(LcInvMLrInvT) \
          - beta/2. * tr_LcInvPsi2_cLcInvT * LrInvPsi2_rLrInvT - Mc/2.*np.eye(Mr) \
          + tdot(LcInvMLrInvT.T)/2. + tr_LcInvScLcInvT/2. * LrInvSrLrInvT

    dL_dKuu_r = backsub_both_sides(Lr, tmp, 'left')
    dL_dKuu_r += dL_dKuu_r.T
    dL_dKuu_r *= 0.5

    #======================================================================
    # Compute dL_dthetaL
    #======================================================================

    dL_dthetaL = -D * N * beta / 2. - logL_A * beta * beta / 2.

    #======================================================================
    # Compute dL_dqU
    #======================================================================

    tmp = -beta * LcInvPsi2_cLcInvT.dot(LcInvMLrInvT).dot(LrInvPsi2_rLrInvT) \
          + beta * LcInvPsi1_cT.dot(Y).dot(LrInvPsi1_rT.T) - LcInvMLrInvT
    dL_dqU_mean = dtrtrs(Lc, dtrtrs(Lr, tmp.T, trans=1)[0].T, trans=1)[0]

    LScInv = dtrtri(LSc)
    tmp = -beta/2. * tr_LrInvPsi2_rLrInvT_LrInvSrLrInvT * LcInvPsi2_cLcInvT \
          - tr_LrInvSrLrInvT/2. * np.eye(Mc)
    dL_dqU_var_c = backsub_both_sides(Lc, tmp, 'left') + tdot(LScInv.T) * Mr / 2.

    LSrInv = dtrtri(LSr)
    tmp = -beta/2. * tr_LcInvPsi2_cLcInvT_LcInvScLcInvT * LrInvPsi2_rLrInvT \
          - tr_LcInvScLcInvT/2. * np.eye(Mr)
    dL_dqU_var_r = backsub_both_sides(Lr, tmp, 'left') + tdot(LSrInv.T) * Mc / 2.

    #======================================================================
    # Compute the Posterior distribution of inducing points p(u|Y)
    #======================================================================

    post = PosteriorMultioutput(LcInvMLrInvT=LcInvMLrInvT,
                                LcInvScLcInvT=LcInvScLcInvT,
                                LrInvSrLrInvT=LrInvSrLrInvT,
                                Lr=Lr, Lc=Lc, kern_r=kern_r, Xr=Xr, Zr=Zr)

    #======================================================================
    # Compute dL_dpsi
    #======================================================================

    dL_dpsi0_r = -psi0_c * beta / 2. * np.ones((D, ))
    dL_dpsi0_c = -psi0_r * beta / 2. * np.ones((N, ))

    dL_dpsi1_c = beta * dtrtrs(Lc, (Y.dot(LrInvPsi1_rT.T).dot(LcInvMLrInvT.T)).T, trans=1)[0].T
    dL_dpsi1_r = beta * dtrtrs(Lr, (Y.T.dot(LcInvPsi1_cT.T).dot(LcInvMLrInvT)).T, trans=1)[0].T

    tmp = beta / 2. * (-LcInvMLrInvT.dot(LrInvPsi2_rLrInvT).dot(LcInvMLrInvT.T)
                       - tr_LrInvPsi2_rLrInvT_LrInvSrLrInvT * LcInvScLcInvT
                       + tr_LrInvPsi2_rLrInvT * np.eye(Mc))
    dL_dpsi2_c = backsub_both_sides(Lc, tmp, 'left')

    tmp = beta / 2. * (-LcInvMLrInvT.T.dot(LcInvPsi2_cLcInvT).dot(LcInvMLrInvT)
                       - tr_LcInvPsi2_cLcInvT_LcInvScLcInvT * LrInvSrLrInvT
                       + tr_LcInvPsi2_cLcInvT * np.eye(Mr))
    dL_dpsi2_r = backsub_both_sides(Lr, tmp, 'left')

    if not uncertain_inputs_r:
        dL_dpsi1_r += psi1_r.dot(dL_dpsi2_r + dL_dpsi2_r.T)
    if not uncertain_inputs_c:
        dL_dpsi1_c += psi1_c.dot(dL_dpsi2_c + dL_dpsi2_c.T)

    grad_dict = {
        'dL_dthetaL': dL_dthetaL,
        'dL_dqU_mean': dL_dqU_mean,
        'dL_dqU_var_c': dL_dqU_var_c,
        'dL_dqU_var_r': dL_dqU_var_r,
        'dL_dKuu_c': dL_dKuu_c,
        'dL_dKuu_r': dL_dKuu_r,
    }

    if uncertain_inputs_c:
        grad_dict['dL_dpsi0_c'] = dL_dpsi0_c
        grad_dict['dL_dpsi1_c'] = dL_dpsi1_c
        grad_dict['dL_dpsi2_c'] = dL_dpsi2_c
    else:
        grad_dict['dL_dKdiag_c'] = dL_dpsi0_c
        grad_dict['dL_dKfu_c'] = dL_dpsi1_c

    if uncertain_inputs_r:
        grad_dict['dL_dpsi0_r'] = dL_dpsi0_r
        grad_dict['dL_dpsi1_r'] = dL_dpsi1_r
        grad_dict['dL_dpsi2_r'] = dL_dpsi2_r
    else:
        grad_dict['dL_dKdiag_r'] = dL_dpsi0_r
        grad_dict['dL_dKfu_r'] = dL_dpsi1_r

    return post, logL, grad_dict
def inference(self, kern, X, Z, likelihood, Y, mean_function=None, Y_metadata=None):
    assert mean_function is None, "inference with a mean function not implemented"

    num_inducing, _ = Z.shape
    num_data, output_dim = Y.shape

    # make sure the noise is not heteroscedastic
    sigma_n = likelihood.gaussian_variance(Y_metadata)
    if sigma_n.size > 1:
        raise NotImplementedError("no heteroscedastic noise with this implementation of PEP")

    Kmm = kern.K(Z)
    Knn = kern.Kdiag(X)
    Knm = kern.K(X, Z)
    U = Knm

    # factor Kmm
    diag.add(Kmm, self.const_jitter)
    Kmmi, L, Li, _ = pdinv(Kmm)

    # compute beta_star, the effective noise precision
    LiUT = np.dot(Li, U.T)
    sigma_star = sigma_n + self.alpha * (Knn - np.sum(np.square(LiUT), 0))
    beta_star = 1. / sigma_star

    # Compute and factor A
    A = tdot(LiUT * np.sqrt(beta_star)) + np.eye(num_inducing)
    LA = jitchol(A)

    # back substitute to get b, P, v
    URiy = np.dot(U.T * beta_star, Y)
    tmp, _ = dtrtrs(L, URiy, lower=1)
    b, _ = dtrtrs(LA, tmp, lower=1)
    tmp, _ = dtrtrs(LA, b, lower=1, trans=1)
    v, _ = dtrtrs(L, tmp, lower=1, trans=1)
    tmp, _ = dtrtrs(LA, Li, lower=1, trans=0)
    P = tdot(tmp.T)

    alpha_const_term = (1.0 - self.alpha) / self.alpha

    # compute log marginal
    log_marginal = -0.5*num_data*output_dim*np.log(2*np.pi) + \
                   -np.sum(np.log(np.diag(LA)))*output_dim + \
                   0.5*output_dim*(1 + alpha_const_term)*np.sum(np.log(beta_star)) + \
                   -0.5*np.sum(np.square(Y.T*np.sqrt(beta_star))) + \
                   0.5*np.sum(np.square(b)) + 0.5*alpha_const_term*num_data*np.log(sigma_n)

    # compute dL_dR
    Uv = np.dot(U, v)
    dL_dR = 0.5*(np.sum(U*np.dot(U, P), 1) - (1.0 + alpha_const_term)/beta_star
                 + np.sum(np.square(Y), 1) - 2.*np.sum(Uv*Y, 1)
                 + np.sum(np.square(Uv), 1))*beta_star**2

    # Compute dL_dKmm
    vvT_P = tdot(v.reshape(-1, 1)) + P
    dL_dK = 0.5 * (Kmmi - vvT_P)
    KiU = np.dot(Kmmi, U.T)
    dL_dK += self.alpha * np.dot(KiU * dL_dR, KiU.T)

    # Compute dL_dU
    vY = np.dot(v.reshape(-1, 1), Y.T)
    dL_dU = vY - np.dot(vvT_P, U.T)
    dL_dU *= beta_star
    dL_dU -= self.alpha * 2. * KiU * dL_dR

    dL_dthetaL = likelihood.exact_inference_gradients(dL_dR)
    dL_dthetaL += 0.5 * alpha_const_term * num_data / sigma_n

    grad_dict = {
        'dL_dKmm': dL_dK,
        'dL_dKdiag': dL_dR * self.alpha,
        'dL_dKnm': dL_dU.T,
        'dL_dthetaL': dL_dthetaL
    }

    # construct a posterior object
    post = Posterior(woodbury_inv=Kmmi - P, woodbury_vector=v, K=Kmm,
                     mean=None, cov=None, K_chol=L)

    return post, log_marginal, grad_dict
def predict_value(self, xvals, include_noise=True, full_cov=False):
    # Calculate for the test points
    assert xvals.shape[0] >= 1
    assert xvals.shape[1] == self.dimension

    n_points, input_dim = xvals.shape

    # With no observations, predict 0 mean everywhere and prior variance
    if self.xvals is None:
        return np.zeros((n_points, 1)), np.ones((n_points, 1)) * self.variance

    # Find neighbors within radius
    point_group = self.spatial_tree.query_ball_point(xvals, self.neighbor_radius)
    point_list = []
    for points in point_group:
        for index in points:
            point_list.append(index)
    point_set = set(point_list)
    xpoints = [self.xvals[index] for index in point_set]
    zpoints = [self.zvals[index] for index in point_set]

    # Brute-force check the points in the waiting queue
    if self.xwait is not None and self.xwait.shape[0] > 0:
        wait_list = []
        for i, u in enumerate(self.xwait):
            for j, v in enumerate(xvals):
                dist = sp.spatial.distance.minkowski(u, v, p=2.0)
                if dist <= self.neighbor_radius:
                    wait_list.append(i)
        wait_set = set(wait_list)
        xpoints = [self.xwait[index] for index in wait_set] + xpoints
        zpoints = [self.zwait[index] for index in wait_set] + zpoints

    xpoints = np.array(xpoints).reshape(-1, self.dimension)
    zpoints = np.array(zpoints).reshape(-1, 1)

    # With no nearby points, fall back to the prior
    if xpoints.shape[0] == 0:
        return np.zeros((n_points, 1)), np.ones((n_points, 1)) * self.variance

    Kx = self.kern.K(xpoints, xvals)
    K = self.kern.K(xpoints, xpoints)
    # Add some noise to keep the matrix well conditioned
    Ky = K.copy()
    diag.add(Ky, self.noise + 1e-8)
    Wi, LW, LWi, W_logdet = pdinv(Ky)

    woodbury_inv = Wi
    woodbury_vector = np.dot(woodbury_inv, zpoints)

    mu = np.dot(Kx.T, woodbury_vector)
    if len(mu.shape) == 1:
        mu = mu.reshape(-1, 1)

    if full_cov:
        Kxx = self.kern.K(xvals)
        var = Kxx - np.dot(Kx.T, np.dot(woodbury_inv, Kx))
    else:
        Kxx = self.kern.Kdiag(xvals)
        var = (Kxx - np.sum(np.dot(woodbury_inv.T, Kx) * Kx, 0))[:, None]

    # If model noise should be included in the prediction
    if include_noise:
        var += self.noise

    # Optional cross-check against the global (legacy) model
    update_legacy = False
    if update_legacy:
        if self.model is None:
            # With no observations, predict 0 mean everywhere and prior variance
            mean, variance = np.zeros((n_points, 1)), np.ones((n_points, 1)) * self.variance
        else:
            # Else, return the predicted values
            mean, variance = self.model.predict(xvals, full_cov=False,
                                                include_likelihood=include_noise)
        if xvals.shape[0] < 10:
            print(np.sum(mu - mean))
            print(np.sum(var - variance))

    return mu, var
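# The neighbor lookup above assumes self.spatial_tree is a SciPy k-d tree.
# A minimal standalone sketch of that pattern (cKDTree and query_ball_point
# are real scipy.spatial APIs; the points and radius here are arbitrary):
import numpy as np
from scipy import spatial

pts = np.random.default_rng(3).uniform(size=(100, 2))
tree = spatial.cKDTree(pts)
queries = np.array([[0.5, 0.5], [0.1, 0.9]])
# One list of point indices per query row, all within the given radius:
neighbor_lists = tree.query_ball_point(queries, r=0.2)
nearby = {i for lst in neighbor_lists for i in lst}  # flattened, deduplicated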
def reset_epoch(self):
    # update kernel with new hyperparams
    self.kern.lengthscale = self.params['ls'].copy()
    self.kern.variance = self.params['σ0']**2
    σ_n2 = self.params['σn']**2
    Z = self.params['R']

    # initialize all prior quantities
    self.n = np.zeros(self.num_inducing)  # natural mean vector (num_output = 1!)
    self.P = self.kern.K(Z)               # covariance matrix
    diag.add(self.P, self.const_jitter)
    L_P = jitchol(self.P)
    self.C, _ = dpotri(L_P, lower=1)      # precision matrix
    self._log_marginal_likelihood = 0.0   # log marginal likelihood
    self._log_Det_C = -2 * np.sum(np.log(np.diag(L_P)))  # log determinant of C

    self.Krr = self.P
    self.iKrr = self.C

    # derivative quantities
    J = self.num_inducing                         # number of inducing points
    JD = self.num_inducing * self.kern.input_dim  # number of inducing points times dimension

    if self.params_EST['R']:
        # derivative of natural mean wrt inducing inputs (Rjd: R11,...,R1D, R21,...,RJD)
        self.dn_dR = np.zeros((J, JD))
        # derivative of precision matrix wrt inducing inputs (Rjd: R11,...,R1D, R21,...,RJD)
        self.dC_dR = np.zeros((J, J, JD))
        # gradients of inducing inputs
        self.dψ_dR = np.zeros((J, self.kern.input_dim))

        dKrr_sparse = self.kern.dK_dX(Z)
        for j in range(0, self.num_inducing):
            for d in range(0, self.kern.input_dim):
                jd = j * self.kern.input_dim + d
                self.dC_dR[:, :, jd] = -np.outer(
                    np.dot(self.C, dKrr_sparse[:, j, d]), self.C[:, j])
                self.dC_dR[:, :, jd] = self.dC_dR[:, :, jd] + self.dC_dR[:, :, jd].T
    else:
        self.dψ_dR = 0.0
        self.dn_dR = 0.0
        self.dC_dR = 0.0

    dKrr_dσ02 = self.kern.dK_dσ02(Z)
    self.dn_dσ02 = np.zeros(J)
    self.dC_dσ02 = -np.dot(np.dot(self.C, dKrr_dσ02), self.C)
    self.dψ_dσ02 = 0.0

    dKrr_dl = self.kern.dK_dl(Z)
    num_lengthscales = dKrr_dl.shape[2]
    self.dn_dl = np.zeros((J, num_lengthscales))
    self.dC_dl = np.zeros((J, J, num_lengthscales))
    self.dψ_dl = np.zeros(num_lengthscales)
    for d in range(0, num_lengthscales):
        self.dC_dl[:, :, d] = -np.dot(np.dot(self.C, dKrr_dl[:, :, d]), self.C)

    self.dn_dσn2 = np.zeros(J)
    self.dC_dσn2 = np.zeros((J, J))
    self.dψ_dσn2 = 0.0
def inference(self, n0, C0, P0, log_marginal_likelihood0, log_Det_C0,
              dn_dR, dC_dR, dψ_dR, dn_dσ02, dC_dσ02, dψ_dσ02,
              dn_dl, dC_dl, dψ_dl, dn_dσn2, dC_dσn2, dψ_dσn2, X, Y):
    α = self.α
    α_const = (1 - α) / α
    num_data, _ = Y.shape
    num_inducing = n0.shape[0]  # it only works with num_outputs = 1
    y = Y[:, 0]                 # it only works with num_outputs = 1

    # update kernel with new hyperparams
    self.kern.lengthscale = self.params['ls'].copy()
    self.kern.variance = self.params['σ0']**2
    σ_n2 = self.params['σn']**2
    Z = self.params['R']

    # compute kernel quantities
    Krr = self.kern.K(Z)              # kernel matrix of inducing inputs
    diag.add(Krr, self.const_jitter)  # add some jitter for stability reasons
    Kxr = self.kern.K(X, Z)           # kernel matrix between mini-batch and inducing inputs
    kxx = self.kern.Kdiag(X)          # diagonal of kernel matrix of the mini-batch
    L_K = jitchol(Krr)                # lower Cholesky factor of the kernel matrix
    iKrr, _ = dpotri(L_K)             # inverse of kernel matrix of inducing inputs
    self.Krr = Krr
    self.iKrr = iKrr

    # compute state space matrices (and temporary matrices)
    H = np.dot(Kxr, iKrr)             # observation matrix
    Ht = H.T                          # transpose of observation matrix
    d = kxx - np.sum(H * Kxr, 1)      # diagonal of correction matrix
    v = α * d + σ_n2                  # diagonal of actual noise matrix
    a = α_const * (np.sum(np.log(v)) - num_data * np.log(σ_n2))  # PEP correction term in the marginal likelihood
    A_ = Ht / v
    α_ = np.dot(P0, n0)
    r = y - np.dot(H, α_)

    # update natural mean and precision + inversion yielding covariance matrix
    n1 = n0 + np.dot(A_, y)
    C1 = C0 + np.dot(A_, H)
    L_C = jitchol(C1)
    P1, _ = dpotri(L_C)

    # more temporary matrices
    B_ = np.dot(H, P1)  # iV * H * Li'  # LAPACK?
    β_ = r / v
    γ_ = np.dot(B_.T, β_)
    δ_ = β_ - np.dot(A_.T, γ_)

    # update marginal log likelihood
    log_Det_C1 = 2 * np.sum(np.log(np.diag(L_C)))
    log_Det_V = np.sum(np.log(v))
    Δ0 = num_data * np.log(2 * np.pi) + log_Det_C1 - log_Det_C0 \
         + log_Det_V + np.sum(r * δ_) + a
    log_marginal_likelihood1 = log_marginal_likelihood0 - 0.5 * Δ0

    # compute constant derivatives of likelihood wrt kernel matrices
    dL_dH = 2 * ((B_.T / v).T - np.outer(δ_, α_ + γ_))
    dL_dv = -(np.sum(H * B_, 1) - v / α + (r - np.dot(H, γ_))**2) / (v**2)
    D_ = α * (Ht * dL_dv).T
    E_ = np.dot(dL_dH, iKrr)
    dL_dKxr = E_ - 2 * D_
    dL_dKrr = -np.dot(Ht, E_ - D_)
    dL_dkxx = α * dL_dv
    dL_dn = -2 * np.dot(P0, np.dot(Ht, δ_))
    dL_dC = P1 - P0 - np.outer(dL_dn, α_) + np.outer(γ_, γ_)
    # dL_d_dn = 2*σ_n2*sum(dL_dv) - 2*num_data*α_const  # wrt dn
    dL_d_dn = sum(dL_dv) - num_data * α_const / σ_n2     # wrt σn2

    iVy = y / v
    dH = np.zeros((num_data, num_inducing))
    scaleFact = 1

    if self.params_EST['R']:
        # compute sparse kernel derivatives
        dKrr_sparse = self.kern.dK_dX(Z)
        dKxr_sparse = self.kern.dK_dX(X, Z)

        # loop over all inducing points
        for j in range(0, num_inducing):
            for d in range(0, self.D):
                jd = j * self.D + d
                kjd = dKrr_sparse[:, j, d]
                k2jd = dKxr_sparse[:, j, d]
                delta = -0.5 * (np.sum(dL_dKrr[:, j] * kjd)
                                + np.sum(dL_dKrr[j, :] * kjd)
                                + np.sum(dL_dKxr[:, j] * k2jd)
                                + np.sum(dL_dn * dn_dR[:, jd])
                                + np.sum(dL_dC * dC_dR[:, :, jd]))
                dψ_dR[j, d] = delta * scaleFact

                dH = -np.outer(H[:, j], kjd)
                dH[:, j] += -np.dot(H, kjd) + k2jd
                dH = np.dot(dH, iKrr)
                dd = -np.sum(dH * Kxr, 1) - H[:, j] * k2jd  # dKxx_diag for theta!
                div = -α * dd / (v**2)
                dn_dR[:, jd] = dn_dR[:, jd] + np.dot(dH.T, iVy) + np.dot(Ht, div * y)
                F_ = np.dot(A_, dH)
                dC_dR[:, :, jd] = dC_dR[:, :, jd] + F_ + F_.T + np.dot(Ht * div, H)

    # compute kernel derivatives wrt variance_0
    dKrr_dσ02 = self.kern.dK_dσ02(Z)
    dKxr_dσ02 = self.kern.dK_dσ02(X, Z)
    dkxx_dσ02 = self.kern.dK_dσ02_diag(X)

    delta = -0.5 * (np.sum(dL_dKrr * dKrr_dσ02) + np.sum(dL_dKxr * dKxr_dσ02)
                    + np.sum(dL_dn * dn_dσ02) + np.sum(dL_dC * dC_dσ02))
    delta = delta - 0.5 * np.sum(dL_dkxx * dkxx_dσ02)
    dψ_dσ02 = delta * scaleFact

    dH = dKxr_dσ02 - np.dot(H, dKrr_dσ02)
    dH = np.dot(dH, iKrr)
    dd = dkxx_dσ02 - np.sum(dH * Kxr, 1) - np.sum(H * dKxr_dσ02, 1)
    div = -α * dd / (v**2)
    dn_dσ02 = dn_dσ02 + np.dot(dH.T, iVy) + np.dot(Ht, div * y)
    F_ = np.dot(A_, dH)
    dC_dσ02 = dC_dσ02 + F_ + F_.T + np.dot(Ht * div, H)

    # compute kernel derivatives wrt lengthscale(s)
    dKrr_dl = self.kern.dK_dl(Z)
    dKxr_dl = self.kern.dK_dl(X, Z)
    # dkxx_dl = kern.dK_dl_diag(X)  # zero anyway

    # loop over all lengthscales
    num_lengthscales = dKrr_dl.shape[2]
    for d in range(0, num_lengthscales):
        delta = -0.5 * (np.sum(dL_dKrr * dKrr_dl[:, :, d])
                        + np.sum(dL_dKxr * dKxr_dl[:, :, d])
                        + np.sum(dL_dn * dn_dl[:, d])
                        + np.sum(dL_dC * dC_dl[:, :, d]))
        dψ_dl[d] = delta * scaleFact

        dH = dKxr_dl[:, :, d] - np.dot(H, dKrr_dl[:, :, d])
        dH = np.dot(dH, iKrr)
        dd = -np.sum(dH * Kxr, 1) - np.sum(H * dKxr_dl[:, :, d], 1)
        div = -α * dd / (v**2)
        dn_dl[:, d] = dn_dl[:, d] + np.dot(dH.T, iVy) + np.dot(Ht, div * y)
        F_ = np.dot(A_, dH)
        dC_dl[:, :, d] = dC_dl[:, :, d] + F_ + F_.T + np.dot(Ht * div, H)

    # Gaussian noise variance
    delta = -0.5 * (np.sum(dL_dn * dn_dσn2) + np.sum(dL_dC * dC_dσn2) + dL_d_dn)
    dψ_dσn2 = delta * scaleFact
    div = -1.0 / (v**2)
    dn_dσn2 = dn_dσn2 + np.dot(Ht, div * y)
    dC_dσn2 = dC_dσn2 + np.dot(Ht * div, H)

    m1 = np.dot(P1, n1)

    return (log_marginal_likelihood1, n1, m1, C1, P1, log_Det_C1,
            dn_dR, dC_dR, dψ_dR, dn_dσ02, dC_dσ02, dψ_dσ02,
            dn_dl, dC_dl, dψ_dl, dn_dσn2, dC_dσn2, dψ_dσn2)
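# The recursion above (n1 = n0 + H^T V^-1 y, C1 = C0 + H^T V^-1 H) is the
# standard information-form update for a linear-Gaussian model. A standalone
# NumPy check against the textbook covariance-form posterior, with posterior
# covariance P1 = (C0 + H^T V^-1 H)^-1 and mean m1 = P1 (n0 + H^T V^-1 y):
import numpy as np

rng = np.random.default_rng(4)
J_chk, B_chk = 3, 10              # latent dim, mini-batch size
C0_chk = np.eye(J_chk)            # prior precision
n0_chk = np.zeros(J_chk)          # prior natural mean
H_chk = rng.normal(size=(B_chk, J_chk))
v_chk = np.full(B_chk, 0.1)       # diagonal noise variances
y_chk = rng.normal(size=B_chk)

n1_chk = n0_chk + H_chk.T @ (y_chk / v_chk)
C1_chk = C0_chk + (H_chk.T / v_chk) @ H_chk
m1_chk = np.linalg.inv(C1_chk) @ n1_chk

# Covariance-form Bayesian update of the same linear model (prior N(0, C0^-1)):
S_chk = H_chk @ np.linalg.inv(C0_chk) @ H_chk.T + np.diag(v_chk)
gain = np.linalg.inv(C0_chk) @ H_chk.T @ np.linalg.inv(S_chk)
assert np.allclose(m1_chk, gain @ y_chk)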
def inference_root(self, kern, X, Z, likelihood, Y, Kuu_sigma=None,
                   Y_metadata=None, Lm=None, dL_dKmm=None):
    """
    The first phase of inference:
    Compute: log-likelihood, dL_dKmm

    Cached intermediate results: Kmm, KmmInv,
    """
    num_data, output_dim = Y.shape
    input_dim = Z.shape[0]
    num_data_total = allReduceArrays([np.int32(num_data)], self.mpi_comm)[0]

    uncertain_inputs = isinstance(X, VariationalPosterior)
    uncertain_outputs = isinstance(Y, VariationalPosterior)

    beta = 1. / np.fmax(likelihood.variance, 1e-6)

    psi0, psi2, YRY, psi1, psi1Y, Shalf, psi1S = self.gatherPsiStat(
        kern, X, Z, Y, beta, uncertain_inputs)

    #======================================================================
    # Compute Common Components
    #======================================================================

    try:
        Kmm = kern.K(Z).copy()
        if Kuu_sigma is not None:
            diag.add(Kmm, Kuu_sigma)
        else:
            diag.add(Kmm, self.const_jitter)
        Lm = jitchol(Kmm)

        LmInv = dtrtri(Lm)
        LmInvPsi2LmInvT = LmInv.dot(psi2.dot(LmInv.T))

        Lambda = np.eye(Kmm.shape[0]) + LmInvPsi2LmInvT
        LL = jitchol(Lambda)
        LLInv = dtrtri(LL)
        flag = np.zeros((1, ), dtype=np.int32)
        self.mpi_comm.Bcast(flag, root=self.root)
    except LinAlgError as e:
        flag = np.ones((1, ), dtype=np.int32)
        self.mpi_comm.Bcast(flag, root=self.root)
        raise e

    broadcastArrays([LmInv, LLInv], self.mpi_comm, self.root)
    LmLLInv = LLInv.dot(LmInv)

    logdet_L = 2. * np.sum(np.log(np.diag(LL)))
    b = psi1Y.dot(LmLLInv.T)
    bbt = np.square(b).sum()
    v = b.dot(LmLLInv)
    LLinvPsi1TYYTPsi1LLinvT = tdot(b.T)

    if psi1S is not None:
        psi1SLLinv = psi1S.dot(LmLLInv.T)
        bbt_sum = np.square(psi1SLLinv).sum()
        LLinvPsi1TYYTPsi1LLinvT_sum = tdot(psi1SLLinv.T)
        bbt_sum, LLinvPsi1TYYTPsi1LLinvT_sum = reduceArrays(
            [bbt_sum, LLinvPsi1TYYTPsi1LLinvT_sum], self.mpi_comm, self.root)
        bbt += bbt_sum
        LLinvPsi1TYYTPsi1LLinvT += LLinvPsi1TYYTPsi1LLinvT_sum
        psi1SP = psi1SLLinv.dot(LmLLInv)

    tmp = -LLInv.T.dot(LLinvPsi1TYYTPsi1LLinvT + output_dim * np.eye(input_dim)).dot(LLInv)
    dL_dpsi2R = LmInv.T.dot(tmp + output_dim * np.eye(input_dim)).dot(LmInv) / 2.
    broadcastArrays([dL_dpsi2R], self.mpi_comm, self.root)

    #======================================================================
    # Compute log-likelihood
    #======================================================================

    logL_R = -num_data_total * np.log(beta)
    logL = -(output_dim * (num_data_total * log_2_pi + logL_R + psi0
                           - np.trace(LmInvPsi2LmInvT))
             + YRY - bbt) / 2. - output_dim * logdet_L / 2.

    #======================================================================
    # Compute dL_dKmm
    #======================================================================

    dL_dKmm = dL_dpsi2R - output_dim * LmInv.T.dot(LmInvPsi2LmInvT).dot(LmInv) / 2.

    #======================================================================
    # Compute the Posterior distribution of inducing points p(u|Y)
    #======================================================================

    wd_inv = backsub_both_sides(
        Lm,
        np.eye(input_dim) - backsub_both_sides(LL, np.identity(input_dim),
                                               transpose='left'),
        transpose='left')
    post = Posterior(woodbury_inv=wd_inv, woodbury_vector=v.T, K=Kmm,
                     mean=None, cov=None, K_chol=Lm)

    #======================================================================
    # Compute dL_dthetaL for uncertain input and non-heteroscedastic noise
    #======================================================================

    dL_dthetaL = (YRY * beta + beta * output_dim * psi0
                  - num_data_total * output_dim * beta) / 2. \
                 - beta * (dL_dpsi2R * psi2).sum() \
                 - beta * np.trace(LLinvPsi1TYYTPsi1LLinvT)

    #======================================================================
    # Compute dL_dpsi
    #======================================================================

    dL_dpsi0 = -output_dim * (beta * np.ones((num_data, ))) / 2.

    if uncertain_outputs:
        m, s = Y.mean, Y.variance
        dL_dpsi1 = beta * (np.dot(m, v) + Shalf[:, None] * psi1SP)
    else:
        dL_dpsi1 = beta * np.dot(Y, v)

    if uncertain_inputs:
        dL_dpsi2 = beta * dL_dpsi2R
    else:
        dL_dpsi1 += np.dot(psi1, dL_dpsi2R) * 2.
        dL_dpsi2 = None

    if uncertain_inputs:
        grad_dict = {
            'dL_dKmm': dL_dKmm,
            'dL_dpsi0': dL_dpsi0,
            'dL_dpsi1': dL_dpsi1,
            'dL_dpsi2': dL_dpsi2,
            'dL_dthetaL': dL_dthetaL
        }
    else:
        grad_dict = {
            'dL_dKmm': dL_dKmm,
            'dL_dKdiag': dL_dpsi0,
            'dL_dKnm': dL_dpsi1,
            'dL_dthetaL': dL_dthetaL
        }

    if uncertain_outputs:
        m, s = Y.mean, Y.variance
        psi1LmiLLi = psi1.dot(LmLLInv.T)
        LLiLmipsi1Y = b.T
        grad_dict['dL_dYmean'] = -m * beta + psi1LmiLLi.dot(LLiLmipsi1Y)
        grad_dict['dL_dYvar'] = beta / -2. + np.square(psi1LmiLLi).sum(axis=1) / 2

    return post, logL, grad_dict
def inference(self, kern, X, Z, likelihood, Y, qU_mean, qU_var, Kuu_sigma=None):
    """
    The SVI-VarDTC inference
    """
    N, D, M, Q = Y.shape[0], Y.shape[1], Z.shape[0], Z.shape[1]

    uncertain_inputs = isinstance(X, VariationalPosterior)
    uncertain_outputs = isinstance(Y, VariationalPosterior)

    beta = 1. / likelihood.variance

    psi0, psi2, YRY, psi1, psi1Y = self.gatherPsiStat(kern, X, Z, Y, beta, uncertain_inputs)

    #======================================================================
    # Compute Common Components
    #======================================================================

    Kuu = kern.K(Z).copy()
    if Kuu_sigma is not None:
        diag.add(Kuu, Kuu_sigma)
    else:
        diag.add(Kuu, self.const_jitter)
    Lm = jitchol(Kuu)

    mu, S = qU_mean, qU_var
    Ls = jitchol(S)
    LinvLs = dtrtrs(Lm, Ls)[0]
    Linvmu = dtrtrs(Lm, mu)[0]
    psi1YLinvT = dtrtrs(Lm, psi1Y.T)[0].T

    self.mid = {'qU_L': Ls, 'LinvLu': LinvLs, 'L': Lm, 'Linvmu': Linvmu}

    if uncertain_inputs:
        LmInvPsi2LmInvT = backsub_both_sides(Lm, psi2, 'right')
    else:
        LmInvPsi2LmInvT = tdot(dtrtrs(Lm, psi1.T)[0]) / beta

    LmInvSmuLmInvT = tdot(LinvLs) * D + tdot(Linvmu)

    #======================================================================
    # Compute log-likelihood
    #======================================================================

    logL_R = -N * np.log(beta)
    logL = -N*D*log_2_pi/2. - D*logL_R/2. - D*psi0/2. - YRY/2. \
           - (LmInvSmuLmInvT*LmInvPsi2LmInvT).sum()/2. \
           + np.trace(LmInvPsi2LmInvT)*D/2. + (Linvmu*psi1YLinvT.T).sum()

    #======================================================================
    # Compute dL_dKmm
    #======================================================================

    tmp1 = backsub_both_sides(Lm, LmInvSmuLmInvT.dot(LmInvPsi2LmInvT), 'left')
    tmp2 = Linvmu.dot(psi1YLinvT)
    tmp3 = backsub_both_sides(Lm, -D * LmInvPsi2LmInvT - tmp2 - tmp2.T, 'left') / 2.
    dL_dKmm = (tmp1 + tmp1.T) / 2. + tmp3

    #======================================================================
    # Compute dL_dthetaL for uncertain input and non-heteroscedastic noise
    #======================================================================

    dL_dthetaL = -D * N * beta / 2. - (
        -D * psi0 / 2. - YRY / 2.
        - (LmInvSmuLmInvT * LmInvPsi2LmInvT).sum() / 2.
        + np.trace(LmInvPsi2LmInvT) * D / 2.
        + (Linvmu * psi1YLinvT.T).sum()) * beta

    #======================================================================
    # Compute dL_dqU
    #======================================================================

    tmp1 = backsub_both_sides(Lm, -LmInvPsi2LmInvT, 'left')
    dL_dqU_mean = tmp1.dot(mu) + dtrtrs(Lm, psi1YLinvT.T, trans=1)[0]
    dL_dqU_var = D / 2. * tmp1

    #======================================================================
    # Compute the Posterior distribution of inducing points p(u|Y)
    #======================================================================

    KuuInvmu = dtrtrs(Lm, Linvmu, trans=1)[0]
    tmp = backsub_both_sides(Lm, np.eye(M) - tdot(LinvLs), 'left')

    post = Posterior(woodbury_inv=tmp, woodbury_vector=KuuInvmu, K=Kuu,
                     mean=mu, cov=S, K_chol=Lm)

    #======================================================================
    # Compute dL_dpsi
    #======================================================================

    dL_dpsi0 = -D * (beta * np.ones((N, ))) / 2.

    if uncertain_outputs:
        dL_dpsi1 = Y.mean.dot(dtrtrs(Lm, Linvmu, trans=1)[0].T) * beta
    else:
        dL_dpsi1 = Y.dot(dtrtrs(Lm, Linvmu, trans=1)[0].T) * beta

    dL_dpsi2 = beta * backsub_both_sides(Lm, D * np.eye(M) - LmInvSmuLmInvT, 'left') / 2.
    if not uncertain_inputs:
        dL_dpsi1 += psi1.dot(dL_dpsi2 + dL_dpsi2.T) / beta
        dL_dpsi2 = None

    if uncertain_inputs:
        grad_dict = {
            'dL_dKmm': dL_dKmm,
            'dL_dpsi0': dL_dpsi0,
            'dL_dpsi1': dL_dpsi1,
            'dL_dpsi2': dL_dpsi2,
            'dL_dthetaL': dL_dthetaL,
            'dL_dqU_mean': dL_dqU_mean,
            'dL_dqU_var': dL_dqU_var
        }
    else:
        grad_dict = {
            'dL_dKmm': dL_dKmm,
            'dL_dKdiag': dL_dpsi0,
            'dL_dKnm': dL_dpsi1,
            'dL_dthetaL': dL_dthetaL,
            'dL_dqU_mean': dL_dqU_mean,
            'dL_dqU_var': dL_dqU_var
        }

    if uncertain_outputs:
        m, s = Y.mean, Y.variance
        grad_dict['dL_dYmean'] = -m * beta + dtrtrs(Lm, psi1.T)[0].T.dot(dtrtrs(Lm, mu)[0])
        grad_dict['dL_dYvar'] = beta / -2.

    return post, logL, grad_dict
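# SVI-VarDTC bounds like the one above are typically assembled together with a
# KL(q(u) || p(u)) term between the Gaussian q(u) = N(mu, S) and the prior
# p(u) = N(0, Kuu); depending on the implementation that term may be added by
# the calling model rather than inside inference(). For reference, the closed
# form of that Gaussian KL, checked by Monte Carlo on a small standalone
# example (illustrative only):
import numpy as np
from scipy.stats import multivariate_normal

rng = np.random.default_rng(5)
M_kl = 4
A_kl = rng.normal(size=(M_kl, M_kl))
Kuu_kl = A_kl @ A_kl.T + M_kl * np.eye(M_kl)  # prior covariance
mu_kl = rng.normal(size=M_kl)
S_kl = 0.5 * np.eye(M_kl)                     # q covariance

Kinv = np.linalg.inv(Kuu_kl)
kl = 0.5 * (np.trace(Kinv @ S_kl) + mu_kl @ Kinv @ mu_kl - M_kl
            + np.linalg.slogdet(Kuu_kl)[1] - np.linalg.slogdet(S_kl)[1])

# Monte Carlo estimate of E_q[log q(u) - log p(u)]:
q = multivariate_normal(mean=mu_kl, cov=S_kl)
p = multivariate_normal(mean=np.zeros(M_kl), cov=Kuu_kl)
u = q.rvs(size=200000, random_state=0)
kl_mc = np.mean(q.logpdf(u) - p.logpdf(u))
assert abs(kl - kl_mc) < 0.05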
def inference(self, kern, X, Z, likelihood, Y, qU):
    """
    The SVI-VarDTC inference
    """
    if isinstance(Y, np.ndarray) and np.any(np.isnan(Y)):
        missing_data = True
        N, M, Q = Y.shape[0], Z.shape[0], Z.shape[1]
        Ds = Y.shape[1] - (np.isnan(Y) * 1).sum(1)
        Ymask = 1 - np.isnan(Y) * 1
        Y_masked = np.zeros_like(Y)
        Y_masked[Ymask == 1] = Y[Ymask == 1]
        ND = Ymask.sum()
    else:
        missing_data = False
        N, D, M, Q = Y.shape[0], Y.shape[1], Z.shape[0], Z.shape[1]
        ND = N * D

    uncertain_inputs = isinstance(X, VariationalPosterior)
    uncertain_outputs = isinstance(Y, VariationalPosterior)

    beta = 1. / np.fmax(likelihood.variance, 1e-6)

    psi0, psi2, YRY, psi1, psi1Y = self.gatherPsiStat(
        kern, X, Z, Y if not missing_data else Y_masked, beta,
        uncertain_inputs, D if not missing_data else Ds, missing_data)

    #======================================================================
    # Compute Common Components
    #======================================================================

    mu, S = qU.mean, qU.covariance
    mupsi1Y = mu.dot(psi1Y)

    Kmm = kern.K(Z).copy()
    diag.add(Kmm, self.const_jitter)
    Lm = jitchol(Kmm)

    if missing_data:
        S_mu = S[None, :, :] + mu.T[:, :, None] * mu.T[:, None, :]
        NS_mu = S_mu.T.dot(Ymask.T).T
        LmInv = dtrtri(Lm)

        LmInvPsi2LmInvT = np.swapaxes(psi2.dot(LmInv.T), 1, 2).dot(LmInv.T)
        LmInvSmuLmInvT = np.swapaxes(NS_mu.dot(LmInv.T), 1, 2).dot(LmInv.T)

        B = mupsi1Y + mupsi1Y.T + (Ds[:, None, None] * psi2).sum(0)
        tmp = backsub_both_sides(Lm, B, 'right')

        logL = -ND*log_2_pi/2. + ND*np.log(beta)/2. - psi0/2. - YRY/2. \
               - (LmInvSmuLmInvT*LmInvPsi2LmInvT).sum()/2. + np.trace(tmp)/2.
    else:
        S_mu = S * D + tdot(mu)
        if uncertain_inputs:
            LmInvPsi2LmInvT = backsub_both_sides(Lm, psi2, 'right')
        else:
            LmInvPsi2LmInvT = tdot(dtrtrs(Lm, psi1.T)[0]) / beta  # tdot(psi1.dot(LmInv.T).T)/beta
        LmInvSmuLmInvT = backsub_both_sides(Lm, S_mu, 'right')

        B = mupsi1Y + mupsi1Y.T + D * psi2
        tmp = backsub_both_sides(Lm, B, 'right')

        logL = -ND*log_2_pi/2. + ND*np.log(beta)/2. - psi0/2. - YRY/2. \
               - (LmInvSmuLmInvT*LmInvPsi2LmInvT).sum()/2. + np.trace(tmp)/2.

    #======================================================================
    # Compute dL_dKmm
    #======================================================================

    dL_dKmm = np.eye(M)

    #======================================================================
    # Compute dL_dthetaL for uncertain input and non-heteroscedastic noise
    #======================================================================

    dL_dthetaL = None  # (YRY*beta + beta*output_dim*psi0 - num_data*output_dim*beta)/2. - beta*(dL_dpsi2R*psi2).sum() - beta*np.trace(LLinvPsi1TYYTPsi1LLinvT)

    #======================================================================
    # Compute dL_dpsi
    #======================================================================

    if missing_data:
        dL_dpsi0 = -Ds * (beta * np.ones((N, ))) / 2.
    else:
        dL_dpsi0 = -D * (beta * np.ones((N, ))) / 2.

    if uncertain_outputs:
        Ym, Ys = Y.mean, Y.variance
        dL_dpsi1 = dtrtrs(Lm, dtrtrs(Lm, Ym.dot(mu.T).T)[0], trans=1)[0].T * beta
    else:
        if missing_data:
            dL_dpsi1 = dtrtrs(Lm, dtrtrs(Lm, (Y_masked).dot(mu.T).T)[0], trans=1)[0].T * beta
        else:
            dL_dpsi1 = dtrtrs(Lm, dtrtrs(Lm, Y.dot(mu.T).T)[0], trans=1)[0].T * beta

    if uncertain_inputs:
        if missing_data:
            dL_dpsi2 = np.swapaxes(
                (Ds[:, None, None] * np.eye(M)[None, :, :] - LmInvSmuLmInvT).dot(LmInv),
                1, 2).dot(LmInv) * beta / 2.
        else:
            dL_dpsi2 = beta * backsub_both_sides(Lm, D * np.eye(M) - LmInvSmuLmInvT, 'left') / 2.
    else:
        dL_dpsi1 += beta * psi1.dot(dL_dpsi2 + dL_dpsi2.T)
        dL_dpsi2 = None

    if uncertain_inputs:
        grad_dict = {
            'dL_dKmm': dL_dKmm,
            'dL_dpsi0': dL_dpsi0,
            'dL_dpsi1': dL_dpsi1,
            'dL_dpsi2': dL_dpsi2,
            'dL_dthetaL': dL_dthetaL
        }
    else:
        grad_dict = {
            'dL_dKmm': dL_dKmm,
            'dL_dKdiag': dL_dpsi0,
            'dL_dKnm': dL_dpsi1,
            'dL_dthetaL': dL_dthetaL
        }

    if uncertain_outputs:
        Ym = Y.mean
        grad_dict['dL_dYmean'] = -Ym * beta + dtrtrs(Lm, psi1.T)[0].T.dot(dtrtrs(Lm, mu)[0])
        grad_dict['dL_dYvar'] = beta / -2.

    return logL, grad_dict
def inference(self, kern, X, Z, likelihood, Y, indexD, output_dim,
              Y_metadata=None, Lm=None, dL_dKmm=None, Kuu_sigma=None):
    """
    The first phase of inference:
    Compute: log-likelihood, dL_dKmm

    Cached intermediate results: Kmm, KmmInv,
    """
    input_dim = Z.shape[0]

    uncertain_inputs = isinstance(X, VariationalPosterior)

    beta = 1. / likelihood.variance
    if len(beta) == 1:
        beta = np.zeros(output_dim) + beta

    beta_exp = np.zeros(indexD.shape[0])
    for d in range(output_dim):
        beta_exp[indexD == d] = beta[d]

    psi0, psi1, psi2 = self.gatherPsiStat(kern, X, Z, Y, beta, uncertain_inputs)

    psi2_sum = (beta_exp[:, None, None] * psi2).sum(0) / output_dim

    #======================================================================
    # Compute Common Components
    #======================================================================

    Kmm = kern.K(Z).copy()
    if Kuu_sigma is not None:
        diag.add(Kmm, Kuu_sigma)
    else:
        diag.add(Kmm, self.const_jitter)
    Lm = jitchol(Kmm)

    logL = 0.
    dL_dthetaL = np.zeros(output_dim)
    dL_dKmm = np.zeros_like(Kmm)
    dL_dpsi0 = np.zeros_like(psi0)
    dL_dpsi1 = np.zeros_like(psi1)
    dL_dpsi2 = np.zeros_like(psi2)
    wv = np.empty((Kmm.shape[0], output_dim))

    for d in range(output_dim):
        idx_d = indexD == d
        Y_d = Y[idx_d]
        N_d = Y_d.shape[0]
        beta_d = beta[d]

        psi2_d = psi2[idx_d].sum(0) * beta_d
        psi1Y = Y_d.T.dot(psi1[idx_d]) * beta_d
        psi0_d = psi0[idx_d].sum() * beta_d
        YRY_d = np.square(Y_d).sum() * beta_d

        LmInvPsi2LmInvT = backsub_both_sides(Lm, psi2_d, 'right')

        Lambda = np.eye(Kmm.shape[0]) + LmInvPsi2LmInvT
        LL = jitchol(Lambda)
        LmLL = Lm.dot(LL)

        b = dtrtrs(LmLL, psi1Y.T)[0].T
        bbt = np.square(b).sum()
        v = dtrtrs(LmLL, b.T, trans=1)[0].T
        LLinvPsi1TYYTPsi1LLinvT = tdot(b.T)

        tmp = -backsub_both_sides(LL, LLinvPsi1TYYTPsi1LLinvT)
        dL_dpsi2R = backsub_both_sides(Lm, tmp + np.eye(input_dim)) / 2

        logL_R = -N_d * np.log(beta_d)
        logL += -((N_d * log_2_pi + logL_R + psi0_d
                   - np.trace(LmInvPsi2LmInvT)) + YRY_d - bbt) / 2.

        dL_dKmm += dL_dpsi2R - backsub_both_sides(Lm, LmInvPsi2LmInvT) / 2

        dL_dthetaL[d:d+1] = (YRY_d * beta_d + beta_d * psi0_d - N_d * beta_d) / 2. \
                            - beta_d * (dL_dpsi2R * psi2_d).sum() \
                            - beta_d * np.trace(LLinvPsi1TYYTPsi1LLinvT)

        dL_dpsi0[idx_d] = -beta_d / 2.
        dL_dpsi1[idx_d] = beta_d * np.dot(Y_d, v)
        dL_dpsi2[idx_d] = beta_d * dL_dpsi2R
        wv[:, d] = v

    LmInvPsi2LmInvT = backsub_both_sides(Lm, psi2_sum, 'right')

    Lambda = np.eye(Kmm.shape[0]) + LmInvPsi2LmInvT
    LL = jitchol(Lambda)
    LmLL = Lm.dot(LL)
    logdet_L = 2. * np.sum(np.log(np.diag(LL)))
    dL_dpsi2R_common = dpotri(LmLL)[0] / -2.
    dL_dpsi2 += dL_dpsi2R_common[None, :, :] * beta_exp[:, None, None]

    for d in range(output_dim):
        dL_dthetaL[d] += (dL_dpsi2R_common * psi2[indexD == d].sum(0)).sum() * -beta[d] * beta[d]

    dL_dKmm += dL_dpsi2R_common * output_dim

    logL += -output_dim * logdet_L / 2.

    #======================================================================
    # Compute the Posterior distribution of inducing points p(u|Y)
    #======================================================================

    LLInvLmT = dtrtrs(LL, Lm.T)[0]
    cov = tdot(LLInvLmT.T)

    wd_inv = backsub_both_sides(
        Lm,
        np.eye(input_dim) - backsub_both_sides(LL, np.identity(input_dim),
                                               transpose='left'),
        transpose='left')
    post = Posterior(woodbury_inv=wd_inv, woodbury_vector=wv, K=Kmm,
                     mean=None, cov=cov, K_chol=Lm)

    #======================================================================
    # Compute dL_dpsi
    #======================================================================

    if not uncertain_inputs:
        dL_dpsi1 += (psi1[:, None, :] * dL_dpsi2).sum(2) * 2.

    if uncertain_inputs:
        grad_dict = {
            'dL_dKmm': dL_dKmm,
            'dL_dpsi0': dL_dpsi0,
            'dL_dpsi1': dL_dpsi1,
            'dL_dpsi2': dL_dpsi2,
            'dL_dthetaL': dL_dthetaL
        }
    else:
        grad_dict = {
            'dL_dKmm': dL_dKmm,
            'dL_dKdiag': dL_dpsi0,
            'dL_dKnm': dL_dpsi1,
            'dL_dthetaL': dL_dthetaL
        }

    return post, logL, grad_dict
def inference(self, kern, X, Z, likelihood, Y, Y_metadata=None, Lm=None,
              dL_dKmm=None, Kuu_sigma=None):
    """
    The first phase of inference:
    Compute: log-likelihood, dL_dKmm

    Cached intermediate results: Kmm, KmmInv,
    """
    num_data, output_dim = Y.shape
    input_dim = Z.shape[0]

    uncertain_inputs = isinstance(X, VariationalPosterior)
    uncertain_outputs = isinstance(Y, VariationalPosterior)

    # from ..models.sslvm import Gaussian_Gamma
    # if isinstance(likelihood, Gaussian_Gamma):
    #     beta = likelihood.expectation_beta()
    #     logL_R = -num_data*likelihood.expectation_logbeta()
    # else:
    beta = 1./np.fmax(likelihood.variance, 1e-6)
    logL_R = -num_data*np.log(beta)

    psi0, psi2, YRY, psi1, psi1Y, Shalf, psi1S = self.gatherPsiStat(kern, X, Z, Y, beta, uncertain_inputs)

    #======================================================================
    # Compute Common Components
    #======================================================================

    Kmm = kern.K(Z).copy()
    if Kuu_sigma is not None:
        diag.add(Kmm, Kuu_sigma)
    else:
        diag.add(Kmm, self.const_jitter)
    Lm = jitchol(Kmm)
    # LmInv = dtrtri(Lm)

    if uncertain_inputs:
        LmInvPsi2LmInvT = backsub_both_sides(Lm, psi2, 'right')
    else:
        LmInvPsi2LmInvT = tdot(dtrtrs(Lm, psi1.T)[0])/beta  # tdot(psi1.dot(LmInv.T).T)/beta

    Lambda = np.eye(Kmm.shape[0]) + LmInvPsi2LmInvT
    LL = jitchol(Lambda)
    LmLL = Lm.dot(LL)
    # LLInv = dtrtri(LL)
    # LmLLInv = LLInv.dot(LmInv)

    logdet_L = 2.*np.sum(np.log(np.diag(LL)))

    b = dtrtrs(LmLL, psi1Y.T)[0].T  # psi1Y.dot(LmLLInv.T)
    bbt = np.square(b).sum()
    v = dtrtrs(LmLL, b.T, trans=1)[0].T  # b.dot(LmLLInv)
    LLinvPsi1TYYTPsi1LLinvT = tdot(b.T)

    if psi1S is not None:
        psi1SLLinv = dtrtrs(LmLL, psi1S.T)[0].T  # psi1S.dot(LmLLInv.T)
        bbt += np.square(psi1SLLinv).sum()
        LLinvPsi1TYYTPsi1LLinvT += tdot(psi1SLLinv.T)
        psi1SP = dtrtrs(LmLL, psi1SLLinv.T, trans=1)[0].T  # psi1SLLinv.dot(LmLLInv)

    tmp = -backsub_both_sides(LL, LLinvPsi1TYYTPsi1LLinvT + output_dim*np.eye(input_dim))
    dL_dpsi2R = backsub_both_sides(Lm, tmp + output_dim*np.eye(input_dim))/2
    # tmp = -LLInv.T.dot(LLinvPsi1TYYTPsi1LLinvT+output_dim*np.eye(input_dim)).dot(LLInv)
    # dL_dpsi2R = LmInv.T.dot(tmp+output_dim*np.eye(input_dim)).dot(LmInv)/2.

    #======================================================================
    # Compute log-likelihood
    #======================================================================

    logL = -(output_dim*(num_data*log_2_pi + logL_R + psi0 - np.trace(LmInvPsi2LmInvT)) + YRY - bbt)/2. \
        - output_dim*logdet_L/2.

    #======================================================================
    # Compute dL_dKmm
    #======================================================================

    dL_dKmm = dL_dpsi2R - output_dim*backsub_both_sides(Lm, LmInvPsi2LmInvT)/2  # LmInv.T.dot(LmInvPsi2LmInvT).dot(LmInv)/2.

    #======================================================================
    # Compute the Posterior distribution of inducing points p(u|Y)
    #======================================================================

    LLInvLmT = dtrtrs(LL, Lm.T)[0]
    cov = tdot(LLInvLmT.T)

    wd_inv = backsub_both_sides(
        Lm,
        np.eye(input_dim) - backsub_both_sides(LL, np.identity(input_dim), transpose='left'),
        transpose='left')
    post = Posterior(woodbury_inv=wd_inv, woodbury_vector=v.T, K=Kmm,
                     mean=None, cov=cov, K_chol=Lm)

    #======================================================================
    # Compute dL_dthetaL for uncertain inputs and non-heteroscedastic noise
    #======================================================================

    # if isinstance(likelihood, Gaussian_Gamma):
    #     from scipy.special import polygamma
    #     dL_dthetaL = ((YRY + output_dim*psi0)/2. - (dL_dpsi2R*psi2).sum()
    #                   - np.trace(LLinvPsi1TYYTPsi1LLinvT))/-beta
    #     likelihood.q_a.gradient = num_data*output_dim/2.*polygamma(1, likelihood.q_a) + dL_dthetaL/likelihood.q_b
    #     likelihood.q_b.gradient = num_data*output_dim/(-2.*likelihood.q_b) + dL_dthetaL*(-likelihood.q_a/(likelihood.q_b*likelihood.q_b))
    # else:
    dL_dthetaL = (YRY*beta + beta*output_dim*psi0 - num_data*output_dim*beta)/2. \
        - beta*(dL_dpsi2R*psi2).sum() - beta*np.trace(LLinvPsi1TYYTPsi1LLinvT)

    #======================================================================
    # Compute dL_dpsi
    #======================================================================

    dL_dpsi0 = -output_dim * (beta * np.ones((num_data,)))/2.

    if uncertain_outputs:
        m, s = Y.mean, Y.variance
        dL_dpsi1 = beta*(np.dot(m, v) + Shalf[:, None]*psi1SP)
    else:
        dL_dpsi1 = beta*np.dot(Y, v)

    if uncertain_inputs:
        dL_dpsi2 = beta*dL_dpsi2R
    else:
        dL_dpsi1 += np.dot(psi1, dL_dpsi2R)*2.
        dL_dpsi2 = None

    if uncertain_inputs:
        grad_dict = {'dL_dKmm': dL_dKmm,
                     'dL_dpsi0': dL_dpsi0,
                     'dL_dpsi1': dL_dpsi1,
                     'dL_dpsi2': dL_dpsi2,
                     'dL_dthetaL': dL_dthetaL}
    else:
        grad_dict = {'dL_dKmm': dL_dKmm,
                     'dL_dKdiag': dL_dpsi0,
                     'dL_dKnm': dL_dpsi1,
                     'dL_dthetaL': dL_dthetaL}

    if uncertain_outputs:
        m, s = Y.mean, Y.variance
        psi1LmiLLi = dtrtrs(LmLL, psi1.T)[0].T
        LLiLmipsi1Y = b.T
        grad_dict['dL_dYmean'] = -m*beta + psi1LmiLLi.dot(LLiLmipsi1Y)
        grad_dict['dL_dYvar'] = beta/-2. + np.square(psi1LmiLLi).sum(axis=1)/2

    return post, logL, grad_dict
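# ----------------------------------------------------------------------
# The Posterior object built above encodes two standard collapsed-posterior
# identities for the inducing values u. A standalone NumPy sketch (explicit
# inverses for clarity only; shapes are arbitrary) checking that the Lm/LL
# assembly matches them:
#   cov    = Kmm (Kmm + Psi2)^{-1} Kmm
#   wd_inv = Kmm^{-1} - (Kmm + Psi2)^{-1}
import numpy as np
from numpy.linalg import inv, cholesky

rng = np.random.default_rng(1)
M = 4
A = rng.standard_normal((M, M)); Kmm_chk = A.dot(A.T) + M*np.eye(M)
B = rng.standard_normal((M, M)); Psi2_chk = B.dot(B.T)

Lm_chk = cholesky(Kmm_chk)                                # lower triangular
LL_chk = cholesky(np.eye(M) + inv(Lm_chk).dot(Psi2_chk).dot(inv(Lm_chk).T))

cov_chk = Lm_chk.dot(inv(LL_chk).T).dot(inv(LL_chk)).dot(Lm_chk.T)   # tdot(dtrtrs(LL, Lm.T)[0].T)
wd_inv_chk = inv(Lm_chk).T.dot(np.eye(M) - inv(LL_chk).T.dot(inv(LL_chk))).dot(inv(Lm_chk))

assert np.allclose(cov_chk, Kmm_chk.dot(inv(Kmm_chk + Psi2_chk)).dot(Kmm_chk))
assert np.allclose(wd_inv_chk, inv(Kmm_chk) - inv(Kmm_chk + Psi2_chk))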
def inference_root(self, kern, X, Z, likelihood, Y, Kuu_sigma=None,
                   Y_metadata=None, Lm=None, dL_dKmm=None):
    """
    The first phase of inference (root process):
    Compute: log-likelihood, dL_dKmm

    Cached intermediate results: Kmm, KmmInv,
    """
    num_data, output_dim = Y.shape
    input_dim = Z.shape[0]
    num_data_total = allReduceArrays([np.int32(num_data)], self.mpi_comm)[0]

    uncertain_inputs = isinstance(X, VariationalPosterior)
    uncertain_outputs = isinstance(Y, VariationalPosterior)

    beta = 1./np.fmax(likelihood.variance, 1e-6)

    psi0, psi2, YRY, psi1, psi1Y, Shalf, psi1S = self.gatherPsiStat(kern, X, Z, Y, beta, uncertain_inputs)

    #======================================================================
    # Compute Common Components
    #======================================================================

    # Broadcast a success/failure flag so the worker processes do not
    # deadlock waiting for arrays if a factorization fails on the root.
    try:
        Kmm = kern.K(Z).copy()
        if Kuu_sigma is not None:
            diag.add(Kmm, Kuu_sigma)
        else:
            diag.add(Kmm, self.const_jitter)
        Lm = jitchol(Kmm)

        LmInv = dtrtri(Lm)
        LmInvPsi2LmInvT = LmInv.dot(psi2.dot(LmInv.T))
        Lambda = np.eye(Kmm.shape[0]) + LmInvPsi2LmInvT
        LL = jitchol(Lambda)
        LLInv = dtrtri(LL)
        flag = np.zeros((1,), dtype=np.int32)
        self.mpi_comm.Bcast(flag, root=self.root)
    except LinAlgError as e:
        flag = np.ones((1,), dtype=np.int32)
        self.mpi_comm.Bcast(flag, root=self.root)
        raise e

    broadcastArrays([LmInv, LLInv], self.mpi_comm, self.root)
    LmLLInv = LLInv.dot(LmInv)

    logdet_L = 2.*np.sum(np.log(np.diag(LL)))
    b = psi1Y.dot(LmLLInv.T)
    bbt = np.square(b).sum()
    v = b.dot(LmLLInv)
    LLinvPsi1TYYTPsi1LLinvT = tdot(b.T)

    if psi1S is not None:
        # Partial statistics from this process's slice of the data are
        # summed across processes on the root.
        psi1SLLinv = psi1S.dot(LmLLInv.T)
        bbt_sum = np.square(psi1SLLinv).sum()
        LLinvPsi1TYYTPsi1LLinvT_sum = tdot(psi1SLLinv.T)
        bbt_sum, LLinvPsi1TYYTPsi1LLinvT_sum = reduceArrays(
            [bbt_sum, LLinvPsi1TYYTPsi1LLinvT_sum], self.mpi_comm, self.root)
        bbt += bbt_sum
        LLinvPsi1TYYTPsi1LLinvT += LLinvPsi1TYYTPsi1LLinvT_sum
        psi1SP = psi1SLLinv.dot(LmLLInv)

    tmp = -LLInv.T.dot(LLinvPsi1TYYTPsi1LLinvT + output_dim*np.eye(input_dim)).dot(LLInv)
    dL_dpsi2R = LmInv.T.dot(tmp + output_dim*np.eye(input_dim)).dot(LmInv)/2.
    broadcastArrays([dL_dpsi2R], self.mpi_comm, self.root)

    #======================================================================
    # Compute log-likelihood
    #======================================================================

    logL_R = -num_data_total*np.log(beta)
    logL = -(output_dim*(num_data_total*log_2_pi + logL_R + psi0 - np.trace(LmInvPsi2LmInvT)) + YRY - bbt)/2. \
        - output_dim*logdet_L/2.

    #======================================================================
    # Compute dL_dKmm
    #======================================================================

    dL_dKmm = dL_dpsi2R - output_dim*LmInv.T.dot(LmInvPsi2LmInvT).dot(LmInv)/2.

    #======================================================================
    # Compute the Posterior distribution of inducing points p(u|Y)
    #======================================================================

    wd_inv = backsub_both_sides(
        Lm,
        np.eye(input_dim) - backsub_both_sides(LL, np.identity(input_dim), transpose='left'),
        transpose='left')
    post = Posterior(woodbury_inv=wd_inv, woodbury_vector=v.T, K=Kmm,
                     mean=None, cov=None, K_chol=Lm)

    #======================================================================
    # Compute dL_dthetaL for uncertain inputs and non-heteroscedastic noise
    #======================================================================

    dL_dthetaL = (YRY*beta + beta*output_dim*psi0 - num_data_total*output_dim*beta)/2. \
        - beta*(dL_dpsi2R*psi2).sum() - beta*np.trace(LLinvPsi1TYYTPsi1LLinvT)

    #======================================================================
    # Compute dL_dpsi
    #======================================================================

    dL_dpsi0 = -output_dim * (beta * np.ones((num_data,)))/2.

    if uncertain_outputs:
        m, s = Y.mean, Y.variance
        dL_dpsi1 = beta*(np.dot(m, v) + Shalf[:, None]*psi1SP)
    else:
        dL_dpsi1 = beta*np.dot(Y, v)

    if uncertain_inputs:
        dL_dpsi2 = beta*dL_dpsi2R
    else:
        dL_dpsi1 += np.dot(psi1, dL_dpsi2R)*2.
        dL_dpsi2 = None

    if uncertain_inputs:
        grad_dict = {'dL_dKmm': dL_dKmm,
                     'dL_dpsi0': dL_dpsi0,
                     'dL_dpsi1': dL_dpsi1,
                     'dL_dpsi2': dL_dpsi2,
                     'dL_dthetaL': dL_dthetaL}
    else:
        grad_dict = {'dL_dKmm': dL_dKmm,
                     'dL_dKdiag': dL_dpsi0,
                     'dL_dKnm': dL_dpsi1,
                     'dL_dthetaL': dL_dthetaL}

    if uncertain_outputs:
        m, s = Y.mean, Y.variance
        psi1LmiLLi = psi1.dot(LmLLInv.T)
        LLiLmipsi1Y = b.T
        grad_dict['dL_dYmean'] = -m*beta + psi1LmiLLi.dot(LLiLmipsi1Y)
        grad_dict['dL_dYvar'] = beta/-2. + np.square(psi1LmiLLi).sum(axis=1)/2

    return post, logL, grad_dict
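# ----------------------------------------------------------------------
# Why the MPI split in inference_root is exact rather than approximate:
# every data-dependent statistic it reduces (num_data, psi1Y-style products,
# bbt, LLinvPsi1TYYTPsi1LLinvT) is a sum over data points, so each process
# can compute its share on its own data slice and the root adds the partial
# results. A minimal NumPy sketch of that additivity (no MPI needed;
# psi1_chk merely stands in for the psi-statistics, with arbitrary shapes):
import numpy as np

rng = np.random.default_rng(2)
N, M = 12, 3
psi1_chk = rng.standard_normal((N, M))
Y_chk = rng.standard_normal((N, 1))

full = psi1_chk.T.dot(Y_chk)                                    # psi1Y on all data at once
parts = [psi1_chk[w::3].T.dot(Y_chk[w::3]) for w in range(3)]   # three "workers"
assert np.allclose(full, sum(parts))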