def predictive(self, x_new, lik_noise=False):
    # Matrices
    q_m = self.q_m.detach().numpy()
    q_L = torch.tril(self.q_L)
    q_S = torch.mm(q_L, q_L.t()).detach().numpy()
    Kuu = self.kernel.K(self.z, self.z).detach().numpy()
    posterior = Posterior(mean=q_m, cov=q_S, K=Kuu, prior_mean=np.zeros(q_m.shape))
    Kx = self.kernel.K(self.z, x_new).detach().numpy()
    Kxx = self.kernel.K(x_new, x_new).detach().numpy()

    # GP Predictive Posterior - mean + variance
    gp_mu = np.dot(Kx.T, posterior.woodbury_vector)
    Kxx = np.diag(Kxx)
    gp_var = (Kxx - np.sum(np.dot(np.atleast_3d(posterior.woodbury_inv).T, Kx) * Kx[None, :, :], 1)).T

    gp = gp_mu
    if lik_noise:
        gp_upper = gp_mu + 2 * np.sqrt(gp_var) + 2 * self.likelihood.sigma.detach().numpy()
        gp_lower = gp_mu - 2 * np.sqrt(gp_var) - 2 * self.likelihood.sigma.detach().numpy()
    else:
        gp_upper = gp_mu + 2 * np.sqrt(gp_var)
        gp_lower = gp_mu - 2 * np.sqrt(gp_var)

    return gp, gp_upper, gp_lower
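# --- Hedged sketch (illustrative, not part of the original class) ---
# A minimal NumPy-only version of the predictive equations used above for a
# single output, assuming the posterior is summarised by a "woodbury vector"
# alpha and a "woodbury inverse" Wi so that
#   mean* = Kx^T alpha   and   var* = diag(Kxx) - sum((Wi Kx) * Kx, axis=0).
# All names below are assumptions for the sketch, not the original API.
import numpy as np

def _sketch_gp_predict(Kx, Kxx_diag, woodbury_vector, woodbury_inv):
    """Predictive mean and variance from precomputed woodbury quantities."""
    mu = Kx.T.dot(woodbury_vector)                                # (N*, P)
    var = Kxx_diag - np.sum(woodbury_inv.dot(Kx) * Kx, axis=0)    # (N*,)
    return mu, var[:, None]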
def _outer_loop_without_missing_data(self):
    if self.posterior is None:
        woodbury_inv = np.zeros((self.num_inducing, self.num_inducing, self.output_dim))
        woodbury_vector = np.zeros((self.num_inducing, self.output_dim))
    else:
        woodbury_inv = self.posterior._woodbury_inv
        woodbury_vector = self.posterior._woodbury_vector

    d = self.stochastics.d[0][0]
    posterior, log_marginal_likelihood, grad_dict = self._inner_parameters_changed(
        self.kern, self.X, self.Z, self.likelihood,
        self.Y_normalized[:, d], self.Y_metadata)
    self.grad_dict = grad_dict
    self._log_marginal_likelihood = log_marginal_likelihood
    self._outer_values_update(self.grad_dict)

    woodbury_inv[:, :, d] = posterior.woodbury_inv[:, :, None]
    woodbury_vector[:, d] = posterior.woodbury_vector

    if self.posterior is None:
        self.posterior = Posterior(woodbury_inv=woodbury_inv, woodbury_vector=woodbury_vector,
                                   K=posterior._K, mean=None, cov=None,
                                   K_chol=posterior.K_chol)
def _inference(K: np.ndarray, ga_approx: GaussianApproximation, cav_params: CavityParams,
               Z_tilde: float, y: List[Tuple[int, float]],
               yc: List[List[Tuple[int, int]]]) -> Tuple[Posterior, float, Dict]:
    """
    Compute the posterior approximation

    :param K: prior covariance matrix
    :param ga_approx: Gaussian approximation of the batches
    :param cav_params: cavity parameters of the posterior
    :param Z_tilde: log marginal likelihood correction term
    :param y: direct observations as a list of tuples telling location index (row in X) and observation value
    :param yc: batch comparisons in a list of lists of tuples. Each batch is a list and tuples tell the comparisons (winner index, loser index)
    :return: a tuple consisting of the posterior approximation, log marginal likelihood and gradient dictionary
    """
    log_marginal, post_params = _ep_marginal(K, ga_approx, Z_tilde, y, yc)

    tau_tilde_root = sqrtm_block(ga_approx.tau, y, yc)
    Sroot_tilde_K = np.dot(tau_tilde_root, K)

    aux_alpha, _ = dpotrs(post_params.L, np.dot(Sroot_tilde_K, ga_approx.v), lower=1)
    alpha = (ga_approx.v - np.dot(tau_tilde_root, aux_alpha))[:, None]  # (K + Sigma_tilde)^(-1) mu_tilde

    LWi, _ = dtrtrs(post_params.L, tau_tilde_root, lower=1)
    Wi = np.dot(LWi.T, LWi)
    symmetrify(Wi)  # (K + Sigma_tilde)^(-1)

    dL_dK = 0.5 * (tdot(alpha) - Wi)
    dL_dthetaL = 0
    return (Posterior(woodbury_inv=np.asfortranarray(Wi), woodbury_vector=alpha, K=K),
            log_marginal,
            {'dL_dK': dL_dK, 'dL_dthetaL': dL_dthetaL, 'dL_dm': alpha})
def inference(self, kern, X, W, likelihood, Y, mean_function=None, Y_metadata=None,
              K=None, variance=None, Z_tilde=None):
    """
    Returns a Posterior class containing essential quantities of the posterior
    """
    if mean_function is None:
        m = 0
    else:
        m = mean_function.f(X)

    if variance is None:
        variance = likelihood.gaussian_variance(Y_metadata)

    YYT_factor = Y - m

    if K is None:
        K = kern.K(X)

    Ky = K.copy()
    diag.add(Ky, variance + 1e-8)

    Wi, LW, LWi, W_logdet = pdinv(Ky)

    alpha, _ = dpotrs(LW, YYT_factor, lower=1)

    log_marginal = 0.5 * (-Y.size * log_2_pi - Y.shape[1] * W_logdet -
                          np.sum(alpha * YYT_factor))

    if Z_tilde is not None:
        # This is a correction term for the log marginal likelihood.
        # In EP this is log Z_tilde, which is the difference between the
        # Gaussian marginal and Z_EP.
        log_marginal += Z_tilde

    dL_dK = 0.5 * (tdot(alpha) - Y.shape[1] * Wi)

    dL_dthetaL = likelihood.exact_inference_gradients(np.diag(dL_dK), Y_metadata)

    posterior_ = Posterior(woodbury_chol=LW, woodbury_vector=alpha, K=K)
    return posterior_, log_marginal, {
        'dL_dK': dL_dK,
        'dL_dthetaL': dL_dthetaL,
        'dL_dm': alpha
    }, W_logdet
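# --- Hedged sketch (self-contained, illustrative names) ---
# A minimal NumPy/SciPy version of the exact Gaussian inference above:
# with Ky = K + sigma2*I and alpha = Ky^-1 Y, the log marginal likelihood is
#   0.5 * (-N*P*log(2*pi) - P*logdet(Ky) - sum(alpha * Y)).
# The fixed sigma2 and zero mean are assumptions made for the sketch.
import numpy as np
from scipy.linalg import cho_factor, cho_solve

def _sketch_exact_gp_inference(K, Y, sigma2=1.0):
    Ky = K + (sigma2 + 1e-8) * np.eye(K.shape[0])
    L, lower = cho_factor(Ky, lower=True)
    alpha = cho_solve((L, lower), Y)
    W_logdet = 2.0 * np.sum(np.log(np.diag(L)))   # logdet via Cholesky diagonal
    log_marginal = 0.5 * (-Y.size * np.log(2 * np.pi)
                          - Y.shape[1] * W_logdet
                          - np.sum(alpha * Y))
    return alpha, log_marginal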
def _outer_loop_for_missing_data(self):
    Lm = None
    dL_dKmm = None

    self._log_marginal_likelihood = 0
    self.full_values = self._outer_init_full_values()

    if self.posterior is None:
        woodbury_inv = np.zeros((self.num_inducing, self.num_inducing, self.output_dim))
        woodbury_vector = np.zeros((self.num_inducing, self.output_dim))
    else:
        woodbury_inv = self.posterior._woodbury_inv
        woodbury_vector = self.posterior._woodbury_vector

    if not self.stochastics:
        m_f = lambda i: "Inference with missing_data: {: >7.2%}".format(float(i + 1) / self.output_dim)
        message = m_f(-1)
        print(message, end=' ')

    for d in self.stochastics.d:
        ninan = self.ninan[:, d]

        if not self.stochastics:
            print(' ' * (len(message)) + '\r', end=' ')
            message = m_f(d)
            print(message, end=' ')

        posterior, log_marginal_likelihood, \
            grad_dict, current_values, value_indices = self._inner_parameters_changed(
                self.kern, self.X[ninan], self.Z, self.likelihood,
                self.Ylist[d], self.Y_metadata, Lm, dL_dKmm,
                subset_indices=dict(outputs=d, samples=ninan))
        self._inner_take_over_or_update(self.full_values, current_values, value_indices)
        self._inner_values_update(current_values)

        Lm = posterior.K_chol
        dL_dKmm = grad_dict['dL_dKmm']

        woodbury_inv[:, :, d] = posterior.woodbury_inv
        woodbury_vector[:, d:d + 1] = posterior.woodbury_vector
        self._log_marginal_likelihood += log_marginal_likelihood

    if not self.stochastics:
        print('')

    if self.posterior is None:
        self.posterior = Posterior(woodbury_inv=woodbury_inv, woodbury_vector=woodbury_vector,
                                   K=posterior._K, mean=None, cov=None,
                                   K_chol=posterior.K_chol)
    self._outer_values_update(self.full_values)
def vi_comparison(X: np.ndarray, y: List[Tuple[int, float]], yc: List[List[Tuple[int, int]]],
                  kern: GPy.kern.Kern, sigma2s: np.ndarray, alpha: np.ndarray, beta: np.ndarray,
                  max_iters: int = 200, lr: float = 1e-3, method: str = 'fr',
                  optimize: str = "adam",
                  get_logger: Callable = None) -> Tuple[Posterior, float, Dict, np.ndarray, np.ndarray]:
    """
    :param X: all locations of both direct observations and batch comparisons
    :param y: direct observations as a list of tuples telling location index (row in X) and observation value
    :param yc: batch comparisons in a list of lists of tuples. Each batch is a list and tuples tell the comparisons (winner index, loser index)
    :param kern: prior covariance kernel
    :param sigma2s: noise variance of the observations
    :param alpha: initial values for alpha
    :param beta: initial values for beta
    :param max_iters: maximum number of optimization iterations
    :param lr: learning rate of the optimizer
    :param method: full rank 'fr' or mean field 'mf' method
    :param optimize: optimization algorithm, 'adam' or 'L-BFGS-B'
    :param get_logger: function for receiving the logger where the prints are forwarded
    :return: a tuple containing the posterior, log marginal likelihood, its gradients with respect to hyperparameters (not supported at the moment) and alpha and beta values
    """
    if method == 'fr':
        recompute_posterior = recompute_posterior_fr
        s_to_l = dL_fr
    else:
        recompute_posterior = recompute_posterior_mf
        s_to_l = dL_mf

    K = kern.K(X)
    K = K + 1e-6 * np.identity(len(K))
    N = X.shape[0]
    X0 = np.r_[alpha, beta]
    args = [K, sigma2s, y, yc, recompute_posterior, s_to_l]

    if optimize == "adam":
        X, log_marginal, _ = adam(log_lik, X0.flatten(), args, bounds=None,
                                  max_it=max_iters, get_logger=get_logger)
    else:
        res = sp.optimize.minimize(fun=log_lik, x0=X0.flatten(), args=args,
                                   method='L-BFGS-B', jac=True, bounds=None)
        X = res.x.reshape(-1)
        log_marginal = res.fun

    alpha = X[:K.shape[0]].reshape(-1, 1)
    beta = X[K.shape[0]:].reshape(-1, 1)

    # Create posterior instance
    m, L, L_inv, KL, dKL_db_, dKL_da_ = recompute_posterior(alpha, beta, K)
    posterior = Posterior(mean=m, cov=L @ L.T, K=K)
    # Gradients with respect to hyperparameters are not supported at the moment:
    # grad_dict = {'dL_dK': dF_dK - dKL_dK, 'dL_dthetaL': dL_dthetaL}
    grad_dict = {}
    return posterior, log_marginal, grad_dict, alpha, beta
def _inference(K, ga_approx, cav_params, likelihood, Z_tilde, Y_metadata=None):
    log_marginal, post_params = _ep_marginal(K, ga_approx, Z_tilde)

    tau_tilde_root = np.sqrt(ga_approx.tau)
    Sroot_tilde_K = tau_tilde_root[:, None] * K

    aux_alpha, _ = dpotrs(post_params.L, np.dot(Sroot_tilde_K, ga_approx.v), lower=1)
    alpha = (ga_approx.v - tau_tilde_root * aux_alpha)[:, None]  # (K + Sigma_tilde)^(-1) mu_tilde

    LWi, _ = dtrtrs(post_params.L, np.diag(tau_tilde_root), lower=1)
    Wi = np.dot(LWi.T, LWi)
    symmetrify(Wi)  # (K + Sigma_tilde)^(-1)

    dL_dK = 0.5 * (tdot(alpha) - Wi)
    dL_dthetaL = 0
    # Alternative likelihood-gradient computations, kept for reference:
    # dL_dthetaL = likelihood.ep_gradients(Y, cav_params.tau, cav_params.v, np.diag(dL_dK),
    #                                      Y_metadata=Y_metadata, quad_mode='gh')
    # temp2 = likelihood.ep_gradients(Y, cav_params.tau, cav_params.v, np.diag(dL_dK),
    #                                 Y_metadata=Y_metadata, quad_mode='naive')
    # temp = likelihood.exact_inference_gradients(np.diag(dL_dK), Y_metadata=Y_metadata)
    # print("exact: {}, approx: {}, Ztilde: {}, naive: {}".format(temp, dL_dthetaL, Z_tilde, temp2))
    return (Posterior(woodbury_inv=Wi, woodbury_vector=alpha, K=K),
            log_marginal,
            {'dL_dK': dL_dK, 'dL_dthetaL': dL_dthetaL, 'dL_dm': alpha})
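# --- Hedged numerical check (illustrative only) ---
# The triangular solves above rely on the identity
#   sqrt(tau) (I + sqrt(tau) K sqrt(tau))^-1 sqrt(tau) = (K + diag(1/tau))^-1,
# assuming post_params.L is the Cholesky factor of B = I + sqrt(tau) K sqrt(tau).
# The sketch below verifies the identity with plain NumPy on a random PSD matrix.
import numpy as np

def _sketch_check_ep_identity(seed=0, n=5):
    rng = np.random.default_rng(seed)
    A = rng.standard_normal((n, n))
    K = A @ A.T + n * np.eye(n)        # a well-conditioned PSD "kernel"
    tau = rng.uniform(0.5, 2.0, n)     # site precisions
    root = np.sqrt(tau)
    B = np.eye(n) + (root[:, None] * K) * root[None, :]
    lhs = (root[:, None] * np.linalg.inv(B)) * root[None, :]
    rhs = np.linalg.inv(K + np.diag(1.0 / tau))
    return np.allclose(lhs, rhs)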
def posteriors(self, q_u_means, q_u_chols, X, Y, Z, kern_list, likelihood, B_list, Y_metadata):
    """
    Builds the q(f_d) posterior objects for all output functions.
    """
    ####### Dimensions #######
    D = likelihood.num_output_functions(Y_metadata)
    Q = len(kern_list)
    M = Z.shape[0]
    T = len(Y)
    dimensions = {'D': D, 'Q': Q, 'M': M, 'T': T}

    ####### Distributions #######
    Kuu, Luu, Kuui = multi_output.latent_funs_cov(Z, kern_list)
    p_U = pu(Kuu=Kuu, Luu=Luu, Kuui=Kuui)
    q_U = qu(mu_u=q_u_means, chols_u=q_u_chols)

    posteriors = []
    f_index = Y_metadata['function_index'].flatten()

    ####### q(F) posterior computations #######
    for d in range(D):
        Xtask = X[f_index[d]]
        q_fd = self.variational_q_fd(X=Xtask, Z=Z, q_U=q_U, p_U=p_U,
                                     kern_list=kern_list, B=B_list,
                                     N=Xtask.shape[0], dims=dimensions, d=d)
        Knew_d = multi_output.function_covariance(X=Xtask, B=B_list,
                                                  kernel_list=kern_list, d=d)
        posterior_fd = Posterior(mean=q_fd.m_fd.copy(), cov=q_fd.S_fd.copy(),
                                 K=Knew_d, prior_mean=np.zeros(q_fd.m_fd.shape))
        posteriors.append(posterior_fd)
    return posteriors
def inference(self, kern, X, Z, likelihood, Y, indexD, output_dim,
              Y_metadata=None, Lm=None, dL_dKmm=None, Kuu_sigma=None):
    """
    The first phase of inference:
    Compute: log-likelihood, dL_dKmm

    Cached intermediate results: Kmm, KmmInv,
    """
    input_dim = Z.shape[0]

    uncertain_inputs = isinstance(X, VariationalPosterior)

    beta = 1. / likelihood.variance
    if len(beta) == 1:
        beta = np.zeros(output_dim) + beta

    beta_exp = np.zeros(indexD.shape[0])
    for d in range(output_dim):
        beta_exp[indexD == d] = beta[d]

    psi0, psi1, psi2 = self.gatherPsiStat(kern, X, Z, Y, beta, uncertain_inputs)

    psi2_sum = (beta_exp[:, None, None] * psi2).sum(0) / output_dim

    #======================================================================
    # Compute Common Components
    #======================================================================

    Kmm = kern.K(Z).copy()
    if Kuu_sigma is not None:
        diag.add(Kmm, Kuu_sigma)
    else:
        diag.add(Kmm, self.const_jitter)
    Lm = jitchol(Kmm)

    logL = 0.
    dL_dthetaL = np.zeros(output_dim)
    dL_dKmm = np.zeros_like(Kmm)
    dL_dpsi0 = np.zeros_like(psi0)
    dL_dpsi1 = np.zeros_like(psi1)
    dL_dpsi2 = np.zeros_like(psi2)
    wv = np.empty((Kmm.shape[0], output_dim))

    for d in range(output_dim):
        idx_d = indexD == d
        Y_d = Y[idx_d]
        N_d = Y_d.shape[0]
        beta_d = beta[d]

        psi2_d = psi2[idx_d].sum(0) * beta_d
        psi1Y = Y_d.T.dot(psi1[idx_d]) * beta_d
        psi0_d = psi0[idx_d].sum() * beta_d
        YRY_d = np.square(Y_d).sum() * beta_d

        LmInvPsi2LmInvT = backsub_both_sides(Lm, psi2_d, 'right')

        Lambda = np.eye(Kmm.shape[0]) + LmInvPsi2LmInvT
        LL = jitchol(Lambda)
        LmLL = Lm.dot(LL)

        b = dtrtrs(LmLL, psi1Y.T)[0].T
        bbt = np.square(b).sum()
        v = dtrtrs(LmLL, b.T, trans=1)[0].T
        LLinvPsi1TYYTPsi1LLinvT = tdot(b.T)

        tmp = -backsub_both_sides(LL, LLinvPsi1TYYTPsi1LLinvT)
        dL_dpsi2R = backsub_both_sides(Lm, tmp + np.eye(input_dim)) / 2

        logL_R = -N_d * np.log(beta_d)
        logL += -((N_d * log_2_pi + logL_R + psi0_d - np.trace(LmInvPsi2LmInvT)) +
                  YRY_d - bbt) / 2.

        dL_dKmm += dL_dpsi2R - backsub_both_sides(Lm, LmInvPsi2LmInvT) / 2

        dL_dthetaL[d:d + 1] = (YRY_d * beta_d + beta_d * psi0_d - N_d * beta_d) / 2. \
            - beta_d * (dL_dpsi2R * psi2_d).sum() \
            - beta_d * np.trace(LLinvPsi1TYYTPsi1LLinvT)

        dL_dpsi0[idx_d] = -beta_d / 2.
        dL_dpsi1[idx_d] = beta_d * np.dot(Y_d, v)
        dL_dpsi2[idx_d] = beta_d * dL_dpsi2R
        wv[:, d] = v

    LmInvPsi2LmInvT = backsub_both_sides(Lm, psi2_sum, 'right')

    Lambda = np.eye(Kmm.shape[0]) + LmInvPsi2LmInvT
    LL = jitchol(Lambda)
    LmLL = Lm.dot(LL)
    logdet_L = 2. * np.sum(np.log(np.diag(LL)))
    dL_dpsi2R_common = dpotri(LmLL)[0] / -2.
    dL_dpsi2 += dL_dpsi2R_common[None, :, :] * beta_exp[:, None, None]

    for d in range(output_dim):
        dL_dthetaL[d] += (dL_dpsi2R_common * psi2[indexD == d].sum(0)).sum() * -beta[d] * beta[d]

    dL_dKmm += dL_dpsi2R_common * output_dim

    logL += -output_dim * logdet_L / 2.

    #======================================================================
    # Compute dL_dKmm
    #======================================================================

    # dL_dKmm = dL_dpsi2R - output_dim * backsub_both_sides(Lm, LmInvPsi2LmInvT) / 2
    #         # LmInv.T.dot(LmInvPsi2LmInvT).dot(LmInv) / 2.

    #======================================================================
    # Compute the Posterior distribution of inducing points p(u|Y)
    #======================================================================

    LLInvLmT = dtrtrs(LL, Lm.T)[0]
    cov = tdot(LLInvLmT.T)

    wd_inv = backsub_both_sides(
        Lm,
        np.eye(input_dim) - backsub_both_sides(LL, np.identity(input_dim), transpose='left'),
        transpose='left')
    post = Posterior(woodbury_inv=wd_inv, woodbury_vector=wv, K=Kmm,
                     mean=None, cov=cov, K_chol=Lm)

    #======================================================================
    # Compute dL_dthetaL for uncertain input and non-heteroscedastic noise
    #======================================================================

    # for d in range(output_dim):
    #     dL_dthetaL[d:d+1] += -beta[d]*beta[d]*(dL_dpsi2R[None,:,:] * psi2[indexD==d]/output_dim).sum()
    # dL_dthetaL += -(dL_dpsi2R[None,:,:] * psi2_sum*D beta*(dL_dpsi2R*psi2).sum()

    #======================================================================
    # Compute dL_dpsi
    #======================================================================

    if not uncertain_inputs:
        dL_dpsi1 += (psi1[:, None, :] * dL_dpsi2).sum(2) * 2.

    if uncertain_inputs:
        grad_dict = {'dL_dKmm': dL_dKmm,
                     'dL_dpsi0': dL_dpsi0,
                     'dL_dpsi1': dL_dpsi1,
                     'dL_dpsi2': dL_dpsi2,
                     'dL_dthetaL': dL_dthetaL}
    else:
        grad_dict = {'dL_dKmm': dL_dKmm,
                     'dL_dKdiag': dL_dpsi0,
                     'dL_dKnm': dL_dpsi1,
                     'dL_dthetaL': dL_dthetaL}

    return post, logL, grad_dict
def inference(self, q_u_means, q_u_chols, X, Y, Z, kern_list, kern_list_Gdj, kern_aux,
              likelihood, B_list, Y_metadata, KL_scale=1.0, batch_scale=None,
              predictive=False, Gauss_Newton=False):
    M = Z.shape[0]
    T = len(Y)
    if batch_scale is None:
        batch_scale = [1.0] * T
    Ntask = [Y[t].shape[0] for t in range(T)]
    Q = len(kern_list)
    D = likelihood.num_output_functions(Y_metadata)
    Kuu, Luu, Kuui = util.latent_funs_cov(Z, kern_list)
    p_U = pu(Kuu=Kuu, Luu=Luu, Kuui=Kuui)
    q_U = qu(mu_u=q_u_means.copy(), chols_u=q_u_chols.copy())
    S_u = np.empty((Q, M, M))
    L_u = choleskies.flat_to_triang(q_U.chols_u.copy())
    [np.dot(L_u[q, :, :], L_u[q, :, :].T, S_u[q, :, :]) for q in range(Q)]

    Su_add_Kuu = np.zeros((Q, M, M))
    Su_add_Kuu_chol = np.zeros((Q, M, M))
    for q in range(Q):
        Su_add_Kuu[q, :, :] = S_u[q, :, :] + Kuu[q, :, :]
        Su_add_Kuu_chol[q, :, :] = linalg.jitchol(Su_add_Kuu[q, :, :])

    # for every latent function f_d calculate q(f_d) and keep it as q(F):
    q_F = []
    posteriors_F = []
    f_index = Y_metadata['function_index'].flatten()
    d_index = Y_metadata['d_index'].flatten()

    for d in range(D):
        Xtask = X[f_index[d]]
        q_fd, q_U = self.calculate_q_f(X=Xtask, Z=Z, q_U=q_U, S_u=S_u, p_U=p_U,
                                       kern_list=kern_list, kern_list_Gdj=kern_list_Gdj,
                                       kern_aux=kern_aux, B=B_list, M=M,
                                       N=Xtask.shape[0], Q=Q, D=D, d=d)
        # Posterior objects for output functions (used in prediction).
        # Note: the Posterior object below may be redundant and could be removed.
        posterior_fd = Posterior(mean=q_fd.m_fd.copy(), cov=q_fd.S_fd.copy(),
                                 K=util.conv_function_covariance(X=Xtask, B=B_list,
                                                                 kernel_list=kern_list,
                                                                 kernel_list_Gdj=kern_list_Gdj,
                                                                 kff_aux=kern_aux, d=d),
                                 prior_mean=np.zeros(q_fd.m_fd.shape))
        posteriors_F.append(posterior_fd)
        q_F.append(q_fd)

    mu_F = []
    v_F = []
    for t in range(T):
        mu_F_task = np.empty((X[t].shape[0], 1))
        v_F_task = np.empty((X[t].shape[0], 1))
        for d, q_fd in enumerate(q_F):
            if f_index[d] == t:
                mu_F_task = np.hstack((mu_F_task, q_fd.m_fd))
                v_F_task = np.hstack((v_F_task, q_fd.v_fd))
        mu_F.append(mu_F_task[:, 1:])
        v_F.append(v_F_task[:, 1:])

    # posterior_Fnew for predictive
    if predictive:
        return posteriors_F
    # inference for the rest of the cases
    else:
        # Variational Expectations
        VE = likelihood.var_exp(Y, mu_F, v_F, Y_metadata)
        VE_dm, VE_dv = likelihood.var_exp_derivatives(Y, mu_F, v_F, Y_metadata, Gauss_Newton)
        for t in range(T):
            VE[t] = VE[t] * batch_scale[t]
            VE_dm[t] = VE_dm[t] * batch_scale[t]
            VE_dv[t] = VE_dv[t] * batch_scale[t]

        # KL Divergence
        KL = self.calculate_KL(q_U=q_U, Su_add_Kuu=Su_add_Kuu,
                               Su_add_Kuu_chol=Su_add_Kuu_chol, p_U=p_U, M=M, Q=Q, D=D)

        # Log Marginal log(p(Y))
        F = 0
        for t in range(T):
            F += VE[t].sum()
        log_marginal = F - KL

        # Gradients and Posteriors
        dL_dS_u = []
        dL_dmu_u = []
        dL_dL_u = []
        dL_dKmm = []
        dL_dKmn = []
        dL_dKdiag = []
        posteriors = []
        for q in range(Q):
            (dL_dmu_q, dL_dL_q, dL_dS_q, posterior_q, dL_dKqq, dL_dKdq,
             dL_dKdiag_q) = self.calculate_gradients(q_U=q_U, S_u=S_u,
                                                     Su_add_Kuu_chol=Su_add_Kuu_chol,
                                                     p_U=p_U, q_F=q_F,
                                                     VE_dm=VE_dm, VE_dv=VE_dv,
                                                     Ntask=Ntask, M=M, Q=Q, D=D,
                                                     f_index=f_index, d_index=d_index, q=q)
            dL_dmu_u.append(dL_dmu_q)
            dL_dL_u.append(dL_dL_q)
            dL_dS_u.append(dL_dS_q)
            dL_dKmm.append(dL_dKqq)
            dL_dKmn.append(dL_dKdq)
            dL_dKdiag.append(dL_dKdiag_q)
            posteriors.append(posterior_q)

        gradients = {'dL_dmu_u': dL_dmu_u,
                     'dL_dL_u': dL_dL_u,
                     'dL_dS_u': dL_dS_u,
                     'dL_dKmm': dL_dKmm,
                     'dL_dKmn': dL_dKmn,
                     'dL_dKdiag': dL_dKdiag}

        return log_marginal, gradients, posteriors, posteriors_F
def calculate_gradients(self, q_U, S_u, Su_add_Kuu_chol, p_U, q_F, VE_dm, VE_dv,
                        Ntask, M, Q, D, f_index, d_index, q):
    """
    Calculates gradients of the log-marginal distribution p(Y) wrt variational
    parameters mu_q, S_q
    """
    # Algebra for q(u) and p(u):
    m_u = q_U.mu_u.copy()
    # L_u = choleskies.flat_to_triang(q_U.chols_u.copy())
    # S_u = np.empty((Q, M, M))
    # [np.dot(L_u[q, :, :], L_u[q, :, :].T, S_u[q, :, :]) for q in range(Q)]
    Kuu = p_U.Kuu.copy()
    Luu = p_U.Luu.copy()
    Kuui = p_U.Kuui.copy()
    S_qi, _ = linalg.dpotri(np.asfortranarray(Su_add_Kuu_chol[q, :, :]))

    if np.any(np.isinf(S_qi)):
        raise ValueError("Sqi: Cholesky representation unstable")

    # KL Terms
    dKL_dmu_q = []
    dKL_dKqq = 0
    for d in range(D):
        dKL_dmu_q.append(np.dot(Kuui[q, :, :], m_u[d][:, q, None]))  # same
        dKL_dKqq += -0.5 * S_qi + 0.5 * Kuui[q, :, :] \
            - 0.5 * Kuui[q, :, :].dot(S_u[q, :, :]).dot(Kuui[q, :, :]) \
            - 0.5 * np.dot(Kuui[q, :, :], np.dot(m_u[d][:, q, None], m_u[d][:, q, None].T)).dot(Kuui[q, :, :].T)  # same

    # dKL_dS_q = 0.5 * (Kuui[q,:,:] - S_qi)  # old
    dKL_dS_q = 0.5 * (Kuui[q, :, :] - S_qi) * D

    # VE Terms
    # dVE_dmu_q = np.zeros((M, 1))
    dVE_dmu_q = []
    dVE_dS_q = np.zeros((M, M))
    dVE_dKqq = np.zeros((M, M))
    dVE_dKqd = []
    dVE_dKdiag = []
    dL_dmu_q = []

    for d, q_fd in enumerate(q_F):
        Nt = Ntask[f_index[d]]
        dVE_dmu_q.append(np.dot(q_fd.Afdu[q, :, :].T, VE_dm[f_index[d]][:, d_index[d]])[:, None])
        dL_dmu_q.append(dVE_dmu_q[d] - dKL_dmu_q[d])

        Adv = q_fd.Afdu[q, :, :].T * VE_dv[f_index[d]][:, d_index[d], None].T
        Adv = np.ascontiguousarray(Adv)
        AdvA = np.dot(Adv.reshape(-1, Nt), q_fd.Afdu[q, :, :]).reshape(M, M)
        dVE_dS_q += AdvA

        # Derivatives dKuquq
        tmp_dv = np.dot(AdvA, S_u[q, :, :]).dot(Kuui[q, :, :])
        dVE_dKqq += -tmp_dv - tmp_dv.T  # + AdvA last term not included in the derivative
        Adm = np.dot(q_fd.Afdu[q, :, :].T, VE_dm[f_index[d]][:, d_index[d], None])
        dVE_dKqq += -np.dot(Adm, np.dot(Kuui[q, :, :], m_u[d][:, q, None]).T)

        # Derivatives dKuqfd
        tmp = np.dot(S_u[q, :, :], Kuui[q, :, :])
        tmp = 2. * tmp  # 2. * (tmp - np.eye(M)); the term -2Adv not included
        dve_kqd = np.dot(np.dot(Kuui[q, :, :], m_u[d][:, q, None]),
                         VE_dm[f_index[d]][:, d_index[d], None].T)
        dve_kqd += np.dot(tmp.T, Adv)
        dVE_dKqd.append(dve_kqd)

        # Derivatives dKdiag
        dVE_dKdiag.append(VE_dv[f_index[d]][:, d_index[d]])

    dVE_dKqq = 0.5 * (dVE_dKqq + dVE_dKqq.T)

    # Sum of VE and KL terms
    # dL_dmu_q = dVE_dmu_q - dKL_dmu_q
    dL_dS_q = dVE_dS_q - dKL_dS_q
    dL_dKqq = dVE_dKqq - dKL_dKqq
    dL_dKdq = dVE_dKqd
    dL_dKdiag = dVE_dKdiag

    # Pass S_q gradients to its lower-triangular representation L_q
    chol_u = q_U.chols_u.copy()
    L_q = choleskies.flat_to_triang(chol_u[:, q:q + 1])
    dL_dL_q = 2. * np.array([np.dot(a, b) for a, b in zip(dL_dS_q[None, :, :], L_q)])
    dL_dL_q = choleskies.triang_to_flat(dL_dL_q)

    # Posterior
    posterior_q = []
    for d in range(D):
        posterior_q.append(Posterior(mean=m_u[d][:, q, None],
                                     cov=S_u[q, :, :] + Kuu[q, :, :],
                                     K=Kuu[q, :, :],
                                     prior_mean=np.zeros(m_u[d][:, q, None].shape)))

    return dL_dmu_q, dL_dL_q, dL_dS_q, posterior_q, dL_dKqq, dL_dKdq, dL_dKdiag
def calculate_gradients(self, q_U, p_U, q_F, VE_dm, VE_dv, Ntask, M, Q, D,
                        f_index, d_index, j):
    """
    Calculates gradients of the log-marginal distribution p(Y) wrt variational
    parameters mu_q, S_q
    """
    # Algebra for q(u) and p(u):
    m_u = q_U.mu_u.copy()
    L_u = choleskies.flat_to_triang(q_U.chols_u.copy())
    # S_u = np.empty((Q, M, M))
    # This could be computed outside and passed in to reduce computation:
    S_u = np.dot(L_u[j, :, :], L_u[j, :, :].T)
    # [np.dot(L_u[q, :, :], L_u[q, :, :].T, S_u[q, :, :]) for q in range(Q)]
    Kuu = p_U.Kuu.copy()
    Luu = p_U.Luu.copy()
    Kuui = p_U.Kuui.copy()
    S_qi, _ = linalg.dpotri(np.asfortranarray(L_u[j, :, :]))

    if np.any(np.isinf(S_qi)):
        raise ValueError("Sqi: Cholesky representation unstable")

    # KL Terms
    dKL_dmu_j = np.dot(Kuui[j, :, :], m_u[:, j, None])
    dKL_dS_j = 0.5 * (Kuui[j, :, :] - S_qi)
    dKL_dKjj = 0.5 * Kuui[j, :, :] - 0.5 * Kuui[j, :, :].dot(S_u).dot(Kuui[j, :, :]) \
        - 0.5 * np.dot(Kuui[j, :, :], np.dot(m_u[:, j, None], m_u[:, j, None].T)).dot(Kuui[j, :, :].T)

    # VE Terms
    dVE_dmu_j = np.zeros((M, 1))
    dVE_dS_j = np.zeros((M, M))
    dVE_dKjj = np.zeros((M, M))
    dVE_dKjd = []
    dVE_dKdiag = []

    Nt = Ntask[f_index[j]]
    dVE_dmu_j += np.dot(q_F[j].Afdu.T, VE_dm[f_index[j]][:, d_index[j]])[:, None]
    Adv = q_F[j].Afdu.T * VE_dv[f_index[j]][:, d_index[j], None].T
    Adv = np.ascontiguousarray(Adv)
    AdvA = np.dot(Adv.reshape(-1, Nt), q_F[j].Afdu).reshape(M, M)
    dVE_dS_j += AdvA

    # Derivatives dKuquq
    tmp_dv = np.dot(AdvA, S_u).dot(Kuui[j, :, :])
    dVE_dKjj += AdvA - tmp_dv - tmp_dv.T
    Adm = np.dot(q_F[j].Afdu.T, VE_dm[f_index[j]][:, d_index[j], None])
    dVE_dKjj += -np.dot(Adm, np.dot(Kuui[j, :, :], m_u[:, j, None]).T)

    # Derivatives dKuqfd
    tmp = np.dot(S_u, Kuui[j, :, :])
    tmp = 2. * (tmp - np.eye(M))
    dve_kjd = np.dot(np.dot(Kuui[j, :, :], m_u[:, j, None]),
                     VE_dm[f_index[j]][:, d_index[j], None].T)
    dve_kjd += np.dot(tmp.T, Adv)
    dVE_dKjd.append(dve_kjd)

    # Derivatives dKdiag
    dVE_dKdiag.append(VE_dv[f_index[j]][:, d_index[j]])

    dVE_dKjj = 0.5 * (dVE_dKjj + dVE_dKjj.T)

    # Sum of VE and KL terms
    dL_dmu_j = dVE_dmu_j - dKL_dmu_j
    dL_dS_j = dVE_dS_j - dKL_dS_j
    dL_dKjj = dVE_dKjj - dKL_dKjj
    dL_dKdj = dVE_dKjd[0].copy()      # here we just pass the unique position
    dL_dKdiag = dVE_dKdiag[0].copy()  # here we just pass the unique position

    # Pass S_q gradients to its lower-triangular representation L_q
    chol_u = q_U.chols_u.copy()
    L_j = choleskies.flat_to_triang(chol_u[:, j:j + 1])
    dL_dL_j = 2. * np.array([np.dot(a, b) for a, b in zip(dL_dS_j[None, :, :], L_j)])
    dL_dL_j = choleskies.triang_to_flat(dL_dL_j)

    # Posterior
    posterior_j = Posterior(mean=m_u[:, j, None], cov=S_u, K=Kuu[j, :, :],
                            prior_mean=np.zeros(m_u[:, j, None].shape))

    return dL_dmu_j, dL_dL_j, dL_dS_j, posterior_j, dL_dKjj, dL_dKdj, dL_dKdiag
def inference(self, kern, X, Z, likelihood, Y, Y_metadata=None, Lm=None, dL_dKmm=None):
    """
    The first phase of inference:
    Compute: log-likelihood, dL_dKmm

    Cached intermediate results: Kmm, KmmInv,
    """
    num_data, output_dim = Y.shape
    input_dim = Z.shape[0]

    uncertain_inputs = isinstance(X, VariationalPosterior)
    uncertain_outputs = isinstance(Y, VariationalPosterior)

    beta = 1. / np.fmax(likelihood.variance, 1e-6)

    psi0, psi2, YRY, psi1, psi1Y, Shalf, psi1S = self.gatherPsiStat(
        kern, X, Z, Y, beta, uncertain_inputs)

    #======================================================================
    # Compute Common Components
    #======================================================================

    Kmm = kern.K(Z).copy()
    diag.add(Kmm, self.const_jitter)
    Lm = jitchol(Kmm)
    # LmInv = dtrtri(Lm)

    if uncertain_inputs:
        LmInvPsi2LmInvT = backsub_both_sides(Lm, psi2, 'right')
    else:
        LmInvPsi2LmInvT = tdot(dtrtrs(Lm, psi1.T)[0]) / beta  # tdot(psi1.dot(LmInv.T).T) / beta

    Lambda = np.eye(Kmm.shape[0]) + LmInvPsi2LmInvT
    LL = jitchol(Lambda)
    LmLL = Lm.dot(LL)
    # LLInv = dtrtri(LL)
    # LmLLInv = LLInv.dot(LmInv)

    logdet_L = 2. * np.sum(np.log(np.diag(LL)))

    b = dtrtrs(LmLL, psi1Y.T)[0].T  # psi1Y.dot(LmLLInv.T)
    bbt = np.square(b).sum()
    v = dtrtrs(LmLL, b.T, trans=1)[0].T  # b.dot(LmLLInv)
    LLinvPsi1TYYTPsi1LLinvT = tdot(b.T)

    if psi1S is not None:
        psi1SLLinv = dtrtrs(LmLL, psi1S.T)[0].T  # psi1S.dot(LmLLInv.T)
        bbt += np.square(psi1SLLinv).sum()
        LLinvPsi1TYYTPsi1LLinvT += tdot(psi1SLLinv.T)
        psi1SP = dtrtrs(LmLL, psi1SLLinv.T, trans=1)[0].T  # psi1SLLinv.dot(LmLLInv)

    tmp = -backsub_both_sides(LL, LLinvPsi1TYYTPsi1LLinvT + output_dim * np.eye(input_dim))
    dL_dpsi2R = backsub_both_sides(Lm, tmp + output_dim * np.eye(input_dim)) / 2
    # tmp = -LLInv.T.dot(LLinvPsi1TYYTPsi1LLinvT + output_dim*np.eye(input_dim)).dot(LLInv)
    # dL_dpsi2R = LmInv.T.dot(tmp + output_dim*np.eye(input_dim)).dot(LmInv)/2.

    #======================================================================
    # Compute log-likelihood
    #======================================================================

    logL_R = -num_data * np.log(beta)
    logL = -(output_dim * (num_data * log_2_pi + logL_R + psi0 - np.trace(LmInvPsi2LmInvT)) +
             YRY - bbt) / 2. - output_dim * logdet_L / 2.

    #======================================================================
    # Compute dL_dKmm
    #======================================================================

    dL_dKmm = dL_dpsi2R - output_dim * backsub_both_sides(Lm, LmInvPsi2LmInvT) / 2
    # LmInv.T.dot(LmInvPsi2LmInvT).dot(LmInv)/2.

    #======================================================================
    # Compute the Posterior distribution of inducing points p(u|Y)
    #======================================================================

    wd_inv = backsub_both_sides(
        Lm,
        np.eye(input_dim) - backsub_both_sides(LL, np.identity(input_dim), transpose='left'),
        transpose='left')
    post = Posterior(woodbury_inv=wd_inv, woodbury_vector=v.T, K=Kmm,
                     mean=None, cov=None, K_chol=Lm)

    #======================================================================
    # Compute dL_dthetaL for uncertain input and non-heteroscedastic noise
    #======================================================================

    dL_dthetaL = (YRY * beta + beta * output_dim * psi0 - num_data * output_dim * beta) / 2. \
        - beta * (dL_dpsi2R * psi2).sum() - beta * np.trace(LLinvPsi1TYYTPsi1LLinvT)

    #======================================================================
    # Compute dL_dpsi
    #======================================================================

    dL_dpsi0 = -output_dim * (beta * np.ones((num_data, ))) / 2.

    if uncertain_outputs:
        m, s = Y.mean, Y.variance
        dL_dpsi1 = beta * (np.dot(m, v) + Shalf[:, None] * psi1SP)
    else:
        dL_dpsi1 = beta * np.dot(Y, v)

    if uncertain_inputs:
        dL_dpsi2 = beta * dL_dpsi2R
    else:
        dL_dpsi1 += np.dot(psi1, dL_dpsi2R) * 2.
        dL_dpsi2 = None

    if uncertain_inputs:
        grad_dict = {'dL_dKmm': dL_dKmm,
                     'dL_dpsi0': dL_dpsi0,
                     'dL_dpsi1': dL_dpsi1,
                     'dL_dpsi2': dL_dpsi2,
                     'dL_dthetaL': dL_dthetaL}
    else:
        grad_dict = {'dL_dKmm': dL_dKmm,
                     'dL_dKdiag': dL_dpsi0,
                     'dL_dKnm': dL_dpsi1,
                     'dL_dthetaL': dL_dthetaL}

    if uncertain_outputs:
        m, s = Y.mean, Y.variance
        psi1LmiLLi = dtrtrs(LmLL, psi1.T)[0].T  # psi1.dot(LmLLInv.T)
        LLiLmipsi1Y = b.T
        grad_dict['dL_dYmean'] = -m * beta + psi1LmiLLi.dot(LLiLmipsi1Y)
        grad_dict['dL_dYvar'] = beta / -2. + np.square(psi1LmiLLi).sum(axis=1) / 2

    return post, logL, grad_dict
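# --- Hedged sketch of the backsub_both_sides helper (assumed semantics) ---
# The inference above repeatedly sandwiches a symmetric matrix between inverse
# Cholesky factors, L^-1 X L^-T ('right') or L^-T X L^-1 ('left'), using two
# triangular solves instead of an explicit inverse. A minimal NumPy/SciPy
# equivalent, under that assumption about the helper's semantics:
import numpy as np
from scipy.linalg import solve_triangular

def _sketch_backsub_both_sides(L, X, transpose='right'):
    """Sandwich X between inverse Cholesky factors of a PSD matrix."""
    if transpose == 'right':
        tmp = solve_triangular(L, X, lower=True)                     # L^-1 X
        return solve_triangular(L, tmp.T, lower=True).T              # L^-1 X L^-T
    else:
        tmp = solve_triangular(L, X, lower=True, trans='T')          # L^-T X
        return solve_triangular(L, tmp.T, lower=True, trans='T').T   # L^-T X L^-1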
def inference(self, kern, X, Z, likelihood, Y, qU_mean, qU_var, Kuu_sigma=None):
    """
    The SVI-VarDTC inference
    """
    N, D, M, Q = Y.shape[0], Y.shape[1], Z.shape[0], Z.shape[1]

    uncertain_inputs = isinstance(X, VariationalPosterior)
    uncertain_outputs = isinstance(Y, VariationalPosterior)

    beta = 1. / likelihood.variance

    psi0, psi2, YRY, psi1, psi1Y = self.gatherPsiStat(kern, X, Z, Y, beta, uncertain_inputs)

    #======================================================================
    # Compute Common Components
    #======================================================================

    Kuu = kern.K(Z).copy()
    if Kuu_sigma is not None:
        diag.add(Kuu, Kuu_sigma)
    else:
        diag.add(Kuu, self.const_jitter)
    Lm = jitchol(Kuu)

    mu, S = qU_mean, qU_var
    Ls = jitchol(S)
    LinvLs = dtrtrs(Lm, Ls)[0]
    Linvmu = dtrtrs(Lm, mu)[0]
    psi1YLinvT = dtrtrs(Lm, psi1Y.T)[0].T

    self.mid = {'qU_L': Ls, 'LinvLu': LinvLs, 'L': Lm, 'Linvmu': Linvmu}

    if uncertain_inputs:
        LmInvPsi2LmInvT = backsub_both_sides(Lm, psi2, 'right')
    else:
        LmInvPsi2LmInvT = tdot(dtrtrs(Lm, psi1.T)[0]) / beta

    LmInvSmuLmInvT = tdot(LinvLs) * D + tdot(Linvmu)

    # logdet_L = np.sum(np.log(np.diag(Lm)))
    # logdet_S = np.sum(np.log(np.diag(Ls)))

    #======================================================================
    # Compute log-likelihood
    #======================================================================

    logL_R = -N * np.log(beta)
    logL = -N * D * log_2_pi / 2. - D * logL_R / 2. - D * psi0 / 2. - YRY / 2. \
        - (LmInvSmuLmInvT * LmInvPsi2LmInvT).sum() / 2. \
        + np.trace(LmInvPsi2LmInvT) * D / 2. + (Linvmu * psi1YLinvT.T).sum()

    #======================================================================
    # Compute dL_dKmm
    #======================================================================

    tmp1 = backsub_both_sides(Lm, LmInvSmuLmInvT.dot(LmInvPsi2LmInvT), 'left')
    tmp2 = Linvmu.dot(psi1YLinvT)
    tmp3 = backsub_both_sides(Lm, -D * LmInvPsi2LmInvT - tmp2 - tmp2.T, 'left') / 2.
    dL_dKmm = (tmp1 + tmp1.T) / 2. + tmp3

    #======================================================================
    # Compute dL_dthetaL for uncertain input and non-heteroscedastic noise
    #======================================================================

    dL_dthetaL = -D * N * beta / 2. - (-D * psi0 / 2. - YRY / 2.
                                       - (LmInvSmuLmInvT * LmInvPsi2LmInvT).sum() / 2.
                                       + np.trace(LmInvPsi2LmInvT) * D / 2.
                                       + (Linvmu * psi1YLinvT.T).sum()) * beta

    #======================================================================
    # Compute dL_dqU
    #======================================================================

    tmp1 = backsub_both_sides(Lm, -LmInvPsi2LmInvT, 'left')
    dL_dqU_mean = tmp1.dot(mu) + dtrtrs(Lm, psi1YLinvT.T, trans=1)[0]
    dL_dqU_var = D / 2. * tmp1

    #======================================================================
    # Compute the Posterior distribution of inducing points p(u|Y)
    #======================================================================

    KuuInvmu = dtrtrs(Lm, Linvmu, trans=1)[0]
    tmp = backsub_both_sides(Lm, np.eye(M) - tdot(LinvLs), 'left')

    post = Posterior(woodbury_inv=tmp, woodbury_vector=KuuInvmu, K=Kuu,
                     mean=mu, cov=S, K_chol=Lm)

    #======================================================================
    # Compute dL_dpsi
    #======================================================================

    dL_dpsi0 = -D * (beta * np.ones((N, ))) / 2.

    if uncertain_outputs:
        dL_dpsi1 = Y.mean.dot(dtrtrs(Lm, Linvmu, trans=1)[0].T) * beta
    else:
        dL_dpsi1 = Y.dot(dtrtrs(Lm, Linvmu, trans=1)[0].T) * beta

    dL_dpsi2 = beta * backsub_both_sides(Lm, D * np.eye(M) - LmInvSmuLmInvT, 'left') / 2.

    if not uncertain_inputs:
        dL_dpsi1 += psi1.dot(dL_dpsi2 + dL_dpsi2.T) / beta
        dL_dpsi2 = None

    if uncertain_inputs:
        grad_dict = {'dL_dKmm': dL_dKmm,
                     'dL_dpsi0': dL_dpsi0,
                     'dL_dpsi1': dL_dpsi1,
                     'dL_dpsi2': dL_dpsi2,
                     'dL_dthetaL': dL_dthetaL,
                     'dL_dqU_mean': dL_dqU_mean,
                     'dL_dqU_var': dL_dqU_var}
    else:
        grad_dict = {'dL_dKmm': dL_dKmm,
                     'dL_dKdiag': dL_dpsi0,
                     'dL_dKnm': dL_dpsi1,
                     'dL_dthetaL': dL_dthetaL,
                     'dL_dqU_mean': dL_dqU_mean,
                     'dL_dqU_var': dL_dqU_var}

    if uncertain_outputs:
        m, s = Y.mean, Y.variance
        grad_dict['dL_dYmean'] = -m * beta + dtrtrs(Lm, psi1.T)[0].T.dot(dtrtrs(Lm, mu)[0])
        grad_dict['dL_dYvar'] = beta / -2.

    return post, logL, grad_dict
def inference(self, kern, X, Z, likelihood, Y, Y_metadata=None, Lm=None, dL_dKmm=None,
              fixed_covs_kerns=None, **kw):
    _, output_dim = Y.shape

    uncertain_inputs = isinstance(X, VariationalPosterior)

    # see whether we've got a different noise variance for each datum
    beta = 1. / np.fmax(likelihood.gaussian_variance(Y_metadata), 1e-6)
    # VVT_factor is a matrix such that tdot(VVT_factor) = VVT...this is for efficiency!
    # self.YYTfactor = self.get_YYTfactor(Y)
    # VVT_factor = self.get_VVTfactor(self.YYTfactor, beta)
    het_noise = beta.size > 1
    if het_noise:
        raise NotImplementedError("Heteroscedastic noise not implemented, "
                                  "should be possible though, feel free to try implementing it :)")
    if beta.ndim == 1:
        beta = beta[:, None]

    # do the inference:
    num_inducing = Z.shape[0]
    num_data = Y.shape[0]

    # kernel computations, using BGPLVM notation
    Kmm = kern.K(Z).copy()
    diag.add(Kmm, self.const_jitter)
    if Lm is None:
        Lm = jitchol(Kmm)

    # The rather complex computations of A, and the psi stats
    if uncertain_inputs:
        psi0 = kern.psi0(Z, X)
        psi1 = kern.psi1(Z, X)
        if het_noise:
            psi2_beta = np.sum([kern.psi2(Z, X[i:i + 1, :]) * beta_i
                                for i, beta_i in enumerate(beta)], 0)
        else:
            psi2_beta = kern.psi2(Z, X) * beta
        LmInv = dtrtri(Lm)
        A = LmInv.dot(psi2_beta.dot(LmInv.T))
    else:
        psi0 = kern.Kdiag(X)
        psi1 = kern.K(X, Z)
        tmp = psi1 * (np.sqrt(beta))
        tmp, _ = dtrtrs(Lm, tmp.T, lower=1)
        A = tdot(tmp)

    # factor B
    B = np.eye(num_inducing) + A
    LB = jitchol(B)

    # back substitute C into psi1Vf
    # tmp, _ = dtrtrs(Lm, psi1.T.dot(VVT_factor), lower=1, trans=0)
    # _LBi_Lmi_psi1Vf, _ = dtrtrs(LB, tmp, lower=1, trans=0)
    # tmp, _ = dtrtrs(LB, _LBi_Lmi_psi1Vf, lower=1, trans=1)
    # Cpsi1Vf, _ = dtrtrs(Lm, tmp, lower=1, trans=1)

    # data fit and derivative of L w.r.t. Kmm
    # delit = tdot(_LBi_Lmi_psi1Vf)

    # Expose YYT to get additional covariates in (YYT + Kgg):
    tmp, _ = dtrtrs(Lm, psi1.T, lower=1, trans=0)
    _LBi_Lmi_psi1, _ = dtrtrs(LB, tmp, lower=1, trans=0)
    tmp, _ = dtrtrs(LB, _LBi_Lmi_psi1, lower=1, trans=1)
    Cpsi1, _ = dtrtrs(Lm, tmp, lower=1, trans=1)

    # TODO: cache this:
    # Compute fixed covariates covariance:
    if fixed_covs_kerns is not None:
        K_fixed = 0
        for name, [cov, k] in fixed_covs_kerns.items():
            K_fixed += k.K(cov)

        # trYYT = self.get_trYYT(Y)
        YYT_covs = (tdot(Y) + K_fixed)
        data_term = beta**2 * YYT_covs
        trYYT_covs = np.trace(YYT_covs)
    else:
        data_term = beta**2 * tdot(Y)
        trYYT_covs = self.get_trYYT(Y)

    # trYYT = self.get_trYYT(Y)
    delit = mdot(_LBi_Lmi_psi1, data_term, _LBi_Lmi_psi1.T)
    data_fit = np.trace(delit)

    DBi_plus_BiPBi = backsub_both_sides(LB, output_dim * np.eye(num_inducing) + delit)

    if dL_dKmm is None:
        delit = -0.5 * DBi_plus_BiPBi
        delit += -0.5 * B * output_dim
        delit += output_dim * np.eye(num_inducing)
        # Compute dL_dKmm
        dL_dKmm = backsub_both_sides(Lm, delit)

    # derivatives of L w.r.t. psi
    dL_dpsi0, dL_dpsi1, dL_dpsi2 = _compute_dL_dpsi(num_inducing, num_data, output_dim,
                                                    beta, Lm, data_term, Cpsi1,
                                                    DBi_plus_BiPBi, psi1,
                                                    het_noise, uncertain_inputs)

    # log marginal likelihood
    log_marginal = _compute_log_marginal_likelihood(likelihood, num_data, output_dim,
                                                    beta, het_noise, psi0, A, LB,
                                                    trYYT_covs, data_fit, Y)

    if self.save_per_dim:
        self.saved_vals = [psi0, A, LB, _LBi_Lmi_psi1, beta]

    # No heteroscedastics, so no _LBi_Lmi_psi1Vf:
    # For the interested reader, try implementing the heteroscedastic version, it should be possible
    _LBi_Lmi_psi1Vf = None  # is just here for documentation, so you can see what it was

    # noise derivatives
    dL_dR = _compute_dL_dR(likelihood, het_noise, uncertain_inputs, LB,
                           _LBi_Lmi_psi1Vf, DBi_plus_BiPBi, Lm, A, psi0, psi1,
                           beta, data_fit, num_data, output_dim, trYYT_covs, Y, None)

    dL_dthetaL = likelihood.exact_inference_gradients(dL_dR, Y_metadata)

    # put the gradients in the right places
    if uncertain_inputs:
        grad_dict = {'dL_dKmm': dL_dKmm,
                     'dL_dpsi0': dL_dpsi0,
                     'dL_dpsi1': dL_dpsi1,
                     'dL_dpsi2': dL_dpsi2,
                     'dL_dthetaL': dL_dthetaL}
    else:
        grad_dict = {'dL_dKmm': dL_dKmm,
                     'dL_dKdiag': dL_dpsi0,
                     'dL_dKnm': dL_dpsi1,
                     'dL_dthetaL': dL_dthetaL}

    if fixed_covs_kerns is not None:
        # For now, we do not take the gradients; we can compute them,
        # but the maximum likelihood solution is to switch off the additional covariates...
        dL_dcovs = beta * np.eye(K_fixed.shape[0]) - beta**2 * tdot(_LBi_Lmi_psi1.T)
        grad_dict['dL_dcovs'] = -.5 * dL_dcovs

    # get sufficient things for posterior prediction
    # TODO: do we really want to do this in the loop?
    woodbury_vector = (beta * Cpsi1).dot(Y)
    # Equivalent computation, kept for reference:
    # psi1V = np.dot(Y.T * beta, psi1).T
    # tmp, _ = dtrtrs(Lm, psi1V, lower=1, trans=0)
    # tmp, _ = dpotrs(LB, tmp, lower=1)
    # woodbury_vector, _ = dtrtrs(Lm, tmp, lower=1, trans=1)
    Bi, _ = dpotri(LB, lower=1)
    symmetrify(Bi)
    Bi = -dpotri(LB, lower=1)[0]
    diag.add(Bi, 1)

    woodbury_inv = backsub_both_sides(Lm, Bi)

    # construct a posterior object
    post = Posterior(woodbury_inv=woodbury_inv, woodbury_vector=woodbury_vector,
                     K=Kmm, mean=None, cov=None, K_chol=Lm)
    return post, log_marginal, grad_dict
def inference(self, q_u_means, q_u_chols, kern, X, Z, likelihood, Y, mean_functions,
              Y_metadata=None, KL_scale=1.0, batch_scale=1.0):
    num_inducing = Z.shape[0]
    num_data, num_outputs = Y.shape
    num_latent_funcs = likelihood.request_num_latent_functions(Y)

    # For each latent function, calculate some required values
    latent_function_details = []
    for latent_ind in range(num_latent_funcs):
        q_u_meanj = q_u_means[:, latent_ind * num_outputs:(latent_ind + 1) * num_outputs]
        q_u_cholj = q_u_chols[:, latent_ind * num_outputs:(latent_ind + 1) * num_outputs]
        kernj = kern[latent_ind]
        mean_functionj = mean_functions[latent_ind]
        latent_detail = self.calculate_mu_var(X, Y, Z, q_u_meanj, q_u_cholj,
                                              kernj, mean_functionj,
                                              num_inducing, num_data, num_outputs)
        latent_function_details.append(latent_detail)

    mu = np.hstack([l.mu for l in latent_function_details])
    v = np.hstack([l.v for l in latent_function_details])
    # mu = [l.mu for l in latent_function_details]
    # v = [l.v for l in latent_function_details]
    # Hack, shouldn't be necessary:
    # Y = np.hstack([Y] * num_latent_funcs)

    # quadrature for the likelihood
    F, dF_dmu, dF_dv, dF_dthetaL = likelihood.variational_expectations(Y, mu, v,
                                                                       Y_metadata=Y_metadata)
    # for latent_ind in range(num_latent_functions):
    #     l.dF_dmu = dF_dmu[:, latent_ind][:, None]
    #     l.dF_dv = dF_dv[:, latent_ind][:, None]

    # rescale the F term if working on a batch
    F, dF_dmu, dF_dv = F * batch_scale, dF_dmu * batch_scale, dF_dv * batch_scale
    if dF_dthetaL is not None:
        dF_dthetaL = dF_dthetaL.sum(1).sum(1) * batch_scale

    # sum (gradients of) expected likelihood and KL part
    log_marginal = F.sum()

    dL_dKmm = []
    dL_dKmn = []
    dL_dKdiag = []
    dL_dm = []
    dL_dchol = []
    dL_dmfZ = []
    dL_dmfX = []
    posteriors = []

    # For each latent function (and thus for each kernel the latent function uses)
    # calculate the gradients and generate a posterior
    for latent_ind in range(num_latent_funcs):
        l = latent_function_details[latent_ind]
        # q_u_meanj = q_u_means[:, latent_ind*num_outputs:(latent_ind+1)*num_outputs]
        # q_u_cholj = q_u_chols[:, latent_ind*num_outputs:(latent_ind+1)*num_outputs]
        dF_dmui = dF_dmu[:, latent_ind * num_outputs:(latent_ind + 1) * num_outputs]
        dF_dvi = dF_dv[:, latent_ind * num_outputs:(latent_ind + 1) * num_outputs]
        (log_marginal, dL_dKmmi, dL_dKmni, dL_dKdiagi, dL_dmi, dL_dcholi,
         dL_dmfZi, dL_dmfXi) = self.calculate_gradients(log_marginal, l, dF_dmui, dF_dvi,
                                                        num_inducing, num_outputs, num_data)
        posterior = Posterior(mean=l.q_u_mean, cov=l.S.T, K=l.Kmm,
                              prior_mean=l.prior_mean_u)
        dL_dKmm.append(dL_dKmmi)
        dL_dKmn.append(dL_dKmni)
        dL_dKdiag.append(dL_dKdiagi)
        dL_dm.append(dL_dmi)
        dL_dchol.append(dL_dcholi)
        dL_dmfZ.append(dL_dmfZi)
        dL_dmfX.append(dL_dmfXi)
        posteriors.append(posterior)

    grad_dict = {'dL_dKmm': dL_dKmm,
                 'dL_dKmn': dL_dKmn,
                 'dL_dKdiag': dL_dKdiag,
                 'dL_dm': dL_dm,
                 'dL_dchol': dL_dchol,
                 'dL_dthetaL': dF_dthetaL}

    # If not all of the mean functions are null, fill out the other gradients with zeros
    if not all(mean_function is None for mean_function in mean_functions):
        for mean_function in mean_functions:
            if mean_function is None:
                grad_dict['dL_dmfZ'] = np.zeros(Z.shape)
                grad_dict['dL_dmfX'] = np.zeros(X.shape)
            else:
                grad_dict['dL_dmfZ'] = dL_dmfZ
                grad_dict['dL_dmfX'] = dL_dmfX

    return posteriors, log_marginal, grad_dict
def inference(self, kern, X, Z, likelihood, Y, mean_function=None, Y_metadata=None):
    assert mean_function is None, "inference with a mean function not implemented"

    num_inducing, _ = Z.shape
    num_data, output_dim = Y.shape

    # make sure the noise is not heteroscedastic
    sigma_n = likelihood.gaussian_variance(Y_metadata)
    if sigma_n.size > 1:
        raise NotImplementedError("no hetero noise with this implementation of PEP")

    Kmm = kern.K(Z)
    Knn = kern.Kdiag(X)
    Knm = kern.K(X, Z)
    U = Knm

    # factor Kmm
    diag.add(Kmm, self.const_jitter)
    Kmmi, L, Li, _ = pdinv(Kmm)

    # compute beta_star, the effective noise precision
    LiUT = np.dot(Li, U.T)
    sigma_star = sigma_n + self.alpha * (Knn - np.sum(np.square(LiUT), 0))
    beta_star = 1. / sigma_star

    # Compute and factor A
    A = tdot(LiUT * np.sqrt(beta_star)) + np.eye(num_inducing)
    LA = jitchol(A)

    # back substitute to get b, P, v
    URiy = np.dot(U.T * beta_star, Y)
    tmp, _ = dtrtrs(L, URiy, lower=1)
    b, _ = dtrtrs(LA, tmp, lower=1)
    tmp, _ = dtrtrs(LA, b, lower=1, trans=1)
    v, _ = dtrtrs(L, tmp, lower=1, trans=1)
    tmp, _ = dtrtrs(LA, Li, lower=1, trans=0)
    P = tdot(tmp.T)

    alpha_const_term = (1.0 - self.alpha) / self.alpha

    # compute log marginal
    log_marginal = -0.5 * num_data * output_dim * np.log(2 * np.pi) + \
        -np.sum(np.log(np.diag(LA))) * output_dim + \
        0.5 * output_dim * (1 + alpha_const_term) * np.sum(np.log(beta_star)) + \
        -0.5 * np.sum(np.square(Y.T * np.sqrt(beta_star))) + \
        0.5 * np.sum(np.square(b)) + 0.5 * alpha_const_term * num_data * np.log(sigma_n)

    # compute dL_dR
    Uv = np.dot(U, v)
    dL_dR = 0.5 * (np.sum(U * np.dot(U, P), 1) - (1.0 + alpha_const_term) / beta_star +
                   np.sum(np.square(Y), 1) - 2. * np.sum(Uv * Y, 1) +
                   np.sum(np.square(Uv), 1)) * beta_star**2

    # Compute dL_dKmm
    vvT_P = tdot(v.reshape(-1, 1)) + P
    dL_dK = 0.5 * (Kmmi - vvT_P)
    KiU = np.dot(Kmmi, U.T)
    dL_dK += self.alpha * np.dot(KiU * dL_dR, KiU.T)

    # Compute dL_dU
    vY = np.dot(v.reshape(-1, 1), Y.T)
    dL_dU = vY - np.dot(vvT_P, U.T)
    dL_dU *= beta_star
    dL_dU -= self.alpha * 2. * KiU * dL_dR

    dL_dthetaL = likelihood.exact_inference_gradients(dL_dR)
    dL_dthetaL += 0.5 * alpha_const_term * num_data / sigma_n

    grad_dict = {'dL_dKmm': dL_dK,
                 'dL_dKdiag': dL_dR * self.alpha,
                 'dL_dKnm': dL_dU.T,
                 'dL_dthetaL': dL_dthetaL}

    # construct a posterior object
    post = Posterior(woodbury_inv=Kmmi - P, woodbury_vector=v, K=Kmm,
                     mean=None, cov=None, K_chol=L)

    return post, log_marginal, grad_dict
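# --- Hedged sketch (illustrative helper, not part of the original code) ---
# In the PEP inference above, the effective per-point noise variance is
#   sigma_star = sigma_n + alpha * (diag(Knn) - diag(Qnn)),   Qnn = Knm Kmm^-1 Kmn,
# which interpolates between FITC-style and VFE-style behaviour through alpha.
# A minimal NumPy computation of beta_star under those definitions:
import numpy as np

def _sketch_pep_beta_star(Kmm, Knm, Knn_diag, sigma_n, alpha):
    L = np.linalg.cholesky(Kmm + 1e-8 * np.eye(Kmm.shape[0]))
    LiUT = np.linalg.solve(L, Knm.T)               # L^-1 Kmn
    qnn_diag = np.sum(np.square(LiUT), axis=0)     # diag of Knm Kmm^-1 Kmn
    sigma_star = sigma_n + alpha * (Knn_diag - qnn_diag)
    return 1.0 / sigma_star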
def calculate_gradients(self, q_U, p_U_new, p_U_old, p_U_var, q_F, VE_dm, VE_dv,
                        Ntask, M, Q, D, f_index, d_index, q):
    """
    Calculates gradients of the log-marginal distribution p(Y) wrt variational
    parameters mu_q, S_q
    """
    # Algebra for q(u):
    m_u = q_U.mu_u.copy()
    L_u = choleskies.flat_to_triang(q_U.chols_u.copy())
    S_u = np.empty((Q, M, M))
    [np.dot(L_u[q, :, :], L_u[q, :, :].T, S_u[q, :, :]) for q in range(Q)]
    S_qi, _ = linalg.dpotri(np.asfortranarray(L_u[q, :, :]))

    if np.any(np.isinf(S_qi)):
        raise ValueError("Sqi: Cholesky representation unstable")

    # Algebra for p(u)
    Kuu_new = p_U_new.Kuu.copy()
    Luu_new = p_U_new.Luu.copy()
    Kuui_new = p_U_new.Kuui.copy()
    Kuu_old = p_U_old.Kuu.copy()
    Luu_old = p_U_old.Luu.copy()
    Kuui_old = p_U_old.Kuui.copy()
    Mu_var = p_U_var.Mu.copy()
    Kuu_var = p_U_var.Kuu.copy()
    Luu_var = p_U_var.Luu.copy()
    Kuui_var = p_U_var.Kuui.copy()

    # KL Terms
    dKLnew_dmu_q = np.dot(Kuui_new[q, :, :], m_u[:, q, None])
    dKLnew_dS_q = 0.5 * (Kuui_new[q, :, :] - S_qi)
    dKLold_dmu_q = np.dot(Kuui_old[q, :, :], m_u[:, q, None])
    dKLold_dS_q = 0.5 * (Kuui_old[q, :, :] - S_qi)
    dKLvar_dmu_q = np.dot(Kuui_var[q, :, :], (m_u[:, q, None] - Mu_var[q, :, :]))  # important!! (Eq. 69 MCB)
    dKLvar_dS_q = 0.5 * (Kuui_var[q, :, :] - S_qi)

    dKLnew_dKqq = 0.5 * Kuui_new[q, :, :] - 0.5 * Kuui_new[q, :, :].dot(S_u[q, :, :]).dot(Kuui_new[q, :, :]) \
        - 0.5 * np.dot(Kuui_new[q, :, :], np.dot(m_u[:, q, None], m_u[:, q, None].T)).dot(Kuui_new[q, :, :].T)
    dKLold_dKqq = 0.5 * Kuui_old[q, :, :] - 0.5 * Kuui_old[q, :, :].dot(S_u[q, :, :]).dot(Kuui_old[q, :, :]) \
        - 0.5 * np.dot(Kuui_old[q, :, :], np.dot(m_u[:, q, None], m_u[:, q, None].T)).dot(Kuui_old[q, :, :].T)

    # dKLvar_dKqq = 0.5 * Kuui_var[q,:,:] - 0.5 * Kuui_var[q,:,:].dot(S_u[q, :, :]).dot(Kuui_var[q,:,:]) \
    #     - 0.5 * np.dot(Kuui_var[q,:,:], np.dot(m_u[:, q, None], m_u[:, q, None].T)).dot(Kuui_var[q,:,:].T) \
    #     + 0.5 * np.dot(Kuui_var[q,:,:], np.dot(m_u[:,q,None], Mu_var[q,:,:].T)).dot(Kuui_var[q,:,:].T) \
    #     + 0.5 * np.dot(Kuui_var[q,:,:], np.dot(Mu_var[q,:,:], m_u[:,q,None].T)).dot(Kuui_var[q,:,:].T) \
    #     - 0.5 * np.dot(Kuui_var[q,:,:], np.dot(Mu_var[q,:,:], Mu_var[q,:,:].T)).dot(Kuui_var[q,:,:].T)

    # KLvar += 0.5 * np.sum(Kuui_var[q, :, :] * S_u[q, :, :]) \
    #     + 0.5 * np.dot((Mu_var[q, :, :] - m_u[:, q, None]).T, np.dot(Kuui_var[q, :, :], (Mu_var[q, :, :] - m_u[:, q, None]))) \
    #     - 0.5 * M \
    #     + 0.5 * 2. * np.sum(np.log(np.abs(np.diag(Luu_var[q, :, :])))) \
    #     - 0.5 * 2. * np.sum(np.log(np.abs(np.diag(L_u[q, :, :]))))

    # VE Terms
    dVE_dmu_q = np.zeros((M, 1))
    dVE_dS_q = np.zeros((M, M))
    dVE_dKqq = np.zeros((M, M))
    dVE_dKqd = []
    dVE_dKdiag = []

    for d, q_fd in enumerate(q_F):
        Nt = Ntask[f_index[d]]
        dVE_dmu_q += np.dot(q_fd.Afdu[q, :, :].T, VE_dm[f_index[d]][:, d_index[d]])[:, None]
        Adv = q_fd.Afdu[q, :, :].T * VE_dv[f_index[d]][:, d_index[d], None].T
        Adv = np.ascontiguousarray(Adv)
        AdvA = np.dot(Adv.reshape(-1, Nt), q_fd.Afdu[q, :, :]).reshape(M, M)
        dVE_dS_q += AdvA

        # Derivatives dKuquq
        tmp_dv = np.dot(AdvA, S_u[q, :, :]).dot(Kuui_new[q, :, :])
        dVE_dKqq += AdvA - tmp_dv - tmp_dv.T
        Adm = np.dot(q_fd.Afdu[q, :, :].T, VE_dm[f_index[d]][:, d_index[d], None])
        dVE_dKqq += -np.dot(Adm, np.dot(Kuui_new[q, :, :], m_u[:, q, None]).T)

        # Derivatives dKuqfd
        tmp = np.dot(S_u[q, :, :], Kuui_new[q, :, :])
        tmp = 2. * (tmp - np.eye(M))
        dve_kqd = np.dot(np.dot(Kuui_new[q, :, :], m_u[:, q, None]),
                         VE_dm[f_index[d]][:, d_index[d], None].T)
        dve_kqd += np.dot(tmp.T, Adv)
        dVE_dKqd.append(dve_kqd)

        # Derivatives dKdiag
        dVE_dKdiag.append(VE_dv[f_index[d]][:, d_index[d]])

    dVE_dKqq = 0.5 * (dVE_dKqq + dVE_dKqq.T)

    # Derivatives of variational parameters
    dL_dmu_q = dVE_dmu_q - dKLnew_dmu_q + dKLold_dmu_q - dKLvar_dmu_q
    dL_dS_q = dVE_dS_q - dKLnew_dS_q + dKLold_dS_q - dKLvar_dS_q

    # Derivatives of prior hyperparameters:
    # if using Zgrad, dL_dKqq = dVE_dKqq - dKLnew_dKqq + dKLold_dKqq - dKLvar_dKqq
    # otherwise, for hyperparameters: dL_dKqq = dVE_dKqq - dKLnew_dKqq
    dL_dKqq = dVE_dKqq - dKLnew_dKqq  # + dKLold_dKqq - dKLvar_dKqq
    # dKLold_dKqq only applies for Zgrad; dKLvar_dKqq to be done (for Zgrad)
    dL_dKdq = dVE_dKqd
    dL_dKdiag = dVE_dKdiag

    # Pass S_q gradients to its lower-triangular representation L_q
    chol_u = q_U.chols_u.copy()
    L_q = choleskies.flat_to_triang(chol_u[:, q:q + 1])
    dL_dL_q = 2. * np.array([np.dot(a, b) for a, b in zip(dL_dS_q[None, :, :], L_q)])
    dL_dL_q = choleskies.triang_to_flat(dL_dL_q)

    # Posterior
    posterior_q = Posterior(mean=m_u[:, q, None], cov=S_u[q, :, :], K=Kuu_new[q, :, :],
                            prior_mean=np.zeros(m_u[:, q, None].shape))

    return dL_dmu_q, dL_dL_q, dL_dS_q, posterior_q, dL_dKqq, dL_dKdq, dL_dKdiag
def inference(self, q_u_means, q_u_chols, X, Y, Z, kern_list, likelihood, B_list,
              Y_metadata, KL_scale=1.0, batch_scale=None):
    M = Z.shape[0]
    T = len(Y)
    if batch_scale is None:
        batch_scale = [1.0] * T
    Ntask = [Y[t].shape[0] for t in range(T)]
    Q = len(kern_list)
    D = likelihood.num_output_functions(Y_metadata)
    Kuu, Luu, Kuui = util.latent_funs_cov(Z, kern_list)
    p_U = pu(Kuu=Kuu, Luu=Luu, Kuui=Kuui)
    q_U = qu(mu_u=q_u_means, chols_u=q_u_chols)

    # for every latent function f_d calculate q(f_d) and keep it as q(F):
    q_F = []
    posteriors_F = []
    f_index = Y_metadata['function_index'].flatten()
    d_index = Y_metadata['d_index'].flatten()

    for d in range(D):
        Xtask = X[f_index[d]]
        q_fd = self.calculate_q_f(X=Xtask, Z=Z, q_U=q_U, p_U=p_U, kern_list=kern_list,
                                  B=B_list, M=M, N=Xtask.shape[0], Q=Q, D=D, d=d)
        # Posterior objects for output functions (used in prediction)
        posterior_fd = Posterior(mean=q_fd.m_fd.copy(), cov=q_fd.S_fd.copy(),
                                 K=util.function_covariance(X=Xtask, B=B_list,
                                                            kernel_list=kern_list, d=d),
                                 prior_mean=np.zeros(q_fd.m_fd.shape))
        posteriors_F.append(posterior_fd)
        q_F.append(q_fd)

    mu_F = []
    v_F = []
    for t in range(T):
        mu_F_task = np.empty((Y[t].shape[0], 1))
        v_F_task = np.empty((Y[t].shape[0], 1))
        for d, q_fd in enumerate(q_F):
            if f_index[d] == t:
                mu_F_task = np.hstack((mu_F_task, q_fd.m_fd))
                v_F_task = np.hstack((v_F_task, q_fd.v_fd))
        mu_F.append(mu_F_task[:, 1:])
        v_F.append(v_F_task[:, 1:])

    # Variational Expectations
    VE = likelihood.var_exp(Y, mu_F, v_F, Y_metadata)
    VE_dm, VE_dv = likelihood.var_exp_derivatives(Y, mu_F, v_F, Y_metadata)
    for t in range(T):
        VE[t] = VE[t] * batch_scale[t]
        VE_dm[t] = VE_dm[t] * batch_scale[t]
        VE_dv[t] = VE_dv[t] * batch_scale[t]

    # KL Divergence
    KL = self.calculate_KL(q_U=q_U, p_U=p_U, M=M, Q=Q)

    # Log Marginal log(p(Y))
    F = 0
    for t in range(T):
        F += VE[t].sum()
    log_marginal = F - KL

    # Gradients and Posteriors
    dL_dmu_u = []
    dL_dL_u = []
    dL_dKmm = []
    dL_dKmn = []
    dL_dKdiag = []
    posteriors = []
    for q in range(Q):
        (dL_dmu_q, dL_dL_q, posterior_q, dL_dKqq, dL_dKdq,
         dL_dKdiag_q) = self.calculate_gradients(q_U=q_U, p_U=p_U, q_F=q_F,
                                                 VE_dm=VE_dm, VE_dv=VE_dv, Ntask=Ntask,
                                                 M=M, Q=Q, D=D, f_index=f_index,
                                                 d_index=d_index, q=q)
        dL_dmu_u.append(dL_dmu_q)
        dL_dL_u.append(dL_dL_q)
        dL_dKmm.append(dL_dKqq)
        dL_dKmn.append(dL_dKdq)
        dL_dKdiag.append(dL_dKdiag_q)
        posteriors.append(posterior_q)

    gradients = {'dL_dmu_u': dL_dmu_u,
                 'dL_dL_u': dL_dL_u,
                 'dL_dKmm': dL_dKmm,
                 'dL_dKmn': dL_dKmn,
                 'dL_dKdiag': dL_dKdiag}

    return log_marginal, gradients, posteriors, posteriors_F
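# --- Hedged sketch (standard formula, illustrative names) ---
# calculate_KL above accumulates KL(q(u_q) || p(u_q)) over the latent functions.
# For q = N(m, S) and p = N(0, K) the closed form is
#   0.5 * (tr(K^-1 S) + m^T K^-1 m - M + logdet(K) - logdet(S)).
# A self-contained NumPy version for a single latent function:
import numpy as np

def _sketch_gaussian_kl(m, S, K):
    M = K.shape[0]
    Kinv_S = np.linalg.solve(K, S)
    maha = (m.T @ np.linalg.solve(K, m)).item()
    _, logdet_K = np.linalg.slogdet(K)
    _, logdet_S = np.linalg.slogdet(S)
    return 0.5 * (np.trace(Kinv_S) + maha - M + logdet_K - logdet_S)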
def inference_root(self, kern, X, Z, likelihood, Y, Kuu_sigma=None, Y_metadata=None,
                   Lm=None, dL_dKmm=None):
    """
    The first phase of inference:
    Compute: log-likelihood, dL_dKmm

    Cached intermediate results: Kmm, KmmInv,
    """
    num_data, output_dim = Y.shape
    input_dim = Z.shape[0]
    num_data_total = allReduceArrays([np.int32(num_data)], self.mpi_comm)[0]

    uncertain_inputs = isinstance(X, VariationalPosterior)
    uncertain_outputs = isinstance(Y, VariationalPosterior)

    beta = 1. / np.fmax(likelihood.variance, 1e-6)

    psi0, psi2, YRY, psi1, psi1Y, Shalf, psi1S = self.gatherPsiStat(
        kern, X, Z, Y, beta, uncertain_inputs)

    #======================================================================
    # Compute Common Components
    #======================================================================

    try:
        Kmm = kern.K(Z).copy()
        if Kuu_sigma is not None:
            diag.add(Kmm, Kuu_sigma)
        else:
            diag.add(Kmm, self.const_jitter)
        Lm = jitchol(Kmm)

        LmInv = dtrtri(Lm)
        LmInvPsi2LmInvT = LmInv.dot(psi2.dot(LmInv.T))

        Lambda = np.eye(Kmm.shape[0]) + LmInvPsi2LmInvT
        LL = jitchol(Lambda)
        LLInv = dtrtri(LL)
        flag = np.zeros((1, ), dtype=np.int32)
        self.mpi_comm.Bcast(flag, root=self.root)
    except LinAlgError as e:
        flag = np.ones((1, ), dtype=np.int32)
        self.mpi_comm.Bcast(flag, root=self.root)
        raise e

    broadcastArrays([LmInv, LLInv], self.mpi_comm, self.root)
    LmLLInv = LLInv.dot(LmInv)

    logdet_L = 2. * np.sum(np.log(np.diag(LL)))
    b = psi1Y.dot(LmLLInv.T)
    bbt = np.square(b).sum()
    v = b.dot(LmLLInv)
    LLinvPsi1TYYTPsi1LLinvT = tdot(b.T)

    if psi1S is not None:
        psi1SLLinv = psi1S.dot(LmLLInv.T)
        bbt_sum = np.square(psi1SLLinv).sum()
        LLinvPsi1TYYTPsi1LLinvT_sum = tdot(psi1SLLinv.T)
        bbt_sum, LLinvPsi1TYYTPsi1LLinvT_sum = reduceArrays(
            [bbt_sum, LLinvPsi1TYYTPsi1LLinvT_sum], self.mpi_comm, self.root)
        bbt += bbt_sum
        LLinvPsi1TYYTPsi1LLinvT += LLinvPsi1TYYTPsi1LLinvT_sum
        psi1SP = psi1SLLinv.dot(LmLLInv)

    tmp = -LLInv.T.dot(LLinvPsi1TYYTPsi1LLinvT + output_dim * np.eye(input_dim)).dot(LLInv)
    dL_dpsi2R = LmInv.T.dot(tmp + output_dim * np.eye(input_dim)).dot(LmInv) / 2.
    broadcastArrays([dL_dpsi2R], self.mpi_comm, self.root)

    #======================================================================
    # Compute log-likelihood
    #======================================================================

    logL_R = -num_data_total * np.log(beta)
    logL = -(output_dim * (num_data_total * log_2_pi + logL_R + psi0 -
                           np.trace(LmInvPsi2LmInvT)) + YRY - bbt) / 2. \
        - output_dim * logdet_L / 2.

    #======================================================================
    # Compute dL_dKmm
    #======================================================================

    dL_dKmm = dL_dpsi2R - output_dim * LmInv.T.dot(LmInvPsi2LmInvT).dot(LmInv) / 2.

    #======================================================================
    # Compute the Posterior distribution of inducing points p(u|Y)
    #======================================================================

    wd_inv = backsub_both_sides(
        Lm,
        np.eye(input_dim) - backsub_both_sides(LL, np.identity(input_dim), transpose='left'),
        transpose='left')
    post = Posterior(woodbury_inv=wd_inv, woodbury_vector=v.T, K=Kmm,
                     mean=None, cov=None, K_chol=Lm)

    #======================================================================
    # Compute dL_dthetaL for uncertain input and non-heteroscedastic noise
    #======================================================================

    dL_dthetaL = (YRY * beta + beta * output_dim * psi0 - num_data_total * output_dim * beta) / 2. \
        - beta * (dL_dpsi2R * psi2).sum() - beta * np.trace(LLinvPsi1TYYTPsi1LLinvT)

    #======================================================================
    # Compute dL_dpsi
    #======================================================================

    dL_dpsi0 = -output_dim * (beta * np.ones((num_data, ))) / 2.

    if uncertain_outputs:
        m, s = Y.mean, Y.variance
        dL_dpsi1 = beta * (np.dot(m, v) + Shalf[:, None] * psi1SP)
    else:
        dL_dpsi1 = beta * np.dot(Y, v)

    if uncertain_inputs:
        dL_dpsi2 = beta * dL_dpsi2R
    else:
        dL_dpsi1 += np.dot(psi1, dL_dpsi2R) * 2.
        dL_dpsi2 = None

    if uncertain_inputs:
        grad_dict = {'dL_dKmm': dL_dKmm,
                     'dL_dpsi0': dL_dpsi0,
                     'dL_dpsi1': dL_dpsi1,
                     'dL_dpsi2': dL_dpsi2,
                     'dL_dthetaL': dL_dthetaL}
    else:
        grad_dict = {'dL_dKmm': dL_dKmm,
                     'dL_dKdiag': dL_dpsi0,
                     'dL_dKnm': dL_dpsi1,
                     'dL_dthetaL': dL_dthetaL}

    if uncertain_outputs:
        m, s = Y.mean, Y.variance
        psi1LmiLLi = psi1.dot(LmLLInv.T)
        LLiLmipsi1Y = b.T
        grad_dict['dL_dYmean'] = -m * beta + psi1LmiLLi.dot(LLiLmipsi1Y)
        grad_dict['dL_dYvar'] = beta / -2. + np.square(psi1LmiLLi).sum(axis=1) / 2

    return post, logL, grad_dict
def incremental_inference(self, kern, X, likelihood, Y, mean_function=None,
                          Y_metadata=None, K=None, variance=None, Z_tilde=None):
    # do incremental update
    if mean_function is None:
        m = 0
    else:
        m = mean_function.f(X)

    if variance is None:
        variance = likelihood.gaussian_variance(Y_metadata)

    YYT_factor = Y - m

    # K_tmp = kern.K(X, X[-1:])
    K_inc = kern._K[:-1, -1]
    K_inc2 = kern._K[-1:, -1]
    # self._K = np.block([[self._K, K_inc], [K_inc.T, K_inc2]])

    # Ky = K.copy()
    # variance can be given for each point individually,
    # in which case we just take the last point
    jitter = variance[-1] + 1e-8
    # diag.add(Ky, jitter)

    # LW_old = self._old_posterior.woodbury_chol
    Wi, LW, LWi, W_logdet = pdinv_inc(self._old_LW, K_inc, K_inc2 + jitter, self._old_Wi)

    alpha, _ = dpotrs(LW, YYT_factor, lower=1)

    log_marginal = 0.5 * (-Y.size * log_2_pi - Y.shape[1] * W_logdet -
                          np.sum(alpha * YYT_factor))

    if Z_tilde is not None:
        # This is a correction term for the log marginal likelihood.
        # In EP this is log Z_tilde, which is the difference between the
        # Gaussian marginal and Z_EP.
        log_marginal += Z_tilde

    dL_dK = 0.5 * (tdot(alpha) - Y.shape[1] * Wi)

    dL_dthetaL = likelihood.exact_inference_gradients(np.diag(dL_dK), Y_metadata)

    self._old_LW = LW
    self._old_Wi = Wi
    posterior = Posterior(woodbury_chol=LW, woodbury_vector=alpha, K=K)
    # TODO: add logdet to the posterior?
    return posterior, log_marginal, {
        'dL_dK': dL_dK,
        'dL_dthetaL': dL_dthetaL,
        'dL_dm': alpha
    }
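# --- Hedged sketch of an incremental Cholesky extension (illustrative) ---
# pdinv_inc above is assumed to extend an existing factorization when one
# point is appended to the kernel matrix. The standard extension: given L with
# L L^T = K, a new column k and diagonal entry kappa, the factor of
# [[K, k], [k^T, kappa]] appends the row [c^T, d] with
#   c = L^-1 k   and   d = sqrt(kappa - c^T c).
import numpy as np
from scipy.linalg import solve_triangular

def _sketch_chol_append(L, k, kappa):
    c = solve_triangular(L, k, lower=True)
    d = np.sqrt(kappa - c @ c)
    n = L.shape[0]
    L_new = np.zeros((n + 1, n + 1))
    L_new[:n, :n] = L
    L_new[n, :n] = c
    L_new[n, n] = d
    return L_new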
def posteriors_F(self, Xnew, which_out=None):
    # This function returns all the q(f*) posteriors associated to each output.
    # We assume that Xnew can be a list of length equal to the number of
    # likelihoods defined for the HetMOGP, or a numpy array that we replicate
    # for each output.
    if isinstance(Xnew, list):
        Xmulti_all_new = Xnew
    else:
        Xmulti_all_new = []
        for i in range(self.num_output_funcs):
            Xmulti_all_new.append(Xnew.copy())

    M = self.Z.shape[0]
    Q = len(self.kern_list)
    D = self.likelihood.num_output_functions(self.Y_metadata)
    Kuu, Luu, Kuui = util.VIK_covariance(self.Z, self.kern_list, self.kern_list_Tq, self.kern_aux)
    p_U = pu(Kuu=Kuu, Luu=Luu, Kuui=Kuui)
    q_U = qu(mu_u=self.q_u_means, chols_u=self.q_u_chols)
    S_u = np.empty((Q, M, M))
    L_u = choleskies.flat_to_triang(q_U.chols_u.copy())
    [np.dot(L_u[q, :, :], L_u[q, :, :].T, S_u[q, :, :]) for q in range(Q)]

    # for every latent function f_d calculate q(f_d) and keep it as q(F):
    posteriors_F = []
    f_index = self.Y_metadata['function_index'].flatten()
    d_index = self.Y_metadata['d_index'].flatten()

    if which_out is None:
        indix_aux = f_index.copy()
    else:
        which_out = np.array(which_out)
        indix_aux = -1 * np.ones_like(f_index)
        for i in range(which_out.shape[0]):
            posix = np.where(f_index == which_out[i])
            indix_aux[posix] = f_index[posix].copy()

    for d in range(D):
        if f_index[d] == indix_aux[d]:
            Xtask = Xmulti_all_new[f_index[d]]
            q_fd, _ = self.inference_method.calculate_q_f(X=Xtask, Z=self.Z, q_U=q_U,
                                                          S_u=S_u, p_U=p_U,
                                                          kern_list=self.kern_list,
                                                          kern_list_Gdj=self.kern_list_Gdj,
                                                          kern_list_Tq=self.kern_list_Tq,
                                                          kern_aux=self.kern_aux,
                                                          B=self.B_list, M=M,
                                                          N=Xtask.shape[0], Q=Q, D=D, d=d)
            # Posterior objects for output functions (used in prediction)
            posterior_fd = Posterior(mean=q_fd.m_fd.copy(), cov=q_fd.S_fd.copy(),
                                     K=util.function_covariance(X=Xtask, B=self.B_list,
                                                                kernel_list=self.kern_list, d=d),
                                     prior_mean=np.zeros(q_fd.m_fd.shape))
            posteriors_F.append(posterior_fd)
        else:
            # posteriors_F.append(fake_posterior)
            posteriors_F.append([])

    return posteriors_F
def _outer_loop_for_missing_data(self):
    Lm = None
    dL_dKmm = None

    self._log_marginal_likelihood = 0
    self.full_values = self._outer_init_full_values()

    if self.posterior is None:
        woodbury_inv = np.zeros((self.num_inducing, self.num_inducing, self.output_dim))
        woodbury_vector = np.zeros((self.num_inducing, self.output_dim))
    else:
        woodbury_inv = self.posterior._woodbury_inv
        woodbury_vector = self.posterior._woodbury_vector

    if not self.stochastics:
        m_f = lambda i: "Inference with missing_data: {: >7.2%}".format(float(i + 1) / self.output_dim)
        message = m_f(-1)
        print(message, end=' ')

    for d, ninan in self.stochastics.d:
        if not self.stochastics:
            print(' ' * (len(message)) + '\r', end=' ')
            message = m_f(d)
            print(message, end=' ')

        psi0ni = self.psi0[ninan]
        psi1ni = self.psi1[ninan]

        if self.has_uncertain_inputs():
            psi2ni = self.psi2[ninan]
            value_indices = dict(outputs=d, samples=ninan,
                                 dL_dpsi0=ninan, dL_dpsi1=ninan, dL_dpsi2=ninan)
        else:
            psi2ni = None
            value_indices = dict(outputs=d, samples=ninan,
                                 dL_dKdiag=ninan, dL_dKnm=ninan)

        posterior, log_marginal_likelihood, grad_dict = self._inner_parameters_changed(
            self.kern, self.X[ninan], self.Z, self.likelihood,
            self.Y_normalized[ninan][:, d], self.Y_metadata, Lm, dL_dKmm,
            psi0=psi0ni, psi1=psi1ni, psi2=psi2ni)

        # Fill out the full values by adding in the appropriate grad_dict values
        self._inner_take_over_or_update(self.full_values, grad_dict, value_indices)
        self._inner_values_update(grad_dict)  # What is this for? -> MRD

        woodbury_inv[:, :, d] = posterior.woodbury_inv[:, :, None]
        woodbury_vector[:, d] = posterior.woodbury_vector
        self._log_marginal_likelihood += log_marginal_likelihood

    if not self.stochastics:
        print('')

    if self.posterior is None:
        self.posterior = Posterior(woodbury_inv=woodbury_inv, woodbury_vector=woodbury_vector,
                                   K=posterior._K, mean=None, cov=None,
                                   K_chol=posterior.K_chol)
    self._outer_values_update(self.full_values)

    if self.has_uncertain_inputs():
        self.kern.return_psi2_n = False