def _inference(K: np.ndarray, ga_approx: GaussianApproximation, cav_params: CavityParams,
               Z_tilde: float, y: List[Tuple[int, float]],
               yc: List[List[Tuple[int, int]]]) -> Tuple[Posterior, float, Dict]:
    """
    Compute the posterior approximation.

    :param K: prior covariance matrix
    :param ga_approx: Gaussian approximation of the batches
    :param cav_params: cavity parameters of the posterior
    :param Z_tilde: log marginal likelihood
    :param y: direct observations as a list of tuples, each giving a location index (row in X) and an observation value
    :param yc: batch comparisons as a list of lists of tuples; each batch is a list, and each tuple is a comparison (winner index, loser index)
    :return: a tuple of the posterior approximation, the log marginal likelihood and a gradient dictionary
    """
    log_marginal, post_params = _ep_marginal(K, ga_approx, Z_tilde, y, yc)
    tau_tilde_root = sqrtm_block(ga_approx.tau, y, yc)
    Sroot_tilde_K = np.dot(tau_tilde_root, K)

    # alpha = (K + Sigma_tilde)^{-1} mu_tilde
    aux_alpha, _ = dpotrs(post_params.L, np.dot(Sroot_tilde_K, ga_approx.v), lower=1)
    alpha = (ga_approx.v - np.dot(tau_tilde_root, aux_alpha))[:, None]

    # Wi = (K + Sigma_tilde)^{-1}
    LWi, _ = dtrtrs(post_params.L, tau_tilde_root, lower=1)
    Wi = np.dot(LWi.T, LWi)
    symmetrify(Wi)

    dL_dK = 0.5 * (tdot(alpha) - Wi)
    dL_dthetaL = 0
    return (Posterior(woodbury_inv=np.asfortranarray(Wi), woodbury_vector=alpha, K=K),
            log_marginal,
            {'dL_dK': dL_dK, 'dL_dthetaL': dL_dthetaL, 'dL_dm': alpha})
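# Hedged sketch: a plain NumPy/SciPy check of the identity the Cholesky route above relies
# on, assuming (as in standard EP, cf. Rasmussen & Williams, Alg. 3.5) that post_params.L
# is the Cholesky factor of B = I + S^{1/2} K S^{1/2} and that sqrtm_block returns a
# symmetric square root of the block site precision S. The function name and all inputs
# below are illustrative stand-ins, not part of the module.
def _block_ep_alpha_check(n=5, seed=0):
    import numpy as np
    from scipy.linalg import sqrtm, cho_solve

    rng = np.random.default_rng(seed)
    M = rng.standard_normal((n, n)); K = M @ M.T + n * np.eye(n)   # SPD prior covariance
    N = rng.standard_normal((n, n)); S = N @ N.T + n * np.eye(n)   # SPD site precision (a full "batch" block)
    v = rng.standard_normal(n)                                     # natural mean parameter, v = S mu_tilde

    R = sqrtm(S).real                                # stand-in for sqrtm_block(ga_approx.tau, y, yc)
    L = np.linalg.cholesky(np.eye(n) + R @ K @ R)    # stand-in for post_params.L

    aux = cho_solve((L, True), R @ K @ v)            # mirrors dpotrs(post_params.L, ..., lower=1)
    alpha = v - R @ aux

    # direct form: alpha = (K + S^{-1})^{-1} mu_tilde
    mu_tilde = np.linalg.solve(S, v)
    assert np.allclose(alpha, np.linalg.solve(K + np.linalg.inv(S), mu_tilde))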
def compute_dl_dK(posterior, K, eta, theta, prior_mean=0):
    """Compute the gradient of the log marginal likelihood with respect to the prior
    covariance K from the EP site parameters (eta = v, theta = tau).
    Note: prior_mean is currently unused."""
    tau, v = theta, eta
    tau_tilde_root = np.sqrt(tau)
    Sroot_tilde_K = tau_tilde_root[:, None] * K

    # alpha = (K + Sigma_tilde)^{-1} mu_tilde
    aux_alpha, _ = dpotrs(posterior.L, np.dot(Sroot_tilde_K, v), lower=1)
    alpha = (v - tau_tilde_root * aux_alpha)[:, None]

    # Wi = (K + Sigma_tilde)^{-1}
    LWi, _ = dtrtrs(posterior.L, np.diag(tau_tilde_root), lower=1)
    Wi = np.dot(LWi.T, LWi)
    symmetrify(Wi)

    dL_dK = 0.5 * (tdot(alpha) - Wi)
    return dL_dK
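# Hedged sketch: a finite-difference check of the closed form above, assuming the
# K-dependent part of the EP marginal is log N(mu_tilde; 0, K + Sigma_tilde) with
# Sigma_tilde = diag(1/tau) and mu_tilde = v/tau. The helper name _fd_check_dl_dK is
# illustrative, not part of the library; the closed form is recomputed with dense
# solves rather than by calling compute_dl_dK, to keep the sketch self-contained.
def _fd_check_dl_dK(n=4, eps=1e-5, seed=0):
    import numpy as np
    from scipy.stats import multivariate_normal

    rng = np.random.default_rng(seed)
    M = rng.standard_normal((n, n)); K = M @ M.T + n * np.eye(n)
    tau = rng.uniform(0.5, 2.0, n); v = rng.standard_normal(n)
    mu_tilde, Sigma_tilde = v / tau, np.diag(1.0 / tau)

    def logZ(K_):
        return multivariate_normal(mean=np.zeros(n), cov=K_ + Sigma_tilde).logpdf(mu_tilde)

    # closed form: 0.5 * (alpha alpha^T - (K + Sigma_tilde)^{-1})
    alpha = np.linalg.solve(K + Sigma_tilde, mu_tilde)
    dL_dK = 0.5 * (np.outer(alpha, alpha) - np.linalg.inv(K + Sigma_tilde))

    num = np.zeros_like(K)
    for i in range(n):
        for j in range(n):
            E = np.zeros_like(K); E[i, j] = eps; E[j, i] += eps  # symmetric perturbation
            num[i, j] = (logZ(K + E) - logZ(K - E)) / (4 * eps)
    assert np.allclose(dL_dK, num, atol=1e-6)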
def _inference(K, ga_approx, cav_params, likelihood, Z_tilde, Y_metadata=None):
    log_marginal, post_params = _ep_marginal(K, ga_approx, Z_tilde)
    tau_tilde_root = np.sqrt(ga_approx.tau)
    Sroot_tilde_K = tau_tilde_root[:, None] * K

    # alpha = (K + Sigma_tilde)^{-1} mu_tilde
    aux_alpha, _ = dpotrs(post_params.L, np.dot(Sroot_tilde_K, ga_approx.v), lower=1)
    alpha = (ga_approx.v - tau_tilde_root * aux_alpha)[:, None]

    # Wi = (K + Sigma_tilde)^{-1}
    LWi, _ = dtrtrs(post_params.L, np.diag(tau_tilde_root), lower=1)
    Wi = np.dot(LWi.T, LWi)
    symmetrify(Wi)

    dL_dK = 0.5 * (tdot(alpha) - Wi)
    dL_dthetaL = 0  # likelihood-parameter gradients are not propagated here
    return (Posterior(woodbury_inv=Wi, woodbury_vector=alpha, K=K),
            log_marginal,
            {'dL_dK': dL_dK, 'dL_dthetaL': dL_dthetaL, 'dL_dm': alpha})
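# Hedged sketch: the same computation with plain NumPy/SciPy stand-ins for the LAPACK
# wrappers (dpotrs -> cho_solve, dtrtrs -> solve_triangular), assuming post_params.L is
# the Cholesky factor of B = I + S^{1/2} K S^{1/2} with S = diag(ga_approx.tau). The
# function name _diag_ep_identity_check is illustrative only.
def _diag_ep_identity_check(n=5, seed=1):
    import numpy as np
    from scipy.linalg import cho_solve, solve_triangular

    rng = np.random.default_rng(seed)
    M = rng.standard_normal((n, n)); K = M @ M.T + n * np.eye(n)
    tau = rng.uniform(0.5, 2.0, n); v = rng.standard_normal(n)

    r = np.sqrt(tau)                                   # tau_tilde_root
    L = np.linalg.cholesky(np.eye(n) + (r[:, None] * K) * r[None, :])

    aux = cho_solve((L, True), (r[:, None] * K) @ v)   # mirrors dpotrs(post_params.L, ...)
    alpha = v - r * aux                                # (K + Sigma_tilde)^{-1} mu_tilde

    LWi = solve_triangular(L, np.diag(r), lower=True)  # mirrors dtrtrs(post_params.L, ...)
    Wi = LWi.T @ LWi                                   # (K + Sigma_tilde)^{-1}

    C = K + np.diag(1.0 / tau)
    assert np.allclose(alpha, np.linalg.solve(C, v / tau))
    assert np.allclose(Wi, np.linalg.inv(C))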
def woodbury_inv(self):
    """
    The inverse of the woodbury matrix; in the Gaussian likelihood case it is defined as

    $$
    (K_{xx} + \Sigma_{xx})^{-1}, \qquad
    \Sigma_{xx} := \texttt{Likelihood.variance / Approximate likelihood covariance}
    $$
    """
    if self._woodbury_inv is None:
        if self._woodbury_chol is not None:
            # invert directly from the cached Cholesky factor
            self._woodbury_inv, _ = dpotri(self._woodbury_chol, lower=1)
            symmetrify(self._woodbury_inv)
        elif self._covariance is not None:
            # recover (K + Sigma)^{-1} = K^{-1} (K - cov) K^{-1} from the posterior covariance
            B = np.atleast_3d(self._K) - np.atleast_3d(self._covariance)
            self._woodbury_inv = np.empty_like(B)
            for i in range(B.shape[-1]):
                tmp, _ = dpotrs(self.K_chol, B[:, :, i])
                self._woodbury_inv[:, :, i], _ = dpotrs(self.K_chol, tmp.T)
    return self._woodbury_inv
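# Hedged sketch: why the two branches above agree, in plain NumPy. With posterior
# covariance cov = K - K (K + Sigma)^{-1} K, the covariance branch's
# K^{-1} (K - cov) K^{-1} recovers the same (K + Sigma)^{-1} that the Cholesky branch
# obtains from dpotri. All names below are illustrative stand-ins, not library API.
def _woodbury_inv_identity_check(n=5, seed=2):
    import numpy as np

    rng = np.random.default_rng(seed)
    M = rng.standard_normal((n, n)); K = M @ M.T + n * np.eye(n)
    N = rng.standard_normal((n, n)); Sigma = N @ N.T + np.eye(n)

    woodbury = np.linalg.inv(K + Sigma)   # what dpotri(self._woodbury_chol) yields
    cov = K - K @ woodbury @ K            # Gaussian posterior covariance
    Kinv = np.linalg.inv(K)
    assert np.allclose(Kinv @ (K - cov) @ Kinv, woodbury)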
def pdinv_inc(L_old, A_inc, A_inc2, Ai_old, *args):
    """
    Similar to pdinv, but uses the old Cholesky decomposition to compute the new one,
    as proposed in https://github.com/SheffieldML/GPy/issues/464#issuecomment-285500122

    :rval Ai: the inverse of A
    :rtype Ai: np.ndarray
    :rval L: the Cholesky decomposition of A
    :rtype L: np.ndarray
    :rval Li: the Cholesky decomposition of Ai (set to None for now, because not needed)
    :rtype Li: np.ndarray
    :rval logdet: the log of the determinant of A
    :rtype logdet: float64
    """
    # extend the old Cholesky factor with the new row [u^T, v]
    u = dtrtrs(L_old, A_inc, lower=1)[0]
    v = np.sqrt(A_inc2 - np.sum(u * u)).reshape(1)
    z = np.zeros((A_inc.shape[0], 1))
    L = np.asfortranarray(np.block([[L_old, z], [u, v]]))
    logdet = 2. * np.sum(np.log(np.diag(L)))

    # incrementally update the inverse via the block-matrix inversion formula
    alpha = Ai_old.dot(A_inc).reshape((-1, 1))
    gamma = alpha.dot(alpha.T)
    beta = 1. / (A_inc2 - alpha.T.dot(A_inc)).reshape(-1)
    beta_alpha = -beta * alpha
    Ai = np.block([[Ai_old + beta * gamma, beta_alpha],
                   [beta_alpha.T, beta]])
    symmetrify(Ai)
    # could also return Li as in pdinv, but it is not needed right now
    return Ai, L, None, logdet
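# Hedged sketch: a plain-NumPy version of the bordered update above, checked against a
# from-scratch Cholesky and inverse of the extended matrix. _pdinv_inc_check is an
# illustrative name; solve_triangular stands in for dtrtrs.
def _pdinv_inc_check(n=5, seed=3):
    import numpy as np
    from scipy.linalg import solve_triangular

    rng = np.random.default_rng(seed)
    M = rng.standard_normal((n + 1, n + 1)); A = M @ M.T + (n + 1) * np.eye(n + 1)
    A_old, a, a2 = A[:n, :n], A[:n, n], A[n, n]   # old block, new column, new diagonal
    L_old = np.linalg.cholesky(A_old)
    Ai_old = np.linalg.inv(A_old)

    # extend the Cholesky factor with the new row [u^T, v]
    u = solve_triangular(L_old, a, lower=True)
    v = np.sqrt(a2 - u @ u)
    L = np.block([[L_old, np.zeros((n, 1))], [u[None, :], np.array([[v]])]])

    # block-matrix inversion formula, with beta the inverse Schur complement
    alpha = (Ai_old @ a)[:, None]
    beta = 1.0 / (a2 - a @ Ai_old @ a)
    Ai = np.block([[Ai_old + beta * (alpha @ alpha.T), -beta * alpha],
                   [-beta * alpha.T, np.array([[beta]])]])

    assert np.allclose(L, np.linalg.cholesky(A))
    assert np.allclose(Ai, np.linalg.inv(A))
    assert np.isclose(2 * np.sum(np.log(np.diag(L))), np.linalg.slogdet(A)[1])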
def inference(self, kern, X, Z, likelihood, Y, Y_metadata=None, Lm=None, dL_dKmm=None,
              fixed_covs_kerns=None, **kw):
    _, output_dim = Y.shape
    uncertain_inputs = isinstance(X, VariationalPosterior)

    # see whether we've got a different noise variance for each datum
    beta = 1. / np.fmax(likelihood.gaussian_variance(Y_metadata), 1e-6)
    het_noise = beta.size > 1
    if het_noise:
        raise NotImplementedError("Heteroscedastic noise is not implemented; it should be "
                                  "possible though, feel free to try implementing it :)")
    if beta.ndim == 1:
        beta = beta[:, None]

    # do the inference:
    num_inducing = Z.shape[0]
    num_data = Y.shape[0]

    # kernel computations, using BGPLVM notation
    Kmm = kern.K(Z).copy()
    diag.add(Kmm, self.const_jitter)
    if Lm is None:
        Lm = jitchol(Kmm)

    # the rather complex computations of A and the psi statistics
    if uncertain_inputs:
        psi0 = kern.psi0(Z, X)
        psi1 = kern.psi1(Z, X)
        if het_noise:
            psi2_beta = np.sum([kern.psi2(Z, X[i:i+1, :]) * beta_i
                                for i, beta_i in enumerate(beta)], 0)
        else:
            psi2_beta = kern.psi2(Z, X) * beta
        LmInv = dtrtri(Lm)
        A = LmInv.dot(psi2_beta.dot(LmInv.T))
    else:
        psi0 = kern.Kdiag(X)
        psi1 = kern.K(X, Z)
        tmp = psi1 * np.sqrt(beta)
        tmp, _ = dtrtrs(Lm, tmp.T, lower=1)
        A = tdot(tmp)

    # factor B
    B = np.eye(num_inducing) + A
    LB = jitchol(B)

    # Expose YYT to get additional covariates in (YYT + Kgg):
    tmp, _ = dtrtrs(Lm, psi1.T, lower=1, trans=0)
    _LBi_Lmi_psi1, _ = dtrtrs(LB, tmp, lower=1, trans=0)
    tmp, _ = dtrtrs(LB, _LBi_Lmi_psi1, lower=1, trans=1)
    Cpsi1, _ = dtrtrs(Lm, tmp, lower=1, trans=1)  # TODO: cache this

    # compute the fixed-covariates covariance:
    if fixed_covs_kerns is not None:
        K_fixed = 0
        for name, [cov, k] in fixed_covs_kerns.items():
            K_fixed += k.K(cov)
        YYT_covs = tdot(Y) + K_fixed
        data_term = beta**2 * YYT_covs
        trYYT_covs = np.trace(YYT_covs)
    else:
        data_term = beta**2 * tdot(Y)
        trYYT_covs = self.get_trYYT(Y)

    # data fit and derivative of L w.r.t. Kmm
    delit = mdot(_LBi_Lmi_psi1, data_term, _LBi_Lmi_psi1.T)
    data_fit = np.trace(delit)
    DBi_plus_BiPBi = backsub_both_sides(LB, output_dim * np.eye(num_inducing) + delit)
    if dL_dKmm is None:
        delit = -0.5 * DBi_plus_BiPBi
        delit += -0.5 * B * output_dim
        delit += output_dim * np.eye(num_inducing)
        dL_dKmm = backsub_both_sides(Lm, delit)

    # derivatives of L w.r.t. the psi statistics
    dL_dpsi0, dL_dpsi1, dL_dpsi2 = _compute_dL_dpsi(num_inducing, num_data, output_dim,
                                                    beta, Lm, data_term, Cpsi1,
                                                    DBi_plus_BiPBi, psi1, het_noise,
                                                    uncertain_inputs)

    # log marginal likelihood
    log_marginal = _compute_log_marginal_likelihood(likelihood, num_data, output_dim, beta,
                                                    het_noise, psi0, A, LB, trYYT_covs,
                                                    data_fit, Y)

    if self.save_per_dim:
        self.saved_vals = [psi0, A, LB, _LBi_Lmi_psi1, beta]

    # No heteroscedastics, so no _LBi_Lmi_psi1Vf; for the interested reader, try
    # implementing the heteroscedastic version, it should be possible.
    _LBi_Lmi_psi1Vf = None  # kept only for documentation, so you can see what it was
    # noise derivatives
    dL_dR = _compute_dL_dR(likelihood, het_noise, uncertain_inputs, LB, _LBi_Lmi_psi1Vf,
                           DBi_plus_BiPBi, Lm, A, psi0, psi1, beta, data_fit, num_data,
                           output_dim, trYYT_covs, Y, None)
    dL_dthetaL = likelihood.exact_inference_gradients(dL_dR, Y_metadata)

    # put the gradients in the right places
    if uncertain_inputs:
        grad_dict = {'dL_dKmm': dL_dKmm, 'dL_dpsi0': dL_dpsi0, 'dL_dpsi1': dL_dpsi1,
                     'dL_dpsi2': dL_dpsi2, 'dL_dthetaL': dL_dthetaL}
    else:
        grad_dict = {'dL_dKmm': dL_dKmm, 'dL_dKdiag': dL_dpsi0, 'dL_dKnm': dL_dpsi1,
                     'dL_dthetaL': dL_dthetaL}

    if fixed_covs_kerns is not None:
        # For now we do not take these gradients; they can be computed, but the maximum
        # likelihood solution is to switch off the additional covariates...
        dL_dcovs = beta * np.eye(K_fixed.shape[0]) - beta**2 * tdot(_LBi_Lmi_psi1.T)
        grad_dict['dL_dcovs'] = -.5 * dL_dcovs

    # gather what is needed for posterior prediction
    woodbury_vector = (beta * Cpsi1).dot(Y)

    # Bi = I - B^{-1}, so woodbury_inv = Lm^{-T} (I - B^{-1}) Lm^{-1}
    Bi, _ = dpotri(LB, lower=1)
    symmetrify(Bi)
    Bi = -Bi
    diag.add(Bi, 1)
    woodbury_inv = backsub_both_sides(Lm, Bi)

    # construct a posterior object
    post = Posterior(woodbury_inv=woodbury_inv, woodbury_vector=woodbury_vector, K=Kmm,
                     mean=None, cov=None, K_chol=Lm)
    return post, log_marginal, grad_dict
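# Hedged sketch: a dense NumPy check of the posterior quantities assembled above, in the
# certain-inputs, homoscedastic case, assuming the standard var-DTC identities
# woodbury_inv = Lm^{-T}(I - B^{-1})Lm^{-1} = Kmm^{-1} - (Kmm + beta Kmn Knm)^{-1} and
# woodbury_vector = beta (Kmm + beta Kmn Knm)^{-1} Kmn Y. All names below are
# illustrative stand-ins (the toy kernel plays the role of kern.K).
def _vardtc_posterior_check(n=20, m=5, beta=2.0, seed=4):
    import numpy as np

    rng = np.random.default_rng(seed)
    Zi = rng.standard_normal((m, 1)); Xi = rng.standard_normal((n, 1))
    k = lambda A, B: np.exp(-0.5 * (A - B.T) ** 2)    # toy RBF kernel
    Kmm = k(Zi, Zi) + 1e-8 * np.eye(m)                # jittered, like diag.add above
    Knm = k(Xi, Zi)
    Y = rng.standard_normal((n, 1))

    Lm = np.linalg.cholesky(Kmm)
    tmp = np.linalg.solve(Lm, np.sqrt(beta) * Knm.T)  # mirrors dtrtrs(Lm, tmp.T, lower=1)
    B = np.eye(m) + tmp @ tmp.T                       # mirrors np.eye(num_inducing) + A
    Bi = np.linalg.inv(B)

    woodbury_inv = np.linalg.solve(Lm.T, (np.eye(m) - Bi) @ np.linalg.inv(Lm))
    direct = np.linalg.inv(Kmm) - np.linalg.inv(Kmm + beta * Knm.T @ Knm)
    assert np.allclose(woodbury_inv, direct)

    # Cpsi1 route, mirroring the four triangular solves above
    LB = np.linalg.cholesky(B)
    t = np.linalg.solve(Lm, Knm.T)
    t = np.linalg.solve(LB, t)
    t = np.linalg.solve(LB.T, t)
    Cpsi1 = np.linalg.solve(Lm.T, t)
    assert np.allclose((beta * Cpsi1) @ Y,
                       beta * np.linalg.solve(Kmm + beta * Knm.T @ Knm, Knm.T @ Y))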