def approxConditionals(s, lhat, rest=None):
    """
    s: Full cov. matrix (all dimensions)
    lhat: The dimension for which to compute p(lhat | rest).

    a = lhat (unobserved indices), |a|-dimensional
    b = rest (observed indices),   |b|-dimensional

    (a, b) ~ N(M, S)

    Then:
    S_{a|b} = Saa - Sab Sbb^-1 Sba
    M_{a|b} = Ma + Sab Sbb^-1 (xb - Mb)
    """
    # s is L x L
    L = s.shape[1]
    if rest is None:
        A = list(set(range(L)) - set([lhat]))
    else:
        A = rest

    # Slice s to get the indices denoted by A, A (i.e. all rows in A and columns in A)
    sAA = sliceArr(s, A, A)
    try:
        inv_sAA = pdinv(sAA)[0]
    except np.linalg.LinAlgError:
        inv_sAA = pdinv(sAA + 1e-8 * np.eye(sAA.shape[0], sAA.shape[1]))[0]
        print("Warning: Added more jitter!")

    sc = sliceArr(s, [lhat], A)
    sInvs = np.dot(np.dot(sc, inv_sAA), sc.T)
    s_U = sliceArr(s, [lhat], [lhat]) - sInvs
    return s_U
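# --- Editor's note (not from the original source): a minimal numpy-only sketch that checks
# the conditional-covariance formula S_{a|b} = Saa - Sab Sbb^-1 Sba used above on a toy
# 3x3 covariance. sliceArr/pdinv are replaced by plain numpy indexing and np.linalg.inv
# purely for illustration; the matrix and index choice are made up.
import numpy as np

S_toy = np.array([[2.0, 0.5, 0.3],
                  [0.5, 1.5, 0.2],
                  [0.3, 0.2, 1.0]])
lhat, rest = 0, [1, 2]

Sab = S_toy[np.ix_([lhat], rest)]
Sbb = S_toy[np.ix_(rest, rest)]
Saa = S_toy[np.ix_([lhat], [lhat])]
cond_cov = Saa - Sab @ np.linalg.inv(Sbb) @ Sab.T   # conditional variance of dim 0 given dims 1, 2
print(cond_cov)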
def update_kern_grads(self):
    """
    Set the derivative of the lower bound wrt the (kernel) parameters
    """
    for i, kern in enumerate(self.kern):
        K = kern.K(self.X)
        B_inv = np.diag(1. / (self.phi[:, i] / self.variance))
        alpha = linalg.cho_solve(linalg.cho_factor(K + B_inv), self.Y)
        K_B_inv = pdinv(K + B_inv)[0]
        dL_dK = np.outer(alpha, alpha) - K_B_inv
        kern.update_gradients_full(dL_dK=dL_dK, X=self.X)

    # variance gradient
    grad_Lm_variance = 0.0
    for i, kern in enumerate(self.kern):
        K = kern.K(self.X)
        I = np.eye(self.N)
        B_inv = np.diag(1. / ((self.phi[:, i] + 1e-6) / self.variance))
        alpha = np.linalg.solve(K + B_inv, self.Y)
        K_B_inv = pdinv(K + B_inv)[0]
        dL_dB = np.outer(alpha, alpha) - K_B_inv
        grad_B_inv = np.diag(1. / (self.phi[:, i] + 1e-6))
        grad_Lm_variance += 0.5 * np.trace(np.dot(dL_dB, grad_B_inv))

    self.variance.gradient = grad_Lm_variance
def woodbury_chol(self):
    """
    return $L_{W}$ where $L_{W}$ is the lower triangular Cholesky decomposition of the Woodbury matrix
    $$
    L_{W}L_{W}^{\top} = W^{-1}, \qquad W^{-1} := \texttt{Woodbury inv}
    $$
    """
    if self._woodbury_chol is None:
        # compute woodbury chol from the available quantities
        if self._woodbury_inv is not None:
            winv = np.atleast_3d(self._woodbury_inv)
            self._woodbury_chol = np.zeros(winv.shape)
            for p in range(winv.shape[-1]):
                self._woodbury_chol[:, :, p] = pdinv(winv[:, :, p])[2]
        elif self._covariance is not None:
            raise NotImplementedError("TODO: check code here")
            B = self._K - self._covariance
            tmp, _ = dpotrs(self.K_chol, B)
            self._woodbury_inv, _ = dpotrs(self.K_chol, tmp.T)
            _, _, self._woodbury_chol, _ = pdinv(self._woodbury_inv)
        else:
            raise ValueError("insufficient information to compute posterior")
    return self._woodbury_chol
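# --- Editor's note (not from the original source): the snippets in this listing repeatedly
# index into pdinv's return tuple, so here is a short sanity check of the return order they
# assume, namely pdinv(A) -> (inverse, lower Cholesky factor, inverse of that factor, log det),
# with pdinv imported from GPy.util.linalg as in the surrounding code. The matrix is made up.
import numpy as np
from GPy.util.linalg import pdinv

A = np.array([[2.0, 0.3],
              [0.3, 1.0]])                          # small SPD matrix for illustration
Ai, L, Li, logdet = pdinv(A)
assert np.allclose(Ai, np.linalg.inv(A))            # [0]: inverse of A
assert np.allclose(L @ L.T, A)                      # [1]: lower Cholesky factor of A
assert np.allclose(Li @ L, np.eye(2))               # [2]: inverse of that Cholesky factor
assert np.isclose(logdet, np.linalg.slogdet(A)[1])  # [3]: log-determinant of A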
def _update_batch(self, eta: float, delta: float, post_params: posteriorParams,
                  marg_moments: MarginalMoments, batch: List[int],
                  get_logger: Callable = None, sigma2s: np.ndarray = None):
    """
    Computes new gaussian approximation for a batch given posterior and marginal moments.
    See e.g. 3.59 in http://www.gaussianprocess.org/gpml/chapters/RW.pdf

    :param eta: parameter for fractional updates.
    :param delta: damping updates factor.
    :param post_params: Posterior approximation
    :param marg_moments: Marginal moments at this iteration
    :param batch: list of indices of the parameters to be updated
    :param get_logger: Function for receiving the logger where the prints are forwarded.
    """
    sigma_hat_inv, _, _, _ = pdinv(marg_moments.sigma2_hat[np.ix_(batch, batch)])
    post_sigma_inv, _, _, _ = pdinv(post_params.Sigma[np.ix_(batch, batch)])

    tmp0 = sigma_hat_inv - post_sigma_inv
    delta_tau = delta / eta * tmp0
    delta_v = delta / eta * (np.dot(marg_moments.mu_hat[batch], sigma_hat_inv)
                             - np.dot(post_params.mu[batch], post_sigma_inv))

    tau_tilde_prev = self.tau[np.ix_(batch, batch)]
    tmp = (1 - delta) * self.tau[np.ix_(batch, batch)] + delta_tau

    # Let us make sure that sigma_hat_inv - post_sigma_inv is positive definite
    tmp, added_value = nearestPD.nearest_pd.nearestPD(tmp)
    update = True
    if (added_value > 1) and (sigma2s is not None):
        update = False
        sigma2s *= 1.05
        if get_logger is not None:
            get_logger().error('Increasing batch noise. Not updating gaussian approximation ({})'.format(sigma2s[0]))
    if update:
        self.tau[np.ix_(batch, batch)] = tmp
        self.v[batch] = (1 - delta) * self.v[batch] + delta_v
    return (delta_tau, delta_v), sigma2s
def update_model(self, xvals, zvals, incremental=True):
    assert self.xvals is not None
    assert self.zvals is not None

    Kx = self.kern.K(self.xvals, xvals)

    # Update K matrix
    self._K = np.block([[self._K, Kx],
                        [Kx.T, self.kern.K(xvals, xvals)]])

    # Update internal data
    self.xvals = np.vstack([self.xvals, xvals])
    self.zvals = np.vstack([self.zvals, zvals])

    # Update woodbury inverse, either incrementally or from scratch
    if incremental:
        Pinv = self.woodbury_inv
        Q = Kx
        R = Kx.T
        S = self.kern.K(xvals, xvals)

        M = S - np.dot(np.dot(R, Pinv), Q)
        # Adds some additional noise to ensure well-conditioned
        diag.add(M, self.noise + 1e-8)
        M, _, _, _ = pdinv(M)

        Pnew = Pinv + np.dot(np.dot(np.dot(np.dot(Pinv, Q), M), R), Pinv)
        Qnew = -np.dot(np.dot(Pinv, Q), M)
        Rnew = -np.dot(np.dot(M, R), Pinv)
        Snew = M

        self._woodbury_inv = np.block([[Pnew, Qnew],
                                       [Rnew, Snew]])
    else:
        Ky = self.K.copy()
        # Adds some additional noise to ensure well-conditioned
        diag.add(Ky, self.noise + 1e-8)
        Wi, LW, LWi, W_logdet = pdinv(Ky)
        self._woodbury_inv = Wi

    self._woodbury_vector = np.dot(self.woodbury_inv, self.zvals)
    self._woodbury_chol = None
    self._mean = None
    self._covariance = None
    self._prior_mean = 0.
    self._K_chol = None
def bound(self):
    """
    Compute the lower bound on the marginal likelihood (conditioned on the
    GP hyper parameters).
    """
    GP_bound = 0.0

    for i, kern in enumerate(self.kern):
        K = kern.K(self.X)
        B_inv = np.diag(1. / ((self.phi[:, i] + 1e-6) / self.variance))

        # Make more stable using cholesky factorization:
        Bi, LB, LBi, Blogdet = pdinv(K + B_inv)

        # Data fit
        # alpha = linalg.cho_solve(linalg.cho_factor(K + B_inv), self.Y)
        # GP_bound += -0.5 * np.dot(self.Y.T, alpha).trace()
        GP_bound -= .5 * dpotrs(LB, self.YYT)[0].trace()

        # Penalty
        # GP_bound += -0.5 * np.linalg.slogdet(K + B_inv)[1]
        GP_bound -= 0.5 * Blogdet

        # Constant, weighted by model assignment per point
        # GP_bound += -0.5 * (self.phi[:, i] * np.log(2 * np.pi * self.variance)).sum()
        GP_bound -= .5 * self.D * np.einsum('j,j->', self.phi[:, i],
                                            np.log(2 * np.pi * self.variance))

    return GP_bound + self.mixing_prop_bound() + self.H
def IntKKNorm(self, x1, x2, mu, sigma):
    r"""Compute \int k(x1,x') k(x',x2) Normal_x'(\mu, \Sigma) dx'.

    Parameters
    ----------
    x1 : array, size (n1, n_dim)
    x2 : array, size (n2, n_dim)
    mu : array, size (n_dim)
        The mean of the Gaussian distribution.
    sigma : array, size (n_dim, n_dim)
        The covariance of the Gaussian distribution.

    Returns
    -------
    res : array, size (n1, n2)
    """
    ndim = self.input_dim
    var = self.variance
    ell2 = np.ones((ndim, )) * self.lengthscale**2
    sqrt_det = np.sqrt(np.prod(ell2))

    cov = sigma + 0.5 * np.diag(ell2)
    cov_inv, _, _, ld = pdinv(cov)

    x_shift = 0.5 * (x1[:, None] + x2[None, :]) - mu
    arg = np.sum(x_shift * np.matmul(x_shift, cov_inv), axis=2)
    k1 = var * np.exp(-0.5 * arg)
    k2 = self.K(x1 / np.sqrt(2), x2 / np.sqrt(2))

    const = np.exp(-0.5 * ld) * sqrt_det / np.power(2, ndim / 2.0)
    return const * k1 * k2
def inference(self, kern, X, likelihood, Y, mean_function=None, Y_metadata=None,
              K=None, variance=None, Z_tilde=None, A=None):
    """
    Returns a Posterior class containing essential quantities of the posterior.
    The comments below correspond to Alg 2.1 in the GPML textbook.
    """
    # print('ExactGaussianInferenceGroup inference:')
    if mean_function is None:
        m = 0
    else:
        m = mean_function.f(X)

    if variance is None:
        variance = likelihood.gaussian_variance(Y_metadata)

    YYT_factor = Y - m

    # NOTE: change K to A K A^T
    if K is None:
        if A is None:
            A = np.identity(X.shape[0])
        K = A.dot(kern.K(X)).dot(A.T)  # A_t k(X_t, X_t) A_t^T
    else:
        raise NotImplementedError('Need to be extended to group case!')

    Ky = K.copy()
    diag.add(Ky, variance + 1e-8)  # A_t k(X_t, X_t) A_t^T + sigma^2 I

    # pdinv:
    # Wi: inverse of Ky
    # LW: the Cholesky decomposition of Ky -> L
    # LWi: the Cholesky decomposition of Kyi (not used)
    # W_logdet: the log of the determinant of Ky
    Wi, LW, LWi, W_logdet = pdinv(Ky)

    # LAPACK: DPOTRS solves a system of linear equations A*X = B with a symmetric
    # positive definite matrix A using the Cholesky factorization
    # A = U**T*U or A = L*L**T computed by DPOTRF.
    alpha, _ = dpotrs(LW, YYT_factor, lower=1)
    # so this gives (A_t k(X_t, X_t) A_t^T + sigma^2 I)^{-1} (Y_t - m)

    # Note: 20210827 confirm the log marginal likelihood
    log_marginal = 0.5 * (-Y.size * log_2_pi - Y.shape[1] * W_logdet
                          - np.sum(alpha * YYT_factor))

    if Z_tilde is not None:
        # This is a correction term for the log marginal likelihood
        # In EP this is log Z_tilde, which is the difference between the
        # Gaussian marginal and Z_EP
        log_marginal += Z_tilde

    # REVIEW: since log_marginal does not change, the gradient does not need to change as well.
    # FIXME: confirm the gradient update is correct
    # dL_dK = 0.5 * (tdot(alpha) - Y.shape[1] * Wi)
    dL_dK = 0.5 * A.T.dot((tdot(alpha) - Y.shape[1] * Wi)).dot(A)
    # print('dL_dK shape', dL_dK.shape)

    dL_dthetaL = likelihood.exact_inference_gradients(np.diag(dL_dK), Y_metadata)

    return PosteriorExactGroup(woodbury_chol=LW, woodbury_vector=alpha, K=K, A=A), \
        log_marginal, {'dL_dK': dL_dK, 'dL_dthetaL': dL_dthetaL, 'dL_dm': alpha}
def dIntKKNorm_dX(self, x1, x2, mu, sigma):
    r"""Compute d/dx1 \int k(x1,x') k(x',x2) Normal_x'(\mu, \Sigma) dx'.

    Parameters
    ----------
    x1 : array, size (n1, n_dim)
    x2 : array, size (n2, n_dim)
    mu : array, size (n_dim)
        The mean of the Gaussian distribution.
    sigma : array, size (n_dim, n_dim)
        The covariance of the Gaussian distribution.

    Returns
    -------
    jac : array, size (n1, n2, n_dim)
        The gradients of IntKKNorm w.r.t. x1.
    """
    ndim = self.input_dim
    ell2 = np.ones((ndim, )) * self.lengthscale**2
    cov = sigma + 0.5 * np.diag(ell2)
    cov_inv, _, _, ld = pdinv(cov)

    x_avg = 0.5 * (x1[:, None] + x2[None, :])
    aa = np.dot(2 * x_avg / ell2, sigma) + mu
    const = -x1[:, None] / ell2 + 0.5 * np.dot(aa, cov_inv)

    integral = self.IntKKNorm(x1, x2, mu, sigma)
    jacobian = const * integral[:, :, None]
    return jacobian
def vb_grad_natgrad(self):
    """
    Natural Gradients of the bound with respect to phi, the variational
    parameters controlling assignment of the data to GPs
    """
    grad_Lm = np.zeros_like(self.phi)
    for i, kern in enumerate(self.kern):
        K = kern.K(self.X)
        I = np.eye(self.N)

        B_inv = np.diag(1. / ((self.phi[:, i] + 1e-6) / self.variance))
        alpha = np.linalg.solve(K + B_inv, self.Y)
        K_B_inv = pdinv(K + B_inv)[0]
        dL_dB = tdot(alpha) - K_B_inv

        for n in range(self.phi.shape[0]):
            grad_B_inv = np.zeros_like(B_inv)
            grad_B_inv[n, n] = -self.variance / (self.phi[n, i]**2 + 1e-6)
            grad_Lm[n, i] = 0.5 * np.trace(np.dot(dL_dB, grad_B_inv))

    grad_phi = grad_Lm + self.mixing_prop_bound_grad() + self.Hgrad
    natgrad = grad_phi - np.sum(self.phi * grad_phi, 1)[:, None]
    grad = natgrad * self.phi

    return grad.flatten(), natgrad.flatten()
def _log_likelihood(self, log_params):
    # Returns log likelihood, p(D|hyperparams)
    params = np.exp(log_params)
    l_scales = params[0:self.X_dim]
    output_var = params[self.X_dim]  # Vertical length scale
    noise_var = params[self.X_dim + 1]

    # compute eta
    eta = np.min(self.Y) - params[self.X_dim + 2]  # QUESTION: what is this?

    # compute the observed value for g instead of y
    g_ob = np.sqrt(2.0 * (self.Y - eta))

    kernel = GPy.kern.RBF(input_dim=self.X_dim, ARD=True, variance=output_var,
                          lengthscale=l_scales)
    Kng = kernel.K(self.X)  # QUESTION: does not seem to follow conditional variance form in eqn 6

    # compute posterior mean distribution for g TODO update this
    # GPg = GPy.models.GPRegression(self.X, g_ob, kernel, noise_var=1e-8)
    # mg, _ = GPg.predict(self.X)
    mg = g_ob

    # approximate covariance matrix of y using linearisation technique
    Kny = mg * Kng * mg.T + (noise_var + 1e-8) * np.eye(Kng.shape[0])

    # compute likelihood terms
    Wi, LW, LWi, W_logdet = pdinv(Kny)  # from GPy module
    # Wi = inverse of Kny (ndarray)
    # LW = Cholesky decomposition of Kny (ndarray)
    # LWi = Cholesky decomposition of inverse of Kny (ndarray)
    # W_logdet = log determinant of Kny (float)
    alpha, _ = dpotrs(LW, self.Y, lower=1)

    # Log marginal likelihood for GP, based on Rasmussen eqn 2.30
    loglikelihood = 0.5 * (-self.Y.size * np.log(2 * np.pi)
                           - self.Y.shape[1] * W_logdet
                           - np.sum(alpha * self.Y))

    return loglikelihood
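# --- Editor's note (not from the original source): a numpy-only sketch of the Cholesky-based
# log marginal likelihood used above (Rasmussen & Williams eqn 2.30), checked against the
# direct formula -0.5*(y^T K^-1 y + log|K| + n log 2*pi). The toy kernel, data and noise
# level are made up for illustration.
import numpy as np

rng = np.random.default_rng(0)
X_toy = rng.normal(size=(5, 1))
K_toy = np.exp(-0.5 * (X_toy - X_toy.T) ** 2) + 0.1 * np.eye(5)   # RBF gram + noise
y_toy = rng.normal(size=(5, 1))

L_chol = np.linalg.cholesky(K_toy)
alpha = np.linalg.solve(L_chol.T, np.linalg.solve(L_chol, y_toy))  # K^{-1} y via triangular solves
logdet = 2.0 * np.sum(np.log(np.diag(L_chol)))

ll_chol = 0.5 * (-y_toy.size * np.log(2 * np.pi) - logdet - np.sum(alpha * y_toy))
ll_direct = -0.5 * (float(y_toy.T @ np.linalg.inv(K_toy) @ y_toy)
                    + np.linalg.slogdet(K_toy)[1] + y_toy.size * np.log(2 * np.pi))
assert np.isclose(ll_chol, ll_direct)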
def vb_grad_natgrad(self):
    """
    Natural Gradients of the bound with respect to phi, the variational
    parameters controlling assignment of the data to GPs
    """
    grad_Lm = np.zeros_like(self.phi)
    for i, kern in enumerate(self.kern):
        K = kern.K(self.X)
        I = np.eye(self.N)

        B_inv = np.diag(1. / ((self.phi[:, i] + 1e-6) / self.variance))
        alpha = np.linalg.solve(K + B_inv, self.Y)
        K_B_inv = pdinv(K + B_inv)[0]
        dL_dB = np.outer(alpha, alpha) - K_B_inv

        for n in range(self.phi.shape[0]):
            grad_B_inv = np.zeros_like(B_inv)
            grad_B_inv[n, n] = -self.variance / (self.phi[n, i]**2 + 1e-6)
            grad_Lm[n, i] = 0.5 * np.trace(np.dot(dL_dB, grad_B_inv))

    grad_phi = grad_Lm + self.mixing_prop_bound_grad() + self.Hgrad
    natgrad = grad_phi - np.sum(self.phi * grad_phi, 1)[:, None]
    grad = natgrad * self.phi

    return grad.flatten(), natgrad.flatten()
def omgp_model_bound(omgp):
    ''' Calculate the part of the omgp bound which does not depend on the
    response variable.
    '''
    GP_bound = 0.0
    LBs = []

    # Precalculate the bound minus data fit,
    # and LB matrices used for data fit term.
    for i, kern in enumerate(omgp.kern):
        K = kern.K(omgp.X)
        B_inv = np.diag(1. / ((omgp.phi[:, i] + 1e-6) / omgp.variance))
        Bi, LB, LBi, Blogdet = pdinv(K + B_inv)
        LBs.append(LB)

        # Penalty
        GP_bound -= 0.5 * Blogdet

        # Constant
        GP_bound -= 0.5 * omgp.D * np.einsum('j,j->', omgp.phi[:, i],
                                             np.log(2 * np.pi * omgp.variance))

    model_bound = GP_bound + omgp.mixing_prop_bound() + omgp.H

    return model_bound, LBs
def __init__(self, X, kernF, kernY, Y, K=2, alpha=1., prior_Z='symmetric', name='MOHGP'):
    N, self.D = Y.shape
    self.Y = Y
    self.X = X
    assert X.shape[0] == self.D, "input data don't match observations"

    CollapsedMixture.__init__(self, N, K, prior_Z, alpha, name)

    self.kernF = kernF
    self.kernY = kernY
    self.link_parameters(self.kernF, self.kernY)

    # initialize kernels
    self.Sf = self.kernF.K(self.X)
    self.Sy = self.kernY.K(self.X)
    self.Sy_inv, self.Sy_chol, self.Sy_chol_inv, self.Sy_logdet = pdinv(
        self.Sy + np.eye(self.D) * 1e-6)

    # Computations that can be done outside the optimisation loop
    self.YYT = self.Y[:, :, np.newaxis] * self.Y[:, np.newaxis, :]
    self.YTY = np.dot(self.Y.T, self.Y)

    self.do_computations()
def inference(self, kern, X, W, likelihood, Y, mean_function=None, Y_metadata=None,
              K=None, variance=None, Z_tilde=None):
    """
    Returns a Posterior class containing essential quantities of the posterior
    """
    if mean_function is None:
        m = 0
    else:
        m = mean_function.f(X)

    if variance is None:
        variance = likelihood.gaussian_variance(Y_metadata)

    YYT_factor = Y - m

    if K is None:
        K = kern.K(X)

    Ky = K.copy()
    diag.add(Ky, variance + 1e-8)

    Wi, LW, LWi, W_logdet = pdinv(Ky)

    alpha, _ = dpotrs(LW, YYT_factor, lower=1)

    log_marginal = 0.5 * (-Y.size * log_2_pi - Y.shape[1] * W_logdet
                          - np.sum(alpha * YYT_factor))

    if Z_tilde is not None:
        # This is a correction term for the log marginal likelihood
        # In EP this is log Z_tilde, which is the difference between the
        # Gaussian marginal and Z_EP
        log_marginal += Z_tilde

    dL_dK = 0.5 * (tdot(alpha) - Y.shape[1] * Wi)

    dL_dthetaL = likelihood.exact_inference_gradients(np.diag(dL_dK), Y_metadata)

    posterior_ = Posterior(woodbury_chol=LW, woodbury_vector=alpha, K=K)

    return posterior_, log_marginal, {'dL_dK': dL_dK,
                                      'dL_dthetaL': dL_dthetaL,
                                      'dL_dm': alpha}, W_logdet
def __init__(self, mu, var):
    self.mu = np.array(mu).flatten()
    self.var = np.array(var)
    assert len(self.var.shape) == 2
    assert self.var.shape[0] == self.var.shape[1]
    assert self.var.shape[0] == self.mu.size
    self.input_dim = self.mu.size
    self.inv, self.hld = pdinv(self.var)
    self.constant = -0.5 * self.input_dim * np.log(2 * np.pi)
def parameters_changed(self):
    """ Set the kernel parameters. Note that the variational parameters are handled separately."""
    # get the latest kernel matrices, decompose
    self.Sf = self.kernF.K(self.X)
    self.Sy = self.kernY.K(self.X)
    self.Sy_inv, self.Sy_chol, self.Sy_chol_inv, self.Sy_logdet = pdinv(
        self.Sy + np.eye(self.D) * 1e-6)

    # update everything
    self.do_computations()
    self.update_kern_grads()
def _set_params(self, x):
    """ Set the kernel parameters. Note that the variational parameters are handled separately."""
    # set the kernels with their parameters
    self.kernF._set_params_transformed(x[:self.kernF.num_params])
    self.kernY._set_params_transformed(x[self.kernF.num_params:])

    # get the latest kernel matrices, decompose
    self.Sf = self.kernF.K(self.X)
    self.Sy = self.kernY.K(self.X)
    self.Sy_inv, self.Sy_chol, self.Sy_chol_inv, self.Sy_logdet = pdinv(self.Sy)

    # update everything
    self.do_computations()
def comp_Ckk(X, kernel, mean_i, cov_i):
    r"""Compute C_kk = \int k(X,x') k(x',X) Normal_i dx'."""
    ndim = kernel.input_dim
    var = kernel.variance
    Xsh = (X - mean_i) / 2.
    ell2 = np.ones((ndim, )) * kernel.lengthscale**2
    sqrt_det = np.power(np.prod(ell2), 1 / 2.)
    cov = cov_i + 0.5 * np.diag(ell2)
    cov_inv, _, _, ld = pdinv(cov)

    X1s = np.sum(Xsh * np.dot(Xsh, cov_inv), 1)
    arg = 2. * np.dot(Xsh, np.dot(Xsh, cov_inv).T) \
        + X1s[:, None] + X1s[None, :]

    con = -0.5 * (ndim * np.log(2 * np.pi) + ld)
    zc = np.exp(con - 0.5 * arg)

    norm_const = var * np.power(np.pi, ndim / 2.0) * sqrt_det \
        * kernel.K(X / np.sqrt(2))
    return norm_const * zc, cov_inv
def recompute_posterior_mf(
    alpha: np.ndarray, beta: np.ndarray, K: np.ndarray
) -> Tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray, np.ndarray, np.ndarray]:
    """
    Recompute the posterior approximation (for the mean field approximation)
    mean: K alpha, covariance inv(K + beta)

    :param alpha: Alpha vector used to parametrize the posterior approximation
    :param beta: Beta vector/matrix used to parametrize the posterior approximation
    :param K: prior covariance
    :return: Tuple containing the mean and cholesky of the covariance, its inverse and
        derivatives of the KL divergence with respect to beta and alpha
    """
    N = alpha.shape[0]

    # Lambda = diag(lam) = diag(beta.^2)
    lam_sqrt = beta.ravel()
    lam = beta.ravel()**2

    # Handle A = I + Lambda*K*Lambda
    KB = K @ np.diag(lam_sqrt)
    BKB = np.diag(lam_sqrt) @ KB
    A = np.eye(N) + BKB
    Ai, LA, Li, Alogdet = pdinv(A)

    # Compute mean
    m = K @ alpha

    # Compute covariance matrix
    # (can be accelerated using broadcasting instead of matrix multiplication)
    W = Li @ np.diag(1.0 / lam_sqrt)
    # computes np.diag(1./lam) - np.diag(1./lam_sqrt) @ Ai @ np.diag(1./lam_sqrt)
    Sigma = np.diag(1.0 / lam) - W.T @ W

    # Compute KL
    KL = 0.5 * (Alogdet + np.trace(Ai) - N + np.sum(m * alpha))

    # Compute gradients
    A_A2 = Ai - Ai.dot(Ai)
    dKL_db = np.diag(np.dot(KB.T, A_A2)).reshape(-1, 1)
    # dKL_da = K @ alpha
    dKL_da = m.copy()

    L = GPy.util.linalg.jitchol(Sigma)
    L_inv = np.linalg.inv(L)
    return m, L, L_inv, KL, dKL_db, dKL_da
def predict(self, Xnew, i):
    """ Predictive mean for a given component """
    kern = self.kern[i]
    K = kern.K(self.X)
    kx = kern.K(self.X, Xnew)

    # Predict mean
    # This works but should use Cholesky for stability
    B_inv = np.diag(1. / (self.phi[:, i] / self.variance))
    K_B_inv = pdinv(K + B_inv)[0]
    mu = kx.T.dot(np.dot(K_B_inv, self.Y))

    # Predict variance
    kxx = kern.K(Xnew, Xnew)
    va = self.variance + kxx - kx.T.dot(np.dot(K_B_inv, kx))

    return mu, va
def __init__(self, X, kernF, kernY, Y, K=2, alpha=1., prior_Z='symmetric'):
    N, self.D = Y.shape
    self.Y = Y
    self.X = X
    assert X.shape[0] == self.D, "input data don't match observations"

    # initialize kernels
    self.kernF = kernF
    self.kernY = kernY
    self.Sf = self.kernF.K(self.X)
    self.Sy = self.kernY.K(self.X)
    self.Sy_inv, self.Sy_chol, self.Sy_chol_inv, self.Sy_logdet = pdinv(self.Sy)

    # Computations that can be done outside the optimisation loop
    self.YYT = self.Y[:, :, np.newaxis] * self.Y[:, np.newaxis, :]
    self.YTY = np.dot(self.Y.T, self.Y)

    collapsed_mixture.__init__(self, N, K, prior_Z, alpha)
def update_kern_grads(self):
    """
    Set the derivative of the lower bound wrt the (kernel) parameters
    """
    grad_Lm_variance = 0.0

    for i, kern in enumerate(self.kern):
        K = kern.K(self.X)
        B_inv = np.diag(1. / (self.phi[:, i] / self.variance))

        # Numerically more stable version using cholesky decomposition
        # alpha = linalg.cho_solve(linalg.cho_factor(K + B_inv), self.Y)
        # K_B_inv = pdinv(K + B_inv)[0]
        # dL_dK = .5*(tdot(alpha) - K_B_inv)

        # Make more stable using cholesky factorization:
        Bi, LB, LBi, Blogdet = pdinv(K + B_inv)

        tmp = dpotrs(LB, self.YYT)[0]
        GPy.util.diag.subtract(tmp, 1)
        dL_dB = dpotrs(LB, tmp.T)[0]

        kern.update_gradients_full(dL_dK=.5 * dL_dB, X=self.X)

        # variance gradient
        # for i, kern in enumerate(self.kern):
        K = kern.K(self.X)
        # I = np.eye(self.N)

        B_inv = np.diag(1. / ((self.phi[:, i] + 1e-6) / self.variance))
        # alpha = np.linalg.solve(K + B_inv, self.Y)
        # K_B_inv = pdinv(K + B_inv)[0]
        # dL_dB = tdot(alpha) - K_B_inv
        grad_B_inv = np.diag(1. / (self.phi[:, i] + 1e-6))

        grad_Lm_variance += 0.5 * np.trace(np.dot(dL_dB, grad_B_inv))
        grad_Lm_variance -= .5 * self.D * np.einsum('j,j->', self.phi[:, i],
                                                    1. / self.variance)

    self.variance.gradient = grad_Lm_variance
def init_model(self, xvals, zvals):
    # Update internal data
    self.xvals = xvals
    self.zvals = zvals

    self._K = self.kern.K(self.xvals)
    Ky = self._K.copy()
    # Adds some additional noise to ensure well-conditioned
    diag.add(Ky, self.noise + 1e-8)
    Wi, LW, LWi, W_logdet = pdinv(Ky)

    self._woodbury_inv = Wi
    self._woodbury_vector = np.dot(self._woodbury_inv, self.zvals)
    self._woodbury_chol = None
    self._mean = None
    self._covariance = None
    self._prior_mean = 0.
    self._K_chol = None
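# --- Editor's note (not from the original source): a minimal sketch of how the cached
# Woodbury quantities built in init_model above are typically reused for GP prediction
# (the same pattern appears in predict_value further down). The stand-in kernel, data and
# noise level are made up; only the pdinv call mirrors the snippet above.
import numpy as np
from GPy.util.linalg import pdinv

rng = np.random.default_rng(1)
xvals = rng.uniform(size=(6, 1))
zvals = np.sin(4.0 * xvals)
noise = 1e-2
rbf = lambda a, b: np.exp(-0.5 * (a - b.T) ** 2 / 0.3 ** 2)   # stand-in 1-D RBF kernel

Ky = rbf(xvals, xvals) + (noise + 1e-8) * np.eye(6)
Wi, LW, LWi, W_logdet = pdinv(Ky)                  # same call as in init_model
woodbury_vector = Wi @ zvals

xtest = np.linspace(0.0, 1.0, 5).reshape(-1, 1)
Kx = rbf(xvals, xtest)
mu = Kx.T @ woodbury_vector                                  # posterior mean
var = np.diag(rbf(xtest, xtest) - Kx.T @ Wi @ Kx)[:, None]   # posterior variance (diagonal)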
def vb_grad_natgrad(self):
    """
    Natural Gradients of the bound with respect to phi, the variational
    parameters controlling assignment of the data to GPs
    """
    grad_Lm = np.zeros_like(self.phi)
    for i, kern in enumerate(self.kern):
        K = kern.K(self.X)
        I = np.eye(self.N)

        B_inv = np.diag(1. / ((self.phi[:, i] + 1e-6) / self.variance))
        K_B_inv, L_B, _, _ = pdinv(K + B_inv)
        alpha, _ = dpotrs(L_B, self.Y)
        dL_dB_diag = np.sum(np.square(alpha), 1) - np.diag(K_B_inv)

        grad_Lm[:, i] = -0.5 * self.variance * dL_dB_diag / (self.phi[:, i]**2 + 1e-6)

    grad_phi = grad_Lm + self.mixing_prop_bound_grad() + self.Hgrad
    natgrad = grad_phi - np.sum(self.phi * grad_phi, 1)[:, None]
    grad = natgrad * self.phi

    return grad.flatten(), natgrad.flatten()
def inference(self, kern, X, Z, likelihood, Y, mean_function=None, Y_metadata=None):
    assert mean_function is None, "inference with a mean function not implemented"

    num_inducing, _ = Z.shape
    num_data, output_dim = Y.shape

    # make sure the noise is not hetero
    sigma_n = likelihood.gaussian_variance(Y_metadata)
    if sigma_n.size > 1:
        raise NotImplementedError("no hetero noise with this implementation of PEP")

    Kmm = kern.K(Z)
    Knn = kern.Kdiag(X)
    Knm = kern.K(X, Z)
    U = Knm

    # factor Kmm
    diag.add(Kmm, self.const_jitter)
    Kmmi, L, Li, _ = pdinv(Kmm)

    # compute beta_star, the effective noise precision
    LiUT = np.dot(Li, U.T)
    sigma_star = sigma_n + self.alpha * (Knn - np.sum(np.square(LiUT), 0))
    beta_star = 1. / sigma_star

    # Compute and factor A
    A = tdot(LiUT * np.sqrt(beta_star)) + np.eye(num_inducing)
    LA = jitchol(A)

    # back substitute to get b, P, v
    URiy = np.dot(U.T * beta_star, Y)
    tmp, _ = dtrtrs(L, URiy, lower=1)
    b, _ = dtrtrs(LA, tmp, lower=1)
    tmp, _ = dtrtrs(LA, b, lower=1, trans=1)
    v, _ = dtrtrs(L, tmp, lower=1, trans=1)
    tmp, _ = dtrtrs(LA, Li, lower=1, trans=0)
    P = tdot(tmp.T)

    alpha_const_term = (1.0 - self.alpha) / self.alpha

    # compute log marginal
    log_marginal = -0.5*num_data*output_dim*np.log(2*np.pi) + \
                   -np.sum(np.log(np.diag(LA)))*output_dim + \
                   0.5*output_dim*(1 + alpha_const_term)*np.sum(np.log(beta_star)) + \
                   -0.5*np.sum(np.square(Y.T*np.sqrt(beta_star))) + \
                   0.5*np.sum(np.square(b)) + 0.5*alpha_const_term*num_data*np.log(sigma_n)

    # compute dL_dR
    Uv = np.dot(U, v)
    dL_dR = 0.5*(np.sum(U*np.dot(U, P), 1) - (1.0 + alpha_const_term)/beta_star
                 + np.sum(np.square(Y), 1) - 2.*np.sum(Uv*Y, 1)
                 + np.sum(np.square(Uv), 1))*beta_star**2

    # Compute dL_dKmm
    vvT_P = tdot(v.reshape(-1, 1)) + P
    dL_dK = 0.5*(Kmmi - vvT_P)
    KiU = np.dot(Kmmi, U.T)
    dL_dK += self.alpha * np.dot(KiU*dL_dR, KiU.T)

    # Compute dL_dU
    vY = np.dot(v.reshape(-1, 1), Y.T)
    dL_dU = vY - np.dot(vvT_P, U.T)
    dL_dU *= beta_star
    dL_dU -= self.alpha * 2.*KiU*dL_dR

    dL_dthetaL = likelihood.exact_inference_gradients(dL_dR)
    dL_dthetaL += 0.5*alpha_const_term*num_data / sigma_n

    grad_dict = {'dL_dKmm': dL_dK,
                 'dL_dKdiag': dL_dR * self.alpha,
                 'dL_dKnm': dL_dU.T,
                 'dL_dthetaL': dL_dthetaL}

    # construct a posterior object
    post = Posterior(woodbury_inv=Kmmi - P, woodbury_vector=v, K=Kmm,
                     mean=None, cov=None, K_chol=L)

    return post, log_marginal, grad_dict
def update_model(self, x, y, opt_hyp=False, replace_old=True, noise_diag=1e-5, choose_data=True):
    """ Update the model based on the current settings and new data

    Parameters
    ----------
    x: n x (n_s + n_u) array[float]
        The training set
    y: n x n_s
        The training targets
    opt_hyp: bool, optional
        If this is set to TRUE the hyperparameters are re-optimized
    """
    if replace_old:
        x_new = x
        y_new = y
    else:
        x_new = np.vstack((self.x_train, x))
        y_new = np.vstack((self.y_train, y))

    if opt_hyp or not self.gp_trained:
        self.train(x_new, y_new, self.m, opt_hyp=opt_hyp, Z=self.Z)
    else:
        n_data = np.shape(x_new)[0]
        inv_K = [None] * self.n_s_out

        if self.m is None:
            n_beta = n_data
            Z = x_new
            y_z = y_new
        else:
            if n_data < self.m:
                warnings.warn("The desired number of datapoints is not available. "
                              "Dataset consists of {} datapoints!".format(n_data))
                Z = x_new
                y_z = y_new
                n_beta = n_data
            else:
                if choose_data:
                    Z, y_z = self.choose_datapoints_maxvar(x_new, y_new, self.m)
                else:
                    idx = np.random.choice(n_data, size=self.m, replace=False)
                    Z = x_new[idx, :]
                    y_z = y_new[idx, :]
                n_beta = self.m

        beta = np.empty((n_beta, self.n_s_out))

        for i in range(self.n_s_out):
            if self.do_sparse_gp:
                self.gps[i].set_XY(x_new, y_new[:, i].reshape(-1, 1))
                if not self.z_fixed:
                    self.gps[i].set_Z(Z)
            else:
                self.gps[i].set_XY(Z, y_z[:, i].reshape(-1, 1))

            post = self.gps[i].posterior

            if noise_diag > 0.0:
                inv_K[i] = pdinv(post._K
                                 + float(self.gps[i].Gaussian_noise.variance + noise_diag)
                                 * np.eye(n_beta))[0]
            else:
                inv_K[i] = post.woodbury_inv

            beta[:, i] = post.woodbury_vector.reshape(-1,)

        self.x_train = x_new
        self.y_train = y_new
        self.z = Z
        self.inv_K = inv_K
        self.beta = beta
def predict_value(self, xvals, include_noise=True, full_cov=False):
    # Calculate for the test points
    assert xvals.shape[0] >= 1
    assert xvals.shape[1] == self.dimension

    n_points, input_dim = xvals.shape

    # With no observations, predict 0 mean everywhere and prior variance
    if self.xvals is None:
        return np.zeros((n_points, 1)), np.ones((n_points, 1)) * self.variance

    # Find neighbors within radius
    point_group = self.spatial_tree.query_ball_point(xvals, self.neighbor_radius)
    point_list = []
    for points in point_group:
        for index in points:
            point_list.append(index)
    point_set = set(point_list)

    xpoints = [self.xvals[index] for index in point_set]
    zpoints = [self.zvals[index] for index in point_set]
    # print("Size before:", len(xpoints))

    # Brute force check the points in the waiting queue
    if self.xwait is not None and self.xwait.shape[0] > 0:
        wait_list = []
        for i, u in enumerate(self.xwait):
            for j, v in enumerate(xvals):
                dist = sp.spatial.distance.minkowski(u, v, p=2.0)
                if dist <= self.neighbor_radius:
                    wait_list.append(i)
        wait_set = set(wait_list)
        xpoints = [self.xwait[index] for index in wait_set] + xpoints
        zpoints = [self.zwait[index] for index in wait_set] + zpoints
        # print("Size after:", len(xpoints))

    xpoints = np.array(xpoints).reshape(-1, 2)
    zpoints = np.array(zpoints).reshape(-1, 1)

    if xpoints.shape[0] == 0:
        # No nearby points!
        return np.zeros((n_points, 1)), np.ones((n_points, 1)) * self.variance

    Kx = self.kern.K(xpoints, xvals)
    K = self.kern.K(xpoints, xpoints)

    # Adds some additional noise to ensure well-conditioned
    Ky = K.copy()
    diag.add(Ky, self.noise + 1e-8)
    Wi, LW, LWi, W_logdet = pdinv(Ky)
    woodbury_inv = Wi
    woodbury_vector = np.dot(woodbury_inv, zpoints)

    mu = np.dot(Kx.T, woodbury_vector)
    if len(mu.shape) == 1:
        mu = mu.reshape(-1, 1)

    if full_cov:
        Kxx = self.kern.K(xvals)
        if self.woodbury_inv.ndim == 2:
            var = Kxx - np.dot(Kx.T, np.dot(woodbury_inv, Kx))
    else:
        Kxx = self.kern.Kdiag(xvals)
        var = (Kxx - np.sum(np.dot(woodbury_inv.T, Kx) * Kx, 0))[:, None]

    # If model noise should be included in the prediction
    if include_noise:
        var += self.noise

    update_legacy = False
    if update_legacy:
        # With no observations, predict 0 mean everywhere and prior variance
        if self.model is None:
            mean, variance = np.zeros((n_points, 1)), np.ones((n_points, 1)) * self.variance
        # Else, return the predicted values
        mean, variance = self.model.predict(xvals, full_cov=False,
                                            include_likelihood=include_noise)
        if xvals.shape[0] < 10:
            print(np.sum(mu - mean))
            print(np.sum(var - variance))

    return mu, var
def __init__(self, domain, mu, cov):
    super().__init__(domain)
    self.mu, self.cov = process_parameters(self.input_dim, mu, cov)
    self.inv, _, _, ld = pdinv(self.cov)
    self.constant = -0.5 * (self.input_dim * np.log(2 * np.pi) + ld)
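# --- Editor's note (not from the original source): a short sketch of the log-density
# evaluation the constant above is set up for, i.e.
# log N(x; mu, cov) = -0.5*(d*log(2*pi) + log|cov|) - 0.5*(x - mu)^T cov^{-1} (x - mu),
# cross-checked against scipy. mu, cov and x here are made up for illustration.
import numpy as np
from scipy.stats import multivariate_normal
from GPy.util.linalg import pdinv

mu = np.array([0.5, -1.0])
cov = np.array([[1.0, 0.2],
                [0.2, 0.5]])
inv, _, _, ld = pdinv(cov)
constant = -0.5 * (mu.size * np.log(2 * np.pi) + ld)

x = np.zeros(2)
logpdf = constant - 0.5 * (x - mu) @ inv @ (x - mu)
assert np.isclose(logpdf, multivariate_normal(mu, cov).logpdf(x))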