def predict(self): """ GP predictions Returns: predictions """ Ks = [] for d in range(self.X.shape[1]): K = self.kernels[d].eval(self.kernels[d].params, self.X_dims[d]) Ks.append(K) f_pred = kron_mvp(Ks, kron_mvp(self.K_invs, self.q_mu)) return f_pred
def predict(self): """ GP predictions Returns: predictions """ Ks = [] for i in range(self.X.shape[1]): K = self.kernels[i].eval( self.kernels[i].params, np.expand_dims(np.unique(self.X[:, i]), 1)) Ks.append(K) f_pred = kron_mvp(Ks, kron_mvp(self.K_invs, self.q_mu)) return f_pred
def grad_like(self, r, eps):
    """
    Gradient of likelihood w.r.t. variational parameters
    Args:
        r (): Transformed random sample
        eps (): Random sample
    Returns: gradient w.r.t. covariance, gradient w.r.t. mean
    """
    if self.obs_idx is not None:
        r_obs = r[self.obs_idx]
    else:
        r_obs = r
    dr = self.likelihood_grad(r_obs, self.y)
    dr[np.isnan(dr)] = 0.
    self.dr = dr

    grads_R = []
    for d in range(len(self.Rs)):
        Rs_copy = deepcopy(self.Rs)
        n = Rs_copy[d].shape[0]
        grad_R = np.zeros((n, n))
        # Perturb one upper-triangular entry of R_d at a time and measure
        # its effect on the transformed sample.
        for i, j in zip(*np.triu_indices(n)):
            R_d = np.zeros((n, n))
            R_d[i, j] = 1.
            Rs_copy[d] = R_d
            dR_eps = kron_mvp(Rs_copy, eps)
            if self.obs_idx is not None:
                dR_eps = dR_eps[self.obs_idx]
            grad_R[i, j] = np.sum(np.multiply(dr, dR_eps))
        grads_R.append(grad_R)

    grad_mu = np.zeros(self.n)
    if self.obs_idx is not None:
        grad_mu[self.obs_idx] = dr
    else:
        grad_mu = dr
    return grads_R, grad_mu
def grad_KL_mu(self):
    """
    Gradient of KL divergence w.r.t. variational mean
    Returns: gradient
    """
    return kron_mvp(self.K_invs, self.q_mu - self.mu)
def line_search(self, Rs_grads, mu_grads, obj_init, r, eps):
    """
    Performs a backtracking line search to find a step size
    Args:
        Rs_grads (): Gradients of R (variational covariances)
        mu_grads (): Gradients of mu (variational mean)
        obj_init (): Initial objective value
        r (): Transformed random Gaussian sample
        eps (): Random Gaussian sample
    Returns: Updated R, mu, objective, and step size if a step is accepted,
        None otherwise
    """
    step = 1.
    while step > 1e-15:
        R_search = [np.clip(R + step * R_grad, 0., np.max(R))
                    for (R_grad, R) in Rs_grads]
        mu_search = mu_grads[1] + step * mu_grads[0]
        r_search = mu_search + kron_mvp(R_search, eps)
        obj_search, kl_search, like_search = self.eval_obj(
            R_search, mu_search, r_search)
        # Accept the step only if the objective decreases sufficiently and
        # every candidate covariance factor stays positive definite.
        if obj_init - obj_search > step:
            pos_def = all(np.all(np.linalg.eigvals(R) > 0)
                          for R in R_search)
            if pos_def:
                return R_search, mu_search, obj_search, step
        step = step * 0.5
    return None
def grad_KL_mu(self):
    """
    Natural gradient of KL divergence w.r.t. variational mean
    Returns: gradient
    """
    return np.multiply(np.exp(self.q_S),
                       -kron_mvp(self.K_invs, self.mu - self.q_mu))
def variance(self, n_s):
    """
    Stochastic approximator of predictive variance.
    Follows "Massively Scalable GPs"
    Args:
        n_s (int): Number of iterations to run stochastic approximation
    Returns: Approximate predictive variance at grid points,
        and the raw sample variance
    """
    if self.root_eigdecomp is None:
        self.sqrt_eig()
    if self.obs_idx is not None:
        root_K = self.root_eigdecomp[self.obs_idx, :]
    else:
        root_K = self.root_eigdecomp

    diag = kron_list_diag(self.Ks)
    samples = []
    for i in range(n_s):
        g_m = np.random.normal(size=self.m)
        g_n = np.random.normal(size=self.n)
        # W is stored as a vector (diagonal), so scale elementwise.
        right_side = np.multiply(np.sqrt(self.W), root_K.dot(g_m)) +\
            np.sqrt(self.noise) * g_n
        r = self.opt.cg(self.Ks, right_side)
        if self.obs_idx is not None:
            Wr = np.zeros(self.m)
            Wr[self.obs_idx] = np.multiply(np.sqrt(self.W), r)
        else:
            Wr = np.multiply(np.sqrt(self.W), r)
        samples.append(kron_mvp(self.Ks, Wr))
    var = np.var(samples, axis=0)
    return np.clip(diag - var, 0, 1e12).flatten(), var
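# variance() also uses kron_list_diag, which is assumed to return the diagonal
# of a Kronecker product of matrices. Since diag(A kron B) = diag(A) kron
# diag(B), a minimal sketch (illustrative, not necessarily the repo's
# implementation) is:
def kron_list_diag_sketch(Ks):
    """Diagonal of K_1 kron K_2 kron ... kron K_D, as a flat vector."""
    d = np.diag(Ks[0])
    for K in Ks[1:]:
        d = np.outer(d, np.diag(K)).reshape(-1)
    return d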
def search_step(self, obj_prev, min_obj, delta_alpha,
                step_size, max_it, t, opt_step):
    """
    Executes one step of a backtracking line search
    Args:
        obj_prev (np.array): previous objective
        min_obj (np.array): current minimum objective
        delta_alpha (np.array): search direction for alpha
        step_size (np.array): current step size
        max_it (int): maximum number of line search iterations
        t (np.array): current line search iteration
        opt_step (np.array): best step size found so far
    Returns: updated line search state
    """
    alpha_search = np.squeeze(self.alpha + step_size * delta_alpha)
    f_search = np.squeeze(kron_mvp(self.Ks, alpha_search)) + self.mu
    if self.k_diag is not None:
        f_search += np.multiply(self.k_diag, alpha_search)
    obj_search = self.log_joint(f_search, alpha_search)
    if min_obj > obj_search:
        opt_step = step_size
        min_obj = obj_search
    step_size = self.tau * step_size
    t = t + 1
    return obj_prev, min_obj, delta_alpha,\
        step_size, max_it, t, opt_step
def sample_post(self):
    """
    Draws a sample from the GP posterior
    Returns: sample
    """
    eps = np.random.normal(size=self.n)
    return self.q_mu + kron_mvp(self.Rs, eps)
def cg_prod(self, Ks, p):
    """
    Matrix-vector product used by the conjugate gradient solver
    Args:
        Ks (): Kronecker factors of the kernel matrix
        p (): potential solution to linear system
    Returns: product Ap (left side of linear system)
    """
    if self.precondition is None:
        return p + np.multiply(np.sqrt(self.W),
                               kron_mvp(Ks, np.multiply(np.sqrt(self.W), p)))
    Cp = np.multiply(self.precondition, p)
    noise = np.multiply(np.multiply(self.precondition,
                                    np.multiply(self.W, self.k_diag)), Cp)
    wkw = np.multiply(np.multiply(self.precondition, np.sqrt(self.W)),
                      kron_mvp(Ks, np.multiply(np.sqrt(self.W), Cp)))
    return noise + wkw + np.multiply(self.precondition, Cp)
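# The solver referenced as self.opt.cg is not shown here. For reference, a
# plain conjugate gradient loop that would consume a matrix-vector product
# like cg_prod looks roughly like the sketch below. This is a generic
# textbook CG under that assumption, not the repo's actual solver.
def cg_sketch(mvp, b, tol=1e-6, max_it=1000):
    """Solve A x = b where A is given implicitly by mvp(p) = A @ p."""
    x = np.zeros_like(b)
    r = b - mvp(x)
    p = r.copy()
    rs_old = r.dot(r)
    for _ in range(max_it):
        Ap = mvp(p)
        alpha = rs_old / p.dot(Ap)
        x = x + alpha * p
        r = r - alpha * Ap
        rs_new = r.dot(r)
        if np.sqrt(rs_new) < tol:
            break
        p = r + (rs_new / rs_old) * p
        rs_old = rs_new
    return x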
def run(self, its):
    """
    Runs stochastic variational inference
    Args:
        its (): Number of iterations
    Returns: Nothing, but updates instance variables
    """
    t = trange(its, leave=True)
    for i in t:
        self.calc_trace_term()
        KL_grad_R = self.grad_KL_R()
        KL_grad_mu = self.grad_KL_mu()

        eps = np.random.normal(size=self.n)
        r = self.q_mu + kron_mvp(self.Rs, eps)
        like_grad_R, like_grad_mu = self.grad_like(r, eps)

        grad_R = [-KL_grad_R[i] + like_grad_R[i]
                  for i in range(len(KL_grad_R))]
        grad_mu = -KL_grad_mu + like_grad_mu
        R_and_grads = list(zip(grad_R, self.Rs))
        mu_and_grad = (grad_mu, self.q_mu)

        obj, kl, like = self.eval_obj(self.Rs, self.q_mu, r)
        self.elbos.append(-obj)

        if self.linesearch:
            ls_res = self.line_search(R_and_grads, mu_and_grad,
                                      obj, r, eps)
            step = 0.
            if ls_res is not None:
                step = ls_res[-1]
            t.set_description("ELBO: " + '{0:.2f}'.format(-obj) +
                              " | KL: " + '{0:.2f}'.format(kl) +
                              " | logL: " + '{0:.2f}'.format(like) +
                              " | step: " + str(step))
            if ls_res is not None:
                self.Rs = ls_res[0]
                self.q_mu = ls_res[1]
        else:
            t.set_description("ELBO: " + '{0:.2f}'.format(-obj) +
                              " | KL: " + '{0:.2f}'.format(kl) +
                              " | logL: " + '{0:.2f}'.format(like))
            self.q_mu, self.mu_params = \
                self.optimizer.step(mu_and_grad, self.mu_params)
            for d in range(self.d):
                self.Rs[d], self.R_params[d] = \
                    self.optimizer.step(R_and_grads[d], self.R_params[d])
    self.f_pred = self.predict()
    return
def variance_pmap(self, n_s=30):
    """
    Stochastic approximator of predictive variance.
    Follows "Massively Scalable GPs"
    Args:
        n_s (int): Number of iterations to run stochastic approximation
    Returns: Approximate predictive variance at grid points
    """
    if self.eigvals is None or self.eigvecs is None:
        self.eig_decomp()
    Q = self.eigvecs
    Q_t = [v.T for v in self.eigvecs]
    Vr = [np.nan_to_num(np.sqrt(e)) for e in self.eigvals]
    diag = kron_list_diag(self.Ks) + self.noise

    samples = []
    for i in range(n_s):
        g_m = np.random.normal(size=self.m)
        g_n = np.random.normal(size=self.n)
        Kroot_g = kron_mvp(Q, kron_mvp(Vr, kron_mvp(Q_t, g_m)))
        if self.obs_idx is not None:
            Kroot_g = Kroot_g[self.obs_idx]
        right_side = Kroot_g + np.sqrt(self.noise) * g_n
        r = self.cg_opt.cg(self.Ks, right_side)
        if self.obs_idx is not None:
            Wr = np.zeros(self.m)
            Wr[self.obs_idx] = r
        else:
            Wr = r
        samples.append(kron_mvp(self.Ks, Wr))
    est = np.var(samples, axis=0)
    return np.clip(diag - est, 0, a_max=None).flatten()
def KLqp(self, S, q_mu):
    """
    Calculates KL divergence between q and p
    Args:
        S (): Variational log-variances
        q_mu (): Variational mean
    Returns: KL divergence between q and p
    """
    k_inv_mu = kron_mvp(self.K_invs, self.mu - q_mu)
    mu_penalty = np.sum(np.multiply(self.mu - q_mu, k_inv_mu))
    det_S = np.sum(S)
    trace_term = np.sum(np.multiply(self.k_inv_diag, np.exp(S)))
    kl = 0.5 * (self.det_K - self.m - det_S + trace_term + mu_penalty)
    return kl
def KL_calc(self, Rs, q_mu):
    """
    Calculates KL divergence between q and p
    Args:
        Rs (): Variational covariance
        q_mu (): Variational mean
    Returns: KL divergence between q and p
    """
    k_inv_mu = kron_mvp(self.K_invs, self.mu - q_mu)
    mu_penalty = np.sum(np.multiply(self.mu - q_mu, k_inv_mu))
    det_S = self.log_det_S(Rs)
    trace_term = self.calc_trace_term(Rs)[0]
    kl = 0.5 * (self.det_K - self.n - det_S + trace_term + mu_penalty)
    return max(0, kl)
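# The KL terms above follow the closed form for two multivariate Gaussians,
# KL(q || p) = 0.5 * (log|K| - log|S| - n + tr(K^{-1} S)
#              + (mu_p - mu_q)^T K^{-1} (mu_p - mu_q)).
# A dense reference implementation, useful for checking KLqp/KL_calc on small
# problems (illustrative only; this helper is not part of the repo):
def kl_gaussians_dense(mu_q, S, mu_p, K):
    """KL(N(mu_q, S) || N(mu_p, K)) with explicit (small) dense matrices."""
    n = mu_q.shape[0]
    K_inv = np.linalg.inv(K)
    _, logdet_K = np.linalg.slogdet(K)
    _, logdet_S = np.linalg.slogdet(S)
    diff = mu_p - mu_q
    return 0.5 * (logdet_K - logdet_S - n
                  + np.trace(K_inv @ S)
                  + diff @ K_inv @ diff)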
def step(self, max_it, it, delta):
    """
    Runs one step of Kronecker inference
    Args:
        max_it (int): maximum number of Kronecker iterations
        it (int): current iteration
        delta (np.array): change in objective
    Returns: max iterations, current iteration, change in objective,
        step size, objective value
    """
    self.f = kron_mvp(self.Ks, self.alpha) + self.mu
    if self.k_diag is not None:
        self.f += np.multiply(self.alpha, self.k_diag)
    psi = self.log_joint(self.f, self.alpha)
    self.update_derivs()

    b = np.multiply(self.W, self.f - self.mu) + self.grads
    if self.precondition is not None:
        z = self.opt.cg(self.Ks,
                        np.multiply(self.precondition,
                                    np.multiply(1.0 / np.sqrt(self.W), b)))
    else:
        z = self.opt.cg(self.Ks, np.multiply(1.0 / np.sqrt(self.W), b))
    delta_alpha = np.multiply(np.sqrt(self.W), z) - self.alpha
    step_size = self.line_search(delta_alpha, psi, 20)
    delta = step_size
    if delta > 1e-9:
        self.alpha = self.alpha + delta_alpha * step_size
        self.alpha = np.where(np.isnan(self.alpha),
                              np.ones_like(self.alpha) * 1e-9,
                              self.alpha)
    it = it + 1
    return max_it, it, delta, step_size, psi
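# Note on step(): assuming self.W stores the negative Hessian of the
# log-likelihood as a vector and self.grads its gradient (as their usage
# suggests), then without preconditioning b = W (f - mu) + grad log p(y|f),
# z solves (I + W^{1/2} K W^{1/2}) z = W^{-1/2} b, and the candidate update
#     alpha_new = W^{1/2} z = (K + W^{-1})^{-1} (f - mu + W^{-1} grad log p(y|f))
# is the standard Newton step for the Laplace approximation
# (cf. Rasmussen & Williams, Algorithm 3.1), computed here with Kronecker
# matrix-vector products and CG instead of a Cholesky factorization.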
def run(self, max_it):
    """
    Runs Kronecker inference. Updates instance variables.
    Args:
        max_it (int): maximum number of iterations
    Returns: Nothing, but updates instance variables
    """
    if self.obs_idx is not None:
        k_diag = np.ones(self.X.shape[0]) * 1e12
        k_diag[self.obs_idx] = self.noise
        self.k_diag = k_diag
        self.precondition = np.clip(1.0 / np.sqrt(self.k_diag), 0, 1)
    else:
        self.k_diag = None
        self.precondition = None

    delta = sys.float_info.max
    it = 0
    t = trange(max_it)
    for i in t:
        max_it, it, delta, step, psi = self.step(max_it, it, delta)
        t.set_description("Objective: " + '{0:.2f}'.format(psi) +
                          " | Step Size: " + '{0:.2f}'.format(step))
        if delta < 1e-9:
            break
    self.f_pred = kron_mvp(self.Ks, self.alpha) + self.mu
    self.update_derivs()
    return