def recompute_posterior_fr(alpha: np.ndarray, beta: np.ndarray, K: np.ndarray) -> Tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray, np.ndarray, np.ndarray]:
    """
    Recompute the posterior approximation (for the full rank approximation).
    mean: K alpha, covariance: L L^T, where L is the Cholesky factor encoded by beta

    :param alpha: Alpha vector used to parametrize the posterior approximation
    :param beta: Beta vector/matrix used to parametrize the posterior approximation
    :param K: prior covariance
    :return: Tuple containing the mean, the Cholesky factor of the covariance, its inverse,
        the KL divergence and the derivatives of the KL divergence with respect to beta and alpha
    """
    N = K.shape[0]

    # Unflatten beta into a single lower-triangular Cholesky factor
    L = choleskies._flat_to_triang_pure(beta)
    assert L.shape[0] == 1
    L = L[0, :, :]

    # Posterior mean m = K alpha
    m = K @ alpha

    jitter = 1e-5
    dKL_da = m.copy()
    Kinv = np.linalg.inv(K + np.eye(N) * jitter)
    L_inv = np.linalg.inv(L)

    # Posterior covariance Sigma = L L^T and the gradient of the KL term w.r.t. L
    Lamda_full_rank = np.dot(L, L.T)
    dKL_db_triang = -dL_fr(L, 0.5 * (np.linalg.inv(Lamda_full_rank) - Kinv), None, None, None)
    Sigma = Lamda_full_rank

    # KL(q || p) between q = N(m, Sigma) and the prior p = N(0, K)
    KL = 0.5 * (-N + (m.T @ Kinv @ m) + np.trace(Kinv @ Sigma) - np.log(np.linalg.det(Sigma @ Kinv)))

    dKL_db = choleskies._triang_to_flat_pure(dKL_db_triang)
    return m, L, L_inv, KL, dKL_db, dKL_da
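# A minimal usage sketch for recompute_posterior_fr (not part of the original module;
# it assumes numpy, GPy and GPy.util.choleskies are importable and that dL_fr above is
# in scope). It builds a small prior covariance, encodes an identity Cholesky factor in
# the flat form expected for beta, and checks that the returned KL term is finite.
def _example_recompute_posterior_fr():
    import numpy as np
    import GPy
    from GPy.util import choleskies

    N = 3
    X = np.linspace(0.0, 1.0, N)[:, None]
    K = GPy.kern.RBF(input_dim=1).K(X)                              # prior covariance
    alpha = np.zeros((N, 1))                                        # mean parameters (m = K alpha = 0)
    beta = choleskies._triang_to_flat_pure(np.eye(N)[None, :, :])   # identity Cholesky factor, flattened
    m, L, L_inv, KL, dKL_db, dKL_da = recompute_posterior_fr(alpha, beta, K)
    assert m.shape == (N, 1) and np.isfinite(KL)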
def df_d(y: List[Tuple[int, float]], yc: List[List[Tuple[int, int]]], m: np.ndarray, L: np.ndarray,
         L_inv: np.ndarray, K: np.ndarray, sigma2s: np.ndarray, alpha: np.ndarray, beta: np.ndarray,
         s_to_l: Callable = dL_fr):
    """
    Computes the log marginal likelihood and its derivatives with respect to alpha and beta.
    Works for both mean field and full rank approximations.

    :param y: Direct observations as a list of tuples giving the location index (row in X) and the observation value
    :param yc: Batch comparisons in a list of lists of tuples. Each batch is a list and the tuples give the comparisons (winner index, loser index)
    :param m: mean of the latent values
    :param L: Cholesky decomposition of the latent value covariance
    :param L_inv: inverse of the Cholesky decomposition
    :param K: prior covariance
    :param sigma2s: noise variance of the observations
    :param alpha: Alpha vector used to parametrize the posterior approximation
    :param beta: Beta vector/matrix used to parametrize the posterior approximation
    :param s_to_l: A function that computes the derivative of the log likelihood with respect to beta via the generalized chain rule, given the derivative of the log likelihood with respect to Sigma
    :return: A tuple containing the log marginal likelihood, its derivative with respect to alpha and its derivative with respect to beta
    """
    Sigma = L @ L.T
    dF_dm_full = np.zeros_like(m)
    dF_dSigma_full = np.zeros_like(Sigma)
    F_full = 0

    # Accumulate the variational expectations of the comparison batches in random order
    d_list = np.random.choice(range(len(yc)), size=len(yc), replace=False)
    for batch_idx in d_list:
        loc_inds_winners = [yc[batch_idx][k][0] for k in range(len(yc[batch_idx]))]
        loc_inds_losers = [yc[batch_idx][k][1] for k in range(len(yc[batch_idx]))]
        loc_inds_batch = np.sort(np.unique(loc_inds_winners + loc_inds_losers))

        # Map the global location indices to positions within the batch
        ind_winners = [np.where(loc_inds_batch == it)[0][0] for it in loc_inds_winners]
        ind_losers = [np.where(loc_inds_batch == it)[0][0] for it in loc_inds_losers]

        # Variational expectation of the batch likelihood and its derivatives
        F_batch, dF_dm_batch, dF_dSigma_batch = variational_expectations_ove_full_rank(
            m[loc_inds_batch], Sigma[np.ix_(loc_inds_batch, loc_inds_batch)],
            ind_winners, ind_losers, sigma2s[loc_inds_batch])
        dF_dm_full[loc_inds_batch] += dF_dm_batch
        dF_dSigma_full[np.ix_(loc_inds_batch, loc_inds_batch)] += dF_dSigma_batch
        F_full += F_batch

    # Gaussian likelihood terms for the direct observations
    if len(y) > 0:
        ys = np.zeros((len(y), 1))
        y_inds = np.zeros(len(y), dtype=int)
        for ind in range(len(y)):
            (y_inds[ind], ys[ind, 0]) = y[ind]  # index in the kernel, observed value
        F_full += -0.5 * np.sum(((m[y_inds] - ys)**2 + Sigma[y_inds, y_inds].reshape((-1, 1))) / sigma2s[y_inds].reshape((-1, 1)))
        dF_dm_full[y_inds] += (ys - m[y_inds]) / sigma2s[y_inds].reshape((-1, 1))
        dF_dSigma_full[y_inds, y_inds] += -0.5 / sigma2s[y_inds].reshape((-1))

    # Chain rule: gradients with respect to the variational parameters
    alpha_grad = K @ dF_dm_full
    beta_grad = s_to_l(L, dF_dSigma_full, alpha, beta, K)
    log_marginal = F_full
    if beta_grad.shape[1] > 1:
        beta_grad = choleskies._triang_to_flat_pure(beta_grad)
    return log_marginal, alpha_grad, beta_grad
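# A hypothetical data-format sketch for df_d (not from the original code): `y` stores
# direct observations as (row index in X, observed value) pairs and `yc` stores
# comparison batches, each batch being a list of (winner index, loser index) pairs.
# The remaining arguments follow recompute_posterior_fr above.
def _example_df_d(m, L, L_inv, K, sigma2s, alpha, beta):
    y = [(0, -0.3), (2, 0.1)]     # x_0 observed as -0.3, x_2 observed as 0.1
    yc = [[(1, 0), (1, 2)]]       # one batch: x_1 preferred over both x_0 and x_2
    return df_d(y, yc, m, L, L_inv, K, sigma2s, alpha, beta)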
def __init__(
    self,
    X: np.ndarray,
    y: List[Tuple[int, float]],
    yc: List[List[Tuple[int, int]]],
    kernel: GPy.kern.Kern,
    likelihood: Gaussian,
    vi_mode: str = "fr",
    name: str = "VIComparisonGP",
    max_iters: int = 50,
    get_logger: Callable = None,
):
    super(VIComparisonGP, self).__init__(name=name)
    self.N, self.D = X.shape[0], X.shape[1]
    self.output_dim = 1
    self.get_logger = get_logger
    self.X = X
    self.y = y
    self.yc = yc
    self.max_iters = max_iters
    self.vi_mode = vi_mode
    self.kern = kernel
    self.likelihood = likelihood
    # Per-location observation noise variances
    self.sigma2s = self.likelihood.variance * np.ones((X.shape[0], 1))

    jitter = 1e-6
    K = self.kern.K(X)
    L = np.linalg.cholesky(K + np.identity(K.shape[0]) * jitter)

    self.alpha = np.zeros((self.N, 1))
    self.beta = np.ones((self.N, 1))
    self.posterior = None
    # If we are using full rank VI, we initialize it with mean field VI
    if self.vi_mode == "FRVI":
        self.posterior, _, _, self.alpha, self.beta = vi.vi_comparison(
            self.X, self.y, self.yc, self.kern, self.sigma2s,
            self.alpha, self.beta, max_iters=50, method="mf")
        self.beta = choleskies._triang_to_flat_pure(jitchol(self.posterior.covariance)[None, :])
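# A hypothetical construction sketch (not part of the original class): five inputs in
# two dimensions, two direct observations and one comparison batch, with a GPy RBF
# kernel and Gaussian likelihood. The observation format matches df_d above.
#
#   X = np.random.rand(5, 2)
#   y = [(0, 0.2), (3, -0.4)]
#   yc = [[(1, 4), (1, 2)]]
#   kernel = GPy.kern.RBF(input_dim=2)
#   likelihood = GPy.likelihoods.Gaussian(variance=1e-2)
#   model = VIComparisonGP(X, y, yc, kernel, likelihood, vi_mode="fr")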
def test_triang_to_flat(self):
    # The pure numpy and the cython implementations should agree
    A1 = choleskies._triang_to_flat_pure(self.triang)
    A2 = choleskies._triang_to_flat_cython(self.triang)
    np.testing.assert_allclose(A1, A2)
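# A hedged companion check (not in the original suite): flattening a stack of
# lower-triangular matrices and mapping it back should reproduce the input exactly.
# It builds its own factor, so it makes no assumption about the self.triang fixture.
def test_flat_to_triang_roundtrip(self):
    L = np.tril(np.random.randn(4, 4))[None, :, :]   # one 4x4 lower-triangular factor
    flat = choleskies._triang_to_flat_pure(L)
    np.testing.assert_allclose(choleskies._flat_to_triang_pure(flat), L)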
old_mean_grad = mean_grad
old_L_grad = L_grad

# Convergence heuristics: the running dot products between successive gradients turn
# negative once the iterates start oscillating, and the ELBO criterion checks for stagnation
criterion1 = mean_grads_running_dot_product < 0
criterion2 = sigma_grads_running_dot_product < 0
criterion3 = np.abs(elbo_prev - elbo) < np.abs(elbo_threshold_swa * elbo_prev)

mean_grads_running_dot_product = np.mean(mean_grad * old_mean_grad)
sigma_grads_running_dot_product = np.mean(old_L_grad * L_grad)

# Constant learning rate updates
means_vb_clr += step_size * mean_grad_clr
L_vb_clr += 0.5 * step_size * L_grad_clr
betas_vb_clr = choleskies._triang_to_flat_pure(L_vb_clr[None, :])
betas = choleskies._triang_to_flat_pure(L[None, :])

params = [means, betas]
params_rms_prop = [means_vb_rms, L_vb_rms.flatten()]

# Step size adaptation with weight averaging
swa_weight = 1.
step_size, params_swa, swa_n = stepsize_linear_weight_averaging(
    params, step_size, step_size_min, step_size_max, itt + 1, itt_max + 1,
    start_swa_iter, 200, params_swa, swa_n, weight=1.1, pmz='std')

# RMSProp-style scaling of the gradients
rho, s = rms_prop_gradient(itt + 1, mean_grad_rms, L_grad_rms.flatten(), s_prev, step_size_rms)
s_prev = s
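# Illustrative sketch of the gradient dot-product heuristic used above (made-up variable
# names, not from the original code): the mean of the elementwise product between
# successive stochastic gradients stays positive while the optimizer makes steady
# progress and turns negative once the iterates start oscillating around an optimum,
# which is the signal checked by criterion1/criterion2.
#
#   g_dot_running = 0.0
#   g_old = np.zeros_like(params)
#   for t in range(itt_max):
#       g_new = stochastic_gradient(params)        # hypothetical gradient oracle
#       g_dot_running = np.mean(g_new * g_old)     # < 0 once progress stalls
#       g_old = g_new
#       params = params + step_size * g_new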