Example #1
def recompute_posterior_fr(alpha: np.ndarray, beta: np.ndarray, K: np.ndarray) -> Tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray, np.ndarray, np.ndarray]:
    """
    Recompute the posterior approximation for the full rank case: mean K alpha, covariance L L^T, where L is the lower-triangular matrix unpacked from beta
    :param alpha: Alpha vector used to parametrize the posterior approximation
    :param beta: Beta vector/matrix used to parametrize the posterior approximation
    :param K: prior covariance
    :return: Tuple containing the mean, the Cholesky factor of the covariance, its inverse, the KL divergence, and the derivatives of the KL divergence with respect to beta and alpha
    """
    N = K.shape[0]
    # Unpack the flat beta vector into a single lower-triangular Cholesky factor
    L = choleskies._flat_to_triang_pure(beta)
    assert L.shape[0] == 1
    L = L[0, :, :]

    # Compute mean: m = K alpha
    m = K @ alpha
    jitter = 1e-5
    # dKL/dalpha: the KL's mean term is 0.5 * alpha^T K alpha, so its gradient is K alpha = m
    dKL_da = m.copy()
    Kinv = np.linalg.inv(K + np.eye(N)*jitter)
    L_inv = np.linalg.inv(L)

    # Posterior covariance in the full-rank parametrization: Sigma = L L^T
    Lamda_full_rank = np.dot(L, L.T)

    dKL_db_triang = -dL_fr(L, 0.5*(np.linalg.inv(Lamda_full_rank) - Kinv), None, None, None)

    Sigma = Lamda_full_rank
    # Compute KL
    KL = 0.5*(-N + (m.T@Kinv@m) + np.trace(Kinv @ Sigma) - np.log(np.linalg.det(Sigma @ Kinv)))
    dKL_db = choleskies._triang_to_flat_pure(dKL_db_triang)

    return m, L, L_inv, KL, dKL_db, dKL_da
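A minimal usage sketch for the function above (hypothetical toy inputs; assumes recompute_posterior_fr, dL_fr and GPy.util.choleskies are importable from the surrounding module). It cross-checks the returned KL value against the closed-form KL between N(m, L L^T) and the prior N(0, K).

import numpy as np
from GPy.util import choleskies

N = 4
rng = np.random.default_rng(0)
A = rng.standard_normal((N, N))
K = A @ A.T + N * np.eye(N)                  # toy SPD prior covariance
alpha = rng.standard_normal((N, 1))
L0 = np.linalg.cholesky(0.5 * np.eye(N))     # initial posterior Cholesky factor
beta = choleskies._triang_to_flat_pure(L0[None, :, :])

m, L, L_inv, KL, dKL_db, dKL_da = recompute_posterior_fr(alpha, beta, K)

# Reference KL(N(m, S) || N(0, K)) with S = L L^T, using the same jitter as above
S = L @ L.T
Kinv = np.linalg.inv(K + 1e-5 * np.eye(N))
KL_ref = 0.5 * (np.trace(Kinv @ S) + (m.T @ Kinv @ m).item() - N
                + np.linalg.slogdet(K + 1e-5 * np.eye(N))[1]
                - np.linalg.slogdet(S)[1])
print(np.isclose(np.squeeze(KL), KL_ref))    # expected: True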
Example #2
def df_d(y: List[Tuple[int, float]], yc: List[List[Tuple[int, int]]], m: np.ndarray, L: np.ndarray, L_inv: np.ndarray, K: np.ndarray, sigma2s: np.ndarray, alpha: np.ndarray, beta: np.ndarray, s_to_l: Callable=dL_fr):
    """
    Computes the log marginal likelihood and its derivatives with respect to alpha and beta. Works for both the mean field and full rank approximations.
    
    :param y: Direct observations as a list of tuples giving the location index (row in X) and the observation value.
    :param yc: Batch comparisons as a list of lists of tuples. Each batch is a list whose tuples give the comparisons as (winner index, loser index)
    :param m: mean of the latent values
    :param L: Cholesky decomposition of the latent value covariance
    :param L_inv: Inverse of the Cholesky decomposition
    :param K: prior covariance
    :param sigma2s: noise variance of the observations
    :param alpha: Alpha vector used to parametrize the posterior approximation
    :param beta: Beta vector/matrix used to parametrize the posterior approximation
    :param s_to_l: Function that computes the derivative of the log likelihood with respect to beta via the chain rule, given the derivative of the log likelihood with respect to Sigma
    :return: A tuple containing log marginal likelihood, its derivative with respect to alpha and its derivative with respect to beta
    """
    Sigma = L @ L.T

    dF_dm_full = np.zeros_like(m)
    dF_dSigma_full = np.zeros_like(Sigma)
    F_full = 0
    # visit the comparison batches in a random order
    d_list = np.random.choice(range(len(yc)), size=len(yc), replace=False)
    for batch_idx in d_list:
        loc_inds_winners = [yc[batch_idx][k][0] for k in range(len(yc[batch_idx]))]
        loc_inds_losers = [yc[batch_idx][k][1] for k in range(len(yc[batch_idx]))]
        loc_inds_batch = np.sort(np.unique(loc_inds_winners + loc_inds_losers))
        # map global location indices to batch-local indices
        ind_winners = [np.where(loc_inds_batch == it)[0][0] for it in loc_inds_winners]
        ind_losers = [np.where(loc_inds_batch == it)[0][0] for it in loc_inds_losers]

        # variational expectations of the batch likelihood and their derivatives
        F_batch, dF_dm_batch, dF_dSigma_batch = variational_expectations_ove_full_rank(m[loc_inds_batch], Sigma[np.ix_(loc_inds_batch, loc_inds_batch)], ind_winners, ind_losers, sigma2s[loc_inds_batch])
        dF_dm_full[loc_inds_batch] += dF_dm_batch
        dF_dSigma_full[np.ix_(loc_inds_batch, loc_inds_batch)] += dF_dSigma_batch
        F_full += F_batch

    # direct (numerical) observations contribute Gaussian log-likelihood terms
    if len(y) > 0:
        ys = np.zeros((len(y),1))
        y_inds = np.zeros(len(y), dtype=int)
        for ind in range(len(y)):
            (y_inds[ind], ys[ind,0]) = y[ind] #index in kernel, y value
        F_full += -0.5*np.sum(  ( (m[y_inds] - ys)**2 + Sigma[y_inds, y_inds].reshape((-1,1)) ) / sigma2s[y_inds].reshape((-1,1)) )
        dF_dm_full[y_inds] += (ys - m[y_inds] ) / sigma2s[y_inds].reshape((-1,1))
        dF_dSigma_full[y_inds, y_inds] += -0.5 / sigma2s[y_inds].reshape((-1))

    alpha_grad = K @ dF_dm_full

    beta_grad = s_to_l(L, dF_dSigma_full, alpha, beta, K)

    log_marginal = F_full
    # in the full rank case beta_grad is a triangular matrix and must be flattened to match beta
    if beta_grad.shape[1] > 1:
        beta_grad = choleskies._triang_to_flat_pure(beta_grad)
    return log_marginal, alpha_grad, beta_grad
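A small self-contained sketch of the batch index remapping performed inside df_d (hypothetical comparison batch; only numpy is required). It shows how the global row indices of a batch are mapped to the local indices used to slice m and the corresponding sub-block of Sigma.

import numpy as np

batch = [(7, 2), (7, 5)]   # comparisons as (winner row in X, loser row in X)
loc_inds_winners = [w for w, _ in batch]
loc_inds_losers = [l for _, l in batch]

# global row indices appearing in this batch, deduplicated and sorted
loc_inds_batch = np.sort(np.unique(loc_inds_winners + loc_inds_losers))          # [2 5 7]

# batch-local indices, valid for m[loc_inds_batch] and Sigma[np.ix_(...)]
ind_winners = [np.where(loc_inds_batch == it)[0][0] for it in loc_inds_winners]  # [2, 2]
ind_losers = [np.where(loc_inds_batch == it)[0][0] for it in loc_inds_losers]    # [0, 1]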
Example #3
    def __init__(
        self,
        X: np.ndarray,
        y: List[Tuple[int, float]],
        yc: List[List[Tuple[int, int]]],
        kernel: GPy.kern.Kern,
        likelihood: Gaussian,
        vi_mode: str = "fr",
        name: str = "VIComparisonGP",
        max_iters: int = 50,
        get_logger: Callable = None,
    ):
        super(VIComparisonGP, self).__init__(name=name)

        self.N, self.D = X.shape[0], X.shape[1]

        self.output_dim = 1
        self.get_logger = get_logger
        self.X = X
        self.y = y
        self.yc = yc

        self.max_iters = max_iters
        self.vi_mode = vi_mode

        self.kern = kernel
        self.likelihood = likelihood

        self.sigma2s = self.likelihood.variance * np.ones(
            (X.shape[0], 1), dtype=float)
        jitter = 1e-6
        K = self.kern.K(X)
        L = np.linalg.cholesky(K + np.identity(K.shape[0]) * jitter)

        self.alpha = np.zeros((self.N, 1))
        self.beta = np.ones((self.N, 1))

        self.posterior = None

        # If we are using full rank VI, we initialize it with mean field VI
        if self.vi_mode == "FRVI":
            self.posterior, _, _, self.alpha, self.beta = vi.vi_comparison(
                self.X,
                self.y,
                self.yc,
                self.kern,
                self.sigma2s,
                self.alpha,
                self.beta,
                max_iters=50,
                method="mf")
            self.beta = choleskies._triang_to_flat_pure(
                jitchol(self.posterior.covariance)[None, :])
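A minimal construction sketch for the class above (hypothetical data; assumes VIComparisonGP and GPy are importable, and that y and yc follow the layout described for df_d).

import numpy as np
import GPy

X = np.random.rand(6, 2)
y = [(0, 0.7)]                       # one direct observation at row 0 of X
yc = [[(1, 2)], [(3, 4), (3, 5)]]    # two comparison batches of (winner, loser) rows

model = VIComparisonGP(
    X, y, yc,
    kernel=GPy.kern.RBF(input_dim=2),
    likelihood=GPy.likelihoods.Gaussian(variance=1e-2),
    vi_mode="fr",
    max_iters=100,
)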
Example #4
    def test_triang_to_flat(self):
        A1 = choleskies._triang_to_flat_pure(self.triang)
        A2 = choleskies._triang_to_flat_cython(self.triang)
        np.testing.assert_allclose(A1, A2)
Example #5
    def test_triang_to_flat(self):
        A1 = choleskies._triang_to_flat_pure(self.triang)
        A2 = choleskies._triang_to_flat_cython(self.triang)
        np.testing.assert_allclose(A1, A2)
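A short round-trip sketch of the flat/triangular packing exercised by the test above (assumes GPy.util.choleskies is available; each column of the flat array packs one lower triangle).

import numpy as np
from GPy.util import choleskies

N = 3
L = np.tril(np.random.randn(N, N))[None, :, :]   # one N x N lower-triangular matrix
flat = choleskies._triang_to_flat_pure(L)        # shape (N*(N+1)//2, 1)
L_back = choleskies._flat_to_triang_pure(flat)   # shape (1, N, N)
np.testing.assert_allclose(L, L_back)            # round trip is exact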
Example #6
                old_mean_grad = mean_grad
                old_L_grad = L_grad
                # convergence criteria: sign changes in successive gradients and a
                # small relative change in the ELBO
                criterion1 = mean_grads_running_dot_product < 0
                criterion2 = sigma_grads_running_dot_product < 0
                criterion3 = np.abs(elbo_prev - elbo) < np.abs(elbo_threshold_swa * elbo_prev)
                # running (mean) dot products of successive gradients
                mean_grads_running_dot_product = np.mean(mean_grad*old_mean_grad)
                sigma_grads_running_dot_product = np.mean(old_L_grad*L_grad)

                means_vb_clr += step_size * mean_grad_clr
                L_vb_clr += 0.5 * step_size * L_grad_clr
                betas_vb_clr = choleskies._triang_to_flat_pure(L_vb_clr[None, :])
                betas = choleskies._triang_to_flat_pure(L[None, :])
                params = [means, betas]
                params_rms_prop = [means_vb_rms, L_vb_rms.flatten()]
                swa_weight = 1.
                step_size, params_swa, swa_n = stepsize_linear_weight_averaging(params, step_size, step_size_min,
                                                                                step_size_max,
                                                                                itt + 1, itt_max + 1, start_swa_iter,
                                                                                200, params_swa, swa_n, weight=1.1,
                                                                                pmz='std')
                rho, s = rms_prop_gradient(itt + 1, mean_grad_rms, L_grad_rms.flatten(), s_prev, step_size_rms)
                s_prev = s
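A toy sketch of the gradient dot-product heuristic used above as an oscillation criterion (hypothetical quadratic objective; only numpy is needed). With a deliberately large step the iterate overshoots, successive gradients point in opposing directions, and the mean elementwise product turns negative.

import numpy as np

def grad(x):
    return 2.0 * x                    # gradient of f(x) = ||x||^2

x = np.array([3.0, -2.0])
step = 0.6                            # too large: causes overshooting
old_g = grad(x)
for itt in range(100):
    x = x - step * old_g
    g = grad(x)
    if np.mean(g * old_g) < 0:        # analogue of criterion1 / criterion2 above
        print("oscillation detected at iteration", itt)
        break
    old_g = g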