Example #1
    def predictive(self, x_new, lik_noise=False):
        # Matrices
        q_m = self.q_m.detach().numpy()
        q_L = torch.tril(self.q_L)
        q_S = torch.mm(q_L, q_L.t()).detach().numpy()
        Kuu = self.kernel.K(self.z, self.z).detach().numpy()

        posterior = Posterior(mean=q_m, cov=q_S, K=Kuu, prior_mean=np.zeros(q_m.shape))
        Kx = self.kernel.K(self.z, x_new).detach().numpy()
        Kxx = self.kernel.K(x_new, x_new).detach().numpy()

        # GP Predictive Posterior - mean + variance
        gp_mu = np.dot(Kx.T, posterior.woodbury_vector)
        Kxx = np.diag(Kxx)
        gp_var = (Kxx - np.sum(np.dot(np.atleast_3d(posterior.woodbury_inv).T, Kx) * Kx[None, :, :], 1)).T

        gp = gp_mu
        if lik_noise:
            gp_upper = gp_mu + 2 * np.sqrt(gp_var) + 2 * self.likelihood.sigma.detach().numpy()
            gp_lower = gp_mu - 2 * np.sqrt(gp_var) - 2 * self.likelihood.sigma.detach().numpy()
        else:
            gp_upper = gp_mu + 2 * np.sqrt(gp_var)
            gp_lower = gp_mu - 2 * np.sqrt(gp_var)

        return gp, gp_upper, gp_lower
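# The predictive equations above only need the posterior's woodbury_vector and
# woodbury_inv. Below is a self-contained NumPy sketch (not the class method
# itself) for exact GP regression, where those quantities are simply
# (K + noise*I)^{-1} y and (K + noise*I)^{-1}; all names here (rbf, Xtr,
# x_new, ...) are illustrative stand-ins.
import numpy as np

def rbf(A, B, lengthscale=1.0, variance=1.0):
    d2 = ((A[:, None, :] - B[None, :, :]) ** 2).sum(-1)
    return variance * np.exp(-0.5 * d2 / lengthscale ** 2)

rng = np.random.default_rng(0)
Xtr = rng.uniform(-3, 3, (15, 1))
ytr = np.sin(Xtr) + 0.1 * rng.normal(size=(15, 1))
x_new = np.linspace(-3, 3, 50)[:, None]
noise = 0.01

K = rbf(Xtr, Xtr) + noise * np.eye(len(Xtr))
woodbury_inv = np.linalg.inv(K)              # stand-in for posterior.woodbury_inv
woodbury_vector = woodbury_inv @ ytr         # stand-in for posterior.woodbury_vector

Kx = rbf(Xtr, x_new)
Kxx = np.diag(rbf(x_new, x_new))

gp_mu = Kx.T @ woodbury_vector                                   # predictive mean
gp_var = Kxx - np.einsum('ij,jk,ki->i', Kx.T, woodbury_inv, Kx)  # predictive variance
gp_upper = gp_mu + 2 * np.sqrt(gp_var)[:, None]
gp_lower = gp_mu - 2 * np.sqrt(gp_var)[:, None]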
    def _outer_loop_without_missing_data(self):
        if self.posterior is None:
            woodbury_inv = np.zeros(
                (self.num_inducing, self.num_inducing, self.output_dim))
            woodbury_vector = np.zeros((self.num_inducing, self.output_dim))
        else:
            woodbury_inv = self.posterior._woodbury_inv
            woodbury_vector = self.posterior._woodbury_vector

        d = self.stochastics.d[0][0]
        posterior, log_marginal_likelihood, grad_dict = self._inner_parameters_changed(
            self.kern, self.X, self.Z, self.likelihood,
            self.Y_normalized[:, d], self.Y_metadata)
        self.grad_dict = grad_dict

        self._log_marginal_likelihood = log_marginal_likelihood

        self._outer_values_update(self.grad_dict)

        woodbury_inv[:, :, d] = posterior.woodbury_inv[:, :, None]
        woodbury_vector[:, d] = posterior.woodbury_vector
        if self.posterior is None:
            self.posterior = Posterior(woodbury_inv=woodbury_inv,
                                       woodbury_vector=woodbury_vector,
                                       K=posterior._K,
                                       mean=None,
                                       cov=None,
                                       K_chol=posterior.K_chol)
def _inference(K: np.ndarray, ga_approx: GaussianApproximation, cav_params: CavityParams, Z_tilde: float,
               y: List[Tuple[int, float]], yc: List[List[Tuple[int, int]]]) -> Tuple[Posterior, float, Dict]:
    """
    Compute the posterior approximation
    :param K: prior covariance matrix
    :param ga_approx: Gaussian approximation of the batches
    :param cav_params: Cavity parameters of the posterior
    :param Z_tilde: Log marginal likelihood
    :param y: Direct observations as a list of tuples giving the location index (row in X) and the observation value.
    :param yc: Batch comparisons as a list of lists of tuples. Each batch is a list, and each tuple gives a comparison (winner index, loser index)
    :return: A tuple consisting of the posterior approximation, log marginal likelihood and gradient dictionary
    """
    
    log_marginal, post_params = _ep_marginal(K, ga_approx, Z_tilde, y, yc)
    tau_tilde_root = sqrtm_block(ga_approx.tau, y, yc)
    Sroot_tilde_K = np.dot(tau_tilde_root, K)
    aux_alpha, _ = dpotrs(post_params.L, np.dot(Sroot_tilde_K, ga_approx.v), lower=1)
    alpha = (ga_approx.v - np.dot(tau_tilde_root, aux_alpha))[:, None]  # (K + Sigma_tilde)^(-1) mu_tilde
    LWi, _ = dtrtrs(post_params.L, tau_tilde_root, lower=1)

    Wi = np.dot(LWi.T, LWi)
    symmetrify(Wi)  # (K + Sigma_tilde)^(-1)
    dL_dK = 0.5 * (tdot(alpha) - Wi)
    dL_dthetaL = 0
    return Posterior(woodbury_inv=np.asfortranarray(Wi), woodbury_vector=alpha, K=K), log_marginal, {'dL_dK': dL_dK, 'dL_dthetaL': dL_dthetaL, 'dL_dm': alpha}
Example #4
    def inference(self,
                  kern,
                  X,
                  W,
                  likelihood,
                  Y,
                  mean_function=None,
                  Y_metadata=None,
                  K=None,
                  variance=None,
                  Z_tilde=None):
        """
        Returns a Posterior class containing essential quantities of the posterior
        """

        if mean_function is None:
            m = 0
        else:
            m = mean_function.f(X)

        if variance is None:
            variance = likelihood.gaussian_variance(Y_metadata)

        YYT_factor = Y - m

        if K is None:
            K = kern.K(X)

        Ky = K.copy()

        diag.add(Ky, variance + 1e-8)

        Wi, LW, LWi, W_logdet = pdinv(Ky)

        alpha, _ = dpotrs(LW, YYT_factor, lower=1)

        log_marginal = 0.5 * (-Y.size * log_2_pi - Y.shape[1] * W_logdet -
                              np.sum(alpha * YYT_factor))

        if Z_tilde is not None:
            # This is a correction term for the log marginal likelihood
            # In EP this is log Z_tilde, which is the difference between the
            # Gaussian marginal and Z_EP
            log_marginal += Z_tilde

        dL_dK = 0.5 * (tdot(alpha) - Y.shape[1] * Wi)

        dL_dthetaL = likelihood.exact_inference_gradients(
            np.diag(dL_dK), Y_metadata)

        posterior_ = Posterior(woodbury_chol=LW, woodbury_vector=alpha, K=K)

        return posterior_, log_marginal, {
            'dL_dK': dL_dK,
            'dL_dthetaL': dL_dthetaL,
            'dL_dm': alpha
        }, W_logdet
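# Illustrative sketch of the quantity assembled in `log_marginal` above (not
# the library routine): the exact Gaussian log marginal likelihood and the
# weights `alpha` via a Cholesky factorisation, with SciPy stand-ins for
# pdinv/dpotrs and assuming a zero mean function.
import numpy as np
from scipy.linalg import cho_factor, cho_solve

log_2_pi = np.log(2 * np.pi)

def exact_gp_evidence(K, Y, noise_var):
    N, P = Y.shape
    Ky = K + (noise_var + 1e-8) * np.eye(N)
    c, low = cho_factor(Ky, lower=True)
    alpha = cho_solve((c, low), Y)                       # (K + noise*I)^{-1} Y
    W_logdet = 2.0 * np.sum(np.log(np.diag(c)))
    return 0.5 * (-Y.size * log_2_pi - P * W_logdet - np.sum(alpha * Y)), alpha

rng = np.random.default_rng(1)
X = rng.normal(size=(20, 1))
K = np.exp(-0.5 * (X - X.T) ** 2)
Y = rng.normal(size=(20, 2))
print(exact_gp_evidence(K, Y, noise_var=0.1)[0])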
    def _outer_loop_for_missing_data(self):
        Lm = None
        dL_dKmm = None

        self._log_marginal_likelihood = 0
        self.full_values = self._outer_init_full_values()

        if self.posterior is None:
            woodbury_inv = np.zeros(
                (self.num_inducing, self.num_inducing, self.output_dim))
            woodbury_vector = np.zeros((self.num_inducing, self.output_dim))
        else:
            woodbury_inv = self.posterior._woodbury_inv
            woodbury_vector = self.posterior._woodbury_vector

        if not self.stochastics:
            m_f = lambda i: "Inference with missing_data: {: >7.2%}".format(
                float(i + 1) / self.output_dim)
            message = m_f(-1)
            print(message, end='')

        for d in self.stochastics.d:
            ninan = self.ninan[:, d]

            if not self.stochastics:
                print(' ' * len(message) + '\r', end='')
                message = m_f(d)
                print(message, end='')

            posterior, log_marginal_likelihood, \
                grad_dict, current_values, value_indices = self._inner_parameters_changed(
                                self.kern, self.X[ninan],
                                self.Z, self.likelihood,
                                self.Ylist[d], self.Y_metadata,
                                Lm, dL_dKmm,
                                subset_indices=dict(outputs=d, samples=ninan))

            self._inner_take_over_or_update(self.full_values, current_values,
                                            value_indices)
            self._inner_values_update(current_values)

            Lm = posterior.K_chol
            dL_dKmm = grad_dict['dL_dKmm']
            woodbury_inv[:, :, d] = posterior.woodbury_inv
            woodbury_vector[:, d:d + 1] = posterior.woodbury_vector
            self._log_marginal_likelihood += log_marginal_likelihood
        if not self.stochastics:
            print('')

        if self.posterior is None:
            self.posterior = Posterior(woodbury_inv=woodbury_inv,
                                       woodbury_vector=woodbury_vector,
                                       K=posterior._K,
                                       mean=None,
                                       cov=None,
                                       K_chol=posterior.K_chol)
        self._outer_values_update(self.full_values)
Example #6
def vi_comparison(X: np.ndarray, y: List[Tuple[int, float]], yc: List[List[Tuple[int, int]]],
                  kern: GPy.kern.Kern, sigma2s: np.ndarray, alpha: np.ndarray, beta: np.ndarray,
                  max_iters: int=200, lr: float=1e-3, method: str='fr', optimize: str="adam",
                  get_logger: Callable=None) -> Tuple[Posterior, float, Dict, np.ndarray, np.ndarray]:
    """
    :param X: All locations of both direct observations and batch comparisons
    :param y: Direct observations as a list of tuples giving the location index (row in X) and the observation value.
    :param yc: Batch comparisons as a list of lists of tuples. Each batch is a list, and each tuple gives a comparison (winner index, loser index)
    :param kern: Prior covariance kernel
    :param sigma2s: Noise variance of the observations
    :param alpha: Initial values for alpha
    :param beta: Initial values for beta
    :param max_iters: maximum number of optimization iterations
    :param lr: learning rate for the Adam optimizer
    :param method: full rank 'fr' or mean field 'mf' method
    :param optimize: optimization algorithm, 'adam' or 'l-bfgs-B'
    :param get_logger: Function for retrieving the logger to which the prints are forwarded
    :return: A tuple containing the posterior, the log marginal likelihood, its gradients with respect to hyperparameters (not supported at the moment), and the alpha and beta values
    """
    if method == 'fr':
        recompute_posterior = recompute_posterior_fr
        s_to_l = dL_fr
    else:
        recompute_posterior = recompute_posterior_mf
        s_to_l = dL_mf

    K = kern.K(X)
    K = K + 1e-6*np.identity(len(K))
    N = X.shape[0]

    X0 = np.r_[alpha, beta]
    args = [K, sigma2s, y, yc, recompute_posterior, s_to_l]
    if optimize == "adam":
        X, log_marginal, _ = adam(log_lik, X0.flatten(), args, bounds=None, max_it=max_iters, get_logger=get_logger)
    else:
        res = sp.optimize.minimize(fun=log_lik,
                                   x0=X0.flatten(),
                                   args=args,
                                   method='L-BFGS-B',
                                   jac=True,
                                   bounds=None)
        X = res.x.reshape(-1)
        log_marginal = res.fun
    alpha = X[:K.shape[0]].reshape(-1,1)
    beta = X[K.shape[0]:].reshape(-1,1)

    # Create posterior instance
    m, L, L_inv, KL, dKL_db_, dKL_da_ = recompute_posterior(alpha, beta, K)
    posterior = Posterior(mean=m, cov=L @ L.T, K=K)
    grad_dict = {}# {'dL_dK': dF_dK - dKL_dK, 'dL_dthetaL':dL_dthetaL}
    # return posterior, log_marginal, grad_dict
    return posterior, log_marginal, grad_dict, alpha, beta
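# Self-contained analogue of the L-BFGS-B branch above (toy objective, not the
# actual log_lik): with jac=True, SciPy expects the objective to return both
# the value and its gradient, which is exactly how log_lik is called here. The
# quadratic objective and its data are illustrative only.
import numpy as np
from scipy.optimize import minimize

def toy_objective(x, A, b):
    r = A @ x - b
    return 0.5 * float(r @ r), A.T @ r      # (value, gradient)

A = np.array([[3.0, 1.0], [1.0, 2.0]])
b = np.array([1.0, -1.0])
res = minimize(fun=toy_objective, x0=np.zeros(2), args=(A, b),
               method='L-BFGS-B', jac=True, bounds=None)
print(res.x, res.fun)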
Example #7
def _inference(K, ga_approx, cav_params, likelihood, Z_tilde, Y_metadata=None):
    log_marginal, post_params = _ep_marginal(K, ga_approx, Z_tilde)

    tau_tilde_root = np.sqrt(ga_approx.tau)
    Sroot_tilde_K = tau_tilde_root[:,None] * K

    aux_alpha, _ = dpotrs(post_params.L, np.dot(Sroot_tilde_K, ga_approx.v), lower=1)
    alpha = (ga_approx.v - tau_tilde_root * aux_alpha)[:, None]  # (K + Sigma_tilde)^(-1) mu_tilde
    LWi, _ = dtrtrs(post_params.L, np.diag(tau_tilde_root), lower=1)
    Wi = np.dot(LWi.T, LWi)
    symmetrify(Wi)  # (K + Sigma_tilde)^(-1)

    dL_dK = 0.5 * (tdot(alpha) - Wi)
    dL_dthetaL = 0 #likelihood.ep_gradients(Y, cav_params.tau, cav_params.v, np.diag(dL_dK), Y_metadata=Y_metadata, quad_mode='gh')
    #temp2 = likelihood.ep_gradients(Y, cav_params.tau, cav_params.v, np.diag(dL_dK), Y_metadata=Y_metadata, quad_mode='naive')
    #temp = likelihood.exact_inference_gradients(np.diag(dL_dK), Y_metadata = Y_metadata)
    #print("exact: {}, approx: {}, Ztilde: {}, naive: {}".format(temp, dL_dthetaL, Z_tilde, temp2))
    return Posterior(woodbury_inv=Wi, woodbury_vector=alpha, K=K), log_marginal, {'dL_dK':dL_dK, 'dL_dthetaL':dL_dthetaL, 'dL_dm':alpha}
Example #8
    def posteriors(self, q_u_means, q_u_chols, X, Y, Z, kern_list, likelihood,
                   B_list, Y_metadata):
        """
        Description:
        """
        ####### Dimensions #######
        D = likelihood.num_output_functions(Y_metadata)
        Q = len(kern_list)
        M = Z.shape[0]
        T = len(Y)
        dimensions = {'D': D, 'Q': Q, 'M': M, 'T': T}

        ####### Distributions #######
        Kuu, Luu, Kuui = multi_output.latent_funs_cov(Z, kern_list)
        p_U = pu(Kuu=Kuu, Luu=Luu, Kuui=Kuui)
        q_U = qu(mu_u=q_u_means, chols_u=q_u_chols)

        posteriors = []
        f_index = Y_metadata['function_index'].flatten()

        ####### q(F) posterior computations #######
        for d in range(D):
            Xtask = X[f_index[d]]
            q_fd = self.variational_q_fd(X=Xtask,
                                         Z=Z,
                                         q_U=q_U,
                                         p_U=p_U,
                                         kern_list=kern_list,
                                         B=B_list,
                                         N=Xtask.shape[0],
                                         dims=dimensions,
                                         d=d)
            Knew_d = multi_output.function_covariance(X=Xtask,
                                                      B=B_list,
                                                      kernel_list=kern_list,
                                                      d=d)
            posterior_fd = Posterior(mean=q_fd.m_fd.copy(),
                                     cov=q_fd.S_fd.copy(),
                                     K=Knew_d,
                                     prior_mean=np.zeros(q_fd.m_fd.shape))
            posteriors.append(posterior_fd)

        return posteriors
Example #9
    def inference(self,
                  kern,
                  X,
                  Z,
                  likelihood,
                  Y,
                  indexD,
                  output_dim,
                  Y_metadata=None,
                  Lm=None,
                  dL_dKmm=None,
                  Kuu_sigma=None):
        """
        The first phase of inference:
        Compute: log-likelihood, dL_dKmm

        Cached intermediate results: Kmm, KmmInv,
        """

        input_dim = Z.shape[0]

        uncertain_inputs = isinstance(X, VariationalPosterior)

        beta = 1. / likelihood.variance
        if len(beta) == 1:
            beta = np.zeros(output_dim) + beta

        beta_exp = np.zeros(indexD.shape[0])
        for d in range(output_dim):
            beta_exp[indexD == d] = beta[d]

        psi0, psi1, psi2 = self.gatherPsiStat(kern, X, Z, Y, beta,
                                              uncertain_inputs)

        psi2_sum = (beta_exp[:, None, None] * psi2).sum(0) / output_dim

        #======================================================================
        # Compute Common Components
        #======================================================================

        Kmm = kern.K(Z).copy()
        if Kuu_sigma is not None:
            diag.add(Kmm, Kuu_sigma)
        else:
            diag.add(Kmm, self.const_jitter)
        Lm = jitchol(Kmm)

        logL = 0.
        dL_dthetaL = np.zeros(output_dim)
        dL_dKmm = np.zeros_like(Kmm)
        dL_dpsi0 = np.zeros_like(psi0)
        dL_dpsi1 = np.zeros_like(psi1)
        dL_dpsi2 = np.zeros_like(psi2)
        wv = np.empty((Kmm.shape[0], output_dim))

        for d in range(output_dim):
            idx_d = indexD == d
            Y_d = Y[idx_d]
            N_d = Y_d.shape[0]
            beta_d = beta[d]

            psi2_d = psi2[idx_d].sum(0) * beta_d
            psi1Y = Y_d.T.dot(psi1[idx_d]) * beta_d
            psi0_d = psi0[idx_d].sum() * beta_d
            YRY_d = np.square(Y_d).sum() * beta_d

            LmInvPsi2LmInvT = backsub_both_sides(Lm, psi2_d, 'right')

            Lambda = np.eye(Kmm.shape[0]) + LmInvPsi2LmInvT
            LL = jitchol(Lambda)
            LmLL = Lm.dot(LL)

            b = dtrtrs(LmLL, psi1Y.T)[0].T
            bbt = np.square(b).sum()
            v = dtrtrs(LmLL, b.T, trans=1)[0].T
            LLinvPsi1TYYTPsi1LLinvT = tdot(b.T)

            tmp = -backsub_both_sides(LL, LLinvPsi1TYYTPsi1LLinvT)
            dL_dpsi2R = backsub_both_sides(Lm, tmp + np.eye(input_dim)) / 2

            logL_R = -N_d * np.log(beta_d)
            logL += -((N_d * log_2_pi + logL_R + psi0_d -
                       np.trace(LmInvPsi2LmInvT)) + YRY_d - bbt) / 2.

            dL_dKmm += dL_dpsi2R - backsub_both_sides(Lm, LmInvPsi2LmInvT) / 2

            dL_dthetaL[d:d +
                       1] = (YRY_d * beta_d + beta_d * psi0_d - N_d *
                             beta_d) / 2. - beta_d * (dL_dpsi2R * psi2_d).sum(
                             ) - beta_d * np.trace(LLinvPsi1TYYTPsi1LLinvT)

            dL_dpsi0[idx_d] = -beta_d / 2.
            dL_dpsi1[idx_d] = beta_d * np.dot(Y_d, v)
            dL_dpsi2[idx_d] = beta_d * dL_dpsi2R
            wv[:, d] = v

        LmInvPsi2LmInvT = backsub_both_sides(Lm, psi2_sum, 'right')

        Lambda = np.eye(Kmm.shape[0]) + LmInvPsi2LmInvT
        LL = jitchol(Lambda)
        LmLL = Lm.dot(LL)
        logdet_L = 2. * np.sum(np.log(np.diag(LL)))
        dL_dpsi2R_common = dpotri(LmLL)[0] / -2.
        dL_dpsi2 += dL_dpsi2R_common[None, :, :] * beta_exp[:, None, None]

        for d in range(output_dim):
            dL_dthetaL[d] += (dL_dpsi2R_common * psi2[indexD == d].sum(0)
                              ).sum() * -beta[d] * beta[d]

        dL_dKmm += dL_dpsi2R_common * output_dim

        logL += -output_dim * logdet_L / 2.

        #======================================================================
        # Compute dL_dKmm
        #======================================================================

        # dL_dKmm =  dL_dpsi2R - output_dim* backsub_both_sides(Lm, LmInvPsi2LmInvT)/2 #LmInv.T.dot(LmInvPsi2LmInvT).dot(LmInv)/2.

        #======================================================================
        # Compute the Posterior distribution of inducing points p(u|Y)
        #======================================================================

        LLInvLmT = dtrtrs(LL, Lm.T)[0]
        cov = tdot(LLInvLmT.T)

        wd_inv = backsub_both_sides(
            Lm,
            np.eye(input_dim) -
            backsub_both_sides(LL, np.identity(input_dim), transpose='left'),
            transpose='left')
        post = Posterior(woodbury_inv=wd_inv,
                         woodbury_vector=wv,
                         K=Kmm,
                         mean=None,
                         cov=cov,
                         K_chol=Lm)

        #======================================================================
        # Compute dL_dthetaL for uncertain inputs and non-heteroscedastic noise
        #======================================================================

        # for d in range(output_dim):
        #     dL_dthetaL[d:d+1] += - beta[d]*beta[d]*(dL_dpsi2R[None,:,:] * psi2[indexD==d]/output_dim).sum()
        # dL_dthetaL += - (dL_dpsi2R[None,:,:] * psi2_sum*D beta*(dL_dpsi2R*psi2).sum()

        #======================================================================
        # Compute dL_dpsi
        #======================================================================

        if not uncertain_inputs:
            dL_dpsi1 += (psi1[:, None, :] * dL_dpsi2).sum(2) * 2.

        if uncertain_inputs:
            grad_dict = {
                'dL_dKmm': dL_dKmm,
                'dL_dpsi0': dL_dpsi0,
                'dL_dpsi1': dL_dpsi1,
                'dL_dpsi2': dL_dpsi2,
                'dL_dthetaL': dL_dthetaL
            }
        else:
            grad_dict = {
                'dL_dKmm': dL_dKmm,
                'dL_dKdiag': dL_dpsi0,
                'dL_dKnm': dL_dpsi1,
                'dL_dthetaL': dL_dthetaL
            }

        return post, logL, grad_dict
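# Illustrative NumPy analogue of the backsub_both_sides calls used repeatedly
# above, assuming the convention that 'right' computes L^{-1} X L^{-T} and
# 'left' computes L^{-T} X L^{-1} for a lower-triangular Cholesky factor L.
# This is a sketch with SciPy triangular solves, not the library function.
import numpy as np
from scipy.linalg import solve_triangular

def backsub_both_sides_np(L, X, transpose='left'):
    if transpose == 'left':
        tmp = solve_triangular(L, X, lower=True, trans='T')         # L^{-T} X
        return solve_triangular(L, tmp.T, lower=True, trans='T').T  # ... L^{-1}
    tmp = solve_triangular(L, X, lower=True)                        # L^{-1} X
    return solve_triangular(L, tmp.T, lower=True).T                 # ... L^{-T}

rng = np.random.default_rng(0)
A = rng.normal(size=(4, 4))
X = A @ A.T
L = np.linalg.cholesky(X + 4 * np.eye(4))
print(np.allclose(backsub_both_sides_np(L, X, 'right'),
                  np.linalg.solve(L, np.linalg.solve(L, X).T).T))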
    def inference(self,
                  q_u_means,
                  q_u_chols,
                  X,
                  Y,
                  Z,
                  kern_list,
                  kern_list_Gdj,
                  kern_aux,
                  likelihood,
                  B_list,
                  Y_metadata,
                  KL_scale=1.0,
                  batch_scale=None,
                  predictive=False,
                  Gauss_Newton=False):
        M = Z.shape[0]
        T = len(Y)
        if batch_scale is None:
            batch_scale = [1.0] * T
        Ntask = [Y[t].shape[0] for t in range(T)]
        Q = len(kern_list)
        D = likelihood.num_output_functions(Y_metadata)
        Kuu, Luu, Kuui = util.latent_funs_cov(Z, kern_list)
        p_U = pu(Kuu=Kuu, Luu=Luu, Kuui=Kuui)
        q_U = qu(mu_u=q_u_means.copy(), chols_u=q_u_chols.copy())
        S_u = np.empty((Q, M, M))
        L_u = choleskies.flat_to_triang(q_U.chols_u.copy())
        for q in range(Q):
            np.dot(L_u[q, :, :], L_u[q, :, :].T, out=S_u[q, :, :])
        Su_add_Kuu = np.zeros((Q, M, M))
        Su_add_Kuu_chol = np.zeros((Q, M, M))
        for q in range(Q):
            Su_add_Kuu[q, :, :] = S_u[q, :, :] + Kuu[q, :, :]
            Su_add_Kuu_chol[q, :, :] = linalg.jitchol(Su_add_Kuu[q, :, :])

        # for every latent function f_d calculate q(f_d) and keep it as q(F):
        q_F = []
        posteriors_F = []
        f_index = Y_metadata['function_index'].flatten()
        d_index = Y_metadata['d_index'].flatten()

        for d in range(D):
            Xtask = X[f_index[d]]
            q_fd, q_U = self.calculate_q_f(X=Xtask,
                                           Z=Z,
                                           q_U=q_U,
                                           S_u=S_u,
                                           p_U=p_U,
                                           kern_list=kern_list,
                                           kern_list_Gdj=kern_list_Gdj,
                                           kern_aux=kern_aux,
                                           B=B_list,
                                           M=M,
                                           N=Xtask.shape[0],
                                           Q=Q,
                                           D=D,
                                           d=d)
            # Posterior objects for output functions (used in prediction)
            # TODO: the Posterior construction below may not be necessary and could be removed
            posterior_fd = Posterior(mean=q_fd.m_fd.copy(),
                                     cov=q_fd.S_fd.copy(),
                                     K=util.conv_function_covariance(
                                         X=Xtask,
                                         B=B_list,
                                         kernel_list=kern_list,
                                         kernel_list_Gdj=kern_list_Gdj,
                                         kff_aux=kern_aux,
                                         d=d),
                                     prior_mean=np.zeros(q_fd.m_fd.shape))
            posteriors_F.append(posterior_fd)
            q_F.append(q_fd)

        mu_F = []
        v_F = []
        for t in range(T):
            mu_F_task = np.empty((X[t].shape[0], 1))
            v_F_task = np.empty((X[t].shape[0], 1))
            for d, q_fd in enumerate(q_F):
                if f_index[d] == t:
                    mu_F_task = np.hstack((mu_F_task, q_fd.m_fd))
                    v_F_task = np.hstack((v_F_task, q_fd.v_fd))

            mu_F.append(mu_F_task[:, 1:])
            v_F.append(v_F_task[:, 1:])

        # posterior_Fnew for predictive
        if predictive:
            return posteriors_F
        # inference for rest of cases
        else:
            # Variational Expectations
            VE = likelihood.var_exp(Y, mu_F, v_F, Y_metadata)
            VE_dm, VE_dv = likelihood.var_exp_derivatives(
                Y, mu_F, v_F, Y_metadata, Gauss_Newton)
            for t in range(T):
                VE[t] = VE[t] * batch_scale[t]
                VE_dm[t] = VE_dm[t] * batch_scale[t]
                VE_dv[t] = VE_dv[t] * batch_scale[t]

            # KL Divergence
            KL = self.calculate_KL(q_U=q_U,
                                   Su_add_Kuu=Su_add_Kuu,
                                   Su_add_Kuu_chol=Su_add_Kuu_chol,
                                   p_U=p_U,
                                   M=M,
                                   Q=Q,
                                   D=D)

            # Log Marginal log(p(Y))
            F = 0
            for t in range(T):
                F += VE[t].sum()

            log_marginal = F - KL

            # Gradients and Posteriors
            dL_dS_u = []
            dL_dmu_u = []
            dL_dL_u = []
            dL_dKmm = []
            dL_dKmn = []
            dL_dKdiag = []
            posteriors = []
            for q in range(Q):
                (dL_dmu_q, dL_dL_q, dL_dS_q, posterior_q, dL_dKqq, dL_dKdq,
                 dL_dKdiag_q) = self.calculate_gradients(
                     q_U=q_U,
                     S_u=S_u,
                     Su_add_Kuu_chol=Su_add_Kuu_chol,
                     p_U=p_U,
                     q_F=q_F,
                     VE_dm=VE_dm,
                     VE_dv=VE_dv,
                     Ntask=Ntask,
                     M=M,
                     Q=Q,
                     D=D,
                     f_index=f_index,
                     d_index=d_index,
                     q=q)
                dL_dmu_u.append(dL_dmu_q)
                dL_dL_u.append(dL_dL_q)
                dL_dS_u.append(dL_dS_q)
                dL_dKmm.append(dL_dKqq)
                dL_dKmn.append(dL_dKdq)
                dL_dKdiag.append(dL_dKdiag_q)
                posteriors.append(posterior_q)

            gradients = {
                'dL_dmu_u': dL_dmu_u,
                'dL_dL_u': dL_dL_u,
                'dL_dS_u': dL_dS_u,
                'dL_dKmm': dL_dKmm,
                'dL_dKmn': dL_dKmn,
                'dL_dKdiag': dL_dKdiag
            }

            return log_marginal, gradients, posteriors, posteriors_F
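# Sketch of the KL term in the bound above, log_marginal = sum_t VE_t - KL.
# This is the generic KL divergence between q(u) = N(m, S) and the prior
# p(u) = N(0, Kuu); the code above uses a slightly different parameterisation
# (via Su_add_Kuu), so this is illustrative only, not calculate_KL itself.
import numpy as np
from scipy.linalg import cho_factor, cho_solve

def gauss_kl(m, S, K):
    M = K.shape[0]
    cK = cho_factor(K, lower=True)
    cS = cho_factor(S, lower=True)
    trace_term = np.trace(cho_solve(cK, S))              # tr(K^{-1} S)
    maha = float(m.T @ cho_solve(cK, m))                 # m^T K^{-1} m
    logdet_K = 2.0 * np.sum(np.log(np.diag(cK[0])))
    logdet_S = 2.0 * np.sum(np.log(np.diag(cS[0])))
    return 0.5 * (trace_term + maha - M + logdet_K - logdet_S)

rng = np.random.default_rng(0)
M = 6
A = rng.normal(size=(M, M)); K = A @ A.T + M * np.eye(M)
B = rng.normal(size=(M, M)); S = B @ B.T + np.eye(M)
m = rng.normal(size=(M, 1))
print(gauss_kl(m, S, K))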
    def calculate_gradients(self, q_U, S_u, Su_add_Kuu_chol, p_U, q_F, VE_dm,
                            VE_dv, Ntask, M, Q, D, f_index, d_index, q):
        """
        Calculates gradients of the Log-marginal distribution p(Y) wrt variational
        parameters mu_q, S_q
        """
        # Algebra for q(u) and p(u):
        m_u = q_U.mu_u.copy()
        #L_u = choleskies.flat_to_triang(q_U.chols_u.copy())
        #S_u = np.empty((Q, M, M))
        #[np.dot(L_u[q, :, :], L_u[q, :, :].T, S_u[q, :, :]) for q in range(Q)]
        Kuu = p_U.Kuu.copy()
        Luu = p_U.Luu.copy()
        Kuui = p_U.Kuui.copy()
        S_qi, _ = linalg.dpotri(np.asfortranarray(Su_add_Kuu_chol[q, :, :]))

        if np.any(np.isinf(S_qi)):
            raise ValueError("Sqi: Cholesky representation unstable")

        # KL Terms
        dKL_dmu_q = []
        dKL_dKqq = 0
        for d in range(D):
            dKL_dmu_q.append(np.dot(Kuui[q, :, :], m_u[d][:, q, None]))  #same
            dKL_dKqq += -0.5 * S_qi + 0.5 * Kuui[q, :, :] - 0.5 * Kuui[q, :, :].dot(S_u[q, :, :]).dot(Kuui[q, :, :]) \
                       - 0.5 * np.dot(Kuui[q, :, :], np.dot(m_u[d][:, q, None], m_u[d][:, q, None].T)).dot(Kuui[q, :, :].T)  # same
        #dKL_dS_q = 0.5 * (Kuui[q,:,:] - S_qi)             #old
        dKL_dS_q = 0.5 * (Kuui[q, :, :] - S_qi) * D

        # VE Terms
        #dVE_dmu_q = np.zeros((M, 1))
        dVE_dmu_q = []
        dVE_dS_q = np.zeros((M, M))
        dVE_dKqq = np.zeros((M, M))
        dVE_dKqd = []
        dVE_dKdiag = []
        dL_dmu_q = []

        for d, q_fd in enumerate(q_F):
            Nt = Ntask[f_index[d]]
            dVE_dmu_q.append(
                np.dot(q_fd.Afdu[q, :, :].T,
                       VE_dm[f_index[d]][:, d_index[d]])[:, None])
            dL_dmu_q.append(dVE_dmu_q[d] - dKL_dmu_q[d])
            Adv = q_fd.Afdu[q, :, :].T * VE_dv[f_index[d]][:, d_index[d],
                                                           None].T
            Adv = np.ascontiguousarray(Adv)
            AdvA = np.dot(Adv.reshape(-1, Nt),
                          q_fd.Afdu[q, :, :]).reshape(M, M)
            dVE_dS_q += AdvA

            # Derivatives dKuquq
            tmp_dv = np.dot(AdvA, S_u[q, :, :]).dot(Kuui[q, :, :])
            dVE_dKqq += -tmp_dv - tmp_dv.T  #+ AdvA last term not included in the derivative
            Adm = np.dot(q_fd.Afdu[q, :, :].T, VE_dm[f_index[d]][:, d_index[d],
                                                                 None])
            dVE_dKqq += -np.dot(Adm,
                                np.dot(Kuui[q, :, :], m_u[d][:, q, None]).T)

            # Derivatives dKuqfd
            tmp = np.dot(S_u[q, :, :], Kuui[q, :, :])
            tmp = 2. * tmp  #2. * (tmp - np.eye(M))  # the term -2Adv not included
            dve_kqd = np.dot(np.dot(Kuui[q, :, :], m_u[d][:, q, None]),
                             VE_dm[f_index[d]][:, d_index[d], None].T)
            dve_kqd += np.dot(tmp.T, Adv)
            dVE_dKqd.append(dve_kqd)

            # Derivatives dKdiag
            dVE_dKdiag.append(VE_dv[f_index[d]][:, d_index[d]])

        dVE_dKqq = 0.5 * (dVE_dKqq + dVE_dKqq.T)
        # Sum of VE and KL terms
        #dL_dmu_q = dVE_dmu_q - dKL_dmu_q
        dL_dS_q = dVE_dS_q - dKL_dS_q
        dL_dKqq = dVE_dKqq - dKL_dKqq
        dL_dKdq = dVE_dKqd
        dL_dKdiag = dVE_dKdiag

        # Pass S_q gradients to its low-triangular representation L_q
        chol_u = q_U.chols_u.copy()
        L_q = choleskies.flat_to_triang(chol_u[:, q:q + 1])
        dL_dL_q = 2. * np.array(
            [np.dot(a, b) for a, b in zip(dL_dS_q[None, :, :], L_q)])
        dL_dL_q = choleskies.triang_to_flat(dL_dL_q)

        # Posterior
        posterior_q = []
        for d in range(D):
            posterior_q.append(
                Posterior(mean=m_u[d][:, q, None],
                          cov=S_u[q, :, :] + Kuu[q, :, :],
                          K=Kuu[q, :, :],
                          prior_mean=np.zeros(m_u[d][:, q, None].shape)))

        return dL_dmu_q, dL_dL_q, dL_dS_q, posterior_q, dL_dKqq, dL_dKdq, dL_dKdiag
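# Sketch of the chain rule used above to map dL/dS onto the Cholesky factor L
# of S = L L^T: dL_dL = 2 * (dL_dS) @ L when dL_dS is symmetric. Verified here
# by finite differences on the toy scalar function F(S) = tr(A S); everything
# below is illustrative and independent of the method above.
import numpy as np

rng = np.random.default_rng(0)
M = 4
A = rng.normal(size=(M, M)); A = 0.5 * (A + A.T)        # symmetric dF/dS
L = np.tril(rng.normal(size=(M, M))) + M * np.eye(M)

def F(L):
    return np.trace(A @ (L @ L.T))

analytic = 2.0 * A @ L
eps = 1e-6
numeric = np.zeros_like(L)
for i in range(M):
    for j in range(i + 1):                               # only the lower triangle varies
        E = np.zeros((M, M)); E[i, j] = eps
        numeric[i, j] = (F(L + E) - F(L - E)) / (2 * eps)
print(np.allclose(np.tril(analytic), numeric, atol=1e-5))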
Example #12
    def calculate_gradients(self, q_U, p_U, q_F, VE_dm, VE_dv, Ntask, M, Q, D,
                            f_index, d_index, j):
        """
        Calculates gradients of the Log-marginal distribution p(Y) wrt variational
        parameters mu_q, S_q
        """
        # Algebra for q(u) and p(u):
        m_u = q_U.mu_u.copy()
        L_u = choleskies.flat_to_triang(q_U.chols_u.copy())
        #S_u = np.empty((Q, M, M))
        S_u = np.dot(
            L_u[j, :, :], L_u[j, :, :].T
        )  # This could be computed outside and passed in to reduce computation
        #[np.dot(L_u[q, :, :], L_u[q, :, :].T, S_u[q, :, :]) for q in range(Q)]
        Kuu = p_U.Kuu.copy()
        Luu = p_U.Luu.copy()
        Kuui = p_U.Kuui.copy()
        S_qi, _ = linalg.dpotri(np.asfortranarray(L_u[j, :, :]))

        if np.any(np.isinf(S_qi)):
            raise ValueError("Sqi: Cholesky representation unstable")

        # KL Terms
        dKL_dmu_j = np.dot(Kuui[j, :, :], m_u[:, j, None])
        dKL_dS_j = 0.5 * (Kuui[j, :, :] - S_qi)
        dKL_dKjj = 0.5 * Kuui[j,:,:] - 0.5 * Kuui[j,:,:].dot(S_u).dot(Kuui[j,:,:]) \
                   - 0.5 * np.dot(Kuui[j,:,:],np.dot(m_u[:, j, None],m_u[:, j, None].T)).dot(Kuui[j,:,:].T)

        # VE Terms
        dVE_dmu_j = np.zeros((M, 1))
        dVE_dS_j = np.zeros((M, M))
        dVE_dKjj = np.zeros((M, M))
        dVE_dKjd = []
        dVE_dKdiag = []

        Nt = Ntask[f_index[j]]
        dVE_dmu_j += np.dot(q_F[j].Afdu.T,
                            VE_dm[f_index[j]][:, d_index[j]])[:, None]
        Adv = q_F[j].Afdu.T * VE_dv[f_index[j]][:, d_index[j], None].T
        Adv = np.ascontiguousarray(Adv)
        AdvA = np.dot(Adv.reshape(-1, Nt), q_F[j].Afdu).reshape(M, M)
        dVE_dS_j += AdvA

        # Derivatives dKuquq
        tmp_dv = np.dot(AdvA, S_u).dot(Kuui[j, :, :])
        dVE_dKjj += AdvA - tmp_dv - tmp_dv.T
        Adm = np.dot(q_F[j].Afdu.T, VE_dm[f_index[j]][:, d_index[j], None])
        dVE_dKjj += -np.dot(Adm, np.dot(Kuui[j, :, :], m_u[:, j, None]).T)

        # Derivatives dKuqfd
        tmp = np.dot(S_u, Kuui[j, :, :])
        tmp = 2. * (tmp - np.eye(M))
        dve_kjd = np.dot(np.dot(Kuui[j, :, :], m_u[:, j, None]),
                         VE_dm[f_index[j]][:, d_index[j], None].T)
        dve_kjd += np.dot(tmp.T, Adv)
        dVE_dKjd.append(dve_kjd)

        # Derivatives dKdiag
        dVE_dKdiag.append(VE_dv[f_index[j]][:, d_index[j]])

        dVE_dKjj = 0.5 * (dVE_dKjj + dVE_dKjj.T)
        # Sum of VE and KL terms
        dL_dmu_j = dVE_dmu_j - dKL_dmu_j
        dL_dS_j = dVE_dS_j - dKL_dS_j
        dL_dKjj = dVE_dKjj - dKL_dKjj
        dL_dKdj = dVE_dKjd[0].copy()  # Here we just pass the unique position
        dL_dKdiag = dVE_dKdiag[0].copy()  # Here we just pass the unique position

        # Pass S_q gradients to its low-triangular representation L_q
        chol_u = q_U.chols_u.copy()
        L_j = choleskies.flat_to_triang(chol_u[:, j:j + 1])
        dL_dL_j = 2. * np.array(
            [np.dot(a, b) for a, b in zip(dL_dS_j[None, :, :], L_j)])
        dL_dL_j = choleskies.triang_to_flat(dL_dL_j)

        # Posterior
        posterior_j = Posterior(mean=m_u[:, j, None],
                                cov=S_u,
                                K=Kuu[j, :, :],
                                prior_mean=np.zeros(m_u[:, j, None].shape))

        return dL_dmu_j, dL_dL_j, dL_dS_j, posterior_j, dL_dKjj, dL_dKdj, dL_dKdiag
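# Sketch of the dpotri step above: recovering S^{-1} from a lower-triangular
# Cholesky factor L of S = L L^T, here with SciPy/NumPy stand-ins (illustrative
# only, not linalg.dpotri).
import numpy as np
from scipy.linalg import solve_triangular

rng = np.random.default_rng(0)
M = 5
L = np.tril(rng.normal(size=(M, M))) + M * np.eye(M)
S = L @ L.T

Li = solve_triangular(L, np.eye(M), lower=True)   # L^{-1}
S_inv = Li.T @ Li                                 # S^{-1} = L^{-T} L^{-1}
print(np.allclose(S_inv, np.linalg.inv(S)))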
Example #13
    def inference(self,
                  kern,
                  X,
                  Z,
                  likelihood,
                  Y,
                  Y_metadata=None,
                  Lm=None,
                  dL_dKmm=None):
        """
        The first phase of inference:
        Compute: log-likelihood, dL_dKmm

        Cached intermediate results: Kmm, KmmInv,
        """

        num_data, output_dim = Y.shape
        input_dim = Z.shape[0]

        uncertain_inputs = isinstance(X, VariationalPosterior)
        uncertain_outputs = isinstance(Y, VariationalPosterior)

        beta = 1. / np.fmax(likelihood.variance, 1e-6)

        psi0, psi2, YRY, psi1, psi1Y, Shalf, psi1S = self.gatherPsiStat(
            kern, X, Z, Y, beta, uncertain_inputs)

        #======================================================================
        # Compute Common Components
        #======================================================================

        Kmm = kern.K(Z).copy()
        diag.add(Kmm, self.const_jitter)
        Lm = jitchol(Kmm)

        #LmInv = dtrtri(Lm)
        if uncertain_inputs:
            LmInvPsi2LmInvT = backsub_both_sides(Lm, psi2, 'right')
        else:
            LmInvPsi2LmInvT = tdot(dtrtrs(
                Lm, psi1.T)[0]) / beta  #tdot(psi1.dot(LmInv.T).T) /beta

        Lambda = np.eye(Kmm.shape[0]) + LmInvPsi2LmInvT
        LL = jitchol(Lambda)
        LmLL = Lm.dot(LL)
        #        LLInv = dtrtri(LL)
        #       LmLLInv = LLInv.dot(LmInv)

        logdet_L = 2. * np.sum(np.log(np.diag(LL)))
        b = dtrtrs(LmLL, psi1Y.T)[0].T  #psi1Y.dot(LmLLInv.T)
        bbt = np.square(b).sum()
        v = dtrtrs(LmLL, b.T, trans=1)[0].T  #b.dot(LmLLInv)
        LLinvPsi1TYYTPsi1LLinvT = tdot(b.T)

        if psi1S is not None:
            psi1SLLinv = dtrtrs(LmLL, psi1S.T)[0].T  #psi1S.dot(LmLLInv.T)
            bbt += np.square(psi1SLLinv).sum()
            LLinvPsi1TYYTPsi1LLinvT += tdot(psi1SLLinv.T)
            psi1SP = dtrtrs(LmLL, psi1SLLinv.T,
                            trans=1)[0].T  #psi1SLLinv.dot(LmLLInv)
        tmp = -backsub_both_sides(
            LL, LLinvPsi1TYYTPsi1LLinvT + output_dim * np.eye(input_dim))
        dL_dpsi2R = backsub_both_sides(
            Lm, tmp + output_dim * np.eye(input_dim)) / 2
        #tmp = -LLInv.T.dot(LLinvPsi1TYYTPsi1LLinvT+output_dim*np.eye(input_dim)).dot(LLInv)
        #dL_dpsi2R = LmInv.T.dot(tmp+output_dim*np.eye(input_dim)).dot(LmInv)/2.

        #======================================================================
        # Compute log-likelihood
        #======================================================================
        logL_R = -num_data * np.log(beta)
        logL = -(
            output_dim *
            (num_data * log_2_pi + logL_R + psi0 - np.trace(LmInvPsi2LmInvT)) +
            YRY - bbt) / 2. - output_dim * logdet_L / 2.

        #======================================================================
        # Compute dL_dKmm
        #======================================================================

        dL_dKmm = dL_dpsi2R - output_dim * backsub_both_sides(
            Lm,
            LmInvPsi2LmInvT) / 2  #LmInv.T.dot(LmInvPsi2LmInvT).dot(LmInv)/2.

        #======================================================================
        # Compute the Posterior distribution of inducing points p(u|Y)
        #======================================================================

        wd_inv = backsub_both_sides(
            Lm,
            np.eye(input_dim) -
            backsub_both_sides(LL, np.identity(input_dim), transpose='left'),
            transpose='left')
        post = Posterior(woodbury_inv=wd_inv,
                         woodbury_vector=v.T,
                         K=Kmm,
                         mean=None,
                         cov=None,
                         K_chol=Lm)

        #======================================================================
        # Compute dL_dthetaL for uncertain inputs and non-heteroscedastic noise
        #======================================================================

        dL_dthetaL = (YRY * beta + beta * output_dim * psi0 - num_data *
                      output_dim * beta) / 2. - beta * (dL_dpsi2R * psi2).sum(
                      ) - beta * np.trace(LLinvPsi1TYYTPsi1LLinvT)

        #======================================================================
        # Compute dL_dpsi
        #======================================================================

        dL_dpsi0 = -output_dim * (beta * np.ones((num_data, ))) / 2.

        if uncertain_outputs:
            m, s = Y.mean, Y.variance
            dL_dpsi1 = beta * (np.dot(m, v) + Shalf[:, None] * psi1SP)
        else:
            dL_dpsi1 = beta * np.dot(Y, v)

        if uncertain_inputs:
            dL_dpsi2 = beta * dL_dpsi2R
        else:
            dL_dpsi1 += np.dot(psi1, dL_dpsi2R) * 2.
            dL_dpsi2 = None

        if uncertain_inputs:
            grad_dict = {
                'dL_dKmm': dL_dKmm,
                'dL_dpsi0': dL_dpsi0,
                'dL_dpsi1': dL_dpsi1,
                'dL_dpsi2': dL_dpsi2,
                'dL_dthetaL': dL_dthetaL
            }
        else:
            grad_dict = {
                'dL_dKmm': dL_dKmm,
                'dL_dKdiag': dL_dpsi0,
                'dL_dKnm': dL_dpsi1,
                'dL_dthetaL': dL_dthetaL
            }

        if uncertain_outputs:
            m, s = Y.mean, Y.variance
            psi1LmiLLi = dtrtrs(LmLL, psi1.T)[0].T  #psi1.dot(LmLLInv.T)
            LLiLmipsi1Y = b.T
            grad_dict['dL_dYmean'] = -m * beta + psi1LmiLLi.dot(LLiLmipsi1Y)
            grad_dict['dL_dYvar'] = beta / -2. + np.square(psi1LmiLLi).sum(
                axis=1) / 2

        return post, logL, grad_dict
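# Minimal analogue of the jitchol pattern used above: retry the Cholesky with
# an increasing diagonal jitter until it succeeds. Illustrative sketch only,
# not the library's jitchol.
import numpy as np

def jitter_chol(A, max_tries=5):
    jitter = 1e-8 * float(np.mean(np.diag(A)))
    for _ in range(max_tries):
        try:
            return np.linalg.cholesky(A + jitter * np.eye(A.shape[0]))
        except np.linalg.LinAlgError:
            jitter *= 10.0
    raise np.linalg.LinAlgError("not positive definite, even with jitter")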
Example #14
    def inference(self,
                  kern,
                  X,
                  Z,
                  likelihood,
                  Y,
                  qU_mean,
                  qU_var,
                  Kuu_sigma=None):
        """
        The SVI-VarDTC inference
        """

        N, D, M, Q = Y.shape[0], Y.shape[1], Z.shape[0], Z.shape[1]

        uncertain_inputs = isinstance(X, VariationalPosterior)
        uncertain_outputs = isinstance(Y, VariationalPosterior)

        beta = 1. / likelihood.variance

        psi0, psi2, YRY, psi1, psi1Y = self.gatherPsiStat(
            kern, X, Z, Y, beta, uncertain_inputs)

        #======================================================================
        # Compute Common Components
        #======================================================================

        Kuu = kern.K(Z).copy()
        if Kuu_sigma is not None:
            diag.add(Kuu, Kuu_sigma)
        else:
            diag.add(Kuu, self.const_jitter)
        Lm = jitchol(Kuu)

        mu, S = qU_mean, qU_var
        Ls = jitchol(S)
        LinvLs = dtrtrs(Lm, Ls)[0]
        Linvmu = dtrtrs(Lm, mu)[0]
        psi1YLinvT = dtrtrs(Lm, psi1Y.T)[0].T

        self.mid = {'qU_L': Ls, 'LinvLu': LinvLs, 'L': Lm, 'Linvmu': Linvmu}

        if uncertain_inputs:
            LmInvPsi2LmInvT = backsub_both_sides(Lm, psi2, 'right')
        else:
            LmInvPsi2LmInvT = tdot(dtrtrs(Lm, psi1.T)[0]) / beta

        LmInvSmuLmInvT = tdot(LinvLs) * D + tdot(Linvmu)

        #         logdet_L = np.sum(np.log(np.diag(Lm)))
        #         logdet_S = np.sum(np.log(np.diag(Ls)))

        #======================================================================
        # Compute log-likelihood
        #======================================================================

        logL_R = -N * np.log(beta)
        logL = -N*D*log_2_pi/2. -D*logL_R/2. - D*psi0/2. - YRY/2.  \
                     -(LmInvSmuLmInvT*LmInvPsi2LmInvT).sum()/2. + np.trace(LmInvPsi2LmInvT)*D/2.+(Linvmu*psi1YLinvT.T).sum()

        #======================================================================
        # Compute dL_dKmm
        #======================================================================

        tmp1 = backsub_both_sides(Lm, LmInvSmuLmInvT.dot(LmInvPsi2LmInvT),
                                  'left')
        tmp2 = Linvmu.dot(psi1YLinvT)
        tmp3 = backsub_both_sides(Lm, -D * LmInvPsi2LmInvT - tmp2 - tmp2.T,
                                  'left') / 2.

        dL_dKmm = (tmp1 + tmp1.T) / 2. + tmp3

        #======================================================================
        # Compute dL_dthetaL for uncertain inputs and non-heteroscedastic noise
        #======================================================================

        dL_dthetaL = -D * N * beta / 2. - (
            -D * psi0 / 2. - YRY / 2. -
            (LmInvSmuLmInvT * LmInvPsi2LmInvT).sum() / 2. +
            np.trace(LmInvPsi2LmInvT) * D / 2. +
            (Linvmu * psi1YLinvT.T).sum()) * beta

        #======================================================================
        # Compute dL_dqU
        #======================================================================

        tmp1 = backsub_both_sides(Lm, -LmInvPsi2LmInvT, 'left')
        dL_dqU_mean = tmp1.dot(mu) + dtrtrs(Lm, psi1YLinvT.T, trans=1)[0]
        dL_dqU_var = D / 2. * tmp1

        #======================================================================
        # Compute the Posterior distribution of inducing points p(u|Y)
        #======================================================================

        KuuInvmu = dtrtrs(Lm, Linvmu, trans=1)[0]
        tmp = backsub_both_sides(Lm, np.eye(M) - tdot(LinvLs), 'left')

        post = Posterior(woodbury_inv=tmp,
                         woodbury_vector=KuuInvmu,
                         K=Kuu,
                         mean=mu,
                         cov=S,
                         K_chol=Lm)

        #======================================================================
        # Compute dL_dpsi
        #======================================================================

        dL_dpsi0 = -D * (beta * np.ones((N, ))) / 2.

        if uncertain_outputs:
            dL_dpsi1 = Y.mean.dot(dtrtrs(Lm, Linvmu, trans=1)[0].T) * beta
        else:
            dL_dpsi1 = Y.dot(dtrtrs(Lm, Linvmu, trans=1)[0].T) * beta

        dL_dpsi2 = beta * backsub_both_sides(Lm,
                                             D * np.eye(M) - LmInvSmuLmInvT,
                                             'left') / 2.
        if not uncertain_inputs:
            dL_dpsi1 += psi1.dot(dL_dpsi2 + dL_dpsi2.T) / beta
            dL_dpsi2 = None

        if uncertain_inputs:
            grad_dict = {
                'dL_dKmm': dL_dKmm,
                'dL_dpsi0': dL_dpsi0,
                'dL_dpsi1': dL_dpsi1,
                'dL_dpsi2': dL_dpsi2,
                'dL_dthetaL': dL_dthetaL,
                'dL_dqU_mean': dL_dqU_mean,
                'dL_dqU_var': dL_dqU_var
            }
        else:
            grad_dict = {
                'dL_dKmm': dL_dKmm,
                'dL_dKdiag': dL_dpsi0,
                'dL_dKnm': dL_dpsi1,
                'dL_dthetaL': dL_dthetaL,
                'dL_dqU_mean': dL_dqU_mean,
                'dL_dqU_var': dL_dqU_var
            }

        if uncertain_outputs:
            m, s = Y.mean, Y.variance
            grad_dict['dL_dYmean'] = -m * beta + dtrtrs(Lm, psi1.T)[0].T.dot(
                dtrtrs(Lm, mu)[0])
            grad_dict['dL_dYvar'] = beta / -2.

        return post, logL, grad_dict
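# Sketch of the pair of triangular solves used above to form Kuu^{-1} mu from
# the Cholesky factor Lm (Kuu = Lm Lm^T), with SciPy stand-ins for dtrtrs.
# Values below are illustrative only.
import numpy as np
from scipy.linalg import solve_triangular

rng = np.random.default_rng(0)
M = 6
A = rng.normal(size=(M, M))
Kuu = A @ A.T + M * np.eye(M)
mu = rng.normal(size=(M, 1))

Lm = np.linalg.cholesky(Kuu)
Linvmu = solve_triangular(Lm, mu, lower=True)                    # Lm^{-1} mu
KuuInvmu = solve_triangular(Lm, Linvmu, lower=True, trans='T')   # Lm^{-T} Lm^{-1} mu
print(np.allclose(KuuInvmu, np.linalg.solve(Kuu, mu)))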
Example #15
    def inference(self, kern, X, Z, likelihood, Y, Y_metadata=None, Lm=None, dL_dKmm=None, fixed_covs_kerns=None, **kw):

        _, output_dim = Y.shape
        uncertain_inputs = isinstance(X, VariationalPosterior)

        #see whether we've got a different noise variance for each datum
        beta = 1./np.fmax(likelihood.gaussian_variance(Y_metadata), 1e-6)
        # VVT_factor is a matrix such that tdot(VVT_factor) = VVT...this is for efficiency!
        #self.YYTfactor = self.get_YYTfactor(Y)
        #VVT_factor = self.get_VVTfactor(self.YYTfactor, beta)
        het_noise = beta.size > 1

        if het_noise:
            raise NotImplementedError("Heteroscedastic noise not implemented, should be possible though, feel free to try implementing it :)")

        if beta.ndim == 1:
            beta = beta[:, None]


        # do the inference:
        num_inducing = Z.shape[0]
        num_data = Y.shape[0]
        # kernel computations, using BGPLVM notation

        Kmm = kern.K(Z).copy()
        diag.add(Kmm, self.const_jitter)
        if Lm is None:
            Lm = jitchol(Kmm)

        # The rather complex computations of A, and the psi stats
        if uncertain_inputs:
            psi0 = kern.psi0(Z, X)
            psi1 = kern.psi1(Z, X)
            if het_noise:
                psi2_beta = np.sum([kern.psi2(Z,X[i:i+1,:]) * beta_i for i,beta_i in enumerate(beta)],0)
            else:
                psi2_beta = kern.psi2(Z,X) * beta
            LmInv = dtrtri(Lm)
            A = LmInv.dot(psi2_beta.dot(LmInv.T))
        else:
            psi0 = kern.Kdiag(X)
            psi1 = kern.K(X, Z)
            tmp = psi1 * np.sqrt(beta)  # same expression either way; het_noise is not implemented above
            tmp, _ = dtrtrs(Lm, tmp.T, lower=1)
            A = tdot(tmp)

        # factor B
        B = np.eye(num_inducing) + A
        LB = jitchol(B)
        # back substitute C into psi1Vf
        #tmp, _ = dtrtrs(Lm, psi1.T.dot(VVT_factor), lower=1, trans=0)
        #_LBi_Lmi_psi1Vf, _ = dtrtrs(LB, tmp, lower=1, trans=0)
        #tmp, _ = dtrtrs(LB, _LBi_Lmi_psi1Vf, lower=1, trans=1)
        #Cpsi1Vf, _ = dtrtrs(Lm, tmp, lower=1, trans=1)

        # data fit and derivative of L w.r.t. Kmm
        #delit = tdot(_LBi_Lmi_psi1Vf)

        # Expose YYT to get additional covariates in (YYT + Kgg):
        tmp, _ = dtrtrs(Lm, psi1.T, lower=1, trans=0)
        _LBi_Lmi_psi1, _ = dtrtrs(LB, tmp, lower=1, trans=0)
        tmp, _ = dtrtrs(LB, _LBi_Lmi_psi1, lower=1, trans=1)
        Cpsi1, _ = dtrtrs(Lm, tmp, lower=1, trans=1)

        # TODO: cache this:
        # Compute fixed covariates covariance:
        if fixed_covs_kerns is not None:
            K_fixed = 0
            for name, [cov, k] in fixed_covs_kerns.items():
                K_fixed += k.K(cov)

            #trYYT = self.get_trYYT(Y)
            YYT_covs = (tdot(Y) + K_fixed)
            data_term = beta**2 * YYT_covs
            trYYT_covs = np.trace(YYT_covs)
        else:
            data_term = beta**2 * tdot(Y)
            trYYT_covs = self.get_trYYT(Y)

        #trYYT = self.get_trYYT(Y)
        delit = mdot(_LBi_Lmi_psi1, data_term, _LBi_Lmi_psi1.T)
        data_fit = np.trace(delit)

        DBi_plus_BiPBi = backsub_both_sides(LB, output_dim * np.eye(num_inducing) + delit)
        if dL_dKmm is None:
            delit = -0.5 * DBi_plus_BiPBi
            delit += -0.5 * B * output_dim
            delit += output_dim * np.eye(num_inducing)
            # Compute dL_dKmm
            dL_dKmm = backsub_both_sides(Lm, delit)

        # derivatives of L w.r.t. psi
        dL_dpsi0, dL_dpsi1, dL_dpsi2 = _compute_dL_dpsi(num_inducing, num_data, output_dim, beta, Lm,
            data_term, Cpsi1, DBi_plus_BiPBi,
            psi1, het_noise, uncertain_inputs)

        # log marginal likelihood
        log_marginal = _compute_log_marginal_likelihood(likelihood, num_data, output_dim, beta, het_noise,
            psi0, A, LB, trYYT_covs, data_fit, Y)

        if self.save_per_dim:
            self.saved_vals = [psi0, A, LB, _LBi_Lmi_psi1, beta]

        # No heteroscedastics, so no _LBi_Lmi_psi1Vf:
        # For the interested reader, try implementing the heteroscedastic version, it should be possible
        _LBi_Lmi_psi1Vf = None # Is just here for documentation, so you can see, what it was.

        #noise derivatives
        dL_dR = _compute_dL_dR(likelihood,
            het_noise, uncertain_inputs, LB,
            _LBi_Lmi_psi1Vf, DBi_plus_BiPBi, Lm, A,
            psi0, psi1, beta,
            data_fit, num_data, output_dim, trYYT_covs, Y, None)

        dL_dthetaL = likelihood.exact_inference_gradients(dL_dR,Y_metadata)

        #put the gradients in the right places
        if uncertain_inputs:
            grad_dict = {'dL_dKmm': dL_dKmm,
                         'dL_dpsi0':dL_dpsi0,
                         'dL_dpsi1':dL_dpsi1,
                         'dL_dpsi2':dL_dpsi2,
                         'dL_dthetaL':dL_dthetaL}
        else:
            grad_dict = {'dL_dKmm': dL_dKmm,
                         'dL_dKdiag':dL_dpsi0,
                         'dL_dKnm':dL_dpsi1,
                         'dL_dthetaL':dL_dthetaL}

        if fixed_covs_kerns is not None:
            # For now, we do not take the gradients, we can compute them,
            # but the maximum likelihood solution is to switch off the additional covariates....
            dL_dcovs = beta * np.eye(K_fixed.shape[0]) - beta**2*tdot(_LBi_Lmi_psi1.T)
            grad_dict['dL_dcovs'] = -.5 * dL_dcovs

        #get sufficient things for posterior prediction
        #TODO: do we really want to do this in  the loop?
        woodbury_vector = (beta*Cpsi1).dot(Y)
        # Equivalent computation via triangular solves, kept for reference:
        #psi1V = np.dot(Y.T*beta, psi1).T
        #tmp, _ = dtrtrs(Lm, psi1V, lower=1, trans=0)
        #tmp, _ = dpotrs(LB, tmp, lower=1)
        #woodbury_vector, _ = dtrtrs(Lm, tmp, lower=1, trans=1)
        Bi, _ = dpotri(LB, lower=1)
        symmetrify(Bi)
        Bi = -dpotri(LB, lower=1)[0]
        diag.add(Bi, 1)

        woodbury_inv = backsub_both_sides(Lm, Bi)

        #construct a posterior object
        post = Posterior(woodbury_inv=woodbury_inv, woodbury_vector=woodbury_vector, K=Kmm, mean=None, cov=None, K_chol=Lm)
        return post, log_marginal, grad_dict
Example #16
    def inference(self, q_u_means, q_u_chols, kern, X, Z, likelihood, Y, mean_functions, Y_metadata=None, KL_scale=1.0, batch_scale=1.0):
        num_inducing = Z.shape[0]
        num_data, num_outputs = Y.shape
        num_latent_funcs = likelihood.request_num_latent_functions(Y)

        #For each latent function, calculate some required values
        latent_function_details = []
        for latent_ind in range(num_latent_funcs):
            q_u_meanj = q_u_means[:, latent_ind*num_outputs:(latent_ind+1)*num_outputs]
            q_u_cholj = q_u_chols[:, latent_ind*num_outputs:(latent_ind+1)*num_outputs]
            kernj = kern[latent_ind]
            mean_functionj = mean_functions[latent_ind]
            latent_detail = self.calculate_mu_var(X, Y, Z, q_u_meanj, q_u_cholj, kernj, mean_functionj, num_inducing, num_data, num_outputs)
            latent_function_details.append(latent_detail)

        mu = np.hstack([l.mu for l in latent_function_details])
        v = np.hstack([l.v for l in latent_function_details])
        #mu = [l.mu for l in latent_function_details]
        #v = [l.v for l in latent_function_details]

        #Hack shouldn't be necessary
        #Y = np.hstack([Y]*num_latent_funcs)

        #quadrature for the likelihood
        F, dF_dmu, dF_dv, dF_dthetaL = likelihood.variational_expectations(Y, mu, v, Y_metadata=Y_metadata)

        #for latent_ind in range(num_latent_functions):
            #l.dF_dmu = dF_dmu[:, latent_ind][:, None]
            #l.dF_dv = dF_dv[:, latent_ind][:, None]

        #rescale the F term if working on a batch
        F, dF_dmu, dF_dv =  F*batch_scale, dF_dmu*batch_scale, dF_dv*batch_scale
        if dF_dthetaL is not None:
            dF_dthetaL =  dF_dthetaL.sum(1).sum(1)*batch_scale

        #sum (gradients of) expected likelihood and KL part
        log_marginal = F.sum()

        dL_dKmm = []
        dL_dKmn = []
        dL_dKdiag = []
        dL_dm = []
        dL_dchol = []
        dL_dmfZ = []
        dL_dmfX = []
        posteriors = []
        #For each latent function (and thus for each kernel the latent function uses)
        #calculate the gradients and generate a posterior
        for latent_ind in range(num_latent_funcs):
            l = latent_function_details[latent_ind]
            #q_u_meanj = q_u_means[:, latent_ind*num_outputs:(latent_ind+1)*num_outputs]
            #q_u_cholj = q_u_chols[:, latent_ind*num_outputs:(latent_ind+1)*num_outputs]
            dF_dmui = dF_dmu[:, latent_ind*num_outputs:(latent_ind+1)*num_outputs]
            dF_dvi = dF_dv[:, latent_ind*num_outputs:(latent_ind+1)*num_outputs]
            (log_marginal, dL_dKmmi, dL_dKmni, dL_dKdiagi,
            dL_dmi, dL_dcholi, dL_dmfZi, dL_dmfXi) = self.calculate_gradients(log_marginal, l, dF_dmui, dF_dvi,
                                                                              num_inducing, num_outputs, num_data)
            posterior = Posterior(mean=l.q_u_mean, cov=l.S.T, K=l.Kmm, prior_mean=l.prior_mean_u)

            dL_dKmm.append(dL_dKmmi)
            dL_dKmn.append(dL_dKmni)
            dL_dKdiag.append(dL_dKdiagi)
            dL_dm.append(dL_dmi)
            dL_dchol.append(dL_dcholi)
            dL_dmfZ.append(dL_dmfZi)
            dL_dmfX.append(dL_dmfXi)
            posteriors.append(posterior)

        grad_dict = {'dL_dKmm':dL_dKmm, 'dL_dKmn':dL_dKmn, 'dL_dKdiag': dL_dKdiag, 'dL_dm':dL_dm, 'dL_dchol':dL_dchol, 'dL_dthetaL':dF_dthetaL}

        # If not all of the mean functions are None, fill out the other functions' gradients with zeros
        if not all(mean_function is None for mean_function in mean_functions):
            for mean_function in mean_functions:
                if mean_function is None:
                    grad_dict['dL_dmfZ'] = np.zeros(Z.shape)
                    grad_dict['dL_dmfX'] = np.zeros(X.shape)
                else:
                    grad_dict['dL_dmfZ'] = dL_dmfZ
                    grad_dict['dL_dmfX'] = dL_dmfX

        return posteriors, log_marginal, grad_dict
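# Sketch of the quadrature step referenced above ("quadrature for the
# likelihood"): a Gauss-Hermite approximation of E_{N(f; mu, v)}[log p(y | f)]
# for a single site and a probit Bernoulli likelihood. This is an illustrative
# stand-in for likelihood.variational_expectations, not its implementation.
import numpy as np
from scipy.stats import norm

def gh_variational_expectation(y, mu, v, degree=20):
    x, w = np.polynomial.hermite_e.hermegauss(degree)    # weight exp(-x^2/2)
    f = mu + np.sqrt(v) * x                              # f ~ N(mu, v)
    log_lik = norm.logcdf(y * f)                         # probit, y in {-1, +1}
    return np.sum(w * log_lik) / np.sqrt(2 * np.pi)

print(gh_variational_expectation(y=1.0, mu=0.3, v=0.5))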
Example #17
    def inference(self,
                  kern,
                  X,
                  Z,
                  likelihood,
                  Y,
                  mean_function=None,
                  Y_metadata=None):
        assert mean_function is None, "inference with a mean function not implemented"

        num_inducing, _ = Z.shape
        num_data, output_dim = Y.shape

        #make sure the noise is not heteroscedastic
        sigma_n = likelihood.gaussian_variance(Y_metadata)
        if sigma_n.size > 1:
            raise NotImplementedError(
                "no hetero noise with this implementation of PEP")

        Kmm = kern.K(Z)
        Knn = kern.Kdiag(X)
        Knm = kern.K(X, Z)
        U = Knm

        #factor Kmm
        diag.add(Kmm, self.const_jitter)
        Kmmi, L, Li, _ = pdinv(Kmm)

        #compute beta_star, the effective noise precision
        LiUT = np.dot(Li, U.T)
        sigma_star = sigma_n + self.alpha * (Knn - np.sum(np.square(LiUT), 0))
        beta_star = 1. / sigma_star

        # Compute and factor A
        A = tdot(LiUT * np.sqrt(beta_star)) + np.eye(num_inducing)
        LA = jitchol(A)

        # back substitute to get b, P, v
        URiy = np.dot(U.T * beta_star, Y)
        tmp, _ = dtrtrs(L, URiy, lower=1)
        b, _ = dtrtrs(LA, tmp, lower=1)
        tmp, _ = dtrtrs(LA, b, lower=1, trans=1)
        v, _ = dtrtrs(L, tmp, lower=1, trans=1)
        tmp, _ = dtrtrs(LA, Li, lower=1, trans=0)
        P = tdot(tmp.T)

        alpha_const_term = (1.0 - self.alpha) / self.alpha

        #compute log marginal
        log_marginal = -0.5*num_data*output_dim*np.log(2*np.pi) + \
                       -np.sum(np.log(np.diag(LA)))*output_dim + \
                       0.5*output_dim*(1+alpha_const_term)*np.sum(np.log(beta_star)) + \
                       -0.5*np.sum(np.square(Y.T*np.sqrt(beta_star))) + \
                       0.5*np.sum(np.square(b)) + 0.5*alpha_const_term*num_data*np.log(sigma_n)
        #compute dL_dR
        Uv = np.dot(U, v)
        dL_dR = 0.5*(np.sum(U*np.dot(U,P), 1) - (1.0+alpha_const_term)/beta_star + np.sum(np.square(Y), 1) - 2.*np.sum(Uv*Y, 1) \
            + np.sum(np.square(Uv), 1))*beta_star**2

        # Compute dL_dKmm
        vvT_P = tdot(v.reshape(-1, 1)) + P
        dL_dK = 0.5 * (Kmmi - vvT_P)
        KiU = np.dot(Kmmi, U.T)
        dL_dK += self.alpha * np.dot(KiU * dL_dR, KiU.T)

        # Compute dL_dU
        vY = np.dot(v.reshape(-1, 1), Y.T)
        dL_dU = vY - np.dot(vvT_P, U.T)
        dL_dU *= beta_star
        dL_dU -= self.alpha * 2. * KiU * dL_dR

        dL_dthetaL = likelihood.exact_inference_gradients(dL_dR)
        dL_dthetaL += 0.5 * alpha_const_term * num_data / sigma_n
        grad_dict = {
            'dL_dKmm': dL_dK,
            'dL_dKdiag': dL_dR * self.alpha,
            'dL_dKnm': dL_dU.T,
            'dL_dthetaL': dL_dthetaL
        }

        #construct a posterior object
        post = Posterior(woodbury_inv=Kmmi - P,
                         woodbury_vector=v,
                         K=Kmm,
                         mean=None,
                         cov=None,
                         K_chol=L)

        return post, log_marginal, grad_dict
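The beta_star computed above is the PEP effective noise precision, 1 / (sigma_n + alpha * (Knn - Qnn)), where Qnn is the diagonal of the Nystrom approximation Knm Kmm^{-1} Kmn. Below is a minimal numpy-only sketch with a toy squared-exponential kernel (the kern, diag and pdinv helpers from the example are not used), showing how alpha = 1 gives a FITC-style heteroscedastic noise while alpha -> 0 collapses it to the plain sigma_n.

import numpy as np

def rbf(A, B, ell=1.0, sf2=1.0):
    # toy squared-exponential kernel, standing in for kern.K in the example
    d2 = ((A[:, None, :] - B[None, :, :]) ** 2).sum(-1)
    return sf2 * np.exp(-0.5 * d2 / ell ** 2)

rng = np.random.default_rng(0)
X = rng.standard_normal((50, 1))
Z = rng.standard_normal((7, 1))
sigma_n = 0.1

Kmm = rbf(Z, Z) + 1e-8 * np.eye(7)
Knm = rbf(X, Z)
Knn = np.diag(rbf(X, X))
Qnn = np.einsum('nm,mn->n', Knm, np.linalg.solve(Kmm, Knm.T))   # diag(Knm Kmm^-1 Kmn)

for alpha in (1.0, 0.5, 1e-6):
    sigma_star = sigma_n + alpha * (Knn - Qnn)
    print(alpha, sigma_star.min(), sigma_star.max())
# alpha = 1: FITC-style input-dependent noise; alpha -> 0: sigma_star collapses to sigma_n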
Exemple #18
0
    def calculate_gradients(self, q_U, p_U_new, p_U_old, p_U_var, q_F, VE_dm, VE_dv, Ntask, M, Q, D, f_index, d_index,q):
        """
        Calculates gradients of the Log-marginal distribution p(Y) wrt variational
        parameters mu_q, S_q
        """
        # Algebra for q(u):
        m_u = q_U.mu_u.copy()
        L_u = choleskies.flat_to_triang(q_U.chols_u.copy())
        S_u = np.empty((Q, M, M))
        # S_u[q] = L_u[q] L_u[q]^T for each latent function (np.dot writes into S_u in place)
        [np.dot(L_u[q, :, :], L_u[q, :, :].T, S_u[q, :, :]) for q in range(Q)]

        S_qi, _ = linalg.dpotri(np.asfortranarray(L_u[q, :, :]))
        if np.any(np.isinf(S_qi)):
            raise ValueError("Sqi: Cholesky representation unstable")

        # Algebra for p(u)
        Kuu_new = p_U_new.Kuu.copy()
        Luu_new = p_U_new.Luu.copy()
        Kuui_new = p_U_new.Kuui.copy()

        Kuu_old = p_U_old.Kuu.copy()
        Luu_old = p_U_old.Luu.copy()
        Kuui_old = p_U_old.Kuui.copy()

        Mu_var = p_U_var.Mu.copy()
        Kuu_var = p_U_var.Kuu.copy()
        Luu_var = p_U_var.Luu.copy()
        Kuui_var = p_U_var.Kuui.copy()


        # KL Terms
        dKLnew_dmu_q = np.dot(Kuui_new[q,:,:], m_u[:, q, None])
        dKLnew_dS_q = 0.5 * (Kuui_new[q,:,:] - S_qi)

        dKLold_dmu_q = np.dot(Kuui_old[q,:,:], m_u[:, q, None])
        dKLold_dS_q = 0.5 * (Kuui_old[q,:,:] - S_qi)

        dKLvar_dmu_q = np.dot(Kuui_var[q,:,:], (m_u[:, q, None] - Mu_var[q, :, :])) # important!! (Eq. 69 MCB)
        dKLvar_dS_q = 0.5 * (Kuui_var[q,:,:] - S_qi)

        dKLnew_dKqq = 0.5 * Kuui_new[q,:,:] - 0.5 * Kuui_new[q,:,:].dot(S_u[q, :, :]).dot(Kuui_new[q,:,:]) \
                   - 0.5 * np.dot(Kuui_new[q,:,:],np.dot(m_u[:, q, None],m_u[:, q, None].T)).dot(Kuui_new[q,:,:].T)

        dKLold_dKqq = 0.5 * Kuui_old[q,:,:] - 0.5 * Kuui_old[q,:,:].dot(S_u[q, :, :]).dot(Kuui_old[q,:,:]) \
                   - 0.5 * np.dot(Kuui_old[q,:,:],np.dot(m_u[:, q, None],m_u[:, q, None].T)).dot(Kuui_old[q,:,:].T)

        #dKLvar_dKqq = 0.5 * Kuui_var[q,:,:] - 0.5 * Kuui_var[q,:,:].dot(S_u[q, :, :]).dot(Kuui_var[q,:,:]) \
        #           - 0.5 * np.dot(Kuui_var[q,:,:],np.dot(m_u[:, q, None],m_u[:, q, None].T)).dot(Kuui_var[q,:,:].T) \
        #            + 0.5 * np.dot(Kuui_var[q,:,:], np.dot(m_u[:,q,None], Mu_var[q,:,:].T)).dot(Kuui_var[q,:,:].T) \
        #            + 0.5 * np.dot(Kuui_var[q,:,:], np.dot(Mu_var[q,:,:], m_u[:,q,None].T)).dot(Kuui_var[q,:,:].T) \
        #              - 0.5 * np.dot(Kuui_var[q,:,:],np.dot(Mu_var[q,:,:], Mu_var[q,:,:].T)).dot(Kuui_var[q,:,:].T)


        #KLvar += 0.5 * np.sum(Kuui_var[q, :, :] * S_u[q, :, :]) \
        #             + 0.5 * np.dot((Mu_var[q, :, :] - m_u[:, q, None]).T, np.dot(Kuui_var[q, :, :], (Mu_var[q, :, :] - m_u[:, q, None]))) \
        #             - 0.5 * M \
        #             + 0.5 * 2. * np.sum(np.log(np.abs(np.diag(Luu_var[q, :, :])))) \
        #             - 0.5 * 2. * np.sum(np.log(np.abs(np.diag(L_u[q, :, :]))))

        #

        # VE Terms
        dVE_dmu_q = np.zeros((M, 1))
        dVE_dS_q = np.zeros((M, M))
        dVE_dKqq = np.zeros((M, M))
        dVE_dKqd = []
        dVE_dKdiag = []

        for d, q_fd in enumerate(q_F):
            Nt = Ntask[f_index[d]]
            dVE_dmu_q += np.dot(q_fd.Afdu[q, :, :].T, VE_dm[f_index[d]][:,d_index[d]])[:, None]
            Adv = q_fd.Afdu[q,:,:].T * VE_dv[f_index[d]][:,d_index[d],None].T
            Adv = np.ascontiguousarray(Adv)
            AdvA = np.dot(Adv.reshape(-1, Nt), q_fd.Afdu[q, :, :]).reshape(M, M)
            dVE_dS_q += AdvA

            # Derivatives dKuquq
            tmp_dv = np.dot(AdvA, S_u[q, :, :]).dot(Kuui_new[q,:,:])
            dVE_dKqq += AdvA - tmp_dv - tmp_dv.T
            Adm = np.dot(q_fd.Afdu[q, :, :].T, VE_dm[f_index[d]][:,d_index[d],None])
            dVE_dKqq += - np.dot(Adm, np.dot(Kuui_new[q,:,:], m_u[:, q, None]).T)

            # Derivatives dKuqfd
            tmp = np.dot(S_u[q, :, :], Kuui_new[q,:,:])
            tmp = 2. * (tmp - np.eye(M))
            dve_kqd = np.dot(np.dot(Kuui_new[q,:,:], m_u[:, q, None]), VE_dm[f_index[d]][:,d_index[d],None].T)
            dve_kqd += np.dot(tmp.T, Adv)
            dVE_dKqd.append(dve_kqd)

            # Derivatives dKdiag
            dVE_dKdiag.append(VE_dv[f_index[d]][:,d_index[d]])

        dVE_dKqq = 0.5 * (dVE_dKqq + dVE_dKqq.T)

        # Derivatives of variational parameters
        dL_dmu_q = dVE_dmu_q - dKLnew_dmu_q + dKLold_dmu_q - dKLvar_dmu_q
        dL_dS_q = dVE_dS_q - dKLnew_dS_q + dKLold_dS_q - dKLvar_dS_q

        # Derivatives of prior hyperparameters
        # if using Zgrad, dL_dKqq = dVE_dKqq - dKLnew_dKqq + dKLold_dKqq - dKLvar_dKqq
        # otherwise for hyperparameters: dL_dKqq = dVE_dKqq - dKLnew_dKqq
        dL_dKqq = dVE_dKqq - dKLnew_dKqq  # + dKLold_dKqq - dKLvar_dKqq  # dKLold_dKqq only for Zgrad; dKLvar_dKqq still to be done (for Zgrad)
        dL_dKdq = dVE_dKqd
        dL_dKdiag = dVE_dKdiag

        # Map the S_q gradients onto the lower-triangular representation L_q (a finite-difference check of this chain rule follows the example)
        chol_u = q_U.chols_u.copy()
        L_q = choleskies.flat_to_triang(chol_u[:,q:q+1])
        dL_dL_q = 2. * np.array([np.dot(a, b) for a, b in zip(dL_dS_q[None,:,:], L_q)])
        dL_dL_q = choleskies.triang_to_flat(dL_dL_q)

        # Posterior
        posterior_q = Posterior(mean=m_u[:, q, None], cov=S_u[q, :, :], K=Kuu_new[q,:,:], prior_mean=np.zeros(m_u[:, q, None].shape))

        return dL_dmu_q, dL_dL_q, dL_dS_q, posterior_q, dL_dKqq, dL_dKdq, dL_dKdiag
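The final step above maps the gradient with respect to S_q = L_q L_q^T onto the free, lower-triangular entries of L_q via dL_dL_q = 2 * dL_dS_q @ L_q, which is the standard chain rule when dL_dS_q is symmetric. Here is a minimal finite-difference check of that identity on a toy objective, in plain numpy and independent of the choleskies helpers.

import numpy as np

rng = np.random.default_rng(0)
M = 4
A = rng.standard_normal((M, M)); A = 0.5 * (A + A.T)             # symmetric coefficient matrix
L = np.tril(rng.standard_normal((M, M)))
L[np.diag_indices(M)] = np.abs(np.diag(L)) + 1.0                 # keep L L^T well conditioned

def f(L):
    S = L @ L.T                                                  # toy objective of S = L L^T
    return np.trace(A @ S) + np.linalg.slogdet(S)[1]

S = L @ L.T
dF_dS = A + np.linalg.inv(S)                                     # symmetric gradient w.r.t. S
dF_dL = np.tril(2.0 * dF_dS @ L)                                 # the chain rule used above

num = np.zeros_like(L)
eps = 1e-6
for i in range(M):
    for j in range(i + 1):
        E = np.zeros_like(L); E[i, j] = eps
        num[i, j] = (f(L + E) - f(L - E)) / (2.0 * eps)          # central differences
print(np.max(np.abs(num - dF_dL)))                               # should be ~1e-8 or smaller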
Exemple #19
0
    def inference(self,
                  q_u_means,
                  q_u_chols,
                  X,
                  Y,
                  Z,
                  kern_list,
                  likelihood,
                  B_list,
                  Y_metadata,
                  KL_scale=1.0,
                  batch_scale=None):
        M = Z.shape[0]
        T = len(Y)
        if batch_scale is None:
            batch_scale = [1.0] * T  # no mini-batch rescaling; see the note after this example
        Ntask = [Y[t].shape[0] for t in range(T)]
        Q = len(kern_list)
        D = likelihood.num_output_functions(Y_metadata)
        Kuu, Luu, Kuui = util.latent_funs_cov(Z, kern_list)
        p_U = pu(Kuu=Kuu, Luu=Luu, Kuui=Kuui)
        q_U = qu(mu_u=q_u_means, chols_u=q_u_chols)

        # for every latent function f_d calculate q(f_d) and keep it as q(F):
        q_F = []
        posteriors_F = []
        f_index = Y_metadata['function_index'].flatten()
        d_index = Y_metadata['d_index'].flatten()

        for d in range(D):
            Xtask = X[f_index[d]]
            q_fd = self.calculate_q_f(X=Xtask,
                                      Z=Z,
                                      q_U=q_U,
                                      p_U=p_U,
                                      kern_list=kern_list,
                                      B=B_list,
                                      M=M,
                                      N=Xtask.shape[0],
                                      Q=Q,
                                      D=D,
                                      d=d)
            # Posterior objects for output functions (used in prediction)
            posterior_fd = Posterior(mean=q_fd.m_fd.copy(),
                                     cov=q_fd.S_fd.copy(),
                                     K=util.function_covariance(
                                         X=Xtask,
                                         B=B_list,
                                         kernel_list=kern_list,
                                         d=d),
                                     prior_mean=np.zeros(q_fd.m_fd.shape))
            posteriors_F.append(posterior_fd)
            q_F.append(q_fd)

        mu_F = []
        v_F = []
        for t in range(T):
            mu_F_task = np.empty((Y[t].shape[0], 1))
            v_F_task = np.empty((Y[t].shape[0], 1))
            for d, q_fd in enumerate(q_F):
                if f_index[d] == t:
                    mu_F_task = np.hstack((mu_F_task, q_fd.m_fd))
                    v_F_task = np.hstack((v_F_task, q_fd.v_fd))

            mu_F.append(mu_F_task[:, 1:])
            v_F.append(v_F_task[:, 1:])

        # Variational Expectations
        VE = likelihood.var_exp(Y, mu_F, v_F, Y_metadata)
        VE_dm, VE_dv = likelihood.var_exp_derivatives(Y, mu_F, v_F, Y_metadata)
        for t in range(T):
            VE[t] = VE[t] * batch_scale[t]
            VE_dm[t] = VE_dm[t] * batch_scale[t]
            VE_dv[t] = VE_dv[t] * batch_scale[t]

        # KL Divergence
        KL = self.calculate_KL(q_U=q_U, p_U=p_U, M=M, Q=Q)

        # Log Marginal log(p(Y))
        F = 0
        for t in range(T):
            F += VE[t].sum()

        log_marginal = F - KL

        # Gradients and Posteriors
        dL_dmu_u = []
        dL_dL_u = []
        dL_dKmm = []
        dL_dKmn = []
        dL_dKdiag = []
        posteriors = []
        for q in range(Q):
            (dL_dmu_q, dL_dL_q, posterior_q, dL_dKqq, dL_dKdq,
             dL_dKdiag_q) = self.calculate_gradients(q_U=q_U,
                                                     p_U=p_U,
                                                     q_F=q_F,
                                                     VE_dm=VE_dm,
                                                     VE_dv=VE_dv,
                                                     Ntask=Ntask,
                                                     M=M,
                                                     Q=Q,
                                                     D=D,
                                                     f_index=f_index,
                                                     d_index=d_index,
                                                     q=q)
            dL_dmu_u.append(dL_dmu_q)
            dL_dL_u.append(dL_dL_q)
            dL_dKmm.append(dL_dKqq)
            dL_dKmn.append(dL_dKdq)
            dL_dKdiag.append(dL_dKdiag_q)
            posteriors.append(posterior_q)

        gradients = {
            'dL_dmu_u': dL_dmu_u,
            'dL_dL_u': dL_dL_u,
            'dL_dKmm': dL_dKmm,
            'dL_dKmn': dL_dKmn,
            'dL_dKdiag': dL_dKdiag
        }

        return log_marginal, gradients, posteriors, posteriors_F
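The batch_scale argument above rescales each task's variational expectation. When Y holds mini-batches, a common choice (sketched here with made-up sizes, not values from the example) is the ratio of the full task size to the batch size, so that F stays an unbiased estimate of the full-data expected log-likelihood.

# toy per-task sizes: full training data vs. the mini-batch actually passed in as Y
N_full = [500, 300, 200]                 # total observations for tasks t = 0, 1, 2
N_batch = [50, 30, 20]                   # mini-batch sizes for the same tasks

# sum_t batch_scale[t] * VE[t].sum() then estimates the full-data term without bias
batch_scale = [nf / nb for nf, nb in zip(N_full, N_batch)]
print(batch_scale)                       # [10.0, 10.0, 10.0]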
Exemple #20
0
    def inference_root(self,
                       kern,
                       X,
                       Z,
                       likelihood,
                       Y,
                       Kuu_sigma=None,
                       Y_metadata=None,
                       Lm=None,
                       dL_dKmm=None):
        """
        The first phase of inference:
        Compute: log-likelihood, dL_dKmm

        Cached intermediate results: Kmm, KmmInv,
        """

        num_data, output_dim = Y.shape
        input_dim = Z.shape[0]
        num_data_total = allReduceArrays([np.int32(num_data)],
                                         self.mpi_comm)[0]

        uncertain_inputs = isinstance(X, VariationalPosterior)
        uncertain_outputs = isinstance(Y, VariationalPosterior)

        beta = 1. / np.fmax(likelihood.variance, 1e-6)

        psi0, psi2, YRY, psi1, psi1Y, Shalf, psi1S = self.gatherPsiStat(
            kern, X, Z, Y, beta, uncertain_inputs)

        #======================================================================
        # Compute Common Components
        #======================================================================

        try:
            Kmm = kern.K(Z).copy()
            if Kuu_sigma is not None:
                diag.add(Kmm, Kuu_sigma)
            else:
                diag.add(Kmm, self.const_jitter)
            Lm = jitchol(Kmm)

            LmInv = dtrtri(Lm)
            LmInvPsi2LmInvT = LmInv.dot(psi2.dot(LmInv.T))

            Lambda = np.eye(Kmm.shape[0]) + LmInvPsi2LmInvT
            LL = jitchol(Lambda)
            LLInv = dtrtri(LL)
            flag = np.zeros((1, ), dtype=np.int32)
            self.mpi_comm.Bcast(flag, root=self.root)
        except LinAlgError as e:
            flag = np.ones((1, ), dtype=np.int32)
            self.mpi_comm.Bcast(flag, root=self.root)
            raise e

        broadcastArrays([LmInv, LLInv], self.mpi_comm, self.root)
        LmLLInv = LLInv.dot(LmInv)

        logdet_L = 2. * np.sum(np.log(np.diag(LL)))
        b = psi1Y.dot(LmLLInv.T)
        bbt = np.square(b).sum()
        v = b.dot(LmLLInv)
        LLinvPsi1TYYTPsi1LLinvT = tdot(b.T)

        if psi1S is not None:
            psi1SLLinv = psi1S.dot(LmLLInv.T)
            bbt_sum = np.square(psi1SLLinv).sum()
            LLinvPsi1TYYTPsi1LLinvT_sum = tdot(psi1SLLinv.T)
            bbt_sum, LLinvPsi1TYYTPsi1LLinvT_sum = reduceArrays(
                [bbt_sum, LLinvPsi1TYYTPsi1LLinvT_sum], self.mpi_comm,
                self.root)
            bbt += bbt_sum
            LLinvPsi1TYYTPsi1LLinvT += LLinvPsi1TYYTPsi1LLinvT_sum
            psi1SP = psi1SLLinv.dot(LmLLInv)
        tmp = -LLInv.T.dot(LLinvPsi1TYYTPsi1LLinvT +
                           output_dim * np.eye(input_dim)).dot(LLInv)
        dL_dpsi2R = LmInv.T.dot(tmp +
                                output_dim * np.eye(input_dim)).dot(LmInv) / 2.
        broadcastArrays([dL_dpsi2R], self.mpi_comm, self.root)

        #======================================================================
        # Compute log-likelihood
        #======================================================================
        logL_R = -num_data_total * np.log(beta)
        logL = -(output_dim * (num_data_total * log_2_pi + logL_R + psi0 -
                               np.trace(LmInvPsi2LmInvT)) + YRY -
                 bbt) / 2. - output_dim * logdet_L / 2.

        #======================================================================
        # Compute dL_dKmm
        #======================================================================

        dL_dKmm = dL_dpsi2R - output_dim * LmInv.T.dot(LmInvPsi2LmInvT).dot(
            LmInv) / 2.

        #======================================================================
        # Compute the Posterior distribution of inducing points p(u|Y)
        #======================================================================

        wd_inv = backsub_both_sides(
            Lm,
            np.eye(input_dim) -
            backsub_both_sides(LL, np.identity(input_dim), transpose='left'),
            transpose='left')
        post = Posterior(woodbury_inv=wd_inv,
                         woodbury_vector=v.T,
                         K=Kmm,
                         mean=None,
                         cov=None,
                         K_chol=Lm)

        #======================================================================
        # Compute dL_dthetaL for uncertain inputs and non-heteroscedastic noise
        #======================================================================

        dL_dthetaL = (YRY * beta + beta * output_dim * psi0 - num_data_total *
                      output_dim * beta) / 2. - beta * (dL_dpsi2R * psi2).sum(
                      ) - beta * np.trace(LLinvPsi1TYYTPsi1LLinvT)

        #======================================================================
        # Compute dL_dpsi
        #======================================================================

        dL_dpsi0 = -output_dim * (beta * np.ones((num_data, ))) / 2.

        if uncertain_outputs:
            m, s = Y.mean, Y.variance
            dL_dpsi1 = beta * (np.dot(m, v) + Shalf[:, None] * psi1SP)
        else:
            dL_dpsi1 = beta * np.dot(Y, v)

        if uncertain_inputs:
            dL_dpsi2 = beta * dL_dpsi2R
        else:
            dL_dpsi1 += np.dot(psi1, dL_dpsi2R) * 2.
            dL_dpsi2 = None

        if uncertain_inputs:
            grad_dict = {
                'dL_dKmm': dL_dKmm,
                'dL_dpsi0': dL_dpsi0,
                'dL_dpsi1': dL_dpsi1,
                'dL_dpsi2': dL_dpsi2,
                'dL_dthetaL': dL_dthetaL
            }
        else:
            grad_dict = {
                'dL_dKmm': dL_dKmm,
                'dL_dKdiag': dL_dpsi0,
                'dL_dKnm': dL_dpsi1,
                'dL_dthetaL': dL_dthetaL
            }

        if uncertain_outputs:
            m, s = Y.mean, Y.variance
            psi1LmiLLi = psi1.dot(LmLLInv.T)
            LLiLmipsi1Y = b.T
            grad_dict['dL_dYmean'] = -m * beta + psi1LmiLLi.dot(LLiLmipsi1Y)
            grad_dict['dL_dYvar'] = beta / -2. + np.square(psi1LmiLLi).sum(
                axis=1) / 2

        return post, logL, grad_dict
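The allReduceArrays, reduceArrays and broadcastArrays helpers used above are not shown here; they appear to wrap the usual MPI pattern in which workers contribute partial statistics that are summed across processes, and root-computed factors are broadcast back. A minimal sketch of that pattern in plain mpi4py, with toy arrays rather than the psi-statistics of the example:

from mpi4py import MPI
import numpy as np

comm = MPI.COMM_WORLD
rank = comm.Get_rank()

# each worker holds a partial statistic (e.g. its share of psi1^T Y); here a toy vector
local_stat = np.full(3, float(rank))

# sum the partial statistics over all processes (the role of reduceArrays above)
global_stat = np.zeros_like(local_stat)
comm.Allreduce(local_stat, global_stat, op=MPI.SUM)

# broadcast a factor computed on the root to every worker (the role of broadcastArrays)
shared = np.arange(4, dtype=float) if rank == 0 else np.empty(4)
comm.Bcast(shared, root=0)
print(rank, global_stat, shared)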
Exemple #21
0
    def incremental_inference(self,
                              kern,
                              X,
                              likelihood,
                              Y,
                              mean_function=None,
                              Y_metadata=None,
                              K=None,
                              variance=None,
                              Z_tilde=None):

        # do an incremental (rank-one) update of the kernel Cholesky factor; the underlying block identity is checked in the sketch after this example
        if mean_function is None:
            m = 0
        else:
            m = mean_function.f(X)

        if variance is None:
            variance = likelihood.gaussian_variance(Y_metadata)

        YYT_factor = Y - m

        # K_tmp = kern.K(X, X[-1:])
        K_inc = kern._K[:-1, -1]
        K_inc2 = kern._K[-1:, -1]
        # self._K = np.block([[self._K, K_inc], [K_inc.T, K_inc2]])

        # Ky = K.copy()
        jitter = variance[
            -1] + 1e-8  # variance can be given for each point individually, in which case we just take the last point
        # diag.add(Ky, jitter)

        # LW_old = self._old_posterior.woodbury_chol

        Wi, LW, LWi, W_logdet = pdinv_inc(self._old_LW, K_inc, K_inc2 + jitter,
                                          self._old_Wi)

        alpha, _ = dpotrs(LW, YYT_factor, lower=1)

        log_marginal = 0.5 * (-Y.size * log_2_pi - Y.shape[1] * W_logdet -
                              np.sum(alpha * YYT_factor))

        if Z_tilde is not None:
            # This is a correction term for the log marginal likelihood
            # In EP this is log Z_tilde, which is the difference between the
            # Gaussian marginal and Z_EP
            log_marginal += Z_tilde

        dL_dK = 0.5 * (tdot(alpha) - Y.shape[1] * Wi)

        dL_dthetaL = likelihood.exact_inference_gradients(
            np.diag(dL_dK), Y_metadata)

        self._old_LW = LW
        self._old_Wi = Wi
        posterior = Posterior(woodbury_chol=LW, woodbury_vector=alpha, K=K)

        # TODO add logdet to posterior ?
        return posterior, log_marginal, {
            'dL_dK': dL_dK,
            'dL_dthetaL': dL_dthetaL,
            'dL_dm': alpha
        }
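pdinv_inc is not shown here, but the incremental update it presumably performs rests on the block-Cholesky identity: if the extended kernel matrix stacks the old block A = L_old L_old^T with a new column k and scalar kappa, the new factor is [[L_old, 0], [c^T, d]] with c = L_old^{-1} k and d = sqrt(kappa - c^T c). A small numpy/scipy check of that identity on a toy kernel matrix, independent of pdinv_inc's actual signature:

import numpy as np
from scipy.linalg import cholesky, solve_triangular

rng = np.random.default_rng(1)
X = rng.standard_normal((6, 2))
K = np.exp(-0.5 * ((X[:, None, :] - X[None, :, :]) ** 2).sum(-1)) + 1e-8 * np.eye(6)

L_old = cholesky(K[:-1, :-1], lower=True)      # factor of the kernel before the new point
k_inc = K[:-1, -1]                             # covariance of the new point with the old ones
kappa = K[-1, -1]                              # its own (jittered) variance

c = solve_triangular(L_old, k_inc, lower=True)
d = np.sqrt(kappa - c @ c)
L_new = np.block([[L_old, np.zeros((5, 1))],
                  [c[None, :], np.array([[d]])]])

print(np.max(np.abs(L_new - cholesky(K, lower=True))))   # should be ~0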
Exemple #22
0
    def posteriors_F(self, Xnew, which_out=None):
        # This function returns all the q(f*) posteriors associated with each output.
        # We assume that Xnew is either a list whose length equals the number of likelihoods defined for the HetMOGP,
        # or a numpy array, in which case it is replicated for each output.

        if isinstance(Xnew, list):
            Xmulti_all_new = Xnew
        else:
            Xmulti_all_new = []
            for i in range(self.num_output_funcs):
                Xmulti_all_new.append(Xnew.copy())

        M = self.Z.shape[0]
        Q = len(self.kern_list)
        D = self.likelihood.num_output_functions(self.Y_metadata)
        Kuu, Luu, Kuui = util.VIK_covariance(self.Z, self.kern_list,
                                             self.kern_list_Tq, self.kern_aux)
        p_U = pu(Kuu=Kuu, Luu=Luu, Kuui=Kuui)
        q_U = qu(mu_u=self.q_u_means, chols_u=self.q_u_chols)
        S_u = np.empty((Q, M, M))
        L_u = choleskies.flat_to_triang(q_U.chols_u.copy())
        [np.dot(L_u[q, :, :], L_u[q, :, :].T, S_u[q, :, :]) for q in range(Q)]

        # for every latent function f_d calculate q(f_d) and keep it as q(F):
        posteriors_F = []
        f_index = self.Y_metadata['function_index'].flatten()
        d_index = self.Y_metadata['d_index'].flatten()

        if which_out is None:
            indix_aux = f_index.copy()
        else:
            which_out = np.array(which_out)
            indix_aux = -1 * np.ones_like(f_index)
            for i in range(which_out.shape[0]):
                posix = np.where(f_index == which_out[i])
                indix_aux[posix] = f_index[posix].copy()

        for d in range(D):
            if f_index[d] == indix_aux[d]:
                Xtask = Xmulti_all_new[f_index[d]]
                q_fd, _ = self.inference_method.calculate_q_f(
                    X=Xtask,
                    Z=self.Z,
                    q_U=q_U,
                    S_u=S_u,
                    p_U=p_U,
                    kern_list=self.kern_list,
                    kern_list_Gdj=self.kern_list_Gdj,
                    kern_list_Tq=self.kern_list_Tq,
                    kern_aux=self.kern_aux,
                    B=self.B_list,
                    M=M,
                    N=Xtask.shape[0],
                    Q=Q,
                    D=D,
                    d=d)
                # Posterior objects for output functions (used in prediction)
                posterior_fd = Posterior(mean=q_fd.m_fd.copy(),
                                         cov=q_fd.S_fd.copy(),
                                         K=util.function_covariance(
                                             X=Xtask,
                                             B=self.B_list,
                                             kernel_list=self.kern_list,
                                             d=d),
                                         prior_mean=np.zeros(q_fd.m_fd.shape))
                posteriors_F.append(posterior_fd)
            else:
                #posteriors_F.append(fake_posterior)
                posteriors_F.append([])
        return posteriors_F
    def _outer_loop_for_missing_data(self):
        Lm = None
        dL_dKmm = None

        self._log_marginal_likelihood = 0
        self.full_values = self._outer_init_full_values()

        if self.posterior is None:
            woodbury_inv = np.zeros(
                (self.num_inducing, self.num_inducing, self.output_dim))
            woodbury_vector = np.zeros((self.num_inducing, self.output_dim))
        else:
            woodbury_inv = self.posterior._woodbury_inv
            woodbury_vector = self.posterior._woodbury_vector

        if not self.stochastics:
            m_f = lambda i: "Inference with missing_data: {: >7.2%}".format(
                float(i + 1) / self.output_dim)
            message = m_f(-1)
            print(message, end=' ')

        for d, ninan in self.stochastics.d:
            if not self.stochastics:
                print(' ' * (len(message)) + '\r', end=' ')
                message = m_f(d)
                print(message, end=' ')

            psi0ni = self.psi0[ninan]
            psi1ni = self.psi1[ninan]
            if self.has_uncertain_inputs():
                psi2ni = self.psi2[ninan]
                value_indices = dict(outputs=d,
                                     samples=ninan,
                                     dL_dpsi0=ninan,
                                     dL_dpsi1=ninan,
                                     dL_dpsi2=ninan)
            else:
                psi2ni = None
                value_indices = dict(outputs=d,
                                     samples=ninan,
                                     dL_dKdiag=ninan,
                                     dL_dKnm=ninan)

            posterior, log_marginal_likelihood, grad_dict = self._inner_parameters_changed(
                self.kern,
                self.X[ninan],
                self.Z,
                self.likelihood,
                self.Y_normalized[ninan][:, d],
                self.Y_metadata,
                Lm,
                dL_dKmm,
                psi0=psi0ni,
                psi1=psi1ni,
                psi2=psi2ni)

            # Fill out the full values by adding in the appropriate grad_dict
            # values
            self._inner_take_over_or_update(self.full_values, grad_dict,
                                            value_indices)
            self._inner_values_update(grad_dict)  # What is this for? -> MRD

            woodbury_inv[:, :, d] = posterior.woodbury_inv[:, :, None]
            woodbury_vector[:, d] = posterior.woodbury_vector
            self._log_marginal_likelihood += log_marginal_likelihood

        if not self.stochastics:
            print('')

        if self.posterior is None:
            self.posterior = Posterior(woodbury_inv=woodbury_inv,
                                       woodbury_vector=woodbury_vector,
                                       K=posterior._K,
                                       mean=None,
                                       cov=None,
                                       K_chol=posterior.K_chol)
        self._outer_values_update(self.full_values)
        if self.has_uncertain_inputs():
            self.kern.return_psi2_n = False