Example #1
def approxConditionals(s, lhat, rest=None):
    """
    y: Full mean (all dimensions)
    s: Full cov. matrix (all dimensions)
    lhat: The dimensions for which to compute
          p(lhat | rest).
    
    a = lhat (unobserved indices) |a| - dimensional
    b = rest (observed indices)   |b| - dimensional
    
    (a,b) ~ N(M, S)
    Then:
    S_{a|b} = Saa - Sab Sbb^-1 Sba
    M_{a|b} = Sab Sbb^-1 Mb        
    """
    #  y is L x 1
    #  s is L x L
    L = s.shape[1]
    if rest is None:
        A = list(set(range(L)) - set([lhat]))
    else:
        A = rest
    # Slice s, to get the indices denoted by A,A (ie all rows in A and columns in A)
    sAA = sliceArr(s,A,A)
    try:
        inv_sAA = pdinv(sAA)[0]
    except np.linalg.LinAlgError:
        # Retry with extra jitter on the diagonal if sAA is not positive definite
        inv_sAA = pdinv(sAA + 1e-8*np.eye(sAA.shape[0], sAA.shape[1]))[0]
        print("Warning: Added more jitter!")
    sc = sliceArr(s,[lhat],A)
    sInvs = np.dot(np.dot(sc,inv_sAA),sc.T)

    s_U = sliceArr(s,[lhat],[lhat]) - sInvs
    return s_U
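The docstring above is the standard Gaussian conditioning identity; only the conditional covariance is returned. A minimal sketch of the same computation in plain NumPy, assuming `sliceArr(s, rows, cols)` behaves like `s[np.ix_(rows, cols)]` and with `np.linalg.inv` standing in for `pdinv`:

import numpy as np

# Toy 3x3 covariance; condition dimension 0 on dimensions 1 and 2
s = np.array([[2.0, 0.5, 0.3],
              [0.5, 1.5, 0.2],
              [0.3, 0.2, 1.0]])
lhat, A = 0, [1, 2]

sAA = s[np.ix_(A, A)]              # S_bb
sc = s[np.ix_([lhat], A)]          # S_ab
s_U = s[np.ix_([lhat], [lhat])] - sc @ np.linalg.inv(sAA) @ sc.T   # S_{a|b}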
Example #2
    def update_kern_grads(self):
        """
        Set the derivative of the lower bound wrt the (kernel) parameters
        """
        for i, kern in enumerate(self.kern):
            K = kern.K(self.X)
            B_inv = np.diag(1. / (self.phi[:, i] / self.variance))

            alpha = linalg.cho_solve(linalg.cho_factor(K + B_inv), self.Y)
            K_B_inv = pdinv(K + B_inv)[0]

            dL_dK = np.outer(alpha, alpha) - K_B_inv

            kern.update_gradients_full(dL_dK=dL_dK, X=self.X)

        # variance gradient

        grad_Lm_variance = 0.0
        for i, kern in enumerate(self.kern):
            K = kern.K(self.X)
            I = np.eye(self.N)

            B_inv = np.diag(1. / ((self.phi[:, i] + 1e-6) / self.variance))
            alpha = np.linalg.solve(K + B_inv, self.Y)
            K_B_inv = pdinv(K + B_inv)[0]
            dL_dB = np.outer(alpha, alpha) - K_B_inv
            grad_B_inv = np.diag(1. / (self.phi[:, i] + 1e-6))

            grad_Lm_variance += 0.5 * np.trace(np.dot(dL_dB, grad_B_inv))

            self.variance.gradient = grad_Lm_variance
Example #3
 def woodbury_chol(self):
     """
     return $L_{W}$ where L is the lower triangular Cholesky decomposition of the Woodbury matrix
     $$
     L_{W}L_{W}^{\top} = W^{-1}
     W^{-1} := \texttt{Woodbury inv}
     $$
     """
     if self._woodbury_chol is None:
         #compute woodbury chol from
         if self._woodbury_inv is not None:
             winv = np.atleast_3d(self._woodbury_inv)
             self._woodbury_chol = np.zeros(winv.shape)
             for p in range(winv.shape[-1]):
                 self._woodbury_chol[:, :, p] = pdinv(winv[:, :, p])[2]
         elif self._covariance is not None:
             raise NotImplementedError("TODO: check code here")
             B = self._K - self._covariance
             tmp, _ = dpotrs(self.K_chol, B)
             self._woodbury_inv, _ = dpotrs(self.K_chol, tmp.T)
             _, _, self._woodbury_chol, _ = pdinv(self._woodbury_inv)
         else:
             raise ValueError(
                 "insufficient information to compute posterior")
     return self._woodbury_chol
Example #4
    def _update_batch(self, eta: float, delta: float, post_params: posteriorParams, marg_moments: MarginalMoments, batch: List[int], get_logger: Callable=None, sigma2s: np.ndarray=None):
        """
        Computes a new Gaussian approximation for a batch given the posterior and marginal moments. See e.g. 3.59 in http://www.gaussianprocess.org/gpml/chapters/RW.pdf
        :param eta: parameter for fractional updates.
        :param delta: damping factor for the updates.
        :param post_params: Posterior approximation
        :param marg_moments: Marginal moments at this iteration
        :param batch: list of indices of the parameters to be updated
        :param get_logger: Function for retrieving the logger to which messages are forwarded.
        :param sigma2s: Per-point noise variances; scaled up when the update is rejected.
        """
        sigma_hat_inv,_,_,_ = pdinv(marg_moments.sigma2_hat[np.ix_(batch,batch)])
        post_sigma_inv,_,_,_ = pdinv(post_params.Sigma[np.ix_(batch,batch)])

        tmp0 = sigma_hat_inv - post_sigma_inv

        delta_tau = delta/eta* tmp0
        delta_v = delta/eta*(np.dot(marg_moments.mu_hat[batch],sigma_hat_inv) - np.dot(post_params.mu[batch], post_sigma_inv))
        tau_tilde_prev = self.tau[np.ix_(batch,batch)]
        tmp = (1-delta)*self.tau[np.ix_(batch,batch)] + delta_tau
        
        #Let us make sure that sigma_hat_inv - post_sigma_inv is positive definite
        tmp, added_value = nearestPD.nearest_pd.nearestPD(tmp)        
        update = True        
        if (added_value > 1) and (sigma2s is not None):
            update = False                
            sigma2s *= 1.05
            if get_logger is not None:
                get_logger().error('Increasing batch noise. Not updating gaussian approximation ({})'.format(sigma2s[0]))
        if update:        
            self.tau[np.ix_(batch,batch)] = tmp 
            self.v[batch] = (1-delta)*self.v[batch] + delta_v
        return (delta_tau, delta_v), sigma2s
Example #5
    def update_model(self, xvals, zvals, incremental = True):
        assert(self.xvals is not None)
        assert(self.zvals is not None)
        
        Kx = self.kern.K(self.xvals, xvals)

        # Update K matrix
        self._K = np.block([
            [self._K,    Kx],
            [Kx.T,      self.kern.K(xvals, xvals)] 
         ])

        # Update internal data
        self.xvals = np.vstack([self.xvals, xvals])
        self.zvals = np.vstack([self.zvals, zvals])

        # Update woodbury inverse, either incrementally or from scratch
        if incremental:
            Pinv = self.woodbury_inv
            Q = Kx
            R = Kx.T
            S = self.kern.K(xvals, xvals)
            M = S - np.dot(np.dot(R, Pinv), Q)
            # Adds some additional noise to ensure well-conditioned
            diag.add(M, self.noise + 1e-8)
            M, _, _, _ = pdinv(M)

            Pnew = Pinv + np.dot(np.dot(np.dot(np.dot(Pinv, Q), M), R), Pinv)
            Qnew = -np.dot(np.dot(Pinv, Q), M)
            Rnew = -np.dot(np.dot(M, R), Pinv)
            Snew = M

            self._woodbury_inv = np.block([
                [Pnew, Qnew],
                [Rnew, Snew]
            ])
        else:
            Ky = self.K.copy()
            # Adds some additional noise to ensure well-conditioned
            diag.add(Ky, self.noise + 1e-8)
            Wi, LW, LWi, W_logdet = pdinv(Ky)
            self._woodbury_inv = Wi 
        
        self._woodbury_vector = np.dot(self.woodbury_inv, self.zvals) 

        self._woodbury_chol = None 
        self._mean =  None
        self._covariance = None
        self._prior_mean = 0.
        self._K_chol = None
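The incremental branch above applies the 2x2 block-inverse (Schur complement) identity to extend the cached Woodbury inverse. A hedged sanity check of that identity with plain NumPy, using a toy kernel and `np.linalg.inv` standing in for `pdinv`:

import numpy as np

rng = np.random.default_rng(0)
X = rng.normal(size=(6, 1))
k = lambda a, b: np.exp(-0.5 * (a - b.T) ** 2)   # toy RBF kernel, unit hyperparameters

P = k(X[:4], X[:4]) + 1e-6 * np.eye(4)           # old Ky block
Q = k(X[:4], X[4:])                              # cross-covariances Kx
S = k(X[4:], X[4:]) + 1e-6 * np.eye(2)           # new block

Pinv = np.linalg.inv(P)
M = np.linalg.inv(S - Q.T @ Pinv @ Q)            # inverse of the Schur complement
block_inv = np.block([
    [Pinv + Pinv @ Q @ M @ Q.T @ Pinv, -Pinv @ Q @ M],
    [-M @ Q.T @ Pinv,                   M],
])
assert np.allclose(block_inv, np.linalg.inv(np.block([[P, Q], [Q.T, S]])))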
Example #6
    def bound(self):
        """
        Compute the lower bound on the marginal likelihood (conditioned on the
        GP hyper parameters).
        """
        GP_bound = 0.0

        for i, kern in enumerate(self.kern):
            K = kern.K(self.X)
            B_inv = np.diag(1. / ((self.phi[:, i] + 1e-6) / self.variance))

            # Make more stable using cholesky factorization:
            Bi, LB, LBi, Blogdet = pdinv(K+B_inv)

            # Data fit
            # alpha = linalg.cho_solve(linalg.cho_factor(K + B_inv), self.Y)
            # GP_bound += -0.5 * np.dot(self.Y.T, alpha).trace()
            GP_bound -= .5 * dpotrs(LB, self.YYT)[0].trace()

            # Penalty
            # GP_bound += -0.5 * np.linalg.slogdet(K + B_inv)[1]
            GP_bound -= 0.5 * Blogdet

            # Constant, weighted by  model assignment per point
            #GP_bound += -0.5 * (self.phi[:, i] * np.log(2 * np.pi * self.variance)).sum()
            GP_bound -= .5*self.D * np.einsum('j,j->',self.phi[:, i], np.log(2 * np.pi * self.variance))

        return  GP_bound + self.mixing_prop_bound() + self.H
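The commented-out `cho_solve` lines and the `dpotrs(LB, self.YYT)` line compute the same data-fit term, since trace(Y^T A^{-1} Y) = trace(A^{-1} Y Y^T). A small hedged check with SciPy's `cho_solve` standing in for `dpotrs`:

import numpy as np
from scipy import linalg

rng = np.random.default_rng(3)
n = 5
X = rng.normal(size=(n, 1))
K = np.exp(-0.5 * (X - X.T) ** 2)                     # toy RBF Gram matrix
B_inv = np.diag(rng.uniform(0.5, 2.0, n))             # per-point noise term
Y = rng.normal(size=(n, 2))
YYT = Y @ Y.T

cf = linalg.cho_factor(K + B_inv, lower=True)
alpha = linalg.cho_solve(cf, Y)
fit_alpha = -0.5 * np.dot(Y.T, alpha).trace()         # commented-out version
fit_chol = -0.5 * linalg.cho_solve(cf, YYT).trace()   # dpotrs-style version
assert np.isclose(fit_alpha, fit_chol)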
Example #7
    def IntKKNorm(self, x1, x2, mu, sigma):
        """Compute \int k(x1,x') k(x',x2) Normal_x'(\mu, \Sigma) dx'.

        Parameters
        ----------
        x1 : array, size (n1, n_dim)
        x2 : array, size (n2, n_dim)
        mu : array, size (n_dim)
            The mean of the Gaussian distribution.
        sigma : array, size (n_dim, n_dim)
            The covariance of the Gaussian distribution.

        Returns
        -------
        res : array, size (n1, n2)

        """
        ndim = self.input_dim
        var = self.variance
        ell2 = np.ones((ndim, )) * self.lengthscale**2
        sqrt_det = np.sqrt(np.prod(ell2))
        cov = sigma + 0.5 * np.diag(ell2)
        cov_inv, _, _, ld, = pdinv(cov)
        x_shift = 0.5 * (x1[:, None] + x2[None, :]) - mu
        arg = np.sum(x_shift * np.matmul(x_shift, cov_inv), axis=2)
        k1 = var * np.exp(-0.5 * arg)
        k2 = self.K(x1 / np.sqrt(2), x2 / np.sqrt(2))
        const = np.exp(-0.5 * ld) * sqrt_det / np.power(2, ndim / 2.0)
        return const * k1 * k2
Example #8
    def inference(self, kern, X, likelihood, Y, mean_function=None, Y_metadata=None, K=None, variance=None, Z_tilde=None, A = None):
        """
        Returns a Posterior class containing essential quantities of the posterior
        The comments below correspond to Algorithm 2.1 in the GPML textbook.
        """
        # print('ExactGaussianInferenceGroup inference:')
        if mean_function is None:
            m = 0
        else:
            m = mean_function.f(X)

        if variance is None:
            variance = likelihood.gaussian_variance(Y_metadata)

        YYT_factor = Y-m

        # NOTE: change K to AKA^T
        if K is None:
            if A is None:
                A = np.identity(X.shape[0])
            K = A.dot(kern.K(X)).dot(A.T) # A_t k(X_t, X_t) A_t^T
        else:
            raise NotImplementedError('Need to be extended to group case!')
            

        Ky = K.copy()
        diag.add(Ky, variance+1e-8) # A_t k(X_t, X_t)A_t^T + sigma^2 I

        # pdinv:
        # Wi: inverse of Ky
        # LW: the lower Cholesky factor of Ky -> L
        # LWi: the inverse of LW (not used)
        # W_logdet: the log of the determinant of Ky
        Wi, LW, LWi, W_logdet = pdinv(Ky)

        # LAPACK: DPOTRS solves a system of linear equations A*X = B with a symmetric
        # positive definite matrix A using the Cholesky factorization
        # A = U**T*U or A = L*L**T computed by DPOTRF.
        alpha, _ = dpotrs(LW, YYT_factor, lower=1)
        # so this gives 
        # (A_t k(X_t, X_t)A_t^T + sigma^2 I)^{-1} (Y_t - m)

        # Note: 20210827 confirm the log marginal likelihood 
        log_marginal =  0.5*(-Y.size * log_2_pi - Y.shape[1] * W_logdet - np.sum(alpha * YYT_factor))

        if Z_tilde is not None:
            # This is a correction term for the log marginal likelihood
            # In EP this is log Z_tilde, which is the difference between the
            # Gaussian marginal and Z_EP
            log_marginal += Z_tilde

        # REVIEW: since log_marginal does not change, the gradient does not need to change as well.
        # FIXME: confirm the gradient update is correct
        # dL_dK = 0.5 * (tdot(alpha) - Y.shape[1] * Wi)
        dL_dK = 0.5 * A.T.dot((tdot(alpha) - Y.shape[1] * Wi)).dot(A)
        # print('dL_dK shape', dL_dK.shape)

        dL_dthetaL = likelihood.exact_inference_gradients(np.diag(dL_dK), Y_metadata)

        return PosteriorExactGroup(woodbury_chol=LW, woodbury_vector=alpha, K=K, A = A), log_marginal, {'dL_dK':dL_dK, 'dL_dthetaL':dL_dthetaL, 'dL_dm':alpha}
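The comments in this example describe `pdinv`'s four return values. A hedged check of those relationships, assuming GPy is installed and keeps the usual convention (inverse, lower Cholesky factor, inverse of that factor, log-determinant):

import numpy as np
from GPy.util.linalg import pdinv

Ky = np.array([[2.0, 0.3],
               [0.3, 1.5]])
Wi, LW, LWi, W_logdet = pdinv(Ky)

assert np.allclose(Wi, np.linalg.inv(Ky))              # inverse of Ky
assert np.allclose(LW @ LW.T, Ky)                      # lower Cholesky factor of Ky
assert np.allclose(LWi @ LW, np.eye(2))                # LWi is the inverse of LW
assert np.isclose(W_logdet, np.linalg.slogdet(Ky)[1])  # log determinant of Ky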
Example #9
    def dIntKKNorm_dX(self, x1, x2, mu, sigma):
        """Compute 
            d/dx1 \int k(x1,x') k(x',x2) Normal_x'(\mu, \Sigma) dx'.

        Parameters
        ----------
        x1 : array, size (n1, n_dim)
        x2 : array, size (n2, n_dim)
        mu : array, size (n_dim)
            The mean of the Gaussian distribution.
        sigma : array, size (n_dim, n_dim)
            The covariance of the Gaussian distribution.

        Returns
        -------
        jac : array, size (n1, n2, n_dim)
            The gradients of IntKKNorm w.r.t. x1.

        """
        ndim = self.input_dim
        ell2 = np.ones((ndim, )) * self.lengthscale**2
        cov = sigma + 0.5 * np.diag(ell2)
        cov_inv, _, _, ld, = pdinv(cov)
        x_avg = 0.5 * (x1[:, None] + x2[None, :])
        aa = np.dot(2 * x_avg / ell2, sigma) + mu
        const = -x1[:, None] / ell2 + 0.5 * np.dot(aa, cov_inv)
        integral = self.IntKKNorm(x1, x2, mu, sigma)
        jacobian = const * integral[:, :, None]
        return jacobian
Example #10
    def vb_grad_natgrad(self):
        """
        Natural Gradients of the bound with respect to phi, the variational
        parameters controlling assignment of the data to GPs
        """
        grad_Lm = np.zeros_like(self.phi)
        for i, kern in enumerate(self.kern):
            K = kern.K(self.X)
            I = np.eye(self.N)

            B_inv = np.diag(1. / ((self.phi[:, i] + 1e-6) / self.variance))
            alpha = np.linalg.solve(K + B_inv, self.Y)
            K_B_inv = pdinv(K + B_inv)[0]
            dL_dB = tdot(alpha) - K_B_inv

            for n in range(self.phi.shape[0]):
                grad_B_inv = np.zeros_like(B_inv)
                grad_B_inv[n, n] = -self.variance / (self.phi[n, i]**2 + 1e-6)
                grad_Lm[n, i] = 0.5 * np.trace(np.dot(dL_dB, grad_B_inv))

        grad_phi = grad_Lm + self.mixing_prop_bound_grad() + self.Hgrad

        natgrad = grad_phi - np.sum(self.phi * grad_phi, 1)[:, None]
        grad = natgrad * self.phi

        return grad.flatten(), natgrad.flatten()
Example #11
    def _log_likelihood(self, log_params):
        # Returns log likelihood, p(D|hyperparams)
        params = np.exp(log_params)
        l_scales = params[0:self.X_dim]
        output_var = params[self.X_dim] # Vertical length scale
        noise_var = params[self.X_dim + 1]
        # compute eta
        eta = np.min(self.Y) - params[self.X_dim + 2] # QUESTION: what is this?
        # compute the observed value for g instead of y
        g_ob = np.sqrt(2.0 * (self.Y - eta))

        kernel = GPy.kern.RBF(input_dim=self.X_dim, ARD=True, variance=output_var, lengthscale=l_scales)
        Kng = kernel.K(self.X)
        # QUESTION: does not seem to follow conditional variance form in eqn 6

        # compute posterior mean distribution for g TODO update this
        # GPg = GPy.models.GPRegression(self.X, g_ob, kernel, noise_var=1e-8)
        # mg,_ = GPg.predict(self.X)
        mg = g_ob

        # approximate covariance matrix of y using linearisation technique
        Kny = mg * Kng * mg.T + (noise_var+1e-8) * np.eye(Kng.shape[0])

        # compute likelihood terms
        Wi, LW, LWi, W_logdet = pdinv(Kny) # from GPy module
        # Wi = inverse of Kny (ndarray)
        # LW = lower Cholesky factor of Kny (ndarray)
        # LWi = inverse of LW (ndarray)
        # W_logdet = log determinant of Kny (float)

        alpha, _ = dpotrs(LW, self.Y, lower=1)
        loglikelihood = 0.5 * (-self.Y.size * np.log(2 * np.pi) - self.Y.shape[1] * W_logdet - np.sum(alpha * self.Y))
        # Log marginal likelihood for GP, based on Rasmussen eqn 2.30

        return loglikelihood
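The last three lines implement Rasmussen & Williams eqn 2.30 for the Gaussian log marginal likelihood. A hedged check of that formula against `scipy.stats.multivariate_normal`, with SciPy's Cholesky routines standing in for `pdinv`/`dpotrs`:

import numpy as np
from scipy.stats import multivariate_normal
from scipy.linalg import cho_factor, cho_solve

rng = np.random.default_rng(2)
n = 6
X = rng.normal(size=(n, 1))
Kny = np.exp(-0.5 * (X - X.T) ** 2) + 0.1 * np.eye(n)   # toy RBF covariance + noise
Y = rng.normal(size=(n, 1))

c, low = cho_factor(Kny, lower=True)
alpha = cho_solve((c, low), Y)
W_logdet = 2.0 * np.sum(np.log(np.diag(c)))
loglik = 0.5 * (-Y.size * np.log(2 * np.pi) - Y.shape[1] * W_logdet - np.sum(alpha * Y))

assert np.isclose(loglik, multivariate_normal(mean=np.zeros(n), cov=Kny).logpdf(Y.ravel()))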
Example #12
    def bound(self):
        """
        Compute the lower bound on the marginal likelihood (conditioned on the
        GP hyper parameters).
        """
        GP_bound = 0.0

        for i, kern in enumerate(self.kern):
            K = kern.K(self.X)
            B_inv = np.diag(1. / ((self.phi[:, i] + 1e-6) / self.variance))

            # Make more stable using cholesky factorization:
            Bi, LB, LBi, Blogdet = pdinv(K + B_inv)

            # Data fit
            # alpha = linalg.cho_solve(linalg.cho_factor(K + B_inv), self.Y)
            # GP_bound += -0.5 * np.dot(self.Y.T, alpha).trace()
            GP_bound -= .5 * dpotrs(LB, self.YYT)[0].trace()

            # Penalty
            # GP_bound += -0.5 * np.linalg.slogdet(K + B_inv)[1]
            GP_bound -= 0.5 * Blogdet

            # Constant, weighted by  model assignment per point
            #GP_bound += -0.5 * (self.phi[:, i] * np.log(2 * np.pi * self.variance)).sum()
            GP_bound -= .5 * self.D * np.einsum(
                'j,j->', self.phi[:, i], np.log(2 * np.pi * self.variance))

        return GP_bound + self.mixing_prop_bound() + self.H
Example #13
    def vb_grad_natgrad(self):
        """
        Natural Gradients of the bound with respect to phi, the variational
        parameters controlling assignment of the data to GPs
        """
        grad_Lm = np.zeros_like(self.phi)
        for i, kern in enumerate(self.kern):
            K = kern.K(self.X)
            I = np.eye(self.N)

            B_inv = np.diag(1. / ((self.phi[:, i] + 1e-6) / self.variance))
            alpha = np.linalg.solve(K + B_inv, self.Y)
            K_B_inv = pdinv(K + B_inv)[0]
            dL_dB = np.outer(alpha, alpha) - K_B_inv

            for n in range(self.phi.shape[0]):
                grad_B_inv = np.zeros_like(B_inv)
                grad_B_inv[n, n] = -self.variance / (self.phi[n, i] ** 2 + 1e-6)
                grad_Lm[n, i] = 0.5 * np.trace(np.dot(dL_dB, grad_B_inv))

        grad_phi = grad_Lm + self.mixing_prop_bound_grad() + self.Hgrad

        natgrad = grad_phi - np.sum(self.phi * grad_phi, 1)[:, None]
        grad = natgrad * self.phi

        return grad.flatten(), natgrad.flatten()
Example #14
def omgp_model_bound(omgp):
    ''' Calculate the part of the omgp bound which does not depend
    on the response variable.
    '''
    GP_bound = 0.0

    LBs = []
    # Precalculate the bound minus data fit,
    # and LB matrices used for data fit term.
    for i, kern in enumerate(omgp.kern):
        K = kern.K(omgp.X)
        B_inv = np.diag(1. / ((omgp.phi[:, i] + 1e-6) / omgp.variance))
        Bi, LB, LBi, Blogdet = pdinv(K + B_inv)
        LBs.append(LB)

        # Penalty
        GP_bound -= 0.5 * Blogdet

        # Constant
        GP_bound -= 0.5 * omgp.D * np.einsum('j,j->', omgp.phi[:, i],
                                             np.log(2 * np.pi * omgp.variance))

    model_bound = GP_bound + omgp.mixing_prop_bound() + omgp.H

    return model_bound, LBs
Example #15
    def __init__(self,
                 X,
                 kernF,
                 kernY,
                 Y,
                 K=2,
                 alpha=1.,
                 prior_Z='symmetric',
                 name='MOHGP'):

        N, self.D = Y.shape
        self.Y = Y
        self.X = X
        assert X.shape[0] == self.D, "input data don't match observations"

        CollapsedMixture.__init__(self, N, K, prior_Z, alpha, name)

        self.kernF = kernF
        self.kernY = kernY
        self.link_parameters(self.kernF, self.kernY)

        #initialize kernels
        self.Sf = self.kernF.K(self.X)
        self.Sy = self.kernY.K(self.X)
        self.Sy_inv, self.Sy_chol, self.Sy_chol_inv, self.Sy_logdet = pdinv(
            self.Sy + np.eye(self.D) * 1e-6)

        #Computations that can be done outside the optimisation loop
        self.YYT = self.Y[:, :, np.newaxis] * self.Y[:, np.newaxis, :]
        self.YTY = np.dot(self.Y.T, self.Y)

        self.do_computations()
Example #16
    def inference(self,
                  kern,
                  X,
                  W,
                  likelihood,
                  Y,
                  mean_function=None,
                  Y_metadata=None,
                  K=None,
                  variance=None,
                  Z_tilde=None):
        """
        Returns a Posterior class containing essential quantities of the posterior
        """

        if mean_function is None:
            m = 0
        else:
            m = mean_function.f(X)

        if variance is None:
            variance = likelihood.gaussian_variance(Y_metadata)

        YYT_factor = Y - m

        if K is None:
            K = kern.K(X)

        Ky = K.copy()

        diag.add(Ky, variance + 1e-8)

        Wi, LW, LWi, W_logdet = pdinv(Ky)

        alpha, _ = dpotrs(LW, YYT_factor, lower=1)

        log_marginal = 0.5 * (-Y.size * log_2_pi - Y.shape[1] * W_logdet -
                              np.sum(alpha * YYT_factor))

        if Z_tilde is not None:
            # This is a correction term for the log marginal likelihood
            # In EP this is log Z_tilde, which is the difference between the
            # Gaussian marginal and Z_EP
            log_marginal += Z_tilde

        dL_dK = 0.5 * (tdot(alpha) - Y.shape[1] * Wi)

        dL_dthetaL = likelihood.exact_inference_gradients(
            np.diag(dL_dK), Y_metadata)

        posterior_ = Posterior(woodbury_chol=LW, woodbury_vector=alpha, K=K)

        return posterior_, log_marginal, {
            'dL_dK': dL_dK,
            'dL_dthetaL': dL_dthetaL,
            'dL_dm': alpha
        }, W_logdet
Example #17
 def __init__(self, mu, var):
     self.mu = np.array(mu).flatten()
     self.var = np.array(var)
     assert len(self.var.shape) == 2
     assert self.var.shape[0] == self.var.shape[1]
     assert self.var.shape[0] == self.mu.size
     self.input_dim = self.mu.size
     self.inv, self.hld = pdinv(self.var)
     self.constant = -0.5 * self.input_dim * np.log(2 * np.pi)
Example #18
    def parameters_changed(self):
        """ Set the kernel parameters. Note that the variational parameters are handled separately."""
        #get the latest kernel matrices, decompose
        self.Sf = self.kernF.K(self.X)
        self.Sy = self.kernY.K(self.X)
        self.Sy_inv, self.Sy_chol, self.Sy_chol_inv, self.Sy_logdet = pdinv(self.Sy+np.eye(self.D)*1e-6)

        #update everything
        self.do_computations()
        self.update_kern_grads()
Example #19
    def parameters_changed(self):
        """ Set the kernel parameters. Note that the variational parameters are handled separately."""
        #get the latest kernel matrices, decompose
        self.Sf = self.kernF.K(self.X)
        self.Sy = self.kernY.K(self.X)
        self.Sy_inv, self.Sy_chol, self.Sy_chol_inv, self.Sy_logdet = pdinv(
            self.Sy + np.eye(self.D) * 1e-6)

        #update everything
        self.do_computations()
        self.update_kern_grads()
Example #20
    def _set_params(self,x):
        """ Set the kernel parameters. Note that the variational parameters are handled separately."""
        #set the kernels with their parameters
        self.kernF._set_params_transformed(x[:self.kernF.num_params])
        self.kernY._set_params_transformed(x[self.kernF.num_params:])

        #get the latest kernel matrices, decompose
        self.Sf = self.kernF.K(self.X)
        self.Sy = self.kernY.K(self.X)
        self.Sy_inv, self.Sy_chol, self.Sy_chol_inv, self.Sy_logdet = pdinv(self.Sy)

        #update everything
        self.do_computations()
Example #21
    def _set_params(self, x):
        """ Set the kernel parameters. Note that the variational parameters are handled separately."""
        #set the kernels with their parameters
        self.kernF._set_params_transformed(x[:self.kernF.num_params])
        self.kernY._set_params_transformed(x[self.kernF.num_params:])

        #get the latest kernel matrices, decompose
        self.Sf = self.kernF.K(self.X)
        self.Sy = self.kernY.K(self.X)
        self.Sy_inv, self.Sy_chol, self.Sy_chol_inv, self.Sy_logdet = pdinv(
            self.Sy)

        #update everything
        self.do_computations()
Example #22
 def comp_Ckk(X, kernel, mean_i, cov_i):
     """Compute C_kk = \int k(X,x') k(x',X) Normal_i dx'."""
     ndim = kernel.input_dim
     var = kernel.variance
     Xsh = (X - mean_i) / 2.
     ell2 = np.ones((ndim, )) * kernel.lengthscale**2
     sqrt_det = np.power(np.prod(ell2), 1 / 2.)
     cov = cov_i + 0.5 * np.diag(ell2)
     cov_inv, _, _, ld, = pdinv(cov)
     X1s = np.sum(Xsh * np.dot(Xsh, cov_inv), 1)
     arg = 2.*np.dot(Xsh, np.dot(Xsh, cov_inv).T) \
           + X1s[:,None] + X1s[None,:]
     con = -0.5 * (ndim * np.log(2 * np.pi) + ld)
     zc = np.exp(con - 0.5 * arg)
     norm_const = var * np.power(np.pi, ndim/2.0) * sqrt_det \
                  * kernel.K(X/np.sqrt(2))
     return norm_const * zc, cov_inv
Example #23
def recompute_posterior_mf(
    alpha: np.ndarray, beta: np.ndarray, K: np.ndarray
) -> Tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray, np.ndarray,
           np.ndarray]:
    """
    Recompute the posterior approximation (for the mean field approximation) mean: K alpha, covariance inv(K + beta)
    :param alpha: Alpha vector used to parametrize the posterior approximation
    :param beta: Beta vector/matrix used to parametrize the posterior approximation
    :param K: prior covariance
    :return: Tuple containing the mean and cholesky of the covariance, its inverse and derivatives of the KL divergence with respect to beta and alpha
    """
    N = alpha.shape[0]
    # Lambda = diag(lam) = diag(beta**2)
    lam_sqrt = beta.ravel()
    lam = beta.ravel()**2

    # Handle A = I + Lambda*K*Lambda
    KB = K @ np.diag(lam_sqrt)
    BKB = np.diag(lam_sqrt) @ KB
    A = np.eye(N) + BKB
    Ai, LA, Li, Alogdet = pdinv(A)

    # Compute Mean
    m = K @ alpha

    # Compute covariance matrix
    W = Li @ np.diag(
        1.0 / lam_sqrt
    )  # can be accelerated using broadcasting instead of matrix multiplication
    Sigma = (
        np.diag(1.0 / lam) - W.T @ W
    )  # computes np.diag(1./lam) - np.diag(1. / lam_sqrt) @ Ai @ np.diag(1. / lam_sqrt)

    # Compute KL
    KL = 0.5 * (Alogdet + np.trace(Ai) - N + np.sum(m * alpha))

    # Compute Gradients
    A_A2 = Ai - Ai.dot(Ai)
    dKL_db = np.diag(np.dot(KB.T, A_A2)).reshape(-1, 1)
    # dKL_da = K @ alpha
    dKL_da = m.copy()

    L = GPy.util.linalg.jitchol(Sigma)
    L_inv = np.linalg.inv(L)

    return m, L, L_inv, KL, dKL_db, dKL_da
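The `Sigma` computed above is, via the matrix inversion lemma, equal to (K^{-1} + diag(beta^2))^{-1}. A hedged numerical check of that equivalence in plain NumPy, with `np.linalg.cholesky`/`inv` standing in for `pdinv`:

import numpy as np

rng = np.random.default_rng(1)
N = 5
beta = rng.uniform(0.5, 2.0, size=(N, 1))
K = np.eye(N) + 0.3 * np.ones((N, N))            # simple positive-definite prior covariance

lam_sqrt = beta.ravel()
A = np.eye(N) + np.diag(lam_sqrt) @ K @ np.diag(lam_sqrt)
Li = np.linalg.inv(np.linalg.cholesky(A))        # inverse of the lower Cholesky factor of A
W = Li @ np.diag(1.0 / lam_sqrt)
Sigma = np.diag(1.0 / lam_sqrt**2) - W.T @ W

assert np.allclose(Sigma, np.linalg.inv(np.linalg.inv(K) + np.diag(lam_sqrt**2)))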
Example #24
    def predict(self, Xnew, i):
        """ Predictive mean for a given component
        """
        kern = self.kern[i]
        K = kern.K(self.X)
        kx = kern.K(self.X, Xnew)

        # Predict mean
        # This works, but should use a Cholesky factorisation for stability
        B_inv = np.diag(1. / (self.phi[:, i] / self.variance))
        K_B_inv = pdinv(K + B_inv)[0]
        mu = kx.T.dot(np.dot(K_B_inv, self.Y))

        # Predict variance
        kxx = kern.K(Xnew, Xnew)
        va = self.variance + kxx - kx.T.dot(np.dot(K_B_inv, kx))

        return mu, va
Example #25
    def predict(self, Xnew, i):
        """ Predictive mean for a given component
        """
        kern = self.kern[i]
        K = kern.K(self.X)
        kx = kern.K(self.X, Xnew)

        # Predict mean
        # This works, but should use a Cholesky factorisation for stability
        B_inv = np.diag(1. / (self.phi[:, i] / self.variance))
        K_B_inv = pdinv(K + B_inv)[0]
        mu = kx.T.dot(np.dot(K_B_inv, self.Y))

        # Predict variance
        kxx = kern.K(Xnew, Xnew)
        va = self.variance + kxx - kx.T.dot(np.dot(K_B_inv, kx))

        return mu, va
Example #26
    def __init__(self, X, kernF, kernY, Y, K=2, alpha=1., prior_Z='symmetric'):
        N, self.D = Y.shape
        self.Y = Y
        self.X = X
        assert X.shape[0] == self.D, "input data don't match observations"

        #initialize kernels
        self.kernF = kernF
        self.kernY = kernY
        self.Sf = self.kernF.K(self.X)
        self.Sy = self.kernY.K(self.X)
        self.Sy_inv, self.Sy_chol, self.Sy_chol_inv, self.Sy_logdet = pdinv(
            self.Sy)

        #Computations that can be done outside the optimisation loop
        self.YYT = self.Y[:, :, np.newaxis] * self.Y[:, np.newaxis, :]
        self.YTY = np.dot(self.Y.T, self.Y)

        collapsed_mixture.__init__(self, N, K, prior_Z, alpha)
Example #27
    def __init__(self, X, kernF, kernY, Y, K=2, alpha=1., prior_Z='symmetric'):
        N,self.D = Y.shape
        self.Y = Y
        self.X = X
        assert X.shape[0]==self.D, "input data don't match observations"

        #initialize kernels
        self.kernF = kernF
        self.kernY = kernY
        self.Sf = self.kernF.K(self.X)
        self.Sy = self.kernY.K(self.X)
        self.Sy_inv, self.Sy_chol, self.Sy_chol_inv, self.Sy_logdet = pdinv(self.Sy)


        #Computations that can be done outside the optimisation loop
        self.YYT = self.Y[:,:,np.newaxis]*self.Y[:,np.newaxis,:]
        self.YTY = np.dot(self.Y.T,self.Y)

        collapsed_mixture.__init__(self, N, K, prior_Z, alpha)
Example #28
    def update_kern_grads(self):
        """
        Set the derivative of the lower bound wrt the (kernel) parameters
        """
        grad_Lm_variance = 0.0

        for i, kern in enumerate(self.kern):
            K = kern.K(self.X)
            B_inv = np.diag(1. / (self.phi[:, i] / self.variance))

            # Numerically more stable version using cholesky decomposition
            #alpha = linalg.cho_solve(linalg.cho_factor(K + B_inv), self.Y)
            #K_B_inv = pdinv(K + B_inv)[0]
            #dL_dK = .5*(tdot(alpha) - K_B_inv)

            # Make more stable using cholesky factorization:
            Bi, LB, LBi, Blogdet = pdinv(K + B_inv)

            tmp = dpotrs(LB, self.YYT)[0]
            GPy.util.diag.subtract(tmp, 1)
            dL_dB = dpotrs(LB, tmp.T)[0]

            kern.update_gradients_full(dL_dK=.5 * dL_dB, X=self.X)

            # variance gradient

            #for i, kern in enumerate(self.kern):
            K = kern.K(self.X)
            #I = np.eye(self.N)

            B_inv = np.diag(1. / ((self.phi[:, i] + 1e-6) / self.variance))
            #alpha = np.linalg.solve(K + B_inv, self.Y)
            #K_B_inv = pdinv(K + B_inv)[0]
            #dL_dB = tdot(alpha) - K_B_inv
            grad_B_inv = np.diag(1. / (self.phi[:, i] + 1e-6))

            grad_Lm_variance += 0.5 * np.trace(np.dot(dL_dB, grad_B_inv))
            grad_Lm_variance -= .5 * self.D * np.einsum(
                'j,j->', self.phi[:, i], 1. / self.variance)

        self.variance.gradient = grad_Lm_variance
Example #29
    def init_model(self, xvals, zvals):
        # Update internal data
        self.xvals = xvals
        self.zvals = zvals

        self._K = self.kern.K(self.xvals)

        Ky = self._K.copy()

        # Adds some additional noise to ensure well-conditioned
        diag.add(Ky, self.noise + 1e-8)
        Wi, LW, LWi, W_logdet = pdinv(Ky)

        self._woodbury_inv = Wi
        self._woodbury_vector = np.dot(self._woodbury_inv, self.zvals)

        self._woodbury_chol = None
        self._mean = None
        self._covariance = None
        self._prior_mean = 0.
        self._K_chol = None
Example #30
    def update_kern_grads(self):
        """
        Set the derivative of the lower bound wrt the (kernel) parameters
        """
        grad_Lm_variance = 0.0

        for i, kern in enumerate(self.kern):
            K = kern.K(self.X)
            B_inv = np.diag(1. / (self.phi[:, i] / self.variance))

            # Numerically more stable version using cholesky decomposition
            #alpha = linalg.cho_solve(linalg.cho_factor(K + B_inv), self.Y)
            #K_B_inv = pdinv(K + B_inv)[0]
            #dL_dK = .5*(tdot(alpha) - K_B_inv)

            # Make more stable using cholesky factorization:
            Bi, LB, LBi, Blogdet = pdinv(K+B_inv)

            tmp = dpotrs(LB, self.YYT)[0]
            GPy.util.diag.subtract(tmp, 1)
            dL_dB = dpotrs(LB, tmp.T)[0]

            kern.update_gradients_full(dL_dK=.5*dL_dB, X=self.X)

            # variance gradient

            #for i, kern in enumerate(self.kern):
            K = kern.K(self.X)
            #I = np.eye(self.N)

            B_inv = np.diag(1. / ((self.phi[:, i] + 1e-6) / self.variance))
            #alpha = np.linalg.solve(K + B_inv, self.Y)
            #K_B_inv = pdinv(K + B_inv)[0]
            #dL_dB = tdot(alpha) - K_B_inv
            grad_B_inv = np.diag(1. / (self.phi[:, i] + 1e-6))

            grad_Lm_variance += 0.5 * np.trace(np.dot(dL_dB, grad_B_inv))
            grad_Lm_variance -= .5*self.D * np.einsum('j,j->',self.phi[:, i], 1./self.variance)

        self.variance.gradient = grad_Lm_variance
Example #31
    def vb_grad_natgrad(self):
        """
        Natural Gradients of the bound with respect to phi, the variational
        parameters controlling assignment of the data to GPs
        """
        grad_Lm = np.zeros_like(self.phi)
        for i, kern in enumerate(self.kern):
            K = kern.K(self.X)
            I = np.eye(self.N)

            B_inv = np.diag(1. / ((self.phi[:, i] + 1e-6) / self.variance))
            K_B_inv, L_B, _, _ = pdinv(K + B_inv)
            alpha, _ = dpotrs(L_B, self.Y)
            dL_dB_diag = np.sum(np.square(alpha), 1) - np.diag(K_B_inv)

            grad_Lm[:,i] = -0.5 * self.variance * dL_dB_diag / (self.phi[:,i]**2 + 1e-6) 
            
        grad_phi = grad_Lm + self.mixing_prop_bound_grad() + self.Hgrad

        natgrad = grad_phi - np.sum(self.phi * grad_phi, 1)[:, None]
        grad = natgrad * self.phi

        return grad.flatten(), natgrad.flatten()
Example #32
    def __init__(self, X, kernF, kernY, Y, K=2, alpha=1., prior_Z='symmetric', name='MOHGP'):

        N,self.D = Y.shape
        self.Y = Y
        self.X = X
        assert X.shape[0]==self.D, "input data don't match observations"

        CollapsedMixture.__init__(self, N, K, prior_Z, alpha, name)

        self.kernF = kernF
        self.kernY = kernY
        self.link_parameters(self.kernF, self.kernY)

        #initialize kernels
        self.Sf = self.kernF.K(self.X)
        self.Sy = self.kernY.K(self.X)
        self.Sy_inv, self.Sy_chol, self.Sy_chol_inv, self.Sy_logdet = pdinv(self.Sy+np.eye(self.D)*1e-6)

        #Computations that can be done outside the optimisation loop
        self.YYT = self.Y[:,:,np.newaxis]*self.Y[:,np.newaxis,:]
        self.YTY = np.dot(self.Y.T,self.Y)

        self.do_computations()
Example #33
def omgp_model_bound(omgp):
    ''' Calculate the part of the omgp bound which does not depend
    on the response variable.
    '''
    GP_bound = 0.0

    LBs = []
    # Precalculate the bound minus data fit,
    # and LB matrices used for data fit term.
    for i, kern in enumerate(omgp.kern):
        K = kern.K(omgp.X)
        B_inv = np.diag(1. / ((omgp.phi[:, i] + 1e-6) / omgp.variance))
        Bi, LB, LBi, Blogdet = pdinv(K + B_inv)
        LBs.append(LB)

        # Penalty
        GP_bound -= 0.5 * Blogdet

        # Constant
        GP_bound -= 0.5 * omgp.D * np.einsum('j,j->', omgp.phi[:, i], np.log(2 * np.pi * omgp.variance))

    model_bound = GP_bound + omgp.mixing_prop_bound() + omgp.H

    return model_bound, LBs
Example #34
    def vb_grad_natgrad(self):
        """
        Natural Gradients of the bound with respect to phi, the variational
        parameters controlling assignment of the data to GPs
        """
        grad_Lm = np.zeros_like(self.phi)
        for i, kern in enumerate(self.kern):
            K = kern.K(self.X)
            I = np.eye(self.N)

            B_inv = np.diag(1. / ((self.phi[:, i] + 1e-6) / self.variance))
            K_B_inv, L_B, _, _ = pdinv(K + B_inv)
            alpha, _ = dpotrs(L_B, self.Y)
            dL_dB_diag = np.sum(np.square(alpha), 1) - np.diag(K_B_inv)

            grad_Lm[:, i] = -0.5 * self.variance * dL_dB_diag / (
                self.phi[:, i]**2 + 1e-6)

        grad_phi = grad_Lm + self.mixing_prop_bound_grad() + self.Hgrad

        natgrad = grad_phi - np.sum(self.phi * grad_phi, 1)[:, None]
        grad = natgrad * self.phi

        return grad.flatten(), natgrad.flatten()
Example #35
    def inference(self,
                  kern,
                  X,
                  Z,
                  likelihood,
                  Y,
                  mean_function=None,
                  Y_metadata=None):
        assert mean_function is None, "inference with a mean function not implemented"

        num_inducing, _ = Z.shape
        num_data, output_dim = Y.shape

        #make sure the noise is not hetero
        sigma_n = likelihood.gaussian_variance(Y_metadata)
        if sigma_n.size > 1:
            raise NotImplementedError(
                "no hetero noise with this implementation of PEP")

        Kmm = kern.K(Z)
        Knn = kern.Kdiag(X)
        Knm = kern.K(X, Z)
        U = Knm

        #factor Kmm
        diag.add(Kmm, self.const_jitter)
        Kmmi, L, Li, _ = pdinv(Kmm)

        #compute beta_star, the effective noise precision
        LiUT = np.dot(Li, U.T)
        sigma_star = sigma_n + self.alpha * (Knn - np.sum(np.square(LiUT), 0))
        beta_star = 1. / sigma_star

        # Compute and factor A
        A = tdot(LiUT * np.sqrt(beta_star)) + np.eye(num_inducing)
        LA = jitchol(A)

        # back substitute to get b, P, v
        URiy = np.dot(U.T * beta_star, Y)
        tmp, _ = dtrtrs(L, URiy, lower=1)
        b, _ = dtrtrs(LA, tmp, lower=1)
        tmp, _ = dtrtrs(LA, b, lower=1, trans=1)
        v, _ = dtrtrs(L, tmp, lower=1, trans=1)
        tmp, _ = dtrtrs(LA, Li, lower=1, trans=0)
        P = tdot(tmp.T)

        alpha_const_term = (1.0 - self.alpha) / self.alpha

        #compute log marginal
        log_marginal = -0.5*num_data*output_dim*np.log(2*np.pi) + \
                       -np.sum(np.log(np.diag(LA)))*output_dim + \
                       0.5*output_dim*(1+alpha_const_term)*np.sum(np.log(beta_star)) + \
                       -0.5*np.sum(np.square(Y.T*np.sqrt(beta_star))) + \
                       0.5*np.sum(np.square(b)) + 0.5*alpha_const_term*num_data*np.log(sigma_n)
        #compute dL_dR
        Uv = np.dot(U, v)
        dL_dR = 0.5*(np.sum(U*np.dot(U,P), 1) - (1.0+alpha_const_term)/beta_star + np.sum(np.square(Y), 1) - 2.*np.sum(Uv*Y, 1) \
            + np.sum(np.square(Uv), 1))*beta_star**2

        # Compute dL_dKmm
        vvT_P = tdot(v.reshape(-1, 1)) + P
        dL_dK = 0.5 * (Kmmi - vvT_P)
        KiU = np.dot(Kmmi, U.T)
        dL_dK += self.alpha * np.dot(KiU * dL_dR, KiU.T)

        # Compute dL_dU
        vY = np.dot(v.reshape(-1, 1), Y.T)
        dL_dU = vY - np.dot(vvT_P, U.T)
        dL_dU *= beta_star
        dL_dU -= self.alpha * 2. * KiU * dL_dR

        dL_dthetaL = likelihood.exact_inference_gradients(dL_dR)
        dL_dthetaL += 0.5 * alpha_const_term * num_data / sigma_n
        grad_dict = {
            'dL_dKmm': dL_dK,
            'dL_dKdiag': dL_dR * self.alpha,
            'dL_dKnm': dL_dU.T,
            'dL_dthetaL': dL_dthetaL
        }

        #construct a posterior object
        post = Posterior(woodbury_inv=Kmmi - P,
                         woodbury_vector=v,
                         K=Kmm,
                         mean=None,
                         cov=None,
                         K_chol=L)

        return post, log_marginal, grad_dict
Example #36
    def update_model(self,
                     x,
                     y,
                     opt_hyp=False,
                     replace_old=True,
                     noise_diag=1e-5,
                     choose_data=True):
        """ Update the model based on the current settings and new data

        Parameters
        ----------
        x: n x (n_s + n_u) array[float]
            The training set
        y: n x n_s
            The training targets
        opt_hyp: bool, optional
            If this is set to True, the hyperparameters are re-optimized
        """
        if replace_old:
            x_new = x
            y_new = y
        else:
            x_new = np.vstack((self.x_train, x))
            y_new = np.vstack((self.y_train, y))

        if opt_hyp or not self.gp_trained:
            self.train(x_new, y_new, self.m, opt_hyp=opt_hyp, Z=self.Z)
        else:
            n_data = np.shape(x_new)[0]
            inv_K = [None] * self.n_s_out
            if self.m is None:
                n_beta = n_data
                Z = x_new
                y_z = y_new
            else:

                if n_data < self.m:
                    warnings.warn(
                        """The desired number of datapoints is not available. Dataset consist of {}
                           Datapoints! """.format(n_data))
                    Z = x_new
                    y_z = y_new
                    n_beta = n_data
                else:
                    if choose_data:
                        Z, y_z = self.choose_datapoints_maxvar(
                            x_new, y_new, self.m)

                    else:
                        idx = np.random.choice(n_data,
                                               size=self.m,
                                               replace=False)
                        Z = x_new[idx, :]
                        y_z = y_new[idx, :]
                    n_beta = self.m

            beta = np.empty((n_beta, self.n_s_out))

            for i in range(self.n_s_out):
                if self.do_sparse_gp:
                    self.gps[i].set_XY(x_new, y_new[:, i].reshape(-1, 1))
                    if not self.z_fixed:
                        self.gps[i].set_Z(Z)
                else:
                    self.gps[i].set_XY(Z, y_z[:, i].reshape(-1, 1))

                post = self.gps[i].posterior

                if noise_diag > 0.0:
                    inv_K[i] = pdinv(
                        post._K + float(self.gps[i].Gaussian_noise.variance +
                                        noise_diag) * np.eye(n_beta))[0]
                else:
                    inv_K[i] = post.woodbury_inv

                beta[:, i] = post.woodbury_vector.reshape(-1, )

            self.x_train = x_new
            self.y_train = y_new
            self.z = Z
            self.inv_K = inv_K
            self.beta = beta
Example #37
    def predict_value(self, xvals, include_noise=True, full_cov=False):
        # Calculate for the test point
        assert (xvals.shape[0] >= 1)
        assert (xvals.shape[1] == self.dimension)
        n_points, input_dim = xvals.shape

        # With no observations, predict 0 mean everywhere and prior variance
        if self.xvals is None:
            return np.zeros((n_points, 1)), np.ones(
                (n_points, 1)) * self.variance

        # Find neighbors within radius
        point_group = self.spatial_tree.query_ball_point(
            xvals, self.neighbor_radius)

        point_list = []
        for points in point_group:
            for index in points:
                point_list.append(index)

        point_set = set(point_list)
        xpoints = [self.xvals[index] for index in point_set]
        zpoints = [self.zvals[index] for index in point_set]
        # print "Size before:", len(xpoints)

        # Brute force check the points in the waiting queue
        if self.xwait is not None and self.xwait.shape[0] > 0:
            wait_list = []
            for i, u in enumerate(self.xwait):
                for j, v in enumerate(xvals):
                    # if xvals.shape[0] < 10:
                    #     print "Comparing", i, j
                    #     print "Points:", u, v
                    dist = sp.spatial.distance.minkowski(u, v, p=2.0)
                    if dist <= self.neighbor_radius:
                        wait_list.append(i)
                        # if xvals.shape[0] < 10:
                        #     print "Adding point", u

            # if xvals.shape[0] < 10:
            #     print "The wait list:", wait_list

            wait_set = set(wait_list)

            xpoints = [self.xwait[index] for index in wait_set] + xpoints
            zpoints = [self.zwait[index] for index in wait_set] + zpoints
            # print "Size after:", len(xpoints)

        xpoints = np.array(xpoints).reshape(-1, 2)
        zpoints = np.array(zpoints).reshape(-1, 1)

        if xpoints.shape[0] == 0:
            print("No nearby points!")
            return np.zeros((n_points, 1)), np.ones(
                (n_points, 1)) * self.variance

        # if self.xvals is not None:
        #     print "Size of kernel array:", self.xvals
        # if self.xwait is not None:
        #     print "Size of wait array:", self.xwait.shape
        # if xpoints is not None:
        #     print "Size of returned points:", xpoints.shape

        Kx = self.kern.K(xpoints, xvals)
        K = self.kern.K(xpoints, xpoints)

        # Adds some additional noise to ensure well-conditioned
        Ky = K.copy()
        diag.add(Ky, self.noise + 1e-8)

        Wi, LW, LWi, W_logdet = pdinv(Ky)
        woodbury_inv = Wi
        woodbury_vector = np.dot(woodbury_inv, zpoints)

        mu = np.dot(Kx.T, woodbury_vector)
        if len(mu.shape) == 1:
            mu = mu.reshape(-1, 1)
        if full_cov:
            Kxx = self.kern.K(xvals)
            if self.woodbury_inv.ndim == 2:
                var = Kxx - np.dot(Kx.T, np.dot(woodbury_inv, Kx))
        else:
            Kxx = self.kern.Kdiag(xvals)
            var = (Kxx - np.sum(np.dot(woodbury_inv.T, Kx) * Kx, 0))[:, None]

        # If model noise should be included in the prediction
        if include_noise:
            var += self.noise

        update_legacy = False
        if update_legacy:
            # With no observations, predict 0 mean everywhere and prior variance
            if self.model is None:
                mean, variance = np.zeros((n_points, 1)), np.ones(
                    (n_points, 1)) * self.variance

            # Else, return the predicted values
            mean, variance = self.model.predict(
                xvals, full_cov=False, include_likelihood=include_noise)
            if xvals.shape[0] < 10:
                # print "-------- MEAN ------------"
                # print "spatial method:"
                # print mu
                # print "default method:"
                # print mean
                # print "-------- VARIANCE ------------"
                # print "spatial method:"
                # print var
                # print "default method:"
                # print variance

                print(np.sum(mu - mean))
                print(np.sum(var - variance))

        return mu, var
Example #38
    def inference(self, kern, X, Z, likelihood, Y, mean_function=None, Y_metadata=None):
        assert mean_function is None, "inference with a mean function not implemented"

        num_inducing, _ = Z.shape
        num_data, output_dim = Y.shape

        #make sure the noise is not hetero
        sigma_n = likelihood.gaussian_variance(Y_metadata)
        if sigma_n.size >1:
            raise NotImplementedError("no hetero noise with this implementation of PEP")

        Kmm = kern.K(Z)
        Knn = kern.Kdiag(X)
        Knm = kern.K(X, Z)
        U = Knm

        #factor Kmm
        diag.add(Kmm, self.const_jitter)
        Kmmi, L, Li, _ = pdinv(Kmm)

        #compute beta_star, the effective noise precision
        LiUT = np.dot(Li, U.T)
        sigma_star = sigma_n + self.alpha * (Knn - np.sum(np.square(LiUT),0))
        beta_star = 1./sigma_star

        # Compute and factor A
        A = tdot(LiUT*np.sqrt(beta_star)) + np.eye(num_inducing)
        LA = jitchol(A)

        # back substitute to get b, P, v
        URiy = np.dot(U.T*beta_star,Y)
        tmp, _ = dtrtrs(L, URiy, lower=1)
        b, _ = dtrtrs(LA, tmp, lower=1)
        tmp, _ = dtrtrs(LA, b, lower=1, trans=1)
        v, _ = dtrtrs(L, tmp, lower=1, trans=1)
        tmp, _ = dtrtrs(LA, Li, lower=1, trans=0)
        P = tdot(tmp.T)

        alpha_const_term = (1.0-self.alpha) / self.alpha

        #compute log marginal
        log_marginal = -0.5*num_data*output_dim*np.log(2*np.pi) + \
                       -np.sum(np.log(np.diag(LA)))*output_dim + \
                       0.5*output_dim*(1+alpha_const_term)*np.sum(np.log(beta_star)) + \
                       -0.5*np.sum(np.square(Y.T*np.sqrt(beta_star))) + \
                       0.5*np.sum(np.square(b)) + 0.5*alpha_const_term*num_data*np.log(sigma_n)
        #compute dL_dR
        Uv = np.dot(U, v)
        dL_dR = 0.5*(np.sum(U*np.dot(U,P), 1) - (1.0+alpha_const_term)/beta_star + np.sum(np.square(Y), 1) - 2.*np.sum(Uv*Y, 1) \
            + np.sum(np.square(Uv), 1))*beta_star**2 

        # Compute dL_dKmm
        vvT_P = tdot(v.reshape(-1,1)) + P
        dL_dK = 0.5*(Kmmi - vvT_P)
        KiU = np.dot(Kmmi, U.T)
        dL_dK += self.alpha * np.dot(KiU*dL_dR, KiU.T)

        # Compute dL_dU
        vY = np.dot(v.reshape(-1,1),Y.T)
        dL_dU = vY - np.dot(vvT_P, U.T)
        dL_dU *= beta_star
        dL_dU -= self.alpha * 2.*KiU*dL_dR

        dL_dthetaL = likelihood.exact_inference_gradients(dL_dR)
        dL_dthetaL += 0.5*alpha_const_term*num_data / sigma_n
        grad_dict = {'dL_dKmm': dL_dK, 'dL_dKdiag':dL_dR * self.alpha, 'dL_dKnm':dL_dU.T, 'dL_dthetaL':dL_dthetaL}

        #construct a posterior object
        post = Posterior(woodbury_inv=Kmmi-P, woodbury_vector=v, K=Kmm, mean=None, cov=None, K_chol=L)

        return post, log_marginal, grad_dict
Example #39
 def __init__(self, domain, mu, cov):
     super().__init__(domain)
     self.mu, self.cov = process_parameters(self.input_dim, mu, cov)
     self.inv, _, _, ld, = pdinv(self.cov)
     self.constant = -0.5*(self.input_dim * np.log(2*np.pi) + ld)
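Example #39 caches the precision matrix and the log-normalising constant -0.5*(d*log(2*pi) + log|cov|). A hedged sketch of how those cached pieces give the Gaussian log density, checked against SciPy (NumPy routines standing in for `pdinv`):

import numpy as np
from scipy.stats import multivariate_normal

mu = np.array([0.5, -1.0])
cov = np.array([[1.0, 0.2],
                [0.2, 0.8]])
inv = np.linalg.inv(cov)
ld = np.linalg.slogdet(cov)[1]
constant = -0.5 * (mu.size * np.log(2 * np.pi) + ld)

x = np.array([0.0, 0.0])
logpdf = constant - 0.5 * (x - mu) @ inv @ (x - mu)
assert np.isclose(logpdf, multivariate_normal(mean=mu, cov=cov).logpdf(x))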