Example #1
    def parameters_changed(self):
        K = self.kern.K(self.X)
        self.L = L = GPy.util.linalg.jitchol(K)
        F = self.F = blas.dtrmm(1.0, L, self.V, lower=1, trans_a=0)

        #compute the log likelihood
        self._loglik = self.likelihood.logpdf(F, self.Y).sum()
        dL_dF = self.likelihood.dlogpdf_df(F, self.Y)
        self.likelihood.gradient = self.likelihood.dlogpdf_dtheta(F, self.Y).sum(1).sum(1)

        #here's the prior for V
        self._loglik += -0.5*self.num_data*self.output_dim*np.log(2*np.pi) - 0.5*np.sum(np.square(self.V))

        #set all gradients to zero, then only compute necessary gradients.
        self.gradient = 0.

        #compute dL_dV
        if not self.V.is_fixed:
            self.V.gradient = -self.V + blas.dtrmm(1.0, L, dL_dF, trans_a=1, lower=1)

        #compute kernel gradients
        if not self.kern.is_fixed:
            dL_dL = np.dot(dL_dF, self.V.T) # where the first L is the likelihood, the second L is the triangular matrix
            #nasty reverse Cholesky to get dL_dK
            dL_dK = GPy.util.choleskies.backprop_gradient(dL_dL, L)
            self.kern.update_gradients_full(dL_dK, self.X)
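For reference, a hedged reading of the algebra this method implements, assuming V holds whitened latent values and K = L L^T comes from the jittered Cholesky:

    F = L V,   V ~ N(0, I)   (so each column of F has prior N(0, K))
    \log p(Y, V) = \sum_{n,d} \log p(y_{nd} \mid f_{nd}) - \tfrac{N D}{2}\log 2\pi - \tfrac{1}{2}\|V\|_F^2
    \partial \log p / \partial V = -V + L^\top \, (\partial \log p / \partial F)
    \partial \log p / \partial L = (\partial \log p / \partial F) \, V^\top

The last expression is what dL_dL holds; backprop_gradient then pushes it through the Cholesky factorisation to obtain dL_dK for the kernel gradients.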
Example #2
    def parameters_changed(self):
        K = self.kern.K(self.X)
        self.L = L = GPy.util.linalg.jitchol(K)
        F = self.F = blas.dtrmm(1.0, L, self.V, lower=1, trans_a=0)

        #compute the log likelihood
        self._loglik = self.likelihood.logpdf(F, self.Y).sum()
        dL_dF = self.likelihood.dlogpdf_df(F, self.Y)
        self.likelihood.gradient = self.likelihood.dlogpdf_dtheta(
            F, self.Y).sum(1).sum(1)

        #here's the prior for V
        self._loglik += -0.5 * self.num_data * self.output_dim * np.log(
            2 * np.pi) - 0.5 * np.sum(np.square(self.V))

        #set all gradients to zero, then only compute necessary gradients.
        self.gradient = 0.

        #compute dL_dV
        if not self.V.is_fixed:
            self.V.gradient = -self.V + blas.dtrmm(
                1.0, L, dL_dF, trans_a=1, lower=1)

        #compute kernel gradients
        if not self.kern.is_fixed:
            dL_dL = np.dot(
                dL_dF, self.V.T
            )  # where the first L is the likelihood, the second L is the triangular matrix
            #nasty reverse Cholesky to get dL_dK
            dL_dK = GPy.util.choleskies.backprop_gradient(dL_dL, L)
            self.kern.update_gradients_full(dL_dK, self.X)
Example #3
    def calculate_mu_var(self, X, Y, Z, q_u_mean, q_u_chol, kern, mean_function, num_inducing, num_data, num_outputs):
        """
        Calculate posterior mean and variance for the latent function values for use in the
        expectation over the likelihood
        """
        #expand cholesky representation
        L = choleskies.flat_to_triang(q_u_chol)
        #S = linalg.ijk_ljk_to_ilk(L, L) #L.dot(L.T)
        S = np.empty((num_outputs, num_inducing, num_inducing))
        [np.dot(L[i,:,:], L[i,:,:].T, S[i,:,:]) for i in range(num_outputs)]
        #logdetS = np.array([2.*np.sum(np.log(np.abs(np.diag(L[:,:,i])))) for i in range(L.shape[-1])])
        logdetS = np.array([2.*np.sum(np.log(np.abs(np.diag(L[i,:,:])))) for i in range(L.shape[0])])
        #compute mean function stuff
        if mean_function is not None:
            prior_mean_u = mean_function.f(Z)
            prior_mean_f = mean_function.f(X)
        else:
            prior_mean_u = np.zeros((num_inducing, num_outputs))
            prior_mean_f = np.zeros((num_data, num_outputs))

        #compute kernel related stuff
        Kmm = kern.K(Z)
        #Knm = kern.K(X, Z)
        Kmn = kern.K(Z, X)
        Knn_diag = kern.Kdiag(X)
        #Kmmi, Lm, Lmi, logdetKmm = linalg.pdinv(Kmm)
        Lm = linalg.jitchol(Kmm)
        logdetKmm = 2.*np.sum(np.log(np.diag(Lm)))
        Kmmi, _ = linalg.dpotri(Lm)

        #compute the marginal means and variances of q(f)
        #A = np.dot(Knm, Kmmi)
        A, _ = linalg.dpotrs(Lm, Kmn)
        #mu = prior_mean_f + np.dot(A, q_u_mean - prior_mean_u)
        mu = prior_mean_f + np.dot(A.T, q_u_mean - prior_mean_u)
        #v = Knn_diag[:,None] - np.sum(A*Knm,1)[:,None] + np.sum(A[:,:,None] * linalg.ij_jlk_to_ilk(A, S), 1)
        v = np.empty((num_data, num_outputs))
        for i in range(num_outputs):
            tmp = dtrmm(1.0, L[i].T, A, lower=0, trans_a=0)
            v[:,i] = np.sum(np.square(tmp),0)
        v += (Knn_diag - np.sum(A*Kmn,0))[:,None]

        #compute the KL term
        Kmmim = np.dot(Kmmi, q_u_mean)
        #KLs = -0.5*logdetS -0.5*num_inducing + 0.5*logdetKmm + 0.5*np.einsum('ij,ijk->k', Kmmi, S) + 0.5*np.sum(q_u_mean*Kmmim,0)
        KLs = -0.5*logdetS -0.5*num_inducing + 0.5*logdetKmm + 0.5*np.sum(Kmmi[None,:,:]*S,1).sum(1) + 0.5*np.sum(q_u_mean*Kmmim,0)
        KL = KLs.sum()

        latent_detail = LatentFunctionDetails(q_u_mean=q_u_mean, q_u_chol=q_u_chol, mean_function=mean_function,
                                              mu=mu, v=v, prior_mean_u=prior_mean_u, L=L, A=A,
                                              S=S, Kmm=Kmm, Kmmi=Kmmi, Kmmim=Kmmim, KL=KL)
        return latent_detail
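Under the usual sparse-GP assumptions, the quantities above are the marginals of q(f) = \int p(f|u) q(u) du with q(u_{:,d}) = N(m_d, S_d); a hedged summary:

    A = K_{mm}^{-1} K_{mn}
    \mu_{:,d} = m_f(X) + A^\top (m_d - m_u(Z))
    v_{n,d} = k(x_n, x_n) - [A^\top K_{mn}]_{nn} + [A^\top S_d A]_{nn}
    KL_d = \tfrac{1}{2}\big(\mathrm{tr}(K_{mm}^{-1} S_d) + m_d^\top K_{mm}^{-1} m_d - M + \log\det K_{mm} - \log\det S_d\big)

The dtrmm call evaluates [A^T S_d A]_{nn} as the column-wise sum of squares of L_d^T A, and the KL line in the code is the sum of the terms above (with the prior mean of u taken as zero in the KL).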
Example #4
    def variational_q_fd(self, X, Z, q_U, p_U, kern_list, B, N, dims, d):
        """
        Description:  Returns the posterior approximation q(f) for the latent output functions (LOFs)
        Equation:     q(f) = \int p(f|u)q(u)du
        Paper:        In Section 2.2.2 / Variational Bounds
        """
        Q = dims['Q']
        M = dims['M']

        #-----------------------------------------#      POSTERIOR ALGEBRA       #-------------------------------------#
        #######  Algebra for q(u)  #######
        m_u = q_U.mu_u.copy()
        L_u = choleskies.flat_to_triang(q_U.chols_u.copy())
        S_u = np.empty((Q, M, M))
        [np.dot(L_u[q, :, :], L_u[q, :, :].T, S_u[q, :, :]) for q in range(Q)]

        #######  Algebra for p(f_d|u)  #######
        Kfdu = multi_output.cross_covariance(X, Z, B, kern_list, d)
        Luu = p_U.Luu.copy()
        Kff = multi_output.function_covariance(X, B, kern_list, d)
        Kff_diag = np.diag(Kff)

        ####### Algebra for q(f_d) = E_{q(u)}[p(f_d|u)] #######
        Afdu = np.empty((Q, N, M))  # Afdu = K_{fduq}Ki_{uquq}
        m_fd = np.zeros((N, 1))
        v_fd = np.zeros((N, 1))
        S_fd = np.zeros((N, N))
        v_fd += Kff_diag[:, None]
        S_fd += Kff
        for q in range(Q):
            ####### Expectation w.r.t. u_q part  #######
            R, _ = linalg.dpotrs(np.asfortranarray(Luu[q, :, :]),
                                 Kfdu[:, q * M:(q * M) + M].T)
            Afdu[q, :, :] = R.T
            m_fd += np.dot(Afdu[q, :, :], m_u[:, q, None])  # exp
            tmp = dtrmm(alpha=1.0, a=L_u[q, :, :].T, b=R, lower=0, trans_a=0)
            v_fd += np.sum(np.square(tmp), 0)[:, None] - np.sum(
                R * Kfdu[:, q * M:(q * M) + M].T, 0)[:, None]  # exp
            S_fd += np.dot(np.dot(R.T, S_u[q, :, :]), R) - np.dot(
                Kfdu[:, q * M:(q * M) + M], R)

        if (v_fd < 0).any():
            print('v negative!')

        #--------------------------------------#     VARIATIONAL POSTERIOR (LOFs)  #-----------------------------------#
        ####### Variational output distribution q_fd() #######
        q_fd = qfd(m_fd=m_fd, v_fd=v_fd, Kfdu=Kfdu, Afdu=Afdu, S_fd=S_fd)

        return q_fd
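In this multi-output setting the same algebra is applied once per latent function u_q. With A_q = K_{f_d u_q} K_{u_q u_q}^{-1} (R in the loop is A_q^T, obtained via dpotrs on the Cholesky factor Luu[q]), a hedged summary of what the loop accumulates is:

    m_{f_d} = \sum_q A_q m_q
    S_{f_d} = K_{f_d f_d} + \sum_q \big( A_q S_q A_q^\top - K_{f_d u_q} K_{u_q u_q}^{-1} K_{u_q f_d} \big)
    v_{f_d} = \mathrm{diag}(S_{f_d})

so the "v negative!" check is purely a numerical safeguard: analytically v_fd is non-negative whenever the kernels and the S_q are valid covariances.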
Example #5
    def calculate_q_f(self, X, Z, q_U, p_U, kern_list, B, M, N, Q, D, d):
        """
        Calculates the mean and variance of q(f_d) as
        Equation: E_q(U)\{p(f_d|U)\}
        """
        # Algebra for q(u):
        m_u = q_U.mu_u.copy()
        L_u = choleskies.flat_to_triang(q_U.chols_u.copy())
        S_u = np.empty((Q, M, M))
        [np.dot(L_u[q, :, :], L_u[q, :, :].T, S_u[q, :, :]) for q in range(Q)]

        # Algebra for p(f_d|u):
        Kfdu = util.cross_covariance(X, Z, B, kern_list, d)
        Kuu = p_U.Kuu.copy()
        Luu = p_U.Luu.copy()
        Kuui = p_U.Kuui.copy()
        Kff = util.function_covariance(X, B, kern_list, d)
        Kff_diag = np.diag(Kff)

        # Algebra for q(f_d) = E_{q(u)}[p(f_d|u)]
        Afdu = np.empty((Q, N, M))  #Afdu = K_{fduq}Ki_{uquq}
        m_fd = np.zeros((N, 1))
        v_fd = np.zeros((N, 1))
        S_fd = np.zeros((N, N))
        v_fd += Kff_diag[:, None]
        S_fd += Kff
        for q in range(Q):
            # Expectation part
            R, _ = linalg.dpotrs(np.asfortranarray(Luu[q, :, :]),
                                 Kfdu[:, q * M:(q * M) + M].T)
            Afdu[q, :, :] = R.T
            m_fd += np.dot(Afdu[q, :, :], m_u[:, q, None])  #exp
            tmp = dtrmm(alpha=1.0, a=L_u[q, :, :].T, b=R, lower=0, trans_a=0)
            v_fd += np.sum(np.square(tmp), 0)[:, None] - np.sum(
                R * Kfdu[:, q * M:(q * M) + M].T, 0)[:, None]  #exp
            S_fd += np.dot(np.dot(R.T, S_u[q, :, :]), R) - np.dot(
                Kfdu[:, q * M:(q * M) + M], R)

        if (v_fd < 0).any():
            print('v negative!')

        q_fd = qfd(m_fd=m_fd, v_fd=v_fd, Kfdu=Kfdu, Afdu=Afdu, S_fd=S_fd)
        return q_fd
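Examples #4 and #5 share the same per-latent-function loop; the only step that may not be obvious is how the per-point variance contribution diag(A_q S_q A_q^T) is obtained. A minimal, self-contained NumPy sketch of that trick (hypothetical sizes, no GPy dependency):

    import numpy as np

    Q, M, N = 2, 4, 6                               # hypothetical sizes
    rng = np.random.default_rng(0)
    L_u = np.tril(rng.normal(size=(Q, M, M)))       # lower-triangular factors of S_q
    R = rng.normal(size=(M, N))                     # stands in for Kuu_q^{-1} K_{u_q f_d}

    S_u = np.einsum('qij,qkj->qik', L_u, L_u)       # S_q = L_q L_q^T

    for q in range(Q):
        tmp = L_u[q].T @ R                          # the same product dtrmm computes
        diag_direct = np.einsum('mn,mk,kn->n', R, S_u[q], R)  # diag(R^T S_q R)
        diag_trick = np.sum(np.square(tmp), 0)      # column-wise sum of squares
        assert np.allclose(diag_direct, diag_trick)

Only the diagonal is needed for v_fd, so the square-and-sum form avoids materialising R^T S_q R just to read off its diagonal.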
Example #6
    def inference(self, q_u_mean, q_u_chol, kern, X, Z, likelihood, Y, mean_function=None, Y_metadata=None, KL_scale=1.0, batch_scale=1.0):

        num_data, _ = Y.shape
        num_inducing, num_outputs = q_u_mean.shape

        #expand cholesky representation
        L = choleskies.flat_to_triang(q_u_chol)


        S = np.empty((num_outputs, num_inducing, num_inducing))
        [np.dot(L[i,:,:], L[i,:,:].T, S[i,:,:]) for i in range(num_outputs)]
        #Si,_ = linalg.dpotri(np.asfortranarray(L), lower=1)
        Si = choleskies.multiple_dpotri(L)
        logdetS = np.array([2.*np.sum(np.log(np.abs(np.diag(L[i,:,:])))) for i in range(L.shape[0])])

        if np.any(np.isinf(Si)):
            raise ValueError("Cholesky representation unstable")

        #compute mean function stuff
        if mean_function is not None:
            prior_mean_u = mean_function.f(Z)
            prior_mean_f = mean_function.f(X)
        else:
            prior_mean_u = np.zeros((num_inducing, num_outputs))
            prior_mean_f = np.zeros((num_data, num_outputs))

        #compute kernel related stuff
        Kmm = kern.K(Z)
        Kmn = kern.K(Z, X)
        Knn_diag = kern.Kdiag(X)
        Lm = linalg.jitchol(Kmm)
        logdetKmm = 2.*np.sum(np.log(np.diag(Lm)))
        Kmmi, _ = linalg.dpotri(Lm)

        #compute the marginal means and variances of q(f)
        A, _ = linalg.dpotrs(Lm, Kmn)
        mu = prior_mean_f + np.dot(A.T, q_u_mean - prior_mean_u)
        v = np.empty((num_data, num_outputs))
        for i in range(num_outputs):
            tmp = dtrmm(1.0, L[i].T, A, lower=0, trans_a=0)
            v[:,i] = np.sum(np.square(tmp),0)
        v += (Knn_diag - np.sum(A*Kmn,0))[:,None]

        #compute the KL term
        Kmmim = np.dot(Kmmi, q_u_mean)
        KLs = -0.5*logdetS -0.5*num_inducing + 0.5*logdetKmm + 0.5*np.sum(Kmmi[None,:,:]*S,1).sum(1) + 0.5*np.sum(q_u_mean*Kmmim,0)
        KL = KLs.sum()
        #gradient of the KL term (assuming zero mean function)
        dKL_dm = Kmmim.copy()
        dKL_dS = 0.5*(Kmmi[None,:,:] - Si)
        dKL_dKmm = 0.5*num_outputs*Kmmi - 0.5*Kmmi.dot(S.sum(0)).dot(Kmmi) - 0.5*Kmmim.dot(Kmmim.T)

        if mean_function is not None:
            #adjust KL term for mean function
            Kmmi_mfZ = np.dot(Kmmi, prior_mean_u)
            KL += -np.sum(q_u_mean*Kmmi_mfZ)
            KL += 0.5*np.sum(Kmmi_mfZ*prior_mean_u)

            #adjust gradient for mean function
            dKL_dm -= Kmmi_mfZ
            dKL_dKmm += Kmmim.dot(Kmmi_mfZ.T)
            dKL_dKmm -= 0.5*Kmmi_mfZ.dot(Kmmi_mfZ.T)

            #compute gradients for mean_function
            dKL_dmfZ = Kmmi_mfZ - Kmmim

        #quadrature for the likelihood
        F, dF_dmu, dF_dv, dF_dthetaL = likelihood.variational_expectations(Y, mu, v, Y_metadata=Y_metadata)

        #rescale the F term if working on a batch
        F, dF_dmu, dF_dv =  F*batch_scale, dF_dmu*batch_scale, dF_dv*batch_scale
        if dF_dthetaL is not None:
            dF_dthetaL =  dF_dthetaL.sum(1).sum(1)*batch_scale

        #derivatives of expected likelihood, assuming zero mean function
        Adv = A[None,:,:]*dF_dv.T[:,None,:] # as if dF_dv were diagonal; shape (D, M, N)
        Admu = A.dot(dF_dmu)
        Adv = np.ascontiguousarray(Adv) # makes for faster operations later...(inc dsymm)
        AdvA = np.dot(Adv.reshape(-1, num_data),A.T).reshape(num_outputs, num_inducing, num_inducing )
        tmp = np.sum([np.dot(a,s) for a, s in zip(AdvA, S)],0).dot(Kmmi)
        dF_dKmm = -Admu.dot(Kmmim.T) + AdvA.sum(0) - tmp - tmp.T
        dF_dKmm = 0.5*(dF_dKmm + dF_dKmm.T) # necessary? GPy bug?
        tmp = S.reshape(-1, num_inducing).dot(Kmmi).reshape(num_outputs, num_inducing , num_inducing )
        tmp = 2.*(tmp - np.eye(num_inducing)[None, :,:])

        dF_dKmn = Kmmim.dot(dF_dmu.T)
        for a,b in zip(tmp, Adv):
            dF_dKmn += np.dot(a.T, b)

        dF_dm = Admu
        dF_dS = AdvA

        #adjust gradient to account for mean function
        if mean_function is not None:
            dF_dmfX = dF_dmu.copy()
            dF_dmfZ = -Admu
            dF_dKmn -= np.dot(Kmmi_mfZ, dF_dmu.T)
            dF_dKmm += Admu.dot(Kmmi_mfZ.T)


        #sum (gradients of) expected likelihood and KL part
        log_marginal = F.sum() - KL
        dL_dm, dL_dS, dL_dKmm, dL_dKmn = dF_dm - dKL_dm, dF_dS - dKL_dS, dF_dKmm - dKL_dKmm, dF_dKmn

        dL_dchol = 2.*np.array([np.dot(a,b) for a, b in zip(dL_dS, L) ])
        dL_dchol = choleskies.triang_to_flat(dL_dchol)

        grad_dict = {'dL_dKmm':dL_dKmm, 'dL_dKmn':dL_dKmn, 'dL_dKdiag': dF_dv.sum(1), 'dL_dm':dL_dm, 'dL_dchol':dL_dchol, 'dL_dthetaL':dF_dthetaL}
        if mean_function is not None:
            grad_dict['dL_dmfZ'] = dF_dmfZ - dKL_dmfZ
            grad_dict['dL_dmfX'] = dF_dmfX
        return Posterior(mean=q_u_mean, cov=S.T, K=Kmm, prior_mean=prior_mean_u), log_marginal, grad_dict
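Taken together, the returned log_marginal is the standard SVGP evidence lower bound; a hedged summary (batch_scale rescales the data term for minibatching, while KL_scale is accepted in the signature but not used in the body shown):

    \mathcal{L} = \sum_{n,d} \mathbb{E}_{q(f_{nd})}\big[\log p(y_{nd} \mid f_{nd})\big] - \sum_d \mathrm{KL}\big[q(u_{:,d}) \,\|\, p(u_{:,d})\big],
    \qquad q(f_{nd}) = N(\mu_{nd}, v_{nd})

with mu and v as computed above, and the gradient dictionary (dL_dm, dL_dchol, dL_dKmm, dL_dKmn, dL_dKdiag, dL_dthetaL) assembled for GPy's parameterisation.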
Example #7
File: svgp.py Project: pxlong/GPy
    def inference(
        self,
        q_u_mean,
        q_u_chol,
        kern,
        X,
        Z,
        likelihood,
        Y,
        mean_function=None,
        Y_metadata=None,
        KL_scale=1.0,
        batch_scale=1.0,
    ):

        num_data, _ = Y.shape
        num_inducing, num_outputs = q_u_mean.shape

        # expand cholesky representation
        L = choleskies.flat_to_triang(q_u_chol)

        S = np.empty((num_outputs, num_inducing, num_inducing))
        [np.dot(L[i, :, :], L[i, :, :].T, S[i, :, :]) for i in range(num_outputs)]
        # Si,_ = linalg.dpotri(np.asfortranarray(L), lower=1)
        Si = choleskies.multiple_dpotri(L)
        logdetS = np.array([2.0 * np.sum(np.log(np.abs(np.diag(L[i, :, :])))) for i in range(L.shape[0])])

        if np.any(np.isinf(Si)):
            raise ValueError("Cholesky representation unstable")

        # compute mean function stuff
        if mean_function is not None:
            prior_mean_u = mean_function.f(Z)
            prior_mean_f = mean_function.f(X)
        else:
            prior_mean_u = np.zeros((num_inducing, num_outputs))
            prior_mean_f = np.zeros((num_data, num_outputs))

        # compute kernel related stuff
        Kmm = kern.K(Z)
        Kmn = kern.K(Z, X)
        Knn_diag = kern.Kdiag(X)
        Lm = linalg.jitchol(Kmm)
        logdetKmm = 2.0 * np.sum(np.log(np.diag(Lm)))
        Kmmi, _ = linalg.dpotri(Lm)

        # compute the marginal means and variances of q(f)
        A, _ = linalg.dpotrs(Lm, Kmn)
        mu = prior_mean_f + np.dot(A.T, q_u_mean - prior_mean_u)
        v = np.empty((num_data, num_outputs))
        for i in range(num_outputs):
            tmp = dtrmm(1.0, L[i].T, A, lower=0, trans_a=0)
            v[:, i] = np.sum(np.square(tmp), 0)
        v += (Knn_diag - np.sum(A * Kmn, 0))[:, None]

        # compute the KL term
        Kmmim = np.dot(Kmmi, q_u_mean)
        KLs = (
            -0.5 * logdetS
            - 0.5 * num_inducing
            + 0.5 * logdetKmm
            + 0.5 * np.sum(Kmmi[None, :, :] * S, 1).sum(1)
            + 0.5 * np.sum(q_u_mean * Kmmim, 0)
        )
        KL = KLs.sum()
        # gradient of the KL term (assuming zero mean function)
        dKL_dm = Kmmim.copy()
        dKL_dS = 0.5 * (Kmmi[None, :, :] - Si)
        dKL_dKmm = 0.5 * num_outputs * Kmmi - 0.5 * Kmmi.dot(S.sum(0)).dot(Kmmi) - 0.5 * Kmmim.dot(Kmmim.T)

        if mean_function is not None:
            # adjust KL term for mean function
            Kmmi_mfZ = np.dot(Kmmi, prior_mean_u)
            KL += -np.sum(q_u_mean * Kmmi_mfZ)
            KL += 0.5 * np.sum(Kmmi_mfZ * prior_mean_u)

            # adjust gradient for mean function
            dKL_dm -= Kmmi_mfZ
            dKL_dKmm += Kmmim.dot(Kmmi_mfZ.T)
            dKL_dKmm -= 0.5 * Kmmi_mfZ.dot(Kmmi_mfZ.T)

            # compute gradients for mean_function
            dKL_dmfZ = Kmmi_mfZ - Kmmim

        # quadrature for the likelihood
        F, dF_dmu, dF_dv, dF_dthetaL = likelihood.variational_expectations(Y, mu, v, Y_metadata=Y_metadata)

        # rescale the F term if working on a batch
        F, dF_dmu, dF_dv = F * batch_scale, dF_dmu * batch_scale, dF_dv * batch_scale
        if dF_dthetaL is not None:
            dF_dthetaL = dF_dthetaL.sum(1).sum(1) * batch_scale

        # derivatives of expected likelihood, assuming zero mean function
        Adv = A[None, :, :] * dF_dv.T[:, None, :]  # as if dF_dv were diagonal; shape (D, M, N)
        Admu = A.dot(dF_dmu)
        Adv = np.ascontiguousarray(Adv)  # makes for faster operations later...(inc dsymm)
        AdvA = np.dot(Adv.reshape(-1, num_data), A.T).reshape(num_outputs, num_inducing, num_inducing)
        tmp = np.sum([np.dot(a, s) for a, s in zip(AdvA, S)], 0).dot(Kmmi)
        dF_dKmm = -Admu.dot(Kmmim.T) + AdvA.sum(0) - tmp - tmp.T
        dF_dKmm = 0.5 * (dF_dKmm + dF_dKmm.T)  # necessary? GPy bug?
        tmp = S.reshape(-1, num_inducing).dot(Kmmi).reshape(num_outputs, num_inducing, num_inducing)
        tmp = 2.0 * (tmp - np.eye(num_inducing)[None, :, :])

        dF_dKmn = Kmmim.dot(dF_dmu.T)
        for a, b in zip(tmp, Adv):
            dF_dKmn += np.dot(a.T, b)

        dF_dm = Admu
        dF_dS = AdvA

        # adjust gradient to account for mean function
        if mean_function is not None:
            dF_dmfX = dF_dmu.copy()
            dF_dmfZ = -Admu
            dF_dKmn -= np.dot(Kmmi_mfZ, dF_dmu.T)
            dF_dKmm += Admu.dot(Kmmi_mfZ.T)

        # sum (gradients of) expected likelihood and KL part
        log_marginal = F.sum() - KL
        dL_dm, dL_dS, dL_dKmm, dL_dKmn = dF_dm - dKL_dm, dF_dS - dKL_dS, dF_dKmm - dKL_dKmm, dF_dKmn

        dL_dchol = 2.0 * np.array([np.dot(a, b) for a, b in zip(dL_dS, L)])
        dL_dchol = choleskies.triang_to_flat(dL_dchol)

        grad_dict = {
            "dL_dKmm": dL_dKmm,
            "dL_dKmn": dL_dKmn,
            "dL_dKdiag": dF_dv.sum(1),
            "dL_dm": dL_dm,
            "dL_dchol": dL_dchol,
            "dL_dthetaL": dF_dthetaL,
        }
        if mean_function is not None:
            grad_dict["dL_dmfZ"] = dF_dmfZ - dKL_dmfZ
            grad_dict["dL_dmfX"] = dF_dmfX
        return Posterior(mean=q_u_mean, cov=S.T, K=Kmm, prior_mean=prior_mean_u), log_marginal, grad_dict
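The least obvious step at the end is the gradient with respect to the Cholesky factors. Since S_d = L_d L_d^T and dL_dS is symmetric here, a hedged statement of the rule being applied is

    \partial \mathcal{L} / \partial L_d = 2 \, (\partial \mathcal{L} / \partial S_d) \, L_d   (lower-triangular part only),

which is exactly dL_dchol = 2 * [dot(a, b) for a, b in zip(dL_dS, L)] followed by choleskies.triang_to_flat.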
Example #8
    def local_likelihood(self,
                         X,
                         Y,
                         l_svgps,
                         g_q_u_mean,
                         g_q_u_chol,
                         g_kern,
                         g_Z,
                         g_mean_function=None):

        num_data, _ = Y.shape
        F = np.zeros([num_data, len(l_svgps)])

        #global
        g_num_inducing, num_outputs = g_q_u_mean.shape

        #expand cholesky representation
        g_L = choleskies.flat_to_triang(g_q_u_chol)

        g_S = np.empty((num_outputs, g_num_inducing, g_num_inducing))
        [
            np.dot(g_L[i, :, :], g_L[i, :, :].T, g_S[i, :, :])
            for i in range(num_outputs)
        ]

        if g_mean_function is not None:
            g_prior_mean_u = g_mean_function.f(g_Z)
            g_prior_mean_f = g_mean_function.f(X)
        else:
            g_prior_mean_u = np.zeros((g_num_inducing, num_outputs))
            g_prior_mean_f = np.zeros((num_data, num_outputs))

        g_Kmm = g_kern.K(g_Z)
        g_Kmn = g_kern.K(g_Z, X)
        #g_Knn_diag = g_kern.Kdiag(X)
        g_Lm = linalg.jitchol(g_Kmm)

        g_A, _ = linalg.dpotrs(g_Lm, g_Kmn)
        g_mu = g_prior_mean_f + np.dot(g_A.T, g_q_u_mean - g_prior_mean_u)
        g_v = np.empty((num_data, num_outputs))
        for i in range(num_outputs):
            tmp = dtrmm(1.0, g_L[i].T, g_A, lower=0, trans_a=0)
            g_v[:, i] = np.sum(np.square(tmp), 0)
        #Anh v += (Knn_diag - np.sum(A*Kmn,0))[:,None]

        for i in range(len(l_svgps)):
            num_inducing, num_outputs = l_svgps[i].q_u_mean.shape

            #expand cholesky representation
            L = choleskies.flat_to_triang(l_svgps[i].q_u_chol)

            S = np.empty((num_outputs, num_inducing, num_inducing))
            [
                np.dot(L[i, :, :], L[i, :, :].T, S[i, :, :])
                for i in range(num_outputs)
            ]

            prior_mean_u = np.zeros((num_inducing, num_outputs))
            prior_mean_f = np.zeros((num_data, num_outputs))

            #compute kernel related stuff
            Kmm = l_svgps[i].kern.K(l_svgps[i].Z)
            Kmn = l_svgps[i].kern.K(l_svgps[i].Z, X)
            Knn_diag = l_svgps[i].kern.Kdiag(X)
            Lm = linalg.jitchol(Kmm)
            #Kmmi, _ = linalg.dpotri(Lm)

            #compute the marginal means and variances of q(f)
            A, _ = linalg.dpotrs(Lm, Kmn)
            mu = prior_mean_f + np.dot(A.T, l_svgps[i].q_u_mean - prior_mean_u)
            v = np.empty((num_data, num_outputs))
            for j in range(num_outputs):  # separate index: the enclosing loop's i (over l_svgps) is still needed below
                tmp = dtrmm(1.0, L[j].T, A, lower=0, trans_a=0)
                v[:, j] = np.sum(np.square(tmp), 0)
            v += (Knn_diag - np.sum(A * Kmn, 0))[:, None]

            #final marginal means and variances of q(f)
            mu += g_mu
            v += g_v

            #quadrature for the likelihood
            F[:, i, None], dF_dmu, dF_dv, dF_dthetaL = l_svgps[
                i].likelihood.variational_expectations(
                    Y, mu, v, Y_metadata=l_svgps[i].Y_metadata)

        return F
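A hedged reading of the additive structure here: each point's latent function is the sum of a global GP and one local GP, treated as independent under the variational posterior, so their marginal means and variances simply add before the likelihood expectation:

    mu_n = g_mu_n + mu_n^{(i)},   v_n = g_v_n + v_n^{(i)}
    F[n, i] = E_{N(f_n; mu_n, v_n)}[ \log p(y_n \mid f_n) ]

(the Knn_diag correction for the global component is commented out in the source).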
Example #9
    def local_inference_multithread(self,
                                    X_full,
                                    Y_full,
                                    rho_full,
                                    lsvgp,
                                    g_mu_full,
                                    g_v_full,
                                    KL_scale=1.0,
                                    batch_scale=1.0):

        index = np.where(rho_full > 0)[0]
        X = X_full[index, :]
        Y = Y_full[index, :]
        rho = rho_full[index, :]
        g_mu = g_mu_full[index, :]
        g_v = g_v_full[index, :]

        num_data, _ = Y.shape
        num_data_full, _ = Y_full.shape

        q_u_mean = lsvgp.q_u_mean
        num_inducing, num_outputs = q_u_mean.shape

        #expand cholesky representation
        L = choleskies.flat_to_triang(lsvgp.q_u_chol)

        S = np.empty((num_outputs, num_inducing, num_inducing))
        [
            np.dot(L[i, :, :], L[i, :, :].T, S[i, :, :])
            for i in range(num_outputs)
        ]
        #Si,_ = linalg.dpotri(np.asfortranarray(L), lower=1)
        Si = choleskies.multiple_dpotri(L)
        logdetS = np.array([
            2. * np.sum(np.log(np.abs(np.diag(L[i, :, :]))))
            for i in range(L.shape[0])
        ])

        if np.any(np.isinf(Si)):
            raise ValueError("Cholesky representation unstable")

        prior_mean_u = np.zeros((num_inducing, num_outputs))
        prior_mean_f = np.zeros((num_data, num_outputs))

        #compute kernel related stuff
        Kmm = lsvgp.kern.K(lsvgp.Z)
        Kmn = lsvgp.kern.K(lsvgp.Z, X)
        Knn_diag = lsvgp.kern.Kdiag(X)  # Kdiag is the standard GPy kernel-diagonal method
        Lm = linalg.jitchol(Kmm)
        logdetKmm = 2. * np.sum(np.log(np.diag(Lm)))
        Kmmi, _ = linalg.dpotri(Lm)

        #compute the marginal means and variances of q(f)
        A, _ = linalg.dpotrs(Lm, Kmn)
        mu = prior_mean_f + np.dot(A.T, q_u_mean - prior_mean_u)
        v = np.empty((num_data, num_outputs))
        for i in range(num_outputs):
            tmp = dtrmm(1.0, L[i].T, A, lower=0, trans_a=0)
            v[:, i] = np.sum(np.square(tmp), 0)
        v += (Knn_diag - np.sum(A * Kmn, 0))[:, None]

        #final marginal means and variances of q(f)
        mu += g_mu
        v += g_v

        #compute the KL term
        Kmmim = np.dot(Kmmi, q_u_mean)
        KLs = -0.5 * logdetS - 0.5 * num_inducing + 0.5 * logdetKmm + 0.5 * np.sum(
            Kmmi[None, :, :] * S, 1).sum(1) + 0.5 * np.sum(
                q_u_mean * Kmmim, 0)
        KL = KLs.sum()
        #gradient of the KL term (assuming zero mean function)
        dKL_dm = Kmmim.copy()
        dKL_dS = 0.5 * (Kmmi[None, :, :] - Si)
        dKL_dKmm = 0.5 * num_outputs * Kmmi - 0.5 * Kmmi.dot(
            S.sum(0)).dot(Kmmi) - 0.5 * Kmmim.dot(Kmmim.T)

        #quadrature for the likelihood
        F, dF_dmu, dF_dv, dF_dthetaL = lsvgp.likelihood.variational_expectations(
            Y, mu, v, Y_metadata=lsvgp.Y_metadata)

        #multiply with rho
        F, dF_dmu, dF_dv = F * rho, dF_dmu * rho, dF_dv * rho

        #rescale the F term if working on a batch
        F, dF_dmu, dF_dv = F * batch_scale, dF_dmu * batch_scale, dF_dv * batch_scale
        if dF_dthetaL is not None:
            dF_dthetaL = dF_dthetaL.sum(1).sum(1) * batch_scale

        #derivatives of expected likelihood w.r.t. local parameters, assuming zero mean function
        Adv = A[None, :, :] * dF_dv.T[:, None, :]  # as if dF_dv were diagonal; shape (D, M, N)
        Admu = A.dot(dF_dmu)
        #Adv = np.ascontiguousarray(Adv) # makes for faster operations later...(inc dsymm)
        AdvA = np.dot(Adv.reshape(-1, num_data),
                      A.T).reshape(num_outputs, num_inducing, num_inducing)
        tmp = np.sum([np.dot(a, s) for a, s in zip(AdvA, S)], 0).dot(Kmmi)
        dF_dKmm = -Admu.dot(Kmmim.T) + AdvA.sum(0) - tmp - tmp.T
        dF_dKmm = 0.5 * (dF_dKmm + dF_dKmm.T)  # necessary? GPy bug?
        tmp = S.reshape(-1, num_inducing).dot(Kmmi).reshape(
            num_outputs, num_inducing, num_inducing)
        tmp = 2. * (tmp - np.eye(num_inducing)[None, :, :])

        dF_dKmn = Kmmim.dot(dF_dmu.T)
        for a, b in zip(tmp, Adv):
            dF_dKmn += np.dot(a.T, b)

        dF_dm = Admu
        dF_dS = AdvA

        #sum (gradients of) expected likelihood and KL part w.r.t local parameters
        log_marginal = F.sum() - KL
        dL_dm, dL_dS, dL_dKmm, dL_dKmn = dF_dm - dKL_dm, dF_dS - dKL_dS, dF_dKmm - dKL_dKmm, dF_dKmn

        dL_dchol = 2. * np.array([np.dot(a, b) for a, b in zip(dL_dS, L)])
        dL_dchol = choleskies.triang_to_flat(dL_dchol)

        grad_dict = {
            'dL_dKmm': dL_dKmm,
            'dL_dKmn': dL_dKmn,
            'dL_dKdiag': dF_dv.sum(1),
            'dL_dm': dL_dm,
            'dL_dchol': dL_dchol,
            'dL_dthetaL': dF_dthetaL
        }

        return Posterior(mean=q_u_mean,
                         cov=S.T,
                         K=Kmm,
                         prior_mean=prior_mean_u
                         ), log_marginal, grad_dict, index, dF_dmu, dF_dv
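A hedged note on the weighting used in this local update: rho acts as per-point responsibilities for the local expert, and batch_scale corrects for minibatch subsampling, so the data term of the local bound is

    F_n = batch\_scale \cdot \rho_n \cdot \mathbb{E}_{q(f_n)}[ \log p(y_n \mid f_n) ],
    \qquad log\_marginal = \sum_n F_n - \mathrm{KL}[ q(u) \,\|\, p(u) ],

with dF_dmu and dF_dv weighted the same way before being pushed into dL_dm, dL_dchol, dL_dKmm and dL_dKmn; dF_dmu and dF_dv are also returned so the caller can propagate gradients through the shared global terms g_mu and g_v.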