Example #1
    def _pre_compute(self):
        """Pre-computation for the projection

        Terms that stay fixed at test time are identified manually.
        Only the iid latent case is implemented.
        """
        # Save the fixed terms here
        # self.saved_terms = {}
        if self.observed_dims is not None:
            # select observed dims to compute
            Y = th.cat(
                (self.Y.index_select(1, self.observed_dims), self.Y_test), 0)
            self.saved_terms["YYT"] = Y.mm(Y.t())

        # computes kernel expectations
        if self.data_type == "iid":
            eKxz = self.kernel.eKxz(self.Z, self.Xmean, self.Xcov)
            eKzxKxz = self.kernel.eKzxKxz(self.Z, self.Xmean, self.Xcov)
            self.saved_terms["eKxz"] = eKxz
            self.saved_terms["eKzxKxz"] = eKzxKxz
        else:
            print("regressive case, not implemented")

        Kzz = self.kernel.K(self.Z) + self.jitter.expand(self.Z.size(0)).diag()
        L = cholesky(Kzz, flag="L")
        self.saved_terms["L"] = L
Example #2
    def _get_p_best(self, x_test, y_test=None, n_samples=100000, show=False):
        """
        Out of the inputs in x_test, determine for each input the probability 
        that its y would be the best (lowest)
        """

        with torch.no_grad():
            # Need the FULL predictive distribution!
            m, c = self.predict_function(TensorType(x_test), diag=False)
            assert m.shape[1] == 1, 'How to quantify "best" for multi-output?'
            
            lc = cholesky(c)
            epsilon = torch.randn(n_samples, *m.shape, dtype=torch_dtype)
            samples = (m[None, :, :] + lc[None, :, :] @ epsilon).cpu().numpy()

        i_best = np.argmin(samples, axis=1)

        p_best = np.array(
            [np.sum(i_best == i) for i in range(self.x_all.shape[0])]
        ) / n_samples

        if show:
            self._show_p_best_analysis(x_test, y_test, m, c, samples, p_best)

        return p_best
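
A minimal standalone sketch of the same Monte Carlo estimate, assuming only a joint Gaussian predictive with mean m and covariance c (the function name and arguments here are illustrative, not part of the code above):

    import numpy as np

    def p_best_from_posterior(m, c, n_samples=100000, seed=0):
        """Estimate P(candidate i attains the minimum) under N(m, c).

        m: (n,) or (n, 1) posterior mean; c: (n, n) posterior covariance.
        """
        rng = np.random.default_rng(seed)
        # Draw joint samples so correlations between candidates are respected.
        samples = rng.multivariate_normal(np.ravel(m), c, size=n_samples)  # (n_samples, n)
        i_best = np.argmin(samples, axis=1)  # index of the winner in each sample
        return np.bincount(i_best, minlength=np.ravel(m).shape[0]) / n_samples

The returned vector sums to one and converges to the true probabilities as n_samples grows.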
Example #3
    def _predict(self, input_new, diag=True):
        # following GPflow implementation
        # integrating the inducing variables out

        if isinstance(input_new, np.ndarray):
            # set input_new to be volatile for inference mode
            input_new = Variable(th.Tensor(input_new).type(float_type),
                                 volatile=True)

        self.X.volatile = True
        self.Y.volatile = True
        self.Z.volatile = True

        num_inducing = self.Z.size(0)
        dim_output = self.Y.size(1)

        # err = self.Y - self.mean_function(self.X)
        err = self.Y
        # Kff_diag = self.kernel.Kdiag(self.X)
        Kuf = self.kernel.K(self.Z, self.X)
        # add jitter
        # Kuu = self.kernel.K(self.Z) + Variable(th.eye(num_inducing).float() * 1e-5)
        Kuu = self.kernel.K(
            self.Z) + self.jitter.transform().expand(num_inducing).diag()
        Kus = self.kernel.K(self.Z, input_new)
        L = cholesky(Kuu)
        A = trtrs(L, Kuf)
        AAT = A.mm(A.t()) / self.likelihood.variance.transform().expand_as(Kuu)
        B = AAT + Variable(th.eye(num_inducing).type(float_type))
        LB = cholesky(B)
        # divide variance at the end
        c = trtrs(LB, A.mm(err)) \
            / self.likelihood.variance.transform().expand(num_inducing, dim_output)
        tmp1 = trtrs(L, Kus)
        tmp2 = trtrs(LB, tmp1)
        mean = tmp2.t().mm(c)

        if diag:
            var = self.kernel.Kdiag(input_new) - tmp1.pow(2).sum(0).squeeze() \
                  + tmp2.pow(2).sum(0).squeeze()
            # add kronecker product later for multi-output case
        else:
            var = self.kernel.K(input_new) + tmp2.t().mm(tmp2) \
                  - tmp1.t().mm(tmp1)
        # return mean + self.mean_function(input_new), var
        return mean, var
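
The quantities above implement the standard sparse-GP (SGPR) predictive equations; written out, with u the inducing variables and sigma^2 the likelihood variance:

    \mu_* = \sigma^{-2} K_{*u}\,\Sigma\,K_{uf}\,\mathbf{y}, \qquad
    \mathrm{var}_* = K_{**} - K_{*u} K_{uu}^{-1} K_{u*} + K_{*u}\,\Sigma\,K_{u*}, \qquad
    \Sigma = \left(K_{uu} + \sigma^{-2} K_{uf} K_{fu}\right)^{-1},

where both terms are evaluated through the two Cholesky factors L (of the jittered K_{uu}) and LB (of B) instead of explicit inverses.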
Example #4
    def compute_loss(self):
        """
        Computes the variational lower bound of the true log marginal likelihood
        Eqn (9) in Titsias, Michalis K. "Variational Learning of Inducing Variables
        in Sparse Gaussian Processes." AISTATS. Vol. 5. 2009.
        """

        num_inducing = self.Z.size(0)
        num_training = self.X.size(0)
        dim_output = self.Y.size(1)
        # TODO: add mean_functions
        # err = self.Y - self.mean_function(self.X)
        err = self.Y
        Kff_diag = self.kernel.Kdiag(self.X)
        Kuf = self.kernel.K(self.Z, self.X)
        # add jitter
        Kuu = self.kernel.K(self.Z) + \
              self.jitter.transform().expand(num_inducing).diag()
        L = cholesky(Kuu)

        A = trtrs(L, Kuf)
        AAT = A.mm(A.t()) / self.likelihood.variance.transform().expand_as(Kuu)
        B = AAT + Variable(th.eye(num_inducing).type(float_type))
        LB = cholesky(B)
        # divide variance at the end
        c = trtrs(LB, A.mm(err)) \
            / self.likelihood.variance.transform().expand(num_inducing, dim_output)

        # Evidence lower bound
        elbo = Variable(
            th.Tensor([-0.5 * dim_output * num_training * np.log(2 * np.pi)
                       ]).type(float_type))
        elbo -= dim_output * LB.diag().log().sum()
        elbo -= 0.5 * dim_output * num_training * self.likelihood.variance.transform(
        ).log()
        elbo -= 0.5 * (err.pow(2).sum() + dim_output * Kff_diag.sum()) \
                / self.likelihood.variance.transform()
        elbo += 0.5 * c.pow(2).sum()
        elbo += 0.5 * dim_output * AAT.diag().sum()

        return -elbo
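
The returned value is the negative of the collapsed bound of Titsias (2009); as a reference, for p independent output dimensions:

    \log p(Y) \;\ge\; \sum_{d=1}^{p} \log \mathcal{N}\!\left(\mathbf{y}_d \,\middle|\, \mathbf{0},\, Q_{nn} + \sigma^2 I\right)
    - \frac{p}{2\sigma^2}\,\mathrm{tr}\!\left(K_{nn} - Q_{nn}\right), \qquad
    Q_{nn} = K_{nm} K_{mm}^{-1} K_{mn},

with the Gaussian log-density evaluated via the Cholesky factors L and LB, and the trace correction split into the Kff_diag and AAT terms above.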
Example #5
    def compute_loss(self):
        """
        Loss is equal to the negative of the log likelihood

        Adapted from Rasmussen & Williams, GPML (2006), p. 19, Algorithm 2.1.
        """

        num_input = self.Y.size(0)
        dim_output = self.Y.size(1)

        L = cholesky(self._compute_kyy())
        alpha = trtrs(L, self.Y)
        const = Variable(th.Tensor([-0.5 * dim_output * num_input * \
                                    np.log(2 * np.pi)]).type(tensor_type))
        loss = 0.5 * alpha.pow(2).sum() + dim_output * lt_log_determinant(L) \
            - const
        return loss
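
Written out, with K_y = L L^\top the Cholesky factorization of _compute_kyy() and \alpha = L^{-1} Y:

    -\log p(Y \mid X)
    = \tfrac{1}{2}\,\|\alpha\|_F^2 + p \sum_i \log L_{ii} + \tfrac{1}{2}\,p\,n\,\log 2\pi,

since \mathbf{y}_d^\top K_y^{-1} \mathbf{y}_d = \|L^{-1}\mathbf{y}_d\|^2 and \log|K_y| = 2\sum_i \log L_{ii}.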
Example #6
    def eKxz_parallel(self, Z, Xmean, Xcov):
        # TODO: add test
        """Parallel implementation (needs more space, but less time)
        Refer to GPflow implementation

        Args:
            Args:
            Z (Variable): m x q inducing input
            Xmean (Variable): n x q mean of input X
            Xcov (Varible): posterior covariance of X
                two sizes are accepted:
                    n x q x q: each q(x_i) has full covariance
                    n x q: each q(x_i) has diagonal covariance (uncorrelated),
                        stored in each row
        Returns:
            (Variable): n x m
        """

        # Revisit later, check for backward support for n-D tensor
        n = Xmean.size(0)
        q = Xmean.size(1)
        m = Z.size(0)
        if Xcov.dim() == 2:
            # from flattened diagonal to full matrices
            cov = Variable(th.Tensor(n, q, q).type(float_type))
            for i in range(Xmean.size(0)):
                cov[i] = Xcov[i].diag()
            Xcov = cov
            del cov
        length_scales = self.length_scales.transform()
        Lambda = length_scales.pow(2).diag().unsqueeze(0).expand_as(Xcov)
        L = cholesky(Lambda + Xcov)
        xz = Xmean.unsqueeze(2).expand(n, q, m) - Z.t().unsqueeze(0).expand(
            n, q, m)
        Lxz = trtrs(L, xz)
        half_log_dets = L.diag().log().sum(1) \
                        - length_scales.log().sum().expand(n)

        return self.variance.transform().expand(n, m) \
               * th.exp(-0.5 * Lxz.pow(2).sum(1) - half_log_dets.expand(n, m))
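
The expectation being computed is the Psi-1 statistic of the ARD RBF kernel under a Gaussian q(x_n) = N(mu_n, S_n); for a diagonal S_n it factorizes as

    (\Psi_1)_{nm} = \mathbb{E}_{q(x_n)}\!\left[k(x_n, z_m)\right]
    = \sigma_f^2 \prod_{d=1}^{q}
      \frac{\exp\!\left(-\frac{(\mu_{nd} - z_{md})^2}{2(\ell_d^2 + S_{nd})}\right)}
           {\sqrt{1 + S_{nd}/\ell_d^2}},

which the code evaluates for all n at once via a Cholesky factorization of \Lambda + S_n, with \Lambda = \mathrm{diag}(\ell_1^2, \dots, \ell_q^2).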
Example #7
    def _predict(self, input_new, diag, full_cov_size_limit=10000):
        """
        This method computes

        .. math::
            p(F^* | Y )

        where F* are the latent function values at input_new, and Y are the
        observations at the training inputs X.

        :param input_new: assumed to be a two-dimensional numpy array
        """

        if isinstance(input_new, np.ndarray):
            # output is a data matrix, rows correspond to the rows in input,
            # columns are treated independently
            input_new = Variable(th.Tensor(input_new).type(tensor_type),
                                 requires_grad=False, volatile=True)

        k_ys = self.kernel.K(self.X, input_new)
        kyy = self._compute_kyy()

        L = cholesky(kyy)
        A = trtrs(L, k_ys)
        V = trtrs(L, self.Y)
        mean_f = th.mm(th.transpose(A, 0, 1), V)

        if self.mean_function is not None:
            mean_f += self.mean_function(input_new)

        var_f_1 = self.kernel.Kdiag(input_new) if diag else \
            self.kernel.K(input_new)  # Kss

        if diag:
            var_f_2 = th.sum(A * A, 0)
        else:
            var_f_2 = th.mm(A.t(), A)
        var_f = var_f_1 - var_f_2

        return mean_f, var_f
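
In matrix form, with K_y = L L^\top the training covariance from _compute_kyy(), this is the exact GP posterior:

    \mu_* = K_{*f} K_y^{-1}\,\mathbf{y} = A^\top V, \qquad
    \Sigma_* = K_{**} - K_{*f} K_y^{-1} K_{f*} = K_{**} - A^\top A, \qquad
    A = L^{-1} K_{f*},\; V = L^{-1}\,\mathbf{y},

returning only the diagonal of \Sigma_* when diag is True.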
Example #8
    def _predict(self, Xnew_mean, Xnew_var=None, diag=True):
        """Computes the mean and variance of latent function output
        corresponding to the new (uncertain) input

        The new input can be deterministic or uncertain (only Gaussian: mean and
        variance). Returns the predictions over all dimensions (extract the
        needed dimensions for imputation case after getting the returns)

        Args:
             Xnew_mean (np.ndarray): new latent input; it is the deterministic
                input if ``Xnew_var`` is None, otherwise the mean of the
                latent posterior, size n_* x q
             Xnew_var (np.ndarray): variance (covariance) of the latent
                posterior, iid case, n_* x q (each row stores the diagonal of
                the covariance)

        Returns:
            (Variable): n_* x p, mean of the predicted latent output
            (Variable): covariance of the predicted latent output:
                an n_* vector (diag=True) or an n_* x n_* matrix shared across
                output dimensions for the deterministic case, or a list of
                n_* covariance matrices (each p x p) for the uncertain
                Gaussian input, iid.

        """
        assert isinstance(
            Xnew_mean,
            np.ndarray) and Xnew_mean.shape[1] == self.Xmean.size(1), (
                "Xnew_mean should be a numpy.ndarray and its number of "
                "columns should equal the latent dimension")
        Xnew_mean = Variable(th.Tensor(Xnew_mean).type(float_type),
                             volatile=True)

        num_inducing = self.Z.size(0)
        beta = 1.0 / self.likelihood.variance.transform()
        # Psi1, Psi2
        eKxz = self.kernel.eKxz(self.Z, self.Xmean, self.Xcov)
        eKzxKxz = self.kernel.eKzxKxz(self.Z, self.Xmean, self.Xcov)
        Kzs = self.kernel.K(self.Z, Xnew_mean)
        Kzz = self.kernel.K(self.Z) + self.jitter.expand(self.Z.size(0)).diag()
        L = cholesky(Kzz, flag="Lkz")
        A = trtrs(L, trtrs(L, eKzxKxz).t()) * beta.expand_as(L)
        B = A + Variable(th.eye(num_inducing).type(float_type))
        Lb = cholesky(B, flag="Lb")
        C = trtrs(L, Kzs)
        D = trtrs(Lb, C)

        if Xnew_var is None:
            # broadcast updated
            mean = D.t().mm(trtrs(Lb, trtrs(
                L,
                eKxz.t().mm(self.Y)))) * beta.expand(Xnew_mean.size(0),
                                                     self.Y.size(1))
            # return full covariance or only the diagonal
            if diag:
                # 1d tensor
                var = (self.kernel.Kdiag(Xnew_mean) -
                       C.pow(2).sum(0).squeeze() + D.pow(2).sum(0).squeeze())
            else:
                var = self.kernel.K(Xnew_mean) - C.t().mm(C) + D.t().mm(D)
        else:
            # uncertain input, assume Gaussian.
            assert (isinstance(Xnew_var, np.ndarray)
                    and Xnew_var.shape == Xnew_mean.shape), (
                        "Uncertain input: Xnew_var should be a numpy ndarray "
                        "with the same shape as Xnew_mean")
            Xnew_var = Param(th.Tensor(Xnew_var).type(float_type))
            Xnew_var.requires_transform = True
            Xnew_var.volatile = True
            # s for star (new input), z for inducing input
            eKsz = self.kernel.eKxz(self.Z, Xnew_mean, Xnew_var)
            # list of n_* expectations w.r.t. each test datum
            eKzsKsz = self.kernel.eKzxKxz(self.Z,
                                          Xnew_mean,
                                          Xnew_var,
                                          sum=False)
            Im = Variable(th.eye(self.Z.size(0)).type(float_type))
            E = trtrs(Lb, trtrs(L, Im))
            EtE = E.t().mm(E)
            F = EtE.mm(eKxz.t().mm(self.Y)) * beta.expand(
                self.Z.size(0), self.Y.size(1))
            mean = eKsz.mm(F)
            Linv = trtrs(L, Im)
            Sigma = Linv.t().mm(Linv) - EtE
            # n x m x m
            # eKzsKsz = eKzsKsz.cat(0).view(Xnew_mean.size(0), *self.Z.size())
            var = []
            if diag:
                ns = Xnew_mean.size(0)
                p = self.Y.size(1)
                # vectorization?
                for i in range(ns):
                    cov = (self.kernel.variance.transform() - Sigma.mm(
                        eKzsKsz[i]).trace()).expand(
                            p, p) + F.t().mm(eKzsKsz[i] - eKsz[i, :].unsqueeze(
                                0).t().mm(eKsz[i, :].unsqueeze(0))).mm(F)
                    var.append(cov)
            else:
                # full covariance case, left for the future
                print("full covariance (multi-output) case not implemented")
                var = None

        return mean, var
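
For the deterministic-input branch, the algebra above reduces to the collapsed Bayesian GPLVM predictive (a sketch, writing \Psi_1 = eKxz, \Psi_2 = eKzxKxz and \beta = 1/\sigma^2):

    \mu_* = \beta\,K_{*z}\left(K_{zz} + \beta\,\Psi_2\right)^{-1}\Psi_1^\top Y, \qquad
    \Sigma_* = K_{**} - K_{*z}\left(K_{zz}^{-1} - \left(K_{zz} + \beta\,\Psi_2\right)^{-1}\right)K_{z*},

i.e. prediction under the optimal variational distribution over the inducing outputs (Titsias & Lawrence, 2010); the uncertain-input branch additionally averages these moments over q(x_*) via the eKsz and eKzsKsz expectations.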
Example #9
    def log_likelihood_inference(self):
        """Computes the loss in the inference mode, e.g. for projection.
        Handles both fully observed and partially observed data.

        Only iid latent is implemented.
        """
        num_data_train = self.Y.size(0)
        # dim_output_train = self.Y.size(1)
        dim_latent = self.Z.size(1)
        num_inducing = self.Z.size(0)
        num_data_test = self.Y_test.size(0)
        # total number of data for inference
        num_data = num_data_train + num_data_test
        # dimension of output in the test time
        dim_output = self.Y_test.size(1)
        # whole data for inference
        if self.observed_dims is None:
            Y = th.cat((self.Y, self.Y_test), 0)
        else:
            Y = th.cat(
                (self.Y.index_select(1, self.observed_dims), self.Y_test), 0)

        var_kernel = self.kernel.variance.transform()
        var_noise = self.likelihood.variance.transform()

        # computes kernel expectations
        # eKxx = num_data * self.kernel.eKxx(self.Xmean).sum()
        eKxx = num_data * var_kernel
        if self.data_type == "iid":
            eKxz_test = self.kernel.eKxz(self.Z, self.Xmean_test,
                                         self.Xcov_test)
            eKzxKxz_test = self.kernel.eKzxKxz(self.Z, self.Xmean_test,
                                               self.Xcov_test)
            eKxz = th.cat((self.saved_terms["eKxz"], eKxz_test), 0)
            eKzxKxz = self.saved_terms["eKzxKxz"] + eKzxKxz_test
        else:
            print("regressive case not implemented")

        # compute ELBO
        L = self.saved_terms["L"]
        A = trtrs(L, trtrs(L, eKzxKxz).t()) / var_noise.expand_as(L)
        B = A + Variable(th.eye(num_inducing).type(float_type))
        LB = cholesky(B, flag="LB")

        log_2pi = Variable(th.Tensor([np.log(2 * np.pi)]).type(float_type))
        elbo = -dim_output * (LB.diag().log().sum() + 0.5 * num_data *
                              (var_noise.log() + log_2pi))
        elbo -= 0.5 * dim_output * (eKxx / var_noise - A.diag().sum())

        if not self.is_large_p:
            # distributed
            # C = Variable(th.zeros(num_inducing, dim_output))
            # for i in xrange(num_data):
            #     C += Psi[i, :].unsqueeze(1).mm(self.Y[i, :].unsqueeze(0))
            C = eKxz.t().mm(Y)
            D = trtrs(LB, trtrs(L, C))
            elbo -= (0.5 *
                     (Y.t().mm(Y) / var_noise.expand(dim_output, dim_output) -
                      D.t().mm(D) /
                      var_noise.pow(2).expand(dim_output, dim_output)).trace())
        else:
            # small n, pre-compute YY'
            # YYT = self.Y.mm(self.Y.t())
            D = trtrs(LB, trtrs(L, eKxz.t()))
            W = Variable(th.eye(num_data).type(float_type)) / var_noise.expand(
                num_data, num_data) - D.t().mm(D) / var_noise.pow(2).expand(
                    num_data, num_data)
            elbo -= 0.5 * (W.mm(self.saved_terms["YYT"])).trace()

        # KL Divergence (KLD) btw the posterior and the prior
        if self.data_type == "iid":
            const_nq = Variable(
                th.Tensor([num_data * dim_latent]).type(float_type))
            # eqn (3.28) below p57 Damianou's Diss.
            KLD = 0.5 * (self.Xmean.pow(2).sum() + self.Xcov.transform().sum()
                         - self.Xcov.transform().log().sum() - const_nq)

        elbo -= KLD
        return elbo
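
The KL term used here (eqn (3.28) of Damianou's dissertation, as noted in the code) is, for q(X) = \prod_{n,j} N(\mu_{nj}, S_{nj}) against a standard normal prior:

    \mathrm{KL}\!\left(q(X)\,\|\,p(X)\right)
    = \tfrac{1}{2}\sum_{n=1}^{N}\sum_{j=1}^{Q}\left(\mu_{nj}^2 + S_{nj} - \log S_{nj} - 1\right).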
Example #10
    def log_likelihood(self):
        """
        Computation graph for the ELBO (evidence lower bound) of the
        variational GPLVM. For implementation details, please see
        ``notes/impl_gplvm``.

        """
        num_data = self.Y.size(0)
        dim_output = self.Y.size(1)
        dim_latent = self.Z.size(1)
        num_inducing = self.Z.size(0)

        var_kernel = self.kernel.variance.transform()
        var_noise = self.likelihood.variance.transform()

        # computes kernel expectations
        eKxx = num_data * var_kernel
        if self.data_type == "iid":
            eKxz = self.kernel.eKxz(self.Z, self.Xmean, self.Xcov)
            eKzxKxz = self.kernel.eKzxKxz(self.Z, self.Xmean, self.Xcov)
        else:
            # seq data
            # compute S_j's and mu_bar_j's (reparameterization: forward)
            # self.Xmean, self.Xcov = self._reparam_vargp(self.Xmean_bar, self.Lambda)
            Kx = self.kernel_x.K(np.arange(self.Y.size(0))[:, None])
            # print(Kx.data.eig())
            Lkx = cholesky(Kx, flag="Lkx")
            # Kx_inverse = inverse(Kx)
            self.Xmean = Kx.mm(self.Xmean_bar)
            Xcov = []
            # S = []
            Le = []
            In = Variable(th.eye(num_data).type(float_type))
            for j in range(dim_latent):
                Ej = Lkx.t().mm(self.Lambda.transform()[:,
                                                        j].diag()).mm(Lkx) + In
                # print(Ej.data.eig())
                Lej = cholesky(Ej, flag="Lej")
                Lsj = trtrs(Lej, Lkx.t()).t()
                Sj = Lsj.mm(Lsj.t())
                Xcov.append(Sj.diag().unsqueeze(1))
                # S.append(Sj)
                Le.append(Lej)
            self.Xcov = th.cat(Xcov, 1)
            eKxz = self.kernel.eKxz(self.Z, self.Xmean, self.Xcov, False)
            eKzxKxz = self.kernel.eKzxKxz(self.Z, self.Xmean, self.Xcov, False)

        # compute ELBO
        # add jitter
        # broadcast update
        Kzz = self.kernel.K(self.Z) + self.jitter.expand(self.Z.size(0)).diag()
        L = cholesky(Kzz, flag="Lkz")
        A = trtrs(L, trtrs(L, eKzxKxz).t()) / var_noise.expand_as(L)
        B = A + Variable(th.eye(num_inducing).type(float_type))
        LB = cholesky(B, flag="LB")

        # log|B|
        # log_det_b = LB.diag().log().sum()

        log_2pi = Variable(th.Tensor([np.log(2 * np.pi)]).type(float_type))
        elbo = -dim_output * (LB.diag().log().sum() + 0.5 * num_data *
                              (var_noise.log() + log_2pi))
        elbo -= 0.5 * dim_output * (eKxx / var_noise - A.trace())

        if not self.is_large_p:
            # distributed
            # C = Variable(th.zeros(num_inducing, dim_output))
            # for i in xrange(num_data):
            #     C += Psi[i, :].unsqueeze(1).mm(self.Y[i, :].unsqueeze(0))
            C = eKxz.t().mm(self.Y)
            D = trtrs(LB, trtrs(L, C))
            elbo -= (0.5 *
                     (self.Y.t().mm(self.Y) /
                      var_noise.expand(dim_output, dim_output) - D.t().mm(D) /
                      var_noise.pow(2).expand(dim_output, dim_output)).trace())
        else:
            # small n, pre-compute YY'
            # YYT = self.Y.mm(self.Y.t())
            D = trtrs(LB, trtrs(L, eKxz.t()))
            W = Variable(th.eye(num_data).type(float_type)) / var_noise.expand(
                num_data, num_data) - D.t().mm(D) / var_noise.pow(2).expand(
                    num_data, num_data)
            elbo -= 0.5 * (W.mm(self.saved_terms["YYT"])).trace()

        # KL Divergence (KLD) btw the posterior and the prior
        if self.data_type == "iid":
            const_nq = Variable(
                th.Tensor([num_data * dim_latent]).type(float_type))
            # eqn (3.28) below p57 Damianou's Diss.
            KLD = 0.5 * (self.Xmean.pow(2).sum() + self.Xcov.transform().sum()
                         - self.Xcov.transform().log().sum() - const_nq)
        else:
            # seq data (3.29) p58
            # Xmean n x q
            # S: q x n x n
            # Kx, Kx_inverse
            KLD = Variable(
                th.Tensor([-0.5 * num_data * dim_latent]).type(float_type))
            KLD += 0.5 * self.Xmean_bar.mm(self.Xmean_bar.t()).mm(
                Kx.t()).trace()
            for j in range(dim_latent):
                Lej_inv = trtrs(Le[j], In)
                KLD += 0.5 * Lej_inv.t().mm(Lej_inv).trace() + Le[j].diag(
                ).log().sum()

        elbo -= KLD
        return elbo
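
In the sequential branch, the per-dimension loop implements the variational reparameterization of the dynamical (temporal) prior from Damianou's dissertation; as a sketch, with K_x = L_x L_x^\top the temporal kernel matrix and \Lambda_j = \mathrm{diag}(\lambda_j):

    \boldsymbol{\mu}_j = K_x\,\bar{\boldsymbol{\mu}}_j, \qquad
    S_j = \left(K_x^{-1} + \Lambda_j\right)^{-1}
        = L_x\left(L_x^\top \Lambda_j L_x + I\right)^{-1} L_x^\top,

so only the Cholesky factor of E_j = L_x^\top \Lambda_j L_x + I is formed explicitly, and the sequential KL term (eqn (3.29)) is evaluated through the same factors.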