Example #1
    def predict(self, Xnew: torch.Tensor, Xtrain: torch.Tensor,
                ytrain: torch.Tensor):
        """
        Inputs:
        :Xnew: n_newsamples * n_features
        :Xtrain: n_samples * n_features
        :ytrain: n_samples or n_samples * n_output

        Returns:
        A list of 1d normal distributions whose mean/scale are scalar tensors
        """
        device = Xnew.device
        Xnew, Xtrain = self.transformer(Xnew), self.transformer(Xtrain)

        K = self.to_matrix(self.kernel(Xtrain))
        Kprime = K + self.alpha * torch.eye(K.shape[0], device=device)
        L = torch.cholesky(Kprime)

        kXnew = self.to_matrix(self.kernel(Xnew, Xtrain))
        ytrain = ytrain.flatten()
        ytrain_minus_mean = ytrain - self.mean(Xtrain)
        # logging.debug(self.mean)
        mean_ynew = self.mean(Xnew) + torch.squeeze(torch.matmul(
            kXnew, torch.cholesky_solve(ytrain_minus_mean[:, None], L)),
                                                    dim=-1)
        logging.debug("mean={}".format(mean_ynew))
        # LL^T _x = kXnew^T
        _x = torch.cholesky_solve(kXnew.T, L)

        kXnewXnew = self.to_matrix(self.kernel(Xnew))
        std_ynew = torch.sqrt(
            torch.diag(kXnewXnew) - torch.einsum("ij,ji->i", kXnew, _x))
        logging.debug("std={}".format(std_ynew))
        return [Normal(mu, sigma) for mu, sigma in zip(mean_ynew, std_ynew)]
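Example #1 relies on self.kernel, self.mean and self.transformer defined elsewhere in its class. A minimal self-contained sketch of the same posterior computation (zero prior mean, hypothetical rbf_kernel helper, illustrative only, not from the project):

import torch

def rbf_kernel(a, b, lengthscale=1.0):
    # Squared-exponential kernel; a: (n, d), b: (m, d) -> (n, m)
    return torch.exp(-0.5 * torch.cdist(a, b).pow(2) / lengthscale ** 2)

def gp_predict(Xnew, Xtrain, ytrain, alpha=1e-6):
    # Cholesky factor of the regularized training covariance K + alpha*I
    K = rbf_kernel(Xtrain, Xtrain)
    L = torch.linalg.cholesky(K + alpha * torch.eye(K.shape[0]))
    kXnew = rbf_kernel(Xnew, Xtrain)
    # Posterior mean: kXnew @ (K + alpha*I)^{-1} @ ytrain
    mean = (kXnew @ torch.cholesky_solve(ytrain[:, None], L)).squeeze(-1)
    # Posterior variance: diag(k(Xnew, Xnew)) - kXnew @ (K + alpha*I)^{-1} @ kXnew^T
    v = torch.cholesky_solve(kXnew.T, L)
    var = rbf_kernel(Xnew, Xnew).diag() - torch.einsum("ij,ji->i", kXnew, v)
    return mean, var.clamp_min(0).sqrt()

Xtrain, ytrain = torch.randn(20, 2), torch.randn(20)
mu, std = gp_predict(torch.randn(5, 2), Xtrain, ytrain)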
Example #2
File: node.py Project: wpfhtl/ddn
 def _solve_linear_system(self, A, B):
     """Solves linear system AX = B.
     If B is a tuple (B1, B2, ...), returns tuple (X1, X2, ...).
     Otherwise returns X.
     """
     B_sizes = None
     # If B is a tuple, concatenate into single tensor:
     if isinstance(B, (tuple, list)):
         B_sizes = list(map(lambda x: x.size(-1), B))
         B = torch.cat(B, dim=-1)
     # Ensure B is 3D (bxmxn):
     if len(B.size()) == 2:
         B = B.unsqueeze(-1)
     try: # Batchwise Cholesky solve
         A_decomp = torch.cholesky(A, upper=False)
         X = torch.cholesky_solve(B, A_decomp, upper=False) # bxmxn
     except RuntimeError: # Revert to loop if batchwise solve fails
         X = torch.zeros_like(B)
         for i in range(A.size(0)):
             try: # Cholesky solve
                 A_decomp = torch.cholesky(A[i, ...], upper=False)
                 X[i, ...] = torch.cholesky_solve(B[i, ...], A_decomp,
                     upper=False) # mxn
             except RuntimeError: # Revert to LU solve
                 X[i, ...], _ = torch.solve(B[i, ...], A[i, ...]) # mxn
     if B_sizes is not None:
         X = X.split(B_sizes, dim=-1)
     return X
Example #3
    def predict(self, Z, full=False, tensor=False):
        with torch.no_grad():
            Z = self._check_input(Z)  # MxD

            K = self.kernel(self.X) + self.noise() * self.eye  # NxN
            Ks = self.kernel(self.X, Z)  # NxM
            Kss = self.kernel(Z) + self.noise() * torch.eye(
                Z.shape[0], device=config.device, dtype=config.dtype)  # MxM

            L = self._cholesky(K)  # NxN
            v = torch.triangular_solve(Ks, L, upper=False)[0]  # NxM

            if self.mean is not None:
                y = self.y - self.mean(self.X).reshape(-1, 1)  # Nx1
                mu = Ks.T.mm(torch.cholesky_solve(y, L))  # Mx1
                mu += self.mean(Z).reshape(-1, 1)  # Mx1
            else:
                mu = Ks.T.mm(torch.cholesky_solve(self.y, L))  # Mx1

            var = Kss - v.T.mm(v)  # MxM
            if not full:
                var = var.diag().reshape(-1, 1)  # Mx1
            if tensor:
                return mu, var
            else:
                return mu.cpu().numpy(), var.cpu().numpy()
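A small aside (illustrative, not from the project): with L = cholesky(K) and v = triangular_solve(Ks, L), the term v.T.mm(v) used above equals Ks^T K^{-1} Ks, so it matches a direct cholesky_solve:

import torch

torch.manual_seed(0)
K = torch.randn(6, 6, dtype=torch.float64)
K = K @ K.T + 6 * torch.eye(6, dtype=torch.float64)   # make K positive definite
Ks = torch.randn(6, 3, dtype=torch.float64)
L = torch.linalg.cholesky(K)
v = torch.linalg.solve_triangular(L, Ks, upper=False)  # v = L^{-1} Ks
print(torch.allclose(v.T @ v, Ks.T @ torch.cholesky_solve(Ks, L)))  # True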
Example #4
    def __computeMeansAndVarsGivenKernelMatrices(self, Kzz, KzzChol, Ktz,
                                                 KttDiag):
        nTrials = KttDiag.shape[0]
        nQuad = KttDiag.shape[1]
        nLatent = KttDiag.shape[2]

        qKMu = torch.empty((nTrials, nQuad, nLatent),
                           dtype=Kzz[0].dtype,
                           device=Kzz[0].device)
        qKVar = torch.empty((nTrials, nQuad, nLatent),
                            dtype=Kzz[0].dtype,
                            device=Kzz[0].device)

        qSigma = self._svPosteriorOnIndPoints.buildQSigma()
        for k in range(len(self._svPosteriorOnIndPoints.getQMu())):
            # Ak \in nTrials x nInd[k] x 1
            Ak = torch.cholesky_solve(self._svPosteriorOnIndPoints.getQMu()[k],
                                      KzzChol[k])
            # qKMu \in  nTrial x nQuad x nLatent
            qKMu[:, :, k] = torch.squeeze(torch.matmul(Ktz[k], Ak))

            # Bkf \in nTrials x nInd[k] x nQuad
            Bkf = torch.cholesky_solve(Ktz[k].transpose(dim0=1, dim1=2),
                                       KzzChol[k])
            # mm1f \in nTrials x nInd[k] x nQuad
            mm1f = torch.matmul(qSigma[k] - Kzz[k], Bkf)
            # aux1 \in nTrials x nInd[k] x nQuad
            aux1 = Bkf * mm1f
            # aux2 \in nTrials x nQuad
            aux2 = torch.sum(input=aux1, dim=1)
            # aux3 \in nTrials x nQuad
            aux3 = KttDiag[:, :, k] + aux2
            # qKVar \in nTrials x nQuad x nLatent
            qKVar[:, :, k] = aux3
        return qKMu, qKVar
Example #5
 def _inner_no_grad(self, x, u, v=None, *, keepdim=False):
     l = torch.cholesky(x)
     x_inv_u = torch.cholesky_solve(u, l)
     if v is None:
         x_inv_v = x_inv_u
     else:
         x_inv_v = torch.cholesky_solve(v, l)
     return multitrace(torch.matmul(x_inv_u, x_inv_v), keepdim=keepdim)
Example #6
def compute_damped_gn_update(jacobian, output_error, damping):
    """
    Compute the damped Gauss-Newton update, based on the given jacobian and
    output error.
    Args:
        jacobian (torch.Tensor): 2D tensor containing the Jacobian of the
            flattened output with respect to the flattened parameters for which
            the GN update is computed.
        output_error (torch.Tensor): tensor containing the gradient of the loss
            with respect to the output layer of the network.
        damping (float): positive damping hyperparameter

    Returns: the damped Gauss-Newton update for the parameters for which the
        jacobian was computed.

    """
    if damping < 0:
        raise ValueError('Positive value for damping expected, got '
                         '{}'.format(damping))
    # The jacobian also flattens the  output dimension, so we need to do
    # the same.
    output_error = output_error.view(-1, 1).detach()

    if damping == 0:
        # If the damping is 0, the curvature matrix C=J^TJ can be
        # rank deficient. Therefore, it is numerically best to compute the
        # pseudoinverse explicitly and then multiply with it.
        jacobian_pinv = torch.pinverse(jacobian)
        gn_updates = jacobian_pinv.mm(output_error)
    else:
        # If damping is greater than 0, the curvature matrix C will be
        # positive definite and symmetric. Numerically, it is the most
        # efficient to use the cholesky decomposition to compute the
        # resulting Gauss-newton updates

        # As (J^T*J + l*I)^{-1}*J^T = J^T*(JJ^T + l*I)^{-1}, we select
        # whichever form is most computationally efficient, depending on
        # the number of rows and columns of J (we want to factorize the
        # smallest possible matrix, as this is the most expensive
        # operation). Note that we solve a linear system with a Cholesky
        # factorization instead of explicitly computing the inverse, as
        # this is more efficient.
        if jacobian.shape[0] >= jacobian.shape[1]:
            G = jacobian.t().mm(jacobian)
            C = G + damping * torch.eye(G.shape[0])
            C_cholesky = torch.cholesky(C)
            jacobian_error = jacobian.t().matmul(output_error)
            gn_updates = torch.cholesky_solve(jacobian_error, C_cholesky)
        else:
            G = jacobian.mm(jacobian.t())
            C = G + damping * torch.eye(G.shape[0])
            C_cholesky = torch.cholesky(C)
            inverse_error = torch.cholesky_solve(output_error, C_cholesky)
            gn_updates = jacobian.t().matmul(inverse_error)

    return gn_updates
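A quick numerical check (illustrative, not part of the project) of the push-through identity (J^T*J + l*I)^{-1}*J^T = J^T*(JJ^T + l*I)^{-1} that the branch above relies on:

import torch

torch.manual_seed(0)
J = torch.randn(50, 8, dtype=torch.float64)   # tall Jacobian, more rows than columns
e = torch.randn(50, 1, dtype=torch.float64)
lam = 0.1
lhs = torch.cholesky_solve(
    J.t() @ e, torch.linalg.cholesky(J.t() @ J + lam * torch.eye(8, dtype=torch.float64)))
rhs = J.t() @ torch.cholesky_solve(
    e, torch.linalg.cholesky(J @ J.t() + lam * torch.eye(50, dtype=torch.float64)))
print(torch.allclose(lhs, rhs))  # True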
Example #7
    def __call__(self, A, b):
        l = self.cache.get(id(A))
        if l is None:
            l = robust_cholesky(A)
            self.cache[id(A)] = l

        if b.ndim == 1:
            return torch.cholesky_solve(b.unsqueeze(-1), l).squeeze(-1)
        else:
            return torch.cholesky_solve(b, l)
Example #8
    def inner(self, x, u, v, keepdim=False):
        # FIXME(ccruceru): This is not currently differentiable.
        assert not x.requires_grad and \
                not u.requires_grad and \
                not v.requires_grad

        l = self.chol(x)
        x_inv_u = torch.cholesky_solve(u, l)
        x_inv_v = torch.cholesky_solve(v, l)
        return tb.trace(x_inv_u @ x_inv_v, keepdim=keepdim)
Example #9
def test_cholesky_solve(batch_shape, size):
    b = torch.randn(batch_shape + (size, 5))
    x = torch.randn(batch_shape + (size, size))
    x = x.transpose(-1, -2).matmul(x)
    u = x.cholesky()
    expected = torch.cholesky_solve(b, u)
    assert not expected.requires_grad
    actual = torch.cholesky_solve(b.requires_grad_(), u.requires_grad_())
    assert actual.requires_grad
    assert_close(expected, actual)
Example #10
    def forward(ctx, mu0, mu1, cho0, cho1):
        """
        Calculate the product of two Gaussians. Here each Gaussian is
        represented as (mu, cholesky), where var = cholesky^T cholesky and
        cholesky is an upper-triangular matrix.
        :param ctx: holder
        :param mu0: mu for gaussian 1
        :param mu1: mu for gaussian 2
        :param cho0: cholesky matrix for gaussian 1
        :param cho1: cholesky matrix for gaussian 2
        :return: scale: the scale of the product of the two gaussians
        :return: mu_new: the mu of the new gaussian
        :return: cho_new: the cholesky matrix of the new gaussian
        """

        dim = mu0.size(-1)
        # inverse
        # TODO: the inverse can be computed with numpy or pytorch; we use torch here.
        # lambda0 = np.linalg.inv(var0)
        # lambda1 = np.linalg.inv(var1)
        eye = torch.eye(dim).unsqueeze_(0).double()
        # Solve against the identity with the Cholesky factor to invert var = cholesky^T cholesky
        lambda0 = torch.cholesky_solve(eye, cho0, upper=True)
        lambda1 = torch.cholesky_solve(eye, cho1, upper=True)

        mu0 = mu0.unsqueeze(-1)
        mu1 = mu1.unsqueeze(-1)
        eta0 = torch.matmul(lambda0, mu0)
        eta1 = torch.matmul(lambda1, mu1)

        # calculate zeta
        diag0 = torch.diagonal(cho0, dim1=1, dim2=2)
        diag1 = torch.diagonal(cho1, dim1=1, dim2=2)
        zeta0 = -0.5 * (dim * np.log(np.pi * 2) -
                        torch.sum(torch.log(diag0 * diag0), dim=-1) +
                        mu0.transpose(1, 2).matmul(eta0).reshape(-1))

        zeta1 = -0.5 * (dim * np.log(np.pi * 2) -
                        torch.sum(torch.log(diag1 * diag1), dim=-1) +
                        mu1.transpose(1, 2).matmul(eta1).reshape(-1))

        lambda_new = lambda0 + lambda1
        eta_new = eta0 + eta1

        var_new = torch.inverse(lambda_new)
        cho_new = torch.cholesky(var_new, upper=True)

        mu_new = torch.matmul(var_new, eta_new)
        diag_new = torch.diagonal(cho_new, dim1=1, dim2=2)
        zeta_new = -0.5 * (dim * np.log(np.pi * 2) -
                           torch.sum(torch.log(diag_new * diag_new), dim=-1) +
                           mu_new.transpose(1, 2).matmul(eta_new).reshape(-1))

        scale = zeta0 + zeta1 - zeta_new

        return scale, mu_new, cho_new
Example #11
    def _update_variational_moments(self, x, y):
        C = self.current_C_matrix(x)
        c = self.current_c_vec(x, y)
        z_b = self.variational_strategy.inducing_points
        Kbb = self.covar_module(z_b).evaluate()
        L = psd_safe_cholesky(Kbb + C.evaluate(),
                              upper=False,
                              jitter=self._jitter)
        m_b = Kbb @ torch.cholesky_solve(c, L, upper=False)
        S_b = Kbb @ torch.cholesky_solve(Kbb, L, upper=False)

        return m_b, S_b
Example #12
    def predict(self, x: Tensor, y_dim: int = 1) -> Tuple[Tensor, Tensor]:
        """Predicts mean and covariance.

        Args:
            x (torch.Tensor): Input data for test, size
                `(batch_size, num_points, x_dim)`.
            y_dim (int, optional): Output y dim size for prior.

        Returns:
            y_mean (torch.Tensor): Predicted output, size
                `(batch_size, num_points, y_dim)`.
            y_cov (torch.Tensor): Covariance of the joint predictive
                distribution at the sample points, size
                `(batch_size, num_points, num_points)`.
        """

        if x.dim() != 3:
            raise ValueError("Dim of x should be 3 (batch_size, num_points, "
                             f"x_dim), but given {x.size()}.")

        # Predict y|x based on GP prior
        if self._x_train is None or self._y_train is None:
            batch_size, num_points, _ = x.size()
            y_mean = torch.zeros(batch_size, num_points, y_dim)
            y_cov = self.gaussian_kernel(x, x)
            return y_mean, y_cov

        # Predict y*|x*, x, y based on GP posterior

        # Shift mean of y_train to 0
        y_mean = self._y_train.mean(dim=[0, 1])
        y_train = self._y_train - y_mean

        # Kernel
        K_nn = self.gaussian_kernel(self._x_train, self._x_train)
        K_xx = self.gaussian_kernel(x, x)
        K_xn = self.gaussian_kernel(x, self._x_train)

        # Solve cholesky for each y_dim
        L_ = torch.cholesky(K_nn.double()).float()
        alpha_ = torch.cholesky_solve(y_train, L_)

        # Mean prediction with undoing normalization
        y_mean = K_xn.matmul(alpha_) + y_mean

        # Cov
        v = torch.cholesky_solve(K_xn.transpose(1, 2), L_)
        y_cov = K_xx - K_xn.matmul(v)

        return y_mean, y_cov
Example #13
    def sample(self, times, regFactor=1e-3):
        Kzz = self._indPointsLocsKMS.getKzz()
        KzzChol = self._indPointsLocsKMS.getKzzChol()

        indPointsLocsAndAllTimesKMS = IndPointsLocsAndAllTimesKMS()
        indPointsLocsAndAllTimesKMS.setKernels(
            kernels=self._indPointsLocsKMS.getKernels())
        indPointsLocsAndAllTimesKMS.setIndPointsLocs(
            indPointsLocs=self._indPointsLocsKMS.getIndPointsLocs())
        indPointsLocsAndAllTimesKMS.setTimes(times=times)
        indPointsLocsAndAllTimesKMS.buildKernelsMatrices()
        indPointsLocsAndAllTimesKMS.buildKttKernelsMatrices()
        Ktz = indPointsLocsAndAllTimesKMS.getKtz()
        Ktt = indPointsLocsAndAllTimesKMS.getKtt()

        qMu = self._svPosteriorOnIndPoints.getQMu()
        qSigma = self._svPosteriorOnIndPoints.buildQSigma()

        nLatents = len(Kzz)
        nTrials = Kzz[0].shape[0]
        samples = [[] for r in range(nTrials)]
        for r in range(nTrials):
            samples[r] = torch.empty((nLatents, Ktt[0].shape[1]),
                                     dtype=Kzz[0].dtype)
            for k in range(nLatents):
                print("Processing trial {:d} and latent {:d}".format(r, k))
                Kzzrk = Kzz[k][r, :, :]
                KzzCholrk = KzzChol[k][r, :, :]
                Ktzrk = Ktz[k][r, :, :]
                Kttrk = Ktt[k][r, :, :]
                qMurk = qMu[k][r, :, :]
                qSigmark = qSigma[k][r, :, :]

                ### begin compute mean ###
                b = torch.cholesky_solve(qMurk, KzzCholrk)
                meanrk = torch.squeeze(Ktzrk.matmul(b))
                ### end compute mean ###

                ### begin compute covar ###
                B = torch.cholesky_solve(torch.t(Ktzrk), KzzCholrk)
                covarrk = Kttrk + torch.t(B).matmul(qSigmark - Kzzrk).matmul(B)
                ### end compute covar ###

                covarrk += torch.eye(covarrk.shape[0]) * regFactor
                covarrk = covarrk.detach()
                mn = scipy.stats.multivariate_normal(mean=meanrk, cov=covarrk)
                samples[r][k, :] = torch.from_numpy(mn.rvs())
        return samples
Example #14
    def log_marginal(self, Y, gauss_mean, gauss_cov, **kwargs):
        """ Computes the log marginal likelihood w.r.t the prior
        
            log p(y|x) = -1/2 (Y-mu)^T (K + sigma^2 I)^{-1} (Y-mu) - 1/2 log|K + sigma^2 I| - N/2 log(2*pi)

            Args:
                `Y` (torch.tensor)  :->: Observations Y with shape (Dy,MB)
                `gauss_mean` (torch.tensor)  :->:  mean from p(f). Shape (Dy,MB)
                `gauss_cov`  (torch.tensor)  :->:  full covariance from p(f). Shape (Dy,MB,MB)
        """

        N = Y.size(1)
        Dy = self.out_dim
    
        # compute mean and covariance from the marginal distribution p(y|x).
        # This basically adds the observation noise to the covariance.
        mx,Kxx = self.marginal_moments(gauss_mean,gauss_cov, diagonal = False)

        # reshapes
        mx = mx.view(Dy,N,1)
        Y  = Y.view(Dy,N,1)

        # solve using cholesky
        Y_mx = Y-mx
        Lxx = psd_safe_cholesky(Kxx, upper = False, jitter = cg.global_jitter)

        # Compute (Y-mu)' @ (K+sigma²I)^{-1} @ (Y-mu)
        rhs = torch.cholesky_solve(Y_mx, Lxx, upper = False)

        data_fit_term   = torch.matmul(Y_mx.transpose(1,2),rhs)
        complexity_term = 2*torch.log(torch.diagonal(Lxx, dim1 = 1, dim2 = 2)).sum(1) 
        cte      = -N/2. * torch.log(2*cg.pi)
        

        return -0.5*(data_fit_term + complexity_term ) + cte
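An aside (illustrative, not from the project): the complexity_term above uses 2 * sum(log diag(Lxx)), which equals log|Kxx| for a Cholesky factor Lxx of Kxx:

import torch

K = torch.randn(6, 6, dtype=torch.float64)
K = K @ K.T + 6 * torch.eye(6, dtype=torch.float64)   # positive definite
L = torch.linalg.cholesky(K)
print(torch.allclose(2 * torch.log(torch.diagonal(L)).sum(), torch.logdet(K)))  # True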
Example #15
 def _update_weights(self):
     """
     Method internally used to update GP weights' posterior mean
     """
     self.weights_train = torch.cholesky_solve(
         self.training_vec, self.training_mat,
         upper=True)  # not to be confused with BLR weights sample
Example #16
 def step(self, actions, shadow=False):
     actions = torch.clamp(actions, 0, 1)
     x = self.data[:, :self.nstep, :-1]
     y = self.data[:, :self.nstep, -1]
     K = exponential_kernel(
         x.unsqueeze(2), x.unsqueeze(1), self.length_scale) + torch.eye(
             self.nstep, dtype=torch.float, device=self.device) * 1e-5
     #print(K)
     # K = torch.exp(-1 / 2 * ((x.view(self.batch_size, self.nstep, 1) - x.view(self.batch_size, 1, self.nstep)) / 0.1) ** 2) + torch.eye(self.nstep, dtype=torch.float, device=self.device) * 1e-5
     u = torch.cholesky(K)
     k = exponential_kernel(x, actions.view(self.batch_size, 1, self.dims),
                            self.length_scale)
     #print(k)
     # k = torch.exp(-1 / 2 * ((x.view(self.batch_size, self.nstep) - actions.view(self.batch_size,1)) / 0.1) ** 2)
     sol = torch.cholesky_solve(
         torch.cat((k.view(self.batch_size, self.nstep,
                           1), y.view(self.batch_size, self.nstep, 1)),
                   dim=2), u)
     #print(sol)
     mav = torch.matmul(k.view(self.batch_size, 1, self.nstep),
                        sol).view(self.batch_size, 2)  #check shapes!
     #print(mav)
     newy = torch.normal(mav[:, 1], 1 - mav[:, 0])
     newbest = torch.max(self.best, newy)
     reward = newbest - self.best
     if not shadow:
         self.best = newbest
     self.data[:, self.nstep, :-1] = actions.view(self.batch_size,
                                                  self.dims)
     self.data[:, self.nstep, -1] = newy
     self.nstep = self.nstep + 1
     return self.data[:, self.nstep - 1], reward.unsqueeze(1), mav
Example #17
 def forward(ctx, A, b):
     u = torch.cholesky(torch.matmul(A.transpose(-1, -2), A), upper=True)
     ret = torch.cholesky_solve(torch.matmul(A.transpose(-1, -2), b),
                                u,
                                upper=True)
     ctx.save_for_backward(u, ret, A, b)
     return ret
Example #18
    def manual_predict(params, train_x, train_y, test_x):

        with torch.no_grad():
            print('computing K and Kt...')
            K = mnl_cov_with_noise(params, train_x)
            Kt = manual_cov(params, test_x, train_x)
            print('computing L...')
            L = torch.cholesky(K)
            alpha = torch.cholesky_solve(torch.t(torch.stack([train_y])), L)
            print('computing mean f...')
            f = torch.matmul(Kt, alpha)[:, 0]

            print('computing predictive variances...')

            def var(i):
                xi = test_x[[i], :]
                ki = torch.t(Kt[[i], :])
                v, _cc = torch.triangular_solve(ki, L, upper=False)
                ret = (manual_cov(params, xi, xi) -
                       torch.matmul(torch.t(v), v)).item()
                return ret

            vars = torch.tensor([var(i) for i in range(len(test_x))])

            return (f, vars)
Example #19
    def _e_step(self, data):

        X, noise_covars = data

        T = self.covars[None, :, :, :] + noise_covars[:, None, :, :]
        try:
            T_chol = torch.cholesky(T)
        except RuntimeError:
            return torch.tensor(float('-inf')), None
        T_inv = torch.cholesky_solve(torch.eye(self.d, device=self.device),
                                     T_chol)

        diff = X[:, None, :] - self.means
        T_inv_diff = torch.matmul(T_inv, diff[:, :, :, None])
        log_resps = -0.5 * (torch.matmul(diff[:, :, None, :], T_inv_diff) +
                            self.d * math.log(2 * math.pi)).squeeze()
        log_resps -= T_chol.diagonal(dim1=-2, dim2=-1).log().sum(-1)

        log_resps += torch.log(self.weights[None, :, 0])

        cond_means = self.means + torch.matmul(  # n, j, d
            self.covars[None, :, :, :],  # 1, j, d, d
            T_inv_diff)[:, :, :, 0]

        cond_covars = self.covars - torch.matmul(  # n, j, d, d
            self.covars,  # j, d, d
            torch.matmul(  # n, j, d, d
                T_inv,  # n, j, d, d
                self.covars  # j, d, d
            ))

        log_prob = torch.logsumexp(log_resps, dim=1, keepdim=True)
        log_resps -= log_prob
        return torch.sum(log_prob), (log_resps, cond_means, cond_covars)
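An aside (illustrative, not from the project): solving against the identity, as done above to obtain T_inv, is equivalent to torch.cholesky_inverse applied to the same factor:

import torch

A = torch.randn(4, 4, dtype=torch.float64)
A = A @ A.T + 4 * torch.eye(4, dtype=torch.float64)   # positive definite
L = torch.linalg.cholesky(A)
inv_solve = torch.cholesky_solve(torch.eye(4, dtype=torch.float64), L)
inv_direct = torch.cholesky_inverse(L)
print(torch.allclose(inv_solve, inv_direct))  # True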
Example #20
    def forward(self):
        K, B = self.GlobalKernel(self.X, self.X)

        dK = K.diagonal()
        dK += self.global_gp_noise_std**2 + self.jitter
        # print(K)
        L = torch.linalg.cholesky(K)
        alpha = torch.cholesky_solve(self.y, L)

        Apart1 = self.y.T @ alpha
        Apart2 = torch.sum(torch.log(L.diagonal()))
        # Apart2 = torch.det(K)
        # print('Before Apart2', Apart2)
        # Apart2 = Apart2.clamp(Apart2, min=10**-20)
        # Apart2 = torch.log(Apart2)
        # Apart3 = self.N * torch.log(2*self.pi)

        A = 0.5 * (Apart1 + Apart2)[0, 0]

        # Bpart1 = B
        # Bpart2 = 0.5*(self.num_latent_points *
        #                                    self.input_dim*torch.log(2*self.pi))

        # B = Bpart1# + Bpart2

        # print("A1", Apart1, "A2", Apart2, "B", B, "Loss", A+B, 'local var', self.local_gp_std)
        return (A + B) / self.X.nelement()
Example #21
 def train(self, X, y, method='cholesky', alpha=1e-2):
     """
     Compute the output weights with a linear regression.
     Parameters:
       - X: input sequence of shape (seq_len, res_size)
       - y: target output (seq_len, out_dim)
       - method: "cholesky" or "sklearn ridge"
       - alpha: L2-regularization parameter
     
     Returns: a tensor of shape (res_size, out_dim)
     """
     if method == 'cholesky':
         # This technique uses the Cholesky decomposition
         # It should be fast when res_size < seq_len
         Xt_y = X.T @ y  # size (res_size, out_dim)
         K = X.T @ X  # size (res_size, res_size)
         K.view(-1)[::len(K) +
                    1] += alpha  # add elements on the diagonal inplace
         L = torch.cholesky(K, upper=False)
         return torch.cholesky_solve(Xt_y, L, upper=False)
     elif method == 'sklearn ridge':
         from sklearn.linear_model import Ridge
         clf = Ridge(fit_intercept=False, alpha=alpha)
         clf.fit(X.cpu().numpy(), y.cpu().numpy())
         return torch.from_numpy(clf.coef_.T).to(self.device)
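A quick check (illustrative, not from the project) that the 'cholesky' branch above solves the ridge normal equations (X^T X + alpha*I) W = X^T y:

import torch

torch.manual_seed(0)
X = torch.randn(200, 30, dtype=torch.float64)   # (seq_len, res_size)
y = torch.randn(200, 3, dtype=torch.float64)    # (seq_len, out_dim)
alpha = 1e-2
K = X.T @ X + alpha * torch.eye(30, dtype=torch.float64)
W_chol = torch.cholesky_solve(X.T @ y, torch.linalg.cholesky(K))
W_direct = torch.linalg.solve(K, X.T @ y)
print(torch.allclose(W_chol, W_direct))  # True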
Example #22
    def __call__(self, x, full_cov=False):
        # Training mode
        if self.training:
            if self.train_inputs is None:
                raise RuntimeError(
                    "train_inputs, train_targets cannot be None in training mode. "
                    "Call .eval() for prior predictions, or call .set_train_data() to add training data."
                )
            if settings.debug.on():
                if not torch.equal(self.X, x):
                    raise RuntimeError("You must train on the training inputs!")
            return self.forward(x)

        # Prior mode
        elif settings.prior_mode.on() or self.train_inputs is None or self.train_targets is None:
            full_output = self.forward(x)
            if settings.debug().on():
                if not isinstance(full_output, gpytorch.distributions.MultivariateNormal):
                    raise RuntimeError("ExactGP.forward must return a MultivariateNormal")
            return full_output

        # Posterior mode
        else:
            cov_data_query = self.covar_module(self.X, x).evaluate()
            prior_pred = self.forward(x)
            pred_mean = prior_pred.mean.view(-1, 1) + cov_data_query.t() @ self.y_weights
            cov_weights = torch.cholesky_solve(cov_data_query, self.chol_cov_data)

            if full_cov:
                pred_cov = prior_pred.covariance_matrix - cov_data_query.t() @ cov_weights
            else:  # Evaluates only diagonal (variances) as a diagonal lazy matrix
                diag_k = gpytorch.lazy.DiagLazyTensor(prior_pred.lazy_covariance_matrix.diag())
                pred_cov = diag_k.add_diag(-cov_data_query.t().matmul(cov_weights).diag())

        return gpytorch.distributions.MultivariateNormal(pred_mean.view_as(prior_pred.mean), pred_cov)
Example #23
    def test_cg_with_tridiag(self):
        size = 10
        matrix = torch.randn(size, size, dtype=torch.float64)
        matrix = matrix.matmul(matrix.transpose(-1, -2))
        matrix.div_(matrix.norm())
        matrix.add_(torch.eye(matrix.size(-1), dtype=torch.float64).mul_(1e-1))

        rhs = torch.randn(size, 50, dtype=torch.float64)
        solves, t_mats = linear_cg(matrix.matmul,
                                   rhs=rhs,
                                   n_tridiag=5,
                                   max_tridiag_iter=10,
                                   max_iter=size,
                                   tolerance=0,
                                   eps=1e-15)

        # Check cg
        matrix_chol = torch.linalg.cholesky(matrix)
        actual = torch.cholesky_solve(rhs, matrix_chol)
        self.assertTrue(torch.allclose(solves, actual, atol=1e-3, rtol=1e-4))

        # Check tridiag
        eigs = torch.linalg.eigvalsh(matrix)
        for i in range(5):
            approx_eigs = torch.linalg.eigvalsh(t_mats[i])
            self.assertTrue(
                torch.allclose(eigs, approx_eigs, atol=1e-3, rtol=1e-4))
Example #24
 def step(self, actions, shadow=False):
     x = self.data[:, 0, :self.nstep]
     y = self.data[:, 1, :self.nstep]
     K = torch.exp(-1 / 2 * (
         (x.view(self.batch_size, self.nstep, 1) -
          x.view(self.batch_size, 1, self.nstep)) / 0.1)**2) + torch.eye(
              self.nstep, dtype=torch.float, device=self.device) * 1e-5
     u = torch.cholesky(K)
     k = torch.exp(-1 / 2 * ((x.view(self.batch_size, self.nstep) -
                              actions.view(self.batch_size, 1)) / 0.1)**2)
     sol = torch.cholesky_solve(
         torch.cat((k.view(self.batch_size, self.nstep,
                           1), y.view(self.batch_size, self.nstep, 1)),
                   dim=2), u)
     #print(sol)
     mav = torch.matmul(k.view(self.batch_size, 1, self.nstep),
                        sol).view(self.batch_size, 2)  #check shapes!
     #print(mav)
     newy = torch.normal(mav[:, 1], 1 - mav[:, 0])
     newbest = torch.max(self.best, newy)
     reward = newbest - self.best
     if not shadow:
         self.best = newbest
     self.data[:, 0, self.nstep] = actions.view(self.batch_size)
     self.data[:, 1, self.nstep] = newy
     self.nstep = self.nstep + 1
     return self.data[:, :, self.nstep - 1], reward
Example #25
 def _update_projections(self, Y, R, component_batches):
     print('Updating projections...')
     for bstart, bend in component_batches:
         self.t_matrix[bstart:bend, :, :] = torch.cholesky_solve(
             Y[bstart:bend, :, :].transpose(1, 2),
             torch.cholesky(R[bstart:bend, :, :], upper=True),
             upper=True)
Example #26
    def test_batch_cg_with_tridiag(self):
        batch = 5
        size = 10
        matrix = torch.randn(batch, size, size, dtype=torch.float64)
        matrix = matrix.matmul(matrix.transpose(-1, -2))
        matrix.div_(matrix.norm())
        matrix.add_(torch.eye(matrix.size(-1), dtype=torch.float64).mul_(1e-1))

        rhs = torch.randn(batch, size, 10, dtype=torch.float64)
        solves, t_mats = linear_cg(matrix.matmul,
                                   rhs=rhs,
                                   n_tridiag=8,
                                   max_iter=size,
                                   max_tridiag_iter=10,
                                   tolerance=0,
                                   eps=1e-30)

        # Check cg
        matrix_chol = torch.linalg.cholesky(matrix)
        actual = torch.cholesky_solve(rhs, matrix_chol)
        self.assertTrue(torch.allclose(solves, actual, atol=1e-3, rtol=1e-4))

        # Check tridiag
        for i in range(5):
            eigs = matrix[i].symeig()[0]
            for j in range(8):
                approx_eigs = t_mats[j, i].symeig()[0]
                self.assertTrue(
                    torch.allclose(eigs, approx_eigs, atol=1e-3, rtol=1e-4))
Example #27
    def fit(self, X, labels):
        """
        Fit method
        Args:
        - X: a tensor, appropriately flattened, having sizes: (Batch Size, Features, 1)
        - labels: a tensor of labels, having sizes: (Batch Size, 1)
        """
        #for p in self.W_comp:
        #	p.data.clamp_(0) #projection to ensure positive semi-definiteness

        #W_soft = F.softmax(self.W_comp)

        self.kern = torch.sum(torch.stack([
            self.W_comp[i] * self.kernel[i](X) for i in range(self.nb_kernels)
        ]),
                              dim=0)
        K = self.kern + torch.eye(
            self.kern.size()[0]).to(device) * self.lambda_reg
        L = torch.cholesky(K, upper=False)
        one_hot_y = F.one_hot(labels, num_classes=10).type(
            torch.FloatTensor).to(device)

        #A, _ = torch.solve(kern, L)
        #V, _ = torch.solve(one_hot_y, L)
        #alpha = A.T @ V
        self.alpha = torch.cholesky_solve(one_hot_y, L, upper=False)
Example #28
    def KL(self):
        """
        The KL divergence from the variational distribution to the prior

        :return: KL divergence from N(q_mu, q_sqrt) to N(0, I), independently for each GP
        """
        # if self.white:
        #     return gauss_kl(self.q_mu, self.q_sqrt)
        # else:
        #     return gauss_kl(self.q_mu, self.q_sqrt, self.Ku)

        self.build_cholesky_if_needed()

        KL = -0.5 * self.num_outputs * self.num_inducing
        KL -= 0.5 * torch.cumsum(torch.log(torch.stack(tuple(t.diag() for t in torch.unbind(self.q_sqrt,0))) ** 2),dim=0)[:,a.size(1)-1] #error check

        if not self.white:
            KL += torch.cumsum(torch.log(torch.stack(tuple(t.diag() for t in torch.unbind(self.q_sqrt,0)))),dim=0)[:,a.size(1)-1]  * self.num_outputs
            KL += 0.5 * torch.cumsum(torch.square(torch.triangular_solve(self.Lu_tiled, self.q_sqrt, upper=False)),dim=0)[:,a.size(1)-1]
            Kinv_m = torch.cholesky_solve(self.q_mu , self.Lu)
            KL += 0.5 * torch.cumsum(self.q_mu * Kinv_m, dim=0)[:,a.size(1)-1]
        else:
            KL += 0.5 * torch.cumsum(torch.square(self.q_sqrt),dim=0)[:,a.size(1)-1]
            KL += 0.5 * torch.cumsum(self.q_mu**2,dim=0)[:,a.size(1)-1]

        return KL
Example #29
    def _sample_posterior(self, x, num_samples, context=None):
        log_weights = torch.log(self.module.soft_max(self.module.soft_weights))
        T = self.module.covars[None, :, :, :] + x[1][:, None, :, :]

        p_weights = log_weights + dist.MultivariateNormal(
            loc=self.module.means, covariance_matrix=T
        ).log_prob(x[0][:, None, :])
        p_weights -= torch.logsumexp(p_weights, axis=1)[:, None]

        L_t = torch.cholesky(T)
        T_inv = torch.cholesky_solve(
            torch.eye(self.d, device=self.device), L_t)

        diff = x[0][:, None, :] - self.module.means
        T_prod = torch.matmul(T_inv, diff[:, :, :, None])
        p_means = self.module.means + torch.matmul(
            self.module.covars,
            T_prod
        ).squeeze()

        p_covars = self.module.covars - torch.matmul(
            self.module.covars,
            torch.matmul(T_inv, self.module.covars)
        )

        idx = dist.Categorical(logits=p_weights).sample([num_samples])
        samples = dist.MultivariateNormal(
            loc=p_means, covariance_matrix=p_covars).sample([num_samples])

        return samples.transpose(0, 1)[
            torch.arange(len(x), device=self.device)[:, None, None, None],
            torch.arange(num_samples, device=self.device)[None, :, None, None],
            idx.T[:, :, None, None],
            torch.arange(self.d, device=self.device)[None, None, None, :]
        ].squeeze()
Example #30
    def backward(ctx, g):
        l, = ctx.saved_tensors
        n = l.shape[-1]
        # TODO: Use cholesky_inverse once pytorch/pytorch/issues/7500 is solved.
        grad_x = g.view(*l.shape[:-2], 1, 1) * torch.cholesky_solve(
            torch.eye(n, out=l.new(n, n)), l)

        return grad_x, None, None