def predict(self, Xnew: torch.Tensor, Xtrain: torch.Tensor, ytrain: torch.Tensor):
    """
    Inputs:
        :Xnew: n_newsamples * n_features
        :Xtrain: n_samples * n_features
        :ytrain: n_samples / n_samples * n_output
    Returns:
        A list of 1-d Normal distributions whose mean/scale are scalar tensors
    """
    device = Xnew.device
    Xnew, Xtrain = self.transformer(Xnew), self.transformer(Xtrain)
    K = self.to_matrix(self.kernel(Xtrain))
    Kprime = K + self.alpha * torch.eye(K.shape[0], device=device)
    L = torch.cholesky(Kprime)
    kXnew = self.to_matrix(self.kernel(Xnew, Xtrain))
    ytrain = ytrain.flatten()
    ytrain_minus_mean = ytrain - self.mean(Xtrain)
    # logging.debug(self.mean)
    mean_ynew = self.mean(Xnew) + torch.squeeze(
        torch.matmul(kXnew, torch.cholesky_solve(ytrain_minus_mean[:, None], L)),
        dim=-1)
    logging.debug("mean={}".format(mean_ynew))
    # Solve L L^T _x = kXnew^T
    _x = torch.cholesky_solve(kXnew.T, L)
    kXnewXnew = self.to_matrix(self.kernel(Xnew))
    std_ynew = torch.sqrt(
        torch.diag(kXnewXnew) - torch.einsum("ij,ji->i", kXnew, _x))
    logging.debug("std={}".format(std_ynew))
    return [Normal(mu, sigma) for mu, sigma in zip(mean_ynew, std_ynew)]
def _solve_linear_system(self, A, B):
    """Solves linear system AX = B.

    If B is a tuple (B1, B2, ...), returns tuple (X1, X2, ...).
    Otherwise returns X.
    """
    B_sizes = None
    # If B is a tuple, concatenate into a single tensor:
    if isinstance(B, (tuple, list)):
        B_sizes = list(map(lambda x: x.size(-1), B))
        B = torch.cat(B, dim=-1)
    # If B is 2D, add a trailing dim so it is batched (b x m x n):
    if len(B.size()) == 2:
        B = B.unsqueeze(-1)
    try:
        # Batchwise Cholesky solve
        A_decomp = torch.cholesky(A, upper=False)
        X = torch.cholesky_solve(B, A_decomp, upper=False)  # b x m x n
    except RuntimeError:
        # Revert to a loop if the batchwise solve fails
        X = torch.zeros_like(B)
        for i in range(A.size(0)):
            try:
                # Cholesky solve
                A_decomp = torch.cholesky(A[i, ...], upper=False)
                X[i, ...] = torch.cholesky_solve(B[i, ...], A_decomp,
                                                 upper=False)  # m x n
            except RuntimeError:
                # Revert to LU solve
                X[i, ...], _ = torch.solve(B[i, ...], A[i, ...])  # m x n
    if B_sizes is not None:
        X = X.split(B_sizes, dim=-1)
    return X
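# Standalone sketch of the same pattern (illustrative, not part of the class
# above): solve a batched SPD system for two right-hand sides at once by
# concatenating them, then split the solution back. All tensor names and
# shapes below are assumptions.
import torch

torch.manual_seed(0)
b, m = 3, 5
A = torch.randn(b, m, m)
A = A @ A.transpose(-1, -2) + m * torch.eye(m)     # batched SPD matrices
B1 = torch.randn(b, m, 2)
B2 = torch.randn(b, m, 4)

B = torch.cat((B1, B2), dim=-1)
L = torch.linalg.cholesky(A)
X = torch.cholesky_solve(B, L)                     # b x m x 6
X1, X2 = X.split([B1.size(-1), B2.size(-1)], dim=-1)

assert torch.allclose(A @ X1, B1, atol=1e-3)
assert torch.allclose(A @ X2, B2, atol=1e-3)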
def predict(self, Z, full=False, tensor=False):
    with torch.no_grad():
        Z = self._check_input(Z)  # MxD

        K = self.kernel(self.X) + self.noise() * self.eye  # NxN
        Ks = self.kernel(self.X, Z)  # NxM
        Kss = self.kernel(Z) + self.noise() * torch.eye(
            Z.shape[0], device=config.device, dtype=config.dtype)  # MxM

        L = self._cholesky(K)  # NxN
        v = torch.triangular_solve(Ks, L, upper=False)[0]  # NxM

        if self.mean is not None:
            y = self.y - self.mean(self.X).reshape(-1, 1)  # Nx1
            mu = Ks.T.mm(torch.cholesky_solve(y, L))  # Mx1
            mu += self.mean(Z).reshape(-1, 1)  # Mx1
        else:
            mu = Ks.T.mm(torch.cholesky_solve(self.y, L))  # Mx1
        var = Kss - v.T.mm(v)  # MxM

        if not full:
            var = var.diag().reshape(-1, 1)  # Mx1
        if tensor:
            return mu, var
        else:
            return mu.cpu().numpy(), var.cpu().numpy()
def __computeMeansAndVarsGivenKernelMatrices(self, Kzz, KzzChol, Ktz, KttDiag):
    nTrials = KttDiag.shape[0]
    nQuad = KttDiag.shape[1]
    nLatent = KttDiag.shape[2]
    qKMu = torch.empty((nTrials, nQuad, nLatent), dtype=Kzz[0].dtype,
                       device=Kzz[0].device)
    qKVar = torch.empty((nTrials, nQuad, nLatent), dtype=Kzz[0].dtype,
                        device=Kzz[0].device)
    qSigma = self._svPosteriorOnIndPoints.buildQSigma()
    for k in range(len(self._svPosteriorOnIndPoints.getQMu())):
        # Ak \in nTrials x nInd[k] x 1
        Ak = torch.cholesky_solve(self._svPosteriorOnIndPoints.getQMu()[k],
                                  KzzChol[k])
        # qKMu \in nTrials x nQuad x nLatent
        qKMu[:, :, k] = torch.squeeze(torch.matmul(Ktz[k], Ak))
        # Bkf \in nTrials x nInd[k] x nQuad
        Bkf = torch.cholesky_solve(Ktz[k].transpose(dim0=1, dim1=2), KzzChol[k])
        # mm1f \in nTrials x nInd[k] x nQuad
        mm1f = torch.matmul(qSigma[k] - Kzz[k], Bkf)
        # aux1 \in nTrials x nInd[k] x nQuad
        aux1 = Bkf * mm1f
        # aux2 \in nTrials x nQuad
        aux2 = torch.sum(input=aux1, dim=1)
        # aux3 \in nTrials x nQuad
        aux3 = KttDiag[:, :, k] + aux2
        # qKVar \in nTrials x nQuad x nLatent
        qKVar[:, :, k] = aux3
    return qKMu, qKVar
def _inner_no_grad(self, x, u, v=None, *, keepdim=False):
    l = torch.cholesky(x)
    x_inv_u = torch.cholesky_solve(u, l)
    if v is None:
        x_inv_v = x_inv_u
    else:
        x_inv_v = torch.cholesky_solve(v, l)
    return multitrace(torch.matmul(x_inv_u, x_inv_v), keepdim=keepdim)
def compute_damped_gn_update(jacobian, output_error, damping):
    """
    Compute the damped Gauss-Newton update, based on the given Jacobian and
    output error.
    Args:
        jacobian (torch.Tensor): 2D tensor containing the Jacobian of the
            flattened output with respect to the flattened parameters for
            which the GN update is computed.
        output_error (torch.Tensor): tensor containing the gradient of the
            loss with respect to the output layer of the network.
        damping (float): positive damping hyperparameter
    Returns:
        the damped Gauss-Newton update for the parameters for which the
        Jacobian was computed.
    """
    if damping < 0:
        raise ValueError('Positive value for damping expected, got '
                         '{}'.format(damping))
    # The Jacobian also flattens the output dimension, so we need to do
    # the same.
    output_error = output_error.view(-1, 1).detach()

    if damping == 0:
        # If the damping is 0, the curvature matrix C = J^T J can be
        # rank deficient. Therefore, it is numerically best to compute the
        # pseudo-inverse explicitly and afterwards multiply with it.
        jacobian_pinv = torch.pinverse(jacobian)
        gn_updates = jacobian_pinv.mm(output_error)
    else:
        # If damping is greater than 0, the curvature matrix C will be
        # positive definite and symmetric. Numerically, it is most
        # efficient to use the Cholesky decomposition to compute the
        # resulting Gauss-Newton updates.
        # As (J^T J + l*I)^{-1} J^T = J^T (J J^T + l*I)^{-1}, we select
        # the variant that is most computationally efficient, depending on
        # the number of rows and columns of J (we want to factorize the
        # smallest possible matrix, as this is the most expensive
        # operation). Note that we solve a linear system with Cholesky
        # instead of explicitly computing the inverse, as this is more
        # efficient.
        if jacobian.shape[0] >= jacobian.shape[1]:
            G = jacobian.t().mm(jacobian)
            C = G + damping * torch.eye(G.shape[0])
            C_cholesky = torch.cholesky(C)
            jacobian_error = jacobian.t().matmul(output_error)
            gn_updates = torch.cholesky_solve(jacobian_error, C_cholesky)
        else:
            G = jacobian.mm(jacobian.t())
            C = G + damping * torch.eye(G.shape[0])
            C_cholesky = torch.cholesky(C)
            inverse_error = torch.cholesky_solve(output_error, C_cholesky)
            gn_updates = jacobian.t().matmul(inverse_error)
    return gn_updates
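# Hypothetical usage sketch (not part of the original code): for a tall
# Jacobian, the damped update returned above should match the closed form
# (J^T J + damping * I)^{-1} J^T e computed with a dense inverse. The
# shapes and the damping value below are assumptions.
import torch

torch.manual_seed(0)
J = torch.randn(20, 6)    # assumed flattened Jacobian: 20 outputs x 6 params
e = torch.randn(20, 1)    # assumed output-error vector
damping = 1e-2

update = compute_damped_gn_update(J, e, damping)
reference = torch.inverse(J.t() @ J + damping * torch.eye(6)) @ (J.t() @ e)
assert torch.allclose(update, reference, atol=1e-5)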
def __call__(self, A, b):
    l = self.cache.get(id(A))
    if l is None:
        l = robust_cholesky(A)
        self.cache[id(A)] = l
    if b.ndim == 1:
        return torch.cholesky_solve(b.unsqueeze(-1), l).squeeze(-1)
    else:
        return torch.cholesky_solve(b, l)
def inner(self, x, u, v, keepdim=False):
    # FIXME(ccruceru): This is not currently differentiable.
    assert not x.requires_grad and \
        not u.requires_grad and \
        not v.requires_grad

    l = self.chol(x)
    x_inv_u = torch.cholesky_solve(u, l)
    x_inv_v = torch.cholesky_solve(v, l)
    return tb.trace(x_inv_u @ x_inv_v, keepdim=keepdim)
def test_cholesky_solve(batch_shape, size):
    b = torch.randn(batch_shape + (size, 5))
    x = torch.randn(batch_shape + (size, size))
    x = x.transpose(-1, -2).matmul(x)
    u = x.cholesky()
    expected = torch.cholesky_solve(b, u)
    assert not expected.requires_grad
    actual = torch.cholesky_solve(b.requires_grad_(), u.requires_grad_())
    assert actual.requires_grad
    assert_close(expected, actual)
def forward(ctx, mu0, mu1, cho0, cho1):
    """
    Calculate the product of two Gaussians. Here each Gaussian is represented
    as (mu, cholesky), where the covariance is cholesky^T cholesky and
    cholesky is an upper triangular matrix.
    :param ctx: holder
    :param mu0: mu for gaussian 1
    :param mu1: mu for gaussian 2
    :param cho0: cholesky matrix for gaussian 1
    :param cho1: cholesky matrix for gaussian 2
    :return: scale: the scale of the product of the two gaussians
    :return: mu_new: the mu of the new gaussian
    :return: cho_new: the cholesky matrix of the new gaussian
    """
    dim = mu0.size(-1)
    # inverse of the covariances (precision matrices)
    # TODO the inverse calculation can use numpy or pytorch; we consider torch first.
    # lambda0 = np.linalg.inv(var0)
    # lambda1 = np.linalg.inv(var1)
    eye = torch.eye(dim).unsqueeze_(0).double()
    lambda0 = torch.cholesky_solve(eye, cho0, upper=True)
    lambda1 = torch.cholesky_solve(eye, cho1, upper=True)

    mu0 = mu0.unsqueeze(-1)
    mu1 = mu1.unsqueeze(-1)
    eta0 = torch.matmul(lambda0, mu0)
    eta1 = torch.matmul(lambda1, mu1)

    # calculate zeta
    diag0 = torch.diagonal(cho0, dim1=1, dim2=2)
    diag1 = torch.diagonal(cho1, dim1=1, dim2=2)
    zeta0 = -0.5 * (dim * np.log(np.pi * 2)
                    - torch.sum(torch.log(diag0 * diag0), dim=-1)
                    + mu0.transpose(1, 2).matmul(eta0).reshape(-1))
    zeta1 = -0.5 * (dim * np.log(np.pi * 2)
                    - torch.sum(torch.log(diag1 * diag1), dim=-1)
                    + mu1.transpose(1, 2).matmul(eta1).reshape(-1))

    lambda_new = lambda0 + lambda1
    eta_new = eta0 + eta1
    var_new = torch.inverse(lambda_new)
    cho_new = torch.cholesky(var_new, upper=True)
    mu_new = torch.matmul(var_new, eta_new)
    diag_new = torch.diagonal(cho_new, dim1=1, dim2=2)
    zeta_new = -0.5 * (dim * np.log(np.pi * 2)
                       - torch.sum(torch.log(diag_new * diag_new), dim=-1)
                       + mu_new.transpose(1, 2).matmul(eta_new).reshape(-1))
    scale = zeta0 + zeta1 - zeta_new
    return scale, mu_new, cho_new
def _update_variational_moments(self, x, y):
    C = self.current_C_matrix(x)
    c = self.current_c_vec(x, y)
    z_b = self.variational_strategy.inducing_points
    Kbb = self.covar_module(z_b).evaluate()
    L = psd_safe_cholesky(Kbb + C.evaluate(), upper=False, jitter=self._jitter)
    m_b = Kbb @ torch.cholesky_solve(c, L, upper=False)
    S_b = Kbb @ torch.cholesky_solve(Kbb, L, upper=False)
    return m_b, S_b
def predict(self, x: Tensor, y_dim: int = 1) -> Tuple[Tensor, Tensor]:
    """Predicts mean and covariance.

    Args:
        x (torch.Tensor): Input data for test, size
            `(batch_size, num_points, x_dim)`.
        y_dim (int, optional): Output y dim size for prior.

    Returns:
        y_mean (torch.Tensor): Predicted output, size
            `(batch_size, num_points, y_dim)`.
        y_cov (torch.Tensor): Covariance of the joint predictive distribution
            at the sample points, size `(batch_size, num_points, num_points)`.
    """
    if x.dim() != 3:
        raise ValueError("Dim of x should be 3 (batch_size, num_points, "
                         f"x_dim), but given {x.size()}.")

    # Predict y|x based on GP prior
    if self._x_train is None or self._y_train is None:
        batch_size, num_points, _ = x.size()
        y_mean = torch.zeros(batch_size, num_points, y_dim)
        y_cov = self.gaussian_kernel(x, x)
        return y_mean, y_cov

    # Predict y*|x*, x, y based on GP posterior

    # Shift mean of y_train to 0
    y_mean = self._y_train.mean(dim=[0, 1])
    y_train = self._y_train - y_mean

    # Kernel
    K_nn = self.gaussian_kernel(self._x_train, self._x_train)
    K_xx = self.gaussian_kernel(x, x)
    K_xn = self.gaussian_kernel(x, self._x_train)

    # Solve cholesky for each y_dim
    L_ = torch.cholesky(K_nn.double()).float()
    alpha_ = torch.cholesky_solve(y_train, L_)

    # Mean prediction with undoing normalization
    y_mean = K_xn.matmul(alpha_) + y_mean

    # Cov
    v = torch.cholesky_solve(K_xn.transpose(1, 2), L_)
    y_cov = K_xx - K_xn.matmul(v)

    return y_mean, y_cov
def sample(self, times, regFactor=1e-3):
    Kzz = self._indPointsLocsKMS.getKzz()
    KzzChol = self._indPointsLocsKMS.getKzzChol()

    indPointsLocsAndAllTimesKMS = IndPointsLocsAndAllTimesKMS()
    indPointsLocsAndAllTimesKMS.setKernels(
        kernels=self._indPointsLocsKMS.getKernels())
    indPointsLocsAndAllTimesKMS.setIndPointsLocs(
        indPointsLocs=self._indPointsLocsKMS.getIndPointsLocs())
    indPointsLocsAndAllTimesKMS.setTimes(times=times)
    indPointsLocsAndAllTimesKMS.buildKernelsMatrices()
    indPointsLocsAndAllTimesKMS.buildKttKernelsMatrices()
    Ktz = indPointsLocsAndAllTimesKMS.getKtz()
    Ktt = indPointsLocsAndAllTimesKMS.getKtt()

    qMu = self._svPosteriorOnIndPoints.getQMu()
    qSigma = self._svPosteriorOnIndPoints.buildQSigma()

    nLatents = len(Kzz)
    nTrials = Kzz[0].shape[0]
    samples = [[] for r in range(nTrials)]
    for r in range(nTrials):
        samples[r] = torch.empty((nLatents, Ktt[0].shape[1]), dtype=Kzz[0].dtype)
        for k in range(nLatents):
            print("Processing trial {:d} and latent {:d}".format(r, k))
            Kzzrk = Kzz[k][r, :, :]
            KzzCholrk = KzzChol[k][r, :, :]
            Ktzrk = Ktz[k][r, :, :]
            Kttrk = Ktt[k][r, :, :]
            qMurk = qMu[k][r, :, :]
            qSigmark = qSigma[k][r, :, :]

            ### begin compute mean ###
            b = torch.cholesky_solve(qMurk, KzzCholrk)
            meanrk = torch.squeeze(Ktzrk.matmul(b))
            ### end compute mean ###

            ### begin compute covar ###
            B = torch.cholesky_solve(torch.t(Ktzrk), KzzCholrk)
            covarrk = Kttrk + torch.t(B).matmul(qSigmark - Kzzrk).matmul(B)
            ### end compute covar ###

            covarrk += torch.eye(covarrk.shape[0]) * regFactor
            covarrk = covarrk.detach()
            mn = scipy.stats.multivariate_normal(mean=meanrk, cov=covarrk)
            samples[r][k, :] = torch.from_numpy(mn.rvs())
    return samples
def log_marginal(self, Y, gauss_mean, gauss_cov, **kwargs):
    """
    Computes the log marginal likelihood w.r.t the prior

        log p(y|x) = -1/2 (Y-mu)^T (K+sigma^2 I)^{-1} (Y-mu)
                     - 1/2 log |K+sigma^2 I| - N/2 log(2 pi)

    Args:
        `Y`          (torch.tensor) :->: Observations Y with shape (Dy,MB)
        `gauss_mean` (torch.tensor) :->: mean from p(f). Shape (Dy,MB)
        `gauss_cov`  (torch.tensor) :->: full covariance from p(f). Shape (Dy,MB,MB)
    """
    N = Y.size(1)
    Dy = self.out_dim

    # Compute mean and covariance of the marginal distribution p(y|x).
    # This basically adds the observation noise to the covariance.
    mx, Kxx = self.marginal_moments(gauss_mean, gauss_cov, diagonal=False)

    # reshapes
    mx = mx.view(Dy, N, 1)
    Y = Y.view(Dy, N, 1)

    # solve using cholesky
    Y_mx = Y - mx
    Lxx = psd_safe_cholesky(Kxx, upper=False, jitter=cg.global_jitter)

    # Compute (Y-mu)^T (K+sigma^2 I)^{-1} (Y-mu)
    rhs = torch.cholesky_solve(Y_mx, Lxx, upper=False)
    data_fit_term = torch.matmul(Y_mx.transpose(1, 2), rhs)
    complexity_term = 2 * torch.log(torch.diagonal(Lxx, dim1=1, dim2=2)).sum(1)
    cte = -N / 2. * torch.log(2 * cg.pi)

    return -0.5 * (data_fit_term + complexity_term) + cte
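# Minimal standalone sketch (not from the original class): the same
# Cholesky-based log marginal likelihood as above, checked against
# torch.distributions.MultivariateNormal.log_prob. All names below
# (y, mu, K) and the noise level are illustrative assumptions.
import math
import torch

torch.manual_seed(0)
N = 7
mu = torch.zeros(N)
A = torch.randn(N, N)
K = A @ A.T + 1e-1 * torch.eye(N)   # assumed prior covariance + noise
y = torch.randn(N)

L = torch.linalg.cholesky(K)
diff = (y - mu).unsqueeze(-1)
data_fit = diff.T @ torch.cholesky_solve(diff, L)
complexity = 2 * torch.log(torch.diagonal(L)).sum()
log_marginal = -0.5 * (data_fit + complexity + N * math.log(2 * math.pi))

reference = torch.distributions.MultivariateNormal(
    mu, covariance_matrix=K).log_prob(y)
assert torch.allclose(log_marginal.squeeze(), reference, atol=1e-5)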
def _update_weights(self):
    """
    Method internally used to update the GP weights' posterior mean
    """
    self.weights_train = torch.cholesky_solve(
        self.training_vec, self.training_mat,
        upper=True)  # not to be confused with BLR weights sample
def step(self, actions, shadow=False):
    actions = torch.clamp(actions, 0, 1)
    x = self.data[:, :self.nstep, :-1]
    y = self.data[:, :self.nstep, -1]
    K = exponential_kernel(
        x.unsqueeze(2), x.unsqueeze(1), self.length_scale) + torch.eye(
            self.nstep, dtype=torch.float, device=self.device) * 1e-5
    # print(K)
    # K = torch.exp(-1 / 2 * ((x.view(self.batch_size, self.nstep, 1) - x.view(self.batch_size, 1, self.nstep)) / 0.1) ** 2) + torch.eye(self.nstep, dtype=torch.float, device=self.device) * 1e-5
    u = torch.cholesky(K)
    k = exponential_kernel(x, actions.view(self.batch_size, 1, self.dims),
                           self.length_scale)
    # print(k)
    # k = torch.exp(-1 / 2 * ((x.view(self.batch_size, self.nstep) - actions.view(self.batch_size, 1)) / 0.1) ** 2)
    sol = torch.cholesky_solve(
        torch.cat((k.view(self.batch_size, self.nstep, 1),
                   y.view(self.batch_size, self.nstep, 1)), dim=2), u)
    # print(sol)
    mav = torch.matmul(k.view(self.batch_size, 1, self.nstep),
                       sol).view(self.batch_size, 2)  # check shapes!
    # print(mav)
    newy = torch.normal(mav[:, 1], 1 - mav[:, 0])
    newbest = torch.max(self.best, newy)
    reward = newbest - self.best
    if not shadow:
        self.best = newbest
        self.data[:, self.nstep, :-1] = actions.view(self.batch_size, self.dims)
        self.data[:, self.nstep, -1] = newy
        self.nstep = self.nstep + 1
    return self.data[:, self.nstep - 1], reward.unsqueeze(1), mav
def forward(ctx, A, b):
    u = torch.cholesky(torch.matmul(A.transpose(-1, -2), A), upper=True)
    ret = torch.cholesky_solve(torch.matmul(A.transpose(-1, -2), b), u,
                               upper=True)
    ctx.save_for_backward(u, ret, A, b)
    return ret
def manual_predict(params, train_x, train_y, test_x):
    with torch.no_grad():
        print('computing K and Kt...')
        K = mnl_cov_with_noise(params, train_x)
        Kt = manual_cov(params, test_x, train_x)
        print('computing L...')
        L = torch.cholesky(K)
        alpha = torch.cholesky_solve(torch.t(torch.stack([train_y])), L)
        print('computing mean f...')
        f = torch.matmul(Kt, alpha)[:, 0]
        print('computing predictive variances...')

        def var(i):
            xi = test_x[[i], :]
            ki = torch.t(Kt[[i], :])
            v, _cc = torch.triangular_solve(ki, L, upper=False)
            ret = (manual_cov(params, xi, xi) - torch.matmul(torch.t(v), v)).item()
            return ret

        vars = torch.tensor([var(i) for i in range(len(test_x))])
        return (f, vars)
def _e_step(self, data):
    X, noise_covars = data
    T = self.covars[None, :, :, :] + noise_covars[:, None, :, :]
    try:
        T_chol = torch.cholesky(T)
    except RuntimeError:
        return torch.tensor(float('-inf')), None
    T_inv = torch.cholesky_solve(torch.eye(self.d, device=self.device), T_chol)

    diff = X[:, None, :] - self.means
    T_inv_diff = torch.matmul(T_inv, diff[:, :, :, None])

    log_resps = -0.5 * (torch.matmul(diff[:, :, None, :], T_inv_diff)
                        + self.d * math.log(2 * math.pi)).squeeze()
    log_resps -= T_chol.diagonal(dim1=-2, dim2=-1).log().sum(-1)
    log_resps += torch.log(self.weights[None, :, 0])

    cond_means = self.means + torch.matmul(      # n, j, d
        self.covars[None, :, :, :],              # 1, j, d, d
        T_inv_diff)[:, :, :, 0]
    cond_covars = self.covars - torch.matmul(    # n, j, d, d
        self.covars,                             # j, d, d
        torch.matmul(                            # n, j, d, d
            T_inv,                               # n, j, d, d
            self.covars                          # j, d, d
        ))

    log_prob = torch.logsumexp(log_resps, dim=1, keepdim=True)
    log_resps -= log_prob

    return torch.sum(log_prob), (log_resps, cond_means, cond_covars)
def forward(self):
    K, B = self.GlobalKernel(self.X, self.X)
    dK = K.diagonal()
    dK += self.global_gp_noise_std**2 + self.jitter
    # print(K)
    L = torch.linalg.cholesky(K)
    alpha = torch.cholesky_solve(self.y, L)
    Apart1 = self.y.T @ alpha
    Apart2 = torch.sum(torch.log(L.diagonal()))
    # Apart2 = torch.det(K)
    # print('Before Apart2', Apart2)
    # Apart2 = Apart2.clamp(Apart2, min=10**-20)
    # Apart2 = torch.log(Apart2)
    # Apart3 = self.N * torch.log(2*self.pi)
    A = 0.5 * (Apart1 + Apart2)[0, 0]
    # Bpart1 = B
    # Bpart2 = 0.5*(self.num_latent_points *
    #               self.input_dim*torch.log(2*self.pi))
    # B = Bpart1  # + Bpart2
    # print("A1", Apart1, "A2", Apart2, "B", B, "Loss", A+B, 'local var', self.local_gp_std)
    return (A + B) / self.X.nelement()
def train(self, X, y, method='cholesky', alpha=1e-2):
    """
    Compute the output weights with a linear regression.

    Parameters:
      - X: input sequence of shape (seq_len, res_size)
      - y: target output (seq_len, out_dim)
      - method: "cholesky" or "sklearn ridge"
      - alpha: L2-regularization parameter

    Returns: a tensor of shape (res_size, out_dim)
    """
    if method == 'cholesky':
        # This technique uses the Cholesky decomposition.
        # It should be fast when res_size < seq_len.
        Xt_y = X.T @ y  # size (res_size, out_dim)
        K = X.T @ X  # size (res_size, res_size)
        K.view(-1)[::len(K) + 1] += alpha  # add elements on the diagonal inplace
        L = torch.cholesky(K, upper=False)
        return torch.cholesky_solve(Xt_y, L, upper=False)
    elif method == 'sklearn ridge':
        from sklearn.linear_model import Ridge
        clf = Ridge(fit_intercept=False, alpha=alpha)
        clf.fit(X.cpu().numpy(), y.cpu().numpy())
        return torch.from_numpy(clf.coef_.T).to(self.device)
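# Standalone sketch (illustrative, outside the original class): the same
# Cholesky-based ridge solve as the 'cholesky' branch above, checked against
# the closed form (X^T X + alpha*I)^{-1} X^T y. Shapes and alpha are assumptions.
import torch

torch.manual_seed(0)
X = torch.randn(200, 30)      # (seq_len, res_size)
y = torch.randn(200, 2)       # (seq_len, out_dim)
alpha = 1e-2

K = X.T @ X
K.view(-1)[::K.shape[0] + 1] += alpha        # add alpha to the diagonal in place
L = torch.linalg.cholesky(K)
W_chol = torch.cholesky_solve(X.T @ y, L)    # (res_size, out_dim)

W_ref = torch.inverse(X.T @ X + alpha * torch.eye(30)) @ (X.T @ y)
assert torch.allclose(W_chol, W_ref, atol=1e-3)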
def __call__(self, x, full_cov=False):
    # Training mode
    if self.training:
        if self.train_inputs is None:
            raise RuntimeError(
                "train_inputs, train_targets cannot be None in training mode. "
                "Call .eval() for prior predictions, or call .set_train_data() "
                "to add training data."
            )
        if settings.debug.on():
            if not torch.equal(self.X, x):
                raise RuntimeError("You must train on the training inputs!")
        return self.forward(x)
    # Prior mode
    elif settings.prior_mode.on() or self.train_inputs is None or self.train_targets is None:
        full_output = self.forward(x)
        if settings.debug.on():
            if not isinstance(full_output, gpytorch.distributions.MultivariateNormal):
                raise RuntimeError("ExactGP.forward must return a MultivariateNormal")
        return full_output
    # Posterior mode
    else:
        cov_data_query = self.covar_module(self.X, x).evaluate()
        prior_pred = self.forward(x)
        pred_mean = prior_pred.mean.view(-1, 1) + cov_data_query.t() @ self.y_weights
        cov_weights = torch.cholesky_solve(cov_data_query, self.chol_cov_data)
        if full_cov:
            pred_cov = prior_pred.covariance_matrix - cov_data_query.t() @ cov_weights
        else:
            # Evaluate only the diagonal (variances) as a diagonal lazy matrix
            diag_k = gpytorch.lazy.DiagLazyTensor(prior_pred.lazy_covariance_matrix.diag())
            pred_cov = diag_k.add_diag(-cov_data_query.t().matmul(cov_weights).diag())
        return gpytorch.distributions.MultivariateNormal(
            pred_mean.view_as(prior_pred.mean), pred_cov)
def test_cg_with_tridiag(self):
    size = 10
    matrix = torch.randn(size, size, dtype=torch.float64)
    matrix = matrix.matmul(matrix.transpose(-1, -2))
    matrix.div_(matrix.norm())
    matrix.add_(torch.eye(matrix.size(-1), dtype=torch.float64).mul_(1e-1))

    rhs = torch.randn(size, 50, dtype=torch.float64)
    solves, t_mats = linear_cg(matrix.matmul, rhs=rhs, n_tridiag=5,
                               max_tridiag_iter=10, max_iter=size,
                               tolerance=0, eps=1e-15)

    # Check cg
    matrix_chol = torch.linalg.cholesky(matrix)
    actual = torch.cholesky_solve(rhs, matrix_chol)
    self.assertTrue(torch.allclose(solves, actual, atol=1e-3, rtol=1e-4))

    # Check tridiag
    eigs = torch.linalg.eigvalsh(matrix)
    for i in range(5):
        approx_eigs = torch.linalg.eigvalsh(t_mats[i])
        self.assertTrue(
            torch.allclose(eigs, approx_eigs, atol=1e-3, rtol=1e-4))
def step(self, actions, shadow=False):
    x = self.data[:, 0, :self.nstep]
    y = self.data[:, 1, :self.nstep]
    K = torch.exp(-1 / 2 * (
        (x.view(self.batch_size, self.nstep, 1) -
         x.view(self.batch_size, 1, self.nstep)) / 0.1)**2) + torch.eye(
            self.nstep, dtype=torch.float, device=self.device) * 1e-5
    u = torch.cholesky(K)
    k = torch.exp(-1 / 2 * ((x.view(self.batch_size, self.nstep) -
                             actions.view(self.batch_size, 1)) / 0.1)**2)
    sol = torch.cholesky_solve(
        torch.cat((k.view(self.batch_size, self.nstep, 1),
                   y.view(self.batch_size, self.nstep, 1)), dim=2), u)
    # print(sol)
    mav = torch.matmul(k.view(self.batch_size, 1, self.nstep),
                       sol).view(self.batch_size, 2)  # check shapes!
    # print(mav)
    newy = torch.normal(mav[:, 1], 1 - mav[:, 0])
    newbest = torch.max(self.best, newy)
    reward = newbest - self.best
    if not shadow:
        self.best = newbest
        self.data[:, 0, self.nstep] = actions.view(self.batch_size)
        self.data[:, 1, self.nstep] = newy
        self.nstep = self.nstep + 1
    return self.data[:, :, self.nstep - 1], reward
def _update_projections(self, Y, R, component_batches):
    print('Updating projections...')
    for bstart, bend in component_batches:
        self.t_matrix[bstart:bend, :, :] = torch.cholesky_solve(
            Y[bstart:bend, :, :].transpose(1, 2),
            torch.cholesky(R[bstart:bend, :, :], upper=True),
            upper=True)
def test_batch_cg_with_tridiag(self):
    batch = 5
    size = 10
    matrix = torch.randn(batch, size, size, dtype=torch.float64)
    matrix = matrix.matmul(matrix.transpose(-1, -2))
    matrix.div_(matrix.norm())
    matrix.add_(torch.eye(matrix.size(-1), dtype=torch.float64).mul_(1e-1))

    rhs = torch.randn(batch, size, 10, dtype=torch.float64)
    solves, t_mats = linear_cg(matrix.matmul, rhs=rhs, n_tridiag=8,
                               max_iter=size, max_tridiag_iter=10,
                               tolerance=0, eps=1e-30)

    # Check cg
    matrix_chol = torch.linalg.cholesky(matrix)
    actual = torch.cholesky_solve(rhs, matrix_chol)
    self.assertTrue(torch.allclose(solves, actual, atol=1e-3, rtol=1e-4))

    # Check tridiag
    for i in range(5):
        eigs = matrix[i].symeig()[0]
        for j in range(8):
            approx_eigs = t_mats[j, i].symeig()[0]
            self.assertTrue(
                torch.allclose(eigs, approx_eigs, atol=1e-3, rtol=1e-4))
def fit(self, X, labels):
    """
    Fit method

    Args:
      - X: a tensor, appropriately flattened, having sizes: (Batch Size, Features, 1)
      - labels: a tensor of labels, having sizes: (Batch Size, 1)
    """
    # for p in self.W_comp:
    #     p.data.clamp_(0)  # projection to ensure positive semi-definiteness
    # W_soft = F.softmax(self.W_comp)
    self.kern = torch.sum(torch.stack([
        self.W_comp[i] * self.kernel[i](X) for i in range(self.nb_kernels)
    ]), dim=0)
    K = self.kern + torch.eye(
        self.kern.size()[0]).to(device) * self.lambda_reg
    L = torch.cholesky(K, upper=False)
    one_hot_y = F.one_hot(labels, num_classes=10).type(
        torch.FloatTensor).to(device)
    # A, _ = torch.solve(kern, L)
    # V, _ = torch.solve(one_hot_y, L)
    # alpha = A.T @ V
    self.alpha = torch.cholesky_solve(one_hot_y, L, upper=False)
def KL(self):
    """
    The KL divergence from the variational distribution to the prior
    :return: KL divergence from N(q_mu, q_sqrt) to N(0, I),
             independently for each GP
    """
    # if self.white:
    #     return gauss_kl(self.q_mu, self.q_sqrt)
    # else:
    #     return gauss_kl(self.q_mu, self.q_sqrt, self.Ku)
    self.build_cholesky_if_needed()

    KL = -0.5 * self.num_outputs * self.num_inducing
    KL -= 0.5 * torch.cumsum(
        torch.log(torch.stack(tuple(t.diag() for t in torch.unbind(self.q_sqrt, 0))) ** 2),
        dim=0)[:, a.size(1) - 1]  # error check
    if not self.white:
        KL += torch.cumsum(
            torch.log(torch.stack(tuple(t.diag() for t in torch.unbind(self.q_sqrt, 0)))),
            dim=0)[:, a.size(1) - 1] * self.num_outputs
        KL += 0.5 * torch.cumsum(
            torch.square(torch.triangular_solve(self.Lu_tiled, self.q_sqrt, upper=False)[0]),
            dim=0)[:, a.size(1) - 1]
        Kinv_m = torch.cholesky_solve(self.q_mu, self.Lu)
        KL += 0.5 * torch.cumsum(self.q_mu * Kinv_m, dim=0)[:, a.size(1) - 1]
    else:
        KL += 0.5 * torch.cumsum(torch.square(self.q_sqrt), dim=0)[:, a.size(1) - 1]
        KL += 0.5 * torch.cumsum(self.q_mu ** 2, dim=0)[:, a.size(1) - 1]
    return KL
def _sample_posterior(self, x, num_samples, context=None):
    log_weights = torch.log(self.module.soft_max(self.module.soft_weights))
    T = self.module.covars[None, :, :, :] + x[1][:, None, :, :]

    p_weights = log_weights + dist.MultivariateNormal(
        loc=self.module.means, covariance_matrix=T
    ).log_prob(x[0][:, None, :])
    p_weights -= torch.logsumexp(p_weights, axis=1)[:, None]

    L_t = torch.cholesky(T)
    T_inv = torch.cholesky_solve(torch.eye(self.d, device=self.device), L_t)
    diff = x[0][:, None, :] - self.module.means
    T_prod = torch.matmul(T_inv, diff[:, :, :, None])

    p_means = self.module.means + torch.matmul(
        self.module.covars, T_prod
    ).squeeze()
    p_covars = self.module.covars - torch.matmul(
        self.module.covars, torch.matmul(T_inv, self.module.covars)
    )

    idx = dist.Categorical(logits=p_weights).sample([num_samples])
    samples = dist.MultivariateNormal(
        loc=p_means, covariance_matrix=p_covars).sample([num_samples])

    return samples.transpose(0, 1)[
        torch.arange(len(x), device=self.device)[:, None, None, None],
        torch.arange(num_samples, device=self.device)[None, :, None, None],
        idx.T[:, :, None, None],
        torch.arange(self.d, device=self.device)[None, None, None, :]
    ].squeeze()
def backward(ctx, g):
    l, = ctx.saved_tensors
    n = l.shape[-1]
    # TODO: Use cholesky_inverse once pytorch/pytorch/issues/7500 is solved.
    grad_x = g.view(*l.shape[:-2], 1, 1) * torch.cholesky_solve(
        torch.eye(n, out=l.new(n, n)), l)
    return grad_x, None, None
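# Illustrative check (not from the original Function): the term
# cholesky_solve(I, l) above is just X^{-1} computed from the Cholesky
# factor of X, which is exactly the gradient of logdet(X) scaled by the
# upstream gradient g. Names and shapes below are assumptions.
import torch

torch.manual_seed(0)
A = torch.randn(4, 4, dtype=torch.float64)
X = A @ A.T + torch.eye(4, dtype=torch.float64)    # SPD matrix
L = torch.linalg.cholesky(X)

inv_from_chol = torch.cholesky_solve(torch.eye(4, dtype=torch.float64), L)
assert torch.allclose(inv_from_chol, torch.inverse(X), atol=1e-10)

# autograd agrees: d logdet(X) / dX = X^{-1} for a symmetric X
X_req = X.clone().requires_grad_()
torch.logdet(X_req).backward()
assert torch.allclose(X_req.grad, torch.inverse(X), atol=1e-8)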