def factor_solve_kkt(Q, D, G, A, rx, rs, rz, ry):
    nineq, nz, neq, _ = get_sizes(G, A)

    # H_ and g_ are the same with or without equality constraints.
    H_ = torch.cat([
        torch.cat([Q, torch.zeros(nz, nineq).type_as(Q)], 1),
        torch.cat([torch.zeros(nineq, nz).type_as(Q), D], 1)
    ], 0)
    g_ = torch.cat([rx, rs], 0)

    if neq > 0:
        A_ = torch.cat([
            torch.cat([G, torch.eye(nineq).type_as(Q)], 1),
            torch.cat([A, torch.zeros(neq, nineq).type_as(Q)], 1)
        ], 0)
        h_ = torch.cat([rz, ry], 0)
    else:
        A_ = torch.cat([G, torch.eye(nineq).type_as(Q)], 1)
        h_ = rz

    U_H_ = torch.potrf(H_)

    invH_A_ = torch.potrs(A_.t(), U_H_)
    invH_g_ = torch.potrs(g_.view(-1, 1), U_H_).view(-1)

    S_ = torch.mm(A_, invH_A_)
    U_S_ = torch.potrf(S_)
    # combine in 1D first, then reshape to a column for potrs
    t_ = (torch.mv(A_, invH_g_) - h_).view(-1, 1)
    w_ = -torch.potrs(t_, U_S_).view(-1)
    v_ = torch.potrs((-g_ - torch.mv(A_.t(), w_)).view(-1, 1), U_H_).view(-1)

    return v_[:nz], v_[nz:], w_[:nineq], w_[nineq:] if neq > 0 else None

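# A smoke test for factor_solve_kkt -- a sketch, not part of the original
# solver. It assumes an old (pre-1.0) PyTorch where torch.potrf/torch.potrs
# still exist, and it stubs out the get_sizes helper the function expects;
# all shapes below are hypothetical.
import torch


def get_sizes(G, A=None):
    # hypothetical stand-in: (nineq, nz, neq, _) from the constraint matrices
    neq = A.size(0) if A is not None else 0
    return G.size(0), G.size(1), neq, None


nz, nineq, neq = 4, 3, 2
L = torch.randn(nz, nz)
Q = L.mm(L.t()) + 0.1 * torch.eye(nz)   # SPD Hessian block
D = torch.eye(nineq)                    # SPD slack scaling
G = torch.randn(nineq, nz)
A = torch.randn(neq, nz)
rx, rs = torch.randn(nz), torch.randn(nineq)
rz, ry = torch.randn(nineq), torch.randn(neq)

dx, ds, dz, dy = factor_solve_kkt(Q, D, G, A, rx, rs, rz, ry)
print(dx.size(), ds.size(), dz.size(), dy.size())
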
def pre_factor_kkt(Q, G, A):
    """ Perform all one-time factorizations and cache relevant matrix products"""
    nineq, nz, neq, _ = get_sizes(G, A)

    # S = [ A Q^{-1} A^T   A Q^{-1} G^T          ]
    #     [ G Q^{-1} A^T   G Q^{-1} G^T + D^{-1} ]
    U_Q = torch.potrf(Q)

    # partial Cholesky of the S matrix
    U_S = torch.zeros(neq + nineq, neq + nineq).type_as(Q)
    G_invQ_GT = torch.mm(G, torch.potrs(G.t(), U_Q))
    R = G_invQ_GT
    if neq > 0:
        invQ_AT = torch.potrs(A.t(), U_Q)
        A_invQ_AT = torch.mm(A, invQ_AT)
        G_invQ_AT = torch.mm(G, invQ_AT)

        # TODO: torch.potrf sometimes says the matrix is not PSD even though
        # numpy's Cholesky succeeds. I filed an issue at
        # https://github.com/pytorch/pytorch/issues/199
        try:
            U11 = torch.potrf(A_invQ_AT)
        except RuntimeError:
            U11 = torch.Tensor(np.linalg.cholesky(
                A_invQ_AT.cpu().numpy())).type_as(A_invQ_AT)

        # TODO: torch.trtrs is currently not implemented on the GPU
        # and we are using gesv as a workaround.
        U12 = torch.gesv(G_invQ_AT.t(), U11.t())[0]

        U_S[:neq, :neq] = U11
        U_S[:neq, neq:] = U12
        R -= torch.mm(U12.t(), U12)

    return U_Q, U_S, R

def solve_kkt(U_Q, d, G, A, U_S, rx, rs, rz, ry, dbg=False):
    """ Solve KKT equations for the affine step"""
    nineq, nz, neq, _ = get_sizes(G, A)

    invQ_rx = torch.potrs(rx.view(-1, 1), U_Q).view(-1)
    if neq > 0:
        h = torch.cat([torch.mv(A, invQ_rx) - ry,
                       torch.mv(G, invQ_rx) + rs / d - rz], 0)
    else:
        h = torch.mv(G, invQ_rx) + rs / d - rz

    w = -torch.potrs(h.view(-1, 1), U_S).view(-1)

    g1 = -rx - torch.mv(G.t(), w[neq:])
    if neq > 0:
        g1 -= torch.mv(A.t(), w[:neq])
    g2 = -rs - w[neq:]

    dx = torch.potrs(g1.view(-1, 1), U_Q).view(-1)
    ds = g2 / d
    dz = w[neq:]
    dy = w[:neq] if neq > 0 else None

    if dbg:
        import sys
        import IPython
        IPython.embed()
        sys.exit(-1)

    return dx, ds, dz, dy

def forward(self, x_train, y_train, x_test=None):
    # See the autograd section for explanation of what happens here.
    n = x_train.size(0)
    p = x_train.size(-1)

    d = torch.zeros(n, n)
    for i in range(p):
        d += 0.5 * (x_train[:, i].unsqueeze(1)
                    - x_train[:, i].unsqueeze(0)).pow(2) / self.lengthscale[i].pow(2)
    kyy = self.sigma_f.pow(2) * torch.exp(-d) + self.sigma_n.pow(2) * torch.eye(n)

    c = torch.cholesky(kyy, upper=True)
    # v = torch.potrs(y_train, c, upper=True)
    v, _ = torch.gesv(y_train.unsqueeze(1), kyy)
    # v = torch.cholesky_solve(y_train.unsqueeze(1), c, upper=True)

    if x_test is None:
        out = (c, v)
    else:
        with torch.no_grad():
            ntest = x_test.size(0)
            d = torch.zeros(ntest, n)
            for i in range(p):
                d += 0.5 * (x_test[:, i].unsqueeze(1)
                            - x_train[:, i].unsqueeze(0)).pow(2) / self.lengthscale[i].pow(2)
            kfy = self.sigma_f.pow(2) * torch.exp(-d)

            # solve
            f_test = kfy.mm(v)
            tmp = torch.potrs(kfy.t(), c, upper=True)
            # tmp = torch.cholesky_solve(kfy.t(), c, upper=True)
            tmp = torch.sum(kfy * tmp.t(), dim=1)
            cov_f = self.sigma_f.pow(2) - tmp
            out = (f_test, cov_f)
    return out

def _output_output_covariance(self, input, Beta_a, lengthscale_a, variance_a, mu_a,
                              Kff_inv_a, Beta_b, lengthscale_b, variance_b, mu_b,
                              Kff_inv_b, mean, covariance):
    """
    :param input: training inputs, H x (n+m)
    :param Beta_a: cached Beta for output dim a, 1 x H
    :param lengthscale_a: length scale of the RBF kernel for output dim a, 1 x (n+m)
    :param Kff_inv_a: for output dim a, H x H
    :param variance_a: variance of the kernel for output dim a
    :param mu_a: prediction for the mean of the GP under uncertain inputs for output dim a
    :param Beta_b: cached Beta for output dim b, 1 x H
    :param lengthscale_b: length scale of the RBF kernel for output dim b, 1 x (n+m)
    :param Kff_inv_b: for output dim b, H x H
    :param variance_b: variance of the kernel for output dim b
    :param mu_b: prediction for the mean of the GP under uncertain inputs for output dim b
    :param mean: mean for the uncertain inputs, 1 x (n+m)
    :param covariance: covariance for the uncertain inputs, (n+m) x (n+m)
    :return:
    """
    assert (input.size()[1] == mean.size()[1])
    mean.requires_grad = True
    covariance.requires_grad = True

    # eq 12 of ref.[1]
    # (1/Lambda_a + 1/Lambda_b)^{-1} as a diagonal matrix; take the elementwise
    # reciprocal *before* diag() so the off-diagonal entries stay zero.
    mat1 = (1 / (1 / lengthscale_a + 1 / lengthscale_b)).diag()
    R = mat1 + covariance
    det = (torch.det(R) ** -0.5) * (torch.det(mat1) ** 0.5)

    # H x 1 x (n+m) -/+ H x (n+m) = H x H x (n+m)
    diff_m = (input.unsqueeze(1) - input) / 2.
    sum_m = (input.unsqueeze(1) * lengthscale_a + input * lengthscale_b) / (lengthscale_a + lengthscale_b)

    mat2 = R.potrf(upper=False)
    mat3 = torch.potrs(torch.eye(mat1.size()[0]), mat2, upper=False)

    # elementwise computation, H x H
    mat4 = ((diff_m ** 2 / (lengthscale_a + lengthscale_b)).sum(dim=-1)) * -0.5
    mat5 = sum_m - mean
    # H x H x 1 x (n+m) @ (n+m) x (n+m) @ H x H x (n+m) x 1 = H x H x 1 x 1
    # TODO: maybe add some jitter?
    mat6 = (torch.matmul(mat5.unsqueeze(2), torch.matmul(mat3, mat5.unsqueeze(-1)))) * -0.5

    # H x H
    L = variance_a * variance_b * det * torch.mul(
        torch.exp(mat4), torch.exp(mat6.view(input.size()[0], input.size()[0])))
    cov = torch.matmul(Beta_a, torch.matmul(L, Beta_b)) - mu_a * mu_b

    # the diagonal term
    if ((Beta_a == Beta_b).all() and (lengthscale_a == lengthscale_b).all()
            and (variance_a == variance_b).all() and (mu_a == mu_b).all()
            and (Kff_inv_a == Kff_inv_b).all()):
        cov = cov + variance_a - torch.trace(torch.matmul(Kff_inv_a, L))

    # TODO: compute the gradient analytically instead of via backward()
    cov.backward()
    return cov, mean.grad.data, covariance.grad.data

def train_locator_model(self, model_XTX, model_XTY, model=None):
    if model is None:
        # direct solve: model = XTX^{-1} XTY via Cholesky
        model = torch.potrs(model_XTY, torch.potrf(model_XTX))
    else:
        # warm start: refine the previous model with Gauss-Seidel sweeps
        for _ in range(30):
            model, _ = torch.trtrs(model_XTY - torch.mm(torch.triu(model_XTX, diagonal=1), model),
                                   torch.tril(model_XTX, diagonal=0), upper=False)
    return model

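# The warm-start branch of train_locator_model above is a Gauss-Seidel sweep:
# with XTX split into its lower triangle L (including the diagonal) and its
# strictly upper triangle U, each pass solves L x_new = XTY - U x_old, which
# converges for symmetric positive definite XTX. A standalone sketch with
# hypothetical shapes, assuming an old PyTorch that still has torch.trtrs:
import torch

a = torch.randn(4, 4)
XTX = a.t().mm(a) + torch.eye(4)   # SPD normal-equations matrix
XTY = torch.randn(4, 1)

x = torch.zeros(4, 1)
for _ in range(30):
    x, _ = torch.trtrs(XTY - torch.mm(torch.triu(XTX, diagonal=1), x),
                       torch.tril(XTX, diagonal=0), upper=False)
print((XTX.mm(x) - XTY).norm())  # near zero once the sweeps have converged
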
def test_cg_with_tridiag(self):
    size = 10
    matrix = torch.DoubleTensor(size, size).normal_()
    matrix = matrix.matmul(matrix.transpose(-1, -2))
    matrix.div_(matrix.norm())
    matrix.add_(torch.DoubleTensor(matrix.size(-1)).fill_(1e-1).diag())

    rhs = torch.DoubleTensor(size, 50).normal_()
    solves, t_mats = linear_cg(
        matrix.matmul,
        rhs=rhs,
        n_tridiag=5,
        max_iter=size,
        tolerance=0,
    )

    # Check cg
    matrix_chol = matrix.potrf()
    actual = torch.potrs(rhs, matrix_chol)
    self.assertTrue(approx_equal(solves, actual))

    # Check tridiag
    eigs = matrix.symeig()[0]
    for i in range(5):
        approx_eigs = t_mats[i].symeig()[0]
        self.assertTrue(approx_equal(eigs, approx_eigs))

def gauss_kl_diag(q_mu, q_sqrt, K):
    """
    Compute the KL divergence from

        q(x) = N(q_mu, q_sqrt^2)
    to
        p(x) = N(0, K)

    We assume multiple independent distributions, given by the columns of
    q_mu and q_sqrt.

    q_mu is a matrix; each column contains a mean.

    q_sqrt is a matrix; each column represents the diagonal of a square-root
        matrix of the covariance of q.

    K is a positive definite matrix: the covariance of p.
    """
    L = torch.potrf(K, upper=False)
    alpha, _ = torch.gesv(q_mu, L)
    KL = 0.5 * (alpha**2).sum()  # Mahalanobis term
    num_latent = q_sqrt.size(1)
    KL += num_latent * torch.diag(L).log().sum()  # prior log-det term
    KL += -0.5 * q_sqrt.numel()  # constant term
    KL += -q_sqrt.log().sum()  # log-det of q-cov
    # torch.potrs returns a single tensor (unlike torch.gesv)
    K_inv = torch.potrs(Variable(torch.eye(L.size(0), out=L.data.new())),
                        L, upper=False)
    KL += 0.5 * (torch.diag(K_inv).unsqueeze(1) * q_sqrt**2).sum()  # trace term
    return KL

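# A quick sanity check for gauss_kl_diag -- a sketch assuming the same
# old-PyTorch API (torch.potrf / torch.gesv / torch.potrs and autograd
# Variable) that the function itself uses. With K = I, q_mu = 0 and
# q_sqrt = 1, q and p are both N(0, I), so the KL should come out (near)
# zero: the -0.5 * numel constant term cancels against the trace term.
import torch
from torch.autograd import Variable

n, num_latent = 4, 2
q_mu = Variable(torch.zeros(n, num_latent))
q_sqrt = Variable(torch.ones(n, num_latent))
K = Variable(torch.eye(n))
print(gauss_kl_diag(q_mu, q_sqrt, K))  # expect ~0
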
def test_batch_cg_with_tridiag(self):
    batch = 5
    size = 10
    matrix = torch.DoubleTensor(batch, size, size).normal_()
    matrix = matrix.matmul(matrix.transpose(-1, -2))
    matrix.div_(matrix.norm())
    matrix.add_(torch.DoubleTensor(matrix.size(-1)).fill_(1e-1).diag())

    rhs = torch.DoubleTensor(batch, size, 50).normal_()
    solves, t_mats = linear_cg(
        matrix.matmul,
        rhs=rhs,
        n_tridiag=8,
        max_iter=size,
        tolerance=0,
    )

    # Check cg
    matrix_chol = torch.cat(
        [matrix[i].potrf().unsqueeze(0) for i in range(batch)])
    actual = torch.cat([
        torch.potrs(rhs[i], matrix_chol[i]).unsqueeze(0) for i in range(batch)
    ])
    self.assertTrue(approx_equal(solves, actual))

    # Check tridiag
    for i in range(batch):
        eigs = matrix[i].symeig()[0]
        for j in range(8):
            approx_eigs = t_mats[j, i].symeig()[0]
            self.assertLess(
                torch.mean(torch.abs((eigs - approx_eigs) / eigs)),
                0.05,
            )

def test_potrs(self):
    chol = torch.tensor([[1, 0, 0, 0],
                         [2, 1, 0, 0],
                         [0, 1, 2, 0],
                         [0, 0, 2, 3]], dtype=torch.float).unsqueeze(0)
    mat = torch.randn(1, 4, 3)
    self.assertTrue(
        approx_equal(torch.potrs(mat[0], chol[0], upper=False),
                     tridiag_batch_potrs(mat, chol, upper=False)[0])
    )

def batch_potrs(mat, chol):
    """Apply torch.potrs to each matrix in a batch.

    TODO: Replace with torch batch potrs once it is implemented.
    """
    potrs_list = []
    for i in range(mat.size(0)):
        potrs_list.append(torch.potrs(mat[i], chol[i]).unsqueeze(0))
    return torch.cat(potrs_list, 0)

def filtering(self, observation, mu_s_curr, sigma_s_curr, index=None):
    """
    Filtering: update the prediction p(x(k) | y(1:k-1)) with the measurement
    model p(y(k) | x(k)) to obtain p(x(k) | y(1:k)).

    :param observation: the current measurement y(k)
    :param mu_s_curr: mean of p(x(k) | y(1:k-1))
    :param sigma_s_curr: covariance of p(x(k) | y(1:k-1))
    :return: mean and covariance of p(x(k) | y(1:k))
    """
    # first compute the prediction of the measurement based on the observation model
    if self.option == "GP":
        mu_o_curr, sigma_o_curr = self._prediction(self.X_o, self.Beta_o, self.lengthscale_o,
                                                   self.K_o_var, self.Kff_o_inv, self.noise_o,
                                                   mu_s_curr, sigma_s_curr, flag='filtering')
        Cov_yx, Cov_xy = self._compute_cov(self.X_o, mu_s_curr, self.mu_o_curr,
                                           self.lengthscale_o, sigma_s_curr,
                                           self.K_o_var, self.Beta_o)
    else:
        assert (index != 0 and index <= len(self.Xu_o)), \
            "state transition models have dimension {}, index is {}.".format(len(self.Xu_o), index)
        mu_o_curr, sigma_o_curr = self._prediction(self.Xu_o[index], self.zip_cached_o,
                                                   mu_s_curr, sigma_s_curr)
        Cov_yx, Cov_xy = self._compute_cov(self.Xu_o[index], mu_s_curr, self.mu_o_curr,
                                           self.lengthscale_o, sigma_s_curr,
                                           self.K_o_var, self.Beta_o)
    self.mu_o_curr, self.sigma_o_curr = mu_o_curr, sigma_o_curr

    sigma_o_curr_inv = torch.potrs(torch.eye(sigma_o_curr.size()[0]),
                                   sigma_o_curr.potrf(upper=False), upper=False)

    mu_hat_s_curr = mu_s_curr + torch.matmul(
        Cov_xy, torch.matmul(sigma_o_curr_inv, (observation - mu_o_curr)))
    sigma_hat_s_curr = sigma_s_curr - torch.matmul(
        Cov_xy, torch.matmul(sigma_o_curr_inv, Cov_yx))

    self.mu_hat_s_curr, self.sigma_hat_s_curr = mu_hat_s_curr, sigma_hat_s_curr
    self.mu_hat_s_curr_lis.append(self.mu_hat_s_curr.clone())
    self.sigma_hat_s_curr_lis.append(self.sigma_hat_s_curr.clone())
    return mu_hat_s_curr, sigma_hat_s_curr

def test_cg_with_tridiag(self):
    size = 10
    matrix = torch.randn(size, size, dtype=torch.float64)
    matrix = matrix.matmul(matrix.transpose(-1, -2))
    matrix.div_(matrix.norm())
    matrix.add_(torch.eye(matrix.size(-1), dtype=torch.float64).mul_(1e-1))

    rhs = torch.randn(size, 50, dtype=torch.float64)
    solves, t_mats = linear_cg(
        matrix.matmul,
        rhs=rhs,
        n_tridiag=5,
        max_tridiag_iter=10,
        max_iter=size,
        tolerance=0,
        eps=1e-15,
    )

    # Check cg
    matrix_chol = matrix.cholesky(upper=True)
    actual = torch.potrs(rhs, matrix_chol)
    self.assertTrue(approx_equal(solves, actual))

    # Check tridiag
    eigs = matrix.symeig()[0]
    for i in range(5):
        approx_eigs = t_mats[i].symeig()[0]
        self.assertTrue(approx_equal(eigs, approx_eigs))

def test_batch_cg_with_tridiag(self):
    batch = 5
    size = 10
    matrix = torch.randn(batch, size, size, dtype=torch.float64)
    matrix = matrix.matmul(matrix.transpose(-1, -2))
    matrix.div_(matrix.norm())
    matrix.add_(torch.eye(matrix.size(-1), dtype=torch.float64).mul_(1e-1))

    rhs = torch.randn(batch, size, 50, dtype=torch.float64)
    solves, t_mats = linear_cg(
        matrix.matmul,
        rhs=rhs,
        n_tridiag=8,
        max_iter=size,
        max_tridiag_iter=10,
        tolerance=0,
        eps=1e-20,
    )

    # Check cg
    matrix_chol = torch.cholesky(matrix, upper=True)
    actual = torch.potrs(rhs, matrix_chol)
    self.assertTrue(approx_equal(solves, actual))

    # Check tridiag
    for i in range(batch):
        eigs = matrix[i].symeig()[0]
        for j in range(8):
            approx_eigs = t_mats[j, i].symeig()[0]
            self.assertLess(
                torch.mean(torch.abs((eigs - approx_eigs) / eigs)), 0.05)

def linear_solve_compat(matrix, matrix_chol, y):
    """Solves the equation ``torch.mm(matrix, x) = y`` for x."""
    if matrix.requires_grad or y.requires_grad:
        # If derivatives are required, use the more expensive gesv.
        return torch.gesv(y, matrix)[0]
    else:
        # Use the cheaper Cholesky solver.
        return torch.potrs(y, matrix_chol)

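# Minimal usage sketch for linear_solve_compat, exercising both branches.
# Shapes are hypothetical; assumes a PyTorch old enough to still provide
# torch.potrf / torch.potrs / torch.gesv.
import torch

a = torch.randn(5, 5)
matrix = a.mm(a.t()) + 0.1 * torch.eye(5)   # SPD system matrix
matrix_chol = torch.potrf(matrix)           # upper-triangular Cholesky factor
y = torch.randn(5, 2)

x = linear_solve_compat(matrix, matrix_chol, y)   # no grads -> Cholesky path
print((matrix.mm(x) - y).norm())                  # near zero

matrix.requires_grad_()                           # grads -> gesv path
x = linear_solve_compat(matrix, matrix_chol, y)
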
def test_potrs():
    chol = torch.Tensor([
        [1, 0, 0, 0],
        [2, 1, 0, 0],
        [0, 1, 2, 0],
        [0, 0, 2, 3],
    ]).unsqueeze(0)
    mat = torch.randn(1, 4, 3)
    assert approx_equal(torch.potrs(mat[0], chol[0], upper=False),
                        tridiag_batch_potrs(mat, chol, upper=False)[0])

def batch_potrs(mat, chol):
    """
    TODO: Replace with torch batch potrs once it is implemented.
    """
    potrs_list = [
        torch.potrs(sub_mat, sub_chol) for sub_mat, sub_chol in zip(
            mat.view(-1, *mat.shape[-2:]), chol.view(-1, *chol.shape[-2:]))
    ]
    res = torch.cat(potrs_list, 0)
    return res.view_as(mat)

def woodbury_factor(low_rank_mat, shift):
    """
    Given a low rank (k x n) matrix V and a shift, returns the
    matrix R so that

        R = (I_k + 1/shift VV')^{-1}V

    to be used in solves with (V'V + shift I) via the Woodbury formula.
    """
    k = low_rank_mat.size(-2)
    shifted_mat = (1 / shift) * low_rank_mat.matmul(low_rank_mat.transpose(-1, -2))
    shifted_mat = shifted_mat + shifted_mat.new(k).fill_(1).diag()

    if low_rank_mat.ndimension() == 3:
        R = torch.cat([
            torch.potrs(low_rank_mat[i], shifted_mat[i].potrf()).unsqueeze(0)
            for i in range(shifted_mat.size(0))
        ])
    else:
        R = torch.potrs(low_rank_mat, shifted_mat.potrf())

    return R

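# Hypothetical companion to woodbury_factor (no solve routine appears in this
# excerpt). With R = (I_k + VV'/shift)^{-1} V as returned above, the Woodbury
# identity gives
#     (V'V + shift*I)^{-1} y = (y - V' (R y) / shift) / shift,
# so solves against the shifted low-rank system only need k x k
# factorizations. A non-batch sketch:
def woodbury_solve(low_rank_mat, woodbury_factor_mat, rhs, shift):
    # V' (R rhs), then apply the identity above
    tmp = low_rank_mat.transpose(-1, -2).matmul(woodbury_factor_mat.matmul(rhs))
    return (rhs - tmp / shift) / shift

# usage: V is k x n with k << n
# V = torch.randn(3, 8); R = woodbury_factor(V, 0.5)
# x = woodbury_solve(V, R, torch.randn(8, 2), 0.5)
# then (V.t().mm(V) + 0.5 * torch.eye(8)).mm(x) approximately recovers the rhs
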
def cache_variable(self):
    for (i, GP_dyn) in enumerate(self.GP_dyn):
        if self.option == 'GP':
            noise = GP_dyn.guide()
            Kff = GP_dyn.kernel(self.X_hat).contiguous()
            # add the noise to the diagonal of Kff
            Kff.view(-1)[::self.X_hat.size()[0] + 1] += GP_dyn.get_param('noise')
            Lff = Kff.potrf(upper=False)
            self.Kff_inv[i, :, :] = torch.potrs(torch.eye(self.X_hat.size()[0]), Lff, upper=False)
            self.Beta[i, :] = torch.potrs(self.dX[:, i], Lff, upper=False).squeeze(-1)
            self.K_var[i] = GP_dyn.kernel.get_param("variance")
            self.lengthscale[i, :] = GP_dyn.kernel.get_param("lengthscale")
            self.noise[i, :] = noise
        else:
            Xu, noise = GP_dyn.guide()
            if GP_dyn.approx == 'DTC' or GP_dyn.option == 'VFE':
                Kff_inv, Beta = self._compute_cached_var_ssgp(GP_dyn, Xu, noise, "DTC")
            else:
                Kff_inv, Beta = self._compute_cached_var_ssgp(GP_dyn, Xu, noise, "FITC")
            self.Beta[i, :] = Beta
            self.Kff_inv[i, :, :] = Kff_inv
            self.K_var[i] = GP_dyn.kernel.get_param("variance")
            self.lengthscale[i, :] = GP_dyn.kernel.get_param("lengthscale")
            self.Xu[i, :] = Xu
            self.noise[i, :, :] = noise
        print("variable caching for dynamics model {} is done!".format(i))
    print("initialization is done!")

def test_cg(self):
    size = 100
    matrix = torch.DoubleTensor(size, size).normal_()
    matrix = matrix.matmul(matrix.transpose(-1, -2))
    matrix.div_(matrix.norm())
    matrix.add_(torch.DoubleTensor(matrix.size(-1)).fill_(1e-1).diag())

    rhs = torch.DoubleTensor(size, 50).normal_()
    solves = linear_cg(matrix.matmul, rhs=rhs, max_iter=size)

    # Check cg
    matrix_chol = matrix.potrf()
    actual = torch.potrs(rhs, matrix_chol)
    self.assertTrue(approx_equal(solves, actual))

def test_cg(self):
    size = 100
    matrix = torch.randn(size, size, dtype=torch.float64)
    matrix = matrix.matmul(matrix.transpose(-1, -2))
    matrix.div_(matrix.norm())
    matrix.add_(torch.eye(matrix.size(-1), dtype=torch.float64).mul_(1e-1))

    rhs = torch.randn(size, 50, dtype=torch.float64)
    solves = linear_cg(matrix.matmul, rhs=rhs, max_iter=size)

    # Check cg
    matrix_chol = matrix.cholesky(upper=True)
    actual = torch.potrs(rhs, matrix_chol)
    self.assertTrue(approx_equal(solves, actual))

def variance_propagation(self, input, Beta, lengthscale, variance, Kff_inv, mu,
                         mean, covariance, noise, flag='prediction'):
    """
    Variance of the propagation of the GP for uncertain inputs.

    :param input: training inputs, N x D or N x E
    :param Beta: cached Beta, 1 x N
    :param lengthscale: length scale of the RBF kernel, 1 x D
    :param Kff_inv: N x N
    :param variance: variance of the kernel
    :param mu: prediction for the mean of the GP under uncertain inputs
    :param mean: mean for the uncertain inputs, 1 x D or 1 x E
    :param covariance: covariance for the uncertain inputs, D x D or E x E
    :return:
    """
    assert (input.size()[1] == mean.size()[1])

    # eq 11 of ref.[1]
    with torch.no_grad():
        mat1 = (lengthscale.diag() / 2. + covariance)
        det = (torch.det(mat1) ** -0.5) * (torch.det(lengthscale.diag()) ** 0.5)

        # N x 1 x D (E) -/+ N x D (E) = N x N x D (E)
        diff_m = (input.unsqueeze(1) - input) / 2.
        sum_m = (input.unsqueeze(1) + input) / 2.

        mat2 = mat1.potrf(upper=False)
        mat3 = torch.potrs(torch.eye(mat1.size()[0]), mat2, upper=False)

        # elementwise computation, N x N
        mat4 = ((diff_m ** 2 / lengthscale * 2).sum(dim=-1)) * -0.5
        # N x N x 1 x D @ D x D @ N x N x D x 1 = N x N x 1 x 1 (or D replaced by E)
        # TODO: maybe add some jitter?
        mat5 = sum_m - mean
        mat6 = (torch.matmul(mat5.unsqueeze(2), torch.matmul(mat3, mat5.unsqueeze(-1)))) * -0.5

        # N x N
        L = variance**2 * det * torch.mul(
            torch.exp(mat4), torch.exp(mat6.view(input.size()[0], input.size()[0])))
        var = (torch.matmul(Beta, torch.matmul(L, Beta)) + variance
               - torch.trace(torch.matmul(Kff_inv, L)) - mu * mu + 2 * noise)
    return var

def train_locator_model(self, locator_features, model=None):
    regularization = self.params.regularization
    if self.regularization_matrix is None:
        self.regularization_matrix = regularization * torch.eye(
            locator_features.shape[1], device=self.params.device)
    train_XTX = torch.mm(locator_features.t(), locator_features)
    train_XTX = train_XTX + self.regularization_matrix
    train_XTY = torch.mm(locator_features.t(), self.labels)
    if model is None:
        # direct solve via Cholesky
        model = torch.potrs(train_XTY, torch.potrf(train_XTX))
    else:
        # warm start: the same Gauss-Seidel refinement as train_locator_model above
        for _ in range(30):
            model, _ = torch.trtrs(train_XTY - torch.mm(torch.triu(train_XTX, diagonal=1), model),
                                   torch.tril(train_XTX, diagonal=0), upper=False)
    return model

def forward(self, x_train, y_train, x_test=None, classify=False):
    # See the autograd section for explanation of what happens here.
    self.classify = classify
    n = x_train.size(0)

    kyy = torch.empty(n, n)
    for i in range(n):
        for j in range(i, n):
            # integrate over the cov func
            out = self.int2D(self.cov_func, x_train[i, 0], x_train[i, 1],
                             x_train[j, 0], x_train[j, 1], 1e-6)
            kyy[i, j] = out
            if i != j:
                kyy[j, i] = out
    kyy = kyy + self.sigma_n.pow(2) * torch.eye(n)

    with torch.no_grad():
        e, _ = kyy.eig()
        mine = torch.min(e[:, 0])
        if mine < 1e-6:
            print('chol correction')
            kyy = kyy + 1.1 * (1e-6 - torch.eye(n) * mine).abs()

    c = torch.cholesky(kyy, upper=True)
    # v = torch.potrs(y_train, c, upper=True)
    v, _ = torch.gesv(y_train.unsqueeze(1), kyy)

    if x_test is None:
        out = (c, v)
    else:
        with torch.no_grad():
            ntest = x_test.size(0)
            kfy = torch.empty(ntest, n)
            for i in range(ntest):
                for j in range(n):
                    # integrate over the cov func
                    out = self.int1D(lambda x: self.cov_func(x_test[i], x),
                                     x_train[j, 0], x_train[j, 1], 1e-6)
                    kfy[i, j] = out

            # solve
            f_test = kfy.mm(v)
            tmp = torch.potrs(kfy.t(), c, upper=True)
            tmp = torch.sum(kfy * tmp.t(), dim=1)
            cov_f = self.sigma_f.pow(2) - tmp
            out = (f_test, cov_f)
    return out

def test_batch_cg():
    batch = 5
    size = 100
    matrix = torch.DoubleTensor(batch, size, size).normal_()
    matrix = matrix.matmul(matrix.transpose(-1, -2))
    matrix.div_(matrix.norm())
    matrix.add_(torch.DoubleTensor(matrix.size(-1)).fill_(1e-1).diag())

    rhs = torch.DoubleTensor(batch, size, 50).normal_()
    solves = linear_cg(matrix.matmul, rhs=rhs, max_iter=size)

    # Check cg
    matrix_chol = torch.cat([matrix[i].potrf().unsqueeze(0) for i in range(batch)])
    actual = torch.cat(
        [torch.potrs(rhs[i], matrix_chol[i]).unsqueeze(0) for i in range(batch)])
    assert approx_equal(solves, actual)

def covariance_propagation(self, input, Beta_a, lengthscale_a, variance_a, mu_a,
                           Beta_b, lengthscale_b, variance_b, mu_b, mean, covariance):
    """
    :param input: training inputs, N x D or N x E
    :param Beta_a: cached Beta for output dim a, 1 x N
    :param lengthscale_a: length scale of the RBF kernel for output dim a, 1 x D
    :param variance_a: variance of the kernel for output dim a
    :param mu_a: prediction for the mean of the GP under uncertain inputs for output dim a
    :param Beta_b: cached Beta for output dim b, 1 x N
    :param lengthscale_b: length scale of the RBF kernel for output dim b, 1 x D
    :param variance_b: variance of the kernel for output dim b
    :param mu_b: prediction for the mean of the GP under uncertain inputs for output dim b
    :param mean: mean for the uncertain inputs, 1 x D or 1 x E
    :param covariance: covariance for the uncertain inputs, D x D or E x E
    :return:
    """
    assert (input.size()[1] == mean.size()[1])

    # eq 12 of ref.[1]
    with torch.no_grad():
        # (1/Lambda_a + 1/Lambda_b)^{-1} as a diagonal matrix; take the elementwise
        # reciprocal *before* diag() so the off-diagonal entries stay zero.
        mat1 = (1 / (1 / lengthscale_a + 1 / lengthscale_b)).diag()
        R = mat1 + covariance
        det = (torch.det(R) ** -0.5) * (torch.det(mat1) ** 0.5)

        # N x 1 x D (E) -/+ N x D (E) = N x N x D (E)
        diff_m = (input.unsqueeze(1) - input) / 2.
        sum_m = (input.unsqueeze(1) * lengthscale_a + input * lengthscale_b) / (lengthscale_a + lengthscale_b)

        mat2 = R.potrf(upper=False)
        mat3 = torch.potrs(torch.eye(mat1.size()[0]), mat2, upper=False)

        # elementwise computation, N x N
        mat4 = ((diff_m ** 2 / (lengthscale_a + lengthscale_b)).sum(dim=-1)) * -0.5
        # N x N x 1 x D @ D x D @ N x N x D x 1 = N x N x 1 x 1 (or D replaced by E)
        # TODO: maybe add some jitter?
        mat5 = sum_m - mean
        mat6 = (torch.matmul(mat5.unsqueeze(2), torch.matmul(mat3, mat5.unsqueeze(-1)))) * -0.5

        # N x N
        L = variance_a * variance_b * det * torch.mul(
            torch.exp(mat4), torch.exp(mat6.view(input.size()[0], input.size()[0])))
        cov = torch.matmul(Beta_a, torch.matmul(L, Beta_b)) - mu_a * mu_b
    return cov

def forward(self, x_train, y_train, x_test=None):
    # See the autograd section for explanation of what happens here.
    n = x_train.size(0)

    q1 = (x_train[:, 1].view(n, 1) - x_train[:, 0].view(n, 1).t()) / (2 * self.lengthscale.pow(2)).sqrt()
    q2 = (x_train[:, 1].view(n, 1) - x_train[:, 1].view(n, 1).t()) / (2 * self.lengthscale.pow(2)).sqrt()
    m1 = (x_train[:, 0].view(n, 1) - x_train[:, 0].view(n, 1).t()) / (2 * self.lengthscale.pow(2)).sqrt()
    m2 = (x_train[:, 0].view(n, 1) - x_train[:, 1].view(n, 1).t()) / (2 * self.lengthscale.pow(2)).sqrt()

    kyy = self.sigma_f.pow(2) * (
        self.lengthscale.pow(2) * math.sqrt(math.pi) * (
            ((q1 * torch.erf(q1) + torch.exp(-q1.pow(2)) / math.sqrt(math.pi))
             - (q2 * torch.erf(q2) + torch.exp(-q2.pow(2)) / math.sqrt(math.pi)))
            + ((m2 * torch.erf(m2) + torch.exp(-m2.pow(2)) / math.sqrt(math.pi))
               - (m1 * torch.erf(m1) + torch.exp(-m1.pow(2)) / math.sqrt(math.pi))))
        + self.sigma_n.pow(2) * torch.eye(n))
    # d = 0.5 * (x_train - x_train.t()).pow(2) / self.lengthscale.pow(2)
    # kyy = self.sigma_f.pow(2) * torch.exp(-d) + self.sigma_n.pow(2) * torch.eye(n)

    c = torch.cholesky(kyy, upper=True)
    # v = torch.potrs(y_train, c, upper=True)
    v, _ = torch.gesv(y_train, kyy)  # kyy^-1 * y

    if x_test is None:
        out = (c, v)
    else:
        with torch.no_grad():
            kfy = ((math.sqrt(math.pi) / 2)
                   * (torch.erf((x_train[:, 1].view(n, 1) - x_test.t())
                                / math.sqrt(2 * self.lengthscale.pow(2)))
                      - torch.erf((x_train[:, 0].view(n, 1) - x_test.t())
                                  / math.sqrt(2 * self.lengthscale.pow(2))))
                   * math.sqrt(2 * self.lengthscale.pow(2)))
            kfy = self.sigma_f.pow(2) * kfy.t()

            # solve
            f_test = kfy.mm(v)
            tmp = torch.potrs(kfy.t(), c, upper=True)
            tmp = torch.sum(kfy * tmp.t(), dim=1)
            cov_f = self.sigma_f.pow(2) - tmp
            out = (f_test, cov_f)
    return out

def bpotrs(b, u, upper=True, out=None):
    """Batch-solve a linear system of equations with a positive semidefinite
    matrix, given its Cholesky factor.

    Workaround for "RuntimeError: the derivative for 'potri' is not implemented".
    """
    s = u.size()  # (m, N, D, D)
    D = s[-1]
    b_view = b.view(-1, D)  # (mN, D)
    u_view = u.view((-1,) + s[-2:])
    c = Variable(b_view.data.new(b_view.size()))
    for i in range(c.size()[0]):
        c[i, :] = torch.potrs(b_view[i, :], u_view[i, :, :], upper)
    if out is not None:
        out.copy_(c.view(b.size()))
        return out
    return c.view(b.size())

def woodbury_factor(low_rank_mat, shift):
    """
    Given a low rank (k x n) matrix V and a shift, returns the
    matrix R so that

        R = (I_k + 1/shift VV')^{-1}V

    to be used in solves with (V'V + shift I) via the Woodbury formula.
    """
    k = low_rank_mat.size(-2)
    shifted_mat = low_rank_mat.matmul(low_rank_mat.transpose(-1, -2) / shift.unsqueeze(-1))
    shifted_mat = shifted_mat + torch.eye(k, dtype=shifted_mat.dtype, device=shifted_mat.device)

    if low_rank_mat.ndimension() == 3:
        R = batch_potrs(low_rank_mat, batch_potrf(shifted_mat))
    else:
        R = torch.potrs(low_rank_mat, shifted_mat.potrf())

    return R

def _output_mean(self, input, Beta, lengthscale, variance, mean, covariance):
    """
    Mean of the propagation of the GP for uncertain inputs.

    :param input: training inputs H x (n+m)
    :param Beta: cached Beta 1 x H
    :param lengthscale: length scale of the RBF kernel 1 x (n+m)
    :param variance: variance of the kernel
    :param mean: mean for the uncertain inputs 1 x (n+m)
    :param covariance: covariance for the uncertain inputs (n+m) x (n+m)
    :return:
    """
    # prediction of the GP mean for uncertain inputs
    mean.requires_grad = True
    covariance.requires_grad = True
    assert (input.size()[1] == mean.size()[1])

    # eq 9 of ref. [1]
    mat1 = (lengthscale.diag() + covariance)
    det = variance * (torch.det(mat1) ** -0.5) * (torch.det(lengthscale.diag()) ** 0.5)
    diff = input - mean

    # H x 1 x (n+m) @ (n+m) x (n+m) @ H x (n+m) x 1 = H x 1 x 1
    # TODO: maybe add some jitter?
    mat2 = mat1.potrf(upper=False)
    mat3 = torch.potrs(torch.eye(mat1.size()[0]), mat2, upper=False)
    mat4 = (torch.matmul(diff.unsqueeze(1), torch.matmul(mat3, diff.unsqueeze(-1)))) * -0.5

    # (H,)
    l = det * torch.exp(mat4.view(-1))
    mu = torch.matmul(Beta, l)

    # TODO: compute the gradient analytically instead of via backward()
    mu.backward()
    return mu, mean.grad.data, covariance.grad.data

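# For reference, _output_mean matches the standard RBF moment-matching mean
# (presumably eq. 9 of the code's ref. [1]; the mapping below is our reading,
# not confirmed by this excerpt). With Lambda = diag(lengthscale), uncertain
# input x* ~ N(m, S), and training inputs x_i:
#
#     mu = sum_i Beta_i * l_i
#     l_i = variance * |Lambda + S|^{-1/2} * |Lambda|^{1/2}
#           * exp(-0.5 * (x_i - m)' (Lambda + S)^{-1} (x_i - m))
#
# which is exactly det = variance * det(mat1)^{-1/2} * det(Lambda)^{1/2} with
# mat1 = Lambda + S, and the quadratic form computed via the Cholesky solve
# (mat3 = mat1^{-1}) above. Note |Lambda + S|^{-1/2} |Lambda|^{1/2}
# = |I + Lambda^{-1} S|^{-1/2}.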