# Assumed imports for this snippet: `af` is torch.autograd.functional,
# `nd` is numdifftools; `grad` is the external mpc gradient helper the
# original refers to and is left undefined here.
import time

import numpy as np
import numdifftools as nd
import torch
import torch.autograd.functional as af
from torch import nn


def test_grad():
    # try with a model
    class Network(nn.Module):
        def __init__(self):
            super(Network, self).__init__()
            # nn.Linear(2, 1) has a (1, 2) weight, so only row 0 is
            # addressable; the original out-of-range assignments to
            # weight[1, 1] and weight[1, 2] were dropped.
            self.linear2 = nn.Linear(2, 1)
            self.linear2.weight.data.fill_(0.0)
            self.linear2.weight.data[0, 0] = 1.
            self.linear2.weight.data[0, 1] = 1.

        def forward(self, x):
            pax_predict = self.linear2(x)
            # print(self.linear2.weight.data)
            return pax_predict

    f_t = Network()

    def fun(x):
        t_x = torch.from_numpy(x).float()
        f_x = f_t(t_x).detach().numpy()
        # print(f_x.shape)
        return f_x

    ## 1d square
    # torch (time.clock() was removed in Python 3.8; use perf_counter)
    time_start = time.perf_counter()
    model = Network()
    x = torch.ones((5, 2))
    print([af.jacobian(model, x[i, :]) for i in range(x.shape[0])])
    print([af.hessian(model, x[i, :]) for i in range(x.shape[0])])
    time_e = time.perf_counter() - time_start
    print(time_e)

    # numerical
    time_start = time.perf_counter()
    model = Network()
    x = np.ones((5, 2))
    df = nd.Gradient(fun)
    H = nd.Hessian(fun)
    print(list(map(df, x.tolist())))
    print(list(map(H, x.tolist())))
    time_e = time.perf_counter() - time_start
    print(time_e)

    # from mpc
    time_start = time.perf_counter()
    model = Network()
    x = np.ones((5, 2))
    print(grad(model, x))
    x = torch.ones((5, 2))
    print([af.hessian(model, x[i, :]) for i in range(x.shape[0])])
    time_e = time.perf_counter() - time_start
    print(time_e)
def Newton_for_Nesterov(x, func, epoch=100, h=0.001):
    n = x.shape[0]
    f_line = []
    for i in range(epoch):
        # print(i, end='\r')
        f_line.append(func(x))
        # print(f_line[-1])
        jac = jacobian(func, x.view(n, 1))
        hes = hessian(func, x.view(n, 1)).sum()
        # print('jac: {}'.format(jac))
        # print('hes: {}'.format(hes))
        # print('x: {}'.format(x))
        h = jac / hes
        h = h.view(n)
        # print(h)
        x -= h
    f_line.append(func(x))
    return x, f_line
def test_hessian_vector_valued_postprocessing(self, dev_name, diff_method, mocker, tol):
    """Test hessian calculation of a vector valued QNode with post-processing"""
    if diff_method not in {"parameter-shift", "backprop"}:
        pytest.skip("Test only supports parameter-shift or backprop")

    dev = qml.device(dev_name, wires=1)

    @qnode(dev, diff_method=diff_method, interface="torch")
    def circuit(x):
        qml.RX(x[0], wires=0)
        qml.RY(x[1], wires=0)
        return [qml.expval(qml.PauliZ(0)), qml.expval(qml.PauliZ(0))]

    x = torch.tensor([0.76, -0.87], requires_grad=True, dtype=torch.float64)

    def cost_fn(x):
        return x @ circuit(x)

    a, b = x.detach().numpy()

    res = cost_fn(x)
    expected_res = np.array([a, b]) @ [np.cos(a) * np.cos(b), np.cos(a) * np.cos(b)]
    assert np.allclose(res.detach(), expected_res, atol=tol, rtol=0)

    res.backward()
    g = x.grad
    expected_g = [
        np.cos(b) * (np.cos(a) - (a + b) * np.sin(a)),
        np.cos(a) * (np.cos(b) - (a + b) * np.sin(b)),
    ]
    assert np.allclose(g.detach(), expected_g, atol=tol, rtol=0)

    spy = mocker.spy(JacobianTape, "hessian")
    hess = hessian(cost_fn, x)

    if diff_method == "backprop":
        spy.assert_not_called()
    elif diff_method == "parameter-shift":
        spy.assert_called_once()

    expected_hess = [
        [
            -(np.cos(b) * ((a + b) * np.cos(a) + 2 * np.sin(a))),
            -(np.cos(b) * np.sin(a)) + (-np.cos(a) + (a + b) * np.sin(a)) * np.sin(b),
        ],
        [
            -(np.cos(b) * np.sin(a)) + (-np.cos(a) + (a + b) * np.sin(a)) * np.sin(b),
            -(np.cos(a) * ((a + b) * np.cos(b) + 2 * np.sin(b))),
        ],
    ]
    assert np.allclose(hess.detach(), expected_hess, atol=tol, rtol=0)
def test_hessian(self, dev_name, diff_method, mocker, tol):
    """Test hessian calculation of a scalar valued QNode"""
    if diff_method not in {"parameter-shift", "backprop"}:
        pytest.skip("Test only supports parameter-shift or backprop")

    dev = qml.device(dev_name, wires=1)

    @qnode(dev, diff_method=diff_method, interface="torch")
    def circuit(x):
        qml.RY(x[0], wires=0)
        qml.RX(x[1], wires=0)
        return qml.expval(qml.PauliZ(0))

    x = torch.tensor([1.0, 2.0], requires_grad=True)
    res = circuit(x)

    res.backward()
    g = x.grad

    spy = mocker.spy(JacobianTape, "hessian")
    hess = hessian(circuit, x)
    spy.assert_called_once()

    a, b = x.detach().numpy()

    expected_res = np.cos(a) * np.cos(b)
    assert np.allclose(res.detach(), expected_res, atol=tol, rtol=0)

    expected_g = [-np.sin(a) * np.cos(b), -np.cos(a) * np.sin(b)]
    assert np.allclose(g.detach(), expected_g, atol=tol, rtol=0)

    expected_hess = [
        [-np.cos(a) * np.cos(b), np.sin(a) * np.sin(b)],
        [np.sin(a) * np.sin(b), -np.cos(a) * np.cos(b)],
    ]
    assert np.allclose(hess.detach(), expected_hess, atol=tol, rtol=0)
def Newton_for_Newton(x, func, epoch=100, h=1):
    f_line = []
    for i in range(epoch):
        # print(i, end='\r')
        f_line.append(func(x))
        # print(f_line[-1])
        jac = jacobian(func, x)
        hes = hessian(func, x).sum()
        # print('jac: {}'.format(jac))
        # print('hes: {}'.format(hes))
        # print('x: {}'.format(x))
        if x - jac / hes < 0:
            # print('neg : {}'.format(x - jac / hes))
            if h < 1e-3:
                break
            h *= 0.1
            continue
        x -= h * jac / hes
    f_line.append(func(x))
    return x, f_line
def hess(self, x):
    x = self.x_encode(x)
    hess_x = hessian(self.model.obj, x).numpy()
    hess_x = hess_x[1:-1, 1:-1, 1:-1, 1:-1]
    length = (self.m - 2) * (self.n - 2)
    hess_x = hess_x.reshape(length, length)
    assert np.linalg.norm(hess_x - hess_x.T) < 1e-5, np.linalg.norm(hess_x)
    return hess_x
def vcov(self):
    from torch.autograd.functional import hessian
    bias, weight = torch.tensor(self.bias), torch.tensor(self.weight)
    h = hessian(self.log_lik, (bias, weight))
    fisher_obs = -torch.cat([
        torch.cat([h[0][0], h[0][1].squeeze(dim=2)], dim=1),
        torch.cat([h[1][0].squeeze(dim=0).squeeze(dim=1), h[1][1].squeeze()], dim=1),
    ], dim=0)
    vcov = torch.inverse(fisher_obs) / self.n_sample
    return vcov
def get_hess(self, input_var):
    assert 'shapes' in dir(self), 'You must first call get input to define the tensor shapes.'
    input_var_ = torch.tensor(input_var, dtype=self.precision, device=self.device)

    def func(inp):
        return self._eval_func(self._unconcat(inp, self.shapes))

    hess = hessian(func, input_var_, vectorize=False)
    return hess.cpu().detach().numpy().astype(np.float64)
def call_oracle(self, x):
    if type(x) != torch.Tensor:
        try:
            x = torch.tensor(x, dtype=torch.double, requires_grad=self.requires_grad)
        except:
            raise Exception('Optimization variable must be a PyTorch tensor '
                            'or something that can be cast into one, such as '
                            'a numpy array, list, etc.')
    assert len(x.shape) == 1
    if (not x.requires_grad) and self.requires_grad:
        raise Exception('Need to enable gradients on optimization variable.')

    # Zero the gradient if x has one
    if x.requires_grad and (x.grad is not None):
        x.grad.zero_()  # fixed: Tensor has zero_(), not zero_grad()

    self.x = x
    self.fx = self.obj_func(self.x)
    if self.fx.dim() != 0:
        raise Exception('Objective function must output a scalar value')

    # Uses auto differentiation to get subgradient
    if self.requires_grad:
        self.fx.backward()

    if self.oracle_output == 'f':
        return self.oracle_f()
    elif self.oracle_output == 'df':
        return self.oracle_df()
    elif self.oracle_output == 'both':
        return {
            'f': self.oracle_f(),
            'df': self.oracle_df(),
        }
    elif self.oracle_output == 'hess+':
        assert type(x) == torch.Tensor
        return {
            'f': self.oracle_f(),
            'df': self.oracle_df(),
            # fixed: pass the fill value as the `nan=` keyword, otherwise
            # it is silently consumed by nan_to_num's `copy` argument
            'd2f': np.nan_to_num(hessian(self.obj_func, self.x).data.numpy(), nan=1e16),
        }
def linearize(self):
    # linearize dynamics
    XU_t = torch.from_numpy(self.XU[:, :, 0]).float()
    # print(XU_t.size())

    # apply numerical method
    # self.F = grad(self.dyn_f, self.XU[:, :, 0]).reshape((self.T, self.n, self.n + self.m))  # T*n*(n+m)
    self.F = compute_jacobian(self.dyn_f, XU_t, (self.T, self.n))
    self.F = np.concatenate([self.F[i, :, i, :] for i in range(self.T)]).reshape(
        (self.T, self.n, self.n + self.m))
    # print(self.F)
    self.f = self.dyn_f(XU_t).detach().numpy()
    self.f = self.f.reshape((self.T, -1, 1))
    # print(self.f)

    # linearize cost
    # TODO: can we speed this up further without losing accuracy?
    self.C = [self.hessian_torch(XU_t[i, :]) for i in range(self.T)]
    # self.C = Parallel(n_jobs=4, backend="threading")(delayed(self.hessian_torch)(XU_t[i, :]) for i in range(self.T))
    # self.C = self.map(self.hessian_torch, [XU_t[i, :] for i in range(self.T)])
    self.C = np.asarray(self.C)
    # self.C = torch_hessian(self.cost_f, XU_t)
    # try numerical
    # self.C = hess(self.cost_f, self.XU[:, :, 0])
    # self.c = grad(self.cost_f, self.XU[:, :, 0])
    self.c = compute_jacobian(self.cost_f, XU_t, (self.T, 1))
    self.c = np.concatenate([self.c[i, :, i, :] for i in range(self.T)])
    # print(self.c.shape)  # debug
    self.c = self.c.reshape((self.T, -1, 1))

    # linearize value function
    X_T = torch.from_numpy(self.X[self.T, :, 0]).float()
    # print(X_T.size())
    self.V_T = af.hessian(self.val_f, X_T).detach().numpy()
    self.v_T = compute_jacobian(self.val_f, X_T.view(1, -1), (1,))
    # self.v_T = grad(self.val_f, self.X[self.T, :, :].reshape((1, -1)))  # debug
    self.v_T = self.v_T.reshape((-1, 1))
def s_test_sample_exact(model, x_test, y_test, train_loader, gpu=-1):
    grads = grad_z(x_test, y_test, model, gpu=gpu)
    flat_grads = parameters_to_vector(grads)

    def make_loss_f(model, params, names, x, y):
        def f(flat_params_):
            split_params = tensor_to_tuple(flat_params_, params)
            load_weights(model, names, split_params)
            out = model(x)
            loss = model.loss(out, y)
            return loss
        return f

    # Make model functional
    params, names = make_functional(model)
    # Make params regular Tensors instead of nn.Parameter
    params = tuple(p.detach().requires_grad_() for p in params)
    flat_params = parameters_to_vector(params)

    h = torch.zeros([flat_params.shape[0], flat_params.shape[0]])
    if gpu >= 0:
        h = h.cuda()

    # Compute real IHVP
    for x_train, y_train in train_loader:
        if gpu >= 0:
            x_train, y_train = x_train.cuda(), y_train.cuda()
        f = make_loss_f(model, params, names, x_train, y_train)
        batch_h = hessian(f, flat_params, strict=True)
        with torch.no_grad():
            h += batch_h / float(len(train_loader))

    h = (h + h.transpose(0, 1)) / 2

    with torch.no_grad():
        load_weights(model, names, params, as_params=True)

    inv_h = torch.inverse(h)
    print("Inverse Hessian")
    print(inv_h)
    real_ihvp = inv_h @ flat_grads
    return tensor_to_tuple(real_ihvp, params)
def grad_xy(self, x, y):
    return torch.trace(hessian(self.kernel, (x, y))[0][1])
def hessian_fwdrev(model, inp, strict=None):
    return functional.hessian(
        model, inp, strict=False, vectorize=True,
        outer_jacobian_strategy="forward-mode")
import numpy as np
import torch
from torch import nn, optim
from torch.autograd import grad
from torch.autograd.functional import hessian, jacobian


### scalar function ###
def scalar_func(x):
    return x ** 2 + x


print(jacobian(scalar_func, torch.ones(1), create_graph=True))  # f'(x) = 2x + 1
print(hessian(scalar_func, torch.ones(1), create_graph=True))   # f''(x) = 2


### vector quadratic function ###
def vector_func(x):
    H = torch.FloatTensor([[1.0, -1.0], [-1.0, 2.0]])
    g = torch.FloatTensor([3.0, 1.0])
    return 0.5 * x.t().matmul(H).matmul(x) + g.t().matmul(x)


### neural network function ###
simple_model = nn.Sequential(*[nn.Linear(4, 2), nn.Softplus(), nn.Linear(2, 1)])
x = torch.ones(4)
x.requires_grad = True
y = simple_model(x)
# print(grad(y, x, retain_graph=True, create_graph=True))
import torch
from torch.autograd.functional import hessian


def pow_reducer(x):
    return x.pow(3).sum() + torch.norm(x)


inputs = torch.rand(2, 3)  # .flatten()
# print(torch.randn(2, 3, dtype=torch.float32))
print(hessian(pow_reducer, inputs))
y = hessian(pow_reducer, inputs)
x = hessian(pow_reducer, inputs).reshape(6, 6)
print(x)
print(torch.norm(x - x.T))
import torch
import time
import numpy as np
import torch.autograd.functional as F

num_params = 40010
k = torch.tensor(np.load('./tests/utils/numpy_params/function_2_param_k.npy'),
                 requires_grad=True, dtype=torch.float)
torch.set_num_threads(1)


def make_func(k):
    return (torch.sin(k) + torch.cos(k) + torch.pow(k, 2)).sum()


start_time_pytorch = time.time()
output = F.hessian(make_func, k.data)
end_time_pytorch = time.time()
runtime = end_time_pytorch - start_time_pytorch
print(str(runtime))

output = output.data.numpy()
for i in range(num_params):
    print(output[i][i])
def setUpClass(cls) -> None:
    pl.seed_everything(0)

    cls.n_features = 10
    cls.n_params = 2 * cls.n_features

    cls.model = LinearRegression(cls.n_features)
    gpus = 1 if torch.cuda.is_available() else 0
    trainer = pl.Trainer(gpus=gpus, max_epochs=10)
    # trainer.fit(cls.model)

    print(tuple(cls.model.parameters()))

    use_sklearn = True
    if use_sklearn:
        train_dataset = DummyDataset(cls.n_features)
        clf = SklearnLR()
        clf.fit(train_dataset.data, train_dataset.targets)

        with torch.no_grad():
            cls.model.linear.weight = torch.nn.Parameter(
                torch.tensor([clf.coef_], dtype=torch.float))
            cls.model.linear.bias = torch.nn.Parameter(
                torch.tensor([clf.intercept_], dtype=torch.float))

    cls.train_loader = cls.model.train_dataloader(batch_size=40000)

    # Set up test point data
    cls.test_idx = 8
    cls.x_test = torch.tensor([cls.model.test_set.data[[cls.test_idx]]], dtype=torch.float)
    cls.y_test = torch.tensor([cls.model.test_set.targets[[cls.test_idx]]], dtype=torch.float)

    # Compute estimated IHVP
    cls.gpu = 1 if torch.cuda.is_available() else -1

    # Compute and flatten grad
    grads = grad_z(cls.x_test, cls.y_test, cls.model, gpu=cls.gpu)
    flat_grads = parameters_to_vector(grads)
    print("Grads:")
    print(flat_grads)

    # Make model functional
    params, names = make_functional(cls.model)
    # Make params regular Tensors instead of nn.Parameter
    params = tuple(p.detach().requires_grad_() for p in params)
    flat_params = parameters_to_vector(params)

    # Initialize Hessian
    h = torch.zeros([flat_params.shape[0], flat_params.shape[0]])

    # Compute real IHVP
    for x_train, y_train in cls.train_loader:
        if cls.gpu >= 0:
            x_train, y_train = x_train.cuda(), y_train.cuda()

        def f(flat_params_):
            split_params = tensor_to_tuple(flat_params_, params)
            load_weights(cls.model, names, split_params)
            out = cls.model(x_train)
            loss = calc_loss(out, y_train)
            return loss

        batch_h = hessian(f, flat_params, strict=True)
        with torch.no_grad():
            h += batch_h / float(len(cls.train_loader))

    print("Hessian:")
    print(h)

    complete_x_train = cls.train_loader.dataset.data
    real_hessian = complete_x_train.T @ complete_x_train / complete_x_train.shape[0] * 2
    print(real_hessian)
    print(np.linalg.norm(real_hessian - h.cpu().numpy()[:10, :10]))
    np.save("hessian_pytorch.npy", h.cpu().numpy())

    # Make the model back into an `nn` module
    with torch.no_grad():
        load_weights(cls.model, names, params, as_params=True)

    inv_h = torch.inverse(h)
    print("Inverse Hessian")
    print(inv_h)
    cls.real_ihvp = inv_h @ flat_grads
    print("Real IHVP")
    print(cls.real_ihvp)
def setUpClass(cls) -> None:
    pl.seed_everything(0)

    cls.n_features = 10
    cls.n_classes = 3
    cls.n_params = cls.n_classes * cls.n_features + cls.n_features
    cls.wd = wd = 1e-2  # weight decay = 1/(nC)

    cls.model = LogisticRegression(cls.n_classes, cls.n_features, wd=cls.wd)
    gpus = 1 if torch.cuda.is_available() else 0
    trainer = pl.Trainer(gpus=gpus, max_epochs=10)
    # trainer.fit(cls.model)

    use_sklearn = True
    if use_sklearn:
        cls.train_dataset = cls.model.training_set  # DummyDataset(cls.n_features, cls.n_classes)
        multi_class = "multinomial" if cls.model.n_classes != 2 else "auto"
        clf = SklearnLogReg(C=1 / len(cls.train_dataset) / wd, tol=1e-8,
                            max_iter=1000, multi_class=multi_class)
        clf.fit(cls.train_dataset.data, cls.train_dataset.targets)

        with torch.no_grad():
            cls.model.linear.weight = torch.nn.Parameter(
                torch.tensor(clf.coef_, dtype=torch.float))
            cls.model.linear.bias = torch.nn.Parameter(
                torch.tensor(clf.intercept_, dtype=torch.float))

    # Set up test point data
    cls.test_idx = 5
    cls.x_test = torch.tensor(cls.model.test_set.data[[cls.test_idx]], dtype=torch.float)
    cls.y_test = torch.tensor(cls.model.test_set.targets[[cls.test_idx]], dtype=torch.long)

    # Compute estimated IHVP
    cls.gpu = 1 if torch.cuda.is_available() else -1
    if cls.gpu >= 0:
        cls.model = cls.model.cuda()
        cls.x_test = cls.x_test.cuda()
        cls.y_test = cls.y_test.cuda()

    cls.train_loader = cls.model.train_dataloader(batch_size=40000)

    # Compute and flatten grad
    grads = grad_z(cls.x_test, cls.y_test, cls.model, gpu=cls.gpu)
    flat_grads = parameters_to_vector(grads)
    print("Grads:")
    print(flat_grads)

    # Make model functional
    params, names = make_functional(cls.model)
    # Make params regular Tensors instead of nn.Parameter
    params = tuple(p.detach().requires_grad_() for p in params)
    flat_params = parameters_to_vector(params)

    # Initialize Hessian
    h = torch.zeros([flat_params.shape[0], flat_params.shape[0]])
    if cls.gpu == 1:
        h = h.cuda()

    # Compute real IHVP
    for x_train, y_train in cls.train_loader:
        if cls.gpu >= 0:
            x_train, y_train = x_train.cuda(), y_train.cuda()
        f = make_loss_f(cls.model, params, names, x_train, y_train, wd=wd)
        batch_h = hessian(f, flat_params, strict=True)
        with torch.no_grad():
            h += batch_h / float(len(cls.train_loader))

    h = (h + h.transpose(0, 1)) / 2
    print("Hessian:")
    print(h)
    np.save("hessian_pytorch.npy", h.cpu().numpy())

    from numpy import linalg as LA
    ei = LA.eig(h.cpu().numpy())[0]
    print('ei=', ei)
    print("max,min eigen value=", ei.max(), ei.min())
    assert ei.min() > 0, "Error: Non-positive Eigenvalues"

    # Make the model back into an `nn` module
    with torch.no_grad():
        load_weights(cls.model, names, params, as_params=True)

    inv_h = torch.inverse(h)
    print("Inverse Hessian")
    print(inv_h)
    cls.real_ihvp = inv_h @ flat_grads
    print("Real IHVP")
    print(cls.real_ihvp)
def H(x, A):
    def f(x):
        return qform(x, A)
    hval = agf.hessian(f, x)
    return hval
import torch
from torch.autograd.functional import hessian


def pow_addr_reducer(x, y):
    return (2 * x.pow(2) + 3 * y.pow(2)).sum()


inputs = (torch.FloatTensor(1), torch.FloatTensor(1))
print(hessian(pow_addr_reducer, inputs))
def hessian_(x, lambda_):
    if le_cons.type() == LINEAR and obj.type() == LINEAR:
        return 0.0
    else:
        return hessian(lambda x: l(x, lambda_), x)
    z.backward()
    print("descent!!!")
    return z

opt.step(closure)

### Computing the Hessian Matrix

`torch.autograd.functional` provides a `hessian` function that computes the Hessian matrix of a scalar-valued function. This means researchers can write their own Newton's method routine (a minimal sketch follows the exercise code below).

from torch.autograd.functional import hessian

x = torch.tensor([1, 2, 3], dtype=torch.float, requires_grad=True)

def g(x):
    z = (x ** 3).sum()
    return z

print(hessian(g, x))

## Empirical Example and Exercises

### Exercise

# set seed
torch.manual_seed(246437)

# write a function to generate data
from torch.distributions import Bernoulli

def generate_data(n_sample, weight, bias = 0,
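As noted above, `hessian` is enough to hand-roll Newton's method. Here is a minimal sketch of a full Newton iteration for minimizing a twice-differentiable scalar objective; the helper name `newton_minimize` and the test function `quad` are illustrative assumptions, not part of the original tutorial.

import torch
from torch.autograd.functional import hessian, jacobian

def newton_minimize(f, x, n_iter=20):
    # Newton update: x <- x - H(x)^{-1} grad f(x)
    for _ in range(n_iter):
        g = jacobian(f, x)                # gradient of scalar f at x
        H = hessian(f, x)                 # Hessian of f at x
        x = x - torch.linalg.solve(H, g)  # solve H p = g, then step by -p
    return x

# Example: a convex quadratic with minimizer [1., -2.];
# Newton's method reaches it in a single step.
def quad(x):
    return ((x - torch.tensor([1.0, -2.0])) ** 2).sum()

print(newton_minimize(quad, torch.zeros(2)))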
def hessian_revrev(model, inp, strict=None):
    return functional.hessian(model, inp, strict=False, vectorize=True)
def hessian_torch(self, XU):
    return af.hessian(self.cost_f, XU).detach().numpy()