def test_minimize_methods(dtype, device, method):
    torch.manual_seed(400)
    random.seed(100)
    nr = 3
    nbatch = 2

    default_fwd_options = {
        "max_niter": 50,
        "f_tol": 1e-9,
        "alpha": -1.0,
    }
    linearmixing_fwd_options = {
        "max_niter": 50,
        "f_tol": 3e-6,
        "alpha": -0.3,
    }
    gd_fwd_options = {
        "maxiter": 5000,
        "f_rtol": 1e-10,
        "x_rtol": 1e-10,
        "step": 1e-2,
    }
    # list the methods and their options here
    options = {
        "broyden1": default_fwd_options,
        "broyden2": default_fwd_options,
        "linearmixing": linearmixing_fwd_options,
        "gd": gd_fwd_options,
        "adam": gd_fwd_options,
    }[method]
    # use a higher atol for the less accurate methods
    atol = defaultdict(lambda: 1e-8)
    atol["linearmixing"] = 3e-6

    A = torch.nn.Parameter((torch.randn((nr, nr)) * 0.5).to(dtype).requires_grad_())
    diag = torch.nn.Parameter(torch.randn((nbatch, nr)).to(dtype).requires_grad_())
    # bias will be detached from the optimization, so make it non-differentiable
    bias = torch.zeros((nbatch, nr)).to(dtype)
    y0 = torch.randn((nbatch, nr)).to(dtype)
    activation = "square"  # square activation makes it easy to optimize

    fwd_options = {**options, "method": method}
    model = DummyModule(A, addx=False, activation=activation, sumoutput=True)
    model.set_diag_bias(diag, bias)
    y = minimize(model.forward, y0, **fwd_options)

    # check the grad (must be close to 0 at the minimum)
    with torch.enable_grad():
        y1 = y.clone().requires_grad_()
        f = model.forward(y1)
        grady, = torch.autograd.grad(f, (y1,))
    assert torch.allclose(grady, grady * 0, atol=atol[method])

    # check the hessian (must be posdef)
    h = hess(model.forward, (y1,), idxs=0).fullmatrix()
    eigval, _ = torch.symeig(h)
    assert torch.all(eigval >= 0)
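# A minimal, self-contained sketch (plain torch, no DummyModule fixture) of the two
# optimality checks asserted above: at a minimizer the gradient should vanish and the
# Hessian should be positive semi-definite. The quadratic below and its analytic argmin
# are illustrative assumptions, not part of the test suite.
def _optimality_check_sketch():
    def f(y):
        c = torch.arange(1., 4., dtype=y.dtype)
        return (y * y).sum() + (c * y).sum()

    ymin = -0.5 * torch.arange(1., 4., dtype=torch.float64)  # analytic argmin of f
    y1 = ymin.clone().requires_grad_()
    grady, = torch.autograd.grad(f(y1), (y1,))
    assert torch.allclose(grady, torch.zeros_like(grady))  # gradient vanishes at the minimum

    h = torch.autograd.functional.hessian(f, ymin)  # (3, 3) Hessian at the minimum
    eigval = torch.linalg.eigvalsh(h)               # eigenvalues of the symmetric Hessian
    assert torch.all(eigval >= 0)                   # positive semi-definite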
def getloss(A, y0, diag, bias):
    model = clss(A, addx=False, activation=activation, sumoutput=True)
    model.set_diag_bias(diag, bias)
    y = minimize(model.forward, y0, method=method,
                 bck_options=bck_options, **fwd_options)
    return y
def test_minimizer_warnings():
    # test to see if it produces warnings
    def fcn(a):
        return (a * a).sum()

    with pytest.warns(UserWarning, match="converge"):
        # set the options so that it will never converge
        a = torch.tensor(1.0, dtype=torch.float64)
        amin = minimize(fcn, a, method="gd", step=0.1,
                        f_rtol=0, x_rtol=0, maxiter=10, verbose=True)
def test_minimize(dtype, device, clss):
    torch.manual_seed(400)
    random.seed(100)
    nr = 3
    nbatch = 2

    A = torch.nn.Parameter((torch.randn((nr, nr)) * 0.5).to(dtype).requires_grad_())
    diag = torch.nn.Parameter(torch.randn((nbatch, nr)).to(dtype).requires_grad_())
    # bias will be detached from the optimization, so make it non-differentiable
    bias = torch.zeros((nbatch, nr)).to(dtype)
    y0 = torch.randn((nbatch, nr)).to(dtype)
    fwd_options = {
        "method": "broyden1",
        "max_niter": 50,
        "f_tol": 1e-9,
        "alpha": -0.5,
    }
    activation = "square"  # square activation makes it easy to optimize

    model = clss(A, addx=False, activation=activation, sumoutput=True)
    model.set_diag_bias(diag, bias)
    y = minimize(model.forward, y0, **fwd_options)

    # check the grad (must be close to 0 at the minimum)
    with torch.enable_grad():
        y1 = y.clone().requires_grad_()
        f = model.forward(y1)
        grady, = torch.autograd.grad(f, (y1,))
    assert torch.allclose(grady, grady * 0)

    # check the hessian (must be posdef)
    h = hess(model.forward, (y1,), idxs=0).fullmatrix()
    eigval, _ = torch.symeig(h)
    assert torch.all(eigval >= 0)

    def getloss(A, y0, diag, bias):
        model = clss(A, addx=False, activation=activation, sumoutput=True)
        model.set_diag_bias(diag, bias)
        y = minimize(model.forward, y0, **fwd_options)
        return y

    gradcheck(getloss, (A, y0, diag, bias))
    gradgradcheck(getloss, (A, y0, diag, bias))
def test_minimize_methods(dtype, device):
    torch.manual_seed(400)
    random.seed(100)
    dtype = torch.float64
    nr = 3
    nbatch = 2

    default_fwd_options = {
        "max_niter": 50,
        "f_tol": 1e-9,
        "alpha": -0.5,
    }
    # list the methods and their options here
    methods_and_options = {
        "broyden1": default_fwd_options,
    }

    A = torch.nn.Parameter((torch.randn((nr, nr)) * 0.5).to(dtype).requires_grad_())
    diag = torch.nn.Parameter(torch.randn((nbatch, nr)).to(dtype).requires_grad_())
    # bias will be detached from the optimization, so make it non-differentiable
    bias = torch.zeros((nbatch, nr)).to(dtype)
    y0 = torch.randn((nbatch, nr)).to(dtype)
    activation = "square"  # square activation makes it easy to optimize

    for method in methods_and_options:
        fwd_options = {**methods_and_options[method], "method": method}
        model = DummyModule(A, addx=False, activation=activation, sumoutput=True)
        model.set_diag_bias(diag, bias)
        y = minimize(model.forward, y0, **fwd_options)

        # check the grad (must be close to 0 at the minimum)
        with torch.enable_grad():
            y1 = y.clone().requires_grad_()
            f = model.forward(y1)
            grady, = torch.autograd.grad(f, (y1,))
        assert torch.allclose(grady, grady * 0)

        # check the hessian (must be posdef)
        h = hess(model.forward, (y1,), idxs=0).fullmatrix()
        eigval, _ = torch.symeig(h)
        assert torch.all(eigval >= 0)
def test_min_not_stop_for_negative_value():
    # there was a bug where the minimizer stops right away if the value is negative;
    # this test is written to make sure it does not happen again
    def fcn(a):
        return (a * a).sum() - 100.

    # the method must be a non-rootfinder method
    with pytest.warns(UserWarning, match="converge"):
        method = "gd"
        a = torch.tensor(1.0, dtype=torch.float64)
        amin = minimize(fcn, a, method=method, step=0.2,
                        f_rtol=0, x_rtol=0, verbose=True)

    amin_true = torch.zeros_like(amin)
    assert torch.allclose(amin, amin_true)
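# A sketch of the stopping rule this test guards, assuming (not asserting) that the
# original bug came from looking at the raw objective value: convergence should be
# decided from the *change* of f between iterations (and/or of x), never from f itself,
# so a minimum at a negative value is still reached. `gd_sketch` is illustrative only
# and is not xitorch's gradient-descent implementation.
def gd_sketch(fcn, x0, step=0.2, f_rtol=1e-12, maxiter=1000):
    x = x0.clone().requires_grad_()
    f_old = None
    for _ in range(maxiter):
        f = fcn(x)
        g, = torch.autograd.grad(f, (x,))
        # stop on the relative change of f, which works even when f is negative
        if f_old is not None and abs(f.item() - f_old) <= f_rtol * max(abs(f_old), 1e-30):
            break
        f_old = f.item()
        with torch.no_grad():
            x -= step * g
    return x.detach()
# e.g. gd_sketch(lambda a: (a * a).sum() - 100., torch.tensor(1.0, dtype=torch.float64))
# still converges to 0 even though the objective is negative throughout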
def test_minimize(dtype, device, clss, method):
    torch.manual_seed(400)
    random.seed(100)
    method_fwd_options = {
        "broyden1": {
            "max_niter": 50,
            "f_tol": 1e-9,
            "alpha": -0.5,
        },
        "gd": {
            "maxiter": 10000,
            "f_rtol": 1e-14,
            "x_rtol": 1e-14,
            "step": 2e-2,
        },
    }

    nr = 2
    nbatch = 2

    A = torch.nn.Parameter((torch.randn((nr, nr)) * 0.5).to(dtype).requires_grad_())
    diag = torch.nn.Parameter(torch.randn((nbatch, nr)).to(dtype).requires_grad_())
    bias = torch.nn.Parameter(torch.zeros((nbatch, nr)).to(dtype).requires_grad_())
    y0 = torch.randn((nbatch, nr)).to(dtype)

    fwd_options = method_fwd_options[method]
    bck_options = {
        "rtol": 1e-9,
        "atol": 1e-9,
    }
    activation = "square"  # square activation makes it easy to optimize

    model = clss(A, addx=False, activation=activation, sumoutput=True)
    model.set_diag_bias(diag, bias)
    y = minimize(model.forward, y0, method=method, **fwd_options)

    # check the grad (must be close to 0)
    with torch.enable_grad():
        y1 = y.clone().requires_grad_()
        f = model.forward(y1)
        grady, = torch.autograd.grad(f, (y1,))
    assert torch.allclose(grady, grady * 0)

    # check the hessian (must be posdef)
    h = hess(model.forward, (y1,), idxs=0).fullmatrix()
    eigval, _ = torch.symeig(h)
    assert torch.all(eigval >= 0)

    def getloss(A, y0, diag, bias):
        model = clss(A, addx=False, activation=activation, sumoutput=True)
        model.set_diag_bias(diag, bias)
        y = minimize(model.forward, y0, method=method,
                     bck_options=bck_options, **fwd_options)
        return y

    gradcheck(getloss, (A, y0, diag, bias))
    # pytorch 1.8's gradgradcheck fails if there are unrelated variables;
    # I have made a PR to solve this and it will be in 1.9
    gradgradcheck(getloss, (A, y0, diag, bias.detach()))
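# A minimal, self-contained sketch of the gradcheck pattern used above, without the
# DummyModule/clss fixture. It assumes xitorch is installed and that `minimize` here is
# xitorch.optimize.minimize (the function these tests exercise); the quadratic objective,
# the name `argmin_quadratic`, and the chosen options are illustrative assumptions.
def _gradcheck_through_minimize_sketch():
    def argmin_quadratic(b):
        # minimize |y|^2 - b.y, whose analytic argmin is y* = b / 2
        def fcn(y, b):
            return (y * y).sum() - torch.dot(b, y)
        y0 = torch.zeros_like(b)
        return minimize(fcn, y0, params=(b,), method="broyden1",
                        max_niter=100, f_tol=1e-12, alpha=-0.5)

    b = torch.randn(3, dtype=torch.float64).requires_grad_()
    assert torch.allclose(argmin_quadratic(b), b / 2)  # forward solution matches analytic argmin
    gradcheck(argmin_quadratic, (b,))                  # first derivatives via implicit differentiation
    gradgradcheck(argmin_quadratic, (b,))              # second derivatives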
def getloss(a):
    model = clss(a, sumoutput=True)
    y = minimize(model.forward, y0, **fwd_options)
    return y