def test_optimization_1():
    # See torch.test.test_optim
    # Also see Rosenbrock/banana function
    def rosenbrock(tensor):
        x, y = tensor
        return (1 - x)**2 + 100 * (y - x**2)**2

    params_t = torch.Tensor([1.0, 1.5])
    params = torch.autograd.Variable(params_t, requires_grad=True)
    sgd = torch.optim.SGD([params], lr=LR)
    schedule = torch.optim.lr_scheduler.MultiStepLR(sgd, STEP_SCHEDULE)
    adp = Mock(require_backward_grad_sync=True)
    adascale.AdaScale(adp, sgd, accum_scale=1.0, num_replicas=1, patch_optimizer=True)
    for i in range(100000):
        sgd.zero_grad()
        loss = rosenbrock(params)
        loss.backward()
        sgd.step()
        schedule.step()
        if params.allclose(torch.tensor([1.0, 1.0]), atol=ATOL):
            break
    else:
        # for-else: only reached if the loop never breaks, i.e. no convergence.
        pytest.fail(f"Did not converge: {params}")

def test_optimization_2():
    def rosenbrock_noisy(tensor):
        x, y = tensor
        return (np.random.normal(1.0, 0.2) * (1 - x)**2 +
                np.random.normal(1.0, 0.2) * 100 * (y - x**2)**2)

    params_t = torch.Tensor([1.0, 1.5])
    params = torch.autograd.Variable(params_t, requires_grad=True)
    sgd = torch.optim.SGD([params], lr=LR)
    schedule = torch.optim.lr_scheduler.MultiStepLR(sgd, STEP_SCHEDULE)
    adp = Mock(require_backward_grad_sync=True)
    adascale.AdaScale(adp, sgd, accum_scale=1.0, num_replicas=1, patch_optimizer=True)
    for i in range(100000):
        sgd.zero_grad()
        # Average two independent noisy evaluations of the objective.
        loss = sum(rosenbrock_noisy(params) for _ in range(2)) / 2.0
        loss.backward()
        sgd.step()
        schedule.step()
        if params.allclose(torch.tensor([1.0, 1.0]), atol=ATOL):
            break
    else:
        pytest.fail(f"Did not converge: {params}")

def test_optimization_3():
    def rosenbrock(x, y):
        return (1 - x)**2 + 100 * (y - x**2)**2

    # x and y live in two separate optimizer parameter groups.
    params_t = [{
        "params": [torch.autograd.Variable(torch.Tensor([1.0]), requires_grad=True)]
    }, {
        "params": [torch.autograd.Variable(torch.Tensor([1.5]), requires_grad=True)]
    }]
    sgd = torch.optim.SGD(params_t, lr=LR)
    schedule = torch.optim.lr_scheduler.MultiStepLR(sgd, STEP_SCHEDULE)
    adp = Mock(require_backward_grad_sync=True)
    adascale.AdaScale(adp, sgd, accum_scale=1.0, num_replicas=1, patch_optimizer=True)
    for i in range(100000):
        sgd.zero_grad()
        loss = rosenbrock(params_t[0]['params'][0], params_t[1]['params'][0])
        loss.backward()
        sgd.step()
        schedule.step()
        if params_t[0]['params'][0].allclose(torch.tensor([1.0]), atol=ATOL) \
                and params_t[1]['params'][0].allclose(torch.tensor([1.0]), atol=ATOL):
            break
    else:
        pytest.fail(f"Did not converge: {params_t}")

def test_optimization_2():
    def rosenbrock_noisy(tensor):
        x, y = tensor
        return (np.random.normal(1.0, 0.2) * (1 - x)**2 +
                np.random.normal(1.0, 0.2) * 100 * (y - x**2)**2)

    params_t = torch.Tensor([1.0, 1.5])
    params = torch.autograd.Variable(params_t, requires_grad=True)
    sgd = torch.optim.SGD([params], lr=0.001)
    schedule = torch.optim.lr_scheduler.MultiStepLR(sgd, [1000])
    obj = adascale.AdaScale(sgd, scale=2.0, num_replicas=1, patch_optimizer=True)
    i = 0.0
    while i < 100000 and not params.allclose(torch.tensor([1.0, 1.0]), atol=0.01):
        sgd.zero_grad()
        loss = rosenbrock_noisy(params)
        loss.backward()
        sgd.step()
        # Advance by the progress reported by AdaScale instead of counting raw steps.
        i += obj.get_progress()
        schedule.step()
    assert params.allclose(torch.tensor([1.0, 1.0]), atol=0.01)

def test_optimization_1():
    # See torch.test.test_optim
    # Also see Rosenbrock/banana function
    def rosenbrock(tensor):
        x, y = tensor
        return (1 - x)**2 + 100 * (y - x**2)**2

    params_t = torch.Tensor([1.0, 1.5])
    params = torch.autograd.Variable(params_t, requires_grad=True)
    sgd = torch.optim.SGD([params], lr=0.001)
    schedule = torch.optim.lr_scheduler.MultiStepLR(sgd, [1000])
    obj = adascale.AdaScale(sgd, scale=1.0, num_replicas=1, patch_optimizer=True)
    i = 0.0
    while i < 100000 and not params.allclose(torch.tensor([1.0, 1.0]), atol=0.01):
        sgd.zero_grad()
        loss = rosenbrock(params)
        loss.backward()
        sgd.step()
        i += obj.get_progress()
        schedule.step()
    assert params.allclose(torch.tensor([1.0, 1.0]), atol=0.01)

def test_nan():
    def nan_objective(tensor):
        # Roughly half of the time the target is NaN, producing a non-finite loss.
        if random.random() > 0.5:
            target = float("nan")
        else:
            target = 4.0
        return (tensor - target)**2

    params_t = torch.Tensor([1.0])
    params = torch.autograd.Variable(params_t, requires_grad=True)
    sgd = torch.optim.SGD([params], lr=0.1)
    adp = Mock(require_backward_grad_sync=True)
    ada = adascale.AdaScale(adp, sgd, accum_scale=1.0, num_replicas=1, patch_optimizer=True)
    for i in range(100):
        sgd.zero_grad()
        loss = nan_objective(params)
        loss.backward()
        # Only step when the loss is finite; NaN losses are skipped.
        if np.all(np.isfinite(loss.detach().numpy())):
            sgd.step()
        if params.allclose(torch.tensor([4.0]), atol=ATOL):
            break
    else:
        pytest.fail(f"Did not converge: {params}")
    if not (np.all(np.isfinite(ada.sqr_avg())) and np.all(np.isfinite(ada.var_avg()))):
        pytest.fail(f"non-finite adascale parameters: "
                    f"{ada.sqr_avg()}, {ada.var_avg()}")

def test_object():
    params = [torch.tensor([[1., -1.], [2., 3.]], requires_grad=True),
              torch.tensor([[2., 3.]], requires_grad=True)]
    sgd = torch.optim.SGD(params, lr=0.1)
    obj = adascale.AdaScale(sgd, scale=1.0, num_replicas=1)
    assert obj._scale == 1.0
    obj._num_replicas = 8
    obj.set_scale(3.0)
    assert obj.scale == 3.0
    obj._num_replicas = 4
    obj.set_scale(3.0)
    assert obj.scale == 3.0
    assert obj.gain(2.0) == 1.0
    obj._state['var_avg'] = 3.0
    obj._state['norm_avg'] = 1.0
    # With var_avg = 3 and norm_avg = 1, the gain at scale 3 is
    # (3 + 1) / (3 / 3 + 1) == 2.
    assert obj.gain(3.0) == 2.0

def test_object():
    params = [
        torch.tensor([[1., -1.], [2., 3.]], requires_grad=True),
        torch.tensor([[2., 3.]], requires_grad=True)
    ]
    sgd = torch.optim.SGD(params, lr=0.1)
    adp = Mock(require_backward_grad_sync=True)
    obj = adascale.AdaScale(adp, sgd, accum_scale=1.0, num_replicas=1)
    assert obj._accum_scale == 1.0
    obj._num_replicas = 8
    obj.set_accum_scale(3.0)
    assert obj.accum_scale == 3.0
    obj._num_replicas = 4
    obj.set_accum_scale(3.0)
    assert obj.accum_scale == 3.0
    assert np.isclose(obj.gain(2.0), 1.0)
    obj._state['var_avg'] = 3.0
    obj._state['norm_avg'] = 1.0
    assert np.isclose(obj.gain(3.0), 2.0)

def test_optimization_3():
    # See torch.test.test_optim
    # Also see Rosenbrock/banana function
    def rosenbrock(x, y):
        return (1 - x)**2 + 100 * (y - x**2)**2

    params_t = [{
        "params": [torch.autograd.Variable(torch.Tensor([1.0]), requires_grad=True)]
    }, {
        "params": [torch.autograd.Variable(torch.Tensor([1.5]), requires_grad=True)]
    }]
    sgd = torch.optim.SGD(params_t, lr=0.001)
    schedule = torch.optim.lr_scheduler.MultiStepLR(sgd, [1000])
    obj = adascale.AdaScale(sgd, scale=1.0, num_replicas=1, patch_optimizer=True)
    i = 0.0
    while (i < 100000
           and not (params_t[0]['params'][0].allclose(torch.tensor([1.0]), atol=0.01)
                    and params_t[1]['params'][0].allclose(torch.tensor([1.0]), atol=0.01))):
        sgd.zero_grad()
        loss = rosenbrock(params_t[0]['params'][0], params_t[1]['params'][0])
        loss.backward()
        sgd.step()
        i += obj.get_progress()
        schedule.step()
    print(params_t)
    assert (params_t[0]['params'][0].allclose(torch.tensor([1.0]), atol=0.01)
            and params_t[1]['params'][0].allclose(torch.tensor([1.0]), atol=0.01))