def test_gradient_value():
    """Test that we don't mutate the gradients during backward"""
    model = Linear(2, 2, bias=False)
    optim = AdaScale(SGD(model.parameters(), lr=0.1), num_gradients_to_accumulate=2)

    # fwd 1
    out = model(Tensor([0.0, 1.0]))
    out.sum().backward()
    assert np.allclose(model.weight.grad.numpy(), [[0.0, 1.0], [0.0, 1.0]]), model.weight.grad

    # fwd 2, grad is accumulated
    out = model(Tensor([0.0, 1.0]))
    out.sum().backward()
    assert np.allclose(model.weight.grad.numpy(), [[0.0, 2.0], [0.0, 2.0]]), model.weight.grad

    # assert gain and grad value before/after step/zero_grad
    assert np.allclose(optim.gain(), 1.0000002499999376), optim.gain()
    optim.step()
    assert np.allclose(model.weight.grad.numpy(), [[0.0, 2.0], [0.0, 2.0]]), model.weight.grad
    optim.zero_grad()
    assert np.allclose(model.weight.grad.numpy(), [[0.0, 0.0], [0.0, 0.0]]), model.weight.grad
def _test_grad_accum_func(rank, world_size, tempfile_name):
    _dist_init(rank, world_size, tempfile_name, backend="gloo")  # Covers gloo

    model = Linear(4, 2, bias=False)
    model.to("cuda")
    model = DDP(model, device_ids=[rank])
    optim = AdaScale(SGD(model.parameters(), lr=0.1), num_gradients_to_accumulate=2)
    with model.no_sync():
        # iter 1, input vectors are pointing dim0 and dim1
        in_data = Tensor([0.0] * 4)
        in_data[rank] = 1.0
        in_data = in_data.cuda()
        out = model(in_data)
        out.sum().backward()
    # iter 2, input vectors are pointing dim2 and dim3
    in_data = Tensor([0.0] * 4)
    in_data[rank + 2] = 1.0
    in_data = in_data.cuda()
    out = model(in_data)
    out.sum().backward()
    # since all inputs are orthogonal, the gain should be exactly 4.0.
    assert np.allclose(optim.gain(), 4.0), optim.gain()
    optim.step()
    optim.zero_grad()
    dist.destroy_process_group()
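# The `_dist_init` helper called by the distributed worker functions in this section is not
# shown here. Below is a minimal sketch of what it is assumed to do (file-based rendezvous
# keyed on `tempfile_name`, then pinning each worker to its GPU); the real helper may differ.
def _dist_init(rank, world_size, tempfile_name, backend):
    # Join the default process group via a shared temp file rendezvous.
    url = "file://" + tempfile_name
    dist.init_process_group(init_method=url, backend=backend, rank=rank, world_size=world_size)
    # Use one GPU per rank for the CUDA-based workers in this section.
    torch.cuda.set_device(rank)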
def _test_basic_func(rank, world_size, tempfile_name, test_case):
    _dist_init(rank, world_size, tempfile_name, backend="nccl")  # Covers nccl

    model = Linear(2, 2, bias=False)
    model.to("cuda")
    model = DDP(model, device_ids=[rank])
    optim = AdaScale(SGD(model.parameters(), lr=0.1))
    if "input" in test_case:
        # single iter
        in_data = Tensor(test_case["input"][rank])
        in_data = in_data.cuda()
        out = model(in_data)
        out.sum().backward()
        assert np.allclose(optim.gain(), test_case["expected_gain"]), optim.gain()
        optim.step()
        optim.zero_grad()
    else:
        # multiple iters
        for in_data in test_case["inputs"]:
            in_data = Tensor(in_data[rank]).cuda()
            out = model(in_data)
            out.sum().backward()
            optim.step()
            optim.zero_grad()
        assert np.allclose(optim.gain(), test_case["expected_gain"]), optim.gain()
    dist.destroy_process_group()
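# Illustrative driver (an assumption, not part of the original tests) for the worker above:
# the pytest entry point typically spawns one process per rank, handing every rank the same
# temp file for rendezvous plus the parametrized `test_case` dict.
import tempfile

import torch.multiprocessing as mp


def _spawn_basic_func_sketch(world_size=2):
    temp_file_name = tempfile.mkstemp()[1]
    # One-hot inputs on different dims per rank make the gradients orthogonal across the
    # two workers, so the expected AdaScale gain is 2.0 (cf. the single-iteration nccl
    # worker further below).
    test_case = {"input": [[1.0, 0.0], [0.0, 1.0]], "expected_gain": 2.0}
    mp.spawn(_test_basic_func, args=(world_size, temp_file_name, test_case), nprocs=world_size, join=True)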
def test_grad_accum(test_case, cpu):
    """Test the basic functionality on CPU/GPU with gradient accumulation without DDP"""
    model = Linear(2, 2, bias=False)
    if not cpu:
        if torch.cuda.device_count() < 1:
            pytest.skip("1 GPU is required")
        model = model.cuda()
    optim = AdaScale(SGD(model.parameters(), lr=0.1), num_gradients_to_accumulate=2)
    expected_gain = test_case["expected_gain"]
    if "input" in test_case:
        data = [test_case["input"]] * 2
        gains = [expected_gain] * 2
    else:
        data = test_case["inputs"]
        gains = [None, expected_gain]
    # Testing 2 iterations catches more corner cases.
    for in_data, exp_gain in zip(data, gains):
        # grad pass 1
        in_data_0 = Tensor(in_data[0])
        if not cpu:
            in_data_0 = in_data_0.cuda()
        out = model(in_data_0)
        out.sum().backward()
        # grad pass 2
        in_data_1 = Tensor(in_data[1])
        if not cpu:
            in_data_1 = in_data_1.cuda()
        out = model(in_data_1)
        out.sum().backward()
        if exp_gain is not None:
            assert np.allclose(optim.gain(), exp_gain), optim.gain()
        # Stepping it. Note that if we did more than 2 passes as promised by the
        # num_gradients_to_accumulate argument above, AdaScale will not be able to
        # detect that mistake for now. The result will just be wrong in that case.
        optim.step()
        optim.zero_grad()
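# Illustrative (assumed) `test_case` value for `test_grad_accum` above; in the real suite
# these arrive via @pytest.mark.parametrize. Two orthogonal accumulation passes per
# iteration ([0, 1] then [1, 0]) give an expected AdaScale gain of 2.0.
_example_grad_accum_case = {"input": [[0.0, 1.0], [1.0, 0.0]], "expected_gain": 2.0}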
def test_set_num_gradients_to_accumulate(test_case):
    """Test set_num_gradients_to_accumulate experimental feature."""
    new_accum = test_case["new_accum"]
    exp_gain = test_case["exp_gain"]

    model = Linear(2, 2, bias=False)
    optim = AdaScale(SGD(model.parameters(), lr=0.1), num_gradients_to_accumulate=2)

    out = model(Tensor([0.0, 1.0]))
    out.sum().backward()
    out = model(Tensor([1.0, 0.0]))
    out.sum().backward()
    assert np.allclose(optim.gain(), 2.0)
    optim.step()
    optim.zero_grad()

    optim.set_scale(float(new_accum))
    optim.set_num_gradients_to_accumulate(new_accum)
    for _ in range(new_accum):
        out = model(Tensor([0.0, 1.0]))
        out.sum().backward()
    assert np.allclose(optim.gain(), exp_gain), optim.gain()
    optim.step()
    optim.zero_grad()
def test_grad_accum_cpu(cpu=True):
    """Test the basic functionality on CPU with gradient accumulation without DDP"""
    model = Linear(2, 2, bias=False)
    if not cpu:
        model = model.cuda()
    optim = AdaScale(SGD(model.parameters(), lr=0.1), num_gradients_to_accumulate=2)
    # Testing 2 iterations catches more corner cases.
    for expected_gain in [2.0, 2.0]:
        # grad pass 1
        in_data = Tensor([0.0, 1.0])
        if not cpu:
            in_data = in_data.cuda()
        out = model(in_data)
        out.sum().backward()
        # grad pass 2
        in_data = Tensor([1.0, 0.0])
        if not cpu:
            in_data = in_data.cuda()
        out = model(in_data)
        out.sum().backward()
        # Stepping it. Note that if we did more than 2 passes as promised by the
        # num_gradients_to_accumulate argument above, AdaScale will not be able to
        # detect that mistake for now. The result will just be wrong in that case.
        assert np.allclose(optim.gain(), expected_gain), optim.gain()
        optim.step()
        optim.zero_grad()
def _test_basic_func(rank, world_size, tempfile_name, test_case, oss, model=None):
    _dist_init(rank, world_size, tempfile_name, backend="nccl")

    if model is None:
        model = Linear(2, 2)
        model.bias.data.fill_(0.0)

    model.to("cuda")
    model = DDP(model, device_ids=[rank])

    assert oss in ["none", "ada-oss", "wrapper-oss", "oss-wrapper"]
    if oss == "ada-oss":
        optim = AdaScale(OSS(model.parameters(), SGD, lr=0.1))
    elif oss == "wrapper-oss":
        optim = AdaScaleWrapper(model.parameters(), optim_cls=OSS, optim=SGD, lr=0.1)
    elif oss == "oss-wrapper":
        optim = OSS(model.parameters(), AdaScaleWrapper, optim_cls=SGD, lr=0.1)
    else:
        assert oss == "none"
        optim = AdaScale(SGD(model.parameters(), lr=0.1))

    if "input" in test_case:
        inputs = [test_case["input"]]
    else:
        inputs = test_case["inputs"]

    for in_data in inputs:
        in_data = Tensor(in_data[rank]).cuda()
        out = model(in_data)
        out.sum().backward()
        optim.step()
        optim.zero_grad()

    if "expected_gain" in test_case:
        assert np.allclose(optim.gain(), test_case["expected_gain"]), "{} vs {}".format(
            optim.gain(), test_case["expected_gain"]
        )
    if "expected_mean_weight" in test_case:
        mean_weight = mean([model.module[i].weight.data.mean().item() for i in range(4)])
        assert np.allclose(mean_weight, test_case["expected_mean_weight"]), mean_weight

    dist.destroy_process_group()
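# Note on the "expected_mean_weight" branch above (an observation, not original code): it
# indexes model.module[0..3], so that check presumes the caller passes in a 4-layer
# Sequential model, e.g. something along the lines of
#     Sequential(Linear(2, 2), Linear(2, 2), Linear(2, 2), Linear(2, 2))
# together with a test_case that carries "expected_mean_weight".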
def test_debias_ewma():
    """Test debias_ewma experimental feature"""
    model = Linear(2, 2, bias=False)
    optim = AdaScale(SGD(model.parameters(), lr=0.1), num_gradients_to_accumulate=2, debias_ewma=True)
    for _ in range(4):
        out = model(Tensor([0.0, 1.0]))
        out.sum().backward()
        out = model(Tensor([1.0, 0.0]))
        out.sum().backward()
        assert np.allclose(optim.gain(), 2.0), optim.gain()
        optim.step()
        optim.zero_grad()
def _test_basic_func(rank, world_size, tempfile_name):
    _dist_init(rank, world_size, tempfile_name, backend="nccl")  # Covers nccl

    model = Linear(2, 2, bias=False)
    model.to("cuda")
    model = DDP(model, device_ids=[rank])
    optim = AdaScale(SGD(model.parameters(), lr=0.1))
    # iter 1
    in_data = Tensor([0.0, 0.0])
    in_data[rank] = 1.0
    in_data = in_data.cuda()
    out = model(in_data)
    out.sum().backward()
    assert np.allclose(optim.gain(), 2.0), optim.gain()
    optim.step()
    optim.zero_grad()
    dist.destroy_process_group()
def _test_basic_func(rank, ddp_cls, world_size, tempfile_name, test_case):
    _dist_init(rank, world_size, tempfile_name, backend="nccl")  # Covers nccl

    model = Linear(2, 2)
    model.to("cuda")
    if ddp_cls is DDP:
        model = ddp_cls(model, device_ids=[rank])
        optim = AdaScale(SGD(model.parameters(), lr=0.1))
    elif ddp_cls is SDP:
        optim = AdaScale(OSS(model.parameters(), SGD, lr=0.1))
        model = ddp_cls(model, sharded_optimizer=optim)
    else:
        assert ddp_cls is FSDP, ddp_cls
        # Two cases:
        #    flatten=True : AdaScale wrapper must be after FSDP and it receives
        #                   a single grad tensor. It won't receive grad if
        #                   wrapped before.
        #    flatten=False: AdaScale can be wrapped either before or after FSDP.
        # So, it is better to do AdaScale after FSDP.
        model = ddp_cls(model, flatten_parameters=False)
        optim = AdaScale(SGD(model.parameters(), lr=0.1))
    if "input" in test_case:
        # single iter
        in_data = Tensor(test_case["input"][rank])
        in_data = in_data.cuda()
        out = model(in_data)
        out.sum().backward()
        if ddp_cls is DDP:
            assert np.allclose(optim.gain(), test_case["expected_gain"]), optim.gain()
        optim.step()
        optim.zero_grad()
    else:
        # multiple iters
        for in_data in test_case["inputs"]:
            in_data = Tensor(in_data[rank]).cuda()
            out = model(in_data)
            out.sum().backward()
            optim.step()
            optim.zero_grad()
        if ddp_cls is DDP:
            assert np.allclose(optim.gain(), test_case["expected_gain"]), optim.gain()
    dist.destroy_process_group()
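# Illustrative (assumed) parametrization for the worker above: the same body is exercised
# under plain DDP, ShardedDataParallel (SDP), and FullyShardedDataParallel (FSDP), e.g. via
# @pytest.mark.parametrize("ddp_cls", [DDP, SDP, FSDP]) on the pytest entry point that
# spawns this worker. Note that the worker only asserts gain values for DDP, presumably
# because the sharded wrappers expose different per-rank gradient views.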
def test_lr_scheduler():
    """Test AdaScale working with torch.optim.lr_scheduler."""
    model = Linear(2, 2, bias=False)
    optim = AdaScale(SGD(model.parameters(), lr=0.1), num_gradients_to_accumulate=3)
    # We use 1, not 0.1 here since scheduler.step() is called here first.
    scheduler = LambdaLR(optim, lr_lambda=lambda epoch: 1 / 10 ** epoch)
    for epoch in range(3):
        for data_idx in range(10):
            for accumulation in range(3):
                in_data = torch.rand(2)
                loss = model(in_data).sum()
                loss.backward()
            assert optim.gain() <= 3, optim.gain()
            optim.step()
            optim.zero_grad()
            # asserting LR is right
            assert np.allclose(optim.param_groups[0]["lr"], 0.1 / 10 ** epoch), optim.param_groups[0]["lr"]
        scheduler.step()
        # asserting LR is right
        assert np.allclose(optim.param_groups[0]["lr"], 0.1 / 10 ** (epoch + 1)), optim.param_groups[0]["lr"]
def _test_basic_func(rank, world_size, tempfile_name, test_case, oss, model=None):
    _dist_init(rank, world_size, tempfile_name, backend="nccl")

    if model is None:
        model = Linear(2, 2, bias=False)

    model.to("cuda")
    model = DDP(model, device_ids=[rank])

    if oss:
        # For now, we can only wrap AdaScale over OSS. If we do it the other way around,
        # AdaScale needs to take different parameter types, i.e. the parameter list, etc.
        optim = AdaScale(OSS(model.parameters(), SGD, lr=0.1))
    else:
        optim = AdaScale(SGD(model.parameters(), lr=0.1))

    if "input" in test_case:
        inputs = [test_case["input"]]
    else:
        inputs = test_case["inputs"]

    for in_data in inputs:
        in_data = Tensor(in_data[rank]).cuda()
        out = model(in_data)
        out.sum().backward()
        optim.step()
        optim.zero_grad()

    assert np.allclose(optim.gain(), test_case["expected_gain"]), optim.gain()

    if "expected_mean_weight" in test_case:
        mean_weight = mean([model.module[i].weight.data.mean().item() for i in range(4)])
        assert np.allclose(mean_weight, test_case["expected_mean_weight"]), mean_weight

    dist.destroy_process_group()
def test_add_param_group(debias_ewma):
    """Test AdaScale supports add_param_group() API."""
    model1 = Linear(2, 2, bias=True)
    with torch.no_grad():
        # make weights and bias deterministic, which is needed for
        # multi-layer models. For them, adascale gain is affected by
        # parameters from other layers.
        model1.weight.copy_(Tensor([1.0, 2.0, 3.0, 4.0]).reshape(2, 2))
        model1.bias.fill_(0.1)
    optim = AdaScale(SGD(model1.parameters(), lr=0.1), num_gradients_to_accumulate=2, debias_ewma=debias_ewma)
    assert len(optim._hook_handles) == 2

    model2 = Linear(2, 3, bias=True)
    with torch.no_grad():
        # make weights and bias deterministic
        model2.weight.copy_(Tensor([1.0, 2.0, 3.0, 4.0, 5.0, 6.0]).reshape(3, 2))
        model2.bias.fill_(0.2)
    optim.add_param_group({"params": model2.parameters()})
    assert len(optim._hook_handles) == 4

    # make sure we can run the model.
    model = Sequential(model1, model2).cuda()
    in_data_0 = Tensor([1.0, 2.0]).cuda()
    out = model(in_data_0)
    out.sum().backward()
    in_data_1 = Tensor([3.0, 4.0]).cuda()
    out = model(in_data_1)
    out.sum().backward()

    # make sure the gains are right and we can step.
    # since this is the first step, debias_ewma doesn't affect the value.
    assert np.allclose(optim.gain(), 1.1440223454935758), optim.gain()
    assert np.allclose(optim.gain(0), 1.1428571428571428), optim.gain(0)
    assert np.allclose(optim.gain(1), 1.1471258476157762), optim.gain(1)
    optim.step()
    optim.zero_grad()

    # make sure we can add a PG again after stepping.
    model3 = Linear(3, 4, bias=True)
    with torch.no_grad():
        # make weights and bias deterministic
        model3.weight.copy_(Tensor([1.0, 2.0, 3.0, 4.0, 5.0, 6.0] * 2).reshape(4, 3))
        model3.bias.fill_(0.2)
    optim.add_param_group({"params": model3.parameters()})
    assert len(optim._hook_handles) == 6

    # make sure we can run the model.
    model = Sequential(model1, model2, model3).cuda()
    in_data_0 = Tensor([1.0, 2.0]).cuda()
    out = model(in_data_0)
    out.sum().backward()
    in_data_1 = Tensor([3.0, 4.0]).cuda()
    out = model(in_data_1)
    out.sum().backward()

    # make sure gains are right and we can step.
    # the last PG's gain is not affected by debias_ewma since it is the first step for that PG.
    assert np.allclose(optim.gain(), 1.1191193589460822 if debias_ewma else 1.1192783954732368), optim.gain()
    assert np.allclose(optim.gain(0), 1.1428571880897151 if debias_ewma else 1.142857188085096), optim.gain(0)
    assert np.allclose(optim.gain(1), 1.1167103578364508 if debias_ewma else 1.1167104954034948), optim.gain(1)
    assert np.allclose(optim.gain(2), 1.117381091722702), optim.gain(2)
    optim.step()
    optim.zero_grad()