Example 1
def test_unhook():
    """Test unhook that frees the tensor from CUDA memory."""
    model = Linear(123, 456,
                   bias=False).cuda()  # unique shape so that it can be found
    optim = AdaScale(SGD(model.parameters(), lr=0.1),
                     num_gradients_to_accumulate=2)

    def find_tensor():
        """ Find the weight tensor from the heap

            Return True if found.
        """
        for obj in gc.get_objects():
            try:
                # Only need to check parameter type objects
                if "torch.nn.parameter.Parameter" not in str(type(obj)):
                    continue
                if torch.is_tensor(obj) or (hasattr(obj, "data")
                                            and torch.is_tensor(obj.data)):
                    if obj.shape == (456, 123):
                        return True
            except Exception:
                pass
        return False

    torch.cuda.empty_cache()
    assert find_tensor(
    ), "something wrong with gc-based method to find the tensor"

    optim.unhook()
    del model
    del optim
    torch.cuda.empty_cache()
    assert not find_tensor(), "tensor should have been released"
Example 2
def test_gradient_value():
    """Test that we don't mutate the gradients during backward"""
    model = Linear(2, 2, bias=False)
    optim = AdaScale(SGD(model.parameters(), lr=0.1),
                     num_gradients_to_accumulate=2)

    # fwd 1
    out = model(Tensor([0.0, 1.0]))
    out.sum().backward()
    assert np.allclose(model.weight.grad.numpy(),
                       [[0.0, 1.0], [0.0, 1.0]]), model.weight.grad

    # fwd 2, grad is accumulated
    out = model(Tensor([0.0, 1.0]))
    out.sum().backward()
    assert np.allclose(model.weight.grad.numpy(),
                       [[0.0, 2.0], [0.0, 2.0]]), model.weight.grad

    # assert gain and grad value before/after step/zero_grad
    assert np.allclose(optim.gain(), 1.0000002499999376), optim.gain()
    optim.step()
    assert np.allclose(model.weight.grad.numpy(),
                       [[0.0, 2.0], [0.0, 2.0]]), model.weight.grad
    optim.zero_grad()
    assert np.allclose(model.weight.grad.numpy(),
                       [[0.0, 0.0], [0.0, 0.0]]), model.weight.grad
Example 3
def test_grad_accum(test_case, cpu):
    """Test the basic functionality on CPU/GPU with gradient accumulation without DDP"""
    model = Linear(2, 2, bias=False)
    if not cpu:
        if torch.cuda.device_count() < 1:
            pytest.skip("1 GPU is required")
        model = model.cuda()
    optim = AdaScale(SGD(model.parameters(), lr=0.1), num_gradients_to_accumulate=2)
    expected_gain = test_case["expected_gain"]
    if "input" in test_case:
        data = [test_case["input"]] * 2
        gains = [expected_gain] * 2
    else:
        data = test_case["inputs"]
        gains = [None, expected_gain]
    for in_data, exp_gain in zip(data, gains):  # test 2 iterations to catch more corner cases.
        # grad pass 1
        in_data_0 = Tensor(in_data[0])
        if not cpu:
            in_data_0 = in_data_0.cuda()
        out = model(in_data_0)
        out.sum().backward()
        # grad pass 2
        in_data_1 = Tensor(in_data[1])
        if not cpu:
            in_data_1 = in_data_1.cuda()
        out = model(in_data_1)
        out.sum().backward()
        if exp_gain is not None:
            assert np.allclose(optim.gain(), exp_gain), optim.gain()
        # Step the optimizer. Note that if we did more than the 2 passes promised
        # by the num_gradients_to_accumulate argument above, AdaScale is not able
        # to detect that mistake for now; the result would simply be wrong in that case.
        optim.step()
        optim.zero_grad()
Example 4
def test_grad_accum_cpu(cpu=True):
    """Test the basic functionality on CPU with gradient accumulation without DDP"""
    model = Linear(2, 2, bias=False)
    if not cpu:
        model = model.cuda()
    optim = AdaScale(SGD(model.parameters(), lr=0.1), num_gradients_to_accumulate=2)
    for expected_gain in [2.0, 2.0]:  # test 2 iterations to catch more corner cases.
        # grad pass 1
        in_data = Tensor([0.0, 1.0])
        if not cpu:
            in_data = in_data.cuda()
        out = model(in_data)
        out.sum().backward()
        # grad pass 2
        in_data = Tensor([1.0, 0.0])
        if not cpu:
            in_data = in_data.cuda()
        out = model(in_data)
        out.sum().backward()
        # Step the optimizer. Note that if we did more than the 2 passes promised
        # by the num_gradients_to_accumulate argument above, AdaScale is not able
        # to detect that mistake for now; the result would simply be wrong in that case.
        assert np.allclose(optim.gain(), expected_gain), optim.gain()
        optim.step()
        optim.zero_grad()
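Why Example 4 expects a gain of 2.0: the two accumulated gradients (for inputs [0.0, 1.0] and [1.0, 0.0]) are orthogonal, so the variance estimate dominates. Below is a back-of-the-envelope sketch using the (assumed) unbiased estimators in the spirit of the AdaScale paper; fairscale's implementation additionally applies smoothing and a small epsilon, so treat this as illustrative rather than as the library's exact code.

# Illustrative only: recompute Example 4's expected gain by hand, using
# assumed unbiased estimators for the gradient variance and squared norm.
import numpy as np

S = 2  # scale = world_size * num_gradients_to_accumulate = 1 * 2

# Gradients of out.sum() w.r.t. the 2x2 weight for inputs [0, 1] and [1, 0].
g1 = np.array([[0.0, 1.0], [0.0, 1.0]])
g2 = np.array([[1.0, 0.0], [1.0, 0.0]])
grads = [g1, g2]

g_avg = sum(grads) / S
mean_sq_norm = np.mean([np.sum(g * g) for g in grads])  # E[|g_i|^2]    = 2.0
avg_sq_norm = np.sum(g_avg * g_avg)                     # |g_bar|^2     = 1.0

var_est = S / (S - 1) * (mean_sq_norm - avg_sq_norm)    # sigma^2 est.  = 2.0
sqr_est = avg_sq_norm - var_est / S                     # |grad|^2 est. = 0.0

gain = (var_est + sqr_est) / (var_est / S + sqr_est)    # = 2.0
print(gain)

The same arithmetic with two identical gradients (as in Example 2, where both inputs are [0.0, 1.0]) gives var_est = 0 and hence a gain of 1; the 1.0000002499999376 asserted there presumably comes from the implementation's epsilon and smoothing. With four mutually orthogonal gradients (Example 6), the same formula yields exactly 4.0.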
Example 5
def test_unhook():
    """Test unhook that frees the tensor from CUDA memory."""
    model = Linear(123, 456, bias=False).cuda()  # unique shape so that it can be found
    optim = AdaScale(SGD(model.parameters(), lr=0.1), num_gradients_to_accumulate=2)

    torch.cuda.empty_cache()
    target_shape = (456, 123)
    assert find_tensor_by_shape(target_shape), "something wrong with gc-based method to find the tensor"

    optim.unhook()
    del model
    del optim
    torch.cuda.empty_cache()
    assert not find_tensor_by_shape(target_shape), "tensor should have been released"
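Example 5 assumes a module-level helper, find_tensor_by_shape, that is not shown in this listing. A minimal sketch, adapted from the inline find_tensor in Example 1, might look like this:

# Hypothetical sketch of the find_tensor_by_shape helper assumed by Example 5,
# adapted from the inline find_tensor in Example 1.
import gc

import torch


def find_tensor_by_shape(target_shape):
    """Return True if a parameter tensor with target_shape is still reachable on the heap."""
    for obj in gc.get_objects():
        try:
            # Only parameter-type objects need to be checked.
            if "torch.nn.parameter.Parameter" not in str(type(obj)):
                continue
            if torch.is_tensor(obj) or (hasattr(obj, "data") and torch.is_tensor(obj.data)):
                if obj.shape == target_shape:
                    return True
        except Exception:
            pass
    return False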
Example 6
def _test_grad_accum_func(rank, world_size, tempfile_name):
    _dist_init(rank, world_size, tempfile_name, backend="gloo")  # Covers gloo

    model = Linear(4, 2, bias=False)
    model.to("cuda")
    model = DDP(model, device_ids=[rank])
    optim = AdaScale(SGD(model.parameters(), lr=0.1), num_gradients_to_accumulate=2)
    with model.no_sync():
        # iter 1: input vectors point along dim0 and dim1
        in_data = Tensor([0.0] * 4)
        in_data[rank] = 1.0
        in_data = in_data.cuda()
        out = model(in_data)
        out.sum().backward()
    # iter 2: input vectors point along dim2 and dim3
    in_data = Tensor([0.0] * 4)
    in_data[rank + 2] = 1.0
    in_data = in_data.cuda()
    out = model(in_data)
    out.sum().backward()
    # since all inputs are orthogonal, the gain should be exactly 4.0.
    assert np.allclose(optim.gain(), 4.0), optim.gain()
    optim.step()
    optim.zero_grad()

    dist.destroy_process_group()
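The distributed examples (6, 11, 12, 14, 15, 17) call a _dist_init helper that is not shown here. A minimal sketch, assuming a file-based rendezvous through tempfile_name, could look like the following; the real helper may set additional options.

# Hypothetical sketch of the _dist_init helper used by the distributed examples,
# assuming a file:// rendezvous through tempfile_name.
import torch
import torch.distributed as dist


def _dist_init(rank, world_size, tempfile_name, backend):
    url = "file://" + tempfile_name
    dist.init_process_group(init_method=url, backend=backend, rank=rank, world_size=world_size)
    if backend == "nccl":
        # NCCL requires each process to own one GPU.
        torch.cuda.set_device(rank)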
Example 7
def test_basic_cpu():
    """Test single batch behavior on CPU"""
    model = Linear(2, 2, bias=False)
    try:
        optim = AdaScale(SGD(model.parameters(), lr=0.1))
    except RuntimeError:
        return
    assert False, "Single batch AdaScale should not be supported"
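An equivalent way to write this negative test with pytest's context manager, shown purely as an illustrative alternative to the try/except pattern above (it reuses the Linear, SGD, and AdaScale names imported elsewhere in the test module):

# Illustrative alternative to Example 7 using pytest.raises; behavior is the same.
import pytest


def test_basic_cpu_raises():
    model = Linear(2, 2, bias=False)
    with pytest.raises(RuntimeError):
        # Single-batch AdaScale (no DDP, no gradient accumulation) is rejected.
        AdaScale(SGD(model.parameters(), lr=0.1))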
Example 8
def test_scale_not_equal_default(test_case):
    """Test gain value when scale doesn't equal world size * grad_accum"""
    scale = test_case["scale"]
    exp_gain = test_case["exp_gain"]
    model = Linear(4, 2, bias=False)
    optim = AdaScale(SGD(model.parameters(), lr=0.1), num_gradients_to_accumulate=4, scale=scale)

    data = [
        [1.0, 0.0, 0.0, 0.0],
        [0.0, 1.0, 0.0, 0.0],
        [0.0, 0.0, 1.0, 0.0],
        [0.0, 0.0, 0.0, 1.0],
    ]
    for i in range(4):
        out = model(Tensor(data[i]))
        out.sum().backward()
    # Since the inputs are perfectly orthogonal, the gain should equal the scale.
    assert np.allclose(optim.gain(), exp_gain), optim.gain()
Example 9
def test_debias_ewma():
    """Test debias_ewma experimental feature"""
    model = Linear(2, 2, bias=False)
    optim = AdaScale(SGD(model.parameters(), lr=0.1), num_gradients_to_accumulate=2, debias_ewma=True)
    for _ in range(4):
        out = model(Tensor([0.0, 1.0]))
        out.sum().backward()
        out = model(Tensor([1.0, 0.0]))
        out.sum().backward()
        assert np.allclose(optim.gain(), 2.0), optim.gain()
        optim.step()
        optim.zero_grad()
Example 10
def test_loss_accum_cpu():
    """Test the loss accumulation behavior on CPU

    Loss accumulation is NOT SUPPORTED. This test shows that it does not work.
    """
    model = Linear(2, 2, bias=False)
    # num_gradients_to_accumulate value doesn't matter in this negative test.
    optim = AdaScale(SGD(model.parameters(), lr=0.1), num_gradients_to_accumulate=3)
    # data 1
    in_data = Tensor([0.0, 1.0])
    loss = model(in_data).sum()
    # data 2
    in_data = Tensor([1.0, 0.0])
    loss += model(in_data).sum()
    # data 3
    in_data = Tensor([1.0, 2.0])
    loss += model(in_data).sum()
    # backward, but gradient is only produced once by the autograd engine.
    loss.backward()
    # The gain will always be 1, which renders AdaScale a no-op.
    assert np.allclose(optim.gain(), 1.0), optim.gain()
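For contrast, the supported pattern calls backward() once per micro-batch (as in Examples 2-4) so that AdaScale's backward hooks observe every accumulated gradient. A minimal sketch using the same three inputs, reusing the Linear, SGD, AdaScale, and Tensor names from the listing:

# Supported gradient-accumulation pattern for the same three micro-batches;
# each backward() lets AdaScale's hooks see an individual gradient.
model = Linear(2, 2, bias=False)
optim = AdaScale(SGD(model.parameters(), lr=0.1), num_gradients_to_accumulate=3)
for in_data in ([0.0, 1.0], [1.0, 0.0], [1.0, 2.0]):
    model(Tensor(in_data)).sum().backward()
optim.gain()  # now reflects the variance across the three gradients
optim.step()
optim.zero_grad()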
Example 11
def _test_basic_func(rank,
                     world_size,
                     tempfile_name,
                     test_case,
                     oss,
                     model=None):
    _dist_init(rank, world_size, tempfile_name, backend="nccl")

    if model is None:
        model = Linear(2, 2, bias=False)
    model.to("cuda")
    model = DDP(model, device_ids=[rank])
    if oss:
        # For now, we can only wrap AdaScale over OSS. If we did it the other way around,
        # AdaScale would need to take different parameter types, i.e. the parameter list, etc.
        optim = AdaScale(OSS(model.parameters(), SGD, lr=0.1))
    else:
        optim = AdaScale(SGD(model.parameters(), lr=0.1))

    if "input" in test_case:
        inputs = [test_case["input"]]
    else:
        inputs = test_case["inputs"]

    for in_data in inputs:
        in_data = Tensor(in_data[rank]).cuda()
        out = model(in_data)
        out.sum().backward()
        optim.step()
        optim.zero_grad()

    assert np.allclose(optim.gain(), test_case["expected_gain"]), optim.gain()

    if "expected_mean_weight" in test_case:
        mean_weight = mean(
            [model.module[i].weight.data.mean().item() for i in range(4)])
        assert np.allclose(mean_weight,
                           test_case["expected_mean_weight"]), mean_weight

    dist.destroy_process_group()
Example 12
def _test_basic_func(rank, world_size, tempfile_name):
    _dist_init(rank, world_size, tempfile_name, backend="nccl")  # Covers nccl

    model = Linear(2, 2, bias=False)
    model.to("cuda")
    model = DDP(model, device_ids=[rank])
    optim = AdaScale(SGD(model.parameters(), lr=0.1))
    # iter 1
    in_data = Tensor([0.0, 0.0])
    in_data[rank] = 1.0
    in_data = in_data.cuda()
    out = model(in_data)
    out.sum().backward()
    assert np.allclose(optim.gain(), 2.0), optim.gain()
    optim.step()
    optim.zero_grad()

    dist.destroy_process_group()
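The _test_*_func workers above are meant to be launched once per rank. A hypothetical launcher using torch.multiprocessing.spawn is sketched below; the real tests may use a different spawning utility, and the temp file is just a stand-in for the rendezvous file.

# Hypothetical launcher for a worker such as _test_basic_func (Example 12).
import tempfile

import torch.multiprocessing as mp

world_size = 2
rendezvous_file = tempfile.mkstemp()[1]  # consumed by _dist_init's file:// init_method
# spawn passes the rank as the first argument, followed by args.
mp.spawn(_test_basic_func, args=(world_size, rendezvous_file), nprocs=world_size, join=True)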
Example 13
def test_lr_scheduler():
    """Test AdaScale working with torch.optim.lr_scheduler."""
    model = Linear(2, 2, bias=False)
    optim = AdaScale(SGD(model.parameters(), lr=0.1), num_gradients_to_accumulate=3)
    # We use 1, not 0.1 here since scheduler.step() is called here first.
    scheduler = LambdaLR(optim, lr_lambda=lambda epoch: 1 / 10 ** epoch)
    for epoch in range(3):
        for data_idx in range(10):
            for accumulation in range(3):
                in_data = torch.rand(2)
                loss = model(in_data).sum()
                loss.backward()
            assert optim.gain() <= 3, optim.gain()
            optim.step()
            optim.zero_grad()
            # asserting LR is right
            assert np.allclose(optim.param_groups[0]["lr"], 0.1 / 10 ** epoch), optim.param_groups[0]["lr"]
        scheduler.step()
        # asserting LR is right
        assert np.allclose(optim.param_groups[0]["lr"], 0.1 / 10 ** (epoch + 1)), optim.param_groups[0]["lr"]
Example 14
def _test_basic_func(rank, world_size, tempfile_name, test_case, oss, model=None):
    _dist_init(rank, world_size, tempfile_name, backend="nccl")

    if model is None:
        model = Linear(2, 2, bias=False)
    model.to("cuda")
    model = DDP(model, device_ids=[rank])

    assert oss in ["none", "ada-oss", "wrapper-oss", "oss-wrapper"]
    if oss == "ada-oss":
        optim = AdaScale(OSS(model.parameters(), SGD, lr=0.1))
    elif oss == "wrapper-oss":
        optim = AdaScaleWrapper(model.parameters(), optim_cls=OSS, optim=SGD, lr=0.1)
    elif oss == "oss-wrapper":
        optim = OSS(model.parameters(), AdaScaleWrapper, optim_cls=SGD, lr=0.1)
    else:
        assert oss == "none"
        optim = AdaScale(SGD(model.parameters(), lr=0.1))

    if "input" in test_case:
        inputs = [test_case["input"]]
    else:
        inputs = test_case["inputs"]

    for in_data in inputs:
        in_data = Tensor(in_data[rank]).cuda()
        out = model(in_data)
        out.sum().backward()
        optim.step()
        optim.zero_grad()

    if "expected_gain" in test_case:
        assert np.allclose(optim.gain(), test_case["expected_gain"]), optim.gain()

    if "expected_mean_weight" in test_case:
        mean_weight = mean([model.module[i].weight.data.mean().item() for i in range(4)])
        assert np.allclose(mean_weight, test_case["expected_mean_weight"]), mean_weight

    dist.destroy_process_group()
Example 15
def _test_basic_func(rank, world_size, tempfile_name, test_case):
    _dist_init(rank, world_size, tempfile_name, backend="nccl")  # Covers nccl

    model = Linear(2, 2, bias=False)
    model.to("cuda")
    model = DDP(model, device_ids=[rank])
    optim = AdaScale(SGD(model.parameters(), lr=0.1))
    if "input" in test_case:
        # single iter
        in_data = Tensor(test_case["input"][rank])
        in_data = in_data.cuda()
        out = model(in_data)
        out.sum().backward()
        assert np.allclose(optim.gain(),
                           test_case["expected_gain"]), optim.gain()
        optim.step()
        optim.zero_grad()
    else:
        # multiple iters
        for in_data in test_case["inputs"]:
            in_data = Tensor(in_data[rank]).cuda()
            out = model(in_data)
            out.sum().backward()
            optim.step()
            optim.zero_grad()
        assert np.allclose(optim.gain(),
                           test_case["expected_gain"]), optim.gain()

    dist.destroy_process_group()
Example 16
def test_custom_smoothing_factor():
    """Test custom smoothing since we had a bug around it."""
    model = Linear(1, 1)
    optim = AdaScale(SGD(model.parameters(), lr=0.1), smoothing=0.12345, num_gradients_to_accumulate=3)
    assert optim._smoothing == 0.12345
Example 17
def _test_basic_func(rank, ddp_cls, world_size, tempfile_name, test_case):
    _dist_init(rank, world_size, tempfile_name, backend="nccl")  # Covers nccl

    model = Linear(2, 2)
    model.to("cuda")
    if ddp_cls is DDP:
        model = ddp_cls(model, device_ids=[rank])
        optim = AdaScale(SGD(model.parameters(), lr=0.1))
    elif ddp_cls is SDP:
        optim = AdaScale(OSS(model.parameters(), SGD, lr=0.1))
        model = ddp_cls(model, sharded_optimizer=optim)
    else:
        assert ddp_cls is FSDP, ddp_cls
        # Two cases:
        #    flatten=True : AdaScale wrapper must be after FSDP and it receives
        #                   a single grad tensor. It won't receive grad if
        #                   wrapped before.
        #    flatten=False: AdaScale can be both before or after FSDP.
        # So, it is better to do AdaScale after FSDP.
        model = ddp_cls(model, flatten_parameters=False)
        optim = AdaScale(SGD(model.parameters(), lr=0.1))
    if "input" in test_case:
        # single iter
        in_data = Tensor(test_case["input"][rank])
        in_data = in_data.cuda()
        out = model(in_data)
        out.sum().backward()
        if ddp_cls is DDP:
            assert np.allclose(optim.gain(), test_case["expected_gain"]), optim.gain()
        optim.step()
        optim.zero_grad()
    else:
        # multiple iters
        for in_data in test_case["inputs"]:
            in_data = Tensor(in_data[rank]).cuda()
            out = model(in_data)
            out.sum().backward()
            optim.step()
            optim.zero_grad()
        if ddp_cls is DDP:
            assert np.allclose(optim.gain(), test_case["expected_gain"]), optim.gain()

    dist.destroy_process_group()
Example 18
def test_set_num_gradients_to_accumulate(test_case):
    """Test set_num_gradients_to_accumulate experimental feature."""
    new_accum = test_case["new_accum"]
    exp_gain = test_case["exp_gain"]

    model = Linear(2, 2, bias=False)
    optim = AdaScale(SGD(model.parameters(), lr=0.1),
                     num_gradients_to_accumulate=2)
    out = model(Tensor([0.0, 1.0]))
    out.sum().backward()
    out = model(Tensor([1.0, 0.0]))
    out.sum().backward()
    assert np.allclose(optim.gain(), 2.0)
    optim.step()
    optim.zero_grad()

    optim.set_scale(float(new_accum))
    optim.set_num_gradients_to_accumulate(new_accum)
    for _ in range(new_accum):
        out = model(Tensor([0.0, 1.0]))
        out.sum().backward()

    assert np.allclose(optim.gain(), exp_gain), optim.gain()
    optim.step()
    optim.zero_grad()
Example 19
def test_add_param_group(debias_ewma):
    """Test AdaScale supports add_param_group() API."""
    model1 = Linear(2, 2, bias=True)
    with torch.no_grad():
        # make weights and bias deterministic, which is needed for
        # multi-layer models. For them, AdaScale gain is affected by
        # parameters from other layers.
        model1.weight.copy_(Tensor([1.0, 2.0, 3.0, 4.0]).reshape(2, 2))
        model1.bias.fill_(0.1)
    optim = AdaScale(SGD(model1.parameters(), lr=0.1),
                     num_gradients_to_accumulate=2,
                     debias_ewma=debias_ewma)
    assert len(optim._hook_handles) == 2

    model2 = Linear(2, 3, bias=True)
    with torch.no_grad():
        # make weights and bias deterministic
        model2.weight.copy_(
            Tensor([1.0, 2.0, 3.0, 4.0, 5.0, 6.0]).reshape(3, 2))
        model2.bias.fill_(0.2)
    optim.add_param_group({"params": model2.parameters()})
    assert len(optim._hook_handles) == 4

    # make sure we can run the model.
    model = Sequential(model1, model2).cuda()
    in_data_0 = Tensor([1.0, 2.0]).cuda()
    out = model(in_data_0)
    out.sum().backward()

    in_data_1 = Tensor([3.0, 4.0]).cuda()
    out = model(in_data_1)
    out.sum().backward()

    # make sure the gains are right and we can step.
    # since this is the first step, debias_ewma doesn't affect the value.
    assert np.allclose(optim.gain(), 1.1440223454935758), optim.gain()
    assert np.allclose(optim.gain(0), 1.1428571428571428), optim.gain(0)
    assert np.allclose(optim.gain(1), 1.1471258476157762), optim.gain(1)
    optim.step()
    optim.zero_grad()

    # make sure we can add a PG again after stepping.
    model3 = Linear(3, 4, bias=True)
    with torch.no_grad():
        # make weights and bias deterministic
        model3.weight.copy_(
            Tensor([1.0, 2.0, 3.0, 4.0, 5.0, 6.0] * 2).reshape(4, 3))
        model3.bias.fill_(0.2)
    optim.add_param_group({"params": model3.parameters()})
    assert len(optim._hook_handles) == 6

    # make sure we can run the model.
    model = Sequential(model1, model2, model3).cuda()
    in_data_0 = Tensor([1.0, 2.0]).cuda()
    out = model(in_data_0)
    out.sum().backward()

    in_data_1 = Tensor([3.0, 4.0]).cuda()
    out = model(in_data_1)
    out.sum().backward()

    # make sure gains are right and we can step.
    # the last PG's gain is not affected by debias_ewma since it is the first step for that PG.
    assert np.allclose(
        optim.gain(), 1.1191193589460822
        if debias_ewma else 1.1192783954732368), optim.gain()
    assert np.allclose(
        optim.gain(0), 1.1428571880897151
        if debias_ewma else 1.142857188085096), optim.gain(0)
    assert np.allclose(
        optim.gain(1), 1.1167103578364508
        if debias_ewma else 1.1167104954034948), optim.gain(1)
    assert np.allclose(optim.gain(2), 1.117381091722702), optim.gain(2)
    optim.step()
    optim.zero_grad()
Example 20
def make_model_and_optim():
    model = Linear(in_dim, 2, bias=False)
    model = model.cuda()
    optim = AdaScale(SGD(model.parameters(), lr=0.1, momentum=0.9),
                     num_gradients_to_accumulate=accum_steps)
    return model, optim
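Example 20 is only a nested factory; in_dim and accum_steps are free variables supplied by the enclosing test, which is not included in this listing. A hypothetical caller, following the accumulation pattern used throughout these examples:

# Hypothetical usage of the factory from Example 20; in_dim and accum_steps
# stand in for the enclosing test's values.
import torch

in_dim, accum_steps = 2, 3

model, optim = make_model_and_optim()
for _ in range(accum_steps):
    out = model(torch.rand(in_dim).cuda())
    out.sum().backward()
optim.step()
optim.zero_grad()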