def test_gradient_value():
    """Test that we don't mutate the gradients during backward"""
    model = Linear(2, 2, bias=False)
    optim = AdaScale(SGD(model.parameters(), lr=0.1), num_gradients_to_accumulate=2)

    # fwd 1
    out = model(Tensor([0.0, 1.0]))
    out.sum().backward()
    assert np.allclose(model.weight.grad.numpy(), [[0.0, 1.0], [0.0, 1.0]]), model.weight.grad

    # fwd 2, grad is accumulated
    out = model(Tensor([0.0, 1.0]))
    out.sum().backward()
    assert np.allclose(model.weight.grad.numpy(), [[0.0, 2.0], [0.0, 2.0]]), model.weight.grad

    # assert gain and grad value before/after step/zero_grad
    assert np.allclose(optim.gain(), 1.0000002499999376), optim.gain()
    optim.step()
    assert np.allclose(model.weight.grad.numpy(), [[0.0, 2.0], [0.0, 2.0]]), model.weight.grad
    optim.zero_grad()
    assert np.allclose(model.weight.grad.numpy(), [[0.0, 0.0], [0.0, 0.0]]), model.weight.grad
def _test_grad_accum_func(rank, world_size, tempfile_name):
    _dist_init(rank, world_size, tempfile_name, backend="gloo")  # Covers gloo

    model = Linear(4, 2, bias=False)
    model.to("cuda")
    model = DDP(model, device_ids=[rank])
    optim = AdaScale(SGD(model.parameters(), lr=0.1), num_gradients_to_accumulate=2)
    with model.no_sync():
        # iter 1, input vectors are pointing dim0 and dim1
        in_data = Tensor([0.0] * 4)
        in_data[rank] = 1.0
        in_data = in_data.cuda()
        out = model(in_data)
        out.sum().backward()
    # iter 2, input vectors are pointing dim2 and dim3
    in_data = Tensor([0.0] * 4)
    in_data[rank + 2] = 1.0
    in_data = in_data.cuda()
    out = model(in_data)
    out.sum().backward()
    # since all inputs are orthogonal, the gain should be exactly 4.0.
    assert np.allclose(optim.gain(), 4.0), optim.gain()
    optim.step()
    optim.zero_grad()
    dist.destroy_process_group()
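# The `_dist_init` helper called by the distributed worker functions in this section is not
# shown here. Below is a minimal sketch of what it is assumed to do (file-based rendezvous
# keyed on `tempfile_name`, then pinning each worker to its GPU); the real helper may differ.
def _dist_init(rank, world_size, tempfile_name, backend):
    # Join the default process group via a shared temp file rendezvous.
    url = "file://" + tempfile_name
    dist.init_process_group(init_method=url, backend=backend, rank=rank, world_size=world_size)
    # Use one GPU per rank for the CUDA-based workers in this section.
    torch.cuda.set_device(rank)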
def _test_basic_func(rank, world_size, tempfile_name, test_case):
    _dist_init(rank, world_size, tempfile_name, backend="nccl")  # Covers nccl

    model = Linear(2, 2, bias=False)
    model.to("cuda")
    model = DDP(model, device_ids=[rank])
    optim = AdaScale(SGD(model.parameters(), lr=0.1))
    if "input" in test_case:
        # single iter
        in_data = Tensor(test_case["input"][rank])
        in_data = in_data.cuda()
        out = model(in_data)
        out.sum().backward()
        assert np.allclose(optim.gain(), test_case["expected_gain"]), optim.gain()
        optim.step()
        optim.zero_grad()
    else:
        # multiple iters
        for in_data in test_case["inputs"]:
            in_data = Tensor(in_data[rank]).cuda()
            out = model(in_data)
            out.sum().backward()
            optim.step()
            optim.zero_grad()
        assert np.allclose(optim.gain(), test_case["expected_gain"]), optim.gain()
    dist.destroy_process_group()
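# Illustrative driver (an assumption, not part of the original tests) for the worker above:
# the pytest entry point typically spawns one process per rank, handing every rank the same
# temp file for rendezvous plus the parametrized `test_case` dict.
import tempfile

import torch.multiprocessing as mp


def _spawn_basic_func_sketch(world_size=2):
    temp_file_name = tempfile.mkstemp()[1]
    # One-hot inputs on different dims per rank make the gradients orthogonal across the
    # two workers, so the expected AdaScale gain is 2.0 (cf. the single-iteration nccl
    # worker further below).
    test_case = {"input": [[1.0, 0.0], [0.0, 1.0]], "expected_gain": 2.0}
    mp.spawn(_test_basic_func, args=(world_size, temp_file_name, test_case), nprocs=world_size, join=True)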
def test_grad_accum(test_case, cpu):
    """Test the basic functionality on CPU/GPU with gradient accumulation without DDP"""
    model = Linear(2, 2, bias=False)
    if not cpu:
        if torch.cuda.device_count() < 1:
            pytest.skip("1 GPU is required")
        model = model.cuda()
    optim = AdaScale(SGD(model.parameters(), lr=0.1), num_gradients_to_accumulate=2)
    expected_gain = test_case["expected_gain"]
    if "input" in test_case:
        data = [test_case["input"]] * 2
        gains = [expected_gain] * 2
    else:
        data = test_case["inputs"]
        gains = [None, expected_gain]
    # Testing 2 iterations catches more corner cases.
    for in_data, exp_gain in zip(data, gains):
        # grad pass 1
        in_data_0 = Tensor(in_data[0])
        if not cpu:
            in_data_0 = in_data_0.cuda()
        out = model(in_data_0)
        out.sum().backward()
        # grad pass 2
        in_data_1 = Tensor(in_data[1])
        if not cpu:
            in_data_1 = in_data_1.cuda()
        out = model(in_data_1)
        out.sum().backward()
        if exp_gain is not None:
            assert np.allclose(optim.gain(), exp_gain), optim.gain()
        # Stepping it. Note that if we did more than 2 passes as promised by the
        # num_gradients_to_accumulate argument above, AdaScale will not be able to
        # detect that mistake for now. The result will just be wrong in that case.
        optim.step()
        optim.zero_grad()
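# Illustrative (assumed) `test_case` value for `test_grad_accum` above; in the real suite
# these arrive via @pytest.mark.parametrize. Two orthogonal accumulation passes per
# iteration ([0, 1] then [1, 0]) give an expected AdaScale gain of 2.0.
_example_grad_accum_case = {"input": [[0.0, 1.0], [1.0, 0.0]], "expected_gain": 2.0}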
def test_set_num_gradients_to_accumulate(test_case):
    """Test set_num_gradients_to_accumulate experimental feature."""
    new_accum = test_case["new_accum"]
    exp_gain = test_case["exp_gain"]

    model = Linear(2, 2, bias=False)
    optim = AdaScale(SGD(model.parameters(), lr=0.1), num_gradients_to_accumulate=2)

    out = model(Tensor([0.0, 1.0]))
    out.sum().backward()
    out = model(Tensor([1.0, 0.0]))
    out.sum().backward()
    assert np.allclose(optim.gain(), 2.0)
    optim.step()
    optim.zero_grad()

    optim.set_scale(float(new_accum))
    optim.set_num_gradients_to_accumulate(new_accum)
    for _ in range(new_accum):
        out = model(Tensor([0.0, 1.0]))
        out.sum().backward()
    assert np.allclose(optim.gain(), exp_gain), optim.gain()
    optim.step()
    optim.zero_grad()
def test_grad_accum_cpu(cpu=True):
    """Test the basic functionality on CPU with gradient accumulation without DDP"""
    model = Linear(2, 2, bias=False)
    if not cpu:
        model = model.cuda()
    optim = AdaScale(SGD(model.parameters(), lr=0.1), num_gradients_to_accumulate=2)
    # Testing 2 iterations catches more corner cases.
    for expected_gain in [2.0, 2.0]:
        # grad pass 1
        in_data = Tensor([0.0, 1.0])
        if not cpu:
            in_data = in_data.cuda()
        out = model(in_data)
        out.sum().backward()
        # grad pass 2
        in_data = Tensor([1.0, 0.0])
        if not cpu:
            in_data = in_data.cuda()
        out = model(in_data)
        out.sum().backward()
        # Stepping it. Note that if we did more than 2 passes as promised by the
        # num_gradients_to_accumulate argument above, AdaScale will not be able to
        # detect that mistake for now. The result will just be wrong in that case.
        assert np.allclose(optim.gain(), expected_gain), optim.gain()
        optim.step()
        optim.zero_grad()
def _test_basic_func(rank, world_size, tempfile_name, test_case, oss, model=None):
    _dist_init(rank, world_size, tempfile_name, backend="nccl")

    if model is None:
        model = Linear(2, 2)
        model.bias.data.fill_(0.0)

    model.to("cuda")
    model = DDP(model, device_ids=[rank])

    assert oss in ["none", "ada-oss", "wrapper-oss", "oss-wrapper"]
    if oss == "ada-oss":
        optim = AdaScale(OSS(model.parameters(), SGD, lr=0.1))
    elif oss == "wrapper-oss":
        optim = AdaScaleWrapper(model.parameters(), optim_cls=OSS, optim=SGD, lr=0.1)
    elif oss == "oss-wrapper":
        optim = OSS(model.parameters(), AdaScaleWrapper, optim_cls=SGD, lr=0.1)
    else:
        assert oss == "none"
        optim = AdaScale(SGD(model.parameters(), lr=0.1))

    if "input" in test_case:
        inputs = [test_case["input"]]
    else:
        inputs = test_case["inputs"]

    for in_data in inputs:
        in_data = Tensor(in_data[rank]).cuda()
        out = model(in_data)
        out.sum().backward()
        optim.step()
        optim.zero_grad()

    if "expected_gain" in test_case:
        assert np.allclose(optim.gain(), test_case["expected_gain"]), "{} vs {}".format(
            optim.gain(), test_case["expected_gain"]
        )
    if "expected_mean_weight" in test_case:
        mean_weight = mean([model.module[i].weight.data.mean().item() for i in range(4)])
        assert np.allclose(mean_weight, test_case["expected_mean_weight"]), mean_weight

    dist.destroy_process_group()
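# Note on the "expected_mean_weight" branch above (an observation, not original code): it
# indexes model.module[0..3], so that check presumes the caller passes in a 4-layer
# Sequential model, e.g. something along the lines of
#     Sequential(Linear(2, 2), Linear(2, 2), Linear(2, 2), Linear(2, 2))
# together with a test_case that carries "expected_mean_weight".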
def test_debias_ewma():
    """Test debias_ewma experimental feature"""
    model = Linear(2, 2, bias=False)
    optim = AdaScale(SGD(model.parameters(), lr=0.1), num_gradients_to_accumulate=2, debias_ewma=True)
    for _ in range(4):
        out = model(Tensor([0.0, 1.0]))
        out.sum().backward()
        out = model(Tensor([1.0, 0.0]))
        out.sum().backward()
        assert np.allclose(optim.gain(), 2.0), optim.gain()
        optim.step()
        optim.zero_grad()
def _test_basic_func(rank, world_size, tempfile_name):
    _dist_init(rank, world_size, tempfile_name, backend="nccl")  # Covers nccl

    model = Linear(2, 2, bias=False)
    model.to("cuda")
    model = DDP(model, device_ids=[rank])
    optim = AdaScale(SGD(model.parameters(), lr=0.1))
    # iter 1
    in_data = Tensor([0.0, 0.0])
    in_data[rank] = 1.0
    in_data = in_data.cuda()
    out = model(in_data)
    out.sum().backward()
    assert np.allclose(optim.gain(), 2.0), optim.gain()
    optim.step()
    optim.zero_grad()
    dist.destroy_process_group()
def _test_basic_func(rank, ddp_cls, world_size, tempfile_name, test_case):
    _dist_init(rank, world_size, tempfile_name, backend="nccl")  # Covers nccl

    model = Linear(2, 2)
    model.to("cuda")
    if ddp_cls is DDP:
        model = ddp_cls(model, device_ids=[rank])
        optim = AdaScale(SGD(model.parameters(), lr=0.1))
    elif ddp_cls is SDP:
        optim = AdaScale(OSS(model.parameters(), SGD, lr=0.1))
        model = ddp_cls(model, sharded_optimizer=optim)
    else:
        assert ddp_cls is FSDP, ddp_cls
        # Two cases:
        #    flatten=True : AdaScale wrapper must be after FSDP and it receives
        #                   a single grad tensor. It won't receive grad if
        #                   wrapped before.
        #    flatten=False: AdaScale can be wrapped either before or after FSDP.
        # So, it is better to do AdaScale after FSDP.
        model = ddp_cls(model, flatten_parameters=False)
        optim = AdaScale(SGD(model.parameters(), lr=0.1))
    if "input" in test_case:
        # single iter
        in_data = Tensor(test_case["input"][rank])
        in_data = in_data.cuda()
        out = model(in_data)
        out.sum().backward()
        if ddp_cls is DDP:
            assert np.allclose(optim.gain(), test_case["expected_gain"]), optim.gain()
        optim.step()
        optim.zero_grad()
    else:
        # multiple iters
        for in_data in test_case["inputs"]:
            in_data = Tensor(in_data[rank]).cuda()
            out = model(in_data)
            out.sum().backward()
            optim.step()
            optim.zero_grad()
        if ddp_cls is DDP:
            assert np.allclose(optim.gain(), test_case["expected_gain"]), optim.gain()
    dist.destroy_process_group()
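# Illustrative (assumed) parametrization for the worker above: the same body is exercised
# under plain DDP, ShardedDataParallel (SDP), and FullyShardedDataParallel (FSDP), e.g. via
# @pytest.mark.parametrize("ddp_cls", [DDP, SDP, FSDP]) on the pytest entry point that
# spawns this worker. Note that the worker only asserts gain values for DDP, presumably
# because the sharded wrappers expose different per-rank gradient views.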
def test_lr_scheduler():
    """Test AdaScale working with torch.optim.lr_scheduler."""
    model = Linear(2, 2, bias=False)
    optim = AdaScale(SGD(model.parameters(), lr=0.1), num_gradients_to_accumulate=3)
    # We use 1, not 0.1 here since scheduler.step() is called here first.
    scheduler = LambdaLR(optim, lr_lambda=lambda epoch: 1 / 10 ** epoch)
    for epoch in range(3):
        for data_idx in range(10):
            for accumulation in range(3):
                in_data = torch.rand(2)
                loss = model(in_data).sum()
                loss.backward()
            assert optim.gain() <= 3, optim.gain()
            optim.step()
            optim.zero_grad()
            # asserting LR is right
            assert np.allclose(optim.param_groups[0]["lr"], 0.1 / 10 ** epoch), optim.param_groups[0]["lr"]
        scheduler.step()
        # asserting LR is right
        assert np.allclose(optim.param_groups[0]["lr"], 0.1 / 10 ** (epoch + 1)), optim.param_groups[0]["lr"]
def _test_basic_func(rank, world_size, tempfile_name, test_case, oss, model=None):
    _dist_init(rank, world_size, tempfile_name, backend="nccl")

    if model is None:
        model = Linear(2, 2, bias=False)

    model.to("cuda")
    model = DDP(model, device_ids=[rank])

    if oss:
        # For now, we can only wrap AdaScale over OSS. If we do it the other way around,
        # AdaScale needs to take different parameter types, i.e. the parameter list, etc.
        optim = AdaScale(OSS(model.parameters(), SGD, lr=0.1))
    else:
        optim = AdaScale(SGD(model.parameters(), lr=0.1))

    if "input" in test_case:
        inputs = [test_case["input"]]
    else:
        inputs = test_case["inputs"]

    for in_data in inputs:
        in_data = Tensor(in_data[rank]).cuda()
        out = model(in_data)
        out.sum().backward()
        optim.step()
        optim.zero_grad()

    assert np.allclose(optim.gain(), test_case["expected_gain"]), optim.gain()

    if "expected_mean_weight" in test_case:
        mean_weight = mean([model.module[i].weight.data.mean().item() for i in range(4)])
        assert np.allclose(mean_weight, test_case["expected_mean_weight"]), mean_weight

    dist.destroy_process_group()
def test_add_param_group(debias_ewma):
    """Test AdaScale supports add_param_group() API."""
    model1 = Linear(2, 2, bias=True)
    with torch.no_grad():
        # make weights and bias deterministic, which is needed for
        # multi-layer models. For them, adascale gain is affected by
        # parameters from other layers.
        model1.weight.copy_(Tensor([1.0, 2.0, 3.0, 4.0]).reshape(2, 2))
        model1.bias.fill_(0.1)
    optim = AdaScale(SGD(model1.parameters(), lr=0.1), num_gradients_to_accumulate=2, debias_ewma=debias_ewma)
    assert len(optim._hook_handles) == 2

    model2 = Linear(2, 3, bias=True)
    with torch.no_grad():
        # make weights and bias deterministic
        model2.weight.copy_(Tensor([1.0, 2.0, 3.0, 4.0, 5.0, 6.0]).reshape(3, 2))
        model2.bias.fill_(0.2)
    optim.add_param_group({"params": model2.parameters()})
    assert len(optim._hook_handles) == 4

    # make sure we can run the model.
    model = Sequential(model1, model2).cuda()
    in_data_0 = Tensor([1.0, 2.0]).cuda()
    out = model(in_data_0)
    out.sum().backward()
    in_data_1 = Tensor([3.0, 4.0]).cuda()
    out = model(in_data_1)
    out.sum().backward()

    # make sure the gains are right and we can step.
    # since this is the first step, debias_ewma doesn't affect the value.
    assert np.allclose(optim.gain(), 1.1440223454935758), optim.gain()
    assert np.allclose(optim.gain(0), 1.1428571428571428), optim.gain(0)
    assert np.allclose(optim.gain(1), 1.1471258476157762), optim.gain(1)
    optim.step()
    optim.zero_grad()

    # make sure we can add a PG again after stepping.
    model3 = Linear(3, 4, bias=True)
    with torch.no_grad():
        # make weights and bias deterministic
        model3.weight.copy_(Tensor([1.0, 2.0, 3.0, 4.0, 5.0, 6.0] * 2).reshape(4, 3))
        model3.bias.fill_(0.2)
    optim.add_param_group({"params": model3.parameters()})
    assert len(optim._hook_handles) == 6

    # make sure we can run the model.
    model = Sequential(model1, model2, model3).cuda()
    in_data_0 = Tensor([1.0, 2.0]).cuda()
    out = model(in_data_0)
    out.sum().backward()
    in_data_1 = Tensor([3.0, 4.0]).cuda()
    out = model(in_data_1)
    out.sum().backward()

    # make sure gains are right and we can step.
    # the last PG's gain is not affected by debias_ewma since it is the first step for that PG.
    assert np.allclose(optim.gain(), 1.1191193589460822 if debias_ewma else 1.1192783954732368), optim.gain()
    assert np.allclose(optim.gain(0), 1.1428571880897151 if debias_ewma else 1.142857188085096), optim.gain(0)
    assert np.allclose(optim.gain(1), 1.1167103578364508 if debias_ewma else 1.1167104954034948), optim.gain(1)
    assert np.allclose(optim.gain(2), 1.117381091722702), optim.gain(2)
    optim.step()
    optim.zero_grad()