Example #1
def test_build_fp32_params():
    weight = torch.randn(10, 5).cuda().half().requires_grad_()
    bias = torch.randn(10).cuda().half().requires_grad_()
    optimizer = Adam([weight, bias], lr=1e-3)
    optimizer._build_fp32_params([weight, bias])
    for fp32_group, fp16_group in zip(optimizer.fp32_param_groups, optimizer.param_groups):
        for fp32_p, fp16_p in zip(fp32_group["params"], fp16_group["params"]):

            def assert_almost_zero(x):
                assert abs(x) < 1e-3
                return 1.0

            assert fp32_p.dtype == torch.float32
            if fp16_p.requires_grad:
                assert fp16_p.dtype == torch.float16
                (fp32_p - fp16_p).to("cpu").detach().apply_(assert_almost_zero)
Example #2
def test_memory_efficient_with_full_precision_parameters():
    weight = torch.randn(10, 5, requires_grad=True).float().cuda()
    bias = torch.randn(10, requires_grad=True).float().cuda()
    with pytest.raises(AssertionError):
        Adam([weight, bias],
             lr=1e-2,
             precision=Precision.MEMORY_EFFICIENT_MIXED_PRECISION)
Example #3
def test_step_with_grad_scaler():
    weight, bias, input = make_half_precision_params()
    optimizer = Adam([weight, bias], lr=1e-3, precision=Precision.PURE_FP16)
    scaler = GradScaler()
    initial_value = None

    for _i in range(5):
        optimizer.zero_grad()
        loss = (weight.mv(input) + bias).pow(2).sum()
        if _i == 0:
            initial_value = loss.item()
        scaler.scale(loss).backward()
        scaler.step(optimizer)
        scaler.update()

    assert loss.item() < initial_value
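
Several of these tests call a make_half_precision_params helper that is not shown in the listing. Judging from the inline tensor construction in the multi-GPU tests (Examples #5 and #10), a minimal single-GPU sketch could look like this; the shapes and device placement are assumptions, not the library's actual helper:

import torch

def make_half_precision_params():
    # Assumed helper: FP16 weight, bias and input on one GPU, mirroring
    # the inline construction used by the multi-GPU tests below.
    weight = torch.randn(10, 5).cuda().half().requires_grad_()
    bias = torch.randn(10).cuda().half().requires_grad_()
    input = torch.randn(5).cuda().half()
    return weight, bias, input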
Example #4
def make_adam(model):
    if args.ddp_zero:
        return OSS(params=model.parameters(),
                   optim=Adam,
                   group=get_data_parallel_group(),
                   lr=lr)
    else:
        return Adam(model.parameters(), lr=lr)
Example #5
def test_step_multigpu_mixed_precision():
    if not torch.cuda.device_count() > 1:
        return
    weight = torch.randn(10, 5).cuda(0).half().requires_grad_()
    bias = torch.randn(10).cuda(1).half().requires_grad_()
    input = torch.randn(5).cuda(0).half()
    optimizer = Adam([weight, bias], lr=1e-3)

    step_test(optimizer, weight, bias, input)
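
step_test is another helper defined outside these snippets. A plausible sketch, assuming it simply runs a few steps of the same quadratic loss used in Example #3 and checks that the loss decreases (with a small device fix-up for the multi-GPU cases, where weight and bias sit on different GPUs):

def step_test(optimizer, weight, bias, input):
    def fn():
        optimizer.zero_grad()
        y = weight.mv(input)
        # Hypothetical handling for the multi-GPU tests: move the product
        # onto the bias device before adding.
        if y.is_cuda and bias.is_cuda and y.get_device() != bias.get_device():
            y = y.cuda(bias.get_device())
        loss = (y + bias).pow(2).sum()
        loss.backward()
        return loss

    initial_value = fn().item()
    for _i in range(5):
        optimizer.step(fn)
    assert fn().item() < initial_value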
Example #6
def test_state_dict_memory_efficient():
    # TODO: Optimizer state gets cast to FP16 and back to FP32 for
    # mixed-precision and memory-efficient mixed-precision, resulting
    # in a potential loss of precision. Thus, as training proceeds, we don't
    # necessarily expect the parameters to remain exactly the same.
    weight, bias, input = make_half_precision_params()
    optimizer = Adam([weight, bias], lr=1e-3, precision=Precision.MEMORY_EFFICIENT_MIXED_PRECISION)

    state_dict_test(optimizer, weight, bias, input)
Example #7
def test_update_optim_scale():
    weight, bias, input = make_half_precision_params()
    optimizer = Adam([weight, bias], lr=1e-3, precision=Precision.PURE_FP16)
    optimizer._optim_scale_update_freq = 1
    optimizer._optim_scale = 2**15

    optimizer.zero_grad()
    loss = (weight.mv(input) + bias).pow(2).sum()
    loss.backward()
    optimizer.step()

    assert optimizer._optim_scale == 2**16
Example #8
def make_model(device, ntokens):
    ninp = 50  # embedding dimension
    nhid = 50  # the dimension of the feedforward network model in nn.TransformerEncoder
    nhead = 2  # the number of heads in the multiheadattention models
    dropout = 0
    initrange = 0.1

    model = TransformerLMSequntial(ntokens, ninp, nhead, nhid, dropout, initrange).half().to(device)
    balance = generate_balance(min(num_devices, 4), len(model))
    p = Pipe(model, balance, chunks=len(balance))

    criterion = nn.CrossEntropyLoss()
    lr = 0.001  # learning rate

    try:
        optimizer = Adam(p.parameters(), lr=lr, precision=Precision.PURE_FP16)
    except NameError:
        optimizer = Adam(p.parameters(), lr=lr)
    scaler = GradScaler()

    return p, criterion, optimizer, scaler
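
The tuple returned by make_model is presumably driven by an ordinary scaled FP16 training step, like the one Example #3 spells out. A hypothetical sketch of such a step (train_step, the reshaping, and the device assumptions are illustrative, not code from the benchmark):

def train_step(p, criterion, optimizer, scaler, source, target):
    # Assumes source already lives on the first Pipe device and target on
    # the last one, where the pipeline output is produced.
    optimizer.zero_grad()
    output = p(source)
    loss = criterion(output.reshape(-1, output.size(-1)), target.reshape(-1))
    scaler.scale(loss).backward()
    scaler.step(optimizer)
    scaler.update()
    return loss.item()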
Example #9
def test_exploding_optimizer_state():
    weight = torch.tensor([[float("inf")]]).half().cuda().requires_grad_()
    input = torch.tensor([1.0]).half().cuda().requires_grad_()

    optimizer = Adam([weight], lr=1e-3, precision=Precision.PURE_FP16)
    optimizer._optim_scale = 1.0

    optimizer.zero_grad()
    loss = (weight.mv(input)).pow(2).sum()
    loss.backward()
    with pytest.raises(RuntimeError):
        optimizer.step()
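
Because a blown-up FP16 state surfaces as a RuntimeError from step(), a training loop that wants to survive such a batch could guard the call and skip the update; a minimal sketch of that caller-side pattern (not part of the optimizer itself):

try:
    optimizer.step()
except RuntimeError:
    # Optimizer state overflowed in FP16; drop this batch instead of
    # writing non-finite values into the parameters.
    optimizer.zero_grad()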
Example #10
def test_step_pure_fp16_multigpu():
    if not torch.cuda.device_count() > 1:
        return
    weight = torch.randn(10, 5).half().cuda(0).requires_grad_()
    bias = torch.randn(10).half().cuda(1).requires_grad_()
    input = torch.randn(5).half().cuda(0)
    optimizer = Adam([weight, bias], lr=1e-3, precision=Precision.PURE_FP16)

    step_test(optimizer, weight, bias, input)

    assert optimizer.state[weight]["exp_avg"].dtype == torch.float16
    assert optimizer.state[weight]["exp_avg_sq"].dtype == torch.float16
    assert optimizer.state[bias]["exp_avg"].dtype == torch.float16
    assert optimizer.state[bias]["exp_avg_sq"].dtype == torch.float16
Example #11
def test_step_full_precision_inferred():
    weight, bias, input = make_full_precision_params()
    optimizer = Adam([weight, bias], lr=1e-3)

    step_test(optimizer, weight, bias, input)

    for group in optimizer.param_groups:
        for p in group["params"]:
            if p.requires_grad:
                assert p.dtype == torch.float32
    assert not optimizer.fp32_param_groups

    assert optimizer.state[weight]["exp_avg"].dtype == torch.float32
    assert optimizer.state[weight]["exp_avg_sq"].dtype == torch.float32
    assert optimizer.state[bias]["exp_avg"].dtype == torch.float32
    assert optimizer.state[bias]["exp_avg_sq"].dtype == torch.float32
Example #12
def test_step_pure_fp16():
    weight, bias, input = make_half_precision_params()
    optimizer = Adam([weight, bias], lr=1e-3, precision=Precision.PURE_FP16)
    step_test(optimizer, weight, bias, input)

    for group in optimizer.param_groups:
        for p in group["params"]:
            if p.requires_grad:
                assert p.dtype == torch.float16

    assert optimizer.state[weight]["exp_avg"].dtype == torch.float16
    assert optimizer.state[weight]["exp_avg_sq"].dtype == torch.float16
    assert optimizer.state[bias]["exp_avg"].dtype == torch.float16
    assert optimizer.state[bias]["exp_avg_sq"].dtype == torch.float16

    assert not optimizer.fp32_param_groups
Example #13
def test_step_memory_efficient():
    weight, bias, input = make_half_precision_params()
    optimizer = Adam([weight, bias], lr=1e-3, precision=Precision.MEMORY_EFFICIENT_MIXED_PRECISION)
    step_test(optimizer, weight, bias, input)

    for group in optimizer.param_groups:
        for p in group["params"]:
            if p.requires_grad:
                assert p.dtype == torch.float16

    assert not optimizer.fp32_param_groups

    assert optimizer.state[weight]["exp_avg"].dtype == torch.float32
    assert optimizer.state[weight]["exp_avg_sq"].dtype == torch.float32
    assert optimizer.state[bias]["exp_avg"].dtype == torch.float32
    assert optimizer.state[bias]["exp_avg_sq"].dtype == torch.float32
Example #14
def state_dict_test(optimizer, weight, bias, input):
    def fn_base(optimizer, weight, bias, input):
        optimizer.zero_grad()
        loss = (weight.mv(input) + bias).pow(2).sum()
        loss.backward()
        return loss

    fn = functools.partial(fn_base, optimizer, weight, bias, input)

    # Prime the optimizer
    for _i in range(5):
        optimizer.step(fn)
    # Clone the weights and construct new optimizer for them
    weight_c = weight.data.clone().requires_grad_()
    bias_c = bias.data.clone().requires_grad_()
    optimizer_c = Adam([weight_c, bias_c],
                       lr=1e-3,
                       precision=optimizer.precision)
    fn_c = functools.partial(fn_base, optimizer_c, weight_c, bias_c, input)
    # Load state dict
    state_dict = deepcopy(optimizer.state_dict())
    optimizer_c.load_state_dict(state_dict)

    for group, group_c in zip(optimizer.param_groups,
                              optimizer_c.param_groups):
        for p, p_c in zip(group["params"], group_c["params"]):
            assert torch.equal(optimizer.state[p]["exp_avg"],
                               optimizer_c.state[p_c]["exp_avg"])
            assert torch.equal(optimizer.state[p]["exp_avg_sq"],
                               optimizer_c.state[p_c]["exp_avg_sq"])

    if optimizer.fp32_param_groups:
        # When using mixed precision, fp32_param_groups are made from FP16 params rather than
        # copied via state_dict, introducing differences between the original optimizer and
        # the copy. Because this test requires that they be exactly the same, we copy
        # the fp32 params from the original optimizer to the copy.
        optimizer_c.fp32_param_groups = deepcopy(optimizer.fp32_param_groups)

    # Run both optimizations in parallel
    for _i in range(5):
        optimizer.step(fn)
        optimizer_c.step(fn_c)

        assert torch.equal(weight, weight_c)
        assert torch.equal(bias, bias_c)
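
Outside the tests, the same state_dict round trip is what a checkpoint would use. A minimal sketch with the standard torch.save / torch.load API (the file name is arbitrary):

import torch

# Save the optimizer state alongside whatever else the checkpoint holds.
torch.save({"optim": optimizer.state_dict()}, "checkpoint.pt")

# Later: rebuild an Adam over the same params and precision, then restore.
checkpoint = torch.load("checkpoint.pt")
optimizer.load_state_dict(checkpoint["optim"])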
Example #15
def test_step_mixed_precision_inferred():
    weight, bias, input = make_half_precision_params()
    optimizer = Adam([weight, bias], lr=1e-3)
    step_test(optimizer, weight, bias, input)

    assert len(optimizer.fp32_param_groups) == len(optimizer.param_groups)

    for fp32_group, fp16_group in zip(optimizer.fp32_param_groups, optimizer.param_groups):
        for fp32_p, fp16_p in zip(fp32_group["params"], fp16_group["params"]):

            def assert_almost_zero(x):
                assert abs(x) < 1e-3
                return 1.0

            assert fp32_p.dtype == torch.float32
            if fp16_p.requires_grad:
                assert fp16_p.dtype == torch.float16
                (fp32_p - fp16_p).to("cpu").detach().apply_(assert_almost_zero)

    assert optimizer.state[weight]["exp_avg"].dtype == torch.float32
    assert optimizer.state[weight]["exp_avg_sq"].dtype == torch.float32
    assert optimizer.state[bias]["exp_avg"].dtype == torch.float32
    assert optimizer.state[bias]["exp_avg_sq"].dtype == torch.float32
Example #16
def make_adam(model):
    return Adam(model.parameters(), lr=lr)
Example #17
def make_adam(params):
    if args.ddp_zero:
        return OSS(params=params, optim=Adam, group=get_data_parallel_group(), lr=lr)
    else:
        return Adam(params, lr=lr)
Example #18
def test_pure_fp16_with_full_precision_parameters():
    weight = torch.randn(10, 5, requires_grad=True).float().cuda()
    bias = torch.randn(10, requires_grad=True).float().cuda()
    with pytest.raises(AssertionError):
        Adam([weight, bias], lr=1e-2, precision=Precision.PURE_FP16)
Example #19
def test_amsgrad():
    weight = torch.randn(10, 5, requires_grad=True).float().cuda()
    bias = torch.randn(10, requires_grad=True).float().cuda()
    with pytest.raises(RuntimeError):
        Adam([weight, bias], lr=1e-2, amsgrad=True)
Example #20
def test_invalid_weight_decay():
    weight = torch.randn(10, 5, requires_grad=True).float().cuda()
    bias = torch.randn(10, requires_grad=True).float().cuda()
    with pytest.raises(ValueError):
        Adam([weight, bias], lr=1e-2, weight_decay=-1)
Example #21
def test_invalid_beta():
    weight = torch.randn(10, 5, requires_grad=True).float().cuda()
    bias = torch.randn(10, requires_grad=True).float().cuda()
    with pytest.raises(ValueError):
        Adam([weight, bias], lr=1e-2, betas=(1.0, 0.0))
Example #22
def test_state_dict_pure_fp16():
    weight, bias, input = make_half_precision_params()
    optimizer = Adam([weight, bias], lr=1e-3, precision=Precision.PURE_FP16)

    state_dict_test(optimizer, weight, bias, input)
Example #23
def test_state_dict_full_precision():
    weight, bias, input = make_full_precision_params()
    optimizer = Adam([weight, bias], lr=1e-3)

    state_dict_test(optimizer, weight, bias, input)