import functools
from copy import deepcopy

import pytest
import torch

# Assumed import path for the mixed-precision Adam under test; adjust to
# match the module that defines `Adam` and `Precision` (e.g. fairscale.optim.adam).
from fairscale.optim import Adam, Precision


def test_exploding_optimizer_state():
    # An infinite weight produces non-finite gradients and optimizer state in
    # pure FP16, which the optimizer should detect and raise as a RuntimeError.
    weight = torch.tensor([[float("inf")]]).half().cuda().requires_grad_()
    input = torch.tensor([1.0]).half().cuda().requires_grad_()

    optimizer = Adam([weight], lr=1e-3, precision=Precision.PURE_FP16)
    optimizer._optim_scale = 1.0

    optimizer.zero_grad()
    loss = (weight.mv(input)).pow(2).sum()
    loss.backward()

    with pytest.raises(RuntimeError):
        optimizer.step()
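# `make_half_precision_params` is used below but not defined in this section.
# A minimal sketch of what it plausibly returns, assuming small random FP16
# tensors on CUDA (the exact shapes and values are an assumption):
def make_half_precision_params():
    weight = torch.randn(10, 5).half().cuda().requires_grad_()
    bias = torch.randn(10).half().cuda().requires_grad_()
    input = torch.randn(5).half().cuda()
    return weight, bias, input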
def test_update_optim_scale():
    # With an update frequency of 1, a single clean step should double the
    # optimizer's loss scale, from 2**15 to 2**16.
    weight, bias, input = make_half_precision_params()
    optimizer = Adam([weight, bias], lr=1e-3, precision=Precision.PURE_FP16)
    optimizer._optim_scale_update_freq = 1
    optimizer._optim_scale = 2**15

    optimizer.zero_grad()
    loss = (weight.mv(input) + bias).pow(2).sum()
    loss.backward()
    optimizer.step()

    assert optimizer._optim_scale == 2**16
def state_dict_test(optimizer, weight, bias, input):
    def fn_base(optimizer, weight, bias, input):
        optimizer.zero_grad()
        loss = (weight.mv(input) + bias).pow(2).sum()
        loss.backward()
        return loss

    fn = functools.partial(fn_base, optimizer, weight, bias, input)

    # Prime the optimizer
    for _i in range(5):
        optimizer.step(fn)

    # Clone the weights and construct a new optimizer for them
    weight_c = weight.data.clone().requires_grad_()
    bias_c = bias.data.clone().requires_grad_()
    optimizer_c = Adam([weight_c, bias_c], lr=1e-3, precision=optimizer.precision)
    fn_c = functools.partial(fn_base, optimizer_c, weight_c, bias_c, input)

    # Load state dict
    state_dict = deepcopy(optimizer.state_dict())
    optimizer_c.load_state_dict(state_dict)

    for group, group_c in zip(optimizer.param_groups, optimizer_c.param_groups):
        for p, p_c in zip(group["params"], group_c["params"]):
            assert torch.equal(optimizer.state[p]["exp_avg"], optimizer_c.state[p_c]["exp_avg"])
            assert torch.equal(optimizer.state[p]["exp_avg_sq"], optimizer_c.state[p_c]["exp_avg_sq"])

    if optimizer.fp32_param_groups:
        # When using mixed precision, fp32_param_groups are made from FP16 params
        # rather than copied via state_dict, introducing differences between the
        # original optimizer and the copy. Because this test requires that they be
        # the exact same, we copy the fp32 params from the original optimizer to
        # the copy.
        optimizer_c.fp32_param_groups = deepcopy(optimizer.fp32_param_groups)

    # Run both optimizations in parallel
    for _i in range(5):
        optimizer.step(fn)
        optimizer_c.step(fn_c)
        assert torch.equal(weight, weight_c)
        assert torch.equal(bias, bias_c)
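# A minimal sketch of how `state_dict_test` might be driven, assuming the
# `make_half_precision_params` helper above; the precision mode here is an
# illustrative choice, not necessarily the only one the suite exercises:
def test_state_dict_pure_fp16():
    weight, bias, input = make_half_precision_params()
    optimizer = Adam([weight, bias], lr=1e-3, precision=Precision.PURE_FP16)
    state_dict_test(optimizer, weight, bias, input)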