Example #1
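These snippets exercise compute_adaptive_lr, the fused kernel that computes the LARS trust ratio (they appear to come from a torchlars-style LARS implementation). Each test takes a dtype argument, which implies a pytest fixture or parametrization; a minimal harness sketch follows, in which the import path and the dtype list are assumptions rather than part of the original snippets:

import pytest
import torch

# Assumed import path; point this at wherever the package under test
# exposes the fused kernel (in torchlars it is a C++/CUDA extension).
from torchlars._adaptive_lr import compute_adaptive_lr


@pytest.fixture(params=[torch.float, torch.double])
def dtype(request):
    # Every test that accepts `dtype` runs once per floating-point type.
    return request.param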
def test_compare_cpu_and_gpu(dtype):
    param_norm = torch.tensor(1., dtype=dtype)
    grad_norm = torch.tensor(1., dtype=dtype)
    adaptive_lr_cpu = torch.tensor(0., dtype=dtype)

    weight_decay = 1.
    eps = 2.
    trust_coef = 1.

    adaptive_lr_cpu = compute_adaptive_lr(
        param_norm,
        grad_norm,
        weight_decay,
        eps,
        trust_coef,
        adaptive_lr_cpu)

    # Same inputs and hyper-parameters as above, this time on the GPU.
    param_norm = torch.tensor(1., dtype=dtype, device='cuda')
    grad_norm = torch.tensor(1., dtype=dtype, device='cuda')
    adaptive_lr_gpu = torch.tensor(0., dtype=dtype, device='cuda')

    weight_decay = 1.
    eps = 2.
    trust_coef = 1.

    adaptive_lr_gpu = compute_adaptive_lr(
        param_norm,
        grad_norm,
        weight_decay,
        eps,
        trust_coef,
        adaptive_lr_gpu)

    assert torch.allclose(adaptive_lr_cpu, adaptive_lr_gpu.cpu())
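This test, like test_when_grad_norm_is_zero_with_half below, allocates tensors with device='cuda' and therefore needs a CUDA-capable machine. Under pytest such tests are typically guarded by a marker along these lines (an assumed guard, not shown in the original snippets):

requires_cuda = pytest.mark.skipif(
    not torch.cuda.is_available(), reason='this test needs a CUDA device')

Applying @requires_cuda to the CUDA-dependent tests lets the rest of the suite run on CPU-only machines.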
Example #2
def test_specific_case(dtype):
    param_norm = torch.tensor(1.234, dtype=dtype)
    grad_norm = torch.tensor(5.678, dtype=dtype)
    adaptive_lr = torch.tensor(0., dtype=dtype)

    weight_decay = 1e-4
    eps = 1e-8
    trust_coef = 0.001

    adaptive_lr = compute_adaptive_lr(param_norm, grad_norm, weight_decay, eps,
                                      trust_coef, adaptive_lr)

    assert torch.allclose(adaptive_lr, torch.tensor(0.000217325, dtype=dtype))
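The expected value 0.000217325 is consistent with the standard LARS trust ratio. Assuming compute_adaptive_lr implements that formula, the number works out as:

adaptive_lr = trust_coef * param_norm / (grad_norm + weight_decay * param_norm + eps)
            = 0.001 * 1.234 / (5.678 + 1e-4 * 1.234 + 1e-8)
            ≈ 0.000217325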
Example #3
def test_when_grad_norm_is_zero_with_half():
    param_norm = torch.tensor(1., dtype=torch.half, device='cuda')
    grad_norm = torch.tensor(0., dtype=torch.half, device='cuda')
    adaptive_lr = torch.tensor(0., dtype=torch.half, device='cuda')

    weight_decay = 1.
    eps = 1.
    trust_coef = 1.

    adaptive_lr = compute_adaptive_lr(param_norm, grad_norm, weight_decay, eps,
                                      trust_coef, adaptive_lr)

    assert adaptive_lr == torch.tensor(1., dtype=torch.half, device='cuda')
Example #4
def test_when_grad_norm_is_zero(dtype):
    param_norm = torch.tensor(1., dtype=dtype)
    grad_norm = torch.tensor(0., dtype=dtype)
    adaptive_lr = torch.tensor(0., dtype=dtype)

    weight_decay = 1.
    eps = 1.
    trust_coef = 1.

    adaptive_lr = compute_adaptive_lr(param_norm, grad_norm, weight_decay, eps,
                                      trust_coef, adaptive_lr)

    assert adaptive_lr == torch.tensor(1., dtype=dtype)
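Together with test_specific_case, the two zero-gradient tests pin down the kernel's semantics: apply the trust ratio only when both norms are positive, and fall back to 1 otherwise (the plain formula would yield 1 * 1 / (0 + 1 * 1 + 1) = 0.5 here, not the asserted 1). A pure-Python reference sketch reconstructed from these tests, not the actual kernel:

def reference_adaptive_lr(param_norm, grad_norm, weight_decay, eps, trust_coef):
    # Reconstructed semantics: the LARS trust ratio, with a fallback to 1
    # when either norm is zero (e.g. a parameter whose gradient vanishes).
    if param_norm > 0 and grad_norm > 0:
        return trust_coef * param_norm / (grad_norm + weight_decay * param_norm + eps)
    return 1.0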
Example #5
    def apply_adaptive_lrs(self, weight_decays):
        with torch.no_grad():
            for group, weight_decay in zip(self.optim.param_groups,
                                           weight_decays):
                if weight_decay is None:
                    weight_decay = 0.0
                for p in group['params']:
                    if p.grad is None:
                        continue

                    if group.get('lars_adaptation', True):
                        param_norm = p.norm()
                        grad_norm = p.grad.norm()

                        # The optimizer base class offers no way to change the
                        # `dtype` or device of its internal tensors (such as
                        # `adaptive_lr`), and LARS follows the optimizer
                        # interface, so it cannot change them explicitly
                        # either. Instead, LARS adjusts its member tensor
                        # implicitly: whenever the spec of `adaptive_lr`
                        # differs from that of the current parameter's norm,
                        # the tensor is rebuilt to match.
                        param_norm_spec = (param_norm.is_cuda,
                                           param_norm.type())
                        adaptive_lr_spec = (self.adaptive_lr.is_cuda,
                                            self.adaptive_lr.type())

                        if param_norm_spec != adaptive_lr_spec:
                            self.adaptive_lr = torch.ones_like(param_norm)

                        # Calculate the adaptive lr; weight decay enters the
                        # trust-ratio denominator inside the kernel.
                        adaptive_lr = compute_adaptive_lr(
                            param_norm, grad_norm, weight_decay, self.eps,
                            self.trust_coef, self.adaptive_lr)

                    else:
                        adaptive_lr = group['lr']

                    # Fold weight decay into the gradient, then rescale the
                    # gradient by the adaptive learning rate.
                    p.grad.add_(p.data, alpha=weight_decay)
                    p.grad.mul_(adaptive_lr)
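The implicit spec matching described in the comment above can be isolated into a few lines. A standalone sketch of the idiom, with hypothetical values and assuming a CUDA device is available:

import torch

adaptive_lr = torch.tensor(0.)  # cached buffer: CPU, float32
param_norm = torch.randn(10, device='cuda', dtype=torch.half).norm()

# Rebuild the cached buffer whenever the parameter's (device, dtype) spec
# stops matching; torch.ones_like copies both the device and the dtype.
if (param_norm.is_cuda, param_norm.type()) != (adaptive_lr.is_cuda, adaptive_lr.type()):
    adaptive_lr = torch.ones_like(param_norm)

assert adaptive_lr.is_cuda and adaptive_lr.dtype == torch.half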