Example #1
from labml_nn.optimizers.configs import OptimizerConfigs


def _adam_optimizer(c: OptimizerConfigs):
    # Create an AMSGrad optimizer when the `amsgrad` flag is set;
    # otherwise fall back to plain Adam with the same hyper-parameters.
    if c.amsgrad:
        from labml_nn.optimizers.amsgrad import AMSGrad
        return AMSGrad(c.parameters,
                       lr=c.learning_rate, betas=c.betas, eps=c.eps,
                       optimized_update=c.optimized_adam_update,
                       weight_decay=c.weight_decay_obj, amsgrad=c.amsgrad)
    else:
        from labml_nn.optimizers.adam import Adam
        return Adam(c.parameters,
                    lr=c.learning_rate, betas=c.betas, eps=c.eps,
                    optimized_update=c.optimized_adam_update,
                    weight_decay=c.weight_decay_obj)
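
Because `_adam_optimizer` only reads plain attributes off the config object, it can be smoke-tested without the full labml configs machinery. The sketch below is hypothetical: `SimpleNamespace` stands in for `OptimizerConfigs`, and `WeightDecay()` from `labml_nn.optimizers` is assumed to be an acceptable `weight_decay_obj` (it is the default weight-decay helper in that package).

# Hypothetical smoke test; `SimpleNamespace` fakes the config object,
# carrying only the fields the factory actually reads.
from types import SimpleNamespace

import torch.nn as nn
from labml_nn.optimizers import WeightDecay

model = nn.Linear(4, 2)
c = SimpleNamespace(
    amsgrad=True,                    # flip to False to get plain Adam
    parameters=model.parameters(),
    learning_rate=1e-3,
    betas=(0.9, 0.999),
    eps=1e-8,
    optimized_adam_update=True,
    weight_decay_obj=WeightDecay(),  # assumed-compatible weight decay helper
)
print(type(_adam_optimizer(c)).__name__)  # AMSGrad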
Example #2
import torch
from torch import nn

from labml import monit

# Assumed context, defined elsewhere in the original test module:
# `DeviceInfo` is a labml helper wrapping device selection, `Model` is a
# small classifier taking 28x28 single-channel inputs, `MyAdam` is the
# labml_nn Adam implementation, and `TorchAdam` is `torch.optim.Adam`.
def test():
    device_info = DeviceInfo(use_cuda=True, cuda_device=0)
    print(device_info)
    # A single dummy batch; gradients are computed once, and the optimizers
    # are then stepped repeatedly on those stale gradients, since this
    # benchmark measures the cost of `step()` alone.
    inp = torch.randn((64, 1, 28, 28), device=device_info.device)
    target = torch.ones(64, dtype=torch.long, device=device_info.device)
    loss_func = nn.CrossEntropyLoss()
    model = Model().to(device_info.device)
    my_adam = MyAdam(model.parameters())
    torch_adam = TorchAdam(model.parameters())
    loss = loss_func(model(inp), target)
    loss.backward()
    # Warm up each optimizer before the timed run, so one-off costs
    # (state initialization, CUDA kernel launches) are excluded.
    with monit.section('MyAdam warmup'):
        for i in range(100):
            my_adam.step()
    with monit.section('MyAdam'):
        for i in range(1000):
            my_adam.step()
    with monit.section('TorchAdam warmup'):
        for i in range(100):
            torch_adam.step()
    with monit.section('TorchAdam'):
        for i in range(1000):
            torch_adam.step()
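
One caveat with wall-clock sections on a GPU: CUDA kernels launch asynchronously, so a timer that does not synchronize can under-count work still queued on the device. Below is a minimal sketch of a synchronized timing helper, using only standard `torch` and `time` calls (independent of labml's `monit.section`):

import time

import torch


def time_optimizer_steps(optimizer: torch.optim.Optimizer, n_steps: int = 1000) -> float:
    """Wall-clock seconds for `n_steps` calls to `optimizer.step()`,
    synchronizing CUDA so queued kernels are included in the measurement."""
    if torch.cuda.is_available():
        torch.cuda.synchronize()
    start = time.perf_counter()
    for _ in range(n_steps):
        optimizer.step()
    if torch.cuda.is_available():
        torch.cuda.synchronize()
    return time.perf_counter() - start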
Example #3
File: amsgrad.py Project: wx-b/nn
import torch
from torch import nn

from labml_nn.optimizers.adam import Adam
from labml_nn.optimizers.amsgrad import AMSGrad


def _synthetic_experiment(is_adam: bool):
    """
    ## Synthetic Experiment

    This is the synthetic experiment described in the paper,
    which shows a scenario where *Adam* fails.

    The paper (and Adam) formulates optimization as
    minimizing the expected value of a function, $\mathbb{E}[f(\theta)]$,
    with respect to the parameters $\theta$.
    In the stochastic training setting we do not get hold of the function $f$
    itself; that is,
    when optimizing a neural network, $f$ would be the loss computed over the
    entire dataset.
    What we actually evaluate is a mini-batch, so the function we see is a
    realization of the stochastic $f$.
    This is why we talk about an expected value.
    So let the function realizations be $f_1, f_2, \ldots, f_T$, one for each
    time step of training.

    We measure the performance of the optimizer as the regret,
    $$R(T) = \sum_{t=1}^T \big[ f_t(\theta_t) - f_t(\theta^*) \big]$$
    where $\theta_t$ are the parameters at time step $t$, and $\theta^*$ are the
    optimal parameters that minimize $\mathbb{E}[f(\theta)]$.

    Now let's define the synthetic problem,
    \begin{align}
    f_t(x) =
    \begin{cases}
    1010 x,  & \text{for } t \bmod 101 = 1 \\
    -10  x, & \text{otherwise}
    \end{cases}
    \end{align}
    where $-1 \le x \le +1$.
    The optimal solution is $x = -1$: over each cycle of $101$ steps the
    coefficients sum to $1010 - 100 \times 10 = +10$, so the expected value
    of $f_t(x)$ is minimized at the lower boundary.

    This code will try running *Adam* and *AMSGrad* on this problem.
    """

    # Define the parameter $x$
    x = nn.Parameter(torch.tensor([.0]))
    # Optimal value, $x^* = -1$ (a float tensor, to match the dtype of $x$)
    x_star = nn.Parameter(torch.tensor([-1.]), requires_grad=False)

    def func(t: int, x_: nn.Parameter):
        """
        ### $f_t(x)$
        """
        if t % 101 == 1:
            return (1010 * x_).sum()
        else:
            return (-10 * x_).sum()

    # Initialize the relevant optimizer
    if is_adam:
        optimizer = Adam([x], lr=1e-2, betas=(0.9, 0.99))
    else:
        optimizer = AMSGrad([x], lr=1e-2, betas=(0.9, 0.99))
    # $R(T)$
    total_regret = 0

    from labml import monit, tracker, experiment

    # Create experiment to record results
    with experiment.record(name='synthetic',
                           comment='Adam' if is_adam else 'AMSGrad'):
        # Run for $10^7$ steps
        for step in monit.loop(10_000_000):
            # $f_t(\theta_t) - f_t(\theta^*)$
            regret = func(step, x) - func(step, x_star)
            # $R(T) = \sum_{t=1}^T \big[ f_t(\theta_t) - f_t(\theta^*) \big]$
            total_regret += regret.item()
            # Track results every 1,000 steps
            if (step + 1) % 1000 == 0:
                tracker.save(loss=regret,
                             x=x,
                             regret=total_regret / (step + 1))
            # Calculate gradients
            regret.backward()
            # Optimize
            optimizer.step()
            # Clear gradients
            optimizer.zero_grad()

            # Make sure $-1 \le x \le +1$
            x.data.clamp_(-1., +1.)
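
Why Adam drifts the wrong way here: the rare $+1010$ gradient is heavily damped by the adaptive denominator (its square dominates $v_t$ on that step), while the frequent $-10$ gradients keep nudging $x$ toward $+1$; AMSGrad keeps the running maximum of $v_t$, so the rare large gradient retains its influence. As a quick sanity check, here is a hypothetical sketch using stock `torch.optim.Adam` rather than the labml implementations above, which reproduces the drift in far fewer steps:

import torch
from torch import nn

# Hypothetical sanity check (assumes torch.optim.Adam behaves like the
# labml Adam on this problem); not part of the original experiment.
x = nn.Parameter(torch.tensor([0.0]))
optimizer = torch.optim.Adam([x], lr=1e-2, betas=(0.9, 0.99))
for t in range(100_000):
    # Same f_t as above: rare +1010 slope, frequent -10 slope
    coefficient = 1010. if t % 101 == 1 else -10.
    loss = (coefficient * x).sum()
    loss.backward()
    optimizer.step()
    optimizer.zero_grad()
    # Keep x inside the feasible region [-1, +1]
    x.data.clamp_(-1., +1.)
# Expected outcome: x ends near +1, although the optimum is -1
print(x.item())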