def test():
    """
    ## Step-speed comparison of `MyAdam` and `TorchAdam`

    Builds a `Model`, runs one forward/backward pass on a random batch
    so that every parameter has a gradient, and then repeatedly calls
    `step()` on each optimizer. Gradients are computed only once; every
    `step()` call reuses them.

    Each optimizer gets a 100-step warmup phase followed by a 1,000-step
    phase, and each phase is wrapped in its own `monit.section`
    (presumably so the section reports how long the phase took).
    """
    device_info = DeviceInfo(use_cuda=True, cuda_device=0)
    print(device_info)
    device = device_info.device

    # Random batch of 64 single-channel 28x28 inputs, all labelled class 1
    inp = torch.randn((64, 1, 28, 28), device=device)
    target = torch.ones(64, dtype=torch.long, device=device)
    loss_func = nn.CrossEntropyLoss()

    # Both optimizers are built over the same model parameters
    model = Model().to(device)
    my_adam = MyAdam(model.parameters())
    torch_adam = TorchAdam(model.parameters())

    # Single backward pass to populate the gradients
    loss = loss_func(model(inp), target)
    loss.backward()

    # (section name, optimizer, number of steps), run in this order
    phases = (
        ('MyAdam warmup', my_adam, 100),
        ('MyAdam', my_adam, 1000),
        ('TorchAdam warmup', torch_adam, 100),
        ('TorchAdam', torch_adam, 1000),
    )
    for section_name, optimizer, n_steps in phases:
        with monit.section(section_name):
            for _ in range(n_steps):
                optimizer.step()
def _synthetic_experiment(is_adam: bool) -> None:
    r"""
    ## Synthetic Experiment

    This is the synthetic experiment described in the paper,
    that shows a scenario where *Adam* fails.

    The paper (and Adam) formulates the problem of optimizing as
    minimizing the expected value of a function, $\mathbb{E}[f(\theta)]$
    with respect to the parameters $\theta$.
    In the stochastic training setting we do not get hold of the function $f$
    itself; that is,
    when you are optimizing a NN $f$ would be the function on entire batch of data.
    What we actually evaluate is a mini-batch so the actual function is
    realization of the stochastic $f$.
    This is why we are talking about an expected value.
    So let the function realizations be $f_1, f_2, ..., f_T$ for each time step
    of training.

    We measure the performance of the optimizer as the regret,
    $$R(T) = \sum_{t=1}^T \big[ f_t(\theta_t) - f_t(\theta^*) \big]$$
    where $\theta_t$ is the parameters at time step $t$, and $\theta^*$ is the
    optimal parameters that minimize $\mathbb{E}[f(\theta)]$.

    Now lets define the synthetic problem,

    \begin{align}
    f_t(x) =
    \begin{cases}
    1010 x,  & \text{for $t \mod 101 = 1$} \\
    -10  x, & \text{otherwise}
    \end{cases}
    \end{align}

    where $-1 \le x \le +1$.
    The optimal solution is $x = -1$.

    This code will try running *Adam* and *AMSGrad* on this problem.

    * `is_adam` selects the optimizer: `Adam` when true, `AMSGrad` otherwise.

    Nothing is returned; the per-step regret, $x$ and the average regret
    are saved through `tracker` every 1,000 steps.
    """
    # NOTE: the docstring above is a raw string on purpose. In a normal string
    # `\theta`/`\text` would start with a TAB and `\big`/`\begin` with a
    # BACKSPACE, corrupting the rendered formulas.

    # Define $x$ parameter
    x = nn.Parameter(torch.tensor([.0]))
    # Optimal, $x^* = -1$
    x_star = nn.Parameter(torch.tensor([-1]), requires_grad=False)

    def func(t: int, x_: nn.Parameter):
        """
        ### $f_t(x)$
        """
        if t % 101 == 1:
            return (1010 * x_).sum()
        else:
            return (-10 * x_).sum()

    # Initialize the relevant optimizer
    if is_adam:
        optimizer = Adam([x], lr=1e-2, betas=(0.9, 0.99))
    else:
        optimizer = AMSGrad([x], lr=1e-2, betas=(0.9, 0.99))
    # $R(T)$
    total_regret = 0

    from labml import monit, tracker, experiment

    # Create experiment to record results
    with experiment.record(name='synthetic', comment='Adam' if is_adam else 'AMSGrad'):
        # Run for $10^7$ steps
        for step in monit.loop(10_000_000):
            # $f_t(\theta_t) - f_t(\theta^*)$
            regret = func(step, x) - func(step, x_star)
            # $R(T) = \sum_{t=1}^T \big[ f_t(\theta_t) - f_t(\theta^*) \big]$
            total_regret += regret.item()
            # Track results every 1,000 steps
            if (step + 1) % 1000 == 0:
                tracker.save(loss=regret, x=x, regret=total_regret / (step + 1))
            # Calculate gradients
            regret.backward()
            # Optimize
            optimizer.step()
            # Clear gradients
            optimizer.zero_grad()

            # Make sure $-1 \le x \le +1$
            x.data.clamp_(-1., +1.)