Example #1: one optimization step for a Double DQN with a prioritized replay buffer
import torch
import torch.nn.functional as F


def optimize_model(policy_net: Model, target_net: Model,
                   replay_buffer: ReplayBuffer, optimizer, num_actions):
    """Sample a random batch from replay_buffer and train policy_net on it for one iteration."""
    # Sample a batch of transitions (plus their priority indices) from the replay buffer
    obs_batch, actions_batch, rewards_batch, obs_next_batch, sample_ix = replay_buffer.sample(
        BATCH_SIZE)
    # gather() below requires int64 indices, hence the .long() cast
    actions_batch: torch.Tensor = torch.from_numpy(actions_batch).long().to(DEVICE)
    obs_batch_torch: torch.Tensor = torch.from_numpy(obs_batch).to(DEVICE)
    obs_next_batch_torch: torch.Tensor = torch.from_numpy(obs_next_batch).to(
        DEVICE)

    # Q(s_t, a): values predicted by the policy network for the actions actually taken
    q_t_batch = policy_net(obs_batch_torch)
    q_t_ac = q_t_batch.gather(1, actions_batch.unsqueeze(1))
    # Double-DQN target: select the next action with the policy network,
    # but evaluate it with the target network
    with torch.no_grad():
        rewards_batch_torch = torch.from_numpy(rewards_batch).float().to(
            DEVICE)
        q_tp1 = policy_net(obs_next_batch_torch)
        _, q_tp1_maxind = q_tp1.max(1)
        q_tp1_target = target_net(obs_next_batch_torch)
        q_target = rewards_batch_torch.unsqueeze(1) + \
            GAMMA * q_tp1_target.gather(1, q_tp1_maxind.unsqueeze(1))

    # Per-sample TD errors, kept unreduced so they can also update the buffer priorities
    errors: torch.Tensor = F.smooth_l1_loss(q_t_ac,
                                            q_target.float().to(DEVICE),
                                            reduction='none')
    loss = errors.mean()
    optimizer.zero_grad()
    loss.backward()
    torch.nn.utils.clip_grad_norm_(policy_net.parameters(), GRAD_CLIP)
    optimizer.step()
    # Update replay buffer priorities with the fresh TD errors
    replay_buffer.add_errors(sample_ix,
                             errors.detach().squeeze(1).cpu().numpy())