Example #1
def optimize_model(memory, batch_size, gamma=0.999):
    if len(memory) < batch_size:
        return
    transitions = memory.sample(batch_size)
    batch = utils.Transition(*zip(*transitions))

    next_state_batch = torch.stack(batch.next_state).to(device)
    state_batch = torch.stack(batch.state).to(device)
    action_batch = torch.stack(batch.action).to(device)
    reward_batch = torch.stack(batch.reward).to(device)
    done_batch = torch.stack(batch.done).to(device)

    # Compute Q(s_t, a) - the model computes Q(s_t), then we select the
    # columns of actions taken
    state_action_values = policy_net(state_batch).gather(1, action_batch)

    # Compute V(s_{t+1}) for all next states (Double DQN): the policy net
    # selects the greedy next action, the target net evaluates it.
    next_action = policy_net(next_state_batch).argmax(dim=1).unsqueeze(1)
    next_state_values = target_net(next_state_batch).gather(
        1, next_action).detach()
    # Compute the expected Q values
    expected_state_action_values = (next_state_values * gamma *
                                    (1.0 - done_batch)) + reward_batch

    # Compute Huber loss
    loss = F.smooth_l1_loss(state_action_values, expected_state_action_values)

    # Optimize the model
    optimizer.zero_grad()
    loss.backward()
    # Clip gradients element-wise to stabilize training
    for param in policy_net.parameters():
        param.grad.data.clamp_(-1, 1)
    optimizer.step()
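These optimize_model examples lean on two helpers that are not shown: utils.Transition and a replay memory exposing sample() and __len__(). The following is only a rough sketch of what they could look like (the field names and the uniform buffer are assumptions, not the actual utils module); the push() variants shown in Examples #2 and #5 would live on such a class.

import random
from collections import namedtuple

# Assumed field layout of utils.Transition; the real names/order may differ.
Transition = namedtuple('Transition',
                        ('state', 'action', 'next_state', 'reward', 'done'))

class ReplayMemory:
    """Minimal uniform replay buffer exposing the interface used above."""

    def __init__(self, capacity):
        self.capacity = capacity
        self.memory = []
        self.position = 0

    def sample(self, batch_size):
        # Uniformly sample a batch of stored transitions.
        return random.sample(self.memory, batch_size)

    def __len__(self):
        return len(self.memory)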
Example #2
 def push(self, *args):
     """Saves a transition, assigning it the current maximum priority."""
     # New transitions get the highest priority seen so far, so each one
     # is likely to be replayed at least once before its priority decays.
     max_prio = self.priorities.max() if self.memory else 1.0
     if len(self.memory) < self.capacity:
         self.memory.append(None)
     self.memory[self.position] = utils.Transition(*args)
     self.priorities[self.position] = max_prio
     # Advance the write pointer, wrapping around like a ring buffer.
     self.position = (self.position + 1) % self.capacity
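The sample() and update_priorities() methods that go with this push are not shown. Below is one possible proportional-prioritization sketch; the alpha/beta exponents and the assumption that self.priorities is a numpy array of length capacity (with numpy imported as np) are mine, but the returned (transitions, indices, weights) triple matches what Example #4 consumes.

 def sample(self, batch_size, alpha=0.6, beta=0.4):
     # Only the filled part of the priority array is valid.
     prios = self.priorities[:len(self.memory)]
     probs = np.asarray(prios, dtype=np.float64) ** alpha
     probs /= probs.sum()

     indices = np.random.choice(len(self.memory), batch_size, p=probs)
     transitions = [self.memory[i] for i in indices]

     # Importance-sampling weights correct the bias of non-uniform sampling.
     weights = (len(self.memory) * probs[indices]) ** (-beta)
     weights /= weights.max()
     return transitions, indices, weights.astype(np.float32)

 def update_priorities(self, indices, priorities):
     # Overwrite the priorities of the replayed transitions with their
     # freshly computed TD errors.
     for idx, prio in zip(indices, priorities):
         self.priorities[idx] = prio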
Example #3
def optimize_model(memory, batch_size, criterion=nn.MSELoss(), gamma=0.999):
    if len(memory) < batch_size:
        return
    transitions = memory.sample(batch_size)
    batch = utils.Transition(*zip(*transitions))

    next_state_batch = torch.stack(batch.next_state).to(device)
    state_batch = torch.stack(batch.state).to(device)
    action_batch = torch.stack(batch.action).to(device)
    reward_batch = torch.stack(batch.reward).to(device)
    done_batch = torch.stack(batch.done).to(device)

    # Critic update: regress Q(s_t, a_t) towards the bootstrapped target
    # r_t + gamma * Q'(s_{t+1}, mu'(s_{t+1})) computed with the target networks.
    state_action_values = critic([state_batch, action_batch])
    next_state_action_values = target_critic(
        [next_state_batch, target_actor(next_state_batch)]).detach()
    expected_state_action_values = (next_state_action_values * gamma *
                                    (1.0 - done_batch)) + reward_batch
    critic_loss = criterion(state_action_values, expected_state_action_values)
    critic_optimizer.zero_grad()
    critic_loss.backward()
    critic_optimizer.step()

    # Actor update: deterministic policy gradient, i.e. maximize the
    # critic's estimate of the value of the actor's own actions.
    actor_loss = -critic([state_batch, actor(state_batch)]).mean()
    actor_optimizer.zero_grad()
    actor_loss.backward()
    actor_optimizer.step()

    # Polyak-average the target networks towards the online networks.
    soft_update(target_actor, actor)
    soft_update(target_critic, critic)
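soft_update is not defined in this snippet; in DDPG it is normally Polyak averaging of the target network parameters. A minimal sketch, with the tau value being an assumption:

def soft_update(target, source, tau=0.001):
    # theta_target <- tau * theta_source + (1 - tau) * theta_target
    for target_param, param in zip(target.parameters(), source.parameters()):
        target_param.data.copy_(tau * param.data +
                                (1.0 - tau) * target_param.data)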
Example #4
def optimize_model(memory, batch_size, gamma=0.999):
    if len(memory) < batch_size:
        return
    transitions, indices, weights = memory.sample(batch_size)
    batch = utils.Transition(*zip(*transitions))

    next_state_batch = torch.stack(batch.next_state).to(device)
    state_batch = torch.stack(batch.state).to(device)
    action_batch = torch.stack(batch.action).to(device)
    reward_batch = torch.stack(batch.reward).to(device)
    done_batch = torch.stack(batch.done).to(device)
    weights_batch = torch.tensor(weights, dtype=torch.float32).to(device)

    # Compute Q(s_t, a) - the model computes Q(s_t), then we select the
    # columns of actions taken
    state_action_values = policy_net(state_batch).gather(1, action_batch)

    # Compute V(s_{t+1}) for all next states.
    next_state_values = target_net(next_state_batch).max(1)[0].unsqueeze(
        1).detach()
    # Compute the expected Q values
    expected_state_action_values = (next_state_values * gamma *
                                    (1.0 - done_batch)) + reward_batch

    # Compute the element-wise Huber loss so each sample's error can be
    # reweighted by its importance-sampling weight
    delta = F.smooth_l1_loss(state_action_values,
                             expected_state_action_values,
                             reduction='none')
    prios = delta.abs() + 1e-5
    loss = (delta * weights_batch.unsqueeze(1)).mean()

    # Optimize the model
    optimizer.zero_grad()
    loss.backward()
    for param in policy_net.parameters():
        param.grad.data.clamp_(-1, 1)
    memory.update_priorities(indices, prios.data.cpu().numpy())
    optimizer.step()
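Both DQN variants (Examples #1 and #4) assume that target_net is synchronized with policy_net somewhere else in the training loop; a minimal hard-update sketch (the helper name and where it is called are assumptions):

def sync_target(policy_net, target_net):
    # Hard update: copy the online network's weights into the target network.
    target_net.load_state_dict(policy_net.state_dict())

Calling this every fixed number of episodes or optimization steps keeps the bootstrap targets stable between updates.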
Example #5
 def push(self, *args):
     """Saves a transition."""
     if len(self.memory) < self.capacity:
         self.memory.append(None)
     self.memory[self.position] = utils.Transition(*args)
     # Advance the write pointer, wrapping around like a ring buffer.
     self.position = (self.position + 1) % self.capacity
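For completeness, this is one way a training loop might feed transitions into push() and trigger optimize_model(); the Gym-style environment, the epsilon schedule, and the tensor shapes are all assumptions rather than part of the examples above.

state = torch.tensor(env.reset(), dtype=torch.float32)
for t in range(max_steps):
    # Epsilon-greedy action from the online network.
    if random.random() < epsilon:
        action = torch.tensor([random.randrange(n_actions)])
    else:
        with torch.no_grad():
            action = policy_net(state.unsqueeze(0)).argmax(dim=1)
    # Classic 4-tuple Gym step API assumed here.
    obs, reward, done, _ = env.step(action.item())
    next_state = torch.tensor(obs, dtype=torch.float32)
    memory.push(state, action, next_state,
                torch.tensor([reward], dtype=torch.float32),
                torch.tensor([float(done)]))
    state = next_state
    optimize_model(memory, batch_size)
    if done:
        break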