import torch
import torch.nn as nn
import torch.nn.functional as F

import utils


def optimize_model(memory, batch_size, gamma=0.999):
    if len(memory) < batch_size:
        return
    transitions = memory.sample(batch_size)
    batch = utils.Transition(*zip(*transitions))

    next_state_batch = torch.stack(batch.next_state).to(device)
    state_batch = torch.stack(batch.state).to(device)
    action_batch = torch.stack(batch.action).to(device)
    reward_batch = torch.stack(batch.reward).to(device)
    done_batch = torch.stack(batch.done).to(device)

    # Compute Q(s_t, a) - the model computes Q(s_t), then we select the
    # columns of actions taken
    state_action_values = policy_net(state_batch).gather(1, action_batch)

    # Compute V(s_{t+1}) for all next states. Double DQN: the policy net
    # picks the greedy action, the target net evaluates it.
    next_action = policy_net(next_state_batch).argmax(dim=1).unsqueeze(1)
    next_state_values = target_net(next_state_batch).gather(
        1, next_action).detach()

    # Compute the expected Q values; terminal states contribute only the reward.
    expected_state_action_values = (next_state_values * gamma *
                                    (1.0 - done_batch)) + reward_batch

    # Compute Huber loss
    loss = F.smooth_l1_loss(state_action_values, expected_state_action_values)

    # Optimize the model
    optimizer.zero_grad()
    loss.backward()
    for param in policy_net.parameters():
        param.grad.data.clamp_(-1, 1)
    optimizer.step()
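# The listings above and below lean on context defined elsewhere. A minimal
# sketch of what they appear to assume, hedged: utils.Transition is a
# namedtuple whose fields hold tensors (so torch.stack works field by field),
# and device, policy_net, target_net and optimizer are module-level globals.
# The exact field order below is an assumption, not taken from the original.
from collections import namedtuple

import torch

# Presumably defined in utils.py:
Transition = namedtuple('Transition',
                        ('state', 'action', 'reward', 'next_state', 'done'))

# One of the globals the optimization steps rely on:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')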
def push(self, *args):
    """Saves a transition, assigning it the current maximum priority."""
    max_prio = self.priorities.max() if self.memory else 1.0
    if len(self.memory) < self.capacity:
        self.memory.append(None)
    self.memory[self.position] = utils.Transition(*args)
    self.priorities[self.position] = max_prio
    self.position = (self.position + 1) % self.capacity
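# The push above stores priorities, but the matching sampling side is not
# shown. A minimal sketch of what the prioritized update further below expects
# from memory.sample and memory.update_priorities, assuming self.priorities is
# a numpy array of length capacity and self.prob_alpha is the usual PER alpha;
# names and hyperparameters here are illustrative, not the original code.
import numpy as np


def sample(self, batch_size, beta=0.4):
    if len(self.memory) == self.capacity:
        prios = self.priorities
    else:
        prios = self.priorities[:self.position]

    # Sample indices with probability proportional to priority**alpha.
    probs = prios ** self.prob_alpha
    probs = probs / probs.sum()
    indices = np.random.choice(len(self.memory), batch_size, p=probs)
    transitions = [self.memory[idx] for idx in indices]

    # Importance-sampling weights correct the bias of non-uniform sampling
    # and are normalized by the largest weight for stability.
    total = len(self.memory)
    weights = (total * probs[indices]) ** (-beta)
    weights = weights / weights.max()
    return transitions, indices, weights.astype(np.float32)


def update_priorities(self, indices, priorities):
    # New priorities are the absolute TD errors computed in optimize_model.
    for idx, prio in zip(indices, priorities):
        self.priorities[idx] = prio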
def optimize_model(memory, batch_size, criterion=nn.MSELoss(), gamma=0.999):
    if len(memory) < batch_size:
        return
    transitions = memory.sample(batch_size)
    batch = utils.Transition(*zip(*transitions))

    next_state_batch = torch.stack(batch.next_state).to(device)
    state_batch = torch.stack(batch.state).to(device)
    action_batch = torch.stack(batch.action).to(device)
    reward_batch = torch.stack(batch.reward).to(device)
    done_batch = torch.stack(batch.done).to(device)

    # Critic update: regress Q(s_t, a_t) onto the bootstrapped target
    # r + gamma * Q'(s_{t+1}, mu'(s_{t+1})) built from the target networks.
    state_action_values = critic([state_batch, action_batch])
    next_state_action_values = target_critic(
        [next_state_batch, target_actor(next_state_batch)]).detach()
    expected_state_action_values = (next_state_action_values * gamma *
                                    (1.0 - done_batch)) + reward_batch
    critic_loss = criterion(state_action_values, expected_state_action_values)

    critic_optimizer.zero_grad()
    critic_loss.backward()
    critic_optimizer.step()

    # Actor update: follow the deterministic policy gradient by maximizing
    # the critic's value of the actor's own actions (hence the negative mean).
    actor_loss = -critic([state_batch, actor(state_batch)]).mean()

    actor_optimizer.zero_grad()
    actor_loss.backward()
    actor_optimizer.step()

    # Slowly track the online networks with the target networks.
    soft_update(target_actor, actor)
    soft_update(target_critic, critic)
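# soft_update is called above but not defined in this listing. A sketch of
# the Polyak averaging it presumably performs; tau is a hypothetical
# hyperparameter (e.g. 1e-3), not taken from the original.
def soft_update(target, source, tau=1e-3):
    # target <- tau * source + (1 - tau) * target, parameter by parameter.
    for target_param, param in zip(target.parameters(), source.parameters()):
        target_param.data.copy_(tau * param.data +
                                (1.0 - tau) * target_param.data)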
def optimize_model(memory, batch_size, gamma=0.999):
    if len(memory) < batch_size:
        return
    transitions, indices, weights = memory.sample(batch_size)
    batch = utils.Transition(*zip(*transitions))

    next_state_batch = torch.stack(batch.next_state).to(device)
    state_batch = torch.stack(batch.state).to(device)
    action_batch = torch.stack(batch.action).to(device)
    reward_batch = torch.stack(batch.reward).to(device)
    done_batch = torch.stack(batch.done).to(device)
    weights_batch = torch.tensor(weights, dtype=torch.float32).to(device)

    # Compute Q(s_t, a) - the model computes Q(s_t), then we select the
    # columns of actions taken
    state_action_values = policy_net(state_batch).gather(1, action_batch)

    # Compute V(s_{t+1}) for all next states.
    next_state_values = target_net(next_state_batch).max(1)[0].unsqueeze(
        1).detach()

    # Compute the expected Q values
    expected_state_action_values = (next_state_values * gamma *
                                    (1.0 - done_batch)) + reward_batch

    # Compute the per-sample Huber loss (reduction='none' replaces the
    # deprecated reduce=False) so each TD error can serve as a new priority.
    delta = F.smooth_l1_loss(state_action_values,
                             expected_state_action_values,
                             reduction='none')
    prios = delta.abs() + 1e-5
    # Importance-sampling weights correct the bias of prioritized sampling.
    loss = (delta * weights_batch.unsqueeze(1)).mean()

    # Optimize the model
    optimizer.zero_grad()
    loss.backward()
    for param in policy_net.parameters():
        param.grad.data.clamp_(-1, 1)
    memory.update_priorities(indices, prios.data.cpu().numpy())
    optimizer.step()
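# Illustrative usage only, not part of the original: a bare-bones episode
# loop wiring a replay buffer into the prioritized update above. env (an
# older-style gym API whose step returns 4 values), select_action (e.g.
# epsilon-greedy over policy_net, returning a LongTensor of shape (1,)) and
# the reward/done tensor shapes are all assumptions.
def train(env, memory, num_episodes=500, batch_size=128):
    for _ in range(num_episodes):
        state = torch.as_tensor(env.reset(), dtype=torch.float32)
        done = False
        while not done:
            action = select_action(state)
            obs, reward, done, _ = env.step(action.item())
            next_state = torch.as_tensor(obs, dtype=torch.float32)
            # Field order matches the Transition sketch above (an assumption).
            memory.push(state, action,
                        torch.tensor([reward], dtype=torch.float32),
                        next_state,
                        torch.tensor([float(done)]))
            state = next_state
            optimize_model(memory, batch_size)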
def push(self, *args):
    """Saves a transition."""
    if len(self.memory) < self.capacity:
        self.memory.append(None)
    self.memory[self.position] = utils.Transition(*args)
    self.position = (self.position + 1) % self.capacity
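# For context, a minimal sketch of the uniform replay buffer this push
# method belongs to; the class name, __init__ and sample shown here are
# assumptions that merely match how memory.sample(batch_size) and
# len(memory) are used in the optimization functions above.
import random


class ReplayMemory:
    def __init__(self, capacity):
        self.capacity = capacity
        self.memory = []
        self.position = 0

    def sample(self, batch_size):
        # Uniform sampling without replacement.
        return random.sample(self.memory, batch_size)

    def __len__(self):
        return len(self.memory)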