def update_from_imitation(self, experiences, take_grad_step, vmax):
    """Updates the Q-values with a reward-to-go regression loss plus a margin loss.

    regression_loss = ||G_t - Q(s_t, a_t)||_2
        G_t = return of the episode from timestep t onward
    margin_loss = max_a (Q(s, a) + l(a_e, a)) - Q(s, a_e)
        a_e is the "expert" action from the experience
        l(a_e, a) = 0 if a_e = a, K otherwise (following DQfD)

    Args:
        experiences (list[Experience]): batch of experiences; state and
            next_state may be LazyFrames or np.arrays
        take_grad_step (Callable(loss)): takes the loss and updates parameters
        vmax (float): optimal value of the best state
    """
    states = [e.state for e in experiences]
    actions = GPUVariable(
        torch.LongTensor(
            np.array([np.array(e.action) for e in experiences])))

    # Reward to go, assuming expert trajectories achieve the maximal
    # return vmax.
    rewards_to_go = GPUVariable(
        torch.FloatTensor(
            np.array([vmax - e.state.goal.cum_reward
                      for e in experiences])))

    q_values = self._Q(states)
    # (batch_size,) Q-values of the expert actions
    expert_q_values = q_values.gather(1, actions.unsqueeze(1)).squeeze(1)
    regression_loss = torch.mean((rewards_to_go - expert_q_values) ** 2)

    # Margin l(a_e, a): 0 at the expert action, K = 0.5 everywhere else
    margin = 0.5 * torch.ones_like(q_values).scatter(
        1, actions.unsqueeze(1), 0.)
    max_margin_q_values, _ = torch.max(q_values + margin, dim=1)
    margin_loss = torch.mean(max_margin_q_values - expert_q_values)

    grad_norm = take_grad_step(regression_loss + margin_loss)[1]

    if self._debug_stats:
        self._margin_losses.append(margin_loss)
        self._regression_losses.append(regression_loss)
        self._imitation_grad_norms.append(grad_norm)
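
# Worked example (illustration only, not part of the class): with a single
# state, Q-values [[1.0, 3.0]], expert action 0, and margin K = 0.5 (toy
# numbers assumed here), the margin term above evaluates as
#     max_a (Q(s, a) + l(a_e, a)) = max(1.0 + 0.0, 3.0 + 0.5) = 3.5
#     margin_loss = 3.5 - Q(s, a_e) = 3.5 - 1.0 = 2.5
# The loss reaches zero only once Q(s, a_e) exceeds every other action's
# Q-value by at least K, as in DQfD.
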
def update_from_experiences(self, experiences, weights, take_grad_step,
                            vmax=None, vmin=None):
    r"""Updates parameters from a batch of experiences.

    Minimizes the loss:
        (target - Q(s, a))^2
        target = r                              if done
                 r + \gamma * max_a' Q(s', a')  otherwise
    The target is clamped to [vmin, vmax - G_t] if vmin / vmax are provided.

    Args:
        experiences (list[Experience]): batch of experiences; state and
            next_state may be LazyFrames or np.arrays
        weights (list[float]): importance weights on each experience
        take_grad_step (Callable(loss)): takes the loss and updates parameters
        vmax (float | None): if None, no upper clamping
        vmin (float | None): if None, no lower clamping

    Returns:
        td_error (GPUVariable[FloatTensor]): (batch_size,), error per
            experience
    """
    batch_size = len(experiences)
    states = [e.state for e in experiences]
    actions = GPUVariable(
        torch.LongTensor(
            np.array([np.array(e.action) for e in experiences])))
    next_states = [e.next_state for e in experiences]
    rewards = GPUVariable(
        torch.FloatTensor(np.array([e.reward for e in experiences])))
    # (batch_size,): 1 if the episode was not done, otherwise 0
    not_done_mask = GPUVariable(
        torch.FloatTensor(np.array([1 - e.done for e in experiences])))
    weights = GPUVariable(torch.FloatTensor(np.array(weights)))

    current_state_q_values = self._Q(states).gather(1, actions.unsqueeze(1))

    # Double DQN: the online network selects the next action, the target
    # network evaluates it.
    next_state_q_values = self._Q(next_states)
    best_q_values, best_actions = torch.max(next_state_q_values, 1)
    best_actions = best_actions.unsqueeze(1)
    target_q_values = self._target_Q(next_states).gather(
        1, best_actions).squeeze(1)
    targets = rewards + self._gamma * (target_q_values * not_done_mask)

    if vmax is not None:
        # Cap targets at the remaining achievable return:
        # targets <- targets - (targets - (vmax - G_t))_+
        max_reward_to_go = GPUVariable(
            torch.FloatTensor(
                np.array([vmax - e.state.goal.cum_reward
                          for e in experiences])))
        clip_amount = torch.clamp(targets - max_reward_to_go, min=0.)
        targets = targets - clip_amount
    if vmin is not None:
        targets = torch.clamp(targets, min=vmin)
    targets.detach_()  # Don't backprop through the targets

    td_error = current_state_q_values.squeeze(1) - targets
    loss = torch.mean((td_error ** 2) * weights)
    grad_norm = take_grad_step(loss)[1]

    if grad_norm > 100:
        logging.warning("Large grad norm: {}".format(grad_norm))
        logging.warning("TD errors: {}".format(td_error))
        logging.warning("Predicted Q-values: {}".format(
            current_state_q_values))
        logging.warning("Targets: {}".format(targets))

    if self._debug_stats:
        max_target = torch.max(targets)
        min_target = torch.min(targets)
        self._max_target.append(max_target)
        self._min_target.append(min_target)
        self._td_losses.append(loss)
        self._grad_norms.append(grad_norm)

    return td_error
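
# Minimal sketch (illustration only, toy numbers assumed) of the vmax clamp
# above: with vmax = 1.0 and a cumulative reward so far of 0.4, the remaining
# achievable return is max_reward_to_go = 0.6, so a bootstrapped target above
# it is pulled down while one below it is untouched:
#     >>> import torch
#     >>> targets = torch.tensor([0.9, 0.3])
#     >>> max_reward_to_go = torch.tensor([0.6, 0.6])
#     >>> targets - torch.clamp(targets - max_reward_to_go, min=0.)
#     tensor([0.6000, 0.3000])
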