Example #1
def stats(self):
    stats = {}
    for k, v in self._dqn.stats().items():
        stats["{}_{}".format(self.name, k)] = v
    stats["{}_avg_reward".format(self.name)] = mean_with_default(
        self._episode_rewards, 0.)
    stats["{}_success_rate".format(self.name)] = mean_with_default(
        self._success_rate, 0.)
    stats["{}_epsilon".format(self.name)] = self._epsilon
    return stats
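
All of the examples on this page use mean_with_default, whose definition is not shown. Judging from how it is called (averaging a possibly empty window of rewards or success flags, with a fallback such as 0. or None), a minimal sketch of the helper could look like the following; the actual implementation in the source repository may differ:

def mean_with_default(values, default):
    """Returns the mean of values, or default if values is empty."""
    values = list(values)
    if not values:
        return default
    return sum(values) / float(len(values))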
Example #2
  def stats(self):
    """See comments in constructor for more details about what these stats

        are
    """
    return {
        "epsilon":
            self._epsilon_schedule.step(take_step=False),
        "Max Q":
            mean_with_default(self._max_q, None),
        "Min Q":
            mean_with_default(self._min_q, None),
        "Avg grad norm":
            mean_with_default(self._grad_norms, None),
        "Max target":
            mean_with_default(self._max_target, None),
        "Min target":
            mean_with_default(self._min_target, None),
        "TD loss":
            mean_with_default(self._td_losses, None),
        "Imitation grad norm":
            mean_with_default(self._imitation_grad_norms, None),
        "Margin loss":
            mean_with_default(self._margin_losses, None),
        "Regression loss":
            mean_with_default(self._regression_losses, None)
    }
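
Note that this example passes None rather than 0. as the default: a stat with no samples yet comes back as None, so the caller can skip it instead of logging a misleading zero, exactly as the if v is not None check in Example #7 does before writing values to tb_logger.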
Example #3
def __str__(self):
    return ("{} --> {} (deg={}, evaluate={}/{} [{:.2f}],"
            " train={}/{} [{:.2f}], past={}/{} [{:.2f}], status={},"
            " train count={}, reward={}, total successes={},"
            " dead={}, life lost={}").format(
                self.start.uid, self.end.uid, self.degree,
                sum(self._eval_window), len(self._eval_window),
                mean_with_default(self._eval_window, 0.),
                sum(self._train_window), len(self._train_window),
                mean_with_default(self._train_window, 0.),
                sum(self._past_window), len(self._past_window),
                mean_with_default(self._past_window, 0.), self.state,
                self.train_count, self.reward, self.total_successes,
                self.dead, self.life_lost)
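
The _eval_window, _train_window, and _past_window attributes are not defined in this snippet; from the way they are summed, measured with len, and averaged with a 0. default, they are presumably fixed-size collections of 0/1 success flags, e.g. bounded deques. A small illustration under that assumption, reusing the mean_with_default sketch from above:

from collections import deque

# Hypothetical window holding the outcomes of the last 100 attempts (1 = success).
past_window = deque(maxlen=100)
past_window.extend([1, 0, 1, 1])

print(sum(past_window), len(past_window))  # 3 4
print(mean_with_default(past_window, 0.))  # 0.75 (0. while the window is still empty)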
Example #4
    def update(self):
        """Takes gradient steps by sampling from replay buffer."""
        def take_grad_step(loss):
            self._optimizer.zero_grad()
            loss.backward()

            # clip according to the max allowed grad norm
            grad_norm = clip_grad_norm(self._dqn.parameters(),
                                       self._max_grad_norm,
                                       norm_type=2)

            # take a step only if the grads are finite
            finite_grads = bool(np.isfinite(float(grad_norm)))
            if finite_grads:
                self._optimizer.step()
            return finite_grads, grad_norm

        if self._frozen:
            return

        # Adaptive success: w/ prob 1 - current success rate, take update
        success_rate = mean_with_default(self._success_rate, 0.)
        update = not self._adaptive_update or np.random.random() > success_rate
        if len(self._replay_buffer) >= self._min_buffer_size and update:
            for _ in range(self._grad_steps_per_update):
                self._updates += 1
                if self._updates % self._sync_freq == 0:
                    self._dqn.sync_target()
                experiences = self._replay_buffer.sample(self._batch_size)
                experiences = [self._reward_bonus(e) for e in experiences]
                td_error = self._dqn.update_from_experiences(
                    experiences,
                    np.ones(self._batch_size),
                    take_grad_step,
                    vmax=self._dqn_vmax,
                    vmin=self._dqn_vmin)
                max_td_error = torch.max(td_error)

                if (max_td_error > 4).any():
                    logging.warning("Large error: {} on skill: {}".format(
                        max_td_error, self))

        imitation_update = update and self._imitation_buffer is not None
        if imitation_update and len(self._imitation_buffer) > 0:
            imitation_experiences = self._imitation_buffer.sample(
                self._batch_size)
            self._dqn.update_from_imitation(imitation_experiences,
                                            take_grad_step,
                                            self._max_worker_reward)
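
The take_grad_step closure above follows a common PyTorch pattern: zero the gradients, backpropagate the loss, clip the global gradient norm, and only step the optimizer when that norm is finite. A standalone sketch of the same pattern, written against torch.nn.utils.clip_grad_norm_ (the current, non-deprecated spelling of the call used above):

import math

from torch.nn.utils import clip_grad_norm_


def take_grad_step(model, optimizer, loss, max_grad_norm):
    """Backprops loss, clips the gradient norm, and steps the optimizer
    only when the gradients are finite."""
    optimizer.zero_grad()
    loss.backward()
    grad_norm = clip_grad_norm_(model.parameters(), max_grad_norm, norm_type=2)
    finite_grads = math.isfinite(float(grad_norm))
    if finite_grads:
        optimizer.step()
    return finite_grads, grad_norm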
Example #5
    def act(self, state, epsilon=None, **kwargs):
        """Given the current state, returns an action.

    Supports all the
        keyword args as DQNPolicy.

        Args: state (State)

        Returns:
            action (int)
        """
        if self._epsilon_clipping and epsilon is not None:
            epsilon -= mean_with_default(self._success_rate, 0.)
            epsilon = max(epsilon, 0.)
        self._epsilon = epsilon or 0.
        return self._dqn.act(state, epsilon=epsilon, **kwargs)
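
With epsilon clipping enabled, exploration shrinks as the skill becomes reliable: for a requested epsilon of 0.3, a skill with a recent success rate of 0.1 still explores with epsilon 0.2, while one succeeding 80% of the time gets max(0.3 - 0.8, 0.) = 0. and acts greedily.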
Example #6
def success_rate(self):
    """Returns the success rate over the past N attempts."""
    return mean_with_default(self._past_window, 0.)
Example #7
    def train(self):
        rewards = deque(maxlen=100)
        take_grad_step = lambda loss: self._take_grad_step(
            self._train_state, loss, self._max_grad_norm)
        frames = 0  # number of training frames seen
        episodes = 0  # number of training episodes that have been played
        with tqdm(total=self._max_frames) as progress:
            # Each loop completes a single episode
            while frames < self._max_frames:
                state = self._env.reset()
                episode_reward = 0.
                episode_frames = 0
                # Each loop completes a single step, duplicates _evaluate() to
                # update at the appropriate frame #s
                for _ in range(self._max_episode_len):
                    frames += 1
                    episode_frames += 1
                    action = self._dqn.act(state)
                    next_state, reward, done, info = self._env.step(action)
                    episode_reward += reward
                    # NOTE: state and next_state are LazyFrames and must be
                    # converted to np.arrays
                    self._replay_buffer.add(
                        Experience(state, action, reward, next_state, done))
                    state = next_state

                    if len(self._replay_buffer) > self._buffer_size_start and \
                            frames % self._update_freq == 0:
                        experiences, weights, indices = \
                                self._replay_buffer.sample(self._batch_size)
                        td_error = self._dqn.update_from_experiences(
                            experiences, weights, take_grad_step)
                        new_priorities = \
                                np.abs(td_error.cpu().data.numpy()) + 1e-6
                        self._replay_buffer.update_priorities(
                            indices, new_priorities)

                    if frames % self._sync_target_freq == 0:
                        self._dqn.sync_target()

                    if done:
                        break

                episodes += 1
                rewards.append(episode_reward)
                stats = self._dqn.stats()
                stats["Episode Reward"] = episode_reward
                stats["Avg Episode Reward"] = mean_with_default(rewards, None)
                stats["Num Episodes"] = episodes
                progress.set_postfix(stats, refresh=False)
                progress.update(episode_frames)
                episode_frames = 0

                for k, v in stats.items():
                    if v is not None:
                        self.tb_logger.log_value(k, v, step=frames)

                if episodes % self._evaluate_freq == 0:
                    test_rewards = []
                    gif_images = []
                    for _ in tqdm(range(self._episodes_to_evaluate),
                                  desc="Evaluating"):
                        test_reward, images = self._evaluate()
                        gif_images += images
                        test_rewards.append(test_reward)
                    save_path = os.path.join(self.workspace.video,
                                             "{}.gif".format(episodes))
                    durations = [20] * len(gif_images)
                    durations[-1] = 1000
                    gif_images[0].save(save_path,
                                       append_images=gif_images[1:],
                                       save_all=True,
                                       duration=durations,
                                       loop=0)
                    avg_test_reward = \
                        sum(test_rewards) / float(len(test_rewards))
                    print("Evaluation Reward: {}".format(avg_test_reward))
                    self.tb_logger.log_value("Evaluation Reward",
                                             avg_test_reward,
                                             step=frames)
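
The evaluation GIF in this example is written with Pillow's multi-frame save, using a per-frame duration list so the final frame is held for a second. A self-contained sketch of that call, with placeholder grayscale frames standing in for the rendered environment images:

from PIL import Image

frames = [Image.new("RGB", (84, 84), color=(v, v, v)) for v in range(0, 250, 50)]
durations = [20] * len(frames)  # 20 ms per frame
durations[-1] = 1000            # hold the final frame for one second
frames[0].save("episode.gif",
               append_images=frames[1:],
               save_all=True,
               duration=durations,
               loop=0)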