Example #1
    def add_experience(self,
                       edge,
                       experience,
                       step,
                       cum_reward,
                       success=False):
        """Adds an experience for updating to the skill associated with the

        edge. Reward in the experience should be the env extrinsic reward.

        Args: edge (DirectedEdge) experience (Experience)
            step (int): step at the beginning of the experience
            success (bool): True if the experience was part of a successful
            trajectory
    """
        # Experience doesn't apply
        if step > self.max_steps(edge):
            return

        current_skill_state = self._skill_state(experience.state, edge, step,
                                                cum_reward)
        reward = self.reward(experience.next_state, edge, experience.reward,
                             experience.done)
        next_skill_state = self._skill_state(experience.next_state, edge,
                                             step + 1, cum_reward + reward)
        skill_done = experience.done or step + 1 >= self.max_steps(edge) \
            or cum_reward + reward >= self._max_worker_reward
        skill_experience = Experience(current_skill_state, experience.action,
                                      reward, next_skill_state, skill_done)
        skill, _ = self._skills.get_skill(edge)
        skill.add_experience(skill_experience,
                             success,
                             allow_updates=edge.training())
Example #2
    def __call__(self, experience):
        """Returns another experience with the bonus added to the reward.

        Args: experience (Experience)

        Returns:
            Experience
        """
        next_state = AS.AbstractState(experience.next_state)
        next_state_count = self._state_counts[next_state]
        assert next_state_count > 0
        reward_bonus = self._beta / np.sqrt(next_state_count)
        return Experience(experience.state, experience.action,
                          experience.reward + reward_bonus,
                          experience.next_state, experience.done)
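Example #2 implements a count-based exploration bonus: the environment reward is augmented by beta / sqrt(N(s')), where N(s') is the visit count of the abstracted next state, so transitions into rarely visited abstract states receive a larger intrinsic reward. A small illustration of how the bonus decays with the count (beta = 0.5 is an assumed value, not taken from the source):

    import numpy as np

    beta = 0.5
    for count in (1, 4, 25, 100):
        # bonus = beta / sqrt(N(s')): 0.5, 0.25, 0.1, 0.05
        print(count, beta / np.sqrt(count))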
Example #3
    def observe(self, state, action, reward, next_state, done):
        """Updates episode state based on observations from the environment.

        Args:
            state (State)
            action (Action)
            reward (float)
            next_state (State)
            done (bool)
        """
        if isinstance(action, Teleport) or isinstance(action, EndEpisode):
            return

        experience = Experience(state, action.action_num, reward, next_state,
                                done)
        self._episode.append(experience)

        if len(self._plan) > 0:
            assert not self._explorer.active()
            curr_edge = self._plan[0]
            self._edge_trajectories[curr_edge].append(
                (experience, len(self._worker_rewards),
                 sum(self._worker_rewards)))
            worker_reward = self._worker.reward(next_state, curr_edge, reward,
                                                done)
            self._worker_rewards.append(worker_reward)
            success = worker_reward == 1 and curr_edge.reliable()
            success = success or sum(
                self._worker_rewards) >= self._worker_reward_thresh
            failure = done or reward < 0 or \
                len(self._worker_rewards) >= self._worker.max_steps(curr_edge)

            if success:
                # Grab the first time you hit the goal state for teleport
                index = self._worker_rewards.index(1.)
                teleport_exp, _, _ = self._edge_trajectories[curr_edge][index]
                teleport = teleport_exp.next_state.teleport
                if not self._allow_setting_teleport:
                    teleport = None
                self._plan.pop(0)
                self._worker_rewards = []
                self._graph_updates.append(Traverse(curr_edge, True, teleport))
                self._edge_trajectories[curr_edge].set_success(True)
                self._allow_setting_teleport = True
            elif failure:
                if curr_edge.reliable():
                    logging.error("Failed reliable edge: {}".format(curr_edge))
                self._worker_rewards = []
                self._plan = []
                self._graph_updates.append(Traverse(curr_edge, False))
                self._edge_trajectories[curr_edge].set_success(False)
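The success and failure tests in observe decide whether the current plan edge has been traversed: success requires either hitting the goal (a worker reward of 1) on an already reliable edge or accumulating at least _worker_reward_thresh worker reward, while failure is triggered by episode termination, a negative environment reward, or exhausting the worker's step budget for the edge. A condensed sketch of that decision as a pure function (the names mirror the attributes used above; this refactoring is illustrative, not part of the original class):

    def edge_outcome(worker_rewards, curr_edge, env_reward, done,
                     reward_thresh, max_steps):
        success = (worker_rewards[-1] == 1 and curr_edge.reliable()) \
            or sum(worker_rewards) >= reward_thresh
        failure = done or env_reward < 0 or len(worker_rewards) >= max_steps
        return success, failure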
Example #4
    def train(self):
        rewards = deque(maxlen=100)
        take_grad_step = lambda loss: self._take_grad_step(
            self._train_state, loss, self._max_grad_norm)
        frames = 0  # number of training frames seen
        episodes = 0  # number of training episodes that have been played
        with tqdm(total=self._max_frames) as progress:
            # Each loop completes a single episode
            while frames < self._max_frames:
                state = self._env.reset()
                episode_reward = 0.
                episode_frames = 0
                # Each loop completes a single step, duplicates _evaluate() to
                # update at the appropriate frame #s
                for _ in range(self._max_episode_len):
                    frames += 1
                    episode_frames += 1
                    action = self._dqn.act(state)
                    next_state, reward, done, info = self._env.step(action)
                    episode_reward += reward
                    # NOTE: state and next_state are LazyFrames and must be
                    # converted to np.arrays
                    self._replay_buffer.add(
                        Experience(state, action, reward, next_state, done))
                    state = next_state

                    if len(self._replay_buffer) > self._buffer_size_start and \
                            frames % self._update_freq == 0:
                        experiences, weights, indices = \
                                self._replay_buffer.sample(self._batch_size)
                        td_error = self._dqn.update_from_experiences(
                            experiences, weights, take_grad_step)
                        new_priorities = \
                                np.abs(td_error.cpu().data.numpy()) + 1e-6
                        self._replay_buffer.update_priorities(
                            indices, new_priorities)

                    if frames % self._sync_target_freq == 0:
                        self._dqn.sync_target()

                    if done:
                        break

                episodes += 1
                rewards.append(episode_reward)
                stats = self._dqn.stats()
                stats["Episode Reward"] = episode_reward
                stats["Avg Episode Reward"] = mean_with_default(rewards, None)
                stats["Num Episodes"] = episodes
                progress.set_postfix(stats, refresh=False)
                progress.update(episode_frames)
                episode_frames = 0

                for k, v in stats.items():
                    if v is not None:
                        self.tb_logger.log_value(k, v, step=frames)

                if episodes % self._evaluate_freq == 0:
                    test_rewards = []
                    gif_images = []
                    for _ in tqdm(range(self._episodes_to_evaluate),
                                  desc="Evaluating"):
                        test_reward, images = self._evaluate()
                        gif_images += images
                        test_rewards.append(test_reward)
                    save_path = os.path.join(self.workspace.video,
                                             "{}.gif".format(episodes))
                    durations = [20] * len(gif_images)
                    durations[-1] = 1000
                    gif_images[0].save(save_path,
                                       append_images=gif_images[1:],
                                       save_all=True,
                                       duration=durations,
                                       loop=0)
                    avg_test_reward = \
                        sum(test_rewards) / float(len(test_rewards))
                    print("Evaluation Reward: {}".format(avg_test_reward))
                    self.tb_logger.log_value("Evaluation Reward",
                                             avg_test_reward,
                                             step=frames)
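Example #4 is a standard DQN training loop with prioritized experience replay: once the buffer is warm, every _update_freq frames a batch is sampled with importance weights, a gradient step is taken on the TD error, and the sampled transitions' priorities are reset to |TD error| + 1e-6 (the epsilon keeps every transition's sampling probability above zero); the target network is synced every _sync_target_freq frames. A sketch of just the replay update, isolated from the loop (the method names follow the calls above; the surrounding objects are assumed to exist):

    import numpy as np

    def prioritized_update(replay_buffer, dqn, batch_size, take_grad_step):
        experiences, weights, indices = replay_buffer.sample(batch_size)
        td_error = dqn.update_from_experiences(experiences, weights,
                                               take_grad_step)
        # New priority = |TD error| + small epsilon so no transition is
        # starved of sampling probability.
        new_priorities = np.abs(td_error.cpu().data.numpy()) + 1e-6
        replay_buffer.update_priorities(indices, new_priorities)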