Example #1
    def transitions(self, states, actions, rewards, next_states, dones):
        stats = Statistics()
        assert not self.eval  # transitions are only recorded while training
        # Push every transition of the incoming batch into the replay buffer.
        for idx in range(len(states)):
            self._buffer.push(state=states[idx],
                              action=actions[idx],
                              reward=rewards[idx],
                              next_state=next_states[idx],
                              done=dones[idx])
        stats.set("replay_buffer_size", len(self._buffer))
        # Optimize only once the buffer holds enough transitions to sample from.
        if len(self._buffer) >= self._min_replay_buffer_size:
            t0 = time.time()  # time spent on optimization
            stats.set_all(self._optimize())
            stats.set("optimization_time", time.time() - t0)
        return stats
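
All of these examples funnel their metrics through a Statistics accumulator whose implementation is not shown. The sketch below is a guess at its interface, inferred only from how it is called in these snippets (a constructor that optionally takes a summary writer and iteration, plus set, set_all, get, sum, and avg); it is illustrative, not the project's actual class.

from collections import defaultdict


class Statistics:
    """Minimal, hypothetical sketch of the Statistics accumulator used in
    these examples; the real class presumably also logs to the summary
    writer passed in Example #2."""

    def __init__(self, summary_writer=None, iteration=None):
        self._data = defaultdict(list)          # key -> recorded values
        self._summary_writer = summary_writer   # optional logging sink
        self._iteration = iteration

    def set(self, key, values):
        # Record one value, or a sequence of values, under `key`.
        if isinstance(values, (list, tuple)):
            self._data[key].extend(values)
        else:
            self._data[key].append(values)

    def set_all(self, other):
        # Merge another Statistics instance (or a plain dict) into this one.
        items = other._data.items() if isinstance(other, Statistics) else other.items()
        for key, values in items:
            self.set(key, values)

    def get(self, keys):
        # Return recorded values: a list for a single key, a dict for many.
        if isinstance(keys, str):
            return list(self._data[keys])
        return {key: list(self._data[key]) for key in keys}

    def sum(self, key):
        return sum(self._data[key])

    def avg(self, key):
        values = self._data[key]
        return sum(values) / len(values) if values else 0.0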
Example #2
    def _run_one_iteration(self):
        stats = Statistics(self._summary_writer, self._iteration)

        # Training phase: run the agent with learning enabled.
        phase_stats, agent_stats = self._run_one_phase(is_training=True)
        stats.set("training_episodes", phase_stats.sum("episodes"))
        stats.set("training_steps", phase_stats.sum("steps"))
        stats.set_all(phase_stats.get(["agent_time", "step_time", "env_time"]))
        stats.set_all(agent_stats)

        # Optional evaluation phase; if it runs, the episode metrics below
        # are taken from the evaluation phase rather than the training phase.
        if self._evaluation_steps != 0:
            phase_stats, _ = self._run_one_phase(is_training=False)
            stats.set("eval_episodes", phase_stats.sum("episodes"))
        stats.set("episode_reward", phase_stats.get("rewards"))
        stats.set("episode_steps", phase_stats.get("steps"))

        return stats
Example #3
    def transitions(self, states, actions, rewards, next_states, term):
        states = self._state_preprocessor(states)
        next_states = self._state_preprocessor(next_states)
        batch_size = self._n_agents * self._n_envs
        actions_shape = (batch_size,) + self._action_space.shape
        assert actions.shape == actions_shape, actions.shape
        stats = Statistics()
        n_agents = self._n_agents
        # The batch interleaves agents, so element i belongs to agent
        # i % n_agents; the stride idx::n_agents routes each agent its
        # own transitions.
        for idx, agent in enumerate(self._agents):
            s = agent.transitions(
                states[idx::n_agents],
                actions[idx::n_agents],
                rewards[idx::n_agents],
                next_states[idx::n_agents],
                term[idx::n_agents],
            )
            stats.set_all(s)
        return stats
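
The idx::n_agents striding in Example #3 assumes the batch is laid out with agents interleaved, i.e. element i belongs to agent i % n_agents. A tiny stand-alone illustration with made-up values:

import numpy as np

n_agents, n_envs = 2, 3
batch = np.arange(n_agents * n_envs)   # stand-in for a (batch_size, ...) array
print(batch[0::n_agents])              # agent 0 gets elements [0 2 4]
print(batch[1::n_agents])              # agent 1 gets elements [1 3 5]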
Example #4
import os

# create_env, sample_action_fn, play_episode and Statistics come from the
# surrounding project and are not shown here.


def main(checkpoint, debug=False):
    # The checkpoint filename encodes the environment as its first
    # dash-separated field.
    filename = os.path.basename(checkpoint)
    s = filename.split('-')

    # Create environment: derive the environment ID from the filename prefix.
    file_prefix = s[0]
    openai_env_ids = {
        "pole": "CartPole-v1",
        "lunarcont": "LunarLanderContinuous-v2",
        "lunar": "LunarLander-v2",
        "carcont": "MountainCarContinuous-v0",
        "pendulum": "Pendulum-v0",
    }
    if file_prefix in openai_env_ids:
        env_id = openai_env_ids[file_prefix]
    else:
        env_id = file_prefix

    s = s[1:]

    env = create_env(env_id)

    # Create agent: build an action-sampling function from the checkpoint.
    sample_action = sample_action_fn(checkpoint, env.action_space)

    stats = Statistics()

    try:
        # Play episodes until interrupted, reporting per-episode and
        # running-average statistics.
        while True:
            episode_stats = play_episode(env, sample_action, debug=debug)
            stats.set_all(episode_stats)
            print(
                ("Episode #{}: {:.2f}; Average Reward: {:.2f}; " +
                 "Episode length: {}; Average episode length: {:.1f}").format(
                     stats.sum("episodes"), episode_stats.avg("rewards"),
                     stats.avg("rewards"), int(episode_stats.avg("steps")),
                     stats.avg("steps")))
    except KeyboardInterrupt:
        env.close()
        return
    env.close()
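
A hedged usage sketch for Example #4: the checkpoint path below is invented, but its "pole" prefix would resolve to "CartPole-v1" via the lookup table above, and any unknown prefix is passed through as the environment ID itself.

# Hypothetical invocation; only the prefix-to-environment mapping is real.
main("checkpoints/pole-example.ckpt", debug=True)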
Example #5
    def _run_one_phase(self, is_training):
        stats = Statistics()
        agent_stats = Statistics()

        # Put the agent in the right mode and scale the step budget by the
        # number of parallel agents in the environment.
        self._agent.eval = not is_training
        min_steps = (self._training_steps if is_training else
                     self._evaluation_steps) * self._env.n_agents

        self._env.reset()
        while stats.sum("steps") < min_steps:
            step_time0 = time.time()

            states = np.copy(self._env.states)
            actions = self._agent.step(states)

            rewards, next_states, dones, env_stats = \
                self._env.step(actions)
            stats.set_all(env_stats)

            # Optionally record the raw trajectory.
            if self._traj_buffer is not None:
                self._traj_buffer.push(states, actions, rewards, next_states,
                                       dones)

            # Only feed transitions to the agent (and time it) while training.
            if is_training:
                t0 = time.time()
                agent_stats.set_all(
                    self._agent.transitions(states, actions, rewards,
                                            next_states, dones))
                stats.set("agent_time", time.time() - t0)
                stats.set("step_time", time.time() - step_time0)

            # Progress line, rewritten in place on every step.
            sys.stdout.write(
                "Iteration {} ({}). ".format(
                    self._iteration, "train" if is_training else "eval") +
                "Steps executed: {} ".format(stats.sum("steps")) +
                "Episode length: {} ".format(int(stats.avg("steps"))) +
                "Return: {:.4f}      \r".format(stats.avg("rewards")))
            sys.stdout.flush()
        print()
        self._agent.episodes_end()
        return stats, agent_stats