def transitions(self, states, actions, rewards, next_states, dones):
    stats = Statistics()
    assert not self.eval
    for idx in range(len(states)):
        self._buffer.push(state=states[idx],
                          action=actions[idx],
                          reward=rewards[idx],
                          next_state=next_states[idx],
                          done=dones[idx])
    stats.set("replay_buffer_size", len(self._buffer))
    if len(self._buffer) >= self._min_replay_buffer_size:
        t0 = time.time()  # time spent for optimization
        stats.set_all(self._optimize())
        stats.set("optimization_time", time.time() - t0)
    return stats
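# The replay buffer itself is not shown in this excerpt. The class below is a
# minimal, hypothetical sketch of the interface that `transitions` relies on
# (a `push` taking keyword transition fields and a `__len__`); it is not the
# project's actual implementation, which may use numpy ring buffers,
# prioritization, etc.
import collections
import random

Transition = collections.namedtuple(
    "Transition", ["state", "action", "reward", "next_state", "done"])


class SimpleReplayBuffer:
    def __init__(self, capacity):
        # Bounded FIFO storage: the oldest transition is evicted when full.
        self._storage = collections.deque(maxlen=capacity)

    def push(self, state, action, reward, next_state, done):
        self._storage.append(
            Transition(state, action, reward, next_state, done))

    def sample(self, batch_size):
        # Uniformly sample a batch of transitions for one optimization step.
        return random.sample(list(self._storage), batch_size)

    def __len__(self):
        return len(self._storage)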
def _run_one_iteration(self):
    stats = Statistics(self._summary_writer, self._iteration)
    phase_stats, agent_stats = self._run_one_phase(is_training=True)
    stats.set("training_episodes", phase_stats.sum("episodes"))
    stats.set("training_steps", phase_stats.sum("steps"))
    stats.set_all(phase_stats.get(["agent_time", "step_time", "env_time"]))
    stats.set_all(agent_stats)
    if self._evaluation_steps != 0:
        phase_stats, _ = self._run_one_phase(is_training=False)
        stats.set("eval_episodes", phase_stats.sum("episodes"))
        stats.set("episode_reward", phase_stats.get("rewards"))
        stats.set("episode_steps", phase_stats.get("steps"))
    return stats
def transitions(self, states, actions, rewards, next_states, term):
    states = self._state_preprocessor(states)
    next_states = self._state_preprocessor(next_states)

    batch_size = self._n_agents * self._n_envs
    actions_shape = (batch_size, ) + self._action_space.shape
    assert actions.shape == actions_shape, actions.shape

    stats = Statistics()
    n_agents = self._n_agents
    for idx, agent in enumerate(self._agents):
        s = agent.transitions(
            states[idx::n_agents],
            actions[idx::n_agents],
            rewards[idx::n_agents],
            next_states[idx::n_agents],
            term[idx::n_agents],
        )
        stats.set_all(s)
    return stats
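# A small standalone illustration (not part of the original code) of the
# strided slicing above, assuming the batch interleaves agents so that rows
# idx, idx + n_agents, idx + 2 * n_agents, ... all belong to agent `idx`.
# The values are made up purely to show which rows each agent receives.
import numpy as np

n_agents, n_envs = 2, 3
batch = np.arange(n_agents * n_envs)   # rows 0..5 of an interleaved batch
agent0_rows = batch[0::n_agents]       # -> array([0, 2, 4])
agent1_rows = batch[1::n_agents]       # -> array([1, 3, 5])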
def main(checkpoint, debug=False):
    filename = os.path.basename(checkpoint)
    s = filename.split('-')

    # Create environment.
    # Derive the environment ID from the checkpoint filename prefix.
    file_prefix = s[0]
    openai_env_ids = {
        "pole": "CartPole-v1",
        "lunarcont": "LunarLanderContinuous-v2",
        "lunar": "LunarLander-v2",
        "carcont": "MountainCarContinuous-v0",
        "pendulum": "Pendulum-v0",
    }
    if file_prefix in openai_env_ids:
        env_id = openai_env_ids[file_prefix]
    else:
        env_id = file_prefix
    s = s[1:]
    env = create_env(env_id)

    # Create agent
    sample_action = sample_action_fn(checkpoint, env.action_space)

    stats = Statistics()
    try:
        while True:
            episode_stats = play_episode(env, sample_action, debug=debug)
            stats.set_all(episode_stats)
            print(
                ("Episode #{}: {:.2f}; Average Reward: {:.2f}; " +
                 "Episode length: {}; Average episode length: {:.1f}").format(
                     stats.sum("episodes"),
                     episode_stats.avg("rewards"),
                     stats.avg("rewards"),
                     int(episode_stats.avg("steps")),
                     stats.avg("steps")))
    except KeyboardInterrupt:
        env.close()
        return
    env.close()
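# A minimal, hypothetical command-line entry point for `main`; the original
# project may wire this up differently, and the argument names below are
# assumptions rather than the project's actual CLI.
if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser(
        description="Play episodes using a saved checkpoint.")
    parser.add_argument("checkpoint", help="Path to the checkpoint file.")
    parser.add_argument("--debug", action="store_true",
                        help="Enable per-step debug output.")
    args = parser.parse_args()
    main(args.checkpoint, debug=args.debug)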
def _run_one_phase(self, is_training):
    stats = Statistics()
    agent_stats = Statistics()
    self._agent.eval = not is_training
    min_steps = (self._training_steps if is_training
                 else self._evaluation_steps) * self._env.n_agents
    self._env.reset()
    while stats.sum("steps") < min_steps:
        step_time0 = time.time()
        states = np.copy(self._env.states)
        actions = self._agent.step(states)
        rewards, next_states, dones, env_stats = \
            self._env.step(actions)
        stats.set_all(env_stats)
        if self._traj_buffer is not None:
            self._traj_buffer.push(states, actions, rewards, next_states,
                                   dones)
        if is_training:
            t0 = time.time()
            agent_stats.set_all(
                self._agent.transitions(states, actions, rewards,
                                        next_states, dones))
            stats.set("agent_time", time.time() - t0)
        stats.set("step_time", time.time() - step_time0)
        sys.stdout.write(
            "Iteration {} ({}). ".format(
                self._iteration, "train" if is_training else "eval") +
            "Steps executed: {} ".format(stats.sum("steps")) +
            "Episode length: {} ".format(int(stats.avg("steps"))) +
            "Return: {:.4f} \r".format(stats.avg("rewards")))
        sys.stdout.flush()
    print()
    self._agent.episodes_end()
    return stats, agent_stats