def after_all_episodes(self, episode, rewards):
    gamma = self.agent.config.get("gamma", 1)
    self.writer.add_scalar('episode/length', len(rewards), episode)
    self.writer.add_scalar('episode/total_reward', sum(rewards), episode)
    self.writer.add_scalar('episode/return',
                           sum(r * gamma**t for t, r in enumerate(rewards)),
                           episode)
    self.writer.add_histogram('episode/rewards', rewards, episode)
    logger.info("Episode {} score: {:.1f}".format(episode, sum(rewards)))
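# A minimal sketch (not part of the original class) illustrating the
# 'episode/return' quantity logged above: the discounted return is
# sum_t gamma^t * r_t over the episode. `gamma` and `rewards` here are
# hypothetical example values.
def _discounted_return_example():
    gamma = 0.9
    rewards = [1.0, 0.0, 2.0]
    # Same expression as in after_all_episodes()
    ret = sum(r * gamma**t for t, r in enumerate(rewards))
    assert abs(ret - (1.0 + 0.0 + 2.0 * 0.9**2)) < 1e-9
    return ret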
def load_agent_model(self, model_path):
    if model_path is True:
        model_path = self.directory / self.SAVED_MODELS_FOLDER / "latest.tar"
    try:
        self.agent.load(filename=model_path)
        logger.info("Loaded {} model from {}".format(
            self.agent.__class__.__name__, model_path))
    except FileNotFoundError:
        logger.warning("No pre-trained model found at the desired location.")
    except NotImplementedError:
        pass
def save_agent_model(self, identifier, do_save=True):
    # Create the folder if it doesn't exist
    permanent_folder = self.directory / self.SAVED_MODELS_FOLDER
    os.makedirs(permanent_folder, exist_ok=True)

    # Return the checkpoint path so callers (e.g. run_batched_episodes) can
    # forward it to workers; None means nothing was saved.
    episode_path = None
    if do_save:
        episode_path = Path(self.monitor.directory) / "checkpoint-{}.tar".format(identifier)
        try:
            self.agent.save(filename=episode_path)
            self.agent.save(filename=permanent_folder / "latest.tar")
        except NotImplementedError:
            episode_path = None
        else:
            logger.info("Saved {} model to {}".format(
                self.agent.__class__.__name__, episode_path))
    return episode_path
def __init__(self,
             env,
             agent,
             directory=None,
             run_directory=None,
             num_episodes=1000,
             training=True,
             sim_seed=None,
             recover=None,
             display_env=True,
             display_agent=True,
             display_rewards=True,
             close_env=True):
    """
    :param env: The environment to be solved, possibly wrapping an AbstractEnv environment
    :param AbstractAgent agent: The agent solving the environment
    :param Path directory: Workspace directory path
    :param Path run_directory: Run directory path
    :param int num_episodes: Number of episodes run
    :param training: Whether the agent is being trained or tested
    :param sim_seed: The seed used for the environment/agent randomness source
    :param recover: Recover the agent parameters from a file.
                    - If True, the default latest save will be used.
                    - If a string, it will be used as a path.
    :param display_env: Render the environment, and have a monitor recording its videos
    :param display_agent: Add the agent graphics to the environment viewer, if supported
    :param display_rewards: Display the performances of the agent through the episodes
    :param close_env: Should the environment be closed when the evaluation is closed
    """
    self.env = env
    self.agent = agent
    self.num_episodes = num_episodes
    self.training = training
    self.sim_seed = sim_seed
    self.close_env = close_env
    self.display_env = display_env

    self.directory = Path(directory or self.default_directory)
    self.run_directory = self.directory / (run_directory or self.default_run_directory)
    self.monitor = MonitorV2(env,
                             self.run_directory,
                             video_callable=(None if self.display_env else False))
    self.writer = SummaryWriter(str(self.run_directory))
    self.agent.set_writer(self.writer)
    self.write_logging()
    self.write_metadata()
    self.filtered_agent_stats = 0
    self.best_agent_stats = -np.infty, 0

    self.recover = recover
    if self.recover:
        self.load_agent_model(self.recover)

    if display_agent:
        try:
            # Render the agent within the environment viewer, if supported
            self.env.render()
            self.env.unwrapped.viewer.set_agent_display(
                lambda agent_surface, sim_surface: AgentGraphics.display(
                    self.agent, agent_surface, sim_surface))
        except AttributeError:
            logger.info("The environment viewer doesn't support agent rendering.")
    self.reward_viewer = None
    if display_rewards:
        self.reward_viewer = RewardViewer()
    self.observation = None
def run_batched_episodes(self):
    """
    Alternative to running episodes sequentially:
    - run multiple sample-collection jobs in parallel
    - update the model on each collected batch
    """
    episode = 0
    episode_duration = 14  # TODO: use a fixed number of samples instead
    batch_sizes = near_split(self.num_episodes * episode_duration,
                             size_bins=self.agent.config["batch_size"])
    self.agent.reset()
    for batch, batch_size in enumerate(batch_sizes):
        logger.info("[BATCH={}/{}]---------------------------------------".format(
            batch + 1, len(batch_sizes)))
        logger.info("[BATCH={}/{}][run_batched_episodes] #samples={}".format(
            batch + 1, len(batch_sizes), len(self.agent.memory)))
        logger.info("[BATCH={}/{}]---------------------------------------".format(
            batch + 1, len(batch_sizes)))
        # Save current agent
        model_path = self.save_agent_model(identifier=batch)

        # Prepare workers
        env_config, agent_config = serialize(self.env), serialize(self.agent)
        cpu_processes = self.agent.config["processes"] or os.cpu_count()
        workers_sample_counts = near_split(batch_size, cpu_processes)
        workers_starts = list(
            np.cumsum(np.insert(workers_sample_counts[:-1], 0, 0))
            + np.sum(batch_sizes[:batch]))
        base_seed = self.seed(batch * cpu_processes)[0]
        workers_seeds = [base_seed + i for i in range(cpu_processes)]
        workers_params = list(
            zip_with_singletons(env_config, agent_config, workers_sample_counts,
                                workers_starts, workers_seeds, model_path, batch))

        # Collect trajectories
        logger.info("Collecting {} samples with {} workers...".format(
            batch_size, cpu_processes))
        if cpu_processes == 1:
            results = [Evaluation.collect_samples(*workers_params[0])]
        else:
            with Pool(processes=cpu_processes) as pool:
                results = pool.starmap(Evaluation.collect_samples, workers_params)
        trajectories = [trajectory for worker in results for trajectory in worker]

        # Fill memory
        for trajectory in trajectories:
            # Check whether the episode was properly finished before logging
            if trajectory[-1].terminal:
                self.after_all_episodes(
                    episode, [transition.reward for transition in trajectory])
                episode += 1
            for transition in trajectory:
                self.agent.record(*transition)

        # Fit model
        self.agent.update()
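# Sketch of the worker partitioning arithmetic used above, with hypothetical
# numbers (assumption: near_split splits a total into near-even bins, e.g.
# near_split(10, 3) -> [4, 3, 3]). Each worker's start index is the cumulative
# count of samples handed to the workers before it.
def _worker_partition_example():
    workers_sample_counts = [4, 3, 3]  # e.g. batch_size=10 over 3 workers
    # Same cumsum/insert trick as run_batched_episodes(), without the
    # across-batch offset term.
    workers_starts = list(np.cumsum(np.insert(workers_sample_counts[:-1], 0, 0)))
    assert [int(s) for s in workers_starts] == [0, 4, 7]
    return workers_starts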
def after_all_episodes(self, episode, rewards, duration):
    rewards_individual_agents = np.array(self.rewards)
    rewards_averaged_over_agents = np.array(self.rewards_averaged_over_agents)
    self.episode_length = rewards_individual_agents.shape[0]
    if len(rewards_individual_agents.shape) > 1:
        controlled_vehicle_count = rewards_individual_agents.shape[1]
    else:
        controlled_vehicle_count = 1
    assert controlled_vehicle_count == len(self.env.controlled_vehicles), \
        "Length of each row in reward should be equal to the number of controlled vehicles"
    reward_total_episode = sum(rewards_averaged_over_agents)
    if not self.test_stable_baseline:
        self.writer.add_scalar('episode/length', self.episode_length, episode)
        self.writer.add_scalar('episode/total_reward', reward_total_episode, episode)

    if self.individual_reward_tensorboard:
        # Log the total reward of each controlled vehicle separately
        individual_rewards_dict = {}
        individual_rewards_title = 'individual_stats/agent_rewards'
        for n in range(controlled_vehicle_count):
            agent_name = 'agent' + str(n + 1)
            agent_total_reward = sum(rewards_individual_agents[:, n])
            individual_rewards_dict[agent_name] = agent_total_reward
        self.writer.add_scalars(individual_rewards_title, individual_rewards_dict, episode)

    if not self.test_stable_baseline:
        gamma = self.agent.config.get("gamma", 1)
        self.writer.add_scalar(
            'episode/return',
            sum(r * gamma**t for t, r in enumerate(rewards_averaged_over_agents)),
            episode)
        self.writer.add_histogram('episode/rewards', rewards_averaged_over_agents, episode)
        self.writer.add_scalar('episode/fps', len(rewards) / duration, episode)

    # Create raw logfiles
    if self.create_episode_log:
        logged_info = self.log_creator.episode_info_logger(episode)
        # Add the logged info to TensorBoard
        if not self.test_stable_baseline:
            self.writer.add_scalar('episode/mission_time',
                                   logged_info['mission_time'], episode)
            self.writer.add_scalar('episode_average_speeds/episode_average_speed_all',
                                   logged_info['episode_average_speed_all'], episode)
            self.writer.add_scalar('episode_average_speeds/episode_average_speed_controlled',
                                   logged_info['episode_average_speed_controlled'], episode)
            self.writer.add_scalar('episode_average_speeds/episode_average_speed_human',
                                   logged_info['episode_average_speed_human'], episode)
            if self.log_creator.log_distance:
                self.writer.add_scalar('episode_average_distances/episode_average_distance_all',
                                       logged_info['episode_average_distance_all'], episode)
                self.writer.add_scalar('episode_average_distances/episode_average_distance_controlled',
                                       logged_info['episode_average_distance_controlled'], episode)
                self.writer.add_scalar('episode_average_distances/episode_average_distance_human',
                                       logged_info['episode_average_distance_human'], episode)

    # Compute the episode elapsed time in ms
    episode_elapsed_time = 1000 * (time.time() - self.episode_start_time)
    logger.info(
        "Episode {} done in {:.1f}ms - step duration: {:.1f}ms, episode length: {} steps, total episode reward: {:.1f}"
        .format(episode, episode_elapsed_time,
                episode_elapsed_time / self.episode_length,
                self.episode_length,
                sum(rewards_averaged_over_agents)))
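# Hedged sketch of the reward shapes the method above relies on: self.rewards
# is assumed to be (episode_length, controlled_vehicle_count), and
# rewards_averaged_over_agents is assumed to be its per-step mean over agents.
def _reward_aggregation_example():
    rewards_individual_agents = np.array([[1.0, 3.0],
                                          [0.0, 2.0]])  # 2 steps, 2 agents
    rewards_averaged_over_agents = rewards_individual_agents.mean(axis=1)
    assert rewards_averaged_over_agents.tolist() == [2.0, 1.0]
    return rewards_averaged_over_agents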
def __init__(
        self,
        env,
        agent,
        directory=None,
        run_directory=None,
        num_episodes=1000,
        training=True,
        sim_seed=None,
        recover=None,
        display_env=True,
        display_agent=True,
        display_rewards=True,
        close_env=True,
        test_stable_baseline=False,
        model=None,
        options=None,
):
    """
    :param env: The environment to be solved, possibly wrapping an AbstractEnv environment
    :param AbstractAgent agent: The agent solving the environment
    :param Path directory: Workspace directory path
    :param Path run_directory: Run directory path
    :param int num_episodes: Number of episodes run
    :param training: Whether the agent is being trained or tested
    :param sim_seed: The seed used for the environment/agent randomness source
    :param recover: Recover the agent parameters from a file.
                    - If True, the default latest save will be used.
                    - If a string, it will be used as a path.
    :param display_env: Render the environment, and have a monitor recording its videos
    :param display_agent: Add the agent graphics to the environment viewer, if supported
    :param display_rewards: Display the performances of the agent through the episodes
    :param close_env: Should the environment be closed when the evaluation is closed
    :param test_stable_baseline: Whether a stable-baselines model is evaluated instead of an rl-agents agent
    :param model: The stable-baselines model to evaluate, when test_stable_baseline is set
    :param options: Command-line options dictionary configuring logging and output
    """
    self.env = env
    self.agent = agent
    self.num_episodes = num_episodes
    self.training = training
    self.env.training = training
    self.sim_seed = sim_seed
    self.close_env = close_env
    self.display_env = display_env

    # Modifications
    self.dataset_by_episode = []
    self.env.options = copy.deepcopy(options)
    self.options = copy.deepcopy(options)
    if options['--output_folder']:
        self.OUTPUT_FOLDER = options['--output_folder']
    self.directory = Path(directory or self.default_directory)
    if self.options["--name-from-envconfig"]:
        exp_json = options["--environment"].split('/')[-1]
        default_run_directory = self.default_run_directory + "_" + exp_json.split('.')[0]
        if training:
            default_run_directory = os.path.join("train", default_run_directory)
        else:
            default_run_directory = os.path.join("test", default_run_directory + "-test")
    else:
        default_run_directory = self.default_run_directory
    self.run_directory = self.directory / (run_directory or default_run_directory)
    self.monitor = MonitorV2(env,
                             self.run_directory,
                             video_callable=(None if self.display_env else False),
                             options=self.options)
    self.test_stable_baseline = test_stable_baseline
    self.episode = 0
    if not self.test_stable_baseline:
        self.writer = SummaryWriter(str(self.run_directory))
        self.agent.set_writer(self.writer)
        self.agent.evaluation = self
        self.write_logging()
        self.write_metadata()
    self.filtered_agent_stats = 0
    self.best_agent_stats = -np.infty, 0

    self.recover = recover
    if self.recover:
        self.load_agent_model(self.recover)

    if display_agent:
        try:
            # Render the agent within the environment viewer, if supported
            self.env.render()
            self.env.unwrapped.viewer.directory = self.run_directory
            self.env.unwrapped.viewer.set_agent_display(
                lambda agent_surface, sim_surface: AgentGraphics.display(
                    self.agent, agent_surface, sim_surface))
        except AttributeError:
            logger.info("The environment viewer doesn't support agent rendering.")
    self.reward_viewer = None
    if display_rewards:
        self.reward_viewer = RewardViewer()
    self.observation = None

    # Modifications
    self.episode_start_time = 0
    self.episode_length = None
    self.episode_info = None
    self.create_episode_log = options["--create_episode_log"]
    self.individual_episode_log_level = int(options["--individual_episode_log_level"])
    self.create_timestep_log = options["--create_timestep_log"]
    self.timestep_log_freq = int(options["--timestep_log_freq"])
    self.individual_reward_tensorboard = options["--individual_reward_tensorboard"]
    self.log_creator = None
    self.rewards = None
    self.rewards_averaged_over_agents = None
    if self.test_stable_baseline:
        self.model = model
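# Hedged usage sketch: the option keys this modified __init__ actually reads.
# The values below are hypothetical placeholders; only the keys are taken from
# the code above.
_example_options = {
    "--output_folder": "out",
    "--name-from-envconfig": True,
    "--environment": "configs/HighwayEnv/env.json",
    "--create_episode_log": True,
    "--individual_episode_log_level": "2",
    "--individual_reward_tensorboard": False,
    "--create_timestep_log": False,
    "--timestep_log_freq": "10",
}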