def render_test(parameters, test_nr=0, nr_examples=5):
    """Display every stored level of one test set for a short moment.

    For each level index in ``range(nr_examples)`` the level is loaded from
    ``./Tests/<test_nr>/Level_<index>.pkl``, rendered once in a window, kept
    on screen for 0.1 seconds and then the window is closed again.

    Args:
        parameters: Indexable whose first three entries are printed as the
            x dimension, the y dimension and the number of agents.
        test_nr: Number of the test set; selects the sub-directory of
            ``./Tests``.
        nr_examples: Number of levels to show.

    Returns:
        None.
    """
    for level_idx in range(nr_examples):
        # Announce which level is about to be shown
        banner = 'Showing {} Level {} with (x_dim,y_dim) = ({},{}) and {} Agents.'.format(
            test_nr, level_idx, parameters[0], parameters[1], parameters[2])
        print(banner)

        level_path = "./Tests/{}/Level_{}.pkl".format(test_nr, level_idx)

        # Width, height and agent count are fixed placeholders here; the rail
        # itself comes from the generator built on the level file.
        level_generator = rail_from_file(level_path)
        observation_builder = TreeObsForRailEnv(max_depth=2)
        env = RailEnv(
            width=1,
            height=1,
            rail_generator=level_generator,
            obs_builder_object=observation_builder,
            number_of_agents=1,
        )

        window = RenderTool(env, gl="PILSVG")
        window.set_new_rail()

        env.reset(False, False)
        window.render_env(show=True, show_observations=False)

        time.sleep(0.1)
        window.close_window()

    return None
def train_agent(env_params, train_params):
    """Train a Double Dueling DQN policy on a sparse Flatland environment.

    The environment, the tree observation builder and the policy are built
    from the two parameter objects. The policy then plays
    ``n_episodes + 1`` episodes with an epsilon-greedy strategy; transitions
    are passed to ``policy.step`` and per-episode statistics are written to
    TensorBoard. Every ``checkpoint_interval`` episodes the local Q-network is
    saved to ``./checkpoints/origin_multi-<episode>.pth`` and the policy is
    evaluated through ``eval_policy``.

    Args:
        env_params: Object providing ``n_agents``, ``x_dim``, ``y_dim``,
            ``n_cities``, ``max_rails_between_cities``, ``max_rails_in_city``,
            ``seed``, ``observation_tree_depth``, ``observation_radius`` and
            ``observation_max_path_depth``. Must work with ``vars()``.
        train_params: Object providing ``eps_start``, ``eps_end``,
            ``eps_decay``, ``n_episodes``, ``checkpoint_interval``,
            ``n_evaluation_episodes`` and ``render``. It is also passed to
            ``DDDQNPolicy`` and must work with ``vars()``.

    Returns:
        None.
    """
    # Environment parameters
    n_agents = env_params.n_agents
    x_dim = env_params.x_dim
    y_dim = env_params.y_dim
    n_cities = env_params.n_cities
    max_rails_between_cities = env_params.max_rails_between_cities
    max_rails_in_city = env_params.max_rails_in_city
    seed = env_params.seed

    # Observation parameters
    observation_tree_depth = env_params.observation_tree_depth
    observation_radius = env_params.observation_radius
    observation_max_path_depth = env_params.observation_max_path_depth

    # Training parameters
    eps_start = train_params.eps_start
    eps_end = train_params.eps_end
    eps_decay = train_params.eps_decay
    n_episodes = train_params.n_episodes
    checkpoint_interval = train_params.checkpoint_interval
    n_eval_episodes = train_params.n_evaluation_episodes

    # Set the seeds
    random.seed(seed)
    np.random.seed(seed)

    # Break agents from time to time
    malfunction_parameters = MalfunctionParameters(
        malfunction_rate=1. / 10000,  # Rate of malfunctions
        min_duration=15,  # Minimal duration
        max_duration=50  # Max duration
    )

    # Observation builder
    predictor = ShortestPathPredictorForRailEnv(observation_max_path_depth)
    tree_observation = TreeObsForRailEnv(max_depth=observation_tree_depth, predictor=predictor)

    # Fraction of trains running at each speed
    speed_profiles = {
        1.: 1.0,  # Fast passenger train
        1. / 2.: 0.0,  # Fast freight train
        1. / 3.: 0.0,  # Slow commuter train
        1. / 4.: 0.0  # Slow freight train
    }

    # Setup the environment
    env = RailEnv(
        width=x_dim,
        height=y_dim,
        rail_generator=sparse_rail_generator(
            max_num_cities=n_cities,
            grid_mode=False,
            max_rails_between_cities=max_rails_between_cities,
            max_rails_in_city=max_rails_in_city),
        schedule_generator=sparse_schedule_generator(speed_profiles),
        number_of_agents=n_agents,
        malfunction_generator_and_process_data=malfunction_from_params(malfunction_parameters),
        obs_builder_object=tree_observation,
        random_seed=seed)
    env.reset(regenerate_schedule=True, regenerate_rail=True)

    # Setup renderer
    if train_params.render:
        env_renderer = RenderTool(env, gl="PGL")

    # Calculate the state size given the depth of the tree observation and the number of features
    n_features_per_node = env.obs_builder.observation_dim
    n_nodes = sum(np.power(4, i) for i in range(observation_tree_depth + 1))
    state_size = n_features_per_node * n_nodes

    # The action space of flatland is 5 discrete actions
    action_size = 5

    # Max number of steps per episode
    # This is the official formula used during evaluations
    # See details in flatland.envs.schedule_generators.sparse_schedule_generator
    max_steps = int(4 * 2 * (env.height + env.width + (n_agents / n_cities)))

    action_count = [0] * action_size
    action_dict = dict()
    agent_obs = [None] * env.get_num_agents()
    agent_prev_obs = [None] * env.get_num_agents()
    agent_prev_action = [2] * env.get_num_agents()
    # One flag per agent: with a single shared flag, the last agent handled in
    # the action loop would decide whether every agent's transition is stored.
    update_values = [False] * env.get_num_agents()
    smoothed_normalized_score = -1.0
    smoothed_eval_normalized_score = -1.0
    smoothed_completion = 0.0
    smoothed_eval_completion = 0.0

    # Double Dueling DQN policy
    policy = DDDQNPolicy(state_size, action_size, train_params)

    # TensorBoard writer
    writer = SummaryWriter()
    writer.add_hparams(vars(train_params), {})
    writer.add_hparams(vars(env_params), {})

    training_timer = Timer()
    training_timer.start()

    print(
        "\n🚉 Training {} trains on {}x{} grid for {} episodes, evaluating on {} episodes every {} episodes.\n"
        .format(env.get_num_agents(), x_dim, y_dim, n_episodes, n_eval_episodes, checkpoint_interval))

    for episode_idx in range(n_episodes + 1):
        # Timers
        step_timer = Timer()
        reset_timer = Timer()
        learn_timer = Timer()
        preproc_timer = Timer()

        # Reset environment
        reset_timer.start()
        obs, info = env.reset(regenerate_rail=True, regenerate_schedule=True)
        reset_timer.end()

        if train_params.render:
            env_renderer.set_new_rail()

        score = 0
        nb_steps = 0
        actions_taken = []

        # Build agent specific observations
        for agent in env.get_agent_handles():
            if obs[agent]:
                agent_obs[agent] = normalize_observation(
                    obs[agent], observation_tree_depth, observation_radius=observation_radius)
                agent_prev_obs[agent] = agent_obs[agent].copy()

        # Run episode
        for step in range(max_steps - 1):
            for agent in env.get_agent_handles():
                if info['action_required'][agent]:
                    # If an action is required, we want to store the obs at that step as well as the action
                    update_values[agent] = True
                    action = policy.act(agent_obs[agent], eps=eps_start)
                    action_count[action] += 1
                    actions_taken.append(action)
                else:
                    update_values[agent] = False
                    action = 0
                action_dict[agent] = action

            # Environment step
            step_timer.start()
            next_obs, all_rewards, done, info = env.step(action_dict)
            step_timer.end()

            if train_params.render and episode_idx % checkpoint_interval == 0:
                env_renderer.render_env(
                    show=True,
                    frames=False,
                    show_observations=False,
                    show_predictions=False)

            for agent in range(env.get_num_agents()):
                # Update replay buffer and train agent
                # Only update the values when this agent is done or when it took an action,
                # i.e. when relevant information is present
                if update_values[agent] or done[agent]:
                    learn_timer.start()
                    policy.step(agent_prev_obs[agent], agent_prev_action[agent], all_rewards[agent],
                                agent_obs[agent], done[agent])
                    learn_timer.end()

                    agent_prev_obs[agent] = agent_obs[agent].copy()
                    agent_prev_action[agent] = action_dict[agent]

                # Preprocess the new observations
                if next_obs[agent]:
                    preproc_timer.start()
                    agent_obs[agent] = normalize_observation(
                        next_obs[agent], observation_tree_depth, observation_radius=observation_radius)
                    preproc_timer.end()

                score += all_rewards[agent]

            nb_steps = step

            if done['__all__']:
                break

        # Epsilon decay
        eps_start = max(eps_end, eps_decay * eps_start)

        # Collect information about training
        tasks_finished = sum(done[idx] for idx in env.get_agent_handles())
        completion = tasks_finished / max(1, env.get_num_agents())
        normalized_score = score / (max_steps * env.get_num_agents())
        action_probs = action_count / np.sum(action_count)
        # Restart the counters at 1 so the next normalisation never divides by zero
        action_count = [1] * action_size

        # Smoothed values for terminal display and for more stable hyper-parameter tuning
        smoothing = 0.99
        smoothed_normalized_score = smoothed_normalized_score * smoothing + normalized_score * (1.0 - smoothing)
        smoothed_completion = smoothed_completion * smoothing + completion * (1.0 - smoothing)

        # Save a checkpoint at some interval
        if episode_idx % checkpoint_interval == 0:
            torch.save(policy.qnetwork_local, './checkpoints/origin_multi-' + str(episode_idx) + '.pth')
            if train_params.render:
                env_renderer.close_window()

        # Print logs
        print(
            '\r🚂 Episode {}'
            '\t 🏆 Score: {:.3f}'
            ' Avg: {:.3f}'
            '\t 💯 Done: {:.2f}%'
            ' Avg: {:.2f}%'
            '\t 🎲 Epsilon: {:.2f} '
            '\t 🔀 Action Probs: {}'.format(
                episode_idx,
                normalized_score,
                smoothed_normalized_score,
                100 * completion,
                100 * smoothed_completion,
                eps_start,
                format_action_prob(action_probs)),
            end=" ")

        # Evaluate policy
        if episode_idx % checkpoint_interval == 0:
            scores, completions, nb_steps_eval = eval_policy(env, policy, n_eval_episodes, max_steps)

            # Same tags and order as writing every call out by hand:
            # <metric>_min, _max, _mean, _std, then the histogram.
            for metric_name, values in (("scores", scores),
                                        ("completions", completions),
                                        ("nb_steps", nb_steps_eval)):
                for stat_name, stat_fn in (("min", np.min), ("max", np.max),
                                           ("mean", np.mean), ("std", np.std)):
                    writer.add_scalar("evaluation/{}_{}".format(metric_name, stat_name),
                                      stat_fn(values), episode_idx)
                writer.add_histogram("evaluation/{}".format(metric_name), np.array(values), episode_idx)

            smoothing = 0.9
            smoothed_eval_normalized_score = \
                smoothed_eval_normalized_score * smoothing + np.mean(scores) * (1.0 - smoothing)
            smoothed_eval_completion = \
                smoothed_eval_completion * smoothing + np.mean(completions) * (1.0 - smoothing)
            writer.add_scalar("evaluation/smoothed_score", smoothed_eval_normalized_score, episode_idx)
            writer.add_scalar("evaluation/smoothed_completion", smoothed_eval_completion, episode_idx)

        # Save logs to tensorboard
        writer.add_scalar("training/score", normalized_score, episode_idx)
        writer.add_scalar("training/smoothed_score", smoothed_normalized_score, episode_idx)
        writer.add_scalar("training/completion", np.mean(completion), episode_idx)
        writer.add_scalar("training/smoothed_completion", np.mean(smoothed_completion), episode_idx)
        writer.add_scalar("training/nb_steps", nb_steps, episode_idx)
        writer.add_histogram("actions/distribution", np.array(actions_taken), episode_idx)
        writer.add_scalar("actions/nothing", action_probs[RailEnvActions.DO_NOTHING], episode_idx)
        writer.add_scalar("actions/left", action_probs[RailEnvActions.MOVE_LEFT], episode_idx)
        writer.add_scalar("actions/forward", action_probs[RailEnvActions.MOVE_FORWARD], episode_idx)
        writer.add_scalar("actions/right", action_probs[RailEnvActions.MOVE_RIGHT], episode_idx)
        writer.add_scalar("actions/stop", action_probs[RailEnvActions.STOP_MOVING], episode_idx)
        writer.add_scalar("training/epsilon", eps_start, episode_idx)
        writer.add_scalar("training/buffer_size", len(policy.memory), episode_idx)
        writer.add_scalar("training/loss", policy.loss, episode_idx)
        writer.add_scalar("timer/reset", reset_timer.get(), episode_idx)
        writer.add_scalar("timer/step", step_timer.get(), episode_idx)
        writer.add_scalar("timer/learn", learn_timer.get(), episode_idx)
        writer.add_scalar("timer/preproc", preproc_timer.get(), episode_idx)
        writer.add_scalar("timer/total", training_timer.get_current(), episode_idx)

    # Flush pending events and release the event file
    writer.close()
def train_agent(train_params, train_env_params, eval_env_params, obs_params):
    """Train a Double Dueling DQN policy and evaluate it on a separate environment.

    A training and an evaluation environment are created with
    ``create_rail_env``. The policy plays ``n_episodes + 1`` episodes on the
    training environment with an epsilon-greedy strategy, feeding transitions
    to ``policy.step`` and logging statistics to TensorBoard. Every
    ``checkpoint_interval`` episodes the local Q-network is saved to
    ``./checkpoints/<training_id>-<episode>.pth``, the replay buffer is
    optionally saved to ``./replay_buffers/`` and, if
    ``n_evaluation_episodes > 0``, the policy is evaluated through
    ``eval_policy`` on the evaluation environment.

    NOTE(review): each environment receives its own observation builder. The
    original code passed one builder instance to both; Flatland's ``RailEnv``
    appears to bind the builder it is given to itself, so a shared instance
    would observe whichever environment was created last. Confirm against the
    installed Flatland version.

    Args:
        train_params: Object providing ``eps_start``, ``eps_end``,
            ``eps_decay``, ``n_episodes``, ``checkpoint_interval``,
            ``n_evaluation_episodes``, ``restore_replay_buffer``,
            ``save_replay_buffer``, ``buffer_size`` and ``render``. It is also
            passed to ``DDDQNPolicy`` and ``eval_policy`` and must work with
            ``vars()``.
        train_env_params: Object providing ``n_agents``, ``x_dim``, ``y_dim``,
            ``n_cities``, ``max_rails_between_cities``, ``max_rails_in_city``
            and ``seed``; passed to ``create_rail_env``. Must work with
            ``vars()``.
        eval_env_params: Parameters passed to ``create_rail_env`` for the
            evaluation environment.
        obs_params: Object providing ``observation_tree_depth``,
            ``observation_radius`` and ``observation_max_path_depth``. Must
            work with ``vars()``.

    Returns:
        None.

    Raises:
        SystemExit: With code 1 if the replay buffer named by
            ``restore_replay_buffer`` cannot be loaded (``RuntimeError``).
    """
    # Environment parameters
    n_agents = train_env_params.n_agents
    x_dim = train_env_params.x_dim
    y_dim = train_env_params.y_dim
    n_cities = train_env_params.n_cities
    max_rails_between_cities = train_env_params.max_rails_between_cities
    max_rails_in_city = train_env_params.max_rails_in_city
    seed = train_env_params.seed

    # Unique ID for this training
    now = datetime.now()
    training_id = now.strftime('%y%m%d%H%M%S')

    # Observation parameters
    observation_tree_depth = obs_params.observation_tree_depth
    observation_radius = obs_params.observation_radius
    observation_max_path_depth = obs_params.observation_max_path_depth

    # Training parameters
    eps_start = train_params.eps_start
    eps_end = train_params.eps_end
    eps_decay = train_params.eps_decay
    n_episodes = train_params.n_episodes
    checkpoint_interval = train_params.checkpoint_interval
    n_eval_episodes = train_params.n_evaluation_episodes
    restore_replay_buffer = train_params.restore_replay_buffer
    save_replay_buffer = train_params.save_replay_buffer

    # Set the seeds
    random.seed(seed)
    np.random.seed(seed)

    def build_tree_observation():
        """Create an independent predictor and tree observation builder."""
        predictor = ShortestPathPredictorForRailEnv(observation_max_path_depth)
        return TreeObsForRailEnv(max_depth=observation_tree_depth, predictor=predictor)

    # Setup the environments, each with its own observation builder
    train_env = create_rail_env(train_env_params, build_tree_observation())
    train_env.reset(regenerate_schedule=True, regenerate_rail=True)
    eval_env = create_rail_env(eval_env_params, build_tree_observation())
    eval_env.reset(regenerate_schedule=True, regenerate_rail=True)

    # Setup renderer
    if train_params.render:
        env_renderer = RenderTool(train_env, gl="PGL")

    # Calculate the state size given the depth of the tree observation and the number of features
    n_features_per_node = train_env.obs_builder.observation_dim
    n_nodes = sum([np.power(4, i) for i in range(observation_tree_depth + 1)])
    state_size = n_features_per_node * n_nodes

    # The action space of flatland is 5 discrete actions
    action_size = 5

    # Max number of steps per episode, as defined by the environment
    max_steps = train_env._max_episode_steps

    action_count = [0] * action_size
    action_dict = dict()
    agent_obs = [None] * n_agents
    agent_prev_obs = [None] * n_agents
    agent_prev_action = [2] * n_agents
    update_values = [False] * n_agents

    # Smoothed values used as target for hyperparameter tuning
    smoothed_normalized_score = -1.0
    smoothed_eval_normalized_score = -1.0
    smoothed_completion = 0.0
    smoothed_eval_completion = 0.0

    # Double Dueling DQN policy
    policy = DDDQNPolicy(state_size, action_size, train_params)

    # Loads existing replay buffer
    if restore_replay_buffer:
        try:
            policy.load_replay_buffer(restore_replay_buffer)
            policy.test()
        except RuntimeError as e:
            print("\n🛑 Could't load replay buffer, were the experiences generated using the same tree depth?")
            print(e)
            # Equivalent to exit(1) but does not rely on the helper injected by the `site` module
            raise SystemExit(1)

    print("\n💾 Replay buffer status: {}/{} experiences".format(
        len(policy.memory.memory), train_params.buffer_size))

    hdd = psutil.disk_usage('/')
    free_gb = hdd.free / (2 ** 30)
    if save_replay_buffer and free_gb < 500.0:
        print(
            "⚠️ Careful! Saving replay buffers will quickly consume a lot of disk space. You have {:.2f}gb left."
            .format(free_gb))

    # TensorBoard writer
    writer = SummaryWriter()
    writer.add_hparams(vars(train_params), {})
    writer.add_hparams(vars(train_env_params), {})
    writer.add_hparams(vars(obs_params), {})

    training_timer = Timer()
    training_timer.start()

    print(
        "\n🚉 Training {} trains on {}x{} grid for {} episodes, evaluating on {} episodes every {} episodes. Training id '{}'.\n"
        .format(train_env.get_num_agents(), x_dim, y_dim, n_episodes, n_eval_episodes, checkpoint_interval,
                training_id))

    for episode_idx in range(n_episodes + 1):
        step_timer = Timer()
        reset_timer = Timer()
        learn_timer = Timer()
        preproc_timer = Timer()
        inference_timer = Timer()

        # Reset environment
        reset_timer.start()
        obs, info = train_env.reset(regenerate_rail=True, regenerate_schedule=True)
        reset_timer.end()

        if train_params.render:
            env_renderer.set_new_rail()

        score = 0
        nb_steps = 0
        actions_taken = []

        # Build initial agent-specific observations
        for agent in train_env.get_agent_handles():
            if obs[agent]:
                agent_obs[agent] = normalize_observation(
                    obs[agent], observation_tree_depth, observation_radius=observation_radius)
                agent_prev_obs[agent] = agent_obs[agent].copy()

        # Run episode
        for step in range(max_steps - 1):
            inference_timer.start()
            for agent in train_env.get_agent_handles():
                if info['action_required'][agent]:
                    update_values[agent] = True
                    action = policy.act(agent_obs[agent], eps=eps_start)
                    action_count[action] += 1
                    actions_taken.append(action)
                else:
                    # An action is not required if the train hasn't joined the railway network,
                    # if it already reached its target, or if it is currently malfunctioning.
                    update_values[agent] = False
                    action = 0
                action_dict[agent] = action
            inference_timer.end()

            # Environment step
            step_timer.start()
            next_obs, all_rewards, done, info = train_env.step(action_dict)
            step_timer.end()

            # Render an episode at some interval
            if train_params.render and episode_idx % checkpoint_interval == 0:
                env_renderer.render_env(
                    show=True,
                    frames=False,
                    show_observations=False,
                    show_predictions=False)

            # Update replay buffer and train agent
            for agent in train_env.get_agent_handles():
                if update_values[agent] or done['__all__']:
                    # Only learn from timesteps where something happened
                    learn_timer.start()
                    policy.step(agent_prev_obs[agent], agent_prev_action[agent], all_rewards[agent],
                                agent_obs[agent], done[agent])
                    learn_timer.end()

                    agent_prev_obs[agent] = agent_obs[agent].copy()
                    agent_prev_action[agent] = action_dict[agent]

                # Preprocess the new observations
                if next_obs[agent]:
                    preproc_timer.start()
                    agent_obs[agent] = normalize_observation(
                        next_obs[agent], observation_tree_depth, observation_radius=observation_radius)
                    preproc_timer.end()

                score += all_rewards[agent]

            nb_steps = step

            if done['__all__']:
                break

        # Epsilon decay
        eps_start = max(eps_end, eps_decay * eps_start)

        # Collect information about training
        tasks_finished = sum(done[idx] for idx in train_env.get_agent_handles())
        completion = tasks_finished / max(1, train_env.get_num_agents())
        normalized_score = score / (max_steps * train_env.get_num_agents())
        action_probs = action_count / np.sum(action_count)
        # Restart the counters at 1 so the next normalisation never divides by zero
        action_count = [1] * action_size

        smoothing = 0.99
        smoothed_normalized_score = smoothed_normalized_score * smoothing + normalized_score * (1.0 - smoothing)
        smoothed_completion = smoothed_completion * smoothing + completion * (1.0 - smoothing)

        # Save checkpoint (and optionally the replay buffer) at some interval
        if episode_idx % checkpoint_interval == 0:
            torch.save(policy.qnetwork_local, './checkpoints/' + training_id + '-' + str(episode_idx) + '.pth')

            if save_replay_buffer:
                policy.save_replay_buffer('./replay_buffers/' + training_id + '-' + str(episode_idx) + '.pkl')

            if train_params.render:
                env_renderer.close_window()

        # Print logs
        print(
            '\r🚂 Episode {}'
            '\t 🏆 Score: {:.3f}'
            ' Avg: {:.3f}'
            '\t 💯 Done: {:.2f}%'
            ' Avg: {:.2f}%'
            '\t 🎲 Epsilon: {:.3f} '
            '\t 🔀 Action Probs: {}'.format(
                episode_idx,
                normalized_score,
                smoothed_normalized_score,
                100 * completion,
                100 * smoothed_completion,
                eps_start,
                format_action_prob(action_probs)),
            end=" ")

        # Evaluate policy and log results at some interval
        if episode_idx % checkpoint_interval == 0 and n_eval_episodes > 0:
            scores, completions, nb_steps_eval = eval_policy(eval_env, policy, train_params, obs_params)

            # Same tags and order as writing every call out by hand:
            # <metric>_min, _max, _mean, _std, then the histogram.
            for metric_name, values in (("scores", scores),
                                        ("completions", completions),
                                        ("nb_steps", nb_steps_eval)):
                for stat_name, stat_fn in (("min", np.min), ("max", np.max),
                                           ("mean", np.mean), ("std", np.std)):
                    writer.add_scalar("evaluation/{}_{}".format(metric_name, stat_name),
                                      stat_fn(values), episode_idx)
                writer.add_histogram("evaluation/{}".format(metric_name), np.array(values), episode_idx)

            smoothing = 0.9
            smoothed_eval_normalized_score = \
                smoothed_eval_normalized_score * smoothing + np.mean(scores) * (1.0 - smoothing)
            smoothed_eval_completion = \
                smoothed_eval_completion * smoothing + np.mean(completions) * (1.0 - smoothing)
            writer.add_scalar("evaluation/smoothed_score", smoothed_eval_normalized_score, episode_idx)
            writer.add_scalar("evaluation/smoothed_completion", smoothed_eval_completion, episode_idx)

        # Save logs to tensorboard
        writer.add_scalar("training/score", normalized_score, episode_idx)
        writer.add_scalar("training/smoothed_score", smoothed_normalized_score, episode_idx)
        writer.add_scalar("training/completion", np.mean(completion), episode_idx)
        writer.add_scalar("training/smoothed_completion", np.mean(smoothed_completion), episode_idx)
        writer.add_scalar("training/nb_steps", nb_steps, episode_idx)
        writer.add_histogram("actions/distribution", np.array(actions_taken), episode_idx)
        writer.add_scalar("actions/nothing", action_probs[RailEnvActions.DO_NOTHING], episode_idx)
        writer.add_scalar("actions/left", action_probs[RailEnvActions.MOVE_LEFT], episode_idx)
        writer.add_scalar("actions/forward", action_probs[RailEnvActions.MOVE_FORWARD], episode_idx)
        writer.add_scalar("actions/right", action_probs[RailEnvActions.MOVE_RIGHT], episode_idx)
        writer.add_scalar("actions/stop", action_probs[RailEnvActions.STOP_MOVING], episode_idx)
        writer.add_scalar("training/epsilon", eps_start, episode_idx)
        writer.add_scalar("training/buffer_size", len(policy.memory), episode_idx)
        writer.add_scalar("training/loss", policy.loss, episode_idx)
        writer.add_scalar("timer/reset", reset_timer.get(), episode_idx)
        writer.add_scalar("timer/step", step_timer.get(), episode_idx)
        writer.add_scalar("timer/learn", learn_timer.get(), episode_idx)
        writer.add_scalar("timer/preproc", preproc_timer.get(), episode_idx)
        writer.add_scalar("timer/total", training_timer.get_current(), episode_idx)

    # Flush pending events and release the event file
    writer.close()
def eval_policy(env_params, checkpoint, n_eval_episodes, max_steps, action_size, state_size, seed, render,
                allow_skipping, allow_caching):
    """Evaluate a saved DDDQN Q-network on freshly generated sparse environments.

    A CPU policy is created and its local Q-network is replaced by the object
    loaded from ``checkpoint``. For each evaluation episode the environment is
    regenerated with the next seed (``seed + 1``, ``seed + 2``, ...) and the
    policy acts greedily (``eps=0.0``) for at most ``max_steps - 1`` steps.
    One summary line is printed per episode.

    Args:
        env_params: Mapping with the keys ``n_agents``, ``x_dim``, ``y_dim``,
            ``n_cities``, ``max_rails_between_cities``, ``max_rails_in_city``,
            ``observation_tree_depth``, ``observation_radius`` and
            ``observation_max_path_depth``.
        checkpoint: Passed to ``torch.load``; the loaded object becomes
            ``policy.qnetwork_local``.
        n_eval_episodes: Number of episodes to run.
        max_steps: Step budget per episode; also used to normalise the score.
        action_size: Number of actions, passed to ``DDDQNPolicy``.
        state_size: Observation vector size, passed to ``DDDQNPolicy``.
        seed: Base random seed; incremented before every episode.
        render: If true, every step is rendered with ``RenderTool``.
        allow_skipping: If true, the episode stops as soon as
            ``check_if_all_blocked(env)`` is true; every unfinished agent is
            charged -1 for each remaining step.
        allow_caching: If true, the last raw observation and action of each
            agent are remembered and the action is reused while the
            observation is unchanged.

    Returns:
        Tuple ``(scores, completions, nb_steps, agent_times, step_times)`` of
        lists holding one entry per episode.
    """
    # Evaluation is faster on CPU (except if you use a really huge policy)
    parameters = {'use_gpu': False}

    policy = DDDQNPolicy(state_size, action_size, Namespace(**parameters), evaluation_mode=True)
    # The policy runs on the CPU, so map the stored tensors there as well;
    # otherwise a checkpoint saved from a GPU would be restored onto the GPU.
    policy.qnetwork_local = torch.load(checkpoint, map_location=torch.device('cpu'))

    env_params = Namespace(**env_params)

    # Environment parameters
    n_agents = env_params.n_agents
    x_dim = env_params.x_dim
    y_dim = env_params.y_dim
    n_cities = env_params.n_cities
    max_rails_between_cities = env_params.max_rails_between_cities
    max_rails_in_city = env_params.max_rails_in_city

    # Malfunction and speed profiles
    # TODO pass these parameters properly from main!
    malfunction_parameters = MalfunctionParameters(
        malfunction_rate=1. / 2000,  # Rate of malfunctions
        min_duration=20,  # Minimal duration
        max_duration=50  # Max duration
    )

    # Only fast trains in Round 1
    speed_profiles = {
        1.: 1.0,  # Fast passenger train
        1. / 2.: 0.0,  # Fast freight train
        1. / 3.: 0.0,  # Slow commuter train
        1. / 4.: 0.0  # Slow freight train
    }

    # Observation parameters
    observation_tree_depth = env_params.observation_tree_depth
    observation_radius = env_params.observation_radius
    observation_max_path_depth = env_params.observation_max_path_depth

    # Observation builder
    predictor = ShortestPathPredictorForRailEnv(observation_max_path_depth)
    tree_observation = TreeObsForRailEnv(max_depth=observation_tree_depth, predictor=predictor)

    # Setup the environment
    env = RailEnv(
        width=x_dim,
        height=y_dim,
        rail_generator=sparse_rail_generator(
            max_num_cities=n_cities,
            grid_mode=False,
            max_rails_between_cities=max_rails_between_cities,
            max_rails_in_city=max_rails_in_city,
        ),
        schedule_generator=sparse_schedule_generator(speed_profiles),
        number_of_agents=n_agents,
        malfunction_generator_and_process_data=malfunction_from_params(malfunction_parameters),
        obs_builder_object=tree_observation)

    if render:
        env_renderer = RenderTool(env, gl="PGL")

    action_dict = dict()
    scores = []
    completions = []
    nb_steps = []
    agent_times = []
    step_times = []

    for episode_idx in range(n_eval_episodes):
        seed += 1

        inference_timer = Timer()
        preproc_timer = Timer()
        agent_timer = Timer()
        step_timer = Timer()

        step_timer.start()
        obs, info = env.reset(regenerate_rail=True, regenerate_schedule=True, random_seed=seed)
        step_timer.end()

        # Nobody is done right after a reset. Without this, the skip branch
        # below would read an unbound `done` in the first episode and the
        # previous episode's `done` in later ones.
        done = dict.fromkeys(list(env.get_agent_handles()) + ['__all__'], False)

        score = 0.0

        if render:
            env_renderer.set_new_rail()

        final_step = 0
        skipped = 0

        nb_hit = 0
        agent_last_obs = {}
        agent_last_action = {}

        for step in range(max_steps - 1):
            if allow_skipping and check_if_all_blocked(env):
                # FIXME why -1? bug where all agents are "done" after max_steps!
                skipped = max_steps - step - 1
                final_step = max_steps - 2
                n_unfinished_agents = sum(not done[idx] for idx in env.get_agent_handles())
                score -= skipped * n_unfinished_agents
                break

            agent_timer.start()
            for agent in env.get_agent_handles():
                if obs[agent] and info['action_required'][agent]:
                    if agent in agent_last_obs and np.all(agent_last_obs[agent] == obs[agent]):
                        # Observation unchanged: reuse the cached action
                        nb_hit += 1
                        action = agent_last_action[agent]
                    else:
                        preproc_timer.start()
                        norm_obs = normalize_observation(
                            obs[agent], tree_depth=observation_tree_depth, observation_radius=observation_radius)
                        preproc_timer.end()

                        inference_timer.start()
                        action = policy.act(norm_obs, eps=0.0)
                        inference_timer.end()

                    action_dict[agent] = action

                    if allow_caching:
                        agent_last_obs[agent] = obs[agent]
                        agent_last_action[agent] = action
            agent_timer.end()

            step_timer.start()
            obs, all_rewards, done, info = env.step(action_dict)
            step_timer.end()

            if render:
                env_renderer.render_env(
                    show=True,
                    frames=False,
                    show_observations=False,
                    show_predictions=False)

                if step % 100 == 0:
                    print("{}/{}".format(step, max_steps - 1))

            for agent in env.get_agent_handles():
                score += all_rewards[agent]

            final_step = step

            if done['__all__']:
                break

        normalized_score = score / (max_steps * env.get_num_agents())
        scores.append(normalized_score)

        tasks_finished = sum(done[idx] for idx in env.get_agent_handles())
        completion = tasks_finished / max(1, env.get_num_agents())
        completions.append(completion)

        nb_steps.append(final_step)

        agent_times.append(agent_timer.get())
        step_times.append(step_timer.get())

        # final_step is 0 when the episode ends on its very first step
        steps_for_average = max(1, final_step)

        skipped_text = ""
        if skipped > 0:
            skipped_text = "\t⚡ Skipped {}".format(skipped)

        hit_text = ""
        if nb_hit > 0:
            hit_text = "\t⚡ Hit {} ({:.1f}%)".format(nb_hit, (100 * nb_hit) / max(1, n_agents * final_step))

        print(
            "☑️ Score: {:.3f} \tDone: {:.1f}% \tNb steps: {:.3f} "
            "\t🍭 Seed: {}"
            "\t🚉 Env: {:.3f}s "
            "\t🤖 Agent: {:.3f}s (per step: {:.3f}s) \t[preproc: {:.3f}s \tinfer: {:.3f}s]"
            "{}{}".format(
                normalized_score,
                completion * 100.0,
                final_step,
                seed,
                step_timer.get(),
                agent_timer.get(),
                agent_timer.get() / steps_for_average,
                preproc_timer.get(),
                inference_timer.get(),
                skipped_text,
                hit_text))

    return scores, completions, nb_steps, agent_times, step_times
def eval_policy(env_params, checkpoint, n_eval_episodes, max_steps, action_size, state_size, seed, render,
                allow_skipping, allow_caching, checkpoint_episode=2300,
                video_dir='/Users/nikhilvs/repos/nyu/flatland-reinforcement-learning/videos/maac-final'):
    """Evaluate per-agent ``AttentionAgent`` policies on sparse environments.

    One ``AttentionAgent`` is created per agent and its policy is loaded from
    ``<checkpoint>/<checkpoint_episode>_agent<id>.pth`` onto the CPU. For each
    evaluation episode the environment is regenerated with the next seed
    (``seed + 1``, ``seed + 2``, ...) and actions are chosen with
    ``act(agent_model, norm_obs)`` for at most ``max_steps - 1`` steps. When
    rendering, every frame is collected and the episode is saved as
    ``<video_dir>/out_<episode>.gif``. One summary line is printed per episode.

    Args:
        env_params: Mapping with the keys ``n_agents``, ``x_dim``, ``y_dim``,
            ``n_cities``, ``max_rails_between_cities``, ``max_rails_in_city``,
            ``observation_tree_depth``, ``observation_radius`` and
            ``observation_max_path_depth``.
        checkpoint: Directory that contains the per-agent policy files.
        n_eval_episodes: Number of episodes to run.
        max_steps: Step budget per episode; also used to normalise the score.
        action_size: Number of actions (``num_out_pol`` of each agent).
        state_size: Observation vector size (``num_in_pol`` of each agent).
        seed: Base random seed; incremented before every episode.
        render: If true, every step is rendered and a GIF is written.
        allow_skipping: If true, the episode stops as soon as
            ``check_if_all_blocked(env)`` is true; every unfinished agent is
            charged -1 for each remaining step.
        allow_caching: If true, the last raw observation and action of each
            agent are remembered and the action is reused while the
            observation is unchanged.
        checkpoint_episode: Prefix of the policy file names. The default
            reproduces the previously hard-coded value.
        video_dir: Directory the GIFs are written to. The default is the
            previously hard-coded, machine-specific path; pass your own
            directory when rendering elsewhere.

    Returns:
        Tuple ``(scores, completions, nb_steps, agent_times, step_times)`` of
        lists holding one entry per episode.
    """
    env_params = Namespace(**env_params)

    # Environment parameters
    n_agents = env_params.n_agents
    x_dim = env_params.x_dim
    y_dim = env_params.y_dim
    n_cities = env_params.n_cities
    max_rails_between_cities = env_params.max_rails_between_cities
    max_rails_in_city = env_params.max_rails_in_city

    # One attention agent per train, each with its own stored policy
    agents = []
    for agent_id in range(n_agents):
        agent = AttentionAgent(num_in_pol=state_size, num_out_pol=action_size, hidden_dim=256, lr=0.001)
        policy_file = os.path.join(checkpoint, f'{checkpoint_episode}_agent{agent_id}.pth')
        agent.policy = torch.load(policy_file, map_location=torch.device('cpu'))
        agent.policy.eval()
        agents.append(agent)

    # Malfunction and speed profiles
    # TODO pass these parameters properly from main!
    malfunction_parameters = MalfunctionParameters(
        malfunction_rate=1. / 2000,  # Rate of malfunctions
        min_duration=20,  # Minimal duration
        max_duration=50  # Max duration
    )

    # Only fast trains in Round 1
    speed_profiles = {
        1.: 1.0,  # Fast passenger train
        1. / 2.: 0.0,  # Fast freight train
        1. / 3.: 0.0,  # Slow commuter train
        1. / 4.: 0.0  # Slow freight train
    }

    # Observation parameters
    observation_tree_depth = env_params.observation_tree_depth
    observation_radius = env_params.observation_radius
    observation_max_path_depth = env_params.observation_max_path_depth

    # Observation builder
    predictor = ShortestPathPredictorForRailEnv(observation_max_path_depth)
    tree_observation = TreeObsForRailEnv(max_depth=observation_tree_depth, predictor=predictor)

    # Setup the environment
    env = RailEnv(
        width=x_dim,
        height=y_dim,
        rail_generator=sparse_rail_generator(
            max_num_cities=n_cities,
            grid_mode=False,
            max_rails_between_cities=max_rails_between_cities,
            max_rails_in_city=max_rails_in_city,
        ),
        schedule_generator=sparse_schedule_generator(speed_profiles),
        number_of_agents=n_agents,
        malfunction_generator_and_process_data=malfunction_from_params(malfunction_parameters),
        obs_builder_object=tree_observation)

    if render:
        env_renderer = RenderTool(
            env,
            agent_render_variant=AgentRenderVariant.AGENT_SHOWS_OPTIONS_AND_BOX,
            show_debug=False,
            screen_height=600,  # Adjust these parameters to fit your resolution
            screen_width=800)

    # Every agent starts with action 0 so the dict is complete from the first step on
    action_dict = {agent_id: 0 for agent_id in range(n_agents)}
    scores = []
    completions = []
    nb_steps = []
    agent_times = []
    step_times = []

    for episode_idx in range(n_eval_episodes):
        images = []
        seed += 1

        inference_timer = Timer()
        preproc_timer = Timer()
        agent_timer = Timer()
        step_timer = Timer()

        step_timer.start()
        obs, info = env.reset(regenerate_rail=True, regenerate_schedule=True, random_seed=seed)
        step_timer.end()

        # Nobody is done right after a reset. Without this, the skip branch
        # below would read an unbound `done` in the first episode and the
        # previous episode's `done` in later ones.
        done = dict.fromkeys(list(env.get_agent_handles()) + ['__all__'], False)

        score = 0.0

        if render:
            env_renderer.set_new_rail()

        final_step = 0
        skipped = 0

        nb_hit = 0
        agent_last_obs = {}
        agent_last_action = {}

        for step in range(max_steps - 1):
            if allow_skipping and check_if_all_blocked(env):
                # FIXME why -1? bug where all agents are "done" after max_steps!
                skipped = max_steps - step - 1
                final_step = max_steps - 2
                n_unfinished_agents = sum(not done[idx] for idx in env.get_agent_handles())
                score -= skipped * n_unfinished_agents
                break

            agent_timer.start()
            for agent in env.get_agent_handles():
                agent_model = agents[agent]
                if obs[agent] and info['action_required'][agent]:
                    if agent in agent_last_obs and np.all(agent_last_obs[agent] == obs[agent]):
                        # Observation unchanged: reuse the cached action
                        nb_hit += 1
                        action = agent_last_action[agent]
                    else:
                        preproc_timer.start()
                        norm_obs = normalize_observation(
                            obs[agent], tree_depth=observation_tree_depth, observation_radius=observation_radius)
                        preproc_timer.end()

                        inference_timer.start()
                        action = act(agent_model, norm_obs)
                        inference_timer.end()

                    action_dict[agent] = action

                    if allow_caching:
                        agent_last_obs[agent] = obs[agent]
                        agent_last_action[agent] = action
            agent_timer.end()

            step_timer.start()
            obs, all_rewards, done, info = env.step(action_dict)
            step_timer.end()

            if render:
                env_renderer.render_env(
                    show=True,
                    frames=False,
                    show_observations=False,
                    show_predictions=False)
                # Keep the frame for the episode video
                frame = PIL.Image.fromarray(env_renderer.get_image())
                images.append(frame)

                if step % 100 == 0:
                    print("{}/{}".format(step, max_steps - 1))

            for agent in env.get_agent_handles():
                score += all_rewards[agent]

            final_step = step

            if done['__all__']:
                break

        # No frame exists when the step loop never reached the renderer,
        # so only write a video if there is something to write.
        if render and images:
            # Hold the last frame for ten extra frames before the GIF loops
            images.extend([images[-1]] * 10)
            # save video
            images[0].save(
                os.path.join(video_dir, f'out_{episode_idx}.gif'),
                save_all=True,
                append_images=images[1:],
                optimize=False,
                duration=60,
                loop=0)

        normalized_score = score / (max_steps * env.get_num_agents())
        scores.append(normalized_score)

        tasks_finished = sum(done[idx] for idx in env.get_agent_handles())
        completion = tasks_finished / max(1, env.get_num_agents())
        completions.append(completion)

        nb_steps.append(final_step)

        agent_times.append(agent_timer.get())
        step_times.append(step_timer.get())

        # final_step is 0 when the episode ends on its very first step
        steps_for_average = max(1, final_step)

        skipped_text = ""
        if skipped > 0:
            skipped_text = "\t⚡ Skipped {}".format(skipped)

        hit_text = ""
        if nb_hit > 0:
            hit_text = "\t⚡ Hit {} ({:.1f}%)".format(nb_hit, (100 * nb_hit) / max(1, n_agents * final_step))

        print(
            "☑️ Score: {:.3f} \tDone: {:.1f}% \tNb steps: {:.3f} "
            "\t🍭 Seed: {}"
            "\t🚉 Env: {:.3f}s "
            "\t🤖 Agent: {:.3f}s (per step: {:.3f}s) \t[preproc: {:.3f}s \tinfer: {:.3f}s]"
            "{}{}".format(
                normalized_score,
                completion * 100.0,
                final_step,
                seed,
                step_timer.get(),
                agent_timer.get(),
                agent_timer.get() / steps_for_average,
                preproc_timer.get(),
                inference_timer.get(),
                skipped_text,
                hit_text))

    return scores, completions, nb_steps, agent_times, step_times
def eval_policy(env_params, checkpoint, n_eval_episodes, max_steps, seed, render):
    """Evaluate a saved DDDQN policy on freshly generated sparse-rail environments.

    Args:
        env_params: Mapping of environment/observation settings. It is expanded
            into a ``Namespace`` and must provide ``n_agents``, ``x_dim``,
            ``y_dim``, ``n_cities``, ``max_rails_between_cities``,
            ``max_rails_in_city``, ``observation_tree_depth``,
            ``observation_radius`` and ``observation_max_path_depth``.
        checkpoint: Path (or file object) handed to ``torch.load``; the loaded
            object replaces ``policy.qnetwork_local``.
        n_eval_episodes: Number of evaluation episodes to run.
        max_steps: Step budget per episode; the loop runs ``max_steps - 1`` steps.
        seed: Random seed passed to the ``RailEnv`` constructor.
        render: When truthy, a ``RenderTool`` window is updated after every step.

    Returns:
        Tuple ``(scores, completions, nb_steps, agent_times, step_times)`` with
        one list entry per episode.
    """
    # evaluation is faster on CPU, except if you have huge networks
    parameters = {'use_gpu': False}

    # NOTE(review): state_size and action_size are not defined in this function;
    # they have to exist at module level for this call to work.
    policy = DDDQNPolicy(state_size, action_size, Namespace(**parameters), evaluation_mode=True)
    # The policy is forced onto the CPU above, so remap the stored tensors to
    # the CPU as well; otherwise a checkpoint saved on a GPU is restored onto
    # CUDA (or fails to load on a CUDA-less machine).
    # SECURITY: torch.load unpickles the file - only load trusted checkpoints.
    policy.qnetwork_local = torch.load(checkpoint, map_location='cpu')

    env_params = Namespace(**env_params)

    # Environment parameters
    n_agents = env_params.n_agents
    x_dim = env_params.x_dim
    y_dim = env_params.y_dim
    n_cities = env_params.n_cities
    max_rails_between_cities = env_params.max_rails_between_cities
    max_rails_in_city = env_params.max_rails_in_city

    # Observation parameters
    observation_tree_depth = env_params.observation_tree_depth
    observation_radius = env_params.observation_radius
    observation_max_path_depth = env_params.observation_max_path_depth

    # Malfunction and speed profiles
    # TODO pass these parameters properly from main!
    malfunction_parameters = MalfunctionParameters(
        malfunction_rate=1. / 2000,  # Rate of malfunctions
        min_duration=20,  # Minimal duration
        max_duration=50  # Max duration
    )

    speed_profiles = {
        1.: 1.0,  # Fast passenger train
        1. / 2.: 0.0,  # Fast freight train
        1. / 3.: 0.0,  # Slow commuter train
        1. / 4.: 0.0  # Slow freight train
    }

    # Observation builder
    predictor = ShortestPathPredictorForRailEnv(observation_max_path_depth)
    tree_observation = TreeObsForRailEnv(max_depth=observation_tree_depth, predictor=predictor)

    # Setup the environment
    env = RailEnv(
        width=x_dim,
        height=y_dim,
        rail_generator=sparse_rail_generator(
            max_num_cities=n_cities,
            grid_mode=False,
            max_rails_between_cities=max_rails_between_cities,
            max_rails_in_city=max_rails_in_city),
        schedule_generator=sparse_schedule_generator(speed_profiles),
        number_of_agents=n_agents,
        malfunction_generator_and_process_data=malfunction_from_params(
            malfunction_parameters),
        obs_builder_object=tree_observation,
        random_seed=seed)
    env.reset(True, True)

    if render:
        env_renderer = RenderTool(env, gl="PGL")

    action_dict = dict()
    scores = []
    completions = []
    nb_steps = []
    agent_times = []
    step_times = []

    for _ in range(n_eval_episodes):
        inference_timer = Timer()
        preproc_timer = Timer()
        agent_timer = Timer()
        step_timer = Timer()

        agent_obs = [None] * env.get_num_agents()
        score = 0.0

        step_timer.start()
        obs, info = env.reset(regenerate_rail=True, regenerate_schedule=True)
        step_timer.end()

        if render:
            env_renderer.set_new_rail()

        final_step = 0
        # Pre-populate so the completion statistics below are well defined even
        # when the step loop does not execute (max_steps <= 1).
        done = {handle: False for handle in env.get_agent_handles()}

        for step in range(max_steps - 1):
            agent_timer.start()
            for agent in env.get_agent_handles():
                if obs[agent]:
                    preproc_timer.start()
                    agent_obs[agent] = normalize_observation(
                        obs[agent],
                        tree_depth=observation_tree_depth,
                        observation_radius=observation_radius)
                    preproc_timer.end()

                # Action 0 is sent whenever the env does not require a decision.
                action = 0
                if info['action_required'][agent]:
                    inference_timer.start()
                    action = policy.act(agent_obs[agent], eps=0.0)
                    inference_timer.end()
                action_dict.update({agent: action})
            agent_timer.end()

            step_timer.start()
            obs, all_rewards, done, info = env.step(action_dict)
            step_timer.end()

            if render:
                env_renderer.render_env(show=True,
                                        frames=False,
                                        show_observations=False,
                                        show_predictions=False)

            for agent in env.get_agent_handles():
                score += all_rewards[agent]

            final_step = step

            if done['__all__']:
                break

        # Guarded like the completion ratio below so a zero step budget or an
        # empty environment cannot raise ZeroDivisionError.
        normalized_score = score / max(1, max_steps * env.get_num_agents())
        scores.append(normalized_score)

        tasks_finished = sum(done[idx] for idx in env.get_agent_handles())
        completion = tasks_finished / max(1, env.get_num_agents())
        completions.append(completion)

        nb_steps.append(final_step)

        agent_times.append(agent_timer.get())
        step_times.append(step_timer.get())

        # final_step is a 0-based index and is 0 when the episode finishes on
        # its very first step, hence the max(1, ...) guard on the division.
        print(
            "☑️ Score: {:.3f} \tDone: {:.1f}% \tNb steps: {:.3f} "
            "\t🚉 Env: {:.3f}s "
            "\t🤖 Agent: {:.3f}s (per step: {:.3f}s) \t[preproc: {:.3f}s \tinfer: {:.3f}s]"
            .format(normalized_score, completion * 100.0, final_step,
                    step_timer.get(), agent_timer.get(),
                    agent_timer.get() / max(1, final_step),
                    preproc_timer.get(), inference_timer.get()))

    return scores, completions, nb_steps, agent_times, step_times
action_dict = dict() # Stats for each episode stats = [] shortest_paths_rewards = [] for episode in range(0, EPISODES): # Reset the environment old_observations, info = environment.reset() print(str(old_observations)) old_observations = reshape_observation(old_observations) # Reset the renderer if render: env_renderer = RenderTool(env, gl="PGL") env_renderer.set_new_rail() # Shortest path = number of intermediate states = number of states - 2 (excluding the first and the last one) shortest_paths_rewards.append(-(len(get_shortest_paths(env.distance_map, max_depth=25, agent_handle=0)[0])-2)) # Initialize variables episode_reward = 0 terminated = False # Episode stats action_counter = {0: 0, 1: 0, 2: 0, 3: 0, 4: 0} for time_step in range(TIMESTEPS): print(shortest_paths_rewards) if print_stats: print("Episode " + str(time_step) + " in episode " + str(episode + 1))
class FlatlandMultiAgentEnv(MultiAgentEnv):
    """
    Wrap a flatland RailEnv as an Rllib MultiAgentEnv.

    width, height, number_of_agents: int
    remove_agents_at_target: bool
    wait_for_all_done: bool. When true, every agent is reported as done only
        once the whole group is done, and an agent that already reached its
        target receives a reward of 0.0 on later steps.
    """

    def __init__(self,
                 width,
                 height,
                 rail_generator,
                 number_of_agents,
                 remove_agents_at_target,
                 obs_builder_object,
                 wait_for_all_done,
                 schedule_generator=random_schedule_generator(),
                 name=None):
        super().__init__()

        self.env = RailEnv(
            width=width,
            height=height,
            rail_generator=rail_generator,
            schedule_generator=schedule_generator,
            number_of_agents=number_of_agents,
            obs_builder_object=obs_builder_object,
            remove_agents_at_target=remove_agents_at_target,
        )

        self.wait_for_all_done = wait_for_all_done
        self.env_renderer = None
        self.agents_done = []
        self.frame_step = 0
        self.name = name
        self.number_of_agents = number_of_agents

        # Track when targets are reached. Only used for correct reward
        # propagation when using wait_for_all_done=True
        self.at_target = {handle: False for handle in range(self.number_of_agents)}

    def _running_agents(self):
        """
        Return IDs of the agents that are not done
        """
        agents = range(len(self.env.agents))
        return (i for i in agents if i not in self.agents_done)

    def _agents_not_at_target(self):
        """
        Return the number of agents that are not at their targets (at least 1,
        so it can safely be used as a divisor).
        Used when wait_for_all_done=True
        """
        return max(1, list(self.at_target.values()).count(False))

    def step(self, action_dict):
        """
        Env step for each agent, like a gym.step() call

        The action_dict object is a dict with str or int keys corresponding to agent IDs
        E.g: {'0': ..., '1': ..., ...} or {0: ..., 1: ..., ...}

        Return a 4-tuple of dicts keyed by agent ID:
            observations, rewards, dones (plus the "__all__" key), infos
        """
        obs, rewards, dones, infos = self.env.step(action_dict)
        all_done = dones["__all__"]

        # Computed once per step so every agent's reward is scaled by the same
        # count, independent of the order in which agents are processed below.
        reward_divisor = self._agents_not_at_target()

        o, r, d, i = {}, {}, {}, {}
        for agent in self._running_agents():
            o[agent] = obs[agent]
            r[agent] = rewards[agent] / reward_divisor
            # The complete infos dict returned by the env is attached per agent.
            i[agent] = infos

            if self.wait_for_all_done:
                # `dones` itself stays untouched so that each agent is checked
                # against its own raw done flag.
                agent_dones, r, i = self._process_all_done(agent, dones, r, i)
            else:
                agent_dones = dones
            d[agent] = agent_dones[agent]

        d["__all__"] = all_done

        for agent, done in dones.items():
            if agent == "__all__":
                continue
            finished = all_done if self.wait_for_all_done else done
            # Avoid appending the same agent again on every subsequent step.
            if finished and agent not in self.agents_done:
                self.agents_done.append(agent)

        self.frame_step += 1

        return o, r, d, i

    def reset(self):
        """
        Reset the wrapped env and the per-episode bookkeeping.

        Return a dict {agent_id: agent_obs, ...}
        """
        self.agents_done = []
        # Target flags belong to a single episode; without clearing them, agents
        # that arrived in the previous episode would keep getting zero reward.
        self.at_target = {handle: False for handle in range(self.number_of_agents)}
        obs, _ = self.env.reset()
        if self.env_renderer:
            self.env_renderer.set_new_rail()
        return obs

    def render(self, **kwargs):
        """
        Render the env (creating the renderer on first use) and return the
        image obtained from the renderer. Extra kwargs go to render_env().
        """
        from flatland.utils.rendertools import RenderTool

        if not self.env_renderer:
            self.env_renderer = RenderTool(self.env, gl="PILSVG")
            self.env_renderer.set_new_rail()
        # render_env is called twice with a short pause in between, as in the
        # original implementation.
        self.env_renderer.render_env(show=True,
                                     frames=False,
                                     show_observations=False,
                                     **kwargs)
        time.sleep(0.1)
        self.env_renderer.render_env(show=True,
                                     frames=False,
                                     show_observations=False,
                                     **kwargs)
        return self.env_renderer.get_image()

    def _process_all_done(self, agent, dones, r, i):
        """
        Apply the wait_for_all_done rules for one agent.

        `dones` must hold the raw per-agent flags from the env; it is not
        modified. Returns (masked_dones, r, i) where masked_dones is a new dict
        in which every entry equals dones['__all__'].
        """
        # Do not count target reward more than once
        if self.at_target[agent]:
            r[agent] = 0.0

        # If agent is done, and the group is not done, remember that the agent
        # has reached its target
        if dones[agent] and not dones['__all__']:
            self.at_target[agent] = True

        # Ensure each individual agent is only marked 'done' when all are done
        masked_dones = dict.fromkeys(dones, dones['__all__'])

        return masked_dones, r, i

    @property
    def action_space(self):
        """Discrete action space with 5 actions."""
        return Discrete(5)

    @property
    def observation_space(self):
        """
        Unbounded Box sized (4^0 + 4^1 + ... + 4^max_depth) * observation_dim,
        with max_depth and observation_dim read from the env's obs builder.
        """
        size, pow4 = 0, 1
        for _ in range(self.env.obs_builder.max_depth + 1):
            size += pow4
            pow4 *= 4
        observation_size = size * self.env.obs_builder.observation_dim
        return Box(-np.inf, np.inf, shape=(observation_size, ))
class FlatlandGymEnv(gym.Env):
    """
    gym.Env wrapper of the Flatland environment providing deadlocks and observation normalization.
    """

    def __init__(self,
                 rail_env,
                 custom_observations,
                 env_params,
                 render=False,
                 regenerate_rail_on_reset=True,
                 regenerate_schedule_on_reset=True):
        """
        :param rail_env: the Flatland environment to wrap
        :param custom_observations: forwarded to NormalizeObservations
        :param env_params: must provide observation_tree_depth and observation_radius
        :param render: whether a RenderTool is created on reset
        :param regenerate_rail_on_reset: forwarded to rail_env.reset(regenerate_rail=...)
        :param regenerate_schedule_on_reset: forwarded to rail_env.reset(regenerate_schedule=...)
        """
        self._regenerate_rail_on_reset = regenerate_rail_on_reset
        self._regenerate_schedule_on_reset = regenerate_schedule_on_reset
        self.rail_env = rail_env
        self.deadlocks_detector = DeadlocksDetector()

        self.observation_normalizer = NormalizeObservations(self.rail_env.obs_builder.observation_dim,
                                                            env_params.observation_tree_depth,
                                                            custom_observations,
                                                            self.rail_env.width,
                                                            self.rail_env.height,
                                                            env_params.observation_radius)

        self.state_size = self.observation_normalizer.state_size

        # NOTE(review): this instance attribute is a flag named `render`, which
        # hides any `render` method defined on the base class - confirm intended.
        self.render = render
        # Created lazily in reset(); stays None until then.
        self.env_renderer = None

    def _attach_deadlocks(self, info, deadlocks):
        """Store the per-agent deadlock entries under info["deadlocks"]."""
        info["deadlocks"] = {
            agent: deadlocks[agent]
            for agent in range(self.rail_env.get_num_agents())
        }

    def _normalize_observations(self, obs, info):
        """Replace every non-None observation in `obs` by its normalized form (in place)."""
        for agent in obs:
            if obs[agent] is not None:
                obs[agent] = self.observation_normalizer.normalize_observation(
                    obs[agent], self.rail_env, agent, info["deadlocks"][agent])
        return obs

    def reset(self):
        """
        Reset the wrapped environment, the renderer (if enabled), the custom
        observations and the deadlock detector.

        :return: (normalized observations, info) with info["deadlocks"] filled in
        """
        obs, info = self.rail_env.reset(regenerate_rail=self._regenerate_rail_on_reset,
                                        regenerate_schedule=self._regenerate_schedule_on_reset)
        # Reset rendering
        if self.render:
            self.env_renderer = RenderTool(self.rail_env, gl="PGL")
            self.env_renderer.set_new_rail()

        # Reset custom observations
        self.observation_normalizer.reset_custom_obs(self.rail_env)

        # Compute deadlocks
        self.deadlocks_detector.reset(self.rail_env.get_num_agents())
        self._attach_deadlocks(info, self.deadlocks_detector.deadlocks)

        # Normalization
        obs = self._normalize_observations(obs, info)

        return obs, info

    def step(self, action_dict):
        """
        Normalize observations by default, update deadlocks and step.

        :param action_dict: actions forwarded to rail_env.step
        :return: (normalized observations, rewards, dones, info)
        """
        obs, rewards, dones, info = self.rail_env.step(action_dict)

        # Compute deadlocks
        deadlocks = self.deadlocks_detector.step(self.rail_env)
        self._attach_deadlocks(info, deadlocks)

        # Normalization
        obs = self._normalize_observations(obs, info)

        return obs, rewards, dones, info

    def show_render(self):
        """
        Open rendering window.

        Does nothing when rendering is disabled or no renderer exists yet
        (i.e. reset() has not been called).

        :return: result of render_env, or None when nothing was rendered
        """
        if self.render and self.env_renderer is not None:
            return self.env_renderer.render_env(
                show=True,
                frames=False,
                show_observations=False,
                show_predictions=False)

    def close(self):
        """
        Close rendering window.

        Does nothing when rendering is disabled or no renderer exists yet.

        :return: result of close_window, or None when there was nothing to close
        """
        if self.render and self.env_renderer is not None:
            return self.env_renderer.close_window()