def render_test(parameters, test_nr=0, nr_examples=5):
    for trial in range(nr_examples):
        # Reset the env
        print('Showing {} Level {} with (x_dim,y_dim) = ({},{}) and {} Agents.'.format(
            test_nr, trial, parameters[0], parameters[1], parameters[2]))
        file_name = "./Tests/{}/Level_{}.pkl".format(test_nr, trial)
        env = RailEnv(
            width=1,
            height=1,
            rail_generator=rail_from_file(file_name),
            obs_builder_object=TreeObsForRailEnv(max_depth=2),
            number_of_agents=1,
        )
        env_renderer = RenderTool(env, gl="PILSVG")
        env_renderer.set_new_rail()
        env.reset(False, False)
        env_renderer.render_env(show=True, show_observations=False)
        time.sleep(0.1)
        env_renderer.close_window()
    return
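# Hypothetical usage of render_test, assuming a ./Tests/<test_nr>/Level_<trial>.pkl
# layout on disk; `parameters` only feeds the progress message (x_dim, y_dim, n_agents).
if __name__ == "__main__":
    render_test((25, 25, 5), test_nr=0, nr_examples=5)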
def main(args):
    try:
        opts, args = getopt.getopt(args, "", ["sleep-for-animation=", ""])
    except getopt.GetoptError as err:
        print(str(err))  # will print something like "option -a not recognized"
        sys.exit(2)
    sleep_for_animation = True
    for o, a in opts:
        if o == "--sleep-for-animation":
            sleep_for_animation = str2bool(a)
        else:
            assert False, "unhandled option"

    env = RailEnv(width=7, height=7,
                  rail_generator=complex_rail_generator(nr_start_goal=10, nr_extra=1,
                                                        min_dist=5, max_dist=99999, seed=1),
                  schedule_generator=complex_schedule_generator(),
                  number_of_agents=1,
                  obs_builder_object=SingleAgentNavigationObs())

    obs, info = env.reset()
    env_renderer = RenderTool(env)
    env_renderer.render_env(show=True, frames=True, show_observations=True)

    for step in range(100):
        action = np.argmax(obs[0]) + 1
        obs, all_rewards, done, _ = env.step({0: action})
        print("Rewards: ", all_rewards, " [done=", done, "]")
        env_renderer.render_env(show=True, frames=True, show_observations=True)
        if sleep_for_animation:
            time.sleep(0.1)
        if done["__all__"]:
            break
    env_renderer.close_window()
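# str2bool is used by the option parsing above (and in the scripts below) but is
# not defined in these snippets; a minimal sketch could look like this.
def str2bool(v):
    # Treat common truthy strings as True, everything else as False.
    return str(v).lower() in ("yes", "true", "t", "1")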
action_probs = action_count / np.sum(action_count)
action_count = [1] * action_size

# Smoothed values for terminal display and for more stable hyper-parameter tuning
smoothing = 0.99
smoothed_normalized_score = smoothed_normalized_score * smoothing + normalized_score * (1.0 - smoothing)
smoothed_completion = smoothed_completion * smoothing + completion * (1.0 - smoothing)

# Print logs
if episode_idx % checkpoint_interval == 0:
    torch.save(policy.qnetwork_local,
               './checkpoints/testmulti-' + str(episode_idx) + '.pth')
    if train_params.render:
        env_renderer.close_window()

a.append(normalized_score)
b.append(completion)

print('\r🚂 Episode {}'
      '\t 🏆 Score: {:.3f}'
      ' Avg: {:.3f}'
      '\t 💯 Done: {:.2f}%'
      ' Avg: {:.2f}%'
      '\t 🎲 Epsilon: {:.2f} '
      '\t 🔀 Action Probs: {}'.format(
          episode_idx,
          normalized_score, smoothed_normalized_score,
          100 * completion, 100 * smoothed_completion,
          eps_start,
          format_action_prob(action_probs)),
      end=" ")
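# format_action_prob is referenced by the log line above but not defined in these
# snippets; a plausible sketch that renders the distribution as a compact string:
def format_action_prob(action_probs):
    # e.g. [0.1, 0.2, ...] -> "0.10 0.20 ..."
    return " ".join("{:.2f}".format(p) for p in action_probs)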
def train_agent(env_params, train_params):
    # Environment parameters
    n_agents = env_params.n_agents
    x_dim = env_params.x_dim
    y_dim = env_params.y_dim
    n_cities = env_params.n_cities
    max_rails_between_cities = env_params.max_rails_between_cities
    max_rails_in_city = env_params.max_rails_in_city
    seed = env_params.seed

    # Observation parameters
    observation_tree_depth = env_params.observation_tree_depth
    observation_radius = env_params.observation_radius
    observation_max_path_depth = env_params.observation_max_path_depth

    # Training parameters
    eps_start = train_params.eps_start
    eps_end = train_params.eps_end
    eps_decay = train_params.eps_decay
    n_episodes = train_params.n_episodes
    checkpoint_interval = train_params.checkpoint_interval
    n_eval_episodes = train_params.n_evaluation_episodes

    # Set the seeds
    random.seed(seed)
    np.random.seed(seed)

    # Break agents from time to time
    malfunction_parameters = MalfunctionParameters(
        malfunction_rate=1. / 10000,  # Rate of malfunctions
        min_duration=15,  # Minimal duration
        max_duration=50  # Max duration
    )

    # Observation builder
    predictor = ShortestPathPredictorForRailEnv(observation_max_path_depth)
    tree_observation = TreeObsForRailEnv(max_depth=observation_tree_depth, predictor=predictor)

    # Fraction of the trains running at each speed
    speed_profiles = {
        1.: 1.0,  # Fast passenger train
        1. / 2.: 0.0,  # Fast freight train
        1. / 3.: 0.0,  # Slow commuter train
        1. / 4.: 0.0  # Slow freight train
    }

    # Set up the environment
    env = RailEnv(
        width=x_dim,
        height=y_dim,
        rail_generator=sparse_rail_generator(
            max_num_cities=n_cities,
            grid_mode=False,
            max_rails_between_cities=max_rails_between_cities,
            max_rails_in_city=max_rails_in_city),
        schedule_generator=sparse_schedule_generator(speed_profiles),
        number_of_agents=n_agents,
        malfunction_generator_and_process_data=malfunction_from_params(malfunction_parameters),
        obs_builder_object=tree_observation,
        random_seed=seed)
    env.reset(regenerate_schedule=True, regenerate_rail=True)

    # Set up the renderer
    if train_params.render:
        env_renderer = RenderTool(env, gl="PGL")

    # Calculate the state size given the depth of the tree observation and the number of features
    n_features_per_node = env.obs_builder.observation_dim
    n_nodes = 0
    for i in range(observation_tree_depth + 1):
        n_nodes += np.power(4, i)
    state_size = n_features_per_node * n_nodes

    # The action space of flatland is 5 discrete actions
    action_size = 5

    # Max number of steps per episode
    # This is the official formula used during evaluations
    # See details in flatland.envs.schedule_generators.sparse_schedule_generator
    max_steps = int(4 * 2 * (env.height + env.width + (n_agents / n_cities)))

    action_count = [0] * action_size
    action_dict = dict()
    agent_obs = [None] * env.get_num_agents()
    agent_prev_obs = [None] * env.get_num_agents()
    agent_prev_action = [2] * env.get_num_agents()
    update_values = False
    smoothed_normalized_score = -1.0
    smoothed_eval_normalized_score = -1.0
    smoothed_completion = 0.0
    smoothed_eval_completion = 0.0

    # Double Dueling DQN policy
    policy = DDDQNPolicy(state_size, action_size, train_params)

    # TensorBoard writer
    writer = SummaryWriter()
    writer.add_hparams(vars(train_params), {})
    writer.add_hparams(vars(env_params), {})

    training_timer = Timer()
    training_timer.start()

    print("\n🚉 Training {} trains on {}x{} grid for {} episodes, evaluating on {} episodes every {} episodes.\n".format(
        env.get_num_agents(), x_dim, y_dim, n_episodes, n_eval_episodes, checkpoint_interval))

    for episode_idx in range(n_episodes + 1):
        # Timers
        step_timer = Timer()
        reset_timer = Timer()
        learn_timer = Timer()
        preproc_timer = Timer()

        # Reset environment
        reset_timer.start()
        obs, info = env.reset(regenerate_rail=True, regenerate_schedule=True)
        reset_timer.end()

        if train_params.render:
            env_renderer.set_new_rail()

        score = 0
        nb_steps = 0
        actions_taken = []

        # Build agent-specific observations
        for agent in env.get_agent_handles():
            if obs[agent]:
                agent_obs[agent] = normalize_observation(obs[agent], observation_tree_depth,
                                                         observation_radius=observation_radius)
                agent_prev_obs[agent] = agent_obs[agent].copy()

        # Run episode
        for step in range(max_steps - 1):
            for agent in env.get_agent_handles():
                if info['action_required'][agent]:
                    # If an action is required, we want to store the obs at that step as well as the action
                    update_values = True
                    action = policy.act(agent_obs[agent], eps=eps_start)
                    action_count[action] += 1
                    actions_taken.append(action)
                else:
                    update_values = False
                    action = 0
                action_dict.update({agent: action})

            # Environment step
            step_timer.start()
            next_obs, all_rewards, done, info = env.step(action_dict)
            step_timer.end()

            if train_params.render and episode_idx % checkpoint_interval == 0:
                env_renderer.render_env(show=True, frames=False,
                                        show_observations=False, show_predictions=False)

            for agent in range(env.get_num_agents()):
                # Update replay buffer and train agent.
                # Only update the values when we are done or when an action was taken
                # and thus relevant information is present.
                if update_values or done[agent]:
                    learn_timer.start()
                    policy.step(agent_prev_obs[agent], agent_prev_action[agent],
                                all_rewards[agent], agent_obs[agent], done[agent])
                    learn_timer.end()

                    agent_prev_obs[agent] = agent_obs[agent].copy()
                    agent_prev_action[agent] = action_dict[agent]

                # Preprocess the new observations
                if next_obs[agent]:
                    preproc_timer.start()
                    agent_obs[agent] = normalize_observation(next_obs[agent], observation_tree_depth,
                                                             observation_radius=observation_radius)
                    preproc_timer.end()

                score += all_rewards[agent]

            nb_steps = step

            if done['__all__']:
                break

        # Epsilon decay
        eps_start = max(eps_end, eps_decay * eps_start)

        # Collect information about the training
        tasks_finished = sum(done[idx] for idx in env.get_agent_handles())
        completion = tasks_finished / max(1, env.get_num_agents())
        normalized_score = score / (max_steps * env.get_num_agents())
        action_probs = action_count / np.sum(action_count)
        action_count = [1] * action_size

        # Smoothed values for terminal display and for more stable hyper-parameter tuning
        smoothing = 0.99
        smoothed_normalized_score = smoothed_normalized_score * smoothing + normalized_score * (1.0 - smoothing)
        smoothed_completion = smoothed_completion * smoothing + completion * (1.0 - smoothing)

        # Print logs
        if episode_idx % checkpoint_interval == 0:
            torch.save(policy.qnetwork_local,
                       './checkpoints/origin_multi-' + str(episode_idx) + '.pth')
            if train_params.render:
                env_renderer.close_window()

        print('\r🚂 Episode {}'
              '\t 🏆 Score: {:.3f}'
              ' Avg: {:.3f}'
              '\t 💯 Done: {:.2f}%'
              ' Avg: {:.2f}%'
              '\t 🎲 Epsilon: {:.2f} '
              '\t 🔀 Action Probs: {}'.format(
                  episode_idx,
                  normalized_score, smoothed_normalized_score,
                  100 * completion, 100 * smoothed_completion,
                  eps_start,
                  format_action_prob(action_probs)),
              end=" ")

        # Evaluate policy
        if episode_idx % train_params.checkpoint_interval == 0:
            scores, completions, nb_steps_eval = eval_policy(env, policy, n_eval_episodes, max_steps)
            writer.add_scalar("evaluation/scores_min", np.min(scores), episode_idx)
            writer.add_scalar("evaluation/scores_max", np.max(scores), episode_idx)
            writer.add_scalar("evaluation/scores_mean", np.mean(scores), episode_idx)
            writer.add_scalar("evaluation/scores_std", np.std(scores), episode_idx)
            writer.add_histogram("evaluation/scores", np.array(scores), episode_idx)
            writer.add_scalar("evaluation/completions_min", np.min(completions), episode_idx)
            writer.add_scalar("evaluation/completions_max", np.max(completions), episode_idx)
            writer.add_scalar("evaluation/completions_mean", np.mean(completions), episode_idx)
            writer.add_scalar("evaluation/completions_std", np.std(completions), episode_idx)
            writer.add_histogram("evaluation/completions", np.array(completions), episode_idx)
            writer.add_scalar("evaluation/nb_steps_min", np.min(nb_steps_eval), episode_idx)
            writer.add_scalar("evaluation/nb_steps_max", np.max(nb_steps_eval), episode_idx)
            writer.add_scalar("evaluation/nb_steps_mean", np.mean(nb_steps_eval), episode_idx)
            writer.add_scalar("evaluation/nb_steps_std", np.std(nb_steps_eval), episode_idx)
            writer.add_histogram("evaluation/nb_steps", np.array(nb_steps_eval), episode_idx)

            smoothing = 0.9
            smoothed_eval_normalized_score = smoothed_eval_normalized_score * smoothing + np.mean(scores) * (1.0 - smoothing)
            smoothed_eval_completion = smoothed_eval_completion * smoothing + np.mean(completions) * (1.0 - smoothing)
            writer.add_scalar("evaluation/smoothed_score", smoothed_eval_normalized_score, episode_idx)
            writer.add_scalar("evaluation/smoothed_completion", smoothed_eval_completion, episode_idx)

        # Save logs to TensorBoard
        writer.add_scalar("training/score", normalized_score, episode_idx)
        writer.add_scalar("training/smoothed_score", smoothed_normalized_score, episode_idx)
        writer.add_scalar("training/completion", np.mean(completion), episode_idx)
        writer.add_scalar("training/smoothed_completion", np.mean(smoothed_completion), episode_idx)
        writer.add_scalar("training/nb_steps", nb_steps, episode_idx)
        writer.add_histogram("actions/distribution", np.array(actions_taken), episode_idx)
        writer.add_scalar("actions/nothing", action_probs[RailEnvActions.DO_NOTHING], episode_idx)
        writer.add_scalar("actions/left", action_probs[RailEnvActions.MOVE_LEFT], episode_idx)
        writer.add_scalar("actions/forward", action_probs[RailEnvActions.MOVE_FORWARD], episode_idx)
        writer.add_scalar("actions/right", action_probs[RailEnvActions.MOVE_RIGHT], episode_idx)
        writer.add_scalar("actions/stop", action_probs[RailEnvActions.STOP_MOVING], episode_idx)
        writer.add_scalar("training/epsilon", eps_start, episode_idx)
        writer.add_scalar("training/buffer_size", len(policy.memory), episode_idx)
        writer.add_scalar("training/loss", policy.loss, episode_idx)
        writer.add_scalar("timer/reset", reset_timer.get(), episode_idx)
        writer.add_scalar("timer/step", step_timer.get(), episode_idx)
        writer.add_scalar("timer/learn", learn_timer.get(), episode_idx)
        writer.add_scalar("timer/preproc", preproc_timer.get(), episode_idx)
        writer.add_scalar("timer/total", training_timer.get_current(), episode_idx)
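# Hypothetical driver for train_agent above. The Namespace fields mirror the
# attributes the function reads; DDDQNPolicy will additionally need whatever
# hyperparameters (buffer size, batch size, learning rate, ...) its constructor expects.
from argparse import Namespace

if __name__ == "__main__":
    env_params = Namespace(n_agents=2, x_dim=25, y_dim=25, n_cities=2,
                           max_rails_between_cities=2, max_rails_in_city=3, seed=42,
                           observation_tree_depth=2, observation_radius=10,
                           observation_max_path_depth=30)
    train_params = Namespace(n_episodes=2500, eps_start=1.0, eps_end=0.01, eps_decay=0.99,
                             checkpoint_interval=100, n_evaluation_episodes=25, render=False)
    train_agent(env_params, train_params)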
def train_agent(train_params, train_env_params, eval_env_params, obs_params):
    # Environment parameters
    n_agents = train_env_params.n_agents
    x_dim = train_env_params.x_dim
    y_dim = train_env_params.y_dim
    n_cities = train_env_params.n_cities
    max_rails_between_cities = train_env_params.max_rails_between_cities
    max_rails_in_city = train_env_params.max_rails_in_city
    seed = train_env_params.seed

    # Unique ID for this training run
    now = datetime.now()
    training_id = now.strftime('%y%m%d%H%M%S')

    # Observation parameters
    observation_tree_depth = obs_params.observation_tree_depth
    observation_radius = obs_params.observation_radius
    observation_max_path_depth = obs_params.observation_max_path_depth

    # Training parameters
    eps_start = train_params.eps_start
    eps_end = train_params.eps_end
    eps_decay = train_params.eps_decay
    n_episodes = train_params.n_episodes
    checkpoint_interval = train_params.checkpoint_interval
    n_eval_episodes = train_params.n_evaluation_episodes
    restore_replay_buffer = train_params.restore_replay_buffer
    save_replay_buffer = train_params.save_replay_buffer

    # Set the seeds
    random.seed(seed)
    np.random.seed(seed)

    # Observation builder
    predictor = ShortestPathPredictorForRailEnv(observation_max_path_depth)
    tree_observation = TreeObsForRailEnv(max_depth=observation_tree_depth, predictor=predictor)

    # Set up the environments
    train_env = create_rail_env(train_env_params, tree_observation)
    train_env.reset(regenerate_schedule=True, regenerate_rail=True)
    eval_env = create_rail_env(eval_env_params, tree_observation)
    eval_env.reset(regenerate_schedule=True, regenerate_rail=True)

    # Set up the renderer
    if train_params.render:
        env_renderer = RenderTool(train_env, gl="PGL")

    # Calculate the state size given the depth of the tree observation and the number of features
    n_features_per_node = train_env.obs_builder.observation_dim
    n_nodes = sum([np.power(4, i) for i in range(observation_tree_depth + 1)])
    state_size = n_features_per_node * n_nodes

    # The action space of flatland is 5 discrete actions
    action_size = 5

    # Max number of steps per episode
    # This is the official formula used during evaluations
    # See details in flatland.envs.schedule_generators.sparse_schedule_generator
    # max_steps = int(4 * 2 * (env.height + env.width + (n_agents / n_cities)))
    max_steps = train_env._max_episode_steps

    action_count = [0] * action_size
    action_dict = dict()
    agent_obs = [None] * n_agents
    agent_prev_obs = [None] * n_agents
    agent_prev_action = [2] * n_agents
    update_values = [False] * n_agents

    # Smoothed values used as target for hyperparameter tuning
    smoothed_normalized_score = -1.0
    smoothed_eval_normalized_score = -1.0
    smoothed_completion = 0.0
    smoothed_eval_completion = 0.0

    # Double Dueling DQN policy
    policy = DDDQNPolicy(state_size, action_size, train_params)

    # Load an existing replay buffer
    if restore_replay_buffer:
        try:
            policy.load_replay_buffer(restore_replay_buffer)
            policy.test()
        except RuntimeError as e:
            print("\n🛑 Couldn't load replay buffer, were the experiences generated using the same tree depth?")
            print(e)
            exit(1)

    print("\n💾 Replay buffer status: {}/{} experiences".format(
        len(policy.memory.memory), train_params.buffer_size))

    hdd = psutil.disk_usage('/')
    if save_replay_buffer and (hdd.free / (2 ** 30)) < 500.0:
        print("⚠️ Careful! Saving replay buffers will quickly consume a lot of disk space. You have {:.2f}gb left.".format(
            hdd.free / (2 ** 30)))

    # TensorBoard writer
    writer = SummaryWriter()
    writer.add_hparams(vars(train_params), {})
    writer.add_hparams(vars(train_env_params), {})
    writer.add_hparams(vars(obs_params), {})

    training_timer = Timer()
    training_timer.start()

    print("\n🚉 Training {} trains on {}x{} grid for {} episodes, evaluating on {} episodes every {} episodes. Training id '{}'.\n".format(
        train_env.get_num_agents(), x_dim, y_dim,
        n_episodes, n_eval_episodes, checkpoint_interval,
        training_id))

    for episode_idx in range(n_episodes + 1):
        step_timer = Timer()
        reset_timer = Timer()
        learn_timer = Timer()
        preproc_timer = Timer()
        inference_timer = Timer()

        # Reset environment
        reset_timer.start()
        obs, info = train_env.reset(regenerate_rail=True, regenerate_schedule=True)
        reset_timer.end()

        if train_params.render:
            env_renderer.set_new_rail()

        score = 0
        nb_steps = 0
        actions_taken = []

        # Build initial agent-specific observations
        for agent in train_env.get_agent_handles():
            if obs[agent]:
                agent_obs[agent] = normalize_observation(obs[agent], observation_tree_depth,
                                                         observation_radius=observation_radius)
                agent_prev_obs[agent] = agent_obs[agent].copy()

        # Run episode
        for step in range(max_steps - 1):
            inference_timer.start()
            for agent in train_env.get_agent_handles():
                if info['action_required'][agent]:
                    update_values[agent] = True
                    action = policy.act(agent_obs[agent], eps=eps_start)
                    action_count[action] += 1
                    actions_taken.append(action)
                else:
                    # An action is not required if the train hasn't joined the railway network,
                    # if it already reached its target, or if it is currently malfunctioning.
                    update_values[agent] = False
                    action = 0
                action_dict.update({agent: action})
            inference_timer.end()

            # Environment step
            step_timer.start()
            next_obs, all_rewards, done, info = train_env.step(action_dict)
            step_timer.end()

            # Render an episode at some interval
            if train_params.render and episode_idx % checkpoint_interval == 0:
                env_renderer.render_env(show=True, frames=False,
                                        show_observations=False, show_predictions=False)

            # Update replay buffer and train agent
            for agent in train_env.get_agent_handles():
                if update_values[agent] or done['__all__']:
                    # Only learn from timesteps where something happened
                    learn_timer.start()
                    policy.step(agent_prev_obs[agent], agent_prev_action[agent],
                                all_rewards[agent], agent_obs[agent], done[agent])
                    learn_timer.end()

                    agent_prev_obs[agent] = agent_obs[agent].copy()
                    agent_prev_action[agent] = action_dict[agent]

                # Preprocess the new observations
                if next_obs[agent]:
                    preproc_timer.start()
                    agent_obs[agent] = normalize_observation(next_obs[agent], observation_tree_depth,
                                                             observation_radius=observation_radius)
                    preproc_timer.end()

                score += all_rewards[agent]

            nb_steps = step

            if done['__all__']:
                break

        # Epsilon decay
        eps_start = max(eps_end, eps_decay * eps_start)

        # Collect information about the training
        tasks_finished = sum(done[idx] for idx in train_env.get_agent_handles())
        completion = tasks_finished / max(1, train_env.get_num_agents())
        normalized_score = score / (max_steps * train_env.get_num_agents())
        action_probs = action_count / np.sum(action_count)
        action_count = [1] * action_size

        smoothing = 0.99
        smoothed_normalized_score = smoothed_normalized_score * smoothing + normalized_score * (1.0 - smoothing)
        smoothed_completion = smoothed_completion * smoothing + completion * (1.0 - smoothing)

        # Print logs
        if episode_idx % checkpoint_interval == 0:
            torch.save(policy.qnetwork_local,
                       './checkpoints/' + training_id + '-' + str(episode_idx) + '.pth')
            if save_replay_buffer:
                policy.save_replay_buffer('./replay_buffers/' + training_id + '-' + str(episode_idx) + '.pkl')
            if train_params.render:
                env_renderer.close_window()

        print('\r🚂 Episode {}'
              '\t 🏆 Score: {:.3f}'
              ' Avg: {:.3f}'
              '\t 💯 Done: {:.2f}%'
              ' Avg: {:.2f}%'
              '\t 🎲 Epsilon: {:.3f} '
              '\t 🔀 Action Probs: {}'.format(
                  episode_idx,
                  normalized_score, smoothed_normalized_score,
                  100 * completion, 100 * smoothed_completion,
                  eps_start,
                  format_action_prob(action_probs)),
              end=" ")

        # Evaluate policy and log results at some interval
        if episode_idx % checkpoint_interval == 0 and n_eval_episodes > 0:
            scores, completions, nb_steps_eval = eval_policy(eval_env, policy, train_params, obs_params)

            writer.add_scalar("evaluation/scores_min", np.min(scores), episode_idx)
            writer.add_scalar("evaluation/scores_max", np.max(scores), episode_idx)
            writer.add_scalar("evaluation/scores_mean", np.mean(scores), episode_idx)
            writer.add_scalar("evaluation/scores_std", np.std(scores), episode_idx)
            writer.add_histogram("evaluation/scores", np.array(scores), episode_idx)
            writer.add_scalar("evaluation/completions_min", np.min(completions), episode_idx)
            writer.add_scalar("evaluation/completions_max", np.max(completions), episode_idx)
            writer.add_scalar("evaluation/completions_mean", np.mean(completions), episode_idx)
            writer.add_scalar("evaluation/completions_std", np.std(completions), episode_idx)
            writer.add_histogram("evaluation/completions", np.array(completions), episode_idx)
            writer.add_scalar("evaluation/nb_steps_min", np.min(nb_steps_eval), episode_idx)
            writer.add_scalar("evaluation/nb_steps_max", np.max(nb_steps_eval), episode_idx)
            writer.add_scalar("evaluation/nb_steps_mean", np.mean(nb_steps_eval), episode_idx)
            writer.add_scalar("evaluation/nb_steps_std", np.std(nb_steps_eval), episode_idx)
            writer.add_histogram("evaluation/nb_steps", np.array(nb_steps_eval), episode_idx)

            smoothing = 0.9
            smoothed_eval_normalized_score = smoothed_eval_normalized_score * smoothing + np.mean(scores) * (1.0 - smoothing)
            smoothed_eval_completion = smoothed_eval_completion * smoothing + np.mean(completions) * (1.0 - smoothing)
            writer.add_scalar("evaluation/smoothed_score", smoothed_eval_normalized_score, episode_idx)
            writer.add_scalar("evaluation/smoothed_completion", smoothed_eval_completion, episode_idx)

        # Save logs to TensorBoard
        writer.add_scalar("training/score", normalized_score, episode_idx)
        writer.add_scalar("training/smoothed_score", smoothed_normalized_score, episode_idx)
        writer.add_scalar("training/completion", np.mean(completion), episode_idx)
        writer.add_scalar("training/smoothed_completion", np.mean(smoothed_completion), episode_idx)
        writer.add_scalar("training/nb_steps", nb_steps, episode_idx)
        writer.add_histogram("actions/distribution", np.array(actions_taken), episode_idx)
        writer.add_scalar("actions/nothing", action_probs[RailEnvActions.DO_NOTHING], episode_idx)
        writer.add_scalar("actions/left", action_probs[RailEnvActions.MOVE_LEFT], episode_idx)
        writer.add_scalar("actions/forward", action_probs[RailEnvActions.MOVE_FORWARD], episode_idx)
        writer.add_scalar("actions/right", action_probs[RailEnvActions.MOVE_RIGHT], episode_idx)
        writer.add_scalar("actions/stop", action_probs[RailEnvActions.STOP_MOVING], episode_idx)
        writer.add_scalar("training/epsilon", eps_start, episode_idx)
        writer.add_scalar("training/buffer_size", len(policy.memory), episode_idx)
        writer.add_scalar("training/loss", policy.loss, episode_idx)
        writer.add_scalar("timer/reset", reset_timer.get(), episode_idx)
        writer.add_scalar("timer/step", step_timer.get(), episode_idx)
        writer.add_scalar("timer/learn", learn_timer.get(), episode_idx)
        writer.add_scalar("timer/preproc", preproc_timer.get(), episode_idx)
        writer.add_scalar("timer/total", training_timer.get_current(), episode_idx)
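# create_rail_env is called by the second train_agent above but is not shown in
# these snippets; a sketch of what it plausibly does, assuming env_params also
# carries a malfunction_rate field (an assumption, not confirmed by the source):
def create_rail_env(env_params, tree_observation):
    malfunction_parameters = MalfunctionParameters(
        malfunction_rate=env_params.malfunction_rate,  # assumed field
        min_duration=20,
        max_duration=50)
    return RailEnv(
        width=env_params.x_dim,
        height=env_params.y_dim,
        rail_generator=sparse_rail_generator(
            max_num_cities=env_params.n_cities,
            grid_mode=False,
            max_rails_between_cities=env_params.max_rails_between_cities,
            max_rails_in_city=env_params.max_rails_in_city),
        schedule_generator=sparse_schedule_generator(),
        number_of_agents=env_params.n_agents,
        malfunction_generator_and_process_data=malfunction_from_params(malfunction_parameters),
        obs_builder_object=tree_observation,
        random_seed=env_params.seed)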
def run_episode(kwargs) -> List[Trajectory]:
    """Run a single episode and collect the trajectories of each agent."""
    total_controller_time = 0
    env_dict: Callable = kwargs.get("env_dict")
    obs_builder = kwargs.get("obs_builder")
    controller_creator: Callable = kwargs.get("controller_creator")
    episode_id: int = kwargs.get("episode_id")
    max_episode_length: int = kwargs.get("max_episode_length", 1000)
    render: bool = kwargs.get("render", False)

    # Create and start the environment
    _env = load_env(env_dict, obs_builder_object=obs_builder)
    obs, info = _env.reset(regenerate_rail=False, regenerate_schedule=True)
    score = 0
    _trajectories = [Trajectory() for _ in _env.get_agent_handles()]

    # Create and start the controller
    controller: AbstractController = controller_creator()
    start = time.time()
    controller.start_of_round(obs=obs, env=_env)
    total_controller_time += time.time() - start

    if render:
        env_renderer = RenderTool(_env)
        env_renderer.reset()

    for step in range(max_episode_length):
        start = time.time()
        action_dict, processed_obs = controller.act(observation=obs)
        total_controller_time += time.time() - start
        next_obs, all_rewards, done, info = _env.step(action_dict)

        if render:
            env_renderer.render_env(show=True, show_observations=True, show_predictions=False)

        # Save state, action, reward and done flag for each agent
        for agent_handle in _env.get_agent_handles():
            _trajectories[agent_handle].add_row(
                state=processed_obs[agent_handle],
                action=action_dict[agent_handle],
                reward=all_rewards[agent_handle],
                done=done[agent_handle])

        score += sum(all_rewards)
        obs = next_obs.copy()
        if done['__all__']:
            break

    if render:
        env_renderer.close_window()

    # print(f"\nController took a total time of: {total_controller_time} seconds", flush=True)
    return _trajectories
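# Hypothetical usage of run_episode: collect trajectories from several episodes in
# parallel. env_dict, obs_builder and make_controller are assumed to exist elsewhere.
from multiprocessing import Pool

episode_args = [dict(env_dict=env_dict,
                     obs_builder=obs_builder,
                     controller_creator=make_controller,  # assumed controller factory
                     episode_id=i,
                     max_episode_length=500,
                     render=False)
                for i in range(8)]
with Pool(processes=4) as pool:
    all_trajectories = pool.map(run_episode, episode_args)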
def main(args):
    try:
        opts, args = getopt.getopt(args, "", ["sleep-for-animation=", ""])
    except getopt.GetoptError as err:
        print(str(err))  # will print something like "option -a not recognized"
        sys.exit(2)
    sleep_for_animation = True
    for o, a in opts:
        if o == "--sleep-for-animation":
            sleep_for_animation = str2bool(a)
        else:
            assert False, "unhandled option"

    test_envs_root = "./railway"
    test_env_file_path = "testing_stuff.pkl"
    test_env_file_path = os.path.join(test_envs_root, test_env_file_path)

    x_dim = 7
    y_dim = 7
    n_agents = 4
    random_seed = 1  # NOTE: not defined in the original snippet; assumed fixed seed

    stochastic_data = {'prop_malfunction': 0.05,  # Percentage of defective agents
                       'malfunction_rate': 100,   # Rate of malfunction occurrence
                       'min_duration': 20,        # Minimal duration of malfunction
                       'max_duration': 50         # Max duration of malfunction
                       }

    # Different agent types (trains) with different speeds.
    speed_ration_map = {1.: 0.25,       # Fast passenger train
                        1. / 2.: 0.25,  # Fast freight train
                        1. / 3.: 0.25,  # Slow commuter train
                        1. / 4.: 0.25}  # Slow freight train

    # Alternative: load the environment from file instead of generating it.
    # env = RailEnv(width=1, height=1, rail_generator=rail_from_file(test_env_file_path),
    #               schedule_generator=schedule_from_file(test_env_file_path),
    #               # malfunction_generator_and_process_data=malfunction_from_file(test_env_file_path),
    #               obs_builder_object=MultipleAgentNavigationObs(max_depth=2, predictor=ShortestPathPredictorForRailEnv(30)))
    # n_agents = env.number_of_agents

    env = RailEnv(width=x_dim, height=y_dim,
                  rail_generator=complex_rail_generator(nr_start_goal=10, nr_extra=1,
                                                        min_dist=6, max_dist=99999, seed=1),
                  # rail_generator=sparse_rail_generator(max_num_cities=3,  # Number of cities in map (where train stations are)
                  #                                      seed=1,            # Random seed
                  #                                      grid_mode=False,
                  #                                      max_rails_between_cities=2,
                  #                                      max_rails_in_city=3),
                  schedule_generator=complex_schedule_generator(speed_ration_map),
                  number_of_agents=n_agents,
                  malfunction_generator_and_process_data=malfunction_from_params(stochastic_data),
                  obs_builder_object=MultipleAgentNavigationObs(max_depth=2, predictor=ShortestPathPredictorForRailEnv(30)))

    max_steps = int(4 * 2 * (20 + env.height + env.width))
    obs, info = env.reset(regenerate_rail=True, regenerate_schedule=True, random_seed=random_seed)

    env_renderer = RenderTool(env, gl="PILSVG")
    env_renderer.render_env(show=True, frames=True, show_observations=True)

    # Reset score and done
    score = 0
    env_done = 0
    step = 0

    for step in range(max_steps):
        action_dict = {}
        for i in range(n_agents):
            if not obs:
                action_dict.update({i: 2})
            elif obs[i] is not None:
                action = np.argmax(obs[i][1:4]) + 1
                action_dict.update({i: action})

        obs, all_rewards, done, _ = env.step(action_dict)
        print("Rewards: ", all_rewards, " [done=", done, "]")

        for a in range(env.get_num_agents()):
            score += all_rewards[a] / env.get_num_agents()

        env_renderer.render_env(show=True, frames=True, show_observations=True)
        if sleep_for_animation:
            time.sleep(0.5)
        if done["__all__"]:
            break

    # Collect information about the run
    tasks_finished = 0
    for current_agent in env.agents:
        if current_agent.status == RailAgentStatus.DONE_REMOVED:
            tasks_finished += 1
    done_window = tasks_finished / max(1, env.get_num_agents())
    scores_window = score / max_steps
    print('\rTraining {} Agents on ({},{}).\t Total Steps {}\t Average Score: {:.3f}\tDones: {:.2f}%\t'.format(
        env.get_num_agents(), x_dim, y_dim,
        step,
        np.mean(scores_window),
        100 * np.mean(done_window)), end=" ")

    env_renderer.close_window()
def main(args):
    try:
        opts, args = getopt.getopt(args, "", ["sleep-for-animation=", ""])
    except getopt.GetoptError as err:
        print(str(err))  # will print something like "option -a not recognized"
        sys.exit(2)
    sleep_for_animation = True
    for o, a in opts:
        if o == "--sleep-for-animation":
            sleep_for_animation = str2bool(a)
        else:
            assert False, "unhandled option"

    # NOTE: test_env_no, level_no and random_seed are not defined in the original
    # snippet; assumed to be module-level configuration values.
    test_envs_root = f"./test-envs/Test_{test_env_no}"
    test_env_file_path = f"Level_{level_no}.pkl"
    test_env_file_path = os.path.join(test_envs_root, test_env_file_path)

    x_dim = 35
    y_dim = 35
    n_agents = 10

    stochastic_data = {'prop_malfunction': 0.05,  # Percentage of defective agents
                       'malfunction_rate': 100,   # Rate of malfunction occurrence
                       'min_duration': 2,         # Minimal duration of malfunction
                       'max_duration': 5          # Max duration of malfunction
                       }

    # Different agent types (trains) with different speeds.
    speed_ration_map = {1.: 0.25,       # Fast passenger train
                        1. / 2.: 0.25,  # Fast freight train
                        1. / 3.: 0.25,  # Slow commuter train
                        1. / 4.: 0.25}  # Slow freight train

    env = RailEnv(width=x_dim, height=y_dim,
                  # rail_generator=complex_rail_generator(nr_start_goal=10, nr_extra=1, min_dist=6, max_dist=99999, seed=1),
                  rail_generator=sparse_rail_generator(max_num_cities=3,  # Number of cities in map (where train stations are)
                                                       seed=1,            # Random seed
                                                       grid_mode=False,
                                                       max_rails_between_cities=2,
                                                       max_rails_in_city=3),
                  # schedule_generator=complex_schedule_generator(speed_ration_map),
                  schedule_generator=sparse_schedule_generator(speed_ration_map),
                  number_of_agents=n_agents,
                  malfunction_generator_and_process_data=malfunction_from_params(stochastic_data),
                  obs_builder_object=MultipleAgentNavigationObs(max_depth=2, predictor=ShortestPathPredictorForRailEnv(30)))

    # Alternative: load a test environment from file.
    # print(f"Testing Environment: {test_env_file_path} with seed: {random_seed}")
    # env = RailEnv(width=1, height=1, rail_generator=rail_from_file(test_env_file_path),
    #               schedule_generator=schedule_from_file(test_env_file_path),
    #               malfunction_generator_and_process_data=malfunction_from_file(test_env_file_path),
    #               obs_builder_object=MultipleAgentNavigationObs(max_depth=2, predictor=ShortestPathPredictorForRailEnv(30)))

    obs, info = env.reset(regenerate_rail=True, regenerate_schedule=True,
                          activate_agents=False, random_seed=random_seed)
    n_agents = env.get_num_agents()
    x_dim, y_dim = env.width, env.height
    max_steps = int(4 * 2 * (20 + env.height + env.width))

    env_renderer = RenderTool(env, gl="PILSVG")
    env_renderer.render_env(show=True, frames=True, show_observations=True)

    # Reset score and done
    score = 0
    env_done = 0
    step = 0

    for step in range(max_steps):
        # Grab the prediction data from any agent that has an observation
        for i in range(n_agents):
            if obs[i] is not None:
                observations, prediction_data, prediction_pos = obs[i]
                break

        action_dict = {}
        next_shortest_actions = 2 * np.ones(n_agents)
        next_next_shortest_actions = 2 * np.ones(n_agents)
        agent_conflicts = np.zeros((n_agents, n_agents))
        agent_conflicts_count = np.zeros((n_agents, n_agents))
        minDist = -1 * np.ones(n_agents)
        incDiff1 = -1 * np.ones(n_agents)
        incDiff2 = -1 * np.ones(n_agents)
        malfunc = np.zeros(n_agents)
        speed = np.ones(n_agents)
        pos_frac = np.ones(n_agents)
        agent_num_conflicts = []
        vals = []
        counts = []
        counter = np.zeros(n_agents)

        # Count predicted position overlaps over the 30-step prediction horizon
        for i in range(30):
            pos = prediction_pos[i]
            val, count = np.unique(pos, return_counts=True)
            if val[0] == -1:
                val = val[1:]
                count = count[1:]
            vals.append(val)
            counts.append(count)
            for j, curVal in enumerate(val):
                curCount = count[j]
                if curCount > 1:
                    idxs = np.argwhere(pos == curVal)
                    lsIdx = [int(x) for x in idxs]
                    combs = list(combinations(lsIdx, 2))
                    for k, comb in enumerate(combs):
                        counter[comb[0]] += 1
                        counter[comb[1]] += 1
                        agent_conflicts_count[comb[0], comb[1]] = (counter[comb[0]] + counter[comb[1]]) / 2
                        if agent_conflicts[comb[0], comb[1]] == 0:
                            agent_conflicts[comb[0], comb[1]] = i
                        else:
                            agent_conflicts[comb[0], comb[1]] = min(i, agent_conflicts[comb[0], comb[1]])

        for i in range(n_agents):
            agent_num_conflicts.append(sum(agent_conflicts[i, :]))
            if not obs or obs is None or obs[i] is None:
                action_dict.update({i: 2})
            elif obs[i][0] is not None:
                shortest_action = np.argmax(obs[i][0][1:4]) + 1
                next_shortest_action = np.argmax(obs[i][0][5:7]) + 1
                next_next_shortest_action = np.argmax(obs[i][0][8:10]) + 1
                next_shortest_actions[i] = next_shortest_action
                next_next_shortest_actions[i] = next_next_shortest_action
                malfunc[i] = obs[i][0][-3]
                speed[i] = obs[i][0][-2]
                pos_frac[i] = obs[i][0][-1]
                minDist[i] = obs[i][0][0]
                incDiff1[i] = obs[i][0][-5]
                incDiff2[i] = obs[i][0][-4]
                action_dict.update({i: shortest_action})
            else:
                action_dict.update({i: 2})

        # Find malfunctioning agents involved in at least one predicted conflict.
        # Initialized as an empty index tuple so the loop below is skipped when
        # there are no conflicts (the original initialized a bare np.array(-1),
        # which would crash on indexing).
        mal_agents = (np.array([], dtype=int),)
        for i in range(n_agents):
            if agent_num_conflicts[i] > 0:
                mal_agents = np.where(malfunc > 0)

        for i, mal_agent in enumerate(mal_agents[0]):
            if mal_agent is None:
                break
            conflict_agents = np.where(agent_conflicts[:, int(mal_agent)] > 0)
            for j, cur_conflict_agent in enumerate(conflict_agents[0]):
                cur_conflict_agent = int(cur_conflict_agent)
                steps_conflict = agent_conflicts[cur_conflict_agent, mal_agent]
                if steps_conflict <= 3:
                    if incDiff1[cur_conflict_agent] == -1:
                        if int(minDist[cur_conflict_agent]) >= 5:
                            action_dict.update({cur_conflict_agent: 4})
                    elif agent_conflicts_count[cur_conflict_agent, mal_agent] > 1:
                        action_dict.update({cur_conflict_agent: 4})
                    elif minDist[cur_conflict_agent] > incDiff1[cur_conflict_agent]:
                        action_dict.update({cur_conflict_agent: 4})
                    else:
                        action_dict.update({cur_conflict_agent: next_shortest_actions[cur_conflict_agent]})

        obs, all_rewards, done, _ = env.step(action_dict)
        print("Rewards: ", all_rewards, " [done=", done, "]")

        for a in range(env.get_num_agents()):
            score += all_rewards[a] / env.get_num_agents()

        env_renderer.render_env(show=True, frames=True, show_observations=True)
        if sleep_for_animation:
            time.sleep(0.5)
        if done["__all__"]:
            break

    # Collect information about the run
    tasks_finished = 0
    for current_agent in env.agents:
        if current_agent.status == RailAgentStatus.DONE_REMOVED:
            tasks_finished += 1
    done_window = tasks_finished / max(1, env.get_num_agents())
    scores_window = score / max_steps
    print('\rTraining {} Agents on ({},{}).\t Total Steps {}\t Average Score: {:.3f}\tDones: {:.2f}%\t'.format(
        env.get_num_agents(), x_dim, y_dim,
        step,
        np.mean(scores_window),
        100 * np.mean(done_window)), end=" ")

    env_renderer.close_window()
def main(args):
    try:
        opts, args = getopt.getopt(args, "", ["sleep-for-animation=", ""])
    except getopt.GetoptError as err:
        print(str(err))  # will print something like "option -a not recognized"
        sys.exit(2)
    sleep_for_animation = True
    for o, a in opts:
        if o == "--sleep-for-animation":
            sleep_for_animation = str2bool(a)
        else:
            assert False, "unhandled option"

    # NOTE: test_env_no, level_no and random_seed are not defined in the original
    # snippet; assumed to be module-level configuration values.
    test_envs_root = f"./test-envs/Test_{test_env_no}"
    test_env_file_path = f"Level_{level_no}.pkl"
    test_env_file_path = os.path.join(test_envs_root, test_env_file_path)
    print(f"Testing Environment: {test_env_file_path} with seed: {random_seed}")

    env = RailEnv(width=1, height=1,
                  rail_generator=rail_from_file(test_env_file_path),
                  schedule_generator=schedule_from_file(test_env_file_path),
                  malfunction_generator_and_process_data=malfunction_from_file(test_env_file_path),
                  obs_builder_object=MultipleAgentNavigationObs(max_depth=2, predictor=ShortestPathPredictorForRailEnv(30)))

    max_steps = int(4 * 2 * (20 + env.height + env.width))
    obs, info = env.reset(regenerate_rail=True, regenerate_schedule=True,
                          activate_agents=False, random_seed=random_seed)

    env_renderer = RenderTool(env, gl="PILSVG")
    env_renderer.render_env(show=True, frames=True, show_observations=True)

    n_agents = env.get_num_agents()
    x_dim, y_dim = env.width, env.height

    # Reset score and done
    score = 0
    env_done = 0
    step = 0

    for step in range(max_steps):
        action_dict = {}
        for i in range(n_agents):
            if not obs:
                action_dict.update({i: 2})
            elif obs[i] is not None:
                action = np.argmax(obs[i][1:4]) + 1
                action_dict.update({i: action})

        obs, all_rewards, done, _ = env.step(action_dict)
        print("Rewards: ", all_rewards, " [done=", done, "]")

        for a in range(env.get_num_agents()):
            score += all_rewards[a] / env.get_num_agents()

        env_renderer.render_env(show=True, frames=True, show_observations=True)
        if sleep_for_animation:
            time.sleep(0.5)
        if done["__all__"]:
            break

    # Collect information about the run
    tasks_finished = 0
    for current_agent in env.agents:
        if current_agent.status == RailAgentStatus.DONE_REMOVED:
            tasks_finished += 1
    done_window = tasks_finished / max(1, env.get_num_agents())
    scores_window = score / max_steps
    print('\rTraining {} Agents on ({},{}).\t Total Steps {}\t Average Score: {:.3f}\tDones: {:.2f}%\t'.format(
        n_agents, x_dim, y_dim,
        step,
        np.mean(scores_window),
        100 * np.mean(done_window)), end=" ")

    env_renderer.close_window()
def evaluate_remote():
    remote_client = FlatlandRemoteClient()
    my_observation_builder = SimpleObservation(max_depth=3, neighbours_depth=3,
                                               timetable=Judge(LinearOnAgentNumberSizeGenerator(0.03, 5),
                                                               lr=0, batch_size=0, optimization_epochs=0,
                                                               device=torch.device("cpu")),
                                               deadlock_checker=DeadlockChecker(),
                                               greedy_checker=GreedyChecker(),
                                               parallel=False, eval=True)
    params = torch.load("generated/params.torch")
    params.neighbours_depth = my_observation_builder.neighbours_depth
    controller = PPOController(params, torch.device("cpu"))
    controller.load_controller("generated/controller.torch")
    my_observation_builder.timetable.load_judge("generated/judge.torch")
    render = False

    sum_reward, sum_percent_done = 0., 0.
    for evaluation_number in itertools.count():
        time_start = time.time()
        observation, info = remote_client.env_create(obs_builder_object=my_observation_builder)
        if not observation:
            break
        local_env = FlatlandWrapper(remote_client.env, FakeRewardShaper())
        local_env.n_agents = len(local_env.agents)
        log().check_time()
        if render:
            env_renderer = RenderTool(local_env.env,
                                      agent_render_variant=AgentRenderVariant.ONE_STEP_BEHIND,
                                      show_debug=True,
                                      screen_height=600,
                                      screen_width=800)
        env_creation_time = time.time() - time_start
        print("Evaluation Number : {}".format(evaluation_number))

        time_taken_by_controller = []
        time_taken_per_step = []
        steps = 0
        done = defaultdict(lambda: False)
        while True:
            try:
                if render:
                    env_renderer.render_env(show=True, show_observations=False, show_predictions=False)

                time_start = time.time()
                action_dict = dict()
                handles_to_ask = list()
                observation = {k: torch.tensor(v, dtype=torch.float)
                               for k, v in observation.items() if v is not None}
                for i in range(local_env.n_agents):
                    if not done[i]:
                        if local_env.obs_builder.greedy_checker.greedy_position(i):
                            action_dict[i] = 0
                        elif i in observation:
                            handles_to_ask.append(i)
                for handle in handles_to_ask:
                    for opp_handle in local_env.obs_builder.encountered[handle]:
                        if opp_handle != -1 and opp_handle not in observation:
                            observation[opp_handle] = torch.tensor(local_env.obs_builder._get_internal(opp_handle),
                                                                   dtype=torch.float)
                time_taken_per_step.append(time.time() - time_start)

                time_start = time.time()
                controller_actions = controller.fast_select_actions(handles_to_ask, observation,
                                                                    local_env.obs_builder.encountered,
                                                                    train=True)
                action_dict.update(controller_actions)
                action_dict = {k: local_env.transform_action(k, v) for k, v in action_dict.items()}
                action_dict = {handle: action for handle, action in action_dict.items() if action != -1}
                time_taken = time.time() - time_start
                time_taken_by_controller.append(time_taken)

                time_start = time.time()
                observation, all_rewards, done, info = remote_client.env_step(action_dict)

                num_done = sum([1 for agent in local_env.agents
                                if agent.status == RailAgentStatus.DONE_REMOVED])
                num_started = sum([1 for handle in range(len(local_env.agents))
                                   if local_env.obs_builder.timetable.is_ready(handle)])
                finished_handles = [handle for handle in range(len(local_env.agents))
                                    if local_env.obs_builder.timetable.ready_to_depart[handle] == 2]
                reward = torch.sum(local_env._max_episode_steps
                                   - local_env.obs_builder.timetable.end_time[finished_handles])
                reward /= len(local_env.agents) * local_env._max_episode_steps
                percent_done = float(num_done) / len(local_env.agents)
                deadlocked = int(sum(local_env.obs_builder.deadlock_checker._is_deadlocked) + 0.5)

                steps += 1
                time_taken = time.time() - time_start
                time_taken_per_step.append(time_taken)

                if done['__all__']:
                    print("Done agents {}/{}".format(num_done, len(local_env.agents)))
                    print("Started agents {}/{}".format(num_started, len(local_env.agents)))
                    print("Deadlocked agents {}/{}".format(deadlocked, len(local_env.agents)))
                    print("Reward: {} Percent done: {}".format(reward, percent_done))
                    sum_reward += reward
                    sum_percent_done += percent_done
                    print("Total reward: {} Avg percent done: {}".format(
                        sum_reward, sum_percent_done / (evaluation_number + 1)))
                    if render:
                        env_renderer.close_window()
                    break
            except TimeoutException as err:
                print("Timeout! Will skip this episode and go to the next.", err)
                break

        np_time_taken_by_controller = np.array(time_taken_by_controller)
        np_time_taken_per_step = np.array(time_taken_per_step)
        print("=" * 100)
        print("=" * 100)
        print("Evaluation Number : ", evaluation_number)
        print("Current Env Path : ", remote_client.current_env_path)
        print("Env Creation Time : ", env_creation_time)
        print("Number of Steps : {}/{}".format(steps, local_env._max_episode_steps))
        print("Mean/Std/Sum of Time taken by Controller : ",
              np_time_taken_by_controller.mean(),
              np_time_taken_by_controller.std(),
              np_time_taken_by_controller.sum())
        print("Mean/Std/Sum of Time per Step : ",
              np_time_taken_per_step.mean(),
              np_time_taken_per_step.std(),
              np_time_taken_per_step.sum())
        log().print_time_metrics()
        log().zero_time_metrics()
        print("=" * 100)
        print("\n\n")

    print("Evaluation of all environments complete...")
    print(remote_client.submit())
class SingleAgentEnvironment(Env):
    """Single-agent gym-style wrapper around a Flatland environment.

    Args:
        flatland_env: The Flatland environment
        renderer: The renderer
    """

    flatland_env = None
    renderer = None

    def __init__(self, flatland_env, renderer=None):
        self.flatland_env = flatland_env
        self.renderer = renderer
        self.reward_range = (-1, 1)
        self.action_space = Discrete(5)
        self.observation_space = Discrete(5)

    def step(self, action_dict):
        """Execute an action.

        Args:
            action_dict: the dictionary agent -> action to perform

        Returns:
            new_observation: The new observation for each agent
            reward: The reward for each agent
            done: True if an agent has concluded
            info: Some info for each agent
        """
        return self.flatland_env.step(action_dict)

    def reset(self):
        """Reset the environment and return an observation.

        Returns:
            observation: The new observation
        """
        return self.flatland_env.reset(regenerate_rail=False,
                                       regenerate_schedule=False,
                                       random_seed=True)

    def render(self, mode='human'):
        """Render the environment."""
        # TODO: Merge both strategies (Jupyter vs .py)
        # In .py files:
        # self.renderer.render_env(show=False, show_observations=False, show_predictions=False)
        # In Jupyter notebooks:
        env_renderer = RenderTool(self.flatland_env, gl="PILSVG")
        env_renderer.render_env()
        image = env_renderer.get_image()
        pil_image = Image.fromarray(image)
        display(pil_image)
        return image

    def reset_renderer(self):
        """Reset the renderer of the environment."""
        self.renderer = RenderTool(self.flatland_env, gl="PILSVG",
                                   agent_render_variant=AgentRenderVariant.AGENT_SHOWS_OPTIONS_AND_BOX,
                                   show_debug=True,
                                   screen_height=700,
                                   screen_width=1300)

    def close_window(self):
        self.renderer.close_window()
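# Hypothetical usage of the wrapper, assuming flatland_env is an existing RailEnv.
env = SingleAgentEnvironment(flatland_env)
obs, info = env.reset()
obs, rewards, done, info = env.step({0: 2})  # agent 0: MOVE_FORWARD
env.reset_renderer()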
class FlatlandRenderWrapper(RailEnv, gym.Env):
    # reward_range = (-float('inf'), float('inf'))
    # spec = None

    # Set these in ALL subclasses
    # observation_space = None

    def __init__(self, use_renderer=False, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.use_renderer = use_renderer
        self.renderer = None
        self.metadata = {
            'render.modes': ['human', 'rgb_array'],
            'video.frames_per_second': 10,
            'semantics.autoreset': True
        }
        if self.use_renderer:
            self.initialize_renderer()

    def reset(self, *args, **kwargs):
        if self.use_renderer:
            if self.renderer:  # TODO: Errors with RLlib when the renderer is None.
                self.renderer.reset()
        return super().reset(*args, **kwargs)

    def render(self, mode='human'):
        """
        Render the environment's behavior to a window which should be
        readable to the human eye when mode is set to 'human'.
        """
        if not self.use_renderer:
            return

        if not self.renderer:
            self.initialize_renderer(mode=mode)

        return self.update_renderer(mode=mode)

    def initialize_renderer(self, mode="human"):
        # Initiate the renderer
        from flatland.utils.rendertools import RenderTool, AgentRenderVariant
        self.renderer = RenderTool(
            self,
            gl="PGL",  # gl="TKPILSVG",
            agent_render_variant=AgentRenderVariant.ONE_STEP_BEHIND,
            show_debug=False,
            screen_height=600,  # Adjust these parameters to fit your resolution
            screen_width=800)   # Adjust these parameters to fit your resolution
        self.renderer.show = False

    def update_renderer(self, mode='human'):
        image = self.renderer.render_env(show=False, show_observations=False,
                                         show_predictions=False, return_image=True)
        return image[:, :, :3]

    def set_renderer(self, renderer):
        self.use_renderer = renderer
        if self.use_renderer:
            self.initialize_renderer(mode=self.use_renderer)

    def close(self):
        super().close()
        if self.renderer:
            try:
                if self.renderer.show:
                    self.renderer.close_window()
            except Exception as e:
                # The last step is skipped by RLlib when a stopping criterion fires,
                # so done never becomes True and the env does not close itself. The
                # env is finally closed when RLlib exits, but by then there is no
                # window anymore, hence this error is caught and reported.
                print("Could not close window due to:", e)
            self.renderer = None
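# Hypothetical usage: the wrapper behaves like a gym.Env around RailEnv, so the
# usual RailEnv constructor arguments are forwarded through **kwargs.
env = FlatlandRenderWrapper(use_renderer=True,
                            width=25, height=25,
                            rail_generator=sparse_rail_generator(max_num_cities=2),
                            number_of_agents=2)
obs, info = env.reset()
frame = env.render(mode='rgb_array')  # HxWx3 array; None when use_renderer is False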
def main(args):
    try:
        opts, args = getopt.getopt(args, "", ["sleep-for-animation=", ""])
    except getopt.GetoptError as err:
        print(str(err))  # will print something like "option -a not recognized"
        sys.exit(2)
    sleep_for_animation = True
    for o, a in opts:
        if o == "--sleep-for-animation":
            sleep_for_animation = str2bool(a)
        else:
            assert False, "unhandled option"

    batch_builder = SampleBatchBuilder()  # or MultiAgentSampleBatchBuilder
    writer = JsonWriter("./out/")

    # Setting these 2 parameters to True can slow down training
    visuals = False
    sleep_for_animation = False

    if visuals:
        from flatland.utils.rendertools import RenderTool

    max_depth = 30
    tree_depth = 2
    trial_start = 100
    n_trials = 999
    start = 0
    imitate = True  # NOTE: not defined in the original snippet; assumed imitation-learning flag

    columns = ['Agents', 'X_DIM', 'Y_DIM', 'TRIAL_NO',
               'REWARD', 'NORMALIZED_REWARD', 'DONE_RATIO', 'STEPS', 'ACTION_PROB']
    df_all_results = pd.DataFrame(columns=columns)

    for trials in range(trial_start, n_trials + 1):
        env_file = f"envs-100-999/envs/Level_{trials}.pkl"
        # env_file = f"../env_configs/round_1-small/Test_0/Level_{trials}.mpk"
        # file = f"../env_configs/actions-small/Test_0/Level_{trials}.mpk"
        file = f"envs-100-999/actions/envs/Level_{trials}.json"

        if not os.path.isfile(env_file) or not os.path.isfile(file):
            print("Missing file!", env_file, file)
            continue

        step = 0
        obs_builder_object = TreeObsForRailEnv(max_depth=tree_depth,
                                               predictor=ShortestPathPredictorForRailEnv(max_depth))
        env = RailEnv(width=1, height=1,
                      rail_generator=rail_from_file(env_file),
                      schedule_generator=schedule_from_file(env_file),
                      malfunction_generator_and_process_data=malfunction_from_file(env_file),
                      obs_builder_object=obs_builder_object)
        obs, info = env.reset(regenerate_rail=True, regenerate_schedule=True,
                              activate_agents=False, random_seed=1001)

        with open(file, "r") as files:
            expert_actions = json.load(files)

        n_agents = env.get_num_agents()
        x_dim, y_dim = env.width, env.height

        agent_obs = [None] * n_agents
        agent_obs_buffer = [None] * n_agents
        done = dict()
        done["__all__"] = False

        if imitate:
            agent_action_buffer = list(expert_actions[step].values())
        else:
            # [0] * n_agents  # or pass p=[...] for a non-uniform action distribution
            agent_action_buffer = np.random.choice(5, n_agents, replace=True)
        update_values = [False] * n_agents

        max_steps = int(4 * 2 * (20 + env.height + env.width))
        action_size = 5  # 3

        # And some variables to keep track of the progress
        action_dict = dict()
        scores_window = deque(maxlen=100)
        reward_window = deque(maxlen=100)
        done_window = deque(maxlen=100)
        action_prob = [0] * action_size

        # agent = Agent(state_size, action_size)
        if visuals:
            env_renderer = RenderTool(env, gl="PILSVG")
            env_renderer.render_env(show=True, frames=True, show_observations=True)

        for a in range(n_agents):
            if obs[a]:
                agent_obs[a] = normalize_observation(obs[a], tree_depth, observation_radius=10)
                agent_obs_buffer[a] = agent_obs[a].copy()

        # Reset score and done
        score = 0
        agent_action_buffer = np.zeros(n_agents)
        # prev_action = np.zeros_like(envs.action_space.sample())
        prev_reward = np.zeros(n_agents)

        for step in range(max_steps):
            for a in range(n_agents):
                if info['action_required'][a]:
                    if imitate:
                        if step < len(expert_actions):
                            action = expert_actions[step][str(a)]
                        else:
                            action = 0
                    else:
                        action = 0
                    action_prob[action] += 1
                    update_values[a] = True
                else:
                    update_values[a] = False
                    action = 0
                action_dict.update({a: action})

            next_obs, all_rewards, done, info = env.step(action_dict)

            for a in range(n_agents):
                if next_obs[a] is not None:
                    agent_obs[a] = normalize_observation(next_obs[a], tree_depth, observation_radius=10)

                # Only update the values when we are done or when an action
                # was taken and thus relevant information is present
                if update_values[a] or done[a]:
                    start += 1
                    batch_builder.add_values(
                        t=step,
                        eps_id=trials,
                        agent_index=0,
                        obs=agent_obs_buffer[a],
                        actions=action_dict[a],
                        action_prob=1.0,  # put the true action probability here
                        rewards=all_rewards[a],
                        prev_actions=agent_action_buffer[a],
                        prev_rewards=prev_reward[a],
                        dones=done[a],
                        infos=info['action_required'][a],
                        new_obs=agent_obs[a])
                agent_obs_buffer[a] = agent_obs[a].copy()
                agent_action_buffer[a] = action_dict[a]
                prev_reward[a] = all_rewards[a]
                score += all_rewards[a]  # / env.get_num_agents()

            if visuals:
                env_renderer.render_env(show=True, frames=True, show_observations=True)
            if sleep_for_animation:
                time.sleep(0.5)

            if done["__all__"] or step > max_steps:
                writer.write(batch_builder.build_and_reset())
                break

            # Collect information about the training
            if step % 100 == 0:
                tasks_finished = 0
                for current_agent in env.agents:
                    if current_agent.status == RailAgentStatus.DONE_REMOVED:
                        tasks_finished += 1
                print('\rTrial No {} Training {} Agents on ({},{}).\t Steps {}\t Reward: {:.3f}\t Normalized Reward: {:.3f}\tDones: {:.2f}%\t'.format(
                    trials, env.get_num_agents(), x_dim, y_dim,
                    step, score, score / (max_steps + n_agents),
                    100 * np.mean(tasks_finished / max(1, env.get_num_agents()))), end=" ")

        tasks_finished = 0
        for current_agent in env.agents:
            if current_agent.status == RailAgentStatus.DONE_REMOVED:
                tasks_finished += 1
        done_window.append(tasks_finished / max(1, env.get_num_agents()))
        reward_window.append(score)
        scores_window.append(score / (max_steps + n_agents))

        data = [[n_agents, x_dim, y_dim, trials,
                 np.mean(reward_window), np.mean(scores_window),
                 100 * np.mean(done_window), step,
                 action_prob / np.sum(action_prob)]]
        df_cur = pd.DataFrame(data, columns=columns)
        df_all_results = pd.concat([df_all_results, df_cur])

        if imitate:
            df_all_results.to_csv('TreeImitationLearning_DQN_TrainingResults.csv', index=False)

        print('\rTrial No {} Training {} Agents on ({},{}).\t Total Steps {}\t Reward: {:.3f}\t Normalized Reward: {:.3f}\tDones: {:.2f}%\t'.format(
            trials, env.get_num_agents(), x_dim, y_dim, step,
            np.mean(reward_window), np.mean(scores_window),
            100 * np.mean(done_window)))

        if visuals:
            env_renderer.close_window()
        gc.collect()
class FlatlandGymEnv(gym.Env):
    """
    gym.Env wrapper of the Flatland environment providing deadlock detection
    and observation normalization.
    """

    def __init__(self,
                 rail_env,
                 custom_observations,
                 env_params,
                 render=False,
                 regenerate_rail_on_reset=True,
                 regenerate_schedule_on_reset=True):
        self._regenerate_rail_on_reset = regenerate_rail_on_reset
        self._regenerate_schedule_on_reset = regenerate_schedule_on_reset
        self.rail_env = rail_env
        self.deadlocks_detector = DeadlocksDetector()
        self.observation_normalizer = NormalizeObservations(self.rail_env.obs_builder.observation_dim,
                                                            env_params.observation_tree_depth,
                                                            custom_observations,
                                                            self.rail_env.width,
                                                            self.rail_env.height,
                                                            env_params.observation_radius)
        self.state_size = self.observation_normalizer.state_size
        self.render = render
        self.env_renderer = None

    def reset(self):
        obs, info = self.rail_env.reset(regenerate_rail=self._regenerate_rail_on_reset,
                                        regenerate_schedule=self._regenerate_schedule_on_reset)

        # Reset rendering
        if self.render:
            self.env_renderer = RenderTool(self.rail_env, gl="PGL")
            self.env_renderer.set_new_rail()

        # Reset custom observations
        self.observation_normalizer.reset_custom_obs(self.rail_env)

        # Compute deadlocks
        self.deadlocks_detector.reset(self.rail_env.get_num_agents())
        info["deadlocks"] = {}
        for agent in range(self.rail_env.get_num_agents()):
            info["deadlocks"][agent] = self.deadlocks_detector.deadlocks[agent]

        # Normalization
        for agent in obs:
            if obs[agent] is not None:
                obs[agent] = self.observation_normalizer.normalize_observation(obs[agent], self.rail_env,
                                                                               agent, info["deadlocks"][agent])
        return obs, info

    def step(self, action_dict):
        """
        Normalize observations by default, update deadlocks and step.

        :param action_dict:
        :return:
        """
        obs, rewards, dones, info = self.rail_env.step(action_dict)

        # Compute deadlocks
        deadlocks = self.deadlocks_detector.step(self.rail_env)
        info["deadlocks"] = {}
        for agent in range(self.rail_env.get_num_agents()):
            info["deadlocks"][agent] = deadlocks[agent]

        # Normalization
        for agent in obs:
            if obs[agent] is not None:
                obs[agent] = self.observation_normalizer.normalize_observation(obs[agent], self.rail_env,
                                                                               agent, info["deadlocks"][agent])
        return obs, rewards, dones, info

    def show_render(self):
        """
        Open the rendering window.

        :return:
        """
        if self.render:
            return self.env_renderer.render_env(show=True,
                                                frames=False,
                                                show_observations=False,
                                                show_predictions=False)

    def close(self):
        """
        Close the rendering window.

        :return:
        """
        if self.render:
            return self.env_renderer.close_window()
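# Hypothetical usage, assuming rail_env is an existing RailEnv and env_params carries
# the observation_tree_depth and observation_radius fields read by the constructor.
gym_env = FlatlandGymEnv(rail_env, custom_observations=False, env_params=env_params)
obs, info = gym_env.reset()
actions = {a: 2 for a in range(rail_env.get_num_agents())}  # everyone MOVE_FORWARD
obs, rewards, dones, info = gym_env.step(actions)
print(info["deadlocks"])  # per-agent deadlock flags added by the wrapper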
# Fragment of a debugging loop: compare a shadow environment (my_env) against
# the real environment step by step and report any divergence.
my_env_current_state = new_state
current_state = get_current_state(env)
for handle in range(number_agents):
    if not (env.agents[handle].position is None
            and my_env.agents[handle].position == my_env.agents[handle].target):
        if (env.agents[handle].position != my_env.agents[handle].position
                or env.agents[handle].direction != my_env.agents[handle].direction):
            print("#################### EPISODE ", episode, " #######################")
            print(' --------------------- step ', step, ' --------------------- action', action)
            print('')
            print(tmp, tmp_2)
            print(my_env_current_state, current_state)
            print('')

if episode % checkout_episode == 0:
    renderer.close_window()
    env = gen_env(number_agents, width, height, n_start_goal, seed)
    renderer = RenderTool(env, agent_render_variant=3)
    renderer.reset()
    renderer.render_env(show=True, show_predictions=False, show_observations=False)