def test_rail_environment_single_agent(show=False):
    # We instantiate the following map on a 3x3 grid
    #  _ _
    # / \/ \
    # | |  |
    # \_/\_/

    transitions = RailEnvTransitions()

    if False:
        # This env creation doesn't quite work right.
        cells = transitions.transition_list
        vertical_line = cells[1]
        south_symmetrical_switch = cells[6]
        north_symmetrical_switch = transitions.rotate_transition(south_symmetrical_switch, 180)
        south_east_turn = int('0100000000000010', 2)
        south_west_turn = transitions.rotate_transition(south_east_turn, 90)
        north_east_turn = transitions.rotate_transition(south_east_turn, 270)
        north_west_turn = transitions.rotate_transition(south_east_turn, 180)

        rail_map = np.array([[south_east_turn, south_symmetrical_switch, south_west_turn],
                             [vertical_line, vertical_line, vertical_line],
                             [north_east_turn, north_symmetrical_switch, north_west_turn]],
                            dtype=np.uint16)

        rail = GridTransitionMap(width=3, height=3, transitions=transitions)
        rail.grid = rail_map
        rail_env = RailEnv(width=3, height=3,
                           rail_generator=rail_from_grid_transition_map(rail),
                           schedule_generator=random_schedule_generator(),
                           number_of_agents=1,
                           obs_builder_object=GlobalObsForRailEnv())
    else:
        rail_env, env_dict = RailEnvPersister.load_new("test_env_loop.pkl", "env_data.tests")
        rail_map = rail_env.rail.grid

    rail_env._max_episode_steps = 1000
    _ = rail_env.reset(False, False, True)
    liActions = [int(a) for a in RailEnvActions]
    env_renderer = RenderTool(rail_env)

    # RailEnvPersister.save(rail_env, "test_env_figure8.pkl")

    for _ in range(5):
        # rail_env.agents[0].initial_position = (1, 2)
        _ = rail_env.reset(False, False, True)

        # We do not care about the target for the moment
        agent = rail_env.agents[0]
        agent.target = [-1, -1]

        # Check that trains are always initialized at a consistent position
        # or direction. They should always be able to go somewhere.
        if show:
            print("After reset - agent pos:", agent.position, "dir: ", agent.direction)
            print(transitions.get_transitions(rail_map[agent.position], agent.direction))

        # assert (transitions.get_transitions(
        #     rail_map[agent.position],
        #     agent.direction) != (0, 0, 0, 0))

        # HACK - force the direction to one we know is good.
        # agent.initial_position = agent.position = (2, 3)
        agent.initial_direction = agent.direction = 0

        if show:
            print("handle:", agent.handle)
        # agent.initial_position = initial_pos = agent.position

        valid_active_actions_done = 0
        pos = agent.position

        if show:
            env_renderer.render_env(show=show, show_agents=True)
            time.sleep(0.01)

        iStep = 0
        while valid_active_actions_done < 6:
            # We randomly select an action
            action = np.random.choice(liActions)
            # action = RailEnvActions.MOVE_FORWARD
            _, _, dict_done, _ = rail_env.step({0: action})

            prev_pos = pos
            pos = agent.position  # rail_env.agents_position[0]

            print("action:", action, "pos:", agent.position, "prev:", prev_pos, agent.direction)
            print(dict_done)
            if prev_pos != pos:
                valid_active_actions_done += 1
            iStep += 1

            if show:
                env_renderer.render_env(show=show, show_agents=True, step=iStep)
                time.sleep(0.01)
            assert iStep < 100, "valid actions should have been performed by now - hung agent"

        # After 6 movements on this railway network, the train should be back
        # to its original height on the map.
        # assert (initial_pos[0] == agent.position[0])

    # We check that the train always reaches its target after some time
    for _ in range(10):
        _ = rail_env.reset()
        rail_env.agents[0].direction = 0
        # JW - to avoid a problem with random_schedule_generator.
        # rail_env.agents[0].position = (1, 2)

        iStep = 0
        while iStep < 100:
            # We randomly select an action
            action = np.random.choice(liActions)
            _, _, dones, _ = rail_env.step({0: action})
            done = dones['__all__']
            if done:
                break
            iStep += 1
            assert iStep < 100, "agent should have finished by now"
        env_renderer.render_env(show=show)
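# Aside: a sketch of how the 16-bit transition codes used in the test above are
# laid out (based on flatland's grid4 transitions: 4 groups of 4 bits, one group
# per incoming heading N/E/S/W, each bit flagging an allowed outgoing heading
# N/E/S/W; treat the exact bit ordering as an assumption to verify against your
# flatland version):
#
#   south_east_turn = int('0100000000000010', 2)
#                          NNNN EEEE SSSS WWWW   <- incoming heading
#
# Heading North (entering from the south edge), the only set bit is 'E', so the
# train must turn east; heading West (entering from the east edge), the only set
# bit is 'S', so it must turn south. rotate_transition() derives the remaining
# three turns by rotating this bit pattern by 90/180/270 degrees.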
def train_agent(train_params, train_env_params, eval_env_params, obs_params):
    # Environment parameters
    n_agents = train_env_params.n_agents
    x_dim = train_env_params.x_dim
    y_dim = train_env_params.y_dim
    n_cities = train_env_params.n_cities
    max_rails_between_cities = train_env_params.max_rails_between_cities
    max_rails_in_city = train_env_params.max_rails_in_city
    seed = train_env_params.seed

    # Unique ID for this training run
    now = datetime.now()
    training_id = now.strftime('%y%m%d%H%M%S')

    # Observation parameters
    observation_tree_depth = obs_params.observation_tree_depth
    observation_radius = obs_params.observation_radius
    observation_max_path_depth = obs_params.observation_max_path_depth

    # Training parameters
    eps_start = train_params.eps_start
    eps_end = train_params.eps_end
    eps_decay = train_params.eps_decay
    n_episodes = train_params.n_episodes
    checkpoint_interval = train_params.checkpoint_interval
    n_eval_episodes = train_params.n_evaluation_episodes
    restore_replay_buffer = train_params.restore_replay_buffer
    save_replay_buffer = train_params.save_replay_buffer
    last_checkpoint = train_params.last_checkpoint

    # Set the seeds
    random.seed(seed)
    np.random.seed(seed)

    # Observation builder
    predictor = ShortestPathPredictorForRailEnv(observation_max_path_depth)
    tree_observation = TreeObsForRailEnv(max_depth=obs_params.max_depth, predictor=predictor)

    # Set up the environments
    train_env = create_rail_env(train_env_params, tree_observation)
    train_env.reset(regenerate_schedule=True, regenerate_rail=True)
    eval_env = create_rail_env(eval_env_params, tree_observation)
    eval_env.reset(regenerate_schedule=True, regenerate_rail=True)

    # Set up the renderer
    if train_params.render:
        env_renderer = RenderTool(train_env, gl="PGL")

    # Calculate the state size given the depth of the tree observation and the number of features
    n_features_per_node = train_env.obs_builder.observation_dim
    n_nodes = observation_tree_depth
    state_size = (n_features_per_node + 1) * n_nodes - 1

    # The action space of flatland is 5 discrete actions
    action_size = 5

    # Max number of steps per episode
    # This is the official formula used during evaluations
    # See details in flatland.envs.schedule_generators.sparse_schedule_generator
    # max_steps = int(4 * 2 * (env.height + env.width + (n_agents / n_cities)))
    max_steps = train_env._max_episode_steps

    action_count = [0] * action_size
    action_dict = dict()
    agent_obs = [None] * n_agents
    agent_prev_obs = [None] * n_agents
    agent_prev_action = [2] * n_agents
    update_values = [False] * n_agents

    # Smoothed values used as targets for hyperparameter tuning
    smoothed_normalized_score = -1.0
    smoothed_eval_normalized_score = -1.0
    smoothed_completion = 0.0
    smoothed_eval_completion = 0.0

    # Double Dueling DQN policy
    policy = DDDQNPolicy(state_size, action_size, train_params)
    if os.path.isfile(last_checkpoint):
        policy.qnetwork_local = torch.load(last_checkpoint)
        print("Loaded checkpoint from %s" % last_checkpoint)

    # Load an existing replay buffer
    if restore_replay_buffer:
        try:
            policy.load_replay_buffer(restore_replay_buffer)
            policy.test()
        except RuntimeError as e:
            print("\n🛑 Couldn't load the replay buffer. Were the experiences generated using the same tree depth?")
            print(e)
            exit(1)

    print("\n💾 Replay buffer status: {}/{} experiences".format(len(policy.memory.memory), train_params.buffer_size))

    hdd = psutil.disk_usage('/')
    if save_replay_buffer and (hdd.free / (2 ** 30)) < 500.0:
        print("⚠️ Careful! Saving replay buffers will quickly consume a lot of disk space. You have {:.2f}gb left.".format(hdd.free / (2 ** 30)))

    # TensorBoard writer
    writer = SummaryWriter()
    writer.add_hparams(vars(train_params), {})
    writer.add_hparams(vars(train_env_params), {})
    writer.add_hparams(vars(obs_params), {})

    training_timer = Timer()
    training_timer.start()

    print("\n🚉 Training {} trains on {}x{} grid for {} episodes, evaluating on {} episodes every {} episodes. Training id '{}'.\n".format(
        train_env.get_num_agents(), x_dim, y_dim,
        n_episodes, n_eval_episodes, checkpoint_interval,
        training_id))

    for episode_idx in range(n_episodes + 1):
        step_timer = Timer()
        reset_timer = Timer()
        learn_timer = Timer()
        preproc_timer = Timer()
        inference_timer = Timer()

        # Reset environment
        reset_timer.start()
        obs, info = train_env.reset(regenerate_rail=True, regenerate_schedule=True)
        reset_timer.end()

        if train_params.render:
            env_renderer.set_new_rail()

        score = 0
        nb_steps = 0
        actions_taken = []

        # Build initial agent-specific observations
        for agent in train_env.get_agent_handles():
            if obs[agent]:
                agent_obs[agent] = normalize_observation(obs[agent], observation_tree_depth, observation_radius=observation_radius)
                agent_prev_obs[agent] = agent_obs[agent].copy()

        # Run episode
        for step in range(max_steps - 1):
            inference_timer.start()
            for agent in train_env.get_agent_handles():
                if info['action_required'][agent]:
                    update_values[agent] = True
                    action = policy.act(agent_obs[agent], eps=eps_start)
                    action_count[action] += 1
                    actions_taken.append(action)
                else:
                    # An action is not required if the train hasn't joined the railway network yet,
                    # if it has already reached its target, or if it is currently malfunctioning.
                    update_values[agent] = False
                    action = 0
                action_dict.update({agent: action})
            inference_timer.end()

            # Environment step
            step_timer.start()
            next_obs, all_rewards, done, info = train_env.step(action_dict)
            step_timer.end()

            # Render an episode at some interval
            if train_params.render and episode_idx % checkpoint_interval == 0:
                env_renderer.render_env(
                    show=True,
                    frames=False,
                    show_observations=True,
                    show_predictions=True
                )

            # Update replay buffer and train agent
            for agent in train_env.get_agent_handles():
                if update_values[agent] or done['__all__']:
                    # Only learn from timesteps where something happened
                    learn_timer.start()
                    policy.step(agent_prev_obs[agent], agent_prev_action[agent], all_rewards[agent], agent_obs[agent], done[agent])
                    learn_timer.end()

                    agent_prev_obs[agent] = agent_obs[agent].copy()
                    agent_prev_action[agent] = action_dict[agent]

                # Preprocess the new observations
                if next_obs[agent]:
                    preproc_timer.start()
                    agent_obs[agent] = normalize_observation(next_obs[agent], observation_tree_depth, observation_radius=observation_radius)
                    preproc_timer.end()

                score += all_rewards[agent]

            nb_steps = step

            if done['__all__']:
                break

        # Epsilon decay
        eps_start = max(eps_end, eps_decay * eps_start)

        # Collect information about the training
        tasks_finished = sum(done[idx] for idx in train_env.get_agent_handles())
        completion = tasks_finished / max(1, train_env.get_num_agents())
        normalized_score = score / (max_steps * train_env.get_num_agents())
        action_probs = action_count / np.sum(action_count)
        action_count = [1] * action_size

        smoothing = 0.99
        smoothed_normalized_score = smoothed_normalized_score * smoothing + normalized_score * (1.0 - smoothing)
        smoothed_completion = smoothed_completion * smoothing + completion * (1.0 - smoothing)

        # Print logs
        if episode_idx % checkpoint_interval == 0:
            torch.save(policy.qnetwork_local, './checkpoints/' + training_id + '-' + str(episode_idx) + '.pth')

            if save_replay_buffer:
                policy.save_replay_buffer('./replay_buffers/' + training_id + '-' + str(episode_idx) + '.pkl')

            if train_params.render:
                env_renderer.close_window()

        print(
            '\r🚂 Episode {}'
            '\t 🏆 Score: {:.3f}'
            ' Avg: {:.3f}'
            '\t 💯 Done: {:.2f}%'
            ' Avg: {:.2f}%'
            '\t 🎲 Epsilon: {:.3f} '
            '\t 🔀 Action Probs: {}'.format(
                episode_idx,
                normalized_score,
                smoothed_normalized_score,
                100 * completion,
                100 * smoothed_completion,
                eps_start,
                format_action_prob(action_probs)
            ), end=" ")

        # Evaluate policy and log results at some interval
        if episode_idx % checkpoint_interval == 0 and n_eval_episodes > 0:
            scores, completions, nb_steps_eval = eval_policy(eval_env, policy, train_params, obs_params)

            writer.add_scalar("evaluation/scores_min", np.min(scores), episode_idx)
            writer.add_scalar("evaluation/scores_max", np.max(scores), episode_idx)
            writer.add_scalar("evaluation/scores_mean", np.mean(scores), episode_idx)
            writer.add_scalar("evaluation/scores_std", np.std(scores), episode_idx)
            writer.add_histogram("evaluation/scores", np.array(scores), episode_idx)
            writer.add_scalar("evaluation/completions_min", np.min(completions), episode_idx)
            writer.add_scalar("evaluation/completions_max", np.max(completions), episode_idx)
            writer.add_scalar("evaluation/completions_mean", np.mean(completions), episode_idx)
            writer.add_scalar("evaluation/completions_std", np.std(completions), episode_idx)
            writer.add_histogram("evaluation/completions", np.array(completions), episode_idx)
            writer.add_scalar("evaluation/nb_steps_min", np.min(nb_steps_eval), episode_idx)
            writer.add_scalar("evaluation/nb_steps_max", np.max(nb_steps_eval), episode_idx)
            writer.add_scalar("evaluation/nb_steps_mean", np.mean(nb_steps_eval), episode_idx)
            writer.add_scalar("evaluation/nb_steps_std", np.std(nb_steps_eval), episode_idx)
            writer.add_histogram("evaluation/nb_steps", np.array(nb_steps_eval), episode_idx)

            smoothing = 0.9
            smoothed_eval_normalized_score = smoothed_eval_normalized_score * smoothing + np.mean(scores) * (1.0 - smoothing)
            smoothed_eval_completion = smoothed_eval_completion * smoothing + np.mean(completions) * (1.0 - smoothing)
            writer.add_scalar("evaluation/smoothed_score", smoothed_eval_normalized_score, episode_idx)
            writer.add_scalar("evaluation/smoothed_completion", smoothed_eval_completion, episode_idx)

        # Save logs to TensorBoard
        writer.add_scalar("training/score", normalized_score, episode_idx)
        writer.add_scalar("training/smoothed_score", smoothed_normalized_score, episode_idx)
        writer.add_scalar("training/completion", np.mean(completion), episode_idx)
        writer.add_scalar("training/smoothed_completion", np.mean(smoothed_completion), episode_idx)
        writer.add_scalar("training/nb_steps", nb_steps, episode_idx)
        writer.add_histogram("actions/distribution", np.array(actions_taken), episode_idx)
        writer.add_scalar("actions/nothing", action_probs[RailEnvActions.DO_NOTHING], episode_idx)
        writer.add_scalar("actions/left", action_probs[RailEnvActions.MOVE_LEFT], episode_idx)
        writer.add_scalar("actions/forward", action_probs[RailEnvActions.MOVE_FORWARD], episode_idx)
        writer.add_scalar("actions/right", action_probs[RailEnvActions.MOVE_RIGHT], episode_idx)
        writer.add_scalar("actions/stop", action_probs[RailEnvActions.STOP_MOVING], episode_idx)
        writer.add_scalar("training/epsilon", eps_start, episode_idx)
        writer.add_scalar("training/buffer_size", len(policy.memory), episode_idx)
        writer.add_scalar("training/loss", policy.loss, episode_idx)
        writer.add_scalar("timer/reset", reset_timer.get(), episode_idx)
        writer.add_scalar("timer/step", step_timer.get(), episode_idx)
        writer.add_scalar("timer/learn", learn_timer.get(), episode_idx)
        writer.add_scalar("timer/preproc", preproc_timer.get(), episode_idx)
        writer.add_scalar("timer/total", training_timer.get_current(), episode_idx)
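# `format_action_prob` is referenced by the logging code above but not defined
# in this excerpt. A minimal sketch of such a helper (assumption: it simply
# pretty-prints the per-action frequencies next to a symbol per action):
def format_action_prob(action_probs):
    action_probs = np.round(action_probs, 3)
    actions = ["↻", "←", "↑", "→", "◼"]  # do-nothing, left, forward, right, stop
    buffer = ""
    for action, action_prob in zip(actions, action_probs):
        buffer += action + " " + "{:.3f}".format(action_prob) + " "
    return buffer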
np.random.seed(0)

# We need to either load some pre-generated railways from disk, or else create a random railway generator.
if flags.load_railways:
    rail_generator, schedule_generator = load_precomputed_railways(project_root, flags)
else:
    rail_generator, schedule_generator = create_random_railways(project_root)

# Create the Flatland environment
env = RailEnv(width=flags.grid_width, height=flags.grid_height,
              number_of_agents=flags.num_agents,
              rail_generator=rail_generator,
              schedule_generator=schedule_generator,
              malfunction_generator=ParamMalfunctionGen(MalfunctionParameters(1 / 8000, 15, 50)),
              obs_builder_object=TreeObservation(max_depth=flags.tree_depth))

# After training we want to render the results, so we also load a renderer
env_renderer = RenderTool(env, gl="PILSVG", screen_width=800, screen_height=800,
                          agent_render_variant=AgentRenderVariant.AGENT_SHOWS_OPTIONS_AND_BOX)

# Calculate the state size based on the number of nodes in the tree observation
num_features_per_node = env.obs_builder.observation_dim
num_nodes = sum(np.power(4, i) for i in range(flags.tree_depth + 1))
state_size = num_nodes * num_features_per_node
action_size = 5

# Add some variables to keep track of the progress
scores_window, steps_window, collisions_window, done_window = [deque(maxlen=200) for _ in range(4)]
agent_obs = [None] * flags.num_agents
agent_obs_buffer = [None] * flags.num_agents
agent_action_buffer = [2] * flags.num_agents
max_steps = 8 * (flags.grid_width + flags.grid_height)

start_time = time.time()
env = RailEnv(width=width,
              height=height,
              rail_generator=rail_generator,
              schedule_generator=schedule_generator,
              number_of_agents=nr_trains,
              obs_builder_object=observation_builder,
              malfunction_generator_and_process_data=malfunction_from_params(stochastic_data),
              remove_agents_at_target=True)
env.reset()

# Initialize the renderer
env_renderer = RenderTool(env,
                          gl="PILSVG",
                          agent_render_variant=AgentRenderVariant.ONE_STEP_BEHIND,
                          show_debug=False,
                          screen_height=1200,  # Adjust these parameters to fit your resolution
                          screen_width=1800)   # Adjust these parameters to fit your resolution

######### Get arguments of the script #########
parser = argparse.ArgumentParser()
parser.add_argument("-step", type=int, help="steps")
args = parser.parse_args()

######### Custom controller setup #########
controller = GreedyAgent(218, env.action_space[0])
my_grid = [[Node((i, j), env.rail.grid[i, j]) for j in range(env.rail.width)]
           for i in range(env.rail.height)]
astar_planner = AStarAgent(my_grid, env.rail.width, env.rail.height)
def train_agent(train_params):
    env = load_flatland_environment_from_file("scratch/test-envs/Test_13/Level_0.pkl")
    env.reset(regenerate_schedule=True, regenerate_rail=True)

    # Environment parameters
    n_agents = len(env.agents)
    print("n_agents = ", n_agents)
    print("env.get_num_agents(): ", env.get_num_agents())
    x_dim = env.width
    y_dim = env.height
    n_cities = 37
    # max_rails_between_cities = env_params.max_rails_between_cities
    # max_rails_in_city = env_params.max_rails_in_city
    seed = 2125

    # Observation parameters
    # observation_tree_depth = env_params.observation_tree_depth
    # observation_radius = env_params.observation_radius
    # observation_max_path_depth = env_params.observation_max_path_depth
    observation_tree_depth = 2
    observation_radius = 10
    observation_max_path_depth = 30

    # Training parameters
    eps_start = train_params.eps_start
    eps_end = train_params.eps_end
    eps_decay = train_params.eps_decay
    n_episodes = train_params.n_episodes
    checkpoint_interval = train_params.checkpoint_interval
    n_eval_episodes = train_params.n_evaluation_episodes

    # Set the seeds
    random.seed(seed)
    np.random.seed(seed)

    # Break agents from time to time
    malfunction_parameters = MalfunctionParameters(
        malfunction_rate=1. / 10000,  # Rate of malfunctions
        min_duration=15,  # Minimal duration
        max_duration=50  # Max duration
    )

    # Observation builder
    predictor = ShortestPathPredictorForRailEnv(observation_max_path_depth)
    tree_observation = TreeObsForRailEnv(max_depth=observation_tree_depth, predictor=predictor)

    # Fraction of the trains running at each speed
    speed_profiles = {
        1.: 1.0,  # Fast passenger train
        1. / 2.: 0.0,  # Fast freight train
        1. / 3.: 0.0,  # Slow commuter train
        1. / 4.: 0.0  # Slow freight train
    }

    # Set up the environment
    # env = RailEnv(
    #     width=x_dim,
    #     height=y_dim,
    #     rail_generator=sparse_rail_generator(
    #         max_num_cities=n_cities,
    #         grid_mode=False,
    #         max_rails_between_cities=max_rails_between_cities,
    #         max_rails_in_city=max_rails_in_city
    #     ),
    #     schedule_generator=sparse_schedule_generator(speed_profiles),
    #     number_of_agents=n_agents,
    #     malfunction_generator_and_process_data=malfunction_from_params(malfunction_parameters),
    #     obs_builder_object=tree_observation,
    #     random_seed=seed
    # )
    # env.reset(regenerate_schedule=True, regenerate_rail=True)

    # Set up the renderer
    if train_params.render:
        env_renderer = RenderTool(env, gl="PGL")

    # Calculate the state size given the depth of the tree observation and the number of features
    n_features_per_node = env.obs_builder.observation_dim
    n_nodes = 0
    for i in range(observation_tree_depth + 1):
        n_nodes += np.power(4, i)
    state_size = n_features_per_node * n_nodes

    # The action space of flatland is 5 discrete actions
    action_size = 5

    # Max number of steps per episode
    # This is the official formula used during evaluations
    # See details in flatland.envs.schedule_generators.sparse_schedule_generator
    max_steps = int(4 * 2 * (env.height + env.width + (n_agents / n_cities)))
    # max_steps = env._max_episode_steps
    print("max_steps = ", max_steps)
    print("env._max_episode_steps = ", env._max_episode_steps)

    action_count = [0] * action_size
    action_dict = dict()
    agent_obs = [None] * env.get_num_agents()
    agent_prev_obs = [None] * env.get_num_agents()
    agent_prev_action = [2] * env.get_num_agents()
    update_values = False

    smoothed_normalized_score = -1.0
    smoothed_eval_normalized_score = -1.0
    smoothed_completion = 0.0
    smoothed_eval_completion = 0.0

    # Double Dueling DQN policy
    policy = DDDQNPolicy(state_size, action_size, train_params)

    # TensorBoard writer
    writer = SummaryWriter()
    writer.add_hparams(vars(train_params), {})
    # writer.add_hparams(vars(env_params), {})

    training_timer = Timer()
    training_timer.start()

    print("\n🚉 Training {} trains on {}x{} grid for {} episodes, evaluating on {} episodes every {} episodes.\n".format(
        env.get_num_agents(), x_dim, y_dim, n_episodes, n_eval_episodes, checkpoint_interval))

    for episode_idx in range(n_episodes + 1):
        # Timers
        step_timer = Timer()
        reset_timer = Timer()
        learn_timer = Timer()
        preproc_timer = Timer()

        # Reset environment
        reset_timer.start()
        obs, info = env.reset(regenerate_rail=True, regenerate_schedule=True)
        reset_timer.end()

        if train_params.render:
            env_renderer.set_new_rail()

        score = 0
        nb_steps = 0
        actions_taken = []

        # Build agent-specific observations
        for agent in env.get_agent_handles():
            if obs[agent]:
                agent_obs[agent] = normalize_observation(obs[agent], observation_tree_depth, observation_radius=observation_radius)
                agent_prev_obs[agent] = agent_obs[agent].copy()

        # Run episode
        for step in range(max_steps - 1):
            for agent in env.get_agent_handles():
                if info['action_required'][agent]:
                    # If an action is required, we want to store the obs at that step as well as the action
                    update_values = True
                    action = policy.act(agent_obs[agent], eps=eps_start)
                    action_count[action] += 1
                    actions_taken.append(action)
                else:
                    update_values = False
                    action = 0
                action_dict.update({agent: action})

            # Environment step
            step_timer.start()
            next_obs, all_rewards, done, info = env.step(action_dict)
            step_timer.end()

            if train_params.render and episode_idx % checkpoint_interval == 0:
                env_renderer.render_env(show=True, frames=False, show_observations=False, show_predictions=False)

            for agent in range(env.get_num_agents()):
                # Update the replay buffer and train the agent
                # Only update the values when we are done or when an action was taken and thus relevant information is present
                if update_values or done[agent]:
                    learn_timer.start()
                    policy.step(agent_prev_obs[agent], agent_prev_action[agent], all_rewards[agent], agent_obs[agent], done[agent])
                    learn_timer.end()

                    agent_prev_obs[agent] = agent_obs[agent].copy()
                    agent_prev_action[agent] = action_dict[agent]

                # Preprocess the new observations
                if next_obs[agent]:
                    preproc_timer.start()
                    agent_obs[agent] = normalize_observation(next_obs[agent], observation_tree_depth, observation_radius=observation_radius)
                    preproc_timer.end()

                score += all_rewards[agent]

            nb_steps = step

            if done['__all__']:
                break

        # Epsilon decay
        eps_start = max(eps_end, eps_decay * eps_start)

        # Collect information about the training
        tasks_finished = sum(done[idx] for idx in env.get_agent_handles())
        completion = tasks_finished / max(1, env.get_num_agents())
        normalized_score = score / (max_steps * env.get_num_agents())
        action_probs = action_count / np.sum(action_count)
        action_count = [1] * action_size

        # Smoothed values for terminal display and for more stable hyperparameter tuning
        smoothing = 0.99
        smoothed_normalized_score = smoothed_normalized_score * smoothing + normalized_score * (1.0 - smoothing)
        smoothed_completion = smoothed_completion * smoothing + completion * (1.0 - smoothing)

        # Print logs
        if episode_idx % checkpoint_interval == 0:
            torch.save(policy.qnetwork_local, './checkpoints/obs2_multi-' + str(episode_idx) + '.pth')
            if train_params.render:
                env_renderer.close_window()

        print(
            '\r🚂 Episode {}'
            '\t 🏆 Score: {:.3f}'
            ' Avg: {:.3f}'
            '\t 💯 Done: {:.2f}%'
            ' Avg: {:.2f}%'
            '\t 🎲 Epsilon: {:.2f} '
            '\t 🔀 Action Probs: {}'.format(
                episode_idx,
                normalized_score,
                smoothed_normalized_score,
                100 * completion,
                100 * smoothed_completion,
                eps_start,
                format_action_prob(action_probs)
            ), end=" ")

        # Evaluate policy
        if episode_idx % train_params.checkpoint_interval == 0:
            scores, completions, nb_steps_eval = eval_policy(env, policy, n_eval_episodes, max_steps)

            writer.add_scalar("evaluation/scores_min", np.min(scores), episode_idx)
            writer.add_scalar("evaluation/scores_max", np.max(scores), episode_idx)
            writer.add_scalar("evaluation/scores_mean", np.mean(scores), episode_idx)
            writer.add_scalar("evaluation/scores_std", np.std(scores), episode_idx)
            writer.add_histogram("evaluation/scores", np.array(scores), episode_idx)
            writer.add_scalar("evaluation/completions_min", np.min(completions), episode_idx)
            writer.add_scalar("evaluation/completions_max", np.max(completions), episode_idx)
            writer.add_scalar("evaluation/completions_mean", np.mean(completions), episode_idx)
            writer.add_scalar("evaluation/completions_std", np.std(completions), episode_idx)
            writer.add_histogram("evaluation/completions", np.array(completions), episode_idx)
            writer.add_scalar("evaluation/nb_steps_min", np.min(nb_steps_eval), episode_idx)
            writer.add_scalar("evaluation/nb_steps_max", np.max(nb_steps_eval), episode_idx)
            writer.add_scalar("evaluation/nb_steps_mean", np.mean(nb_steps_eval), episode_idx)
            writer.add_scalar("evaluation/nb_steps_std", np.std(nb_steps_eval), episode_idx)
            writer.add_histogram("evaluation/nb_steps", np.array(nb_steps_eval), episode_idx)

            smoothing = 0.9
            smoothed_eval_normalized_score = smoothed_eval_normalized_score * smoothing + np.mean(scores) * (1.0 - smoothing)
            smoothed_eval_completion = smoothed_eval_completion * smoothing + np.mean(completions) * (1.0 - smoothing)
            writer.add_scalar("evaluation/smoothed_score", smoothed_eval_normalized_score, episode_idx)
            writer.add_scalar("evaluation/smoothed_completion", smoothed_eval_completion, episode_idx)

        # Save logs to TensorBoard
        writer.add_scalar("training/score", normalized_score, episode_idx)
        writer.add_scalar("training/smoothed_score", smoothed_normalized_score, episode_idx)
        writer.add_scalar("training/completion", np.mean(completion), episode_idx)
        writer.add_scalar("training/smoothed_completion", np.mean(smoothed_completion), episode_idx)
        writer.add_scalar("training/nb_steps", nb_steps, episode_idx)
        writer.add_histogram("actions/distribution", np.array(actions_taken), episode_idx)
        writer.add_scalar("actions/nothing", action_probs[RailEnvActions.DO_NOTHING], episode_idx)
        writer.add_scalar("actions/left", action_probs[RailEnvActions.MOVE_LEFT], episode_idx)
        writer.add_scalar("actions/forward", action_probs[RailEnvActions.MOVE_FORWARD], episode_idx)
        writer.add_scalar("actions/right", action_probs[RailEnvActions.MOVE_RIGHT], episode_idx)
        writer.add_scalar("actions/stop", action_probs[RailEnvActions.STOP_MOVING], episode_idx)
        writer.add_scalar("training/epsilon", eps_start, episode_idx)
        writer.add_scalar("training/buffer_size", len(policy.memory), episode_idx)
        writer.add_scalar("training/loss", policy.loss, episode_idx)
        writer.add_scalar("timer/reset", reset_timer.get(), episode_idx)
        writer.add_scalar("timer/step", step_timer.get(), episode_idx)
        writer.add_scalar("timer/learn", learn_timer.get(), episode_idx)
        writer.add_scalar("timer/preproc", preproc_timer.get(), episode_idx)
        writer.add_scalar("timer/total", training_timer.get_current(), episode_idx)
def evaluate(n_episodes):
    run = SUBMISSIONS["rlpr-tcpr"]
    config, run = init_run(run)
    prio_agent = get_agent(config, run)
    env = get_env(config, rl=True)
    env_renderer = RenderTool(env, screen_width=8800)
    returns = []
    pcs = []
    malfs = []

    for _ in tqdm(range(n_episodes)):
        obs, _ = env.reset(regenerate_schedule=True, regenerate_rail=True)
        if RENDER:
            env_renderer.reset()
            env_renderer.render_env(show=True, frames=True, show_observations=False)

        if not obs:
            break

        steps = 0
        ep_return = 0
        done = defaultdict(lambda: False)
        robust_env = CprFlatlandGymEnv(rail_env=env,
                                       max_nr_active_agents=200,
                                       observation_space=None,
                                       priorizer=DistToTargetPriorizer(),
                                       allow_noop=True)
        priorities = prio_agent.compute_actions(obs, explore=False)
        sorted_actions = {k: v for k, v in sorted(priorities.items(), key=lambda item: item[1], reverse=True)}
        sorted_handles = list(sorted_actions.keys())

        while not done['__all__']:
            actions = ShortestPathAgent().compute_actions(obs, env)
            robust_actions = robust_env.get_robust_actions(actions, sorted_handles)
            obs, all_rewards, done, info = env.step(robust_actions)
            if RENDER:
                env_renderer.render_env(show=True, frames=True, show_observations=False)
                print('.', end='', flush=True)
            steps += 1
            ep_return += np.sum(list(all_rewards.values()))

        pc = np.sum(np.array([1 for a in env.agents if is_done(a)])) / env.get_num_agents()
        print("EPISODE PC:", pc)
        n_episodes += 1
        pcs.append(pc)
        returns.append(ep_return / (env._max_episode_steps * env.get_num_agents()))
        malfs.append(np.sum([a.malfunction_data['nr_malfunctions'] for a in env.agents]))
    return pcs, returns, malfs
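# `is_done` is used by evaluate() above but not defined in this excerpt. A
# plausible sketch (assumption: an agent counts as done once its status is DONE
# or DONE_REMOVED, with RailAgentStatus imported from flatland):
def is_done(agent):
    return agent.status in (RailAgentStatus.DONE, RailAgentStatus.DONE_REMOVED)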
# Relative weights of each cell type to be used by the random rail generators.
transition_probability = [1.0,  # empty cell - Case 0
                          1.0,  # Case 1 - straight
                          1.0,  # Case 2 - simple switch
                          0.3,  # Case 3 - diamond crossing
                          0.5,  # Case 4 - single slip
                          0.5,  # Case 5 - double slip
                          0.2,  # Case 6 - symmetrical
                          0.0,  # Case 7 - dead end
                          0.2,  # Case 8 - turn left
                          0.2,  # Case 9 - turn right
                          1.0]  # Case 10 - mirrored switch

# Example: generate a random rail
env = RailEnv(width=10, height=10,
              rail_generator=random_rail_generator(cell_type_relative_proportion=transition_probability),
              number_of_agents=3)
env.reset()

env_renderer = RenderTool(env, gl="PIL")
env_renderer.render_env(show=True)

# Uncomment to keep the renderer open
# input("Press Enter to continue...")
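# The proportions above are relative weights rather than probabilities; the
# generator is expected to normalize them internally (an assumption, since the
# parameter is named cell_type_relative_proportion). A quick sanity check of
# the implied sampling distribution:
import numpy as np
weights = np.array(transition_probability)
print("normalized cell-type distribution:", np.round(weights / weights.sum(), 3))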
              rail_generator=complex_rail_generator(nr_start_goal=n_goals, nr_extra=5,
                                                    min_dist=min_dist, max_dist=99999,
                                                    seed=0),
              schedule_generator=complex_schedule_generator(),
              obs_builder_object=TreeObsForRailEnv(max_depth=1,
                                                   predictor=ShortestPathPredictorForRailEnv()),
              number_of_agents=n_agents)
env.reset(True, True)

tree_depth = 1
observation_helper = TreeObsForRailEnv(max_depth=tree_depth,
                                       predictor=ShortestPathPredictorForRailEnv())
env_renderer = RenderTool(env, gl="PGL")
handle = env.get_agent_handles()
n_episodes = 10
max_steps = 100 * (env.height + env.width)
record_images = False
policy = OrderedPolicy()
action_dict = dict()

for trials in range(1, n_episodes + 1):
    # Reset environment
    obs, info = env.reset(True, True)
    done = env.dones
    env_renderer.reset()
    frame_step = 0
def main(args):
    # Show options and values
    print(' ' * 26 + 'Options')
    for k, v in vars(args).items():
        print(' ' * 26 + k + ': ' + str(v))

    # Where to save models
    results_dir = os.path.join('results', args.model_id)
    if not os.path.exists(results_dir):
        os.makedirs(results_dir)

    rail_generator = sparse_rail_generator(
        max_num_cities=args.max_num_cities,
        seed=args.seed,
        grid_mode=args.grid_mode,
        max_rails_between_cities=args.max_rails_between_cities,
        max_rails_in_city=args.max_rails_in_city,
    )

    # Maps speeds to % of appearance in the env
    speed_ration_map = {1.: 1}  # Fast passenger train
    if args.multi_speed:
        speed_ration_map = {
            1.: 0.25,  # Fast passenger train
            1. / 2.: 0.25,  # Fast freight train
            1. / 3.: 0.25,  # Slow commuter train
            1. / 4.: 0.25  # Slow freight train
        }
    schedule_generator = sparse_schedule_generator(speed_ration_map)

    prediction_builder = ShortestPathPredictorForRailEnv(max_depth=args.prediction_depth)
    obs_builder = RailObsForRailEnv(predictor=prediction_builder)

    env = RailEnv(
        width=args.width,
        height=args.height,
        rail_generator=rail_generator,
        random_seed=0,
        schedule_generator=schedule_generator,
        number_of_agents=args.num_agents,
        obs_builder_object=obs_builder,
        malfunction_generator_and_process_data=malfunction_from_params(
            parameters={
                'malfunction_rate': args.malfunction_rate,
                'min_duration': args.min_duration,
                'max_duration': args.max_duration
            }))

    if args.render:
        env_renderer = RenderTool(env,
                                  agent_render_variant=AgentRenderVariant.AGENT_SHOWS_OPTIONS_AND_BOX,
                                  show_debug=True,
                                  screen_height=800,
                                  screen_width=800)

    if args.plot:
        writer = SummaryWriter(log_dir='runs/' + args.model_id)

    max_rails = 100  # TODO Must be a parameter of the env (estimated)
    # max_steps = env.compute_max_episode_steps(env.width, env.height)
    max_steps = 200

    preprocessor = ObsPreprocessor(max_rails, args.reorder_rails)

    dqn = DQNAgent(args, bitmap_height=max_rails * 3, action_space=2)

    if args.load_path:
        file = os.path.isfile(args.load_path)
        if file:
            dqn.qnetwork_local.load_state_dict(torch.load(args.load_path))
            print('WEIGHTS LOADED from: ', args.load_path)

    eps = args.start_eps
    railenv_action_dict = {}
    network_action_dict = {}

    # Metrics
    done_window = deque(maxlen=args.window_size)  # Env dones over last window_size episodes
    done_agents_window = deque(maxlen=args.window_size)  # Fraction of done agents over last window_size episodes
    reward_window = deque(maxlen=args.window_size)  # Cumulative rewards over last window_size episodes
    norm_reward_window = deque(maxlen=args.window_size)  # Normalized cumulative rewards over last window_size episodes
    # Track means over windows of window_size episodes
    mean_dones = []
    mean_agent_dones = []
    mean_rewards = []
    mean_norm_rewards = []
    # Episode rewards/dones/norm rewards since beginning of training TODO
    # env_dones = []

    crash = [False] * args.num_agents
    update_values = [False] * args.num_agents
    buffer_obs = [[]] * args.num_agents

    ############ Main loop
    for ep in range(args.num_episodes):
        cumulative_reward = 0
        env_done = 0
        altmaps = [None] * args.num_agents
        altpaths = [[]] * args.num_agents
        buffer_rew = [0] * args.num_agents
        buffer_done = [False] * args.num_agents
        curr_obs = [None] * args.num_agents

        maps, info = env.reset()
        if args.print:
            debug.print_bitmaps(maps)

        if args.render:
            env_renderer.reset()

        for step in range(max_steps - 1):
            # Save a copy of maps at the beginning
            buffer_maps = maps.copy()
            # Remember: the first bit is 0 for agents that have not yet departed

            for a in range(env.get_num_agents()):
                agent = env.agents[a]
                crash[a] = False
                update_values[a] = False
                network_action = None
                action = None

                # If the agent has arrived
                if agent.status == RailAgentStatus.DONE or agent.status == RailAgentStatus.DONE_REMOVED:
                    # TODO if the agent is not removed, you should leave a bit in the bitmap
                    # TODO? set the bitmap only the first time
                    maps[a, :, :] = 0
                    network_action = 0
                    action = RailEnvActions.DO_NOTHING

                # If the agent has not yet departed
                elif agent.status == RailAgentStatus.READY_TO_DEPART:
                    update_values[a] = True
                    obs = preprocessor.get_obs(a, maps[a], buffer_maps)
                    curr_obs[a] = obs.copy()

                    # Network chooses action
                    q_values = dqn.act(obs).cpu().data.numpy()
                    if np.random.random() > eps:
                        network_action = np.argmax(q_values)
                    else:
                        network_action = np.random.choice([0, 1])

                    if network_action == 0:
                        action = RailEnvActions.DO_NOTHING
                    else:  # Go
                        crash[a] = obs_builder.check_crash(a, maps)
                        if crash[a]:
                            network_action = 0
                            action = RailEnvActions.STOP_MOVING
                        else:
                            maps = obs_builder.update_bitmaps(a, maps)
                            action = obs_builder.get_agent_action(a)

                # If the agent is entering a switch
                elif obs_builder.is_before_switch(a) and info['action_required'][a]:
                    # If the altpaths cache is empty or does not contain the
                    # altpaths from the current agent's position, refresh it
                    if len(altpaths[a]) == 0 or agent.position != altpaths[a][0][0].position:
                        altmaps[a], altpaths[a] = obs_builder.get_altmaps(a)

                    if len(altmaps[a]) > 0:
                        update_values[a] = True
                        altobs = [None] * len(altmaps[a])
                        q_values = np.array([])
                        for i in range(len(altmaps[a])):
                            altobs[i] = preprocessor.get_obs(a, altmaps[a][i], buffer_maps)
                            q_values = np.concatenate([q_values, dqn.act(altobs[i]).cpu().data.numpy()])

                        # Epsilon-greedy action selection
                        if np.random.random() > eps:
                            argmax = np.argmax(q_values)
                            network_action = argmax % 2
                            best_i = argmax // 2
                        else:
                            network_action = np.random.choice([0, 1])
                            best_i = np.random.choice(np.arange(len(altmaps[a])))

                        # Use the new bitmaps and paths
                        maps[a, :, :] = altmaps[a][best_i]
                        obs_builder.set_agent_path(a, altpaths[a][best_i])
                        curr_obs[a] = altobs[best_i].copy()
                    else:
                        print('[ERROR] NO ALTPATHS EP: {} STEP: {} AGENT: {}'.format(ep, step, a))
                        network_action = 0

                    if network_action == 0:
                        action = RailEnvActions.STOP_MOVING
                    else:
                        crash[a] = obs_builder.check_crash(a, maps, is_before_switch=True)
                        if crash[a]:
                            network_action = 0
                            action = RailEnvActions.STOP_MOVING
                        else:
                            action = obs_builder.get_agent_action(a)
                            maps = obs_builder.update_bitmaps(a, maps, is_before_switch=True)

                # If the agent is following a rail
                elif info['action_required'][a]:
                    crash[a] = obs_builder.check_crash(a, maps)
                    if crash[a]:
                        network_action = 0
                        action = RailEnvActions.STOP_MOVING
                    else:
                        network_action = 1
                        action = obs_builder.get_agent_action(a)
                        maps = obs_builder.update_bitmaps(a, maps)

                else:  # No action required
                    network_action = 1
                    action = RailEnvActions.DO_NOTHING
                    maps = obs_builder.update_bitmaps(a, maps)

                network_action_dict.update({a: network_action})
                railenv_action_dict.update({a: action})

            # Obs is computed from bitmaps while state is computed from the env step (temporarily)
            _, reward, done, info = env.step(railenv_action_dict)  # Env step

            if args.render:
                env_renderer.render_env(show=True, show_observations=False, show_predictions=True)

            if args.debug:
                for a in range(env.get_num_agents()):
                    print('#########################################')
                    print('Info for agent {}'.format(a))
                    print('Status: {}'.format(info['status'][a]))
                    print('Position: {}'.format(env.agents[a].position))
                    print('Target: {}'.format(env.agents[a].target))
                    print('Moving? {} at speed: {}'.format(env.agents[a].moving, info['speed'][a]))
                    print('Action required? {}'.format(info['action_required'][a]))
                    print('Network action: {}'.format(network_action_dict[a]))
                    print('Railenv action: {}'.format(railenv_action_dict[a]))

            # Update replay buffer and train agent
            if args.train:
                for a in range(env.get_num_agents()):
                    if args.crash_penalty and crash[a]:
                        # Store bad experience
                        dqn.step(curr_obs[a], 1, -100, curr_obs[a], True)

                    if not args.switch2switch:
                        if update_values[a] and not buffer_done[a]:
                            next_obs = preprocessor.get_obs(a, maps[a], maps)
                            dqn.step(curr_obs[a], network_action_dict[a], reward[a], next_obs, done[a])
                    else:
                        if update_values[a] and not buffer_done[a]:
                            # If we had an obs from a previous switch
                            if len(buffer_obs[a]) != 0:
                                dqn.step(buffer_obs[a], 1, buffer_rew[a], curr_obs[a], done[a])
                                buffer_obs[a] = []
                                buffer_rew[a] = 0

                            if network_action_dict[a] == 0:
                                dqn.step(curr_obs[a], 1, reward[a], curr_obs[a], False)
                            elif network_action_dict[a] == 1:
                                # Store the obs and update at the next switch
                                buffer_obs[a] = curr_obs[a].copy()

                    # Cache the reward only if we have an obs from a previous switch
                    if len(buffer_obs[a]) != 0:
                        buffer_rew[a] += reward[a]
                    # Now update the done cache to avoid adding the experience many times
                    buffer_done[a] = done[a]

            for a in range(env.get_num_agents()):
                cumulative_reward += reward[a]  # / env.get_num_agents()  # Update cumulative reward (not normalized)

            # TODO? env sets done[all] = True for everyone when the time limit is reached
            # devid: I also remember this, but while debugging it doesn't seem to happen
            if done['__all__']:
                env_done = 1
                break

        ################### End of the episode
        eps = max(args.end_eps, args.eps_decay * eps)  # Decrease epsilon

        # Metrics
        done_window.append(env_done)  # Save done in this episode
        num_agents_done = 0  # Number of agents that reached their target in the last episode
        for a in range(env.get_num_agents()):
            if done[a]:
                num_agents_done += 1
        done_agents_window.append(num_agents_done / env.get_num_agents())
        reward_window.append(cumulative_reward)  # Save cumulative reward in this episode
        normalized_reward = cumulative_reward / (env.compute_max_episode_steps(env.width, env.height) + env.get_num_agents())
        norm_reward_window.append(normalized_reward)

        mean_dones.append(np.mean(done_window))
        mean_agent_dones.append(np.mean(done_agents_window))
        mean_rewards.append(np.mean(reward_window))
        mean_norm_rewards.append(np.mean(norm_reward_window))

        # Print training results info
        print(
            '\r{} Agents on ({},{}). Episode: {}\t Mean done agents: {:.2f}\t Mean reward: {:.2f}\t Mean normalized reward: {:.2f}\t Done agents in last episode: {:.2f}%\t Epsilon: {:.2f}'
            .format(
                env.get_num_agents(), args.width, args.height,
                ep,
                mean_agent_dones[-1],  # Fraction of done agents
                mean_rewards[-1],
                mean_norm_rewards[-1],
                (num_agents_done / args.num_agents),
                eps),
            end=" ")

        if ep != 0 and (ep + 1) % args.checkpoint_interval == 0:
            print(
                '\r{} Agents on ({},{}). Episode: {}\t Mean done agents: {:.2f}\t Mean reward: {:.2f}\t Mean normalized reward: {:.2f}\t Epsilon: {:.2f}'
                .format(
                    env.get_num_agents(), args.width, args.height,
                    ep,
                    mean_agent_dones[-1],
                    mean_rewards[-1],
                    mean_norm_rewards[-1],
                    eps))

        if args.train and ep != 0 and (ep + 1) % args.save_interval == 0:
            torch.save(dqn.qnetwork_local.state_dict(), results_dir + '/weights.pt')

        if args.plot:
            writer.add_scalar('mean_agent_dones', mean_agent_dones[-1], ep)
            writer.add_scalar('mean_rewards', mean_rewards[-1], ep)
            writer.add_scalar('mean_dones', mean_dones[-1], ep)
            writer.add_scalar('mean_norm_rewards', mean_norm_rewards[-1], ep)
            writer.add_scalar('epsilon', eps, ep)
def main(argv):
    random.seed(1)
    np.random.seed(1)

    # Initialize a random map with a random number of agents
    x_dim = np.random.randint(20, 40)
    y_dim = np.random.randint(20, 40)
    n_agents = np.random.randint(3, 4)
    n_goals = n_agents + np.random.randint(0, 3)
    min_dist = int(0.75 * min(x_dim, y_dim))
    tree_depth = 4

    # Get an observation builder and predictor
    predictor = ShortestPathPredictorForRailEnv()
    observation_helper = TreeObsForRailEnv(max_depth=tree_depth, predictor=predictor)

    # Use the malfunction generator to break agents from time to time
    stochastic_data = {'prop_malfunction': 0.0,  # Percentage of defective agents
                       'malfunction_rate': 0,  # Rate of malfunction occurrence
                       'min_duration': 3,  # Minimal duration of malfunction
                       'max_duration': 20  # Max duration of malfunction
                       }

    # Different agent types (trains) with different speeds.
    speed_ration_map = {1.: 0.25,  # Fast passenger train
                        1. / 2.: 0.25,  # Fast freight train
                        1. / 3.: 0.25,  # Slow commuter train
                        1. / 4.: 0.25}  # Slow freight train

    env = RailEnv(width=x_dim,
                  height=y_dim,
                  rail_generator=sparse_rail_generator(max_num_cities=3,  # Number of cities in map (where train stations are)
                                                       seed=1,  # Random seed
                                                       grid_mode=False,
                                                       max_rails_between_cities=2,
                                                       max_rails_in_city=3),
                  schedule_generator=sparse_schedule_generator(speed_ration_map),
                  number_of_agents=n_agents,
                  stochastic_data=stochastic_data,  # Malfunction data generator
                  obs_builder_object=observation_helper)
    env.reset(True, True)

    # Initialize the renderer
    env_renderer = RenderTool(env, gl="PILSVG",
                              agent_render_variant=AgentRenderVariant.AGENT_SHOWS_OPTIONS_AND_BOX,
                              show_debug=False,
                              screen_height=1000,  # Adjust these parameters to fit your resolution
                              screen_width=1000)   # Adjust these parameters to fit your resolution
    handle = env.get_agent_handles()

    num_features_per_node = env.obs_builder.observation_dim
    nr_nodes = 0
    for i in range(tree_depth + 1):
        nr_nodes += np.power(4, i)
    state_size = 2 * num_features_per_node * nr_nodes
    action_size = 5

    n_trials = 10
    observation_radius = 10
    max_steps = int(3 * (env.height + env.width))
    action_dict = dict()
    time_obs = deque(maxlen=2)
    agent_obs = [None] * env.get_num_agents()

    # Init and load agent
    agent = Agent(state_size, action_size)
    with path(fc_treeobs.nets, "multi_agent_2ts_checkpoint200.pth") as file_in:
        agent.qnetwork_local.load_state_dict(torch.load(file_in))

    # Vars used to record agent performance
    record_images = False
    frame_step = 0

    for trials in range(1, n_trials + 1):
        # Reset environment
        obs, info = env.reset(True, True)
        env_renderer.reset()

        # Build the first two-time-step observation
        for a in range(env.get_num_agents()):
            obs[a] = normalize_observation(obs[a], tree_depth, observation_radius=10)
        # Accumulate two time steps of observation (here just twice the first state)
        for i in range(2):
            time_obs.append(obs)
        # Build the agent-specific double time-step observation
        for a in range(env.get_num_agents()):
            agent_obs[a] = np.concatenate((time_obs[0][a], time_obs[1][a]))

        # Run episode
        for step in range(max_steps):
            time.sleep(0.01)
            env_renderer.render_env(show=True, show_observations=False, show_predictions=True)

            if record_images:
                env_renderer.gl.save_image("./Images/Avoiding/flatland_frame_{:04d}.bmp".format(frame_step))
                frame_step += 1

            # Perform action for each agent
            for a in range(env.get_num_agents()):
                action = agent.act(agent_obs[a], eps=0)
                action_dict.update({a: action})

            # Environment step
            next_obs, all_rewards, done, _ = env.step(action_dict)

            # Collect observation after environment step
            for a in range(env.get_num_agents()):
                next_obs[a] = normalize_observation(next_obs[a], tree_depth, observation_radius=10)

            # Add the new obs to the obs vector
            # Since time_obs is a deque with maxlen=2, an append on the right side when the deque
            # is full pops the element on the left side
            time_obs.append(next_obs)

            # Create obs using the obs at time step t-1 and the obs at time step t
            for a in range(env.get_num_agents()):
                agent_obs[a] = np.concatenate((time_obs[0][a], time_obs[1][a]))

            if done['__all__']:
                break
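# Side note on the two-time-step trick above: because time_obs is a deque with
# maxlen=2, appending when it is full silently drops the oldest entry, so the
# deque always holds exactly the observations of steps t-1 and t, e.g.:
#   d = deque(maxlen=2); d.append(1); d.append(2); d.append(3)  # -> deque([2, 3])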
# Construct the environment with the given observation, generators, predictors, and stochastic data
env = RailEnv(width=width,
              height=height,
              rail_generator=rail_generator,
              schedule_generator=schedule_generator,
              number_of_agents=nr_trains,
              stochastic_data=stochastic_data,  # Malfunction data generator
              obs_builder_object=observation_builder,
              remove_agents_at_target=True)  # Removes agents at the end of their journey to make space for others

# Initialize the renderer
env_renderer = RenderTool(env, gl="PILSVG",
                          agent_render_variant=AgentRenderVariant.AGENT_SHOWS_OPTIONS_AND_BOX,
                          show_debug=False,
                          screen_height=800,  # Adjust these parameters to fit your resolution
                          screen_width=800)   # Adjust these parameters to fit your resolution

plt.ion()
plt.show()

succ_best = 1
tries_best = 1
succ_stoch = 1
tries_stoch = 1
use_best = False

while True:
    episode_done = False
    episode_reward = 0
def main(argv):
    try:
        opts, args = getopt.getopt(argv, "n:", ["n_trials="])
    except getopt.GetoptError:
        print('training_navigation.py -n <n_trials>')
        sys.exit(2)
    for opt, arg in opts:
        if opt in ('-n', '--n_trials'):
            n_trials = int(arg)

    random.seed(1)
    np.random.seed(1)

    # Parameters for the Environment
    # Multi agent (4000 iterations)
    x_dim = 16 * 3
    y_dim = 9 * 3
    n_agents = 5
    max_num_cities = 5
    max_rails_between_cities = 2
    max_rails_in_city = 3

    # Single agent (1000 iterations)
    x_dim1 = 16 * 4
    y_dim1 = 9 * 4
    n_agents1 = 1
    max_num_cities1 = 9
    max_rails_between_cities1 = 5
    max_rails_in_city1 = 5

    # Use the malfunction generator to break agents from time to time
    # stochastic_data = {'malfunction_rate': 8000,  # Rate of malfunction occurrence for a single agent
    #                    'min_duration': 15,  # Minimal duration of malfunction
    #                    'max_duration': 50  # Max duration of malfunction
    #                    }

    # Custom observation builder
    tree_depth = 2
    TreeObservation = TreeObsForRailEnv(max_depth=tree_depth, predictor=ShortestPathPredictorForRailEnv(20))

    # Different agent types (trains) with different speeds.
    speed_ration_map = {1.: 1.0,  # Fast passenger train
                        1. / 2.: 0.0,  # Fast freight train
                        1. / 3.: 0.0,  # Slow commuter train
                        1. / 4.: 0.0}  # Slow freight train

    env = RailEnv(width=x_dim,
                  height=y_dim,
                  rail_generator=sparse_rail_generator(max_num_cities=max_num_cities,  # Number of cities in map (where train stations are)
                                                       seed=1,  # Random seed
                                                       grid_mode=False,
                                                       max_rails_between_cities=max_rails_between_cities,
                                                       max_rails_in_city=max_rails_in_city),
                  schedule_generator=sparse_schedule_generator(speed_ration_map),
                  number_of_agents=n_agents,
                  # malfunction_generator_and_process_data=malfunction_from_params(stochastic_data),  # Malfunction data generator
                  obs_builder_object=TreeObservation)

    env1 = RailEnv(width=x_dim1,
                   height=y_dim1,
                   rail_generator=sparse_rail_generator(max_num_cities=max_num_cities1,  # Number of cities in map (where train stations are)
                                                        seed=786,  # Random seed
                                                        grid_mode=False,
                                                        max_rails_between_cities=max_rails_between_cities1,
                                                        max_rails_in_city=max_rails_in_city1),
                   schedule_generator=sparse_schedule_generator(speed_ration_map),
                   number_of_agents=n_agents1,
                   # malfunction_generator_and_process_data=malfunction_from_params(stochastic_data),  # Malfunction data generator
                   obs_builder_object=TreeObservation)

    env.reset(True, True)
    env1.reset(True, True)

    # After training we want to render the results, so we also load a renderer
    # env_renderer = RenderTool(env, gl="PILSVG",
    #                           screen_height=800,  # Adjust these parameters to fit your resolution
    #                           screen_width=900)
    env1_renderer = RenderTool(env1, gl="PILSVG",
                               screen_height=800,  # Adjust these parameters to fit your resolution
                               screen_width=900)

    # Given the depth of the tree observation and the number of features per node we get the following state_size
    num_features_per_node = env.obs_builder.observation_dim
    nr_nodes = 0
    for i in range(tree_depth + 1):
        nr_nodes += np.power(4, i)
    state_size = num_features_per_node * nr_nodes

    # The action space of flatland is 5 discrete actions
    action_size = 5

    # We set the number of episodes we would like to train on
    if 'n_trials' not in locals():
        n_trials = 15000

    # And the max number of steps we want to take per episode
    max_steps = int(3 * (env.height + env.width))
    max_steps1 = int(3 * (env1.height + env1.width))

    # Define training parameters
    eps = 1.
    eps_end = 0.005
    eps_decay = 0.9985

    # And some variables to keep track of the progress
    action_dict = dict()
    final_action_dict = dict()
    scores_window = deque(maxlen=100)
    done_window = deque(maxlen=100)
    deadlock_window = deque(maxlen=100)
    deadlock_average = []
    scores = []
    dones_list = []

    # Metrics
    eps_list = []
    action_prob_list = []

    action_prob = [0] * action_size
    agent_obs = [None] * env.get_num_agents()
    agent_next_obs = [None] * env.get_num_agents()
    agent_obs_buffer = [None] * env.get_num_agents()
    agent_action_buffer = [2] * env.get_num_agents()
    cummulated_reward = np.zeros(env.get_num_agents())
    update_values = False

    # Now we load a Double Dueling DQN agent
    agent = Agent(state_size, action_size)

    for trials in range(1, n_trials + 1):
        if trials > 1000:  # Multi-agent env after the first 1000 trials
            # Reset environment
            obs, info = env.reset(True, True)
            # env_renderer.reset()
            # Build agent-specific observations
            for a in range(env.get_num_agents()):
                if obs[a]:
                    agent_obs[a] = normalize_observation(obs[a], tree_depth, observation_radius=10)
                    agent_obs_buffer[a] = agent_obs[a].copy()

            # Reset score and done
            score = 0
            env_done = 0

            # Run episode
            for step in range(max_steps):
                # Action
                for a in range(env.get_num_agents()):
                    if info['action_required'][a]:
                        # If an action is required, we want to store the obs at that step as well as the action
                        update_values = True
                        action = agent.act(agent_obs[a], eps=eps)
                        action_prob[action] += 1
                    else:
                        update_values = False
                        action = 0
                    action_dict.update({a: action})

                # Environment step
                next_obs, all_rewards, done, deadlocks, info = env.step(action_dict)
                # env_renderer.render_env(show=True, show_predictions=True, show_observations=True)

                # Update replay buffer and train agent
                for a in range(env.get_num_agents()):
                    # Only update the values when we are done or when an action was taken and thus relevant information is present
                    if update_values or done[a]:
                        agent.step(agent_obs_buffer[a], agent_action_buffer[a], all_rewards[a], agent_obs[a], done[a])
                        cummulated_reward[a] = 0.
                        agent_obs_buffer[a] = agent_obs[a].copy()
                        agent_action_buffer[a] = action_dict[a]
                    if next_obs[a]:
                        agent_obs[a] = normalize_observation(next_obs[a], tree_depth, observation_radius=10)
                    score += all_rewards[a] / env.get_num_agents()

                # Copy observation
                if done['__all__']:
                    env_done = 1
                    break
        else:  # Single-agent env for the first 1000 trials
            # Reset environment
            obs, info = env1.reset(True, True)
            # env1_renderer.reset()
            # Build agent-specific observations
            for a in range(env1.get_num_agents()):
                if obs[a]:
                    agent_obs[a] = normalize_observation(obs[a], tree_depth, observation_radius=10)
                    agent_obs_buffer[a] = agent_obs[a].copy()

            # Reset score and done
            score = 0
            env_done = 0

            # Run episode
            for step in range(max_steps1):
                # Action
                for a in range(env1.get_num_agents()):
                    if info['action_required'][a]:
                        # If an action is required, we want to store the obs at that step as well as the action
                        update_values = True
                        action = agent.act(agent_obs[a], eps=eps)
                        action_prob[action] += 1
                    else:
                        update_values = False
                        action = 0
                    action_dict.update({a: action})

                # Environment step
                next_obs, all_rewards, done, deadlocks, info = env1.step(action_dict)
                # env1_renderer.render_env(show=True, show_predictions=True, show_observations=True)

                # Update replay buffer and train agent
                for a in range(env1.get_num_agents()):
                    # Only update the values when we are done or when an action was taken and thus relevant information is present
                    if update_values or done[a]:
                        agent.step(agent_obs_buffer[a], agent_action_buffer[a], all_rewards[a], agent_obs[a], done[a])
                        cummulated_reward[a] = 0.
                        agent_obs_buffer[a] = agent_obs[a].copy()
                        agent_action_buffer[a] = action_dict[a]
                    if next_obs[a]:
                        agent_obs[a] = normalize_observation(next_obs[a], tree_depth, observation_radius=10)
                    score += all_rewards[a] / env1.get_num_agents()

                # Copy observation
                if done['__all__']:
                    env_done = 1
                    break

        # Epsilon decay
        eps = max(eps_end, eps_decay * eps)  # Decrease epsilon

        # Collect information about the training
        tasks_finished = 0
        if trials > 1000:
            for _idx in range(env.get_num_agents()):
                if done[_idx] == 1:
                    tasks_finished += 1
            done_window.append(tasks_finished / max(1, env.get_num_agents()))
            scores_window.append(score / max_steps)  # Save most recent score
            scores.append(np.mean(scores_window))
            deadlock_window.append(deadlocks.count(1) / max(1, env.get_num_agents()))
            deadlock_average.append(np.mean(deadlock_window))
            dones_list.append(np.mean(done_window))
            x_dim_current = x_dim
            y_dim_current = y_dim
            agent_num = env.get_num_agents()
        else:
            for _idx in range(env1.get_num_agents()):
                if done[_idx] == 1:
                    tasks_finished += 1
            done_window.append(tasks_finished / max(1, env1.get_num_agents()))
            scores_window.append(score / max_steps1)  # Save most recent score
            scores.append(np.mean(scores_window))
            deadlock_window.append(deadlocks.count(1) / max(1, env1.get_num_agents()))
            deadlock_average.append(np.mean(deadlock_window))
            dones_list.append(np.mean(done_window))
            x_dim_current = x_dim1
            y_dim_current = y_dim1
            agent_num = env1.get_num_agents()

        eps_list.append(eps)
        action_prob_list.append(action_prob / np.sum(action_prob))

        print(
            '\rTraining {} Agents on ({},{}).\t Episode {}\t Average Score: {:.3f}\tDones: {:.2f} %\tDeadlocks: {:.2f} \tEpsilon: {:.2f} \t Action Probabilities: \t {}'.format(
                agent_num, x_dim_current, y_dim_current,
                trials,
                np.mean(scores_window),
                100 * np.mean(done_window),
                np.mean(deadlock_window),
                eps,
                action_prob / np.sum(action_prob)), end=" ")

        if trials % 100 == 0:
            print(
                '\rTraining {} Agents on ({},{}).\t Episode {}\t Average Score: {:.3f}\tDones: {:.2f}%\tEpsilon: {:.2f} \t Action Probabilities: \t {}'.format(
                    env.get_num_agents(), x_dim_current, y_dim_current,
                    trials,
                    np.mean(scores_window),
                    100 * np.mean(done_window),
                    eps,
                    action_prob / np.sum(action_prob)))
            torch.save(agent.qnetwork_local.state_dict(),
                       path.join('Nets', ('navigator_checkpoint' + str(trials) + '.pth')))
            action_prob = [1] * action_size

        if trials % 50 == 0:
            np.savetxt(fname=path.join('Nets', 'metrics.csv'),
                       X=np.transpose(np.asarray([scores, dones_list, deadlock_average, eps_list])),
                       delimiter=';', newline='\n')
            np.savetxt(fname=path.join('Nets', 'action_prob.csv'),
                       X=np.asarray(action_prob_list),
                       delimiter=';', newline='\n')

    # Plot overall training progress at the end
    plt.plot(scores)
    plt.show()
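# Side note on the schedule above: with eps_decay = 0.9985 and eps_end = 0.005,
# epsilon reaches its floor after roughly log(0.005) / log(0.9985) ≈ 3530
# episodes, so with the default n_trials = 15000 most of the run happens at
# minimal exploration (back-of-the-envelope figure, assuming one decay per episode).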
def main(args):
    try:
        opts, args = getopt.getopt(args, "", ["sleep-for-animation="])
    except getopt.GetoptError as err:
        print(str(err))  # will print something like "option -a not recognized"
        sys.exit(2)
    sleep_for_animation = True
    for o, a in opts:
        if o == "--sleep-for-animation":
            sleep_for_animation = str2bool(a)
        else:
            assert False, "unhandled option"

    batch_builder = SampleBatchBuilder()  # or MultiAgentSampleBatchBuilder
    writer = JsonWriter("./")

    # Setting visuals to True can slow down training
    # (sleep_for_animation is taken from the command line above)
    visuals = False
    if visuals:
        from flatland.utils.rendertools import RenderTool

    max_depth = 30
    tree_depth = 2
    trial_start = 0
    n_trials = 97
    start = 0

    columns = ['Agents', 'X_DIM', 'Y_DIM', 'TRIAL_NO', 'REWARD',
               'NORMALIZED_REWARD', 'DONE_RATIO', 'STEPS', 'ACTION_PROB']
    df_all_results = pd.DataFrame(columns=columns)

    for trials in range(trial_start, n_trials + 1):
        step = 0
        obs_builder_object = TreeObsForRailEnv(
            max_depth=tree_depth,
            predictor=ShortestPathPredictorForRailEnv(max_depth))
        env_file = f"../env_configs/test-envs-small/Test_0/Level_{trials}.mpk"
        env = RailEnv(width=1, height=1,
                      rail_generator=rail_from_file(env_file),
                      schedule_generator=schedule_from_file(env_file),
                      malfunction_generator_and_process_data=malfunction_from_file(env_file),
                      obs_builder_object=obs_builder_object)
        obs, info = env.reset(regenerate_rail=True,
                              regenerate_schedule=True,
                              activate_agents=False,
                              random_seed=1001)

        # the expert actions are read back as JSON
        file = f"../env_configs/actions-small/Test_0/Level_{trials}.mpk"
        with open(file, "r") as files:
            expert_actions = json.load(files)

        n_agents = env.get_num_agents()
        x_dim, y_dim = env.width, env.height
        agent_obs = [None] * n_agents
        agent_obs_buffer = [None] * n_agents
        done = dict()
        done["__all__"] = False

        if imitate:  # `imitate` is a module-level flag
            agent_action_buffer = list(expert_actions[step].values())
        else:
            # , p=[0.2, 0, 0.5])  # [0] * n_agents
            agent_action_buffer = np.random.choice(5, n_agents, replace=True)
        update_values = [False] * n_agents

        max_steps = int(4 * 2 * (20 + env.height + env.width))
        action_size = 5  # 3

        # And some variables to keep track of the progress
        action_dict = dict()
        scores_window = deque(maxlen=100)
        reward_window = deque(maxlen=100)
        done_window = deque(maxlen=100)
        action_prob = [0] * action_size
        # agent = Agent(state_size, action_size)

        if visuals:
            env_renderer = RenderTool(env, gl="PILSVG")
            env_renderer.render_env(show=True, frames=True, show_observations=True)

        for a in range(n_agents):
            if obs[a]:
                agent_obs[a] = normalize_observation(obs[a], tree_depth, observation_radius=10)
                agent_obs_buffer[a] = agent_obs[a].copy()

        # Reset score and done
        score = 0
        agent_action_buffer = np.zeros(n_agents)
        # prev_action = np.zeros_like(env.action_space.sample())
        prev_reward = np.zeros(n_agents)

        for step in range(max_steps):
            for a in range(n_agents):
                if info['action_required'][a]:
                    if imitate:
                        if step < len(expert_actions):
                            action = expert_actions[step][str(a)]
                        else:
                            action = 0
                    else:
                        action = 0
                    action_prob[action] += 1
                    update_values[a] = True
                else:
                    update_values[a] = False
                    action = 0
                action_dict.update({a: action})

            next_obs, all_rewards, done, info = env.step(action_dict)

            for a in range(n_agents):
                if next_obs[a] is not None:
                    agent_obs[a] = normalize_observation(next_obs[a], tree_depth, observation_radius=10)
                # Only update the values when we are done or when an action
                # was taken and thus relevant information is present
                if update_values[a] or done[a]:
                    start += 1
                    batch_builder.add_values(
                        t=step,
                        eps_id=trials,
                        agent_index=0,
                        obs=agent_obs_buffer[a],
                        actions=action_dict[a],
                        action_prob=1.0,  # TODO put the true action probability
                        rewards=all_rewards[a],
                        prev_actions=agent_action_buffer[a],
                        prev_rewards=prev_reward[a],
                        dones=done[a],
                        infos=info['action_required'][a],
                        new_obs=agent_obs[a])
                agent_obs_buffer[a] = agent_obs[a].copy()
                agent_action_buffer[a] = action_dict[a]
                prev_reward[a] = all_rewards[a]
                score += all_rewards[a]  # / env.get_num_agents()

            if visuals:
                env_renderer.render_env(show=True, frames=True, show_observations=True)
                if sleep_for_animation:
                    time.sleep(0.5)

            # flush the collected transitions, even when the episode did not finish
            if done["__all__"] or step == max_steps - 1:
                writer.write(batch_builder.build_and_reset())
                break

            # Collect information about training
            if step % 100 == 0:
                tasks_finished = 0
                for current_agent in env.agents:
                    if current_agent.status == RailAgentStatus.DONE_REMOVED:
                        tasks_finished += 1
                print(
                    '\rTrial No {} Training {} Agents on ({},{}).\t Steps {}\t Reward: {:.3f}\t Normalized Reward: {:.3f}\tDones: {:.2f}%\t'
                    .format(trials, env.get_num_agents(), x_dim, y_dim, step,
                            score, score / (max_steps + n_agents),
                            100 * tasks_finished / max(1, env.get_num_agents())),
                    end=" ")

        tasks_finished = 0
        for current_agent in env.agents:
            if current_agent.status == RailAgentStatus.DONE_REMOVED:
                tasks_finished += 1
        done_window.append(tasks_finished / max(1, env.get_num_agents()))
        reward_window.append(score)
        scores_window.append(score / (max_steps + n_agents))

        data = [[n_agents, x_dim, y_dim, trials,
                 np.mean(reward_window), np.mean(scores_window),
                 100 * np.mean(done_window), step,
                 action_prob / np.sum(action_prob)]]
        df_cur = pd.DataFrame(data, columns=columns)
        df_all_results = pd.concat([df_all_results, df_cur])

        if imitate:
            df_all_results.to_csv('TreeImitationLearning_DQN_TrainingResults.csv', index=False)

        print(
            '\rTrial No {} Training {} Agents on ({},{}).\t Total Steps {}\t Reward: {:.3f}\t Normalized Reward: {:.3f}\tDones: {:.2f}%\t'
            .format(trials, env.get_num_agents(), x_dim, y_dim, step,
                    np.mean(reward_window), np.mean(scores_window),
                    100 * np.mean(done_window)), end=" ")

        if visuals:
            env_renderer.close_window()
        gc.collect()
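# -----------------------------------------------------------------------------
# Added note (a sketch, not part of the original script): the JsonWriter above
# emits RLlib-compatible sample batches into "./". Assuming the same ray/rllib
# version that provides SampleBatchBuilder, the recorded demonstrations can be
# read back for offline training (e.g. behavioural cloning) roughly like this:
from ray.rllib.offline.json_reader import JsonReader


def replay_recorded_batches(path="./", n_batches=1):
    """Read back the demonstration batches written by JsonWriter above."""
    reader = JsonReader(path)
    for _ in range(n_batches):
        batch = reader.next()  # returns one SampleBatch per call
        print(batch["obs"].shape, batch["actions"][:10])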
def handle_env_create(self, command):
    """
    Handles an ENV_CREATE command from the client.
    TODO: Add a high-level summary of everything that's happening here.
    """
    if not self.simulation_done:
        # trying to reset a simulation before finishing the previous one
        _command_response = self._error_template("CAN'T CREATE NEW ENV BEFORE PREVIOUS IS DONE")
        self.send_response(_command_response, command)
        raise Exception(_command_response['payload'])

    self.simulation_count += 1
    self.simulation_done = False
    if self.simulation_count < len(self.env_file_paths):
        """
        There are still test envs left that are yet to be evaluated
        """
        test_env_file_path = self.env_file_paths[self.simulation_count]
        print("Evaluating : {}".format(test_env_file_path))
        test_env_file_path = os.path.join(self.test_env_folder, test_env_file_path)
        del self.env
        self.env = RailEnv(width=1, height=1,
                           rail_generator=rail_from_file(test_env_file_path),
                           schedule_generator=schedule_from_file(test_env_file_path),
                           malfunction_generator_and_process_data=malfunction_from_file(test_env_file_path),
                           obs_builder_object=DummyObservationBuilder())

        if self.begin_simulation:
            # If begin_simulation has already been initialized at least once,
            # this adds the simulation time for the previous episode
            self.simulation_times.append(time.time() - self.begin_simulation)
        self.begin_simulation = time.time()

        # Update evaluation metadata for the previous episode
        self.update_evaluation_metadata()

        # Start adding placeholders for the new episode
        self.simulation_env_file_paths.append(
            os.path.relpath(test_env_file_path, self.test_env_folder))  # relative path
        self.simulation_rewards.append(0)
        self.simulation_rewards_normalized.append(0)
        self.simulation_percentage_complete.append(0)
        self.simulation_steps.append(0)

        self.current_step = 0

        _observation, _info = self.env.reset(regenerate_rail=True,
                                             regenerate_schedule=True,
                                             activate_agents=False,
                                             random_seed=RANDOM_SEED)

        if self.visualize:
            current_env_path = self.env_file_paths[self.simulation_count]
            if current_env_path in self.video_generation_envs:
                self.env_renderer = RenderTool(self.env, gl="PILSVG")
            elif self.env_renderer:
                self.env_renderer = False

        _command_response = {}
        _command_response['type'] = messages.FLATLAND_RL.ENV_CREATE_RESPONSE
        _command_response['payload'] = {}
        _command_response['payload']['observation'] = _observation
        _command_response['payload']['env_file_path'] = self.env_file_paths[self.simulation_count]
        _command_response['payload']['info'] = _info
        _command_response['payload']['random_seed'] = RANDOM_SEED
    else:
        """
        All test env evaluations are complete
        """
        _command_response = {}
        _command_response['type'] = messages.FLATLAND_RL.ENV_CREATE_RESPONSE
        _command_response['payload'] = {}
        _command_response['payload']['observation'] = False
        _command_response['payload']['env_file_path'] = False
        _command_response['payload']['info'] = False
        _command_response['payload']['random_seed'] = False

    self.send_response(_command_response, command)

    #####################################################################
    # Update evaluation state
    #####################################################################
    progress = np.clip(self.simulation_count * 1.0 / len(self.env_file_paths), 0, 1)
    mean_reward, mean_normalized_reward, mean_percentage_complete = self.compute_mean_scores()
    self.evaluation_state["state"] = "IN_PROGRESS"
    self.evaluation_state["progress"] = progress
    self.evaluation_state["simulation_count"] = self.simulation_count
    self.evaluation_state["score"]["score"] = mean_percentage_complete
    self.evaluation_state["score"]["score_secondary"] = mean_reward
    self.evaluation_state["meta"]["normalized_reward"] = mean_normalized_reward
    self.handle_aicrowd_info_event(self.evaluation_state)
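# Hedged sketch (an assumption, not the original implementation): given the
# running lists maintained in handle_env_create above, compute_mean_scores()
# plausibly reduces the per-episode statistics like this:
#
#     def compute_mean_scores(self):
#         mean_reward = np.mean(self.simulation_rewards)
#         mean_normalized_reward = np.mean(self.simulation_rewards_normalized)
#         mean_percentage_complete = np.mean(self.simulation_percentage_complete)
#         return mean_reward, mean_normalized_reward, mean_percentage_complete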
def main(argv):
    try:
        opts, args = getopt.getopt(argv, "n:", ["n_trials="])
    except getopt.GetoptError:
        print('training_navigation.py -n <n_trials>')
        sys.exit(2)
    for opt, arg in opts:
        if opt in ('-n', '--n_trials'):
            n_trials = int(arg)

    random.seed(1)
    np.random.seed(1)

    # Parameters for the Environment
    x_dim = 35
    y_dim = 35
    n_agents = 1

    # Use the malfunction generator to break agents from time to time
    stochastic_data = {
        'prop_malfunction': 0.0,   # Percentage of defective agents
        'malfunction_rate': 30,    # Rate of malfunction occurrence
        'min_duration': 3,         # Minimal duration of malfunction
        'max_duration': 20         # Max duration of malfunction
    }

    # Custom observation builder
    TreeObservation = TreeObsForRailEnv(max_depth=2)

    # Different agent types (trains) with different speeds.
    speed_ration_map = {
        1.: 0.,        # Fast passenger train
        1. / 2.: 1.0,  # Fast freight train
        1. / 3.: 0.0,  # Slow commuter train
        1. / 4.: 0.0   # Slow freight train
    }

    env = RailEnv(width=x_dim, height=y_dim,
                  rail_generator=complex_rail_generator(nr_start_goal=10, nr_extra=2, min_dist=5, max_dist=99999),
                  number_of_agents=n_agents,
                  obs_builder_object=TreeObservation)

    # After training we want to render the results, so we also load a renderer
    env_renderer = RenderTool(env, gl="PILSVG")

    # Given the depth of the tree observation and the number of features per node we get the following state_size
    num_features_per_node = env.obs_builder.observation_dim
    tree_depth = 2
    nr_nodes = 0
    for i in range(tree_depth + 1):
        nr_nodes += np.power(4, i)
    state_size = num_features_per_node * nr_nodes

    # The action space of flatland is 5 discrete actions
    action_size = 5

    # We set the number of episodes we would like to train on
    if 'n_trials' not in locals():
        n_trials = 1

    # And the max number of steps we want to take per episode
    max_steps = int(3 * (env.height + env.width))

    # Define training parameters
    eps = 1.
    eps_end = 0.005
    eps_decay = 0.998

    # And some variables to keep track of the progress
    action_dict = dict()
    final_action_dict = dict()
    scores_window = deque(maxlen=100)
    done_window = deque(maxlen=100)
    scores = []
    dones_list = []
    action_prob = [0] * action_size
    agent_obs = [None] * env.get_num_agents()
    agent_next_obs = [None] * env.get_num_agents()
    agent_obs_buffer = [None] * env.get_num_agents()
    agent_action_buffer = [2] * env.get_num_agents()
    cummulated_reward = np.zeros(env.get_num_agents())
    update_values = False

    # Now we load a Double dueling DQN agent
    agent = Agent(state_size, action_size)
    # agent.load("models")

    for trials in range(1, n_trials + 1):
        # Reset environment (env.reset() returns (obs, info) in this flatland version)
        obs, info = env.reset()
        env_renderer.reset()
        # Build agent specific observations
        for a in range(env.get_num_agents()):
            if obs[a]:
                # agent_obs[a] = normalize_observation(obs[a], tree_depth, observation_radius=10)
                agent_obs_buffer[a] = obs[a].copy()

        # Reset score and done
        score = 0
        env_done = 0

        # Run episode
        for step in range(max_steps):
            # Action
            for a in range(env.get_num_agents()):
                # If an action is required, we want to store the obs at that step as well as the action
                update_values = True
                action = agent.act(obs[a], eps=eps)
                action_prob[action] += 1
                # else:
                #     update_values = False
                #     action = 0
                action_dict.update({a: action})

            # Environment step
            next_obs, all_rewards, done, info = env.step(action_dict)
            env_renderer.render_env(show=True, show_observations=True, show_predictions=False)

            # Update replay buffer and train agent
            for a in range(env.get_num_agents()):
                # Only update the values when we are done or when an action was taken and thus relevant information is present
                if update_values or done[a]:
                    agent.step(agent_obs_buffer[a], agent_action_buffer[a], all_rewards[a], obs[a], done[a])
                    cummulated_reward[a] = 0.
                    agent_obs_buffer[a] = obs[a].copy()
                    agent_action_buffer[a] = action_dict[a]
                # if next_obs[a]:
                #     agent_obs[a] = normalize_observation(next_obs[a], tree_depth, observation_radius=10)
                score += all_rewards[a] / env.get_num_agents()

            # Copy observation
            if done['__all__']:
                env_done = 1
                break

        # Epsilon decay
        eps = max(eps_end, eps_decay * eps)  # decrease epsilon

        # Collect information about training
        tasks_finished = 0
        for _idx in range(env.get_num_agents()):
            if done[_idx] == 1:
                tasks_finished += 1
        done_window.append(tasks_finished / max(1, env.get_num_agents()))
        scores_window.append(score / max_steps)  # save most recent score
        scores.append(np.mean(scores_window))
        dones_list.append((np.mean(done_window)))

        print(
            '\rTraining {} Agents on ({},{}).\t Episode {}\t Average Score: {:.3f}\tDones: {:.2f}%\tEpsilon: {:.2f} \t Action Probabilities: \t {}'
            .format(env.get_num_agents(), x_dim, y_dim, trials,
                    np.mean(scores_window), 100 * np.mean(done_window), eps,
                    action_prob / np.sum(action_prob)), end=" ")

        if trials % 100 == 0:
            print(
                '\rTraining {} Agents on ({},{}).\t Episode {}\t Average Score: {:.3f}\tDones: {:.2f}%\tEpsilon: {:.2f} \t Action Probabilities: \t {}'
                .format(env.get_num_agents(), x_dim, y_dim, trials,
                        np.mean(scores_window), 100 * np.mean(done_window), eps,
                        action_prob / np.sum(action_prob)))
            # tf.save_checkpoint(agent.qnetwork_local.state_dict(),
            #                    './Nets/navigator_checkpoint' + str(trials) + '.pth')
            action_prob = [1] * action_size  # reset counts (ones avoid a divide-by-zero in the next report)

    # Plot overall training progress at the end
    # plt.plot(scores)
    # plt.show()
    agent.save("first-run")
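# Added sanity check (not in the original): with eps_decay = 0.998 and
# eps_end = 0.005, epsilon only floors after ln(0.005)/ln(0.998) ≈ 2647
# episodes, so the default n_trials = 1 barely anneals the schedule.
import math

episodes_to_floor = math.log(0.005) / math.log(0.998)
print("epsilon reaches eps_end after ~{:.0f} episodes".format(episodes_to_floor))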
steps_by_episode.append(step)
total_rewards_by_episode.append(total_reward)
env.restart_agents()
if done:
    env.dones = {0: False, '__all__': False}

#%%
env = RailEnv(width=7, height=7,
              rail_generator=complex_rail_generator(nr_start_goal=10, nr_extra=1, min_dist=8, max_dist=99999, seed=1),
              schedule_generator=complex_schedule_generator(),
              number_of_agents=2,
              obs_builder_object=TreeObsForRailEnv(max_depth=2))
#env.reset()
env_renderer = RenderTool(env, agent_render_variant=3)
env_renderer.render_env(show=True, show_predictions=False, show_observations=False)

#%%
steps_by_episode = np.array(steps_by_episode)
plt.plot(moving_average(steps_by_episode, 200))

#%%
#qq = QTable(env, get_rail_coordinates(env), 3, 3, 3)
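#%%
# The cell above calls moving_average(), which is not defined in this file.
# A minimal sketch of such a helper (an assumption), using a flat trailing window:
def moving_average(values, window):
    """Smooth a 1-D series with a flat convolution kernel of length `window`."""
    kernel = np.ones(window) / window
    return np.convolve(values, kernel, mode='valid')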
def test_shortest_path_predictor(rendering=False):
    rail, rail_map = make_simple_rail()
    env = RailEnv(width=rail_map.shape[1],
                  height=rail_map.shape[0],
                  rail_generator=rail_from_grid_transition_map(rail),
                  schedule_generator=random_schedule_generator(),
                  number_of_agents=1,
                  obs_builder_object=TreeObsForRailEnv(max_depth=2, predictor=ShortestPathPredictorForRailEnv()),
                  )
    env.reset()

    # set the initial position
    agent = env.agents[0]
    agent.initial_position = (5, 6)  # south dead-end
    agent.position = (5, 6)  # south dead-end
    agent.direction = 0  # north
    agent.initial_direction = 0  # north
    agent.target = (3, 9)  # east dead-end
    agent.moving = True
    agent.status = RailAgentStatus.ACTIVE

    env.reset(False, False)

    if rendering:
        renderer = RenderTool(env, gl="PILSVG")
        renderer.render_env(show=True, show_observations=False)
        input("Continue?")

    # compute the observations and predictions
    distance_map = env.distance_map.get()
    expected_distance = 5.0
    actual_distance = distance_map[agent.handle, agent.initial_position[0],
                                   agent.initial_position[1], agent.direction]
    assert actual_distance == expected_distance, \
        "found {} instead of {}".format(actual_distance, expected_distance)

    paths = get_shortest_paths(env.distance_map)[0]
    assert paths == [
        WayPoint((5, 6), 0), WayPoint((4, 6), 0), WayPoint((3, 6), 0),
        WayPoint((3, 7), 1), WayPoint((3, 8), 1), WayPoint((3, 9), 1)
    ]

    # extract the data
    predictions = env.obs_builder.predictions
    positions = np.array(list(map(lambda prediction: [*prediction[1:3]], predictions[0])))
    directions = np.array(list(map(lambda prediction: [prediction[3]], predictions[0])))
    time_offsets = np.array(list(map(lambda prediction: [prediction[0]], predictions[0])))

    # test if data meets expectations: the agent walks the shortest path
    # (5,6) -> (4,6) -> (3,6) -> (3,7) -> (3,8) -> (3,9) (heading north, then
    # east), and stays at the target (3,9) for the rest of the horizon.
    expected_positions = [[5, 6], [4, 6], [3, 6], [3, 7], [3, 8]] + [[3, 9]] * 16
    expected_directions = [[Grid4TransitionsEnum.NORTH]] * 3 + [[Grid4TransitionsEnum.EAST]] * 18
    expected_time_offsets = np.array([[i] for i in range(21)], dtype=float)

    assert np.array_equal(time_offsets, expected_time_offsets), \
        "time_offsets {}, expected {}".format(time_offsets, expected_time_offsets)
    assert np.array_equal(positions, expected_positions), \
        "positions {}, expected {}".format(positions, expected_positions)
    assert np.array_equal(directions, expected_directions), \
        "directions {}, expected {}".format(directions, expected_directions)
def eval_policy(env_params, checkpoint, n_eval_episodes, max_steps, action_size, state_size, seed, render, allow_skipping, allow_caching):
    # Evaluation is faster on CPU (except if you use a really huge policy)
    parameters = {'use_gpu': False}
    policy = DDDQNPolicy(state_size, action_size, Namespace(**parameters), evaluation_mode=True)
    policy.qnetwork_local = torch.load(checkpoint)

    env_params = Namespace(**env_params)

    # Environment parameters
    n_agents = env_params.n_agents
    x_dim = env_params.x_dim
    y_dim = env_params.y_dim
    n_cities = env_params.n_cities
    max_rails_between_cities = env_params.max_rails_between_cities
    max_rails_in_city = env_params.max_rails_in_city

    # Malfunction and speed profiles
    # TODO pass these parameters properly from main!
    malfunction_parameters = MalfunctionParameters(
        malfunction_rate=1. / 2000,  # Rate of malfunctions
        min_duration=20,  # Minimal duration
        max_duration=50  # Max duration
    )

    # Only fast trains in Round 1
    speed_profiles = {
        1.: 1.0,       # Fast passenger train
        1. / 2.: 0.0,  # Fast freight train
        1. / 3.: 0.0,  # Slow commuter train
        1. / 4.: 0.0   # Slow freight train
    }

    # Observation parameters
    observation_tree_depth = env_params.observation_tree_depth
    observation_radius = env_params.observation_radius
    observation_max_path_depth = env_params.observation_max_path_depth

    # Observation builder
    predictor = ShortestPathPredictorForRailEnv(observation_max_path_depth)
    tree_observation = TreeObsForRailEnv(max_depth=observation_tree_depth, predictor=predictor)

    # Setup the environment
    env = RailEnv(
        width=x_dim, height=y_dim,
        rail_generator=sparse_rail_generator(
            max_num_cities=n_cities,
            grid_mode=False,
            max_rails_between_cities=max_rails_between_cities,
            max_rails_in_city=max_rails_in_city,
        ),
        schedule_generator=sparse_schedule_generator(speed_profiles),
        number_of_agents=n_agents,
        malfunction_generator_and_process_data=malfunction_from_params(malfunction_parameters),
        obs_builder_object=tree_observation)

    if render:
        env_renderer = RenderTool(env, gl="PGL")

    action_dict = dict()
    scores = []
    completions = []
    nb_steps = []
    inference_times = []
    preproc_times = []
    agent_times = []
    step_times = []

    for episode_idx in range(n_eval_episodes):
        seed += 1

        inference_timer = Timer()
        preproc_timer = Timer()
        agent_timer = Timer()
        step_timer = Timer()

        step_timer.start()
        obs, info = env.reset(regenerate_rail=True, regenerate_schedule=True, random_seed=seed)
        step_timer.end()

        agent_obs = [None] * env.get_num_agents()
        score = 0.0

        if render:
            env_renderer.set_new_rail()

        final_step = 0
        skipped = 0

        nb_hit = 0
        agent_last_obs = {}
        agent_last_action = {}

        # done must exist before the first skip check below, i.e. before env.step() ever ran
        done = {handle: False for handle in env.get_agent_handles()}
        done['__all__'] = False

        # FIXME why -1? bug where all agents are "done" after max_steps!
        for step in range(max_steps - 1):
            if allow_skipping and check_if_all_blocked(env):
                skipped = max_steps - step - 1
                final_step = max_steps - 2
                n_unfinished_agents = sum(not done[idx] for idx in env.get_agent_handles())
                score -= skipped * n_unfinished_agents
                break

            agent_timer.start()
            for agent in env.get_agent_handles():
                if obs[agent] and info['action_required'][agent]:
                    if agent in agent_last_obs and np.all(agent_last_obs[agent] == obs[agent]):
                        nb_hit += 1
                        action = agent_last_action[agent]
                    else:
                        preproc_timer.start()
                        norm_obs = normalize_observation(obs[agent],
                                                         tree_depth=observation_tree_depth,
                                                         observation_radius=observation_radius)
                        preproc_timer.end()

                        inference_timer.start()
                        action = policy.act(norm_obs, eps=0.0)
                        inference_timer.end()

                    action_dict.update({agent: action})

                    if allow_caching:
                        agent_last_obs[agent] = obs[agent]
                        agent_last_action[agent] = action
            agent_timer.end()

            step_timer.start()
            obs, all_rewards, done, info = env.step(action_dict)
            step_timer.end()

            if render:
                env_renderer.render_env(show=True, frames=False,
                                        show_observations=False, show_predictions=False)
                if step % 100 == 0:
                    print("{}/{}".format(step, max_steps - 1))

            for agent in env.get_agent_handles():
                score += all_rewards[agent]

            final_step = step

            if done['__all__']:
                break

        normalized_score = score / (max_steps * env.get_num_agents())
        scores.append(normalized_score)

        tasks_finished = sum(done[idx] for idx in env.get_agent_handles())
        completion = tasks_finished / max(1, env.get_num_agents())
        completions.append(completion)

        nb_steps.append(final_step)

        inference_times.append(inference_timer.get())
        preproc_times.append(preproc_timer.get())
        agent_times.append(agent_timer.get())
        step_times.append(step_timer.get())

        skipped_text = ""
        if skipped > 0:
            skipped_text = "\t⚡ Skipped {}".format(skipped)

        hit_text = ""
        if nb_hit > 0:
            hit_text = "\t⚡ Hit {} ({:.1f}%)".format(nb_hit, (100 * nb_hit) / (n_agents * final_step))

        print(
            "☑️ Score: {:.3f} \tDone: {:.1f}% \tNb steps: {:.3f} "
            "\t🍭 Seed: {}"
            "\t🚉 Env: {:.3f}s "
            "\t🤖 Agent: {:.3f}s (per step: {:.3f}s) \t[preproc: {:.3f}s \tinfer: {:.3f}s]"
            "{}{}".format(normalized_score, completion * 100.0, final_step, seed,
                          step_timer.get(), agent_timer.get(), agent_timer.get() / final_step,
                          preproc_timer.get(), inference_timer.get(), skipped_text, hit_text))

    return scores, completions, nb_steps, agent_times, step_times
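# Hedged usage sketch (values are illustrative, not from the source):
# eval_policy() expects env_params as a plain dict with exactly the keys
# unpacked above. The checkpoint path is hypothetical, and state_size must
# match the training-time tree observation (features per node depend on the
# flatland version in use).
if __name__ == "__main__":
    example_params = {
        "n_agents": 1, "x_dim": 25, "y_dim": 25, "n_cities": 2,
        "max_rails_between_cities": 2, "max_rails_in_city": 3,
        "observation_tree_depth": 2, "observation_radius": 10,
        "observation_max_path_depth": 20,
    }
    n_nodes = sum(4 ** i for i in range(2 + 1))  # 21 nodes for tree depth 2
    scores, completions, nb_steps, agent_times, step_times = eval_policy(
        example_params, "checkpoint.pth", n_eval_episodes=5, max_steps=200,
        action_size=5, state_size=11 * n_nodes,  # assumes 11 features per node
        seed=42, render=False, allow_skipping=True, allow_caching=True)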
def test_dummy_predictor(rendering=False): rail, rail_map = make_simple_rail2() env = RailEnv( width=rail_map.shape[1], height=rail_map.shape[0], rail_generator=rail_from_grid_transition_map(rail), schedule_generator=random_schedule_generator(), number_of_agents=1, obs_builder_object=TreeObsForRailEnv( max_depth=2, predictor=DummyPredictorForRailEnv(max_depth=10)), ) env.reset() # set initial position and direction for testing... env.agents[0].initial_position = (5, 6) env.agents[0].initial_direction = 0 env.agents[0].direction = 0 env.agents[0].target = (3, 0) env.reset(False, False) env.set_agent_active(env.agents[0]) if rendering: renderer = RenderTool(env, gl="PILSVG") renderer.render_env(show=True, show_observations=False) input("Continue?") # test assertions predictions = env.obs_builder.predictor.get(None) positions = np.array( list(map(lambda prediction: [*prediction[1:3]], predictions[0]))) directions = np.array( list(map(lambda prediction: [prediction[3]], predictions[0]))) time_offsets = np.array( list(map(lambda prediction: [prediction[0]], predictions[0]))) actions = np.array( list(map(lambda prediction: [prediction[4]], predictions[0]))) # compare against expected values expected_positions = np.array([ [5., 6.], [4., 6.], [3., 6.], [3., 5.], [3., 4.], [3., 3.], [3., 2.], [3., 1.], # at target (3,0): stay in this position from here on [3., 0.], [3., 0.], [3., 0.], ]) expected_directions = np.array([ [0.], [0.], [0.], [3.], [3.], [3.], [3.], [3.], # at target (3,0): stay in this position from here on [3.], [3.], [3.] ]) expected_time_offsets = np.array([ [0.], [1.], [2.], [3.], [4.], [5.], [6.], [7.], [8.], [9.], [10.], ]) expected_actions = np.array([ [0.], [2.], [2.], [2.], [2.], [2.], [2.], [2.], # reaching target by straight [2.], # at target: stopped moving [4.], [4.], ]) assert np.array_equal(positions, expected_positions) assert np.array_equal(directions, expected_directions) assert np.array_equal(time_offsets, expected_time_offsets) assert np.array_equal(actions, expected_actions)
def get_reward(weights, model, render=False):
    cloned_model = copy.deepcopy(model)
    for i, param in enumerate(cloned_model.parameters()):
        try:
            param.data.copy_(weights[i])
        except Exception:
            param.data.copy_(weights[i].data)

    env_Orig = RailEnv(
        width=x_dim, height=y_dim,
        rail_generator=sparse_rail_generator(
            max_num_cities=3,  # Number of cities in map (where train stations are)
            seed=1,  # Random seed
            grid_mode=False,
            max_rails_between_cities=2,
            max_rails_in_city=3),
        schedule_generator=sparse_schedule_generator(speed_ration_map),
        number_of_agents=n_agents,
        stochastic_data=stochastic_data,  # Malfunction data generator
        obs_builder_object=TreeObservation)

    env = copy.deepcopy(env_Orig)

    # After training we want to render the results, so we also load a renderer
    env_renderer = RenderTool(env, gl="PILSVG")

    # And the max number of steps we want to take per episode
    max_steps = int(4 * 2 * (20 + env.height + env.width))

    n_episodes = 1
    for trials in range(1, n_episodes + 1):
        # Reset environment
        obs, info = env.reset(True, True)
        env_renderer.reset()
        # Build agent specific observations
        for a in range(env.get_num_agents()):
            if obs[a]:
                agent_obs[a] = normalize_observation(obs[a], tree_depth, observation_radius=10)
                agent_obs_buffer[a] = agent_obs[a].copy()

        # Reset score and done
        score = 0
        env_done = 0
        step = 0

        # Run episode
        while True:
            # Action
            for a in range(env.get_num_agents()):
                if info['action_required'][a]:
                    # If an action is required, we want to store the obs at that step as well as the action
                    update_values[a] = True
                    batch = torch.from_numpy(agent_obs[a][np.newaxis, ...]).float()
                    if cuda:
                        batch = batch.cuda()
                    prediction = cloned_model(Variable(batch))
                    action = prediction.data.cpu().numpy().argmax()
                    # action = agent.act(agent_obs[a], eps=eps)
                    action_prob[action] += 1
                else:
                    update_values[a] = False
                    action = 0
                action_dict.update({a: action})

            # Environment step
            # print("Action Values:", action_dict)
            next_obs, all_rewards, done, info = env.step(action_dict)
            step += 1
            if render:
                env_renderer.render_env(show=True, show_predictions=True, show_observations=False)

            for a in range(env.get_num_agents()):
                # Only update the values when we are done or when an action was taken and thus relevant information is present
                if update_values[a] or done[a]:
                    # agent.step(agent_obs_buffer[a], agent_action_buffer[a], all_rewards[a],
                    #            agent_obs[a], done[a])
                    cummulated_reward[a] = 0.
                    agent_obs_buffer[a] = agent_obs[a].copy()
                    agent_action_buffer[a] = action_dict[a]
                if next_obs[a]:
                    agent_obs[a] = normalize_observation(next_obs[a], tree_depth, observation_radius=10)
                score += all_rewards[a] / env.get_num_agents()
                # print(all_rewards)

            # Copy observation
            if done['__all__'] or step >= max_steps:
                env_done = 1
                break

        # Collect information about training
        tasks_finished = 0
        for current_agent in env.agents:
            if current_agent.status == RailAgentStatus.DONE_REMOVED:
                tasks_finished += 1
        done_window.append(tasks_finished / max(1, env.get_num_agents()))
        scores_window.append(score / max_steps)  # save most recent score
        scores.append(np.mean(scores_window))
        dones_list.append((np.mean(done_window)))

        print(
            '\rTraining {} Agents on ({},{}).\t Episode {}\t Average Score: {:.3f}\tDones: {:.2f}%\t Action Probabilities: \t {}'
            .format(env.get_num_agents(), x_dim, y_dim, trials,
                    np.mean(scores_window), 100 * np.mean(done_window),
                    action_prob / np.sum(action_prob)), end=" ")

    # env.close()
    data = [[n_agents, x_dim, y_dim, trials,
             np.mean(scores_window), 100 * np.mean(done_window), step,
             action_prob / np.sum(action_prob)]]
    dfCur = pd.DataFrame(data)
    with open(f'ES_TrainingResults_{n_agents}_{x_dim}_{y_dim}.csv', 'a') as f:
        dfCur.to_csv(f, index=False, header=False)

    return np.mean(scores)
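# Hedged sketch of the evolution-strategies outer loop that would consume
# get_reward() above (illustrative only; evolve, sigma, learning_rate and
# n_perturbations are assumed names, not part of the original file):
def evolve(model, n_perturbations=20, sigma=0.05, learning_rate=0.01):
    """One ES update: score Gaussian perturbations of the weights, then move
    the parameters along the reward-weighted average of the noise."""
    base = [p.data.clone() for p in model.parameters()]
    noises, rewards = [], []
    for _ in range(n_perturbations):
        noise = [torch.randn_like(p) for p in base]
        candidate = [p + sigma * n for p, n in zip(base, noise)]
        noises.append(noise)
        rewards.append(get_reward(candidate, model))
    rewards = torch.tensor(rewards)
    if rewards.std() > 0:
        advantages = (rewards - rewards.mean()) / rewards.std()
        for i, param in enumerate(model.parameters()):
            step = sum(adv * noise[i] for adv, noise in zip(advantages, noises))
            param.data = base[i] + learning_rate / (n_perturbations * sigma) * step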
def evaluate(seed=37429879, timed=False, filename="./rl-weights.pth", debug=False, refresh=1):
    # Attempt to load policy from disk.
    policy = load_policy(filename, seed=seed)

    # Create environment with given seeding.
    env, max_steps, _, _, observation_tree_depth, _ = create_default_single_agent_environment(seed + 1, timed)

    # Fixed environment parameters (note, these must correspond with the training parameters!)
    observation_radius = 10

    env_renderer = None
    if debug:
        env_renderer = RenderTool(env, screen_width=1920, screen_height=1080)

    # Create container for the agent actions and observations.
    action_dict = dict()
    agent_obs = [None] * env.number_of_agents

    num_maps = 100
    scores = []
    successes = 0

    for _ in range(0, num_maps):
        # Create a new map.
        obs, info = env.reset(True, True)
        score = 0

        if debug:
            env_renderer.reset()
            env_renderer.render_env(show=True, frames=False, show_observations=False)
            time.sleep(refresh)

        # Run episode
        for _ in range(max_steps - 1):
            # Build agent specific observations
            for agent in env.get_agent_handles():
                if obs[agent]:
                    agent_obs[agent] = normalize_observation(obs[agent], observation_tree_depth,
                                                             observation_radius=observation_radius)

            # If an action is required, select the action.
            for agent in env.get_agent_handles():
                action = 0
                if info['action_required'][agent]:
                    action = policy.act(agent_obs[agent], eps=0.08)
                    # print("Required " + str(action))
                action_dict.update({agent: action})

            # Environment step
            obs, all_rewards, done, info = env.step(action_dict)

            if debug:
                env_renderer.render_env(show=True, frames=False, show_observations=False)
                time.sleep(refresh)

            # Track rewards (single-agent environment: `agent` is handle 0,
            # the last value of the loop variable above).
            score = score + all_rewards[agent]

            if done[agent]:
                successes = successes + 1
                break

        # Record scores.
        scores.append(score)

    print("Successful: %8.2f%%" % (100 * successes / num_maps))
    print("Mean reward: %8.2f" % (np.mean(scores)))
    print("Median reward: %8.2f" % (np.median(scores)))
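# Hedged usage sketch: render one evaluation pass with a faster frame delay.
# (load_policy, create_default_single_agent_environment and the checkpoint
# path come from the surrounding project and are assumed to exist.)
if __name__ == "__main__":
    evaluate(filename="./rl-weights.pth", debug=True, refresh=0.3)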
def test_get_k_shortest_paths(rendering=False):
    rail, rail_map = make_simple_rail_with_alternatives()

    env = RailEnv(width=rail_map.shape[1],
                  height=rail_map.shape[0],
                  rail_generator=rail_from_grid_transition_map(rail),
                  schedule_generator=random_schedule_generator(),
                  number_of_agents=1,
                  obs_builder_object=GlobalObsForRailEnv(),
                  )
    env.reset()

    initial_position = (3, 1)  # west dead-end
    initial_direction = Grid4TransitionsEnum.WEST  # west
    target_position = (3, 9)  # east dead-end

    # set the initial position
    agent = env.agents[0]
    agent.position = initial_position
    agent.initial_position = initial_position
    agent.direction = initial_direction
    agent.target = target_position  # east dead-end
    agent.moving = True

    env.reset(False, False)

    if rendering:
        renderer = RenderTool(env, gl="PILSVG")
        renderer.render_env(show=True, show_observations=False)
        input()

    actual = set(get_k_shortest_paths(
        env=env,
        source_position=initial_position,  # west dead-end
        source_direction=int(initial_direction),  # west
        target_position=target_position,
        k=10
    ))

    expected = set([
        (
            Waypoint(position=(3, 1), direction=3),
            Waypoint(position=(3, 0), direction=3),
            Waypoint(position=(3, 1), direction=1),
            Waypoint(position=(3, 2), direction=1),
            Waypoint(position=(3, 3), direction=1),
            Waypoint(position=(2, 3), direction=0),
            Waypoint(position=(1, 3), direction=0),
            Waypoint(position=(0, 3), direction=0),
            Waypoint(position=(0, 4), direction=1),
            Waypoint(position=(0, 5), direction=1),
            Waypoint(position=(0, 6), direction=1),
            Waypoint(position=(0, 7), direction=1),
            Waypoint(position=(0, 8), direction=1),
            Waypoint(position=(0, 9), direction=1),
            Waypoint(position=(1, 9), direction=2),
            Waypoint(position=(2, 9), direction=2),
            Waypoint(position=(3, 9), direction=2)),
        (
            Waypoint(position=(3, 1), direction=3),
            Waypoint(position=(3, 0), direction=3),
            Waypoint(position=(3, 1), direction=1),
            Waypoint(position=(3, 2), direction=1),
            Waypoint(position=(3, 3), direction=1),
            Waypoint(position=(3, 4), direction=1),
            Waypoint(position=(3, 5), direction=1),
            Waypoint(position=(3, 6), direction=1),
            Waypoint(position=(4, 6), direction=2),
            Waypoint(position=(5, 6), direction=2),
            Waypoint(position=(6, 6), direction=2),
            Waypoint(position=(5, 6), direction=0),
            Waypoint(position=(4, 6), direction=0),
            Waypoint(position=(4, 7), direction=1),
            Waypoint(position=(4, 8), direction=1),
            Waypoint(position=(4, 9), direction=1),
            Waypoint(position=(3, 9), direction=0))
    ])

    assert actual == expected, "actual={},expected={}".format(actual, expected)
def main(argv):
    try:
        opts, args = getopt.getopt(argv, "n:", ["n_trials="])
    except getopt.GetoptError:
        print('test_navigation_single_agent.py -n <n_trials>')
        sys.exit(2)
    for opt, arg in opts:
        if opt in ('-n', '--n_trials'):
            n_trials = int(arg)

    random.seed(1)
    np.random.seed(1)

    # Parameters for the Environment
    multi_agent_setup = 3

    if multi_agent_setup == 1:
        x_dim = 35
        y_dim = 35
        n_agents = 1
        max_num_cities = 3
        max_rails_between_cities = 2
        max_rails_in_city = 3

    # Multi agent (setup 3)
    if multi_agent_setup == 3:
        x_dim = 40
        y_dim = 40
        n_agents = 3
        max_num_cities = 4
        max_rails_between_cities = 2
        max_rails_in_city = 3

    # Multi agent (setup 5, 7 agents)
    if multi_agent_setup == 5:
        x_dim = 16 * 3
        y_dim = 9 * 3
        n_agents = 7
        max_num_cities = 5
        max_rails_between_cities = 2
        max_rails_in_city = 3

    if multi_agent_setup == 7:
        x_dim = 16 * 4
        y_dim = 9 * 4
        n_agents = 7
        max_num_cities = 7
        max_rails_between_cities = 4
        max_rails_in_city = 4

    if multi_agent_setup == 8:
        x_dim = 16 * 4
        y_dim = 9 * 4
        n_agents = 10
        max_num_cities = 9
        max_rails_between_cities = 5
        max_rails_in_city = 5

    # We are training an Agent using the Tree Observation with depth 2
    # observation_builder = TreeObsForRailEnv(max_depth=2, predictor=ShortestPathPredictorForRailEnv(20))

    # Use the malfunction generator to break agents from time to time
    stochastic_data = {
        'malfunction_rate': 80,  # Rate of malfunction occurrence for a single agent
        'min_duration': 15,  # Minimal duration of malfunction
        'max_duration': 50  # Max duration of malfunction
    }

    # Custom observation builder
    tree_depth = 2
    TreeObservation = TreeObsForRailEnv(max_depth=tree_depth,
                                        predictor=ShortestPathPredictorForRailEnv(20))

    np.savetxt(fname=path.join('NetsTest', 'info.txt'),
               X=[x_dim, y_dim, n_agents, max_num_cities,
                  max_rails_between_cities, max_rails_in_city, tree_depth],
               delimiter=';')

    # Different agent types (trains) with different speeds.
    speed_ration_map = {
        1.: 1.,        # Fast passenger train
        1. / 2.: 0.0,  # Fast freight train
        1. / 3.: 0.0,  # Slow commuter train
        1. / 4.: 0.0   # Slow freight train
    }
    # speed_ration_map = {1.: 0.25,       # Fast passenger train
    #                     1. / 2.: 0.25,  # Fast freight train
    #                     1. / 3.: 0.25,  # Slow commuter train
    #                     1. / 4.: 0.25}  # Slow freight train

    env = RailEnv(
        width=x_dim, height=y_dim,
        rail_generator=sparse_rail_generator(
            max_num_cities=max_num_cities,  # Number of cities in map (where train stations are)
            seed=14,  # Random seed
            grid_mode=False,
            max_rails_between_cities=max_rails_between_cities,
            max_rails_in_city=max_rails_in_city),
        schedule_generator=sparse_schedule_generator(speed_ration_map),
        # malfunction_generator_and_process_data=malfunction_from_params(stochastic_data),
        number_of_agents=n_agents,
        obs_builder_object=TreeObservation)
    env.reset()

    # env_renderer = RenderTool(env, gl="PILSVG", )
    env_renderer = RenderTool(
        env, gl="PILSVG",
        agent_render_variant=AgentRenderVariant.AGENT_SHOWS_OPTIONS_AND_BOX,
        show_debug=False,
        screen_height=(1080 * 0.8),  # Adjust these parameters to fit your resolution
        screen_width=(1920 * 0.8))

    num_features_per_node = env.obs_builder.observation_dim
    nr_nodes = 0
    for i in range(tree_depth + 1):
        nr_nodes += np.power(4, i)
    state_size = num_features_per_node * nr_nodes
    action_size = 5

    # We set the number of episodes we would like to train on
    if 'n_trials' not in locals():
        n_trials = 15000
    max_steps = int(3 * (env.height + env.width))
    eps = 1.
    eps_end = 0.005
    eps_decay = 0.9995

    # And some variables to keep track of the performance
    action_dict = dict()
    final_action_dict = dict()
    action_prob_list = []
    scores_window = deque(maxlen=100)
    done_window = deque(maxlen=100)
    scores = []
    scores_list = []
    deadlock_list = []
    dones_list_window = []
    dones_list = []
    action_prob = [0] * action_size
    agent_obs = [None] * env.get_num_agents()
    agent_next_obs = [None] * env.get_num_agents()  # Useless
    agent = Agent(state_size, action_size)

    # Load model's weights to test
    agent.qnetwork_local.load_state_dict(
        torch.load(path.join('NetsTest', 'navigator_checkpoint5000_multi3_deadlock_global10.pth')))

    record_images = False
    frame_step = 0

    for trials in range(1, n_trials + 1):
        # Reset environment
        obs, info = env.reset()  # (True, True)
        env_renderer.reset()
        # Build agent specific observations
        for a in range(env.get_num_agents()):
            agent_obs[a] = normalize_observation(obs[a], tree_depth, observation_radius=10)

        # Reset score and done
        score = 0
        env_done = 0

        # Run episode
        for step in range(max_steps):
            # Action
            for a in range(env.get_num_agents()):
                if info['action_required'][a]:
                    action = agent.act(agent_obs[a], eps=0.)
                    action_prob[action] += 1
                else:
                    action = 0
                action_dict.update({a: action})

            # Environment step (this modified env also reports deadlocks)
            obs, all_rewards, done, deadlocks, info = env.step(action_dict)
            env_renderer.render_env(show=True, show_predictions=True, show_observations=False)

            # Build agent specific observations and normalize
            for a in range(env.get_num_agents()):
                if obs[a]:
                    agent_obs[a] = normalize_observation(obs[a], tree_depth, observation_radius=10)
                score += all_rewards[a] / env.get_num_agents()

            if done['__all__']:
                break

        # Collect information about training
        tasks_finished = 0
        for _idx in range(env.get_num_agents()):
            if done[_idx] == 1:
                tasks_finished += 1
        done_window.append(tasks_finished / max(1, env.get_num_agents()))
        scores_window.append(score / max_steps)  # save most recent score
        scores.append(np.mean(scores_window))
        dones_list.append(tasks_finished / max(1, env.get_num_agents()))
        dones_list_window.append((np.mean(done_window)))
        scores_list.append(score / max_steps)
        deadlock_list.append(deadlocks.count(1) / max(1, env.get_num_agents()))

        if np.sum(action_prob) == 0:
            action_prob_normalized = [0] * action_size
        else:
            action_prob_normalized = action_prob / np.sum(action_prob)

        print(
            '\rTesting {} Agents on ({},{}).\t Episode {}\t Score: {:.3f}\tDones: {:.2f}%\tDeadlocks: {:.2f}\t Action Probabilities: \t {}'
            .format(env.get_num_agents(), x_dim, y_dim, trials,
                    score / max_steps,
                    100 * tasks_finished / max(1, env.get_num_agents()),
                    deadlocks.count(1) / max(1, env.get_num_agents()),
                    action_prob_normalized), end=" ")

        # if trials % 100 == 0:
        action_prob_list.append(action_prob_normalized)
        action_prob = [0] * action_size

        if trials % 50 == 0:
            # np.savetxt(fname=path.join('Nets', 'scores_metric.txt'), X=scores)
            # np.savetxt(fname=path.join('Nets', 'dones_metric.txt'), X=dones_list)
            np.savetxt(fname=path.join('NetsTest', 'test_metrics.csv'),
                       X=np.transpose(np.asarray([scores_list, scores, dones_list,
                                                  dones_list_window, deadlock_list])),
                       delimiter=';', newline='\n')
            np.savetxt(fname=path.join('NetsTest', 'test_action_prob.csv'),
                       X=np.asarray(action_prob_list), delimiter=';', newline='\n')
# STN-to-action controller
# NOTE: this is an old STN-to-action controller from the 2019 challenge and
# may need correction/improvement.
#####################################################################
my_controller = MCP_Controller(local_env)

# Initialize previous locations to -1 for each agent
# (none of them have left the station yet).
prev_locs = [-1 for i in range(0, len(local_env.agents))]

#####################################################################
# Show the flatland visualization, for debugging
#####################################################################
if env_renderer_enable:
    env_renderer = RenderTool(local_env, screen_height=4000, screen_width=4000)
    env_renderer.render_env(show=True, show_observations=False, show_predictions=False)

#####################################################################
while True:
    #####################################################################
    # Evaluation of a single episode
    #####################################################################
    if debug_print:
        print("current step: ", steps)
    time_start = time.time()
import numpy as np

from flatland.envs.rail_env import RailEnv
# complex_rail_generator is used below but was missing from the imports
# (import path as in this flatland version)
from flatland.envs.rail_generators import complex_rail_generator
from flatland.utils.rendertools import RenderTool

NUMBER_OF_AGENTS = 10

env = RailEnv(
    width=20, height=20,
    rail_generator=complex_rail_generator(nr_start_goal=10, nr_extra=1, min_dist=8, max_dist=99999, seed=0),
    number_of_agents=NUMBER_OF_AGENTS,
)

env_renderer = RenderTool(env)


def my_controller():
    """
    You are supposed to write this controller
    """
    _action = {}
    for _idx in range(NUMBER_OF_AGENTS):
        _action[_idx] = np.random.randint(0, 5)
    return _action


for step in range(100):
    _action = my_controller()
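    # The loop body is truncated in the source; the canonical Flatland intro
    # example continues roughly as follows (a sketch for this API version):
    obs, all_rewards, done, info = env.step(_action)
    print("Rewards: {} [done={}]".format(all_rewards, done))
    env_renderer.render_env(show=True, frames=False, show_observations=False)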
# Initialize the q-values
# Q = np.zeros((x_dim, y_dim, action_size), dtype=np.float)
with open(Q_filename, "rb") as f:
    Q = pickle.load(f)

# Use the complex_rail_generator to generate feasible network configurations with corresponding tasks
# Training on simple small tasks is the best way to get familiar with the environment
TreeObservation = TreeObsForRailEnv(max_depth=tree_depth)
env = RailEnv(width=x_dim, height=y_dim,
              rail_generator=complex_rail_generator(nr_start_goal=10, nr_extra=2, min_dist=5, max_dist=99999),
              obs_builder_object=TreeObservation,
              number_of_agents=n_agents)
env_renderer = RenderTool(env, gl="PILSVG")

# Given the depth of the tree observation and the number of features per node we get the following state_size
features_per_node = 9
nr_nodes = 0
for i in range(tree_depth + 1):
    nr_nodes += np.power(4, i)
state_size = features_per_node * nr_nodes

# Import your own Agent or use RLlib to train agents on Flatland
# As an example we use a random agent here


class RandomAgent:

    def __init__(self, state_size, action_size):
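        # (Body reconstructed as a hedged sketch; the source file is cut off
        # here. This mirrors the RandomAgent from the Flatland intro docs.)
        self.state_size = state_size
        self.action_size = action_size

    def act(self, state, eps=0.):
        """Pick an action uniformly at random; the observation is ignored."""
        return np.random.choice(np.arange(self.action_size))

    def step(self, memories):
        """A random agent does not learn: training hooks are no-ops."""
        pass

    def save(self, filename):
        pass

    def load(self, filename):
        pass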
                                 regenerate_schedule=False,
                                 random_seed=True)

for episode in range(EPISODES):
    if episode % SHOW_EVERY == 0:
        # This episode will be shown
        render = True
        print("Show episode number: ", episode)
    else:
        render = False

    observation, info = sparse_env.reset(regenerate_rail=False, regenerate_schedule=False, random_seed=True)
    env_renderer = RenderTool(sparse_env, gl="PGL")
    env_renderer.set_new_rail()

    state = observation[0]["state"]
    # e.g. observation: {0: {"state": (12, 4), "observations": [[1 0 0], [0 1 0]]}}
    obs = observation[0]["observations"]
    done = {0: False}
    cost = 0  # cost of the path found (for plotting the results)

    while not done[0]:
        q_best = -np.inf
        # Search the Q tensor for the most promising action among the
        # admissible ones (this information is stored in the "observations").
        for single_obs in obs:
            index = np.argmax(single_obs)
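            # (Continuation sketch; the source is cut off here. Plausibly each
            # admissible action's Q-value is compared and the best one kept --
            # `action` and the indexing scheme below are assumptions.)
            if Q[state][index] > q_best:
                q_best = Q[state][index]
                action = index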