def gen_env(number_agents, width, height, n_start_goal, seed):
    speed_ratio_map = {
        1.: 0.25,       # Fast passenger train
        1. / 2.: 0.25,  # Fast freight train
        1. / 3.: 0.25,  # Slow commuter train
        1. / 4.: 0.25,  # Slow freight train
    }
    env = RailEnv(width=width,
                  height=height,
                  rail_generator=complex_rail_generator(nr_start_goal=n_start_goal,
                                                        nr_extra=3,
                                                        min_dist=6,
                                                        max_dist=99999,
                                                        seed=seed),
                  schedule_generator=complex_schedule_generator(speed_ratio_map=speed_ratio_map),
                  number_of_agents=number_agents,
                  obs_builder_object=TreeObsForRailEnv(max_depth=5))
    env.reset()
    # Issue MOVE_FORWARD (2) to every agent so they enter the grid
    env.step(dict(zip(range(number_agents), [2] * number_agents)))
    return env

def test_malfunction_values_and_behavior():
    """Test that the malfunction duration counts down as expected."""
    # Set a fixed malfunction duration for this test
    rail, rail_map = make_simple_rail2()
    action_dict: Dict[int, RailEnvActions] = {}
    stochastic_data = MalfunctionParameters(malfunction_rate=0.001,  # Rate of malfunction occurrence
                                            min_duration=10,         # Minimal duration of malfunction
                                            max_duration=10)         # Max duration of malfunction

    env = RailEnv(width=25,
                  height=30,
                  rail_generator=rail_from_grid_transition_map(rail),
                  schedule_generator=random_schedule_generator(),
                  number_of_agents=1,
                  malfunction_generator_and_process_data=malfunction_from_params(stochastic_data),
                  obs_builder_object=SingleAgentNavigationObs())
    env.reset(False, False, activate_agents=True, random_seed=10)

    # Expected remaining malfunction duration at each time step
    assert_list = [9, 8, 7, 6, 5, 4, 3, 2, 1, 0, 10, 9, 8, 7, 6, 5]
    for time_step in range(15):
        # Move in the env
        env.step(action_dict)
        # Check that the remaining malfunction duration decreases as expected
        assert env.agents[0].malfunction_data['malfunction'] == assert_list[time_step]

def test_seeding_and_observations():
    # Check that two envs with the same seed but different observation builders do not diverge
    rail, rail_map = make_simple_rail2()

    # Make two separate envs with different observation builders
    # Global Observation
    env = RailEnv(width=25,
                  height=30,
                  rail_generator=rail_from_grid_transition_map(rail),
                  schedule_generator=random_schedule_generator(seed=12),
                  number_of_agents=10,
                  obs_builder_object=GlobalObsForRailEnv())
    # Tree Observation
    env2 = RailEnv(width=25,
                   height=30,
                   rail_generator=rail_from_grid_transition_map(rail),
                   schedule_generator=random_schedule_generator(seed=12),
                   number_of_agents=10,
                   obs_builder_object=TreeObsForRailEnv(max_depth=2,
                                                        predictor=ShortestPathPredictorForRailEnv()))

    env.reset(False, False, False, random_seed=12)
    env2.reset(False, False, False, random_seed=12)

    # Check that both environments produce the same initial start positions
    for a in range(env.get_num_agents()):
        assert env.agents[a].initial_position == env2.agents[a].initial_position

    action_dict = {}
    for step in range(10):
        for a in range(env.get_num_agents()):
            action_dict[a] = np.random.randint(4)
        env.step(action_dict)
        env2.step(action_dict)

    # Check that both environments end up in the same position
    for a in range(env.get_num_agents()):
        assert env.agents[a].position == env2.agents[a].position

def test_random_seeding():
    # Set fixed malfunction duration for this test
    rail, rail_map = make_simple_rail2()

    for idx in range(100):
        env = RailEnv(width=25,
                      height=30,
                      rail_generator=rail_from_grid_transition_map(rail),
                      schedule_generator=random_schedule_generator(seed=12),
                      number_of_agents=10)
        env.reset(True, True, False, random_seed=1)
        # Move target to an unreachable position so it does not interfere with the test
        env.agents[0].target = (0, 0)
        for step in range(10):
            env.step({0: 2})

        # Check that the fixed seed always produces the same initial positions
        expected_initial_positions = [(3, 2), (3, 5), (3, 6), (5, 6), (3, 4),
                                      (3, 1), (3, 9), (4, 6), (0, 3), (3, 7)]
        for a, expected in enumerate(expected_initial_positions):
            assert env.agents[a].initial_position == expected

def test_single_malfunction_generator():
    """Test the single malfunction generator."""
    rail, rail_map = make_simple_rail2()

    env = RailEnv(width=25,
                  height=30,
                  rail_generator=rail_from_grid_transition_map(rail),
                  schedule_generator=random_schedule_generator(),
                  number_of_agents=10,
                  # `earlierst_malfunction` (sic) is the parameter's actual spelling in the flatland API
                  malfunction_generator_and_process_data=single_malfunction_generator(
                      earlierst_malfunction=10, malfunction_duration=5))
    for test in range(10):
        env.reset()
        action_dict = dict()
        tot_malfunctions = 0
        for i in range(10):
            for agent in env.agents:
                # Go forward all the time
                action_dict[agent.handle] = RailEnvActions(2)
            env.step(action_dict)
        # Exactly one malfunction should have occurred over the whole episode
        for agent in env.agents:
            tot_malfunctions += agent.malfunction_data['nr_malfunctions']
        assert tot_malfunctions == 1

def tests_random_interference_from_outside():
    """Tests that the env stays deterministic even when an outside random number generator is used between steps."""
    # Set fixed malfunction duration for this test
    rail, rail_map = make_simple_rail2()
    env = RailEnv(width=25,
                  height=30,
                  rail_generator=rail_from_grid_transition_map(rail),
                  schedule_generator=random_schedule_generator(seed=2),
                  number_of_agents=1,
                  random_seed=1)
    env.reset()
    env.agents[0].speed_data['speed'] = 0.33
    env.reset(False, False, False, random_seed=10)
    env_data = []

    for step in range(200):
        action_dict: Dict[int, RailEnvActions] = {}
        for agent in env.agents:
            # Go forward all the time
            action_dict[agent.handle] = RailEnvActions(2)

        _, reward, _, _ = env.step(action_dict)
        # Record the rewards and positions of the first trial as reference data
        env_data.append((reward[0], env.agents[0].position))
        assert reward[0] == env_data[step][0]
        assert env.agents[0].position == env_data[step][1]

    # Run the same test as above, but with an external random generator running.
    # Check that the rewards and positions stay the same.
    rail, rail_map = make_simple_rail2()
    random.seed(47)
    np.random.seed(1234)
    env = RailEnv(width=25,
                  height=30,
                  rail_generator=rail_from_grid_transition_map(rail),
                  schedule_generator=random_schedule_generator(seed=2),
                  number_of_agents=1,
                  random_seed=1)
    env.reset()
    env.agents[0].speed_data['speed'] = 0.33
    env.reset(False, False, False, random_seed=10)

    dummy_list = [1, 2, 6, 7, 8, 9, 4, 5, 4]
    for step in range(200):
        action_dict: Dict[int, RailEnvActions] = {}
        for agent in env.agents:
            # Go forward all the time
            action_dict[agent.handle] = RailEnvActions(2)

            # Do dummy random number generation to interfere with the env's RNG
            random.shuffle(dummy_list)
            np.random.rand()

        _, reward, _, _ = env.step(action_dict)
        assert reward[0] == env_data[step][0]
        assert env.agents[0].position == env_data[step][1]

def test_malfunction_process_statistically():
    """Tests that malfunctions are produced by stochastic_data!"""
    # Set fixed malfunction duration for this test
    stochastic_data = MalfunctionParameters(malfunction_rate=1 / 5,  # Rate of malfunction occurrence
                                            min_duration=5,          # Minimal duration of malfunction
                                            max_duration=5)          # Max duration of malfunction

    rail, rail_map = make_simple_rail2()
    env = RailEnv(width=25,
                  height=30,
                  rail_generator=rail_from_grid_transition_map(rail),
                  schedule_generator=random_schedule_generator(),
                  number_of_agents=10,
                  malfunction_generator_and_process_data=malfunction_from_params(stochastic_data),
                  obs_builder_object=SingleAgentNavigationObs())

    env.reset(True, True, False, random_seed=10)
    env.agents[0].target = (0, 0)

    # Next line only for test generation
    # agent_malfunction_list = [[] for i in range(10)]
    # Expected remaining malfunction duration per agent and time step
    agent_malfunction_list = [
        [0, 0, 0, 0, 5, 4, 3, 2, 1, 0, 5, 4, 3, 2, 1, 0, 0, 0, 5, 4],
        [0, 5, 4, 3, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 5, 4, 3, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 5, 4, 3, 2, 1, 0, 5, 4, 3, 2, 1, 0, 0, 5, 4, 3, 2],
        [0, 0, 0, 0, 5, 4, 3, 2, 1, 0, 0, 0, 0, 0, 0, 5, 4, 3, 2, 1],
        [0, 0, 5, 4, 3, 2, 1, 0, 0, 5, 4, 3, 2, 1, 0, 5, 4, 3, 2, 1],
        [0, 0, 0, 0, 0, 0, 0, 0, 5, 4, 3, 2, 1, 0, 0, 0, 0, 0, 0, 0],
        [5, 4, 3, 2, 1, 0, 0, 5, 4, 3, 2, 1, 0, 0, 0, 0, 0, 0, 0, 5],
        [5, 4, 3, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 5, 4, 3, 2],
        [5, 4, 3, 2, 1, 0, 0, 0, 0, 0, 5, 4, 3, 2, 1, 0, 0, 0, 5, 4]]

    for step in range(20):
        action_dict: Dict[int, RailEnvActions] = {}
        for agent_idx in range(env.get_num_agents()):
            # We randomly select an action
            action_dict[agent_idx] = RailEnvActions(np.random.randint(4))
            # For generating tests only:
            # agent_malfunction_list[agent_idx].append(env.agents[agent_idx].malfunction_data['malfunction'])
            assert env.agents[agent_idx].malfunction_data['malfunction'] == agent_malfunction_list[agent_idx][step]
        env.step(action_dict)

def replay_verify(max_episode_steps: int, ctl: ControllerFromTrainRuns, env: RailEnv, rendering: bool):
    """Replays this deterministic `ActionPlan` and verifies whether it is feasible."""
    if rendering:
        renderer = RenderTool(env,
                              gl="PILSVG",
                              agent_render_variant=AgentRenderVariant.AGENT_SHOWS_OPTIONS_AND_BOX,
                              show_debug=True,
                              clear_debug_text=True,
                              screen_height=1000,
                              screen_width=1000)
        renderer.render_env(show=True, show_observations=False, show_predictions=False)
    i = 0
    while not env.dones['__all__'] and i <= max_episode_steps:
        for agent_id, agent in enumerate(env.agents):
            way_point: WayPoint = ctl.get_way_point_before_or_at_step(agent_id, i)
            assert agent.position == way_point.position, \
                "before {}, agent {} at {}, expected {}".format(i, agent_id, agent.position, way_point.position)
        actions = ctl.act(i)
        print("actions for {}: {}".format(i, actions))
        obs, all_rewards, done, _ = env.step(actions)
        if rendering:
            renderer.render_env(show=True, show_observations=False, show_predictions=False)
        i += 1

def test_normalize_features():
    random.seed(1)
    np.random.seed(1)
    max_depth = 4

    for i in range(10):
        tree_observer = TreeObsForRailEnv(max_depth=max_depth)
        next_rand_number = random.randint(0, 100)

        env = RailEnv(width=10,
                      height=10,
                      rail_generator=complex_rail_generator(nr_start_goal=10,
                                                            nr_extra=1,
                                                            min_dist=8,
                                                            max_dist=99999,
                                                            seed=next_rand_number),
                      schedule_generator=complex_schedule_generator(),
                      number_of_agents=1,
                      obs_builder_object=tree_observer)
        obs, all_rewards, done, _ = env.step({0: 0})
        obs_new = tree_observer.get()
        # data, distance, agent_data = split_tree(tree=np.array(obs_old), num_features_per_node=11)
        data_normalized = normalize_observation(obs_new, max_depth, observation_radius=10)

        filename = 'testdata/test_array_{}.csv'.format(i)
        data_loaded = np.loadtxt(filename, delimiter=',')

        assert np.allclose(data_loaded, data_normalized)

def replay_verify(ctl: ControllerFromTrainruns,
                  env: RailEnv,
                  call_back: ControllerFromTrainrunsReplayerRenderCallback = lambda *a, **k: None):
    """Replays this deterministic `ActionPlan` and verifies whether it is feasible.

    Parameters
    ----------
    ctl
    env
    call_back
        Called before/after each step() call. The env is passed to it.
    """
    call_back(env)
    i = 0
    while not env.dones['__all__'] and i <= env._max_episode_steps:
        for agent_id, agent in enumerate(env.agents):
            waypoint: Waypoint = ctl.get_waypoint_before_or_at_step(agent_id, i)
            assert agent.position == waypoint.position, \
                "before {}, agent {} at {}, expected {}".format(i, agent_id, agent.position, waypoint.position)
        actions = ctl.act(i)
        print("actions for {}: {}".format(i, actions))
        obs, all_rewards, done, _ = env.step(actions)
        call_back(env)
        i += 1

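# The `call_back` hook above is a natural place to plug in rendering. Below is
# a minimal sketch of such a callback, assuming flatland's RenderTool; the
# closure-based wiring is illustrative, not part of the original API.
def make_render_call_back():
    renderer = None

    def call_back(env: RailEnv):
        nonlocal renderer
        if renderer is None:
            # Create the renderer lazily, once the env is known
            renderer = RenderTool(env, gl="PILSVG")
        renderer.render_env(show=True, show_observations=False, show_predictions=False)

    return call_back

# Usage: replay_verify(ctl, env, call_back=make_render_call_back())
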
def demo(args=None):
    """Demo script to check installation"""
    env = RailEnv(width=15,
                  height=15,
                  rail_generator=complex_rail_generator(nr_start_goal=10,
                                                        nr_extra=1,
                                                        min_dist=8,
                                                        max_dist=99999),
                  schedule_generator=complex_schedule_generator(),
                  number_of_agents=5)
    env._max_episode_steps = int(15 * (env.width + env.height))
    env_renderer = RenderTool(env)

    while True:
        obs, info = env.reset()
        _done = False
        # Run a single episode here
        step = 0
        while not _done:
            # Compute a random action for each agent
            _action = {}
            for _idx, _ in enumerate(env.agents):
                _action[_idx] = np.random.randint(0, 5)
            obs, all_rewards, done, _ = env.step(_action)
            _done = done['__all__']
            step += 1
            env_renderer.render_env(show=True,
                                    frames=False,
                                    show_observations=False,
                                    show_predictions=False)
            time.sleep(0.3)
    return 0

def test_global_obs():
    rail, rail_map = make_simple_rail()

    env = RailEnv(width=rail_map.shape[1],
                  height=rail_map.shape[0],
                  rail_generator=rail_from_grid_transition_map(rail),
                  schedule_generator=random_schedule_generator(),
                  number_of_agents=1,
                  obs_builder_object=GlobalObsForRailEnv())
    global_obs, info = env.reset()

    # We have to take a step for the agent to enter the grid.
    global_obs, _, _, _ = env.step({0: RailEnvActions.MOVE_FORWARD})

    assert global_obs[0][0].shape == rail_map.shape + (16,)

    # Reconstruct the rail map from the 16 binary transition channels
    rail_map_recons = np.zeros_like(rail_map)
    for i in range(global_obs[0][0].shape[0]):
        for j in range(global_obs[0][0].shape[1]):
            rail_map_recons[i, j] = int(''.join(global_obs[0][0][i, j].astype(int).astype(str)), 2)

    assert np.array_equal(rail_map_recons, rail_map)

    # If this assertion fails, it means that the observation returned
    # places the agent on an empty cell
    obs_agents_state = global_obs[0][1]
    obs_agents_state = obs_agents_state + 1
    assert np.sum(rail_map * obs_agents_state[:, :, :4].sum(2)) > 0

def select(self, env: RailEnv, node: Node, o: dict) -> Tuple[Node, dict]:
    """Descend the tree along fully expanded nodes, picking children by UCB."""
    while True:
        if len(node.valid_moves) == 0 and node.children:
            # Node is fully expanded: follow the child with the highest UCB value
            best_node = max(node.children, key=self.ucb)
            o, r, d, _ = env.step(best_node.action)
            node = best_node
        else:
            return node, o

def expand(cls, node: Node, env: RailEnv, obs) -> Tuple[Node, dict]:
    """Expand the first untried move of `node` and step the env accordingly."""
    if len(node.valid_moves) == 0:
        # Node is already fully expanded
        return node, obs
    new_node = Node(node, node.valid_moves[0], cls.get_possible_moves(env, obs))
    node.valid_moves.pop(0)
    node.children.append(new_node)
    o, r, d, _ = env.step(new_node.action)
    return new_node, o

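# `select` and `expand` above only assume that a Node carries the action that
# led to it, its untried moves, and its children. A minimal sketch of such a
# structure follows; the constructor mirrors the call
# Node(parent, action, possible_moves), while the `visits`/`value` fields are
# assumptions, added only because a UCB-style `self.ucb` needs node statistics
# of some form.
class Node:
    def __init__(self, parent, action, valid_moves):
        self.parent = parent            # parent node in the search tree
        self.action = action            # (joint) action dict that led to this node
        self.valid_moves = valid_moves  # untried actions from this state
        self.children = []              # expanded child nodes
        self.visits = 0                 # visit count used by UCB
        self.value = 0.                 # accumulated rollout return
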
def test_malfunction_process():
    # Set fixed malfunction duration for this test
    stochastic_data = MalfunctionParameters(malfunction_rate=1,  # Rate of malfunction occurrence
                                            min_duration=3,      # Minimal duration of malfunction
                                            max_duration=3)      # Max duration of malfunction

    rail, rail_map = make_simple_rail2()

    env = RailEnv(width=25,
                  height=30,
                  rail_generator=rail_from_grid_transition_map(rail),
                  schedule_generator=random_schedule_generator(),
                  number_of_agents=1,
                  malfunction_generator_and_process_data=malfunction_from_params(stochastic_data),
                  obs_builder_object=SingleAgentNavigationObs())
    obs, info = env.reset(False, False, True, random_seed=10)

    total_down_time = 0
    agent_old_position = env.agents[0].position

    # Move target to unreachable position in order to not interfere with the test
    env.agents[0].target = (0, 0)
    for step in range(100):
        actions = {}
        for i in range(len(obs)):
            actions[i] = np.argmax(obs[i]) + 1

        obs, all_rewards, done, _ = env.step(actions)

        agent_malfunctioning = env.agents[0].malfunction_data['malfunction'] > 0
        if agent_malfunctioning:
            # Check that agent is not moving while malfunctioning
            assert agent_old_position == env.agents[0].position

        agent_old_position = env.agents[0].position
        total_down_time += env.agents[0].malfunction_data['malfunction']

    # Check that the appropriate number of malfunctions is achieved
    assert env.agents[0].malfunction_data['nr_malfunctions'] == 23, \
        "Actual {}".format(env.agents[0].malfunction_data['nr_malfunctions'])

    # Check that the agent actually accumulated down time while malfunctioning
    assert total_down_time > 0

def main(args):
    try:
        opts, args = getopt.getopt(args, "", ["sleep-for-animation="])
    except getopt.GetoptError as err:
        print(str(err))  # will print something like "option -a not recognized"
        sys.exit(2)
    sleep_for_animation = True
    for o, a in opts:
        if o == "--sleep-for-animation":
            sleep_for_animation = str2bool(a)
        else:
            assert False, "unhandled option"

    # Initiate the Predictor
    custom_predictor = ShortestPathPredictorForRailEnv(10)

    # Pass the Predictor to the observation builder
    custom_obs_builder = ObservePredictions(custom_predictor)

    # Initiate Environment
    env = RailEnv(width=10,
                  height=10,
                  rail_generator=complex_rail_generator(nr_start_goal=5,
                                                        nr_extra=1,
                                                        min_dist=8,
                                                        max_dist=99999,
                                                        seed=1),
                  schedule_generator=complex_schedule_generator(),
                  number_of_agents=3,
                  obs_builder_object=custom_obs_builder)

    obs, info = env.reset()
    env_renderer = RenderTool(env, gl="PILSVG")

    # We render the initial step and show the observed cells as colored boxes
    env_renderer.render_env(show=True, frames=True, show_observations=True, show_predictions=False)

    action_dict = {}
    for step in range(100):
        for a in range(env.get_num_agents()):
            action = np.random.randint(0, 5)
            action_dict[a] = action
        obs, all_rewards, done, _ = env.step(action_dict)
        print("Rewards: ", all_rewards, " [done=", done, "]")
        env_renderer.render_env(show=True, frames=True, show_observations=True, show_predictions=False)
        if sleep_for_animation:
            time.sleep(0.5)

def main():
    env = RailEnv(width=7,
                  height=7,
                  rail_generator=random_rail_generator(),
                  number_of_agents=3,
                  obs_builder_object=SimpleObs())
    env.reset()

    # Print the observation vector for each agent
    obs, all_rewards, done, _ = env.step({0: 0})
    for i in range(env.get_num_agents()):
        print("Agent ", i, "'s observation: ", obs[i])

def simulate(self, env: RailEnv, obs: dict) -> float:
    """Roll out the env with the rollout policy and return the summed reward."""
    done = False
    reward = 0.
    count = 0
    while not done and count <= self.rollout_depth:
        o, r, d, _ = env.step(self.rollout_policy(obs))
        reward += np.sum(list(r.values()))
        done = d["__all__"]
        count += 1
    return reward

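# For orientation, a hypothetical driver that chains the three helpers above
# into one MCTS iteration. Copying the env with `copy.deepcopy` and the
# backpropagation over the `visits`/`value` fields sketched earlier are
# assumptions; the original search loop is not shown in this excerpt.
def run_iteration(self, root, root_obs, env: RailEnv):
    import copy
    sim_env = copy.deepcopy(env)                       # never step the real env
    node, obs = self.select(sim_env, root, root_obs)   # 1. selection
    node, obs = self.expand(node, sim_env, obs)        # 2. expansion
    ret = self.simulate(sim_env, obs)                  # 3. rollout
    while node is not None:                            # 4. backpropagation
        node.visits += 1
        node.value += ret
        node = node.parent
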
def decorate_step_method(env: RailEnv) -> None:
    """Enable the step method of the env to take action dictionaries where
    agent keys are the agent ids.

    Flatland uses the agent handles as keys instead. This function decorates
    the step method so that it accepts an action dict where the keys are the
    agent ids.
    """
    env.step_ = env.step

    def _step(self: RailEnv,
              actions: Dict[str, Union[int, float, Any]]) -> dm_env.TimeStep:
        # Translate agent ids back to integer handles before stepping
        actions_ = {get_agent_handle(k): int(v) for k, v in actions.items()}
        return self.step_(actions_)

    env.step = tp.MethodType(_step, env)

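# Hedged usage sketch for the decorator above: after decoration, `env.step`
# accepts agent ids as keys. The "train_0"-style id format is an assumption
# about what `get_agent_handle` inverts; it is not verified here.
#
#     decorate_step_method(env)
#     timestep = env.step({"train_0": 2, "train_1": 4})  # ids instead of integer handles
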
def test_multi_speed_init():
    env = RailEnv(width=50,
                  height=50,
                  rail_generator=complex_rail_generator(nr_start_goal=10,
                                                        nr_extra=1,
                                                        min_dist=8,
                                                        max_dist=99999,
                                                        seed=1),
                  schedule_generator=complex_schedule_generator(),
                  number_of_agents=5)
    # Initialize the agent with the parameters corresponding to the environment and observation_builder
    agent = RandomAgent(218, 4)

    # Empty dictionary for all agent actions
    action_dict = dict()

    # Reset environment and get initial observations for all agents
    env.reset(False, False, True)

    # Here you can also further enhance the provided observation by means of normalization
    # See training navigation example in the baseline repository

    # Set all the different speeds
    old_pos = []
    for i_agent in range(env.get_num_agents()):
        env.agents[i_agent].speed_data['speed'] = 1. / (i_agent + 1)
        old_pos.append(env.agents[i_agent].position)

    # Run episode
    for step in range(100):
        # Choose an action for each agent in the environment
        for a in range(env.get_num_agents()):
            action = agent.act(0)
            action_dict.update({a: action})

            # Check that the agent did not move in between its speed updates
            assert old_pos[a] == env.agents[a].position

        # Environment step: returns the observations for all agents, their
        # corresponding rewards, and whether they are done
        _, _, _, _ = env.step(action_dict)

        # Update old position whenever an agent was allowed to move
        for i_agent in range(env.get_num_agents()):
            if (step + 1) % (i_agent + 1) == 0:
                print(step, i_agent, env.agents[i_agent].position)
                old_pos[i_agent] = env.agents[i_agent].position

def test_last_malfunction_step():
    """Test that the agent moves when it is not malfunctioning."""
    # Set fixed malfunction duration for this test
    rail, rail_map = make_simple_rail2()

    env = RailEnv(width=25,
                  height=30,
                  rail_generator=rail_from_grid_transition_map(rail),
                  schedule_generator=random_schedule_generator(seed=2),
                  number_of_agents=1,
                  random_seed=1)
    env.reset()
    env.agents[0].speed_data['speed'] = 1. / 3.
    env.agents[0].target = (0, 0)
    env.reset(False, False, True)

    # Force malfunction to be off at the beginning and the next malfunction to happen in 2 steps
    env.agents[0].malfunction_data['next_malfunction'] = 2
    env.agents[0].malfunction_data['malfunction'] = 0

    for step in range(20):
        action_dict: Dict[int, RailEnvActions] = {}
        for agent in env.agents:
            # Go forward all the time
            action_dict[agent.handle] = RailEnvActions(2)

        agent_can_move = env.agents[0].malfunction_data['malfunction'] < 1

        # Store the position fraction before and after the step
        pre_position = env.agents[0].speed_data['position_fraction']
        _, reward, _, _ = env.step(action_dict)
        # Check if the agent is still allowed to move in this step
        if env.agents[0].malfunction_data['malfunction'] > 0:
            agent_can_move = False
        post_position = env.agents[0].speed_data['position_fraction']

        # Assert that the agent moved while it was allowed to
        if agent_can_move:
            assert pre_position != post_position
        else:
            assert post_position == pre_position

def run_benchmark():
    """Run a benchmark with a small number of agents in a complex rail environment."""
    random.seed(1)
    np.random.seed(1)

    # Example: generate a random rail
    env = RailEnv(width=15,
                  height=15,
                  rail_generator=complex_rail_generator(nr_start_goal=5, nr_extra=20, min_dist=12),
                  schedule_generator=complex_schedule_generator(),
                  number_of_agents=5)
    env.reset()

    n_trials = 20
    action_dict = dict()
    action_prob = [0] * 4

    for trials in range(1, n_trials + 1):
        # Reset environment
        obs, info = env.reset()

        # Run episode
        for step in range(100):
            # Pick a random action for each agent
            for a in range(env.get_num_agents()):
                action = np.random.randint(0, 4)
                action_prob[action] += 1
                action_dict.update({a: action})

            # Environment step
            next_obs, all_rewards, done, _ = env.step(action_dict)
            if done['__all__']:
                break

        if trials % 100 == 0:
            action_prob = [1] * 4

def main(args):
    try:
        opts, args = getopt.getopt(args, "", ["sleep-for-animation="])
    except getopt.GetoptError as err:
        print(str(err))  # will print something like "option -a not recognized"
        sys.exit(2)
    sleep_for_animation = True
    for o, a in opts:
        if o == "--sleep-for-animation":
            sleep_for_animation = str2bool(a)
        else:
            assert False, "unhandled option"

    env = RailEnv(width=7,
                  height=7,
                  rail_generator=complex_rail_generator(nr_start_goal=10,
                                                        nr_extra=1,
                                                        min_dist=5,
                                                        max_dist=99999,
                                                        seed=1),
                  schedule_generator=complex_schedule_generator(),
                  number_of_agents=1,
                  obs_builder_object=SingleAgentNavigationObs())

    obs, info = env.reset()
    env_renderer = RenderTool(env)
    env_renderer.render_env(show=True, frames=True, show_observations=True)
    for step in range(100):
        # The observation is one-hot over (left, forward, right); +1 maps the
        # argmax index to the corresponding RailEnv movement action
        action = np.argmax(obs[0]) + 1
        obs, all_rewards, done, _ = env.step({0: action})
        print("Rewards: ", all_rewards, " [done=", done, "]")

        env_renderer.render_env(show=True, frames=True, show_observations=True)
        if sleep_for_animation:
            time.sleep(0.1)
        if done["__all__"]:
            break
    env_renderer.close_window()

# Run episode
for step in range(max_steps - 1):
    for agent in env.get_agent_handles():
        if info['action_required'][agent]:
            # If an action is required, we want to store the obs at that step as well as the action
            update_values = True
            action = policy.act(agent_obs[agent], eps=eps_start)
            action_count[action] += 1
            actions_taken.append(action)
        else:
            update_values = False
            action = 0
        action_dict.update({agent: action})

    # Environment step
    next_obs, all_rewards, done, info = env.step(action_dict)

    # Optional rendering, disabled here:
    # if train_params.render and episode_idx % checkpoint_interval == 0:
    #     env_renderer.render_env(show=True, frames=False, show_observations=True, show_predictions=False)

    for agent in range(env.get_num_agents()):
        # Update replay buffer and train agent.
        # Only update the values when we are done or when an action was taken
        # and thus relevant information is present
        if update_values or done[agent]:
            # Trailing arguments follow the usual (reward, next obs, done) transition pattern
            policy.step(agent_prev_obs[agent], agent_prev_action[agent],
                        all_rewards[agent], agent_obs[agent], done[agent])

# Reset score and done
score = 0
env_done = 0

# Run episode
for step in range(max_steps):
    # Action
    for a in range(env.get_num_agents()):
        if info['action_required'][a]:
            action = agent.act(agent_obs[a], eps=0.)
        else:
            action = 0
        action_prob[action] += 1
        action_dict.update({a: action})

    # Environment step
    obs, all_rewards, done, _ = env.step(action_dict)

    env_renderer.render_env(show=True, show_predictions=True, show_observations=False)

    # Build agent-specific observations and normalize
    for a in range(env.get_num_agents()):
        if obs[a]:
            agent_obs[a] = normalize_observation(obs[a], tree_depth, observation_radius=10)

    if done['__all__']:
        break

def main(args, dir):
    '''
    :param args:
    :return:

    Episodes to debug (set breakpoint in episodes loop to debug):
    - ep = 3, agent 1 spawns in front of 3, blocking its path; 0 and 2 are in a deadlock since they have the same priority
    - ep = 4, agents stop because of wrong priorities even though the conflict zone wasn't entered
    - ep = 14
    '''
    rail_generator = sparse_rail_generator(
        max_num_cities=args.max_num_cities,
        seed=args.seed,
        grid_mode=args.grid_mode,
        max_rails_between_cities=args.max_rails_between_cities,
        max_rails_in_city=args.max_rails_in_city,
    )

    # Maps speeds to % of appearance in the env
    speed_ration_map = {
        1.: 0.25,       # Fast passenger train
        1. / 2.: 0.25,  # Fast freight train
        1. / 3.: 0.25,  # Slow commuter train
        1. / 4.: 0.25,  # Slow freight train
    }

    observation_builder = GraphObsForRailEnv(
        predictor=ShortestPathPredictorForRailEnv(max_depth=args.prediction_depth),
        bfs_depth=4)

    env = RailEnv(
        width=args.width,
        height=args.height,
        rail_generator=rail_generator,
        schedule_generator=sparse_schedule_generator(speed_ration_map),
        number_of_agents=args.num_agents,
        obs_builder_object=observation_builder,
        malfunction_generator_and_process_data=malfunction_from_params(
            parameters={
                'malfunction_rate': args.malfunction_rate,  # Rate of malfunction occurrence
                'min_duration': args.min_duration,          # Minimal duration of malfunction
                'max_duration': args.max_duration           # Max duration of malfunction
            }))

    if args.render:
        env_renderer = RenderTool(
            env,
            agent_render_variant=AgentRenderVariant.AGENT_SHOWS_OPTIONS_AND_BOX,
            show_debug=True)

    sm = stateMachine()
    tb = TestBattery(env, observation_builder)

    state_machine_action_dict = {}
    railenv_action_dict = {}
    # max_time_steps = env.compute_max_episode_steps(args.width, args.height)
    max_time_steps = 200
    T_rewards = []          # List of episode rewards
    T_Qs = []               # List of q values
    T_num_done_agents = []  # List of number of done agents for each episode
    T_all_done = []         # If all agents completed in each episode
    T_episodes = []         # Time taken for each episode

    if args.save_image and not os.path.isdir("image_dump"):
        os.makedirs("image_dump")

    step_taken = 0
    total_step_taken = 0
    total_episodes = 0
    step_times = []  # Time taken for each step

    for ep in range(args.num_episodes):
        # Reset info at the beginning of an episode
        start_time = time.time()  # Take time of one episode

        if args.generate_baseline:
            if not os.path.isdir("image_dump/" + str(dir)) and args.save_image:
                os.makedirs("image_dump/" + str(dir))
        else:
            if not os.path.isdir("image_dump/" + str(ep)) and args.save_image:
                os.makedirs("image_dump/" + str(ep))

        state, info = env.reset()
        tb.reset()

        if args.render:
            env_renderer.reset()
        reward_sum, all_done = 0, False  # reward_sum contains the cumulative reward obtained as sum during the steps
        num_done_agents = 0

        state_machine_action = {}
        for i in range(env.number_of_agents):
            state_machine_action[i] = 0

        for step in range(max_time_steps):
            start_step_time = time.time()

            # Test battery (see test_battery.py)
            triggers = tb.tests(state, args.prediction_depth, state_machine_action)
            # State machine based on triggers of the test battery (see state_machine.py)
            state_machine_action = sm.act(triggers)  # State machine picks action

            for a in range(env.get_num_agents()):
                railenv_action = observation_builder.choose_railenv_action(a, state_machine_action[a])
                state_machine_action_dict.update({a: state_machine_action[a]})
                railenv_action_dict.update({a: railenv_action})

            state, reward, done, info = env.step(railenv_action_dict)  # Env step

            if args.generate_baseline:
                env_renderer.render_env(show=False, show_observations=False, show_predictions=True)
            else:
                env_renderer.render_env(show=True, show_observations=False, show_predictions=True)

            if args.save_image:
                if args.generate_baseline:
                    env_renderer.save_image("image_dump/" + str(dir) + "/image_" + str(step) + "_.png")
                else:
                    env_renderer.save_image("image_dump/" + str(ep) + "/image_" + str(step) + "_.png")

            if args.debug:
                for a in range(env.get_num_agents()):
                    log('\n\n#########################################')
                    log('\nInfo for agent {}'.format(a))
                    # log('\npath : {}'.format(state[a]["path"]))
                    log('\noverlap : {}'.format(state[a]["overlap"]))
                    log('\ndirection : {}'.format(state[a]["direction"]))
                    log('\nOccupancy, first layer: {}'.format(state[a]["occupancy"]))
                    log('\nOccupancy, second layer: {}'.format(state[a]["conflict"]))
                    log('\nForks: {}'.format(state[a]["forks"]))
                    log('\nTarget: {}'.format(state[a]["target"]))
                    log('\nPriority: {}'.format(state[a]["priority"]))
                    log('\nMax priority encountered: {}'.format(state[a]["max_priority"]))
                    log('\nNum malfunctioning agents (globally): {}'.format(state[a]["n_malfunction"]))
                    log('\nNum agents ready to depart (globally): {}'.format(state[a]["ready_to_depart"]))
                    log('\nStatus: {}'.format(info['status'][a]))
                    log('\nPosition: {}'.format(env.agents[a].position))
                    log('\nTarget: {}'.format(env.agents[a].target))
                    log('\nMoving? {} at speed: {}'.format(env.agents[a].moving, info['speed'][a]))
                    log('\nAction required? {}'.format(info['action_required'][a]))
                    log('\nState machine action: {}'.format(state_machine_action_dict[a]))
                    log('\nRailenv action: {}'.format(railenv_action_dict[a]))
                    log('\nRewards: {}'.format(reward[a]))
                    log('\n\n#########################################')

            reward_sum += sum(reward[a] for a in range(env.get_num_agents()))
            step_taken = step
            time_taken_step = time.time() - start_step_time
            step_times.append(time_taken_step)

            if done['__all__']:
                all_done = True
                break

        total_step_taken += step_taken
        time_taken = time.time() - start_time  # Time taken for one episode
        total_episodes = ep

        # Time metrics
        avg_time_step = sum(step_times) / step_taken
        # print("Avg time step: " + str(avg_time_step))

        # No need to close the renderer since env parameter sizes stay the same
        T_rewards.append(reward_sum)

        # Compute num of agents that reached their target
        for a in range(env.get_num_agents()):
            if done[a]:
                num_done_agents += 1
        percentage_done_agents = num_done_agents / env.get_num_agents()
        log("\nDone agents in episode: {}".format(percentage_done_agents))
        T_num_done_agents.append(percentage_done_agents)  # In proportion to total
        T_all_done.append(all_done)

    # Average number of agents that reached their target
    avg_done_agents = sum(T_num_done_agents) / len(T_num_done_agents) if len(T_num_done_agents) > 0 else 0
    avg_reward = sum(T_rewards) / len(T_rewards) if len(T_rewards) > 0 else 0
    avg_norm_reward = avg_reward / (max_time_steps / env.get_num_agents())
    avg_ep_time = sum(T_episodes) / args.num_episodes

    if total_episodes == 0:
        total_episodes = 1

    log("\nSeed: " + str(args.seed)
        + "\t | Avg_done_agents: " + str(avg_done_agents)
        + "\t | Avg_reward: " + str(avg_reward)
        + "\t | Avg_norm_reward: " + str(avg_norm_reward)
        + "\t | Max_num_time_steps: " + str(max_time_steps)
        + "\t | Avg_num_time_steps: " + str(total_step_taken / total_episodes)
        + "\t | Avg episode time: " + str(avg_ep_time))

# show_agents = [2, 6]
show_agents = range(len(local_env.agents))

# Stores whether an agent's direction is 0 or 1 (as opposed to 2 or 3); 1 means True
agent_directions = np.zeros(local_env.number_of_agents)

# Place agents on the map
action_dict = dict()
for a in show_agents:
    # action = controller.act(0)
    action = 2
    action_dict.update({a: action})
    agent_directions[a] = 1 if local_env.agents[a].direction < 2 else 0

# Do the environment step
observations, rewards, dones, information = local_env.step(action_dict)
# print("observations:", observations)

for a in show_agents:
    agent = local_env.agents[a]
    if agent.position is not None:
        # astar_planner.add_cell_to_avoid(agent.position)
        astar_planner.visited_node(agent.position, 0, a)

astar_paths_readable = [None for _ in range(local_env.number_of_agents)]

# Run A* for the selected agents
# for a_id in show_agents:
#     ag = env.agents[a_id]
#     start = ag.initial_position
#     if ag.position is not None:
#         start = ag.position

def test(args, T, ep, dqn, val_mem, metrics, results_dir, evaluate=False):
    # Init env and set in evaluation mode
    # Maps speeds to % of appearance in the env
    speed_ration_map = {
        1.: 0.25,       # Fast passenger train
        1. / 2.: 0.25,  # Fast freight train
        1. / 3.: 0.25,  # Slow commuter train
        1. / 4.: 0.25,  # Slow freight train
    }
    schedule_generator = sparse_schedule_generator(speed_ration_map)

    observation_builder = GraphObsForRailEnv(
        predictor=ShortestPathPredictorForRailEnv(max_depth=args.prediction_depth))

    env = RailEnv(
        width=args.width,
        height=args.height,
        rail_generator=sparse_rail_generator(
            max_num_cities=args.max_num_cities,
            seed=ep,  # Use episode as seed when evaluation is performed during training
            grid_mode=args.grid_mode,
            max_rails_between_cities=args.max_rails_between_cities,
            max_rails_in_city=args.max_rails_in_city,
        ),
        schedule_generator=schedule_generator,
        number_of_agents=args.num_agents,
        obs_builder_object=observation_builder,
        malfunction_generator_and_process_data=malfunction_from_params(
            parameters={
                'malfunction_rate': args.malfunction_rate,
                'min_duration': args.min_duration,
                'max_duration': args.max_duration
            }),
    )

    if args.render:
        env_renderer = RenderTool(
            env,
            gl="PILSVG",
            agent_render_variant=AgentRenderVariant.AGENT_SHOWS_OPTIONS_AND_BOX,
            show_debug=True,
            screen_height=1080,
            screen_width=1920)

    # max_time_steps = env.compute_max_episode_steps(env.width, env.height)
    max_time_steps = 200  # TODO Debug

    # metrics['steps'].append(T)
    metrics['episodes'].append(ep)
    T_rewards = []          # List of episode rewards
    T_Qs = []               # List of Q values
    T_num_done_agents = []  # List of number of done agents for each episode
    T_all_done = []         # If all agents completed in each episode
    network_action_dict = dict()
    railenv_action_dict = dict()
    qvalues = {}

    # Test performance over several episodes
    for ep in range(args.evaluation_episodes):
        # Reset info
        state, info = env.reset()
        reward_sum, all_done = 0, False  # reward_sum contains the cumulative reward obtained as sum during the steps
        num_done_agents = 0
        if args.render:
            env_renderer.reset()

        # Choose first action - decide entering of agents into the environment
        for a in range(env.get_num_agents()):
            action = np.random.choice((0, 2))
            railenv_action_dict.update({a: action})
        state, reward, done, info = env.step(railenv_action_dict)  # Env step
        reward_sum += sum(reward[a] for a in range(env.get_num_agents()))

        if args.render:
            env_renderer.render_env(show=True, show_observations=False, show_predictions=True)

        for step in range(max_time_steps - 1):
            # Choose actions
            for a in range(env.get_num_agents()):
                if info['action_required'][a]:
                    # Choose an action greedily (with noisy weights)
                    network_action = dqn.act(state[a])
                    railenv_action = observation_builder.choose_railenv_action(a, network_action)
                    qvalues.update({a: dqn.get_q_values(state[a])})
                else:
                    network_action = 0
                    railenv_action = 0
                    qvalues.update({a: [0, 0]})  # '0' if it wasn't updated

                railenv_action_dict.update({a: railenv_action})
                network_action_dict.update({a: network_action})

            if args.debug:
                for a in range(env.get_num_agents()):
                    print('#########################################')
                    print('Info for agent {}'.format(a))
                    print('Occupancy, first layer: {}'.format(state[a][:args.prediction_depth]))
                    print('Occupancy, second layer: {}'.format(
                        state[a][args.prediction_depth:args.prediction_depth * 2]))
                    print('Forks: {}'.format(
                        state[a][args.prediction_depth * 2:args.prediction_depth * 3]))
                    print('Target: {}'.format(
                        state[a][args.prediction_depth * 3:args.prediction_depth * 4]))
                    print('Priority: {}'.format(state[a][args.prediction_depth * 4]))
                    print('Max priority encountered: {}'.format(state[a][args.prediction_depth * 4 + 1]))
                    print('Num malfunctioning agents (globally): {}'.format(state[a][args.prediction_depth * 4 + 2]))
                    print('Num agents ready to depart (globally): {}'.format(state[a][args.prediction_depth * 4 + 3]))
                    print('Status: {}'.format(info['status'][a]))
                    print('Position: {}'.format(env.agents[a].position))
                    print('Moving? {} at speed: {}'.format(env.agents[a].moving, info['speed'][a]))
                    print('Action required? {}'.format(info['action_required'][a]))
                    print('Network action: {}'.format(network_action_dict[a]))
                    print('Railenv action: {}'.format(railenv_action_dict[a]))
                    print('Q values: {}'.format(qvalues[a]))
                    print('Rewards: {}'.format(reward[a]))

            # Breakpoint for debugging here
            state, reward, done, info = env.step(railenv_action_dict)  # Env step
            if args.render:
                env_renderer.render_env(show=True, show_observations=False, show_predictions=True)

            reward_sum += sum(reward[a] for a in range(env.get_num_agents()))

            if done['__all__']:
                all_done = True
                break

        # No need to close the renderer since env parameter sizes stay the same
        T_rewards.append(reward_sum)
        # Compute num of agents that reached their target
        for a in range(env.get_num_agents()):
            if done[a]:
                num_done_agents += 1
        T_num_done_agents.append(num_done_agents / env.get_num_agents())  # In proportion to total
        T_all_done.append(all_done)

    # Test Q-values over validation memory
    for state in val_mem:  # Iterate over valid states
        T_Qs.append(dqn.evaluate_q(state))

    if args.debug:
        print('T_Qs: {}'.format(T_Qs))  # These are Qs from a single agent TODO

    # Average proportion of agents that reached their target
    avg_done_agents = sum(T_num_done_agents) / len(T_num_done_agents)
    avg_reward = sum(T_rewards) / len(T_rewards)
    avg_norm_reward = avg_reward / (max_time_steps / env.get_num_agents())
    # avg_reward, avg_Q = sum(T_rewards) / len(T_rewards), sum(T_Qs) / len(T_Qs)

    if not evaluate:
        # Save model parameters if improved
        if avg_done_agents > metrics['best_avg_done_agents']:
            metrics['best_avg_done_agents'] = avg_done_agents
            dqn.save(results_dir)

        # Append to results and save metrics
        metrics['rewards'].append(T_rewards)
        metrics['Qs'].append(T_Qs)
        torch.save(metrics, os.path.join(results_dir, 'metrics.pth'))

        # Plot HTML
        _plot_line(metrics['episodes'], metrics['rewards'], 'Reward', path=results_dir)  # Plot rewards in episodes
        _plot_line(metrics['episodes'], metrics['Qs'], 'Q', path=results_dir)

    # Return average number of done agents (in proportion) and average reward
    return avg_done_agents, avg_reward, avg_norm_reward

def train(env):
    n_agents = env["n_agents"]
    x_dim = env["x_dim"]
    y_dim = env["y_dim"]
    n_cities = env["n_cities"]
    max_rails_between_cities = env["max_rails_between_cities"]
    max_rails_in_city = env["max_rails_in_city"]
    seed = 0
    use_fast_tree_obs = False

    # Observation parameters
    observation_tree_depth = 4
    observation_radius = 10
    observation_max_path_depth = 30

    # Set the seeds
    random.seed(seed)
    np.random.seed(seed)

    # Break agents from time to time
    malfunction_parameters = MalfunctionParameters(
        malfunction_rate=1. / 10000,  # Rate of malfunctions
        min_duration=15,              # Minimal duration
        max_duration=50               # Max duration
    )

    # Observation builder
    predictor = ShortestPathPredictorForRailEnv(observation_max_path_depth)
    tree_observation = None

    if use_fast_tree_obs:
        tree_observation = FastTreeObs(max_depth=observation_tree_depth)
        print("Using FastTreeObs")
    else:
        tree_observation = TreeObsForRailEnv(max_depth=observation_tree_depth,
                                             predictor=predictor)
        print("Using StandardTreeObs")

    speed_profiles = {
        1.: 1.0,       # Fast passenger train
        1. / 2.: 0.0,  # Fast freight train
        1. / 3.: 0.0,  # Slow commuter train
        1. / 4.: 0.0,  # Slow freight train
    }

    env = RailEnv(width=x_dim,
                  height=y_dim,
                  rail_generator=sparse_rail_generator(
                      max_num_cities=n_cities,
                      grid_mode=False,
                      max_rails_between_cities=max_rails_between_cities,
                      max_rails_in_city=max_rails_in_city),
                  schedule_generator=sparse_schedule_generator(speed_profiles),
                  number_of_agents=n_agents,
                  malfunction_generator_and_process_data=malfunction_from_params(malfunction_parameters),
                  obs_builder_object=tree_observation,
                  random_seed=seed)

    rewards = []
    obs, info = env.reset()

    if use_fast_tree_obs:
        state_size = tree_observation.observation_dim
    else:
        # Calculate the state size given the depth of the tree observation and the
        # number of features
        n_features_per_node = env.obs_builder.observation_dim
        n_nodes = 0
        for i in range(observation_tree_depth + 1):
            n_nodes += np.power(4, i)
        state_size = n_features_per_node * n_nodes

    action_size = 5

    DEVICE = 'cpu'
    # if torch.cuda.is_available():
    #     DEVICE = 'gpu'

    buffer_length = 10000
    steps_to_save_model = 10
    step_size = 100
    num_steps = 100  # update every 100 steps
    avg_steps = 20   # num steps to average and plot rewards
    reward_q = []
    batch_size = 100

    agent_obs = np.array([None] * env.get_num_agents())

    max_steps = int(4 * 2 * (env.height + env.width + (n_agents / n_cities)))
    num_episodes = 100000

    agent_init_params = []
    sa_size = []

    for i in range(n_agents):
        agent_init_params.append({
            'num_in_pol': state_size,
            'num_out_pol': action_size,
            'init_weights': 'model.pt'
        })
        sa_size.append((state_size, action_size))

    hyperparams = {
        "tau": 0.01,
        "pi_lr": 0.00001,
        "q_lr": 0.00005,
        "pol_hidden_dim": 256,
        "critic_hidden_dim": 256,
        "attend_heads": 8
    }

    model = AttentionSAC(agent_init_params=agent_init_params,
                         sa_size=sa_size,
                         tau=hyperparams["tau"],
                         pi_lr=hyperparams["pi_lr"],
                         q_lr=hyperparams["q_lr"],
                         pol_hidden_dim=hyperparams["pol_hidden_dim"],
                         critic_hidden_dim=hyperparams["critic_hidden_dim"],
                         attend_heads=hyperparams["attend_heads"])
    model.init_dict = {}

    replay_buffer = ReplayBuffer(buffer_length, n_agents,
                                 [state_size for i in range(n_agents)],
                                 [action_size for i in range(n_agents)])

    print("MAX STEPS: " + str(max_steps))
    print("NUM EPISODES: ", num_episodes)
    print("HYPERPARAMS: ")
    print(hyperparams)

    start_time = time.time()

    for ep in range(num_episodes):
        print("Episode " + str(ep) + ":", flush=True)
        obs, info = env.reset(True, True)
        model.prep_rollouts(device=DEVICE)
        reward_sum_for_this_episode = 0

        for steps in range(max_steps):
            if steps % step_size == 0:
                print("=", end="", flush=True)

            for agent in env.get_agent_handles():
                if obs[agent] is not None:
                    if use_fast_tree_obs:
                        agent_obs[agent] = obs[agent]
                    else:
                        agent_obs[agent] = normalize_observation(
                            obs[agent],
                            observation_tree_depth,
                            observation_radius=observation_radius)
                else:
                    agent_obs[agent] = np.array([0.] * state_size)

            action_dict = {}
            agent_actions = []

            torch_obs = [
                Variable(torch.Tensor([agent_obs[i]]), requires_grad=False)
                for i in range(n_agents)
            ]
            torch_agent_actions = model.step(torch_obs, explore=True)
            agent_actions = [ac.data.numpy() for ac in torch_agent_actions]

            for i in range(n_agents):
                # The action tensors are one-hot: pick the index of the non-zero entry
                dist = torch_agent_actions[i][0]
                idx = -1
                for j in range(action_size):
                    if dist[j] != 0:
                        idx = j
                        break
                action_dict[i] = idx

            next_obs, all_rewards, done, info = env.step(action_dict)

            rewards = []
            dones = []

            next_agent_obs = np.array([None] * env.get_num_agents())

            for agent in env.get_agent_handles():
                if next_obs[agent] is not None:
                    if use_fast_tree_obs:
                        next_agent_obs[agent] = next_obs[agent]
                    else:
                        next_agent_obs[agent] = normalize_observation(
                            next_obs[agent],
                            observation_tree_depth,
                            observation_radius=observation_radius)
                else:
                    next_agent_obs[agent] = np.array([0.] * state_size)

            for i in range(n_agents):
                reward_sum_for_this_episode += all_rewards[i]
                rewards.append(all_rewards[i])
                all_rewards[i] += augment_reward(agent_obs[i])
                dones.append(done[i])

            replay_buffer.push(np.array([agent_obs]), np.array(agent_actions),
                               np.array([rewards]), np.array([next_agent_obs]),
                               np.array([dones]))

            if steps % num_steps == 0:
                model.prep_training(device=DEVICE)
                sample = replay_buffer.sample(batch_size, norm_rews=False)
                model.update_critic(sample)
                model.update_policies(sample)
                model.update_all_targets()
                model.prep_rollouts(device=DEVICE)

            # Advance to the next observation for the following step
            obs = next_obs

        reward_sum_for_this_episode /= n_agents
        reward_q.append(reward_sum_for_this_episode)

        if len(reward_q) == avg_steps:
            wandb.log({'reward': np.mean(reward_q)})
            reward_q = []

        print()

        if ep % steps_to_save_model == 0:
            print("\nSaving model")
            model.save(os.getcwd() + "/model.pt")
            cur_time = time.time()
            time_elapsed = (cur_time - start_time) // 60
            print("Time Elapsed: " + str(time_elapsed) + "\n")

def main(argv):
    try:
        opts, args = getopt.getopt(argv, "n:", ["n_trials="])
    except getopt.GetoptError:
        print('test_navigation_single_agent.py -n <n_trials>')
        sys.exit(2)
    for opt, arg in opts:
        if opt in ('-n', '--n_trials'):
            n_trials = int(arg)

    random.seed(1)
    np.random.seed(1)

    ######## TEST SET SELECTION - PARAMETERS ########
    test_multi_agent_setup = 1        # 1 for Medium size test, 2 for Big size test
    test_n_agents = 5                 # Number of agents to test (3 - 5 - 7 for Medium, 5 - 7 - 10 for Big)
    test_malfunctions_enabled = True  # Malfunctions enabled?
    test_agents_one_speed = True      # Test agents with the same speed (1) or with 4 different speeds?
    #################################################

    # Medium size
    if test_multi_agent_setup == 1:
        x_dim = 16 * 3
        y_dim = 9 * 3
        max_num_cities = 5
        max_rails_between_cities = 2
        max_rails_in_city = 3

    # Big size
    if test_multi_agent_setup == 2:
        x_dim = 16 * 4
        y_dim = 9 * 4
        max_num_cities = 9
        max_rails_between_cities = 5
        max_rails_in_city = 5

    stochastic_data = {'malfunction_rate': 80,  # Rate of malfunction occurrence for a single agent
                       'min_duration': 15,      # Minimal duration of malfunction
                       'max_duration': 50       # Max duration of malfunction
                       }

    # Custom observation builder
    tree_depth = 2
    TreeObservation = TreeObsForRailEnv(max_depth=tree_depth,
                                        predictor=ShortestPathPredictorForRailEnv(20))

    np.savetxt(fname=path.join('NetsTest', 'info.txt'),
               X=[x_dim, y_dim, test_n_agents, max_num_cities,
                  max_rails_between_cities, max_rails_in_city, tree_depth],
               delimiter=';')

    # Different agent types (trains) with different speeds.
    if test_agents_one_speed:
        speed_ration_map = {1.: 1.,        # Fast passenger train
                            1. / 2.: 0.0,  # Fast freight train
                            1. / 3.: 0.0,  # Slow commuter train
                            1. / 4.: 0.0}  # Slow freight train
    else:
        speed_ration_map = {1.: 0.25,       # Fast passenger train
                            1. / 2.: 0.25,  # Fast freight train
                            1. / 3.: 0.25,  # Slow commuter train
                            1. / 4.: 0.25}  # Slow freight train

    if test_malfunctions_enabled:
        env = RailEnv(width=x_dim,
                      height=y_dim,
                      rail_generator=sparse_rail_generator(
                          max_num_cities=max_num_cities,  # Number of cities in map (where train stations are)
                          seed=14,                        # Random seed
                          grid_mode=False,
                          max_rails_between_cities=max_rails_between_cities,
                          max_rails_in_city=max_rails_in_city),
                      schedule_generator=sparse_schedule_generator(speed_ration_map),
                      malfunction_generator_and_process_data=malfunction_from_params(stochastic_data),
                      number_of_agents=test_n_agents,
                      obs_builder_object=TreeObservation)
    else:
        env = RailEnv(width=x_dim,
                      height=y_dim,
                      rail_generator=sparse_rail_generator(
                          max_num_cities=max_num_cities,
                          seed=14,
                          grid_mode=False,
                          max_rails_between_cities=max_rails_between_cities,
                          max_rails_in_city=max_rails_in_city),
                      schedule_generator=sparse_schedule_generator(speed_ration_map),
                      number_of_agents=test_n_agents,
                      obs_builder_object=TreeObservation)
    env.reset()

    # env_renderer = RenderTool(env, gl="PILSVG")
    env_renderer = RenderTool(env,
                              gl="PILSVG",
                              agent_render_variant=AgentRenderVariant.AGENT_SHOWS_OPTIONS_AND_BOX,
                              show_debug=False,
                              screen_height=int(1080 * 0.8),  # Adjust these parameters to fit your resolution
                              screen_width=int(1920 * 0.8))

    num_features_per_node = env.obs_builder.observation_dim
    nr_nodes = 0
    for i in range(tree_depth + 1):
        nr_nodes += np.power(4, i)
    state_size = num_features_per_node * nr_nodes
    action_size = 5

    # We set the number of episodes we would like to test on
    if 'n_trials' not in locals():
        n_trials = 15000

    # max_steps computation
    speed_weighted_mean = 0
    for key in speed_ration_map.keys():
        speed_weighted_mean += key * speed_ration_map[key]
    # max_steps = int(3 * (env.height + env.width))
    max_steps = int((1 / speed_weighted_mean) * 3 * (env.height + env.width))

    # eps = 1.
    # eps_end = 0.005
    # eps_decay = 0.9995

    # And some variables to keep track of the performance
    action_dict = dict()
    final_action_dict = dict()
    action_prob_list = []
    scores_window = deque(maxlen=100)
    done_window = deque(maxlen=100)
    scores = []
    scores_list = []
    deadlock_list = []
    dones_list_window = []
    dones_list = []
    action_prob = [0] * action_size
    agent_obs = [None] * env.get_num_agents()
    agent_next_obs = [None] * env.get_num_agents()  # Unused
    agent = Agent(state_size, action_size)

    # LOAD MODEL WEIGHTS TO TEST
    agent.qnetwork_local.load_state_dict(
        torch.load(path.join('NetsTest', 'navigator_checkpoint3800_multi10_deadlock_global10.pth')))

    record_images = False
    frame_step = 0

    for trials in range(1, n_trials + 1):
        # Reset environment
        obs, info = env.reset()  # (True, True)
        env_renderer.reset()
        # Build agent-specific observations
        for a in range(env.get_num_agents()):
            agent_obs[a] = normalize_observation(obs[a], tree_depth, observation_radius=10)
        # Reset score and done
        score = 0
        env_done = 0

        # Run episode
        for step in range(max_steps):
            # Action
            for a in range(env.get_num_agents()):
                if info['action_required'][a]:
                    action = agent.act(agent_obs[a], eps=0.)
                    action_prob[action] += 1
                else:
                    action = 0
                action_dict.update({a: action})

            # Environment step (this custom env also returns deadlock information)
            obs, all_rewards, done, deadlocks, info = env.step(action_dict)

            env_renderer.render_env(show=True, show_predictions=True, show_observations=False)

            # Build agent-specific observations and normalize
            for a in range(env.get_num_agents()):
                if obs[a]:
                    agent_obs[a] = normalize_observation(obs[a], tree_depth, observation_radius=10)
                score += all_rewards[a] / env.get_num_agents()

            if done['__all__']:
                break

        # Collect information about the test run
        tasks_finished = 0
        for _idx in range(env.get_num_agents()):
            if done[_idx] == 1:
                tasks_finished += 1
        done_window.append(tasks_finished / max(1, env.get_num_agents()))
        scores_window.append(score / max_steps)  # Save most recent score
        scores.append(np.mean(scores_window))
        dones_list.append(tasks_finished / max(1, env.get_num_agents()))
        dones_list_window.append(np.mean(done_window))
        scores_list.append(score / max_steps)
        deadlock_list.append(deadlocks.count(1) / max(1, env.get_num_agents()))

        if np.sum(action_prob) == 0:
            action_prob_normalized = [0] * action_size
        else:
            action_prob_normalized = action_prob / np.sum(action_prob)

        print(
            '\rTesting {} Agents on ({},{}).\t Episode {}\t Score: {:.3f}\tDones: {:.2f}%\tDeadlocks: {:.2f}\t Action Probabilities: \t {}'.format(
                env.get_num_agents(), x_dim, y_dim,
                trials,
                score / max_steps,
                100 * tasks_finished / max(1, env.get_num_agents()),
                deadlocks.count(1) / max(1, env.get_num_agents()),
                action_prob_normalized), end=" ")

        # if trials % 100 == 0:
        action_prob_list.append(action_prob_normalized)
        action_prob = [0] * action_size

        if trials % 50 == 0:
            np.savetxt(fname=path.join('NetsTest', 'test_metrics.csv'),
                       X=np.transpose(np.asarray([scores_list, scores, dones_list,
                                                  dones_list_window, deadlock_list])),
                       delimiter=';', newline='\n')
            np.savetxt(fname=path.join('NetsTest', 'test_action_prob.csv'),
                       X=np.asarray(action_prob_list),
                       delimiter=';', newline='\n')