def create_rail_env(env_params, tree_observation):
    """Create a sparse-rail ``RailEnv`` configured from ``env_params``.

    Args:
        env_params: Object exposing ``n_agents``, ``x_dim``, ``y_dim``,
            ``n_cities``, ``max_rails_between_cities``,
            ``max_rails_in_city``, ``seed`` and ``malfunction_rate``.
        tree_observation: Observation builder handed to the environment.

    Returns:
        The newly constructed ``RailEnv`` (``reset`` is not called here).
    """
    # Break agents from time to time; durations are fixed to 20..50 steps.
    breakdown_params = MalfunctionParameters(
        malfunction_rate=env_params.malfunction_rate,
        min_duration=20,
        max_duration=50,
    )

    rails = sparse_rail_generator(
        max_num_cities=env_params.n_cities,
        grid_mode=False,
        max_rails_between_cities=env_params.max_rails_between_cities,
        max_rails_in_city=env_params.max_rails_in_city,
    )
    schedules = sparse_schedule_generator()
    breakdowns = malfunction_from_params(breakdown_params)

    return RailEnv(
        width=env_params.x_dim,
        height=env_params.y_dim,
        rail_generator=rails,
        schedule_generator=schedules,
        number_of_agents=env_params.n_agents,
        malfunction_generator_and_process_data=breakdowns,
        obs_builder_object=tree_observation,
        random_seed=env_params.seed,
    )
def test_malfunction_before_entry():
    """Check the initial malfunction counters of agents before they enter the environment."""
    # min_duration == max_duration, so every malfunction lasts exactly 10 steps.
    stochastic_data = MalfunctionParameters(
        malfunction_rate=2,  # Rate of malfunction occurrence
        min_duration=10,  # Minimal duration of malfunction
        max_duration=10,  # Max duration of malfunction
    )

    rail, _ = make_simple_rail2()

    env = RailEnv(
        width=25,
        height=30,
        rail_generator=rail_from_grid_transition_map(rail),
        schedule_generator=random_schedule_generator(),
        number_of_agents=10,
        malfunction_generator_and_process_data=malfunction_from_params(stochastic_data),
        obs_builder_object=SingleAgentNavigationObs(),
    )
    env.reset(False, False, False, random_seed=10)
    env.agents[0].target = (0, 0)

    # Expected initial 'malfunction' value per agent handle: a mix of agents
    # that already malfunction (10) and agents that are working (0).
    expected_by_handle = [0, 10, 0, 10, 10, 10, 10, 10, 10, 10]
    for handle, expected in enumerate(expected_by_handle):
        assert env.agents[handle].malfunction_data['malfunction'] == expected
def test_malfunction_values_and_behavior():
    """Check that the malfunction counter of a single agent counts down step by step."""
    rail, _ = make_simple_rail2()
    action_dict: Dict[int, RailEnvActions] = {}

    # Fixed malfunction duration (min == max == 10) for this test
    stochastic_data = MalfunctionParameters(
        malfunction_rate=0.001,  # Rate of malfunction occurrence
        min_duration=10,  # Minimal duration of malfunction
        max_duration=10,  # Max duration of malfunction
    )
    env = RailEnv(
        width=25,
        height=30,
        rail_generator=rail_from_grid_transition_map(rail),
        schedule_generator=random_schedule_generator(),
        number_of_agents=1,
        malfunction_generator_and_process_data=malfunction_from_params(stochastic_data),
        obs_builder_object=SingleAgentNavigationObs(),
    )
    env.reset(False, False, activate_agents=True, random_seed=10)

    # Expected counter value after each step; only the first 15 are checked.
    assert_list = [9, 8, 7, 6, 5, 4, 3, 2, 1, 0, 10, 9, 8, 7, 6, 5]
    print("[")
    for expected in assert_list[:15]:
        # Step with an empty action dict, then compare the counter.
        env.step(action_dict)
        assert env.agents[0].malfunction_data['malfunction'] == expected
def train_validate_env_generator_params(train_set, n_agents, x_dim, y_dim, observation, stochastic_data,
                                        speed_ration_map, seed=1):
    """Build a sparse ``RailEnv`` for training or validation and reseed the global RNGs.

    Args:
        train_set: When truthy the RNG seed is drawn from [0, 1000),
            otherwise from [1000, 2000).
        n_agents: Number of agents in the environment.
        x_dim: Environment width.
        y_dim: Environment height.
        observation: Observation builder object.
        stochastic_data: Malfunction parameters passed to ``malfunction_from_params``.
        speed_ration_map: Speed map passed to ``sparse_schedule_generator``.
        seed: Seed of the sparse rail generator.

    Returns:
        Tuple ``(env, random_seed)`` where ``random_seed`` is the value used
        to seed both ``random`` and ``np.random``.
    """
    random_seed = np.random.randint(1000) if train_set else np.random.randint(1000, 2000)

    # Seed both generators with the freshly drawn value.
    random.seed(random_seed)
    np.random.seed(random_seed)

    rails = sparse_rail_generator(
        max_num_cities=3,  # Number of cities in map (where train stations are)
        seed=seed,  # Random seed
        grid_mode=False,
        max_rails_between_cities=2,
        max_rails_in_city=3,
    )
    schedules = sparse_schedule_generator(speed_ration_map)
    breakdowns = malfunction_from_params(stochastic_data)  # Malfunction data generator

    env = RailEnv(
        width=x_dim,
        height=y_dim,
        rail_generator=rails,
        schedule_generator=schedules,
        number_of_agents=n_agents,
        malfunction_generator_and_process_data=breakdowns,
        obs_builder_object=observation,
    )
    return env, random_seed
def test_malfanction_from_params():
    """Check that the malfunction parameters given to the env are exposed unchanged after reset."""
    rate, shortest, longest = 1000, 2, 5
    stochastic_data = MalfunctionParameters(
        malfunction_rate=rate,  # Rate of malfunction occurrence
        min_duration=shortest,  # Minimal duration of malfunction
        max_duration=longest,  # Max duration of malfunction
    )

    rail, _ = make_simple_rail2()

    env = RailEnv(
        width=25,
        height=30,
        rail_generator=rail_from_grid_transition_map(rail),
        schedule_generator=random_schedule_generator(),
        number_of_agents=10,
        malfunction_generator_and_process_data=malfunction_from_params(stochastic_data),
    )
    env.reset()

    process_data = env.malfunction_process_data
    assert process_data.malfunction_rate == 1000
    assert process_data.min_duration == 2
    assert process_data.max_duration == 5
def env(self):
    """Construct a grid-mode sparse ``RailEnv`` from this object's settings.

    Reads ``obs_builder_dict``/``obs_builder``, ``width``, ``height``,
    ``max_num_cities``, ``speed_ration_map``, ``number_of_agents`` and
    ``stochastic_data`` from ``self``.

    Returns:
        The constructed ``RailEnv`` (not reset), with step recording enabled
        and agents kept on the map after reaching their target.
    """
    # Look up the observation builder selected by name.
    builder = self.obs_builder_dict[self.obs_builder]

    rails = sparse_rail_generator(
        max_num_cities=self.max_num_cities,  # Number of cities in map (where train stations are)
        seed=19,  # Random seed
        grid_mode=True,
        max_rails_between_cities=2,
        max_rails_in_city=2,
    )
    schedules = sparse_schedule_generator(self.speed_ration_map)
    breakdowns = malfunction_from_params(self.stochastic_data)  # Malfunction data generator

    return RailEnv(
        width=self.width,  # width and height are given as a number of grid cells
        height=self.height,
        rail_generator=rails,
        schedule_generator=schedules,
        number_of_agents=self.number_of_agents,
        malfunction_generator_and_process_data=breakdowns,
        obs_builder_object=builder,
        remove_agents_at_target=False,
        record_steps=True,
    )
def test_malfunction_process():
    """Run one agent for 100 steps and check that it stands still while malfunctioning."""
    # Fixed malfunction duration (min == max == 3) for this test
    stochastic_data = MalfunctionParameters(
        malfunction_rate=1,  # Rate of malfunction occurrence
        min_duration=3,  # Minimal duration of malfunction
        max_duration=3,  # Max duration of malfunction
    )

    rail, _ = make_simple_rail2()

    env = RailEnv(
        width=25,
        height=30,
        rail_generator=rail_from_grid_transition_map(rail),
        schedule_generator=random_schedule_generator(),
        number_of_agents=1,
        malfunction_generator_and_process_data=malfunction_from_params(stochastic_data),
        obs_builder_object=SingleAgentNavigationObs(),
    )

    obs, info = env.reset(False, False, True, random_seed=10)

    total_down_time = 0
    previous_position = env.agents[0].position

    # Move target to unreachable position in order to not interfere with test
    env.agents[0].target = (0, 0)

    for _ in range(100):
        # Action index is derived from the largest observation entry.
        actions = {handle: np.argmax(obs[handle]) + 1 for handle in range(len(obs))}

        obs, _rewards, _done, _ = env.step(actions)

        remaining = env.agents[0].malfunction_data['malfunction']
        if remaining > 0:
            # Check that agent is not moving while malfunctioning
            assert previous_position == env.agents[0].position

        previous_position = env.agents[0].position
        total_down_time += env.agents[0].malfunction_data['malfunction']

    # Check that the appropriate number of malfunctions is achieved
    assert env.agents[0].malfunction_data['nr_malfunctions'] == 23, "Actual {}".format(
        env.agents[0].malfunction_data['nr_malfunctions'])

    # Check that the agent actually spent time broken down
    assert total_down_time > 0
def _launch(self):
    """Create and reset a sparse ``RailEnv`` from ``self._config``.

    Malfunctions are enabled only when ``malfunction_rate``,
    ``malfunction_min_duration`` and ``malfunction_max_duration`` are all
    present in the config. A ``speed_ratio_map`` entry, if present, is
    converted to ``float`` keys and values.

    Returns:
        The environment. If a ``ValueError`` is raised while constructing or
        resetting it, the error is logged and whatever was bound at that
        point is returned (``None`` when construction itself failed).
    """
    cfg = self._config

    rail_generator = sparse_rail_generator(
        seed=cfg['seed'],
        max_num_cities=cfg['max_num_cities'],
        grid_mode=cfg['grid_mode'],
        max_rails_between_cities=cfg['max_rails_between_cities'],
        max_rails_in_city=cfg['max_rails_in_city'],
    )

    # Default: no malfunctions, unless all three parameters are configured.
    malfunction_generator = no_malfunction_generator()
    needed_keys = {'malfunction_rate', 'malfunction_min_duration', 'malfunction_max_duration'}
    if needed_keys <= cfg.keys():
        malfunction_generator = malfunction_from_params({
            'malfunction_rate': cfg['malfunction_rate'],
            'min_duration': cfg['malfunction_min_duration'],
            'max_duration': cfg['malfunction_max_duration'],
        })

    speed_ratio_map = None
    if 'speed_ratio_map' in cfg:
        speed_ratio_map = {float(speed): float(share) for speed, share in cfg['speed_ratio_map'].items()}
    schedule_generator = sparse_schedule_generator(speed_ratio_map)

    env = None
    try:
        env = RailEnv(
            width=cfg['width'],
            height=cfg['height'],
            rail_generator=rail_generator,
            schedule_generator=schedule_generator,
            number_of_agents=cfg['number_of_agents'],
            malfunction_generator_and_process_data=malfunction_generator,
            obs_builder_object=self._observation.builder(),
            remove_agents_at_target=False,
            random_seed=cfg['seed'],
            # NOTE(review): the original author asks whether the renderer
            # should be disabled here, since the env is re-created for
            # different configs and an opened renderer has to be closed.
            use_renderer=self._env_config.get('render'),
        )
        env.reset()
    except ValueError as e:
        banner = "=" * 50
        logging.error(banner)
        logging.error(f"Error while creating env: {e}")
        logging.error(banner)

    return env
def test_malfunction_process_statistically():
    """Compare the malfunction counters of ten agents against a recorded 20-step trace."""
    # Fixed malfunction duration (min == max == 5) for this test
    stochastic_data = MalfunctionParameters(
        malfunction_rate=1 / 5,  # Rate of malfunction occurrence
        min_duration=5,  # Minimal duration of malfunction
        max_duration=5,  # Max duration of malfunction
    )

    rail, _ = make_simple_rail2()

    env = RailEnv(
        width=25,
        height=30,
        rail_generator=rail_from_grid_transition_map(rail),
        schedule_generator=random_schedule_generator(),
        number_of_agents=10,
        malfunction_generator_and_process_data=malfunction_from_params(stochastic_data),
        obs_builder_object=SingleAgentNavigationObs(),
    )

    env.reset(True, True, False, random_seed=10)
    env.agents[0].target = (0, 0)

    # One row per agent, one column per step (recorded reference values).
    agent_malfunction_list = [
        [0, 0, 0, 0, 5, 4, 3, 2, 1, 0, 5, 4, 3, 2, 1, 0, 0, 0, 5, 4],
        [0, 5, 4, 3, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 5, 4, 3, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 5, 4, 3, 2, 1, 0, 5, 4, 3, 2, 1, 0, 0, 5, 4, 3, 2],
        [0, 0, 0, 0, 5, 4, 3, 2, 1, 0, 0, 0, 0, 0, 0, 5, 4, 3, 2, 1],
        [0, 0, 5, 4, 3, 2, 1, 0, 0, 5, 4, 3, 2, 1, 0, 5, 4, 3, 2, 1],
        [0, 0, 0, 0, 0, 0, 0, 0, 5, 4, 3, 2, 1, 0, 0, 0, 0, 0, 0, 0],
        [5, 4, 3, 2, 1, 0, 0, 5, 4, 3, 2, 1, 0, 0, 0, 0, 0, 0, 0, 5],
        [5, 4, 3, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 5, 4, 3, 2],
        [5, 4, 3, 2, 1, 0, 0, 0, 0, 0, 5, 4, 3, 2, 1, 0, 0, 0, 5, 4],
    ]

    for step in range(20):
        action_dict: Dict[int, RailEnvActions] = {}
        for handle in range(env.get_num_agents()):
            # Pick a random action for this agent
            action_dict[handle] = RailEnvActions(np.random.randint(4))
            # The counter is checked before the env is stepped.
            observed = env.agents[handle].malfunction_data['malfunction']
            assert observed == agent_malfunction_list[handle][step]
        env.step(action_dict)
def _launch(self, env_params, observation):
    """Build a sparse ``RailEnv`` from ``env_params`` using ``observation`` as builder.

    Args:
        env_params: Object exposing ``x_dim``, ``y_dim``, ``n_cities``,
            ``max_rails_between_cities``, ``max_rails_in_city``, ``seed``,
            ``speed_profiles``, ``n_agents`` and ``malfunction_parameters``.
        observation: Observation builder object.

    Returns:
        The constructed ``RailEnv`` (``reset`` is not called here).
    """
    # The same seed drives both the rail generator and the environment.
    rails = sparse_rail_generator(
        max_num_cities=env_params.n_cities,
        grid_mode=False,
        max_rails_between_cities=env_params.max_rails_between_cities,
        max_rails_in_city=env_params.max_rails_in_city,
        seed=env_params.seed,
    )
    schedules = sparse_schedule_generator(env_params.speed_profiles)
    breakdowns = malfunction_from_params(env_params.malfunction_parameters)

    env = RailEnv(
        width=env_params.x_dim,
        height=env_params.y_dim,
        rail_generator=rails,
        schedule_generator=schedules,
        number_of_agents=env_params.n_agents,
        malfunction_generator_and_process_data=breakdowns,
        obs_builder_object=observation,
        random_seed=env_params.seed,
    )
    return env
def random_sparse_env_small(random_seed, max_width, max_height, observation_builder):
    """Generate a small random sparse ``RailEnv``, growing the grid until generation succeeds.

    All size, city, train and malfunction settings are drawn from ``random``
    after seeding it with ``random_seed``. When ``RailEnv`` raises
    ``ValueError`` the grid is enlarged by 5 cells in each dimension and
    construction is retried while it still fits ``max_width`` x ``max_height``.

    Args:
        random_seed: Seed for ``random`` and for the rail generator.
        max_width: Largest grid width that may be tried.
        max_height: Largest grid height that may be tried.
        observation_builder: Observation builder object for the environment.

    Returns:
        The constructed ``RailEnv``, or ``None`` if no size within the
        limits could be generated.
    """
    random.seed(random_seed)
    size = random.randint(0, 5)
    width = 20 + size * 5
    height = 20 + size * 5
    nr_cities = 2 + size // 2 + random.randint(0, 2)
    nr_trains = min(nr_cities * 5, 5 + random.randint(0, 5))
    max_rails_between_cities = 2
    max_rails_in_cities = 3 + random.randint(0, size)
    malfunction_rate = 30 + random.randint(0, 100)
    malfunction_min_duration = 3 + random.randint(0, 7)
    malfunction_max_duration = 20 + random.randint(0, 80)

    rail_generator = sparse_rail_generator(
        max_num_cities=nr_cities,
        seed=random_seed,
        grid_mode=False,
        max_rails_between_cities=max_rails_between_cities,
        max_rails_in_city=max_rails_in_cities,
    )

    # Dict form of the malfunction parameters, as in the original code.
    stochastic_data = {
        'malfunction_rate': malfunction_rate,
        'min_duration': malfunction_min_duration,
        'max_duration': malfunction_max_duration,
    }

    schedule_generator = sparse_schedule_generator({1.: 0.25, 1. / 2.: 0.25, 1. / 3.: 0.25, 1. / 4.: 0.25})

    while width <= max_width and height <= max_height:
        try:
            env = RailEnv(
                width=width,
                height=height,
                rail_generator=rail_generator,
                schedule_generator=schedule_generator,
                number_of_agents=nr_trains,
                malfunction_generator_and_process_data=malfunction_from_params(stochastic_data),
                obs_builder_object=observation_builder,
                remove_agents_at_target=False,
            )

            print("[{}] {}x{} {} cities {} trains, max {} rails between cities, max {} rails in cities. "
                  "Malfunction rate {}, {} to {} steps.".format(
                      random_seed, width, height, nr_cities, nr_trains, max_rails_between_cities,
                      max_rails_in_cities, malfunction_rate, malfunction_min_duration,
                      malfunction_max_duration))

            return env
        except ValueError as e:
            logging.error(f"Error: {e}")
            width += 5
            height += 5
            # Fix: logging needs %-placeholders for its positional arguments;
            # without them the record fails to format instead of being logged.
            logging.info("Try again with larger env: (w,h): %s %s", width, height)

    # Fix: report max_width under its own label (it previously echoed max_height).
    logging.error(f"Unable to generate env with seed={random_seed}, max_width={max_width}, max_height={max_height}")
    return None
def create_and_save_env(file_name: str, schedule_generator: ScheduleGenerator, rail_generator: RailGenerator):
    """Create a 30x30 environment with 10 agents, reset it and persist it to ``file_name``.

    Args:
        file_name: Destination passed to ``RailEnvPersister.save``.
        schedule_generator: Schedule generator for the environment.
        rail_generator: Rail generator for the environment.
    """
    breakdown_params = MalfunctionParameters(
        malfunction_rate=1000,  # Rate of malfunction occurrence
        min_duration=15,  # Minimal duration of malfunction
        max_duration=50,  # Max duration of malfunction
    )
    breakdowns = malfunction_from_params(breakdown_params)

    env = RailEnv(
        width=30,
        height=30,
        rail_generator=rail_generator,
        schedule_generator=schedule_generator,
        number_of_agents=10,
        malfunction_generator_and_process_data=breakdowns,
        remove_agents_at_target=True,
    )
    env.reset(True, True)

    # Saving goes through the persister rather than a method on the env.
    RailEnvPersister.save(env, file_name)
def _launch(self):
    """Create and reset a ``RailEnv`` from ``self._config``.

    The rail generator comes from ``self.get_rail_generator()``.
    Malfunctions are enabled only when ``malfunction_rate``,
    ``malfunction_min_duration`` and ``malfunction_max_duration`` are all
    present in the config. A ``speed_ratio_map`` entry, if present, is
    converted to ``float`` keys and values.

    Returns:
        The environment. If a ``ValueError`` is raised while constructing or
        resetting it, the error is logged and whatever was bound at that
        point is returned (``None`` when construction itself failed).
    """
    rail_generator = self.get_rail_generator()

    malfunction_generator = no_malfunction_generator()
    # Fix: test for exactly the keys that are read below. The previous guard
    # looked for 'min_duration'/'max_duration', which either raised KeyError
    # on the prefixed lookups or silently left malfunctions disabled.
    required_keys = {'malfunction_rate', 'malfunction_min_duration', 'malfunction_max_duration'}
    if required_keys <= self._config.keys():
        stochastic_data = {
            'malfunction_rate': self._config['malfunction_rate'],
            'min_duration': self._config['malfunction_min_duration'],
            'max_duration': self._config['malfunction_max_duration'],
        }
        malfunction_generator = malfunction_from_params(stochastic_data)

    speed_ratio_map = None
    if 'speed_ratio_map' in self._config:
        speed_ratio_map = {
            float(k): float(v) for k, v in self._config['speed_ratio_map'].items()
        }
    schedule_generator = sparse_schedule_generator(speed_ratio_map)

    env = None
    try:
        env = RailEnv(
            width=self._config['width'],
            height=self._config['height'],
            rail_generator=rail_generator,
            schedule_generator=schedule_generator,
            number_of_agents=self._config['number_of_agents'],
            malfunction_generator_and_process_data=malfunction_generator,
            obs_builder_object=self._observation.builder(),
            remove_agents_at_target=False,
            random_seed=self._config['seed'],
        )
        env.reset()
    except ValueError as e:
        logging.error("=" * 50)
        logging.error(f"Error while creating env: {e}")
        logging.error("=" * 50)

    return env
def __init__(self, n_cars=3, n_acts=5, min_obs=-1, max_obs=1, n_nodes=2, ob_radius=10, x_dim=36, y_dim=36,
             feats='all'):
    """Set up the wrapped ``RailEnv``, its tree observation and a renderer.

    Args:
        n_cars: Number of agents in the environment.
        n_acts: Not used in this constructor.
        min_obs: Not used in this constructor.
        max_obs: Not used in this constructor.
        n_nodes: Passed to ``TreeObservation`` and stored as ``self.n_nodes``.
        ob_radius: Stored as ``self.ob_radius``.
        x_dim: Environment width.
        y_dim: Environment height.
        feats: Stored as ``self.feats``.
    """
    self.tree_obs = tree_observation.TreeObservation(n_nodes)
    self.n_cars, self.n_nodes = n_cars, n_nodes
    self.ob_radius, self.feats = ob_radius, feats

    rail_gen = sparse_rail_generator(
        max_num_cities=3,
        seed=666,
        grid_mode=False,
        max_rails_between_cities=2,
        max_rails_in_city=3,
    )

    # NOTE(review): speed_ration_map and stochastic_data are not parameters;
    # presumably module-level names defined elsewhere in the file - confirm.
    schedules = sparse_schedule_generator(speed_ration_map)
    breakdowns = malfunction_from_params(stochastic_data)

    self._rail_env = RailEnv(
        width=x_dim,
        height=y_dim,
        rail_generator=rail_gen,
        schedule_generator=schedules,
        number_of_agents=n_cars,
        malfunction_generator_and_process_data=breakdowns,
        obs_builder_object=self.tree_obs,
    )

    self.renderer = RenderTool(self._rail_env, gl="PILSVG")

    # Per-episode bookkeeping containers, initially empty.
    self.action_dict = {}
    self.info = {}
    self.old_obs = {}
def test_initial_malfunction():
    """Replay one agent that is put into a malfunction on its first step.

    The replay checks position, remaining malfunction and reward for five
    consecutive ``MOVE_FORWARD`` actions.
    """
    stochastic_data = MalfunctionParameters(
        malfunction_rate=1000,  # Rate of malfunction occurrence
        min_duration=2,  # Minimal duration of malfunction
        max_duration=5,  # Max duration of malfunction
    )

    rail, _ = make_simple_rail2()

    env = RailEnv(
        width=25,
        height=30,
        rail_generator=rail_from_grid_transition_map(rail),
        schedule_generator=random_schedule_generator(seed=10),
        number_of_agents=1,
        malfunction_generator_and_process_data=malfunction_from_params(stochastic_data),  # Malfunction data generator
        obs_builder_object=SingleAgentNavigationObs(),
    )
    # reset to initialize agents_static
    env.reset(False, False, True, random_seed=10)
    print(env.agents[0].malfunction_data)
    env.agents[0].target = (0, 5)
    set_penalties_for_replay(env)

    heading = Grid4TransitionsEnum.EAST
    forward = RailEnvActions.MOVE_FORWARD
    start_cell = (3, 2)

    expected_steps = [
        Replay(
            position=start_cell,
            direction=heading,
            action=forward,
            set_malfunction=3,
            malfunction=3,
            reward=env.step_penalty,  # full step penalty when malfunctioning
        ),
        Replay(
            position=start_cell,
            direction=heading,
            action=forward,
            malfunction=2,
            reward=env.step_penalty,  # full step penalty when malfunctioning
        ),
        # malfunction stops in the next step and we're still at the beginning of the cell
        # --> if we take action MOVE_FORWARD, agent should restart and move to the next cell
        Replay(
            position=start_cell,
            direction=heading,
            action=forward,
            malfunction=1,
            reward=env.step_penalty,
        ),
        # malfunctioning ends: starting and running at speed 1.0
        Replay(
            position=start_cell,
            direction=heading,
            action=forward,
            malfunction=0,
            reward=env.start_penalty + env.step_penalty * 1.0,  # running at speed 1.0
        ),
        Replay(
            position=(3, 3),
            direction=heading,
            action=forward,
            malfunction=0,
            reward=env.step_penalty,  # running at speed 1.0
        ),
    ]

    replay_config = ReplayConfig(
        replay=expected_steps,
        speed=env.agents[0].speed_data['speed'],
        target=env.agents[0].target,
        initial_position=start_cell,
        initial_direction=heading,
    )
    run_replay_config(env, [replay_config])
height=config[test].as_int('height'), rail_generator=sparse_rail_generator( max_num_cities=config[test].as_int('max_num_cities'), seed=config[test].as_int('seed'), grid_mode=grid_distribution_of_cities, max_rails_between_cities=config[test].as_int( 'max_rails_between_cities'), max_rails_in_city=config[test].as_int('max_rail_in_city')), schedule_generator=schedule_generator, number_of_agents=config[test].as_int('num_agents'), obs_builder_object=observation_builder, malfunction_generator_and_process_data=malfunction_from_params( parameters={ 'malfunction_rate': config[test].as_int( 'malfunction_rate' ), # Rate of malfunction occurrence of single agent 'min_duration': config[test].as_int( 'min_duration'), # Minimal duration of malfunction 'max_duration': config[test].as_int( 'max_duration') # Max duration of malfunction }), remove_agents_at_target=True) obs, info = env.reset(True, True) # Initiate the renderer env_renderer = RenderTool( env, gl="PILSVG", agent_render_variant=AgentRenderVariant.AGENT_SHOWS_OPTIONS_AND_BOX, show_debug=True, screen_height=1080,
def create_test_env(fnParams, nTest, sDir):
    """Generate test level ``nTest`` and save it as ``<sDir>/Level_<nTest>.mpk``.

    ``fnParams(nTest)`` must return the 10-tuple ``(seed, width, height,
    nr_trains, nr_cities, max_rails_between_cities, max_rails_in_cities,
    malfunction_rate, malfunction_min_duration, malfunction_max_duration)``.
    Construction and reset are attempted up to five times; after each
    ``ValueError`` the grid is enlarged by 5 cells in both dimensions.

    Args:
        fnParams: Callable mapping a test number to the parameter tuple.
        nTest: Test/level number.
        sDir: Output directory; created if missing.

    Returns:
        The generated and reset ``RailEnv``.

    Raises:
        ValueError: If no environment could be generated in five attempts.
    """
    (seed, width, height, nr_trains, nr_cities, max_rails_between_cities,
     max_rails_in_cities, malfunction_rate, malfunction_min_duration,
     malfunction_max_duration) = fnParams(nTest)

    rail_generator = sparse_rail_generator(
        max_num_cities=nr_cities,
        seed=seed,
        grid_mode=False,
        max_rails_between_cities=max_rails_between_cities,
        max_rails_in_city=max_rails_in_cities,
    )

    stochastic_data = MalfunctionParameters(
        malfunction_rate=malfunction_rate,
        min_duration=malfunction_min_duration,
        max_duration=malfunction_max_duration)

    observation_builder = GlobalObsForRailEnv()

    DEFAULT_SPEED_RATIO_MAP = {
        1.: 0.25,
        1. / 2.: 0.25,
        1. / 3.: 0.25,
        1. / 4.: 0.25
    }

    schedule_generator = sparse_schedule_generator(DEFAULT_SPEED_RATIO_MAP)

    last_error = None
    for iAttempt in range(5):
        try:
            env = RailEnv(
                width=width,
                height=height,
                rail_generator=rail_generator,
                schedule_generator=schedule_generator,
                number_of_agents=nr_trains,
                malfunction_generator_and_process_data=malfunction_from_params(stochastic_data),
                obs_builder_object=observation_builder,
                remove_agents_at_target=True)
            env.reset(random_seed=seed)
            break
        except ValueError as oErr:
            # Keep a reference: the "as" name is cleared when the clause ends.
            last_error = oErr
            print("Error:", oErr)
            width += 5
            height += 5
            print("Try again with larger env: (w,h):", width, height)
    else:
        # Fix: previously this fell through and either hit an unbound ``env``
        # or saved an environment whose reset had failed.
        raise ValueError(
            "Unable to generate env for test {} after 5 attempts".format(nTest)) from last_error

    # Fix: exist_ok avoids the check-then-create race of the original.
    os.makedirs(sDir, exist_ok=True)

    sfName = "{}/Level_{}.mpk".format(sDir, nTest)
    if os.path.exists(sfName):
        os.remove(sfName)
    env.save(sfName)

    # Progress marker, one dot per generated level.
    sys.stdout.write(".")
    sys.stdout.flush()

    return env
def eval_policy(env_params, checkpoint, n_eval_episodes, max_steps, action_size, state_size, seed, render,
                allow_skipping, allow_caching):
    """Evaluate a DDDQN checkpoint on freshly generated sparse environments.

    Args:
        env_params: Mapping with ``n_agents``, ``x_dim``, ``y_dim``,
            ``n_cities``, ``max_rails_between_cities``, ``max_rails_in_city``,
            ``observation_tree_depth``, ``observation_radius`` and
            ``observation_max_path_depth``.
        checkpoint: Path given to ``torch.load`` for the Q-network.
        n_eval_episodes: Number of episodes to run.
        max_steps: Step budget; at most ``max_steps - 1`` steps are taken.
        action_size: Size of the action space for the policy.
        state_size: Size of the normalised observation for the policy.
        seed: Base seed; incremented by one before every episode.
        render: Whether to render every step.
        allow_skipping: Stop an episode early when ``check_if_all_blocked``
            reports that all agents are blocked.
        allow_caching: Reuse the last action of an agent whose observation
            did not change.

    Returns:
        Tuple ``(scores, completions, nb_steps, agent_times, step_times)``
        with one entry per episode.
    """
    # Evaluation is faster on CPU (except if you use a really huge policy)
    parameters = {'use_gpu': False}

    policy = DDDQNPolicy(state_size, action_size, Namespace(**parameters), evaluation_mode=True)
    policy.qnetwork_local = torch.load(checkpoint)

    env_params = Namespace(**env_params)

    # Environment parameters
    n_agents = env_params.n_agents
    x_dim = env_params.x_dim
    y_dim = env_params.y_dim
    n_cities = env_params.n_cities
    max_rails_between_cities = env_params.max_rails_between_cities
    max_rails_in_city = env_params.max_rails_in_city

    # Malfunction and speed profiles
    # TODO pass these parameters properly from main!
    malfunction_parameters = MalfunctionParameters(
        malfunction_rate=1. / 2000,  # Rate of malfunctions
        min_duration=20,  # Minimal duration
        max_duration=50  # Max duration
    )

    # Only fast trains in Round 1
    speed_profiles = {
        1.: 1.0,  # Fast passenger train
        1. / 2.: 0.0,  # Fast freight train
        1. / 3.: 0.0,  # Slow commuter train
        1. / 4.: 0.0  # Slow freight train
    }

    # Observation parameters
    observation_tree_depth = env_params.observation_tree_depth
    observation_radius = env_params.observation_radius
    observation_max_path_depth = env_params.observation_max_path_depth

    # Observation builder
    predictor = ShortestPathPredictorForRailEnv(observation_max_path_depth)
    tree_observation = TreeObsForRailEnv(max_depth=observation_tree_depth, predictor=predictor)

    # Setup the environment
    env = RailEnv(
        width=x_dim,
        height=y_dim,
        rail_generator=sparse_rail_generator(
            max_num_cities=n_cities,
            grid_mode=False,
            max_rails_between_cities=max_rails_between_cities,
            max_rails_in_city=max_rails_in_city,
        ),
        schedule_generator=sparse_schedule_generator(speed_profiles),
        number_of_agents=n_agents,
        malfunction_generator_and_process_data=malfunction_from_params(malfunction_parameters),
        obs_builder_object=tree_observation
    )

    if render:
        env_renderer = RenderTool(env, gl="PGL")

    # NOTE: action_dict is shared across episodes, as in the original code.
    action_dict = dict()
    scores = []
    completions = []
    nb_steps = []
    inference_times = []
    preproc_times = []
    agent_times = []
    step_times = []

    for episode_idx in range(n_eval_episodes):
        seed += 1

        inference_timer = Timer()
        preproc_timer = Timer()
        agent_timer = Timer()
        step_timer = Timer()

        step_timer.start()
        obs, info = env.reset(
            regenerate_rail=True,
            regenerate_schedule=True,
            random_seed=seed
        )
        step_timer.end()

        # Fix: start every episode with a fresh "nobody is done" state so the
        # skip path and the completion count never read an unbound ``done``
        # or the one left over from the previous episode.
        done = {handle: False for handle in env.get_agent_handles()}
        done['__all__'] = False

        score = 0.0

        if render:
            env_renderer.set_new_rail()

        final_step = 0
        skipped = 0

        nb_hit = 0
        agent_last_obs = {}
        agent_last_action = {}

        for step in range(max_steps - 1):
            if allow_skipping and check_if_all_blocked(env):
                # FIXME why -1? bug where all agents are "done" after max_steps!
                skipped = max_steps - step - 1
                final_step = max_steps - 2
                n_unfinished_agents = sum(not done[idx] for idx in env.get_agent_handles())
                score -= skipped * n_unfinished_agents
                break

            agent_timer.start()
            for agent in env.get_agent_handles():
                if obs[agent] and info['action_required'][agent]:
                    if agent in agent_last_obs and np.all(agent_last_obs[agent] == obs[agent]):
                        # Unchanged observation: reuse the cached action.
                        nb_hit += 1
                        action = agent_last_action[agent]
                    else:
                        preproc_timer.start()
                        norm_obs = normalize_observation(
                            obs[agent],
                            tree_depth=observation_tree_depth,
                            observation_radius=observation_radius)
                        preproc_timer.end()

                        inference_timer.start()
                        action = policy.act(norm_obs, eps=0.0)
                        inference_timer.end()

                    action_dict.update({agent: action})

                    if allow_caching:
                        agent_last_obs[agent] = obs[agent]
                        agent_last_action[agent] = action
            agent_timer.end()

            step_timer.start()
            obs, all_rewards, done, info = env.step(action_dict)
            step_timer.end()

            if render:
                env_renderer.render_env(
                    show=True,
                    frames=False,
                    show_observations=False,
                    show_predictions=False
                )

                if step % 100 == 0:
                    print("{}/{}".format(step, max_steps - 1))

            for agent in env.get_agent_handles():
                score += all_rewards[agent]

            final_step = step

            if done['__all__']:
                break

        normalized_score = score / (max_steps * env.get_num_agents())
        scores.append(normalized_score)

        tasks_finished = sum(done[idx] for idx in env.get_agent_handles())
        completion = tasks_finished / max(1, env.get_num_agents())
        completions.append(completion)

        nb_steps.append(final_step)

        inference_times.append(inference_timer.get())
        preproc_times.append(preproc_timer.get())
        agent_times.append(agent_timer.get())
        step_times.append(step_timer.get())

        # Fix: final_step is 0 when the episode ends on its first step;
        # clamp the divisors so the summary line cannot raise.
        steps_for_stats = max(1, final_step)

        skipped_text = ""
        if skipped > 0:
            skipped_text = "\t⚡ Skipped {}".format(skipped)

        hit_text = ""
        if nb_hit > 0:
            hit_text = "\t⚡ Hit {} ({:.1f}%)".format(nb_hit, (100 * nb_hit) / max(1, n_agents * final_step))

        print(
            "☑️ Score: {:.3f} \tDone: {:.1f}% \tNb steps: {:.3f} "
            "\t🍭 Seed: {}"
            "\t🚉 Env: {:.3f}s "
            "\t🤖 Agent: {:.3f}s (per step: {:.3f}s) \t[preproc: {:.3f}s \tinfer: {:.3f}s]"
            "{}{}".format(
                normalized_score,
                completion * 100.0,
                final_step,
                seed,
                step_timer.get(),
                agent_timer.get(),
                agent_timer.get() / steps_for_stats,
                preproc_timer.get(),
                inference_timer.get(),
                skipped_text,
                hit_text
            )
        )

    return scores, completions, nb_steps, agent_times, step_times
def eval_policy(env_params, checkpoint, n_eval_episodes, max_steps, action_size, state_size, seed, render,
                allow_skipping, allow_caching):
    """Evaluate per-agent attention policies on freshly generated sparse environments.

    One ``AttentionAgent`` is created per agent and its policy is loaded on
    CPU from ``<checkpoint>/2300_agent<id>.pth``. When ``render`` is set,
    every step is rendered and the captured frames of each episode are
    written to a GIF at a hard-coded path.

    Args:
        env_params: Mapping with ``n_agents``, ``x_dim``, ``y_dim``,
            ``n_cities``, ``max_rails_between_cities``, ``max_rails_in_city``,
            ``observation_tree_depth``, ``observation_radius`` and
            ``observation_max_path_depth``.
        checkpoint: Directory containing the per-agent policy files.
        n_eval_episodes: Number of episodes to run.
        max_steps: Step budget; at most ``max_steps - 1`` steps are taken.
        action_size: Output size of each agent policy.
        state_size: Input size of each agent policy.
        seed: Base seed; incremented by one before every episode.
        render: Whether to render and record every step.
        allow_skipping: Stop an episode early when ``check_if_all_blocked``
            reports that all agents are blocked.
        allow_caching: Reuse the last action of an agent whose observation
            did not change.

    Returns:
        Tuple ``(scores, completions, nb_steps, agent_times, step_times)``
        with one entry per episode.
    """
    env_params = Namespace(**env_params)

    # Environment parameters
    n_agents = env_params.n_agents
    x_dim = env_params.x_dim
    y_dim = env_params.y_dim
    n_cities = env_params.n_cities
    max_rails_between_cities = env_params.max_rails_between_cities
    max_rails_in_city = env_params.max_rails_in_city

    # One policy per agent, loaded on CPU and switched to eval mode.
    agents = []
    for agent_id in range(n_agents):
        agent = AttentionAgent(num_in_pol=state_size, num_out_pol=action_size, hidden_dim=256, lr=0.001)
        agent.policy = torch.load(
            os.path.join(checkpoint, f'2300_agent{agent_id}' + '.pth'),
            map_location=torch.device('cpu'))
        agent.policy.eval()
        agents.append(agent)

    # Malfunction and speed profiles
    # TODO pass these parameters properly from main!
    malfunction_parameters = MalfunctionParameters(
        malfunction_rate=1. / 2000,  # Rate of malfunctions
        min_duration=20,  # Minimal duration
        max_duration=50  # Max duration
    )

    # Only fast trains in Round 1
    speed_profiles = {
        1.: 1.0,  # Fast passenger train
        1. / 2.: 0.0,  # Fast freight train
        1. / 3.: 0.0,  # Slow commuter train
        1. / 4.: 0.0  # Slow freight train
    }

    # Observation parameters
    observation_tree_depth = env_params.observation_tree_depth
    observation_radius = env_params.observation_radius
    observation_max_path_depth = env_params.observation_max_path_depth

    # Observation builder
    predictor = ShortestPathPredictorForRailEnv(observation_max_path_depth)
    tree_observation = TreeObsForRailEnv(max_depth=observation_tree_depth, predictor=predictor)

    # Setup the environment
    env = RailEnv(
        width=x_dim,
        height=y_dim,
        rail_generator=sparse_rail_generator(
            max_num_cities=n_cities,
            grid_mode=False,
            max_rails_between_cities=max_rails_between_cities,
            max_rails_in_city=max_rails_in_city,
        ),
        schedule_generator=sparse_schedule_generator(speed_profiles),
        number_of_agents=n_agents,
        malfunction_generator_and_process_data=malfunction_from_params(malfunction_parameters),
        obs_builder_object=tree_observation
    )

    if render:
        env_renderer = RenderTool(
            env,
            agent_render_variant=AgentRenderVariant.AGENT_SHOWS_OPTIONS_AND_BOX,
            show_debug=False,
            screen_height=600,  # Adjust these parameters to fit your resolution
            screen_width=800)

    # Every agent starts with action 0; the dict is shared across episodes.
    action_dict = {agent_id: 0 for agent_id in range(n_agents)}
    scores = []
    completions = []
    nb_steps = []
    inference_times = []
    preproc_times = []
    agent_times = []
    step_times = []

    for episode_idx in range(n_eval_episodes):
        images = []
        seed += 1

        inference_timer = Timer()
        preproc_timer = Timer()
        agent_timer = Timer()
        step_timer = Timer()

        step_timer.start()
        obs, info = env.reset(
            regenerate_rail=True,
            regenerate_schedule=True,
            random_seed=seed
        )
        step_timer.end()

        # Fix: start every episode with a fresh "nobody is done" state so the
        # skip path and the completion count never read an unbound ``done``
        # or the one left over from the previous episode.
        done = {handle: False for handle in env.get_agent_handles()}
        done['__all__'] = False

        score = 0.0

        if render:
            env_renderer.set_new_rail()

        final_step = 0
        skipped = 0

        nb_hit = 0
        agent_last_obs = {}
        agent_last_action = {}

        for step in range(max_steps - 1):
            if allow_skipping and check_if_all_blocked(env):
                # FIXME why -1? bug where all agents are "done" after max_steps!
                skipped = max_steps - step - 1
                final_step = max_steps - 2
                n_unfinished_agents = sum(not done[idx] for idx in env.get_agent_handles())
                score -= skipped * n_unfinished_agents
                break

            agent_timer.start()
            for agent in env.get_agent_handles():
                agent_model = agents[agent]
                if obs[agent] and info['action_required'][agent]:
                    if agent in agent_last_obs and np.all(agent_last_obs[agent] == obs[agent]):
                        # Unchanged observation: reuse the cached action.
                        nb_hit += 1
                        action = agent_last_action[agent]
                    else:
                        preproc_timer.start()
                        norm_obs = normalize_observation(
                            obs[agent],
                            tree_depth=observation_tree_depth,
                            observation_radius=observation_radius)
                        preproc_timer.end()

                        inference_timer.start()
                        action = act(agent_model, norm_obs)
                        inference_timer.end()

                    action_dict.update({agent: action})

                    if allow_caching:
                        agent_last_obs[agent] = obs[agent]
                        agent_last_action[agent] = action
            agent_timer.end()

            step_timer.start()
            obs, all_rewards, done, info = env.step(action_dict)
            step_timer.end()

            if render:
                env_renderer.render_env(
                    show=True,
                    frames=False,
                    show_observations=False,
                    show_predictions=False
                )
                # Capture the rendered frame for the episode GIF.
                im = env_renderer.get_image()
                im = PIL.Image.fromarray(im)
                images.append(im)

                if step % 100 == 0:
                    print("{}/{}".format(step, max_steps - 1))

            for agent in env.get_agent_handles():
                score += all_rewards[agent]

            final_step = step

            if done['__all__']:
                break

        # Fix: only export when at least one frame was captured; indexing an
        # empty frame list used to raise IndexError.
        if render and images:
            # Hold the last frame for ten extra frames at the end of the GIF.
            images.extend([images[-1]] * 10)
            # save video
            # NOTE(review): absolute, user-specific output path - consider
            # making this configurable.
            images[0].save(
                f'/Users/nikhilvs/repos/nyu/flatland-reinforcement-learning/videos/maac-final/out_{episode_idx}.gif',
                save_all=True,
                append_images=images[1:],
                optimize=False,
                duration=60,
                loop=0)

        normalized_score = score / (max_steps * env.get_num_agents())
        scores.append(normalized_score)

        tasks_finished = sum(done[idx] for idx in env.get_agent_handles())
        completion = tasks_finished / max(1, env.get_num_agents())
        completions.append(completion)

        nb_steps.append(final_step)

        inference_times.append(inference_timer.get())
        preproc_times.append(preproc_timer.get())
        agent_times.append(agent_timer.get())
        step_times.append(step_timer.get())

        # Fix: final_step is 0 when the episode ends on its first step;
        # clamp the divisors so the summary line cannot raise.
        steps_for_stats = max(1, final_step)

        skipped_text = ""
        if skipped > 0:
            skipped_text = "\t⚡ Skipped {}".format(skipped)

        hit_text = ""
        if nb_hit > 0:
            hit_text = "\t⚡ Hit {} ({:.1f}%)".format(nb_hit, (100 * nb_hit) / max(1, n_agents * final_step))

        print(
            "☑️ Score: {:.3f} \tDone: {:.1f}% \tNb steps: {:.3f} "
            "\t🍭 Seed: {}"
            "\t🚉 Env: {:.3f}s "
            "\t🤖 Agent: {:.3f}s (per step: {:.3f}s) \t[preproc: {:.3f}s \tinfer: {:.3f}s]"
            "{}{}".format(
                normalized_score,
                completion * 100.0,
                final_step,
                seed,
                step_timer.get(),
                agent_timer.get(),
                agent_timer.get() / steps_for_stats,
                preproc_timer.get(),
                inference_timer.get(),
                skipped_text,
                hit_text
            )
        )

    return scores, completions, nb_steps, agent_times, step_times
def main(argv):
    """Evaluate a stored navigation agent on sparse Flatland maps.

    Builds a ``RailEnv`` from the hard-coded test-set parameters below, loads
    the checkpoint stored under ``NetsTest``, then runs ``n_trials`` episodes
    with ``eps=0.`` while rendering every step. Metrics and action
    distributions are written to CSV files in ``NetsTest`` every 50 episodes.

    Args:
        argv: Command-line arguments without the program name. Accepts
            ``-n <n_trials>`` / ``--n_trials=<n_trials>``.

    Raises:
        ValueError: If ``test_multi_agent_setup`` is not a supported setup.
    """
    # Default number of test episodes; may be overridden on the command line.
    n_trials = 15000
    try:
        opts, _ = getopt.getopt(argv, "n:", ["n_trials="])
    except getopt.GetoptError:
        print('test_navigation_single_agent.py -n <n_trials>')
        sys.exit(2)
    for opt, arg in opts:
        if opt in ('-n', '--n_trials'):
            n_trials = int(arg)

    random.seed(1)
    np.random.seed(1)

    ######## TEST SET SELECTION - PARAMETERS ########
    test_multi_agent_setup = 1  # 1 for Medium size test, 2 for Big size test
    test_n_agents = 5  # Number of agents to test (3 - 5 - 7 for Medium, 5 - 7 - 10 for Big)
    test_malfunctions_enabled = True  # Malfunctions enabled?
    test_agents_one_speed = True  # Test agents with the same speed (1) or with 4 different speeds?
    #################################################

    if test_multi_agent_setup == 1:
        # Medium size
        x_dim = 16 * 3
        y_dim = 9 * 3
        max_num_cities = 5
        max_rails_between_cities = 2
        max_rails_in_city = 3
    elif test_multi_agent_setup == 2:
        # Big size
        x_dim = 16 * 4
        y_dim = 9 * 4
        max_num_cities = 9
        max_rails_between_cities = 5
        max_rails_in_city = 5
    else:
        # Fail early instead of hitting a NameError on x_dim further down.
        raise ValueError(
            'Unsupported test_multi_agent_setup: {}'.format(test_multi_agent_setup))

    stochastic_data = {'malfunction_rate': 80,  # Rate of malfunction occurence of single agent
                       'min_duration': 15,  # Minimal duration of malfunction
                       'max_duration': 50  # Max duration of malfunction
                       }

    # Custom observation builder
    tree_depth = 2
    TreeObservation = TreeObsForRailEnv(max_depth=tree_depth,
                                        predictor=ShortestPathPredictorForRailEnv(20))

    # Persist the configuration of this test run next to the results.
    np.savetxt(fname=path.join('NetsTest', 'info.txt'),
               X=[x_dim, y_dim, test_n_agents, max_num_cities,
                  max_rails_between_cities, max_rails_in_city, tree_depth],
               delimiter=';')

    # Different agent types (trains) with different speeds.
    if test_agents_one_speed:
        speed_ration_map = {1.: 1.,  # Fast passenger train
                            1. / 2.: 0.0,  # Fast freight train
                            1. / 3.: 0.0,  # Slow commuter train
                            1. / 4.: 0.0}  # Slow freight train
    else:
        speed_ration_map = {1.: 0.25,  # Fast passenger train
                            1. / 2.: 0.25,  # Fast freight train
                            1. / 3.: 0.25,  # Slow commuter train
                            1. / 4.: 0.25}  # Slow freight train

    # Both configurations share every argument except the malfunction
    # generator, so build the keyword set once and extend it when needed.
    env_kwargs = dict(
        width=x_dim,
        height=y_dim,
        rail_generator=sparse_rail_generator(
            max_num_cities=max_num_cities,  # Number of cities in map (where train stations are)
            seed=14,  # Random seed
            grid_mode=False,
            max_rails_between_cities=max_rails_between_cities,
            max_rails_in_city=max_rails_in_city),
        schedule_generator=sparse_schedule_generator(speed_ration_map),
        number_of_agents=test_n_agents,
        obs_builder_object=TreeObservation)
    if test_malfunctions_enabled:
        env_kwargs['malfunction_generator_and_process_data'] = \
            malfunction_from_params(stochastic_data)
    env = RailEnv(**env_kwargs)
    env.reset()

    env_renderer = RenderTool(env, gl="PILSVG",
                              agent_render_variant=AgentRenderVariant.AGENT_SHOWS_OPTIONS_AND_BOX,
                              show_debug=False,
                              screen_height=(1080 * 0.8),  # Adjust these parameters to fit your resolution
                              screen_width=(1920 * 0.8))

    # Size of the flattened tree observation: features per node times the
    # number of nodes of a 4-ary tree of depth `tree_depth`.
    num_features_per_node = env.obs_builder.observation_dim
    nr_nodes = 0
    for i in range(tree_depth + 1):
        nr_nodes += np.power(4, i)
    state_size = num_features_per_node * nr_nodes
    action_size = 5

    # Scale the step budget by the inverse of the mean agent speed so slower
    # fleets get proportionally more steps.
    speed_weighted_mean = 0
    for speed, share in speed_ration_map.items():
        speed_weighted_mean += speed * share
    max_steps = int((1 / speed_weighted_mean) * 3 * (env.height + env.width))

    # Variables that keep track of the performance
    action_dict = dict()
    action_prob_list = []
    scores_window = deque(maxlen=100)
    done_window = deque(maxlen=100)
    scores = []
    scores_list = []
    deadlock_list = []
    dones_list_window = []
    dones_list = []
    action_prob = [0] * action_size
    agent_obs = [None] * env.get_num_agents()
    agent = Agent(state_size, action_size)

    # LOAD MODEL WEIGHTS TO TEST
    agent.qnetwork_local.load_state_dict(
        torch.load(path.join('NetsTest', 'navigator_checkpoint3800_multi10_deadlock_global10.pth')))

    for trials in range(1, n_trials + 1):
        # Reset environment
        obs, info = env.reset()
        env_renderer.reset()

        # Build agent specific observations
        for a in range(env.get_num_agents()):
            agent_obs[a] = normalize_observation(obs[a], tree_depth, observation_radius=10)

        score = 0

        # Run episode
        for step in range(max_steps):
            # Action: query the network only for agents that can act now
            for a in range(env.get_num_agents()):
                if info['action_required'][a]:
                    action = agent.act(agent_obs[a], eps=0.)
                    action_prob[action] += 1
                else:
                    action = 0
                action_dict.update({a: action})

            # Environment step (this env variant also reports deadlocks)
            obs, all_rewards, done, deadlocks, info = env.step(action_dict)
            env_renderer.render_env(show=True, show_predictions=True, show_observations=False)

            # Build agent specific observations and normalize
            for a in range(env.get_num_agents()):
                if obs[a]:
                    agent_obs[a] = normalize_observation(obs[a], tree_depth, observation_radius=10)
                score += all_rewards[a] / env.get_num_agents()

            if done['__all__']:
                break

        # Collect information about this episode
        n_agents_safe = max(1, env.get_num_agents())
        tasks_finished = sum(1 for idx in range(env.get_num_agents()) if done[idx] == 1)
        done_ratio = tasks_finished / n_agents_safe
        deadlock_ratio = deadlocks.count(1) / n_agents_safe
        episode_score = score / max_steps

        done_window.append(done_ratio)
        scores_window.append(episode_score)  # save most recent score
        scores.append(np.mean(scores_window))
        dones_list.append(done_ratio)
        dones_list_window.append(np.mean(done_window))
        scores_list.append(episode_score)
        deadlock_list.append(deadlock_ratio)

        total_actions = np.sum(action_prob)
        if total_actions == 0:
            action_prob_normalized = [0] * action_size
        else:
            action_prob_normalized = np.asarray(action_prob) / total_actions

        print(
            '\rTesting {} Agents on ({},{}).\t Episode {}\t Score: {:.3f}\tDones: {:.2f}%\tDeadlocks: {:.2f}\t Action Probabilities: \t {}'.format(
                env.get_num_agents(), x_dim, y_dim,
                trials,
                episode_score,
                100 * done_ratio,
                deadlock_ratio,
                action_prob_normalized), end=" ")

        action_prob_list.append(action_prob_normalized)
        action_prob = [0] * action_size

        # Periodically dump the metrics gathered so far
        if trials % 50 == 0:
            np.savetxt(fname=path.join('NetsTest', 'test_metrics.csv'),
                       X=np.transpose(np.asarray(
                           [scores_list, scores, dones_list, dones_list_window, deadlock_list])),
                       delimiter=';', newline='\n')
            np.savetxt(fname=path.join('NetsTest', 'test_action_prob.csv'),
                       X=np.asarray(action_prob_list), delimiter=';', newline='\n')
speed_ration_map = {1.: 1., # Fast passenger train 1. / 2.: 0.0, # Fast freight train 1. / 3.: 0.0, # Slow commuter train 1. / 4.: 0.0} # Slow freight train env = RailEnv(width=x_dim, height=y_dim, rail_generator=sparse_rail_generator(max_num_cities=3, # Number of cities in map (where train stations are) seed=1, # Random seed grid_mode=False, max_rails_between_cities=2, max_rails_in_city=4), schedule_generator=sparse_schedule_generator(speed_ration_map), number_of_agents=n_agents, malfunction_generator_and_process_data=malfunction_from_params(stochastic_data), obs_builder_object=TreeObservation) env.reset() env_renderer = RenderTool(env, gl="PILSVG", ) num_features_per_node = env.obs_builder.observation_dim tree_depth = 2 nr_nodes = 0 for i in range(tree_depth + 1): nr_nodes += np.power(4, i) state_size = num_features_per_node * nr_nodes action_size = 5 # We set the number of episodes we would like to train on if 'n_trials' not in locals():
def test(args, T, ep, dqn, val_mem, metrics, results_dir, evaluate=False):
    """Run evaluation episodes with ``dqn`` and report aggregate results.

    A fresh ``RailEnv`` is built from ``args`` (the rail generator is seeded
    with ``ep``), ``args.evaluation_episodes`` episodes are played, and the
    Q-values of every state in ``val_mem`` are evaluated. Unless ``evaluate``
    is true, the model is saved when the done-ratio improves and the metrics
    are stored and plotted in ``results_dir``.

    Args:
        args: Namespace holding environment, rendering and debug options.
        T: Unused here (the corresponding metrics line is commented out).
        ep: Episode index; appended to ``metrics['episodes']`` and used as
            rail generator seed.
        dqn: Agent exposing ``act``, ``get_q_values``, ``evaluate_q``, ``save``.
        val_mem: Iterable of validation states.
        metrics: Dict of metric lists, updated in place.
        results_dir: Directory for the saved model, metrics and plots.
        evaluate: When true, skip saving and plotting.

    Returns:
        Tuple ``(avg_done_agents, avg_reward, avg_norm_reward)``.
    """
    # Maps speeds to % of appearance in the env
    speed_ration_map = {
        1.: 0.25,  # Fast passenger train
        1. / 2.: 0.25,  # Fast freight train
        1. / 3.: 0.25,  # Slow commuter train
        1. / 4.: 0.25  # Slow freight train
    }
    schedule_generator = sparse_schedule_generator(speed_ration_map)
    predictor = ShortestPathPredictorForRailEnv(max_depth=args.prediction_depth)
    observation_builder = GraphObsForRailEnv(predictor=predictor)

    # Use episode as seed when evaluation is performed during training
    rail_generator = sparse_rail_generator(
        max_num_cities=args.max_num_cities,
        seed=ep,
        grid_mode=args.grid_mode,
        max_rails_between_cities=args.max_rails_between_cities,
        max_rails_in_city=args.max_rails_in_city,
    )
    malfunction_generator = malfunction_from_params(
        parameters={
            'malfunction_rate': args.malfunction_rate,
            'min_duration': args.min_duration,
            'max_duration': args.max_duration
        })
    env = RailEnv(
        width=args.width,
        height=args.height,
        rail_generator=rail_generator,
        schedule_generator=schedule_generator,
        number_of_agents=args.num_agents,
        obs_builder_object=observation_builder,
        malfunction_generator_and_process_data=malfunction_generator,
    )

    if args.render:
        env_renderer = RenderTool(
            env,
            gl="PILSVG",
            agent_render_variant=AgentRenderVariant.AGENT_SHOWS_OPTIONS_AND_BOX,
            show_debug=True,
            screen_height=1080,
            screen_width=1920)

    max_time_steps = 200  # TODO Debug
    metrics['episodes'].append(ep)

    T_rewards = []  # Cumulative reward of each episode
    T_num_done_agents = []  # Share of done agents for each episode
    T_all_done = []  # Whether all agents completed in each episode
    network_action_dict = {}
    railenv_action_dict = {}
    qvalues = {}

    # Test performance over several episodes
    for _episode in range(args.evaluation_episodes):
        state, info = env.reset()
        handles = range(env.get_num_agents())
        reward_sum = 0  # Sum of all agents' rewards over the episode
        all_done = False

        if args.render:
            env_renderer.reset()

        # First action only decides whether each agent enters the environment
        for handle in handles:
            railenv_action_dict[handle] = np.random.choice((0, 2))
        state, reward, done, info = env.step(railenv_action_dict)  # Env step
        reward_sum += sum(reward[handle] for handle in handles)

        if args.render:
            env_renderer.render_env(show=True,
                                    show_observations=False,
                                    show_predictions=True)

        for step in range(max_time_steps - 1):
            # Choose actions
            for handle in handles:
                if not info['action_required'][handle]:
                    network_action = 0
                    railenv_action = 0
                    qvalues[handle] = [0, 0]  # '0' if wasn't updated
                else:
                    # Choose an action greedily (with noisy weights)
                    network_action = dqn.act(state[handle])
                    railenv_action = observation_builder.choose_railenv_action(
                        handle, network_action)
                    qvalues[handle] = dqn.get_q_values(state[handle])
                railenv_action_dict[handle] = railenv_action
                network_action_dict[handle] = network_action

            if args.debug:
                depth = args.prediction_depth
                for handle in handles:
                    agent_state = state[handle]
                    print('#########################################')
                    print('Info for agent {}'.format(handle))
                    print('Occupancy, first layer: {}'.format(
                        agent_state[:depth]))
                    print('Occupancy, second layer: {}'.format(
                        agent_state[depth:depth * 2]))
                    print('Forks: {}'.format(
                        agent_state[depth * 2:depth * 3]))
                    print('Target: {}'.format(
                        agent_state[depth * 3:depth * 4]))
                    print('Priority: {}'.format(
                        agent_state[depth * 4]))
                    print('Max priority encountered: {}'.format(
                        agent_state[depth * 4 + 1]))
                    print('Num malfunctoning agents (globally): {}'.format(
                        agent_state[depth * 4 + 2]))
                    print('Num agents ready to depart (globally): {}'.format(
                        agent_state[depth * 4 + 3]))
                    print('Status: {}'.format(info['status'][handle]))
                    print('Position: {}'.format(env.agents[handle].position))
                    print('Moving? {} at speed: {}'.format(
                        env.agents[handle].moving, info['speed'][handle]))
                    print('Action required? {}'.format(
                        info['action_required'][handle]))
                    print('Network action: {}'.format(network_action_dict[handle]))
                    print('Railenv action: {}'.format(railenv_action_dict[handle]))
                    print('Q values: {}'.format(qvalues[handle]))
                    print('Rewards: {}'.format(reward[handle]))
                # Breakpoint for debugging here

            state, reward, done, info = env.step(railenv_action_dict)  # Env step

            if args.render:
                env_renderer.render_env(show=True,
                                        show_observations=False,
                                        show_predictions=True)

            reward_sum += sum(reward[handle] for handle in handles)

            if done['__all__']:
                all_done = True
                break

        # No need to close the renderer since env parameter sizes stay the same
        T_rewards.append(reward_sum)
        # Share of agents that reached their target in this episode
        num_done_agents = sum(1 for handle in handles if done[handle])
        T_num_done_agents.append(num_done_agents / env.get_num_agents())
        T_all_done.append(all_done)

    # Test Q-values over validation memory
    T_Qs = [dqn.evaluate_q(valid_state) for valid_state in val_mem]
    if args.debug:
        print('T_Qs: {}'.format(T_Qs))  # These are Qs from a single agent TODO

    # Average share of agents that reached their target
    avg_done_agents = sum(T_num_done_agents) / len(T_num_done_agents)
    avg_reward = sum(T_rewards) / len(T_rewards)
    avg_norm_reward = avg_reward / (max_time_steps / env.get_num_agents())

    if evaluate:
        return avg_done_agents, avg_reward, avg_norm_reward

    # Save model parameters if improved
    if avg_done_agents > metrics['best_avg_done_agents']:
        metrics['best_avg_done_agents'] = avg_done_agents
        dqn.save(results_dir)

    # Append to results and save metrics
    metrics['rewards'].append(T_rewards)
    metrics['Qs'].append(T_Qs)
    torch.save(metrics, os.path.join(results_dir, 'metrics.pth'))

    # Plot HTML
    _plot_line(metrics['episodes'],
               metrics['rewards'],
               'Reward',
               path=results_dir)  # Plot rewards in episodes
    _plot_line(metrics['episodes'], metrics['Qs'], 'Q', path=results_dir)

    # Average number of done agents (in proportion) and average reward
    return avg_done_agents, avg_reward, avg_norm_reward
def train_agent(env_params, train_params):
    """Train a ``DDDQNPolicy`` on randomly generated sparse Flatland maps.

    Runs ``train_params.n_episodes + 1`` episodes with epsilon-greedy action
    selection, feeds transitions to the policy, and logs to TensorBoard. Every
    ``train_params.checkpoint_interval`` episodes the local Q-network is saved
    under ``./checkpoints`` and the policy is evaluated with ``eval_policy``.

    Args:
        env_params: Object with environment and observation settings
            (``n_agents``, ``x_dim``, ``y_dim``, ``n_cities``,
            ``max_rails_between_cities``, ``max_rails_in_city``, ``seed``,
            ``observation_tree_depth``, ``observation_radius``,
            ``observation_max_path_depth``).
        train_params: Object with training settings (``eps_start``,
            ``eps_end``, ``eps_decay``, ``n_episodes``,
            ``checkpoint_interval``, ``n_evaluation_episodes``, ``render``);
            it is also handed to ``DDDQNPolicy``.
    """
    # Environment parameters
    n_agents = env_params.n_agents
    x_dim = env_params.x_dim
    y_dim = env_params.y_dim
    n_cities = env_params.n_cities
    max_rails_between_cities = env_params.max_rails_between_cities
    max_rails_in_city = env_params.max_rails_in_city
    seed = env_params.seed

    # Observation parameters
    observation_tree_depth = env_params.observation_tree_depth
    observation_radius = env_params.observation_radius
    observation_max_path_depth = env_params.observation_max_path_depth

    # Training parameters
    eps_start = train_params.eps_start
    eps_end = train_params.eps_end
    eps_decay = train_params.eps_decay
    n_episodes = train_params.n_episodes
    checkpoint_interval = train_params.checkpoint_interval
    n_eval_episodes = train_params.n_evaluation_episodes

    # Set the seeds
    random.seed(seed)
    np.random.seed(seed)

    # Break agents from time to time
    malfunction_parameters = MalfunctionParameters(
        malfunction_rate=1. / 10000,  # Rate of malfunctions
        min_duration=15,  # Minimal duration
        max_duration=50  # Max duration
    )

    # Observation builder
    predictor = ShortestPathPredictorForRailEnv(observation_max_path_depth)
    tree_observation = TreeObsForRailEnv(max_depth=observation_tree_depth,
                                         predictor=predictor)

    # Fraction of train which each speed
    speed_profiles = {
        1.: 1.0,  # Fast passenger train
        1. / 2.: 0.0,  # Fast freight train
        1. / 3.: 0.0,  # Slow commuter train
        1. / 4.: 0.0  # Slow freight train
    }

    # Setup the environment
    env = RailEnv(
        width=x_dim,
        height=y_dim,
        rail_generator=sparse_rail_generator(
            max_num_cities=n_cities,
            grid_mode=False,
            max_rails_between_cities=max_rails_between_cities,
            max_rails_in_city=max_rails_in_city),
        schedule_generator=sparse_schedule_generator(speed_profiles),
        number_of_agents=n_agents,
        malfunction_generator_and_process_data=malfunction_from_params(
            malfunction_parameters),
        obs_builder_object=tree_observation,
        random_seed=seed)
    env.reset(regenerate_schedule=True, regenerate_rail=True)

    # Setup renderer
    if train_params.render:
        env_renderer = RenderTool(env, gl="PGL")

    # Calculate the state size given the depth of the tree observation and the number of features
    n_features_per_node = env.obs_builder.observation_dim
    n_nodes = 0
    for i in range(observation_tree_depth + 1):
        n_nodes += np.power(4, i)
    state_size = n_features_per_node * n_nodes

    # The action space of flatland is 5 discrete actions
    action_size = 5

    # Max number of steps per episode
    # This is the official formula used during evaluations
    # See details in flatland.envs.schedule_generators.sparse_schedule_generator
    max_steps = int(4 * 2 * (env.height + env.width + (n_agents / n_cities)))

    action_count = [0] * action_size
    action_dict = dict()
    agent_obs = [None] * env.get_num_agents()
    agent_prev_obs = [None] * env.get_num_agents()
    agent_prev_action = [2] * env.get_num_agents()
    # One flag per agent: a single shared flag would only remember whether the
    # *last* agent of the action loop had to act.
    update_values = [False] * env.get_num_agents()
    smoothed_normalized_score = -1.0
    smoothed_eval_normalized_score = -1.0
    smoothed_completion = 0.0
    smoothed_eval_completion = 0.0

    # Double Dueling DQN policy
    policy = DDDQNPolicy(state_size, action_size, train_params)

    # TensorBoard writer
    writer = SummaryWriter()
    writer.add_hparams(vars(train_params), {})
    writer.add_hparams(vars(env_params), {})

    training_timer = Timer()
    training_timer.start()

    print(
        "\n🚉 Training {} trains on {}x{} grid for {} episodes, evaluating on {} episodes every {} episodes.\n"
        .format(env.get_num_agents(), x_dim, y_dim, n_episodes,
                n_eval_episodes, checkpoint_interval))

    for episode_idx in range(n_episodes + 1):
        # Timers
        step_timer = Timer()
        reset_timer = Timer()
        learn_timer = Timer()
        preproc_timer = Timer()

        # Reset environment
        reset_timer.start()
        obs, info = env.reset(regenerate_rail=True, regenerate_schedule=True)
        reset_timer.end()

        if train_params.render:
            env_renderer.set_new_rail()

        score = 0
        nb_steps = 0
        actions_taken = []

        # Build agent specific observations
        for agent in env.get_agent_handles():
            if obs[agent]:
                agent_obs[agent] = normalize_observation(
                    obs[agent],
                    observation_tree_depth,
                    observation_radius=observation_radius)
                agent_prev_obs[agent] = agent_obs[agent].copy()

        # Run episode
        for step in range(max_steps - 1):
            for agent in env.get_agent_handles():
                if info['action_required'][agent]:
                    # If an action is required, we want to store the obs at that step as well as the action
                    update_values[agent] = True
                    action = policy.act(agent_obs[agent], eps=eps_start)
                    action_count[action] += 1
                    actions_taken.append(action)
                else:
                    update_values[agent] = False
                    action = 0
                action_dict.update({agent: action})

            # Environment step
            step_timer.start()
            next_obs, all_rewards, done, info = env.step(action_dict)
            step_timer.end()

            if train_params.render and episode_idx % checkpoint_interval == 0:
                env_renderer.render_env(show=True,
                                        frames=False,
                                        show_observations=False,
                                        show_predictions=False)

            for agent in range(env.get_num_agents()):
                # Update replay buffer and train agent
                # Only update the values when we are done or when an action was taken and thus relevant information is present
                if update_values[agent] or done[agent]:
                    learn_timer.start()
                    policy.step(agent_prev_obs[agent],
                                agent_prev_action[agent],
                                all_rewards[agent],
                                agent_obs[agent],
                                done[agent])
                    learn_timer.end()

                    agent_prev_obs[agent] = agent_obs[agent].copy()
                    agent_prev_action[agent] = action_dict[agent]

                # Preprocess the new observations
                if next_obs[agent]:
                    preproc_timer.start()
                    agent_obs[agent] = normalize_observation(
                        next_obs[agent],
                        observation_tree_depth,
                        observation_radius=observation_radius)
                    preproc_timer.end()

                score += all_rewards[agent]

            nb_steps = step

            if done['__all__']:
                break

        # Epsilon decay
        eps_start = max(eps_end, eps_decay * eps_start)

        # Collection information about training
        tasks_finished = sum(done[idx] for idx in env.get_agent_handles())
        completion = tasks_finished / max(1, env.get_num_agents())
        normalized_score = score / (max_steps * env.get_num_agents())
        action_probs = action_count / np.sum(action_count)
        # Restart from 1 so the next normalisation never divides by zero
        action_count = [1] * action_size

        # Smoothed values for terminal display and for more stable hyper-parameter tuning
        smoothing = 0.99
        smoothed_normalized_score = smoothed_normalized_score * smoothing + normalized_score * (
            1.0 - smoothing)
        smoothed_completion = smoothed_completion * smoothing + completion * (
            1.0 - smoothing)

        # Print logs
        if episode_idx % checkpoint_interval == 0:
            torch.save(
                policy.qnetwork_local,
                './checkpoints/origin_multi-' + str(episode_idx) + '.pth')
            if train_params.render:
                env_renderer.close_window()

        print('\r🚂 Episode {}'
              '\t 🏆 Score: {:.3f}'
              ' Avg: {:.3f}'
              '\t 💯 Done: {:.2f}%'
              ' Avg: {:.2f}%'
              '\t 🎲 Epsilon: {:.2f} '
              '\t 🔀 Action Probs: {}'.format(episode_idx,
                                              normalized_score,
                                              smoothed_normalized_score,
                                              100 * completion,
                                              100 * smoothed_completion,
                                              eps_start,
                                              format_action_prob(action_probs)),
              end=" ")

        # Evaluate policy
        if episode_idx % train_params.checkpoint_interval == 0:
            scores, completions, nb_steps_eval = eval_policy(
                env, policy, n_eval_episodes, max_steps)

            writer.add_scalar("evaluation/scores_min", np.min(scores), episode_idx)
            writer.add_scalar("evaluation/scores_max", np.max(scores), episode_idx)
            writer.add_scalar("evaluation/scores_mean", np.mean(scores), episode_idx)
            writer.add_scalar("evaluation/scores_std", np.std(scores), episode_idx)
            writer.add_histogram("evaluation/scores", np.array(scores), episode_idx)
            writer.add_scalar("evaluation/completions_min", np.min(completions), episode_idx)
            writer.add_scalar("evaluation/completions_max", np.max(completions), episode_idx)
            writer.add_scalar("evaluation/completions_mean", np.mean(completions), episode_idx)
            writer.add_scalar("evaluation/completions_std", np.std(completions), episode_idx)
            writer.add_histogram("evaluation/completions", np.array(completions), episode_idx)
            writer.add_scalar("evaluation/nb_steps_min", np.min(nb_steps_eval), episode_idx)
            writer.add_scalar("evaluation/nb_steps_max", np.max(nb_steps_eval), episode_idx)
            writer.add_scalar("evaluation/nb_steps_mean", np.mean(nb_steps_eval), episode_idx)
            writer.add_scalar("evaluation/nb_steps_std", np.std(nb_steps_eval), episode_idx)
            writer.add_histogram("evaluation/nb_steps", np.array(nb_steps_eval), episode_idx)

            smoothing = 0.9
            smoothed_eval_normalized_score = smoothed_eval_normalized_score * smoothing + np.mean(
                scores) * (1.0 - smoothing)
            smoothed_eval_completion = smoothed_eval_completion * smoothing + np.mean(
                completions) * (1.0 - smoothing)
            writer.add_scalar("evaluation/smoothed_score",
                              smoothed_eval_normalized_score, episode_idx)
            writer.add_scalar("evaluation/smoothed_completion",
                              smoothed_eval_completion, episode_idx)

        # Save logs to tensorboard
        writer.add_scalar("training/score", normalized_score, episode_idx)
        writer.add_scalar("training/smoothed_score", smoothed_normalized_score, episode_idx)
        writer.add_scalar("training/completion", np.mean(completion), episode_idx)
        writer.add_scalar("training/smoothed_completion", np.mean(smoothed_completion), episode_idx)
        writer.add_scalar("training/nb_steps", nb_steps, episode_idx)
        writer.add_histogram("actions/distribution", np.array(actions_taken), episode_idx)
        writer.add_scalar("actions/nothing", action_probs[RailEnvActions.DO_NOTHING], episode_idx)
        writer.add_scalar("actions/left", action_probs[RailEnvActions.MOVE_LEFT], episode_idx)
        writer.add_scalar("actions/forward", action_probs[RailEnvActions.MOVE_FORWARD], episode_idx)
        writer.add_scalar("actions/right", action_probs[RailEnvActions.MOVE_RIGHT], episode_idx)
        writer.add_scalar("actions/stop", action_probs[RailEnvActions.STOP_MOVING], episode_idx)
        writer.add_scalar("training/epsilon", eps_start, episode_idx)
        writer.add_scalar("training/buffer_size", len(policy.memory), episode_idx)
        writer.add_scalar("training/loss", policy.loss, episode_idx)
        writer.add_scalar("timer/reset", reset_timer.get(), episode_idx)
        writer.add_scalar("timer/step", step_timer.get(), episode_idx)
        writer.add_scalar("timer/learn", learn_timer.get(), episode_idx)
        writer.add_scalar("timer/preproc", preproc_timer.get(), episode_idx)
        writer.add_scalar("timer/total", training_timer.get_current(), episode_idx)
def test_initial_malfunction_do_nothing():
    """Replay a single agent that starts malfunctioning and is told DO_NOTHING.

    The replay sets a malfunction of 3 on the first step and then checks, step
    by step, the expected position, remaining malfunction, reward and status
    while the agent issues DO_NOTHING and finally MOVE_FORWARD.
    """
    malfunction_params = MalfunctionParameters(
        malfunction_rate=70,  # Rate of malfunction occurence
        min_duration=2,  # Minimal duration of malfunction
        max_duration=5  # Max duration of malfunction
    )

    rail, _ = make_simple_rail2()

    env = RailEnv(
        width=25,
        height=30,
        rail_generator=rail_from_grid_transition_map(rail),
        schedule_generator=random_schedule_generator(),
        number_of_agents=1,
        # Malfunction data generator
        malfunction_generator_and_process_data=malfunction_from_params(malfunction_params),
    )
    env.reset()
    set_penalties_for_replay(env)

    east = Grid4TransitionsEnum.EAST
    start_cell = (3, 2)

    expected_steps = [
        Replay(
            position=None,
            direction=east,
            action=RailEnvActions.MOVE_FORWARD,
            set_malfunction=3,
            malfunction=3,
            reward=env.step_penalty,  # full step penalty while malfunctioning
            status=RailAgentStatus.READY_TO_DEPART
        ),
        Replay(
            position=start_cell,
            direction=east,
            action=RailEnvActions.DO_NOTHING,
            malfunction=2,
            reward=env.step_penalty,  # full step penalty while malfunctioning
            status=RailAgentStatus.ACTIVE
        ),
        # malfunction stops in the next step and we're still at the beginning of the cell
        # --> if we take action DO_NOTHING, agent should restart without moving
        Replay(
            position=start_cell,
            direction=east,
            action=RailEnvActions.DO_NOTHING,
            malfunction=1,
            reward=env.step_penalty,  # full step penalty while stopped
            status=RailAgentStatus.ACTIVE
        ),
        # we haven't started moving yet --> stay here
        Replay(
            position=start_cell,
            direction=east,
            action=RailEnvActions.DO_NOTHING,
            malfunction=0,
            reward=env.step_penalty,  # full step penalty while stopped
            status=RailAgentStatus.ACTIVE
        ),
        Replay(
            position=start_cell,
            direction=east,
            action=RailEnvActions.MOVE_FORWARD,
            malfunction=0,
            # start penalty + step penalty for speed 1.0
            reward=env.start_penalty + env.step_penalty * 1.0,
            status=RailAgentStatus.ACTIVE
        ),
        # we start to move forward --> should go to next cell now
        Replay(
            position=(3, 3),
            direction=east,
            action=RailEnvActions.MOVE_FORWARD,
            malfunction=0,
            reward=env.step_penalty * 1.0,  # step penalty for speed 1.0
            status=RailAgentStatus.ACTIVE
        ),
    ]

    first_agent = env.agents[0]
    replay_config = ReplayConfig(
        replay=expected_steps,
        speed=first_agent.speed_data['speed'],
        target=first_agent.target,
        initial_position=start_cell,
        initial_direction=east,
    )
    run_replay_config(env, [replay_config], activate_agents=False)
def train(env):
    """Train an ``AttentionSAC`` model on randomly generated Flatland maps.

    Builds a ``RailEnv`` from the given settings, then runs ``num_episodes``
    episodes of ``max_steps`` steps each. Every transition is pushed to a
    replay buffer; every ``num_steps`` steps the critic, policies and targets
    are updated from a sampled batch. The mean episode reward is logged to
    wandb every ``avg_steps`` episodes and the model is saved to
    ``<cwd>/model.pt`` every ``steps_to_save_model`` episodes.

    Args:
        env: Mapping with the keys ``"n_agents"``, ``"x_dim"``, ``"y_dim"``,
            ``"n_cities"``, ``"max_rails_between_cities"`` and
            ``"max_rails_in_city"``. The name is rebound to the created
            ``RailEnv`` once these values have been read.
    """
    n_agents = env["n_agents"]
    x_dim = env["x_dim"]
    y_dim = env["y_dim"]
    n_cities = env["n_cities"]
    max_rails_between_cities = env["max_rails_between_cities"]
    max_rails_in_city = env["max_rails_in_city"]
    seed = 0
    use_fast_tree_obs = False

    # Observation parameters
    observation_tree_depth = 4
    observation_radius = 10
    observation_max_path_depth = 30

    # Set the seeds
    random.seed(seed)
    np.random.seed(seed)

    # Break agents from time to time
    malfunction_parameters = MalfunctionParameters(
        malfunction_rate=1. / 10000,  # Rate of malfunctions
        min_duration=15,  # Minimal duration
        max_duration=50  # Max duration
    )

    # Observation builder
    predictor = ShortestPathPredictorForRailEnv(observation_max_path_depth)
    if use_fast_tree_obs:
        tree_observation = FastTreeObs(max_depth=observation_tree_depth)
        print("Using FastTreeObs")
    else:
        tree_observation = TreeObsForRailEnv(max_depth=observation_tree_depth,
                                             predictor=predictor)
        print("Using StandardTreeObs")

    speed_profiles = {
        1.: 1.0,  # Fast passenger train
        1. / 2.: 0.0,  # Fast freight train
        1. / 3.: 0.0,  # Slow commuter train
        1. / 4.: 0.0  # Slow freight train
    }

    env = RailEnv(
        width=x_dim,
        height=y_dim,
        rail_generator=sparse_rail_generator(
            max_num_cities=n_cities,
            grid_mode=False,
            max_rails_between_cities=max_rails_between_cities,
            max_rails_in_city=max_rails_in_city),
        schedule_generator=sparse_schedule_generator(speed_profiles),
        number_of_agents=n_agents,
        malfunction_generator_and_process_data=malfunction_from_params(
            malfunction_parameters),
        obs_builder_object=tree_observation,
        random_seed=seed)

    obs, info = env.reset()

    if use_fast_tree_obs:
        state_size = tree_observation.observation_dim
    else:
        # Calculate the state size given the depth of the tree observation and the
        # number of features
        n_features_per_node = env.obs_builder.observation_dim
        n_nodes = 0
        for i in range(observation_tree_depth + 1):
            n_nodes += np.power(4, i)
        state_size = n_features_per_node * n_nodes

    action_size = 5

    DEVICE = 'cpu'
    # if torch.cuda.is_available():
    #     DEVICE = 'gpu'

    buffer_length = 10000
    steps_to_save_model = 10
    step_size = 100
    num_steps = 100  # update every 100 steps
    avg_steps = 20  # num steps to average and plot rewards
    reward_q = []
    batch_size = 100

    agent_obs = np.array([None] * env.get_num_agents())

    max_steps = int(4 * 2 * (env.height + env.width + (n_agents / n_cities)))
    num_episodes = 100000

    agent_init_params = []
    sa_size = []

    for i in range(n_agents):
        agent_init_params.append({
            'num_in_pol': state_size,
            'num_out_pol': action_size,
            'init_weights': 'model.pt'
        })
        sa_size.append((state_size, action_size))

    hyperparams = {
        "tau": 0.01,
        "pi_lr": 0.00001,
        "q_lr": 0.00005,
        "pol_hidden_dim": 256,
        "critic_hidden_dim": 256,
        "attend_heads": 8
    }

    model = AttentionSAC(agent_init_params=agent_init_params,
                         sa_size=sa_size,
                         tau=hyperparams["tau"],
                         pi_lr=hyperparams["pi_lr"],
                         q_lr=hyperparams["q_lr"],
                         pol_hidden_dim=hyperparams["pol_hidden_dim"],
                         critic_hidden_dim=hyperparams["critic_hidden_dim"],
                         attend_heads=hyperparams["attend_heads"])
    model.init_dict = {}

    replay_buffer = ReplayBuffer(buffer_length, n_agents,
                                 [state_size for i in range(n_agents)],
                                 [action_size for i in range(n_agents)])

    print("MAX STEPS: " + str(max_steps))
    print("NUM EPISODES: ", num_episodes)
    print("HYPERPARAMS: ")
    print(hyperparams)

    start_time = time.time()

    for ep in range(num_episodes):
        print("Episode " + str(ep) + ":", flush=True)
        obs, info = env.reset(True, True)
        model.prep_rollouts(device=DEVICE)
        reward_sum_for_this_episode = 0

        for steps in range(max_steps):
            if steps % step_size == 0:
                print("=", end="", flush=True)

            # Per-agent network input; agents without an observation get zeros
            for agent in env.get_agent_handles():
                if obs[agent] is not None:
                    if use_fast_tree_obs:
                        agent_obs[agent] = obs[agent]
                    else:
                        agent_obs[agent] = normalize_observation(
                            obs[agent],
                            observation_tree_depth,
                            observation_radius=observation_radius)
                else:
                    agent_obs[agent] = np.array([0.] * state_size)

            action_dict = {}
            torch_obs = [
                Variable(torch.Tensor([agent_obs[i]]), requires_grad=False)
                for i in range(n_agents)
            ]
            torch_agent_actions = model.step(torch_obs, explore=True)
            agent_actions = [ac.data.numpy() for ac in torch_agent_actions]

            # The environment action is the index of the first non-zero entry
            # of each agent's action vector (-1 if there is none).
            for i in range(n_agents):
                dist = torch_agent_actions[i][0]
                idx = -1
                for j in range(action_size):
                    if dist[j] != 0:
                        idx = j
                        break
                action_dict[i] = idx

            next_obs, all_rewards, done, info = env.step(action_dict)

            rewards = []
            dones = []

            next_agent_obs = np.array([None] * env.get_num_agents())

            for agent in env.get_agent_handles():
                if next_obs[agent] is not None:
                    if use_fast_tree_obs:
                        next_agent_obs[agent] = next_obs[agent]
                    else:
                        # Normalise the observation returned by this step, not
                        # the one the action was chosen from.
                        next_agent_obs[agent] = normalize_observation(
                            next_obs[agent],
                            observation_tree_depth,
                            observation_radius=observation_radius)
                else:
                    next_agent_obs[agent] = np.array([0.] * state_size)

            for i in range(n_agents):
                reward_sum_for_this_episode += all_rewards[i]
                rewards.append(all_rewards[i])
                # NOTE(review): the augmented value only lands in all_rewards,
                # which is not read again; the buffer stores the raw reward.
                # Confirm whether the augmentation was meant to be stored.
                all_rewards[i] += augment_reward(agent_obs[i])
                dones.append(done[i])

            replay_buffer.push(np.array([agent_obs]), np.array(agent_actions),
                               np.array([rewards]), np.array([next_agent_obs]),
                               np.array([dones]))

            if steps % num_steps == 0:
                model.prep_training(device=DEVICE)
                sample = replay_buffer.sample(batch_size, norm_rews=False)
                model.update_critic(sample)
                model.update_policies(sample)
                model.update_all_targets()
                model.prep_rollouts(device=DEVICE)

            # Advance to the new observation so the next action is chosen from
            # the current state instead of the one seen at reset.
            obs = next_obs

        reward_sum_for_this_episode /= n_agents
        reward_q.append(reward_sum_for_this_episode)

        if len(reward_q) == avg_steps:
            wandb.log({'reward': np.mean(reward_q)})
            reward_q = []

        print()

        if ep % steps_to_save_model == 0:
            print("\nSaving model")
            model.save(os.getcwd() + "/model.pt")
            cur_time = time.time()
            time_elapsed = (cur_time - start_time) // 60
            print("Time Elapsed: " + str(time_elapsed) + "\n")
def main(argv):
    """Train a double dueling DQN navigation agent on a sparse Flatland map.

    Builds the rail environment for the hard-coded setup, runs ``n_trials``
    epsilon-greedy training episodes, periodically writes network
    checkpoints and metric files below the ``Nets`` directory, and finally
    plots the running average score.

    :param argv: command line arguments without the program name; accepts
        ``-n <n_trials>`` / ``--n_trials=<n_trials>`` to set the number of
        training episodes (defaults to 15000).
    """
    # Default number of episodes, optionally overridden on the command line.
    n_trials = 15000
    try:
        opts, _ = getopt.getopt(argv, "n:", ["n_trials="])
    except getopt.GetoptError:
        print('training_navigation.py -n <n_trials>')
        sys.exit(2)
    for opt, arg in opts:
        if opt in ('-n', '--n_trials'):
            n_trials = int(arg)

    random.seed(1)
    np.random.seed(1)

    #### Choose the desired setup ####
    multi_agent_setup = 1
    malfunctions_enabled = False
    agents_one_speed = True
    ##################################

    # setup id -> (x_dim, y_dim, n_agents, max_num_cities,
    #              max_rails_between_cities, max_rails_in_city)
    setups = {
        1: (35, 35, 1, 3, 2, 3),  # Single agent
        3: (40, 40, 3, 4, 2, 3),  # Multi agent (3)
        5: (16 * 3, 9 * 3, 5, 5, 2, 3),  # Multi agent (5)
        10: (16 * 4, 9 * 4, 10, 9, 5, 5),  # Multi agent (10)
    }
    (x_dim, y_dim, n_agents, max_num_cities, max_rails_between_cities,
     max_rails_in_city) = setups[multi_agent_setup]

    # Use the malfunction generator to break agents from time to time
    stochastic_data = {
        'malfunction_rate': 80,  # Rate of malfunction occurrence of single agent
        'min_duration': 15,  # Minimal duration of malfunction
        'max_duration': 50  # Max duration of malfunction
    }

    # Custom observation builder
    tree_depth = 2
    TreeObservation = TreeObsForRailEnv(
        max_depth=tree_depth,
        predictor=ShortestPathPredictorForRailEnv(20))

    # Persist the environment configuration next to the checkpoints.
    np.savetxt(fname=path.join('Nets', 'info.txt'),
               X=[
                   x_dim, y_dim, n_agents, max_num_cities,
                   max_rails_between_cities, max_rails_in_city, tree_depth
               ],
               delimiter=';')

    # Different agent types (trains) with different speeds.
    if agents_one_speed:
        speed_ration_map = {
            1.: 1.,  # Fast passenger train
            1. / 2.: 0.0,  # Fast freight train
            1. / 3.: 0.0,  # Slow commuter train
            1. / 4.: 0.0  # Slow freight train
        }
    else:
        speed_ration_map = {
            1.: 0.25,  # Fast passenger train
            1. / 2.: 0.25,  # Fast freight train
            1. / 3.: 0.25,  # Slow commuter train
            1. / 4.: 0.25  # Slow freight train
        }

    # Both variants share every argument except the malfunction generator.
    env_kwargs = dict(
        width=x_dim,
        height=y_dim,
        rail_generator=sparse_rail_generator(
            max_num_cities=max_num_cities,  # Number of cities in map (where train stations are)
            seed=14,  # Random seed
            grid_mode=False,
            max_rails_between_cities=max_rails_between_cities,
            max_rails_in_city=max_rails_in_city),
        schedule_generator=sparse_schedule_generator(speed_ration_map),
        number_of_agents=n_agents,
        obs_builder_object=TreeObservation)
    if malfunctions_enabled:
        env_kwargs['malfunction_generator_and_process_data'] = (
            malfunction_from_params(stochastic_data))
    env = RailEnv(**env_kwargs)
    env.reset(True, True)

    # After training we want to render the results so we also load a renderer
    # (only referenced by the commented-out render calls below).
    env_renderer = RenderTool(
        env,
        gl="PILSVG",
        screen_height=800,  # Adjust these parameters to fit your resolution
        screen_width=900)

    # Given the depth of the tree observation and the number of features per
    # node we get the following state_size
    num_features_per_node = env.obs_builder.observation_dim
    nr_nodes = sum(np.power(4, i) for i in range(tree_depth + 1))
    state_size = num_features_per_node * nr_nodes

    # The action space of flatland is 5 discrete actions
    action_size = 5

    # And the max number of steps we want to take per episode
    max_steps = int(3 * (env.height + env.width))

    # Define training parameters
    eps = 1.
    eps_end = 0.005
    eps_decay = 0.998

    # And some variables to keep track of the progress
    action_dict = dict()
    scores_window = deque(maxlen=100)
    done_window = deque(maxlen=100)
    deadlock_window = deque(maxlen=100)
    deadlock_average = []
    scores = []
    dones_list = []

    # Metrics
    eps_list = []
    action_prob_list = []
    action_prob = [0] * action_size
    agent_obs = [None] * env.get_num_agents()
    agent_obs_buffer = [None] * env.get_num_agents()
    agent_action_buffer = [2] * env.get_num_agents()
    # One flag per agent: a single shared flag would only ever reflect the
    # last agent handled in the action loop.
    update_values = [False] * env.get_num_agents()

    # Now we load a Double dueling DQN agent
    agent = Agent(state_size, action_size)

    for trials in range(1, n_trials + 1):
        # print(torch.cuda.current_device())
        # Reset environment
        obs, info = env.reset(True, True)
        # env_renderer.reset()

        # Build agent specific observations
        for a in range(env.get_num_agents()):
            if obs[a]:
                agent_obs[a] = normalize_observation(obs[a],
                                                     tree_depth,
                                                     observation_radius=10)
                agent_obs_buffer[a] = agent_obs[a].copy()

        # Reset score
        score = 0

        # Run episode
        for step in range(max_steps):
            # Action
            for a in range(env.get_num_agents()):
                if info['action_required'][a]:
                    # If an action is required, we want to store the obs at
                    # that step as well as the action
                    update_values[a] = True
                    action = agent.act(agent_obs[a], eps=eps)
                    action_prob[action] += 1
                else:
                    update_values[a] = False
                    action = 0
                action_dict.update({a: action})

            # Environment step
            # NOTE(review): a 5-tuple including `deadlocks` is unpacked here;
            # confirm the RailEnv variant in use returns it.
            next_obs, all_rewards, done, deadlocks, info = env.step(
                action_dict)
            # env_renderer.render_env(show=True, show_predictions=True, show_observations=True)

            # Update replay buffer and train agent
            for a in range(env.get_num_agents()):
                # Only update the values when we are done or when an action
                # was taken and thus relevant information is present
                if update_values[a] or done[a]:
                    agent.step(agent_obs_buffer[a], agent_action_buffer[a],
                               all_rewards[a], agent_obs[a], done[a])

                    agent_obs_buffer[a] = agent_obs[a].copy()
                    agent_action_buffer[a] = action_dict[a]
                if next_obs[a]:
                    agent_obs[a] = normalize_observation(
                        next_obs[a], tree_depth, observation_radius=10)

                score += all_rewards[a] / env.get_num_agents()

            if done['__all__']:
                break

        # Epsilon decay
        eps = max(eps_end, eps_decay * eps)  # decrease epsilon

        # Collection information about training
        tasks_finished = sum(1 for _idx in range(env.get_num_agents())
                             if done[_idx] == 1)
        done_window.append(tasks_finished / max(1, env.get_num_agents()))
        scores_window.append(score / max_steps)  # save most recent score
        scores.append(np.mean(scores_window))
        deadlock_window.append(
            deadlocks.count(1) / max(1, env.get_num_agents()))
        deadlock_average.append(np.mean(deadlock_window))
        dones_list.append((np.mean(done_window)))

        eps_list.append(eps)
        # Normalised action counts, computed once and reused for all reports.
        action_distribution = action_prob / np.sum(action_prob)
        action_prob_list.append(action_distribution)
        print(
            '\rTraining {} Agents on ({},{}).\t Episode {}\t Average Score: {:.3f}\tDones: {:.2f} %\tDeadlocks: {:.2f} \tEpsilon: {:.2f} \t Action Probabilities: \t {}'
            .format(env.get_num_agents(), x_dim, y_dim, trials,
                    np.mean(scores_window), 100 * np.mean(done_window),
                    np.mean(deadlock_window), eps, action_distribution),
            end=" ")

        if trials % 100 == 0:
            print(
                '\rTraining {} Agents on ({},{}).\t Episode {}\t Average Score: {:.3f}\tDones: {:.2f}%\tEpsilon: {:.2f} \t Action Probabilities: \t {}'
                .format(env.get_num_agents(), x_dim, y_dim, trials,
                        np.mean(scores_window), 100 * np.mean(done_window),
                        eps, action_distribution))
            torch.save(
                agent.qnetwork_local.state_dict(),
                path.join('Nets',
                          ('navigator_checkpoint' + str(trials) + '.pth')))
            # Restart the action statistics for the next reporting window.
            action_prob = [1] * action_size

        if trials % 50 == 0:
            np.savetxt(fname=path.join('Nets', 'metrics.csv'),
                       X=np.transpose(
                           np.asarray([
                               scores, dones_list, deadlock_average, eps_list
                           ])),
                       delimiter=';',
                       newline='\n')
            np.savetxt(fname=path.join('Nets', 'action_prob.csv'),
                       X=np.asarray(action_prob_list),
                       delimiter=';',
                       newline='\n')

    # Plot overall training progress at the end
    plt.plot(scores)
    plt.show()
def main(args, dir):
    """Run the state-machine controller on a sparse Flatland environment.

    For every episode the test battery produces triggers from the current
    observation, the state machine turns them into per-agent actions, and
    the observation builder maps those to rail environment actions. At the
    end a summary line (done agents, rewards, step and time averages) is
    logged.

    Episodes to debug (set breakpoint in episodes loop to debug):
    - ep = 3, agent 1 spawns in front of 3, blocking its path; 0 and 2 are
      in a deadlock since they have same priority
    - ep = 4, agents stop because of wrong priorities even though the
      conflict zone wasn't entered,
    - ep = 14,

    :param args: parsed options; the attributes read here are
        max_num_cities, seed, grid_mode, max_rails_between_cities,
        max_rails_in_city, prediction_depth, width, height, num_agents,
        malfunction_rate, min_duration, max_duration, render, save_image,
        num_episodes, generate_baseline and debug.
    :param dir: name of the sub-directory of ``image_dump`` that receives
        the frames when ``args.generate_baseline`` is set.
    :return: None
    """
    rail_generator = sparse_rail_generator(
        max_num_cities=args.max_num_cities,
        seed=args.seed,
        grid_mode=args.grid_mode,
        max_rails_between_cities=args.max_rails_between_cities,
        max_rails_in_city=args.max_rails_in_city,
    )

    # Maps speeds to % of appearance in the env
    speed_ration_map = {
        1.: 0.25,  # Fast passenger train
        1. / 2.: 0.25,  # Fast freight train
        1. / 3.: 0.25,  # Slow commuter train
        1. / 4.: 0.25  # Slow freight train
    }

    observation_builder = GraphObsForRailEnv(
        predictor=ShortestPathPredictorForRailEnv(
            max_depth=args.prediction_depth),
        bfs_depth=4)

    env = RailEnv(
        width=args.width,
        height=args.height,
        rail_generator=rail_generator,
        schedule_generator=sparse_schedule_generator(speed_ration_map),
        number_of_agents=args.num_agents,
        obs_builder_object=observation_builder,
        malfunction_generator_and_process_data=malfunction_from_params(
            parameters={
                'malfunction_rate': args.malfunction_rate,  # Rate of malfunction occurrence
                'min_duration': args.min_duration,  # Minimal duration of malfunction
                'max_duration': args.max_duration  # Max duration of malfunction
            }))

    # The renderer only exists when rendering was requested; every later use
    # is guarded so that running without --render no longer raises NameError.
    env_renderer = None
    if args.render:
        env_renderer = RenderTool(
            env,
            agent_render_variant=AgentRenderVariant.AGENT_SHOWS_OPTIONS_AND_BOX,
            show_debug=True)

    sm = stateMachine()
    tb = TestBattery(env, observation_builder)

    state_machine_action_dict = {}
    railenv_action_dict = {}
    # max_time_steps = env.compute_max_episode_steps(args.width, args.height)
    max_time_steps = 200
    T_rewards = []  # List of episodes rewards
    T_num_done_agents = []  # Proportion of done agents for each episode
    T_all_done = []  # If all agents completed in each episode
    T_episodes = []  # Time taken for each episode

    if args.save_image:
        os.makedirs("image_dump", exist_ok=True)

    total_step_taken = 0
    total_episodes = 0

    for ep in range(args.num_episodes):
        # Reset info at the beginning of an episode
        start_time = time.time()  # Take time of one episode

        # Baseline runs dump into the caller-provided directory, regular
        # runs into one directory per episode.
        image_dir = "image_dump/" + str(dir if args.generate_baseline else ep)
        if args.save_image:
            os.makedirs(image_dir, exist_ok=True)

        state, info = env.reset()
        tb.reset()

        if env_renderer is not None:
            env_renderer.reset()
        # reward_sum contains the cumulative reward obtained as sum during
        # the steps
        reward_sum, all_done = 0, False
        step_taken = 0

        state_machine_action = {i: 0 for i in range(env.number_of_agents)}

        for step in range(max_time_steps):
            # Test battery
            # see test_battery.py
            triggers = tb.tests(state, args.prediction_depth,
                                state_machine_action)
            # state machine based on triggers of test battery
            # see state_machine.py
            state_machine_action = sm.act(
                triggers)  # State machine picks action

            for a in range(env.get_num_agents()):
                railenv_action = observation_builder.choose_railenv_action(
                    a, state_machine_action[a])
                # NOTE(review): this stores the whole action mapping per
                # agent, not state_machine_action[a] — confirm that is the
                # intended debug output.
                state_machine_action_dict.update({a: state_machine_action})
                railenv_action_dict.update({a: railenv_action})

            state, reward, done, info = env.step(
                railenv_action_dict)  # Env step

            if env_renderer is not None:
                # Baseline generation renders off-screen only.
                env_renderer.render_env(show=not args.generate_baseline,
                                        show_observations=False,
                                        show_predictions=True)
                if args.save_image:
                    env_renderer.save_image(image_dir + "/image_" +
                                            str(step) + "_.png")

            if args.debug:
                for a in range(env.get_num_agents()):
                    log('\n\n#########################################')
                    log('\nInfo for agent {}'.format(a))
                    # log('\npath : {}'.format(state[a]["path"]))
                    log('\noverlap : {}'.format(state[a]["overlap"]))
                    log('\ndirection : {}'.format(state[a]["direction"]))
                    log('\nOccupancy, first layer: {}'.format(
                        state[a]["occupancy"]))
                    log('\nOccupancy, second layer: {}'.format(
                        state[a]["conflict"]))
                    log('\nForks: {}'.format(state[a]["forks"]))
                    log('\nTarget: {}'.format(state[a]["target"]))
                    log('\nPriority: {}'.format(state[a]["priority"]))
                    log('\nMax priority encountered: {}'.format(
                        state[a]["max_priority"]))
                    log('\nNum malfunctioning agents (globally): {}'.format(
                        state[a]["n_malfunction"]))
                    log('\nNum agents ready to depart (globally): {}'.format(
                        state[a]["ready_to_depart"]))
                    log('\nStatus: {}'.format(info['status'][a]))
                    log('\nPosition: {}'.format(env.agents[a].position))
                    log('\nTarget: {}'.format(env.agents[a].target))
                    log('\nMoving? {} at speed: {}'.format(
                        env.agents[a].moving, info['speed'][a]))
                    log('\nAction required? {}'.format(
                        info['action_required'][a]))
                    log('\nState machine action: {}'.format(
                        state_machine_action_dict[a]))
                    log('\nRailenv action: {}'.format(railenv_action_dict[a]))
                    log('\nRewards: {}'.format(reward[a]))
                    log('\n\n#########################################')

            reward_sum += sum(reward[a] for a in range(env.get_num_agents()))

            # Number of steps executed so far (the loop index is zero-based).
            step_taken = step + 1

            if done['__all__']:
                all_done = True
                break

        total_step_taken += step_taken
        # Record the episode duration so the average below is meaningful.
        T_episodes.append(time.time() - start_time)
        total_episodes = ep + 1

        # No need to close the renderer since env parameter sizes stay the same
        T_rewards.append(reward_sum)
        # Compute num of agents that reached their target
        num_done_agents = sum(1 for a in range(env.get_num_agents())
                              if done[a])
        percentage_done_agents = num_done_agents / env.get_num_agents()
        log("\nDone agents in episode: {}".format(percentage_done_agents))
        T_num_done_agents.append(
            percentage_done_agents)  # In proportion to total
        T_all_done.append(all_done)

    # Average number of agents that reached their target
    avg_done_agents = (sum(T_num_done_agents) / len(T_num_done_agents)
                       if T_num_done_agents else 0)
    avg_reward = sum(T_rewards) / len(T_rewards) if T_rewards else 0
    avg_norm_reward = avg_reward / (max_time_steps / env.get_num_agents())
    avg_ep_time = sum(T_episodes) / len(T_episodes) if T_episodes else 0

    # Avoid dividing by zero when no episode was run.
    if total_episodes == 0:
        total_episodes = 1

    log("\nSeed: " + str(args.seed)
        + "\t | Avg_done_agents: " + str(avg_done_agents)
        + "\t | Avg_reward: " + str(avg_reward)
        + "\t | Avg_norm_reward: " + str(avg_norm_reward)
        + "\t | Max_num_time_steps: " + str(max_time_steps)
        + "\t | Avg_num_time_steps: " + str(total_step_taken / total_episodes)
        + "\t | Avg episode time: " + str(avg_ep_time))
def eval_policy(env_params, checkpoint, n_eval_episodes, max_steps, seed,
                render):
    """Evaluate a saved DDDQN policy on freshly generated environments.

    :param env_params: mapping of environment and observation settings; it
        is converted to a ``Namespace`` and must provide n_agents, x_dim,
        y_dim, n_cities, max_rails_between_cities, max_rails_in_city,
        observation_tree_depth, observation_radius and
        observation_max_path_depth.
    :param checkpoint: checkpoint passed to ``torch.load``; the loaded
        object replaces the policy's local Q-network.
    :param n_eval_episodes: number of evaluation episodes to run.
    :param max_steps: step budget; each episode runs at most
        ``max_steps - 1`` environment steps.
    :param seed: random seed handed to the environment.
    :param render: when truthy, render every step with a ``RenderTool``.
    :return: tuple ``(scores, completions, nb_steps, agent_times,
        step_times)`` with one entry per episode in each list.
    """
    # evaluation is faster on CPU, except if you have huge networks
    parameters = {'use_gpu': False}

    # NOTE(review): state_size and action_size are not parameters here and
    # must come from the enclosing module — confirm they are defined.
    policy = DDDQNPolicy(state_size,
                         action_size,
                         Namespace(**parameters),
                         evaluation_mode=True)
    # NOTE: torch.load unpickles the file; only load checkpoints that come
    # from a trusted source.
    policy.qnetwork_local = torch.load(checkpoint)

    env_params = Namespace(**env_params)

    # Environment parameters
    n_agents = env_params.n_agents
    x_dim = env_params.x_dim
    y_dim = env_params.y_dim
    n_cities = env_params.n_cities
    max_rails_between_cities = env_params.max_rails_between_cities
    max_rails_in_city = env_params.max_rails_in_city

    # Observation parameters
    observation_tree_depth = env_params.observation_tree_depth
    observation_radius = env_params.observation_radius
    observation_max_path_depth = env_params.observation_max_path_depth

    # Malfunction and speed profiles
    # TODO pass these parameters properly from main!
    malfunction_parameters = MalfunctionParameters(
        malfunction_rate=1. / 2000,  # Rate of malfunctions
        min_duration=20,  # Minimal duration
        max_duration=50  # Max duration
    )

    speed_profiles = {
        1.: 1.0,  # Fast passenger train
        1. / 2.: 0.0,  # Fast freight train
        1. / 3.: 0.0,  # Slow commuter train
        1. / 4.: 0.0  # Slow freight train
    }

    # Observation builder
    predictor = ShortestPathPredictorForRailEnv(observation_max_path_depth)
    tree_observation = TreeObsForRailEnv(max_depth=observation_tree_depth,
                                         predictor=predictor)

    # Setup the environment
    env = RailEnv(
        width=x_dim,
        height=y_dim,
        rail_generator=sparse_rail_generator(
            max_num_cities=n_cities,
            grid_mode=False,
            max_rails_between_cities=max_rails_between_cities,
            max_rails_in_city=max_rails_in_city),
        schedule_generator=sparse_schedule_generator(speed_profiles),
        number_of_agents=n_agents,
        malfunction_generator_and_process_data=malfunction_from_params(
            malfunction_parameters),
        obs_builder_object=tree_observation,
        random_seed=seed)
    env.reset(True, True)

    if render:
        env_renderer = RenderTool(env, gl="PGL")

    action_dict = dict()
    scores = []
    completions = []
    nb_steps = []
    # inference/preproc times are collected per episode but are not part of
    # the returned tuple.
    inference_times = []
    preproc_times = []
    agent_times = []
    step_times = []

    for episode_idx in range(n_eval_episodes):
        inference_timer = Timer()
        preproc_timer = Timer()
        agent_timer = Timer()
        step_timer = Timer()

        agent_obs = [None] * env.get_num_agents()
        score = 0.0

        step_timer.start()
        obs, info = env.reset(regenerate_rail=True, regenerate_schedule=True)
        step_timer.end()

        if render:
            env_renderer.set_new_rail()

        final_step = 0

        for step in range(max_steps - 1):
            agent_timer.start()
            for agent in env.get_agent_handles():
                if obs[agent]:
                    preproc_timer.start()
                    agent_obs[agent] = normalize_observation(
                        obs[agent],
                        tree_depth=observation_tree_depth,
                        observation_radius=observation_radius)
                    preproc_timer.end()

                # Default action when the agent does not need a decision.
                action = 0
                if info['action_required'][agent]:
                    inference_timer.start()
                    action = policy.act(agent_obs[agent], eps=0.0)
                    inference_timer.end()
                action_dict.update({agent: action})
            agent_timer.end()

            step_timer.start()
            obs, all_rewards, done, info = env.step(action_dict)
            step_timer.end()

            if render:
                env_renderer.render_env(show=True,
                                        frames=False,
                                        show_observations=False,
                                        show_predictions=False)

            for agent in env.get_agent_handles():
                score += all_rewards[agent]

            final_step = step

            if done['__all__']:
                break

        normalized_score = score / (max_steps * env.get_num_agents())
        scores.append(normalized_score)

        tasks_finished = sum(done[idx] for idx in env.get_agent_handles())
        completion = tasks_finished / max(1, env.get_num_agents())
        completions.append(completion)

        nb_steps.append(final_step)

        inference_times.append(inference_timer.get())
        preproc_times.append(preproc_timer.get())
        agent_times.append(agent_timer.get())
        step_times.append(step_timer.get())

        # final_step is 0 when the episode ends on its very first step;
        # clamp the divisor so the report cannot raise ZeroDivisionError.
        agent_time_per_step = agent_timer.get() / max(1, final_step)

        print(
            "☑️ Score: {:.3f} \tDone: {:.1f}% \tNb steps: {:.3f} "
            "\t🚉 Env: {:.3f}s "
            "\t🤖 Agent: {:.3f}s (per step: {:.3f}s) \t[preproc: {:.3f}s \tinfer: {:.3f}s]"
            .format(normalized_score, completion * 100.0, final_step,
                    step_timer.get(), agent_timer.get(),
                    agent_time_per_step, preproc_timer.get(),
                    inference_timer.get()))

    return scores, completions, nb_steps, agent_times, step_times
1. / 2.: 0.0, # Fast freight train 1. / 3.: 0.0, # Slow commuter train 1. / 4.: 0.0 # Slow freight train } # Setup the environment env = RailEnv(width=x_dim, height=y_dim, rail_generator=sparse_rail_generator( max_num_cities=n_cities, grid_mode=False, max_rails_between_cities=max_rails_between_cities, max_rails_in_city=max_rails_in_city), schedule_generator=sparse_schedule_generator(speed_profiles), number_of_agents=n_agents, malfunction_generator_and_process_data=malfunction_from_params( malfunction_parameters), obs_builder_object=tree_observation, random_seed=seed) env.reset(regenerate_schedule=True, regenerate_rail=True) # Setup renderer env_renderer = RenderTool(env) ''' env_renderer.render_env(show=True,show_predictions=False) time.sleep(5) env_renderer.close_window() ''' n_features_per_node = env.obs_builder.observation_dim n_nodes = 0
def main():
    """Train (or resume training of) a double dueling DQN agent on Flatland.

    All configuration is read from module-level names (x_dim, y_dim,
    n_agents, tree_depth, rail_generator, schedule_generator, n_trials,
    eps_end, eps_decay, train, load_from_checkpoint, project_root,
    render_interval, report_interval). Progress is printed after every
    episode; every ``report_interval`` episodes a full report line is
    printed and, when ``train`` is set, the agent is saved below
    ``project_root / 'checkpoints'``.
    """
    np.random.seed(1)

    env = RailEnv(
        width=x_dim,
        height=y_dim,
        number_of_agents=n_agents,
        rail_generator=rail_generator,
        schedule_generator=schedule_generator,
        malfunction_generator_and_process_data=malfunction_from_params(
            StochasticData(1 / 8000, 15, 50)),
        obs_builder_object=TreeObservation(max_depth=tree_depth))

    # After training we want to render the results so we also load a renderer
    env_renderer = RenderTool(env, gl="PILSVG")

    # Calculate the state size based on the number of nodes in the tree
    # observation
    num_features_per_node = env.obs_builder.observation_dim
    num_nodes = sum(np.power(4, i) for i in range(tree_depth + 1))
    state_size = num_features_per_node * num_nodes
    action_size = 5

    # Now we load a double dueling DQN agent and initialize it from the
    # checkpoint
    agent = Agent(state_size, action_size)
    if load_from_checkpoint:
        start, eps = agent.load(project_root / 'checkpoints', 0, 1.0)
    else:
        start, eps = 0, 1.0

    # And some variables to keep track of the progress
    action_dict = {}
    scores_window, done_window = deque(maxlen=500), deque(maxlen=500)
    action_prob = [0] * action_size
    agent_obs = [None] * n_agents
    agent_obs_buffer = [None] * n_agents
    agent_action_buffer = [2] * n_agents

    max_steps = int(3 * (x_dim + y_dim))
    # One flag per agent: a single shared flag would only ever reflect the
    # last agent handled in the action loop.
    update_values = [False] * n_agents
    start_time = time.time()

    # We don't want to retrain on old railway networks when we restart from a
    # checkpoint, so we just loop through the generators to get all the old
    # networks out of the way
    for _ in range(0, start):
        rail_generator()
        schedule_generator()

    # Start the training loop
    for episode in range(start + 1, n_trials + 1):
        env_renderer.reset()
        obs, info = env.reset(True, True)
        score = 0

        # Build agent specific observations
        for a in range(n_agents):
            if obs[a]:
                agent_obs[a] = normalize_observation(obs[a],
                                                     tree_depth,
                                                     observation_radius=10)
                agent_obs_buffer[a] = agent_obs[a].copy()

        # Run episode
        for step in range(max_steps):
            for a in range(n_agents):
                if info['action_required'][a]:
                    # If an action is required, we want to store the obs at
                    # that step as well as the action
                    update_values[a] = True
                    action = agent.act(agent_obs[a], eps=eps)
                    # action = np.random.randint(4)
                    action_dict[a] = action
                    action_prob[action] += 1
                else:
                    update_values[a] = False
                    action_dict[a] = 0

            # Environment step
            next_obs, all_rewards, done, info = env.step(action_dict)

            # Update replay buffer and train agent
            for a in range(n_agents):
                # Only update the values when we are done or when an action
                # was taken and thus relevant information is present
                if update_values[a] or done[a]:
                    agent.step(agent_obs_buffer[a], agent_action_buffer[a],
                               all_rewards[a], agent_obs[a], done[a], train)
                    agent_obs_buffer[a] = agent_obs[a].copy()
                    agent_action_buffer[a] = action_dict[a]

                if next_obs[a]:
                    agent_obs[a] = normalize_observation(
                        next_obs[a], tree_depth, observation_radius=10)

                score += all_rewards[a] / n_agents

            # Render
            if episode % render_interval == 0:
                render(env_renderer)

            if done['__all__']:
                break

        # Epsilon decay
        eps = max(eps_end, eps_decay * eps)  # decrease epsilon

        # Collection information about training
        tasks_finished = sum(done[i] for i in range(n_agents))
        done_window.append(tasks_finished / max(1, n_agents))
        scores_window.append(score / max_steps)  # save most recent score

        action_probs = ', '.join(f'{x:.3f}'
                                 for x in action_prob / np.sum(action_prob))

        # The per-episode line and the periodic report share this prefix.
        status = (f'\rTraining {n_agents} Agents on ({x_dim},{y_dim}) \t ' +
                  f'Episode {episode} \t ' +
                  f'Average Score: {np.mean(scores_window):.3f} \t ' +
                  f'Dones: {100 * np.mean(done_window):.2f}% \t ' +
                  f'Epsilon: {eps:.2f} \t ' +
                  f'Action Probabilities: {action_probs}')
        print(status, end=" ")

        if episode % report_interval == 0:
            print(status + ' \t ' +
                  f'Time taken: {time.time() - start_time:.2f}s')

            if train:
                agent.save(project_root / 'checkpoints', episode, eps)
            # Restart the timer and the action statistics for the next
            # reporting window.
            start_time = time.time()
            action_prob = [1] * action_size