def test_normalize_features():
    random.seed(1)
    np.random.seed(1)
    max_depth = 4

    for i in range(10):
        tree_observer = TreeObsForRailEnv(max_depth=max_depth)
        next_rand_number = random.randint(0, 100)

        env = RailEnv(width=10,
                      height=10,
                      rail_generator=complex_rail_generator(nr_start_goal=10,
                                                            nr_extra=1,
                                                            min_dist=8,
                                                            max_dist=99999,
                                                            seed=next_rand_number),
                      schedule_generator=complex_schedule_generator(),
                      number_of_agents=1,
                      obs_builder_object=tree_observer)
        obs, all_rewards, done, _ = env.step({0: 0})
        obs_new = tree_observer.get()
        # data, distance, agent_data = split_tree(tree=np.array(obs_old), num_features_per_node=11)
        data_normalized = normalize_observation(obs_new, max_depth, observation_radius=10)

        filename = 'testdata/test_array_{}.csv'.format(i)
        data_loaded = np.loadtxt(filename, delimiter=',')

        assert np.allclose(data_loaded, data_normalized)
def __init__(self, env_params=env_params, speed_ration_map=speed_ration_map, obs_builder="global"):
    '''
    obs_builder: "global" (GlobalObsForRailEnv), "local" (LocalObsForRailEnv)
    or "tree" (TreeObsForRailEnv)
    '''
    self.width = env_params['width']
    self.height = env_params['height']
    self.max_num_cities = env_params['max_num_cities']
    self.number_of_agents = env_params['number_of_agents']

    # Use the malfunction generator to break agents from time to time
    self.stochastic_data = {
        'malfunction_rate': 0,  # Rate of malfunction occurrence
        'min_duration': 0,  # Minimal duration of malfunction
        'max_duration': 0  # Max duration of malfunction
    }

    # Custom observation builder
    self.TreeObservation = TreeObsForRailEnv(
        max_depth=2, predictor=ShortestPathPredictorForRailEnv())

    # Different agent types (trains) with different speeds.
    self.speed_ration_map = speed_ration_map

    # Obs builder lookup: every key maps to a ready-to-use builder instance
    self.obs_builder_dict = {
        "global": GlobalObsForRailEnv(),
        "local": LocalObsForRailEnv(view_width=2, view_height=5, center=3),
        "tree": TreeObsForRailEnv(max_depth=2,
                                  predictor=ShortestPathPredictorForRailEnv())
    }
    self.obs_builder = obs_builder
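# A minimal usage sketch for the class above, assuming it is named EnvWrapper
# (hypothetical; the class name is not shown in this snippet):
#
#   wrapper = EnvWrapper(env_params=env_params,
#                        speed_ration_map=speed_ration_map,
#                        obs_builder="tree")
#   builder = wrapper.obs_builder_dict[wrapper.obs_builder]  # a TreeObsForRailEnv instance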
def test_rail_env_reset():
    file_name = "test_rail_env_reset.pkl"

    # Test to save and load file.
    rail, rail_map = make_simple_rail()

    env = RailEnv(width=rail_map.shape[1],
                  height=rail_map.shape[0],
                  rail_generator=rail_from_grid_transition_map(rail),
                  schedule_generator=random_schedule_generator(),
                  number_of_agents=3,
                  obs_builder_object=TreeObsForRailEnv(
                      max_depth=2, predictor=ShortestPathPredictorForRailEnv()))
    env.reset()

    # env.save(file_name)
    RailEnvPersister.save(env, file_name)

    dist_map_shape = np.shape(env.distance_map.get())
    rails_initial = env.rail.grid
    agents_initial = env.agents

    # env2 = RailEnv(width=1, height=1,
    #                rail_generator=rail_from_file(file_name),
    #                schedule_generator=schedule_from_file(file_name),
    #                number_of_agents=1,
    #                obs_builder_object=TreeObsForRailEnv(max_depth=2, predictor=ShortestPathPredictorForRailEnv()))
    # env2.reset(False, False, False)
    env2, env2_dict = RailEnvPersister.load_new(file_name)

    rails_loaded = env2.rail.grid
    agents_loaded = env2.agents

    assert np.all(np.array_equal(rails_initial, rails_loaded))
    assert agents_initial == agents_loaded

    env3 = RailEnv(width=1,
                   height=1,
                   rail_generator=rail_from_file(file_name),
                   schedule_generator=schedule_from_file(file_name),
                   number_of_agents=1,
                   obs_builder_object=TreeObsForRailEnv(
                       max_depth=2, predictor=ShortestPathPredictorForRailEnv()))
    env3.reset(False, True, False)
    rails_loaded = env3.rail.grid
    agents_loaded = env3.agents

    assert np.all(np.array_equal(rails_initial, rails_loaded))
    assert agents_initial == agents_loaded

    env4 = RailEnv(width=1,
                   height=1,
                   rail_generator=rail_from_file(file_name),
                   schedule_generator=schedule_from_file(file_name),
                   number_of_agents=1,
                   obs_builder_object=TreeObsForRailEnv(
                       max_depth=2, predictor=ShortestPathPredictorForRailEnv()))
    env4.reset(True, False, False)
    rails_loaded = env4.rail.grid
    agents_loaded = env4.agents

    assert np.all(np.array_equal(rails_initial, rails_loaded))
    assert agents_initial == agents_loaded
def test_get_entry_directions():
    rail, rail_map = make_simple_rail()
    env = RailEnv(width=rail_map.shape[1],
                  height=rail_map.shape[0],
                  rail_generator=rail_from_grid_transition_map(rail),
                  schedule_generator=random_schedule_generator(),
                  number_of_agents=1,
                  obs_builder_object=TreeObsForRailEnv(
                      max_depth=2, predictor=ShortestPathPredictorForRailEnv()))
    env.reset()

    def _assert(position, expected):
        actual = env.get_valid_directions_on_grid(*position)
        assert actual == expected, "[{},{}] actual={}, expected={}".format(
            *position, actual, expected)

    # Expected lists are [North, East, South, West]
    # north dead end
    _assert((0, 3), [True, False, False, False])
    # west dead end
    _assert((3, 0), [False, False, False, True])
    # switch
    _assert((3, 3), [False, True, True, True])
    # horizontal
    _assert((3, 2), [False, True, False, True])
    # vertical
    _assert((2, 3), [True, False, True, False])
    # nowhere
    _assert((0, 0), [False, False, False, False])
def __init__(self, config) -> None:
    super().__init__(config)
    self._builder = FixedTreeObsWrapper(
        TreeObsForRailEnv(max_depth=config['max_depth'],
                          predictor=get_predictor(config=config)),
        small_tree=config.get('small_tree', None),
        search_strategy=config.get('search_strategy', 'dfs'))
def test_path_not_exists(rendering=False):
    rail, rail_map = make_simple_rail_unconnected()
    env = RailEnv(
        width=rail_map.shape[1],
        height=rail_map.shape[0],
        rail_generator=rail_from_grid_transition_map(rail),
        schedule_generator=random_schedule_generator(),
        number_of_agents=1,
        obs_builder_object=TreeObsForRailEnv(
            max_depth=2, predictor=ShortestPathPredictorForRailEnv()),
    )
    env.reset()

    check_path(
        env,
        rail,
        (5, 6),  # south dead-end
        0,  # north
        (0, 3),  # north dead-end
        False)

    if rendering:
        renderer = RenderTool(env, gl="PILSVG")
        renderer.render_env(show=True, show_observations=False)
        input("Continue?")
def render_test(parameters, test_nr=0, nr_examples=5):
    for trial in range(nr_examples):
        # Reset the env
        print('Showing {} Level {} with (x_dim,y_dim) = ({},{}) and {} Agents.'.format(
            test_nr, trial, parameters[0], parameters[1], parameters[2]))
        file_name = "./Tests/{}/Level_{}.pkl".format(test_nr, trial)
        env = RailEnv(
            width=1,
            height=1,
            rail_generator=rail_from_file(file_name),
            obs_builder_object=TreeObsForRailEnv(max_depth=2),
            number_of_agents=1,
        )
        env_renderer = RenderTool(
            env,
            gl="PILSVG",
        )

        env_renderer.set_new_rail()
        env.reset(False, False)
        env_renderer.render_env(show=True, show_observations=False)
        time.sleep(0.1)
        env_renderer.close_window()
    return
def create_testfiles(parameters, test_nr=0, nr_trials_per_test=100):
    # Parameter initialization
    print('Creating {} with (x_dim,y_dim) = ({},{}) and {} Agents.'.format(
        test_nr, parameters[0], parameters[1], parameters[2]))

    # Reset environment
    random.seed(parameters[3])
    np.random.seed(parameters[3])
    nr_paths = max(4, parameters[2] + int(0.5 * parameters[2]))
    min_dist = int(min([parameters[0], parameters[1]]) * 0.75)
    env = RailEnv(width=parameters[0],
                  height=parameters[1],
                  rail_generator=complex_rail_generator(nr_start_goal=nr_paths,
                                                        nr_extra=5,
                                                        min_dist=min_dist,
                                                        max_dist=99999,
                                                        seed=parameters[3]),
                  schedule_generator=complex_schedule_generator(),
                  obs_builder_object=TreeObsForRailEnv(max_depth=2),
                  number_of_agents=parameters[2])
    printProgressBar(0, nr_trials_per_test,
                     prefix='Progress:', suffix='Complete', length=20)
    for trial in range(nr_trials_per_test):
        # Reset the env
        env.reset(True, True)
        env.save("./Tests/{}/Level_{}.pkl".format(test_nr, trial))
        printProgressBar(trial + 1, nr_trials_per_test,
                         prefix='Progress:', suffix='Complete', length=20)
    return
def test_seeding_and_observations():
    # Test if two different instances diverge with different observations
    rail, rail_map = make_simple_rail2()

    # Make two separate envs with different observation builders
    # Global Observation
    env = RailEnv(width=25,
                  height=30,
                  rail_generator=rail_from_grid_transition_map(rail),
                  schedule_generator=random_schedule_generator(seed=12),
                  number_of_agents=10,
                  obs_builder_object=GlobalObsForRailEnv())
    # Tree Observation
    env2 = RailEnv(width=25,
                   height=30,
                   rail_generator=rail_from_grid_transition_map(rail),
                   schedule_generator=random_schedule_generator(seed=12),
                   number_of_agents=10,
                   obs_builder_object=TreeObsForRailEnv(
                       max_depth=2, predictor=ShortestPathPredictorForRailEnv()))

    env.reset(False, False, False, random_seed=12)
    env2.reset(False, False, False, random_seed=12)

    # Check that both environments produce the same initial start positions
    assert env.agents[0].initial_position == env2.agents[0].initial_position
    assert env.agents[1].initial_position == env2.agents[1].initial_position
    assert env.agents[2].initial_position == env2.agents[2].initial_position
    assert env.agents[3].initial_position == env2.agents[3].initial_position
    assert env.agents[4].initial_position == env2.agents[4].initial_position
    assert env.agents[5].initial_position == env2.agents[5].initial_position
    assert env.agents[6].initial_position == env2.agents[6].initial_position
    assert env.agents[7].initial_position == env2.agents[7].initial_position
    assert env.agents[8].initial_position == env2.agents[8].initial_position
    assert env.agents[9].initial_position == env2.agents[9].initial_position

    action_dict = {}
    for step in range(10):
        for a in range(env.get_num_agents()):
            action = np.random.randint(4)
            action_dict[a] = action
        env.step(action_dict)
        env2.step(action_dict)

    # Check that both environments end up in the same position
    assert env.agents[0].position == env2.agents[0].position
    assert env.agents[1].position == env2.agents[1].position
    assert env.agents[2].position == env2.agents[2].position
    assert env.agents[3].position == env2.agents[3].position
    assert env.agents[4].position == env2.agents[4].position
    assert env.agents[5].position == env2.agents[5].position
    assert env.agents[6].position == env2.agents[6].position
    assert env.agents[7].position == env2.agents[7].position
    assert env.agents[8].position == env2.agents[8].position
    assert env.agents[9].position == env2.agents[9].position

    # Helper loop that prints the assert statements above (handy for regenerating them)
    for a in range(env.get_num_agents()):
        print("assert env.agents[{}].position == env2.agents[{}].position".format(a, a))
def regenerate(self, method=None, nAgents=0, env=None):
    self.log("Regenerate size", self.regen_size_width, self.regen_size_height)

    if method is None or method == "Empty":
        fnMethod = empty_rail_generator()
    elif method == "Random Cell":
        fnMethod = random_rail_generator(cell_type_relative_proportion=[1] * 11)
    else:
        fnMethod = complex_rail_generator(nr_start_goal=nAgents,
                                          nr_extra=20,
                                          min_dist=12,
                                          seed=int(time.time()))

    if env is None:
        self.env = RailEnv(width=self.regen_size_width,
                           height=self.regen_size_height,
                           rail_generator=fnMethod,
                           number_of_agents=nAgents,
                           obs_builder_object=TreeObsForRailEnv(max_depth=2))
    else:
        self.env = env
    self.env.reset(regenerate_rail=True)
    self.fix_env()
    self.set_env(self.env)
    self.view.new_env()
    self.redraw()
def gen_env(number_agents, width, height, n_start_goal, seed):
    speed_ration_map = {
        1.: 0.25,       # Fast passenger train
        1. / 2.: 0.25,  # Fast freight train
        1. / 3.: 0.25,  # Slow commuter train
        1. / 4.: 0.25   # Slow freight train
    }
    env = RailEnv(width=width,
                  height=height,
                  rail_generator=complex_rail_generator(nr_start_goal=n_start_goal,
                                                        nr_extra=3,
                                                        min_dist=6,
                                                        max_dist=99999,
                                                        seed=seed),
                  schedule_generator=complex_schedule_generator(
                      speed_ratio_map=speed_ration_map),
                  number_of_agents=number_agents,
                  obs_builder_object=TreeObsForRailEnv(max_depth=5))
    env.reset()
    # Issue MOVE_FORWARD (action 2) to every agent so they enter the grid
    env.step(dict(zip(range(number_agents), [2] * number_agents)))
    return env
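# Usage sketch for gen_env above (argument values are illustrative). Each agent
# has already received one MOVE_FORWARD, so the env can be stepped further:
#
#   env = gen_env(number_agents=3, width=30, height=30, n_start_goal=10, seed=1)
#   obs, rewards, done, info = env.step({handle: 2 for handle in range(3)})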
def load_flatland_environment_from_file(
        file_name: str,
        load_from_package: str = None,
        obs_builder_object: ObservationBuilder = None) -> RailEnv:
    """
    Parameters
    ----------
    file_name : str
        The pickle file.
    load_from_package : str
        The python module to import from. Example: 'env_data.tests'
        This requires that there are `__init__.py` files in the folder structure we load the file from.
    obs_builder_object : ObservationBuilder
        The obs builder for the `RailEnv` that is created.

    Returns
    -------
    RailEnv
        The environment loaded from the pickle file.
    """
    if obs_builder_object is None:
        obs_builder_object = TreeObsForRailEnv(
            max_depth=2,
            predictor=ShortestPathPredictorForRailEnv(max_depth=10))
    environment = RailEnv(width=1,
                          height=1,
                          rail_generator=rail_from_file(file_name, load_from_package),
                          schedule_generator=schedule_from_file(file_name, load_from_package),
                          number_of_agents=1,
                          obs_builder_object=obs_builder_object)
    return environment
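# Usage sketch for the loader above; "Level_0.pkl" and the package name are
# placeholders for an actual pickled episode:
#
#   env = load_flatland_environment_from_file("Level_0.pkl",
#                                             load_from_package="env_data.tests")
#   env.reset(False, False)  # keep the rail and schedule loaded from the file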
def test_path_exists(rendering=False):
    rail, rail_map = make_simple_rail()
    env = RailEnv(
        width=rail_map.shape[1],
        height=rail_map.shape[0],
        rail_generator=rail_from_grid_transition_map(rail),
        schedule_generator=random_schedule_generator(),
        number_of_agents=1,
        obs_builder_object=TreeObsForRailEnv(
            max_depth=2, predictor=ShortestPathPredictorForRailEnv()),
    )
    env.reset()

    check_path(
        env,
        rail,
        (5, 6),  # north of south dead-end
        0,  # north
        (3, 9),  # east dead-end
        True)

    check_path(
        env,
        rail,
        (6, 6),  # south dead-end
        2,  # south
        (3, 9),  # east dead-end
        True)

    check_path(
        env,
        rail,
        (3, 0),  # west dead-end
        3,  # west
        (0, 3),  # north dead-end
        True)

    check_path(
        env,
        rail,
        (5, 6),  # north of south dead-end
        0,  # north
        (1, 3),  # south of north dead-end
        True)

    check_path(
        env,
        rail,
        (1, 3),  # south of north dead-end
        2,  # south
        (3, 3),  # center switch
        True)

    check_path(
        env,
        rail,
        (1, 3),  # south of north dead-end
        0,  # north
        (3, 3),  # center switch
        True)
def test_shortest_path_predictor_conflicts(rendering=False):
    rail, rail_map = make_invalid_simple_rail()
    env = RailEnv(
        width=rail_map.shape[1],
        height=rail_map.shape[0],
        rail_generator=rail_from_grid_transition_map(rail),
        schedule_generator=random_schedule_generator(),
        number_of_agents=2,
        obs_builder_object=TreeObsForRailEnv(
            max_depth=2, predictor=ShortestPathPredictorForRailEnv()),
    )
    env.reset()

    # set the initial position
    agent = env.agents[0]
    agent.initial_position = (5, 6)  # south dead-end
    agent.position = (5, 6)  # south dead-end
    agent.direction = 0  # north
    agent.initial_direction = 0  # north
    agent.target = (3, 9)  # east dead-end
    agent.moving = True
    agent.status = RailAgentStatus.ACTIVE

    agent = env.agents[1]
    agent.initial_position = (3, 8)  # east dead-end
    agent.position = (3, 8)  # east dead-end
    agent.direction = 3  # west
    agent.initial_direction = 3  # west
    agent.target = (6, 6)  # south dead-end
    agent.moving = True
    agent.status = RailAgentStatus.ACTIVE

    observations, info = env.reset(False, False, True)

    if rendering:
        renderer = RenderTool(env, gl="PILSVG")
        renderer.render_env(show=True, show_observations=False)
        input("Continue?")

    # get the trees to test
    obs_builder: TreeObsForRailEnv = env.obs_builder
    pp = pprint.PrettyPrinter(indent=4)
    tree_0 = observations[0]
    tree_1 = observations[1]
    env.obs_builder.util_print_obs_subtree(tree_0)
    env.obs_builder.util_print_obs_subtree(tree_1)

    # check the expectations
    expected_conflicts_0 = [('F', 'R')]
    expected_conflicts_1 = [('F', 'L')]
    _check_expected_conflicts(expected_conflicts_0, obs_builder, tree_0, "agent[0]: ")
    _check_expected_conflicts(expected_conflicts_1, obs_builder, tree_1, "agent[1]: ")
def test_render_env(save_new_images=False):
    np.random.seed(100)
    oEnv = RailEnv(width=10,
                   height=10,
                   rail_generator=empty_rail_generator(),
                   number_of_agents=0,
                   obs_builder_object=TreeObsForRailEnv(max_depth=2))
    oEnv.reset()
    oEnv.rail.load_transition_map('env_data.tests', "test1.npy")

    oRT = rt.RenderTool(oEnv, gl="PILSVG")
    oRT.render_env(show=False)
    checkFrozenImage(oRT, "basic-env.npz", resave=save_new_images)

    oRT = rt.RenderTool(oEnv, gl="PIL")
    oRT.render_env()
    checkFrozenImage(oRT, "basic-env-PIL.npz", resave=save_new_images)
def __init__(self, env=None, sGL="PIL", env_filename="temp.pkl"):
    """ Create an Editor MVC assembly around a RailEnv, or create one if None. """
    if env is None:
        env = RailEnv(width=10,
                      height=10,
                      rail_generator=empty_rail_generator(),
                      number_of_agents=0,
                      obs_builder_object=TreeObsForRailEnv(max_depth=2))

    env.reset()

    self.editor = EditorModel(env, env_filename=env_filename)
    self.editor.view = self.view = View(self.editor, sGL=sGL)
    self.view.controller = self.editor.controller = self.controller = Controller(self.editor, self.view)
    self.view.init_canvas()
    self.view.init_widgets()  # has to be done after controller
def create_default_single_agent_environment(seed, timed):
    # Default observation parameters
    observation_tree_depth = 2
    observation_max_path_depth = 30

    # Default (tree) observation builder
    predictor = ShortestPathPredictorForRailEnv(observation_max_path_depth)
    tree_observation = TreeObsForRailEnv(max_depth=observation_tree_depth,
                                         predictor=predictor)

    # Unpack the return values of the default environment, in order to re-pack them for our return value.
    env, max_steps, x_dim, y_dim = _create_default_single_agent_environment(
        seed, timed, tree_observation)

    return env, max_steps, x_dim, y_dim, observation_tree_depth, observation_max_path_depth
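# Usage sketch: the helper returns the env together with the observation
# parameters that normalize_observation needs later (argument values are
# illustrative):
#
#   env, max_steps, x_dim, y_dim, obs_depth, obs_path_depth = \
#       create_default_single_agent_environment(seed=42, timed=False)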
def test_walker():
    # _ _ _

    transitions = RailEnvTransitions()
    cells = transitions.transition_list
    dead_end_from_south = cells[7]
    dead_end_from_west = transitions.rotate_transition(dead_end_from_south, 90)
    dead_end_from_east = transitions.rotate_transition(dead_end_from_south, 270)
    vertical_straight = cells[1]
    horizontal_straight = transitions.rotate_transition(vertical_straight, 90)

    rail_map = np.array(
        [[dead_end_from_east] + [horizontal_straight] + [dead_end_from_west]],
        dtype=np.uint16)
    rail = GridTransitionMap(width=rail_map.shape[1],
                             height=rail_map.shape[0],
                             transitions=transitions)
    rail.grid = rail_map
    env = RailEnv(
        width=rail_map.shape[1],
        height=rail_map.shape[0],
        rail_generator=rail_from_grid_transition_map(rail),
        schedule_generator=random_schedule_generator(),
        number_of_agents=1,
        obs_builder_object=TreeObsForRailEnv(
            max_depth=2,
            predictor=ShortestPathPredictorForRailEnv(max_depth=10)),
    )
    env.reset()

    # set initial position and direction for testing...
    env.agents[0].position = (0, 1)
    env.agents[0].direction = 1
    env.agents[0].target = (0, 0)

    # reset to set agents from agents_static
    env.reset(False, False)

    # distance from (0, 1) facing east (1) to the target at (0, 0)
    print(env.distance_map.get()[(0, *[0, 1], 1)])
    assert env.distance_map.get()[(0, *[0, 1], 1)] == 3
    # distance from (0, 2) facing west (3) to the target at (0, 0)
    print(env.distance_map.get()[(0, *[0, 2], 3)])
    assert env.distance_map.get()[(0, *[0, 2], 3)] == 2
def create_env(seed=None):
    """
    Helper function that creates an env everywhere
    This way it only needs to be defined here
    """
    from flatland.envs.rail_env import RailEnv
    from flatland.envs.observations import TreeObsForRailEnv
    from flatland.envs.rail_generators import complex_rail_generator
    from flatland.envs.schedule_generators import complex_schedule_generator

    # TODO make more configurable
    env = RailEnv(width=20,
                  height=20,
                  obs_builder_object=TreeObsForRailEnv(2),
                  rail_generator=complex_rail_generator(nr_start_goal=100,
                                                        nr_extra=2,
                                                        min_dist=8,
                                                        max_dist=99999,
                                                        seed=seed),
                  schedule_generator=complex_schedule_generator(seed=seed),
                  number_of_agents=3,
                  random_seed=seed)
    return env
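# Usage sketch for create_env (the seed value is illustrative):
#
#   env = create_env(seed=0)
#   obs, info = env.reset()  # dict of tree observations, one per agent handle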
def demo_lpg_planing():
    from flatland.envs.rail_generators import sparse_rail_generator
    from flatland.envs.schedule_generators import sparse_schedule_generator
    from flatland.envs.observations import TreeObsForRailEnv

    n_agents = 1
    x_dim = 25
    y_dim = 25
    n_cities = 4
    max_rails_between_cities = 2
    max_rails_in_city = 3
    seed = 42

    # Observation parameters
    observation_tree_depth = 2

    domain_file = "./pddl/flatland.pddl"
    problem_dir = "./pddl/flatland"
    num_problems = 6

    tree_observation = TreeObsForRailEnv(max_depth=observation_tree_depth)

    env = PDDLFlatlandEnv(width=x_dim,
                          height=y_dim,
                          rail_generator=sparse_rail_generator(
                              max_num_cities=n_cities,
                              seed=seed,
                              grid_mode=False,
                              max_rails_between_cities=max_rails_between_cities,
                              max_rails_in_city=max_rails_in_city),
                          schedule_generator=sparse_schedule_generator(),
                          number_of_agents=n_agents,
                          obs_builder_object=tree_observation,
                          domain_file=domain_file,
                          problem_dir=problem_dir)

    for problem_index in range(num_problems):
        env.fix_problem_index(problem_index)
        run_planning_flatland_demo(env, 'lpg')
n_eval_episodes = train_params.n_evaluation_episodes

# Set the seeds
random.seed(seed)
np.random.seed(seed)

# Break agents from time to time
malfunction_parameters = MalfunctionParameters(
    malfunction_rate=1. / 10000,  # Rate of malfunctions
    min_duration=15,  # Minimal duration
    max_duration=50  # Max duration
)

# Observation builder
predictor = ShortestPathPredictorForRailEnv(observation_max_path_depth)
tree_observation = TreeObsForRailEnv(max_depth=observation_tree_depth,
                                     predictor=predictor)

# Fraction of trains with each speed
speed_profiles = {
    1.: 1.0,       # Fast passenger train
    1. / 2.: 0.0,  # Fast freight train
    1. / 3.: 0.0,  # Slow commuter train
    1. / 4.: 0.0   # Slow freight train
}

# Setup the environment
env = RailEnv(width=x_dim,
              height=y_dim,
              rail_generator=sparse_rail_generator(
                  max_num_cities=n_cities,
                  grid_mode=False,
              height=20,
              rail_generator=rail_from_file(file_name),
              obs_builder_object=TreeObsForRailEnv(max_depth=3,
                                                   predictor=ShortestPathPredictorForRailEnv()))
x_dim = env.width
y_dim = env.height
"""

# Parameters for the Environment
x_dim = 25
y_dim = 25
n_agents = 1
n_goals = 5
min_dist = 5

# We are training an Agent using the Tree Observation with depth 2
observation_builder = TreeObsForRailEnv(max_depth=2)

# Use the malfunction generator to break agents from time to time
stochastic_data = {'malfunction_rate': 80,  # Rate of malfunction occurrence of single agent
                   'min_duration': 15,  # Minimal duration of malfunction
                   'max_duration': 50  # Max duration of malfunction
                   }

# Custom observation builder
TreeObservation = TreeObsForRailEnv(max_depth=2)

# Different agent types (trains) with different speeds.
speed_ration_map = {1.: 1.,        # Fast passenger train
                    1. / 2.: 0.0,  # Fast freight train
                    1. / 3.: 0.0,  # Slow commuter train
                    1. / 4.: 0.0}  # Slow freight train
def __init__(self, config) -> None:
    super().__init__(config)
    self._builder = TreeObsForRailEnvRLLibWrapper(
        TreeObsForRailEnv(max_depth=config['max_depth'],
                          predictor=ShortestPathPredictorForRailEnv(
                              config['shortest_path_max_depth'])))
def train(env):
    n_agents = env["n_agents"]
    x_dim = env["x_dim"]
    y_dim = env["y_dim"]
    n_cities = env["n_cities"]
    max_rails_between_cities = env["max_rails_between_cities"]
    max_rails_in_city = env["max_rails_in_city"]
    seed = 0
    use_fast_tree_obs = False

    # Observation parameters
    observation_tree_depth = 4
    observation_radius = 10
    observation_max_path_depth = 30

    # Set the seeds
    random.seed(seed)
    np.random.seed(seed)

    # Break agents from time to time
    malfunction_parameters = MalfunctionParameters(
        malfunction_rate=1. / 10000,  # Rate of malfunctions
        min_duration=15,  # Minimal duration
        max_duration=50  # Max duration
    )

    # Observation builder
    predictor = ShortestPathPredictorForRailEnv(observation_max_path_depth)
    tree_observation = None

    if use_fast_tree_obs:
        tree_observation = FastTreeObs(max_depth=observation_tree_depth)
        print("Using FastTreeObs")
    else:
        tree_observation = TreeObsForRailEnv(max_depth=observation_tree_depth,
                                             predictor=predictor)
        print("Using StandardTreeObs")

    speed_profiles = {
        1.: 1.0,       # Fast passenger train
        1. / 2.: 0.0,  # Fast freight train
        1. / 3.: 0.0,  # Slow commuter train
        1. / 4.: 0.0   # Slow freight train
    }

    env = RailEnv(width=x_dim,
                  height=y_dim,
                  rail_generator=sparse_rail_generator(
                      max_num_cities=n_cities,
                      grid_mode=False,
                      max_rails_between_cities=max_rails_between_cities,
                      max_rails_in_city=max_rails_in_city),
                  schedule_generator=sparse_schedule_generator(speed_profiles),
                  number_of_agents=n_agents,
                  malfunction_generator_and_process_data=malfunction_from_params(
                      malfunction_parameters),
                  obs_builder_object=tree_observation,
                  random_seed=seed)

    rewards = []
    obs, info = env.reset()

    if use_fast_tree_obs:
        state_size = tree_observation.observation_dim
    else:
        # Calculate the state size given the depth of the tree observation and the
        # number of features
        n_features_per_node = env.obs_builder.observation_dim
        n_nodes = 0
        for i in range(observation_tree_depth + 1):
            n_nodes += np.power(4, i)
        state_size = n_features_per_node * n_nodes

    action_size = 5

    DEVICE = 'cpu'
    # if torch.cuda.is_available():
    #     DEVICE = 'gpu'

    buffer_length = 10000
    steps_to_save_model = 10
    step_size = 100
    num_steps = 100  # update every 100 steps
    avg_steps = 20  # num steps to average and plot rewards
    reward_q = []
    batch_size = 100

    agent_obs = np.array([None] * env.get_num_agents())

    max_steps = int(4 * 2 * (env.height + env.width + (n_agents / n_cities)))
    num_episodes = 100000

    agent_init_params = []
    sa_size = []

    for i in range(n_agents):
        agent_init_params.append({
            'num_in_pol': state_size,
            'num_out_pol': action_size,
            'init_weights': 'model.pt'
        })
        sa_size.append((state_size, action_size))

    hyperparams = {
        "tau": 0.01,
        "pi_lr": 0.00001,
        "q_lr": 0.00005,
        "pol_hidden_dim": 256,
        "critic_hidden_dim": 256,
        "attend_heads": 8
    }

    model = AttentionSAC(agent_init_params=agent_init_params,
                         sa_size=sa_size,
                         tau=hyperparams["tau"],
                         pi_lr=hyperparams["pi_lr"],
                         q_lr=hyperparams["q_lr"],
                         pol_hidden_dim=hyperparams["pol_hidden_dim"],
                         critic_hidden_dim=hyperparams["critic_hidden_dim"],
                         attend_heads=hyperparams["attend_heads"])
    model.init_dict = {}

    replay_buffer = ReplayBuffer(buffer_length, n_agents,
                                 [state_size for i in range(n_agents)],
                                 [action_size for i in range(n_agents)])

    print("MAX STEPS: " + str(max_steps))
    print("NUM EPISODES: ", num_episodes)
    print("HYPERPARAMS: ")
    print(hyperparams)

    start_time = time.time()

    for ep in range(num_episodes):
        print("Episode " + str(ep) + ":", flush=True)
        obs, info = env.reset(True, True)
        model.prep_rollouts(device=DEVICE)
        reward_sum_for_this_episode = 0

        for steps in range(max_steps):
            if steps % step_size == 0:
                print("=", end="", flush=True)

            # Normalize the current observations for the policy
            for agent in env.get_agent_handles():
                if obs[agent] is not None:
                    if use_fast_tree_obs:
                        agent_obs[agent] = obs[agent]
                    else:
                        agent_obs[agent] = normalize_observation(
                            obs[agent],
                            observation_tree_depth,
                            observation_radius=observation_radius)
                else:
                    agent_obs[agent] = np.array([0.] * state_size)

            action_dict = {}
            agent_actions = []

            torch_obs = [
                Variable(torch.Tensor([agent_obs[i]]), requires_grad=False)
                for i in range(n_agents)
            ]
            torch_agent_actions = model.step(torch_obs, explore=True)
            agent_actions = [ac.data.numpy() for ac in torch_agent_actions]

            # Decode the one-hot action vectors into discrete actions
            for i in range(n_agents):
                dist = torch_agent_actions[i][0]
                idx = -1
                for j in range(action_size):
                    if dist[j] != 0:
                        idx = j
                        break
                action_dict[i] = idx

            next_obs, all_rewards, done, info = env.step(action_dict)

            rewards = []
            dones = []
            next_agent_obs = np.array([None] * env.get_num_agents())

            # Normalize the post-step observations for the replay buffer
            for agent in env.get_agent_handles():
                if next_obs[agent] is not None:
                    if use_fast_tree_obs:
                        next_agent_obs[agent] = next_obs[agent]
                    else:
                        next_agent_obs[agent] = normalize_observation(
                            next_obs[agent],
                            observation_tree_depth,
                            observation_radius=observation_radius)
                else:
                    next_agent_obs[agent] = np.array([0.] * state_size)

            for i in range(n_agents):
                reward_sum_for_this_episode += all_rewards[i]
                rewards.append(all_rewards[i])
                # Reward shaping based on this agent's own current observation
                all_rewards[i] += augment_reward(agent_obs[i])
                dones.append(done[i])

            replay_buffer.push(np.array([agent_obs]), np.array(agent_actions),
                               np.array([rewards]), np.array([next_agent_obs]),
                               np.array([dones]))

            if steps % num_steps == 0:
                model.prep_training(device=DEVICE)
                sample = replay_buffer.sample(batch_size, norm_rews=False)
                # print(sample)
                model.update_critic(sample)
                model.update_policies(sample)
                model.update_all_targets()
                model.prep_rollouts(device=DEVICE)

            # Carry the new observations into the next iteration
            obs = next_obs

        reward_sum_for_this_episode /= n_agents
        reward_q.append(reward_sum_for_this_episode)

        if len(reward_q) == avg_steps:
            wandb.log({'reward': np.mean(reward_q)})
            reward_q = []

        print()

        if ep % steps_to_save_model == 0:
            print("\nSaving model")
            model.save(os.getcwd() + "/model.pt")
            cur_time = time.time()
            time_elapsed = (cur_time - start_time) // 60
            print("Time Elapsed: " + str(time_elapsed) + "\n")
# ------------------------------------------------------
# 1. Setup of the environment
# ------------------------------------------------------
start = time.time()

# Seed for reproducibility
np.random.seed(420)

# Parameters for the environment
x_dim = 5
y_dim = 5
n_agents = 1

# Custom observation builder
tree_depth = 2
tree_obs = TreeObsForRailEnv(max_depth=tree_depth)

# Environment setup
env = RailEnv(
    width=x_dim,
    height=y_dim,
    number_of_agents=n_agents,
    rail_generator=random_rail_generator(),
    obs_builder_object=tree_obs
)

# Render and show the env
env_renderer = RenderTool(env=env)

# ------------------------------------------------------
# 2. Define state & action size
def main(argv):
    try:
        opts, args = getopt.getopt(argv, "n:", ["n_trials="])
    except getopt.GetoptError:
        print('test_navigation_single_agent.py -n <n_trials>')
        sys.exit(2)
    for opt, arg in opts:
        if opt in ('-n', '--n_trials'):
            n_trials = int(arg)

    random.seed(1)
    np.random.seed(1)

    ######## TEST SET SELECTION - PARAMETERS ########
    test_multi_agent_setup = 1        # 1 for Medium size test, 2 for Big size test
    test_n_agents = 5                 # Number of agents to test (3 - 5 - 7 for Medium, 5 - 7 - 10 for Big)
    test_malfunctions_enabled = True  # Malfunctions enabled?
    test_agents_one_speed = True      # Test agents with the same speed (1) or with 4 different speeds?
    #################################################

    # Medium size
    if test_multi_agent_setup == 1:
        x_dim = 16 * 3
        y_dim = 9 * 3
        max_num_cities = 5
        max_rails_between_cities = 2
        max_rails_in_city = 3

    # Big size
    if test_multi_agent_setup == 2:
        x_dim = 16 * 4
        y_dim = 9 * 4
        max_num_cities = 9
        max_rails_between_cities = 5
        max_rails_in_city = 5

    stochastic_data = {'malfunction_rate': 80,  # Rate of malfunction occurrence of single agent
                       'min_duration': 15,  # Minimal duration of malfunction
                       'max_duration': 50  # Max duration of malfunction
                       }

    # Custom observation builder
    tree_depth = 2
    TreeObservation = TreeObsForRailEnv(max_depth=tree_depth,
                                        predictor=ShortestPathPredictorForRailEnv(20))

    np.savetxt(fname=path.join('NetsTest', 'info.txt'),
               X=[x_dim, y_dim, test_n_agents, max_num_cities,
                  max_rails_between_cities, max_rails_in_city, tree_depth],
               delimiter=';')

    # Different agent types (trains) with different speeds.
    if test_agents_one_speed:
        speed_ration_map = {1.: 1.,        # Fast passenger train
                            1. / 2.: 0.0,  # Fast freight train
                            1. / 3.: 0.0,  # Slow commuter train
                            1. / 4.: 0.0}  # Slow freight train
    else:
        speed_ration_map = {1.: 0.25,       # Fast passenger train
                            1. / 2.: 0.25,  # Fast freight train
                            1. / 3.: 0.25,  # Slow commuter train
                            1. / 4.: 0.25}  # Slow freight train

    if test_malfunctions_enabled:
        env = RailEnv(width=x_dim,
                      height=y_dim,
                      rail_generator=sparse_rail_generator(
                          max_num_cities=max_num_cities,  # Number of cities in map (where train stations are)
                          seed=14,  # Random seed
                          grid_mode=False,
                          max_rails_between_cities=max_rails_between_cities,
                          max_rails_in_city=max_rails_in_city),
                      schedule_generator=sparse_schedule_generator(speed_ration_map),
                      malfunction_generator_and_process_data=malfunction_from_params(stochastic_data),
                      number_of_agents=test_n_agents,
                      obs_builder_object=TreeObservation)
    else:
        env = RailEnv(width=x_dim,
                      height=y_dim,
                      rail_generator=sparse_rail_generator(
                          max_num_cities=max_num_cities,  # Number of cities in map (where train stations are)
                          seed=14,  # Random seed
                          grid_mode=False,
                          max_rails_between_cities=max_rails_between_cities,
                          max_rails_in_city=max_rails_in_city),
                      schedule_generator=sparse_schedule_generator(speed_ration_map),
                      number_of_agents=test_n_agents,
                      obs_builder_object=TreeObservation)
    env.reset()

    # env_renderer = RenderTool(env, gl="PILSVG", )
    env_renderer = RenderTool(env, gl="PILSVG",
                              agent_render_variant=AgentRenderVariant.AGENT_SHOWS_OPTIONS_AND_BOX,
                              show_debug=False,
                              screen_height=(1080 * 0.8),  # Adjust these parameters to fit your resolution
                              screen_width=(1920 * 0.8))

    num_features_per_node = env.obs_builder.observation_dim
    nr_nodes = 0
    for i in range(tree_depth + 1):
        nr_nodes += np.power(4, i)
    state_size = num_features_per_node * nr_nodes
    action_size = 5

    # We set the number of episodes we would like to train on
    if 'n_trials' not in locals():
        n_trials = 15000

    # max_steps computation
    speed_weighted_mean = 0
    for key in speed_ration_map.keys():
        speed_weighted_mean += key * speed_ration_map[key]
    # max_steps = int(3 * (env.height + env.width))
    max_steps = int((1 / speed_weighted_mean) * 3 * (env.height + env.width))

    # eps = 1.
    # eps_end = 0.005
    # eps_decay = 0.9995

    # And some variables to keep track of the performance
    action_dict = dict()
    final_action_dict = dict()
    action_prob_list = []
    scores_window = deque(maxlen=100)
    done_window = deque(maxlen=100)
    scores = []
    scores_list = []
    deadlock_list = []
    dones_list_window = []
    dones_list = []
    action_prob = [0] * action_size
    agent_obs = [None] * env.get_num_agents()
    agent_next_obs = [None] * env.get_num_agents()  # Useless
    agent = Agent(state_size, action_size)

    # LOAD MODEL WEIGHTS TO TEST
    agent.qnetwork_local.load_state_dict(
        torch.load(path.join('NetsTest', 'navigator_checkpoint3800_multi10_deadlock_global10.pth')))

    record_images = False
    frame_step = 0

    for trials in range(1, n_trials + 1):
        # Reset environment
        obs, info = env.reset()  # (True, True)
        env_renderer.reset()

        # Build agent specific observations
        for a in range(env.get_num_agents()):
            agent_obs[a] = normalize_observation(obs[a], tree_depth, observation_radius=10)

        # Reset score and done
        score = 0
        env_done = 0

        # Run episode
        for step in range(max_steps):
            # Action
            for a in range(env.get_num_agents()):
                if info['action_required'][a]:
                    action = agent.act(agent_obs[a], eps=0.)
                    action_prob[action] += 1
                else:
                    action = 0
                action_dict.update({a: action})

            # Environment step
            obs, all_rewards, done, deadlocks, info = env.step(action_dict)

            env_renderer.render_env(show=True, show_predictions=True, show_observations=False)

            # Build agent specific observations and normalize
            for a in range(env.get_num_agents()):
                if obs[a]:
                    agent_obs[a] = normalize_observation(obs[a], tree_depth, observation_radius=10)
                score += all_rewards[a] / env.get_num_agents()

            if done['__all__']:
                break

        # Collect information about training
        tasks_finished = 0
        for _idx in range(env.get_num_agents()):
            if done[_idx] == 1:
                tasks_finished += 1
        done_window.append(tasks_finished / max(1, env.get_num_agents()))
        scores_window.append(score / max_steps)  # save most recent score
        scores.append(np.mean(scores_window))
        dones_list.append(tasks_finished / max(1, env.get_num_agents()))
        dones_list_window.append((np.mean(done_window)))
        scores_list.append(score / max_steps)
        deadlock_list.append(deadlocks.count(1) / max(1, env.get_num_agents()))

        if np.sum(action_prob) == 0:
            action_prob_normalized = [0] * action_size
        else:
            action_prob_normalized = action_prob / np.sum(action_prob)

        print(
            '\rTesting {} Agents on ({},{}).\t Episode {}\t Score: {:.3f}\tDones: {:.2f}%\tDeadlocks: {:.2f}\t Action Probabilities: \t {}'.format(
                env.get_num_agents(), x_dim, y_dim,
                trials,
                score / max_steps,
                100 * tasks_finished / max(1, env.get_num_agents()),
                deadlocks.count(1) / max(1, env.get_num_agents()),
                action_prob_normalized), end=" ")

        # if trials % 100 == 0:
        action_prob_list.append(action_prob_normalized)
        action_prob = [0] * action_size

        if trials % 50 == 0:
            np.savetxt(fname=path.join('NetsTest', 'test_metrics.csv'),
                       X=np.transpose(np.asarray([scores_list, scores, dones_list,
                                                  dones_list_window, deadlock_list])),
                       delimiter=';', newline='\n')
            np.savetxt(fname=path.join('NetsTest', 'test_action_prob.csv'),
                       X=np.asarray(action_prob_list),
                       delimiter=';', newline='\n')
def train_agent(env_params, train_params):
    # Environment parameters
    n_agents = env_params.n_agents
    x_dim = env_params.x_dim
    y_dim = env_params.y_dim
    n_cities = env_params.n_cities
    max_rails_between_cities = env_params.max_rails_between_cities
    max_rails_in_city = env_params.max_rails_in_city
    seed = env_params.seed

    # Observation parameters
    observation_tree_depth = env_params.observation_tree_depth
    observation_radius = env_params.observation_radius
    observation_max_path_depth = env_params.observation_max_path_depth

    # Training parameters
    eps_start = train_params.eps_start
    eps_end = train_params.eps_end
    eps_decay = train_params.eps_decay
    n_episodes = train_params.n_episodes
    checkpoint_interval = train_params.checkpoint_interval
    n_eval_episodes = train_params.n_evaluation_episodes

    # Set the seeds
    random.seed(seed)
    np.random.seed(seed)

    # Break agents from time to time
    malfunction_parameters = MalfunctionParameters(
        malfunction_rate=1. / 10000,  # Rate of malfunctions
        min_duration=15,  # Minimal duration
        max_duration=50  # Max duration
    )

    # Observation builder
    predictor = ShortestPathPredictorForRailEnv(observation_max_path_depth)
    tree_observation = TreeObsForRailEnv(max_depth=observation_tree_depth,
                                         predictor=predictor)

    # Fraction of trains with each speed
    speed_profiles = {
        1.: 1.0,       # Fast passenger train
        1. / 2.: 0.0,  # Fast freight train
        1. / 3.: 0.0,  # Slow commuter train
        1. / 4.: 0.0   # Slow freight train
    }

    # Setup the environment
    env = RailEnv(width=x_dim,
                  height=y_dim,
                  rail_generator=sparse_rail_generator(
                      max_num_cities=n_cities,
                      grid_mode=False,
                      max_rails_between_cities=max_rails_between_cities,
                      max_rails_in_city=max_rails_in_city),
                  schedule_generator=sparse_schedule_generator(speed_profiles),
                  number_of_agents=n_agents,
                  malfunction_generator_and_process_data=malfunction_from_params(
                      malfunction_parameters),
                  obs_builder_object=tree_observation,
                  random_seed=seed)
    env.reset(regenerate_schedule=True, regenerate_rail=True)

    # Setup renderer
    if train_params.render:
        env_renderer = RenderTool(env, gl="PGL")

    # Calculate the state size given the depth of the tree observation and the number of features
    n_features_per_node = env.obs_builder.observation_dim
    n_nodes = 0
    for i in range(observation_tree_depth + 1):
        n_nodes += np.power(4, i)
    state_size = n_features_per_node * n_nodes

    # The action space of flatland is 5 discrete actions
    action_size = 5

    # Max number of steps per episode
    # This is the official formula used during evaluations
    # See details in flatland.envs.schedule_generators.sparse_schedule_generator
    max_steps = int(4 * 2 * (env.height + env.width + (n_agents / n_cities)))

    action_count = [0] * action_size
    action_dict = dict()
    agent_obs = [None] * env.get_num_agents()
    agent_prev_obs = [None] * env.get_num_agents()
    agent_prev_action = [2] * env.get_num_agents()
    update_values = False
    smoothed_normalized_score = -1.0
    smoothed_eval_normalized_score = -1.0
    smoothed_completion = 0.0
    smoothed_eval_completion = 0.0

    # Double Dueling DQN policy
    policy = DDDQNPolicy(state_size, action_size, train_params)

    # TensorBoard writer
    writer = SummaryWriter()
    writer.add_hparams(vars(train_params), {})
    writer.add_hparams(vars(env_params), {})

    training_timer = Timer()
    training_timer.start()

    print("\n🚉 Training {} trains on {}x{} grid for {} episodes, evaluating on {} episodes every {} episodes.\n".format(
        env.get_num_agents(), x_dim, y_dim, n_episodes, n_eval_episodes, checkpoint_interval))

    for episode_idx in range(n_episodes + 1):
        # Timers
        step_timer = Timer()
        reset_timer = Timer()
        learn_timer = Timer()
        preproc_timer = Timer()

        # Reset environment
        reset_timer.start()
        obs, info = env.reset(regenerate_rail=True, regenerate_schedule=True)
        reset_timer.end()

        if train_params.render:
            env_renderer.set_new_rail()

        score = 0
        nb_steps = 0
        actions_taken = []

        # Build agent specific observations
        for agent in env.get_agent_handles():
            if obs[agent]:
                agent_obs[agent] = normalize_observation(obs[agent], observation_tree_depth,
                                                         observation_radius=observation_radius)
                agent_prev_obs[agent] = agent_obs[agent].copy()

        # Run episode
        for step in range(max_steps - 1):
            for agent in env.get_agent_handles():
                if info['action_required'][agent]:
                    # If an action is required, we want to store the obs at that step as well as the action
                    update_values = True
                    action = policy.act(agent_obs[agent], eps=eps_start)
                    action_count[action] += 1
                    actions_taken.append(action)
                else:
                    update_values = False
                    action = 0
                action_dict.update({agent: action})

            # Environment step
            step_timer.start()
            next_obs, all_rewards, done, info = env.step(action_dict)
            step_timer.end()

            if train_params.render and episode_idx % checkpoint_interval == 0:
                env_renderer.render_env(show=True,
                                        frames=False,
                                        show_observations=False,
                                        show_predictions=False)

            for agent in range(env.get_num_agents()):
                # Update replay buffer and train agent
                # Only update the values when we are done or when an action was taken and thus relevant information is present
                if update_values or done[agent]:
                    learn_timer.start()
                    policy.step(agent_prev_obs[agent], agent_prev_action[agent],
                                all_rewards[agent], agent_obs[agent], done[agent])
                    learn_timer.end()

                    agent_prev_obs[agent] = agent_obs[agent].copy()
                    agent_prev_action[agent] = action_dict[agent]

                # Preprocess the new observations
                if next_obs[agent]:
                    preproc_timer.start()
                    agent_obs[agent] = normalize_observation(next_obs[agent], observation_tree_depth,
                                                             observation_radius=observation_radius)
                    preproc_timer.end()

                score += all_rewards[agent]

            nb_steps = step

            if done['__all__']:
                break

        # Epsilon decay
        eps_start = max(eps_end, eps_decay * eps_start)

        # Collect information about training
        tasks_finished = sum(done[idx] for idx in env.get_agent_handles())
        completion = tasks_finished / max(1, env.get_num_agents())
        normalized_score = score / (max_steps * env.get_num_agents())
        action_probs = action_count / np.sum(action_count)
        action_count = [1] * action_size

        # Smoothed values for terminal display and for more stable hyper-parameter tuning
        smoothing = 0.99
        smoothed_normalized_score = smoothed_normalized_score * smoothing + normalized_score * (1.0 - smoothing)
        smoothed_completion = smoothed_completion * smoothing + completion * (1.0 - smoothing)

        # Print logs
        if episode_idx % checkpoint_interval == 0:
            torch.save(policy.qnetwork_local,
                       './checkpoints/origin_multi-' + str(episode_idx) + '.pth')
            if train_params.render:
                env_renderer.close_window()

        print('\r🚂 Episode {}'
              '\t 🏆 Score: {:.3f}'
              ' Avg: {:.3f}'
              '\t 💯 Done: {:.2f}%'
              ' Avg: {:.2f}%'
              '\t 🎲 Epsilon: {:.2f} '
              '\t 🔀 Action Probs: {}'.format(episode_idx,
                                              normalized_score, smoothed_normalized_score,
                                              100 * completion, 100 * smoothed_completion,
                                              eps_start,
                                              format_action_prob(action_probs)),
              end=" ")

        # Evaluate policy
        if episode_idx % train_params.checkpoint_interval == 0:
            scores, completions, nb_steps_eval = eval_policy(env, policy, n_eval_episodes, max_steps)
            writer.add_scalar("evaluation/scores_min", np.min(scores), episode_idx)
            writer.add_scalar("evaluation/scores_max", np.max(scores), episode_idx)
            writer.add_scalar("evaluation/scores_mean", np.mean(scores), episode_idx)
            writer.add_scalar("evaluation/scores_std", np.std(scores), episode_idx)
            writer.add_histogram("evaluation/scores", np.array(scores), episode_idx)
            writer.add_scalar("evaluation/completions_min", np.min(completions), episode_idx)
            writer.add_scalar("evaluation/completions_max", np.max(completions), episode_idx)
            writer.add_scalar("evaluation/completions_mean", np.mean(completions), episode_idx)
            writer.add_scalar("evaluation/completions_std", np.std(completions), episode_idx)
            writer.add_histogram("evaluation/completions", np.array(completions), episode_idx)
            writer.add_scalar("evaluation/nb_steps_min", np.min(nb_steps_eval), episode_idx)
            writer.add_scalar("evaluation/nb_steps_max", np.max(nb_steps_eval), episode_idx)
            writer.add_scalar("evaluation/nb_steps_mean", np.mean(nb_steps_eval), episode_idx)
            writer.add_scalar("evaluation/nb_steps_std", np.std(nb_steps_eval), episode_idx)
            writer.add_histogram("evaluation/nb_steps", np.array(nb_steps_eval), episode_idx)

            smoothing = 0.9
            smoothed_eval_normalized_score = smoothed_eval_normalized_score * smoothing + np.mean(scores) * (1.0 - smoothing)
            smoothed_eval_completion = smoothed_eval_completion * smoothing + np.mean(completions) * (1.0 - smoothing)
            writer.add_scalar("evaluation/smoothed_score", smoothed_eval_normalized_score, episode_idx)
            writer.add_scalar("evaluation/smoothed_completion", smoothed_eval_completion, episode_idx)

        # Save logs to tensorboard
        writer.add_scalar("training/score", normalized_score, episode_idx)
        writer.add_scalar("training/smoothed_score", smoothed_normalized_score, episode_idx)
        writer.add_scalar("training/completion", np.mean(completion), episode_idx)
        writer.add_scalar("training/smoothed_completion", np.mean(smoothed_completion), episode_idx)
        writer.add_scalar("training/nb_steps", nb_steps, episode_idx)
        writer.add_histogram("actions/distribution", np.array(actions_taken), episode_idx)
        writer.add_scalar("actions/nothing", action_probs[RailEnvActions.DO_NOTHING], episode_idx)
        writer.add_scalar("actions/left", action_probs[RailEnvActions.MOVE_LEFT], episode_idx)
        writer.add_scalar("actions/forward", action_probs[RailEnvActions.MOVE_FORWARD], episode_idx)
        writer.add_scalar("actions/right", action_probs[RailEnvActions.MOVE_RIGHT], episode_idx)
        writer.add_scalar("actions/stop", action_probs[RailEnvActions.STOP_MOVING], episode_idx)
        writer.add_scalar("training/epsilon", eps_start, episode_idx)
        writer.add_scalar("training/buffer_size", len(policy.memory), episode_idx)
        writer.add_scalar("training/loss", policy.loss, episode_idx)
        writer.add_scalar("timer/reset", reset_timer.get(), episode_idx)
        writer.add_scalar("timer/step", step_timer.get(), episode_idx)
        writer.add_scalar("timer/learn", learn_timer.get(), episode_idx)
        writer.add_scalar("timer/preproc", preproc_timer.get(), episode_idx)
        writer.add_scalar("timer/total", training_timer.get_current(), episode_idx)
# flatland environment config
rail_gen_cfg: Dict = {
    "max_num_cities": 4,
    "max_rails_between_cities": 2,
    "max_rails_in_city": 3,
    "grid_mode": True,
    "seed": 42,
}

flatland_env_config: Dict = {
    "number_of_agents": 2,
    "width": 25,
    "height": 25,
    "rail_generator": sparse_rail_generator(**rail_gen_cfg),
    "schedule_generator": sparse_schedule_generator(),
    "obs_builder_object": TreeObsForRailEnv(max_depth=2),
}


def main(_: Any) -> None:
    # Environment.
    environment_factory = functools.partial(
        flatland_env_factory, env_config=flatland_env_config, include_agent_info=False
    )

    # Networks.
    network_factory = lp_utils.partial_kwargs(madqn.make_default_networks)

    # Checkpointer appends "Checkpoints" to checkpoint_dir
    checkpoint_dir = f"{FLAGS.base_dir}/{FLAGS.mava_id}"
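# Since flatland_env_config holds exactly the RailEnv constructor keywords,
# a plain environment can also be built from it directly; a minimal sketch,
# assuming RailEnv is imported from flatland.envs.rail_env:
#
#   env = RailEnv(**flatland_env_config)
#   obs, info = env.reset()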
WINDOW_LENGTH = 22  # @param{type: "integer"}

random_rail_generator = complex_rail_generator(
    nr_start_goal=10,  # @param{type:"integer"} number of start and end goals
                       # connections, the higher the easier it should be for
                       # the trains
    nr_extra=10,  # @param{type:"integer"} extra connections
                  # (useful for alternate paths), the higher the easier
    min_dist=10,
    max_dist=99999,
    seed=seed)

env = RailEnv(width=width,
              height=height,
              rail_generator=random_rail_generator,
              obs_builder_object=TreeObsForRailEnv(tree_depth),
              number_of_agents=num_agents)
obs, info = env.reset()
env_renderer = RenderTool(env)

state_shape = normalize_observation(obs[0], tree_depth, radius_observation).shape
action_shape = (5, )

import tensorflow as tf
import numpy as np
from tensorflow.keras.models import load_model
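# Cross-check for state_shape above: a depth-d tree observation flattens to
# sum_{i=0..d} 4^i nodes times the per-node feature count (11 for the standard
# Flatland tree obs used in these snippets), so tree_depth = 2 gives
# (1 + 4 + 16) * 11 = 231 features. A sketch, assuming env is built as above:
#
#   n_nodes = sum(4 ** i for i in range(tree_depth + 1))
#   expected = n_nodes * env.obs_builder.observation_dim
#   assert state_shape == (expected,)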
def train_agent(train_params, train_env_params, eval_env_params, obs_params):
    # Environment parameters
    n_agents = train_env_params.n_agents
    x_dim = train_env_params.x_dim
    y_dim = train_env_params.y_dim
    n_cities = train_env_params.n_cities
    max_rails_between_cities = train_env_params.max_rails_between_cities
    max_rails_in_city = train_env_params.max_rails_in_city
    seed = train_env_params.seed

    # Unique ID for this training
    now = datetime.now()
    training_id = now.strftime('%y%m%d%H%M%S')

    # Observation parameters
    observation_tree_depth = obs_params.observation_tree_depth
    observation_radius = obs_params.observation_radius
    observation_max_path_depth = obs_params.observation_max_path_depth

    # Training parameters
    eps_start = train_params.eps_start
    eps_end = train_params.eps_end
    eps_decay = train_params.eps_decay
    n_episodes = train_params.n_episodes
    checkpoint_interval = train_params.checkpoint_interval
    n_eval_episodes = train_params.n_evaluation_episodes
    restore_replay_buffer = train_params.restore_replay_buffer
    save_replay_buffer = train_params.save_replay_buffer

    # Set the seeds
    random.seed(seed)
    np.random.seed(seed)

    # Observation builder
    predictor = ShortestPathPredictorForRailEnv(observation_max_path_depth)
    tree_observation = TreeObsForRailEnv(max_depth=observation_tree_depth,
                                         predictor=predictor)

    # Setup the environments
    train_env = create_rail_env(train_env_params, tree_observation)
    train_env.reset(regenerate_schedule=True, regenerate_rail=True)
    eval_env = create_rail_env(eval_env_params, tree_observation)
    eval_env.reset(regenerate_schedule=True, regenerate_rail=True)

    # Setup renderer
    if train_params.render:
        env_renderer = RenderTool(train_env, gl="PGL")

    # Calculate the state size given the depth of the tree observation and the number of features
    n_features_per_node = train_env.obs_builder.observation_dim
    n_nodes = sum([np.power(4, i) for i in range(observation_tree_depth + 1)])
    state_size = n_features_per_node * n_nodes

    # The action space of flatland is 5 discrete actions
    action_size = 5

    # Max number of steps per episode
    # This is the official formula used during evaluations
    # See details in flatland.envs.schedule_generators.sparse_schedule_generator
    # max_steps = int(4 * 2 * (env.height + env.width + (n_agents / n_cities)))
    max_steps = train_env._max_episode_steps

    action_count = [0] * action_size
    action_dict = dict()
    agent_obs = [None] * n_agents
    agent_prev_obs = [None] * n_agents
    agent_prev_action = [2] * n_agents
    update_values = [False] * n_agents

    # Smoothed values used as target for hyperparameter tuning
    smoothed_normalized_score = -1.0
    smoothed_eval_normalized_score = -1.0
    smoothed_completion = 0.0
    smoothed_eval_completion = 0.0

    # Double Dueling DQN policy
    policy = DDDQNPolicy(state_size, action_size, train_params)

    # Loads existing replay buffer
    if restore_replay_buffer:
        try:
            policy.load_replay_buffer(restore_replay_buffer)
            policy.test()
        except RuntimeError as e:
            print("\n🛑 Couldn't load replay buffer, were the experiences generated using the same tree depth?")
            print(e)
            exit(1)

    print("\n💾 Replay buffer status: {}/{} experiences".format(
        len(policy.memory.memory), train_params.buffer_size))

    hdd = psutil.disk_usage('/')
    if save_replay_buffer and (hdd.free / (2**30)) < 500.0:
        print("⚠️ Careful! Saving replay buffers will quickly consume a lot of disk space. You have {:.2f}gb left.".format(
            hdd.free / (2**30)))

    # TensorBoard writer
    writer = SummaryWriter()
    writer.add_hparams(vars(train_params), {})
    writer.add_hparams(vars(train_env_params), {})
    writer.add_hparams(vars(obs_params), {})

    training_timer = Timer()
    training_timer.start()

    print("\n🚉 Training {} trains on {}x{} grid for {} episodes, evaluating on {} episodes every {} episodes. Training id '{}'.\n".format(
        train_env.get_num_agents(), x_dim, y_dim,
        n_episodes, n_eval_episodes, checkpoint_interval, training_id))

    for episode_idx in range(n_episodes + 1):
        step_timer = Timer()
        reset_timer = Timer()
        learn_timer = Timer()
        preproc_timer = Timer()
        inference_timer = Timer()

        # Reset environment
        reset_timer.start()
        obs, info = train_env.reset(regenerate_rail=True, regenerate_schedule=True)
        reset_timer.end()

        if train_params.render:
            env_renderer.set_new_rail()

        score = 0
        nb_steps = 0
        actions_taken = []

        # Build initial agent-specific observations
        for agent in train_env.get_agent_handles():
            if obs[agent]:
                agent_obs[agent] = normalize_observation(obs[agent], observation_tree_depth,
                                                         observation_radius=observation_radius)
                agent_prev_obs[agent] = agent_obs[agent].copy()

        # Run episode
        for step in range(max_steps - 1):
            inference_timer.start()
            for agent in train_env.get_agent_handles():
                if info['action_required'][agent]:
                    update_values[agent] = True
                    action = policy.act(agent_obs[agent], eps=eps_start)
                    action_count[action] += 1
                    actions_taken.append(action)
                else:
                    # An action is not required if the train hasn't joined the railway network,
                    # if it already reached its target, or if it is currently malfunctioning.
                    update_values[agent] = False
                    action = 0
                action_dict.update({agent: action})
            inference_timer.end()

            # Environment step
            step_timer.start()
            next_obs, all_rewards, done, info = train_env.step(action_dict)
            step_timer.end()

            # Render an episode at some interval
            if train_params.render and episode_idx % checkpoint_interval == 0:
                env_renderer.render_env(show=True,
                                        frames=False,
                                        show_observations=False,
                                        show_predictions=False)

            # Update replay buffer and train agent
            for agent in train_env.get_agent_handles():
                if update_values[agent] or done['__all__']:
                    # Only learn from timesteps where something happened
                    learn_timer.start()
                    policy.step(agent_prev_obs[agent], agent_prev_action[agent],
                                all_rewards[agent], agent_obs[agent], done[agent])
                    learn_timer.end()

                    agent_prev_obs[agent] = agent_obs[agent].copy()
                    agent_prev_action[agent] = action_dict[agent]

                # Preprocess the new observations
                if next_obs[agent]:
                    preproc_timer.start()
                    agent_obs[agent] = normalize_observation(next_obs[agent], observation_tree_depth,
                                                             observation_radius=observation_radius)
                    preproc_timer.end()

                score += all_rewards[agent]

            nb_steps = step

            if done['__all__']:
                break

        # Epsilon decay
        eps_start = max(eps_end, eps_decay * eps_start)

        # Collect information about training
        tasks_finished = sum(done[idx] for idx in train_env.get_agent_handles())
        completion = tasks_finished / max(1, train_env.get_num_agents())
        normalized_score = score / (max_steps * train_env.get_num_agents())
        action_probs = action_count / np.sum(action_count)
        action_count = [1] * action_size

        smoothing = 0.99
        smoothed_normalized_score = smoothed_normalized_score * smoothing + normalized_score * (1.0 - smoothing)
        smoothed_completion = smoothed_completion * smoothing + completion * (1.0 - smoothing)

        # Print logs
        if episode_idx % checkpoint_interval == 0:
            torch.save(policy.qnetwork_local,
                       './checkpoints/' + training_id + '-' + str(episode_idx) + '.pth')

            if save_replay_buffer:
                policy.save_replay_buffer('./replay_buffers/' + training_id + '-' + str(episode_idx) + '.pkl')

            if train_params.render:
                env_renderer.close_window()

        print('\r🚂 Episode {}'
              '\t 🏆 Score: {:.3f}'
              ' Avg: {:.3f}'
              '\t 💯 Done: {:.2f}%'
              ' Avg: {:.2f}%'
              '\t 🎲 Epsilon: {:.3f} '
              '\t 🔀 Action Probs: {}'.format(episode_idx,
                                              normalized_score, smoothed_normalized_score,
                                              100 * completion, 100 * smoothed_completion,
                                              eps_start,
                                              format_action_prob(action_probs)),
              end=" ")

        # Evaluate policy and log results at some interval
        if episode_idx % checkpoint_interval == 0 and n_eval_episodes > 0:
            scores, completions, nb_steps_eval = eval_policy(eval_env, policy, train_params, obs_params)

            writer.add_scalar("evaluation/scores_min", np.min(scores), episode_idx)
            writer.add_scalar("evaluation/scores_max", np.max(scores), episode_idx)
            writer.add_scalar("evaluation/scores_mean", np.mean(scores), episode_idx)
            writer.add_scalar("evaluation/scores_std", np.std(scores), episode_idx)
            writer.add_histogram("evaluation/scores", np.array(scores), episode_idx)
            writer.add_scalar("evaluation/completions_min", np.min(completions), episode_idx)
            writer.add_scalar("evaluation/completions_max", np.max(completions), episode_idx)
            writer.add_scalar("evaluation/completions_mean", np.mean(completions), episode_idx)
            writer.add_scalar("evaluation/completions_std", np.std(completions), episode_idx)
            writer.add_histogram("evaluation/completions", np.array(completions), episode_idx)
            writer.add_scalar("evaluation/nb_steps_min", np.min(nb_steps_eval), episode_idx)
            writer.add_scalar("evaluation/nb_steps_max", np.max(nb_steps_eval), episode_idx)
            writer.add_scalar("evaluation/nb_steps_mean", np.mean(nb_steps_eval), episode_idx)
            writer.add_scalar("evaluation/nb_steps_std", np.std(nb_steps_eval), episode_idx)
            writer.add_histogram("evaluation/nb_steps", np.array(nb_steps_eval), episode_idx)

            smoothing = 0.9
            smoothed_eval_normalized_score = smoothed_eval_normalized_score * smoothing + np.mean(scores) * (1.0 - smoothing)
            smoothed_eval_completion = smoothed_eval_completion * smoothing + np.mean(completions) * (1.0 - smoothing)
            writer.add_scalar("evaluation/smoothed_score", smoothed_eval_normalized_score, episode_idx)
            writer.add_scalar("evaluation/smoothed_completion", smoothed_eval_completion, episode_idx)

        # Save logs to tensorboard
        writer.add_scalar("training/score", normalized_score, episode_idx)
        writer.add_scalar("training/smoothed_score", smoothed_normalized_score, episode_idx)
        writer.add_scalar("training/completion", np.mean(completion), episode_idx)
        writer.add_scalar("training/smoothed_completion", np.mean(smoothed_completion), episode_idx)
        writer.add_scalar("training/nb_steps", nb_steps, episode_idx)
        writer.add_histogram("actions/distribution", np.array(actions_taken), episode_idx)
        writer.add_scalar("actions/nothing", action_probs[RailEnvActions.DO_NOTHING], episode_idx)
        writer.add_scalar("actions/left", action_probs[RailEnvActions.MOVE_LEFT], episode_idx)
        writer.add_scalar("actions/forward", action_probs[RailEnvActions.MOVE_FORWARD], episode_idx)
        writer.add_scalar("actions/right", action_probs[RailEnvActions.MOVE_RIGHT], episode_idx)
        writer.add_scalar("actions/stop", action_probs[RailEnvActions.STOP_MOVING], episode_idx)
        writer.add_scalar("training/epsilon", eps_start, episode_idx)
        writer.add_scalar("training/buffer_size", len(policy.memory), episode_idx)
        writer.add_scalar("training/loss", policy.loss, episode_idx)
        writer.add_scalar("timer/reset", reset_timer.get(), episode_idx)
        writer.add_scalar("timer/step", step_timer.get(), episode_idx)
        writer.add_scalar("timer/learn", learn_timer.get(), episode_idx)
        writer.add_scalar("timer/preproc", preproc_timer.get(), episode_idx)
        writer.add_scalar("timer/total", training_timer.get_current(), episode_idx)