def create_rail_env(env_params, tree_observation):
    n_agents = env_params.n_agents
    x_dim = env_params.x_dim
    y_dim = env_params.y_dim
    n_cities = env_params.n_cities
    max_rails_between_cities = env_params.max_rails_between_cities
    max_rails_in_city = env_params.max_rails_in_city
    seed = env_params.seed

    # Break agents from time to time
    malfunction_parameters = MalfunctionParameters(
        malfunction_rate=env_params.malfunction_rate,
        min_duration=20,
        max_duration=50)

    return RailEnv(
        width=x_dim,
        height=y_dim,
        rail_generator=sparse_rail_generator(
            max_num_cities=n_cities,
            grid_mode=False,
            max_rails_between_cities=max_rails_between_cities,
            max_rails_in_city=max_rails_in_city),
        schedule_generator=sparse_schedule_generator(),
        number_of_agents=n_agents,
        malfunction_generator_and_process_data=malfunction_from_params(
            malfunction_parameters),
        obs_builder_object=tree_observation,
        random_seed=seed)
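
# A minimal usage sketch for create_rail_env above. The Namespace fields and the
# flatland imports are assumptions inferred from the attributes the function reads
# (env_params.n_agents, env_params.malfunction_rate, etc.); the concrete values
# are illustrative only.
from argparse import Namespace

from flatland.envs.observations import TreeObsForRailEnv
from flatland.envs.predictions import ShortestPathPredictorForRailEnv

example_params = Namespace(
    n_agents=5, x_dim=25, y_dim=25, n_cities=3,
    max_rails_between_cities=2, max_rails_in_city=3,
    malfunction_rate=1. / 2000, seed=42)
tree_obs = TreeObsForRailEnv(
    max_depth=2, predictor=ShortestPathPredictorForRailEnv(30))
env = create_rail_env(example_params, tree_obs)
obs, info = env.reset()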
def test_malfunction_before_entry():
    """Tests that malfunctions are working properly for agents before entering the environment!"""
    # Set fixed malfunction duration for this test
    stochastic_data = MalfunctionParameters(malfunction_rate=2,  # Rate of malfunction occurrence
                                            min_duration=10,  # Minimal duration of malfunction
                                            max_duration=10  # Max duration of malfunction
                                            )

    rail, rail_map = make_simple_rail2()

    env = RailEnv(width=25,
                  height=30,
                  rail_generator=rail_from_grid_transition_map(rail),
                  schedule_generator=random_schedule_generator(),
                  number_of_agents=10,
                  malfunction_generator_and_process_data=malfunction_from_params(stochastic_data),
                  obs_builder_object=SingleAgentNavigationObs()
                  )
    env.reset(False, False, False, random_seed=10)
    env.agents[0].target = (0, 0)

    # Test initial malfunction values for all agents
    # we want some agents to be malfunctioning already and some to be working
    # we want different next_malfunction values for the agents
    assert env.agents[0].malfunction_data['malfunction'] == 0
    assert env.agents[1].malfunction_data['malfunction'] == 10
    assert env.agents[2].malfunction_data['malfunction'] == 0
    assert env.agents[3].malfunction_data['malfunction'] == 10
    assert env.agents[4].malfunction_data['malfunction'] == 10
    assert env.agents[5].malfunction_data['malfunction'] == 10
    assert env.agents[6].malfunction_data['malfunction'] == 10
    assert env.agents[7].malfunction_data['malfunction'] == 10
    assert env.agents[8].malfunction_data['malfunction'] == 10
    assert env.agents[9].malfunction_data['malfunction'] == 10
def test_malfunction_values_and_behavior():
    """
    Test that the malfunction counter counts down as desired

    Returns
    -------
    """
    # Set fixed malfunction duration for this test
    rail, rail_map = make_simple_rail2()
    action_dict: Dict[int, RailEnvActions] = {}
    stochastic_data = MalfunctionParameters(malfunction_rate=0.001,  # Rate of malfunction occurrence
                                            min_duration=10,  # Minimal duration of malfunction
                                            max_duration=10  # Max duration of malfunction
                                            )

    env = RailEnv(width=25,
                  height=30,
                  rail_generator=rail_from_grid_transition_map(rail),
                  schedule_generator=random_schedule_generator(),
                  number_of_agents=1,
                  malfunction_generator_and_process_data=malfunction_from_params(stochastic_data),
                  obs_builder_object=SingleAgentNavigationObs()
                  )

    env.reset(False, False, activate_agents=True, random_seed=10)

    # Assertions
    assert_list = [9, 8, 7, 6, 5, 4, 3, 2, 1, 0, 10, 9, 8, 7, 6, 5]
    for time_step in range(15):
        # Move in the env
        env.step(action_dict)
        # Check that the malfunction countdown decreases as expected
        assert env.agents[0].malfunction_data['malfunction'] == assert_list[time_step]
def test_malfunction_from_params():
    """
    Test loading malfunction parameters from MalfunctionParameters

    Returns
    -------
    """
    stochastic_data = MalfunctionParameters(
        malfunction_rate=1000,  # Rate of malfunction occurrence
        min_duration=2,  # Minimal duration of malfunction
        max_duration=5  # Max duration of malfunction
    )

    rail, rail_map = make_simple_rail2()

    env = RailEnv(
        width=25,
        height=30,
        rail_generator=rail_from_grid_transition_map(rail),
        schedule_generator=random_schedule_generator(),
        number_of_agents=10,
        malfunction_generator_and_process_data=malfunction_from_params(
            stochastic_data))
    env.reset()

    assert env.malfunction_process_data.malfunction_rate == 1000
    assert env.malfunction_process_data.min_duration == 2
    assert env.malfunction_process_data.max_duration == 5
def test_malfunction_process():
    # Set fixed malfunction duration for this test
    stochastic_data = MalfunctionParameters(
        malfunction_rate=1,  # Rate of malfunction occurrence
        min_duration=3,  # Minimal duration of malfunction
        max_duration=3  # Max duration of malfunction
    )

    rail, rail_map = make_simple_rail2()

    env = RailEnv(
        width=25,
        height=30,
        rail_generator=rail_from_grid_transition_map(rail),
        schedule_generator=random_schedule_generator(),
        number_of_agents=1,
        malfunction_generator_and_process_data=malfunction_from_params(
            stochastic_data),
        obs_builder_object=SingleAgentNavigationObs())
    obs, info = env.reset(False, False, True, random_seed=10)

    agent_halts = 0
    total_down_time = 0
    agent_old_position = env.agents[0].position

    # Move target to unreachable position in order to not interfere with test
    env.agents[0].target = (0, 0)
    for step in range(100):
        actions = {}

        for i in range(len(obs)):
            actions[i] = np.argmax(obs[i]) + 1

        obs, all_rewards, done, _ = env.step(actions)

        if env.agents[0].malfunction_data['malfunction'] > 0:
            agent_malfunctioning = True
        else:
            agent_malfunctioning = False

        if agent_malfunctioning:
            # Check that agent is not moving while malfunctioning
            assert agent_old_position == env.agents[0].position

        agent_old_position = env.agents[0].position
        total_down_time += env.agents[0].malfunction_data['malfunction']

    # Check that the appropriate number of malfunctions is achieved
    assert env.agents[0].malfunction_data['nr_malfunctions'] == 23, \
        "Actual {}".format(env.agents[0].malfunction_data['nr_malfunctions'])

    # Check that some down time was accumulated while the agent was malfunctioning
    assert total_down_time > 0
def create_rail_env(args, load_env=""):
    '''
    Build a RailEnv object with the specified parameters,
    as described in the .yml file
    '''
    # Check if an environment file is provided
    if load_env:
        rail_generator = rail_from_file(load_env)
    else:
        rail_generator = sparse_rail_generator(
            max_num_cities=args.env.max_cities,
            grid_mode=args.env.grid,
            max_rails_between_cities=args.env.max_rails_between_cities,
            max_rails_in_city=args.env.max_rails_in_cities,
            seed=args.env.seed)

    # Build predictor and observator
    obs_type = args.policy.type.get_true_key()
    if PREDICTORS[obs_type] is ShortestDeviationPathPredictor:
        predictor = PREDICTORS[obs_type](
            max_depth=args.observator.max_depth,
            max_deviations=args.predictor.max_depth)
    else:
        predictor = PREDICTORS[obs_type](max_depth=args.predictor.max_depth)
    observator = OBSERVATORS[obs_type](args.observator.max_depth, predictor)

    # Initialize malfunctions
    malfunctions = None
    if args.env.malfunctions.enabled:
        malfunctions = ParamMalfunctionGen(
            MalfunctionParameters(
                malfunction_rate=args.env.malfunctions.rate,
                min_duration=args.env.malfunctions.min_duration,
                max_duration=args.env.malfunctions.max_duration))

    # Initialize agents speeds
    speed_map = None
    if args.env.variable_speed:
        speed_map = {1.: 0.25, 1. / 2.: 0.25, 1. / 3.: 0.25, 1. / 4.: 0.25}
    schedule_generator = sparse_schedule_generator(speed_map, seed=args.env.seed)

    # Build the environment
    return RailEnvWrapper(params=args,
                          width=args.env.width,
                          height=args.env.height,
                          rail_generator=rail_generator,
                          schedule_generator=schedule_generator,
                          number_of_agents=args.env.num_trains,
                          obs_builder_object=observator,
                          malfunction_generator=malfunctions,
                          remove_agents_at_target=True,
                          random_seed=args.env.seed)
def _launch(self):
    rail_generator = sparse_rail_generator(
        seed=self._config['seed'],
        max_num_cities=self._config['max_num_cities'],
        grid_mode=self._config['grid_mode'],
        max_rails_between_cities=self._config['max_rails_between_cities'],
        max_rails_in_city=self._config['max_rails_in_city'])

    malfunction_generator = no_malfunction_generator()
    if {
            'malfunction_rate', 'malfunction_min_duration',
            'malfunction_max_duration'
    } <= self._config.keys():
        stochastic_data = MalfunctionParameters(
            malfunction_rate=self._config['malfunction_rate'],
            min_duration=self._config['malfunction_min_duration'],
            max_duration=self._config['malfunction_max_duration'])
        malfunction_generator = malfunction_from_params(stochastic_data)

    speed_ratio_map = None
    if 'speed_ratio_map' in self._config:
        speed_ratio_map = {
            float(k): float(v)
            for k, v in self._config['speed_ratio_map'].items()
        }
    schedule_generator = sparse_schedule_generator(speed_ratio_map)

    env = None
    try:
        env = RailEnv(
            width=self._config['width'],
            height=self._config['height'],
            rail_generator=rail_generator,
            schedule_generator=schedule_generator,
            number_of_agents=self._config['number_of_agents'],
            malfunction_generator_and_process_data=malfunction_generator,
            obs_builder_object=self._observation.builder(),
            remove_agents_at_target=False,
            random_seed=self._config['seed'],
            # The line below could be commented out: this method tries different
            # configs, so keeping the renderer open can be wasteful, and the
            # renderer has to be closed afterwards.
            use_renderer=self._env_config.get('render'))
        env.reset()
    except ValueError as e:
        logging.error("=" * 50)
        logging.error(f"Error while creating env: {e}")
        logging.error("=" * 50)
    return env
def test_malfunction_process_statistically():
    """Tests that malfunctions are produced by stochastic_data!"""
    # Set fixed malfunction duration for this test
    stochastic_data = MalfunctionParameters(
        malfunction_rate=1 / 5,  # Rate of malfunction occurrence
        min_duration=5,  # Minimal duration of malfunction
        max_duration=5  # Max duration of malfunction
    )

    rail, rail_map = make_simple_rail2()

    env = RailEnv(
        width=25,
        height=30,
        rail_generator=rail_from_grid_transition_map(rail),
        schedule_generator=random_schedule_generator(),
        number_of_agents=10,
        malfunction_generator_and_process_data=malfunction_from_params(
            stochastic_data),
        obs_builder_object=SingleAgentNavigationObs())

    env.reset(True, True, False, random_seed=10)

    env.agents[0].target = (0, 0)
    # Next line only for test generation
    # agent_malfunction_list = [[] for i in range(10)]
    agent_malfunction_list = [
        [0, 0, 0, 0, 5, 4, 3, 2, 1, 0, 5, 4, 3, 2, 1, 0, 0, 0, 5, 4],
        [0, 5, 4, 3, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 5, 4, 3, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 5, 4, 3, 2, 1, 0, 5, 4, 3, 2, 1, 0, 0, 5, 4, 3, 2],
        [0, 0, 0, 0, 5, 4, 3, 2, 1, 0, 0, 0, 0, 0, 0, 5, 4, 3, 2, 1],
        [0, 0, 5, 4, 3, 2, 1, 0, 0, 5, 4, 3, 2, 1, 0, 5, 4, 3, 2, 1],
        [0, 0, 0, 0, 0, 0, 0, 0, 5, 4, 3, 2, 1, 0, 0, 0, 0, 0, 0, 0],
        [5, 4, 3, 2, 1, 0, 0, 5, 4, 3, 2, 1, 0, 0, 0, 0, 0, 0, 0, 5],
        [5, 4, 3, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 5, 4, 3, 2],
        [5, 4, 3, 2, 1, 0, 0, 0, 0, 0, 5, 4, 3, 2, 1, 0, 0, 0, 5, 4]
    ]

    for step in range(20):
        action_dict: Dict[int, RailEnvActions] = {}
        for agent_idx in range(env.get_num_agents()):
            # We randomly select an action
            action_dict[agent_idx] = RailEnvActions(np.random.randint(4))
            # For generating tests only:
            # agent_malfunction_list[agent_idx].append(
            #     env.agents[agent_idx].malfunction_data['malfunction'])
            assert env.agents[agent_idx].malfunction_data[
                'malfunction'] == agent_malfunction_list[agent_idx][step]
        env.step(action_dict)
def _launch(self):
    rail_generator = self.get_rail_generator()

    malfunction_generator = NoMalfunctionGen()
    if {'malfunction_rate', 'malfunction_min_duration',
            'malfunction_max_duration'} <= self._config.keys():
        print("MALFUNCTIONS POSSIBLE")
        params = MalfunctionParameters(
            malfunction_rate=1 / self._config['malfunction_rate'],
            max_duration=self._config['malfunction_max_duration'],
            min_duration=self._config['malfunction_min_duration'])
        malfunction_generator = ParamMalfunctionGen(params)

    speed_ratio_map = None
    if 'speed_ratio_map' in self._config:
        speed_ratio_map = {
            float(k): float(v)
            for k, v in self._config['speed_ratio_map'].items()
        }
    if self._gym_env_class == SequentialFlatlandGymEnv:
        schedule_generator = SequentialSparseSchedGen(speed_ratio_map, seed=1)
    else:
        schedule_generator = sparse_schedule_generator(speed_ratio_map)

    env = None
    try:
        if self._fine_tune_env_path is None:
            env = RailEnv(
                width=self._config['width'],
                height=self._config['height'],
                rail_generator=rail_generator,
                schedule_generator=schedule_generator,
                number_of_agents=self._config['number_of_agents'],
                malfunction_generator=malfunction_generator,
                obs_builder_object=self._observation.builder(),
                remove_agents_at_target=True,
                random_seed=self._config['seed'],
                use_renderer=self._env_config.get('render')
            )
            env.reset()
        else:
            env, _ = RailEnvPersister.load_new(self._fine_tune_env_path)
            env.reset(regenerate_rail=False, regenerate_schedule=False)
            env.obs_builder = self._observation.builder()
            env.obs_builder.set_env(env)
    except ValueError as e:
        logging.error("=" * 50)
        logging.error(f"Error while creating env: {e}")
        logging.error("=" * 50)
    return env
def create_and_save_env(file_name: str, schedule_generator: ScheduleGenerator,
                        rail_generator: RailGenerator):
    stochastic_data = MalfunctionParameters(
        malfunction_rate=1000,  # Rate of malfunction occurrence
        min_duration=15,  # Minimal duration of malfunction
        max_duration=50  # Max duration of malfunction
    )

    env = RailEnv(
        width=30,
        height=30,
        rail_generator=rail_generator,
        schedule_generator=schedule_generator,
        number_of_agents=10,
        malfunction_generator_and_process_data=malfunction_from_params(
            stochastic_data),
        remove_agents_at_target=True)
    env.reset(True, True)
    # env.save(file_name)
    RailEnvPersister.save(env, file_name)
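
# A possible invocation of create_and_save_env, sketched under the assumption
# that sparse_rail_generator and sparse_schedule_generator are already imported
# from flatland in this module; the file name and generator parameters are
# illustrative only.
create_and_save_env(
    file_name="./envs/Level_0.pkl",
    schedule_generator=sparse_schedule_generator(),
    rail_generator=sparse_rail_generator(
        max_num_cities=3,
        grid_mode=False,
        max_rails_between_cities=2,
        max_rails_in_city=3,
        seed=0))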
def get_env(config=None, rl=False):
    n_agents = 16
    schedule_generator = sparse_schedule_generator(None)
    rail_generator = sparse_rail_generator(
        seed=seed,  # relies on a module-level `seed`
        max_num_cities=3,
        grid_mode=False,
        max_rails_between_cities=2,
        max_rails_in_city=4,
    )
    if rl:
        obs_builder = make_obs("combined", {
            "path": None,
            "simple_meta": None
        }).builder()
    else:
        obs_builder = DummyObs()

    params = MalfunctionParameters(malfunction_rate=1 / 1000,
                                   max_duration=50,
                                   min_duration=20)
    malfunction_generator = ParamMalfunctionGen(params)

    env = RailEnv(
        width=28,
        height=28,
        rail_generator=rail_generator,
        schedule_generator=schedule_generator,
        number_of_agents=n_agents,
        malfunction_generator=malfunction_generator,
        obs_builder_object=obs_builder,
        remove_agents_at_target=True,
        random_seed=seed,
    )
    return env
def get_env(config=None, rl=False):
    n_agents = 32
    schedule_generator = sparse_schedule_generator(None)
    rail_generator = sparse_rail_generator(
        seed=seed,
        max_num_cities=4,
        grid_mode=False,
        max_rails_between_cities=2,
        max_rails_in_city=4,
    )
    if rl:
        obs_builder = make_obs(
            config["env_config"]['observation'],
            config["env_config"].get('observation_config')).builder()
    else:
        obs_builder = DummyObs()

    params = MalfunctionParameters(malfunction_rate=1 / 1000,
                                   max_duration=50,
                                   min_duration=20)
    malfunction_generator = ParamMalfunctionGen(params)

    env = RailEnv(
        width=32,
        height=32,
        rail_generator=rail_generator,
        schedule_generator=schedule_generator,
        number_of_agents=n_agents,
        malfunction_generator=malfunction_generator,
        obs_builder_object=obs_builder,
        remove_agents_at_target=True,
        random_seed=seed,
    )
    return env
speed_ratio_map = {
    1.: 1,  # Fast passenger train
    1. / 2.: 0,  # Fast freight train
    1. / 3.: 0,  # Slow commuter train
    1. / 4.: 0  # Slow freight train
}

# We can now initiate the schedule generator with the given speed profiles
schedule_generator = sparse_schedule_generator(speed_ratio_map)

# We can furthermore pass stochastic data to the RailEnv constructor which will
# allow for stochastic malfunctions during an episode.
stochastic_data = MalfunctionParameters(
    malfunction_rate=0,  # Rate of malfunction occurrence
    min_duration=3,  # Minimal duration of malfunction
    max_duration=20  # Max duration of malfunction
)

print(stochastic_data)

# Custom observation builder without predictor
observation_builder = GlobalObsForRailEnv()

# Custom observation builder with predictor, uncomment the line below if you want to try this one
# observation_builder = TreeObsForRailEnv(max_depth=2, predictor=ShortestPathPredictorForRailEnv())

# Construct the environment with the given observation, generators, predictors, and stochastic data
env = RailEnv(
    width=width,
    height=height,
    rail_generator=rail_generator,
def create_test_env(fnParams, nTest, sDir):
    (seed, width, height, nr_trains, nr_cities, max_rails_between_cities,
     max_rails_in_cities, malfunction_rate, malfunction_min_duration,
     malfunction_max_duration) = fnParams(nTest)
    # if not ShouldRunTest(test_id):
    #     continue

    rail_generator = sparse_rail_generator(
        max_num_cities=nr_cities,
        seed=seed,
        grid_mode=False,
        max_rails_between_cities=max_rails_between_cities,
        max_rails_in_city=max_rails_in_cities,
    )

    # stochastic_data = {'malfunction_rate': malfunction_rate,
    #                    'min_duration': malfunction_min_duration,
    #                    'max_duration': malfunction_max_duration
    #                    }
    stochastic_data = MalfunctionParameters(
        malfunction_rate=malfunction_rate,
        min_duration=malfunction_min_duration,
        max_duration=malfunction_max_duration)

    observation_builder = GlobalObsForRailEnv()

    DEFAULT_SPEED_RATIO_MAP = {
        1.: 0.25,
        1. / 2.: 0.25,
        1. / 3.: 0.25,
        1. / 4.: 0.25
    }

    schedule_generator = sparse_schedule_generator(DEFAULT_SPEED_RATIO_MAP)

    for iAttempt in range(5):
        try:
            env = RailEnv(
                width=width,
                height=height,
                rail_generator=rail_generator,
                schedule_generator=schedule_generator,
                number_of_agents=nr_trains,
                malfunction_generator_and_process_data=malfunction_from_params(
                    stochastic_data),
                obs_builder_object=observation_builder,
                remove_agents_at_target=True)
            obs = env.reset(random_seed=seed)
            break
        except ValueError as oErr:
            print("Error:", oErr)
            width += 5
            height += 5
            print("Try again with larger env: (w,h):", width, height)

    if not os.path.exists(sDir):
        os.makedirs(sDir)
    sfName = "{}/Level_{}.mpk".format(sDir, nTest)
    if os.path.exists(sfName):
        os.remove(sfName)
    env.save(sfName)

    sys.stdout.write(".")
    sys.stdout.flush()

    return env
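
# A hypothetical fnParams argument for create_test_env above: it must return the
# 10-tuple unpacked at the top of that function. The concrete values and the
# output directory below are illustrative only.
def example_fn_params(nTest):
    return (nTest,      # seed
            30, 30,     # width, height
            5,          # nr_trains
            3,          # nr_cities
            2,          # max_rails_between_cities
            3,          # max_rails_in_cities
            1. / 8000,  # malfunction_rate
            15, 50)     # malfunction_min_duration, malfunction_max_duration

env = create_test_env(example_fn_params, nTest=0, sDir="./test_envs")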
def eval_policy(env_params, checkpoint, n_eval_episodes, max_steps,
                action_size, state_size, seed, render, allow_skipping,
                allow_caching):
    # Evaluation is faster on CPU (except if you use a really huge policy)
    parameters = {'use_gpu': False}

    policy = DDDQNPolicy(state_size, action_size, Namespace(**parameters),
                         evaluation_mode=True)
    policy.qnetwork_local = torch.load(checkpoint)

    env_params = Namespace(**env_params)

    # Environment parameters
    n_agents = env_params.n_agents
    x_dim = env_params.x_dim
    y_dim = env_params.y_dim
    n_cities = env_params.n_cities
    max_rails_between_cities = env_params.max_rails_between_cities
    max_rails_in_city = env_params.max_rails_in_city

    # Malfunction and speed profiles
    # TODO pass these parameters properly from main!
    malfunction_parameters = MalfunctionParameters(
        malfunction_rate=1. / 2000,  # Rate of malfunctions
        min_duration=20,  # Minimal duration
        max_duration=50  # Max duration
    )

    # Only fast trains in Round 1
    speed_profiles = {
        1.: 1.0,  # Fast passenger train
        1. / 2.: 0.0,  # Fast freight train
        1. / 3.: 0.0,  # Slow commuter train
        1. / 4.: 0.0  # Slow freight train
    }

    # Observation parameters
    observation_tree_depth = env_params.observation_tree_depth
    observation_radius = env_params.observation_radius
    observation_max_path_depth = env_params.observation_max_path_depth

    # Observation builder
    predictor = ShortestPathPredictorForRailEnv(observation_max_path_depth)
    tree_observation = TreeObsForRailEnv(max_depth=observation_tree_depth,
                                         predictor=predictor)

    # Setup the environment
    env = RailEnv(
        width=x_dim,
        height=y_dim,
        rail_generator=sparse_rail_generator(
            max_num_cities=n_cities,
            grid_mode=False,
            max_rails_between_cities=max_rails_between_cities,
            max_rails_in_city=max_rails_in_city,
        ),
        schedule_generator=sparse_schedule_generator(speed_profiles),
        number_of_agents=n_agents,
        malfunction_generator_and_process_data=malfunction_from_params(
            malfunction_parameters),
        obs_builder_object=tree_observation)

    if render:
        env_renderer = RenderTool(env, gl="PGL")

    action_dict = dict()
    scores = []
    completions = []
    nb_steps = []
    inference_times = []
    preproc_times = []
    agent_times = []
    step_times = []

    for episode_idx in range(n_eval_episodes):
        seed += 1

        inference_timer = Timer()
        preproc_timer = Timer()
        agent_timer = Timer()
        step_timer = Timer()

        step_timer.start()
        obs, info = env.reset(regenerate_rail=True, regenerate_schedule=True,
                              random_seed=seed)
        step_timer.end()

        agent_obs = [None] * env.get_num_agents()
        score = 0.0

        if render:
            env_renderer.set_new_rail()

        final_step = 0
        skipped = 0

        nb_hit = 0
        agent_last_obs = {}
        agent_last_action = {}

        for step in range(max_steps - 1):
            # FIXME why -1? bug where all agents are "done" after max_steps!
            if allow_skipping and check_if_all_blocked(env):
                skipped = max_steps - step - 1
                final_step = max_steps - 2
                n_unfinished_agents = sum(not done[idx]
                                          for idx in env.get_agent_handles())
                score -= skipped * n_unfinished_agents
                break

            agent_timer.start()
            for agent in env.get_agent_handles():
                if obs[agent] and info['action_required'][agent]:
                    if agent in agent_last_obs and np.all(
                            agent_last_obs[agent] == obs[agent]):
                        nb_hit += 1
                        action = agent_last_action[agent]
                    else:
                        preproc_timer.start()
                        norm_obs = normalize_observation(
                            obs[agent],
                            tree_depth=observation_tree_depth,
                            observation_radius=observation_radius)
                        preproc_timer.end()

                        inference_timer.start()
                        action = policy.act(norm_obs, eps=0.0)
                        inference_timer.end()

                    action_dict.update({agent: action})

                    if allow_caching:
                        agent_last_obs[agent] = obs[agent]
                        agent_last_action[agent] = action
            agent_timer.end()

            step_timer.start()
            obs, all_rewards, done, info = env.step(action_dict)
            step_timer.end()

            if render:
                env_renderer.render_env(show=True,
                                        frames=False,
                                        show_observations=False,
                                        show_predictions=False)

                if step % 100 == 0:
                    print("{}/{}".format(step, max_steps - 1))

            for agent in env.get_agent_handles():
                score += all_rewards[agent]

            final_step = step

            if done['__all__']:
                break

        normalized_score = score / (max_steps * env.get_num_agents())
        scores.append(normalized_score)

        tasks_finished = sum(done[idx] for idx in env.get_agent_handles())
        completion = tasks_finished / max(1, env.get_num_agents())
        completions.append(completion)

        nb_steps.append(final_step)

        inference_times.append(inference_timer.get())
        preproc_times.append(preproc_timer.get())
        agent_times.append(agent_timer.get())
        step_times.append(step_timer.get())

        skipped_text = ""
        if skipped > 0:
            skipped_text = "\t⚡ Skipped {}".format(skipped)

        hit_text = ""
        if nb_hit > 0:
            hit_text = "\t⚡ Hit {} ({:.1f}%)".format(
                nb_hit, (100 * nb_hit) / (n_agents * final_step))

        print(
            "☑️ Score: {:.3f} \tDone: {:.1f}% \tNb steps: {:.3f} "
            "\t🍭 Seed: {}"
            "\t🚉 Env: {:.3f}s "
            "\t🤖 Agent: {:.3f}s (per step: {:.3f}s) \t[preproc: {:.3f}s \tinfer: {:.3f}s]"
            "{}{}".format(normalized_score, completion * 100.0, final_step,
                          seed, step_timer.get(), agent_timer.get(),
                          agent_timer.get() / final_step, preproc_timer.get(),
                          inference_timer.get(), skipped_text, hit_text))

    return scores, completions, nb_steps, agent_times, step_times
def eval_policy(env_params, checkpoint, n_eval_episodes, max_steps,
                action_size, state_size, seed, render, allow_skipping,
                allow_caching):
    # Evaluation is faster on CPU (except if you use a really huge policy)
    parameters = {'use_gpu': False}

    # policy = DDDQNPolicy(state_size, action_size, Namespace(**parameters), evaluation_mode=True)
    # policy.qnetwork_local = torch.load(checkpoint, map_location={'cuda:0': 'cpu'})

    env_params = Namespace(**env_params)

    # Environment parameters
    n_agents = env_params.n_agents
    x_dim = env_params.x_dim
    y_dim = env_params.y_dim
    n_cities = env_params.n_cities
    max_rails_between_cities = env_params.max_rails_between_cities
    max_rails_in_city = env_params.max_rails_in_city

    agents = []
    for agent_id in range(n_agents):
        agent = AttentionAgent(num_in_pol=state_size,
                               num_out_pol=action_size,
                               hidden_dim=256,
                               lr=0.001)
        agent.policy = torch.load(os.path.join(checkpoint,
                                               f'2300_agent{agent_id}' + '.pth'),
                                  map_location=torch.device('cpu'))
        agent.policy.eval()
        agents.append(agent)

    # Malfunction and speed profiles
    # TODO pass these parameters properly from main!
    malfunction_parameters = MalfunctionParameters(
        malfunction_rate=1. / 2000,  # Rate of malfunctions
        min_duration=20,  # Minimal duration
        max_duration=50  # Max duration
    )

    # Only fast trains in Round 1
    speed_profiles = {
        1.: 1.0,  # Fast passenger train
        1. / 2.: 0.0,  # Fast freight train
        1. / 3.: 0.0,  # Slow commuter train
        1. / 4.: 0.0  # Slow freight train
    }

    # Observation parameters
    observation_tree_depth = env_params.observation_tree_depth
    observation_radius = env_params.observation_radius
    observation_max_path_depth = env_params.observation_max_path_depth

    # Observation builder
    predictor = ShortestPathPredictorForRailEnv(observation_max_path_depth)
    tree_observation = TreeObsForRailEnv(max_depth=observation_tree_depth,
                                         predictor=predictor)

    # Setup the environment
    env = RailEnv(
        width=x_dim,
        height=y_dim,
        rail_generator=sparse_rail_generator(
            max_num_cities=n_cities,
            grid_mode=False,
            max_rails_between_cities=max_rails_between_cities,
            max_rails_in_city=max_rails_in_city,
        ),
        # rail_generator=complex_rail_generator(
        #     nr_start_goal=10,
        #     nr_extra=10,
        #     min_dist=10,
        #     max_dist=99999,
        #     seed=1
        # ),
        schedule_generator=sparse_schedule_generator(speed_profiles),
        number_of_agents=n_agents,
        malfunction_generator_and_process_data=malfunction_from_params(
            malfunction_parameters),
        obs_builder_object=tree_observation)

    if render:
        # env_renderer = RenderTool(env, gl="PGL")
        env_renderer = RenderTool(
            env,
            # gl="PGL",
            agent_render_variant=AgentRenderVariant.AGENT_SHOWS_OPTIONS_AND_BOX,
            show_debug=False,
            screen_height=600,  # Adjust these parameters to fit your resolution
            screen_width=800)

    action_dict = dict()
    scores = []
    completions = []
    nb_steps = []
    inference_times = []
    preproc_times = []
    agent_times = []
    step_times = []

    for agent_id in range(n_agents):
        action_dict[agent_id] = 0

    for episode_idx in range(n_eval_episodes):
        images = []
        seed += 1

        inference_timer = Timer()
        preproc_timer = Timer()
        agent_timer = Timer()
        step_timer = Timer()

        step_timer.start()
        obs, info = env.reset(regenerate_rail=True, regenerate_schedule=True,
                              random_seed=seed)
        step_timer.end()

        agent_obs = [None] * env.get_num_agents()
        score = 0.0

        if render:
            env_renderer.set_new_rail()

        final_step = 0
        skipped = 0

        nb_hit = 0
        agent_last_obs = {}
        agent_last_action = {}

        for step in range(max_steps - 1):
            # time.sleep(0.2)
            # FIXME why -1? bug where all agents are "done" after max_steps!
            if allow_skipping and check_if_all_blocked(env):
                skipped = max_steps - step - 1
                final_step = max_steps - 2
                n_unfinished_agents = sum(not done[idx]
                                          for idx in env.get_agent_handles())
                score -= skipped * n_unfinished_agents
                break

            agent_timer.start()
            for agent in env.get_agent_handles():
                agent_model = agents[agent]
                if obs[agent] and info['action_required'][agent]:
                    if agent in agent_last_obs and np.all(
                            agent_last_obs[agent] == obs[agent]):
                        nb_hit += 1
                        action = agent_last_action[agent]
                    else:
                        preproc_timer.start()
                        norm_obs = normalize_observation(
                            obs[agent],
                            tree_depth=observation_tree_depth,
                            observation_radius=observation_radius)
                        preproc_timer.end()

                        inference_timer.start()
                        action = act(agent_model, norm_obs)
                        inference_timer.end()

                    action_dict.update({agent: action})

                    if allow_caching:
                        agent_last_obs[agent] = obs[agent]
                        agent_last_action[agent] = action
            agent_timer.end()

            step_timer.start()
            obs, all_rewards, done, info = env.step(action_dict)
            step_timer.end()

            if render:
                env_renderer.render_env(show=True,
                                        frames=False,
                                        show_observations=False,
                                        show_predictions=False)
                im = env_renderer.get_image()
                im = PIL.Image.fromarray(im)
                images.append(im)

                if step % 100 == 0:
                    print("{}/{}".format(step, max_steps - 1))

            for agent in env.get_agent_handles():
                score += all_rewards[agent]

            final_step = step

            if done['__all__']:
                break

        if render:
            for _ in range(10):
                images.append(images[len(images) - 1])
            # save video
            images[0].save(
                f'/Users/nikhilvs/repos/nyu/flatland-reinforcement-learning/videos/maac-final/out_{episode_idx}.gif',
                save_all=True,
                append_images=images[1:],
                optimize=False,
                duration=60,
                loop=0)

        normalized_score = score / (max_steps * env.get_num_agents())
        scores.append(normalized_score)

        tasks_finished = sum(done[idx] for idx in env.get_agent_handles())
        completion = tasks_finished / max(1, env.get_num_agents())
        completions.append(completion)

        nb_steps.append(final_step)

        inference_times.append(inference_timer.get())
        preproc_times.append(preproc_timer.get())
        agent_times.append(agent_timer.get())
        step_times.append(step_timer.get())

        skipped_text = ""
        if skipped > 0:
            skipped_text = "\t⚡ Skipped {}".format(skipped)

        hit_text = ""
        if nb_hit > 0:
            hit_text = "\t⚡ Hit {} ({:.1f}%)".format(
                nb_hit, (100 * nb_hit) / (n_agents * final_step))

        print(
            "☑️ Score: {:.3f} \tDone: {:.1f}% \tNb steps: {:.3f} "
            "\t🍭 Seed: {}"
            "\t🚉 Env: {:.3f}s "
            "\t🤖 Agent: {:.3f}s (per step: {:.3f}s) \t[preproc: {:.3f}s \tinfer: {:.3f}s]"
            "{}{}".format(normalized_score, completion * 100.0, final_step,
                          seed, step_timer.get(), agent_timer.get(),
                          agent_timer.get() / final_step, preproc_timer.get(),
                          inference_timer.get(), skipped_text, hit_text))

    return scores, completions, nb_steps, agent_times, step_times
def eval_policy(env_params, checkpoint, n_eval_episodes, max_steps, seed, render):
    # Evaluation is faster on CPU, except if you have huge networks
    parameters = {'use_gpu': False}

    # NOTE: state_size and action_size are assumed to be defined at module level
    policy = DDDQNPolicy(state_size, action_size, Namespace(**parameters),
                         evaluation_mode=True)
    policy.qnetwork_local = torch.load(checkpoint)

    env_params = Namespace(**env_params)

    # Environment parameters
    n_agents = env_params.n_agents
    x_dim = env_params.x_dim
    y_dim = env_params.y_dim
    n_cities = env_params.n_cities
    max_rails_between_cities = env_params.max_rails_between_cities
    max_rails_in_city = env_params.max_rails_in_city

    # Observation parameters
    observation_tree_depth = env_params.observation_tree_depth
    observation_radius = env_params.observation_radius
    observation_max_path_depth = env_params.observation_max_path_depth

    # Malfunction and speed profiles
    # TODO pass these parameters properly from main!
    malfunction_parameters = MalfunctionParameters(
        malfunction_rate=1. / 2000,  # Rate of malfunctions
        min_duration=20,  # Minimal duration
        max_duration=50  # Max duration
    )

    speed_profiles = {
        1.: 1.0,  # Fast passenger train
        1. / 2.: 0.0,  # Fast freight train
        1. / 3.: 0.0,  # Slow commuter train
        1. / 4.: 0.0  # Slow freight train
    }

    # Observation builder
    predictor = ShortestPathPredictorForRailEnv(observation_max_path_depth)
    tree_observation = TreeObsForRailEnv(max_depth=observation_tree_depth,
                                         predictor=predictor)

    # Setup the environment
    env = RailEnv(
        width=x_dim,
        height=y_dim,
        rail_generator=sparse_rail_generator(
            max_num_cities=n_cities,
            grid_mode=False,
            max_rails_between_cities=max_rails_between_cities,
            max_rails_in_city=max_rails_in_city),
        schedule_generator=sparse_schedule_generator(speed_profiles),
        number_of_agents=n_agents,
        malfunction_generator_and_process_data=malfunction_from_params(
            malfunction_parameters),
        obs_builder_object=tree_observation,
        random_seed=seed)

    env.reset(True, True)

    if render:
        env_renderer = RenderTool(env, gl="PGL")

    action_dict = dict()
    scores = []
    completions = []
    nb_steps = []
    inference_times = []
    preproc_times = []
    agent_times = []
    step_times = []

    for episode_idx in range(n_eval_episodes):
        inference_timer = Timer()
        preproc_timer = Timer()
        agent_timer = Timer()
        step_timer = Timer()

        agent_obs = [None] * env.get_num_agents()
        score = 0.0

        step_timer.start()
        obs, info = env.reset(regenerate_rail=True, regenerate_schedule=True)
        step_timer.end()

        if render:
            env_renderer.set_new_rail()

        final_step = 0

        for step in range(max_steps - 1):
            agent_timer.start()
            for agent in env.get_agent_handles():
                if obs[agent]:
                    preproc_timer.start()
                    agent_obs[agent] = normalize_observation(
                        obs[agent],
                        tree_depth=observation_tree_depth,
                        observation_radius=observation_radius)
                    preproc_timer.end()

                action = 0
                if info['action_required'][agent]:
                    inference_timer.start()
                    action = policy.act(agent_obs[agent], eps=0.0)
                    inference_timer.end()
                action_dict.update({agent: action})
            agent_timer.end()

            step_timer.start()
            obs, all_rewards, done, info = env.step(action_dict)
            step_timer.end()

            if render:
                env_renderer.render_env(show=True,
                                        frames=False,
                                        show_observations=False,
                                        show_predictions=False)

            for agent in env.get_agent_handles():
                score += all_rewards[agent]

            final_step = step

            if done['__all__']:
                break

        normalized_score = score / (max_steps * env.get_num_agents())
        scores.append(normalized_score)

        tasks_finished = sum(done[idx] for idx in env.get_agent_handles())
        completion = tasks_finished / max(1, env.get_num_agents())
        completions.append(completion)

        nb_steps.append(final_step)

        inference_times.append(inference_timer.get())
        preproc_times.append(preproc_timer.get())
        agent_times.append(agent_timer.get())
        step_times.append(step_timer.get())

        print(
            "☑️ Score: {:.3f} \tDone: {:.1f}% \tNb steps: {:.3f} "
            "\t🚉 Env: {:.3f}s "
            "\t🤖 Agent: {:.3f}s (per step: {:.3f}s) \t[preproc: {:.3f}s \tinfer: {:.3f}s]"
            .format(normalized_score, completion * 100.0, final_step,
                    step_timer.get(), agent_timer.get(),
                    agent_timer.get() / final_step, preproc_timer.get(),
                    inference_timer.get()))

    return scores, completions, nb_steps, agent_times, step_times
def test_initial_malfunction_do_nothing():
    stochastic_data = MalfunctionParameters(malfunction_rate=70,  # Rate of malfunction occurrence
                                            min_duration=2,  # Minimal duration of malfunction
                                            max_duration=5  # Max duration of malfunction
                                            )

    rail, rail_map = make_simple_rail2()

    env = RailEnv(width=25,
                  height=30,
                  rail_generator=rail_from_grid_transition_map(rail),
                  schedule_generator=random_schedule_generator(),
                  number_of_agents=1,
                  malfunction_generator_and_process_data=malfunction_from_params(stochastic_data),  # Malfunction data generator
                  )
    env.reset()
    set_penalties_for_replay(env)
    replay_config = ReplayConfig(
        replay=[
            Replay(
                position=None,
                direction=Grid4TransitionsEnum.EAST,
                action=RailEnvActions.MOVE_FORWARD,
                set_malfunction=3,
                malfunction=3,
                reward=env.step_penalty,  # full step penalty while malfunctioning
                status=RailAgentStatus.READY_TO_DEPART
            ),
            Replay(
                position=(3, 2),
                direction=Grid4TransitionsEnum.EAST,
                action=RailEnvActions.DO_NOTHING,
                malfunction=2,
                reward=env.step_penalty,  # full step penalty while malfunctioning
                status=RailAgentStatus.ACTIVE
            ),
            # malfunction stops in the next step and we're still at the beginning of the cell
            # --> if we take action DO_NOTHING, agent should restart without moving
            #
            Replay(
                position=(3, 2),
                direction=Grid4TransitionsEnum.EAST,
                action=RailEnvActions.DO_NOTHING,
                malfunction=1,
                reward=env.step_penalty,  # full step penalty while stopped
                status=RailAgentStatus.ACTIVE
            ),
            # we haven't started moving yet --> stay here
            Replay(
                position=(3, 2),
                direction=Grid4TransitionsEnum.EAST,
                action=RailEnvActions.DO_NOTHING,
                malfunction=0,
                reward=env.step_penalty,  # full step penalty while stopped
                status=RailAgentStatus.ACTIVE
            ),
            Replay(
                position=(3, 2),
                direction=Grid4TransitionsEnum.EAST,
                action=RailEnvActions.MOVE_FORWARD,
                malfunction=0,
                reward=env.start_penalty + env.step_penalty * 1.0,  # start penalty + step penalty for speed 1.0
                status=RailAgentStatus.ACTIVE
            ),
            # we start to move forward --> should go to next cell now
            Replay(
                position=(3, 3),
                direction=Grid4TransitionsEnum.EAST,
                action=RailEnvActions.MOVE_FORWARD,
                malfunction=0,
                reward=env.step_penalty * 1.0,  # step penalty for speed 1.0
                status=RailAgentStatus.ACTIVE
            )
        ],
        speed=env.agents[0].speed_data['speed'],
        target=env.agents[0].target,
        initial_position=(3, 2),
        initial_direction=Grid4TransitionsEnum.EAST,
    )
    run_replay_config(env, [replay_config], activate_agents=False)
def test_initial_malfunction():
    stochastic_data = MalfunctionParameters(malfunction_rate=1000,  # Rate of malfunction occurrence
                                            min_duration=2,  # Minimal duration of malfunction
                                            max_duration=5  # Max duration of malfunction
                                            )

    rail, rail_map = make_simple_rail2()

    env = RailEnv(width=25,
                  height=30,
                  rail_generator=rail_from_grid_transition_map(rail),
                  schedule_generator=random_schedule_generator(seed=10),
                  number_of_agents=1,
                  malfunction_generator_and_process_data=malfunction_from_params(stochastic_data),  # Malfunction data generator
                  obs_builder_object=SingleAgentNavigationObs()
                  )
    # reset to initialize agents_static
    env.reset(False, False, True, random_seed=10)
    print(env.agents[0].malfunction_data)
    env.agents[0].target = (0, 5)
    set_penalties_for_replay(env)
    replay_config = ReplayConfig(
        replay=[
            Replay(
                position=(3, 2),
                direction=Grid4TransitionsEnum.EAST,
                action=RailEnvActions.MOVE_FORWARD,
                set_malfunction=3,
                malfunction=3,
                reward=env.step_penalty  # full step penalty when malfunctioning
            ),
            Replay(
                position=(3, 2),
                direction=Grid4TransitionsEnum.EAST,
                action=RailEnvActions.MOVE_FORWARD,
                malfunction=2,
                reward=env.step_penalty  # full step penalty when malfunctioning
            ),
            # malfunction stops in the next step and we're still at the beginning of the cell
            # --> if we take action MOVE_FORWARD, agent should restart and move to the next cell
            Replay(
                position=(3, 2),
                direction=Grid4TransitionsEnum.EAST,
                action=RailEnvActions.MOVE_FORWARD,
                malfunction=1,
                reward=env.step_penalty
            ),
            # malfunctioning ends: starting and running at speed 1.0
            Replay(
                position=(3, 2),
                direction=Grid4TransitionsEnum.EAST,
                action=RailEnvActions.MOVE_FORWARD,
                malfunction=0,
                reward=env.start_penalty + env.step_penalty * 1.0  # running at speed 1.0
            ),
            Replay(
                position=(3, 3),
                direction=Grid4TransitionsEnum.EAST,
                action=RailEnvActions.MOVE_FORWARD,
                malfunction=0,
                reward=env.step_penalty  # running at speed 1.0
            )
        ],
        speed=env.agents[0].speed_data['speed'],
        target=env.agents[0].target,
        initial_position=(3, 2),
        initial_direction=Grid4TransitionsEnum.EAST,
    )
    run_replay_config(env, [replay_config])
flags = parser.parse_args()

# Seeded RNG so we can replicate our results
np.random.seed(0)

# We need to either load in some pre-generated railways from disk, or else
# create a random railway generator.
if flags.load_railways:
    rail_generator, schedule_generator = load_precomputed_railways(project_root, flags)
else:
    rail_generator, schedule_generator = create_random_railways(project_root)

# Create the Flatland environment
env = RailEnv(width=flags.grid_width,
              height=flags.grid_height,
              number_of_agents=flags.num_agents,
              rail_generator=rail_generator,
              schedule_generator=schedule_generator,
              malfunction_generator=ParamMalfunctionGen(
                  MalfunctionParameters(1 / 8000, 15, 50)),
              obs_builder_object=TreeObservation(max_depth=flags.tree_depth))

# After training we want to render the results so we also load a renderer
env_renderer = RenderTool(env, gl="PILSVG",
                          screen_width=800,
                          screen_height=800,
                          agent_render_variant=AgentRenderVariant.AGENT_SHOWS_OPTIONS_AND_BOX)

# Calculate the state size based on the number of nodes in the tree observation
num_features_per_node = env.obs_builder.observation_dim
num_nodes = sum(np.power(4, i) for i in range(flags.tree_depth + 1))
state_size = num_nodes * num_features_per_node
action_size = 5

# Add some variables to keep track of the progress
scores_window, steps_window, collisions_window, done_window = [deque(maxlen=200) for _ in range(4)]
agent_obs = [None] * flags.num_agents
agent_obs_buffer = [None] * flags.num_agents
def curriculum_learning():
    num_levels = 70
    num_episodes_in_env = 20
    model_saving_name = ""

    """
    curriculum = Manual_Curriculum("curriculum.yml")
    """
    curriculum = Semi_Auto_Curriculum(
        offset_curriculum_generator(
            num_levels, {
                "x_dim": 25,
                "y_dim": 25,
                "n_agents": 1.75,
                "n_cities": 3,
                # "n_extra": 3,
                # "min_dist": 2,
                # "max_dist": 5
                "max_rails_between_cities": 1,
                "max_rails_in_city": 1,
                "malfunction_level": 0,
                "speed_level": 0
            }),
        num_levels)

    try:
        myseed = 19
        level = 0
        threshold = 0.57
        while True:
            if curriculum.get("speed_level") < 1:
                speed_profiles = {
                    1.: 1.0,
                    1. / 2.: 0.0,
                    1. / 3.: 0.0,
                    1. / 4.: 0.0
                }
            elif curriculum.get("speed_level") < 2:
                speed_profiles = {
                    1.: 0.75,
                    1. / 2.: 0.25,
                    1. / 3.: 0.0,
                    1. / 4.: 0.0
                }
            elif curriculum.get("speed_level") < 3:
                speed_profiles = {
                    1.: 0.4,
                    1. / 2.: 0.3,
                    1. / 3.: 0.2,
                    1. / 4.: 0.1
                }
            else:
                speed_profiles = {
                    1.: 0.25,
                    1. / 2.: 0.25,
                    1. / 3.: 0.25,
                    1. / 4.: 0.25
                }

            if curriculum.get("malfunction_level") < 1:
                malfunction_rate = 0.0
            elif curriculum.get("malfunction_level") < 2:
                malfunction_rate = 0.0025
            elif curriculum.get("malfunction_level") < 3:
                malfunction_rate = 0.0055
            else:
                malfunction_rate = 0.0075

            print("=" * 203)
            print("Running the level {}".format(level))
            print("=" * 203)
            print(
                "{}x{} grid, {} agents, {} cities, {} rails between cities, {} rails in cities, malfunctions rate {} "
                "and speed {}. Completion threshold of {}".format(
                    curriculum.get("x_dim"), curriculum.get("y_dim"),
                    curriculum.get("n_agents"), curriculum.get("n_cities"),
                    curriculum.get("max_rails_between_cities"),
                    curriculum.get("max_rails_in_city"), malfunction_rate,
                    speed_profiles, threshold))

            try_outs = 0
            completion = 0

            while try_outs < 10 and completion < threshold:
                namefile = "curriculum_{}_{}_{}".format(myseed, level, try_outs)
                environment_parameters = {
                    "n_agents": curriculum.get("n_agents"),
                    "x_dim": curriculum.get("x_dim"),
                    "y_dim": curriculum.get("y_dim"),
                    "n_cities": curriculum.get("n_cities"),
                    # Old curriculum
                    # "n_extra": curriculum.get("n_extra"),
                    # "min_dist": curriculum.get("min_dist"),
                    # "max_dist": curriculum.get("max_dist"),
                    "max_rails_between_cities": curriculum.get("max_rails_between_cities"),
                    "max_rails_in_city": curriculum.get("max_rails_in_city"),
                    "seed": myseed,
                    "observation_tree_depth": 2,
                    "observation_radius": 10,
                    "observation_max_path_depth": 30,
                    # Malfunctions
                    "malfunction_parameters": MalfunctionParameters(
                        malfunction_rate=malfunction_rate,
                        min_duration=15,
                        max_duration=50),
                    # Speeds
                    "speed_profiles": speed_profiles,

                    # ============================
                    # Custom observations&rewards
                    # ============================
                    "custom_observations": False,
                    "reward_shaping": True,
                    "uniform_reward": True,
                    "stop_penalty": -0.0,
                    "invalid_action_penalty": -0.0,
                    "deadlock_penalty": -15.0,
                    # 1.0 for skipping
                    "shortest_path_penalty_coefficient": 1 + 1 / 15,
                    "done_bonus": 1 / 15,
                }

                training_parameters = {
                    # ============================
                    # Network architecture
                    # ============================
                    # Shared actor-critic layer
                    # If shared is True then the considered sizes are taken from the critic
                    "shared": False,
                    "shared_recurrent": False,
                    "linear_size": 128,
                    "hidden_size": 64,
                    # Policy network
                    "critic_mlp_width": 128,
                    "critic_mlp_depth": 3,
                    "last_critic_layer_scaling": 0.1,
                    # Actor network
                    "actor_mlp_width": 128,
                    "actor_mlp_depth": 3,
                    "last_actor_layer_scaling": 0.01,
                    "learning_rate": 0.001,
                    "adam_eps": 1e-5,
                    "activation": "Tanh",
                    "lmbda": 0.95,
                    "entropy_coefficient": 0.01,
                    "value_loss_coefficient": 0.001,

                    # ============================
                    # Training setup
                    # ============================
                    "n_episodes": num_episodes_in_env,
                    "horizon": 512,
                    "epochs": 8,
                    "batch_size": 32,
                    "batch_mode": "shuffle",

                    # ============================
                    # Normalization and clipping
                    # ============================
                    "discount_factor": 0.99,
                    "max_grad_norm": 0.5,
                    "eps_clip": 0.2,

                    # ============================
                    # Advantage estimation
                    # ============================
                    "advantage_estimator": "gae",

                    # ============================
                    # Optimization and rendering
                    # ============================
                    "checkpoint_interval": num_episodes_in_env,
                    "evaluation_mode": False,
                    "eval_episodes": None,
                    "use_gpu": False,
                    "render": False,
                    "print_stats": True,
                    "wandb_project": "flatland-challenge-ps-ppo-curriculum",
                    "wandb_entity": "lomb",
                    "wandb_tag": "curriculum_{}".format(myseed),
                    "save_model_path": "/content/drive/My Drive/Colab Notebooks/models/" + namefile + ".pt",
                    "load_model_path": "/content/drive/My Drive/Colab Notebooks/models/" + model_saving_name + ".pt",
                    "tensorboard_path": "log_" + namefile + "/",
                    "automatic_name_saving": False,

                    # ============================
                    # Action Masking
                    # ============================
                    "action_masking": True,
                    "allow_no_op": False
                }

                print("\nLevel %d try out number %d" % (level, try_outs))
                # Train
                _, completions, _, _ = train_multiple_agents(
                    Namespace(**environment_parameters),
                    Namespace(**training_parameters))
                try_outs += 1
                model_saving_name = namefile
                completion = np.mean(completions)

            print("\n" + "=" * 203)
            # Update curriculum
            curriculum.update()
            level += 1
    except StopIteration:
        return
def train(env):
    n_agents = env["n_agents"]
    x_dim = env["x_dim"]
    y_dim = env["y_dim"]
    n_cities = env["n_cities"]
    max_rails_between_cities = env["max_rails_between_cities"]
    max_rails_in_city = env["max_rails_in_city"]
    seed = 0
    use_fast_tree_obs = False

    # Observation parameters
    observation_tree_depth = 4
    observation_radius = 10
    observation_max_path_depth = 30

    # Set the seeds
    random.seed(seed)
    np.random.seed(seed)

    # Break agents from time to time
    malfunction_parameters = MalfunctionParameters(
        malfunction_rate=1. / 10000,  # Rate of malfunctions
        min_duration=15,  # Minimal duration
        max_duration=50  # Max duration
    )

    # Observation builder
    predictor = ShortestPathPredictorForRailEnv(observation_max_path_depth)
    tree_observation = None

    if use_fast_tree_obs:
        tree_observation = FastTreeObs(max_depth=observation_tree_depth)
        print("Using FastTreeObs")
    else:
        tree_observation = TreeObsForRailEnv(max_depth=observation_tree_depth,
                                             predictor=predictor)
        print("Using StandardTreeObs")

    speed_profiles = {
        1.: 1.0,  # Fast passenger train
        1. / 2.: 0.0,  # Fast freight train
        1. / 3.: 0.0,  # Slow commuter train
        1. / 4.: 0.0  # Slow freight train
    }

    env = RailEnv(
        width=x_dim,
        height=y_dim,
        rail_generator=sparse_rail_generator(
            max_num_cities=n_cities,
            grid_mode=False,
            max_rails_between_cities=max_rails_between_cities,
            max_rails_in_city=max_rails_in_city),
        schedule_generator=sparse_schedule_generator(speed_profiles),
        number_of_agents=n_agents,
        malfunction_generator_and_process_data=malfunction_from_params(
            malfunction_parameters),
        obs_builder_object=tree_observation,
        random_seed=seed)

    rewards = []
    obs, info = env.reset()

    if use_fast_tree_obs:
        state_size = tree_observation.observation_dim
    else:
        # Calculate the state size given the depth of the tree observation and
        # the number of features
        n_features_per_node = env.obs_builder.observation_dim
        n_nodes = 0
        for i in range(observation_tree_depth + 1):
            n_nodes += np.power(4, i)
        state_size = n_features_per_node * n_nodes

    action_size = 5

    DEVICE = 'cpu'
    # if torch.cuda.is_available():
    #     DEVICE = 'gpu'

    buffer_length = 10000
    steps_to_save_model = 10
    step_size = 100
    num_steps = 100  # update every 100 steps
    avg_steps = 20  # num steps to average and plot rewards
    reward_q = []
    batch_size = 100

    agent_obs = np.array([None] * env.get_num_agents())

    max_steps = int(4 * 2 * (env.height + env.width + (n_agents / n_cities)))
    num_episodes = 100000

    agent_init_params = []
    sa_size = []

    for i in range(n_agents):
        agent_init_params.append({
            'num_in_pol': state_size,
            'num_out_pol': action_size,
            'init_weights': 'model.pt'
        })
        sa_size.append((state_size, action_size))

    hyperparams = {
        "tau": 0.01,
        "pi_lr": 0.00001,
        "q_lr": 0.00005,
        "pol_hidden_dim": 256,
        "critic_hidden_dim": 256,
        "attend_heads": 8
    }

    model = AttentionSAC(agent_init_params=agent_init_params,
                         sa_size=sa_size,
                         tau=hyperparams["tau"],
                         pi_lr=hyperparams["pi_lr"],
                         q_lr=hyperparams["q_lr"],
                         pol_hidden_dim=hyperparams["pol_hidden_dim"],
                         critic_hidden_dim=hyperparams["critic_hidden_dim"],
                         attend_heads=hyperparams["attend_heads"])
    model.init_dict = {}

    replay_buffer = ReplayBuffer(buffer_length, n_agents,
                                 [state_size for i in range(n_agents)],
                                 [action_size for i in range(n_agents)])

    print("MAX STEPS: " + str(max_steps))
    print("NUM EPISODES: ", num_episodes)
    print("HYPERPARAMS: ")
    print(hyperparams)

    start_time = time.time()

    for ep in range(num_episodes):
        print("Episode " + str(ep) + ":", flush=True)
        obs, info = env.reset(True, True)
        model.prep_rollouts(device=DEVICE)
        reward_sum_for_this_episode = 0

        for steps in range(max_steps):
            if steps % step_size == 0:
                print("=", end="", flush=True)

            for agent in env.get_agent_handles():
                if obs[agent] is not None:
                    if use_fast_tree_obs:
                        agent_obs[agent] = obs[agent]
                    else:
                        agent_obs[agent] = normalize_observation(
                            obs[agent],
                            observation_tree_depth,
                            observation_radius=observation_radius)
                else:
                    agent_obs[agent] = np.array([0.] * state_size)

            action_dict = {}
            agent_actions = []

            torch_obs = [
                Variable(torch.Tensor([agent_obs[i]]), requires_grad=False)
                for i in range(n_agents)
            ]

            torch_agent_actions = model.step(torch_obs, explore=True)
            agent_actions = [ac.data.numpy() for ac in torch_agent_actions]

            for i in range(n_agents):
                dist = torch_agent_actions[i][0]
                idx = -1
                for j in range(action_size):
                    if dist[j] != 0:
                        idx = j
                        break
                action_dict[i] = idx

            next_obs, all_rewards, done, info = env.step(action_dict)

            rewards = []
            dones = []

            next_agent_obs = np.array([None] * env.get_num_agents())

            for agent in env.get_agent_handles():
                if next_obs[agent] is not None:
                    if use_fast_tree_obs:
                        next_agent_obs[agent] = next_obs[agent]
                    else:
                        # NOTE: this normalizes the old obs; next_obs[agent]
                        # was probably intended here
                        next_agent_obs[agent] = normalize_observation(
                            obs[agent],
                            observation_tree_depth,
                            observation_radius=observation_radius)
                else:
                    next_agent_obs[agent] = np.array([0.] * state_size)

            for i in range(n_agents):
                reward_sum_for_this_episode += all_rewards[i]
                rewards.append(all_rewards[i])
                # NOTE: `agent` is the last handle from the loop above;
                # agent_obs[i] was probably intended here
                all_rewards[i] += augment_reward(agent_obs[agent])
                dones.append(done[i])

            replay_buffer.push(np.array([agent_obs]), np.array(agent_actions),
                               np.array([rewards]), np.array([next_agent_obs]),
                               np.array([dones]))

            if steps % num_steps == 0:
                model.prep_training(device=DEVICE)
                sample = replay_buffer.sample(batch_size, norm_rews=False)
                # print(sample)
                model.update_critic(sample)
                model.update_policies(sample)
                model.update_all_targets()
                model.prep_rollouts(device=DEVICE)

        reward_sum_for_this_episode /= n_agents
        reward_q.append(reward_sum_for_this_episode)

        if len(reward_q) == avg_steps:
            wandb.log({'reward': np.mean(reward_q)})
            reward_q = []

        print()

        if ep % steps_to_save_model == 0:
            print("\nSaving model")
            model.save(os.getcwd() + "/model.pt")
            cur_time = time.time()
            time_elapsed = (cur_time - start_time) // 60
            print("Time Elapsed: " + str(time_elapsed) + "\n")
def main(argv):
    try:
        opts, args = getopt.getopt(argv, "n:", ["n_trials="])
    except getopt.GetoptError:
        print('training_navigation.py -n <n_trials>')
        sys.exit(2)
    for opt, arg in opts:
        if opt in ('-n', '--n_trials'):
            n_trials = int(arg)

    random.seed(1)
    np.random.seed(1)

    # Parameters for the Environment
    x_dim = 35
    y_dim = 35
    n_agents = 1

    # Use the malfunction generator to break agents from time to time
    # stochastic_data = {'malfunction_rate': 8000,  # Rate of malfunction occurrence of single agent
    #                    'min_duration': 15,  # Minimal duration of malfunction
    #                    'max_duration': 50  # Max duration of malfunction
    #                    }

    stochastic_data = MalfunctionParameters(malfunction_rate=8000,  # Rate of malfunction occurrence
                                            min_duration=15,  # Minimal duration of malfunction
                                            max_duration=50  # Max duration of malfunction
                                            )

    # Custom observation builder
    TreeObservation = TreeObsForRailEnv(max_depth=2)

    # Different agent types (trains) with different speeds.
    speed_ratio_map = {1.: 0.,  # Fast passenger train
                       1. / 2.: 1.0,  # Fast freight train
                       1. / 3.: 0.0,  # Slow commuter train
                       1. / 4.: 0.0}  # Slow freight train

    env = RailEnv(width=x_dim,
                  height=y_dim,
                  rail_generator=sparse_rail_generator(max_num_cities=3,  # Number of cities in map (where train stations are)
                                                       seed=1,  # Random seed
                                                       grid_mode=False,
                                                       max_rails_between_cities=2,
                                                       max_rails_in_city=3),
                  schedule_generator=sparse_schedule_generator(speed_ratio_map),
                  number_of_agents=n_agents,
                  malfunction_generator_and_process_data=malfunction_from_params(stochastic_data),  # Malfunction data generator
                  obs_builder_object=TreeObservation)

    # After training we want to render the results so we also load a renderer
    env_renderer = RenderTool(env, gl="PILSVG", )

    # Given the depth of the tree observation and the number of features per node we get the following state_size
    num_features_per_node = env.obs_builder.observation_dim
    tree_depth = 2
    nr_nodes = 0
    for i in range(tree_depth + 1):
        nr_nodes += np.power(4, i)
    state_size = num_features_per_node * nr_nodes

    # The action space of flatland is 5 discrete actions
    action_size = 5

    # We set the number of episodes we would like to train on
    if 'n_trials' not in locals():
        n_trials = 50

    # And the max number of steps we want to take per episode
    max_steps = int(3 * (env.height + env.width))

    # Define training parameters
    eps = 1.
    eps_end = 0.005
    eps_decay = 0.998

    # And some variables to keep track of the progress
    action_dict = dict()
    final_action_dict = dict()
    scores_window = deque(maxlen=100)
    done_window = deque(maxlen=100)
    scores = []
    dones_list = []
    action_prob = [0] * action_size
    agent_obs = [None] * n_agents
    agent_next_obs = [None] * n_agents
    agent_obs_buffer = [None] * n_agents
    agent_action_buffer = [2] * n_agents
    cumulated_reward = np.zeros(n_agents)
    update_values = False
    # Now we load a Double dueling DQN agent
    agent = Agent(state_size, action_size)

    for trials in range(1, n_trials + 1):

        # Reset environment
        obs, info = env.reset(False, False)
        # env_renderer.reset()
        # Build agent specific observations
        for a in range(env.get_num_agents()):
            if obs[a]:
                agent_obs[a] = normalize_observation(obs[a], tree_depth, observation_radius=10)
                agent_obs_buffer[a] = agent_obs[a].copy()

        # Reset score and done
        score = 0
        env_done = 0

        # Run episode
        for step in range(max_steps):
            # Action
            for a in range(env.get_num_agents()):
                if info['action_required'][a]:
                    # If an action is required, we want to store the obs at that step as well as the action
                    update_values = True
                    action = agent.act(agent_obs[a], eps=eps)
                    action_prob[action] += 1
                else:
                    update_values = False
                    action = 0
                action_dict.update({a: action})

            # Environment step
            next_obs, all_rewards, done, info = env.step(action_dict)

            # Update replay buffer and train agent
            for a in range(env.get_num_agents()):
                # Only update the values when we are done or when an action was taken and thus relevant information is present
                if update_values or done[a]:
                    agent.step(agent_obs_buffer[a], agent_action_buffer[a],
                               all_rewards[a], agent_obs[a], done[a])
                    cumulated_reward[a] = 0.

                    agent_obs_buffer[a] = agent_obs[a].copy()
                    agent_action_buffer[a] = action_dict[a]
                if next_obs[a]:
                    agent_obs[a] = normalize_observation(next_obs[a], tree_depth, observation_radius=10)

                score += all_rewards[a] / env.get_num_agents()

            # env_renderer.render_env(show=True, show_observations=False, show_predictions=False)
            # Copy observation
            if done['__all__']:
                env_done = 1
                break

        # Epsilon decay
        eps = max(eps_end, eps_decay * eps)  # decrease epsilon

        # Collect information about training
        tasks_finished = 0
        for _idx in range(env.get_num_agents()):
            if done[_idx] == 1:
                tasks_finished += 1
        done_window.append(tasks_finished / max(1, env.get_num_agents()))
        scores_window.append(score / max_steps)  # save most recent score
        scores.append(np.mean(scores_window))
        dones_list.append((np.mean(done_window)))

        print(
            '\rTraining {} Agents on ({},{}).\t Episode {}\t Average Score: {:.3f}\tDones: {:.2f}%\tEpsilon: {:.2f} \t Action Probabilities: \t {}'.format(
                env.get_num_agents(), x_dim, y_dim,
                trials,
                np.mean(scores_window),
                100 * np.mean(done_window),
                eps, action_prob / np.sum(action_prob)), end=" ")

        if trials % 100 == 0:
            print(
                '\rTraining {} Agents on ({},{}).\t Episode {}\t Average Score: {:.3f}\tDones: {:.2f}%\tEpsilon: {:.2f} \t Action Probabilities: \t {}'.format(
                    env.get_num_agents(), x_dim, y_dim,
                    trials,
                    np.mean(scores_window),
                    100 * np.mean(done_window),
                    eps, action_prob / np.sum(action_prob)))
            torch.save(agent.qnetwork_local.state_dict(),
                       './Nets/navigator_checkpoint' + str(trials) + '.pth')
            action_prob = [1] * action_size
def main():
    np.random.seed(1)

    env = RailEnv(
        width=flags.grid_width, height=flags.grid_height,
        number_of_agents=flags.num_agents,
        rail_generator=rail_generator,
        schedule_generator=schedule_generator,
        malfunction_generator_and_process_data=malfunction_from_params(
            MalfunctionParameters(1 / 8000, 15, 50)),
        obs_builder_object=TreeObservation(max_depth=flags.tree_depth))

    # After training we want to render the results, so we also load a renderer
    env_renderer = RenderTool(env, gl="PILSVG")

    # Calculate the state size based on the number of nodes in the tree observation
    num_features_per_node = env.obs_builder.observation_dim
    num_nodes = sum(np.power(4, i) for i in range(flags.tree_depth + 1))
    state_size = num_nodes * num_features_per_node
    action_size = 5

    # Now we load a double dueling DQN agent and initialize it from the checkpoint
    agent = Agent(state_size, action_size)
    if flags.load_from_checkpoint:
        start, eps = agent.load(project_root / 'checkpoints', 0, 1.0)
    else:
        start, eps = 0, 1.0

    # And some variables to keep track of the progress
    action_dict, final_action_dict = {}, {}
    scores_window = deque(maxlen=200)
    steps_window = deque(maxlen=200)
    done_window = deque(maxlen=200)
    action_prob = [0] * action_size
    agent_obs = [None] * flags.num_agents
    agent_obs_buffer = [None] * flags.num_agents
    agent_action_buffer = [2] * flags.num_agents
    # Track per agent whether fresh information was stored for the replay buffer
    update_values = [False] * flags.num_agents
    max_steps = int(8 * (flags.grid_width + flags.grid_height))
    start_time = time.time()

    # We don't want to retrain on old railway networks when we restart from a checkpoint,
    # so we just loop through the generators to get all the old networks out of the way
    if start > 0:
        print(f"Skipping {start} railways")
        for _ in range(0, start):
            rail_generator()
            schedule_generator()

    # Start the training loop
    for episode in range(start + 1, flags.num_episodes + 1):
        env_renderer.reset()
        obs, info = env.reset(True, True)
        score, steps_taken = 0, 0

        # Build agent specific observations
        for a in range(flags.num_agents):
            if obs[a]:
                agent_obs[a] = normalize_observation(obs[a], flags.tree_depth)
                agent_obs_buffer[a] = agent_obs[a].copy()

        # Run episode
        for step in range(max_steps):
            for a in range(flags.num_agents):
                if info['action_required'][a]:
                    # If an action is required, we want to store the obs at that step as well as the action
                    update_values[a] = True
                    action = agent.act(agent_obs[a], eps=eps)
                    action_dict[a] = action
                    action_prob[action] += 1
                    steps_taken += 1
                else:
                    update_values[a] = False
                    action_dict[a] = 2

            # Environment step
            obs, all_rewards, done, info = env.step(action_dict)

            # Update replay buffer and train agent
            for a in range(flags.num_agents):
                # Only update the values when we are done or when an action was taken
                # and thus relevant information is present
                if update_values[a] or done[a]:
                    agent.step(agent_obs_buffer[a], agent_action_buffer[a],
                               all_rewards[a], agent_obs[a], done[a], flags.train)
                    agent_obs_buffer[a] = agent_obs[a].copy()
                    agent_action_buffer[a] = action_dict[a]
                if obs[a]:
                    agent_obs[a] = normalize_observation(obs[a], flags.tree_depth)

                score += all_rewards[a] / flags.num_agents

            # Render
            if flags.render_interval and episode % flags.render_interval == 0:
                render(env_renderer)
            if done['__all__']:
                break

        # Epsilon decay
        eps = max(0.01, flags.epsilon_decay * eps)

        # Save some training statistics in their respective deques
        tasks_finished = sum(done[i] for i in range(flags.num_agents))
        done_window.append(tasks_finished / max(1, flags.num_agents))
        scores_window.append(score / max_steps)
        steps_window.append(steps_taken)
        action_probs = ', '.join(f'{x:.3f}' for x in action_prob / np.sum(action_prob))

        print(
            f'\rTraining {flags.num_agents} Agents on ({flags.grid_width},{flags.grid_height}) \t '
            f'Episode {episode} \t '
            f'Average Score: {np.mean(scores_window):.3f} \t '
            f'Average Steps Taken: {np.mean(steps_window):.1f} \t '
            f'Dones: {100 * np.mean(done_window):.2f}% \t '
            f'Epsilon: {eps:.2f} \t '
            f'Action Probabilities: {action_probs}', end=" ")

        if episode % flags.report_interval == 0:
            print(
                f'\rTraining {flags.num_agents} Agents on ({flags.grid_width},{flags.grid_height}) \t '
                f'Episode {episode} \t '
                f'Average Score: {np.mean(scores_window):.3f} \t '
                f'Average Steps Taken: {np.mean(steps_window):.1f} \t '
                f'Dones: {100 * np.mean(done_window):.2f}% \t '
                f'Epsilon: {eps:.2f} \t '
                f'Action Probabilities: {action_probs} \t '
                f'Time taken: {time.time() - start_time:.2f}s')
            if flags.train:
                agent.save(project_root / 'checkpoints', episode, eps)
            start_time = time.time()
            # Reset with ones (not zeros) so the normalisation above never divides by zero
            action_prob = [1] * action_size
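# The render() helper used in the loop above is not defined in this snippet.
# A minimal sketch of what it might look like, assuming the standard flatland
# RenderTool API (render_env is a real method; the helper itself is hypothetical):
def render(env_renderer):
    # Draw the current state of the environment without observation overlays
    env_renderer.render_env(show=True, frames=False, show_observations=False)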
from flatland.envs.agent_utils import RailAgentStatus
from flatland.envs.malfunction_generators import MalfunctionParameters, ParamMalfunctionGen
from flatland.envs.observations import TreeObsForRailEnv
from flatland.envs.rail_env import RailEnv
from flatland.envs.rail_generators import random_rail_generator, complex_rail_generator, sparse_rail_generator
from flatland.envs.schedule_generators import sparse_schedule_generator
from flatland.utils.rendertools import RenderTool

rail_generator = sparse_rail_generator(seed=0,
                                       max_num_cities=4,
                                       grid_mode=False,
                                       max_rails_between_cities=2,
                                       max_rails_in_city=2)
malfunction_generator = ParamMalfunctionGen(
    MalfunctionParameters(malfunction_rate=10, min_duration=20, max_duration=50))

# All trains run at full speed
speed_ratio_map = {1: 1}
schedule_generator = sparse_schedule_generator(speed_ratio_map)

n_agents = 5
env = RailEnv(
    width=25, height=25,
    rail_generator=rail_generator,
    schedule_generator=schedule_generator,
    number_of_agents=n_agents,
    malfunction_generator=malfunction_generator,
    obs_builder_object=TreeObsForRailEnv(max_depth=2))  # max_depth assumed; the original snippet is truncated here
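# A minimal usage sketch for the environment built above (not part of the
# original snippet): reset it and drive all agents forward for a few ticks.
obs, info = env.reset()
for _ in range(10):
    # RailEnvActions.MOVE_FORWARD == 2
    actions = {handle: 2 for handle in range(n_agents)}
    obs, rewards, done, info = env.step(actions)
    if done['__all__']:
        break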
def train():
    seed = 14
    namefile = "d3qn_1_mm5"
    print("Running {}".format(namefile))

    environment_parameters = {
        "n_agents": 5,
        "x_dim": 16 * 3,
        "y_dim": 9 * 3,
        "n_cities": 5,
        "max_rails_between_cities": 2,
        "max_rails_in_city": 3,
        "seed": seed,
        "observation_tree_depth": 2,
        "observation_radius": 10,
        "observation_max_path_depth": 30,
        # Malfunctions
        "malfunction_parameters": MalfunctionParameters(
            malfunction_rate=0.005,
            min_duration=15,
            max_duration=50),
        # Speeds
        "speed_profiles": {
            1.: 0.25,
            1. / 2.: 0.25,
            1. / 3.: 0.25,
            1. / 4.: 0.25},

        # ============================
        # Custom observations & rewards
        # ============================
        "custom_observations": False,
        "reward_shaping": True,
        "uniform_reward": True,
        "stop_penalty": -0.2,
        "invalid_action_penalty": -0.0,
        "deadlock_penalty": -5.0,
        # 1.0 for skipping
        "shortest_path_penalty_coefficient": 1.2,
        "done_bonus": 0.2,
    }

    training_parameters = {
        # ============================
        # Network architecture
        # ============================
        "double_dqn": True,
        "shared": False,
        "hidden_size": 256,
        "hidden_layers": 2,
        "update_every": 16,
        "type": 1,
        # Epsilon-greedy decay regulators
        "eps_decay": 0.99739538258046,
        "eps_start": 1.0,
        "eps_end": 0.02,
        "learning_rate": 0.52e-4,
        # To compute Q targets
        "gamma": 0.99,
        # To compute target network soft update
        "tau": 1e-3,

        # ============================
        # Training setup
        # ============================
        "n_episodes": 15000,
        "batch_size": 32,
        # Minimum number of samples to start learning
        "buffer_min_size": 0,
        "fingerprints": True,
        # If not set, the default value is the standard FingerprintType.EPSILON_STEP
        "fingerprint_type": FingerprintType.EPSILON_EPISODE,

        # ============================
        # Memory
        # ============================
        # Memory maximum size
        "buffer_size": int(1e6),
        # Memory type: uer or per
        "memory_type": "per",

        # ============================
        # Saving and rendering
        # ============================
        "checkpoint_interval": 500,
        "evaluation_mode": False,
        "eval_episodes": 25,
        "use_gpu": False,
        "render": False,
        "print_stats": True,
        "wandb_project": "flatland-challenge-final",
        "wandb_entity": "fiorenzoparascandolo",
        "wandb_tag": "d3qn",
        "save_model_path": "/content/drive/My Drive/Colab Notebooks/models/" + namefile + ".pt",
        "load_model_path": "/content/drive/My Drive/Colab Notebooks/models/todo.pt",
        "tensorboard_path": "/content/drive/My Drive/Colab Notebooks/logs/logs" + namefile + "/",
        "automatic_name_saving": True,

        # ============================
        # Action Masking / Skipping
        # ============================
        "action_masking": True,
        "allow_no_op": False
    }

    if training_parameters["evaluation_mode"]:
        eval_policy(Namespace(**environment_parameters),
                    Namespace(**training_parameters))
    else:
        train_multiple_agents(Namespace(**environment_parameters),
                              Namespace(**training_parameters))
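# A quick sanity check on the eps_decay value above (illustrative only, not
# part of the original code): with multiplicative decay, epsilon falls from
# eps_start to eps_end after n = ln(eps_end / eps_start) / ln(eps_decay) episodes.
import math

eps_start, eps_end, eps_decay = 1.0, 0.02, 0.99739538258046
n = math.log(eps_end / eps_start) / math.log(eps_decay)
print(round(n))  # ~1500 episodes of pure decay before epsilon bottoms out at eps_end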
def train_agent(env_params, train_params):
    # Environment parameters
    n_agents = env_params.n_agents
    x_dim = env_params.x_dim
    y_dim = env_params.y_dim
    n_cities = env_params.n_cities
    max_rails_between_cities = env_params.max_rails_between_cities
    max_rails_in_city = env_params.max_rails_in_city
    seed = env_params.seed

    # Observation parameters
    observation_tree_depth = env_params.observation_tree_depth
    observation_radius = env_params.observation_radius
    observation_max_path_depth = env_params.observation_max_path_depth

    # Training parameters
    eps_start = train_params.eps_start
    eps_end = train_params.eps_end
    eps_decay = train_params.eps_decay
    n_episodes = train_params.n_episodes
    checkpoint_interval = train_params.checkpoint_interval
    n_eval_episodes = train_params.n_evaluation_episodes

    # Set the seeds
    random.seed(seed)
    np.random.seed(seed)

    # Break agents from time to time
    malfunction_parameters = MalfunctionParameters(
        malfunction_rate=1. / 10000,  # Rate of malfunctions
        min_duration=15,  # Minimal duration
        max_duration=50  # Max duration
    )

    # Observation builder
    predictor = ShortestPathPredictorForRailEnv(observation_max_path_depth)
    tree_observation = TreeObsForRailEnv(max_depth=observation_tree_depth, predictor=predictor)

    # Fraction of trains with each speed profile
    speed_profiles = {
        1.: 1.0,  # Fast passenger train
        1. / 2.: 0.0,  # Fast freight train
        1. / 3.: 0.0,  # Slow commuter train
        1. / 4.: 0.0  # Slow freight train
    }

    # Setup the environment
    env = RailEnv(
        width=x_dim, height=y_dim,
        rail_generator=sparse_rail_generator(
            max_num_cities=n_cities,
            grid_mode=False,
            max_rails_between_cities=max_rails_between_cities,
            max_rails_in_city=max_rails_in_city),
        schedule_generator=sparse_schedule_generator(speed_profiles),
        number_of_agents=n_agents,
        malfunction_generator_and_process_data=malfunction_from_params(malfunction_parameters),
        obs_builder_object=tree_observation,
        random_seed=seed)

    env.reset(regenerate_schedule=True, regenerate_rail=True)

    # Setup renderer
    if train_params.render:
        env_renderer = RenderTool(env, gl="PGL")

    # Calculate the state size given the depth of the tree observation and the number of features
    n_features_per_node = env.obs_builder.observation_dim
    n_nodes = sum(np.power(4, i) for i in range(observation_tree_depth + 1))
    state_size = n_features_per_node * n_nodes

    # The action space of flatland is 5 discrete actions
    action_size = 5

    # Max number of steps per episode
    # This is the official formula used during evaluations
    # See details in flatland.envs.schedule_generators.sparse_schedule_generator
    max_steps = int(4 * 2 * (env.height + env.width + (n_agents / n_cities)))

    action_count = [0] * action_size
    action_dict = dict()
    agent_obs = [None] * env.get_num_agents()
    agent_prev_obs = [None] * env.get_num_agents()
    agent_prev_action = [2] * env.get_num_agents()
    # Track per agent whether fresh information was stored for the replay buffer
    update_values = [False] * env.get_num_agents()

    smoothed_normalized_score = -1.0
    smoothed_eval_normalized_score = -1.0
    smoothed_completion = 0.0
    smoothed_eval_completion = 0.0

    # Double Dueling DQN policy
    policy = DDDQNPolicy(state_size, action_size, train_params)

    # TensorBoard writer
    writer = SummaryWriter()
    writer.add_hparams(vars(train_params), {})
    writer.add_hparams(vars(env_params), {})

    training_timer = Timer()
    training_timer.start()

    print("\n🚉 Training {} trains on {}x{} grid for {} episodes, evaluating on {} episodes every {} episodes.\n"
          .format(env.get_num_agents(), x_dim, y_dim, n_episodes, n_eval_episodes, checkpoint_interval))

    for episode_idx in range(n_episodes + 1):
        # Timers
        step_timer = Timer()
        reset_timer = Timer()
        learn_timer = Timer()
        preproc_timer = Timer()

        # Reset environment
        reset_timer.start()
        obs, info = env.reset(regenerate_rail=True, regenerate_schedule=True)
        reset_timer.end()

        if train_params.render:
            env_renderer.set_new_rail()

        score = 0
        nb_steps = 0
        actions_taken = []

        # Build agent specific observations
        for agent in env.get_agent_handles():
            if obs[agent]:
                agent_obs[agent] = normalize_observation(
                    obs[agent], observation_tree_depth, observation_radius=observation_radius)
                agent_prev_obs[agent] = agent_obs[agent].copy()

        # Run episode
        for step in range(max_steps - 1):
            for agent in env.get_agent_handles():
                if info['action_required'][agent]:
                    # If an action is required, we want to store the obs at that step as well as the action
                    update_values[agent] = True
                    action = policy.act(agent_obs[agent], eps=eps_start)
                    action_count[action] += 1
                    actions_taken.append(action)
                else:
                    update_values[agent] = False
                    action = 0
                action_dict.update({agent: action})

            # Environment step
            step_timer.start()
            next_obs, all_rewards, done, info = env.step(action_dict)
            step_timer.end()

            if train_params.render and episode_idx % checkpoint_interval == 0:
                env_renderer.render_env(show=True, frames=False,
                                        show_observations=False, show_predictions=False)

            for agent in range(env.get_num_agents()):
                # Update replay buffer and train agent
                # Only update the values when we are done or when an action was taken
                # and thus relevant information is present
                if update_values[agent] or done[agent]:
                    learn_timer.start()
                    policy.step(agent_prev_obs[agent], agent_prev_action[agent],
                                all_rewards[agent], agent_obs[agent], done[agent])
                    learn_timer.end()

                    agent_prev_obs[agent] = agent_obs[agent].copy()
                    agent_prev_action[agent] = action_dict[agent]

                # Preprocess the new observations
                if next_obs[agent]:
                    preproc_timer.start()
                    agent_obs[agent] = normalize_observation(
                        next_obs[agent], observation_tree_depth, observation_radius=observation_radius)
                    preproc_timer.end()

                score += all_rewards[agent]

            nb_steps = step

            if done['__all__']:
                break

        # Epsilon decay
        eps_start = max(eps_end, eps_decay * eps_start)

        # Collect information about training
        tasks_finished = sum(done[idx] for idx in env.get_agent_handles())
        completion = tasks_finished / max(1, env.get_num_agents())
        normalized_score = score / (max_steps * env.get_num_agents())
        action_probs = action_count / np.sum(action_count)
        # Reset with ones (not zeros) so the normalisation above never divides by zero
        action_count = [1] * action_size

        # Smoothed values for terminal display and for more stable hyper-parameter tuning
        smoothing = 0.99
        smoothed_normalized_score = smoothed_normalized_score * smoothing + normalized_score * (1.0 - smoothing)
        smoothed_completion = smoothed_completion * smoothing + completion * (1.0 - smoothing)

        # Save checkpoints
        if episode_idx % checkpoint_interval == 0:
            torch.save(policy.qnetwork_local,
                       './checkpoints/origin_multi-' + str(episode_idx) + '.pth')
            if train_params.render:
                env_renderer.close_window()

        # Print logs
        print('\r🚂 Episode {}'
              '\t 🏆 Score: {:.3f}'
              ' Avg: {:.3f}'
              '\t 💯 Done: {:.2f}%'
              ' Avg: {:.2f}%'
              '\t 🎲 Epsilon: {:.2f} '
              '\t 🔀 Action Probs: {}'.format(
                  episode_idx,
                  normalized_score, smoothed_normalized_score,
                  100 * completion, 100 * smoothed_completion,
                  eps_start,
                  format_action_prob(action_probs)), end=" ")

        # Evaluate policy
        if episode_idx % train_params.checkpoint_interval == 0:
            scores, completions, nb_steps_eval = eval_policy(env, policy, n_eval_episodes, max_steps)
            writer.add_scalar("evaluation/scores_min", np.min(scores), episode_idx)
            writer.add_scalar("evaluation/scores_max", np.max(scores), episode_idx)
            writer.add_scalar("evaluation/scores_mean", np.mean(scores), episode_idx)
            writer.add_scalar("evaluation/scores_std", np.std(scores), episode_idx)
            writer.add_histogram("evaluation/scores", np.array(scores), episode_idx)
            writer.add_scalar("evaluation/completions_min", np.min(completions), episode_idx)
            writer.add_scalar("evaluation/completions_max", np.max(completions), episode_idx)
            writer.add_scalar("evaluation/completions_mean", np.mean(completions), episode_idx)
            writer.add_scalar("evaluation/completions_std", np.std(completions), episode_idx)
            writer.add_histogram("evaluation/completions", np.array(completions), episode_idx)
            writer.add_scalar("evaluation/nb_steps_min", np.min(nb_steps_eval), episode_idx)
            writer.add_scalar("evaluation/nb_steps_max", np.max(nb_steps_eval), episode_idx)
            writer.add_scalar("evaluation/nb_steps_mean", np.mean(nb_steps_eval), episode_idx)
            writer.add_scalar("evaluation/nb_steps_std", np.std(nb_steps_eval), episode_idx)
            writer.add_histogram("evaluation/nb_steps", np.array(nb_steps_eval), episode_idx)

            smoothing = 0.9
            smoothed_eval_normalized_score = smoothed_eval_normalized_score * smoothing + np.mean(scores) * (1.0 - smoothing)
            smoothed_eval_completion = smoothed_eval_completion * smoothing + np.mean(completions) * (1.0 - smoothing)
            writer.add_scalar("evaluation/smoothed_score", smoothed_eval_normalized_score, episode_idx)
            writer.add_scalar("evaluation/smoothed_completion", smoothed_eval_completion, episode_idx)

        # Save logs to TensorBoard
        writer.add_scalar("training/score", normalized_score, episode_idx)
        writer.add_scalar("training/smoothed_score", smoothed_normalized_score, episode_idx)
        writer.add_scalar("training/completion", completion, episode_idx)
        writer.add_scalar("training/smoothed_completion", smoothed_completion, episode_idx)
        writer.add_scalar("training/nb_steps", nb_steps, episode_idx)
        writer.add_histogram("actions/distribution", np.array(actions_taken), episode_idx)
        writer.add_scalar("actions/nothing", action_probs[RailEnvActions.DO_NOTHING], episode_idx)
        writer.add_scalar("actions/left", action_probs[RailEnvActions.MOVE_LEFT], episode_idx)
        writer.add_scalar("actions/forward", action_probs[RailEnvActions.MOVE_FORWARD], episode_idx)
        writer.add_scalar("actions/right", action_probs[RailEnvActions.MOVE_RIGHT], episode_idx)
        writer.add_scalar("actions/stop", action_probs[RailEnvActions.STOP_MOVING], episode_idx)
        writer.add_scalar("training/epsilon", eps_start, episode_idx)
        writer.add_scalar("training/buffer_size", len(policy.memory), episode_idx)
        writer.add_scalar("training/loss", policy.loss, episode_idx)
        writer.add_scalar("timer/reset", reset_timer.get(), episode_idx)
        writer.add_scalar("timer/step", step_timer.get(), episode_idx)
        writer.add_scalar("timer/learn", learn_timer.get(), episode_idx)
        writer.add_scalar("timer/preproc", preproc_timer.get(), episode_idx)
        writer.add_scalar("timer/total", training_timer.get_current(), episode_idx)
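# format_action_prob() is referenced above but not defined in this snippet.
# A plausible minimal sketch in the starter-kit style (hypothetical helper):
# it prints one symbol and probability per action, in RailEnvActions order.
def format_action_prob(action_probs):
    action_probs = np.round(action_probs, 3)
    actions = ["↻", "←", "↑", "→", "◼"]  # do nothing, left, forward, right, stop
    buffer = ""
    for action, action_prob in zip(actions, action_probs):
        buffer += action + " " + "{:.3f}".format(action_prob) + " "
    return buffer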
# We need to either load in some pre-generated railways from disk, or else create a random railway generator.
if flags.load_railways:
    rail_generator, schedule_generator = load_precomputed_railways(project_root, flags)
else:
    rail_generator, schedule_generator = create_random_railways(project_root)

# Create the Flatland environment
env = RailEnv(width=flags.grid_width, height=flags.grid_height,
              number_of_agents=flags.num_agents,
              rail_generator=rail_generator,
              schedule_generator=schedule_generator,
              malfunction_generator_and_process_data=malfunction_from_params(
                  MalfunctionParameters(1 / 8000, 15, 50)),
              obs_builder_object=TreeObservation(max_depth=flags.tree_depth))

# After training we want to render the results, so we also load a renderer
env_renderer = RenderTool(env, gl="PILSVG",
                          screen_width=800, screen_height=800,
                          agent_render_variant=AgentRenderVariant.AGENT_SHOWS_OPTIONS_AND_BOX)

# Calculate the state size based on the number of nodes in the tree observation
num_features_per_node = env.obs_builder.observation_dim
num_nodes = sum(np.power(4, i) for i in range(flags.tree_depth + 1))
state_size = num_nodes * num_features_per_node
action_size = 5
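# A worked example of the state-size formula above (illustrative only,
# assuming the 11 features per node of the classic tree observation):
# a tree of depth 2 has 1 + 4 + 16 = 21 nodes, so the flattened state
# has 21 * 11 = 231 entries.
tree_depth = 2
features_per_node = 11  # assumed observation_dim of TreeObsForRailEnv
nodes = sum(4 ** i for i in range(tree_depth + 1))  # 21
print(nodes * features_per_node)  # 231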
def train():
    seed = 14
    namefile = "psppo_" + datetime.now().strftime("%m_%d_%Y_%H_%M_%S")
    print("Running {}".format(namefile))

    environment_parameters = {
        "n_agents": 5,
        "x_dim": 16 * 3,
        "y_dim": 9 * 3,
        "n_cities": 5,
        "max_rails_between_cities": 2,
        "max_rails_in_city": 3,
        "seed": seed,
        "observation_tree_depth": 2,
        "observation_radius": 10,
        "observation_max_path_depth": 30,
        # Malfunctions
        "malfunction_parameters": MalfunctionParameters(
            malfunction_rate=0.005,
            min_duration=15,
            max_duration=50),
        # Speeds
        "speed_profiles": {
            1.: 0.25,
            1. / 2.: 0.25,
            1. / 3.: 0.25,
            1. / 4.: 0.25},

        # ============================
        # Custom observations & rewards
        # ============================
        "custom_observations": False,
        "reward_shaping": True,
        "uniform_reward": True,
        "stop_penalty": -0.2,
        "invalid_action_penalty": -0.0,
        "deadlock_penalty": -5.0,
        # 1.0 for skipping
        "shortest_path_penalty_coefficient": 1.2,
        "done_bonus": 0.2,
    }

    training_parameters = {
        # ============================
        # Network architecture
        # ============================
        # Shared actor-critic layer
        # If shared is True then the considered sizes are taken from the critic
        "shared": False,
        "shared_recurrent": True,
        "linear_size": 128,
        "hidden_size": 64,
        # Critic network
        "critic_mlp_width": 128,
        "critic_mlp_depth": 3,
        "last_critic_layer_scaling": 0.1,
        # Actor network
        "actor_mlp_width": 128,
        "actor_mlp_depth": 3,
        "last_actor_layer_scaling": 0.01,
        # Adam learning rate
        "learning_rate": 0.002,
        # Adam epsilon
        "adam_eps": 1e-5,
        # Activation
        "activation": "Tanh",
        "lmbda": 0.95,
        "entropy_coefficient": 0.01,
        # Also called baseline cost in the shared setting (0.5)
        # (C54): {0.001, 0.1, 1.0, 10.0, 100.0}
        "value_loss_coefficient": 0.001,

        # ============================
        # Training setup
        # ============================
        "n_episodes": 500,
        "horizon": 2048,
        "epochs": 8,
        # 64, 128, 256
        "batch_size": 256,
        "batch_mode": "shuffle",

        # ============================
        # Normalization and clipping
        # ============================
        # Discount factor (0.95, 0.97, 0.99, 0.999)
        "discount_factor": 0.99,
        "max_grad_norm": 0.5,
        # PPO-style value clipping
        "eps_clip": 0.3,

        # ============================
        # Advantage estimation
        # ============================
        # gae or n-steps
        "advantage_estimator": "gae",

        # ============================
        # Optimization and rendering
        # ============================
        # Save and evaluate interval
        "checkpoint_interval": 100,
        "evaluation_mode": False,
        "eval_episodes": 500,
        "use_gpu": False,
        "render": False,
        "print_stats": True,
        "wandb_project": "flatland-challenge-ps-ppo-test",
        "wandb_entity": "fiorenzoparascandolo",
        "wandb_tag": "ps-ppo",
        "save_model_path": namefile + ".pt",
        "load_model_path": namefile + ".pt",
        "automatic_name_saving": True,
        "tensorboard_path": "log_" + namefile + "/",

        # ============================
        # Action Masking / Skipping
        # ============================
        "action_masking": True,
        "allow_no_op": False
    }

    """
    # Save on Google Drive on Colab
    "save_model_path": "/content/drive/My Drive/Colab Notebooks/models/" + namefile + ".pt",
    "load_model_path": "/content/drive/My Drive/Colab Notebooks/models/todo.pt",
    "tensorboard_path": "/content/drive/My Drive/Colab Notebooks/logs/logs" + namefile + "/",
    """

    """
    # Mount Drive on Colab
    from google.colab import drive
    drive.mount("/content/drive", force_remount=True)

    # Show Tensorboard on Colab
    import tensorflow
    %load_ext tensorboard
    %tensorboard --logdir "/content/drive/My Drive/Colab Notebooks/logs_todo"
    """

    if training_parameters["evaluation_mode"]:
        eval_policy(Namespace(**environment_parameters),
                    Namespace(**training_parameters))
    else:
        train_multiple_agents(Namespace(**environment_parameters),
                              Namespace(**training_parameters))
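# The config above selects "gae" as the advantage estimator. A minimal,
# self-contained sketch of Generalized Advantage Estimation under those
# hyper-parameters (discount_factor=0.99, lmbda=0.95); this helper is
# illustrative and not part of the original training code:
import numpy as np

def gae_advantages(rewards, values, dones, gamma=0.99, lam=0.95):
    # values carries one extra bootstrap entry: len(values) == len(rewards) + 1
    advantages = np.zeros(len(rewards))
    gae = 0.0
    for t in reversed(range(len(rewards))):
        mask = 1.0 - float(dones[t])  # zero out the bootstrap at episode ends
        delta = rewards[t] + gamma * values[t + 1] * mask - values[t]
        gae = delta + gamma * lam * mask * gae
        advantages[t] = gae
    return advantages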