def demo(args=None):
    """Demo script to check installation"""
    env = RailEnv(width=15,
                  height=15,
                  rail_generator=complex_rail_generator(nr_start_goal=10,
                                                        nr_extra=1,
                                                        min_dist=8,
                                                        max_dist=99999),
                  schedule_generator=complex_schedule_generator(),
                  number_of_agents=5)
    env._max_episode_steps = int(15 * (env.width + env.height))
    env_renderer = RenderTool(env)

    while True:
        obs, info = env.reset()
        _done = False
        # Run a single episode here
        step = 0
        while not _done:
            # Compute Action
            _action = {}
            for _idx, _ in enumerate(env.agents):
                _action[_idx] = np.random.randint(0, 5)
            obs, all_rewards, done, _ = env.step(_action)
            _done = done['__all__']
            step += 1
            env_renderer.render_env(show=True,
                                    frames=False,
                                    show_observations=False,
                                    show_predictions=False)
            time.sleep(0.3)
    return 0
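# Hedged usage sketch (not part of the original snippet): the demo is a plain
# function, so it can be run directly once numpy, time and the flatland imports
# used above (RailEnv, RenderTool, complex_rail_generator, complex_schedule_generator)
# are available at module level.
if __name__ == "__main__":
    demo()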
def render_env(env, fname):
    env_renderer = RenderTool(env, gl="PGL")
    env_renderer.render_env()
    image = env_renderer.get_image()
    pil_image = PIL.Image.fromarray(image)
    pil_image.save(fname)
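# Hedged usage sketch (assumption, not from the original source): write one frame
# of a freshly generated environment to disk. The constructor mirrors the RailEnv
# calls used elsewhere in this file; requires PIL plus the flatland imports assumed above.
# snapshot_env = RailEnv(width=25, height=25,
#                        rail_generator=sparse_rail_generator(),
#                        number_of_agents=2)
# snapshot_env.reset()
# render_env(snapshot_env, "env_snapshot.png")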
def render_test(parameters, test_nr=0, nr_examples=5):
    for trial in range(nr_examples):
        # Reset the env
        print('Showing {} Level {} with (x_dim,y_dim) = ({},{}) and {} Agents.'.format(
            test_nr, trial, parameters[0], parameters[1], parameters[2]))
        file_name = "./Tests/{}/Level_{}.pkl".format(test_nr, trial)
        env = RailEnv(width=1,
                      height=1,
                      rail_generator=rail_from_file(file_name),
                      obs_builder_object=TreeObsForRailEnv(max_depth=2),
                      number_of_agents=1,
                      )
        env_renderer = RenderTool(env, gl="PILSVG")
        env_renderer.set_new_rail()
        env.reset(False, False)
        env_renderer.render_env(show=True, show_observations=False)
        time.sleep(0.1)
        env_renderer.close_window()
    return
def test_path_not_exists(rendering=False):
    rail, rail_map = make_simple_rail_unconnected()
    env = RailEnv(
        width=rail_map.shape[1],
        height=rail_map.shape[0],
        rail_generator=rail_from_grid_transition_map(rail),
        schedule_generator=random_schedule_generator(),
        number_of_agents=1,
        obs_builder_object=TreeObsForRailEnv(
            max_depth=2, predictor=ShortestPathPredictorForRailEnv()),
    )
    env.reset()

    check_path(
        env,
        rail,
        (5, 6),  # south dead-end
        0,  # north
        (0, 3),  # north dead-end
        False)

    if rendering:
        renderer = RenderTool(env, gl="PILSVG")
        renderer.render_env(show=True, show_observations=False)
        input("Continue?")
def replay_verify(max_episode_steps: int, ctl: ControllerFromTrainRuns,
                  env: RailEnv, rendering: bool):
    """Replays this deterministic `ActionPlan` and verifies whether it is feasible."""
    if rendering:
        renderer = RenderTool(env,
                              gl="PILSVG",
                              agent_render_variant=AgentRenderVariant.AGENT_SHOWS_OPTIONS_AND_BOX,
                              show_debug=True,
                              clear_debug_text=True,
                              screen_height=1000,
                              screen_width=1000)
        renderer.render_env(show=True,
                            show_observations=False,
                            show_predictions=False)
    i = 0
    while not env.dones['__all__'] and i <= max_episode_steps:
        for agent_id, agent in enumerate(env.agents):
            way_point: WayPoint = ctl.get_way_point_before_or_at_step(agent_id, i)
            assert agent.position == way_point.position, \
                "before {}, agent {} at {}, expected {}".format(
                    i, agent_id, agent.position, way_point.position)
        actions = ctl.act(i)
        print("actions for {}: {}".format(i, actions))

        obs, all_rewards, done, _ = env.step(actions)

        if rendering:
            renderer.render_env(show=True,
                                show_observations=False,
                                show_predictions=False)
        i += 1
def evaluate(n_episodes):
    run = SUBMISSIONS["rlps-tcpr"]
    config, run = init_run(run)
    agent = ShortestPathRllibAgent(get_agent(config, run))
    env = get_env(config, rl=True)
    env_renderer = RenderTool(env, screen_width=8800)
    returns = []
    pcs = []
    malfs = []

    for _ in tqdm(range(n_episodes)):
        obs, _ = env.reset(regenerate_schedule=True, regenerate_rail=True)
        if RENDER:
            env_renderer.reset()
            env_renderer.render_env(show=True, frames=True, show_observations=False)

        if not obs:
            break

        steps = 0
        ep_return = 0
        done = defaultdict(lambda: False)
        robust_env = RobustFlatlandGymEnv(rail_env=env,
                                          max_nr_active_agents=200,
                                          observation_space=None,
                                          priorizer=DistToTargetPriorizer(),
                                          allow_noop=True)
        sorted_handles = robust_env.priorizer.priorize(handles=list(obs.keys()), rail_env=env)

        while not done['__all__']:
            actions = agent.compute_actions(obs, env)
            robust_actions = robust_env.get_robust_actions(actions, sorted_handles)
            obs, all_rewards, done, info = env.step(robust_actions)

            if RENDER:
                env_renderer.render_env(show=True, frames=True, show_observations=False)

            print('.', end='', flush=True)
            steps += 1
            ep_return += np.sum(list(all_rewards.values()))

        pc = np.sum(np.array([1 for a in env.agents if is_done(a)])) / env.get_num_agents()
        print("EPISODE PC:", pc)
        n_episodes += 1
        pcs.append(pc)
        returns.append(ep_return / (env._max_episode_steps * env.get_num_agents()))
        malfs.append(np.sum([a.malfunction_data['nr_malfunctions'] for a in env.agents]))
    return pcs, returns, malfs
def render_env(env):
    env_renderer = RenderTool(env, gl="PGL")
    env_renderer.render_env()
    image = env_renderer.get_image()
    pil_image = PIL.Image.fromarray(image)
    # print("RENDER")
    # pil_image.show()
    images.append(pil_image)  # `images` is a module-level list collecting rendered frames
    print(len(images))
def test_shortest_path_predictor_conflicts(rendering=False):
    rail, rail_map = make_invalid_simple_rail()
    env = RailEnv(
        width=rail_map.shape[1],
        height=rail_map.shape[0],
        rail_generator=rail_from_grid_transition_map(rail),
        schedule_generator=random_schedule_generator(),
        number_of_agents=2,
        obs_builder_object=TreeObsForRailEnv(
            max_depth=2, predictor=ShortestPathPredictorForRailEnv()),
    )
    env.reset()

    # set the initial position
    agent = env.agents[0]
    agent.initial_position = (5, 6)  # south dead-end
    agent.position = (5, 6)  # south dead-end
    agent.direction = 0  # north
    agent.initial_direction = 0  # north
    agent.target = (3, 9)  # east dead-end
    agent.moving = True
    agent.status = RailAgentStatus.ACTIVE

    agent = env.agents[1]
    agent.initial_position = (3, 8)  # east dead-end
    agent.position = (3, 8)  # east dead-end
    agent.direction = 3  # west
    agent.initial_direction = 3  # west
    agent.target = (6, 6)  # south dead-end
    agent.moving = True
    agent.status = RailAgentStatus.ACTIVE

    observations, info = env.reset(False, False, True)

    if rendering:
        renderer = RenderTool(env, gl="PILSVG")
        renderer.render_env(show=True, show_observations=False)
        input("Continue?")

    # get the trees to test
    obs_builder: TreeObsForRailEnv = env.obs_builder
    pp = pprint.PrettyPrinter(indent=4)
    tree_0 = observations[0]
    tree_1 = observations[1]
    env.obs_builder.util_print_obs_subtree(tree_0)
    env.obs_builder.util_print_obs_subtree(tree_1)

    # check the expectations
    expected_conflicts_0 = [('F', 'R')]
    expected_conflicts_1 = [('F', 'L')]
    _check_expected_conflicts(expected_conflicts_0, obs_builder, tree_0, "agent[0]: ")
    _check_expected_conflicts(expected_conflicts_1, obs_builder, tree_1, "agent[1]: ")
def render(self, mode='human'):
    # TODO: Merge both strategies (Jupyter vs .py)
    # In .py files:
    # self.renderer.render_env(show=False, show_observations=False, show_predictions=False)
    # In Jupyter Notebooks:
    env_renderer = RenderTool(self.flatland_env, gl="PILSVG")
    env_renderer.render_env()
    image = env_renderer.get_image()
    pil_image = Image.fromarray(image)
    display(pil_image)
    return image
def main(args):
    try:
        opts, args = getopt.getopt(args, "", ["sleep-for-animation=", ""])
    except getopt.GetoptError as err:
        print(str(err))  # will print something like "option -a not recognized"
        sys.exit(2)
    sleep_for_animation = True
    for o, a in opts:
        if o == "--sleep-for-animation":
            sleep_for_animation = str2bool(a)
        else:
            assert False, "unhandled option"

    # Initiate the Predictor
    custom_predictor = ShortestPathPredictorForRailEnv(10)

    # Pass the Predictor to the observation builder
    custom_obs_builder = ObservePredictions(custom_predictor)

    # Initiate Environment
    env = RailEnv(width=10,
                  height=10,
                  rail_generator=complex_rail_generator(nr_start_goal=5,
                                                        nr_extra=1,
                                                        min_dist=8,
                                                        max_dist=99999,
                                                        seed=1),
                  schedule_generator=complex_schedule_generator(),
                  number_of_agents=3,
                  obs_builder_object=custom_obs_builder)

    obs, info = env.reset()
    env_renderer = RenderTool(env, gl="PILSVG")

    # We render the initial step and show the observed cells as colored boxes
    env_renderer.render_env(show=True, frames=True, show_observations=True, show_predictions=False)

    action_dict = {}
    for step in range(100):
        for a in range(env.get_num_agents()):
            action = np.random.randint(0, 5)
            action_dict[a] = action
        obs, all_rewards, done, _ = env.step(action_dict)
        print("Rewards: ", all_rewards, " [done=", done, "]")
        env_renderer.render_env(show=True, frames=True, show_observations=True, show_predictions=False)
        if sleep_for_animation:
            time.sleep(0.5)
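# `str2bool` is used above but not defined in this file. A minimal sketch of the
# helper, assuming the conventional truthy-string behaviour (this exact
# implementation is an assumption, not taken from the original source):
def str2bool(v):
    # Treat common "yes" spellings as True, everything else as False
    return str(v).lower() in ("yes", "true", "t", "1")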
class OurEnv(RailEnv):
    def reset(self, *args, **kwargs):
        return_val = super().reset(*args, **kwargs)
        # Render this environment itself (the original referenced an undefined `env`)
        self.env_renderer = RenderTool(self)
        self.step({0: RailEnvActions.MOVE_FORWARD})
        return return_val

    def step(self, *args, **kwargs):
        self.env_renderer.render_env(show=True)
        print(args[0])
        observation, reward, done, info = super().step(*args, **kwargs)
        return observation, reward, done["__all__"], info
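# Hedged usage sketch (assumption, not from the original source): the subclass keeps
# RailEnv's constructor, so it is built like the plain environments above and then
# driven through its flattened step() return, where `done` is a single bool.
# env = OurEnv(width=25, height=25, rail_generator=sparse_rail_generator(),
#              number_of_agents=1, obs_builder_object=SingleAgentNavigationObs())
# obs = env.reset()
# obs, reward, done, info = env.step({0: RailEnvActions.MOVE_FORWARD})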
def evaluate(n_episodes):
    run = SUBMISSIONS["ato"]
    config, run = init_run(run)
    agent = get_agent(config, run)
    env = get_env(config, rl=True)
    env_renderer = RenderTool(env, screen_width=8800)
    returns = []
    pcs = []
    malfs = []

    for _ in tqdm(range(n_episodes)):
        obs, _ = env.reset(regenerate_schedule=True, regenerate_rail=True)
        if RENDER:
            env_renderer.reset()
            env_renderer.render_env(show=True, frames=True, show_observations=False)

        if not obs:
            break

        steps = 0
        ep_return = 0
        done = defaultdict(lambda: False)

        while not done['__all__']:
            actions = agent.compute_actions(obs, explore=False)
            obs, all_rewards, done, info = env.step(actions)

            if RENDER:
                env_renderer.render_env(show=True, frames=True, show_observations=False)

            print('.', end='', flush=True)
            steps += 1
            ep_return += np.sum(list(all_rewards.values()))

        pc = np.sum(np.array([1 for a in env.agents if is_done(a)])) / env.get_num_agents()
        print("EPISODE PC:", pc)
        n_episodes += 1
        pcs.append(pc)
        returns.append(ep_return / (env._max_episode_steps * env.get_num_agents()))
        malfs.append(np.sum([a.malfunction_data['nr_malfunctions'] for a in env.agents]))
    return pcs, returns, malfs
def solve(env, width, height, naive, predictor):
    env_renderer = RenderTool(env)
    solver = r2_solver.Solver(1)
    obs, _ = env.reset()
    env.obs_builder.find_safe_edges(env)
    predictor.env = env
    predictor.get()
    for step in range(100):
        # print(obs)
        # print(obs.shape)
        if naive:
            _action = naive_solver(env, obs)
        else:
            _action = solver.GetMoves(env.agents, obs)

        obs_paths = TL_detector(env, obs, _action)
        for k in obs_paths.keys():
            if obs_paths[k] is not None and improved_solver(obs_paths[k]) == 0:
                _action[k] = 4

        for k in _action.keys():
            if env.agents[k].position is None:
                continue
            pos = (env.agents[k].position[0], env.agents[k].position[1],
                   env.agents[k].direction)
            if _action[k] != 0 and _action[k] != 4 and pos in env.dev_pred_dict[k]:
                env.dev_pred_dict[k].remove(pos)

        next_obs, all_rewards, done, _ = env.step(_action)
        print("Rewards: {}, [done={}]".format(all_rewards, done))
        img = env_renderer.render_env(show=True,
                                      show_inactive_agents=False,
                                      show_predictions=True,
                                      show_observations=False,
                                      frames=True,
                                      return_image=True)
        cv2.imwrite("./env_images/" + str(step).zfill(3) + ".jpg", img)

        obs = next_obs.copy()
        if obs is None or done['__all__']:
            break

    unfinished_agents = []
    for k in done.keys():
        if not done[k] and type(k) is int:
            unfinished_agents.append(k)

    with open('observations_and_agents.pickle', 'wb') as f:
        pickle.dump((env.obs_builder.obs_dict, unfinished_agents,
                     env.obs_builder.branches, env.obs_builder.safe_map), f)
    return
def check_path(env, rail, position, direction, target, expected, rendering=False):
    agent = env.agents[0]
    agent.position = position
    agent.direction = direction
    agent.target = target
    agent.moving = True

    if rendering:
        renderer = RenderTool(env, gl="PILSVG")
        renderer.render_env(show=True, show_observations=False)
        input("Continue?")

    assert rail.check_path_exists(agent.position, agent.direction, agent.target) == expected
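# Note on the arguments used in test_path_not_exists above: Flatland encodes
# headings as 0=North, 1=East, 2=South, 3=West, so that call asks whether a path
# exists from the south dead-end (5, 6), facing north, to the north dead-end (0, 3)
# (expected: it does not).
# check_path(env, rail, (5, 6), 0, (0, 3), False)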
def createEnvSet(nStart, nEnd, sDir, bSmall=True):
    # print("Generate small envs in train-envs-small:")
    print(f"Generate envs (small={bSmall}) in dir {sDir}:")

    sDirImages = "train-envs-small/images/"
    if not os.path.exists(sDirImages):
        os.makedirs(sDirImages)

    for test_id in range(nStart, nEnd, 1):
        env = create_test_env(RandomTestParams_small, test_id, sDir)

        oRender = RenderTool(env, gl="PILSVG")
        # oRender.envs = envs
        # oRender.set_new_rail()
        oRender.render_env()
        g2img = oRender.get_image()
        imgPIL = Image.fromarray(g2img)
        # imgPIL.show()
        imgPIL.save(sDirImages + "Level_{}.png".format(test_id))
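# Hedged usage sketch (assumption, not from the original source): generate ten small
# test environments and their PNG previews. The directory argument is illustrative;
# note that the image output directory is hard-coded to train-envs-small/images/ above.
# createEnvSet(0, 10, "train-envs-small/", bSmall=True)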
def main(args):
    try:
        opts, args = getopt.getopt(args, "", ["sleep-for-animation=", ""])
    except getopt.GetoptError as err:
        print(str(err))  # will print something like "option -a not recognized"
        sys.exit(2)
    sleep_for_animation = True
    for o, a in opts:
        if o == "--sleep-for-animation":
            sleep_for_animation = str2bool(a)
        else:
            assert False, "unhandled option"

    env = RailEnv(width=7,
                  height=7,
                  rail_generator=complex_rail_generator(nr_start_goal=10,
                                                        nr_extra=1,
                                                        min_dist=5,
                                                        max_dist=99999,
                                                        seed=1),
                  schedule_generator=complex_schedule_generator(),
                  number_of_agents=1,
                  obs_builder_object=SingleAgentNavigationObs())

    obs, info = env.reset()
    env_renderer = RenderTool(env)
    env_renderer.render_env(show=True, frames=True, show_observations=True)

    for step in range(100):
        action = np.argmax(obs[0]) + 1
        obs, all_rewards, done, _ = env.step({0: action})
        print("Rewards: ", all_rewards, " [done=", done, "]")

        env_renderer.render_env(show=True, frames=True, show_observations=True)
        if sleep_for_animation:
            time.sleep(0.1)
        if done["__all__"]:
            break
    env_renderer.close_window()
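# Why `np.argmax(obs[0]) + 1` above (hedged explanation, assuming the standard
# flatland example observation builder): SingleAgentNavigationObs yields a
# 3-element one-hot over [turn left, go forward, turn right], and RailEnvActions
# numbers those moves 1, 2 and 3 (0 is DO_NOTHING, 4 is STOP_MOVING), so adding 1
# maps the argmax index onto the matching movement action.
assert [int(RailEnvActions.MOVE_LEFT), int(RailEnvActions.MOVE_FORWARD),
        int(RailEnvActions.MOVE_RIGHT)] == [1, 2, 3]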
def main(args, dir): ''' :param args: :return: Episodes to debug (set breakpoint in episodes loop to debug): - ep = 3, agent 1 spawns in front of 3, blocking its path; 0 and 2 are in a deadlock since they have same priority - ep = 4, agents stop because of wrong priorities even though the conflict zone wasn't entered, - ep = 14, ''' rail_generator = sparse_rail_generator( max_num_cities=args.max_num_cities, seed=args.seed, grid_mode=args.grid_mode, max_rails_between_cities=args.max_rails_between_cities, max_rails_in_city=args.max_rails_in_city, ) # Maps speeds to % of appearance in the env speed_ration_map = { 1.: 0.25, # Fast passenger train 1. / 2.: 0.25, # Fast freight train 1. / 3.: 0.25, # Slow commuter train 1. / 4.: 0.25 } # Slow freight train observation_builder = GraphObsForRailEnv( predictor=ShortestPathPredictorForRailEnv( max_depth=args.prediction_depth), bfs_depth=4) env = RailEnv( width=args.width, height=args.height, rail_generator=rail_generator, schedule_generator=sparse_schedule_generator(speed_ration_map), number_of_agents=args.num_agents, obs_builder_object=observation_builder, malfunction_generator_and_process_data=malfunction_from_params( parameters={ 'malfunction_rate': args.malfunction_rate, # Rate of malfunction occurrence 'min_duration': args.min_duration, # Minimal duration of malfunction 'max_duration': args.max_duration # Max duration of malfunction })) if args.render: env_renderer = RenderTool(env, agent_render_variant=AgentRenderVariant. AGENT_SHOWS_OPTIONS_AND_BOX, show_debug=True) sm = stateMachine() tb = TestBattery(env, observation_builder) state_machine_action_dict = {} railenv_action_dict = {} # max_time_steps = env.compute_max_episode_steps(args.width, args.height) max_time_steps = 200 T_rewards = [] # List of episodes rewards T_Qs = [] # List of q values T_num_done_agents = [] # List of number of done agents for each episode T_all_done = [] # If all agents completed in each episode T_episodes = [] # Time taken for each episode if args.save_image and not os.path.isdir("image_dump"): os.makedirs("image_dump") step_taken = 0 total_step_taken = 0 total_episodes = 0 step_times = [] # Time taken for each step for ep in range(args.num_episodes): # Reset info at the beginning of an episode start_time = time.time() # Take time of one episode if args.generate_baseline: if not os.path.isdir("image_dump/" + str(dir)) and args.save_image: os.makedirs("image_dump/" + str(dir)) else: if not os.path.isdir("image_dump/" + str(ep)) and args.save_image: os.makedirs("image_dump/" + str(ep)) state, info = env.reset() tb.reset() if args.render: env_renderer.reset() reward_sum, all_done = 0, False # reward_sum contains the cumulative reward obtained as sum during the steps num_done_agents = 0 state_machine_action = {} for i in range(env.number_of_agents): state_machine_action[i] = 0 for step in range(max_time_steps): start_step_time = time.time() #if step % 10 == 0: # print(step) # Test battery # see test_battery.py triggers = tb.tests(state, args.prediction_depth, state_machine_action) # state machine based on triggers of test battery # see state_machine.py state_machine_action = sm.act( triggers) # State machine picks action for a in range(env.get_num_agents()): #if info['action_required'][a]: # #railenv_action = observation_builder.choose_railenv_action(a, state_machine_action[a]) # railenv_action = observation_builder.choose_railenv_action(a, state_machine_action[a]) # state_machine_action_dict.update({a: state_machine_action}) # railenv_action_dict.update({a: 
railenv_action}) # railenv_action = observation_builder.choose_railenv_action(a, state_machine_action[a]) railenv_action = observation_builder.choose_railenv_action( a, state_machine_action[a]) state_machine_action_dict.update({a: state_machine_action}) railenv_action_dict.update({a: railenv_action}) state, reward, done, info = env.step( railenv_action_dict) # Env step if args.generate_baseline: #env_renderer.render_env(show=True, show_observations=False, show_predictions=True) env_renderer.render_env(show=False, show_observations=False, show_predictions=True) else: env_renderer.render_env(show=True, show_observations=False, show_predictions=True) if args.generate_baseline: if args.save_image: env_renderer.save_image("image_dump/" + str(dir) + "/image_" + str(step) + "_.png") else: if args.save_image: env_renderer.save_image("image_dump/" + str(ep) + "/image_" + str(step) + "_.png") if args.debug: for a in range(env.get_num_agents()): log('\n\n#########################################') log('\nInfo for agent {}'.format(a)) #log('\npath : {}'.format(state[a]["path"])) log('\noverlap : {}'.format(state[a]["overlap"])) log('\ndirection : {}'.format(state[a]["direction"])) log('\nOccupancy, first layer: {}'.format( state[a]["occupancy"])) log('\nOccupancy, second layer: {}'.format( state[a]["conflict"])) log('\nForks: {}'.format(state[a]["forks"])) log('\nTarget: {}'.format(state[a]["target"])) log('\nPriority: {}'.format(state[a]["priority"])) log('\nMax priority encountered: {}'.format( state[a]["max_priority"])) log('\nNum malfunctioning agents (globally): {}'.format( state[a]["n_malfunction"])) log('\nNum agents ready to depart (globally): {}'.format( state[a]["ready_to_depart"])) log('\nStatus: {}'.format(info['status'][a])) log('\nPosition: {}'.format(env.agents[a].position)) log('\nTarget: {}'.format(env.agents[a].target)) log('\nMoving? {} at speed: {}'.format( env.agents[a].moving, info['speed'][a])) log('\nAction required? 
{}'.format( info['action_required'][a])) log('\nState machine action: {}'.format( state_machine_action_dict[a])) log('\nRailenv action: {}'.format(railenv_action_dict[a])) log('\nRewards: {}'.format(reward[a])) log('\n\n#########################################') reward_sum += sum(reward[a] for a in range(env.get_num_agents())) step_taken = step time_taken_step = time.time() - start_step_time step_times.append(time_taken_step) if done['__all__']: all_done = True break total_step_taken += step_taken time_taken = time.time() - start_time # Time taken for one episode total_episodes = ep # Time metrics - too precise avg_time_step = sum(step_times) / step_taken #print("Avg time step: " + str(avg_time_step)) # No need to close the renderer since env parameter sizes stay the same T_rewards.append(reward_sum) # Compute num of agents that reached their target for a in range(env.get_num_agents()): if done[a]: num_done_agents += 1 percentage_done_agents = num_done_agents / env.get_num_agents() log("\nDone agents in episode: {}".format(percentage_done_agents)) T_num_done_agents.append( percentage_done_agents) # In proportion to total T_all_done.append(all_done) # Average number of agents that reached their target avg_done_agents = sum(T_num_done_agents) / len(T_num_done_agents) if len( T_num_done_agents) > 0 else 0 avg_reward = sum(T_rewards) / len(T_rewards) if len(T_rewards) > 0 else 0 avg_norm_reward = avg_reward / (max_time_steps / env.get_num_agents()) avg_ep_time = sum(T_episodes) / args.num_episodes if total_episodes == 0: total_episodes = 1 log("\nSeed: " + str(args.seed) \ + "\t | Avg_done_agents: " + str(avg_done_agents)\ + "\t | Avg_reward: " + str(avg_reward)\ + "\t | Avg_norm_reward: " + str(avg_norm_reward)\ + "\t | Max_num_time_steps: " + str(max_time_steps)\ + "\t | Avg_num_time_steps: " + str(total_step_taken/total_episodes) + "\t | Avg episode time: " + str(avg_ep_time))
def test(args, T, ep, dqn, val_mem, metrics, results_dir, evaluate=False): # Init env and set in evaluation mode # Maps speeds to % of appearance in the env speed_ration_map = { 1.: 0.25, # Fast passenger train 1. / 2.: 0.25, # Fast freight train 1. / 3.: 0.25, # Slow commuter train 1. / 4.: 0.25 } # Slow freight train schedule_generator = sparse_schedule_generator(speed_ration_map) observation_builder = GraphObsForRailEnv( predictor=ShortestPathPredictorForRailEnv( max_depth=args.prediction_depth)) env = RailEnv( width=args.width, height=args.height, rail_generator=sparse_rail_generator( max_num_cities=args.max_num_cities, seed= ep, # Use episode as seed when evaluation is performed during training grid_mode=args.grid_mode, max_rails_between_cities=args.max_rails_between_cities, max_rails_in_city=args.max_rails_in_city, ), schedule_generator=schedule_generator, number_of_agents=args.num_agents, obs_builder_object=observation_builder, malfunction_generator_and_process_data=malfunction_from_params( parameters={ 'malfunction_rate': args.malfunction_rate, 'min_duration': args.min_duration, 'max_duration': args.max_duration }), ) if args.render: env_renderer = RenderTool(env, gl="PILSVG", agent_render_variant=AgentRenderVariant. AGENT_SHOWS_OPTIONS_AND_BOX, show_debug=True, screen_height=1080, screen_width=1920) #max_time_steps = env.compute_max_episode_steps(env.width, env.height) max_time_steps = 200 # TODO Debug # metrics['steps'].append(T) metrics['episodes'].append(ep) T_rewards = [] # List of episodes rewards T_Qs = [] # List T_num_done_agents = [] # List of number of done agents for each episode T_all_done = [] # If all agents completed in each episode network_action_dict = dict() railenv_action_dict = dict() qvalues = {} # Test performance over several episodes for ep in range(args.evaluation_episodes): # Reset info state, info = env.reset() reward_sum, all_done = 0, False # reward_sum contains the cumulative reward obtained as sum during the steps num_done_agents = 0 if args.render: env_renderer.reset() # Choose first action - decide entering of agents into the environment for a in range(env.get_num_agents()): action = np.random.choice((0, 2)) railenv_action_dict.update({a: action}) state, reward, done, info = env.step(railenv_action_dict) # Env step reward_sum += sum(reward[a] for a in range(env.get_num_agents())) if args.render: env_renderer.render_env(show=True, show_observations=False, show_predictions=True) for step in range(max_time_steps - 1): # Choose actions for a in range(env.get_num_agents()): if info['action_required'][a]: network_action = dqn.act( state[a] ) # Choose an action greedily (with noisy weights) # network_action = 0 railenv_action = observation_builder.choose_railenv_action( a, network_action) qvalues.update({a: dqn.get_q_values(state[a])}) else: network_action = 0 railenv_action = 0 qvalues.update({a: [0, 0]}) # '0' if wasn't updated railenv_action_dict.update({a: railenv_action}) network_action_dict.update({a: network_action}) if args.debug: for a in range(env.get_num_agents()): print('#########################################') print('Info for agent {}'.format(a)) print('Occupancy, first layer: {}'.format( state[a][:args.prediction_depth])) print('Occupancy, second layer: {}'.format( state[a][args.prediction_depth:args.prediction_depth * 2])) print('Forks: {}'.format( state[a][args.prediction_depth * 2:args.prediction_depth * 3])) print('Target: {}'.format( state[a][args.prediction_depth * 3:args.prediction_depth * 4])) print('Priority: {}'.format( 
state[a][args.prediction_depth * 4])) print('Max priority encountered: {}'.format( state[a][args.prediction_depth * 4 + 1])) print('Num malfunctoning agents (globally): {}'.format( state[a][args.prediction_depth * 4 + 2])) print('Num agents ready to depart (globally): {}'.format( state[a][args.prediction_depth * 4 + 3])) print('Status: {}'.format(info['status'][a])) print('Position: {}'.format(env.agents[a].position)) print('Moving? {} at speed: {}'.format( env.agents[a].moving, info['speed'][a])) print('Action required? {}'.format( info['action_required'][a])) print('Network action: {}'.format(network_action_dict[a])) print('Railenv action: {}'.format(railenv_action_dict[a])) print('Q values: {}'.format(qvalues[a])) # print('QValues: {}'.format(qvalues)) print('Rewards: {}'.format(reward[a])) # Breakpoint for debugging here state, reward, done, info = env.step( railenv_action_dict) # Env step if args.render: env_renderer.render_env(show=True, show_observations=False, show_predictions=True) reward_sum += sum(reward[a] for a in range(env.get_num_agents())) if done['__all__']: all_done = True break # No need to close the renderer since env parameter sizes stay the same T_rewards.append(reward_sum) # Compute num of agents that reached their target for a in range(env.get_num_agents()): if done[a]: num_done_agents += 1 T_num_done_agents.append( num_done_agents / env.get_num_agents()) # In proportion to total T_all_done.append(all_done) # Test Q-values over validation memory for state in val_mem: # Iterate over valid states T_Qs.append(dqn.evaluate_q(state)) if args.debug: print('T_Qs: {}'.format(T_Qs)) # These are Qs from a single agent TODO avg_done_agents = sum(T_num_done_agents) / len( T_num_done_agents ) # Average number of agents that reached their target avg_reward = sum(T_rewards) / len(T_rewards) avg_norm_reward = avg_reward / (max_time_steps / env.get_num_agents()) # avg_reward, avg_Q = sum(T_rewards) / len(T_rewards), sum(T_Qs) / len(T_Qs) if not evaluate: # Save model parameters if improved if avg_done_agents > metrics['best_avg_done_agents']: metrics['best_avg_done_agents'] = avg_done_agents dqn.save(results_dir) # Append to results and save metrics metrics['rewards'].append(T_rewards) metrics['Qs'].append(T_Qs) torch.save(metrics, os.path.join(results_dir, 'metrics.pth')) # Plot HTML _plot_line(metrics['episodes'], metrics['rewards'], 'Reward', path=results_dir) # Plot rewards in episodes _plot_line(metrics['episodes'], metrics['Qs'], 'Q', path=results_dir) # Return average number of done agents (in proportion) and average reward return avg_done_agents, avg_reward, avg_norm_reward
def evalfun(num_samples=100, timed=True, debug=False, refresh=0.1):
    # A list of (mapsize, agent count) tuples, change or extend this to test different sizes.
    # problemsizes = [(5, 3), (7, 4), (9, 5), (11, 6), (13, 7)]
    problemsizes = [(25, 4)]

    # Create a list of seeds to consider.
    seeds = numpy.random.randint(2**29, size=3 * num_samples)

    print("%10s\t%8s\t%8s\t%9s" % ("Dimensions", "Success", "Rewards", "Runtime"))
    for problemsize in problemsizes:
        j = 0
        for _ in range(0, num_samples):
            # Create environments while they are not the intended dimension.
            env = create_multi_agent_environment(problemsize[0], problemsize[1], timed, seeds[j])
            j = j + 1
            while len(env.agents) != problemsize[1]:
                env = create_multi_agent_environment(problemsize[0], problemsize[1], timed, seeds[j])
                j = j + 1

            # Create a renderer only if in debug mode.
            if debug:
                env_renderer = RenderTool(env, screen_width=1920, screen_height=1080)

            # Time the search.
            for i in range(len(env.agents)):
                print("Agent ID: " + str(i) +
                      " Initial Position: (" + str(env.agents[i].initial_position[0]) + "," +
                      str(env.agents[i].initial_position[1]) + ")" +
                      " Release Date " + str(env.agents[i].release_date) +
                      " Deadline " + str(env.agents[i].deadline))

            start = time.time()
            # Task 1: prioritized planning
            a_schedule = prioritized_planning(env)
            # Task 3: improved prioritized planning (genetic algorithm)
            # Uncomment to run task 3
            # a_schedule = genetic_algorithm(env)
            duration = time.time() - start

            schedule = a_schedule
            print(schedule)

            if debug:
                env_renderer.render_env(show=True, frames=False, show_observations=False)
                time.sleep(refresh)

            # Validate that environment state is unchanged.
            assert env.num_resets == 1 and env._elapsed_steps == 0

            # Run the schedule
            success = False
            sumreward = 0
            for action in schedule:
                _, _reward_dict, _done, _ = env.step(action)
                success = all(_done.values())
                sumreward = sumreward + sum(_reward_dict.values())
                if debug:
                    # print(action)
                    env_renderer.render_env(show=True, frames=False, show_observations=False)
                    time.sleep(refresh)

            # Print the performance of the algorithm
            print("%10s\t%8s\t%8.3f\t%9.6f" % (str(problemsize), str(success), sumreward, duration))
# Fragment from inside a DQN training loop: act, step the environment, store the
# transition for each agent, render, and apply epsilon decay.
action = agent.act(state=norm_obs, eps=eps)
action_dict.update({_idx: action})

# Environment executes the actions and returns:
# 1. next observations for all agents
# 2. corresponding rewards for all agents
# 3. status if the agents are done
# 4. information about actions, malfunction, speed and status
next_obs, all_rewards, done, info = env.step(action_dict)

for _idx in range(n_agents):
    if not done[_idx]:
        next_norm_obs = normalize_observation(next_obs[_idx], tree_depth=tree_depth)
        agent.remember((norm_obs, action_dict[_idx], all_rewards[_idx],
                        next_norm_obs, done[_idx]))

# Render the environment -> show me what you got!
env_renderer.render_env(show=True, show_observations=True)

# Prepare for new step and stop if agent is done
obs = next_obs.copy()
if done["__all__"]:
    break

# Train the agent
if len(agent.memory) > batch_size:
    agent.step(batch_size)

score += all_rewards[0]

# Epsilon decay
eps = max(eps_end, eps_decay * eps)

# Copy weights from Q' to Q
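# Hedged illustration of the epsilon decay used above (the constants here are an
# assumption, not taken from the original source): with multiplicative decay the
# exploration rate falls geometrically towards eps_end and is then clamped there.
demo_eps, demo_eps_end, demo_eps_decay = 1.0, 0.005, 0.998
for _ in range(3):
    demo_eps = max(demo_eps_end, demo_eps_decay * demo_eps)  # ~0.998, ~0.996, ~0.994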
def main(argv): try: opts, args = getopt.getopt(argv, "n:", ["n_trials="]) except getopt.GetoptError: print('test_navigation_single_agent.py -n <n_trials>') sys.exit(2) for opt, arg in opts: if opt in ('-n', '--n_trials'): n_trials = int(arg) random.seed(1) np.random.seed(1) ######## TEST SET SELECTION - PARAMETERS ######## test_multi_agent_setup = 1 # 1 for Medium size test, 2 for Big size test test_n_agents = 5 # Number of agents to test (3 - 5 - 7 for Medium, 5 - 7 - 10 for Big) test_malfunctions_enabled = True # Malfunctions enabled? test_agents_one_speed = True # Test agents with the same speed (1) or with 4 different speeds? ################################################# # Medium size if test_multi_agent_setup == 1: x_dim = 16*3 y_dim = 9*3 max_num_cities = 5 max_rails_between_cities = 2 max_rails_in_city = 3 # Big size if test_multi_agent_setup == 2: x_dim = 16*4 y_dim = 9*4 max_num_cities = 9 max_rails_between_cities = 5 max_rails_in_city = 5 stochastic_data = {'malfunction_rate': 80, # Rate of malfunction occurence of single agent 'min_duration': 15, # Minimal duration of malfunction 'max_duration': 50 # Max duration of malfunction } # Custom observation builder tree_depth = 2 TreeObservation = TreeObsForRailEnv(max_depth=tree_depth, predictor = ShortestPathPredictorForRailEnv(20)) np.savetxt(fname=path.join('NetsTest' , 'info.txt'), X=[x_dim,y_dim,test_n_agents,max_num_cities,max_rails_between_cities,max_rails_in_city,tree_depth],delimiter=';') # Different agent types (trains) with different speeds. if test_agents_one_speed: speed_ration_map = {1.: 1., # Fast passenger train 1. / 2.: 0.0, # Fast freight train 1. / 3.: 0.0, # Slow commuter train 1. / 4.: 0.0} # Slow freight train else: speed_ration_map = {1.: 0.25, # Fast passenger train 1. / 2.: 0.25, # Fast freight train 1. / 3.: 0.25, # Slow commuter train 1. 
/ 4.: 0.25} # Slow freight train if test_malfunctions_enabled: env = RailEnv(width=x_dim, height=y_dim, rail_generator=sparse_rail_generator(max_num_cities=max_num_cities, # Number of cities in map (where train stations are) seed=14, # Random seed grid_mode=False, max_rails_between_cities=max_rails_between_cities, max_rails_in_city=max_rails_in_city), schedule_generator=sparse_schedule_generator(speed_ration_map), malfunction_generator_and_process_data=malfunction_from_params(stochastic_data), number_of_agents=test_n_agents, obs_builder_object=TreeObservation) else: env = RailEnv(width=x_dim, height=y_dim, rail_generator=sparse_rail_generator(max_num_cities=max_num_cities, # Number of cities in map (where train stations are) seed=14, # Random seed grid_mode=False, max_rails_between_cities=max_rails_between_cities, max_rails_in_city=max_rails_in_city), schedule_generator=sparse_schedule_generator(speed_ration_map), number_of_agents=test_n_agents, obs_builder_object=TreeObservation) env.reset() #env_renderer = RenderTool(env, gl="PILSVG", ) env_renderer = RenderTool(env, gl="PILSVG", agent_render_variant=AgentRenderVariant.AGENT_SHOWS_OPTIONS_AND_BOX, show_debug=False, screen_height=(1080*0.8), # Adjust these parameters to fit your resolution screen_width=(1920*0.8)) num_features_per_node = env.obs_builder.observation_dim nr_nodes = 0 for i in range(tree_depth + 1): nr_nodes += np.power(4, i) state_size = num_features_per_node * nr_nodes action_size = 5 # We set the number of episodes we would like to train on if 'n_trials' not in locals(): n_trials = 15000 # max_steps computation speed_weighted_mean = 0 for key in speed_ration_map.keys(): speed_weighted_mean += key * speed_ration_map[key] #max_steps = int(3 * (env.height + env.width)) max_steps = int((1/speed_weighted_mean) * 3 * (env.height + env.width)) #eps = 1. #eps_end = 0.005 #eps_decay = 0.9995 # And some variables to keep track of the performance action_dict = dict() final_action_dict = dict() action_prob_list = [] scores_window = deque(maxlen=100) done_window = deque(maxlen=100) scores = [] scores_list = [] deadlock_list =[] dones_list_window = [] dones_list = [] action_prob = [0] * action_size agent_obs = [None] * env.get_num_agents() agent_next_obs = [None] * env.get_num_agents() # Useless agent = Agent(state_size, action_size) # LOAD MODEL WEIGHTS TO TEST agent.qnetwork_local.load_state_dict(torch.load(path.join('NetsTest' , 'navigator_checkpoint3800_multi10_deadlock_global10.pth'))) record_images = False frame_step = 0 for trials in range(1, n_trials + 1): # Reset environment obs, info = env.reset()#(True, True) env_renderer.reset() # Build agent specific observations for a in range(env.get_num_agents()): agent_obs[a] = agent_obs[a] = normalize_observation(obs[a], tree_depth, observation_radius=10) # Reset score and done score = 0 env_done = 0 # Run episode for step in range(max_steps): # Action for a in range(env.get_num_agents()): if info['action_required'][a]: action = agent.act(agent_obs[a], eps=0.) 
action_prob[action] += 1 else: action = 0 action_dict.update({a: action}) # Environment step obs, all_rewards, done, deadlocks, info = env.step(action_dict) env_renderer.render_env(show=True, show_predictions=True, show_observations=False) # Build agent specific observations and normalize for a in range(env.get_num_agents()): if obs[a]: agent_obs[a] = normalize_observation(obs[a], tree_depth, observation_radius=10) score += all_rewards[a] / env.get_num_agents() if done['__all__']: break # Collection information about training tasks_finished = 0 for _idx in range(env.get_num_agents()): if done[_idx] == 1: tasks_finished += 1 done_window.append(tasks_finished / max(1, env.get_num_agents())) scores_window.append(score / max_steps) # save most recent score scores.append(np.mean(scores_window)) dones_list.append(tasks_finished / max(1, env.get_num_agents())) dones_list_window.append((np.mean(done_window))) scores_list.append(score / max_steps) deadlock_list.append(deadlocks.count(1)/max(1, env.get_num_agents())) if (np.sum(action_prob) == 0): action_prob_normalized = [0] * action_size else: action_prob_normalized = action_prob / np.sum(action_prob) print( '\rTesting {} Agents on ({},{}).\t Episode {}\t Score: {:.3f}\tDones: {:.2f}%\tDeadlocks: {:.2f}\t Action Probabilities: \t {}'.format( env.get_num_agents(), x_dim, y_dim, trials, score / max_steps, 100 * tasks_finished / max(1, env.get_num_agents()), deadlocks.count(1)/max(1, env.get_num_agents()), action_prob_normalized), end=" ") #if trials % 100 == 0: action_prob_list.append(action_prob_normalized) action_prob = [0] * action_size if trials % 50 == 0: np.savetxt(fname=path.join('NetsTest' , 'test_metrics.csv'), X=np.transpose(np.asarray([scores_list,scores,dones_list,dones_list_window,deadlock_list])), delimiter=';',newline='\n') np.savetxt(fname=path.join('NetsTest' , 'test_action_prob.csv'), X=np.asarray(action_prob_list), delimiter=';',newline='\n')
def train_agent(env_params, train_params): # Environment parameters n_agents = env_params.n_agents x_dim = env_params.x_dim y_dim = env_params.y_dim n_cities = env_params.n_cities max_rails_between_cities = env_params.max_rails_between_cities max_rails_in_city = env_params.max_rails_in_city seed = env_params.seed # Observation parameters observation_tree_depth = env_params.observation_tree_depth observation_radius = env_params.observation_radius observation_max_path_depth = env_params.observation_max_path_depth # Training parameters eps_start = train_params.eps_start eps_end = train_params.eps_end eps_decay = train_params.eps_decay n_episodes = train_params.n_episodes checkpoint_interval = train_params.checkpoint_interval n_eval_episodes = train_params.n_evaluation_episodes # Set the seeds random.seed(seed) np.random.seed(seed) # Break agents from time to time malfunction_parameters = MalfunctionParameters( malfunction_rate=1. / 10000, # Rate of malfunctions min_duration=15, # Minimal duration max_duration=50 # Max duration ) # Observation builder predictor = ShortestPathPredictorForRailEnv(observation_max_path_depth) tree_observation = TreeObsForRailEnv(max_depth=observation_tree_depth, predictor=predictor) # Fraction of train which each speed speed_profiles = { 1.: 1.0, # Fast passenger train 1. / 2.: 0.0, # Fast freight train 1. / 3.: 0.0, # Slow commuter train 1. / 4.: 0.0 # Slow freight train } # Setup the environment env = RailEnv( width=x_dim, height=y_dim, rail_generator=sparse_rail_generator( max_num_cities=n_cities, grid_mode=False, max_rails_between_cities=max_rails_between_cities, max_rails_in_city=max_rails_in_city), schedule_generator=sparse_schedule_generator(speed_profiles), number_of_agents=n_agents, malfunction_generator_and_process_data=malfunction_from_params( malfunction_parameters), obs_builder_object=tree_observation, random_seed=seed) env.reset(regenerate_schedule=True, regenerate_rail=True) # Setup renderer if train_params.render: env_renderer = RenderTool(env, gl="PGL") # Calculate the state size given the depth of the tree observation and the number of features n_features_per_node = env.obs_builder.observation_dim n_nodes = 0 for i in range(observation_tree_depth + 1): n_nodes += np.power(4, i) state_size = n_features_per_node * n_nodes # The action space of flatland is 5 discrete actions action_size = 5 # Max number of steps per episode # This is the official formula used during evaluations # See details in flatland.envs.schedule_generators.sparse_schedule_generator max_steps = int(4 * 2 * (env.height + env.width + (n_agents / n_cities))) action_count = [0] * action_size action_dict = dict() agent_obs = [None] * env.get_num_agents() agent_prev_obs = [None] * env.get_num_agents() agent_prev_action = [2] * env.get_num_agents() update_values = False smoothed_normalized_score = -1.0 smoothed_eval_normalized_score = -1.0 smoothed_completion = 0.0 smoothed_eval_completion = 0.0 # Double Dueling DQN policy policy = DDDQNPolicy(state_size, action_size, train_params) # TensorBoard writer writer = SummaryWriter() writer.add_hparams(vars(train_params), {}) writer.add_hparams(vars(env_params), {}) training_timer = Timer() training_timer.start() print( "\n🚉 Training {} trains on {}x{} grid for {} episodes, evaluating on {} episodes every {} episodes.\n" .format(env.get_num_agents(), x_dim, y_dim, n_episodes, n_eval_episodes, checkpoint_interval)) for episode_idx in range(n_episodes + 1): # Timers step_timer = Timer() reset_timer = Timer() learn_timer = Timer() preproc_timer 
= Timer() # Reset environment reset_timer.start() obs, info = env.reset(regenerate_rail=True, regenerate_schedule=True) reset_timer.end() if train_params.render: env_renderer.set_new_rail() score = 0 nb_steps = 0 actions_taken = [] # Build agent specific observations for agent in env.get_agent_handles(): if obs[agent]: agent_obs[agent] = normalize_observation( obs[agent], observation_tree_depth, observation_radius=observation_radius) agent_prev_obs[agent] = agent_obs[agent].copy() # Run episode for step in range(max_steps - 1): for agent in env.get_agent_handles(): if info['action_required'][agent]: # If an action is required, we want to store the obs at that step as well as the action update_values = True action = policy.act(agent_obs[agent], eps=eps_start) action_count[action] += 1 actions_taken.append(action) else: update_values = False action = 0 action_dict.update({agent: action}) # Environment step step_timer.start() next_obs, all_rewards, done, info = env.step(action_dict) step_timer.end() if train_params.render and episode_idx % checkpoint_interval == 0: env_renderer.render_env(show=True, frames=False, show_observations=False, show_predictions=False) for agent in range(env.get_num_agents()): # Update replay buffer and train agent # Only update the values when we are done or when an action was taken and thus relevant information is present if update_values or done[agent]: learn_timer.start() policy.step(agent_prev_obs[agent], agent_prev_action[agent], all_rewards[agent], agent_obs[agent], done[agent]) learn_timer.end() agent_prev_obs[agent] = agent_obs[agent].copy() agent_prev_action[agent] = action_dict[agent] # Preprocess the new observations if next_obs[agent]: preproc_timer.start() agent_obs[agent] = normalize_observation( next_obs[agent], observation_tree_depth, observation_radius=observation_radius) preproc_timer.end() score += all_rewards[agent] nb_steps = step if done['__all__']: break # Epsilon decay eps_start = max(eps_end, eps_decay * eps_start) # Collection information about training tasks_finished = sum(done[idx] for idx in env.get_agent_handles()) completion = tasks_finished / max(1, env.get_num_agents()) normalized_score = score / (max_steps * env.get_num_agents()) action_probs = action_count / np.sum(action_count) action_count = [1] * action_size # Smoothed values for terminal display and for more stable hyper-parameter tuning smoothing = 0.99 smoothed_normalized_score = smoothed_normalized_score * smoothing + normalized_score * ( 1.0 - smoothing) smoothed_completion = smoothed_completion * smoothing + completion * ( 1.0 - smoothing) # Print logs if episode_idx % checkpoint_interval == 0: torch.save( policy.qnetwork_local, './checkpoints/origin_multi-' + str(episode_idx) + '.pth') if train_params.render: env_renderer.close_window() print('\r🚂 Episode {}' '\t 🏆 Score: {:.3f}' ' Avg: {:.3f}' '\t 💯 Done: {:.2f}%' ' Avg: {:.2f}%' '\t 🎲 Epsilon: {:.2f} ' '\t 🔀 Action Probs: {}'.format(episode_idx, normalized_score, smoothed_normalized_score, 100 * completion, 100 * smoothed_completion, eps_start, format_action_prob(action_probs)), end=" ") # Evaluate policy if episode_idx % train_params.checkpoint_interval == 0: scores, completions, nb_steps_eval = eval_policy( env, policy, n_eval_episodes, max_steps) writer.add_scalar("evaluation/scores_min", np.min(scores), episode_idx) writer.add_scalar("evaluation/scores_max", np.max(scores), episode_idx) writer.add_scalar("evaluation/scores_mean", np.mean(scores), episode_idx) writer.add_scalar("evaluation/scores_std", 
np.std(scores), episode_idx) writer.add_histogram("evaluation/scores", np.array(scores), episode_idx) writer.add_scalar("evaluation/completions_min", np.min(completions), episode_idx) writer.add_scalar("evaluation/completions_max", np.max(completions), episode_idx) writer.add_scalar("evaluation/completions_mean", np.mean(completions), episode_idx) writer.add_scalar("evaluation/completions_std", np.std(completions), episode_idx) writer.add_histogram("evaluation/completions", np.array(completions), episode_idx) writer.add_scalar("evaluation/nb_steps_min", np.min(nb_steps_eval), episode_idx) writer.add_scalar("evaluation/nb_steps_max", np.max(nb_steps_eval), episode_idx) writer.add_scalar("evaluation/nb_steps_mean", np.mean(nb_steps_eval), episode_idx) writer.add_scalar("evaluation/nb_steps_std", np.std(nb_steps_eval), episode_idx) writer.add_histogram("evaluation/nb_steps", np.array(nb_steps_eval), episode_idx) smoothing = 0.9 smoothed_eval_normalized_score = smoothed_eval_normalized_score * smoothing + np.mean( scores) * (1.0 - smoothing) smoothed_eval_completion = smoothed_eval_completion * smoothing + np.mean( completions) * (1.0 - smoothing) writer.add_scalar("evaluation/smoothed_score", smoothed_eval_normalized_score, episode_idx) writer.add_scalar("evaluation/smoothed_completion", smoothed_eval_completion, episode_idx) # Save logs to tensorboard writer.add_scalar("training/score", normalized_score, episode_idx) writer.add_scalar("training/smoothed_score", smoothed_normalized_score, episode_idx) writer.add_scalar("training/completion", np.mean(completion), episode_idx) writer.add_scalar("training/smoothed_completion", np.mean(smoothed_completion), episode_idx) writer.add_scalar("training/nb_steps", nb_steps, episode_idx) writer.add_histogram("actions/distribution", np.array(actions_taken), episode_idx) writer.add_scalar("actions/nothing", action_probs[RailEnvActions.DO_NOTHING], episode_idx) writer.add_scalar("actions/left", action_probs[RailEnvActions.MOVE_LEFT], episode_idx) writer.add_scalar("actions/forward", action_probs[RailEnvActions.MOVE_FORWARD], episode_idx) writer.add_scalar("actions/right", action_probs[RailEnvActions.MOVE_RIGHT], episode_idx) writer.add_scalar("actions/stop", action_probs[RailEnvActions.STOP_MOVING], episode_idx) writer.add_scalar("training/epsilon", eps_start, episode_idx) writer.add_scalar("training/buffer_size", len(policy.memory), episode_idx) writer.add_scalar("training/loss", policy.loss, episode_idx) writer.add_scalar("timer/reset", reset_timer.get(), episode_idx) writer.add_scalar("timer/step", step_timer.get(), episode_idx) writer.add_scalar("timer/learn", learn_timer.get(), episode_idx) writer.add_scalar("timer/preproc", preproc_timer.get(), episode_idx) writer.add_scalar("timer/total", training_timer.get_current(), episode_idx)
def train_agent(train_params, train_env_params, eval_env_params, obs_params): # Environment parameters n_agents = train_env_params.n_agents x_dim = train_env_params.x_dim y_dim = train_env_params.y_dim n_cities = train_env_params.n_cities max_rails_between_cities = train_env_params.max_rails_between_cities max_rails_in_city = train_env_params.max_rails_in_city seed = train_env_params.seed # Unique ID for this training now = datetime.now() training_id = now.strftime('%y%m%d%H%M%S') # Observation parameters observation_tree_depth = obs_params.observation_tree_depth observation_radius = obs_params.observation_radius observation_max_path_depth = obs_params.observation_max_path_depth # Training parameters eps_start = train_params.eps_start eps_end = train_params.eps_end eps_decay = train_params.eps_decay n_episodes = train_params.n_episodes checkpoint_interval = train_params.checkpoint_interval n_eval_episodes = train_params.n_evaluation_episodes restore_replay_buffer = train_params.restore_replay_buffer save_replay_buffer = train_params.save_replay_buffer # Set the seeds random.seed(seed) np.random.seed(seed) # Observation builder predictor = ShortestPathPredictorForRailEnv(observation_max_path_depth) tree_observation = TreeObsForRailEnv(max_depth=observation_tree_depth, predictor=predictor) # Setup the environments train_env = create_rail_env(train_env_params, tree_observation) train_env.reset(regenerate_schedule=True, regenerate_rail=True) eval_env = create_rail_env(eval_env_params, tree_observation) eval_env.reset(regenerate_schedule=True, regenerate_rail=True) # Setup renderer if train_params.render: env_renderer = RenderTool(train_env, gl="PGL") # Calculate the state size given the depth of the tree observation and the number of features n_features_per_node = train_env.obs_builder.observation_dim n_nodes = sum([np.power(4, i) for i in range(observation_tree_depth + 1)]) state_size = n_features_per_node * n_nodes # The action space of flatland is 5 discrete actions action_size = 5 # Max number of steps per episode # This is the official formula used during evaluations # See details in flatland.envs.schedule_generators.sparse_schedule_generator # max_steps = int(4 * 2 * (env.height + env.width + (n_agents / n_cities))) max_steps = train_env._max_episode_steps action_count = [0] * action_size action_dict = dict() agent_obs = [None] * n_agents agent_prev_obs = [None] * n_agents agent_prev_action = [2] * n_agents update_values = [False] * n_agents # Smoothed values used as target for hyperparameter tuning smoothed_normalized_score = -1.0 smoothed_eval_normalized_score = -1.0 smoothed_completion = 0.0 smoothed_eval_completion = 0.0 # Double Dueling DQN policy policy = DDDQNPolicy(state_size, action_size, train_params) # Loads existing replay buffer if restore_replay_buffer: try: policy.load_replay_buffer(restore_replay_buffer) policy.test() except RuntimeError as e: print( "\n🛑 Could't load replay buffer, were the experiences generated using the same tree depth?" ) print(e) exit(1) print("\n💾 Replay buffer status: {}/{} experiences".format( len(policy.memory.memory), train_params.buffer_size)) hdd = psutil.disk_usage('/') if save_replay_buffer and (hdd.free / (2**30)) < 500.0: print( "⚠️ Careful! Saving replay buffers will quickly consume a lot of disk space. You have {:.2f}gb left." 
.format(hdd.free / (2**30))) # TensorBoard writer writer = SummaryWriter() writer.add_hparams(vars(train_params), {}) writer.add_hparams(vars(train_env_params), {}) writer.add_hparams(vars(obs_params), {}) training_timer = Timer() training_timer.start() print( "\n🚉 Training {} trains on {}x{} grid for {} episodes, evaluating on {} episodes every {} episodes. Training id '{}'.\n" .format(train_env.get_num_agents(), x_dim, y_dim, n_episodes, n_eval_episodes, checkpoint_interval, training_id)) for episode_idx in range(n_episodes + 1): step_timer = Timer() reset_timer = Timer() learn_timer = Timer() preproc_timer = Timer() inference_timer = Timer() # Reset environment reset_timer.start() obs, info = train_env.reset(regenerate_rail=True, regenerate_schedule=True) reset_timer.end() if train_params.render: env_renderer.set_new_rail() score = 0 nb_steps = 0 actions_taken = [] # Build initial agent-specific observations for agent in train_env.get_agent_handles(): if obs[agent]: agent_obs[agent] = normalize_observation( obs[agent], observation_tree_depth, observation_radius=observation_radius) agent_prev_obs[agent] = agent_obs[agent].copy() # Run episode for step in range(max_steps - 1): inference_timer.start() for agent in train_env.get_agent_handles(): if info['action_required'][agent]: update_values[agent] = True action = policy.act(agent_obs[agent], eps=eps_start) action_count[action] += 1 actions_taken.append(action) else: # An action is not required if the train hasn't joined the railway network, # if it already reached its target, or if is currently malfunctioning. update_values[agent] = False action = 0 action_dict.update({agent: action}) inference_timer.end() # Environment step step_timer.start() next_obs, all_rewards, done, info = train_env.step(action_dict) step_timer.end() # Render an episode at some interval if train_params.render and episode_idx % checkpoint_interval == 0: env_renderer.render_env(show=True, frames=False, show_observations=False, show_predictions=False) # Update replay buffer and train agent for agent in train_env.get_agent_handles(): if update_values[agent] or done['__all__']: # Only learn from timesteps where somethings happened learn_timer.start() policy.step(agent_prev_obs[agent], agent_prev_action[agent], all_rewards[agent], agent_obs[agent], done[agent]) learn_timer.end() agent_prev_obs[agent] = agent_obs[agent].copy() agent_prev_action[agent] = action_dict[agent] # Preprocess the new observations if next_obs[agent]: preproc_timer.start() agent_obs[agent] = normalize_observation( next_obs[agent], observation_tree_depth, observation_radius=observation_radius) preproc_timer.end() score += all_rewards[agent] nb_steps = step if done['__all__']: break # Epsilon decay eps_start = max(eps_end, eps_decay * eps_start) # Collect information about training tasks_finished = sum(done[idx] for idx in train_env.get_agent_handles()) completion = tasks_finished / max(1, train_env.get_num_agents()) normalized_score = score / (max_steps * train_env.get_num_agents()) action_probs = action_count / np.sum(action_count) action_count = [1] * action_size smoothing = 0.99 smoothed_normalized_score = smoothed_normalized_score * smoothing + normalized_score * ( 1.0 - smoothing) smoothed_completion = smoothed_completion * smoothing + completion * ( 1.0 - smoothing) # Print logs if episode_idx % checkpoint_interval == 0: torch.save( policy.qnetwork_local, './checkpoints/' + training_id + '-' + str(episode_idx) + '.pth') if save_replay_buffer: policy.save_replay_buffer('./replay_buffers/' + 
training_id + '-' + str(episode_idx) + '.pkl') if train_params.render: env_renderer.close_window() print('\r🚂 Episode {}' '\t 🏆 Score: {:.3f}' ' Avg: {:.3f}' '\t 💯 Done: {:.2f}%' ' Avg: {:.2f}%' '\t 🎲 Epsilon: {:.3f} ' '\t 🔀 Action Probs: {}'.format(episode_idx, normalized_score, smoothed_normalized_score, 100 * completion, 100 * smoothed_completion, eps_start, format_action_prob(action_probs)), end=" ") # Evaluate policy and log results at some interval if episode_idx % checkpoint_interval == 0 and n_eval_episodes > 0: scores, completions, nb_steps_eval = eval_policy( eval_env, policy, train_params, obs_params) writer.add_scalar("evaluation/scores_min", np.min(scores), episode_idx) writer.add_scalar("evaluation/scores_max", np.max(scores), episode_idx) writer.add_scalar("evaluation/scores_mean", np.mean(scores), episode_idx) writer.add_scalar("evaluation/scores_std", np.std(scores), episode_idx) writer.add_histogram("evaluation/scores", np.array(scores), episode_idx) writer.add_scalar("evaluation/completions_min", np.min(completions), episode_idx) writer.add_scalar("evaluation/completions_max", np.max(completions), episode_idx) writer.add_scalar("evaluation/completions_mean", np.mean(completions), episode_idx) writer.add_scalar("evaluation/completions_std", np.std(completions), episode_idx) writer.add_histogram("evaluation/completions", np.array(completions), episode_idx) writer.add_scalar("evaluation/nb_steps_min", np.min(nb_steps_eval), episode_idx) writer.add_scalar("evaluation/nb_steps_max", np.max(nb_steps_eval), episode_idx) writer.add_scalar("evaluation/nb_steps_mean", np.mean(nb_steps_eval), episode_idx) writer.add_scalar("evaluation/nb_steps_std", np.std(nb_steps_eval), episode_idx) writer.add_histogram("evaluation/nb_steps", np.array(nb_steps_eval), episode_idx) smoothing = 0.9 smoothed_eval_normalized_score = smoothed_eval_normalized_score * smoothing + np.mean( scores) * (1.0 - smoothing) smoothed_eval_completion = smoothed_eval_completion * smoothing + np.mean( completions) * (1.0 - smoothing) writer.add_scalar("evaluation/smoothed_score", smoothed_eval_normalized_score, episode_idx) writer.add_scalar("evaluation/smoothed_completion", smoothed_eval_completion, episode_idx) # Save logs to tensorboard writer.add_scalar("training/score", normalized_score, episode_idx) writer.add_scalar("training/smoothed_score", smoothed_normalized_score, episode_idx) writer.add_scalar("training/completion", np.mean(completion), episode_idx) writer.add_scalar("training/smoothed_completion", np.mean(smoothed_completion), episode_idx) writer.add_scalar("training/nb_steps", nb_steps, episode_idx) writer.add_histogram("actions/distribution", np.array(actions_taken), episode_idx) writer.add_scalar("actions/nothing", action_probs[RailEnvActions.DO_NOTHING], episode_idx) writer.add_scalar("actions/left", action_probs[RailEnvActions.MOVE_LEFT], episode_idx) writer.add_scalar("actions/forward", action_probs[RailEnvActions.MOVE_FORWARD], episode_idx) writer.add_scalar("actions/right", action_probs[RailEnvActions.MOVE_RIGHT], episode_idx) writer.add_scalar("actions/stop", action_probs[RailEnvActions.STOP_MOVING], episode_idx) writer.add_scalar("training/epsilon", eps_start, episode_idx) writer.add_scalar("training/buffer_size", len(policy.memory), episode_idx) writer.add_scalar("training/loss", policy.loss, episode_idx) writer.add_scalar("timer/reset", reset_timer.get(), episode_idx) writer.add_scalar("timer/step", step_timer.get(), episode_idx) writer.add_scalar("timer/learn", learn_timer.get(), 
episode_idx) writer.add_scalar("timer/preproc", preproc_timer.get(), episode_idx) writer.add_scalar("timer/total", training_timer.get_current(), episode_idx)
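`format_action_prob`, used when printing the episode progress line above, is not included in this excerpt. A minimal sketch of such a helper (assumed, not the original implementation) that renders the action distribution as a compact string:

import numpy as np

def format_action_prob(action_probs):
    # One symbol per RailEnvActions entry: do-nothing, left, forward, right, stop
    labels = ["↻", "←", "↑", "→", "◼"]
    action_probs = np.round(action_probs, 3)
    return " ".join("{} {:.3f}".format(label, prob)
                    for label, prob in zip(labels, action_probs))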
        return observation


n_agents = 5
env = RailEnv(width=25,
              height=25,
              rail_generator=sparse_rail_generator(),
              number_of_agents=n_agents,
              obs_builder_object=SingleAgentNavigationObs())

env_renderer = None
for _ in range(100):
    if env_renderer is not None:
        env_renderer.close_window()
    obs, _ = env.reset()
    env_renderer = RenderTool(env)
    env_renderer.render_env(show=True, frames=True, show_observations=False)

    done = {"__all__": False}
    while not done["__all__"]:
        action = {i: np.argmax(o) for i, o in obs.items()}
        obs, all_rewards, done, _ = env.step(action)
        print("Rewards: ", all_rewards, " [done=", done, "]")
        print("Observations: ", obs)
        assert len(obs.keys()) == n_agents
        env_renderer.render_env(show=True, frames=True, show_observations=False)
        time.sleep(0.1)
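The definition of the SingleAgentNavigationObs builder used in this demo is truncated in this excerpt (only its final `return observation` survives above). A simplified sketch of what such a builder typically looks like, assuming a one-hot vector aligned with the action indices so that `np.argmax(obs)` above yields a usable action:

import numpy as np
from flatland.core.env_observation_builder import ObservationBuilder
from flatland.core.grid.grid4_utils import get_new_position

class SingleAgentNavigationObs(ObservationBuilder):
    """Sketch (assumed, simplified): mark whichever of left/forward/right brings
    the agent closest to its target, according to the env's distance map."""

    def reset(self):
        pass

    def get(self, handle=0):
        agent = self.env.agents[handle]
        position = agent.position if agent.position is not None else agent.initial_position
        possible_transitions = self.env.rail.get_transitions(*position, agent.direction)
        distance_map = self.env.distance_map.get()

        # Remaining distance after turning left, going forward, turning right
        min_distances = []
        for movement in [(agent.direction + i) % 4 for i in (-1, 0, 1)]:
            if possible_transitions[movement]:
                new_position = get_new_position(position, movement)
                min_distances.append(
                    distance_map[handle, new_position[0], new_position[1], movement])
            else:
                min_distances.append(np.inf)

        # Indices 1/2/3 correspond to MOVE_LEFT / MOVE_FORWARD / MOVE_RIGHT
        observation = np.zeros(5)
        observation[int(np.argmin(min_distances)) + 1] = 1
        return observation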
def evaluate(n_episodes, rl_prio=True): agent = None if rl_prio: config, run = init_run() agent = get_agent(config, run) env = get_env(config, rl=True) else: env = get_env(rl=False) env_renderer = RenderTool(env, screen_width=8800) returns = [] pcs = [] malfs = [] for _ in tqdm(range(n_episodes)): obs, _ = env.reset(regenerate_schedule=True, regenerate_rail=True) if RENDER: env_renderer.reset() env_renderer.render_env(show=True, frames=True, show_observations=False) if not obs: break steps = 0 ep_return = 0 done = defaultdict(lambda: False) robust_env = CprFlatlandGymEnv(rail_env=env, max_nr_active_agents=200, observation_space=None, priorizer=NrAgentsSameStart(), allow_noop=True) # if rl_prio: # priorities = prio_agent.compute_actions(obs, explore=False) # sorted_actions = {k: v for k, v in sorted(priorities.items(), key=lambda item: item[1], reverse=True)} # sorted_handles = list(sorted_actions.keys()) # else: sorted_handles = robust_env.priorizer.priorize(handles=list( obs.keys()), rail_env=env) while not done['__all__']: actions = ShortestPathAgent().compute_actions(obs, env) robust_actions = robust_env.get_robust_actions( actions, sorted_handles) obs, all_rewards, done, info = env.step(robust_actions) if RENDER: env_renderer.render_env(show=True, frames=True, show_observations=False) print('.', end='', flush=True) steps += 1 ep_return += np.sum(list(all_rewards.values())) pc = np.sum(np.array([1 for a in env.agents if is_done(a) ])) / env.get_num_agents() print("EPISODE PC:", pc) n_episodes += 1 pcs.append(pc) returns.append(ep_return / (env._max_episode_steps * env.get_num_agents())) malfs.append( np.sum([a.malfunction_data['nr_malfunctions'] for a in env.agents])) return pcs, returns, malfs
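A hypothetical invocation of the evaluation loop above: run a handful of episodes with the rule-based shortest-path agent and report aggregate statistics from the three returned lists.

import numpy as np

if __name__ == "__main__":
    pcs, returns, malfs = evaluate(n_episodes=10, rl_prio=False)
    print("mean percentage complete:  {:.3f}".format(np.mean(pcs)))
    print("mean normalized return:    {:.3f}".format(np.mean(returns)))
    print("mean malfunctions/episode: {:.1f}".format(np.mean(malfs)))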
def run_episode(kwargs) -> [Trajectory]: """ Runs a single episode and collects the trajectories of each agent """ total_controller_time = 0 env_dict: Callable = kwargs.get("env_dict") obs_builder = kwargs.get("obs_builder") controller_creator: Callable = kwargs.get("controller_creator") episode_id: int = kwargs.get("episode_id") max_episode_length: int = kwargs.get("max_episode_length", 1000) render: bool = kwargs.get("render", False) # Create and Start Environment _env = load_env(env_dict, obs_builder_object=obs_builder) obs, info = _env.reset( regenerate_rail=False, regenerate_schedule=True, ) score = 0 _trajectories = [Trajectory() for _ in _env.get_agent_handles()] # Create and Start Controller controller: AbstractController = controller_creator() start = time.time() controller.start_of_round(obs=obs, env=_env) total_controller_time += time.time() - start if render: env_renderer = RenderTool(_env) env_renderer.reset() for step in range(max_episode_length): start = time.time() action_dict, processed_obs = controller.act(observation=obs) total_controller_time += time.time() - start next_obs, all_rewards, done, info = _env.step(action_dict) if render: env_renderer.render_env(show=True, show_observations=True, show_predictions=False) # Save actions and rewards for each agent [ _trajectories[agent_handle].add_row( state=processed_obs[agent_handle], action=action_dict[agent_handle], reward=all_rewards[agent_handle], done=done[agent_handle]) for agent_handle in _env.get_agent_handles() ] score += sum(all_rewards) obs = next_obs.copy() if done['__all__']: break if render: env_renderer.close_window() # print(f"\nController took a total time of: {total_controller_time} seconds", flush=True) return _trajectories
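Because run_episode takes a single kwargs dict, it can be fanned out over a process pool to collect trajectories from many environments in parallel. A minimal sketch of such a helper (assumed, not part of the original module; the obs_builder and controller_creator must be picklable):

from multiprocessing import Pool

def collect_trajectories(env_dicts, obs_builder, controller_creator, n_workers=4):
    """Run one episode per env_dict across a process pool and return the
    per-episode lists of agent trajectories."""
    jobs = [
        dict(env_dict=env_dict,
             obs_builder=obs_builder,
             controller_creator=controller_creator,
             episode_id=episode_id,
             max_episode_length=1000,
             render=False)
        for episode_id, env_dict in enumerate(env_dicts)
    ]
    with Pool(n_workers) as pool:
        # One list of per-agent Trajectory objects per episode
        return pool.map(run_episode, jobs)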
def evalfun(num_samples=100, timed=True, debug=False, refresh=0.1):
    # A list of (mapsize, agent count) tuples; change or extend this to test different sizes.
    # problemsizes = [(5, 3), (7, 4), (9, 5), (11, 6), (13, 7)]
    problemsizes = [(7, 4)]

    # Create a list of seeds to consider.
    # seeds = numpy.random.randint(2**29, size=3*num_samples)
    scores = []
    successes = 0
    completion_window = deque(maxlen=100)
    completion = []
    runtime = []
    schedule_length = []
    seeds = 37429879

    print("%10s\t%8s\t%8s\t%9s" % ("Dimensions", "Success", "Rewards", "Runtime"))
    for problemsize in problemsizes:
        # Create an environment of the intended dimension.
        env = create_multi_agent_environment(problemsize[0], problemsize[1], timed, seeds)

        for _ in range(0, num_samples):
            # Create a renderer only if in debug mode.
            if debug:
                env_renderer = RenderTool(env, screen_width=1920, screen_height=1080)

            # Find the schedules
            start = time.time()
            _, schedule = genetic_algorithm(env)
            duration = time.time() - start
            runtime.append(duration)
            schedule_length.append(len(schedule))

            if debug:
                env_renderer.render_env(show=True, frames=False, show_observations=False)
                time.sleep(refresh)

            # Validate that environment state is unchanged.
            # assert env.num_resets == 1 and env._elapsed_steps == 0

            # Run the schedule
            success = False
            sumreward = 0
            for action in schedule:
                _, _reward_dict, _done, _ = env.step(action)
                success = all(_done.values())
                sumreward = sumreward + sum(_reward_dict.values())
                if debug:
                    env_renderer.render_env(show=True, frames=False, show_observations=False)
                    time.sleep(refresh)

            # Record the performance of the algorithm
            if success:
                successes += 1
            tasks_finished = np.sum(
                [int(_done[idx]) for idx in range(env.get_num_agents())])
            completion_window.append(tasks_finished / max(1, env.get_num_agents()))
            completion.append(np.mean(completion_window))
            scores.append(sumreward)
            print("%10s\t%8s\t%8.3f\t%9.6f" %
                  (str(problemsize), str(success), sumreward, duration))
            print(schedule_length)
            env.reset()

    print("Number of successes", successes)
    print("Number of samples", num_samples)
    print("Successful: %8.2f%%" % (100 * successes / num_samples))
    print("Mean reward: %8.2f" % (np.mean(scores)))
    print("Median reward: %8.2f" % (np.median(scores)))
    print("Instances solved: %8.2f" % (np.mean(completion)))
    print("Run Time %8.2f" % (np.mean(runtime)))
    print("Avg schedule length %8.2f" % np.mean(schedule_length))
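`create_multi_agent_environment(mapsize, n_agents, timed, seed)` is referenced above but not shown in this excerpt. A minimal sketch under those assumptions (the generator choice and the step budget are illustrative, not the original helper):

from flatland.envs.rail_env import RailEnv
from flatland.envs.rail_generators import complex_rail_generator
from flatland.envs.schedule_generators import complex_schedule_generator

def create_multi_agent_environment(dimension, n_agents, timed, seed):
    # Small connected map with at least one start/goal pair per agent.
    env = RailEnv(width=dimension,
                  height=dimension,
                  rail_generator=complex_rail_generator(nr_start_goal=n_agents + 1,
                                                        nr_extra=1,
                                                        min_dist=4,
                                                        max_dist=99999,
                                                        seed=seed),
                  schedule_generator=complex_schedule_generator(),
                  number_of_agents=n_agents,
                  random_seed=seed)
    env.reset()
    if timed:
        # Assumed step budget; the original helper may use a different formula.
        env._max_episode_steps = int(1.5 * (env.width + env.height))
    return env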
    # Reset score and done
    score = 0
    env_done = 0

    # Run episode
    for step in range(max_steps):
        # Action
        for a in range(env.get_num_agents()):
            if info['action_required'][a]:
                action = agent.act(agent_obs[a], eps=0.)
            else:
                action = 0
            action_prob[action] += 1
            action_dict.update({a: action})

        # Environment step
        obs, all_rewards, done, _ = env.step(action_dict)
        env_renderer.render_env(show=True, show_predictions=True, show_observations=False)

        # Build agent-specific observations and normalize
        for a in range(env.get_num_agents()):
            if obs[a]:
                agent_obs[a] = normalize_observation(obs[a], tree_depth, observation_radius=10)

        if done['__all__']:
            break
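The fragment above tallies raw per-action counts in `action_prob` but never reports them. A small assumed helper that turns those counts into frequencies for logging:

import numpy as np

def summarize_action_usage(action_prob):
    """Normalize the per-action counts collected during evaluation into frequencies.
    (Assumed helper; not part of the original script.)"""
    counts = np.asarray(action_prob, dtype=float)
    return counts / max(1.0, counts.sum())

# e.g. summarize_action_usage([3, 10, 42, 9, 5])
# -> approximately [0.043, 0.145, 0.609, 0.130, 0.072]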
        grid_mode=False,
        max_rails_between_cities=max_rails_between_cities,
        max_rails_in_city=max_rails_in_city),
    schedule_generator=sparse_schedule_generator(speed_profiles),
    number_of_agents=n_agents,
    malfunction_generator_and_process_data=malfunction_from_params(malfunction_parameters),
    obs_builder_object=tree_observation,
    random_seed=seed)

env.reset(regenerate_schedule=True, regenerate_rail=True)

# Setup renderer
env_renderer = RenderTool(env)
'''
env_renderer.render_env(show=True, show_predictions=False)
time.sleep(5)
env_renderer.close_window()
'''

n_features_per_node = env.obs_builder.observation_dim
n_nodes = 0
for i in range(observation_tree_depth + 1):
    n_nodes += np.power(4, i)
state_size = n_features_per_node * n_nodes

action_size = 5

# Max number of steps per episode
# This is the official formula used during evaluations
# See details in flatland.envs.schedule_generators.sparse_schedule_generator
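A worked example of the quantities computed above: with observation_tree_depth = 2 the observation tree has 1 + 4 + 16 = 21 nodes, so with 11 features per node (the value TreeObsForRailEnv reports via observation_dim in the flatland version these scripts target; treat it as version-dependent) the flattened state has 21 * 11 = 231 entries. The step budget hinted at by the truncated comment is, in the Flatland starter-kit training scripts (assumed here; the authoritative definition lives in sparse_schedule_generator), the second helper below:

import numpy as np

def compute_state_size(n_features_per_node, tree_depth):
    # A depth-limited 4-ary observation tree has sum_{i=0..depth} 4^i nodes.
    n_nodes = int(sum(np.power(4, i) for i in range(tree_depth + 1)))
    return n_features_per_node * n_nodes

# compute_state_size(11, 2) == 11 * (1 + 4 + 16) == 231

def compute_max_steps(x_dim, y_dim, n_agents, n_cities):
    # Assumed starter-kit formula for the per-episode step budget.
    return int(4 * 2 * (x_dim + y_dim + (n_agents / n_cities)))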
class FlatlandRemoteEvaluationService: """ A remote evaluation service which exposes the following interfaces of a RailEnv : - env_create - env_step and an additional `env_submit` to cater to score computation and on-episode-complete post processings. This service is designed to be used in conjunction with `FlatlandRemoteClient` and both the srevice and client maintain a local instance of the RailEnv instance, and in case of any unexpected divergences in the state of both the instances, the local RailEnv instance of the `FlatlandRemoteEvaluationService` is supposed to act as the single source of truth. Both the client and remote service communicate with each other via Redis as a message broker. The individual messages are packed and unpacked with `msgpack` (a patched version of msgpack which also supports numpy arrays). """ def __init__(self, test_env_folder="/tmp", flatland_rl_service_id='FLATLAND_RL_SERVICE_ID', remote_host='127.0.0.1', remote_port=6379, remote_db=0, remote_password=None, visualize=False, video_generation_envs=[], report=None, verbose=False): # Test Env folder Paths self.test_env_folder = test_env_folder self.video_generation_envs = video_generation_envs self.env_file_paths = self.get_env_filepaths() random.shuffle(self.env_file_paths) print(self.env_file_paths) # Shuffle all the env_file_paths for more exciting videos # and for more uniform time progression # Logging and Reporting related vars self.verbose = verbose self.report = report # Communication Protocol Related vars self.namespace = "flatland-rl" self.service_id = flatland_rl_service_id self.command_channel = "{}::{}::commands".format( self.namespace, self.service_id) # Message Broker related vars self.remote_host = remote_host self.remote_port = remote_port self.remote_db = remote_db self.remote_password = remote_password self.instantiate_redis_connection_pool() # AIcrowd evaluation specific vars self.oracle_events = crowdai_api.events.CrowdAIEvents(with_oracle=True) self.evaluation_state = { "state": "PENDING", "progress": 0.0, "simulation_count": 0, "total_simulation_count": len(self.env_file_paths), "score": { "score": 0.0, "score_secondary": 0.0 }, "meta": { "normalized_reward": 0.0 } } self.stats = {} # RailEnv specific variables self.env = False self.env_renderer = False self.reward = 0 self.simulation_count = -1 self.simulation_rewards = [] self.simulation_rewards_normalized = [] self.simulation_percentage_complete = [] self.simulation_steps = [] self.simulation_times = [] self.env_step_times = [] self.begin_simulation = False self.current_step = 0 self.visualize = visualize self.vizualization_folder_name = "./.visualizations" self.record_frame_step = 0 if self.visualize: if os.path.exists(self.vizualization_folder_name): print( "[WARNING] Deleting already existing visualizations folder at : {}" .format(self.vizualization_folder_name)) shutil.rmtree(self.vizualization_folder_name) os.mkdir(self.vizualization_folder_name) def update_running_mean_stats(self, key, scalar): """ Computes the running mean for certain params """ mean_key = "{}_mean".format(key) counter_key = "{}_counter".format(key) try: self.stats[mean_key] = \ ((self.stats[mean_key] * self.stats[counter_key]) + scalar) / (self.stats[counter_key] + 1) self.stats[counter_key] += 1 except KeyError: self.stats[mean_key] = 0 self.stats[counter_key] = 0 def get_env_filepaths(self): """ Gathers a list of all available rail env files to be used for evaluation. The folder structure expected at the `test_env_folder` is similar to : . 
├── Test_0 │ ├── Level_1.pkl │ ├── ....... │ ├── ....... │ └── Level_99.pkl └── Test_1 ├── Level_1.pkl ├── ....... ├── ....... └── Level_99.pkl """ env_paths = sorted( glob.glob(os.path.join(self.test_env_folder, "*/*.pkl"))) # Remove the root folder name from the individual # lists, so that we only have the path relative # to the test root folder env_paths = sorted( [os.path.relpath(x, self.test_env_folder) for x in env_paths]) return env_paths def instantiate_redis_connection_pool(self): """ Instantiates a Redis connection pool which can be used to communicate with the message broker """ if self.verbose or self.report: print("Attempting to connect to redis server at {}:{}/{}".format( self.remote_host, self.remote_port, self.remote_db)) self.redis_pool = redis.ConnectionPool(host=self.remote_host, port=self.remote_port, db=self.remote_db, password=self.remote_password) self.redis_conn = redis.Redis(connection_pool=self.redis_pool) def get_redis_connection(self): """ Obtains a new redis connection from a previously instantiated redis connection pool """ return self.redis_conn def _error_template(self, payload): """ Simple helper function to pass a payload as a part of a flatland comms error template. """ _response = {} _response['type'] = messages.FLATLAND_RL.ERROR _response['payload'] = payload return _response @timeout_decorator.timeout(PER_STEP_TIMEOUT, use_signals=use_signals_in_timeout ) # timeout for each command def _get_next_command(self, _redis): """ A low level wrapper for obtaining the next command from a pre-agreed command channel. At the momment, the communication protocol uses lpush for pushing in commands, and brpop for reading out commands. """ command = _redis.brpop(self.command_channel)[1] return command def get_next_command(self): """ A helper function to obtain the next command, which transparently also deals with things like unpacking of the command from the packed message, and consider the timeouts, etc when trying to fetch a new command. """ try: _redis = self.get_redis_connection() command = self._get_next_command(_redis) if self.verbose or self.report: print("Command Service: ", command) except timeout_decorator.timeout_decorator.TimeoutError: raise Exception("Timeout in step {} of simulation {}".format( self.current_step, self.simulation_count)) command = msgpack.unpackb(command, object_hook=m.decode, encoding="utf8") if self.verbose: print("Received Request : ", command) message_queue_latency = time.time() - command["timestamp"] self.update_running_mean_stats("message_queue_latency", message_queue_latency) return command def send_response(self, _command_response, command, suppress_logs=False): _redis = self.get_redis_connection() command_response_channel = command['response_channel'] if self.verbose and not suppress_logs: print("Responding with : ", _command_response) _redis.rpush( command_response_channel, msgpack.packb(_command_response, default=m.encode, use_bin_type=True)) def handle_ping(self, command): """ Handles PING command from the client. 
""" service_version = flatland.__version__ if "version" in command["payload"].keys(): client_version = command["payload"]["version"] else: # 2.1.4 -> when the version mismatch check was added client_version = "2.1.4" _command_response = {} _command_response['type'] = messages.FLATLAND_RL.PONG _command_response['payload'] = {} if client_version not in SUPPORTED_CLIENT_VERSIONS: _command_response['type'] = messages.FLATLAND_RL.ERROR _command_response['payload']['message'] = \ "Client-Server Version Mismatch => " + \ "[ Client Version : {} ] ".format(client_version) + \ "[ Server Version : {} ] ".format(service_version) self.send_response(_command_response, command) raise Exception(_command_response['payload']['message']) self.send_response(_command_response, command) def handle_env_create(self, command): """ Handles a ENV_CREATE command from the client TODO: Add a high level summary of everything thats happening here. """ self.simulation_count += 1 if self.simulation_count < len(self.env_file_paths): """ There are still test envs left that are yet to be evaluated """ test_env_file_path = self.env_file_paths[self.simulation_count] print("Evaluating : {}".format(test_env_file_path)) test_env_file_path = os.path.join(self.test_env_folder, test_env_file_path) del self.env self.env = RailEnv( width=1, height=1, rail_generator=rail_from_file(test_env_file_path), schedule_generator=schedule_from_file(test_env_file_path), malfunction_generator_and_process_data=malfunction_from_file( test_env_file_path), obs_builder_object=DummyObservationBuilder()) if self.begin_simulation: # If begin simulation has already been initialized # atleast once self.simulation_times.append(time.time() - self.begin_simulation) self.begin_simulation = time.time() self.simulation_rewards.append(0) self.simulation_rewards_normalized.append(0) self.simulation_percentage_complete.append(0) self.simulation_steps.append(0) self.current_step = 0 _observation, _info = self.env.reset(regenerate_rail=True, regenerate_schedule=True, activate_agents=False, random_seed=RANDOM_SEED) if self.visualize: if self.env_renderer: del self.env_renderer self.env_renderer = RenderTool( self.env, gl="PILSVG", ) _command_response = {} _command_response[ 'type'] = messages.FLATLAND_RL.ENV_CREATE_RESPONSE _command_response['payload'] = {} _command_response['payload']['observation'] = _observation _command_response['payload'][ 'env_file_path'] = self.env_file_paths[self.simulation_count] _command_response['payload']['info'] = _info _command_response['payload']['random_seed'] = RANDOM_SEED else: """ All test env evaluations are complete """ _command_response = {} _command_response[ 'type'] = messages.FLATLAND_RL.ENV_CREATE_RESPONSE _command_response['payload'] = {} _command_response['payload']['observation'] = False _command_response['payload']['env_file_path'] = False _command_response['payload']['info'] = False _command_response['payload']['random_seed'] = False self.send_response(_command_response, command) ##################################################################### # Update evaluation state ##################################################################### progress = np.clip( self.simulation_count * 1.0 / len(self.env_file_paths), 0, 1) mean_reward = round(np.mean(self.simulation_rewards), 2) mean_normalized_reward = round( np.mean(self.simulation_rewards_normalized), 2) mean_percentage_complete = round( np.mean(self.simulation_percentage_complete), 3) self.evaluation_state["state"] = "IN_PROGRESS" self.evaluation_state["progress"] = 
progress self.evaluation_state["simulation_count"] = self.simulation_count self.evaluation_state["score"]["score"] = mean_percentage_complete self.evaluation_state["score"]["score_secondary"] = mean_reward self.evaluation_state["meta"][ "normalized_reward"] = mean_normalized_reward self.handle_aicrowd_info_event(self.evaluation_state) def handle_env_step(self, command): """ Handles a ENV_STEP command from the client TODO: Add a high level summary of everything thats happening here. """ _payload = command['payload'] if not self.env: raise Exception( "env_client.step called before env_client.env_create() call") if self.env.dones['__all__']: raise Exception( "Client attempted to perform an action on an Env which \ has done['__all__']==True") action = _payload['action'] time_start = time.time() _observation, all_rewards, done, info = self.env.step(action) time_diff = time.time() - time_start self.update_running_mean_stats("internal_env_step_time", time_diff) cumulative_reward = sum(all_rewards.values()) self.simulation_rewards[-1] += cumulative_reward self.simulation_steps[-1] += 1 """ The normalized rewards normalize the reward for an episode by dividing the whole reward by max-time-steps allowed in that episode, and the number of agents present in that episode """ self.simulation_rewards_normalized[-1] += \ cumulative_reward / ( self.env._max_episode_steps + self.env.get_num_agents() ) if done["__all__"]: # Compute percentage complete complete = 0 for i_agent in range(self.env.get_num_agents()): agent = self.env.agents[i_agent] if agent.status in [RailAgentStatus.DONE_REMOVED]: complete += 1 percentage_complete = complete * 1.0 / self.env.get_num_agents() self.simulation_percentage_complete[-1] = percentage_complete # Record Frame if self.visualize: self.env_renderer.render_env(show=False, show_observations=False, show_predictions=False) """ Only save the frames for environments which are separately provided in video_generation_indices param """ current_env_path = self.env_file_paths[self.simulation_count] if current_env_path in self.video_generation_envs: self.env_renderer.gl.save_image( os.path.join( self.vizualization_folder_name, "flatland_frame_{:04d}.png".format( self.record_frame_step))) self.record_frame_step += 1 def handle_env_submit(self, command): """ Handles a ENV_SUBMIT command from the client TODO: Add a high level summary of everything thats happening here. """ _payload = command['payload'] ###################################################################### # Print Local Stats ###################################################################### print("=" * 100) print("=" * 100) print("## Server Performance Stats") print("=" * 100) for _key in self.stats: if _key.endswith("_mean"): print("\t - {}\t:{}".format(_key, self.stats[_key])) print("=" * 100) # Register simulation time of the last episode self.simulation_times.append(time.time() - self.begin_simulation) if len(self.simulation_rewards) != len(self.env_file_paths): raise Exception( """env.submit called before the agent had the chance to operate on all the test environments. 
""") mean_reward = round(np.mean(self.simulation_rewards), 2) mean_normalized_reward = round( np.mean(self.simulation_rewards_normalized), 2) mean_percentage_complete = round( np.mean(self.simulation_percentage_complete), 3) if self.visualize and len(os.listdir( self.vizualization_folder_name)) > 0: # Generate the video # # Note, if you had depdency issues due to ffmpeg, you can # install it by : # # conda install -c conda-forge x264 ffmpeg print("Generating Video from thumbnails...") video_output_path, video_thumb_output_path = \ aicrowd_helpers.generate_movie_from_frames( self.vizualization_folder_name ) print("Videos : ", video_output_path, video_thumb_output_path) # Upload to S3 if configuration is available if aicrowd_helpers.is_grading( ) and aicrowd_helpers.is_aws_configured() and self.visualize: video_s3_key = aicrowd_helpers.upload_to_s3(video_output_path) video_thumb_s3_key = aicrowd_helpers.upload_to_s3( video_thumb_output_path) static_thumbnail_s3_key = aicrowd_helpers.upload_random_frame_to_s3( self.vizualization_folder_name) self.evaluation_state["score"][ "media_content_type"] = "video/mp4" self.evaluation_state["score"]["media_large"] = video_s3_key self.evaluation_state["score"][ "media_thumbnail"] = video_thumb_s3_key self.evaluation_state["meta"][ "static_media_frame"] = static_thumbnail_s3_key else: print("[WARNING] Ignoring uploading of video to S3") _command_response = {} _command_response['type'] = messages.FLATLAND_RL.ENV_SUBMIT_RESPONSE _payload = {} _payload['mean_reward'] = mean_reward _payload['mean_normalized_reward'] = mean_normalized_reward _payload['mean_percentage_complete'] = mean_percentage_complete _command_response['payload'] = _payload self.send_response(_command_response, command) ##################################################################### # Update evaluation state ##################################################################### self.evaluation_state["state"] = "FINISHED" self.evaluation_state["progress"] = 1.0 self.evaluation_state["simulation_count"] = self.simulation_count self.evaluation_state["score"]["score"] = mean_percentage_complete self.evaluation_state["score"]["score_secondary"] = mean_reward self.evaluation_state["meta"][ "normalized_reward"] = mean_normalized_reward self.handle_aicrowd_success_event(self.evaluation_state) print("#" * 100) print("EVALUATION COMPLETE !!") print("#" * 100) print("# Mean Reward : {}".format(mean_reward)) print("# Mean Normalized Reward : {}".format(mean_normalized_reward)) print( "# Mean Percentage Complete : {}".format(mean_percentage_complete)) print("#" * 100) print("#" * 100) def report_error(self, error_message, command_response_channel): """ A helper function used to report error back to the client """ _redis = self.get_redis_connection() _command_response = {} _command_response['type'] = messages.FLATLAND_RL.ERROR _command_response['payload'] = error_message _redis.rpush( command_response_channel, msgpack.packb(_command_response, default=m.encode, use_bin_type=True)) self.evaluation_state["state"] = "ERROR" self.evaluation_state["error"] = error_message self.handle_aicrowd_error_event(self.evaluation_state) def handle_aicrowd_info_event(self, payload): self.oracle_events.register_event( event_type=self.oracle_events.CROWDAI_EVENT_INFO, payload=payload) def handle_aicrowd_success_event(self, payload): self.oracle_events.register_event( event_type=self.oracle_events.CROWDAI_EVENT_SUCCESS, payload=payload) def handle_aicrowd_error_event(self, payload): self.oracle_events.register_event( 
event_type=self.oracle_events.CROWDAI_EVENT_ERROR, payload=payload) def run(self): """ Main runner function which waits for commands from the client and acts accordingly. """ print("Listening at : ", self.command_channel) MESSAGE_QUEUE_LATENCY = [] while True: command = self.get_next_command() if "timestamp" in command.keys(): latency = time.time() - command["timestamp"] MESSAGE_QUEUE_LATENCY.append(latency) if self.verbose: print("Self.Reward : ", self.reward) print("Current Simulation : ", self.simulation_count) if self.env_file_paths and \ self.simulation_count < len(self.env_file_paths): print("Current Env Path : ", self.env_file_paths[self.simulation_count]) try: if command['type'] == messages.FLATLAND_RL.PING: """ INITIAL HANDSHAKE : Respond with PONG """ self.handle_ping(command) elif command['type'] == messages.FLATLAND_RL.ENV_CREATE: """ ENV_CREATE Respond with an internal _env object """ self.handle_env_create(command) elif command['type'] == messages.FLATLAND_RL.ENV_STEP: """ ENV_STEP Request : Action dict Respond with updated [observation,reward,done,info] after step """ self.handle_env_step(command) elif command['type'] == messages.FLATLAND_RL.ENV_SUBMIT: """ ENV_SUBMIT Submit the final cumulative reward """ print("Overall Message Queue Latency : ", np.array(MESSAGE_QUEUE_LATENCY).mean()) self.handle_env_submit(command) else: _error = self._error_template("UNKNOWN_REQUEST:{}".format( str(command))) if self.verbose: print("Responding with : ", _error) self.report_error(_error, command['response_channel']) return _error except Exception as e: print("Error : ", str(e)) print(traceback.format_exc()) self.report_error(self._error_template(str(e)), command['response_channel']) return self._error_template(str(e))
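The service above reads msgpack-encoded commands from a Redis list and replies on the per-command response channel named in each message. A stripped-down client-side sketch (assumed; the real counterpart is FlatlandRemoteClient, and the response-channel naming here is arbitrary) illustrating the message shape for the initial PING handshake:

import time
import redis
import msgpack
from flatland.evaluators import messages

def ping_service(host="127.0.0.1", port=6379, db=0, password=None,
                 namespace="flatland-rl", service_id="FLATLAND_RL_SERVICE_ID"):
    """Push a PING onto the command channel and block until the PONG (or ERROR) arrives."""
    r = redis.Redis(host=host, port=port, db=db, password=password)
    command_channel = "{}::{}::commands".format(namespace, service_id)
    response_channel = "{}::{}::response::{}".format(namespace, service_id, time.time())
    command = {
        "type": messages.FLATLAND_RL.PING,
        "payload": {"version": "2.1.4"},  # checked against SUPPORTED_CLIENT_VERSIONS
        "response_channel": response_channel,
        "timestamp": time.time(),
    }
    # The service reads commands with brpop, so the client pushes with lpush ...
    r.lpush(command_channel, msgpack.packb(command, use_bin_type=True))
    # ... and the service rpush'es its response onto the per-command response channel.
    _, raw = r.brpop(response_channel)
    return msgpack.unpackb(raw, raw=False)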