def test_normalize_features():
    random.seed(1)
    np.random.seed(1)
    max_depth = 4

    for i in range(10):
        tree_observer = TreeObsForRailEnv(max_depth=max_depth)
        next_rand_number = random.randint(0, 100)

        env = RailEnv(width=10, height=10,
                      rail_generator=complex_rail_generator(nr_start_goal=10, nr_extra=1, min_dist=8,
                                                            max_dist=99999, seed=next_rand_number),
                      schedule_generator=complex_schedule_generator(),
                      number_of_agents=1,
                      obs_builder_object=tree_observer)

        obs, all_rewards, done, _ = env.step({0: 0})
        obs_new = tree_observer.get()

        # data, distance, agent_data = split_tree(tree=np.array(obs_old), num_features_per_node=11)
        data_normalized = normalize_observation(obs_new, max_depth, observation_radius=10)

        filename = 'testdata/test_array_{}.csv'.format(i)
        data_loaded = np.loadtxt(filename, delimiter=',')

        assert np.allclose(data_loaded, data_normalized)
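# A note on the fixtures: each reference file is a plain comma-separated dump,
# so if the normalization ever changes intentionally, the baselines can be
# regenerated with the inverse of the np.loadtxt call above. A minimal sketch
# (assumes the same seeds and env setup as the test; `runs` is a hypothetical
# list holding the ten normalized observation vectors):

def regenerate_test_fixtures(runs):
    # Overwrites the CSV baselines that test_normalize_features asserts against.
    for i, data_normalized in enumerate(runs):
        np.savetxt('testdata/test_array_{}.csv'.format(i), data_normalized, delimiter=',')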
def eval_policy(env, policy, train_params, obs_params):
    n_eval_episodes = train_params.n_evaluation_episodes
    max_steps = env._max_episode_steps
    tree_depth = obs_params.observation_tree_depth
    observation_radius = obs_params.observation_radius

    action_dict = dict()
    scores = []
    completions = []
    nb_steps = []

    for episode_idx in range(n_eval_episodes):
        agent_obs = [None] * env.get_num_agents()
        score = 0.0

        obs, info = env.reset(regenerate_rail=True, regenerate_schedule=True)
        final_step = 0

        for step in range(max_steps - 1):
            for agent in env.get_agent_handles():
                if obs[agent]:
                    agent_obs[agent] = normalize_observation(obs[agent], tree_depth=tree_depth,
                                                             observation_radius=observation_radius)

                action = 0
                if info['action_required'][agent]:
                    action = policy.act(agent_obs[agent], eps=0.0)
                action_dict.update({agent: action})

            obs, all_rewards, done, info = env.step(action_dict)

            for agent in env.get_agent_handles():
                score += all_rewards[agent]

            final_step = step

            if done['__all__']:
                break

        normalized_score = score / (max_steps * env.get_num_agents())
        scores.append(normalized_score)

        tasks_finished = sum(done[idx] for idx in env.get_agent_handles())
        completion = tasks_finished / max(1, env.get_num_agents())
        completions.append(completion)

        nb_steps.append(final_step)

    print("\t✅ Eval: score {:.3f} done {:.1f}%".format(np.mean(scores), np.mean(completions) * 100.0))

    return scores, completions, nb_steps
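# A minimal invocation sketch for eval_policy above. The Namespace fields
# mirror the attributes read at the top of the function; `env` and `policy`
# are assumptions (any RailEnv exposing _max_episode_steps and any policy
# with an act(obs, eps) method should fit):

from argparse import Namespace

example_train_params = Namespace(n_evaluation_episodes=10)
example_obs_params = Namespace(observation_tree_depth=2, observation_radius=10)
# scores, completions, nb_steps = eval_policy(env, policy, example_train_params, example_obs_params)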
def get_actions(obs, n_agents, info):
    # Note: `agent`, `observation_tree_depth`, and `observation_radius` come from module scope.
    actions = {}
    for _idx in obs.keys():
        # Optionally also require: info['action_required'][_idx] and info['malfunction'][_idx] != 1
        if obs[_idx] is not None:
            normalized_obs = normalize_observation(obs[_idx], observation_tree_depth, observation_radius)
            action = agent.take_action(normalized_obs)
            actions[_idx] = action
        else:
            actions[_idx] = 0
    return actions
def step(self, action_dict):
    obs, rewards, dones, infos = super().step(action_dict)
    agent_obs = [None] * self.get_num_agents()
    for a in range(self.get_num_agents()):
        if obs[a]:
            agent_obs[a] = normalize_observation(obs[a], tree_depth, observation_radius=10)
    return agent_obs, rewards, dones, infos
def reset(self):
    obs, info = super().reset(True, True)
    agent_obs = [None] * self.get_num_agents()
    for a in range(self.get_num_agents()):
        if obs[a]:
            agent_obs[a] = normalize_observation(obs[a], tree_depth,
                                                 observation_radius=10).astype('float32')
    return agent_obs, info
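# Usage sketch for the wrapper above: since the subclassed env returns already
# normalized per-agent observations, a rollout loop can feed them straight into
# a policy. `policy.act(obs, eps)` is an assumption, mirroring the DDDQN-style
# policies used elsewhere in this file:

def run_episode(env, policy):
    # Roll out one episode with the wrapper env defined above.
    agent_obs, info = env.reset()
    while True:
        actions = {a: (policy.act(agent_obs[a], eps=0.0) if agent_obs[a] is not None else 0)
                   for a in range(env.get_num_agents())}
        agent_obs, rewards, dones, info = env.step(actions)
        if dones['__all__']:
            return rewards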
def normalized_obs(obs):
    norm_obs = []
    for _idx in obs.keys():
        try:
            if obs[_idx] is not None:
                norm_obs.append(normalize_observation(obs[_idx], observation_tree_depth,
                                                      observation_radius))
        except Exception:
            # Log the raw observation that failed to normalize instead of crashing.
            print("OBS inside", obs[_idx])
    return norm_obs
def my_controller(env, obs, number_of_agents):
    # Note: `done`, `info`, `action_dict`, `policy`, and `observation_tree_depth`
    # come from module scope.
    for a in range(number_of_agents):
        agent = env.agents[a]
        if done[a]:
            continue
        if agent.speed_data['position_fraction'] > 5:
            action_dict.update({a: 4})  # RailEnvActions.STOP_MOVING
            continue
        if info['action_required'][a]:
            if agent.position is not None:
                possible_transition_num = np.count_nonzero(
                    env.rail.get_transitions(*agent.position, agent.direction))
                if possible_transition_num == 1:
                    action = 2  # RailEnvActions.MOVE_FORWARD: only one way to go
                else:
                    action = policy.act(normalize_observation(obs[a], observation_tree_depth,
                                                              observation_radius=10), eps=0.01)
            else:
                action = 2  # agent not yet on the grid: move forward to enter
        else:
            action = 0  # RailEnvActions.DO_NOTHING
        action_dict.update({a: action})
    return action_dict
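# For reference, the `possible_transition_num == 1` shortcut above counts the
# outgoing transition flags for the agent's current cell and heading:
# env.rail.get_transitions(row, col, direction) returns a 4-tuple of 0/1 flags,
# one per exit direction, so a straight piece of track yields exactly one
# nonzero entry and the forward action is safe without running inference.
# A toy illustration with a hand-built tuple (the values are made up):

example_transitions = (0, 1, 0, 0)  # only one way out: skip the network, just go forward
assert np.count_nonzero(example_transitions) == 1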
def get_normalized_observation(observation, tree_depth: int, observation_radius=0):
    return normalize_observation(observation, tree_depth, observation_radius)
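# The flat vector returned by normalize_observation has a fixed length that
# depends only on the tree depth and the per-node feature count: a tree of
# depth d has sum(4**i for i in range(d + 1)) nodes, since each node has up to
# four children (one per direction). The state_size computations throughout
# this file use exactly this formula. A small sanity-check sketch; the default
# of 11 features matches TreeObsForRailEnv.observation_dim in the flatland
# releases this code targets, but verify against your installed version:

def expected_obs_length(tree_depth, n_features_per_node=11):
    n_nodes = sum(4 ** i for i in range(tree_depth + 1))
    return n_nodes * n_features_per_node

assert expected_obs_length(2) == 21 * 11  # depth 2: 1 + 4 + 16 = 21 nodes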
def get_reward(weights, model, render=False):
    # Note: relies on module-level globals (x_dim, y_dim, n_agents, speed_ration_map,
    # stochastic_data, TreeObservation, tree_depth, agent buffers, cuda, etc.).
    cloned_model = copy.deepcopy(model)
    for i, param in enumerate(cloned_model.parameters()):
        try:
            param.data.copy_(weights[i])
        except Exception:
            param.data.copy_(weights[i].data)

    env_Orig = RailEnv(
        width=x_dim,
        height=y_dim,
        rail_generator=sparse_rail_generator(
            max_num_cities=3,  # Number of cities in map (where train stations are)
            seed=1,  # Random seed
            grid_mode=False,
            max_rails_between_cities=2,
            max_rails_in_city=3),
        schedule_generator=sparse_schedule_generator(speed_ration_map),
        number_of_agents=n_agents,
        stochastic_data=stochastic_data,  # Malfunction data generator
        obs_builder_object=TreeObservation)

    env = copy.deepcopy(env_Orig)

    # After training we want to render the results, so we also load a renderer
    env_renderer = RenderTool(env, gl="PILSVG")

    # And the max number of steps we want to take per episode
    max_steps = int(4 * 2 * (20 + env.height + env.width))

    n_episodes = 1
    for trials in range(1, n_episodes + 1):
        # Reset environment
        obs, info = env.reset(True, True)
        env_renderer.reset()

        # Build agent-specific observations
        for a in range(env.get_num_agents()):
            if obs[a]:
                agent_obs[a] = normalize_observation(obs[a], tree_depth, observation_radius=10)
                agent_obs_buffer[a] = agent_obs[a].copy()

        # Reset score and done
        score = 0
        env_done = 0
        step = 0

        # Run episode
        while True:
            # Action
            for a in range(env.get_num_agents()):
                if info['action_required'][a]:
                    # If an action is required, we want to store the obs at that step
                    # as well as the action
                    update_values[a] = True
                    batch = torch.from_numpy(agent_obs[a][np.newaxis, ...]).float()
                    if cuda:
                        batch = batch.cuda()
                    prediction = cloned_model(Variable(batch))
                    action = prediction.data.cpu().numpy().argmax()
                    # action = agent.act(agent_obs[a], eps=eps)
                    action_prob[action] += 1
                else:
                    update_values[a] = False
                    action = 0
                action_dict.update({a: action})

            # Environment step
            next_obs, all_rewards, done, info = env.step(action_dict)
            step += 1

            if render:
                env_renderer.render_env(show=True, show_predictions=True, show_observations=False)

            for a in range(env.get_num_agents()):
                # Only update the values when we are done or when an action was taken
                # and thus relevant information is present
                if update_values[a] or done[a]:
                    cummulated_reward[a] = 0.
                    agent_obs_buffer[a] = agent_obs[a].copy()
                    agent_action_buffer[a] = action_dict[a]
                if next_obs[a]:
                    agent_obs[a] = normalize_observation(next_obs[a], tree_depth, observation_radius=10)

                score += all_rewards[a] / env.get_num_agents()

            # Copy observation
            if done['__all__'] or step >= max_steps:
                env_done = 1
                break

        # Collect information about training
        tasks_finished = 0
        for current_agent in env.agents:
            if current_agent.status == RailAgentStatus.DONE_REMOVED:
                tasks_finished += 1
        done_window.append(tasks_finished / max(1, env.get_num_agents()))
        scores_window.append(score / max_steps)  # save most recent score
        scores.append(np.mean(scores_window))
        dones_list.append(np.mean(done_window))

        print(
            '\rTraining {} Agents on ({},{}).\t Episode {}\t Average Score: {:.3f}\tDones: {:.2f}%\t Action Probabilities: \t {}'
            .format(env.get_num_agents(), x_dim, y_dim, trials,
                    np.mean(scores_window), 100 * np.mean(done_window),
                    action_prob / np.sum(action_prob)), end=" ")

        data = [[n_agents, x_dim, y_dim, trials,
                 np.mean(scores_window), 100 * np.mean(done_window),
                 step, action_prob / np.sum(action_prob)]]
        dfCur = pd.DataFrame(data)
        with open(f'ES_TrainingResults_{n_agents}_{x_dim}_{y_dim}.csv', 'a') as f:
            dfCur.to_csv(f, index=False, header=False)

    return np.mean(scores)
def main(args):
    try:
        opts, args = getopt.getopt(args, "", ["sleep-for-animation="])
    except getopt.GetoptError as err:
        print(str(err))  # will print something like "option -a not recognized"
        sys.exit(2)
    sleep_for_animation = True
    for o, a in opts:
        if o in ("--sleep-for-animation",):
            sleep_for_animation = str2bool(a)
        else:
            assert False, "unhandled option"

    batch_builder = SampleBatchBuilder()  # or MultiAgentSampleBatchBuilder
    writer = JsonWriter("./out/")

    # Setting these 2 parameters to True can slow down training
    visuals = False
    sleep_for_animation = False

    if visuals:
        from flatland.utils.rendertools import RenderTool

    max_depth = 30
    tree_depth = 2
    trial_start = 100
    n_trials = 999
    start = 0

    columns = ['Agents', 'X_DIM', 'Y_DIM', 'TRIAL_NO', 'REWARD',
               'NORMALIZED_REWARD', 'DONE_RATIO', 'STEPS', 'ACTION_PROB']
    df_all_results = pd.DataFrame(columns=columns)

    for trials in range(trial_start, n_trials + 1):
        env_file = f"envs-100-999/envs/Level_{trials}.pkl"
        # env_file = f"../env_configs/round_1-small/Test_0/Level_{trials}.mpk"
        # file = f"../env_configs/actions-small/Test_0/Level_{trials}.mpk"
        file = f"envs-100-999/actions/envs/Level_{trials}.json"

        if not os.path.isfile(env_file) or not os.path.isfile(file):
            print("Missing file!", env_file, file)
            continue

        step = 0

        obs_builder_object = TreeObsForRailEnv(
            max_depth=tree_depth,
            predictor=ShortestPathPredictorForRailEnv(max_depth))

        env = RailEnv(width=1, height=1,
                      rail_generator=rail_from_file(env_file),
                      schedule_generator=schedule_from_file(env_file),
                      malfunction_generator_and_process_data=malfunction_from_file(env_file),
                      obs_builder_object=obs_builder_object)

        obs, info = env.reset(regenerate_rail=True, regenerate_schedule=True,
                              activate_agents=False, random_seed=1001)

        with open(file, "r") as files:
            expert_actions = json.load(files)

        n_agents = env.get_num_agents()
        x_dim, y_dim = env.width, env.height

        agent_obs = [None] * n_agents
        agent_obs_buffer = [None] * n_agents

        done = dict()
        done["__all__"] = False

        if imitate:
            agent_action_buffer = list(expert_actions[step].values())
        else:
            # , p=[0.2, 0, 0.5])  # [0] * n_agents
            agent_action_buffer = np.random.choice(5, n_agents, replace=True)
        update_values = [False] * n_agents

        max_steps = int(4 * 2 * (20 + env.height + env.width))
        action_size = 5  # 3

        # And some variables to keep track of the progress
        action_dict = dict()
        scores_window = deque(maxlen=100)
        reward_window = deque(maxlen=100)
        done_window = deque(maxlen=100)
        action_prob = [0] * action_size

        # agent = Agent(state_size, action_size)

        if visuals:
            env_renderer = RenderTool(env, gl="PILSVG")
            env_renderer.render_env(show=True, frames=True, show_observations=True)

        for a in range(n_agents):
            if obs[a]:
                agent_obs[a] = normalize_observation(obs[a], tree_depth, observation_radius=10)
                agent_obs_buffer[a] = agent_obs[a].copy()

        # Reset score and done
        score = 0
        agent_action_buffer = np.zeros(n_agents)
        # prev_action = np.zeros_like(envs.action_space.sample())
        prev_reward = np.zeros(n_agents)

        for step in range(max_steps):
            for a in range(n_agents):
                if info['action_required'][a]:
                    if imitate:
                        if step < len(expert_actions):
                            action = expert_actions[step][str(a)]
                        else:
                            action = 0
                    else:
                        action = 0
                    action_prob[action] += 1
                    update_values[a] = True
                else:
                    update_values[a] = False
                    action = 0
                action_dict.update({a: action})

            next_obs, all_rewards, done, info = env.step(action_dict)

            for a in range(n_agents):
                if next_obs[a] is not None:
                    agent_obs[a] = normalize_observation(next_obs[a], tree_depth, observation_radius=10)

                # Only update the values when we are done or when an action
                # was taken and thus relevant information is present
                if update_values[a] or done[a]:
                    start += 1
                    batch_builder.add_values(
                        t=step,
                        eps_id=trials,
                        agent_index=0,
                        obs=agent_obs_buffer[a],
                        actions=action_dict[a],
                        action_prob=1.0,  # put the true action probability here
                        rewards=all_rewards[a],
                        prev_actions=agent_action_buffer[a],
                        prev_rewards=prev_reward[a],
                        dones=done[a],
                        infos=info['action_required'][a],
                        new_obs=agent_obs[a])
                agent_obs_buffer[a] = agent_obs[a].copy()
                agent_action_buffer[a] = action_dict[a]
                prev_reward[a] = all_rewards[a]

                score += all_rewards[a]  # / envs.get_num_agents()

            if visuals:
                env_renderer.render_env(show=True, frames=True, show_observations=True)
                if sleep_for_animation:
                    time.sleep(0.5)

            if done["__all__"] or step > max_steps:
                writer.write(batch_builder.build_and_reset())
                break

            # Collect information about training
            if step % 100 == 0:
                tasks_finished = 0
                for current_agent in env.agents:
                    if current_agent.status == RailAgentStatus.DONE_REMOVED:
                        tasks_finished += 1
                print(
                    '\rTrial No {} Training {} Agents on ({},{}).\t Steps {}\t Reward: {:.3f}\t Normalized Reward: {:.3f}\tDones: {:.2f}%\t'
                    .format(trials, env.get_num_agents(), x_dim, y_dim, step,
                            score, score / (max_steps + n_agents),
                            100 * np.mean(tasks_finished / max(1, env.get_num_agents()))),
                    end=" ")

        tasks_finished = 0
        for current_agent in env.agents:
            if current_agent.status == RailAgentStatus.DONE_REMOVED:
                tasks_finished += 1
        done_window.append(tasks_finished / max(1, env.get_num_agents()))
        reward_window.append(score)
        scores_window.append(score / (max_steps + n_agents))

        data = [[n_agents, x_dim, y_dim, trials,
                 np.mean(reward_window), np.mean(scores_window),
                 100 * np.mean(done_window), step,
                 action_prob / np.sum(action_prob)]]

        df_cur = pd.DataFrame(data, columns=columns)
        df_all_results = pd.concat([df_all_results, df_cur])

        if imitate:
            df_all_results.to_csv('TreeImitationLearning_DQN_TrainingResults.csv', index=False)

        print(
            '\rTrial No {} Training {} Agents on ({},{}).\t Total Steps {}\t Reward: {:.3f}\t Normalized Reward: {:.3f}\tDones: {:.2f}%\t'
            .format(trials, env.get_num_agents(), x_dim, y_dim, step,
                    np.mean(reward_window), np.mean(scores_window),
                    100 * np.mean(done_window)))

        if visuals:
            env_renderer.close_window()
        gc.collect()
def evaluate(seed=37429879, timed=False, filename="./rl-weights.pth", debug=False, refresh=1):
    # Attempt to load policy from disk.
    policy = load_policy(filename, seed=seed)

    # Create environment with given seeding.
    env, max_steps, _, _, observation_tree_depth, _ = create_multi_agent_rail_env(seed + 1, timed)

    # Fixed environment parameters (note: these must correspond with the training parameters!)
    observation_radius = 10

    env_renderer = None
    if debug:
        env_renderer = RenderTool(env, screen_width=1920, screen_height=1080)

    # Create containers for the agent actions and observations.
    action_dict = dict()
    agent_obs = [None] * env.number_of_agents

    num_maps = 100
    scores = []
    successes = 0
    scores_window = deque(maxlen=100)  # todo smooth when rendering instead
    completion_window = deque(maxlen=100)
    completion = []
    schedule_length = []

    for _ in range(0, num_maps):
        # Create a new map.
        obs, info = env.reset(True, True)
        score = 0

        if debug:
            env_renderer.reset()
            env_renderer.render_env(show=True, frames=False, show_observations=False)
            time.sleep(refresh)

        # Run episode
        for _ in range(max_steps - 1):
            # Build agent-specific observations
            for agent in env.get_agent_handles():
                if obs[agent]:
                    agent_obs[agent] = normalize_observation(obs[agent], observation_tree_depth,
                                                             observation_radius=observation_radius)

            # If an action is required, select the action.
            for agent in env.get_agent_handles():
                action = 0
                if info['action_required'][agent]:
                    action = policy.act(agent_obs[agent], eps=0.08)
                    # print("Required " + str(action))
                action_dict.update({agent: action})
            schedule_length.append(len(action_dict))

            # Environment step
            obs, all_rewards, done, info = env.step(action_dict)

            if debug:
                env_renderer.render_env(show=True, frames=False, show_observations=False)
                time.sleep(refresh)

            # Track rewards.
            for agent in env.get_agent_handles():
                score = score + all_rewards[agent]

            if done['__all__']:
                successes = successes + 1
                break

        tasks_finished = np.sum([int(done[idx]) for idx in env.get_agent_handles()])
        completion_window.append(tasks_finished / max(1, env.get_num_agents()))
        completion.append(np.mean(completion_window))

        # Record scores.
        scores.append(score)

    print("Successful:           %8.2f%%" % (100 * successes / num_maps))
    print("Mean reward:          %8.2f" % (np.mean(scores)))
    print("Median reward:        %8.2f" % (np.median(scores)))
    print("Instances solved:     %8.2f" % (np.mean(completion)))
    print("Mean schedule length: %8.2f" % (np.mean(schedule_length)))
def eval_policy(env_params, checkpoint, n_eval_episodes, max_steps, action_size, state_size,
                seed, render, allow_skipping, allow_caching):
    # Evaluation is faster on CPU (except if you use a really huge policy)
    parameters = {'use_gpu': False}

    # policy = DDDQNPolicy(state_size, action_size, Namespace(**parameters), evaluation_mode=True)
    # policy.qnetwork_local = torch.load(checkpoint, map_location={'cuda:0': 'cpu'})

    env_params = Namespace(**env_params)

    # Environment parameters
    n_agents = env_params.n_agents
    x_dim = env_params.x_dim
    y_dim = env_params.y_dim
    n_cities = env_params.n_cities
    max_rails_between_cities = env_params.max_rails_between_cities
    max_rails_in_city = env_params.max_rails_in_city

    agents = []
    for agent_id in range(n_agents):
        agent = AttentionAgent(num_in_pol=state_size,
                               num_out_pol=action_size,
                               hidden_dim=256,
                               lr=0.001)
        agent.policy = torch.load(os.path.join(checkpoint, f'2300_agent{agent_id}.pth'),
                                  map_location=torch.device('cpu'))
        agent.policy.eval()
        agents.append(agent)

    # Malfunction and speed profiles
    # TODO pass these parameters properly from main!
    malfunction_parameters = MalfunctionParameters(
        malfunction_rate=1. / 2000,  # Rate of malfunctions
        min_duration=20,  # Minimal duration
        max_duration=50  # Max duration
    )

    # Only fast trains in Round 1
    speed_profiles = {
        1.: 1.0,       # Fast passenger train
        1. / 2.: 0.0,  # Fast freight train
        1. / 3.: 0.0,  # Slow commuter train
        1. / 4.: 0.0   # Slow freight train
    }

    # Observation parameters
    observation_tree_depth = env_params.observation_tree_depth
    observation_radius = env_params.observation_radius
    observation_max_path_depth = env_params.observation_max_path_depth

    # Observation builder
    predictor = ShortestPathPredictorForRailEnv(observation_max_path_depth)
    tree_observation = TreeObsForRailEnv(max_depth=observation_tree_depth, predictor=predictor)

    # Setup the environment
    env = RailEnv(
        width=x_dim, height=y_dim,
        rail_generator=sparse_rail_generator(
            max_num_cities=n_cities,
            grid_mode=False,
            max_rails_between_cities=max_rails_between_cities,
            max_rails_in_city=max_rails_in_city,
        ),
        # rail_generator=complex_rail_generator(
        #     nr_start_goal=10, nr_extra=10, min_dist=10, max_dist=99999, seed=1),
        schedule_generator=sparse_schedule_generator(speed_profiles),
        number_of_agents=n_agents,
        malfunction_generator_and_process_data=malfunction_from_params(malfunction_parameters),
        obs_builder_object=tree_observation)

    if render:
        # env_renderer = RenderTool(env, gl="PGL")
        env_renderer = RenderTool(
            env,
            agent_render_variant=AgentRenderVariant.AGENT_SHOWS_OPTIONS_AND_BOX,
            show_debug=False,
            screen_height=600,  # Adjust these parameters to fit your resolution
            screen_width=800)

    action_dict = dict()
    scores = []
    completions = []
    nb_steps = []
    inference_times = []
    preproc_times = []
    agent_times = []
    step_times = []

    for agent_id in range(n_agents):
        action_dict[agent_id] = 0

    for episode_idx in range(n_eval_episodes):
        images = []
        seed += 1

        inference_timer = Timer()
        preproc_timer = Timer()
        agent_timer = Timer()
        step_timer = Timer()

        step_timer.start()
        obs, info = env.reset(regenerate_rail=True, regenerate_schedule=True, random_seed=seed)
        step_timer.end()

        agent_obs = [None] * env.get_num_agents()
        score = 0.0

        if render:
            env_renderer.set_new_rail()

        final_step = 0
        skipped = 0

        nb_hit = 0
        agent_last_obs = {}
        agent_last_action = {}

        for step in range(max_steps - 1):
            # time.sleep(0.2)
            if allow_skipping and check_if_all_blocked(env):
                # FIXME why -1? bug where all agents are "done" after max_steps!
                skipped = max_steps - step - 1
                final_step = max_steps - 2
                n_unfinished_agents = sum(not done[idx] for idx in env.get_agent_handles())
                score -= skipped * n_unfinished_agents
                break

            agent_timer.start()
            for agent in env.get_agent_handles():
                agent_model = agents[agent]
                if obs[agent] and info['action_required'][agent]:
                    if agent in agent_last_obs and np.all(agent_last_obs[agent] == obs[agent]):
                        nb_hit += 1
                        action = agent_last_action[agent]
                    else:
                        preproc_timer.start()
                        norm_obs = normalize_observation(obs[agent],
                                                         tree_depth=observation_tree_depth,
                                                         observation_radius=observation_radius)
                        preproc_timer.end()

                        inference_timer.start()
                        action = act(agent_model, norm_obs)
                        inference_timer.end()

                    action_dict.update({agent: action})

                    if allow_caching:
                        agent_last_obs[agent] = obs[agent]
                        agent_last_action[agent] = action
            agent_timer.end()

            step_timer.start()
            obs, all_rewards, done, info = env.step(action_dict)
            step_timer.end()

            if render:
                env_renderer.render_env(show=True, frames=False,
                                        show_observations=False, show_predictions=False)
                im = env_renderer.get_image()
                im = PIL.Image.fromarray(im)
                images.append(im)

                if step % 100 == 0:
                    print("{}/{}".format(step, max_steps - 1))

            for agent in env.get_agent_handles():
                score += all_rewards[agent]

            final_step = step

            if done['__all__']:
                break

        if render:
            for _ in range(10):
                images.append(images[len(images) - 1])

            # save video
            images[0].save(
                f'/Users/nikhilvs/repos/nyu/flatland-reinforcement-learning/videos/maac-final/out_{episode_idx}.gif',
                save_all=True,
                append_images=images[1:],
                optimize=False,
                duration=60,
                loop=0)

        normalized_score = score / (max_steps * env.get_num_agents())
        scores.append(normalized_score)

        tasks_finished = sum(done[idx] for idx in env.get_agent_handles())
        completion = tasks_finished / max(1, env.get_num_agents())
        completions.append(completion)

        nb_steps.append(final_step)

        inference_times.append(inference_timer.get())
        preproc_times.append(preproc_timer.get())
        agent_times.append(agent_timer.get())
        step_times.append(step_timer.get())

        skipped_text = ""
        if skipped > 0:
            skipped_text = "\t⚡ Skipped {}".format(skipped)

        hit_text = ""
        if nb_hit > 0:
            hit_text = "\t⚡ Hit {} ({:.1f}%)".format(nb_hit, (100 * nb_hit) / (n_agents * final_step))

        print(
            "☑️ Score: {:.3f} \tDone: {:.1f}% \tNb steps: {:.3f} "
            "\t🍭 Seed: {}"
            "\t🚉 Env: {:.3f}s "
            "\t🤖 Agent: {:.3f}s (per step: {:.3f}s) \t[preproc: {:.3f}s \tinfer: {:.3f}s]"
            "{}{}".format(normalized_score, completion * 100.0, final_step, seed,
                          step_timer.get(), agent_timer.get(),
                          agent_timer.get() / final_step, preproc_timer.get(),
                          inference_timer.get(), skipped_text, hit_text))

    return scores, completions, nb_steps, agent_times, step_times
def main(argv):
    try:
        opts, args = getopt.getopt(argv, "n:", ["n_trials="])
    except getopt.GetoptError:
        print('test_navigation_single_agent.py -n <n_trials>')
        sys.exit(2)
    for opt, arg in opts:
        if opt in ('-n', '--n_trials'):
            n_trials = int(arg)

    random.seed(1)
    np.random.seed(1)

    ######## TEST SET SELECTION - PARAMETERS ########
    test_multi_agent_setup = 1          # 1 for Medium size test, 2 for Big size test
    test_n_agents = 5                   # Number of agents to test (3 - 5 - 7 for Medium, 5 - 7 - 10 for Big)
    test_malfunctions_enabled = True    # Malfunctions enabled?
    test_agents_one_speed = True        # Test agents with the same speed (1) or with 4 different speeds?
    #################################################

    # Medium size
    if test_multi_agent_setup == 1:
        x_dim = 16 * 3
        y_dim = 9 * 3
        max_num_cities = 5
        max_rails_between_cities = 2
        max_rails_in_city = 3

    # Big size
    if test_multi_agent_setup == 2:
        x_dim = 16 * 4
        y_dim = 9 * 4
        max_num_cities = 9
        max_rails_between_cities = 5
        max_rails_in_city = 5

    stochastic_data = {'malfunction_rate': 80,  # Rate of malfunction occurrence of single agent
                       'min_duration': 15,      # Minimal duration of malfunction
                       'max_duration': 50       # Max duration of malfunction
                       }

    # Custom observation builder
    tree_depth = 2
    TreeObservation = TreeObsForRailEnv(max_depth=tree_depth,
                                        predictor=ShortestPathPredictorForRailEnv(20))

    np.savetxt(fname=path.join('NetsTest', 'info.txt'),
               X=[x_dim, y_dim, test_n_agents, max_num_cities, max_rails_between_cities,
                  max_rails_in_city, tree_depth],
               delimiter=';')

    # Different agent types (trains) with different speeds.
    if test_agents_one_speed:
        speed_ration_map = {1.: 1.,        # Fast passenger train
                            1. / 2.: 0.0,  # Fast freight train
                            1. / 3.: 0.0,  # Slow commuter train
                            1. / 4.: 0.0}  # Slow freight train
    else:
        speed_ration_map = {1.: 0.25,       # Fast passenger train
                            1. / 2.: 0.25,  # Fast freight train
                            1. / 3.: 0.25,  # Slow commuter train
                            1. / 4.: 0.25}  # Slow freight train

    if test_malfunctions_enabled:
        env = RailEnv(width=x_dim, height=y_dim,
                      rail_generator=sparse_rail_generator(
                          max_num_cities=max_num_cities,  # Number of cities in map (where train stations are)
                          seed=14,  # Random seed
                          grid_mode=False,
                          max_rails_between_cities=max_rails_between_cities,
                          max_rails_in_city=max_rails_in_city),
                      schedule_generator=sparse_schedule_generator(speed_ration_map),
                      malfunction_generator_and_process_data=malfunction_from_params(stochastic_data),
                      number_of_agents=test_n_agents,
                      obs_builder_object=TreeObservation)
    else:
        env = RailEnv(width=x_dim, height=y_dim,
                      rail_generator=sparse_rail_generator(
                          max_num_cities=max_num_cities,
                          seed=14,
                          grid_mode=False,
                          max_rails_between_cities=max_rails_between_cities,
                          max_rails_in_city=max_rails_in_city),
                      schedule_generator=sparse_schedule_generator(speed_ration_map),
                      number_of_agents=test_n_agents,
                      obs_builder_object=TreeObservation)
    env.reset()

    # env_renderer = RenderTool(env, gl="PILSVG")
    env_renderer = RenderTool(env, gl="PILSVG",
                              agent_render_variant=AgentRenderVariant.AGENT_SHOWS_OPTIONS_AND_BOX,
                              show_debug=False,
                              screen_height=int(1080 * 0.8),  # Adjust these parameters to fit your resolution
                              screen_width=int(1920 * 0.8))

    num_features_per_node = env.obs_builder.observation_dim
    nr_nodes = 0
    for i in range(tree_depth + 1):
        nr_nodes += np.power(4, i)
    state_size = num_features_per_node * nr_nodes
    action_size = 5

    # We set the number of episodes we would like to train on
    if 'n_trials' not in locals():
        n_trials = 15000

    # max_steps computation
    speed_weighted_mean = 0
    for key in speed_ration_map.keys():
        speed_weighted_mean += key * speed_ration_map[key]
    # max_steps = int(3 * (env.height + env.width))
    max_steps = int((1 / speed_weighted_mean) * 3 * (env.height + env.width))

    # eps = 1.
    # eps_end = 0.005
    # eps_decay = 0.9995

    # And some variables to keep track of the performance
    action_dict = dict()
    final_action_dict = dict()
    action_prob_list = []
    scores_window = deque(maxlen=100)
    done_window = deque(maxlen=100)
    scores = []
    scores_list = []
    deadlock_list = []
    dones_list_window = []
    dones_list = []
    action_prob = [0] * action_size
    agent_obs = [None] * env.get_num_agents()
    agent_next_obs = [None] * env.get_num_agents()  # Useless

    agent = Agent(state_size, action_size)

    # LOAD MODEL WEIGHTS TO TEST
    agent.qnetwork_local.load_state_dict(
        torch.load(path.join('NetsTest', 'navigator_checkpoint3800_multi10_deadlock_global10.pth')))

    record_images = False
    frame_step = 0

    for trials in range(1, n_trials + 1):
        # Reset environment
        obs, info = env.reset()  # (True, True)
        env_renderer.reset()

        # Build agent-specific observations
        for a in range(env.get_num_agents()):
            agent_obs[a] = normalize_observation(obs[a], tree_depth, observation_radius=10)

        # Reset score and done
        score = 0
        env_done = 0

        # Run episode
        for step in range(max_steps):
            # Action
            for a in range(env.get_num_agents()):
                if info['action_required'][a]:
                    action = agent.act(agent_obs[a], eps=0.)
                    action_prob[action] += 1
                else:
                    action = 0
                action_dict.update({a: action})

            # Environment step (this custom env also returns deadlock information)
            obs, all_rewards, done, deadlocks, info = env.step(action_dict)

            env_renderer.render_env(show=True, show_predictions=True, show_observations=False)

            # Build agent-specific observations and normalize
            for a in range(env.get_num_agents()):
                if obs[a]:
                    agent_obs[a] = normalize_observation(obs[a], tree_depth, observation_radius=10)
                score += all_rewards[a] / env.get_num_agents()

            if done['__all__']:
                break

        # Collect information about training
        tasks_finished = 0
        for _idx in range(env.get_num_agents()):
            if done[_idx] == 1:
                tasks_finished += 1
        done_window.append(tasks_finished / max(1, env.get_num_agents()))
        scores_window.append(score / max_steps)  # save most recent score
        scores.append(np.mean(scores_window))
        dones_list.append(tasks_finished / max(1, env.get_num_agents()))
        dones_list_window.append(np.mean(done_window))
        scores_list.append(score / max_steps)
        deadlock_list.append(deadlocks.count(1) / max(1, env.get_num_agents()))

        if np.sum(action_prob) == 0:
            action_prob_normalized = [0] * action_size
        else:
            action_prob_normalized = action_prob / np.sum(action_prob)

        print(
            '\rTesting {} Agents on ({},{}).\t Episode {}\t Score: {:.3f}\tDones: {:.2f}%\tDeadlocks: {:.2f}\t Action Probabilities: \t {}'
            .format(env.get_num_agents(), x_dim, y_dim, trials,
                    score / max_steps,
                    100 * tasks_finished / max(1, env.get_num_agents()),
                    deadlocks.count(1) / max(1, env.get_num_agents()),
                    action_prob_normalized), end=" ")

        # if trials % 100 == 0:
        action_prob_list.append(action_prob_normalized)
        action_prob = [0] * action_size

        if trials % 50 == 0:
            np.savetxt(fname=path.join('NetsTest', 'test_metrics.csv'),
                       X=np.transpose(np.asarray([scores_list, scores, dones_list,
                                                  dones_list_window, deadlock_list])),
                       delimiter=';', newline='\n')
            np.savetxt(fname=path.join('NetsTest', 'test_action_prob.csv'),
                       X=np.asarray(action_prob_list),
                       delimiter=';', newline='\n')
def train_agent(env_params, train_params):
    # Environment parameters
    n_agents = env_params.n_agents
    x_dim = env_params.x_dim
    y_dim = env_params.y_dim
    n_cities = env_params.n_cities
    max_rails_between_cities = env_params.max_rails_between_cities
    max_rails_in_city = env_params.max_rails_in_city
    seed = env_params.seed

    # Observation parameters
    observation_tree_depth = env_params.observation_tree_depth
    observation_radius = env_params.observation_radius
    observation_max_path_depth = env_params.observation_max_path_depth

    # Training parameters
    eps_start = train_params.eps_start
    eps_end = train_params.eps_end
    eps_decay = train_params.eps_decay
    n_episodes = train_params.n_episodes
    checkpoint_interval = train_params.checkpoint_interval
    n_eval_episodes = train_params.n_evaluation_episodes

    # Set the seeds
    random.seed(seed)
    np.random.seed(seed)

    # Break agents from time to time
    malfunction_parameters = MalfunctionParameters(
        malfunction_rate=1. / 10000,  # Rate of malfunctions
        min_duration=15,  # Minimal duration
        max_duration=50  # Max duration
    )

    # Observation builder
    predictor = ShortestPathPredictorForRailEnv(observation_max_path_depth)
    tree_observation = TreeObsForRailEnv(max_depth=observation_tree_depth, predictor=predictor)

    # Fraction of trains with each speed
    speed_profiles = {
        1.: 1.0,       # Fast passenger train
        1. / 2.: 0.0,  # Fast freight train
        1. / 3.: 0.0,  # Slow commuter train
        1. / 4.: 0.0   # Slow freight train
    }

    # Setup the environment
    env = RailEnv(
        width=x_dim,
        height=y_dim,
        rail_generator=sparse_rail_generator(
            max_num_cities=n_cities,
            grid_mode=False,
            max_rails_between_cities=max_rails_between_cities,
            max_rails_in_city=max_rails_in_city
        ),
        schedule_generator=sparse_schedule_generator(speed_profiles),
        number_of_agents=n_agents,
        malfunction_generator_and_process_data=malfunction_from_params(malfunction_parameters),
        obs_builder_object=tree_observation,
        random_seed=seed
    )
    env.reset(regenerate_schedule=True, regenerate_rail=True)

    # Setup renderer
    if train_params.render:
        env_renderer = RenderTool(env, gl="PGL")

    # Calculate the state size given the depth of the tree observation and the number of features
    n_features_per_node = env.obs_builder.observation_dim
    n_nodes = 0
    for i in range(observation_tree_depth + 1):
        n_nodes += np.power(4, i)
    state_size = n_features_per_node * n_nodes

    # The action space of flatland is 5 discrete actions
    action_size = 5

    # Max number of steps per episode
    # This is the official formula used during evaluations
    # See details in flatland.envs.schedule_generators.sparse_schedule_generator
    max_steps = int(4 * 2 * (env.height + env.width + (n_agents / n_cities)))

    action_count = [0] * action_size
    action_dict = dict()
    agent_obs = [None] * env.get_num_agents()
    agent_prev_obs = [None] * env.get_num_agents()
    agent_prev_action = [2] * env.get_num_agents()
    update_values = False

    smoothed_normalized_score = -1.0
    smoothed_eval_normalized_score = -1.0
    smoothed_completion = 0.0
    smoothed_eval_completion = 0.0

    # Double Dueling DQN policy
    policy = DDDQNPolicy(state_size, action_size, train_params)

    # TensorBoard writer
    writer = SummaryWriter()
    writer.add_hparams(vars(train_params), {})
    writer.add_hparams(vars(env_params), {})

    training_timer = Timer()
    training_timer.start()

    print("\n🚉 Training {} trains on {}x{} grid for {} episodes, evaluating on {} episodes every {} episodes.\n"
          .format(env.get_num_agents(), x_dim, y_dim, n_episodes, n_eval_episodes, checkpoint_interval))

    for episode_idx in range(n_episodes + 1):
        # Timers
        step_timer = Timer()
        reset_timer = Timer()
        learn_timer = Timer()
        preproc_timer = Timer()

        # Reset environment
        reset_timer.start()
        obs, info = env.reset(regenerate_rail=True, regenerate_schedule=True)
        reset_timer.end()

        if train_params.render:
            env_renderer.set_new_rail()

        score = 0
        nb_steps = 0
        actions_taken = []

        # Build agent-specific observations
        for agent in env.get_agent_handles():
            if obs[agent]:
                agent_obs[agent] = normalize_observation(obs[agent], observation_tree_depth,
                                                         observation_radius=observation_radius)
                agent_prev_obs[agent] = agent_obs[agent].copy()

        # Run episode
        for step in range(max_steps - 1):
            for agent in env.get_agent_handles():
                if info['action_required'][agent]:
                    # If an action is required, we want to store the obs at that step
                    # as well as the action
                    update_values = True
                    action = policy.act(agent_obs[agent], eps=eps_start)
                    action_count[action] += 1
                    actions_taken.append(action)
                else:
                    update_values = False
                    action = 0
                action_dict.update({agent: action})

            # Environment step
            step_timer.start()
            next_obs, all_rewards, done, info = env.step(action_dict)
            step_timer.end()

            if train_params.render and episode_idx % checkpoint_interval == 0:
                env_renderer.render_env(
                    show=True,
                    frames=False,
                    show_observations=False,
                    show_predictions=False
                )

            for agent in range(env.get_num_agents()):
                # Update replay buffer and train agent
                # Only update the values when we are done or when an action was taken
                # and thus relevant information is present
                if update_values or done[agent]:
                    learn_timer.start()
                    policy.step(agent_prev_obs[agent], agent_prev_action[agent],
                                all_rewards[agent], agent_obs[agent], done[agent])
                    learn_timer.end()

                    agent_prev_obs[agent] = agent_obs[agent].copy()
                    agent_prev_action[agent] = action_dict[agent]

                # Preprocess the new observations
                if next_obs[agent]:
                    preproc_timer.start()
                    agent_obs[agent] = normalize_observation(next_obs[agent], observation_tree_depth,
                                                             observation_radius=observation_radius)
                    preproc_timer.end()

                score += all_rewards[agent]

            nb_steps = step

            if done['__all__']:
                break

        # Epsilon decay
        eps_start = max(eps_end, eps_decay * eps_start)

        # Collect information about training
        tasks_finished = sum(done[idx] for idx in env.get_agent_handles())
        completion = tasks_finished / max(1, env.get_num_agents())
        normalized_score = score / (max_steps * env.get_num_agents())
        action_probs = action_count / np.sum(action_count)
        action_count = [1] * action_size

        # Smoothed values for terminal display and for more stable hyper-parameter tuning
        smoothing = 0.99
        smoothed_normalized_score = smoothed_normalized_score * smoothing + normalized_score * (1.0 - smoothing)
        smoothed_completion = smoothed_completion * smoothing + completion * (1.0 - smoothing)

        # Print logs
        if episode_idx % checkpoint_interval == 0:
            torch.save(policy.qnetwork_local, './checkpoints/multi-' + str(episode_idx) + '.pth')
            if train_params.render:
                env_renderer.close_window()

        print(
            '\r🚂 Episode {}'
            '\t 🏆 Score: {:.3f}'
            ' Avg: {:.3f}'
            '\t 💯 Done: {:.2f}%'
            ' Avg: {:.2f}%'
            '\t 🎲 Epsilon: {:.2f} '
            '\t 🔀 Action Probs: {}'.format(
                episode_idx,
                normalized_score,
                smoothed_normalized_score,
                100 * completion,
                100 * smoothed_completion,
                eps_start,
                format_action_prob(action_probs)
            ), end=" ")

        # Evaluate policy
        if episode_idx % train_params.checkpoint_interval == 0:
            scores, completions, nb_steps_eval = eval_policy(env, policy, n_eval_episodes, max_steps)
            writer.add_scalar("evaluation/scores_min", np.min(scores), episode_idx)
            writer.add_scalar("evaluation/scores_max", np.max(scores), episode_idx)
            writer.add_scalar("evaluation/scores_mean", np.mean(scores), episode_idx)
            writer.add_scalar("evaluation/scores_std", np.std(scores), episode_idx)
            writer.add_histogram("evaluation/scores", np.array(scores), episode_idx)
            writer.add_scalar("evaluation/completions_min", np.min(completions), episode_idx)
            writer.add_scalar("evaluation/completions_max", np.max(completions), episode_idx)
            writer.add_scalar("evaluation/completions_mean", np.mean(completions), episode_idx)
            writer.add_scalar("evaluation/completions_std", np.std(completions), episode_idx)
            writer.add_histogram("evaluation/completions", np.array(completions), episode_idx)
            writer.add_scalar("evaluation/nb_steps_min", np.min(nb_steps_eval), episode_idx)
            writer.add_scalar("evaluation/nb_steps_max", np.max(nb_steps_eval), episode_idx)
            writer.add_scalar("evaluation/nb_steps_mean", np.mean(nb_steps_eval), episode_idx)
            writer.add_scalar("evaluation/nb_steps_std", np.std(nb_steps_eval), episode_idx)
            writer.add_histogram("evaluation/nb_steps", np.array(nb_steps_eval), episode_idx)

            smoothing = 0.9
            smoothed_eval_normalized_score = smoothed_eval_normalized_score * smoothing + np.mean(scores) * (1.0 - smoothing)
            smoothed_eval_completion = smoothed_eval_completion * smoothing + np.mean(completions) * (1.0 - smoothing)
            writer.add_scalar("evaluation/smoothed_score", smoothed_eval_normalized_score, episode_idx)
            writer.add_scalar("evaluation/smoothed_completion", smoothed_eval_completion, episode_idx)

        # Save logs to TensorBoard
        writer.add_scalar("training/score", normalized_score, episode_idx)
        writer.add_scalar("training/smoothed_score", smoothed_normalized_score, episode_idx)
        writer.add_scalar("training/completion", np.mean(completion), episode_idx)
        writer.add_scalar("training/smoothed_completion", np.mean(smoothed_completion), episode_idx)
        writer.add_scalar("training/nb_steps", nb_steps, episode_idx)
        writer.add_histogram("actions/distribution", np.array(actions_taken), episode_idx)
        writer.add_scalar("actions/nothing", action_probs[RailEnvActions.DO_NOTHING], episode_idx)
        writer.add_scalar("actions/left", action_probs[RailEnvActions.MOVE_LEFT], episode_idx)
        writer.add_scalar("actions/forward", action_probs[RailEnvActions.MOVE_FORWARD], episode_idx)
        writer.add_scalar("actions/right", action_probs[RailEnvActions.MOVE_RIGHT], episode_idx)
        writer.add_scalar("actions/stop", action_probs[RailEnvActions.STOP_MOVING], episode_idx)
        writer.add_scalar("training/epsilon", eps_start, episode_idx)
        writer.add_scalar("training/buffer_size", len(policy.memory), episode_idx)
        writer.add_scalar("training/loss", policy.loss, episode_idx)
        writer.add_scalar("timer/reset", reset_timer.get(), episode_idx)
        writer.add_scalar("timer/step", step_timer.get(), episode_idx)
        writer.add_scalar("timer/learn", learn_timer.get(), episode_idx)
        writer.add_scalar("timer/preproc", preproc_timer.get(), episode_idx)
        writer.add_scalar("timer/total", training_timer.get_current(), episode_idx)
agent_next_obs = [None] * env.get_num_agents()
agent = Agent(state_size, action_size)
with path(torch_training.Nets, "navigator_checkpoint1000.pth") as file_in:
    agent.qnetwork_local.load_state_dict(torch.load(file_in))

record_images = False
frame_step = 0

for trials in range(1, n_trials + 1):
    # Reset environment
    obs, info = env.reset(True, True)
    env_renderer.reset()

    # Build agent-specific observations
    for a in range(env.get_num_agents()):
        agent_obs[a] = normalize_observation(obs[a], tree_depth, observation_radius=10)

    # Reset score and done
    score = 0
    env_done = 0

    # Run episode
    for step in range(max_steps):
        # Action
        for a in range(env.get_num_agents()):
            if info['action_required'][a]:
                action = agent.act(agent_obs[a], eps=0.)
            else:
                action = 0
def train_agent(n_episodes):
    # Environment parameters
    n_agents = 1
    x_dim = 25
    y_dim = 25
    n_cities = 4
    max_rails_between_cities = 2
    max_rails_in_city = 3
    seed = 42

    # Observation parameters
    observation_tree_depth = 2
    observation_radius = 10

    # Exploration parameters
    eps_start = 1.0
    eps_end = 0.01
    eps_decay = 0.997  # for 2500ts

    # Set the seeds
    random.seed(seed)
    np.random.seed(seed)

    # Observation builder
    tree_observation = TreeObsForRailEnv(max_depth=observation_tree_depth)

    # Setup the environment
    env = RailEnv(width=x_dim, height=y_dim,
                  rail_generator=sparse_rail_generator(
                      max_num_cities=n_cities,
                      seed=seed,
                      grid_mode=False,
                      max_rails_between_cities=max_rails_between_cities,
                      max_rails_in_city=max_rails_in_city),
                  schedule_generator=sparse_schedule_generator(),
                  number_of_agents=n_agents,
                  obs_builder_object=tree_observation)
    env.reset(True, True)

    # Calculate the state size given the depth of the tree observation and the number of features
    n_features_per_node = env.obs_builder.observation_dim
    n_nodes = 0
    for i in range(observation_tree_depth + 1):
        n_nodes += np.power(4, i)
    state_size = n_features_per_node * n_nodes

    # The action space of flatland is 5 discrete actions
    action_size = 5

    # Max number of steps per episode
    # This is the official formula used during evaluations
    max_steps = int(4 * 2 * (env.height + env.width + (n_agents / n_cities)))

    action_dict = dict()

    # And some variables to keep track of the progress
    scores_window = deque(maxlen=100)  # todo smooth when rendering instead
    completion_window = deque(maxlen=100)
    scores = []
    completion = []
    action_count = [0] * action_size
    agent_obs = [None] * env.get_num_agents()
    agent_prev_obs = [None] * env.get_num_agents()
    agent_prev_action = [2] * env.get_num_agents()
    update_values = False

    # Training parameters
    training_parameters = {
        'buffer_size': int(1e5),
        'batch_size': 32,
        'update_every': 8,
        'learning_rate': 0.5e-4,
        'tau': 1e-3,
        'gamma': 0.99,
        'buffer_min_size': 0,
        'hidden_size': 256,
        'use_gpu': False
    }

    # Double Dueling DQN policy
    policy = DDDQNPolicy(state_size, action_size, Namespace(**training_parameters))

    for episode_idx in range(n_episodes):
        score = 0

        # Reset environment
        obs, info = env.reset(regenerate_rail=True, regenerate_schedule=True)

        # Build agent-specific observations
        for agent in env.get_agent_handles():
            if obs[agent]:
                agent_obs[agent] = normalize_observation(obs[agent], observation_tree_depth,
                                                         observation_radius=observation_radius)
                agent_prev_obs[agent] = agent_obs[agent].copy()

        # Run episode
        for step in range(max_steps - 1):
            for agent in env.get_agent_handles():
                if info['action_required'][agent]:
                    # If an action is required, we want to store the obs at that step
                    # as well as the action
                    update_values = True
                    action = policy.act(agent, agent_obs[agent], eps=eps_start)
                    action_count[action] += 1
                else:
                    update_values = False
                    action = 0
                action_dict.update({agent: action})

            # Environment step
            next_obs, all_rewards, done, info = env.step(action_dict)

            # Update replay buffer and train agent
            for agent in range(env.get_num_agents()):
                # Only update the values when we are done or when an action was taken
                # and thus relevant information is present
                if update_values or done[agent]:
                    policy.step(agent, agent_prev_obs[agent], agent_prev_action[agent],
                                all_rewards[agent], agent_obs[agent], done[agent])

                    agent_prev_obs[agent] = agent_obs[agent].copy()
                    agent_prev_action[agent] = action_dict[agent]

                if next_obs[agent]:
                    agent_obs[agent] = normalize_observation(next_obs[agent], observation_tree_depth,
                                                             observation_radius=observation_radius)

                score += all_rewards[agent]

            if done['__all__']:
                break

        # Epsilon decay
        eps_start = max(eps_end, eps_decay * eps_start)

        # Collect information about training
        tasks_finished = np.sum([int(done[idx]) for idx in env.get_agent_handles()])
        completion_window.append(tasks_finished / max(1, env.get_num_agents()))
        scores_window.append(score / (max_steps * env.get_num_agents()))
        completion.append(np.mean(completion_window))
        scores.append(np.mean(scores_window))
        action_probs = action_count / np.sum(action_count)

        if episode_idx % 100 == 0:
            end = "\n"
            torch.save(policy.qnetwork_local, './checkpoints/single-' + str(episode_idx) + '.pth')
            action_count = [1] * action_size
        else:
            end = " "

        print(
            '\rTraining {} agents on {}x{}\t Episode {}\t Average Score: {:.3f}\tDones: {:.2f}%\tEpsilon: {:.2f} \t Action Probabilities: \t {}'
            .format(env.get_num_agents(), x_dim, y_dim,
                    episode_idx, np.mean(scores_window),
                    100 * np.mean(completion_window), eps_start,
                    action_probs), end=end)

    # Plot overall training progress at the end
    plt.plot(scores)
    plt.show()
    plt.plot(completion)
    plt.show()
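# Entry-point sketch for the single-agent trainer above (the episode count is
# an assumption; the original script's CLI wiring is not shown here):

if __name__ == "__main__":
    train_agent(n_episodes=500)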
no_ops_mode = False

if not check_if_all_blocked(env=local_env):
    time_start = time.time()
    action_dict = {}
    for agent in range(nb_agents):
        if observation[agent] and info['action_required'][agent]:
            if agent in agent_last_obs and np.all(agent_last_obs[agent] == observation[agent]):
                # cache hit
                action = agent_last_action[agent]
                nb_hit += 1
            else:
                # otherwise, run normalization and inference
                norm_obs = normalize_observation(observation[agent],
                                                 tree_depth=observation_tree_depth,
                                                 observation_radius=observation_radius)
                action = policy.act(norm_obs, eps=0.0)

            action_dict[agent] = action

            if USE_ACTION_CACHE:
                agent_last_obs[agent] = observation[agent]
                agent_last_action[agent] = action
    agent_time = time.time() - time_start
    time_taken_by_controller.append(agent_time)

    time_start = time.time()
    _, all_rewards, done, info = remote_client.env_step(action_dict)
    step_time = time.time() - time_start
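# The cache-hit branch above exploits the fact that an unchanged raw tree
# observation maps to the same action under a deterministic policy (eps=0.0).
# A condensed sketch of the same pattern as a standalone helper; the names are
# assumptions and the hit counter is omitted for brevity:

def cached_act(policy, agent, raw_obs, last_obs, last_action, tree_depth, radius):
    # Reuse the previous action when the raw observation is unchanged,
    # otherwise normalize and run inference again.
    if agent in last_obs and np.all(last_obs[agent] == raw_obs):
        return last_action[agent]
    norm = normalize_observation(raw_obs, tree_depth=tree_depth, observation_radius=radius)
    return policy.act(norm, eps=0.0)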
def main(argv):
    try:
        opts, args = getopt.getopt(argv, "n:", ["n_trials="])
    except getopt.GetoptError:
        print('test_navigation_single_agent.py -n <n_trials>')
        sys.exit(2)
    for opt, arg in opts:
        if opt in ('-n', '--n_trials'):
            n_trials = int(arg)

    random.seed(1)
    np.random.seed(1)

    # Parameters for the Environment
    x_dim = 35
    y_dim = 35
    n_agents = 1
    max_num_cities = 3
    max_rails_between_cities = 2
    max_rails_in_city = 3

    # We are training an Agent using the Tree Observation with depth 2
    # observation_builder = TreeObsForRailEnv(max_depth=2, predictor=ShortestPathPredictorForRailEnv(20))

    # Use the malfunction generator to break agents from time to time
    stochastic_data = {
        'malfunction_rate': 80,  # Rate of malfunction occurrence of single agent
        'min_duration': 15,      # Minimal duration of malfunction
        'max_duration': 50       # Max duration of malfunction
    }

    # Custom observation builder
    tree_depth = 2
    TreeObservation = TreeObsForRailEnv(max_depth=tree_depth,
                                        predictor=ShortestPathPredictorForRailEnv(20))

    np.savetxt(fname=path.join('NetsTest', 'info.txt'),
               X=[x_dim, y_dim, n_agents, max_num_cities, max_rails_between_cities,
                  max_rails_in_city, tree_depth],
               delimiter=';')

    # Different agent types (trains) with different speeds.
    speed_ration_map = {
        1.: 1.,        # Fast passenger train
        1. / 2.: 0.0,  # Fast freight train
        1. / 3.: 0.0,  # Slow commuter train
        1. / 4.: 0.0   # Slow freight train
    }

    env = RailEnv(
        width=x_dim,
        height=y_dim,
        rail_generator=sparse_rail_generator(
            max_num_cities=max_num_cities,  # Number of cities in map (where train stations are)
            seed=14,  # Random seed
            grid_mode=False,
            max_rails_between_cities=max_rails_between_cities,
            max_rails_in_city=max_rails_in_city),
        schedule_generator=sparse_schedule_generator(speed_ration_map),
        number_of_agents=n_agents,
        obs_builder_object=TreeObservation)
    env.reset()

    env_renderer = RenderTool(env, gl="PILSVG")
    # env_renderer = RenderTool(env, gl="PILSVG",
    #                           agent_render_variant=AgentRenderVariant.AGENT_SHOWS_OPTIONS_AND_BOX,
    #                           show_debug=False,
    #                           screen_height=int(1080 * 0.8),  # Adjust these parameters to fit your resolution
    #                           screen_width=int(1920 * 0.8))

    num_features_per_node = env.obs_builder.observation_dim
    nr_nodes = 0
    for i in range(tree_depth + 1):
        nr_nodes += np.power(4, i)
    state_size = num_features_per_node * nr_nodes
    action_size = 5

    # We set the number of episodes we would like to train on
    if 'n_trials' not in locals():
        n_trials = 15000
    max_steps = int(3 * (env.height + env.width))
    eps = 1.
    eps_end = 0.005
    eps_decay = 0.9995

    # And some variables to keep track of the performance
    action_dict = dict()
    final_action_dict = dict()
    action_prob_list = []
    scores_window = deque(maxlen=100)
    done_window = deque(maxlen=100)
    scores = []
    scores_list = []
    dones_list_window = []
    dones_list = []
    action_prob = [0] * action_size
    agent_obs = [None] * env.get_num_agents()
    agent_next_obs = [None] * env.get_num_agents()  # Useless

    agent = RandomAgent(state_size, action_size)
    # agent.qnetwork_local.load_state_dict(torch.load(path.join('NetsTest', 'navigator_checkpoint_glob10.pth')))
    # agent.qnetwork_local.load_state_dict(torch.load(path.join('NetsLoad', 'navigator_checkpoint2100.pth')))

    record_images = False
    frame_step = 0

    for trials in range(1, n_trials + 1):
        # Reset environment
        obs, info = env.reset()  # (True, True)
        env_renderer.reset()

        # Build agent-specific observations
        for a in range(env.get_num_agents()):
            agent_obs[a] = normalize_observation(obs[a], tree_depth, observation_radius=10)

        # Reset score and done
        score = 0
        env_done = 0

        # Run episode
        for step in range(max_steps):
            # Action
            for a in range(env.get_num_agents()):
                if info['action_required'][a]:
                    action = agent.act(agent_obs[a])  # , eps=0.)
                    action_prob[action] += 1
                else:
                    action = 0
                action_dict.update({a: action})

            # Environment step (this custom env also returns deadlock information)
            obs, all_rewards, done, deadlocks, info = env.step(action_dict)
            # env_renderer.render_env(show=True, show_predictions=True, show_observations=False)

            # Build agent-specific observations and normalize
            for a in range(env.get_num_agents()):
                if obs[a]:
                    agent_obs[a] = normalize_observation(obs[a], tree_depth, observation_radius=10)
                score += all_rewards[a] / env.get_num_agents()

            if done['__all__']:
                break

        # Collect information about training
        tasks_finished = 0
        for _idx in range(env.get_num_agents()):
            if done[_idx] == 1:
                tasks_finished += 1
        done_window.append(tasks_finished / max(1, env.get_num_agents()))
        scores_window.append(score / max_steps)  # save most recent score
        scores.append(np.mean(scores_window))
        dones_list.append(tasks_finished / max(1, env.get_num_agents()))
        dones_list_window.append(np.mean(done_window))
        scores_list.append(score / max_steps)
        action_prob_list.append(action_prob / np.sum(action_prob))

        print(
            '\rTesting {} Agents on ({},{}).\t Episode {}\t Score: {:.3f}\tDones: {:.2f}%\t Action Probabilities: \t {}'
            .format(env.get_num_agents(), x_dim, y_dim, trials,
                    score / max_steps,
                    100 * tasks_finished / max(1, env.get_num_agents()),
                    action_prob / np.sum(action_prob)), end=" ")

        if trials % 100 == 0:
            action_prob = [1] * action_size

        if trials % 50 == 0:
            # np.savetxt(fname=path.join('Nets', 'scores_metric.txt'), X=scores)
            # np.savetxt(fname=path.join('Nets', 'dones_metric.txt'), X=dones_list)
            np.savetxt(fname=path.join('NetsTest', 'test_metrics.csv'),
                       X=np.transpose(np.asarray([scores_list, scores, dones_list, dones_list_window])),
                       delimiter=';', newline='\n')
            np.savetxt(fname=path.join('NetsTest', 'test_action_prob.csv'),
                       X=np.asarray(action_prob_list),
                       delimiter=';', newline='\n')
def main(argv):
    try:
        opts, args = getopt.getopt(argv, "n:", ["n_trials="])
    except getopt.GetoptError:
        print('training_navigation.py -n <n_trials>')
        sys.exit(2)
    for opt, arg in opts:
        if opt in ('-n', '--n_trials'):
            n_trials = int(arg)

    random.seed(1)
    np.random.seed(1)

    training = True
    weights = False

    # Setting these 2 parameters to True can slow down the network for training
    visuals = False
    sleep_for_animation = False

    trained_epochs = 1  # set to trained epoch number if resuming training or for validation, else value is 1

    if training:
        train_set = True
    else:
        train_set = False
        weights = True

    dataset_type = "MANUAL"  # set as MANUAL if we want to specify our own parameters, AUTO otherwise

    # Parameters for the Environment
    x_dim = 25
    y_dim = 25
    n_agents = 4

    # Use the malfunction generator to break agents from time to time
    stochastic_data = {
        'prop_malfunction': 0.05,  # Percentage of defective agents
        'malfunction_rate': 100,   # Rate of malfunction occurrence
        'min_duration': 20,        # Minimal duration of malfunction
        'max_duration': 50         # Max duration of malfunction
    }

    # Custom observation builder
    tree_observation = TreeObsForRailEnv(max_depth=2,
                                         predictor=ShortestPathPredictorForRailEnv(30))

    # Different agent types (trains) with different speeds.
    speed_ration_map = {
        1.: 0.25,       # Fast passenger train
        1. / 2.: 0.25,  # Fast freight train
        1. / 3.: 0.25,  # Slow commuter train
        1. / 4.: 0.25   # Slow freight train
    }

    if dataset_type == "MANUAL":
        env, random_seed = train_validate_env_generator_params(
            train_set, n_agents, x_dim, y_dim, tree_observation,
            stochastic_data, speed_ration_map)
    else:
        env, random_seed = train_validate_env_generator(train_set, tree_observation)

    # Given the depth of the tree observation and the number of features per node,
    # we get the following state_size
    num_features_per_node = env.obs_builder.observation_dim
    tree_depth = 2
    nr_nodes = 0
    for i in range(tree_depth + 1):
        nr_nodes += np.power(4, i)
    state_size = num_features_per_node * nr_nodes
    # state_size = state_size + n_agents * 12
    # state_size = num_features_per_node

    # The action space of flatland is 5 discrete actions
    action_size = 5

    # We set the number of episodes we would like to train on
    if 'n_trials' not in locals():
        n_trials = 15000

    # And the max number of steps we want to take per episode
    max_steps = int(4 * 2 * (20 + env.height + env.width))

    columns = ['Agents', 'X_DIM', 'Y_DIM', 'TRIAL_NO',
               'SCORE', 'DONE_RATIO', 'STEPS', 'ACTION_PROB']
    df_all_results = pd.DataFrame(columns=columns)

    # Define training parameters
    eps = 1.
    eps_end = 0.005
    eps_decay = 0.998

    # And some variables to keep track of the progress
    action_dict = dict()
    final_action_dict = dict()
    scores_window = deque(maxlen=100)
    done_window = deque(maxlen=100)
    scores = []
    dones_list = []
    action_prob = [0] * action_size

    # Now we load a Double dueling DQN agent
    agent = Agent(state_size, action_size)

    trial_start = 1
    if weights:
        trial_start = trained_epochs
        eps = max(eps_end, (np.power(eps_decay, trial_start)) * eps)
        weight_file = f"navigator_checkpoint{trial_start}.pth"
        with open("./Nets/" + weight_file, "rb") as file_in:
            agent.qnetwork_local.load_state_dict(torch.load(file_in))

    for trials in range(trial_start, n_trials + 1):
        if dataset_type == "MANUAL":
            env, random_seed = train_validate_env_generator_params(
                train_set, n_agents, x_dim, y_dim, tree_observation,
                stochastic_data, speed_ration_map)
        else:
            env, random_seed = train_validate_env_generator(train_set, tree_observation)

        # Reset environment
        obs, info = env.reset(regenerate_rail=True, regenerate_schedule=True,
                              activate_agents=False, random_seed=random_seed)
        env_renderer = RenderTool(env, gl="PILSVG")
        if visuals:
            env_renderer.render_env(show=True, frames=True, show_observations=True)

        x_dim, y_dim, n_agents = env.width, env.height, env.get_num_agents()
        agent_obs = [None] * n_agents
        agent_next_obs = [None] * n_agents
        agent_obs_buffer = [None] * n_agents
        agent_action_buffer = [2] * n_agents
        cummulated_reward = np.zeros(n_agents)
        update_values = [False] * n_agents

        # Build agent-specific observations
        for a in range(n_agents):
            # if obs[a] is not None:
            #     agent_obs[a] = obs[a]
            if obs[a]:
                agent_obs[a] = normalize_observation(obs[a], tree_depth, observation_radius=10)
                agent_obs_buffer[a] = agent_obs[a].copy()

        # Reset score and done
        score = 0
        env_done = 0
        step = 0

        # Run episode
        while True:
            # Action
            for a in range(n_agents):
                if info['action_required'][a]:
                    # If an action is required, we want to store the obs at that step
                    # as well as the action
                    update_values[a] = True
                    action = agent.act(agent_obs[a], eps=eps)
                    action_prob[action] += 1
                else:
                    update_values[a] = False
                    action = 0
                action_dict.update({a: action})

            # Environment step
            next_obs, all_rewards, done, info = env.step(action_dict)
            step += 1

            if visuals:
                env_renderer.render_env(show=True, frames=True, show_observations=True)
                if sleep_for_animation:
                    time.sleep(0.5)

            for a in range(n_agents):
                # Only update the values when we are done or when an action was taken
                # and thus relevant information is present
                if update_values[a] or done[a]:
                    agent.step(agent_obs_buffer[a], agent_action_buffer[a],
                               all_rewards[a], agent_obs[a], done[a])
                    cummulated_reward[a] = 0.
                    agent_obs_buffer[a] = agent_obs[a].copy()
                    agent_action_buffer[a] = action_dict[a]
                if next_obs[a]:
                    agent_obs[a] = normalize_observation(next_obs[a], tree_depth, observation_radius=10)
                # if next_obs[a] is not None:
                #     agent_obs[a] = next_obs[a]

                score += all_rewards[a] / n_agents

            # Copy observation
            if done['__all__']:
                env_done = 1
                break

        # Epsilon decay
        eps = max(eps_end, eps_decay * eps)  # decrease epsilon

        # Collect information about training
        tasks_finished = 0
        for current_agent in env.agents:
            if current_agent.status == RailAgentStatus.DONE_REMOVED:
                tasks_finished += 1
        done_window.append(tasks_finished / max(1, env.get_num_agents()))
        scores_window.append(score / max_steps)  # save most recent score
        scores.append(np.mean(scores_window))
        dones_list.append(np.mean(done_window))

        print(
            '\rTraining {} Agents on ({},{}).\t Episode {}\t Average Score: {:.3f}\tDones: {:.2f}%\tEpsilon: {:.2f} \t Action Probabilities: \t {}'
            .format(env.get_num_agents(), x_dim, y_dim, trials,
                    np.mean(scores_window), 100 * np.mean(done_window),
                    eps, action_prob / np.sum(action_prob)), end=" ")

        data = [[n_agents, x_dim, y_dim, trials,
                 np.mean(scores_window), 100 * np.mean(done_window),
                 step, action_prob / np.sum(action_prob)]]
        df_cur = pd.DataFrame(data, columns=columns)
        df_all_results = pd.concat([df_all_results, df_cur])
        df_all_results.to_csv(
            f'{dataset_type}_DQN_TrainingResults_{n_agents}_{x_dim}_{y_dim}.csv',
            index=False)

        if trials % 100 == 0:
            print(
                '\rTraining {} Agents on ({},{}).\t Episode {}\t Average Score: {:.3f}\tDones: {:.2f}%\tEpsilon: {:.2f} \t Action Probabilities: \t {}'
                .format(env.get_num_agents(), x_dim, y_dim, trials,
                        np.mean(scores_window), 100 * np.mean(done_window),
                        eps, action_prob / np.sum(action_prob)))
            torch.save(agent.qnetwork_local.state_dict(),
                       './Nets/navigator_checkpoint' + str(trials) + '.pth')
            action_prob = [1] * action_size

    # Plot overall training progress at the end
    plt.plot(scores)
    plt.show()
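# Standard getopt entry-point wiring for the trainer above (a sketch; the
# original script's __main__ block is not shown in this excerpt):

if __name__ == '__main__':
    main(sys.argv[1:])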
def train_agent(train_params, train_env_params, eval_env_params, obs_params):
    # Environment parameters
    n_agents = train_env_params.n_agents
    x_dim = train_env_params.x_dim
    y_dim = train_env_params.y_dim
    n_cities = train_env_params.n_cities
    max_rails_between_cities = train_env_params.max_rails_between_cities
    max_rails_in_city = train_env_params.max_rails_in_city
    seed = train_env_params.seed

    # Unique ID for this training run
    now = datetime.now()
    training_id = now.strftime('%y%m%d%H%M%S')

    # Observation parameters
    observation_tree_depth = obs_params.observation_tree_depth
    observation_radius = obs_params.observation_radius
    observation_max_path_depth = obs_params.observation_max_path_depth

    # Training parameters
    eps_start = train_params.eps_start
    eps_end = train_params.eps_end
    eps_decay = train_params.eps_decay
    n_episodes = train_params.n_episodes
    checkpoint_interval = train_params.checkpoint_interval
    n_eval_episodes = train_params.n_evaluation_episodes
    restore_replay_buffer = train_params.restore_replay_buffer
    save_replay_buffer = train_params.save_replay_buffer

    # Set the seeds
    random.seed(seed)
    np.random.seed(seed)

    # Observation builder
    predictor = ShortestPathPredictorForRailEnv(observation_max_path_depth)
    tree_observation = TreeObsForRailEnv(max_depth=observation_tree_depth, predictor=predictor)

    # Set up the environments
    train_env = create_rail_env(train_env_params, tree_observation)
    train_env.reset(regenerate_schedule=True, regenerate_rail=True)
    eval_env = create_rail_env(eval_env_params, tree_observation)
    eval_env.reset(regenerate_schedule=True, regenerate_rail=True)

    # Set up the renderer
    if train_params.render:
        env_renderer = RenderTool(train_env, gl="PGL")

    # Calculate the state size given the depth of the tree observation and the number of features
    n_features_per_node = train_env.obs_builder.observation_dim
    n_nodes = sum([np.power(4, i) for i in range(observation_tree_depth + 1)])
    state_size = n_features_per_node * n_nodes

    # The action space of flatland is 5 discrete actions
    action_size = 5

    # Max number of steps per episode
    # This is the official formula used during evaluations
    # See details in flatland.envs.schedule_generators.sparse_schedule_generator
    # max_steps = int(4 * 2 * (env.height + env.width + (n_agents / n_cities)))
    max_steps = train_env._max_episode_steps

    action_count = [0] * action_size
    action_dict = dict()
    agent_obs = [None] * n_agents
    agent_prev_obs = [None] * n_agents
    agent_prev_action = [2] * n_agents
    update_values = [False] * n_agents

    # Smoothed values used as targets for hyperparameter tuning
    smoothed_normalized_score = -1.0
    smoothed_eval_normalized_score = -1.0
    smoothed_completion = 0.0
    smoothed_eval_completion = 0.0

    # Double Dueling DQN policy
    policy = DDDQNPolicy(state_size, action_size, train_params)

    # Load an existing replay buffer
    if restore_replay_buffer:
        try:
            policy.load_replay_buffer(restore_replay_buffer)
            policy.test()
        except RuntimeError as e:
            print("\n🛑 Couldn't load replay buffer, were the experiences generated using the same tree depth?")
            print(e)
            exit(1)

    print("\n💾 Replay buffer status: {}/{} experiences".format(len(policy.memory.memory), train_params.buffer_size))

    hdd = psutil.disk_usage('/')
    if save_replay_buffer and (hdd.free / (2 ** 30)) < 500.0:
        print("⚠️ Careful! Saving replay buffers will quickly consume a lot of disk space. You have {:.2f}gb left.".format(hdd.free / (2 ** 30)))

    # TensorBoard writer
    writer = SummaryWriter()
    writer.add_hparams(vars(train_params), {})
    writer.add_hparams(vars(train_env_params), {})
    writer.add_hparams(vars(obs_params), {})

    training_timer = Timer()
    training_timer.start()

    print("\n🚉 Training {} trains on {}x{} grid for {} episodes, evaluating on {} episodes every {} episodes. Training id '{}'.\n".format(
        train_env.get_num_agents(), x_dim, y_dim,
        n_episodes, n_eval_episodes, checkpoint_interval,
        training_id))

    for episode_idx in range(n_episodes + 1):
        step_timer = Timer()
        reset_timer = Timer()
        learn_timer = Timer()
        preproc_timer = Timer()
        inference_timer = Timer()

        # Reset environment
        reset_timer.start()
        obs, info = train_env.reset(regenerate_rail=True, regenerate_schedule=True)
        reset_timer.end()

        if train_params.render:
            env_renderer.set_new_rail()

        score = 0
        nb_steps = 0
        actions_taken = []

        # Build initial agent-specific observations
        for agent in train_env.get_agent_handles():
            if obs[agent]:
                agent_obs[agent] = normalize_observation(obs[agent], observation_tree_depth, observation_radius=observation_radius)
                agent_prev_obs[agent] = agent_obs[agent].copy()

        # Run episode
        for step in range(max_steps - 1):
            inference_timer.start()
            for agent in train_env.get_agent_handles():
                if info['action_required'][agent]:
                    update_values[agent] = True
                    action = policy.act(agent_obs[agent], eps=eps_start)
                    action_count[action] += 1
                    actions_taken.append(action)
                else:
                    # An action is not required if the train hasn't joined the railway network,
                    # if it already reached its target, or if it is currently malfunctioning.
                    update_values[agent] = False
                    action = 0
                action_dict.update({agent: action})
            inference_timer.end()

            # Environment step
            step_timer.start()
            next_obs, all_rewards, done, info = train_env.step(action_dict)
            step_timer.end()

            # Render an episode at some interval
            if train_params.render and episode_idx % checkpoint_interval == 0:
                env_renderer.render_env(
                    show=True,
                    frames=False,
                    show_observations=False,
                    show_predictions=False
                )

            # Update replay buffer and train agent
            for agent in train_env.get_agent_handles():
                if update_values[agent] or done['__all__']:
                    # Only learn from timesteps where something happened
                    learn_timer.start()
                    policy.step(agent_prev_obs[agent], agent_prev_action[agent], all_rewards[agent], agent_obs[agent], done[agent])
                    learn_timer.end()

                    agent_prev_obs[agent] = agent_obs[agent].copy()
                    agent_prev_action[agent] = action_dict[agent]

                # Preprocess the new observations
                if next_obs[agent]:
                    preproc_timer.start()
                    agent_obs[agent] = normalize_observation(next_obs[agent], observation_tree_depth, observation_radius=observation_radius)
                    preproc_timer.end()

                score += all_rewards[agent]

            nb_steps = step

            if done['__all__']:
                break

        # Epsilon decay
        eps_start = max(eps_end, eps_decay * eps_start)

        # Collect information about training
        tasks_finished = sum(done[idx] for idx in train_env.get_agent_handles())
        completion = tasks_finished / max(1, train_env.get_num_agents())
        normalized_score = score / (max_steps * train_env.get_num_agents())
        action_probs = action_count / np.sum(action_count)
        action_count = [1] * action_size

        smoothing = 0.99
        smoothed_normalized_score = smoothed_normalized_score * smoothing + normalized_score * (1.0 - smoothing)
        smoothed_completion = smoothed_completion * smoothing + completion * (1.0 - smoothing)

        # Save checkpoints and print logs at some interval
        if episode_idx % checkpoint_interval == 0:
            torch.save(policy.qnetwork_local, './checkpoints/' + training_id + '-' + str(episode_idx) + '.pth')
            if save_replay_buffer:
                policy.save_replay_buffer('./replay_buffers/' + training_id + '-' + str(episode_idx) + '.pkl')
            if train_params.render:
                env_renderer.close_window()

        print(
            '\r🚂 Episode {}'
            '\t 🏆 Score: {:.3f}'
            ' Avg: {:.3f}'
            '\t 💯 Done: {:.2f}%'
            ' Avg: {:.2f}%'
            '\t 🎲 Epsilon: {:.3f} '
            '\t 🔀 Action Probs: {}'.format(
                episode_idx,
                normalized_score, smoothed_normalized_score,
                100 * completion, 100 * smoothed_completion,
                eps_start,
                format_action_prob(action_probs)
            ), end=" ")

        # Evaluate policy and log results at some interval
        if episode_idx % checkpoint_interval == 0 and n_eval_episodes > 0:
            scores, completions, nb_steps_eval = eval_policy(eval_env, policy, train_params, obs_params)

            writer.add_scalar("evaluation/scores_min", np.min(scores), episode_idx)
            writer.add_scalar("evaluation/scores_max", np.max(scores), episode_idx)
            writer.add_scalar("evaluation/scores_mean", np.mean(scores), episode_idx)
            writer.add_scalar("evaluation/scores_std", np.std(scores), episode_idx)
            writer.add_histogram("evaluation/scores", np.array(scores), episode_idx)
            writer.add_scalar("evaluation/completions_min", np.min(completions), episode_idx)
            writer.add_scalar("evaluation/completions_max", np.max(completions), episode_idx)
            writer.add_scalar("evaluation/completions_mean", np.mean(completions), episode_idx)
            writer.add_scalar("evaluation/completions_std", np.std(completions), episode_idx)
            writer.add_histogram("evaluation/completions", np.array(completions), episode_idx)
            writer.add_scalar("evaluation/nb_steps_min", np.min(nb_steps_eval), episode_idx)
            writer.add_scalar("evaluation/nb_steps_max", np.max(nb_steps_eval), episode_idx)
            writer.add_scalar("evaluation/nb_steps_mean", np.mean(nb_steps_eval), episode_idx)
            writer.add_scalar("evaluation/nb_steps_std", np.std(nb_steps_eval), episode_idx)
            writer.add_histogram("evaluation/nb_steps", np.array(nb_steps_eval), episode_idx)

            smoothing = 0.9
            smoothed_eval_normalized_score = smoothed_eval_normalized_score * smoothing + np.mean(scores) * (1.0 - smoothing)
            smoothed_eval_completion = smoothed_eval_completion * smoothing + np.mean(completions) * (1.0 - smoothing)
            writer.add_scalar("evaluation/smoothed_score", smoothed_eval_normalized_score, episode_idx)
            writer.add_scalar("evaluation/smoothed_completion", smoothed_eval_completion, episode_idx)

        # Save logs to TensorBoard
        writer.add_scalar("training/score", normalized_score, episode_idx)
        writer.add_scalar("training/smoothed_score", smoothed_normalized_score, episode_idx)
        writer.add_scalar("training/completion", np.mean(completion), episode_idx)
        writer.add_scalar("training/smoothed_completion", np.mean(smoothed_completion), episode_idx)
        writer.add_scalar("training/nb_steps", nb_steps, episode_idx)
        writer.add_histogram("actions/distribution", np.array(actions_taken), episode_idx)
        writer.add_scalar("actions/nothing", action_probs[RailEnvActions.DO_NOTHING], episode_idx)
        writer.add_scalar("actions/left", action_probs[RailEnvActions.MOVE_LEFT], episode_idx)
        writer.add_scalar("actions/forward", action_probs[RailEnvActions.MOVE_FORWARD], episode_idx)
        writer.add_scalar("actions/right", action_probs[RailEnvActions.MOVE_RIGHT], episode_idx)
        writer.add_scalar("actions/stop", action_probs[RailEnvActions.STOP_MOVING], episode_idx)
        writer.add_scalar("training/epsilon", eps_start, episode_idx)
        writer.add_scalar("training/buffer_size", len(policy.memory), episode_idx)
        writer.add_scalar("training/loss", policy.loss, episode_idx)
        writer.add_scalar("timer/reset", reset_timer.get(), episode_idx)
        writer.add_scalar("timer/step", step_timer.get(), episode_idx)
        writer.add_scalar("timer/learn", learn_timer.get(), episode_idx)
        writer.add_scalar("timer/preproc", preproc_timer.get(), episode_idx)
        writer.add_scalar("timer/total", training_timer.get_current(), episode_idx)
def main(argv):
    try:
        opts, args = getopt.getopt(argv, "n:", ["n_trials="])
    except getopt.GetoptError:
        print('training_navigation.py -n <n_trials>')
        sys.exit(2)
    for opt, arg in opts:
        if opt in ('-n', '--n_trials'):
            n_trials = int(arg)

    random.seed(1)
    np.random.seed(1)

    #### Choose the desired setup ####
    multi_agent_setup = 1
    malfunctions_enabled = False
    agents_one_speed = True
    ##################################

    # Single agent (1)
    if multi_agent_setup == 1:
        x_dim = 35
        y_dim = 35
        n_agents = 1
        max_num_cities = 3
        max_rails_between_cities = 2
        max_rails_in_city = 3

    # Multi agent (3)
    if multi_agent_setup == 3:
        x_dim = 40
        y_dim = 40
        n_agents = 3
        max_num_cities = 4
        max_rails_between_cities = 2
        max_rails_in_city = 3

    # Multi agent (5)
    if multi_agent_setup == 5:
        x_dim = 16 * 3
        y_dim = 9 * 3
        n_agents = 5
        max_num_cities = 5
        max_rails_between_cities = 2
        max_rails_in_city = 3

    # Multi agent (10)
    if multi_agent_setup == 10:
        x_dim = 16 * 4
        y_dim = 9 * 4
        n_agents = 10
        max_num_cities = 9
        max_rails_between_cities = 5
        max_rails_in_city = 5

    # Use the malfunction generator to break agents from time to time
    stochastic_data = {
        'malfunction_rate': 80,  # Rate of malfunction occurrence for a single agent
        'min_duration': 15,      # Minimal duration of malfunction
        'max_duration': 50       # Max duration of malfunction
    }

    # Custom observation builder
    tree_depth = 2
    TreeObservation = TreeObsForRailEnv(
        max_depth=tree_depth,
        predictor=ShortestPathPredictorForRailEnv(20))

    np.savetxt(fname=path.join('Nets', 'info.txt'),
               X=[
                   x_dim, y_dim, n_agents, max_num_cities,
                   max_rails_between_cities, max_rails_in_city, tree_depth
               ],
               delimiter=';')

    # Different agent types (trains) with different speeds.
    if agents_one_speed:
        speed_ration_map = {
            1.: 1.,        # Fast passenger train
            1. / 2.: 0.0,  # Fast freight train
            1. / 3.: 0.0,  # Slow commuter train
            1. / 4.: 0.0   # Slow freight train
        }
    else:
        speed_ration_map = {
            1.: 0.25,       # Fast passenger train
            1. / 2.: 0.25,  # Fast freight train
            1. / 3.: 0.25,  # Slow commuter train
            1. / 4.: 0.25   # Slow freight train
        }

    if malfunctions_enabled:
        env = RailEnv(
            width=x_dim,
            height=y_dim,
            rail_generator=sparse_rail_generator(
                max_num_cities=max_num_cities,  # Number of cities in map (where train stations are)
                seed=14,  # Random seed
                grid_mode=False,
                max_rails_between_cities=max_rails_between_cities,
                max_rails_in_city=max_rails_in_city),
            schedule_generator=sparse_schedule_generator(speed_ration_map),
            malfunction_generator_and_process_data=malfunction_from_params(stochastic_data),
            number_of_agents=n_agents,
            obs_builder_object=TreeObservation)
    else:
        env = RailEnv(
            width=x_dim,
            height=y_dim,
            rail_generator=sparse_rail_generator(
                max_num_cities=max_num_cities,  # Number of cities in map (where train stations are)
                seed=14,  # Random seed
                grid_mode=False,
                max_rails_between_cities=max_rails_between_cities,
                max_rails_in_city=max_rails_in_city),
            schedule_generator=sparse_schedule_generator(speed_ration_map),
            number_of_agents=n_agents,
            obs_builder_object=TreeObservation)
    env.reset(True, True)

    # After training we want to render the results, so we also load a renderer
    env_renderer = RenderTool(
        env,
        gl="PILSVG",
        screen_height=800,  # Adjust these parameters to fit your resolution
        screen_width=900)

    # Given the depth of the tree observation and the number of features per node we get the following state_size
    num_features_per_node = env.obs_builder.observation_dim
    nr_nodes = 0
    for i in range(tree_depth + 1):
        nr_nodes += np.power(4, i)
    state_size = num_features_per_node * nr_nodes

    # The action space of flatland is 5 discrete actions
    action_size = 5

    # We set the number of episodes we would like to train on
    if 'n_trials' not in locals():
        n_trials = 15000

    # And the max number of steps we want to take per episode
    max_steps = int(3 * (env.height + env.width))

    # Define training parameters
    eps = 1.
    eps_end = 0.005
    eps_decay = 0.998

    # And some variables to keep track of the progress
    action_dict = dict()
    final_action_dict = dict()
    scores_window = deque(maxlen=100)
    done_window = deque(maxlen=100)
    deadlock_window = deque(maxlen=100)
    deadlock_average = []
    scores = []
    dones_list = []

    # Metrics
    eps_list = []
    action_prob_list = []
    action_prob = [0] * action_size

    agent_obs = [None] * env.get_num_agents()
    agent_next_obs = [None] * env.get_num_agents()
    agent_obs_buffer = [None] * env.get_num_agents()
    agent_action_buffer = [2] * env.get_num_agents()
    cummulated_reward = np.zeros(env.get_num_agents())
    update_values = False

    # Now we load a Double Dueling DQN agent
    agent = Agent(state_size, action_size)

    for trials in range(1, n_trials + 1):
        # print(torch.cuda.current_device())

        # Reset environment
        obs, info = env.reset(True, True)
        # env_renderer.reset()

        # Build agent-specific observations
        for a in range(env.get_num_agents()):
            if obs[a]:
                agent_obs[a] = normalize_observation(obs[a], tree_depth, observation_radius=10)
                agent_obs_buffer[a] = agent_obs[a].copy()

        # Reset score and done
        score = 0
        env_done = 0

        # Run episode
        for step in range(max_steps):
            # Action
            for a in range(env.get_num_agents()):
                if info['action_required'][a]:
                    # If an action is required, we want to store the obs at that step as well as the action
                    update_values = True
                    action = agent.act(agent_obs[a], eps=eps)
                    action_prob[action] += 1
                else:
                    update_values = False
                    action = 0
                action_dict.update({a: action})

            # Environment step
            next_obs, all_rewards, done, deadlocks, info = env.step(action_dict)
            # env_renderer.render_env(show=True, show_predictions=True, show_observations=True)

            # Update replay buffer and train agent
            for a in range(env.get_num_agents()):
                # Only update the values when we are done or when an action was taken
                # and thus relevant information is present
                if update_values or done[a]:
                    agent.step(agent_obs_buffer[a], agent_action_buffer[a],
                               all_rewards[a], agent_obs[a], done[a])
                    cummulated_reward[a] = 0.
                    agent_obs_buffer[a] = agent_obs[a].copy()
                    agent_action_buffer[a] = action_dict[a]
                if next_obs[a]:
                    agent_obs[a] = normalize_observation(next_obs[a], tree_depth, observation_radius=10)
                score += all_rewards[a] / env.get_num_agents()

            # Copy observation
            if done['__all__']:
                env_done = 1
                break

        # Epsilon decay
        eps = max(eps_end, eps_decay * eps)  # decrease epsilon

        # Collect information about training
        tasks_finished = 0
        for _idx in range(env.get_num_agents()):
            if done[_idx] == 1:
                tasks_finished += 1
        done_window.append(tasks_finished / max(1, env.get_num_agents()))
        scores_window.append(score / max_steps)  # save most recent score
        scores.append(np.mean(scores_window))
        deadlock_window.append(deadlocks.count(1) / max(1, env.get_num_agents()))
        deadlock_average.append(np.mean(deadlock_window))
        dones_list.append(np.mean(done_window))
        eps_list.append(eps)
        action_prob_list.append(action_prob / np.sum(action_prob))

        print(
            '\rTraining {} Agents on ({},{}).\t Episode {}\t Average Score: {:.3f}\tDones: {:.2f} %\tDeadlocks: {:.2f} \tEpsilon: {:.2f} \t Action Probabilities: \t {}'
            .format(env.get_num_agents(), x_dim, y_dim, trials,
                    np.mean(scores_window), 100 * np.mean(done_window),
                    np.mean(deadlock_window), eps,
                    action_prob / np.sum(action_prob)), end=" ")

        if trials % 100 == 0:
            print(
                '\rTraining {} Agents on ({},{}).\t Episode {}\t Average Score: {:.3f}\tDones: {:.2f}%\tEpsilon: {:.2f} \t Action Probabilities: \t {}'
                .format(env.get_num_agents(), x_dim, y_dim, trials,
                        np.mean(scores_window), 100 * np.mean(done_window), eps,
                        action_prob / np.sum(action_prob)))
            torch.save(
                agent.qnetwork_local.state_dict(),
                path.join('Nets', ('navigator_checkpoint' + str(trials) + '.pth')))
            action_prob = [1] * action_size

        if trials % 50 == 0:
            np.savetxt(fname=path.join('Nets', 'metrics.csv'),
                       X=np.transpose(
                           np.asarray([scores, dones_list, deadlock_average, eps_list])),
                       delimiter=';', newline='\n')
            np.savetxt(fname=path.join('Nets', 'action_prob.csv'),
                       X=np.asarray(action_prob_list),
                       delimiter=';', newline='\n')

    # Plot overall training progress at the end
    plt.plot(scores)
    plt.show()
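
# The progress metrics above (scores_window, done_window, deadlock_window) are all
# 100-episode moving averages built from deque(maxlen=100): appending the newest
# per-episode value silently evicts the oldest one, so np.mean(window) is always the
# mean of the most recent <=100 episodes. A tiny illustration of the pattern with a
# window of 3 and made-up values:
from collections import deque
import numpy as np

window = deque(maxlen=3)
for value in [0.0, 0.5, 1.0, 1.0]:
    window.append(value)
    print(list(window), np.mean(window))
# [0.0] 0.0
# [0.0, 0.5] 0.25
# [0.0, 0.5, 1.0] 0.5
# [0.5, 1.0, 1.0] 0.833...   (the 0.0 has been evicted)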
    sheet1.write(x + 3, 4, done[3])
    sheet1.write(x + 4, 0, str(obs[4]))
    sheet1.write(x + 4, 1, actionlist[4])
    sheet1.write(x + 4, 2, all_rewards[4])
    sheet1.write(x + 4, 3, str(next_obs[4]))
    sheet1.write(x + 4, 4, done[4])
    x = x + 5"""
    # The block above is the tail of a triple-quoted (commented-out) spreadsheet-logging section.

    var = 0
    # print(len(env.get_agent_handles()))
    while var < len(env.get_agent_handles()):
        varlist.append(var)
        obslist.append(str(obs[var]))
        if obs[var]:
            normalizeobslist.append(
                normalize_observation(
                    obs[var],
                    observation_tree_depth,
                    observation_radius=observation_radius))
        else:
            normalizeobslist.append("None")
        action_list.append(actionlist[var])
        reward_list.append(all_rewards[var])
        next_statelist.append(str(next_obs[var]))
        if next_obs[var]:
            # Bug fix: normalize the *next* observation here, not the current one.
            next_statenormalized.append(
                normalize_observation(
                    next_obs[var],
                    observation_tree_depth,
                    observation_radius=observation_radius))
        else:
            next_statenormalized.append("None")
        donelist.append(done[var])
def eval_policy(env_params, checkpoint, n_eval_episodes, max_steps, seed, render):
    # Evaluation is faster on CPU, except if you have huge networks
    parameters = {'use_gpu': False}

    policy = DDDQNPolicy(state_size, action_size, Namespace(**parameters), evaluation_mode=True)
    policy.qnetwork_local = torch.load(checkpoint)

    env_params = Namespace(**env_params)

    # Environment parameters
    n_agents = env_params.n_agents
    x_dim = env_params.x_dim
    y_dim = env_params.y_dim
    n_cities = env_params.n_cities
    max_rails_between_cities = env_params.max_rails_between_cities
    max_rails_in_city = env_params.max_rails_in_city

    # Observation parameters
    observation_tree_depth = env_params.observation_tree_depth
    observation_radius = env_params.observation_radius
    observation_max_path_depth = env_params.observation_max_path_depth

    # Malfunction and speed profiles
    # TODO pass these parameters properly from main!
    malfunction_parameters = MalfunctionParameters(
        malfunction_rate=1. / 2000,  # Rate of malfunctions
        min_duration=20,  # Minimal duration
        max_duration=50  # Max duration
    )

    speed_profiles = {
        1.: 1.0,       # Fast passenger train
        1. / 2.: 0.0,  # Fast freight train
        1. / 3.: 0.0,  # Slow commuter train
        1. / 4.: 0.0   # Slow freight train
    }

    # Observation builder
    predictor = ShortestPathPredictorForRailEnv(observation_max_path_depth)
    tree_observation = TreeObsForRailEnv(max_depth=observation_tree_depth, predictor=predictor)

    # Set up the environment
    env = RailEnv(
        width=x_dim, height=y_dim,
        rail_generator=sparse_rail_generator(
            max_num_cities=n_cities,
            grid_mode=False,
            max_rails_between_cities=max_rails_between_cities,
            max_rails_in_city=max_rails_in_city),
        schedule_generator=sparse_schedule_generator(speed_profiles),
        number_of_agents=n_agents,
        malfunction_generator_and_process_data=malfunction_from_params(malfunction_parameters),
        obs_builder_object=tree_observation,
        random_seed=seed)
    env.reset(True, True)

    if render:
        env_renderer = RenderTool(env, gl="PGL")

    action_dict = dict()
    scores = []
    completions = []
    nb_steps = []
    inference_times = []
    preproc_times = []
    agent_times = []
    step_times = []

    for episode_idx in range(n_eval_episodes):
        inference_timer = Timer()
        preproc_timer = Timer()
        agent_timer = Timer()
        step_timer = Timer()

        agent_obs = [None] * env.get_num_agents()
        score = 0.0

        step_timer.start()
        obs, info = env.reset(regenerate_rail=True, regenerate_schedule=True)
        step_timer.end()

        if render:
            env_renderer.set_new_rail()

        final_step = 0

        for step in range(max_steps - 1):
            agent_timer.start()
            for agent in env.get_agent_handles():
                if obs[agent]:
                    preproc_timer.start()
                    agent_obs[agent] = normalize_observation(
                        obs[agent],
                        tree_depth=observation_tree_depth,
                        observation_radius=observation_radius)
                    preproc_timer.end()

                action = 0
                if info['action_required'][agent]:
                    inference_timer.start()
                    action = policy.act(agent_obs[agent], eps=0.0)
                    inference_timer.end()
                action_dict.update({agent: action})
            agent_timer.end()

            step_timer.start()
            obs, all_rewards, done, info = env.step(action_dict)
            step_timer.end()

            if render:
                env_renderer.render_env(show=True, frames=False,
                                        show_observations=False, show_predictions=False)

            for agent in env.get_agent_handles():
                score += all_rewards[agent]

            final_step = step

            if done['__all__']:
                break

        normalized_score = score / (max_steps * env.get_num_agents())
        scores.append(normalized_score)

        tasks_finished = sum(done[idx] for idx in env.get_agent_handles())
        completion = tasks_finished / max(1, env.get_num_agents())
        completions.append(completion)

        nb_steps.append(final_step)
        inference_times.append(inference_timer.get())
        preproc_times.append(preproc_timer.get())
        agent_times.append(agent_timer.get())
        step_times.append(step_timer.get())

        print(
            "☑️ Score: {:.3f} \tDone: {:.1f}% \tNb steps: {:.3f} "
            "\t🚉 Env: {:.3f}s "
            "\t🤖 Agent: {:.3f}s (per step: {:.3f}s) \t[preproc: {:.3f}s \tinfer: {:.3f}s]"
            .format(normalized_score, completion * 100.0, final_step,
                    step_timer.get(), agent_timer.get(),
                    agent_timer.get() / max(1, final_step),  # guard against zero-step episodes
                    preproc_timer.get(), inference_timer.get()))

    return scores, completions, nb_steps, agent_times, step_times
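
# Both eval loops report a normalized score: the summed per-agent rewards divided by
# (max_steps * n_agents). Flatland's default reward is roughly a -1 step penalty per
# active agent until it reaches its target, so the normalized score lands near [-1, 0],
# where 0 would mean every agent arrived instantly. A worked example with made-up
# numbers:
max_steps = 480
n_agents = 5
score = -1200.0  # e.g. summed step penalties over one episode
normalized_score = score / (max_steps * n_agents)
print(normalized_score)  # -0.5: agents spent about half the step budget travelling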
def train_agent(seed, n_episodes, timed=False):
    # Observation parameters
    observation_radius = 10

    # Exploration parameters
    eps_start = 1.0
    eps_end = 0.01
    eps_decay = 0.997  # for 2500ts

    # Set up the environment
    env, max_steps, x_dim, y_dim, observation_tree_depth, _ = create_multi_agent_rail_env(seed, timed)
    env.reset(True, True)

    # Calculate the state size given the depth of the tree observation and the number of features
    n_features_per_node = env.obs_builder.observation_dim
    n_nodes = 0
    for i in range(observation_tree_depth + 1):
        n_nodes += np.power(4, i)
    state_size = n_features_per_node * n_nodes

    # The action space of flatland is 5 discrete actions
    action_size = 5

    action_dict = dict()

    # And some variables to keep track of the progress
    scores_window = deque(maxlen=100)  # todo smooth when rendering instead
    completion_window = deque(maxlen=100)
    scores = []
    completion = []
    action_count = [0] * action_size
    agent_obs = [None] * env.get_num_agents()
    agent_prev_obs = [None] * env.get_num_agents()
    agent_prev_action = [2] * env.get_num_agents()
    update_values = False

    # Training parameters
    training_parameters = {
        'buffer_size': int(1e5),
        'batch_size': 32,
        'update_every': 8,
        'learning_rate': 0.5e-4,
        'tau': 1e-3,
        'gamma': 0.99,
        'buffer_min_size': 0,
        'hidden_size': 256,
        'use_gpu': False
    }

    # Double Dueling DQN policy
    policy = DDDQNPolicy(state_size, action_size, Namespace(**training_parameters), seed=(seed + 1))

    for episode_idx in range(n_episodes):
        score = 0

        # Reset environment
        obs, info = env.reset(regenerate_rail=True, regenerate_schedule=True)

        # Build agent-specific observations
        for agent in env.get_agent_handles():
            if obs[agent]:
                agent_obs[agent] = normalize_observation(obs[agent], observation_tree_depth, observation_radius=observation_radius)
                agent_prev_obs[agent] = agent_obs[agent].copy()

        # Run episode
        for _ in range(max_steps - 1):
            for agent in env.get_agent_handles():
                if info['action_required'][agent]:
                    # If an action is required, we want to store the obs at that step as well as the action
                    update_values = True
                    action = policy.act(agent_obs[agent], eps=eps_start)
                    action_count[action] += 1
                else:
                    update_values = False
                    action = 0
                action_dict.update({agent: action})

            # Environment step
            next_obs, all_rewards, done, info = env.step(action_dict)

            # Update replay buffer and train agent
            for agent in range(env.get_num_agents()):
                # Only update the values when we are done or when an action was taken
                # and thus relevant information is present
                if update_values or done[agent]:
                    policy.step(agent_prev_obs[agent], agent_prev_action[agent],
                                all_rewards[agent], agent_obs[agent], done[agent])
                    agent_prev_obs[agent] = agent_obs[agent].copy()
                    agent_prev_action[agent] = action_dict[agent]

                if next_obs[agent]:
                    agent_obs[agent] = normalize_observation(next_obs[agent], observation_tree_depth, observation_radius=observation_radius)

                score += all_rewards[agent]

            if done['__all__']:
                break

        # Epsilon decay
        eps_start = max(eps_end, eps_decay * eps_start)

        # Collect information about training
        tasks_finished = np.sum([int(done[idx]) for idx in env.get_agent_handles()])
        completion_window.append(tasks_finished / max(1, env.get_num_agents()))
        scores_window.append(score / (max_steps * env.get_num_agents()))
        completion.append(np.mean(completion_window))
        scores.append(np.mean(scores_window))
        action_probs = action_count / np.sum(action_count)

        if episode_idx % 100 == 0:
            end = "\n"
            action_count = [1] * action_size
        else:
            end = " "

        print('\rTraining {} agents on {}x{}\t Episode {}\t Average Score: {:.3f}\tDones: {:.2f}%\tEpsilon: {:.2f} \t Action Probabilities: \t {}'.format(
            env.get_num_agents(), x_dim, y_dim,
            episode_idx,
            np.mean(scores_window),
            100 * np.mean(completion_window),
            eps_start,
            action_probs
        ), end=end)

    # Return trained policy
    return policy
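
# The state-size computation repeated throughout this file follows directly from the
# tree observation's shape: a depth-d tree has sum_{i=0..d} 4^i nodes (each node has
# up to four children, one per branching direction), and every node contributes
# observation_dim features. For the common depth-2, 11-features-per-node setup used
# elsewhere in this file, that gives 21 * 11 = 231 network inputs. A small sketch
# (tree_state_size is a hypothetical helper):
import numpy as np

def tree_state_size(tree_depth, n_features_per_node):
    # Nodes in a complete 4-ary tree of the given depth, root included.
    n_nodes = sum(4 ** i for i in range(tree_depth + 1))
    return n_features_per_node * n_nodes

print(tree_state_size(2, 11))  # 231 = (1 + 4 + 16) * 11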
def main(argv):
    try:
        opts, args = getopt.getopt(argv, "n:", ["n_trials="])
    except getopt.GetoptError:
        print('training_navigation.py -n <n_trials>')
        sys.exit(2)
    for opt, arg in opts:
        if opt in ('-n', '--n_trials'):
            n_trials = int(arg)

    random.seed(1)
    np.random.seed(1)

    # Parameters for the environment
    x_dim = 35
    y_dim = 35
    n_agents = 10

    # Use the malfunction generator to break agents from time to time
    stochastic_data = {
        'malfunction_rate': 8000,  # Rate of malfunction occurrence for a single agent
        'min_duration': 15,        # Minimal duration of malfunction
        'max_duration': 50         # Max duration of malfunction
    }

    # Custom observation builder
    TreeObservation = TreeObsForRailEnv(max_depth=2, predictor=ShortestPathPredictorForRailEnv(30))

    # Different agent types (trains) with different speeds.
    speed_ration_map = {
        1.: 0.25,       # Fast passenger train
        1. / 2.: 0.25,  # Fast freight train
        1. / 3.: 0.25,  # Slow commuter train
        1. / 4.: 0.25   # Slow freight train
    }

    env = RailEnv(width=x_dim, height=y_dim,
                  rail_generator=sparse_rail_generator(
                      max_num_cities=3,  # Number of cities in map (where train stations are)
                      seed=1,  # Random seed
                      grid_mode=False,
                      max_rails_between_cities=2,
                      max_rails_in_city=3),
                  schedule_generator=sparse_schedule_generator(speed_ration_map),
                  number_of_agents=n_agents,
                  malfunction_generator_and_process_data=malfunction_from_params(stochastic_data),
                  obs_builder_object=TreeObservation)

    # After training we want to render the results, so we also load a renderer
    env_renderer = RenderTool(env, gl="PILSVG")

    # Given the depth of the tree observation and the number of features per node we get the following state_size
    num_features_per_node = env.obs_builder.observation_dim
    tree_depth = 2
    nr_nodes = 0
    for i in range(tree_depth + 1):
        nr_nodes += np.power(4, i)
    state_size = num_features_per_node * nr_nodes

    # The action space of flatland is 5 discrete actions
    action_size = 5

    # We set the number of episodes we would like to train on
    if 'n_trials' not in locals():
        n_trials = 15000

    # And the max number of steps we want to take per episode
    max_steps = int(4 * 2 * (20 + env.height + env.width))

    # Define training parameters
    eps = 1.
    eps_end = 0.005
    eps_decay = 0.998

    # And some variables to keep track of the progress
    action_dict = dict()
    final_action_dict = dict()
    scores_window = deque(maxlen=100)
    done_window = deque(maxlen=100)
    scores = []
    dones_list = []
    action_prob = [0] * action_size
    agent_obs = [None] * env.get_num_agents()
    agent_next_obs = [None] * env.get_num_agents()
    agent_obs_buffer = [None] * env.get_num_agents()
    agent_action_buffer = [2] * env.get_num_agents()
    cummulated_reward = np.zeros(env.get_num_agents())
    update_values = [False] * env.get_num_agents()

    # Now we load a Double Dueling DQN agent
    agent = Agent(state_size, action_size)

    for trials in range(1, n_trials + 1):
        # Reset environment
        obs, info = env.reset(True, True)
        env_renderer.reset()

        # Build agent-specific observations
        for a in range(env.get_num_agents()):
            if obs[a]:
                agent_obs[a] = normalize_observation(obs[a], tree_depth, observation_radius=10)
                agent_obs_buffer[a] = agent_obs[a].copy()

        # Reset score and done
        score = 0
        env_done = 0

        # Run episode
        while True:
            # Action
            for a in range(env.get_num_agents()):
                if info['action_required'][a]:
                    # If an action is required, we want to store the obs at that step as well as the action
                    update_values[a] = True
                    action = agent.act(agent_obs[a], eps=eps)
                    action_prob[action] += 1
                else:
                    update_values[a] = False
                    action = 0
                action_dict.update({a: action})

            # Environment step
            next_obs, all_rewards, done, info = env.step(action_dict)

            # Update replay buffer and train agent
            for a in range(env.get_num_agents()):
                # Only update the values when we are done or when an action was taken
                # and thus relevant information is present
                if update_values[a] or done[a]:
                    agent.step(agent_obs_buffer[a], agent_action_buffer[a],
                               all_rewards[a], agent_obs[a], done[a])
                    cummulated_reward[a] = 0.
                    agent_obs_buffer[a] = agent_obs[a].copy()
                    agent_action_buffer[a] = action_dict[a]
                if next_obs[a]:
                    agent_obs[a] = normalize_observation(next_obs[a], tree_depth, observation_radius=10)
                score += all_rewards[a] / env.get_num_agents()

            # Copy observation
            if done['__all__']:
                env_done = 1
                break

        # Epsilon decay
        eps = max(eps_end, eps_decay * eps)  # decrease epsilon

        # Collect information about training
        tasks_finished = 0
        for current_agent in env.agents:
            if current_agent.status == RailAgentStatus.DONE_REMOVED:
                tasks_finished += 1
        done_window.append(tasks_finished / max(1, env.get_num_agents()))
        scores_window.append(score / max_steps)  # save most recent score
        scores.append(np.mean(scores_window))
        dones_list.append(np.mean(done_window))

        print(
            '\rTraining {} Agents on ({},{}).\t Episode {}\t Average Score: {:.3f}\tDones: {:.2f}%\tEpsilon: {:.2f} \t Action Probabilities: \t {}'.format(
                env.get_num_agents(), x_dim, y_dim,
                trials,
                np.mean(scores_window), 100 * np.mean(done_window),
                eps, action_prob / np.sum(action_prob)), end=" ")

        if trials % 100 == 0:
            print(
                '\rTraining {} Agents on ({},{}).\t Episode {}\t Average Score: {:.3f}\tDones: {:.2f}%\tEpsilon: {:.2f} \t Action Probabilities: \t {}'.format(
                    env.get_num_agents(), x_dim, y_dim,
                    trials,
                    np.mean(scores_window), 100 * np.mean(done_window),
                    eps, action_prob / np.sum(action_prob)))
            torch.save(agent.qnetwork_local.state_dict(),
                       './Nets/navigator_checkpoint' + str(trials) + '.pth')
            action_prob = [1] * action_size

    # Plot overall training progress at the end
    plt.plot(scores)
    plt.show()
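
# Two of the navigation scripts above count completions via
# RailAgentStatus.DONE_REMOVED rather than the done-dict: an agent that reached its
# target is removed from the grid and stays in that terminal status, so the tally
# survives even after the episode ends. A hedged sketch of the same count
# (count_finished is a hypothetical helper, assuming the Flatland 2.x layout where
# RailAgentStatus lives in flatland.envs.agent_utils):
from flatland.envs.agent_utils import RailAgentStatus

def count_finished(env):
    # Agents removed from the grid after reaching their target.
    return sum(1 for agent in env.agents
               if agent.status == RailAgentStatus.DONE_REMOVED)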
        np.mean(completions) * 100.0))

    return scores, completions, nb_steps


for episode_idx in range(n_episodes + 1):
    obs, info = env.reset(regenerate_rail=True, regenerate_schedule=True)

    score = 0
    nb_steps = 0
    actions_taken = []

    # Build agent-specific observations
    for agent in env.get_agent_handles():
        if obs[agent]:
            agent_obs[agent] = normalize_observation(
                obs[agent], observation_tree_depth,
                observation_radius=observation_radius)
            agent_prev_obs[agent] = agent_obs[agent].copy()

    # Run episode
    for step in range(max_steps - 1):
        for agent in env.get_agent_handles():
            if info['action_required'][agent]:
                # If an action is required, we want to store the obs at that step as well as the action
                update_values = True
                action = policy.act(agent_obs[agent], eps=eps_start)
                action_count[action] += 1
                actions_taken.append(action)
            else:
                update_values = False
                action = 0
def eval_policy(env_params, checkpoint, n_eval_episodes, max_steps, action_size, state_size, seed, render, allow_skipping, allow_caching):
    # Evaluation is faster on CPU (except if you use a really huge policy)
    parameters = {'use_gpu': False}

    policy = DDDQNPolicy(state_size, action_size, Namespace(**parameters), evaluation_mode=True)
    policy.qnetwork_local = torch.load(checkpoint)

    env_params = Namespace(**env_params)

    # Environment parameters
    n_agents = env_params.n_agents
    x_dim = env_params.x_dim
    y_dim = env_params.y_dim
    n_cities = env_params.n_cities
    max_rails_between_cities = env_params.max_rails_between_cities
    max_rails_in_city = env_params.max_rails_in_city

    # Malfunction and speed profiles
    # TODO pass these parameters properly from main!
    malfunction_parameters = MalfunctionParameters(
        malfunction_rate=1. / 2000,  # Rate of malfunctions
        min_duration=20,  # Minimal duration
        max_duration=50  # Max duration
    )

    # Only fast trains in Round 1
    speed_profiles = {
        1.: 1.0,       # Fast passenger train
        1. / 2.: 0.0,  # Fast freight train
        1. / 3.: 0.0,  # Slow commuter train
        1. / 4.: 0.0   # Slow freight train
    }

    # Observation parameters
    observation_tree_depth = env_params.observation_tree_depth
    observation_radius = env_params.observation_radius
    observation_max_path_depth = env_params.observation_max_path_depth

    # Observation builder
    predictor = ShortestPathPredictorForRailEnv(observation_max_path_depth)
    tree_observation = TreeObsForRailEnv(max_depth=observation_tree_depth, predictor=predictor)

    # Set up the environment
    env = RailEnv(
        width=x_dim, height=y_dim,
        rail_generator=sparse_rail_generator(
            max_num_cities=n_cities,
            grid_mode=False,
            max_rails_between_cities=max_rails_between_cities,
            max_rails_in_city=max_rails_in_city,
        ),
        schedule_generator=sparse_schedule_generator(speed_profiles),
        number_of_agents=n_agents,
        malfunction_generator_and_process_data=malfunction_from_params(malfunction_parameters),
        obs_builder_object=tree_observation)

    if render:
        env_renderer = RenderTool(env, gl="PGL")

    action_dict = dict()
    scores = []
    completions = []
    nb_steps = []
    inference_times = []
    preproc_times = []
    agent_times = []
    step_times = []

    for episode_idx in range(n_eval_episodes):
        seed += 1

        inference_timer = Timer()
        preproc_timer = Timer()
        agent_timer = Timer()
        step_timer = Timer()

        step_timer.start()
        obs, info = env.reset(regenerate_rail=True, regenerate_schedule=True, random_seed=seed)
        step_timer.end()

        agent_obs = [None] * env.get_num_agents()
        score = 0.0

        if render:
            env_renderer.set_new_rail()

        final_step = 0
        skipped = 0

        nb_hit = 0
        agent_last_obs = {}
        agent_last_action = {}

        for step in range(max_steps - 1):
            if allow_skipping and check_if_all_blocked(env):
                # FIXME why -1? bug where all agents are "done" after max_steps!
                skipped = max_steps - step - 1
                final_step = max_steps - 2
                n_unfinished_agents = sum(not done[idx] for idx in env.get_agent_handles())
                score -= skipped * n_unfinished_agents
                break

            agent_timer.start()
            for agent in env.get_agent_handles():
                if obs[agent] and info['action_required'][agent]:
                    if agent in agent_last_obs and np.all(agent_last_obs[agent] == obs[agent]):
                        # Observation unchanged since last step: replay the cached action
                        nb_hit += 1
                        action = agent_last_action[agent]
                    else:
                        preproc_timer.start()
                        norm_obs = normalize_observation(
                            obs[agent],
                            tree_depth=observation_tree_depth,
                            observation_radius=observation_radius)
                        preproc_timer.end()

                        inference_timer.start()
                        action = policy.act(norm_obs, eps=0.0)
                        inference_timer.end()

                    action_dict.update({agent: action})

                    if allow_caching:
                        agent_last_obs[agent] = obs[agent]
                        agent_last_action[agent] = action
            agent_timer.end()

            step_timer.start()
            obs, all_rewards, done, info = env.step(action_dict)
            step_timer.end()

            if render:
                env_renderer.render_env(show=True, frames=False,
                                        show_observations=False, show_predictions=False)

                if step % 100 == 0:
                    print("{}/{}".format(step, max_steps - 1))

            for agent in env.get_agent_handles():
                score += all_rewards[agent]

            final_step = step

            if done['__all__']:
                break

        normalized_score = score / (max_steps * env.get_num_agents())
        scores.append(normalized_score)

        tasks_finished = sum(done[idx] for idx in env.get_agent_handles())
        completion = tasks_finished / max(1, env.get_num_agents())
        completions.append(completion)

        nb_steps.append(final_step)
        inference_times.append(inference_timer.get())
        preproc_times.append(preproc_timer.get())
        agent_times.append(agent_timer.get())
        step_times.append(step_timer.get())

        skipped_text = ""
        if skipped > 0:
            skipped_text = "\t⚡ Skipped {}".format(skipped)

        hit_text = ""
        if nb_hit > 0:
            hit_text = "\t⚡ Hit {} ({:.1f}%)".format(nb_hit, (100 * nb_hit) / (n_agents * max(1, final_step)))

        print(
            "☑️ Score: {:.3f} \tDone: {:.1f}% \tNb steps: {:.3f} "
            "\t🍭 Seed: {}"
            "\t🚉 Env: {:.3f}s "
            "\t🤖 Agent: {:.3f}s (per step: {:.3f}s) \t[preproc: {:.3f}s \tinfer: {:.3f}s]"
            "{}{}".format(normalized_score, completion * 100.0, final_step, seed,
                          step_timer.get(), agent_timer.get(),
                          agent_timer.get() / max(1, final_step),  # guard against zero-step episodes
                          preproc_timer.get(), inference_timer.get(),
                          skipped_text, hit_text))

    return scores, completions, nb_steps, agent_times, step_times
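
# allow_caching above trades a cheap observation comparison for the much more
# expensive normalize+inference pass: if an agent's raw tree observation is identical
# to what it saw on the previous step, the previous action is replayed. A stripped-
# down sketch of the same idea — cached_act and act_fn are hypothetical names, and
# observations are assumed to be numpy-comparable as in the loop above:
import numpy as np

last_obs = {}     # agent handle -> last raw observation
last_action = {}  # agent handle -> action taken on that observation

def cached_act(agent, obs, act_fn):
    # Replay the previous action when the raw observation is unchanged,
    # otherwise run the expensive preprocessing + inference path.
    if agent in last_obs and np.all(last_obs[agent] == obs):
        return last_action[agent]
    action = act_fn(obs)
    last_obs[agent] = obs
    last_action[agent] = action
    return action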