def eval_policy(env, policy, n_eval_episodes, max_steps):
    action_dict = dict()
    scores = []
    completions = []
    nb_steps = []

    for episode_idx in range(n_eval_episodes):
        agent_obs = [None] * env.get_num_agents()
        score = 0.0

        obs, info = env.reset(regenerate_rail=True, regenerate_schedule=True)
        final_step = 0

        for step in range(max_steps - 1):
            for agent in env.get_agent_handles():
                if obs[agent]:
                    # TODO pass parameters properly
                    agent_obs[agent] = normalize_observation(
                        obs[agent], tree_depth=2, observation_radius=10)

                action = 0
                if info['action_required'][agent]:
                    action = policy.act(agent_obs[agent], eps=0.0)
                action_dict.update({agent: action})

            obs, all_rewards, done, info = env.step(action_dict)

            for agent in env.get_agent_handles():
                score += all_rewards[agent]

            final_step = step

            if done['__all__']:
                break

        normalized_score = score / (max_steps * env.get_num_agents())
        scores.append(normalized_score)

        tasks_finished = sum(done[idx] for idx in env.get_agent_handles())
        completion = tasks_finished / max(1, env.get_num_agents())
        completions.append(completion)

        nb_steps.append(final_step)

    print("\t✅ Eval: score {:.3f} done {:.1f}%".format(
        np.mean(scores), np.mean(completions) * 100.0))

    return scores, completions, nb_steps
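# A minimal usage sketch for eval_policy (not from the original source);
# `env`, `policy` and `max_steps` are assumed to be set up exactly as in
# the train_agent() function further below:
#
#   scores, completions, nb_steps = eval_policy(env, policy,
#                                               n_eval_episodes=25,
#                                               max_steps=max_steps)
#   print("mean eval score {:.3f}, completion {:.1f}%".format(
#       np.mean(scores), 100.0 * np.mean(completions)))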
def my_controller(env, obs, number_of_agents):
    # Note: done, info, action_dict, policy and observation_tree_depth are
    # module-level state maintained by the surrounding evaluation loop
    # (see the remote-client snippet at the end of this file).
    # ====================S====================
    for a in range(number_of_agents):
        if done[a]:
            continue

        agent = env.agents[a]
        if agent.speed_data['position_fraction'] > 5:
            action_dict.update({a: 4})  # 4 = STOP_MOVING
            continue

        if info['action_required'][a]:
            if agent.position is not None:
                possible_transition_num = np.count_nonzero(
                    env.rail.get_transitions(*agent.position, agent.direction))
                if possible_transition_num == 1:
                    action = 2  # only one way to go: 2 = MOVE_FORWARD
                else:
                    action = policy.act(
                        normalize_observation(obs[a], observation_tree_depth,
                                              zero_center=False),
                        eps=1.0)
            else:
                action = 2  # agent not yet placed on the grid: move forward to enter
        else:
            action = 0  # 0 = DO_NOTHING
        action_dict.update({a: action})
    return action_dict
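# Hedged usage sketch (not in the original snippet): my_controller is meant
# to be called once per simulation step inside the Flatland evaluation-service
# loop, with `done` and `info` carried over from the previous step, e.g.:
#
#   action_dict = my_controller(local_env, observation, number_of_agents)
#   observation, all_rewards, done, info = remote_client.env_step(action_dict)
#
# FlatlandRemoteClient.env_step() is part of flatland.evaluators.client;
# the surrounding loop is sketched after the last snippet in this file.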
def train_agent(env_params, train_params):
    # Environment parameters
    n_agents = env_params.n_agents
    x_dim = env_params.x_dim
    y_dim = env_params.y_dim
    n_cities = env_params.n_cities
    max_rails_between_cities = env_params.max_rails_between_cities
    max_rails_in_city = env_params.max_rails_in_city
    seed = env_params.seed

    # Observation parameters
    observation_tree_depth = env_params.observation_tree_depth
    observation_radius = env_params.observation_radius
    observation_max_path_depth = env_params.observation_max_path_depth

    # Training parameters
    eps_start = train_params.eps_start
    eps_end = train_params.eps_end
    eps_decay = train_params.eps_decay
    n_episodes = train_params.n_episodes
    checkpoint_interval = train_params.checkpoint_interval
    n_eval_episodes = train_params.n_evaluation_episodes

    # Set the seeds
    random.seed(seed)
    np.random.seed(seed)

    # Break agents from time to time
    malfunction_parameters = MalfunctionParameters(
        malfunction_rate=1. / 10000,  # Rate of malfunctions
        min_duration=15,  # Minimal duration
        max_duration=50  # Max duration
    )

    # Observation builder
    predictor = ShortestPathPredictorForRailEnv(observation_max_path_depth)
    tree_observation = TreeObsForRailEnv(max_depth=observation_tree_depth,
                                         predictor=predictor)

    # Fraction of the trains running at each speed
    speed_profiles = {
        1.: 1.0,  # Fast passenger train
        1. / 2.: 0.0,  # Fast freight train
        1. / 3.: 0.0,  # Slow commuter train
        1. / 4.: 0.0  # Slow freight train
    }

    # Set up the environment
    env = RailEnv(
        width=x_dim,
        height=y_dim,
        rail_generator=sparse_rail_generator(
            max_num_cities=n_cities,
            grid_mode=False,
            max_rails_between_cities=max_rails_between_cities,
            max_rails_in_city=max_rails_in_city),
        schedule_generator=sparse_schedule_generator(speed_profiles),
        number_of_agents=n_agents,
        malfunction_generator_and_process_data=malfunction_from_params(
            malfunction_parameters),
        obs_builder_object=tree_observation,
        random_seed=seed)

    env.reset(regenerate_schedule=True, regenerate_rail=True)

    # Set up the renderer
    if train_params.render:
        env_renderer = RenderTool(env, gl="PGL")

    # Calculate the state size given the depth of the tree observation and the number of features
    n_features_per_node = env.obs_builder.observation_dim
    n_nodes = 0
    for i in range(observation_tree_depth + 1):
        n_nodes += np.power(4, i)
    state_size = n_features_per_node * n_nodes

    # The action space of Flatland is 5 discrete actions
    action_size = 5

    # Max number of steps per episode
    # This is the official formula used during evaluations
    # See details in flatland.envs.schedule_generators.sparse_schedule_generator
    max_steps = int(4 * 2 * (env.height + env.width + (n_agents / n_cities)))

    action_count = [0] * action_size
    action_dict = dict()
    agent_obs = [None] * env.get_num_agents()
    agent_prev_obs = [None] * env.get_num_agents()
    agent_prev_action = [2] * env.get_num_agents()
    update_values = False
    smoothed_normalized_score = -1.0
    smoothed_eval_normalized_score = -1.0
    smoothed_completion = 0.0
    smoothed_eval_completion = 0.0

    # Double Dueling DQN policy
    policy = DDDQNPolicy(state_size, action_size, train_params)

    # TensorBoard writer
    writer = SummaryWriter()
    writer.add_hparams(vars(train_params), {})
    writer.add_hparams(vars(env_params), {})

    training_timer = Timer()
    training_timer.start()

    print("\n🚉 Training {} trains on {}x{} grid for {} episodes, "
          "evaluating on {} episodes every {} episodes.\n".format(
              env.get_num_agents(), x_dim, y_dim, n_episodes,
              n_eval_episodes, checkpoint_interval))

    for episode_idx in range(n_episodes + 1):
        # Timers
        step_timer = Timer()
        reset_timer = Timer()
        learn_timer = Timer()
        preproc_timer = Timer()

        # Reset environment
        reset_timer.start()
        obs, info = env.reset(regenerate_rail=True, regenerate_schedule=True)
        reset_timer.end()

        if train_params.render:
            env_renderer.set_new_rail()

        score = 0
        nb_steps = 0
        actions_taken = []

        # Build agent-specific observations
        for agent in env.get_agent_handles():
            if obs[agent]:
                agent_obs[agent] = normalize_observation(
                    obs[agent], observation_tree_depth,
                    observation_radius=observation_radius)
                agent_prev_obs[agent] = agent_obs[agent].copy()

        # Run episode
        for step in range(max_steps - 1):
            for agent in env.get_agent_handles():
                if info['action_required'][agent]:
                    # If an action is required, we want to store the obs at that step as well as the action
                    update_values = True
                    action = policy.act(agent_obs[agent], eps=eps_start)
                    action_count[action] += 1
                    actions_taken.append(action)
                else:
                    update_values = False
                    action = 0
                action_dict.update({agent: action})

            # Environment step
            step_timer.start()
            next_obs, all_rewards, done, info = env.step(action_dict)
            step_timer.end()

            if train_params.render and episode_idx % checkpoint_interval == 0:
                env_renderer.render_env(show=True,
                                        frames=False,
                                        show_observations=False,
                                        show_predictions=False)

            for agent in range(env.get_num_agents()):
                # Update replay buffer and train agent
                # Only update the values when we are done or when an action was taken and thus relevant information is present
                if update_values or done[agent]:
                    learn_timer.start()
                    policy.step(agent_prev_obs[agent], agent_prev_action[agent],
                                all_rewards[agent], agent_obs[agent], done[agent])
                    learn_timer.end()

                    agent_prev_obs[agent] = agent_obs[agent].copy()
                    agent_prev_action[agent] = action_dict[agent]

                # Preprocess the new observations
                if next_obs[agent]:
                    preproc_timer.start()
                    agent_obs[agent] = normalize_observation(
                        next_obs[agent], observation_tree_depth,
                        observation_radius=observation_radius)
                    preproc_timer.end()

                score += all_rewards[agent]

            nb_steps = step

            if done['__all__']:
                break

        # Epsilon decay
        eps_start = max(eps_end, eps_decay * eps_start)

        # Collect information about the training progress
        tasks_finished = sum(done[idx] for idx in env.get_agent_handles())
        completion = tasks_finished / max(1, env.get_num_agents())
        normalized_score = score / (max_steps * env.get_num_agents())
        action_probs = action_count / np.sum(action_count)
        action_count = [1] * action_size

        # Smoothed values for terminal display and for more stable hyper-parameter tuning
        smoothing = 0.99
        smoothed_normalized_score = smoothed_normalized_score * smoothing + normalized_score * (1.0 - smoothing)
        smoothed_completion = smoothed_completion * smoothing + completion * (1.0 - smoothing)

        # Save checkpoints
        if episode_idx % checkpoint_interval == 0:
            torch.save(policy.qnetwork_local,
                       './checkpoints/origin_multi-' + str(episode_idx) + '.pth')
            if train_params.render:
                env_renderer.close_window()

        # Print logs
        print('\r🚂 Episode {}'
              '\t 🏆 Score: {:.3f}'
              ' Avg: {:.3f}'
              '\t 💯 Done: {:.2f}%'
              ' Avg: {:.2f}%'
              '\t 🎲 Epsilon: {:.2f} '
              '\t 🔀 Action Probs: {}'.format(
                  episode_idx, normalized_score, smoothed_normalized_score,
                  100 * completion, 100 * smoothed_completion, eps_start,
                  format_action_prob(action_probs)),
              end=" ")

        # Evaluate policy
        if episode_idx % train_params.checkpoint_interval == 0:
            scores, completions, nb_steps_eval = eval_policy(
                env, policy, n_eval_episodes, max_steps)

            writer.add_scalar("evaluation/scores_min", np.min(scores), episode_idx)
            writer.add_scalar("evaluation/scores_max", np.max(scores), episode_idx)
            writer.add_scalar("evaluation/scores_mean", np.mean(scores), episode_idx)
            writer.add_scalar("evaluation/scores_std", np.std(scores), episode_idx)
            writer.add_histogram("evaluation/scores", np.array(scores), episode_idx)
            writer.add_scalar("evaluation/completions_min", np.min(completions), episode_idx)
            writer.add_scalar("evaluation/completions_max", np.max(completions), episode_idx)
            writer.add_scalar("evaluation/completions_mean", np.mean(completions), episode_idx)
            writer.add_scalar("evaluation/completions_std", np.std(completions), episode_idx)
            writer.add_histogram("evaluation/completions", np.array(completions), episode_idx)
            writer.add_scalar("evaluation/nb_steps_min", np.min(nb_steps_eval), episode_idx)
            writer.add_scalar("evaluation/nb_steps_max", np.max(nb_steps_eval), episode_idx)
            writer.add_scalar("evaluation/nb_steps_mean", np.mean(nb_steps_eval), episode_idx)
            writer.add_scalar("evaluation/nb_steps_std", np.std(nb_steps_eval), episode_idx)
            writer.add_histogram("evaluation/nb_steps", np.array(nb_steps_eval), episode_idx)

            smoothing = 0.9
            smoothed_eval_normalized_score = smoothed_eval_normalized_score * smoothing + np.mean(scores) * (1.0 - smoothing)
            smoothed_eval_completion = smoothed_eval_completion * smoothing + np.mean(completions) * (1.0 - smoothing)
            writer.add_scalar("evaluation/smoothed_score", smoothed_eval_normalized_score, episode_idx)
            writer.add_scalar("evaluation/smoothed_completion", smoothed_eval_completion, episode_idx)

        # Save logs to TensorBoard
        writer.add_scalar("training/score", normalized_score, episode_idx)
        writer.add_scalar("training/smoothed_score", smoothed_normalized_score, episode_idx)
        writer.add_scalar("training/completion", np.mean(completion), episode_idx)
        writer.add_scalar("training/smoothed_completion", np.mean(smoothed_completion), episode_idx)
        writer.add_scalar("training/nb_steps", nb_steps, episode_idx)
        writer.add_histogram("actions/distribution", np.array(actions_taken), episode_idx)
        writer.add_scalar("actions/nothing", action_probs[RailEnvActions.DO_NOTHING], episode_idx)
        writer.add_scalar("actions/left", action_probs[RailEnvActions.MOVE_LEFT], episode_idx)
        writer.add_scalar("actions/forward", action_probs[RailEnvActions.MOVE_FORWARD], episode_idx)
        writer.add_scalar("actions/right", action_probs[RailEnvActions.MOVE_RIGHT], episode_idx)
        writer.add_scalar("actions/stop", action_probs[RailEnvActions.STOP_MOVING], episode_idx)
        writer.add_scalar("training/epsilon", eps_start, episode_idx)
        writer.add_scalar("training/buffer_size", len(policy.memory), episode_idx)
        writer.add_scalar("training/loss", policy.loss, episode_idx)
        writer.add_scalar("timer/reset", reset_timer.get(), episode_idx)
        writer.add_scalar("timer/step", step_timer.get(), episode_idx)
        writer.add_scalar("timer/learn", learn_timer.get(), episode_idx)
        writer.add_scalar("timer/preproc", preproc_timer.get(), episode_idx)
        writer.add_scalar("timer/total", training_timer.get_current(), episode_idx)
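# A hypothetical way to invoke train_agent() above (not from the original
# source): only the attribute names actually read by the function are known;
# the concrete values, and the DDDQNPolicy hyper-parameters bundled into
# train_params, are illustrative assumptions mirroring the single-agent
# script further below.
if __name__ == '__main__':
    from argparse import Namespace

    hypothetical_env_params = Namespace(
        n_agents=5, x_dim=35, y_dim=35, n_cities=4,
        max_rails_between_cities=2, max_rails_in_city=3, seed=42,
        observation_tree_depth=2, observation_radius=10,
        observation_max_path_depth=30)
    hypothetical_train_params = Namespace(
        eps_start=1.0, eps_end=0.01, eps_decay=0.99,
        n_episodes=2500, checkpoint_interval=100,
        n_evaluation_episodes=25, render=False,
        # Assumed DDDQNPolicy hyper-parameters
        buffer_size=int(1e5), batch_size=128, update_every=8,
        learning_rate=0.5e-4, tau=1e-3, gamma=0.99,
        buffer_min_size=0, hidden_size=256, use_gpu=False)

    train_agent(hypothetical_env_params, hypothetical_train_params)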
def main():
    np.random.seed(1)

    env = RailEnv(
        width=x_dim,
        height=y_dim,
        number_of_agents=n_agents,
        rail_generator=rail_generator,
        schedule_generator=schedule_generator,
        malfunction_generator_and_process_data=malfunction_from_params(
            StochasticData(1 / 8000, 15, 50)),
        obs_builder_object=TreeObservation(max_depth=tree_depth))

    # After training we want to render the results, so we also load a renderer
    env_renderer = RenderTool(env, gl="PILSVG")

    # Calculate the state size based on the number of nodes in the tree observation
    num_features_per_node = env.obs_builder.observation_dim
    num_nodes = sum(np.power(4, i) for i in range(tree_depth + 1))
    state_size = num_features_per_node * num_nodes
    action_size = 5

    # Now we load a double dueling DQN agent and initialize it from the checkpoint
    agent = Agent(state_size, action_size)
    if load_from_checkpoint:
        start, eps = agent.load(project_root / 'checkpoints', 0, 1.0)
    else:
        start, eps = 0, 1.0

    # And some variables to keep track of the progress
    action_dict, final_action_dict = {}, {}
    scores_window, done_window = deque(maxlen=500), deque(maxlen=500)
    action_prob = [0] * action_size
    agent_obs = [None] * n_agents
    agent_obs_buffer = [None] * n_agents
    agent_action_buffer = [2] * n_agents
    max_steps = int(3 * (x_dim + y_dim))
    update_values = False
    start_time = time.time()

    # We don't want to retrain on old railway networks when we restart from a checkpoint,
    # so we just loop through the generators to get all the old networks out of the way
    for _ in range(0, start):
        rail_generator()
        schedule_generator()

    # Start the training loop
    for episode in range(start + 1, n_trials + 1):
        env_renderer.reset()
        obs, info = env.reset(True, True)
        score = 0

        # Build agent-specific observations
        for a in range(n_agents):
            if obs[a]:
                agent_obs[a] = normalize_observation(obs[a], tree_depth,
                                                     observation_radius=10)
                agent_obs_buffer[a] = agent_obs[a].copy()

        # Run episode
        for step in range(max_steps):
            for a in range(n_agents):
                if info['action_required'][a]:
                    # If an action is required, we want to store the obs at that step as well as the action
                    update_values = True
                    action = agent.act(agent_obs[a], eps=eps)
                    # action = np.random.randint(4)
                    action_dict[a] = action
                    action_prob[action] += 1
                else:
                    update_values = False
                    action_dict[a] = 0

            # Environment step
            next_obs, all_rewards, done, info = env.step(action_dict)

            # Update replay buffer and train agent
            for a in range(n_agents):
                # Only update the values when we are done or when an action was taken and thus relevant information is present
                if update_values or done[a]:
                    agent.step(agent_obs_buffer[a], agent_action_buffer[a],
                               all_rewards[a], agent_obs[a], done[a], train)
                    agent_obs_buffer[a] = agent_obs[a].copy()
                    agent_action_buffer[a] = action_dict[a]
                if next_obs[a]:
                    agent_obs[a] = normalize_observation(next_obs[a], tree_depth,
                                                         observation_radius=10)

                score += all_rewards[a] / n_agents

            # Render
            if episode % render_interval == 0:
                render(env_renderer)

            if done['__all__']:
                break

        # Epsilon decay
        eps = max(eps_end, eps_decay * eps)  # decrease epsilon

        # Collect information about the training progress
        tasks_finished = sum(done[i] for i in range(n_agents))
        done_window.append(tasks_finished / max(1, n_agents))
        scores_window.append(score / max_steps)  # save the most recent score
        action_probs = ', '.join(f'{x:.3f}'
                                 for x in action_prob / np.sum(action_prob))

        print(f'\rTraining {n_agents} Agents on ({x_dim},{y_dim}) \t ' +
              f'Episode {episode} \t ' +
              f'Average Score: {np.mean(scores_window):.3f} \t ' +
              f'Dones: {100 * np.mean(done_window):.2f}% \t ' +
              f'Epsilon: {eps:.2f} \t ' +
              f'Action Probabilities: {action_probs}', end=" ")

        if episode % report_interval == 0:
            print(f'\rTraining {n_agents} Agents on ({x_dim},{y_dim}) \t ' +
                  f'Episode {episode} \t ' +
                  f'Average Score: {np.mean(scores_window):.3f} \t ' +
                  f'Dones: {100 * np.mean(done_window):.2f}% \t ' +
                  f'Epsilon: {eps:.2f} \t ' +
                  f'Action Probabilities: {action_probs} \t ' +
                  f'Time taken: {time.time() - start_time:.2f}s')

            if train:
                agent.save(project_root / 'checkpoints', episode, eps)
            start_time = time.time()
            action_prob = [1] * action_size
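# Worked example for the state-size computation above (hedged: this assumes
# the tree observation builder exposes observation_dim == 11, which is the
# case in recent Flatland releases). For tree_depth = 2:
#   num_nodes  = 4**0 + 4**1 + 4**2 = 21
#   state_size = 21 * 11            = 231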
            f'Epsilon: {eps:.2f}' if flags.agent_type == "dqn" else None,
            f'Time taken: {time.time() - start_time:.2f}s' if show_time else None
        ])) + ' '

    # Main training loop
    for episode in range(start + 1, flags.num_episodes + 1):
        agent.reset()
        env_renderer.reset()
        obs, info = env.reset(True, True)
        score, steps_taken, collision = 0, 0, False

        # Build initial observations for each agent
        for a in range(flags.num_agents):
            agent_obs[a] = normalize_observation(
                obs[a], flags.tree_depth,
                zero_center=flags.agent_type == 'dqn')
            agent_obs_buffer[a] = agent_obs[a].copy()

        # Run an episode
        for step in range(max_steps):
            update_values = [False] * flags.num_agents
            action_dict = {}

            for a in range(flags.num_agents):
                if info['action_required'][a]:
                    action_dict[a] = agent.act(agent_obs[a], eps=eps)
                    # action_dict[a] = np.random.randint(5)
                    update_values[a] = True
                    steps_taken += 1
                else:
                    action_dict[a] = 0
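# Design note on this variant (its continuation is not part of the snippet):
# using a *list* update_values[a] instead of the single shared flag seen in
# the other training loops means the replay-buffer update can be gated per
# agent, e.g. (a hedged reconstruction, not code from the original file):
#
#   for a in range(flags.num_agents):
#       if update_values[a] or done[a]:
#           agent.step(agent_obs_buffer[a], agent_action_buffer[a],
#                      all_rewards[a], agent_obs[a], done[a])
#
# so one agent's "no action required" step no longer discards another
# agent's stored transition.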
def train_agent(n_episodes):
    # Environment parameters
    n_agents = 1
    x_dim = 25
    y_dim = 25
    n_cities = 4
    max_rails_between_cities = 2
    max_rails_in_city = 3
    seed = 42

    # Observation parameters
    observation_tree_depth = 2
    observation_radius = 10

    # Exploration parameters
    eps_start = 1.0
    eps_end = 0.01
    eps_decay = 0.997  # for 2500ts

    # Set the seeds
    random.seed(seed)
    np.random.seed(seed)

    # Observation builder
    tree_observation = TreeObsForRailEnv(max_depth=observation_tree_depth)

    # Set up the environment
    env = RailEnv(width=x_dim,
                  height=y_dim,
                  rail_generator=sparse_rail_generator(
                      max_num_cities=n_cities,
                      seed=seed,
                      grid_mode=False,
                      max_rails_between_cities=max_rails_between_cities,
                      max_rails_in_city=max_rails_in_city),
                  schedule_generator=sparse_schedule_generator(),
                  number_of_agents=n_agents,
                  obs_builder_object=tree_observation)
    env.reset(True, True)

    # Calculate the state size given the depth of the tree observation and the number of features
    n_features_per_node = env.obs_builder.observation_dim
    n_nodes = 0
    for i in range(observation_tree_depth + 1):
        n_nodes += np.power(4, i)
    state_size = n_features_per_node * n_nodes

    # The action space of Flatland is 5 discrete actions
    action_size = 5

    # Max number of steps per episode
    # This is the official formula used during evaluations
    max_steps = int(4 * 2 * (env.height + env.width + (n_agents / n_cities)))

    action_dict = dict()

    # And some variables to keep track of the progress
    scores_window = deque(maxlen=100)  # todo smooth when rendering instead
    completion_window = deque(maxlen=100)
    scores = []
    completion = []
    action_count = [0] * action_size
    agent_obs = [None] * env.get_num_agents()
    agent_prev_obs = [None] * env.get_num_agents()
    agent_prev_action = [2] * env.get_num_agents()
    update_values = False

    # Training parameters
    training_parameters = {
        'buffer_size': int(1e5),
        'batch_size': 32,
        'update_every': 8,
        'learning_rate': 0.5e-4,
        'tau': 1e-3,
        'gamma': 0.99,
        'buffer_min_size': 0,
        'hidden_size': 256,
        'use_gpu': False
    }

    # Double Dueling DQN policy
    policy = DDDQNPolicy(state_size, action_size,
                         Namespace(**training_parameters))

    for episode_idx in range(n_episodes):
        score = 0

        # Reset environment
        obs, info = env.reset(regenerate_rail=True, regenerate_schedule=True)

        # Build agent-specific observations
        for agent in env.get_agent_handles():
            if obs[agent]:
                agent_obs[agent] = normalize_observation(
                    obs[agent], observation_tree_depth,
                    observation_radius=observation_radius)
                agent_prev_obs[agent] = agent_obs[agent].copy()

        # Run episode
        for step in range(max_steps - 1):
            for agent in env.get_agent_handles():
                if info['action_required'][agent]:
                    # If an action is required, we want to store the obs at that step as well as the action
                    update_values = True
                    action = policy.act(agent_obs[agent], eps=eps_start)
                    action_count[action] += 1
                else:
                    update_values = False
                    action = 0
                action_dict.update({agent: action})

            # Environment step
            next_obs, all_rewards, done, info = env.step(action_dict)

            # Update replay buffer and train agent
            for agent in range(env.get_num_agents()):
                # Only update the values when we are done or when an action was taken and thus relevant information is present
                if update_values or done[agent]:
                    policy.step(agent_prev_obs[agent], agent_prev_action[agent],
                                all_rewards[agent], agent_obs[agent], done[agent])
                    agent_prev_obs[agent] = agent_obs[agent].copy()
                    agent_prev_action[agent] = action_dict[agent]

                if next_obs[agent]:
                    agent_obs[agent] = normalize_observation(
                        next_obs[agent], observation_tree_depth,
                        observation_radius=10)

                score += all_rewards[agent]

            if done['__all__']:
                break

        # Epsilon decay
        eps_start = max(eps_end, eps_decay * eps_start)

        # Collect information about the training progress
        tasks_finished = np.sum(
            [int(done[idx]) for idx in env.get_agent_handles()])
        completion_window.append(tasks_finished / max(1, env.get_num_agents()))
        scores_window.append(score / (max_steps * env.get_num_agents()))
        completion.append(np.mean(completion_window))
        scores.append(np.mean(scores_window))
        action_probs = action_count / np.sum(action_count)

        if episode_idx % 100 == 0:
            end = "\n"
            torch.save(policy.qnetwork_local,
                       './checkpoints/single-' + str(episode_idx) + '.pth')
            action_count = [1] * action_size
        else:
            end = " "

        print('\rTraining {} agents on {}x{}\t Episode {}\t '
              'Average Score: {:.3f}\tDones: {:.2f}%\tEpsilon: {:.2f} \t '
              'Action Probabilities: \t {}'.format(
                  env.get_num_agents(), x_dim, y_dim, episode_idx,
                  np.mean(scores_window), 100 * np.mean(completion_window),
                  eps_start, action_probs),
              end=end)

    # Plot overall training progress at the end
    plt.plot(scores)
    plt.show()
    plt.plot(completion)
    plt.show()
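# Back-of-envelope check on the decay schedule above (not from the original
# code): with eps <- max(eps_end, eps_decay * eps) applied once per episode,
# epsilon hits its floor after roughly log(eps_end / eps_start) / log(eps_decay)
# episodes. For eps_start=1.0, eps_end=0.01, eps_decay=0.997 that is
# log(0.01) / log(0.997) ~= 1533 episodes, consistent with the "for 2500ts"
# comment: exploration fades well before training ends.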
def main():
    np.random.seed(1)

    env = RailEnv(
        width=flags.grid_width,
        height=flags.grid_height,
        number_of_agents=flags.num_agents,
        rail_generator=rail_generator,
        schedule_generator=schedule_generator,
        malfunction_generator_and_process_data=malfunction_from_params(
            MalfunctionParameters(1 / 8000, 15, 50)),
        obs_builder_object=TreeObservation(max_depth=flags.tree_depth))

    # After training we want to render the results, so we also load a renderer
    env_renderer = RenderTool(env, gl="PILSVG")

    # Calculate the state size based on the number of nodes in the tree observation
    num_features_per_node = env.obs_builder.observation_dim
    num_nodes = sum(np.power(4, i) for i in range(flags.tree_depth + 1))
    state_size = num_nodes * num_features_per_node
    action_size = 5

    # Now we load a double dueling DQN agent and initialize it from the checkpoint
    agent = Agent(state_size, action_size)
    if flags.load_from_checkpoint:
        start, eps = agent.load(project_root / 'checkpoints', 0, 1.0)
    else:
        start, eps = 0, 1.0

    # And some variables to keep track of the progress
    action_dict, final_action_dict = {}, {}
    scores_window, steps_window, done_window = (deque(maxlen=200),
                                                deque(maxlen=200),
                                                deque(maxlen=200))
    action_prob = [0] * action_size
    agent_obs = [None] * flags.num_agents
    agent_obs_buffer = [None] * flags.num_agents
    agent_action_buffer = [2] * flags.num_agents
    max_steps = int(8 * (flags.grid_width + flags.grid_height))
    update_values = False
    start_time = time.time()

    # We don't want to retrain on old railway networks when we restart from a checkpoint,
    # so we just loop through the generators to get all the old networks out of the way
    if start > 0:
        print(f"Skipping {start} railways")
        for _ in range(0, start):
            rail_generator()
            schedule_generator()

    # Start the training loop
    for episode in range(start + 1, flags.num_episodes + 1):
        env_renderer.reset()
        obs, info = env.reset(True, True)
        score, steps_taken = 0, 0

        # Build agent-specific observations
        for a in range(flags.num_agents):
            if obs[a]:
                agent_obs[a] = normalize_observation(obs[a], flags.tree_depth)
                agent_obs_buffer[a] = agent_obs[a].copy()

        # Run episode
        for step in range(max_steps):
            for a in range(flags.num_agents):
                # if not isinstance(obs[a].childs['L'], float) or not isinstance(obs[a].childs['R'], float):
                if info['action_required'][a]:
                    # If an action is required, we want to store the obs at that step as well as the action
                    update_values = True
                    # Alternative hand-crafted policies that were tried out:
                    # distances = {key: child.dist_min_to_target
                    #              for key, child in obs[a].childs.items()
                    #              if not isinstance(child, float)}
                    # action_key = min(distances, key=distances.get)
                    # action = {'L': 1, 'F': 2, 'R': 3}[action_key]
                    # action = np.argmin(agent_obs[a])
                    # action = np.random.randint(4)
                    action = agent.act(agent_obs[a], eps=eps)
                    action_dict[a] = action
                    action_prob[action] += 1
                    steps_taken += 1
                else:
                    update_values = False
                    action_dict[a] = 2

            # Environment step
            obs, all_rewards, done, info = env.step(action_dict)

            # Update replay buffer and train agent
            for a in range(flags.num_agents):
                # Only update the values when we are done or when an action was taken and thus relevant information is present
                if update_values or done[a]:
                    agent.step(agent_obs_buffer[a], agent_action_buffer[a],
                               all_rewards[a], agent_obs[a], done[a], flags.train)
                    agent_obs_buffer[a] = agent_obs[a].copy()
                    agent_action_buffer[a] = action_dict[a]
                if obs[a]:
                    agent_obs[a] = normalize_observation(obs[a], flags.tree_depth)

                score += all_rewards[a] / flags.num_agents

            # Render
            if flags.render_interval and episode % flags.render_interval == 0:
                render(env_renderer)

            if done['__all__']:
                break

        # Epsilon decay
        eps = max(0.01, flags.epsilon_decay * eps)

        # Save some training statistics in their respective deques
        tasks_finished = sum(done[i] for i in range(flags.num_agents))
        done_window.append(tasks_finished / max(1, flags.num_agents))
        scores_window.append(score / max_steps)
        steps_window.append(steps_taken)
        action_probs = ', '.join(f'{x:.3f}'
                                 for x in action_prob / np.sum(action_prob))

        print(f'\rTraining {flags.num_agents} Agents on ({flags.grid_width},{flags.grid_height}) \t ' +
              f'Episode {episode} \t ' +
              f'Average Score: {np.mean(scores_window):.3f} \t ' +
              f'Average Steps Taken: {np.mean(steps_window):.1f} \t ' +
              f'Dones: {100 * np.mean(done_window):.2f}% \t ' +
              f'Epsilon: {eps:.2f} \t ' +
              f'Action Probabilities: {action_probs}', end=" ")

        if episode % flags.report_interval == 0:
            print(f'\rTraining {flags.num_agents} Agents on ({flags.grid_width},{flags.grid_height}) \t ' +
                  f'Episode {episode} \t ' +
                  f'Average Score: {np.mean(scores_window):.3f} \t ' +
                  f'Average Steps Taken: {np.mean(steps_window):.1f} \t ' +
                  f'Dones: {100 * np.mean(done_window):.2f}% \t ' +
                  f'Epsilon: {eps:.2f} \t ' +
                  f'Action Probabilities: {action_probs} \t ' +
                  f'Time taken: {time.time() - start_time:.2f}s')

            if flags.train:
                agent.save(project_root / 'checkpoints', episode, eps)
            start_time = time.time()
            action_prob = [1] * action_size
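# Note on the reset above (an observation, not original commentary):
# action_prob is re-initialised to ones rather than zeros so that the next
# report's division by np.sum(action_prob) can never divide by zero; a small
# Laplace-style smoothing of the displayed action distribution.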
    # the state of the remote copy of the env, and the observations and
    # rewards, etc. will behave unexpectedly
    #
    # You can however probe the local_env instance to get any information
    # you need from the environment. It is a valid RailEnv instance.
    local_env = remote_client.env
    number_of_agents = len(local_env.agents)

    # ====================S====================
    done = local_env.dones
    agent_obs = [None] * number_of_agents
    # agent_obs_buffer = [None] * number_of_agents

    # Build initial observations for each agent
    for a in range(number_of_agents):
        agent_obs[a] = normalize_observation(observation[a],
                                             observation_tree_depth,
                                             zero_center=False)
    # ====================E====================

    # Now we enter into another infinite loop where we
    # compute the actions for all the individual steps in this episode
    # until the episode is `done`
    #
    # An episode is considered done when either all the agents have
    # reached their target destination
    # or when the number of time steps has exceeded max_time_steps, which
    # is defined by:
    #
    #   max_time_steps = int(4 * 2 * (env.width + env.height + 20))
    #
    time_taken_by_controller = []
    time_taken_per_step = []
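    # Hedged sketch of how this episode loop typically continues in the
    # Flatland starter kit (my_controller is the function defined earlier;
    # remote_client.env_step() is part of flatland.evaluators.client):
    #
    #   while True:
    #       time_start = time.time()
    #       action_dict = my_controller(local_env, observation, number_of_agents)
    #       time_taken_by_controller.append(time.time() - time_start)
    #
    #       time_start = time.time()
    #       observation, all_rewards, done, info = remote_client.env_step(action_dict)
    #       time_taken_per_step.append(time.time() - time_start)
    #
    #       if done['__all__']:
    #           break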