def test(args, T, ep, dqn, val_mem, metrics, results_dir, evaluate=False):
    # Init env and set in evaluation mode
    # Maps speeds to % of appearance in the env
    speed_ration_map = {
        1.: 0.25,        # Fast passenger train
        1. / 2.: 0.25,   # Fast freight train
        1. / 3.: 0.25,   # Slow commuter train
        1. / 4.: 0.25    # Slow freight train
    }
    schedule_generator = sparse_schedule_generator(speed_ration_map)
    observation_builder = GraphObsForRailEnv(
        predictor=ShortestPathPredictorForRailEnv(
            max_depth=args.prediction_depth))

    env = RailEnv(
        width=args.width,
        height=args.height,
        rail_generator=sparse_rail_generator(
            max_num_cities=args.max_num_cities,
            seed=ep,  # Use episode as seed when evaluation is performed during training
            grid_mode=args.grid_mode,
            max_rails_between_cities=args.max_rails_between_cities,
            max_rails_in_city=args.max_rails_in_city,
        ),
        schedule_generator=schedule_generator,
        number_of_agents=args.num_agents,
        obs_builder_object=observation_builder,
        malfunction_generator_and_process_data=malfunction_from_params(
            parameters={
                'malfunction_rate': args.malfunction_rate,
                'min_duration': args.min_duration,
                'max_duration': args.max_duration
            }),
    )

    if args.render:
        env_renderer = RenderTool(
            env,
            gl="PILSVG",
            agent_render_variant=AgentRenderVariant.AGENT_SHOWS_OPTIONS_AND_BOX,
            show_debug=True,
            screen_height=1080,
            screen_width=1920)

    # max_time_steps = env.compute_max_episode_steps(env.width, env.height)
    max_time_steps = 200  # TODO Debug

    # metrics['steps'].append(T)
    metrics['episodes'].append(ep)
    T_rewards = []  # List of episode rewards
    T_Qs = []  # List of Q values
    T_num_done_agents = []  # Fraction of done agents for each episode
    T_all_done = []  # Whether all agents completed in each episode
    network_action_dict = dict()
    railenv_action_dict = dict()
    qvalues = {}

    # Test performance over several episodes
    for ep in range(args.evaluation_episodes):
        # Reset info
        state, info = env.reset()
        reward_sum, all_done = 0, False  # reward_sum is the cumulative reward obtained over the steps
        num_done_agents = 0
        if args.render:
            env_renderer.reset()

        # Choose first action - decide entering of agents into the environment
        for a in range(env.get_num_agents()):
            action = np.random.choice((0, 2))
            railenv_action_dict.update({a: action})
        state, reward, done, info = env.step(railenv_action_dict)  # Env step
        reward_sum += sum(reward[a] for a in range(env.get_num_agents()))

        if args.render:
            env_renderer.render_env(show=True, show_observations=False, show_predictions=True)

        for step in range(max_time_steps - 1):
            # Choose actions
            for a in range(env.get_num_agents()):
                if info['action_required'][a]:
                    network_action = dqn.act(state[a])  # Choose an action greedily (with noisy weights)
                    # network_action = 0
                    railenv_action = observation_builder.choose_railenv_action(a, network_action)
                    qvalues.update({a: dqn.get_q_values(state[a])})
                else:
                    network_action = 0
                    railenv_action = 0
                    qvalues.update({a: [0, 0]})  # '0' if it wasn't updated
                railenv_action_dict.update({a: railenv_action})
                network_action_dict.update({a: network_action})

            if args.debug:
                for a in range(env.get_num_agents()):
                    print('#########################################')
                    print('Info for agent {}'.format(a))
                    print('Occupancy, first layer: {}'.format(state[a][:args.prediction_depth]))
                    print('Occupancy, second layer: {}'.format(
                        state[a][args.prediction_depth:args.prediction_depth * 2]))
                    print('Forks: {}'.format(
                        state[a][args.prediction_depth * 2:args.prediction_depth * 3]))
                    print('Target: {}'.format(
                        state[a][args.prediction_depth * 3:args.prediction_depth * 4]))
                    print('Priority: {}'.format(state[a][args.prediction_depth * 4]))
                    print('Max priority encountered: {}'.format(state[a][args.prediction_depth * 4 + 1]))
                    print('Num malfunctioning agents (globally): {}'.format(
                        state[a][args.prediction_depth * 4 + 2]))
                    print('Num agents ready to depart (globally): {}'.format(
                        state[a][args.prediction_depth * 4 + 3]))
                    print('Status: {}'.format(info['status'][a]))
                    print('Position: {}'.format(env.agents[a].position))
                    print('Moving? {} at speed: {}'.format(env.agents[a].moving, info['speed'][a]))
                    print('Action required? {}'.format(info['action_required'][a]))
                    print('Network action: {}'.format(network_action_dict[a]))
                    print('Railenv action: {}'.format(railenv_action_dict[a]))
                    print('Q values: {}'.format(qvalues[a]))
                    # print('QValues: {}'.format(qvalues))
                    print('Rewards: {}'.format(reward[a]))
                # Breakpoint for debugging here

            state, reward, done, info = env.step(railenv_action_dict)  # Env step
            if args.render:
                env_renderer.render_env(show=True, show_observations=False, show_predictions=True)
            reward_sum += sum(reward[a] for a in range(env.get_num_agents()))

            if done['__all__']:
                all_done = True
                break

        # No need to close the renderer since env parameter sizes stay the same
        T_rewards.append(reward_sum)
        # Compute num of agents that reached their target
        for a in range(env.get_num_agents()):
            if done[a]:
                num_done_agents += 1
        T_num_done_agents.append(num_done_agents / env.get_num_agents())  # In proportion to total
        T_all_done.append(all_done)

    # Test Q-values over validation memory
    for state in val_mem:  # Iterate over valid states
        T_Qs.append(dqn.evaluate_q(state))
    if args.debug:
        print('T_Qs: {}'.format(T_Qs))  # These are Qs from a single agent TODO

    avg_done_agents = sum(T_num_done_agents) / len(T_num_done_agents)  # Average fraction of agents that reached their target
    avg_reward = sum(T_rewards) / len(T_rewards)
    avg_norm_reward = avg_reward / (max_time_steps / env.get_num_agents())

    # avg_reward, avg_Q = sum(T_rewards) / len(T_rewards), sum(T_Qs) / len(T_Qs)
    if not evaluate:
        # Save model parameters if improved
        if avg_done_agents > metrics['best_avg_done_agents']:
            metrics['best_avg_done_agents'] = avg_done_agents
            dqn.save(results_dir)

        # Append to results and save metrics
        metrics['rewards'].append(T_rewards)
        metrics['Qs'].append(T_Qs)
        torch.save(metrics, os.path.join(results_dir, 'metrics.pth'))

        # Plot HTML
        _plot_line(metrics['episodes'], metrics['rewards'], 'Reward', path=results_dir)  # Plot rewards in episodes
        _plot_line(metrics['episodes'], metrics['Qs'], 'Q', path=results_dir)

    # Return average fraction of done agents and average reward
    return avg_done_agents, avg_reward, avg_norm_reward
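# NOTE: test() calls a _plot_line helper that is not defined in this file. The
# '# Plot HTML' comment suggests the original used an HTML backend (e.g.
# plotly); the following is only a minimal sketch, assuming matplotlib is an
# acceptable substitute and that 'ys' is a list of per-episode value lists
# (as metrics['rewards'] and metrics['Qs'] are above):
def _plot_line(xs, ys, title, path=''):
    import matplotlib
    matplotlib.use('Agg')  # Render to file, no display needed
    import matplotlib.pyplot as plt

    # Reduce each episode's list of values to min/mean/max for a summary plot
    ys_min = [min(y) for y in ys]
    ys_mean = [sum(y) / len(y) for y in ys]
    ys_max = [max(y) for y in ys]

    plt.figure()
    plt.plot(xs, ys_mean, label='mean')
    plt.fill_between(xs, ys_min, ys_max, alpha=0.3, label='min/max')
    plt.xlabel('Episode')
    plt.ylabel(title)
    plt.legend()
    plt.savefig(os.path.join(path, title + '.png'))
    plt.close()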
def main(args, dir):
    '''
    :param args:
    :return:

    Episodes to debug (set breakpoint in episodes loop to debug):
    - ep = 3, agent 1 spawns in front of 3, blocking its path; 0 and 2 are in a deadlock since they have the same priority
    - ep = 4, agents stop because of wrong priorities even though the conflict zone wasn't entered
    - ep = 14, ...
    '''
    rail_generator = sparse_rail_generator(
        max_num_cities=args.max_num_cities,
        seed=args.seed,
        grid_mode=args.grid_mode,
        max_rails_between_cities=args.max_rails_between_cities,
        max_rails_in_city=args.max_rails_in_city,
    )
    # Maps speeds to % of appearance in the env
    speed_ration_map = {
        1.: 0.25,        # Fast passenger train
        1. / 2.: 0.25,   # Fast freight train
        1. / 3.: 0.25,   # Slow commuter train
        1. / 4.: 0.25    # Slow freight train
    }
    observation_builder = GraphObsForRailEnv(
        predictor=ShortestPathPredictorForRailEnv(max_depth=args.prediction_depth),
        bfs_depth=4)

    env = RailEnv(
        width=args.width,
        height=args.height,
        rail_generator=rail_generator,
        schedule_generator=sparse_schedule_generator(speed_ration_map),
        number_of_agents=args.num_agents,
        obs_builder_object=observation_builder,
        malfunction_generator_and_process_data=malfunction_from_params(
            parameters={
                'malfunction_rate': args.malfunction_rate,  # Rate of malfunction occurrence
                'min_duration': args.min_duration,  # Minimal duration of malfunction
                'max_duration': args.max_duration  # Max duration of malfunction
            }))

    if args.render:
        env_renderer = RenderTool(
            env,
            agent_render_variant=AgentRenderVariant.AGENT_SHOWS_OPTIONS_AND_BOX,
            show_debug=True)

    sm = stateMachine()
    tb = TestBattery(env, observation_builder)

    state_machine_action_dict = {}
    railenv_action_dict = {}
    # max_time_steps = env.compute_max_episode_steps(args.width, args.height)
    max_time_steps = 200
    T_rewards = []  # List of episode rewards
    T_Qs = []  # List of Q values
    T_num_done_agents = []  # Fraction of done agents for each episode
    T_all_done = []  # Whether all agents completed in each episode
    T_episodes = []  # Time taken for each episode

    if args.save_image and not os.path.isdir("image_dump"):
        os.makedirs("image_dump")

    step_taken = 0
    total_step_taken = 0
    total_episodes = 0
    step_times = []  # Time taken for each step

    for ep in range(args.num_episodes):
        # Reset info at the beginning of an episode
        start_time = time.time()  # Take time of one episode

        if args.generate_baseline:
            if not os.path.isdir("image_dump/" + str(dir)) and args.save_image:
                os.makedirs("image_dump/" + str(dir))
        else:
            if not os.path.isdir("image_dump/" + str(ep)) and args.save_image:
                os.makedirs("image_dump/" + str(ep))

        state, info = env.reset()
        tb.reset()

        if args.render:
            env_renderer.reset()
        reward_sum, all_done = 0, False  # reward_sum is the cumulative reward obtained over the steps
        num_done_agents = 0

        state_machine_action = {}
        for i in range(env.number_of_agents):
            state_machine_action[i] = 0

        for step in range(max_time_steps):
            start_step_time = time.time()

            # if step % 10 == 0:
            #     print(step)

            # Test battery (see test_battery.py)
            triggers = tb.tests(state, args.prediction_depth, state_machine_action)
            # State machine based on triggers of test battery (see state_machine.py)
            state_machine_action = sm.act(triggers)  # State machine picks action

            for a in range(env.get_num_agents()):
                # if info['action_required'][a]:
                #     railenv_action = observation_builder.choose_railenv_action(a, state_machine_action[a])
                #     state_machine_action_dict.update({a: state_machine_action})
                #     railenv_action_dict.update({a: railenv_action})
                railenv_action = observation_builder.choose_railenv_action(a, state_machine_action[a])
                state_machine_action_dict.update({a: state_machine_action})
                railenv_action_dict.update({a: railenv_action})

            state, reward, done, info = env.step(railenv_action_dict)  # Env step

            if args.generate_baseline:
                # env_renderer.render_env(show=True, show_observations=False, show_predictions=True)
                env_renderer.render_env(show=False, show_observations=False, show_predictions=True)
            else:
                env_renderer.render_env(show=True, show_observations=False, show_predictions=True)

            if args.generate_baseline:
                if args.save_image:
                    env_renderer.save_image("image_dump/" + str(dir) + "/image_" + str(step) + "_.png")
            else:
                if args.save_image:
                    env_renderer.save_image("image_dump/" + str(ep) + "/image_" + str(step) + "_.png")

            if args.debug:
                for a in range(env.get_num_agents()):
                    log('\n\n#########################################')
                    log('\nInfo for agent {}'.format(a))
                    # log('\npath : {}'.format(state[a]["path"]))
                    log('\noverlap : {}'.format(state[a]["overlap"]))
                    log('\ndirection : {}'.format(state[a]["direction"]))
                    log('\nOccupancy, first layer: {}'.format(state[a]["occupancy"]))
                    log('\nOccupancy, second layer: {}'.format(state[a]["conflict"]))
                    log('\nForks: {}'.format(state[a]["forks"]))
                    log('\nTarget: {}'.format(state[a]["target"]))
                    log('\nPriority: {}'.format(state[a]["priority"]))
                    log('\nMax priority encountered: {}'.format(state[a]["max_priority"]))
                    log('\nNum malfunctioning agents (globally): {}'.format(state[a]["n_malfunction"]))
                    log('\nNum agents ready to depart (globally): {}'.format(state[a]["ready_to_depart"]))
                    log('\nStatus: {}'.format(info['status'][a]))
                    log('\nPosition: {}'.format(env.agents[a].position))
                    log('\nTarget: {}'.format(env.agents[a].target))
                    log('\nMoving? {} at speed: {}'.format(env.agents[a].moving, info['speed'][a]))
                    log('\nAction required? {}'.format(info['action_required'][a]))
                    log('\nState machine action: {}'.format(state_machine_action_dict[a]))
                    log('\nRailenv action: {}'.format(railenv_action_dict[a]))
                    log('\nRewards: {}'.format(reward[a]))
                    log('\n\n#########################################')

            reward_sum += sum(reward[a] for a in range(env.get_num_agents()))
            step_taken = step
            time_taken_step = time.time() - start_step_time
            step_times.append(time_taken_step)

            if done['__all__']:
                all_done = True
                break

        total_step_taken += step_taken
        time_taken = time.time() - start_time  # Time taken for one episode
        T_episodes.append(time_taken)  # Record episode duration for avg_ep_time
        total_episodes = ep

        # Time metrics - too precise
        avg_time_step = sum(step_times) / len(step_times)
        # print("Avg time step: " + str(avg_time_step))

        # No need to close the renderer since env parameter sizes stay the same
        T_rewards.append(reward_sum)
        # Compute num of agents that reached their target
        for a in range(env.get_num_agents()):
            if done[a]:
                num_done_agents += 1
        percentage_done_agents = num_done_agents / env.get_num_agents()
        log("\nDone agents in episode: {}".format(percentage_done_agents))
        T_num_done_agents.append(percentage_done_agents)  # In proportion to total
        T_all_done.append(all_done)

    # Average fraction of agents that reached their target
    avg_done_agents = sum(T_num_done_agents) / len(T_num_done_agents) if len(T_num_done_agents) > 0 else 0
    avg_reward = sum(T_rewards) / len(T_rewards) if len(T_rewards) > 0 else 0
    avg_norm_reward = avg_reward / (max_time_steps / env.get_num_agents())

    avg_ep_time = sum(T_episodes) / args.num_episodes

    if total_episodes == 0:
        total_episodes = 1

    log("\nSeed: " + str(args.seed)
        + "\t | Avg_done_agents: " + str(avg_done_agents)
        + "\t | Avg_reward: " + str(avg_reward)
        + "\t | Avg_norm_reward: " + str(avg_norm_reward)
        + "\t | Max_num_time_steps: " + str(max_time_steps)
        + "\t | Avg_num_time_steps: " + str(total_step_taken / total_episodes)
        + "\t | Avg episode time: " + str(avg_ep_time))
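# NOTE: the script above calls log() without defining it. The Rainbow training
# script later in this file defines an ISO 8601 timestamped logger inside its
# main(); a module-level sketch along the same lines (an assumption, not the
# original helper):
from datetime import datetime

def log(s):
    # Prefix every message with an ISO 8601 timestamp, mirroring the nested
    # logger defined in the training script's main()
    print('[' + datetime.now().strftime('%Y-%m-%dT%H:%M:%S') + '] ' + s)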
    print('\rTest: {}\t Step / MaxSteps: {} / {}'.format(
        test, step + 1, max_time_steps), end=" ")

    '''
    for agent_idx, agent in enumerate(env.agents):
        print(
            "Agent {} has state {} in (current) position {} with malfunction {}".format(
                agent_idx, str(agent.status), str(agent.position),
                str(agent.malfunction_data['malfunction'])))
    '''

    # Choose an action for each agent in the environment
    for a in range(env.get_num_agents()):
        if info['action_required'][a]:
            # print('Agent {} needs to submit an action'.format(a))
            # 'railenv_action' is in [0, 4], 'network_action' is in [0, 1]
            network_action = controller.act(obs[a])
            railenv_action = observation_builder.choose_railenv_action(a, network_action)
        else:
            network_action = 0
            railenv_action = 0  # DO NOTHING
        railenv_action_dict.update({a: railenv_action})
        network_action_dict.update({a: network_action})

    # for a in range(env.get_num_agents()):
    for a in (0, 1, 2, 3):
        print('#########################################')
        print('Info for agent {}'.format(a))
        print('Obs: {}'.format(obs[a]))
        print('Status: {}'.format(info['status'][a]))
        print('Moving? {} at speed: {}'.format(env.agents[a].moving, info['speed'][a]))
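# The fragment above assumes 'controller', 'obs', and 'info' were prepared
# earlier, presumably a trained policy restored from a checkpoint. A hedged,
# hypothetical sketch of that setup, reusing the Agent class and checkpoint
# naming from the training script at the end of this file (the exact path and
# state size are assumptions):
state_size = args.prediction_depth * 3 + 4  # As in the GraphObs branch of the training script
agent = Agent(network_type='fc', state_size=state_size, action_size=2)
agent.qnetwork_local.load_state_dict(
    torch.load('./nets/' + str(args.model_name) + '.pth'))  # Path is an assumption
controller = agent
obs, info = env.reset()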
def main(args):
    rail_generator = sparse_rail_generator(
        max_num_cities=args.max_num_cities,
        # seed=args.seed,
        seed=0,  # 0, 3, 7, 10, 14, 16, 20, 22, 23, 25, 26, 32
        grid_mode=args.grid_mode,
        max_rails_between_cities=args.max_rails_between_cities,
        max_rails_in_city=args.max_rails_in_city,
    )
    # Maps speeds to % of appearance in the env
    speed_ration_map = {
        1.: 0.25,        # Fast passenger train
        1. / 2.: 0.25,   # Fast freight train
        1. / 3.: 0.25,   # Slow commuter train
        1. / 4.: 0.25    # Slow freight train
    }

    observation_builder = GraphObsForRailEnv(
        bfs_depth=args.bfs_depth,
        predictor=ShortestPathPredictorForRailEnv(max_depth=args.prediction_depth))

    # Construct the environment with the given observation, generators, predictors, and stochastic data
    env = RailEnv(
        width=args.width,
        height=args.height,
        rail_generator=rail_generator,  # rail_from_file could be an alternative here
        schedule_generator=sparse_schedule_generator(speed_ration_map),
        number_of_agents=args.num_agents,
        obs_builder_object=observation_builder,
        malfunction_generator_and_process_data=malfunction_from_params(
            parameters={
                'malfunction_rate': args.malfunction_rate,  # Rate of malfunction occurrence
                'min_duration': args.min_duration,  # Minimal duration of malfunction
                'max_duration': args.max_duration  # Max duration of malfunction
            }))

    env_renderer = RenderTool(
        env,
        agent_render_variant=AgentRenderVariant.AGENT_SHOWS_OPTIONS_AND_BOX,
        show_debug=True,
        screen_height=1080,
        screen_width=1920)

    state_machine_action_dict = {}
    railenv_action_dict = {}
    max_time_steps = 150
    T_rewards = []  # List of episode rewards
    T_Qs = []  # List of Q values
    T_num_done_agents = []  # Fraction of done agents for each episode
    T_all_done = []  # Whether all agents completed in each episode

    for ep in range(args.num_episodes):
        # Reset info at the beginning of an episode
        # env.load(filename="map" + str(ep))
        state, info = env.reset()
        # env.save(filename="map" + str(ep))
        env_renderer.reset()
        reward_sum, all_done = 0, False  # reward_sum is the cumulative reward obtained over the steps
        num_done_agents = 0

        for step in range(max_time_steps):
            for a in range(env.get_num_agents()):
                shortest_path_prediction = observation_builder.cells_sequence[a]
                state_machine_action, is_alternative = act(
                    args, env, a, state[a], shortest_path_prediction)  # State machine picks action
                if not is_alternative:
                    railenv_action = observation_builder.choose_railenv_action(a, state_machine_action)
                else:
                    railenv_action = state_machine_action
                state_machine_action_dict.update({a: state_machine_action})
                railenv_action_dict.update({a: railenv_action})

            state, reward, done, info = env.step(railenv_action_dict)  # Env step

            env_renderer.render_env(show=True, show_observations=False, show_predictions=True)

            for a in range(env.get_num_agents()):
                print('#########################################')
                print('Info for agent {}'.format(a))
                print('Occupancy, first layer: {}'.format(state[a][:args.prediction_depth]))
                print('Occupancy, second layer: {}'.format(
                    state[a][args.prediction_depth:args.prediction_depth * 2]))
                print('Forks: {}'.format(
                    state[a][args.prediction_depth * 2:args.prediction_depth * 3]))
                print('Target: {}'.format(
                    state[a][args.prediction_depth * 3:args.prediction_depth * 4]))
                print('Priority: {}'.format(state[a][args.prediction_depth * 4]))
                print('Max priority encountered: {}'.format(state[a][args.prediction_depth * 4 + 1]))
                print('Num malfunctioning agents (globally): {}'.format(
                    state[a][args.prediction_depth * 4 + 2]))
                print('Num agents ready to depart (globally): {}'.format(
                    state[a][args.prediction_depth * 4 + 3]))
                print('Status: {}'.format(info['status'][a]))
                print('Position: {}'.format(env.agents[a].position))
                print('Moving? {} at speed: {}'.format(env.agents[a].moving, info['speed'][a]))
                print('Action required? {}'.format(info['action_required'][a]))
                print('Network action: {}'.format(state_machine_action_dict[a]))
                print('Railenv action: {}'.format(railenv_action_dict[a]))
                # print('Q values: {}'.format(qvalues[a]))
                print('Rewards: {}'.format(reward[a]))

            reward_sum += sum(reward[a] for a in range(env.get_num_agents()))

            if done['__all__']:
                all_done = True
                break

        # No need to close the renderer since env parameter sizes stay the same
        T_rewards.append(reward_sum)
        # Compute num of agents that reached their target
        for a in range(env.get_num_agents()):
            if done[a]:
                num_done_agents += 1
        T_num_done_agents.append(num_done_agents / env.get_num_agents())  # In proportion to total
        T_all_done.append(all_done)

    avg_done_agents = sum(T_num_done_agents) / len(T_num_done_agents)  # Average fraction of agents that reached their target
    avg_reward = sum(T_rewards) / len(T_rewards)
    avg_norm_reward = avg_reward / (max_time_steps / env.get_num_agents())

    print("Avg. done agents: {}".format(avg_done_agents))
    print("Avg. reward: {}".format(avg_reward))
    print("Avg. norm reward: {}".format(avg_norm_reward))
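# The GraphObsForRailEnv state vector decoded by hand in the debug prints above
# (and again in the training scripts) has four arrays of length prediction_depth
# followed by four scalars, matching state_size = prediction_depth * 4 + 4 in
# the Rainbow script. A small helper that names those slices (a hypothetical
# convenience, not part of the original code):
def unpack_graph_obs(state, prediction_depth):
    d = prediction_depth
    return {
        'occupancy': state[:d],                # First layer: path occupancy
        'conflict': state[d:2 * d],            # Second layer: predicted conflicts
        'forks': state[2 * d:3 * d],           # Cells where the path forks
        'target': state[3 * d:4 * d],          # Cells containing the agent's target
        'priority': state[4 * d],              # This agent's priority
        'max_priority': state[4 * d + 1],      # Max priority encountered
        'n_malfunction': state[4 * d + 2],     # Malfunctioning agents (globally)
        'ready_to_depart': state[4 * d + 3],   # Agents ready to depart (globally)
    }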
    # #####################################################################
    # Compute the action for this step by using the previously defined controller
    time_start = time.time()

    # Test battery (see test_battery.py)
    triggers = tb.tests(state, prediction_depth, state_machine_action)
    # State machine based on triggers of test battery (see state_machine.py)
    state_machine_action = sm.act(triggers)  # State machine picks action

    for a in range(number_of_agents):
        # state_machine_action = act(prediction_depth, state[a])  # State machine picks action
        railenv_action = observation_builder.choose_railenv_action(a, state_machine_action)
        # state_machine_action_dict.update({a: state_machine_action})
        railenv_action_dict.update({a: railenv_action})

    time_taken = time.time() - time_start
    time_taken_by_controller.append(time_taken)

    # Perform the chosen action on the environment.
    # The action gets applied to both the local and the remote copy of the
    # environment instance; the observation is returned by the local copy of
    # the env, while the rewards, done, and info are returned by the remote copy.
    time_start = time.time()
    state, reward, done, info = remote_client.env_step(railenv_action_dict)
    steps += 1
    time_taken = time.time() - time_start
    time_taken_per_step.append(time_taken)

    reward_sum += sum(list(reward.values()))
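# For context: the step above belongs inside a Flatland evaluation-service
# loop. A hedged sketch of the enclosing submission loop, assuming flatland-rl's
# FlatlandRemoteClient (names per the flatland.evaluators.client module; check
# against the installed version):
from flatland.evaluators.client import FlatlandRemoteClient

remote_client = FlatlandRemoteClient()
while True:
    # Ask the service for the next evaluation episode
    state, info = remote_client.env_create(obs_builder_object=observation_builder)
    if not state:
        break  # No more environments to evaluate
    # ... the controller/step code above runs here until done['__all__'] ...
print(remote_client.submit())  # Report results to the evaluation service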
def main(args):
    # Show options and values
    print(' ' * 26 + 'Options')
    for k, v in vars(args).items():
        print(' ' * 26 + k + ': ' + str(v))
    # Where to save models
    results_dir = os.path.join('results', args.id)
    if not os.path.exists(results_dir):
        os.makedirs(results_dir)
    # These are saved in a .pth
    metrics = {
        'episodes': [],  # originally 'steps'
        'rewards': [],
        'Qs': [],
        'best_avg_done_agents': -float('inf'),
        'best_avg_reward': -float('inf')
    }

    np.random.seed(args.seed)
    torch.manual_seed(np.random.randint(1, 10000))
    # Set cpu or gpu
    if torch.cuda.is_available() and not args.disable_cuda:
        args.device = torch.device('cuda')
        torch.cuda.manual_seed(np.random.randint(1, 10000))
        torch.backends.cudnn.enabled = args.enable_cudnn
    else:
        args.device = torch.device('cpu')

    # Simple ISO 8601 timestamped logger
    def log(s):
        print('[' + str(datetime.now().strftime('%Y-%m-%dT%H:%M:%S')) + '] ' + s)

    def load_memory(memory_path, disable_bzip):
        if disable_bzip:
            with open(memory_path, 'rb') as pickle_file:
                return pickle.load(pickle_file)
        else:
            with bz2.open(memory_path, 'rb') as zipped_pickle_file:
                return pickle.load(zipped_pickle_file)

    def save_memory(memory, memory_path, disable_bzip):
        if disable_bzip:
            with open(memory_path, 'wb') as pickle_file:
                pickle.dump(memory, pickle_file)
        else:
            with bz2.open(memory_path, 'wb') as zipped_pickle_file:
                pickle.dump(memory, zipped_pickle_file)

    rail_generator = sparse_rail_generator(
        max_num_cities=args.max_num_cities,
        seed=args.seed,
        grid_mode=args.grid_mode,
        max_rails_between_cities=args.max_rails_between_cities,
        max_rails_in_city=args.max_rails_in_city,
    )
    # Maps speeds to % of appearance in the env
    speed_ration_map = {
        1.: 0.25,        # Fast passenger train
        1. / 2.: 0.25,   # Fast freight train
        1. / 3.: 0.25,   # Slow commuter train
        1. / 4.: 0.25    # Slow freight train
    }
    schedule_generator = sparse_schedule_generator(speed_ration_map)

    stochastic_data = {
        'malfunction_rate': args.malfunction_rate,  # Rate of malfunction occurrence
        'min_duration': args.min_duration,  # Minimal duration of malfunction
        'max_duration': args.max_duration  # Max duration of malfunction
    }

    observation_builder = GraphObsForRailEnv(
        predictor=ShortestPathPredictorForRailEnv(max_depth=args.prediction_depth))

    # Construct the environment with the given observation, generators, predictors, and stochastic data
    env = RailEnv(
        width=args.width,
        height=args.height,
        rail_generator=rail_generator,
        schedule_generator=schedule_generator,
        number_of_agents=args.num_agents,
        obs_builder_object=observation_builder,
        malfunction_generator_and_process_data=malfunction_from_params(stochastic_data)
    )
    env.reset()

    state_size = args.prediction_depth * 4 + 4  # TODO
    # action_space = args.network_action_space
    network_action_dict = {}
    railenv_action_dict = {}
    qvalues = {}  # Map handle: q value for this step

    # Init agent
    dqn = RainbowAgent(args, state_size, env)

    # If a model is provided, and evaluate is false, presumably we want to resume, so try to load memory
    if args.model is not None and not args.evaluate:
        if not args.memory:
            raise ValueError('Cannot resume training without memory save path. Aborting...')
        elif not os.path.exists(args.memory):
            raise ValueError('Could not find memory file at {path}. Aborting...'.format(path=args.memory))
        mem = load_memory(args.memory, args.disable_bzip_memory)
    else:
        # Init one replay buffer for each agent (TODO) Must be updated when the number of agents changes
        mems = [ReplayMemory(args, int(args.memory_capacity / args.num_agents)) for a in range(args.num_agents)]
        # mem = ReplayMemory(args, args.memory_capacity)  # Init empty replay buffer

    priority_weight_increase = (1 - args.priority_weight) / (args.T_max - args.learn_start)

    # Construct validation memory
    val_mem = ReplayMemory(args, args.evaluation_size)
    T = 0
    all_done = True
    update_values = [False] * env.get_num_agents()  # Used to update agent if action was performed in this step

    # Number of transitions to do for validating Q
    print("Validating Q...")
    while T < args.evaluation_size:
        if all_done:
            state, info = env.reset()
            all_done = False
        for a in range(env.get_num_agents()):
            action = np.random.choice(np.arange(5))
            railenv_action_dict.update({a: action})
        next_state, reward, done, info = env.step(railenv_action_dict)
        val_mem.append(state[0], None, None, all_done)  # TODO Using only state from agent 0 for now
        all_done = done['__all__']
        state = next_state
        T += 1

    if args.evaluate:
        dqn.eval()  # Set DQN (online network) to evaluation mode
        avg_done_agents, avg_reward, avg_norm_reward = test(
            args, 0, 0, dqn, val_mem, metrics, results_dir, evaluate=True)  # Test
        # print('Avg. reward: ' + str(avg_reward) + ' | Avg. Q: ' + str(avg_Q))
        print('Avg. done agents: ' + str(avg_done_agents) +
              ' | Avg. cumulative reward: ' + str(avg_reward) +
              ' | Avg. normalized reward: ' + str(avg_norm_reward))
    else:
        # Training loop
        print("Training started...")
        dqn.train()
        ################## Episodes loop #######################
        for ep in trange(1, args.num_episodes + 1):
            # Reset env at the beginning of one episode
            state, info = env.reset()

            # Pick first action - entering of agents is now random
            for a in range(env.get_num_agents()):
                action = np.random.choice((0, 2))
                railenv_action_dict.update({a: action})
            next_state, reward, done, info = env.step(railenv_action_dict)  # Env first step

            ############## Steps loop ##########################
            for T in range(1, args.T_max + 1):
                if T % args.replay_frequency == 0:
                    dqn.reset_noise()  # Draw a new set of noisy weights

                for a in range(env.get_num_agents()):
                    if info['action_required'][a]:
                        network_action = dqn.act(state[a])  # Choose an action greedily (with noisy weights)
                        railenv_action = observation_builder.choose_railenv_action(a, network_action)
                        update_values[a] = True
                        qvalues.update({a: dqn.get_q_values(state[a])})
                    else:
                        network_action = 0
                        railenv_action = 0
                        update_values[a] = False
                        qvalues.update({a: [0, 0]})  # '0' if it wasn't updated
                    # Update action dicts
                    railenv_action_dict.update({a: railenv_action})
                    network_action_dict.update({a: network_action})

                next_state, reward, done, info = env.step(railenv_action_dict)  # Env step

                '''
                if T == 100:  # Print only at the 100th step of each episode
                    if args.debug:
                        for a in range(env.get_num_agents()):
                            print('#########################################')
                            print('Info for agent {}'.format(a))
                            print('Occupancy, first layer: {}'.format(state[a][:args.prediction_depth]))
                            print('Occupancy, second layer: {}'.format(
                                state[a][args.prediction_depth:args.prediction_depth * 2]))
                            print('Forks: {}'.format(state[a][args.prediction_depth * 2:args.prediction_depth * 3]))
                            print('Target: {}'.format(state[a][args.prediction_depth * 3:args.prediction_depth * 4]))
                            print('Priority: {}'.format(state[a][args.prediction_depth * 4]))
                            print('Max priority encountered: {}'.format(state[a][args.prediction_depth * 4 + 1]))
                            print('Num malfunctioning agents (globally): {}'.format(state[a][args.prediction_depth * 4 + 2]))
                            print('Num agents ready to depart (globally): {}'.format(state[a][args.prediction_depth * 4 + 3]))
                            print('Status: {}'.format(info['status'][a]))
                            print('Position: {}'.format(env.agents[a].position))
                            print('Moving? {} at speed: {}'.format(env.agents[a].moving, info['speed'][a]))
                            print('Action required? {}'.format(info['action_required'][a]))
                            print('Network action: {}'.format(network_action_dict[a]))
                            print('Railenv action: {}'.format(railenv_action_dict[a]))
                            print('Q values: {}'.format(qvalues[a]))
                            print('Rewards: {}'.format(reward))
                '''

                # Clip reward and update replay buffer
                for a in range(env.get_num_agents()):
                    '''
                    * Reward is always in [-1, 1], so we shouldn't need clipping
                    if args.reward_clip > 0:
                        reward[a] = max(min(reward[a], args.reward_clip), -args.reward_clip)
                    '''
                    if update_values[a]:  # Store transition only if this agent performed an action in this time step
                        mems[a].append(state[a], network_action_dict[a], reward[a], done[a])  # Append to own buffer
                        # mem.append(state[a], network_action_dict[a], reward[a], done[a])  # Append transition to memory

                # print('Clipped rewards: {}'.format(reward))
                state = next_state.copy()

                # Train and test
                if ep >= args.learn_start:  # Give time to accumulate experiences
                    # Anneal importance sampling weight β to 1
                    # mem.priority_weight = min(mem.priority_weight + priority_weight_increase, 1)
                    for a in range(args.num_agents):
                        mems[a].priority_weight = min(mems[a].priority_weight + priority_weight_increase, 1)

                    if T % args.replay_frequency == 0:
                        a = np.random.choice(np.arange(args.num_agents))
                        dqn.learn(mems[a])  # Learn randomly from one of the available replay buffers
                        # dqn.learn(mem)  # Train with n-step distributional double-Q learning

                    # Update target network
                    if T % args.target_update == 0:
                        dqn.update_target_net()

                if done['__all__']:
                    break
            ##### EPISODE END ##############

            if (ep % args.evaluation_interval) == 0:  # Evaluate only at the end of the episodes
                dqn.eval()  # Set DQN (online network) to evaluation mode
                avg_done_agents, avg_reward, avg_norm_reward = test(
                    args, T, ep, dqn, val_mem, metrics, results_dir)  # Test
                log('T = ' + str(T) + ' / ' + str(args.T_max) +
                    ' | Avg. done agents: ' + str(avg_done_agents) +
                    ' | Avg. reward: ' + str(avg_reward) +
                    ' | Avg. normalized reward: ' + str(avg_norm_reward))
                dqn.train()  # Set DQN (online network) back to training mode

                # If memory path provided, save it
                if args.memory is not None:
                    save_memory(mems[0], args.memory, args.disable_bzip_memory)  # Save only first replay buffer (?)
                    # save_memory(mem, args.memory, args.disable_bzip_memory)

            # Checkpoint the network every 'checkpoint_interval' episodes
            if (args.checkpoint_interval != 0) and (ep % args.checkpoint_interval == 0):
                dqn.save(results_dir, 'checkpoint.pth')
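# The training entry point expects an argparse.Namespace carrying every field
# accessed above (env geometry, malfunction, replay, and schedule flags). A
# trimmed, hypothetical sketch of the wiring; the real script must define the
# full flag set before calling main():
if __name__ == '__main__':
    import argparse

    parser = argparse.ArgumentParser(description='Rainbow DQN on Flatland')
    parser.add_argument('--id', type=str, default='default', help='Experiment name (results subfolder)')
    parser.add_argument('--seed', type=int, default=1, help='Random seed')
    parser.add_argument('--width', type=int, default=20, help='Env width in cells')
    parser.add_argument('--height', type=int, default=20, help='Env height in cells')
    parser.add_argument('--num-agents', dest='num_agents', type=int, default=4, help='Number of trains')
    parser.add_argument('--prediction-depth', dest='prediction_depth', type=int, default=40,
                        help='Max depth of the shortest-path predictor')
    parser.add_argument('--evaluate', action='store_true', help='Evaluate only (no training)')
    # ... remaining flags (malfunction rates, replay memory, T_max, etc.) omitted ...
    main(parser.parse_args())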
def main(args):
    rail_generator = sparse_rail_generator(
        max_num_cities=args.max_num_cities,
        seed=args.seed,
        grid_mode=args.grid_mode,
        max_rails_between_cities=args.max_rails_between_cities,
        max_rails_in_city=args.max_rails_in_city,
    )
    # Maps speeds to % of appearance in the env
    speed_ration_map = {
        1.: 0.25,        # Fast passenger train
        1. / 2.: 0.25,   # Fast freight train
        1. / 3.: 0.25,   # Slow commuter train
        1. / 4.: 0.25    # Slow freight train
    }
    schedule_generator = sparse_schedule_generator(speed_ration_map)

    '''
    THIS WORKS WITH NEXT VERSION
    stochastic_data = MalfunctionParameters(
        malfunction_rate=args.malfunction_rate,  # Rate of malfunction occurrence of single agent
        min_duration=args.min_duration,  # Minimal duration of malfunction
        max_duration=args.max_duration  # Max duration of malfunction
    )
    '''
    stochastic_data = {
        'malfunction_rate': args.malfunction_rate,
        'min_duration': args.min_duration,
        'max_duration': args.max_duration
    }

    if args.observation_builder == 'GraphObsForRailEnv':
        prediction_depth = args.prediction_depth
        bfs_depth = args.bfs_depth
        observation_builder = GraphObsForRailEnv(
            bfs_depth=bfs_depth,
            predictor=ShortestPathPredictorForRailEnv(max_depth=prediction_depth))
        state_size = args.prediction_depth * 3 + 4  # TODO
        network_action_size = 2  # {follow path, stop}
        railenv_action_size = 5  # The RailEnv possible actions
        agent = Agent(network_type='fc', state_size=state_size, action_size=network_action_size)
    elif args.observation_builder == 'LocalObsForRailEnv':
        observation_builder = LocalObsForRailEnv(args.view_semiwidth, args.view_height, args.offset)
        # state_size = (2 * args.view_semiwidth + 1) * args.height
        state_size = 16 + 5 + 2  # state_size == in_channels
        railenv_action_size = 5
        agent = Agent(network_type='conv', state_size=state_size, action_size=railenv_action_size)

    # Construct the environment with the given observation, generators, predictors, and stochastic data
    env = RailEnv(
        width=args.width,
        height=args.height,
        rail_generator=rail_generator,
        schedule_generator=schedule_generator,
        number_of_agents=args.num_agents,
        obs_builder_object=observation_builder,
        malfunction_generator_and_process_data=malfunction_from_params(stochastic_data),
        remove_agents_at_target=True)
    env.reset()

    # max_steps = env.compute_max_episode_steps(args.width, args.height, args.num_agents / args.max_num_cities)
    max_steps = 200  # TODO DEBUG
    eps = 1.
    eps_end = 0.005
    eps_decay = 0.998

    # Need to have two since env works with RailEnv actions but agent works with network actions
    network_action_dict = dict()
    railenv_action_dict = dict()

    scores_window = deque(maxlen=100)
    done_window = deque(maxlen=100)
    scores = []
    dones_list = []
    action_prob = [0] * railenv_action_size
    agent_obs = [None] * env.get_num_agents()
    agent_obs_buffer = [None] * env.get_num_agents()
    agent_action_buffer = [2] * env.get_num_agents()
    update_values = [False] * env.get_num_agents()  # Used to update agent if action was performed in this step
    qvalues = {}

    for ep in range(1, args.num_episodes + 1):
        obs, info = env.reset()

        if args.observation_builder == 'GraphObsForRailEnv':
            for a in range(env.get_num_agents()):
                agent_obs[a] = obs[a].copy()
                agent_obs_buffer[a] = agent_obs[a].copy()
            for a in range(env.get_num_agents()):
                action = np.random.choice((0, 2))
                railenv_action_dict.update({a: action})  # Let agents enter at random at the start TODO Try this
            next_obs, all_rewards, done, info = env.step(railenv_action_dict)
        # Normalize obs, only for LocalObs now
        elif args.observation_builder == 'LocalObsForRailEnv':
            for a in range(env.get_num_agents()):
                if obs[a]:
                    agent_obs[a] = preprocess_obs(obs[a])
                    agent_obs_buffer[a] = agent_obs[a].copy()

        score = 0
        env_done = 0

        ############# Main loop
        for step in range(max_steps - 1):
            '''
            print(
                '\r{} Agents on ({},{}).\t Ep: {}\t Step/MaxSteps: {} / {}'.format(
                    env.get_num_agents(), args.width, args.height,
                    ep, step, max_steps), end=" ")
            '''
            # Logging
            # print_info(env)

            for a in range(env.get_num_agents()):
                if args.observation_builder == 'GraphObsForRailEnv':
                    if info['action_required'][a]:
                        # 'railenv_action' is in [0, 4], 'network_action' is in [0, 1]
                        network_action = agent.act(agent_obs[a], eps=eps)
                        # Pick railenv action according to network decision whether it's safe to go or to stop
                        railenv_action = observation_builder.choose_railenv_action(a, network_action)
                        update_values[a] = True
                        qvalues.update({a: agent.get_q_values(agent_obs[a])})
                    else:
                        network_action = 0
                        railenv_action = 0
                        update_values[a] = False
                        qvalues.update({a: [0, 0]})
                    # Update action dicts
                    action_prob[railenv_action] += 1
                    railenv_action_dict.update({a: railenv_action})
                    network_action_dict.update({a: network_action})
                elif args.observation_builder == 'LocalObsForRailEnv':
                    if info['action_required'][a]:
                        railenv_action = agent.act(agent_obs[a], eps=eps)
                        update_values[a] = True
                    else:
                        railenv_action = 0  # If action is not required DO_NOTHING
                        update_values[a] = False
                    action_prob[railenv_action] += 1
                    railenv_action_dict.update({a: railenv_action})

            # Environment step
            next_obs, all_rewards, done, info = env.step(railenv_action_dict)

            if step == 100:
                print('QValues: {}'.format(qvalues))

            # Update replay buffer and train agent
            for a in range(env.get_num_agents()):
                if update_values[a] or done[a]:
                    if args.observation_builder == 'GraphObsForRailEnv':
                        agent.step(agent_obs_buffer[a], network_action_dict[a], all_rewards[a], agent_obs[a], done[a])
                    else:
                        # LocalObs works directly with railenv actions (network_action_dict is unused here)
                        agent.step(agent_obs_buffer[a], railenv_action_dict[a], all_rewards[a], agent_obs[a], done[a])
                    agent_obs_buffer[a] = agent_obs[a].copy()
                    '''
                    if args.observation_builder == 'GraphObsForRailEnv':
                        agent_action_buffer[a] = network_action_dict[a]
                    elif args.observation_builder == 'LocalObsForRailEnv':
                        agent_action_buffer[a] = railenv_action_dict[a]
                    '''
                # Preprocessing and normalization
                if args.observation_builder == 'GraphObsForRailEnv':
                    agent_obs[a] = next_obs[a].copy()
                if args.observation_builder == 'LocalObsForRailEnv' and next_obs[a]:
                    agent_obs[a] = preprocess_obs(next_obs[a])
                score += all_rewards[a] / env.get_num_agents()  # Update score

            if done['__all__']:
                env_done = 1
                break

        ################### At the end of the episode TODO This part could be done in another script
        eps = max(eps_end, eps_decay * eps)  # Decrease epsilon

        # Metrics
        done_window.append(env_done)
        num_agents_done = 0  # Num of agents that reached their target
        for a in range(env.get_num_agents()):
            if done[a]:
                num_agents_done += 1

        scores_window.append(score / max_steps)  # Save most recent score
        scores.append(np.mean(scores_window))
        dones_list.append(np.mean(done_window))

        action_prob_float = np.array(action_prob) / np.sum(action_prob)
        formatted_action_prob = ['{:.5f}'.format(ap) for ap in action_prob_float]

        # Print training results info
        print(
            '\r{} Agents on ({},{}).\t Ep: {}\t Avg Score: {:.3f}\t Env Dones so far: {:.2f}%\t Done Agents in ep: {:.2f}%\t Eps: {:.2f}\t Action Probs: {} '
            .format(env.get_num_agents(), args.width, args.height, ep,
                    np.mean(scores_window), 100 * np.mean(done_window),
                    100 * (num_agents_done / args.num_agents), eps,
                    formatted_action_prob), end=" ")

        if ep % 50 == 0:
            print(
                '\rTraining {} Agents.\t Episode {}\t Average Score: {:.3f}\tDones: {:.2f}%\tDone Agents in ep: {:.2f}%\tEpsilon: {:.2f} \t Action Probabilities: \t {}'
                .format(env.get_num_agents(), ep, np.mean(scores_window),
                        100 * np.mean(done_window),
                        100 * (num_agents_done / args.num_agents), eps,
                        formatted_action_prob))
            torch.save(agent.qnetwork_local.state_dict(),
                       './nets/' + str(args.model_name) + str(ep) + '.pth')
            action_prob = [1] * railenv_action_size
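# For reference: with eps_decay = 0.998 and eps_end = 0.005, epsilon follows
# eps(ep) = max(0.005, 0.998 ** ep), so it halves roughly every 347 episodes
# (ln 0.5 / ln 0.998) and reaches the 0.005 floor after about 2650 episodes
# (ln 0.005 / ln 0.998). A quick check of that schedule:
def eps_at(ep, eps_start=1.0, eps_end=0.005, eps_decay=0.998):
    return max(eps_end, eps_start * eps_decay ** ep)

assert abs(eps_at(347) - 0.5) < 0.01  # ~halved after ~347 episodes
assert eps_at(2700) == 0.005          # floor reached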