Code example #1
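The snippets below come from a Flatland-based multi-agent train scheduling project. They are excerpts, so each assumes imports roughly like the following (module paths as in flatland-rl 2.x; the project-local helpers listed in the trailing comment are assumptions about the repository layout rather than confirmed paths):

import bz2
import os
import pickle
import time
from collections import deque
from datetime import datetime

import numpy as np
import torch
from tqdm import trange

from flatland.envs.rail_env import RailEnv
from flatland.envs.rail_generators import sparse_rail_generator
from flatland.envs.schedule_generators import sparse_schedule_generator
from flatland.envs.malfunction_generators import malfunction_from_params
from flatland.envs.predictions import ShortestPathPredictorForRailEnv
from flatland.utils.rendertools import RenderTool, AgentRenderVariant

# Project-local helpers referenced below (hypothetical module locations):
# GraphObsForRailEnv, LocalObsForRailEnv, RainbowAgent, Agent, ReplayMemory,
# stateMachine, TestBattery, preprocess_obs, _plot_line, log, ...
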
def test(args, T, ep, dqn, val_mem, metrics, results_dir, evaluate=False):
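    # Evaluate the current policy: play 'evaluation_episodes' freshly generated episodes,
    # track cumulative rewards and the fraction of agents reaching their target, and probe
    # Q-values on the held-out validation memory (val_mem).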

    # Init env and set in evaluation mode
    # Maps speeds to % of appearance in the env
    speed_ration_map = {
        1.: 0.25,  # Fast passenger train
        1. / 2.: 0.25,  # Fast freight train
        1. / 3.: 0.25,  # Slow commuter train
        1. / 4.: 0.25  # Slow freight train
    }

    schedule_generator = sparse_schedule_generator(speed_ration_map)

    observation_builder = GraphObsForRailEnv(
        predictor=ShortestPathPredictorForRailEnv(
            max_depth=args.prediction_depth))

    env = RailEnv(
        width=args.width,
        height=args.height,
        rail_generator=sparse_rail_generator(
            max_num_cities=args.max_num_cities,
            seed=ep,  # Use episode as seed when evaluation is performed during training
            grid_mode=args.grid_mode,
            max_rails_between_cities=args.max_rails_between_cities,
            max_rails_in_city=args.max_rails_in_city,
        ),
        schedule_generator=schedule_generator,
        number_of_agents=args.num_agents,
        obs_builder_object=observation_builder,
        malfunction_generator_and_process_data=malfunction_from_params(
            parameters={
                'malfunction_rate': args.malfunction_rate,
                'min_duration': args.min_duration,
                'max_duration': args.max_duration
            }),
    )

    if args.render:
        env_renderer = RenderTool(
            env,
            gl="PILSVG",
            agent_render_variant=AgentRenderVariant.AGENT_SHOWS_OPTIONS_AND_BOX,
            show_debug=True,
            screen_height=1080,
            screen_width=1920)

    #max_time_steps = env.compute_max_episode_steps(env.width, env.height)
    max_time_steps = 200  # TODO Debug
    # metrics['steps'].append(T)
    metrics['episodes'].append(ep)
    T_rewards = []  # List of episodes rewards
    T_Qs = []  # List of Q-values
    T_num_done_agents = []  # List of number of done agents for each episode
    T_all_done = []  # If all agents completed in each episode
    network_action_dict = dict()
    railenv_action_dict = dict()
    qvalues = {}

    # Test performance over several episodes
    for ep in range(args.evaluation_episodes):
        # Reset info
        state, info = env.reset()
        reward_sum, all_done = 0, False  # reward_sum accumulates the total reward over the episode steps
        num_done_agents = 0
        if args.render:
            env_renderer.reset()

        # Choose first action - decide entering of agents into the environment
        for a in range(env.get_num_agents()):
            action = np.random.choice((0, 2))
            railenv_action_dict.update({a: action})
        state, reward, done, info = env.step(railenv_action_dict)  # Env step
        reward_sum += sum(reward[a] for a in range(env.get_num_agents()))

        if args.render:
            env_renderer.render_env(show=True,
                                    show_observations=False,
                                    show_predictions=True)

        for step in range(max_time_steps - 1):
            # Choose actions
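            # The DQN only outputs a binary 'network action' (keep following the shortest
            # path vs. stop); choose_railenv_action translates it into one of the five
            # RailEnv actions for this agent.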
            for a in range(env.get_num_agents()):
                if info['action_required'][a]:
                    network_action = dqn.act(state[a])  # Choose an action greedily (with noisy weights)
                    # network_action = 0
                    railenv_action = observation_builder.choose_railenv_action(
                        a, network_action)
                    qvalues.update({a: dqn.get_q_values(state[a])})
                else:
                    network_action = 0
                    railenv_action = 0
                    qvalues.update({a: [0, 0]})  # Placeholder Q-values when the agent was not queried

                railenv_action_dict.update({a: railenv_action})
                network_action_dict.update({a: network_action})

            if args.debug:
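                # The GraphObs state vector is laid out as four blocks of length
                # 'prediction_depth' (two occupancy layers, forks, target) followed by
                # a handful of scalar features, which is what the slicing below unpacks.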
                for a in range(env.get_num_agents()):
                    print('#########################################')
                    print('Info for agent {}'.format(a))
                    print('Occupancy, first layer: {}'.format(
                        state[a][:args.prediction_depth]))
                    print('Occupancy, second layer: {}'.format(
                        state[a][args.prediction_depth:args.prediction_depth *
                                 2]))
                    print('Forks: {}'.format(
                        state[a][args.prediction_depth *
                                 2:args.prediction_depth * 3]))
                    print('Target: {}'.format(
                        state[a][args.prediction_depth *
                                 3:args.prediction_depth * 4]))
                    print('Priority: {}'.format(
                        state[a][args.prediction_depth * 4]))
                    print('Max priority encountered: {}'.format(
                        state[a][args.prediction_depth * 4 + 1]))
                    print('Num malfunctioning agents (globally): {}'.format(
                        state[a][args.prediction_depth * 4 + 2]))
                    print('Num agents ready to depart (globally): {}'.format(
                        state[a][args.prediction_depth * 4 + 3]))
                    print('Status: {}'.format(info['status'][a]))
                    print('Position: {}'.format(env.agents[a].position))
                    print('Moving? {} at speed: {}'.format(
                        env.agents[a].moving, info['speed'][a]))
                    print('Action required? {}'.format(
                        info['action_required'][a]))
                    print('Network action: {}'.format(network_action_dict[a]))
                    print('Railenv action: {}'.format(railenv_action_dict[a]))
                    print('Q values: {}'.format(qvalues[a]))
                    # print('QValues: {}'.format(qvalues))
                    print('Rewards: {}'.format(reward[a]))

            # Breakpoint for debugging here
            state, reward, done, info = env.step(
                railenv_action_dict)  # Env step
            if args.render:
                env_renderer.render_env(show=True,
                                        show_observations=False,
                                        show_predictions=True)

            reward_sum += sum(reward[a] for a in range(env.get_num_agents()))

            if done['__all__']:
                all_done = True
                break
        # No need to close the renderer since env parameter sizes stay the same
        T_rewards.append(reward_sum)
        # Compute num of agents that reached their target
        for a in range(env.get_num_agents()):
            if done[a]:
                num_done_agents += 1
        T_num_done_agents.append(
            num_done_agents / env.get_num_agents())  # In proportion to total
        T_all_done.append(all_done)

    # Test Q-values over validation memory
    for state in val_mem:  # Iterate over valid states
        T_Qs.append(dqn.evaluate_q(state))
    if args.debug:
        print('T_Qs: {}'.format(T_Qs))  # These are Qs from a single agent TODO

    avg_done_agents = sum(T_num_done_agents) / len(
        T_num_done_agents
    )  # Average number of agents that reached their target
    avg_reward = sum(T_rewards) / len(T_rewards)
    avg_norm_reward = avg_reward / (max_time_steps / env.get_num_agents())

    # avg_reward, avg_Q = sum(T_rewards) / len(T_rewards), sum(T_Qs) / len(T_Qs)
    if not evaluate:
        # Save model parameters if improved
        if avg_done_agents > metrics['best_avg_done_agents']:
            metrics['best_avg_done_agents'] = avg_done_agents
            dqn.save(results_dir)

        # Append to results and save metrics
        metrics['rewards'].append(T_rewards)
        metrics['Qs'].append(T_Qs)
        torch.save(metrics, os.path.join(results_dir, 'metrics.pth'))

        # Plot HTML
        _plot_line(metrics['episodes'],
                   metrics['rewards'],
                   'Reward',
                   path=results_dir)  # Plot rewards in episodes
        _plot_line(metrics['episodes'], metrics['Qs'], 'Q', path=results_dir)

    # Return average number of done agents (in proportion) and average reward
    return avg_done_agents, avg_reward, avg_norm_reward
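
For reference, here is a minimal sketch of the idea behind choose_railenv_action as used above. The real implementation lives in the project's GraphObsForRailEnv and may differ; the helper name and the shortest_path_action argument are illustrative assumptions, and the 0 = "follow path" / 1 = "stop" convention is inferred from the comment in code example #7.

from flatland.envs.rail_env import RailEnvActions


def choose_railenv_action_sketch(shortest_path_action, network_action):
    # Hypothetical helper: 'shortest_path_action' stands for the RailEnv action that keeps
    # the agent on its precomputed shortest path (MOVE_FORWARD, or a turn at a switch).
    # Assumed mapping: network action 1 stops the agent, 0 keeps it on its path.
    if network_action == 1:
        return RailEnvActions.STOP_MOVING
    return shortest_path_action
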
Code example #2
def main(args, dir):
    '''
    :param args: parsed command-line arguments
    :param dir: sub-directory name used when dumping baseline images
    :return:
    Episodes to debug (set a breakpoint in the episodes loop to debug):
    - ep = 3: agent 1 spawns in front of agent 3, blocking its path; agents 0 and 2 are in a deadlock since they have the same priority
    - ep = 4: agents stop because of wrong priorities even though the conflict zone wasn't entered
    - ep = 14
    '''
    rail_generator = sparse_rail_generator(
        max_num_cities=args.max_num_cities,
        seed=args.seed,
        grid_mode=args.grid_mode,
        max_rails_between_cities=args.max_rails_between_cities,
        max_rails_in_city=args.max_rails_in_city,
    )

    # Maps speeds to % of appearance in the env
    speed_ration_map = {
        1.: 0.25,  # Fast passenger train
        1. / 2.: 0.25,  # Fast freight train
        1. / 3.: 0.25,  # Slow commuter train
        1. / 4.: 0.25  # Slow freight train
    }

    observation_builder = GraphObsForRailEnv(
        predictor=ShortestPathPredictorForRailEnv(
            max_depth=args.prediction_depth),
        bfs_depth=4)

    env = RailEnv(
        width=args.width,
        height=args.height,
        rail_generator=rail_generator,
        schedule_generator=sparse_schedule_generator(speed_ration_map),
        number_of_agents=args.num_agents,
        obs_builder_object=observation_builder,
        malfunction_generator_and_process_data=malfunction_from_params(
            parameters={
                'malfunction_rate': args.malfunction_rate,  # Rate of malfunction occurrence
                'min_duration': args.min_duration,  # Minimal duration of malfunction
                'max_duration': args.max_duration  # Max duration of malfunction
            }))

    if args.render or args.save_image:  # The renderer is also needed when dumping images
        env_renderer = RenderTool(
            env,
            agent_render_variant=AgentRenderVariant.AGENT_SHOWS_OPTIONS_AND_BOX,
            show_debug=True)

    sm = stateMachine()
    tb = TestBattery(env, observation_builder)
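    # In this script the controller is purely rule-based: the test battery turns the graph
    # observation into triggers and the state machine maps those triggers to a per-agent
    # action, so no learned policy is involved.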

    state_machine_action_dict = {}
    railenv_action_dict = {}
    # max_time_steps = env.compute_max_episode_steps(args.width, args.height)
    max_time_steps = 200
    T_rewards = []  # List of episodes rewards
    T_Qs = []  # List of q values
    T_num_done_agents = []  # List of number of done agents for each episode
    T_all_done = []  # If all agents completed in each episode
    T_episodes = []  # Time taken for each episode

    if args.save_image and not os.path.isdir("image_dump"):
        os.makedirs("image_dump")

    step_taken = 0
    total_step_taken = 0
    total_episodes = 0
    step_times = []  # Time taken for each step

    for ep in range(args.num_episodes):
        # Reset info at the beginning of an episode
        start_time = time.time()  # Take time of one episode

        if args.generate_baseline:
            if not os.path.isdir("image_dump/" + str(dir)) and args.save_image:
                os.makedirs("image_dump/" + str(dir))
        else:
            if not os.path.isdir("image_dump/" + str(ep)) and args.save_image:
                os.makedirs("image_dump/" + str(ep))

        state, info = env.reset()
        tb.reset()

        if args.render:
            env_renderer.reset()
        reward_sum, all_done = 0, False  # reward_sum accumulates the total reward over the episode steps
        num_done_agents = 0

        state_machine_action = {}
        for i in range(env.number_of_agents):
            state_machine_action[i] = 0

        for step in range(max_time_steps):
            start_step_time = time.time()

            #if step % 10 == 0:
            #	print(step)

            # Test battery
            # see test_battery.py
            triggers = tb.tests(state, args.prediction_depth,
                                state_machine_action)
            # state machine based on triggers of test battery
            # see state_machine.py
            state_machine_action = sm.act(
                triggers)  # State machine picks action

            for a in range(env.get_num_agents()):
                #if info['action_required'][a]:
                #	#railenv_action = observation_builder.choose_railenv_action(a, state_machine_action[a])
                #	railenv_action = observation_builder.choose_railenv_action(a, state_machine_action[a])
                #	state_machine_action_dict.update({a: state_machine_action})
                #	railenv_action_dict.update({a: railenv_action})
                # railenv_action = observation_builder.choose_railenv_action(a, state_machine_action[a])
                railenv_action = observation_builder.choose_railenv_action(
                    a, state_machine_action[a])
                state_machine_action_dict.update({a: state_machine_action[a]})
                railenv_action_dict.update({a: railenv_action})

            state, reward, done, info = env.step(
                railenv_action_dict)  # Env step

            if args.render or args.save_image:  # Render only when a renderer was created
                if args.generate_baseline:
                    env_renderer.render_env(show=False,
                                            show_observations=False,
                                            show_predictions=True)
                else:
                    env_renderer.render_env(show=True,
                                            show_observations=False,
                                            show_predictions=True)

            if args.generate_baseline:
                if args.save_image:
                    env_renderer.save_image("image_dump/" + str(dir) +
                                            "/image_" + str(step) + "_.png")
            else:
                if args.save_image:
                    env_renderer.save_image("image_dump/" + str(ep) +
                                            "/image_" + str(step) + "_.png")

            if args.debug:
                for a in range(env.get_num_agents()):
                    log('\n\n#########################################')
                    log('\nInfo for agent {}'.format(a))
                    #log('\npath : {}'.format(state[a]["path"]))
                    log('\noverlap : {}'.format(state[a]["overlap"]))
                    log('\ndirection : {}'.format(state[a]["direction"]))
                    log('\nOccupancy, first layer: {}'.format(
                        state[a]["occupancy"]))
                    log('\nOccupancy, second layer: {}'.format(
                        state[a]["conflict"]))
                    log('\nForks: {}'.format(state[a]["forks"]))
                    log('\nTarget: {}'.format(state[a]["target"]))
                    log('\nPriority: {}'.format(state[a]["priority"]))
                    log('\nMax priority encountered: {}'.format(
                        state[a]["max_priority"]))
                    log('\nNum malfunctioning agents (globally): {}'.format(
                        state[a]["n_malfunction"]))
                    log('\nNum agents ready to depart (globally): {}'.format(
                        state[a]["ready_to_depart"]))
                    log('\nStatus: {}'.format(info['status'][a]))
                    log('\nPosition: {}'.format(env.agents[a].position))
                    log('\nTarget: {}'.format(env.agents[a].target))
                    log('\nMoving? {} at speed: {}'.format(
                        env.agents[a].moving, info['speed'][a]))
                    log('\nAction required? {}'.format(
                        info['action_required'][a]))
                    log('\nState machine action: {}'.format(
                        state_machine_action_dict[a]))
                    log('\nRailenv action: {}'.format(railenv_action_dict[a]))
                    log('\nRewards: {}'.format(reward[a]))
                    log('\n\n#########################################')

            reward_sum += sum(reward[a] for a in range(env.get_num_agents()))

            step_taken = step
            time_taken_step = time.time() - start_step_time
            step_times.append(time_taken_step)

            if done['__all__']:
                all_done = True
                break

        total_step_taken += step_taken

        time_taken = time.time() - start_time  # Time taken for one episode
        T_episodes.append(time_taken)
        total_episodes = ep

        # Average time per step, accumulated over all steps seen so far
        avg_time_step = sum(step_times) / len(step_times)
        #print("Avg time step: " + str(avg_time_step))

        # No need to close the renderer since env parameter sizes stay the same
        T_rewards.append(reward_sum)
        # Compute num of agents that reached their target
        for a in range(env.get_num_agents()):
            if done[a]:
                num_done_agents += 1
        percentage_done_agents = num_done_agents / env.get_num_agents()
        log("\nDone agents in episode: {}".format(percentage_done_agents))
        T_num_done_agents.append(
            percentage_done_agents)  # In proportion to total
        T_all_done.append(all_done)

    # Average number of agents that reached their target
    avg_done_agents = sum(T_num_done_agents) / len(T_num_done_agents) if len(
        T_num_done_agents) > 0 else 0
    avg_reward = sum(T_rewards) / len(T_rewards) if len(T_rewards) > 0 else 0
    avg_norm_reward = avg_reward / (max_time_steps / env.get_num_agents())

    avg_ep_time = sum(T_episodes) / args.num_episodes

    if total_episodes == 0:
        total_episodes = 1

    log("\nSeed: " + str(args.seed) \
      + "\t | Avg_done_agents: " + str(avg_done_agents)\
      + "\t | Avg_reward: " + str(avg_reward)\
      + "\t | Avg_norm_reward: " + str(avg_norm_reward)\
      + "\t | Max_num_time_steps: " + str(max_time_steps)\
      + "\t | Avg_num_time_steps: " + str(total_step_taken/total_episodes)
            + "\t | Avg episode time: " + str(avg_ep_time))
Code example #3
        print('\rTest: {}\t Step / MaxSteps: {} / {}'.format(
            test, step + 1, max_time_steps),
              end=" ")
        '''
        for agent_idx, agent in enumerate(env.agents):
            print(
                "Agent {} has state {} in (current) position {} with malfunction {}".format(
                    agent_idx, str(agent.status), str(agent.position), str(agent.malfunction_data['malfunction'])))
        '''
        # Choose an action for each agent in the environment
        for a in range(env.get_num_agents()):
            if info['action_required'][a]:
                # print('Agent {} needs to submit an action'.format(a))
                # 'railenv_action' is in [0, 4], 'network_action' is in [0, 1]
                network_action = controller.act(obs[a])
                railenv_action = observation_builder.choose_railenv_action(
                    a, network_action)
            else:
                network_action = 0
                railenv_action = 0  # DO NOTHING

            railenv_action_dict.update({a: railenv_action})
            network_action_dict.update({a: network_action})

        #for a in range(env.get_num_agents()):

        for a in (0, 1, 2, 3):
            print('#########################################')
            print('Info for agent {}'.format(a))
            print('Obs: {}'.format(obs[a]))
            print('Status: {}'.format(info['status'][a]))
            print('Moving? {} at speed: {}'.format(env.agents[a].moving,
Code example #4
def main(args):

    rail_generator = sparse_rail_generator(
        max_num_cities=args.max_num_cities,
        #seed=args.seed,
        seed=0,  # 0, 3, 7, 10, 14, 16, 20, 22, 23, 25, 26, 32
        grid_mode=args.grid_mode,
        max_rails_between_cities=args.max_rails_between_cities,
        max_rails_in_city=args.max_rails_in_city,
    )

    # Maps speeds to % of appearance in the env
    speed_ration_map = {
        1.: 0.25,  # Fast passenger train
        1. / 2.: 0.25,  # Fast freight train
        1. / 3.: 0.25,  # Slow commuter train
        1. / 4.: 0.25  # Slow freight train
    }

    observation_builder = GraphObsForRailEnv(
        bfs_depth=args.bfs_depth,
        predictor=ShortestPathPredictorForRailEnv(
            max_depth=args.prediction_depth))

    # Construct the environment with the given observation, generators, predictors, and stochastic data
    env = RailEnv(
        width=args.width,
        height=args.height,
        rail_generator=rail_generator,  # rail_from_file could be used here instead
        schedule_generator=sparse_schedule_generator(speed_ration_map),
        number_of_agents=args.num_agents,
        obs_builder_object=observation_builder,
        malfunction_generator_and_process_data=malfunction_from_params(
            parameters={
                'malfunction_rate': args.malfunction_rate,  # Rate of malfunction occurrence
                'min_duration': args.min_duration,  # Minimal duration of malfunction
                'max_duration': args.max_duration  # Max duration of malfunction
            }))

    env_renderer = RenderTool(
        env,
        agent_render_variant=AgentRenderVariant.AGENT_SHOWS_OPTIONS_AND_BOX,
        show_debug=True,
        screen_height=1080,
        screen_width=1920)

    state_machine_action_dict = {}
    railenv_action_dict = {}
    max_time_steps = 150
    T_rewards = []  # List of episodes rewards
    T_Qs = []  # List of q values
    T_num_done_agents = []  # List of number of done agents for each episode
    T_all_done = []  # If all agents completed in each episode

    for ep in range(args.num_episodes):
        # Reset info at the beginning of an episode
        # env.load(filename="map" + str(ep))
        state, info = env.reset()
        # env.save(filename="map" + str(ep))
        env_renderer.reset()
        reward_sum, all_done = 0, False  # reward_sum accumulates the total reward over the episode steps
        num_done_agents = 0

        for step in range(max_time_steps):

            for a in range(env.get_num_agents()):
                shortest_path_prediction = observation_builder.cells_sequence[a]
                state_machine_action, is_alternative = act(
                    args, env, a, state[a],
                    shortest_path_prediction)  # State machine picks action
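                # 'is_alternative' means the state machine already returned a concrete RailEnv
                # action, so it is used directly instead of being translated by the observation builder.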
                if not is_alternative:
                    railenv_action = observation_builder.choose_railenv_action(
                        a, state_machine_action)
                else:
                    railenv_action = state_machine_action
                state_machine_action_dict.update({a: state_machine_action})
                railenv_action_dict.update({a: railenv_action})

            state, reward, done, info = env.step(
                railenv_action_dict)  # Env step
            env_renderer.render_env(show=True,
                                    show_observations=False,
                                    show_predictions=True)

            for a in range(env.get_num_agents()):
                print('#########################################')
                print('Info for agent {}'.format(a))
                print('Occupancy, first layer: {}'.format(
                    state[a][:args.prediction_depth]))
                print('Occupancy, second layer: {}'.format(
                    state[a][args.prediction_depth:args.prediction_depth * 2]))
                print('Forks: {}'.format(
                    state[a][args.prediction_depth * 2:args.prediction_depth *
                             3]))
                print('Target: {}'.format(
                    state[a][args.prediction_depth * 3:args.prediction_depth *
                             4]))
                print('Priority: {}'.format(state[a][args.prediction_depth *
                                                     4]))
                print('Max priority encountered: {}'.format(
                    state[a][args.prediction_depth * 4 + 1]))
                print('Num malfunctioning agents (globally): {}'.format(
                    state[a][args.prediction_depth * 4 + 2]))
                print('Num agents ready to depart (globally): {}'.format(
                    state[a][args.prediction_depth * 4 + 3]))
                print('Status: {}'.format(info['status'][a]))
                print('Position: {}'.format(env.agents[a].position))
                print('Moving? {} at speed: {}'.format(env.agents[a].moving,
                                                       info['speed'][a]))
                print('Action required? {}'.format(info['action_required'][a]))
                print('State machine action: {}'.format(
                    state_machine_action_dict[a]))
                print('Railenv action: {}'.format(railenv_action_dict[a]))
                # print('Q values: {}'.format(qvalues[a]))
                print('Rewards: {}'.format(reward[a]))

            reward_sum += sum(reward[a] for a in range(env.get_num_agents()))

            if done['__all__']:
                all_done = True
                break
        # No need to close the renderer since env parameter sizes stay the same
        T_rewards.append(reward_sum)
        # Compute num of agents that reached their target
        for a in range(env.get_num_agents()):
            if done[a]:
                num_done_agents += 1
        T_num_done_agents.append(
            num_done_agents / env.get_num_agents())  # In proportion to total
        T_all_done.append(all_done)

    avg_done_agents = sum(T_num_done_agents) / len(
        T_num_done_agents
    )  # Average number of agents that reached their target
    avg_reward = sum(T_rewards) / len(T_rewards)
    avg_norm_reward = avg_reward / (max_time_steps / env.get_num_agents())

    print("Avg. done agents: {}".format(avg_done_agents))
    print("Avg. reward: {}".format(avg_reward))
    print("Avg. norm reward: {}".format(avg_norm_reward))
Code example #5
        #
        #####################################################################
        # Compute the action for this step by using the previously 
        # defined controller
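        # This fragment is the evaluation loop used with the Flatland remote client: the same
        # test-battery + state-machine controller as above picks the actions that are then
        # submitted to the evaluator via remote_client.env_step().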
        time_start = time.time()

        # Test battery
        # see test_battery.py
        triggers = tb.tests(state, prediction_depth, state_machine_action)
        # state machine based on triggers of test battery
        # see state_machine.py
        state_machine_action = sm.act(triggers) # State machine picks action

        for a in range(number_of_agents):
            #state_machine_action = act(prediction_depth, state[a])  # State machine picks action
            railenv_action = observation_builder.choose_railenv_action(a, state_machine_action)
            # state_machine_action_dict.update({a: state_machine_action})
            railenv_action_dict.update({a: railenv_action})
        time_taken = time.time() - time_start
        time_taken_by_controller.append(time_taken)
        # Perform the chosen action on the environment.
        # The action is applied to both the local and the remote copy of the
        # environment instance; the observation comes from the local copy,
        # while the rewards, done flags and info come from the remote copy.
        time_start = time.time()
        state, reward, done, info = remote_client.env_step(railenv_action_dict)
        steps += 1
        time_taken = time.time() - time_start
        time_taken_per_step.append(time_taken)
        reward_sum += sum(list(reward.values()))
Code example #6
def main(args):
	# Show options and values
	print(' ' * 26 + 'Options')
	for k, v in vars(args).items():
		print(' ' * 26 + k + ': ' + str(v))
	# Where to save models
	results_dir = os.path.join('results', args.id)
	if not os.path.exists(results_dir):
		os.makedirs(results_dir)
	# These are saved in a .pth
	metrics = {'episodes': [], # originally 'steps'
			   'rewards': [],
			   'Qs': [],
			   'best_avg_done_agents': -float('inf'),
			   'best_avg_reward': -float('inf')}
	np.random.seed(args.seed)
	torch.manual_seed(np.random.randint(1, 10000))
	# Set cpu or gpu
	if torch.cuda.is_available() and not args.disable_cuda:
		args.device = torch.device('cuda')
		torch.cuda.manual_seed(np.random.randint(1, 10000))
		torch.backends.cudnn.enabled = args.enable_cudnn
	else:
		args.device = torch.device('cpu')
	
	# Simple ISO 8601 timestamped logger
	def log(s):
		print('[' + str(datetime.now().strftime('%Y-%m-%dT%H:%M:%S')) + '] ' + s)
	
	
	def load_memory(memory_path, disable_bzip):
		if disable_bzip:
			with open(memory_path, 'rb') as pickle_file:
				return pickle.load(pickle_file)
		else:
			with bz2.open(memory_path, 'rb') as zipped_pickle_file:
				return pickle.load(zipped_pickle_file)
	
	
	def save_memory(memory, memory_path, disable_bzip):
		if disable_bzip:
			with open(memory_path, 'wb') as pickle_file:
				pickle.dump(memory, pickle_file)
		else:
			with bz2.open(memory_path, 'wb') as zipped_pickle_file:
				pickle.dump(memory, zipped_pickle_file)
	
	
	rail_generator = sparse_rail_generator(max_num_cities=args.max_num_cities,
										   seed=args.seed,
										   grid_mode=args.grid_mode,
										   max_rails_between_cities=args.max_rails_between_cities,
										   max_rails_in_city=args.max_rails_in_city,
										   )
	# Maps speeds to % of appearance in the env
	speed_ration_map = {1.: 0.25,  # Fast passenger train
						1. / 2.: 0.25,  # Fast freight train
						1. / 3.: 0.25,  # Slow commuter train
						1. / 4.: 0.25}  # Slow freight train
	
	schedule_generator = sparse_schedule_generator(speed_ration_map)
	
	stochastic_data = {'malfunction_rate': args.malfunction_rate,  # Rate of malfunction occurrence
					   'min_duration': args.min_duration,  # Minimal duration of malfunction
					   'max_duration': args.max_duration  # Max duration of malfunction
					   }
	
	observation_builder = GraphObsForRailEnv(predictor=ShortestPathPredictorForRailEnv(max_depth=args.prediction_depth))
	
	# Construct the environment with the given observation, generators, predictors, and stochastic data
	env = RailEnv(width=args.width,
				  height=args.height,
				  rail_generator=rail_generator,
				  schedule_generator=schedule_generator,
				  number_of_agents=args.num_agents,
				  obs_builder_object=observation_builder,
				  malfunction_generator_and_process_data=malfunction_from_params(stochastic_data)
	              )
	env.reset()
	
	state_size = args.prediction_depth * 4 + 4 # TODO
	# action_space = args.network_action_space
	network_action_dict = {}
	railenv_action_dict = {}
	qvalues = {} # Map handle: q value for this step
	# Init agent
	dqn = RainbowAgent(args, state_size, env)
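	# A single Rainbow DQN is shared by all agents: it is queried per agent handle and
	# trained from per-agent replay buffers (see 'mems' below).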
	
	# If a model is provided, and evaluate is false, presumably we want to resume, so try to load memory
	if args.model is not None and not args.evaluate:
		if not args.memory:
			raise ValueError('Cannot resume training without memory save path. Aborting...')
		elif not os.path.exists(args.memory):
			raise ValueError('Could not find memory file at {path}. Aborting...'.format(path=args.memory))
	
		mem = load_memory(args.memory, args.disable_bzip_memory)
	else:
		# Init one replay buffer for each agent (TODO): must be updated when the number of agents changes
		mems = [ReplayMemory(args, int(args.memory_capacity/args.num_agents)) for a in range(args.num_agents)]
		# mem = ReplayMemory(args, args.memory_capacity)  # Init empty replay buffer
	
	priority_weight_increase = (1 - args.priority_weight) / (args.T_max - args.learn_start)
	
	# Construct validation memory
	val_mem = ReplayMemory(args, args.evaluation_size)
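	# The validation memory is filled with states collected under a random policy; test()
	# later calls dqn.evaluate_q on these fixed states to track how the value estimates evolve.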
	T = 0
	all_done = True
	update_values = [False] * env.get_num_agents() # Used to update agent if action was performed in this step
	
	# Number of transitions to do for validating Q
	print("Validating Q...")
	while T < args.evaluation_size:
		
		for a in range(env.get_num_agents()):
			if all_done:
				state, info = env.reset()
				all_done = False
		
		for a in range(env.get_num_agents()):
			action = np.random.choice(np.arange(5))
			railenv_action_dict.update({a: action})
			
		next_state, reward, done, info = env.step(railenv_action_dict)
		val_mem.append(state[0], None, None, all_done) # TODO Using only state from agent 0 for now
		all_done = done['__all__']
		state = next_state
		T += 1
	
	if args.evaluate:
		dqn.eval() # Set DQN (online network) to evaluation mode
		avg_done_agents, avg_reward, avg_norm_reward = test(args, 0, 0, dqn, val_mem, metrics, results_dir, evaluate=True)  # Test
		#print('Avg. reward: ' + str(avg_reward) + ' | Avg. Q: ' + str(avg_Q))
		print('Avg. done agents: ' + str(avg_done_agents) + ' | Avg. cumulative reward: ' + str(avg_reward) + 
			  ' | Avg. normalized reward: ' + str(avg_norm_reward))
	else:
		# Training loop
		print("Training started...")
		dqn.train()
		################## Episodes loop #######################
		for ep in trange(1, args.num_episodes + 1):
			# Reset env at the beginning of one episode
			state, info = env.reset()
	
			# Pick first action - entering of agents is now random
			for a in range(env.get_num_agents()):
				action = np.random.choice((0,2))
				railenv_action_dict.update({a: action})
			next_state, reward, done, info = env.step(railenv_action_dict)  # Env first step
	
			############## Steps loop ##########################
			for T in range(1, args.T_max + 1):
				if T % args.replay_frequency == 0:
					dqn.reset_noise()  # Draw a new set of noisy weights
	
				for a in range(env.get_num_agents()):
					if info['action_required'][a]:
						network_action = dqn.act(state[a])  # Choose an action greedily (with noisy weights)
						railenv_action = observation_builder.choose_railenv_action(a, network_action)
						update_values[a] = True
						qvalues.update({a: dqn.get_q_values(state[a])})
					else:
						network_action = 0
						railenv_action = 0
						update_values[a] = False
						qvalues.update({a: [0, 0]}) # '0' if wasn't updated
					# Update action dicts
					railenv_action_dict.update({a: railenv_action})
					network_action_dict.update({a: network_action})
	
				next_state, reward, done, info = env.step(railenv_action_dict)  # Env step
				
				'''
				if T == 100: # Print only at 100th steps of each episode
					if args.debug:
						for a in range(env.get_num_agents()):
							print('#########################################')
							print('Info for agent {}'.format(a))
							print('Occupancy, first layer: {}'.format(state[a][:args.prediction_depth]))
							print('Occupancy, second layer: {}'.format(
								state[a][args.prediction_depth:args.prediction_depth * 2]))
							print('Forks: {}'.format(state[a][args.prediction_depth * 2:args.prediction_depth * 3]))
							print('Target: {}'.format(state[a][args.prediction_depth * 3:args.prediction_depth * 4]))
							print('Priority: {}'.format(state[a][args.prediction_depth * 4]))
							print('Max priority encountered: {}'.format(state[a][args.prediction_depth * 4 + 1]))
							print('Num malfunctoning agents (globally): {}'.format(state[a][args.prediction_depth * 4 + 2]))
							print(
								'Num agents ready to depart (globally): {}'.format(state[a][args.prediction_depth * 4 + 3]))
							print('Status: {}'.format(info['status'][a]))
							print('Position: {}'.format(env.agents[a].position))
							print('Moving? {} at speed: {}'.format(env.agents[a].moving, info['speed'][a]))
							print('Action required? {}'.format(info['action_required'][a]))
							print('Network action: {}'.format(network_action_dict[a]))
							print('Railenv action: {}'.format(railenv_action_dict[a]))
							print('Q values: {}'.format(qvalues[a]))
							print('Rewards: {}'.format(reward))

				'''

				# Clip reward and update replay buffer
				for a in range(env.get_num_agents()):
					'''
					* Reward is always in [-1, 1], so we shouldn't need clipping
					if args.reward_clip > 0:
						reward[a] = max(min(reward[a], args.reward_clip), -args.reward_clip)
					'''
					if update_values[a]:  # Store transition only if this agent performed action in this time step
						mems[a].append(state[a], network_action_dict[a], reward[a], done[a]) # Append to own buffer
						#mem.append(state[a], network_action_dict[a], reward[a], done[a])  # Append transition to memory
				# print('Clipped rewards: {}'.format(reward))
	
				state = next_state.copy()
				# Train and test
				if ep >= args.learn_start: # Give time to accumulate experiences
					# Anneal importance sampling weight β to 1
					#mem.priority_weight = min(mem.priority_weight + priority_weight_increase, 1)
					for a in range(args.num_agents):
						mems[a].priority_weight = min(mems[a].priority_weight + priority_weight_increase, 1)
	
					if T % args.replay_frequency == 0:
						a = np.random.choice(np.arange(args.num_agents))
						dqn.learn(mems[a]) # Learn randomly from one of the available replay buffer
						# dqn.learn(mem)  # Train with n-step distributional double-Q learning
	
					# Update target network
					if T % args.target_update == 0:
						dqn.update_target_net()
	
				if done['__all__']:
					break
			##### EPISODE END ##############
			if (ep % args.evaluation_interval) == 0: # Evaluate only at the end of the episodes

				dqn.eval()  # Set DQN (online network) to evaluation mode
				avg_done_agents, avg_reward, avg_norm_reward = test(args, T, ep, dqn, val_mem, metrics, results_dir)  # Test
				log(
					'T = ' + str(T) + ' / ' + str(args.T_max) + ' | Avg. done agents: ' + str(avg_done_agents) +
					' | Avg. reward: ' + str(avg_reward) + ' | Avg. normalized reward: ' + str(avg_norm_reward))
				dqn.train()  # Set DQN (online network) back to training mode

				# If memory path provided, save it
				if args.memory is not None:
					save_memory(mems[0], args.memory, args.disable_bzip_memory)  # Save only first replay buffer (?)
			# save_memory(mem, args.memory, args.disable_bzip_memory)

			# Checkpoint the network every 'checkpoint_interval' episodes
			if (args.checkpoint_interval != 0) and (ep % args.checkpoint_interval == 0):
				dqn.save(results_dir, 'checkpoint.pth')
Code example #7
def main(args):

    rail_generator = sparse_rail_generator(
        max_num_cities=args.max_num_cities,
        seed=args.seed,
        grid_mode=args.grid_mode,
        max_rails_between_cities=args.max_rails_between_cities,
        max_rails_in_city=args.max_rails_in_city,
    )
    # Maps speeds to % of appearance in the env
    speed_ration_map = {
        1.: 0.25,  # Fast passenger train
        1. / 2.: 0.25,  # Fast freight train
        1. / 3.: 0.25,  # Slow commuter train
        1. / 4.: 0.25  # Slow freight train
    }

    schedule_generator = sparse_schedule_generator(speed_ration_map)
    ''' THIS WORKS WITH NEXT VERSION
    stochastic_data = MalfunctionParameters(
        malfunction_rate=args.malfunction_rate,  # Rate of malfunction occurrence of single agent
        min_duration=args.min_duration,  # Minimal duration of malfunction
        max_duration=args.max_duration  # Max duration of malfunction
    )
    '''

    stochastic_data = {
        'malfunction_rate': args.malfunction_rate,
        'min_duration': args.min_duration,
        'max_duration': args.max_duration
    }

    if args.observation_builder == 'GraphObsForRailEnv':

        prediction_depth = args.prediction_depth
        bfs_depth = args.bfs_depth
        observation_builder = GraphObsForRailEnv(
            bfs_depth=bfs_depth,
            predictor=ShortestPathPredictorForRailEnv(
                max_depth=prediction_depth))
        state_size = args.prediction_depth * 3 + 4  # TODO
        network_action_size = 2  # {follow path, stop}
        railenv_action_size = 5  # The RailEnv possible actions
        agent = Agent(network_type='fc',
                      state_size=state_size,
                      action_size=network_action_size)

    elif args.observation_builder == 'LocalObsForRailEnv':

        observation_builder = LocalObsForRailEnv(args.view_semiwidth,
                                                 args.view_height, args.offset)
        #state_size = (2 * args.view_semiwidth + 1) * args.height
        state_size = 16 + 5 + 2  # state_size == in_channels
        railenv_action_size = 5
        agent = Agent(network_type='conv',
                      state_size=state_size,
                      action_size=railenv_action_size)

    # Construct the environment with the given observation, generators, predictors, and stochastic data
    env = RailEnv(
        width=args.width,
        height=args.height,
        rail_generator=rail_generator,
        schedule_generator=schedule_generator,
        number_of_agents=args.num_agents,
        obs_builder_object=observation_builder,
        malfunction_generator_and_process_data=malfunction_from_params(
            stochastic_data),
        remove_agents_at_target=True)
    env.reset()

    # max_steps = env.compute_max_episode_steps(args.width, args.height, args.num_agents/args.max_num_cities)
    max_steps = 200  # TODO DEBUG
    eps = 1.
    eps_end = 0.005
    eps_decay = 0.998
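    # Unlike the Rainbow script, this agent explores with plain epsilon-greedy:
    # epsilon is decayed multiplicatively at the end of every episode down to eps_end.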
    # Need to have two since env works with RailEnv actions but agent works with network actions
    network_action_dict = dict()
    railenv_action_dict = dict()
    scores_window = deque(maxlen=100)
    done_window = deque(maxlen=100)
    scores = []
    dones_list = []
    action_prob = [0] * railenv_action_size
    agent_obs = [None] * env.get_num_agents()
    agent_obs_buffer = [None] * env.get_num_agents()
    agent_action_buffer = [2] * env.get_num_agents()
    update_values = [False] * env.get_num_agents()  # Used to update agent if action was performed in this step
    qvalues = {}

    for ep in range(1, args.num_episodes + 1):

        obs, info = env.reset()

        if args.observation_builder == 'GraphObsForRailEnv':
            for a in range(env.get_num_agents()):
                agent_obs[a] = obs[a].copy()
                agent_obs_buffer[a] = agent_obs[a].copy()
            for a in range(env.get_num_agents()):
                action = np.random.choice((0, 2))
                railenv_action_dict.update({a: action})  # Start agents at random in the beginning (TODO: test)
            next_obs, all_rewards, done, info = env.step(railenv_action_dict)
        # Normalize obs, only for LocalObs now
        elif args.observation_builder == 'LocalObsForRailEnv':
            for a in range(env.get_num_agents()):
                if obs[a]:
                    agent_obs[a] = preprocess_obs(obs[a])
                    agent_obs_buffer[a] = agent_obs[a].copy()

        score = 0
        env_done = 0

        ############# Main loop
        for step in range(max_steps - 1):
            '''
            print(
                '\r{} Agents on ({},{}).\t Ep: {}\t Step/MaxSteps: {} / {}'.format(
                    env.get_num_agents(), args.width, args.height,
                    ep,
                    step,
                    max_steps), end=" ")
            '''
            # Logging
            #print_info(env)

            for a in range(env.get_num_agents()):

                if args.observation_builder == 'GraphObsForRailEnv':
                    if info['action_required'][a]:
                        # 'railenv_action' is in [0, 4], 'network_action' is in [0, 1]
                        network_action = agent.act(agent_obs[a], eps=eps)
                        # Pick railenv action according to network decision if it's safe to go or to stop
                        railenv_action = observation_builder.choose_railenv_action(
                            a, network_action)
                        update_values[a] = True
                        qvalues.update({a: agent.get_q_values(agent_obs[a])})
                    else:
                        network_action = 0
                        railenv_action = 0
                        update_values[a] = False
                        qvalues.update({a: [0, 0]})
                    # Update action dicts
                    action_prob[railenv_action] += 1
                    railenv_action_dict.update({a: railenv_action})
                    network_action_dict.update({a: network_action})

                elif args.observation_builder == 'LocalObsForRailEnv':
                    if info['action_required'][a]:
                        railenv_action = agent.act(agent_obs[a], eps=eps)
                        update_values[a] = True
                    else:
                        railenv_action = 0  # If action is not required DO_NOTHING
                        update_values[a] = False
                    action_prob[railenv_action] += 1
                    railenv_action_dict.update({a: railenv_action})

            # Environment step
            next_obs, all_rewards, done, info = env.step(railenv_action_dict)
            if step == 100:
                print('QValues: {}'.format(qvalues))
            # Update replay buffer and train agent
            for a in range(env.get_num_agents()):
                if update_values[a] or done[a]:
                    if args.observation_builder == 'GraphObsForRailEnv':
                        agent.step(agent_obs_buffer[a], network_action_dict[a],
                                   all_rewards[a], agent_obs[a], done[a])
                    else:
                        # The LocalObs agent acts directly in the RailEnv action space,
                        # so store the railenv action (network_action_dict is not filled here)
                        agent.step(agent_obs_buffer[a], railenv_action_dict[a],
                                   all_rewards[a], agent_obs[a], done[a])
                    agent_obs_buffer[a] = agent_obs[a].copy()
                    '''
                    if args.observation_builder == 'GraphObsForRailEnv':
                        agent_action_buffer[a] = network_action_dict[a]
                    elif args.observation_builder == 'LocalObsForRailEnv':
                        agent_action_buffer[a] = railenv_action_dict[a]
                    '''
                # Preprocessing and normalization
                if args.observation_builder == 'GraphObsForRailEnv':
                    agent_obs[a] = next_obs[a].copy()
                if args.observation_builder == 'LocalObsForRailEnv' and next_obs[a]:
                    agent_obs[a] = preprocess_obs(next_obs[a])

                score += all_rewards[a] / env.get_num_agents()  # Update score

            if done['__all__']:
                env_done = 1
                break

        ################### At the end of the episode TODO This part could be done in another script
        eps = max(eps_end, eps_decay * eps)  # Decrease epsilon
        # Metrics
        done_window.append(env_done)
        num_agents_done = 0  # Num of agents that reached their target
        for a in range(env.get_num_agents()):
            if done[a]:
                num_agents_done += 1

        scores_window.append(score / max_steps)  # Save most recent score
        scores.append(np.mean(scores_window))
        dones_list.append((np.mean(done_window)))

        action_prob_float = action_prob / np.sum(action_prob)
        formatted_action_prob = [
            '{:.5f}'.format(ap) for ap in action_prob_float
        ]

        # Print training results info
        print(
            '\r{} Agents on ({},{}).\t Ep: {}\t Avg Score: {:.3f}\t Env Dones so far: {:.2f}%\t Done Agents in ep: {:.2f}%\t Eps: {:.2f}\t Action Probs: {} '
            .format(env.get_num_agents(), args.width, args.height, ep,
                    np.mean(scores_window), 100 * np.mean(done_window),
                    100 * (num_agents_done / args.num_agents), eps,
                    formatted_action_prob),
            end=" ")

        if ep % 50 == 0:
            print(
                '\rTraining {} Agents.\t Episode {}\t Average Score: {:.3f}\t Dones: {:.2f}%\t Done Agents in ep: {:.2f}%\t Epsilon: {:.2f}\t Action Probabilities: \t {}'
                .format(env.get_num_agents(), ep, np.mean(scores_window),
                        100 * np.mean(done_window),
                        100 * (num_agents_done / args.num_agents), eps,
                        formatted_action_prob))
            torch.save(agent.qnetwork_local.state_dict(),
                       './nets/' + str(args.model_name) + str(ep) + '.pth')
            action_prob = [1] * railenv_action_size