def test_normalize_features():

    random.seed(1)
    np.random.seed(1)
    max_depth = 4

    for i in range(10):
        tree_observer = TreeObsForRailEnv(max_depth=max_depth)
        next_rand_number = random.randint(0, 100)

        env = RailEnv(width=10,
                      height=10,
                      rail_generator=complex_rail_generator(
                          nr_start_goal=10,
                          nr_extra=1,
                          min_dist=8,
                          max_dist=99999,
                          seed=next_rand_number),
                      schedule_generator=complex_schedule_generator(),
                      number_of_agents=1,
                      obs_builder_object=tree_observer)

        obs, all_rewards, done, _ = env.step({0: 0})

        obs_new = tree_observer.get()
        # data, distance, agent_data = split_tree(tree=np.array(obs_old), num_features_per_node=11)
        data_normalized = normalize_observation(obs_new,
                                                max_depth,
                                                observation_radius=10)

        filename = 'testdata/test_array_{}.csv'.format(i)
        data_loaded = np.loadtxt(filename, delimiter=',')

    assert np.allclose(data_loaded, data_normalized)


def eval_policy(env, policy, train_params, obs_params):
    n_eval_episodes = train_params.n_evaluation_episodes
    max_steps = env._max_episode_steps
    tree_depth = obs_params.observation_tree_depth
    observation_radius = obs_params.observation_radius

    action_dict = dict()
    scores = []
    completions = []
    nb_steps = []

    for episode_idx in range(n_eval_episodes):
        agent_obs = [None] * env.get_num_agents()
        score = 0.0

        obs, info = env.reset(regenerate_rail=True, regenerate_schedule=True)

        final_step = 0

        for step in range(max_steps - 1):
            for agent in env.get_agent_handles():
                if obs[agent]:
                    agent_obs[agent] = normalize_observation(
                        obs[agent],
                        tree_depth=tree_depth,
                        observation_radius=observation_radius)

                action = 0
                if info['action_required'][agent]:
                    action = policy.act(agent_obs[agent], eps=0.0)
                action_dict.update({agent: action})

            obs, all_rewards, done, info = env.step(action_dict)

            for agent in env.get_agent_handles():
                score += all_rewards[agent]

            final_step = step

            if done['__all__']:
                break

        normalized_score = score / (max_steps * env.get_num_agents())
        scores.append(normalized_score)

        tasks_finished = sum(done[idx] for idx in env.get_agent_handles())
        completion = tasks_finished / max(1, env.get_num_agents())
        completions.append(completion)

        nb_steps.append(final_step)

    print("\t✅ Eval: score {:.3f} done {:.1f}%".format(
        np.mean(scores),
        np.mean(completions) * 100.0))

    return scores, completions, nb_steps
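A hedged usage sketch for eval_policy above: the Namespace fields mirror the attributes the function reads, while env and policy are assumed to come from the surrounding training script.

# Hypothetical call site (these names are assumptions, not the original script's):
from argparse import Namespace

train_params = Namespace(n_evaluation_episodes=25)
obs_params = Namespace(observation_tree_depth=2, observation_radius=10)
# scores, completions, nb_steps = eval_policy(env, policy, train_params, obs_params)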
Example 3
def get_actions(obs, n_agents, info):
    # Assumes module-level `agent`, `observation_tree_depth` and
    # `observation_radius` defined by the surrounding script.
    actions = {}
    for _idx in obs.keys():
        # Could additionally gate on info['action_required'][_idx]
        # and info['malfunction'][_idx] != 1.
        if obs[_idx] is not None:
            normalized_obs = normalize_observation(obs[_idx], observation_tree_depth, observation_radius)
            action = agent.take_action(normalized_obs)
            actions[_idx] = action
        else:
            actions[_idx] = 0  # DO_NOTHING
    return actions
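A minimal driver loop for get_actions, under the same assumptions (env is a RailEnv, and agent plus the observation parameters exist at module scope):

# Hypothetical episode loop around get_actions:
obs, info = env.reset()
done = {'__all__': False}
while not done['__all__']:
    actions = get_actions(obs, env.get_num_agents(), info)
    obs, rewards, done, info = env.step(actions)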
    def step(self, action_dict):
        obs, rewards, dones, infos = super().step(action_dict)
        agent_obs = [None] * self.get_num_agents()
        for a in range(self.get_num_agents()):
            if obs[a]:
                agent_obs[a] = normalize_observation(obs[a],
                                                     tree_depth,
                                                     observation_radius=10)
        return agent_obs, rewards, dones, infos
    def reset(self):
        obs, info = super().reset(True, True)
        agent_obs = [None] * self.get_num_agents()

        for a in range(self.get_num_agents()):
            if obs[a]:
                agent_obs[a] = normalize_observation(
                    obs[a], tree_depth,
                    observation_radius=10).astype('float32')
        return agent_obs, info
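Both overrides above read as methods of a RailEnv subclass that returns normalized observations directly; a minimal sketch of the assumed enclosing class (the class name and the module-level tree_depth are assumptions):

# Hypothetical wrapper around RailEnv; the step()/reset() overrides above
# are assumed to live in this class body.
tree_depth = 2  # assumed module-level constant used by both overrides

class NormalizedObsRailEnv(RailEnv):
    """RailEnv variant whose step() and reset() return normalized tree obs."""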
Example 6
def normalized_obs(obs):
    # Assumes module-level `observation_tree_depth` and `observation_radius`.
    norm_obs = []
    for _idx in obs.keys():
        try:
            if obs[_idx] is not None:
                norm_obs.append(normalize_observation(obs[_idx], observation_tree_depth, observation_radius))
        except Exception:
            print("OBS inside", obs[_idx])
    return norm_obs
Example 7
def my_controller(env, obs, number_of_agents):
    # Assumes `done`, `info`, `policy`, `action_dict` and
    # `observation_tree_depth` are defined by the surrounding script.
    for a in range(number_of_agents):
        agent = env.agents[a]
        if done[a]:
            continue
        if agent.speed_data['position_fraction'] > 5:
            action_dict.update({a: 4})  # STOP_MOVING
            continue
        if info['action_required'][a]:
            if agent.position is not None:
                possible_transition_num = np.count_nonzero(env.rail.get_transitions(*agent.position, agent.direction))
                if possible_transition_num == 1:
                    action = 2  # MOVE_FORWARD
                else:
                    action = policy.act(normalize_observation(obs[a], observation_tree_depth, observation_radius=10), eps=0.01)
            else:
                action = 2  # MOVE_FORWARD
        else:
            action = 0  # DO_NOTHING
        action_dict.update({a: action})
    return action_dict
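A sketch of the episode loop that would drive my_controller, consistent with the globals the function assumes (all names here are assumptions):

# Hypothetical driver for my_controller:
action_dict = dict()
obs, info = env.reset()
done = {a: False for a in range(env.get_num_agents())}
done['__all__'] = False
while not done['__all__']:
    action_dict = my_controller(env, obs, env.get_num_agents())
    obs, rewards, done, info = env.step(action_dict)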
Example 8
def get_normalized_observation(observation,
                               tree_depth: int,
                               observation_radius=0):
    return normalize_observation(observation, tree_depth,
                                 observation_radius)
Example 9
def get_reward(weights, model, render=False):
    # Assumes module-level configuration and state (x_dim, y_dim, n_agents,
    # speed_ration_map, stochastic_data, agent_obs, agent_obs_buffer,
    # action_dict, update_values, cuda, ...) set by the surrounding script.
    cloned_model = copy.deepcopy(model)
    for i, param in enumerate(cloned_model.parameters()):
        try:
            param.data.copy_(weights[i])
        except Exception:
            param.data.copy_(weights[i].data)

    env_Orig = RailEnv(
        width=x_dim,
        height=y_dim,
        rail_generator=sparse_rail_generator(
            max_num_cities=3,
            # Number of cities in map (where train stations are)
            seed=1,  # Random seed
            grid_mode=False,
            max_rails_between_cities=2,
            max_rails_in_city=3),
        schedule_generator=sparse_schedule_generator(speed_ration_map),
        number_of_agents=n_agents,
        stochastic_data=stochastic_data,  # Malfunction data generator
        obs_builder_object=TreeObservation)

    env = copy.deepcopy(env_Orig)

    # After training we want to render the results so we also load a renderer
    env_renderer = RenderTool(
        env,
        gl="PILSVG",
    )

    # And the max number of steps we want to take per episode
    max_steps = int(4 * 2 * (20 + env.height + env.width))
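    # Worked example (illustrative): on a 20x20 grid this gives
    # max_steps = int(4 * 2 * (20 + 20 + 20)) = 480.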

    n_episodes = 1
    for trials in range(1, n_episodes + 1):
        # Reset environment
        obs, info = env.reset(True, True)
        env_renderer.reset()
        # Build agent specific observations
        for a in range(env.get_num_agents()):
            if obs[a]:
                agent_obs[a] = normalize_observation(obs[a],
                                                     tree_depth,
                                                     observation_radius=10)
                agent_obs_buffer[a] = agent_obs[a].copy()

        # Reset score and done
        score = 0
        env_done = 0
        step = 0

        # Run episode
        while True:
            # Action
            for a in range(env.get_num_agents()):
                if info['action_required'][a]:
                    # If an action is required, we want to store the obs at that step as well as the action
                    update_values[a] = True

                    batch = torch.from_numpy(agent_obs[a][np.newaxis,
                                                          ...]).float()
                    if cuda:
                        batch = batch.cuda()
                    prediction = cloned_model(Variable(batch))
                    action = prediction.data.cpu().numpy().argmax()

                    # action = agent.act(agent_obs[a], eps=eps)
                    action_prob[action] += 1
                else:
                    update_values[a] = False
                    action = 0
                action_dict.update({a: action})

            # Environment step
            # print("Action Values:", action_dict)
            next_obs, all_rewards, done, info = env.step(action_dict)
            step += 1
            if render:
                env_renderer.render_env(show=True,
                                        show_predictions=True,
                                        show_observations=False)

            for a in range(env.get_num_agents()):
                # Only update the values when we are done or when an action was taken and thus relevant information is present
                if update_values[a] or done[a]:
                    # agent.step(agent_obs_buffer[a], agent_action_buffer[a], all_rewards[a],
                    #           agent_obs[a], done[a])
                    cummulated_reward[a] = 0.

                    agent_obs_buffer[a] = agent_obs[a].copy()
                    agent_action_buffer[a] = action_dict[a]
                if next_obs[a]:
                    agent_obs[a] = normalize_observation(next_obs[a],
                                                         tree_depth,
                                                         observation_radius=10)

                score += all_rewards[a] / env.get_num_agents()
            # print(all_rewards)
            # Copy observation
            if done['__all__'] or step >= max_steps:
                env_done = 1
                break

        # Collection information about training
        tasks_finished = 0
        for current_agent in env.agents:
            if current_agent.status == RailAgentStatus.DONE_REMOVED:
                tasks_finished += 1
        done_window.append(tasks_finished / max(1, env.get_num_agents()))
        scores_window.append(score / max_steps)  # save most recent score
        scores.append(np.mean(scores_window))
        dones_list.append((np.mean(done_window)))

        print(
            '\rTraining {} Agents on ({},{}).\t Episode {}\t Average Score: {:.3f}\tDones: {:.2f}%\t Action Probabilities: \t {}'
            .format(env.get_num_agents(), x_dim, y_dim, trials,
                    np.mean(scores_window), 100 * np.mean(done_window),
                    action_prob / np.sum(action_prob)),
            end=" ")

    # env.close()
    data = [[
        n_agents, x_dim, y_dim, trials,
        np.mean(scores_window), 100 * np.mean(done_window), step,
        action_prob / np.sum(action_prob)
    ]]

    dfCur = pd.DataFrame(data)

    with open(f'ES_TrainingResults_{n_agents}_{x_dim}_{y_dim}.csv', 'a') as f:
        dfCur.to_csv(f, index=False, header=False)

    return np.mean(scores)
Example 10
def main(args):
    try:
        opts, args = getopt.getopt(args, "", ["sleep-for-animation=", ""])
    except getopt.GetoptError as err:
        print(str(err))  # will print something like "option -a not recognized"
        sys.exit(2)
    sleep_for_animation = True
    for o, a in opts:
        if o in ("--sleep-for-animation"):
            sleep_for_animation = str2bool(a)
        else:
            assert False, "unhandled option"

    batch_builder = SampleBatchBuilder()  # or MultiAgentSampleBatchBuilder
    writer = JsonWriter("./out/")

    #  Setting these 2 parameters to True can slow down training
    visuals = False
    sleep_for_animation = False

    if visuals:
        from flatland.utils.rendertools import RenderTool

    max_depth = 30
    tree_depth = 2
    trial_start = 100
    n_trials = 999
    start = 0

    columns = [
        'Agents', 'X_DIM', 'Y_DIM', 'TRIAL_NO', 'REWARD', 'NORMALIZED_REWARD',
        'DONE_RATIO', 'STEPS', 'ACTION_PROB'
    ]
    df_all_results = pd.DataFrame(columns=columns)

    for trials in range(trial_start, n_trials + 1):

        env_file = f"envs-100-999/envs/Level_{trials}.pkl"
        # env_file = f"../env_configs/round_1-small/Test_0/Level_{trials}.mpk"

        # file = f"../env_configs/actions-small/Test_0/Level_{trials}.mpk"
        file = f"envs-100-999/actions/envs/Level_{trials}.json"

        if not os.path.isfile(env_file) or not os.path.isfile(file):
            print("Missing file!", env_file, file)
            continue

        step = 0

        obs_builder_object = TreeObsForRailEnv(
            max_depth=tree_depth,
            predictor=ShortestPathPredictorForRailEnv(max_depth))

        env = RailEnv(
            width=1,
            height=1,
            rail_generator=rail_from_file(env_file),
            schedule_generator=schedule_from_file(env_file),
            malfunction_generator_and_process_data=malfunction_from_file(
                env_file),
            obs_builder_object=obs_builder_object)

        obs, info = env.reset(regenerate_rail=True,
                              regenerate_schedule=True,
                              activate_agents=False,
                              random_seed=1001)

        with open(file, "r") as files:
            expert_actions = json.load(files)

        n_agents = env.get_num_agents()
        x_dim, y_dim = env.width, env.height

        agent_obs = [None] * n_agents
        agent_obs_buffer = [None] * n_agents
        done = dict()
        done["__all__"] = False

        if imitate:
            agent_action_buffer = list(expert_actions[step].values())
        else:
            # , p=[0.2, 0, 0.5])  # [0] * n_agents
            agent_action_buffer = np.random.choice(5, n_agents, replace=True)
        update_values = [False] * n_agents

        max_steps = int(4 * 2 * (20 + env.height + env.width))

        action_size = 5  # 3

        # And some variables to keep track of the progress
        action_dict = dict()
        scores_window = deque(maxlen=100)
        reward_window = deque(maxlen=100)
        done_window = deque(maxlen=100)
        action_prob = [0] * action_size

        # agent = Agent(state_size, action_size)

        if visuals:
            env_renderer = RenderTool(env, gl="PILSVG")
            env_renderer.render_env(show=True,
                                    frames=True,
                                    show_observations=True)

        for a in range(n_agents):
            if obs[a]:
                agent_obs[a] = normalize_observation(obs[a],
                                                     tree_depth,
                                                     observation_radius=10)
                agent_obs_buffer[a] = agent_obs[a].copy()

        # Reset score and done
        score = 0
        agent_action_buffer = np.zeros(n_agents)
        # prev_action = np.zeros_like(envs.action_space.sample())
        prev_reward = np.zeros(n_agents)
        for step in range(max_steps):
            for a in range(n_agents):
                if info['action_required'][a]:
                    if imitate:
                        if step < len(expert_actions):
                            action = expert_actions[step][str(a)]
                        else:
                            action = 0
                    else:
                        action = 0

                    action_prob[action] += 1
                    update_values[a] = True

                else:
                    update_values[a] = False
                    action = 0

                action_dict.update({a: action})

            next_obs, all_rewards, done, info = env.step(action_dict)

            for a in range(n_agents):

                if next_obs[a] is not None:
                    agent_obs[a] = normalize_observation(next_obs[a],
                                                         tree_depth,
                                                         observation_radius=10)

                # Only update the values when we are done or when an action
                # was taken and thus relevant information is present
                if update_values[a] or done[a]:
                    start += 1

                    batch_builder.add_values(
                        t=step,
                        eps_id=trials,
                        agent_index=0,
                        obs=agent_obs_buffer[a],
                        actions=action_dict[a],
                        action_prob=1.0,  # put the true action probability
                        rewards=all_rewards[a],
                        prev_actions=agent_action_buffer[a],
                        prev_rewards=prev_reward[a],
                        dones=done[a],
                        infos=info['action_required'][a],
                        new_obs=agent_obs[a])

                agent_obs_buffer[a] = agent_obs[a].copy()
                agent_action_buffer[a] = action_dict[a]
                prev_reward[a] = all_rewards[a]

                score += all_rewards[a]  # / envs.get_num_agents()

            if visuals:
                env_renderer.render_env(show=True,
                                        frames=True,
                                        show_observations=True)
                if sleep_for_animation:
                    time.sleep(0.5)

            if done["__all__"] or step > max_steps:
                writer.write(batch_builder.build_and_reset())
                break

            # Collection information about training
            if step % 100 == 0:
                tasks_finished = 0
                for current_agent in env.agents:
                    if current_agent.status == RailAgentStatus.DONE_REMOVED:
                        tasks_finished += 1
                print(
                    '\rTrial No {} Training {} Agents on ({},{}).\t Steps {}\t Reward: {:.3f}\t Normalized Reward: {:.3f}\tDones: {:.2f}%\t'
                    .format(
                        trials, env.get_num_agents(), x_dim, y_dim, step,
                        score, score / (max_steps + n_agents), 100 * np.mean(
                            tasks_finished / max(1, env.get_num_agents()))),
                    end=" ")

        tasks_finished = 0
        for current_agent in env.agents:
            if current_agent.status == RailAgentStatus.DONE_REMOVED:
                tasks_finished += 1
        done_window.append(tasks_finished / max(1, env.get_num_agents()))
        reward_window.append(score)
        scores_window.append(score / (max_steps + n_agents))

        data = [[
            n_agents, x_dim, y_dim, trials,
            np.mean(reward_window),
            np.mean(scores_window), 100 * np.mean(done_window), step,
            action_prob / np.sum(action_prob)
        ]]

        df_cur = pd.DataFrame(data, columns=columns)
        df_all_results = pd.concat([df_all_results, df_cur])

        if imitate:
            df_all_results.to_csv(
                'TreeImitationLearning_DQN_TrainingResults.csv', index=False)

        print(
            '\rTrial No {} Training {} Agents on ({},{}).\t Total Steps {}\t Reward: {:.3f}\t Normalized Reward: {:.3f}\tDones: {:.2f}%\t'
            .format(trials, env.get_num_agents(), x_dim, y_dim, step,
                    np.mean(reward_window), np.mean(scores_window),
                    100 * np.mean(done_window)))

        if visuals:
            env_renderer.close_window()

        gc.collect()
Example 11
def evaluate(seed=37429879,
             timed=False,
             filename="./rl-weights.pth",
             debug=False,
             refresh=1):
    # Attempt to load policy from disk.
    policy = load_policy(filename, seed=seed)

    # Create environment with given seeding.
    env, max_steps, _, _, observation_tree_depth, _ = create_multi_agent_rail_env(
        seed + 1, timed)

    # Fixed environment parameters (note, these must correspond with the training parameters!)
    observation_radius = 10

    env_renderer = None
    if debug:
        env_renderer = RenderTool(env, screen_width=1920, screen_height=1080)

    # Create container for the agent actions and observations.
    action_dict = dict()
    agent_obs = [None] * env.number_of_agents

    num_maps = 100
    scores = []
    successes = 0
    scores_window = deque(maxlen=100)  # todo smooth when rendering instead
    completion_window = deque(maxlen=100)
    completion = []
    schedule_length = []

    for _ in range(0, num_maps):

        # Create a new map.
        obs, info = env.reset(True, True)
        score = 0

        if debug:
            env_renderer.reset()
            env_renderer.render_env(show=True,
                                    frames=False,
                                    show_observations=False)
            time.sleep(refresh)

        # Run episode
        for _ in range(max_steps - 1):

            # Build agent specific observations
            for agent in env.get_agent_handles():
                if obs[agent]:
                    agent_obs[agent] = normalize_observation(
                        obs[agent],
                        observation_tree_depth,
                        observation_radius=observation_radius)

            # If an action is required, select the action.
            for agent in env.get_agent_handles():
                action = 0
                if info['action_required'][agent]:
                    action = policy.act(agent_obs[agent], eps=0.08)
                    # print("Required " + str(action))
                action_dict.update({agent: action})
            schedule_length.append(len(action_dict))
            # Environment step
            obs, all_rewards, done, info = env.step(action_dict)

            if debug:
                env_renderer.render_env(show=True,
                                        frames=False,
                                        show_observations=False)
                time.sleep(refresh)

            # Track rewards.
            for agent in env.get_agent_handles():
                score = score + all_rewards[agent]

            if done['__all__']:
                successes = successes + 1
                break
        tasks_finished = np.sum(
            [int(done[idx]) for idx in env.get_agent_handles()])
        completion_window.append(tasks_finished / max(1, env.get_num_agents()))
        completion.append((np.mean(completion_window)))
        # Record scores.
        scores.append(score)

    print("Successful:    %8.2f%%" % (100 * successes / num_maps))
    print("Mean reward:   %8.2f" % (np.mean(scores)))
    print("Median reward: %8.2f" % (np.median(scores)))
    print("Instances solved: %8.2f" % (np.mean(completion)))
    print("Instances solved: %8.2f" % (np.mean(schedule_length)))
Example 12
def eval_policy(env_params, checkpoint, n_eval_episodes, max_steps,
                action_size, state_size, seed, render, allow_skipping,
                allow_caching):
    # Evaluation is faster on CPU (except if you use a really huge policy)
    parameters = {'use_gpu': False}

    # policy = DDDQNPolicy(state_size, action_size, Namespace(**parameters), evaluation_mode=True)
    # policy.qnetwork_local = torch.load(checkpoint, map_location={'cuda:0': 'cpu'})

    env_params = Namespace(**env_params)

    # Environment parameters
    n_agents = env_params.n_agents
    x_dim = env_params.x_dim
    y_dim = env_params.y_dim
    n_cities = env_params.n_cities
    max_rails_between_cities = env_params.max_rails_between_cities
    max_rails_in_city = env_params.max_rails_in_city

    agents = []
    for agent_id in range(n_agents):
        agent = AttentionAgent(num_in_pol=state_size,
                               num_out_pol=action_size,
                               hidden_dim=256,
                               lr=0.001)

        agent.policy = torch.load(os.path.join(
            checkpoint, f'2300_agent{agent_id}' + '.pth'),
                                  map_location=torch.device('cpu'))
        agent.policy.eval()

        agents.append(agent)

    # Malfunction and speed profiles
    # TODO pass these parameters properly from main!
    malfunction_parameters = MalfunctionParameters(
        malfunction_rate=1. / 2000,  # Rate of malfunctions
        min_duration=20,  # Minimal duration
        max_duration=50  # Max duration
    )

    # Only fast trains in Round 1
    speed_profiles = {
        1.: 1.0,  # Fast passenger train
        1. / 2.: 0.0,  # Fast freight train
        1. / 3.: 0.0,  # Slow commuter train
        1. / 4.: 0.0  # Slow freight train
    }

    # Observation parameters
    observation_tree_depth = env_params.observation_tree_depth
    observation_radius = env_params.observation_radius
    observation_max_path_depth = env_params.observation_max_path_depth

    # Observation builder
    predictor = ShortestPathPredictorForRailEnv(observation_max_path_depth)
    tree_observation = TreeObsForRailEnv(max_depth=observation_tree_depth,
                                         predictor=predictor)

    # Setup the environment
    env = RailEnv(
        width=x_dim,
        height=y_dim,
        rail_generator=sparse_rail_generator(
            max_num_cities=n_cities,
            grid_mode=False,
            max_rails_between_cities=max_rails_between_cities,
            max_rails_in_city=max_rails_in_city,
        ),
        # rail_generator = complex_rail_generator(
        #     nr_start_goal=10,
        #     nr_extra=10,
        #     min_dist=10,
        #     max_dist=99999,
        #     seed=1
        # ),
        schedule_generator=sparse_schedule_generator(speed_profiles),
        number_of_agents=n_agents,
        malfunction_generator_and_process_data=malfunction_from_params(
            malfunction_parameters),
        obs_builder_object=tree_observation)

    if render:
        # env_renderer = RenderTool(env, gl="PGL")
        env_renderer = RenderTool(
            env,
            # gl="PGL",
            agent_render_variant=AgentRenderVariant.AGENT_SHOWS_OPTIONS_AND_BOX,
            show_debug=False,
            screen_height=600,  # Adjust these parameters to fit your resolution
            screen_width=800)

    action_dict = dict()
    scores = []
    completions = []
    nb_steps = []
    inference_times = []
    preproc_times = []
    agent_times = []
    step_times = []

    for agent_id in range(n_agents):
        action_dict[agent_id] = 0

    for episode_idx in range(n_eval_episodes):
        images = []
        seed += 1

        inference_timer = Timer()
        preproc_timer = Timer()
        agent_timer = Timer()
        step_timer = Timer()

        step_timer.start()
        obs, info = env.reset(regenerate_rail=True,
                              regenerate_schedule=True,
                              random_seed=seed)
        step_timer.end()

        agent_obs = [None] * env.get_num_agents()
        score = 0.0

        if render:
            env_renderer.set_new_rail()

        final_step = 0
        skipped = 0

        nb_hit = 0
        agent_last_obs = {}
        agent_last_action = {}

        for step in range(max_steps - 1):
            # time.sleep(0.2)
            if allow_skipping and check_if_all_blocked(env):
                # FIXME why -1? bug where all agents are "done" after max_steps!
                skipped = max_steps - step - 1
                final_step = max_steps - 2
                n_unfinished_agents = sum(not done[idx]
                                          for idx in env.get_agent_handles())
                score -= skipped * n_unfinished_agents
                break

            agent_timer.start()
            for agent in env.get_agent_handles():
                agent_model = agents[agent]
                if obs[agent] and info['action_required'][agent]:
                    if agent in agent_last_obs and np.all(
                            agent_last_obs[agent] == obs[agent]):
                        nb_hit += 1
                        action = agent_last_action[agent]

                    else:
                        preproc_timer.start()
                        norm_obs = normalize_observation(
                            obs[agent],
                            tree_depth=observation_tree_depth,
                            observation_radius=observation_radius)
                        preproc_timer.end()

                        inference_timer.start()
                        action = act(agent_model, norm_obs)
                        inference_timer.end()

                    action_dict.update({agent: action})

                    if allow_caching:
                        agent_last_obs[agent] = obs[agent]
                        agent_last_action[agent] = action
            agent_timer.end()

            step_timer.start()
            obs, all_rewards, done, info = env.step(action_dict)
            step_timer.end()

            if render:
                env_renderer.render_env(show=True,
                                        frames=False,
                                        show_observations=False,
                                        show_predictions=False)

                im = env_renderer.get_image()
                im = PIL.Image.fromarray(im)
                images.append(im)

                if step % 100 == 0:
                    print("{}/{}".format(step, max_steps - 1))

            for agent in env.get_agent_handles():
                score += all_rewards[agent]

            final_step = step

            if done['__all__']:
                break

        if render:
            for _ in range(10):
                images.append(images[len(images) - 1])

            # save video
            images[0].save(
                f'/Users/nikhilvs/repos/nyu/flatland-reinforcement-learning/videos/maac-final/out_{episode_idx}.gif',
                save_all=True,
                append_images=images[1:],
                optimize=False,
                duration=60,
                loop=0)

        normalized_score = score / (max_steps * env.get_num_agents())
        scores.append(normalized_score)

        tasks_finished = sum(done[idx] for idx in env.get_agent_handles())
        completion = tasks_finished / max(1, env.get_num_agents())
        completions.append(completion)

        nb_steps.append(final_step)

        inference_times.append(inference_timer.get())
        preproc_times.append(preproc_timer.get())
        agent_times.append(agent_timer.get())
        step_times.append(step_timer.get())

        skipped_text = ""
        if skipped > 0:
            skipped_text = "\t⚡ Skipped {}".format(skipped)

        hit_text = ""
        if nb_hit > 0:
            hit_text = "\t⚡ Hit {} ({:.1f}%)".format(nb_hit, (100 * nb_hit) /
                                                     (n_agents * final_step))

        print(
            "☑️  Score: {:.3f} \tDone: {:.1f}% \tNb steps: {:.3f} "
            "\t🍭 Seed: {}"
            "\t🚉 Env: {:.3f}s  "
            "\t🤖 Agent: {:.3f}s (per step: {:.3f}s) \t[preproc: {:.3f}s \tinfer: {:.3f}s]"
            "{}{}".format(normalized_score, completion * 100.0, final_step,
                          seed, step_timer.get(), agent_timer.get(),
                          agent_timer.get() / final_step, preproc_timer.get(),
                          inference_timer.get(), skipped_text, hit_text))

    return scores, completions, nb_steps, agent_times, step_times


def main(argv):
    try:
        opts, args = getopt.getopt(argv, "n:", ["n_trials="])
    except getopt.GetoptError:
        print('test_navigation_single_agent.py -n <n_trials>')
        sys.exit(2)
    for opt, arg in opts:
        if opt in ('-n', '--n_trials'):
            n_trials = int(arg)

    random.seed(1)
    np.random.seed(1)

    ######## TEST SET SELECTION - PARAMETERS ########
    
    test_multi_agent_setup = 1             # 1 for Medium size test, 2 for Big size test
    test_n_agents = 5                      # Number of agents to test (3 - 5 - 7 for Medium, 5 - 7 - 10 for Big)
    test_malfunctions_enabled = True       # Malfunctions enabled?
    test_agents_one_speed = True           # Test agents with the same speed (1) or with 4 different speeds?

    #################################################

    # Medium size
    if test_multi_agent_setup == 1:
        x_dim = 16*3
        y_dim = 9*3
        max_num_cities = 5
        max_rails_between_cities = 2
        max_rails_in_city = 3

    # Big size
    if test_multi_agent_setup == 2:
        x_dim = 16*4
        y_dim = 9*4
        max_num_cities = 9
        max_rails_between_cities = 5
        max_rails_in_city = 5


    stochastic_data = {'malfunction_rate': 80,  # Rate of malfunction occurrence for a single agent
                       'min_duration': 15,  # Minimal duration of malfunction
                       'max_duration': 50  # Max duration of malfunction
                       }

    # Custom observation builder
    tree_depth = 2
    TreeObservation = TreeObsForRailEnv(max_depth=tree_depth, predictor=ShortestPathPredictorForRailEnv(20))

    np.savetxt(fname=path.join('NetsTest', 'info.txt'), X=[x_dim, y_dim, test_n_agents, max_num_cities, max_rails_between_cities, max_rails_in_city, tree_depth], delimiter=';')

    # Different agent types (trains) with different speeds.
    if test_agents_one_speed:
        speed_ration_map = {1.: 1.,  # Fast passenger train
                            1. / 2.: 0.0,  # Fast freight train
                            1. / 3.: 0.0,  # Slow commuter train
                            1. / 4.: 0.0}  # Slow freight train
    else:
        speed_ration_map = {1.: 0.25,  # Fast passenger train
                            1. / 2.: 0.25,  # Fast freight train
                            1. / 3.: 0.25,  # Slow commuter train
                            1. / 4.: 0.25}  # Slow freight train

    
    if test_malfunctions_enabled:
        env = RailEnv(width=x_dim,
                      height=y_dim,
                      rail_generator=sparse_rail_generator(max_num_cities=max_num_cities,
                                                           # Number of cities in map (where train stations are)
                                                           seed=14,  # Random seed
                                                           grid_mode=False,
                                                           max_rails_between_cities=max_rails_between_cities,
                                                           max_rails_in_city=max_rails_in_city),
                      schedule_generator=sparse_schedule_generator(speed_ration_map),
                      malfunction_generator_and_process_data=malfunction_from_params(stochastic_data),
                      number_of_agents=test_n_agents,
                      obs_builder_object=TreeObservation)
    else:
        env = RailEnv(width=x_dim,
                      height=y_dim,
                      rail_generator=sparse_rail_generator(max_num_cities=max_num_cities,
                                                           # Number of cities in map (where train stations are)
                                                           seed=14,  # Random seed
                                                           grid_mode=False,
                                                           max_rails_between_cities=max_rails_between_cities,
                                                           max_rails_in_city=max_rails_in_city),
                      schedule_generator=sparse_schedule_generator(speed_ration_map),
                      number_of_agents=test_n_agents,
                      obs_builder_object=TreeObservation)
    
    env.reset()

    #env_renderer = RenderTool(env, gl="PILSVG", )
    env_renderer = RenderTool(env, gl="PILSVG",
                          agent_render_variant=AgentRenderVariant.AGENT_SHOWS_OPTIONS_AND_BOX,
                          show_debug=False,
                          screen_height=(1080*0.8),  # Adjust these parameters to fit your resolution
                          screen_width=(1920*0.8))
    num_features_per_node = env.obs_builder.observation_dim

    
    nr_nodes = 0
    for i in range(tree_depth + 1):
        nr_nodes += np.power(4, i)
    state_size = num_features_per_node * nr_nodes
    action_size = 5

    # We set the number of episodes we would like to train on
    if 'n_trials' not in locals():
        n_trials = 15000
    
    # max_steps computation
    speed_weighted_mean = 0

    for key in speed_ration_map.keys():
        speed_weighted_mean += key * speed_ration_map[key]
    
    #max_steps = int(3 * (env.height + env.width))
    max_steps = int((1/speed_weighted_mean) * 3 * (env.height + env.width))
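    # e.g. with only speed-1.0 agents on the medium 48x27 map:
    # max_steps = int((1 / 1.0) * 3 * (27 + 48)) = 225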
    #eps = 1.
    #eps_end = 0.005
    #eps_decay = 0.9995

    # And some variables to keep track of the performance
    action_dict = dict()
    final_action_dict = dict()
    action_prob_list = []
    scores_window = deque(maxlen=100)
    done_window = deque(maxlen=100)
    scores = []
    scores_list = []
    deadlock_list =[]
    dones_list_window = []
    dones_list = []
    action_prob = [0] * action_size
    agent_obs = [None] * env.get_num_agents()
    agent = Agent(state_size, action_size)
    
    # LOAD MODEL WEIGHTS TO TEST
    agent.qnetwork_local.load_state_dict(torch.load(path.join('NetsTest', 'navigator_checkpoint3800_multi10_deadlock_global10.pth')))

    record_images = False
    frame_step = 0

    for trials in range(1, n_trials + 1):

        # Reset environment
        obs, info = env.reset()  # (True, True)
        env_renderer.reset()
        # Build agent specific observations
        for a in range(env.get_num_agents()):
            agent_obs[a] = normalize_observation(obs[a], tree_depth, observation_radius=10)
        # Reset score and done
        score = 0
        env_done = 0

        # Run episode
        for step in range(max_steps):

            # Action
            for a in range(env.get_num_agents()):
                if info['action_required'][a]:
                    action = agent.act(agent_obs[a], eps=0.)
                    action_prob[action] += 1

                else:
                    action = 0

                action_dict.update({a: action})
            # Environment step
            obs, all_rewards, done, deadlocks, info = env.step(action_dict)

            env_renderer.render_env(show=True, show_predictions=True, show_observations=False)
            # Build agent specific observations and normalize
            for a in range(env.get_num_agents()):
                if obs[a]:
                    agent_obs[a] = normalize_observation(obs[a], tree_depth, observation_radius=10)

                score += all_rewards[a] / env.get_num_agents()


            if done['__all__']:
                break

        # Collection information about training
        tasks_finished = 0
        for _idx in range(env.get_num_agents()):
            if done[_idx] == 1:
                tasks_finished += 1
        done_window.append(tasks_finished / max(1, env.get_num_agents()))
        scores_window.append(score / max_steps)  # save most recent score
        scores.append(np.mean(scores_window))
        dones_list.append(tasks_finished / max(1, env.get_num_agents()))
        dones_list_window.append((np.mean(done_window)))
        scores_list.append(score / max_steps)
        deadlock_list.append(deadlocks.count(1)/max(1, env.get_num_agents()))

        if np.sum(action_prob) == 0:
            action_prob_normalized = [0] * action_size
        else:
            action_prob_normalized = action_prob / np.sum(action_prob)

        print(
                '\rTesting {} Agents on ({},{}).\t Episode {}\t Score: {:.3f}\tDones: {:.2f}%\tDeadlocks: {:.2f}\t Action Probabilities: \t {}'.format(
                    env.get_num_agents(), x_dim, y_dim,
                    trials,
                    score / max_steps,
                    100 * tasks_finished / max(1, env.get_num_agents()),
                    deadlocks.count(1)/max(1, env.get_num_agents()),
                    action_prob_normalized), end=" ")

        # (previously gated on trials % 100 == 0)
        action_prob_list.append(action_prob_normalized)
        action_prob = [0] * action_size

        if trials % 50 == 0:
            np.savetxt(fname=path.join('NetsTest', 'test_metrics.csv'), X=np.transpose(np.asarray([scores_list, scores, dones_list, dones_list_window, deadlock_list])), delimiter=';', newline='\n')
            np.savetxt(fname=path.join('NetsTest', 'test_action_prob.csv'), X=np.asarray(action_prob_list), delimiter=';', newline='\n')


def train_agent(env_params, train_params):
    # Environment parameters
    n_agents = env_params.n_agents
    x_dim = env_params.x_dim
    y_dim = env_params.y_dim
    n_cities = env_params.n_cities
    max_rails_between_cities = env_params.max_rails_between_cities
    max_rails_in_city = env_params.max_rails_in_city
    seed = env_params.seed

    # Observation parameters
    observation_tree_depth = env_params.observation_tree_depth
    observation_radius = env_params.observation_radius
    observation_max_path_depth = env_params.observation_max_path_depth

    # Training parameters
    eps_start = train_params.eps_start
    eps_end = train_params.eps_end
    eps_decay = train_params.eps_decay
    n_episodes = train_params.n_episodes
    checkpoint_interval = train_params.checkpoint_interval
    n_eval_episodes = train_params.n_evaluation_episodes

    # Set the seeds
    random.seed(seed)
    np.random.seed(seed)

    # Break agents from time to time
    malfunction_parameters = MalfunctionParameters(
        malfunction_rate=1. / 10000,  # Rate of malfunctions
        min_duration=15,  # Minimal duration
        max_duration=50  # Max duration
    )

    # Observation builder
    predictor = ShortestPathPredictorForRailEnv(observation_max_path_depth)
    tree_observation = TreeObsForRailEnv(max_depth=observation_tree_depth, predictor=predictor)

    # Fraction of trains with each speed
    speed_profiles = {
        1.: 1.0,  # Fast passenger train
        1. / 2.: 0.0,  # Fast freight train
        1. / 3.: 0.0,  # Slow commuter train
        1. / 4.: 0.0  # Slow freight train
    }

    # Setup the environment
    env = RailEnv(
        width=x_dim,
        height=y_dim,
        rail_generator=sparse_rail_generator(
            max_num_cities=n_cities,
            grid_mode=False,
            max_rails_between_cities=max_rails_between_cities,
            max_rails_in_city=max_rails_in_city
        ),
        schedule_generator=sparse_schedule_generator(speed_profiles),
        number_of_agents=n_agents,
        malfunction_generator_and_process_data=malfunction_from_params(malfunction_parameters),
        obs_builder_object=tree_observation,
        random_seed=seed
    )

    env.reset(regenerate_schedule=True, regenerate_rail=True)

    # Setup renderer
    if train_params.render:
        env_renderer = RenderTool(env, gl="PGL")

    # Calculate the state size given the depth of the tree observation and the number of features
    n_features_per_node = env.obs_builder.observation_dim
    n_nodes = 0
    for i in range(observation_tree_depth + 1):
        n_nodes += np.power(4, i)
    state_size = n_features_per_node * n_nodes
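    # e.g. with observation_tree_depth=2 and 11 features per node:
    # n_nodes = 4**0 + 4**1 + 4**2 = 21, so state_size = 11 * 21 = 231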

    # The action space of flatland is 5 discrete actions
    action_size = 5

    # Max number of steps per episode
    # This is the official formula used during evaluations
    # See details in flatland.envs.schedule_generators.sparse_schedule_generator
    max_steps = int(4 * 2 * (env.height + env.width + (n_agents / n_cities)))

    action_count = [0] * action_size
    action_dict = dict()
    agent_obs = [None] * env.get_num_agents()
    agent_prev_obs = [None] * env.get_num_agents()
    agent_prev_action = [2] * env.get_num_agents()
    update_values = False
    smoothed_normalized_score = -1.0
    smoothed_eval_normalized_score = -1.0
    smoothed_completion = 0.0
    smoothed_eval_completion = 0.0

    # Double Dueling DQN policy
    policy = DDDQNPolicy(state_size, action_size, train_params)

    # TensorBoard writer
    writer = SummaryWriter()
    writer.add_hparams(vars(train_params), {})
    writer.add_hparams(vars(env_params), {})

    training_timer = Timer()
    training_timer.start()

    print("\n🚉 Training {} trains on {}x{} grid for {} episodes, evaluating on {} episodes every {} episodes.\n"
          .format(env.get_num_agents(), x_dim, y_dim, n_episodes, n_eval_episodes, checkpoint_interval))

    for episode_idx in range(n_episodes + 1):
        # Timers
        step_timer = Timer()
        reset_timer = Timer()
        learn_timer = Timer()
        preproc_timer = Timer()

        # Reset environment
        reset_timer.start()
        obs, info = env.reset(regenerate_rail=True, regenerate_schedule=True)
        reset_timer.end()

        if train_params.render:
            env_renderer.set_new_rail()

        score = 0
        nb_steps = 0
        actions_taken = []

        # Build agent specific observations
        for agent in env.get_agent_handles():
            if obs[agent]:
                agent_obs[agent] = normalize_observation(obs[agent], observation_tree_depth, observation_radius=observation_radius)
                agent_prev_obs[agent] = agent_obs[agent].copy()

        # Run episode
        for step in range(max_steps - 1):
            for agent in env.get_agent_handles():
                if info['action_required'][agent]:
                    # If an action is required, we want to store the obs at that step as well as the action
                    update_values = True
                    action = policy.act(agent_obs[agent], eps=eps_start)
                    action_count[action] += 1
                    actions_taken.append(action)
                else:
                    update_values = False
                    action = 0
                action_dict.update({agent: action})

            # Environment step
            step_timer.start()
            next_obs, all_rewards, done, info = env.step(action_dict)
            step_timer.end()

            if train_params.render and episode_idx % checkpoint_interval == 0:
                env_renderer.render_env(
                    show=True,
                    frames=False,
                    show_observations=False,
                    show_predictions=False
                )

            for agent in range(env.get_num_agents()):
                # Update replay buffer and train agent
                # Only update the values when we are done or when an action was taken and thus relevant information is present
                if update_values or done[agent]:
                    learn_timer.start()
                    policy.step(agent_prev_obs[agent], agent_prev_action[agent], all_rewards[agent], agent_obs[agent], done[agent])
                    learn_timer.end()

                    agent_prev_obs[agent] = agent_obs[agent].copy()
                    agent_prev_action[agent] = action_dict[agent]

                # Preprocess the new observations
                if next_obs[agent]:
                    preproc_timer.start()
                    agent_obs[agent] = normalize_observation(next_obs[agent], observation_tree_depth, observation_radius=observation_radius)
                    preproc_timer.end()

                score += all_rewards[agent]

            nb_steps = step

            if done['__all__']:
                break

        # Epsilon decay
        eps_start = max(eps_end, eps_decay * eps_start)

        # Collection information about training
        tasks_finished = sum(done[idx] for idx in env.get_agent_handles())
        completion = tasks_finished / max(1, env.get_num_agents())
        normalized_score = score / (max_steps * env.get_num_agents())
        action_probs = action_count / np.sum(action_count)
        action_count = [1] * action_size

        # Smoothed values for terminal display and for more stable hyper-parameter tuning
        smoothing = 0.99
        smoothed_normalized_score = smoothed_normalized_score * smoothing + normalized_score * (1.0 - smoothing)
        smoothed_completion = smoothed_completion * smoothing + completion * (1.0 - smoothing)

        # Print logs
        if episode_idx % checkpoint_interval == 0:
            torch.save(policy.qnetwork_local, './checkpoints/multi-' + str(episode_idx) + '.pth')
            if train_params.render:
                env_renderer.close_window()

        print(
            '\r🚂 Episode {}'
            '\t 🏆 Score: {:.3f}'
            ' Avg: {:.3f}'
            '\t 💯 Done: {:.2f}%'
            ' Avg: {:.2f}%'
            '\t 🎲 Epsilon: {:.2f} '
            '\t 🔀 Action Probs: {}'.format(
                episode_idx,
                normalized_score,
                smoothed_normalized_score,
                100 * completion,
                100 * smoothed_completion,
                eps_start,
                format_action_prob(action_probs)
            ), end=" ")

        # Evaluate policy
        if episode_idx % train_params.checkpoint_interval == 0:
            scores, completions, nb_steps_eval = eval_policy(env, policy, n_eval_episodes, max_steps)
            writer.add_scalar("evaluation/scores_min", np.min(scores), episode_idx)
            writer.add_scalar("evaluation/scores_max", np.max(scores), episode_idx)
            writer.add_scalar("evaluation/scores_mean", np.mean(scores), episode_idx)
            writer.add_scalar("evaluation/scores_std", np.std(scores), episode_idx)
            writer.add_histogram("evaluation/scores", np.array(scores), episode_idx)
            writer.add_scalar("evaluation/completions_min", np.min(completions), episode_idx)
            writer.add_scalar("evaluation/completions_max", np.max(completions), episode_idx)
            writer.add_scalar("evaluation/completions_mean", np.mean(completions), episode_idx)
            writer.add_scalar("evaluation/completions_std", np.std(completions), episode_idx)
            writer.add_histogram("evaluation/completions", np.array(completions), episode_idx)
            writer.add_scalar("evaluation/nb_steps_min", np.min(nb_steps_eval), episode_idx)
            writer.add_scalar("evaluation/nb_steps_max", np.max(nb_steps_eval), episode_idx)
            writer.add_scalar("evaluation/nb_steps_mean", np.mean(nb_steps_eval), episode_idx)
            writer.add_scalar("evaluation/nb_steps_std", np.std(nb_steps_eval), episode_idx)
            writer.add_histogram("evaluation/nb_steps", np.array(nb_steps_eval), episode_idx)

            smoothing = 0.9
            smoothed_eval_normalized_score = smoothed_eval_normalized_score * smoothing + np.mean(scores) * (1.0 - smoothing)
            smoothed_eval_completion = smoothed_eval_completion * smoothing + np.mean(completions) * (1.0 - smoothing)
            writer.add_scalar("evaluation/smoothed_score", smoothed_eval_normalized_score, episode_idx)
            writer.add_scalar("evaluation/smoothed_completion", smoothed_eval_completion, episode_idx)

        # Save logs to tensorboard
        writer.add_scalar("training/score", normalized_score, episode_idx)
        writer.add_scalar("training/smoothed_score", smoothed_normalized_score, episode_idx)
        writer.add_scalar("training/completion", np.mean(completion), episode_idx)
        writer.add_scalar("training/smoothed_completion", np.mean(smoothed_completion), episode_idx)
        writer.add_scalar("training/nb_steps", nb_steps, episode_idx)
        writer.add_histogram("actions/distribution", np.array(actions_taken), episode_idx)
        writer.add_scalar("actions/nothing", action_probs[RailEnvActions.DO_NOTHING], episode_idx)
        writer.add_scalar("actions/left", action_probs[RailEnvActions.MOVE_LEFT], episode_idx)
        writer.add_scalar("actions/forward", action_probs[RailEnvActions.MOVE_FORWARD], episode_idx)
        writer.add_scalar("actions/right", action_probs[RailEnvActions.MOVE_RIGHT], episode_idx)
        writer.add_scalar("actions/stop", action_probs[RailEnvActions.STOP_MOVING], episode_idx)
        writer.add_scalar("training/epsilon", eps_start, episode_idx)
        writer.add_scalar("training/buffer_size", len(policy.memory), episode_idx)
        writer.add_scalar("training/loss", policy.loss, episode_idx)
        writer.add_scalar("timer/reset", reset_timer.get(), episode_idx)
        writer.add_scalar("timer/step", step_timer.get(), episode_idx)
        writer.add_scalar("timer/learn", learn_timer.get(), episode_idx)
        writer.add_scalar("timer/preproc", preproc_timer.get(), episode_idx)
        writer.add_scalar("timer/total", training_timer.get_current(), episode_idx)
Example 15
agent_next_obs = [None] * env.get_num_agents()
agent = Agent(state_size, action_size)
with path(torch_training.Nets, "navigator_checkpoint1000.pth") as file_in:
    agent.qnetwork_local.load_state_dict(torch.load(file_in))

record_images = False
frame_step = 0

for trials in range(1, n_trials + 1):

    # Reset environment
    obs, info = env.reset(True, True)
    env_renderer.reset()
    # Build agent specific observations
    for a in range(env.get_num_agents()):
        agent_obs[a] = normalize_observation(obs[a], tree_depth, observation_radius=10)
    # Reset score and done
    score = 0
    env_done = 0

    # Run episode
    for step in range(max_steps):

        # Action
        for a in range(env.get_num_agents()):
            if info['action_required'][a]:
                action = agent.act(agent_obs[a], eps=0.)

            else:
                action = 0


def train_agent(n_episodes):
    # Environment parameters
    n_agents = 1
    x_dim = 25
    y_dim = 25
    n_cities = 4
    max_rails_between_cities = 2
    max_rails_in_city = 3
    seed = 42

    # Observation parameters
    observation_tree_depth = 2
    observation_radius = 10

    # Exploration parameters
    eps_start = 1.0
    eps_end = 0.01
    eps_decay = 0.997  # for 2500ts

    # Set the seeds
    random.seed(seed)
    np.random.seed(seed)

    # Observation builder
    tree_observation = TreeObsForRailEnv(max_depth=observation_tree_depth)

    # Setup the environment
    env = RailEnv(width=x_dim,
                  height=y_dim,
                  rail_generator=sparse_rail_generator(
                      max_num_cities=n_cities,
                      seed=seed,
                      grid_mode=False,
                      max_rails_between_cities=max_rails_between_cities,
                      max_rails_in_city=max_rails_in_city),
                  schedule_generator=sparse_schedule_generator(),
                  number_of_agents=n_agents,
                  obs_builder_object=tree_observation)

    env.reset(True, True)

    # Calculate the state size given the depth of the tree observation and the number of features
    n_features_per_node = env.obs_builder.observation_dim
    n_nodes = 0
    for i in range(observation_tree_depth + 1):
        n_nodes += np.power(4, i)
    state_size = n_features_per_node * n_nodes

    # The action space of flatland is 5 discrete actions
    action_size = 5

    # Max number of steps per episode
    # This is the official formula used during evaluations
    max_steps = int(4 * 2 * (env.height + env.width + (n_agents / n_cities)))
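    # e.g. here: int(4 * 2 * (25 + 25 + 1 / 4)) = 402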

    action_dict = dict()

    # And some variables to keep track of the progress
    scores_window = deque(maxlen=100)  # todo smooth when rendering instead
    completion_window = deque(maxlen=100)
    scores = []
    completion = []
    action_count = [0] * action_size
    agent_obs = [None] * env.get_num_agents()
    agent_prev_obs = [None] * env.get_num_agents()
    agent_prev_action = [2] * env.get_num_agents()
    update_values = False

    # Training parameters
    training_parameters = {
        'buffer_size': int(1e5),
        'batch_size': 32,
        'update_every': 8,
        'learning_rate': 0.5e-4,
        'tau': 1e-3,
        'gamma': 0.99,
        'buffer_min_size': 0,
        'hidden_size': 256,
        'use_gpu': False
    }

    # Double Dueling DQN policy
    policy = DDDQNPolicy(state_size, action_size,
                         Namespace(**training_parameters))

    for episode_idx in range(n_episodes):
        score = 0

        # Reset environment
        obs, info = env.reset(regenerate_rail=True, regenerate_schedule=True)

        # Build agent specific observations
        for agent in env.get_agent_handles():
            if obs[agent]:
                agent_obs[agent] = normalize_observation(
                    obs[agent],
                    observation_tree_depth,
                    observation_radius=observation_radius)
                agent_prev_obs[agent] = agent_obs[agent].copy()

        # Run episode
        for step in range(max_steps - 1):
            for agent in env.get_agent_handles():
                if info['action_required'][agent]:
                    # If an action is required, we want to store the obs at that step as well as the action
                    update_values = True
                    action = policy.act(agent, agent_obs[agent], eps=eps_start)
                    action_count[action] += 1
                else:
                    update_values = False
                    action = 0
                action_dict.update({agent: action})

            # Environment step
            next_obs, all_rewards, done, info = env.step(action_dict)

            # Update replay buffer and train agent
            for agent in range(env.get_num_agents()):
                # Only update the values when we are done or when an action was taken and thus relevant information is present
                if update_values or done[agent]:
                    policy.step(agent, agent_prev_obs[agent],
                                agent_prev_action[agent], all_rewards[agent],
                                agent_obs[agent], done[agent])

                    agent_prev_obs[agent] = agent_obs[agent].copy()
                    agent_prev_action[agent] = action_dict[agent]

                if next_obs[agent]:
                    agent_obs[agent] = normalize_observation(
                        next_obs[agent],
                        observation_tree_depth,
                        observation_radius=observation_radius)

                score += all_rewards[agent]

            if done['__all__']:
                break

        # Epsilon decay
        eps_start = max(eps_end, eps_decay * eps_start)
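        # With eps_decay = 0.997, epsilon reaches eps_end = 0.01 after roughly
        # ln(0.01) / ln(0.997) ≈ 1530 episodes and stays clamped there.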

        # Collect information about training
        tasks_finished = np.sum(
            [int(done[idx]) for idx in env.get_agent_handles()])
        completion_window.append(tasks_finished / max(1, env.get_num_agents()))
        scores_window.append(score / (max_steps * env.get_num_agents()))
        completion.append((np.mean(completion_window)))
        scores.append(np.mean(scores_window))
        action_probs = action_count / np.sum(action_count)

        if episode_idx % 100 == 0:
            end = "\n"
            torch.save(policy.qnetwork_local,
                       './checkpoints/single-' + str(episode_idx) + '.pth')
            action_count = [1] * action_size
        else:
            end = " "

        print(
            '\rTraining {} agents on {}x{}\t Episode {}\t Average Score: {:.3f}\tDones: {:.2f}%\tEpsilon: {:.2f} \t Action Probabilities: \t {}'
            .format(env.get_num_agents(), x_dim, y_dim, episode_idx,
                    np.mean(scores_window), 100 * np.mean(completion_window),
                    eps_start, action_probs),
            end=end)

    # Plot overall training progress at the end
    plt.plot(scores)
    plt.show()

    plt.plot(completion)
    plt.show()
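# A minimal usage sketch (hypothetical entry point, not part of the original
# example); 2500 episodes matches the eps_decay tuning noted above:
if __name__ == "__main__":
    train_agent(n_episodes=2500)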
Example no. 17
            no_ops_mode = False

            if not check_if_all_blocked(env=local_env):
                time_start = time.time()
                action_dict = {}
                for agent in range(nb_agents):
                    if observation[agent] and info['action_required'][agent]:
                        if agent in agent_last_obs and np.all(
                                agent_last_obs[agent] == observation[agent]):
                            # cache hit
                            action = agent_last_action[agent]
                            nb_hit += 1
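                            # Reusing the cached action is safe because the
                            # policy is greedy (eps=0.0): identical observations
                            # always yield the same action.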
                        else:
                            # otherwise, run normalization and inference
                            norm_obs = normalize_observation(
                                observation[agent],
                                tree_depth=observation_tree_depth,
                                observation_radius=observation_radius)
                            action = policy.act(norm_obs, eps=0.0)

                        action_dict[agent] = action

                        if USE_ACTION_CACHE:
                            agent_last_obs[agent] = observation[agent]
                            agent_last_action[agent] = action
                agent_time = time.time() - time_start
                time_taken_by_controller.append(agent_time)

                time_start = time.time()
                _, all_rewards, done, info = remote_client.env_step(
                    action_dict)
                step_time = time.time() - time_start
Example no. 18
def main(argv):
    try:
        opts, args = getopt.getopt(argv, "n:", ["n_trials="])
    except getopt.GetoptError:
        print('test_navigation_single_agent.py -n <n_trials>')
        sys.exit(2)
    for opt, arg in opts:
        if opt in ('-n', '--n_trials'):
            n_trials = int(arg)

    random.seed(1)
    np.random.seed(1)

    # Parameters for the Environment
    x_dim = 35
    y_dim = 35
    n_agents = 1
    max_num_cities = 3
    max_rails_between_cities = 2
    max_rails_in_city = 3

    # We are training an Agent using the Tree Observation with depth 2
    #observation_builder = TreeObsForRailEnv(max_depth=2, predictor = ShortestPathPredictorForRailEnv(20))

    # Use the malfunction generator to break agents from time to time
    stochastic_data = {
        'malfunction_rate': 80,  # Rate of malfunction occurrence for a single agent
        'min_duration': 15,  # Minimal duration of malfunction
        'max_duration': 50  # Max duration of malfunction
    }

    # Custom observation builder
    tree_depth = 2
    TreeObservation = TreeObsForRailEnv(
        max_depth=tree_depth, predictor=ShortestPathPredictorForRailEnv(20))

    np.savetxt(fname=path.join('NetsTest', 'info.txt'),
               X=[
                   x_dim, y_dim, n_agents, max_num_cities,
                   max_rails_between_cities, max_rails_in_city, tree_depth
               ],
               delimiter=';')

    # Different agent types (trains) with different speeds.
    speed_ration_map = {
        1.: 1.,  # Fast passenger train
        1. / 2.: 0.0,  # Fast freight train
        1. / 3.: 0.0,  # Slow commuter train
        1. / 4.: 0.0  # Slow freight train
    }

    env = RailEnv(
        width=x_dim,
        height=y_dim,
        rail_generator=sparse_rail_generator(
            max_num_cities=max_num_cities,
            # Number of cities in map (where train stations are)
            seed=14,  # Random seed
            grid_mode=False,
            max_rails_between_cities=max_rails_between_cities,
            max_rails_in_city=max_rails_in_city),
        schedule_generator=sparse_schedule_generator(speed_ration_map),
        number_of_agents=n_agents,
        obs_builder_object=TreeObservation)
    env.reset()

    env_renderer = RenderTool(
        env,
        gl="PILSVG",
    )
    #env_renderer = RenderTool(env, gl="PILSVG",
    #                      agent_render_variant=AgentRenderVariant.AGENT_SHOWS_OPTIONS_AND_BOX,
    #                      show_debug=False,
    #                      screen_height=(1080*0.8),  # Adjust these parameters to fit your resolution
    #                      screen_width=(1920*0.8))
    num_features_per_node = env.obs_builder.observation_dim

    nr_nodes = 0
    for i in range(tree_depth + 1):
        nr_nodes += np.power(4, i)
    state_size = num_features_per_node * nr_nodes
    action_size = 5

    # We set the number of episodes we would like to train on
    if 'n_trials' not in locals():
        n_trials = 15000
    max_steps = int(3 * (env.height + env.width))
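    # For the 35x35 grid above: 3 * (35 + 35) = 210 steps per episode.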
    eps = 1.
    eps_end = 0.005
    eps_decay = 0.9995

    # And some variables to keep track of the performance
    action_dict = dict()
    final_action_dict = dict()
    action_prob_list = []
    scores_window = deque(maxlen=100)
    done_window = deque(maxlen=100)
    scores = []
    scores_list = []
    dones_list_window = []
    dones_list = []
    action_prob = [0] * action_size
    agent_obs = [None] * env.get_num_agents()
    agent_next_obs = [None] * env.get_num_agents()  # Useless
    agent = RandomAgent(state_size, action_size)

    #agent.qnetwork_local.load_state_dict(torch.load(path.join('NetsTest' , 'navigator_checkpoint_glob10.pth')))
    #agent.qnetwork_local.load_state_dict(torch.load(path.join('NetsLoad' , 'navigator_checkpoint2100.pth')))

    record_images = False
    frame_step = 0

    for trials in range(1, n_trials + 1):

        # Reset environment
        obs, info = env.reset()  #(True, True)
        env_renderer.reset()
        # Build agent specific observations
        for a in range(env.get_num_agents()):
            agent_obs[a] = normalize_observation(
                obs[a], tree_depth, observation_radius=10)
        # Reset score and done
        score = 0
        env_done = 0

        # Run episode
        for step in range(max_steps):

            # Action
            for a in range(env.get_num_agents()):
                if info['action_required'][a]:
                    action = agent.act(agent_obs[a])  #, eps=0.)
                    action_prob[action] += 1

                else:
                    action = 0

                action_dict.update({a: action})
            # Environment step
            obs, all_rewards, done, deadlocks, info = env.step(action_dict)

            #env_renderer.render_env(show=True, show_predictions=True, show_observations=False)
            # Build agent specific observations and normalize
            for a in range(env.get_num_agents()):
                if obs[a]:
                    agent_obs[a] = normalize_observation(obs[a],
                                                         tree_depth,
                                                         observation_radius=10)

                score += all_rewards[a] / env.get_num_agents()

            if done['__all__']:
                break

        # Collect information about training
        tasks_finished = 0
        for _idx in range(env.get_num_agents()):
            if done[_idx] == 1:
                tasks_finished += 1
        done_window.append(tasks_finished / max(1, env.get_num_agents()))
        scores_window.append(score / max_steps)  # save most recent score
        scores.append(np.mean(scores_window))
        dones_list.append(tasks_finished / max(1, env.get_num_agents()))
        dones_list_window.append((np.mean(done_window)))
        scores_list.append(score / max_steps)

        action_prob_list.append(action_prob / np.sum(action_prob))

        print(
            '\rTesting {} Agents on ({},{}).\t Episode {}\t Score: {:.3f}\tDones: {:.2f}%\t Action Probabilities: \t {}'
            .format(env.get_num_agents(), x_dim, y_dim, trials,
                    score / max_steps,
                    100 * tasks_finished / max(1, env.get_num_agents()),
                    action_prob / np.sum(action_prob)),
            end=" ")

        if trials % 100 == 0:
            action_prob = [1] * action_size

        if trials % 50 == 0:

            #np.savetxt(fname=path.join('Nets' , 'scores_metric.txt'), X=scores)
            #np.savetxt(fname=path.join('Nets' , 'dones_metric.txt'), X=dones_list)
            np.savetxt(fname=path.join('NetsTest', 'test_metrics.csv'),
                       X=np.transpose(
                           np.asarray([
                               scores_list, scores, dones_list,
                               dones_list_window
                           ])),
                       delimiter=';',
                       newline='\n')
            np.savetxt(fname=path.join('NetsTest', 'test_action_prob.csv'),
                       X=np.asarray(action_prob_list),
                       delimiter=';',
                       newline='\n')
Example no. 19
def main(argv):
    try:
        opts, args = getopt.getopt(argv, "n:", ["n_trials="])
    except getopt.GetoptError:
        print('training_navigation.py -n <n_trials>')
        sys.exit(2)
    for opt, arg in opts:
        if opt in ('-n', '--n_trials'):
            n_trials = int(arg)

    random.seed(1)
    np.random.seed(1)

    training = True
    weights = False

    # Setting these 2 parameters to True can slow down training
    visuals = False
    sleep_for_animation = False

    trained_epochs = 1  # set to the trained epoch number when resuming training or validating; else 1

    if training:
        train_set = True
    else:
        train_set = False
        weights = True

    dataset_type = "MANUAL"  # set as MANUAL if we want to specify our own parameters, AUTO otherwise
    # Parameters for the Environment
    x_dim = 25
    y_dim = 25
    n_agents = 4

    # Use the malfunction generator to break agents from time to time
    stochastic_data = {
        'prop_malfunction': 0.05,  # Percentage of defective agents
        'malfunction_rate': 100,  # Rate of malfunction occurrence
        'min_duration': 20,  # Minimal duration of malfunction
        'max_duration': 50  # Max duration of malfunction
    }

    # Custom observation builder
    tree_observation = TreeObsForRailEnv(
        max_depth=2, predictor=ShortestPathPredictorForRailEnv(30))

    # Different agent types (trains) with different speeds.
    speed_ration_map = {
        1.: 0.25,  # Fast passenger train
        1. / 2.: 0.25,  # Fast freight train
        1. / 3.: 0.25,  # Slow commuter train
        1. / 4.: 0.25  # Slow freight train
    }

    if dataset_type == "MANUAL":
        env, random_seed = train_validate_env_generator_params(
            train_set, n_agents, x_dim, y_dim, tree_observation,
            stochastic_data, speed_ration_map)
    else:
        env, random_seed = train_validate_env_generator(
            train_set, tree_observation)

    # Given the depth of the tree observation and the number of features per node we get the following state_size
    num_features_per_node = env.obs_builder.observation_dim
    tree_depth = 2
    nr_nodes = 0
    for i in range(tree_depth + 1):
        nr_nodes += np.power(4, i)
    state_size = num_features_per_node * nr_nodes
    # state_size = state_size + n_agents*12
    # state_size = num_features_per_node
    # The action space of flatland is 5 discrete actions
    action_size = 5

    # We set the number of episodes we would like to train on
    if 'n_trials' not in locals():
        n_trials = 15000

    # And the max number of steps we want to take per episode
    max_steps = int(4 * 2 * (20 + env.height + env.width))

    columns = [
        'Agents', 'X_DIM', 'Y_DIM', 'TRIAL_NO', 'SCORE', 'DONE_RATIO', 'STEPS',
        'ACTION_PROB'
    ]
    df_all_results = pd.DataFrame(columns=columns)

    # Define training parameters
    eps = 1.
    eps_end = 0.005
    eps_decay = 0.998

    # And some variables to keep track of the progress
    action_dict = dict()
    final_action_dict = dict()
    scores_window = deque(maxlen=100)
    done_window = deque(maxlen=100)
    scores = []
    dones_list = []
    action_prob = [0] * action_size

    # Now we load a Double dueling DQN agent
    agent = Agent(state_size, action_size)

    trial_start = 1
    if weights:
        trial_start = trained_epochs
        eps = max(eps_end, (np.power(eps_decay, trial_start)) * eps)
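        # Restores epsilon to the value it would have decayed to after
        # trial_start episodes, so resumed training continues the schedule.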
        weight_file = f"navigator_checkpoint{trial_start}.pth"
        with open("./Nets/" + weight_file, "rb") as file_in:
            agent.qnetwork_local.load_state_dict(torch.load(file_in))

    for trials in range(trial_start, n_trials + 1):

        if dataset_type == "MANUAL":
            env, random_seed = train_validate_env_generator_params(
                train_set, n_agents, x_dim, y_dim, tree_observation,
                stochastic_data, speed_ration_map)
        else:
            env, random_seed = train_validate_env_generator(
                train_set, tree_observation)
        # Reset environment
        obs, info = env.reset(regenerate_rail=True,
                              regenerate_schedule=True,
                              activate_agents=False,
                              random_seed=random_seed)

        env_renderer = RenderTool(
            env,
            gl="PILSVG",
        )
        if visuals:
            env_renderer.render_env(show=True,
                                    frames=True,
                                    show_observations=True)

        x_dim, y_dim, n_agents = env.width, env.height, env.get_num_agents()

        agent_obs = [None] * n_agents
        agent_next_obs = [None] * n_agents
        agent_obs_buffer = [None] * n_agents
        agent_action_buffer = [2] * n_agents
        cummulated_reward = np.zeros(n_agents)
        update_values = [False] * n_agents
        # Build agent specific observations
        for a in range(n_agents):
            # if obs[a] is not None:
            #     agent_obs[a] = obs[a]
            if obs[a]:
                agent_obs[a] = normalize_observation(obs[a],
                                                     tree_depth,
                                                     observation_radius=10)
                agent_obs_buffer[a] = agent_obs[a].copy()

        # Reset score and done
        score = 0
        env_done = 0
        step = 0
        # Run episode
        while True:
            # Action
            for a in range(n_agents):
                if info['action_required'][a]:
                    # If an action is required, we want to store the obs at that step as well as the action
                    update_values[a] = True
                    action = agent.act(agent_obs[a], eps=eps)
                    action_prob[action] += 1
                else:
                    update_values[a] = False
                    action = 0
                action_dict.update({a: action})

            # Environment step
            next_obs, all_rewards, done, info = env.step(action_dict)
            step += 1
            if visuals:
                env_renderer.render_env(show=True,
                                        frames=True,
                                        show_observations=True)
                if sleep_for_animation:
                    time.sleep(0.5)
            for a in range(n_agents):
                # Only update the values when we are done or when an action was taken and thus relevant information is present
                if update_values[a] or done[a]:
                    agent.step(agent_obs_buffer[a], agent_action_buffer[a],
                               all_rewards[a], agent_obs[a], done[a])
                    cummulated_reward[a] = 0.

                    agent_obs_buffer[a] = agent_obs[a].copy()
                    agent_action_buffer[a] = action_dict[a]
                if next_obs[a]:
                    agent_obs[a] = normalize_observation(next_obs[a],
                                                         tree_depth,
                                                         observation_radius=10)
                # if next_obs[a] is not None:
                #     agent_obs[a] = next_obs[a]
                score += all_rewards[a] / n_agents

            # Copy observation
            if done['__all__']:
                env_done = 1
                break

        # Epsilon decay
        eps = max(eps_end, eps_decay * eps)  # decrease epsilon

        # Collect information about training
        tasks_finished = 0
        for current_agent in env.agents:
            if current_agent.status == RailAgentStatus.DONE_REMOVED:
                tasks_finished += 1
        done_window.append(tasks_finished / max(1, env.get_num_agents()))
        scores_window.append(score / max_steps)  # save most recent score
        scores.append(np.mean(scores_window))
        dones_list.append((np.mean(done_window)))

        print(
            '\rTraining {} Agents on ({},{}).\t Episode {}\t Average Score: {:.3f}\tDones: {:.2f}%\tEpsilon: {:.2f} \t Action Probabilities: \t {}'
            .format(env.get_num_agents(), x_dim, y_dim, trials,
                    np.mean(scores_window), 100 * np.mean(done_window), eps,
                    action_prob / np.sum(action_prob)),
            end=" ")

        data = [[
            n_agents, x_dim, y_dim, trials,
            np.mean(scores_window), 100 * np.mean(done_window), step,
            action_prob / np.sum(action_prob)
        ]]

        df_cur = pd.DataFrame(data, columns=columns)
        df_all_results = pd.concat([df_all_results, df_cur])

        df_all_results.to_csv(
            f'{dataset_type}_DQN_TrainingResults_{n_agents}_{x_dim}_{y_dim}.csv',
            index=False)

        if trials % 100 == 0:
            print(
                '\rTraining {} Agents on ({},{}).\t Episode {}\t Average Score: {:.3f}\tDones: {:.2f}%\tEpsilon: {:.2f} \t Action Probabilities: \t {}'
                .format(env.get_num_agents(), x_dim, y_dim, trials,
                        np.mean(scores_window), 100 * np.mean(done_window),
                        eps, action_prob / np.sum(action_prob)))
            torch.save(agent.qnetwork_local.state_dict(),
                       './Nets/navigator_checkpoint' + str(trials) + '.pth')
            action_prob = [1] * action_size

    # Plot overall training progress at the end
    plt.plot(scores)
    plt.show()
Example no. 20
def train_agent(train_params, train_env_params, eval_env_params, obs_params):
    # Environment parameters
    n_agents = train_env_params.n_agents
    x_dim = train_env_params.x_dim
    y_dim = train_env_params.y_dim
    n_cities = train_env_params.n_cities
    max_rails_between_cities = train_env_params.max_rails_between_cities
    max_rails_in_city = train_env_params.max_rails_in_city
    seed = train_env_params.seed

    # Unique ID for this training
    now = datetime.now()
    training_id = now.strftime('%y%m%d%H%M%S')

    # Observation parameters
    observation_tree_depth = obs_params.observation_tree_depth
    observation_radius = obs_params.observation_radius
    observation_max_path_depth = obs_params.observation_max_path_depth

    # Training parameters
    eps_start = train_params.eps_start
    eps_end = train_params.eps_end
    eps_decay = train_params.eps_decay
    n_episodes = train_params.n_episodes
    checkpoint_interval = train_params.checkpoint_interval
    n_eval_episodes = train_params.n_evaluation_episodes
    restore_replay_buffer = train_params.restore_replay_buffer
    save_replay_buffer = train_params.save_replay_buffer

    # Set the seeds
    random.seed(seed)
    np.random.seed(seed)

    # Observation builder
    predictor = ShortestPathPredictorForRailEnv(observation_max_path_depth)
    tree_observation = TreeObsForRailEnv(max_depth=observation_tree_depth, predictor=predictor)

    # Setup the environments
    train_env = create_rail_env(train_env_params, tree_observation)
    train_env.reset(regenerate_schedule=True, regenerate_rail=True)
    eval_env = create_rail_env(eval_env_params, tree_observation)
    eval_env.reset(regenerate_schedule=True, regenerate_rail=True)

    # Setup renderer
    if train_params.render:
        env_renderer = RenderTool(train_env, gl="PGL")

    # Calculate the state size given the depth of the tree observation and the number of features
    n_features_per_node = train_env.obs_builder.observation_dim
    n_nodes = sum([np.power(4, i) for i in range(observation_tree_depth + 1)])
    state_size = n_features_per_node * n_nodes

    # The action space of flatland is 5 discrete actions
    action_size = 5

    # Max number of steps per episode
    # This is the official formula used during evaluations
    # See details in flatland.envs.schedule_generators.sparse_schedule_generator
    # max_steps = int(4 * 2 * (env.height + env.width + (n_agents / n_cities)))
    max_steps = train_env._max_episode_steps

    action_count = [0] * action_size
    action_dict = dict()
    agent_obs = [None] * n_agents
    agent_prev_obs = [None] * n_agents
    agent_prev_action = [2] * n_agents
    update_values = [False] * n_agents

    # Smoothed values used as target for hyperparameter tuning
    smoothed_normalized_score = -1.0
    smoothed_eval_normalized_score = -1.0
    smoothed_completion = 0.0
    smoothed_eval_completion = 0.0

    # Double Dueling DQN policy
    policy = DDDQNPolicy(state_size, action_size, train_params)

    # Loads existing replay buffer
    if restore_replay_buffer:
        try:
            policy.load_replay_buffer(restore_replay_buffer)
            policy.test()
        except RuntimeError as e:
            print("\n🛑 Couldn't load replay buffer, were the experiences generated using the same tree depth?")
            print(e)
            exit(1)

    print("\n💾 Replay buffer status: {}/{} experiences".format(len(policy.memory.memory), train_params.buffer_size))

    hdd = psutil.disk_usage('/')
    if save_replay_buffer and (hdd.free / (2 ** 30)) < 500.0:
        print("⚠️  Careful! Saving replay buffers will quickly consume a lot of disk space. You have {:.2f} GB left.".format(hdd.free / (2 ** 30)))

    # TensorBoard writer
    writer = SummaryWriter()
    writer.add_hparams(vars(train_params), {})
    writer.add_hparams(vars(train_env_params), {})
    writer.add_hparams(vars(obs_params), {})

    training_timer = Timer()
    training_timer.start()

    print("\n🚉 Training {} trains on {}x{} grid for {} episodes, evaluating on {} episodes every {} episodes. Training id '{}'.\n".format(
        train_env.get_num_agents(),
        x_dim, y_dim,
        n_episodes,
        n_eval_episodes,
        checkpoint_interval,
        training_id
    ))

    for episode_idx in range(n_episodes + 1):
        step_timer = Timer()
        reset_timer = Timer()
        learn_timer = Timer()
        preproc_timer = Timer()
        inference_timer = Timer()

        # Reset environment
        reset_timer.start()
        obs, info = train_env.reset(regenerate_rail=True, regenerate_schedule=True)
        reset_timer.end()

        if train_params.render:
            env_renderer.set_new_rail()

        score = 0
        nb_steps = 0
        actions_taken = []

        # Build initial agent-specific observations
        for agent in train_env.get_agent_handles():
            if obs[agent]:
                agent_obs[agent] = normalize_observation(obs[agent], observation_tree_depth, observation_radius=observation_radius)
                agent_prev_obs[agent] = agent_obs[agent].copy()

        # Run episode
        for step in range(max_steps - 1):
            inference_timer.start()
            for agent in train_env.get_agent_handles():
                if info['action_required'][agent]:
                    update_values[agent] = True
                    action = policy.act(agent_obs[agent], eps=eps_start)

                    action_count[action] += 1
                    actions_taken.append(action)
                else:
                    # An action is not required if the train hasn't joined the railway network,
                    # if it has already reached its target, or if it is currently malfunctioning.
                    update_values[agent] = False
                    action = 0
                action_dict.update({agent: action})
            inference_timer.end()

            # Environment step
            step_timer.start()
            next_obs, all_rewards, done, info = train_env.step(action_dict)
            step_timer.end()

            # Render an episode at some interval
            if train_params.render and episode_idx % checkpoint_interval == 0:
                env_renderer.render_env(
                    show=True,
                    frames=False,
                    show_observations=False,
                    show_predictions=False
                )

            # Update replay buffer and train agent
            for agent in train_env.get_agent_handles():
                if update_values[agent] or done['__all__']:
                    # Only learn from timesteps where something happened
                    learn_timer.start()
                    policy.step(agent_prev_obs[agent], agent_prev_action[agent], all_rewards[agent], agent_obs[agent], done[agent])
                    learn_timer.end()

                    agent_prev_obs[agent] = agent_obs[agent].copy()
                    agent_prev_action[agent] = action_dict[agent]

                # Preprocess the new observations
                if next_obs[agent]:
                    preproc_timer.start()
                    agent_obs[agent] = normalize_observation(next_obs[agent], observation_tree_depth, observation_radius=observation_radius)
                    preproc_timer.end()

                score += all_rewards[agent]

            nb_steps = step

            if done['__all__']:
                break

        # Epsilon decay
        eps_start = max(eps_end, eps_decay * eps_start)

        # Collect information about training
        tasks_finished = sum(done[idx] for idx in train_env.get_agent_handles())
        completion = tasks_finished / max(1, train_env.get_num_agents())
        normalized_score = score / (max_steps * train_env.get_num_agents())
        action_probs = action_count / np.sum(action_count)
        action_count = [1] * action_size

        smoothing = 0.99
        smoothed_normalized_score = smoothed_normalized_score * smoothing + normalized_score * (1.0 - smoothing)
        smoothed_completion = smoothed_completion * smoothing + completion * (1.0 - smoothing)
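        # Exponential moving average: smoothing = 0.99 gives an effective
        # averaging window of about 1 / (1 - 0.99) = 100 episodes.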

        # Print logs
        if episode_idx % checkpoint_interval == 0:
            torch.save(policy.qnetwork_local, './checkpoints/' + training_id + '-' + str(episode_idx) + '.pth')

            if save_replay_buffer:
                policy.save_replay_buffer('./replay_buffers/' + training_id + '-' + str(episode_idx) + '.pkl')

            if train_params.render:
                env_renderer.close_window()

        print(
            '\r🚂 Episode {}'
            '\t 🏆 Score: {:.3f}'
            ' Avg: {:.3f}'
            '\t 💯 Done: {:.2f}%'
            ' Avg: {:.2f}%'
            '\t 🎲 Epsilon: {:.3f} '
            '\t 🔀 Action Probs: {}'.format(
                episode_idx,
                normalized_score,
                smoothed_normalized_score,
                100 * completion,
                100 * smoothed_completion,
                eps_start,
                format_action_prob(action_probs)
            ), end=" ")

        # Evaluate policy and log results at some interval
        if episode_idx % checkpoint_interval == 0 and n_eval_episodes > 0:
            scores, completions, nb_steps_eval = eval_policy(eval_env, policy, train_params, obs_params)

            writer.add_scalar("evaluation/scores_min", np.min(scores), episode_idx)
            writer.add_scalar("evaluation/scores_max", np.max(scores), episode_idx)
            writer.add_scalar("evaluation/scores_mean", np.mean(scores), episode_idx)
            writer.add_scalar("evaluation/scores_std", np.std(scores), episode_idx)
            writer.add_histogram("evaluation/scores", np.array(scores), episode_idx)
            writer.add_scalar("evaluation/completions_min", np.min(completions), episode_idx)
            writer.add_scalar("evaluation/completions_max", np.max(completions), episode_idx)
            writer.add_scalar("evaluation/completions_mean", np.mean(completions), episode_idx)
            writer.add_scalar("evaluation/completions_std", np.std(completions), episode_idx)
            writer.add_histogram("evaluation/completions", np.array(completions), episode_idx)
            writer.add_scalar("evaluation/nb_steps_min", np.min(nb_steps_eval), episode_idx)
            writer.add_scalar("evaluation/nb_steps_max", np.max(nb_steps_eval), episode_idx)
            writer.add_scalar("evaluation/nb_steps_mean", np.mean(nb_steps_eval), episode_idx)
            writer.add_scalar("evaluation/nb_steps_std", np.std(nb_steps_eval), episode_idx)
            writer.add_histogram("evaluation/nb_steps", np.array(nb_steps_eval), episode_idx)

            smoothing = 0.9
            smoothed_eval_normalized_score = smoothed_eval_normalized_score * smoothing + np.mean(scores) * (1.0 - smoothing)
            smoothed_eval_completion = smoothed_eval_completion * smoothing + np.mean(completions) * (1.0 - smoothing)
            writer.add_scalar("evaluation/smoothed_score", smoothed_eval_normalized_score, episode_idx)
            writer.add_scalar("evaluation/smoothed_completion", smoothed_eval_completion, episode_idx)

        # Save logs to tensorboard
        writer.add_scalar("training/score", normalized_score, episode_idx)
        writer.add_scalar("training/smoothed_score", smoothed_normalized_score, episode_idx)
        writer.add_scalar("training/completion", np.mean(completion), episode_idx)
        writer.add_scalar("training/smoothed_completion", np.mean(smoothed_completion), episode_idx)
        writer.add_scalar("training/nb_steps", nb_steps, episode_idx)
        writer.add_histogram("actions/distribution", np.array(actions_taken), episode_idx)
        writer.add_scalar("actions/nothing", action_probs[RailEnvActions.DO_NOTHING], episode_idx)
        writer.add_scalar("actions/left", action_probs[RailEnvActions.MOVE_LEFT], episode_idx)
        writer.add_scalar("actions/forward", action_probs[RailEnvActions.MOVE_FORWARD], episode_idx)
        writer.add_scalar("actions/right", action_probs[RailEnvActions.MOVE_RIGHT], episode_idx)
        writer.add_scalar("actions/stop", action_probs[RailEnvActions.STOP_MOVING], episode_idx)
        writer.add_scalar("training/epsilon", eps_start, episode_idx)
        writer.add_scalar("training/buffer_size", len(policy.memory), episode_idx)
        writer.add_scalar("training/loss", policy.loss, episode_idx)
        writer.add_scalar("timer/reset", reset_timer.get(), episode_idx)
        writer.add_scalar("timer/step", step_timer.get(), episode_idx)
        writer.add_scalar("timer/learn", learn_timer.get(), episode_idx)
        writer.add_scalar("timer/preproc", preproc_timer.get(), episode_idx)
        writer.add_scalar("timer/total", training_timer.get_current(), episode_idx)
Example no. 21
def main(argv):
    try:
        opts, args = getopt.getopt(argv, "n:", ["n_trials="])
    except getopt.GetoptError:
        print('training_navigation.py -n <n_trials>')
        sys.exit(2)
    for opt, arg in opts:
        if opt in ('-n', '--n_trials'):
            n_trials = int(arg)

    random.seed(1)
    np.random.seed(1)

    #### Choose the desired setup ####

    multi_agent_setup = 1
    malfunctions_enabled = False
    agents_one_speed = True

    ##################################

    # Single agent (1)
    if multi_agent_setup == 1:
        x_dim = 35
        y_dim = 35
        n_agents = 1
        max_num_cities = 3
        max_rails_between_cities = 2
        max_rails_in_city = 3

    # Multi agent (3)
    if multi_agent_setup == 3:
        x_dim = 40
        y_dim = 40
        n_agents = 3
        max_num_cities = 4
        max_rails_between_cities = 2
        max_rails_in_city = 3

    # Multi agent (5)
    if multi_agent_setup == 5:
        x_dim = 16 * 3
        y_dim = 9 * 3
        n_agents = 5
        max_num_cities = 5
        max_rails_between_cities = 2
        max_rails_in_city = 3

    # Multi agent (10)
    if multi_agent_setup == 10:
        x_dim = 16 * 4
        y_dim = 9 * 4
        n_agents = 10
        max_num_cities = 9
        max_rails_between_cities = 5
        max_rails_in_city = 5

    # Use the malfunction generator to break agents from time to time
    stochastic_data = {
        'malfunction_rate': 80,  # Rate of malfunction occurrence for a single agent
        'min_duration': 15,  # Minimal duration of malfunction
        'max_duration': 50  # Max duration of malfunction
    }

    # Custom observation builder
    tree_depth = 2
    TreeObservation = TreeObsForRailEnv(
        max_depth=tree_depth, predictor=ShortestPathPredictorForRailEnv(20))

    np.savetxt(fname=path.join('Nets', 'info.txt'),
               X=[
                   x_dim, y_dim, n_agents, max_num_cities,
                   max_rails_between_cities, max_rails_in_city, tree_depth
               ],
               delimiter=';')

    # Different agent types (trains) with different speeds.
    if agents_one_speed:
        speed_ration_map = {
            1.: 1.,  # Fast passenger train
            1. / 2.: 0.0,  # Fast freight train
            1. / 3.: 0.0,  # Slow commuter train
            1. / 4.: 0.0  # Slow freight train
        }
    else:
        speed_ration_map = {
            1.: 0.25,  # Fast passenger train
            1. / 2.: 0.25,  # Fast freight train
            1. / 3.: 0.25,  # Slow commuter train
            1. / 4.: 0.25  # Slow freight train
        }

    if malfunctions_enabled:
        env = RailEnv(
            width=x_dim,
            height=y_dim,
            rail_generator=sparse_rail_generator(
                max_num_cities=max_num_cities,
                # Number of cities in map (where train stations are)
                seed=14,  # Random seed
                grid_mode=False,
                max_rails_between_cities=max_rails_between_cities,
                max_rails_in_city=max_rails_in_city),
            schedule_generator=sparse_schedule_generator(speed_ration_map),
            malfunction_generator_and_process_data=malfunction_from_params(
                stochastic_data),
            number_of_agents=n_agents,
            obs_builder_object=TreeObservation)
    else:
        env = RailEnv(
            width=x_dim,
            height=y_dim,
            rail_generator=sparse_rail_generator(
                max_num_cities=max_num_cities,
                # Number of cities in map (where train stations are)
                seed=14,  # Random seed
                grid_mode=False,
                max_rails_between_cities=max_rails_between_cities,
                max_rails_in_city=max_rails_in_city),
            schedule_generator=sparse_schedule_generator(speed_ration_map),
            number_of_agents=n_agents,
            obs_builder_object=TreeObservation)

    env.reset(True, True)

    # After training we want to render the results so we also load a renderer
    env_renderer = RenderTool(
        env,
        gl="PILSVG",
        screen_height=800,  # Adjust these parameters to fit your resolution
        screen_width=900)
    # Given the depth of the tree observation and the number of features per node we get the following state_size
    num_features_per_node = env.obs_builder.observation_dim

    nr_nodes = 0
    for i in range(tree_depth + 1):
        nr_nodes += np.power(4, i)
    state_size = num_features_per_node * nr_nodes

    # The action space of flatland is 5 discrete actions
    action_size = 5

    # We set the number of episodes we would like to train on
    if 'n_trials' not in locals():
        n_trials = 15000

    # And the max number of steps we want to take per episode
    max_steps = int(3 * (env.height + env.width))

    # Define training parameters
    eps = 1.
    eps_end = 0.005
    eps_decay = 0.998

    # And some variables to keep track of the progress
    action_dict = dict()
    final_action_dict = dict()
    scores_window = deque(maxlen=100)
    done_window = deque(maxlen=100)
    deadlock_window = deque(maxlen=100)
    deadlock_average = []
    scores = []
    dones_list = []
    # Metrics
    eps_list = []
    action_prob_list = []
    action_prob = [0] * action_size
    agent_obs = [None] * env.get_num_agents()
    agent_next_obs = [None] * env.get_num_agents()
    agent_obs_buffer = [None] * env.get_num_agents()
    agent_action_buffer = [2] * env.get_num_agents()
    cummulated_reward = np.zeros(env.get_num_agents())
    update_values = False
    # Now we load a Double dueling DQN agent
    agent = Agent(state_size, action_size)

    for trials in range(1, n_trials + 1):

        #print(torch.cuda.current_device())
        # Reset environment
        obs, info = env.reset(True, True)
        #env_renderer.reset()
        # Build agent specific observations
        for a in range(env.get_num_agents()):
            if obs[a]:
                agent_obs[a] = normalize_observation(obs[a],
                                                     tree_depth,
                                                     observation_radius=10)
                agent_obs_buffer[a] = agent_obs[a].copy()

        # Reset score and done
        score = 0
        env_done = 0

        # Run episode
        for step in range(max_steps):
            # Action
            for a in range(env.get_num_agents()):
                if info['action_required'][a]:
                    # If an action is required, we want to store the obs at that step as well as the action
                    update_values = True
                    action = agent.act(agent_obs[a], eps=eps)
                    action_prob[action] += 1
                else:
                    update_values = False
                    action = 0
                action_dict.update({a: action})

            # Environment step
            next_obs, all_rewards, done, deadlocks, info = env.step(
                action_dict)
            #env_renderer.render_env(show=True, show_predictions=True, show_observations=True)
            # Update replay buffer and train agent
            for a in range(env.get_num_agents()):
                # Only update the values when we are done or when an action was taken and thus relevant information is present
                if update_values or done[a]:
                    agent.step(agent_obs_buffer[a], agent_action_buffer[a],
                               all_rewards[a], agent_obs[a], done[a])
                    cummulated_reward[a] = 0.

                    agent_obs_buffer[a] = agent_obs[a].copy()
                    agent_action_buffer[a] = action_dict[a]
                if next_obs[a]:
                    agent_obs[a] = normalize_observation(next_obs[a],
                                                         tree_depth,
                                                         observation_radius=10)

                score += all_rewards[a] / env.get_num_agents()

            # Copy observation
            if done['__all__']:
                env_done = 1
                break

        # Epsilon decay
        eps = max(eps_end, eps_decay * eps)  # decrease epsilon

        # Collect information about training
        tasks_finished = 0
        for _idx in range(env.get_num_agents()):
            if done[_idx] == 1:
                tasks_finished += 1
        done_window.append(tasks_finished / max(1, env.get_num_agents()))
        scores_window.append(score / max_steps)  # save most recent score
        scores.append(np.mean(scores_window))
        deadlock_window.append(
            deadlocks.count(1) / max(1, env.get_num_agents()))
        deadlock_average.append(np.mean(deadlock_window))
        dones_list.append((np.mean(done_window)))

        eps_list.append(eps)
        action_prob_list.append(action_prob / np.sum(action_prob))
        print(
            '\rTraining {} Agents on ({},{}).\t Episode {}\t Average Score: {:.3f}\tDones: {:.2f} %\tDeadlocks: {:.2f} \tEpsilon: {:.2f} \t Action Probabilities: \t {}'
            .format(env.get_num_agents(), x_dim, y_dim, trials,
                    np.mean(scores_window), 100 * np.mean(done_window),
                    np.mean(deadlock_window), eps,
                    action_prob / np.sum(action_prob)),
            end=" ")

        if trials % 100 == 0:
            print(
                '\rTraining {} Agents on ({},{}).\t Episode {}\t Average Score: {:.3f}\tDones: {:.2f}%\tEpsilon: {:.2f} \t Action Probabilities: \t {}'
                .format(env.get_num_agents(), x_dim, y_dim, trials,
                        np.mean(scores_window), 100 * np.mean(done_window),
                        eps, action_prob / np.sum(action_prob)))
            torch.save(
                agent.qnetwork_local.state_dict(),
                path.join('Nets',
                          ('navigator_checkpoint' + str(trials) + '.pth')))

            action_prob = [1] * action_size

        if trials % 50 == 0:

            np.savetxt(fname=path.join('Nets', 'metrics.csv'),
                       X=np.transpose(
                           np.asarray([
                               scores, dones_list, deadlock_average, eps_list
                           ])),
                       delimiter=';',
                       newline='\n')
            np.savetxt(fname=path.join('Nets', 'action_prob.csv'),
                       X=np.asarray(action_prob_list),
                       delimiter=';',
                       newline='\n')

    # Plot overall training progress at the end
    plt.plot(scores)
    plt.show()
Example no. 22
		sheet1.write(x+3,4,done[3])
		sheet1.write(x+4, 0, str(obs[4]))
		sheet1.write(x+4, 1, actionlist[4])
		sheet1.write(x+4,2,all_rewards[4])
		sheet1.write(x+4,3,str(next_obs[4]))
		sheet1.write(x+4,4,done[4])
		x=x+5"""
        var = 0
        #print(len(env.get_agent_handles()))
        while var < len(env.get_agent_handles()):
            varlist.append(var)
            obslist.append(str(obs[var]))
            if obs[var]:
                normalizeobslist.append(
                    normalize_observation(
                        obs[var],
                        observation_tree_depth,
                        observation_radius=observation_radius))
            else:
                normalizeobslist.append("None")
            action_list.append(actionlist[var])
            reward_list.append(all_rewards[var])
            next_statelist.append(str(next_obs[var]))
            if next_obs[var]:
                next_statenormalized.append(
                    normalize_observation(
                        next_obs[var],
                        observation_tree_depth,
                        observation_radius=observation_radius))
            else:
                next_statenormalized.append("None")
            donelist.append(done[var])
Example no. 23
def eval_policy(env_params, checkpoint, n_eval_episodes, max_steps, seed,
                render):
    # evaluation is faster on CPU, except if you have huge networks
    parameters = {'use_gpu': False}

    policy = DDDQNPolicy(state_size,
                         action_size,
                         Namespace(**parameters),
                         evaluation_mode=True)
    policy.qnetwork_local = torch.load(checkpoint)

    env_params = Namespace(**env_params)

    # Environment parameters
    n_agents = env_params.n_agents
    x_dim = env_params.x_dim
    y_dim = env_params.y_dim
    n_cities = env_params.n_cities
    max_rails_between_cities = env_params.max_rails_between_cities
    max_rails_in_city = env_params.max_rails_in_city

    # Observation parameters
    observation_tree_depth = env_params.observation_tree_depth
    observation_radius = env_params.observation_radius
    observation_max_path_depth = env_params.observation_max_path_depth

    # Malfunction and speed profiles
    # TODO pass these parameters properly from main!
    malfunction_parameters = MalfunctionParameters(
        malfunction_rate=1. / 2000,  # Rate of malfunctions
        min_duration=20,  # Minimal duration
        max_duration=50  # Max duration
    )
    speed_profiles = {
        1.: 1.0,  # Fast passenger train
        1. / 2.: 0.0,  # Fast freight train
        1. / 3.: 0.0,  # Slow commuter train
        1. / 4.: 0.0  # Slow freight train
    }

    # Observation builder
    predictor = ShortestPathPredictorForRailEnv(observation_max_path_depth)
    tree_observation = TreeObsForRailEnv(max_depth=observation_tree_depth,
                                         predictor=predictor)

    # Setup the environment
    env = RailEnv(
        width=x_dim,
        height=y_dim,
        rail_generator=sparse_rail_generator(
            max_num_cities=n_cities,
            grid_mode=False,
            max_rails_between_cities=max_rails_between_cities,
            max_rails_in_city=max_rails_in_city),
        schedule_generator=sparse_schedule_generator(speed_profiles),
        number_of_agents=n_agents,
        malfunction_generator_and_process_data=malfunction_from_params(
            malfunction_parameters),
        obs_builder_object=tree_observation,
        random_seed=seed)
    env.reset(True, True)

    if render:
        env_renderer = RenderTool(env, gl="PGL")

    action_dict = dict()
    scores = []
    completions = []
    nb_steps = []
    inference_times = []
    preproc_times = []
    agent_times = []
    step_times = []

    for episode_idx in range(n_eval_episodes):
        inference_timer = Timer()
        preproc_timer = Timer()
        agent_timer = Timer()
        step_timer = Timer()

        agent_obs = [None] * env.get_num_agents()
        score = 0.0

        step_timer.start()
        obs, info = env.reset(regenerate_rail=True, regenerate_schedule=True)
        step_timer.end()

        if render:
            env_renderer.set_new_rail()

        final_step = 0

        for step in range(max_steps - 1):
            agent_timer.start()
            for agent in env.get_agent_handles():
                if obs[agent]:
                    preproc_timer.start()
                    agent_obs[agent] = normalize_observation(
                        obs[agent],
                        tree_depth=observation_tree_depth,
                        observation_radius=observation_radius)
                    preproc_timer.end()

                action = 0
                if info['action_required'][agent]:
                    inference_timer.start()
                    action = policy.act(agent_obs[agent], eps=0.0)
                    inference_timer.end()
                action_dict.update({agent: action})
            agent_timer.end()

            step_timer.start()
            obs, all_rewards, done, info = env.step(action_dict)
            step_timer.end()

            if render:
                env_renderer.render_env(show=True,
                                        frames=False,
                                        show_observations=False,
                                        show_predictions=False)

            for agent in env.get_agent_handles():
                score += all_rewards[agent]

            final_step = step

            if done['__all__']:
                break

        normalized_score = score / (max_steps * env.get_num_agents())
        scores.append(normalized_score)

        tasks_finished = sum(done[idx] for idx in env.get_agent_handles())
        completion = tasks_finished / max(1, env.get_num_agents())
        completions.append(completion)

        nb_steps.append(final_step)

        inference_times.append(inference_timer.get())
        preproc_times.append(preproc_timer.get())
        agent_times.append(agent_timer.get())
        step_times.append(step_timer.get())

        print(
            "☑️  Score: {:.3f} \tDone: {:.1f}% \tNb steps: {:.3f} "
            "\t🚉 Env: {:.3f}s "
            "\t🤖 Agent: {:.3f}s (per step: {:.3f}s) \t[preproc: {:.3f}s \tinfer: {:.3f}s]"
            .format(normalized_score, completion * 100.0, final_step,
                    step_timer.get(), agent_timer.get(),
                    agent_timer.get() / final_step, preproc_timer.get(),
                    inference_timer.get()))

    return scores, completions, nb_steps, agent_times, step_times
Example no. 24
def train_agent(seed, n_episodes, timed=False):

    # Observation parameters
    observation_radius = 10

    # Exploration parameters
    eps_start = 1.0
    eps_end = 0.01
    eps_decay = 0.997  # for 2500ts

    # Setup the environment
    env, max_steps, x_dim, y_dim, observation_tree_depth, _ = create_multi_agent_rail_env(seed, timed)
    env.reset(True, True)

    # Calculate the state size given the depth of the tree observation and the number of features
    n_features_per_node = env.obs_builder.observation_dim
    n_nodes = 0
    for i in range(observation_tree_depth + 1):
        n_nodes += np.power(4, i)
    state_size = n_features_per_node * n_nodes

    # The action space of flatland is 5 discrete actions
    action_size = 5
    action_dict = dict()

    # And some variables to keep track of the progress
    scores_window = deque(maxlen=100)  # todo smooth when rendering instead
    completion_window = deque(maxlen=100)
    scores = []
    completion = []
    action_count = [0] * action_size
    agent_obs = [None] * env.get_num_agents()
    agent_prev_obs = [None] * env.get_num_agents()
    agent_prev_action = [2] * env.get_num_agents()
    update_values = False

    # Training parameters
    training_parameters = {
        'buffer_size': int(1e5),
        'batch_size': 32,
        'update_every': 8,
        'learning_rate': 0.5e-4,
        'tau': 1e-3,
        'gamma': 0.99,
        'buffer_min_size': 0,
        'hidden_size': 256,
        'use_gpu': False
    }

    # Double Dueling DQN policy
    policy = DDDQNPolicy(state_size, action_size, Namespace(**training_parameters), seed=(seed+1))

    for episode_idx in range(n_episodes):
        score = 0

        # Reset environment
        obs, info = env.reset(regenerate_rail=True, regenerate_schedule=True)

        # Build agent specific observations
        for agent in env.get_agent_handles():
            if obs[agent]:
                agent_obs[agent] = normalize_observation(obs[agent], observation_tree_depth, observation_radius=observation_radius)
                agent_prev_obs[agent] = agent_obs[agent].copy()

        # Run episode
        for _ in range(max_steps - 1):
            for agent in env.get_agent_handles():
                if info['action_required'][agent]:
                    # If an action is required, we want to store the obs at that step as well as the action
                    update_values = True
                    action = policy.act(agent_obs[agent], eps=eps_start)
                    action_count[action] += 1
                else:
                    update_values = False
                    action = 0
                action_dict.update({agent: action})

            # Environment step
            next_obs, all_rewards, done, info = env.step(action_dict)

            # Update replay buffer and train agent
            for agent in range(env.get_num_agents()):
                # Only update the values when we are done or when an action was taken and thus relevant information is present
                if update_values or done[agent]:
                    policy.step(agent_prev_obs[agent], agent_prev_action[agent], all_rewards[agent], agent_obs[agent], done[agent])

                    agent_prev_obs[agent] = agent_obs[agent].copy()
                    agent_prev_action[agent] = action_dict[agent]

                if next_obs[agent]:
                    agent_obs[agent] = normalize_observation(next_obs[agent], observation_tree_depth, observation_radius=observation_radius)

                score += all_rewards[agent]

            if done['__all__']:
                break

        # Epsilon decay
        eps_start = max(eps_end, eps_decay * eps_start)

        # Collect information about training
        tasks_finished = np.sum([int(done[idx]) for idx in env.get_agent_handles()])
        completion_window.append(tasks_finished / max(1, env.get_num_agents()))
        scores_window.append(score / (max_steps * env.get_num_agents()))
        completion.append((np.mean(completion_window)))
        scores.append(np.mean(scores_window))
        action_probs = action_count / np.sum(action_count)

        if episode_idx % 100 == 0:
            end = "\n"
            action_count = [1] * action_size
        else:
            end = " "

        print('\rTraining {} agents on {}x{}\t Episode {}\t Average Score: {:.3f}\tDones: {:.2f}%\tEpsilon: {:.2f} \t Action Probabilities: \t {}'.format(
            env.get_num_agents(),
            x_dim, y_dim,
            episode_idx,
            np.mean(scores_window),
            100 * np.mean(completion_window),
            eps_start,
            action_probs
        ), end=end)

    # Return trained policy.
    return policy
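# A minimal usage sketch (hypothetical values, not part of the original example):
# trained_policy = train_agent(seed=42, n_episodes=2500)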
Example no. 25
def main(argv):
    n_trials = 15000  # default number of training episodes, can be overridden with -n
    try:
        opts, args = getopt.getopt(argv, "n:", ["n_trials="])
    except getopt.GetoptError:
        print('training_navigation.py -n <n_trials>')
        sys.exit(2)
    for opt, arg in opts:
        if opt in ('-n', '--n_trials'):
            n_trials = int(arg)

    random.seed(1)
    np.random.seed(1)

    # Parameters for the Environment
    x_dim = 35
    y_dim = 35
    n_agents = 10


    # Use the malfunction generator to break agents from time to time
    stochastic_data = {'malfunction_rate': 8000,  # Rate of malfunction occurrence for a single agent
                       'min_duration': 15,  # Minimal duration of malfunction
                       'max_duration': 50  # Max duration of malfunction
                       }

    # Custom observation builder
    TreeObservation = TreeObsForRailEnv(max_depth=2, predictor=ShortestPathPredictorForRailEnv(30))

    # Different agent types (trains) with different speeds.
    speed_ratio_map = {1.: 0.25,  # Fast passenger train
                       1. / 2.: 0.25,  # Fast freight train
                       1. / 3.: 0.25,  # Slow commuter train
                       1. / 4.: 0.25}  # Slow freight train

    env = RailEnv(width=x_dim,
                  height=y_dim,
                  rail_generator=sparse_rail_generator(max_num_cities=3,
                                                       # Number of cities in map (where train stations are)
                                                       seed=1,  # Random seed
                                                       grid_mode=False,
                                                       max_rails_between_cities=2,
                                                       max_rails_in_city=3),
                  schedule_generator=sparse_schedule_generator(speed_ratio_map),
                  number_of_agents=n_agents,
                  malfunction_generator_and_process_data=malfunction_from_params(stochastic_data),
                  obs_builder_object=TreeObservation)

    # After training we want to render the results so we also load a renderer
    env_renderer = RenderTool(env, gl="PILSVG", )
    # Given the depth of the tree observation and the number of features per node we get the following state_size
    num_features_per_node = env.obs_builder.observation_dim
    tree_depth = 2
    nr_nodes = 0
    for i in range(tree_depth + 1):
        nr_nodes += np.power(4, i)
    state_size = num_features_per_node * nr_nodes
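    # Sanity check (not in the original script): the node count of a depth-d
    # quaternary tree has the closed form (4**(d+1) - 1) // 3
    assert state_size == num_features_per_node * ((4 ** (tree_depth + 1) - 1) // 3)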

    # The action space of flatland is 5 discrete actions
    action_size = 5

    # The number of training episodes, n_trials, was set when parsing the
    # command-line options above (default: 15000)

    # And the max number of steps we want to take per episode
    max_steps = int(4 * 2 * (20 + env.height + env.width))
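    # e.g. for the 35x35 grid above: int(4 * 2 * (20 + 35 + 35)) = 720 steps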

    # Define training parameters
    eps = 1.
    eps_end = 0.005
    eps_decay = 0.998
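    # With this geometric decay, eps hits its floor after roughly
    # log(eps_end / eps) / log(eps_decay) ≈ 2647 episodes (illustrative estimate)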

    # And some variables to keep track of the progress
    action_dict = dict()
    final_action_dict = dict()
    scores_window = deque(maxlen=100)
    done_window = deque(maxlen=100)
    scores = []
    dones_list = []
    action_prob = [0] * action_size
    agent_obs = [None] * env.get_num_agents()
    agent_next_obs = [None] * env.get_num_agents()
    agent_obs_buffer = [None] * env.get_num_agents()
    agent_action_buffer = [2] * env.get_num_agents()
    cumulated_reward = np.zeros(env.get_num_agents())
    update_values = [False] * env.get_num_agents()
    # Now we load a Double dueling DQN agent
    agent = Agent(state_size, action_size)

    for trials in range(1, n_trials + 1):

        # Reset environment
        obs, info = env.reset(True, True)
        env_renderer.reset()
        # Build agent specific observations
        for a in range(env.get_num_agents()):
            if obs[a]:
                agent_obs[a] = normalize_observation(obs[a], tree_depth, observation_radius=10)
                agent_obs_buffer[a] = agent_obs[a].copy()

        # Reset score and done
        score = 0
        env_done = 0

        # Run episode
        while True:
            # Action
            for a in range(env.get_num_agents()):
                if info['action_required'][a]:
                    # If an action is required, we want to store the obs at that step as well as the action
                    update_values[a] = True
                    action = agent.act(agent_obs[a], eps=eps)
                    action_prob[action] += 1
                else:
                    update_values[a] = False
                    action = 0
                action_dict.update({a: action})

            # Environment step
            next_obs, all_rewards, done, info = env.step(action_dict)
            # Update replay buffer and train agent
            for a in range(env.get_num_agents()):
                # Only update the values when we are done or when an action was taken and thus relevant information is present
                if update_values[a] or done[a]:
                    agent.step(agent_obs_buffer[a], agent_action_buffer[a], all_rewards[a],
                               agent_obs[a], done[a])
                    cumulated_reward[a] = 0.

                    agent_obs_buffer[a] = agent_obs[a].copy()
                    agent_action_buffer[a] = action_dict[a]
                if next_obs[a]:
                    agent_obs[a] = normalize_observation(next_obs[a], tree_depth, observation_radius=10)

                score += all_rewards[a] / env.get_num_agents()

            # Check whether the episode has ended
            if done['__all__']:
                env_done = 1
                break

        # Epsilon decay
        eps = max(eps_end, eps_decay * eps)  # decrease epsilon

        # Collect information about training
        tasks_finished = 0
        for current_agent in env.agents:
            if current_agent.status == RailAgentStatus.DONE_REMOVED:
                tasks_finished += 1
        done_window.append(tasks_finished / max(1, env.get_num_agents()))
        scores_window.append(score / max_steps)  # save most recent score
        scores.append(np.mean(scores_window))
        dones_list.append(np.mean(done_window))

        print(
            '\rTraining {} Agents on ({},{}).\t Episode {}\t Average Score: {:.3f}\tDones: {:.2f}%\tEpsilon: {:.2f} \t Action Probabilities: \t {}'.format(
                env.get_num_agents(), x_dim, y_dim,
                trials,
                np.mean(scores_window),
                100 * np.mean(done_window),
                eps, action_prob / np.sum(action_prob)), end=" ")

        if trials % 100 == 0:
            print(
                '\rTraining {} Agents on ({},{}).\t Episode {}\t Average Score: {:.3f}\tDones: {:.2f}%\tEpsilon: {:.2f} \t Action Probabilities: \t {}'.format(
                    env.get_num_agents(), x_dim, y_dim,
                    trials,
                    np.mean(scores_window),
                    100 * np.mean(done_window),
                    eps, action_prob / np.sum(action_prob)))
            torch.save(agent.qnetwork_local.state_dict(),
                       './Nets/navigator_checkpoint' + str(trials) + '.pth')
            action_prob = [1] * action_size

    # Plot overall training progress at the end
    plt.plot(scores)
    plt.show()
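
# Presumably the script is launched via the usual entry point, e.g.:
if __name__ == '__main__':
    main(sys.argv[1:])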
Esempio n. 26
0


for episode_idx in range(n_episodes + 1):
    obs, info = env.reset(regenerate_rail=True, regenerate_schedule=True)
    score = 0
    nb_steps = 0
    actions_taken = []

    # Build agent specific observations
    for agent in env.get_agent_handles():
        if obs[agent]:
            agent_obs[agent] = normalize_observation(
                obs[agent],
                observation_tree_depth,
                observation_radius=observation_radius)
            agent_prev_obs[agent] = agent_obs[agent].copy()

    # Run episode
    for step in range(max_steps - 1):
        for agent in env.get_agent_handles():
            if info['action_required'][agent]:
                # If an action is required, we want to store the obs at that step as well as the action
                update_values = True
                action = policy.act(agent_obs[agent], eps=eps_start)
                action_count[action] += 1
                actions_taken.append(action)
            else:
                update_values = False
                action = 0
            action_dict.update({agent: action})

        # Presumably the episode continues as in the training loop above:
        obs, all_rewards, done, info = env.step(action_dict)

def eval_policy(env_params, checkpoint, n_eval_episodes, max_steps,
                action_size, state_size, seed, render, allow_skipping,
                allow_caching):
    # Evaluation is faster on CPU (except if you use a really huge policy)
    parameters = {'use_gpu': False}

    policy = DDDQNPolicy(state_size,
                         action_size,
                         Namespace(**parameters),
                         evaluation_mode=True)
    policy.qnetwork_local = torch.load(checkpoint)

    env_params = Namespace(**env_params)

    # Environment parameters
    n_agents = env_params.n_agents
    x_dim = env_params.x_dim
    y_dim = env_params.y_dim
    n_cities = env_params.n_cities
    max_rails_between_cities = env_params.max_rails_between_cities
    max_rails_in_city = env_params.max_rails_in_city

    # Malfunction and speed profiles
    # TODO pass these parameters properly from main!
    malfunction_parameters = MalfunctionParameters(
        malfunction_rate=1. / 2000,  # Rate of malfunctions
        min_duration=20,  # Minimal duration
        max_duration=50  # Max duration
    )

    # Only fast trains in Round 1
    speed_profiles = {
        1.: 1.0,  # Fast passenger train
        1. / 2.: 0.0,  # Fast freight train
        1. / 3.: 0.0,  # Slow commuter train
        1. / 4.: 0.0  # Slow freight train
    }

    # Observation parameters
    observation_tree_depth = env_params.observation_tree_depth
    observation_radius = env_params.observation_radius
    observation_max_path_depth = env_params.observation_max_path_depth

    # Observation builder
    predictor = ShortestPathPredictorForRailEnv(observation_max_path_depth)
    tree_observation = TreeObsForRailEnv(max_depth=observation_tree_depth,
                                         predictor=predictor)

    # Setup the environment
    env = RailEnv(
        width=x_dim,
        height=y_dim,
        rail_generator=sparse_rail_generator(
            max_num_cities=n_cities,
            grid_mode=False,
            max_rails_between_cities=max_rails_between_cities,
            max_rails_in_city=max_rails_in_city,
        ),
        schedule_generator=sparse_schedule_generator(speed_profiles),
        number_of_agents=n_agents,
        malfunction_generator_and_process_data=malfunction_from_params(
            malfunction_parameters),
        obs_builder_object=tree_observation)

    if render:
        env_renderer = RenderTool(env, gl="PGL")

    action_dict = dict()
    scores = []
    completions = []
    nb_steps = []
    inference_times = []
    preproc_times = []
    agent_times = []
    step_times = []

    for episode_idx in range(n_eval_episodes):
        seed += 1

        inference_timer = Timer()
        preproc_timer = Timer()
        agent_timer = Timer()
        step_timer = Timer()

        step_timer.start()
        obs, info = env.reset(regenerate_rail=True,
                              regenerate_schedule=True,
                              random_seed=seed)
        step_timer.end()

        agent_obs = [None] * env.get_num_agents()
        score = 0.0
        # Guard against referencing `done` before the first env.step()
        done = {idx: False for idx in env.get_agent_handles()}

        if render:
            env_renderer.set_new_rail()

        final_step = 0
        skipped = 0

        nb_hit = 0
        agent_last_obs = {}
        agent_last_action = {}

        for step in range(max_steps - 1):
            if allow_skipping and check_if_all_blocked(env):
                # FIXME why -1? bug where all agents are "done" after max_steps!
                skipped = max_steps - step - 1
                final_step = max_steps - 2
                n_unfinished_agents = sum(not done[idx]
                                          for idx in env.get_agent_handles())
                score -= skipped * n_unfinished_agents
                break

            agent_timer.start()
            for agent in env.get_agent_handles():
                if obs[agent] and info['action_required'][agent]:
                    if agent in agent_last_obs and np.all(
                            agent_last_obs[agent] == obs[agent]):
                        nb_hit += 1
                        action = agent_last_action[agent]

                    else:
                        preproc_timer.start()
                        norm_obs = normalize_observation(
                            obs[agent],
                            tree_depth=observation_tree_depth,
                            observation_radius=observation_radius)
                        preproc_timer.end()

                        inference_timer.start()
                        action = policy.act(norm_obs, eps=0.0)
                        inference_timer.end()

                    action_dict.update({agent: action})

                    if allow_caching:
                        agent_last_obs[agent] = obs[agent]
                        agent_last_action[agent] = action
            agent_timer.end()

            step_timer.start()
            obs, all_rewards, done, info = env.step(action_dict)
            step_timer.end()

            if render:
                env_renderer.render_env(show=True,
                                        frames=False,
                                        show_observations=False,
                                        show_predictions=False)

                if step % 100 == 0:
                    print("{}/{}".format(step, max_steps - 1))

            for agent in env.get_agent_handles():
                score += all_rewards[agent]

            final_step = step

            if done['__all__']:
                break

        normalized_score = score / (max_steps * env.get_num_agents())
        scores.append(normalized_score)

        tasks_finished = sum(done[idx] for idx in env.get_agent_handles())
        completion = tasks_finished / max(1, env.get_num_agents())
        completions.append(completion)

        nb_steps.append(final_step)

        inference_times.append(inference_timer.get())
        preproc_times.append(preproc_timer.get())
        agent_times.append(agent_timer.get())
        step_times.append(step_timer.get())

        skipped_text = ""
        if skipped > 0:
            skipped_text = "\t⚡ Skipped {}".format(skipped)

        hit_text = ""
        if nb_hit > 0:
            hit_text = "\t⚡ Hit {} ({:.1f}%)".format(nb_hit, (100 * nb_hit) /
                                                     (n_agents * final_step))

        print(
            "☑️  Score: {:.3f} \tDone: {:.1f}% \tNb steps: {:.3f} "
            "\t🍭 Seed: {}"
            "\t🚉 Env: {:.3f}s  "
            "\t🤖 Agent: {:.3f}s (per step: {:.3f}s) \t[preproc: {:.3f}s \tinfer: {:.3f}s]"
            "{}{}".format(normalized_score, completion * 100.0, final_step,
                          seed, step_timer.get(), agent_timer.get(),
                          agent_timer.get() / final_step, preproc_timer.get(),
                          inference_timer.get(), skipped_text, hit_text))

    return scores, completions, nb_steps, agent_times, step_times
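
A hypothetical invocation (all values below are illustrative assumptions, including the checkpoint path; state_size 231 corresponds to 11 features per node times the 21 nodes of a depth-2 tree):

env_params = {
    'n_agents': 5, 'x_dim': 35, 'y_dim': 35, 'n_cities': 3,
    'max_rails_between_cities': 2, 'max_rails_in_city': 3,
    'observation_tree_depth': 2, 'observation_radius': 10,
    'observation_max_path_depth': 30,
}
scores, completions, nb_steps, agent_times, step_times = eval_policy(
    env_params, 'checkpoints/example.pth', n_eval_episodes=10,
    max_steps=720, action_size=5, state_size=231, seed=42,
    render=False, allow_skipping=True, allow_caching=True)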