Example #1
def test_rail_environment_single_agent(show=False):
    # We instantiate the following map on a 3x3 grid
    #  _  _
    # / \/ \
    # | |  |
    # \_/\_/

    transitions = RailEnvTransitions()
    
    
    
    if False:
        # This env creation doesn't quite work right.
        cells = transitions.transition_list
        vertical_line = cells[1]
        south_symmetrical_switch = cells[6]
        north_symmetrical_switch = transitions.rotate_transition(south_symmetrical_switch, 180)
        south_east_turn = int('0100000000000010', 2)
        south_west_turn = transitions.rotate_transition(south_east_turn, 90)
        north_east_turn = transitions.rotate_transition(south_east_turn, 270)
        north_west_turn = transitions.rotate_transition(south_east_turn, 180)

        rail_map = np.array([[south_east_turn, south_symmetrical_switch,
                            south_west_turn],
                            [vertical_line, vertical_line, vertical_line],
                            [north_east_turn, north_symmetrical_switch,
                            north_west_turn]],
                            dtype=np.uint16)

        rail = GridTransitionMap(width=3, height=3, transitions=transitions)
        rail.grid = rail_map
        rail_env = RailEnv(width=3, height=3, rail_generator=rail_from_grid_transition_map(rail),
                        schedule_generator=random_schedule_generator(), number_of_agents=1,
                        obs_builder_object=GlobalObsForRailEnv())
    else:
        rail_env, env_dict = RailEnvPersister.load_new("test_env_loop.pkl", "env_data.tests")
        rail_map = rail_env.rail.grid
    
    rail_env._max_episode_steps = 1000

    _ = rail_env.reset(False, False, True)

    liActions = [int(a) for a in RailEnvActions]
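    # liActions holds the five discrete Flatland actions:
    # DO_NOTHING, MOVE_LEFT, MOVE_FORWARD, MOVE_RIGHT, STOP_MOVING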

    env_renderer = RenderTool(rail_env)

    #RailEnvPersister.save(rail_env, "test_env_figure8.pkl")
    
    for _ in range(5):

        #rail_env.agents[0].initial_position = (1,2)
        _ = rail_env.reset(False, False, True)

        # We do not care about target for the moment
        agent = rail_env.agents[0]
        agent.target = [-1, -1]

        # Check that trains are always initialized at a consistent position
        # or direction.
        # They should always be able to go somewhere.
        if show:
            print("After reset - agent pos:", agent.position, "dir: ", agent.direction)
            print(transitions.get_transitions(rail_map[agent.position], agent.direction))

        #assert (transitions.get_transitions(
        #    rail_map[agent.position],
        #    agent.direction) != (0, 0, 0, 0))

        # HACK - force the direction to one we know is good.
        #agent.initial_position = agent.position = (2,3)
        agent.initial_direction = agent.direction = 0

        if show:
            print ("handle:", agent.handle)
        #agent.initial_position = initial_pos = agent.position

        valid_active_actions_done = 0
        pos = agent.position

        if show:
            env_renderer.render_env(show=show, show_agents=True)
            time.sleep(0.01)

        iStep = 0
        while valid_active_actions_done < 6:
            # We randomly select an action
            action = np.random.choice(liActions)
            #action = RailEnvActions.MOVE_FORWARD

            _, _, dict_done, _ = rail_env.step({0: action})

            prev_pos = pos
            pos = agent.position  # rail_env.agents_position[0]

            print("action:", action, "pos:", agent.position, "prev:", prev_pos, agent.direction)
            print(dict_done)
            if prev_pos != pos:
                valid_active_actions_done += 1
            iStep += 1
            
            if show:
                env_renderer.render_env(show=show, show_agents=True, step=iStep)
                time.sleep(0.01)
            assert iStep < 100, "valid actions should have been performed by now - hung agent"

        # After 6 movements on this railway network, the train should be back
        # to its original height on the map.
        #assert (initial_pos[0] == agent.position[0])

        # We check that the train always attains its target after some time
        for _ in range(10):
            _ = rail_env.reset()

            rail_env.agents[0].direction = 0

            # JW - to avoid problem with random_schedule_generator.
            #rail_env.agents[0].position = (1,2)

            iStep = 0
            while iStep < 100:
                # We randomly select an action
                action = np.random.choice(liActions)

                _, _, dones, _ = rail_env.step({0: action})
                done = dones['__all__']
                if done:
                    break
                iStep += 1
                assert iStep < 100, "agent should have finished by now"
                env_renderer.render_env(show=show)
Example #2
def train_agent(train_params, train_env_params, eval_env_params, obs_params):
    # Environment parameters
    n_agents = train_env_params.n_agents
    x_dim = train_env_params.x_dim
    y_dim = train_env_params.y_dim
    n_cities = train_env_params.n_cities
    max_rails_between_cities = train_env_params.max_rails_between_cities
    max_rails_in_city = train_env_params.max_rails_in_city
    seed = train_env_params.seed

    # Unique ID for this training
    now = datetime.now()
    training_id = now.strftime('%y%m%d%H%M%S')

    # Observation parameters
    observation_tree_depth = obs_params.observation_tree_depth
    observation_radius = obs_params.observation_radius
    observation_max_path_depth = obs_params.observation_max_path_depth

    # Training parameters
    eps_start = train_params.eps_start
    eps_end = train_params.eps_end
    eps_decay = train_params.eps_decay
    n_episodes = train_params.n_episodes
    checkpoint_interval = train_params.checkpoint_interval
    n_eval_episodes = train_params.n_evaluation_episodes
    restore_replay_buffer = train_params.restore_replay_buffer
    save_replay_buffer = train_params.save_replay_buffer
    last_checkpoint = train_params.last_checkpoint
    # Set the seeds
    random.seed(seed)
    np.random.seed(seed)

    # Observation builder
    predictor = ShortestPathPredictorForRailEnv(observation_max_path_depth)
    tree_observation = TreeObsForRailEnv(max_depth=obs_params.max_depth,
                                         predictor=predictor)

    # Setup the environments
    train_env = create_rail_env(train_env_params, tree_observation)
    train_env.reset(regenerate_schedule=True, regenerate_rail=True)
    eval_env = create_rail_env(eval_env_params, tree_observation)
    eval_env.reset(regenerate_schedule=True, regenerate_rail=True)

    # Setup renderer
    if train_params.render:
        env_renderer = RenderTool(train_env, gl="PGL")

    # Calculate the state size given the depth of the tree observation and the number of features
    n_features_per_node = train_env.obs_builder.observation_dim
    # A tree of depth d has 4**0 + 4**1 + ... + 4**d nodes
    n_nodes = sum(np.power(4, i) for i in range(observation_tree_depth + 1))
    state_size = n_features_per_node * n_nodes

    # The action space of flatland is 5 discrete actions
    action_size = 5

    # Max number of steps per episode
    # This is the official formula used during evaluations
    # See details in flatland.envs.schedule_generators.sparse_schedule_generator
    # max_steps = int(4 * 2 * (env.height + env.width + (n_agents / n_cities)))
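    # For example (illustrative numbers): a 25x25 grid with 5 agents and 2 cities
    # would give int(4 * 2 * (25 + 25 + 5 / 2)) = 420 steps.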
    max_steps = train_env._max_episode_steps

    action_count = [0] * action_size
    action_dict = dict()
    agent_obs = [None] * n_agents
    agent_prev_obs = [None] * n_agents
    agent_prev_action = [2] * n_agents
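    # (2 == RailEnvActions.MOVE_FORWARD, used as the default "previous action")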
    update_values = [False] * n_agents

    # Smoothed values used as target for hyperparameter tuning
    smoothed_normalized_score = -1.0
    smoothed_eval_normalized_score = -1.0
    smoothed_completion = 0.0
    smoothed_eval_completion = 0.0

    # Double Dueling DQN policy
    policy = DDDQNPolicy(state_size, action_size, train_params)

    if os.path.isfile(last_checkpoint):
        policy.qnetwork_local = torch.load(last_checkpoint)
        print("load checkpoint from %s" % (last_checkpoint))

    # Loads existing replay buffer
    if restore_replay_buffer:
        try:
            policy.load_replay_buffer(restore_replay_buffer)
            policy.test()
        except RuntimeError as e:
            print(
                "\n🛑 Could't load replay buffer, were the experiences generated using the same tree depth?"
            )
            print(e)
            exit(1)

    print("\n💾 Replay buffer status: {}/{} experiences".format(
        len(policy.memory.memory), train_params.buffer_size))

    hdd = psutil.disk_usage('/')
    if save_replay_buffer and (hdd.free / (2**30)) < 500.0:
        print(
            "⚠️  Careful! Saving replay buffers will quickly consume a lot of disk space. You have {:.2f}gb left."
            .format(hdd.free / (2**30)))

    # TensorBoard writer
    writer = SummaryWriter()
    writer.add_hparams(vars(train_params), {})
    writer.add_hparams(vars(train_env_params), {})
    writer.add_hparams(vars(obs_params), {})

    training_timer = Timer()
    training_timer.start()

    print(
        "\n🚉 Training {} trains on {}x{} grid for {} episodes, evaluating on {} episodes every {} episodes. Training id '{}'.\n"
        .format(train_env.get_num_agents(), x_dim, y_dim, n_episodes,
                n_eval_episodes, checkpoint_interval, training_id))

    for episode_idx in range(n_episodes + 1):
        step_timer = Timer()
        reset_timer = Timer()
        learn_timer = Timer()
        preproc_timer = Timer()
        inference_timer = Timer()

        # Reset environment
        reset_timer.start()
        obs, info = train_env.reset(regenerate_rail=True,
                                    regenerate_schedule=True)
        reset_timer.end()

        if train_params.render:
            env_renderer.set_new_rail()

        score = 0
        nb_steps = 0
        actions_taken = []

        # Build initial agent-specific observations
        for agent in train_env.get_agent_handles():
            if obs[agent]:
                agent_obs[agent] = normalize_observation(
                    obs[agent],
                    observation_tree_depth,
                    observation_radius=observation_radius)
                agent_prev_obs[agent] = agent_obs[agent].copy()

        # Run episode
        for step in range(max_steps - 1):
            inference_timer.start()
            for agent in train_env.get_agent_handles():
                if info['action_required'][agent]:
                    update_values[agent] = True
                    action = policy.act(agent_obs[agent], eps=eps_start)

                    action_count[action] += 1
                    actions_taken.append(action)
                else:
                    # An action is not required if the train hasn't joined the railway network,
                    # if it already reached its target, or if it is currently malfunctioning.
                    update_values[agent] = False
                    action = 0
                action_dict.update({agent: action})
            inference_timer.end()

            # Environment step
            step_timer.start()
            next_obs, all_rewards, done, info = train_env.step(action_dict)
            step_timer.end()

            # Render an episode at some interval
            if train_params.render and episode_idx % checkpoint_interval == 0:
                env_renderer.render_env(
                    show=True,
                    frames=False,
                    show_observations=True,
                    show_predictions=True,
                )

            # Update replay buffer and train agent
            for agent in train_env.get_agent_handles():
                if update_values[agent] or done['__all__']:
                    # Only learn from timesteps where something happened
                    learn_timer.start()
                    policy.step(agent_prev_obs[agent],
                                agent_prev_action[agent], all_rewards[agent],
                                agent_obs[agent], done[agent])
                    learn_timer.end()

                    agent_prev_obs[agent] = agent_obs[agent].copy()
                    agent_prev_action[agent] = action_dict[agent]

                # Preprocess the new observations
                if next_obs[agent]:
                    preproc_timer.start()
                    agent_obs[agent] = normalize_observation(
                        next_obs[agent],
                        observation_tree_depth,
                        observation_radius=observation_radius)
                    preproc_timer.end()

                score += all_rewards[agent]

            nb_steps = step

            if done['__all__']:
                break

        # Epsilon decay
        eps_start = max(eps_end, eps_decay * eps_start)

        # Collect information about training
        tasks_finished = sum(done[idx]
                             for idx in train_env.get_agent_handles())
        completion = tasks_finished / max(1, train_env.get_num_agents())
        normalized_score = score / (max_steps * train_env.get_num_agents())
        action_probs = action_count / np.sum(action_count)
        action_count = [1] * action_size

        smoothing = 0.99
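        # Exponential moving average: each new value contributes (1 - smoothing) = 1%,
        # i.e. roughly an average over the last ~100 episodes.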
        smoothed_normalized_score = smoothed_normalized_score * smoothing + normalized_score * (
            1.0 - smoothing)
        smoothed_completion = smoothed_completion * smoothing + completion * (
            1.0 - smoothing)

        # Print logs
        if episode_idx % checkpoint_interval == 0:
            torch.save(
                policy.qnetwork_local, './checkpoints/' + training_id + '-' +
                str(episode_idx) + '.pth')

            if save_replay_buffer:
                policy.save_replay_buffer('./replay_buffers/' + training_id +
                                          '-' + str(episode_idx) + '.pkl')

            if train_params.render:
                env_renderer.close_window()

        print('\r🚂 Episode {}'
              '\t 🏆 Score: {:.3f}'
              ' Avg: {:.3f}'
              '\t 💯 Done: {:.2f}%'
              ' Avg: {:.2f}%'
              '\t 🎲 Epsilon: {:.3f} '
              '\t 🔀 Action Probs: {}'.format(episode_idx, normalized_score,
                                             smoothed_normalized_score,
                                             100 * completion,
                                             100 * smoothed_completion,
                                             eps_start,
                                             format_action_prob(action_probs)),
              end=" ")

        # Evaluate policy and log results at some interval
        if episode_idx % checkpoint_interval == 0 and n_eval_episodes > 0:
            scores, completions, nb_steps_eval = eval_policy(
                eval_env, policy, train_params, obs_params)

            writer.add_scalar("evaluation/scores_min", np.min(scores),
                              episode_idx)
            writer.add_scalar("evaluation/scores_max", np.max(scores),
                              episode_idx)
            writer.add_scalar("evaluation/scores_mean", np.mean(scores),
                              episode_idx)
            writer.add_scalar("evaluation/scores_std", np.std(scores),
                              episode_idx)
            writer.add_histogram("evaluation/scores", np.array(scores),
                                 episode_idx)
            writer.add_scalar("evaluation/completions_min",
                              np.min(completions), episode_idx)
            writer.add_scalar("evaluation/completions_max",
                              np.max(completions), episode_idx)
            writer.add_scalar("evaluation/completions_mean",
                              np.mean(completions), episode_idx)
            writer.add_scalar("evaluation/completions_std",
                              np.std(completions), episode_idx)
            writer.add_histogram("evaluation/completions",
                                 np.array(completions), episode_idx)
            writer.add_scalar("evaluation/nb_steps_min", np.min(nb_steps_eval),
                              episode_idx)
            writer.add_scalar("evaluation/nb_steps_max", np.max(nb_steps_eval),
                              episode_idx)
            writer.add_scalar("evaluation/nb_steps_mean",
                              np.mean(nb_steps_eval), episode_idx)
            writer.add_scalar("evaluation/nb_steps_std", np.std(nb_steps_eval),
                              episode_idx)
            writer.add_histogram("evaluation/nb_steps",
                                 np.array(nb_steps_eval), episode_idx)

            smoothing = 0.9
            smoothed_eval_normalized_score = smoothed_eval_normalized_score * smoothing + np.mean(
                scores) * (1.0 - smoothing)
            smoothed_eval_completion = smoothed_eval_completion * smoothing + np.mean(
                completions) * (1.0 - smoothing)
            writer.add_scalar("evaluation/smoothed_score",
                              smoothed_eval_normalized_score, episode_idx)
            writer.add_scalar("evaluation/smoothed_completion",
                              smoothed_eval_completion, episode_idx)

        # Save logs to tensorboard
        writer.add_scalar("training/score", normalized_score, episode_idx)
        writer.add_scalar("training/smoothed_score", smoothed_normalized_score,
                          episode_idx)
        writer.add_scalar("training/completion", np.mean(completion),
                          episode_idx)
        writer.add_scalar("training/smoothed_completion",
                          np.mean(smoothed_completion), episode_idx)
        writer.add_scalar("training/nb_steps", nb_steps, episode_idx)
        writer.add_histogram("actions/distribution", np.array(actions_taken),
                             episode_idx)
        writer.add_scalar("actions/nothing",
                          action_probs[RailEnvActions.DO_NOTHING], episode_idx)
        writer.add_scalar("actions/left",
                          action_probs[RailEnvActions.MOVE_LEFT], episode_idx)
        writer.add_scalar("actions/forward",
                          action_probs[RailEnvActions.MOVE_FORWARD],
                          episode_idx)
        writer.add_scalar("actions/right",
                          action_probs[RailEnvActions.MOVE_RIGHT], episode_idx)
        writer.add_scalar("actions/stop",
                          action_probs[RailEnvActions.STOP_MOVING],
                          episode_idx)
        writer.add_scalar("training/epsilon", eps_start, episode_idx)
        writer.add_scalar("training/buffer_size", len(policy.memory),
                          episode_idx)
        writer.add_scalar("training/loss", policy.loss, episode_idx)
        writer.add_scalar("timer/reset", reset_timer.get(), episode_idx)
        writer.add_scalar("timer/step", step_timer.get(), episode_idx)
        writer.add_scalar("timer/learn", learn_timer.get(), episode_idx)
        writer.add_scalar("timer/preproc", preproc_timer.get(), episode_idx)
        writer.add_scalar("timer/total", training_timer.get_current(),
                          episode_idx)
Example #3
np.random.seed(0)

# We need to either load in some pre-generated railways from disk, or else create a random railway generator.
if flags.load_railways:
    rail_generator, schedule_generator = load_precomputed_railways(project_root, flags)
else:
    rail_generator, schedule_generator = create_random_railways(project_root)

# Create the Flatland environment
env = RailEnv(width=flags.grid_width, height=flags.grid_height, number_of_agents=flags.num_agents,
              rail_generator=rail_generator,
              schedule_generator=schedule_generator,
              malfunction_generator=ParamMalfunctionGen(MalfunctionParameters(1 / 8000, 15, 50)),
              obs_builder_object=TreeObservation(max_depth=flags.tree_depth))

# After training we want to render the results so we also load a renderer
env_renderer = RenderTool(env, gl="PILSVG", screen_width=800, screen_height=800, agent_render_variant=AgentRenderVariant.AGENT_SHOWS_OPTIONS_AND_BOX)

# Calculate the state size based on the number of nodes in the tree observation
num_features_per_node = env.obs_builder.observation_dim
num_nodes = sum(np.power(4, i) for i in range(flags.tree_depth + 1))
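# e.g. with flags.tree_depth == 2 this is 4**0 + 4**1 + 4**2 = 21 nodes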
state_size = num_nodes * num_features_per_node
action_size = 5

# Add some variables to keep track of the progress
scores_window, steps_window, collisions_window, done_window = [deque(maxlen=200) for _ in range(4)]
agent_obs = [None] * flags.num_agents
agent_obs_buffer = [None] * flags.num_agents
agent_action_buffer = [2] * flags.num_agents
max_steps = 8 * (flags.grid_width + flags.grid_height)
start_time = time.time()
Example #4
env = RailEnv(width=width,
              height=height,
              rail_generator=rail_generator,
              schedule_generator=schedule_generator,
              number_of_agents=nr_trains,
              obs_builder_object=observation_builder,
              malfunction_generator_and_process_data=malfunction_from_params(
                  stochastic_data),
              remove_agents_at_target=True)
env.reset()

# Initiate the renderer
env_renderer = RenderTool(
    env,
    gl="PILSVG",
    agent_render_variant=AgentRenderVariant.ONE_STEP_BEHIND,
    show_debug=False,
    screen_height=1200,  # Adjust these parameters to fit your resolution
    screen_width=1800)  # Adjust these parameters to fit your resolution

######### Get arguments of the script #########
parser = argparse.ArgumentParser()
parser.add_argument("-step", type=int, help="steps")
args = parser.parse_args()

######### Custom controller setup #########
controller = GreedyAgent(218, env.action_space[0])
my_grid = [[Node((i, j), env.rail.grid[i, j]) for j in range(env.rail.width)]
           for i in range(env.rail.height)]
astar_planner = AStarAgent(my_grid, env.rail.width, env.rail.height)
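# my_grid wraps each cell's 16-bit transition value in a Node so the A* planner can search the rail graph.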
Example #5
def train_agent(train_params):

    env = load_flatland_environment_from_file(
        "scratch/test-envs/Test_13/Level_0.pkl")
    env.reset(regenerate_schedule=True, regenerate_rail=True)
    # Environment parameters
    n_agents = len(env.agents)
    print("n_agents= ", n_agents)
    print("env.get_num_agents(): ", env.get_num_agents())
    x_dim = env.width
    y_dim = env.height
    n_cities = 37
    #max_rails_between_cities = env_params.max_rails_between_cities
    #max_rails_in_city = env_params.max_rails_in_city
    seed = 2125

    # Observation parameters
    # observation_tree_depth = env_params.observation_tree_depth
    # observation_radius = env_params.observation_radius
    # observation_max_path_depth = env_params.observation_max_path_depth
    observation_tree_depth = 2
    observation_radius = 10
    observation_max_path_depth = 30

    # Training parameters
    eps_start = train_params.eps_start
    eps_end = train_params.eps_end
    eps_decay = train_params.eps_decay
    n_episodes = train_params.n_episodes
    checkpoint_interval = train_params.checkpoint_interval
    n_eval_episodes = train_params.n_evaluation_episodes

    # Set the seeds
    random.seed(seed)
    np.random.seed(seed)

    # Break agents from time to time
    malfunction_parameters = MalfunctionParameters(
        malfunction_rate=1. / 10000,  # Rate of malfunctions
        min_duration=15,  # Minimal duration
        max_duration=50  # Max duration
    )

    # Observation builder
    predictor = ShortestPathPredictorForRailEnv(observation_max_path_depth)
    tree_observation = TreeObsForRailEnv(max_depth=observation_tree_depth,
                                         predictor=predictor)

    # Fraction of trains with each speed
    speed_profiles = {
        1.: 1.0,  # Fast passenger train
        1. / 2.: 0.0,  # Fast freight train
        1. / 3.: 0.0,  # Slow commuter train
        1. / 4.: 0.0  # Slow freight train
    }

    # Setup the environment
    # env = RailEnv(
    #     width=x_dim,
    #     height=y_dim,
    #     rail_generator=sparse_rail_generator(
    #         max_num_cities=n_cities,
    #         grid_mode=False,
    #         max_rails_between_cities=max_rails_between_cities,
    #         max_rails_in_city=max_rails_in_city
    #     ),
    #     schedule_generator=sparse_schedule_generator(speed_profiles),
    #     number_of_agents=n_agents,
    #     malfunction_generator_and_process_data=malfunction_from_params(malfunction_parameters),
    #     obs_builder_object=tree_observation,
    #     random_seed=seed
    #

    # env.reset(regenerate_schedule=True, regenerate_rail=True)

    # Setup renderer
    if train_params.render:
        env_renderer = RenderTool(env, gl="PGL")

    # Calculate the state size given the depth of the tree observation and the number of features
    n_features_per_node = env.obs_builder.observation_dim
    n_nodes = 0
    for i in range(observation_tree_depth + 1):
        n_nodes += np.power(4, i)
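    # With observation_tree_depth = 2 this is 4**0 + 4**1 + 4**2 = 21 nodes.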
    state_size = n_features_per_node * n_nodes

    # The action space of flatland is 5 discrete actions
    action_size = 5

    # Max number of steps per episode
    # This is the official formula used during evaluations
    # See details in flatland.envs.schedule_generators.sparse_schedule_generator
    max_steps = int(4 * 2 * (env.height + env.width + (n_agents / n_cities)))
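    # e.g. a 40x40 grid with one agent per city would give int(4 * 2 * (40 + 40 + 1)) = 648 steps (illustrative numbers)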
    #max_steps = env._max_episode_steps
    print("max_steps = ", max_steps)
    print("env._max_episode_steps= ", env._max_episode_steps)

    action_count = [0] * action_size
    action_dict = dict()
    agent_obs = [None] * env.get_num_agents()
    agent_prev_obs = [None] * env.get_num_agents()
    agent_prev_action = [2] * env.get_num_agents()
    update_values = False
    smoothed_normalized_score = -1.0
    smoothed_eval_normalized_score = -1.0
    smoothed_completion = 0.0
    smoothed_eval_completion = 0.0

    # Double Dueling DQN policy
    policy = DDDQNPolicy(state_size, action_size, train_params)

    # TensorBoard writer
    writer = SummaryWriter()
    writer.add_hparams(vars(train_params), {})
    #writer.add_hparams(vars(env_params), {})

    training_timer = Timer()
    training_timer.start()

    print(
        "\n🚉 Training {} trains on {}x{} grid for {} episodes, evaluating on {} episodes every {} episodes.\n"
        .format(env.get_num_agents(), x_dim, y_dim, n_episodes,
                n_eval_episodes, checkpoint_interval))

    for episode_idx in range(n_episodes + 1):
        # Timers
        step_timer = Timer()
        reset_timer = Timer()
        learn_timer = Timer()
        preproc_timer = Timer()

        # Reset environment
        reset_timer.start()
        obs, info = env.reset(regenerate_rail=True, regenerate_schedule=True)
        reset_timer.end()

        if train_params.render:
            env_renderer.set_new_rail()

        score = 0
        nb_steps = 0
        actions_taken = []

        # Build agent specific observations
        for agent in env.get_agent_handles():
            if obs[agent]:
                agent_obs[agent] = normalize_observation(
                    obs[agent],
                    observation_tree_depth,
                    observation_radius=observation_radius)
                agent_prev_obs[agent] = agent_obs[agent].copy()

        # Run episode
        for step in range(max_steps - 1):
            for agent in env.get_agent_handles():
                if info['action_required'][agent]:
                    # If an action is required, we want to store the obs at that step as well as the action
                    update_values = True
                    action = policy.act(agent_obs[agent], eps=eps_start)
                    action_count[action] += 1
                    actions_taken.append(action)
                else:
                    update_values = False
                    action = 0
                action_dict.update({agent: action})

            # Environment step
            step_timer.start()
            next_obs, all_rewards, done, info = env.step(action_dict)
            step_timer.end()

            if train_params.render and episode_idx % checkpoint_interval == 0:
                env_renderer.render_env(show=True,
                                        frames=False,
                                        show_observations=False,
                                        show_predictions=False)

            for agent in range(env.get_num_agents()):
                # Update replay buffer and train agent
                # Only update the values when we are done or when an action was taken and thus relevant information is present
                if update_values or done[agent]:
                    learn_timer.start()
                    policy.step(agent_prev_obs[agent],
                                agent_prev_action[agent], all_rewards[agent],
                                agent_obs[agent], done[agent])
                    learn_timer.end()

                    agent_prev_obs[agent] = agent_obs[agent].copy()
                    agent_prev_action[agent] = action_dict[agent]

                # Preprocess the new observations
                if next_obs[agent]:
                    preproc_timer.start()
                    agent_obs[agent] = normalize_observation(
                        next_obs[agent],
                        observation_tree_depth,
                        observation_radius=observation_radius)
                    preproc_timer.end()

                score += all_rewards[agent]

            nb_steps = step

            if done['__all__']:
                break

        # Epsilon decay
        eps_start = max(eps_end, eps_decay * eps_start)

        # Collect information about training
        tasks_finished = sum(done[idx] for idx in env.get_agent_handles())
        completion = tasks_finished / max(1, env.get_num_agents())
        normalized_score = score / (max_steps * env.get_num_agents())
        action_probs = action_count / np.sum(action_count)
        action_count = [1] * action_size

        # Smoothed values for terminal display and for more stable hyper-parameter tuning
        smoothing = 0.99
        smoothed_normalized_score = smoothed_normalized_score * smoothing + normalized_score * (
            1.0 - smoothing)
        smoothed_completion = smoothed_completion * smoothing + completion * (
            1.0 - smoothing)

        # Print logs
        if episode_idx % checkpoint_interval == 0:
            torch.save(policy.qnetwork_local,
                       './checkpoints/obs2_multi-' + str(episode_idx) + '.pth')
            if train_params.render:
                env_renderer.close_window()

        print('\r🚂 Episode {}'
              '\t 🏆 Score: {:.3f}'
              ' Avg: {:.3f}'
              '\t 💯 Done: {:.2f}%'
              ' Avg: {:.2f}%'
              '\t 🎲 Epsilon: {:.2f} '
              '\t 🔀 Action Probs: {}'.format(episode_idx, normalized_score,
                                             smoothed_normalized_score,
                                             100 * completion,
                                             100 * smoothed_completion,
                                             eps_start,
                                             format_action_prob(action_probs)),
              end=" ")

        # Evaluate policy
        if episode_idx % train_params.checkpoint_interval == 0:
            scores, completions, nb_steps_eval = eval_policy(
                env, policy, n_eval_episodes, max_steps)
            writer.add_scalar("evaluation/scores_min", np.min(scores),
                              episode_idx)
            writer.add_scalar("evaluation/scores_max", np.max(scores),
                              episode_idx)
            writer.add_scalar("evaluation/scores_mean", np.mean(scores),
                              episode_idx)
            writer.add_scalar("evaluation/scores_std", np.std(scores),
                              episode_idx)
            writer.add_histogram("evaluation/scores", np.array(scores),
                                 episode_idx)
            writer.add_scalar("evaluation/completions_min",
                              np.min(completions), episode_idx)
            writer.add_scalar("evaluation/completions_max",
                              np.max(completions), episode_idx)
            writer.add_scalar("evaluation/completions_mean",
                              np.mean(completions), episode_idx)
            writer.add_scalar("evaluation/completions_std",
                              np.std(completions), episode_idx)
            writer.add_histogram("evaluation/completions",
                                 np.array(completions), episode_idx)
            writer.add_scalar("evaluation/nb_steps_min", np.min(nb_steps_eval),
                              episode_idx)
            writer.add_scalar("evaluation/nb_steps_max", np.max(nb_steps_eval),
                              episode_idx)
            writer.add_scalar("evaluation/nb_steps_mean",
                              np.mean(nb_steps_eval), episode_idx)
            writer.add_scalar("evaluation/nb_steps_std", np.std(nb_steps_eval),
                              episode_idx)
            writer.add_histogram("evaluation/nb_steps",
                                 np.array(nb_steps_eval), episode_idx)

            smoothing = 0.9
            smoothed_eval_normalized_score = smoothed_eval_normalized_score * smoothing + np.mean(
                scores) * (1.0 - smoothing)
            smoothed_eval_completion = smoothed_eval_completion * smoothing + np.mean(
                completions) * (1.0 - smoothing)
            writer.add_scalar("evaluation/smoothed_score",
                              smoothed_eval_normalized_score, episode_idx)
            writer.add_scalar("evaluation/smoothed_completion",
                              smoothed_eval_completion, episode_idx)

        # Save logs to tensorboard
        writer.add_scalar("training/score", normalized_score, episode_idx)
        writer.add_scalar("training/smoothed_score", smoothed_normalized_score,
                          episode_idx)
        writer.add_scalar("training/completion", np.mean(completion),
                          episode_idx)
        writer.add_scalar("training/smoothed_completion",
                          np.mean(smoothed_completion), episode_idx)
        writer.add_scalar("training/nb_steps", nb_steps, episode_idx)
        writer.add_histogram("actions/distribution", np.array(actions_taken),
                             episode_idx)
        writer.add_scalar("actions/nothing",
                          action_probs[RailEnvActions.DO_NOTHING], episode_idx)
        writer.add_scalar("actions/left",
                          action_probs[RailEnvActions.MOVE_LEFT], episode_idx)
        writer.add_scalar("actions/forward",
                          action_probs[RailEnvActions.MOVE_FORWARD],
                          episode_idx)
        writer.add_scalar("actions/right",
                          action_probs[RailEnvActions.MOVE_RIGHT], episode_idx)
        writer.add_scalar("actions/stop",
                          action_probs[RailEnvActions.STOP_MOVING],
                          episode_idx)
        writer.add_scalar("training/epsilon", eps_start, episode_idx)
        writer.add_scalar("training/buffer_size", len(policy.memory),
                          episode_idx)
        writer.add_scalar("training/loss", policy.loss, episode_idx)
        writer.add_scalar("timer/reset", reset_timer.get(), episode_idx)
        writer.add_scalar("timer/step", step_timer.get(), episode_idx)
        writer.add_scalar("timer/learn", learn_timer.get(), episode_idx)
        writer.add_scalar("timer/preproc", preproc_timer.get(), episode_idx)
        writer.add_scalar("timer/total", training_timer.get_current(),
                          episode_idx)
Example #6
def evaluate(n_episodes):
    run = SUBMISSIONS["rlpr-tcpr"]
    config, run = init_run(run)
    prio_agent = get_agent(config, run)
    env = get_env(config, rl=True)
    env_renderer = RenderTool(env, screen_width=8800)
    returns = []
    pcs = []
    malfs = []

    for _ in tqdm(range(n_episodes)):

        obs, _ = env.reset(regenerate_schedule=True, regenerate_rail=True)
        if RENDER:
            env_renderer.reset()
            env_renderer.render_env(show=True,
                                    frames=True,
                                    show_observations=False)

        if not obs:
            break

        steps = 0
        ep_return = 0
        done = defaultdict(lambda: False)
        robust_env = CprFlatlandGymEnv(rail_env=env,
                                       max_nr_active_agents=200,
                                       observation_space=None,
                                       priorizer=DistToTargetPriorizer(),
                                       allow_noop=True)
        priorities = prio_agent.compute_actions(obs, explore=False)
        sorted_actions = {
            k: v
            for k, v in sorted(
                priorities.items(), key=lambda item: item[1], reverse=True)
        }
        sorted_handles = list(sorted_actions.keys())

        while not done['__all__']:
            actions = ShortestPathAgent().compute_actions(obs, env)
            robust_actions = robust_env.get_robust_actions(
                actions, sorted_handles)
            obs, all_rewards, done, info = env.step(robust_actions)
            if RENDER:
                env_renderer.render_env(show=True,
                                        frames=True,
                                        show_observations=False)
            print('.', end='', flush=True)
            steps += 1
            ep_return += np.sum(list(all_rewards.values()))

        pc = np.sum(np.array([1 for a in env.agents if is_done(a)
                              ])) / env.get_num_agents()
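        # pc is the fraction of agents that reached their target ("percentage complete")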
        print("EPISODE PC:", pc)
        n_episodes += 1
        pcs.append(pc)
        returns.append(ep_return /
                       (env._max_episode_steps * env.get_num_agents()))
        malfs.append(
            np.sum([a.malfunction_data['nr_malfunctions']
                    for a in env.agents]))
    return pcs, returns, malfs
Example #7
# Relative weights of each cell type to be used by the random rail generators.
transition_probability = [
    1.0,  # empty cell - Case 0
    1.0,  # Case 1 - straight
    1.0,  # Case 2 - simple switch
    0.3,  # Case 3 - diamond crossing
    0.5,  # Case 4 - single slip
    0.5,  # Case 5 - double slip
    0.2,  # Case 6 - symmetrical
    0.0,  # Case 7 - dead end
    0.2,  # Case 8 - turn left
    0.2,  # Case 9 - turn right
    1.0,  # Case 10 - mirrored switch
]
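# random_rail_generator samples cell types in proportion to these weights;
# a weight of 0.0 (dead ends above) excludes that cell type entirely.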

# Example: generate a random rail
env = RailEnv(width=10,
              height=10,
              rail_generator=random_rail_generator(
                  cell_type_relative_proportion=transition_probability),
              number_of_agents=3)

env.reset()

env_renderer = RenderTool(env, gl="PIL")
env_renderer.render_env(show=True)

# uncomment to keep the renderer open
# input("Press Enter to continue...")
Example #8
              rail_generator=complex_rail_generator(nr_start_goal=n_goals,
                                                    nr_extra=5,
                                                    min_dist=min_dist,
                                                    max_dist=99999,
                                                    seed=0),
              schedule_generator=complex_schedule_generator(),
              obs_builder_object=TreeObsForRailEnv(
                  max_depth=1, predictor=ShortestPathPredictorForRailEnv()),
              number_of_agents=n_agents)
env.reset(True, True)

tree_depth = 1
observation_helper = TreeObsForRailEnv(
    max_depth=tree_depth, predictor=ShortestPathPredictorForRailEnv())
env_renderer = RenderTool(
    env,
    gl="PGL",
)
handle = env.get_agent_handles()
n_episodes = 10
max_steps = 100 * (env.height + env.width)
record_images = False
policy = OrderedPolicy()
action_dict = dict()

for trials in range(1, n_episodes + 1):
    # Reset environment
    obs, info = env.reset(True, True)
    done = env.dones
    env_renderer.reset()
    frame_step = 0
Example #9
def main(args):

    # Show options and values
    print(' ' * 26 + 'Options')
    for k, v in vars(args).items():
        print(' ' * 26 + k + ': ' + str(v))
    # Where to save models
    results_dir = os.path.join('results', args.model_id)
    if not os.path.exists(results_dir):
        os.makedirs(results_dir)

    rail_generator = sparse_rail_generator(
        max_num_cities=args.max_num_cities,
        seed=args.seed,
        grid_mode=args.grid_mode,
        max_rails_between_cities=args.max_rails_between_cities,
        max_rails_in_city=args.max_rails_in_city,
    )

    # Maps speeds to % of appearance in the env
    speed_ration_map = {1.: 1}  # Fast passenger train

    if args.multi_speed:
        speed_ration_map = {
            1.: 0.25,  # Fast passenger train
            1. / 2.: 0.25,  # Fast freight train
            1. / 3.: 0.25,  # Slow commuter train
            1. / 4.: 0.25
        }  # Slow freight train

    schedule_generator = sparse_schedule_generator(speed_ration_map)

    prediction_builder = ShortestPathPredictorForRailEnv(
        max_depth=args.prediction_depth)
    obs_builder = RailObsForRailEnv(predictor=prediction_builder)

    env = RailEnv(
        width=args.width,
        height=args.height,
        rail_generator=rail_generator,
        random_seed=0,
        schedule_generator=schedule_generator,
        number_of_agents=args.num_agents,
        obs_builder_object=obs_builder,
        malfunction_generator_and_process_data=malfunction_from_params(
            parameters={
                'malfunction_rate': args.malfunction_rate,
                'min_duration': args.min_duration,
                'max_duration': args.max_duration
            }))

    if args.render:
        env_renderer = RenderTool(env,
                                  agent_render_variant=AgentRenderVariant.
                                  AGENT_SHOWS_OPTIONS_AND_BOX,
                                  show_debug=True,
                                  screen_height=800,
                                  screen_width=800)

    if args.plot:
        writer = SummaryWriter(log_dir='runs/' + args.model_id)

    max_rails = 100  # TODO Must be a parameter of the env (estimated)
    # max_steps = env.compute_max_episode_steps(env.width, env.height)
    max_steps = 200

    preprocessor = ObsPreprocessor(max_rails, args.reorder_rails)

    dqn = DQNAgent(args, bitmap_height=max_rails * 3, action_space=2)
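    # action_space=2: the network only chooses between stop (0) and go (1);
    # the actual rail action is derived from the bitmap/path logic below.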

    if args.load_path:
        file = os.path.isfile(args.load_path)
        if file:
            dqn.qnetwork_local.load_state_dict(torch.load(args.load_path))
            print('WEIGHTS LOADED from: ', args.load_path)

    eps = args.start_eps
    railenv_action_dict = {}
    network_action_dict = {}
    # Metrics
    done_window = deque(
        maxlen=args.window_size)  # Env dones over last window_size episodes
    done_agents_window = deque(
        maxlen=args.window_size)  # Fraction of done agents over last window_size episodes
    reward_window = deque(
        maxlen=args.window_size
    )  # Cumulative rewards over last window_size episodes
    norm_reward_window = deque(
        maxlen=args.window_size
    )  # Normalized cum. rewards over last window_size episodes
    # Track means over windows of window_size episodes
    mean_dones = []
    mean_agent_dones = []
    mean_rewards = []
    mean_norm_rewards = []
    # Episode rewards/dones/norm rewards since beginning of training TODO
    #env_dones = []

    crash = [False] * args.num_agents
    update_values = [False] * args.num_agents
    buffer_obs = [[]] * args.num_agents

    ############ Main loop
    for ep in range(args.num_episodes):
        cumulative_reward = 0
        env_done = 0
        altmaps = [None] * args.num_agents
        altpaths = [[]] * args.num_agents
        buffer_rew = [0] * args.num_agents
        buffer_done = [False] * args.num_agents
        curr_obs = [None] * args.num_agents

        maps, info = env.reset()
        if args.print:
            debug.print_bitmaps(maps)

        if args.render:
            env_renderer.reset()

        for step in range(max_steps - 1):
            # Save a copy of maps at the beginning
            buffer_maps = maps.copy()
            # Note: the first bit in the bitmap is 0 for agents that have not departed yet
            for a in range(env.get_num_agents()):
                agent = env.agents[a]
                crash[a] = False
                update_values[a] = False
                network_action = None
                action = None

                # If the agent has arrived
                if agent.status == RailAgentStatus.DONE or agent.status == RailAgentStatus.DONE_REMOVED:
                    # TODO if agent !removed you should leave a bit in the bitmap
                    # TODO? set bitmap only the first time
                    maps[a, :, :] = 0
                    network_action = 0
                    action = RailEnvActions.DO_NOTHING

                # If the agent has not departed yet
                elif agent.status == RailAgentStatus.READY_TO_DEPART:
                    update_values[a] = True
                    obs = preprocessor.get_obs(a, maps[a], buffer_maps)
                    curr_obs[a] = obs.copy()

                    # Network chooses action
                    q_values = dqn.act(obs).cpu().data.numpy()
                    if np.random.random() > eps:
                        network_action = np.argmax(q_values)
                    else:
                        network_action = np.random.choice([0, 1])

                    if network_action == 0:
                        action = RailEnvActions.DO_NOTHING
                    else:  # Go
                        crash[a] = obs_builder.check_crash(a, maps)

                        if crash[a]:
                            network_action = 0
                            action = RailEnvActions.STOP_MOVING
                        else:
                            maps = obs_builder.update_bitmaps(a, maps)
                            action = obs_builder.get_agent_action(a)

                # If the agent is entering a switch
                elif obs_builder.is_before_switch(
                        a) and info['action_required'][a]:
                    # Recompute the alternatives if the altpaths cache is empty
                    # or was computed from a different agent position
                    if (len(altpaths[a]) == 0
                            or agent.position != altpaths[a][0][0].position):
                        altmaps[a], altpaths[a] = obs_builder.get_altmaps(a)

                    if len(altmaps[a]) > 0:
                        update_values[a] = True
                        altobs = [None] * len(altmaps[a])
                        q_values = np.array([])
                        for i in range(len(altmaps[a])):
                            altobs[i] = preprocessor.get_obs(
                                a, altmaps[a][i], buffer_maps)
                            q_values = np.concatenate([
                                q_values,
                                dqn.act(altobs[i]).cpu().data.numpy()
                            ])

                        # Epsilon-greedy action selection
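                        # q_values is a flat array with a (stop, go) pair per alternative map,
                        # so argmax % 2 recovers the action and argmax // 2 the chosen alternative.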
                        if np.random.random() > eps:
                            argmax = np.argmax(q_values)
                            network_action = argmax % 2
                            best_i = argmax // 2
                        else:
                            network_action = np.random.choice([0, 1])
                            best_i = np.random.choice(
                                np.arange(len(altmaps[a])))

                        # Use new bitmaps and paths
                        maps[a, :, :] = altmaps[a][best_i]
                        obs_builder.set_agent_path(a, altpaths[a][best_i])
                        curr_obs[a] = altobs[best_i].copy()

                    else:
                        print('[ERROR] NO ALTPATHS EP: {} STEP: {} AGENT: {}'.
                              format(ep, step, a))
                        network_action = 0

                    if network_action == 0:
                        action = RailEnvActions.STOP_MOVING
                    else:
                        crash[a] = obs_builder.check_crash(
                            a, maps, is_before_switch=True)

                        if crash[a]:
                            network_action = 0
                            action = RailEnvActions.STOP_MOVING
                        else:
                            action = obs_builder.get_agent_action(a)
                            maps = obs_builder.update_bitmaps(
                                a, maps, is_before_switch=True)

                # If the agent is following a rail
                elif info['action_required'][a]:
                    crash[a] = obs_builder.check_crash(a, maps)

                    if crash[a]:
                        network_action = 0
                        action = RailEnvActions.STOP_MOVING
                    else:
                        network_action = 1
                        action = obs_builder.get_agent_action(a)
                        maps = obs_builder.update_bitmaps(a, maps)

                else:  # not action_required
                    network_action = 1
                    action = RailEnvActions.DO_NOTHING
                    maps = obs_builder.update_bitmaps(a, maps)

                network_action_dict.update({a: network_action})
                railenv_action_dict.update({a: action})

            # Obs is computed from bitmaps while state is computed from env step (temporarily)
            _, reward, done, info = env.step(railenv_action_dict)  # Env step

            if args.render:
                env_renderer.render_env(show=True,
                                        show_observations=False,
                                        show_predictions=True)

            if args.debug:
                for a in range(env.get_num_agents()):
                    print('#########################################')
                    print('Info for agent {}'.format(a))
                    print('Status: {}'.format(info['status'][a]))
                    print('Position: {}'.format(env.agents[a].position))
                    print('Target: {}'.format(env.agents[a].target))
                    print('Moving? {} at speed: {}'.format(
                        env.agents[a].moving, info['speed'][a]))
                    print('Action required? {}'.format(
                        info['action_required'][a]))
                    print('Network action: {}'.format(network_action_dict[a]))
                    print('Railenv action: {}'.format(railenv_action_dict[a]))
            # Update replay buffer and train agent
            if args.train:
                for a in range(env.get_num_agents()):
                    if args.crash_penalty and crash[a]:
                        # Store bad experience
                        dqn.step(curr_obs[a], 1, -100, curr_obs[a], True)

                    if not args.switch2switch:
                        if update_values[a] and not buffer_done[a]:
                            next_obs = preprocessor.get_obs(a, maps[a], maps)
                            dqn.step(curr_obs[a], network_action_dict[a],
                                     reward[a], next_obs, done[a])

                    else:
                        if update_values[a] and not buffer_done[a]:
                            # If I had an obs from a previous switch
                            if len(buffer_obs[a]) != 0:
                                dqn.step(buffer_obs[a], 1, buffer_rew[a],
                                         curr_obs[a], done[a])
                                buffer_obs[a] = []
                                buffer_rew[a] = 0

                            if network_action_dict[a] == 0:
                                dqn.step(curr_obs[a], 1, reward[a],
                                         curr_obs[a], False)
                            elif network_action_dict[a] == 1:
                                # I store the obs and update at the next switch
                                buffer_obs[a] = curr_obs[a].copy()

                        # Cache reward only if we have an obs from a prev switch
                        if len(buffer_obs[a]) != 0:
                            buffer_rew[a] += reward[a]

                    # Now update the done cache to avoid adding experience many times
                    buffer_done[a] = done[a]

            for a in range(env.get_num_agents()):
                # Update the cumulative reward (not normalized)
                cumulative_reward += reward[a]  # / env.get_num_agents()

            # TODO? env sets done[all] = True for everyone when time limit is reached
            # devid: I also remember this, but from debugging it doesn't seem to happen
            if done['__all__']:
                env_done = 1
                break

        ################### End of the episode
        eps = max(args.end_eps, args.eps_decay * eps)  # Decrease epsilon
        # Metrics
        done_window.append(env_done)  # Save done in this episode

        num_agents_done = 0  # Num of agents that reached their target in the last episode
        for a in range(env.get_num_agents()):
            if done[a]:
                num_agents_done += 1
        done_agents_window.append(num_agents_done / env.get_num_agents())
        reward_window.append(
            cumulative_reward)  # Save cumulative reward in this episode
        normalized_reward = cumulative_reward / (env.compute_max_episode_steps(
            env.width, env.height) + env.get_num_agents())
        norm_reward_window.append(normalized_reward)

        mean_dones.append((np.mean(done_window)))
        mean_agent_dones.append((np.mean(done_agents_window)))
        mean_rewards.append(np.mean(reward_window))
        mean_norm_rewards.append(np.mean(norm_reward_window))

        # Print training results info
        print(
            '\r{} Agents on ({},{}). Episode: {}\t Mean done agents: {:.2f}\t Mean reward: {:.2f}\t Mean normalized reward: {:.2f}\t Done agents in last episode: {:.2f}%\t Epsilon: {:.2f}'
            .format(
                env.get_num_agents(),
                args.width,
                args.height,
                ep,
                mean_agent_dones[-1],  # Fraction of done agents
                mean_rewards[-1],
                mean_norm_rewards[-1],
                (num_agents_done / args.num_agents),
                eps),
            end=" ")

        if ep != 0 and (ep + 1) % args.checkpoint_interval == 0:
            print(
                '\r{} Agents on ({},{}). Episode: {}\t Mean done agents: {:.2f}\t Mean reward: {:.2f}\t Mean normalized reward: {:.2f}\t Epsilon: {:.2f}'
                .format(env.get_num_agents(), args.width, args.height, ep,
                        mean_agent_dones[-1], mean_rewards[-1],
                        mean_norm_rewards[-1], eps))

        if args.train and ep != 0 and (ep + 1) % args.save_interval == 0:
            torch.save(dqn.qnetwork_local.state_dict(),
                       results_dir + '/weights.pt')

        if args.plot:
            writer.add_scalar('mean_agent_dones', mean_agent_dones[-1], ep)
            writer.add_scalar('mean_rewards', mean_rewards[-1], ep)
            writer.add_scalar('mean_dones', mean_dones[-1], ep)
            writer.add_scalar('mean_norm_rewards', mean_norm_rewards[-1], ep)
            writer.add_scalar('epsilon', eps, ep)
Example #10
def main(argv):

    random.seed(1)
    np.random.seed(1)

    # Initialize a random map with a random number of agents
    x_dim = np.random.randint(20, 40)
    y_dim = np.random.randint(20, 40)
    n_agents = np.random.randint(3, 4)
    n_goals = n_agents + np.random.randint(0, 3)
    min_dist = int(0.75 * min(x_dim, y_dim))
    tree_depth = 4

    # Get an observation builder and predictor
    predictor = ShortestPathPredictorForRailEnv()
    observation_helper = TreeObsForRailEnv(max_depth=tree_depth, predictor=predictor)

    # Use the malfunction generator to break agents from time to time
    stochastic_data = {'prop_malfunction': 0.0,  # Percentage of defective agents
                       'malfunction_rate': 0,  # Rate of malfunction occurrence
                       'min_duration': 3,  # Minimal duration of malfunction
                       'max_duration': 20  # Max duration of malfunction
                       }

    # Different agent types (trains) with different speeds.
    speed_ration_map = {1.: 0.25,  # Fast passenger train
                        1. / 2.: 0.25,  # Fast freight train
                        1. / 3.: 0.25,  # Slow commuter train
                        1. / 4.: 0.25}  # Slow freight train

    env = RailEnv(width=x_dim,
                  height=y_dim,
                  rail_generator=sparse_rail_generator(max_num_cities=3,
                                                       # Number of cities in map (where train stations are)
                                                       seed=1,  # Random seed
                                                       grid_mode=False,
                                                       max_rails_between_cities=2,
                                                       max_rails_in_city=3),
                  schedule_generator=sparse_schedule_generator(speed_ration_map),
                  number_of_agents=n_agents,
                  stochastic_data=stochastic_data,  # Malfunction data generator
                  obs_builder_object=observation_helper)
    env.reset(True, True)

    # Initiate the renderer
    env_renderer = RenderTool(env, gl="PILSVG",
                              agent_render_variant=AgentRenderVariant.AGENT_SHOWS_OPTIONS_AND_BOX,
                              show_debug=False,
                              screen_height=1000,  # Adjust these parameters to fit your resolution
                              screen_width=1000)  # Adjust these parameters to fit your resolution
    handle = env.get_agent_handles()
    num_features_per_node = env.obs_builder.observation_dim

    nr_nodes = 0
    for i in range(tree_depth + 1):
        nr_nodes += np.power(4, i)
    state_size = 2 * num_features_per_node * nr_nodes
    action_size = 5
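
    # Sanity check (not in the original example): the loop above sums a geometric
    # series, sum_{i=0..tree_depth} 4**i == (4**(tree_depth + 1) - 1) // 3, and the
    # factor 2 in state_size accounts for the two stacked time steps.
    assert nr_nodes == (4 ** (tree_depth + 1) - 1) // 3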

    n_trials = 10
    observation_radius = 10
    max_steps = int(3 * (env.height + env.width))
    action_dict = dict()
    time_obs = deque(maxlen=2)
    agent_obs = [None] * env.get_num_agents()

    # Init and load agent
    agent = Agent(state_size, action_size)
    with path(fc_treeobs.nets, "multi_agent_2ts_checkpoint200.pth") as file_in:
        agent.qnetwork_local.load_state_dict(torch.load(file_in))

    # Vars used to record agent performance
    record_images = False
    frame_step = 0

    for trials in range(1, n_trials + 1):
        # Reset environment
        obs, info = env.reset(True, True)
        env_renderer.reset()

        # Build first two-time step observation
        for a in range(env.get_num_agents()):
            obs[a] = normalize_observation(obs[a], tree_depth, observation_radius=10)
        # Accumulate two time steps of observation (Here just twice the first state)
        for i in range(2):
            time_obs.append(obs)
        # Build the agent-specific observation by concatenating the two time steps
        for a in range(env.get_num_agents()):
            agent_obs[a] = np.concatenate((time_obs[0][a], time_obs[1][a]))

        # Run episode
        for step in range(max_steps):
            time.sleep(0.01)

            env_renderer.render_env(show=True, show_observations=False, show_predictions=True)

            if record_images:
                env_renderer.gl.save_image("./Images/Avoiding/flatland_frame_{:04d}.bmp".format(frame_step))
                frame_step += 1

            # Perform action for each agent
            for a in range(env.get_num_agents()):
                action = agent.act(agent_obs[a], eps=0)
                action_dict.update({a: action})

            # Environment step
            next_obs, all_rewards, done, _ = env.step(action_dict)

            # Collect observation after environment step
            for a in range(env.get_num_agents()):
                next_obs[a] = normalize_observation(next_obs[a], tree_depth, observation_radius=10)
            # Add new obs to the obs vector
            # Since time_obs is a deque with maxlen=2, appending on the right when the deque
            # is full automatically drops the element on the left side
            time_obs.append(next_obs)
            # Create obs using the obs at time step t-1 and the obs at time step t
            for a in range(env.get_num_agents()):
                agent_obs[a] = np.concatenate((time_obs[0][a], time_obs[1][a]))

            if done['__all__']:
                break
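
A minimal standalone sketch (not part of the example above) of the two-frame stacking that the deque with maxlen=2 implements:

from collections import deque

import numpy as np

time_obs = deque(maxlen=2)
obs_t0, obs_t1 = np.zeros(4), np.ones(4)       # placeholder observations

time_obs.append(obs_t0)
time_obs.append(obs_t0)                        # duplicate the first frame, as in the example
stacked = np.concatenate((time_obs[0], time_obs[1]))   # (obs_t0, obs_t0)

time_obs.append(obs_t1)                        # the oldest frame is dropped automatically
stacked = np.concatenate((time_obs[0], time_obs[1]))   # now (obs_t0, obs_t1)
assert stacked.shape == (8,)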
Exemple #11
0
# Construct the environment with the given observation, generators, predictors, and stochastic data
env = RailEnv(width=width,
              height=height,
              rail_generator=rail_generator,
              schedule_generator=schedule_generator,
              number_of_agents=nr_trains,
              stochastic_data=stochastic_data,  # Malfunction data generator
              obs_builder_object=observation_builder,
              remove_agents_at_target=True  # Removes agents at the end of their journey to make space for others
              )

# Initiate the renderer
env_renderer = RenderTool(env, gl="PILSVG",
                          agent_render_variant=AgentRenderVariant.AGENT_SHOWS_OPTIONS_AND_BOX,
                          show_debug=False,
                          screen_height=800,  # Adjust these parameters to fit your resolution
                          screen_width=800)  # Adjust these parameters to fit your resolution

plt.ion()
plt.show()

succ_best = 1
tries_best = 1
succ_stoch = 1
tries_stoch = 1
use_best = False

while True:
    episode_done = False
    episode_reward = 0
Exemple #12
0
def main(argv):
    try:
        opts, args = getopt.getopt(argv, "n:", ["n_trials="])
    except getopt.GetoptError:
        print('training_navigation.py -n <n_trials>')
        sys.exit(2)
    for opt, arg in opts:
        if opt in ('-n', '--n_trials'):
            n_trials = int(arg)

    random.seed(1)
    np.random.seed(1)

    # Parameters for the Environment

    # Multi agent (4000 iterations)
    x_dim = 16*3
    y_dim = 9*3
    n_agents = 5
    max_num_cities = 5
    max_rails_between_cities = 2
    max_rails_in_city = 3

    # Single agent (1000 iterations)    
    x_dim1 = 16*4
    y_dim1 = 9*4
    n_agents1 = 1
    max_num_cities1 = 9
    max_rails_between_cities1 = 5
    max_rails_in_city1 = 5

    # Use the malfunction generator to break agents from time to time
#    stochastic_data = {'malfunction_rate': 8000,  # Rate of malfunction occurrence of a single agent
#                       'min_duration': 15,  # Minimal duration of malfunction
#                       'max_duration': 50  # Max duration of malfunction
#                       }

    # Custom observation builder
    tree_depth = 2
    TreeObservation = TreeObsForRailEnv(max_depth=tree_depth, predictor=ShortestPathPredictorForRailEnv(20))

    # Different agent types (trains) with different speeds.
    speed_ration_map = {1.: 1.0,  # Fast passenger train
                        1. / 2.: 0.0,  # Fast freight train
                        1. / 3.: 0.0,  # Slow commuter train
                        1. / 4.: 0.0}  # Slow freight train

    env = RailEnv(width=x_dim,
                  height=y_dim,
                  rail_generator=sparse_rail_generator(max_num_cities=max_num_cities,
                                                       # Number of cities in map (where train stations are)
                                                       seed=1,  # Random seed
                                                       grid_mode=False,
                                                       max_rails_between_cities=max_rails_between_cities,
                                                       max_rails_in_city=max_rails_in_city),
                  schedule_generator=sparse_schedule_generator(speed_ration_map),
                  number_of_agents=n_agents,
#                  malfunction_generator_and_process_data=malfunction_from_params(stochastic_data),
                  # Malfunction data generator
                  obs_builder_object=TreeObservation)

    env1 = RailEnv(width=x_dim1,
                  height=y_dim1,
                  rail_generator=sparse_rail_generator(max_num_cities=max_num_cities1,
                                                       # Number of cities in map (where train stations are)
                                                       seed=786,  # Random seed
                                                       grid_mode=False,
                                                       max_rails_between_cities=max_rails_between_cities1,
                                                       max_rails_in_city=max_rails_in_city1),
                  schedule_generator=sparse_schedule_generator(speed_ration_map),
                  number_of_agents=n_agents1,
#                  malfunction_generator_and_process_data=malfunction_from_params(stochastic_data),
                  # Malfunction data generator
                  obs_builder_object=TreeObservation)
                
    env.reset(True, True)
    env1.reset(True, True)

    # After training we want to render the results so we also load a renderer
    #env_renderer = RenderTool(env, gl="PILSVG", 
    #                               screen_height=800,  # Adjust these parameters to fit your resolution
    #                               screen_width=900)

    env1_renderer = RenderTool(env1, gl="PILSVG",
                               screen_height=800,  # Adjust these parameters to fit your resolution
                               screen_width=900)

    # Given the depth of the tree observation and the number of features per node we get the following state_size
    num_features_per_node = env.obs_builder.observation_dim

    nr_nodes = 0
    for i in range(tree_depth + 1):
        nr_nodes += np.power(4, i)
    state_size = num_features_per_node * nr_nodes

    # The action space of flatland is 5 discrete actions
    action_size = 5

    # We set the number of episodes we would like to train on
    if 'n_trials' not in locals():
        n_trials = 15000

    # And the max number of steps we want to take per episode
    max_steps = int(3 * (env.height + env.width))
    max_steps1 = int(3 * (env1.height + env1.width))

    # Define training parameters
    eps = 1.
    eps_end = 0.005
    eps_decay = 0.9985

    # And some variables to keep track of the progress
    action_dict = dict()
    final_action_dict = dict()
    scores_window = deque(maxlen=100)
    done_window = deque(maxlen=100)
    deadlock_window = deque(maxlen=100)
    deadlock_average = []
    scores = []
    dones_list = []
    #Metrics
    eps_list = []
    action_prob_list = []
    action_prob = [0] * action_size
    agent_obs = [None] * env.get_num_agents()
    agent_next_obs = [None] * env.get_num_agents()
    agent_obs_buffer = [None] * env.get_num_agents()
    agent_action_buffer = [2] * env.get_num_agents()
    cummulated_reward = np.zeros(env.get_num_agents())
    update_values = False
    # Now we load a Double dueling DQN agent
    agent = Agent(state_size, action_size)

    for trials in range(1, n_trials + 1):

        if trials > 1000:
            # Reset environment
            obs, info = env.reset(True, True)
            #env_renderer.reset()
            # Build agent specific observations
            for a in range(env.get_num_agents()):
                if obs[a]:
                    agent_obs[a] = normalize_observation(obs[a], tree_depth, observation_radius=10)
                    agent_obs_buffer[a] = agent_obs[a].copy()

            # Reset score and done
            score = 0
            env_done = 0

            # Run episode
            for step in range(max_steps):
                # Action
                for a in range(env.get_num_agents()):
                    if info['action_required'][a]:
                        # If an action is required, we want to store the obs at that step as well as the action
                        update_values = True
                        action = agent.act(agent_obs[a], eps=eps)
                        action_prob[action] += 1
                    else:
                        update_values = False
                        action = 0 
                    action_dict.update({a: action})

                # Environment step
                next_obs, all_rewards, done, deadlocks, info = env.step(action_dict)
                #env_renderer.render_env(show=True, show_predictions=True, show_observations=True)
                # Update replay buffer and train agent
                for a in range(env.get_num_agents()):
                    # Only update the values when we are done or when an action was taken and thus relevant information is present
                    if update_values or done[a]:
                        agent.step(agent_obs_buffer[a], agent_action_buffer[a], all_rewards[a],
                                 agent_obs[a], done[a])
                        cummulated_reward[a] = 0.

                        agent_obs_buffer[a] = agent_obs[a].copy()
                        agent_action_buffer[a] = action_dict[a]
                    if next_obs[a]:
                        agent_obs[a] = normalize_observation(next_obs[a], tree_depth, observation_radius=10)

                    score += all_rewards[a] / env.get_num_agents()

                # Copy observation
                if done['__all__']:
                    env_done = 1
                    break

        else:  # First 1000 trials: train on the single-agent environment (env1)

            # Reset environment
            obs, info = env1.reset(True, True)
            #env1_renderer.reset()
            # Build agent specific observations
            for a in range(env1.get_num_agents()):
                if obs[a]:
                    agent_obs[a] = normalize_observation(obs[a], tree_depth, observation_radius=10)
                    agent_obs_buffer[a] = agent_obs[a].copy()

            # Reset score and done
            score = 0
            env_done = 0

            # Run episode
            for step in range(max_steps1):
                # Action
                for a in range(env1.get_num_agents()):
                    if info['action_required'][a]:
                        # If an action is required, we want to store the obs at that step as well as the action
                        update_values = True
                        action = agent.act(agent_obs[a], eps=eps)
                        action_prob[action] += 1
                    else:
                        update_values = False
                        action = 0
                    action_dict.update({a: action})

                # Environment step
                next_obs, all_rewards, done, deadlocks, info = env1.step(action_dict)
                #env1_renderer.render_env(show=True, show_predictions=True, show_observations=True)
                # Update replay buffer and train agent
                for a in range(env1.get_num_agents()):
                    # Only update the values when we are done or when an action was taken and thus relevant information is present
                    if update_values or done[a]:
                        agent.step(agent_obs_buffer[a], agent_action_buffer[a], all_rewards[a],
                                     agent_obs[a], done[a])
                        cummulated_reward[a] = 0.

                        agent_obs_buffer[a] = agent_obs[a].copy()
                        agent_action_buffer[a] = action_dict[a]
                    if next_obs[a]:
                        agent_obs[a] = normalize_observation(next_obs[a], tree_depth, observation_radius=10)

                    score += all_rewards[a] / env1.get_num_agents()

                # Copy observation
                if done['__all__']:
                    env_done = 1
                    break
        
        
        
        # Epsilon decay
        eps = max(eps_end, eps_decay * eps)  # decrease epsilon

        # Collect information about training
        tasks_finished = 0

        if trials > 1000:

            for _idx in range(env.get_num_agents()):
                if done[_idx] == 1:
                    tasks_finished += 1
            done_window.append(tasks_finished / max(1, env.get_num_agents()))
        
        
            scores_window.append(score / max_steps)  # save most recent score
        
            scores.append(np.mean(scores_window))
            deadlock_window.append(deadlocks.count(1)/max(1, env.get_num_agents()))
            deadlock_average.append(np.mean(deadlock_window))
            dones_list.append((np.mean(done_window)))

            x_dim_current = x_dim
            y_dim_current = y_dim
            agent_num = env.get_num_agents()
        else:
            for _idx in range(env1.get_num_agents()):
                if done[_idx] == 1:
                    tasks_finished += 1
            done_window.append(tasks_finished / max(1, env1.get_num_agents()))
        
        
            scores_window.append(score / max_steps1)  # save most recent score
        
            scores.append(np.mean(scores_window))
            deadlock_window.append(deadlocks.count(1)/max(1, env1.get_num_agents()))
            deadlock_average.append(np.mean(deadlock_window))
            dones_list.append((np.mean(done_window)))

            x_dim_current = x_dim1
            y_dim_current = y_dim1
            agent_num = env1.get_num_agents()

        eps_list.append(eps)
        action_prob_list.append(action_prob/ np.sum(action_prob))
        
        

        print(
            '\rTraining {} Agents on ({},{}).\t Episode {}\t Average Score: {:.3f}\tDones: {:.2f} %\tDeadlocks: {:.2f} \tEpsilon: {:.2f} \t Action Probabilities: \t {}'.format(
                agent_num, 
                x_dim_current, y_dim_current,
                trials,
                np.mean(scores_window), 
                100 * np.mean(done_window), np.mean(deadlock_window),
                eps, action_prob / np.sum(action_prob)), end=" ")


        if trials % 100 == 0:
            print(
                '\rTraining {} Agents on ({},{}).\t Episode {}\t Average Score: {:.3f}\tDones: {:.2f}%\tEpsilon: {:.2f} \t Action Probabilities: \t {}'.format(
                    env.get_num_agents(),
                    x_dim_current, y_dim_current,
                    trials,
                    np.mean(scores_window),
                    100 * np.mean(done_window),
                    eps, action_prob / np.sum(action_prob)))
            torch.save(agent.qnetwork_local.state_dict(),
                       path.join('Nets',('navigator_checkpoint' +str(trials) + '.pth')))

            action_prob = [1] * action_size

        if trials % 50 == 0:

            np.savetxt(fname=path.join('Nets' , 'metrics.csv'), X=np.transpose(np.asarray([scores,dones_list,deadlock_average,eps_list])), delimiter=';',newline='\n')
            np.savetxt(fname=path.join('Nets' , 'action_prob.csv'), X=np.asarray(action_prob_list), delimiter=';',newline='\n')


    # Plot overall training progress at the end
    plt.plot(scores)
    plt.show()
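
A compact sketch (my own reading, not code from the example) of the two-stage curriculum above: the first 1000 trials run on the single-agent environment env1, later trials on the multi-agent env, while the same agent and epsilon schedule are shared:

def pick_env(trial, env_multi, env_single, switch_at=1000):
    """Return the environment and its step budget for this trial (sketch only)."""
    chosen = env_multi if trial > switch_at else env_single
    return chosen, int(3 * (chosen.height + chosen.width))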
Exemple #13
0
def main(args):
    try:
        opts, args = getopt.getopt(args, "", ["sleep-for-animation=", ""])
    except getopt.GetoptError as err:
        print(str(err))  # will print something like "option -a not recognized"
        sys.exit(2)
    sleep_for_animation = True
    for o, a in opts:
        if o in ("--sleep-for-animation"):
            sleep_for_animation = str2bool(a)
        else:
            assert False, "unhandled option"

    batch_builder = SampleBatchBuilder()  # or MultiAgentSampleBatchBuilder
    writer = JsonWriter("./")

    #  Setting these 2 parameters to True can slow down training
    visuals = False
    sleep_for_animation = False

    if visuals:
        from flatland.utils.rendertools import RenderTool

    max_depth = 30
    tree_depth = 2
    trial_start = 0
    n_trials = 97
    start = 0

    columns = [
        'Agents', 'X_DIM', 'Y_DIM', 'TRIAL_NO', 'REWARD', 'NORMALIZED_REWARD',
        'DONE_RATIO', 'STEPS', 'ACTION_PROB'
    ]
    df_all_results = pd.DataFrame(columns=columns)

    for trials in range(trial_start, n_trials + 1):

        step = 0

        obs_builder_object = TreeObsForRailEnv(
            max_depth=tree_depth,
            predictor=ShortestPathPredictorForRailEnv(max_depth))

        env_file = f"../env_configs/test-envs-small/Test_0/Level_{trials}.mpk"

        env = RailEnv(
            width=1,
            height=1,
            rail_generator=rail_from_file(env_file),
            schedule_generator=schedule_from_file(env_file),
            malfunction_generator_and_process_data=malfunction_from_file(
                env_file),
            obs_builder_object=obs_builder_object)

        obs, info = env.reset(regenerate_rail=True,
                              regenerate_schedule=True,
                              activate_agents=False,
                              random_seed=1001)

        file = f"../env_configs/actions-small/Test_0/Level_{trials}.mpk"

        with open(file, "r") as files:
            expert_actions = json.load(files)

        n_agents = env.get_num_agents()
        x_dim, y_dim = env.width, env.height

        agent_obs = [None] * n_agents
        agent_obs_buffer = [None] * n_agents
        done = dict()
        done["__all__"] = False

        if imitate:
            agent_action_buffer = list(expert_actions[step].values())
        else:
            # , p=[0.2, 0, 0.5])  # [0] * n_agents
            agent_action_buffer = np.random.choice(5, n_agents, replace=True)
        update_values = [False] * n_agents

        max_steps = int(4 * 2 * (20 + env.height + env.width))

        action_size = 5  # 3

        # And some variables to keep track of the progress
        action_dict = dict()
        scores_window = deque(maxlen=100)
        reward_window = deque(maxlen=100)
        done_window = deque(maxlen=100)
        action_prob = [0] * action_size

        # agent = Agent(state_size, action_size)

        if visuals:
            env_renderer = RenderTool(env, gl="PILSVG")
            env_renderer.render_env(show=True,
                                    frames=True,
                                    show_observations=True)

        for a in range(n_agents):
            if obs[a]:
                agent_obs[a] = normalize_observation(obs[a],
                                                     tree_depth,
                                                     observation_radius=10)
                agent_obs_buffer[a] = agent_obs[a].copy()

        # Reset score and done
        score = 0
        agent_action_buffer = np.zeros(n_agents)
        # prev_action = np.zeros_like(env.action_space.sample())
        prev_reward = np.zeros(n_agents)
        for step in range(max_steps):
            for a in range(n_agents):
                if info['action_required'][a]:
                    if imitate:
                        if step < len(expert_actions):
                            action = expert_actions[step][str(a)]
                        else:
                            action = 0
                    else:
                        action = 0

                    action_prob[action] += 1
                    update_values[a] = True

                else:
                    update_values[a] = False
                    action = 0

                action_dict.update({a: action})

            next_obs, all_rewards, done, info = env.step(action_dict)

            for a in range(n_agents):

                if next_obs[a] is not None:
                    agent_obs[a] = normalize_observation(next_obs[a],
                                                         tree_depth,
                                                         observation_radius=10)

                # Only update the values when we are done or when an action
                # was taken and thus relevant information is present
                if update_values[a] or done[a]:
                    start += 1

                    batch_builder.add_values(
                        t=step,
                        eps_id=trials,
                        agent_index=0,
                        obs=agent_obs_buffer[a],
                        actions=action_dict[a],
                        action_prob=1.0,  # put the true action probability
                        rewards=all_rewards[a],
                        prev_actions=agent_action_buffer[a],
                        prev_rewards=prev_reward[a],
                        dones=done[a],
                        infos=info['action_required'][a],
                        new_obs=agent_obs[a])

                agent_obs_buffer[a] = agent_obs[a].copy()
                agent_action_buffer[a] = action_dict[a]
                prev_reward[a] = all_rewards[a]

                score += all_rewards[a]  # / env.get_num_agents()

            if visuals:
                env_renderer.render_env(show=True,
                                        frames=True,
                                        show_observations=True)
                if sleep_for_animation:
                    time.sleep(0.5)

            if done["__all__"] or step > max_steps:
                writer.write(batch_builder.build_and_reset())
                break

            # Collect information about training
            if step % 100 == 0:
                tasks_finished = 0
                for current_agent in env.agents:
                    if current_agent.status == RailAgentStatus.DONE_REMOVED:
                        tasks_finished += 1
                print(
                    '\rTrial No {} Training {} Agents on ({},{}).\t Steps {}\t Reward: {:.3f}\t Normalized Reward: {:.3f}\tDones: {:.2f}%\t'
                    .format(
                        trials, env.get_num_agents(), x_dim, y_dim, step,
                        score, score / (max_steps + n_agents), 100 * np.mean(
                            tasks_finished / max(1, env.get_num_agents()))),
                    end=" ")

        tasks_finished = 0
        for current_agent in env.agents:
            if current_agent.status == RailAgentStatus.DONE_REMOVED:
                tasks_finished += 1
        done_window.append(tasks_finished / max(1, env.get_num_agents()))
        reward_window.append(score)
        scores_window.append(score / (max_steps + n_agents))

        data = [[
            n_agents, x_dim, y_dim, trials,
            np.mean(reward_window),
            np.mean(scores_window), 100 * np.mean(done_window), step,
            action_prob / np.sum(action_prob)
        ]]

        df_cur = pd.DataFrame(data, columns=columns)
        df_all_results = pd.concat([df_all_results, df_cur])

        if imitate:
            df_all_results.to_csv(
                f'TreeImitationLearning_DQN_TrainingResults.csv', index=False)

        print(
            '\rTrial No {} Training {} Agents on ({},{}).\t Total Steps {}\t Reward: {:.3f}\t Normalized Reward: {:.3f}\tDones: {:.2f}%\t'
            .format(trials, env.get_num_agents(), x_dim, y_dim, step,
                    np.mean(reward_window), np.mean(scores_window),
                    100 * np.mean(done_window)),
            end=" ")

        if visuals:
            env_renderer.close_window()

        gc.collect()
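
A hedged, self-contained sketch of how one transition per step feeds the RLlib batch builder used above, with dummy data; the imports assume the older ray.rllib layout that this snippet appears to target:

import numpy as np
from ray.rllib.evaluation.sample_batch_builder import SampleBatchBuilder
from ray.rllib.offline.json_writer import JsonWriter

builder = SampleBatchBuilder()
writer = JsonWriter("./")                  # same output directory as above

obs = np.zeros(231)                        # dummy flattened tree observation
new_obs = np.zeros(231)
builder.add_values(t=0, eps_id=0, agent_index=0,
                   obs=obs, actions=2, action_prob=1.0, rewards=-1.0,
                   prev_actions=0, prev_rewards=0.0, dones=False,
                   infos=False, new_obs=new_obs)
writer.write(builder.build_and_reset())    # writes one SampleBatch as JSON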
Exemple #14
0
    def handle_env_create(self, command):
        """
        Handles a ENV_CREATE command from the client
        TODO: Add a high-level summary of everything that's happening here.
        """
        if not self.simulation_done:
            # trying to reset a simulation before finishing the previous one
            _command_response = self._error_template(
                "CAN'T CREATE NEW ENV BEFORE PREVIOUS IS DONE")
            self.send_response(_command_response, command)
            raise Exception(_command_response['payload'])

        self.simulation_count += 1
        self.simulation_done = False
        if self.simulation_count < len(self.env_file_paths):
            """
            There are still test envs left that are yet to be evaluated 
            """
            test_env_file_path = self.env_file_paths[self.simulation_count]
            print("Evaluating : {}".format(test_env_file_path))
            test_env_file_path = os.path.join(self.test_env_folder,
                                              test_env_file_path)
            del self.env
            self.env = RailEnv(
                width=1,
                height=1,
                rail_generator=rail_from_file(test_env_file_path),
                schedule_generator=schedule_from_file(test_env_file_path),
                malfunction_generator_and_process_data=malfunction_from_file(
                    test_env_file_path),
                obs_builder_object=DummyObservationBuilder())

            if self.begin_simulation:
                # If begin_simulation has already been initialized at least once,
                # this adds the simulation time for the previous episode
                self.simulation_times.append(time.time() -
                                             self.begin_simulation)
            self.begin_simulation = time.time()

            # Update evaluation metadata for the previous episode
            self.update_evaluation_metadata()

            # Start adding placeholders for the new episode
            self.simulation_env_file_paths.append(
                os.path.relpath(test_env_file_path,
                                self.test_env_folder))  # relative path

            self.simulation_rewards.append(0)
            self.simulation_rewards_normalized.append(0)
            self.simulation_percentage_complete.append(0)
            self.simulation_steps.append(0)

            self.current_step = 0

            _observation, _info = self.env.reset(regenerate_rail=True,
                                                 regenerate_schedule=True,
                                                 activate_agents=False,
                                                 random_seed=RANDOM_SEED)

            if self.visualize:
                current_env_path = self.env_file_paths[self.simulation_count]
                if current_env_path in self.video_generation_envs:
                    self.env_renderer = RenderTool(
                        self.env,
                        gl="PILSVG",
                    )
                elif self.env_renderer:
                    self.env_renderer = False

            _command_response = {}
            _command_response[
                'type'] = messages.FLATLAND_RL.ENV_CREATE_RESPONSE
            _command_response['payload'] = {}
            _command_response['payload']['observation'] = _observation
            _command_response['payload'][
                'env_file_path'] = self.env_file_paths[self.simulation_count]
            _command_response['payload']['info'] = _info
            _command_response['payload']['random_seed'] = RANDOM_SEED
        else:
            """
            All test env evaluations are complete
            """
            _command_response = {}
            _command_response[
                'type'] = messages.FLATLAND_RL.ENV_CREATE_RESPONSE
            _command_response['payload'] = {}
            _command_response['payload']['observation'] = False
            _command_response['payload']['env_file_path'] = False
            _command_response['payload']['info'] = False
            _command_response['payload']['random_seed'] = False

        self.send_response(_command_response, command)
        #####################################################################
        # Update evaluation state
        #####################################################################
        progress = np.clip(
            self.simulation_count * 1.0 / len(self.env_file_paths), 0, 1)

        mean_reward, mean_normalized_reward, mean_percentage_complete = self.compute_mean_scores(
        )

        self.evaluation_state["state"] = "IN_PROGRESS"
        self.evaluation_state["progress"] = progress
        self.evaluation_state["simulation_count"] = self.simulation_count
        self.evaluation_state["score"]["score"] = mean_percentage_complete
        self.evaluation_state["score"]["score_secondary"] = mean_reward
        self.evaluation_state["meta"][
            "normalized_reward"] = mean_normalized_reward
        self.handle_aicrowd_info_event(self.evaluation_state)
def main(argv):
    try:
        opts, args = getopt.getopt(argv, "n:", ["n_trials="])
    except getopt.GetoptError:
        print('training_navigation.py -n <n_trials>')
        sys.exit(2)
    for opt, arg in opts:
        if opt in ('-n', '--n_trials'):
            n_trials = int(arg)

    random.seed(1)
    np.random.seed(1)

    # Parameters for the Environment
    x_dim = 35
    y_dim = 35
    n_agents = 1

    # Use the malfunction generator to break agents from time to time
    stochastic_data = {
        'prop_malfunction': 0.0,  # Percentage of defective agents
        'malfunction_rate': 30,  # Rate of malfunction occurrence
        'min_duration': 3,  # Minimal duration of malfunction
        'max_duration': 20  # Max duration of malfunction
    }

    # Custom observation builder
    TreeObservation = TreeObsForRailEnv(max_depth=2)

    # Different agent types (trains) with different speeds.
    speed_ration_map = {
        1.: 0.,  # Fast passenger train
        1. / 2.: 1.0,  # Fast freight train
        1. / 3.: 0.0,  # Slow commuter train
        1. / 4.: 0.0  # Slow freight train
    }

    env = RailEnv(width=x_dim,
                  height=y_dim,
                  rail_generator=complex_rail_generator(nr_start_goal=10,
                                                        nr_extra=2,
                                                        min_dist=5,
                                                        max_dist=99999),
                  number_of_agents=n_agents,
                  obs_builder_object=TreeObservation)

    # After training we want to render the results so we also load a renderer
    env_renderer = RenderTool(
        env,
        gl="PILSVG",
    )
    # Given the depth of the tree observation and the number of features per node we get the following state_size
    num_features_per_node = env.obs_builder.observation_dim
    tree_depth = 2
    nr_nodes = 0
    for i in range(tree_depth + 1):
        nr_nodes += np.power(4, i)
    state_size = num_features_per_node * nr_nodes

    # The action space of flatland is 5 discrete actions
    action_size = 5

    # We set the number of episodes we would like to train on
    if 'n_trials' not in locals():
        n_trials = 1

    # And the max number of steps we want to take per episode
    max_steps = int(3 * (env.height + env.width))

    # Define training parameters
    eps = 1.
    eps_end = 0.005
    eps_decay = 0.998

    # And some variables to keep track of the progress
    action_dict = dict()
    final_action_dict = dict()
    scores_window = deque(maxlen=100)
    done_window = deque(maxlen=100)
    scores = []
    dones_list = []
    action_prob = [0] * action_size
    agent_obs = [None] * env.get_num_agents()
    agent_next_obs = [None] * env.get_num_agents()
    agent_obs_buffer = [None] * env.get_num_agents()
    agent_action_buffer = [2] * env.get_num_agents()
    cummulated_reward = np.zeros(env.get_num_agents())
    update_values = False
    # Now we load a Double dueling DQN agent
    agent = Agent(state_size, action_size)
    # agent.load("models")

    for trials in range(1, n_trials + 1):

        # Reset environment
        obs = env.reset()
        env_renderer.reset()
        # Build agent specific observations
        for a in range(env.get_num_agents()):
            if obs[a]:
                # agent_obs[a] = normalize_observation(obs[a], tree_depth, observation_radius=10)
                agent_obs_buffer[a] = obs[a].copy()

        # Reset score and done
        score = 0
        env_done = 0

        # Run episode
        for step in range(max_steps):
            # Action
            for a in range(env.get_num_agents()):
                # If an action is required, we want to store the obs at that step as well as the action
                update_values = True
                action = agent.act(obs[a], eps=eps)
                action_prob[action] += 1
                # else:
                #     update_values = False
                #     action = 0
                action_dict.update({a: action})

            # Environment step
            next_obs, all_rewards, done, info = env.step(action_dict)
            env_renderer.render_env(show=True,
                                    show_observations=True,
                                    show_predictions=False)
            # Update replay buffer and train agent
            for a in range(env.get_num_agents()):
                # Only update the values when we are done or when an action was taken and thus relevant information is present
                if update_values or done[a]:
                    agent.step(agent_obs_buffer[a], agent_action_buffer[a],
                               all_rewards[a], obs[a], done[a])
                    cummulated_reward[a] = 0.

                    agent_obs_buffer[a] = obs[a].copy()
                    agent_action_buffer[a] = action_dict[a]
                # if next_obs[a]:
                # agent_obs[a] = normalize_observation(next_obs[a], tree_depth, observation_radius=10)

                score += all_rewards[a] / env.get_num_agents()

            # Copy observation
            if done['__all__']:
                env_done = 1
                break

        # Epsilon decay
        eps = max(eps_end, eps_decay * eps)  # decrease epsilon

        # Collect information about training
        tasks_finished = 0
        for _idx in range(env.get_num_agents()):
            if done[_idx] == 1:
                tasks_finished += 1
        done_window.append(tasks_finished / max(1, env.get_num_agents()))
        scores_window.append(score / max_steps)  # save most recent score
        scores.append(np.mean(scores_window))
        dones_list.append((np.mean(done_window)))

        print(
            '\rTraining {} Agents on ({},{}).\t Episode {}\t Average Score: {:.3f}\tDones: {:.2f}%\tEpsilon: {:.2f} \t Action Probabilities: \t {}'
            .format(env.get_num_agents(), x_dim, y_dim, trials,
                    np.mean(scores_window), 100 * np.mean(done_window), eps,
                    action_prob / np.sum(action_prob)),
            end=" ")

        if trials % 100 == 0:
            print(
                '\rTraining {} Agents on ({},{}).\t Episode {}\t Average Score: {:.3f}\tDones: {:.2f}%\tEpsilon: {:.2f} \t Action Probabilities: \t {}'
                .format(env.get_num_agents(), x_dim, y_dim, trials,
                        np.mean(scores_window), 100 * np.mean(done_window),
                        eps, action_prob / np.sum(action_prob)))
            # tf.save_checkpoint(agent.qnetwork_local.state_dict(),
            #            './Nets/navigator_checkpoint' + str(trials) + '.pth')
            action_prob = [1] * action_size

    # Plot overall training progress at the end

    # plt.plot(scores)
    # plt.show()
    agent.save("first-run")
Exemple #16
0
    steps_by_episode.append(step)
    total_rewards_by_episode.append(total_reward)

    env.restart_agents()
    if done:
        env.dones = {0: False, '__all__': False}

#%%

env = RailEnv(width=7,
              height=7,
              rail_generator=complex_rail_generator(nr_start_goal=10,
                                                    nr_extra=1,
                                                    min_dist=8,
                                                    max_dist=99999,
                                                    seed=1),
              schedule_generator=complex_schedule_generator(),
              number_of_agents=2,
              obs_builder_object=TreeObsForRailEnv(max_depth=2))
#env.reset()
env_renderer = RenderTool(env, agent_render_variant=3)
env_renderer.render_env(show=True,
                        show_predictions=False,
                        show_observations=False)

#%%
steps_by_episode = np.array(steps_by_episode)
plt.plot(moving_average(steps_by_episode, 200))

#%%
#qq = QTable(env, get_rail_coordinates(env),3,3,3)
def test_shortest_path_predictor(rendering=False):
    rail, rail_map = make_simple_rail()
    env = RailEnv(
        width=rail_map.shape[1],
        height=rail_map.shape[0],
        rail_generator=rail_from_grid_transition_map(rail),
        schedule_generator=random_schedule_generator(),
        number_of_agents=1,
        obs_builder_object=TreeObsForRailEnv(
            max_depth=2, predictor=ShortestPathPredictorForRailEnv()),
    )
    env.reset()

    # set the initial position
    agent = env.agents[0]
    agent.initial_position = (5, 6)  # south dead-end
    agent.position = (5, 6)  # south dead-end
    agent.direction = 0  # north
    agent.initial_direction = 0  # north
    agent.target = (3, 9)  # east dead-end
    agent.moving = True
    agent.status = RailAgentStatus.ACTIVE

    env.reset(False, False)

    if rendering:
        renderer = RenderTool(env, gl="PILSVG")
        renderer.render_env(show=True, show_observations=False)
        input("Continue?")

    # compute the observations and predictions
    distance_map = env.distance_map.get()
    assert distance_map[0, agent.initial_position[0], agent.initial_position[1], agent.direction] == 5.0, \
        "found {} instead of {}".format(
            distance_map[agent.handle, agent.initial_position[0], agent.initial_position[1], agent.direction], 5.0)

    paths = get_shortest_paths(env.distance_map)[0]
    assert paths == [
        WayPoint((5, 6), 0),
        WayPoint((4, 6), 0),
        WayPoint((3, 6), 0),
        WayPoint((3, 7), 1),
        WayPoint((3, 8), 1),
        WayPoint((3, 9), 1)
    ]

    # extract the data
    predictions = env.obs_builder.predictions
    positions = np.array(
        list(map(lambda prediction: [*prediction[1:3]], predictions[0])))
    directions = np.array(
        list(map(lambda prediction: [prediction[3]], predictions[0])))
    time_offsets = np.array(
        list(map(lambda prediction: [prediction[0]], predictions[0])))

    # test if data meets expectations
    expected_positions = [
        [5, 6],
        [4, 6],
        [3, 6],
        [3, 7],
        [3, 8],
        [3, 9],
        [3, 9],
        [3, 9],
        [3, 9],
        [3, 9],
        [3, 9],
        [3, 9],
        [3, 9],
        [3, 9],
        [3, 9],
        [3, 9],
        [3, 9],
        [3, 9],
        [3, 9],
        [3, 9],
        [3, 9],
    ]
    expected_directions = [
        [Grid4TransitionsEnum.NORTH],  # next is [5,6] heading north
        [Grid4TransitionsEnum.NORTH],  # next is [4,6] heading north
        [Grid4TransitionsEnum.NORTH],  # next is [3,6] heading north
        [Grid4TransitionsEnum.EAST],  # next is [3,7] heading east
        [Grid4TransitionsEnum.EAST],
        [Grid4TransitionsEnum.EAST],
        [Grid4TransitionsEnum.EAST],
        [Grid4TransitionsEnum.EAST],
        [Grid4TransitionsEnum.EAST],
        [Grid4TransitionsEnum.EAST],
        [Grid4TransitionsEnum.EAST],
        [Grid4TransitionsEnum.EAST],
        [Grid4TransitionsEnum.EAST],
        [Grid4TransitionsEnum.EAST],
        [Grid4TransitionsEnum.EAST],
        [Grid4TransitionsEnum.EAST],
        [Grid4TransitionsEnum.EAST],
        [Grid4TransitionsEnum.EAST],
        [Grid4TransitionsEnum.EAST],
        [Grid4TransitionsEnum.EAST],
        [Grid4TransitionsEnum.EAST],
    ]

    expected_time_offsets = np.array([
        [0.],
        [1.],
        [2.],
        [3.],
        [4.],
        [5.],
        [6.],
        [7.],
        [8.],
        [9.],
        [10.],
        [11.],
        [12.],
        [13.],
        [14.],
        [15.],
        [16.],
        [17.],
        [18.],
        [19.],
        [20.],
    ])

    assert np.array_equal(time_offsets, expected_time_offsets), \
        "time_offsets {}, expected {}".format(time_offsets, expected_time_offsets)

    assert np.array_equal(positions, expected_positions), \
        "positions {}, expected {}".format(positions, expected_positions)
    assert np.array_equal(directions, expected_directions), \
        "directions {}, expected {}".format(directions, expected_directions)
def eval_policy(env_params, checkpoint, n_eval_episodes, max_steps,
                action_size, state_size, seed, render, allow_skipping,
                allow_caching):
    # Evaluation is faster on CPU (except if you use a really huge policy)
    parameters = {'use_gpu': False}

    policy = DDDQNPolicy(state_size,
                         action_size,
                         Namespace(**parameters),
                         evaluation_mode=True)
    policy.qnetwork_local = torch.load(checkpoint)

    env_params = Namespace(**env_params)

    # Environment parameters
    n_agents = env_params.n_agents
    x_dim = env_params.x_dim
    y_dim = env_params.y_dim
    n_cities = env_params.n_cities
    max_rails_between_cities = env_params.max_rails_between_cities
    max_rails_in_city = env_params.max_rails_in_city

    # Malfunction and speed profiles
    # TODO pass these parameters properly from main!
    malfunction_parameters = MalfunctionParameters(
        malfunction_rate=1. / 2000,  # Rate of malfunctions
        min_duration=20,  # Minimal duration
        max_duration=50  # Max duration
    )

    # Only fast trains in Round 1
    speed_profiles = {
        1.: 1.0,  # Fast passenger train
        1. / 2.: 0.0,  # Fast freight train
        1. / 3.: 0.0,  # Slow commuter train
        1. / 4.: 0.0  # Slow freight train
    }

    # Observation parameters
    observation_tree_depth = env_params.observation_tree_depth
    observation_radius = env_params.observation_radius
    observation_max_path_depth = env_params.observation_max_path_depth

    # Observation builder
    predictor = ShortestPathPredictorForRailEnv(observation_max_path_depth)
    tree_observation = TreeObsForRailEnv(max_depth=observation_tree_depth,
                                         predictor=predictor)

    # Setup the environment
    env = RailEnv(
        width=x_dim,
        height=y_dim,
        rail_generator=sparse_rail_generator(
            max_num_cities=n_cities,
            grid_mode=False,
            max_rails_between_cities=max_rails_between_cities,
            max_rails_in_city=max_rails_in_city,
        ),
        schedule_generator=sparse_schedule_generator(speed_profiles),
        number_of_agents=n_agents,
        malfunction_generator_and_process_data=malfunction_from_params(
            malfunction_parameters),
        obs_builder_object=tree_observation)

    if render:
        env_renderer = RenderTool(env, gl="PGL")

    action_dict = dict()
    scores = []
    completions = []
    nb_steps = []
    inference_times = []
    preproc_times = []
    agent_times = []
    step_times = []

    for episode_idx in range(n_eval_episodes):
        seed += 1

        inference_timer = Timer()
        preproc_timer = Timer()
        agent_timer = Timer()
        step_timer = Timer()

        step_timer.start()
        obs, info = env.reset(regenerate_rail=True,
                              regenerate_schedule=True,
                              random_seed=seed)
        step_timer.end()

        agent_obs = [None] * env.get_num_agents()
        score = 0.0

        if render:
            env_renderer.set_new_rail()

        final_step = 0
        skipped = 0

        nb_hit = 0
        agent_last_obs = {}
        agent_last_action = {}

        for step in range(max_steps - 1):
            if allow_skipping and check_if_all_blocked(env):
                # FIXME why -1? bug where all agents are "done" after max_steps!
                skipped = max_steps - step - 1
                final_step = max_steps - 2
                n_unfinished_agents = sum(not done[idx]
                                          for idx in env.get_agent_handles())
                score -= skipped * n_unfinished_agents
                break

            agent_timer.start()
            for agent in env.get_agent_handles():
                if obs[agent] and info['action_required'][agent]:
                    if agent in agent_last_obs and np.all(
                            agent_last_obs[agent] == obs[agent]):
                        nb_hit += 1
                        action = agent_last_action[agent]

                    else:
                        preproc_timer.start()
                        norm_obs = normalize_observation(
                            obs[agent],
                            tree_depth=observation_tree_depth,
                            observation_radius=observation_radius)
                        preproc_timer.end()

                        inference_timer.start()
                        action = policy.act(norm_obs, eps=0.0)
                        inference_timer.end()

                    action_dict.update({agent: action})

                    if allow_caching:
                        agent_last_obs[agent] = obs[agent]
                        agent_last_action[agent] = action
            agent_timer.end()

            step_timer.start()
            obs, all_rewards, done, info = env.step(action_dict)
            step_timer.end()

            if render:
                env_renderer.render_env(show=True,
                                        frames=False,
                                        show_observations=False,
                                        show_predictions=False)

                if step % 100 == 0:
                    print("{}/{}".format(step, max_steps - 1))

            for agent in env.get_agent_handles():
                score += all_rewards[agent]

            final_step = step

            if done['__all__']:
                break

        normalized_score = score / (max_steps * env.get_num_agents())
        scores.append(normalized_score)

        tasks_finished = sum(done[idx] for idx in env.get_agent_handles())
        completion = tasks_finished / max(1, env.get_num_agents())
        completions.append(completion)

        nb_steps.append(final_step)

        inference_times.append(inference_timer.get())
        preproc_times.append(preproc_timer.get())
        agent_times.append(agent_timer.get())
        step_times.append(step_timer.get())

        skipped_text = ""
        if skipped > 0:
            skipped_text = "\t⚡ Skipped {}".format(skipped)

        hit_text = ""
        if nb_hit > 0:
            hit_text = "\t⚡ Hit {} ({:.1f}%)".format(nb_hit, (100 * nb_hit) /
                                                     (n_agents * final_step))

        print(
            "☑️  Score: {:.3f} \tDone: {:.1f}% \tNb steps: {:.3f} "
            "\t🍭 Seed: {}"
            "\t🚉 Env: {:.3f}s  "
            "\t🤖 Agent: {:.3f}s (per step: {:.3f}s) \t[preproc: {:.3f}s \tinfer: {:.3f}s]"
            "{}{}".format(normalized_score, completion * 100.0, final_step,
                          seed, step_timer.get(), agent_timer.get(),
                          agent_timer.get() / final_step, preproc_timer.get(),
                          inference_timer.get(), skipped_text, hit_text))

    return scores, completions, nb_steps, agent_times, step_times
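
A standalone sketch (my own) of the observation-caching shortcut used in eval_policy above: if an agent's observation is identical to the one it saw on the previous step, the cached action is reused instead of re-running the network:

import numpy as np

agent_last_obs, agent_last_action = {}, {}

def cached_act(handle, obs, act_fn):
    """Return a cached action on identical observations, else query act_fn."""
    if handle in agent_last_obs and np.all(agent_last_obs[handle] == obs):
        return agent_last_action[handle]   # cache hit: skip inference
    action = act_fn(obs)                   # cache miss: run the policy
    agent_last_obs[handle] = obs
    agent_last_action[handle] = action
    return action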
def test_dummy_predictor(rendering=False):
    rail, rail_map = make_simple_rail2()

    env = RailEnv(
        width=rail_map.shape[1],
        height=rail_map.shape[0],
        rail_generator=rail_from_grid_transition_map(rail),
        schedule_generator=random_schedule_generator(),
        number_of_agents=1,
        obs_builder_object=TreeObsForRailEnv(
            max_depth=2, predictor=DummyPredictorForRailEnv(max_depth=10)),
    )
    env.reset()

    # set initial position and direction for testing...
    env.agents[0].initial_position = (5, 6)
    env.agents[0].initial_direction = 0
    env.agents[0].direction = 0
    env.agents[0].target = (3, 0)

    env.reset(False, False)
    env.set_agent_active(env.agents[0])

    if rendering:
        renderer = RenderTool(env, gl="PILSVG")
        renderer.render_env(show=True, show_observations=False)
        input("Continue?")

    # test assertions
    predictions = env.obs_builder.predictor.get(None)
    positions = np.array(
        list(map(lambda prediction: [*prediction[1:3]], predictions[0])))
    directions = np.array(
        list(map(lambda prediction: [prediction[3]], predictions[0])))
    time_offsets = np.array(
        list(map(lambda prediction: [prediction[0]], predictions[0])))
    actions = np.array(
        list(map(lambda prediction: [prediction[4]], predictions[0])))

    # compare against expected values
    expected_positions = np.array([
        [5., 6.],
        [4., 6.],
        [3., 6.],
        [3., 5.],
        [3., 4.],
        [3., 3.],
        [3., 2.],
        [3., 1.],
        # at target (3,0): stay in this position from here on
        [3., 0.],
        [3., 0.],
        [3., 0.],
    ])
    expected_directions = np.array([
        [0.],
        [0.],
        [0.],
        [3.],
        [3.],
        [3.],
        [3.],
        [3.],
        # at target (3,0): stay in this position from here on
        [3.],
        [3.],
        [3.]
    ])
    expected_time_offsets = np.array([
        [0.],
        [1.],
        [2.],
        [3.],
        [4.],
        [5.],
        [6.],
        [7.],
        [8.],
        [9.],
        [10.],
    ])
    expected_actions = np.array([
        [0.],
        [2.],
        [2.],
        [2.],
        [2.],
        [2.],
        [2.],
        [2.],
        # reaching the target by moving straight
        [2.],
        # at target: stopped moving
        [4.],
        [4.],
    ])
    assert np.array_equal(positions, expected_positions)
    assert np.array_equal(directions, expected_directions)
    assert np.array_equal(time_offsets, expected_time_offsets)
    assert np.array_equal(actions, expected_actions)
Exemple #20
0
def get_reward(weights, model, render=False):
    cloned_model = copy.deepcopy(model)
    for i, param in enumerate(cloned_model.parameters()):
        try:
            param.data.copy_(weights[i])
        except:
            param.data.copy_(weights[i].data)

    env_Orig = RailEnv(
        width=x_dim,
        height=y_dim,
        rail_generator=sparse_rail_generator(
            max_num_cities=3,
            # Number of cities in map (where train stations are)
            seed=1,  # Random seed
            grid_mode=False,
            max_rails_between_cities=2,
            max_rails_in_city=3),
        schedule_generator=sparse_schedule_generator(speed_ration_map),
        number_of_agents=n_agents,
        stochastic_data=stochastic_data,  # Malfunction data generator
        obs_builder_object=TreeObservation)

    env = copy.deepcopy(env_Orig)

    # After training we want to render the results so we also load a renderer
    env_renderer = RenderTool(
        env,
        gl="PILSVG",
    )

    # And the max number of steps we want to take per episode
    max_steps = int(4 * 2 * (20 + env.height + env.width))

    n_episodes = 1
    for trials in range(1, n_episodes + 1):
        # Reset environment
        obs, info = env.reset(True, True)
        env_renderer.reset()
        # Build agent specific observations
        for a in range(env.get_num_agents()):
            if obs[a]:
                agent_obs[a] = normalize_observation(obs[a],
                                                     tree_depth,
                                                     observation_radius=10)
                agent_obs_buffer[a] = agent_obs[a].copy()

        # Reset score and done
        score = 0
        env_done = 0
        step = 0

        # Run episode
        while True:
            # Action
            for a in range(env.get_num_agents()):
                if info['action_required'][a]:
                    # If an action is required, we want to store the obs at that step as well as the action
                    update_values[a] = True

                    batch = torch.from_numpy(agent_obs[a][np.newaxis,
                                                          ...]).float()
                    if cuda:
                        batch = batch.cuda()
                    prediction = cloned_model(Variable(batch))
                    action = prediction.data.cpu().numpy().argmax()

                    # action = agent.act(agent_obs[a], eps=eps)
                    action_prob[action] += 1
                else:
                    update_values[a] = False
                    action = 0
                action_dict.update({a: action})

            # Environment step
            # print("Action Values:", action_dict)
            next_obs, all_rewards, done, info = env.step(action_dict)
            step += 1
            if (render):
                env_renderer.render_env(show=True,
                                        show_predictions=True,
                                        show_observations=False)

            for a in range(env.get_num_agents()):
                # Only update the values when the agent is done or when an action was taken, since only then is relevant information present
                if update_values[a] or done[a]:
                    # agent.step(agent_obs_buffer[a], agent_action_buffer[a], all_rewards[a],
                    #           agent_obs[a], done[a])
                    cummulated_reward[a] = 0.

                    agent_obs_buffer[a] = agent_obs[a].copy()
                    agent_action_buffer[a] = action_dict[a]
                if next_obs[a]:
                    agent_obs[a] = normalize_observation(next_obs[a],
                                                         tree_depth,
                                                         observation_radius=10)

                score += all_rewards[a] / env.get_num_agents()
            # print(all_rewards)
            # Copy observation
            if done['__all__'] or step >= max_steps:
                env_done = 1
                break

        # Collect information about the episode
        tasks_finished = 0
        for current_agent in env.agents:
            if current_agent.status == RailAgentStatus.DONE_REMOVED:
                tasks_finished += 1
        done_window.append(tasks_finished / max(1, env.get_num_agents()))
        scores_window.append(score / max_steps)  # save most recent score
        scores.append(np.mean(scores_window))
        dones_list.append((np.mean(done_window)))

        print(
            '\rTraining {} Agents on ({},{}).\t Episode {}\t Average Score: {:.3f}\tDones: {:.2f}%\t Action Probabilities: \t {}'
            .format(env.get_num_agents(), x_dim, y_dim, trials,
                    np.mean(scores_window), 100 * np.mean(done_window),
                    action_prob / np.sum(action_prob)),
            end=" ")

    # env.close()
    data = [[
        n_agents, x_dim, y_dim, trials,
        np.mean(scores_window), 100 * np.mean(done_window), step,
        action_prob / np.sum(action_prob)
    ]]

    dfCur = pd.DataFrame(data)

    with open(f'ES_TrainingResults_{n_agents}_{x_dim}_{y_dim}.csv', 'a') as f:
        dfCur.to_csv(f, index=False, header=False)

    return np.mean(scores)
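get_reward scores a single candidate weight set. A minimal sketch of how it could sit inside a plain evolution-strategies update; the helper es_step and the n_candidates/sigma/learning_rate parameters are illustrative assumptions, not code from this example:

import torch

def es_step(base_model, n_candidates=8, sigma=0.05, learning_rate=0.01):
    base_weights = [p.data.clone() for p in base_model.parameters()]
    rewards, noises = [], []
    for _ in range(n_candidates):
        # Perturb every parameter tensor with Gaussian noise and score the candidate.
        noise = [torch.randn_like(w) for w in base_weights]
        candidate = [w + sigma * z for w, z in zip(base_weights, noise)]
        rewards.append(get_reward(candidate, base_model))
        noises.append(noise)
    # Vanilla ES estimator: move the base weights towards well-scoring noise.
    r = torch.tensor(rewards, dtype=torch.float32)
    r = (r - r.mean()) / (r.std() + 1e-8)
    for i, p in enumerate(base_model.parameters()):
        grad = sum(r[j] * noises[j][i] for j in range(n_candidates))
        p.data.add_(learning_rate / (n_candidates * sigma) * grad)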
Exemple #21
0
def evaluate(seed=37429879,
             timed=False,
             filename="./rl-weights.pth",
             debug=False,
             refresh=1):

    # Attempt to load policy from disk.
    policy = load_policy(filename, seed=seed)

    # Create environment with given seeding.
    env, max_steps, _, _, observation_tree_depth, _ = create_default_single_agent_environment(
        seed + 1, timed)

    # Fixed environment parameters (these must match the parameters used during training!)
    observation_radius = 10

    env_renderer = None
    if debug:
        env_renderer = RenderTool(env, screen_width=1920, screen_height=1080)

    # Create container for the agent actions and observations.
    action_dict = dict()
    agent_obs = [None] * env.number_of_agents

    num_maps = 100
    scores = []
    successes = 0

    for _ in range(0, num_maps):

        # Create a new map.
        obs, info = env.reset(True, True)
        score = 0

        if debug:
            env_renderer.reset()
            env_renderer.render_env(show=True,
                                    frames=False,
                                    show_observations=False)
            time.sleep(refresh)

        # Run episode
        for _ in range(max_steps - 1):

            # Build agent specific observations
            for agent in env.get_agent_handles():
                if obs[agent]:
                    agent_obs[agent] = normalize_observation(
                        obs[agent],
                        observation_tree_depth,
                        observation_radius=observation_radius)

            # If an action is required, select the action.
            for agent in env.get_agent_handles():
                action = 0
                if info['action_required'][agent]:
                    action = policy.act(agent_obs[agent], eps=0.08)
                    #print("Required " + str(action))
                action_dict.update({agent: action})

            # Environment step
            obs, all_rewards, done, info = env.step(action_dict)

            if debug:
                env_renderer.render_env(show=True,
                                        frames=False,
                                        show_observations=False)
                time.sleep(refresh)

            # Track rewards (single-agent environment, so `agent` from the
            # loop above refers to the only agent handle).
            score = score + all_rewards[agent]

            if done[agent]:
                successes = successes + 1
                break

        # Record scores.
        scores.append(score)

    print("Successful:    %8.2f%%" % (100 * successes / num_maps))
    print("Mean reward:   %8.2f" % (np.mean(scores)))
    print("Median reward: %8.2f" % (np.median(scores)))
Exemple #22
0
def test_get_k_shortest_paths(rendering=False):
    rail, rail_map = make_simple_rail_with_alternatives()

    env = RailEnv(width=rail_map.shape[1],
                  height=rail_map.shape[0],
                  rail_generator=rail_from_grid_transition_map(rail),
                  schedule_generator=random_schedule_generator(),
                  number_of_agents=1,
                  obs_builder_object=GlobalObsForRailEnv(),
                  )
    env.reset()

    initial_position = (3, 1)  # west dead-end
    initial_direction = Grid4TransitionsEnum.WEST  # west
    target_position = (3, 9)  # east

    # set the initial position
    agent = env.agents[0]
    agent.position = initial_position
    agent.initial_position = initial_position
    agent.direction = initial_direction
    agent.target = target_position  # east dead-end
    agent.moving = True

    env.reset(False, False)
    if rendering:
        renderer = RenderTool(env, gl="PILSVG")
        renderer.render_env(show=True, show_observations=False)
        input()

    actual = set(get_k_shortest_paths(
        env=env,
        source_position=initial_position,  # west dead-end
        source_direction=int(initial_direction),  # facing west
        target_position=target_position,
        k=10
    ))

    expected = set([
        (
            Waypoint(position=(3, 1), direction=3),
            Waypoint(position=(3, 0), direction=3),
            Waypoint(position=(3, 1), direction=1),
            Waypoint(position=(3, 2), direction=1),
            Waypoint(position=(3, 3), direction=1),
            Waypoint(position=(2, 3), direction=0),
            Waypoint(position=(1, 3), direction=0),
            Waypoint(position=(0, 3), direction=0),
            Waypoint(position=(0, 4), direction=1),
            Waypoint(position=(0, 5), direction=1),
            Waypoint(position=(0, 6), direction=1),
            Waypoint(position=(0, 7), direction=1),
            Waypoint(position=(0, 8), direction=1),
            Waypoint(position=(0, 9), direction=1),
            Waypoint(position=(1, 9), direction=2),
            Waypoint(position=(2, 9), direction=2),
            Waypoint(position=(3, 9), direction=2)),
        (
            Waypoint(position=(3, 1), direction=3),
            Waypoint(position=(3, 0), direction=3),
            Waypoint(position=(3, 1), direction=1),
            Waypoint(position=(3, 2), direction=1),
            Waypoint(position=(3, 3), direction=1),
            Waypoint(position=(3, 4), direction=1),
            Waypoint(position=(3, 5), direction=1),
            Waypoint(position=(3, 6), direction=1),
            Waypoint(position=(4, 6), direction=2),
            Waypoint(position=(5, 6), direction=2),
            Waypoint(position=(6, 6), direction=2),
            Waypoint(position=(5, 6), direction=0),
            Waypoint(position=(4, 6), direction=0),
            Waypoint(position=(4, 7), direction=1),
            Waypoint(position=(4, 8), direction=1),
            Waypoint(position=(4, 9), direction=1),
            Waypoint(position=(3, 9), direction=0))
    ])

    assert actual == expected, "actual={},expected={}".format(actual, expected)
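The integer directions in the Waypoint tuples above follow Grid4TransitionsEnum (0 = North, 1 = East, 2 = South, 3 = West). A small helper, illustrative only and not part of the test, can translate a path into a more readable form:

from flatland.core.grid.grid4 import Grid4TransitionsEnum

def describe_path(path):
    # Map each waypoint's integer direction back to N/E/S/W.
    names = {int(d): d.name[0] for d in Grid4TransitionsEnum}
    return [(wp.position, names[int(wp.direction)]) for wp in path]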
Exemple #23
0
def main(argv):
    try:
        opts, args = getopt.getopt(argv, "n:", ["n_trials="])
    except getopt.GetoptError:
        print('test_navigation_single_agent.py -n <n_trials>')
        sys.exit(2)
    for opt, arg in opts:
        if opt in ('-n', '--n_trials'):
            n_trials = int(arg)

    random.seed(1)
    np.random.seed(1)

    # Parameters for the Environment
    multi_agent_setup = 3

    if multi_agent_setup == 1:
        x_dim = 35
        y_dim = 35
        n_agents = 1
        max_num_cities = 3
        max_rails_between_cities = 2
        max_rails_in_city = 3

    # Multi agent (3)
    if multi_agent_setup == 3:
        x_dim = 40
        y_dim = 40
        n_agents = 3
        max_num_cities = 4
        max_rails_between_cities = 2
        max_rails_in_city = 3

    # Multi agent (5)
    if multi_agent_setup == 5:
        x_dim = 16 * 3
        y_dim = 9 * 3
        n_agents = 7
        max_num_cities = 5
        max_rails_between_cities = 2
        max_rails_in_city = 3

    if multi_agent_setup == 7:
        x_dim = 16 * 4
        y_dim = 9 * 4
        n_agents = 7
        max_num_cities = 7
        max_rails_between_cities = 4
        max_rails_in_city = 4

    if multi_agent_setup == 8:
        x_dim = 16 * 4
        y_dim = 9 * 4
        n_agents = 10
        max_num_cities = 9
        max_rails_between_cities = 5
        max_rails_in_city = 5

    # We are training an Agent using the Tree Observation with depth 2
    #observation_builder = TreeObsForRailEnv(max_depth=2, predictor = ShortestPathPredictorForRailEnv(20))

    # Use the malfunction generator to break agents from time to time
    stochastic_data = {
        'malfunction_rate': 80,  # Rate of malfunction occurrence for a single agent
        'min_duration': 15,  # Minimal duration of a malfunction
        'max_duration': 50  # Max duration of a malfunction
    }

    # Custom observation builder
    tree_depth = 2
    TreeObservation = TreeObsForRailEnv(
        max_depth=tree_depth, predictor=ShortestPathPredictorForRailEnv(20))

    np.savetxt(fname=path.join('NetsTest', 'info.txt'),
               X=[
                   x_dim, y_dim, n_agents, max_num_cities,
                   max_rails_between_cities, max_rails_in_city, tree_depth
               ],
               delimiter=';')

    # Different agent types (trains) with different speeds.
    speed_ration_map = {
        1.: 1.,  # Fast passenger train
        1. / 2.: 0.0,  # Fast freight train
        1. / 3.: 0.0,  # Slow commuter train
        1. / 4.: 0.0  # Slow freight train
    }

    #speed_ration_map = {1.: 0.25,  # Fast passenger train
    #                    1. / 2.: 0.25,  # Fast freight train
    #                    1. / 3.: 0.25,  # Slow commuter train
    #                    1. / 4.: 0.25}  # Slow freight train

    env = RailEnv(
        width=x_dim,
        height=y_dim,
        rail_generator=sparse_rail_generator(
            max_num_cities=max_num_cities,
            # Number of cities in map (where train stations are)
            seed=14,  # Random seed
            grid_mode=False,
            max_rails_between_cities=max_rails_between_cities,
            max_rails_in_city=max_rails_in_city),
        schedule_generator=sparse_schedule_generator(speed_ration_map),
        #malfunction_generator_and_process_data=malfunction_from_params(stochastic_data),
        number_of_agents=n_agents,
        obs_builder_object=TreeObservation)
    env.reset()

    #env_renderer = RenderTool(env, gl="PILSVG", )
    env_renderer = RenderTool(
        env,
        gl="PILSVG",
        agent_render_variant=AgentRenderVariant.AGENT_SHOWS_OPTIONS_AND_BOX,
        show_debug=False,
        screen_height=(1080 *
                       0.8),  # Adjust these parameters to fit your resolution
        screen_width=(1920 * 0.8))
    num_features_per_node = env.obs_builder.observation_dim

    nr_nodes = 0
    for i in range(tree_depth + 1):
        nr_nodes += np.power(4, i)
    state_size = num_features_per_node * nr_nodes
    action_size = 5

    # We set the number of evaluation episodes to run (unless it was passed via -n/--n_trials)
    if 'n_trials' not in locals():
        n_trials = 15000
    max_steps = int(3 * (env.height + env.width))
    eps = 1.
    eps_end = 0.005
    eps_decay = 0.9995

    # And some variables to keep track of the performance
    action_dict = dict()
    final_action_dict = dict()
    action_prob_list = []
    scores_window = deque(maxlen=100)
    done_window = deque(maxlen=100)
    scores = []
    scores_list = []
    deadlock_list = []
    dones_list_window = []
    dones_list = []
    action_prob = [0] * action_size
    agent_obs = [None] * env.get_num_agents()
    agent_next_obs = [None] * env.get_num_agents()  # Unused in this script
    agent = Agent(state_size, action_size)

    # Load model's weights to test
    agent.qnetwork_local.load_state_dict(
        torch.load(
            path.join(
                'NetsTest',
                'navigator_checkpoint5000_multi3_deadlock_global10.pth')))

    record_images = False
    frame_step = 0

    for trials in range(1, n_trials + 1):

        # Reset environment
        obs, info = env.reset()  #(True, True)
        env_renderer.reset()
        # Build agent specific observations
        for a in range(env.get_num_agents()):
            agent_obs[a] = normalize_observation(
                obs[a], tree_depth, observation_radius=10)
        # Reset score and done
        score = 0
        env_done = 0

        # Run episode
        for step in range(max_steps):

            # Action
            for a in range(env.get_num_agents()):
                if info['action_required'][a]:
                    action = agent.act(agent_obs[a], eps=0.)
                    action_prob[action] += 1

                else:
                    action = 0

                action_dict.update({a: action})
            # Environment step
            # NOTE: this custom RailEnv variant returns an additional `deadlocks` list.
            obs, all_rewards, done, deadlocks, info = env.step(action_dict)

            env_renderer.render_env(show=True,
                                    show_predictions=True,
                                    show_observations=False)
            # Build agent specific observations and normalize
            for a in range(env.get_num_agents()):
                if obs[a]:
                    agent_obs[a] = normalize_observation(obs[a],
                                                         tree_depth,
                                                         observation_radius=10)

                score += all_rewards[a] / env.get_num_agents()

            if done['__all__']:
                break

        # Collect information about the test episode
        tasks_finished = 0
        for _idx in range(env.get_num_agents()):
            if done[_idx] == 1:
                tasks_finished += 1
        done_window.append(tasks_finished / max(1, env.get_num_agents()))
        scores_window.append(score / max_steps)  # save most recent score
        scores.append(np.mean(scores_window))
        dones_list.append(tasks_finished / max(1, env.get_num_agents()))
        dones_list_window.append((np.mean(done_window)))
        scores_list.append(score / max_steps)
        deadlock_list.append(deadlocks.count(1) / max(1, env.get_num_agents()))

        if (np.sum(action_prob) == 0):
            action_prob_normalized = [0] * action_size
        else:
            action_prob_normalized = action_prob / np.sum(action_prob)

        print(
            '\rTesting {} Agents on ({},{}).\t Episode {}\t Score: {:.3f}\tDones: {:.2f}%\tDeadlocks: {:.2f}\t Action Probabilities: \t {}'
            .format(env.get_num_agents(), x_dim, y_dim, trials,
                    score / max_steps,
                    100 * tasks_finished / max(1, env.get_num_agents()),
                    deadlocks.count(1) / max(1, env.get_num_agents()),
                    action_prob_normalized),
            end=" ")

        #if trials % 100 == 0:
        action_prob_list.append(action_prob_normalized)
        action_prob = [0] * action_size

        if trials % 50 == 0:

            #np.savetxt(fname=path.join('Nets' , 'scores_metric.txt'), X=scores)
            #np.savetxt(fname=path.join('Nets' , 'dones_metric.txt'), X=dones_list)
            np.savetxt(fname=path.join('NetsTest', 'test_metrics.csv'),
                       X=np.transpose(
                           np.asarray([
                               scores_list, scores, dones_list,
                               dones_list_window, deadlock_list
                           ])),
                       delimiter=';',
                       newline='\n')
            np.savetxt(fname=path.join('NetsTest', 'test_action_prob.csv'),
                       X=np.asarray(action_prob_list),
                       delimiter=';',
                       newline='\n')
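The script above reads -n/--n_trials via getopt; assuming it is saved under the name shown in its own usage message, it could be invoked as:

python test_navigation_single_agent.py -n 100          # run 100 evaluation episodes
python test_navigation_single_agent.py --n_trials 500  # long-option form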
Exemple #24
0
    # STN-to-action controller
    # This is an old STN-to-action controller from the 2019 challenge
    # and may need correction/improvement.
    #####################################################################

    my_controller = MCP_Controller(local_env)

    # Initialize previous locations to -1 for each agent (none of them has left the station yet).
    prev_locs = [-1] * len(local_env.agents)

    #####################################################################
    # Show the flatland visualization, for debugging
    #####################################################################

    if env_renderer_enable:
        env_renderer = RenderTool(local_env, screen_height=4000,
                                  screen_width=4000)
        env_renderer.render_env(show=True, show_observations=False, show_predictions=False)

    #####################################################################

    while True:
        #####################################################################
        # Evaluation of a single episode
        #
        #####################################################################

        if debug_print:
            print("current step: ", steps)

        time_start = time.time()
Exemple #25
0
import numpy as np

from flatland.envs.rail_env import RailEnv
from flatland.envs.rail_generators import complex_rail_generator
from flatland.utils.rendertools import RenderTool

NUMBER_OF_AGENTS = 10
env = RailEnv(
    width=20,
    height=20,
    rail_generator=complex_rail_generator(nr_start_goal=10,
                                          nr_extra=1,
                                          min_dist=8,
                                          max_dist=99999,
                                          seed=0),
    number_of_agents=NUMBER_OF_AGENTS,
)

# Generate the rail network and place the agents before rendering or stepping.
env.reset()

env_renderer = RenderTool(env)


def my_controller():
    """
    You are supposed to write this controller
    """
    _action = {}
    for _idx in range(NUMBER_OF_AGENTS):
        _action[_idx] = np.random.randint(0, 5)
    return _action


for step in range(100):
    _action = my_controller()
    # Advance the environment with the sampled actions and redraw the scene.
    obs, all_rewards, done, info = env.step(_action)
    env_renderer.render_env(show=True, frames=False, show_observations=False)
Exemple #26
0
# Initialize the q-values
# Q = np.zeros((x_dim, y_dim, action_size), dtype=np.float)
with open(Q_filename, "rb") as f:
    Q = pickle.load(f)

# Use the complex_rail_generator to generate feasible network configurations with corresponding tasks
# Training on simple small tasks is the best way to get familiar with the environment
TreeObservation = TreeObsForRailEnv(max_depth=tree_depth)
env = RailEnv(width=x_dim,
              height=y_dim,
              rail_generator=complex_rail_generator(nr_start_goal=10, nr_extra=2, min_dist=5, max_dist=99999),
              obs_builder_object=TreeObservation,
              number_of_agents=n_agents)

env_renderer = RenderTool(env, gl="PILSVG", )

# Given the depth of the tree observation and the number of features per node we get the following state_size
features_per_node = 9
nr_nodes = 0
for i in range(tree_depth + 1):
    nr_nodes += np.power(4, i)
state_size = features_per_node * nr_nodes

# Import your own Agent or use RLlib to train agents on Flatland
# As an example we use a random agent here


class RandomAgent:

    def __init__(self, state_size, action_size):
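        # The snippet is truncated here; the lines below are a minimal completion
        # sketch in the spirit of the usual Flatland random-agent tutorials
        # (method names and bodies are assumptions, not the original example's code).
        self.state_size = state_size
        self.action_size = action_size

    def act(self, state, eps=0.):
        # Ignore the observation and pick a uniformly random action.
        return np.random.choice(np.arange(self.action_size))

    def step(self, memories):
        # A random agent does not learn from experience.
        return

    def save(self, filename):
        return

    def load(self, filename):
        return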
Exemple #27
0
    regenerate_schedule=False,
    random_seed=True)

for episode in range(EPISODES):
    if episode % SHOW_EVERY == 0:
        # This episode will be rendered
        render = True
        print("Show episode number: ", episode)
    else:
        render = False

    observation, info = sparse_env.reset(regenerate_rail=False,
                                         regenerate_schedule=False,
                                         random_seed=True)

    env_renderer = RenderTool(sparse_env, gl="PGL")
    env_renderer.set_new_rail()

    state = observation[0]["state"]
    # e.g. of observation {0: {"state": (12, 4), observations: [[1 0 0], [0 1 0]]}}
    obs = observation[0]["observations"]
    done = {0: False}
    cost = 0
    # cost of the path found (for plotting the results)

    while not done[0]:
        q_best = -np.inf
        # Search the Q tensor for the most promising action among the admissible
        # actions (this information is stored in the "observations").
        for single_obs in obs:
            index = np.argmax(single_obs)