def render_test(parameters, test_nr=0, nr_examples=5):
    for trial in range(nr_examples):
        # Reset the env
        print(
            'Showing Test {} Level {} with (x_dim,y_dim) = ({},{}) and {} Agents.'.
            format(test_nr, trial, parameters[0], parameters[1],
                   parameters[2]))
        file_name = "./Tests/{}/Level_{}.pkl".format(test_nr, trial)

        # The width and height given here are placeholders; rail_from_file
        # loads the actual grid from the pickled level.
        env = RailEnv(
            width=1,
            height=1,
            rail_generator=rail_from_file(file_name),
            obs_builder_object=TreeObsForRailEnv(max_depth=2),
            number_of_agents=1,
        )
        env_renderer = RenderTool(
            env,
            gl="PILSVG",
        )
        env_renderer.set_new_rail()

        env.reset(False, False)
        env_renderer.render_env(show=True, show_observations=False)

        time.sleep(0.1)
        env_renderer.close_window()
    return
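
# Usage sketch (assumption: pickled levels exist under ./Tests/<test_nr>/Level_<trial>.pkl and
# the flatland imports used above are in scope):
#
#     render_test(parameters=(25, 25, 5), test_nr=1, nr_examples=3)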
Example #2
def main(args):
    try:
        opts, args = getopt.getopt(args, "", ["sleep-for-animation=", ""])
    except getopt.GetoptError as err:
        print(str(err))  # will print something like "option -a not recognized"
        sys.exit(2)
    sleep_for_animation = True
    for o, a in opts:
        if o == "--sleep-for-animation":
            sleep_for_animation = str2bool(a)
        else:
            assert False, "unhandled option"

    env = RailEnv(width=7,
                  height=7,
                  rail_generator=complex_rail_generator(nr_start_goal=10,
                                                        nr_extra=1,
                                                        min_dist=5,
                                                        max_dist=99999,
                                                        seed=1),
                  schedule_generator=complex_schedule_generator(),
                  number_of_agents=1,
                  obs_builder_object=SingleAgentNavigationObs())

    obs, info = env.reset()
    env_renderer = RenderTool(env)
    env_renderer.render_env(show=True, frames=True, show_observations=True)
    for step in range(100):
        # SingleAgentNavigationObs returns a one-hot over [left, forward, right];
        # +1 maps the argmax to MOVE_LEFT / MOVE_FORWARD / MOVE_RIGHT.
        action = np.argmax(obs[0]) + 1
        obs, all_rewards, done, _ = env.step({0: action})
        print("Rewards: ", all_rewards, "  [done=", done, "]")
        env_renderer.render_env(show=True, frames=True, show_observations=True)
        if sleep_for_animation:
            time.sleep(0.1)
        if done["__all__"]:
            break
    env_renderer.close_window()
    action_probs = action_count / np.sum(action_count)
    action_count = [1] * action_size

    # Smoothed values for terminal display and for more stable hyper-parameter tuning
    smoothing = 0.99
    smoothed_normalized_score = smoothed_normalized_score * smoothing + normalized_score * (
        1.0 - smoothing)
    smoothed_completion = smoothed_completion * smoothing + completion * (
        1.0 - smoothing)

    # Save checkpoint at intervals and print progress
    if episode_idx % checkpoint_interval == 0:
        torch.save(policy.qnetwork_local,
                   './checkpoints/testmulti-' + str(episode_idx) + '.pth')
        if train_params.render:
            env_renderer.close_window()
    a.append(normalized_score)
    b.append(completion)
    print('\r🚂 Episode {}'
          '\t 🏆 Score: {:.3f}'
          ' Avg: {:.3f}'
          '\t 💯 Done: {:.2f}%'
          ' Avg: {:.2f}%'
          '\t 🎲 Epsilon: {:.2f} '
          '\t 🔀 Action Probs: {}'.format(episode_idx, normalized_score,
                                         smoothed_normalized_score,
                                         100 * completion,
                                         100 * smoothed_completion, eps_start,
                                         format_action_prob(action_probs)),
          end=" ")
Example #4
def train_agent(env_params, train_params):
    # Environment parameters
    n_agents = env_params.n_agents
    x_dim = env_params.x_dim
    y_dim = env_params.y_dim
    n_cities = env_params.n_cities
    max_rails_between_cities = env_params.max_rails_between_cities
    max_rails_in_city = env_params.max_rails_in_city
    seed = env_params.seed

    # Observation parameters
    observation_tree_depth = env_params.observation_tree_depth
    observation_radius = env_params.observation_radius
    observation_max_path_depth = env_params.observation_max_path_depth

    # Training parameters
    eps_start = train_params.eps_start
    eps_end = train_params.eps_end
    eps_decay = train_params.eps_decay
    n_episodes = train_params.n_episodes
    checkpoint_interval = train_params.checkpoint_interval
    n_eval_episodes = train_params.n_evaluation_episodes

    # Set the seeds
    random.seed(seed)
    np.random.seed(seed)

    # Break agents from time to time
    malfunction_parameters = MalfunctionParameters(
        malfunction_rate=1. / 10000,  # Rate of malfunctions
        min_duration=15,  # Minimal duration
        max_duration=50  # Max duration
    )

    # Observation builder
    predictor = ShortestPathPredictorForRailEnv(observation_max_path_depth)
    tree_observation = TreeObsForRailEnv(max_depth=observation_tree_depth,
                                         predictor=predictor)

    # Fraction of trains with each speed profile
    speed_profiles = {
        1.: 1.0,  # Fast passenger train
        1. / 2.: 0.0,  # Fast freight train
        1. / 3.: 0.0,  # Slow commuter train
        1. / 4.: 0.0  # Slow freight train
    }

    # Setup the environment
    env = RailEnv(
        width=x_dim,
        height=y_dim,
        rail_generator=sparse_rail_generator(
            max_num_cities=n_cities,
            grid_mode=False,
            max_rails_between_cities=max_rails_between_cities,
            max_rails_in_city=max_rails_in_city),
        schedule_generator=sparse_schedule_generator(speed_profiles),
        number_of_agents=n_agents,
        malfunction_generator_and_process_data=malfunction_from_params(
            malfunction_parameters),
        obs_builder_object=tree_observation,
        random_seed=seed)

    env.reset(regenerate_schedule=True, regenerate_rail=True)

    # Setup renderer
    if train_params.render:
        env_renderer = RenderTool(env, gl="PGL")

    # Calculate the state size given the depth of the tree observation and the number of features
    n_features_per_node = env.obs_builder.observation_dim
    n_nodes = 0
    for i in range(observation_tree_depth + 1):
        n_nodes += np.power(4, i)
    state_size = n_features_per_node * n_nodes
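    # For example, with the default tree depth of 2 and the 11 features per node reported by
    # TreeObsForRailEnv.observation_dim (a Flatland 2.x assumption): n_nodes = 1 + 4 + 16 = 21,
    # so state_size = 11 * 21 = 231.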

    # The action space of flatland is 5 discrete actions
    action_size = 5

    # Max number of steps per episode
    # This is the official formula used during evaluations
    # See details in flatland.envs.schedule_generators.sparse_schedule_generator
    max_steps = int(4 * 2 * (env.height + env.width + (n_agents / n_cities)))
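    # e.g. a 48x27 grid with 10 agents and 2 cities gives int(4 * 2 * (27 + 48 + 10 / 2)) = 640 steps.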

    action_count = [0] * action_size
    action_dict = dict()
    agent_obs = [None] * env.get_num_agents()
    agent_prev_obs = [None] * env.get_num_agents()
    agent_prev_action = [2] * env.get_num_agents()
    update_values = False
    smoothed_normalized_score = -1.0
    smoothed_eval_normalized_score = -1.0
    smoothed_completion = 0.0
    smoothed_eval_completion = 0.0

    # Double Dueling DQN policy
    policy = DDDQNPolicy(state_size, action_size, train_params)

    # TensorBoard writer
    writer = SummaryWriter()
    writer.add_hparams(vars(train_params), {})
    writer.add_hparams(vars(env_params), {})

    training_timer = Timer()
    training_timer.start()

    print(
        "\n🚉 Training {} trains on {}x{} grid for {} episodes, evaluating on {} episodes every {} episodes.\n"
        .format(env.get_num_agents(), x_dim, y_dim, n_episodes,
                n_eval_episodes, checkpoint_interval))

    for episode_idx in range(n_episodes + 1):
        # Timers
        step_timer = Timer()
        reset_timer = Timer()
        learn_timer = Timer()
        preproc_timer = Timer()

        # Reset environment
        reset_timer.start()
        obs, info = env.reset(regenerate_rail=True, regenerate_schedule=True)
        reset_timer.end()

        if train_params.render:
            env_renderer.set_new_rail()

        score = 0
        nb_steps = 0
        actions_taken = []

        # Build agent specific observations
        for agent in env.get_agent_handles():
            if obs[agent]:
                agent_obs[agent] = normalize_observation(
                    obs[agent],
                    observation_tree_depth,
                    observation_radius=observation_radius)
                agent_prev_obs[agent] = agent_obs[agent].copy()

        # Run episode
        for step in range(max_steps - 1):
            for agent in env.get_agent_handles():
                if info['action_required'][agent]:
                    # If an action is required, we want to store the obs at that step as well as the action
                    update_values = True
                    action = policy.act(agent_obs[agent], eps=eps_start)
                    action_count[action] += 1
                    actions_taken.append(action)
                else:
                    update_values = False
                    action = 0
                action_dict.update({agent: action})

            # Environment step
            step_timer.start()
            next_obs, all_rewards, done, info = env.step(action_dict)
            step_timer.end()

            if train_params.render and episode_idx % checkpoint_interval == 0:
                env_renderer.render_env(show=True,
                                        frames=False,
                                        show_observations=False,
                                        show_predictions=False)

            for agent in range(env.get_num_agents()):
                # Update replay buffer and train agent
                # Only update the values when we are done or when an action was taken and thus relevant information is present
                if update_values or done[agent]:
                    learn_timer.start()
                    policy.step(agent_prev_obs[agent],
                                agent_prev_action[agent], all_rewards[agent],
                                agent_obs[agent], done[agent])
                    learn_timer.end()

                    agent_prev_obs[agent] = agent_obs[agent].copy()
                    agent_prev_action[agent] = action_dict[agent]

                # Preprocess the new observations
                if next_obs[agent]:
                    preproc_timer.start()
                    agent_obs[agent] = normalize_observation(
                        next_obs[agent],
                        observation_tree_depth,
                        observation_radius=observation_radius)
                    preproc_timer.end()

                score += all_rewards[agent]

            nb_steps = step

            if done['__all__']:
                break

        # Epsilon decay
        eps_start = max(eps_end, eps_decay * eps_start)
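        # Illustration: with eps_start = 1.0 and eps_decay = 0.99, epsilon falls to ~0.1 after
        # about 230 episodes (0.99 ** 230 ≈ 0.1) and is clipped below at eps_end.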

        # Collect information about training
        tasks_finished = sum(done[idx] for idx in env.get_agent_handles())
        completion = tasks_finished / max(1, env.get_num_agents())
        normalized_score = score / (max_steps * env.get_num_agents())
        action_probs = action_count / np.sum(action_count)
        action_count = [1] * action_size

        # Smoothed values for terminal display and for more stable hyper-parameter tuning
        smoothing = 0.99
        smoothed_normalized_score = smoothed_normalized_score * smoothing + normalized_score * (
            1.0 - smoothing)
        smoothed_completion = smoothed_completion * smoothing + completion * (
            1.0 - smoothing)

        # Save checkpoint at intervals and print progress
        if episode_idx % checkpoint_interval == 0:
            torch.save(
                policy.qnetwork_local,
                './checkpoints/origin_multi-' + str(episode_idx) + '.pth')
            if train_params.render:
                env_renderer.close_window()

        print('\r🚂 Episode {}'
              '\t 🏆 Score: {:.3f}'
              ' Avg: {:.3f}'
              '\t 💯 Done: {:.2f}%'
              ' Avg: {:.2f}%'
              '\t 🎲 Epsilon: {:.2f} '
              '\t 🔀 Action Probs: {}'.format(episode_idx, normalized_score,
                                             smoothed_normalized_score,
                                             100 * completion,
                                             100 * smoothed_completion,
                                             eps_start,
                                             format_action_prob(action_probs)),
              end=" ")

        # Evaluate policy
        if episode_idx % train_params.checkpoint_interval == 0:
            scores, completions, nb_steps_eval = eval_policy(
                env, policy, n_eval_episodes, max_steps)
            writer.add_scalar("evaluation/scores_min", np.min(scores),
                              episode_idx)
            writer.add_scalar("evaluation/scores_max", np.max(scores),
                              episode_idx)
            writer.add_scalar("evaluation/scores_mean", np.mean(scores),
                              episode_idx)
            writer.add_scalar("evaluation/scores_std", np.std(scores),
                              episode_idx)
            writer.add_histogram("evaluation/scores", np.array(scores),
                                 episode_idx)
            writer.add_scalar("evaluation/completions_min",
                              np.min(completions), episode_idx)
            writer.add_scalar("evaluation/completions_max",
                              np.max(completions), episode_idx)
            writer.add_scalar("evaluation/completions_mean",
                              np.mean(completions), episode_idx)
            writer.add_scalar("evaluation/completions_std",
                              np.std(completions), episode_idx)
            writer.add_histogram("evaluation/completions",
                                 np.array(completions), episode_idx)
            writer.add_scalar("evaluation/nb_steps_min", np.min(nb_steps_eval),
                              episode_idx)
            writer.add_scalar("evaluation/nb_steps_max", np.max(nb_steps_eval),
                              episode_idx)
            writer.add_scalar("evaluation/nb_steps_mean",
                              np.mean(nb_steps_eval), episode_idx)
            writer.add_scalar("evaluation/nb_steps_std", np.std(nb_steps_eval),
                              episode_idx)
            writer.add_histogram("evaluation/nb_steps",
                                 np.array(nb_steps_eval), episode_idx)

            smoothing = 0.9
            smoothed_eval_normalized_score = smoothed_eval_normalized_score * smoothing + np.mean(
                scores) * (1.0 - smoothing)
            smoothed_eval_completion = smoothed_eval_completion * smoothing + np.mean(
                completions) * (1.0 - smoothing)
            writer.add_scalar("evaluation/smoothed_score",
                              smoothed_eval_normalized_score, episode_idx)
            writer.add_scalar("evaluation/smoothed_completion",
                              smoothed_eval_completion, episode_idx)

        # Save logs to tensorboard
        writer.add_scalar("training/score", normalized_score, episode_idx)
        writer.add_scalar("training/smoothed_score", smoothed_normalized_score,
                          episode_idx)
        writer.add_scalar("training/completion", np.mean(completion),
                          episode_idx)
        writer.add_scalar("training/smoothed_completion",
                          np.mean(smoothed_completion), episode_idx)
        writer.add_scalar("training/nb_steps", nb_steps, episode_idx)
        writer.add_histogram("actions/distribution", np.array(actions_taken),
                             episode_idx)
        writer.add_scalar("actions/nothing",
                          action_probs[RailEnvActions.DO_NOTHING], episode_idx)
        writer.add_scalar("actions/left",
                          action_probs[RailEnvActions.MOVE_LEFT], episode_idx)
        writer.add_scalar("actions/forward",
                          action_probs[RailEnvActions.MOVE_FORWARD],
                          episode_idx)
        writer.add_scalar("actions/right",
                          action_probs[RailEnvActions.MOVE_RIGHT], episode_idx)
        writer.add_scalar("actions/stop",
                          action_probs[RailEnvActions.STOP_MOVING],
                          episode_idx)
        writer.add_scalar("training/epsilon", eps_start, episode_idx)
        writer.add_scalar("training/buffer_size", len(policy.memory),
                          episode_idx)
        writer.add_scalar("training/loss", policy.loss, episode_idx)
        writer.add_scalar("timer/reset", reset_timer.get(), episode_idx)
        writer.add_scalar("timer/step", step_timer.get(), episode_idx)
        writer.add_scalar("timer/learn", learn_timer.get(), episode_idx)
        writer.add_scalar("timer/preproc", preproc_timer.get(), episode_idx)
        writer.add_scalar("timer/total", training_timer.get_current(),
                          episode_idx)
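
# Interface assumed of DDDQNPolicy by the loop above (a summary of how it is used here, not its
# actual definition): act(obs, eps) -> action for epsilon-greedy selection,
# step(prev_obs, prev_action, reward, obs, done) to store a transition and learn, plus
# .memory, .loss and .qnetwork_local for logging and checkpointing.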
Example #5
def train_agent(train_params, train_env_params, eval_env_params, obs_params):
    # Environment parameters
    n_agents = train_env_params.n_agents
    x_dim = train_env_params.x_dim
    y_dim = train_env_params.y_dim
    n_cities = train_env_params.n_cities
    max_rails_between_cities = train_env_params.max_rails_between_cities
    max_rails_in_city = train_env_params.max_rails_in_city
    seed = train_env_params.seed

    # Unique ID for this training
    now = datetime.now()
    training_id = now.strftime('%y%m%d%H%M%S')

    # Observation parameters
    observation_tree_depth = obs_params.observation_tree_depth
    observation_radius = obs_params.observation_radius
    observation_max_path_depth = obs_params.observation_max_path_depth

    # Training parameters
    eps_start = train_params.eps_start
    eps_end = train_params.eps_end
    eps_decay = train_params.eps_decay
    n_episodes = train_params.n_episodes
    checkpoint_interval = train_params.checkpoint_interval
    n_eval_episodes = train_params.n_evaluation_episodes
    restore_replay_buffer = train_params.restore_replay_buffer
    save_replay_buffer = train_params.save_replay_buffer

    # Set the seeds
    random.seed(seed)
    np.random.seed(seed)

    # Observation builder
    predictor = ShortestPathPredictorForRailEnv(observation_max_path_depth)
    tree_observation = TreeObsForRailEnv(max_depth=observation_tree_depth,
                                         predictor=predictor)

    # Setup the environments
    train_env = create_rail_env(train_env_params, tree_observation)
    train_env.reset(regenerate_schedule=True, regenerate_rail=True)
    eval_env = create_rail_env(eval_env_params, tree_observation)
    eval_env.reset(regenerate_schedule=True, regenerate_rail=True)

    # Setup renderer
    if train_params.render:
        env_renderer = RenderTool(train_env, gl="PGL")

    # Calculate the state size given the depth of the tree observation and the number of features
    n_features_per_node = train_env.obs_builder.observation_dim
    n_nodes = sum([np.power(4, i) for i in range(observation_tree_depth + 1)])
    state_size = n_features_per_node * n_nodes

    # The action space of flatland is 5 discrete actions
    action_size = 5

    # Max number of steps per episode
    # This is the official formula used during evaluations
    # See details in flatland.envs.schedule_generators.sparse_schedule_generator
    # max_steps = int(4 * 2 * (env.height + env.width + (n_agents / n_cities)))
    max_steps = train_env._max_episode_steps

    action_count = [0] * action_size
    action_dict = dict()
    agent_obs = [None] * n_agents
    agent_prev_obs = [None] * n_agents
    agent_prev_action = [2] * n_agents
    update_values = [False] * n_agents

    # Smoothed values used as target for hyperparameter tuning
    smoothed_normalized_score = -1.0
    smoothed_eval_normalized_score = -1.0
    smoothed_completion = 0.0
    smoothed_eval_completion = 0.0

    # Double Dueling DQN policy
    policy = DDDQNPolicy(state_size, action_size, train_params)

    # Loads existing replay buffer
    if restore_replay_buffer:
        try:
            policy.load_replay_buffer(restore_replay_buffer)
            policy.test()
        except RuntimeError as e:
            print(
                "\n🛑 Could't load replay buffer, were the experiences generated using the same tree depth?"
            )
            print(e)
            exit(1)

    print("\n💾 Replay buffer status: {}/{} experiences".format(
        len(policy.memory.memory), train_params.buffer_size))

    hdd = psutil.disk_usage('/')
    if save_replay_buffer and (hdd.free / (2**30)) < 500.0:
        print(
            "⚠️  Careful! Saving replay buffers will quickly consume a lot of disk space. You have {:.2f}gb left."
            .format(hdd.free / (2**30)))

    # TensorBoard writer
    writer = SummaryWriter()
    writer.add_hparams(vars(train_params), {})
    writer.add_hparams(vars(train_env_params), {})
    writer.add_hparams(vars(obs_params), {})

    training_timer = Timer()
    training_timer.start()

    print(
        "\n🚉 Training {} trains on {}x{} grid for {} episodes, evaluating on {} episodes every {} episodes. Training id '{}'.\n"
        .format(train_env.get_num_agents(), x_dim, y_dim, n_episodes,
                n_eval_episodes, checkpoint_interval, training_id))

    for episode_idx in range(n_episodes + 1):
        step_timer = Timer()
        reset_timer = Timer()
        learn_timer = Timer()
        preproc_timer = Timer()
        inference_timer = Timer()

        # Reset environment
        reset_timer.start()
        obs, info = train_env.reset(regenerate_rail=True,
                                    regenerate_schedule=True)
        reset_timer.end()

        if train_params.render:
            env_renderer.set_new_rail()

        score = 0
        nb_steps = 0
        actions_taken = []

        # Build initial agent-specific observations
        for agent in train_env.get_agent_handles():
            if obs[agent]:
                agent_obs[agent] = normalize_observation(
                    obs[agent],
                    observation_tree_depth,
                    observation_radius=observation_radius)
                agent_prev_obs[agent] = agent_obs[agent].copy()

        # Run episode
        for step in range(max_steps - 1):
            inference_timer.start()
            for agent in train_env.get_agent_handles():
                if info['action_required'][agent]:
                    update_values[agent] = True
                    action = policy.act(agent_obs[agent], eps=eps_start)

                    action_count[action] += 1
                    actions_taken.append(action)
                else:
                    # An action is not required if the train hasn't joined the railway network,
                    # if it has already reached its target, or if it is currently malfunctioning.
                    update_values[agent] = False
                    action = 0
                action_dict.update({agent: action})
            inference_timer.end()

            # Environment step
            step_timer.start()
            next_obs, all_rewards, done, info = train_env.step(action_dict)
            step_timer.end()

            # Render an episode at some interval
            if train_params.render and episode_idx % checkpoint_interval == 0:
                env_renderer.render_env(show=True,
                                        frames=False,
                                        show_observations=False,
                                        show_predictions=False)

            # Update replay buffer and train agent
            for agent in train_env.get_agent_handles():
                if update_values[agent] or done['__all__']:
                    # Only learn from timesteps where something happened
                    learn_timer.start()
                    policy.step(agent_prev_obs[agent],
                                agent_prev_action[agent], all_rewards[agent],
                                agent_obs[agent], done[agent])
                    learn_timer.end()

                    agent_prev_obs[agent] = agent_obs[agent].copy()
                    agent_prev_action[agent] = action_dict[agent]

                # Preprocess the new observations
                if next_obs[agent]:
                    preproc_timer.start()
                    agent_obs[agent] = normalize_observation(
                        next_obs[agent],
                        observation_tree_depth,
                        observation_radius=observation_radius)
                    preproc_timer.end()

                score += all_rewards[agent]

            nb_steps = step

            if done['__all__']:
                break

        # Epsilon decay
        eps_start = max(eps_end, eps_decay * eps_start)

        # Collect information about training
        tasks_finished = sum(done[idx]
                             for idx in train_env.get_agent_handles())
        completion = tasks_finished / max(1, train_env.get_num_agents())
        normalized_score = score / (max_steps * train_env.get_num_agents())
        action_probs = action_count / np.sum(action_count)
        action_count = [1] * action_size

        smoothing = 0.99
        smoothed_normalized_score = smoothed_normalized_score * smoothing + normalized_score * (
            1.0 - smoothing)
        smoothed_completion = smoothed_completion * smoothing + completion * (
            1.0 - smoothing)

        # Save checkpoint at intervals and print progress
        if episode_idx % checkpoint_interval == 0:
            torch.save(
                policy.qnetwork_local, './checkpoints/' + training_id + '-' +
                str(episode_idx) + '.pth')

            if save_replay_buffer:
                policy.save_replay_buffer('./replay_buffers/' + training_id +
                                          '-' + str(episode_idx) + '.pkl')

            if train_params.render:
                env_renderer.close_window()

        print('\r🚂 Episode {}'
              '\t 🏆 Score: {:.3f}'
              ' Avg: {:.3f}'
              '\t 💯 Done: {:.2f}%'
              ' Avg: {:.2f}%'
              '\t 🎲 Epsilon: {:.3f} '
              '\t 🔀 Action Probs: {}'.format(episode_idx, normalized_score,
                                             smoothed_normalized_score,
                                             100 * completion,
                                             100 * smoothed_completion,
                                             eps_start,
                                             format_action_prob(action_probs)),
              end=" ")

        # Evaluate policy and log results at some interval
        if episode_idx % checkpoint_interval == 0 and n_eval_episodes > 0:
            scores, completions, nb_steps_eval = eval_policy(
                eval_env, policy, train_params, obs_params)

            writer.add_scalar("evaluation/scores_min", np.min(scores),
                              episode_idx)
            writer.add_scalar("evaluation/scores_max", np.max(scores),
                              episode_idx)
            writer.add_scalar("evaluation/scores_mean", np.mean(scores),
                              episode_idx)
            writer.add_scalar("evaluation/scores_std", np.std(scores),
                              episode_idx)
            writer.add_histogram("evaluation/scores", np.array(scores),
                                 episode_idx)
            writer.add_scalar("evaluation/completions_min",
                              np.min(completions), episode_idx)
            writer.add_scalar("evaluation/completions_max",
                              np.max(completions), episode_idx)
            writer.add_scalar("evaluation/completions_mean",
                              np.mean(completions), episode_idx)
            writer.add_scalar("evaluation/completions_std",
                              np.std(completions), episode_idx)
            writer.add_histogram("evaluation/completions",
                                 np.array(completions), episode_idx)
            writer.add_scalar("evaluation/nb_steps_min", np.min(nb_steps_eval),
                              episode_idx)
            writer.add_scalar("evaluation/nb_steps_max", np.max(nb_steps_eval),
                              episode_idx)
            writer.add_scalar("evaluation/nb_steps_mean",
                              np.mean(nb_steps_eval), episode_idx)
            writer.add_scalar("evaluation/nb_steps_std", np.std(nb_steps_eval),
                              episode_idx)
            writer.add_histogram("evaluation/nb_steps",
                                 np.array(nb_steps_eval), episode_idx)

            smoothing = 0.9
            smoothed_eval_normalized_score = smoothed_eval_normalized_score * smoothing + np.mean(
                scores) * (1.0 - smoothing)
            smoothed_eval_completion = smoothed_eval_completion * smoothing + np.mean(
                completions) * (1.0 - smoothing)
            writer.add_scalar("evaluation/smoothed_score",
                              smoothed_eval_normalized_score, episode_idx)
            writer.add_scalar("evaluation/smoothed_completion",
                              smoothed_eval_completion, episode_idx)

        # Save logs to tensorboard
        writer.add_scalar("training/score", normalized_score, episode_idx)
        writer.add_scalar("training/smoothed_score", smoothed_normalized_score,
                          episode_idx)
        writer.add_scalar("training/completion", np.mean(completion),
                          episode_idx)
        writer.add_scalar("training/smoothed_completion",
                          np.mean(smoothed_completion), episode_idx)
        writer.add_scalar("training/nb_steps", nb_steps, episode_idx)
        writer.add_histogram("actions/distribution", np.array(actions_taken),
                             episode_idx)
        writer.add_scalar("actions/nothing",
                          action_probs[RailEnvActions.DO_NOTHING], episode_idx)
        writer.add_scalar("actions/left",
                          action_probs[RailEnvActions.MOVE_LEFT], episode_idx)
        writer.add_scalar("actions/forward",
                          action_probs[RailEnvActions.MOVE_FORWARD],
                          episode_idx)
        writer.add_scalar("actions/right",
                          action_probs[RailEnvActions.MOVE_RIGHT], episode_idx)
        writer.add_scalar("actions/stop",
                          action_probs[RailEnvActions.STOP_MOVING],
                          episode_idx)
        writer.add_scalar("training/epsilon", eps_start, episode_idx)
        writer.add_scalar("training/buffer_size", len(policy.memory),
                          episode_idx)
        writer.add_scalar("training/loss", policy.loss, episode_idx)
        writer.add_scalar("timer/reset", reset_timer.get(), episode_idx)
        writer.add_scalar("timer/step", step_timer.get(), episode_idx)
        writer.add_scalar("timer/learn", learn_timer.get(), episode_idx)
        writer.add_scalar("timer/preproc", preproc_timer.get(), episode_idx)
        writer.add_scalar("timer/total", training_timer.get_current(),
                          episode_idx)
Example #6
def run_episode(kwargs) -> [Trajectory]:
    """
    Runs a single episode and collects the trajectories of each agent
    """
    total_controller_time = 0
    env_dict: Callable = kwargs.get("env_dict")
    obs_builder = kwargs.get("obs_builder")
    controller_creator: Callable = kwargs.get("controller_creator")
    episode_id: int = kwargs.get("episode_id")
    max_episode_length: int = kwargs.get("max_episode_length", 1000)
    render: bool = kwargs.get("render", False)
    # Create and Start Environment
    _env = load_env(env_dict, obs_builder_object=obs_builder)
    obs, info = _env.reset(
        regenerate_rail=False,
        regenerate_schedule=True,
    )
    score = 0
    _trajectories = [Trajectory() for _ in _env.get_agent_handles()]

    # Create and Start Controller
    controller: AbstractController = controller_creator()
    start = time.time()
    controller.start_of_round(obs=obs, env=_env)
    total_controller_time += time.time() - start

    if render:
        env_renderer = RenderTool(_env)
        env_renderer.reset()

    for step in range(max_episode_length):
        start = time.time()
        action_dict, processed_obs = controller.act(observation=obs)
        total_controller_time += time.time() - start
        next_obs, all_rewards, done, info = _env.step(action_dict)

        if render:
            env_renderer.render_env(show=True,
                                    show_observations=True,
                                    show_predictions=False)

        # Save actions and rewards for each agent
        for agent_handle in _env.get_agent_handles():
            _trajectories[agent_handle].add_row(
                state=processed_obs[agent_handle],
                action=action_dict[agent_handle],
                reward=all_rewards[agent_handle],
                done=done[agent_handle])

        # all_rewards is keyed by agent handle, so sum the reward values rather than the keys
        score += sum(all_rewards.values())

        obs = next_obs.copy()
        if done['__all__']:
            break

    if render:
        env_renderer.close_window()
    # print(f"\nController took a total time of: {total_controller_time} seconds", flush=True)
    return _trajectories
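
# run_episode above relies on two project-specific classes that are not shown on this page.
# A minimal sketch of the interface it assumes (names and behaviour are assumptions, not the
# original implementation):
class Trajectory:
    def __init__(self):
        self.rows = []

    def add_row(self, state, action, reward, done):
        # Store one (state, action, reward, done) transition for a single agent.
        self.rows.append((state, action, reward, done))


class AbstractController:
    def start_of_round(self, obs, env):
        # Called once per episode before stepping; may precompute plans from the initial obs.
        raise NotImplementedError

    def act(self, observation):
        # Return (action_dict, processed_obs): one action per agent handle, plus the
        # per-agent states that run_episode stores in the trajectories.
        raise NotImplementedError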
Example #7
def main(args):
    try:
        opts, args = getopt.getopt(args, "", ["sleep-for-animation=", ""])
    except getopt.GetoptError as err:
        print(str(err))  # will print something like "option -a not recognized"
        sys.exit(2)
    sleep_for_animation = True
    for o, a in opts:
        if o == "--sleep-for-animation":
            sleep_for_animation = str2bool(a)
        else:
            assert False, "unhandled option"

    test_envs_root = "./railway"
    test_env_file_path = "testing_stuff.pkl"

    test_env_file_path = os.path.join(
        test_envs_root,
        test_env_file_path
    )

    x_dim = 7
    y_dim = 7
    n_agents = 4

    stochastic_data = {'prop_malfunction': 0.05,  # Percentage of defective agents
                       'malfunction_rate': 100,  # Rate of malfunction occurence
                       'min_duration': 20,  # Minimal duration of malfunction
                       'max_duration': 50  # Max duration of malfunction
                       }

    # Different agent types (trains) with different speeds.
    speed_ration_map = {1.: 0.25,  # Fast passenger train
                        1. / 2.: 0.25,  # Fast freight train
                        1. / 3.: 0.25,  # Slow commuter train
                        1. / 4.: 0.25}  # Slow freight train

    # Alternative: load a prebuilt test environment from file.
    # env = RailEnv(width=1, height=1, rail_generator=rail_from_file(test_env_file_path),
    #               schedule_generator=schedule_from_file(test_env_file_path),
    #               # malfunction_generator_and_process_data=malfunction_from_file(test_env_file_path),
    #               obs_builder_object=MultipleAgentNavigationObs(max_depth=2, predictor=ShortestPathPredictorForRailEnv(30)))
    # # env.number_of_agents = n_agents
    # n_agents = env.number_of_agents

    # Alternative: a plain 7x7 environment with default schedules.
    # env = RailEnv(width=7, height=7,
    #               rail_generator=complex_rail_generator(nr_start_goal=10, nr_extra=1, min_dist=5, max_dist=99999,
    #                                                     seed=1), schedule_generator=complex_schedule_generator(),
    #               number_of_agents=n_agents,
    #               obs_builder_object=MultipleAgentNavigationObs(max_depth=2, predictor=ShortestPathPredictorForRailEnv(30)))

    env = RailEnv(width=x_dim,
                  height=y_dim,
                  rail_generator=complex_rail_generator(nr_start_goal=10, nr_extra=1, min_dist=6, max_dist=99999, seed=1),
                  # Alternative: sparse_rail_generator(max_num_cities=3,  # Number of cities in map (where train stations are)
                  #                                    seed=1, grid_mode=False,
                  #                                    max_rails_between_cities=2, max_rails_in_city=3),
                  schedule_generator=complex_schedule_generator(speed_ration_map),
                  number_of_agents=n_agents,
                  malfunction_generator_and_process_data=malfunction_from_params(stochastic_data),
                  obs_builder_object=MultipleAgentNavigationObs(max_depth=2, predictor=ShortestPathPredictorForRailEnv(30)))

    max_steps = int(4 * 2 * (20 + env.height + env.width))
    obs, info = env.reset(regenerate_rail=True,
            regenerate_schedule=True,
            random_seed=random_seed)
    env_renderer = RenderTool(env, gl="PILSVG")
    env_renderer.render_env(show=True, frames=True, show_observations=True)

    # Reset score and done
    score = 0
    env_done = 0
    step = 0
    for step in range(max_steps):
        action_dict = {}
        for i in range(n_agents):
            if not obs:
                action_dict.update({i: 2})
            elif obs[i] is not None:
                # argmax over obs[i][1:4] selects one of actions 1..3 (MOVE_LEFT / MOVE_FORWARD / MOVE_RIGHT)
                action = np.argmax(obs[i][1:4]) + 1
                action_dict.update({i: action})

        obs, all_rewards, done, _ = env.step(action_dict)
        print("Rewards: ", all_rewards, "  [done=", done, "]")

        for a in range(env.get_num_agents()):
            score += all_rewards[a] / env.get_num_agents()

        env_renderer.render_env(show=True, frames=True, show_observations=True)
        if sleep_for_animation:
            time.sleep(0.5)
        if done["__all__"]:
            break

        # Collect information about training
        tasks_finished = 0
        for current_agent in env.agents:
            if current_agent.status == RailAgentStatus.DONE_REMOVED:
                tasks_finished += 1
        done_window = tasks_finished / max(1, env.get_num_agents())
        scores_window = score / max_steps
        print(
            '\rTraining {} Agents on ({},{}).\t Steps {}\t Average Score: {:.3f}\tDones: {:.2f}%\t'.format(
                env.get_num_agents(), x_dim, y_dim,
                step,
                np.mean(scores_window),
                100 * np.mean(done_window)), end=" ")

    tasks_finished = 0
    for current_agent in env.agents:
        if current_agent.status == RailAgentStatus.DONE_REMOVED:
            tasks_finished += 1
    done_window = tasks_finished / max(1, env.get_num_agents())
    scores_window = score / max_steps
    print(
        '\rTraining {} Agents on ({},{}).\t Total Steps {}\t Average Score: {:.3f}\tDones: {:.2f}%\t'.format(
            env.get_num_agents(), x_dim, y_dim,
            step,
            np.mean(scores_window),
            100 * np.mean(done_window)), end=" ")

    env_renderer.close_window()
Example #8
def main(args):
    try:
        opts, args = getopt.getopt(args, "", ["sleep-for-animation=", ""])
    except getopt.GetoptError as err:
        print(str(err))  # will print something like "option -a not recognized"
        sys.exit(2)
    sleep_for_animation = True
    for o, a in opts:
        if o == "--sleep-for-animation":
            sleep_for_animation = str2bool(a)
        else:
            assert False, "unhandled option"

    test_envs_root = f"./test-envs/Test_{test_env_no}"
    test_env_file_path = f"Level_{level_no}.pkl"

    test_env_file_path = os.path.join(
        test_envs_root,
        test_env_file_path
    )

    x_dim = 35
    y_dim = 35
    n_agents = 10

    stochastic_data = {'prop_malfunction': 0.05,  # Percentage of defective agents
                       'malfunction_rate': 100,  # Rate of malfunction occurence
                       'min_duration': 2,  # Minimal duration of malfunction
                       'max_duration': 5  # Max duration of malfunction
                       }

    # Different agent types (trains) with different speeds.
    speed_ration_map = {1.: 0.25,  # Fast passenger train
                        1. / 2.: 0.25,  # Fast freight train
                        1. / 3.: 0.25,  # Slow commuter train
                        1. / 4.: 0.25}  # Slow freight train

    env = RailEnv(width=x_dim,
                  height=y_dim,
                  #rail_generator=complex_rail_generator(nr_start_goal=10, nr_extra=1, min_dist=6, max_dist=99999,seed=1),
                  rail_generator=sparse_rail_generator(max_num_cities=3,
                                                       # Number of cities in map (where train stations are)
                                                       seed=1,  # Random seed
                                                       grid_mode=False,
                                                       max_rails_between_cities=2,
                                                       max_rails_in_city=3),
                  #schedule_generator=complex_schedule_generator(speed_ration_map),
                  schedule_generator=sparse_schedule_generator(speed_ration_map),
                  number_of_agents=n_agents,
                  malfunction_generator_and_process_data=malfunction_from_params(stochastic_data),
                  obs_builder_object=MultipleAgentNavigationObs(max_depth=2, predictor=ShortestPathPredictorForRailEnv(30)))


    # print(f"Testing Environment: {test_env_file_path} with seed: {random_seed}")
    # env = RailEnv(width=1, height=1, rail_generator=rail_from_file(test_env_file_path),
    #                    schedule_generator=schedule_from_file(test_env_file_path),
    #                    malfunction_generator_and_process_data=malfunction_from_file(test_env_file_path),
    #                    obs_builder_object=MultipleAgentNavigationObs(max_depth=2, predictor=ShortestPathPredictorForRailEnv(30)))

    obs, info = env.reset(regenerate_rail=True,
            regenerate_schedule=True,
            activate_agents=False,
            random_seed=random_seed)

    n_agents = env.get_num_agents()
    x_dim, y_dim = env.width,env.height

    max_steps = int(4 * 2 * (20 + env.height + env.width))

    env_renderer = RenderTool(env, gl="PILSVG")
    env_renderer.render_env(show=True, frames=True, show_observations=True)

    # Reset score and done
    score = 0
    env_done = 0
    step = 0
    for step in range(max_steps):

        for i in range(n_agents):
            if obs[i] is not None:
                observations, prediction_data, prediction_pos = obs[i]
                break

        action_dict = {}
        next_shortest_actions = 2*np.ones(n_agents)
        next_next_shortest_actions = 2*np.ones(n_agents)
        agent_conflicts = np.zeros((n_agents,n_agents))
        agent_conflicts_count = np.zeros((n_agents, n_agents))
        minDist = -1 *np.ones(n_agents)
        incDiff1 = -1 * np.ones(n_agents)
        incDiff2 = -1 * np.ones(n_agents)
        malfunc = np.zeros(n_agents)
        speed = np.ones(n_agents)
        pos_frac = np.ones(n_agents)
        agent_num_conflicts = []

        vals = []
        counts = []
        counter = np.zeros(n_agents)
        # Scan the 30-step shortest-path predictions for conflicts: whenever two or more agents
        # are predicted to occupy the same cell at the same step, record (per agent pair) how
        # often that happens and the earliest step at which it happens.
        for i in range(30):
            pos = prediction_pos[i]
            val, count = np.unique(pos, return_counts=True)
            if val[0] == -1:
                # Drop the -1 placeholder used for agents without a predicted position
                val = val[1:]
                count = count[1:]
            vals.append(val)
            counts.append(count)

            for j, curVal in enumerate(val):
                curCount = count[j]
                if curCount > 1:
                    # All agents predicted to be on cell curVal at step i, taken pairwise
                    idxs = np.argwhere(pos == curVal)
                    lsIdx = [int(x) for x in idxs]
                    combs = list(combinations(lsIdx, 2))
                    for k, comb in enumerate(combs):
                        counter[comb[0]] += 1
                        counter[comb[1]] += 1
                        agent_conflicts_count[comb[0], comb[1]] = (counter[comb[0]] + counter[comb[1]]) / 2
                        if agent_conflicts[comb[0], comb[1]] == 0:
                            agent_conflicts[comb[0], comb[1]] = i
                        else:
                            agent_conflicts[comb[0], comb[1]] = min(i, agent_conflicts[comb[0], comb[1]])

        for i in range(n_agents):
            agent_num_conflicts.append(sum(agent_conflicts[i,:]))
            if not obs or obs is None or obs[i] is None:
                action_dict.update({i: 2})
            elif obs[i][0] is not None:
                shortest_action = np.argmax(obs[i][0][1:4]) + 1
                next_shortest_action = np.argmax(obs[i][0][5:7]) + 1
                next_next_shortest_action = np.argmax(obs[i][0][8:10]) + 1
                next_shortest_actions[i] = next_shortest_action
                next_next_shortest_actions[i] = next_next_shortest_action
                malfunc[i] = obs[i][0][-3]
                speed[i] = obs[i][0][-2]
                pos_frac[i] = obs[i][0][-1]
                minDist[i] = obs[i][0][0]
                incDiff1[i] = obs[i][0][-5]
                incDiff2[i] = obs[i][0][-4]
                action_dict.update({i: shortest_action})
            else:
                action_dict.update({i: 2})
        mal_agents = (np.array(-1))
        for i in range(n_agents):
            if agent_num_conflicts[i] > 0:
                mal_agents = np.where(malfunc > 0)
                for i,mal_agent in enumerate(mal_agents[0]):
                    if mal_agent is None:
                        break
                    conflict_agents = np.where(agent_conflicts[:,int(mal_agent)]>0)

                    for j,cur_conflict_agent in enumerate(conflict_agents[0]):
                        cur_conflict_agent = int(cur_conflict_agent)
                        steps_conflict = agent_conflicts[cur_conflict_agent, mal_agent]
                        if steps_conflict <= 3:
                            if incDiff1[cur_conflict_agent] == -1:
                                if int(minDist[cur_conflict_agent]) >= 5:
                                    action_dict.update({cur_conflict_agent: 4})
                                elif agent_conflicts_count[cur_conflict_agent,mal_agent] > 1:
                                    action_dict.update({cur_conflict_agent: 4})
                            elif minDist[cur_conflict_agent] > incDiff1[cur_conflict_agent]:
                                action_dict.update({cur_conflict_agent: 4})
                            else:
                                action_dict.update({cur_conflict_agent: next_shortest_actions[cur_conflict_agent]})

        obs, all_rewards, done, _ = env.step(action_dict)

        print("Rewards: ", all_rewards, "  [done=", done, "]")

        for a in range(env.get_num_agents()):
            score += all_rewards[a] / env.get_num_agents()

        env_renderer.render_env(show=True, frames=True, show_observations=True)
        if sleep_for_animation:
            time.sleep(0.5)
        if done["__all__"]:
            break

        # Collect information about training
        tasks_finished = 0
        for current_agent in env.agents:
            if current_agent.status == RailAgentStatus.DONE_REMOVED:
                tasks_finished += 1
        done_window = tasks_finished / max(1, env.get_num_agents())
        scores_window = score / max_steps
        print(
            '\rTraining {} Agents on ({},{}).\t Steps {}\t Average Score: {:.3f}\tDones: {:.2f}%\t'.format(
                env.get_num_agents(), x_dim, y_dim,
                step,
                np.mean(scores_window),
                100 * np.mean(done_window)), end=" ")

    tasks_finished = 0
    for current_agent in env.agents:
        if current_agent.status == RailAgentStatus.DONE_REMOVED:
            tasks_finished += 1
    done_window = tasks_finished / max(1, env.get_num_agents())
    scores_window = score / max_steps
    print(
        '\rTraining {} Agents on ({},{}).\t Total Steps {}\t Average Score: {:.3f}\tDones: {:.2f}%\t'.format(
            env.get_num_agents(), x_dim, y_dim,
            step,
            np.mean(scores_window),
            100 * np.mean(done_window)), end=" ")

    env_renderer.close_window()
Example #9
def main(args):
    try:
        opts, args = getopt.getopt(args, "", ["sleep-for-animation=", ""])
    except getopt.GetoptError as err:
        print(str(err))  # will print something like "option -a not recognized"
        sys.exit(2)
    sleep_for_animation = True
    for o, a in opts:
        if o == "--sleep-for-animation":
            sleep_for_animation = str2bool(a)
        else:
            assert False, "unhandled option"

    test_envs_root = f"./test-envs/Test_{test_env_no}"
    test_env_file_path = f"Level_{level_no}.pkl"

    test_env_file_path = os.path.join(
        test_envs_root,
        test_env_file_path
    )
    print(f"Testing Environment: {test_env_file_path} with seed: {random_seed}")

    env = RailEnv(width=1, height=1, rail_generator=rail_from_file(test_env_file_path),
                       schedule_generator=schedule_from_file(test_env_file_path),
                       malfunction_generator_and_process_data=malfunction_from_file(test_env_file_path),
                       obs_builder_object=MultipleAgentNavigationObs(max_depth=2, predictor=ShortestPathPredictorForRailEnv(30)))

    max_steps = int(4 * 2 * (20 + env.height + env.width))

    obs, info = env.reset(regenerate_rail=True,
            regenerate_schedule=True,
            activate_agents=False,
            random_seed=random_seed)
    env_renderer = RenderTool(env, gl="PILSVG")
    env_renderer.render_env(show=True, frames=True, show_observations=True)
    n_agents = env.get_num_agents()
    x_dim, y_dim = env.width,env.height
    # Reset score and done
    score = 0
    env_done = 0
    step = 0
    for step in range(max_steps):
        action_dict = {}
        for i in range(n_agents):
            if not obs:
                action_dict.update({i: 2})
            elif obs[i] is not None:
                action = np.argmax(obs[i][1:4]) + 1
                action_dict.update({i: action})

        obs, all_rewards, done, _ = env.step(action_dict)
        print("Rewards: ", all_rewards, "  [done=", done, "]")

        for a in range(env.get_num_agents()):
            score += all_rewards[a] / env.get_num_agents()

        env_renderer.render_env(show=True, frames=True, show_observations=True)
        if sleep_for_animation:
            time.sleep(0.5)
        if done["__all__"]:
            break

        # Collect information about training
        tasks_finished = 0
        for current_agent in env.agents:
            if current_agent.status == RailAgentStatus.DONE_REMOVED:
                tasks_finished += 1
        done_window = tasks_finished / max(1, env.get_num_agents())
        scores_window = score / max_steps
        print(
            '\rTraining {} Agents on ({},{}).\t Steps {}\t Average Score: {:.3f}\tDones: {:.2f}%\t'.format(
                n_agents, x_dim, y_dim,
                step,
                np.mean(scores_window),
                100 * np.mean(done_window)), end=" ")

    tasks_finished = 0
    for current_agent in env.agents:
        if current_agent.status == RailAgentStatus.DONE_REMOVED:
            tasks_finished += 1
    done_window = tasks_finished / max(1, env.get_num_agents())
    scores_window = score / max_steps
    print(
        '\rTraining {} Agents on ({},{}).\t Total Steps {}\t Average Score: {:.3f}\tDones: {:.2f}%\t'.format(
            n_agents, x_dim, y_dim,
            step,
            np.mean(scores_window),
            100 * np.mean(done_window)), end=" ")

    env_renderer.close_window()
Example #10
def evaluate_remote():
    remote_client = FlatlandRemoteClient()
    my_observation_builder = SimpleObservation(max_depth=3, neighbours_depth=3,
            timetable=Judge(LinearOnAgentNumberSizeGenerator(0.03, 5), lr=0,
                    batch_size=0, optimization_epochs=0, device=torch.device("cpu")),
            deadlock_checker=DeadlockChecker(), greedy_checker=GreedyChecker(), parallel=False, eval=True)

    params = torch.load("generated/params.torch")
    params.neighbours_depth=my_observation_builder.neighbours_depth
    controller = PPOController(params, torch.device("cpu"))
    controller.load_controller("generated/controller.torch")
    my_observation_builder.timetable.load_judge("generated/judge.torch")

    render = False

    sum_reward, sum_percent_done = 0., 0.
    for evaluation_number in itertools.count():
        time_start = time.time()
        observation, info = remote_client.env_create(obs_builder_object=my_observation_builder)
        if not observation:
            break

        local_env = FlatlandWrapper(remote_client.env, FakeRewardShaper())
        local_env.n_agents = len(local_env.agents)
        log().check_time()
        if render:
            env_renderer = RenderTool(
                local_env.env,
                agent_render_variant=AgentRenderVariant.ONE_STEP_BEHIND,
                show_debug=True,
                screen_height=600,
                screen_width=800
            )

        env_creation_time = time.time() - time_start

        print("Evaluation Number : {}".format(evaluation_number))

        time_taken_by_controller = []
        time_taken_per_step = []
        steps = 0
        done = defaultdict(lambda: False)
        while True:
            try:
                if render:
                    env_renderer.render_env(show=True, show_observations=False, show_predictions=False)
                time_start = time.time()
                action_dict = dict()
                handles_to_ask = list()
                observation = {k: torch.tensor(v, dtype=torch.float) for k, v in observation.items() if v is not None}
                for i in range(local_env.n_agents):
                    if not done[i]:
                        if local_env.obs_builder.greedy_checker.greedy_position(i):
                            action_dict[i] = 0
                        elif i in observation:
                            handles_to_ask.append(i)

                for handle in handles_to_ask:
                    for opp_handle in local_env.obs_builder.encountered[handle]:
                        if opp_handle != -1 and opp_handle not in observation:
                            observation[opp_handle] = torch.tensor(local_env.obs_builder._get_internal(opp_handle), dtype=torch.float)

                time_taken_per_step.append(time.time() - time_start)
                time_start = time.time()

                controller_actions = controller.fast_select_actions(handles_to_ask, observation,
                        local_env.obs_builder.encountered, train=True)
                action_dict.update(controller_actions)
                action_dict = {k: local_env.transform_action(k, v) for k, v in action_dict.items()}
                action_dict = {handle: action for handle, action in action_dict.items() if action != -1}

                time_taken = time.time() - time_start
                time_taken_by_controller.append(time_taken)

                time_start = time.time()
                observation, all_rewards, done, info = remote_client.env_step(action_dict)
                num_done = sum([1 for agent in local_env.agents if agent.status == RailAgentStatus.DONE_REMOVED])
                num_started = sum([1 for handle in range(len(local_env.agents)) if local_env.obs_builder.timetable.is_ready(handle)])

                finished_handles = [handle for handle in range(len(local_env.agents))
                        if local_env.obs_builder.timetable.ready_to_depart[handle] == 2]
                reward = torch.sum(local_env._max_episode_steps - local_env.obs_builder.timetable.end_time[finished_handles])
                reward /= len(local_env.agents) * local_env._max_episode_steps
                percent_done = float(num_done) / len(local_env.agents)
                deadlocked = int(sum(local_env.obs_builder.deadlock_checker._is_deadlocked) + 0.5)

                steps += 1
                time_taken = time.time() - time_start
                time_taken_per_step.append(time_taken)

                if done['__all__']:
                    print("Done agents {}/{}".format(num_done, len(local_env.agents)))
                    print("Started agents {}/{}".format(num_started, len(local_env.agents)))
                    print("Deadlocked agents {}/{}".format(deadlocked, len(local_env.agents)))
                    print("Reward: {}        Percent done: {}".format(reward, percent_done))
                    sum_reward += reward
                    sum_percent_done += percent_done
                    print("Total reward: {}        Avg percent done: {}".format(sum_reward, sum_percent_done / (evaluation_number + 1)))
                    if render:
                        env_renderer.close_window()
                    break
            except TimeoutException as err:
                print("Timeout! Will skip this episode and go to the next.", err)
                break

        
        np_time_taken_by_controller = np.array(time_taken_by_controller)
        np_time_taken_per_step = np.array(time_taken_per_step)
        print("="*100)
        print("="*100)
        print("Evaluation Number : ", evaluation_number)
        print("Current Env Path : ", remote_client.current_env_path)
        print("Env Creation Time : ", env_creation_time)
        print("Number of Steps : {}/{}".format(steps, local_env._max_episode_steps))
        print("Mean/Std/Sum of Time taken by Controller : ", np_time_taken_by_controller.mean(), np_time_taken_by_controller.std(), np_time_taken_by_controller.sum())
        print("Mean/Std/Sum of Time per Step : ", np_time_taken_per_step.mean(), np_time_taken_per_step.std(), np_time_taken_per_step.sum())
        log().print_time_metrics()
        log().zero_time_metrics()
        print("="*100)
        print("\n\n")

    print("Evaluation of all environments complete...")
    print(remote_client.submit())
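# A minimal sketch of the remote-evaluation skeleton that the loop above sits
# inside, assuming the flatland.evaluators.client.FlatlandRemoteClient API;
# the empty action_dict stands in for a real controller.
def minimal_remote_evaluation():
    from flatland.evaluators.client import FlatlandRemoteClient
    from flatland.envs.observations import TreeObsForRailEnv

    remote_client = FlatlandRemoteClient()
    while True:
        observation, info = remote_client.env_create(
            obs_builder_object=TreeObsForRailEnv(max_depth=2))
        if not observation:
            break  # all evaluation environments have been served
        while True:
            action_dict = {}  # a real controller would fill this per agent
            observation, all_rewards, done, info = remote_client.env_step(action_dict)
            if done['__all__']:
                break
    print(remote_client.submit())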
class SingleAgentEnvironment(Env):
    """
    Gym-style wrapper around a single-agent Flatland environment.

    Args:
        flatland_env: The Flatland environment
        renderer: The renderer
    """
    flatland_env = None
    renderer = None

    def __init__(self, flatland_env, renderer=None):
        self.flatland_env = flatland_env
        self.renderer = renderer

        self.reward_range = (-1, 1)
        self.action_space = Discrete(5)
        self.observation_space = Discrete(5)

    def step(self, action_dict):
        """
        Execute an action.

        Args:
            action_dict: the dictionary agent -> action to perform
        Returns:
            new_observation: The new observation for each agent
            reward: The reward for each agent
            done: True if an agent has concluded
            info: Some info for each agent
        """
        return self.flatland_env.step(action_dict)

    def reset(self):
        """
        Reset the environment.

        Returns:
            observation: The new observation
        """
        return self.flatland_env.reset(regenerate_rail=False,
                                       regenerate_schedule=False,
                                       random_seed=True)

    def render(self, mode='human'):
        """
        Render the environment and return the rendered frame.
        """
        # TODO: Merge both strategies (Jupyter vs .py)
        # In .py files:
        # self.renderer.render_env(show=False, show_observations=False, show_predictions=False)
        # In Jupyter notebooks:
        env_renderer = RenderTool(self.flatland_env, gl="PILSVG")
        env_renderer.render_env()

        image = env_renderer.get_image()
        pil_image = Image.fromarray(image)
        display(pil_image)
        return image

    def reset_renderer(self):
        """
        Re-create the renderer of the environment.
        """
        self.renderer = RenderTool(self.flatland_env,
                                   gl="PILSVG",
                                   agent_render_variant=AgentRenderVariant.AGENT_SHOWS_OPTIONS_AND_BOX,
                                   show_debug=True,
                                   screen_height=700,
                                   screen_width=1300)

    def close_window(self):
        self.renderer.close_window()
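# A minimal usage sketch for SingleAgentEnvironment, assuming `rail_env` is an
# already constructed single-agent RailEnv passed in by the caller.
def run_single_agent_episode(rail_env, max_steps=50):
    wrapped = SingleAgentEnvironment(rail_env)
    obs, info = wrapped.reset()
    for _ in range(max_steps):
        # Action 2 is MOVE_FORWARD in Flatland's RailEnvActions.
        obs, rewards, done, info = wrapped.step({0: 2})
        if done["__all__"]:
            break
    return rewards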
class FlatlandRenderWrapper(RailEnv, gym.Env):

    # reward_range = (-float('inf'), float('inf'))
    # spec = None

    # # Set these in ALL subclasses
    # observation_space = None

    def __init__(self, use_renderer=False, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.use_renderer = use_renderer
        self.renderer = None
        self.metadata = {
            'render.modes': ['human', 'rgb_array'],
            'video.frames_per_second': 10,
            'semantics.autoreset': True
        }
        if self.use_renderer:
            self.initialize_renderer()

    def reset(self, *args, **kwargs):
        if self.use_renderer:
            if self.renderer:  # TODO: RLlib errors out if the renderer is None
                self.renderer.reset()
        return super().reset(*args, **kwargs)

    def render(self, mode='human'):
        """
        This methods provides the option to render the
        environment's behavior to a window which should be
        readable to the human eye if mode is set to 'human'.
        """
        if not self.use_renderer:
            return

        if not self.renderer:
            self.initialize_renderer(mode=mode)

        return self.update_renderer(mode=mode)

    def initialize_renderer(self, mode="human"):
        # Initiate the renderer
        from flatland.utils.rendertools import RenderTool, AgentRenderVariant
        self.renderer = RenderTool(
            self,
            gl="PGL",  # gl="TKPILSVG",
            agent_render_variant=AgentRenderVariant.ONE_STEP_BEHIND,
            show_debug=False,
            screen_height=600,  # Adjust these parameters to fit your resolution
            screen_width=800)  # Adjust these parameters to fit your resolution
        self.renderer.show = False

    def update_renderer(self, mode='human'):
        image = self.renderer.render_env(show=False,
                                         show_observations=False,
                                         show_predictions=False,
                                         return_image=True)
        return image[:, :, :3]

    def set_renderer(self, renderer):
        # `renderer` is a render mode string (e.g. 'human' or 'rgb_array');
        # a falsy value disables rendering.
        self.use_renderer = renderer
        if self.use_renderer:
            self.initialize_renderer(mode=self.use_renderer)

    def close(self):
        super().close()
        if self.renderer:
            try:
                if self.renderer.show:
                    self.renderer.close_window()
            except Exception as e:
                # RLlib can skip the final step when a stopping criterion is
                # met, so done never becomes True and the env is not closed
                # here. It is only closed when RLlib exits, by which time the
                # window no longer exists, hence this exception.
                print("Could not close window due to:", e)
            self.renderer = None
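# A minimal usage sketch for FlatlandRenderWrapper, assuming the standard
# sparse generators from flatland-rl; the generator parameters are arbitrary
# placeholders.
def make_render_wrapper_env():
    from flatland.envs.observations import GlobalObsForRailEnv
    from flatland.envs.rail_generators import sparse_rail_generator
    from flatland.envs.schedule_generators import sparse_schedule_generator

    env = FlatlandRenderWrapper(
        use_renderer=True,
        width=25,
        height=25,
        rail_generator=sparse_rail_generator(max_num_cities=3, seed=42),
        schedule_generator=sparse_schedule_generator(),
        number_of_agents=2,
        obs_builder_object=GlobalObsForRailEnv())
    env.reset()
    frame = env.render(mode='rgb_array')  # numpy array of shape (H, W, 3)
    return env, frame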
def main(args):
    try:
        opts, args = getopt.getopt(args, "", ["sleep-for-animation="])
    except getopt.GetoptError as err:
        print(str(err))  # will print something like "option -a not recognized"
        sys.exit(2)
    sleep_for_animation = True
    for o, a in opts:
        if o in ("--sleep-for-animation"):
            sleep_for_animation = str2bool(a)
        else:
            assert False, "unhandled option"

    batch_builder = SampleBatchBuilder()  # or MultiAgentSampleBatchBuilder
    writer = JsonWriter("./out/")

    # Setting these two parameters to True can slow down training.
    # Note that they override the --sleep-for-animation value parsed above.
    visuals = False
    sleep_for_animation = False

    if visuals:
        from flatland.utils.rendertools import RenderTool

    max_depth = 30
    tree_depth = 2
    trial_start = 100
    n_trials = 999
    start = 0

    columns = [
        'Agents', 'X_DIM', 'Y_DIM', 'TRIAL_NO', 'REWARD', 'NORMALIZED_REWARD',
        'DONE_RATIO', 'STEPS', 'ACTION_PROB'
    ]
    df_all_results = pd.DataFrame(columns=columns)

    for trials in range(trial_start, n_trials + 1):

        env_file = f"envs-100-999/envs/Level_{trials}.pkl"
        # env_file = f"../env_configs/round_1-small/Test_0/Level_{trials}.mpk"

        # file = f"../env_configs/actions-small/Test_0/Level_{trials}.mpk"
        file = f"envs-100-999/actions/envs/Level_{trials}.json"

        if not os.path.isfile(env_file) or not os.path.isfile(file):
            print("Missing file!", env_file, file)
            continue

        step = 0

        obs_builder_object = TreeObsForRailEnv(
            max_depth=tree_depth,
            predictor=ShortestPathPredictorForRailEnv(max_depth))

        env = RailEnv(
            width=1,
            height=1,
            rail_generator=rail_from_file(env_file),
            schedule_generator=schedule_from_file(env_file),
            malfunction_generator_and_process_data=malfunction_from_file(
                env_file),
            obs_builder_object=obs_builder_object)

        obs, info = env.reset(regenerate_rail=True,
                              regenerate_schedule=True,
                              activate_agents=False,
                              random_seed=1001)

        with open(file, "r") as files:
            expert_actions = json.load(files)

        n_agents = env.get_num_agents()
        x_dim, y_dim = env.width, env.height

        agent_obs = [None] * n_agents
        agent_obs_buffer = [None] * n_agents
        done = dict()
        done["__all__"] = False

        if imitate:
            agent_action_buffer = list(expert_actions[step].values())
        else:
            # , p=[0.2, 0, 0.5])  # [0] * n_agents
            agent_action_buffer = np.random.choice(5, n_agents, replace=True)
        update_values = [False] * n_agents

        max_steps = int(4 * 2 * (20 + env.height + env.width))

        action_size = 5  # 3

        # And some variables to keep track of the progress
        action_dict = dict()
        scores_window = deque(maxlen=100)
        reward_window = deque(maxlen=100)
        done_window = deque(maxlen=100)
        action_prob = [0] * action_size

        # agent = Agent(state_size, action_size)

        if visuals:
            env_renderer = RenderTool(env, gl="PILSVG")
            env_renderer.render_env(show=True,
                                    frames=True,
                                    show_observations=True)

        for a in range(n_agents):
            if obs[a]:
                agent_obs[a] = normalize_observation(obs[a],
                                                     tree_depth,
                                                     observation_radius=10)
                agent_obs_buffer[a] = agent_obs[a].copy()

        # Reset score and done
        score = 0
        agent_action_buffer = np.zeros(n_agents)
        # prev_action = np.zeros_like(envs.action_space.sample())
        prev_reward = np.zeros(n_agents)
        for step in range(max_steps):
            for a in range(n_agents):
                if info['action_required'][a]:
                    if imitate:
                        if step < len(expert_actions):
                            action = expert_actions[step][str(a)]
                        else:
                            action = 0
                    else:
                        action = 0

                    action_prob[action] += 1
                    update_values[a] = True

                else:
                    update_values[a] = False
                    action = 0

                action_dict.update({a: action})

            next_obs, all_rewards, done, info = env.step(action_dict)

            for a in range(n_agents):

                if next_obs[a] is not None:
                    agent_obs[a] = normalize_observation(next_obs[a],
                                                         tree_depth,
                                                         observation_radius=10)

                # Only update the values when we are done or when an action
                # was taken and thus relevant information is present
                if update_values[a] or done[a]:
                    start += 1

                    batch_builder.add_values(
                        t=step,
                        eps_id=trials,
                        agent_index=0,
                        obs=agent_obs_buffer[a],
                        actions=action_dict[a],
                        action_prob=1.0,  # put the true action probability
                        rewards=all_rewards[a],
                        prev_actions=agent_action_buffer[a],
                        prev_rewards=prev_reward[a],
                        dones=done[a],
                        infos=info['action_required'][a],
                        new_obs=agent_obs[a])

                agent_obs_buffer[a] = agent_obs[a].copy()
                agent_action_buffer[a] = action_dict[a]
                prev_reward[a] = all_rewards[a]

                score += all_rewards[a]  # / envs.get_num_agents()

            if visuals:
                env_renderer.render_env(show=True,
                                        frames=True,
                                        show_observations=True)
                if sleep_for_animation:
                    time.sleep(0.5)

            if done["__all__"] or step > max_steps:
                writer.write(batch_builder.build_and_reset())
                break

            # Collection information about training
            if step % 100 == 0:
                tasks_finished = 0
                for current_agent in env.agents:
                    if current_agent.status == RailAgentStatus.DONE_REMOVED:
                        tasks_finished += 1
                print(
                    '\rTrial No {} Training {} Agents on ({},{}).\t Steps {}\t Reward: {:.3f}\t Normalized Reward: {:.3f}\tDones: {:.2f}%\t'
                    .format(
                        trials, env.get_num_agents(), x_dim, y_dim, step,
                        score, score / (max_steps + n_agents), 100 * np.mean(
                            tasks_finished / max(1, env.get_num_agents()))),
                    end=" ")

        tasks_finished = 0
        for current_agent in env.agents:
            if current_agent.status == RailAgentStatus.DONE_REMOVED:
                tasks_finished += 1
        done_window.append(tasks_finished / max(1, env.get_num_agents()))
        reward_window.append(score)
        scores_window.append(score / (max_steps + n_agents))

        data = [[
            n_agents, x_dim, y_dim, trials,
            np.mean(reward_window),
            np.mean(scores_window), 100 * np.mean(done_window), step,
            action_prob / np.sum(action_prob)
        ]]

        df_cur = pd.DataFrame(data, columns=columns)
        df_all_results = pd.concat([df_all_results, df_cur])

        if imitate:
            df_all_results.to_csv(
                'TreeImitationLearning_DQN_TrainingResults.csv', index=False)

        print(
            '\rTrial No {} Training {} Agents on ({},{}).\t Total Steps {}\t Reward: {:.3f}\t Normalized Reward: {:.3f}\tDones: {:.2f}%\t'
            .format(trials, env.get_num_agents(), x_dim, y_dim, step,
                    np.mean(reward_window), np.mean(scores_window),
                    100 * np.mean(done_window)))

        if visuals:
            env_renderer.close_window()

        gc.collect()
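# A minimal sketch for reading the recorded expert batches back, assuming
# RLlib's offline JsonReader API; useful for sanity-checking the data before
# imitation learning.
def inspect_recorded_batches(path="./out/", n_batches=3):
    from ray.rllib.offline.json_reader import JsonReader

    reader = JsonReader(path)
    for _ in range(n_batches):
        batch = reader.next()
        print("episodes:", set(batch["eps_id"]),
              "timesteps:", batch.count,
              "mean reward:", batch["rewards"].mean())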
Exemple #14
class FlatlandGymEnv(gym.Env):
    """
    gym.Env wrapper of the Flatland environment providing deadlocks and observation normalization.
    """
    def __init__(self,
                 rail_env,
                 custom_observations,
                 env_params,
                 render=False,
                 regenerate_rail_on_reset=True,
                 regenerate_schedule_on_reset=True):

        self._regenerate_rail_on_reset = regenerate_rail_on_reset
        self._regenerate_schedule_on_reset = regenerate_schedule_on_reset
        self.rail_env = rail_env
        self.deadlocks_detector = DeadlocksDetector()

        self.observation_normalizer = NormalizeObservations(self.rail_env.obs_builder.observation_dim,
                                                            env_params.observation_tree_depth,
                                                            custom_observations,
                                                            self.rail_env.width,
                                                            self.rail_env.height,
                                                            env_params.observation_radius)

        self.state_size = self.observation_normalizer.state_size

        self.render = render
        self.env_renderer = None

    def reset(self):
        obs, info = self.rail_env.reset(regenerate_rail=self._regenerate_rail_on_reset,
                                        regenerate_schedule=self._regenerate_schedule_on_reset)
        # Reset rendering
        if self.render:
            self.env_renderer = RenderTool(self.rail_env, gl="PGL")
            self.env_renderer.set_new_rail()

        # Reset custom observations
        self.observation_normalizer.reset_custom_obs(self.rail_env)

        # Compute deadlocks
        self.deadlocks_detector.reset(self.rail_env.get_num_agents())
        info["deadlocks"] = {}

        for agent in range(self.rail_env.get_num_agents()):
            info["deadlocks"][agent] = self.deadlocks_detector.deadlocks[agent]

        # Normalization
        for agent in obs:
            if obs[agent] is not None:
                obs[agent] = self.observation_normalizer.normalize_observation(obs[agent], self.rail_env,
                                                                               agent, info["deadlocks"][agent])

        return obs, info

    def step(self, action_dict):
        """
        Normalize observations by default, update deadlocks and step.

        :param action_dict:
        :return:
        """
        obs, rewards, dones, info = self.rail_env.step(action_dict)

        # Compute deadlocks
        deadlocks = self.deadlocks_detector.step(self.rail_env)
        info["deadlocks"] = {}
        for agent in range(self.rail_env.get_num_agents()):
            info["deadlocks"][agent] = deadlocks[agent]

        # Normalization
        for agent in obs:
            if obs[agent] is not None:
                obs[agent] = self.observation_normalizer.normalize_observation(obs[agent], self.rail_env,
                                                                               agent, info["deadlocks"][agent])

        return obs, rewards, dones, info

    def show_render(self):
        """
        Open rendering window.

        :return:
        """
        if self.render:
            return self.env_renderer.render_env(
                show=True,
                frames=False,
                show_observations=False,
                show_predictions=False)

    def close(self):
        """
        Close rendering window.
        :return:
        """
        if self.render:
            return self.env_renderer.close_window()
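# A minimal usage sketch for FlatlandGymEnv; `rail_env` is assumed to be an
# already built RailEnv with a tree observation builder, and the SimpleNamespace
# stands in for the project's real env_params object (custom_observations=False
# is likewise an assumption).
def wrap_rail_env(rail_env):
    from types import SimpleNamespace

    env_params = SimpleNamespace(observation_tree_depth=2,
                                 observation_radius=10)
    wrapped = FlatlandGymEnv(rail_env,
                             custom_observations=False,
                             env_params=env_params,
                             render=False)
    obs, info = wrapped.reset()
    return wrapped, obs, info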
Exemple #15
        my_env_current_state = new_state
        current_state = get_current_state(env)

        for handle in range(number_agents):
            agent = env.agents[handle]
            my_agent = my_env.agents[handle]

            # Skip agents that have already reached their target in both envs,
            # otherwise report any divergence between the two simulations.
            if not (agent.position is None and my_agent.position == my_agent.target):
                if agent.position != my_agent.position or agent.direction != my_agent.direction:
                    print("#################### EPISODE ", episode, " #######################")
                    print('  --------------------- step ', step, '  --------------------- action', action)
                    print('')
                    print(tmp, tmp_2)
                    print(my_env_current_state, current_state)
                    print('')

    if episode % checkout_episode == 0:
        renderer.close_window()

    env = gen_env(number_agents, width, height, n_start_goal, seed)

renderer = RenderTool(env, agent_render_variant=3)  # 3 maps to an AgentRenderVariant enum member (see flatland.utils.rendertools)
renderer.reset()
renderer.render_env(show=True, show_predictions=False, show_observations=False)
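# A minimal follow-up loop for the environment and renderer created above,
# with every agent simply trying MOVE_FORWARD (action 2) as a placeholder
# policy.
def render_forward_episode(env, renderer, max_steps=100):
    for _ in range(max_steps):
        action_dict = {handle: 2 for handle in range(env.get_num_agents())}
        obs, rewards, done, info = env.step(action_dict)
        renderer.render_env(show=True, show_observations=False, show_predictions=False)
        if done["__all__"]:
            break
    renderer.close_window()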