Example #1
def eval_policy(env, policy, n_eval_episodes, max_steps):
    action_dict = dict()
    scores = []
    completions = []
    nb_steps = []

    for episode_idx in range(n_eval_episodes):
        agent_obs = [None] * env.get_num_agents()
        score = 0.0

        obs, info = env.reset(regenerate_rail=True, regenerate_schedule=True)

        final_step = 0

        for step in range(max_steps - 1):
            for agent in env.get_agent_handles():
                if obs[agent]:
                    # NOTE: tree depth and radius are hard-coded here; ideally
                    # they would be passed in through eval_policy's parameters.
                    agent_obs[agent] = normalize_observation(
                        obs[agent], tree_depth=2, observation_radius=10)

                action = 0
                if info['action_required'][agent]:
                    action = policy.act(agent_obs[agent], eps=0.0)
                action_dict.update({agent: action})

            obs, all_rewards, done, info = env.step(action_dict)

            for agent in env.get_agent_handles():
                score += all_rewards[agent]

            final_step = step

            if done['__all__']:
                break

        normalized_score = score / (max_steps * env.get_num_agents())
        scores.append(normalized_score)

        tasks_finished = sum(done[idx] for idx in env.get_agent_handles())
        completion = tasks_finished / max(1, env.get_num_agents())
        completions.append(completion)

        nb_steps.append(final_step)

    print("\t✅ Eval: score {:.3f} done {:.1f}%".format(
        np.mean(scores),
        np.mean(completions) * 100.0))

    return scores, completions, nb_steps
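A minimal usage sketch for eval_policy follows. Everything except eval_policy itself is an assumption: the RailEnv configuration mirrors Example #6, RandomPolicy is a hypothetical stand-in for a trained DDDQNPolicy (anything exposing act(obs, eps) works), and normalize_observation is assumed to be in scope from the same training utilities.

import numpy as np
from flatland.envs.rail_env import RailEnv
from flatland.envs.rail_generators import sparse_rail_generator
from flatland.envs.observations import TreeObsForRailEnv


class RandomPolicy:
    """Hypothetical stand-in with the same act(obs, eps) interface as DDDQNPolicy."""

    def act(self, obs, eps=0.0):
        return np.random.randint(5)


env = RailEnv(width=25, height=25,
              rail_generator=sparse_rail_generator(max_num_cities=4),
              number_of_agents=1,
              obs_builder_object=TreeObsForRailEnv(max_depth=2))

# Same step budget formula as Examples #3 and #6.
max_steps = int(4 * 2 * (env.height + env.width + 1 / 4))

scores, completions, nb_steps = eval_policy(env, RandomPolicy(),
                                            n_eval_episodes=5, max_steps=max_steps)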
Example #2
def my_controller(env, obs, number_of_agents):
    # ====================S====================
    for a in range(number_of_agents):
        if done[a]:
            continue
        agent = env.agents[a]
        if agent.speed_data['position_fraction'] > 5:
            action_dict.update({a: 4})  # stop
            continue
        if info['action_required'][a]:
            if agent.position is not None:
                possible_transition_num = np.count_nonzero(env.rail.get_transitions(*agent.position, agent.direction))
                if possible_transition_num == 1:
                    action = 2
                else:
                    action = policy.act(normalize_observation(obs[a], observation_tree_depth, zero_center=False), eps=1.0)
            else:
                action = 2
        else:
            action = 0
        action_dict.update({a: action})
    return action_dict
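The hard-coded integers in this controller (0, 2, 4) are flatland's discrete actions; Example #3 below logs them through the RailEnvActions enum. A short sanity-check sketch of the mapping:

from flatland.envs.rail_env import RailEnvActions

# Integer codes used by my_controller and the other examples on this page.
assert RailEnvActions.DO_NOTHING == 0    # returned when no action is required
assert RailEnvActions.MOVE_LEFT == 1
assert RailEnvActions.MOVE_FORWARD == 2  # used on cells with a single exit
assert RailEnvActions.MOVE_RIGHT == 3
assert RailEnvActions.STOP_MOVING == 4   # used when the position_fraction check trips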
Example #3
def train_agent(env_params, train_params):
    # Environment parameters
    n_agents = env_params.n_agents
    x_dim = env_params.x_dim
    y_dim = env_params.y_dim
    n_cities = env_params.n_cities
    max_rails_between_cities = env_params.max_rails_between_cities
    max_rails_in_city = env_params.max_rails_in_city
    seed = env_params.seed

    # Observation parameters
    observation_tree_depth = env_params.observation_tree_depth
    observation_radius = env_params.observation_radius
    observation_max_path_depth = env_params.observation_max_path_depth

    # Training parameters
    eps_start = train_params.eps_start
    eps_end = train_params.eps_end
    eps_decay = train_params.eps_decay
    n_episodes = train_params.n_episodes
    checkpoint_interval = train_params.checkpoint_interval
    n_eval_episodes = train_params.n_evaluation_episodes

    # Set the seeds
    random.seed(seed)
    np.random.seed(seed)

    # Break agents from time to time
    malfunction_parameters = MalfunctionParameters(
        malfunction_rate=1. / 10000,  # Rate of malfunctions
        min_duration=15,  # Minimal duration
        max_duration=50  # Max duration
    )

    # Observation builder
    predictor = ShortestPathPredictorForRailEnv(observation_max_path_depth)
    tree_observation = TreeObsForRailEnv(max_depth=observation_tree_depth,
                                         predictor=predictor)

    # Fraction of trains running at each speed profile
    speed_profiles = {
        1.: 1.0,  # Fast passenger train
        1. / 2.: 0.0,  # Fast freight train
        1. / 3.: 0.0,  # Slow commuter train
        1. / 4.: 0.0  # Slow freight train
    }

    # Setup the environment
    env = RailEnv(
        width=x_dim,
        height=y_dim,
        rail_generator=sparse_rail_generator(
            max_num_cities=n_cities,
            grid_mode=False,
            max_rails_between_cities=max_rails_between_cities,
            max_rails_in_city=max_rails_in_city),
        schedule_generator=sparse_schedule_generator(speed_profiles),
        number_of_agents=n_agents,
        malfunction_generator_and_process_data=malfunction_from_params(
            malfunction_parameters),
        obs_builder_object=tree_observation,
        random_seed=seed)

    env.reset(regenerate_schedule=True, regenerate_rail=True)

    # Setup renderer
    if train_params.render:
        env_renderer = RenderTool(env, gl="PGL")

    # Calculate the state size given the depth of the tree observation and the number of features
    n_features_per_node = env.obs_builder.observation_dim
    n_nodes = 0
    for i in range(observation_tree_depth + 1):
        n_nodes += np.power(4, i)
    state_size = n_features_per_node * n_nodes
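    # For observation_tree_depth = 2 this is 1 + 4 + 16 = 21 nodes per observation.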

    # The action space of flatland is 5 discrete actions
    action_size = 5

    # Max number of steps per episode
    # This is the official formula used during evaluations
    # See details in flatland.envs.schedule_generators.sparse_schedule_generator
    max_steps = int(4 * 2 * (env.height + env.width + (n_agents / n_cities)))
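    # e.g. a 25x25 grid with 1 agent and 4 cities gives int(8 * (25 + 25 + 0.25)) = 402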

    action_count = [0] * action_size
    action_dict = dict()
    agent_obs = [None] * env.get_num_agents()
    agent_prev_obs = [None] * env.get_num_agents()
    agent_prev_action = [2] * env.get_num_agents()
    update_values = False
    smoothed_normalized_score = -1.0
    smoothed_eval_normalized_score = -1.0
    smoothed_completion = 0.0
    smoothed_eval_completion = 0.0

    # Double Dueling DQN policy
    policy = DDDQNPolicy(state_size, action_size, train_params)

    # TensorBoard writer
    writer = SummaryWriter()
    writer.add_hparams(vars(train_params), {})
    writer.add_hparams(vars(env_params), {})

    training_timer = Timer()
    training_timer.start()

    print(
        "\n🚉 Training {} trains on {}x{} grid for {} episodes, evaluating on {} episodes every {} episodes.\n"
        .format(env.get_num_agents(), x_dim, y_dim, n_episodes,
                n_eval_episodes, checkpoint_interval))

    for episode_idx in range(n_episodes + 1):
        # Timers
        step_timer = Timer()
        reset_timer = Timer()
        learn_timer = Timer()
        preproc_timer = Timer()

        # Reset environment
        reset_timer.start()
        obs, info = env.reset(regenerate_rail=True, regenerate_schedule=True)
        reset_timer.end()

        if train_params.render:
            env_renderer.set_new_rail()

        score = 0
        nb_steps = 0
        actions_taken = []

        # Build agent specific observations
        for agent in env.get_agent_handles():
            if obs[agent]:
                agent_obs[agent] = normalize_observation(
                    obs[agent],
                    observation_tree_depth,
                    observation_radius=observation_radius)
                agent_prev_obs[agent] = agent_obs[agent].copy()

        # Run episode
        for step in range(max_steps - 1):
            for agent in env.get_agent_handles():
                if info['action_required'][agent]:
                    # If an action is required, we want to store the obs at that step as well as the action
                    update_values = True
                    action = policy.act(agent_obs[agent], eps=eps_start)
                    action_count[action] += 1
                    actions_taken.append(action)
                else:
                    update_values = False
                    action = 0
                action_dict.update({agent: action})

            # Environment step
            step_timer.start()
            next_obs, all_rewards, done, info = env.step(action_dict)
            step_timer.end()

            if train_params.render and episode_idx % checkpoint_interval == 0:
                env_renderer.render_env(show=True,
                                        frames=False,
                                        show_observations=False,
                                        show_predictions=False)

            for agent in range(env.get_num_agents()):
                # Update replay buffer and train agent
                # Only update the values when we are done or when an action was taken and thus relevant information is present
                if update_values or done[agent]:
                    learn_timer.start()
                    policy.step(agent_prev_obs[agent],
                                agent_prev_action[agent], all_rewards[agent],
                                agent_obs[agent], done[agent])
                    learn_timer.end()

                    agent_prev_obs[agent] = agent_obs[agent].copy()
                    agent_prev_action[agent] = action_dict[agent]

                # Preprocess the new observations
                if next_obs[agent]:
                    preproc_timer.start()
                    agent_obs[agent] = normalize_observation(
                        next_obs[agent],
                        observation_tree_depth,
                        observation_radius=observation_radius)
                    preproc_timer.end()

                score += all_rewards[agent]

            nb_steps = step

            if done['__all__']:
                break

        # Epsilon decay
        eps_start = max(eps_end, eps_decay * eps_start)

        # Collect information about training
        tasks_finished = sum(done[idx] for idx in env.get_agent_handles())
        completion = tasks_finished / max(1, env.get_num_agents())
        normalized_score = score / (max_steps * env.get_num_agents())
        action_probs = action_count / np.sum(action_count)
        action_count = [1] * action_size

        # Smoothed values for terminal display and for more stable hyper-parameter tuning
        smoothing = 0.99
        smoothed_normalized_score = smoothed_normalized_score * smoothing + normalized_score * (
            1.0 - smoothing)
        smoothed_completion = smoothed_completion * smoothing + completion * (
            1.0 - smoothing)

        # Print logs
        if episode_idx % checkpoint_interval == 0:
            torch.save(
                policy.qnetwork_local,
                './checkpoints/origin_multi-' + str(episode_idx) + '.pth')
            if train_params.render:
                env_renderer.close_window()

        print('\r🚂 Episode {}'
              '\t 🏆 Score: {:.3f}'
              ' Avg: {:.3f}'
              '\t 💯 Done: {:.2f}%'
              ' Avg: {:.2f}%'
              '\t 🎲 Epsilon: {:.2f} '
              '\t 🔀 Action Probs: {}'.format(episode_idx, normalized_score,
                                             smoothed_normalized_score,
                                             100 * completion,
                                             100 * smoothed_completion,
                                             eps_start,
                                             format_action_prob(action_probs)),
              end=" ")

        # Evaluate policy
        if episode_idx % train_params.checkpoint_interval == 0:
            scores, completions, nb_steps_eval = eval_policy(
                env, policy, n_eval_episodes, max_steps)
            writer.add_scalar("evaluation/scores_min", np.min(scores),
                              episode_idx)
            writer.add_scalar("evaluation/scores_max", np.max(scores),
                              episode_idx)
            writer.add_scalar("evaluation/scores_mean", np.mean(scores),
                              episode_idx)
            writer.add_scalar("evaluation/scores_std", np.std(scores),
                              episode_idx)
            writer.add_histogram("evaluation/scores", np.array(scores),
                                 episode_idx)
            writer.add_scalar("evaluation/completions_min",
                              np.min(completions), episode_idx)
            writer.add_scalar("evaluation/completions_max",
                              np.max(completions), episode_idx)
            writer.add_scalar("evaluation/completions_mean",
                              np.mean(completions), episode_idx)
            writer.add_scalar("evaluation/completions_std",
                              np.std(completions), episode_idx)
            writer.add_histogram("evaluation/completions",
                                 np.array(completions), episode_idx)
            writer.add_scalar("evaluation/nb_steps_min", np.min(nb_steps_eval),
                              episode_idx)
            writer.add_scalar("evaluation/nb_steps_max", np.max(nb_steps_eval),
                              episode_idx)
            writer.add_scalar("evaluation/nb_steps_mean",
                              np.mean(nb_steps_eval), episode_idx)
            writer.add_scalar("evaluation/nb_steps_std", np.std(nb_steps_eval),
                              episode_idx)
            writer.add_histogram("evaluation/nb_steps",
                                 np.array(nb_steps_eval), episode_idx)

            smoothing = 0.9
            smoothed_eval_normalized_score = smoothed_eval_normalized_score * smoothing + np.mean(
                scores) * (1.0 - smoothing)
            smoothed_eval_completion = smoothed_eval_completion * smoothing + np.mean(
                completions) * (1.0 - smoothing)
            writer.add_scalar("evaluation/smoothed_score",
                              smoothed_eval_normalized_score, episode_idx)
            writer.add_scalar("evaluation/smoothed_completion",
                              smoothed_eval_completion, episode_idx)

        # Save logs to tensorboard
        writer.add_scalar("training/score", normalized_score, episode_idx)
        writer.add_scalar("training/smoothed_score", smoothed_normalized_score,
                          episode_idx)
        writer.add_scalar("training/completion", np.mean(completion),
                          episode_idx)
        writer.add_scalar("training/smoothed_completion",
                          np.mean(smoothed_completion), episode_idx)
        writer.add_scalar("training/nb_steps", nb_steps, episode_idx)
        writer.add_histogram("actions/distribution", np.array(actions_taken),
                             episode_idx)
        writer.add_scalar("actions/nothing",
                          action_probs[RailEnvActions.DO_NOTHING], episode_idx)
        writer.add_scalar("actions/left",
                          action_probs[RailEnvActions.MOVE_LEFT], episode_idx)
        writer.add_scalar("actions/forward",
                          action_probs[RailEnvActions.MOVE_FORWARD],
                          episode_idx)
        writer.add_scalar("actions/right",
                          action_probs[RailEnvActions.MOVE_RIGHT], episode_idx)
        writer.add_scalar("actions/stop",
                          action_probs[RailEnvActions.STOP_MOVING],
                          episode_idx)
        writer.add_scalar("training/epsilon", eps_start, episode_idx)
        writer.add_scalar("training/buffer_size", len(policy.memory),
                          episode_idx)
        writer.add_scalar("training/loss", policy.loss, episode_idx)
        writer.add_scalar("timer/reset", reset_timer.get(), episode_idx)
        writer.add_scalar("timer/step", step_timer.get(), episode_idx)
        writer.add_scalar("timer/learn", learn_timer.get(), episode_idx)
        writer.add_scalar("timer/preproc", preproc_timer.get(), episode_idx)
        writer.add_scalar("timer/total", training_timer.get_current(),
                          episode_idx)
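train_agent expects two namespace-like objects whose attribute names can be read off the code above. A hedged sketch of a call, where every value is illustrative (the DDDQN hyper-parameters bundled into train_params mirror the dictionary used in Example #6, since DDDQNPolicy receives train_params directly):

from argparse import Namespace

env_params = Namespace(
    n_agents=1, x_dim=25, y_dim=25, n_cities=4,
    max_rails_between_cities=2, max_rails_in_city=3, seed=42,
    observation_tree_depth=2, observation_radius=10,
    observation_max_path_depth=30)

train_params = Namespace(
    eps_start=1.0, eps_end=0.01, eps_decay=0.997,
    n_episodes=2500, checkpoint_interval=100, n_evaluation_episodes=25,
    render=False,
    # Hyper-parameters consumed by DDDQNPolicy (assumed, mirroring Example #6):
    buffer_size=int(1e5), buffer_min_size=0, batch_size=32, update_every=8,
    learning_rate=0.5e-4, tau=1e-3, gamma=0.99, hidden_size=256, use_gpu=False)

train_agent(env_params, train_params)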
Example #4
def main():
    np.random.seed(1)

    env = RailEnv(
        width=x_dim,
        height=y_dim,
        number_of_agents=n_agents,
        rail_generator=rail_generator,
        schedule_generator=schedule_generator,
        malfunction_generator_and_process_data=malfunction_from_params(
            StochasticData(1 / 8000, 15, 50)),
        obs_builder_object=TreeObservation(max_depth=tree_depth))

    # After training we want to render the results so we also load a renderer
    env_renderer = RenderTool(env, gl="PILSVG")

    # Calculate the state size based on the number of nodes in the tree observation
    num_features_per_node = env.obs_builder.observation_dim
    num_nodes = sum(np.power(4, i) for i in range(tree_depth + 1))
    state_size = num_features_per_node * num_nodes
    action_size = 5

    # Now we load a double dueling DQN agent and initialize it from the checkpoint
    agent = Agent(state_size, action_size)
    if load_from_checkpoint:
        start, eps = agent.load(project_root / 'checkpoints', 0, 1.0)
    else:
        start, eps = 0, 1.0

    # And some variables to keep track of the progress
    action_dict, final_action_dict = {}, {}
    scores_window, done_window = deque(maxlen=500), deque(maxlen=500)
    action_prob = [0] * action_size
    agent_obs = [None] * n_agents
    agent_obs_buffer = [None] * n_agents
    agent_action_buffer = [2] * n_agents

    max_steps = int(3 * (x_dim + y_dim))
    update_values = False
    start_time = time.time()

    # We don't want to retrain on old railway networks when we restart from a checkpoint, so we just loop
    # through the generators to get all the old networks out of the way
    for _ in range(0, start):
        rail_generator()
        schedule_generator()

    # Start the training loop
    for episode in range(start + 1, n_trials + 1):
        env_renderer.reset()
        obs, info = env.reset(True, True)
        score = 0

        # Build agent specific observations
        for a in range(n_agents):
            if obs[a]:
                agent_obs[a] = normalize_observation(obs[a],
                                                     tree_depth,
                                                     observation_radius=10)
                agent_obs_buffer[a] = agent_obs[a].copy()

        # Run episode
        for step in range(max_steps):
            for a in range(n_agents):
                if info['action_required'][a]:
                    # If an action is required, we want to store the obs at that step as well as the action
                    update_values = True
                    action = agent.act(agent_obs[a], eps=eps)
                    # action = np.random.randint(4)
                    action_dict[a] = action
                    action_prob[action] += 1
                else:
                    update_values = False
                    action_dict[a] = 0

            # Environment step
            next_obs, all_rewards, done, info = env.step(action_dict)

            # Update replay buffer and train agent
            for a in range(n_agents):
                # Only update the values when we are done or when an action was taken and thus relevant information is present
                if update_values or done[a]:
                    agent.step(agent_obs_buffer[a], agent_action_buffer[a],
                               all_rewards[a], agent_obs[a], done[a], train)
                    agent_obs_buffer[a] = agent_obs[a].copy()
                    agent_action_buffer[a] = action_dict[a]
                if next_obs[a]:
                    agent_obs[a] = normalize_observation(next_obs[a],
                                                         tree_depth,
                                                         observation_radius=10)

                score += all_rewards[a] / n_agents

            # Render
            if episode % render_interval == 0: render(env_renderer)
            if done['__all__']: break

        # Epsilon decay
        eps = max(eps_end, eps_decay * eps)  # decrease epsilon

        # Collect information about training
        tasks_finished = sum(done[i] for i in range(n_agents))
        done_window.append(tasks_finished / max(1, n_agents))
        scores_window.append(score / max_steps)  # save most recent score

        action_probs = ', '.join(f'{x:.3f}'
                                 for x in action_prob / np.sum(action_prob))
        print(f'\rTraining {n_agents} Agents on ({x_dim},{y_dim}) \t ' +
              f'Episode {episode} \t ' +
              f'Average Score: {np.mean(scores_window):.3f} \t ' +
              f'Dones: {100 * np.mean(done_window):.2f}% \t ' +
              f'Epsilon: {eps:.2f} \t ' +
              f'Action Probabilities: {action_probs}',
              end=" ")

        if episode % report_interval == 0:
            print(f'\rTraining {n_agents} Agents on ({x_dim},{y_dim}) \t ' +
                  f'Episode {episode} \t ' +
                  f'Average Score: {np.mean(scores_window):.3f} \t ' +
                  f'Dones: {100 * np.mean(done_window):.2f}% \t ' +
                  f'Epsilon: {eps:.2f} \t ' +
                  f'Action Probabilities: {action_probs} \t ' +
                  f'Time taken: {time.time() - start_time:.2f}s')

            if train: agent.save(project_root / 'checkpoints', episode, eps)
            start_time = time.time()
            action_prob = [1] * action_size
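Example #4 builds its malfunction generator from the older StochasticData container; Examples #3 and #7 pass the same three values through the newer MalfunctionParameters named tuple instead. A hedged sketch of that equivalent setup (the exact semantics of malfunction_rate differ slightly between flatland versions):

from flatland.envs.malfunction_generators import (MalfunctionParameters,
                                                  malfunction_from_params)

malfunction_data = malfunction_from_params(
    MalfunctionParameters(
        malfunction_rate=1 / 8000,  # breakdown rate (check your flatland version's docs)
        min_duration=15,            # shortest breakdown, in steps
        max_duration=50))           # longest breakdown, in steps
# Passed to RailEnv as malfunction_generator_and_process_data=malfunction_data.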
Example #5
            f'Epsilon: {eps:.2f}' if flags.agent_type == "dqn" else None,
            f'Time taken: {time.time() - start_time:.2f}s'
            if show_time else None
        ])) + '  '


# Main training loop
for episode in range(start + 1, flags.num_episodes + 1):
    agent.reset()
    env_renderer.reset()
    obs, info = env.reset(True, True)
    score, steps_taken, collision = 0, 0, False

    # Build initial observations for each agent
    for a in range(flags.num_agents):
        agent_obs[a] = normalize_observation(
            obs[a], flags.tree_depth, zero_center=flags.agent_type == 'dqn')
        agent_obs_buffer[a] = agent_obs[a].copy()

    # Run an episode
    for step in range(max_steps):
        update_values = [False] * flags.num_agents
        action_dict = {}

        for a in range(flags.num_agents):
            if info['action_required'][a]:
                action_dict[a] = agent.act(agent_obs[a], eps=eps)
                # action_dict[a] = np.random.randint(5)
                update_values[a] = True
                steps_taken += 1
            else:
                action_dict[a] = 0
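Unlike Examples #3 and #4, this fragment keeps one update flag per agent, which avoids attributing another agent's fresh action to an agent that did nothing. A hedged sketch of how those per-agent flags would then gate the replay-buffer update (variable names follow the fragment; the agent.step signature is assumed to match Examples #4 and #7):

        # Environment step, then per-agent replay update (hedged continuation sketch).
        next_obs, all_rewards, done, info = env.step(action_dict)

        for a in range(flags.num_agents):
            if update_values[a] or done[a]:
                agent.step(agent_obs_buffer[a], agent_action_buffer[a],
                           all_rewards[a], agent_obs[a], done[a], flags.train)
                agent_obs_buffer[a] = agent_obs[a].copy()
                agent_action_buffer[a] = action_dict[a]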
Example #6
def train_agent(n_episodes):
    # Environment parameters
    n_agents = 1
    x_dim = 25
    y_dim = 25
    n_cities = 4
    max_rails_between_cities = 2
    max_rails_in_city = 3
    seed = 42

    # Observation parameters
    observation_tree_depth = 2
    observation_radius = 10

    # Exploration parameters
    eps_start = 1.0
    eps_end = 0.01
    eps_decay = 0.997  # for 2500ts

    # Set the seeds
    random.seed(seed)
    np.random.seed(seed)

    # Observation builder
    tree_observation = TreeObsForRailEnv(max_depth=observation_tree_depth)

    # Setup the environment
    env = RailEnv(width=x_dim,
                  height=y_dim,
                  rail_generator=sparse_rail_generator(
                      max_num_cities=n_cities,
                      seed=seed,
                      grid_mode=False,
                      max_rails_between_cities=max_rails_between_cities,
                      max_rails_in_city=max_rails_in_city),
                  schedule_generator=sparse_schedule_generator(),
                  number_of_agents=n_agents,
                  obs_builder_object=tree_observation)

    env.reset(True, True)

    # Calculate the state size given the depth of the tree observation and the number of features
    n_features_per_node = env.obs_builder.observation_dim
    n_nodes = 0
    for i in range(observation_tree_depth + 1):
        n_nodes += np.power(4, i)
    state_size = n_features_per_node * n_nodes

    # The action space of flatland is 5 discrete actions
    action_size = 5

    # Max number of steps per episode
    # This is the official formula used during evaluations
    max_steps = int(4 * 2 * (env.height + env.width + (n_agents / n_cities)))

    action_dict = dict()

    # And some variables to keep track of the progress
    scores_window = deque(maxlen=100)  # todo smooth when rendering instead
    completion_window = deque(maxlen=100)
    scores = []
    completion = []
    action_count = [0] * action_size
    agent_obs = [None] * env.get_num_agents()
    agent_prev_obs = [None] * env.get_num_agents()
    agent_prev_action = [2] * env.get_num_agents()
    update_values = False

    # Training parameters
    training_parameters = {
        'buffer_size': int(1e5),
        'batch_size': 32,
        'update_every': 8,
        'learning_rate': 0.5e-4,
        'tau': 1e-3,
        'gamma': 0.99,
        'buffer_min_size': 0,
        'hidden_size': 256,
        'use_gpu': False
    }

    # Double Dueling DQN policy
    policy = DDDQNPolicy(state_size, action_size,
                         Namespace(**training_parameters))

    for episode_idx in range(n_episodes):
        score = 0

        # Reset environment
        obs, info = env.reset(regenerate_rail=True, regenerate_schedule=True)

        # Build agent specific observations
        for agent in env.get_agent_handles():
            if obs[agent]:
                agent_obs[agent] = normalize_observation(
                    obs[agent],
                    observation_tree_depth,
                    observation_radius=observation_radius)
                agent_prev_obs[agent] = agent_obs[agent].copy()

        # Run episode
        for step in range(max_steps - 1):
            for agent in env.get_agent_handles():
                if info['action_required'][agent]:
                    # If an action is required, we want to store the obs at that step as well as the action
                    update_values = True

                    action = policy.act(agent_obs[agent], eps=eps_start)
                    action_count[action] += 1
                else:
                    update_values = False
                    action = 0
                action_dict.update({agent: action})

            # Environment step
            next_obs, all_rewards, done, info = env.step(action_dict)

            # Update replay buffer and train agent
            for agent in range(env.get_num_agents()):
                # Only update the values when we are done or when an action was taken and thus relevant information is present
                if update_values or done[agent]:
                    policy.step(agent_prev_obs[agent],
                                agent_prev_action[agent], all_rewards[agent],
                                agent_obs[agent], done[agent])

                    agent_prev_obs[agent] = agent_obs[agent].copy()
                    agent_prev_action[agent] = action_dict[agent]

                if next_obs[agent]:
                    agent_obs[agent] = normalize_observation(
                        next_obs[agent],
                        observation_tree_depth,
                        observation_radius=observation_radius)

                score += all_rewards[agent]

            if done['__all__']:
                break

        # Epsilon decay
        eps_start = max(eps_end, eps_decay * eps_start)

        # Collect information about training
        tasks_finished = np.sum(
            [int(done[idx]) for idx in env.get_agent_handles()])
        completion_window.append(tasks_finished / max(1, env.get_num_agents()))
        scores_window.append(score / (max_steps * env.get_num_agents()))
        completion.append((np.mean(completion_window)))
        scores.append(np.mean(scores_window))
        action_probs = action_count / np.sum(action_count)

        if episode_idx % 100 == 0:
            end = "\n"
            torch.save(policy.qnetwork_local,
                       './checkpoints/single-' + str(episode_idx) + '.pth')
            action_count = [1] * action_size
        else:
            end = " "

        print(
            '\rTraining {} agents on {}x{}\t Episode {}\t Average Score: {:.3f}\tDones: {:.2f}%\tEpsilon: {:.2f} \t Action Probabilities: \t {}'
            .format(env.get_num_agents(), x_dim, y_dim, episode_idx,
                    np.mean(scores_window), 100 * np.mean(completion_window),
                    eps_start, action_probs),
            end=end)

    # Plot overall training progress at the end
    plt.plot(scores)
    plt.show()

    plt.plot(completion)
    plt.show()
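A trivial entry point for this single-agent trainer (the episode count is an arbitrary illustrative value):

if __name__ == "__main__":
    train_agent(n_episodes=500)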
Example #7
def main():
    np.random.seed(1)

    env = RailEnv(
        width=flags.grid_width,
        height=flags.grid_height,
        number_of_agents=flags.num_agents,
        rail_generator=rail_generator,
        schedule_generator=schedule_generator,
        malfunction_generator_and_process_data=malfunction_from_params(
            MalfunctionParameters(1 / 8000, 15, 50)),
        obs_builder_object=TreeObservation(max_depth=flags.tree_depth))

    # After training we want to render the results so we also load a renderer
    env_renderer = RenderTool(env, gl="PILSVG")

    # Calculate the state size based on the number of nodes in the tree observation
    num_features_per_node = env.obs_builder.observation_dim
    num_nodes = sum(np.power(4, i) for i in range(flags.tree_depth + 1))
    state_size = num_nodes * num_features_per_node
    action_size = 5

    # Now we load a double dueling DQN agent and initialize it from the checkpoint
    agent = Agent(state_size, action_size)
    if flags.load_from_checkpoint:
        start, eps = agent.load(project_root / 'checkpoints', 0, 1.0)
    else:
        start, eps = 0, 1.0

    # And some variables to keep track of the progress
    action_dict, final_action_dict = {}, {}
    scores_window, steps_window, done_window = deque(maxlen=200), deque(
        maxlen=200), deque(maxlen=200)
    action_prob = [0] * action_size
    agent_obs = [None] * flags.num_agents
    agent_obs_buffer = [None] * flags.num_agents
    agent_action_buffer = [2] * flags.num_agents

    max_steps = int(8 * (flags.grid_width + flags.grid_height))
    update_values = False
    start_time = time.time()

    # We don't want to retrain on old railway networks when we restart from a checkpoint, so we just loop
    # through the generators to get all the old networks out of the way
    if start > 0: print(f"Skipping {start} railways")
    for _ in range(0, start):
        rail_generator()
        schedule_generator()

    # Start the training loop
    for episode in range(start + 1, flags.num_episodes + 1):
        env_renderer.reset()
        obs, info = env.reset(True, True)
        score, steps_taken = 0, 0

        # Build agent specific observations
        for a in range(flags.num_agents):
            if obs[a]:
                agent_obs[a] = normalize_observation(obs[a], flags.tree_depth)
                agent_obs_buffer[a] = agent_obs[a].copy()

        # Run episode
        for step in range(max_steps):
            for a in range(flags.num_agents):
                # if not isinstance(obs[a].childs['L'], float) or not isinstance(obs[a].childs['R'], float):
                if info['action_required'][a]:
                    # If an action is required, we want to store the obs at that step as well as the action
                    update_values = True

                    # distances = { key: child.dist_min_to_target for key, child in obs[a].childs.items() if not isinstance(child, float) }
                    # action_key = min(distances, key=distances.get)
                    # action = { 'L': 1, 'F': 2, 'R': 3 }[action_key]
                    # action = np.argmin(agent_obs[a])

                    # action = np.random.randint(4)
                    action = agent.act(agent_obs[a], eps=eps)
                    action_dict[a] = action
                    action_prob[action] += 1
                    steps_taken += 1
                else:
                    update_values = False
                    action_dict[a] = 2

            # Environment step
            obs, all_rewards, done, info = env.step(action_dict)

            # Update replay buffer and train agent
            for a in range(flags.num_agents):
                # Only update the values when we are done or when an action was taken and thus relevant information is present
                if update_values or done[a]:
                    agent.step(agent_obs_buffer[a], agent_action_buffer[a],
                               all_rewards[a], agent_obs[a], done[a],
                               flags.train)
                    agent_obs_buffer[a] = agent_obs[a].copy()
                    agent_action_buffer[a] = action_dict[a]
                if obs[a]:
                    agent_obs[a] = normalize_observation(
                        obs[a], flags.tree_depth)

                score += all_rewards[a] / flags.num_agents

            # Render
            if flags.render_interval and episode % flags.render_interval == 0:
                render(env_renderer)
            if done['__all__']: break

        # Epsilon decay
        eps = max(0.01, flags.epsilon_decay * eps)

        # Save some training statistics in their respective deques
        tasks_finished = sum(done[i] for i in range(flags.num_agents))
        done_window.append(tasks_finished / max(1, flags.num_agents))
        scores_window.append(score / max_steps)
        steps_window.append(steps_taken)
        action_probs = ', '.join(f'{x:.3f}'
                                 for x in action_prob / np.sum(action_prob))

        print(
            f'\rTraining {flags.num_agents} Agents on ({flags.grid_width},{flags.grid_height}) \t '
            + f'Episode {episode} \t ' +
            f'Average Score: {np.mean(scores_window):.3f} \t ' +
            f'Average Steps Taken: {np.mean(steps_window):.1f} \t ' +
            f'Dones: {100 * np.mean(done_window):.2f}% \t ' +
            f'Epsilon: {eps:.2f} \t ' +
            f'Action Probabilities: {action_probs}',
            end=" ")

        if episode % flags.report_interval == 0:
            print(
                f'\rTraining {flags.num_agents} Agents on ({flags.grid_width},{flags.grid_height}) \t '
                + f'Episode {episode} \t ' +
                f'Average Score: {np.mean(scores_window):.3f} \t ' +
                f'Average Steps Taken: {np.mean(steps_window):.1f} \t ' +
                f'Dones: {100 * np.mean(done_window):.2f}% \t ' +
                f'Epsilon: {eps:.2f} \t ' +
                f'Action Probabilities: {action_probs} \t ' +
                f'Time taken: {time.time() - start_time:.2f}s')

            if flags.train:
                agent.save(project_root / 'checkpoints', episode, eps)
            start_time = time.time()
            action_prob = [1] * action_size
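Example #7 reads its whole configuration from a flags object; only the attribute names below are taken from the code above, while the values and the use of argparse.Namespace are assumptions for illustration:

from argparse import Namespace

flags = Namespace(
    grid_width=35, grid_height=35, num_agents=3, tree_depth=2,
    num_episodes=3000, epsilon_decay=0.998,
    render_interval=100, report_interval=100,
    load_from_checkpoint=False, train=True)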
Example #8
    # the state of the remote copy of the env, and the observations and 
    # rewards, etc will behave unexpectedly
    # 
    # You can however probe the local_env instance to get any information
    # you need from the environment. It is a valid RailEnv instance.
    local_env = remote_client.env
    number_of_agents = len(local_env.agents)

    #====================S====================
    done = local_env.dones
    agent_obs = [None] * number_of_agents
    #agent_obs_buffer = [None] * number_of_agents

    # Build initial observations for each agent
    for a in range(number_of_agents):
        agent_obs[a] = normalize_observation(observation[a], observation_tree_depth, zero_center=False)
    #====================E====================

    # Now we enter into another infinite loop where we 
    # compute the actions for all the individual steps in this episode
    # until the episode is `done`
    # 
    # An episode is considered done when either all the agents have 
    # reached their target destination
    # or when the number of time steps has exceeded max_time_steps, which
    # is defined by : 
    #
    # max_time_steps = int(4 * 2 * (env.width + env.height + 20))
    #
    time_taken_by_controller = []
    time_taken_per_step = []
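What usually follows this fragment in a Flatland evaluation client is the per-step loop that queries the controller and forwards the actions through the remote client. A hedged sketch, assuming remote_client is a FlatlandRemoteClient as in the standard starter kit, time is imported at module level, and my_controller is the wrapper from Example #2 (or similar):

    while True:
        time_start = time.time()
        action_dict = my_controller(local_env, observation, number_of_agents)
        time_taken_by_controller.append(time.time() - time_start)

        time_start = time.time()
        # Forward the actions to the remote evaluator and receive the next state.
        observation, all_rewards, done, info = remote_client.env_step(action_dict)
        time_taken_per_step.append(time.time() - time_start)

        if done['__all__']:
            break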