Example #1
def create_rail_env(env_params, tree_observation):
    n_agents = env_params.n_agents
    x_dim = env_params.x_dim
    y_dim = env_params.y_dim
    n_cities = env_params.n_cities
    max_rails_between_cities = env_params.max_rails_between_cities
    max_rails_in_city = env_params.max_rails_in_city
    seed = env_params.seed

    # Break agents from time to time
    malfunction_parameters = MalfunctionParameters(
        malfunction_rate=env_params.malfunction_rate,
        min_duration=20,
        max_duration=50)

    return RailEnv(
        width=x_dim,
        height=y_dim,
        rail_generator=sparse_rail_generator(
            max_num_cities=n_cities,
            grid_mode=False,
            max_rails_between_cities=max_rails_between_cities,
            max_rails_in_city=max_rails_in_city),
        schedule_generator=sparse_schedule_generator(),
        number_of_agents=n_agents,
        malfunction_generator_and_process_data=malfunction_from_params(
            malfunction_parameters),
        obs_builder_object=tree_observation,
        random_seed=seed)
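A minimal usage sketch for the helper above; the Namespace values, observation depth and predictor choice are illustrative assumptions mirroring the later examples, not part of the original snippet:

from argparse import Namespace

from flatland.envs.observations import TreeObsForRailEnv
from flatland.envs.predictions import ShortestPathPredictorForRailEnv

# Hypothetical environment parameters
env_params = Namespace(n_agents=5, x_dim=35, y_dim=35, n_cities=3,
                       max_rails_between_cities=2, max_rails_in_city=3,
                       malfunction_rate=1 / 200, seed=42)
# Tree observation with a shortest-path predictor, as used in later examples
tree_observation = TreeObsForRailEnv(max_depth=2,
                                     predictor=ShortestPathPredictorForRailEnv(30))

env = create_rail_env(env_params, tree_observation)
obs, info = env.reset(regenerate_rail=True, regenerate_schedule=True)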
Example #2
def test_malfunction_before_entry():
    """Tests that malfunctions are working properly for agents before entering the environment!"""
    # Set fixed malfunction duration for this test
    stochastic_data = MalfunctionParameters(malfunction_rate=2,  # Rate of malfunction occurrence
                                            min_duration=10,  # Minimal duration of malfunction
                                            max_duration=10  # Max duration of malfunction
                                            )

    rail, rail_map = make_simple_rail2()

    env = RailEnv(width=25,
                  height=30,
                  rail_generator=rail_from_grid_transition_map(rail),
                  schedule_generator=random_schedule_generator(),
                  number_of_agents=10,
                  malfunction_generator_and_process_data=malfunction_from_params(stochastic_data),
                  obs_builder_object=SingleAgentNavigationObs()
                  )
    env.reset(False, False, False, random_seed=10)
    env.agents[0].target = (0, 0)

    # Test initial malfunction values for all agents
    # we want some agents to be malfunctioning already and some to be working
    # we want different next_malfunction values for the agents
    assert env.agents[0].malfunction_data['malfunction'] == 0
    assert env.agents[1].malfunction_data['malfunction'] == 10
    assert env.agents[2].malfunction_data['malfunction'] == 0
    assert env.agents[3].malfunction_data['malfunction'] == 10
    assert env.agents[4].malfunction_data['malfunction'] == 10
    assert env.agents[5].malfunction_data['malfunction'] == 10
    assert env.agents[6].malfunction_data['malfunction'] == 10
    assert env.agents[7].malfunction_data['malfunction'] == 10
    assert env.agents[8].malfunction_data['malfunction'] == 10
    assert env.agents[9].malfunction_data['malfunction'] == 10
Example #3
def test_malfunction_values_and_behavior():
    """
    Test the malfunction counts down as desired
    Returns
    -------

    """
    # Set fixed malfunction duration for this test

    rail, rail_map = make_simple_rail2()
    action_dict: Dict[int, RailEnvActions] = {}
    stochastic_data = MalfunctionParameters(malfunction_rate=0.001,  # Rate of malfunction occurrence
                                            min_duration=10,  # Minimal duration of malfunction
                                            max_duration=10  # Max duration of malfunction
                                            )
    env = RailEnv(width=25,
                  height=30,
                  rail_generator=rail_from_grid_transition_map(rail),
                  schedule_generator=random_schedule_generator(),
                  number_of_agents=1,
                  malfunction_generator_and_process_data=malfunction_from_params(stochastic_data),
                  obs_builder_object=SingleAgentNavigationObs()
                  )

    env.reset(False, False, activate_agents=True, random_seed=10)

    # Assertions
    assert_list = [9, 8, 7, 6, 5, 4, 3, 2, 1, 0, 10, 9, 8, 7, 6, 5]
    print("[")
    for time_step in range(15):
        # Move in the env
        env.step(action_dict)
        # Check that the malfunction countdown decreases as expected
        assert env.agents[0].malfunction_data['malfunction'] == assert_list[time_step]
Example #4
def test_malfanction_from_params():
    """
    Test that malfunction parameters are correctly loaded via malfunction_from_params.
    """
    stochastic_data = MalfunctionParameters(
        malfunction_rate=1000,  # Rate of malfunction occurrence
        min_duration=2,  # Minimal duration of malfunction
        max_duration=5  # Max duration of malfunction
    )
    rail, rail_map = make_simple_rail2()

    env = RailEnv(
        width=25,
        height=30,
        rail_generator=rail_from_grid_transition_map(rail),
        schedule_generator=random_schedule_generator(),
        number_of_agents=10,
        malfunction_generator_and_process_data=malfunction_from_params(
            stochastic_data))
    env.reset()
    assert env.malfunction_process_data.malfunction_rate == 1000
    assert env.malfunction_process_data.min_duration == 2
    assert env.malfunction_process_data.max_duration == 5
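For context, a minimal sketch of what the test above relies on: malfunction_from_params returns a (generator, process_data) pair, which is why the constructor argument is called malfunction_generator_and_process_data and why the parameters can be read back from env.malfunction_process_data. Import path assumed from flatland-rl:

from flatland.envs.malfunction_generators import (MalfunctionParameters,
                                                  malfunction_from_params)

params = MalfunctionParameters(malfunction_rate=1 / 100, min_duration=2, max_duration=5)
generator, process_data = malfunction_from_params(params)
# The process data simply echoes the parameters back
assert process_data.min_duration == 2
assert process_data.max_duration == 5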
Example #5
def test_malfunction_process():
    # Set fixed malfunction duration for this test
    stochastic_data = MalfunctionParameters(
        malfunction_rate=1,  # Rate of malfunction occurrence
        min_duration=3,  # Minimal duration of malfunction
        max_duration=3  # Max duration of malfunction
    )

    rail, rail_map = make_simple_rail2()

    env = RailEnv(
        width=25,
        height=30,
        rail_generator=rail_from_grid_transition_map(rail),
        schedule_generator=random_schedule_generator(),
        number_of_agents=1,
        malfunction_generator_and_process_data=malfunction_from_params(
            stochastic_data),
        obs_builder_object=SingleAgentNavigationObs())
    obs, info = env.reset(False, False, True, random_seed=10)

    agent_halts = 0
    total_down_time = 0
    agent_old_position = env.agents[0].position

    # Move target to unreachable position in order to not interfere with test
    env.agents[0].target = (0, 0)
    for step in range(100):
        actions = {}

        for i in range(len(obs)):
            actions[i] = np.argmax(obs[i]) + 1

        obs, all_rewards, done, _ = env.step(actions)

        if env.agents[0].malfunction_data['malfunction'] > 0:
            agent_malfunctioning = True
        else:
            agent_malfunctioning = False

        if agent_malfunctioning:
            # Check that agent is not moving while malfunctioning
            assert agent_old_position == env.agents[0].position

        agent_old_position = env.agents[0].position
        total_down_time += env.agents[0].malfunction_data['malfunction']

    # Check that the appropriate number of malfunctions is achieved
    assert env.agents[0].malfunction_data[
        'nr_malfunctions'] == 23, "Actual {}".format(
            env.agents[0].malfunction_data['nr_malfunctions'])

    # Check that malfunctioning data was standing around
    assert total_down_time > 0
Example #6
def create_rail_env(args, load_env=""):
    '''
    Build a RailEnv object with the specified parameters,
    as described in the .yml file
    '''
    # Check if an environment file is provided
    if load_env:
        rail_generator = rail_from_file(load_env)
    else:
        rail_generator = sparse_rail_generator(
            max_num_cities=args.env.max_cities,
            grid_mode=args.env.grid,
            max_rails_between_cities=args.env.max_rails_between_cities,
            max_rails_in_city=args.env.max_rails_in_cities,
            seed=args.env.seed)

    # Build predictor and observator
    obs_type = args.policy.type.get_true_key()
    if PREDICTORS[obs_type] is ShortestDeviationPathPredictor:
        predictor = PREDICTORS[obs_type](
            max_depth=args.observator.max_depth,
            max_deviations=args.predictor.max_depth)
    else:
        predictor = PREDICTORS[obs_type](max_depth=args.predictor.max_depth)
    observator = OBSERVATORS[obs_type](args.observator.max_depth, predictor)

    # Initialize malfunctions
    malfunctions = None
    if args.env.malfunctions.enabled:
        malfunctions = ParamMalfunctionGen(
            MalfunctionParameters(
                malfunction_rate=args.env.malfunctions.rate,
                min_duration=args.env.malfunctions.min_duration,
                max_duration=args.env.malfunctions.max_duration))

    # Initialize agents speeds
    speed_map = None
    if args.env.variable_speed:
        speed_map = {1.: 0.25, 1. / 2.: 0.25, 1. / 3.: 0.25, 1. / 4.: 0.25}
    schedule_generator = sparse_schedule_generator(speed_map,
                                                   seed=args.env.seed)

    # Build the environment
    return RailEnvWrapper(params=args,
                          width=args.env.width,
                          height=args.env.height,
                          rail_generator=rail_generator,
                          schedule_generator=schedule_generator,
                          number_of_agents=args.env.num_trains,
                          obs_builder_object=observator,
                          malfunction_generator=malfunctions,
                          remove_agents_at_target=True,
                          random_seed=args.env.seed)
Example #7
    def _launch(self):
        rail_generator = sparse_rail_generator(
            seed=self._config['seed'],
            max_num_cities=self._config['max_num_cities'],
            grid_mode=self._config['grid_mode'],
            max_rails_between_cities=self._config['max_rails_between_cities'],
            max_rails_in_city=self._config['max_rails_in_city'])

        malfunction_generator = no_malfunction_generator()
        if {
                'malfunction_rate', 'malfunction_min_duration',
                'malfunction_max_duration'
        } <= self._config.keys():
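            # The set comparison above holds only if all three malfunction keys
            # are present in the config, in which case a stochastic malfunction
            # generator replaces the default no-malfunction one.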
            stochastic_data = MalfunctionParameters(
                malfunction_rate=self._config['malfunction_rate'],
                min_duration=self._config['malfunction_min_duration'],
                max_duration=self._config['malfunction_max_duration'])
            malfunction_generator = malfunction_from_params(stochastic_data)

        speed_ratio_map = None
        if 'speed_ratio_map' in self._config:
            speed_ratio_map = {
                float(k): float(v)
                for k, v in self._config['speed_ratio_map'].items()
            }
        schedule_generator = sparse_schedule_generator(speed_ratio_map)

        env = None
        try:
            env = RailEnv(
                width=self._config['width'],
                height=self._config['height'],
                rail_generator=rail_generator,
                schedule_generator=schedule_generator,
                number_of_agents=self._config['number_of_agents'],
                malfunction_generator_and_process_data=malfunction_generator,
                obs_builder_object=self._observation.builder(),
                remove_agents_at_target=False,
                random_seed=self._config['seed'],
                # Should the line below be commented out? Since the env tries different
                # configs here, opening the renderer can be wasteful; moreover, the
                # renderer has to be closed afterwards.
                use_renderer=self._env_config.get('render'))

            env.reset()
        except ValueError as e:
            logging.error("=" * 50)
            logging.error(f"Error while creating env: {e}")
            logging.error("=" * 50)

        return env
Example #8
def test_malfunction_process_statistically():
    """Tests that malfunctions are produced by stochastic_data!"""
    # Set fixed malfunction duration for this test
    stochastic_data = MalfunctionParameters(
        malfunction_rate=1 / 5,  # Rate of malfunction occurrence
        min_duration=5,  # Minimal duration of malfunction
        max_duration=5  # Max duration of malfunction
    )

    rail, rail_map = make_simple_rail2()

    env = RailEnv(
        width=25,
        height=30,
        rail_generator=rail_from_grid_transition_map(rail),
        schedule_generator=random_schedule_generator(),
        number_of_agents=10,
        malfunction_generator_and_process_data=malfunction_from_params(
            stochastic_data),
        obs_builder_object=SingleAgentNavigationObs())

    env.reset(True, True, False, random_seed=10)

    env.agents[0].target = (0, 0)
    # Next line only for test generation
    # agent_malfunction_list = [[] for i in range(10)]
    agent_malfunction_list = [
        [0, 0, 0, 0, 5, 4, 3, 2, 1, 0, 5, 4, 3, 2, 1, 0, 0, 0, 5, 4],
        [0, 5, 4, 3, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 5, 4, 3, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 5, 4, 3, 2, 1, 0, 5, 4, 3, 2, 1, 0, 0, 5, 4, 3, 2],
        [0, 0, 0, 0, 5, 4, 3, 2, 1, 0, 0, 0, 0, 0, 0, 5, 4, 3, 2, 1],
        [0, 0, 5, 4, 3, 2, 1, 0, 0, 5, 4, 3, 2, 1, 0, 5, 4, 3, 2, 1],
        [0, 0, 0, 0, 0, 0, 0, 0, 5, 4, 3, 2, 1, 0, 0, 0, 0, 0, 0, 0],
        [5, 4, 3, 2, 1, 0, 0, 5, 4, 3, 2, 1, 0, 0, 0, 0, 0, 0, 0, 5],
        [5, 4, 3, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 5, 4, 3, 2],
        [5, 4, 3, 2, 1, 0, 0, 0, 0, 0, 5, 4, 3, 2, 1, 0, 0, 0, 5, 4]
    ]

    for step in range(20):
        action_dict: Dict[int, RailEnvActions] = {}
        for agent_idx in range(env.get_num_agents()):
            # We randomly select an action
            action_dict[agent_idx] = RailEnvActions(np.random.randint(4))
            # For generating tests only:
            # agent_malfunction_list[agent_idx].append(env.agents[agent_idx].malfunction_data['malfunction'])
            assert env.agents[agent_idx].malfunction_data[
                'malfunction'] == agent_malfunction_list[agent_idx][step]
        env.step(action_dict)
Example #9
    def _launch(self):
        rail_generator = self.get_rail_generator()

        malfunction_generator = NoMalfunctionGen()
        if {'malfunction_rate', 'malfunction_min_duration', 'malfunction_max_duration'} <= self._config.keys():
            print("MALFUNCTIONS POSSIBLE")
            params = MalfunctionParameters(malfunction_rate=1 / self._config['malfunction_rate'],
                                           max_duration=self._config['malfunction_max_duration'],
                                           min_duration=self._config['malfunction_min_duration'])
            malfunction_generator = ParamMalfunctionGen(params)

        speed_ratio_map = None
        if 'speed_ratio_map' in self._config:
            speed_ratio_map = {
                float(k): float(v) for k, v in self._config['speed_ratio_map'].items()
            }
        if self._gym_env_class == SequentialFlatlandGymEnv:
            schedule_generator = SequentialSparseSchedGen(speed_ratio_map, seed=1)
        else:
            schedule_generator = sparse_schedule_generator(speed_ratio_map)

        env = None
        try:
            if self._fine_tune_env_path is None:
                env = RailEnv(
                    width=self._config['width'],
                    height=self._config['height'],
                    rail_generator=rail_generator,
                    schedule_generator=schedule_generator,
                    number_of_agents=self._config['number_of_agents'],
                    malfunction_generator=malfunction_generator,
                    obs_builder_object=self._observation.builder(),
                    remove_agents_at_target=True,
                    random_seed=self._config['seed'],
                    use_renderer=self._env_config.get('render')
                )
                env.reset()
            else:
                env, _ = RailEnvPersister.load_new(self._fine_tune_env_path)
                env.reset(regenerate_rail=False, regenerate_schedule=False)
                env.obs_builder = self._observation.builder()
                env.obs_builder.set_env(env)

        except ValueError as e:
            logging.error("=" * 50)
            logging.error(f"Error while creating env: {e}")
            logging.error("=" * 50)

        return env
Example #10
def create_and_save_env(file_name: str, schedule_generator: ScheduleGenerator,
                        rail_generator: RailGenerator):
    stochastic_data = MalfunctionParameters(
        malfunction_rate=1000,  # Rate of malfunction occurrence
        min_duration=15,  # Minimal duration of malfunction
        max_duration=50  # Max duration of malfunction
    )

    env = RailEnv(
        width=30,
        height=30,
        rail_generator=rail_generator,
        schedule_generator=schedule_generator,
        number_of_agents=10,
        malfunction_generator_and_process_data=malfunction_from_params(
            stochastic_data),
        remove_agents_at_target=True)
    env.reset(True, True)
    #env.save(file_name)
    RailEnvPersister.save(env, file_name)
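A hypothetical call to the helper above; the file name and the sparse generators are illustrative assumptions consistent with the other examples:

create_and_save_env("./envs/Level_0.mpk",
                    schedule_generator=sparse_schedule_generator(),
                    rail_generator=sparse_rail_generator(max_num_cities=3,
                                                         max_rails_between_cities=2,
                                                         max_rails_in_city=4))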
Example #11
def get_env(config=None, rl=False):
    n_agents = 16
    schedule_generator = sparse_schedule_generator(None)

    rail_generator = sparse_rail_generator(
        seed=seed,
        max_num_cities=3,
        grid_mode=False,
        max_rails_between_cities=2,
        max_rails_in_city=4,
    )

    if rl:
        obs_builder = make_obs("combined", {
            "path": None,
            "simple_meta": None
        }).builder()
    else:
        obs_builder = DummyObs()

    params = MalfunctionParameters(malfunction_rate=1 / 1000,
                                   max_duration=50,
                                   min_duration=20)
    malfunction_generator = ParamMalfunctionGen(params)

    env = RailEnv(
        width=28,
        height=28,
        rail_generator=rail_generator,
        schedule_generator=schedule_generator,
        number_of_agents=n_agents,
        malfunction_generator=malfunction_generator,
        obs_builder_object=obs_builder,
        remove_agents_at_target=True,
        random_seed=seed,
    )

    return env
Example #12
def get_env(config=None, rl=False):
    n_agents = 32
    schedule_generator = sparse_schedule_generator(None)

    rail_generator = sparse_rail_generator(
        seed=seed,
        max_num_cities=4,
        grid_mode=False,
        max_rails_between_cities=2,
        max_rails_in_city=4,
    )

    if rl:
        obs_builder = make_obs(
            config["env_config"]['observation'],
            config["env_config"].get('observation_config')).builder()
    else:
        obs_builder = DummyObs()

    params = MalfunctionParameters(malfunction_rate=1 / 1000,
                                   max_duration=50,
                                   min_duration=20)
    malfunction_generator = ParamMalfunctionGen(params)

    env = RailEnv(
        width=32,
        height=32,
        rail_generator=rail_generator,
        schedule_generator=schedule_generator,
        number_of_agents=n_agents,
        malfunction_generator=malfunction_generator,
        obs_builder_object=obs_builder,
        remove_agents_at_target=True,
        random_seed=seed,
    )

    return env
Example #13
speed_ration_map = {
    1.: 1,  # Fast passenger train
    1. / 2.: 0,  # Fast freight train
    1. / 3.: 0,  # Slow commuter train
    1. / 4.: 0  # Slow freight train
}

# We can now initiate the schedule generator with the given speed profiles

schedule_generator = sparse_schedule_generator(speed_ration_map)

# We can furthermore pass stochastic data to the RailEnv constructor which will allow for stochastic malfunctions
# during an episode.
stochastic_data = MalfunctionParameters(
    malfunction_rate=0,  # Rate of malfunction occurrence
    min_duration=3,  # Minimal duration of malfunction
    max_duration=20  # Max duration of malfunction
)
print(stochastic_data)

# Custom observation builder without predictor
observation_builder = GlobalObsForRailEnv()

# Custom observation builder with predictor, uncomment line below if you want to try this one
# observation_builder = TreeObsForRailEnv(max_depth=2, predictor=ShortestPathPredictorForRailEnv())

# Construct the environment with the given observation, generators, predictors, and stochastic data
env = RailEnv(
    width=width,
    height=height,
    rail_generator=rail_generator,
Example #14
def create_test_env(fnParams, nTest, sDir):
    (seed, width, height, nr_trains, nr_cities, max_rails_between_cities,
     max_rails_in_cities, malfunction_rate, malfunction_min_duration,
     malfunction_max_duration) = fnParams(nTest)
    #if not ShouldRunTest(test_id):
    #    continue

    rail_generator = sparse_rail_generator(
        max_num_cities=nr_cities,
        seed=seed,
        grid_mode=False,
        max_rails_between_cities=max_rails_between_cities,
        max_rails_in_city=max_rails_in_cities,
    )

    #stochastic_data = {'malfunction_rate': malfunction_rate,
    #                    'min_duration': malfunction_min_duration,
    #                    'max_duration': malfunction_max_duration
    #                }

    stochastic_data = MalfunctionParameters(
        malfunction_rate=malfunction_rate,
        min_duration=malfunction_min_duration,
        max_duration=malfunction_max_duration)

    observation_builder = GlobalObsForRailEnv()

    DEFAULT_SPEED_RATIO_MAP = {
        1.: 0.25,
        1. / 2.: 0.25,
        1. / 3.: 0.25,
        1. / 4.: 0.25
    }

    schedule_generator = sparse_schedule_generator(DEFAULT_SPEED_RATIO_MAP)

    for iAttempt in range(5):
        try:
            env = RailEnv(
                width=width,
                height=height,
                rail_generator=rail_generator,
                schedule_generator=schedule_generator,
                number_of_agents=nr_trains,
                malfunction_generator_and_process_data=malfunction_from_params(
                    stochastic_data),
                obs_builder_object=observation_builder,
                remove_agents_at_target=True)
            obs = env.reset(random_seed=seed)
            break
        except ValueError as oErr:
            print("Error:", oErr)
            width += 5
            height += 5
            print("Try again with larger env: (w,h):", width, height)

    if not os.path.exists(sDir):
        os.makedirs(sDir)

    sfName = "{}/Level_{}.mpk".format(sDir, nTest)
    if os.path.exists(sfName):
        os.remove(sfName)
    env.save(sfName)

    sys.stdout.write(".")
    sys.stdout.flush()

    return env
Example #15
def eval_policy(env_params, checkpoint, n_eval_episodes, max_steps,
                action_size, state_size, seed, render, allow_skipping,
                allow_caching):
    # Evaluation is faster on CPU (except if you use a really huge policy)
    parameters = {'use_gpu': False}

    policy = DDDQNPolicy(state_size,
                         action_size,
                         Namespace(**parameters),
                         evaluation_mode=True)
    policy.qnetwork_local = torch.load(checkpoint)

    env_params = Namespace(**env_params)

    # Environment parameters
    n_agents = env_params.n_agents
    x_dim = env_params.x_dim
    y_dim = env_params.y_dim
    n_cities = env_params.n_cities
    max_rails_between_cities = env_params.max_rails_between_cities
    max_rails_in_city = env_params.max_rails_in_city

    # Malfunction and speed profiles
    # TODO pass these parameters properly from main!
    malfunction_parameters = MalfunctionParameters(
        malfunction_rate=1. / 2000,  # Rate of malfunctions
        min_duration=20,  # Minimal duration
        max_duration=50  # Max duration
    )

    # Only fast trains in Round 1
    speed_profiles = {
        1.: 1.0,  # Fast passenger train
        1. / 2.: 0.0,  # Fast freight train
        1. / 3.: 0.0,  # Slow commuter train
        1. / 4.: 0.0  # Slow freight train
    }

    # Observation parameters
    observation_tree_depth = env_params.observation_tree_depth
    observation_radius = env_params.observation_radius
    observation_max_path_depth = env_params.observation_max_path_depth

    # Observation builder
    predictor = ShortestPathPredictorForRailEnv(observation_max_path_depth)
    tree_observation = TreeObsForRailEnv(max_depth=observation_tree_depth,
                                         predictor=predictor)

    # Setup the environment
    env = RailEnv(
        width=x_dim,
        height=y_dim,
        rail_generator=sparse_rail_generator(
            max_num_cities=n_cities,
            grid_mode=False,
            max_rails_between_cities=max_rails_between_cities,
            max_rails_in_city=max_rails_in_city,
        ),
        schedule_generator=sparse_schedule_generator(speed_profiles),
        number_of_agents=n_agents,
        malfunction_generator_and_process_data=malfunction_from_params(
            malfunction_parameters),
        obs_builder_object=tree_observation)

    if render:
        env_renderer = RenderTool(env, gl="PGL")

    action_dict = dict()
    scores = []
    completions = []
    nb_steps = []
    inference_times = []
    preproc_times = []
    agent_times = []
    step_times = []

    for episode_idx in range(n_eval_episodes):
        seed += 1

        inference_timer = Timer()
        preproc_timer = Timer()
        agent_timer = Timer()
        step_timer = Timer()

        step_timer.start()
        obs, info = env.reset(regenerate_rail=True,
                              regenerate_schedule=True,
                              random_seed=seed)
        step_timer.end()

        agent_obs = [None] * env.get_num_agents()
        score = 0.0

        if render:
            env_renderer.set_new_rail()

        final_step = 0
        skipped = 0

        nb_hit = 0
        agent_last_obs = {}
        agent_last_action = {}

        for step in range(max_steps - 1):
            if allow_skipping and check_if_all_blocked(env):
                # FIXME why -1? bug where all agents are "done" after max_steps!
                skipped = max_steps - step - 1
                final_step = max_steps - 2
                n_unfinished_agents = sum(not done[idx]
                                          for idx in env.get_agent_handles())
                score -= skipped * n_unfinished_agents
                break

            agent_timer.start()
            for agent in env.get_agent_handles():
                if obs[agent] and info['action_required'][agent]:
                    if agent in agent_last_obs and np.all(
                            agent_last_obs[agent] == obs[agent]):
                        nb_hit += 1
                        action = agent_last_action[agent]

                    else:
                        preproc_timer.start()
                        norm_obs = normalize_observation(
                            obs[agent],
                            tree_depth=observation_tree_depth,
                            observation_radius=observation_radius)
                        preproc_timer.end()

                        inference_timer.start()
                        action = policy.act(norm_obs, eps=0.0)
                        inference_timer.end()

                    action_dict.update({agent: action})

                    if allow_caching:
                        agent_last_obs[agent] = obs[agent]
                        agent_last_action[agent] = action
            agent_timer.end()

            step_timer.start()
            obs, all_rewards, done, info = env.step(action_dict)
            step_timer.end()

            if render:
                env_renderer.render_env(show=True,
                                        frames=False,
                                        show_observations=False,
                                        show_predictions=False)

                if step % 100 == 0:
                    print("{}/{}".format(step, max_steps - 1))

            for agent in env.get_agent_handles():
                score += all_rewards[agent]

            final_step = step

            if done['__all__']:
                break

        normalized_score = score / (max_steps * env.get_num_agents())
        scores.append(normalized_score)

        tasks_finished = sum(done[idx] for idx in env.get_agent_handles())
        completion = tasks_finished / max(1, env.get_num_agents())
        completions.append(completion)

        nb_steps.append(final_step)

        inference_times.append(inference_timer.get())
        preproc_times.append(preproc_timer.get())
        agent_times.append(agent_timer.get())
        step_times.append(step_timer.get())

        skipped_text = ""
        if skipped > 0:
            skipped_text = "\t⚡ Skipped {}".format(skipped)

        hit_text = ""
        if nb_hit > 0:
            hit_text = "\t⚡ Hit {} ({:.1f}%)".format(nb_hit, (100 * nb_hit) /
                                                     (n_agents * final_step))

        print(
            "☑️  Score: {:.3f} \tDone: {:.1f}% \tNb steps: {:.3f} "
            "\t🍭 Seed: {}"
            "\t🚉 Env: {:.3f}s  "
            "\t🤖 Agent: {:.3f}s (per step: {:.3f}s) \t[preproc: {:.3f}s \tinfer: {:.3f}s]"
            "{}{}".format(normalized_score, completion * 100.0, final_step,
                          seed, step_timer.get(), agent_timer.get(),
                          agent_timer.get() / final_step, preproc_timer.get(),
                          inference_timer.get(), skipped_text, hit_text))

    return scores, completions, nb_steps, agent_times, step_times
Example #16
def eval_policy(env_params, checkpoint, n_eval_episodes, max_steps,
                action_size, state_size, seed, render, allow_skipping,
                allow_caching):
    # Evaluation is faster on CPU (except if you use a really huge policy)
    parameters = {'use_gpu': False}

    # policy = DDDQNPolicy(state_size, action_size, Namespace(**parameters), evaluation_mode=True)
    # policy.qnetwork_local = torch.load(checkpoint, map_location={'cuda:0': 'cpu'})

    env_params = Namespace(**env_params)

    # Environment parameters
    n_agents = env_params.n_agents
    x_dim = env_params.x_dim
    y_dim = env_params.y_dim
    n_cities = env_params.n_cities
    max_rails_between_cities = env_params.max_rails_between_cities
    max_rails_in_city = env_params.max_rails_in_city

    agents = []
    for agent_id in range(n_agents):
        agent = AttentionAgent(num_in_pol=state_size,
                               num_out_pol=action_size,
                               hidden_dim=256,
                               lr=0.001)

        agent.policy = torch.load(os.path.join(
            checkpoint, f'2300_agent{agent_id}' + '.pth'),
                                  map_location=torch.device('cpu'))
        agent.policy.eval()

        agents.append(agent)

    # Malfunction and speed profiles
    # TODO pass these parameters properly from main!
    malfunction_parameters = MalfunctionParameters(
        malfunction_rate=1. / 2000,  # Rate of malfunctions
        min_duration=20,  # Minimal duration
        max_duration=50  # Max duration
    )

    # Only fast trains in Round 1
    speed_profiles = {
        1.: 1.0,  # Fast passenger train
        1. / 2.: 0.0,  # Fast freight train
        1. / 3.: 0.0,  # Slow commuter train
        1. / 4.: 0.0  # Slow freight train
    }

    # Observation parameters
    observation_tree_depth = env_params.observation_tree_depth
    observation_radius = env_params.observation_radius
    observation_max_path_depth = env_params.observation_max_path_depth

    # Observation builder
    predictor = ShortestPathPredictorForRailEnv(observation_max_path_depth)
    tree_observation = TreeObsForRailEnv(max_depth=observation_tree_depth,
                                         predictor=predictor)

    # Setup the environment
    env = RailEnv(
        width=x_dim,
        height=y_dim,
        rail_generator=sparse_rail_generator(
            max_num_cities=n_cities,
            grid_mode=False,
            max_rails_between_cities=max_rails_between_cities,
            max_rails_in_city=max_rails_in_city,
        ),
        # rail_generator = complex_rail_generator(
        #     nr_start_goal=10,
        #     nr_extra=10,
        #     min_dist=10,
        #     max_dist=99999,
        #     seed=1
        # ),
        schedule_generator=sparse_schedule_generator(speed_profiles),
        number_of_agents=n_agents,
        malfunction_generator_and_process_data=malfunction_from_params(
            malfunction_parameters),
        obs_builder_object=tree_observation)

    if render:
        # env_renderer = RenderTool(env, gl="PGL")
        env_renderer = RenderTool(
            env,
            # gl="PGL",
            agent_render_variant=AgentRenderVariant.
            AGENT_SHOWS_OPTIONS_AND_BOX,
            show_debug=False,
            screen_height=600,  # Adjust these parameters to fit your resolution
            screen_width=800)

    action_dict = dict()
    scores = []
    completions = []
    nb_steps = []
    inference_times = []
    preproc_times = []
    agent_times = []
    step_times = []

    for agent_id in range(n_agents):
        action_dict[agent_id] = 0

    for episode_idx in range(n_eval_episodes):
        images = []
        seed += 1

        inference_timer = Timer()
        preproc_timer = Timer()
        agent_timer = Timer()
        step_timer = Timer()

        step_timer.start()
        obs, info = env.reset(regenerate_rail=True,
                              regenerate_schedule=True,
                              random_seed=seed)
        step_timer.end()

        agent_obs = [None] * env.get_num_agents()
        score = 0.0

        if render:
            env_renderer.set_new_rail()

        final_step = 0
        skipped = 0

        nb_hit = 0
        agent_last_obs = {}
        agent_last_action = {}

        for step in range(max_steps - 1):
            # time.sleep(0.2)
            if allow_skipping and check_if_all_blocked(env):
                # FIXME why -1? bug where all agents are "done" after max_steps!
                skipped = max_steps - step - 1
                final_step = max_steps - 2
                n_unfinished_agents = sum(not done[idx]
                                          for idx in env.get_agent_handles())
                score -= skipped * n_unfinished_agents
                break

            agent_timer.start()
            for agent in env.get_agent_handles():
                agent_model = agents[agent]
                if obs[agent] and info['action_required'][agent]:
                    if agent in agent_last_obs and np.all(
                            agent_last_obs[agent] == obs[agent]):
                        nb_hit += 1
                        action = agent_last_action[agent]

                    else:
                        preproc_timer.start()
                        norm_obs = normalize_observation(
                            obs[agent],
                            tree_depth=observation_tree_depth,
                            observation_radius=observation_radius)
                        preproc_timer.end()

                        inference_timer.start()
                        action = act(agent_model, norm_obs)
                        inference_timer.end()

                    action_dict.update({agent: action})

                    if allow_caching:
                        agent_last_obs[agent] = obs[agent]
                        agent_last_action[agent] = action
            agent_timer.end()

            step_timer.start()
            obs, all_rewards, done, info = env.step(action_dict)
            step_timer.end()

            if render:
                env_renderer.render_env(show=True,
                                        frames=False,
                                        show_observations=False,
                                        show_predictions=False)

                im = env_renderer.get_image()
                im = PIL.Image.fromarray(im)
                images.append(im)

                if step % 100 == 0:
                    print("{}/{}".format(step, max_steps - 1))

            for agent in env.get_agent_handles():
                score += all_rewards[agent]

            final_step = step

            if done['__all__']:
                break

        if render:
            for _ in range(10):
                images.append(images[len(images) - 1])

            # save video
            images[0].save(
                f'/Users/nikhilvs/repos/nyu/flatland-reinforcement-learning/videos/maac-final/out_{episode_idx}.gif',
                save_all=True,
                append_images=images[1:],
                optimize=False,
                duration=60,
                loop=0)

        normalized_score = score / (max_steps * env.get_num_agents())
        scores.append(normalized_score)

        tasks_finished = sum(done[idx] for idx in env.get_agent_handles())
        completion = tasks_finished / max(1, env.get_num_agents())
        completions.append(completion)

        nb_steps.append(final_step)

        inference_times.append(inference_timer.get())
        preproc_times.append(preproc_timer.get())
        agent_times.append(agent_timer.get())
        step_times.append(step_timer.get())

        skipped_text = ""
        if skipped > 0:
            skipped_text = "\t⚡ Skipped {}".format(skipped)

        hit_text = ""
        if nb_hit > 0:
            hit_text = "\t⚡ Hit {} ({:.1f}%)".format(nb_hit, (100 * nb_hit) /
                                                     (n_agents * final_step))

        print(
            "☑️  Score: {:.3f} \tDone: {:.1f}% \tNb steps: {:.3f} "
            "\t🍭 Seed: {}"
            "\t🚉 Env: {:.3f}s  "
            "\t🤖 Agent: {:.3f}s (per step: {:.3f}s) \t[preproc: {:.3f}s \tinfer: {:.3f}s]"
            "{}{}".format(normalized_score, completion * 100.0, final_step,
                          seed, step_timer.get(), agent_timer.get(),
                          agent_timer.get() / final_step, preproc_timer.get(),
                          inference_timer.get(), skipped_text, hit_text))

    return scores, completions, nb_steps, agent_times, step_times
Example #17
def eval_policy(env_params, checkpoint, n_eval_episodes, max_steps, seed,
                render):
    # evaluation is faster on CPU, except if you have huge networks
    parameters = {'use_gpu': False}

    policy = DDDQNPolicy(state_size,
                         action_size,
                         Namespace(**parameters),
                         evaluation_mode=True)
    policy.qnetwork_local = torch.load(checkpoint)

    env_params = Namespace(**env_params)

    # Environment parameters
    n_agents = env_params.n_agents
    x_dim = env_params.x_dim
    y_dim = env_params.y_dim
    n_cities = env_params.n_cities
    max_rails_between_cities = env_params.max_rails_between_cities
    max_rails_in_city = env_params.max_rails_in_city

    # Observation parameters
    observation_tree_depth = env_params.observation_tree_depth
    observation_radius = env_params.observation_radius
    observation_max_path_depth = env_params.observation_max_path_depth

    # Malfunction and speed profiles
    # TODO pass these parameters properly from main!
    malfunction_parameters = MalfunctionParameters(
        malfunction_rate=1. / 2000,  # Rate of malfunctions
        min_duration=20,  # Minimal duration
        max_duration=50  # Max duration
    )
    speed_profiles = {
        1.: 1.0,  # Fast passenger train
        1. / 2.: 0.0,  # Fast freight train
        1. / 3.: 0.0,  # Slow commuter train
        1. / 4.: 0.0  # Slow freight train
    }

    # Observation builder
    predictor = ShortestPathPredictorForRailEnv(observation_max_path_depth)
    tree_observation = TreeObsForRailEnv(max_depth=observation_tree_depth,
                                         predictor=predictor)

    # Setup the environment
    env = RailEnv(
        width=x_dim,
        height=y_dim,
        rail_generator=sparse_rail_generator(
            max_num_cities=n_cities,
            grid_mode=False,
            max_rails_between_cities=max_rails_between_cities,
            max_rails_in_city=max_rails_in_city),
        schedule_generator=sparse_schedule_generator(speed_profiles),
        number_of_agents=n_agents,
        malfunction_generator_and_process_data=malfunction_from_params(
            malfunction_parameters),
        obs_builder_object=tree_observation,
        random_seed=seed)
    env.reset(True, True)

    if render:
        env_renderer = RenderTool(env, gl="PGL")

    action_dict = dict()
    scores = []
    completions = []
    nb_steps = []
    inference_times = []
    preproc_times = []
    agent_times = []
    step_times = []

    for episode_idx in range(n_eval_episodes):
        inference_timer = Timer()
        preproc_timer = Timer()
        agent_timer = Timer()
        step_timer = Timer()

        agent_obs = [None] * env.get_num_agents()
        score = 0.0

        step_timer.start()
        obs, info = env.reset(regenerate_rail=True, regenerate_schedule=True)
        step_timer.end()

        if render:
            env_renderer.set_new_rail()

        final_step = 0

        for step in range(max_steps - 1):
            agent_timer.start()
            for agent in env.get_agent_handles():
                if obs[agent]:
                    preproc_timer.start()
                    agent_obs[agent] = normalize_observation(
                        obs[agent],
                        tree_depth=observation_tree_depth,
                        observation_radius=observation_radius)
                    preproc_timer.end()

                action = 0
                if info['action_required'][agent]:
                    inference_timer.start()
                    action = policy.act(agent_obs[agent], eps=0.0)
                    inference_timer.end()
                action_dict.update({agent: action})
            agent_timer.end()

            step_timer.start()
            obs, all_rewards, done, info = env.step(action_dict)
            step_timer.end()

            if render:
                env_renderer.render_env(show=True,
                                        frames=False,
                                        show_observations=False,
                                        show_predictions=False)

            for agent in env.get_agent_handles():
                score += all_rewards[agent]

            final_step = step

            if done['__all__']:
                break

        normalized_score = score / (max_steps * env.get_num_agents())
        scores.append(normalized_score)

        tasks_finished = sum(done[idx] for idx in env.get_agent_handles())
        completion = tasks_finished / max(1, env.get_num_agents())
        completions.append(completion)

        nb_steps.append(final_step)

        inference_times.append(inference_timer.get())
        preproc_times.append(preproc_timer.get())
        agent_times.append(agent_timer.get())
        step_times.append(step_timer.get())

        print(
            "☑️  Score: {:.3f} \tDone: {:.1f}% \tNb steps: {:.3f} "
            "\t🚉 Env: {:.3f}s "
            "\t🤖 Agent: {:.3f}s (per step: {:.3f}s) \t[preproc: {:.3f}s \tinfer: {:.3f}s]"
            .format(normalized_score, completion * 100.0, final_step,
                    step_timer.get(), agent_timer.get(),
                    agent_timer.get() / final_step, preproc_timer.get(),
                    inference_timer.get()))

    return scores, completions, nb_steps, agent_times, step_times
Example #18
def test_initial_malfunction_do_nothing():
    stochastic_data = MalfunctionParameters(malfunction_rate=70,  # Rate of malfunction occurrence
                                            min_duration=2,  # Minimal duration of malfunction
                                            max_duration=5  # Max duration of malfunction
                                            )

    rail, rail_map = make_simple_rail2()

    env = RailEnv(width=25,
                  height=30,
                  rail_generator=rail_from_grid_transition_map(rail),
                  schedule_generator=random_schedule_generator(),
                  number_of_agents=1,
                  malfunction_generator_and_process_data=malfunction_from_params(stochastic_data),
                  # Malfunction data generator
                  )
    env.reset()
    set_penalties_for_replay(env)
    replay_config = ReplayConfig(
        replay=[
            Replay(
                position=None,
                direction=Grid4TransitionsEnum.EAST,
                action=RailEnvActions.MOVE_FORWARD,
                set_malfunction=3,
                malfunction=3,
                reward=env.step_penalty,  # full step penalty while malfunctioning
                status=RailAgentStatus.READY_TO_DEPART
            ),
            Replay(
                position=(3, 2),
                direction=Grid4TransitionsEnum.EAST,
                action=RailEnvActions.DO_NOTHING,
                malfunction=2,
                reward=env.step_penalty,  # full step penalty while malfunctioning
                status=RailAgentStatus.ACTIVE
            ),
            # malfunction stops in the next step and we're still at the beginning of the cell
            # --> if we take action DO_NOTHING, agent should restart without moving
            #
            Replay(
                position=(3, 2),
                direction=Grid4TransitionsEnum.EAST,
                action=RailEnvActions.DO_NOTHING,
                malfunction=1,
                reward=env.step_penalty,  # full step penalty while stopped
                status=RailAgentStatus.ACTIVE
            ),
            # we haven't started moving yet --> stay here
            Replay(
                position=(3, 2),
                direction=Grid4TransitionsEnum.EAST,
                action=RailEnvActions.DO_NOTHING,
                malfunction=0,
                reward=env.step_penalty,  # full step penalty while stopped
                status=RailAgentStatus.ACTIVE
            ),

            Replay(
                position=(3, 2),
                direction=Grid4TransitionsEnum.EAST,
                action=RailEnvActions.MOVE_FORWARD,
                malfunction=0,
                reward=env.start_penalty + env.step_penalty * 1.0,  # start penalty + step penalty for speed 1.0
                status=RailAgentStatus.ACTIVE
            ),  # we start to move forward --> should go to next cell now
            Replay(
                position=(3, 3),
                direction=Grid4TransitionsEnum.EAST,
                action=RailEnvActions.MOVE_FORWARD,
                malfunction=0,
                reward=env.step_penalty * 1.0,  # step penalty for speed 1.0
                status=RailAgentStatus.ACTIVE
            )
        ],
        speed=env.agents[0].speed_data['speed'],
        target=env.agents[0].target,
        initial_position=(3, 2),
        initial_direction=Grid4TransitionsEnum.EAST,
    )
    run_replay_config(env, [replay_config], activate_agents=False)
Example #19
def test_initial_malfunction():
    stochastic_data = MalfunctionParameters(malfunction_rate=1000,  # Rate of malfunction occurrence
                                            min_duration=2,  # Minimal duration of malfunction
                                            max_duration=5  # Max duration of malfunction
                                            )

    rail, rail_map = make_simple_rail2()

    env = RailEnv(width=25,
                  height=30,
                  rail_generator=rail_from_grid_transition_map(rail),
                  schedule_generator=random_schedule_generator(seed=10),
                  number_of_agents=1,
                  malfunction_generator_and_process_data=malfunction_from_params(stochastic_data),
                  # Malfunction data generator
                  obs_builder_object=SingleAgentNavigationObs()
                  )
    # reset to initialize agents_static
    env.reset(False, False, True, random_seed=10)
    print(env.agents[0].malfunction_data)
    env.agents[0].target = (0, 5)
    set_penalties_for_replay(env)
    replay_config = ReplayConfig(
        replay=[
            Replay(
                position=(3, 2),
                direction=Grid4TransitionsEnum.EAST,
                action=RailEnvActions.MOVE_FORWARD,
                set_malfunction=3,
                malfunction=3,
                reward=env.step_penalty  # full step penalty when malfunctioning
            ),
            Replay(
                position=(3, 2),
                direction=Grid4TransitionsEnum.EAST,
                action=RailEnvActions.MOVE_FORWARD,
                malfunction=2,
                reward=env.step_penalty  # full step penalty when malfunctioning
            ),
            # malfunction stops in the next step and we're still at the beginning of the cell
            # --> if we take action MOVE_FORWARD, agent should restart and move to the next cell
            Replay(
                position=(3, 2),
                direction=Grid4TransitionsEnum.EAST,
                action=RailEnvActions.MOVE_FORWARD,
                malfunction=1,
                reward=env.step_penalty

            ),  # malfunctioning ends: starting and running at speed 1.0
            Replay(
                position=(3, 2),
                direction=Grid4TransitionsEnum.EAST,
                action=RailEnvActions.MOVE_FORWARD,
                malfunction=0,
                reward=env.start_penalty + env.step_penalty * 1.0  # running at speed 1.0
            ),
            Replay(
                position=(3, 3),
                direction=Grid4TransitionsEnum.EAST,
                action=RailEnvActions.MOVE_FORWARD,
                malfunction=0,
                reward=env.step_penalty  # running at speed 1.0
            )
        ],
        speed=env.agents[0].speed_data['speed'],
        target=env.agents[0].target,
        initial_position=(3, 2),
        initial_direction=Grid4TransitionsEnum.EAST,
    )
    run_replay_config(env, [replay_config])
Example #20
flags = parser.parse_args()


# Seeded RNG so we can replicate our results
np.random.seed(0)

# We need to either load in some pre-generated railways from disk, or else create a random railway generator.
if flags.load_railways:
    rail_generator, schedule_generator = load_precomputed_railways(project_root, flags)
else:
    rail_generator, schedule_generator = create_random_railways(project_root)

# Create the Flatland environment
env = RailEnv(width=flags.grid_width, height=flags.grid_height, number_of_agents=flags.num_agents,
              rail_generator=rail_generator,
              schedule_generator=schedule_generator,
              malfunction_generator=ParamMalfunctionGen(MalfunctionParameters(1 / 8000, 15, 50)),
              obs_builder_object=TreeObservation(max_depth=flags.tree_depth))

# After training we want to render the results so we also load a renderer
env_renderer = RenderTool(env, gl="PILSVG", screen_width=800, screen_height=800, agent_render_variant=AgentRenderVariant.AGENT_SHOWS_OPTIONS_AND_BOX)

# Calculate the state size based on the number of nodes in the tree observation
num_features_per_node = env.obs_builder.observation_dim
num_nodes = sum(np.power(4, i) for i in range(flags.tree_depth + 1))
state_size = num_nodes * num_features_per_node
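# For example, with flags.tree_depth = 2 the tree covers 4**0 + 4**1 + 4**2 = 21
# nodes; assuming observation_dim is 11 (the value of Flatland's standard tree
# observation), the state vector has 21 * 11 = 231 entries.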
action_size = 5

# Add some variables to keep track of the progress
scores_window, steps_window, collisions_window, done_window = [deque(maxlen=200) for _ in range(4)]
agent_obs = [None] * flags.num_agents
agent_obs_buffer = [None] * flags.num_agents
Example #21
def curriculum_learning():
    num_levels = 70
    num_episodes_in_env = 20
    model_saving_name = ""
    """
    curriculum = Manual_Curriculum("curriculum.yml")
    """

    curriculum = Semi_Auto_Curriculum(
        offset_curriculum_generator(
            num_levels,
            {
                "x_dim": 25,
                "y_dim": 25,
                "n_agents": 1.75,
                "n_cities": 3,
                # "n_extra": 3,
                # "min_dist": 2,
                # "max_dist": 5
                "max_rails_between_cities": 1,
                "max_rails_in_city": 1,
                "malfunction_level": 0,
                "speed_level": 0
            }),
        num_levels)

    try:
        myseed = 19
        level = 0
        threshold = 0.57

        while True:
            if curriculum.get("speed_level") < 1:
                speed_profiles = {
                    1.: 1.0,
                    1. / 2.: 0.0,
                    1. / 3.: 0.0,
                    1. / 4.: 0.0
                }
            elif curriculum.get("speed_level") < 2:
                speed_profiles = {
                    1.: 0.75,
                    1. / 2.: 0.25,
                    1. / 3.: 0.0,
                    1. / 4.: 0.0
                }
            elif curriculum.get("speed_level") < 3:
                speed_profiles = {
                    1.: 0.4,
                    1. / 2.: 0.3,
                    1. / 3.: 0.2,
                    1. / 4.: 0.1
                }
            else:
                speed_profiles = {
                    1.: 0.25,
                    1. / 2.: 0.25,
                    1. / 3.: 0.25,
                    1. / 4.: 0.25
                }

            if curriculum.get("malfunction_level") < 1:
                malfunction_rate = 0.0
            elif curriculum.get("malfunction_level") < 2:
                malfunction_rate = 0.0025
            elif curriculum.get("malfunction_level") < 3:
                malfunction_rate = 0.0055
            else:
                malfunction_rate = 0.0075

            print("=" * 203)
            print("Running the level {}".format(level))
            print("=" * 203)

            print(
                "{}x{} grid, {} agents, {} cities, {} rails between cities, {} rails in cities, malfunctions rate {} "
                "and speed {}. Completion threshold of {}".format(
                    curriculum.get("x_dim"), curriculum.get("y_dim"),
                    curriculum.get("n_agents"), curriculum.get("n_cities"),
                    curriculum.get("max_rails_between_cities"),
                    curriculum.get("max_rails_in_city"), malfunction_rate,
                    speed_profiles, threshold))

            try_outs = 0
            completion = 0

            while try_outs < 10 and completion < threshold:
                namefile = "curriculum_{}_{}_{}".format(
                    myseed, level, try_outs)

                environment_parameters = {
                    "n_agents": curriculum.get("n_agents"),
                    "x_dim": curriculum.get("x_dim"),
                    "y_dim": curriculum.get("y_dim"),
                    "n_cities": curriculum.get("n_cities"),
                    # Old curriculum
                    # "n_extra": curriculum.get("n_extra"),
                    # "min_dist": curriculum.get("min_dist"),
                    # "max_dist": curriculum.get("max_dist"),
                    "max_rails_between_cities": curriculum.get("max_rails_between_cities"),
                    "max_rails_in_city": curriculum.get("max_rails_in_city"),
                    "seed": myseed,
                    "observation_tree_depth": 2,
                    "observation_radius": 10,
                    "observation_max_path_depth": 30,

                    # Malfunctions
                    "malfunction_parameters": MalfunctionParameters(
                        malfunction_rate=malfunction_rate,
                        min_duration=15,
                        max_duration=50),
                    # Speeds
                    "speed_profiles": speed_profiles,

                    # ============================
                    # Custom observations&rewards
                    # ============================
                    "custom_observations": False,
                    "reward_shaping": True,
                    "uniform_reward": True,
                    "stop_penalty": -0.0,
                    "invalid_action_penalty": -0.0,
                    "deadlock_penalty": -15.0,
                    # 1.0 for skipping
                    "shortest_path_penalty_coefficient": 1 + 1 / 15,
                    "done_bonus": 1 / 15,
                }

                training_parameters = {
                    # ============================
                    # Network architecture
                    # ============================
                    # Shared actor-critic layer
                    # If shared is True then the considered sizes are taken from the critic
                    "shared": False,
                    "shared_recurrent": False,
                    "linear_size": 128,
                    "hidden_size": 64,
                    # Critic network
                    "critic_mlp_width": 128,
                    "critic_mlp_depth": 3,
                    "last_critic_layer_scaling": 0.1,
                    # Actor network
                    "actor_mlp_width": 128,
                    "actor_mlp_depth": 3,
                    "last_actor_layer_scaling": 0.01,
                    "learning_rate": 0.001,
                    "adam_eps": 1e-5,
                    "activation": "Tanh",
                    "lmbda": 0.95,
                    "entropy_coefficient": 0.01,
                    "value_loss_coefficient": 0.001,

                    # ============================
                    # Training setup
                    # ============================
                    "n_episodes": num_episodes_in_env,
                    "horizon": 512,
                    "epochs": 8,
                    "batch_size": 32,
                    "batch_mode": "shuffle",

                    # ============================
                    # Normalization and clipping
                    # ============================
                    "discount_factor": 0.99,
                    "max_grad_norm": 0.5,
                    "eps_clip": 0.2,

                    # ============================
                    # Advantage estimation
                    # ============================
                    "advantage_estimator": "gae",

                    # ============================
                    # Optimization and rendering
                    # ============================
                    "checkpoint_interval": num_episodes_in_env,
                    "evaluation_mode": False,
                    "eval_episodes": None,
                    "use_gpu": False,
                    "render": False,
                    "print_stats": True,
                    "wandb_project": "flatland-challenge-ps-ppo-curriculum",
                    "wandb_entity": "lomb",
                    "wandb_tag": "curriculum_{}".format(myseed),
                    "save_model_path": "/content/drive/My Drive/Colab Notebooks/models/" + namefile + ".pt",
                    "load_model_path": "/content/drive/My Drive/Colab Notebooks/models/" + model_saving_name + ".pt",
                    "tensorboard_path": "log_" + namefile + "/",
                    "automatic_name_saving": False,

                    # ============================
                    # Action Masking
                    # ============================
                    "action_masking": True,
                    "allow_no_op": False
                }

                print("\nLevel %d try out number % d" % (level, try_outs))
                # Train
                _, completions, _, _ = train_multiple_agents(
                    Namespace(**environment_parameters),
                    Namespace(**training_parameters))
                try_outs += 1
                model_saving_name = namefile
                completion = np.mean(completions)
            print("\n" + "=" * 203)

            # Update curriculum
            curriculum.update()
            level += 1
    except StopIteration:
        return
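
The curriculum object driving this loop is not shown in the snippet. The loop only assumes three things of it: get(key) returns the current level's value for a parameter, update() advances to the next level, and exhausting the levels raises StopIteration. A minimal sketch satisfying that contract (hypothetical, not from the original source) could be:

class ListCurriculum:
    """Hypothetical minimal curriculum: a list of per-level parameter dicts."""

    def __init__(self, levels):
        self.levels = levels  # e.g. [{"x_dim": 20, "n_agents": 2, ...}, ...]
        self.index = 0

    def get(self, key):
        # Current level's value for a parameter such as "x_dim" or "n_agents"
        return self.levels[self.index][key]

    def update(self):
        # Advance to the next level; once every level is exhausted, signal
        # the enclosing training loop to stop
        self.index += 1
        if self.index >= len(self.levels):
            raise StopIteration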
Example #22
def train(env):
    n_agents = env["n_agents"]
    x_dim = env["x_dim"]
    y_dim = env["y_dim"]
    n_cities = env["n_cities"]
    max_rails_between_cities = env["max_rails_between_cities"]
    max_rails_in_city = env["max_rails_in_city"]
    seed = 0
    use_fast_tree_obs = False

    # Observation parameters
    observation_tree_depth = 4
    observation_radius = 10
    observation_max_path_depth = 30

    # Set the seeds
    random.seed(seed)
    np.random.seed(seed)

    # Break agents from time to time
    malfunction_parameters = MalfunctionParameters(
        malfunction_rate=1. / 10000,  # Rate of malfunctions
        min_duration=15,  # Minimal duration
        max_duration=50  # Max duration
    )

    # Observation builder
    predictor = ShortestPathPredictorForRailEnv(observation_max_path_depth)
    tree_observation = None

    if use_fast_tree_obs:
        tree_observation = FastTreeObs(max_depth=observation_tree_depth)
        print("Using FastTreeObs")
    else:
        tree_observation = TreeObsForRailEnv(max_depth=observation_tree_depth,
                                             predictor=predictor)
        print("Using StandardTreeObs")

    speed_profiles = {
        1.: 1.0,  # Fast passenger train
        1. / 2.: 0.0,  # Fast freight train
        1. / 3.: 0.0,  # Slow commuter train
        1. / 4.: 0.0  # Slow freight train
    }

    env = RailEnv(
        width=x_dim,
        height=y_dim,
        rail_generator=sparse_rail_generator(
            max_num_cities=n_cities,
            grid_mode=False,
            max_rails_between_cities=max_rails_between_cities,
            max_rails_in_city=max_rails_in_city),
        schedule_generator=sparse_schedule_generator(speed_profiles),
        number_of_agents=n_agents,
        malfunction_generator_and_process_data=malfunction_from_params(
            malfunction_parameters),
        obs_builder_object=tree_observation,
        random_seed=seed)

    rewards = []
    obs, info = env.reset()

    if use_fast_tree_obs:
        state_size = tree_observation.observation_dim
    else:
        # Calculate the state size given the depth of the tree observation and the
        # number of features
        n_features_per_node = env.obs_builder.observation_dim
        n_nodes = 0
        for i in range(observation_tree_depth + 1):
            n_nodes += np.power(4, i)

        state_size = n_features_per_node * n_nodes
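        # For observation_tree_depth = 4 this sums 1 + 4 + 16 + 64 + 256 = 341
        # nodes, so state_size = 341 * n_features_per_node.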

    action_size = 5

    DEVICE = 'cpu'
    # if torch.cuda.is_available():
    #     DEVICE = 'cuda'  # torch device strings are 'cpu' / 'cuda'

    buffer_length = 10000
    steps_to_save_model = 10
    step_size = 100
    num_steps = 100  # update every 100 steps
    avg_steps = 20  # num steps to average and plot rewards
    reward_q = []
    batch_size = 100

    agent_obs = np.array([None] * env.get_num_agents())

    max_steps = int(4 * 2 * (env.height + env.width + (n_agents / n_cities)))
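    # Same max-steps formula used for the official evaluations; see the
    # comment in Example #27 below.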
    num_episodes = 100000

    agent_init_params = []
    sa_size = []

    for i in range(n_agents):
        agent_init_params.append({
            'num_in_pol': state_size,
            'num_out_pol': action_size,
            'init_weights': 'model.pt'
        })
        sa_size.append((state_size, action_size))

    hyperparams = {
        "tau": 0.01,
        "pi_lr": 0.00001,
        "q_lr": 0.00005,
        "pol_hidden_dim": 256,
        "critic_hidden_dim": 256,
        "attend_heads": 8
    }

    model = AttentionSAC(agent_init_params=agent_init_params,
                         sa_size=sa_size,
                         tau=hyperparams["tau"],
                         pi_lr=hyperparams["pi_lr"],
                         q_lr=hyperparams["q_lr"],
                         pol_hidden_dim=hyperparams["pol_hidden_dim"],
                         critic_hidden_dim=hyperparams["critic_hidden_dim"],
                         attend_heads=hyperparams["attend_heads"])
    model.init_dict = {}

    replay_buffer = ReplayBuffer(buffer_length, n_agents,
                                 [state_size for i in range(n_agents)],
                                 [action_size for i in range(n_agents)])

    print("MAX STEPS: " + str(max_steps))
    print("NUM EPISODES: ", num_episodes)
    print("HYPERPARAMS: ")
    print(hyperparams)

    start_time = time.time()

    for ep in range(num_episodes):
        print("Episode " + str(ep) + ":", flush=True)
        obs, info = env.reset(True, True)
        model.prep_rollouts(device=DEVICE)
        reward_sum_for_this_episode = 0

        for steps in range(max_steps):
            if steps % step_size == 0:
                print("=", end="", flush=True)
            for agent in env.get_agent_handles():
                if obs[agent] is not None:
                    if use_fast_tree_obs:
                        agent_obs[agent] = obs[agent]
                    else:
                        agent_obs[agent] = normalize_observation(
                            obs[agent],
                            observation_tree_depth,
                            observation_radius=observation_radius)
                else:
                    agent_obs[agent] = np.array([0.] * state_size)

            action_dict = {}
            agent_actions = []

            torch_obs = [
                Variable(torch.Tensor([agent_obs[i]]), requires_grad=False)
                for i in range(n_agents)
            ]
            torch_agent_actions = model.step(torch_obs, explore=True)
            agent_actions = [ac.data.numpy() for ac in torch_agent_actions]

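            # model.step returns one action distribution per agent; with
            # explore=True these are effectively one-hot, so the index of the
            # first non-zero entry is the sampled discrete action.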
            for i in range(n_agents):
                dist = torch_agent_actions[i][0]
                idx = -1
                for j in range(action_size):
                    if dist[j] != 0:
                        idx = j
                        break
                action_dict[i] = idx

            next_obs, all_rewards, done, info = env.step(action_dict)

            rewards = []
            dones = []

            next_agent_obs = np.array([None] * env.get_num_agents())

            for agent in env.get_agent_handles():
                if next_obs[agent] is not None:
                    if use_fast_tree_obs:
                        next_agent_obs[agent] = next_obs[agent]
                    else:
                        next_agent_obs[agent] = normalize_observation(
                            next_obs[agent],
                            observation_tree_depth,
                            observation_radius=observation_radius)
                else:
                    next_agent_obs[agent] = np.array([0.] * state_size)

            for i in range(n_agents):
                reward_sum_for_this_episode += all_rewards[i]
                rewards.append(all_rewards[i])
                all_rewards[i] += augment_reward(agent_obs[i])
                dones.append(done[i])

            replay_buffer.push(np.array([agent_obs]), np.array(agent_actions),
                               np.array([rewards]), np.array([next_agent_obs]),
                               np.array([dones]))

            if steps % num_steps == 0 and len(replay_buffer) >= batch_size:
                model.prep_training(device=DEVICE)
                sample = replay_buffer.sample(batch_size, norm_rews=False)
                #print(sample)
                model.update_critic(sample)
                model.update_policies(sample)
                model.update_all_targets()
                model.prep_rollouts(device=DEVICE)

        reward_sum_for_this_episode /= n_agents
        reward_q.append(reward_sum_for_this_episode)

        if len(reward_q) == avg_steps:
            wandb.log({'reward': np.mean(reward_q)})
            reward_q = []

        print()

        if ep % steps_to_save_model == 0:
            print("\nSaving model")
            model.save(os.getcwd() + "/model.pt")
            cur_time = time.time()
            time_elapsed = (cur_time - start_time) // 60
            print("Time Elapsed: " + str(time_elapsed) + "\n")
Example #23
def main(argv):

    try:
        opts, args = getopt.getopt(argv, "n:", ["n_trials="])
    except getopt.GetoptError:
        print('training_navigation.py -n <n_trials>')
        sys.exit(2)
    for opt, arg in opts:
        if opt in ('-n', '--n_trials'):
            n_trials = int(arg)

    random.seed(1)
    np.random.seed(1)

    # Parameters for the Environment
    x_dim = 35
    y_dim = 35
    n_agents = 1


    # Use the malfunction generator to break agents from time to time
    # stochastic_data = {'malfunction_rate': 8000,  # Rate of malfunction occurrence of single agent
    #                    'min_duration': 15,  # Minimal duration of malfunction
    #                    'max_duration': 50  # Max duration of malfunction
    #                    }
    stochastic_data = MalfunctionParameters(malfunction_rate=8000,  # Rate of malfunction occurrence
                                            min_duration=15,  # Minimal duration of malfunction
                                            max_duration=50  # Max duration of malfunction
                                            )

    # Custom observation builder
    TreeObservation = TreeObsForRailEnv(max_depth=2)

    # Different agent types (trains) with different speeds.
    speed_ratio_map = {1.: 0.,  # Fast passenger train
                       1. / 2.: 1.0,  # Fast freight train
                       1. / 3.: 0.0,  # Slow commuter train
                       1. / 4.: 0.0}  # Slow freight train

    env = RailEnv(width=x_dim,
                  height=y_dim,
                  rail_generator=sparse_rail_generator(max_num_cities=3,
                                                       # Number of cities in map (where train stations are)
                                                       seed=1,  # Random seed
                                                       grid_mode=False,
                                                       max_rails_between_cities=2,
                                                       max_rails_in_city=3),
                  schedule_generator=sparse_schedule_generator(speed_ratio_map),
                  number_of_agents=n_agents,
                  malfunction_generator_and_process_data=malfunction_from_params(stochastic_data),
                  # Malfunction data generator
                  obs_builder_object=TreeObservation)

    # After training we want to render the results so we also load a renderer
    env_renderer = RenderTool(env, gl="PILSVG", )
    # Given the depth of the tree observation and the number of features per node we get the following state_size
    num_features_per_node = env.obs_builder.observation_dim
    tree_depth = 2
    nr_nodes = 0
    for i in range(tree_depth + 1):
        nr_nodes += np.power(4, i)
    state_size = num_features_per_node * nr_nodes
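    # With tree_depth = 2 this gives 1 + 4 + 16 = 21 nodes per observation.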

    # The action space of flatland is 5 discrete actions
    action_size = 5

    # We set the number of episodes we would like to train on
    if 'n_trials' not in locals():
        n_trials = 50

    # And the max number of steps we want to take per episode
    max_steps = int(3 * (env.height + env.width))

    # Define training parameters
    eps = 1.
    eps_end = 0.005
    eps_decay = 0.998

    # And some variables to keep track of the progress
    action_dict = dict()
    final_action_dict = dict()
    scores_window = deque(maxlen=100)
    done_window = deque(maxlen=100)
    scores = []
    dones_list = []
    action_prob = [0] * action_size
    agent_obs = [None] * n_agents
    agent_next_obs = [None] * n_agents
    agent_obs_buffer = [None] * n_agents
    agent_action_buffer = [2] * n_agents
    cumulated_reward = np.zeros(n_agents)
    update_values = False
    # Now we load a Double dueling DQN agent
    agent = Agent(state_size, action_size)

    for trials in range(1, n_trials + 1):

        # Reset environment
        obs, info = env.reset(False, False)
        # env_renderer.reset()
        # Build agent specific observations
        for a in range(env.get_num_agents()):
            if obs[a]:
                agent_obs[a] = normalize_observation(obs[a], tree_depth, observation_radius=10)
                agent_obs_buffer[a] = agent_obs[a].copy()

        # Reset score and done
        score = 0
        env_done = 0

        # Run episode
        for step in range(max_steps):
            # Action
            for a in range(env.get_num_agents()):
                if info['action_required'][a]:
                    # If an action is required, we want to store the obs at that step as well as the action
                    update_values = True
                    action = agent.act(agent_obs[a], eps=eps)
                    action_prob[action] += 1
                else:
                    update_values = False
                    action = 0
                action_dict.update({a: action})

            # Environment step
            next_obs, all_rewards, done, info = env.step(action_dict)
            # Update replay buffer and train agent
            for a in range(env.get_num_agents()):
                # Only update the values when we are done or when an action was taken and thus relevant information is present
                if update_values or done[a]:
                    agent.step(agent_obs_buffer[a], agent_action_buffer[a], all_rewards[a],
                               agent_obs[a], done[a])
                    cumulated_reward[a] = 0.

                    agent_obs_buffer[a] = agent_obs[a].copy()
                    agent_action_buffer[a] = action_dict[a]
                if next_obs[a]:
                    agent_obs[a] = normalize_observation(next_obs[a], tree_depth, observation_radius=10)

                score += all_rewards[a] / env.get_num_agents()

            # env_renderer.render_env(show=True, show_observations=False, show_predictions=False)

            # Copy observation
            if done['__all__']:
                env_done = 1
                break

        # Epsilon decay
        eps = max(eps_end, eps_decay * eps)  # decrease epsilon
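        # e.g. starting from eps = 1.0, after 1000 episodes
        # eps = 0.998 ** 1000 ≈ 0.135 (until the eps_end floor of 0.005)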

        # Collection information about training
        tasks_finished = 0
        for _idx in range(env.get_num_agents()):
            if done[_idx] == 1:
                tasks_finished += 1
        done_window.append(tasks_finished / max(1, env.get_num_agents()))
        scores_window.append(score / max_steps)  # save most recent score
        scores.append(np.mean(scores_window))
        dones_list.append((np.mean(done_window)))

        print(
            '\rTraining {} Agents on ({},{}).\t Episode {}\t Average Score: {:.3f}\tDones: {:.2f}%\tEpsilon: {:.2f} \t Action Probabilities: \t {}'.format(
                env.get_num_agents(), x_dim, y_dim,
                trials,
                np.mean(scores_window),
                100 * np.mean(done_window),
                eps, action_prob / np.sum(action_prob)), end=" ")

        if trials % 100 == 0:
            print(
                '\rTraining {} Agents on ({},{}).\t Episode {}\t Average Score: {:.3f}\tDones: {:.2f}%\tEpsilon: {:.2f} \t Action Probabilities: \t {}'.format(
                    env.get_num_agents(), x_dim, y_dim,
                    trials,
                    np.mean(scores_window),
                    100 * np.mean(done_window),
                    eps, action_prob / np.sum(action_prob)))
            torch.save(agent.qnetwork_local.state_dict(),
                       './Nets/navigator_checkpoint' + str(trials) + '.pth')
            action_prob = [1] * action_size
Example #24
def main():
    np.random.seed(1)

    env = RailEnv(
        width=flags.grid_width,
        height=flags.grid_height,
        number_of_agents=flags.num_agents,
        rail_generator=rail_generator,
        schedule_generator=schedule_generator,
        malfunction_generator_and_process_data=malfunction_from_params(
            MalfunctionParameters(malfunction_rate=1 / 8000,
                                  min_duration=15,
                                  max_duration=50)),
        obs_builder_object=TreeObservation(max_depth=flags.tree_depth))

    # After training we want to render the results so we also load a renderer
    env_renderer = RenderTool(env, gl="PILSVG")

    # Calculate the state size based on the number of nodes in the tree observation
    num_features_per_node = env.obs_builder.observation_dim
    num_nodes = sum(np.power(4, i) for i in range(flags.tree_depth + 1))
    state_size = num_nodes * num_features_per_node
    action_size = 5

    # Now we load a double dueling DQN agent and initialize it from the checkpoint
    agent = Agent(state_size, action_size)
    if flags.load_from_checkpoint:
        start, eps = agent.load(project_root / 'checkpoints', 0, 1.0)
    else:
        start, eps = 0, 1.0

    # And some variables to keep track of the progress
    action_dict, final_action_dict = {}, {}
    scores_window, steps_window, done_window = deque(maxlen=200), deque(
        maxlen=200), deque(maxlen=200)
    action_prob = [0] * action_size
    agent_obs = [None] * flags.num_agents
    agent_obs_buffer = [None] * flags.num_agents
    agent_action_buffer = [2] * flags.num_agents

    max_steps = int(8 * (flags.grid_width + flags.grid_height))
    update_values = False
    start_time = time.time()

    # We don't want to retrain on old railway networks when we restart from a checkpoint, so we just loop
    # through the generators to get all the old networks out of the way
    if start > 0: print(f"Skipping {start} railways")
    for _ in range(0, start):
        rail_generator()
        schedule_generator()

    # Start the training loop
    for episode in range(start + 1, flags.num_episodes + 1):
        env_renderer.reset()
        obs, info = env.reset(True, True)
        score, steps_taken = 0, 0

        # Build agent specific observations
        for a in range(flags.num_agents):
            if obs[a]:
                agent_obs[a] = normalize_observation(obs[a], flags.tree_depth)
                agent_obs_buffer[a] = agent_obs[a].copy()

        # Run episode
        for step in range(max_steps):
            for a in range(flags.num_agents):
                # if not isinstance(obs[a].childs['L'], float) or not isinstance(obs[a].childs['R'], float):
                if info['action_required'][a]:
                    # If an action is required, we want to store the obs at that step as well as the action
                    update_values = True

                    # distances = { key: child.dist_min_to_target for key, child in obs[a].childs.items() if not isinstance(child, float) }
                    # action_key = min(distances, key=distances.get)
                    # action = { 'L': 1, 'F': 2, 'R': 3 }[action_key]
                    # action = np.argmin(agent_obs[a])

                    # action = np.random.randint(4)
                    action = agent.act(agent_obs[a], eps=eps)
                    action_dict[a] = action
                    action_prob[action] += 1
                    steps_taken += 1
                else:
                    update_values = False
                    action_dict[a] = 2

            # Environment step
            obs, all_rewards, done, info = env.step(action_dict)

            # Update replay buffer and train agent
            for a in range(flags.num_agents):
                # Only update the values when we are done or when an action was taken and thus relevant information is present
                if update_values or done[a]:
                    agent.step(agent_obs_buffer[a], agent_action_buffer[a],
                               all_rewards[a], agent_obs[a], done[a],
                               flags.train)
                    agent_obs_buffer[a] = agent_obs[a].copy()
                    agent_action_buffer[a] = action_dict[a]
                if obs[a]:
                    agent_obs[a] = normalize_observation(
                        obs[a], flags.tree_depth)

                score += all_rewards[a] / flags.num_agents

            # Render
            if flags.render_interval and episode % flags.render_interval == 0:
                render(env_renderer)
            if done['__all__']: break

        # Epsilon decay
        eps = max(0.01, flags.epsilon_decay * eps)

        # Save some training statistics in their respective deques
        tasks_finished = sum(done[i] for i in range(flags.num_agents))
        done_window.append(tasks_finished / max(1, flags.num_agents))
        scores_window.append(score / max_steps)
        steps_window.append(steps_taken)
        action_probs = ', '.join(f'{x:.3f}'
                                 for x in action_prob / np.sum(action_prob))

        print(
            f'\rTraining {flags.num_agents} Agents on ({flags.grid_width},{flags.grid_height}) \t '
            + f'Episode {episode} \t ' +
            f'Average Score: {np.mean(scores_window):.3f} \t ' +
            f'Average Steps Taken: {np.mean(steps_window):.1f} \t ' +
            f'Dones: {100 * np.mean(done_window):.2f}% \t ' +
            f'Epsilon: {eps:.2f} \t ' +
            f'Action Probabilities: {action_probs}',
            end=" ")

        if episode % flags.report_interval == 0:
            print(
                f'\rTraining {flags.num_agents} Agents on ({flags.grid_width},{flags.grid_height}) \t '
                + f'Episode {episode} \t ' +
                f'Average Score: {np.mean(scores_window):.3f} \t ' +
                f'Average Steps Taken: {np.mean(steps_window):.1f} \t ' +
                f'Dones: {100 * np.mean(done_window):.2f}% \t ' +
                f'Epsilon: {eps:.2f} \t ' +
                f'Action Probabilities: {action_probs} \t ' +
                f'Time taken: {time.time() - start_time:.2f}s')

            if flags.train:
                agent.save(project_root / 'checkpoints', episode, eps)
            start_time = time.time()
            action_prob = [1] * action_size
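
This example calls a render helper that is not defined in the snippet. A plausible stand-in, assuming it simply wraps the same render_env call pattern used by the other examples on this page, would be:

def render(env_renderer):
    # Assumed helper (not in the original snippet): draw the current
    # environment state with the RenderTool instance created in main().
    env_renderer.render_env(show=True, show_observations=False,
                            show_predictions=False)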
Example #25
from flatland.envs.agent_utils import RailAgentStatus
from flatland.envs.malfunction_generators import MalfunctionParameters, ParamMalfunctionGen
from flatland.envs.observations import TreeObsForRailEnv
from flatland.envs.rail_env import RailEnv
from flatland.envs.rail_generators import random_rail_generator, complex_rail_generator, sparse_rail_generator
from flatland.envs.schedule_generators import sparse_schedule_generator
from flatland.utils.rendertools import RenderTool

rail_generator = sparse_rail_generator(seed=0,
                                       max_num_cities=4,
                                       grid_mode=False,
                                       max_rails_between_cities=2,
                                       max_rails_in_city=2)

malfunction_generator = ParamMalfunctionGen(
    MalfunctionParameters(malfunction_rate=10,
                          min_duration=20,
                          max_duration=50))
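# Note: ParamMalfunctionGen and the plain `malfunction_generator=` keyword
# used below target a newer flatland-rl API than the
# `malfunction_generator_and_process_data=malfunction_from_params(...)`
# style seen in the other examples.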

speed_ratio_map = {1: 1}
schedule_generator = sparse_schedule_generator(speed_ratio_map)

n_agents = 5
env = RailEnv(
    width=25,
    height=25,
    rail_generator=rail_generator,
    schedule_generator=schedule_generator,
    number_of_agents=n_agents,
    malfunction_generator=malfunction_generator,
    obs_builder_object=TreeObsForRailEnv(
        # The source snippet is cut off here; max_depth=2 and the closing
        # parentheses are a minimal, assumed completion.
        max_depth=2))
Example #26
def train():
    seed = 14

    namefile = "d3qn_1_mm5"
    print("Running {}".format(namefile))

    environment_parameters = {
        "n_agents": 5,
        "x_dim": 16 * 3,
        "y_dim": 9 * 3,
        "n_cities": 5,
        "max_rails_between_cities": 2,
        "max_rails_in_city": 3,
        "seed": seed,
        "observation_tree_depth": 2,
        "observation_radius": 10,
        "observation_max_path_depth": 30,
        # Malfunctions
        "malfunction_parameters": MalfunctionParameters(
            malfunction_rate=0.005,
            min_duration=15,
            max_duration=50),
        # Speeds
        "speed_profiles": {
            1.: 0.25,
            1. / 2.: 0.25,
            1. / 3.: 0.25,
            1. / 4.: 0.25},

        # ============================
        # Custom observations&rewards
        # ============================
        "custom_observations": False,

        "reward_shaping": True,
        "uniform_reward": True,
        "stop_penalty": -0.2,
        "invalid_action_penalty": -0.0,
        "deadlock_penalty": -5.0,
        # 1.0 for skipping
        "shortest_path_penalty_coefficient": 1.2,
        "done_bonus": 0.2,
    }

    training_parameters = {
        # ============================
        # Network architecture
        # ============================
        "double_dqn": True,
        "shared": False,
        "hidden_size": 256,
        "hidden_layers": 2,
        "update_every": 16,
        "type": 1,

        # epsilon greedy decay regulators
        "eps_decay": 0.99739538258046,
        "eps_start": 1.0,
        "eps_end": 0.02,

        "learning_rate": 0.52e-4,
        # To compute q targets
        "gamma": 0.99,
        # To compute target network soft update
        "tau": 1e-3,

        # ============================
        # Training setup
        # ============================
        "n_episodes": 15000,
        "batch_size": 32,
        # Minimum number of samples to start learning
        "buffer_min_size": 0,
        "fingerprints": True,
        # If not set the default value is the standard FingerprintType.EPSILON_STEP
        "fingerprint_type": FingerprintType.EPSILON_EPISODE,

        # ============================
        # Memory
        # ============================
        # Memory maximum size
        "buffer_size": int(1e6),
        # memory type uer or per
        "memory_type": "per",


        # ============================
        # Saving and rendering
        # ============================
        "checkpoint_interval": 500,
        "evaluation_mode": False,
        "eval_episodes": 25,
        "use_gpu": False,
        "render": False,
        "print_stats": True,
        "wandb_project": "flatland-challenge-final",
        "wandb_entity": "fiorenzoparascandolo",
        "wandb_tag": "d3qn",
        "save_model_path": "/content/drive/My Drive/Colab Notebooks/models/" + namefile + ".pt",
        "load_model_path": "/content/drive/My Drive/Colab Notebooks/models/todo.pt",
        "tensorboard_path": "/content/drive/My Drive/Colab Notebooks/logs/logs" + namefile + "/",
        "automatic_name_saving": True,

        # ============================
        # Action Masking / Skipping
        # ============================
        "action_masking": True,
        "allow_no_op": False
    }

    if training_parameters["evaluation_mode"]:
        eval_policy(Namespace(**environment_parameters), Namespace(**training_parameters))
    else:
        train_multiple_agents(Namespace(**environment_parameters), Namespace(**training_parameters))
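
As in the other examples, the plain parameter dicts are wrapped in argparse.Namespace so the training code can read them via attribute access. A quick illustration of that standard-library behaviour:

from argparse import Namespace

params = Namespace(**{"n_agents": 5, "x_dim": 48})
assert params.n_agents == 5
assert params.x_dim == 48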
Example #27
def train_agent(env_params, train_params):
    # Environment parameters
    n_agents = env_params.n_agents
    x_dim = env_params.x_dim
    y_dim = env_params.y_dim
    n_cities = env_params.n_cities
    max_rails_between_cities = env_params.max_rails_between_cities
    max_rails_in_city = env_params.max_rails_in_city
    seed = env_params.seed

    # Observation parameters
    observation_tree_depth = env_params.observation_tree_depth
    observation_radius = env_params.observation_radius
    observation_max_path_depth = env_params.observation_max_path_depth

    # Training parameters
    eps_start = train_params.eps_start
    eps_end = train_params.eps_end
    eps_decay = train_params.eps_decay
    n_episodes = train_params.n_episodes
    checkpoint_interval = train_params.checkpoint_interval
    n_eval_episodes = train_params.n_evaluation_episodes

    # Set the seeds
    random.seed(seed)
    np.random.seed(seed)

    # Break agents from time to time
    malfunction_parameters = MalfunctionParameters(
        malfunction_rate=1. / 10000,  # Rate of malfunctions
        min_duration=15,  # Minimal duration
        max_duration=50  # Max duration
    )

    # Observation builder
    predictor = ShortestPathPredictorForRailEnv(observation_max_path_depth)
    tree_observation = TreeObsForRailEnv(max_depth=observation_tree_depth,
                                         predictor=predictor)

    # Fraction of trains with each speed
    speed_profiles = {
        1.: 1.0,  # Fast passenger train
        1. / 2.: 0.0,  # Fast freight train
        1. / 3.: 0.0,  # Slow commuter train
        1. / 4.: 0.0  # Slow freight train
    }

    # Setup the environment
    env = RailEnv(
        width=x_dim,
        height=y_dim,
        rail_generator=sparse_rail_generator(
            max_num_cities=n_cities,
            grid_mode=False,
            max_rails_between_cities=max_rails_between_cities,
            max_rails_in_city=max_rails_in_city),
        schedule_generator=sparse_schedule_generator(speed_profiles),
        number_of_agents=n_agents,
        malfunction_generator_and_process_data=malfunction_from_params(
            malfunction_parameters),
        obs_builder_object=tree_observation,
        random_seed=seed)

    env.reset(regenerate_schedule=True, regenerate_rail=True)

    # Setup renderer
    if train_params.render:
        env_renderer = RenderTool(env, gl="PGL")

    # Calculate the state size given the depth of the tree observation and the number of features
    n_features_per_node = env.obs_builder.observation_dim
    n_nodes = 0
    for i in range(observation_tree_depth + 1):
        n_nodes += np.power(4, i)
    state_size = n_features_per_node * n_nodes

    # The action space of flatland is 5 discrete actions
    action_size = 5

    # Max number of steps per episode
    # This is the official formula used during evaluations
    # See details in flatland.envs.schedule_generators.sparse_schedule_generator
    max_steps = int(4 * 2 * (env.height + env.width + (n_agents / n_cities)))

    action_count = [0] * action_size
    action_dict = dict()
    agent_obs = [None] * env.get_num_agents()
    agent_prev_obs = [None] * env.get_num_agents()
    agent_prev_action = [2] * env.get_num_agents()
    update_values = False
    smoothed_normalized_score = -1.0
    smoothed_eval_normalized_score = -1.0
    smoothed_completion = 0.0
    smoothed_eval_completion = 0.0

    # Double Dueling DQN policy
    policy = DDDQNPolicy(state_size, action_size, train_params)

    # TensorBoard writer
    writer = SummaryWriter()
    writer.add_hparams(vars(train_params), {})
    writer.add_hparams(vars(env_params), {})

    training_timer = Timer()
    training_timer.start()

    print(
        "\n🚉 Training {} trains on {}x{} grid for {} episodes, evaluating on {} episodes every {} episodes.\n"
        .format(env.get_num_agents(), x_dim, y_dim, n_episodes,
                n_eval_episodes, checkpoint_interval))

    for episode_idx in range(n_episodes + 1):
        # Timers
        step_timer = Timer()
        reset_timer = Timer()
        learn_timer = Timer()
        preproc_timer = Timer()

        # Reset environment
        reset_timer.start()
        obs, info = env.reset(regenerate_rail=True, regenerate_schedule=True)
        reset_timer.end()

        if train_params.render:
            env_renderer.set_new_rail()

        score = 0
        nb_steps = 0
        actions_taken = []

        # Build agent specific observations
        for agent in env.get_agent_handles():
            if obs[agent]:
                agent_obs[agent] = normalize_observation(
                    obs[agent],
                    observation_tree_depth,
                    observation_radius=observation_radius)
                agent_prev_obs[agent] = agent_obs[agent].copy()

        # Run episode
        for step in range(max_steps - 1):
            for agent in env.get_agent_handles():
                if info['action_required'][agent]:
                    # If an action is required, we want to store the obs at that step as well as the action
                    update_values = True
                    action = policy.act(agent_obs[agent], eps=eps_start)
                    action_count[action] += 1
                    actions_taken.append(action)
                else:
                    update_values = False
                    action = 0
                action_dict.update({agent: action})

            # Environment step
            step_timer.start()
            next_obs, all_rewards, done, info = env.step(action_dict)
            step_timer.end()

            if train_params.render and episode_idx % checkpoint_interval == 0:
                env_renderer.render_env(show=True,
                                        frames=False,
                                        show_observations=False,
                                        show_predictions=False)

            for agent in range(env.get_num_agents()):
                # Update replay buffer and train agent
                # Only update the values when we are done or when an action was taken and thus relevant information is present
                if update_values or done[agent]:
                    learn_timer.start()
                    policy.step(agent_prev_obs[agent],
                                agent_prev_action[agent], all_rewards[agent],
                                agent_obs[agent], done[agent])
                    learn_timer.end()

                    agent_prev_obs[agent] = agent_obs[agent].copy()
                    agent_prev_action[agent] = action_dict[agent]

                # Preprocess the new observations
                if next_obs[agent]:
                    preproc_timer.start()
                    agent_obs[agent] = normalize_observation(
                        next_obs[agent],
                        observation_tree_depth,
                        observation_radius=observation_radius)
                    preproc_timer.end()

                score += all_rewards[agent]

            nb_steps = step

            if done['__all__']:
                break

        # Epsilon decay
        eps_start = max(eps_end, eps_decay * eps_start)

        # Collection information about training
        tasks_finished = sum(done[idx] for idx in env.get_agent_handles())
        completion = tasks_finished / max(1, env.get_num_agents())
        normalized_score = score / (max_steps * env.get_num_agents())
        action_probs = action_count / np.sum(action_count)
        action_count = [1] * action_size

        # Smoothed values for terminal display and for more stable hyper-parameter tuning
        smoothing = 0.99
        smoothed_normalized_score = smoothed_normalized_score * smoothing + normalized_score * (
            1.0 - smoothing)
        smoothed_completion = smoothed_completion * smoothing + completion * (
            1.0 - smoothing)

        # Print logs
        if episode_idx % checkpoint_interval == 0:
            torch.save(
                policy.qnetwork_local,
                './checkpoints/origin_multi-' + str(episode_idx) + '.pth')
            if train_params.render:
                env_renderer.close_window()

        print('\r🚂 Episode {}'
              '\t 🏆 Score: {:.3f}'
              ' Avg: {:.3f}'
              '\t 💯 Done: {:.2f}%'
              ' Avg: {:.2f}%'
              '\t 🎲 Epsilon: {:.2f} '
              '\t 🔀 Action Probs: {}'.format(episode_idx, normalized_score,
                                             smoothed_normalized_score,
                                             100 * completion,
                                             100 * smoothed_completion,
                                             eps_start,
                                             format_action_prob(action_probs)),
              end=" ")

        # Evaluate policy
        if episode_idx % train_params.checkpoint_interval == 0:
            scores, completions, nb_steps_eval = eval_policy(
                env, policy, n_eval_episodes, max_steps)
            writer.add_scalar("evaluation/scores_min", np.min(scores),
                              episode_idx)
            writer.add_scalar("evaluation/scores_max", np.max(scores),
                              episode_idx)
            writer.add_scalar("evaluation/scores_mean", np.mean(scores),
                              episode_idx)
            writer.add_scalar("evaluation/scores_std", np.std(scores),
                              episode_idx)
            writer.add_histogram("evaluation/scores", np.array(scores),
                                 episode_idx)
            writer.add_scalar("evaluation/completions_min",
                              np.min(completions), episode_idx)
            writer.add_scalar("evaluation/completions_max",
                              np.max(completions), episode_idx)
            writer.add_scalar("evaluation/completions_mean",
                              np.mean(completions), episode_idx)
            writer.add_scalar("evaluation/completions_std",
                              np.std(completions), episode_idx)
            writer.add_histogram("evaluation/completions",
                                 np.array(completions), episode_idx)
            writer.add_scalar("evaluation/nb_steps_min", np.min(nb_steps_eval),
                              episode_idx)
            writer.add_scalar("evaluation/nb_steps_max", np.max(nb_steps_eval),
                              episode_idx)
            writer.add_scalar("evaluation/nb_steps_mean",
                              np.mean(nb_steps_eval), episode_idx)
            writer.add_scalar("evaluation/nb_steps_std", np.std(nb_steps_eval),
                              episode_idx)
            writer.add_histogram("evaluation/nb_steps",
                                 np.array(nb_steps_eval), episode_idx)

            smoothing = 0.9
            smoothed_eval_normalized_score = smoothed_eval_normalized_score * smoothing + np.mean(
                scores) * (1.0 - smoothing)
            smoothed_eval_completion = smoothed_eval_completion * smoothing + np.mean(
                completions) * (1.0 - smoothing)
            writer.add_scalar("evaluation/smoothed_score",
                              smoothed_eval_normalized_score, episode_idx)
            writer.add_scalar("evaluation/smoothed_completion",
                              smoothed_eval_completion, episode_idx)

        # Save logs to tensorboard
        writer.add_scalar("training/score", normalized_score, episode_idx)
        writer.add_scalar("training/smoothed_score", smoothed_normalized_score,
                          episode_idx)
        writer.add_scalar("training/completion", np.mean(completion),
                          episode_idx)
        writer.add_scalar("training/smoothed_completion",
                          np.mean(smoothed_completion), episode_idx)
        writer.add_scalar("training/nb_steps", nb_steps, episode_idx)
        writer.add_histogram("actions/distribution", np.array(actions_taken),
                             episode_idx)
        writer.add_scalar("actions/nothing",
                          action_probs[RailEnvActions.DO_NOTHING], episode_idx)
        writer.add_scalar("actions/left",
                          action_probs[RailEnvActions.MOVE_LEFT], episode_idx)
        writer.add_scalar("actions/forward",
                          action_probs[RailEnvActions.MOVE_FORWARD],
                          episode_idx)
        writer.add_scalar("actions/right",
                          action_probs[RailEnvActions.MOVE_RIGHT], episode_idx)
        writer.add_scalar("actions/stop",
                          action_probs[RailEnvActions.STOP_MOVING],
                          episode_idx)
        writer.add_scalar("training/epsilon", eps_start, episode_idx)
        writer.add_scalar("training/buffer_size", len(policy.memory),
                          episode_idx)
        writer.add_scalar("training/loss", policy.loss, episode_idx)
        writer.add_scalar("timer/reset", reset_timer.get(), episode_idx)
        writer.add_scalar("timer/step", step_timer.get(), episode_idx)
        writer.add_scalar("timer/learn", learn_timer.get(), episode_idx)
        writer.add_scalar("timer/preproc", preproc_timer.get(), episode_idx)
        writer.add_scalar("timer/total", training_timer.get_current(),
                          episode_idx)
Example #28
# We need to either load in some pre-generated railways from disk, or else create a random railway generator.
if flags.load_railways:
    rail_generator, schedule_generator = load_precomputed_railways(
        project_root, flags)
else:
    rail_generator, schedule_generator = create_random_railways(project_root)

# Create the Flatland environment
env = RailEnv(width=flags.grid_width,
              height=flags.grid_height,
              number_of_agents=flags.num_agents,
              rail_generator=rail_generator,
              schedule_generator=schedule_generator,
              malfunction_generator_and_process_data=malfunction_from_params(
                  MalfunctionParameters(malfunction_rate=1 / 8000,
                                        min_duration=15,
                                        max_duration=50)),
              obs_builder_object=TreeObservation(max_depth=flags.tree_depth))

# After training we want to render the results so we also load a renderer
env_renderer = RenderTool(
    env,
    gl="PILSVG",
    screen_width=800,
    screen_height=800,
    agent_render_variant=AgentRenderVariant.AGENT_SHOWS_OPTIONS_AND_BOX)

# Calculate the state size based on the number of nodes in the tree observation
num_features_per_node = env.obs_builder.observation_dim
num_nodes = sum(np.power(4, i) for i in range(flags.tree_depth + 1))
state_size = num_nodes * num_features_per_node
action_size = 5
# Training parameters
eps_start = train_params.eps_start
eps_end = train_params.eps_end
eps_decay = train_params.eps_decay
n_episodes = train_params.n_episodes
checkpoint_interval = train_params.checkpoint_interval
n_eval_episodes = train_params.n_evaluation_episodes

# Set the seeds
random.seed(seed)
np.random.seed(seed)

# Break agents from time to time
malfunction_parameters = MalfunctionParameters(
    malfunction_rate=1. / 10000,  # Rate of malfunctions
    min_duration=15,  # Minimal duration
    max_duration=50  # Max duration
)

# Observation builder
predictor = ShortestPathPredictorForRailEnv(observation_max_path_depth)
tree_observation = TreeObsForRailEnv(max_depth=observation_tree_depth,
                                     predictor=predictor)

# Fraction of trains with each speed
speed_profiles = {
    1.: 1.0,  # Fast passenger train
    1. / 2.: 0.0,  # Fast freight train
    1. / 3.: 0.0,  # Slow commuter train
    1. / 4.: 0.0  # Slow freight train
}
Example #30
def train():
    seed = 14

    namefile = "psppo_" + datetime.now().strftime("%m_%d_%Y_%H_%M_%S")
    print("Running {}".format(namefile))

    environment_parameters = {
        "n_agents": 5,
        "x_dim": 16 * 3,
        "y_dim": 9 * 3,
        "n_cities": 5,
        "max_rails_between_cities": 2,
        "max_rails_in_city": 3,
        "seed": seed,
        "observation_tree_depth": 2,
        "observation_radius": 10,
        "observation_max_path_depth": 30,
        # Malfunctions
        "malfunction_parameters": MalfunctionParameters(
            malfunction_rate=0.005,
            min_duration=15,
            max_duration=50),
        # Speeds
        "speed_profiles": {
            1.: 0.25,
            1. / 2.: 0.25,
            1. / 3.: 0.25,
            1. / 4.: 0.25},

        # ============================
        # Custom observations&rewards
        # ============================
        "custom_observations": False,

        "reward_shaping": True,
        "uniform_reward": True,
        "stop_penalty": -0.2,
        "invalid_action_penalty": -0.0,
        "deadlock_penalty": -5.0,
        # 1.0 for skipping
        "shortest_path_penalty_coefficient": 1.2,
        "done_bonus": 0.2,
    }

    training_parameters = {
        # ============================
        # Network architecture
        # ============================
        # Shared actor-critic layer
        # If shared is True then the considered sizes are taken from the critic
        "shared": False,
        "shared_recurrent": True,
        "linear_size": 128,
        "hidden_size": 64,
        # Critic network
        "critic_mlp_width": 128,
        "critic_mlp_depth": 3,
        "last_critic_layer_scaling": 0.1,
        # Actor network
        "actor_mlp_width": 128,
        "actor_mlp_depth": 3,
        "last_actor_layer_scaling": 0.01,
        # Adam learning rate
        "learning_rate": 0.002,
        # Adam epsilon
        "adam_eps": 1e-5,
        # Activation
        "activation": "Tanh",
        "lmbda": 0.95,
        "entropy_coefficient": 0.01,
        # Called also baseline cost in shared setting (0.5)
        # (C54): {0.001, 0.1, 1.0, 10.0, 100.0}
        "value_loss_coefficient": 0.001,

        # ============================
        # Training setup
        # ============================
        "n_episodes": 500,
        "horizon": 2048,
        "epochs": 8,
        # 64, 128, 256
        "batch_size": 256,
        "batch_mode": "shuffle",

        # ============================
        # Normalization and clipping
        # ============================
        # Discount factor (0.95, 0.97, 0.99, 0.999)
        "discount_factor": 0.99,
        "max_grad_norm": 0.5,
        # PPO-style value clipping
        "eps_clip": 0.3,

        # ============================
        # Advantage estimation
        # ============================
        # gae or n-steps
        "advantage_estimator": "gae",

        # ============================
        # Optimization and rendering
        # ============================
        # Save and evaluate interval
        "checkpoint_interval": 100,
        "evaluation_mode": False,
        "eval_episodes": 500,
        "use_gpu": False,
        "render": False,
        "print_stats": True,
        "wandb_project": "flatland-challenge-ps-ppo-test",
        "wandb_entity": "fiorenzoparascandolo",
        "wandb_tag": "ps-ppo",
        "save_model_path": namefile + ".pt",
        "load_model_path": namefile + ".pt",
        "automatic_name_saving": True,
        "tensorboard_path": "log_" + namefile + "/",

        # ============================
        # Action Masking / Skipping
        # ============================
        "action_masking": True,
        "allow_no_op": False
    }

    """
    # Save on Google Drive on Colab
    "save_model_path": "/content/drive/My Drive/Colab Notebooks/models/" + namefile + ".pt",
    "load_model_path": "/content/drive/My Drive/Colab Notebooks/models/todo.pt",
    "tensorboard_path": "/content/drive/My Drive/Colab Notebooks/logs/logs" + namefile + "/",
    """

    """
    # Mount Drive on Colab
    from google.colab import drive
    drive.mount("/content/drive", force_remount=True)

    # Show Tensorboard on Colab
    import tensorflow
    %load_ext tensorboard
    %tensorboard --logdir "/content/drive/My Drive/Colab Notebooks/logs_todo"
    """

    if training_parameters["evaluation_mode"]:
        eval_policy(Namespace(**environment_parameters), Namespace(**training_parameters))
    else:
        train_multiple_agents(Namespace(**environment_parameters), Namespace(**training_parameters))