Example #1
def demo(args=None):
    """Demo script to check installation"""
    env = RailEnv(width=15,
                  height=15,
                  rail_generator=complex_rail_generator(nr_start_goal=10,
                                                        nr_extra=1,
                                                        min_dist=8,
                                                        max_dist=99999),
                  schedule_generator=complex_schedule_generator(),
                  number_of_agents=5)

    env._max_episode_steps = int(15 * (env.width + env.height))
    env_renderer = RenderTool(env)

    while True:
        obs, info = env.reset()
        _done = False
        # Run a single episode here
        step = 0
        while not _done:
            # Compute Action
            _action = {}
            for _idx, _ in enumerate(env.agents):
                _action[_idx] = np.random.randint(0, 5)
            obs, all_rewards, done, _ = env.step(_action)
            _done = done['__all__']
            step += 1
            env_renderer.render_env(show=True,
                                    frames=False,
                                    show_observations=False,
                                    show_predictions=False)
            time.sleep(0.3)
    return 0
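A minimal sketch of the imports Example #1 appears to assume (Flatland 2.x module layout; paths may differ in other releases and are not part of the original snippet):

# Assumed imports for Example #1 -- illustrative only.
import time

import numpy as np

from flatland.envs.rail_env import RailEnv
from flatland.envs.rail_generators import complex_rail_generator
from flatland.envs.schedule_generators import complex_schedule_generator
from flatland.utils.rendertools import RenderTool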
Example #2
def render_env(env, fname):
    env_renderer = RenderTool(env, gl="PGL")
    env_renderer.render_env()

    image = env_renderer.get_image()
    pil_image = PIL.Image.fromarray(image)
    pil_image.save(fname)
def render_test(parameters, test_nr=0, nr_examples=5):
    for trial in range(nr_examples):
        # Reset the env
        print(
            'Showing {} Level {} with (x_dim,y_dim) = ({},{}) and {} Agents.'.
            format(test_nr, trial, parameters[0], parameters[1],
                   parameters[2]))
        file_name = "./Tests/{}/Level_{}.pkl".format(test_nr, trial)

        env = RailEnv(
            width=1,
            height=1,
            rail_generator=rail_from_file(file_name),
            obs_builder_object=TreeObsForRailEnv(max_depth=2),
            number_of_agents=1,
        )
        env_renderer = RenderTool(
            env,
            gl="PILSVG",
        )
        env_renderer.set_new_rail()

        env.reset(False, False)
        env_renderer.render_env(show=True, show_observations=False)

        time.sleep(0.1)
        env_renderer.close_window()
    return
Example #4
def test_path_not_exists(rendering=False):
    rail, rail_map = make_simple_rail_unconnected()
    env = RailEnv(
        width=rail_map.shape[1],
        height=rail_map.shape[0],
        rail_generator=rail_from_grid_transition_map(rail),
        schedule_generator=random_schedule_generator(),
        number_of_agents=1,
        obs_builder_object=TreeObsForRailEnv(
            max_depth=2, predictor=ShortestPathPredictorForRailEnv()),
    )
    env.reset()

    check_path(
        env,
        rail,
        (5, 6),  # south dead-end
        0,  # north
        (0, 3),  # north dead-end
        False)

    if rendering:
        renderer = RenderTool(env, gl="PILSVG")
        renderer.render_env(show=True, show_observations=False)
        input("Continue?")
    def replay_verify(max_episode_steps: int, ctl: ControllerFromTrainRuns,
                      env: RailEnv, rendering: bool):
        """Replays this deterministic `ActionPlan` and verifies whether it is feasible."""
        if rendering:
            renderer = RenderTool(env,
                                  gl="PILSVG",
                                  agent_render_variant=AgentRenderVariant.AGENT_SHOWS_OPTIONS_AND_BOX,
                                  show_debug=True,
                                  clear_debug_text=True,
                                  screen_height=1000,
                                  screen_width=1000)
            renderer.render_env(show=True,
                                show_observations=False,
                                show_predictions=False)
        i = 0
        while not env.dones['__all__'] and i <= max_episode_steps:
            for agent_id, agent in enumerate(env.agents):
                way_point: WayPoint = ctl.get_way_point_before_or_at_step(
                    agent_id, i)
                assert agent.position == way_point.position, \
                    "before {}, agent {} at {}, expected {}".format(i, agent_id, agent.position,
                                                                    way_point.position)
            actions = ctl.act(i)
            print("actions for {}: {}".format(i, actions))

            obs, all_rewards, done, _ = env.step(actions)

            if rendering:
                renderer.render_env(show=True,
                                    show_observations=False,
                                    show_predictions=False)

            i += 1
def evaluate(n_episodes):
    run = SUBMISSIONS["rlps-tcpr"]
    config, run = init_run(run)
    agent = ShortestPathRllibAgent(get_agent(config, run))
    env = get_env(config, rl=True)
    env_renderer = RenderTool(env, screen_width=8800)
    returns = []
    pcs = []
    malfs = []

    for _ in tqdm(range(n_episodes)):

        obs, _ = env.reset(regenerate_schedule=True, regenerate_rail=True)
        if RENDER:
            env_renderer.reset()
            env_renderer.render_env(show=True,
                                    frames=True,
                                    show_observations=False)

        if not obs:
            break

        steps = 0
        ep_return = 0
        done = defaultdict(lambda: False)
        robust_env = RobustFlatlandGymEnv(rail_env=env,
                                          max_nr_active_agents=200,
                                          observation_space=None,
                                          priorizer=DistToTargetPriorizer(),
                                          allow_noop=True)

        sorted_handles = robust_env.priorizer.priorize(handles=list(obs.keys()),
                                                       rail_env=env)

        while not done['__all__']:
            actions = agent.compute_actions(obs, env)
            robust_actions = robust_env.get_robust_actions(
                actions, sorted_handles)
            obs, all_rewards, done, info = env.step(robust_actions)
            if RENDER:
                env_renderer.render_env(show=True,
                                        frames=True,
                                        show_observations=False)
            print('.', end='', flush=True)
            steps += 1
            ep_return += np.sum(list(all_rewards.values()))

        pc = sum(1 for a in env.agents if is_done(a)) / env.get_num_agents()
        print("EPISODE PC:", pc)
        n_episodes += 1
        pcs.append(pc)
        returns.append(ep_return /
                       (env._max_episode_steps * env.get_num_agents()))
        malfs.append(
            np.sum([a.malfunction_data['nr_malfunctions']
                    for a in env.agents]))
    return pcs, returns, malfs
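Both evaluate() snippets on this page (here and in Example #12) call an is_done() helper that is not shown. A plausible stand-in, assuming the Flatland 2.x RailAgentStatus enum already used elsewhere on this page:

from flatland.envs.agent_utils import RailAgentStatus

def is_done(agent):
    # Hypothetical replacement for the snippet's own helper: treat an agent as done
    # once it has reached its target (and possibly been removed from the grid).
    return agent.status in (RailAgentStatus.DONE, RailAgentStatus.DONE_REMOVED)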
def render_env(env):
    env_renderer = RenderTool(env, gl="PGL")
    env_renderer.render_env()

    image = env_renderer.get_image()
    pil_image = PIL.Image.fromarray(image)
    #print("RENDER")
    #pil_image.show()
    images.append(pil_image)
    print(len(images))
def test_shortest_path_predictor_conflicts(rendering=False):
    rail, rail_map = make_invalid_simple_rail()
    env = RailEnv(
        width=rail_map.shape[1],
        height=rail_map.shape[0],
        rail_generator=rail_from_grid_transition_map(rail),
        schedule_generator=random_schedule_generator(),
        number_of_agents=2,
        obs_builder_object=TreeObsForRailEnv(
            max_depth=2, predictor=ShortestPathPredictorForRailEnv()),
    )
    env.reset()

    # set the initial position
    agent = env.agents[0]
    agent.initial_position = (5, 6)  # south dead-end
    agent.position = (5, 6)  # south dead-end
    agent.direction = 0  # north
    agent.initial_direction = 0  # north
    agent.target = (3, 9)  # east dead-end
    agent.moving = True
    agent.status = RailAgentStatus.ACTIVE

    agent = env.agents[1]
    agent.initial_position = (3, 8)  # east dead-end
    agent.position = (3, 8)  # east dead-end
    agent.direction = 3  # west
    agent.initial_direction = 3  # west
    agent.target = (6, 6)  # south dead-end
    agent.moving = True
    agent.status = RailAgentStatus.ACTIVE

    observations, info = env.reset(False, False, True)

    if rendering:
        renderer = RenderTool(env, gl="PILSVG")
        renderer.render_env(show=True, show_observations=False)
        input("Continue?")

    # get the trees to test
    obs_builder: TreeObsForRailEnv = env.obs_builder
    pp = pprint.PrettyPrinter(indent=4)
    tree_0 = observations[0]
    tree_1 = observations[1]
    env.obs_builder.util_print_obs_subtree(tree_0)
    env.obs_builder.util_print_obs_subtree(tree_1)

    # check the expectations
    expected_conflicts_0 = [('F', 'R')]
    expected_conflicts_1 = [('F', 'L')]
    _check_expected_conflicts(expected_conflicts_0, obs_builder, tree_0,
                              "agent[0]: ")
    _check_expected_conflicts(expected_conflicts_1, obs_builder, tree_1,
                              "agent[1]: ")
    def render(self, mode='human'):
        # TODO: Merge both strategies (Jupyter vs .py)
        # In .py files
        # self.renderer.render_env(show=False, show_observations=False, show_predictions=False)
        # In Jupyter Notebooks
        env_renderer = RenderTool(self.flatland_env, gl="PILSVG")
        env_renderer.render_env()

        image = env_renderer.get_image()
        pil_image = Image.fromarray(image)
        display(pil_image)
        return image
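The render() method above targets Jupyter notebooks and uses two names its snippet never imports; it presumably relies on something like:

# Assumed imports for the render() method -- not part of the original snippet.
from PIL import Image
from IPython.display import display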
Example #10
def main(args):
    try:
        opts, args = getopt.getopt(args, "", ["sleep-for-animation=", ""])
    except getopt.GetoptError as err:
        print(str(err))  # will print something like "option -a not recognized"
        sys.exit(2)
    sleep_for_animation = True
    for o, a in opts:
        if o in ("--sleep-for-animation"):
            sleep_for_animation = str2bool(a)
        else:
            assert False, "unhandled option"

    # Initiate the Predictor
    custom_predictor = ShortestPathPredictorForRailEnv(10)

    # Pass the Predictor to the observation builder
    custom_obs_builder = ObservePredictions(custom_predictor)

    # Initiate Environment
    env = RailEnv(width=10,
                  height=10,
                  rail_generator=complex_rail_generator(nr_start_goal=5,
                                                        nr_extra=1,
                                                        min_dist=8,
                                                        max_dist=99999,
                                                        seed=1),
                  schedule_generator=complex_schedule_generator(),
                  number_of_agents=3,
                  obs_builder_object=custom_obs_builder)

    obs, info = env.reset()
    env_renderer = RenderTool(env, gl="PILSVG")

    # We render the initial step and show the observed cells as colored boxes
    env_renderer.render_env(show=True,
                            frames=True,
                            show_observations=True,
                            show_predictions=False)

    action_dict = {}
    for step in range(100):
        for a in range(env.get_num_agents()):
            action = np.random.randint(0, 5)
            action_dict[a] = action
        obs, all_rewards, done, _ = env.step(action_dict)
        print("Rewards: ", all_rewards, "  [done=", done, "]")
        env_renderer.render_env(show=True,
                                frames=True,
                                show_observations=True,
                                show_predictions=False)
        if sleep_for_animation:
            time.sleep(0.5)
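The option parsing above relies on a str2bool helper that is not included in the snippet. A minimal, purely illustrative stand-in (the original project may ship its own):

def str2bool(value):
    # Hypothetical helper: interpret common truthy strings as True.
    return str(value).strip().lower() in ("yes", "true", "t", "1")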
Example #11
class OurEnv(RailEnv):
    def reset(self, *args, **kwargs):
        return_val = super().reset(*args, **kwargs)
        self.env_renderer = RenderTool(self)
        self.step({0: RailEnvActions.MOVE_FORWARD})
        return return_val

    def step(self, *args, **kwargs):
        self.env_renderer.render_env(show=True)
        print(args[0])
        observation, reward, done, info = super().step(*args, **kwargs)
        return observation, reward, done["__all__"], info
Example #12
def evaluate(n_episodes):
    run = SUBMISSIONS["ato"]
    config, run = init_run(run)
    agent = get_agent(config, run)
    env = get_env(config, rl=True)
    env_renderer = RenderTool(env, screen_width=8800)
    returns = []
    pcs = []
    malfs = []

    for _ in tqdm(range(n_episodes)):

        obs, _ = env.reset(regenerate_schedule=True, regenerate_rail=True)
        if RENDER:
            env_renderer.reset()
            env_renderer.render_env(show=True,
                                    frames=True,
                                    show_observations=False)

        if not obs:
            break

        steps = 0
        ep_return = 0
        done = defaultdict(lambda: False)

        while not done['__all__']:
            actions = agent.compute_actions(obs, explore=False)
            obs, all_rewards, done, info = env.step(actions)
            if RENDER:
                env_renderer.render_env(show=True,
                                        frames=True,
                                        show_observations=False)
            print('.', end='', flush=True)
            steps += 1
            ep_return += np.sum(list(all_rewards.values()))

        pc = sum(1 for a in env.agents if is_done(a)) / env.get_num_agents()
        print("EPISODE PC:", pc)
        n_episodes += 1
        pcs.append(pc)
        returns.append(ep_return /
                       (env._max_episode_steps * env.get_num_agents()))
        malfs.append(
            np.sum([a.malfunction_data['nr_malfunctions']
                    for a in env.agents]))
    return pcs, returns, malfs
Example #13
def solve(env, width, height, naive, predictor):
    env_renderer = RenderTool(env)
    solver = r2_solver.Solver(1)

    obs, _ = env.reset()
    env.obs_builder.find_safe_edges(env)

    predictor.env = env
    predictor.get()
    for step in range(100):

        # print(obs)
        # print(obs.shape)

        if naive:
            _action = naive_solver(env, obs)
        else:
            _action = solver.GetMoves(env.agents, obs)

        obs_paths = TL_detector(env, obs, _action)
        for k in obs_paths.keys():
            if obs_paths[k] is not None and improved_solver(obs_paths[k]) == 0:
                _action[k] = 4

        for k in _action.keys():
            if env.agents[k].position is None:
                continue

            pos = (env.agents[k].position[0], env.agents[k].position[1],
                   env.agents[k].direction)
            if _action[k] != 0 and _action[k] != 4 and pos in env.dev_pred_dict[k]:
                env.dev_pred_dict[k].remove(pos)

        next_obs, all_rewards, done, _ = env.step(_action)

        print("Rewards: {}, [done={}]".format(all_rewards, done))
        img = env_renderer.render_env(show=True,
                                      show_inactive_agents=False,
                                      show_predictions=True,
                                      show_observations=False,
                                      frames=True,
                                      return_image=True)
        cv2.imwrite("./env_images/" + str(step).zfill(3) + ".jpg", img)

        obs = next_obs.copy()
        if obs is None or done['__all__']:
            break

    unfinished_agents = []
    for k in done.keys():
        if not done[k] and type(k) is int:
            unfinished_agents.append(k)

    with open('observations_and_agents.pickle', 'wb') as f:
        pickle.dump((env.obs_builder.obs_dict, unfinished_agents,
                     env.obs_builder.branches, env.obs_builder.safe_map), f)
    return
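solve() writes one JPEG per step into ./env_images/. If a video of the episode is wanted, one possible follow-up (not part of the original snippet) is to stitch those frames with OpenCV, which the function already uses:

import glob
import cv2

frames = sorted(glob.glob("./env_images/*.jpg"))
if frames:
    height, width = cv2.imread(frames[0]).shape[:2]
    writer = cv2.VideoWriter("episode.mp4", cv2.VideoWriter_fourcc(*"mp4v"),
                             10, (width, height))  # 10 fps, frame size is (width, height)
    for frame in frames:
        writer.write(cv2.imread(frame))
    writer.release()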
Example #14
def check_path(env,
               rail,
               position,
               direction,
               target,
               expected,
               rendering=False):
    agent = env.agents[0]
    agent.position = position  # south dead-end
    agent.direction = direction  # north
    agent.target = target  # east dead-end
    agent.moving = True
    if rendering:
        renderer = RenderTool(env, gl="PILSVG")
        renderer.render_env(show=True, show_observations=False)
        input("Continue?")
    assert rail.check_path_exists(agent.position, agent.direction,
                                  agent.target) == expected
Example #15
def createEnvSet(nStart, nEnd, sDir, bSmall=True):
    # print("Generate small envs in train-envs-small:")
    print(f"Generate envs (small={bSmall}) in dir {sDir}:")

    sDirImages = "train-envs-small/images/"
    if not os.path.exists(sDirImages):
        os.makedirs(sDirImages)

    for test_id in range(nStart, nEnd, 1):
        env = create_test_env(RandomTestParams_small, test_id, sDir)

        oRender = RenderTool(env, gl="PILSVG")

        # oRender.envs = envs
        # oRender.set_new_rail()
        oRender.render_env()
        g2img = oRender.get_image()
        imgPIL = Image.fromarray(g2img)
        # imgPIL.show()

        imgPIL.save(sDirImages + "Level_{}.png".format(test_id))
Example #16
def main(args):
    try:
        opts, args = getopt.getopt(args, "", ["sleep-for-animation=", ""])
    except getopt.GetoptError as err:
        print(str(err))  # will print something like "option -a not recognized"
        sys.exit(2)
    sleep_for_animation = True
    for o, a in opts:
        if o in ("--sleep-for-animation"):
            sleep_for_animation = str2bool(a)
        else:
            assert False, "unhandled option"

    env = RailEnv(width=7,
                  height=7,
                  rail_generator=complex_rail_generator(nr_start_goal=10,
                                                        nr_extra=1,
                                                        min_dist=5,
                                                        max_dist=99999,
                                                        seed=1),
                  schedule_generator=complex_schedule_generator(),
                  number_of_agents=1,
                  obs_builder_object=SingleAgentNavigationObs())

    obs, info = env.reset()
    env_renderer = RenderTool(env)
    env_renderer.render_env(show=True, frames=True, show_observations=True)
    for step in range(100):
        action = np.argmax(obs[0]) + 1
        obs, all_rewards, done, _ = env.step({0: action})
        print("Rewards: ", all_rewards, "  [done=", done, "]")
        env_renderer.render_env(show=True, frames=True, show_observations=True)
        if sleep_for_animation:
            time.sleep(0.1)
        if done["__all__"]:
            break
    env_renderer.close_window()
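The action = np.argmax(obs[0]) + 1 line works because SingleAgentNavigationObs returns a three-element vector over [left, forward, right] and Flatland's movement actions start at 1. As a reminder of the assumed encoding:

from flatland.envs.rail_env import RailEnvActions

# 0 = DO_NOTHING, 1 = MOVE_LEFT, 2 = MOVE_FORWARD, 3 = MOVE_RIGHT, 4 = STOP_MOVING
assert int(RailEnvActions.MOVE_LEFT) == 1
assert int(RailEnvActions.MOVE_FORWARD) == 2
assert int(RailEnvActions.MOVE_RIGHT) == 3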
Example #17
def main(args, dir):
    """
    :param args:
    :return:
    Episodes to debug (set a breakpoint in the episode loop to debug):
    - ep = 3, agent 1 spawns in front of 3, blocking its path; 0 and 2 are in a deadlock since they have the same priority
    - ep = 4, agents stop because of wrong priorities even though the conflict zone wasn't entered,
    - ep = 14,
    """
    rail_generator = sparse_rail_generator(
        max_num_cities=args.max_num_cities,
        seed=args.seed,
        grid_mode=args.grid_mode,
        max_rails_between_cities=args.max_rails_between_cities,
        max_rails_in_city=args.max_rails_in_city,
    )

    # Maps speeds to % of appearance in the env
    speed_ration_map = {
        1.: 0.25,  # Fast passenger train
        1. / 2.: 0.25,  # Fast freight train
        1. / 3.: 0.25,  # Slow commuter train
        1. / 4.: 0.25  # Slow freight train
    }

    observation_builder = GraphObsForRailEnv(
        predictor=ShortestPathPredictorForRailEnv(
            max_depth=args.prediction_depth),
        bfs_depth=4)

    env = RailEnv(
        width=args.width,
        height=args.height,
        rail_generator=rail_generator,
        schedule_generator=sparse_schedule_generator(speed_ration_map),
        number_of_agents=args.num_agents,
        obs_builder_object=observation_builder,
        malfunction_generator_and_process_data=malfunction_from_params(
            parameters={
                'malfunction_rate':
                args.malfunction_rate,  # Rate of malfunction occurrence
                'min_duration':
                args.min_duration,  # Minimal duration of malfunction
                'max_duration':
                args.max_duration  # Max duration of malfunction
            }))

    if args.render:
        env_renderer = RenderTool(env,
                                  agent_render_variant=AgentRenderVariant.AGENT_SHOWS_OPTIONS_AND_BOX,
                                  show_debug=True)

    sm = stateMachine()
    tb = TestBattery(env, observation_builder)

    state_machine_action_dict = {}
    railenv_action_dict = {}
    # max_time_steps = env.compute_max_episode_steps(args.width, args.height)
    max_time_steps = 200
    T_rewards = []  # List of episodes rewards
    T_Qs = []  # List of q values
    T_num_done_agents = []  # List of number of done agents for each episode
    T_all_done = []  # If all agents completed in each episode
    T_episodes = []  # Time taken for each episode

    if args.save_image and not os.path.isdir("image_dump"):
        os.makedirs("image_dump")

    step_taken = 0
    total_step_taken = 0
    total_episodes = 0
    step_times = []  # Time taken for each step

    for ep in range(args.num_episodes):
        # Reset info at the beginning of an episode
        start_time = time.time()  # Take time of one episode

        if args.generate_baseline:
            if not os.path.isdir("image_dump/" + str(dir)) and args.save_image:
                os.makedirs("image_dump/" + str(dir))
        else:
            if not os.path.isdir("image_dump/" + str(ep)) and args.save_image:
                os.makedirs("image_dump/" + str(ep))

        state, info = env.reset()
        tb.reset()

        if args.render:
            env_renderer.reset()
        reward_sum, all_done = 0, False  # reward_sum accumulates the total reward over the episode's steps
        num_done_agents = 0

        state_machine_action = {}
        for i in range(env.number_of_agents):
            state_machine_action[i] = 0

        for step in range(max_time_steps):
            start_step_time = time.time()

            #if step % 10 == 0:
            #	print(step)

            # Test battery
            # see test_battery.py
            triggers = tb.tests(state, args.prediction_depth,
                                state_machine_action)
            # state machine based on triggers of test battery
            # see state_machine.py
            state_machine_action = sm.act(
                triggers)  # State machine picks action

            for a in range(env.get_num_agents()):
                #if info['action_required'][a]:
                #	#railenv_action = observation_builder.choose_railenv_action(a, state_machine_action[a])
                #	railenv_action = observation_builder.choose_railenv_action(a, state_machine_action[a])
                #	state_machine_action_dict.update({a: state_machine_action})
                #	railenv_action_dict.update({a: railenv_action})
                # railenv_action = observation_builder.choose_railenv_action(a, state_machine_action[a])
                railenv_action = observation_builder.choose_railenv_action(
                    a, state_machine_action[a])
                state_machine_action_dict.update({a: state_machine_action})
                railenv_action_dict.update({a: railenv_action})

            state, reward, done, info = env.step(
                railenv_action_dict)  # Env step

            if args.generate_baseline:
                #env_renderer.render_env(show=True, show_observations=False, show_predictions=True)
                env_renderer.render_env(show=False,
                                        show_observations=False,
                                        show_predictions=True)
            else:
                env_renderer.render_env(show=True,
                                        show_observations=False,
                                        show_predictions=True)

            if args.generate_baseline:
                if args.save_image:
                    env_renderer.save_image("image_dump/" + str(dir) +
                                            "/image_" + str(step) + "_.png")
            else:
                if args.save_image:
                    env_renderer.save_image("image_dump/" + str(ep) +
                                            "/image_" + str(step) + "_.png")

            if args.debug:
                for a in range(env.get_num_agents()):
                    log('\n\n#########################################')
                    log('\nInfo for agent {}'.format(a))
                    #log('\npath : {}'.format(state[a]["path"]))
                    log('\noverlap : {}'.format(state[a]["overlap"]))
                    log('\ndirection : {}'.format(state[a]["direction"]))
                    log('\nOccupancy, first layer: {}'.format(
                        state[a]["occupancy"]))
                    log('\nOccupancy, second layer: {}'.format(
                        state[a]["conflict"]))
                    log('\nForks: {}'.format(state[a]["forks"]))
                    log('\nTarget: {}'.format(state[a]["target"]))
                    log('\nPriority: {}'.format(state[a]["priority"]))
                    log('\nMax priority encountered: {}'.format(
                        state[a]["max_priority"]))
                    log('\nNum malfunctioning agents (globally): {}'.format(
                        state[a]["n_malfunction"]))
                    log('\nNum agents ready to depart (globally): {}'.format(
                        state[a]["ready_to_depart"]))
                    log('\nStatus: {}'.format(info['status'][a]))
                    log('\nPosition: {}'.format(env.agents[a].position))
                    log('\nTarget: {}'.format(env.agents[a].target))
                    log('\nMoving? {} at speed: {}'.format(
                        env.agents[a].moving, info['speed'][a]))
                    log('\nAction required? {}'.format(
                        info['action_required'][a]))
                    log('\nState machine action: {}'.format(
                        state_machine_action_dict[a]))
                    log('\nRailenv action: {}'.format(railenv_action_dict[a]))
                    log('\nRewards: {}'.format(reward[a]))
                    log('\n\n#########################################')

            reward_sum += sum(reward[a] for a in range(env.get_num_agents()))

            step_taken = step
            time_taken_step = time.time() - start_step_time
            step_times.append(time_taken_step)

            if done['__all__']:
                all_done = True
                break

        total_step_taken += step_taken

        time_taken = time.time() - start_time  # Time taken for one episode
        T_episodes.append(time_taken)
        total_episodes = ep

        # Time metrics - too precise
        avg_time_step = sum(step_times) / step_taken
        #print("Avg time step: " + str(avg_time_step))

        # No need to close the renderer since env parameter sizes stay the same
        T_rewards.append(reward_sum)
        # Compute num of agents that reached their target
        for a in range(env.get_num_agents()):
            if done[a]:
                num_done_agents += 1
        percentage_done_agents = num_done_agents / env.get_num_agents()
        log("\nDone agents in episode: {}".format(percentage_done_agents))
        T_num_done_agents.append(
            percentage_done_agents)  # In proportion to total
        T_all_done.append(all_done)

    # Average number of agents that reached their target
    avg_done_agents = sum(T_num_done_agents) / len(T_num_done_agents) if len(
        T_num_done_agents) > 0 else 0
    avg_reward = sum(T_rewards) / len(T_rewards) if len(T_rewards) > 0 else 0
    avg_norm_reward = avg_reward / (max_time_steps / env.get_num_agents())

    avg_ep_time = sum(T_episodes) / args.num_episodes

    if total_episodes == 0:
        total_episodes = 1

    log("\nSeed: " + str(args.seed) \
      + "\t | Avg_done_agents: " + str(avg_done_agents)\
      + "\t | Avg_reward: " + str(avg_reward)\
      + "\t | Avg_norm_reward: " + str(avg_norm_reward)\
      + "\t | Max_num_time_steps: " + str(max_time_steps)\
      + "\t | Avg_num_time_steps: " + str(total_step_taken/total_episodes)
            + "\t | Avg episode time: " + str(avg_ep_time))
def test(args, T, ep, dqn, val_mem, metrics, results_dir, evaluate=False):

    # Init env and set in evaluation mode
    # Maps speeds to % of appearance in the env
    speed_ration_map = {
        1.: 0.25,  # Fast passenger train
        1. / 2.: 0.25,  # Fast freight train
        1. / 3.: 0.25,  # Slow commuter train
        1. / 4.: 0.25  # Slow freight train
    }

    schedule_generator = sparse_schedule_generator(speed_ration_map)

    observation_builder = GraphObsForRailEnv(
        predictor=ShortestPathPredictorForRailEnv(
            max_depth=args.prediction_depth))

    env = RailEnv(
        width=args.width,
        height=args.height,
        rail_generator=sparse_rail_generator(
            max_num_cities=args.max_num_cities,
            seed=ep,  # Use episode as seed when evaluation is performed during training
            grid_mode=args.grid_mode,
            max_rails_between_cities=args.max_rails_between_cities,
            max_rails_in_city=args.max_rails_in_city,
        ),
        schedule_generator=schedule_generator,
        number_of_agents=args.num_agents,
        obs_builder_object=observation_builder,
        malfunction_generator_and_process_data=malfunction_from_params(
            parameters={
                'malfunction_rate': args.malfunction_rate,
                'min_duration': args.min_duration,
                'max_duration': args.max_duration
            }),
    )

    if args.render:
        env_renderer = RenderTool(env,
                                  gl="PILSVG",
                                  agent_render_variant=AgentRenderVariant.AGENT_SHOWS_OPTIONS_AND_BOX,
                                  show_debug=True,
                                  screen_height=1080,
                                  screen_width=1920)

    #max_time_steps = env.compute_max_episode_steps(env.width, env.height)
    max_time_steps = 200  # TODO Debug
    # metrics['steps'].append(T)
    metrics['episodes'].append(ep)
    T_rewards = []  # List of episodes rewards
    T_Qs = []  # List of Q values
    T_num_done_agents = []  # List of number of done agents for each episode
    T_all_done = []  # If all agents completed in each episode
    network_action_dict = dict()
    railenv_action_dict = dict()
    qvalues = {}

    # Test performance over several episodes
    for ep in range(args.evaluation_episodes):
        # Reset info
        state, info = env.reset()
        reward_sum, all_done = 0, False  # reward_sum accumulates the total reward over the episode's steps
        num_done_agents = 0
        if args.render:
            env_renderer.reset()

        # Choose first action - decide which agents enter the environment
        for a in range(env.get_num_agents()):
            action = np.random.choice((0, 2))
            railenv_action_dict.update({a: action})
        state, reward, done, info = env.step(railenv_action_dict)  # Env step
        reward_sum += sum(reward[a] for a in range(env.get_num_agents()))

        if args.render:
            env_renderer.render_env(show=True,
                                    show_observations=False,
                                    show_predictions=True)

        for step in range(max_time_steps - 1):
            # Choose actions
            for a in range(env.get_num_agents()):
                if info['action_required'][a]:
                    network_action = dqn.act(
                        state[a]
                    )  # Choose an action greedily (with noisy weights)
                    # network_action = 0
                    railenv_action = observation_builder.choose_railenv_action(
                        a, network_action)
                    qvalues.update({a: dqn.get_q_values(state[a])})
                else:
                    network_action = 0
                    railenv_action = 0
                    qvalues.update({a: [0, 0]})  # '0' if wasn't updated

                railenv_action_dict.update({a: railenv_action})
                network_action_dict.update({a: network_action})

            if args.debug:
                for a in range(env.get_num_agents()):
                    print('#########################################')
                    print('Info for agent {}'.format(a))
                    print('Occupancy, first layer: {}'.format(
                        state[a][:args.prediction_depth]))
                    print('Occupancy, second layer: {}'.format(
                        state[a][args.prediction_depth:args.prediction_depth *
                                 2]))
                    print('Forks: {}'.format(
                        state[a][args.prediction_depth *
                                 2:args.prediction_depth * 3]))
                    print('Target: {}'.format(
                        state[a][args.prediction_depth *
                                 3:args.prediction_depth * 4]))
                    print('Priority: {}'.format(
                        state[a][args.prediction_depth * 4]))
                    print('Max priority encountered: {}'.format(
                        state[a][args.prediction_depth * 4 + 1]))
                    print('Num malfunctioning agents (globally): {}'.format(
                        state[a][args.prediction_depth * 4 + 2]))
                    print('Num agents ready to depart (globally): {}'.format(
                        state[a][args.prediction_depth * 4 + 3]))
                    print('Status: {}'.format(info['status'][a]))
                    print('Position: {}'.format(env.agents[a].position))
                    print('Moving? {} at speed: {}'.format(
                        env.agents[a].moving, info['speed'][a]))
                    print('Action required? {}'.format(
                        info['action_required'][a]))
                    print('Network action: {}'.format(network_action_dict[a]))
                    print('Railenv action: {}'.format(railenv_action_dict[a]))
                    print('Q values: {}'.format(qvalues[a]))
                    # print('QValues: {}'.format(qvalues))
                    print('Rewards: {}'.format(reward[a]))

            # Breakpoint for debugging here
            state, reward, done, info = env.step(
                railenv_action_dict)  # Env step
            if args.render:
                env_renderer.render_env(show=True,
                                        show_observations=False,
                                        show_predictions=True)

            reward_sum += sum(reward[a] for a in range(env.get_num_agents()))

            if done['__all__']:
                all_done = True
                break
        # No need to close the renderer since env parameter sizes stay the same
        T_rewards.append(reward_sum)
        # Compute num of agents that reached their target
        for a in range(env.get_num_agents()):
            if done[a]:
                num_done_agents += 1
        T_num_done_agents.append(
            num_done_agents / env.get_num_agents())  # In proportion to total
        T_all_done.append(all_done)

    # Test Q-values over validation memory
    for state in val_mem:  # Iterate over validation states
        T_Qs.append(dqn.evaluate_q(state))
    if args.debug:
        print('T_Qs: {}'.format(T_Qs))  # These are Qs from a single agent TODO

    avg_done_agents = sum(T_num_done_agents) / len(
        T_num_done_agents
    )  # Average number of agents that reached their target
    avg_reward = sum(T_rewards) / len(T_rewards)
    avg_norm_reward = avg_reward / (max_time_steps / env.get_num_agents())

    # avg_reward, avg_Q = sum(T_rewards) / len(T_rewards), sum(T_Qs) / len(T_Qs)
    if not evaluate:
        # Save model parameters if improved
        if avg_done_agents > metrics['best_avg_done_agents']:
            metrics['best_avg_done_agents'] = avg_done_agents
            dqn.save(results_dir)

        # Append to results and save metrics
        metrics['rewards'].append(T_rewards)
        metrics['Qs'].append(T_Qs)
        torch.save(metrics, os.path.join(results_dir, 'metrics.pth'))

        # Plot HTML
        _plot_line(metrics['episodes'],
                   metrics['rewards'],
                   'Reward',
                   path=results_dir)  # Plot rewards in episodes
        _plot_line(metrics['episodes'], metrics['Qs'], 'Q', path=results_dir)

    # Return average number of done agents (in proportion) and average reward
    return avg_done_agents, avg_reward, avg_norm_reward
def evalfun(num_samples=100, timed=True, debug=False, refresh=0.1):

    # A list of (mapsize, agent count) tuples, change or extend this to test different sizes.
    #problemsizes = [(5, 3), (7, 4), (9, 5), (11, 6), (13, 7)]
    problemsizes = [(25, 4)]
    # Create a list of seeds to consider.
    seeds = numpy.random.randint(2**29, size=3*num_samples)

    print("%10s\t%8s\t%8s\t%9s" % ("Dimensions", "Success", "Rewards", "Runtime"))
    for problemsize in problemsizes:
        j = 0
        for _ in range(0, num_samples):

            # Create environments while they are not the intended dimension.
            env = create_multi_agent_environment(problemsize[0], problemsize[1], timed, seeds[j])
            j = j + 1
            while len(env.agents) != problemsize[1]:
                env = create_multi_agent_environment(problemsize[0], problemsize[1], timed, seeds[j])
                j = j + 1

            # Create a renderer only if in debug mode.
            if debug:
                env_renderer = RenderTool(env, screen_width=1920, screen_height=1080)

            # Time the search.
            for i in range(len(env.agents)):
                print("Agent ID: {} Initial Position: ({},{}) Release Date {} Deadline {}".format(
                    i, env.agents[i].initial_position[0], env.agents[i].initial_position[1],
                    env.agents[i].release_date, env.agents[i].deadline))

            start = time.time()
            # Task 1: prioritized planning
            a_schedule = prioritized_planning(env)

            # Task 3: improved prioritized planning (genetic algorithm)
            # Uncomment to run task 3
            # a_schedule = genetic_algorithm(env)

            duration = time.time() - start

            schedule = a_schedule
            print(schedule)
            if debug:
                env_renderer.render_env(show=True, frames=False, show_observations=False)
                time.sleep(refresh)

            # Validate that environment state is unchanged.
            assert env.num_resets == 1 and env._elapsed_steps == 0

            # Run the schedule
            success = False
            sumreward = 0
            for action in schedule:
                _, _reward_dict, _done, _ = env.step(action)
                success = all(_done.values())
                sumreward = sumreward + sum(_reward_dict.values())
                if debug:
                    #print(action)
                    env_renderer.render_env(show=True, frames=False, show_observations=False)
                    time.sleep(refresh)

            # Print the performance of the algorithm
            print("%10s\t%8s\t%8.3f\t%9.6f" % (str(problemsize), str(success), sumreward, duration))
                action = agent.act(state=norm_obs, eps=eps)
                action_dict.update({_idx: action})

        # Environment executes action and returns
        #     1. next observations for all agents
        #     2. corresponding rewards for all agents
        #     3. status if the agents are done
        #     4. information about actions, malfunction, speed and status
        next_obs, all_rewards, done, info = env.step(action_dict)
        for _idx in range(n_agents):
            if not done[_idx]:
                next_norm_obs = normalize_observation(next_obs[_idx], tree_depth=tree_depth)
                agent.remember((norm_obs, action_dict[_idx], all_rewards[_idx], next_norm_obs, done[_idx]))

        # Render the environment -> show me what you got!
        env_renderer.render_env(show=True, show_observations=True)

        #  Prepare for new step and stop if agent is done
        obs = next_obs.copy()
        if done["__all__"]:
            break

    # Train the agent
    if len(agent.memory) > batch_size:
        agent.step(batch_size)
    score += all_rewards[0]

    # Epsilon decay
    eps = max(eps_end, eps_decay * eps)

    # Copy weights from Q' to Q
def main(argv):
    try:
        opts, args = getopt.getopt(argv, "n:", ["n_trials="])
    except getopt.GetoptError:
        print('test_navigation_single_agent.py -n <n_trials>')
        sys.exit(2)
    for opt, arg in opts:
        if opt in ('-n', '--n_trials'):
            n_trials = int(arg)

    random.seed(1)
    np.random.seed(1)

    ######## TEST SET SELECTION - PARAMETERS ########
    
    test_multi_agent_setup = 1             # 1 for Medium size test, 2 for Big size test
    test_n_agents = 5                      # Number of agents to test (3 - 5 - 7 for Medium, 5 - 7 - 10 for Big)
    test_malfunctions_enabled = True       # Malfunctions enabled?
    test_agents_one_speed = True           # Test agents with the same speed (1) or with 4 different speeds?

    #################################################

    # Medium size
    if test_multi_agent_setup == 1:
        x_dim = 16*3
        y_dim = 9*3
        max_num_cities = 5
        max_rails_between_cities = 2
        max_rails_in_city = 3

    # Big size
    if test_multi_agent_setup == 2:
        x_dim = 16*4
        y_dim = 9*4
        max_num_cities = 9
        max_rails_between_cities = 5
        max_rails_in_city = 5


    stochastic_data = {'malfunction_rate': 80,  # Rate of malfunction occurrence of a single agent
                       'min_duration': 15,  # Minimal duration of malfunction
                       'max_duration': 50  # Max duration of malfunction
                       }

    # Custom observation builder
    tree_depth = 2
    TreeObservation = TreeObsForRailEnv(max_depth=tree_depth, predictor = ShortestPathPredictorForRailEnv(20))

    np.savetxt(fname=path.join('NetsTest' , 'info.txt'), X=[x_dim,y_dim,test_n_agents,max_num_cities,max_rails_between_cities,max_rails_in_city,tree_depth],delimiter=';')

    # Different agent types (trains) with different speeds.
    if test_agents_one_speed:
        speed_ration_map = {1.: 1.,  # Fast passenger train
                            1. / 2.: 0.0,  # Fast freight train
                            1. / 3.: 0.0,  # Slow commuter train
                            1. / 4.: 0.0}  # Slow freight train
    else:
        speed_ration_map = {1.: 0.25,  # Fast passenger train
                            1. / 2.: 0.25,  # Fast freight train
                            1. / 3.: 0.25,  # Slow commuter train
                            1. / 4.: 0.25}  # Slow freight train

    
    if test_malfunctions_enabled:
        env = RailEnv(width=x_dim,
                      height=y_dim,
                      rail_generator=sparse_rail_generator(
                          max_num_cities=max_num_cities,  # Number of cities in map (where train stations are)
                          seed=14,  # Random seed
                          grid_mode=False,
                          max_rails_between_cities=max_rails_between_cities,
                          max_rails_in_city=max_rails_in_city),
                      schedule_generator=sparse_schedule_generator(speed_ration_map),
                      malfunction_generator_and_process_data=malfunction_from_params(stochastic_data),
                      number_of_agents=test_n_agents,
                      obs_builder_object=TreeObservation)
    else:
        env = RailEnv(width=x_dim,
                      height=y_dim,
                      rail_generator=sparse_rail_generator(
                          max_num_cities=max_num_cities,  # Number of cities in map (where train stations are)
                          seed=14,  # Random seed
                          grid_mode=False,
                          max_rails_between_cities=max_rails_between_cities,
                          max_rails_in_city=max_rails_in_city),
                      schedule_generator=sparse_schedule_generator(speed_ration_map),
                      number_of_agents=test_n_agents,
                      obs_builder_object=TreeObservation)
    
    env.reset()

    #env_renderer = RenderTool(env, gl="PILSVG", )
    env_renderer = RenderTool(env, gl="PILSVG",
                          agent_render_variant=AgentRenderVariant.AGENT_SHOWS_OPTIONS_AND_BOX,
                          show_debug=False,
                          screen_height=(1080*0.8),  # Adjust these parameters to fit your resolution
                          screen_width=(1920*0.8))
    num_features_per_node = env.obs_builder.observation_dim

    
    nr_nodes = 0
    for i in range(tree_depth + 1):
        nr_nodes += np.power(4, i)
    state_size = num_features_per_node * nr_nodes
    action_size = 5

    # We set the number of episodes we would like to train on
    if 'n_trials' not in locals():
        n_trials = 15000
    
    # max_steps computation
    speed_weighted_mean = 0

    for key in speed_ration_map.keys():
        speed_weighted_mean += key * speed_ration_map[key]
    
    #max_steps = int(3 * (env.height + env.width))
    max_steps = int((1/speed_weighted_mean) * 3 * (env.height + env.width))
    #eps = 1.
    #eps_end = 0.005
    #eps_decay = 0.9995

    # And some variables to keep track of the performance
    action_dict = dict()
    final_action_dict = dict()
    action_prob_list = []
    scores_window = deque(maxlen=100)
    done_window = deque(maxlen=100)
    scores = []
    scores_list = []
    deadlock_list =[]
    dones_list_window = []
    dones_list = []
    action_prob = [0] * action_size
    agent_obs = [None] * env.get_num_agents()
    agent_next_obs = [None] * env.get_num_agents() # Useless
    agent = Agent(state_size, action_size)
    
    # LOAD MODEL WEIGHTS TO TEST
    agent.qnetwork_local.load_state_dict(torch.load(path.join('NetsTest' , 'navigator_checkpoint3800_multi10_deadlock_global10.pth')))

    record_images = False
    frame_step = 0

    for trials in range(1, n_trials + 1):

        # Reset environment
        obs, info = env.reset()  # (True, True)
        env_renderer.reset()
        # Build agent specific observations
        for a in range(env.get_num_agents()):
            agent_obs[a] = normalize_observation(obs[a], tree_depth, observation_radius=10)
        # Reset score and done
        score = 0
        env_done = 0

        # Run episode
        for step in range(max_steps):

            # Action
            for a in range(env.get_num_agents()):
                if info['action_required'][a]:
                    action = agent.act(agent_obs[a], eps=0.)
                    action_prob[action] += 1

                else:
                    action = 0

                action_dict.update({a: action})
            # Environment step
            obs, all_rewards, done, deadlocks, info = env.step(action_dict)

            env_renderer.render_env(show=True, show_predictions=True, show_observations=False)
            # Build agent specific observations and normalize
            for a in range(env.get_num_agents()):
                if obs[a]:
                    agent_obs[a] = normalize_observation(obs[a], tree_depth, observation_radius=10)

                score += all_rewards[a] / env.get_num_agents()


            if done['__all__']:
                break

        # Collection information about training
        tasks_finished = 0
        for _idx in range(env.get_num_agents()):
            if done[_idx] == 1:
                tasks_finished += 1
        done_window.append(tasks_finished / max(1, env.get_num_agents()))
        scores_window.append(score / max_steps)  # save most recent score
        scores.append(np.mean(scores_window))
        dones_list.append(tasks_finished / max(1, env.get_num_agents()))
        dones_list_window.append((np.mean(done_window)))
        scores_list.append(score / max_steps)
        deadlock_list.append(deadlocks.count(1)/max(1, env.get_num_agents()))

        if (np.sum(action_prob) == 0):
            action_prob_normalized = [0] * action_size
        else:
            action_prob_normalized = action_prob / np.sum(action_prob)



        print(
                '\rTesting {} Agents on ({},{}).\t Episode {}\t Score: {:.3f}\tDones: {:.2f}%\tDeadlocks: {:.2f}\t Action Probabilities: \t {}'.format(
                    env.get_num_agents(), x_dim, y_dim,
                    trials,
                    score / max_steps,
                    100 * tasks_finished / max(1, env.get_num_agents()),
                    deadlocks.count(1)/max(1, env.get_num_agents()),
                    action_prob_normalized), end=" ")

        #if trials % 100 == 0:
        action_prob_list.append(action_prob_normalized)
        action_prob = [0] * action_size

        if trials % 50 == 0:

            np.savetxt(fname=path.join('NetsTest' , 'test_metrics.csv'), X=np.transpose(np.asarray([scores_list,scores,dones_list,dones_list_window,deadlock_list])), delimiter=';',newline='\n')
            np.savetxt(fname=path.join('NetsTest' , 'test_action_prob.csv'), X=np.asarray(action_prob_list), delimiter=';',newline='\n')
Example #22
def train_agent(env_params, train_params):
    # Environment parameters
    n_agents = env_params.n_agents
    x_dim = env_params.x_dim
    y_dim = env_params.y_dim
    n_cities = env_params.n_cities
    max_rails_between_cities = env_params.max_rails_between_cities
    max_rails_in_city = env_params.max_rails_in_city
    seed = env_params.seed

    # Observation parameters
    observation_tree_depth = env_params.observation_tree_depth
    observation_radius = env_params.observation_radius
    observation_max_path_depth = env_params.observation_max_path_depth

    # Training parameters
    eps_start = train_params.eps_start
    eps_end = train_params.eps_end
    eps_decay = train_params.eps_decay
    n_episodes = train_params.n_episodes
    checkpoint_interval = train_params.checkpoint_interval
    n_eval_episodes = train_params.n_evaluation_episodes

    # Set the seeds
    random.seed(seed)
    np.random.seed(seed)

    # Break agents from time to time
    malfunction_parameters = MalfunctionParameters(
        malfunction_rate=1. / 10000,  # Rate of malfunctions
        min_duration=15,  # Minimal duration
        max_duration=50  # Max duration
    )

    # Observation builder
    predictor = ShortestPathPredictorForRailEnv(observation_max_path_depth)
    tree_observation = TreeObsForRailEnv(max_depth=observation_tree_depth,
                                         predictor=predictor)

    # Fraction of trains with each speed profile
    speed_profiles = {
        1.: 1.0,  # Fast passenger train
        1. / 2.: 0.0,  # Fast freight train
        1. / 3.: 0.0,  # Slow commuter train
        1. / 4.: 0.0  # Slow freight train
    }

    # Setup the environment
    env = RailEnv(
        width=x_dim,
        height=y_dim,
        rail_generator=sparse_rail_generator(
            max_num_cities=n_cities,
            grid_mode=False,
            max_rails_between_cities=max_rails_between_cities,
            max_rails_in_city=max_rails_in_city),
        schedule_generator=sparse_schedule_generator(speed_profiles),
        number_of_agents=n_agents,
        malfunction_generator_and_process_data=malfunction_from_params(
            malfunction_parameters),
        obs_builder_object=tree_observation,
        random_seed=seed)

    env.reset(regenerate_schedule=True, regenerate_rail=True)

    # Setup renderer
    if train_params.render:
        env_renderer = RenderTool(env, gl="PGL")

    # Calculate the state size given the depth of the tree observation and the number of features
    n_features_per_node = env.obs_builder.observation_dim
    n_nodes = 0
    for i in range(observation_tree_depth + 1):
        n_nodes += np.power(4, i)
    state_size = n_features_per_node * n_nodes

    # The action space of flatland is 5 discrete actions
    action_size = 5

    # Max number of steps per episode
    # This is the official formula used during evaluations
    # See details in flatland.envs.schedule_generators.sparse_schedule_generator
    max_steps = int(4 * 2 * (env.height + env.width + (n_agents / n_cities)))

    action_count = [0] * action_size
    action_dict = dict()
    agent_obs = [None] * env.get_num_agents()
    agent_prev_obs = [None] * env.get_num_agents()
    agent_prev_action = [2] * env.get_num_agents()
    update_values = False
    smoothed_normalized_score = -1.0
    smoothed_eval_normalized_score = -1.0
    smoothed_completion = 0.0
    smoothed_eval_completion = 0.0

    # Double Dueling DQN policy
    policy = DDDQNPolicy(state_size, action_size, train_params)

    # TensorBoard writer
    writer = SummaryWriter()
    writer.add_hparams(vars(train_params), {})
    writer.add_hparams(vars(env_params), {})

    training_timer = Timer()
    training_timer.start()

    print(
        "\n🚉 Training {} trains on {}x{} grid for {} episodes, evaluating on {} episodes every {} episodes.\n"
        .format(env.get_num_agents(), x_dim, y_dim, n_episodes,
                n_eval_episodes, checkpoint_interval))

    for episode_idx in range(n_episodes + 1):
        # Timers
        step_timer = Timer()
        reset_timer = Timer()
        learn_timer = Timer()
        preproc_timer = Timer()

        # Reset environment
        reset_timer.start()
        obs, info = env.reset(regenerate_rail=True, regenerate_schedule=True)
        reset_timer.end()

        if train_params.render:
            env_renderer.set_new_rail()

        score = 0
        nb_steps = 0
        actions_taken = []

        # Build agent specific observations
        for agent in env.get_agent_handles():
            if obs[agent]:
                agent_obs[agent] = normalize_observation(
                    obs[agent],
                    observation_tree_depth,
                    observation_radius=observation_radius)
                agent_prev_obs[agent] = agent_obs[agent].copy()

        # Run episode
        for step in range(max_steps - 1):
            for agent in env.get_agent_handles():
                if info['action_required'][agent]:
                    # If an action is required, we want to store the obs at that step as well as the action
                    update_values = True
                    action = policy.act(agent_obs[agent], eps=eps_start)
                    action_count[action] += 1
                    actions_taken.append(action)
                else:
                    update_values = False
                    action = 0
                action_dict.update({agent: action})

            # Environment step
            step_timer.start()
            next_obs, all_rewards, done, info = env.step(action_dict)
            step_timer.end()

            if train_params.render and episode_idx % checkpoint_interval == 0:
                env_renderer.render_env(show=True,
                                        frames=False,
                                        show_observations=False,
                                        show_predictions=False)

            for agent in range(env.get_num_agents()):
                # Update replay buffer and train agent
                # Only update the values when we are done or when an action was taken and thus relevant information is present
                if update_values or done[agent]:
                    learn_timer.start()
                    policy.step(agent_prev_obs[agent],
                                agent_prev_action[agent], all_rewards[agent],
                                agent_obs[agent], done[agent])
                    learn_timer.end()

                    agent_prev_obs[agent] = agent_obs[agent].copy()
                    agent_prev_action[agent] = action_dict[agent]

                # Preprocess the new observations
                if next_obs[agent]:
                    preproc_timer.start()
                    agent_obs[agent] = normalize_observation(
                        next_obs[agent],
                        observation_tree_depth,
                        observation_radius=observation_radius)
                    preproc_timer.end()

                score += all_rewards[agent]

            nb_steps = step

            if done['__all__']:
                break

        # Epsilon decay
        eps_start = max(eps_end, eps_decay * eps_start)

        # Collect information about training
        tasks_finished = sum(done[idx] for idx in env.get_agent_handles())
        completion = tasks_finished / max(1, env.get_num_agents())
        normalized_score = score / (max_steps * env.get_num_agents())
        action_probs = action_count / np.sum(action_count)
        action_count = [1] * action_size

        # Smoothed values for terminal display and for more stable hyper-parameter tuning
        smoothing = 0.99
        smoothed_normalized_score = smoothed_normalized_score * smoothing + normalized_score * (
            1.0 - smoothing)
        smoothed_completion = smoothed_completion * smoothing + completion * (
            1.0 - smoothing)

        # Print logs
        if episode_idx % checkpoint_interval == 0:
            torch.save(
                policy.qnetwork_local,
                './checkpoints/origin_multi-' + str(episode_idx) + '.pth')
            if train_params.render:
                env_renderer.close_window()

        print('\r🚂 Episode {}'
              '\t 🏆 Score: {:.3f}'
              ' Avg: {:.3f}'
              '\t 💯 Done: {:.2f}%'
              ' Avg: {:.2f}%'
              '\t 🎲 Epsilon: {:.2f} '
              '\t 🔀 Action Probs: {}'.format(episode_idx, normalized_score,
                                             smoothed_normalized_score,
                                             100 * completion,
                                             100 * smoothed_completion,
                                             eps_start,
                                             format_action_prob(action_probs)),
              end=" ")

        # Evaluate policy
        if episode_idx % train_params.checkpoint_interval == 0:
            scores, completions, nb_steps_eval = eval_policy(
                env, policy, n_eval_episodes, max_steps)
            writer.add_scalar("evaluation/scores_min", np.min(scores),
                              episode_idx)
            writer.add_scalar("evaluation/scores_max", np.max(scores),
                              episode_idx)
            writer.add_scalar("evaluation/scores_mean", np.mean(scores),
                              episode_idx)
            writer.add_scalar("evaluation/scores_std", np.std(scores),
                              episode_idx)
            writer.add_histogram("evaluation/scores", np.array(scores),
                                 episode_idx)
            writer.add_scalar("evaluation/completions_min",
                              np.min(completions), episode_idx)
            writer.add_scalar("evaluation/completions_max",
                              np.max(completions), episode_idx)
            writer.add_scalar("evaluation/completions_mean",
                              np.mean(completions), episode_idx)
            writer.add_scalar("evaluation/completions_std",
                              np.std(completions), episode_idx)
            writer.add_histogram("evaluation/completions",
                                 np.array(completions), episode_idx)
            writer.add_scalar("evaluation/nb_steps_min", np.min(nb_steps_eval),
                              episode_idx)
            writer.add_scalar("evaluation/nb_steps_max", np.max(nb_steps_eval),
                              episode_idx)
            writer.add_scalar("evaluation/nb_steps_mean",
                              np.mean(nb_steps_eval), episode_idx)
            writer.add_scalar("evaluation/nb_steps_std", np.std(nb_steps_eval),
                              episode_idx)
            writer.add_histogram("evaluation/nb_steps",
                                 np.array(nb_steps_eval), episode_idx)

            smoothing = 0.9
            smoothed_eval_normalized_score = smoothed_eval_normalized_score * smoothing + np.mean(
                scores) * (1.0 - smoothing)
            smoothed_eval_completion = smoothed_eval_completion * smoothing + np.mean(
                completions) * (1.0 - smoothing)
            writer.add_scalar("evaluation/smoothed_score",
                              smoothed_eval_normalized_score, episode_idx)
            writer.add_scalar("evaluation/smoothed_completion",
                              smoothed_eval_completion, episode_idx)

        # Save logs to tensorboard
        writer.add_scalar("training/score", normalized_score, episode_idx)
        writer.add_scalar("training/smoothed_score", smoothed_normalized_score,
                          episode_idx)
        writer.add_scalar("training/completion", np.mean(completion),
                          episode_idx)
        writer.add_scalar("training/smoothed_completion",
                          np.mean(smoothed_completion), episode_idx)
        writer.add_scalar("training/nb_steps", nb_steps, episode_idx)
        writer.add_histogram("actions/distribution", np.array(actions_taken),
                             episode_idx)
        writer.add_scalar("actions/nothing",
                          action_probs[RailEnvActions.DO_NOTHING], episode_idx)
        writer.add_scalar("actions/left",
                          action_probs[RailEnvActions.MOVE_LEFT], episode_idx)
        writer.add_scalar("actions/forward",
                          action_probs[RailEnvActions.MOVE_FORWARD],
                          episode_idx)
        writer.add_scalar("actions/right",
                          action_probs[RailEnvActions.MOVE_RIGHT], episode_idx)
        writer.add_scalar("actions/stop",
                          action_probs[RailEnvActions.STOP_MOVING],
                          episode_idx)
        writer.add_scalar("training/epsilon", eps_start, episode_idx)
        writer.add_scalar("training/buffer_size", len(policy.memory),
                          episode_idx)
        writer.add_scalar("training/loss", policy.loss, episode_idx)
        writer.add_scalar("timer/reset", reset_timer.get(), episode_idx)
        writer.add_scalar("timer/step", step_timer.get(), episode_idx)
        writer.add_scalar("timer/learn", learn_timer.get(), episode_idx)
        writer.add_scalar("timer/preproc", preproc_timer.get(), episode_idx)
        writer.add_scalar("timer/total", training_timer.get_current(),
                          episode_idx)
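
The node count used for the state size at the top of this excerpt is a geometric series over the quaternary observation tree, so it can also be computed in closed form. A minimal sketch (the helper name is illustrative, not part of the training script above):

def tree_obs_state_size(n_features_per_node, tree_depth):
    # A tree with branching factor 4 and depth d has sum_{i=0..d} 4^i = (4**(d+1) - 1) / 3 nodes,
    # e.g. 1 + 4 + 16 = 21 nodes for tree_depth = 2.
    n_nodes = (4 ** (tree_depth + 1) - 1) // 3
    return n_features_per_node * n_nodes
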
Exemple #23
0
def train_agent(train_params, train_env_params, eval_env_params, obs_params):
    # Environment parameters
    n_agents = train_env_params.n_agents
    x_dim = train_env_params.x_dim
    y_dim = train_env_params.y_dim
    n_cities = train_env_params.n_cities
    max_rails_between_cities = train_env_params.max_rails_between_cities
    max_rails_in_city = train_env_params.max_rails_in_city
    seed = train_env_params.seed

    # Unique ID for this training
    now = datetime.now()
    training_id = now.strftime('%y%m%d%H%M%S')

    # Observation parameters
    observation_tree_depth = obs_params.observation_tree_depth
    observation_radius = obs_params.observation_radius
    observation_max_path_depth = obs_params.observation_max_path_depth

    # Training parameters
    eps_start = train_params.eps_start
    eps_end = train_params.eps_end
    eps_decay = train_params.eps_decay
    n_episodes = train_params.n_episodes
    checkpoint_interval = train_params.checkpoint_interval
    n_eval_episodes = train_params.n_evaluation_episodes
    restore_replay_buffer = train_params.restore_replay_buffer
    save_replay_buffer = train_params.save_replay_buffer

    # Set the seeds
    random.seed(seed)
    np.random.seed(seed)

    # Observation builder
    predictor = ShortestPathPredictorForRailEnv(observation_max_path_depth)
    tree_observation = TreeObsForRailEnv(max_depth=observation_tree_depth,
                                         predictor=predictor)

    # Setup the environments
    train_env = create_rail_env(train_env_params, tree_observation)
    train_env.reset(regenerate_schedule=True, regenerate_rail=True)
    eval_env = create_rail_env(eval_env_params, tree_observation)
    eval_env.reset(regenerate_schedule=True, regenerate_rail=True)

    # Setup renderer
    if train_params.render:
        env_renderer = RenderTool(train_env, gl="PGL")

    # Calculate the state size given the depth of the tree observation and the number of features
    n_features_per_node = train_env.obs_builder.observation_dim
    n_nodes = sum([np.power(4, i) for i in range(observation_tree_depth + 1)])
    state_size = n_features_per_node * n_nodes

    # The action space of flatland is 5 discrete actions
    action_size = 5

    # Max number of steps per episode
    # This is the official formula used during evaluations
    # See details in flatland.envs.schedule_generators.sparse_schedule_generator
    # max_steps = int(4 * 2 * (env.height + env.width + (n_agents / n_cities)))
    max_steps = train_env._max_episode_steps

    action_count = [0] * action_size
    action_dict = dict()
    agent_obs = [None] * n_agents
    agent_prev_obs = [None] * n_agents
    agent_prev_action = [2] * n_agents
    update_values = [False] * n_agents

    # Smoothed values used as target for hyperparameter tuning
    smoothed_normalized_score = -1.0
    smoothed_eval_normalized_score = -1.0
    smoothed_completion = 0.0
    smoothed_eval_completion = 0.0

    # Double Dueling DQN policy
    policy = DDDQNPolicy(state_size, action_size, train_params)

    # Loads existing replay buffer
    if restore_replay_buffer:
        try:
            policy.load_replay_buffer(restore_replay_buffer)
            policy.test()
        except RuntimeError as e:
            print(
                "\n🛑 Could't load replay buffer, were the experiences generated using the same tree depth?"
            )
            print(e)
            exit(1)

    print("\n💾 Replay buffer status: {}/{} experiences".format(
        len(policy.memory.memory), train_params.buffer_size))

    hdd = psutil.disk_usage('/')
    if save_replay_buffer and (hdd.free / (2**30)) < 500.0:
        print(
            "⚠️  Careful! Saving replay buffers will quickly consume a lot of disk space. You have {:.2f}gb left."
            .format(hdd.free / (2**30)))

    # TensorBoard writer
    writer = SummaryWriter()
    writer.add_hparams(vars(train_params), {})
    writer.add_hparams(vars(train_env_params), {})
    writer.add_hparams(vars(obs_params), {})

    training_timer = Timer()
    training_timer.start()

    print(
        "\n🚉 Training {} trains on {}x{} grid for {} episodes, evaluating on {} episodes every {} episodes. Training id '{}'.\n"
        .format(train_env.get_num_agents(), x_dim, y_dim, n_episodes,
                n_eval_episodes, checkpoint_interval, training_id))

    for episode_idx in range(n_episodes + 1):
        step_timer = Timer()
        reset_timer = Timer()
        learn_timer = Timer()
        preproc_timer = Timer()
        inference_timer = Timer()

        # Reset environment
        reset_timer.start()
        obs, info = train_env.reset(regenerate_rail=True,
                                    regenerate_schedule=True)
        reset_timer.end()

        if train_params.render:
            env_renderer.set_new_rail()

        score = 0
        nb_steps = 0
        actions_taken = []

        # Build initial agent-specific observations
        for agent in train_env.get_agent_handles():
            if obs[agent]:
                agent_obs[agent] = normalize_observation(
                    obs[agent],
                    observation_tree_depth,
                    observation_radius=observation_radius)
                agent_prev_obs[agent] = agent_obs[agent].copy()

        # Run episode
        for step in range(max_steps - 1):
            inference_timer.start()
            for agent in train_env.get_agent_handles():
                if info['action_required'][agent]:
                    update_values[agent] = True
                    action = policy.act(agent_obs[agent], eps=eps_start)

                    action_count[action] += 1
                    actions_taken.append(action)
                else:
                    # An action is not required if the train hasn't joined the railway network,
                    # if it already reached its target, or if it is currently malfunctioning.
                    update_values[agent] = False
                    action = 0
                action_dict.update({agent: action})
            inference_timer.end()

            # Environment step
            step_timer.start()
            next_obs, all_rewards, done, info = train_env.step(action_dict)
            step_timer.end()

            # Render an episode at some interval
            if train_params.render and episode_idx % checkpoint_interval == 0:
                env_renderer.render_env(show=True,
                                        frames=False,
                                        show_observations=False,
                                        show_predictions=False)

            # Update replay buffer and train agent
            for agent in train_env.get_agent_handles():
                if update_values[agent] or done['__all__']:
                    # Only learn from timesteps where something happened
                    learn_timer.start()
                    policy.step(agent_prev_obs[agent],
                                agent_prev_action[agent], all_rewards[agent],
                                agent_obs[agent], done[agent])
                    learn_timer.end()

                    agent_prev_obs[agent] = agent_obs[agent].copy()
                    agent_prev_action[agent] = action_dict[agent]

                # Preprocess the new observations
                if next_obs[agent]:
                    preproc_timer.start()
                    agent_obs[agent] = normalize_observation(
                        next_obs[agent],
                        observation_tree_depth,
                        observation_radius=observation_radius)
                    preproc_timer.end()

                score += all_rewards[agent]

            nb_steps = step

            if done['__all__']:
                break

        # Epsilon decay
        eps_start = max(eps_end, eps_decay * eps_start)

        # Collect information about training
        tasks_finished = sum(done[idx]
                             for idx in train_env.get_agent_handles())
        completion = tasks_finished / max(1, train_env.get_num_agents())
        normalized_score = score / (max_steps * train_env.get_num_agents())
        action_probs = action_count / np.sum(action_count)
        action_count = [1] * action_size

        smoothing = 0.99
        smoothed_normalized_score = smoothed_normalized_score * smoothing + normalized_score * (
            1.0 - smoothing)
        smoothed_completion = smoothed_completion * smoothing + completion * (
            1.0 - smoothing)

        # Print logs
        if episode_idx % checkpoint_interval == 0:
            torch.save(
                policy.qnetwork_local, './checkpoints/' + training_id + '-' +
                str(episode_idx) + '.pth')

            if save_replay_buffer:
                policy.save_replay_buffer('./replay_buffers/' + training_id +
                                          '-' + str(episode_idx) + '.pkl')

            if train_params.render:
                env_renderer.close_window()

        print('\r🚂 Episode {}'
              '\t 🏆 Score: {:.3f}'
              ' Avg: {:.3f}'
              '\t 💯 Done: {:.2f}%'
              ' Avg: {:.2f}%'
              '\t 🎲 Epsilon: {:.3f} '
              '\t 🔀 Action Probs: {}'.format(episode_idx, normalized_score,
                                             smoothed_normalized_score,
                                             100 * completion,
                                             100 * smoothed_completion,
                                             eps_start,
                                             format_action_prob(action_probs)),
              end=" ")

        # Evaluate policy and log results at some interval
        if episode_idx % checkpoint_interval == 0 and n_eval_episodes > 0:
            scores, completions, nb_steps_eval = eval_policy(
                eval_env, policy, train_params, obs_params)

            writer.add_scalar("evaluation/scores_min", np.min(scores),
                              episode_idx)
            writer.add_scalar("evaluation/scores_max", np.max(scores),
                              episode_idx)
            writer.add_scalar("evaluation/scores_mean", np.mean(scores),
                              episode_idx)
            writer.add_scalar("evaluation/scores_std", np.std(scores),
                              episode_idx)
            writer.add_histogram("evaluation/scores", np.array(scores),
                                 episode_idx)
            writer.add_scalar("evaluation/completions_min",
                              np.min(completions), episode_idx)
            writer.add_scalar("evaluation/completions_max",
                              np.max(completions), episode_idx)
            writer.add_scalar("evaluation/completions_mean",
                              np.mean(completions), episode_idx)
            writer.add_scalar("evaluation/completions_std",
                              np.std(completions), episode_idx)
            writer.add_histogram("evaluation/completions",
                                 np.array(completions), episode_idx)
            writer.add_scalar("evaluation/nb_steps_min", np.min(nb_steps_eval),
                              episode_idx)
            writer.add_scalar("evaluation/nb_steps_max", np.max(nb_steps_eval),
                              episode_idx)
            writer.add_scalar("evaluation/nb_steps_mean",
                              np.mean(nb_steps_eval), episode_idx)
            writer.add_scalar("evaluation/nb_steps_std", np.std(nb_steps_eval),
                              episode_idx)
            writer.add_histogram("evaluation/nb_steps",
                                 np.array(nb_steps_eval), episode_idx)

            smoothing = 0.9
            smoothed_eval_normalized_score = smoothed_eval_normalized_score * smoothing + np.mean(
                scores) * (1.0 - smoothing)
            smoothed_eval_completion = smoothed_eval_completion * smoothing + np.mean(
                completions) * (1.0 - smoothing)
            writer.add_scalar("evaluation/smoothed_score",
                              smoothed_eval_normalized_score, episode_idx)
            writer.add_scalar("evaluation/smoothed_completion",
                              smoothed_eval_completion, episode_idx)

        # Save logs to tensorboard
        writer.add_scalar("training/score", normalized_score, episode_idx)
        writer.add_scalar("training/smoothed_score", smoothed_normalized_score,
                          episode_idx)
        writer.add_scalar("training/completion", np.mean(completion),
                          episode_idx)
        writer.add_scalar("training/smoothed_completion",
                          np.mean(smoothed_completion), episode_idx)
        writer.add_scalar("training/nb_steps", nb_steps, episode_idx)
        writer.add_histogram("actions/distribution", np.array(actions_taken),
                             episode_idx)
        writer.add_scalar("actions/nothing",
                          action_probs[RailEnvActions.DO_NOTHING], episode_idx)
        writer.add_scalar("actions/left",
                          action_probs[RailEnvActions.MOVE_LEFT], episode_idx)
        writer.add_scalar("actions/forward",
                          action_probs[RailEnvActions.MOVE_FORWARD],
                          episode_idx)
        writer.add_scalar("actions/right",
                          action_probs[RailEnvActions.MOVE_RIGHT], episode_idx)
        writer.add_scalar("actions/stop",
                          action_probs[RailEnvActions.STOP_MOVING],
                          episode_idx)
        writer.add_scalar("training/epsilon", eps_start, episode_idx)
        writer.add_scalar("training/buffer_size", len(policy.memory),
                          episode_idx)
        writer.add_scalar("training/loss", policy.loss, episode_idx)
        writer.add_scalar("timer/reset", reset_timer.get(), episode_idx)
        writer.add_scalar("timer/step", step_timer.get(), episode_idx)
        writer.add_scalar("timer/learn", learn_timer.get(), episode_idx)
        writer.add_scalar("timer/preproc", preproc_timer.get(), episode_idx)
        writer.add_scalar("timer/total", training_timer.get_current(),
                          episode_idx)
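
A hypothetical way to call train_agent; the parameter namespaces below contain only the attributes the function visibly reads, and the concrete values are illustrative. In practice DDDQNPolicy will typically expect additional hyperparameters (learning rate, batch size, etc.) on train_params that are not shown here.

from argparse import Namespace

env_cfg = dict(n_agents=2, x_dim=25, y_dim=25, n_cities=2,
               max_rails_between_cities=2, max_rails_in_city=3)
train_env_params = Namespace(seed=42, **env_cfg)
eval_env_params = Namespace(seed=43, **env_cfg)
obs_params = Namespace(observation_tree_depth=2,
                       observation_radius=10,
                       observation_max_path_depth=20)
train_params = Namespace(eps_start=1.0, eps_end=0.01, eps_decay=0.99,
                         n_episodes=100, checkpoint_interval=50,
                         n_evaluation_episodes=5,
                         restore_replay_buffer=False, save_replay_buffer=False,
                         render=False, buffer_size=int(1e5))

train_agent(train_params, train_env_params, eval_env_params, obs_params)
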


n_agents = 5
env = RailEnv(width=25,
              height=25,
              rail_generator=sparse_rail_generator(),
              number_of_agents=n_agents,
              obs_builder_object=SingleAgentNavigationObs())

env_renderer = None

for _ in range(100):
    if env_renderer is not None:
        env_renderer.close_window()
    obs, _ = env.reset()
    env_renderer = RenderTool(env)
    env_renderer.render_env(show=True, frames=True, show_observations=False)
    done = {"__all__": False}
    while not done["__all__"]:
        action = {i: np.argmax(o) for i, o in obs.items()}
        obs, all_rewards, done, _ = env.step(action)
        print("Rewards: ", all_rewards, "  [done=", done, "]")
        print("Observations: ", obs)
        assert len(obs.keys()) == n_agents

        env_renderer.render_env(show=True,
                                frames=True,
                                show_observations=False)
        time.sleep(0.1)
def evaluate(n_episodes, rl_prio=True):
    agent = None
    if rl_prio:
        config, run = init_run()
        agent = get_agent(config, run)
        env = get_env(config, rl=True)
    else:
        env = get_env(rl=False)
    env_renderer = RenderTool(env, screen_width=8800)
    returns = []
    pcs = []
    malfs = []

    for _ in tqdm(range(n_episodes)):

        obs, _ = env.reset(regenerate_schedule=True, regenerate_rail=True)
        if RENDER:
            env_renderer.reset()
            env_renderer.render_env(show=True,
                                    frames=True,
                                    show_observations=False)

        if not obs:
            break

        steps = 0
        ep_return = 0
        done = defaultdict(lambda: False)
        robust_env = CprFlatlandGymEnv(rail_env=env,
                                       max_nr_active_agents=200,
                                       observation_space=None,
                                       priorizer=NrAgentsSameStart(),
                                       allow_noop=True)
        # if rl_prio:
        #     priorities = prio_agent.compute_actions(obs, explore=False)
        #     sorted_actions = {k: v for k, v in sorted(priorities.items(), key=lambda item: item[1], reverse=True)}
        #     sorted_handles = list(sorted_actions.keys())
        # else:
        sorted_handles = robust_env.priorizer.priorize(handles=list(
            obs.keys()),
                                                       rail_env=env)

        while not done['__all__']:
            actions = ShortestPathAgent().compute_actions(obs, env)
            robust_actions = robust_env.get_robust_actions(
                actions, sorted_handles)
            obs, all_rewards, done, info = env.step(robust_actions)
            if RENDER:
                env_renderer.render_env(show=True,
                                        frames=True,
                                        show_observations=False)
            print('.', end='', flush=True)
            steps += 1
            ep_return += np.sum(list(all_rewards.values()))

        pc = np.sum(np.array([1 for a in env.agents if is_done(a)
                              ])) / env.get_num_agents()
        print("EPISODE PC:", pc)
        n_episodes += 1
        pcs.append(pc)
        returns.append(ep_return /
                       (env._max_episode_steps * env.get_num_agents()))
        malfs.append(
            np.sum([a.malfunction_data['nr_malfunctions']
                    for a in env.agents]))
    return pcs, returns, malfs
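
A hypothetical way to aggregate the three lists returned by evaluate(); the episode count and the rl_prio=False branch are illustrative choices, and np as well as the env/agent helpers are assumed to come from the surrounding module:

pcs, returns, malfs = evaluate(n_episodes=10, rl_prio=False)
print("mean completion rate  : {:.3f}".format(np.mean(pcs)))
print("mean normalized return: {:.3f}".format(np.mean(returns)))
print("total malfunctions    : {}".format(int(np.sum(malfs))))
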
Exemple #26
0
def run_episode(kwargs) -> [Trajectory]:
    """
    Runs a single episode and collects the trajectories of each agent
    """
    total_controller_time = 0
    env_dict: Callable = kwargs.get("env_dict")
    obs_builder = kwargs.get("obs_builder")
    controller_creator: Callable = kwargs.get("controller_creator")
    episode_id: int = kwargs.get("episode_id")
    max_episode_length: int = kwargs.get("max_episode_length", 1000)
    render: bool = kwargs.get("render", False)
    # Create and Start Environment
    _env = load_env(env_dict, obs_builder_object=obs_builder)
    obs, info = _env.reset(
        regenerate_rail=False,
        regenerate_schedule=True,
    )
    score = 0
    _trajectories = [Trajectory() for _ in _env.get_agent_handles()]

    # Create and Start Controller
    controller: AbstractController = controller_creator()
    start = time.time()
    controller.start_of_round(obs=obs, env=_env)
    total_controller_time += time.time() - start

    if render:
        env_renderer = RenderTool(_env)
        env_renderer.reset()

    for step in range(max_episode_length):
        start = time.time()
        action_dict, processed_obs = controller.act(observation=obs)
        total_controller_time += time.time() - start
        next_obs, all_rewards, done, info = _env.step(action_dict)

        if render:
            env_renderer.render_env(show=True,
                                    show_observations=True,
                                    show_predictions=False)

        # Save actions and rewards for each agent
        for agent_handle in _env.get_agent_handles():
            _trajectories[agent_handle].add_row(
                state=processed_obs[agent_handle],
                action=action_dict[agent_handle],
                reward=all_rewards[agent_handle],
                done=done[agent_handle])

        score += sum(all_rewards.values())  # all_rewards is a dict keyed by agent handle

        obs = next_obs.copy()
        if done['__all__']:
            break

    if render:
        env_renderer.close_window()
    # print(f"\nController took a total time of: {total_controller_time} seconds", flush=True)
    return _trajectories
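
A hypothetical invocation of run_episode(); env_dict, my_obs_builder and MyController stand in for objects defined elsewhere in the surrounding project:

trajectories = run_episode(dict(
    env_dict=env_dict,                       # serialized environment consumed by load_env()
    obs_builder=my_obs_builder,              # e.g. a TreeObsForRailEnv instance
    controller_creator=lambda: MyController(),
    episode_id=0,
    max_episode_length=500,
    render=False,
))
print("collected {} trajectories".format(len(trajectories)))
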
Exemple #27
0
def evalfun(num_samples=100, timed=True, debug=False, refresh=0.1):

    # A list of (mapsize, agent count) tuples, change or extend this to test different sizes.
    #problemsizes = [(5, 3), (7, 4), (9, 5), (11, 6), (13, 7)]
    problemsizes = [(7, 4)]

    # Create a list of seeds to consider.
    #seeds = numpy.random.randint(2**29, size=3*num_samples)
    scores = []
    successes = 0
    completion_window = deque(maxlen=100)
    completion = []
    runtime = []
    schedule_length = []
    seeds = 37429879
    print("%10s\t%8s\t%8s\t%9s" %
          ("Dimensions", "Success", "Rewards", "Runtime"))
    for problemsize in problemsizes:
        # Create environments while they are not the intended dimension.
        j = 0
        env = create_multi_agent_environment(problemsize[0], problemsize[1],
                                             timed, seeds)
        for _ in range(0, num_samples):

            # Create environments while they are not the intended dimension.

            # Create a renderer only if in debug mode.
            if debug:
                env_renderer = RenderTool(env,
                                          screen_width=1920,
                                          screen_height=1080)

            # Find the schedules
            start = time.time()
            _, schedule = genetic_algorithm(env)
            duration = time.time() - start
            runtime.append(duration)
            schedule_length.append(len(schedule))
            if debug:
                env_renderer.render_env(show=True,
                                        frames=False,
                                        show_observations=False)
                time.sleep(refresh)

            # Validate that environment state is unchanged.
            #assert env.num_resets == 1 and env._elapsed_steps == 0

            # Run the schedule
            success = False
            sumreward = 0
            for action in schedule:
                _, _reward_dict, _done, _ = env.step(action)
                success = all(_done.values())
                sumreward = sumreward + sum(_reward_dict.values())
                if debug:
                    env_renderer.render_env(show=True,
                                            frames=False,
                                            show_observations=False)
                    time.sleep(refresh)
            # Print the performance of the algorithm
            if success:
                successes += 1

            tasks_finished = np.sum(
                [int(_done[idx]) for idx in range(env.get_num_agents())])
            completion_window.append(tasks_finished /
                                     max(1, env.get_num_agents()))
            completion.append((np.mean(completion_window)))
            scores.append(sumreward)
            print("%10s\t%8s\t%8.3f\t%9.6f" %
                  (str(problemsize), str(success), sumreward, duration))
            print(schedule_length)
            env.reset()
        print("Number of sucesses", successes)
        print("Number of samples", num_samples)
        print("Successful:    %8.2f%%" % (100 * successes / num_samples))
        print("Mean reward:   %8.2f" % (np.mean(scores)))
        print("Median reward: %8.2f" % (np.median(scores)))
        print("Instances solved: %8.2f" % (np.mean(completion)))
        print("Run Time   %8.2f" % (np.mean(runtime)))
        print("Avg schedule length %8.2f", np.mean(schedule_lenth))
Exemple #28
0
    # Reset score and done
    score = 0
    env_done = 0

    # Run episode
    for step in range(max_steps):

        # Action
        for a in range(env.get_num_agents()):
            if info['action_required'][a]:
                action = agent.act(agent_obs[a], eps=0.)

            else:
                action = 0

            action_prob[action] += 1
            action_dict.update({a: action})
        # Environment step
        obs, all_rewards, done, _ = env.step(action_dict)

        env_renderer.render_env(show=True, show_predictions=True, show_observations=False)
        # Build agent specific observations and normalize
        for a in range(env.get_num_agents()):
            if obs[a]:
                agent_obs[a] = normalize_observation(obs[a], tree_depth, observation_radius=10)


        if done['__all__']:
            break

                  grid_mode=False,
                  max_rails_between_cities=max_rails_between_cities,
                  max_rails_in_city=max_rails_in_city),
              schedule_generator=sparse_schedule_generator(speed_profiles),
              number_of_agents=n_agents,
              malfunction_generator_and_process_data=malfunction_from_params(
                  malfunction_parameters),
              obs_builder_object=tree_observation,
              random_seed=seed)

env.reset(regenerate_schedule=True, regenerate_rail=True)

# Setup renderer

env_renderer = RenderTool(env)
'''
env_renderer.render_env(show=True,show_predictions=False)
time.sleep(5)
env_renderer.close_window()
'''
n_features_per_node = env.obs_builder.observation_dim
n_nodes = 0
for i in range(observation_tree_depth + 1):
    n_nodes += np.power(4, i)
state_size = n_features_per_node * n_nodes

action_size = 5

# Max number of steps per episode
# This is the official formula used during evaluations
# See details in flatland.envs.schedule_generators.sparse_schedule_generator
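This excerpt is cut off just before the formula itself; for reference, the same computation appears in full in the earlier training scripts on this page:

max_steps = int(4 * 2 * (env.height + env.width + (n_agents / n_cities)))
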
Exemple #30
0
class FlatlandRemoteEvaluationService:
    """
    A remote evaluation service which exposes the following interfaces
    of a RailEnv :
    - env_create
    - env_step
    and an additional `env_submit` to handle score computation and on-episode-complete post-processing.

    This service is designed to be used in conjunction with
    `FlatlandRemoteClient`. Both the service and the client maintain a
    local RailEnv instance; in case of any unexpected divergence between
    the two, the RailEnv instance held by the
    `FlatlandRemoteEvaluationService` acts as the single source of truth.

    Both the client and remote service communicate with each other
    via Redis as a message broker. The individual messages are packed and
    unpacked with `msgpack` (a patched version of msgpack which also supports
    numpy arrays).
    """
    def __init__(self,
                 test_env_folder="/tmp",
                 flatland_rl_service_id='FLATLAND_RL_SERVICE_ID',
                 remote_host='127.0.0.1',
                 remote_port=6379,
                 remote_db=0,
                 remote_password=None,
                 visualize=False,
                 video_generation_envs=[],
                 report=None,
                 verbose=False):

        # Test Env folder Paths
        self.test_env_folder = test_env_folder
        self.video_generation_envs = video_generation_envs
        self.env_file_paths = self.get_env_filepaths()
        random.shuffle(self.env_file_paths)
        print(self.env_file_paths)
        # Shuffle all the env_file_paths for more exciting videos
        # and for more uniform time progression

        # Logging and Reporting related vars
        self.verbose = verbose
        self.report = report

        # Communication Protocol Related vars
        self.namespace = "flatland-rl"
        self.service_id = flatland_rl_service_id
        self.command_channel = "{}::{}::commands".format(
            self.namespace, self.service_id)

        # Message Broker related vars
        self.remote_host = remote_host
        self.remote_port = remote_port
        self.remote_db = remote_db
        self.remote_password = remote_password
        self.instantiate_redis_connection_pool()

        # AIcrowd evaluation specific vars
        self.oracle_events = crowdai_api.events.CrowdAIEvents(with_oracle=True)
        self.evaluation_state = {
            "state": "PENDING",
            "progress": 0.0,
            "simulation_count": 0,
            "total_simulation_count": len(self.env_file_paths),
            "score": {
                "score": 0.0,
                "score_secondary": 0.0
            },
            "meta": {
                "normalized_reward": 0.0
            }
        }
        self.stats = {}

        # RailEnv specific variables
        self.env = False
        self.env_renderer = False
        self.reward = 0
        self.simulation_count = -1
        self.simulation_rewards = []
        self.simulation_rewards_normalized = []
        self.simulation_percentage_complete = []
        self.simulation_steps = []
        self.simulation_times = []
        self.env_step_times = []
        self.begin_simulation = False
        self.current_step = 0
        self.visualize = visualize
        self.vizualization_folder_name = "./.visualizations"
        self.record_frame_step = 0

        if self.visualize:
            if os.path.exists(self.vizualization_folder_name):
                print(
                    "[WARNING] Deleting already existing visualizations folder at : {}"
                    .format(self.vizualization_folder_name))
                shutil.rmtree(self.vizualization_folder_name)
            os.mkdir(self.vizualization_folder_name)

    def update_running_mean_stats(self, key, scalar):
        """
        Computes the running mean for certain params
        """
        mean_key = "{}_mean".format(key)
        counter_key = "{}_counter".format(key)

        try:
            self.stats[mean_key] = \
                ((self.stats[mean_key] * self.stats[counter_key]) + scalar) / (self.stats[counter_key] + 1)
            self.stats[counter_key] += 1
        except KeyError:
            self.stats[mean_key] = 0
            self.stats[counter_key] = 0

    def get_env_filepaths(self):
        """
        Gathers a list of all available rail env files to be used
        for evaluation. The folder structure expected at the `test_env_folder`
        is similar to :

            .
            ├── Test_0
            │   ├── Level_1.pkl
            │   ├── .......
            │   ├── .......
            │   └── Level_99.pkl
            └── Test_1
                ├── Level_1.pkl
                ├── .......
                ├── .......
                └── Level_99.pkl
        """
        env_paths = sorted(
            glob.glob(os.path.join(self.test_env_folder, "*/*.pkl")))
        # Remove the root folder name from the individual
        # lists, so that we only have the path relative
        # to the test root folder
        env_paths = sorted(
            [os.path.relpath(x, self.test_env_folder) for x in env_paths])

        return env_paths

    def instantiate_redis_connection_pool(self):
        """
        Instantiates a Redis connection pool which can be used to
        communicate with the message broker
        """
        if self.verbose or self.report:
            print("Attempting to connect to redis server at {}:{}/{}".format(
                self.remote_host, self.remote_port, self.remote_db))

        self.redis_pool = redis.ConnectionPool(host=self.remote_host,
                                               port=self.remote_port,
                                               db=self.remote_db,
                                               password=self.remote_password)
        self.redis_conn = redis.Redis(connection_pool=self.redis_pool)

    def get_redis_connection(self):
        """
        Obtains a new redis connection from a previously instantiated
        redis connection pool
        """
        return self.redis_conn

    def _error_template(self, payload):
        """
        Simple helper function to pass a payload as a part of a
        flatland comms error template.
        """
        _response = {}
        _response['type'] = messages.FLATLAND_RL.ERROR
        _response['payload'] = payload
        return _response

    @timeout_decorator.timeout(PER_STEP_TIMEOUT,
                               use_signals=use_signals_in_timeout
                               )  # timeout for each command
    def _get_next_command(self, _redis):
        """
        A low level wrapper for obtaining the next command from a
        pre-agreed command channel.
        At the moment, the communication protocol uses lpush for pushing
        in commands and brpop for reading them out.
        """
        command = _redis.brpop(self.command_channel)[1]
        return command

    def get_next_command(self):
        """
        A helper function to obtain the next command. It transparently
        unpacks the command from the packed message and handles the
        timeouts when trying to fetch a new command.
        """
        try:
            _redis = self.get_redis_connection()
            command = self._get_next_command(_redis)
            if self.verbose or self.report:
                print("Command Service: ", command)
        except timeout_decorator.timeout_decorator.TimeoutError:
            raise Exception("Timeout in step {} of simulation {}".format(
                self.current_step, self.simulation_count))
        command = msgpack.unpackb(command,
                                  object_hook=m.decode,
                                  encoding="utf8")
        if self.verbose:
            print("Received Request : ", command)

        message_queue_latency = time.time() - command["timestamp"]
        self.update_running_mean_stats("message_queue_latency",
                                       message_queue_latency)
        return command

    def send_response(self, _command_response, command, suppress_logs=False):
        _redis = self.get_redis_connection()
        command_response_channel = command['response_channel']

        if self.verbose and not suppress_logs:
            print("Responding with : ", _command_response)

        _redis.rpush(
            command_response_channel,
            msgpack.packb(_command_response,
                          default=m.encode,
                          use_bin_type=True))

    def handle_ping(self, command):
        """
        Handles PING command from the client.
        """
        service_version = flatland.__version__
        if "version" in command["payload"].keys():
            client_version = command["payload"]["version"]
        else:
            # 2.1.4 -> when the version mismatch check was added
            client_version = "2.1.4"

        _command_response = {}
        _command_response['type'] = messages.FLATLAND_RL.PONG
        _command_response['payload'] = {}
        if client_version not in SUPPORTED_CLIENT_VERSIONS:
            _command_response['type'] = messages.FLATLAND_RL.ERROR
            _command_response['payload']['message'] = \
                "Client-Server Version Mismatch => " + \
                "[ Client Version : {} ] ".format(client_version) + \
                "[ Server Version : {} ] ".format(service_version)
            self.send_response(_command_response, command)
            raise Exception(_command_response['payload']['message'])

        self.send_response(_command_response, command)

    def handle_env_create(self, command):
        """
        Handles an ENV_CREATE command from the client
        TODO: Add a high-level summary of everything that's happening here.
        """
        self.simulation_count += 1
        if self.simulation_count < len(self.env_file_paths):
            """
            There are still test envs left that are yet to be evaluated 
            """
            test_env_file_path = self.env_file_paths[self.simulation_count]
            print("Evaluating : {}".format(test_env_file_path))
            test_env_file_path = os.path.join(self.test_env_folder,
                                              test_env_file_path)
            del self.env
            self.env = RailEnv(
                width=1,
                height=1,
                rail_generator=rail_from_file(test_env_file_path),
                schedule_generator=schedule_from_file(test_env_file_path),
                malfunction_generator_and_process_data=malfunction_from_file(
                    test_env_file_path),
                obs_builder_object=DummyObservationBuilder())

            if self.begin_simulation:
                # If begin simulation has already been initialized
                # at least once
                self.simulation_times.append(time.time() -
                                             self.begin_simulation)
            self.begin_simulation = time.time()

            self.simulation_rewards.append(0)
            self.simulation_rewards_normalized.append(0)
            self.simulation_percentage_complete.append(0)
            self.simulation_steps.append(0)

            self.current_step = 0

            _observation, _info = self.env.reset(regenerate_rail=True,
                                                 regenerate_schedule=True,
                                                 activate_agents=False,
                                                 random_seed=RANDOM_SEED)

            if self.visualize:
                if self.env_renderer:
                    del self.env_renderer
                self.env_renderer = RenderTool(
                    self.env,
                    gl="PILSVG",
                )

            _command_response = {}
            _command_response[
                'type'] = messages.FLATLAND_RL.ENV_CREATE_RESPONSE
            _command_response['payload'] = {}
            _command_response['payload']['observation'] = _observation
            _command_response['payload'][
                'env_file_path'] = self.env_file_paths[self.simulation_count]
            _command_response['payload']['info'] = _info
            _command_response['payload']['random_seed'] = RANDOM_SEED
        else:
            """
            All test env evaluations are complete
            """
            _command_response = {}
            _command_response[
                'type'] = messages.FLATLAND_RL.ENV_CREATE_RESPONSE
            _command_response['payload'] = {}
            _command_response['payload']['observation'] = False
            _command_response['payload']['env_file_path'] = False
            _command_response['payload']['info'] = False
            _command_response['payload']['random_seed'] = False

        self.send_response(_command_response, command)
        #####################################################################
        # Update evaluation state
        #####################################################################
        progress = np.clip(
            self.simulation_count * 1.0 / len(self.env_file_paths), 0, 1)
        mean_reward = round(np.mean(self.simulation_rewards), 2)
        mean_normalized_reward = round(
            np.mean(self.simulation_rewards_normalized), 2)
        mean_percentage_complete = round(
            np.mean(self.simulation_percentage_complete), 3)
        self.evaluation_state["state"] = "IN_PROGRESS"
        self.evaluation_state["progress"] = progress
        self.evaluation_state["simulation_count"] = self.simulation_count
        self.evaluation_state["score"]["score"] = mean_percentage_complete
        self.evaluation_state["score"]["score_secondary"] = mean_reward
        self.evaluation_state["meta"][
            "normalized_reward"] = mean_normalized_reward
        self.handle_aicrowd_info_event(self.evaluation_state)

    def handle_env_step(self, command):
        """
        Handles an ENV_STEP command from the client
        TODO: Add a high-level summary of everything that's happening here.
        """
        _payload = command['payload']

        if not self.env:
            raise Exception(
                "env_client.step called before env_client.env_create() call")
        if self.env.dones['__all__']:
            raise Exception(
                "Client attempted to perform an action on an Env which \
                has done['__all__']==True")

        action = _payload['action']
        time_start = time.time()
        _observation, all_rewards, done, info = self.env.step(action)
        time_diff = time.time() - time_start
        self.update_running_mean_stats("internal_env_step_time", time_diff)

        cumulative_reward = sum(all_rewards.values())
        self.simulation_rewards[-1] += cumulative_reward
        self.simulation_steps[-1] += 1
        """
        The normalized reward divides the cumulative reward of an
        episode by the sum of the maximum number of time steps allowed
        in that episode and the number of agents present in that
        episode
        """
        self.simulation_rewards_normalized[-1] += \
            cumulative_reward / (
                self.env._max_episode_steps +
                self.env.get_num_agents()
            )
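        # Worked example (illustrative numbers): with _max_episode_steps == 480 and
        # 10 agents, a cumulative step reward of -10 adds -10 / (480 + 10) ≈ -0.0204
        # to the normalized reward of the current episode.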

        if done["__all__"]:
            # Compute percentage complete
            complete = 0
            for i_agent in range(self.env.get_num_agents()):
                agent = self.env.agents[i_agent]
                if agent.status in [RailAgentStatus.DONE_REMOVED]:
                    complete += 1
            percentage_complete = complete * 1.0 / self.env.get_num_agents()
            self.simulation_percentage_complete[-1] = percentage_complete

        # Record Frame
        if self.visualize:
            self.env_renderer.render_env(show=False,
                                         show_observations=False,
                                         show_predictions=False)
            """
            Only save the frames for environments which are explicitly listed
            in the video_generation_envs param
            """
            current_env_path = self.env_file_paths[self.simulation_count]
            if current_env_path in self.video_generation_envs:
                self.env_renderer.gl.save_image(
                    os.path.join(
                        self.vizualization_folder_name,
                        "flatland_frame_{:04d}.png".format(
                            self.record_frame_step)))
                self.record_frame_step += 1

    def handle_env_submit(self, command):
        """
        Handles an ENV_SUBMIT command from the client
        TODO: Add a high-level summary of everything that's happening here.
        """
        _payload = command['payload']

        ######################################################################
        # Print Local Stats
        ######################################################################
        print("=" * 100)
        print("=" * 100)
        print("## Server Performance Stats")
        print("=" * 100)
        for _key in self.stats:
            if _key.endswith("_mean"):
                print("\t - {}\t:{}".format(_key, self.stats[_key]))
        print("=" * 100)

        # Register simulation time of the last episode
        self.simulation_times.append(time.time() - self.begin_simulation)

        if len(self.simulation_rewards) != len(self.env_file_paths):
            raise Exception(
                """env.submit called before the agent had the chance 
                to operate on all the test environments.
                """)

        mean_reward = round(np.mean(self.simulation_rewards), 2)
        mean_normalized_reward = round(
            np.mean(self.simulation_rewards_normalized), 2)
        mean_percentage_complete = round(
            np.mean(self.simulation_percentage_complete), 3)

        if self.visualize and len(os.listdir(
                self.vizualization_folder_name)) > 0:
            # Generate the video
            #
            # Note: if you have dependency issues due to ffmpeg, you can
            # install it by :
            #
            # conda install -c conda-forge x264 ffmpeg

            print("Generating Video from thumbnails...")
            video_output_path, video_thumb_output_path = \
                aicrowd_helpers.generate_movie_from_frames(
                    self.vizualization_folder_name
                )
            print("Videos : ", video_output_path, video_thumb_output_path)
            # Upload to S3 if configuration is available
            if aicrowd_helpers.is_grading(
            ) and aicrowd_helpers.is_aws_configured() and self.visualize:
                video_s3_key = aicrowd_helpers.upload_to_s3(video_output_path)
                video_thumb_s3_key = aicrowd_helpers.upload_to_s3(
                    video_thumb_output_path)
                static_thumbnail_s3_key = aicrowd_helpers.upload_random_frame_to_s3(
                    self.vizualization_folder_name)
                self.evaluation_state["score"][
                    "media_content_type"] = "video/mp4"
                self.evaluation_state["score"]["media_large"] = video_s3_key
                self.evaluation_state["score"][
                    "media_thumbnail"] = video_thumb_s3_key

                self.evaluation_state["meta"][
                    "static_media_frame"] = static_thumbnail_s3_key
            else:
                print("[WARNING] Ignoring uploading of video to S3")

        _command_response = {}
        _command_response['type'] = messages.FLATLAND_RL.ENV_SUBMIT_RESPONSE
        _payload = {}
        _payload['mean_reward'] = mean_reward
        _payload['mean_normalized_reward'] = mean_normalized_reward
        _payload['mean_percentage_complete'] = mean_percentage_complete
        _command_response['payload'] = _payload
        self.send_response(_command_response, command)

        #####################################################################
        # Update evaluation state
        #####################################################################
        self.evaluation_state["state"] = "FINISHED"
        self.evaluation_state["progress"] = 1.0
        self.evaluation_state["simulation_count"] = self.simulation_count
        self.evaluation_state["score"]["score"] = mean_percentage_complete
        self.evaluation_state["score"]["score_secondary"] = mean_reward
        self.evaluation_state["meta"][
            "normalized_reward"] = mean_normalized_reward
        self.handle_aicrowd_success_event(self.evaluation_state)
        print("#" * 100)
        print("EVALUATION COMPLETE !!")
        print("#" * 100)
        print("# Mean Reward : {}".format(mean_reward))
        print("# Mean Normalized Reward : {}".format(mean_normalized_reward))
        print(
            "# Mean Percentage Complete : {}".format(mean_percentage_complete))
        print("#" * 100)
        print("#" * 100)

    def report_error(self, error_message, command_response_channel):
        """
        A helper function used to report error back to the client
        """
        _redis = self.get_redis_connection()
        _command_response = {}
        _command_response['type'] = messages.FLATLAND_RL.ERROR
        _command_response['payload'] = error_message
        _redis.rpush(
            command_response_channel,
            msgpack.packb(_command_response,
                          default=m.encode,
                          use_bin_type=True))
        self.evaluation_state["state"] = "ERROR"
        self.evaluation_state["error"] = error_message
        self.handle_aicrowd_error_event(self.evaluation_state)

    def handle_aicrowd_info_event(self, payload):
        self.oracle_events.register_event(
            event_type=self.oracle_events.CROWDAI_EVENT_INFO, payload=payload)

    def handle_aicrowd_success_event(self, payload):
        self.oracle_events.register_event(
            event_type=self.oracle_events.CROWDAI_EVENT_SUCCESS,
            payload=payload)

    def handle_aicrowd_error_event(self, payload):
        self.oracle_events.register_event(
            event_type=self.oracle_events.CROWDAI_EVENT_ERROR, payload=payload)

    def run(self):
        """
        Main runner function which waits for commands from the client
        and acts accordingly.
        """
        print("Listening at : ", self.command_channel)
        MESSAGE_QUEUE_LATENCY = []
        while True:
            command = self.get_next_command()
            if "timestamp" in command.keys():
                latency = time.time() - command["timestamp"]
                MESSAGE_QUEUE_LATENCY.append(latency)

            if self.verbose:
                print("Self.Reward : ", self.reward)
                print("Current Simulation : ", self.simulation_count)
                if self.env_file_paths and \
                    self.simulation_count < len(self.env_file_paths):
                    print("Current Env Path : ",
                          self.env_file_paths[self.simulation_count])

            try:
                if command['type'] == messages.FLATLAND_RL.PING:
                    """
                        INITIAL HANDSHAKE : Respond with PONG
                    """
                    self.handle_ping(command)

                elif command['type'] == messages.FLATLAND_RL.ENV_CREATE:
                    """
                        ENV_CREATE

                        Respond with an internal _env object
                    """
                    self.handle_env_create(command)
                elif command['type'] == messages.FLATLAND_RL.ENV_STEP:
                    """
                        ENV_STEP

                        Request : Action dict
                        Respond with updated [observation,reward,done,info] after step
                    """
                    self.handle_env_step(command)
                elif command['type'] == messages.FLATLAND_RL.ENV_SUBMIT:
                    """
                        ENV_SUBMIT

                        Submit the final cumulative reward
                    """

                    print("Overall Message Queue Latency : ",
                          np.array(MESSAGE_QUEUE_LATENCY).mean())
                    self.handle_env_submit(command)
                else:
                    _error = self._error_template("UNKNOWN_REQUEST:{}".format(
                        str(command)))
                    if self.verbose:
                        print("Responding with : ", _error)
                    self.report_error(_error, command['response_channel'])
                    return _error
            except Exception as e:
                print("Error : ", str(e))
                print(traceback.format_exc())
                self.report_error(self._error_template(str(e)),
                                  command['response_channel'])
                return self._error_template(str(e))
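
To make the command protocol above concrete, here is a minimal, hypothetical client-side handshake against this service. The channel layout, message keys and msgpack options mirror what the service code reads and writes; the literal PING type string and the response channel name are assumptions (in the real client they come from flatland.evaluators.messages and FlatlandRemoteClient):

import time
import redis
import msgpack
import msgpack_numpy as m

NAMESPACE = "flatland-rl"
SERVICE_ID = "FLATLAND_RL_SERVICE_ID"
command_channel = "{}::{}::commands".format(NAMESPACE, SERVICE_ID)
response_channel = "{}::{}::my-client::response".format(NAMESPACE, SERVICE_ID)  # assumed naming

r = redis.Redis(host="127.0.0.1", port=6379, db=0)

command = {
    "type": "FLATLAND_RL.PING",         # assumed value of messages.FLATLAND_RL.PING
    "payload": {"version": "2.1.4"},
    "response_channel": response_channel,
    "timestamp": time.time(),
}

# The service brpop()s commands, so the client lpush()es them (see _get_next_command above).
r.lpush(command_channel, msgpack.packb(command, default=m.encode, use_bin_type=True))

# The service rpush()es its reply to the response channel, so the client brpop()s it here.
_, raw = r.brpop(response_channel)
print(msgpack.unpackb(raw, object_hook=m.decode, raw=False))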