Code Example #1
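# Evaluate the "rlps-tcpr" submission: a ShortestPathRllibAgent whose per-step
# actions are passed through RobustFlatlandGymEnv.get_robust_actions before
# being applied to the env. Returns per-episode completion percentages,
# returns normalized by max episode length and agent count, and malfunction
# counts. Project-level helpers (SUBMISSIONS, init_run, get_agent, get_env,
# is_done, RENDER, ...) are assumed to be in scope.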
def evaluate(n_episodes):
    run = SUBMISSIONS["rlps-tcpr"]
    config, run = init_run(run)
    agent = ShortestPathRllibAgent(get_agent(config, run))
    env = get_env(config, rl=True)
    env_renderer = RenderTool(env, screen_width=8800)
    returns = []
    pcs = []
    malfs = []

    for _ in tqdm(range(n_episodes)):

        obs, _ = env.reset(regenerate_schedule=True, regenerate_rail=True)
        if RENDER:
            env_renderer.reset()
            env_renderer.render_env(show=True,
                                    frames=True,
                                    show_observations=False)

        if not obs:
            break

        steps = 0
        ep_return = 0
        done = defaultdict(lambda: False)
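        # Conflict-handling wrapper: actions are later filtered through
        # get_robust_actions using a distance-to-target priority over agents.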
        robust_env = RobustFlatlandGymEnv(rail_env=env,
                                          max_nr_active_agents=200,
                                          observation_space=None,
                                          priorizer=DistToTargetPriorizer(),
                                          allow_noop=True)

        sorted_handles = robust_env.priorizer.priorize(handles=list(
            obs.keys()),
                                                       rail_env=env)

        while not done['__all__']:
            actions = agent.compute_actions(obs, env)
            robust_actions = robust_env.get_robust_actions(
                actions, sorted_handles)
            obs, all_rewards, done, info = env.step(robust_actions)
            if RENDER:
                env_renderer.render_env(show=True,
                                        frames=True,
                                        show_observations=False)
            print('.', end='', flush=True)
            steps += 1
            ep_return += np.sum(list(all_rewards.values()))

        pc = np.sum(np.array([1 for a in env.agents if is_done(a)
                              ])) / env.get_num_agents()
        print("EPISODE PC:", pc)
        pcs.append(pc)
        returns.append(ep_return /
                       (env._max_episode_steps * env.get_num_agents()))
        malfs.append(
            np.sum([a.malfunction_data['nr_malfunctions']
                    for a in env.agents]))
    return pcs, returns, malfs
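
A minimal sketch of how these evaluation functions might be driven (the episode count and the RENDER value are illustrative assumptions):

RENDER = False  # module-level flag read inside evaluate(); illustrative value
pcs, returns, malfs = evaluate(n_episodes=25)
print(f"mean PC: {np.mean(pcs):.3f}, "
      f"mean normalized return: {np.mean(returns):.3f}, "
      f"total malfunctions: {np.sum(malfs)}")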
Code Example #2
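# Roll out `exploring_agent` on deep copies of `env` until the wall-clock
# budget is spent, and return the action sequence of the rollout with the
# highest completion percentage (or None if no rollout finished in time).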
def explorative_plan(env: RailEnv,
                     obs_dict,
                     budget_seconds=60,
                     exploring_agent=None):
    start_t = time()
    best_actions = []
    best_return = -np.inf
    best_pc = -np.inf
    all_returns = []
    all_pcs = []
    plan_step = 0
    budget_used = False

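    # Each planning step simulates one full episode on a deep copy of the env.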
    while not budget_used:
        local_env = deepcopy(env)
        episode_return = 0
        action_memory = []
        dones = defaultdict(lambda: False)
        print(f'\nPlanning step {plan_step + 1}')

        while not dones['__all__'] and not budget_used:
            actions = defaultdict(
                lambda: None,
                exploring_agent.compute_actions(obs_dict, env=local_env))

            action_memory.append(actions)
            obs_dict, all_rewards, dones, info = local_env.step(actions)
            episode_return += np.sum(list(all_rewards.values()))

            budget_used = (time() - start_t) > budget_seconds

        if not budget_used:
            all_returns.append(episode_return)
            pc = np.sum(np.array([1 for a in local_env.agents if is_done(a)
                                  ])) / local_env.get_num_agents()
            all_pcs.append(pc)

            if pc > best_pc:
                best_return = episode_return
                best_pc = pc
                best_actions = action_memory

            if pc == 1.0:
                print(
                    f'MAX PC: {best_pc}, MIN PC: {np.min(all_pcs)}, MAX RETURN: {best_return}\n'
                )
                return best_actions

            plan_step += 1

    if len(all_pcs) > 0:
        print(
            f'MAX PC: {best_pc}, MIN PC: {np.min(all_pcs)}, MAX RETURN: {best_return}\n'
        )
    else:
        print('Budget reached before any planning step could finish!')
    return best_actions if len(best_actions) > 0 else None
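
A minimal usage sketch for the planner above, assuming get_env and ShortestPathAgent from the surrounding examples are in scope; the best action sequence found is replayed step by step on the real env:

env = get_env(rl=False)  # assumes the project helper shown in Code Example #4
obs, _ = env.reset(regenerate_schedule=True, regenerate_rail=True)
planned = explorative_plan(env, obs, budget_seconds=60,
                           exploring_agent=ShortestPathAgent())
if planned is not None:
    for actions in planned:  # replay the planned joint actions
        obs, rewards, done, info = env.step(actions)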
Code Example #3
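# Evaluate the "ato" submission: the trained agent acts directly on the env
# with exploration disabled (explore=False) and no robust-action
# post-processing.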
def evaluate(n_episodes):
    run = SUBMISSIONS["ato"]
    config, run = init_run(run)
    agent = get_agent(config, run)
    env = get_env(config, rl=True)
    env_renderer = RenderTool(env, screen_width=8800)
    returns = []
    pcs = []
    malfs = []

    for _ in tqdm(range(n_episodes)):

        obs, _ = env.reset(regenerate_schedule=True, regenerate_rail=True)
        if RENDER:
            env_renderer.reset()
            env_renderer.render_env(show=True,
                                    frames=True,
                                    show_observations=False)

        if not obs:
            break

        steps = 0
        ep_return = 0
        done = defaultdict(lambda: False)

        while not done['__all__']:
            actions = agent.compute_actions(obs, explore=False)
            obs, all_rewards, done, info = env.step(actions)
            if RENDER:
                env_renderer.render_env(show=True,
                                        frames=True,
                                        show_observations=False)
            print('.', end='', flush=True)
            steps += 1
            ep_return += np.sum(list(all_rewards.values()))

        pc = np.sum(np.array([1 for a in env.agents if is_done(a)
                              ])) / env.get_num_agents()
        print("EPISODE PC:", pc)
        pcs.append(pc)
        returns.append(ep_return /
                       (env._max_episode_steps * env.get_num_agents()))
        malfs.append(
            np.sum([a.malfunction_data['nr_malfunctions']
                    for a in env.agents]))
    return pcs, returns, malfs
Code Example #4
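# Evaluate a ShortestPathAgent whose actions are post-processed by
# CprFlatlandGymEnv using a NrAgentsSameStart priority order. With
# rl_prio=True an RL run is loaded for its config; the RL-based
# prioritization itself is commented out below.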
def evaluate(n_episodes, rl_prio=True):
    agent = None
    if rl_prio:
        config, run = init_run()
        agent = get_agent(config, run)
        env = get_env(config, rl=True)
    else:
        env = get_env(rl=False)
    env_renderer = RenderTool(env, screen_width=8800)
    returns = []
    pcs = []
    malfs = []

    for _ in tqdm(range(n_episodes)):

        obs, _ = env.reset(regenerate_schedule=True, regenerate_rail=True)
        if RENDER:
            env_renderer.reset()
            env_renderer.render_env(show=True,
                                    frames=True,
                                    show_observations=False)

        if not obs:
            break

        steps = 0
        ep_return = 0
        done = defaultdict(lambda: False)
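        # Same robust-action wrapper idea as in Code Example #1, here
        # prioritizing agents by how many agents share their start position.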
        robust_env = CprFlatlandGymEnv(rail_env=env,
                                       max_nr_active_agents=200,
                                       observation_space=None,
                                       priorizer=NrAgentsSameStart(),
                                       allow_noop=True)
        # if rl_prio:
        #     priorities = prio_agent.compute_actions(obs, explore=False)
        #     sorted_actions = {k: v for k, v in sorted(priorities.items(), key=lambda item: item[1], reverse=True)}
        #     sorted_handles = list(sorted_actions.keys())
        # else:
        sorted_handles = robust_env.priorizer.priorize(handles=list(
            obs.keys()),
                                                       rail_env=env)

        while not done['__all__']:
            actions = ShortestPathAgent().compute_actions(obs, env)
            robust_actions = robust_env.get_robust_actions(
                actions, sorted_handles)
            obs, all_rewards, done, info = env.step(robust_actions)
            if RENDER:
                env_renderer.render_env(show=True,
                                        frames=True,
                                        show_observations=False)
            print('.', end='', flush=True)
            steps += 1
            ep_return += np.sum(list(all_rewards.values()))

        pc = np.sum(np.array([1 for a in env.agents if is_done(a)
                              ])) / env.get_num_agents()
        print("EPISODE PC:", pc)
        pcs.append(pc)
        returns.append(ep_return /
                       (env._max_episode_steps * env.get_num_agents()))
        malfs.append(
            np.sum([a.malfunction_data['nr_malfunctions']
                    for a in env.agents]))
    return pcs, returns, malfs
Code Example #5
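# Same budgeted re-planning scheme as explorative_plan, but rollouts follow
# `policy_agent` (HeuristicPriorityAgent by default) and each agent's action
# is replaced, with probability epsilon, by a random admissible alternative
# whenever such a switch looks promising.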
def epsilon_greedy_plan(env: RailEnv,
                        obs_dict,
                        budget_seconds=60,
                        epsilon=0.1,
                        policy_agent=HeuristicPriorityAgent()):
    start_t = time()
    best_actions = []
    best_return = -np.inf
    best_pc = -np.inf
    all_returns = []
    all_pcs = []
    plan_step = 0
    budget_used = False

    while not budget_used:
        local_env = deepcopy(env)
        episode_return = 0
        action_memory = []
        dones = defaultdict(lambda: False)
        print(f'\nPlanning step {plan_step + 1}')

        while not dones['__all__'] and not budget_used:
            actions = defaultdict(
                lambda: None,
                policy_agent.compute_actions(obs_dict, env=local_env))
            for agent in env.agents:
                pos = get_agent_pos(agent)
                next_possible_moves = local_env.rail.get_transitions(
                    *pos, agent.direction)
                departed = agent.status.value != RailAgentStatus.READY_TO_DEPART.value

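                # With probability epsilon, swap the policy action for another
                # currently possible action (STOP and FORWARD are always
                # candidates), excluding the action chosen by the policy.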
                if np.random.random() < epsilon and promising(
                        next_possible_moves, departed):
                    possible_actions = set(np.flatnonzero(next_possible_moves))
                    possible_actions = possible_actions.union({
                        RailEnvActions.STOP_MOVING.value,
                        RailEnvActions.MOVE_FORWARD.value
                    })
                    non_default_actions = possible_actions.difference(
                        {actions[agent.handle]})
                    actions[agent.handle] = np.random.choice(
                        list(non_default_actions))

            action_memory.append(actions)
            obs_dict, all_rewards, dones, info = local_env.step(actions)
            episode_return += np.sum(list(all_rewards.values()))

            budget_used = (time() - start_t) > budget_seconds

        if not budget_used:
            all_returns.append(episode_return)
            pc = np.sum(np.array([1 for a in local_env.agents if is_done(a)
                                  ])) / local_env.get_num_agents()
            all_pcs.append(pc)

            if pc > best_pc:
                best_return = episode_return
                best_pc = pc
                best_actions = action_memory

            if pc == 1.0:
                print(
                    f'MAX PC: {best_pc}, MIN PC: {np.min(all_pcs)}, MAX RETURN: {best_return}\n'
                )
                return best_actions

            plan_step += 1

    if len(all_pcs) > 0:
        print(
            f'MAX PC: {best_pc}, MIN PC: {np.min(all_pcs)}, MAX RETURN: {best_return}\n'
        )
    else:
        print('Budget reached before any planning step could finish!')
    return best_actions if len(best_actions) > 0 else None
Code Example #6
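    # Simulate one candidate: replay the stored genotype, then extend it with
    # epsilon-greedy actions until the episode ends or the time budget is
    # used. Fitness is the completion percentage (PC).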
    def simulate(self):
        episode_return = 0
        dones = defaultdict(lambda: False)
        local_env = deepcopy(self.env)
        obs_dict = self.initial_obs

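        # First replay the action sequence already stored in the genotype.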
        for actions in self.genotype:
            if not self.budget_function():
                obs_dict, all_rewards, dones, info = local_env.step(actions)
                episode_return += np.sum(list(all_rewards.values()))

        # Extend the genotype: query the default behaviour once per step and
        # mutate individual agent actions with probability epsilon.
        while not dones['__all__'] and not self.budget_function():
            actions: defaultdict[int, Optional[int]] = defaultdict(
                lambda: None,
                self.default_behaviour.compute_actions(obs_dict,
                                                       env=local_env))
            transitions = defaultdict(lambda: None)
            agents_departed = defaultdict(lambda: True)

            for agent in local_env.agents:
                pos = get_agent_pos(agent)
                next_possible_moves = local_env.rail.get_transitions(
                    *pos, agent.direction)

                if np.random.random() < self.epsilon:
                    # Swap the default action for a random admissible
                    # alternative (STOP and FORWARD are always candidates).
                    possible_actions = set(
                        np.flatnonzero(next_possible_moves))
                    possible_actions = possible_actions.union({
                        RailEnvActions.STOP_MOVING.value,
                        RailEnvActions.MOVE_FORWARD.value
                    })
                    non_default_actions = possible_actions.difference(
                        {actions[agent.handle]})
                    actions[agent.handle] = np.random.choice(
                        list(non_default_actions))

                # Record the admissible transitions and the departure status so
                # that later mutations know which moves are valid at this step.
                transitions[agent.handle] = next_possible_moves
                agents_departed[agent.handle] = (
                    agent.status.value != RailAgentStatus.READY_TO_DEPART.value)

            self.genotype.append(actions)
            self.possible_mutations.append(transitions)
            self.departed.append(agents_departed)
            obs_dict, all_rewards, dones, info = local_env.step(actions)
            episode_return += np.sum(list(all_rewards.values()))

        if not self.budget_function():
            percentage_complete = np.sum(
                np.array([1 for a in local_env.agents if is_done(a)
                          ])) / local_env.get_num_agents()
            self.fitness = percentage_complete

            print(
                f"Simulation of candidate finished with fitness (PC): {percentage_complete}"
            )