Example #1
def demo(window: RailViewWindow):
    """Demo script to check installation"""
    env = RailEnv(width=15,
                  height=15,
                  rail_generator=complex_rail_generator(nr_start_goal=10,
                                                        nr_extra=1,
                                                        min_dist=8,
                                                        max_dist=99999),
                  schedule_generator=complex_schedule_generator(),
                  number_of_agents=5)

    env._max_episode_steps = int(15 * (env.width + env.height))
    env_renderer = RenderTool(env)

    while window.alive:
        obs, info = env.reset()
        env_renderer.reset()
        _done = False
        # Run a single episode here
        step = 0
        while not _done and window.alive:
            # Compute Action
            _action = {}
            for _idx, _ in enumerate(env.agents):
                _action[_idx] = np.random.randint(0, 5)
            obs, all_rewards, done, _ = env.step(_action)
            _done = done['__all__']
            step += 1
            env_renderer.render_env(show=True,
                                    frames=False,
                                    show_observations=False,
                                    show_predictions=False)
            time.sleep(0.1)
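The demo above assumes imports from the flatland package (2.x era APIs; exact module paths can differ between releases, so treat this header as a sketch rather than the author's original file). RailViewWindow itself comes from the surrounding demo harness and is only used to keep rendering alive while the window is open.

import time

import numpy as np

from flatland.envs.rail_env import RailEnv
from flatland.envs.rail_generators import complex_rail_generator
from flatland.envs.schedule_generators import complex_schedule_generator
from flatland.utils.rendertools import RenderTool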
Example #2
def evaluate(n_episodes):
    run = SUBMISSIONS["rlps-tcpr"]
    config, run = init_run(run)
    agent = ShortestPathRllibAgent(get_agent(config, run))
    env = get_env(config, rl=True)
    env_renderer = RenderTool(env, screen_width=8800)
    returns = []
    pcs = []
    malfs = []

    for _ in tqdm(range(n_episodes)):

        obs, _ = env.reset(regenerate_schedule=True, regenerate_rail=True)
        if RENDER:
            env_renderer.reset()
            env_renderer.render_env(show=True,
                                    frames=True,
                                    show_observations=False)

        if not obs:
            break

        steps = 0
        ep_return = 0
        done = defaultdict(lambda: False)
        robust_env = RobustFlatlandGymEnv(rail_env=env,
                                          max_nr_active_agents=200,
                                          observation_space=None,
                                          priorizer=DistToTargetPriorizer(),
                                          allow_noop=True)

        sorted_handles = robust_env.priorizer.priorize(handles=list(obs.keys()),
                                                       rail_env=env)

        while not done['__all__']:
            actions = agent.compute_actions(obs, env)
            robust_actions = robust_env.get_robust_actions(
                actions, sorted_handles)
            obs, all_rewards, done, info = env.step(robust_actions)
            if RENDER:
                env_renderer.render_env(show=True,
                                        frames=True,
                                        show_observations=False)
            print('.', end='', flush=True)
            steps += 1
            ep_return += np.sum(list(all_rewards.values()))

        pc = sum(1 for a in env.agents if is_done(a)) / env.get_num_agents()
        print("EPISODE PC:", pc)
        n_episodes += 1
        pcs.append(pc)
        returns.append(ep_return /
                       (env._max_episode_steps * env.get_num_agents()))
        malfs.append(
            np.sum([a.malfunction_data['nr_malfunctions']
                    for a in env.agents]))
    return pcs, returns, malfs
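The `is_done` helper is not part of this snippet; a plausible stand-in, assuming flatland's RailAgentStatus enum, is shown below (an assumption, not the author's original code):

from flatland.envs.agent_utils import RailAgentStatus


def is_done(agent):
    # Treat an agent as finished once it has reached its target
    # (flatland marks it DONE or removes it from the grid as DONE_REMOVED).
    return agent.status in (RailAgentStatus.DONE, RailAgentStatus.DONE_REMOVED)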
Example #3
def evaluate(n_episodes):
    run = SUBMISSIONS["ato"]
    config, run = init_run(run)
    agent = get_agent(config, run)
    env = get_env(config, rl=True)
    env_renderer = RenderTool(env, screen_width=8800)
    returns = []
    pcs = []
    malfs = []

    for _ in tqdm(range(n_episodes)):

        obs, _ = env.reset(regenerate_schedule=True, regenerate_rail=True)
        if RENDER:
            env_renderer.reset()
            env_renderer.render_env(show=True,
                                    frames=True,
                                    show_observations=False)

        if not obs:
            break

        steps = 0
        ep_return = 0
        done = defaultdict(lambda: False)

        while not done['__all__']:
            actions = agent.compute_actions(obs, explore=False)
            obs, all_rewards, done, info = env.step(actions)
            if RENDER:
                env_renderer.render_env(show=True,
                                        frames=True,
                                        show_observations=False)
            print('.', end='', flush=True)
            steps += 1
            ep_return += np.sum(list(all_rewards.values()))

        pc = sum(1 for a in env.agents if is_done(a)) / env.get_num_agents()
        print("EPISODE PC:", pc)
        n_episodes += 1
        pcs.append(pc)
        returns.append(ep_return /
                       (env._max_episode_steps * env.get_num_agents()))
        malfs.append(
            np.sum([a.malfunction_data['nr_malfunctions']
                    for a in env.agents]))
    return pcs, returns, malfs
Example #4
def evaluate(seed=37429879,
             timed=False,
             filename="./rl-weights.pth",
             debug=False,
             refresh=1):
    # Attempt to load policy from disk.
    policy = load_policy(filename, seed=seed)

    # Create environment with given seeding.
    env, max_steps, _, _, observation_tree_depth, _ = create_multi_agent_rail_env(
        seed + 1, timed)

    # Fixed environment parameters (note, these must correspond with the training parameters!)
    observation_radius = 10

    env_renderer = None
    if debug:
        env_renderer = RenderTool(env, screen_width=1920, screen_height=1080)

    # Create container for the agent actions and observations.
    action_dict = dict()
    agent_obs = [None] * env.number_of_agents

    num_maps = 100
    scores = []
    successes = 0
    scores_window = deque(maxlen=100)  # todo smooth when rendering instead
    completion_window = deque(maxlen=100)
    completion = []
    schedule_length = []

    for _ in range(0, num_maps):

        # Create a new map.
        obs, info = env.reset(True, True)
        score = 0

        if debug:
            env_renderer.reset()
            env_renderer.render_env(show=True,
                                    frames=False,
                                    show_observations=False)
            time.sleep(refresh)

        # Run episode
        for _ in range(max_steps - 1):

            # Build agent specific observations
            for agent in env.get_agent_handles():
                if obs[agent]:
                    agent_obs[agent] = normalize_observation(
                        obs[agent],
                        observation_tree_depth,
                        observation_radius=observation_radius)

            # If an action is required, select the action.
            for agent in env.get_agent_handles():
                action = 0
                if info['action_required'][agent]:
                    action = policy.act(agent_obs[agent], eps=0.08)
                    # print("Required " + str(action))
                action_dict.update({agent: action})
            schedule_length.append(len(action_dict))
            # Environment step
            obs, all_rewards, done, info = env.step(action_dict)

            if debug:
                env_renderer.render_env(show=True,
                                        frames=False,
                                        show_observations=False)
                time.sleep(refresh)

            # Track rewards.
            for agent in env.get_agent_handles():
                score = score + all_rewards[agent]

            if done['__all__']:
                successes = successes + 1
                break
        tasks_finished = np.sum(
            [int(done[idx]) for idx in env.get_agent_handles()])
        completion_window.append(tasks_finished / max(1, env.get_num_agents()))
        completion.append((np.mean(completion_window)))
        # Record scores.
        scores.append(score)

    print("Successful:    %8.2f%%" % (100 * successes / num_maps))
    print("Mean reward:   %8.2f" % (np.mean(scores)))
    print("Median reward: %8.2f" % (np.median(scores)))
    print("Instances solved: %8.2f" % (np.mean(completion)))
    print("Instances solved: %8.2f" % (np.mean(schedule_length)))
Example #5
class FlatlandEnv(gym.Env):
    def __init__(self,
                 n_cars=3,
                 n_acts=5,
                 min_obs=-1,
                 max_obs=1,
                 n_nodes=2,
                 ob_radius=10,
                 x_dim=36,
                 y_dim=36,
                 feats='all'):

        self.tree_obs = tree_observation.TreeObservation(n_nodes)
        self.n_cars = n_cars
        self.n_nodes = n_nodes
        self.ob_radius = ob_radius
        self.feats = feats

        rail_gen = sparse_rail_generator(max_num_cities=3,
                                         seed=666,
                                         grid_mode=False,
                                         max_rails_between_cities=2,
                                         max_rails_in_city=3)

        self._rail_env = RailEnv(
            width=x_dim,
            height=y_dim,
            rail_generator=rail_gen,
            schedule_generator=sparse_schedule_generator(speed_ration_map),
            number_of_agents=n_cars,
            malfunction_generator_and_process_data=malfunction_from_params(
                stochastic_data),
            obs_builder_object=self.tree_obs)

        self.renderer = RenderTool(self._rail_env, gl="PILSVG")
        self.action_dict = dict()
        self.info = dict()
        self.old_obs = dict()

    def step(self, action):
        # Update the action of each agent
        for agent_id in range(self.n_cars):
            if action[agent_id] is None:
                action[agent_id] = 2
            self.action_dict.update({
                agent_id: action[agent_id] + 1
            })  # FIXME: Hack for ignoring action 0 (model only outputs 4)

        # Take actions, get observations
        next_obs, all_rewards, done, self.info = self._rail_env.step(
            self.action_dict)

        # Normalise observations for each agent
        for agent_id in range(self._rail_env.get_num_agents()):

            # Check if agent is finished
            if not done[agent_id]:
                # Normalise next observation
                next_obs[agent_id] = normalize_observation(
                    tree=next_obs[agent_id],
                    max_depth=self.n_nodes,
                    observation_radius=self.ob_radius,
                    feats=self.feats)

                # Keep track of last observation for trains that finish
                self.old_obs[agent_id] = next_obs[agent_id].copy()
            else:
                # Use last observation if agent finished
                next_obs[agent_id] = self.old_obs[agent_id]

        return next_obs, all_rewards, done, self.info

    def reset(self):
        """
        Reset the state of the environment and return an initial observation.
        return obs: initial observation of the space
        """
        self.action_dict = dict()
        self.info = dict()
        self.old_obs = dict()

        obs, self.info = self._rail_env.reset(True, True)
        for agent_id in range(self.n_cars):
            if obs[agent_id]:
                obs[agent_id] = normalize_observation(obs[agent_id],
                                                      self.n_nodes,
                                                      self.ob_radius,
                                                      feats=self.feats)
        self.renderer.reset()
        return obs, self.info

    def render(self, mode=None):
        self.renderer.render_env()
        image = self.renderer.get_image()
        cv2.imshow('Render', image)
        cv2.waitKey(20)
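FlatlandEnv relies on module-level objects defined elsewhere in the author's project: `tree_observation`, `normalize_observation`, `speed_ration_map`, and `stochastic_data`. Values along the following lines would satisfy the constructor (assumed, not the author's exact configuration; on older flatland versions `stochastic_data` may instead be a plain dict with the same keys):

from flatland.envs.malfunction_generators import MalfunctionParameters

# Share of agents per speed class (keys are speeds, values are probabilities).
speed_ration_map = {
    1.: 0.25,       # Fast passenger train
    1. / 2.: 0.25,  # Fast freight train
    1. / 3.: 0.25,  # Slow commuter train
    1. / 4.: 0.25,  # Slow freight train
}

# Malfunction settings consumed by malfunction_from_params().
stochastic_data = MalfunctionParameters(malfunction_rate=1 / 8000,
                                        min_duration=15,
                                        max_duration=50)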
Example #6
def main():
    np.random.seed(1)

    env = RailEnv(
        width=x_dim,
        height=y_dim,
        number_of_agents=n_agents,
        rail_generator=rail_generator,
        schedule_generator=schedule_generator,
        malfunction_generator_and_process_data=malfunction_from_params(
            StochasticData(1 / 8000, 15, 50)),
        obs_builder_object=TreeObservation(max_depth=tree_depth))

    # After training we want to render the results so we also load a renderer
    env_renderer = RenderTool(env, gl="PILSVG")

    # Calculate the state size based on the number of nodes in the tree observation
    num_features_per_node = env.obs_builder.observation_dim
    num_nodes = sum(np.power(4, i) for i in range(tree_depth + 1))
    state_size = num_features_per_node * num_nodes
    action_size = 5

    # Now we load a double dueling DQN agent and initialize it from the checkpoint
    agent = Agent(state_size, action_size)
    if load_from_checkpoint:
        start, eps = agent.load(project_root / 'checkpoints', 0, 1.0)
    else:
        start, eps = 0, 1.0

    # And some variables to keep track of the progress
    action_dict, final_action_dict = {}, {}
    scores_window, done_window = deque(maxlen=500), deque(maxlen=500)
    action_prob = [0] * action_size
    agent_obs = [None] * n_agents
    agent_obs_buffer = [None] * n_agents
    agent_action_buffer = [2] * n_agents

    max_steps = int(3 * (x_dim + y_dim))
    update_values = False
    start_time = time.time()

    # We don't want to retrain on old railway networks when we restart from a checkpoint, so we just loop
    # through the generators to get all the old networks out of the way
    for _ in range(0, start):
        rail_generator()
        schedule_generator()

    # Start the training loop
    for episode in range(start + 1, n_trials + 1):
        env_renderer.reset()
        obs, info = env.reset(True, True)
        score = 0

        # Build agent specific observations
        for a in range(n_agents):
            if obs[a]:
                agent_obs[a] = normalize_observation(obs[a],
                                                     tree_depth,
                                                     observation_radius=10)
                agent_obs_buffer[a] = agent_obs[a].copy()

        # Run episode
        for step in range(max_steps):
            for a in range(n_agents):
                if info['action_required'][a]:
                    # If an action is required, we want to store the obs at that step as well as the action
                    update_values = True
                    action = agent.act(agent_obs[a], eps=eps)
                    # action = np.random.randint(4)
                    action_dict[a] = action
                    action_prob[action] += 1
                else:
                    update_values = False
                    action_dict[a] = 0

            # Environment step
            next_obs, all_rewards, done, info = env.step(action_dict)

            # Update replay buffer and train agent
            for a in range(n_agents):
                # Only update the values when we are done or when an action was taken and thus relevant information is present
                if update_values or done[a]:
                    agent.step(agent_obs_buffer[a], agent_action_buffer[a],
                               all_rewards[a], agent_obs[a], done[a], train)
                    agent_obs_buffer[a] = agent_obs[a].copy()
                    agent_action_buffer[a] = action_dict[a]
                if next_obs[a]:
                    agent_obs[a] = normalize_observation(next_obs[a],
                                                         tree_depth,
                                                         observation_radius=10)

                score += all_rewards[a] / n_agents

            # Render
            if episode % render_interval == 0: render(env_renderer)
            if done['__all__']: break

        # Epsilon decay
        eps = max(eps_end, eps_decay * eps)  # decrease epsilon

        # Collect information about training
        tasks_finished = sum(done[i] for i in range(n_agents))
        done_window.append(tasks_finished / max(1, n_agents))
        scores_window.append(score / max_steps)  # save most recent score

        action_probs = ', '.join(f'{x:.3f}'
                                 for x in action_prob / np.sum(action_prob))
        print(f'\rTraining {n_agents} Agents on ({x_dim},{y_dim}) \t ' +
              f'Episode {episode} \t ' +
              f'Average Score: {np.mean(scores_window):.3f} \t ' +
              f'Dones: {100 * np.mean(done_window):.2f}% \t ' +
              f'Epsilon: {eps:.2f} \t ' +
              f'Action Probabilities: {action_probs}',
              end=" ")

        if episode % report_interval == 0:
            print(f'\rTraining {n_agents} Agents on ({x_dim},{y_dim}) \t ' +
                  f'Episode {episode} \t ' +
                  f'Average Score: {np.mean(scores_window):.3f} \t ' +
                  f'Dones: {100 * np.mean(done_window):.2f}% \t ' +
                  f'Epsilon: {eps:.2f} \t ' +
                  f'Action Probabilities: {action_probs} \t ' +
                  f'Time taken: {time.time() - start_time:.2f}s')

            if train: agent.save(project_root / 'checkpoints', episode, eps)
            start_time = time.time()
            action_prob = [1] * action_size
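This example also reads a number of module-level names (`x_dim`, `y_dim`, `n_agents`, `tree_depth`, `n_trials`, `eps_end`, `eps_decay`, `render_interval`, `report_interval`, `train`, `load_from_checkpoint`, the generators, `project_root`, `Agent`, and `render`) that are defined elsewhere. Illustrative values for the scalar knobs, assumed rather than taken from the original project, might be:

# Assumed configuration for Example #6 (illustrative values only).
x_dim, y_dim = 36, 36
n_agents = 5
tree_depth = 2
n_trials = 10000
eps_end, eps_decay = 0.005, 0.998
render_interval, report_interval = 100, 50
train = True
load_from_checkpoint = False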
Example #7
def main(args, dir):
    '''
    :param args: parsed command-line arguments
    :return:
    Episodes to debug (set a breakpoint in the episodes loop):
    - ep = 3: agent 1 spawns in front of 3, blocking its path; 0 and 2 are in a deadlock since they have the same priority
    - ep = 4: agents stop because of wrong priorities even though the conflict zone wasn't entered
    - ep = 14
    '''
    rail_generator = sparse_rail_generator(
        max_num_cities=args.max_num_cities,
        seed=args.seed,
        grid_mode=args.grid_mode,
        max_rails_between_cities=args.max_rails_between_cities,
        max_rails_in_city=args.max_rails_in_city,
    )

    # Maps speeds to % of appearance in the env
    speed_ration_map = {
        1.: 0.25,  # Fast passenger train
        1. / 2.: 0.25,  # Fast freight train
        1. / 3.: 0.25,  # Slow commuter train
        1. / 4.: 0.25  # Slow freight train
    }

    observation_builder = GraphObsForRailEnv(
        predictor=ShortestPathPredictorForRailEnv(
            max_depth=args.prediction_depth),
        bfs_depth=4)

    env = RailEnv(
        width=args.width,
        height=args.height,
        rail_generator=rail_generator,
        schedule_generator=sparse_schedule_generator(speed_ration_map),
        number_of_agents=args.num_agents,
        obs_builder_object=observation_builder,
        malfunction_generator_and_process_data=malfunction_from_params(
            parameters={
                'malfunction_rate':
                args.malfunction_rate,  # Rate of malfunction occurrence
                'min_duration':
                args.min_duration,  # Minimal duration of malfunction
                'max_duration':
                args.max_duration  # Max duration of malfunction
            }))

    if args.render:
        env_renderer = RenderTool(env,
                                  agent_render_variant=AgentRenderVariant.
                                  AGENT_SHOWS_OPTIONS_AND_BOX,
                                  show_debug=True)

    sm = stateMachine()
    tb = TestBattery(env, observation_builder)

    state_machine_action_dict = {}
    railenv_action_dict = {}
    # max_time_steps = env.compute_max_episode_steps(args.width, args.height)
    max_time_steps = 200
    T_rewards = []  # List of episodes rewards
    T_Qs = []  # List of q values
    T_num_done_agents = []  # List of number of done agents for each episode
    T_all_done = []  # If all agents completed in each episode
    T_episodes = []  # Time taken for each episode

    if args.save_image and not os.path.isdir("image_dump"):
        os.makedirs("image_dump")

    step_taken = 0
    total_step_taken = 0
    total_episodes = 0
    step_times = []  # Time taken for each step

    for ep in range(args.num_episodes):
        # Reset info at the beginning of an episode
        start_time = time.time()  # Take time of one episode

        if args.generate_baseline:
            if not os.path.isdir("image_dump/" + str(dir)) and args.save_image:
                os.makedirs("image_dump/" + str(dir))
        else:
            if not os.path.isdir("image_dump/" + str(ep)) and args.save_image:
                os.makedirs("image_dump/" + str(ep))

        state, info = env.reset()
        tb.reset()

        if args.render:
            env_renderer.reset()
        reward_sum, all_done = 0, False  # reward_sum accumulates the total reward over the episode's steps
        num_done_agents = 0

        state_machine_action = {}
        for i in range(env.number_of_agents):
            state_machine_action[i] = 0

        for step in range(max_time_steps):
            start_step_time = time.time()

            #if step % 10 == 0:
            #	print(step)

            # Test battery
            # see test_battery.py
            triggers = tb.tests(state, args.prediction_depth,
                                state_machine_action)
            # state machine based on triggers of test battery
            # see state_machine.py
            state_machine_action = sm.act(
                triggers)  # State machine picks action

            for a in range(env.get_num_agents()):
                #if info['action_required'][a]:
                #	#railenv_action = observation_builder.choose_railenv_action(a, state_machine_action[a])
                #	railenv_action = observation_builder.choose_railenv_action(a, state_machine_action[a])
                #	state_machine_action_dict.update({a: state_machine_action})
                #	railenv_action_dict.update({a: railenv_action})
                # railenv_action = observation_builder.choose_railenv_action(a, state_machine_action[a])
                railenv_action = observation_builder.choose_railenv_action(
                    a, state_machine_action[a])
                state_machine_action_dict.update({a: state_machine_action[a]})
                railenv_action_dict.update({a: railenv_action})

            state, reward, done, info = env.step(
                railenv_action_dict)  # Env step

            if args.generate_baseline:
                #env_renderer.render_env(show=True, show_observations=False, show_predictions=True)
                env_renderer.render_env(show=False,
                                        show_observations=False,
                                        show_predictions=True)
            else:
                env_renderer.render_env(show=True,
                                        show_observations=False,
                                        show_predictions=True)

            if args.generate_baseline:
                if args.save_image:
                    env_renderer.save_image("image_dump/" + str(dir) +
                                            "/image_" + str(step) + "_.png")
            else:
                if args.save_image:
                    env_renderer.save_image("image_dump/" + str(ep) +
                                            "/image_" + str(step) + "_.png")

            if args.debug:
                for a in range(env.get_num_agents()):
                    log('\n\n#########################################')
                    log('\nInfo for agent {}'.format(a))
                    #log('\npath : {}'.format(state[a]["path"]))
                    log('\noverlap : {}'.format(state[a]["overlap"]))
                    log('\ndirection : {}'.format(state[a]["direction"]))
                    log('\nOccupancy, first layer: {}'.format(
                        state[a]["occupancy"]))
                    log('\nOccupancy, second layer: {}'.format(
                        state[a]["conflict"]))
                    log('\nForks: {}'.format(state[a]["forks"]))
                    log('\nTarget: {}'.format(state[a]["target"]))
                    log('\nPriority: {}'.format(state[a]["priority"]))
                    log('\nMax priority encountered: {}'.format(
                        state[a]["max_priority"]))
                    log('\nNum malfunctioning agents (globally): {}'.format(
                        state[a]["n_malfunction"]))
                    log('\nNum agents ready to depart (globally): {}'.format(
                        state[a]["ready_to_depart"]))
                    log('\nStatus: {}'.format(info['status'][a]))
                    log('\nPosition: {}'.format(env.agents[a].position))
                    log('\nTarget: {}'.format(env.agents[a].target))
                    log('\nMoving? {} at speed: {}'.format(
                        env.agents[a].moving, info['speed'][a]))
                    log('\nAction required? {}'.format(
                        info['action_required'][a]))
                    log('\nState machine action: {}'.format(
                        state_machine_action_dict[a]))
                    log('\nRailenv action: {}'.format(railenv_action_dict[a]))
                    log('\nRewards: {}'.format(reward[a]))
                    log('\n\n#########################################')

            reward_sum += sum(reward[a] for a in range(env.get_num_agents()))

            step_taken = step
            time_taken_step = time.time() - start_step_time
            step_times.append(time_taken_step)

            if done['__all__']:
                all_done = True
                break

        total_step_taken += step_taken

        time_taken = time.time() - start_time  # Time taken for one episode
        T_episodes.append(time_taken)
        total_episodes = ep

        # Time metrics - too precise
        avg_time_step = sum(step_times) / step_taken
        #print("Avg time step: " + str(avg_time_step))

        # No need to close the renderer since env parameter sizes stay the same
        T_rewards.append(reward_sum)
        # Compute num of agents that reached their target
        for a in range(env.get_num_agents()):
            if done[a]:
                num_done_agents += 1
        percentage_done_agents = num_done_agents / env.get_num_agents()
        log("\nDone agents in episode: {}".format(percentage_done_agents))
        T_num_done_agents.append(
            percentage_done_agents)  # In proportion to total
        T_all_done.append(all_done)

    # Average number of agents that reached their target
    avg_done_agents = sum(T_num_done_agents) / len(T_num_done_agents) if len(
        T_num_done_agents) > 0 else 0
    avg_reward = sum(T_rewards) / len(T_rewards) if len(T_rewards) > 0 else 0
    avg_norm_reward = avg_reward / (max_time_steps / env.get_num_agents())

    avg_ep_time = sum(T_episodes) / args.num_episodes

    if total_episodes == 0:
        total_episodes = 1

    log("\nSeed: " + str(args.seed) \
      + "\t | Avg_done_agents: " + str(avg_done_agents)\
      + "\t | Avg_reward: " + str(avg_reward)\
      + "\t | Avg_norm_reward: " + str(avg_norm_reward)\
      + "\t | Max_num_time_steps: " + str(max_time_steps)\
      + "\t | Avg_num_time_steps: " + str(total_step_taken/total_episodes)
            + "\t | Avg episode time: " + str(avg_ep_time))
Example #8
def main(argv):
    try:
        opts, args = getopt.getopt(argv, "n:", ["n_trials="])
    except getopt.GetoptError:
        print('test_navigation_single_agent.py -n <n_trials>')
        sys.exit(2)
    for opt, arg in opts:
        if opt in ('-n', '--n_trials'):
            n_trials = int(arg)

    random.seed(1)
    np.random.seed(1)

    ######## TEST SET SELECTION - PARAMETERS ########
    
    test_multi_agent_setup = 1             # 1 for Medium size test, 2 for Big size test
    test_n_agents = 5                      # Number of agents to test (3 - 5 - 7 for Medium, 5 - 7 - 10 for Big)
    test_malfunctions_enabled = True       # Malfunctions enabled?
    test_agents_one_speed = True           # Test agents with the same speed (1) or with 4 different speeds?

    #################################################

    # Medium size
    if test_multi_agent_setup == 1:
        x_dim = 16*3
        y_dim = 9*3
        max_num_cities = 5
        max_rails_between_cities = 2
        max_rails_in_city = 3

    # Big size
    if test_multi_agent_setup == 2:
        x_dim = 16*4
        y_dim = 9*4
        max_num_cities = 9
        max_rails_between_cities = 5
        max_rails_in_city = 5


    stochastic_data = {'malfunction_rate': 80,  # Rate of malfunction occurrence of a single agent
                       'min_duration': 15,  # Minimal duration of malfunction
                       'max_duration': 50  # Max duration of malfunction
                       }

    # Custom observation builder
    tree_depth = 2
    TreeObservation = TreeObsForRailEnv(max_depth=tree_depth, predictor=ShortestPathPredictorForRailEnv(20))

    np.savetxt(fname=path.join('NetsTest', 'info.txt'), X=[x_dim, y_dim, test_n_agents, max_num_cities, max_rails_between_cities, max_rails_in_city, tree_depth], delimiter=';')

    # Different agent types (trains) with different speeds.
    if test_agents_one_speed:
        speed_ration_map = {1.: 1.,  # Fast passenger train
                            1. / 2.: 0.0,  # Fast freight train
                            1. / 3.: 0.0,  # Slow commuter train
                            1. / 4.: 0.0}  # Slow freight train
    else:
        speed_ration_map = {1.: 0.25,  # Fast passenger train
                            1. / 2.: 0.25,  # Fast freight train
                            1. / 3.: 0.25,  # Slow commuter train
                            1. / 4.: 0.25}  # Slow freight train

    
    if test_malfunctions_enabled:
        env = RailEnv(width=x_dim,
                      height=y_dim,
                      rail_generator=sparse_rail_generator(max_num_cities=max_num_cities,
                                                           # Number of cities in map (where train stations are)
                                                           seed=14,  # Random seed
                                                           grid_mode=False,
                                                           max_rails_between_cities=max_rails_between_cities,
                                                           max_rails_in_city=max_rails_in_city),
                      schedule_generator=sparse_schedule_generator(speed_ration_map),
                      malfunction_generator_and_process_data=malfunction_from_params(stochastic_data),
                      number_of_agents=test_n_agents,
                      obs_builder_object=TreeObservation)
    else:
        env = RailEnv(width=x_dim,
                      height=y_dim,
                      rail_generator=sparse_rail_generator(max_num_cities=max_num_cities,
                                                           # Number of cities in map (where train stations are)
                                                           seed=14,  # Random seed
                                                           grid_mode=False,
                                                           max_rails_between_cities=max_rails_between_cities,
                                                           max_rails_in_city=max_rails_in_city),
                      schedule_generator=sparse_schedule_generator(speed_ration_map),
                      number_of_agents=test_n_agents,
                      obs_builder_object=TreeObservation)
    
    env.reset()

    #env_renderer = RenderTool(env, gl="PILSVG", )
    env_renderer = RenderTool(env, gl="PILSVG",
                          agent_render_variant=AgentRenderVariant.AGENT_SHOWS_OPTIONS_AND_BOX,
                          show_debug=False,
                          screen_height=(1080*0.8),  # Adjust these parameters to fit your resolution
                          screen_width=(1920*0.8))
    num_features_per_node = env.obs_builder.observation_dim

    
    nr_nodes = 0
    for i in range(tree_depth + 1):
        nr_nodes += np.power(4, i)
    state_size = num_features_per_node * nr_nodes
    action_size = 5

    # We set the number of episodes we would like to train on
    if 'n_trials' not in locals():
        n_trials = 15000
    
    # max_steps computation
    speed_weighted_mean = 0

    for key in speed_ration_map.keys():
        speed_weighted_mean += key * speed_ration_map[key]
    
    #max_steps = int(3 * (env.height + env.width))
    max_steps = int((1/speed_weighted_mean) * 3 * (env.height + env.width))
    #eps = 1.
    #eps_end = 0.005
    #eps_decay = 0.9995

    # And some variables to keep track of the performance
    action_dict = dict()
    final_action_dict = dict()
    action_prob_list = []
    scores_window = deque(maxlen=100)
    done_window = deque(maxlen=100)
    scores = []
    scores_list = []
    deadlock_list = []
    dones_list_window = []
    dones_list = []
    action_prob = [0] * action_size
    agent_obs = [None] * env.get_num_agents()
    agent_next_obs = [None] * env.get_num_agents() # Useless
    agent = Agent(state_size, action_size)
    
    # LOAD MODEL WEIGHTS TO TEST
    agent.qnetwork_local.load_state_dict(torch.load(path.join('NetsTest', 'navigator_checkpoint3800_multi10_deadlock_global10.pth')))

    record_images = False
    frame_step = 0

    for trials in range(1, n_trials + 1):

        # Reset environment
        obs, info = env.reset()  # (True, True)
        env_renderer.reset()
        # Build agent specific observations
        for a in range(env.get_num_agents()):
            agent_obs[a] = normalize_observation(obs[a], tree_depth, observation_radius=10)
        # Reset score and done
        score = 0
        env_done = 0

        # Run episode
        for step in range(max_steps):

            # Action
            for a in range(env.get_num_agents()):
                if info['action_required'][a]:
                    action = agent.act(agent_obs[a], eps=0.)
                    action_prob[action] += 1

                else:
                    action = 0

                action_dict.update({a: action})
            # Environment step
            obs, all_rewards, done, deadlocks, info = env.step(action_dict)

            env_renderer.render_env(show=True, show_predictions=True, show_observations=False)
            # Build agent specific observations and normalize
            for a in range(env.get_num_agents()):
                if obs[a]:
                    agent_obs[a] = normalize_observation(obs[a], tree_depth, observation_radius=10)

                score += all_rewards[a] / env.get_num_agents()


            if done['__all__']:
                break

        # Collect information about training
        tasks_finished = 0
        for _idx in range(env.get_num_agents()):
            if done[_idx] == 1:
                tasks_finished += 1
        done_window.append(tasks_finished / max(1, env.get_num_agents()))
        scores_window.append(score / max_steps)  # save most recent score
        scores.append(np.mean(scores_window))
        dones_list.append(tasks_finished / max(1, env.get_num_agents()))
        dones_list_window.append((np.mean(done_window)))
        scores_list.append(score / max_steps)
        deadlock_list.append(deadlocks.count(1) / max(1, env.get_num_agents()))

        if np.sum(action_prob) == 0:
            action_prob_normalized = [0] * action_size
        else:
            action_prob_normalized = action_prob / np.sum(action_prob)

        print(
            '\rTesting {} Agents on ({},{}).\t Episode {}\t Score: {:.3f}\tDones: {:.2f}%\tDeadlocks: {:.2f}\t Action Probabilities: \t {}'.format(
                env.get_num_agents(), x_dim, y_dim, trials, score / max_steps,
                100 * tasks_finished / max(1, env.get_num_agents()),
                deadlocks.count(1) / max(1, env.get_num_agents()),
                action_prob_normalized), end=" ")

        #if trials % 100 == 0:
        action_prob_list.append(action_prob_normalized)
        action_prob = [0] * action_size

        if trials % 50 == 0:

            np.savetxt(fname=path.join('NetsTest', 'test_metrics.csv'), X=np.transpose(np.asarray([scores_list, scores, dones_list, dones_list_window, deadlock_list])), delimiter=';', newline='\n')
            np.savetxt(fname=path.join('NetsTest', 'test_action_prob.csv'), X=np.asarray(action_prob_list), delimiter=';', newline='\n')
Example #9
def main(args):

    rail_generator = sparse_rail_generator(
        max_num_cities=args.max_num_cities,
        #seed=args.seed,
        seed=0,  # 0, 3, 7, 10, 14, 16, 20, 22, 23, 25, 26, 32
        grid_mode=args.grid_mode,
        max_rails_between_cities=args.max_rails_between_cities,
        max_rails_in_city=args.max_rails_in_city,
    )

    # Maps speeds to % of appearance in the env
    speed_ration_map = {
        1.: 0.25,  # Fast passenger train
        1. / 2.: 0.25,  # Fast freight train
        1. / 3.: 0.25,  # Slow commuter train
        1. / 4.: 0.25  # Slow freight train
    }

    observation_builder = GraphObsForRailEnv(
        bfs_depth=args.bfs_depth,
        predictor=ShortestPathPredictorForRailEnv(
            max_depth=args.prediction_depth))

    # Construct the environment with the given observation, generators, predictors, and stochastic data
    env = RailEnv(
        width=args.width,
        height=args.height,
        rail_generator=rail_generator,  # rail_from_file, who knows...
        schedule_generator=sparse_schedule_generator(speed_ration_map),
        number_of_agents=args.num_agents,
        obs_builder_object=observation_builder,
        malfunction_generator_and_process_data=malfunction_from_params(
            parameters={
                'malfunction_rate':
                args.malfunction_rate,  # Rate of malfunction occurrence
                'min_duration':
                args.min_duration,  # Minimal duration of malfunction
                'max_duration':
                args.max_duration  # Max duration of malfunction
            }))

    env_renderer = RenderTool(
        env,
        agent_render_variant=AgentRenderVariant.AGENT_SHOWS_OPTIONS_AND_BOX,
        show_debug=True,
        screen_height=1080,
        screen_width=1920)

    state_machine_action_dict = {}
    railenv_action_dict = {}
    max_time_steps = 150
    T_rewards = []  # List of episodes rewards
    T_Qs = []  # List of q values
    T_num_done_agents = []  # List of number of done agents for each episode
    T_all_done = []  # If all agents completed in each episode

    for ep in range(args.num_episodes):
        # Reset info at the beginning of an episode
        # env.load(filename="map" + str(ep))
        state, info = env.reset()
        # env.save(filename="map" + str(ep))
        env_renderer.reset()
        reward_sum, all_done = 0, False  # reward_sum contains the cumulative reward obtained as sum during the steps
        num_done_agents = 0

        for step in range(max_time_steps):

            for a in range(env.get_num_agents()):
                shortest_path_prediction = observation_builder.cells_sequence[a]
                state_machine_action, is_alternative = act(
                    args, env, a, state[a],
                    shortest_path_prediction)  # State machine picks action
                if not is_alternative:
                    railenv_action = observation_builder.choose_railenv_action(
                        a, state_machine_action)
                else:
                    railenv_action = state_machine_action
                state_machine_action_dict.update({a: state_machine_action})
                railenv_action_dict.update({a: railenv_action})

            state, reward, done, info = env.step(
                railenv_action_dict)  # Env step
            env_renderer.render_env(show=True,
                                    show_observations=False,
                                    show_predictions=True)

            for a in range(env.get_num_agents()):
                print('#########################################')
                print('Info for agent {}'.format(a))
                print('Occupancy, first layer: {}'.format(
                    state[a][:args.prediction_depth]))
                print('Occupancy, second layer: {}'.format(
                    state[a][args.prediction_depth:args.prediction_depth * 2]))
                print('Forks: {}'.format(
                    state[a][args.prediction_depth * 2:args.prediction_depth *
                             3]))
                print('Target: {}'.format(
                    state[a][args.prediction_depth * 3:args.prediction_depth *
                             4]))
                print('Priority: {}'.format(state[a][args.prediction_depth *
                                                     4]))
                print('Max priority encountered: {}'.format(
                    state[a][args.prediction_depth * 4 + 1]))
                print('Num malfunctioning agents (globally): {}'.format(
                    state[a][args.prediction_depth * 4 + 2]))
                print('Num agents ready to depart (globally): {}'.format(
                    state[a][args.prediction_depth * 4 + 3]))
                print('Status: {}'.format(info['status'][a]))
                print('Position: {}'.format(env.agents[a].position))
                print('Moving? {} at speed: {}'.format(env.agents[a].moving,
                                                       info['speed'][a]))
                print('Action required? {}'.format(info['action_required'][a]))
                print('Network action: {}'.format(
                    state_machine_action_dict[a]))
                print('Railenv action: {}'.format(railenv_action_dict[a]))
                # print('Q values: {}'.format(qvalues[a]))
                print('Rewards: {}'.format(reward[a]))

            reward_sum += sum(reward[a] for a in range(env.get_num_agents()))

            if done['__all__']:
                all_done = True
                break
        # No need to close the renderer since env parameter sizes stay the same
        T_rewards.append(reward_sum)
        # Compute num of agents that reached their target
        for a in range(env.get_num_agents()):
            if done[a]:
                num_done_agents += 1
        T_num_done_agents.append(
            num_done_agents / env.get_num_agents())  # In proportion to total
        T_all_done.append(all_done)

    avg_done_agents = sum(T_num_done_agents) / len(
        T_num_done_agents
    )  # Average number of agents that reached their target
    avg_reward = sum(T_rewards) / len(T_rewards)
    avg_norm_reward = avg_reward / (max_time_steps / env.get_num_agents())

    print("Avg. done agents: {}".format(avg_done_agents))
    print("Avg. reward: {}".format(avg_reward))
    print("Avg. norm reward: {}".format(avg_norm_reward))
Example #10
seed = 1

env = RailEnv(width=width,
              height=height,
              rail_generator=complex_rail_generator(nr_start_goal=n_start_goal,
                                                    nr_extra=3,
                                                    min_dist=6,
                                                    max_dist=99999,
                                                    seed=seed),
              schedule_generator=complex_schedule_generator(),
              number_of_agents=number_agents,
              obs_builder_object=TreeObsForRailEnv(max_depth=5))

n_episodes = 6000
n_steps = (width + height) * number_agents

steps_per_episode, my_env, qtable = run(env,
                                        n_episodes,
                                        n_steps,
                                        initial_value=0,
                                        learning_rate=0.8,
                                        gamma=0.95,
                                        epsilon=0.1)

renderer = RenderTool(env, agent_render_variant=3)
renderer.reset()
renderer.render_env(show=True, show_predictions=False, show_observations=False)

steps_per_episode = np.array(steps_per_episode)
plt.plot(moving_average(steps_per_episode, 1000))
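`moving_average` is not defined in this snippet; a minimal version compatible with the plotting call above (an assumption) is:

def moving_average(values, window):
    # Trailing moving average via convolution; the result is shorter than
    # the input by window - 1 samples.
    weights = np.ones(window) / window
    return np.convolve(values, weights, mode='valid')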
Example #11
def main():
    np.random.seed(1)

    env = RailEnv(
        width=flags.grid_width,
        height=flags.grid_height,
        number_of_agents=flags.num_agents,
        rail_generator=rail_generator,
        schedule_generator=schedule_generator,
        malfunction_generator_and_process_data=malfunction_from_params(
            MalfunctionParameters(1 / 8000, 15, 50)),
        obs_builder_object=TreeObservation(max_depth=flags.tree_depth))

    # After training we want to render the results so we also load a renderer
    env_renderer = RenderTool(env, gl="PILSVG")

    # Calculate the state size based on the number of nodes in the tree observation
    num_features_per_node = env.obs_builder.observation_dim
    num_nodes = sum(np.power(4, i) for i in range(flags.tree_depth + 1))
    state_size = num_nodes * num_features_per_node
    action_size = 5

    # Now we load a double dueling DQN agent and initialize it from the checkpoint
    agent = Agent(state_size, action_size)
    if flags.load_from_checkpoint:
        start, eps = agent.load(project_root / 'checkpoints', 0, 1.0)
    else:
        start, eps = 0, 1.0

    # And some variables to keep track of the progress
    action_dict, final_action_dict = {}, {}
    scores_window, steps_window, done_window = deque(maxlen=200), deque(
        maxlen=200), deque(maxlen=200)
    action_prob = [0] * action_size
    agent_obs = [None] * flags.num_agents
    agent_obs_buffer = [None] * flags.num_agents
    agent_action_buffer = [2] * flags.num_agents

    max_steps = int(8 * (flags.grid_width + flags.grid_height))
    update_values = False
    start_time = time.time()

    # We don't want to retrain on old railway networks when we restart from a checkpoint, so we just loop
    # through the generators to get all the old networks out of the way
    if start > 0: print(f"Skipping {start} railways")
    for _ in range(0, start):
        rail_generator()
        schedule_generator()

    # Start the training loop
    for episode in range(start + 1, flags.num_episodes + 1):
        env_renderer.reset()
        obs, info = env.reset(True, True)
        score, steps_taken = 0, 0

        # Build agent specific observations
        for a in range(flags.num_agents):
            if obs[a]:
                agent_obs[a] = normalize_observation(obs[a], flags.tree_depth)
                agent_obs_buffer[a] = agent_obs[a].copy()

        # Run episode
        for step in range(max_steps):
            for a in range(flags.num_agents):
                # if not isinstance(obs[a].childs['L'], float) or not isinstance(obs[a].childs['R'], float):
                if info['action_required'][a]:
                    # If an action is required, we want to store the obs at that step as well as the action
                    update_values = True

                    # distances = { key: child.dist_min_to_target for key, child in obs[a].childs.items() if not isinstance(child, float) }
                    # action_key = min(distances, key=distances.get)
                    # action = { 'L': 1, 'F': 2, 'R': 3 }[action_key]
                    # action = np.argmin(agent_obs[a])

                    # action = np.random.randint(4)
                    action = agent.act(agent_obs[a], eps=eps)
                    action_dict[a] = action
                    action_prob[action] += 1
                    steps_taken += 1
                else:
                    update_values = False
                    action_dict[a] = 2

            # Environment step
            obs, all_rewards, done, info = env.step(action_dict)

            # Update replay buffer and train agent
            for a in range(flags.num_agents):
                # Only update the values when we are done or when an action was taken and thus relevant information is present
                if update_values or done[a]:
                    agent.step(agent_obs_buffer[a], agent_action_buffer[a],
                               all_rewards[a], agent_obs[a], done[a],
                               flags.train)
                    agent_obs_buffer[a] = agent_obs[a].copy()
                    agent_action_buffer[a] = action_dict[a]
                if obs[a]:
                    agent_obs[a] = normalize_observation(
                        obs[a], flags.tree_depth)

                score += all_rewards[a] / flags.num_agents

            # Render
            if flags.render_interval and episode % flags.render_interval == 0:
                render(env_renderer)
            if done['__all__']: break

        # Epsilon decay
        eps = max(0.01, flags.epsilon_decay * eps)

        # Save some training statistics in their respective deques
        tasks_finished = sum(done[i] for i in range(flags.num_agents))
        done_window.append(tasks_finished / max(1, flags.num_agents))
        scores_window.append(score / max_steps)
        steps_window.append(steps_taken)
        action_probs = ', '.join(f'{x:.3f}'
                                 for x in action_prob / np.sum(action_prob))

        print(
            f'\rTraining {flags.num_agents} Agents on ({flags.grid_width},{flags.grid_height}) \t '
            + f'Episode {episode} \t ' +
            f'Average Score: {np.mean(scores_window):.3f} \t ' +
            f'Average Steps Taken: {np.mean(steps_window):.1f} \t ' +
            f'Dones: {100 * np.mean(done_window):.2f}% \t ' +
            f'Epsilon: {eps:.2f} \t ' +
            f'Action Probabilities: {action_probs}',
            end=" ")

        if episode % flags.report_interval == 0:
            print(
                f'\rTraining {flags.num_agents} Agents on ({flags.grid_width},{flags.grid_height}) \t '
                + f'Episode {episode} \t ' +
                f'Average Score: {np.mean(scores_window):.3f} \t ' +
                f'Average Steps Taken: {np.mean(steps_window):.1f} \t ' +
                f'Dones: {100 * np.mean(done_window):.2f}% \t ' +
                f'Epsilon: {eps:.2f} \t ' +
                f'Action Probabilities: {action_probs} \t ' +
                f'Time taken: {time.time() - start_time:.2f}s')

            if flags.train:
                agent.save(project_root / 'checkpoints', episode, eps)
            start_time = time.time()
            action_prob = [1] * action_size
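The `render(env_renderer)` helper called in Examples #6 and #11 is not included either; it is presumably a thin wrapper along these lines (a sketch, not the original):

def render(env_renderer):
    # Redraw the current state without observations or predictions overlaid.
    env_renderer.render_env(show=True,
                            frames=False,
                            show_observations=False,
                            show_predictions=False)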
Example #12
def get_reward(weights, model, render=False):
    cloned_model = copy.deepcopy(model)
    for i, param in enumerate(cloned_model.parameters()):
        try:
            param.data.copy_(weights[i])
        except:
            param.data.copy_(weights[i].data)

    env_Orig = RailEnv(
        width=x_dim,
        height=y_dim,
        rail_generator=sparse_rail_generator(
            max_num_cities=3,
            # Number of cities in map (where train stations are)
            seed=1,  # Random seed
            grid_mode=False,
            max_rails_between_cities=2,
            max_rails_in_city=3),
        schedule_generator=sparse_schedule_generator(speed_ration_map),
        number_of_agents=n_agents,
        stochastic_data=stochastic_data,  # Malfunction data generator
        obs_builder_object=TreeObservation)

    env = copy.deepcopy(env_Orig)

    # After training we want to render the results so we also load a renderer
    env_renderer = RenderTool(
        env,
        gl="PILSVG",
    )

    # And the max number of steps we want to take per episode
    max_steps = int(4 * 2 * (20 + env.height + env.width))

    n_episodes = 1
    for trials in range(1, n_episodes + 1):
        # Reset environment
        obs, info = env.reset(True, True)
        env_renderer.reset()
        # Build agent specific observations
        for a in range(env.get_num_agents()):
            if obs[a]:
                agent_obs[a] = normalize_observation(obs[a],
                                                     tree_depth,
                                                     observation_radius=10)
                agent_obs_buffer[a] = agent_obs[a].copy()

        # Reset score and done
        score = 0
        env_done = 0
        step = 0

        # Run episode
        while True:
            # Action
            for a in range(env.get_num_agents()):
                if info['action_required'][a]:
                    # If an action is required, we want to store the obs at that step as well as the action
                    update_values[a] = True

                    batch = torch.from_numpy(agent_obs[a][np.newaxis,
                                                          ...]).float()
                    if cuda:
                        batch = batch.cuda()
                    prediction = cloned_model(Variable(batch))
                    action = prediction.data.cpu().numpy().argmax()

                    # action = agent.act(agent_obs[a], eps=eps)
                    action_prob[action] += 1
                else:
                    update_values[a] = False
                    action = 0
                action_dict.update({a: action})

            # Environment step
            # print("Action Values:", action_dict)
            next_obs, all_rewards, done, info = env.step(action_dict)
            step += 1
            if render:
                env_renderer.render_env(show=True,
                                        show_predictions=True,
                                        show_observations=False)

            for a in range(env.get_num_agents()):
                # Only update the values when we are done or when an action was taken and thus relevant information is present
                if update_values[a] or done[a]:
                    # agent.step(agent_obs_buffer[a], agent_action_buffer[a], all_rewards[a],
                    #           agent_obs[a], done[a])
                    cummulated_reward[a] = 0.

                    agent_obs_buffer[a] = agent_obs[a].copy()
                    agent_action_buffer[a] = action_dict[a]
                if next_obs[a]:
                    agent_obs[a] = normalize_observation(next_obs[a],
                                                         tree_depth,
                                                         observation_radius=10)

                score += all_rewards[a] / env.get_num_agents()
            # print(all_rewards)
            # Copy observation
            if done['__all__'] or step >= max_steps:
                env_done = 1
                break

        # Collect information about training
        tasks_finished = 0
        for current_agent in env.agents:
            if current_agent.status == RailAgentStatus.DONE_REMOVED:
                tasks_finished += 1
        done_window.append(tasks_finished / max(1, env.get_num_agents()))
        scores_window.append(score / max_steps)  # save most recent score
        scores.append(np.mean(scores_window))
        dones_list.append((np.mean(done_window)))

        print(
            '\rTraining {} Agents on ({},{}).\t Episode {}\t Average Score: {:.3f}\tDones: {:.2f}%\t Action Probabilities: \t {}'
            .format(env.get_num_agents(), x_dim, y_dim, trials,
                    np.mean(scores_window), 100 * np.mean(done_window),
                    action_prob / np.sum(action_prob)),
            end=" ")

    # env.close()
    data = [[
        n_agents, x_dim, y_dim, trials,
        np.mean(scores_window), 100 * np.mean(done_window), step,
        action_prob / np.sum(action_prob)
    ]]

    dfCur = pd.DataFrame(data)

    with open(f'ES_TrainingResults_{n_agents}_{x_dim}_{y_dim}.csv', 'a') as f:
        dfCur.to_csv(f, index=False, header=False)

    return np.mean(scores)
Example #13
def main(argv):

    random.seed(1)
    np.random.seed(1)

    # Initialize a random map with a random number of agents
    x_dim = np.random.randint(20, 40)
    y_dim = np.random.randint(20, 40)
    n_agents = np.random.randint(3, 4)
    n_goals = n_agents + np.random.randint(0, 3)
    min_dist = int(0.75 * min(x_dim, y_dim))
    tree_depth = 4

    # Get an observation builder and predictor
    predictor = ShortestPathPredictorForRailEnv()
    observation_helper = TreeObsForRailEnv(max_depth=tree_depth, predictor=predictor)

    # Use the malfunction generator to break agents from time to time
    stochastic_data = {'prop_malfunction': 0.0,  # Percentage of defective agents
                       'malfunction_rate': 0,  # Rate of malfunction occurrence
                       'min_duration': 3,  # Minimal duration of malfunction
                       'max_duration': 20  # Max duration of malfunction
                       }

    # Different agent types (trains) with different speeds.
    speed_ration_map = {1.: 0.25,  # Fast passenger train
                        1. / 2.: 0.25,  # Fast freight train
                        1. / 3.: 0.25,  # Slow commuter train
                        1. / 4.: 0.25}  # Slow freight train

    env = RailEnv(width=x_dim,
                  height=y_dim,
                  rail_generator=sparse_rail_generator(max_num_cities=3,
                                                       # Number of cities in map (where train stations are)
                                                       seed=1,  # Random seed
                                                       grid_mode=False,
                                                       max_rails_between_cities=2,
                                                       max_rails_in_city=3),
                  schedule_generator=sparse_schedule_generator(speed_ration_map),
                  number_of_agents=n_agents,
                  stochastic_data=stochastic_data,  # Malfunction data generator
                  obs_builder_object=observation_helper)
    env.reset(True, True)

    # Initiate the renderer
    env_renderer = RenderTool(env, gl="PILSVG",
                              agent_render_variant=AgentRenderVariant.AGENT_SHOWS_OPTIONS_AND_BOX,
                              show_debug=False,
                              screen_height=1000,  # Adjust these parameters to fit your resolution
                              screen_width=1000)  # Adjust these parameters to fit your resolution
    handle = env.get_agent_handles()
    num_features_per_node = env.obs_builder.observation_dim

    nr_nodes = 0
    for i in range(tree_depth + 1):
        nr_nodes += np.power(4, i)
    state_size = 2 * num_features_per_node * nr_nodes
    action_size = 5

    n_trials = 10
    observation_radius = 10
    max_steps = int(3 * (env.height + env.width))
    action_dict = dict()
    time_obs = deque(maxlen=2)
    agent_obs = [None] * env.get_num_agents()

    # Init and load agent
    agent = Agent(state_size, action_size)
    with path(fc_treeobs.nets, "multi_agent_2ts_checkpoint200.pth") as file_in:
        agent.qnetwork_local.load_state_dict(torch.load(file_in))

    # Vars used to record agent performance
    record_images = False
    frame_step = 0

    for trials in range(1, n_trials + 1):
        # Reset environment
        obs, info = env.reset(True, True)
        env_renderer.reset()

        # Build first two-time step observation
        for a in range(env.get_num_agents()):
            obs[a] = normalize_observation(obs[a], tree_depth, observation_radius=10)
        # Accumulate two time steps of observation (Here just twice the first state)
        for i in range(2):
            time_obs.append(obs)
        # Build the agent-specific double time step observation
        for a in range(env.get_num_agents()):
            agent_obs[a] = np.concatenate((time_obs[0][a], time_obs[1][a]))

        # Run episode
        for step in range(max_steps):
            time.sleep(0.01)

            env_renderer.render_env(show=True, show_observations=False, show_predictions=True)

            if record_images:
                env_renderer.gl.save_image("./Images/Avoiding/flatland_frame_{:04d}.bmp".format(frame_step))
                frame_step += 1

            # Perform action for each agent
            for a in range(env.get_num_agents()):
                action = agent.act(agent_obs[a], eps=0)
                action_dict.update({a: action})

            # Environment step
            next_obs, all_rewards, done, _ = env.step(action_dict)

            # Collect observation after environment step
            for a in range(env.get_num_agents()):
                next_obs[a] = normalize_observation(next_obs[a], tree_depth, observation_radius=10)
            # Add new obs to the obs vector
            # Since time_obs is a deque of max_len = 2, an append on the right side when the deque is full
            # provokes a pop of the element from the left side
            time_obs.append(next_obs)
            # Create obs using obs at time step t-1 and obs at time step t
            for a in range(env.get_num_agents()):
                agent_obs[a] = np.concatenate((time_obs[0][a], time_obs[1][a]))

            if done['__all__']:
                break
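The two-time-step stacking above relies on the deque's maxlen behaviour. A standalone illustration with dummy arrays (not part of the original script):

from collections import deque

import numpy as np

time_obs = deque(maxlen=2)
obs_t0 = {0: np.zeros(4), 1: np.ones(4)}          # dummy per-agent observations
obs_t1 = {0: np.full(4, 2.0), 1: np.full(4, 3.0)}

time_obs.append(obs_t0)
time_obs.append(obs_t0)   # first step: the same observation twice
time_obs.append(obs_t1)   # appending to a full deque drops the oldest entry

stacked = {a: np.concatenate((time_obs[0][a], time_obs[1][a])) for a in obs_t1}
print(stacked[0].shape)   # (8,), i.e. twice the single-step observation length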
Example No. 14
def run_episode(kwargs) -> [Trajectory]:
    """
    Runs a single episode and collects the trajectories of each agent
    """
    total_controller_time = 0
    env_dict: Callable = kwargs.get("env_dict")
    obs_builder = kwargs.get("obs_builder")
    controller_creator: Callable = kwargs.get("controller_creator")
    episode_id: int = kwargs.get("episode_id")
    max_episode_length: int = kwargs.get("max_episode_length", 1000)
    render: bool = kwargs.get("render", False)
    # Create and Start Environment
    _env = load_env(env_dict, obs_builder_object=obs_builder)
    obs, info = _env.reset(
        regenerate_rail=False,
        regenerate_schedule=True,
    )
    score = 0
    _trajectories = [Trajectory() for _ in _env.get_agent_handles()]

    # Create and Start Controller
    controller: AbstractController = controller_creator()
    start = time.time()
    controller.start_of_round(obs=obs, env=_env)
    total_controller_time += time.time() - start

    if render:
        env_renderer = RenderTool(_env)
        env_renderer.reset()

    for step in range(max_episode_length):
        start = time.time()
        action_dict, processed_obs = controller.act(observation=obs)
        total_controller_time += time.time() - start
        next_obs, all_rewards, done, info = _env.step(action_dict)

        if render:
            env_renderer.render_env(show=True,
                                    show_observations=True,
                                    show_predictions=False)

        # Save actions and rewards for each agent
        for agent_handle in _env.get_agent_handles():
            _trajectories[agent_handle].add_row(
                state=processed_obs[agent_handle],
                action=action_dict[agent_handle],
                reward=all_rewards[agent_handle],
                done=done[agent_handle])

        score += sum(all_rewards.values())  # all_rewards is a dict keyed by agent handle

        obs = next_obs.copy()
        if done['__all__']:
            break

    if render:
        env_renderer.close_window()
    # print(f"\nController took a total time of: {total_controller_time} seconds", flush=True)
    return _trajectories
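Trajectory, AbstractController and load_env come from the surrounding project and are not shown here. As a rough, hypothetical stand-in for the interface run_episode relies on (illustrative only, not the original classes):

from dataclasses import dataclass, field
from typing import Any, Dict, List, Tuple


@dataclass
class SimpleTrajectory:
    # Records one row per step, mirroring the add_row() calls above.
    states: List[Any] = field(default_factory=list)
    actions: List[int] = field(default_factory=list)
    rewards: List[float] = field(default_factory=list)
    dones: List[bool] = field(default_factory=list)

    def add_row(self, state, action, reward, done):
        self.states.append(state)
        self.actions.append(action)
        self.rewards.append(reward)
        self.dones.append(done)


class DoNothingController:
    # act() must return (action_dict, processed_obs), as used above.
    def start_of_round(self, obs, env):
        pass

    def act(self, observation) -> Tuple[Dict[int, int], Dict[int, Any]]:
        actions = {handle: 0 for handle in observation}  # 0 = DO_NOTHING
        return actions, observation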
def main(argv):
    try:
        opts, args = getopt.getopt(argv, "n:", ["n_trials="])
    except getopt.GetoptError:
        print('training_navigation.py -n <n_trials>')
        sys.exit(2)
    for opt, arg in opts:
        if opt in ('-n', '--n_trials'):
            n_trials = int(arg)

    random.seed(1)
    np.random.seed(1)

    # Parameters for the Environment
    x_dim = 35
    y_dim = 35
    n_agents = 10


    # Use the malfunction generator to break agents from time to time
    stochastic_data = {'malfunction_rate': 8000,  # Rate of malfunction occurrence for a single agent
                       'min_duration': 15,  # Minimal duration of malfunction
                       'max_duration': 50  # Max duration of malfunction
                       }

    # Custom observation builder
    TreeObservation = TreeObsForRailEnv(max_depth=2, predictor=ShortestPathPredictorForRailEnv(30))

    # Different agent types (trains) with different speeds.
    speed_ration_map = {1.: 0.25,  # Fast passenger train
                        1. / 2.: 0.25,  # Fast freight train
                        1. / 3.: 0.25,  # Slow commuter train
                        1. / 4.: 0.25}  # Slow freight train

    env = RailEnv(width=x_dim,
                  height=y_dim,
                  rail_generator=sparse_rail_generator(max_num_cities=3,
                                                       # Number of cities in map (where train stations are)
                                                       seed=1,  # Random seed
                                                       grid_mode=False,
                                                       max_rails_between_cities=2,
                                                       max_rails_in_city=3),
                  schedule_generator=sparse_schedule_generator(speed_ration_map),
                  number_of_agents=n_agents,
                  malfunction_generator_and_process_data=malfunction_from_params(stochastic_data),
                  obs_builder_object=TreeObservation)

    # After training we want to render the results so we also load a renderer
    env_renderer = RenderTool(env, gl="PILSVG", )
    # Given the depth of the tree observation and the number of features per node we get the following state_size
    num_features_per_node = env.obs_builder.observation_dim
    tree_depth = 2
    nr_nodes = 0
    for i in range(tree_depth + 1):
        nr_nodes += np.power(4, i)
    state_size = num_features_per_node * nr_nodes

    # The action space of flatland is 5 discrete actions
    action_size = 5

    # We set the number of episodes we would like to train on
    if 'n_trials' not in locals():
        n_trials = 15000

    # And the max number of steps we want to take per episode
    max_steps = int(4 * 2 * (20 + env.height + env.width))

    # Define training parameters
    eps = 1.
    eps_end = 0.005
    eps_decay = 0.998

    # And some variables to keep track of the progress
    action_dict = dict()
    final_action_dict = dict()
    scores_window = deque(maxlen=100)
    done_window = deque(maxlen=100)
    scores = []
    dones_list = []
    action_prob = [0] * action_size
    agent_obs = [None] * env.get_num_agents()
    agent_next_obs = [None] * env.get_num_agents()
    agent_obs_buffer = [None] * env.get_num_agents()
    agent_action_buffer = [2] * env.get_num_agents()
    cummulated_reward = np.zeros(env.get_num_agents())
    update_values = [False] * env.get_num_agents()
    # Now we load a Double dueling DQN agent
    agent = Agent(state_size, action_size)

    for trials in range(1, n_trials + 1):

        # Reset environment
        obs, info = env.reset(True, True)
        env_renderer.reset()
        # Build agent specific observations
        for a in range(env.get_num_agents()):
            if obs[a]:
                agent_obs[a] = normalize_observation(obs[a], tree_depth, observation_radius=10)
                agent_obs_buffer[a] = agent_obs[a].copy()

        # Reset score and done
        score = 0
        env_done = 0

        # Run episode
        while True:
            # Action
            for a in range(env.get_num_agents()):
                if info['action_required'][a]:
                    # If an action is required, we want to store the obs at that step as well as the action
                    update_values[a] = True
                    action = agent.act(agent_obs[a], eps=eps)
                    action_prob[action] += 1
                else:
                    update_values[a] = False
                    action = 0
                action_dict.update({a: action})

            # Environment step
            next_obs, all_rewards, done, info = env.step(action_dict)
            # Update replay buffer and train agent
            for a in range(env.get_num_agents()):
                # Only update the values when we are done or when an action was taken and thus relevant information is present
                if update_values[a] or done[a]:
                    agent.step(agent_obs_buffer[a], agent_action_buffer[a], all_rewards[a],
                               agent_obs[a], done[a])
                    cummulated_reward[a] = 0.

                    agent_obs_buffer[a] = agent_obs[a].copy()
                    agent_action_buffer[a] = action_dict[a]
                if next_obs[a]:
                    agent_obs[a] = normalize_observation(next_obs[a], tree_depth, observation_radius=10)

                score += all_rewards[a] / env.get_num_agents()

            # Copy observation
            if done['__all__']:
                env_done = 1
                break

        # Epsilon decay
        eps = max(eps_end, eps_decay * eps)  # decrease epsilon

        # Collect information about training
        tasks_finished = 0
        for current_agent in env.agents:
            if current_agent.status == RailAgentStatus.DONE_REMOVED:
                tasks_finished += 1
        done_window.append(tasks_finished / max(1, env.get_num_agents()))
        scores_window.append(score / max_steps)  # save most recent score
        scores.append(np.mean(scores_window))
        dones_list.append((np.mean(done_window)))

        print(
            '\rTraining {} Agents on ({},{}).\t Episode {}\t Average Score: {:.3f}\tDones: {:.2f}%\tEpsilon: {:.2f} \t Action Probabilities: \t {}'.format(
                env.get_num_agents(), x_dim, y_dim,
                trials,
                np.mean(scores_window),
                100 * np.mean(done_window),
                eps, action_prob / np.sum(action_prob)), end=" ")

        if trials % 100 == 0:
            print(
                '\rTraining {} Agents on ({},{}).\t Episode {}\t Average Score: {:.3f}\tDones: {:.2f}%\tEpsilon: {:.2f} \t Action Probabilities: \t {}'.format(
                    env.get_num_agents(), x_dim, y_dim,
                    trials,
                    np.mean(scores_window),
                    100 * np.mean(done_window),
                    eps, action_prob / np.sum(action_prob)))
            torch.save(agent.qnetwork_local.state_dict(),
                       './Nets/navigator_checkpoint' + str(trials) + '.pth')
            action_prob = [1] * action_size

    # Plot overall training progress at the end
    plt.plot(scores)
    plt.show()
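For reference, the epsilon schedule above (eps_decay = 0.998 applied once per episode, floored at eps_end = 0.005) reaches its floor after roughly 2,650 episodes. A quick self-contained check:

eps, eps_end, eps_decay = 1.0, 0.005, 0.998
episodes_to_floor = 0
while eps > eps_end:
    eps = max(eps_end, eps_decay * eps)
    episodes_to_floor += 1
print(episodes_to_floor)  # ~2650 episodes until exploration bottoms out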
Example No. 16
def evaluate(n_episodes, rl_prio=True):
    agent = None
    if rl_prio:
        config, run = init_run()
        agent = get_agent(config, run)
        env = get_env(config, rl=True)
    else:
        env = get_env(rl=False)
    env_renderer = RenderTool(env, screen_width=8800)
    returns = []
    pcs = []
    malfs = []

    for _ in tqdm(range(n_episodes)):

        obs, _ = env.reset(regenerate_schedule=True, regenerate_rail=True)
        if RENDER:
            env_renderer.reset()
            env_renderer.render_env(show=True,
                                    frames=True,
                                    show_observations=False)

        if not obs:
            break

        steps = 0
        ep_return = 0
        done = defaultdict(lambda: False)
        robust_env = CprFlatlandGymEnv(rail_env=env,
                                       max_nr_active_agents=200,
                                       observation_space=None,
                                       priorizer=NrAgentsSameStart(),
                                       allow_noop=True)
        # if rl_prio:
        #     priorities = prio_agent.compute_actions(obs, explore=False)
        #     sorted_actions = {k: v for k, v in sorted(priorities.items(), key=lambda item: item[1], reverse=True)}
        #     sorted_handles = list(sorted_actions.keys())
        # else:
        sorted_handles = robust_env.priorizer.priorize(handles=list(
            obs.keys()),
                                                       rail_env=env)

        while not done['__all__']:
            actions = ShortestPathAgent().compute_actions(obs, env)
            robust_actions = robust_env.get_robust_actions(
                actions, sorted_handles)
            obs, all_rewards, done, info = env.step(robust_actions)
            if RENDER:
                env_renderer.render_env(show=True,
                                        frames=True,
                                        show_observations=False)
            print('.', end='', flush=True)
            steps += 1
            ep_return += np.sum(list(all_rewards.values()))

        pc = np.sum(np.array([1 for a in env.agents if is_done(a)
                              ])) / env.get_num_agents()
        print("EPISODE PC:", pc)
        n_episodes += 1
        pcs.append(pc)
        returns.append(ep_return /
                       (env._max_episode_steps * env.get_num_agents()))
        malfs.append(
            np.sum([a.malfunction_data['nr_malfunctions']
                    for a in env.agents]))
    return pcs, returns, malfs
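A hedged usage sketch for the function above (init_run, get_agent, get_env, RENDER and the rest come from the surrounding project and are assumed importable; only the averaging of the returned lists is new):

import numpy as np

pcs, returns, malfs = evaluate(n_episodes=10, rl_prio=False)
print('mean completion rate: {:.3f}'.format(np.mean(pcs)))
print('mean normalized return: {:.3f}'.format(np.mean(returns)))
print('mean malfunctions per episode: {:.1f}'.format(np.mean(malfs)))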
Example No. 17
def main(argv):
    try:
        opts, args = getopt.getopt(argv, "n:", ["n_episodes="])
    except getopt.GetoptError:
        print('single_agent_inference.py -n <n_episodes>')
        sys.exit(2)
    for opt, arg in opts:
        if opt in ('-n', '--n_episodes'):
            n_episodes = int(arg)

    random.seed(1)
    np.random.seed(1)

    # Preload an agent
    training = False

    # Initialize a random map with a random number of agents
    x_dim = np.random.randint(20, 40)
    y_dim = np.random.randint(20, 40)
    n_agents = 1  # np.random.randint(3, 8)
    n_goals = n_agents + np.random.randint(0, 3)
    min_dist = int(0.75 * min(x_dim, y_dim))
    tree_depth = 4

    # Use the malfunction generator to break agents from time to time
    stochastic_data = {
        'prop_malfunction': 0.0,  # Percentage of defective agents
        'malfunction_rate': 0,  # Rate of malfunction occurrence
        'min_duration': 0,  # Minimal duration of malfunction
        'max_duration': 0  # Max duration of malfunction
    }

    # Different agent types (trains) with different speeds.
    speed_ration_map = {
        1.: 0.25,  # Fast passenger train
        1. / 2.: 0.25,  # Fast freight train
        1. / 3.: 0.25,  # Slow commuter train
        1. / 4.: 0.25
    }  # Slow freight train

    # Get an observation builder and predictor
    observation_helper = TreeObsForRailEnv(
        max_depth=tree_depth, predictor=ShortestPathPredictorForRailEnv())

    env = RailEnv(
        width=x_dim,
        height=y_dim,
        rail_generator=sparse_rail_generator(
            max_num_cities=3,
            # Number of cities in map (where train stations are)
            seed=1,  # Random seed
            grid_mode=False,
            max_rails_between_cities=2,
            max_rails_in_city=3),
        schedule_generator=sparse_schedule_generator(speed_ration_map),
        number_of_agents=n_agents,
        stochastic_data=stochastic_data,  # Malfunction data generator
        obs_builder_object=observation_helper)
    env.reset(True, True)
    env_renderer = RenderTool(
        env,
        gl="PILSVG",
        agent_render_variant=AgentRenderVariant.AGENT_SHOWS_OPTIONS_AND_BOX,
        show_debug=False,
        screen_height=1000,  # Adjust these parameters to fit your resolution
        screen_width=1000)  # Adjust these parameters to fit your resolution

    handle = env.get_agent_handles()
    features_per_node = env.obs_builder.observation_dim
    nr_nodes = 0
    for i in range(tree_depth + 1):
        nr_nodes += np.power(4, i)
    state_size = features_per_node * nr_nodes
    action_size = 5

    # We set the number of episodes we would like to train on
    if 'n_episodes' not in locals():
        n_episodes = 1000

    # Set the max number of steps per episode as well as other training-relevant parameters
    max_steps = int(3 * (env.height + env.width))
    eps = 1.
    eps_end = 0.005
    eps_decay = 0.998
    action_dict = dict()
    final_action_dict = dict()
    scores_window = deque(maxlen=100)
    done_window = deque(maxlen=100)
    scores = []
    dones_list = []
    action_prob = [0] * action_size
    agent_obs = [None] * env.get_num_agents()
    agent_next_obs = [None] * env.get_num_agents()
    # Initialize the agent
    agent = Agent(state_size, action_size)

    # Here you can pre-load an agent
    with path(fc_treeobs.nets,
              "single_agent_navigation_checkpoint1000.pth") as file_in:
        agent.qnetwork_local.load_state_dict(torch.load(file_in))

    # Do training over n_episodes
    for episodes in range(1, n_episodes + 1):

        # Reset environment
        obs, info = env.reset(True, True)
        env_renderer.reset()

        # Build agent specific observations
        for a in range(env.get_num_agents()):
            data, distance, agent_data = split_tree_into_feature_groups(
                obs[a], tree_depth)
            data = norm_obs_clip(data)
            distance = norm_obs_clip(distance)
            agent_data = np.clip(agent_data, -1, 1)
            agent_obs[a] = obs[a] = np.concatenate((np.concatenate(
                (data, distance)), agent_data))

        # Run episode
        for step in range(max_steps):

            # Action
            for a in range(env.get_num_agents()):
                # action = agent.act(np.array(obs[a]), eps=eps)
                action = agent.act(agent_obs[a], eps=eps)
                action_prob[action] += 1
                action_dict.update({a: action})

            # Environment step
            next_obs, all_rewards, done, _ = env.step(action_dict)
            env_renderer.render_env(show=True,
                                    show_predictions=True,
                                    show_observations=False)

            if done['__all__']:
                break
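The state-size computation repeated across these snippets always follows the same formula. A small standalone helper (illustrative only; the real scripts read the per-node feature count from env.obs_builder.observation_dim):

def tree_obs_state_size(tree_depth: int, features_per_node: int,
                        n_time_steps: int = 1) -> int:
    # A tree observation of depth d has sum_{i=0}^{d} 4**i nodes,
    # each contributing features_per_node values, optionally stacked in time.
    nr_nodes = sum(4 ** i for i in range(tree_depth + 1))
    return n_time_steps * features_per_node * nr_nodes


# E.g. depth 4, 11 features per node (an example value) and two stacked
# time steps, as in the multi-agent example above: 2 * 11 * 341 = 7502.
print(tree_obs_state_size(4, 11, n_time_steps=2))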
Example No. 18
def test(args, T, ep, dqn, val_mem, metrics, results_dir, evaluate=False):

    # Init env and set in evaluation mode
    # Maps speeds to % of appearance in the env
    speed_ration_map = {
        1.: 0.25,  # Fast passenger train
        1. / 2.: 0.25,  # Fast freight train
        1. / 3.: 0.25,  # Slow commuter train
        1. / 4.: 0.25
    }  # Slow freight train

    schedule_generator = sparse_schedule_generator(speed_ration_map)

    observation_builder = GraphObsForRailEnv(
        predictor=ShortestPathPredictorForRailEnv(
            max_depth=args.prediction_depth))

    env = RailEnv(
        width=args.width,
        height=args.height,
        rail_generator=sparse_rail_generator(
            max_num_cities=args.max_num_cities,
            seed=ep,  # Use episode as seed when evaluation is performed during training
            grid_mode=args.grid_mode,
            max_rails_between_cities=args.max_rails_between_cities,
            max_rails_in_city=args.max_rails_in_city,
        ),
        schedule_generator=schedule_generator,
        number_of_agents=args.num_agents,
        obs_builder_object=observation_builder,
        malfunction_generator_and_process_data=malfunction_from_params(
            parameters={
                'malfunction_rate': args.malfunction_rate,
                'min_duration': args.min_duration,
                'max_duration': args.max_duration
            }),
    )

    if args.render:
        env_renderer = RenderTool(env,
                                  gl="PILSVG",
                                  agent_render_variant=AgentRenderVariant.
                                  AGENT_SHOWS_OPTIONS_AND_BOX,
                                  show_debug=True,
                                  screen_height=1080,
                                  screen_width=1920)

    #max_time_steps = env.compute_max_episode_steps(env.width, env.height)
    max_time_steps = 200  # TODO Debug
    # metrics['steps'].append(T)
    metrics['episodes'].append(ep)
    T_rewards = []  # List of episodes rewards
    T_Qs = []  # List of Q-value estimates over the validation memory
    T_num_done_agents = []  # List of number of done agents for each episode
    T_all_done = []  # If all agents completed in each episode
    network_action_dict = dict()
    railenv_action_dict = dict()
    qvalues = {}

    # Test performance over several episodes
    for ep in range(args.evaluation_episodes):
        # Reset info
        state, info = env.reset()
        reward_sum, all_done = 0, False  # reward_sum accumulates the total reward over the episode's steps
        num_done_agents = 0
        if args.render:
            env_renderer.reset()

        # Choose first action - decide whether agents enter the environment
        for a in range(env.get_num_agents()):
            action = np.random.choice((0, 2))
            railenv_action_dict.update({a: action})
        state, reward, done, info = env.step(railenv_action_dict)  # Env step
        reward_sum += sum(reward[a] for a in range(env.get_num_agents()))

        if args.render:
            env_renderer.render_env(show=True,
                                    show_observations=False,
                                    show_predictions=True)

        for step in range(max_time_steps - 1):
            # Choose actions
            for a in range(env.get_num_agents()):
                if info['action_required'][a]:
                    network_action = dqn.act(
                        state[a]
                    )  # Choose an action greedily (with noisy weights)
                    # network_action = 0
                    railenv_action = observation_builder.choose_railenv_action(
                        a, network_action)
                    qvalues.update({a: dqn.get_q_values(state[a])})
                else:
                    network_action = 0
                    railenv_action = 0
                    qvalues.update({a: [0, 0]})  # '0' if the Q-values weren't updated

                railenv_action_dict.update({a: railenv_action})
                network_action_dict.update({a: network_action})

            if args.debug:
                for a in range(env.get_num_agents()):
                    print('#########################################')
                    print('Info for agent {}'.format(a))
                    print('Occupancy, first layer: {}'.format(
                        state[a][:args.prediction_depth]))
                    print('Occupancy, second layer: {}'.format(
                        state[a][args.prediction_depth:args.prediction_depth *
                                 2]))
                    print('Forks: {}'.format(
                        state[a][args.prediction_depth *
                                 2:args.prediction_depth * 3]))
                    print('Target: {}'.format(
                        state[a][args.prediction_depth *
                                 3:args.prediction_depth * 4]))
                    print('Priority: {}'.format(
                        state[a][args.prediction_depth * 4]))
                    print('Max priority encountered: {}'.format(
                        state[a][args.prediction_depth * 4 + 1]))
                    print('Num malfunctioning agents (globally): {}'.format(
                        state[a][args.prediction_depth * 4 + 2]))
                    print('Num agents ready to depart (globally): {}'.format(
                        state[a][args.prediction_depth * 4 + 3]))
                    print('Status: {}'.format(info['status'][a]))
                    print('Position: {}'.format(env.agents[a].position))
                    print('Moving? {} at speed: {}'.format(
                        env.agents[a].moving, info['speed'][a]))
                    print('Action required? {}'.format(
                        info['action_required'][a]))
                    print('Network action: {}'.format(network_action_dict[a]))
                    print('Railenv action: {}'.format(railenv_action_dict[a]))
                    print('Q values: {}'.format(qvalues[a]))
                    # print('QValues: {}'.format(qvalues))
                    print('Rewards: {}'.format(reward[a]))

            # Breakpoint for debugging here
            state, reward, done, info = env.step(
                railenv_action_dict)  # Env step
            if args.render:
                env_renderer.render_env(show=True,
                                        show_observations=False,
                                        show_predictions=True)

            reward_sum += sum(reward[a] for a in range(env.get_num_agents()))

            if done['__all__']:
                all_done = True
                break
        # No need to close the renderer since env parameter sizes stay the same
        T_rewards.append(reward_sum)
        # Compute num of agents that reached their target
        for a in range(env.get_num_agents()):
            if done[a]:
                num_done_agents += 1
        T_num_done_agents.append(
            num_done_agents / env.get_num_agents())  # In proportion to total
        T_all_done.append(all_done)

    # Test Q-values over validation memory
    for state in val_mem:  # Iterate over valid states
        T_Qs.append(dqn.evaluate_q(state))
    if args.debug:
        print('T_Qs: {}'.format(T_Qs))  # These are Qs from a single agent TODO

    avg_done_agents = sum(T_num_done_agents) / len(
        T_num_done_agents
    )  # Average number of agents that reached their target
    avg_reward = sum(T_rewards) / len(T_rewards)
    avg_norm_reward = avg_reward / (max_time_steps / env.get_num_agents())

    # avg_reward, avg_Q = sum(T_rewards) / len(T_rewards), sum(T_Qs) / len(T_Qs)
    if not evaluate:
        # Save model parameters if improved
        if avg_done_agents > metrics['best_avg_done_agents']:
            metrics['best_avg_done_agents'] = avg_done_agents
            dqn.save(results_dir)

        # Append to results and save metrics
        metrics['rewards'].append(T_rewards)
        metrics['Qs'].append(T_Qs)
        torch.save(metrics, os.path.join(results_dir, 'metrics.pth'))

        # Plot HTML
        _plot_line(metrics['episodes'],
                   metrics['rewards'],
                   'Reward',
                   path=results_dir)  # Plot rewards in episodes
        _plot_line(metrics['episodes'], metrics['Qs'], 'Q', path=results_dir)

    # Return average number of done agents (in proportion) and average reward
    return avg_done_agents, avg_reward, avg_norm_reward
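test() mutates a metrics dictionary owned by the caller. Inferred from the usages above, a plausible initialization looks like this (the keys are taken from the code; the initial values are assumptions):

metrics = {
    'episodes': [],               # evaluation episode indices
    'rewards': [],                # per-evaluation lists of episode returns
    'Qs': [],                     # per-evaluation lists of Q-value estimates
    'best_avg_done_agents': 0.0,  # best completion rate seen so far
}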
Example No. 19
def main(argv):
    try:
        opts, args = getopt.getopt(argv, "n:", ["n_trials="])
    except getopt.GetoptError:
        print('test_navigation_single_agent.py -n <n_trials>')
        sys.exit(2)
    for opt, arg in opts:
        if opt in ('-n', '--n_trials'):
            n_trials = int(arg)

    random.seed(1)
    np.random.seed(1)

    # Parameters for the Environment
    x_dim = 35
    y_dim = 35
    n_agents = 1
    max_num_cities = 3
    max_rails_between_cities = 2
    max_rails_in_city = 3

    # We are training an Agent using the Tree Observation with depth 2
    #observation_builder = TreeObsForRailEnv(max_depth=2, predictor = ShortestPathPredictorForRailEnv(20))

    # Use the malfunction generator to break agents from time to time
    stochastic_data = {
        'malfunction_rate': 80,  # Rate of malfunction occurrence for a single agent
        'min_duration': 15,  # Minimal duration of malfunction
        'max_duration': 50  # Max duration of malfunction
    }

    # Custom observation builder
    tree_depth = 2
    TreeObservation = TreeObsForRailEnv(
        max_depth=tree_depth, predictor=ShortestPathPredictorForRailEnv(20))

    np.savetxt(fname=path.join('NetsTest', 'info.txt'),
               X=[
                   x_dim, y_dim, n_agents, max_num_cities,
                   max_rails_between_cities, max_rails_in_city, tree_depth
               ],
               delimiter=';')

    # Different agent types (trains) with different speeds.
    speed_ration_map = {
        1.: 1.,  # Fast passenger train
        1. / 2.: 0.0,  # Fast freight train
        1. / 3.: 0.0,  # Slow commuter train
        1. / 4.: 0.0
    }  # Slow freight train

    env = RailEnv(
        width=x_dim,
        height=y_dim,
        rail_generator=sparse_rail_generator(
            max_num_cities=max_num_cities,
            # Number of cities in map (where train stations are)
            seed=14,  # Random seed
            grid_mode=False,
            max_rails_between_cities=max_rails_between_cities,
            max_rails_in_city=max_rails_in_city),
        schedule_generator=sparse_schedule_generator(speed_ration_map),
        number_of_agents=n_agents,
        obs_builder_object=TreeObservation)
    env.reset()

    env_renderer = RenderTool(
        env,
        gl="PILSVG",
    )
    #env_renderer = RenderTool(env, gl="PILSVG",
    #                      agent_render_variant=AgentRenderVariant.AGENT_SHOWS_OPTIONS_AND_BOX,
    #                      show_debug=False,
    #                      screen_height=(1080*0.8),  # Adjust these parameters to fit your resolution
    #                      screen_width=(1920*0.8))
    num_features_per_node = env.obs_builder.observation_dim

    nr_nodes = 0
    for i in range(tree_depth + 1):
        nr_nodes += np.power(4, i)
    state_size = num_features_per_node * nr_nodes
    action_size = 5

    # We set the number of episodes we would like to train on
    if 'n_trials' not in locals():
        n_trials = 15000
    max_steps = int(3 * (env.height + env.width))
    eps = 1.
    eps_end = 0.005
    eps_decay = 0.9995

    # And some variables to keep track of the performance
    action_dict = dict()
    final_action_dict = dict()
    action_prob_list = []
    scores_window = deque(maxlen=100)
    done_window = deque(maxlen=100)
    scores = []
    scores_list = []
    dones_list_window = []
    dones_list = []
    action_prob = [0] * action_size
    agent_obs = [None] * env.get_num_agents()
    agent_next_obs = [None] * env.get_num_agents()  # Useless
    agent = RandomAgent(state_size, action_size)

    #agent.qnetwork_local.load_state_dict(torch.load(path.join('NetsTest' , 'navigator_checkpoint_glob10.pth')))
    #agent.qnetwork_local.load_state_dict(torch.load(path.join('NetsLoad' , 'navigator_checkpoint2100.pth')))

    record_images = False
    frame_step = 0

    for trials in range(1, n_trials + 1):

        # Reset environment
        obs, info = env.reset()  #(True, True)
        env_renderer.reset()
        # Build agent specific observations
        for a in range(env.get_num_agents()):
            agent_obs[a] = normalize_observation(
                obs[a], tree_depth, observation_radius=10)
        # Reset score and done
        score = 0
        env_done = 0

        # Run episode
        for step in range(max_steps):

            # Action
            for a in range(env.get_num_agents()):
                if info['action_required'][a]:
                    action = agent.act(agent_obs[a])  #, eps=0.)
                    action_prob[action] += 1

                else:
                    action = 0

                action_dict.update({a: action})
            # Environment step
            obs, all_rewards, done, deadlocks, info = env.step(action_dict)

            #env_renderer.render_env(show=True, show_predictions=True, show_observations=False)
            # Build agent specific observations and normalize
            for a in range(env.get_num_agents()):
                if obs[a]:
                    agent_obs[a] = normalize_observation(obs[a],
                                                         tree_depth,
                                                         observation_radius=10)

                score += all_rewards[a] / env.get_num_agents()

            if done['__all__']:
                break

        # Collect information about training
        tasks_finished = 0
        for _idx in range(env.get_num_agents()):
            if done[_idx] == 1:
                tasks_finished += 1
        done_window.append(tasks_finished / max(1, env.get_num_agents()))
        scores_window.append(score / max_steps)  # save most recent score
        scores.append(np.mean(scores_window))
        dones_list.append(tasks_finished / max(1, env.get_num_agents()))
        dones_list_window.append((np.mean(done_window)))
        scores_list.append(score / max_steps)

        action_prob_list.append(action_prob / np.sum(action_prob))

        print(
            '\rTesting {} Agents on ({},{}).\t Episode {}\t Score: {:.3f}\tDones: {:.2f}%\t Action Probabilities: \t {}'
            .format(env.get_num_agents(), x_dim, y_dim, trials,
                    score / max_steps,
                    100 * tasks_finished / max(1, env.get_num_agents()),
                    action_prob / np.sum(action_prob)),
            end=" ")

        if trials % 100 == 0:
            action_prob = [1] * action_size

        if trials % 50 == 0:

            #np.savetxt(fname=path.join('Nets' , 'scores_metric.txt'), X=scores)
            #np.savetxt(fname=path.join('Nets' , 'dones_metric.txt'), X=dones_list)
            np.savetxt(fname=path.join('NetsTest', 'test_metrics.csv'),
                       X=np.transpose(
                           np.asarray([
                               scores_list, scores, dones_list,
                               dones_list_window
                           ])),
                       delimiter=';',
                       newline='\n')
            np.savetxt(fname=path.join('NetsTest', 'test_action_prob.csv'),
                       X=np.asarray(action_prob_list),
                       delimiter=';',
                       newline='\n')
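A small read-back sketch for the metrics file written above (not part of the original script); the column order follows the np.transpose call: per-episode score, rolling mean score, per-episode done fraction, rolling mean done fraction:

import numpy as np
from os import path

metrics = np.loadtxt(path.join('NetsTest', 'test_metrics.csv'),
                     delimiter=';', ndmin=2)
episode_scores, mean_scores, episode_dones, mean_dones_window = metrics.T
print('last rolling mean score:', mean_scores[-1])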
Example No. 20
dones_list = []
action_prob = [0] * action_size
agent_obs = [None] * env.get_num_agents()
agent_next_obs = [None] * env.get_num_agents()
agent = Agent(state_size, action_size)
with path(torch_training.Nets, "navigator_checkpoint1000.pth") as file_in:
    agent.qnetwork_local.load_state_dict(torch.load(file_in))

record_images = False
frame_step = 0

for trials in range(1, n_trials + 1):

    # Reset environment
    obs, info = env.reset(True, True)
    env_renderer.reset()
    # Build agent specific observations
    for a in range(env.get_num_agents()):
        agent_obs[a] = normalize_observation(obs[a], tree_depth, observation_radius=10)
    # Reset score and done
    score = 0
    env_done = 0

    # Run episode
    for step in range(max_steps):

        # Action
        for a in range(env.get_num_agents()):
            if info['action_required'][a]:
                action = agent.act(agent_obs[a], eps=0.)
Example No. 21
def main(args):

    # Show options and values
    print(' ' * 26 + 'Options')
    for k, v in vars(args).items():
        print(' ' * 26 + k + ': ' + str(v))
    # Where to save models
    results_dir = os.path.join('results', args.model_id)
    if not os.path.exists(results_dir):
        os.makedirs(results_dir)

    rail_generator = sparse_rail_generator(
        max_num_cities=args.max_num_cities,
        seed=args.seed,
        grid_mode=args.grid_mode,
        max_rails_between_cities=args.max_rails_between_cities,
        max_rails_in_city=args.max_rails_in_city,
    )

    # Maps speeds to % of appearance in the env
    speed_ration_map = {1.: 1}  # Fast passenger train

    if args.multi_speed:
        speed_ration_map = {
            1.: 0.25,  # Fast passenger train
            1. / 2.: 0.25,  # Fast freight train
            1. / 3.: 0.25,  # Slow commuter train
            1. / 4.: 0.25
        }  # Slow freight train

    schedule_generator = sparse_schedule_generator(speed_ration_map)

    prediction_builder = ShortestPathPredictorForRailEnv(
        max_depth=args.prediction_depth)
    obs_builder = RailObsForRailEnv(predictor=prediction_builder)

    env = RailEnv(
        width=args.width,
        height=args.height,
        rail_generator=rail_generator,
        random_seed=0,
        schedule_generator=schedule_generator,
        number_of_agents=args.num_agents,
        obs_builder_object=obs_builder,
        malfunction_generator_and_process_data=malfunction_from_params(
            parameters={
                'malfunction_rate': args.malfunction_rate,
                'min_duration': args.min_duration,
                'max_duration': args.max_duration
            }))

    if args.render:
        env_renderer = RenderTool(env,
                                  agent_render_variant=AgentRenderVariant.
                                  AGENT_SHOWS_OPTIONS_AND_BOX,
                                  show_debug=True,
                                  screen_height=800,
                                  screen_width=800)

    if args.plot:
        writer = SummaryWriter(log_dir='runs/' + args.model_id)

    max_rails = 100  # TODO Must be a parameter of the env (estimated)
    # max_steps = env.compute_max_episode_steps(env.width, env.height)
    max_steps = 200

    preprocessor = ObsPreprocessor(max_rails, args.reorder_rails)

    dqn = DQNAgent(args, bitmap_height=max_rails * 3, action_space=2)

    if args.load_path:
        file = os.path.isfile(args.load_path)
        if file:
            dqn.qnetwork_local.load_state_dict(torch.load(args.load_path))
            print('WEIGHTS LOADED from: ', args.load_path)

    eps = args.start_eps
    railenv_action_dict = {}
    network_action_dict = {}
    # Metrics
    done_window = deque(
        maxlen=args.window_size)  # Env dones over last window_size episodes
    done_agents_window = deque(
        maxlen=args.window_size)  # Fraction of done agents over last ...
    reward_window = deque(
        maxlen=args.window_size
    )  # Cumulative rewards over last window_size episodes
    norm_reward_window = deque(
        maxlen=args.window_size
    )  # Normalized cum. rewards over last window_size episodes
    # Track means over windows of window_size episodes
    mean_dones = []
    mean_agent_dones = []
    mean_rewards = []
    mean_norm_rewards = []
    # Episode rewards/dones/norm rewards since beginning of training TODO
    #env_dones = []

    crash = [False] * args.num_agents
    update_values = [False] * args.num_agents
    buffer_obs = [[] for _ in range(args.num_agents)]

    ############ Main loop
    for ep in range(args.num_episodes):
        cumulative_reward = 0
        env_done = 0
        altmaps = [None] * args.num_agents
        altpaths = [[] for _ in range(args.num_agents)]
        buffer_rew = [0] * args.num_agents
        buffer_done = [False] * args.num_agents
        curr_obs = [None] * args.num_agents

        maps, info = env.reset()
        if args.print:
            debug.print_bitmaps(maps)

        if args.render:
            env_renderer.reset()

        for step in range(max_steps - 1):
            # Save a copy of maps at the beginning
            buffer_maps = maps.copy()
            # reminder: the first bit is 0 while the agent has not departed
            for a in range(env.get_num_agents()):
                agent = env.agents[a]
                crash[a] = False
                update_values[a] = False
                network_action = None
                action = None

                # If the agent has arrived
                if agent.status == RailAgentStatus.DONE or agent.status == RailAgentStatus.DONE_REMOVED:
                    # TODO if agent !removed you should leave a bit in the bitmap
                    # TODO? set bitmap only the first time
                    maps[a, :, :] = 0
                    network_action = 0
                    action = RailEnvActions.DO_NOTHING

                # If the agent has not departed yet
                elif agent.status == RailAgentStatus.READY_TO_DEPART:
                    update_values[a] = True
                    obs = preprocessor.get_obs(a, maps[a], buffer_maps)
                    curr_obs[a] = obs.copy()

                    # Network chooses action
                    q_values = dqn.act(obs).cpu().data.numpy()
                    if np.random.random() > eps:
                        network_action = np.argmax(q_values)
                    else:
                        network_action = np.random.choice([0, 1])

                    if network_action == 0:
                        action = RailEnvActions.DO_NOTHING
                    else:  # Go
                        crash[a] = obs_builder.check_crash(a, maps)

                        if crash[a]:
                            network_action = 0
                            action = RailEnvActions.STOP_MOVING
                        else:
                            maps = obs_builder.update_bitmaps(a, maps)
                            action = obs_builder.get_agent_action(a)

                # If the agent is entering a switch
                elif obs_builder.is_before_switch(
                        a) and info['action_required'][a]:
                    # Recompute the altpaths cache if it is empty or if it was
                    # built from a position other than the agent's current one
                    if len(
                            altpaths[a]
                    ) == 0 or agent.position != altpaths[a][0][0].position:
                        altmaps[a], altpaths[a] = obs_builder.get_altmaps(a)

                    if len(altmaps[a]) > 0:
                        update_values[a] = True
                        altobs = [None] * len(altmaps[a])
                        q_values = np.array([])
                        for i in range(len(altmaps[a])):
                            altobs[i] = preprocessor.get_obs(
                                a, altmaps[a][i], buffer_maps)
                            q_values = np.concatenate([
                                q_values,
                                dqn.act(altobs[i]).cpu().data.numpy()
                            ])

                        # Epsilon-greedy action selection
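                        # q_values is laid out as [alt0_stop, alt0_go, alt1_stop, alt1_go, ...]
                        # (two Q-values per alternative map), so argmax % 2 recovers the
                        # stop/go choice and argmax // 2 the index of the chosen alternative.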
                        if np.random.random() > eps:
                            argmax = np.argmax(q_values)
                            network_action = argmax % 2
                            best_i = argmax // 2
                        else:
                            network_action = np.random.choice([0, 1])
                            best_i = np.random.choice(
                                np.arange(len(altmaps[a])))

                        # Use new bitmaps and paths
                        maps[a, :, :] = altmaps[a][best_i]
                        obs_builder.set_agent_path(a, altpaths[a][best_i])
                        curr_obs[a] = altobs[best_i].copy()

                    else:
                        print('[ERROR] NO ALTPATHS EP: {} STEP: {} AGENT: {}'.
                              format(ep, step, a))
                        network_action = 0

                    if network_action == 0:
                        action = RailEnvActions.STOP_MOVING
                    else:
                        crash[a] = obs_builder.check_crash(
                            a, maps, is_before_switch=True)

                        if crash[a]:
                            network_action = 0
                            action = RailEnvActions.STOP_MOVING
                        else:
                            action = obs_builder.get_agent_action(a)
                            maps = obs_builder.update_bitmaps(
                                a, maps, is_before_switch=True)

                # If the agent is following a rail
                elif info['action_required'][a]:
                    crash[a] = obs_builder.check_crash(a, maps)

                    if crash[a]:
                        network_action = 0
                        action = RailEnvActions.STOP_MOVING
                    else:
                        network_action = 1
                        action = obs_builder.get_agent_action(a)
                        maps = obs_builder.update_bitmaps(a, maps)

                else:  # not action_required
                    network_action = 1
                    action = RailEnvActions.DO_NOTHING
                    maps = obs_builder.update_bitmaps(a, maps)

                network_action_dict.update({a: network_action})
                railenv_action_dict.update({a: action})

            # Obs is computed from bitmaps while state is computed from env step (temporarily)
            _, reward, done, info = env.step(railenv_action_dict)  # Env step

            if args.render:
                env_renderer.render_env(show=True,
                                        show_observations=False,
                                        show_predictions=True)

            if args.debug:
                for a in range(env.get_num_agents()):
                    print('#########################################')
                    print('Info for agent {}'.format(a))
                    print('Status: {}'.format(info['status'][a]))
                    print('Position: {}'.format(env.agents[a].position))
                    print('Target: {}'.format(env.agents[a].target))
                    print('Moving? {} at speed: {}'.format(
                        env.agents[a].moving, info['speed'][a]))
                    print('Action required? {}'.format(
                        info['action_required'][a]))
                    print('Network action: {}'.format(network_action_dict[a]))
                    print('Railenv action: {}'.format(railenv_action_dict[a]))
            # Update replay buffer and train agent
            if args.train:
                for a in range(env.get_num_agents()):
                    if args.crash_penalty and crash[a]:
                        # Store bad experience
                        dqn.step(curr_obs[a], 1, -100, curr_obs[a], True)

                    if not args.switch2switch:
                        if update_values[a] and not buffer_done[a]:
                            next_obs = preprocessor.get_obs(a, maps[a], maps)
                            dqn.step(curr_obs[a], network_action_dict[a],
                                     reward[a], next_obs, done[a])

                    else:
                        if update_values[a] and not buffer_done[a]:
                            # If I had an obs from a previous switch
                            if len(buffer_obs[a]) != 0:
                                dqn.step(buffer_obs[a], 1, buffer_rew[a],
                                         curr_obs[a], done[a])
                                buffer_obs[a] = []
                                buffer_rew[a] = 0

                            if network_action_dict[a] == 0:
                                dqn.step(curr_obs[a], 1, reward[a],
                                         curr_obs[a], False)
                            elif network_action_dict[a] == 1:
                                # I store the obs and update at the next switch
                                buffer_obs[a] = curr_obs[a].copy()

                        # Cache reward only if we have an obs from a prev switch
                        if len(buffer_obs[a]) != 0:
                            buffer_rew[a] += reward[a]

                    # Now update the done cache to avoid adding experience many times
                    buffer_done[a] = done[a]

            for a in range(env.get_num_agents()):
                cumulative_reward += reward[a]  # / env.get_num_agents()  # Update cumulative reward (not normalized)

            # TODO? env sets done['__all__'] = True for everyone when the time limit is reached
            # devid: I also remember this, but while debugging it doesn't seem to happen
            if done['__all__']:
                env_done = 1
                break

        ################### End of the episode
        eps = max(args.end_eps, args.eps_decay * eps)  # Decrease epsilon
        # Metrics
        done_window.append(env_done)  # Save done in this episode

        num_agents_done = 0  # Num of agents that reached their target in the last episode
        for a in range(env.get_num_agents()):
            if done[a]:
                num_agents_done += 1
        done_agents_window.append(num_agents_done / env.get_num_agents())
        reward_window.append(
            cumulative_reward)  # Save cumulative reward in this episode
        normalized_reward = cumulative_reward / (env.compute_max_episode_steps(
            env.width, env.height) + env.get_num_agents())
        norm_reward_window.append(normalized_reward)

        mean_dones.append((np.mean(done_window)))
        mean_agent_dones.append((np.mean(done_agents_window)))
        mean_rewards.append(np.mean(reward_window))
        mean_norm_rewards.append(np.mean(norm_reward_window))

        # Print training results info
        print(
            '\r{} Agents on ({},{}). Episode: {}\t Mean done agents: {:.2f}\t Mean reward: {:.2f}\t Mean normalized reward: {:.2f}\t Done agents in last episode: {:.2f}%\t Epsilon: {:.2f}'
            .format(
                env.get_num_agents(),
                args.width,
                args.height,
                ep,
                mean_agent_dones[-1],  # Fraction of done agents
                mean_rewards[-1],
                mean_norm_rewards[-1],
                (num_agents_done / args.num_agents),
                eps),
            end=" ")

        if ep != 0 and (ep + 1) % args.checkpoint_interval == 0:
            print(
                '\r{} Agents on ({},{}). Episode: {}\t Mean done agents: {:.2f}\t Mean reward: {:.2f}\t Mean normalized reward: {:.2f}\t Epsilon: {:.2f}'
                .format(env.get_num_agents(), args.width, args.height, ep,
                        mean_agent_dones[-1], mean_rewards[-1],
                        mean_norm_rewards[-1], eps))

        if args.train and ep != 0 and (ep + 1) % args.save_interval == 0:
            torch.save(dqn.qnetwork_local.state_dict(),
                       results_dir + '/weights.pt')

        if args.plot:
            writer.add_scalar('mean_agent_dones', mean_agent_dones[-1], ep)
            writer.add_scalar('mean_rewards', mean_rewards[-1], ep)
            writer.add_scalar('mean_dones', mean_dones[-1], ep)
            writer.add_scalar('mean_norm_rewards', mean_norm_rewards[-1], ep)
            writer.add_scalar('epsilon', eps, ep)