Example #1
def train(
    scenario_info,
    num_episodes,
    policy_classes,
    max_episode_steps,
    max_steps,
    eval_info,
    timestep_sec,
    headless,
    seed,
    log_dir,
    policy_ids=None,
):
    torch.set_num_threads(1)
    total_step = 0
    finished = False
    evaluation_task_ids = dict()

    agent_ids, agent_classes, agent_specs, agents, etag = build_agents(
        policy_classes, policy_ids, max_episode_steps)
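    # (build_agents is assumed to be a module-level helper that performs the same
    # agent ID / spec / agent construction shown inline in Example #3 below.)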

    # Create the environment.
    env = gym.make(
        "ultra.env:ultra-v0",
        agent_specs=agent_specs,
        scenario_info=scenario_info,
        headless=headless,
        timestep_sec=timestep_sec,
        seed=seed,
    )

    for episode in episodes(num_episodes, etag=etag, log_dir=log_dir):

        # Reset the environment and retrieve the initial observations.
        observations = env.reset()
        dones = {"__all__": False}
        infos = None
        episode.reset()

        experiment_dir = episode.experiment_dir
        # Name of agent metadata pickle file
        filename = "agent_metadata.pkl"
        if not os.path.exists(os.path.join(experiment_dir, filename)):
            _save_agent_metadata(
                experiment_dir,
                filename,
                agent_ids,
                agent_classes,
                agent_specs,
            )

        evaluation_check(
            agents=agents,
            agent_ids=agent_ids,
            policy_classes=agent_classes,
            episode=episode,
            log_dir=log_dir,
            max_episode_steps=max_episode_steps,
            evaluation_task_ids=evaluation_task_ids,
            **eval_info,
            **env.info,
        )

        collect_evaluations(evaluation_task_ids=evaluation_task_ids)

        while not dones["__all__"]:
            # Break if any agent's step count has reached max_steps (default 1000000).
            if any([
                    episode.get_itr(agent_id) >= max_steps
                    for agent_id in agents
            ]):
                finished = True
                break
            # Request and perform actions on each agent that received an observation.
            actions = {
                agent_id: agents[agent_id].act(observation, explore=True)
                for agent_id, observation in observations.items()
            }
            next_observations, rewards, dones, infos = env.step(actions)

            # Active agents are those that receive observations in this step and the next
            # step. Step each active agent (obtaining their network loss if applicable).
            active_agent_ids = observations.keys() & next_observations.keys()
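            # Note: dict .keys() views are set-like, so "&" above intersects the
            # agent IDs present in both observation dicts.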
            loss_outputs = {
                agent_id: agents[agent_id].step(
                    state=observations[agent_id],
                    action=actions[agent_id],
                    reward=rewards[agent_id],
                    next_state=next_observations[agent_id],
                    done=dones[agent_id],
                    info=infos[agent_id],
                )
                for agent_id in active_agent_ids
            }

            # Record the data from this episode.
            episode.record_step(
                agent_ids_to_record=active_agent_ids,
                infos=infos,
                rewards=rewards,
                total_step=total_step,
                loss_outputs=loss_outputs,
            )

            # Update variables for the next step.
            total_step += 1
            observations = next_observations

        episode.record_episode()
        episode.record_tensorboard(recording_step=episode.index)

        if finished:
            break

    # Wait on the remaining evaluations to finish.
    while collect_evaluations(evaluation_task_ids):
        time.sleep(0.1)

    env.close()
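
A minimal sketch of how this train() function might be invoked. Every argument value below is an illustrative placeholder, and the eval_info keys are an assumption mirroring the explicit evaluation_check() arguments in Example #2.

# Hypothetical invocation of train(); all values are placeholders, not
# defaults taken from the ULTRA codebase.
if __name__ == "__main__":
    train(
        scenario_info=("1", "easy"),  # assumed (task, level) pair
        num_episodes=100,
        policy_classes=["ultra.baselines.sac:sac-v0"],
        max_episode_steps=200,
        max_steps=1000000,
        eval_info={"eval_rate": 10, "eval_episodes": 2},  # assumed keys
        timestep_sec=0.1,
        headless=True,
        seed=2,
        log_dir="logs/",
    )
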
Example #2
def run_experiment(scenario_info, num_agents, log_dir, headless=True, seed=2):
    agent_ids = [
        "0" * max(0, 3 - len(str(i))) + str(i) for i in range(num_agents)
    ]
    agent_classes = {
        agent_id: "ultra.baselines.sac:sac-v0"
        for agent_id in agent_ids
    }
    agent_specs = {
        agent_id: BaselineAgentSpec(policy_class=SACPolicy,
                                    max_episode_steps=2)
        for agent_id in agent_ids
    }

    env = gym.make(
        "ultra.env:ultra-v0",
        agent_specs=agent_specs,
        scenario_info=scenario_info,
        headless=headless,
        timestep_sec=0.1,
        seed=seed,
    )

    agents = {
        agent_id: agent_spec.build_agent()
        for agent_id, agent_spec in agent_specs.items()
    }

    total_step = 0
    etag = ":".join([
        policy_class.split(":")[-1]
        for policy_class in agent_classes.values()
    ])
    evaluation_task_ids = dict()

    for episode in episodes(1, etag=etag, log_dir=log_dir):
        observations = env.reset()
        dones = {"__all__": False}
        infos = None
        episode.reset()
        experiment_dir = episode.experiment_dir

        if not os.path.exists(f"{experiment_dir}/agent_metadata.pkl"):
            if not os.path.exists(experiment_dir):
                os.makedirs(experiment_dir)
            with open(f"{experiment_dir}/agent_metadata.pkl",
                      "wb") as metadata_file:
                dill.dump(
                    {
                        "agent_ids": agent_ids,
                        "agent_classes": agent_classes,
                        "agent_specs": agent_specs,
                    },
                    metadata_file,
                    pickle.HIGHEST_PROTOCOL,
                )

        while not dones["__all__"]:
            evaluation_check(
                agents=agents,
                agent_ids=agent_ids,
                episode=episode,
                eval_rate=10,
                eval_episodes=1,
                max_episode_steps=2,
                policy_classes=agent_classes,
                scenario_info=scenario_info,
                evaluation_task_ids=evaluation_task_ids,
                timestep_sec=0.1,
                headless=True,
                log_dir=log_dir,
            )
            collect_evaluations(evaluation_task_ids=evaluation_task_ids)

            actions = {
                agent_id: agents[agent_id].act(observation, explore=True)
                for agent_id, observation in observations.items()
            }
            next_observations, rewards, dones, infos = env.step(actions)

            active_agent_ids = observations.keys() & next_observations.keys()
            loss_outputs = {
                agent_id: agents[agent_id].step(
                    state=observations[agent_id],
                    action=actions[agent_id],
                    reward=rewards[agent_id],
                    next_state=next_observations[agent_id],
                    done=dones[agent_id],
                    info=infos[agent_id],
                )
                for agent_id in active_agent_ids
            }

            episode.record_step(
                agent_ids_to_record=active_agent_ids,
                infos=infos,
                rewards=rewards,
                total_step=total_step,
                loss_outputs=loss_outputs,
            )

            total_step += 1
            observations = next_observations

    # Wait on the remaining evaluations to finish.
    while collect_evaluations(evaluation_task_ids):
        time.sleep(0.1)

    env.close()
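
The agent_metadata.pkl file written above can be read back with dill. A minimal sketch, using a placeholder path:

# Minimal sketch: loading the metadata saved by run_experiment().
# "logs/experiment/agent_metadata.pkl" is a placeholder path.
import dill

with open("logs/experiment/agent_metadata.pkl", "rb") as metadata_file:
    metadata = dill.load(metadata_file)

print(metadata["agent_ids"])      # e.g. ["000", "001", ...]
print(metadata["agent_classes"])  # agent ID -> policy locator string
print(metadata["agent_specs"])    # agent ID -> agent specification
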
Example #3
def train(
    scenario_info,
    num_episodes,
    policy_classes,
    max_episode_steps,
    eval_info,
    timestep_sec,
    headless,
    seed,
    log_dir,
    policy_ids=None,
):
    torch.set_num_threads(1)
    total_step = 0
    finished = False
    evaluation_task_ids = dict()

    # Make agent_ids in the form of 000, 001, ..., 010, 011, ..., 999, 1000, ...;
    # or use the provided policy_ids if available.
    agent_ids = ([
        "0" * max(0, 3 - len(str(i))) + str(i)
        for i in range(len(policy_classes))
    ] if not policy_ids else policy_ids)
    # Ensure there is an ID for each policy, and a policy for each ID.
    assert len(agent_ids) == len(policy_classes), (
        "The number of agent IDs provided ({}) must be equal to "
        "the number of policy classes provided ({}).".format(
            len(agent_ids), len(policy_classes)))

    # Assign the policy classes to their associated ID.
    agent_classes = {
        agent_id: policy_class
        for agent_id, policy_class in zip(agent_ids, policy_classes)
    }
    # Create the agent specifications matched with their associated ID.
    agent_specs = {
        agent_id: make(locator=policy_class,
                       max_episode_steps=max_episode_steps)
        for agent_id, policy_class in agent_classes.items()
    }
    # Create the agents matched with their associated ID.
    agents = {
        agent_id: agent_spec.build_agent()
        for agent_id, agent_spec in agent_specs.items()
    }

    # Create the environment.
    env = gym.make(
        "ultra.env:ultra-v0",
        agent_specs=agent_specs,
        scenario_info=scenario_info,
        headless=headless,
        timestep_sec=timestep_sec,
        seed=seed,
    )

    # Define an 'etag' for this experiment's data directory based on policy_classes.
    # E.g. a policy_classes list of ["ultra.baselines.dqn:dqn-v0", "ultra.baselines.ppo:ppo-v0"]
    # is transformed into an etag of "dqn-v0:ppo-v0".
    etag = ":".join(
        [policy_class.split(":")[-1] for policy_class in policy_classes])

    for episode in episodes(num_episodes, etag=etag, log_dir=log_dir):

        # Reset the environment and retrieve the initial observations.
        observations = env.reset()
        dones = {"__all__": False}
        infos = None
        episode.reset()
        experiment_dir = episode.experiment_dir

        # Save relevant agent metadata.
        if not os.path.exists(f"{experiment_dir}/agent_metadata.pkl"):
            if not os.path.exists(experiment_dir):
                os.makedirs(experiment_dir)
            with open(f"{experiment_dir}/agent_metadata.pkl",
                      "wb") as metadata_file:
                dill.dump(
                    {
                        "agent_ids": agent_ids,
                        "agent_classes": agent_classes,
                        "agent_specs": agent_specs,
                    },
                    metadata_file,
                    pickle.HIGHEST_PROTOCOL,
                )

        evaluation_check(
            agents=agents,
            agent_ids=agent_ids,
            policy_classes=agent_classes,
            episode=episode,
            log_dir=log_dir,
            max_episode_steps=max_episode_steps,
            evaluation_task_ids=evaluation_task_ids,
            **eval_info,
            **env.info,
        )

        collect_evaluations(evaluation_task_ids=evaluation_task_ids)

        while not dones["__all__"]:
            # Break if any agent's step count has reached 1000000.
            if any(
                [episode.get_itr(agent_id) >= 1000000 for agent_id in agents]):
                finished = True
                break
            # Request and perform actions on each agent that received an observation.
            actions = {
                agent_id: agents[agent_id].act(observation, explore=True)
                for agent_id, observation in observations.items()
            }
            next_observations, rewards, dones, infos = env.step(actions)

            # Active agents are those that receive observations in this step and the next
            # step. Step each active agent (obtaining their network loss if applicable).
            active_agent_ids = observations.keys() & next_observations.keys()
            loss_outputs = {
                agent_id: agents[agent_id].step(
                    state=observations[agent_id],
                    action=actions[agent_id],
                    reward=rewards[agent_id],
                    next_state=next_observations[agent_id],
                    done=dones[agent_id],
                    info=infos[agent_id],
                )
                for agent_id in active_agent_ids
            }

            # Record the data from this episode.
            episode.record_step(
                agent_ids_to_record=active_agent_ids,
                infos=infos,
                rewards=rewards,
                total_step=total_step,
                loss_outputs=loss_outputs,
            )

            # Update variables for the next step.
            total_step += 1
            observations = next_observations

        episode.record_episode()
        episode.record_tensorboard()

        if finished:
            break

    # Wait on the remaining evaluations to finish.
    while collect_evaluations(evaluation_task_ids):
        time.sleep(0.1)

    env.close()
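
A note on the agent ID construction used in Examples #2 and #3: the expression "0" * max(0, 3 - len(str(i))) + str(i) left-pads the index to at least three digits ("000", "001", ..., "999", "1000", ...). For non-negative integers it is equivalent to str(i).zfill(3) or an f-string width specifier, as this small sketch shows.

# Equivalent ways to build zero-padded agent IDs.
for i in (0, 7, 42, 999, 1000):
    padded = "0" * max(0, 3 - len(str(i))) + str(i)
    assert padded == str(i).zfill(3) == f"{i:03d}"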