Example 1
def run_experiment():
    total_step = 0
    agent, env, spec = prepare_test_env_agent(headless=True)
    timestep_sec = env.timestep_sec
    policy_class = "ultra.baselines.sac:sac-v0"
    log_dir = "tests/output_eval_check_logs"

    for episode in episodes(1, etag=policy_class, log_dir=log_dir):
        observations = env.reset()
        state = observations[AGENT_ID]
        dones, infos = {"__all__": False}, None
        episode.reset()
        experiment_dir = episode.experiment_dir

        if not os.path.exists(f"{experiment_dir}/spec.pkl"):
            if not os.path.exists(experiment_dir):
                os.makedirs(experiment_dir)
            with open(f"{experiment_dir}/spec.pkl", "wb") as spec_output:
                dill.dump(spec, spec_output, pickle.HIGHEST_PROTOCOL)

        while not dones["__all__"]:
            evaluation_check(
                agent=agent,
                agent_id=AGENT_ID,
                episode=episode,
                eval_rate=10,
                eval_episodes=1,
                max_episode_steps=2,
                policy_class=policy_class,
                scenario_info=("00", "eval_test"),
                timestep_sec=0.1,
                headless=True,
                log_dir=log_dir,
            )
            action = agent.act(state, explore=True)
            observations, rewards, dones, infos = env.step({AGENT_ID: action})
            next_state = observations[AGENT_ID]

            # Retrieve some relevant information from the reward processor.
            # observations[AGENT_ID]["ego"].update(rewards[AGENT_ID]["log"])
            loss_output = agent.step(
                state=state,
                action=action,
                reward=rewards[AGENT_ID],
                next_state=next_state,
                done=dones[AGENT_ID],
            )
            episode.record_step(
                agent_id=AGENT_ID,
                infos=infos,
                rewards=rewards,
                total_step=total_step,
                loss_output=loss_output,
            )
            total_step += 1
            state = next_state

    env.close()
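
The prepare_test_env_agent helper used above is not shown in this listing. A minimal sketch of what it might do, assuming it mirrors the single-agent setup from Example 3; the agent ID, scenario_info, timestep_sec, and seed values here are illustrative assumptions, not the test suite's actual helper:

def prepare_test_env_agent(headless=True):
    # Illustrative sketch only: mirrors the SAC baseline setup in Example 3.
    agent_id = "007"  # assumed agent ID
    policy_class = "ultra.baselines.sac:sac-v0"
    spec = make(locator=policy_class, max_episode_steps=2)
    env = gym.make(
        "ultra.env:ultra-v0",
        agent_specs={agent_id: spec},
        scenario_info=("00", "eval_test"),  # assumed (task, level) pair
        headless=headless,
        timestep_sec=0.1,
        seed=2,  # assumed seed
    )
    agent = spec.build_agent()
    return agent, env, spec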
Example 2
def train(
    scenario_info,
    num_episodes,
    policy_classes,
    max_episode_steps,
    eval_info,
    timestep_sec,
    headless,
    seed,
    log_dir,
    policy_ids=None,
):
    torch.set_num_threads(1)
    total_step = 0
    finished = False

    # Make agent_ids in the form of 000, 001, ..., 010, 011, ..., 999, 1000, ...;
    # or use the provided policy_ids if available.
    agent_ids = (
        ["0" * max(0, 3 - len(str(i))) + str(i) for i in range(len(policy_classes))]
        if not policy_ids
        else policy_ids
    )
    # Ensure there is an ID for each policy, and a policy for each ID.
    assert len(agent_ids) == len(policy_classes), (
        "The number of agent IDs provided ({}) must be equal to "
        "the number of policy classes provided ({}).".format(
            len(agent_ids), len(policy_classes)
        )
    )

    # Assign the policy classes to their associated ID.
    agent_classes = {
        agent_id: policy_class
        for agent_id, policy_class in zip(agent_ids, policy_classes)
    }
    # Create the agent specifications matched with their associated ID.
    agent_specs = {
        agent_id: make(locator=policy_class, max_episode_steps=max_episode_steps)
        for agent_id, policy_class in agent_classes.items()
    }
    # Create the agents matched with their associated ID.
    agents = {
        agent_id: agent_spec.build_agent()
        for agent_id, agent_spec in agent_specs.items()
    }

    # Create the environment.
    env = gym.make(
        "ultra.env:ultra-v0",
        agent_specs=agent_specs,
        scenario_info=scenario_info,
        headless=headless,
        timestep_sec=timestep_sec,
        seed=seed,
    )

    # Define an 'etag' for this experiment's data directory based on policy_classes.
    # E.g. a policy_classes list of ["ultra.baselines.dqn:dqn-v0", "ultra.baselines.ppo:ppo-v0"]
    # is transformed into an etag of "dqn-v0:ppo-v0".
    etag = ":".join([policy_class.split(":")[-1] for policy_class in policy_classes])

    for episode in episodes(num_episodes, etag=etag, log_dir=log_dir):
        # Reset the environment and retrieve the initial observations.
        observations = env.reset()
        dones = {"__all__": False}
        infos = None
        episode.reset()
        experiment_dir = episode.experiment_dir

        # Save relevant agent metadata.
        if not os.path.exists(f"{experiment_dir}/agent_metadata.pkl"):
            if not os.path.exists(experiment_dir):
                os.makedirs(experiment_dir)
            with open(f"{experiment_dir}/agent_metadata.pkl", "wb") as metadata_file:
                dill.dump(
                    {
                        "agent_ids": agent_ids,
                        "agent_classes": agent_classes,
                        "agent_specs": agent_specs,
                    },
                    metadata_file,
                    pickle.HIGHEST_PROTOCOL,
                )

        while not dones["__all__"]:
            # Break if any agent's step count is 1000000 or greater.
            if any([episode.get_itr(agent_id) >= 1000000 for agent_id in agents]):
                finished = True
                break

            # Perform the evaluation check.
            evaluation_check(
                agents=agents,
                agent_ids=agent_ids,
                policy_classes=agent_classes,
                episode=episode,
                log_dir=log_dir,
                max_episode_steps=max_episode_steps,
                **eval_info,
                **env.info,
            )

            # Request and perform actions on each agent that received an observation.
            actions = {
                agent_id: agents[agent_id].act(observation, explore=True)
                for agent_id, observation in observations.items()
            }
            next_observations, rewards, dones, infos = env.step(actions)

            # Active agents are those that receive observations in this step and the next
            # step. Step each active agent (obtaining their network loss if applicable).
            active_agent_ids = observations.keys() & next_observations.keys()
            loss_outputs = {
                agent_id: agents[agent_id].step(
                    state=observations[agent_id],
                    action=actions[agent_id],
                    reward=rewards[agent_id],
                    next_state=next_observations[agent_id],
                    done=dones[agent_id],
                    info=infos[agent_id],
                )
                for agent_id in active_agent_ids
            }

            # Record the data from this episode.
            episode.record_step(
                agent_ids_to_record=active_agent_ids,
                infos=infos,
                rewards=rewards,
                total_step=total_step,
                loss_outputs=loss_outputs,
            )

            # Update variables for the next step.
            total_step += 1
            observations = next_observations

        # Normalize the data and record this episode on tensorboard.
        episode.record_episode()
        episode.record_tensorboard()

        if finished:
            break

    env.close()
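
For reference, a hypothetical call to this train function. The scenario_info tuple, the episode and step counts, and the contents of eval_info are illustrative assumptions; judging from Example 1, where evaluation_check receives eval_rate and eval_episodes explicitly, eval_info plausibly carries those two keys:

train(
    scenario_info=("00", "easy"),  # assumed (task, level) pair
    num_episodes=100,
    policy_classes=["ultra.baselines.dqn:dqn-v0", "ultra.baselines.ppo:ppo-v0"],
    max_episode_steps=1200,
    eval_info={"eval_rate": 10, "eval_episodes": 2},  # assumed keys
    timestep_sec=0.1,
    headless=True,
    seed=2,
    log_dir="logs",
)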
Example 3
def train(
    scenario_info,
    num_episodes,
    max_episode_steps,
    policy_class,
    eval_info,
    timestep_sec,
    headless,
    seed,
    log_dir,
):
    torch.set_num_threads(1)
    total_step = 0
    finished = False

    AGENT_ID = "007"

    spec = make(locator=policy_class, max_episode_steps=max_episode_steps)
    env = gym.make(
        "ultra.env:ultra-v0",
        agent_specs={AGENT_ID: spec},
        scenario_info=scenario_info,
        headless=headless,
        timestep_sec=timestep_sec,
        seed=seed,
    )

    agent = spec.build_agent()

    for episode in episodes(num_episodes, etag=policy_class, log_dir=log_dir):
        observations = env.reset()
        state = observations[AGENT_ID]
        dones, infos = {"__all__": False}, None
        episode.reset()
        experiment_dir = episode.experiment_dir

        # Save the entire spec [policy_params, reward_adapter, observation_adapter].
        if not os.path.exists(f"{experiment_dir}/spec.pkl"):
            if not os.path.exists(experiment_dir):
                os.makedirs(experiment_dir)
            with open(f"{experiment_dir}/spec.pkl", "wb") as spec_output:
                dill.dump(spec, spec_output, pickle.HIGHEST_PROTOCOL)

        while not dones["__all__"]:
            if episode.get_itr(AGENT_ID) >= 1000000:
                finished = True
                break
            evaluation_check(
                agent=agent,
                agent_id=AGENT_ID,
                policy_class=policy_class,
                episode=episode,
                log_dir=log_dir,
                max_episode_steps=max_episode_steps,
                **eval_info,
                **env.info,
            )
            action = agent.act(state, explore=True)
            observations, rewards, dones, infos = env.step({AGENT_ID: action})
            next_state = observations[AGENT_ID]

            loss_output = agent.step(
                state=state,
                action=action,
                reward=rewards[AGENT_ID],
                next_state=next_state,
                done=dones[AGENT_ID],
            )
            episode.record_step(
                agent_id=AGENT_ID,
                infos=infos,
                rewards=rewards,
                total_step=total_step,
                loss_output=loss_output,
            )
            total_step += 1
            state = next_state

        episode.record_episode()
        episode.record_tensorboard(agent_id=AGENT_ID)
        if finished:
            break

    env.close()
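
The spec pickle written in the episode loop above can later be read back (for example, by an evaluation script) with the matching dill call; a minimal sketch, assuming experiment_dir points at the directory the episode produced:

with open(f"{experiment_dir}/spec.pkl", "rb") as spec_input:
    spec = dill.load(spec_input)
agent = spec.build_agent()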
Example 4
def run_experiment(scenario_info, num_agents, log_dir, headless=True, seed=2):
    agent_ids = [
        "0" * max(0, 3 - len(str(i))) + str(i) for i in range(num_agents)
    ]
    agent_classes = {
        agent_id: "ultra.baselines.sac:sac-v0"
        for agent_id in agent_ids
    }
    agent_specs = {
        agent_id: BaselineAgentSpec(policy_class=SACPolicy,
                                    max_episode_steps=2)
        for agent_id in agent_ids
    }

    env = gym.make(
        "ultra.env:ultra-v0",
        agent_specs=agent_specs,
        scenario_info=scenario_info,
        headless=headless,
        timestep_sec=0.1,
        seed=seed,
    )

    agents = {
        agent_id: agent_spec.build_agent()
        for agent_id, agent_spec in agent_specs.items()
    }

    total_step = 0
    etag = ":".join(
        [policy_class.split(":")[-1] for policy_class in agent_classes])
    evaluation_task_ids = dict()

    for episode in episodes(1, etag=etag, log_dir=log_dir):
        observations = env.reset()
        dones = {"__all__": False}
        infos = None
        episode.reset()
        experiment_dir = episode.experiment_dir

        if not os.path.exists(f"{experiment_dir}/agent_metadata.pkl"):
            if not os.path.exists(experiment_dir):
                os.makedirs(experiment_dir)
            with open(f"{experiment_dir}/agent_metadata.pkl",
                      "wb") as metadata_file:
                dill.dump(
                    {
                        "agent_ids": agent_ids,
                        "agent_classes": agent_classes,
                        "agent_specs": agent_specs,
                    },
                    metadata_file,
                    pickle.HIGHEST_PROTOCOL,
                )

        while not dones["__all__"]:
            evaluation_check(
                agents=agents,
                agent_ids=agent_ids,
                episode=episode,
                eval_rate=10,
                eval_episodes=1,
                max_episode_steps=2,
                policy_classes=agent_classes,
                scenario_info=scenario_info,
                evaluation_task_ids=evaluation_task_ids,
                timestep_sec=0.1,
                headless=True,
                log_dir=log_dir,
            )
            collect_evaluations(evaluation_task_ids=evaluation_task_ids)

            actions = {
                agent_id: agents[agent_id].act(observation, explore=True)
                for agent_id, observation in observations.items()
            }
            next_observations, rewards, dones, infos = env.step(actions)

            active_agent_ids = observations.keys() & next_observations.keys()
            loss_outputs = {
                agent_id: agents[agent_id].step(
                    state=observations[agent_id],
                    action=actions[agent_id],
                    reward=rewards[agent_id],
                    next_state=next_observations[agent_id],
                    done=dones[agent_id],
                    info=infos[agent_id],
                )
                for agent_id in active_agent_ids
            }

            episode.record_step(
                agent_ids_to_record=active_agent_ids,
                infos=infos,
                rewards=rewards,
                total_step=total_step,
                loss_outputs=loss_outputs,
            )

            total_step += 1
            observations = next_observations

    # Wait on the remaining evaluations to finish.
    while collect_evaluations(evaluation_task_ids):
        time.sleep(0.1)

    env.close()
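
A hypothetical invocation of this test helper; the scenario_info value is an assumption, and the log directory is reused from Example 1:

run_experiment(
    scenario_info=("00", "eval_test"),  # assumed (task, level) pair
    num_agents=2,
    log_dir="tests/output_eval_check_logs",
    headless=True,
)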
Example 5
def train(
    scenario_info,
    num_episodes,
    policy_classes,
    max_episode_steps,
    max_steps,
    eval_info,
    timestep_sec,
    headless,
    seed,
    log_dir,
    policy_ids=None,
):
    torch.set_num_threads(1)
    total_step = 0
    finished = False
    evaluation_task_ids = dict()

    agent_ids, agent_classes, agent_specs, agents, etag = build_agents(
        policy_classes, policy_ids, max_episode_steps)

    # Create the environment.
    env = gym.make(
        "ultra.env:ultra-v0",
        agent_specs=agent_specs,
        scenario_info=scenario_info,
        headless=headless,
        timestep_sec=timestep_sec,
        seed=seed,
    )

    for episode in episodes(num_episodes, etag=etag, log_dir=log_dir):

        # Reset the environment and retrieve the initial observations.
        observations = env.reset()
        dones = {"__all__": False}
        infos = None
        episode.reset()

        experiment_dir = episode.experiment_dir
        # Name of agent metadata pickle file
        filename = "agent_metadata.pkl"
        if not os.path.exists(os.path.join(experiment_dir, filename)):
            _save_agent_metadata(
                experiment_dir,
                filename,
                agent_ids,
                agent_classes,
                agent_specs,
            )

        evaluation_check(
            agents=agents,
            agent_ids=agent_ids,
            policy_classes=agent_classes,
            episode=episode,
            log_dir=log_dir,
            max_episode_steps=max_episode_steps,
            evaluation_task_ids=evaluation_task_ids,
            **eval_info,
            **env.info,
        )

        collect_evaluations(evaluation_task_ids=evaluation_task_ids)

        while not dones["__all__"]:
            # Break if any agent's step count is max_steps (default 1000000) or greater.
            if any([episode.get_itr(agent_id) >= max_steps for agent_id in agents]):
                finished = True
                break
            # Request and perform actions on each agent that received an observation.
            actions = {
                agent_id: agents[agent_id].act(observation, explore=True)
                for agent_id, observation in observations.items()
            }
            next_observations, rewards, dones, infos = env.step(actions)

            # Active agents are those that receive observations in this step and the next
            # step. Step each active agent (obtaining their network loss if applicable).
            active_agent_ids = observations.keys() & next_observations.keys()
            loss_outputs = {
                agent_id: agents[agent_id].step(
                    state=observations[agent_id],
                    action=actions[agent_id],
                    reward=rewards[agent_id],
                    next_state=next_observations[agent_id],
                    done=dones[agent_id],
                    info=infos[agent_id],
                )
                for agent_id in active_agent_ids
            }

            # Record the data from this episode.
            episode.record_step(
                agent_ids_to_record=active_agent_ids,
                infos=infos,
                rewards=rewards,
                total_step=total_step,
                loss_outputs=loss_outputs,
            )

            # Update variables for the next step.
            total_step += 1
            observations = next_observations

        episode.record_episode()
        episode.record_tensorboard(recording_step=episode.index)

        if finished:
            break

    # Wait on the remaining evaluations to finish.
    while collect_evaluations(evaluation_task_ids):
        time.sleep(0.1)

    env.close()
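
Example 5 factors the agent construction and metadata saving into build_agents and _save_agent_metadata helpers that are not shown here. A plausible sketch of _save_agent_metadata, assuming it simply wraps the inline pickling code from Examples 2 and 4 (build_agents would likewise package the agent-ID, spec, agent, and etag construction from Example 2):

def _save_agent_metadata(experiment_dir, filename, agent_ids, agent_classes, agent_specs):
    # Assumed helper: mirrors the inline metadata-dumping code in Examples 2 and 4.
    if not os.path.exists(experiment_dir):
        os.makedirs(experiment_dir)
    with open(os.path.join(experiment_dir, filename), "wb") as metadata_file:
        dill.dump(
            {
                "agent_ids": agent_ids,
                "agent_classes": agent_classes,
                "agent_specs": agent_specs,
            },
            metadata_file,
            pickle.HIGHEST_PROTOCOL,
        )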