Example #1
def wrap_adv_noise_ball(env_name, our_idx, multi_venv, adv_noise_params,
                        deterministic):
    adv_noise_agent_val = adv_noise_params["noise_val"]
    base_policy_path = adv_noise_params["base_path"]
    base_policy_type = adv_noise_params["base_type"]
    base_policy = load_policy(
        policy_path=base_policy_path,
        policy_type=base_policy_type,
        env=multi_venv,
        env_name=env_name,
        index=our_idx,
    )

    # Scale the base agent's action bounds by noise_val: this becomes the
    # epsilon-ball action space available to the noise agent.
    base_action_space = multi_venv.action_space.spaces[our_idx]
    adv_noise_action_space = Box(
        low=adv_noise_agent_val * base_action_space.low,
        high=adv_noise_agent_val * base_action_space.high,
    )
    multi_venv = MergeAgentVecEnv(
        venv=multi_venv,
        policy=base_policy,
        replace_action_space=adv_noise_action_space,
        merge_agent_idx=our_idx,
        deterministic=deterministic,
    )
    return multi_venv
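
A hypothetical usage sketch for wrap_adv_noise_ball above; the environment name, noise radius, and zoo policy path are placeholders, and multi_venv is assumed to be an already-constructed two-agent vec env:

# Restrict the agent at index 0 to an epsilon-ball of 10% of the base action
# range around a pretrained zoo policy's actions (placeholder values).
adv_noise_params = {"noise_val": 0.1, "base_path": "1", "base_type": "zoo"}
multi_venv = wrap_adv_noise_ball("multicomp/SumoHumans-v0",
                                 our_idx=0,
                                 multi_venv=multi_venv,
                                 adv_noise_params=adv_noise_params,
                                 deterministic=False)
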
@contextmanager
def create_multi_agent_curried_policy_wrapper(mon_dir,
                                              env_name,
                                              num_envs,
                                              embed_index,
                                              max_steps,
                                              state_shape=None,
                                              add_zoo=False,
                                              num_zoo=5):
    def episode_limit(env):
        return time_limit.TimeLimit(env, max_episode_steps=max_steps)

    def env_fn(i):
        return make_env(env_name,
                        seed=42,
                        i=i,
                        out_dir=mon_dir,
                        pre_wrappers=[episode_limit])

    # functools.partial binds i immediately; a bare lambda would capture the loop
    # variable by reference and give every env the same index.
    vec_env = make_dummy_vec_multi_env(
        [functools.partial(env_fn, i) for i in range(num_envs)])

    zoo = load_policy(
        policy_path="1",
        policy_type="zoo",
        env=vec_env,
        env_name=env_name,
        index=1 - embed_index,
        transparent_params=None,
    )
    half_env = FakeSingleSpacesVec(vec_env, agent_id=embed_index)
    policies = [
        _get_constant_policy(half_env,
                             constant_value=half_env.action_space.sample(),
                             state_shape=state_shape) for _ in range(10)
    ]
    if add_zoo:
        policies += [zoo] * num_zoo

    policy_wrapper = MultiPolicyWrapper(policies=policies, num_envs=num_envs)

    vec_env = CurryVecEnv(venv=vec_env,
                          policy=policy_wrapper,
                          agent_idx=embed_index,
                          deterministic=False)
    vec_env = FlattenSingletonVecEnv(vec_env)

    yield vec_env, policy_wrapper, zoo
    policy_wrapper.close()
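
A sketch of how the helper above might be used, assuming the @contextmanager decorator shown above (the yield-then-close pattern implies it); the monitor directory and environment name are placeholders:

# Build the curried venv, take a reset, and let MultiPolicyWrapper.close()
# run when the with-block exits (placeholder arguments).
with create_multi_agent_curried_policy_wrapper("/tmp/mon",
                                               "multicomp/SumoAnts-v0",
                                               num_envs=2,
                                               embed_index=0,
                                               max_steps=100) as (venv, wrapper, zoo):
    obs = venv.reset()
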
    def __init__(
        self,
        venv,
        env_name,
        use_debug,
        victim_index,
        victim_path,
        victim_type,
        transparent_params,
        lb_mul,
        lb_num,
        lb_path,
        lb_type,
    ):
        super().__init__(venv)
        self.lb_num = lb_num
        self.lb_mul = lb_mul
        if transparent_params is None:
            raise ValueError(
                "LookbackRewardVecWrapper assumes transparent policies and venvs."
            )
        self.transparent_params = transparent_params
        self.victim_index = victim_index

        self._policy = load_policy(
            lb_type,
            lb_path,
            self.venv.unwrapped,
            env_name,
            1 - victim_index,
            transparent_params=None,
        )
        self._action = None
        self._obs = None
        self._state = None
        self._new_lb_state = None
        self._dones = [False] * self.num_envs
        self.ep_lens = np.zeros(self.num_envs).astype(int)
        self.lb_tuples = self._create_lb_tuples(env_name, use_debug,
                                                victim_index, victim_path,
                                                victim_type)
        self.use_debug = use_debug
        if self.use_debug:
            # Create a debug file for this venv and one for each lookback venv, in order.
            self.debug_files = [
                open(f"debug{i}.pkl", "wb") for i in range(self.lb_num + 1)
            ]
            self.get_debug_venv().set_debug_file(self.debug_files[0])
    def _create_lb_tuples(self, env_name, use_debug, victim_index, victim_path,
                          victim_type):
        """Create lookback data structures which are used to compare our episode rollouts against
        those of an environment where a lookback base policy acted instead.

        params victim_index, victim_path, victim_type are the same as in policy_loader.load_policy
        :param use_debug (bool): Use DummyVecEnv instead of SubprocVecEnv
        :return: (list<LookbackTuple>) lb_tuples
        """
        def env_fn(i):
            return make_env(env_name,
                            0,
                            i,
                            out_dir="data/lookbacks/",
                            pre_wrappers=[OldMujocoResettableWrapper])

        lb_tuples = []
        for _ in range(self.lb_num):
            make_vec_env = make_dummy_vec_multi_env if use_debug else make_subproc_vec_multi_env
            multi_venv = make_vec_env(
                [functools.partial(env_fn, i) for i in range(self.num_envs)])
            if use_debug:
                multi_venv = DebugVenv(multi_venv)

            victim = load_policy(
                policy_path=victim_path,
                policy_type=victim_type,
                env=multi_venv,
                env_name=env_name,
                index=victim_index,
                transparent_params=self.transparent_params,
            )

            multi_venv = TransparentCurryVecEnv(venv=multi_venv,
                                                policy=victim,
                                                agent_idx=victim_index,
                                                deterministic=True)

            single_venv = FlattenSingletonVecEnv(multi_venv)
            data_dict = {
                "state": None,
                "action": None,
                "info": defaultdict(dict)
            }
            lb_tuples.append(LookbackTuple(venv=single_venv, data=data_dict))
        return lb_tuples
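
A hypothetical construction of the LookbackRewardVecWrapper whose __init__ and _create_lb_tuples methods appear above; every path, the environment name, and the transparent_params value are placeholders, and single_venv is assumed to be a transparent single-agent vec env built elsewhere:

# Placeholder construction; argument order follows the __init__ above.
lb_venv = LookbackRewardVecWrapper(
    venv=single_venv,
    env_name="multicomp/YouShallNotPassHumans-v0",
    use_debug=False,
    victim_index=0,
    victim_path="1",
    victim_type="zoo",
    transparent_params={"ff_policy": True},  # placeholder transparency keys
    lb_mul=0.1,
    lb_num=1,
    lb_path="1",
    lb_type="zoo",
)
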
Example #5
def maybe_embed_agent(
    multi_venv,
    our_idx,
    scheduler,
    log_callbacks,
    env_name,
    embed_types,
    embed_paths,
    embed_index,
    embed_noise,
    embed_noise_params,
    adv_noise_params,
    transparent_params,
    lookback_params,
):
    if len(embed_types) > 0:
        deterministic = lookback_params is not None
        # If we are actually training an epsilon-ball noise agent on top of a zoo agent
        if adv_noise_params["noise_val"] is not None:
            multi_venv = wrap_adv_noise_ball(
                env_name,
                our_idx,
                multi_venv,
                adv_noise_params=adv_noise_params,
                deterministic=deterministic,
            )
        embedded_policies = []
        # If we're loading multiple embedded agents
        for embed_type, embed_path in zip(embed_types, embed_paths):
            embedded_policies.append(
                load_policy(
                    policy_path=embed_path,
                    policy_type=embed_type,
                    env=multi_venv,
                    env_name=env_name,
                    index=embed_index,
                    transparent_params=transparent_params,
                ))

        if embed_noise:
            for i in range(len(embedded_policies)):
                embedded = apply_embedded_agent_wrapper(
                    embedded=embedded_policies[i],
                    noise_params=embed_noise_params,
                    scheduler=scheduler,
                )
                log_callbacks.append(LoggerOnlyLogCallback(embedded))
                embedded_policies[i] = embedded

        if len(embedded_policies) > 1:
            embedded_policy = MultiPolicyWrapper(embedded_policies,
                                                 num_envs=multi_venv.num_envs)
        else:
            embedded_policy = embedded_policies[0]

        # Curry the embedded agent
        cls = TransparentCurryVecEnv if transparent_params is not None else CurryVecEnv
        multi_venv = cls(
            venv=multi_venv,
            policy=embedded_policy,
            agent_idx=embed_index,
            deterministic=deterministic,
        )
    return multi_venv
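
A hypothetical call to maybe_embed_agent above, embedding a single fixed zoo opponent with no noise, no epsilon-ball adversary, and no lookback; the environment name and policy path are placeholders:

# Embed one zoo opponent at index 1; scheduler and log_callbacks are unused
# here because embed_noise is False (placeholder values).
multi_venv = maybe_embed_agent(
    multi_venv,
    our_idx=0,
    scheduler=None,
    log_callbacks=[],
    env_name="multicomp/KickAndDefend-v0",
    embed_types=["zoo"],
    embed_paths=["1"],
    embed_index=1,
    embed_noise=False,
    embed_noise_params=None,
    adv_noise_params={"noise_val": None},
    transparent_params=None,
    lookback_params=None,
)
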
def score_agent(
    _run,
    _seed,
    env_name,
    agent_a_path,
    agent_b_path,
    agent_a_type,
    agent_b_type,
    record_traj,
    record_traj_params,
    transparent_params,
    num_env,
    videos,
    video_params,
    mask_agent_index,
    noisy_agent_index,
    noisy_agent_magnitude,
    mask_agent_noise,
):
    save_dir = video_params["save_dir"]
    if videos:
        if save_dir is None:
            score_ex_logger.info(
                "No directory provided for saving videos; using a tmpdir instead."
                " Videos will still be saved to the Sacred run directory.")
            tmp_dir = tempfile.TemporaryDirectory(prefix="score-videos")
            save_dir = tmp_dir.name
        else:
            tmp_dir = None
        video_dirs = [osp.join(save_dir, str(i)) for i in range(num_env)]

    agent_wrappers = {}
    if mask_agent_index is not None:
        mask_agent_kwargs = {}
        if mask_agent_noise is not None:
            mask_agent_kwargs["noise_magnitude"] = mask_agent_noise

        agent_wrappers = make_mask_agent_wrappers(env_name, mask_agent_index,
                                                  **mask_agent_kwargs)

    video_params = utils.sacred_copy(video_params)  # Sacred issue #499

    def env_fn(i):
        env = make_env(env_name, _seed, i, None, agent_wrappers=agent_wrappers)
        if videos:
            if video_params["annotated"]:
                if "multicomp" in env_name:
                    assert num_env == 1, "pretty videos requires num_env=1"
                    env = AnnotatedGymCompete(
                        env,
                        env_name,
                        agent_a_type,
                        agent_a_path,
                        agent_b_type,
                        agent_b_path,
                        mask_agent_index,
                        **video_params["annotation_params"],
                    )
                else:
                    warnings.warn(
                        f"Annotated videos not supported for environment '{env_name}'"
                    )
            env = VideoWrapper(env, video_dirs[i], video_params["single_file"])
        return env

    env_fns = [functools.partial(env_fn, i) for i in range(num_env)]

    if num_env > 1:
        venv = make_subproc_vec_multi_env(env_fns)
    else:
        venv = make_dummy_vec_multi_env(env_fns)

    if record_traj:
        venv = TrajectoryRecorder(venv, record_traj_params["agent_indices"])

    if venv.num_agents == 1 and agent_b_path != "none":
        raise ValueError(
            "Set agent_b_path to 'none' if environment only uses one agent.")

    agent_paths = [agent_a_path, agent_b_path]
    agent_types = [agent_a_type, agent_b_type]
    zipped = list(zip(agent_types, agent_paths))
    agents = [
        load_policy(policy_type, policy_path, venv, env_name, i,
                    transparent_params)
        for i, (policy_type,
                policy_path) in enumerate(zipped[:venv.num_agents])
    ]

    if noisy_agent_index is not None:
        agents[noisy_agent_index] = NoisyAgentWrapper(
            agents[noisy_agent_index],
            noise_annealer=lambda: noisy_agent_magnitude)

    score = get_empirical_score(venv, agents)

    for agent in agents:
        if agent.sess is not None:
            agent.sess.close()

    if record_traj:
        save_paths = venv.save(save_dir=record_traj_params["save_dir"])
        for save_path in save_paths:
            score_ex.add_artifact(save_path, name="victim_activations.npz")

    venv.close()

    if videos:
        for env_video_dir in video_dirs:
            added = False
            for file_path in os.listdir(env_video_dir):
                added |= _save_video_or_metadata(env_video_dir, file_path)
            if not added:
                raise FileNotFoundError(
                    f"No video artifacts found in path {env_video_dir}.")

        if tmp_dir is not None:
            tmp_dir.cleanup()

    for observer in score_ex.observers:
        if hasattr(observer, "dir"):
            _clean_video_directory_structure(observer)

    return score
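
A hypothetical way to invoke score_agent through the Sacred experiment object score_ex referenced above, assuming score_agent is the experiment's main command; the config keys mirror its parameters and the agent paths are placeholders:

# Run the scoring experiment with a partial config; unlisted keys fall back
# to the experiment's defaults (placeholder values).
run = score_ex.run(config_updates={
    "env_name": "multicomp/SumoHumans-v0",
    "agent_a_type": "zoo", "agent_a_path": "1",
    "agent_b_type": "zoo", "agent_b_path": "2",
    "num_env": 1,
    "videos": False,
})
print(run.result)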