Example 1
    def create_policy(
        self,
        policy_id: PolicyID,
        policy_cls: Type["Policy"],
        observation_space: gym.Space,
        action_space: gym.Space,
        config_override: PartialAlgorithmConfigDict,
        merged_config: AlgorithmConfigDict,
    ) -> None:
        """Creates a new policy and stores it to the cache.

        Args:
            policy_id: The policy ID. This is the key under which
                the created policy will be stored in this map.
            policy_cls: The (original) policy class to use.
                This may still be altered in case tf-eager (and tracing)
                is used.
            observation_space: The observation space of the
                policy.
            action_space: The action space of the policy.
            config_override: The config override
                dict for this policy. This is the partial dict provided by
                the user.
            merged_config: The entire config (merged
                default config + `config_override`).
        """
        _class = get_tf_eager_cls_if_necessary(policy_cls, merged_config)

        self[policy_id] = create_policy_for_framework(
            policy_id,
            _class,
            merged_config,
            observation_space,
            action_space,
            self.worker_index,
            self.session_creator,
            self.seed,
        )

        # Store the spec (class, obs-space, act-space, and config overrides)
        # so that the map can re-create on-the-fly added policies, e.g. when
        # restoring from disk.
        self.policy_specs[policy_id] = PolicySpec(
            policy_class=policy_cls,
            observation_space=observation_space,
            action_space=action_space,
            config=config_override,
        )
Example 2
    def add_policy(self, policy_id: PolicyID, policy_spec: PolicySpec):
        # Merge the policies config overrides with the main config.
        # Also, adjust `num_gpus` (to indicate an individual policy's
        # num_gpus, not the total number of GPUs).
        cfg = Algorithm.merge_trainer_configs(
            self.config,
            dict(policy_spec.config, **{"num_gpus": self.num_gpus_per_policy}),
        )

        # Need to create the replay actor first. Then add the first policy.
        if self.replay_actor is None:
            return self._add_replay_buffer_and_policy(policy_id, policy_spec,
                                                      cfg)

        # Replay actor already exists -> Just add a new policy here.

        assert len(self.policy_actors) < self.max_num_policies

        actual_policy_class = get_tf_eager_cls_if_necessary(
            policy_spec.policy_class, cfg)

        colocated = create_colocated_actors(
            actor_specs=[(
                ray.remote(
                    num_cpus=1,
                    num_gpus=self.num_gpus_per_policy
                    if not cfg["_fake_gpus"] else 0,
                )(actual_policy_class),
                # Policy c'tor args.
                (policy_spec.observation_space, policy_spec.action_space, cfg),
                # Policy c'tor kwargs={}.
                {},
                # Count=1,
                1,
            )],
            # Force co-locate on the already existing replay actor's node.
            node=ray.get(self.replay_actor.get_host.remote()),
        )

        self.policy_actors[policy_id] = colocated[0][0]

        return self.policy_actors[policy_id]
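
A short note on the config merge at the top of `add_policy`: `dict(policy_spec.config, **{"num_gpus": ...})` copies the per-policy override dict with `num_gpus` forced to the per-policy value, and that result is then layered onto the main config. The plain-Python snippet below only illustrates that override order; the flat-dict merge at the end is a rough stand-in for what `Algorithm.merge_trainer_configs` does, not its actual (deep-merging) implementation.

# Per-policy override as provided by the user.
policy_overrides = {"lr": 5e-4, "num_gpus": 4}

# dict(a, **b): copy of `a` with keys from `b` taking precedence.
per_policy_cfg = dict(policy_overrides, **{"num_gpus": 0.25})
assert per_policy_cfg == {"lr": 5e-4, "num_gpus": 0.25}

# The main algorithm config then gets these overrides layered on top.
main_config = {"lr": 1e-3, "gamma": 0.99, "num_gpus": 1}
merged = {**main_config, **per_policy_cfg}
assert merged == {"lr": 5e-4, "gamma": 0.99, "num_gpus": 0.25}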
Example 3
    def _add_replay_buffer_and_policy(
        self,
        policy_id: PolicyID,
        policy_spec: PolicySpec,
        config: AlgorithmConfigDict,
    ):
        assert self.replay_actor is None
        assert len(self.policy_actors) == 0

        actual_policy_class = get_tf_eager_cls_if_necessary(
            policy_spec.policy_class, config)

        colocated = create_colocated_actors(
            actor_specs=[
                (self.replay_actor_class, self.replay_actor_args, {}, 1),
            ] + [(
                ray.remote(
                    num_cpus=1,
                    num_gpus=self.num_gpus_per_policy
                    if not config["_fake_gpus"] else 0,
                )(actual_policy_class),
                # Policy c'tor args.
                (policy_spec.observation_space, policy_spec.action_space,
                 config),
                # Policy c'tor kwargs={}.
                {},
                # Count=1,
                1,
            )],
            node=None,
        )  # node=None -> let the helper choose which node to co-locate on.

        self.replay_actor = colocated[0][0]
        self.policy_actors[policy_id] = colocated[1][0]
        self.has_replay_buffer = True

        return self.policy_actors[policy_id]
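
The indexing at the end (`colocated[0][0]`, `colocated[1][0]`) suggests that `create_colocated_actors` returns one group per entry in `actor_specs`, each holding `count` handles. The sketch below illustrates the `(class, args, kwargs, count)` spec format and that assumed return layout with plain Python objects; `fake_create_colocated_actors`, `FakeReplayBuffer`, and `FakePolicy` are invented stand-ins and do not use Ray.

def fake_create_colocated_actors(actor_specs, node=None):
    # Stand-in: instantiate `count` objects per (cls, args, kwargs, count)
    # spec and return them grouped in the same order as `actor_specs`.
    # The real helper would place Ray actors together on one node instead.
    return [
        [cls(*args, **kwargs) for _ in range(count)]
        for cls, args, kwargs, count in actor_specs
    ]


class FakeReplayBuffer:
    def __init__(self, capacity):
        self.capacity = capacity


class FakePolicy:
    def __init__(self, obs_space, act_space, config):
        self.config = config


groups = fake_create_colocated_actors(
    actor_specs=[
        (FakeReplayBuffer, (50_000,), {}, 1),
        (FakePolicy, (None, None, {"lr": 1e-4}), {}, 1),
    ],
    node=None,
)
replay = groups[0][0]  # first spec, first (and only) instance
policy = groups[1][0]  # second spec, first instance
assert isinstance(replay, FakeReplayBuffer) and isinstance(policy, FakePolicy)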
Example 4
    def create_policy(self, policy_id: PolicyID, policy_cls: Type["Policy"],
                      observation_space: gym.Space, action_space: gym.Space,
                      config_override: PartialTrainerConfigDict,
                      merged_config: TrainerConfigDict) -> None:
        """Creates a new policy and stores it to the cache.

        Args:
            policy_id: The policy ID. This is the key under which
                the created policy will be stored in this map.
            policy_cls: The (original) policy class to use.
                This may still be altered in case tf-eager (and tracing)
                is used.
            observation_space: The observation space of the
                policy.
            action_space: The action space of the policy.
            config_override: The config override
                dict for this policy. This is the partial dict provided by
                the user.
            merged_config: The entire config (merged
                default config + `config_override`).
        """
        framework = merged_config.get("framework", "tf")
        class_ = get_tf_eager_cls_if_necessary(policy_cls, merged_config)

        # Tf.
        if framework in ["tf2", "tf", "tfe"]:
            var_scope = policy_id + (
                ("_wk" + str(self.worker_index)) if self.worker_index else "")

            # For tf static graph, build every policy in its own graph
            # and create a new session for it.
            if framework == "tf":
                with tf1.Graph().as_default():
                    if self.session_creator:
                        sess = self.session_creator()
                    else:
                        sess = tf1.Session(config=tf1.ConfigProto(
                            gpu_options=tf1.GPUOptions(allow_growth=True)))
                    with sess.as_default():
                        # Set graph-level seed.
                        if self.seed is not None:
                            tf1.set_random_seed(self.seed)
                        with tf1.variable_scope(var_scope):
                            self[policy_id] = class_(observation_space,
                                                     action_space,
                                                     merged_config)
            # For tf-eager: no graph, no session.
            else:
                with tf1.variable_scope(var_scope):
                    self[policy_id] = class_(
                        observation_space, action_space, merged_config)
        # Non-tf: No graph, no session.
        else:
            class_ = policy_cls
            self[policy_id] = class_(observation_space, action_space,
                                     merged_config)

        # Store the spec (class, obs-space, act-space, and config overrides)
        # so that the map can re-create on-the-fly added policies, e.g. when
        # restoring from disk.
        self.policy_specs[policy_id] = PolicySpec(
            policy_class=policy_cls,
            observation_space=observation_space,
            action_space=action_space,
            config=config_override)
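
In the static-graph ("tf") branch above, every policy gets its own `tf1.Graph`, its own `tf1.Session`, and a dedicated variable scope, so variables of different policies (or workers) cannot collide. Below is a minimal sketch of that isolation pattern using the TF1 compatibility API; `build_isolated_model` and the scope names (mimicking the `policy_id + "_wk" + worker_index` convention) are made up for illustration.

import tensorflow.compat.v1 as tf1

tf1.disable_v2_behavior()


def build_isolated_model(scope_name, seed=None):
    # Fresh graph per model: ops and variables don't leak between models.
    with tf1.Graph().as_default():
        sess = tf1.Session(config=tf1.ConfigProto(
            gpu_options=tf1.GPUOptions(allow_growth=True)))
        with sess.as_default():
            if seed is not None:
                tf1.set_random_seed(seed)  # graph-level seed
            with tf1.variable_scope(scope_name):
                w = tf1.get_variable("w", shape=(4, 2))
            sess.run(tf1.global_variables_initializer())
            return sess, w


sess_a, w_a = build_isolated_model("pol_0_wk1", seed=42)
sess_b, w_b = build_isolated_model("pol_1_wk1", seed=42)
# Same variable name "w", but under different scopes and in different graphs.
assert w_a.graph is not w_b.graph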