def create_policy(
    self,
    policy_id: PolicyID,
    policy_cls: Type["Policy"],
    observation_space: gym.Space,
    action_space: gym.Space,
    config_override: PartialAlgorithmConfigDict,
    merged_config: AlgorithmConfigDict,
) -> None:
    """Creates a new policy and stores it to the cache.

    Args:
        policy_id: The policy ID. This is the key under which the created
            policy will be stored in this map.
        policy_cls: The (original) policy class to use. This may still be
            altered in case tf-eager (and tracing) is used.
        observation_space: The observation space of the policy.
        action_space: The action space of the policy.
        config_override: The config override dict for this policy. This is
            the partial dict provided by the user.
        merged_config: The entire config (merged default config +
            `config_override`).
    """
    _class = get_tf_eager_cls_if_necessary(policy_cls, merged_config)

    self[policy_id] = create_policy_for_framework(
        policy_id,
        _class,
        merged_config,
        observation_space,
        action_space,
        self.worker_index,
        self.session_creator,
        self.seed,
    )

    # Store spec (class, obs-space, act-space, and config overrides) such
    # that the map will be able to reproduce on-the-fly added policies
    # from disk.
    self.policy_specs[policy_id] = PolicySpec(
        policy_class=policy_cls,
        observation_space=observation_space,
        action_space=action_space,
        config=config_override,
    )
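# A minimal, self-contained sketch of the spec-caching idea above: keep the
# constructor ingredients around so an evicted policy can be rebuilt on
# demand. `TinyPolicy` and `TinyMap` are illustrative stand-ins, NOT RLlib
# classes.
from typing import Any, Dict, Tuple, Type


class TinyPolicy:
    def __init__(self, obs_space: Any, act_space: Any, config: Dict):
        self.config = config


class TinyMap(dict):
    def __init__(self):
        super().__init__()
        # Maps policy ID -> (class, obs-space, act-space, config).
        self.specs: Dict[str, Tuple[Type, Any, Any, Dict]] = {}

    def create(self, pid: str, cls: Type, obs: Any, act: Any, config: Dict):
        self[pid] = cls(obs, act, config)
        self.specs[pid] = (cls, obs, act, config)  # Enough to rebuild later.

    def rebuild(self, pid: str):
        cls, obs, act, config = self.specs[pid]
        self[pid] = cls(obs, act, config)


m = TinyMap()
m.create("policy_0", TinyPolicy, None, None, {"lr": 1e-4})
del m["policy_0"]      # Simulate eviction from the in-memory cache.
m.rebuild("policy_0")  # Reconstructed from the stored spec.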
def add_policy(self, policy_id: PolicyID, policy_spec: PolicySpec):
    # Merge the policy's config overrides with the main config.
    # Also, adjust `num_gpus` (to indicate an individual policy's
    # num_gpus, not the total number of GPUs).
    cfg = Algorithm.merge_trainer_configs(
        self.config,
        dict(policy_spec.config, **{"num_gpus": self.num_gpus_per_policy}),
    )

    # Need to create the replay actor first. Then add the first policy.
    if self.replay_actor is None:
        return self._add_replay_buffer_and_policy(policy_id, policy_spec, cfg)

    # Replay actor already exists -> Just add a new policy here.
    assert len(self.policy_actors) < self.max_num_policies

    actual_policy_class = get_tf_eager_cls_if_necessary(
        policy_spec.policy_class, cfg
    )

    colocated = create_colocated_actors(
        actor_specs=[
            (
                ray.remote(
                    num_cpus=1,
                    num_gpus=self.num_gpus_per_policy
                    if not cfg["_fake_gpus"]
                    else 0,
                )(actual_policy_class),
                # Policy c'tor args.
                (policy_spec.observation_space, policy_spec.action_space, cfg),
                # Policy c'tor kwargs={}.
                {},
                # Count=1.
                1,
            )
        ],
        # Force co-locate on the already existing replay actor's node.
        node=ray.get(self.replay_actor.get_host.remote()),
    )

    self.policy_actors[policy_id] = colocated[0][0]

    return self.policy_actors[policy_id]
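# A minimal sketch of the config merge above, using plain-dict updates as a
# stand-in for Algorithm.merge_trainer_configs (which also resolves nested
# keys and deprecated settings). The concrete values are made up for
# illustration.
base_config = {"num_gpus": 4, "lr": 1e-3, "_fake_gpus": False}
policy_overrides = {"lr": 5e-4}
num_gpus_per_policy = 0.5

# The per-policy `num_gpus` overrides the cluster-wide total, exactly as in
# the `dict(policy_spec.config, **{"num_gpus": ...})` call above.
cfg = {**base_config, **dict(policy_overrides, num_gpus=num_gpus_per_policy)}
assert cfg == {"num_gpus": 0.5, "lr": 5e-4, "_fake_gpus": False}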
def _add_replay_buffer_and_policy(
    self,
    policy_id: PolicyID,
    policy_spec: PolicySpec,
    config: AlgorithmConfigDict,
):
    assert self.replay_actor is None
    assert len(self.policy_actors) == 0

    actual_policy_class = get_tf_eager_cls_if_necessary(
        policy_spec.policy_class, config
    )

    colocated = create_colocated_actors(
        actor_specs=[
            (self.replay_actor_class, self.replay_actor_args, {}, 1),
            (
                ray.remote(
                    num_cpus=1,
                    num_gpus=self.num_gpus_per_policy
                    if not config["_fake_gpus"]
                    else 0,
                )(actual_policy_class),
                # Policy c'tor args.
                (policy_spec.observation_space, policy_spec.action_space, config),
                # Policy c'tor kwargs={}.
                {},
                # Count=1.
                1,
            ),
        ],
        # None: No fixed node; co-locate all actors on any (same) node.
        node=None,
    )

    self.replay_actor = colocated[0][0]
    self.policy_actors[policy_id] = colocated[1][0]
    self.has_replay_buffer = True

    return self.policy_actors[policy_id]
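# A minimal sketch of the (class, args, kwargs, count) spec-tuple convention
# that `create_colocated_actors` consumes above. `DummyActor` is a
# placeholder; no Ray scheduling happens here, we only unpack the specs the
# same way, to show why results are indexed as colocated[0][0] (replay) and
# colocated[1][0] (first policy).
class DummyActor:
    def __init__(self, name: str):
        self.name = name


actor_specs = [
    (DummyActor, ("replay",), {}, 1),    # One replay-buffer-like actor.
    (DummyActor, ("policy_0",), {}, 1),  # One policy-like actor.
]

# One result list per spec, with `count` instances each.
colocated = [
    [cls(*args, **kwargs) for _ in range(count)]
    for cls, args, kwargs, count in actor_specs
]
assert colocated[0][0].name == "replay"
assert colocated[1][0].name == "policy_0"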
def create_policy(
    self,
    policy_id: PolicyID,
    policy_cls: Type["Policy"],
    observation_space: gym.Space,
    action_space: gym.Space,
    config_override: PartialTrainerConfigDict,
    merged_config: TrainerConfigDict,
) -> None:
    """Creates a new policy and stores it to the cache.

    Args:
        policy_id: The policy ID. This is the key under which the created
            policy will be stored in this map.
        policy_cls: The (original) policy class to use. This may still be
            altered in case tf-eager (and tracing) is used.
        observation_space: The observation space of the policy.
        action_space: The action space of the policy.
        config_override: The config override dict for this policy. This is
            the partial dict provided by the user.
        merged_config: The entire config (merged default config +
            `config_override`).
    """
    framework = merged_config.get("framework", "tf")
    class_ = get_tf_eager_cls_if_necessary(policy_cls, merged_config)

    # Tf.
    if framework in ["tf2", "tf", "tfe"]:
        var_scope = policy_id + (
            ("_wk" + str(self.worker_index)) if self.worker_index else ""
        )

        # For tf static graph, build every policy in its own graph
        # and create a new session for it.
        if framework == "tf":
            with tf1.Graph().as_default():
                if self.session_creator:
                    sess = self.session_creator()
                else:
                    sess = tf1.Session(
                        config=tf1.ConfigProto(
                            gpu_options=tf1.GPUOptions(allow_growth=True)
                        )
                    )
                with sess.as_default():
                    # Set graph-level seed.
                    if self.seed is not None:
                        tf1.set_random_seed(self.seed)
                    with tf1.variable_scope(var_scope):
                        self[policy_id] = class_(
                            observation_space, action_space, merged_config
                        )
        # For tf-eager: no graph, no session.
        else:
            with tf1.variable_scope(var_scope):
                self[policy_id] = class_(
                    observation_space, action_space, merged_config
                )
    # Non-tf: No graph, no session.
    else:
        class_ = policy_cls
        self[policy_id] = class_(observation_space, action_space, merged_config)

    # Store spec (class, obs-space, act-space, and config overrides) such
    # that the map will be able to reproduce on-the-fly added policies
    # from disk.
    self.policy_specs[policy_id] = PolicySpec(
        policy_class=policy_cls,
        observation_space=observation_space,
        action_space=action_space,
        config=config_override,
    )
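# A minimal sketch of the "one graph + one session per policy" pattern used
# in the static-graph branch above. Assumes TF1-style APIs via tf.compat.v1
# (aliased to `tf1`, as in RLlib); the single variable created per scope is
# just a placeholder for a policy's weights.
import tensorflow as tf

tf1 = tf.compat.v1

sessions = {}
for pid in ["policy_0", "policy_1"]:
    # Each policy gets its own graph, so variable names never collide.
    with tf1.Graph().as_default():
        sess = tf1.Session(
            config=tf1.ConfigProto(
                gpu_options=tf1.GPUOptions(allow_growth=True)
            )
        )
        with sess.as_default():
            tf1.set_random_seed(42)  # Graph-level seed, as above.
            with tf1.variable_scope(pid):
                tf1.get_variable("w", shape=(1,))
        sessions[pid] = sess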