def _init(self):
    self._validate_config()
    self.local_evaluator = self.make_local_evaluator(
        self.env_creator, self._policy_graph)
    self.remote_evaluators = self.make_remote_evaluators(
        self.env_creator, self._policy_graph, self.config["num_workers"])
    if self.config["simple_optimizer"]:
        self.optimizer = SyncSamplesOptimizer(
            self.local_evaluator, self.remote_evaluators, {
                "num_sgd_iter": self.config["num_sgd_iter"],
                "train_batch_size": self.config["train_batch_size"],
            })
    else:
        self.optimizer = LocalMultiGPUOptimizer(
            self.local_evaluator, self.remote_evaluators, {
                "sgd_batch_size": self.config["sgd_minibatch_size"],
                "num_sgd_iter": self.config["num_sgd_iter"],
                "num_gpus": self.config["num_gpus"],
                "sample_batch_size": self.config["sample_batch_size"],
                "num_envs_per_worker": self.config["num_envs_per_worker"],
                "train_batch_size": self.config["train_batch_size"],
                "standardize_fields": ["advantages"],
                "straggler_mitigation": self.config["straggler_mitigation"],
            })
def _init(self):
    waste_ratio = (self.config["sample_batch_size"] *
                   self.config["num_workers"] /
                   self.config["train_batch_size"])
    if waste_ratio > 1:
        msg = ("sample_batch_size * num_workers >> train_batch_size. "
               "This means that many steps will be discarded. Consider "
               "reducing sample_batch_size, or increase train_batch_size.")
        if waste_ratio > 1.5:
            raise ValueError(msg)
        else:
            print("Warning: " + msg)
    self.local_evaluator = self.make_local_evaluator(
        self.env_creator, self._policy_graph)
    self.remote_evaluators = self.make_remote_evaluators(
        self.env_creator, self._policy_graph, self.config["num_workers"], {
            "num_cpus": self.config["num_cpus_per_worker"],
            "num_gpus": self.config["num_gpus_per_worker"]
        })
    if self.config["simple_optimizer"]:
        self.optimizer = SyncSamplesOptimizer(
            self.local_evaluator, self.remote_evaluators, {
                "num_sgd_iter": self.config["num_sgd_iter"],
                "train_batch_size": self.config["train_batch_size"]
            })
    else:
        self.optimizer = LocalMultiGPUOptimizer(
            self.local_evaluator, self.remote_evaluators, {
                "sgd_batch_size": self.config["sgd_minibatch_size"],
                "num_sgd_iter": self.config["num_sgd_iter"],
                "num_gpus": self.config["num_gpus"],
                "train_batch_size": self.config["train_batch_size"],
                "standardize_fields": ["advantages"],
            })
def _init(self):
    self.local_evaluator = self.make_local_evaluator(
        self.env_creator, PPOPolicyGraph)
    self.remote_evaluators = self.make_remote_evaluators(
        self.env_creator, PPOPolicyGraph, self.config["num_workers"], {
            "num_cpus": self.config["num_cpus_per_worker"],
            "num_gpus": self.config["num_gpus_per_worker"]
        })
    if self.config["simple_optimizer"]:
        self.optimizer = SyncSamplesOptimizer(
            self.local_evaluator, self.remote_evaluators,
            {"num_sgd_iter": self.config["num_sgd_iter"]})
    else:
        self.optimizer = LocalMultiGPUOptimizer(
            self.local_evaluator, self.remote_evaluators, {
                "sgd_batch_size": self.config["sgd_batchsize"],
                "sgd_stepsize": self.config["sgd_stepsize"],
                "num_sgd_iter": self.config["num_sgd_iter"],
                "timesteps_per_batch": self.config["timesteps_per_batch"],
                "standardize_fields": ["advantages"]
            })
def _init(self, config, env_creator):
    self._validate_config()
    self.local_evaluator = self.make_local_evaluator(
        env_creator, self._policy_graph)
    self.remote_evaluators = self.make_remote_evaluators(
        env_creator, self._policy_graph, config["num_workers"])
    if config["simple_optimizer"]:
        self.optimizer = SyncSamplesOptimizer(
            self.local_evaluator,
            self.remote_evaluators,
            num_sgd_iter=config["num_sgd_iter"],
            train_batch_size=config["train_batch_size"])
    else:
        self.optimizer = LocalMultiGPUOptimizer(
            self.local_evaluator,
            self.remote_evaluators,
            sgd_batch_size=config["sgd_minibatch_size"],
            num_sgd_iter=config["num_sgd_iter"],
            num_gpus=config["num_gpus"],
            sample_batch_size=config["sample_batch_size"],
            num_envs_per_worker=config["num_envs_per_worker"],
            train_batch_size=config["train_batch_size"],
            standardize_fields=["advantages"],
            straggler_mitigation=config["straggler_mitigation"])
def choose_policy_optimizer(workers, config): if config["distributed_data_parallel_optimizer"]: if not config["use_pytorch"]: raise ValueError( "Distributed data parallel is only supported for PyTorch") if config["num_gpus"]: raise ValueError( "When using distributed data parallel, you should set " "num_gpus=0 since all optimization " "is happening on workers. Enable GPUs for workers by setting " "num_gpus_per_worker=1.") if config["batch_mode"] != "truncate_episodes": raise ValueError( "Distributed data parallel requires truncate_episodes " "batch mode.") if config["sample_batch_size"] != config["train_batch_size"]: raise ValueError( "Distributed data parallel requires sample_batch_size to be " "equal to train_batch_size. Each worker will sample and learn " "on train_batch_size samples per iteration.") return TorchDistributedDataParallelOptimizer( workers, num_sgd_iter=config["num_sgd_iter"], train_batch_size=config["train_batch_size"], sgd_minibatch_size=config["sgd_minibatch_size"], standardize_fields=["advantages"]) if config["simple_optimizer"]: return SyncSamplesOptimizer( workers, num_sgd_iter=config["num_sgd_iter"], train_batch_size=config["train_batch_size"], sgd_minibatch_size=config["sgd_minibatch_size"], standardize_fields=["advantages"]) return LocalMultiGPUOptimizer( workers, sgd_batch_size=config["sgd_minibatch_size"], num_sgd_iter=config["num_sgd_iter"], num_gpus=config["num_gpus"], sample_batch_size=config["sample_batch_size"], num_envs_per_worker=config["num_envs_per_worker"], train_batch_size=config["train_batch_size"], standardize_fields=["advantages"], shuffle_sequences=config["shuffle_sequences"])
def choose_policy_optimizer(workers, config): if config["simple_optimizer"]: return SyncSamplesOptimizer( workers, num_sgd_iter=config["num_sgd_iter"], train_batch_size=config["train_batch_size"]) return LocalMultiGPUOptimizer( workers, sgd_batch_size=config["sgd_minibatch_size"], num_sgd_iter=config["num_sgd_iter"], num_gpus=config["num_gpus"], sample_batch_size=config["sample_batch_size"], num_envs_per_worker=config["num_envs_per_worker"], train_batch_size=config["train_batch_size"], standardize_fields=["advantages"], shuffle_sequences=config["shuffle_sequences"])
def make_optimizer(local_evaluator, remote_evaluators, config):
    if config["simple_optimizer"]:
        return SyncSamplesOptimizer(
            local_evaluator,
            remote_evaluators,
            num_sgd_iter=config["num_sgd_iter"],
            train_batch_size=config["train_batch_size"])
    return LocalMultiGPUOptimizer(
        local_evaluator,
        remote_evaluators,
        sgd_batch_size=config["sgd_minibatch_size"],
        num_sgd_iter=config["num_sgd_iter"],
        num_gpus=config["num_gpus"],
        sample_batch_size=config["sample_batch_size"],
        num_envs_per_worker=config["num_envs_per_worker"],
        train_batch_size=config["train_batch_size"],
        standardize_fields=["advantages"],
        straggler_mitigation=config["straggler_mitigation"])
def make_policy_optimizer_tnbes(workers, config):
    """We implement the knob of NORMALIZE_ADVANTAGE here."""
    if config["simple_optimizer"]:
        raise NotImplementedError()
    if config[NORMALIZE_ADVANTAGE]:
        normalized_fields = ["advantages", DIVERSITY_ADVANTAGES]
    else:
        normalized_fields = []
    return LocalMultiGPUOptimizer(
        workers,
        sgd_batch_size=config["sgd_minibatch_size"],
        num_sgd_iter=config["num_sgd_iter"],
        num_gpus=config["num_gpus"],
        rollout_fragment_length=config["rollout_fragment_length"],
        num_envs_per_worker=config["num_envs_per_worker"],
        train_batch_size=config["train_batch_size"],
        standardize_fields=normalized_fields,
        shuffle_sequences=config["shuffle_sequences"])
class PPOAgent(Agent):
    """Multi-GPU optimized implementation of PPO in TensorFlow."""

    _agent_name = "PPO"
    _default_config = DEFAULT_CONFIG
    _policy_graph = PPOPolicyGraph

    @override(Agent)
    def _init(self):
        self._validate_config()
        self.local_evaluator = self.make_local_evaluator(
            self.env_creator, self._policy_graph)
        self.remote_evaluators = self.make_remote_evaluators(
            self.env_creator, self._policy_graph, self.config["num_workers"])
        if self.config["simple_optimizer"]:
            self.optimizer = SyncSamplesOptimizer(
                self.local_evaluator, self.remote_evaluators, {
                    "num_sgd_iter": self.config["num_sgd_iter"],
                    "train_batch_size": self.config["train_batch_size"],
                })
        else:
            self.optimizer = LocalMultiGPUOptimizer(
                self.local_evaluator, self.remote_evaluators, {
                    "sgd_batch_size": self.config["sgd_minibatch_size"],
                    "num_sgd_iter": self.config["num_sgd_iter"],
                    "num_gpus": self.config["num_gpus"],
                    "train_batch_size": self.config["train_batch_size"],
                    "standardize_fields": ["advantages"],
                })

    @override(Agent)
    def _train(self):
        prev_steps = self.optimizer.num_steps_sampled
        fetches = self.optimizer.step()
        if "kl" in fetches:
            # single-agent
            self.local_evaluator.for_policy(
                lambda pi: pi.update_kl(fetches["kl"]))
        else:
            # multi-agent
            self.local_evaluator.foreach_trainable_policy(
                lambda pi, pi_id: pi.update_kl(fetches[pi_id]["kl"]))
        res = self.optimizer.collect_metrics(
            self.config["collect_metrics_timeout"])
        res.update(
            timesteps_this_iter=self.optimizer.num_steps_sampled - prev_steps,
            info=dict(fetches, **res.get("info", {})))
        return res

    def _validate_config(self):
        waste_ratio = (self.config["sample_batch_size"] *
                       self.config["num_workers"] /
                       self.config["train_batch_size"])
        if waste_ratio > 1:
            msg = ("sample_batch_size * num_workers >> train_batch_size. "
                   "This means that many steps will be discarded. Consider "
                   "reducing sample_batch_size, or increase train_batch_size.")
            if waste_ratio > 1.5:
                raise ValueError(msg)
            else:
                logger.warn(msg)
        if self.config["sgd_minibatch_size"] > self.config["train_batch_size"]:
            raise ValueError(
                "Minibatch size {} must be <= train batch size {}.".format(
                    self.config["sgd_minibatch_size"],
                    self.config["train_batch_size"]))
        if (self.config["batch_mode"] == "truncate_episodes"
                and not self.config["use_gae"]):
            raise ValueError(
                "Episode truncation is not supported without a value function")
        if (self.config["multiagent"]["policy_graphs"]
                and not self.config["simple_optimizer"]):
            logger.info(
                "In multi-agent mode, policies will be optimized sequentially "
                "by the multi-GPU optimizer. Consider setting "
                "simple_optimizer=True if this doesn't work for you.")
        if self.config["observation_filter"] != "NoFilter":
            # TODO(ekl): consider setting the default to be NoFilter
            logger.warn(
                "By default, observations will be normalized with {}".format(
                    self.config["observation_filter"]))
class PPOAgent(Agent):
    """Multi-GPU optimized implementation of PPO in TensorFlow."""

    _agent_name = "PPO"
    _default_config = DEFAULT_CONFIG
    _policy_graph = PPOPolicyGraph

    @classmethod
    def default_resource_request(cls, config):
        cf = merge_dicts(cls._default_config, config)
        return Resources(
            cpu=1,
            gpu=cf["num_gpus"],
            extra_cpu=cf["num_cpus_per_worker"] * cf["num_workers"],
            extra_gpu=cf["num_gpus_per_worker"] * cf["num_workers"])

    def _init(self):
        self.local_evaluator = self.make_local_evaluator(
            self.env_creator, self._policy_graph)
        self.remote_evaluators = self.make_remote_evaluators(
            self.env_creator, self._policy_graph, self.config["num_workers"], {
                "num_cpus": self.config["num_cpus_per_worker"],
                "num_gpus": self.config["num_gpus_per_worker"]
            })
        if self.config["simple_optimizer"]:
            self.optimizer = SyncSamplesOptimizer(
                self.local_evaluator, self.remote_evaluators, {
                    "num_sgd_iter": self.config["num_sgd_iter"],
                    "timesteps_per_batch": self.config["timesteps_per_batch"]
                })
        else:
            self.optimizer = LocalMultiGPUOptimizer(
                self.local_evaluator, self.remote_evaluators, {
                    "sgd_batch_size": self.config["sgd_batchsize"],
                    "num_sgd_iter": self.config["num_sgd_iter"],
                    "num_gpus": self.config["num_gpus"],
                    "timesteps_per_batch": self.config["timesteps_per_batch"],
                    "standardize_fields": ["advantages"],
                })

    def _train(self):
        prev_steps = self.optimizer.num_steps_sampled
        fetches = self.optimizer.step()
        if "kl" in fetches:
            # single-agent
            self.local_evaluator.for_policy(
                lambda pi: pi.update_kl(fetches["kl"]))
        else:
            # multi-agent
            self.local_evaluator.foreach_trainable_policy(
                lambda pi, pi_id: pi.update_kl(fetches[pi_id]["kl"]))
        res = self.optimizer.collect_metrics()
        res.update(
            timesteps_this_iter=self.optimizer.num_steps_sampled - prev_steps,
            info=dict(fetches, **res.get("info", {})))
        return res

    def _stop(self):
        # workaround for https://github.com/ray-project/ray/issues/1516
        for ev in self.remote_evaluators:
            ev.__ray_terminate__.remote()

    def _save(self, checkpoint_dir):
        checkpoint_path = os.path.join(checkpoint_dir,
                                       "checkpoint-{}".format(self.iteration))
        agent_state = ray.get(
            [a.save.remote() for a in self.remote_evaluators])
        extra_data = [self.local_evaluator.save(), agent_state]
        pickle.dump(extra_data, open(checkpoint_path + ".extra_data", "wb"))
        return checkpoint_path

    def _restore(self, checkpoint_path):
        extra_data = pickle.load(open(checkpoint_path + ".extra_data", "rb"))
        self.local_evaluator.restore(extra_data[0])
        ray.get([
            a.restore.remote(o)
            for (a, o) in zip(self.remote_evaluators, extra_data[1])
        ])
class PPOAgent(Agent):
    """Multi-GPU optimized implementation of PPO in TensorFlow."""

    _agent_name = "PPO"
    _default_config = DEFAULT_CONFIG
    _policy_graph = PPOPolicyGraph

    @override(Agent)
    def _init(self):
        self._validate_config()
        self.local_evaluator = self.make_local_evaluator(
            self.env_creator, self._policy_graph)
        self.remote_evaluators = self.make_remote_evaluators(
            self.env_creator, self._policy_graph, self.config["num_workers"])
        if self.config["simple_optimizer"]:
            self.optimizer = SyncSamplesOptimizer(
                self.local_evaluator, self.remote_evaluators, {
                    "num_sgd_iter": self.config["num_sgd_iter"],
                    "train_batch_size": self.config["train_batch_size"],
                })
        else:
            self.optimizer = LocalMultiGPUOptimizer(
                self.local_evaluator, self.remote_evaluators, {
                    "sgd_batch_size": self.config["sgd_minibatch_size"],
                    "num_sgd_iter": self.config["num_sgd_iter"],
                    "num_gpus": self.config["num_gpus"],
                    "sample_batch_size": self.config["sample_batch_size"],
                    "num_envs_per_worker": self.config["num_envs_per_worker"],
                    "train_batch_size": self.config["train_batch_size"],
                    "standardize_fields": ["advantages"],
                    "straggler_mitigation": self.config["straggler_mitigation"],
                })

    @override(Agent)
    def _train(self):
        if "observation_filter" not in self.raw_user_config:
            # TODO(ekl) remove this message after a few releases
            logger.info(
                "Important! Since 0.7.0, observation normalization is no "
                "longer enabled by default. To enable running-mean "
                "normalization, set 'observation_filter': 'MeanStdFilter'. "
                "You can ignore this message if your environment doesn't "
                "require observation normalization.")
        prev_steps = self.optimizer.num_steps_sampled
        fetches = self.optimizer.step()
        if "kl" in fetches:
            # single-agent
            self.local_evaluator.for_policy(
                lambda pi: pi.update_kl(fetches["kl"]))
        else:

            def update(pi, pi_id):
                if pi_id in fetches:
                    pi.update_kl(fetches[pi_id]["kl"])
                else:
                    logger.debug(
                        "No data for {}, not updating kl".format(pi_id))

            # multi-agent
            self.local_evaluator.foreach_trainable_policy(update)
        res = self.optimizer.collect_metrics(
            self.config["collect_metrics_timeout"])
        res.update(
            timesteps_this_iter=self.optimizer.num_steps_sampled - prev_steps,
            info=dict(fetches, **res.get("info", {})))

        # Warn about bad clipping configs
        if self.config["vf_clip_param"] <= 0:
            rew_scale = float("inf")
        elif res["policy_reward_mean"]:
            rew_scale = 0  # punt on handling multiagent case
        else:
            rew_scale = round(
                abs(res["episode_reward_mean"]) /
                self.config["vf_clip_param"], 0)
        if rew_scale > 100:
            logger.warning(
                "The magnitude of your environment rewards are more than "
                "{}x the scale of `vf_clip_param`. ".format(rew_scale) +
                "This means that it will take more than "
                "{} iterations for your value ".format(rew_scale) +
                "function to converge. If this is not intended, consider "
                "increasing `vf_clip_param`.")
        return res

    def _validate_config(self):
        if self.config["entropy_coeff"] < 0:
            raise DeprecationWarning("entropy_coeff must be >= 0")
        if self.config["sgd_minibatch_size"] > self.config["train_batch_size"]:
            raise ValueError(
                "Minibatch size {} must be <= train batch size {}.".format(
                    self.config["sgd_minibatch_size"],
                    self.config["train_batch_size"]))
        if (self.config["batch_mode"] == "truncate_episodes"
                and not self.config["use_gae"]):
            raise ValueError(
                "Episode truncation is not supported without a value "
                "function. Consider setting batch_mode=complete_episodes.")
        if (self.config["multiagent"]["policy_graphs"]
                and not self.config["simple_optimizer"]):
            logger.info(
                "In multi-agent mode, policies will be optimized sequentially "
                "by the multi-GPU optimizer. Consider setting "
                "simple_optimizer=True if this doesn't work for you.")
        if not self.config["vf_share_layers"]:
            logger.warning(
                "FYI: By default, the value function will not share layers "
                "with the policy model ('vf_share_layers': False).")
class PPOAgent(Agent):
    """Multi-GPU optimized implementation of PPO in TensorFlow."""

    _agent_name = "PPO"
    _default_config = DEFAULT_CONFIG
    _policy_graph = PPOPolicyGraph

    @classmethod
    def default_resource_request(cls, config):
        cf = merge_dicts(cls._default_config, config)
        return Resources(
            cpu=1,
            gpu=cf["num_gpus"],
            extra_cpu=cf["num_cpus_per_worker"] * cf["num_workers"],
            extra_gpu=cf["num_gpus_per_worker"] * cf["num_workers"])

    def _init(self):
        self._validate_config()
        self.local_evaluator = self.make_local_evaluator(
            self.env_creator, self._policy_graph)
        self.remote_evaluators = self.make_remote_evaluators(
            self.env_creator, self._policy_graph, self.config["num_workers"], {
                "num_cpus": self.config["num_cpus_per_worker"],
                "num_gpus": self.config["num_gpus_per_worker"]
            })
        if self.config["simple_optimizer"]:
            self.optimizer = SyncSamplesOptimizer(
                self.local_evaluator, self.remote_evaluators, {
                    "num_sgd_iter": self.config["num_sgd_iter"],
                    "train_batch_size": self.config["train_batch_size"],
                })
        else:
            self.optimizer = LocalMultiGPUOptimizer(
                self.local_evaluator, self.remote_evaluators, {
                    "sgd_batch_size": self.config["sgd_minibatch_size"],
                    "num_sgd_iter": self.config["num_sgd_iter"],
                    "num_gpus": self.config["num_gpus"],
                    "train_batch_size": self.config["train_batch_size"],
                    "standardize_fields": ["advantages"],
                })

    def _validate_config(self):
        waste_ratio = (
            self.config["sample_batch_size"] * self.config["num_workers"] /
            self.config["train_batch_size"])
        if waste_ratio > 1:
            msg = ("sample_batch_size * num_workers >> train_batch_size. "
                   "This means that many steps will be discarded. Consider "
                   "reducing sample_batch_size, or increase train_batch_size.")
            if waste_ratio > 1.5:
                raise ValueError(msg)
            else:
                print("Warning: " + msg)
        if self.config["sgd_minibatch_size"] > self.config["train_batch_size"]:
            raise ValueError(
                "Minibatch size {} must be <= train batch size {}.".format(
                    self.config["sgd_minibatch_size"],
                    self.config["train_batch_size"]))
        if (self.config["batch_mode"] == "truncate_episodes"
                and not self.config["use_gae"]):
            raise ValueError(
                "Episode truncation is not supported without a value function")

    def _train(self):
        prev_steps = self.optimizer.num_steps_sampled
        fetches = self.optimizer.step()
        if "kl" in fetches:
            # single-agent
            self.local_evaluator.for_policy(
                lambda pi: pi.update_kl(fetches["kl"]))
        else:
            # multi-agent
            self.local_evaluator.foreach_trainable_policy(
                lambda pi, pi_id: pi.update_kl(fetches[pi_id]["kl"]))
        res = self.optimizer.collect_metrics()
        res.update(
            timesteps_this_iter=self.optimizer.num_steps_sampled - prev_steps,
            info=dict(fetches, **res.get("info", {})))
        return res
class PPOAgentICM(Agent):
    """Multi-GPU optimized implementation of PPO in TensorFlow."""

    # _agent_name = "PPO"
    # _default_config = DEFAULT_CONFIG
    # _policy_graph = PPOPolicyGraph
    _agent_name = "PPO_ICM"
    _default_config = DEFAULT_CONFIG
    _policy_graph = PPOPolicyGraphICM

    @classmethod
    def default_resource_request(cls, config):
        cf = merge_dicts(cls._default_config, config)
        return Resources(
            cpu=1,
            gpu=cf["num_gpus"],
            extra_cpu=cf["num_cpus_per_worker"] * cf["num_workers"],
            extra_gpu=cf["num_gpus_per_worker"] * cf["num_workers"])

    def _init(self):
        self._validate_config()
        self.local_evaluator = self.make_local_evaluator(
            self.env_creator, self._policy_graph)
        self.remote_evaluators = self.make_remote_evaluators(
            self.env_creator, self._policy_graph, self.config["num_workers"], {
                "num_cpus": self.config["num_cpus_per_worker"],
                "num_gpus": self.config["num_gpus_per_worker"]
            })
        if self.config["simple_optimizer"]:
            self.optimizer = SyncSamplesOptimizer(
                self.local_evaluator, self.remote_evaluators, {
                    "num_sgd_iter": self.config["num_sgd_iter"],
                    "train_batch_size": self.config["train_batch_size"]
                })
        else:
            self.optimizer = LocalMultiGPUOptimizer(
                self.local_evaluator, self.remote_evaluators, {
                    "sgd_batch_size": self.config["sgd_minibatch_size"],
                    "num_sgd_iter": self.config["num_sgd_iter"],
                    "num_gpus": self.config["num_gpus"],
                    "train_batch_size": self.config["train_batch_size"],
                    "standardize_fields": ["advantages"],
                })

    def _validate_config(self):
        waste_ratio = (self.config["sample_batch_size"] *
                       self.config["num_workers"] /
                       self.config["train_batch_size"])
        if waste_ratio > 1:
            msg = ("sample_batch_size * num_workers >> train_batch_size. "
                   "This means that many steps will be discarded. Consider "
                   "reducing sample_batch_size, or increase train_batch_size.")
            if waste_ratio > 1.5:
                raise ValueError(msg)
            else:
                print("Warning: " + msg)
        if self.config["sgd_minibatch_size"] > self.config["train_batch_size"]:
            raise ValueError(
                "Minibatch size {} must be <= train batch size {}.".format(
                    self.config["sgd_minibatch_size"],
                    self.config["train_batch_size"]))
        if (self.config["batch_mode"] == "truncate_episodes"
                and not self.config["use_gae"]):
            raise ValueError(
                "Episode truncation is not supported without a value function")

    def _train(self):
        prev_steps = self.optimizer.num_steps_sampled
        fetches = self.optimizer.step()
        if "kl" in fetches:
            # single-agent
            self.local_evaluator.for_policy(
                lambda pi: pi.update_kl(fetches["kl"]))
        else:
            # multi-agent
            self.local_evaluator.foreach_trainable_policy(
                lambda pi, pi_id: pi.update_kl(fetches[pi_id]["kl"]))
        # samples = self.local_evaluator.sample()
        res = self.optimizer.collect_metrics()
        res.update(
            timesteps_this_iter=self.optimizer.num_steps_sampled - prev_steps,
            info=dict(fetches, **res.get("info", {})))
        return res

    def _stop(self):
        # workaround for https://github.com/ray-project/ray/issues/1516
        for ev in self.remote_evaluators:
            ev.__ray_terminate__.remote()

    def _save(self, checkpoint_dir):
        checkpoint_path = os.path.join(checkpoint_dir,
                                       "checkpoint-{}".format(self.iteration))
        agent_state = ray.get(
            [a.save.remote() for a in self.remote_evaluators])
        extra_data = [self.local_evaluator.save(), agent_state]
        pickle.dump(extra_data, open(checkpoint_path + ".extra_data", "wb"))
        return checkpoint_path

    def _restore(self, checkpoint_path):
        extra_data = pickle.load(open(checkpoint_path + ".extra_data", "rb"))
        self.local_evaluator.restore(extra_data[0])
        ray.get([
            a.restore.remote(o)
            for (a, o) in zip(self.remote_evaluators, extra_data[1])
        ])
class PPOAgent(Agent):
    """Multi-GPU optimized implementation of PPO in TensorFlow."""

    _agent_name = "PPO"
    _default_config = DEFAULT_CONFIG
    _policy_graph = PPOPolicyGraph

    @override(Agent)
    def _init(self):
        self._validate_config()
        self.local_evaluator = self.make_local_evaluator(
            self.env_creator, self._policy_graph)
        self.remote_evaluators = self.make_remote_evaluators(
            self.env_creator, self._policy_graph, self.config["num_workers"])
        if self.config["simple_optimizer"]:
            self.optimizer = SyncSamplesOptimizer(
                self.local_evaluator, self.remote_evaluators, {
                    "num_sgd_iter": self.config["num_sgd_iter"],
                    "train_batch_size": self.config["train_batch_size"],
                })
        else:
            self.optimizer = LocalMultiGPUOptimizer(
                self.local_evaluator, self.remote_evaluators, {
                    "sgd_batch_size": self.config["sgd_minibatch_size"],
                    "num_sgd_iter": self.config["num_sgd_iter"],
                    "num_gpus": self.config["num_gpus"],
                    "sample_batch_size": self.config["sample_batch_size"],
                    "num_envs_per_worker": self.config["num_envs_per_worker"],
                    "train_batch_size": self.config["train_batch_size"],
                    "standardize_fields": ["advantages"],
                    "straggler_mitigation": self.config["straggler_mitigation"],
                })

    @override(Agent)
    def _train(self):
        if "observation_filter" not in self.raw_user_config:
            # TODO(ekl) remove this message after a few releases
            logger.info(
                "Important! Since 0.7.0, observation normalization is no "
                "longer enabled by default. To enable running-mean "
                "normalization, set 'observation_filter': 'MeanStdFilter'. "
                "You can ignore this message if your environment doesn't "
                "require observation normalization.")
        prev_steps = self.optimizer.num_steps_sampled
        fetches = self.optimizer.step()
        if "kl" in fetches:
            # single-agent
            self.local_evaluator.for_policy(
                lambda pi: pi.update_kl(fetches["kl"]))
        else:

            def update(pi, pi_id):
                if pi_id in fetches:
                    pi.update_kl(fetches[pi_id]["kl"])
                else:
                    logger.debug(
                        "No data for {}, not updating kl".format(pi_id))

            # multi-agent
            self.local_evaluator.foreach_trainable_policy(update)
        res = self.optimizer.collect_metrics(
            self.config["collect_metrics_timeout"])
        res.update(
            timesteps_this_iter=self.optimizer.num_steps_sampled - prev_steps,
            info=dict(fetches, **res.get("info", {})))

        # Warn about bad clipping configs
        if self.config["vf_clip_param"] <= 0:
            rew_scale = float("inf")
        elif res["policy_reward_mean"]:
            rew_scale = 0  # punt on handling multiagent case
        else:
            rew_scale = round(
                abs(res["episode_reward_mean"]) /
                self.config["vf_clip_param"], 0)
        if rew_scale > 100:
            logger.warning(
                "The magnitude of your environment rewards are more than "
                "{}x the scale of `vf_clip_param`. ".format(rew_scale) +
                "This means that it will take more than "
                "{} iterations for your value ".format(rew_scale) +
                "function to converge. If this is not intended, consider "
                "increasing `vf_clip_param`.")
        return res

    def _validate_config(self):
        if self.config["sgd_minibatch_size"] > self.config["train_batch_size"]:
            raise ValueError(
                "Minibatch size {} must be <= train batch size {}.".format(
                    self.config["sgd_minibatch_size"],
                    self.config["train_batch_size"]))
        if (self.config["batch_mode"] == "truncate_episodes"
                and not self.config["use_gae"]):
            raise ValueError(
                "Episode truncation is not supported without a value "
                "function. Consider setting batch_mode=complete_episodes.")
        if (self.config["multiagent"]["policy_graphs"]
                and not self.config["simple_optimizer"]):
            logger.info(
                "In multi-agent mode, policies will be optimized sequentially "
                "by the multi-GPU optimizer. Consider setting "
                "simple_optimizer=True if this doesn't work for you.")
        if not self.config["vf_share_layers"]:
            logger.warning(
                "FYI: By default, the value function will not share layers "
                "with the policy model ('vf_share_layers': False).")