class LocalMultiGPUOptimizer(PolicyOptimizer):
    """A synchronous optimizer that uses multiple local GPUs.

    Samples are pulled synchronously from multiple remote evaluators,
    concatenated, and then split across the memory of multiple local GPUs.
    A number of SGD passes are then taken over the in-memory data. For more
    details, see `multi_gpu_impl.LocalSyncParallelOptimizer`.

    This optimizer is Tensorflow-specific and requires the underlying
    PolicyGraph to be a TFPolicyGraph instance that supports `.copy()`.

    Note that all replicas of the TFPolicyGraph will merge their
    extra_compute_grad and apply_grad feed_dicts and fetches. This
    may result in unexpected behavior.
    """

    def _init(self,
              sgd_batch_size=128,
              sgd_stepsize=5e-5,
              num_sgd_iter=10,
              timesteps_per_batch=1024,
              standardize_fields=None):
        """Choose devices and build the per-device optimizer copies.

        Args:
            sgd_batch_size (int): Requested SGD minibatch size summed over
                all devices; rounded down to a multiple of the device count.
            sgd_stepsize (float): Step size handed to the Adam optimizer.
            num_sgd_iter (int): Number of SGD passes run by each `step()`.
            timesteps_per_batch (int): Forwarded to `collect_samples` when
                remote evaluators are used.
            standardize_fields (list): Names of sample batch fields that
                `step()` standardizes before loading. `None` (the default)
                means no fields.

        Raises:
            ValueError: If the local evaluator's policy map keys are not
                exactly {"default"}, or that policy is not a TFPolicyGraph.
        """
        self.sgd_stepsize = sgd_stepsize
        self.num_sgd_iter = num_sgd_iter
        self.timesteps_per_batch = timesteps_per_batch
        gpu_ids = ray.get_gpu_ids()
        if not gpu_ids:
            self.devices = ["/cpu:0"]
        else:
            self.devices = ["/gpu:{}".format(i) for i in range(len(gpu_ids))]
        # Round the batch size down so it splits evenly across the devices.
        self.batch_size = int(sgd_batch_size / len(self.devices)) * len(
            self.devices)
        assert self.batch_size % len(self.devices) == 0
        assert self.batch_size >= len(self.devices), "batch size too small"
        self.per_device_batch_size = int(self.batch_size / len(self.devices))
        self.sample_timer = TimerStat()
        self.load_timer = TimerStat()
        self.grad_timer = TimerStat()
        self.update_weights_timer = TimerStat()
        # A fresh list per instance: a literal `[]` default in the signature
        # would be one object shared by every call.
        self.standardize_fields = (
            [] if standardize_fields is None else standardize_fields)

        print("LocalMultiGPUOptimizer devices", self.devices)

        if set(self.local_evaluator.policy_map.keys()) != {"default"}:
            raise ValueError(
                "Multi-agent is not supported with multi-GPU. Try using the "
                "simple optimizer instead.")
        self.policy = self.local_evaluator.policy_map["default"]
        if not isinstance(self.policy, TFPolicyGraph):
            raise ValueError(
                "Only TF policies are supported with multi-GPU. Try using the "
                "simple optimizer instead.")

        # per-GPU graph copies created below must share vars with the policy
        # reuse is set to AUTO_REUSE because Adam nodes are created after
        # all of the device copies are created.
        with self.local_evaluator.tf_sess.graph.as_default():
            with self.local_evaluator.tf_sess.as_default():
                with tf.variable_scope("default", reuse=tf.AUTO_REUSE):
                    if self.policy._state_inputs:
                        rnn_inputs = self.policy._state_inputs + [
                            self.policy._seq_lens
                        ]
                    else:
                        rnn_inputs = []
                    self.par_opt = LocalSyncParallelOptimizer(
                        tf.train.AdamOptimizer(self.sgd_stepsize),
                        self.devices,
                        [v for _, v in self.policy.loss_inputs()],
                        rnn_inputs,
                        self.per_device_batch_size,
                        self.policy.copy,
                        os.getcwd())

                self.sess = self.local_evaluator.tf_sess
                self.sess.run(tf.global_variables_initializer())

    def step(self):
        """Run one sample / load / multi-pass SGD cycle.

        Returns:
            The result of `_averaged` applied to the fetches gathered during
            the last SGD pass (applied to an empty mapping when
            `num_sgd_iter` is 0).
        """
        with self.update_weights_timer:
            if self.remote_evaluators:
                weights = ray.put(self.local_evaluator.get_weights())
                for e in self.remote_evaluators:
                    e.set_weights.remote(weights)

        with self.sample_timer:
            if self.remote_evaluators:
                # TODO(rliaw): remove when refactoring
                from ray.rllib.agents.ppo.rollout import collect_samples
                samples = collect_samples(self.remote_evaluators,
                                          self.timesteps_per_batch)
            else:
                samples = self.local_evaluator.sample()
            self._check_not_multiagent(samples)

        for field in self.standardize_fields:
            value = samples[field]
            # Floor the divisor at 1e-4 so near-constant fields do not blow up.
            standardized = (value - value.mean()) / max(1e-4, value.std())
            samples[field] = standardized
        samples.shuffle()

        with self.load_timer:
            tuples = self.policy._get_loss_inputs_dict(samples)
            data_keys = [ph for _, ph in self.policy.loss_inputs()]
            if self.policy._state_inputs:
                state_keys = (
                    self.policy._state_inputs + [self.policy._seq_lens])
            else:
                state_keys = []
            tuples_per_device = self.par_opt.load_data(
                self.sess, [tuples[k] for k in data_keys],
                [tuples[k] for k in state_keys])

        with self.grad_timer:
            num_batches = (
                int(tuples_per_device) // int(self.per_device_batch_size))
            print("== sgd epochs ==")
            # Bound before the loop so the return below is defined even when
            # num_sgd_iter is 0.
            # NOTE(review): assumes `_averaged` (defined elsewhere in this
            # module) accepts an empty mapping - confirm.
            iter_extra_fetches = defaultdict(list)
            for i in range(self.num_sgd_iter):
                iter_extra_fetches = defaultdict(list)
                permutation = np.random.permutation(num_batches)
                for batch_index in range(num_batches):
                    batch_fetches = self.par_opt.optimize(
                        self.sess, permutation[batch_index] *
                        self.per_device_batch_size)
                    for k, v in batch_fetches.items():
                        iter_extra_fetches[k].append(v)
                print(i, _averaged(iter_extra_fetches))

        self.num_steps_sampled += samples.count
        self.num_steps_trained += samples.count
        return _averaged(iter_extra_fetches)

    def stats(self):
        """Return the base optimizer stats plus mean timer values in ms."""
        return dict(
            PolicyOptimizer.stats(self), **{
                "sample_time_ms": round(1000 * self.sample_timer.mean, 3),
                "load_time_ms": round(1000 * self.load_timer.mean, 3),
                "grad_time_ms": round(1000 * self.grad_timer.mean, 3),
                "update_time_ms": round(1000 * self.update_weights_timer.mean,
                                        3),
            })
class LocalMultiGPUOptimizer(PolicyOptimizer):
    """Synchronous optimizer that spreads SGD over several local GPUs.

    Each step pulls samples synchronously from the remote evaluators (or the
    local one when there are none), loads them into the memory of the local
    devices, and then runs a number of SGD passes over that in-memory data.
    See `multi_gpu_impl.LocalSyncParallelOptimizer` for the details.

    This optimizer is Tensorflow-specific: the underlying PolicyGraph must be
    a TFPolicyGraph instance that supports `.copy()`.

    Note that all replicas of the TFPolicyGraph will merge their
    extra_compute_grad and apply_grad feed_dicts and fetches. This
    may result in unexpected behavior.
    """

    def _init(self, sgd_batch_size=128, sgd_stepsize=5e-5, num_sgd_iter=10,
              timesteps_per_batch=1024):
        """Pick the devices and build the per-device optimizer copies."""
        self.batch_size = sgd_batch_size
        self.sgd_stepsize = sgd_stepsize
        self.num_sgd_iter = num_sgd_iter
        self.timesteps_per_batch = timesteps_per_batch

        gpu_ids = ray.get_gpu_ids()
        if gpu_ids:
            self.devices = [
                "/gpu:{}".format(idx) for idx, _ in enumerate(gpu_ids)
            ]
        else:
            self.devices = ["/cpu:0"]

        # Round the batch size down to a multiple of the device count.
        num_devices = len(self.devices)
        self.batch_size = int(sgd_batch_size / num_devices) * num_devices
        assert self.batch_size % len(self.devices) == 0
        assert self.batch_size >= len(self.devices), "batch size too small"
        self.per_device_batch_size = int(self.batch_size / len(self.devices))

        self.sample_timer = TimerStat()
        self.load_timer = TimerStat()
        self.grad_timer = TimerStat()
        self.update_weights_timer = TimerStat()

        print("LocalMultiGPUOptimizer devices", self.devices)

        policy_ids = set(self.local_evaluator.policy_map.keys())
        assert policy_ids == {"default"}, \
            "Multi-agent is not supported"
        self.policy = self.local_evaluator.policy_map["default"]
        assert isinstance(self.policy, TFPolicyGraph), \
            "Only TF policies are supported"

        # The per-device graph copies must share variables with the policy.
        # AUTO_REUSE is needed because the Adam nodes are only created once
        # every device copy exists.
        tf_sess = self.local_evaluator.tf_sess
        with tf_sess.graph.as_default():
            with self.local_evaluator.tf_sess.as_default():
                with tf.variable_scope("default", reuse=tf.AUTO_REUSE):
                    adam = tf.train.AdamOptimizer(self.sgd_stepsize)
                    self.par_opt = LocalSyncParallelOptimizer(
                        adam,
                        self.devices,
                        self.policy.loss_inputs(),
                        self.per_device_batch_size,
                        self.policy.copy,
                        os.getcwd())

                self.sess = self.local_evaluator.tf_sess
                self.sess.run(tf.global_variables_initializer())

    def step(self, postprocess_fn=None):
        """Sample, load onto the devices, and run `num_sgd_iter` SGD passes.

        Args:
            postprocess_fn: Optional callable invoked with the sampled batch
                before it is loaded onto the devices.

        Returns:
            A defaultdict mapping each fetch key to a list holding, for each
            SGD pass, the list of per-minibatch values.
        """
        with self.update_weights_timer:
            if self.remote_evaluators:
                weights = ray.put(self.local_evaluator.get_weights())
                for evaluator in self.remote_evaluators:
                    evaluator.set_weights.remote(weights)

        with self.sample_timer:
            if self.remote_evaluators:
                # TODO(rliaw): remove when refactoring
                from ray.rllib.agents.ppo.rollout import collect_samples
                samples = collect_samples(self.remote_evaluators,
                                          self.timesteps_per_batch)
            else:
                samples = self.local_evaluator.sample()
            self._check_not_multiagent(samples)

            if postprocess_fn:
                postprocess_fn(samples)

        with self.load_timer:
            input_names = [key for key, _ in self.policy.loss_inputs()]
            tuples_per_device = self.par_opt.load_data(
                self.sess, samples.columns(input_names))

        with self.grad_timer:
            all_extra_fetches = defaultdict(list)
            num_batches = (
                int(tuples_per_device) // int(self.per_device_batch_size))
            for _ in range(self.num_sgd_iter):
                pass_fetches = defaultdict(list)
                order = np.random.permutation(num_batches)
                for position in range(num_batches):
                    # TODO(ekl) support ppo's debugging features, e.g.
                    # printing the current loss and tracing
                    offset = order[position] * self.per_device_batch_size
                    fetched = self.par_opt.optimize(self.sess, offset)
                    for key, value in fetched.items():
                        pass_fetches[key].append(value)
                for key, values in pass_fetches.items():
                    all_extra_fetches[key].append(values)

        self.num_steps_sampled += samples.count
        self.num_steps_trained += samples.count
        return all_extra_fetches

    def stats(self):
        """Return the base optimizer stats plus mean timer values in ms."""
        combined = dict(PolicyOptimizer.stats(self))
        combined.update({
            "sample_time_ms": round(1000 * self.sample_timer.mean, 3),
            "load_time_ms": round(1000 * self.load_timer.mean, 3),
            "grad_time_ms": round(1000 * self.grad_timer.mean, 3),
            "update_time_ms": round(1000 * self.update_weights_timer.mean, 3),
        })
        return combined
class PPOEvaluator(Evaluator):
    """
    Runner class that holds the simulator environment and the policy.

    Initializes the tensorflow graphs for both training and evaluation.
    One common policy graph is initialized on '/cpu:0' and holds all the shared
    network weights. When run as a remote agent, only this graph is used.
    """

    def __init__(self, registry, env_creator, config, logdir, is_remote):
        """Build the environment, TF session, placeholders and loss graphs.

        Args:
            registry: Passed to `ModelCatalog.get_preprocessor_as_wrapper`
                and to `ProximalPolicyLoss`.
            env_creator: Callable invoked with `config["env_config"]` to
                create the environment.
            config (dict): Configuration; keys are read as needed below.
            logdir: Passed to `LocalSyncParallelOptimizer`.
            is_remote (bool): When true, GPUs are hidden from this process
                and a single "/cpu:0" device is used.

        Raises:
            NotImplementedError: If the action space is neither
                `gym.spaces.Box` nor `gym.spaces.Discrete`.
        """
        self.registry = registry
        self.is_remote = is_remote
        if is_remote:
            # Hide all GPUs from this process.
            os.environ["CUDA_VISIBLE_DEVICES"] = ""
            devices = ["/cpu:0"]
        else:
            devices = config["devices"]
        self.devices = devices
        self.config = config
        self.logdir = logdir
        self.env = ModelCatalog.get_preprocessor_as_wrapper(
            registry, env_creator(config["env_config"]), config["model"])
        if is_remote:
            config_proto = tf.ConfigProto()
        else:
            config_proto = tf.ConfigProto(**config["tf_session_args"])
        self.sess = tf.Session(config=config_proto)
        if config["tf_debug_inf_or_nan"] and not is_remote:
            self.sess = tf_debug.LocalCLIDebugWrapperSession(self.sess)
            self.sess.add_tensor_filter("has_inf_or_nan",
                                        tf_debug.has_inf_or_nan)

        # Defines the training inputs:
        # The coefficient of the KL penalty.
        self.kl_coeff = tf.placeholder(
            name="newkl", shape=(), dtype=tf.float32)

        # The input observations.
        self.observations = tf.placeholder(
            tf.float32, shape=(None, ) + self.env.observation_space.shape)
        # Targets of the value function.
        self.value_targets = tf.placeholder(tf.float32, shape=(None, ))
        # Advantage values in the policy gradient estimator.
        self.advantages = tf.placeholder(tf.float32, shape=(None, ))

        action_space = self.env.action_space
        # TODO(rliaw): pull this into model_catalog
        if isinstance(action_space, gym.spaces.Box):
            self.actions = tf.placeholder(
                tf.float32, shape=(None, action_space.shape[0]))
        elif isinstance(action_space, gym.spaces.Discrete):
            self.actions = tf.placeholder(tf.int64, shape=(None, ))
        else:
            # NotImplemented is a non-callable constant; the exception class
            # is NotImplementedError.
            raise NotImplementedError(
                "action space " + str(type(action_space)) +
                " currently not supported")
        self.distribution_class, self.logit_dim = ModelCatalog.get_action_dist(
            action_space)
        # Log probabilities from the policy before the policy update.
        self.prev_logits = tf.placeholder(
            tf.float32, shape=(None, self.logit_dim))
        # Value function predictions before the policy update.
        self.prev_vf_preds = tf.placeholder(tf.float32, shape=(None, ))

        assert config["sgd_batchsize"] % len(devices) == 0, \
            "Batch size must be evenly divisible by devices"

        if is_remote:
            self.batch_size = config["rollout_batchsize"]
            self.per_device_batch_size = config["rollout_batchsize"]
        else:
            self.batch_size = config["sgd_batchsize"]
            self.per_device_batch_size = int(self.batch_size / len(devices))

        def build_loss(obs, vtargets, advs, acts, plog, pvf_preds):
            return ProximalPolicyLoss(
                self.env.observation_space, self.env.action_space, obs,
                vtargets, advs, acts, plog, pvf_preds, self.logit_dim,
                self.kl_coeff, self.distribution_class, self.config,
                self.sess, self.registry)

        self.par_opt = LocalSyncParallelOptimizer(
            tf.train.AdamOptimizer(self.config["sgd_stepsize"]),
            self.devices, [
                self.observations, self.value_targets, self.advantages,
                self.actions, self.prev_logits, self.prev_vf_preds
            ], self.per_device_batch_size, build_loss, self.logdir)

        # Metric ops
        with tf.name_scope("test_outputs"):
            policies = self.par_opt.get_device_losses()
            self.mean_loss = tf.reduce_mean(
                tf.stack(values=[policy.loss for policy in policies]), 0)
            self.mean_policy_loss = tf.reduce_mean(
                tf.stack(
                    values=[policy.mean_policy_loss for policy in policies]),
                0)
            self.mean_vf_loss = tf.reduce_mean(
                tf.stack(values=[policy.mean_vf_loss for policy in policies]),
                0)
            self.mean_kl = tf.reduce_mean(
                tf.stack(values=[policy.mean_kl for policy in policies]), 0)
            self.mean_entropy = tf.reduce_mean(
                tf.stack(values=[policy.mean_entropy for policy in policies]),
                0)

        # References to the model weights
        self.common_policy = self.par_opt.get_common_loss()
        self.variables = ray.experimental.TensorFlowVariables(
            self.common_policy.loss, self.sess)
        self.obs_filter = get_filter(config["observation_filter"],
                                     self.env.observation_space.shape)
        self.rew_filter = MeanStdFilter((), clip=5.0)
        self.filters = {
            "obs_filter": self.obs_filter,
            "rew_filter": self.rew_filter
        }
        self.sampler = SyncSampler(self.env, self.common_policy,
                                   self.obs_filter, self.config["horizon"],
                                   self.config["horizon"])
        self.sess.run(tf.global_variables_initializer())

    def load_data(self, trajectories, full_trace):
        """Load trajectory columns through `par_opt.load_data`.

        When `config["use_gae"]` is false, zero arrays shaped like the
        advantages stand in for the value targets and value predictions.

        Args:
            trajectories: Mapping indexed by column name.
            full_trace: Forwarded to `par_opt.load_data`.

        Returns:
            Whatever `par_opt.load_data` returns.
        """
        use_gae = self.config["use_gae"]
        dummy = np.zeros_like(trajectories["advantages"])
        return self.par_opt.load_data(
            self.sess, [
                trajectories["observations"],
                trajectories["value_targets"] if use_gae else dummy,
                trajectories["advantages"], trajectories["actions"],
                trajectories["logprobs"],
                trajectories["vf_preds"] if use_gae else dummy
            ],
            full_trace=full_trace)

    def run_sgd_minibatch(self, batch_index, kl_coeff, full_trace,
                          file_writer):
        """Run one optimizer step, also fetching the mean metric ops.

        Args:
            batch_index: Forwarded to `par_opt.optimize`.
            kl_coeff: Value fed into the KL coefficient placeholder.
            full_trace (bool): `file_writer` is only passed on when true.
            file_writer: Forwarded to `par_opt.optimize` if tracing.

        Returns:
            Whatever `par_opt.optimize` returns.
        """
        return self.par_opt.optimize(
            self.sess,
            batch_index,
            extra_ops=[
                self.mean_loss, self.mean_policy_loss, self.mean_vf_loss,
                self.mean_kl, self.mean_entropy
            ],
            extra_feed_dict={self.kl_coeff: kl_coeff},
            file_writer=file_writer if full_trace else None)

    def compute_gradients(self, samples):
        """Not supported by this evaluator."""
        raise NotImplementedError

    def apply_gradients(self, grads):
        """Not supported by this evaluator."""
        raise NotImplementedError

    def save(self):
        """Return the pickled filter state; filter buffers are flushed."""
        filters = self.get_filters(flush_after=True)
        return pickle.dumps({"filters": filters})

    def restore(self, objs):
        """Restore filter state from bytes produced by `save()`."""
        # NOTE(review): pickle.loads executes arbitrary code from its input;
        # only feed this data that this process (or a trusted peer) saved.
        objs = pickle.loads(objs)
        self.sync_filters(objs["filters"])

    def get_weights(self):
        """Return the model weights via the TensorFlowVariables helper."""
        return self.variables.get_weights()

    def set_weights(self, weights):
        """Set the model weights via the TensorFlowVariables helper."""
        self.variables.set_weights(weights)

    def sample(self):
        """Returns experience samples from this Evaluator. Observation
        filter and reward filters are flushed here.

        Returns:
            SampleBatch: A columnar batch of experiences.
        """
        num_steps_so_far = 0
        all_samples = []

        while num_steps_so_far < self.config["min_steps_per_task"]:
            rollout = self.sampler.get_data()
            samples = process_rollout(
                rollout,
                self.rew_filter,
                self.config["gamma"],
                self.config["lambda"],
                use_gae=self.config["use_gae"])
            num_steps_so_far += samples.count
            all_samples.append(samples)
        return SampleBatch.concat_samples(all_samples)

    def get_completed_rollout_metrics(self):
        """Returns metrics on previously completed rollouts.

        Calling this clears the queue of completed rollout metrics.
        """
        return self.sampler.get_metrics()

    def sync_filters(self, new_filters):
        """Changes self's filter to given and rebases any accumulated delta.

        Args:
            new_filters (dict): Filters with new state to update local copy.
        """
        assert all(k in new_filters for k in self.filters)
        for k in self.filters:
            self.filters[k].sync(new_filters[k])

    def get_filters(self, flush_after=False):
        """Returns a snapshot of filters.

        Args:
            flush_after (bool): Clears the filter buffer state.

        Returns:
            return_filters (dict): Dict for serializable filters
        """
        return_filters = {}
        for k, f in self.filters.items():
            return_filters[k] = f.as_serializable()
            if flush_after:
                f.clear_buffer()
        return return_filters
class LocalMultiGPUOptimizer(PolicyOptimizer):
    """A synchronous optimizer that uses multiple local GPUs.

    Samples are pulled synchronously from multiple remote evaluators,
    concatenated, and then split across the memory of multiple local GPUs.
    A number of SGD passes are then taken over the in-memory data. For more
    details, see `multi_gpu_impl.LocalSyncParallelOptimizer`.

    This optimizer is Tensorflow-specific and requires evaluators to
    implement the TFMultiGPUSupport API.
    """

    def _init(self, sgd_batch_size=128, sgd_stepsize=5e-5, num_sgd_iter=10,
              timesteps_per_batch=1024):
        """Choose devices and build the per-device optimizer copies.

        Args:
            sgd_batch_size (int): Requested SGD minibatch size summed over
                all devices; rounded down to a multiple of the device count.
            sgd_stepsize (float): Step size handed to the Adam optimizer.
            num_sgd_iter (int): Number of SGD passes run by each `step()`.
            timesteps_per_batch (int): Forwarded to `collect_samples` when
                remote evaluators are used.
        """
        assert isinstance(self.local_evaluator, TFMultiGPUSupport)
        self.sgd_stepsize = sgd_stepsize
        self.num_sgd_iter = num_sgd_iter
        self.timesteps_per_batch = timesteps_per_batch
        gpu_ids = ray.get_gpu_ids()
        if not gpu_ids:
            self.devices = ["/cpu:0"]
        else:
            self.devices = ["/gpu:{}".format(i) for i in range(len(gpu_ids))]
        # Round the batch size down so it splits evenly across the devices.
        self.batch_size = int(
            sgd_batch_size / len(self.devices)) * len(self.devices)
        assert self.batch_size % len(self.devices) == 0
        assert self.batch_size >= len(self.devices), "batch size too small"
        self.per_device_batch_size = int(self.batch_size / len(self.devices))
        self.sample_timer = TimerStat()
        self.load_timer = TimerStat()
        self.grad_timer = TimerStat()
        self.update_weights_timer = TimerStat()

        print("LocalMultiGPUOptimizer devices", self.devices)
        print("LocalMultiGPUOptimizer batch size", self.batch_size)

        # List of (feature name, feature placeholder) tuples
        self.loss_inputs = self.local_evaluator.tf_loss_inputs()

        # per-GPU graph copies created below must share vars with the policy
        main_thread_scope = tf.get_variable_scope()
        # reuse is set to AUTO_REUSE because Adam nodes are created after
        # all of the device copies are created.
        with tf.variable_scope(main_thread_scope, reuse=tf.AUTO_REUSE):
            self.par_opt = LocalSyncParallelOptimizer(
                tf.train.AdamOptimizer(self.sgd_stepsize),
                self.devices,
                [ph for _, ph in self.loss_inputs],
                self.per_device_batch_size,
                lambda *ph: self.local_evaluator.build_tf_loss(ph),
                os.getcwd())

        # TODO(rliaw): Find more elegant solution for this
        if hasattr(self.local_evaluator, "init_extra_ops"):
            self.local_evaluator.init_extra_ops(
                self.par_opt.get_device_losses())

        self.sess = self.local_evaluator.sess
        self.sess.run(tf.global_variables_initializer())

    def step(self, postprocess_fn=None):
        """Run one sample / load / multi-pass SGD cycle.

        Args:
            postprocess_fn: Optional callable invoked with the sampled batch
                before it is loaded onto the devices.

        Returns:
            list: One entry per SGD pass, each a list holding the value
            returned by `par_opt.optimize` for every minibatch of that pass.
        """
        with self.update_weights_timer:
            if self.remote_evaluators:
                weights = ray.put(self.local_evaluator.get_weights())
                for e in self.remote_evaluators:
                    e.set_weights.remote(weights)

        with self.sample_timer:
            if self.remote_evaluators:
                # TODO(rliaw): remove when refactoring
                from ray.rllib.ppo.rollout import collect_samples
                samples = collect_samples(self.remote_evaluators,
                                          self.timesteps_per_batch)
            else:
                samples = self.local_evaluator.sample()
            assert isinstance(samples, SampleBatch)

            if postprocess_fn:
                postprocess_fn(samples)

        with self.load_timer:
            tuples_per_device = self.par_opt.load_data(
                self.local_evaluator.sess,
                samples.columns([key for key, _ in self.loss_inputs]))

        with self.grad_timer:
            all_extra_fetches = []
            model = self.local_evaluator
            num_batches = (
                int(tuples_per_device) // int(self.per_device_batch_size))
            for i in range(self.num_sgd_iter):
                iter_extra_fetches = []
                permutation = np.random.permutation(num_batches)
                for batch_index in range(num_batches):
                    # TODO(ekl) support ppo's debugging features, e.g.
                    # printing the current loss and tracing
                    batch_fetches = self.par_opt.optimize(
                        self.sess,
                        permutation[batch_index] * self.per_device_batch_size,
                        extra_ops=model.extra_apply_grad_fetches(),
                        extra_feed_dict=model.extra_apply_grad_feed_dict())
                    iter_extra_fetches.append(batch_fetches)
                all_extra_fetches.append(iter_extra_fetches)

        self.num_steps_sampled += samples.count
        self.num_steps_trained += samples.count
        return all_extra_fetches

    def stats(self):
        """Return the base optimizer stats plus mean timer values in ms."""
        # The base method is called unbound, so `self` must be passed
        # explicitly.
        return dict(PolicyOptimizer.stats(self), **{
            "sample_time_ms": round(1000 * self.sample_timer.mean, 3),
            "load_time_ms": round(1000 * self.load_timer.mean, 3),
            "grad_time_ms": round(1000 * self.grad_timer.mean, 3),
            "update_time_ms": round(1000 * self.update_weights_timer.mean, 3),
        })
class PPOEvaluator(Evaluator):
    """Owns the simulator environment and the PPO policy graphs.

    The tensorflow graphs for both training and evaluation are built in the
    constructor. One common policy graph is initialized on '/cpu:0' and holds
    all the shared network weights. When run as a remote agent, only this
    graph is used.
    """

    def __init__(self, registry, env_creator, config, logdir, is_remote):
        """Create the env, the TF session, the placeholders and the losses."""
        self.registry = registry
        self.is_remote = is_remote
        if is_remote:
            # Hide all GPUs from this process.
            os.environ["CUDA_VISIBLE_DEVICES"] = ""
            devices = ["/cpu:0"]
        else:
            devices = config["devices"]
        self.devices = devices
        self.config = config
        self.logdir = logdir

        raw_env = env_creator(config["env_config"])
        self.env = ModelCatalog.get_preprocessor_as_wrapper(
            registry, raw_env, config["model"])

        session_kwargs = {} if is_remote else config["tf_session_args"]
        self.sess = tf.Session(config=tf.ConfigProto(**session_kwargs))
        if config["tf_debug_inf_or_nan"] and not is_remote:
            self.sess = tf_debug.LocalCLIDebugWrapperSession(self.sess)
            self.sess.add_tensor_filter(
                "has_inf_or_nan", tf_debug.has_inf_or_nan)

        # Training inputs. The placeholders are created in a fixed order.
        batch_vector = (None,)
        # KL penalty coefficient.
        self.kl_coeff = tf.placeholder(
            name="newkl", shape=(), dtype=tf.float32)
        # Input observations.
        self.observations = tf.placeholder(
            tf.float32,
            shape=batch_vector + self.env.observation_space.shape)
        # Value function targets.
        self.value_targets = tf.placeholder(tf.float32, shape=batch_vector)
        # Advantages used by the policy gradient estimator.
        self.advantages = tf.placeholder(tf.float32, shape=batch_vector)

        action_space = self.env.action_space
        self.actions = ModelCatalog.get_action_placeholder(action_space)
        dist_and_dim = ModelCatalog.get_action_dist(action_space)
        self.distribution_class, self.logit_dim = dist_and_dim
        # Policy outputs recorded before the policy update.
        self.prev_logits = tf.placeholder(
            tf.float32, shape=(None, self.logit_dim))
        # Value function predictions recorded before the policy update.
        self.prev_vf_preds = tf.placeholder(tf.float32, shape=batch_vector)

        if is_remote:
            self.batch_size = config["rollout_batchsize"]
            self.per_device_batch_size = config["rollout_batchsize"]
        else:
            num_devices = len(devices)
            # Round down to a multiple of the device count.
            self.batch_size = (
                int(config["sgd_batchsize"] / num_devices) * num_devices)
            assert self.batch_size % len(devices) == 0
            self.per_device_batch_size = int(self.batch_size / len(devices))

        def build_loss(obs, vtargets, advs, acts, plog, pvf_preds):
            return ProximalPolicyLoss(
                self.env.observation_space, self.env.action_space,
                obs, vtargets, advs, acts, plog, pvf_preds, self.logit_dim,
                self.kl_coeff, self.distribution_class, self.config,
                self.sess, self.registry)

        loss_placeholders = [
            self.observations, self.value_targets, self.advantages,
            self.actions, self.prev_logits, self.prev_vf_preds,
        ]
        self.par_opt = LocalSyncParallelOptimizer(
            tf.train.AdamOptimizer(self.config["sgd_stepsize"]),
            self.devices,
            loss_placeholders,
            self.per_device_batch_size,
            build_loss,
            self.logdir)

        # Metric ops: each metric is averaged over the per-device losses.
        with tf.name_scope("test_outputs"):
            policies = self.par_opt.get_device_losses()

            def mean_over_devices(attr):
                per_device = [getattr(policy, attr) for policy in policies]
                return tf.reduce_mean(tf.stack(values=per_device), 0)

            self.mean_loss = mean_over_devices("loss")
            self.mean_policy_loss = mean_over_devices("mean_policy_loss")
            self.mean_vf_loss = mean_over_devices("mean_vf_loss")
            self.mean_kl = mean_over_devices("mean_kl")
            self.mean_entropy = mean_over_devices("mean_entropy")

        # Handles on the model weights.
        self.common_policy = self.par_opt.get_common_loss()
        self.variables = ray.experimental.TensorFlowVariables(
            self.common_policy.loss, self.sess)
        self.obs_filter = get_filter(
            config["observation_filter"], self.env.observation_space.shape)
        self.rew_filter = MeanStdFilter((), clip=5.0)
        self.filters = {
            "obs_filter": self.obs_filter,
            "rew_filter": self.rew_filter,
        }
        horizon = self.config["horizon"]
        self.sampler = SyncSampler(
            self.env, self.common_policy, self.obs_filter, horizon, horizon)
        self.sess.run(tf.global_variables_initializer())

    def load_data(self, trajectories, full_trace):
        """Load trajectory columns through `par_opt.load_data`.

        Without GAE, zero arrays shaped like the advantages replace the
        value targets and the value predictions.
        """
        use_gae = self.config["use_gae"]
        zeros = np.zeros_like(trajectories["advantages"])
        columns = [
            trajectories["observations"],
            trajectories["value_targets"] if use_gae else zeros,
            trajectories["advantages"],
            trajectories["actions"],
            trajectories["logprobs"],
            trajectories["vf_preds"] if use_gae else zeros,
        ]
        return self.par_opt.load_data(
            self.sess, columns, full_trace=full_trace)

    def run_sgd_minibatch(
            self, batch_index, kl_coeff, full_trace, file_writer):
        """Run one optimizer step and fetch the mean metric ops with it."""
        metric_ops = [
            self.mean_loss, self.mean_policy_loss, self.mean_vf_loss,
            self.mean_kl, self.mean_entropy,
        ]
        writer = file_writer if full_trace else None
        return self.par_opt.optimize(
            self.sess,
            batch_index,
            extra_ops=metric_ops,
            extra_feed_dict={self.kl_coeff: kl_coeff},
            file_writer=writer)

    def compute_gradients(self, samples):
        """Not supported by this evaluator."""
        raise NotImplementedError

    def apply_gradients(self, grads):
        """Not supported by this evaluator."""
        raise NotImplementedError

    def save(self):
        """Pickle the current filter snapshot, flushing the filter buffers."""
        snapshot = self.get_filters(flush_after=True)
        return pickle.dumps({"filters": snapshot})

    def restore(self, objs):
        """Unpickle a blob produced by `save()` and sync the filters to it."""
        state = pickle.loads(objs)
        self.sync_filters(state["filters"])

    def get_weights(self):
        """Return the weights held by the TensorFlowVariables helper."""
        return self.variables.get_weights()

    def set_weights(self, weights):
        """Push `weights` into the TensorFlowVariables helper."""
        self.variables.set_weights(weights)

    def sample(self):
        """Returns experience samples from this Evaluator.

        Rollouts are gathered until at least `config["min_steps_per_task"]`
        steps have been collected.

        Returns:
            SampleBatch: A columnar batch of experiences.
        """
        collected = []
        total_steps = 0
        while total_steps < self.config["min_steps_per_task"]:
            batch = process_rollout(
                self.sampler.get_data(),
                self.rew_filter,
                self.config["gamma"],
                self.config["lambda"],
                use_gae=self.config["use_gae"])
            total_steps += batch.count
            collected.append(batch)
        return SampleBatch.concat_samples(collected)

    def get_completed_rollout_metrics(self):
        """Returns metrics on previously completed rollouts.

        Calling this clears the queue of completed rollout metrics.
        """
        return self.sampler.get_metrics()

    def sync_filters(self, new_filters):
        """Changes self's filter to given and rebases any accumulated delta.

        Args:
            new_filters (dict): Filters with new state to update local copy.
        """
        assert all(name in new_filters for name in self.filters)
        for name, local_filter in self.filters.items():
            local_filter.sync(new_filters[name])

    def get_filters(self, flush_after=False):
        """Returns a snapshot of filters.

        Args:
            flush_after (bool): Clears the filter buffer state.

        Returns:
            return_filters (dict): Dict for serializable filters
        """
        snapshot = {}
        for name, local_filter in self.filters.items():
            snapshot[name] = local_filter.as_serializable()
            if flush_after:
                local_filter.clear_buffer()
        return snapshot
class LocalMultiGPUOptimizer(Optimizer):
    """Synchronous optimizer that spreads SGD over several local GPUs.

    Each step pulls samples synchronously from the remote evaluators (or the
    local one when there are none), concatenates them, loads them into the
    memory of the local devices, and runs a number of SGD passes over that
    in-memory data. See `multi_gpu_impl.LocalSyncParallelOptimizer`.

    This optimizer is Tensorflow-specific: evaluators must implement the
    TFMultiGPUSupport API.
    """

    def _init(self):
        """Pick the devices and build the per-device optimizer copies."""
        assert isinstance(self.local_evaluator, TFMultiGPUSupport)
        self.batch_size = self.config.get("sgd_batch_size", 128)

        gpu_ids = ray.get_gpu_ids()
        if gpu_ids:
            self.devices = [
                "/gpu:{}".format(idx) for idx, _ in enumerate(gpu_ids)
            ]
        else:
            self.devices = ["/cpu:0"]
        assert self.batch_size > len(self.devices), "batch size too small"
        self.per_device_batch_size = self.batch_size // len(self.devices)

        self.sample_timer = TimerStat()
        self.load_timer = TimerStat()
        self.grad_timer = TimerStat()
        self.update_weights_timer = TimerStat()

        print("LocalMultiGPUOptimizer devices", self.devices)
        print("LocalMultiGPUOptimizer batch size", self.batch_size)

        # (feature name, feature placeholder) pairs.
        self.loss_inputs = self.local_evaluator.tf_loss_inputs()

        # The per-device graph copies must share variables with the policy.
        tf.get_variable_scope().reuse_variables()
        adam = tf.train.AdamOptimizer(self.config.get("sgd_stepsize", 5e-5))
        placeholders = [ph for _, ph in self.loss_inputs]
        self.par_opt = LocalSyncParallelOptimizer(
            adam,
            self.devices,
            placeholders,
            self.per_device_batch_size,
            lambda *ph: self.local_evaluator.build_tf_loss(ph),
            self.config.get("logdir", os.getcwd()))

        self.sess = self.local_evaluator.sess
        self.sess.run(tf.global_variables_initializer())

    def step(self):
        """Sync weights, sample, load onto the devices and run the SGD passes."""
        with self.update_weights_timer:
            if self.remote_evaluators:
                weights = ray.put(self.local_evaluator.get_weights())
                for evaluator in self.remote_evaluators:
                    evaluator.set_weights.remote(weights)

        with self.sample_timer:
            if self.remote_evaluators:
                pending = [
                    evaluator.sample.remote()
                    for evaluator in self.remote_evaluators
                ]
                samples = SampleBatch.concat_samples(ray.get(pending))
            else:
                samples = self.local_evaluator.sample()
            assert isinstance(samples, SampleBatch)

        with self.load_timer:
            input_names = [key for key, _ in self.loss_inputs]
            tuples_per_device = self.par_opt.load_data(
                self.local_evaluator.sess, samples.columns(input_names))

        with self.grad_timer:
            for _ in range(self.config.get("num_sgd_iter", 10)):
                num_batches = (int(tuples_per_device) //
                               int(self.per_device_batch_size))
                order = np.random.permutation(num_batches)
                for position in range(num_batches):
                    # TODO(ekl) support ppo's debugging features, e.g.
                    # printing the current loss and tracing
                    self.par_opt.optimize(
                        self.sess,
                        order[position] * self.per_device_batch_size)

    def stats(self):
        """Return the mean of each timer in milliseconds."""
        timers = (
            ("sample_time_ms", self.sample_timer),
            ("load_time_ms", self.load_timer),
            ("grad_time_ms", self.grad_timer),
            ("update_time_ms", self.update_weights_timer),
        )
        return {
            label: round(1000 * timer.mean, 3) for label, timer in timers
        }
class LocalMultiGPUOptimizer(Optimizer):
    """A synchronous optimizer that uses multiple local GPUs.

    Samples are pulled synchronously from multiple remote evaluators,
    concatenated, and then split across the memory of multiple local GPUs.
    A number of SGD passes are then taken over the in-memory data. For more
    details, see `multi_gpu_impl.LocalSyncParallelOptimizer`.

    This optimizer is Tensorflow-specific and requires evaluators to
    implement the TFMultiGPUSupport API.
    """

    def _init(self, sgd_batch_size=128, sgd_stepsize=5e-5, num_sgd_iter=10):
        """Choose devices and build the per-device optimizer copies.

        Args:
            sgd_batch_size (int): SGD minibatch size summed over all devices;
                each device gets `sgd_batch_size // num_devices`.
            sgd_stepsize (float): Step size handed to the Adam optimizer.
            num_sgd_iter (int): Number of SGD passes run by each `step()`.
        """
        assert isinstance(self.local_evaluator, TFMultiGPUSupport)
        self.batch_size = sgd_batch_size
        self.sgd_stepsize = sgd_stepsize
        self.num_sgd_iter = num_sgd_iter
        gpu_ids = ray.get_gpu_ids()
        if not gpu_ids:
            self.devices = ["/cpu:0"]
        else:
            self.devices = ["/gpu:{}".format(i) for i in range(len(gpu_ids))]
        assert self.batch_size > len(self.devices), "batch size too small"
        self.per_device_batch_size = self.batch_size // len(self.devices)
        self.sample_timer = TimerStat()
        self.load_timer = TimerStat()
        self.grad_timer = TimerStat()
        self.update_weights_timer = TimerStat()

        print("LocalMultiGPUOptimizer devices", self.devices)
        print("LocalMultiGPUOptimizer batch size", self.batch_size)

        # List of (feature name, feature placeholder) tuples
        self.loss_inputs = self.local_evaluator.tf_loss_inputs()

        # per-GPU graph copies created below must share vars with the policy
        tf.get_variable_scope().reuse_variables()

        self.par_opt = LocalSyncParallelOptimizer(
            tf.train.AdamOptimizer(self.sgd_stepsize),
            self.devices,
            [ph for _, ph in self.loss_inputs],
            self.per_device_batch_size,
            lambda *ph: self.local_evaluator.build_tf_loss(ph),
            os.getcwd())

        self.sess = self.local_evaluator.sess
        self.sess.run(tf.global_variables_initializer())

    def step(self):
        """Run one sample / load / multi-pass SGD cycle.

        Also adds the sampled batch's `count` to both `num_steps_sampled`
        and `num_steps_trained`.
        """
        with self.update_weights_timer:
            if self.remote_evaluators:
                weights = ray.put(self.local_evaluator.get_weights())
                for e in self.remote_evaluators:
                    e.set_weights.remote(weights)

        with self.sample_timer:
            if self.remote_evaluators:
                samples = SampleBatch.concat_samples(
                    ray.get(
                        [e.sample.remote() for e in self.remote_evaluators]))
            else:
                samples = self.local_evaluator.sample()
            assert isinstance(samples, SampleBatch)

        with self.load_timer:
            tuples_per_device = self.par_opt.load_data(
                self.local_evaluator.sess,
                samples.columns([key for key, _ in self.loss_inputs]))

        with self.grad_timer:
            # Constant across passes, so computed once.
            num_batches = (
                int(tuples_per_device) // int(self.per_device_batch_size))
            for _ in range(self.num_sgd_iter):
                # Visit the minibatches in a fresh random order each pass.
                permutation = np.random.permutation(num_batches)
                for batch_index in range(num_batches):
                    # TODO(ekl) support ppo's debugging features, e.g.
                    # printing the current loss and tracing
                    self.par_opt.optimize(
                        self.sess,
                        permutation[batch_index] * self.per_device_batch_size)

        self.num_steps_sampled += samples.count
        self.num_steps_trained += samples.count

    def stats(self):
        """Return the base optimizer stats plus mean timer values in ms."""
        # The base method is called unbound, so `self` must be passed
        # explicitly.
        return dict(Optimizer.stats(self), **{
            "sample_time_ms": round(1000 * self.sample_timer.mean, 3),
            "load_time_ms": round(1000 * self.load_timer.mean, 3),
            "grad_time_ms": round(1000 * self.grad_timer.mean, 3),
            "update_time_ms": round(1000 * self.update_weights_timer.mean, 3),
        })
class PPOEvaluator(PolicyEvaluator):
    """Runner class that holds the simulator environment and the policy.

    Initializes the tensorflow graphs for both training and evaluation.
    One common policy graph is initialized on '/cpu:0' and holds all the
    shared network weights. When run as a remote agent, only this graph is
    used.
    """

    def __init__(self, env_id, config, logdir, is_remote):
        self.is_remote = is_remote
        if is_remote:
            # Hide all GPUs from remote evaluators.
            os.environ["CUDA_VISIBLE_DEVICES"] = ""
        devices = ["/cpu:0"] if is_remote else config["devices"]
        self.devices = devices
        self.config = config
        self.logdir = logdir

        # NOTE: the environment is wrapped with the Atari pixel preprocessor
        # directly rather than via ModelCatalog.get_preprocessor_as_wrapper.
        raw_env = gym.make(env_id)
        self.env = _RLlibPreprocessorWrapper(
            raw_env,
            AtariPixelPreprocessor(raw_env.observation_space, config["model"]))

        # Remote evaluators use a default session config.
        session_args = {} if is_remote else config["tf_session_args"]
        self.sess = tf.Session(config=tf.ConfigProto(**session_args))
        if config["tf_debug_inf_or_nan"] and not is_remote:
            self.sess = tf_debug.LocalCLIDebugWrapperSession(self.sess)
            self.sess.add_tensor_filter(
                "has_inf_or_nan", tf_debug.has_inf_or_nan)

        def float_vector():
            # 1-D float32 placeholder of unknown length.
            return tf.placeholder(tf.float32, shape=(None,))

        # Defines the training inputs. The creation order of the
        # placeholders below is kept fixed on purpose.
        # The coefficient of the KL penalty.
        self.kl_coeff = tf.placeholder(
            name="newkl", shape=(), dtype=tf.float32)
        self.e_kl_coeff = tf.placeholder(
            name="e_newkl", shape=(), dtype=tf.float32)

        # The input observations.
        self.observations = tf.placeholder(
            tf.float32, shape=(None,) + self.env.observation_space.shape)
        # Targets of the value function.
        self.value_targets = float_vector()
        # Advantage values in the policy gradient estimator.
        self.advantages = float_vector()
        # for explore
        self.e_value_targets = float_vector()
        self.e_advantages = float_vector()

        action_space = self.env.action_space
        self.actions = ModelCatalog.get_action_placeholder(action_space)
        self.e_actions = ModelCatalog.get_action_placeholder(action_space)
        self.distribution_class, self.logit_dim = (
            ModelCatalog.get_action_dist(action_space))

        # Log probabilities from the policy before the policy update.
        self.prev_logits = tf.placeholder(
            tf.float32, shape=(None, self.logit_dim))
        # Value function predictions before the policy update.
        self.prev_vf_preds = float_vector()
        # for explore
        self.e_prev_logits = tf.placeholder(
            tf.float32, shape=(None, self.logit_dim))
        self.e_prev_vf_preds = float_vector()

        if is_remote:
            self.batch_size = self.per_device_batch_size = (
                config["rollout_batchsize"])
        else:
            # Round the SGD batch size down to a multiple of the device
            # count so it splits evenly.
            num_devices = len(devices)
            self.batch_size = (
                int(config["sgd_batchsize"] / num_devices) * num_devices)
            assert self.batch_size % num_devices == 0
            self.per_device_batch_size = int(self.batch_size / num_devices)

        def build_loss(obs, vtargets, advs, acts, plog, pvf_preds,
                       e_vtargets, e_advs, e_plog, e_pvf_preds):
            return ProximalPolicyLoss(
                self.env.observation_space, self.env.action_space,
                obs, vtargets, advs, acts, plog, pvf_preds,
                e_vtargets, e_advs, e_plog, e_pvf_preds,
                self.logit_dim, self.kl_coeff, self.distribution_class,
                self.config, self.sess)

        # Must stay in the same order as the build_loss parameters.
        loss_input_placeholders = [
            self.observations,
            self.value_targets,
            self.advantages,
            self.actions,
            self.prev_logits,
            self.prev_vf_preds,
            self.e_value_targets,
            self.e_advantages,
            self.e_prev_logits,
            self.e_prev_vf_preds,
        ]
        self.par_opt = LocalSyncParallelOptimizer(
            tf.train.AdamOptimizer(self.config["sgd_stepsize"]),
            self.devices,
            loss_input_placeholders,
            self.per_device_batch_size,
            build_loss,
            self.logdir)

        # References to the model weights
        self.common_policy = self.par_opt.get_common_loss()
        self.variables = ray.experimental.TensorFlowVariables(
            self.common_policy.loss, self.sess)

        self.obs_filter = get_filter(
            config["observation_filter"], self.env.observation_space.shape)
        self.rew_filter = MeanStdFilter((), clip=5.0)
        self.e_rew_filter = MeanStdFilter((), clip=5.0)
        self.filters = {
            "obs_filter": self.obs_filter,
            "rew_filter": self.rew_filter,
            "e_rew_filter": self.e_rew_filter,
        }
        self.sampler = SyncSampler(
            self.env, self.common_policy, self.obs_filter,
            self.config["horizon"], self.config["horizon"])

        # The initializer op is only created here; it is executed when
        # run_init_op() is called.
        self.init_op = tf.global_variables_initializer()

    def run_init_op(self):
        """Run the variable initializer op created in the constructor."""
        self.sess.run(self.init_op)

    def load_data(self, trajectories, full_trace):
        """Load trajectory columns into the parallel optimizer.

        When config["use_gae"] is false, the value-target and
        value-prediction inputs are replaced by zeros shaped like the
        advantages column.
        """
        use_gae = self.config["use_gae"]
        zeros = np.zeros_like(trajectories["advantages"])

        def gae_column(key):
            # Only read the column when GAE is enabled.
            return trajectories[key] if use_gae else zeros

        columns = [
            trajectories["observations"],
            gae_column("value_targets"),
            trajectories["advantages"],
            trajectories["actions"],
            trajectories["logprobs"],
            gae_column("vf_preds"),
            gae_column("e_value_targets"),
            trajectories["e_advantages"],
            trajectories["e_logprobs"],
            gae_column("e_vf_preds"),
        ]
        return self.par_opt.load_data(
            self.sess, columns, full_trace=full_trace)

    def run_sgd_minibatch(
            self, batch_index, kl_coeff, full_trace, file_writer):
        """Run one optimizer step on the minibatch at `batch_index`."""
        # The file writer is only forwarded when full tracing is requested.
        writer = file_writer if full_trace else None
        # No extra ops are fetched alongside the optimization step.
        return self.par_opt.optimize(
            self.sess,
            batch_index,
            extra_ops=[],
            extra_feed_dict={self.kl_coeff: kl_coeff},
            file_writer=writer)

    def compute_gradients(self, samples):
        raise NotImplementedError

    def apply_gradients(self, grads):
        raise NotImplementedError

    def save(self):
        """Return the pickled filter state, flushing filter buffers."""
        return pickle.dumps({"filters": self.get_filters(flush_after=True)})

    def restore(self, objs):
        """Restore filter state from bytes produced by `save`."""
        # NOTE(review): pickle.loads can execute arbitrary code; `objs`
        # must come from a trusted source.
        state = pickle.loads(objs)
        self.sync_filters(state["filters"])

    def get_weights(self):
        return self.variables.get_weights()

    def set_weights(self, weights):
        self.variables.set_weights(weights)

    def sample(self):
        """Returns experience samples from this Evaluator.

        Rollouts are collected until at least
        config["min_steps_per_task"] steps have been gathered.

        NOTE(review): an earlier docstring stated that the observation and
        reward filters are flushed here; no flush is visible in this
        method -- confirm.

        Returns:
            SampleBatch: A columnar batch of experiences.
        """
        batches = []
        steps_collected = 0
        while steps_collected < self.config["min_steps_per_task"]:
            batch = process_rollout(
                self.sampler.get_data(),
                self.rew_filter,
                self.e_rew_filter,
                self.config["gamma"],
                self.config["lambda"],
                use_gae=self.config["use_gae"])
            batches.append(batch)
            steps_collected += batch.count
        return SampleBatch.concat_samples(batches)

    def sync_filters(self, new_filters):
        """Changes self's filter to given and rebases any accumulated delta.

        Args:
            new_filters (dict): Filters with new state to update local copy.
        """
        missing = [name for name in self.filters if name not in new_filters]
        assert not missing
        for name, current in self.filters.items():
            current.sync(new_filters[name])

    def get_filters(self, flush_after=False):
        """Returns a snapshot of filters.

        Args:
            flush_after (bool): Clears the filter buffer state.

        Returns:
            return_filters (dict): Dict for serializable filters
        """
        snapshot = {}
        for name, current in self.filters.items():
            # Take the snapshot before this filter's buffer is cleared.
            snapshot[name] = current.as_serializable()
            if flush_after:
                current.clear_buffer()
        return snapshot

    def get_evaluate_metrics(self):
        """Run config['num_evaluation'] episodes and average their metrics.

        Returns:
            tuple: Ten means, in order: policy loss, value loss, entropy,
                kl, the four matching "e_" values, episode reward and
                episode length.
        """
        # Pairs of (summary feed name, sample batch column).
        feed_columns = (
            ('observations', "observations"),
            ('value_targets', "value_targets"),
            ('advantages', "advantages"),
            ('actions', "actions"),
            ('prev_logits', "logprobs"),
            ('prev_vf_preds', "vf_preds"),
            ('e_value_targets', "e_value_targets"),
            ('e_advantages', "e_advantages"),
            ('e_prev_logits', "e_logprobs"),
            ('e_prev_vf_preds', "e_vf_preds"),
        )
        # One accumulator per returned metric.
        metric_columns = [[] for _ in range(10)]

        for _ in range(self.config['num_evaluation']):
            rollout, episode_reward, episode_length = (
                self.sampler.get_episode_data())
            samples = process_rollout(
                rollout,
                self.rew_filter,
                self.e_rew_filter,
                self.config["gamma"],
                self.config["lambda"],
                use_gae=self.config["use_gae"])
            feed_dict = {
                feed_name: samples[column]
                for feed_name, column in feed_columns
            }
            (policy_loss, value_loss, entropy, kl,
             e_policy_loss, e_value_loss, e_entropy, e_kl) = (
                self.common_policy.get_summary_data(feed_dict))
            episode_metrics = (
                policy_loss, value_loss, entropy, kl,
                e_policy_loss, e_value_loss, e_entropy, e_kl,
                episode_reward, episode_length)
            for accumulator, value in zip(metric_columns, episode_metrics):
                accumulator.append(value)

        return tuple(np.mean(accumulator) for accumulator in metric_columns)