class DDPGEvaluator(PolicyEvaluator):
    def __init__(self, registry, env_creator, config):
        self.env = ModelCatalog.get_preprocessor_as_wrapper(
            registry, env_creator(config["env_config"]))

        # contains model, target_model
        self.model = DDPGModel(registry, self.env, config)

        self.sampler = SyncSampler(
            self.env, self.model.model, NoFilter(),
            config["num_local_steps"], horizon=config["horizon"])

    def sample(self):
        """Returns a batch of samples."""
        rollout = self.sampler.get_data()
        rollout.data["weights"] = np.ones_like(rollout.data["rewards"])

        # since each sample is one step, no discounting needs to be applied;
        # this does not involve config["gamma"]
        samples = process_rollout(
            rollout, NoFilter(), gamma=1.0, use_gae=False)
        return samples

    def update_target(self):
        """Updates target critic and target actor."""
        self.model.update_target()

    def compute_gradients(self, samples):
        """Returns critic, actor gradients."""
        return self.model.compute_gradients(samples)

    def apply_gradients(self, grads):
        """Applies gradients to evaluator weights."""
        self.model.apply_gradients(grads)

    def compute_apply(self, samples):
        grads, _ = self.compute_gradients(samples)
        self.apply_gradients(grads)

    def get_weights(self):
        """Returns model weights."""
        return self.model.get_weights()

    def set_weights(self, weights):
        """Sets model weights."""
        self.model.set_weights(weights)

    def get_completed_rollout_metrics(self):
        """Returns metrics on previously completed rollouts.

        Calling this clears the queue of completed rollout metrics.
        """
        return self.sampler.get_metrics()
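# Usage sketch (illustrative, not part of the original class): a minimal
# synchronous DDPG loop, assuming a `registry`, an `env_creator`, and a
# `config` dict with the keys referenced above ("env_config",
# "num_local_steps", "horizon") are available. The function name and loop
# structure are hypothetical; only the method calls come from the class.
def ddpg_train_loop(registry, env_creator, config, num_iters):
    evaluator = DDPGEvaluator(registry, env_creator, config)
    for _ in range(num_iters):
        samples = evaluator.sample()      # one-step transitions, gamma=1.0
        evaluator.compute_apply(samples)  # compute and apply critic/actor grads
        evaluator.update_target()         # sync target critic and target actor
    return evaluator.get_completed_rollout_metrics()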
class PGEvaluator(Evaluator):
    """Evaluator for simple policy gradient."""

    def __init__(self, registry, env_creator, config):
        self.env = ModelCatalog.get_preprocessor_as_wrapper(
            registry, env_creator(config["env_config"]), config["model"])
        self.config = config

        self.policy = PGPolicy(registry, self.env.observation_space,
                               self.env.action_space, config)
        self.sampler = SyncSampler(
            self.env, self.policy, NoFilter(),
            config["batch_size"], horizon=config["horizon"])

    def sample(self):
        rollout = self.sampler.get_data()
        samples = process_rollout(
            rollout, NoFilter(), gamma=self.config["gamma"], use_gae=False)
        return samples

    def get_completed_rollout_metrics(self):
        """Returns metrics on previously completed rollouts.

        Calling this clears the queue of completed rollout metrics.
        """
        return self.sampler.get_metrics()

    def compute_gradients(self, samples):
        """Returns gradient w.r.t. samples."""
        gradient, info = self.policy.compute_gradients(samples)
        return gradient

    def apply_gradients(self, grads):
        """Applies gradients to evaluator weights."""
        self.policy.apply_gradients(grads)

    def get_weights(self):
        """Returns model weights."""
        return self.policy.get_weights()

    def set_weights(self, weights):
        """Sets model weights."""
        return self.policy.set_weights(weights)
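# Usage sketch: one way an optimizer could drive PGEvaluator instances,
# assuming a local evaluator and a remote one whose weights are kept in sync.
# The `local_ev`/`remote_ev` names and the helper itself are hypothetical;
# the method calls match the class above.
def pg_sync_step(local_ev, remote_ev):
    remote_ev.set_weights(local_ev.get_weights())  # broadcast current policy
    samples = remote_ev.sample()                   # rollout under synced policy
    grads = remote_ev.compute_gradients(samples)   # policy-gradient estimate
    local_ev.apply_gradients(grads)                # update the local copy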
class PPOEvaluator(Evaluator):
    """Runner class that holds the simulator environment and the policy.

    Initializes the tensorflow graphs for both training and evaluation.
    One common policy graph is initialized on '/cpu:0' and holds all the
    shared network weights. When run as a remote agent, only this graph is
    used.
    """

    def __init__(self, registry, env_creator, config, logdir, is_remote):
        self.registry = registry
        self.is_remote = is_remote
        if is_remote:
            os.environ["CUDA_VISIBLE_DEVICES"] = ""
            devices = ["/cpu:0"]
        else:
            devices = config["devices"]
        self.devices = devices
        self.config = config
        self.logdir = logdir
        self.env = ModelCatalog.get_preprocessor_as_wrapper(
            registry, env_creator(config["env_config"]), config["model"])
        if is_remote:
            config_proto = tf.ConfigProto()
        else:
            config_proto = tf.ConfigProto(**config["tf_session_args"])
        self.sess = tf.Session(config=config_proto)
        if config["tf_debug_inf_or_nan"] and not is_remote:
            self.sess = tf_debug.LocalCLIDebugWrapperSession(self.sess)
            self.sess.add_tensor_filter(
                "has_inf_or_nan", tf_debug.has_inf_or_nan)

        # Defines the training inputs:
        # The coefficient of the KL penalty.
        self.kl_coeff = tf.placeholder(
            name="newkl", shape=(), dtype=tf.float32)
        # The input observations.
        self.observations = tf.placeholder(
            tf.float32, shape=(None,) + self.env.observation_space.shape)
        # Targets of the value function.
        self.value_targets = tf.placeholder(tf.float32, shape=(None,))
        # Advantage values in the policy gradient estimator.
        self.advantages = tf.placeholder(tf.float32, shape=(None,))

        action_space = self.env.action_space
        # TODO(rliaw): pull this into model_catalog
        if isinstance(action_space, gym.spaces.Box):
            self.actions = tf.placeholder(
                tf.float32, shape=(None, action_space.shape[0]))
        elif isinstance(action_space, gym.spaces.Discrete):
            self.actions = tf.placeholder(tf.int64, shape=(None,))
        else:
            raise NotImplementedError(
                "action space " + str(type(action_space)) +
                " currently not supported")
        self.distribution_class, self.logit_dim = ModelCatalog.get_action_dist(
            action_space)
        # Log probabilities from the policy before the policy update.
        self.prev_logits = tf.placeholder(
            tf.float32, shape=(None, self.logit_dim))
        # Value function predictions before the policy update.
        self.prev_vf_preds = tf.placeholder(tf.float32, shape=(None,))

        assert config["sgd_batchsize"] % len(devices) == 0, \
            "Batch size must be evenly divisible by the number of devices"
        if is_remote:
            self.batch_size = config["rollout_batchsize"]
            self.per_device_batch_size = config["rollout_batchsize"]
        else:
            self.batch_size = config["sgd_batchsize"]
            self.per_device_batch_size = int(self.batch_size / len(devices))

        def build_loss(obs, vtargets, advs, acts, plog, pvf_preds):
            return ProximalPolicyLoss(
                self.env.observation_space, self.env.action_space,
                obs, vtargets, advs, acts, plog, pvf_preds, self.logit_dim,
                self.kl_coeff, self.distribution_class, self.config,
                self.sess, self.registry)

        self.par_opt = LocalSyncParallelOptimizer(
            tf.train.AdamOptimizer(self.config["sgd_stepsize"]),
            self.devices,
            [self.observations, self.value_targets, self.advantages,
             self.actions, self.prev_logits, self.prev_vf_preds],
            self.per_device_batch_size,
            build_loss,
            self.logdir)

        # Metric ops
        with tf.name_scope("test_outputs"):
            policies = self.par_opt.get_device_losses()
            self.mean_loss = tf.reduce_mean(
                tf.stack(values=[policy.loss for policy in policies]), 0)
            self.mean_policy_loss = tf.reduce_mean(
                tf.stack(values=[
                    policy.mean_policy_loss for policy in policies]), 0)
            self.mean_vf_loss = tf.reduce_mean(
                tf.stack(values=[
                    policy.mean_vf_loss for policy in policies]), 0)
            self.mean_kl = tf.reduce_mean(
                tf.stack(values=[policy.mean_kl for policy in policies]), 0)
            self.mean_entropy = tf.reduce_mean(
                tf.stack(values=[
                    policy.mean_entropy for policy in policies]), 0)

        # References to the model weights
        self.common_policy = self.par_opt.get_common_loss()
        self.variables = ray.experimental.TensorFlowVariables(
            self.common_policy.loss, self.sess)
        self.obs_filter = get_filter(
            config["observation_filter"], self.env.observation_space.shape)
        self.rew_filter = MeanStdFilter((), clip=5.0)
        self.filters = {
            "obs_filter": self.obs_filter,
            "rew_filter": self.rew_filter
        }
        self.sampler = SyncSampler(
            self.env, self.common_policy, self.obs_filter,
            self.config["horizon"], self.config["horizon"])
        self.sess.run(tf.global_variables_initializer())

    def load_data(self, trajectories, full_trace):
        use_gae = self.config["use_gae"]
        dummy = np.zeros_like(trajectories["advantages"])
        return self.par_opt.load_data(
            self.sess,
            [trajectories["observations"],
             trajectories["value_targets"] if use_gae else dummy,
             trajectories["advantages"],
             trajectories["actions"],
             trajectories["logprobs"],
             trajectories["vf_preds"] if use_gae else dummy],
            full_trace=full_trace)

    def run_sgd_minibatch(
            self, batch_index, kl_coeff, full_trace, file_writer):
        return self.par_opt.optimize(
            self.sess,
            batch_index,
            extra_ops=[
                self.mean_loss, self.mean_policy_loss, self.mean_vf_loss,
                self.mean_kl, self.mean_entropy],
            extra_feed_dict={self.kl_coeff: kl_coeff},
            file_writer=file_writer if full_trace else None)

    def compute_gradients(self, samples):
        raise NotImplementedError

    def apply_gradients(self, grads):
        raise NotImplementedError

    def save(self):
        filters = self.get_filters(flush_after=True)
        return pickle.dumps({"filters": filters})

    def restore(self, objs):
        objs = pickle.loads(objs)
        self.sync_filters(objs["filters"])

    def get_weights(self):
        return self.variables.get_weights()

    def set_weights(self, weights):
        self.variables.set_weights(weights)

    def sample(self):
        """Returns experience samples from this Evaluator.

        Observation filter and reward filters are flushed here.

        Returns:
            SampleBatch: A columnar batch of experiences.
        """
        num_steps_so_far = 0
        all_samples = []

        while num_steps_so_far < self.config["min_steps_per_task"]:
            rollout = self.sampler.get_data()
            samples = process_rollout(
                rollout, self.rew_filter, self.config["gamma"],
                self.config["lambda"], use_gae=self.config["use_gae"])
            num_steps_so_far += samples.count
            all_samples.append(samples)
        return SampleBatch.concat_samples(all_samples)

    def get_completed_rollout_metrics(self):
        """Returns metrics on previously completed rollouts.

        Calling this clears the queue of completed rollout metrics.
        """
        return self.sampler.get_metrics()

    def sync_filters(self, new_filters):
        """Changes self's filter to given and rebases any accumulated delta.

        Args:
            new_filters (dict): Filters with new state to update local copy.
        """
        assert all(k in new_filters for k in self.filters)
        for k in self.filters:
            self.filters[k].sync(new_filters[k])

    def get_filters(self, flush_after=False):
        """Returns a snapshot of filters.

        Args:
            flush_after (bool): Clears the filter buffer state.

        Returns:
            return_filters (dict): Dict for serializable filters
        """
        return_filters = {}
        for k, f in self.filters.items():
            return_filters[k] = f.as_serializable()
            if flush_after:
                f.clear_buffer()
        return return_filters
class PPOEvaluator(TFMultiGPUSupport):
    """Runner class that holds the simulator environment and the policy.

    Initializes the tensorflow graphs for both training and evaluation.
    One common policy graph is initialized on '/cpu:0' and holds all the
    shared network weights. When run as a remote agent, only this graph is
    used.
    """

    def __init__(self, registry, env_creator, config, logdir, is_remote):
        self.registry = registry
        self.config = config
        self.logdir = logdir
        self.env = ModelCatalog.get_preprocessor_as_wrapper(
            registry, env_creator(config["env_config"]), config["model"])
        if is_remote:
            config_proto = tf.ConfigProto()
        else:
            config_proto = tf.ConfigProto(**config["tf_session_args"])
        self.sess = tf.Session(config=config_proto)
        self.kl_coeff_val = self.config["kl_coeff"]
        self.kl_target = self.config["kl_target"]

        # Defines the training inputs:
        # The coefficient of the KL penalty.
        self.kl_coeff = tf.placeholder(
            name="newkl", shape=(), dtype=tf.float32)
        # The input observations.
        self.observations = tf.placeholder(
            tf.float32, shape=(None,) + self.env.observation_space.shape)
        # Targets of the value function.
        self.value_targets = tf.placeholder(tf.float32, shape=(None,))
        # Advantage values in the policy gradient estimator.
        self.advantages = tf.placeholder(tf.float32, shape=(None,))

        action_space = self.env.action_space
        self.actions = ModelCatalog.get_action_placeholder(action_space)
        self.distribution_class, self.logit_dim = ModelCatalog.get_action_dist(
            action_space, config["model"])
        # Log probabilities from the policy before the policy update.
        self.prev_logits = tf.placeholder(
            tf.float32, shape=(None, self.logit_dim))
        # Value function predictions before the policy update.
        self.prev_vf_preds = tf.placeholder(tf.float32, shape=(None,))

        self.inputs = [
            ("obs", self.observations),
            ("value_targets", self.value_targets),
            ("advantages", self.advantages),
            ("actions", self.actions),
            ("logprobs", self.prev_logits),
            ("vf_preds", self.prev_vf_preds)
        ]
        self.common_policy = self.build_tf_loss([ph for _, ph in self.inputs])

        # References to the model weights
        self.variables = ray.experimental.TensorFlowVariables(
            self.common_policy.loss, self.sess)
        self.obs_filter = get_filter(
            config["observation_filter"], self.env.observation_space.shape)
        self.rew_filter = MeanStdFilter((), clip=5.0)
        self.filters = {
            "obs_filter": self.obs_filter,
            "rew_filter": self.rew_filter
        }
        self.sampler = SyncSampler(
            self.env, self.common_policy, self.obs_filter,
            self.config["horizon"], self.config["horizon"])

    def tf_loss_inputs(self):
        return self.inputs

    def build_tf_loss(self, input_placeholders):
        obs, vtargets, advs, acts, plog, pvf_preds = input_placeholders
        return ProximalPolicyGraph(
            self.env.observation_space, self.env.action_space,
            obs, vtargets, advs, acts, plog, pvf_preds, self.logit_dim,
            self.kl_coeff, self.distribution_class, self.config,
            self.sess, self.registry)

    def init_extra_ops(self, device_losses):
        self.extra_ops = OrderedDict()
        with tf.name_scope("test_outputs"):
            policies = device_losses
            self.extra_ops["loss"] = tf.reduce_mean(
                tf.stack(values=[policy.loss for policy in policies]), 0)
            self.extra_ops["policy_loss"] = tf.reduce_mean(
                tf.stack(values=[
                    policy.mean_policy_loss for policy in policies]), 0)
            self.extra_ops["vf_loss"] = tf.reduce_mean(
                tf.stack(values=[
                    policy.mean_vf_loss for policy in policies]), 0)
            self.extra_ops["kl"] = tf.reduce_mean(
                tf.stack(values=[policy.mean_kl for policy in policies]), 0)
            self.extra_ops["entropy"] = tf.reduce_mean(
                tf.stack(values=[
                    policy.mean_entropy for policy in policies]), 0)

    def extra_apply_grad_fetches(self):
        return list(self.extra_ops.values())

    def extra_apply_grad_feed_dict(self):
        return {self.kl_coeff: self.kl_coeff_val}

    def update_kl(self, sampled_kl):
        if sampled_kl > 2.0 * self.kl_target:
            self.kl_coeff_val *= 1.5
        elif sampled_kl < 0.5 * self.kl_target:
            self.kl_coeff_val *= 0.5

    def save(self):
        filters = self.get_filters(flush_after=True)
        return pickle.dumps({
            "filters": filters,
            "kl_coeff_val": self.kl_coeff_val,
            "kl_target": self.kl_target,
        })

    def restore(self, objs):
        objs = pickle.loads(objs)
        self.sync_filters(objs["filters"])
        self.kl_coeff_val = objs["kl_coeff_val"]
        self.kl_target = objs["kl_target"]

    def get_weights(self):
        return self.variables.get_weights()

    def set_weights(self, weights):
        self.variables.set_weights(weights)

    def sample(self):
        """Returns experience samples from this Evaluator.

        Observation filter and reward filters are flushed here.

        Returns:
            SampleBatch: A columnar batch of experiences.
        """
        num_steps_so_far = 0
        all_samples = []

        while num_steps_so_far < self.config["min_steps_per_task"]:
            rollout = self.sampler.get_data()
            last_r = 0.0  # note: not needed since we don't truncate rollouts
            samples = compute_advantages(
                rollout, last_r, self.config["gamma"],
                self.config["lambda"], use_gae=self.config["use_gae"])
            num_steps_so_far += samples.count
            all_samples.append(samples)
        return SampleBatch.concat_samples(all_samples)

    def get_completed_rollout_metrics(self):
        """Returns metrics on previously completed rollouts.

        Calling this clears the queue of completed rollout metrics.
        """
        return self.sampler.get_metrics()

    def sync_filters(self, new_filters):
        """Changes self's filter to given and rebases any accumulated delta.

        Args:
            new_filters (dict): Filters with new state to update local copy.
        """
        assert all(k in new_filters for k in self.filters)
        for k in self.filters:
            self.filters[k].sync(new_filters[k])

    def get_filters(self, flush_after=False):
        """Returns a snapshot of filters.

        Args:
            flush_after (bool): Clears the filter buffer state.

        Returns:
            return_filters (dict): Dict for serializable filters
        """
        return_filters = {}
        for k, f in self.filters.items():
            return_filters[k] = f.as_serializable()
            if flush_after:
                f.clear_buffer()
        return return_filters
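# Usage sketch: checkpointing the non-TF state of this evaluator. save()
# pickles the filter snapshots together with the adaptive KL state, while
# model weights travel separately through get_weights()/set_weights(). The
# helper below is hypothetical and only shows the round trip between two
# instances using methods defined in the class above.
def checkpoint_roundtrip(src_ev, dst_ev):
    blob = src_ev.save()                      # filters + kl_coeff_val/kl_target
    dst_ev.set_weights(src_ev.get_weights())  # network weights
    dst_ev.restore(blob)                      # filters and KL state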
class Runner(object):
    """Runner class that holds the simulator environment and the policy.

    Initializes the tensorflow graphs for both training and evaluation.
    One common policy graph is initialized on '/cpu:0' and holds all the
    shared network weights. When run as a remote agent, only this graph is
    used.
    """

    def __init__(self, env_creator, config, logdir, is_remote):
        self.is_remote = is_remote
        if is_remote:
            os.environ["CUDA_VISIBLE_DEVICES"] = ""
            devices = ["/cpu:0"]
        else:
            devices = config["devices"]
        self.devices = devices
        self.config = config
        self.logdir = logdir
        self.env = create_and_wrap(env_creator, config["model"])
        if is_remote:
            config_proto = tf.ConfigProto()
        else:
            config_proto = tf.ConfigProto(**config["tf_session_args"])
        self.sess = tf.Session(config=config_proto)
        if config["tf_debug_inf_or_nan"] and not is_remote:
            self.sess = tf_debug.LocalCLIDebugWrapperSession(self.sess)
            self.sess.add_tensor_filter(
                "has_inf_or_nan", tf_debug.has_inf_or_nan)

        # Defines the training inputs:
        # The coefficient of the KL penalty.
        self.kl_coeff = tf.placeholder(
            name="newkl", shape=(), dtype=tf.float32)
        # The input observations.
        self.observations = tf.placeholder(
            tf.float32, shape=(None,) + self.env.observation_space.shape)
        # Targets of the value function.
        self.value_targets = tf.placeholder(tf.float32, shape=(None,))
        # Advantage values in the policy gradient estimator.
        self.advantages = tf.placeholder(tf.float32, shape=(None,))

        action_space = self.env.action_space
        # TODO(rliaw): pull this into model_catalog
        if isinstance(action_space, gym.spaces.Box):
            self.actions = tf.placeholder(
                tf.float32, shape=(None, action_space.shape[0]))
        elif isinstance(action_space, gym.spaces.Discrete):
            self.actions = tf.placeholder(tf.int64, shape=(None,))
        else:
            raise NotImplementedError(
                "action space " + str(type(action_space)) +
                " currently not supported")
        self.distribution_class, self.logit_dim = ModelCatalog.get_action_dist(
            action_space)
        # Log probabilities from the policy before the policy update.
        self.prev_logits = tf.placeholder(
            tf.float32, shape=(None, self.logit_dim))
        # Value function predictions before the policy update.
        self.prev_vf_preds = tf.placeholder(tf.float32, shape=(None,))

        assert config["sgd_batchsize"] % len(devices) == 0, \
            "Batch size must be evenly divisible by the number of devices"
        if is_remote:
            self.batch_size = config["rollout_batchsize"]
            self.per_device_batch_size = config["rollout_batchsize"]
        else:
            self.batch_size = config["sgd_batchsize"]
            self.per_device_batch_size = int(self.batch_size / len(devices))

        def build_loss(obs, vtargets, advs, acts, plog, pvf_preds):
            return ProximalPolicyLoss(
                self.env.observation_space, self.env.action_space,
                obs, vtargets, advs, acts, plog, pvf_preds, self.logit_dim,
                self.kl_coeff, self.distribution_class, self.config,
                self.sess)

        self.par_opt = LocalSyncParallelOptimizer(
            tf.train.AdamOptimizer(self.config["sgd_stepsize"]),
            self.devices,
            [self.observations, self.value_targets, self.advantages,
             self.actions, self.prev_logits, self.prev_vf_preds],
            self.per_device_batch_size,
            build_loss,
            self.logdir)

        # Metric ops
        with tf.name_scope("test_outputs"):
            policies = self.par_opt.get_device_losses()
            self.mean_loss = tf.reduce_mean(
                tf.stack(values=[policy.loss for policy in policies]), 0)
            self.mean_policy_loss = tf.reduce_mean(
                tf.stack(values=[
                    policy.mean_policy_loss for policy in policies]), 0)
            self.mean_vf_loss = tf.reduce_mean(
                tf.stack(values=[
                    policy.mean_vf_loss for policy in policies]), 0)
            self.mean_kl = tf.reduce_mean(
                tf.stack(values=[policy.mean_kl for policy in policies]), 0)
            self.mean_entropy = tf.reduce_mean(
                tf.stack(values=[
                    policy.mean_entropy for policy in policies]), 0)

        # References to the model weights
        self.common_policy = self.par_opt.get_common_loss()
        self.variables = ray.experimental.TensorFlowVariables(
            self.common_policy.loss, self.sess)
        obs_filter = get_filter(
            config["observation_filter"], self.env.observation_space.shape)
        self.sampler = SyncSampler(
            self.env, self.common_policy, obs_filter,
            self.config["horizon"], self.config["horizon"])
        self.reward_filter = MeanStdFilter((), clip=5.0)
        self.sess.run(tf.global_variables_initializer())

    def load_data(self, trajectories, full_trace):
        use_gae = self.config["use_gae"]
        dummy = np.zeros_like(trajectories["advantages"])
        return self.par_opt.load_data(
            self.sess,
            [trajectories["observations"],
             trajectories["value_targets"] if use_gae else dummy,
             trajectories["advantages"],
             trajectories["actions"].squeeze(),
             trajectories["logprobs"],
             trajectories["vf_preds"] if use_gae else dummy],
            full_trace=full_trace)

    def run_sgd_minibatch(
            self, batch_index, kl_coeff, full_trace, file_writer):
        return self.par_opt.optimize(
            self.sess,
            batch_index,
            extra_ops=[
                self.mean_loss, self.mean_policy_loss, self.mean_vf_loss,
                self.mean_kl, self.mean_entropy],
            extra_feed_dict={self.kl_coeff: kl_coeff},
            file_writer=file_writer if full_trace else None)

    def save(self):
        obs_filter = self.sampler.get_obs_filter()
        return pickle.dumps([obs_filter, self.reward_filter])

    def restore(self, objs):
        objs = pickle.loads(objs)
        obs_filter = objs[0]
        rew_filter = objs[1]
        self.update_filters(obs_filter, rew_filter)

    def get_weights(self):
        return self.variables.get_weights()

    def load_weights(self, weights):
        self.variables.set_weights(weights)

    def update_filters(self, obs_filter=None, rew_filter=None):
        if rew_filter:
            # No special handling required since outside of threaded code
            self.reward_filter = rew_filter.copy()
        if obs_filter:
            self.sampler.update_obs_filter(obs_filter)

    def get_obs_filter(self):
        return self.sampler.get_obs_filter()

    def compute_steps(self, config, obs_filter, rew_filter):
        """Compute multiple rollouts and concatenate the results.

        Args:
            config: Configuration parameters.
            obs_filter: Function that is applied to each of the observations.
            rew_filter: Function that is applied to each of the rewards.

        Returns:
            trajectories: Concatenated trajectory data.
            total_rewards: Total rewards of the trajectories.
            trajectory_lengths: Lengths of the trajectories.
            updated_obs_filter: Observation filter with accumulated state.
            reward_filter: The current reward filter.
        """
        num_steps_so_far = 0
        trajectories = []
        self.update_filters(obs_filter, rew_filter)

        while num_steps_so_far < config["min_steps_per_task"]:
            rollout = self.sampler.get_data()
            trajectory = process_rollout(
                rollout, self.reward_filter, config["gamma"],
                config["lambda"], use_gae=config["use_gae"])
            num_steps_so_far += trajectory["rewards"].shape[0]
            trajectories.append(trajectory)
        metrics = self.sampler.get_metrics()
        total_rewards, trajectory_lengths = zip(*[
            (c.episode_reward, c.episode_length) for c in metrics])
        updated_obs_filter = self.sampler.get_obs_filter(flush=True)
        return (
            concatenate(trajectories),
            total_rewards,
            trajectory_lengths,
            updated_obs_filter,
            self.reward_filter)
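# Usage sketch: how a driver could fan compute_steps() out to remote Runner
# actors, assuming `runners` holds Ray actor handles wrapping this class and
# the driver owns the current filters. The helper is hypothetical glue; each
# result unpacks to (trajectories, total_rewards, trajectory_lengths,
# updated_obs_filter, reward_filter), matching the return value above.
def collect_trajectories(runners, config, obs_filter, rew_filter):
    return ray.get([
        runner.compute_steps.remote(config, obs_filter, rew_filter)
        for runner in runners])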
class PPOEvaluator(Evaluator):
    """Runner class that holds the simulator environment and the policy.

    Initializes the tensorflow graphs for both training and evaluation.
    One common policy graph is initialized on '/cpu:0' and holds all the
    shared network weights. When run as a remote agent, only this graph is
    used.
    """

    def __init__(self, registry, env_creator, config, logdir, is_remote):
        self.registry = registry
        self.is_remote = is_remote
        if is_remote:
            os.environ["CUDA_VISIBLE_DEVICES"] = ""
            devices = ["/cpu:0"]
        else:
            devices = config["devices"]
        self.devices = devices
        self.config = config
        self.logdir = logdir
        self.env = ModelCatalog.get_preprocessor_as_wrapper(
            registry, env_creator(config["env_config"]), config["model"])
        if is_remote:
            config_proto = tf.ConfigProto()
        else:
            config_proto = tf.ConfigProto(**config["tf_session_args"])
        self.sess = tf.Session(config=config_proto)
        if config["tf_debug_inf_or_nan"] and not is_remote:
            self.sess = tf_debug.LocalCLIDebugWrapperSession(self.sess)
            self.sess.add_tensor_filter(
                "has_inf_or_nan", tf_debug.has_inf_or_nan)

        # Defines the training inputs:
        # The coefficient of the KL penalty.
        self.kl_coeff = tf.placeholder(
            name="newkl", shape=(), dtype=tf.float32)
        # The input observations.
        self.observations = tf.placeholder(
            tf.float32, shape=(None,) + self.env.observation_space.shape)
        # Targets of the value function.
        self.value_targets = tf.placeholder(tf.float32, shape=(None,))
        # Advantage values in the policy gradient estimator.
        self.advantages = tf.placeholder(tf.float32, shape=(None,))

        action_space = self.env.action_space
        self.actions = ModelCatalog.get_action_placeholder(action_space)
        self.distribution_class, self.logit_dim = ModelCatalog.get_action_dist(
            action_space)
        # Log probabilities from the policy before the policy update.
        self.prev_logits = tf.placeholder(
            tf.float32, shape=(None, self.logit_dim))
        # Value function predictions before the policy update.
        self.prev_vf_preds = tf.placeholder(tf.float32, shape=(None,))

        if is_remote:
            self.batch_size = config["rollout_batchsize"]
            self.per_device_batch_size = config["rollout_batchsize"]
        else:
            self.batch_size = int(
                config["sgd_batchsize"] / len(devices)) * len(devices)
            assert self.batch_size % len(devices) == 0
            self.per_device_batch_size = int(self.batch_size / len(devices))

        def build_loss(obs, vtargets, advs, acts, plog, pvf_preds):
            return ProximalPolicyLoss(
                self.env.observation_space, self.env.action_space,
                obs, vtargets, advs, acts, plog, pvf_preds, self.logit_dim,
                self.kl_coeff, self.distribution_class, self.config,
                self.sess, self.registry)

        self.par_opt = LocalSyncParallelOptimizer(
            tf.train.AdamOptimizer(self.config["sgd_stepsize"]),
            self.devices,
            [self.observations, self.value_targets, self.advantages,
             self.actions, self.prev_logits, self.prev_vf_preds],
            self.per_device_batch_size,
            build_loss,
            self.logdir)

        # Metric ops
        with tf.name_scope("test_outputs"):
            policies = self.par_opt.get_device_losses()
            self.mean_loss = tf.reduce_mean(
                tf.stack(values=[policy.loss for policy in policies]), 0)
            self.mean_policy_loss = tf.reduce_mean(
                tf.stack(values=[
                    policy.mean_policy_loss for policy in policies]), 0)
            self.mean_vf_loss = tf.reduce_mean(
                tf.stack(values=[
                    policy.mean_vf_loss for policy in policies]), 0)
            self.mean_kl = tf.reduce_mean(
                tf.stack(values=[policy.mean_kl for policy in policies]), 0)
            self.mean_entropy = tf.reduce_mean(
                tf.stack(values=[
                    policy.mean_entropy for policy in policies]), 0)

        # References to the model weights
        self.common_policy = self.par_opt.get_common_loss()
        self.variables = ray.experimental.TensorFlowVariables(
            self.common_policy.loss, self.sess)
        self.obs_filter = get_filter(
            config["observation_filter"], self.env.observation_space.shape)
        self.rew_filter = MeanStdFilter((), clip=5.0)
        self.filters = {
            "obs_filter": self.obs_filter,
            "rew_filter": self.rew_filter
        }
        self.sampler = SyncSampler(
            self.env, self.common_policy, self.obs_filter,
            self.config["horizon"], self.config["horizon"])
        self.sess.run(tf.global_variables_initializer())

    def load_data(self, trajectories, full_trace):
        use_gae = self.config["use_gae"]
        dummy = np.zeros_like(trajectories["advantages"])
        return self.par_opt.load_data(
            self.sess,
            [trajectories["observations"],
             trajectories["value_targets"] if use_gae else dummy,
             trajectories["advantages"],
             trajectories["actions"],
             trajectories["logprobs"],
             trajectories["vf_preds"] if use_gae else dummy],
            full_trace=full_trace)

    def run_sgd_minibatch(
            self, batch_index, kl_coeff, full_trace, file_writer):
        return self.par_opt.optimize(
            self.sess,
            batch_index,
            extra_ops=[
                self.mean_loss, self.mean_policy_loss, self.mean_vf_loss,
                self.mean_kl, self.mean_entropy],
            extra_feed_dict={self.kl_coeff: kl_coeff},
            file_writer=file_writer if full_trace else None)

    def compute_gradients(self, samples):
        raise NotImplementedError

    def apply_gradients(self, grads):
        raise NotImplementedError

    def save(self):
        filters = self.get_filters(flush_after=True)
        return pickle.dumps({"filters": filters})

    def restore(self, objs):
        objs = pickle.loads(objs)
        self.sync_filters(objs["filters"])

    def get_weights(self):
        return self.variables.get_weights()

    def set_weights(self, weights):
        self.variables.set_weights(weights)

    def sample(self):
        """Returns experience samples from this Evaluator.

        Observation filter and reward filters are flushed here.

        Returns:
            SampleBatch: A columnar batch of experiences.
        """
        num_steps_so_far = 0
        all_samples = []

        while num_steps_so_far < self.config["min_steps_per_task"]:
            rollout = self.sampler.get_data()
            samples = process_rollout(
                rollout, self.rew_filter, self.config["gamma"],
                self.config["lambda"], use_gae=self.config["use_gae"])
            num_steps_so_far += samples.count
            all_samples.append(samples)
        return SampleBatch.concat_samples(all_samples)

    def get_completed_rollout_metrics(self):
        """Returns metrics on previously completed rollouts.

        Calling this clears the queue of completed rollout metrics.
        """
        return self.sampler.get_metrics()

    def sync_filters(self, new_filters):
        """Changes self's filter to given and rebases any accumulated delta.

        Args:
            new_filters (dict): Filters with new state to update local copy.
        """
        assert all(k in new_filters for k in self.filters)
        for k in self.filters:
            self.filters[k].sync(new_filters[k])

    def get_filters(self, flush_after=False):
        """Returns a snapshot of filters.

        Args:
            flush_after (bool): Clears the filter buffer state.

        Returns:
            return_filters (dict): Dict for serializable filters
        """
        return_filters = {}
        for k, f in self.filters.items():
            return_filters[k] = f.as_serializable()
            if flush_after:
                f.clear_buffer()
        return return_filters