class DDPGEvaluator(PolicyEvaluator):

    def __init__(self, registry, env_creator, config):
        self.env = ModelCatalog.get_preprocessor_as_wrapper(
            registry, env_creator(config["env_config"]))

        # contains model, target_model
        self.model = DDPGModel(registry, self.env, config)

        self.sampler = SyncSampler(
            self.env, self.model.model, NoFilter(),
            config["num_local_steps"], horizon=config["horizon"])

    def sample(self):
        """Returns a batch of samples."""
        rollout = self.sampler.get_data()
        rollout.data["weights"] = np.ones_like(rollout.data["rewards"])

        # since each sample is one step, no discounting needs to be applied;
        # this does not involve config["gamma"]
        samples = process_rollout(
            rollout, NoFilter(), gamma=1.0, use_gae=False)
        return samples

    def update_target(self):
        """Updates target critic and target actor."""
        self.model.update_target()

    def compute_gradients(self, samples):
        """Returns critic, actor gradients."""
        return self.model.compute_gradients(samples)

    def apply_gradients(self, grads):
        """Applies gradients to evaluator weights."""
        self.model.apply_gradients(grads)

    def compute_apply(self, samples):
        grads, _ = self.compute_gradients(samples)
        self.apply_gradients(grads)

    def get_weights(self):
        """Returns model weights."""
        return self.model.get_weights()

    def set_weights(self, weights):
        """Sets model weights."""
        self.model.set_weights(weights)

    def get_completed_rollout_metrics(self):
        """Returns metrics on previously completed rollouts.

        Calling this clears the queue of completed rollout metrics.
        """
        return self.sampler.get_metrics()
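# --- Usage sketch (not part of the original source) ---
# A minimal driver loop for the DDPGEvaluator above, assuming an already
# constructed evaluator instance. Only methods defined on the class are used;
# the iteration count is an illustrative assumption.
def ddpg_train_loop(evaluator, num_iterations=100):
    for _ in range(num_iterations):
        samples = evaluator.sample()      # one-step transitions with unit weights
        evaluator.compute_apply(samples)  # compute and apply critic/actor gradients
        evaluator.update_target()         # move target actor/critic toward the models
    return evaluator.get_completed_rollout_metrics()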
class PGEvaluator(Evaluator):
    """Evaluator for simple policy gradient."""

    def __init__(self, registry, env_creator, config):
        self.env = ModelCatalog.get_preprocessor_as_wrapper(
            registry, env_creator(config["env_config"]), config["model"])
        self.config = config

        self.policy = PGPolicy(registry, self.env.observation_space,
                               self.env.action_space, config)
        self.sampler = SyncSampler(
            self.env, self.policy, NoFilter(),
            config["batch_size"], horizon=config["horizon"])

    def sample(self):
        rollout = self.sampler.get_data()
        samples = process_rollout(
            rollout, NoFilter(), gamma=self.config["gamma"], use_gae=False)
        return samples

    def get_completed_rollout_metrics(self):
        """Returns metrics on previously completed rollouts.

        Calling this clears the queue of completed rollout metrics.
        """
        return self.sampler.get_metrics()

    def compute_gradients(self, samples):
        """Returns gradient w.r.t. samples."""
        gradient, info = self.policy.compute_gradients(samples)
        return gradient

    def apply_gradients(self, grads):
        """Applies gradients to evaluator weights."""
        self.policy.apply_gradients(grads)

    def get_weights(self):
        """Returns model weights."""
        return self.policy.get_weights()

    def set_weights(self, weights):
        """Sets model weights."""
        return self.policy.set_weights(weights)
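# --- Usage sketch (not part of the original source) ---
# The PGEvaluator is designed for a synchronous data-parallel loop: a local
# evaluator applies gradients computed by remote copies and broadcasts its
# weights back. The remote evaluator handles and the ray calls below are a
# hedged illustration of that pattern, not code from this file.
def pg_sync_train_step(local_evaluator, remote_evaluators):
    import ray
    weights = ray.put(local_evaluator.get_weights())
    for ev in remote_evaluators:
        ev.set_weights.remote(weights)
    gradients = ray.get(
        [ev.compute_gradients.remote(ev.sample.remote())
         for ev in remote_evaluators])
    for grad in gradients:
        local_evaluator.apply_gradients(grad)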
class PPOEvaluator(Evaluator):
    """
    Runner class that holds the simulator environment and the policy.

    Initializes the tensorflow graphs for both training and evaluation.
    One common policy graph is initialized on '/cpu:0' and holds all the
    shared network weights. When run as a remote agent, only this graph is
    used.
    """

    def __init__(self, registry, env_creator, config, logdir, is_remote):
        self.registry = registry
        self.is_remote = is_remote
        if is_remote:
            os.environ["CUDA_VISIBLE_DEVICES"] = ""
            devices = ["/cpu:0"]
        else:
            devices = config["devices"]
        self.devices = devices
        self.config = config
        self.logdir = logdir
        self.env = ModelCatalog.get_preprocessor_as_wrapper(
            registry, env_creator(config["env_config"]), config["model"])
        if is_remote:
            config_proto = tf.ConfigProto()
        else:
            config_proto = tf.ConfigProto(**config["tf_session_args"])
        self.sess = tf.Session(config=config_proto)
        if config["tf_debug_inf_or_nan"] and not is_remote:
            self.sess = tf_debug.LocalCLIDebugWrapperSession(self.sess)
            self.sess.add_tensor_filter(
                "has_inf_or_nan", tf_debug.has_inf_or_nan)

        # Defines the training inputs:
        # The coefficient of the KL penalty.
        self.kl_coeff = tf.placeholder(
            name="newkl", shape=(), dtype=tf.float32)
        # The input observations.
        self.observations = tf.placeholder(
            tf.float32, shape=(None,) + self.env.observation_space.shape)
        # Targets of the value function.
        self.value_targets = tf.placeholder(tf.float32, shape=(None,))
        # Advantage values in the policy gradient estimator.
        self.advantages = tf.placeholder(tf.float32, shape=(None,))

        action_space = self.env.action_space
        # TODO(rliaw): pull this into model_catalog
        if isinstance(action_space, gym.spaces.Box):
            self.actions = tf.placeholder(
                tf.float32, shape=(None, action_space.shape[0]))
        elif isinstance(action_space, gym.spaces.Discrete):
            self.actions = tf.placeholder(tf.int64, shape=(None,))
        else:
            raise NotImplementedError(
                "action space " + str(type(action_space)) +
                " currently not supported")
        self.distribution_class, self.logit_dim = ModelCatalog.get_action_dist(
            action_space)
        # Log probabilities from the policy before the policy update.
        self.prev_logits = tf.placeholder(
            tf.float32, shape=(None, self.logit_dim))
        # Value function predictions before the policy update.
        self.prev_vf_preds = tf.placeholder(tf.float32, shape=(None,))

        assert config["sgd_batchsize"] % len(devices) == 0, \
            "Batch size must be evenly divisible by devices"
        if is_remote:
            self.batch_size = config["rollout_batchsize"]
            self.per_device_batch_size = config["rollout_batchsize"]
        else:
            self.batch_size = config["sgd_batchsize"]
            self.per_device_batch_size = int(self.batch_size / len(devices))

        def build_loss(obs, vtargets, advs, acts, plog, pvf_preds):
            return ProximalPolicyLoss(
                self.env.observation_space, self.env.action_space,
                obs, vtargets, advs, acts, plog, pvf_preds, self.logit_dim,
                self.kl_coeff, self.distribution_class, self.config,
                self.sess, self.registry)

        self.par_opt = LocalSyncParallelOptimizer(
            tf.train.AdamOptimizer(self.config["sgd_stepsize"]),
            self.devices,
            [self.observations, self.value_targets, self.advantages,
             self.actions, self.prev_logits, self.prev_vf_preds],
            self.per_device_batch_size,
            build_loss,
            self.logdir)

        # Metric ops
        with tf.name_scope("test_outputs"):
            policies = self.par_opt.get_device_losses()
            self.mean_loss = tf.reduce_mean(
                tf.stack(values=[policy.loss for policy in policies]), 0)
            self.mean_policy_loss = tf.reduce_mean(
                tf.stack(
                    values=[policy.mean_policy_loss for policy in policies]),
                0)
            self.mean_vf_loss = tf.reduce_mean(
                tf.stack(
                    values=[policy.mean_vf_loss for policy in policies]), 0)
            self.mean_kl = tf.reduce_mean(
                tf.stack(values=[policy.mean_kl for policy in policies]), 0)
            self.mean_entropy = tf.reduce_mean(
                tf.stack(
                    values=[policy.mean_entropy for policy in policies]), 0)

        # References to the model weights
        self.common_policy = self.par_opt.get_common_loss()
        self.variables = ray.experimental.TensorFlowVariables(
            self.common_policy.loss, self.sess)
        self.obs_filter = get_filter(
            config["observation_filter"], self.env.observation_space.shape)
        self.rew_filter = MeanStdFilter((), clip=5.0)
        self.filters = {
            "obs_filter": self.obs_filter,
            "rew_filter": self.rew_filter
        }
        self.sampler = SyncSampler(
            self.env, self.common_policy, self.obs_filter,
            self.config["horizon"], self.config["horizon"])
        self.sess.run(tf.global_variables_initializer())

    def load_data(self, trajectories, full_trace):
        use_gae = self.config["use_gae"]
        dummy = np.zeros_like(trajectories["advantages"])
        return self.par_opt.load_data(
            self.sess,
            [trajectories["observations"],
             trajectories["value_targets"] if use_gae else dummy,
             trajectories["advantages"],
             trajectories["actions"],
             trajectories["logprobs"],
             trajectories["vf_preds"] if use_gae else dummy],
            full_trace=full_trace)

    def run_sgd_minibatch(self, batch_index, kl_coeff, full_trace,
                          file_writer):
        return self.par_opt.optimize(
            self.sess,
            batch_index,
            extra_ops=[
                self.mean_loss, self.mean_policy_loss, self.mean_vf_loss,
                self.mean_kl, self.mean_entropy],
            extra_feed_dict={self.kl_coeff: kl_coeff},
            file_writer=file_writer if full_trace else None)

    def compute_gradients(self, samples):
        raise NotImplementedError

    def apply_gradients(self, grads):
        raise NotImplementedError

    def save(self):
        filters = self.get_filters(flush_after=True)
        return pickle.dumps({"filters": filters})

    def restore(self, objs):
        objs = pickle.loads(objs)
        self.sync_filters(objs["filters"])

    def get_weights(self):
        return self.variables.get_weights()

    def set_weights(self, weights):
        self.variables.set_weights(weights)

    def sample(self):
        """Returns experience samples from this Evaluator.

        Observation filter and reward filters are flushed here.

        Returns:
            SampleBatch: A columnar batch of experiences.
        """
        num_steps_so_far = 0
        all_samples = []
        while num_steps_so_far < self.config["min_steps_per_task"]:
            rollout = self.sampler.get_data()
            samples = process_rollout(
                rollout, self.rew_filter, self.config["gamma"],
                self.config["lambda"], use_gae=self.config["use_gae"])
            num_steps_so_far += samples.count
            all_samples.append(samples)
        return SampleBatch.concat_samples(all_samples)

    def get_completed_rollout_metrics(self):
        """Returns metrics on previously completed rollouts.

        Calling this clears the queue of completed rollout metrics.
        """
        return self.sampler.get_metrics()

    def sync_filters(self, new_filters):
        """Changes self's filter to given and rebases any accumulated delta.

        Args:
            new_filters (dict): Filters with new state to update local copy.
        """
        assert all(k in new_filters for k in self.filters)
        for k in self.filters:
            self.filters[k].sync(new_filters[k])

    def get_filters(self, flush_after=False):
        """Returns a snapshot of filters.

        Args:
            flush_after (bool): Clears the filter buffer state.

        Returns:
            return_filters (dict): Dict for serializable filters
        """
        return_filters = {}
        for k, f in self.filters.items():
            return_filters[k] = f.as_serializable()
            if flush_after:
                f.clear_buffer()
        return return_filters
class PPOEvaluator(TFMultiGPUSupport): """ Runner class that holds the simulator environment and the policy. Initializes the tensorflow graphs for both training and evaluation. One common policy graph is initialized on '/cpu:0' and holds all the shared network weights. When run as a remote agent, only this graph is used. """ def __init__(self, registry, env_creator, config, logdir, is_remote): self.registry = registry self.config = config self.logdir = logdir self.env = ModelCatalog.get_preprocessor_as_wrapper( registry, env_creator(config["env_config"]), config["model"]) if is_remote: config_proto = tf.ConfigProto() else: config_proto = tf.ConfigProto(**config["tf_session_args"]) self.sess = tf.Session(config=config_proto) self.kl_coeff_val = self.config["kl_coeff"] self.kl_target = self.config["kl_target"] # Defines the training inputs: # The coefficient of the KL penalty. self.kl_coeff = tf.placeholder(name="newkl", shape=(), dtype=tf.float32) # The input observations. self.observations = tf.placeholder(tf.float32, shape=(None, ) + self.env.observation_space.shape) # Targets of the value function. self.value_targets = tf.placeholder(tf.float32, shape=(None, )) # Advantage values in the policy gradient estimator. self.advantages = tf.placeholder(tf.float32, shape=(None, )) action_space = self.env.action_space self.actions = ModelCatalog.get_action_placeholder(action_space) self.distribution_class, self.logit_dim = ModelCatalog.get_action_dist( action_space, config["model"]) # Log probabilities from the policy before the policy update. self.prev_logits = tf.placeholder(tf.float32, shape=(None, self.logit_dim)) # Value function predictions before the policy update. self.prev_vf_preds = tf.placeholder(tf.float32, shape=(None, )) self.inputs = [("obs", self.observations), ("value_targets", self.value_targets), ("advantages", self.advantages), ("actions", self.actions), ("logprobs", self.prev_logits), ("vf_preds", self.prev_vf_preds)] self.common_policy = self.build_tf_loss([ph for _, ph in self.inputs]) # References to the model weights self.variables = ray.experimental.TensorFlowVariables( self.common_policy.loss, self.sess) self.obs_filter = get_filter(config["observation_filter"], self.env.observation_space.shape) self.rew_filter = MeanStdFilter((), clip=5.0) self.filters = { "obs_filter": self.obs_filter, "rew_filter": self.rew_filter } self.sampler = SyncSampler(self.env, self.common_policy, self.obs_filter, self.config["horizon"], self.config["horizon"]) def tf_loss_inputs(self): return self.inputs def build_tf_loss(self, input_placeholders): obs, vtargets, advs, acts, plog, pvf_preds = input_placeholders return ProximalPolicyGraph(self.env.observation_space, self.env.action_space, obs, vtargets, advs, acts, plog, pvf_preds, self.logit_dim, self.kl_coeff, self.distribution_class, self.config, self.sess, self.registry) def init_extra_ops(self, device_losses): self.extra_ops = OrderedDict() with tf.name_scope("test_outputs"): policies = device_losses self.extra_ops["loss"] = tf.reduce_mean( tf.stack(values=[policy.loss for policy in policies]), 0) self.extra_ops["policy_loss"] = tf.reduce_mean( tf.stack( values=[policy.mean_policy_loss for policy in policies]), 0) self.extra_ops["vf_loss"] = tf.reduce_mean( tf.stack(values=[policy.mean_vf_loss for policy in policies]), 0) self.extra_ops["kl"] = tf.reduce_mean( tf.stack(values=[policy.mean_kl for policy in policies]), 0) self.extra_ops["entropy"] = tf.reduce_mean( tf.stack(values=[policy.mean_entropy for policy in policies]), 0) def 
extra_apply_grad_fetches(self): return list(self.extra_ops.values()) def extra_apply_grad_feed_dict(self): return {self.kl_coeff: self.kl_coeff_val} def update_kl(self, sampled_kl): if sampled_kl > 2.0 * self.kl_target: self.kl_coeff_val *= 1.5 elif sampled_kl < 0.5 * self.kl_target: self.kl_coeff_val *= 0.5 def save(self): filters = self.get_filters(flush_after=True) return pickle.dumps({ "filters": filters, "kl_coeff_val": self.kl_coeff_val, "kl_target": self.kl_target, }) def restore(self, objs): objs = pickle.loads(objs) self.sync_filters(objs["filters"]) self.kl_coeff_val = objs["kl_coeff_val"] self.kl_target = objs["kl_target"] def get_weights(self): return self.variables.get_weights() def set_weights(self, weights): self.variables.set_weights(weights) def sample(self): """Returns experience samples from this Evaluator. Observation filter and reward filters are flushed here. Returns: SampleBatch: A columnar batch of experiences. """ num_steps_so_far = 0 all_samples = [] while num_steps_so_far < self.config["min_steps_per_task"]: rollout = self.sampler.get_data() last_r = 0.0 # note: not needed since we don't truncate rollouts samples = compute_advantages(rollout, last_r, self.config["gamma"], self.config["lambda"], use_gae=self.config["use_gae"]) num_steps_so_far += samples.count all_samples.append(samples) return SampleBatch.concat_samples(all_samples) def get_completed_rollout_metrics(self): """Returns metrics on previously completed rollouts. Calling this clears the queue of completed rollout metrics. """ return self.sampler.get_metrics() def sync_filters(self, new_filters): """Changes self's filter to given and rebases any accumulated delta. Args: new_filters (dict): Filters with new state to update local copy. """ assert all(k in new_filters for k in self.filters) for k in self.filters: self.filters[k].sync(new_filters[k]) def get_filters(self, flush_after=False): """Returns a snapshot of filters. Args: flush_after (bool): Clears the filter buffer state. Returns: return_filters (dict): Dict for serializable filters """ return_filters = {} for k, f in self.filters.items(): return_filters[k] = f.as_serializable() if flush_after: f.clear_buffer() return return_filters
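# --- Illustration (not part of the original source) ---
# The adaptive KL penalty implemented by update_kl() above, written as a pure
# function: the coefficient is raised when the sampled KL overshoots twice the
# target and halved when it falls below half the target. Numbers in the usage
# comments are example values only.
def adapt_kl_coeff(kl_coeff_val, sampled_kl, kl_target):
    if sampled_kl > 2.0 * kl_target:
        return kl_coeff_val * 1.5
    elif sampled_kl < 0.5 * kl_target:
        return kl_coeff_val * 0.5
    return kl_coeff_val

# e.g. with kl_target = 0.01 and an initial coefficient of 0.2:
#   adapt_kl_coeff(0.2, 0.05, 0.01)  -> 0.3  (KL too high, strengthen penalty)
#   adapt_kl_coeff(0.2, 0.002, 0.01) -> 0.1  (KL too low, relax penalty)
#   adapt_kl_coeff(0.2, 0.01, 0.01)  -> 0.2  (within band, unchanged)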
class CommonPolicyEvaluator(PolicyEvaluator): """Policy evaluator implementation that operates on a rllib.PolicyGraph. TODO: multi-agent TODO: multi-gpu Examples: # Create a policy evaluator and using it to collect experiences. >>> evaluator = CommonPolicyEvaluator( env_creator=lambda _: gym.make("CartPole-v0"), policy_graph=PGPolicyGraph) >>> print(evaluator.sample().keys()) {"obs": [[...]], "actions": [[...]], "rewards": [[...]], "dones": [[...]], "new_obs": [[...]]} # Creating policy evaluators using optimizer_cls.make(). >>> optimizer = LocalSyncOptimizer.make( evaluator_cls=CommonPolicyEvaluator, evaluator_args={ "env_creator": lambda _: gym.make("CartPole-v0"), "policy_graph": PGPolicyGraph, }, num_workers=10) >>> for _ in range(10): optimizer.step() """ @classmethod def as_remote(cls, num_cpus=None, num_gpus=None): return ray.remote(num_cpus=num_cpus, num_gpus=num_gpus)(cls) def __init__(self, env_creator, policy_graph, tf_session_creator=None, batch_steps=100, batch_mode="truncate_episodes", episode_horizon=None, preprocessor_pref="rllib", sample_async=False, compress_observations=False, num_envs=1, observation_filter="NoFilter", env_config=None, model_config=None, policy_config=None): """Initialize a policy evaluator. Arguments: env_creator (func): Function that returns a gym.Env given an env config dict. policy_graph (class): A class implementing rllib.PolicyGraph or rllib.TFPolicyGraph. tf_session_creator (func): A function that returns a TF session. This is optional and only useful with TFPolicyGraph. batch_steps (int): The target number of env transitions to include in each sample batch returned from this evaluator. batch_mode (str): One of the following batch modes: "truncate_episodes": Each call to sample() will return a batch of exactly `batch_steps` in size. Episodes may be truncated in order to meet this size requirement. When `num_envs > 1`, episodes will be truncated to sequences of `batch_size / num_envs` in length. "complete_episodes": Each call to sample() will return a batch of at least `batch_steps in size. Episodes will not be truncated, but multiple episodes may be packed within one batch to meet the batch size. Note that when `num_envs > 1`, episode steps will be buffered until the episode completes, and hence batches may contain significant amounts of off-policy data. episode_horizon (int): Whether to stop episodes at this horizon. preprocessor_pref (str): Whether to prefer RLlib preprocessors ("rllib") or deepmind ("deepmind") when applicable. sample_async (bool): Whether to compute samples asynchronously in the background, which improves throughput but can cause samples to be slightly off-policy. compress_observations (bool): If true, compress the observations returned. num_envs (int): If more than one, will create multiple envs and vectorize the computation of actions. This has no effect if if the env already implements VectorEnv. observation_filter (str): Name of observation filter to use. env_config (dict): Config to pass to the env creator. model_config (dict): Config to use when creating the policy model. policy_config (dict): Config to pass to the policy. 
""" env_config = env_config or {} policy_config = policy_config or {} model_config = model_config or {} self.env_creator = env_creator self.policy_graph = policy_graph self.batch_steps = batch_steps self.batch_mode = batch_mode self.compress_observations = compress_observations self.env = env_creator(env_config) if isinstance(self.env, VectorEnv) or \ isinstance(self.env, ServingEnv) or \ isinstance(self.env, AsyncVectorEnv): def wrap(env): return env # we can't auto-wrap these env types elif is_atari(self.env) and \ "custom_preprocessor" not in model_config and \ preprocessor_pref == "deepmind": def wrap(env): return wrap_deepmind(env, dim=model_config.get("dim", 80)) else: def wrap(env): return ModelCatalog.get_preprocessor_as_wrapper( env, model_config) self.env = wrap(self.env) def make_env(): return wrap(env_creator(env_config)) self.policy_map = {} if issubclass(policy_graph, TFPolicyGraph): with tf.Graph().as_default(): if tf_session_creator: self.sess = tf_session_creator() else: self.sess = tf.Session(config=tf.ConfigProto( gpu_options=tf.GPUOptions(allow_growth=True))) with self.sess.as_default(): policy = policy_graph(self.env.observation_space, self.env.action_space, policy_config) else: policy = policy_graph(self.env.observation_space, self.env.action_space, policy_config) self.policy_map = {"default": policy} self.obs_filter = get_filter(observation_filter, self.env.observation_space.shape) self.filters = {"obs_filter": self.obs_filter} # Always use vector env for consistency even if num_envs = 1 if not isinstance(self.env, AsyncVectorEnv): if isinstance(self.env, ServingEnv): self.vector_env = _ServingEnvToAsync(self.env) else: if not isinstance(self.env, VectorEnv): self.env = VectorEnv.wrap(make_env, [self.env], num_envs=num_envs) self.vector_env = _VectorEnvToAsync(self.env) else: self.vector_env = self.env if self.batch_mode == "truncate_episodes": if batch_steps % num_envs != 0: raise ValueError( "In 'truncate_episodes' batch mode, `batch_steps` must be " "evenly divisible by `num_envs`. Got {} and {}.".format( batch_steps, num_envs)) batch_steps = batch_steps // num_envs pack_episodes = True elif self.batch_mode == "complete_episodes": batch_steps = float("inf") # never cut episodes pack_episodes = False # sampler will return 1 episode per poll else: raise ValueError("Unsupported batch mode: {}".format( self.batch_mode)) if sample_async: self.sampler = AsyncSampler(self.vector_env, self.policy_map["default"], self.obs_filter, batch_steps, horizon=episode_horizon, pack=pack_episodes) self.sampler.start() else: self.sampler = SyncSampler(self.vector_env, self.policy_map["default"], self.obs_filter, batch_steps, horizon=episode_horizon, pack=pack_episodes) def sample(self): """Evaluate the current policies and return a batch of experiences. Return: SampleBatch from evaluating the current policies. """ batches = [self.sampler.get_data()] steps_so_far = batches[0].count while steps_so_far < self.batch_steps: batch = self.sampler.get_data() steps_so_far += batch.count batches.append(batch) batch = SampleBatch.concat_samples(batches) if self.compress_observations: batch["obs"] = [pack(o) for o in batch["obs"]] batch["new_obs"] = [pack(o) for o in batch["new_obs"]] return batch def for_policy(self, func): """Apply the given function to this evaluator's default policy.""" return func(self.policy_map["default"]) def sync_filters(self, new_filters): """Changes self's filter to given and rebases any accumulated delta. 
Args: new_filters (dict): Filters with new state to update local copy. """ assert all(k in new_filters for k in self.filters) for k in self.filters: self.filters[k].sync(new_filters[k]) def get_filters(self, flush_after=False): """Returns a snapshot of filters. Args: flush_after (bool): Clears the filter buffer state. Returns: return_filters (dict): Dict for serializable filters """ return_filters = {} for k, f in self.filters.items(): return_filters[k] = f.as_serializable() if flush_after: f.clear_buffer() return return_filters def get_weights(self): return self.policy_map["default"].get_weights() def set_weights(self, weights): return self.policy_map["default"].set_weights(weights) def compute_gradients(self, samples): return self.policy_map["default"].compute_gradients(samples) def apply_gradients(self, grads): return self.policy_map["default"].apply_gradients(grads) def compute_apply(self, samples): grad_fetch, apply_fetch = self.policy_map["default"].compute_apply( samples) return grad_fetch def save(self): filters = self.get_filters(flush_after=True) state = self.policy_map["default"].get_state() return pickle.dumps({"filters": filters, "state": state}) def restore(self, objs): objs = pickle.loads(objs) self.sync_filters(objs["filters"]) self.policy_map["default"].set_state(objs["state"])
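# --- Usage sketch (not part of the original source) ---
# Minimal single-process loop mirroring the class docstring example above:
# build one CommonPolicyEvaluator, then alternate sample() and compute_apply()
# on the default policy. `gym` and `PGPolicyGraph` are assumed importable, and
# the batch size and iteration count are illustrative.
def train_cartpole_pg(num_iterations=50):
    import gym
    evaluator = CommonPolicyEvaluator(
        env_creator=lambda _: gym.make("CartPole-v0"),
        policy_graph=PGPolicyGraph,
        batch_steps=200)
    for _ in range(num_iterations):
        batch = evaluator.sample()             # roughly batch_steps transitions
        info = evaluator.compute_apply(batch)  # one gradient step on "default"
    return evaluator.get_weights()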
class Runner(object): """ Runner class that holds the simulator environment and the policy. Initializes the tensorflow graphs for both training and evaluation. One common policy graph is initialized on '/cpu:0' and holds all the shared network weights. When run as a remote agent, only this graph is used. """ def __init__(self, env_creator, config, logdir, is_remote): self.is_remote = is_remote if is_remote: os.environ["CUDA_VISIBLE_DEVICES"] = "" devices = ["/cpu:0"] else: devices = config["devices"] self.devices = devices self.config = config self.logdir = logdir self.env = create_and_wrap(env_creator, config["model"]) if is_remote: config_proto = tf.ConfigProto() else: config_proto = tf.ConfigProto(**config["tf_session_args"]) self.sess = tf.Session(config=config_proto) if config["tf_debug_inf_or_nan"] and not is_remote: self.sess = tf_debug.LocalCLIDebugWrapperSession(self.sess) self.sess.add_tensor_filter( "has_inf_or_nan", tf_debug.has_inf_or_nan) # Defines the training inputs: # The coefficient of the KL penalty. self.kl_coeff = tf.placeholder( name="newkl", shape=(), dtype=tf.float32) # The input observations. self.observations = tf.placeholder( tf.float32, shape=(None,) + self.env.observation_space.shape) # Targets of the value function. self.value_targets = tf.placeholder(tf.float32, shape=(None,)) # Advantage values in the policy gradient estimator. self.advantages = tf.placeholder(tf.float32, shape=(None,)) action_space = self.env.action_space # TODO(rliaw): pull this into model_catalog if isinstance(action_space, gym.spaces.Box): self.actions = tf.placeholder( tf.float32, shape=(None, action_space.shape[0])) elif isinstance(action_space, gym.spaces.Discrete): self.actions = tf.placeholder(tf.int64, shape=(None,)) else: raise NotImplemented( "action space" + str(type(action_space)) + "currently not supported") self.distribution_class, self.logit_dim = ModelCatalog.get_action_dist( action_space) # Log probabilities from the policy before the policy update. self.prev_logits = tf.placeholder( tf.float32, shape=(None, self.logit_dim)) # Value function predictions before the policy update. 
self.prev_vf_preds = tf.placeholder(tf.float32, shape=(None,)) assert config["sgd_batchsize"] % len(devices) == 0, \ "Batch size must be evenly divisible by devices" if is_remote: self.batch_size = config["rollout_batchsize"] self.per_device_batch_size = config["rollout_batchsize"] else: self.batch_size = config["sgd_batchsize"] self.per_device_batch_size = int(self.batch_size / len(devices)) def build_loss(obs, vtargets, advs, acts, plog, pvf_preds): return ProximalPolicyLoss( self.env.observation_space, self.env.action_space, obs, vtargets, advs, acts, plog, pvf_preds, self.logit_dim, self.kl_coeff, self.distribution_class, self.config, self.sess) self.par_opt = LocalSyncParallelOptimizer( tf.train.AdamOptimizer(self.config["sgd_stepsize"]), self.devices, [self.observations, self.value_targets, self.advantages, self.actions, self.prev_logits, self.prev_vf_preds], self.per_device_batch_size, build_loss, self.logdir) # Metric ops with tf.name_scope("test_outputs"): policies = self.par_opt.get_device_losses() self.mean_loss = tf.reduce_mean( tf.stack(values=[ policy.loss for policy in policies]), 0) self.mean_policy_loss = tf.reduce_mean( tf.stack(values=[ policy.mean_policy_loss for policy in policies]), 0) self.mean_vf_loss = tf.reduce_mean( tf.stack(values=[ policy.mean_vf_loss for policy in policies]), 0) self.mean_kl = tf.reduce_mean( tf.stack(values=[ policy.mean_kl for policy in policies]), 0) self.mean_entropy = tf.reduce_mean( tf.stack(values=[ policy.mean_entropy for policy in policies]), 0) # References to the model weights self.common_policy = self.par_opt.get_common_loss() self.variables = ray.experimental.TensorFlowVariables( self.common_policy.loss, self.sess) obs_filter = get_filter( config["observation_filter"], self.env.observation_space.shape) self.sampler = SyncSampler( self.env, self.common_policy, obs_filter, self.config["horizon"], self.config["horizon"]) self.reward_filter = MeanStdFilter((), clip=5.0) self.sess.run(tf.global_variables_initializer()) def load_data(self, trajectories, full_trace): use_gae = self.config["use_gae"] dummy = np.zeros_like(trajectories["advantages"]) return self.par_opt.load_data( self.sess, [trajectories["observations"], trajectories["value_targets"] if use_gae else dummy, trajectories["advantages"], trajectories["actions"].squeeze(), trajectories["logprobs"], trajectories["vf_preds"] if use_gae else dummy], full_trace=full_trace) def run_sgd_minibatch( self, batch_index, kl_coeff, full_trace, file_writer): return self.par_opt.optimize( self.sess, batch_index, extra_ops=[ self.mean_loss, self.mean_policy_loss, self.mean_vf_loss, self.mean_kl, self.mean_entropy], extra_feed_dict={self.kl_coeff: kl_coeff}, file_writer=file_writer if full_trace else None) def save(self): obs_filter = self.sampler.get_obs_filter() return pickle.dumps([obs_filter, self.reward_filter]) def restore(self, objs): objs = pickle.loads(objs) obs_filter = objs[0] rew_filter = objs[1] self.update_filters(obs_filter, rew_filter) def get_weights(self): return self.variables.get_weights() def load_weights(self, weights): self.variables.set_weights(weights) def update_filters(self, obs_filter=None, rew_filter=None): if rew_filter: # No special handling required since outside of threaded code self.reward_filter = rew_filter.copy() if obs_filter: self.sampler.update_obs_filter(obs_filter) def get_obs_filter(self): return self.sampler.get_obs_filter() def compute_steps(self, config, obs_filter, rew_filter): """Compute multiple rollouts and concatenate the results. 
Args: config: Configuration parameters obs_filter: Function that is applied to each of the observations. reward_filter: Function that is applied to each of the rewards. Returns: states: List of states. total_rewards: Total rewards of the trajectories. trajectory_lengths: Lengths of the trajectories. """ num_steps_so_far = 0 trajectories = [] self.update_filters(obs_filter, rew_filter) while num_steps_so_far < config["min_steps_per_task"]: rollout = self.sampler.get_data() trajectory = process_rollout( rollout, self.reward_filter, config["gamma"], config["lambda"], use_gae=config["use_gae"]) num_steps_so_far += trajectory["rewards"].shape[0] trajectories.append(trajectory) metrics = self.sampler.get_metrics() total_rewards, trajectory_lengths = zip(*[ (c.episode_reward, c.episode_length) for c in metrics]) updated_obs_filter = self.sampler.get_obs_filter(flush=True) return ( concatenate(trajectories), total_rewards, trajectory_lengths, updated_obs_filter, self.reward_filter)
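# --- Usage sketch (not part of the original source) ---
# How a driver would consume Runner.compute_steps(): pass in its current
# filters, receive concatenated trajectories plus the updated filters, and
# push the updated filters back before the next round. The driver-side filter
# objects are assumptions for illustration.
def driver_rollout_round(runner, config, obs_filter, rew_filter):
    trajectories, rewards, lengths, new_obs_filter, new_rew_filter = \
        runner.compute_steps(config, obs_filter, rew_filter)
    print("%d episodes finished, mean reward %.2f"
          % (len(rewards), np.mean(rewards)))
    runner.update_filters(obs_filter=new_obs_filter, rew_filter=new_rew_filter)
    return trajectories, new_obs_filter, new_rew_filter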
class CommonPolicyEvaluator(PolicyEvaluator): """Policy evaluator implementation that operates on a rllib.PolicyGraph. TODO: multi-gpu Examples: # Create a policy evaluator and using it to collect experiences. >>> evaluator = CommonPolicyEvaluator( env_creator=lambda _: gym.make("CartPole-v0"), policy_graph=PGPolicyGraph) >>> print(evaluator.sample()) SampleBatch({ "obs": [[...]], "actions": [[...]], "rewards": [[...]], "dones": [[...]], "new_obs": [[...]]}) # Creating policy evaluators using optimizer_cls.make(). >>> optimizer = SyncSamplesOptimizer.make( evaluator_cls=CommonPolicyEvaluator, evaluator_args={ "env_creator": lambda _: gym.make("CartPole-v0"), "policy_graph": PGPolicyGraph, }, num_workers=10) >>> for _ in range(10): optimizer.step() # Creating a multi-agent policy evaluator >>> evaluator = CommonPolicyEvaluator( env_creator=lambda _: MultiAgentTrafficGrid(num_cars=25), policy_graph={ # Use an ensemble of two policies for car agents "car_policy1": (PGPolicyGraph, Box(...), Discrete(...), {"gamma": 0.99}), "car_policy2": (PGPolicyGraph, Box(...), Discrete(...), {"gamma": 0.95}), # Use a single shared policy for all traffic lights "traffic_light_policy": (PGPolicyGraph, Box(...), Discrete(...), {}), }, policy_mapping_fn=lambda agent_id: random.choice(["car_policy1", "car_policy2"]) if agent_id.startswith("car_") else "traffic_light_policy") >>> print(evaluator.sample().keys()) MultiAgentBatch({ "car_policy1": SampleBatch(...), "car_policy2": SampleBatch(...), "traffic_light_policy": SampleBatch(...)}) """ @classmethod def as_remote(cls, num_cpus=None, num_gpus=None): return ray.remote(num_cpus=num_cpus, num_gpus=num_gpus)(cls) def __init__(self, env_creator, policy_graph, policy_mapping_fn=None, tf_session_creator=None, batch_steps=100, batch_mode="truncate_episodes", episode_horizon=None, preprocessor_pref="rllib", sample_async=False, compress_observations=False, num_envs=1, observation_filter="NoFilter", env_config=None, model_config=None, policy_config=None, worker_index=0): """Initialize a policy evaluator. Arguments: env_creator (func): Function that returns a gym.Env given an EnvContext wrapped configuration. policy_graph (class|dict): Either a class implementing PolicyGraph, or a dictionary of policy id strings to (PolicyGraph, obs_space, action_space, config) tuples. If a dict is specified, then we are in multi-agent mode and a policy_mapping_fn should also be set. policy_mapping_fn (func): A function that maps agent ids to policy ids in multi-agent mode. This function will be called each time a new agent appears in an episode, to bind that agent to a policy for the duration of the episode. tf_session_creator (func): A function that returns a TF session. This is optional and only useful with TFPolicyGraph. batch_steps (int): The target number of env transitions to include in each sample batch returned from this evaluator. batch_mode (str): One of the following batch modes: "truncate_episodes": Each call to sample() will return a batch of exactly `batch_steps` in size. Episodes may be truncated in order to meet this size requirement. When `num_envs > 1`, episodes will be truncated to sequences of `batch_size / num_envs` in length. "complete_episodes": Each call to sample() will return a batch of at least `batch_steps in size. Episodes will not be truncated, but multiple episodes may be packed within one batch to meet the batch size. 
Note that when `num_envs > 1`, episode steps will be buffered until the episode completes, and hence batches may contain significant amounts of off-policy data. episode_horizon (int): Whether to stop episodes at this horizon. preprocessor_pref (str): Whether to prefer RLlib preprocessors ("rllib") or deepmind ("deepmind") when applicable. sample_async (bool): Whether to compute samples asynchronously in the background, which improves throughput but can cause samples to be slightly off-policy. compress_observations (bool): If true, compress the observations returned. num_envs (int): If more than one, will create multiple envs and vectorize the computation of actions. This has no effect if if the env already implements VectorEnv. observation_filter (str): Name of observation filter to use. env_config (dict): Config to pass to the env creator. model_config (dict): Config to use when creating the policy model. policy_config (dict): Config to pass to the policy. In the multi-agent case, this config will be merged with the per-policy configs specified by `policy_graph`. worker_index (int): For remote evaluators, this should be set to a non-zero and unique value. This index is passed to created envs through EnvContext so that envs can be configured per worker. """ env_context = EnvContext(env_config or {}, worker_index) policy_config = policy_config or {} self.policy_config = policy_config model_config = model_config or {} policy_mapping_fn = (policy_mapping_fn or (lambda agent_id: DEFAULT_POLICY_ID)) self.env_creator = env_creator self.policy_graph = policy_graph self.batch_steps = batch_steps self.batch_mode = batch_mode self.compress_observations = compress_observations self.env = env_creator(env_context) if isinstance(self.env, VectorEnv) or \ isinstance(self.env, ServingEnv) or \ isinstance(self.env, MultiAgentEnv) or \ isinstance(self.env, AsyncVectorEnv): def wrap(env): return env # we can't auto-wrap these env types elif is_atari(self.env) and \ "custom_preprocessor" not in model_config and \ preprocessor_pref == "deepmind": def wrap(env): return wrap_deepmind(env, dim=model_config.get("dim", 80)) else: def wrap(env): return ModelCatalog.get_preprocessor_as_wrapper( env, model_config) self.env = wrap(self.env) def make_env(): return wrap(env_creator(env_context)) self.tf_sess = None policy_dict = _validate_and_canonicalize(policy_graph, self.env) if _has_tensorflow_graph(policy_dict): with tf.Graph().as_default(): if tf_session_creator: self.tf_sess = tf_session_creator() else: self.tf_sess = tf.Session(config=tf.ConfigProto( gpu_options=tf.GPUOptions(allow_growth=True))) with self.tf_sess.as_default(): self.policy_map = self._build_policy_map( policy_dict, policy_config) else: self.policy_map = self._build_policy_map(policy_dict, policy_config) self.multiagent = self.policy_map.keys() != set(DEFAULT_POLICY_ID) self.filters = { policy_id: get_filter(observation_filter, policy.observation_space.shape) for (policy_id, policy) in self.policy_map.items() } # Always use vector env for consistency even if num_envs = 1 self.async_env = AsyncVectorEnv.wrap_async(self.env, make_env=make_env, num_envs=num_envs) if self.batch_mode == "truncate_episodes": if batch_steps % num_envs != 0: raise ValueError( "In 'truncate_episodes' batch mode, `batch_steps` must be " "evenly divisible by `num_envs`. 
Got {} and {}.".format( batch_steps, num_envs)) batch_steps = batch_steps // num_envs pack_episodes = True elif self.batch_mode == "complete_episodes": batch_steps = float("inf") # never cut episodes pack_episodes = False # sampler will return 1 episode per poll else: raise ValueError("Unsupported batch mode: {}".format( self.batch_mode)) if sample_async: self.sampler = AsyncSampler(self.async_env, self.policy_map, policy_mapping_fn, self.filters, batch_steps, horizon=episode_horizon, pack=pack_episodes, tf_sess=self.tf_sess) self.sampler.start() else: self.sampler = SyncSampler(self.async_env, self.policy_map, policy_mapping_fn, self.filters, batch_steps, horizon=episode_horizon, pack=pack_episodes, tf_sess=self.tf_sess) def _build_policy_map(self, policy_dict, policy_config): policy_map = {} for name, (cls, obs_space, act_space, conf) in sorted(policy_dict.items()): merged_conf = policy_config.copy() merged_conf.update(conf) with tf.variable_scope(name): policy_map[name] = cls(obs_space, act_space, merged_conf) return policy_map def sample(self): """Evaluate the current policies and return a batch of experiences. Return: SampleBatch|MultiAgentBatch from evaluating the current policies. """ batches = [self.sampler.get_data()] steps_so_far = batches[0].count while steps_so_far < self.batch_steps: batch = self.sampler.get_data() steps_so_far += batch.count batches.append(batch) batch = batches[0].concat_samples(batches) if self.compress_observations: if isinstance(batch, MultiAgentBatch): for data in batch.policy_batches.values(): data["obs"] = [pack(o) for o in data["obs"]] data["new_obs"] = [pack(o) for o in data["new_obs"]] else: batch["obs"] = [pack(o) for o in batch["obs"]] batch["new_obs"] = [pack(o) for o in batch["new_obs"]] return batch def for_policy(self, func, policy_id=DEFAULT_POLICY_ID): """Apply the given function to the specified policy graph.""" return func(self.policy_map[policy_id]) def foreach_policy(self, func): """Apply the given function to each (policy, policy_id) tuple.""" return [func(policy, pid) for pid, policy in self.policy_map.items()] def sync_filters(self, new_filters): """Changes self's filter to given and rebases any accumulated delta. Args: new_filters (dict): Filters with new state to update local copy. """ assert all(k in new_filters for k in self.filters) for k in self.filters: self.filters[k].sync(new_filters[k]) def get_filters(self, flush_after=False): """Returns a snapshot of filters. Args: flush_after (bool): Clears the filter buffer state. 
Returns: return_filters (dict): Dict for serializable filters """ return_filters = {} for k, f in self.filters.items(): return_filters[k] = f.as_serializable() if flush_after: f.clear_buffer() return return_filters def get_weights(self): return { pid: policy.get_weights() for pid, policy in self.policy_map.items() } def set_weights(self, weights): for pid, w in weights.items(): self.policy_map[pid].set_weights(w) def compute_gradients(self, samples): if isinstance(samples, MultiAgentBatch): grad_out, info_out = {}, {} if self.tf_sess is not None: builder = TFRunBuilder(self.tf_sess, "compute_gradients") for pid, batch in samples.policy_batches.items(): grad_out[pid], info_out[pid] = ( self.policy_map[pid].build_compute_gradients( builder, batch)) grad_out = {k: builder.get(v) for k, v in grad_out.items()} info_out = {k: builder.get(v) for k, v in info_out.items()} else: for pid, batch in samples.policy_batches.items(): grad_out[pid], info_out[pid] = ( self.policy_map[pid].compute_gradients(batch)) return grad_out, info_out else: return self.policy_map[DEFAULT_POLICY_ID].compute_gradients( samples) def apply_gradients(self, grads): if isinstance(grads, dict): if self.tf_sess is not None: builder = TFRunBuilder(self.tf_sess, "apply_gradients") outputs = { pid: self.policy_map[pid].build_apply_gradients(builder, grad) for pid, grad in grads.items() } return {k: builder.get(v) for k, v in outputs.items()} else: return { pid: self.policy_map[pid].apply_gradients(g) for pid, g in grads.items() } else: return self.policy_map[DEFAULT_POLICY_ID].apply_gradients(grads) def compute_apply(self, samples): if isinstance(samples, MultiAgentBatch): info_out = {} if self.tf_sess is not None: builder = TFRunBuilder(self.tf_sess, "compute_apply") for pid, batch in samples.policy_batches.items(): info_out[pid], _ = ( self.policy_map[pid].build_compute_apply( builder, batch)) info_out = {k: builder.get(v) for k, v in info_out.items()} else: for pid, batch in samples.policy_batches.items(): info_out[pid], _ = ( self.policy_map[pid].compute_apply(batch)) return info_out else: grad_fetch, apply_fetch = ( self.policy_map[DEFAULT_POLICY_ID].compute_apply(samples)) return grad_fetch def save(self): filters = self.get_filters(flush_after=True) state = { pid: self.policy_map[pid].get_state() for pid in self.policy_map } return pickle.dumps({"filters": filters, "state": state}) def restore(self, objs): objs = pickle.loads(objs) self.sync_filters(objs["filters"]) for pid, state in objs["state"].items(): self.policy_map[pid].set_state(state)
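# --- Usage sketch (not part of the original source) ---
# Multi-agent construction mirroring the class docstring: two policy graphs
# share one evaluator, agents are bound to policies by policy_mapping_fn, and
# sample() returns a MultiAgentBatch keyed by policy id. The env class and the
# observation/action spaces are placeholders (assumptions).
def build_two_policy_evaluator(env_cls, obs_space, act_space):
    evaluator = CommonPolicyEvaluator(
        env_creator=lambda cfg: env_cls(cfg),
        policy_graph={
            "policy_a": (PGPolicyGraph, obs_space, act_space, {"gamma": 0.99}),
            "policy_b": (PGPolicyGraph, obs_space, act_space, {"gamma": 0.95}),
        },
        policy_mapping_fn=lambda agent_id: (
            "policy_a" if hash(agent_id) % 2 == 0 else "policy_b"))
    batch = evaluator.sample()               # MultiAgentBatch
    info = evaluator.compute_apply(batch)    # {"policy_a": ..., "policy_b": ...}
    return evaluator, batch, info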
class CommonPolicyEvaluator(PolicyEvaluator): """Policy evaluator implementation that operates on a rllib.PolicyGraph. TODO: vector env TODO: multi-agent TODO: consumer buffering for multi-agent TODO: complete episode batch mode Examples: # Create a policy evaluator and using it to collect experiences. >>> evaluator = CommonPolicyEvaluator( env_creator=lambda _: gym.make("CartPole-v0"), policy_graph=PGPolicyGraph) >>> print(evaluator.sample().keys()) {"obs": [[...]], "actions": [[...]], "rewards": [[...]], "dones": [[...]], "new_obs": [[...]]} # Creating policy evaluators using optimizer_cls.make(). >>> optimizer = LocalSyncOptimizer.make( evaluator_cls=CommonPolicyEvaluator, evaluator_args={ "env_creator": lambda _: gym.make("CartPole-v0"), "policy_graph": PGPolicyGraph, }, num_workers=10) >>> for _ in range(10): optimizer.step() """ @classmethod def as_remote(cls, num_cpus=None, num_gpus=None): return ray.remote(num_cpus=num_cpus, num_gpus=num_gpus)(cls) def __init__(self, env_creator, policy_graph, tf_session_creator=None, batch_steps=100, batch_mode="truncate_episodes", preprocessor_pref="rllib", sample_async=False, compress_observations=False, observation_filter="NoFilter", registry=None, env_config=None, model_config=None, policy_config=None): """Initialize a policy evaluator. Arguments: env_creator (func): Function that returns a gym.Env given an env config dict. policy_graph (class): A class implementing rllib.PolicyGraph or rllib.TFPolicyGraph. tf_session_creator (func): A function that returns a TF session. This is optional and only useful with TFPolicyGraph. batch_steps (int): The target number of env transitions to include in each sample batch returned from this evaluator. batch_mode (str): One of the following choices: complete_episodes: each batch will be at least batch_steps in size, and will include one or more complete episodes. truncate_episodes: each batch will be around batch_steps in size, and include transitions from one episode only. pack_episodes: each batch will be exactly batch_steps in size, and may include transitions from multiple episodes. preprocessor_pref (str): Whether to prefer RLlib preprocessors ("rllib") or deepmind ("deepmind") when applicable. sample_async (bool): Whether to compute samples asynchronously in the background, which improves throughput but can cause samples to be slightly off-policy. compress_observations (bool): If true, compress the observations returned. observation_filter (str): Name of observation filter to use. registry (tune.Registry): User-registered objects. Pass in the value from tune.registry.get_registry() if you're having trouble resolving things like custom envs. env_config (dict): Config to pass to the env creator. model_config (dict): Config to use when creating the policy model. policy_config (dict): Config to pass to the policy. 
""" registry = registry or get_registry() env_config = env_config or {} policy_config = policy_config or {} model_config = model_config or {} assert batch_mode in [ "complete_episodes", "truncate_episodes", "pack_episodes" ] self.env_creator = env_creator self.policy_graph = policy_graph self.batch_steps = batch_steps self.batch_mode = batch_mode self.compress_observations = compress_observations self.env = env_creator(env_config) is_atari = hasattr(self.env.unwrapped, "ale") if is_atari and "custom_preprocessor" not in model_config and \ preprocessor_pref == "deepmind": self.env = wrap_deepmind(self.env, dim=model_config.get("dim", 80)) else: self.env = ModelCatalog.get_preprocessor_as_wrapper( registry, self.env, model_config) self.vectorized = hasattr(self.env, "vector_reset") self.policy_map = {} if issubclass(policy_graph, TFPolicyGraph): with tf.Graph().as_default(): if tf_session_creator: self.sess = tf_session_creator() else: self.sess = tf.Session(config=tf.ConfigProto( gpu_options=tf.GPUOptions(allow_growth=True))) with self.sess.as_default(): policy = policy_graph(self.env.observation_space, self.env.action_space, registry, policy_config) else: policy = policy_graph(self.env.observation_space, self.env.action_space, registry, policy_config) self.policy_map = {"default": policy} self.obs_filter = get_filter(observation_filter, self.env.observation_space.shape) self.filters = {"obs_filter": self.obs_filter} if self.vectorized: raise NotImplementedError("Vector envs not yet supported") else: if batch_mode not in [ "pack_episodes", "truncate_episodes", "complete_episodes" ]: raise NotImplementedError("Batch mode not yet supported") pack = batch_mode == "pack_episodes" if batch_mode == "complete_episodes": batch_steps = 999999 if sample_async: self.sampler = AsyncSampler(self.env, self.policy_map["default"], self.obs_filter, batch_steps, pack=pack) self.sampler.start() else: self.sampler = SyncSampler(self.env, self.policy_map["default"], self.obs_filter, batch_steps, pack=pack) def sample(self): """Evaluate the current policies and return a batch of experiences. Return: SampleBatch from evaluating the current policies. """ batch = self.policy_map["default"].postprocess_trajectory( self.sampler.get_data()) if self.compress_observations: batch["obs"] = [pack(o) for o in batch["obs"]] batch["new_obs"] = [pack(o) for o in batch["new_obs"]] return batch def apply(self, func): """Apply the given function to this evaluator instance.""" return func(self) def for_policy(self, func): """Apply the given function to this evaluator's default policy.""" return func(self.policy_map["default"]) def sync_filters(self, new_filters): """Changes self's filter to given and rebases any accumulated delta. Args: new_filters (dict): Filters with new state to update local copy. """ assert all(k in new_filters for k in self.filters) for k in self.filters: self.filters[k].sync(new_filters[k]) def get_filters(self, flush_after=False): """Returns a snapshot of filters. Args: flush_after (bool): Clears the filter buffer state. 
Returns: return_filters (dict): Dict for serializable filters """ return_filters = {} for k, f in self.filters.items(): return_filters[k] = f.as_serializable() if flush_after: f.clear_buffer() return return_filters def get_weights(self): return self.policy_map["default"].get_weights() def set_weights(self, weights): return self.policy_map["default"].set_weights(weights) def compute_gradients(self, samples): return self.policy_map["default"].compute_gradients(samples) def apply_gradients(self, grads): return self.policy_map["default"].apply_gradients(grads) def compute_apply(self, samples): grad_fetch, apply_fetch = self.policy_map["default"].compute_apply( samples) return grad_fetch def save(self): filters = self.get_filters(flush_after=True) state = self.policy_map["default"].get_state() return pickle.dumps({"filters": filters, "state": state}) def restore(self, objs): objs = pickle.loads(objs) self.sync_filters(objs["filters"]) self.policy_map["default"].set_state(objs["state"])
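# --- Usage sketch (not part of the original source) ---
# The filter bookkeeping this evaluator exposes: get_filters(flush_after=True)
# returns a serializable snapshot and clears the buffered deltas, while
# sync_filters() overwrites the local copy with the given state. The remote
# evaluator handles below are assumptions used only for illustration.
def broadcast_filters(local_evaluator, remote_evaluators):
    snapshot = local_evaluator.get_filters(flush_after=True)  # {"obs_filter": ...}
    for ev in remote_evaluators:
        ev.sync_filters.remote(snapshot)
    return snapshot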
class PPOEvaluator(Evaluator): """ Runner class that holds the simulator environment and the policy. Initializes the tensorflow graphs for both training and evaluation. One common policy graph is initialized on '/cpu:0' and holds all the shared network weights. When run as a remote agent, only this graph is used. """ def __init__(self, registry, env_creator, config, logdir, is_remote): self.registry = registry self.is_remote = is_remote if is_remote: os.environ["CUDA_VISIBLE_DEVICES"] = "" devices = ["/cpu:0"] else: devices = config["devices"] self.devices = devices self.config = config self.logdir = logdir self.env = ModelCatalog.get_preprocessor_as_wrapper( registry, env_creator(config["env_config"]), config["model"]) if is_remote: config_proto = tf.ConfigProto() else: config_proto = tf.ConfigProto(**config["tf_session_args"]) self.sess = tf.Session(config=config_proto) if config["tf_debug_inf_or_nan"] and not is_remote: self.sess = tf_debug.LocalCLIDebugWrapperSession(self.sess) self.sess.add_tensor_filter( "has_inf_or_nan", tf_debug.has_inf_or_nan) # Defines the training inputs: # The coefficient of the KL penalty. self.kl_coeff = tf.placeholder( name="newkl", shape=(), dtype=tf.float32) # The input observations. self.observations = tf.placeholder( tf.float32, shape=(None,) + self.env.observation_space.shape) # Targets of the value function. self.value_targets = tf.placeholder(tf.float32, shape=(None,)) # Advantage values in the policy gradient estimator. self.advantages = tf.placeholder(tf.float32, shape=(None,)) action_space = self.env.action_space self.actions = ModelCatalog.get_action_placeholder(action_space) self.distribution_class, self.logit_dim = ModelCatalog.get_action_dist( action_space) # Log probabilities from the policy before the policy update. self.prev_logits = tf.placeholder( tf.float32, shape=(None, self.logit_dim)) # Value function predictions before the policy update. 
self.prev_vf_preds = tf.placeholder(tf.float32, shape=(None,)) if is_remote: self.batch_size = config["rollout_batchsize"] self.per_device_batch_size = config["rollout_batchsize"] else: self.batch_size = int( config["sgd_batchsize"] / len(devices)) * len(devices) assert self.batch_size % len(devices) == 0 self.per_device_batch_size = int(self.batch_size / len(devices)) def build_loss(obs, vtargets, advs, acts, plog, pvf_preds): return ProximalPolicyLoss( self.env.observation_space, self.env.action_space, obs, vtargets, advs, acts, plog, pvf_preds, self.logit_dim, self.kl_coeff, self.distribution_class, self.config, self.sess, self.registry) self.par_opt = LocalSyncParallelOptimizer( tf.train.AdamOptimizer(self.config["sgd_stepsize"]), self.devices, [self.observations, self.value_targets, self.advantages, self.actions, self.prev_logits, self.prev_vf_preds], self.per_device_batch_size, build_loss, self.logdir) # Metric ops with tf.name_scope("test_outputs"): policies = self.par_opt.get_device_losses() self.mean_loss = tf.reduce_mean( tf.stack(values=[ policy.loss for policy in policies]), 0) self.mean_policy_loss = tf.reduce_mean( tf.stack(values=[ policy.mean_policy_loss for policy in policies]), 0) self.mean_vf_loss = tf.reduce_mean( tf.stack(values=[ policy.mean_vf_loss for policy in policies]), 0) self.mean_kl = tf.reduce_mean( tf.stack(values=[ policy.mean_kl for policy in policies]), 0) self.mean_entropy = tf.reduce_mean( tf.stack(values=[ policy.mean_entropy for policy in policies]), 0) # References to the model weights self.common_policy = self.par_opt.get_common_loss() self.variables = ray.experimental.TensorFlowVariables( self.common_policy.loss, self.sess) self.obs_filter = get_filter( config["observation_filter"], self.env.observation_space.shape) self.rew_filter = MeanStdFilter((), clip=5.0) self.filters = {"obs_filter": self.obs_filter, "rew_filter": self.rew_filter} self.sampler = SyncSampler( self.env, self.common_policy, self.obs_filter, self.config["horizon"], self.config["horizon"]) self.sess.run(tf.global_variables_initializer()) def load_data(self, trajectories, full_trace): use_gae = self.config["use_gae"] dummy = np.zeros_like(trajectories["advantages"]) return self.par_opt.load_data( self.sess, [trajectories["observations"], trajectories["value_targets"] if use_gae else dummy, trajectories["advantages"], trajectories["actions"], trajectories["logprobs"], trajectories["vf_preds"] if use_gae else dummy], full_trace=full_trace) def run_sgd_minibatch( self, batch_index, kl_coeff, full_trace, file_writer): return self.par_opt.optimize( self.sess, batch_index, extra_ops=[ self.mean_loss, self.mean_policy_loss, self.mean_vf_loss, self.mean_kl, self.mean_entropy], extra_feed_dict={self.kl_coeff: kl_coeff}, file_writer=file_writer if full_trace else None) def compute_gradients(self, samples): raise NotImplementedError def apply_gradients(self, grads): raise NotImplementedError def save(self): filters = self.get_filters(flush_after=True) return pickle.dumps({"filters": filters}) def restore(self, objs): objs = pickle.loads(objs) self.sync_filters(objs["filters"]) def get_weights(self): return self.variables.get_weights() def set_weights(self, weights): self.variables.set_weights(weights) def sample(self): """Returns experience samples from this Evaluator. Observation filter and reward filters are flushed here. Returns: SampleBatch: A columnar batch of experiences. 
""" num_steps_so_far = 0 all_samples = [] while num_steps_so_far < self.config["min_steps_per_task"]: rollout = self.sampler.get_data() samples = process_rollout( rollout, self.rew_filter, self.config["gamma"], self.config["lambda"], use_gae=self.config["use_gae"]) num_steps_so_far += samples.count all_samples.append(samples) return SampleBatch.concat_samples(all_samples) def get_completed_rollout_metrics(self): """Returns metrics on previously completed rollouts. Calling this clears the queue of completed rollout metrics. """ return self.sampler.get_metrics() def sync_filters(self, new_filters): """Changes self's filter to given and rebases any accumulated delta. Args: new_filters (dict): Filters with new state to update local copy. """ assert all(k in new_filters for k in self.filters) for k in self.filters: self.filters[k].sync(new_filters[k]) def get_filters(self, flush_after=False): """Returns a snapshot of filters. Args: flush_after (bool): Clears the filter buffer state. Returns: return_filters (dict): Dict for serializable filters """ return_filters = {} for k, f in self.filters.items(): return_filters[k] = f.as_serializable() if flush_after: f.clear_buffer() return return_filters