def collect_samples(agents,
                    config,
                    observation_filter=NoFilter(),
                    reward_filter=NoFilter()):
    num_timesteps_so_far = 0
    trajectories = []
    total_rewards = []
    trajectory_lengths = []
    # This variable maps the object IDs of trajectories that are currently
    # computed to the agent that they are computed on; we start some initial
    # tasks here.
    agent_dict = {agent.compute_steps.remote(
                      config["gamma"], config["lambda"],
                      config["horizon"], config["min_steps_per_task"]): agent
                  for agent in agents}
    while num_timesteps_so_far < config["timesteps_per_batch"]:
        # TODO(pcm): Make wait support arbitrary iterators and remove the
        # conversion to list here.
        [next_trajectory], waiting_trajectories = ray.wait(
            list(agent_dict.keys()))
        agent = agent_dict.pop(next_trajectory)
        # Start task with next trajectory and record it in the dictionary.
        agent_dict[agent.compute_steps.remote(
            config["gamma"], config["lambda"],
            config["horizon"], config["min_steps_per_task"])] = agent
        trajectory, rewards, lengths = ray.get(next_trajectory)
        total_rewards.extend(rewards)
        trajectory_lengths.extend(lengths)
        num_timesteps_so_far += len(trajectory["dones"])
        trajectories.append(trajectory)
    return (concatenate(trajectories), np.mean(total_rewards),
            np.mean(trajectory_lengths))
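# Illustrative only, not part of the original module: a minimal sketch of how
# collect_samples might be driven. It assumes `agents` is a list of already
# constructed Ray actors whose compute_steps() remote method matches the
# signature used above, and that `config` contains the keys referenced there
# ("gamma", "lambda", "horizon", "min_steps_per_task", "timesteps_per_batch").
def example_collect_samples(agents, config):
    batch, mean_reward, mean_length = collect_samples(agents, config)
    # `batch` is the concatenation of all collected trajectories; its "dones"
    # entry has one row per collected timestep, as used in the loop above.
    print("collected %d timesteps, mean reward %f, mean length %f" %
          (len(batch["dones"]), mean_reward, mean_length))
    return batch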
def rollouts(policy,
             env,
             horizon,
             observation_filter=NoFilter(),
             reward_filter=NoFilter()):
    """Perform a batch of rollouts of a policy in an environment.

    Args:
        policy: The policy that will be rolled out. Can be an arbitrary
            object that supports a compute(observation) function.
        env: The environment the rollout is computed in. Needs to support
            the OpenAI gym API and needs to support batches of data.
        horizon: Upper bound for the number of timesteps for each rollout
            in the batch.
        observation_filter: Function that is applied to each of the
            observations.
        reward_filter: Function that is applied to each of the rewards.

    Returns:
        A trajectory, which is a dictionary with keys "observations",
        "raw_rewards", "actions", "logprobs", "vf_preds", "dones". Each
        value is an array of shape (num_timesteps, env.batchsize, shape).
    """
    observation = observation_filter(env.reset())
    done = np.array(env.batchsize * [False])
    t = 0
    observations = []  # Filtered observations
    raw_rewards = []   # Empirical rewards
    actions = []       # Actions sampled by the policy
    logprobs = []      # Last layer of the policy network
    vf_preds = []      # Value function predictions
    dones = []         # Has this rollout terminated?

    while True:
        action, logprob, vfpred = policy.compute(observation)
        vf_preds.append(vfpred)
        observations.append(observation[None])
        actions.append(action[None])
        logprobs.append(logprob[None])
        observation, raw_reward, done = env.step(action)
        observation = observation_filter(observation)
        raw_rewards.append(raw_reward[None])
        dones.append(done[None])
        t += 1
        if done.all() or t >= horizon:
            break

    return {"observations": np.vstack(observations),
            "raw_rewards": np.vstack(raw_rewards),
            "actions": np.vstack(actions),
            "logprobs": np.vstack(logprobs),
            "vf_preds": np.vstack(vf_preds),
            "dones": np.vstack(dones)}
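# Illustrative only, not part of the original module: a minimal sketch of how
# the trajectory dictionary returned by rollouts() might be summarized,
# assuming each value has shape (num_timesteps, env.batchsize, ...) as
# documented above.
def example_summarize_trajectory(trajectory):
    # Total reward accumulated by each rollout in the batch.
    total_rewards = trajectory["raw_rewards"].sum(axis=0)
    # Number of environment steps taken in this batch of rollouts; this is
    # the same count that collect_samples reads from the "dones" entry.
    num_timesteps = trajectory["dones"].shape[0]
    return total_rewards, num_timesteps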
def __init__(self, sess, action_space, preprocessor,
             observation_filter, action_noise_std):
    self.sess = sess
    self.action_space = action_space
    self.action_noise_std = action_noise_std
    self.preprocessor = preprocessor
    if observation_filter == "MeanStdFilter":
        self.observation_filter = MeanStdFilter(self.preprocessor.shape,
                                                clip=None)
    elif observation_filter == "NoFilter":
        self.observation_filter = NoFilter()
    else:
        raise Exception("Unknown observation_filter: " +
                        str(observation_filter))
    self.inputs = tf.placeholder(tf.float32,
                                 [None] + list(self.preprocessor.shape))

    # Policy network.
    dist_class, dist_dim = ModelCatalog.get_action_dist(
        self.action_space, dist_type="deterministic")
    model = ModelCatalog.get_model(self.inputs, dist_dim)
    dist = dist_class(model.outputs)
    self.sampler = dist.sample()

    self.variables = ray.experimental.TensorFlowVariables(
        model.outputs, self.sess)

    self.num_params = sum(
        np.prod(variable.shape.as_list())
        for _, variable in self.variables.variables.items())
    self.sess.run(tf.global_variables_initializer())
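# Illustrative only, not part of the original class: a minimal sketch of how
# the tensors built above might be used to sample a deterministic action for a
# single observation. It assumes `obs` has already been preprocessed and
# passed through the policy's observation filter, so its shape matches
# self.preprocessor.shape.
def example_sample_action(policy, obs):
    # Add a batch dimension, run the sampler, and strip the batch dimension.
    return policy.sess.run(policy.sampler,
                           feed_dict={policy.inputs: obs[None]})[0]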
def __init__(self, env_creator, batchsize, config, logdir, is_remote):
    if is_remote:
        os.environ["CUDA_VISIBLE_DEVICES"] = ""
        devices = ["/cpu:0"]
    else:
        devices = config["devices"]
    self.devices = devices
    self.config = config
    self.logdir = logdir
    self.env = BatchedEnv(env_creator, batchsize, config)
    if is_remote:
        config_proto = tf.ConfigProto()
    else:
        config_proto = tf.ConfigProto(**config["tf_session_args"])
    self.preprocessor = self.env.preprocessor
    self.sess = tf.Session(config=config_proto)
    if config["tf_debug_inf_or_nan"] and not is_remote:
        self.sess = tf_debug.LocalCLIDebugWrapperSession(self.sess)
        self.sess.add_tensor_filter("has_inf_or_nan",
                                    tf_debug.has_inf_or_nan)

    # Defines the training inputs:
    # The coefficient of the KL penalty.
    self.kl_coeff = tf.placeholder(name="newkl", shape=(),
                                   dtype=tf.float32)
    # The input observations.
    self.observations = tf.placeholder(
        tf.float32, shape=(None,) + self.preprocessor.shape)
    # Targets of the value function.
    self.returns = tf.placeholder(tf.float32, shape=(None,))
    # Advantage values in the policy gradient estimator.
    self.advantages = tf.placeholder(tf.float32, shape=(None,))

    action_space = self.env.action_space
    if isinstance(action_space, gym.spaces.Box):
        self.actions = tf.placeholder(
            tf.float32, shape=(None, action_space.shape[0]))
    elif isinstance(action_space, gym.spaces.Discrete):
        self.actions = tf.placeholder(tf.int64, shape=(None,))
    else:
        raise NotImplementedError(
            "action space " + str(type(action_space)) +
            " currently not supported")
    self.distribution_class, self.logit_dim = ModelCatalog.get_action_dist(
        action_space)
    # Log probabilities from the policy before the policy update.
    self.prev_logits = tf.placeholder(tf.float32,
                                      shape=(None, self.logit_dim))
    # Value function predictions before the policy update.
    self.prev_vf_preds = tf.placeholder(tf.float32, shape=(None,))

    assert config["sgd_batchsize"] % len(devices) == 0, \
        "Batch size must be evenly divisible by devices"
    if is_remote:
        self.batch_size = config["rollout_batchsize"]
        self.per_device_batch_size = config["rollout_batchsize"]
    else:
        self.batch_size = config["sgd_batchsize"]
        self.per_device_batch_size = int(self.batch_size / len(devices))

    def build_loss(obs, rets, advs, acts, plog, pvf_preds):
        return ProximalPolicyLoss(
            self.env.observation_space, self.env.action_space,
            obs, rets, advs, acts, plog, pvf_preds, self.logit_dim,
            self.kl_coeff, self.distribution_class, self.config,
            self.sess)

    self.par_opt = LocalSyncParallelOptimizer(
        tf.train.AdamOptimizer(self.config["sgd_stepsize"]),
        self.devices,
        [self.observations, self.returns, self.advantages,
         self.actions, self.prev_logits, self.prev_vf_preds],
        self.per_device_batch_size,
        build_loss,
        self.logdir)

    # Metric ops
    with tf.name_scope("test_outputs"):
        policies = self.par_opt.get_device_losses()
        self.mean_loss = tf.reduce_mean(
            tf.stack(values=[policy.loss for policy in policies]), 0)
        self.mean_policy_loss = tf.reduce_mean(
            tf.stack(
                values=[policy.mean_policy_loss for policy in policies]), 0)
        self.mean_vf_loss = tf.reduce_mean(
            tf.stack(
                values=[policy.mean_vf_loss for policy in policies]), 0)
        self.mean_kl = tf.reduce_mean(
            tf.stack(values=[policy.mean_kl for policy in policies]), 0)
        self.mean_entropy = tf.reduce_mean(
            tf.stack(
                values=[policy.mean_entropy for policy in policies]), 0)

    # References to the model weights
    self.common_policy = self.par_opt.get_common_loss()
    self.variables = ray.experimental.TensorFlowVariables(
        self.common_policy.loss, self.sess)
    if config["observation_filter"] == "MeanStdFilter":
        self.observation_filter = MeanStdFilter(self.preprocessor.shape,
                                                clip=None)
    elif config["observation_filter"] == "NoFilter":
        self.observation_filter = NoFilter()
    else:
        raise Exception("Unknown observation_filter: " +
                        str(config["observation_filter"]))
    self.reward_filter = NoFilter()
    self.sess.run(tf.global_variables_initializer())
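# Illustrative only, not part of the original class: the per-device batch size
# computed above simply splits the SGD minibatch evenly across the configured
# devices (e.g. sgd_batchsize=128 on ["/gpu:0", "/gpu:1"] gives 64 samples per
# device); the assert guards against uneven splits.
def example_per_device_batch_size(sgd_batchsize, devices):
    assert sgd_batchsize % len(devices) == 0, \
        "Batch size must be evenly divisible by devices"
    return sgd_batchsize // len(devices)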