def define_ppo_step(data_points, hparams, action_space, lr):
  """Define ppo step."""
  observation, action, discounted_reward, norm_advantage, old_pdf = data_points

  (logits, new_value) = get_policy(observation, hparams, action_space)
  new_policy_dist = tfp.distributions.Categorical(logits=logits)

  new_pdf = new_policy_dist.prob(action)

  ratio = new_pdf / old_pdf
  clipped_ratio = tf.clip_by_value(ratio, 1 - hparams.clipping_coef,
                                   1 + hparams.clipping_coef)

  surrogate_objective = tf.minimum(clipped_ratio * norm_advantage,
                                   ratio * norm_advantage)
  policy_loss = -tf.reduce_mean(surrogate_objective)

  value_error = new_value - discounted_reward
  value_loss = hparams.value_loss_coef * tf.reduce_mean(value_error**2)

  entropy = new_policy_dist.entropy()
  entropy_loss = -hparams.entropy_loss_coef * tf.reduce_mean(entropy)

  losses = [policy_loss, value_loss, entropy_loss]
  loss = sum(losses)

  train_op = optimize.optimize(loss, lr, hparams)

  with tf.control_dependencies([train_op]):
    return [tf.identity(x) for x in losses]
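# A minimal NumPy sketch (not part of the original code) of the clipped
# surrogate objective computed in define_ppo_step above. The toy numbers and
# the local name `clipping_coef` (standing in for hparams.clipping_coef) are
# assumptions for illustration only.
import numpy as np

new_pdf = np.array([0.30, 0.10, 0.55])       # pi_new(a | s) per timestep
old_pdf = np.array([0.25, 0.20, 0.50])       # pi_old(a | s) per timestep
norm_advantage = np.array([1.2, -0.8, 0.3])  # normalized advantages
clipping_coef = 0.2                          # assumed toy value

ratio = new_pdf / old_pdf
clipped_ratio = np.clip(ratio, 1 - clipping_coef, 1 + clipping_coef)
# Pointwise minimum of the clipped and unclipped objectives; the negated mean
# is the policy loss, mirroring the TF graph above.
surrogate_objective = np.minimum(clipped_ratio * norm_advantage,
                                 ratio * norm_advantage)
policy_loss = -surrogate_objective.mean()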
def define_ppo_step(data_points, hparams, action_space, lr):
  """Define ppo step."""
  observation, action, discounted_reward, norm_advantage, old_pdf = data_points

  obs_shape = common_layers.shape_list(observation)
  observation = tf.reshape(
      observation, [obs_shape[0] * obs_shape[1]] + obs_shape[2:])
  (logits, new_value) = get_policy(observation, hparams, action_space)
  logits = tf.reshape(logits, obs_shape[:2] + [action_space.n])
  new_value = tf.reshape(new_value, obs_shape[:2])
  new_policy_dist = tf.distributions.Categorical(logits=logits)

  new_pdf = new_policy_dist.prob(action)

  ratio = new_pdf / old_pdf
  clipped_ratio = tf.clip_by_value(ratio, 1 - hparams.clipping_coef,
                                   1 + hparams.clipping_coef)

  surrogate_objective = tf.minimum(clipped_ratio * norm_advantage,
                                   ratio * norm_advantage)
  policy_loss = -tf.reduce_mean(surrogate_objective)

  value_error = new_value - discounted_reward
  value_loss = hparams.value_loss_coef * tf.reduce_mean(value_error**2)

  entropy = new_policy_dist.entropy()
  entropy_loss = -hparams.entropy_loss_coef * tf.reduce_mean(entropy)

  losses = [policy_loss, value_loss, entropy_loss]
  loss = sum(losses)

  variables = tf.global_variables(hparams.policy_network + "/.*")
  train_op = optimize.optimize(loss, lr, hparams, variables=variables)

  with tf.control_dependencies([train_op]):
    return [tf.identity(x) for x in losses]
def __init__(self, batch_size, observation_space, action_space,
             policy_hparams, policy_dir, sampling_temp):
  super(PolicyAgent, self).__init__(
      batch_size, observation_space, action_space
  )
  self._sampling_temp = sampling_temp
  with tf.Graph().as_default():
    self._observations_t = tf.placeholder(
        shape=((batch_size,) + self.observation_space.shape),
        dtype=self.observation_space.dtype
    )
    (logits, self._values_t) = rl.get_policy(
        self._observations_t, policy_hparams, self.action_space
    )
    actions = common_layers.sample_with_temperature(logits, sampling_temp)
    self._probs_t = tf.nn.softmax(logits / sampling_temp)
    self._actions_t = tf.cast(actions, tf.int32)
    model_saver = tf.train.Saver(
        tf.global_variables(policy_hparams.policy_network + "/.*")  # pylint: disable=unexpected-keyword-arg
    )
    self._sess = tf.Session()
    self._sess.run(tf.global_variables_initializer())
    trainer_lib.restore_checkpoint(policy_dir, model_saver, self._sess)
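# A hedged NumPy sketch (not original code) of what the agent above computes,
# assuming common_layers.sample_with_temperature scales the logits by
# 1 / temperature before sampling; the toy logits and temperature are made up.
import numpy as np

logits = np.array([2.0, 1.0, 0.1])
sampling_temp = 0.5  # assumed toy temperature

scaled = logits / sampling_temp
probs = np.exp(scaled - scaled.max())
probs /= probs.sum()                        # softmax(logits / sampling_temp)
action = np.random.choice(len(probs), p=probs)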
def _prepare_networks(self, hparams, sess):
  self.action = tf.placeholder(shape=(1,), dtype=tf.int32)
  batch_env = SimulatedBatchEnv(hparams.environment_spec, hparams.num_agents)
  self.reward, self.done = batch_env.simulate(self.action)
  self.observation = batch_env.observ
  self.reset_op = batch_env.reset(tf.constant([0], dtype=tf.int32))

  environment_wrappers = hparams.environment_spec.wrappers
  wrappers = copy.copy(environment_wrappers) if environment_wrappers else []

  to_initialize = [batch_env]
  for w in wrappers:
    batch_env = w[0](batch_env, **w[1])
    to_initialize.append(batch_env)

  def initialization_lambda():
    for batch_env in to_initialize:
      batch_env.initialize(sess)

  self.initialize = initialization_lambda

  obs_copy = batch_env.observ + 0

  actor_critic = get_policy(tf.expand_dims(obs_copy, 0), hparams)
  self.policy_probs = actor_critic.policy.probs[0, 0, :]
  self.value = actor_critic.value[0, :]
def define_ppo_step(data_points, hparams, action_space, lr,
                    distributional_size=1, distributional_subscale=0.04):
  """Define ppo step."""
  observation, action, discounted_reward, norm_advantage, old_pdf = data_points

  obs_shape = common_layers.shape_list(observation)
  observation = tf.reshape(
      observation, [obs_shape[0] * obs_shape[1]] + obs_shape[2:])
  (logits, new_value) = get_policy(observation, hparams, action_space,
                                   distributional_size=distributional_size)
  logits = tf.reshape(logits, obs_shape[:2] + [action_space.n])
  new_policy_dist = tfp.distributions.Categorical(logits=logits)

  new_pdf = new_policy_dist.prob(action)

  ratio = new_pdf / old_pdf
  clipped_ratio = tf.clip_by_value(ratio, 1 - hparams.clipping_coef,
                                   1 + hparams.clipping_coef)

  surrogate_objective = tf.minimum(clipped_ratio * norm_advantage,
                                   ratio * norm_advantage)
  policy_loss = -tf.reduce_mean(surrogate_objective)

  if distributional_size > 1:
    new_value = tf.reshape(new_value, obs_shape[:2] + [distributional_size])
    new_value = tf.nn.log_softmax(new_value, axis=-1)
    # We assume the values range over (-half, half) -- set subscale accordingly.
    half = (distributional_size // 2) * distributional_subscale
    # To turn values into bucket indices, we add half (to move the range to
    # (0, 2 * half)), divide by the subscale and floor to the nearest integer.
    quantized_dr = tf.floor(
        (discounted_reward + half) / distributional_subscale)
    hot_dr = tf.one_hot(tf.cast(quantized_dr, tf.int32), distributional_size)
    value_loss = -tf.reduce_sum(new_value * hot_dr, axis=-1)
    value_loss = hparams.value_loss_coef * tf.reduce_mean(value_loss)
  else:
    new_value = tf.reshape(new_value, obs_shape[:2])
    value_error = new_value - discounted_reward
    value_loss = hparams.value_loss_coef * tf.reduce_mean(value_error**2)

  entropy = new_policy_dist.entropy()
  entropy_loss = -hparams.entropy_loss_coef * tf.reduce_mean(entropy)

  losses = [policy_loss, value_loss, entropy_loss]
  loss = sum(losses)

  variables = tf.global_variables(hparams.policy_network + "/.*")
  train_op = optimize.optimize(loss, lr, hparams, variables=variables)

  with tf.control_dependencies([train_op]):
    return [tf.identity(x) for x in losses]
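# A hedged NumPy illustration (not part of the original code) of the return
# quantization in the distributional branch above: shift the return by `half`,
# divide by the subscale, floor to get a bucket index, and one-hot it. The
# toy parameter values and rewards below are assumptions.
import numpy as np

distributional_size = 10          # assumed toy setting
distributional_subscale = 0.04    # same default as the function signature
half = (distributional_size // 2) * distributional_subscale  # = 0.2

discounted_reward = np.array([-0.13, 0.0, 0.17])
quantized_dr = np.floor((discounted_reward + half) / distributional_subscale)
hot_dr = np.eye(distributional_size)[quantized_dr.astype(np.int32)]
# quantized_dr -> [1., 5., 9.]; each row of hot_dr has a single 1 at that index.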
def env_step(arg1, arg2, arg3):  # pylint: disable=unused-argument
  """Step of the environment."""
  actor_critic = get_policy(tf.expand_dims(obs_copy, 0), hparams)
  policy = actor_critic.policy
  action = hparams.policy_to_actions_lambda(policy)

  postprocessed_action = actor_critic.action_postprocessing(action)
  reward, done = batch_env.simulate(postprocessed_action[0, ...])

  pdf = policy.prob(action)[0]
  value_function = actor_critic.value[0]
  pdf = tf.reshape(pdf, shape=(num_agents,))
  value_function = tf.reshape(value_function, shape=(num_agents,))
  done = tf.reshape(done, shape=(num_agents,))

  with tf.control_dependencies([reward, done]):
    return tf.identity(pdf), tf.identity(value_function), tf.identity(done)
def __init__(self, hparams, action_space, observation_space, policy_dir):
  assert hparams.base_algo == "ppo"
  ppo_hparams = trainer_lib.create_hparams(hparams.base_algo_params)

  frame_stack_shape = (
      1, hparams.frame_stack_size) + observation_space.shape
  self._frame_stack = np.zeros(frame_stack_shape, dtype=np.uint8)

  with tf.Graph().as_default():
    self.obs_t = tf.placeholder(shape=self.frame_stack_shape, dtype=np.uint8)
    self.logits_t, self.value_function_t = get_policy(
        self.obs_t, ppo_hparams, action_space)
    model_saver = tf.train.Saver(
        tf.global_variables(scope=ppo_hparams.policy_network + "/.*")  # pylint: disable=unexpected-keyword-arg
    )
    self.sess = tf.Session()
    self.sess.run(tf.global_variables_initializer())
    trainer_lib.restore_checkpoint(policy_dir, model_saver, self.sess)
def env_step(arg1, arg2, arg3):  # pylint: disable=unused-argument
  """Step of the environment."""
  (logits, value_function) = get_policy(
      obs_copy, ppo_hparams, batch_env.action_space
  )
  action = common_layers.sample_with_temperature(logits, sampling_temp)
  action = tf.cast(action, tf.int32)

  reward, done = batch_env.simulate(action[:, 0, ...])

  pdf = tfp.distributions.Categorical(logits=logits).prob(action)
  pdf = tf.reshape(pdf, shape=(num_agents,))
  value_function = tf.reshape(value_function, shape=(num_agents,))
  done = tf.reshape(done, shape=(num_agents,))

  with tf.control_dependencies([reward, done]):
    return tf.identity(pdf), tf.identity(value_function), tf.identity(done)
def define_ppo_step(data_points, optimizer, hparams, action_space):
  """Define ppo step."""
  observation, action, discounted_reward, norm_advantage, old_pdf = data_points
  new_policy_dist, new_value, _ = get_policy(observation, hparams, action_space)
  new_pdf = new_policy_dist.prob(action)

  ratio = new_pdf / old_pdf
  clipped_ratio = tf.clip_by_value(ratio, 1 - hparams.clipping_coef,
                                   1 + hparams.clipping_coef)

  surrogate_objective = tf.minimum(clipped_ratio * norm_advantage,
                                   ratio * norm_advantage)
  policy_loss = -tf.reduce_mean(surrogate_objective)

  value_error = new_value - discounted_reward
  value_loss = hparams.value_loss_coef * tf.reduce_mean(value_error**2)

  entropy = new_policy_dist.entropy()
  entropy_loss = -hparams.entropy_loss_coef * tf.reduce_mean(entropy)

  losses = [policy_loss, value_loss, entropy_loss]

  gradients = [
      list(zip(*optimizer.compute_gradients(loss))) for loss in losses
  ]
  gradients_norms = [tf.global_norm(gradient[0]) for gradient in gradients]

  gradients_flat = sum([gradient[0] for gradient in gradients], ())
  gradients_variables_flat = sum([gradient[1] for gradient in gradients], ())

  if hparams.max_gradients_norm:
    gradients_flat, _ = tf.clip_by_global_norm(gradients_flat,
                                               hparams.max_gradients_norm)

  optimize_op = optimizer.apply_gradients(
      zip(gradients_flat, gradients_variables_flat))

  with tf.control_dependencies([optimize_op]):
    return [tf.identity(x) for x in losses + gradients_norms]
def define_ppo_step(data_points, hparams, action_space, lr,
                    epoch=-1, distributional_size=1,
                    distributional_subscale=0.04):
  """Define ppo step."""
  del distributional_subscale
  (observation, action, discounted_reward, discounted_reward_probs,
   norm_advantage, old_pdf) = data_points

  obs_shape = common_layers.shape_list(observation)
  observation = tf.reshape(
      observation, [obs_shape[0] * obs_shape[1]] + obs_shape[2:]
  )
  (logits, new_value) = get_policy(observation, hparams, action_space,
                                   epoch=epoch,
                                   distributional_size=distributional_size)
  logits = tf.reshape(logits, obs_shape[:2] + [action_space.n])
  new_policy_dist = tfp.distributions.Categorical(logits=logits)

  new_pdf = new_policy_dist.prob(action)

  ratio = new_pdf / old_pdf
  clipped_ratio = tf.clip_by_value(ratio, 1 - hparams.clipping_coef,
                                   1 + hparams.clipping_coef)

  surrogate_objective = tf.minimum(clipped_ratio * norm_advantage,
                                   ratio * norm_advantage)
  policy_loss = -tf.reduce_mean(surrogate_objective)

  if distributional_size > 1:
    new_value = tf.reshape(new_value, obs_shape[:2] + [distributional_size])
    new_value = tf.nn.log_softmax(new_value, axis=-1)
    value_shape = common_layers.shape_list(new_value)
    # The above is the new value distribution. We are also given as discounted
    # reward the value distribution and the corresponding probabilities.
    # The given discounted reward is already rounded to integers but in range
    # increased by 2x for greater fidelity. Increase range of new_values here.
    new_value_shifted = tf.concat([new_value[1:], new_value[-1:]], axis=0)
    new_value_mean = (new_value + new_value_shifted) / 2
    new_value = tf.concat([tf.expand_dims(new_value, axis=-1),
                           tf.expand_dims(new_value_mean, axis=-1)], -1)
    new_value = tf.reshape(new_value,
                           value_shape[:-1] + [2 * value_shape[-1]])
    # Cast discounted reward to integers and gather the new log-probs for them.
    discounted_reward = tf.cast(discounted_reward, tf.int32)
    value_loss = tf.batch_gather(new_value, discounted_reward)
    # Weight the gathered (new) log-probs by the old probabilities.
    discounted_reward_probs = tf.expand_dims(discounted_reward_probs, axis=1)
    value_loss = -tf.reduce_sum(value_loss * discounted_reward_probs, axis=-1)
    # Take the mean over batch and time as final loss, multiply by coefficient.
    value_loss = hparams.value_loss_coef * tf.reduce_mean(value_loss)
  else:
    new_value = tf.reshape(new_value, obs_shape[:2])
    value_error = new_value - discounted_reward
    value_loss = hparams.value_loss_coef * tf.reduce_mean(value_error ** 2)

  entropy = new_policy_dist.entropy()
  entropy_loss = -hparams.entropy_loss_coef * tf.reduce_mean(entropy)

  losses = [policy_loss, value_loss, entropy_loss]
  loss = sum(losses)

  variables = tf.global_variables(hparams.policy_network + "/.*")
  train_op = optimize.optimize(loss, lr, hparams, variables=variables)

  with tf.control_dependencies([train_op]):
    return [tf.identity(x) for x in losses]
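# A hedged NumPy sketch (not original code, simplified shapes) of the
# distributional value-loss step above: gather the new log-probs at the given
# return buckets and weight them by the old bucket probabilities. The toy
# distributions and indices below are assumptions; np.take_along_axis plays
# the role of tf.batch_gather for this 2-D case.
import numpy as np

new_value = np.log(np.array([[0.1, 0.2, 0.3, 0.4]]))   # new log-probs, [1, 4]
discounted_reward = np.array([[1, 3]])                 # bucket indices, [1, 2]
discounted_reward_probs = np.array([[0.25, 0.75]])     # old probs, [1, 2]

gathered = np.take_along_axis(new_value, discounted_reward, axis=-1)
value_loss = -np.sum(gathered * discounted_reward_probs, axis=-1).mean()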