def update(self, value): """Update the mean and variance estimates. Args: value: Batch or single value tensor. Returns: Summary tensor. """ with tf.name_scope(self._name + '/update'): if value.shape.ndims == self._mean.shape.ndims: # Add a batch dimension if necessary. value = value[None, ...] count = tf.shape(value)[0] with tf.control_dependencies([self._count.assign_add(count)]): step = tf.cast(self._count, tf.float32) mean_delta = tf.reduce_sum(value - self._mean[None, ...], 0) new_mean = self._mean + mean_delta / step new_mean = tf.cond(self._count > 1, lambda: new_mean, lambda: value[0]) var_delta = (value - self._mean[None, ...]) * ( value - new_mean[None, ...]) new_var_sum = self._var_sum + tf.reduce_sum(var_delta, 0) with tf.control_dependencies([new_mean, new_var_sum]): update = self._mean.assign(new_mean), self._var_sum.assign( new_var_sum) with tf.control_dependencies(update): if value.shape.ndims == 1: value = tf.reduce_mean(value) return self._summary('value', tf.reduce_mean(value))
def _build_losses(self, json_data): actor_weight_decay = 0 if ( self.ACTOR_WEIGHT_DECAY_KEY not in json_data) else json_data[self.ACTOR_WEIGHT_DECAY_KEY] critic_weight_decay = 0 if ( self.CRITIC_WEIGHT_DECAY_KEY not in json_data) else json_data[self.CRITIC_WEIGHT_DECAY_KEY] norm_val_diff = self.val_norm.normalize_tf( self.tar_val_tf) - self.val_norm.normalize_tf(self.critic_tf) self.critic_loss_tf = 0.5 * tf.reduce_mean(tf.square(norm_val_diff)) if (critic_weight_decay != 0): self.critic_loss_tf += critic_weight_decay * self._weight_decay_loss( 'main/critic') norm_a_mean_tf = self.a_norm.normalize_tf(self.actor_tf) norm_a_diff = self.a_norm.normalize_tf(self.a_tf) - norm_a_mean_tf self.actor_loss_tf = tf.reduce_sum(tf.square(norm_a_diff), axis=-1) self.actor_loss_tf *= self.adv_tf self.actor_loss_tf = 0.5 * tf.reduce_mean(self.actor_loss_tf) norm_a_bound_min = self.a_norm.normalize(self.a_bound_min) norm_a_bound_max = self.a_norm.normalize(self.a_bound_max) a_bound_loss = TFUtil.calc_bound_loss(norm_a_mean_tf, norm_a_bound_min, norm_a_bound_max) a_bound_loss /= self.exp_params_curr.noise self.actor_loss_tf += a_bound_loss if (actor_weight_decay != 0): self.actor_loss_tf += actor_weight_decay * self._weight_decay_loss( 'main/actor') return
def _value_loss(self, observ, reward, length): """Compute the loss function for the value baseline. The value loss is the difference between empirical and approximated returns over the collected episodes. Returns the loss tensor and a summary strin. Args: observ: Sequences of observations. reward: Sequences of reward. length: Batch of sequence lengths. Returns: Tuple of loss tensor and summary tensor. """ with tf.name_scope('value_loss'): value = self._network(observ, length).value return_ = utility.discounted_return(reward, length, self._config.discount) advantage = return_ - value value_loss = 0.5 * self._mask(advantage**2, length) summary = tf.summary.merge([ tf.summary.histogram('value_loss', value_loss), tf.summary.scalar('avg_value_loss', tf.reduce_mean(value_loss)) ]) value_loss = tf.reduce_mean(value_loss) return tf.check_numerics(value_loss, 'value_loss'), summary
def _define_experience(self, agent_indices, observ, action, reward): """Implement the branch of experience() entered during training.""" update_filters = tf.summary.merge([ self._observ_filter.update(observ), self._reward_filter.update(reward) ]) with tf.control_dependencies([update_filters]): if self._config.train_on_agent_action: # NOTE: Doesn't seem to change much. action = self._last_action batch = (observ, action, tf.gather(self._last_mean, agent_indices), tf.gather(self._last_logstd, agent_indices), reward) append = self._episodes.append(batch, agent_indices) with tf.control_dependencies([append]): norm_observ = self._observ_filter.transform(observ) norm_reward = tf.reduce_mean(self._reward_filter.transform(reward)) # pylint: disable=g-long-lambda summary = tf.cond( self._should_log, lambda: tf.summary.merge([ update_filters, self._observ_filter.summary(), self._reward_filter.summary(), tf.summary.scalar('memory_size', self._memory_index), tf.summary.histogram('normalized_observ', norm_observ), tf.summary.histogram('action', self._last_action), tf.summary.scalar('normalized_reward', norm_reward) ]), str) return summary
def _perform_update_steps(self, observ, action, old_mean, old_logstd, reward, length): """Perform multiple update steps of value function and policy. The advantage is computed once at the beginning and shared across iterations. We need to decide for the summary of one iteration, and thus choose the one after half of the iterations. Args: observ: Sequences of observations. action: Sequences of actions. old_mean: Sequences of action means of the behavioral policy. old_logstd: Sequences of action log stddevs of the behavioral policy. reward: Sequences of rewards. length: Batch of sequence lengths. Returns: Summary tensor. """ return_ = utility.discounted_return(reward, length, self._config.discount) value = self._network(observ, length).value if self._config.gae_lambda: advantage = utility.lambda_return(reward, value, length, self._config.discount, self._config.gae_lambda) else: advantage = return_ - value mean, variance = tf.nn.moments(advantage, axes=[0, 1], keep_dims=True) advantage = (advantage - mean) / (tf.sqrt(variance) + 1e-8) advantage = tf.Print(advantage, [tf.reduce_mean(return_), tf.reduce_mean(value)], 'return and value: ') advantage = tf.Print(advantage, [tf.reduce_mean(advantage)], 'normalized advantage: ') # pylint: disable=g-long-lambda value_loss, policy_loss, summary = tf.scan( lambda _1, _2: self._update_step( observ, action, old_mean, old_logstd, reward, advantage, length ), tf.range(self._config.update_epochs), [0., 0., ''], parallel_iterations=1) print_losses = tf.group( tf.Print(0, [tf.reduce_mean(value_loss)], 'value loss: '), tf.Print(0, [tf.reduce_mean(policy_loss)], 'policy loss: ')) with tf.control_dependencies([value_loss, policy_loss, print_losses]): return summary[self._config.update_epochs // 2]
def _build_losses(self, json_data): actor_weight_decay = 0 if ( self.ACTOR_WEIGHT_DECAY_KEY not in json_data) else json_data[self.ACTOR_WEIGHT_DECAY_KEY] critic_weight_decay = 0 if ( self.CRITIC_WEIGHT_DECAY_KEY not in json_data) else json_data[self.CRITIC_WEIGHT_DECAY_KEY] norm_val_diff = self.val_norm.normalize_tf( self.tar_val_tf) - self.val_norm.normalize_tf(self.critic_tf) self.critic_loss_tf = 0.5 * tf.reduce_mean(tf.square(norm_val_diff)) if (critic_weight_decay != 0): self.critic_loss_tf += critic_weight_decay * self._weight_decay_loss( 'main/critic') norm_tar_a_tf = self.a_norm.normalize_tf(self.a_tf) self._norm_a_mean_tf = self.a_norm.normalize_tf(self.a_mean_tf) self.logp_tf = TFUtil.calc_logp_gaussian(norm_tar_a_tf, self._norm_a_mean_tf, self.norm_a_std_tf) ratio_tf = tf.exp(self.logp_tf - self.old_logp_tf) actor_loss0 = self.adv_tf * ratio_tf actor_loss1 = self.adv_tf * tf.clip_by_value( ratio_tf, 1.0 - self.ratio_clip, 1 + self.ratio_clip) self.actor_loss_tf = -tf.reduce_mean( tf.minimum(actor_loss0, actor_loss1)) norm_a_bound_min = self.a_norm.normalize(self.a_bound_min) norm_a_bound_max = self.a_norm.normalize(self.a_bound_max) a_bound_loss = TFUtil.calc_bound_loss(self._norm_a_mean_tf, norm_a_bound_min, norm_a_bound_max) self.actor_loss_tf += a_bound_loss if (actor_weight_decay != 0): self.actor_loss_tf += actor_weight_decay * self._weight_decay_loss( 'main/actor') # for debugging self.clip_frac_tf = tf.reduce_mean( tf.to_float(tf.greater(tf.abs(ratio_tf - 1.0), self.ratio_clip))) return
def calc_bound_loss(x_tf, bound_min, bound_max): # penalty for violating bounds violation_min = tf.minimum(x_tf - bound_min, 0) violation_max = tf.maximum(x_tf - bound_max, 0) violation = tf.reduce_sum(tf.square(violation_min), axis=-1) + tf.reduce_sum( tf.square(violation_max), axis=-1) loss = 0.5 * tf.reduce_mean(violation) return loss
def _adjust_penalty(self, observ, old_mean, old_logstd, length): """Adjust the KL policy between the behavioral and current policy. Compute how much the policy actually changed during the multiple update steps. Adjust the penalty strength for the next training phase if we overshot or undershot the target divergence too much. Args: observ: Sequences of observations. old_mean: Sequences of action means of the behavioral policy. old_logstd: Sequences of action log stddevs of the behavioral policy. length: Batch of sequence lengths. Returns: Summary tensor. """ with tf.name_scope('adjust_penalty'): network = self._network(observ, length) assert_change = tf.assert_equal(tf.reduce_all( tf.equal(network.mean, old_mean)), False, message='policy should change') print_penalty = tf.Print(0, [self._penalty], 'current penalty: ') with tf.control_dependencies([assert_change, print_penalty]): kl_change = tf.reduce_mean( self._mask( utility.diag_normal_kl(old_mean, old_logstd, network.mean, network.logstd), length)) kl_change = tf.Print(kl_change, [kl_change], 'kl change: ') maybe_increase = tf.cond( kl_change > 1.3 * self._config.kl_target, # pylint: disable=g-long-lambda lambda: tf.Print(self._penalty.assign(self._penalty * 1.5), [0], 'increase penalty '), float) maybe_decrease = tf.cond( kl_change < 0.7 * self._config.kl_target, # pylint: disable=g-long-lambda lambda: tf.Print(self._penalty.assign(self._penalty / 1.5), [0], 'decrease penalty '), float) with tf.control_dependencies([maybe_increase, maybe_decrease]): return tf.summary.merge([ tf.summary.scalar('kl_change', kl_change), tf.summary.scalar('penalty', self._penalty) ])
def _update_value(self, observ, reward, length): """Perform multiple update steps of the value baseline. We need to decide for the summary of one iteration, and thus choose the one after half of the iterations. Args: observ: Sequences of observations. reward: Sequences of reward. length: Batch of sequence lengths. Returns: Summary tensor. """ with tf.name_scope('update_value'): loss, summary = tf.scan( lambda _1, _2: self._update_value_step(observ, reward, length), tf.range(self._config.update_epochs_value), [0., ''], parallel_iterations=1) print_loss = tf.Print(0, [tf.reduce_mean(loss)], 'value loss: ') with tf.control_dependencies([loss, print_loss]): return summary[self._config.update_epochs_value // 2]
def _policy_loss(self, mean, logstd, old_mean, old_logstd, action, advantage, length): """Compute the policy loss composed of multiple components. 1. The policy gradient loss is importance sampled from the data-collecting policy at the beginning of training. 2. The second term is a KL penalty between the policy at the beginning of training and the current policy. 3. Additionally, if this KL already changed more than twice the target amount, we activate a strong penalty discouraging further divergence. Args: mean: Sequences of action means of the current policy. logstd: Sequences of action log stddevs of the current policy. old_mean: Sequences of action means of the behavioral policy. old_logstd: Sequences of action log stddevs of the behavioral policy. action: Sequences of actions. advantage: Sequences of advantages. length: Batch of sequence lengths. Returns: Tuple of loss tensor and summary tensor. """ with tf.name_scope('policy_loss'): entropy = utility.diag_normal_entropy(mean, logstd) kl = tf.reduce_mean( self._mask( utility.diag_normal_kl(old_mean, old_logstd, mean, logstd), length), 1) policy_gradient = tf.exp( utility.diag_normal_logpdf(mean, logstd, action) - utility.diag_normal_logpdf(old_mean, old_logstd, action)) surrogate_loss = -tf.reduce_mean( self._mask(policy_gradient * tf.stop_gradient(advantage), length), 1) kl_penalty = self._penalty * kl cutoff_threshold = self._config.kl_target * self._config.kl_cutoff_factor cutoff_count = tf.reduce_sum( tf.cast(kl > cutoff_threshold, tf.int32)) with tf.control_dependencies([ tf.cond(cutoff_count > 0, lambda: tf.Print(0, [cutoff_count], 'kl cutoff! '), int) ]): kl_cutoff = (self._config.kl_cutoff_coef * tf.cast(kl > cutoff_threshold, tf.float32) * (kl - cutoff_threshold)**2) policy_loss = surrogate_loss + kl_penalty + kl_cutoff summary = tf.summary.merge([ tf.summary.histogram('entropy', entropy), tf.summary.histogram('kl', kl), tf.summary.histogram('surrogate_loss', surrogate_loss), tf.summary.histogram('kl_penalty', kl_penalty), tf.summary.histogram('kl_cutoff', kl_cutoff), tf.summary.histogram('kl_penalty_combined', kl_penalty + kl_cutoff), tf.summary.histogram('policy_loss', policy_loss), tf.summary.scalar('avg_surr_loss', tf.reduce_mean(surrogate_loss)), tf.summary.scalar('avg_kl_penalty', tf.reduce_mean(kl_penalty)), tf.summary.scalar('avg_policy_loss', tf.reduce_mean(policy_loss)) ]) policy_loss = tf.reduce_mean(policy_loss, 0) return tf.check_numerics(policy_loss, 'policy_loss'), summary