def discounted_return(reward, length, discount):
  """Discounted Monte-Carlo returns."""
  timestep = tf.range(reward.shape[1].value)
  mask = tf.cast(timestep[None, :] < length[:, None], tf.float32)
  # Scan over the reversed, time-major reward sequence to accumulate the
  # discounted sum from the end of each episode back towards its beginning.
  return_ = tf.reverse(tf.transpose(tf.scan(
      lambda agg, cur: cur + discount * agg,
      tf.transpose(tf.reverse(mask * reward, [1]), [1, 0]),
      tf.zeros_like(reward[:, -1]), 1, False), [1, 0]), [1])
  return tf.check_numerics(tf.stop_gradient(return_), 'return')
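
# A minimal usage sketch for the function above (illustrative values, assuming
# TensorFlow 1.x graph mode and a session; not part of this module):
#
#   reward = tf.constant([[1., 1., 1., 1.]])
#   length = tf.constant([4])
#   return_ = discounted_return(reward, length, discount=0.5)
#   # evaluates to [[1.875, 1.75, 1.5, 1.]]: each entry is the discounted sum
#   # of the rewards from that timestep to the end of the episode.
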
def fixed_step_return(reward, value, length, discount, window):
  """N-step discounted return."""
  timestep = tf.range(reward.shape[1].value)
  mask = tf.cast(timestep[None, :] < length[:, None], tf.float32)
  return_ = tf.zeros_like(reward)
  # Sum the discounted rewards of the next `window` steps.
  for _ in range(window):
    return_ += reward
    reward = discount * tf.concat(
        [reward[:, 1:], tf.zeros_like(reward[:, -1:])], 1)
  # Bootstrap with the discounted value estimate `window` steps ahead.
  return_ += discount**window * tf.concat(
      [value[:, window:], tf.zeros_like(value[:, -window:])], 1)
  return tf.check_numerics(tf.stop_gradient(mask * return_), 'return')
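
# Usage sketch for fixed_step_return (my reading of the code; `reward` and
# `value` are [batch, time] tensors): the result at step t is
# r_t + discount * r_{t+1} + ... + discount**(window-1) * r_{t+window-1}
# + discount**window * value_{t+window}, with zero padding past the end of
# the sequence and entries beyond `length` masked out:
#
#   returns = fixed_step_return(reward, value, length, discount=0.99, window=5)
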
def lambda_advantage(reward, value, length, discount):
  """Generalized Advantage Estimation."""
  timestep = tf.range(reward.shape[1].value)
  mask = tf.cast(timestep[None, :] < length[:, None], tf.float32)
  next_value = tf.concat([value[:, 1:], tf.zeros_like(value[:, -1:])], 1)
  delta = reward + discount * next_value - value
  advantage = tf.reverse(tf.transpose(tf.scan(
      lambda agg, cur: cur + discount * agg,
      tf.transpose(tf.reverse(mask * delta, [1]), [1, 0]),
      tf.zeros_like(delta[:, -1]), 1, False), [1, 0]), [1])
  return tf.check_numerics(tf.stop_gradient(advantage), 'advantage')
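
# Usage sketch for lambda_advantage (my reading of the code): `delta` is the
# one-step TD error r_t + discount * V_{t+1} - V_t, and the scan accumulates
# advantage_t = sum_k discount**k * delta_{t+k} within each episode. The
# recursion uses `discount` alone, so a caller wanting the usual gamma * lambda
# weighting of GAE would fold lambda into the `discount` argument (an
# observation about this code, not a documented contract):
#
#   advantage = lambda_advantage(reward, value, length, discount=0.99 * 0.95)
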
def lambda_return(reward, value, length, discount, lambda_):
  """TD-lambda returns."""
  timestep = tf.range(reward.shape[1].value)
  mask = tf.cast(timestep[None, :] < length[:, None], tf.float32)
  sequence = mask * reward + discount * value * (1 - lambda_)
  discount = mask * discount * lambda_
  sequence = tf.stack([sequence, discount], 2)
  return_ = tf.reverse(tf.transpose(tf.scan(
      lambda agg, cur: cur[0] + cur[1] * agg,
      tf.transpose(tf.reverse(sequence, [1]), [1, 2, 0]),
      tf.zeros_like(value[:, -1]), 1, False), [1, 0]), [1])
  return tf.check_numerics(tf.stop_gradient(return_), 'return')
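
# Usage sketch for lambda_return (describing the recursion exactly as
# implemented above, not the textbook formulation): with G_T = 0, the scan
# computes
#
#   G_t = mask_t * r_t + discount * (1 - lambda_) * V_t
#         + mask_t * discount * lambda_ * G_{t+1}
#
# so `mask` zeroes the reward term and the recursive coupling past each
# sequence's length. Typical call:
#
#   return_ = lambda_return(reward, value, length, discount=0.99, lambda_=0.95)
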
def _policy_loss(
    self, mean, logstd, old_mean, old_logstd, action, advantage, length):
  """Compute the policy loss composed of multiple components.

  1. The policy gradient loss is importance sampled from the data-collecting
     policy at the beginning of training.
  2. The second term is a KL penalty between the policy at the beginning of
     training and the current policy.
  3. Additionally, if this KL already changed by more than `kl_cutoff_factor`
     times the target amount, we activate a strong penalty discouraging
     further divergence.

  Args:
    mean: Sequences of action means of the current policy.
    logstd: Sequences of action log stddevs of the current policy.
    old_mean: Sequences of action means of the behavioral policy.
    old_logstd: Sequences of action log stddevs of the behavioral policy.
    action: Sequences of actions.
    advantage: Sequences of advantages.
    length: Batch of sequence lengths.

  Returns:
    Tuple of loss tensor and summary tensor.
  """
  with tf.name_scope('policy_loss'):
    entropy = utility.diag_normal_entropy(mean, logstd)
    kl = tf.reduce_mean(self._mask(utility.diag_normal_kl(
        old_mean, old_logstd, mean, logstd), length), 1)
    policy_gradient = tf.exp(
        utility.diag_normal_logpdf(mean, logstd, action) -
        utility.diag_normal_logpdf(old_mean, old_logstd, action))
    surrogate_loss = -tf.reduce_mean(self._mask(
        policy_gradient * tf.stop_gradient(advantage), length), 1)
    kl_penalty = self._penalty * kl
    cutoff_threshold = self._config.kl_target * self._config.kl_cutoff_factor
    cutoff_count = tf.reduce_sum(
        tf.cast(kl > cutoff_threshold, tf.int32))
    # Print a message when the cutoff penalty is active; `int` serves as the
    # false branch and simply returns 0, matching the tf.Print branch.
    with tf.control_dependencies([tf.cond(
        cutoff_count > 0,
        lambda: tf.Print(0, [cutoff_count], 'kl cutoff! '), int)]):
      kl_cutoff = (
          self._config.kl_cutoff_coef *
          tf.cast(kl > cutoff_threshold, tf.float32) *
          (kl - cutoff_threshold)**2)
    policy_loss = surrogate_loss + kl_penalty + kl_cutoff
    summary = tf.summary.merge([
        tf.summary.histogram('entropy', entropy),
        tf.summary.histogram('kl', kl),
        tf.summary.histogram('surrogate_loss', surrogate_loss),
        tf.summary.histogram('kl_penalty', kl_penalty),
        tf.summary.histogram('kl_cutoff', kl_cutoff),
        tf.summary.histogram('kl_penalty_combined', kl_penalty + kl_cutoff),
        tf.summary.histogram('policy_loss', policy_loss),
        tf.summary.scalar('avg_surr_loss', tf.reduce_mean(surrogate_loss)),
        tf.summary.scalar('avg_kl_penalty', tf.reduce_mean(kl_penalty)),
        tf.summary.scalar('avg_policy_loss', tf.reduce_mean(policy_loss))])
    policy_loss = tf.reduce_mean(policy_loss, 0)
    return tf.check_numerics(policy_loss, 'policy_loss'), summary
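
# For reference, the three components combined above (my summary of the code,
# not an addition to it):
#
#   surrogate_loss = -mean(exp(log pi(a) - log pi_old(a)) * advantage)
#   kl_penalty     = penalty * KL(pi_old || pi)
#   kl_cutoff      = kl_cutoff_coef * (KL - kl_target * kl_cutoff_factor)**2,
#                    applied only where KL exceeds the cutoff threshold
#
# where `penalty` (self._penalty) is presumably adapted elsewhere in the class
# to keep the KL divergence near `kl_target`; that logic is outside this
# section.
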