import math

import numpy as np
import tensorflow as tf


def diag_normal_kl(mean0, logstd0, mean1, logstd1):
  """KL divergence of two normals with diagonal covariance."""
  logstd0_2, logstd1_2 = 2 * logstd0, 2 * logstd1
  return 0.5 * (tf.reduce_sum(tf.exp(logstd0_2 - logstd1_2), -1) +
                tf.reduce_sum((mean1 - mean0)**2 / tf.exp(logstd1_2), -1) +
                tf.reduce_sum(logstd1_2, -1) - tf.reduce_sum(logstd0_2, -1) -
                mean0.shape[-1].value)
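
# Hedged usage sketch (TF1-style graph code; the shapes and values below are
# made up for illustration): KL between a batch of 3 standard normals and
# slightly shifted, narrower normals over a 2-dimensional action space.
example_mean0 = tf.zeros([3, 2])
example_logstd0 = tf.zeros([3, 2])
example_mean1 = tf.fill([3, 2], 0.5)
example_logstd1 = tf.fill([3, 2], -0.2)
example_kl = diag_normal_kl(example_mean0, example_logstd0,
                            example_mean1, example_logstd1)  # shape [3]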

def update(self, value):
  """Update the mean and variance estimates.

  Args:
    value: Batch or single value tensor.

  Returns:
    Summary tensor.
  """
  with tf.name_scope(self._name + '/update'):
    if value.shape.ndims == self._mean.shape.ndims:
      # Add a batch dimension if necessary.
      value = value[None, ...]
    count = tf.shape(value)[0]
    with tf.control_dependencies([self._count.assign_add(count)]):
      step = tf.cast(self._count, tf.float32)
      mean_delta = tf.reduce_sum(value - self._mean[None, ...], 0)
      new_mean = self._mean + mean_delta / step
      new_mean = tf.cond(self._count > 1, lambda: new_mean, lambda: value[0])
      var_delta = (
          value - self._mean[None, ...]) * (value - new_mean[None, ...])
      new_var_sum = self._var_sum + tf.reduce_sum(var_delta, 0)
    with tf.control_dependencies([new_mean, new_var_sum]):
      update = self._mean.assign(new_mean), self._var_sum.assign(new_var_sum)
    with tf.control_dependencies(update):
      if value.shape.ndims == 1:
        value = tf.reduce_mean(value)
      return self._summary('value', tf.reduce_mean(value))
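
# A minimal NumPy sketch (names made up) of the batched, Welford-style update
# the method above runs inside the TensorFlow graph: grow the mean by the
# batch's summed deviation, and accumulate the variance sum using deviations
# from the old and new means.
def _streaming_update_sketch(mean, var_sum, count, batch):
  batch = np.atleast_2d(batch)
  count = count + len(batch)
  new_mean = mean + (batch - mean).sum(0) / count
  if count == 1:
    new_mean = batch[0]
  var_sum = var_sum + ((batch - mean) * (batch - new_mean)).sum(0)
  return new_mean, var_sum, count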

def calc_bound_loss(x_tf, bound_min, bound_max):
  """Penalty for values of x_tf that violate the given bounds."""
  violation_min = tf.minimum(x_tf - bound_min, 0)
  violation_max = tf.maximum(x_tf - bound_max, 0)
  violation = tf.reduce_sum(tf.square(violation_min), axis=-1) + tf.reduce_sum(
      tf.square(violation_max), axis=-1)
  loss = 0.5 * tf.reduce_mean(violation)
  return loss
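
# Hedged usage sketch with made-up values: only coordinates outside
# [bound_min, bound_max] contribute a squared penalty, so here the violations
# are 0.5 and -1.0 and loss = 0.5 * mean([0.5**2, 1.0**2]) = 0.3125.
example_actions = tf.constant([[0.5, 1.5], [-2.0, 0.0]])
example_bound_loss = calc_bound_loss(example_actions, -1.0, 1.0)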

def calc_logp_gaussian(x_tf, mean_tf, std_tf):
  """Log density of a Gaussian with diagonal covariance, evaluated at x_tf."""
  dim = tf.to_float(tf.shape(x_tf)[-1])

  if mean_tf is None:
    diff_tf = x_tf
  else:
    diff_tf = x_tf - mean_tf

  logp_tf = -0.5 * tf.reduce_sum(tf.square(diff_tf / std_tf), axis=-1)
  logp_tf += -0.5 * dim * np.log(2 * np.pi) - tf.reduce_sum(
      tf.log(std_tf), axis=-1)
  return logp_tf
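
# Hedged numeric cross-check (made-up shapes): with x equal to the (implicit
# zero) mean and unit std, each log density should equal
# -0.5 * dim * log(2 * pi) ≈ -2.7568 for dim = 3.
example_logp = calc_logp_gaussian(tf.zeros([5, 3]), None, tf.ones([5, 3]))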

def append(self, transitions, rows=None):
  """Append a batch of transitions to rows of the memory.

  Args:
    transitions: Tuple of transition quantities with batch dimension.
    rows: Episodes to append to, defaults to all.

  Returns:
    Operation.
  """
  rows = tf.range(self._capacity) if rows is None else rows
  assert rows.shape.ndims == 1
  assert_capacity = tf.assert_less(
      rows, self._capacity, message='capacity exceeded')
  with tf.control_dependencies([assert_capacity]):
    assert_max_length = tf.assert_less(
        tf.gather(self._length, rows), self._max_length,
        message='max length exceeded')
  append_ops = []
  with tf.control_dependencies([assert_max_length]):
    for buffer_, elements in zip(self._buffers, transitions):
      timestep = tf.gather(self._length, rows)
      indices = tf.stack([rows, timestep], 1)
      append_ops.append(tf.scatter_nd_update(buffer_, indices, elements))
  with tf.control_dependencies(append_ops):
    episode_mask = tf.reduce_sum(
        tf.one_hot(rows, self._capacity, dtype=tf.int32), 0)
    return self._length.assign_add(episode_mask)
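
# Illustrative NumPy analogue (names made up) of the scatter update above:
# each selected episode row gets one new element written at its current
# length, and that row's length counter is then incremented.
example_buffer = np.zeros([4, 5])          # [capacity, max_length]
example_length = np.array([2, 0, 3, 1])    # current episode lengths
example_rows = np.array([0, 2])            # episodes being appended to
example_elements = np.array([7.0, 9.0])
example_buffer[example_rows, example_length[example_rows]] = example_elements
example_length[example_rows] += 1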

def submit(self, value):
  """Submit a single or batch tensor to refine the streaming mean."""
  # Add a batch dimension if necessary.
  if value.shape.ndims == self._sum.shape.ndims:
    value = value[None, ...]
  return tf.group(
      self._sum.assign_add(tf.reduce_sum(value, 0)),
      self._count.assign_add(tf.shape(value)[0]))
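
# Illustrative NumPy equivalent (names made up) of the streaming mean this
# maintains: after any number of submits, the mean is the accumulated sum
# divided by the accumulated count.
example_batch = np.array([[1.0, 2.0], [3.0, 4.0]])
example_sum = example_batch.sum(0)       # running element-wise sum
example_count = example_batch.shape[0]   # running number of submitted rows
example_streaming_mean = example_sum / example_count  # -> [2.0, 3.0]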

def _build_losses(self, json_data):
  # Weight decay coefficients default to 0 when absent from the config.
  actor_weight_decay = json_data.get(self.ACTOR_WEIGHT_DECAY_KEY, 0)
  critic_weight_decay = json_data.get(self.CRITIC_WEIGHT_DECAY_KEY, 0)

  # Critic: squared error between normalized target and predicted values.
  norm_val_diff = self.val_norm.normalize_tf(
      self.tar_val_tf) - self.val_norm.normalize_tf(self.critic_tf)
  self.critic_loss_tf = 0.5 * tf.reduce_mean(tf.square(norm_val_diff))

  if critic_weight_decay != 0:
    self.critic_loss_tf += critic_weight_decay * self._weight_decay_loss(
        'main/critic')

  # Actor: advantage-weighted squared difference between sampled and mean
  # actions, in normalized action space.
  norm_a_mean_tf = self.a_norm.normalize_tf(self.actor_tf)
  norm_a_diff = self.a_norm.normalize_tf(self.a_tf) - norm_a_mean_tf

  self.actor_loss_tf = tf.reduce_sum(tf.square(norm_a_diff), axis=-1)
  self.actor_loss_tf *= self.adv_tf
  self.actor_loss_tf = 0.5 * tf.reduce_mean(self.actor_loss_tf)

  # Penalize action means that leave the normalized action bounds.
  norm_a_bound_min = self.a_norm.normalize(self.a_bound_min)
  norm_a_bound_max = self.a_norm.normalize(self.a_bound_max)
  a_bound_loss = TFUtil.calc_bound_loss(norm_a_mean_tf, norm_a_bound_min,
                                        norm_a_bound_max)
  a_bound_loss /= self.exp_params_curr.noise
  self.actor_loss_tf += a_bound_loss

  if actor_weight_decay != 0:
    self.actor_loss_tf += actor_weight_decay * self._weight_decay_loss(
        'main/actor')
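
# Hedged NumPy sketch (made-up values) of the two core losses assembled above,
# with normalization, bound loss, and weight decay omitted: a squared value
# error for the critic and an advantage-weighted squared action difference for
# the actor.
example_tar_val = np.array([1.0, 0.5])
example_val = np.array([0.8, 0.7])
example_critic_loss = 0.5 * np.mean((example_tar_val - example_val)**2)

example_a = np.array([[0.1, -0.2], [0.3, 0.0]])
example_a_mean = np.array([[0.0, 0.0], [0.2, 0.1]])
example_adv = np.array([0.5, -1.0])
example_actor_loss = 0.5 * np.mean(
    example_adv * ((example_a - example_a_mean)**2).sum(-1))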

def _policy_loss(self, mean, logstd, old_mean, old_logstd, action, advantage,
                 length):
  """Compute the policy loss composed of multiple components.

  1. The policy gradient loss is importance sampled from the data-collecting
     policy at the beginning of training.
  2. The second term is a KL penalty between the policy at the beginning of
     training and the current policy.
  3. Additionally, if this KL already changed more than twice the target
     amount, we activate a strong penalty discouraging further divergence.

  Args:
    mean: Sequences of action means of the current policy.
    logstd: Sequences of action log stddevs of the current policy.
    old_mean: Sequences of action means of the behavioral policy.
    old_logstd: Sequences of action log stddevs of the behavioral policy.
    action: Sequences of actions.
    advantage: Sequences of advantages.
    length: Batch of sequence lengths.

  Returns:
    Tuple of loss tensor and summary tensor.
  """
  with tf.name_scope('policy_loss'):
    entropy = utility.diag_normal_entropy(mean, logstd)
    kl = tf.reduce_mean(
        self._mask(
            utility.diag_normal_kl(old_mean, old_logstd, mean, logstd),
            length), 1)
    policy_gradient = tf.exp(
        utility.diag_normal_logpdf(mean, logstd, action) -
        utility.diag_normal_logpdf(old_mean, old_logstd, action))
    surrogate_loss = -tf.reduce_mean(
        self._mask(policy_gradient * tf.stop_gradient(advantage), length), 1)
    kl_penalty = self._penalty * kl
    cutoff_threshold = self._config.kl_target * self._config.kl_cutoff_factor
    cutoff_count = tf.reduce_sum(tf.cast(kl > cutoff_threshold, tf.int32))
    with tf.control_dependencies([
        tf.cond(cutoff_count > 0,
                lambda: tf.Print(0, [cutoff_count], 'kl cutoff! '), int)
    ]):
      kl_cutoff = (self._config.kl_cutoff_coef *
                   tf.cast(kl > cutoff_threshold, tf.float32) *
                   (kl - cutoff_threshold)**2)
    policy_loss = surrogate_loss + kl_penalty + kl_cutoff
    summary = tf.summary.merge([
        tf.summary.histogram('entropy', entropy),
        tf.summary.histogram('kl', kl),
        tf.summary.histogram('surrogate_loss', surrogate_loss),
        tf.summary.histogram('kl_penalty', kl_penalty),
        tf.summary.histogram('kl_cutoff', kl_cutoff),
        tf.summary.histogram('kl_penalty_combined', kl_penalty + kl_cutoff),
        tf.summary.histogram('policy_loss', policy_loss),
        tf.summary.scalar('avg_surr_loss', tf.reduce_mean(surrogate_loss)),
        tf.summary.scalar('avg_kl_penalty', tf.reduce_mean(kl_penalty)),
        tf.summary.scalar('avg_policy_loss', tf.reduce_mean(policy_loss))
    ])
    policy_loss = tf.reduce_mean(policy_loss, 0)
    return tf.check_numerics(policy_loss, 'policy_loss'), summary
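
# Hedged NumPy sketch (made-up per-sequence values) of the penalty-based
# objective assembled above; sequence masking and summaries are omitted.
example_ratio = np.array([1.1, 0.9])       # exp(logp_new - logp_old)
example_advantage = np.array([0.5, -0.3])
example_kl = np.array([0.01, 0.08])
example_penalty, example_kl_target = 1.0, 0.01
example_cutoff_factor, example_cutoff_coef = 2.0, 1000.0

example_surrogate = -(example_ratio * example_advantage)
example_kl_penalty = example_penalty * example_kl
example_threshold = example_kl_target * example_cutoff_factor
example_kl_cutoff = (example_cutoff_coef *
                     (example_kl > example_threshold) *
                     (example_kl - example_threshold)**2)
example_policy_loss = np.mean(
    example_surrogate + example_kl_penalty + example_kl_cutoff)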

def diag_normal_entropy(mean, logstd):
  """Entropy of a normal with diagonal covariance."""
  constant = mean.shape[-1].value * math.log(2 * math.pi * math.e)
  return (constant + tf.reduce_sum(2 * logstd, 1)) / 2
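
# Hedged numeric cross-check (made-up batch of 2, dimension 3): with
# logstd = 0 each entry should be 0.5 * 3 * log(2 * pi * e) ≈ 4.2568.
example_entropy = diag_normal_entropy(tf.zeros([2, 3]), tf.zeros([2, 3]))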

def diag_normal_logpdf(mean, logstd, loc):
  """Log density of a normal with diagonal covariance."""
  constant = -0.5 * math.log(2 * math.pi) - logstd
  value = -0.5 * ((loc - mean) / tf.exp(logstd))**2
  return tf.reduce_sum(constant + value, -1)
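
# Hedged numeric cross-check (made-up shapes): at loc == mean with logstd = 0
# each log density should be -0.5 * 2 * log(2 * pi) ≈ -1.8379.
example_logpdf = diag_normal_logpdf(tf.zeros([4, 2]), tf.zeros([4, 2]),
                                    tf.zeros([4, 2]))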