import numpy as np
import tensorflow as tf


def value_loss(target_v, out_v, old_v):
    """Compute the clipped value loss for PPO."""
    # Clip the new value prediction to a band of width VF_CLIP around the
    # prediction made at rollout time (VF_CLIP is a module-level constant).
    vpredclipped = old_v + tf.clip_by_value(out_v - old_v, -VF_CLIP, VF_CLIP)
    vf_losses1 = tf.square(out_v - target_v)         # unclipped squared error
    vf_losses2 = tf.square(vpredclipped - target_v)  # clipped squared error
    # Element-wise maximum: clipping can only increase the loss (pessimistic).
    vf_loss = .5 * tf.reduce_mean(tf.maximum(vf_losses1, vf_losses2))
    return vf_loss
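# --- Hedged usage sketch (illustrative, not from the original source). ---
# value_loss reads VF_CLIP from module scope; the 0.2 below and the dummy
# tensors in demo_value_loss are assumptions, 0.2 being the usual PPO range.
VF_CLIP = 0.2


def demo_value_loss():
    target_v = tf.constant([1.0, 2.0, 3.0])    # bootstrapped return targets
    out_v = tf.constant([1.5, 1.0, 2.5])       # current value predictions
    old_v = tf.constant([1.2, 1.8, 2.8])       # predictions at rollout time
    return value_loss(target_v, out_v, old_v)  # scalar loss tensor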
def kl(self, other):
    """KL divergence to another diagonal Gaussian, summed over the last axis."""
    assert isinstance(other, DiagGaussianDist), 'Distribution type does not match.'
    return tf.reduce_sum(
        (tf.square(self.std) + tf.square(self.mean - other.mean))
        / (2.0 * tf.square(other.std))
        + other.log_std - self.log_std - 0.5,
        axis=-1, keepdims=True)
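# --- Hedged context sketch (assumption, not from the source). ---
# kl() above expects a DiagGaussianDist carrying mean / log_std / std; the
# real class presumably defines more, but this minimal holder is enough to
# run the excerpted methods, which are attached to it below.
class DiagGaussianDist:
    """Minimal diagonal-Gaussian holder: per-dimension mean and log-stddev."""

    def __init__(self, mean, log_std):
        self.mean = mean
        self.log_std = log_std
        self.std = tf.exp(log_std)


DiagGaussianDist.kl = kl  # wire the excerpted method onto the sketch class
# e.g. identical distributions give zero KL:
#   p = DiagGaussianDist(tf.zeros([1, 2]), tf.zeros([1, 2]))
#   p.kl(p)  # -> [[0.]]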
def critic_loss(target_v, out_v, old_v, val_clip):
    """Use clipped value loss as default."""
    vf_losses1 = tf.square(out_v - target_v)
    val_pred_clipped = old_v + tf.clip_by_value(out_v - old_v, -val_clip, val_clip)
    vf_losses2 = tf.square(val_pred_clipped - target_v)
    vf_loss = 0.5 * tf.reduce_mean(tf.maximum(vf_losses1, vf_losses2))
    return vf_loss
def impala_loss(inputs, labels):
    """Compute loss for IMPALA."""
    policy, value = inputs
    target_p, target_v, adv = labels
    # Small epsilon guards against log(0) for zero-probability actions.
    log_policy = tf.math.log(policy + 1e-10)
    entropy = (-policy * log_policy)          # per-action entropy terms
    cross_entropy = (-target_p * log_policy)  # policy-gradient surrogate
    # Advantage-weighted policy loss with an entropy bonus.
    p_loss = tf.reduce_mean(adv * cross_entropy - ENTROPY_LOSS * entropy)
    # Squared-error baseline loss on the value head.
    v_loss = 0.5 * tf.reduce_mean(tf.square(value - target_v))
    return p_loss + v_loss
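# --- Hedged usage sketch (dummy shapes and values are assumptions). ---
# impala_loss reads ENTROPY_LOSS from module scope; 0.01 is a typical
# entropy-bonus coefficient, assumed here so the sketch runs.
ENTROPY_LOSS = 0.01


def demo_impala_loss():
    policy = tf.constant([[0.2, 0.5, 0.3]])    # softmax over three actions
    value = tf.constant([[0.7]])               # value-head output
    target_p = tf.constant([[0.0, 1.0, 0.0]])  # one-hot behaviour action
    target_v = tf.constant([[1.0]])            # v-trace value target
    adv = tf.constant([[0.3]])                 # advantage estimate
    return impala_loss((policy, value), (target_p, target_v, adv))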
def calc_baseline_loss(advantages):
    """Calculate the baseline loss."""
    return 0.5 * tf.reduce_sum(tf.square(advantages))
def neglog_prob(self, x):
    """Negative log-probability of x under the diagonal Gaussian."""
    # Constant term + Mahalanobis term + log-determinant term.
    return (0.5 * np.log(2.0 * np.pi) * tf.cast(tf.shape(x)[-1], tf.float32)
            + 0.5 * tf.reduce_sum(tf.square((x - self.mean) / self.std),
                                  axis=-1, keepdims=True)
            + tf.reduce_sum(self.log_std, axis=-1, keepdims=True))
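# --- Hedged wiring and sanity check (assumption, not from the source). ---
# Attach the excerpted method to the sketch class above; for a standard
# normal, -log p(0) is 0.5 * log(2*pi) per dimension (about 0.9189).
DiagGaussianDist.neglog_prob = neglog_prob
#   dist = DiagGaussianDist(tf.zeros([1, 1]), tf.zeros([1, 1]))
#   dist.neglog_prob(tf.zeros([1, 1]))  # -> [[0.9189...]]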