Example #1
import tensorflow as tf


def value_loss(target_v, out_v, old_v):
    """Compute the PPO clipped value loss (VF_CLIP is a module-level constant)."""
    # Clip the new value prediction to stay within VF_CLIP of the old one.
    vpredclipped = old_v + tf.clip_by_value(out_v - old_v, -VF_CLIP, VF_CLIP)
    # Squared error for the unclipped and the clipped predictions.
    vf_losses1 = tf.square(out_v - target_v)
    vf_losses2 = tf.square(vpredclipped - target_v)
    # Take the pessimistic (element-wise maximum) loss, then average.
    vf_loss = .5 * tf.reduce_mean(tf.maximum(vf_losses1, vf_losses2))
    return vf_loss
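A minimal usage sketch (an addition, not part of the original example), assuming TensorFlow 2.x eager execution and a hypothetical value for the VF_CLIP global the function reads:

VF_CLIP = 0.2  # hypothetical clip range; the snippet reads this global

target_v = tf.constant([1.0, 2.0, 3.0])  # value targets
out_v = tf.constant([1.5, 1.0, 2.5])     # current value predictions
old_v = tf.constant([1.2, 1.8, 2.8])     # predictions from the old policy

print(float(value_loss(target_v, out_v, old_v)))  # scalar loss, here 0.25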
Example #2
 def kl(self, other):
     """KL divergence between two diagonal Gaussians, summed over dimensions."""
     assert isinstance(other,
                       DiagGaussianDist), 'Distribution type does not match.'
     # Per dimension: (sigma_p^2 + (mu_p - mu_q)^2) / (2 * sigma_q^2)
     #                + log sigma_q - log sigma_p - 1/2
     return tf.reduce_sum(
         (tf.square(self.std) + tf.square(self.mean - other.mean)) /
         (2.0 * tf.square(other.std)) + other.log_std - self.log_std - 0.5,
         axis=-1,
         keepdims=True)
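To exercise the method stand-alone, it can be attached to a minimal stand-in class. The class below is an assumption (it models only the fields kl() reads), not the library's actual DiagGaussianDist:

import tensorflow as tf


class DiagGaussianDist:
    """Minimal stand-in exposing only the fields kl() reads (an assumption)."""

    def __init__(self, mean, log_std):
        self.mean = tf.constant(mean, dtype=tf.float32)
        self.log_std = tf.constant(log_std, dtype=tf.float32)
        self.std = tf.exp(self.log_std)

    def kl(self, other):  # body as in Example #2
        assert isinstance(other,
                          DiagGaussianDist), 'Distribution type does not match.'
        return tf.reduce_sum(
            (tf.square(self.std) + tf.square(self.mean - other.mean)) /
            (2.0 * tf.square(other.std)) + other.log_std - self.log_std - 0.5,
            axis=-1,
            keepdims=True)


p = DiagGaussianDist([0.0, 0.0], [0.0, 0.0])  # N(0, I)
q = DiagGaussianDist([1.0, 0.0], [0.0, 0.0])  # mean shifted by 1 in dim 0
print(float(p.kl(q)[0]))  # 0.5 = squared mean gap / (2 * sigma^2)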
Example #3
def critic_loss(target_v, out_v, old_v, val_clip):
    """Use clipped value loss as default."""
    vf_losses1 = tf.square(out_v - target_v)
    val_pred_clipped = old_v + tf.clip_by_value(out_v - old_v, -val_clip,
                                                val_clip)
    vf_losses2 = tf.square(val_pred_clipped - target_v)
    vf_loss = 0.5 * tf.reduce_mean(tf.maximum(vf_losses1, vf_losses2))
    return vf_loss
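Because this variant takes the clip range as an argument instead of a global, a call might look like this (illustrative values, with tf imported as above):

loss = critic_loss(target_v=tf.constant([1.0, 2.0]),
                   out_v=tf.constant([1.4, 1.7]),
                   old_v=tf.constant([1.1, 1.9]),
                   val_clip=0.2)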
Example #4
def impala_loss(inputs, labels):
    """Compute loss for IMPALA."""
    policy, value = inputs
    target_p, target_v, adv = labels

    log_policy = tf.math.log(policy + 1e-10)  # epsilon guards against log(0)
    entropy = (-policy * log_policy)
    cross_entropy = (-target_p * log_policy)
    p_loss = tf.reduce_mean(adv * cross_entropy - ENTROPY_LOSS * entropy)

    v_loss = 0.5 * tf.reduce_mean(tf.square(value - target_v))

    return p_loss + v_loss
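A usage sketch, again assuming eager execution and a hypothetical value for the ENTROPY_LOSS global the function reads:

ENTROPY_LOSS = 0.01  # hypothetical entropy coefficient

policy = tf.constant([[0.7, 0.3], [0.4, 0.6]])    # action probabilities
value = tf.constant([[0.5], [0.2]])               # value predictions
target_p = tf.constant([[1.0, 0.0], [0.0, 1.0]])  # target action distribution
target_v = tf.constant([[0.6], [0.1]])            # value targets
adv = tf.constant([[0.3], [-0.2]])                # advantages

loss = impala_loss((policy, value), (target_p, target_v, adv))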
Example #5
def calc_baseline_loss(advantages):
    """Calculate the baseline loss (half the sum of squared advantages)."""
    return 0.5 * tf.reduce_sum(tf.square(advantages))
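A quick check (assuming tf is imported as above):

print(float(calc_baseline_loss(tf.constant([1.0, -2.0]))))  # 0.5 * (1 + 4) = 2.5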
Example #6
 def neglog_prob(self, x):
     """Negative log-density of x under a diagonal Gaussian."""
     return 0.5 * np.log(2.0 * np.pi) * tf.cast((tf.shape(x)[-1]), tf.float32) + \
         0.5 * tf.reduce_sum(tf.square((x - self.mean) / self.std), axis=-1, keepdims=True) + \
         tf.reduce_sum(self.log_std, axis=-1, keepdims=True)
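Attaching this method to the stand-in DiagGaussianDist class sketched after Example #2 (and assuming numpy is imported as np) gives a quick sanity check against the 1-D standard normal:

import numpy as np

dist = DiagGaussianDist([0.0], [0.0])  # standard normal in one dimension
x = tf.constant([0.0])
print(float(dist.neglog_prob(x)[0]))   # 0.5 * log(2 * pi) ~= 0.9189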