Esempio n. 1
0
 def policy_loss(self, pi, oldpi, ob, ac, atarg, ret, clip_param):
     """Build the clipped-surrogate policy loss plus a clipped value loss.

     Args:
         pi: Current policy; must expose ``pd.logp(ac)`` and ``vpred``.
         oldpi: Previous policy with the same attributes as ``pi``.
         ob: Not used by this method.
         ac: Actions passed to ``pd.logp``.
         atarg: Advantage targets multiplied with the probability ratio.
         ret: Return targets the value prediction is regressed onto.
         clip_param: Half-width of the clipping range, used both for the
             probability ratio and for the value-prediction change.

     Returns:
         ``pol_surr + vf_loss``, i.e. the negated mean clipped surrogate
         plus half the mean of the element-wise larger squared value error.
     """
     # Probability ratio pnew / pold. Only the old log-probability is
     # clamped to [-20, 20] before the difference is exponentiated.
     logp_new = pi.pd.logp(ac)
     logp_old = tf.clip_by_value(oldpi.pd.logp(ac), -20, 20)
     prob_ratio = tf.exp(logp_new - logp_old)

     # Pessimistic (minimum) of the unclipped and clipped surrogates.
     unclipped_obj = prob_ratio * atarg
     clipped_ratio = U.clip(prob_ratio, 1.0 - clip_param, 1.0 + clip_param)
     clipped_obj = clipped_ratio * atarg
     policy_term = - U.mean(tf.minimum(unclipped_obj, clipped_obj))

     # Value loss: the new prediction may move at most clip_param away from
     # the old one; the larger of the two squared errors is penalised.
     raw_sq_err = tf.square(pi.vpred - ret)
     vpred_delta = tf.clip_by_value(pi.vpred - oldpi.vpred, -clip_param, clip_param)
     bounded_vpred = oldpi.vpred + vpred_delta
     bounded_sq_err = tf.square(bounded_vpred - ret)
     value_term = .5 * U.mean(tf.maximum(raw_sq_err, bounded_sq_err))

     return policy_term + value_term
Esempio n. 2
0
 def policy_loss(self, pi, other_pi, oldpi, ob, ac, atarg, ret, clip_param):
     """Build the clipped-surrogate policy loss plus a clipped value loss.

     Args:
         pi: Current policy; must expose ``pd.logp(ac)`` and ``vpred``.
         other_pi: Accepted for interface compatibility; currently unused
             (see the note on the disabled separation penalty below).
         oldpi: Previous policy with the same attributes as ``pi``.
         ob: Not used by this method.
         ac: Actions passed to ``pd.logp``.
         atarg: Advantage targets multiplied with the probability ratio.
         ret: Return targets the value prediction is regressed onto.
         clip_param: Half-width of the clipping range, used both for the
             probability ratio and for the value-prediction change.

     Returns:
         ``pol_surr + vf_loss``, where both terms are reduced with
         ``U.mean`` so they combine as like-shaped quantities.
     """
     # NOTE: a separation penalty, 0.1 * U.mean(tf.reciprocal(
     # pi.pd.kl(other_pi.pd))), used to be computed here but was never added
     # to the returned loss, so the dead computation has been dropped.

     # Probability ratio pnew / pold. Only the old log-probability is
     # clamped to [-20, 20] before the difference is exponentiated.
     ratio = tf.exp(pi.pd.logp(ac) - tf.clip_by_value(oldpi.pd.logp(ac), -20, 20))
     surr1 = ratio * atarg
     surr2 = U.clip(ratio, 1.0 - clip_param, 1.0 + clip_param) * atarg
     pol_surr = - U.mean(tf.minimum(surr1, surr2))

     vfloss1 = tf.square(pi.vpred - ret)
     # NOTE(review): an earlier comment here said the value loss should NOT
     # be clipped, yet the code clips the value-prediction change to
     # +/- clip_param. The clipping is kept as-is; confirm the intent.
     vpredclipped = oldpi.vpred + tf.clip_by_value(pi.vpred - oldpi.vpred, -clip_param, clip_param)
     vfloss2 = tf.square(vpredclipped - ret)
     # Reduce the value loss with U.mean, like the policy term. Without the
     # reduction the reduced pol_surr was broadcast onto every element of the
     # per-element value loss, and the method returned a tensor of losses.
     vf_loss = .5 * U.mean(tf.maximum(vfloss1, vfloss2))

     total_loss = pol_surr + vf_loss
     return total_loss
Esempio n. 3
0
 def policy_loss(self, pi, oldpi, ob, ac, atarg, ret, clip_param, mask=1, vfcoeff=1.,
         entcoeff=0, divcoeff=0., logpacs=None):
     """Build a masked clipped-surrogate loss with value, entropy and
     optional diversity terms.

     Args:
         pi: Current policy; must expose ``pd.logp(ac)``, ``pd.entropy()``
             and ``vpred``.
         oldpi: Previous policy exposing ``pd.logp(ac)`` and ``vpred``.
         ob: Not used by this method.
         ac: Actions passed to ``pd.logp``.
         atarg: Advantage targets multiplied with the probability ratio.
         ret: Return targets the value prediction is regressed onto.
         clip_param: Half-width of the clipping range, used both for the
             probability ratio and for the value-prediction change.
         mask: Multiplicative weight applied to the surrogate, value and
             diversity terms before they are averaged. Defaults to ``1``.
         vfcoeff: Weight of the value loss in the total loss.
         entcoeff: Weight of the entropy bonus in the total loss.
         divcoeff: Weight of the diversity term in the total loss.
         logpacs: Optional log-probabilities compared against
             ``pi.pd.logp(ac)``; the diversity term is built only when this
             is not ``None``.

     Returns:
         Tuple ``(total_loss, approx_kl, pol_surr, vf_loss, entropy,
         pi.vpred, div_loss)``; ``div_loss`` is ``None`` when ``logpacs`` is
         ``None``.
     """
     LOGP_MAX = 20
     KL_MAX = 5

     entropy = tf.reduce_mean(pi.pd.entropy())

     # Probability ratio pnew / pold. Only the old log-probability is
     # clamped to [-LOGP_MAX, LOGP_MAX] before exponentiation.
     ratio = tf.exp(pi.pd.logp(ac) - U.clip(oldpi.pd.logp(ac), -LOGP_MAX, LOGP_MAX))
     # Mean squared log-probability difference, using the unclamped old
     # log-probability.
     approx_kl = tf.reduce_mean(tf.square(pi.pd.logp(ac) - oldpi.pd.logp(ac)))

     surr1 = ratio * atarg
     surr2 = U.clip(ratio, 1.0 - clip_param, 1.0 + clip_param) * atarg
     pol_surr = -U.mean(mask*tf.minimum(surr1, surr2))

     vfloss1 = tf.square(pi.vpred - ret)
     vpredclipped = oldpi.vpred + U.clip(pi.vpred - oldpi.vpred, -clip_param,
             clip_param)
     vfloss2 = tf.square(vpredclipped - ret)
     vf_loss = U.mean(mask*tf.maximum(vfloss1, vfloss2))

     # Cast before averaging: with the default mask=1 (a Python int),
     # tf.reduce_mean would infer an integer dtype, and multiplying that
     # integer tensor with the floating-point entropy (or a float entcoeff)
     # fails with a dtype mismatch.
     mask_scalar = tf.reduce_mean(tf.cast(mask, entropy.dtype))
     total_loss = pol_surr + vfcoeff*vf_loss - mask_scalar*entcoeff*entropy

     div_loss = None
     if logpacs is not None:
         # Masked squared difference between exp(pi log-prob) and
         # exp(logpacs), averaged over axis 1, clamped to [0, KL_MAX] and
         # summed. It is subtracted from the loss, so minimising the total
         # loss pushes this term up.
         # NOTE(review): the inner clip acts on exp(logpacs), not on
         # logpacs. exp() is never negative, so the -LOGP_MAX bound has no
         # effect and the upper bound caps the probability itself at
         # LOGP_MAX. Confirm whether clipping logpacs before exp was meant.
         div_loss = tf.reduce_sum(U.clip(tf.reduce_mean(tf.square(tf.exp(pi.pd.logp(ac))-
             U.clip(tf.exp(logpacs), -LOGP_MAX, LOGP_MAX))*mask, axis=1), 0, KL_MAX))
         total_loss -= divcoeff*div_loss

     return total_loss, approx_kl, pol_surr, vf_loss, entropy, pi.vpred, div_loss