def policy_loss(self, pi, oldpi, ob, ac, atarg, ret, clip_param):
    """PPO clipped-surrogate policy loss plus clipped value-function loss.

    Args:
        pi: current policy (exposes .pd for the action distribution and .vpred).
        oldpi: behavior policy used to collect the rollout.
        ob: observations (unused here; kept for interface compatibility).
        ac: actions taken in the rollout.
        atarg: advantage estimates.
        ret: empirical returns (value-function regression targets).
        clip_param: PPO clip range epsilon.

    Returns:
        Scalar tensor: surrogate policy loss + 0.5 * clipped value loss.
    """
    # Importance ratio pnew/pold; old log-prob is clamped so exp() cannot
    # overflow on extreme values.
    log_ratio = pi.pd.logp(ac) - tf.clip_by_value(oldpi.pd.logp(ac), -20, 20)
    ratio = tf.exp(log_ratio)

    # Pessimistic (elementwise min) clipped surrogate, negated because we
    # minimize the loss but want to maximize the surrogate objective.
    clipped_ratio = U.clip(ratio, 1.0 - clip_param, 1.0 + clip_param)
    pol_surr = -U.mean(tf.minimum(ratio * atarg, clipped_ratio * atarg))

    # PPO-style value clipping: penalize the worse of the raw and clipped
    # value errors so the value head cannot move too far per update.
    vpredclipped = oldpi.vpred + tf.clip_by_value(
        pi.vpred - oldpi.vpred, -clip_param, clip_param)
    unclipped_err = tf.square(pi.vpred - ret)
    clipped_err = tf.square(vpredclipped - ret)
    vf_loss = 0.5 * U.mean(tf.maximum(unclipped_err, clipped_err))

    return pol_surr + vf_loss
def policy_loss(self, pi, other_pi, oldpi, ob, ac, atarg, ret, clip_param):
    """PPO loss for a sub-policy, with an (currently disabled) separation term.

    Args:
        pi: current policy (exposes .pd and .vpred).
        other_pi: a sibling policy; the separation term rewards KL distance
            from it (term is computed but not added to the total — see below).
        oldpi: behavior policy used to collect the rollout.
        ob: observations (unused here; kept for interface compatibility).
        ac: actions taken in the rollout.
        atarg: advantage estimates.
        ret: empirical returns (value-function regression targets).
        clip_param: PPO clip range epsilon.

    Returns:
        Scalar tensor: surrogate policy loss + value loss.

    BUG FIX: the value loss previously omitted the U.mean reduction
    (`vf_loss = .5 * tf.maximum(...)`), so total_loss was a per-sample
    tensor rather than a scalar, inconsistent with the other policy_loss
    variants in this file. The mean reduction is restored.
    """
    # Penalizes similarity to other_pi (1/KL blows up as the policies
    # converge). Kept out of total_loss below, matching the original code.
    policy_separation_loss = 0.1 * U.mean(tf.reciprocal(pi.pd.kl(other_pi.pd)))

    # advantage * pnew / pold; old log-prob clamped so exp() cannot overflow.
    ratio = tf.exp(pi.pd.logp(ac) - tf.clip_by_value(oldpi.pd.logp(ac), -20, 20))
    surr1 = ratio * atarg
    surr2 = U.clip(ratio, 1.0 - clip_param, 1.0 + clip_param) * atarg
    pol_surr = -U.mean(tf.minimum(surr1, surr2))

    vfloss1 = tf.square(pi.vpred - ret)
    # NOTE(review): the original comment claimed vf losses should NOT be
    # clipped ("value is critical to learning of master"), yet the code
    # clips anyway — confirm which behavior is intended.
    vpredclipped = oldpi.vpred + tf.clip_by_value(
        pi.vpred - oldpi.vpred, -clip_param, clip_param)
    vfloss2 = tf.square(vpredclipped - ret)
    # Fixed: reduce to a scalar, as in the sibling implementations.
    vf_loss = .5 * U.mean(tf.maximum(vfloss1, vfloss2))

    total_loss = pol_surr + vf_loss  # + policy_separation_loss (disabled)
    return total_loss
def policy_loss(self, pi, oldpi, ob, ac, atarg, ret, clip_param, mask=1, vfcoeff=1., entcoeff=0, divcoeff=0., logpacs=None):
    """Masked PPO loss with entropy bonus and optional divergence penalty.

    Args:
        pi: current policy (exposes .pd and .vpred).
        oldpi: behavior policy used to collect the rollout.
        ob: observations (unused here; kept for interface compatibility).
        ac: actions taken in the rollout.
        atarg: advantage estimates.
        ret: empirical returns (value-function regression targets).
        clip_param: PPO clip range epsilon.
        mask: per-sample weight applied to the surrogate and value losses
            (defaults to 1, i.e. no masking). Presumably 0/1 validity flags —
            TODO confirm against callers.
        vfcoeff: weight on the value loss.
        entcoeff: weight on the entropy bonus (scaled by mean mask).
        divcoeff: weight on the divergence penalty (only if logpacs given).
        logpacs: optional log action probabilities of another policy/policies;
            assumed to have a second axis (axis=1 reduction below) — TODO
            confirm its shape against callers.

    Returns:
        Tuple: (total_loss, approx_kl, pol_surr, vf_loss, entropy, pi.vpred,
        div_loss) where div_loss is None when logpacs is None.
    """
    # Clamp bounds: LOGP_MAX guards exp() against overflow; KL_MAX caps the
    # per-policy divergence contribution.
    LOGP_MAX = 20
    KL_MAX = 5
    entropy = tf.reduce_mean(pi.pd.entropy())
    # Importance ratio pnew/pold with the old log-prob clamped.
    ratio = tf.exp(pi.pd.logp(ac) - U.clip(oldpi.pd.logp(ac), -LOGP_MAX, LOGP_MAX))
    # NOTE(review): this is the mean SQUARED log-prob difference (always >= 0),
    # not the usual mean log-ratio KL estimator — confirm this is intentional.
    approx_kl = tf.reduce_mean(tf.square(pi.pd.logp(ac) - oldpi.pd.logp(ac)))
    # Clipped surrogate objective, per-sample weighted by mask.
    surr1 = ratio * atarg
    surr2 = U.clip(ratio, 1.0 - clip_param, 1.0 + clip_param) * atarg
    pol_surr = -U.mean(mask*tf.minimum(surr1, surr2))
    # PPO-style clipped value loss, also mask-weighted. No 0.5 factor here,
    # unlike the sibling implementations; vfcoeff absorbs the scaling.
    vfloss1 = tf.square(pi.vpred - ret)
    vpredclipped = oldpi.vpred + U.clip(pi.vpred - oldpi.vpred, -clip_param, clip_param)
    vfloss2 = tf.square(vpredclipped - ret)
    vf_loss = U.mean(mask*tf.maximum(vfloss1, vfloss2))
    # Entropy bonus is scaled by the mean mask so heavily-masked batches do
    # not get a disproportionately large entropy term.
    mask_scalar = tf.reduce_mean(mask)
    total_loss = pol_surr + vfcoeff*vf_loss - mask_scalar*entcoeff*entropy
    div_loss = None
    if logpacs is not None:
        # Squared difference between this policy's action probabilities and
        # the other policies' (exp(logpacs)), averaged over axis 1, clamped
        # to [0, KL_MAX], then summed. Subtracted from the loss, i.e. the
        # optimizer is pushed to INCREASE divergence from the others.
        # NOTE(review): clipping exp(logpacs) to [-LOGP_MAX, LOGP_MAX] —
        # the lower bound is vacuous since exp() >= 0; confirm the upper
        # bound of 20 on a probability is intended.
        div_loss = tf.reduce_sum(U.clip(tf.reduce_mean(tf.square(tf.exp(pi.pd.logp(ac))- U.clip(tf.exp(logpacs), -LOGP_MAX, LOGP_MAX))*mask, axis=1), 0, KL_MAX))
        total_loss -= divcoeff*div_loss
    return total_loss, approx_kl, pol_surr, vf_loss, entropy, pi.vpred, div_loss