def __init__(
    self,
    action_dist: ActionDistribution,
    actions: TensorType,
    advantages: TensorType,
    v_target: TensorType,
    vf: TensorType,
    valid_mask: TensorType,
    vf_loss_coeff: float = 0.5,
    entropy_coeff: float = 0.01,
    use_critic: bool = True,
):
    log_prob = action_dist.logp(actions)

    # The "policy gradients" loss
    self.pi_loss = -tf.reduce_sum(
        tf.boolean_mask(log_prob * advantages, valid_mask))

    delta = tf.boolean_mask(vf - v_target, valid_mask)

    # Compute a value function loss.
    if use_critic:
        self.vf_loss = 0.5 * tf.reduce_sum(tf.math.square(delta))
    # Ignore the value function.
    else:
        self.vf_loss = tf.constant(0.0)

    self.entropy = tf.reduce_sum(
        tf.boolean_mask(action_dist.entropy(), valid_mask))

    self.total_loss = (self.pi_loss + self.vf_loss * vf_loss_coeff -
                       self.entropy * entropy_coeff)
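# --- Illustrative sketch (not part of the class above) ---
# A minimal, self-contained sketch of how the three terms above combine
# (policy-gradient loss, value loss, entropy bonus), using dummy tensors
# and a plain softmax/categorical head. All shapes, values, and the
# coefficients are assumptions made for illustration only.
import tensorflow as tf

logits = tf.constant([[1.0, 0.5, -0.5], [0.2, 0.1, 0.3]])
actions = tf.constant([0, 2])
advantages = tf.constant([1.5, -0.3])
v_target = tf.constant([2.0, 0.5])
vf = tf.constant([1.8, 0.7])
valid_mask = tf.constant([True, True])

log_prob = tf.nn.log_softmax(logits)
# Log-prob of the taken action per timestep.
log_prob_act = tf.reduce_sum(tf.one_hot(actions, 3) * log_prob, axis=-1)
probs = tf.nn.softmax(logits)
entropy_per_step = -tf.reduce_sum(probs * log_prob, axis=-1)

pi_loss = -tf.reduce_sum(tf.boolean_mask(log_prob_act * advantages, valid_mask))
delta = tf.boolean_mask(vf - v_target, valid_mask)
vf_loss = 0.5 * tf.reduce_sum(tf.math.square(delta))
entropy = tf.reduce_sum(tf.boolean_mask(entropy_per_step, valid_mask))

# Same combination as above, with the default coefficients assumed.
total_loss = pi_loss + 0.5 * vf_loss - 0.01 * entropy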
def kl(self, q: ActionDistribution) -> torch.Tensor:
    """KL(self || q) estimated with Monte Carlo sampling."""
    rsamples = self.__rsamples().unbind(0)
    log_ratios = torch.stack(
        [self.logp(rsample) - q.logp(rsample) for rsample in rsamples])
    assert not torch.isnan(log_ratios).any(), "log_ratios contains NaN; aborting"
    return log_ratios.mean(0)
def kl(self, q: ActionDistribution) -> torch.Tensor:
    """KL(self || q) estimated with Monte Carlo sampling."""
    rsamples, logps = self.__rsamples_logps()
    logp_rsamples = zip(logps.unbind(0), rsamples.unbind(0))
    log_ratios = torch.stack(
        [logp - q.logp(rsample) for (logp, rsample) in logp_rsamples])
    assert not torch.isnan(log_ratios).any(), "log_ratios contains NaN; aborting"
    return log_ratios.mean(0)
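# --- Illustrative sketch (not part of the methods above) ---
# A minimal sketch of the same Monte Carlo KL(p || q) estimator, written
# against plain torch.distributions.Normal and checked against the analytic
# KL. The distribution parameters and sample count are arbitrary assumptions.
import torch
from torch.distributions import Normal, kl_divergence

p = Normal(torch.tensor([0.0]), torch.tensor([1.0]))
q = Normal(torch.tensor([0.5]), torch.tensor([1.5]))

# Reparameterized samples from p, then the mean of log p(x) - log q(x).
rsamples = p.rsample((10000,))
log_ratios = p.log_prob(rsamples) - q.log_prob(rsamples)
mc_kl = log_ratios.mean(0)

# mc_kl approaches the analytic value as the sample count grows.
analytic_kl = kl_divergence(p, q)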
def __init__(self, policy: Policy, value_estimates: TensorType,
             action_dist: ActionDistribution, train_batch: SampleBatch,
             vf_loss_coeff: float, beta: float):
    # L = - A * log\pi_\theta(a|s)
    logprobs = action_dist.logp(train_batch[SampleBatch.ACTIONS])

    if beta != 0.0:
        cumulative_rewards = train_batch[Postprocessing.ADVANTAGES]
        # Advantage Estimation.
        adv = cumulative_rewards - value_estimates
        adv_squared = tf.reduce_mean(tf.math.square(adv))
        # Value function's loss term (MSE).
        self.v_loss = 0.5 * adv_squared

        # Perform moving averaging of advantage^2.
        rate = policy.config["moving_average_sqd_adv_norm_update_rate"]

        # Update averaged advantage norm.
        # Eager.
        if policy.config["framework"] in ["tf2", "tfe"]:
            update_term = adv_squared - policy._moving_average_sqd_adv_norm
            policy._moving_average_sqd_adv_norm.assign_add(rate * update_term)

            # Exponentially weighted advantages.
            c = tf.math.sqrt(policy._moving_average_sqd_adv_norm)
            exp_advs = tf.math.exp(beta * (adv / (1e-8 + c)))
        # Static graph.
        else:
            update_adv_norm = tf1.assign_add(
                ref=policy._moving_average_sqd_adv_norm,
                value=rate *
                (adv_squared - policy._moving_average_sqd_adv_norm))

            # Exponentially weighted advantages.
            with tf1.control_dependencies([update_adv_norm]):
                exp_advs = tf.math.exp(beta * tf.math.divide(
                    adv,
                    1e-8 + tf.math.sqrt(
                        policy._moving_average_sqd_adv_norm)))
        exp_advs = tf.stop_gradient(exp_advs)

        self.explained_variance = tf.reduce_mean(
            explained_variance(cumulative_rewards, value_estimates))
    else:
        # Value function's loss term (MSE).
        self.v_loss = tf.constant(0.0)
        exp_advs = 1.0

    self.p_loss = -1.0 * tf.reduce_mean(exp_advs * logprobs)

    self.total_loss = self.p_loss + vf_loss_coeff * self.v_loss
def __init__(
    self,
    policy: Policy,
    value_estimates: TensorType,
    action_dist: ActionDistribution,
    train_batch: SampleBatch,
    vf_loss_coeff: float,
    beta: float,
):
    # L = - A * log\pi_\theta(a|s)
    logprobs = action_dist.logp(train_batch[SampleBatch.ACTIONS])

    if beta != 0.0:
        cumulative_rewards = train_batch[Postprocessing.ADVANTAGES]
        # Advantage Estimation.
        adv = cumulative_rewards - value_estimates
        adv_squared = tf.reduce_mean(tf.math.square(adv))
        # Value function's loss term (MSE).
        self.v_loss = 0.5 * adv_squared

        # Perform moving averaging of advantage^2.
        rate = policy.config["moving_average_sqd_adv_norm_update_rate"]

        # Update averaged advantage norm.
        # Eager.
        if policy.config["framework"] in ["tf2", "tfe"]:
            update_term = adv_squared - policy._moving_average_sqd_adv_norm
            policy._moving_average_sqd_adv_norm.assign_add(rate * update_term)

            # Exponentially weighted advantages.
            c = tf.math.sqrt(policy._moving_average_sqd_adv_norm)
            exp_advs = tf.math.exp(beta * (adv / (1e-8 + c)))
        # Static graph.
        else:
            update_adv_norm = tf1.assign_add(
                ref=policy._moving_average_sqd_adv_norm,
                value=rate * (adv_squared - policy._moving_average_sqd_adv_norm),
            )

            # Exponentially weighted advantages.
            with tf1.control_dependencies([update_adv_norm]):
                exp_advs = tf.math.exp(
                    beta
                    * tf.math.divide(
                        adv,
                        1e-8 + tf.math.sqrt(policy._moving_average_sqd_adv_norm),
                    )
                )
        exp_advs = tf.stop_gradient(exp_advs)

        self.explained_variance = tf.reduce_mean(
            explained_variance(cumulative_rewards, value_estimates)
        )
    else:
        # Value function's loss term (MSE).
        self.v_loss = tf.constant(0.0)
        exp_advs = 1.0

    # The logprob loss alone tends to push action distributions toward very
    # low entropy, which hurts performance in unfamiliar situations.
    # A scaled logstd loss term encourages stochasticity, alleviating the
    # problem to some extent.
    logstd_coeff = policy.config["bc_logstd_coeff"]
    if logstd_coeff > 0.0:
        logstds = tf.reduce_sum(action_dist.log_std, axis=1)
    else:
        logstds = 0.0

    self.p_loss = -1.0 * tf.reduce_mean(
        exp_advs * (logprobs + logstd_coeff * logstds)
    )

    self.total_loss = self.p_loss + vf_loss_coeff * self.v_loss
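# --- Illustrative sketch (not part of the loss above) ---
# A minimal sketch of the exponentially weighted advantages used above:
# advantages are normalized by the square root of a moving average of
# advantage^2, then exponentiated with beta and treated as constant weights.
# The update rate, the moving-average start value, and all tensor values
# below are assumptions made for illustration only.
import tensorflow as tf

beta = 1.0
rate = 1e-8  # assumed moving-average update rate
moving_average_sqd_adv_norm = tf.Variable(100.0)  # assumed start value

cumulative_rewards = tf.constant([3.0, 1.0, 2.5])
value_estimates = tf.constant([2.0, 1.5, 2.0])

adv = cumulative_rewards - value_estimates
adv_squared = tf.reduce_mean(tf.math.square(adv))

# Exponential moving average of E[adv^2] (eager-mode update).
moving_average_sqd_adv_norm.assign_add(
    rate * (adv_squared - moving_average_sqd_adv_norm))

# exp(beta * adv / sqrt(E[adv^2])), with no gradient flowing through it.
c = tf.math.sqrt(moving_average_sqd_adv_norm)
exp_advs = tf.stop_gradient(tf.math.exp(beta * (adv / (1e-8 + c))))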