def stats_fn(self, train_batch: SampleBatch) -> Dict[str, TensorType]:
    drop_last = self.config["vtrace"] and self.config["vtrace_drop_last_ts"]
    values_batched = _make_time_major(
        self,
        train_batch.get(SampleBatch.SEQ_LENS),
        self.model.value_function(),
        drop_last=drop_last,
    )
    return {
        "cur_lr": tf.cast(self.cur_lr, tf.float64),
        "policy_loss": self.vtrace_loss.mean_pi_loss,
        "entropy": self.vtrace_loss.mean_entropy,
        "entropy_coeff": tf.cast(self.entropy_coeff, tf.float64),
        "var_gnorm": tf.linalg.global_norm(self.model.trainable_variables()),
        "vf_loss": self.vtrace_loss.mean_vf_loss,
        "vf_explained_var": explained_variance(
            tf.reshape(self.vtrace_loss.value_targets, [-1]),
            tf.reshape(values_batched, [-1]),
        ),
    }
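# `_make_time_major` (defined elsewhere in RLlib) folds the flat [B * T, ...]
# train batch back into time-major [T, B, ...] order, optionally dropping the
# last timestep when V-trace uses it only for bootstrapping. A simplified
# sketch for rank-1 tensors, assuming a fixed rollout fragment length; the
# real helper also handles recurrent models and nested tensor lists:
def _make_time_major_sketch(policy, tensor, drop_last=False):
    T = policy.config["rollout_fragment_length"]
    B = tf.shape(tensor)[0] // T
    # [B * T] -> [B, T] -> [T, B].
    res = tf.transpose(tf.reshape(tensor, [B, T]), [1, 0])
    return res[:-1] if drop_last else res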
def central_vf_stats(policy, train_batch):
    # Report the explained variance of the central value function.
    return {
        "vf_explained_var": explained_variance(
            train_batch[Postprocessing.VALUE_TARGETS], policy._central_value_out
        )
    }
def kl_and_loss_stats(policy: Policy, train_batch: SampleBatch) -> Dict[str, TensorType]:
    """Stats function for PPO. Returns a dict with important KL and loss stats.

    Args:
        policy (Policy): The Policy to generate stats for.
        train_batch (SampleBatch): The SampleBatch (already) used for training.

    Returns:
        Dict[str, TensorType]: The stats dict.
    """
    return {
        "cur_kl_coeff": tf.cast(policy.kl_coeff, tf.float64),
        "cur_lr": tf.cast(policy.cur_lr, tf.float64),
        "total_loss": policy._total_loss,
        "policy_loss": policy._mean_policy_loss,
        "vf_loss": policy._mean_vf_loss,
        "vf_explained_var": explained_variance(
            train_batch[Postprocessing.VALUE_TARGETS], policy._value_fn_out
        ),
        "kl": policy._mean_kl_loss,
        "entropy": policy._mean_entropy,
        "entropy_coeff": tf.cast(policy.entropy_coeff, tf.float64),
    }
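# A minimal sketch of how a stats function such as `kl_and_loss_stats` is
# attached to a policy via RLlib's function-based policy template. The names
# `MyPPOTFPolicy` and `my_loss_fn` are hypothetical placeholders, not part of
# this file:
from ray.rllib.policy.tf_policy_template import build_tf_policy

MyPPOTFPolicy = build_tf_policy(
    name="MyPPOTFPolicy",
    loss_fn=my_loss_fn,  # hypothetical loss function, defined elsewhere.
    stats_fn=kl_and_loss_stats,  # called after each loss computation.
)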
def stats_fn(self, train_batch: SampleBatch) -> Dict[str, TensorType]:
    values_batched = _make_time_major(
        self,
        train_batch.get(SampleBatch.SEQ_LENS),
        self.model.value_function(),
        drop_last=self.config["vtrace"] and self.config["vtrace_drop_last_ts"],
    )

    stats_dict = {
        "cur_lr": tf.cast(self.cur_lr, tf.float64),
        "total_loss": self._total_loss,
        "policy_loss": self._mean_policy_loss,
        "entropy": self._mean_entropy,
        "var_gnorm": tf.linalg.global_norm(self.model.trainable_variables()),
        "vf_loss": self._mean_vf_loss,
        "vf_explained_var": explained_variance(
            tf.reshape(self._value_targets, [-1]),
            tf.reshape(values_batched, [-1]),
        ),
        "entropy_coeff": tf.cast(self.entropy_coeff, tf.float64),
    }

    if self.config["vtrace"]:
        is_stat_mean, is_stat_var = tf.nn.moments(self._is_ratio, [0, 1])
        stats_dict["mean_IS"] = is_stat_mean
        stats_dict["var_IS"] = is_stat_var

    if self.config["use_kl_loss"]:
        stats_dict["kl"] = self._mean_kl_loss
        stats_dict["KL_Coeff"] = self.kl_coeff

    return stats_dict
def stats(policy, train_batch):
    drop_last = policy.config["vtrace"] and policy.config["vtrace_drop_last_ts"]
    values_batched = _make_time_major(
        policy,
        train_batch.get(SampleBatch.SEQ_LENS),
        policy.model.value_function(),
        drop_last=drop_last,
    )
    return {
        "cur_lr": tf.cast(policy.cur_lr, tf.float64),
        "policy_loss": policy.loss.mean_pi_loss,
        "entropy": policy.loss.mean_entropy,
        "entropy_coeff": tf.cast(policy.entropy_coeff, tf.float64),
        "var_gnorm": tf.linalg.global_norm(policy.model.trainable_variables()),
        "vf_loss": policy.loss.mean_vf_loss,
        "vf_explained_var": explained_variance(
            tf.reshape(policy.loss.value_targets, [-1]),
            tf.reshape(values_batched, [-1]),
        ),
    }
def grad_stats(policy: Policy, train_batch: SampleBatch,
               grads: ModelGradients) -> Dict[str, TensorType]:
    return {
        "grad_gnorm": tf.linalg.global_norm(grads),
        "vf_explained_var": explained_variance(
            train_batch[Postprocessing.VALUE_TARGETS],
            policy.model.value_function(),
        ),
    }
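# Every stats dict above reports `vf_explained_var` through an
# `explained_variance` helper. A minimal sketch of the standard definition,
# 1 - Var[y - y_pred] / Var[y], clipped below at -1.0 (the exact RLlib
# implementation may differ slightly):
def explained_variance_sketch(y: TensorType, y_pred: TensorType) -> TensorType:
    _, y_var = tf.nn.moments(y, axes=[0])
    _, diff_var = tf.nn.moments(y - y_pred, axes=[0])
    # 1.0 => perfect value predictions; <= 0.0 => no better than a constant.
    return tf.maximum(-1.0, 1.0 - diff_var / y_var)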
def __init__(self, policy: Policy, value_estimates: TensorType,
             action_dist: ActionDistribution, train_batch: SampleBatch,
             vf_loss_coeff: float, beta: float):
    # L = - A * log\pi_\theta(a|s)
    logprobs = action_dist.logp(train_batch[SampleBatch.ACTIONS])

    if beta != 0.0:
        cumulative_rewards = train_batch[Postprocessing.ADVANTAGES]
        # Advantage estimation.
        adv = cumulative_rewards - value_estimates
        adv_squared = tf.reduce_mean(tf.math.square(adv))
        # Value function's loss term (MSE).
        self.v_loss = 0.5 * adv_squared

        # Perform moving averaging of advantage^2.
        rate = policy.config["moving_average_sqd_adv_norm_update_rate"]

        # Update averaged advantage norm.
        # Eager.
        if policy.config["framework"] in ["tf2", "tfe"]:
            update_term = adv_squared - policy._moving_average_sqd_adv_norm
            policy._moving_average_sqd_adv_norm.assign_add(rate * update_term)

            # Exponentially weighted advantages.
            c = tf.math.sqrt(policy._moving_average_sqd_adv_norm)
            exp_advs = tf.math.exp(beta * (adv / (1e-8 + c)))
        # Static graph.
        else:
            update_adv_norm = tf1.assign_add(
                ref=policy._moving_average_sqd_adv_norm,
                value=rate * (adv_squared - policy._moving_average_sqd_adv_norm),
            )

            # Exponentially weighted advantages.
            with tf1.control_dependencies([update_adv_norm]):
                exp_advs = tf.math.exp(beta * tf.math.divide(
                    adv,
                    1e-8 + tf.math.sqrt(policy._moving_average_sqd_adv_norm),
                ))
        exp_advs = tf.stop_gradient(exp_advs)

        self.explained_variance = tf.reduce_mean(
            explained_variance(cumulative_rewards, value_estimates)
        )
    else:
        # Value function's loss term (MSE).
        self.v_loss = tf.constant(0.0)
        exp_advs = 1.0

    self.p_loss = -1.0 * tf.reduce_mean(exp_advs * logprobs)

    self.total_loss = self.p_loss + vf_loss_coeff * self.v_loss
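# The advantage normalizer above is a plain exponential moving average of
# squared advantages: norm <- norm + rate * (adv^2 - norm). A toy
# illustration in pure Python with made-up values:
norm, rate = 1.0, 0.1
for adv_sq in [4.0, 9.0, 1.0]:
    norm += rate * (adv_sq - norm)  # 1.3, then 2.07, then 1.963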
def stats(policy: Policy, train_batch: SampleBatch) -> Dict[str, TensorType]:
    """Stats function for APPO. Returns a dict with important loss stats.

    Args:
        policy (Policy): The Policy to generate stats for.
        train_batch (SampleBatch): The SampleBatch (already) used for training.

    Returns:
        Dict[str, TensorType]: The stats dict.
    """
    values_batched = _make_time_major(
        policy,
        train_batch.get(SampleBatch.SEQ_LENS),
        policy.model.value_function(),
        drop_last=policy.config["vtrace"] and policy.config["vtrace_drop_last_ts"],
    )

    stats_dict = {
        "cur_lr": tf.cast(policy.cur_lr, tf.float64),
        "total_loss": policy._total_loss,
        "policy_loss": policy._mean_policy_loss,
        "entropy": policy._mean_entropy,
        "var_gnorm": tf.linalg.global_norm(policy.model.trainable_variables()),
        "vf_loss": policy._mean_vf_loss,
        "vf_explained_var": explained_variance(
            tf.reshape(policy._value_targets, [-1]),
            tf.reshape(values_batched, [-1]),
        ),
        "entropy_coeff": tf.cast(policy.entropy_coeff, tf.float64),
    }

    if policy.config["vtrace"]:
        is_stat_mean, is_stat_var = tf.nn.moments(policy._is_ratio, [0, 1])
        stats_dict["mean_IS"] = is_stat_mean
        stats_dict["var_IS"] = is_stat_var

    if policy.config["use_kl_loss"]:
        stats_dict["kl"] = policy._mean_kl_loss
        stats_dict["KL_Coeff"] = policy.kl_coeff

    return stats_dict
def stats_fn(self, train_batch: SampleBatch) -> Dict[str, TensorType]:
    return {
        "cur_kl_coeff": tf.cast(self.kl_coeff, tf.float64),
        "cur_lr": tf.cast(self.cur_lr, tf.float64),
        "total_loss": self._total_loss,
        "policy_loss": self._mean_policy_loss,
        "vf_loss": self._mean_vf_loss,
        "vf_explained_var": explained_variance(
            train_batch[Postprocessing.VALUE_TARGETS], self._value_fn_out
        ),
        "kl": self._mean_kl_loss,
        "entropy": self._mean_entropy,
        "entropy_coeff": tf.cast(self.entropy_coeff, tf.float64),
    }
def __init__(
    self,
    policy: Policy,
    value_estimates: TensorType,
    action_dist: ActionDistribution,
    train_batch: SampleBatch,
    vf_loss_coeff: float,
    beta: float,
):
    # L = - A * log\pi_\theta(a|s)
    logprobs = action_dist.logp(train_batch[SampleBatch.ACTIONS])

    if beta != 0.0:
        cumulative_rewards = train_batch[Postprocessing.ADVANTAGES]
        # Advantage estimation.
        adv = cumulative_rewards - value_estimates
        adv_squared = tf.reduce_mean(tf.math.square(adv))
        # Value function's loss term (MSE).
        self.v_loss = 0.5 * adv_squared

        # Perform moving averaging of advantage^2.
        rate = policy.config["moving_average_sqd_adv_norm_update_rate"]

        # Update averaged advantage norm.
        # Eager.
        if policy.config["framework"] in ["tf2", "tfe"]:
            update_term = adv_squared - policy._moving_average_sqd_adv_norm
            policy._moving_average_sqd_adv_norm.assign_add(rate * update_term)

            # Exponentially weighted advantages.
            c = tf.math.sqrt(policy._moving_average_sqd_adv_norm)
            exp_advs = tf.math.exp(beta * (adv / (1e-8 + c)))
        # Static graph.
        else:
            update_adv_norm = tf1.assign_add(
                ref=policy._moving_average_sqd_adv_norm,
                value=rate * (adv_squared - policy._moving_average_sqd_adv_norm),
            )

            # Exponentially weighted advantages.
            with tf1.control_dependencies([update_adv_norm]):
                exp_advs = tf.math.exp(
                    beta
                    * tf.math.divide(
                        adv,
                        1e-8 + tf.math.sqrt(policy._moving_average_sqd_adv_norm),
                    )
                )
        exp_advs = tf.stop_gradient(exp_advs)

        self.explained_variance = tf.reduce_mean(
            explained_variance(cumulative_rewards, value_estimates)
        )
    else:
        # Value function's loss term (MSE).
        self.v_loss = tf.constant(0.0)
        exp_advs = 1.0

    # A log-prob loss alone tends to push the action distribution toward
    # very low entropy, resulting in worse performance in unfamiliar
    # situations. A scaled logstd loss term encourages stochasticity and
    # thus alleviates the problem to some extent.
    logstd_coeff = policy.config["bc_logstd_coeff"]
    if logstd_coeff > 0.0:
        logstds = tf.reduce_sum(action_dist.log_std, axis=1)
    else:
        logstds = 0.0

    self.p_loss = -1.0 * tf.reduce_mean(
        exp_advs * (logprobs + logstd_coeff * logstds)
    )

    self.total_loss = self.p_loss + vf_loss_coeff * self.v_loss
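# A plausible companion stats function for the MARWIL loss above, assuming
# the loss object is stored on the policy as `policy.loss` (a sketch, not
# necessarily the exact reporting used upstream). Note that
# `explained_variance` is only set on the loss when beta != 0.0:
def marwil_stats_sketch(policy: Policy, train_batch: SampleBatch) -> Dict[str, TensorType]:
    return {
        "policy_loss": policy.loss.p_loss,
        "vf_loss": policy.loss.v_loss,
        "total_loss": policy.loss.total_loss,
        "vf_explained_var": policy.loss.explained_variance,
    }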