Example 1
        def stats_fn(self, train_batch: SampleBatch) -> Dict[str, TensorType]:
            drop_last = self.config["vtrace"] and self.config[
                "vtrace_drop_last_ts"]
            values_batched = _make_time_major(
                self,
                train_batch.get(SampleBatch.SEQ_LENS),
                self.model.value_function(),
                drop_last=drop_last,
            )

            return {
                "cur_lr": tf.cast(self.cur_lr, tf.float64),
                "policy_loss": self.vtrace_loss.mean_pi_loss,
                "entropy": self.vtrace_loss.mean_entropy,
                "entropy_coeff": tf.cast(self.entropy_coeff, tf.float64),
                "var_gnorm": tf.linalg.global_norm(
                    self.model.trainable_variables()),
                "vf_loss": self.vtrace_loss.mean_vf_loss,
                "vf_explained_var": explained_variance(
                    tf.reshape(self.vtrace_loss.value_targets, [-1]),
                    tf.reshape(values_batched, [-1]),
                ),
            }
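
The drop_last flag above interacts with the time-major reshaping done by _make_time_major. As an illustrative sketch only (not the RLlib helper itself; it assumes a flat rank-1 tensor and a single fixed sequence length), the operation amounts to:

import tensorflow as tf

def to_time_major_sketch(values, seq_len, drop_last=False):
    # values: flat [B * T] tensor, e.g. value-function outputs.
    B = tf.shape(values)[0] // seq_len
    folded = tf.reshape(values, [B, seq_len])        # [B, T]
    time_major = tf.transpose(folded, perm=[1, 0])   # [T, B]
    # Optionally drop the trailing timestep of every sequence.
    return time_major[:-1] if drop_last else time_major

# E.g. two sequences of length 4:
# to_time_major_sketch(tf.range(8.0), 4, drop_last=True).shape == (3, 2)
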
Example 2
def central_vf_stats(policy, train_batch):
    # Report the explained variance of the central value function.
    return {
        "vf_explained_var": explained_variance(
            train_batch[Postprocessing.VALUE_TARGETS],
            policy._central_value_out),
    }
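
Every hook in this collection reports vf_explained_var via explained_variance. For orientation, here is a minimal sketch of that metric (an illustrative re-implementation, not the RLlib utility itself):

import tensorflow as tf

def explained_variance_sketch(y, pred):
    # 1 - Var[y - pred] / Var[y], clipped below at -1.0 so a very poor
    # value function cannot drive the metric toward -inf.
    _, y_var = tf.nn.moments(y, axes=[0])
    _, diff_var = tf.nn.moments(y - pred, axes=[0])
    return tf.maximum(-1.0, 1.0 - diff_var / y_var)
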
Example 3
def kl_and_loss_stats(policy: Policy,
                      train_batch: SampleBatch) -> Dict[str, TensorType]:
    """Stats function for PPO. Returns a dict with important KL and loss stats.

    Args:
        policy (Policy): The Policy to generate stats for.
        train_batch (SampleBatch): The SampleBatch (already) used for training.

    Returns:
        Dict[str, TensorType]: The stats dict.
    """
    return {
        "cur_kl_coeff": tf.cast(policy.kl_coeff, tf.float64),
        "cur_lr": tf.cast(policy.cur_lr, tf.float64),
        "total_loss": policy._total_loss,
        "policy_loss": policy._mean_policy_loss,
        "vf_loss": policy._mean_vf_loss,
        "vf_explained_var": explained_variance(
            train_batch[Postprocessing.VALUE_TARGETS],
            policy._value_fn_out),
        "kl": policy._mean_kl_loss,
        "entropy": policy._mean_entropy,
        "entropy_coeff": tf.cast(policy.entropy_coeff, tf.float64),
    }
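
In the function-based TF policy API these snippets appear to target, a stats hook like kl_and_loss_stats is registered by passing it to the policy builder. The sketch below only illustrates that wiring: my_loss and MySimplePolicy are made-up names, and the placeholder loss does not set the policy._total_loss / policy._mean_kl_loss attributes that the real PPO loss provides.

import tensorflow as tf
from ray.rllib.policy.sample_batch import SampleBatch
from ray.rllib.policy.tf_policy_template import build_tf_policy

def my_loss(policy, model, dist_class, train_batch):
    # Placeholder REINFORCE-style loss, for illustration only.
    logits, _ = model(train_batch)
    action_dist = dist_class(logits, model)
    return -tf.reduce_mean(
        action_dist.logp(train_batch[SampleBatch.ACTIONS])
        * train_batch[SampleBatch.REWARDS])

MySimplePolicy = build_tf_policy(
    name="MySimplePolicy",
    loss_fn=my_loss,
    stats_fn=kl_and_loss_stats,  # the hook defined above
)
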
Example 4
        def stats_fn(self, train_batch: SampleBatch) -> Dict[str, TensorType]:
            values_batched = _make_time_major(
                self,
                train_batch.get(SampleBatch.SEQ_LENS),
                self.model.value_function(),
                drop_last=self.config["vtrace"] and self.config["vtrace_drop_last_ts"],
            )

            stats_dict = {
                "cur_lr": tf.cast(self.cur_lr, tf.float64),
                "total_loss": self._total_loss,
                "policy_loss": self._mean_policy_loss,
                "entropy": self._mean_entropy,
                "var_gnorm": tf.linalg.global_norm(self.model.trainable_variables()),
                "vf_loss": self._mean_vf_loss,
                "vf_explained_var": explained_variance(
                    tf.reshape(self._value_targets, [-1]),
                    tf.reshape(values_batched, [-1]),
                ),
                "entropy_coeff": tf.cast(self.entropy_coeff, tf.float64),
            }

            if self.config["vtrace"]:
                is_stat_mean, is_stat_var = tf.nn.moments(self._is_ratio, [0, 1])
                stats_dict["mean_IS"] = is_stat_mean
                stats_dict["var_IS"] = is_stat_var

            if self.config["use_kl_loss"]:
                stats_dict["kl"] = self._mean_kl_loss
                stats_dict["KL_Coeff"] = self.kl_coeff

            return stats_dict
Example 5
def stats(policy, train_batch):
    drop_last = policy.config["vtrace"] and policy.config["vtrace_drop_last_ts"]
    values_batched = _make_time_major(
        policy,
        train_batch.get(SampleBatch.SEQ_LENS),
        policy.model.value_function(),
        drop_last=drop_last)

    return {
        "cur_lr": tf.cast(policy.cur_lr, tf.float64),
        "policy_loss": policy.loss.mean_pi_loss,
        "entropy": policy.loss.mean_entropy,
        "entropy_coeff": tf.cast(policy.entropy_coeff, tf.float64),
        "var_gnorm": tf.linalg.global_norm(
            policy.model.trainable_variables()),
        "vf_loss": policy.loss.mean_vf_loss,
        "vf_explained_var": explained_variance(
            tf.reshape(policy.loss.value_targets, [-1]),
            tf.reshape(values_batched, [-1])),
    }
Example 6
def grad_stats(policy: Policy, train_batch: SampleBatch,
               grads: ModelGradients) -> Dict[str, TensorType]:
    return {
        "grad_gnorm": tf.linalg.global_norm(grads),
        "vf_explained_var": explained_variance(
            train_batch[Postprocessing.VALUE_TARGETS],
            policy.model.value_function())
    }
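
A gradient-level hook like grad_stats receives the computed gradients as a third argument and, under the same assumptions as the previous sketch, would be registered through the builder's grad_stats_fn argument instead of stats_fn:

MyPolicyWithGradStats = build_tf_policy(
    name="MyPolicyWithGradStats",
    loss_fn=my_loss,           # hypothetical loss from the earlier sketch
    stats_fn=kl_and_loss_stats,
    grad_stats_fn=grad_stats,  # called as grad_stats(policy, train_batch, grads)
)
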
Example 7
    def __init__(self, policy: Policy, value_estimates: TensorType,
                 action_dist: ActionDistribution, train_batch: SampleBatch,
                 vf_loss_coeff: float, beta: float):

        # L = - A * log\pi_\theta(a|s)
        logprobs = action_dist.logp(train_batch[SampleBatch.ACTIONS])

        if beta != 0.0:
            cumulative_rewards = train_batch[Postprocessing.ADVANTAGES]
            # Advantage Estimation.
            adv = cumulative_rewards - value_estimates
            adv_squared = tf.reduce_mean(tf.math.square(adv))
            # Value function's loss term (MSE).
            self.v_loss = 0.5 * adv_squared

            # Perform moving averaging of advantage^2.
            rate = policy.config["moving_average_sqd_adv_norm_update_rate"]

            # Update averaged advantage norm.
            # Eager.
            if policy.config["framework"] in ["tf2", "tfe"]:
                update_term = adv_squared - policy._moving_average_sqd_adv_norm
                policy._moving_average_sqd_adv_norm.assign_add(rate *
                                                               update_term)

                # Exponentially weighted advantages.
                c = tf.math.sqrt(policy._moving_average_sqd_adv_norm)
                exp_advs = tf.math.exp(beta * (adv / (1e-8 + c)))
            # Static graph.
            else:
                update_adv_norm = tf1.assign_add(
                    ref=policy._moving_average_sqd_adv_norm,
                    value=rate *
                    (adv_squared - policy._moving_average_sqd_adv_norm))

                # Exponentially weighted advantages.
                with tf1.control_dependencies([update_adv_norm]):
                    exp_advs = tf.math.exp(beta * tf.math.divide(
                        adv, 1e-8 +
                        tf.math.sqrt(policy._moving_average_sqd_adv_norm)))
            exp_advs = tf.stop_gradient(exp_advs)

            self.explained_variance = tf.reduce_mean(
                explained_variance(cumulative_rewards, value_estimates))

        else:
            # Value function's loss term (MSE).
            self.v_loss = tf.constant(0.0)
            exp_advs = 1.0

        self.p_loss = -1.0 * tf.reduce_mean(exp_advs * logprobs)

        self.total_loss = self.p_loss + vf_loss_coeff * self.v_loss
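
The loss object above only computes and stores its terms; surfacing them follows the same stats-hook pattern as the earlier examples. A hedged sketch, assuming the policy keeps the loss instance on an attribute named policy.loss (that attribute name is an assumption):

def marwil_stats_sketch(policy, train_batch):
    stats = {
        "policy_loss": policy.loss.p_loss,
        "vf_loss": policy.loss.v_loss,
        "total_loss": policy.loss.total_loss,
    }
    # explained_variance is only set on the loss object when beta != 0.0.
    if hasattr(policy.loss, "explained_variance"):
        stats["vf_explained_var"] = policy.loss.explained_variance
    return stats
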
Example 8
def stats(policy: Policy, train_batch: SampleBatch) -> Dict[str, TensorType]:
    """Stats function for APPO. Returns a dict with important loss stats.

    Args:
        policy (Policy): The Policy to generate stats for.
        train_batch (SampleBatch): The SampleBatch (already) used for training.

    Returns:
        Dict[str, TensorType]: The stats dict.
    """
    values_batched = _make_time_major(
        policy,
        train_batch.get(SampleBatch.SEQ_LENS),
        policy.model.value_function(),
        drop_last=policy.config["vtrace"]
        and policy.config["vtrace_drop_last_ts"],
    )

    stats_dict = {
        "cur_lr": tf.cast(policy.cur_lr, tf.float64),
        "total_loss": policy._total_loss,
        "policy_loss": policy._mean_policy_loss,
        "entropy": policy._mean_entropy,
        "var_gnorm": tf.linalg.global_norm(
            policy.model.trainable_variables()),
        "vf_loss": policy._mean_vf_loss,
        "vf_explained_var": explained_variance(
            tf.reshape(policy._value_targets, [-1]),
            tf.reshape(values_batched, [-1])),
        "entropy_coeff": tf.cast(policy.entropy_coeff, tf.float64),
    }

    if policy.config["vtrace"]:
        is_stat_mean, is_stat_var = tf.nn.moments(policy._is_ratio, [0, 1])
        stats_dict["mean_IS"] = is_stat_mean
        stats_dict["var_IS"] = is_stat_var

    if policy.config["use_kl_loss"]:
        stats_dict["kl"] = policy._mean_kl_loss
        stats_dict["KL_Coeff"] = policy.kl_coeff

    return stats_dict
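
The mean_IS / var_IS entries come from a single tf.nn.moments call: reducing over axes [0, 1] collapses both the time and batch dimensions of the importance-sampling ratios into a scalar mean and variance. A tiny numeric illustration with made-up values:

import tensorflow as tf

is_ratio = tf.constant([[0.9, 1.1], [1.0, 1.2]])    # [T=2, B=2]
mean_is, var_is = tf.nn.moments(is_ratio, axes=[0, 1])
# mean_is == 1.05, var_is == 0.0125
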
Example 9
    def stats_fn(self, train_batch: SampleBatch) -> Dict[str, TensorType]:
        return {
            "cur_kl_coeff": tf.cast(self.kl_coeff, tf.float64),
            "cur_lr": tf.cast(self.cur_lr, tf.float64),
            "total_loss": self._total_loss,
            "policy_loss": self._mean_policy_loss,
            "vf_loss": self._mean_vf_loss,
            "vf_explained_var": explained_variance(
                train_batch[Postprocessing.VALUE_TARGETS],
                self._value_fn_out),
            "kl": self._mean_kl_loss,
            "entropy": self._mean_entropy,
            "entropy_coeff": tf.cast(self.entropy_coeff, tf.float64),
        }
Example 10
    def __init__(
        self,
        policy: Policy,
        value_estimates: TensorType,
        action_dist: ActionDistribution,
        train_batch: SampleBatch,
        vf_loss_coeff: float,
        beta: float,
    ):
        # L = - A * log\pi_\theta(a|s)
        logprobs = action_dist.logp(train_batch[SampleBatch.ACTIONS])
        if beta != 0.0:
            cumulative_rewards = train_batch[Postprocessing.ADVANTAGES]
            # Advantage Estimation.
            adv = cumulative_rewards - value_estimates
            adv_squared = tf.reduce_mean(tf.math.square(adv))
            # Value function's loss term (MSE).
            self.v_loss = 0.5 * adv_squared

            # Perform moving averaging of advantage^2.
            rate = policy.config["moving_average_sqd_adv_norm_update_rate"]

            # Update averaged advantage norm.
            # Eager.
            if policy.config["framework"] in ["tf2", "tfe"]:
                update_term = adv_squared - policy._moving_average_sqd_adv_norm
                policy._moving_average_sqd_adv_norm.assign_add(rate * update_term)

                # Exponentially weighted advantages.
                c = tf.math.sqrt(policy._moving_average_sqd_adv_norm)
                exp_advs = tf.math.exp(beta * (adv / (1e-8 + c)))
            # Static graph.
            else:
                update_adv_norm = tf1.assign_add(
                    ref=policy._moving_average_sqd_adv_norm,
                    value=rate * (adv_squared - policy._moving_average_sqd_adv_norm),
                )

                # Exponentially weighted advantages.
                with tf1.control_dependencies([update_adv_norm]):
                    exp_advs = tf.math.exp(
                        beta
                        * tf.math.divide(
                            adv,
                            1e-8 + tf.math.sqrt(policy._moving_average_sqd_adv_norm),
                        )
                    )
            exp_advs = tf.stop_gradient(exp_advs)

            self.explained_variance = tf.reduce_mean(
                explained_variance(cumulative_rewards, value_estimates)
            )

        else:
            # Value function's loss term (MSE).
            self.v_loss = tf.constant(0.0)
            exp_advs = 1.0

        # logprob loss alone tends to push action distributions to
        # have very low entropy, resulting in worse performance for
        # unfamiliar situations.
        # A scaled logstd loss term encourages stochasticity and thus
        # alleviates the problem to some extent.
        logstd_coeff = policy.config["bc_logstd_coeff"]
        if logstd_coeff > 0.0:
            logstds = tf.reduce_sum(action_dist.log_std, axis=1)
        else:
            logstds = 0.0

        self.p_loss = -1.0 * tf.reduce_mean(
            exp_advs * (logprobs + logstd_coeff * logstds)
        )

        self.total_loss = self.p_loss + vf_loss_coeff * self.v_loss
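
A toy calculation of the exponential advantage weighting used above (all numbers made up): with beta = 1.0, a moving average of squared advantages of 4.0, and an advantage of 1.0, each log-prob is weighted by exp(1.0 / (1e-8 + sqrt(4.0))), roughly 1.65.

import math

beta, adv, moving_avg_sqd_adv = 1.0, 1.0, 4.0
weight = math.exp(beta * adv / (1e-8 + math.sqrt(moving_avg_sqd_adv)))
print(round(weight, 2))  # 1.65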