Code Example #1
    def _create_sac_optimizer_ops(self) -> None:
        """
        Creates the Adam optimizers and update ops for SAC, including
        the policy, value, and entropy updates, as well as the target network update.
        """
        policy_optimizer = self.create_optimizer_op(
            learning_rate=self.learning_rate, name="sac_policy_opt"
        )
        entropy_optimizer = self.create_optimizer_op(
            learning_rate=self.learning_rate, name="sac_entropy_opt"
        )
        value_optimizer = self.create_optimizer_op(
            learning_rate=self.learning_rate, name="sac_value_opt"
        )

        # Soft (Polyak) update: blend each target value variable toward its
        # source counterpart by a factor of tau.
        self.target_update_op = [
            tf.assign(target, (1 - self.tau) * target + self.tau * source)
            for target, source in zip(
                self.target_network.value_vars, self.policy_network.value_vars
            )
        ]
        # Dump each variable collection involved in the updates, for debugging.
        logger.debug("value_vars")
        self.print_all_vars(self.policy_network.value_vars)
        logger.debug("targvalue_vars")
        self.print_all_vars(self.target_network.value_vars)
        logger.debug("critic_vars")
        self.print_all_vars(self.policy_network.critic_vars)
        logger.debug("q_vars")
        self.print_all_vars(self.policy_network.q_vars)
        logger.debug("policy_vars")
        policy_vars = self.policy.get_trainable_variables()
        self.print_all_vars(policy_vars)

        # Hard copy: initialize the target network as an exact copy of the
        # policy network's value variables.
        self.target_init_op = [
            tf.assign(target, source)
            for target, source in zip(
                self.target_network.value_vars, self.policy_network.value_vars
            )
        ]

        self.update_batch_policy = policy_optimizer.minimize(
            self.policy_loss, var_list=policy_vars
        )

        # Make sure policy is updated first, then value, then entropy.
        with tf.control_dependencies([self.update_batch_policy]):
            self.update_batch_value = value_optimizer.minimize(
                self.total_value_loss, var_list=self.policy_network.critic_vars
            )
            # Add entropy coefficient optimization operation
            with tf.control_dependencies([self.update_batch_value]):
                self.update_batch_entropy = entropy_optimizer.minimize(
                    self.entropy_loss, var_list=self.log_ent_coef
                )
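
The core of this method is the soft (Polyak) target-network update paired with a hard-copy init op, plus a tf.control_dependencies chain that forces the policy, value, and entropy optimizer steps to run in that order. Below is a minimal, self-contained sketch of the Polyak pattern in TF1 style; the names tau, source_vars, and target_vars and the value 0.005 are illustrative, not taken from ML-Agents:

    import tensorflow.compat.v1 as tf

    tf.disable_v2_behavior()

    tau = 0.005  # soft-update rate; illustrative value

    # One trainable "source" variable and one frozen "target" counterpart.
    source_vars = [tf.get_variable("src_w", initializer=tf.ones([2, 2]))]
    target_vars = [
        tf.get_variable("tgt_w", initializer=tf.zeros([2, 2]), trainable=False)
    ]

    # Hard copy for initialization; soft (Polyak) update for training steps.
    target_init_op = [tf.assign(t, s) for t, s in zip(target_vars, source_vars)]
    target_update_op = [
        tf.assign(t, (1 - tau) * t + tau * s)
        for t, s in zip(target_vars, source_vars)
    ]

    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        sess.run(target_update_op)       # zeros drift toward ones by tau
        print(sess.run(target_vars[0]))  # every entry is now 0.005
        sess.run(target_init_op)         # hard copy, as done once at startup
        print(sess.run(target_vars[0]))  # every entry is now 1.0

With a small tau the target network changes slowly, which is what keeps the bootstrapped value targets in SAC stable.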
Code Example #2
    def make_beta_update(self) -> None:
        """
        Creates the beta parameter and its updater for GAIL
        """

        # Dual gradient step: raise beta when the KL divergence exceeds the
        # mutual-information constraint, otherwise decay it, never below EPSILON.
        new_beta = tf.maximum(
            self.beta + self.alpha * (self.kl_loss - self.mutual_information), EPSILON
        )
        # Only apply the beta update after the main discriminator update has run.
        with tf.control_dependencies([self.update_batch]):
            self.update_beta = tf.assign(self.beta, new_beta)
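
This is the dual-gradient update on the Lagrange multiplier beta of the variational discriminator bottleneck (VAIL): beta <- max(beta + alpha * (KL - I_c), EPSILON), so beta grows while the discriminator's KL term exceeds the mutual-information constraint I_c and decays toward EPSILON otherwise. A minimal sketch of the same rule in plain Python; the values chosen for ALPHA, MUTUAL_INFORMATION, and EPSILON here are illustrative, not ML-Agents defaults:

    EPSILON = 1e-7            # floor keeping beta positive; illustrative value
    ALPHA = 1e-5              # dual step size; illustrative value
    MUTUAL_INFORMATION = 0.5  # information constraint I_c; illustrative value

    def update_beta(beta: float, kl_loss: float) -> float:
        # Tighten the bottleneck (raise beta) while the measured KL exceeds
        # the constraint; relax it (down to EPSILON) otherwise.
        return max(beta + ALPHA * (kl_loss - MUTUAL_INFORMATION), EPSILON)

    # After a discriminator step with kl_loss = 0.9, beta grows slightly:
    print(update_beta(0.1, 0.9))  # 0.1 + 1e-5 * 0.4 = 0.100004

The tf.control_dependencies([self.update_batch]) wrapper above ensures the assign only runs after the discriminator's optimizer step within the same session call.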