Example 1
    def create_network(self) -> None:
        """
        Helper for creating the intrinsic reward nodes
        """
        if self.use_vail:
            self.z_sigma = tf.get_variable(
                "gail_sigma_vail",
                self.z_size,
                dtype=tf.float32,
                initializer=tf.ones_initializer(),
            )
            self.z_sigma_sq = self.z_sigma * self.z_sigma
            self.z_log_sigma_sq = tf.log(self.z_sigma_sq + EPSILON)
            self.use_noise = tf.placeholder(
                shape=[1], dtype=tf.float32, name="gail_NoiseLevel"
            )
        self.expert_estimate, self.z_mean_expert, _ = self.create_encoder(
            self.encoded_expert, self.expert_action, self.done_expert, reuse=False
        )
        self.policy_estimate, self.z_mean_policy, _ = self.create_encoder(
            self.encoded_policy,
            self.policy.selected_actions,
            self.done_policy,
            reuse=True,
        )
        self.mean_policy_estimate = tf.reduce_mean(self.policy_estimate)
        self.mean_expert_estimate = tf.reduce_mean(self.expert_estimate)
        self.discriminator_score = tf.reshape(
            self.policy_estimate, [-1], name="gail_reward"
        )
        self.intrinsic_reward = -tf.log(1.0 - self.discriminator_score + EPSILON)
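The last line above turns the discriminator's output into the per-step intrinsic reward, reward = -log(1 - D + EPSILON). A minimal NumPy sketch of just that mapping (the EPSILON value here is an assumed small constant, not the module's own definition):

    import numpy as np

    EPSILON = 1e-7  # assumed small constant for illustration

    def gail_intrinsic_reward(discriminator_score: np.ndarray) -> np.ndarray:
        """Map discriminator scores in (0, 1) to rewards via -log(1 - D + eps).

        Scores near 1 (the discriminator believes the transition came from the
        expert) give large rewards; scores near 0 give rewards close to zero.
        """
        return -np.log(1.0 - discriminator_score + EPSILON)

    print(gail_intrinsic_reward(np.array([0.1, 0.5, 0.9])))  # rewards grow with the score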
Example 2
    def create_normalizer(self, vector_obs):
        """
        Creates the variables that track the running observation statistics
        (step count, mean, variance) and the op that updates them.
        """
        self.normalization_steps = tf.get_variable(
            "normalization_steps",
            [],
            trainable=False,
            dtype=tf.int32,
            initializer=tf.ones_initializer(),
        )
        self.running_mean = tf.get_variable(
            "running_mean",
            [self.vec_obs_size],
            trainable=False,
            dtype=tf.float32,
            initializer=tf.zeros_initializer(),
        )
        self.running_variance = tf.get_variable(
            "running_variance",
            [self.vec_obs_size],
            trainable=False,
            dtype=tf.float32,
            initializer=tf.ones_initializer(),
        )
        self.update_normalization = self.create_normalizer_update(vector_obs)
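The op returned by create_normalizer_update is not shown here; it advances normalization_steps and folds each observation batch into running_mean and running_variance. The general idea, as a plain NumPy sketch of a Welford-style streaming estimate (an illustration under that assumption, not the ml-agents update op itself):

    import numpy as np

    class RunningNormalizer:
        """Streaming mean/variance tracker illustrating what the running_mean,
        running_variance and normalization_steps variables above store."""

        def __init__(self, size: int):
            self.steps = 0
            self.mean = np.zeros(size, dtype=np.float64)
            self.m2 = np.zeros(size, dtype=np.float64)  # sum of squared deviations

        def update(self, obs: np.ndarray) -> None:
            self.steps += 1
            delta = obs - self.mean
            self.mean += delta / self.steps
            self.m2 += delta * (obs - self.mean)

        def normalize(self, obs: np.ndarray) -> np.ndarray:
            variance = self.m2 / max(self.steps, 1)
            return (obs - self.mean) / np.sqrt(variance + 1e-8)

    norm = RunningNormalizer(size=3)
    for _ in range(5):
        norm.update(np.random.randn(3))
    print(norm.normalize(np.zeros(3)))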
Example 3
    def create_loss(self, learning_rate: float) -> None:
        """
        Creates the loss and update nodes for the GAIL reward generator
        :param learning_rate: The learning rate for the optimizer
        """
        self.mean_expert_estimate = tf.reduce_mean(self.expert_estimate)
        self.mean_policy_estimate = tf.reduce_mean(self.policy_estimate)

        if self.use_vail:
            self.beta = tf.get_variable(
                "gail_beta",
                [],
                trainable=False,
                dtype=tf.float32,
                initializer=tf.ones_initializer(),
            )

        self.discriminator_loss = -tf.reduce_mean(
            tf.log(self.expert_estimate + EPSILON)
            + tf.log(1.0 - self.policy_estimate + EPSILON)
        )

        if self.use_vail:
            # KL divergence loss (encourage latent representation to be normal)
            self.kl_loss = tf.reduce_mean(
                -tf.reduce_sum(
                    1
                    + self.z_log_sigma_sq
                    - 0.5 * tf.square(self.z_mean_expert)
                    - 0.5 * tf.square(self.z_mean_policy)
                    - tf.exp(self.z_log_sigma_sq),
                    1,
                )
            )
            self.loss = (
                self.beta * (self.kl_loss - self.mutual_information)
                + self.discriminator_loss
            )
        else:
            self.loss = self.discriminator_loss

        if self.gradient_penalty_weight > 0.0:
            self.loss += self.gradient_penalty_weight * self.create_gradient_magnitude()

        optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate)
        self.update_batch = optimizer.minimize(self.loss)
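For reference, the discriminator term alone is the standard GAN cross-entropy: it rewards scoring expert samples near 1 and policy samples near 0. A plain NumPy sketch of that term (EPSILON assumed; the VAIL and gradient-penalty terms are omitted):

    import numpy as np

    EPSILON = 1e-7  # assumed small constant for illustration

    def gail_discriminator_loss(expert_estimate: np.ndarray, policy_estimate: np.ndarray) -> float:
        """Cross-entropy mirroring the discriminator_loss node above."""
        return float(
            -np.mean(np.log(expert_estimate + EPSILON) + np.log(1.0 - policy_estimate + EPSILON))
        )

    print(gail_discriminator_loss(np.array([0.9, 0.8]), np.array([0.2, 0.1])))  # low loss: well separated
    print(gail_discriminator_loss(np.array([0.5, 0.5]), np.array([0.5, 0.5])))  # ~1.386 (2*log 2): no separation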
Example 4
    @staticmethod
    def create_normalizer(vector_obs: tf.Tensor) -> NormalizerTensors:
        """
        Creates the normalizer and the variables required to store its state.
        :param vector_obs: A Tensor representing the next value to normalize. When the
            update operation is called, it will use vector_obs to update the running mean
            and variance.
        :return: A NormalizerTensors tuple that holds running mean, running variance, number of steps,
            and the update operation.
        """
        vec_obs_size = vector_obs.shape[1]

        steps = tf.get_variable(
            "normalization_steps",
            [],
            trainable=False,
            dtype=tf.int32,
            initializer=tf.zeros_initializer(),
        )
        running_mean = tf.get_variable(
            "running_mean",
            [vec_obs_size],
            trainable=False,
            dtype=tf.float32,
            initializer=tf.zeros_initializer(),
        )
        running_variance = tf.get_variable(
            "running_variance",
            [vec_obs_size],
            trainable=False,
            dtype=tf.float32,
            initializer=tf.ones_initializer(),
        )
        (
            initialize_normalization,
            update_normalization,
        ) = ModelUtils.create_normalizer_update(vector_obs, steps,
                                                running_mean, running_variance)
        return NormalizerTensors(
            initialize_normalization,
            update_normalization,
            steps,
            running_mean,
            running_variance,
        )
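A hedged usage sketch of the returned tuple, under TF1 graph-mode assumptions: the observation size, the clipping range, and the way the statistics are combined below are illustrative choices rather than part of the snippet, and ModelUtils / NormalizerTensors are assumed to be importable from the module that defines it.

    import tensorflow as tf  # TensorFlow 1.x-style API, matching the snippet above
    # ModelUtils (and its NormalizerTensors return type) are assumed to be imported
    # from the module that defines create_normalizer.

    vector_obs = tf.placeholder(shape=[None, 8], dtype=tf.float32, name="vector_observation")

    # NormalizerTensors is constructed positionally above, so tuple unpacking matches it;
    # init_op re-seeds the statistics and is not exercised in this sketch.
    init_op, update_op, steps, running_mean, running_variance = ModelUtils.create_normalizer(
        vector_obs
    )

    # One common way to consume the statistics: standardize the observation, then clip.
    normalized_obs = tf.clip_by_value(
        (vector_obs - running_mean)
        / tf.sqrt(running_variance / (tf.cast(steps, tf.float32) + 1.0)),
        -5.0,
        5.0,
    )

    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        batch = [[0.0] * 8, [1.0] * 8]
        sess.run(update_op, feed_dict={vector_obs: batch})  # fold the batch into the running stats
        print(sess.run(normalized_obs, feed_dict={vector_obs: batch}))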