def create_network(self) -> None:
    """
    Helper for creating the intrinsic reward nodes
    """
    if self.use_vail:
        self.z_sigma = tf.get_variable(
            "gail_sigma_vail",
            self.z_size,
            dtype=tf.float32,
            initializer=tf.ones_initializer(),
        )
        self.z_sigma_sq = self.z_sigma * self.z_sigma
        self.z_log_sigma_sq = tf.log(self.z_sigma_sq + EPSILON)
        self.use_noise = tf.placeholder(
            shape=[1], dtype=tf.float32, name="gail_NoiseLevel"
        )
    self.expert_estimate, self.z_mean_expert, _ = self.create_encoder(
        self.encoded_expert, self.expert_action, self.done_expert, reuse=False
    )
    self.policy_estimate, self.z_mean_policy, _ = self.create_encoder(
        self.encoded_policy,
        self.policy.selected_actions,
        self.done_policy,
        reuse=True,
    )
    self.mean_policy_estimate = tf.reduce_mean(self.policy_estimate)
    self.mean_expert_estimate = tf.reduce_mean(self.expert_estimate)
    self.discriminator_score = tf.reshape(
        self.policy_estimate, [-1], name="gail_reward"
    )
    self.intrinsic_reward = -tf.log(1.0 - self.discriminator_score + EPSILON)
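# --- Illustrative sketch, not part of the original source ------------------
# The intrinsic_reward node above maps the discriminator's score in (0, 1) to
# -log(1 - D + EPSILON): the more expert-like a policy transition looks, the
# larger the reward. A NumPy mirror of that formula, for intuition only (the
# name gail_reward_sketch and the eps value are ours, not the library's):
import numpy as np

def gail_reward_sketch(scores: np.ndarray, eps: float = 1e-7) -> np.ndarray:
    """NumPy mirror of the intrinsic_reward node above, for intuition only."""
    return -np.log(1.0 - scores + eps)

# gail_reward_sketch(np.array([0.1, 0.5, 0.99])) -> approx. [0.105, 0.693, 4.605]:
# a score of 0.99 (the policy fools the discriminator) earns roughly 44x the
# reward of an unconvincing 0.1.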
def create_normalizer(self, vector_obs):
    """
    Creates the variables that hold the running mean and variance of the
    vector observations, along with the op that updates them.
    :param vector_obs: A Tensor of incoming vector observations to normalize.
    """
    # Note: the step counter is initialized to one here; the newer static
    # version further below starts it at zero.
    self.normalization_steps = tf.get_variable(
        "normalization_steps",
        [],
        trainable=False,
        dtype=tf.int32,
        initializer=tf.ones_initializer(),
    )
    self.running_mean = tf.get_variable(
        "running_mean",
        [self.vec_obs_size],
        trainable=False,
        dtype=tf.float32,
        initializer=tf.zeros_initializer(),
    )
    self.running_variance = tf.get_variable(
        "running_variance",
        [self.vec_obs_size],
        trainable=False,
        dtype=tf.float32,
        initializer=tf.ones_initializer(),
    )
    self.update_normalization = self.create_normalizer_update(vector_obs)
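# --- Sketch under stated assumptions, not the original implementation ------
# create_normalizer_update is called above but not shown. One plausible
# implementation, consistent with the variables created above, is a
# Welford-style streaming update over the batch mean; the real method may
# differ in detail (the _sketch suffix marks this as our guess):
def create_normalizer_update_sketch(self, vector_input):
    # Mean of the incoming batch along the batch dimension.
    batch_mean = tf.reduce_mean(vector_input, axis=0)
    # Incremental mean: m_new = m_old + (x - m_old) / (steps + 1).
    new_mean = self.running_mean + (batch_mean - self.running_mean) / tf.cast(
        self.normalization_steps + 1, tf.float32
    )
    # Incremental variance accumulator: M_new = M_old + (x - m_new) * (x - m_old).
    new_variance = self.running_variance + (batch_mean - new_mean) * (
        batch_mean - self.running_mean
    )
    update_mean = tf.assign(self.running_mean, new_mean)
    update_variance = tf.assign(self.running_variance, new_variance)
    update_steps = tf.assign(
        self.normalization_steps, self.normalization_steps + 1
    )
    return tf.group(update_mean, update_variance, update_steps)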
def create_loss(self, learning_rate: float) -> None:
    """
    Creates the loss and update nodes for the GAIL reward generator
    :param learning_rate: The learning rate for the optimizer
    """
    self.mean_expert_estimate = tf.reduce_mean(self.expert_estimate)
    self.mean_policy_estimate = tf.reduce_mean(self.policy_estimate)

    if self.use_vail:
        self.beta = tf.get_variable(
            "gail_beta",
            [],
            trainable=False,
            dtype=tf.float32,
            initializer=tf.ones_initializer(),
        )

    self.discriminator_loss = -tf.reduce_mean(
        tf.log(self.expert_estimate + EPSILON)
        + tf.log(1.0 - self.policy_estimate + EPSILON)
    )

    if self.use_vail:
        # KL divergence loss (encourage latent representation to be normal)
        self.kl_loss = tf.reduce_mean(
            -tf.reduce_sum(
                1
                + self.z_log_sigma_sq
                - 0.5 * tf.square(self.z_mean_expert)
                - 0.5 * tf.square(self.z_mean_policy)
                - tf.exp(self.z_log_sigma_sq),
                1,
            )
        )
        self.loss = (
            self.beta * (self.kl_loss - self.mutual_information)
            + self.discriminator_loss
        )
    else:
        self.loss = self.discriminator_loss

    if self.gradient_penalty_weight > 0.0:
        self.loss += (
            self.gradient_penalty_weight * self.create_gradient_magnitude()
        )

    optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate)
    self.update_batch = optimizer.minimize(self.loss)
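# --- Illustrative sketch, not part of the original source ------------------
# discriminator_loss above is the standard GAIL binary cross-entropy: expert
# samples are treated as "real" (label 1) and policy samples as "fake"
# (label 0). A NumPy mirror for intuition (names and eps are ours):
import numpy as np

def gail_discriminator_loss_sketch(expert_est, policy_est, eps=1e-7):
    """NumPy mirror of the discriminator_loss node, for intuition only."""
    return -np.mean(np.log(expert_est + eps) + np.log(1.0 - policy_est + eps))

# A near-perfect discriminator (expert ~0.99, policy ~0.01) gives ~0.02, while
# a maximally confused one (both 0.5) gives 2*log(2) ~= 1.386. Minimizing this
# trains the discriminator; the policy is then rewarded (see intrinsic_reward
# above) for pushing its own estimate toward 1.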
def create_normalizer(vector_obs: tf.Tensor) -> NormalizerTensors:
    """
    Creates the normalizer and the variables required to store its state.
    :param vector_obs: A Tensor representing the next value to normalize. When
        the update operation is called, it will use vector_obs to update the
        running mean and variance.
    :return: A NormalizerTensors tuple that holds the initialize and update
        operations, the number of steps, and the running mean and running
        variance.
    """
    vec_obs_size = vector_obs.shape[1]
    steps = tf.get_variable(
        "normalization_steps",
        [],
        trainable=False,
        dtype=tf.int32,
        initializer=tf.zeros_initializer(),
    )
    running_mean = tf.get_variable(
        "running_mean",
        [vec_obs_size],
        trainable=False,
        dtype=tf.float32,
        initializer=tf.zeros_initializer(),
    )
    running_variance = tf.get_variable(
        "running_variance",
        [vec_obs_size],
        trainable=False,
        dtype=tf.float32,
        initializer=tf.ones_initializer(),
    )
    (
        initialize_normalization,
        update_normalization,
    ) = ModelUtils.create_normalizer_update(
        vector_obs, steps, running_mean, running_variance
    )
    return NormalizerTensors(
        initialize_normalization,
        update_normalization,
        steps,
        running_mean,
        running_variance,
    )
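# --- Hypothetical usage sketch, not part of the original source ------------
# The field order is assumed from the return statement above (initialize op,
# update op, step counter, running mean, running variance), and the
# normalization formula below is our assumption, since the consuming code is
# not shown here.
import numpy as np
import tensorflow as tf

obs_in = tf.placeholder(shape=[None, 8], dtype=tf.float32, name="vector_obs")
init_op, update_op, steps, mean, variance = create_normalizer(obs_in)

# One plausible way to consume the stats: scale by the variance accumulator
# per step and clip to a fixed range.
normalized = tf.clip_by_value(
    (obs_in - mean) / tf.sqrt(variance / (tf.cast(steps, tf.float32) + 1)),
    -5.0,
    5.0,
)

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    batch = np.random.randn(32, 8).astype(np.float32)
    sess.run(update_op, feed_dict={obs_in: batch})  # fold batch into running stats
    sess.run(normalized, feed_dict={obs_in: batch})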