def _create_entropy(
    self, encoded: "GaussianDistribution.MuSigmaTensors"
) -> tf.Tensor:
    # Differential entropy of a diagonal Gaussian is
    # 0.5 * (log(2 * pi * e) + 2 * log_sigma) per dimension; the previous
    # tf.square(encoded.log_sigma) squared the log instead of doubling it.
    single_dim_entropy = 0.5 * tf.reduce_mean(
        tf.log(2 * np.pi * np.e) + 2 * encoded.log_sigma
    )
    # Make entropy the right shape
    return tf.ones_like(tf.reshape(encoded.mu[:, 0], [-1])) * single_dim_entropy
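
# Hedged sketch (not part of the original class; name and signature are
# illustrative): a plain NumPy version of the closed-form entropy built
# symbolically above, H = 0.5 * (log(2 * pi * e) + 2 * log_sigma), averaged
# over action dimensions. Useful as a quick sanity check outside the graph.
def _numpy_gaussian_entropy_sketch(log_sigma: "np.ndarray") -> float:
    # Mean per-dimension differential entropy of a diagonal Gaussian.
    return float(0.5 * np.mean(np.log(2 * np.pi * np.e) + 2 * log_sigma))
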
def create_loss(self, learning_rate: float) -> None:
    """
    Creates the loss and update nodes for the GAIL reward generator
    :param learning_rate: The learning rate for the optimizer
    """
    self.mean_expert_estimate = tf.reduce_mean(self.expert_estimate)
    self.mean_policy_estimate = tf.reduce_mean(self.policy_estimate)

    if self.use_vail:
        self.beta = tf.get_variable(
            "gail_beta",
            [],
            trainable=False,
            dtype=tf.float32,
            initializer=tf.ones_initializer(),
        )

    self.discriminator_loss = -tf.reduce_mean(
        tf.log(self.expert_estimate + EPSILON)
        + tf.log(1.0 - self.policy_estimate + EPSILON)
    )

    if self.use_vail:
        # KL divergence loss (encourage latent representation to be normal)
        self.kl_loss = tf.reduce_mean(
            -tf.reduce_sum(
                1
                + self.z_log_sigma_sq
                - 0.5 * tf.square(self.z_mean_expert)
                - 0.5 * tf.square(self.z_mean_policy)
                - tf.exp(self.z_log_sigma_sq),
                1,
            )
        )
        self.loss = (
            self.beta * (self.kl_loss - self.mutual_information)
            + self.discriminator_loss
        )
    else:
        self.loss = self.discriminator_loss

    if self.gradient_penalty_weight > 0.0:
        self.loss += self.gradient_penalty_weight * self.create_gradient_magnitude()

    optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate)
    self.update_batch = optimizer.minimize(self.loss)
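
# Hedged sketch (assumption: plain NumPy, outside the TF graph; the constant
# and function name below are illustrative, not part of the class): the two
# scalar losses the graph above builds, namely the sigmoid GAN discriminator
# loss and, for VAIL, the KL term that keeps the latent code close to N(0, I).
EPSILON_SKETCH = 1e-7


def _gail_losses_sketch(expert_estimate, policy_estimate,
                        z_mean_expert, z_mean_policy, z_log_sigma_sq):
    # -E[log D(expert)] - E[log(1 - D(policy))], with a small epsilon for stability.
    discriminator_loss = -np.mean(
        np.log(expert_estimate + EPSILON_SKETCH)
        + np.log(1.0 - policy_estimate + EPSILON_SKETCH)
    )
    # VAE-style KL penalty on the latent encoding, summed over latent dims.
    kl_loss = np.mean(
        -np.sum(
            1
            + z_log_sigma_sq
            - 0.5 * np.square(z_mean_expert)
            - 0.5 * np.square(z_mean_policy)
            - np.exp(z_log_sigma_sq),
            axis=1,
        )
    )
    return discriminator_loss, kl_loss
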
def create_cc_actor_critic(
    self, h_size: int, num_layers: int, vis_encode_type: EncoderType
) -> None:
    """
    Creates Continuous control actor-critic model.
    :param h_size: Size of hidden linear layers.
    :param num_layers: Number of hidden linear layers.
    :param vis_encode_type: Type of visual observation encoder to use.
    """
    hidden_streams = self.create_observation_streams(
        2, h_size, num_layers, vis_encode_type
    )

    if self.use_recurrent:
        self.memory_in = tf.placeholder(
            shape=[None, self.m_size], dtype=tf.float32, name="recurrent_in"
        )
        _half_point = int(self.m_size / 2)
        hidden_policy, memory_policy_out = self.create_recurrent_encoder(
            hidden_streams[0],
            self.memory_in[:, :_half_point],
            self.sequence_length,
            name="lstm_policy",
        )
        hidden_value, memory_value_out = self.create_recurrent_encoder(
            hidden_streams[1],
            self.memory_in[:, _half_point:],
            self.sequence_length,
            name="lstm_value",
        )
        self.memory_out = tf.concat(
            [memory_policy_out, memory_value_out], axis=1, name="recurrent_out"
        )
    else:
        hidden_policy = hidden_streams[0]
        hidden_value = hidden_streams[1]

    mu = tf.layers.dense(
        hidden_policy,
        self.act_size[0],
        activation=None,
        kernel_initializer=LearningModel.scaled_init(0.01),
        reuse=tf.AUTO_REUSE,
    )

    self.log_sigma_sq = tf.get_variable(
        "log_sigma_squared",
        [self.act_size[0]],
        dtype=tf.float32,
        initializer=tf.zeros_initializer(),
    )
    sigma_sq = tf.exp(self.log_sigma_sq)

    self.epsilon = tf.placeholder(
        shape=[None, self.act_size[0]], dtype=tf.float32, name="epsilon"
    )

    # Reparameterized sample: action = mu + sigma * epsilon, epsilon ~ N(0, 1).
    self.output_pre = mu + tf.sqrt(sigma_sq) * self.epsilon

    # Clip and scale output to ensure actions are always within [-1, 1] range.
    output_post = tf.clip_by_value(self.output_pre, -3, 3) / 3
    self.output = tf.identity(output_post, name="action")
    self.selected_actions = tf.stop_gradient(output_post)

    # Compute per-dimension log probability of the (pre-clip) sample under the Gaussian policy.
    all_probs = (
        -0.5 * tf.square(tf.stop_gradient(self.output_pre) - mu) / sigma_sq
        - 0.5 * tf.log(2.0 * np.pi)
        - 0.5 * self.log_sigma_sq
    )
    self.all_log_probs = tf.identity(all_probs, name="action_probs")

    self.entropy = 0.5 * tf.reduce_mean(
        tf.log(2 * np.pi * np.e) + self.log_sigma_sq
    )

    self.create_value_heads(self.stream_names, hidden_value)

    self.all_old_log_probs = tf.placeholder(
        shape=[None, self.act_size[0]], dtype=tf.float32, name="old_probabilities"
    )

    # We keep these tensors the same name, but use new nodes to keep code parallelism with discrete control.
    self.log_probs = tf.reduce_sum(
        (tf.identity(self.all_log_probs)), axis=1, keepdims=True
    )
    self.old_log_probs = tf.reduce_sum(
        (tf.identity(self.all_old_log_probs)), axis=1, keepdims=True
    )
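
# Hedged sketch (assumption: NumPy, outside the graph; names are illustrative):
# what the continuous-control head above computes at sampling time. It draws a
# reparameterized action a = mu + sigma * eps with eps ~ N(0, 1), clips and
# scales it into [-1, 1], and evaluates the per-dimension Gaussian log-density
# at the pre-clip sample, mirroring output_pre / output_post / all_log_probs.
def _cc_sample_and_log_prob_sketch(mu, log_sigma_sq, epsilon):
    sigma_sq = np.exp(log_sigma_sq)
    # Reparameterized draw (pre-clip), matching self.output_pre.
    output_pre = mu + np.sqrt(sigma_sq) * epsilon
    # Clip-and-scale into [-1, 1], matching output_post.
    action = np.clip(output_pre, -3, 3) / 3
    # Per-dimension Gaussian log-density, matching all_log_probs.
    log_probs = (
        -0.5 * np.square(output_pre - mu) / sigma_sq
        - 0.5 * np.log(2.0 * np.pi)
        - 0.5 * log_sigma_sq
    )
    return action, log_probs
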