def create_network(self) -> None:
    """
    Build the graph nodes used to compute the GAIL intrinsic reward.

    If ``self.use_vail`` is set, the ``gail_sigma_vail`` variable, its square,
    the log of that square and the ``gail_NoiseLevel`` placeholder are created
    first. ``self.create_encoder`` is then called once on the expert inputs
    (``reuse=False``) and once on the policy inputs (``reuse=True``). The
    policy-side estimate is flattened into ``self.discriminator_score`` and
    turned into ``self.intrinsic_reward``.
    """
    if self.use_vail:
        sigma = tf.get_variable(
            "gail_sigma_vail",
            self.z_size,
            dtype=tf.float32,
            initializer=tf.ones_initializer(),
        )
        self.z_sigma = sigma
        self.z_sigma_sq = sigma * sigma
        # EPSILON keeps the log finite if sigma collapses to zero.
        self.z_log_sigma_sq = tf.log(self.z_sigma_sq + EPSILON)
        self.use_noise = tf.placeholder(
            shape=[1], dtype=tf.float32, name="gail_NoiseLevel"
        )

    # Expert pass is built first with reuse=False; the policy pass below
    # is built with reuse=True.
    expert_outputs = self.create_encoder(
        self.encoded_expert,
        self.expert_action,
        self.done_expert,
        reuse=False,
    )
    self.expert_estimate, self.z_mean_expert, _ = expert_outputs

    policy_outputs = self.create_encoder(
        self.encoded_policy,
        self.policy.selected_actions,
        self.done_policy,
        reuse=True,
    )
    self.policy_estimate, self.z_mean_policy, _ = policy_outputs

    self.mean_policy_estimate = tf.reduce_mean(self.policy_estimate)
    self.mean_expert_estimate = tf.reduce_mean(self.expert_estimate)

    self.discriminator_score = tf.reshape(
        self.policy_estimate, [-1], name="gail_reward"
    )
    # reward = -log(1 - D + eps)
    one_minus_score = 1.0 - self.discriminator_score + EPSILON
    self.intrinsic_reward = -tf.log(one_minus_score)
def create_discrete_action_masking_layer(all_logits, action_masks, action_size):
    """
    Apply action masks to concatenated discrete-action logits and sample.

    :param all_logits: The concatenated unnormalized action probabilities for
        all branches
    :param action_masks: The mask for the logits. Must be of dimension
        [None x total_number_of_action]
    :param action_size: A list containing the number of possible actions for
        each branch
    :return: The action output dimension [batch_size, num_branches], the
        concatenated normalized probs (after softmax) and the concatenated
        normalized log probs
    """
    # Column boundaries of every branch inside the concatenated tensors.
    bounds = [0] + list(np.cumsum(action_size))
    spans = list(zip(bounds[:-1], bounds[1:]))

    branches_logits = [all_logits[:, start:stop] for start, stop in spans]
    branch_masks = [action_masks[:, start:stop] for start, stop in spans]

    # Softmax each branch, shift by EPSILON, then multiply by that branch's
    # mask.
    raw_probs = [
        tf.multiply(tf.nn.softmax(logits) + EPSILON, mask)
        for logits, mask in zip(branches_logits, branch_masks)
    ]
    # Renormalize so that each branch sums to one again after masking.
    normalized_probs = [
        tf.divide(probs, tf.reduce_sum(probs, axis=1, keepdims=True))
        for probs in raw_probs
    ]

    # Draw one action index per branch.
    sampled_actions = [
        tf.multinomial(tf.log(probs + EPSILON), 1) for probs in normalized_probs
    ]
    output = tf.concat(sampled_actions, axis=1)

    all_probs = tf.concat(normalized_probs, axis=1)
    all_log_probs = tf.concat(
        [tf.log(probs + EPSILON) for probs in normalized_probs], axis=1
    )
    return output, all_probs, all_log_probs
def _create_entropy(
    self, encoded: "GaussianDistribution.MuSigmaTensors"
) -> tf.Tensor:
    """
    Build the entropy tensor from ``encoded.log_sigma``.

    A single scalar, 0.5 * mean(log(2*pi*e) + 2*log_sigma), is computed and
    then multiplied by a ones tensor shaped like the flattened first column
    of ``encoded.mu``.

    :param encoded: Object exposing the ``mu`` and ``log_sigma`` tensors.
    :return: The scalar entropy repeated once per entry of ``mu[:, 0]``.
    """
    mean_log_term = tf.reduce_mean(
        tf.log(2 * np.pi * np.e) + 2 * encoded.log_sigma
    )
    scalar_entropy = 0.5 * mean_log_term

    # Make entropy the right shape
    first_column = tf.reshape(encoded.mu[:, 0], [-1])
    return tf.ones_like(first_column) * scalar_entropy
def create_loss(self, learning_rate: float, anneal_steps: int) -> None:
    """
    Build the loss, learning-rate and update nodes for the BC module.

    :param learning_rate: The learning rate for the optimizer
    :param anneal_steps: Number of steps over which to anneal the
        learning_rate; when not positive, the rate is held in a plain
        ``tf.Variable`` instead of being decayed.
    """
    policy_output = self.policy.output

    if not self.policy.use_continuous_act:
        # Negative log of the softmaxed policy log-probs, weighted by the
        # expert action tensor.
        branch_log_probs = self.policy.all_log_probs
        self.loss = tf.reduce_mean(
            -tf.log(tf.nn.softmax(branch_log_probs) + 1e-7) * self.expert_action
        )
    else:
        # Mean squared error against the expert action.
        self.loss = tf.reduce_mean(
            tf.squared_difference(policy_output, self.expert_action)
        )

    if anneal_steps > 0:
        # Linear (power=1.0) decay of the rate down to 0.0.
        rate = tf.train.polynomial_decay(
            learning_rate,
            self.policy.global_step,
            anneal_steps,
            0.0,
            power=1.0,
        )
    else:
        rate = tf.Variable(learning_rate)
    self.annealed_learning_rate = rate

    bc_optimizer = tf.train.AdamOptimizer(
        learning_rate=self.annealed_learning_rate, name="bc_adam"
    )
    self.update_batch = bc_optimizer.minimize(self.loss)
def create_loss(self, learning_rate: float) -> None:
    """
    Build the loss and update nodes for the GAIL reward generator.

    The base loss is the discriminator loss. With ``self.use_vail`` a
    ``gail_beta``-weighted (KL - ``self.mutual_information``) term is added,
    and with a positive ``self.gradient_penalty_weight`` the weighted result
    of ``self.create_gradient_magnitude()`` is added on top.

    :param learning_rate: The learning rate for the optimizer
    """
    self.mean_expert_estimate = tf.reduce_mean(self.expert_estimate)
    self.mean_policy_estimate = tf.reduce_mean(self.policy_estimate)

    if self.use_vail:
        # Non-trainable scalar weight on the VAIL term, initialized to one.
        self.beta = tf.get_variable(
            "gail_beta",
            [],
            trainable=False,
            dtype=tf.float32,
            initializer=tf.ones_initializer(),
        )

    # -mean(log(D_expert + eps) + log(1 - D_policy + eps))
    log_expert = tf.log(self.expert_estimate + EPSILON)
    log_not_policy = tf.log(1.0 - self.policy_estimate + EPSILON)
    self.discriminator_loss = -tf.reduce_mean(log_expert + log_not_policy)

    if not self.use_vail:
        self.loss = self.discriminator_loss
    else:
        # KL divergence loss (encourage latent representation to be normal)
        self.kl_loss = tf.reduce_mean(
            -tf.reduce_sum(
                1
                + self.z_log_sigma_sq
                - 0.5 * tf.square(self.z_mean_expert)
                - 0.5 * tf.square(self.z_mean_policy)
                - tf.exp(self.z_log_sigma_sq),
                1,
            )
        )
        self.loss = (
            self.beta * (self.kl_loss - self.mutual_information)
            + self.discriminator_loss
        )

    if self.gradient_penalty_weight > 0.0:
        self.loss += (
            self.gradient_penalty_weight * self.create_gradient_magnitude()
        )

    gail_optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate)
    self.update_batch = gail_optimizer.minimize(self.loss)
def create_discrete_action_masking_layer(
    branches_logits: List[tf.Tensor],
    action_masks: tf.Tensor,
    action_size: List[int],
) -> Tuple[tf.Tensor, tf.Tensor, tf.Tensor]:
    """
    Apply action masks to per-branch discrete-action logits and sample.

    :param branches_logits: A List of the unnormalized action probabilities
        for each branch
    :param action_masks: The mask for the logits. Must be of dimension
        [None x total_number_of_action]
    :param action_size: A list containing the number of possible actions for
        each branch
    :return: The action output dimension [batch_size, num_branches], the
        concatenated normalized probs (after softmax) and the concatenated
        normalized log probs
    """
    branch_masks = ModelUtils.break_into_branches(action_masks, action_size)
    branch_count = len(action_size)

    # Softmax each branch, shift by EPSILON, then multiply by that branch's
    # mask. Indexed on purpose: both lists are addressed by branch number.
    masked_probs = [
        tf.multiply(
            tf.nn.softmax(branches_logits[idx]) + EPSILON, branch_masks[idx]
        )
        for idx in range(branch_count)
    ]
    # Renormalize so that each branch sums to one again after masking.
    normalized_probs = [
        tf.divide(probs, tf.reduce_sum(probs, axis=1, keepdims=True))
        for probs in masked_probs
    ]

    # Draw one action index per branch.
    sampled_actions = [
        tf.multinomial(tf.log(probs + EPSILON), 1) for probs in normalized_probs
    ]
    output = tf.concat(sampled_actions, axis=1)

    all_probs = tf.concat(normalized_probs, axis=1)
    all_log_probs = tf.concat(
        [tf.log(probs + EPSILON) for probs in normalized_probs], axis=1
    )
    return output, all_probs, all_log_probs
def create_inverse_model(
    self, encoded_state: tf.Tensor, encoded_next_state: tf.Tensor
) -> None:
    """
    Build the inverse-model ops of the Curiosity module.

    The model predicts the action taken from the current and next encoded
    states and stores the resulting loss in ``self.inverse_loss``.

    :param encoded_state: Tensor corresponding to encoded current state.
    :param encoded_next_state: Tensor corresponding to encoded next state.
    """
    policy = self.policy_model

    state_pair = tf.concat([encoded_state, encoded_next_state], axis=1)
    hidden = tf.layers.dense(state_pair, 256, activation=LearningModel.swish)

    if policy.brain.vector_action_space_type == "continuous":
        # Regression head: summed squared error per sample.
        predicted = tf.layers.dense(hidden, policy.act_size[0], activation=None)
        per_sample_loss = tf.reduce_sum(
            tf.squared_difference(predicted, policy.selected_actions),
            axis=1,
        )
    else:
        # One softmax head per action branch, concatenated; cross-entropy
        # against the selected actions.
        branch_predictions = [
            tf.layers.dense(hidden, branch_size, activation=tf.nn.softmax)
            for branch_size in policy.act_size
        ]
        predicted = tf.concat(branch_predictions, axis=1)
        per_sample_loss = tf.reduce_sum(
            -tf.log(predicted + 1e-10) * policy.selected_actions,
            axis=1,
        )

    # Average only over partition 1 of the mask (entries whose mask is 1).
    kept_losses = tf.dynamic_partition(per_sample_loss, policy.mask, 2)[1]
    self.inverse_loss = tf.reduce_mean(kept_losses)
def _do_squash_correction_for_tanh(self, probs, squashed_policy):
    """
    Correct log-probabilities for a tanh-squashed sample.

    :param probs: Log-probabilities of the sample before squashing.
    :param squashed_policy: The sample after tanh has been applied.
    :return: ``probs`` minus log(1 - squashed_policy^2 + EPSILON).
    """
    # 1 - tanh(x)^2 is the derivative of tanh; EPSILON keeps the log finite
    # when the squashed value saturates at +/-1.
    log_tanh_derivative = tf.log(1 - squashed_policy ** 2 + EPSILON)
    return probs - log_tanh_derivative
def create_cc_actor(self, hidden_policy, scope):
    """
    Build the continuous-control actor for SAC.

    :param hidden_policy: Output of feature extractor (i.e. the input for
        vector obs, output of CNN for visual obs).
    :param scope: TF scope to assign whatever is created in this block; it is
        joined with "policy" before use.
    """
    # Create action input (continuous)
    action_ph = tf.placeholder(
        shape=[None, self.act_size[0]], dtype=tf.float32, name="action_holder"
    )
    self.action_holder = action_ph
    self.external_action_in = action_ph

    policy_scope = self.join_scopes(scope, "policy")

    with tf.variable_scope(policy_scope):
        features = self.create_vector_observation_encoder(
            hidden_policy,
            self.h_size,
            self.activ_fn,
            self.num_layers,
            "encoder",
            False,
        )

    # The recurrent encoder sits between the two scoped sections.
    if self.use_recurrent:
        features, lstm_memory = self.create_recurrent_encoder(
            features,
            self.policy_memory_in,
            self.sequence_length,
            name="lstm_policy",
        )
        self.policy_memory_out = lstm_memory

    # NOTE(review): the extent of this scope was reconstructed from flattened
    # source; the named "action" identity below is kept outside it - confirm.
    with tf.variable_scope(policy_scope):
        mu = tf.layers.dense(
            features,
            self.act_size[0],
            activation=None,
            name="mu",
            kernel_initializer=LearningModel.scaled_init(0.01),
        )

        # Policy-dependent log_sigma_sq. Despite the attribute name, the
        # layer is named "log_std" and exp() of it scales the noise directly.
        raw_log_std = tf.layers.dense(
            features,
            self.act_size[0],
            activation=None,
            name="log_std",
            kernel_initializer=LearningModel.scaled_init(0.01),
        )
        self.log_sigma_sq = tf.clip_by_value(
            raw_log_std, LOG_STD_MIN, LOG_STD_MAX
        )
        std = tf.exp(self.log_sigma_sq)

        # Do the reparameterization trick
        noise = tf.random_normal(tf.shape(mu))
        pre_squash = mu + noise * std

        per_dim_log_prob = -0.5 * (
            ((pre_squash - mu) / (tf.exp(self.log_sigma_sq) + EPSILON)) ** 2
            + 2 * self.log_sigma_sq
            + np.log(2 * np.pi)
        )
        log_prob = tf.reduce_sum(per_dim_log_prob, axis=1, keepdims=True)

        self.entropy = tf.reduce_sum(
            self.log_sigma_sq + 0.5 * np.log(2.0 * np.pi * np.e), axis=-1
        )

        # Squash probabilities
        # Keep deterministic around in case we want to use it.
        self.deterministic_output = tf.tanh(mu)

        # Note that this is just for symmetry with PPO.
        self.output_pre = tf.tanh(pre_squash)

        # Squash correction
        squash_term = tf.reduce_sum(
            tf.log(1 - self.output_pre ** 2 + EPSILON), axis=1, keepdims=True
        )
        log_prob = log_prob - squash_term

        self.all_log_probs = log_prob
        self.selected_actions = tf.stop_gradient(self.output_pre)
        self.action_probs = log_prob

    # Extract output for Barracuda
    self.output = tf.identity(self.output_pre, name="action")

    # Get all policy vars
    self.policy_vars = self.get_vars(policy_scope)
def __init__(
    self,
    brain,
    h_size=128,
    lr=1e-4,
    n_layers=2,
    m_size=128,
    normalize=False,
    use_recurrent=False,
    seed=0,
):
    """
    Build the behavioral-cloning graph: encoder, policy head, loss, update.

    :param brain: Brain description; forwarded to ``LearningModel.__init__``
        and queried for ``vector_action_space_type``.
    :param h_size: Hidden size passed to ``create_observation_streams``.
    :param lr: Learning rate of the Adam optimizer.
    :param n_layers: Layer count passed to ``create_observation_streams``.
    :param m_size: Memory size, forwarded to ``LearningModel.__init__``.
    :param normalize: Forwarded to ``LearningModel.__init__``.
    :param use_recurrent: Forwarded to ``LearningModel.__init__``.
    :param seed: Forwarded to ``LearningModel.__init__``.
    """
    LearningModel.__init__(self, m_size, normalize, use_recurrent, brain, seed)

    stream_count = 1
    streams = self.create_observation_streams(stream_count, h_size, n_layers)
    encoded = streams[0]

    self.dropout_rate = tf.placeholder(
        dtype=tf.float32, shape=[], name="dropout_rate"
    )
    features = tf.layers.dropout(encoded, self.dropout_rate)

    if self.use_recurrent:
        # Created only for its named graph node; the handle is not kept.
        tf.Variable(
            self.m_size, name="memory_size", trainable=False, dtype=tf.int32
        )
        self.memory_in = tf.placeholder(
            shape=[None, self.m_size], dtype=tf.float32, name="recurrent_in"
        )
        features, lstm_memory = self.create_recurrent_encoder(
            features, self.memory_in, self.sequence_length
        )
        self.memory_out = tf.identity(lstm_memory, name="recurrent_out")

    if brain.vector_action_space_type == "discrete":
        # One bias-free linear head per action branch.
        branch_logits = [
            tf.layers.dense(
                features,
                branch_size,
                activation=None,
                use_bias=False,
                kernel_initializer=tf.initializers.variance_scaling(0.01),
            )
            for branch_size in self.act_size
        ]
        self.action_probs = tf.concat(
            [tf.nn.softmax(logits) for logits in branch_logits],
            axis=1,
            name="action_probs",
        )
        self.action_masks = tf.placeholder(
            shape=[None, sum(self.act_size)],
            dtype=tf.float32,
            name="action_masks",
        )

        all_logits = tf.concat(branch_logits, axis=1)
        masking_outputs = self.create_discrete_action_masking_layer(
            all_logits, self.action_masks, self.act_size
        )
        self.sample_action_float, _, normalized_logits = masking_outputs

        # Created only so that a node named "action" exists in the graph.
        tf.identity(normalized_logits, name="action")
        self.sample_action = tf.cast(self.sample_action_float, tf.int32)

        self.true_action = tf.placeholder(
            shape=[None, len(branch_logits)],
            dtype=tf.int32,
            name="teacher_action",
        )
        # One-hot encode the teacher action of every branch, side by side.
        self.action_oh = tf.concat(
            [
                tf.one_hot(self.true_action[:, branch], branch_size)
                for branch, branch_size in enumerate(self.act_size)
            ],
            axis=1,
        )
        # Summed cross-entropy between predicted probs and teacher one-hots.
        self.loss = tf.reduce_sum(
            -tf.log(self.action_probs + 1e-10) * self.action_oh
        )

        greedy_action = tf.cast(tf.argmax(self.action_probs, axis=1), tf.int32)
        matches = tf.equal(greedy_action, self.sample_action)
        self.action_percent = tf.reduce_mean(tf.cast(matches, tf.float32))
    else:
        self.policy = tf.layers.dense(
            features,
            self.act_size[0],
            activation=None,
            use_bias=False,
            name="pre_action",
            kernel_initializer=tf.initializers.variance_scaling(0.01),
        )
        self.clipped_sample_action = tf.clip_by_value(self.policy, -1, 1)
        self.sample_action = tf.identity(
            self.clipped_sample_action, name="action"
        )
        self.true_action = tf.placeholder(
            shape=[None, self.act_size[0]],
            dtype=tf.float32,
            name="teacher_action",
        )
        self.clipped_true_action = tf.clip_by_value(self.true_action, -1, 1)
        # Summed squared error between clipped teacher and model actions.
        self.loss = tf.reduce_sum(
            tf.squared_difference(self.clipped_true_action, self.sample_action)
        )

    adam = tf.train.AdamOptimizer(learning_rate=lr)
    self.update = adam.minimize(self.loss)
def create_cc_actor_critic(
    self, h_size: int, num_layers: int, vis_encode_type: EncoderType
) -> None:
    """
    Build the continuous-control actor-critic model.

    :param h_size: Size of hidden linear layers.
    :param num_layers: Number of hidden linear layers.
    :param vis_encode_type: Forwarded to ``create_observation_streams``.
    """
    # Two streams: index 0 feeds the policy, index 1 feeds the value heads.
    streams = self.create_observation_streams(
        2, h_size, num_layers, vis_encode_type
    )

    if not self.use_recurrent:
        policy_features = streams[0]
        value_features = streams[1]
    else:
        self.memory_in = tf.placeholder(
            shape=[None, self.m_size], dtype=tf.float32, name="recurrent_in"
        )
        # The memory vector is split in two: first half for the policy LSTM,
        # second half for the value LSTM.
        split_at = int(self.m_size / 2)
        policy_features, policy_memory = self.create_recurrent_encoder(
            streams[0],
            self.memory_in[:, :split_at],
            self.sequence_length,
            name="lstm_policy",
        )
        value_features, value_memory = self.create_recurrent_encoder(
            streams[1],
            self.memory_in[:, split_at:],
            self.sequence_length,
            name="lstm_value",
        )
        self.memory_out = tf.concat(
            [policy_memory, value_memory], axis=1, name="recurrent_out"
        )

    mu = tf.layers.dense(
        policy_features,
        self.act_size[0],
        activation=None,
        kernel_initializer=LearningModel.scaled_init(0.01),
        reuse=tf.AUTO_REUSE,
    )

    # Log-variance is a free variable here, not a function of the input.
    self.log_sigma_sq = tf.get_variable(
        "log_sigma_squared",
        [self.act_size[0]],
        dtype=tf.float32,
        initializer=tf.zeros_initializer(),
    )
    variance = tf.exp(self.log_sigma_sq)

    # Noise is fed from outside through this placeholder.
    self.epsilon = tf.placeholder(
        shape=[None, self.act_size[0]], dtype=tf.float32, name="epsilon"
    )
    self.output_pre = mu + tf.sqrt(variance) * self.epsilon

    # Clip and scale output to ensure actions are always within [-1, 1] range.
    scaled_output = tf.clip_by_value(self.output_pre, -3, 3) / 3
    self.output = tf.identity(scaled_output, name="action")
    self.selected_actions = tf.stop_gradient(scaled_output)

    # Compute probability of model output.
    per_dim_log_prob = (
        -0.5 * tf.square(tf.stop_gradient(self.output_pre) - mu) / variance
        - 0.5 * tf.log(2.0 * np.pi)
        - 0.5 * self.log_sigma_sq
    )
    self.all_log_probs = tf.identity(per_dim_log_prob, name="action_probs")

    self.entropy = 0.5 * tf.reduce_mean(
        tf.log(2 * np.pi * np.e) + self.log_sigma_sq
    )

    self.create_value_heads(self.stream_names, value_features)

    self.all_old_log_probs = tf.placeholder(
        shape=[None, self.act_size[0]],
        dtype=tf.float32,
        name="old_probabilities",
    )

    # We keep these tensors the same name, but use new nodes to keep code
    # parallelism with discrete control.
    self.log_probs = tf.reduce_sum(
        tf.identity(self.all_log_probs), axis=1, keepdims=True
    )
    self.old_log_probs = tf.reduce_sum(
        tf.identity(self.all_old_log_probs), axis=1, keepdims=True
    )
def _create_cc_actor(
    self,
    encoded: tf.Tensor,
    tanh_squash: bool = False,
    reparameterize: bool = False,
    condition_sigma_on_obs: bool = True,
) -> None:
    """
    Creates the continuous-control actor (policy) nodes.

    :param encoded: Encoded observation tensor the policy is built on.
    :param tanh_squash: Whether to use a tanh function, or a clipped output.
    :param reparameterize: Whether we are using the resampling trick to
        update the policy. When False, gradients are stopped through the
        sample used in the log-probability.
    :param condition_sigma_on_obs: When True, log_sigma is a dense layer on
        the policy features; otherwise it is a standalone variable.
    """
    if self.use_recurrent:
        self.memory_in = tf.placeholder(
            shape=[None, self.m_size], dtype=tf.float32, name="recurrent_in"
        )
        hidden_policy, memory_policy_out = ModelUtils.create_recurrent_encoder(
            encoded,
            self.memory_in,
            self.sequence_length_ph,
            name="lstm_policy",
        )
        self.memory_out = tf.identity(memory_policy_out, name="recurrent_out")
    else:
        hidden_policy = encoded

    # NOTE(review): the extent of this scope was reconstructed from flattened
    # source; it is closed after the clip so that the named "action" and
    # "action_probs" nodes below stay unprefixed - confirm.
    with tf.variable_scope("policy"):
        mu = tf.layers.dense(
            hidden_policy,
            self.act_size[0],
            activation=None,
            name="mu",
            kernel_initializer=ModelUtils.scaled_init(0.01),
            reuse=tf.AUTO_REUSE,
        )

        # Policy-dependent log_sigma
        if condition_sigma_on_obs:
            log_sigma = tf.layers.dense(
                hidden_policy,
                self.act_size[0],
                activation=None,
                name="log_sigma",
                kernel_initializer=ModelUtils.scaled_init(0.01),
            )
        else:
            log_sigma = tf.get_variable(
                "log_sigma",
                [self.act_size[0]],
                dtype=tf.float32,
                initializer=tf.zeros_initializer(),
            )
        log_sigma = tf.clip_by_value(
            log_sigma, self.log_std_min, self.log_std_max
        )

    sigma = tf.exp(log_sigma)
    epsilon = tf.random_normal(tf.shape(mu))
    sampled_policy = mu + sigma * epsilon

    # Stop gradient if we're not doing the resampling trick
    if not reparameterize:
        sampled_policy_probs = tf.stop_gradient(sampled_policy)
    else:
        sampled_policy_probs = sampled_policy

    # Compute probability of model output.
    _gauss_pre = -0.5 * (
        ((sampled_policy_probs - mu) / (sigma + EPSILON)) ** 2
        + 2 * log_sigma
        + np.log(2 * np.pi)
    )
    # Sum the per-dimension log-probabilities into one value per sample.
    # (The original first aliased all_probs to _gauss_pre and immediately
    # overwrote it; that dead store is removed.)
    all_probs = tf.reduce_sum(_gauss_pre, axis=1, keepdims=True)

    if tanh_squash:
        self.output_pre = tf.tanh(sampled_policy)

        # Squash correction
        all_probs -= tf.reduce_sum(
            tf.log(1 - self.output_pre ** 2 + EPSILON), axis=1, keepdims=True
        )
        self.output = tf.identity(self.output_pre, name="action")
    else:
        self.output_pre = sampled_policy
        # Clip and scale output to ensure actions are always within [-1, 1]
        # range.
        output_post = tf.clip_by_value(self.output_pre, -3, 3) / 3
        self.output = tf.identity(output_post, name="action")

    self.selected_actions = tf.stop_gradient(self.output)

    self.all_log_probs = tf.identity(all_probs, name="action_probs")

    single_dim_entropy = 0.5 * tf.reduce_mean(
        tf.log(2 * np.pi * np.e) + 2 * log_sigma
    )
    # Make entropy the right shape
    self.entropy = tf.ones_like(tf.reshape(mu[:, 0], [-1])) * single_dim_entropy

    # We keep these tensors the same name, but use new nodes to keep code
    # parallelism with discrete control.
    self.log_probs = tf.reduce_sum(
        tf.identity(self.all_log_probs), axis=1, keepdims=True
    )

    self.action_holder = tf.placeholder(
        shape=[None, self.act_size[0]], dtype=tf.float32, name="action_holder"
    )