def create_gradient_magnitude(self) -> tf.Tensor:
    """
    Gradient penalty from https://arxiv.org/pdf/1704.00028. Adds stability,
    especially for off-policy training. Computes gradients w.r.t. a randomly
    interpolated input.
    """
    expert = [self.encoded_expert, self.expert_action, self.done_expert]
    policy = [
        self.encoded_policy,
        self.policy_model.selected_actions,
        self.done_policy,
    ]
    interp = []
    for _expert_in, _policy_in in zip(expert, policy):
        alpha = tf.random_uniform(tf.shape(_expert_in))
        interp.append(alpha * _expert_in + (1 - alpha) * _policy_in)

    grad_estimate, _, grad_input = self.create_encoder(
        interp[0], interp[1], interp[2], reuse=True
    )

    grad = tf.gradients(grad_estimate, [grad_input])[0]

    # The norm's gradient could be NaN at 0, so use our own safe_norm.
    safe_norm = tf.sqrt(tf.reduce_sum(grad ** 2, axis=-1) + EPSILON)
    gradient_mag = tf.reduce_mean(tf.pow(safe_norm - 1, 2))

    return gradient_mag
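# Illustrative sketch (not from the source): a self-contained version of the
# interpolation trick above, showing how the WGAN-GP penalty pushes the
# discriminator's gradient norm toward 1 at random points on the line between
# expert and policy samples. `discriminator`, `expert_batch`, and
# `policy_batch` are hypothetical names for illustration; `tf` is the
# module-level TensorFlow 1.x import used elsewhere in this file.
def _gradient_penalty_sketch(discriminator, expert_batch, policy_batch, epsilon=1e-7):
    # Sample one random interpolation coefficient per element.
    alpha = tf.random_uniform(tf.shape(expert_batch))
    interp = alpha * expert_batch + (1 - alpha) * policy_batch
    estimate = discriminator(interp)
    grad = tf.gradients(estimate, [interp])[0]
    # Same safe norm as above; a plain norm can have a NaN gradient at 0.
    safe_norm = tf.sqrt(tf.reduce_sum(grad ** 2, axis=-1) + epsilon)
    # Penalize the squared deviation of the gradient norm from 1.
    return tf.reduce_mean(tf.pow(safe_norm - 1, 2))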
def normalize_vector_obs(self, vector_obs):
    """
    Normalize a vector observation using the running mean and variance,
    clipping the result to [-5, 5].
    """
    normalized_state = tf.clip_by_value(
        (vector_obs - self.running_mean)
        / tf.sqrt(
            self.running_variance
            / (tf.cast(self.normalization_steps, tf.float32) + 1)
        ),
        -5,
        5,
        name="normalized_state",
    )
    return normalized_state
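# Illustrative sketch (not from the source): the method above assumes
# `running_mean`, `running_variance`, and `normalization_steps` are maintained
# elsewhere. One common way to maintain such statistics is an incremental
# (Welford-style) update; this NumPy sketch is a hypothetical example of that
# idea, not the source's actual update rule. It treats `running_variance` as
# an accumulated sum of squared deviations, which is consistent with the
# normalizer above dividing it by the step count before the square root.
# `np` is the module-level NumPy import used elsewhere in this file.
def _update_running_stats_sketch(obs, running_mean, running_variance, steps):
    steps += 1
    delta = obs - running_mean
    running_mean = running_mean + delta / steps
    # Accumulate squared deviations; variance is roughly running_variance / steps.
    running_variance = running_variance + delta * (obs - running_mean)
    return running_mean, running_variance, steps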
def normalize_vector_obs(
    vector_obs: tf.Tensor,
    running_mean: tf.Tensor,
    running_variance: tf.Tensor,
    normalization_steps: tf.Tensor,
) -> tf.Tensor:
    """
    Create a normalized version of an input tensor.
    :param vector_obs: Input vector observation tensor.
    :param running_mean: Tensorflow tensor representing the current running mean.
    :param running_variance: Tensorflow tensor representing the current running variance.
    :param normalization_steps: Tensorflow tensor representing the current number of normalization_steps.
    :return: A normalized version of vector_obs.
    """
    normalized_state = tf.clip_by_value(
        (vector_obs - running_mean)
        / tf.sqrt(
            running_variance / (tf.cast(normalization_steps, tf.float32) + 1)
        ),
        -5,
        5,
        name="normalized_state",
    )
    return normalized_state
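# Illustrative sketch (not from the source): a NumPy restatement of the graph
# op above, useful for sanity-checking the math outside a session. Assuming
# `running_variance` is an accumulated sum of squared deviations (hence the
# division by the step count), the observation is standardized and then
# clipped to [-5, 5].
def _normalize_vector_obs_sketch(vector_obs, running_mean, running_variance, normalization_steps):
    std = np.sqrt(running_variance / (float(normalization_steps) + 1))
    return np.clip((vector_obs - running_mean) / std, -5, 5)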
def create_cc_actor_critic(
    self, h_size: int, num_layers: int, vis_encode_type: EncoderType
) -> None:
    """
    Creates a continuous-control actor-critic model.
    :param h_size: Size of hidden linear layers.
    :param num_layers: Number of hidden linear layers.
    :param vis_encode_type: Type of encoder to use for visual observations.
    """
    hidden_streams = self.create_observation_streams(
        2, h_size, num_layers, vis_encode_type
    )

    if self.use_recurrent:
        self.memory_in = tf.placeholder(
            shape=[None, self.m_size], dtype=tf.float32, name="recurrent_in"
        )
        _half_point = int(self.m_size / 2)
        hidden_policy, memory_policy_out = self.create_recurrent_encoder(
            hidden_streams[0],
            self.memory_in[:, :_half_point],
            self.sequence_length,
            name="lstm_policy",
        )
        hidden_value, memory_value_out = self.create_recurrent_encoder(
            hidden_streams[1],
            self.memory_in[:, _half_point:],
            self.sequence_length,
            name="lstm_value",
        )
        self.memory_out = tf.concat(
            [memory_policy_out, memory_value_out], axis=1, name="recurrent_out"
        )
    else:
        hidden_policy = hidden_streams[0]
        hidden_value = hidden_streams[1]

    mu = tf.layers.dense(
        hidden_policy,
        self.act_size[0],
        activation=None,
        kernel_initializer=LearningModel.scaled_init(0.01),
        reuse=tf.AUTO_REUSE,
    )

    self.log_sigma_sq = tf.get_variable(
        "log_sigma_squared",
        [self.act_size[0]],
        dtype=tf.float32,
        initializer=tf.zeros_initializer(),
    )

    sigma_sq = tf.exp(self.log_sigma_sq)

    self.epsilon = tf.placeholder(
        shape=[None, self.act_size[0]], dtype=tf.float32, name="epsilon"
    )

    # Clip and scale output to ensure actions are always within [-1, 1] range.
    self.output_pre = mu + tf.sqrt(sigma_sq) * self.epsilon
    output_post = tf.clip_by_value(self.output_pre, -3, 3) / 3
    self.output = tf.identity(output_post, name="action")
    self.selected_actions = tf.stop_gradient(output_post)

    # Compute probability of model output.
    all_probs = (
        -0.5 * tf.square(tf.stop_gradient(self.output_pre) - mu) / sigma_sq
        - 0.5 * tf.log(2.0 * np.pi)
        - 0.5 * self.log_sigma_sq
    )

    self.all_log_probs = tf.identity(all_probs, name="action_probs")

    self.entropy = 0.5 * tf.reduce_mean(
        tf.log(2 * np.pi * np.e) + self.log_sigma_sq
    )

    self.create_value_heads(self.stream_names, hidden_value)

    self.all_old_log_probs = tf.placeholder(
        shape=[None, self.act_size[0]], dtype=tf.float32, name="old_probabilities"
    )

    # We keep these tensors the same name, but use new nodes to keep code
    # parallelism with discrete control.
    self.log_probs = tf.reduce_sum(
        (tf.identity(self.all_log_probs)), axis=1, keepdims=True
    )
    self.old_log_probs = tf.reduce_sum(
        (tf.identity(self.all_old_log_probs)), axis=1, keepdims=True
    )
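# Illustrative sketch (not from the source): a NumPy restatement of the
# per-dimension Gaussian log-density computed as `all_probs` above, handy for
# checking the math. For a diagonal Gaussian,
# log N(x | mu, sigma^2) = -(x - mu)^2 / (2 sigma^2) - 0.5*log(2*pi)
#                          - 0.5*log(sigma^2),
# and the per-dimension entropy is 0.5 * (log(2*pi*e) + log(sigma^2)),
# matching `self.entropy` above. The graph samples actions with the
# reparameterization trick (mu + sigma * epsilon), so the caller feeds
# standard-normal noise into the `epsilon` placeholder at sample time.
def _gaussian_log_prob_sketch(action_pre, mu, log_sigma_sq):
    sigma_sq = np.exp(log_sigma_sq)
    return (
        -0.5 * np.square(action_pre - mu) / sigma_sq
        - 0.5 * np.log(2.0 * np.pi)
        - 0.5 * log_sigma_sq
    )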