Example 1
    def create_gradient_magnitude(self) -> tf.Tensor:
        """
        Gradient penalty from https://arxiv.org/pdf/1704.00028. Adds stability esp.
        for off-policy. Compute gradients w.r.t randomly interpolated input.
        """
        expert = [self.encoded_expert, self.expert_action, self.done_expert]
        policy = [
            self.encoded_policy,
            self.policy_model.selected_actions,
            self.done_policy,
        ]
        interp = []
        for _expert_in, _policy_in in zip(expert, policy):
            alpha = tf.random_uniform(tf.shape(_expert_in))
            interp.append(alpha * _expert_in + (1 - alpha) * _policy_in)

        grad_estimate, _, grad_input = self.create_encoder(
            interp[0], interp[1], interp[2], reuse=True
        )

        grad = tf.gradients(grad_estimate, [grad_input])[0]

        # Norm's gradient could be NaN at 0. Use our own safe_norm
        safe_norm = tf.sqrt(tf.reduce_sum(grad ** 2, axis=-1) + EPSILON)
        gradient_mag = tf.reduce_mean(tf.pow(safe_norm - 1, 2))

        return gradient_mag
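For reference, the returned term is the two-sided WGAN-GP penalty, E[(||grad D(x_interp)|| - 1)^2], evaluated at random interpolations between expert and policy samples. Below is a minimal, self-contained TF 1.x sketch of the same computation on a hypothetical linear "discriminator"; all names and values here are made up for illustration and are not part of the original code.

import tensorflow as tf  # TF 1.x, matching the snippets in this section

EPSILON = 1e-7  # small constant, assumed to match the EPSILON used above

# Hypothetical expert/policy batches and a linear "discriminator" d(x) = x @ w.
x_expert = tf.constant([[1.0], [2.0]])
x_policy = tf.constant([[0.0], [0.5]])
w = tf.constant([[3.0]])

# Random interpolation between expert and policy samples.
alpha = tf.random_uniform(tf.shape(x_expert))
x_interp = alpha * x_expert + (1 - alpha) * x_policy

# Gradient of the discriminator output w.r.t. the interpolated input.
d_interp = tf.matmul(x_interp, w)
grad = tf.gradients(d_interp, [x_interp])[0]

# Same safe-norm trick as above; the penalty pulls the gradient norm toward 1.
safe_norm = tf.sqrt(tf.reduce_sum(grad ** 2, axis=-1) + EPSILON)
gradient_mag = tf.reduce_mean(tf.pow(safe_norm - 1, 2))

with tf.Session() as sess:
    print(sess.run(gradient_mag))  # approximately (3 - 1)^2 = 4 for this linear d

In a full GAIL setup this term would typically be added to the discriminator loss with a weighting coefficient; the exact wiring depends on the surrounding trainer code and is not shown here.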
Example 2
 def normalize_vector_obs(self, vector_obs):
     """Normalize vector_obs using the running mean and variance, clipped to [-5, 5]."""
     normalized_state = tf.clip_by_value(
         (vector_obs - self.running_mean) /
         tf.sqrt(self.running_variance /
                 (tf.cast(self.normalization_steps, tf.float32) + 1)),
         -5,
         5,
         name="normalized_state",
     )
     return normalized_state
Example 3
 def normalize_vector_obs(
     vector_obs: tf.Tensor,
     running_mean: tf.Tensor,
     running_variance: tf.Tensor,
     normalization_steps: tf.Tensor,
 ) -> tf.Tensor:
     """
     Create a normalized version of an input tensor.
     :param vector_obs: Input vector observation tensor.
     :param running_mean: Tensorflow tensor representing the current running mean.
     :param running_variance: Tensorflow tensor representing the current running variance.
     :param normalization_steps: Tensorflow tensor representing the current number of normalization_steps.
     :return: A normalized version of vector_obs.
     """
     normalized_state = tf.clip_by_value(
         (vector_obs - running_mean) /
         tf.sqrt(running_variance /
                 (tf.cast(normalization_steps, tf.float32) + 1)),
         -5,
         5,
         name="normalized_state",
     )
     return normalized_state
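A minimal usage sketch of the standalone function above (TF 1.x), assuming the function from Example 3 is in scope; the running statistics below are purely illustrative.

import tensorflow as tf  # TF 1.x

vector_obs = tf.constant([[10.0, -2.0]])
running_mean = tf.constant([9.0, 0.0])
running_variance = tf.constant([4.0, 1.0])
normalization_steps = tf.constant(3, dtype=tf.int32)

normalized = normalize_vector_obs(
    vector_obs, running_mean, running_variance, normalization_steps
)

with tf.Session() as sess:
    # (obs - mean) / sqrt(variance / (steps + 1)), clipped to [-5, 5]
    print(sess.run(normalized))  # [[ 1. -4.]]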
Example 4
    def create_cc_actor_critic(self, h_size: int, num_layers: int,
                               vis_encode_type: EncoderType) -> None:
        """
        Creates Continuous control actor-critic model.
        :param h_size: Size of hidden linear layers.
        :param num_layers: Number of hidden linear layers.
        """
        hidden_streams = self.create_observation_streams(
            2, h_size, num_layers, vis_encode_type)

        if self.use_recurrent:
            self.memory_in = tf.placeholder(shape=[None, self.m_size],
                                            dtype=tf.float32,
                                            name="recurrent_in")
            _half_point = int(self.m_size / 2)
            hidden_policy, memory_policy_out = self.create_recurrent_encoder(
                hidden_streams[0],
                self.memory_in[:, :_half_point],
                self.sequence_length,
                name="lstm_policy",
            )

            hidden_value, memory_value_out = self.create_recurrent_encoder(
                hidden_streams[1],
                self.memory_in[:, _half_point:],
                self.sequence_length,
                name="lstm_value",
            )
            self.memory_out = tf.concat([memory_policy_out, memory_value_out],
                                        axis=1,
                                        name="recurrent_out")
        else:
            hidden_policy = hidden_streams[0]
            hidden_value = hidden_streams[1]

        mu = tf.layers.dense(
            hidden_policy,
            self.act_size[0],
            activation=None,
            kernel_initializer=LearningModel.scaled_init(0.01),
            reuse=tf.AUTO_REUSE,
        )

        self.log_sigma_sq = tf.get_variable(
            "log_sigma_squared",
            [self.act_size[0]],
            dtype=tf.float32,
            initializer=tf.zeros_initializer(),
        )

        sigma_sq = tf.exp(self.log_sigma_sq)

        self.epsilon = tf.placeholder(shape=[None, self.act_size[0]],
                                      dtype=tf.float32,
                                      name="epsilon")
        # Clip and scale output to ensure actions are always within [-1, 1] range.
        self.output_pre = mu + tf.sqrt(sigma_sq) * self.epsilon
        output_post = tf.clip_by_value(self.output_pre, -3, 3) / 3
        self.output = tf.identity(output_post, name="action")
        self.selected_actions = tf.stop_gradient(output_post)

        # Compute probability of model output.
        all_probs = (-0.5 * tf.square(tf.stop_gradient(self.output_pre) - mu) /
                     sigma_sq - 0.5 * tf.log(2.0 * np.pi) -
                     0.5 * self.log_sigma_sq)

        self.all_log_probs = tf.identity(all_probs, name="action_probs")

        self.entropy = 0.5 * tf.reduce_mean(
            tf.log(2 * np.pi * np.e) + self.log_sigma_sq)

        self.create_value_heads(self.stream_names, hidden_value)

        self.all_old_log_probs = tf.placeholder(shape=[None, self.act_size[0]],
                                                dtype=tf.float32,
                                                name="old_probabilities")

        # We keep the same tensor names, but use new nodes to keep the code parallel with discrete control.
        self.log_probs = tf.reduce_sum((tf.identity(self.all_log_probs)),
                                       axis=1,
                                       keepdims=True)
        self.old_log_probs = tf.reduce_sum(
            (tf.identity(self.all_old_log_probs)), axis=1, keepdims=True)