Exemple #1
0
    def create_normalizer_update(
        vector_input: tf.Tensor,
        steps: tf.Tensor,
        running_mean: tf.Tensor,
        running_variance: tf.Tensor,
    ) -> tf.Operation:
        """
        Creates the update operation for the normalizer.
        :param vector_input: Vector observation to use for updating the running mean and variance.
        :param running_mean: Tensorflow tensor representing the current running mean.
        :param running_variance: Tensorflow tensor representing the current running variance.
        :param steps: Tensorflow tensor representing the current number of steps that have been normalized.
        :return: A TF operation that updates the normalization based on vector_input.
        """
        # Based on Welford's algorithm for running mean and standard deviation, for batch updates. Discussion here:
        # https://stackoverflow.com/questions/56402955/whats-the-formula-for-welfords-algorithm-for-variance-std-with-batch-updates
        steps_increment = tf.shape(vector_input)[0]
        total_new_steps = tf.add(steps, steps_increment)

        # Compute the incremental update and divide by the number of new steps.
        input_to_old_mean = tf.subtract(vector_input, running_mean)
        new_mean = running_mean + tf.reduce_sum(
            input_to_old_mean / tf.cast(total_new_steps, dtype=tf.float32),
            axis=0)
        # Compute difference of input to the new mean for Welford update
        input_to_new_mean = tf.subtract(vector_input, new_mean)
        new_variance = running_variance + tf.reduce_sum(
            input_to_new_mean * input_to_old_mean, axis=0)
        update_mean = tf.assign(running_mean, new_mean)
        update_variance = tf.assign(running_variance, new_variance)
        update_norm_step = tf.assign(steps, total_new_steps)
        return tf.group([update_mean, update_variance, update_norm_step])
Exemple #2
0
 def create_forward_model(
     self, encoded_state: tf.Tensor, encoded_next_state: tf.Tensor
 ) -> None:
     """
     Creates forward model TensorFlow ops for Curiosity module.
     Predicts encoded future state based on encoded current state and given action.
     :param encoded_state: Tensor corresponding to encoded current state.
     :param encoded_next_state: Tensor corresponding to encoded next state.
     """
     combined_input = tf.concat(
         [encoded_state, self.policy.selected_actions], axis=1
     )
     hidden = tf.layers.dense(combined_input, 256, activation=ModelUtils.swish)
     pred_next_state = tf.layers.dense(
         hidden,
         self.encoding_size
         * (self.policy.vis_obs_size + int(self.policy.vec_obs_size > 0)),
         activation=None,
     )
     squared_difference = 0.5 * tf.reduce_sum(
         tf.squared_difference(pred_next_state, encoded_next_state), axis=1
     )
     self.intrinsic_reward = squared_difference
     self.forward_loss = tf.reduce_mean(
         tf.dynamic_partition(squared_difference, self.policy.mask, 2)[1]
     )
Exemple #3
0
    def create_gradient_magnitude(self) -> tf.Tensor:
        """
        Gradient penalty from https://arxiv.org/pdf/1704.00028. Adds stability esp.
        for off-policy. Compute gradients w.r.t randomly interpolated input.
        """
        expert = [self.encoded_expert, self.expert_action, self.done_expert]
        policy = [
            self.encoded_policy,
            self.policy_model.selected_actions,
            self.done_policy,
        ]
        interp = []
        for _expert_in, _policy_in in zip(expert, policy):
            alpha = tf.random_uniform(tf.shape(_expert_in))
            interp.append(alpha * _expert_in + (1 - alpha) * _policy_in)

        grad_estimate, _, grad_input = self.create_encoder(
            interp[0], interp[1], interp[2], reuse=True
        )

        grad = tf.gradients(grad_estimate, [grad_input])[0]

        # Norm's gradient could be NaN at 0. Use our own safe_norm
        safe_norm = tf.sqrt(tf.reduce_sum(grad ** 2, axis=-1) + EPSILON)
        gradient_mag = tf.reduce_mean(tf.pow(safe_norm - 1, 2))

        return gradient_mag
Exemple #4
0
    def _create_dc_critic(self, h_size: int, num_layers: int,
                          vis_encode_type: EncoderType) -> None:
        """
        Creates Discrete control critic (value) network.
        :param h_size: Size of hidden linear layers.
        :param num_layers: Number of hidden linear layers.
        :param vis_encode_type: The type of visual encoder to use.
        """
        hidden_stream = ModelUtils.create_observation_streams(
            self.policy.visual_in,
            self.policy.processed_vector_in,
            1,
            h_size,
            num_layers,
            vis_encode_type,
        )[0]

        if self.policy.use_recurrent:
            hidden_value, memory_value_out = ModelUtils.create_recurrent_encoder(
                hidden_stream,
                self.memory_in,
                self.policy.sequence_length_ph,
                name="lstm_value",
            )
            self.memory_out = memory_value_out
        else:
            hidden_value = hidden_stream

        self.value_heads, self.value = ModelUtils.create_value_heads(
            self.stream_names, hidden_value)

        self.all_old_log_probs = tf.placeholder(
            shape=[None, sum(self.policy.act_size)],
            dtype=tf.float32,
            name="old_probabilities",
        )
        _, _, old_normalized_logits = ModelUtils.create_discrete_action_masking_layer(
            self.all_old_log_probs, self.policy.action_masks,
            self.policy.act_size)

        action_idx = [0] + list(np.cumsum(self.policy.act_size))

        self.old_log_probs = tf.reduce_sum(
            (tf.stack(
                [
                    -tf.nn.softmax_cross_entropy_with_logits_v2(
                        labels=self.policy.
                        selected_actions[:, action_idx[i]:action_idx[i + 1]],
                        logits=old_normalized_logits[:, action_idx[i]:
                                                     action_idx[i + 1]],
                    ) for i in range(len(self.policy.act_size))
                ],
                axis=1,
            )),
            axis=1,
            keepdims=True,
        )
Exemple #5
0
    def create_normalizer_update(self, vector_input):
        # Based on Welford's algorithm for running mean and standard deviation, for batch updates. Discussion here:
        # https://stackoverflow.com/questions/56402955/whats-the-formula-for-welfords-algorithm-for-variance-std-with-batch-updates
        steps_increment = tf.shape(vector_input)[0]
        total_new_steps = tf.add(self.normalization_steps, steps_increment)

        # Compute the incremental update and divide by the number of new steps.
        input_to_old_mean = tf.subtract(vector_input, self.running_mean)
        new_mean = self.running_mean + tf.reduce_sum(
            input_to_old_mean / tf.cast(total_new_steps, dtype=tf.float32),
            axis=0)
        # Compute difference of input to the new mean for Welford update
        input_to_new_mean = tf.subtract(vector_input, new_mean)
        new_variance = self.running_variance + tf.reduce_sum(
            input_to_new_mean * input_to_old_mean, axis=0)
        update_mean = tf.assign(self.running_mean, new_mean)
        update_variance = tf.assign(self.running_variance, new_variance)
        update_norm_step = tf.assign(self.normalization_steps, total_new_steps)
        return tf.group([update_mean, update_variance, update_norm_step])
Exemple #6
0
 def create_inverse_model(self, encoded_state: tf.Tensor,
                          encoded_next_state: tf.Tensor) -> None:
     """
     Creates inverse model TensorFlow ops for Curiosity module.
     Predicts action taken given current and future encoded states.
     :param encoded_state: Tensor corresponding to encoded current state.
     :param encoded_next_state: Tensor corresponding to encoded next state.
     """
     combined_input = tf.concat([encoded_state, encoded_next_state], axis=1)
     hidden = tf.layers.dense(combined_input,
                              256,
                              activation=LearningModel.swish)
     if self.policy_model.brain.vector_action_space_type == "continuous":
         pred_action = tf.layers.dense(hidden,
                                       self.policy_model.act_size[0],
                                       activation=None)
         squared_difference = tf.reduce_sum(
             tf.squared_difference(pred_action,
                                   self.policy_model.selected_actions),
             axis=1,
         )
         self.inverse_loss = tf.reduce_mean(
             tf.dynamic_partition(squared_difference,
                                  self.policy_model.mask, 2)[1])
     else:
         pred_action = tf.concat(
             [
                 tf.layers.dense(hidden,
                                 self.policy_model.act_size[i],
                                 activation=tf.nn.softmax)
                 for i in range(len(self.policy_model.act_size))
             ],
             axis=1,
         )
         cross_entropy = tf.reduce_sum(
             -tf.log(pred_action + 1e-10) *
             self.policy_model.selected_actions,
             axis=1,
         )
         self.inverse_loss = tf.reduce_mean(
             tf.dynamic_partition(cross_entropy, self.policy_model.mask,
                                  2)[1])
Exemple #7
0
 def create_discrete_action_masking_layer(all_logits, action_masks,
                                          action_size):
     """
     Creates a masking layer for the discrete actions
     :param all_logits: The concatenated unnormalized action probabilities for all branches
     :param action_masks: The mask for the logits. Must be of dimension [None x total_number_of_action]
     :param action_size: A list containing the number of possible actions for each branch
     :return: The action output dimension [batch_size, num_branches], the concatenated
         normalized probs (after softmax)
     and the concatenated normalized log probs
     """
     action_idx = [0] + list(np.cumsum(action_size))
     branches_logits = [
         all_logits[:, action_idx[i]:action_idx[i + 1]]
         for i in range(len(action_size))
     ]
     branch_masks = [
         action_masks[:, action_idx[i]:action_idx[i + 1]]
         for i in range(len(action_size))
     ]
     raw_probs = [
         tf.multiply(
             tf.nn.softmax(branches_logits[k]) + EPSILON, branch_masks[k])
         for k in range(len(action_size))
     ]
     normalized_probs = [
         tf.divide(raw_probs[k],
                   tf.reduce_sum(raw_probs[k], axis=1, keepdims=True))
         for k in range(len(action_size))
     ]
     output = tf.concat(
         [
             tf.multinomial(tf.log(normalized_probs[k] + EPSILON), 1)
             for k in range(len(action_size))
         ],
         axis=1,
     )
     return (
         output,
         tf.concat([normalized_probs[k] for k in range(len(action_size))],
                   axis=1),
         tf.concat(
             [
                 tf.log(normalized_probs[k] + EPSILON)
                 for k in range(len(action_size))
             ],
             axis=1,
         ),
     )
Exemple #8
0
    def create_loss(self, learning_rate: float) -> None:
        """
        Creates the loss and update nodes for the GAIL reward generator
        :param learning_rate: The learning rate for the optimizer
        """
        self.mean_expert_estimate = tf.reduce_mean(self.expert_estimate)
        self.mean_policy_estimate = tf.reduce_mean(self.policy_estimate)

        if self.use_vail:
            self.beta = tf.get_variable(
                "gail_beta",
                [],
                trainable=False,
                dtype=tf.float32,
                initializer=tf.ones_initializer(),
            )

        self.discriminator_loss = -tf.reduce_mean(
            tf.log(self.expert_estimate + EPSILON)
            + tf.log(1.0 - self.policy_estimate + EPSILON)
        )

        if self.use_vail:
            # KL divergence loss (encourage latent representation to be normal)
            self.kl_loss = tf.reduce_mean(
                -tf.reduce_sum(
                    1
                    + self.z_log_sigma_sq
                    - 0.5 * tf.square(self.z_mean_expert)
                    - 0.5 * tf.square(self.z_mean_policy)
                    - tf.exp(self.z_log_sigma_sq),
                    1,
                )
            )
            self.loss = (
                self.beta * (self.kl_loss - self.mutual_information)
                + self.discriminator_loss
            )
        else:
            self.loss = self.discriminator_loss

        if self.gradient_penalty_weight > 0.0:
            self.loss += self.gradient_penalty_weight * self.create_gradient_magnitude()

        optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate)
        self.update_batch = optimizer.minimize(self.loss)
Exemple #9
0
 def create_discrete_action_masking_layer(
     branches_logits: List[tf.Tensor],
     action_masks: tf.Tensor,
     action_size: List[int],
 ) -> Tuple[tf.Tensor, tf.Tensor, tf.Tensor]:
     """
     Creates a masking layer for the discrete actions
     :param branches_logits: A List of the unnormalized action probabilities for each branch
     :param action_masks: The mask for the logits. Must be of dimension [None x total_number_of_action]
     :param action_size: A list containing the number of possible actions for each branch
     :return: The action output dimension [batch_size, num_branches], the concatenated
         normalized probs (after softmax)
     and the concatenated normalized log probs
     """
     branch_masks = ModelUtils.break_into_branches(action_masks,
                                                   action_size)
     raw_probs = [
         tf.multiply(
             tf.nn.softmax(branches_logits[k]) + EPSILON, branch_masks[k])
         for k in range(len(action_size))
     ]
     normalized_probs = [
         tf.divide(raw_probs[k],
                   tf.reduce_sum(raw_probs[k], axis=1, keepdims=True))
         for k in range(len(action_size))
     ]
     output = tf.concat(
         [
             tf.multinomial(tf.log(normalized_probs[k] + EPSILON), 1)
             for k in range(len(action_size))
         ],
         axis=1,
     )
     return (
         output,
         tf.concat([normalized_probs[k] for k in range(len(action_size))],
                   axis=1),
         tf.concat(
             [
                 tf.log(normalized_probs[k] + EPSILON)
                 for k in range(len(action_size))
             ],
             axis=1,
         ),
     )
Exemple #10
0
    def _create_entropy(self, all_log_probs: tf.Tensor, action_idx: List[int],
                        act_size: List[int]) -> tf.Tensor:
        entropy = tf.reduce_sum(
            (tf.stack(
                [
                    tf.nn.softmax_cross_entropy_with_logits_v2(
                        labels=tf.nn.softmax(
                            all_log_probs[:, action_idx[i]:action_idx[i + 1]]),
                        logits=all_log_probs[:,
                                             action_idx[i]:action_idx[i + 1]],
                    ) for i in range(len(act_size))
                ],
                axis=1,
            )),
            axis=1,
        )

        return entropy
Exemple #11
0
 def __init__(
     self,
     logits: tf.Tensor,
     act_size: List[int],
     reparameterize: bool = False,
     tanh_squash: bool = False,
     condition_sigma: bool = True,
     log_sigma_min: float = -20,
     log_sigma_max: float = 2,
 ):
     """
     A Gaussian output distribution for continuous actions.
     :param logits: Hidden layer to use as the input to the Gaussian distribution.
     :param act_size: List containing the number of continuous actions.
     :param reparameterize: Whether or not to use the reparameterization trick (block gradients through
         log probability calculation.)
     :param tanh_squash: Squash the output using tanh, constraining it between -1 and 1.
         From: Haarnoja et. al, https://arxiv.org/abs/1801.01290
     :param log_sigma_min: Minimum log standard deviation to clip by.
     :param log_sigma_max: Maximum log standard deviation to clip by.
     """
     encoded = self._create_mu_log_sigma(
         logits,
         act_size,
         log_sigma_min,
         log_sigma_max,
         condition_sigma=condition_sigma,
     )
     self._sampled_policy = self._create_sampled_policy(encoded)
     if not reparameterize:
         _sampled_policy_probs = tf.stop_gradient(self._sampled_policy)
     else:
         _sampled_policy_probs = self._sampled_policy
     self._all_probs = self._create_log_probs(_sampled_policy_probs,
                                              encoded)
     if tanh_squash:
         self._sampled_policy = tf.tanh(self._sampled_policy)
         self._all_probs = self._do_squash_correction_for_tanh(
             self._all_probs, self._sampled_policy)
     self._total_prob = tf.reduce_sum(self._all_probs,
                                      axis=1,
                                      keepdims=True)
     self._entropy = self._create_entropy(encoded)
Exemple #12
0
    def _create_cc_critic(
        self, h_size: int, num_layers: int, vis_encode_type: EncoderType
    ) -> None:
        """
        Creates Continuous control critic (value) network.
        :param h_size: Size of hidden linear layers.
        :param num_layers: Number of hidden linear layers.
        :param vis_encode_type: The type of visual encoder to use.
        """
        hidden_stream = ModelUtils.create_observation_streams(
            self.policy.visual_in,
            self.policy.processed_vector_in,
            1,
            h_size,
            num_layers,
            vis_encode_type,
        )[0]

        if self.policy.use_recurrent:
            hidden_value, memory_value_out = ModelUtils.create_recurrent_encoder(
                hidden_stream,
                self.memory_in,
                self.policy.sequence_length_ph,
                name="lstm_value",
            )
            self.memory_out = memory_value_out
        else:
            hidden_value = hidden_stream

        self.value_heads, self.value = ModelUtils.create_value_heads(
            self.stream_names, hidden_value
        )
        self.all_old_log_probs = tf.placeholder(
            shape=[None, sum(self.policy.act_size)],
            dtype=tf.float32,
            name="old_probabilities",
        )

        self.old_log_probs = tf.reduce_sum(
            (tf.identity(self.all_old_log_probs)), axis=1, keepdims=True
        )
Exemple #13
0
 def _get_log_probs(
     self,
     sample_onehot: tf.Tensor,
     all_log_probs: tf.Tensor,
     action_idx: List[int],
     act_size: List[int],
 ) -> tf.Tensor:
     log_probs = tf.reduce_sum(
         (tf.stack(
             [
                 -tf.nn.softmax_cross_entropy_with_logits_v2(
                     labels=sample_onehot[:,
                                          action_idx[i]:action_idx[i + 1]],
                     logits=all_log_probs[:,
                                          action_idx[i]:action_idx[i + 1]],
                 ) for i in range(len(act_size))
             ],
             axis=1,
         )),
         axis=1,
         keepdims=True,
     )
     return log_probs
Exemple #14
0
    def _create_losses(
        self,
        q1_streams: Dict[str, tf.Tensor],
        q2_streams: Dict[str, tf.Tensor],
        lr: tf.Tensor,
        max_step: int,
        stream_names: List[str],
        discrete: bool = False,
    ) -> None:
        """
        Creates training-specific Tensorflow ops for SAC models.
        :param q1_streams: Q1 streams from policy network
        :param q1_streams: Q2 streams from policy network
        :param lr: Learning rate
        :param max_step: Total number of training steps.
        :param stream_names: List of reward stream names.
        :param discrete: Whether or not to use discrete action losses.
        """

        if discrete:
            self.target_entropy = [
                self.discrete_target_entropy_scale *
                np.log(i).astype(np.float32) for i in self.act_size
            ]
            discrete_action_probs = tf.exp(self.policy.all_log_probs)
            per_action_entropy = discrete_action_probs * self.policy.all_log_probs
        else:
            self.target_entropy = (
                -1 * self.continuous_target_entropy_scale *
                np.prod(self.act_size[0]).astype(np.float32))

        self.rewards_holders = {}
        self.min_policy_qs = {}

        for name in stream_names:
            if discrete:
                _branched_mpq1 = ModelUtils.break_into_branches(
                    self.policy_network.q1_pheads[name] *
                    discrete_action_probs,
                    self.act_size,
                )
                branched_mpq1 = tf.stack([
                    tf.reduce_sum(_br, axis=1, keep_dims=True)
                    for _br in _branched_mpq1
                ])
                _q1_p_mean = tf.reduce_mean(branched_mpq1, axis=0)

                _branched_mpq2 = ModelUtils.break_into_branches(
                    self.policy_network.q2_pheads[name] *
                    discrete_action_probs,
                    self.act_size,
                )
                branched_mpq2 = tf.stack([
                    tf.reduce_sum(_br, axis=1, keep_dims=True)
                    for _br in _branched_mpq2
                ])
                _q2_p_mean = tf.reduce_mean(branched_mpq2, axis=0)

                self.min_policy_qs[name] = tf.minimum(_q1_p_mean, _q2_p_mean)
            else:
                self.min_policy_qs[name] = tf.minimum(
                    self.policy_network.q1_pheads[name],
                    self.policy_network.q2_pheads[name],
                )

            rewards_holder = tf.placeholder(shape=[None],
                                            dtype=tf.float32,
                                            name=f"{name}_rewards")
            self.rewards_holders[name] = rewards_holder

        q1_losses = []
        q2_losses = []
        # Multiple q losses per stream
        expanded_dones = tf.expand_dims(self.dones_holder, axis=-1)
        for i, name in enumerate(stream_names):
            _expanded_rewards = tf.expand_dims(self.rewards_holders[name],
                                               axis=-1)

            q_backup = tf.stop_gradient(
                _expanded_rewards +
                (1.0 - self.use_dones_in_backup[name] * expanded_dones) *
                self.gammas[i] * self.target_network.value_heads[name])

            if discrete:
                # We need to break up the Q functions by branch, and update them individually.
                branched_q1_stream = ModelUtils.break_into_branches(
                    self.policy.selected_actions * q1_streams[name],
                    self.act_size)
                branched_q2_stream = ModelUtils.break_into_branches(
                    self.policy.selected_actions * q2_streams[name],
                    self.act_size)

                # Reduce each branch into scalar
                branched_q1_stream = [
                    tf.reduce_sum(_branch, axis=1, keep_dims=True)
                    for _branch in branched_q1_stream
                ]
                branched_q2_stream = [
                    tf.reduce_sum(_branch, axis=1, keep_dims=True)
                    for _branch in branched_q2_stream
                ]

                q1_stream = tf.reduce_mean(branched_q1_stream, axis=0)
                q2_stream = tf.reduce_mean(branched_q2_stream, axis=0)

            else:
                q1_stream = q1_streams[name]
                q2_stream = q2_streams[name]

            _q1_loss = 0.5 * tf.reduce_mean(
                tf.to_float(self.policy.mask) *
                tf.squared_difference(q_backup, q1_stream))

            _q2_loss = 0.5 * tf.reduce_mean(
                tf.to_float(self.policy.mask) *
                tf.squared_difference(q_backup, q2_stream))

            q1_losses.append(_q1_loss)
            q2_losses.append(_q2_loss)

        self.q1_loss = tf.reduce_mean(q1_losses)
        self.q2_loss = tf.reduce_mean(q2_losses)

        # Learn entropy coefficient
        if discrete:
            # Create a log_ent_coef for each branch
            self.log_ent_coef = tf.get_variable(
                "log_ent_coef",
                dtype=tf.float32,
                initializer=np.log([self.init_entcoef] *
                                   len(self.act_size)).astype(np.float32),
                trainable=True,
            )
        else:
            self.log_ent_coef = tf.get_variable(
                "log_ent_coef",
                dtype=tf.float32,
                initializer=np.log(self.init_entcoef).astype(np.float32),
                trainable=True,
            )

        self.ent_coef = tf.exp(self.log_ent_coef)
        if discrete:
            # We also have to do a different entropy and target_entropy per branch.
            branched_per_action_ent = ModelUtils.break_into_branches(
                per_action_entropy, self.act_size)
            branched_ent_sums = tf.stack(
                [
                    tf.reduce_sum(_lp, axis=1, keep_dims=True) + _te for _lp,
                    _te in zip(branched_per_action_ent, self.target_entropy)
                ],
                axis=1,
            )
            self.entropy_loss = -tf.reduce_mean(
                tf.to_float(self.policy.mask) * tf.reduce_mean(
                    self.log_ent_coef *
                    tf.squeeze(tf.stop_gradient(branched_ent_sums), axis=2),
                    axis=1,
                ))

            # Same with policy loss, we have to do the loss per branch and average them,
            # so that larger branches don't get more weight.
            # The equivalent KL divergence from Eq 10 of Haarnoja et al. is also pi*log(pi) - Q
            branched_q_term = ModelUtils.break_into_branches(
                discrete_action_probs * self.policy_network.q1_p,
                self.act_size)

            branched_policy_loss = tf.stack([
                tf.reduce_sum(self.ent_coef[i] * _lp - _qt,
                              axis=1,
                              keep_dims=True)
                for i, (_lp, _qt) in enumerate(
                    zip(branched_per_action_ent, branched_q_term))
            ])
            self.policy_loss = tf.reduce_mean(
                tf.to_float(self.policy.mask) *
                tf.squeeze(branched_policy_loss))

            # Do vbackup entropy bonus per branch as well.
            branched_ent_bonus = tf.stack([
                tf.reduce_sum(self.ent_coef[i] * _lp, axis=1, keep_dims=True)
                for i, _lp in enumerate(branched_per_action_ent)
            ])
            value_losses = []
            for name in stream_names:
                v_backup = tf.stop_gradient(
                    self.min_policy_qs[name] -
                    tf.reduce_mean(branched_ent_bonus, axis=0))
                value_losses.append(0.5 * tf.reduce_mean(
                    tf.to_float(self.policy.mask) * tf.squared_difference(
                        self.policy_network.value_heads[name], v_backup)))

        else:
            self.entropy_loss = -tf.reduce_mean(
                self.log_ent_coef * tf.to_float(self.policy.mask) *
                tf.stop_gradient(
                    tf.reduce_sum(
                        self.policy.all_log_probs + self.target_entropy,
                        axis=1,
                        keep_dims=True,
                    )))
            batch_policy_loss = tf.reduce_mean(
                self.ent_coef * self.policy.all_log_probs -
                self.policy_network.q1_p,
                axis=1,
            )
            self.policy_loss = tf.reduce_mean(
                tf.to_float(self.policy.mask) * batch_policy_loss)

            value_losses = []
            for name in stream_names:
                v_backup = tf.stop_gradient(
                    self.min_policy_qs[name] - tf.reduce_sum(
                        self.ent_coef * self.policy.all_log_probs, axis=1))
                value_losses.append(0.5 * tf.reduce_mean(
                    tf.to_float(self.policy.mask) * tf.squared_difference(
                        self.policy_network.value_heads[name], v_backup)))
        self.value_loss = tf.reduce_mean(value_losses)

        self.total_value_loss = self.q1_loss + self.q2_loss + self.value_loss

        self.entropy = self.policy_network.entropy
Exemple #15
0
    def create_dc_actor_critic(self, h_size: int, num_layers: int,
                               vis_encode_type: EncoderType) -> None:
        """
        Creates Discrete control actor-critic model.
        :param h_size: Size of hidden linear layers.
        :param num_layers: Number of hidden linear layers.
        """
        hidden_streams = self.create_observation_streams(
            1, h_size, num_layers, vis_encode_type)
        hidden = hidden_streams[0]

        if self.use_recurrent:
            self.prev_action = tf.placeholder(shape=[None,
                                                     len(self.act_size)],
                                              dtype=tf.int32,
                                              name="prev_action")
            prev_action_oh = tf.concat(
                [
                    tf.one_hot(self.prev_action[:, i], self.act_size[i])
                    for i in range(len(self.act_size))
                ],
                axis=1,
            )
            hidden = tf.concat([hidden, prev_action_oh], axis=1)

            self.memory_in = tf.placeholder(shape=[None, self.m_size],
                                            dtype=tf.float32,
                                            name="recurrent_in")
            hidden, memory_out = self.create_recurrent_encoder(
                hidden, self.memory_in, self.sequence_length)
            self.memory_out = tf.identity(memory_out, name="recurrent_out")

        policy_branches = []
        for size in self.act_size:
            policy_branches.append(
                tf.layers.dense(
                    hidden,
                    size,
                    activation=None,
                    use_bias=False,
                    kernel_initializer=LearningModel.scaled_init(0.01),
                ))

        self.all_log_probs = tf.concat(policy_branches,
                                       axis=1,
                                       name="action_probs")

        self.action_masks = tf.placeholder(shape=[None,
                                                  sum(self.act_size)],
                                           dtype=tf.float32,
                                           name="action_masks")
        output, _, normalized_logits = self.create_discrete_action_masking_layer(
            self.all_log_probs, self.action_masks, self.act_size)

        self.output = tf.identity(output)
        self.normalized_logits = tf.identity(normalized_logits, name="action")

        self.create_value_heads(self.stream_names, hidden)

        self.action_holder = tf.placeholder(shape=[None,
                                                   len(policy_branches)],
                                            dtype=tf.int32,
                                            name="action_holder")
        self.action_oh = tf.concat(
            [
                tf.one_hot(self.action_holder[:, i], self.act_size[i])
                for i in range(len(self.act_size))
            ],
            axis=1,
        )
        self.selected_actions = tf.stop_gradient(self.action_oh)

        self.all_old_log_probs = tf.placeholder(
            shape=[None, sum(self.act_size)],
            dtype=tf.float32,
            name="old_probabilities")
        _, _, old_normalized_logits = self.create_discrete_action_masking_layer(
            self.all_old_log_probs, self.action_masks, self.act_size)

        action_idx = [0] + list(np.cumsum(self.act_size))

        self.entropy = tf.reduce_sum(
            (tf.stack(
                [
                    tf.nn.softmax_cross_entropy_with_logits_v2(
                        labels=tf.nn.softmax(
                            self.all_log_probs[:,
                                               action_idx[i]:action_idx[i +
                                                                        1]]),
                        logits=self.all_log_probs[:,
                                                  action_idx[i]:action_idx[i +
                                                                           1]],
                    ) for i in range(len(self.act_size))
                ],
                axis=1,
            )),
            axis=1,
        )

        self.log_probs = tf.reduce_sum(
            (tf.stack(
                [
                    -tf.nn.softmax_cross_entropy_with_logits_v2(
                        labels=self.action_oh[:,
                                              action_idx[i]:action_idx[i + 1]],
                        logits=normalized_logits[:,
                                                 action_idx[i]:action_idx[i +
                                                                          1]],
                    ) for i in range(len(self.act_size))
                ],
                axis=1,
            )),
            axis=1,
            keepdims=True,
        )
        self.old_log_probs = tf.reduce_sum(
            (tf.stack(
                [
                    -tf.nn.softmax_cross_entropy_with_logits_v2(
                        labels=self.action_oh[:,
                                              action_idx[i]:action_idx[i + 1]],
                        logits=old_normalized_logits[:, action_idx[i]:
                                                     action_idx[i + 1]],
                    ) for i in range(len(self.act_size))
                ],
                axis=1,
            )),
            axis=1,
            keepdims=True,
        )
Exemple #16
0
    def create_cc_actor_critic(self, h_size: int, num_layers: int,
                               vis_encode_type: EncoderType) -> None:
        """
        Creates Continuous control actor-critic model.
        :param h_size: Size of hidden linear layers.
        :param num_layers: Number of hidden linear layers.
        """
        hidden_streams = self.create_observation_streams(
            2, h_size, num_layers, vis_encode_type)

        if self.use_recurrent:
            self.memory_in = tf.placeholder(shape=[None, self.m_size],
                                            dtype=tf.float32,
                                            name="recurrent_in")
            _half_point = int(self.m_size / 2)
            hidden_policy, memory_policy_out = self.create_recurrent_encoder(
                hidden_streams[0],
                self.memory_in[:, :_half_point],
                self.sequence_length,
                name="lstm_policy",
            )

            hidden_value, memory_value_out = self.create_recurrent_encoder(
                hidden_streams[1],
                self.memory_in[:, _half_point:],
                self.sequence_length,
                name="lstm_value",
            )
            self.memory_out = tf.concat([memory_policy_out, memory_value_out],
                                        axis=1,
                                        name="recurrent_out")
        else:
            hidden_policy = hidden_streams[0]
            hidden_value = hidden_streams[1]

        mu = tf.layers.dense(
            hidden_policy,
            self.act_size[0],
            activation=None,
            kernel_initializer=LearningModel.scaled_init(0.01),
            reuse=tf.AUTO_REUSE,
        )

        self.log_sigma_sq = tf.get_variable(
            "log_sigma_squared",
            [self.act_size[0]],
            dtype=tf.float32,
            initializer=tf.zeros_initializer(),
        )

        sigma_sq = tf.exp(self.log_sigma_sq)

        self.epsilon = tf.placeholder(shape=[None, self.act_size[0]],
                                      dtype=tf.float32,
                                      name="epsilon")
        # Clip and scale output to ensure actions are always within [-1, 1] range.
        self.output_pre = mu + tf.sqrt(sigma_sq) * self.epsilon
        output_post = tf.clip_by_value(self.output_pre, -3, 3) / 3
        self.output = tf.identity(output_post, name="action")
        self.selected_actions = tf.stop_gradient(output_post)

        # Compute probability of model output.
        all_probs = (-0.5 * tf.square(tf.stop_gradient(self.output_pre) - mu) /
                     sigma_sq - 0.5 * tf.log(2.0 * np.pi) -
                     0.5 * self.log_sigma_sq)

        self.all_log_probs = tf.identity(all_probs, name="action_probs")

        self.entropy = 0.5 * tf.reduce_mean(
            tf.log(2 * np.pi * np.e) + self.log_sigma_sq)

        self.create_value_heads(self.stream_names, hidden_value)

        self.all_old_log_probs = tf.placeholder(shape=[None, self.act_size[0]],
                                                dtype=tf.float32,
                                                name="old_probabilities")

        # We keep these tensors the same name, but use new nodes to keep code parallelism with discrete control.
        self.log_probs = tf.reduce_sum((tf.identity(self.all_log_probs)),
                                       axis=1,
                                       keepdims=True)
        self.old_log_probs = tf.reduce_sum(
            (tf.identity(self.all_old_log_probs)), axis=1, keepdims=True)
Exemple #17
0
    def __init__(
        self,
        brain,
        h_size=128,
        lr=1e-4,
        n_layers=2,
        m_size=128,
        normalize=False,
        use_recurrent=False,
        seed=0,
    ):
        LearningModel.__init__(self, m_size, normalize, use_recurrent, brain, seed)
        num_streams = 1
        hidden_streams = self.create_observation_streams(num_streams, h_size, n_layers)
        hidden = hidden_streams[0]
        self.dropout_rate = tf.placeholder(
            dtype=tf.float32, shape=[], name="dropout_rate"
        )
        hidden_reg = tf.layers.dropout(hidden, self.dropout_rate)
        if self.use_recurrent:
            tf.Variable(
                self.m_size, name="memory_size", trainable=False, dtype=tf.int32
            )
            self.memory_in = tf.placeholder(
                shape=[None, self.m_size], dtype=tf.float32, name="recurrent_in"
            )
            hidden_reg, self.memory_out = self.create_recurrent_encoder(
                hidden_reg, self.memory_in, self.sequence_length
            )
            self.memory_out = tf.identity(self.memory_out, name="recurrent_out")

        if brain.vector_action_space_type == "discrete":
            policy_branches = []
            for size in self.act_size:
                policy_branches.append(
                    tf.layers.dense(
                        hidden_reg,
                        size,
                        activation=None,
                        use_bias=False,
                        kernel_initializer=tf.initializers.variance_scaling(0.01),
                    )
                )
            self.action_probs = tf.concat(
                [tf.nn.softmax(branch) for branch in policy_branches],
                axis=1,
                name="action_probs",
            )
            self.action_masks = tf.placeholder(
                shape=[None, sum(self.act_size)], dtype=tf.float32, name="action_masks"
            )
            self.sample_action_float, _, normalized_logits = self.create_discrete_action_masking_layer(
                tf.concat(policy_branches, axis=1), self.action_masks, self.act_size
            )
            tf.identity(normalized_logits, name="action")
            self.sample_action = tf.cast(self.sample_action_float, tf.int32)
            self.true_action = tf.placeholder(
                shape=[None, len(policy_branches)],
                dtype=tf.int32,
                name="teacher_action",
            )
            self.action_oh = tf.concat(
                [
                    tf.one_hot(self.true_action[:, i], self.act_size[i])
                    for i in range(len(self.act_size))
                ],
                axis=1,
            )
            self.loss = tf.reduce_sum(
                -tf.log(self.action_probs + 1e-10) * self.action_oh
            )
            self.action_percent = tf.reduce_mean(
                tf.cast(
                    tf.equal(
                        tf.cast(tf.argmax(self.action_probs, axis=1), tf.int32),
                        self.sample_action,
                    ),
                    tf.float32,
                )
            )
        else:
            self.policy = tf.layers.dense(
                hidden_reg,
                self.act_size[0],
                activation=None,
                use_bias=False,
                name="pre_action",
                kernel_initializer=tf.initializers.variance_scaling(0.01),
            )
            self.clipped_sample_action = tf.clip_by_value(self.policy, -1, 1)
            self.sample_action = tf.identity(self.clipped_sample_action, name="action")
            self.true_action = tf.placeholder(
                shape=[None, self.act_size[0]], dtype=tf.float32, name="teacher_action"
            )
            self.clipped_true_action = tf.clip_by_value(self.true_action, -1, 1)
            self.loss = tf.reduce_sum(
                tf.squared_difference(self.clipped_true_action, self.sample_action)
            )

        optimizer = tf.train.AdamOptimizer(learning_rate=lr)
        self.update = optimizer.minimize(self.loss)
Exemple #18
0
    def create_dc_actor(self, hidden_policy, scope):
        """
        Creates Discrete control actor for SAC.
        :param hidden_policy: Output of feature extractor (i.e. the input for vector obs, output of CNN for visual obs).
        :param num_layers: TF scope to assign whatever is created in this block.
        """
        scope = self.join_scopes(scope, "policy")

        # Create inputs outside of the scope
        self.action_masks = tf.placeholder(shape=[None,
                                                  sum(self.act_size)],
                                           dtype=tf.float32,
                                           name="action_masks")

        if self.use_recurrent:
            self.prev_action = tf.placeholder(shape=[None,
                                                     len(self.act_size)],
                                              dtype=tf.int32,
                                              name="prev_action")

        with tf.variable_scope(scope):
            hidden_policy = self.create_vector_observation_encoder(
                hidden_policy,
                self.h_size,
                self.activ_fn,
                self.num_layers,
                "encoder",
                False,
            )
        if self.use_recurrent:
            prev_action_oh = tf.concat(
                [
                    tf.one_hot(self.prev_action[:, i], self.act_size[i])
                    for i in range(len(self.act_size))
                ],
                axis=1,
            )
            hidden_policy = tf.concat([hidden_policy, prev_action_oh], axis=1)

            hidden_policy, memory_out = self.create_recurrent_encoder(
                hidden_policy,
                self.policy_memory_in,
                self.sequence_length,
                name="lstm_policy",
            )
            self.policy_memory_out = memory_out
        with tf.variable_scope(scope):
            policy_branches = []
            for size in self.act_size:
                policy_branches.append(
                    tf.layers.dense(
                        hidden_policy,
                        size,
                        activation=None,
                        use_bias=False,
                        kernel_initializer=tf.initializers.variance_scaling(
                            0.01),
                    ))
            all_logits = tf.concat(policy_branches,
                                   axis=1,
                                   name="action_probs")

            output, normalized_probs, normalized_logprobs = self.create_discrete_action_masking_layer(
                all_logits, self.action_masks, self.act_size)

            self.action_probs = normalized_probs

            # Really, this is entropy, but it has an analogous purpose to the log probs in the
            # continuous case.
            self.all_log_probs = self.action_probs * normalized_logprobs
            self.output = output

            # Create action input (discrete)
            self.action_holder = tf.placeholder(
                shape=[None, len(policy_branches)],
                dtype=tf.int32,
                name="action_holder")

            self.output_oh = tf.concat(
                [
                    tf.one_hot(self.action_holder[:, i], self.act_size[i])
                    for i in range(len(self.act_size))
                ],
                axis=1,
            )

            # For Curiosity and GAIL to retrieve selected actions. We don't
            # need the mask at this point because it's already stored in the buffer.
            self.selected_actions = tf.stop_gradient(self.output_oh)

            self.external_action_in = tf.concat(
                [
                    tf.one_hot(self.action_holder[:, i], self.act_size[i])
                    for i in range(len(self.act_size))
                ],
                axis=1,
            )

            # This is total entropy over all branches
            self.entropy = -1 * tf.reduce_sum(self.all_log_probs, axis=1)

        # Extract the normalized logprobs for Barracuda
        self.normalized_logprobs = tf.identity(normalized_logprobs,
                                               name="action")

        # We kept the LSTMs at a different scope than the rest, so add them if they exist.
        self.policy_vars = self.get_vars(scope)
        if self.use_recurrent:
            self.policy_vars += self.get_vars("lstm")
Exemple #19
0
    def create_cc_actor(self, hidden_policy, scope):
        """
        Creates Continuous control actor for SAC.
        :param hidden_policy: Output of feature extractor (i.e. the input for vector obs, output of CNN for visual obs).
        :param num_layers: TF scope to assign whatever is created in this block.
        """
        # Create action input (continuous)
        self.action_holder = tf.placeholder(shape=[None, self.act_size[0]],
                                            dtype=tf.float32,
                                            name="action_holder")
        self.external_action_in = self.action_holder

        scope = self.join_scopes(scope, "policy")

        with tf.variable_scope(scope):
            hidden_policy = self.create_vector_observation_encoder(
                hidden_policy,
                self.h_size,
                self.activ_fn,
                self.num_layers,
                "encoder",
                False,
            )
        if self.use_recurrent:
            hidden_policy, memory_out = self.create_recurrent_encoder(
                hidden_policy,
                self.policy_memory_in,
                self.sequence_length,
                name="lstm_policy",
            )
            self.policy_memory_out = memory_out
        with tf.variable_scope(scope):
            mu = tf.layers.dense(
                hidden_policy,
                self.act_size[0],
                activation=None,
                name="mu",
                kernel_initializer=LearningModel.scaled_init(0.01),
            )

            # Policy-dependent log_sigma_sq
            log_sigma_sq = tf.layers.dense(
                hidden_policy,
                self.act_size[0],
                activation=None,
                name="log_std",
                kernel_initializer=LearningModel.scaled_init(0.01),
            )

            self.log_sigma_sq = tf.clip_by_value(log_sigma_sq, LOG_STD_MIN,
                                                 LOG_STD_MAX)

            sigma_sq = tf.exp(self.log_sigma_sq)

            # Do the reparameterization trick
            policy_ = mu + tf.random_normal(tf.shape(mu)) * sigma_sq

            _gauss_pre = -0.5 * (((policy_ - mu) /
                                  (tf.exp(self.log_sigma_sq) + EPSILON))**2 +
                                 2 * self.log_sigma_sq + np.log(2 * np.pi))

            all_probs = tf.reduce_sum(_gauss_pre, axis=1, keepdims=True)

            self.entropy = tf.reduce_sum(self.log_sigma_sq +
                                         0.5 * np.log(2.0 * np.pi * np.e),
                                         axis=-1)

            # Squash probabilities
            # Keep deterministic around in case we want to use it.
            self.deterministic_output = tf.tanh(mu)

            # Note that this is just for symmetry with PPO.
            self.output_pre = tf.tanh(policy_)

            # Squash correction
            all_probs -= tf.reduce_sum(tf.log(1 - self.output_pre**2 +
                                              EPSILON),
                                       axis=1,
                                       keepdims=True)

            self.all_log_probs = all_probs
            self.selected_actions = tf.stop_gradient(self.output_pre)

            self.action_probs = all_probs

        # Extract output for Barracuda
        self.output = tf.identity(self.output_pre, name="action")

        # Get all policy vars
        self.policy_vars = self.get_vars(scope)
Exemple #20
0
    def _create_dc_actor(self, encoded: tf.Tensor) -> None:
        """
        Creates Discrete control actor-critic model.
        :param h_size: Size of hidden linear layers.
        :param num_layers: Number of hidden linear layers.
        :param vis_encode_type: Type of visual encoder to use if visual input.
        """
        if self.use_recurrent:
            self.prev_action = tf.placeholder(shape=[None,
                                                     len(self.act_size)],
                                              dtype=tf.int32,
                                              name="prev_action")
            prev_action_oh = tf.concat(
                [
                    tf.one_hot(self.prev_action[:, i], self.act_size[i])
                    for i in range(len(self.act_size))
                ],
                axis=1,
            )
            hidden_policy = tf.concat([encoded, prev_action_oh], axis=1)

            self.memory_in = tf.placeholder(shape=[None, self.m_size],
                                            dtype=tf.float32,
                                            name="recurrent_in")
            hidden_policy, memory_policy_out = ModelUtils.create_recurrent_encoder(
                hidden_policy,
                self.memory_in,
                self.sequence_length_ph,
                name="lstm_policy",
            )

            self.memory_out = tf.identity(memory_policy_out, "recurrent_out")
        else:
            hidden_policy = encoded

        policy_branches = []
        with tf.variable_scope("policy"):
            for size in self.act_size:
                policy_branches.append(
                    tf.layers.dense(
                        hidden_policy,
                        size,
                        activation=None,
                        use_bias=False,
                        kernel_initializer=ModelUtils.scaled_init(0.01),
                    ))

        raw_log_probs = tf.concat(policy_branches, axis=1, name="action_probs")

        self.action_masks = tf.placeholder(shape=[None,
                                                  sum(self.act_size)],
                                           dtype=tf.float32,
                                           name="action_masks")
        output, self.action_probs, normalized_logits = ModelUtils.create_discrete_action_masking_layer(
            raw_log_probs, self.action_masks, self.act_size)

        self.output = tf.identity(output)
        self.all_log_probs = tf.identity(normalized_logits, name="action")

        self.action_holder = tf.placeholder(shape=[None,
                                                   len(policy_branches)],
                                            dtype=tf.int32,
                                            name="action_holder")
        self.action_oh = tf.concat(
            [
                tf.one_hot(self.action_holder[:, i], self.act_size[i])
                for i in range(len(self.act_size))
            ],
            axis=1,
        )
        self.selected_actions = tf.stop_gradient(self.action_oh)

        action_idx = [0] + list(np.cumsum(self.act_size))

        self.entropy = tf.reduce_sum(
            (tf.stack(
                [
                    tf.nn.softmax_cross_entropy_with_logits_v2(
                        labels=tf.nn.softmax(
                            self.all_log_probs[:,
                                               action_idx[i]:action_idx[i +
                                                                        1]]),
                        logits=self.all_log_probs[:,
                                                  action_idx[i]:action_idx[i +
                                                                           1]],
                    ) for i in range(len(self.act_size))
                ],
                axis=1,
            )),
            axis=1,
        )

        self.log_probs = tf.reduce_sum(
            (tf.stack(
                [
                    -tf.nn.softmax_cross_entropy_with_logits_v2(
                        labels=self.action_oh[:,
                                              action_idx[i]:action_idx[i + 1]],
                        logits=normalized_logits[:,
                                                 action_idx[i]:action_idx[i +
                                                                          1]],
                    ) for i in range(len(self.act_size))
                ],
                axis=1,
            )),
            axis=1,
            keepdims=True,
        )
Exemple #21
0
    def _create_losses(self, probs, old_probs, value_heads, entropy, beta,
                       epsilon, lr, max_step):
        """
        Creates training-specific Tensorflow ops for PPO models.
        :param probs: Current policy probabilities
        :param old_probs: Past policy probabilities
        :param value_heads: Value estimate tensors from each value stream
        :param beta: Entropy regularization strength
        :param entropy: Current policy entropy
        :param epsilon: Value for policy-divergence threshold
        :param lr: Learning rate
        :param max_step: Total number of training steps.
        """
        self.returns_holders = {}
        self.old_values = {}
        for name in value_heads.keys():
            returns_holder = tf.placeholder(shape=[None],
                                            dtype=tf.float32,
                                            name="{}_returns".format(name))
            old_value = tf.placeholder(shape=[None],
                                       dtype=tf.float32,
                                       name="{}_value_estimate".format(name))
            self.returns_holders[name] = returns_holder
            self.old_values[name] = old_value
        self.advantage = tf.placeholder(shape=[None],
                                        dtype=tf.float32,
                                        name="advantages")
        advantage = tf.expand_dims(self.advantage, -1)

        decay_epsilon = tf.train.polynomial_decay(epsilon,
                                                  self.policy.global_step,
                                                  max_step,
                                                  0.1,
                                                  power=1.0)
        decay_beta = tf.train.polynomial_decay(beta,
                                               self.policy.global_step,
                                               max_step,
                                               1e-5,
                                               power=1.0)

        value_losses = []
        for name, head in value_heads.items():
            clipped_value_estimate = self.old_values[name] + tf.clip_by_value(
                tf.reduce_sum(head, axis=1) - self.old_values[name],
                -decay_epsilon,
                decay_epsilon,
            )
            v_opt_a = tf.squared_difference(self.returns_holders[name],
                                            tf.reduce_sum(head, axis=1))
            v_opt_b = tf.squared_difference(self.returns_holders[name],
                                            clipped_value_estimate)
            value_loss = tf.reduce_mean(
                tf.dynamic_partition(tf.maximum(v_opt_a, v_opt_b),
                                     self.policy.mask, 2)[1])
            value_losses.append(value_loss)
        self.value_loss = tf.reduce_mean(value_losses)

        r_theta = tf.exp(probs - old_probs)
        p_opt_a = r_theta * advantage
        p_opt_b = (tf.clip_by_value(r_theta, 1.0 - decay_epsilon,
                                    1.0 + decay_epsilon) * advantage)
        self.policy_loss = -tf.reduce_mean(
            tf.dynamic_partition(tf.minimum(p_opt_a, p_opt_b),
                                 self.policy.mask, 2)[1])
        # For cleaner stats reporting
        self.abs_policy_loss = tf.abs(self.policy_loss)

        self.loss = (
            self.policy_loss + 0.5 * self.value_loss -
            decay_beta * tf.reduce_mean(
                tf.dynamic_partition(entropy, self.policy.mask, 2)[1]))
Exemple #22
0
    def _create_cc_actor(
        self,
        encoded: tf.Tensor,
        tanh_squash: bool = False,
        reparameterize: bool = False,
        condition_sigma_on_obs: bool = True,
    ) -> None:
        """
        Creates Continuous control actor-critic model.
        :param h_size: Size of hidden linear layers.
        :param num_layers: Number of hidden linear layers.
        :param vis_encode_type: Type of visual encoder to use if visual input.
        :param tanh_squash: Whether to use a tanh function, or a clipped output.
        :param reparameterize: Whether we are using the resampling trick to update the policy.
        """
        if self.use_recurrent:
            self.memory_in = tf.placeholder(shape=[None, self.m_size],
                                            dtype=tf.float32,
                                            name="recurrent_in")
            hidden_policy, memory_policy_out = ModelUtils.create_recurrent_encoder(
                encoded,
                self.memory_in,
                self.sequence_length_ph,
                name="lstm_policy")

            self.memory_out = tf.identity(memory_policy_out,
                                          name="recurrent_out")
        else:
            hidden_policy = encoded

        with tf.variable_scope("policy"):
            mu = tf.layers.dense(
                hidden_policy,
                self.act_size[0],
                activation=None,
                name="mu",
                kernel_initializer=ModelUtils.scaled_init(0.01),
                reuse=tf.AUTO_REUSE,
            )

            # Policy-dependent log_sigma
            if condition_sigma_on_obs:
                log_sigma = tf.layers.dense(
                    hidden_policy,
                    self.act_size[0],
                    activation=None,
                    name="log_sigma",
                    kernel_initializer=ModelUtils.scaled_init(0.01),
                )
            else:
                log_sigma = tf.get_variable(
                    "log_sigma",
                    [self.act_size[0]],
                    dtype=tf.float32,
                    initializer=tf.zeros_initializer(),
                )
            log_sigma = tf.clip_by_value(log_sigma, self.log_std_min,
                                         self.log_std_max)

            sigma = tf.exp(log_sigma)

            epsilon = tf.random_normal(tf.shape(mu))

            sampled_policy = mu + sigma * epsilon

            # Stop gradient if we're not doing the resampling trick
            if not reparameterize:
                sampled_policy_probs = tf.stop_gradient(sampled_policy)
            else:
                sampled_policy_probs = sampled_policy

            # Compute probability of model output.
            _gauss_pre = -0.5 * (
                ((sampled_policy_probs - mu) /
                 (sigma + EPSILON))**2 + 2 * log_sigma + np.log(2 * np.pi))
            all_probs = _gauss_pre
            all_probs = tf.reduce_sum(_gauss_pre, axis=1, keepdims=True)

        if tanh_squash:
            self.output_pre = tf.tanh(sampled_policy)

            # Squash correction
            all_probs -= tf.reduce_sum(tf.log(1 - self.output_pre**2 +
                                              EPSILON),
                                       axis=1,
                                       keepdims=True)
            self.output = tf.identity(self.output_pre, name="action")
        else:
            self.output_pre = sampled_policy
            # Clip and scale output to ensure actions are always within [-1, 1] range.
            output_post = tf.clip_by_value(self.output_pre, -3, 3) / 3
            self.output = tf.identity(output_post, name="action")

        self.selected_actions = tf.stop_gradient(self.output)

        self.all_log_probs = tf.identity(all_probs, name="action_probs")

        single_dim_entropy = 0.5 * tf.reduce_mean(
            tf.log(2 * np.pi * np.e) + 2 * log_sigma)
        # Make entropy the right shape
        self.entropy = tf.ones_like(tf.reshape(mu[:, 0],
                                               [-1])) * single_dim_entropy

        # We keep these tensors the same name, but use new nodes to keep code parallelism with discrete control.
        self.log_probs = tf.reduce_sum((tf.identity(self.all_log_probs)),
                                       axis=1,
                                       keepdims=True)

        self.action_holder = tf.placeholder(shape=[None, self.act_size[0]],
                                            dtype=tf.float32,
                                            name="action_holder")