Example #1
 def create_observation_ins(self, vis_encode_type, share_ac_cnn):
     """
     Creates the observation inputs, and a CNN if needed.
     :param vis_encode_type: Type of CNN encoder.
     :param share_ac_cnn: Whether or not to share the actor and critic CNNs.
     :return: A tuple of (hidden_policy, hidden_critic). We don't save it to self since they're used
     once and thrown away.
     """
     if share_ac_cnn:
         with tf.variable_scope(POLICY_SCOPE):
             hidden_streams = self.create_observation_streams(
                 1,
                 self.h_size,
                 0,
                 vis_encode_type=vis_encode_type,
                 stream_scopes=["critic/value/"],
             )
         hidden_policy = hidden_streams[0]
         hidden_critic = hidden_streams[0]
     else:
         with tf.variable_scope(POLICY_SCOPE):
             hidden_streams = self.create_observation_streams(
                 2,
                 self.h_size,
                 0,
                 vis_encode_type=vis_encode_type,
                 stream_scopes=["policy/", "critic/value/"],
             )
         hidden_policy = hidden_streams[0]
         hidden_critic = hidden_streams[1]
     return hidden_policy, hidden_critic
Example #2
    def create_q_heads(
        self,
        stream_names,
        hidden_input,
        num_layers,
        h_size,
        scope,
        reuse=False,
        num_outputs=1,
    ):
        """
        Creates two q heads for each reward signal in stream_names.
        Also creates the node corresponding to the mean of all the value heads in self.value.
        self.value_head is a dictionary of stream name to node containing the value estimator head for that signal.
        :param stream_names: The list of reward signal names
        :param hidden_input: The last layer of the Critic. The heads will consist of one dense hidden layer on top
        of the hidden input.
        :param num_layers: Number of hidden layers for Q network
        :param h_size: size of hidden layers for Q network
        :param scope: TF scope for Q network.
        :param reuse: Whether or not to reuse variables. Useful for creating Q of policy.
        :param num_outputs: Number of outputs of each Q function. If discrete, equal to number of actions.
        """
        with tf.variable_scope(self.join_scopes(scope, "q1_encoding"), reuse=reuse):
            q1_hidden = self.create_vector_observation_encoder(
                hidden_input, h_size, self.activ_fn, num_layers, "q1_encoder", reuse
            )
            if self.use_recurrent:
                q1_hidden, memory_out = self.create_recurrent_encoder(
                    q1_hidden, self.q1_memory_in, self.sequence_length, name="lstm_q1"
                )
                self.q1_memory_out = memory_out

            q1_heads = {}
            for name in stream_names:
                _q1 = tf.layers.dense(q1_hidden, num_outputs, name="{}_q1".format(name))
                q1_heads[name] = _q1

            q1 = tf.reduce_mean(list(q1_heads.values()), axis=0)
        with tf.variable_scope(self.join_scopes(scope, "q2_encoding"), reuse=reuse):
            q2_hidden = self.create_vector_observation_encoder(
                hidden_input, h_size, self.activ_fn, num_layers, "q2_encoder", reuse
            )
            if self.use_recurrent:
                q2_hidden, memory_out = self.create_recurrent_encoder(
                    q2_hidden, self.q2_memory_in, self.sequence_length, name="lstm_q2"
                )
                self.q2_memory_out = memory_out

            q2_heads = {}
            for name in stream_names:
                _q2 = tf.layers.dense(q2_hidden, num_outputs, name="{}_q2".format(name))
                q2_heads[name] = _q2

            q2 = tf.reduce_mean(list(q2_heads.values()), axis=0)

        return q1_heads, q2_heads, q1, q2
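
For reference, a minimal NumPy sketch (batch and head sizes are made up, not from the source) of the averaging performed by tf.reduce_mean(list(q1_heads.values()), axis=0) above: the per-reward-signal heads are stacked along a new leading axis and averaged across the streams, not across the batch.

import numpy as np

batch, num_outputs = 4, 3                       # illustrative sizes
q1_heads = {
    "extrinsic": np.random.rand(batch, num_outputs),
    "curiosity": np.random.rand(batch, num_outputs),
}
# Stacking the head outputs gives shape [num_streams, batch, num_outputs];
# averaging over axis 0 collapses the reward streams back into one [batch, num_outputs] head.
q1 = np.mean(list(q1_heads.values()), axis=0)
assert q1.shape == (batch, num_outputs)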
Example #3
    def create_nature_cnn_visual_observation_encoder(
        image_input: tf.Tensor,
        h_size: int,
        activation: ActivationFunction,
        num_layers: int,
        scope: str,
        reuse: bool,
    ) -> tf.Tensor:
        """
        Builds a visual encoder using the three-convolution CNN from the Nature DQN architecture.
        :param image_input: The placeholder for the image input to use.
        :param h_size: Hidden layer size.
        :param activation: What type of activation function to use for layers.
        :param num_layers: number of hidden layers to create.
        :param scope: The scope of the graph within which to create the ops.
        :param reuse: Whether to re-use the weights within the same scope.
        :return: List of hidden layer tensors.
        """
        with tf.variable_scope(scope):
            conv1 = tf.layers.conv2d(
                image_input,
                32,
                kernel_size=[8, 8],
                strides=[4, 4],
                activation=tf.nn.elu,
                reuse=reuse,
                name="conv_1",
            )
            conv2 = tf.layers.conv2d(
                conv1,
                64,
                kernel_size=[4, 4],
                strides=[2, 2],
                activation=tf.nn.elu,
                reuse=reuse,
                name="conv_2",
            )
            conv3 = tf.layers.conv2d(
                conv2,
                64,
                kernel_size=[3, 3],
                strides=[1, 1],
                activation=tf.nn.elu,
                reuse=reuse,
                name="conv_3",
            )
            hidden = tf.layers.flatten(conv3)

        with tf.variable_scope(scope + "/" + "flat_encoding"):
            hidden_flat = ModelUtils.create_vector_observation_encoder(
                hidden, h_size, activation, num_layers, scope, reuse
            )
        return hidden_flat
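
As a rough sanity check of the layer shapes (assuming an 84x84 input, the resolution used by the original Nature DQN; the actual resolution depends on the Unity sensor), tf.layers.conv2d defaults to 'valid' padding, so each convolution shrinks the spatial size to floor((in - kernel) / stride) + 1:

def conv_out(size: int, kernel: int, stride: int) -> int:
    # Output size of a 'valid'-padded convolution along one spatial dimension.
    return (size - kernel) // stride + 1

size = 84                                          # assumed input resolution
for kernel, stride in [(8, 4), (4, 2), (3, 1)]:    # conv_1, conv_2, conv_3 above
    size = conv_out(size, kernel, stride)
print(size)  # 7, so the flattened tensor feeding the dense encoder has 7 * 7 * 64 = 3136 units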
Example #4
def test_ppo_model_dc_visual():
    tf.reset_default_graph()
    with tf.Session() as sess:
        with tf.variable_scope("FakeGraphScope"):
            model = PPOModel(
                make_brain_parameters(discrete_action=True, visual_inputs=2))
            init = tf.global_variables_initializer()
            sess.run(init)

            run_list = [
                model.output,
                model.all_log_probs,
                model.value,
                model.entropy,
                model.learning_rate,
            ]
            feed_dict = {
                model.batch_size: 2,
                model.sequence_length: 1,
                model.vector_in: np.array([[1, 2, 3, 1, 2, 3],
                                           [3, 4, 5, 3, 4, 5]]),
                model.visual_in[0]: np.ones([2, 40, 30, 3], dtype=np.float32),
                model.visual_in[1]: np.ones([2, 40, 30, 3], dtype=np.float32),
                model.action_masks: np.ones([2, 2], dtype=np.float32),
            }
            sess.run(run_list, feed_dict=feed_dict)
Example #5
    def create_reward_signals(self, reward_signal_configs):
        """
        Create reward signals
        :param reward_signal_configs: Reward signal config.
        """
        with self.graph.as_default():
            with tf.variable_scope(TOWER_SCOPE_NAME, reuse=tf.AUTO_REUSE):
                for device_id, device in enumerate(self.devices):
                    with tf.device(device):
                        reward_tower = {}
                        for reward_signal, config in reward_signal_configs.items(
                        ):
                            reward_tower[reward_signal] = create_reward_signal(
                                self, self.towers[device_id], reward_signal,
                                config)
                            for k, v in reward_tower[
                                    reward_signal].update_dict.items():
                                self.update_dict[k + "_" + str(device_id)] = v
                        self.reward_signal_towers.append(reward_tower)
                for _, reward_tower in self.reward_signal_towers[0].items():
                    for _, update_key in reward_tower.stats_name_to_update_name.items(
                    ):
                        all_reward_signal_stats = tf.stack([
                            self.update_dict[update_key + "_" + str(i)]
                            for i in range(len(self.towers))
                        ])
                        mean_reward_signal_stats = tf.reduce_mean(
                            all_reward_signal_stats, 0)
                        self.update_dict.update(
                            {update_key: mean_reward_signal_stats})

            self.reward_signals = self.reward_signal_towers[0]
Example #6
def test_ppo_model_cc_vector_rnn():
    tf.reset_default_graph()
    with tf.Session() as sess:
        with tf.variable_scope("FakeGraphScope"):
            memory_size = 128
            model = PPOModel(
                make_brain_parameters(discrete_action=False, visual_inputs=0),
                use_recurrent=True,
                m_size=memory_size,
            )
            init = tf.global_variables_initializer()
            sess.run(init)

            run_list = [
                model.output,
                model.all_log_probs,
                model.value,
                model.entropy,
                model.learning_rate,
                model.memory_out,
            ]
            feed_dict = {
                model.batch_size: 1,
                model.sequence_length: 2,
                model.memory_in: np.zeros((1, memory_size), dtype=np.float32),
                model.vector_in: np.array([[1, 2, 3, 1, 2, 3],
                                           [3, 4, 5, 3, 4, 5]]),
                model.epsilon: np.array([[0, 1]]),
            }
            sess.run(run_list, feed_dict=feed_dict)
Example #7
 def _create_encoder(
     self,
     visual_in: List[tf.Tensor],
     vector_in: tf.Tensor,
     h_size: int,
     num_layers: int,
     vis_encode_type: EncoderType,
 ) -> tf.Tensor:
     """
     Creates an encoder for visual and vector observations.
     :param h_size: Size of hidden linear layers.
     :param num_layers: Number of hidden linear layers.
     :param vis_encode_type: Type of visual encoder to use if visual input.
     :return: The hidden layer (tf.Tensor) after the encoder.
     """
     with tf.variable_scope("policy"):
         encoded = ModelUtils.create_observation_streams(
             self.visual_in,
             self.processed_vector_in,
             1,
             h_size,
             num_layers,
             vis_encode_type,
         )[0]
     return encoded
Example #8
    def create_recurrent_encoder(input_state,
                                 memory_in,
                                 sequence_length,
                                 name="lstm"):
        """
        Builds a recurrent encoder for either state or observations (LSTM).
        :param sequence_length: Length of sequence to unroll.
        :param input_state: The input tensor to the LSTM cell.
        :param memory_in: The input memory to the LSTM cell.
        :param name: The scope of the LSTM cell.
        """
        s_size = input_state.get_shape().as_list()[1]
        m_size = memory_in.get_shape().as_list()[1]
        lstm_input_state = tf.reshape(input_state,
                                      shape=[-1, sequence_length, s_size])
        memory_in = tf.reshape(memory_in[:, :], [-1, m_size])
        half_point = int(m_size / 2)
        with tf.variable_scope(name):
            rnn_cell = tf.nn.rnn_cell.BasicLSTMCell(half_point)
            lstm_vector_in = tf.nn.rnn_cell.LSTMStateTuple(
                memory_in[:, :half_point], memory_in[:, half_point:])
            recurrent_output, lstm_state_out = tf.nn.dynamic_rnn(
                rnn_cell, lstm_input_state, initial_state=lstm_vector_in)

        recurrent_output = tf.reshape(recurrent_output, shape=[-1, half_point])
        return recurrent_output, tf.concat(
            [lstm_state_out.c, lstm_state_out.h], axis=1)
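
A small shape sketch (with assumed sizes) of how the memory tensor is split and re-packed by this encoder: the first half of the m_size-wide memory is treated as the LSTM cell state, the second half as the hidden state, and the two are concatenated again on the way out.

import numpy as np

batch, sequence_length, s_size, m_size = 2, 3, 8, 16   # illustrative sizes
input_state = np.zeros((batch * sequence_length, s_size), dtype=np.float32)
memory_in = np.zeros((batch, m_size), dtype=np.float32)

half_point = m_size // 2
cell_state = memory_in[:, :half_point]        # LSTMStateTuple.c
hidden_state = memory_in[:, half_point:]      # LSTMStateTuple.h
assert cell_state.shape == hidden_state.shape == (batch, half_point)

# The encoder reshapes the input to [batch, sequence_length, s_size] for dynamic_rnn,
# returns its output flattened back to [batch * sequence_length, half_point], and emits
# memory_out = concat([c, h], axis=1), i.e. shape [batch, m_size] again.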
Example #9
 def create_vector_observation_encoder(
     observation_input: tf.Tensor,
     h_size: int,
     activation: ActivationFunction,
     num_layers: int,
     scope: str,
     reuse: bool,
 ) -> tf.Tensor:
     """
     Builds a set of hidden state encoders.
     :param reuse: Whether to re-use the weights within the same scope.
     :param scope: Graph scope for the encoder ops.
     :param observation_input: Input vector.
     :param h_size: Hidden layer size.
     :param activation: What type of activation function to use for layers.
     :param num_layers: number of hidden layers to create.
     :return: List of hidden layer tensors.
     """
     with tf.variable_scope(scope):
         hidden = observation_input
         for i in range(num_layers):
             hidden = tf.layers.dense(
                 hidden,
                 h_size,
                 activation=activation,
                 reuse=reuse,
                 name=f"hidden_{i}",
                 kernel_initializer=tf.initializers.variance_scaling(1.0),
             )
     return hidden
Example #10
 def create_sac_value_head(self, stream_names, hidden_input, num_layers,
                           h_size, scope):
     """
     Creates one value estimator head for each reward signal in stream_names.
     Also creates the node corresponding to the mean of all the value heads in self.value.
     self.value_head is a dictionary of stream name to node containing the value estimator head for that signal.
     :param stream_names: The list of reward signal names
     :param hidden_input: The last layer of the Critic. The heads will consist of one dense hidden layer on top
     of the hidden input.
     :param num_layers: Number of hidden layers for value network
     :param h_size: size of hidden layers for value network
     :param scope: TF scope for value network.
     """
     with tf.variable_scope(scope):
         value_hidden = self.create_vector_observation_encoder(
             hidden_input, h_size, self.activ_fn, num_layers, "encoder",
             False)
         if self.use_recurrent:
             value_hidden, memory_out = self.create_recurrent_encoder(
                 value_hidden,
                 self.value_memory_in,
                 self.sequence_length,
                 name="lstm_value",
             )
             self.value_memory_out = memory_out
         self.create_value_heads(stream_names, value_hidden)
Example #11
    def create_encoder(
        self, state_in: tf.Tensor, action_in: tf.Tensor, done_in: tf.Tensor, reuse: bool
    ) -> Tuple[tf.Tensor, tf.Tensor, tf.Tensor]:
        """
        Creates the encoder for the discriminator
        :param state_in: The encoded observation input
        :param action_in: The action input
        :param done_in: The done flags input
        :param reuse: If true, the weights will be shared with the previous encoder created
        """
        with tf.variable_scope("GAIL_model"):
            if self.use_actions:
                concat_input = tf.concat([state_in, action_in, done_in], axis=1)
            else:
                concat_input = state_in

            hidden_1 = tf.layers.dense(
                concat_input,
                self.h_size,
                activation=ModelUtils.swish,
                name="gail_d_hidden_1",
                reuse=reuse,
            )

            hidden_2 = tf.layers.dense(
                hidden_1,
                self.h_size,
                activation=ModelUtils.swish,
                name="gail_d_hidden_2",
                reuse=reuse,
            )

            z_mean = None
            if self.use_vail:
                # Latent representation
                z_mean = tf.layers.dense(
                    hidden_2,
                    self.z_size,
                    reuse=reuse,
                    name="gail_z_mean",
                    kernel_initializer=ModelUtils.scaled_init(0.01),
                )

                self.noise = tf.random_normal(tf.shape(z_mean), dtype=tf.float32)

                # Sampled latent code
                self.z = z_mean + self.z_sigma * self.noise * self.use_noise
                estimate_input = self.z
            else:
                estimate_input = hidden_2

            estimate = tf.layers.dense(
                estimate_input,
                1,
                activation=tf.nn.sigmoid,
                name="gail_d_estimate",
                reuse=reuse,
            )
            return estimate, z_mean, concat_input
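
For intuition, a minimal NumPy sketch (sizes are illustrative) of the VAIL sampling step above: the latent code is drawn with the reparameterization trick, z = z_mean + z_sigma * noise, and use_noise acts as a 0/1 switch so that evaluation can use the deterministic mean rather than a sample.

import numpy as np

batch, z_size = 4, 16                       # illustrative sizes
z_mean = np.random.randn(batch, z_size)
z_sigma = np.ones(z_size)                   # stand-in for the model's sigma parameter
noise = np.random.randn(batch, z_size)

use_noise = 1.0                             # training: sample a stochastic latent code
z_train = z_mean + z_sigma * noise * use_noise

use_noise = 0.0                             # evaluation: the noise term vanishes
z_eval = z_mean + z_sigma * noise * use_noise
assert np.allclose(z_eval, z_mean)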
Example #12
    def create_match3_visual_observation_encoder(
        image_input: tf.Tensor,
        h_size: int,
        activation: ActivationFunction,
        num_layers: int,
        scope: str,
        reuse: bool,
    ) -> tf.Tensor:
        """
        Builds a CNN with the architecture used by King for Candy Crush. Optimized
        for grid-shaped boards, such as with Match-3 games.
        :param image_input: The placeholder for the image input to use.
        :param h_size: Hidden layer size.
        :param activation: What type of activation function to use for layers.
        :param num_layers: number of hidden layers to create.
        :param scope: The scope of the graph within which to create the ops.
        :param reuse: Whether to re-use the weights within the same scope.
        :return: List of hidden layer tensors.
        """
        with tf.variable_scope(scope):
            conv1 = tf.layers.conv2d(
                image_input,
                35,
                kernel_size=[3, 3],
                strides=[1, 1],
                activation=tf.nn.elu,
                reuse=reuse,
                name="conv_1",
            )
            conv2 = tf.layers.conv2d(
                conv1,
                144,
                kernel_size=[3, 3],
                strides=[1, 1],
                activation=tf.nn.elu,
                reuse=reuse,
                name="conv_2",
            )
            hidden = tf.layers.flatten(conv2)

        with tf.variable_scope(scope + "/" + "flat_encoding"):
            hidden_flat = ModelUtils.create_vector_observation_encoder(
                hidden, h_size, activation, num_layers, scope, reuse)
        return hidden_flat
Example #13
    def _create_cc_actor(
        self,
        encoded: tf.Tensor,
        tanh_squash: bool = False,
        reparameterize: bool = False,
        condition_sigma_on_obs: bool = True,
    ) -> None:
        """
        Creates Continuous control actor-critic model.
        :param h_size: Size of hidden linear layers.
        :param num_layers: Number of hidden linear layers.
        :param vis_encode_type: Type of visual encoder to use if visual input.
        :param tanh_squash: Whether to use a tanh function, or a clipped output.
        :param reparameterize: Whether we are using the resampling trick to update the policy.
        """
        if self.use_recurrent:
            self.memory_in = tf.placeholder(shape=[None, self.m_size],
                                            dtype=tf.float32,
                                            name="recurrent_in")
            hidden_policy, memory_policy_out = ModelUtils.create_recurrent_encoder(
                encoded,
                self.memory_in,
                self.sequence_length_ph,
                name="lstm_policy")

            self.memory_out = tf.identity(memory_policy_out,
                                          name="recurrent_out")
        else:
            hidden_policy = encoded

        with tf.variable_scope("policy"):
            distribution = GaussianDistribution(
                hidden_policy,
                self.act_size,
                reparameterize=reparameterize,
                tanh_squash=tanh_squash,
                condition_sigma=condition_sigma_on_obs,
            )

        if tanh_squash:
            self.output_pre = distribution.sample
            self.output = tf.identity(self.output_pre, name="action")
        else:
            self.output_pre = distribution.sample
            # Clip and scale output to ensure actions are always within [-1, 1] range.
            output_post = tf.clip_by_value(self.output_pre, -3, 3) / 3
            self.output = tf.identity(output_post, name="action")

        self.selected_actions = tf.stop_gradient(self.output)

        self.all_log_probs = tf.identity(distribution.log_probs,
                                         name="action_probs")
        self.entropy = distribution.entropy

        # We keep these tensors the same name, but use new nodes to keep code parallelism with discrete control.
        self.total_log_probs = distribution.total_log_probs
Example #14
    def _create_dc_actor(self, encoded: tf.Tensor) -> None:
        """
        Creates Discrete control actor-critic model.
        :param h_size: Size of hidden linear layers.
        :param num_layers: Number of hidden linear layers.
        :param vis_encode_type: Type of visual encoder to use if visual input.
        """
        if self.use_recurrent:
            self.prev_action = tf.placeholder(shape=[None,
                                                     len(self.act_size)],
                                              dtype=tf.int32,
                                              name="prev_action")
            prev_action_oh = tf.concat(
                [
                    tf.one_hot(self.prev_action[:, i], self.act_size[i])
                    for i in range(len(self.act_size))
                ],
                axis=1,
            )
            hidden_policy = tf.concat([encoded, prev_action_oh], axis=1)

            self.memory_in = tf.placeholder(shape=[None, self.m_size],
                                            dtype=tf.float32,
                                            name="recurrent_in")
            hidden_policy, memory_policy_out = ModelUtils.create_recurrent_encoder(
                hidden_policy,
                self.memory_in,
                self.sequence_length_ph,
                name="lstm_policy",
            )

            self.memory_out = tf.identity(memory_policy_out, "recurrent_out")
        else:
            hidden_policy = encoded

        self.action_masks = tf.placeholder(shape=[None,
                                                  sum(self.act_size)],
                                           dtype=tf.float32,
                                           name="action_masks")

        with tf.variable_scope("policy"):
            distribution = MultiCategoricalDistribution(
                hidden_policy, self.act_size, self.action_masks)
        # It's important that we are able to feed_dict a value into this tensor to get the
        # right one-hot encoding, so we can't do identity on it.
        self.output = distribution.sample
        self.all_log_probs = tf.identity(distribution.log_probs, name="action")
        self.selected_actions = tf.stop_gradient(
            distribution.sample_onehot)  # In discrete, these are onehot
        self.entropy = distribution.entropy
        self.total_log_probs = distribution.total_log_probs
Example #15
 def __init__(
     self,
     brain,
     m_size=None,
     h_size=128,
     normalize=False,
     use_recurrent=False,
     num_layers=2,
     stream_names=None,
     seed=0,
     vis_encode_type=EncoderType.SIMPLE,
 ):
     super().__init__(
         brain,
         m_size,
         h_size,
         normalize,
         use_recurrent,
         num_layers,
         stream_names,
         seed,
         vis_encode_type,
     )
     if self.use_recurrent:
         self.memory_in = tf.placeholder(shape=[None, self.m_size],
                                         dtype=tf.float32,
                                         name="recurrent_in")
         self.value_memory_in = self.memory_in
     with tf.variable_scope(TARGET_SCOPE):
         hidden_streams = self.create_observation_streams(
             1,
             self.h_size,
             0,
             vis_encode_type=vis_encode_type,
             stream_scopes=["critic/value/"],
         )
     if brain.vector_action_space_type == "continuous":
         self.create_cc_critic(hidden_streams[0],
                               TARGET_SCOPE,
                               create_qs=False)
     else:
         self.create_dc_critic(hidden_streams[0],
                               TARGET_SCOPE,
                               create_qs=False)
     if self.use_recurrent:
         self.memory_out = tf.concat(self.value_memory_out,
                                     axis=1)  # Needed for Barracuda to work
Example #16
def test_sac_model_cc_vector():
    tf.reset_default_graph()
    with tf.Session() as sess:
        with tf.variable_scope("FakeGraphScope"):
            model = SACModel(
                make_brain_parameters(discrete_action=False, visual_inputs=0)
            )
            init = tf.global_variables_initializer()
            sess.run(init)

            run_list = [model.output, model.value, model.entropy, model.learning_rate]
            feed_dict = {
                model.batch_size: 2,
                model.sequence_length: 1,
                model.vector_in: np.array([[1, 2, 3, 1, 2, 3], [3, 4, 5, 3, 4, 5]]),
            }
            sess.run(run_list, feed_dict=feed_dict)
Example #17
def test_visual_cc_bc_model():
    tf.reset_default_graph()
    with tf.Session() as sess:
        with tf.variable_scope("FakeGraphScope"):
            model = BehavioralCloningModel(
                make_brain_parameters(discrete_action=False, visual_inputs=2))
            init = tf.global_variables_initializer()
            sess.run(init)

            run_list = [model.sample_action, model.policy]
            feed_dict = {
                model.batch_size: 2,
                model.sequence_length: 1,
                model.vector_in: np.array([[1, 2, 3, 1, 2, 3],
                                           [3, 4, 5, 3, 4, 5]]),
                model.visual_in[0]: np.ones([2, 40, 30, 3], dtype=np.float32),
                model.visual_in[1]: np.ones([2, 40, 30, 3], dtype=np.float32),
            }
            sess.run(run_list, feed_dict=feed_dict)
Example #18
def test_dc_bc_model():
    tf.reset_default_graph()
    with tf.Session() as sess:
        with tf.variable_scope("FakeGraphScope"):
            model = BehavioralCloningModel(
                make_brain_parameters(discrete_action=True, visual_inputs=0))
            init = tf.global_variables_initializer()
            sess.run(init)

            run_list = [model.sample_action, model.action_probs]
            feed_dict = {
                model.batch_size: 2,
                model.dropout_rate: 1.0,
                model.sequence_length: 1,
                model.vector_in: np.array([[1, 2, 3, 1, 2, 3],
                                           [3, 4, 5, 3, 4, 5]]),
                model.action_masks: np.ones([2, 2]),
            }
            sess.run(run_list, feed_dict=feed_dict)
Example #19
 def _create_observation_in(self, vis_encode_type):
     """
     Creates the observation inputs, and a CNN if needed.
     :param vis_encode_type: Type of CNN encoder.
     :return: The hidden critic stream (hidden_critic). We don't save it to self since it's used
     once and thrown away.
     """
     with tf.variable_scope(POLICY_SCOPE):
         hidden_streams = ModelUtils.create_observation_streams(
             self.policy.visual_in,
             self.policy.processed_vector_in,
             1,
             self.h_size,
             0,
             vis_encode_type=vis_encode_type,
             stream_scopes=["critic/value/"],
         )
     hidden_critic = hidden_streams[0]
     return hidden_critic
Example #20
    def _create_cc_actor(
        self,
        encoded: tf.Tensor,
        tanh_squash: bool = False,
        reparameterize: bool = False,
        condition_sigma_on_obs: bool = True,
    ) -> None:
        """
        Creates Continuous control actor-critic model.
        :param h_size: Size of hidden linear layers.
        :param num_layers: Number of hidden linear layers.
        :param vis_encode_type: Type of visual encoder to use if visual input.
        :param tanh_squash: Whether to use a tanh function, or a clipped output.
        :param reparameterize: Whether we are using the resampling trick to update the policy.
        """
        if self.use_recurrent:
            self.memory_in = tf.placeholder(shape=[None, self.m_size],
                                            dtype=tf.float32,
                                            name="recurrent_in")
            hidden_policy, memory_policy_out = ModelUtils.create_recurrent_encoder(
                encoded,
                self.memory_in,
                self.sequence_length_ph,
                name="lstm_policy")

            self.memory_out = tf.identity(memory_policy_out,
                                          name="recurrent_out")
        else:
            hidden_policy = encoded

        with tf.variable_scope("policy"):
            mu = tf.layers.dense(
                hidden_policy,
                self.act_size[0],
                activation=None,
                name="mu",
                kernel_initializer=ModelUtils.scaled_init(0.01),
                reuse=tf.AUTO_REUSE,
            )

            # Policy-dependent log_sigma
            if condition_sigma_on_obs:
                log_sigma = tf.layers.dense(
                    hidden_policy,
                    self.act_size[0],
                    activation=None,
                    name="log_sigma",
                    kernel_initializer=ModelUtils.scaled_init(0.01),
                )
            else:
                log_sigma = tf.get_variable(
                    "log_sigma",
                    [self.act_size[0]],
                    dtype=tf.float32,
                    initializer=tf.zeros_initializer(),
                )
            log_sigma = tf.clip_by_value(log_sigma, self.log_std_min,
                                         self.log_std_max)

            sigma = tf.exp(log_sigma)

            epsilon = tf.random_normal(tf.shape(mu))

            sampled_policy = mu + sigma * epsilon

            # Stop gradient if we're not doing the resampling trick
            if not reparameterize:
                sampled_policy_probs = tf.stop_gradient(sampled_policy)
            else:
                sampled_policy_probs = sampled_policy

            # Compute probability of model output.
            _gauss_pre = -0.5 * (
                ((sampled_policy_probs - mu) /
                 (sigma + EPSILON))**2 + 2 * log_sigma + np.log(2 * np.pi))
            # Sum the per-dimension log densities to get the joint log-probability.
            all_probs = tf.reduce_sum(_gauss_pre, axis=1, keepdims=True)

        if tanh_squash:
            self.output_pre = tf.tanh(sampled_policy)

            # Squash correction
            all_probs -= tf.reduce_sum(tf.log(1 - self.output_pre**2 +
                                              EPSILON),
                                       axis=1,
                                       keepdims=True)
            self.output = tf.identity(self.output_pre, name="action")
        else:
            self.output_pre = sampled_policy
            # Clip and scale output to ensure actions are always within [-1, 1] range.
            output_post = tf.clip_by_value(self.output_pre, -3, 3) / 3
            self.output = tf.identity(output_post, name="action")

        self.selected_actions = tf.stop_gradient(self.output)

        self.all_log_probs = tf.identity(all_probs, name="action_probs")

        single_dim_entropy = 0.5 * tf.reduce_mean(
            tf.log(2 * np.pi * np.e) + 2 * log_sigma)
        # Make entropy the right shape
        self.entropy = tf.ones_like(tf.reshape(mu[:, 0],
                                               [-1])) * single_dim_entropy

        # We keep these tensors the same name, but use new nodes to keep code parallelism with discrete control.
        self.log_probs = tf.reduce_sum((tf.identity(self.all_log_probs)),
                                       axis=1,
                                       keepdims=True)

        self.action_holder = tf.placeholder(shape=[None, self.act_size[0]],
                                            dtype=tf.float32,
                                            name="action_holder")
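
The _gauss_pre expression above is the per-dimension log-density of a diagonal Gaussian. A quick NumPy check (with arbitrary values) against the textbook form log N(x; mu, sigma) = -(x - mu)^2 / (2 sigma^2) - log sigma - 0.5 log(2 pi):

import numpy as np

mu = np.array([0.1, -0.2])
log_sigma = np.array([-1.0, 0.5])
sigma = np.exp(log_sigma)
x = mu + sigma * np.random.randn(2)          # a reparameterized sample, as in the code above

as_in_code = -0.5 * (((x - mu) / sigma) ** 2 + 2 * log_sigma + np.log(2 * np.pi))
textbook = -((x - mu) ** 2) / (2 * sigma ** 2) - log_sigma - 0.5 * np.log(2 * np.pi)
assert np.allclose(as_in_code, textbook)
print(as_in_code.sum())                      # summed over action dimensions -> joint log-prob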
Example #21
    def _create_dc_actor(self, encoded: tf.Tensor) -> None:
        """
        Creates Discrete control actor-critic model.
        :param h_size: Size of hidden linear layers.
        :param num_layers: Number of hidden linear layers.
        :param vis_encode_type: Type of visual encoder to use if visual input.
        """
        if self.use_recurrent:
            self.prev_action = tf.placeholder(shape=[None,
                                                     len(self.act_size)],
                                              dtype=tf.int32,
                                              name="prev_action")
            prev_action_oh = tf.concat(
                [
                    tf.one_hot(self.prev_action[:, i], self.act_size[i])
                    for i in range(len(self.act_size))
                ],
                axis=1,
            )
            hidden_policy = tf.concat([encoded, prev_action_oh], axis=1)

            self.memory_in = tf.placeholder(shape=[None, self.m_size],
                                            dtype=tf.float32,
                                            name="recurrent_in")
            hidden_policy, memory_policy_out = ModelUtils.create_recurrent_encoder(
                hidden_policy,
                self.memory_in,
                self.sequence_length_ph,
                name="lstm_policy",
            )

            self.memory_out = tf.identity(memory_policy_out, "recurrent_out")
        else:
            hidden_policy = encoded

        policy_branches = []
        with tf.variable_scope("policy"):
            for size in self.act_size:
                policy_branches.append(
                    tf.layers.dense(
                        hidden_policy,
                        size,
                        activation=None,
                        use_bias=False,
                        kernel_initializer=ModelUtils.scaled_init(0.01),
                    ))

        raw_log_probs = tf.concat(policy_branches, axis=1, name="action_probs")

        self.action_masks = tf.placeholder(shape=[None,
                                                  sum(self.act_size)],
                                           dtype=tf.float32,
                                           name="action_masks")
        output, self.action_probs, normalized_logits = ModelUtils.create_discrete_action_masking_layer(
            raw_log_probs, self.action_masks, self.act_size)

        self.output = tf.identity(output)
        self.all_log_probs = tf.identity(normalized_logits, name="action")

        self.action_holder = tf.placeholder(shape=[None,
                                                   len(policy_branches)],
                                            dtype=tf.int32,
                                            name="action_holder")
        self.action_oh = tf.concat(
            [
                tf.one_hot(self.action_holder[:, i], self.act_size[i])
                for i in range(len(self.act_size))
            ],
            axis=1,
        )
        self.selected_actions = tf.stop_gradient(self.action_oh)

        action_idx = [0] + list(np.cumsum(self.act_size))

        self.entropy = tf.reduce_sum(
            (tf.stack(
                [
                    tf.nn.softmax_cross_entropy_with_logits_v2(
                        labels=tf.nn.softmax(
                            self.all_log_probs[:,
                                               action_idx[i]:action_idx[i +
                                                                        1]]),
                        logits=self.all_log_probs[:,
                                                  action_idx[i]:action_idx[i +
                                                                           1]],
                    ) for i in range(len(self.act_size))
                ],
                axis=1,
            )),
            axis=1,
        )

        self.log_probs = tf.reduce_sum(
            (tf.stack(
                [
                    -tf.nn.softmax_cross_entropy_with_logits_v2(
                        labels=self.action_oh[:,
                                              action_idx[i]:action_idx[i + 1]],
                        logits=normalized_logits[:,
                                                 action_idx[i]:action_idx[i +
                                                                          1]],
                    ) for i in range(len(self.act_size))
                ],
                axis=1,
            )),
            axis=1,
            keepdims=True,
        )
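
A short NumPy sketch (branch sizes are illustrative) of how action_idx is used above to slice the concatenated logits: act_size lists the number of choices per discrete branch, and the cumulative sums give the column boundaries of each branch inside the [batch, sum(act_size)] tensor.

import numpy as np

act_size = [3, 2]                                # e.g. a 3-way branch and a 2-way branch
action_idx = [0] + list(np.cumsum(act_size))     # -> [0, 3, 5]

batch = 4
all_log_probs = np.random.rand(batch, sum(act_size))
branches = [
    all_log_probs[:, action_idx[i]:action_idx[i + 1]]
    for i in range(len(act_size))
]
assert [b.shape[1] for b in branches] == act_size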
Example #22
    def __init__(self, policy: TFPolicy, trainer_params: Dict[str, Any]):
        """
        Takes a Policy and a Dict of trainer parameters and creates an Optimizer around the policy.
        The PPO optimizer has a value estimator and a loss function.
        :param policy: A TFPolicy object that will be updated by this PPO Optimizer.
        :param trainer_params: Trainer parameters dictionary that specifies the properties of the trainer.
        """
        # Create the graph here to give more granular control of the TF graph to the Optimizer.
        policy.create_tf_graph()

        with policy.graph.as_default():
            with tf.variable_scope("optimizer/"):
                super().__init__(policy, trainer_params)

                lr = float(trainer_params["learning_rate"])
                lr_schedule = LearningRateSchedule(
                    trainer_params.get("learning_rate_schedule", "linear"))
                h_size = int(trainer_params["hidden_units"])
                epsilon = float(trainer_params["epsilon"])
                beta = float(trainer_params["beta"])
                max_step = float(trainer_params["max_steps"])
                num_layers = int(trainer_params["num_layers"])
                vis_encode_type = EncoderType(
                    trainer_params.get("vis_encode_type", "simple"))
                self.burn_in_ratio = float(
                    trainer_params.get("burn_in_ratio", 0.0))

                self.stream_names = list(self.reward_signals.keys())

                self.tf_optimizer: Optional[tf.train.AdamOptimizer] = None
                self.grads = None
                self.update_batch: Optional[tf.Operation] = None

                self.stats_name_to_update_name = {
                    "Losses/Value Loss": "value_loss",
                    "Losses/Policy Loss": "policy_loss",
                    "Policy/Learning Rate": "learning_rate",
                }
                if self.policy.use_recurrent:
                    self.m_size = self.policy.m_size
                    self.memory_in = tf.placeholder(
                        shape=[None, self.m_size],
                        dtype=tf.float32,
                        name="recurrent_value_in",
                    )

                if num_layers < 1:
                    num_layers = 1
                if policy.use_continuous_act:
                    self._create_cc_critic(h_size, num_layers, vis_encode_type)
                else:
                    self._create_dc_critic(h_size, num_layers, vis_encode_type)

                self.learning_rate = ModelUtils.create_learning_rate(
                    lr_schedule, lr, self.policy.global_step, int(max_step))
                self._create_losses(
                    self.policy.total_log_probs,
                    self.old_log_probs,
                    self.value_heads,
                    self.policy.entropy,
                    beta,
                    epsilon,
                    lr,
                    max_step,
                )
                self._create_ppo_optimizer_ops()

            self.update_dict.update({
                "value_loss": self.value_loss,
                "policy_loss": self.abs_policy_loss,
                "update_batch": self.update_batch,
                "learning_rate": self.learning_rate,
            })

            self.policy.initialize_or_load()
Example #23
    def create_resnet_visual_observation_encoder(
        image_input: tf.Tensor,
        h_size: int,
        activation: ActivationFunction,
        num_layers: int,
        scope: str,
        reuse: bool,
    ) -> tf.Tensor:
        """
        Builds a set of resnet visual encoders.
        :param image_input: The placeholder for the image input to use.
        :param h_size: Hidden layer size.
        :param activation: What type of activation function to use for layers.
        :param num_layers: number of hidden layers to create.
        :param scope: The scope of the graph within which to create the ops.
        :param reuse: Whether to re-use the weights within the same scope.
        :return: List of hidden layer tensors.
        """
        n_channels = [16, 32, 32]  # channel for each stack
        n_blocks = 2  # number of residual blocks
        with tf.variable_scope(scope):
            hidden = image_input
            for i, ch in enumerate(n_channels):
                hidden = tf.layers.conv2d(
                    hidden,
                    ch,
                    kernel_size=[3, 3],
                    strides=[1, 1],
                    reuse=reuse,
                    name="layer%dconv_1" % i,
                )
                hidden = tf.layers.max_pooling2d(hidden,
                                                 pool_size=[3, 3],
                                                 strides=[2, 2],
                                                 padding="same")
                # create residual blocks
                for j in range(n_blocks):
                    block_input = hidden
                    hidden = tf.nn.relu(hidden)
                    hidden = tf.layers.conv2d(
                        hidden,
                        ch,
                        kernel_size=[3, 3],
                        strides=[1, 1],
                        padding="same",
                        reuse=reuse,
                        name="layer%d_%d_conv1" % (i, j),
                    )
                    hidden = tf.nn.relu(hidden)
                    hidden = tf.layers.conv2d(
                        hidden,
                        ch,
                        kernel_size=[3, 3],
                        strides=[1, 1],
                        padding="same",
                        reuse=reuse,
                        name="layer%d_%d_conv2" % (i, j),
                    )
                    hidden = tf.add(block_input, hidden)
            hidden = tf.nn.relu(hidden)
            hidden = tf.layers.flatten(hidden)

        with tf.variable_scope(scope + "/" + "flat_encoding"):
            hidden_flat = ModelUtils.create_vector_observation_encoder(
                hidden, h_size, activation, num_layers, scope, reuse)
        return hidden_flat
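
As a rough spatial-size sketch (assuming an 84x84 input; the real size depends on the Unity sensor): the first convolution in each stack uses the default 'valid' padding and shrinks the image by 2 pixels, the stride-2 max pool uses 'same' padding and roughly halves it, and the residual convolutions use 'same' padding and leave it unchanged.

import math

size = 84                              # assumed input resolution
for _ in [16, 32, 32]:                 # one pass per channel stack (n_channels above)
    size = size - 2                    # 3x3 conv, stride 1, 'valid' padding
    size = math.ceil(size / 2)         # 3x3 max pool, stride 2, 'same' padding
print(size)                            # 84 -> 41 -> 20 -> 9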
Example #24
    def create_model(self, brain, trainer_params, reward_signal_configs,
                     is_training, load, seed):
        """
        Create PPO models, one on each device
        :param brain: Assigned Brain object.
        :param trainer_params: Defined training parameters.
        :param reward_signal_configs: Reward signal config
        :param seed: Random seed.
        """
        self.devices = get_devices()

        with self.graph.as_default():
            with tf.variable_scope("", reuse=tf.AUTO_REUSE):
                for device in self.devices:
                    with tf.device(device):
                        self.towers.append(
                            PPOModel(
                                brain=brain,
                                lr=float(trainer_params["learning_rate"]),
                                lr_schedule=LearningRateSchedule(
                                    trainer_params.get(
                                        "learning_rate_schedule", "linear")),
                                h_size=int(trainer_params["hidden_units"]),
                                epsilon=float(trainer_params["epsilon"]),
                                beta=float(trainer_params["beta"]),
                                max_step=float(trainer_params["max_steps"]),
                                normalize=trainer_params["normalize"],
                                use_recurrent=trainer_params["use_recurrent"],
                                num_layers=int(trainer_params["num_layers"]),
                                m_size=self.m_size,
                                seed=seed,
                                stream_names=list(
                                    reward_signal_configs.keys()),
                                vis_encode_type=EncoderType(
                                    trainer_params.get("vis_encode_type",
                                                       "simple")),
                            ))
                        self.towers[-1].create_ppo_optimizer()
            self.model = self.towers[0]
            avg_grads = self.average_gradients([t.grads for t in self.towers])
            update_batch = self.model.optimizer.apply_gradients(avg_grads)

            avg_value_loss = tf.reduce_mean(
                tf.stack([model.value_loss for model in self.towers]), 0)
            avg_policy_loss = tf.reduce_mean(
                tf.stack([model.policy_loss for model in self.towers]), 0)

        self.inference_dict.update({
            "action": self.model.output,
            "log_probs": self.model.all_log_probs,
            "value_heads": self.model.value_heads,
            "value": self.model.value,
            "entropy": self.model.entropy,
            "learning_rate": self.model.learning_rate,
        })
        if self.use_continuous_act:
            self.inference_dict["pre_action"] = self.model.output_pre
        if self.use_recurrent:
            self.inference_dict["memory_out"] = self.model.memory_out
        if (is_training and self.use_vec_obs and trainer_params["normalize"]
                and not load):
            self.inference_dict[
                "update_mean"] = self.model.update_normalization

        self.total_policy_loss = self.model.abs_policy_loss
        self.update_dict.update({
            "value_loss": avg_value_loss,
            "policy_loss": avg_policy_loss,
            "update_batch": update_batch,
        })
Example #25
    def __init__(self, policy: TFPolicy, trainer_params: TrainerSettings):
        """
        Takes a Policy and TrainerSettings and creates a SAC Optimizer around the policy.
        The SAC-specific hyperparameters are read from trainer_params.hyperparameters
        (SACSettings): the learning rate and its schedule, init_entcoef (initial entropy
        coefficient; set lower to learn faster, higher to explore more), tau (strength of
        the soft target-network update), and max_steps (total number of training steps).
        :param policy: A TFPolicy object that will be updated by this SAC Optimizer.
        :param trainer_params: Trainer settings that specify the properties of the trainer.
        """
        # Create the graph here to give more granular control of the TF graph to the Optimizer.
        policy.create_tf_graph()

        with policy.graph.as_default():
            with tf.variable_scope(""):
                super().__init__(policy, trainer_params)
                hyperparameters: SACSettings = cast(
                    SACSettings, trainer_params.hyperparameters)
                lr = hyperparameters.learning_rate
                lr_schedule = hyperparameters.learning_rate_schedule
                max_step = trainer_params.max_steps
                self.tau = hyperparameters.tau
                self.init_entcoef = hyperparameters.init_entcoef

                self.policy = policy
                self.act_size = policy.act_size
                policy_network_settings = policy.network_settings
                h_size = policy_network_settings.hidden_units
                num_layers = policy_network_settings.num_layers
                vis_encode_type = policy_network_settings.vis_encode_type

                self.tau = hyperparameters.tau
                self.burn_in_ratio = 0.0

                # Non-exposed SAC parameters
                self.discrete_target_entropy_scale = (
                    0.2  # Roughly equal to e-greedy 0.05
                )
                self.continuous_target_entropy_scale = 1.0

                stream_names = list(self.reward_signals.keys())
                # Use to reduce "survivor bonus" when using Curiosity or GAIL.
                self.gammas = [
                    _val.gamma
                    for _val in trainer_params.reward_signals.values()
                ]
                self.use_dones_in_backup = {
                    name: tf.Variable(1.0)
                    for name in stream_names
                }
                self.disable_use_dones = {
                    name: self.use_dones_in_backup[name].assign(0.0)
                    for name in stream_names
                }

                if num_layers < 1:
                    num_layers = 1

                self.target_init_op: List[tf.Tensor] = []
                self.target_update_op: List[tf.Tensor] = []
                self.update_batch_policy: Optional[tf.Operation] = None
                self.update_batch_value: Optional[tf.Operation] = None
                self.update_batch_entropy: Optional[tf.Operation] = None

                self.policy_network = SACPolicyNetwork(
                    policy=self.policy,
                    m_size=self.policy.m_size,  # 3x policy.m_size
                    h_size=h_size,
                    normalize=self.policy.normalize,
                    use_recurrent=self.policy.use_recurrent,
                    num_layers=num_layers,
                    stream_names=stream_names,
                    vis_encode_type=vis_encode_type,
                )
                self.target_network = SACTargetNetwork(
                    policy=self.policy,
                    m_size=self.policy.m_size,  # 1x policy.m_size
                    h_size=h_size,
                    normalize=self.policy.normalize,
                    use_recurrent=self.policy.use_recurrent,
                    num_layers=num_layers,
                    stream_names=stream_names,
                    vis_encode_type=vis_encode_type,
                )
                # The optimizer's m_size is 3 times the policy (Q1, Q2, and Value)
                self.m_size = 3 * self.policy.m_size
                self._create_inputs_and_outputs()
                self.learning_rate = ModelUtils.create_schedule(
                    lr_schedule,
                    lr,
                    self.policy.global_step,
                    int(max_step),
                    min_value=1e-10,
                )
                self._create_losses(
                    self.policy_network.q1_heads,
                    self.policy_network.q2_heads,
                    lr,
                    int(max_step),
                    stream_names,
                    discrete=not self.policy.use_continuous_act,
                )
                self._create_sac_optimizer_ops()

                self.selected_actions = (self.policy.selected_actions
                                         )  # For GAIL and other reward signals
                if self.policy.normalize:
                    target_update_norm = self.target_network.copy_normalization(
                        self.policy.running_mean,
                        self.policy.running_variance,
                        self.policy.normalization_steps,
                    )
                    # Update the normalization of the optimizer when the policy does.
                    self.policy.update_normalization_op = tf.group([
                        self.policy.update_normalization_op, target_update_norm
                    ])

        self.stats_name_to_update_name = {
            "Losses/Value Loss": "value_loss",
            "Losses/Policy Loss": "policy_loss",
            "Losses/Q1 Loss": "q1_loss",
            "Losses/Q2 Loss": "q2_loss",
            "Policy/Entropy Coeff": "entropy_coef",
            "Policy/Learning Rate": "learning_rate",
        }

        self.update_dict = {
            "value_loss": self.total_value_loss,
            "policy_loss": self.policy_loss,
            "q1_loss": self.q1_loss,
            "q2_loss": self.q2_loss,
            "entropy_coef": self.ent_coef,
            "update_batch": self.update_batch_policy,
            "update_value": self.update_batch_value,
            "update_entropy": self.update_batch_entropy,
            "learning_rate": self.learning_rate,
        }
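
A shape sketch (sizes assumed) of why the optimizer's m_size is three times the policy's, as noted in the comment above: the SAC value network keeps separate recurrent memories for Q1, Q2, and the value head, which are carried around concatenated along the feature axis.

import numpy as np

batch, policy_m_size = 2, 16                       # illustrative sizes
q1_mem = np.zeros((batch, policy_m_size), dtype=np.float32)
q2_mem = np.zeros((batch, policy_m_size), dtype=np.float32)
value_mem = np.zeros((batch, policy_m_size), dtype=np.float32)

optimizer_mem = np.concatenate([q1_mem, q2_mem, value_mem], axis=1)
assert optimizer_mem.shape == (batch, 3 * policy_m_size)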
Example #26
    def __init__(
        self,
        policy,
        m_size=None,
        h_size=128,
        normalize=False,
        use_recurrent=False,
        num_layers=2,
        stream_names=None,
        vis_encode_type=EncoderType.SIMPLE,
    ):
        super().__init__(
            policy,
            m_size,
            h_size,
            normalize,
            use_recurrent,
            num_layers,
            stream_names,
            vis_encode_type,
        )
        with tf.variable_scope(TARGET_SCOPE):
            self.visual_in = ModelUtils.create_visual_input_placeholders(
                policy.brain.camera_resolutions
            )
            self.vector_in = ModelUtils.create_vector_input(policy.vec_obs_size)
            if self.policy.normalize:
                normalization_tensors = ModelUtils.create_normalizer(self.vector_in)
                self.update_normalization_op = normalization_tensors.update_op
                self.normalization_steps = normalization_tensors.steps
                self.running_mean = normalization_tensors.running_mean
                self.running_variance = normalization_tensors.running_variance
                self.processed_vector_in = ModelUtils.normalize_vector_obs(
                    self.vector_in,
                    self.running_mean,
                    self.running_variance,
                    self.normalization_steps,
                )
            else:
                self.processed_vector_in = self.vector_in
                self.update_normalization_op = None

            if self.policy.use_recurrent:
                self.memory_in = tf.placeholder(
                    shape=[None, m_size], dtype=tf.float32, name="target_recurrent_in"
                )
                self.value_memory_in = self.memory_in
            hidden_streams = ModelUtils.create_observation_streams(
                self.visual_in,
                self.processed_vector_in,
                1,
                self.h_size,
                0,
                vis_encode_type=vis_encode_type,
                stream_scopes=["critic/value/"],
            )
        if self.policy.use_continuous_act:
            self._create_cc_critic(hidden_streams[0], TARGET_SCOPE, create_qs=False)
        else:
            self._create_dc_critic(hidden_streams[0], TARGET_SCOPE, create_qs=False)
        if self.use_recurrent:
            self.memory_out = tf.concat(
                self.value_memory_out, axis=1
            )  # Needed for Barracuda to work
Example #27
    def __init__(self, policy: TFPolicy, trainer_params: TrainerSettings):
        """
        Takes a Policy and a Dict of trainer parameters and creates an Optimizer around the policy.
        The PPO optimizer has a value estimator and a loss function.
        :param policy: A TFPolicy object that will be updated by this PPO Optimizer.
        :param trainer_params: Trainer parameters dictionary that specifies the properties of the trainer.
        """
        # Create the graph here to give more granular control of the TF graph to the Optimizer.
        policy.create_tf_graph()

        with policy.graph.as_default():
            with tf.variable_scope("optimizer/"):
                super().__init__(policy, trainer_params)
                hyperparameters: PPOSettings = cast(
                    PPOSettings, trainer_params.hyperparameters)
                lr = float(hyperparameters.learning_rate)
                self._schedule = hyperparameters.learning_rate_schedule
                epsilon = float(hyperparameters.epsilon)
                beta = float(hyperparameters.beta)
                max_step = float(trainer_params.max_steps)

                policy_network_settings = policy.network_settings
                h_size = int(policy_network_settings.hidden_units)
                num_layers = policy_network_settings.num_layers
                vis_encode_type = policy_network_settings.vis_encode_type
                self.burn_in_ratio = 0.0

                self.stream_names = list(self.reward_signals.keys())

                self.tf_optimizer_op: Optional[tf.train.Optimizer] = None
                self.grads = None
                self.update_batch: Optional[tf.Operation] = None

                self.stats_name_to_update_name = {
                    "Losses/Value Loss": "value_loss",
                    "Losses/Policy Loss": "policy_loss",
                    "Policy/Learning Rate": "learning_rate",
                    "Policy/Epsilon": "decay_epsilon",
                    "Policy/Beta": "decay_beta",
                }
                if self.policy.use_recurrent:
                    self.m_size = self.policy.m_size
                    self.memory_in = tf.placeholder(
                        shape=[None, self.m_size],
                        dtype=tf.float32,
                        name="recurrent_value_in",
                    )

                if num_layers < 1:
                    num_layers = 1
                if policy.use_continuous_act:
                    self._create_cc_critic(h_size, num_layers, vis_encode_type)
                else:
                    self._create_dc_critic(h_size, num_layers, vis_encode_type)

                self.learning_rate = ModelUtils.create_schedule(
                    self._schedule,
                    lr,
                    self.policy.global_step,
                    int(max_step),
                    min_value=1e-10,
                )
                self._create_losses(
                    self.policy.total_log_probs,
                    self.old_log_probs,
                    self.value_heads,
                    self.policy.entropy,
                    beta,
                    epsilon,
                    lr,
                    max_step,
                )
                self._create_ppo_optimizer_ops()

            self.update_dict.update(
                {
                    "value_loss": self.value_loss,
                    "policy_loss": self.abs_policy_loss,
                    "update_batch": self.update_batch,
                    "learning_rate": self.learning_rate,
                    "decay_epsilon": self.decay_epsilon,
                    "decay_beta": self.decay_beta,
                }
            )
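_create_losses is called above but not shown in this example. For orientation, the PPO policy loss is built around the clipped surrogate objective; the sketch below is a generic TF1 restatement with illustrative names, not the ml-agents _create_losses itself.

import tensorflow as tf

def clipped_surrogate_loss(log_probs, old_log_probs, advantages, epsilon):
    # Probability ratio r(theta) = pi_theta(a|s) / pi_theta_old(a|s), taken in log space
    ratio = tf.exp(log_probs - old_log_probs)
    surrogate = ratio * advantages
    clipped = tf.clip_by_value(ratio, 1.0 - epsilon, 1.0 + epsilon) * advantages
    # PPO maximizes the pessimistic (element-wise minimum) surrogate, so negate it for a loss
    return -tf.reduce_mean(tf.minimum(surrogate, clipped))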
Beispiel #28
0
    def create_cc_actor(self, hidden_policy, scope):
        """
        Creates the continuous control actor for SAC.
        :param hidden_policy: Output of the feature extractor (i.e. the input for vector obs, output of the CNN for visual obs).
        :param scope: TF scope to assign to whatever is created in this block.
        """
        # Create action input (continuous)
        self.action_holder = tf.placeholder(shape=[None, self.act_size[0]],
                                            dtype=tf.float32,
                                            name="action_holder")
        self.external_action_in = self.action_holder

        scope = self.join_scopes(scope, "policy")

        with tf.variable_scope(scope):
            hidden_policy = self.create_vector_observation_encoder(
                hidden_policy,
                self.h_size,
                self.activ_fn,
                self.num_layers,
                "encoder",
                False,
            )
        if self.use_recurrent:
            hidden_policy, memory_out = self.create_recurrent_encoder(
                hidden_policy,
                self.policy_memory_in,
                self.sequence_length,
                name="lstm_policy",
            )
            self.policy_memory_out = memory_out
        with tf.variable_scope(scope):
            mu = tf.layers.dense(
                hidden_policy,
                self.act_size[0],
                activation=None,
                name="mu",
                kernel_initializer=LearningModel.scaled_init(0.01),
            )

            # Policy-dependent log standard deviation (stored as log_sigma_sq, but
            # clipped and used as a log std throughout)
            log_sigma_sq = tf.layers.dense(
                hidden_policy,
                self.act_size[0],
                activation=None,
                name="log_std",
                kernel_initializer=LearningModel.scaled_init(0.01),
            )

            self.log_sigma_sq = tf.clip_by_value(log_sigma_sq, LOG_STD_MIN,
                                                 LOG_STD_MAX)

            sigma_sq = tf.exp(self.log_sigma_sq)

            # Do the reparameterization trick
            policy_ = mu + tf.random_normal(tf.shape(mu)) * sigma_sq

            _gauss_pre = -0.5 * (((policy_ - mu) /
                                  (tf.exp(self.log_sigma_sq) + EPSILON))**2 +
                                 2 * self.log_sigma_sq + np.log(2 * np.pi))

            all_probs = tf.reduce_sum(_gauss_pre, axis=1, keepdims=True)

            self.entropy = tf.reduce_sum(self.log_sigma_sq +
                                         0.5 * np.log(2.0 * np.pi * np.e),
                                         axis=-1)

            # Squash probabilities
            # Keep deterministic around in case we want to use it.
            self.deterministic_output = tf.tanh(mu)

            # Note that this is just for symmetry with PPO.
            self.output_pre = tf.tanh(policy_)

            # Squash correction
            all_probs -= tf.reduce_sum(tf.log(1 - self.output_pre**2 +
                                              EPSILON),
                                       axis=1,
                                       keepdims=True)

            self.all_log_probs = all_probs
            self.selected_actions = tf.stop_gradient(self.output_pre)

            self.action_probs = all_probs

        # Extract output for Barracuda
        self.output = tf.identity(self.output_pre, name="action")

        # Get all policy vars
        self.policy_vars = self.get_vars(scope)
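The log-probability math above (a diagonal Gaussian density plus the tanh change-of-variables correction) can be restated in plain NumPy for sanity-checking. The function name and the use of log_std rather than log_sigma_sq are illustrative, and the EPSILON value is assumed to match the module-level constant.

import numpy as np

EPSILON = 1e-6  # assumed small stabilizing constant

def squashed_gaussian_log_prob(mu, log_std, raw_action):
    # Diagonal Gaussian log density of the pre-tanh sample, summed over action dims
    gauss = -0.5 * (
        ((raw_action - mu) / (np.exp(log_std) + EPSILON)) ** 2
        + 2.0 * log_std
        + np.log(2.0 * np.pi)
    )
    log_prob = np.sum(gauss, axis=1, keepdims=True)
    # Change-of-variables correction for the tanh squash: subtract log(1 - tanh(x)^2)
    squashed = np.tanh(raw_action)
    log_prob -= np.sum(np.log(1.0 - squashed ** 2 + EPSILON), axis=1, keepdims=True)
    return log_prob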
Beispiel #29
0
    def create_dc_actor(self, hidden_policy, scope):
        """
        Creates the discrete control actor for SAC.
        :param hidden_policy: Output of the feature extractor (i.e. the input for vector obs, output of the CNN for visual obs).
        :param scope: TF scope to assign to whatever is created in this block.
        """
        scope = self.join_scopes(scope, "policy")

        # Create inputs outside of the scope
        self.action_masks = tf.placeholder(shape=[None,
                                                  sum(self.act_size)],
                                           dtype=tf.float32,
                                           name="action_masks")

        if self.use_recurrent:
            self.prev_action = tf.placeholder(shape=[None,
                                                     len(self.act_size)],
                                              dtype=tf.int32,
                                              name="prev_action")

        with tf.variable_scope(scope):
            hidden_policy = self.create_vector_observation_encoder(
                hidden_policy,
                self.h_size,
                self.activ_fn,
                self.num_layers,
                "encoder",
                False,
            )
        if self.use_recurrent:
            prev_action_oh = tf.concat(
                [
                    tf.one_hot(self.prev_action[:, i], self.act_size[i])
                    for i in range(len(self.act_size))
                ],
                axis=1,
            )
            hidden_policy = tf.concat([hidden_policy, prev_action_oh], axis=1)

            hidden_policy, memory_out = self.create_recurrent_encoder(
                hidden_policy,
                self.policy_memory_in,
                self.sequence_length,
                name="lstm_policy",
            )
            self.policy_memory_out = memory_out
        with tf.variable_scope(scope):
            policy_branches = []
            for size in self.act_size:
                policy_branches.append(
                    tf.layers.dense(
                        hidden_policy,
                        size,
                        activation=None,
                        use_bias=False,
                        kernel_initializer=tf.initializers.variance_scaling(
                            0.01),
                    ))
            all_logits = tf.concat(policy_branches,
                                   axis=1,
                                   name="action_probs")

            output, normalized_probs, normalized_logprobs = self.create_discrete_action_masking_layer(
                all_logits, self.action_masks, self.act_size)

            self.action_probs = normalized_probs

            # Really, this is entropy, but it has an analogous purpose to the log probs in the
            # continuous case.
            self.all_log_probs = self.action_probs * normalized_logprobs
            self.output = output

            # Create action input (discrete)
            self.action_holder = tf.placeholder(
                shape=[None, len(policy_branches)],
                dtype=tf.int32,
                name="action_holder")

            self.output_oh = tf.concat(
                [
                    tf.one_hot(self.action_holder[:, i], self.act_size[i])
                    for i in range(len(self.act_size))
                ],
                axis=1,
            )

            # For Curiosity and GAIL to retrieve selected actions. We don't
            # need the mask at this point because it's already stored in the buffer.
            self.selected_actions = tf.stop_gradient(self.output_oh)

            self.external_action_in = tf.concat(
                [
                    tf.one_hot(self.action_holder[:, i], self.act_size[i])
                    for i in range(len(self.act_size))
                ],
                axis=1,
            )

            # This is total entropy over all branches
            self.entropy = -1 * tf.reduce_sum(self.all_log_probs, axis=1)

        # Extract the normalized logprobs for Barracuda
        self.normalized_logprobs = tf.identity(normalized_logprobs,
                                               name="action")

        # We kept the LSTMs at a different scope than the rest, so add them if they exist.
        self.policy_vars = self.get_vars(scope)
        if self.use_recurrent:
            self.policy_vars += self.get_vars("lstm")
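create_discrete_action_masking_layer is used above but not defined in this example. A common way to implement this kind of masking is to zero out the probabilities of disallowed actions and renormalize per branch; the single-branch TF1 sketch below is a hypothetical illustration, not the ml-agents helper itself.

import tensorflow as tf

EPSILON = 1e-10  # small constant to avoid log(0); value is illustrative

def mask_branch(branch_logits, branch_mask):
    # Zero out disallowed actions in probability space and renormalize
    masked_probs = tf.nn.softmax(branch_logits) * branch_mask
    normalized_probs = masked_probs / (
        tf.reduce_sum(masked_probs, axis=1, keepdims=True) + EPSILON
    )
    normalized_logprobs = tf.log(normalized_probs + EPSILON)
    # Sample one action index per batch entry from the renormalized distribution
    sampled_action = tf.multinomial(normalized_logprobs, 1)
    return sampled_action, normalized_probs, normalized_logprobs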