Example #1
    def create_cc_critic(self, hidden_value, scope, create_qs=True):
        """
        Creates just the critic network
        """
        scope = self.join_scopes(scope, "critic")
        self.create_sac_value_head(
            self.stream_names,
            hidden_value,
            self.num_layers,
            self.h_size,
            self.join_scopes(scope, "value"),
        )

        self.value_vars = self.get_vars(self.join_scopes(scope, "value"))

        if create_qs:
            hidden_q = tf.concat([hidden_value, self.external_action_in], axis=-1)
            hidden_qp = tf.concat([hidden_value, self.output], axis=-1)
            self.q1_heads, self.q2_heads, self.q1, self.q2 = self.create_q_heads(
                self.stream_names,
                hidden_q,
                self.num_layers,
                self.h_size,
                self.join_scopes(scope, "q"),
            )
            self.q1_pheads, self.q2_pheads, self.q1_p, self.q2_p = self.create_q_heads(
                self.stream_names,
                hidden_qp,
                self.num_layers,
                self.h_size,
                self.join_scopes(scope, "q"),
                reuse=True,
            )
            self.q_vars = self.get_vars(self.join_scopes(scope, "q"))
        self.critic_vars = self.get_vars(scope)
Example #2
    def _create_dc_actor(self, encoded: tf.Tensor) -> None:
        """
        Creates the discrete control actor (policy) network.
        :param encoded: Encoded observation tensor (output of the observation encoder).
        """
        if self.use_recurrent:
            self.prev_action = tf.placeholder(shape=[None,
                                                     len(self.act_size)],
                                              dtype=tf.int32,
                                              name="prev_action")
            prev_action_oh = tf.concat(
                [
                    tf.one_hot(self.prev_action[:, i], self.act_size[i])
                    for i in range(len(self.act_size))
                ],
                axis=1,
            )
            hidden_policy = tf.concat([encoded, prev_action_oh], axis=1)

            self.memory_in = tf.placeholder(shape=[None, self.m_size],
                                            dtype=tf.float32,
                                            name="recurrent_in")
            hidden_policy, memory_policy_out = ModelUtils.create_recurrent_encoder(
                hidden_policy,
                self.memory_in,
                self.sequence_length_ph,
                name="lstm_policy",
            )

            self.memory_out = tf.identity(memory_policy_out, "recurrent_out")
        else:
            hidden_policy = encoded

        self.action_masks = tf.placeholder(shape=[None,
                                                  sum(self.act_size)],
                                           dtype=tf.float32,
                                           name="action_masks")

        with tf.variable_scope("policy"):
            distribution = MultiCategoricalDistribution(
                hidden_policy, self.act_size, self.action_masks)
        # It's important that we are able to feed_dict a value into this tensor to get the
        # right one-hot encoding, so we can't do identity on it.
        self.output = distribution.sample
        self.all_log_probs = tf.identity(distribution.log_probs, name="action")
        self.selected_actions = tf.stop_gradient(
            distribution.sample_onehot)  # In discrete, these are onehot
        self.entropy = distribution.entropy
        self.total_log_probs = distribution.total_log_probs
Example #3
 def create_discrete_action_masking_layer(all_logits, action_masks,
                                          action_size):
     """
     Creates a masking layer for the discrete actions
     :param all_logits: The concatenated unnormalized action probabilities for all branches
     :param action_masks: The mask for the logits. Must be of dimension [None x total_number_of_actions]
     :param action_size: A list containing the number of possible actions for each branch
     :return: The sampled action of dimension [batch_size, num_branches], the concatenated
         normalized probs (after softmax), and the concatenated normalized log probs
     """
     action_idx = [0] + list(np.cumsum(action_size))
     branches_logits = [
         all_logits[:, action_idx[i]:action_idx[i + 1]]
         for i in range(len(action_size))
     ]
     branch_masks = [
         action_masks[:, action_idx[i]:action_idx[i + 1]]
         for i in range(len(action_size))
     ]
     raw_probs = [
         tf.multiply(
             tf.nn.softmax(branches_logits[k]) + EPSILON, branch_masks[k])
         for k in range(len(action_size))
     ]
     normalized_probs = [
         tf.divide(raw_probs[k],
                   tf.reduce_sum(raw_probs[k], axis=1, keepdims=True))
         for k in range(len(action_size))
     ]
     output = tf.concat(
         [
             tf.multinomial(tf.log(normalized_probs[k] + EPSILON), 1)
             for k in range(len(action_size))
         ],
         axis=1,
     )
     return (
         output,
         tf.concat([normalized_probs[k] for k in range(len(action_size))],
                   axis=1),
         tf.concat(
             [
                 tf.log(normalized_probs[k] + EPSILON)
                 for k in range(len(action_size))
             ],
             axis=1,
         ),
     )
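
Not part of the original module, but as a rough numeric illustration of what the masking step does: softmax each branch, zero out disallowed actions (plus a small epsilon for stability), then renormalize so each branch is a valid distribution again. A minimal NumPy sketch with made-up values:

import numpy as np

EPSILON = 1e-7  # stands in for the module-level EPSILON used above

logits = np.array([[2.0, 1.0, 0.5]])   # one branch, batch of 1
mask = np.array([[1.0, 0.0, 1.0]])     # action 1 is masked out

probs = np.exp(logits) / np.exp(logits).sum(axis=1, keepdims=True)  # softmax
raw_probs = (probs + EPSILON) * mask                                # masked, unnormalized
normalized_probs = raw_probs / raw_probs.sum(axis=1, keepdims=True)

print(normalized_probs)                    # masked action gets ~0 probability
print(np.log(normalized_probs + EPSILON))  # the log probs that get concatenated
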
Example #4
 def create_forward_model(
     self, encoded_state: tf.Tensor, encoded_next_state: tf.Tensor
 ) -> None:
     """
     Creates forward model TensorFlow ops for Curiosity module.
     Predicts encoded future state based on encoded current state and given action.
     :param encoded_state: Tensor corresponding to encoded current state.
     :param encoded_next_state: Tensor corresponding to encoded next state.
     """
     combined_input = tf.concat(
         [encoded_state, self.policy.selected_actions], axis=1
     )
     hidden = tf.layers.dense(combined_input, 256, activation=ModelUtils.swish)
     pred_next_state = tf.layers.dense(
         hidden,
         self.encoding_size
         * (self.policy.vis_obs_size + int(self.policy.vec_obs_size > 0)),
         activation=None,
     )
     squared_difference = 0.5 * tf.reduce_sum(
         tf.squared_difference(pred_next_state, encoded_next_state), axis=1
     )
     self.intrinsic_reward = squared_difference
     self.forward_loss = tf.reduce_mean(
         tf.dynamic_partition(squared_difference, self.policy.mask, 2)[1]
     )
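
The intrinsic reward produced above is simply half the squared error between the predicted and actual next-state encodings, summed over features. A small NumPy sketch with hypothetical encodings:

import numpy as np

# Hypothetical encodings for a batch of two transitions.
pred_next_state = np.array([[0.2, 0.8], [0.5, 0.1]])
encoded_next_state = np.array([[0.0, 1.0], [0.5, 0.5]])

# Same quantity as `squared_difference` above: 0.5 * sum of squared errors.
intrinsic_reward = 0.5 * np.sum((pred_next_state - encoded_next_state) ** 2, axis=1)
print(intrinsic_reward)  # the harder the next state is to predict, the larger the reward
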
Example #5
    def create_recurrent_encoder(input_state,
                                 memory_in,
                                 sequence_length,
                                 name="lstm"):
        """
        Builds a recurrent encoder for either state or observations (LSTM).
        :param sequence_length: Length of sequence to unroll.
        :param input_state: The input tensor to the LSTM cell.
        :param memory_in: The input memory to the LSTM cell.
        :param name: The scope of the LSTM cell.
        """
        s_size = input_state.get_shape().as_list()[1]
        m_size = memory_in.get_shape().as_list()[1]
        lstm_input_state = tf.reshape(input_state,
                                      shape=[-1, sequence_length, s_size])
        memory_in = tf.reshape(memory_in[:, :], [-1, m_size])
        half_point = int(m_size / 2)
        with tf.variable_scope(name):
            rnn_cell = tf.nn.rnn_cell.BasicLSTMCell(half_point)
            lstm_vector_in = tf.nn.rnn_cell.LSTMStateTuple(
                memory_in[:, :half_point], memory_in[:, half_point:])
            recurrent_output, lstm_state_out = tf.nn.dynamic_rnn(
                rnn_cell, lstm_input_state, initial_state=lstm_vector_in)

        recurrent_output = tf.reshape(recurrent_output, shape=[-1, half_point])
        return recurrent_output, tf.concat(
            [lstm_state_out.c, lstm_state_out.h], axis=1)
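
A small NumPy sketch (sizes invented) of the memory layout this encoder assumes: the incoming memory vector is split in half, the first half used as the LSTM cell state c and the second half as the hidden state h, and the outgoing memory is the new (c, h) pair concatenated back together:

import numpy as np

m_size = 8                      # total memory width; must be even
half_point = m_size // 2
memory_in = np.arange(m_size, dtype=np.float32).reshape(1, m_size)

c_in = memory_in[:, :half_point]   # LSTM cell state
h_in = memory_in[:, half_point:]   # LSTM hidden state

# After the unroll, the new states are packed back the same way,
# mirroring tf.concat([lstm_state_out.c, lstm_state_out.h], axis=1).
memory_out = np.concatenate([c_in, h_in], axis=1)
print(c_in.shape, h_in.shape, memory_out.shape)   # (1, 4) (1, 4) (1, 8)
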
Example #6
    def create_encoder(
        self, state_in: tf.Tensor, action_in: tf.Tensor, done_in: tf.Tensor, reuse: bool
    ) -> Tuple[tf.Tensor, tf.Tensor, tf.Tensor]:
        """
        Creates the encoder for the discriminator
        :param state_in: The encoded observation input
        :param action_in: The action input
        :param done_in: The done flags input
        :param reuse: If true, the weights will be shared with the previous encoder created
        """
        with tf.variable_scope("GAIL_model"):
            if self.use_actions:
                concat_input = tf.concat([state_in, action_in, done_in], axis=1)
            else:
                concat_input = state_in

            hidden_1 = tf.layers.dense(
                concat_input,
                self.h_size,
                activation=ModelUtils.swish,
                name="gail_d_hidden_1",
                reuse=reuse,
            )

            hidden_2 = tf.layers.dense(
                hidden_1,
                self.h_size,
                activation=ModelUtils.swish,
                name="gail_d_hidden_2",
                reuse=reuse,
            )

            z_mean = None
            if self.use_vail:
                # Latent representation
                z_mean = tf.layers.dense(
                    hidden_2,
                    self.z_size,
                    reuse=reuse,
                    name="gail_z_mean",
                    kernel_initializer=ModelUtils.scaled_init(0.01),
                )

                self.noise = tf.random_normal(tf.shape(z_mean), dtype=tf.float32)

                # Sampled latent code
                self.z = z_mean + self.z_sigma * self.noise * self.use_noise
                estimate_input = self.z
            else:
                estimate_input = hidden_2

            estimate = tf.layers.dense(
                estimate_input,
                1,
                activation=tf.nn.sigmoid,
                name="gail_d_estimate",
                reuse=reuse,
            )
            return estimate, z_mean, concat_input
Example #7
 def create_discrete_action_masking_layer(
     branches_logits: List[tf.Tensor],
     action_masks: tf.Tensor,
     action_size: List[int],
 ) -> Tuple[tf.Tensor, tf.Tensor, tf.Tensor]:
     """
     Creates a masking layer for the discrete actions
     :param branches_logits: A List of the unnormalized action probabilities for each branch
     :param action_masks: The mask for the logits. Must be of dimension [None x total_number_of_actions]
     :param action_size: A list containing the number of possible actions for each branch
     :return: The sampled action of dimension [batch_size, num_branches], the concatenated
         normalized probs (after softmax), and the concatenated normalized log probs
     """
     branch_masks = ModelUtils.break_into_branches(action_masks,
                                                   action_size)
     raw_probs = [
         tf.multiply(
             tf.nn.softmax(branches_logits[k]) + EPSILON, branch_masks[k])
         for k in range(len(action_size))
     ]
     normalized_probs = [
         tf.divide(raw_probs[k],
                   tf.reduce_sum(raw_probs[k], axis=1, keepdims=True))
         for k in range(len(action_size))
     ]
     output = tf.concat(
         [
             tf.multinomial(tf.log(normalized_probs[k] + EPSILON), 1)
             for k in range(len(action_size))
         ],
         axis=1,
     )
     return (
         output,
         tf.concat([normalized_probs[k] for k in range(len(action_size))],
                   axis=1),
         tf.concat(
             [
                 tf.log(normalized_probs[k] + EPSILON)
                 for k in range(len(action_size))
             ],
             axis=1,
         ),
     )
Example #8
 def _action_onehot(self, sample: tf.Tensor,
                    act_size: List[int]) -> tf.Tensor:
     action_oh = tf.concat(
         [
             tf.one_hot(sample[:, i], act_size[i])
             for i in range(len(act_size))
         ],
         axis=1,
     )
     return action_oh
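
The helper simply concatenates one one-hot vector per action branch. A NumPy sketch with hypothetical branch sizes:

import numpy as np

act_size = [3, 2]               # two branches with 3 and 2 actions
sample = np.array([[2, 0]])     # one sampled action index per branch

action_oh = np.concatenate(
    [np.eye(act_size[i])[sample[:, i]] for i in range(len(act_size))], axis=1
)
print(action_oh)                # [[0. 0. 1. 1. 0.]]
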
Example #9
 def create_inverse_model(self, encoded_state: tf.Tensor,
                          encoded_next_state: tf.Tensor) -> None:
     """
     Creates inverse model TensorFlow ops for Curiosity module.
     Predicts action taken given current and future encoded states.
     :param encoded_state: Tensor corresponding to encoded current state.
     :param encoded_next_state: Tensor corresponding to encoded next state.
     """
     combined_input = tf.concat([encoded_state, encoded_next_state], axis=1)
     hidden = tf.layers.dense(combined_input,
                              256,
                              activation=LearningModel.swish)
     if self.policy_model.brain.vector_action_space_type == "continuous":
         pred_action = tf.layers.dense(hidden,
                                       self.policy_model.act_size[0],
                                       activation=None)
         squared_difference = tf.reduce_sum(
             tf.squared_difference(pred_action,
                                   self.policy_model.selected_actions),
             axis=1,
         )
         self.inverse_loss = tf.reduce_mean(
             tf.dynamic_partition(squared_difference,
                                  self.policy_model.mask, 2)[1])
     else:
         pred_action = tf.concat(
             [
                 tf.layers.dense(hidden,
                                 self.policy_model.act_size[i],
                                 activation=tf.nn.softmax)
                 for i in range(len(self.policy_model.act_size))
             ],
             axis=1,
         )
         cross_entropy = tf.reduce_sum(
             -tf.log(pred_action + 1e-10) *
             self.policy_model.selected_actions,
             axis=1,
         )
         self.inverse_loss = tf.reduce_mean(
             tf.dynamic_partition(cross_entropy, self.policy_model.mask,
                                  2)[1])
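
In the discrete case above, the loss is an ordinary cross entropy between the predicted per-branch softmax and the one-hot action that was actually taken. A tiny NumPy sketch with invented numbers:

import numpy as np

# Hypothetical predicted action distribution (single branch, batch of 1).
pred_action = np.array([[0.7, 0.2, 0.1]])
selected_actions = np.array([[0.0, 1.0, 0.0]])   # one-hot of the action taken

# Same form as the discrete-case loss above (before masking and averaging).
cross_entropy = np.sum(-np.log(pred_action + 1e-10) * selected_actions, axis=1)
print(cross_entropy)   # -log(0.2) ≈ 1.61
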
Example #10
    def __init__(
        self,
        brain,
        m_size=None,
        h_size=128,
        normalize=False,
        use_recurrent=False,
        num_layers=2,
        stream_names=None,
        seed=0,
        vis_encode_type=EncoderType.SIMPLE,
    ):
        super().__init__(
            brain,
            m_size,
            h_size,
            normalize,
            use_recurrent,
            num_layers,
            stream_names,
            seed,
            vis_encode_type,
        )
        self.share_ac_cnn = False
        if self.use_recurrent:
            self.create_memory_ins(self.m_size)

        hidden_policy, hidden_critic = self.create_observation_ins(
            vis_encode_type, self.share_ac_cnn
        )

        if brain.vector_action_space_type == "continuous":
            self.create_cc_actor(hidden_policy, POLICY_SCOPE)
            self.create_cc_critic(hidden_critic, POLICY_SCOPE)

        else:
            self.create_dc_actor(hidden_policy, POLICY_SCOPE)
            self.create_dc_critic(hidden_critic, POLICY_SCOPE)

        if self.share_ac_cnn:
            # Make sure that the policy also contains the CNN
            self.policy_vars += self.get_vars(
                self.join_scopes(POLICY_SCOPE, "critic/value/main_graph_0_encoder0")
            )
        if self.use_recurrent:
            mem_outs = [
                self.value_memory_out,
                self.q1_memory_out,
                self.q2_memory_out,
                self.policy_memory_out,
            ]
            self.memory_out = tf.concat(mem_outs, axis=1)
Example #11
 def _create_policy_branches(self, logits: tf.Tensor,
                             act_size: List[int]) -> tf.Tensor:
     policy_branches = []
     for size in act_size:
         policy_branches.append(
             tf.layers.dense(
                 logits,
                 size,
                 activation=None,
                 use_bias=False,
                 kernel_initializer=ModelUtils.scaled_init(0.01),
             ))
     unmasked_log_probs = tf.concat(policy_branches, axis=1)
     return unmasked_log_probs
Example #12
 def __init__(
     self,
     brain,
     m_size=None,
     h_size=128,
     normalize=False,
     use_recurrent=False,
     num_layers=2,
     stream_names=None,
     seed=0,
     vis_encode_type=EncoderType.SIMPLE,
 ):
     super().__init__(
         brain,
         m_size,
         h_size,
         normalize,
         use_recurrent,
         num_layers,
         stream_names,
         seed,
         vis_encode_type,
     )
     if self.use_recurrent:
         self.memory_in = tf.placeholder(shape=[None, self.m_size],
                                         dtype=tf.float32,
                                         name="recurrent_in")
         self.value_memory_in = self.memory_in
     with tf.variable_scope(TARGET_SCOPE):
         hidden_streams = self.create_observation_streams(
             1,
             self.h_size,
             0,
             vis_encode_type=vis_encode_type,
             stream_scopes=["critic/value/"],
         )
     if brain.vector_action_space_type == "continuous":
         self.create_cc_critic(hidden_streams[0],
                               TARGET_SCOPE,
                               create_qs=False)
     else:
         self.create_dc_critic(hidden_streams[0],
                               TARGET_SCOPE,
                               create_qs=False)
     if self.use_recurrent:
         self.memory_out = tf.concat(self.value_memory_out,
                                     axis=1)  # Needed for Barracuda to work
Example #13
    def __init__(
        self,
        policy,
        m_size=None,
        h_size=128,
        normalize=False,
        use_recurrent=False,
        num_layers=2,
        stream_names=None,
        vis_encode_type=EncoderType.SIMPLE,
    ):
        super().__init__(
            policy,
            m_size,
            h_size,
            normalize,
            use_recurrent,
            num_layers,
            stream_names,
            vis_encode_type,
        )
        if self.policy.use_recurrent:
            self._create_memory_ins(m_size)

        hidden_critic = self._create_observation_in(vis_encode_type)
        self.policy.output = self.policy.output
        # Use the sequence length of the policy
        self.sequence_length_ph = self.policy.sequence_length_ph

        if self.policy.use_continuous_act:
            self._create_cc_critic(hidden_critic, POLICY_SCOPE)

        else:
            self._create_dc_critic(hidden_critic, POLICY_SCOPE)

        if self.use_recurrent:
            mem_outs = [
                self.value_memory_out, self.q1_memory_out, self.q2_memory_out
            ]
            self.memory_out = tf.concat(mem_outs, axis=1)
Example #14
    def create_memory_ins(self, m_size):
        """
        Creates the memory input placeholders for LSTM.
        :param m_size: the total size of the memory.
        """
        # Create the Policy input separate from the rest
        # This is so in inference we only have to run the Policy network.
        # Barracuda will grab the recurrent_in and recurrent_out named tensors.
        self.inference_memory_in = tf.placeholder(
            shape=[None, m_size // 4], dtype=tf.float32, name="recurrent_in"
        )
        # We assume m_size is divisible by 4
        # Create the non-Policy inputs
        # Use a default placeholder here so nothing has to be provided during
        # Barracuda inference. Note that the default value is just the tiled input
        # for the policy, which is thrown away.
        three_fourths_m_size = m_size * 3 // 4
        self.other_memory_in = tf.placeholder_with_default(
            input=tf.tile(self.inference_memory_in, [1, 3]),
            shape=[None, three_fourths_m_size],
            name="other_recurrent_in",
        )

        # Concat and use this as the "placeholder"
        # for training
        self.memory_in = tf.concat(
            [self.other_memory_in, self.inference_memory_in], axis=1
        )

        # Re-break-up for each network
        num_mems = 4
        mem_ins = []
        for i in range(num_mems):
            _start = m_size // num_mems * i
            _end = m_size // num_mems * (i + 1)
            mem_ins.append(self.memory_in[:, _start:_end])
        self.value_memory_in = mem_ins[0]
        self.q1_memory_in = mem_ins[1]
        self.q2_memory_in = mem_ins[2]
        self.policy_memory_in = mem_ins[3]
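
A NumPy sketch (with a made-up m_size) of the layout this sets up: the Policy quarter is the only part fed at inference time; the other three quarters default to a tiled copy of it and are only meaningful during training, when the full memory is re-split into value, q1, q2 and policy slices.

import numpy as np

m_size = 8                                    # assumed divisible by 4
inference_memory_in = np.ones((1, m_size // 4), dtype=np.float32)

# Default for the non-Policy memories: a tiled copy of the policy memory,
# mirroring tf.tile(self.inference_memory_in, [1, 3]) above.
other_memory_in = np.tile(inference_memory_in, (1, 3))

memory_in = np.concatenate([other_memory_in, inference_memory_in], axis=1)

# Re-split into value / q1 / q2 / policy memories.
value_m, q1_m, q2_m, policy_m = np.split(memory_in, 4, axis=1)
print(memory_in.shape, policy_m.shape)        # (1, 8) (1, 2)
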
Example #15
    def make_inputs(self) -> None:
        """
        Creates the input layers for the discriminator
        """
        self.done_expert = tf.placeholder(shape=[None, 1], dtype=tf.float32)
        self.done_policy = tf.placeholder(shape=[None, 1], dtype=tf.float32)

        if self.policy.behavior_spec.action_spec.is_continuous():
            action_length = self.policy.act_size[0]
            self.action_in_expert = tf.placeholder(shape=[None, action_length],
                                                   dtype=tf.float32)
            self.expert_action = tf.identity(self.action_in_expert)
        else:
            action_length = len(self.policy.act_size)
            self.action_in_expert = tf.placeholder(shape=[None, action_length],
                                                   dtype=tf.int32)
            self.expert_action = tf.concat(
                [
                    tf.one_hot(self.action_in_expert[:, i], act_size)
                    for i, act_size in enumerate(self.policy.act_size)
                ],
                axis=1,
            )
Example #16
    def create_dc_actor(self, hidden_policy, scope):
        """
        Creates Discrete control actor for SAC.
        :param hidden_policy: Output of feature extractor (i.e. the input for vector obs, output of CNN for visual obs).
        :param scope: TF scope to assign whatever is created in this block.
        """
        scope = self.join_scopes(scope, "policy")

        # Create inputs outside of the scope
        self.action_masks = tf.placeholder(shape=[None,
                                                  sum(self.act_size)],
                                           dtype=tf.float32,
                                           name="action_masks")

        if self.use_recurrent:
            self.prev_action = tf.placeholder(shape=[None,
                                                     len(self.act_size)],
                                              dtype=tf.int32,
                                              name="prev_action")

        with tf.variable_scope(scope):
            hidden_policy = self.create_vector_observation_encoder(
                hidden_policy,
                self.h_size,
                self.activ_fn,
                self.num_layers,
                "encoder",
                False,
            )
        if self.use_recurrent:
            prev_action_oh = tf.concat(
                [
                    tf.one_hot(self.prev_action[:, i], self.act_size[i])
                    for i in range(len(self.act_size))
                ],
                axis=1,
            )
            hidden_policy = tf.concat([hidden_policy, prev_action_oh], axis=1)

            hidden_policy, memory_out = self.create_recurrent_encoder(
                hidden_policy,
                self.policy_memory_in,
                self.sequence_length,
                name="lstm_policy",
            )
            self.policy_memory_out = memory_out
        with tf.variable_scope(scope):
            policy_branches = []
            for size in self.act_size:
                policy_branches.append(
                    tf.layers.dense(
                        hidden_policy,
                        size,
                        activation=None,
                        use_bias=False,
                        kernel_initializer=tf.initializers.variance_scaling(
                            0.01),
                    ))
            all_logits = tf.concat(policy_branches,
                                   axis=1,
                                   name="action_probs")

            output, normalized_probs, normalized_logprobs = self.create_discrete_action_masking_layer(
                all_logits, self.action_masks, self.act_size)

            self.action_probs = normalized_probs

            # Really, this is entropy, but it has an analogous purpose to the log probs in the
            # continuous case.
            self.all_log_probs = self.action_probs * normalized_logprobs
            self.output = output

            # Create action input (discrete)
            self.action_holder = tf.placeholder(
                shape=[None, len(policy_branches)],
                dtype=tf.int32,
                name="action_holder")

            self.output_oh = tf.concat(
                [
                    tf.one_hot(self.action_holder[:, i], self.act_size[i])
                    for i in range(len(self.act_size))
                ],
                axis=1,
            )

            # For Curiosity and GAIL to retrieve selected actions. We don't
            # need the mask at this point because it's already stored in the buffer.
            self.selected_actions = tf.stop_gradient(self.output_oh)

            self.external_action_in = tf.concat(
                [
                    tf.one_hot(self.action_holder[:, i], self.act_size[i])
                    for i in range(len(self.act_size))
                ],
                axis=1,
            )

            # This is total entropy over all branches
            self.entropy = -1 * tf.reduce_sum(self.all_log_probs, axis=1)

        # Extract the normalized logprobs for Barracuda
        self.normalized_logprobs = tf.identity(normalized_logprobs,
                                               name="action")

        # We kept the LSTMs at a different scope than the rest, so add them if they exist.
        self.policy_vars = self.get_vars(scope)
        if self.use_recurrent:
            self.policy_vars += self.get_vars("lstm")
Example #17
    def create_curiosity_encoders(self) -> Tuple[tf.Tensor, tf.Tensor]:
        """
        Creates state encoders for current and future observations.
        Used for implementation of Curiosity-driven Exploration by Self-supervised Prediction
        See https://arxiv.org/abs/1705.05363 for more details.
        :return: current and future state encoder tensors.
        """
        encoded_state_list = []
        encoded_next_state_list = []

        # Create input ops for next (t+1) visual observations.
        self.next_vector_in, self.next_visual_in = ModelUtils.create_input_placeholders(
            self.policy.behavior_spec.observation_shapes,
            name_prefix="curiosity_next_")

        if self.next_visual_in:
            visual_encoders = []
            next_visual_encoders = []
            for i, (vis_in, next_vis_in) in enumerate(
                    zip(self.policy.visual_in, self.next_visual_in)):
                # Create the encoder ops for current and next visual input.
                # Note that these encoders are siamese.
                encoded_visual = ModelUtils.create_visual_observation_encoder(
                    vis_in,
                    self.encoding_size,
                    ModelUtils.swish,
                    1,
                    "curiosity_stream_{}_visual_obs_encoder".format(i),
                    False,
                )

                encoded_next_visual = ModelUtils.create_visual_observation_encoder(
                    next_vis_in,
                    self.encoding_size,
                    ModelUtils.swish,
                    1,
                    "curiosity_stream_{}_visual_obs_encoder".format(i),
                    True,
                )
                visual_encoders.append(encoded_visual)
                next_visual_encoders.append(encoded_next_visual)

            hidden_visual = tf.concat(visual_encoders, axis=1)
            hidden_next_visual = tf.concat(next_visual_encoders, axis=1)
            encoded_state_list.append(hidden_visual)
            encoded_next_state_list.append(hidden_next_visual)

        if self.policy.vec_obs_size > 0:
            encoded_vector_obs = ModelUtils.create_vector_observation_encoder(
                self.policy.vector_in,
                self.encoding_size,
                ModelUtils.swish,
                2,
                "curiosity_vector_obs_encoder",
                False,
            )
            encoded_next_vector_obs = ModelUtils.create_vector_observation_encoder(
                self.next_vector_in,
                self.encoding_size,
                ModelUtils.swish,
                2,
                "curiosity_vector_obs_encoder",
                True,
            )
            encoded_state_list.append(encoded_vector_obs)
            encoded_next_state_list.append(encoded_next_vector_obs)
        encoded_state = tf.concat(encoded_state_list, axis=1)
        encoded_next_state = tf.concat(encoded_next_state_list, axis=1)
        return encoded_state, encoded_next_state
Example #18
    def make_inputs(self) -> None:
        """
        Creates the input layers for the discriminator
        """
        self.done_expert_holder = tf.placeholder(shape=[None], dtype=tf.float32)
        self.done_policy_holder = tf.placeholder(shape=[None], dtype=tf.float32)
        self.done_expert = tf.expand_dims(self.done_expert_holder, -1)
        self.done_policy = tf.expand_dims(self.done_policy_holder, -1)

        if self.policy.behavior_spec.is_action_continuous():
            action_length = self.policy.act_size[0]
            self.action_in_expert = tf.placeholder(
                shape=[None, action_length], dtype=tf.float32
            )
            self.expert_action = tf.identity(self.action_in_expert)
        else:
            action_length = len(self.policy.act_size)
            self.action_in_expert = tf.placeholder(
                shape=[None, action_length], dtype=tf.int32
            )
            self.expert_action = tf.concat(
                [
                    tf.one_hot(self.action_in_expert[:, i], act_size)
                    for i, act_size in enumerate(self.policy.act_size)
                ],
                axis=1,
            )

        encoded_policy_list = []
        encoded_expert_list = []

        (
            self.obs_in_expert,
            self.expert_visual_in,
        ) = ModelUtils.create_input_placeholders(
            self.policy.behavior_spec.observation_shapes, "gail_"
        )

        if self.policy.vec_obs_size > 0:
            if self.policy.normalize:
                encoded_expert_list.append(
                    ModelUtils.normalize_vector_obs(
                        self.obs_in_expert,
                        self.policy.running_mean,
                        self.policy.running_variance,
                        self.policy.normalization_steps,
                    )
                )
                encoded_policy_list.append(self.policy.processed_vector_in)
            else:
                encoded_expert_list.append(self.obs_in_expert)
                encoded_policy_list.append(self.policy.vector_in)

        if self.expert_visual_in:
            visual_policy_encoders = []
            visual_expert_encoders = []
            for i, (vis_in, exp_vis_in) in enumerate(
                zip(self.policy.visual_in, self.expert_visual_in)
            ):
                encoded_policy_visual = ModelUtils.create_visual_observation_encoder(
                    vis_in,
                    self.encoding_size,
                    ModelUtils.swish,
                    1,
                    f"gail_stream_{i}_visual_obs_encoder",
                    False,
                )

                encoded_expert_visual = ModelUtils.create_visual_observation_encoder(
                    exp_vis_in,
                    self.encoding_size,
                    ModelUtils.swish,
                    1,
                    f"gail_stream_{i}_visual_obs_encoder",
                    True,
                )
                visual_policy_encoders.append(encoded_policy_visual)
                visual_expert_encoders.append(encoded_expert_visual)
            hidden_policy_visual = tf.concat(visual_policy_encoders, axis=1)
            hidden_expert_visual = tf.concat(visual_expert_encoders, axis=1)
            encoded_policy_list.append(hidden_policy_visual)
            encoded_expert_list.append(hidden_expert_visual)

        self.encoded_expert = tf.concat(encoded_expert_list, axis=1)
        self.encoded_policy = tf.concat(encoded_policy_list, axis=1)
Example #19
    def __init__(
        self,
        policy,
        m_size=None,
        h_size=128,
        normalize=False,
        use_recurrent=False,
        num_layers=2,
        stream_names=None,
        vis_encode_type=EncoderType.SIMPLE,
    ):
        super().__init__(
            policy,
            m_size,
            h_size,
            normalize,
            use_recurrent,
            num_layers,
            stream_names,
            vis_encode_type,
        )
        with tf.variable_scope(TARGET_SCOPE):
            self.visual_in = ModelUtils.create_visual_input_placeholders(
                policy.brain.camera_resolutions
            )
            self.vector_in = ModelUtils.create_vector_input(policy.vec_obs_size)
            if self.policy.normalize:
                normalization_tensors = ModelUtils.create_normalizer(self.vector_in)
                self.update_normalization_op = normalization_tensors.update_op
                self.normalization_steps = normalization_tensors.steps
                self.running_mean = normalization_tensors.running_mean
                self.running_variance = normalization_tensors.running_variance
                self.processed_vector_in = ModelUtils.normalize_vector_obs(
                    self.vector_in,
                    self.running_mean,
                    self.running_variance,
                    self.normalization_steps,
                )
            else:
                self.processed_vector_in = self.vector_in
                self.update_normalization_op = None

            if self.policy.use_recurrent:
                self.memory_in = tf.placeholder(
                    shape=[None, m_size], dtype=tf.float32, name="target_recurrent_in"
                )
                self.value_memory_in = self.memory_in
            hidden_streams = ModelUtils.create_observation_streams(
                self.visual_in,
                self.processed_vector_in,
                1,
                self.h_size,
                0,
                vis_encode_type=vis_encode_type,
                stream_scopes=["critic/value/"],
            )
        if self.policy.use_continuous_act:
            self._create_cc_critic(hidden_streams[0], TARGET_SCOPE, create_qs=False)
        else:
            self._create_dc_critic(hidden_streams[0], TARGET_SCOPE, create_qs=False)
        if self.use_recurrent:
            self.memory_out = tf.concat(
                self.value_memory_out, axis=1
            )  # Needed for Barracuda to work
Example #20
    def create_curiosity_encoders(self) -> Tuple[tf.Tensor, tf.Tensor]:
        """
        Creates state encoders for current and future observations.
        Used for implementation of Curiosity-driven Exploration by Self-supervised Prediction
        See https://arxiv.org/abs/1705.05363 for more details.
        :return: current and future state encoder tensors.
        """
        encoded_state_list = []
        encoded_next_state_list = []

        if self.policy.vis_obs_size > 0:
            self.next_visual_in = []
            visual_encoders = []
            next_visual_encoders = []
            for i in range(self.policy.vis_obs_size):
                # Create input ops for next (t+1) visual observations.
                next_visual_input = ModelUtils.create_visual_input(
                    self.policy.brain.camera_resolutions[i],
                    name="curiosity_next_visual_observation_" + str(i),
                )
                self.next_visual_in.append(next_visual_input)

                # Create the encoder ops for current and next visual input.
                # Note that these encoders are siamese.
                encoded_visual = ModelUtils.create_visual_observation_encoder(
                    self.policy.visual_in[i],
                    self.encoding_size,
                    ModelUtils.swish,
                    1,
                    "curiosity_stream_{}_visual_obs_encoder".format(i),
                    False,
                )

                encoded_next_visual = ModelUtils.create_visual_observation_encoder(
                    self.next_visual_in[i],
                    self.encoding_size,
                    ModelUtils.swish,
                    1,
                    "curiosity_stream_{}_visual_obs_encoder".format(i),
                    True,
                )
                visual_encoders.append(encoded_visual)
                next_visual_encoders.append(encoded_next_visual)

            hidden_visual = tf.concat(visual_encoders, axis=1)
            hidden_next_visual = tf.concat(next_visual_encoders, axis=1)
            encoded_state_list.append(hidden_visual)
            encoded_next_state_list.append(hidden_next_visual)

        if self.policy.vec_obs_size > 0:
            # Create the encoder ops for current and next vector input.
            # Note that these encoders are siamese.
            # Create input op for next (t+1) vector observation.
            self.next_vector_in = tf.placeholder(
                shape=[None, self.policy.vec_obs_size],
                dtype=tf.float32,
                name="curiosity_next_vector_observation",
            )

            encoded_vector_obs = ModelUtils.create_vector_observation_encoder(
                self.policy.vector_in,
                self.encoding_size,
                ModelUtils.swish,
                2,
                "curiosity_vector_obs_encoder",
                False,
            )
            encoded_next_vector_obs = ModelUtils.create_vector_observation_encoder(
                self.next_vector_in,
                self.encoding_size,
                ModelUtils.swish,
                2,
                "curiosity_vector_obs_encoder",
                True,
            )
            encoded_state_list.append(encoded_vector_obs)
            encoded_next_state_list.append(encoded_next_vector_obs)

        encoded_state = tf.concat(encoded_state_list, axis=1)
        encoded_next_state = tf.concat(encoded_next_state_list, axis=1)
        return encoded_state, encoded_next_state
Example #21
    def create_observation_streams(
        visual_in: List[tf.Tensor],
        vector_in: tf.Tensor,
        num_streams: int,
        h_size: int,
        num_layers: int,
        vis_encode_type: EncoderType = EncoderType.SIMPLE,
        stream_scopes: List[str] = None,
    ) -> List[tf.Tensor]:
        """
        Creates encoding stream for observations.
        :param num_streams: Number of streams to create.
        :param h_size: Size of hidden linear layers in stream.
        :param num_layers: Number of hidden linear layers in stream.
        :param stream_scopes: List of strings (length == num_streams), which contains
            the scopes for each of the streams. None if all under the same TF scope.
        :return: List of encoded streams.
        """
        activation_fn = ModelUtils.swish
        vector_observation_input = vector_in

        final_hiddens = []
        for i in range(num_streams):
            # Pick the encoder function based on the EncoderType
            create_encoder_func = ModelUtils.get_encoder_for_type(
                vis_encode_type)

            visual_encoders = []
            hidden_state, hidden_visual = None, None
            _scope_add = stream_scopes[i] if stream_scopes else ""
            if len(visual_in) > 0:
                for j, vis_in in enumerate(visual_in):
                    ModelUtils._check_resolution_for_encoder(
                        vis_in, vis_encode_type)
                    encoded_visual = create_encoder_func(
                        vis_in,
                        h_size,
                        activation_fn,
                        num_layers,
                        f"{_scope_add}main_graph_{i}_encoder{j}",  # scope
                        False,  # reuse
                    )
                    visual_encoders.append(encoded_visual)
                hidden_visual = tf.concat(visual_encoders, axis=1)
            if vector_in.get_shape()[-1] > 0:
                # Don't encode non-existent or 0-shape inputs
                hidden_state = ModelUtils.create_vector_observation_encoder(
                    vector_observation_input,
                    h_size,
                    activation_fn,
                    num_layers,
                    scope=f"{_scope_add}main_graph_{i}",
                    reuse=False,
                )
            if hidden_state is not None and hidden_visual is not None:
                final_hidden = tf.concat([hidden_visual, hidden_state], axis=1)
            elif hidden_state is None and hidden_visual is not None:
                final_hidden = hidden_visual
            elif hidden_state is not None and hidden_visual is None:
                final_hidden = hidden_state
            else:
                raise Exception(
                    "No valid network configuration possible. "
                    "There are no states or observations in this brain")
            final_hiddens.append(final_hidden)
        return final_hiddens
Example #22
    def create_observation_streams(
        self,
        num_streams: int,
        h_size: int,
        num_layers: int,
        vis_encode_type: EncoderType = EncoderType.SIMPLE,
        stream_scopes: List[str] = None,
    ) -> List[tf.Tensor]:
        """
        Creates encoding stream for observations.
        :param num_streams: Number of streams to create.
        :param h_size: Size of hidden linear layers in stream.
        :param num_layers: Number of hidden linear layers in stream.
        :param stream_scopes: List of strings (length == num_streams), which contains
            the scopes for each of the streams. None if all under the same TF scope.
        :return: List of encoded streams.
        """
        brain = self.brain
        activation_fn = self.swish

        self.visual_in = []
        for i in range(brain.number_visual_observations):
            visual_input = self.create_visual_input(
                brain.camera_resolutions[i],
                name="visual_observation_" + str(i))
            self.visual_in.append(visual_input)
        vector_observation_input = self.create_vector_input()

        # Pick the encoder function based on the EncoderType
        create_encoder_func = LearningModel.create_visual_observation_encoder
        if vis_encode_type == EncoderType.RESNET:
            create_encoder_func = LearningModel.create_resnet_visual_observation_encoder
        elif vis_encode_type == EncoderType.NATURE_CNN:
            create_encoder_func = (
                LearningModel.create_nature_cnn_visual_observation_encoder)

        final_hiddens = []
        for i in range(num_streams):
            visual_encoders = []
            hidden_state, hidden_visual = None, None
            _scope_add = stream_scopes[i] if stream_scopes else ""
            if self.vis_obs_size > 0:
                for j in range(brain.number_visual_observations):
                    encoded_visual = create_encoder_func(
                        self.visual_in[j],
                        h_size,
                        activation_fn,
                        num_layers,
                        scope=f"{_scope_add}main_graph_{i}_encoder{j}",
                        reuse=False,
                    )
                    visual_encoders.append(encoded_visual)
                hidden_visual = tf.concat(visual_encoders, axis=1)
            if brain.vector_observation_space_size > 0:
                hidden_state = self.create_vector_observation_encoder(
                    vector_observation_input,
                    h_size,
                    activation_fn,
                    num_layers,
                    scope=f"{_scope_add}main_graph_{i}",
                    reuse=False,
                )
            if hidden_state is not None and hidden_visual is not None:
                final_hidden = tf.concat([hidden_visual, hidden_state], axis=1)
            elif hidden_state is None and hidden_visual is not None:
                final_hidden = hidden_visual
            elif hidden_state is not None and hidden_visual is None:
                final_hidden = hidden_state
            else:
                raise Exception(
                    "No valid network configuration possible. "
                    "There are no states or observations in this brain")
            final_hiddens.append(final_hidden)
        return final_hiddens
Example #23
    def create_cc_actor_critic(self, h_size: int, num_layers: int,
                               vis_encode_type: EncoderType) -> None:
        """
        Creates Continuous control actor-critic model.
        :param h_size: Size of hidden linear layers.
        :param num_layers: Number of hidden linear layers.
        :param vis_encode_type: Type of visual encoder to use if there is visual input.
        """
        hidden_streams = self.create_observation_streams(
            2, h_size, num_layers, vis_encode_type)

        if self.use_recurrent:
            self.memory_in = tf.placeholder(shape=[None, self.m_size],
                                            dtype=tf.float32,
                                            name="recurrent_in")
            _half_point = int(self.m_size / 2)
            hidden_policy, memory_policy_out = self.create_recurrent_encoder(
                hidden_streams[0],
                self.memory_in[:, :_half_point],
                self.sequence_length,
                name="lstm_policy",
            )

            hidden_value, memory_value_out = self.create_recurrent_encoder(
                hidden_streams[1],
                self.memory_in[:, _half_point:],
                self.sequence_length,
                name="lstm_value",
            )
            self.memory_out = tf.concat([memory_policy_out, memory_value_out],
                                        axis=1,
                                        name="recurrent_out")
        else:
            hidden_policy = hidden_streams[0]
            hidden_value = hidden_streams[1]

        mu = tf.layers.dense(
            hidden_policy,
            self.act_size[0],
            activation=None,
            kernel_initializer=LearningModel.scaled_init(0.01),
            reuse=tf.AUTO_REUSE,
        )

        self.log_sigma_sq = tf.get_variable(
            "log_sigma_squared",
            [self.act_size[0]],
            dtype=tf.float32,
            initializer=tf.zeros_initializer(),
        )

        sigma_sq = tf.exp(self.log_sigma_sq)

        self.epsilon = tf.placeholder(shape=[None, self.act_size[0]],
                                      dtype=tf.float32,
                                      name="epsilon")
        # Clip and scale output to ensure actions are always within [-1, 1] range.
        self.output_pre = mu + tf.sqrt(sigma_sq) * self.epsilon
        output_post = tf.clip_by_value(self.output_pre, -3, 3) / 3
        self.output = tf.identity(output_post, name="action")
        self.selected_actions = tf.stop_gradient(output_post)

        # Compute probability of model output.
        all_probs = (-0.5 * tf.square(tf.stop_gradient(self.output_pre) - mu) /
                     sigma_sq - 0.5 * tf.log(2.0 * np.pi) -
                     0.5 * self.log_sigma_sq)

        self.all_log_probs = tf.identity(all_probs, name="action_probs")

        self.entropy = 0.5 * tf.reduce_mean(
            tf.log(2 * np.pi * np.e) + self.log_sigma_sq)

        self.create_value_heads(self.stream_names, hidden_value)

        self.all_old_log_probs = tf.placeholder(shape=[None, self.act_size[0]],
                                                dtype=tf.float32,
                                                name="old_probabilities")

        # We keep these tensors the same name, but use new nodes to keep code parallelism with discrete control.
        self.log_probs = tf.reduce_sum((tf.identity(self.all_log_probs)),
                                       axis=1,
                                       keepdims=True)
        self.old_log_probs = tf.reduce_sum(
            (tf.identity(self.all_old_log_probs)), axis=1, keepdims=True)
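
The `all_probs` expression above is just the log density of a diagonal Gaussian evaluated at the sampled pre-clip action. A NumPy sketch with hypothetical values:

import numpy as np

mu = np.array([[0.1, -0.3]])
log_sigma_sq = np.array([0.0, 0.0])           # sigma^2 = 1 for both dimensions
sigma_sq = np.exp(log_sigma_sq)

epsilon = np.array([[0.5, -1.0]])             # externally sampled noise
output_pre = mu + np.sqrt(sigma_sq) * epsilon

# log N(x | mu, sigma^2) per action dimension, matching `all_probs` above.
all_log_probs = (
    -0.5 * (output_pre - mu) ** 2 / sigma_sq
    - 0.5 * np.log(2.0 * np.pi)
    - 0.5 * log_sigma_sq
)
print(all_log_probs, all_log_probs.sum(axis=1, keepdims=True))
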
Example #24
    def make_inputs(self) -> None:
        """
        Creates the input layers for the discriminator
        """
        self.done_expert_holder = tf.placeholder(shape=[None], dtype=tf.float32)
        self.done_policy_holder = tf.placeholder(shape=[None], dtype=tf.float32)
        self.done_expert = tf.expand_dims(self.done_expert_holder, -1)
        self.done_policy = tf.expand_dims(self.done_policy_holder, -1)

        if self.policy.brain.vector_action_space_type == "continuous":
            action_length = self.policy.act_size[0]
            self.action_in_expert = tf.placeholder(
                shape=[None, action_length], dtype=tf.float32
            )
            self.expert_action = tf.identity(self.action_in_expert)
        else:
            action_length = len(self.policy.act_size)
            self.action_in_expert = tf.placeholder(
                shape=[None, action_length], dtype=tf.int32
            )
            self.expert_action = tf.concat(
                [
                    tf.one_hot(self.action_in_expert[:, i], act_size)
                    for i, act_size in enumerate(self.policy.act_size)
                ],
                axis=1,
            )

        encoded_policy_list = []
        encoded_expert_list = []

        if self.policy.vec_obs_size > 0:
            self.obs_in_expert = tf.placeholder(
                shape=[None, self.policy.vec_obs_size], dtype=tf.float32
            )
            if self.policy.normalize:
                encoded_expert_list.append(
                    ModelUtils.normalize_vector_obs(
                        self.obs_in_expert,
                        self.policy.running_mean,
                        self.policy.running_variance,
                        self.policy.normalization_steps,
                    )
                )
                encoded_policy_list.append(self.policy.processed_vector_in)
            else:
                encoded_expert_list.append(self.obs_in_expert)
                encoded_policy_list.append(self.policy.vector_in)

        if self.policy.vis_obs_size > 0:
            self.expert_visual_in: List[tf.Tensor] = []
            visual_policy_encoders = []
            visual_expert_encoders = []
            for i in range(self.policy.vis_obs_size):
                # Create input ops for the expert's visual observations.
                visual_input = ModelUtils.create_visual_input(
                    self.policy.brain.camera_resolutions[i],
                    name="gail_visual_observation_" + str(i),
                )
                self.expert_visual_in.append(visual_input)

                encoded_policy_visual = ModelUtils.create_visual_observation_encoder(
                    self.policy.visual_in[i],
                    self.encoding_size,
                    ModelUtils.swish,
                    1,
                    "gail_stream_{}_visual_obs_encoder".format(i),
                    False,
                )

                encoded_expert_visual = ModelUtils.create_visual_observation_encoder(
                    self.expert_visual_in[i],
                    self.encoding_size,
                    ModelUtils.swish,
                    1,
                    "gail_stream_{}_visual_obs_encoder".format(i),
                    True,
                )
                visual_policy_encoders.append(encoded_policy_visual)
                visual_expert_encoders.append(encoded_expert_visual)
            hidden_policy_visual = tf.concat(visual_policy_encoders, axis=1)
            hidden_expert_visual = tf.concat(visual_expert_encoders, axis=1)
            encoded_policy_list.append(hidden_policy_visual)
            encoded_expert_list.append(hidden_expert_visual)

        self.encoded_expert = tf.concat(encoded_expert_list, axis=1)
        self.encoded_policy = tf.concat(encoded_policy_list, axis=1)
Example #25
    def _create_dc_actor(self, encoded: tf.Tensor) -> None:
        """
        Creates the discrete control actor (policy) network.
        :param encoded: Encoded observation tensor (output of the observation encoder).
        """
        if self.use_recurrent:
            self.prev_action = tf.placeholder(shape=[None,
                                                     len(self.act_size)],
                                              dtype=tf.int32,
                                              name="prev_action")
            prev_action_oh = tf.concat(
                [
                    tf.one_hot(self.prev_action[:, i], self.act_size[i])
                    for i in range(len(self.act_size))
                ],
                axis=1,
            )
            hidden_policy = tf.concat([encoded, prev_action_oh], axis=1)

            self.memory_in = tf.placeholder(shape=[None, self.m_size],
                                            dtype=tf.float32,
                                            name="recurrent_in")
            hidden_policy, memory_policy_out = ModelUtils.create_recurrent_encoder(
                hidden_policy,
                self.memory_in,
                self.sequence_length_ph,
                name="lstm_policy",
            )

            self.memory_out = tf.identity(memory_policy_out, "recurrent_out")
        else:
            hidden_policy = encoded

        policy_branches = []
        with tf.variable_scope("policy"):
            for size in self.act_size:
                policy_branches.append(
                    tf.layers.dense(
                        hidden_policy,
                        size,
                        activation=None,
                        use_bias=False,
                        kernel_initializer=ModelUtils.scaled_init(0.01),
                    ))

        raw_log_probs = tf.concat(policy_branches, axis=1, name="action_probs")

        self.action_masks = tf.placeholder(shape=[None,
                                                  sum(self.act_size)],
                                           dtype=tf.float32,
                                           name="action_masks")
        output, self.action_probs, normalized_logits = ModelUtils.create_discrete_action_masking_layer(
            raw_log_probs, self.action_masks, self.act_size)

        self.output = tf.identity(output)
        self.all_log_probs = tf.identity(normalized_logits, name="action")

        self.action_holder = tf.placeholder(shape=[None,
                                                   len(policy_branches)],
                                            dtype=tf.int32,
                                            name="action_holder")
        self.action_oh = tf.concat(
            [
                tf.one_hot(self.action_holder[:, i], self.act_size[i])
                for i in range(len(self.act_size))
            ],
            axis=1,
        )
        self.selected_actions = tf.stop_gradient(self.action_oh)

        action_idx = [0] + list(np.cumsum(self.act_size))

        self.entropy = tf.reduce_sum(
            (tf.stack(
                [
                    tf.nn.softmax_cross_entropy_with_logits_v2(
                        labels=tf.nn.softmax(
                            self.all_log_probs[:,
                                               action_idx[i]:action_idx[i +
                                                                        1]]),
                        logits=self.all_log_probs[:,
                                                  action_idx[i]:action_idx[i +
                                                                           1]],
                    ) for i in range(len(self.act_size))
                ],
                axis=1,
            )),
            axis=1,
        )

        self.log_probs = tf.reduce_sum(
            tf.stack(
                [
                    -tf.nn.softmax_cross_entropy_with_logits_v2(
                        labels=self.action_oh[:, action_idx[i]:action_idx[i + 1]],
                        logits=normalized_logits[:, action_idx[i]:action_idx[i + 1]],
                    )
                    for i in range(len(self.act_size))
                ],
                axis=1,
            ),
            axis=1,
            keepdims=True,
        )
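
For reference, here is a minimal sketch of the kind of branch-wise masking a helper like ModelUtils.create_discrete_action_masking_layer performs (an illustrative reimplementation under assumed semantics, not the library's code):

import numpy as np
import tensorflow as tf


def discrete_action_masking_sketch(raw_logits, action_masks, act_size, eps=1e-10):
    """Illustrative only: mask invalid actions per branch, renormalize, and sample."""
    branch_idx = [0] + list(np.cumsum(act_size))
    sampled_actions, masked_probs, normalized_log_probs = [], [], []
    for i in range(len(act_size)):
        logits = raw_logits[:, branch_idx[i]:branch_idx[i + 1]]
        mask = action_masks[:, branch_idx[i]:branch_idx[i + 1]]
        # Zero out disallowed actions in probability space, then renormalize the branch.
        probs = tf.nn.softmax(logits) * mask
        probs = probs / (tf.reduce_sum(probs, axis=1, keepdims=True) + eps)
        log_probs = tf.log(probs + eps)
        # Sample one action per branch from the masked distribution.
        sampled = tf.random.categorical(log_probs, 1)
        sampled_actions.append(tf.cast(sampled, tf.float32))
        masked_probs.append(probs)
        normalized_log_probs.append(log_probs)
    return (
        tf.concat(sampled_actions, axis=1),
        tf.concat(masked_probs, axis=1),
        tf.concat(normalized_log_probs, axis=1),
    )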
Example No. 26
0
    def __init__(
        self,
        brain,
        h_size=128,
        lr=1e-4,
        n_layers=2,
        m_size=128,
        normalize=False,
        use_recurrent=False,
        seed=0,
    ):
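        """
        Behavioral-cloning model: a supervised policy trained to imitate teacher actions.
        :param brain: Brain parameters describing the observation and action spaces.
        :param h_size: Size of the hidden linear layers.
        :param lr: Learning rate for the Adam optimizer.
        :param n_layers: Number of hidden linear layers.
        :param m_size: Size of the recurrent memory (if use_recurrent).
        :param normalize: Whether to normalize vector observations.
        :param use_recurrent: Whether to add an LSTM over the encoded observations.
        :param seed: Random seed used for parameter initialization.
        """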
        LearningModel.__init__(self, m_size, normalize, use_recurrent, brain, seed)
        num_streams = 1
        hidden_streams = self.create_observation_streams(num_streams, h_size, n_layers)
        hidden = hidden_streams[0]
        self.dropout_rate = tf.placeholder(
            dtype=tf.float32, shape=[], name="dropout_rate"
        )
        hidden_reg = tf.layers.dropout(hidden, self.dropout_rate)
        if self.use_recurrent:
            tf.Variable(
                self.m_size, name="memory_size", trainable=False, dtype=tf.int32
            )
            self.memory_in = tf.placeholder(
                shape=[None, self.m_size], dtype=tf.float32, name="recurrent_in"
            )
            hidden_reg, self.memory_out = self.create_recurrent_encoder(
                hidden_reg, self.memory_in, self.sequence_length
            )
            self.memory_out = tf.identity(self.memory_out, name="recurrent_out")

        if brain.vector_action_space_type == "discrete":
            policy_branches = []
            for size in self.act_size:
                policy_branches.append(
                    tf.layers.dense(
                        hidden_reg,
                        size,
                        activation=None,
                        use_bias=False,
                        kernel_initializer=tf.initializers.variance_scaling(0.01),
                    )
                )
            self.action_probs = tf.concat(
                [tf.nn.softmax(branch) for branch in policy_branches],
                axis=1,
                name="action_probs",
            )
            self.action_masks = tf.placeholder(
                shape=[None, sum(self.act_size)], dtype=tf.float32, name="action_masks"
            )
            self.sample_action_float, _, normalized_logits = self.create_discrete_action_masking_layer(
                tf.concat(policy_branches, axis=1), self.action_masks, self.act_size
            )
            tf.identity(normalized_logits, name="action")
            self.sample_action = tf.cast(self.sample_action_float, tf.int32)
            self.true_action = tf.placeholder(
                shape=[None, len(policy_branches)],
                dtype=tf.int32,
                name="teacher_action",
            )
            self.action_oh = tf.concat(
                [
                    tf.one_hot(self.true_action[:, i], self.act_size[i])
                    for i in range(len(self.act_size))
                ],
                axis=1,
            )
            self.loss = tf.reduce_sum(
                -tf.log(self.action_probs + 1e-10) * self.action_oh
            )
            self.action_percent = tf.reduce_mean(
                tf.cast(
                    tf.equal(
                        tf.cast(tf.argmax(self.action_probs, axis=1), tf.int32),
                        self.sample_action,
                    ),
                    tf.float32,
                )
            )
        else:
            self.policy = tf.layers.dense(
                hidden_reg,
                self.act_size[0],
                activation=None,
                use_bias=False,
                name="pre_action",
                kernel_initializer=tf.initializers.variance_scaling(0.01),
            )
            self.clipped_sample_action = tf.clip_by_value(self.policy, -1, 1)
            self.sample_action = tf.identity(self.clipped_sample_action, name="action")
            self.true_action = tf.placeholder(
                shape=[None, self.act_size[0]], dtype=tf.float32, name="teacher_action"
            )
            self.clipped_true_action = tf.clip_by_value(self.true_action, -1, 1)
            self.loss = tf.reduce_sum(
                tf.squared_difference(self.clipped_true_action, self.sample_action)
            )

        optimizer = tf.train.AdamOptimizer(learning_rate=lr)
        self.update = optimizer.minimize(self.loss)
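
A hypothetical training step for the cloning model above; model.vector_in and the batch shapes are assumptions about the surrounding LearningModel base class, not taken from this snippet, and recurrent or visual variants would need extra feeds (memory_in, sequence_length, visual_in):

def run_cloning_update(sess, model, obs_batch, teacher_actions, dropout_rate=0.2):
    """One supervised update of the cloning model on a batch of demonstrations."""
    feed_dict = {
        model.vector_in: obs_batch,          # assumed vector-observation placeholder
        model.true_action: teacher_actions,  # teacher/demonstration actions
        model.dropout_rate: dropout_rate,
    }
    loss, _ = sess.run([model.loss, model.update], feed_dict=feed_dict)
    return loss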
Example No. 27
0
    def create_dc_actor_critic(self, h_size: int, num_layers: int,
                               vis_encode_type: EncoderType) -> None:
        """
        Creates Discrete control actor-critic model.
        :param h_size: Size of hidden linear layers.
        :param num_layers: Number of hidden linear layers.
        """
        hidden_streams = self.create_observation_streams(
            1, h_size, num_layers, vis_encode_type)
        hidden = hidden_streams[0]

        if self.use_recurrent:
            self.prev_action = tf.placeholder(shape=[None,
                                                     len(self.act_size)],
                                              dtype=tf.int32,
                                              name="prev_action")
            prev_action_oh = tf.concat(
                [
                    tf.one_hot(self.prev_action[:, i], self.act_size[i])
                    for i in range(len(self.act_size))
                ],
                axis=1,
            )
            hidden = tf.concat([hidden, prev_action_oh], axis=1)

            self.memory_in = tf.placeholder(shape=[None, self.m_size],
                                            dtype=tf.float32,
                                            name="recurrent_in")
            hidden, memory_out = self.create_recurrent_encoder(
                hidden, self.memory_in, self.sequence_length)
            self.memory_out = tf.identity(memory_out, name="recurrent_out")

        policy_branches = []
        for size in self.act_size:
            policy_branches.append(
                tf.layers.dense(
                    hidden,
                    size,
                    activation=None,
                    use_bias=False,
                    kernel_initializer=LearningModel.scaled_init(0.01),
                ))

        self.all_log_probs = tf.concat(policy_branches,
                                       axis=1,
                                       name="action_probs")

        self.action_masks = tf.placeholder(shape=[None,
                                                  sum(self.act_size)],
                                           dtype=tf.float32,
                                           name="action_masks")
        output, _, normalized_logits = self.create_discrete_action_masking_layer(
            self.all_log_probs, self.action_masks, self.act_size)

        self.output = tf.identity(output)
        self.normalized_logits = tf.identity(normalized_logits, name="action")

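        # Critic: one value head per reward stream, sharing the policy's hidden encoding.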
        self.create_value_heads(self.stream_names, hidden)

        self.action_holder = tf.placeholder(shape=[None,
                                                   len(policy_branches)],
                                            dtype=tf.int32,
                                            name="action_holder")
        self.action_oh = tf.concat(
            [
                tf.one_hot(self.action_holder[:, i], self.act_size[i])
                for i in range(len(self.act_size))
            ],
            axis=1,
        )
        self.selected_actions = tf.stop_gradient(self.action_oh)

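        # Logits produced by the policy that collected the data, fed in at update time
        # to recover old_log_probs for the policy-ratio term of the update.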
        self.all_old_log_probs = tf.placeholder(
            shape=[None, sum(self.act_size)],
            dtype=tf.float32,
            name="old_probabilities")
        _, _, old_normalized_logits = self.create_discrete_action_masking_layer(
            self.all_old_log_probs, self.action_masks, self.act_size)

        action_idx = [0] + list(np.cumsum(self.act_size))

        self.entropy = tf.reduce_sum(
            tf.stack(
                [
                    tf.nn.softmax_cross_entropy_with_logits_v2(
                        labels=tf.nn.softmax(
                            self.all_log_probs[:, action_idx[i]:action_idx[i + 1]]
                        ),
                        logits=self.all_log_probs[:, action_idx[i]:action_idx[i + 1]],
                    )
                    for i in range(len(self.act_size))
                ],
                axis=1,
            ),
            axis=1,
        )

        self.log_probs = tf.reduce_sum(
            tf.stack(
                [
                    -tf.nn.softmax_cross_entropy_with_logits_v2(
                        labels=self.action_oh[:, action_idx[i]:action_idx[i + 1]],
                        logits=normalized_logits[:, action_idx[i]:action_idx[i + 1]],
                    )
                    for i in range(len(self.act_size))
                ],
                axis=1,
            ),
            axis=1,
            keepdims=True,
        )
        self.old_log_probs = tf.reduce_sum(
            tf.stack(
                [
                    -tf.nn.softmax_cross_entropy_with_logits_v2(
                        labels=self.action_oh[:, action_idx[i]:action_idx[i + 1]],
                        logits=old_normalized_logits[:, action_idx[i]:action_idx[i + 1]],
                    )
                    for i in range(len(self.act_size))
                ],
                axis=1,
            ),
            axis=1,
            keepdims=True,
        )
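
The log_probs and old_log_probs tensors above typically feed a clipped surrogate policy loss; a minimal sketch under that assumption (not code from this class):

import tensorflow as tf


def clipped_surrogate_loss(log_probs, old_log_probs, advantages, epsilon=0.2):
    """Clipped policy-gradient surrogate built from the two log-probability tensors.

    log_probs, old_log_probs: [batch, 1] summed per-branch log-probabilities.
    advantages: [batch, 1] advantage estimates.
    """
    ratio = tf.exp(log_probs - old_log_probs)
    unclipped = ratio * advantages
    clipped = tf.clip_by_value(ratio, 1.0 - epsilon, 1.0 + epsilon) * advantages
    # The objective is maximized, so the loss is its negation.
    return -tf.reduce_mean(tf.minimum(unclipped, clipped))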