Exemple #1
0
 def __init__(
     self, m_size, normalize, use_recurrent, brain, seed, stream_names=None
 ):
     tf.set_random_seed(seed)
     self.brain = brain
     self.vector_in = None
     self.global_step, self.increment_step, self.steps_to_increment = (
         self.create_global_steps()
     )
     self.visual_in = []
     self.batch_size = tf.placeholder(shape=None, dtype=tf.int32, name="batch_size")
     self.sequence_length = tf.placeholder(
         shape=None, dtype=tf.int32, name="sequence_length"
     )
     self.mask_input = tf.placeholder(shape=[None], dtype=tf.float32, name="masks")
     self.mask = tf.cast(self.mask_input, tf.int32)
     self.stream_names = stream_names or []
     self.use_recurrent = use_recurrent
     if self.use_recurrent:
         self.m_size = m_size
     else:
         self.m_size = 0
     self.normalize = normalize
     self.act_size = brain.vector_action_space_size
     self.vec_obs_size = brain.vector_observation_space_size
     self.vis_obs_size = brain.number_visual_observations
     tf.Variable(
         int(brain.vector_action_space_type == "continuous"),
         name="is_continuous_control",
         trainable=False,
         dtype=tf.int32,
     )
     tf.Variable(
         self._version_number_,
         name="version_number",
         trainable=False,
         dtype=tf.int32,
     )
     tf.Variable(self.m_size, name="memory_size", trainable=False, dtype=tf.int32)
     if brain.vector_action_space_type == "continuous":
         tf.Variable(
             self.act_size[0],
             name="action_output_shape",
             trainable=False,
             dtype=tf.int32,
         )
     else:
         tf.Variable(
             sum(self.act_size),
             name="action_output_shape",
             trainable=False,
             dtype=tf.int32,
         )
     self.value_heads: Dict[str, tf.Tensor] = {}
     self.normalization_steps: Optional[tf.Variable] = None
     self.running_mean: Optional[tf.Variable] = None
     self.running_variance: Optional[tf.Variable] = None
     self.update_normalization: Optional[tf.Operation] = None
     self.value: Optional[tf.Tensor] = None
Exemple #2
0
    def _create_dc_actor(self, encoded: tf.Tensor) -> None:
        """
        Creates Discrete control actor-critic model.
        :param h_size: Size of hidden linear layers.
        :param num_layers: Number of hidden linear layers.
        :param vis_encode_type: Type of visual encoder to use if visual input.
        """
        if self.use_recurrent:
            self.prev_action = tf.placeholder(shape=[None,
                                                     len(self.act_size)],
                                              dtype=tf.int32,
                                              name="prev_action")
            prev_action_oh = tf.concat(
                [
                    tf.one_hot(self.prev_action[:, i], self.act_size[i])
                    for i in range(len(self.act_size))
                ],
                axis=1,
            )
            hidden_policy = tf.concat([encoded, prev_action_oh], axis=1)

            self.memory_in = tf.placeholder(shape=[None, self.m_size],
                                            dtype=tf.float32,
                                            name="recurrent_in")
            hidden_policy, memory_policy_out = ModelUtils.create_recurrent_encoder(
                hidden_policy,
                self.memory_in,
                self.sequence_length_ph,
                name="lstm_policy",
            )

            self.memory_out = tf.identity(memory_policy_out, "recurrent_out")
        else:
            hidden_policy = encoded

        self.action_masks = tf.placeholder(shape=[None,
                                                  sum(self.act_size)],
                                           dtype=tf.float32,
                                           name="action_masks")

        with tf.variable_scope("policy"):
            distribution = MultiCategoricalDistribution(
                hidden_policy, self.act_size, self.action_masks)
        # It's important that we are able to feed_dict a value into this tensor to get the
        # right one-hot encoding, so we can't do identity on it.
        self.output = distribution.sample
        self.all_log_probs = tf.identity(distribution.log_probs, name="action")
        self.selected_actions = tf.stop_gradient(
            distribution.sample_onehot)  # In discrete, these are onehot
        self.entropy = distribution.entropy
        self.total_log_probs = distribution.total_log_probs
Exemple #3
0
 def create_input_placeholders(
         observation_shapes: List[Tuple],
         name_prefix: str = "") -> Tuple[tf.Tensor, List[tf.Tensor]]:
     """
     Creates input placeholders for visual inputs.
     :param observation_shapes: A List of tuples that specify the resolutions
         of the input observations. Tuples for now are restricted to 1D (vector) or 3D (Tensor)
     :param name_prefix: A name prefix to add to the placeholder names. This is used so that there
         is no conflict when creating multiple placeholder sets.
     :returns: A List of Tensorflow placeholders where the input iamges should be fed.
     """
     visual_in: List[tf.Tensor] = []
     vector_in_size = 0
     for i, dimension in enumerate(observation_shapes):
         if len(dimension) == 3:
             _res = Tensor3DShape(height=dimension[0],
                                  width=dimension[1],
                                  num_channels=dimension[2])
             visual_input = ModelUtils.create_visual_input(
                 _res, name=name_prefix + "visual_observation_" + str(i))
             visual_in.append(visual_input)
         elif len(dimension) == 1:
             vector_in_size += dimension[0]
         else:
             raise UnityTrainerException(
                 f"Unsupported shape of {dimension} for observation {i}")
     vector_in = tf.placeholder(
         shape=[None, vector_in_size],
         dtype=tf.float32,
         name=name_prefix + "vector_observation",
     )
     return vector_in, visual_in
Exemple #4
0
 def create_network(self) -> None:
     """
     Helper for creating the intrinsic reward nodes
     """
     if self.use_vail:
         self.z_sigma = tf.get_variable(
             "gail_sigma_vail",
             self.z_size,
             dtype=tf.float32,
             initializer=tf.ones_initializer(),
         )
         self.z_sigma_sq = self.z_sigma * self.z_sigma
         self.z_log_sigma_sq = tf.log(self.z_sigma_sq + EPSILON)
         self.use_noise = tf.placeholder(
             shape=[1], dtype=tf.float32, name="gail_NoiseLevel"
         )
     self.expert_estimate, self.z_mean_expert, _ = self.create_encoder(
         self.encoded_expert, self.expert_action, self.done_expert, reuse=False
     )
     self.policy_estimate, self.z_mean_policy, _ = self.create_encoder(
         self.encoded_policy,
         self.policy.selected_actions,
         self.done_policy,
         reuse=True,
     )
     self.mean_policy_estimate = tf.reduce_mean(self.policy_estimate)
     self.mean_expert_estimate = tf.reduce_mean(self.expert_estimate)
     self.discriminator_score = tf.reshape(
         self.policy_estimate, [-1], name="gail_reward"
     )
     self.intrinsic_reward = -tf.log(1.0 - self.discriminator_score + EPSILON)
Exemple #5
0
    def _create_inputs_and_outputs(self) -> None:
        """
        Assign the higher-level SACModel's inputs and outputs to those of its policy or
        target network.
        """
        self.vector_in = self.policy.vector_in
        self.visual_in = self.policy.visual_in
        self.next_vector_in = self.target_network.vector_in
        self.next_visual_in = self.target_network.visual_in
        self.sequence_length_ph = self.policy.sequence_length_ph
        self.next_sequence_length_ph = self.target_network.sequence_length_ph
        if not self.policy.use_continuous_act:
            self.action_masks = self.policy_network.action_masks
        else:
            self.output_pre = self.policy_network.output_pre

        # Don't use value estimate during inference.
        self.value = tf.identity(self.policy_network.value,
                                 name="value_estimate_unused")
        self.value_heads = self.policy_network.value_heads
        self.dones_holder = tf.placeholder(shape=[None],
                                           dtype=tf.float32,
                                           name="dones_holder")

        if self.policy.use_recurrent:
            self.memory_in = self.policy_network.memory_in
            self.memory_out = self.policy_network.memory_out
            if not self.policy.use_continuous_act:
                self.prev_action = self.policy_network.prev_action
            self.next_memory_in = self.target_network.memory_in
Exemple #6
0
 def init_load_weights(self):
     with self.graph.as_default():
         _vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES)
         values = [v.eval(session=self.sess) for v in _vars]
         for var, value in zip(_vars, values):
             assign_ph = tf.placeholder(var.dtype, shape=value.shape)
             self.assign_phs.append(assign_ph)
             self.assign_ops.append(tf.assign(var, assign_ph))
Exemple #7
0
    def _create_dc_critic(self, h_size: int, num_layers: int,
                          vis_encode_type: EncoderType) -> None:
        """
        Creates Discrete control critic (value) network.
        :param h_size: Size of hidden linear layers.
        :param num_layers: Number of hidden linear layers.
        :param vis_encode_type: The type of visual encoder to use.
        """
        hidden_stream = ModelUtils.create_observation_streams(
            self.policy.visual_in,
            self.policy.processed_vector_in,
            1,
            h_size,
            num_layers,
            vis_encode_type,
        )[0]

        if self.policy.use_recurrent:
            hidden_value, memory_value_out = ModelUtils.create_recurrent_encoder(
                hidden_stream,
                self.memory_in,
                self.policy.sequence_length_ph,
                name="lstm_value",
            )
            self.memory_out = memory_value_out
        else:
            hidden_value = hidden_stream

        self.value_heads, self.value = ModelUtils.create_value_heads(
            self.stream_names, hidden_value)

        self.all_old_log_probs = tf.placeholder(
            shape=[None, sum(self.policy.act_size)],
            dtype=tf.float32,
            name="old_probabilities",
        )
        _, _, old_normalized_logits = ModelUtils.create_discrete_action_masking_layer(
            self.all_old_log_probs, self.policy.action_masks,
            self.policy.act_size)

        action_idx = [0] + list(np.cumsum(self.policy.act_size))

        self.old_log_probs = tf.reduce_sum(
            (tf.stack(
                [
                    -tf.nn.softmax_cross_entropy_with_logits_v2(
                        labels=self.policy.
                        selected_actions[:, action_idx[i]:action_idx[i + 1]],
                        logits=old_normalized_logits[:, action_idx[i]:
                                                     action_idx[i + 1]],
                    ) for i in range(len(self.policy.act_size))
                ],
                axis=1,
            )),
            axis=1,
            keepdims=True,
        )
Exemple #8
0
    def create_inputs_and_outputs(self):
        """
        Assign the higher-level SACModel's inputs and outputs to those of its policy or
        target network.
        """
        self.vector_in = self.policy_network.vector_in
        self.visual_in = self.policy_network.visual_in
        self.next_vector_in = self.target_network.vector_in
        self.next_visual_in = self.target_network.visual_in
        self.action_holder = self.policy_network.action_holder
        self.sequence_length = self.policy_network.sequence_length
        self.next_sequence_length = self.target_network.sequence_length
        if self.brain.vector_action_space_type == "discrete":
            self.action_masks = self.policy_network.action_masks
        else:
            self.output_pre = self.policy_network.output_pre

        self.output = self.policy_network.output
        # Don't use value estimate during inference. TODO: Check why PPO uses value_estimate in inference.
        self.value = tf.identity(
            self.policy_network.value, name="value_estimate_unused"
        )
        self.value_heads = self.policy_network.value_heads
        self.all_log_probs = self.policy_network.all_log_probs
        self.dones_holder = tf.placeholder(
            shape=[None], dtype=tf.float32, name="dones_holder"
        )
        # This is just a dummy to get pretraining to work. PPO has this but SAC doesn't.
        # TODO: Proper input and output specs for models
        self.epsilon = tf.placeholder(
            shape=[None, self.act_size[0]], dtype=tf.float32, name="epsilon"
        )
        if self.use_recurrent:
            self.memory_in = self.policy_network.memory_in
            self.memory_out = self.policy_network.memory_out

            # For Barracuda
            self.inference_memory_out = tf.identity(
                self.policy_network.policy_memory_out, name="recurrent_out"
            )

            if self.brain.vector_action_space_type == "discrete":
                self.prev_action = self.policy_network.prev_action
            self.next_memory_in = self.target_network.memory_in
Exemple #9
0
    def __init__(
        self,
        policy=None,
        m_size=None,
        h_size=128,
        normalize=False,
        use_recurrent=False,
        num_layers=2,
        stream_names=None,
        vis_encode_type=EncoderType.SIMPLE,
    ):
        self.normalize = normalize
        self.use_recurrent = use_recurrent
        self.num_layers = num_layers
        self.stream_names = stream_names
        self.h_size = h_size
        self.activ_fn = ModelUtils.swish

        self.sequence_length_ph = tf.placeholder(
            shape=None, dtype=tf.int32, name="sac_sequence_length"
        )

        self.policy_memory_in: Optional[tf.Tensor] = None
        self.policy_memory_out: Optional[tf.Tensor] = None
        self.value_memory_in: Optional[tf.Tensor] = None
        self.value_memory_out: Optional[tf.Tensor] = None
        self.q1: Optional[tf.Tensor] = None
        self.q2: Optional[tf.Tensor] = None
        self.q1_p: Optional[tf.Tensor] = None
        self.q2_p: Optional[tf.Tensor] = None
        self.q1_memory_in: Optional[tf.Tensor] = None
        self.q2_memory_in: Optional[tf.Tensor] = None
        self.q1_memory_out: Optional[tf.Tensor] = None
        self.q2_memory_out: Optional[tf.Tensor] = None
        self.prev_action: Optional[tf.Tensor] = None
        self.action_masks: Optional[tf.Tensor] = None
        self.external_action_in: Optional[tf.Tensor] = None
        self.log_sigma_sq: Optional[tf.Tensor] = None
        self.entropy: Optional[tf.Tensor] = None
        self.deterministic_output: Optional[tf.Tensor] = None
        self.normalized_logprobs: Optional[tf.Tensor] = None
        self.action_probs: Optional[tf.Tensor] = None
        self.output_oh: Optional[tf.Tensor] = None
        self.output_pre: Optional[tf.Tensor] = None

        self.value_vars = None
        self.q_vars = None
        self.critic_vars = None
        self.policy_vars = None

        self.q1_heads: Dict[str, tf.Tensor] = None
        self.q2_heads: Dict[str, tf.Tensor] = None
        self.q1_pheads: Dict[str, tf.Tensor] = None
        self.q2_pheads: Dict[str, tf.Tensor] = None

        self.policy = policy
Exemple #10
0
    def _create_cc_actor(
        self,
        encoded: tf.Tensor,
        tanh_squash: bool = False,
        reparameterize: bool = False,
        condition_sigma_on_obs: bool = True,
    ) -> None:
        """
        Creates Continuous control actor-critic model.
        :param h_size: Size of hidden linear layers.
        :param num_layers: Number of hidden linear layers.
        :param vis_encode_type: Type of visual encoder to use if visual input.
        :param tanh_squash: Whether to use a tanh function, or a clipped output.
        :param reparameterize: Whether we are using the resampling trick to update the policy.
        """
        if self.use_recurrent:
            self.memory_in = tf.placeholder(shape=[None, self.m_size],
                                            dtype=tf.float32,
                                            name="recurrent_in")
            hidden_policy, memory_policy_out = ModelUtils.create_recurrent_encoder(
                encoded,
                self.memory_in,
                self.sequence_length_ph,
                name="lstm_policy")

            self.memory_out = tf.identity(memory_policy_out,
                                          name="recurrent_out")
        else:
            hidden_policy = encoded

        with tf.variable_scope("policy"):
            distribution = GaussianDistribution(
                hidden_policy,
                self.act_size,
                reparameterize=reparameterize,
                tanh_squash=tanh_squash,
                condition_sigma=condition_sigma_on_obs,
            )

        if tanh_squash:
            self.output_pre = distribution.sample
            self.output = tf.identity(self.output_pre, name="action")
        else:
            self.output_pre = distribution.sample
            # Clip and scale output to ensure actions are always within [-1, 1] range.
            output_post = tf.clip_by_value(self.output_pre, -3, 3) / 3
            self.output = tf.identity(output_post, name="action")

        self.selected_actions = tf.stop_gradient(self.output)

        self.all_log_probs = tf.identity(distribution.log_probs,
                                         name="action_probs")
        self.entropy = distribution.entropy

        # We keep these tensors the same name, but use new nodes to keep code parallelism with discrete control.
        self.total_log_probs = distribution.total_log_probs
Exemple #11
0
 def create_global_steps():
     """Creates TF ops to track and increment global training step."""
     global_step = tf.Variable(
         0, name="global_step", trainable=False, dtype=tf.int32
     )
     steps_to_increment = tf.placeholder(
         shape=[], dtype=tf.int32, name="steps_to_increment"
     )
     increment_step = tf.assign(global_step, tf.add(global_step, steps_to_increment))
     return global_step, increment_step, steps_to_increment
Exemple #12
0
 def create_vector_input(vec_obs_size: int,
                         name: str = "vector_observation") -> tf.Tensor:
     """
     Creates ops for vector observation input.
     :param vec_obs_size: Size of stacked vector observation.
     :param name: Name of the placeholder op.
     :return: Placeholder for vector observations.
     """
     vector_in = tf.placeholder(shape=[None, vec_obs_size],
                                dtype=tf.float32,
                                name=name)
     return vector_in
Exemple #13
0
    def make_inputs(self) -> None:
        """
        Creates the input layers for the discriminator
        """
        self.done_expert = tf.placeholder(shape=[None, 1], dtype=tf.float32)
        self.done_policy = tf.placeholder(shape=[None, 1], dtype=tf.float32)

        if self.policy.behavior_spec.action_spec.is_continuous():
            action_length = self.policy.act_size[0]
            self.action_in_expert = tf.placeholder(shape=[None, action_length],
                                                   dtype=tf.float32)
            self.expert_action = tf.identity(self.action_in_expert)
        else:
            action_length = len(self.policy.act_size)
            self.action_in_expert = tf.placeholder(shape=[None, action_length],
                                                   dtype=tf.int32)
            self.expert_action = tf.concat(
                [
                    tf.one_hot(self.action_in_expert[:, i], act_size)
                    for i, act_size in enumerate(self.policy.act_size)
                ],
                axis=1,
            )
Exemple #14
0
 def create_vector_input(self, name="vector_observation"):
     """
     Creates ops for vector observation input.
     :param name: Name of the placeholder op.
     :param vec_obs_size: Size of stacked vector observation.
     :return:
     """
     self.vector_in = tf.placeholder(shape=[None, self.vec_obs_size],
                                     dtype=tf.float32,
                                     name=name)
     if self.normalize:
         self.create_normalizer(self.vector_in)
         return self.normalize_vector_obs(self.vector_in)
     else:
         return self.vector_in
Exemple #15
0
    def create_visual_input(camera_parameters: Tensor3DShape, name: str) -> tf.Tensor:
        """
        Creates image input op.
        :param camera_parameters: Parameters for visual observation.
        :param name: Desired name of input op.
        :return: input op.
        """
        o_size_h = camera_parameters.height
        o_size_w = camera_parameters.width
        c_channels = camera_parameters.num_channels

        visual_in = tf.placeholder(
            shape=[None, o_size_h, o_size_w, c_channels], dtype=tf.float32, name=name
        )
        return visual_in
Exemple #16
0
 def __init__(
     self,
     brain,
     m_size=None,
     h_size=128,
     normalize=False,
     use_recurrent=False,
     num_layers=2,
     stream_names=None,
     seed=0,
     vis_encode_type=EncoderType.SIMPLE,
 ):
     super().__init__(
         brain,
         m_size,
         h_size,
         normalize,
         use_recurrent,
         num_layers,
         stream_names,
         seed,
         vis_encode_type,
     )
     if self.use_recurrent:
         self.memory_in = tf.placeholder(shape=[None, self.m_size],
                                         dtype=tf.float32,
                                         name="recurrent_in")
         self.value_memory_in = self.memory_in
     with tf.variable_scope(TARGET_SCOPE):
         hidden_streams = self.create_observation_streams(
             1,
             self.h_size,
             0,
             vis_encode_type=vis_encode_type,
             stream_scopes=["critic/value/"],
         )
     if brain.vector_action_space_type == "continuous":
         self.create_cc_critic(hidden_streams[0],
                               TARGET_SCOPE,
                               create_qs=False)
     else:
         self.create_dc_critic(hidden_streams[0],
                               TARGET_SCOPE,
                               create_qs=False)
     if self.use_recurrent:
         self.memory_out = tf.concat(self.value_memory_out,
                                     axis=1)  # Needed for Barracuda to work
Exemple #17
0
    def _create_memory_ins(self, m_size):
        """
        Creates the memory input placeholders for LSTM.
        :param m_size: the total size of the memory.
        """
        self.memory_in = tf.placeholder(
            shape=[None, m_size * 3], dtype=tf.float32, name="value_recurrent_in"
        )

        # Re-break-up for each network
        num_mems = 3
        input_size = self.memory_in.get_shape().as_list()[1]
        mem_ins = []
        for i in range(num_mems):
            _start = input_size // num_mems * i
            _end = input_size // num_mems * (i + 1)
            mem_ins.append(self.memory_in[:, _start:_end])
        self.value_memory_in = mem_ins[0]
        self.q1_memory_in = mem_ins[1]
        self.q2_memory_in = mem_ins[2]
Exemple #18
0
    def _create_cc_critic(
        self, h_size: int, num_layers: int, vis_encode_type: EncoderType
    ) -> None:
        """
        Creates Continuous control critic (value) network.
        :param h_size: Size of hidden linear layers.
        :param num_layers: Number of hidden linear layers.
        :param vis_encode_type: The type of visual encoder to use.
        """
        hidden_stream = ModelUtils.create_observation_streams(
            self.policy.visual_in,
            self.policy.processed_vector_in,
            1,
            h_size,
            num_layers,
            vis_encode_type,
        )[0]

        if self.policy.use_recurrent:
            hidden_value, memory_value_out = ModelUtils.create_recurrent_encoder(
                hidden_stream,
                self.memory_in,
                self.policy.sequence_length_ph,
                name="lstm_value",
            )
            self.memory_out = memory_value_out
        else:
            hidden_value = hidden_stream

        self.value_heads, self.value = ModelUtils.create_value_heads(
            self.stream_names, hidden_value
        )
        self.all_old_log_probs = tf.placeholder(
            shape=[None, sum(self.policy.act_size)],
            dtype=tf.float32,
            name="old_probabilities",
        )

        self.old_log_probs = tf.reduce_sum(
            (tf.identity(self.all_old_log_probs)), axis=1, keepdims=True
        )
Exemple #19
0
    def create_memory_ins(self, m_size):
        """
        Creates the memory input placeholders for LSTM.
        :param m_size: the total size of the memory.
        """
        # Create the Policy input separate from the rest
        # This is so in inference we only have to run the Policy network.
        # Barracuda will grab the recurrent_in and recurrent_out named tensors.
        self.inference_memory_in = tf.placeholder(
            shape=[None, m_size // 4], dtype=tf.float32, name="recurrent_in"
        )
        # We assume m_size is divisible by 4
        # Create the non-Policy inputs
        # Use a default placeholder here so nothing has to be provided during
        # Barracuda inference. Note that the default value is just the tiled input
        # for the policy, which is thrown away.
        three_fourths_m_size = m_size * 3 // 4
        self.other_memory_in = tf.placeholder_with_default(
            input=tf.tile(self.inference_memory_in, [1, 3]),
            shape=[None, three_fourths_m_size],
            name="other_recurrent_in",
        )

        # Concat and use this as the "placeholder"
        # for training
        self.memory_in = tf.concat(
            [self.other_memory_in, self.inference_memory_in], axis=1
        )

        # Re-break-up for each network
        num_mems = 4
        mem_ins = []
        for i in range(num_mems):
            _start = m_size // num_mems * i
            _end = m_size // num_mems * (i + 1)
            mem_ins.append(self.memory_in[:, _start:_end])
        self.value_memory_in = mem_ins[0]
        self.q1_memory_in = mem_ins[1]
        self.q2_memory_in = mem_ins[2]
        self.policy_memory_in = mem_ins[3]
Exemple #20
0
 def _create_cc_critic(self, hidden_value, scope, create_qs=True):
     """
     Creates just the critic network
     """
     scope = self.join_scopes(scope, "critic")
     self.create_sac_value_head(
         self.stream_names,
         hidden_value,
         self.num_layers,
         self.h_size,
         self.join_scopes(scope, "value"),
     )
     self.external_action_in = tf.placeholder(
         shape=[None, self.policy.act_size[0]],
         dtype=tf.float32,
         name="external_action_in",
     )
     self.value_vars = self.get_vars(self.join_scopes(scope, "value"))
     if create_qs:
         hidden_q = tf.concat([hidden_value, self.external_action_in],
                              axis=-1)
         hidden_qp = tf.concat([hidden_value, self.policy.output], axis=-1)
         self.q1_heads, self.q2_heads, self.q1, self.q2 = self.create_q_heads(
             self.stream_names,
             hidden_q,
             self.num_layers,
             self.h_size,
             self.join_scopes(scope, "q"),
         )
         self.q1_pheads, self.q2_pheads, self.q1_p, self.q2_p = self.create_q_heads(
             self.stream_names,
             hidden_qp,
             self.num_layers,
             self.h_size,
             self.join_scopes(scope, "q"),
             reuse=True,
         )
         self.q_vars = self.get_vars(self.join_scopes(scope, "q"))
     self.critic_vars = self.get_vars(scope)
Exemple #21
0
    def __init__(self, policy: TFPolicy, trainer_params: TrainerSettings):
        """
        Takes a Policy and a Dict of trainer parameters and creates an Optimizer around the policy.
        The PPO optimizer has a value estimator and a loss function.
        :param policy: A TFPolicy object that will be updated by this PPO Optimizer.
        :param trainer_params: Trainer parameters dictionary that specifies the properties of the trainer.
        """
        # Create the graph here to give more granular control of the TF graph to the Optimizer.
        policy.create_tf_graph()

        with policy.graph.as_default():
            with tf.variable_scope("optimizer/"):
                super().__init__(policy, trainer_params)
                hyperparameters: PPOSettings = cast(
                    PPOSettings, trainer_params.hyperparameters)
                lr = float(hyperparameters.learning_rate)
                self._schedule = hyperparameters.learning_rate_schedule
                epsilon = float(hyperparameters.epsilon)
                beta = float(hyperparameters.beta)
                max_step = float(trainer_params.max_steps)

                policy_network_settings = policy.network_settings
                h_size = int(policy_network_settings.hidden_units)
                num_layers = policy_network_settings.num_layers
                vis_encode_type = policy_network_settings.vis_encode_type
                self.burn_in_ratio = 0.0

                self.stream_names = list(self.reward_signals.keys())

                self.tf_optimizer_op: Optional[tf.train.Optimizer] = None
                self.grads = None
                self.update_batch: Optional[tf.Operation] = None

                self.stats_name_to_update_name = {
                    "Losses/Value Loss": "value_loss",
                    "Losses/Policy Loss": "policy_loss",
                    "Policy/Learning Rate": "learning_rate",
                    "Policy/Epsilon": "decay_epsilon",
                    "Policy/Beta": "decay_beta",
                }
                if self.policy.use_recurrent:
                    self.m_size = self.policy.m_size
                    self.memory_in = tf.placeholder(
                        shape=[None, self.m_size],
                        dtype=tf.float32,
                        name="recurrent_value_in",
                    )

                if num_layers < 1:
                    num_layers = 1
                if policy.use_continuous_act:
                    self._create_cc_critic(h_size, num_layers, vis_encode_type)
                else:
                    self._create_dc_critic(h_size, num_layers, vis_encode_type)

                self.learning_rate = ModelUtils.create_schedule(
                    self._schedule,
                    lr,
                    self.policy.global_step,
                    int(max_step),
                    min_value=1e-10,
                )
                self._create_losses(
                    self.policy.total_log_probs,
                    self.old_log_probs,
                    self.value_heads,
                    self.policy.entropy,
                    beta,
                    epsilon,
                    lr,
                    max_step,
                )
                self._create_ppo_optimizer_ops()

            self.update_dict.update({
                "value_loss": self.value_loss,
                "policy_loss": self.abs_policy_loss,
                "update_batch": self.update_batch,
                "learning_rate": self.learning_rate,
                "decay_epsilon": self.decay_epsilon,
                "decay_beta": self.decay_beta,
            })
Exemple #22
0
    def create_input_placeholders(self):
        with self.graph.as_default():
            (
                self.global_step,
                self.increment_step_op,
                self.steps_to_increment,
            ) = ModelUtils.create_global_steps()
            self.vector_in, self.visual_in = ModelUtils.create_input_placeholders(
                self.behavior_spec.observation_shapes)
            if self.normalize:
                self.first_normalization_update = True
                normalization_tensors = ModelUtils.create_normalizer(
                    self.vector_in)
                self.update_normalization_op = normalization_tensors.update_op
                self.init_normalization_op = normalization_tensors.init_op
                self.normalization_steps = normalization_tensors.steps
                self.running_mean = normalization_tensors.running_mean
                self.running_variance = normalization_tensors.running_variance
                self.processed_vector_in = ModelUtils.normalize_vector_obs(
                    self.vector_in,
                    self.running_mean,
                    self.running_variance,
                    self.normalization_steps,
                )
            else:
                self.processed_vector_in = self.vector_in
                self.update_normalization_op = None

            self.batch_size_ph = tf.placeholder(shape=None,
                                                dtype=tf.int32,
                                                name="batch_size")
            self.sequence_length_ph = tf.placeholder(shape=None,
                                                     dtype=tf.int32,
                                                     name="sequence_length")
            self.mask_input = tf.placeholder(shape=[None],
                                             dtype=tf.float32,
                                             name="masks")
            # Only needed for PPO, but needed for BC module
            self.epsilon = tf.placeholder(shape=[None, self.act_size[0]],
                                          dtype=tf.float32,
                                          name="epsilon")
            self.mask = tf.cast(self.mask_input, tf.int32)

            tf.Variable(
                int(self.behavior_spec.is_action_continuous()),
                name="is_continuous_control",
                trainable=False,
                dtype=tf.int32,
            )
            int_version = TFPolicy._convert_version_string(__version__)
            major_ver_t = tf.Variable(
                int_version[0],
                name="trainer_major_version",
                trainable=False,
                dtype=tf.int32,
            )
            minor_ver_t = tf.Variable(
                int_version[1],
                name="trainer_minor_version",
                trainable=False,
                dtype=tf.int32,
            )
            patch_ver_t = tf.Variable(
                int_version[2],
                name="trainer_patch_version",
                trainable=False,
                dtype=tf.int32,
            )
            self.version_tensors = (major_ver_t, minor_ver_t, patch_ver_t)
            tf.Variable(
                MODEL_FORMAT_VERSION,
                name="version_number",
                trainable=False,
                dtype=tf.int32,
            )
            tf.Variable(self.m_size,
                        name="memory_size",
                        trainable=False,
                        dtype=tf.int32)
            if self.behavior_spec.is_action_continuous():
                tf.Variable(
                    self.act_size[0],
                    name="action_output_shape",
                    trainable=False,
                    dtype=tf.int32,
                )
            else:
                tf.Variable(
                    sum(self.act_size),
                    name="action_output_shape",
                    trainable=False,
                    dtype=tf.int32,
                )
Exemple #23
0
    def _create_losses(
        self,
        q1_streams: Dict[str, tf.Tensor],
        q2_streams: Dict[str, tf.Tensor],
        lr: tf.Tensor,
        max_step: int,
        stream_names: List[str],
        discrete: bool = False,
    ) -> None:
        """
        Creates training-specific Tensorflow ops for SAC models.
        :param q1_streams: Q1 streams from policy network
        :param q1_streams: Q2 streams from policy network
        :param lr: Learning rate
        :param max_step: Total number of training steps.
        :param stream_names: List of reward stream names.
        :param discrete: Whether or not to use discrete action losses.
        """

        if discrete:
            self.target_entropy = [
                self.discrete_target_entropy_scale *
                np.log(i).astype(np.float32) for i in self.act_size
            ]
            discrete_action_probs = tf.exp(self.policy.all_log_probs)
            per_action_entropy = discrete_action_probs * self.policy.all_log_probs
        else:
            self.target_entropy = (
                -1 * self.continuous_target_entropy_scale *
                np.prod(self.act_size[0]).astype(np.float32))

        self.rewards_holders = {}
        self.min_policy_qs = {}

        for name in stream_names:
            if discrete:
                _branched_mpq1 = ModelUtils.break_into_branches(
                    self.policy_network.q1_pheads[name] *
                    discrete_action_probs,
                    self.act_size,
                )
                branched_mpq1 = tf.stack([
                    tf.reduce_sum(_br, axis=1, keep_dims=True)
                    for _br in _branched_mpq1
                ])
                _q1_p_mean = tf.reduce_mean(branched_mpq1, axis=0)

                _branched_mpq2 = ModelUtils.break_into_branches(
                    self.policy_network.q2_pheads[name] *
                    discrete_action_probs,
                    self.act_size,
                )
                branched_mpq2 = tf.stack([
                    tf.reduce_sum(_br, axis=1, keep_dims=True)
                    for _br in _branched_mpq2
                ])
                _q2_p_mean = tf.reduce_mean(branched_mpq2, axis=0)

                self.min_policy_qs[name] = tf.minimum(_q1_p_mean, _q2_p_mean)
            else:
                self.min_policy_qs[name] = tf.minimum(
                    self.policy_network.q1_pheads[name],
                    self.policy_network.q2_pheads[name],
                )

            rewards_holder = tf.placeholder(shape=[None],
                                            dtype=tf.float32,
                                            name=f"{name}_rewards")
            self.rewards_holders[name] = rewards_holder

        q1_losses = []
        q2_losses = []
        # Multiple q losses per stream
        expanded_dones = tf.expand_dims(self.dones_holder, axis=-1)
        for i, name in enumerate(stream_names):
            _expanded_rewards = tf.expand_dims(self.rewards_holders[name],
                                               axis=-1)

            q_backup = tf.stop_gradient(
                _expanded_rewards +
                (1.0 - self.use_dones_in_backup[name] * expanded_dones) *
                self.gammas[i] * self.target_network.value_heads[name])

            if discrete:
                # We need to break up the Q functions by branch, and update them individually.
                branched_q1_stream = ModelUtils.break_into_branches(
                    self.policy.selected_actions * q1_streams[name],
                    self.act_size)
                branched_q2_stream = ModelUtils.break_into_branches(
                    self.policy.selected_actions * q2_streams[name],
                    self.act_size)

                # Reduce each branch into scalar
                branched_q1_stream = [
                    tf.reduce_sum(_branch, axis=1, keep_dims=True)
                    for _branch in branched_q1_stream
                ]
                branched_q2_stream = [
                    tf.reduce_sum(_branch, axis=1, keep_dims=True)
                    for _branch in branched_q2_stream
                ]

                q1_stream = tf.reduce_mean(branched_q1_stream, axis=0)
                q2_stream = tf.reduce_mean(branched_q2_stream, axis=0)

            else:
                q1_stream = q1_streams[name]
                q2_stream = q2_streams[name]

            _q1_loss = 0.5 * tf.reduce_mean(
                tf.to_float(self.policy.mask) *
                tf.squared_difference(q_backup, q1_stream))

            _q2_loss = 0.5 * tf.reduce_mean(
                tf.to_float(self.policy.mask) *
                tf.squared_difference(q_backup, q2_stream))

            q1_losses.append(_q1_loss)
            q2_losses.append(_q2_loss)

        self.q1_loss = tf.reduce_mean(q1_losses)
        self.q2_loss = tf.reduce_mean(q2_losses)

        # Learn entropy coefficient
        if discrete:
            # Create a log_ent_coef for each branch
            self.log_ent_coef = tf.get_variable(
                "log_ent_coef",
                dtype=tf.float32,
                initializer=np.log([self.init_entcoef] *
                                   len(self.act_size)).astype(np.float32),
                trainable=True,
            )
        else:
            self.log_ent_coef = tf.get_variable(
                "log_ent_coef",
                dtype=tf.float32,
                initializer=np.log(self.init_entcoef).astype(np.float32),
                trainable=True,
            )

        self.ent_coef = tf.exp(self.log_ent_coef)
        if discrete:
            # We also have to do a different entropy and target_entropy per branch.
            branched_per_action_ent = ModelUtils.break_into_branches(
                per_action_entropy, self.act_size)
            branched_ent_sums = tf.stack(
                [
                    tf.reduce_sum(_lp, axis=1, keep_dims=True) + _te for _lp,
                    _te in zip(branched_per_action_ent, self.target_entropy)
                ],
                axis=1,
            )
            self.entropy_loss = -tf.reduce_mean(
                tf.to_float(self.policy.mask) * tf.reduce_mean(
                    self.log_ent_coef *
                    tf.squeeze(tf.stop_gradient(branched_ent_sums), axis=2),
                    axis=1,
                ))

            # Same with policy loss, we have to do the loss per branch and average them,
            # so that larger branches don't get more weight.
            # The equivalent KL divergence from Eq 10 of Haarnoja et al. is also pi*log(pi) - Q
            branched_q_term = ModelUtils.break_into_branches(
                discrete_action_probs * self.policy_network.q1_p,
                self.act_size)

            branched_policy_loss = tf.stack([
                tf.reduce_sum(self.ent_coef[i] * _lp - _qt,
                              axis=1,
                              keep_dims=True)
                for i, (_lp, _qt) in enumerate(
                    zip(branched_per_action_ent, branched_q_term))
            ])
            self.policy_loss = tf.reduce_mean(
                tf.to_float(self.policy.mask) *
                tf.squeeze(branched_policy_loss))

            # Do vbackup entropy bonus per branch as well.
            branched_ent_bonus = tf.stack([
                tf.reduce_sum(self.ent_coef[i] * _lp, axis=1, keep_dims=True)
                for i, _lp in enumerate(branched_per_action_ent)
            ])
            value_losses = []
            for name in stream_names:
                v_backup = tf.stop_gradient(
                    self.min_policy_qs[name] -
                    tf.reduce_mean(branched_ent_bonus, axis=0))
                value_losses.append(0.5 * tf.reduce_mean(
                    tf.to_float(self.policy.mask) * tf.squared_difference(
                        self.policy_network.value_heads[name], v_backup)))

        else:
            self.entropy_loss = -tf.reduce_mean(
                self.log_ent_coef * tf.to_float(self.policy.mask) *
                tf.stop_gradient(
                    tf.reduce_sum(
                        self.policy.all_log_probs + self.target_entropy,
                        axis=1,
                        keep_dims=True,
                    )))
            batch_policy_loss = tf.reduce_mean(
                self.ent_coef * self.policy.all_log_probs -
                self.policy_network.q1_p,
                axis=1,
            )
            self.policy_loss = tf.reduce_mean(
                tf.to_float(self.policy.mask) * batch_policy_loss)

            value_losses = []
            for name in stream_names:
                v_backup = tf.stop_gradient(
                    self.min_policy_qs[name] - tf.reduce_sum(
                        self.ent_coef * self.policy.all_log_probs, axis=1))
                value_losses.append(0.5 * tf.reduce_mean(
                    tf.to_float(self.policy.mask) * tf.squared_difference(
                        self.policy_network.value_heads[name], v_backup)))
        self.value_loss = tf.reduce_mean(value_losses)

        self.total_value_loss = self.q1_loss + self.q2_loss + self.value_loss

        self.entropy = self.policy_network.entropy
Exemple #24
0
    def create_input_placeholders(self):
        with self.graph.as_default():
            (
                self.global_step,
                self.increment_step_op,
                self.steps_to_increment,
            ) = ModelUtils.create_global_steps()
            self.visual_in = ModelUtils.create_visual_input_placeholders(
                self.brain.camera_resolutions
            )
            self.vector_in = ModelUtils.create_vector_input(self.vec_obs_size)
            if self.normalize:
                normalization_tensors = ModelUtils.create_normalizer(self.vector_in)
                self.update_normalization_op = normalization_tensors.update_op
                self.normalization_steps = normalization_tensors.steps
                self.running_mean = normalization_tensors.running_mean
                self.running_variance = normalization_tensors.running_variance
                self.processed_vector_in = ModelUtils.normalize_vector_obs(
                    self.vector_in,
                    self.running_mean,
                    self.running_variance,
                    self.normalization_steps,
                )
            else:
                self.processed_vector_in = self.vector_in
                self.update_normalization_op = None

            self.batch_size_ph = tf.placeholder(
                shape=None, dtype=tf.int32, name="batch_size"
            )
            self.sequence_length_ph = tf.placeholder(
                shape=None, dtype=tf.int32, name="sequence_length"
            )
            self.mask_input = tf.placeholder(
                shape=[None], dtype=tf.float32, name="masks"
            )
            # Only needed for PPO, but needed for BC module
            self.epsilon = tf.placeholder(
                shape=[None, self.act_size[0]], dtype=tf.float32, name="epsilon"
            )
            self.mask = tf.cast(self.mask_input, tf.int32)

            tf.Variable(
                int(self.brain.vector_action_space_type == "continuous"),
                name="is_continuous_control",
                trainable=False,
                dtype=tf.int32,
            )
            tf.Variable(
                self._version_number_,
                name="version_number",
                trainable=False,
                dtype=tf.int32,
            )
            tf.Variable(
                self.m_size, name="memory_size", trainable=False, dtype=tf.int32
            )
            if self.brain.vector_action_space_type == "continuous":
                tf.Variable(
                    self.act_size[0],
                    name="action_output_shape",
                    trainable=False,
                    dtype=tf.int32,
                )
            else:
                tf.Variable(
                    sum(self.act_size),
                    name="action_output_shape",
                    trainable=False,
                    dtype=tf.int32,
                )
Exemple #25
0
    def make_inputs(self) -> None:
        """
        Creates the input layers for the discriminator
        """
        self.done_expert_holder = tf.placeholder(shape=[None], dtype=tf.float32)
        self.done_policy_holder = tf.placeholder(shape=[None], dtype=tf.float32)
        self.done_expert = tf.expand_dims(self.done_expert_holder, -1)
        self.done_policy = tf.expand_dims(self.done_policy_holder, -1)

        if self.policy.brain.vector_action_space_type == "continuous":
            action_length = self.policy.act_size[0]
            self.action_in_expert = tf.placeholder(
                shape=[None, action_length], dtype=tf.float32
            )
            self.expert_action = tf.identity(self.action_in_expert)
        else:
            action_length = len(self.policy.act_size)
            self.action_in_expert = tf.placeholder(
                shape=[None, action_length], dtype=tf.int32
            )
            self.expert_action = tf.concat(
                [
                    tf.one_hot(self.action_in_expert[:, i], act_size)
                    for i, act_size in enumerate(self.policy.act_size)
                ],
                axis=1,
            )

        encoded_policy_list = []
        encoded_expert_list = []

        if self.policy.vec_obs_size > 0:
            self.obs_in_expert = tf.placeholder(
                shape=[None, self.policy.vec_obs_size], dtype=tf.float32
            )
            if self.policy.normalize:
                encoded_expert_list.append(
                    ModelUtils.normalize_vector_obs(
                        self.obs_in_expert,
                        self.policy.running_mean,
                        self.policy.running_variance,
                        self.policy.normalization_steps,
                    )
                )
                encoded_policy_list.append(self.policy.processed_vector_in)
            else:
                encoded_expert_list.append(self.obs_in_expert)
                encoded_policy_list.append(self.policy.vector_in)

        if self.policy.vis_obs_size > 0:
            self.expert_visual_in: List[tf.Tensor] = []
            visual_policy_encoders = []
            visual_expert_encoders = []
            for i in range(self.policy.vis_obs_size):
                # Create input ops for next (t+1) visual observations.
                visual_input = ModelUtils.create_visual_input(
                    self.policy.brain.camera_resolutions[i],
                    name="gail_visual_observation_" + str(i),
                )
                self.expert_visual_in.append(visual_input)

                encoded_policy_visual = ModelUtils.create_visual_observation_encoder(
                    self.policy.visual_in[i],
                    self.encoding_size,
                    ModelUtils.swish,
                    1,
                    "gail_stream_{}_visual_obs_encoder".format(i),
                    False,
                )

                encoded_expert_visual = ModelUtils.create_visual_observation_encoder(
                    self.expert_visual_in[i],
                    self.encoding_size,
                    ModelUtils.swish,
                    1,
                    "gail_stream_{}_visual_obs_encoder".format(i),
                    True,
                )
                visual_policy_encoders.append(encoded_policy_visual)
                visual_expert_encoders.append(encoded_expert_visual)
            hidden_policy_visual = tf.concat(visual_policy_encoders, axis=1)
            hidden_expert_visual = tf.concat(visual_expert_encoders, axis=1)
            encoded_policy_list.append(hidden_policy_visual)
            encoded_expert_list.append(hidden_expert_visual)

        self.encoded_expert = tf.concat(encoded_expert_list, axis=1)
        self.encoded_policy = tf.concat(encoded_policy_list, axis=1)
Exemple #26
0
    def create_dc_actor(self, hidden_policy, scope):
        """
        Creates Discrete control actor for SAC.
        :param hidden_policy: Output of feature extractor (i.e. the input for vector obs, output of CNN for visual obs).
        :param num_layers: TF scope to assign whatever is created in this block.
        """
        scope = self.join_scopes(scope, "policy")

        # Create inputs outside of the scope
        self.action_masks = tf.placeholder(shape=[None,
                                                  sum(self.act_size)],
                                           dtype=tf.float32,
                                           name="action_masks")

        if self.use_recurrent:
            self.prev_action = tf.placeholder(shape=[None,
                                                     len(self.act_size)],
                                              dtype=tf.int32,
                                              name="prev_action")

        with tf.variable_scope(scope):
            hidden_policy = self.create_vector_observation_encoder(
                hidden_policy,
                self.h_size,
                self.activ_fn,
                self.num_layers,
                "encoder",
                False,
            )
        if self.use_recurrent:
            prev_action_oh = tf.concat(
                [
                    tf.one_hot(self.prev_action[:, i], self.act_size[i])
                    for i in range(len(self.act_size))
                ],
                axis=1,
            )
            hidden_policy = tf.concat([hidden_policy, prev_action_oh], axis=1)

            hidden_policy, memory_out = self.create_recurrent_encoder(
                hidden_policy,
                self.policy_memory_in,
                self.sequence_length,
                name="lstm_policy",
            )
            self.policy_memory_out = memory_out
        with tf.variable_scope(scope):
            policy_branches = []
            for size in self.act_size:
                policy_branches.append(
                    tf.layers.dense(
                        hidden_policy,
                        size,
                        activation=None,
                        use_bias=False,
                        kernel_initializer=tf.initializers.variance_scaling(
                            0.01),
                    ))
            all_logits = tf.concat(policy_branches,
                                   axis=1,
                                   name="action_probs")

            output, normalized_probs, normalized_logprobs = self.create_discrete_action_masking_layer(
                all_logits, self.action_masks, self.act_size)

            self.action_probs = normalized_probs

            # Really, this is entropy, but it has an analogous purpose to the log probs in the
            # continuous case.
            self.all_log_probs = self.action_probs * normalized_logprobs
            self.output = output

            # Create action input (discrete)
            self.action_holder = tf.placeholder(
                shape=[None, len(policy_branches)],
                dtype=tf.int32,
                name="action_holder")

            self.output_oh = tf.concat(
                [
                    tf.one_hot(self.action_holder[:, i], self.act_size[i])
                    for i in range(len(self.act_size))
                ],
                axis=1,
            )

            # For Curiosity and GAIL to retrieve selected actions. We don't
            # need the mask at this point because it's already stored in the buffer.
            self.selected_actions = tf.stop_gradient(self.output_oh)

            self.external_action_in = tf.concat(
                [
                    tf.one_hot(self.action_holder[:, i], self.act_size[i])
                    for i in range(len(self.act_size))
                ],
                axis=1,
            )

            # This is total entropy over all branches
            self.entropy = -1 * tf.reduce_sum(self.all_log_probs, axis=1)

        # Extract the normalized logprobs for Barracuda
        self.normalized_logprobs = tf.identity(normalized_logprobs,
                                               name="action")

        # We kept the LSTMs at a different scope than the rest, so add them if they exist.
        self.policy_vars = self.get_vars(scope)
        if self.use_recurrent:
            self.policy_vars += self.get_vars("lstm")
Exemple #27
0
    def __init__(self, policy: TFPolicy, trainer_params: Dict[str, Any]):
        """
        Takes a Policy and a Dict of trainer parameters and creates an Optimizer around the policy.
        The PPO optimizer has a value estimator and a loss function.
        :param policy: A TFPolicy object that will be updated by this PPO Optimizer.
        :param trainer_params: Trainer parameters dictionary that specifies the properties of the trainer.
        """
        # Create the graph here to give more granular control of the TF graph to the Optimizer.
        policy.create_tf_graph()

        with policy.graph.as_default():
            with tf.variable_scope("optimizer/"):
                super().__init__(policy, trainer_params)

                lr = float(trainer_params["learning_rate"])
                lr_schedule = LearningRateSchedule(
                    trainer_params.get("learning_rate_schedule", "linear"))
                h_size = int(trainer_params["hidden_units"])
                epsilon = float(trainer_params["epsilon"])
                beta = float(trainer_params["beta"])
                max_step = float(trainer_params["max_steps"])
                num_layers = int(trainer_params["num_layers"])
                vis_encode_type = EncoderType(
                    trainer_params.get("vis_encode_type", "simple"))
                self.burn_in_ratio = float(
                    trainer_params.get("burn_in_ratio", 0.0))

                self.stream_names = list(self.reward_signals.keys())

                self.tf_optimizer: Optional[tf.train.AdamOptimizer] = None
                self.grads = None
                self.update_batch: Optional[tf.Operation] = None

                self.stats_name_to_update_name = {
                    "Losses/Value Loss": "value_loss",
                    "Losses/Policy Loss": "policy_loss",
                    "Policy/Learning Rate": "learning_rate",
                }
                if self.policy.use_recurrent:
                    self.m_size = self.policy.m_size
                    self.memory_in = tf.placeholder(
                        shape=[None, self.m_size],
                        dtype=tf.float32,
                        name="recurrent_value_in",
                    )

                if num_layers < 1:
                    num_layers = 1
                if policy.use_continuous_act:
                    self._create_cc_critic(h_size, num_layers, vis_encode_type)
                else:
                    self._create_dc_critic(h_size, num_layers, vis_encode_type)

                self.learning_rate = ModelUtils.create_learning_rate(
                    lr_schedule, lr, self.policy.global_step, int(max_step))
                self._create_losses(
                    self.policy.total_log_probs,
                    self.old_log_probs,
                    self.value_heads,
                    self.policy.entropy,
                    beta,
                    epsilon,
                    lr,
                    max_step,
                )
                self._create_ppo_optimizer_ops()

            self.update_dict.update({
                "value_loss": self.value_loss,
                "policy_loss": self.abs_policy_loss,
                "update_batch": self.update_batch,
                "learning_rate": self.learning_rate,
            })

            self.policy.initialize_or_load()
Exemple #28
0
    def make_inputs(self) -> None:
        """
        Creates the input layers for the discriminator
        """
        self.done_expert_holder = tf.placeholder(shape=[None], dtype=tf.float32)
        self.done_policy_holder = tf.placeholder(shape=[None], dtype=tf.float32)
        self.done_expert = tf.expand_dims(self.done_expert_holder, -1)
        self.done_policy = tf.expand_dims(self.done_policy_holder, -1)

        if self.policy.behavior_spec.is_action_continuous():
            action_length = self.policy.act_size[0]
            self.action_in_expert = tf.placeholder(
                shape=[None, action_length], dtype=tf.float32
            )
            self.expert_action = tf.identity(self.action_in_expert)
        else:
            action_length = len(self.policy.act_size)
            self.action_in_expert = tf.placeholder(
                shape=[None, action_length], dtype=tf.int32
            )
            self.expert_action = tf.concat(
                [
                    tf.one_hot(self.action_in_expert[:, i], act_size)
                    for i, act_size in enumerate(self.policy.act_size)
                ],
                axis=1,
            )

        encoded_policy_list = []
        encoded_expert_list = []

        (
            self.obs_in_expert,
            self.expert_visual_in,
        ) = ModelUtils.create_input_placeholders(
            self.policy.behavior_spec.observation_shapes, "gail_"
        )

        if self.policy.vec_obs_size > 0:
            if self.policy.normalize:
                encoded_expert_list.append(
                    ModelUtils.normalize_vector_obs(
                        self.obs_in_expert,
                        self.policy.running_mean,
                        self.policy.running_variance,
                        self.policy.normalization_steps,
                    )
                )
                encoded_policy_list.append(self.policy.processed_vector_in)
            else:
                encoded_expert_list.append(self.obs_in_expert)
                encoded_policy_list.append(self.policy.vector_in)

        if self.expert_visual_in:
            visual_policy_encoders = []
            visual_expert_encoders = []
            for i, (vis_in, exp_vis_in) in enumerate(
                zip(self.policy.visual_in, self.expert_visual_in)
            ):
                encoded_policy_visual = ModelUtils.create_visual_observation_encoder(
                    vis_in,
                    self.encoding_size,
                    ModelUtils.swish,
                    1,
                    f"gail_stream_{i}_visual_obs_encoder",
                    False,
                )

                encoded_expert_visual = ModelUtils.create_visual_observation_encoder(
                    exp_vis_in,
                    self.encoding_size,
                    ModelUtils.swish,
                    1,
                    f"gail_stream_{i}_visual_obs_encoder",
                    True,
                )
                visual_policy_encoders.append(encoded_policy_visual)
                visual_expert_encoders.append(encoded_expert_visual)
            hidden_policy_visual = tf.concat(visual_policy_encoders, axis=1)
            hidden_expert_visual = tf.concat(visual_expert_encoders, axis=1)
            encoded_policy_list.append(hidden_policy_visual)
            encoded_expert_list.append(hidden_expert_visual)

        self.encoded_expert = tf.concat(encoded_expert_list, axis=1)
        self.encoded_policy = tf.concat(encoded_policy_list, axis=1)
Exemple #29
0
    def _create_losses(self, probs, old_probs, value_heads, entropy, beta,
                       epsilon, lr, max_step):
        """
        Creates training-specific Tensorflow ops for PPO models.
        :param probs: Current policy probabilities
        :param old_probs: Past policy probabilities
        :param value_heads: Value estimate tensors from each value stream
        :param beta: Entropy regularization strength
        :param entropy: Current policy entropy
        :param epsilon: Value for policy-divergence threshold
        :param lr: Learning rate
        :param max_step: Total number of training steps.
        """
        self.returns_holders = {}
        self.old_values = {}
        for name in value_heads.keys():
            returns_holder = tf.placeholder(shape=[None],
                                            dtype=tf.float32,
                                            name="{}_returns".format(name))
            old_value = tf.placeholder(shape=[None],
                                       dtype=tf.float32,
                                       name="{}_value_estimate".format(name))
            self.returns_holders[name] = returns_holder
            self.old_values[name] = old_value
        self.advantage = tf.placeholder(shape=[None],
                                        dtype=tf.float32,
                                        name="advantages")
        advantage = tf.expand_dims(self.advantage, -1)

        decay_epsilon = tf.train.polynomial_decay(epsilon,
                                                  self.policy.global_step,
                                                  max_step,
                                                  0.1,
                                                  power=1.0)
        decay_beta = tf.train.polynomial_decay(beta,
                                               self.policy.global_step,
                                               max_step,
                                               1e-5,
                                               power=1.0)

        value_losses = []
        for name, head in value_heads.items():
            clipped_value_estimate = self.old_values[name] + tf.clip_by_value(
                tf.reduce_sum(head, axis=1) - self.old_values[name],
                -decay_epsilon,
                decay_epsilon,
            )
            v_opt_a = tf.squared_difference(self.returns_holders[name],
                                            tf.reduce_sum(head, axis=1))
            v_opt_b = tf.squared_difference(self.returns_holders[name],
                                            clipped_value_estimate)
            value_loss = tf.reduce_mean(
                tf.dynamic_partition(tf.maximum(v_opt_a, v_opt_b),
                                     self.policy.mask, 2)[1])
            value_losses.append(value_loss)
        self.value_loss = tf.reduce_mean(value_losses)

        r_theta = tf.exp(probs - old_probs)
        p_opt_a = r_theta * advantage
        p_opt_b = (tf.clip_by_value(r_theta, 1.0 - decay_epsilon,
                                    1.0 + decay_epsilon) * advantage)
        self.policy_loss = -tf.reduce_mean(
            tf.dynamic_partition(tf.minimum(p_opt_a, p_opt_b),
                                 self.policy.mask, 2)[1])
        # For cleaner stats reporting
        self.abs_policy_loss = tf.abs(self.policy_loss)

        self.loss = (
            self.policy_loss + 0.5 * self.value_loss -
            decay_beta * tf.reduce_mean(
                tf.dynamic_partition(entropy, self.policy.mask, 2)[1]))
Exemple #30
0
    def create_cc_actor(self, hidden_policy, scope):
        """
        Creates Continuous control actor for SAC.
        :param hidden_policy: Output of feature extractor (i.e. the input for vector obs, output of CNN for visual obs).
        :param num_layers: TF scope to assign whatever is created in this block.
        """
        # Create action input (continuous)
        self.action_holder = tf.placeholder(shape=[None, self.act_size[0]],
                                            dtype=tf.float32,
                                            name="action_holder")
        self.external_action_in = self.action_holder

        scope = self.join_scopes(scope, "policy")

        with tf.variable_scope(scope):
            hidden_policy = self.create_vector_observation_encoder(
                hidden_policy,
                self.h_size,
                self.activ_fn,
                self.num_layers,
                "encoder",
                False,
            )
        if self.use_recurrent:
            hidden_policy, memory_out = self.create_recurrent_encoder(
                hidden_policy,
                self.policy_memory_in,
                self.sequence_length,
                name="lstm_policy",
            )
            self.policy_memory_out = memory_out
        with tf.variable_scope(scope):
            mu = tf.layers.dense(
                hidden_policy,
                self.act_size[0],
                activation=None,
                name="mu",
                kernel_initializer=LearningModel.scaled_init(0.01),
            )

            # Policy-dependent log_sigma_sq
            log_sigma_sq = tf.layers.dense(
                hidden_policy,
                self.act_size[0],
                activation=None,
                name="log_std",
                kernel_initializer=LearningModel.scaled_init(0.01),
            )

            self.log_sigma_sq = tf.clip_by_value(log_sigma_sq, LOG_STD_MIN,
                                                 LOG_STD_MAX)

            sigma_sq = tf.exp(self.log_sigma_sq)

            # Do the reparameterization trick
            policy_ = mu + tf.random_normal(tf.shape(mu)) * sigma_sq

            _gauss_pre = -0.5 * (((policy_ - mu) /
                                  (tf.exp(self.log_sigma_sq) + EPSILON))**2 +
                                 2 * self.log_sigma_sq + np.log(2 * np.pi))

            all_probs = tf.reduce_sum(_gauss_pre, axis=1, keepdims=True)

            self.entropy = tf.reduce_sum(self.log_sigma_sq +
                                         0.5 * np.log(2.0 * np.pi * np.e),
                                         axis=-1)

            # Squash probabilities
            # Keep deterministic around in case we want to use it.
            self.deterministic_output = tf.tanh(mu)

            # Note that this is just for symmetry with PPO.
            self.output_pre = tf.tanh(policy_)

            # Squash correction
            all_probs -= tf.reduce_sum(tf.log(1 - self.output_pre**2 +
                                              EPSILON),
                                       axis=1,
                                       keepdims=True)

            self.all_log_probs = all_probs
            self.selected_actions = tf.stop_gradient(self.output_pre)

            self.action_probs = all_probs

        # Extract output for Barracuda
        self.output = tf.identity(self.output_pre, name="action")

        # Get all policy vars
        self.policy_vars = self.get_vars(scope)