Beispiel #1
0
    def _create_losses(self, probs, old_probs, value_heads, entropy, beta,
                       epsilon, lr, max_step):
        """
        Creates training-specific Tensorflow ops for PPO models.
        :param probs: Current policy probabilities
        :param old_probs: Past policy probabilities
        :param value_heads: Value estimate tensors from each value stream
        :param beta: Entropy regularization strength
        :param entropy: Current policy entropy
        :param epsilon: Value for policy-divergence threshold
        :param lr: Learning rate
        :param max_step: Total number of training steps.
        """
        self.returns_holders = {}
        self.old_values = {}
        for name in value_heads.keys():
            returns_holder = tf.placeholder(shape=[None],
                                            dtype=tf.float32,
                                            name="{}_returns".format(name))
            old_value = tf.placeholder(shape=[None],
                                       dtype=tf.float32,
                                       name="{}_value_estimate".format(name))
            self.returns_holders[name] = returns_holder
            self.old_values[name] = old_value
        self.advantage = tf.placeholder(shape=[None],
                                        dtype=tf.float32,
                                        name="advantages")
        advantage = tf.expand_dims(self.advantage, -1)

        decay_epsilon = tf.train.polynomial_decay(epsilon,
                                                  self.policy.global_step,
                                                  max_step,
                                                  0.1,
                                                  power=1.0)
        decay_beta = tf.train.polynomial_decay(beta,
                                               self.policy.global_step,
                                               max_step,
                                               1e-5,
                                               power=1.0)

        value_losses = []
        for name, head in value_heads.items():
            clipped_value_estimate = self.old_values[name] + tf.clip_by_value(
                tf.reduce_sum(head, axis=1) - self.old_values[name],
                -decay_epsilon,
                decay_epsilon,
            )
            v_opt_a = tf.squared_difference(self.returns_holders[name],
                                            tf.reduce_sum(head, axis=1))
            v_opt_b = tf.squared_difference(self.returns_holders[name],
                                            clipped_value_estimate)
            value_loss = tf.reduce_mean(
                tf.dynamic_partition(tf.maximum(v_opt_a, v_opt_b),
                                     self.policy.mask, 2)[1])
            value_losses.append(value_loss)
        self.value_loss = tf.reduce_mean(value_losses)

        r_theta = tf.exp(probs - old_probs)
        p_opt_a = r_theta * advantage
        p_opt_b = (tf.clip_by_value(r_theta, 1.0 - decay_epsilon,
                                    1.0 + decay_epsilon) * advantage)
        self.policy_loss = -tf.reduce_mean(
            tf.dynamic_partition(tf.minimum(p_opt_a, p_opt_b),
                                 self.policy.mask, 2)[1])
        # For cleaner stats reporting
        self.abs_policy_loss = tf.abs(self.policy_loss)

        self.loss = (
            self.policy_loss + 0.5 * self.value_loss -
            decay_beta * tf.reduce_mean(
                tf.dynamic_partition(entropy, self.policy.mask, 2)[1]))
Beispiel #2
0
    def make_inputs(self) -> None:
        """
        Creates the input layers for the discriminator
        """
        self.done_expert_holder = tf.placeholder(shape=[None], dtype=tf.float32)
        self.done_policy_holder = tf.placeholder(shape=[None], dtype=tf.float32)
        self.done_expert = tf.expand_dims(self.done_expert_holder, -1)
        self.done_policy = tf.expand_dims(self.done_policy_holder, -1)

        if self.policy.brain.vector_action_space_type == "continuous":
            action_length = self.policy.act_size[0]
            self.action_in_expert = tf.placeholder(
                shape=[None, action_length], dtype=tf.float32
            )
            self.expert_action = tf.identity(self.action_in_expert)
        else:
            action_length = len(self.policy.act_size)
            self.action_in_expert = tf.placeholder(
                shape=[None, action_length], dtype=tf.int32
            )
            self.expert_action = tf.concat(
                [
                    tf.one_hot(self.action_in_expert[:, i], act_size)
                    for i, act_size in enumerate(self.policy.act_size)
                ],
                axis=1,
            )

        encoded_policy_list = []
        encoded_expert_list = []

        if self.policy.vec_obs_size > 0:
            self.obs_in_expert = tf.placeholder(
                shape=[None, self.policy.vec_obs_size], dtype=tf.float32
            )
            if self.policy.normalize:
                encoded_expert_list.append(
                    ModelUtils.normalize_vector_obs(
                        self.obs_in_expert,
                        self.policy.running_mean,
                        self.policy.running_variance,
                        self.policy.normalization_steps,
                    )
                )
                encoded_policy_list.append(self.policy.processed_vector_in)
            else:
                encoded_expert_list.append(self.obs_in_expert)
                encoded_policy_list.append(self.policy.vector_in)

        if self.policy.vis_obs_size > 0:
            self.expert_visual_in: List[tf.Tensor] = []
            visual_policy_encoders = []
            visual_expert_encoders = []
            for i in range(self.policy.vis_obs_size):
                # Create input ops for next (t+1) visual observations.
                visual_input = ModelUtils.create_visual_input(
                    self.policy.brain.camera_resolutions[i],
                    name="gail_visual_observation_" + str(i),
                )
                self.expert_visual_in.append(visual_input)

                encoded_policy_visual = ModelUtils.create_visual_observation_encoder(
                    self.policy.visual_in[i],
                    self.encoding_size,
                    ModelUtils.swish,
                    1,
                    "gail_stream_{}_visual_obs_encoder".format(i),
                    False,
                )

                encoded_expert_visual = ModelUtils.create_visual_observation_encoder(
                    self.expert_visual_in[i],
                    self.encoding_size,
                    ModelUtils.swish,
                    1,
                    "gail_stream_{}_visual_obs_encoder".format(i),
                    True,
                )
                visual_policy_encoders.append(encoded_policy_visual)
                visual_expert_encoders.append(encoded_expert_visual)
            hidden_policy_visual = tf.concat(visual_policy_encoders, axis=1)
            hidden_expert_visual = tf.concat(visual_expert_encoders, axis=1)
            encoded_policy_list.append(hidden_policy_visual)
            encoded_expert_list.append(hidden_expert_visual)

        self.encoded_expert = tf.concat(encoded_expert_list, axis=1)
        self.encoded_policy = tf.concat(encoded_policy_list, axis=1)
Beispiel #3
0
    def _create_losses(
        self,
        q1_streams: Dict[str, tf.Tensor],
        q2_streams: Dict[str, tf.Tensor],
        lr: tf.Tensor,
        max_step: int,
        stream_names: List[str],
        discrete: bool = False,
    ) -> None:
        """
        Creates training-specific Tensorflow ops for SAC models.
        :param q1_streams: Q1 streams from policy network
        :param q1_streams: Q2 streams from policy network
        :param lr: Learning rate
        :param max_step: Total number of training steps.
        :param stream_names: List of reward stream names.
        :param discrete: Whether or not to use discrete action losses.
        """

        if discrete:
            self.target_entropy = [
                self.discrete_target_entropy_scale *
                np.log(i).astype(np.float32) for i in self.act_size
            ]
            discrete_action_probs = tf.exp(self.policy.all_log_probs)
            per_action_entropy = discrete_action_probs * self.policy.all_log_probs
        else:
            self.target_entropy = (
                -1 * self.continuous_target_entropy_scale *
                np.prod(self.act_size[0]).astype(np.float32))

        self.rewards_holders = {}
        self.min_policy_qs = {}

        for name in stream_names:
            if discrete:
                _branched_mpq1 = ModelUtils.break_into_branches(
                    self.policy_network.q1_pheads[name] *
                    discrete_action_probs,
                    self.act_size,
                )
                branched_mpq1 = tf.stack([
                    tf.reduce_sum(_br, axis=1, keep_dims=True)
                    for _br in _branched_mpq1
                ])
                _q1_p_mean = tf.reduce_mean(branched_mpq1, axis=0)

                _branched_mpq2 = ModelUtils.break_into_branches(
                    self.policy_network.q2_pheads[name] *
                    discrete_action_probs,
                    self.act_size,
                )
                branched_mpq2 = tf.stack([
                    tf.reduce_sum(_br, axis=1, keep_dims=True)
                    for _br in _branched_mpq2
                ])
                _q2_p_mean = tf.reduce_mean(branched_mpq2, axis=0)

                self.min_policy_qs[name] = tf.minimum(_q1_p_mean, _q2_p_mean)
            else:
                self.min_policy_qs[name] = tf.minimum(
                    self.policy_network.q1_pheads[name],
                    self.policy_network.q2_pheads[name],
                )

            rewards_holder = tf.placeholder(shape=[None],
                                            dtype=tf.float32,
                                            name=f"{name}_rewards")
            self.rewards_holders[name] = rewards_holder

        q1_losses = []
        q2_losses = []
        # Multiple q losses per stream
        expanded_dones = tf.expand_dims(self.dones_holder, axis=-1)
        for i, name in enumerate(stream_names):
            _expanded_rewards = tf.expand_dims(self.rewards_holders[name],
                                               axis=-1)

            q_backup = tf.stop_gradient(
                _expanded_rewards +
                (1.0 - self.use_dones_in_backup[name] * expanded_dones) *
                self.gammas[i] * self.target_network.value_heads[name])

            if discrete:
                # We need to break up the Q functions by branch, and update them individually.
                branched_q1_stream = ModelUtils.break_into_branches(
                    self.policy.selected_actions * q1_streams[name],
                    self.act_size)
                branched_q2_stream = ModelUtils.break_into_branches(
                    self.policy.selected_actions * q2_streams[name],
                    self.act_size)

                # Reduce each branch into scalar
                branched_q1_stream = [
                    tf.reduce_sum(_branch, axis=1, keep_dims=True)
                    for _branch in branched_q1_stream
                ]
                branched_q2_stream = [
                    tf.reduce_sum(_branch, axis=1, keep_dims=True)
                    for _branch in branched_q2_stream
                ]

                q1_stream = tf.reduce_mean(branched_q1_stream, axis=0)
                q2_stream = tf.reduce_mean(branched_q2_stream, axis=0)

            else:
                q1_stream = q1_streams[name]
                q2_stream = q2_streams[name]

            _q1_loss = 0.5 * tf.reduce_mean(
                tf.to_float(self.policy.mask) *
                tf.squared_difference(q_backup, q1_stream))

            _q2_loss = 0.5 * tf.reduce_mean(
                tf.to_float(self.policy.mask) *
                tf.squared_difference(q_backup, q2_stream))

            q1_losses.append(_q1_loss)
            q2_losses.append(_q2_loss)

        self.q1_loss = tf.reduce_mean(q1_losses)
        self.q2_loss = tf.reduce_mean(q2_losses)

        # Learn entropy coefficient
        if discrete:
            # Create a log_ent_coef for each branch
            self.log_ent_coef = tf.get_variable(
                "log_ent_coef",
                dtype=tf.float32,
                initializer=np.log([self.init_entcoef] *
                                   len(self.act_size)).astype(np.float32),
                trainable=True,
            )
        else:
            self.log_ent_coef = tf.get_variable(
                "log_ent_coef",
                dtype=tf.float32,
                initializer=np.log(self.init_entcoef).astype(np.float32),
                trainable=True,
            )

        self.ent_coef = tf.exp(self.log_ent_coef)
        if discrete:
            # We also have to do a different entropy and target_entropy per branch.
            branched_per_action_ent = ModelUtils.break_into_branches(
                per_action_entropy, self.act_size)
            branched_ent_sums = tf.stack(
                [
                    tf.reduce_sum(_lp, axis=1, keep_dims=True) + _te for _lp,
                    _te in zip(branched_per_action_ent, self.target_entropy)
                ],
                axis=1,
            )
            self.entropy_loss = -tf.reduce_mean(
                tf.to_float(self.policy.mask) * tf.reduce_mean(
                    self.log_ent_coef *
                    tf.squeeze(tf.stop_gradient(branched_ent_sums), axis=2),
                    axis=1,
                ))

            # Same with policy loss, we have to do the loss per branch and average them,
            # so that larger branches don't get more weight.
            # The equivalent KL divergence from Eq 10 of Haarnoja et al. is also pi*log(pi) - Q
            branched_q_term = ModelUtils.break_into_branches(
                discrete_action_probs * self.policy_network.q1_p,
                self.act_size)

            branched_policy_loss = tf.stack([
                tf.reduce_sum(self.ent_coef[i] * _lp - _qt,
                              axis=1,
                              keep_dims=True)
                for i, (_lp, _qt) in enumerate(
                    zip(branched_per_action_ent, branched_q_term))
            ])
            self.policy_loss = tf.reduce_mean(
                tf.to_float(self.policy.mask) *
                tf.squeeze(branched_policy_loss))

            # Do vbackup entropy bonus per branch as well.
            branched_ent_bonus = tf.stack([
                tf.reduce_sum(self.ent_coef[i] * _lp, axis=1, keep_dims=True)
                for i, _lp in enumerate(branched_per_action_ent)
            ])
            value_losses = []
            for name in stream_names:
                v_backup = tf.stop_gradient(
                    self.min_policy_qs[name] -
                    tf.reduce_mean(branched_ent_bonus, axis=0))
                value_losses.append(0.5 * tf.reduce_mean(
                    tf.to_float(self.policy.mask) * tf.squared_difference(
                        self.policy_network.value_heads[name], v_backup)))

        else:
            self.entropy_loss = -tf.reduce_mean(
                self.log_ent_coef * tf.to_float(self.policy.mask) *
                tf.stop_gradient(
                    tf.reduce_sum(
                        self.policy.all_log_probs + self.target_entropy,
                        axis=1,
                        keep_dims=True,
                    )))
            batch_policy_loss = tf.reduce_mean(
                self.ent_coef * self.policy.all_log_probs -
                self.policy_network.q1_p,
                axis=1,
            )
            self.policy_loss = tf.reduce_mean(
                tf.to_float(self.policy.mask) * batch_policy_loss)

            value_losses = []
            for name in stream_names:
                v_backup = tf.stop_gradient(
                    self.min_policy_qs[name] - tf.reduce_sum(
                        self.ent_coef * self.policy.all_log_probs, axis=1))
                value_losses.append(0.5 * tf.reduce_mean(
                    tf.to_float(self.policy.mask) * tf.squared_difference(
                        self.policy_network.value_heads[name], v_backup)))
        self.value_loss = tf.reduce_mean(value_losses)

        self.total_value_loss = self.q1_loss + self.q2_loss + self.value_loss

        self.entropy = self.policy_network.entropy
Beispiel #4
0
    def make_inputs(self) -> None:
        """
        Creates the input layers for the discriminator
        """
        self.done_expert_holder = tf.placeholder(shape=[None], dtype=tf.float32)
        self.done_policy_holder = tf.placeholder(shape=[None], dtype=tf.float32)
        self.done_expert = tf.expand_dims(self.done_expert_holder, -1)
        self.done_policy = tf.expand_dims(self.done_policy_holder, -1)

        if self.policy.behavior_spec.is_action_continuous():
            action_length = self.policy.act_size[0]
            self.action_in_expert = tf.placeholder(
                shape=[None, action_length], dtype=tf.float32
            )
            self.expert_action = tf.identity(self.action_in_expert)
        else:
            action_length = len(self.policy.act_size)
            self.action_in_expert = tf.placeholder(
                shape=[None, action_length], dtype=tf.int32
            )
            self.expert_action = tf.concat(
                [
                    tf.one_hot(self.action_in_expert[:, i], act_size)
                    for i, act_size in enumerate(self.policy.act_size)
                ],
                axis=1,
            )

        encoded_policy_list = []
        encoded_expert_list = []

        (
            self.obs_in_expert,
            self.expert_visual_in,
        ) = ModelUtils.create_input_placeholders(
            self.policy.behavior_spec.observation_shapes, "gail_"
        )

        if self.policy.vec_obs_size > 0:
            if self.policy.normalize:
                encoded_expert_list.append(
                    ModelUtils.normalize_vector_obs(
                        self.obs_in_expert,
                        self.policy.running_mean,
                        self.policy.running_variance,
                        self.policy.normalization_steps,
                    )
                )
                encoded_policy_list.append(self.policy.processed_vector_in)
            else:
                encoded_expert_list.append(self.obs_in_expert)
                encoded_policy_list.append(self.policy.vector_in)

        if self.expert_visual_in:
            visual_policy_encoders = []
            visual_expert_encoders = []
            for i, (vis_in, exp_vis_in) in enumerate(
                zip(self.policy.visual_in, self.expert_visual_in)
            ):
                encoded_policy_visual = ModelUtils.create_visual_observation_encoder(
                    vis_in,
                    self.encoding_size,
                    ModelUtils.swish,
                    1,
                    f"gail_stream_{i}_visual_obs_encoder",
                    False,
                )

                encoded_expert_visual = ModelUtils.create_visual_observation_encoder(
                    exp_vis_in,
                    self.encoding_size,
                    ModelUtils.swish,
                    1,
                    f"gail_stream_{i}_visual_obs_encoder",
                    True,
                )
                visual_policy_encoders.append(encoded_policy_visual)
                visual_expert_encoders.append(encoded_expert_visual)
            hidden_policy_visual = tf.concat(visual_policy_encoders, axis=1)
            hidden_expert_visual = tf.concat(visual_expert_encoders, axis=1)
            encoded_policy_list.append(hidden_policy_visual)
            encoded_expert_list.append(hidden_expert_visual)

        self.encoded_expert = tf.concat(encoded_expert_list, axis=1)
        self.encoded_policy = tf.concat(encoded_policy_list, axis=1)