Python gaussian_entropy Exemples, rls.utils.tf2_utils.gaussian_entropy Python Exemples

Exemple #1

0

Afficher le fichier

Fichier : a2c.py Projet : wyz1074152339/RLs

 def train(self, memories):
     """Run one A2C update of the critic and then the actor on a batch.

     Args:
         memories: Tuple ``(s, visual_s, a, dc_r, cell_state)``. ``s``,
             ``visual_s`` and ``cell_state`` are fed to the representation
             net, ``a`` is the action whose log-probability is evaluated
             and ``dc_r`` is the regression target of the value net
             (presumably the discounted return -- TODO confirm).

     Returns:
         Tuple ``(actor_loss, critic_loss, entropy)``.
     """
     s, visual_s, a, dc_r, cell_state = memories
     with tf.device(self.device):
         # persistent=True: the tape is queried twice below, once per loss.
         with tf.GradientTape(persistent=True) as tape:
             feat, _ = self._representation_net(s, visual_s, cell_state=cell_state)
             if self.is_continuous:
                 mu, log_std = self.net.policy_net(feat)
                 log_act_prob = gaussian_likelihood_sum(a, mu, log_std)
                 entropy = gaussian_entropy(log_std)
             else:
                 logits = self.net.policy_net(feat)
                 logp_all = tf.nn.log_softmax(logits)
                 # assumes `a` is one-hot along axis 1 -- TODO confirm
                 log_act_prob = tf.reduce_sum(a * logp_all, axis=1, keepdims=True)
                 entropy = -tf.reduce_mean(
                     tf.reduce_sum(tf.exp(logp_all) * logp_all, axis=1, keepdims=True))
             v = self.net.value_net(feat)
             td_error = dc_r - v
             # Same quantity as td_error, but cut from the graph so the
             # actor loss does not back-propagate into the value net.
             advantage = tf.stop_gradient(td_error)
             critic_loss = tf.reduce_mean(tf.square(td_error))
             actor_loss = -(tf.reduce_mean(log_act_prob * advantage) + self.beta * entropy)

         critic_grads = tape.gradient(critic_loss, self.net.critic_trainable_variables)
         self.optimizer_critic.apply_gradients(
             zip(critic_grads, self.net.critic_trainable_variables)
         )
         # The actor step is the same for continuous and discrete policies,
         # so no branch on self.is_continuous is needed here.
         actor_grads = tape.gradient(actor_loss, self.net.actor_trainable_variables)
         self.optimizer_actor.apply_gradients(
             zip(actor_grads, self.net.actor_trainable_variables)
         )
         self.global_step.assign_add(1)
         return actor_loss, critic_loss, entropy

Exemple #2

0

Afficher le fichier

 def train_actor(self, memories):
     """Compute the surrogate actor loss and its flattened gradient.

     No optimizer step is taken here; the gradients with respect to
     ``self.net.actor_trainable_variables`` are flattened with
     ``flat_concat`` and handed back to the caller.

     Args:
         memories: Tuple ``(s, visual_s, a, old_log_prob, advantage,
             cell_state)``.

     Returns:
         Tuple ``(actor_loss, entropy, gradients)``.
     """
     s, visual_s, a, old_log_prob, advantage, cell_state = memories
     with tf.device(self.device):
         with tf.GradientTape() as tape:
             policy_out, _ = self.net(s, visual_s, cell_state=cell_state)
             if self.is_continuous:
                 mu, log_std = policy_out
                 new_log_prob = gaussian_likelihood_sum(a, mu, log_std)
                 entropy = gaussian_entropy(log_std)
             else:
                 logp_all = tf.nn.log_softmax(policy_out)
                 # assumes `a` is one-hot along axis 1 -- TODO confirm
                 new_log_prob = tf.reduce_sum(a * logp_all, axis=1, keepdims=True)
                 p_log_p = tf.reduce_sum(tf.exp(logp_all) * logp_all,
                                         axis=1, keepdims=True)
                 entropy = -tf.reduce_mean(p_log_p)
             # Probability ratio between the current policy and the one
             # that produced old_log_prob.
             ratio = tf.exp(new_log_prob - old_log_prob)
             actor_loss = -tf.reduce_mean(ratio * advantage)
         actor_vars = self.net.actor_trainable_variables
         gradients = flat_concat(tape.gradient(actor_loss, actor_vars))
         self.global_step.assign_add(1)
         return actor_loss, entropy, gradients

Exemple #3

0

Afficher le fichier

 def train(self, memories):
     """Apply one policy-gradient step weighted by ``dc_r``.

     Args:
         memories: Tuple ``(s, visual_s, a, dc_r, cell_state)``. ``dc_r``
             multiplies the action log-probability in the loss
             (presumably the discounted return -- TODO confirm).

     Returns:
         Tuple ``(loss, entropy)``. ``entropy`` is reported only; it is
         not part of the optimized loss.
     """
     s, visual_s, a, dc_r, cell_state = memories
     with tf.device(self.device):
         with tf.GradientTape() as tape:
             # The second value returned by the net is not used afterwards.
             policy_out, _ = self.net(s, visual_s, cell_state=cell_state)
             if self.is_continuous:
                 mu, log_std = policy_out
                 log_act_prob = gaussian_likelihood_sum(a, mu, log_std)
                 entropy = gaussian_entropy(log_std)
             else:
                 logp_all = tf.nn.log_softmax(policy_out)
                 # assumes `a` is one-hot along axis 1 -- TODO confirm
                 picked = tf.multiply(logp_all, a)
                 log_act_prob = tf.reduce_sum(picked, axis=1, keepdims=True)
                 p_log_p = tf.reduce_sum(tf.exp(logp_all) * logp_all,
                                         axis=1, keepdims=True)
                 entropy = -tf.reduce_mean(p_log_p)
             loss = -tf.reduce_mean(log_act_prob * dc_r)
         grads = tape.gradient(loss, self.net.trainable_variables)
         self.optimizer.apply_gradients(
             zip(grads, self.net.trainable_variables))
         self.global_step.assign_add(1)
         return loss, entropy

Exemple #4

0

Afficher le fichier

Fichier : ac.py Projet : wyz1074152339/RLs

    def _train(self, memories, isw, cell_state):
        """Update the critic and the actor once from a replayed batch.

        Args:
            memories: Tuple ``(ss, vvss, a, r, done, old_log_prob)``. ``ss``
                and ``vvss`` go through the representation net with
                ``need_split=True``, which yields two feature tensors; the
                second one is used for the bootstrap value.
            isw: Multiplies the squared TD error in the critic loss
                (presumably importance-sampling weights -- TODO confirm).
            cell_state: Forwarded to the representation net.

        Returns:
            Tuple ``(td_error, summaries)`` where ``summaries`` maps
            logging tags to tensors.
        """
        ss, vvss, a, r, done, old_log_prob = memories
        with tf.device(self.device):
            # persistent=True: critic and actor gradients come from one tape.
            with tf.GradientTape(persistent=True) as tape:
                (feat, next_feat), _ = self._representation_net(
                    ss, vvss, cell_state=cell_state, need_split=True)
                if self.is_continuous:
                    mu, log_std = self.net.policy_net(feat)
                    log_prob = gaussian_likelihood_sum(a, mu, log_std)
                    entropy = gaussian_entropy(log_std)

                    # Bootstrap with the policy mean on the second features.
                    next_mu, _ = self.net.policy_net(next_feat)
                    next_q = self.net.value_net(next_feat, next_mu)
                    max_q_next = tf.stop_gradient(next_q)
                else:
                    logits = self.net.policy_net(feat)
                    logp_all = tf.nn.log_softmax(logits)
                    # assumes `a` is one-hot along axis 1 -- TODO confirm
                    log_prob = tf.reduce_sum(tf.multiply(logp_all, a),
                                             axis=1, keepdims=True)
                    p_log_p = tf.reduce_sum(tf.exp(logp_all) * logp_all,
                                            axis=1, keepdims=True)
                    entropy = -tf.reduce_mean(p_log_p)

                    # Bootstrap with the arg-max action on the second features.
                    next_logits = self.net.policy_net(next_feat)
                    greedy_a = tf.one_hot(tf.argmax(next_logits, axis=1),
                                          self.a_dim)
                    next_q = self.net.value_net(next_feat, greedy_a)
                    max_q_next = tf.stop_gradient(next_q)
                q = self.net.value_net(feat, a)
                # Both the ratio and q enter the actor loss as constants.
                ratio = tf.stop_gradient(tf.exp(log_prob - old_log_prob))
                q_value = tf.stop_gradient(q)
                td_error = q - (r + self.gamma * (1 - done) * max_q_next)
                critic_loss = tf.reduce_mean(tf.square(td_error) * isw)
                actor_loss = -tf.reduce_mean(ratio * log_prob * q_value)

            critic_grads = tape.gradient(
                critic_loss, self.net.critic_trainable_variables)
            self.optimizer_critic.apply_gradients(
                zip(critic_grads, self.net.critic_trainable_variables))
            actor_grads = tape.gradient(
                actor_loss, self.net.actor_trainable_variables)
            self.optimizer_actor.apply_gradients(
                zip(actor_grads, self.net.actor_trainable_variables))
            self.global_step.assign_add(1)

            summaries = {
                'LOSS/actor_loss': actor_loss,
                'LOSS/critic_loss': critic_loss,
                'Statistics/q_max': tf.reduce_max(q),
                'Statistics/q_min': tf.reduce_min(q),
                'Statistics/q_mean': tf.reduce_mean(q),
                'Statistics/ratio': tf.reduce_mean(ratio),
                'Statistics/entropy': entropy,
            }
            return td_error, summaries

Exemple #5

0

Afficher le fichier

Fichier : ppo.py Projet : ncepuwwy97/RLs

    def train_share(self, memories, kl_coef, crsty_loss, cell_state):
        """Apply one PPO step on a network that outputs policy and value.

        The optimized loss sums the clipped surrogate, a KL penalty, a
        quadratic penalty for KL above ``self.kl_cutoff``, the clipped value
        loss, an entropy bonus and ``crsty_loss``.

        Args:
            memories: Tuple ``(s, visual_s, a, dc_r, old_log_prob,
                advantage, old_value)``.
            kl_coef: Weight of the KL term in the actor loss.
            crsty_loss: Extra loss term added unchanged to the total loss.
            cell_state: Forwarded to ``self.get_feature``.

        Returns:
            Tuple ``(actor_loss, value_loss, entropy, kl)``.
        """
        s, visual_s, a, dc_r, old_log_prob, advantage, old_value = memories
        with tf.device(self.device):
            with tf.GradientTape() as tape:
                feat = self.get_feature(s, visual_s, cell_state=cell_state)
                if self.is_continuous:
                    mu, value = self.net(feat)
                    new_log_prob = gaussian_likelihood_sum(a, mu, self.log_std)
                    entropy = gaussian_entropy(self.log_std)
                else:
                    logits, value = self.net(feat)
                    logp_all = tf.nn.log_softmax(logits)
                    # assumes `a` is one-hot along axis 1 -- TODO confirm
                    new_log_prob = tf.reduce_sum(a * logp_all,
                                                 axis=1, keepdims=True)
                    p_log_p = tf.reduce_sum(tf.exp(logp_all) * logp_all,
                                            axis=1, keepdims=True)
                    entropy = -tf.reduce_mean(p_log_p)
                ratio = tf.exp(new_log_prob - old_log_prob)

                # https://github.com/joschu/modular_rl/blob/6970cde3da265cf2a98537250fea5e0c0d9a7639/modular_rl/ppo.py#L40
                # Sample-based KL estimate; kl_reverse flips its sign.
                if self.kl_reverse:
                    log_prob_gap = new_log_prob - old_log_prob
                else:
                    log_prob_gap = old_log_prob - new_log_prob
                kl = tf.reduce_mean(log_prob_gap)
                surrogate = ratio * advantage

                # https://github.com/llSourcell/OpenAI_Five_vs_Dota2_Explained/blob/c5def7e57aa70785c2394ea2eeb3e5f66ad59a53/train.py#L154
                # Keep the new value within value_epsilon of the old one and
                # take the larger of the clipped / unclipped squared errors.
                value_shift = tf.clip_by_value(
                    value - old_value, -self.value_epsilon, self.value_epsilon)
                value_clip = old_value + value_shift
                td_error = dc_r - value
                td_error_clip = dc_r - value_clip
                td_square = tf.maximum(tf.square(td_error),
                                       tf.square(td_error_clip))

                clipped_ratio = tf.clip_by_value(
                    ratio, 1.0 - self.epsilon, 1.0 + self.epsilon)
                pi_loss = -tf.reduce_mean(
                    tf.minimum(surrogate, clipped_ratio * advantage))
                kl_loss = kl_coef * kl
                kl_excess = tf.maximum(0., kl - self.kl_cutoff)
                extra_loss = 1000.0 * tf.square(kl_excess)
                actor_loss = pi_loss + kl_loss + extra_loss
                value_loss = 0.5 * tf.reduce_mean(td_square)
                loss = actor_loss + 1.0 * value_loss - self.beta * entropy + crsty_loss
            grads = tape.gradient(loss, self.net_tv)
            self.optimizer.apply_gradients(zip(grads, self.net_tv))
            self.global_step.assign_add(1)
            return actor_loss, value_loss, entropy, kl

Exemple #6

0

Afficher le fichier

Fichier : ppo.py Projet : zhijie-ai/RLs

    def train_actor(self, BATCH, cell_state, kl_coef):
        """Apply one PPO actor step with optional dual-clip and KL terms.

        Args:
            BATCH: Object exposing ``obs``, ``action``, ``log_prob`` and
                ``gae_adv`` attributes.
            cell_state: Forwarded to ``self.net``.
            kl_coef: Weight of the KL penalty, used only when
                ``self.use_kl_loss`` is set.

        Returns:
            Tuple ``(actor_loss, entropy, kl)``.
        """
        with tf.device(self.device):
            with tf.GradientTape() as tape:
                policy_out, _ = self.net(BATCH.obs, cell_state=cell_state)
                if self.is_continuous:
                    mu, log_std = policy_out
                    new_log_prob = gaussian_likelihood_sum(
                        BATCH.action, mu, log_std)
                    entropy = gaussian_entropy(log_std)
                else:
                    logp_all = tf.nn.log_softmax(policy_out)
                    # assumes BATCH.action is one-hot along axis 1 -- TODO confirm
                    new_log_prob = tf.reduce_sum(BATCH.action * logp_all,
                                                 axis=1, keepdims=True)
                    p_log_p = tf.reduce_sum(tf.exp(logp_all) * logp_all,
                                            axis=1, keepdims=True)
                    entropy = -tf.reduce_mean(p_log_p)

                ratio = tf.exp(new_log_prob - BATCH.log_prob)
                kl = tf.reduce_mean(BATCH.log_prob - new_log_prob)
                surrogate = ratio * BATCH.gae_adv
                # Bound on the surrogate: (1 + eps) * adv where the advantage
                # is positive, (1 - eps) * adv otherwise.
                bound = tf.where(BATCH.gae_adv > 0,
                                 (1 + self.epsilon) * BATCH.gae_adv,
                                 (1 - self.epsilon) * BATCH.gae_adv)
                clipped_surrogate = tf.minimum(surrogate, bound)
                if self.use_duel_clip:
                    # NOTE(review): dual-clip PPO is usually described as
                    # applying this bound only where the advantage is
                    # negative; here it is applied to every sample --
                    # confirm this is intended.
                    duel_bound = (1.0 + self.duel_epsilon) * BATCH.gae_adv
                    clipped_surrogate = tf.maximum(clipped_surrogate,
                                                   duel_bound)

                actor_loss = -(tf.reduce_mean(clipped_surrogate) +
                               self.ent_coef * entropy)

                if self.use_kl_loss:
                    actor_loss = actor_loss + kl_coef * kl
                if self.use_extra_loss:
                    # Quadratic penalty once kl exceeds kl_cutoff.
                    kl_excess = tf.maximum(0., kl - self.kl_cutoff)
                    actor_loss = actor_loss + self.extra_coef * tf.square(kl_excess)

            grads = tape.gradient(actor_loss,
                                  self.net.actor_trainable_variables)
            self.optimizer_actor.apply_gradients(
                zip(grads, self.net.actor_trainable_variables))
            self.global_step.assign_add(1)
            return actor_loss, entropy, kl

Exemple #7

0

Afficher le fichier

Fichier : ppo.py Projet : ncepuwwy97/RLs

    def train_actor(self, memories, kl_coef, cell_state):
        """Apply one PPO actor step with KL and KL-cutoff penalties.

        Args:
            memories: Tuple ``(s, visual_s, a, old_log_prob, advantage)``.
            kl_coef: Weight of the KL term in the actor loss.
            cell_state: Forwarded to ``self.get_feature``.

        Returns:
            Tuple ``(actor_loss, entropy, kl)``.
        """
        s, visual_s, a, old_log_prob, advantage = memories
        with tf.device(self.device):
            # Features are computed before the tape is opened, so the
            # feature extraction itself is not recorded by this tape.
            feat = self.get_feature(s, visual_s, cell_state=cell_state)
            with tf.GradientTape() as tape:
                if self.is_continuous:
                    mu = self.actor_net(feat)
                    new_log_prob = gaussian_likelihood_sum(a, mu, self.log_std)
                    entropy = gaussian_entropy(self.log_std)
                else:
                    logp_all = tf.nn.log_softmax(self.actor_net(feat))
                    # assumes `a` is one-hot along axis 1 -- TODO confirm
                    new_log_prob = tf.reduce_sum(a * logp_all,
                                                 axis=1, keepdims=True)
                    p_log_p = tf.reduce_sum(tf.exp(logp_all) * logp_all,
                                            axis=1, keepdims=True)
                    entropy = -tf.reduce_mean(p_log_p)

                ratio = tf.exp(new_log_prob - old_log_prob)
                kl = tf.reduce_mean(old_log_prob - new_log_prob)
                surrogate = ratio * advantage
                # Bound on the surrogate: (1 + eps) * adv where the advantage
                # is positive, (1 - eps) * adv otherwise.
                min_adv = tf.where(advantage > 0,
                                   (1 + self.epsilon) * advantage,
                                   (1 - self.epsilon) * advantage)
                clipped_mean = tf.reduce_mean(tf.minimum(surrogate, min_adv))
                pi_loss = -(clipped_mean + self.beta * entropy)

                kl_loss = kl_coef * kl
                kl_excess = tf.maximum(0., kl - self.kl_cutoff)
                extra_loss = 1000.0 * tf.square(kl_excess)
                actor_loss = pi_loss + kl_loss + extra_loss

            grads = tape.gradient(actor_loss, self.actor_net_tv)
            self.optimizer_actor.apply_gradients(
                zip(grads, self.actor_net_tv))
            self.global_step.assign_add(1)
            return actor_loss, entropy, kl

Exemple #8

0

Afficher le fichier

Fichier : a2c.py Projet : ncepuwwy97/RLs

 def train(self, memories, crsty_loss, cell_state):
     """Run one A2C update: a critic step followed by an actor step.

     Args:
         memories: Tuple ``(s, visual_s, a, dc_r)``. ``dc_r`` is the
             regression target of the critic (presumably the discounted
             return -- TODO confirm).
         crsty_loss: Extra loss term added unchanged to the critic loss.
         cell_state: Forwarded to ``self.get_feature``.

     Returns:
         Tuple ``(actor_loss, critic_loss, entropy)``.
     """
     s, visual_s, a, dc_r = memories
     with tf.device(self.device):
         with tf.GradientTape() as tape:
             feat = self.get_feature(s, visual_s, cell_state=cell_state)
             v = self.critic_net(feat)
             td_error = dc_r - v
             critic_loss = tf.reduce_mean(tf.square(td_error)) + crsty_loss
         critic_grads = tape.gradient(critic_loss, self.critic_tv)
         self.optimizer_critic.apply_gradients(
             zip(critic_grads, self.critic_tv)
         )
         with tf.GradientTape() as tape:
             if self.is_continuous:
                 mu = self.actor_net(feat)
                 log_act_prob = gaussian_likelihood_sum(a, mu, self.log_std)
                 entropy = gaussian_entropy(self.log_std)
             else:
                 logits = self.actor_net(feat)
                 logp_all = tf.nn.log_softmax(logits)
                 # assumes `a` is one-hot along axis 1 -- TODO confirm
                 log_act_prob = tf.reduce_sum(a * logp_all, axis=1, keepdims=True)
                 entropy = -tf.reduce_mean(
                     tf.reduce_sum(tf.exp(logp_all) * logp_all, axis=1, keepdims=True))
             # The critic is evaluated again here, i.e. after the critic
             # step above has been applied; the result is used as a constant.
             v = self.critic_net(feat)
             advantage = tf.stop_gradient(dc_r - v)
             actor_loss = -(tf.reduce_mean(log_act_prob * advantage) + self.beta * entropy)
         # The actor step is the same for continuous and discrete policies,
         # so no branch on self.is_continuous is needed here.
         actor_grads = tape.gradient(actor_loss, self.actor_tv)
         self.optimizer_actor.apply_gradients(
             zip(actor_grads, self.actor_tv)
         )
         self.global_step.assign_add(1)
         return actor_loss, critic_loss, entropy

Exemple #9

0

Afficher le fichier

    def _train(self, memories, isw, cell_state):
        """Update the option-value, intra-option, termination and interest nets.

        Four losses are built on one persistent tape and each is applied to
        its own set of variables with its own optimizer.

        Args:
            memories: Tuple ``(s, visual_s, a, r, s_, visual_s_, done,
                last_options, options)``. ``last_options`` and ``options``
                are cast to int32 and used as one-hot indices over
                ``self.options_num``.
            isw: Multiplies the squared TD error in ``q_loss``
                (presumably importance-sampling weights -- TODO confirm).
            cell_state: Forwarded to both representation nets.

        Returns:
            Tuple ``(td_error, summaries)`` where ``summaries`` maps
            logging tags to tensors.
        """
        s, visual_s, a, r, s_, visual_s_, done, last_options, options = memories
        last_options = tf.cast(last_options, tf.int32)
        options = tf.cast(options, tf.int32)
        with tf.device(self.device):
            # persistent=True: four gradient() calls below share this tape.
            with tf.GradientTape(persistent=True) as tape:
                feat, _ = self._representation_net(s,
                                                   visual_s,
                                                   cell_state=cell_state)
                feat_, _ = self._representation_target_net(
                    s_, visual_s_, cell_state=cell_state)
                q = self.q_net.value_net(feat)  # [B, P]
                pi = self.intra_option_net.value_net(feat)  # [B, P, A]
                beta = self.termination_net.value_net(feat)  # [B, P]
                q_next = self.q_target_net.value_net(
                    feat_)  # [B, P], [B, P, A], [B, P]
                beta_next = self.termination_net.value_net(feat_)  # [B, P]
                interests = self.interest_net.value_net(feat)  # [B, P]
                options_onehot = tf.one_hot(options,
                                            self.options_num,
                                            dtype=tf.float32)  # [B,] => [B, P]

                # Select the entries belonging to the option in `options`.
                q_s = qu_eval = tf.reduce_sum(q * options_onehot,
                                              axis=-1,
                                              keepdims=True)  # [B, 1]
                beta_s_ = tf.reduce_sum(beta_next * options_onehot,
                                        axis=-1,
                                        keepdims=True)  # [B, 1]
                q_s_ = tf.reduce_sum(q_next * options_onehot,
                                     axis=-1,
                                     keepdims=True)  # [B, 1]
                if self.double_q:
                    # NOTE(review): the arg-max option is taken from the
                    # online net on `feat`, not on `feat_` -- confirm this
                    # is the intended double-Q selection.
                    q_ = self.q_net.value_net(
                        feat)  # [B, P], [B, P, A], [B, P]
                    max_a_idx = tf.one_hot(
                        tf.argmax(q_, axis=-1),
                        self.options_num,
                        dtype=tf.float32)  # [B, P] => [B, ] => [B, P]
                    q_s_max = tf.reduce_sum(q_next * max_a_idx,
                                            axis=-1,
                                            keepdims=True)  # [B, 1]
                else:
                    q_s_max = tf.reduce_max(q_next, axis=-1,
                                            keepdims=True)  # [B, 1]
                # Mix "keep the option" and "switch to the best option"
                # values by the termination output at the next features.
                u_target = (1 - beta_s_) * q_s_ + beta_s_ * q_s_max  # [B, 1]
                qu_target = tf.stop_gradient(r + self.gamma *
                                             (1 - done) * u_target)
                td_error = qu_target - qu_eval  # gradient : q
                q_loss = tf.reduce_mean(tf.square(td_error) *
                                        isw)  # [B, 1] => 1

                # The policy-gradient weight is a constant for the actor.
                if self.use_baseline:
                    adv = tf.stop_gradient(qu_target - qu_eval)
                else:
                    adv = tf.stop_gradient(qu_target)
                options_onehot_expanded = tf.expand_dims(
                    options_onehot, axis=-1)  # [B, P] => [B, P, 1]
                pi = tf.reduce_sum(pi * options_onehot_expanded,
                                   axis=1)  # [B, P, A] => [B, A]
                if self.is_continuous:
                    log_std = tf.gather(self.log_std, options)
                    mu = tf.math.tanh(pi)
                    log_p = gaussian_likelihood_sum(a, mu, log_std)
                    entropy = gaussian_entropy(log_std)
                else:
                    pi = pi / self.boltzmann_temperature
                    log_pi = tf.nn.log_softmax(pi, axis=-1)  # [B, A]
                    entropy = -tf.reduce_sum(tf.exp(log_pi) * log_pi,
                                             axis=1,
                                             keepdims=True)  # [B, 1]
                    log_p = tf.reduce_sum(a * log_pi, axis=-1,
                                          keepdims=True)  # [B, 1]
                pi_loss = tf.reduce_mean(
                    -(log_p * adv + self.ent_coff * entropy)
                )  # [B, 1] * [B, 1] => [B, 1] => 1

                last_options_onehot = tf.one_hot(
                    last_options, self.options_num,
                    dtype=tf.float32)  # [B,] => [B, P]
                beta_s = tf.reduce_sum(beta * last_options_onehot,
                                       axis=-1,
                                       keepdims=True)  # [B, 1]

                # Distribution over options; q enters as a constant here, so
                # this path sends no gradient into q.
                pi_op = tf.nn.softmax(
                    interests *
                    tf.stop_gradient(q))  # [B, P] or tf.nn.softmax(q)
                interest_loss = -tf.reduce_mean(beta_s * tf.reduce_sum(
                    pi_op * options_onehot, axis=-1, keepdims=True) *
                                                q_s)  # [B, 1] => 1

                v_s = tf.reduce_sum(q * pi_op, axis=-1,
                                    keepdims=True)  # [B, P] * [B, P] => [B, 1]
                beta_loss = beta_s * tf.stop_gradient(q_s - v_s)  # [B, 1]
                if self.terminal_mask:
                    # Zero the termination loss where `done` is 1.
                    beta_loss *= (1 - done)
                beta_loss = tf.reduce_mean(beta_loss)  # [B, 1] => 1

            # All gradients are computed first, then all optimizer steps run.
            q_grads = tape.gradient(q_loss, self.q_net.trainable_variables)
            intra_option_grads = tape.gradient(pi_loss, self.actor_tv)
            termination_grads = tape.gradient(
                beta_loss, self.termination_net.trainable_variables)
            interest_grads = tape.gradient(
                interest_loss, self.interest_net.trainable_variables)
            self.q_optimizer.apply_gradients(
                zip(q_grads, self.q_net.trainable_variables))
            self.intra_option_optimizer.apply_gradients(
                zip(intra_option_grads, self.actor_tv))
            self.termination_optimizer.apply_gradients(
                zip(termination_grads,
                    self.termination_net.trainable_variables))
            self.interest_optimizer.apply_gradients(
                zip(interest_grads, self.interest_net.trainable_variables))
            self.global_step.assign_add(1)
            return td_error, dict(
                [['LOSS/q_loss', tf.reduce_mean(q_loss)],
                 ['LOSS/pi_loss', tf.reduce_mean(pi_loss)],
                 ['LOSS/beta_loss',
                  tf.reduce_mean(beta_loss)],
                 ['LOSS/interest_loss',
                  tf.reduce_mean(interest_loss)],
                 ['Statistics/q_option_max',
                  tf.reduce_max(q_s)],
                 ['Statistics/q_option_min',
                  tf.reduce_min(q_s)],
                 ['Statistics/q_option_mean',
                  tf.reduce_mean(q_s)]])

Exemple #10

0

Afficher le fichier

    def train_continuous(self, BATCH, isw, cell_state):
        """Update the actor, the twin Q / V critics and optionally alpha.

        Args:
            BATCH: Object exposing ``obs``, ``obs_``, ``action``, ``reward``
                and ``done`` attributes.
            isw: Multiplies the squared errors in the critic losses
                (presumably importance-sampling weights -- TODO confirm).
            cell_state: Forwarded to the representation and target nets.

        Returns:
            Tuple ``(td_error, summaries)`` where ``td_error`` is the mean
            of the two Q errors and ``summaries`` maps logging tags to
            tensors.
        """
        with tf.device(self.device):
            # persistent=True: up to three gradient() calls share this tape.
            with tf.GradientTape(persistent=True) as tape:
                feat, _ = self._representation_net(
                    BATCH.obs, cell_state=cell_state)
                v = self.v_net.value_net(feat)
                v_target, _ = self.v_target_net(
                    BATCH.obs_, cell_state=cell_state)

                if self.is_continuous:
                    mu, log_std = self.actor_net.value_net(feat)
                    pi, log_pi = squash_rsample(mu, log_std)
                    entropy = gaussian_entropy(log_std)
                else:
                    logits = self.actor_net.value_net(feat)
                    logp_all = tf.nn.log_softmax(logits)
                    gumbel_noise = tf.cast(
                        self.gumbel_dist.sample(BATCH.action.shape),
                        dtype=tf.float32)
                    soft_pi = tf.nn.softmax(
                        (logp_all + gumbel_noise) / self.discrete_tau)
                    hard_pi = tf.one_hot(tf.argmax(soft_pi, axis=-1),
                                         self.a_dim)
                    # Forward value is the one-hot sample; the gradient
                    # flows only through soft_pi.
                    pi = tf.stop_gradient(hard_pi - soft_pi) + soft_pi
                    log_pi = tf.reduce_sum(tf.multiply(logp_all, pi),
                                           axis=1, keepdims=True)
                    p_log_p = tf.reduce_sum(tf.exp(logp_all) * logp_all,
                                            axis=1, keepdims=True)
                    entropy = -tf.reduce_mean(p_log_p)

                q1, q2 = self.q_net.get_value(feat, BATCH.action)
                q1_pi, q2_pi = self.q_net.get_value(feat, pi)
                # Q target, treated as a constant.
                dc_r = tf.stop_gradient(BATCH.reward + self.gamma * v_target *
                                        (1 - BATCH.done))
                # V target, treated as a constant.
                v_from_q_stop = tf.stop_gradient(
                    tf.minimum(q1_pi, q2_pi) - self.alpha * log_pi)
                td_v = v - v_from_q_stop
                td_error1 = q1 - dc_r
                td_error2 = q2 - dc_r
                q1_loss = tf.reduce_mean(tf.square(td_error1) * isw)
                q2_loss = tf.reduce_mean(tf.square(td_error2) * isw)
                v_loss_stop = tf.reduce_mean(tf.square(td_v) * isw)
                critic_loss = 0.5 * q1_loss + 0.5 * q2_loss + 0.5 * v_loss_stop
                actor_loss = -tf.reduce_mean(q1_pi - self.alpha * log_pi)
                if self.auto_adaption:
                    alpha_loss = -tf.reduce_mean(
                        self.alpha *
                        tf.stop_gradient(log_pi + self.target_entropy))

            actor_vars = self.actor_net.trainable_variables
            actor_grads = tape.gradient(actor_loss, actor_vars)
            self.optimizer_actor.apply_gradients(zip(actor_grads, actor_vars))

            # Q and V variables are updated together by one optimizer.
            critic_vars = (self.q_net.trainable_variables +
                           self.v_net.trainable_variables)
            critic_grads = tape.gradient(critic_loss, critic_vars)
            self.optimizer_critic.apply_gradients(
                zip(critic_grads, critic_vars))

            if self.auto_adaption:
                alpha_grad = tape.gradient(alpha_loss, self.log_alpha)
                self.optimizer_alpha.apply_gradients(
                    [(alpha_grad, self.log_alpha)])
            self.global_step.assign_add(1)

            q_low = tf.minimum(q1, q2)
            summaries = {
                'LOSS/actor_loss': actor_loss,
                'LOSS/q1_loss': q1_loss,
                'LOSS/q2_loss': q2_loss,
                'LOSS/v_loss': v_loss_stop,
                'LOSS/critic_loss': critic_loss,
                'Statistics/log_alpha': self.log_alpha,
                'Statistics/alpha': self.alpha,
                'Statistics/entropy': entropy,
                'Statistics/q_min': tf.reduce_min(q_low),
                'Statistics/q_mean': tf.reduce_mean(q_low),
                'Statistics/q_max': tf.reduce_max(tf.maximum(q1, q2)),
                'Statistics/v_mean': tf.reduce_mean(v),
            }
            if self.auto_adaption:
                summaries['LOSS/alpha_loss'] = alpha_loss
            return (td_error1 + td_error2) / 2, summaries

Exemple #11

0

Afficher le fichier

Fichier : oc.py Projet : wyz1074152339/RLs

    def _train(self, memories, isw, cell_state):
        s, visual_s, a, r, s_, visual_s_, done, last_options, options = memories
        last_options = tf.cast(last_options, tf.int32)
        options = tf.cast(options, tf.int32)
        with tf.device(self.device):
            with tf.GradientTape(persistent=True) as tape:
                feat, _ = self._representation_net(s,
                                                   visual_s,
                                                   cell_state=cell_state)
                feat_, _ = self._representation_target_net(
                    s_, visual_s_, cell_state=cell_state)
                q = self.q_net.value_net(feat)  # [B, P]
                pi = self.intra_option_net.value_net(feat)  # [B, P, A]
                beta = self.termination_net.value_net(feat)  # [B, P]
                q_next = self.q_target_net.value_net(
                    feat_)  # [B, P], [B, P, A], [B, P]
                beta_next = self.termination_net.value_net(feat_)  # [B, P]
                options_onehot = tf.one_hot(options,
                                            self.options_num,
                                            dtype=tf.float32)  # [B,] => [B, P]

                q_s = qu_eval = tf.reduce_sum(q * options_onehot,
                                              axis=-1,
                                              keepdims=True)  # [B, 1]
                beta_s_ = tf.reduce_sum(beta_next * options_onehot,
                                        axis=-1,
                                        keepdims=True)  # [B, 1]
                q_s_ = tf.reduce_sum(q_next * options_onehot,
                                     axis=-1,
                                     keepdims=True)  # [B, 1]
                # https://github.com/jeanharb/option_critic/blob/5d6c81a650a8f452bc8ad3250f1f211d317fde8c/neural_net.py#L94
                if self.double_q:
                    q_ = self.q_net.value_net(
                        feat)  # [B, P], [B, P, A], [B, P]
                    max_a_idx = tf.one_hot(
                        tf.argmax(q_, axis=-1),
                        self.options_num,
                        dtype=tf.float32)  # [B, P] => [B, ] => [B, P]
                    q_s_max = tf.reduce_sum(q_next * max_a_idx,
                                            axis=-1,
                                            keepdims=True)  # [B, 1]
                else:
                    q_s_max = tf.reduce_max(q_next, axis=-1,
                                            keepdims=True)  # [B, 1]
                u_target = (1 - beta_s_) * q_s_ + beta_s_ * q_s_max  # [B, 1]
                qu_target = tf.stop_gradient(r + self.gamma *
                                             (1 - done) * u_target)
                td_error = qu_target - qu_eval  # gradient : q
                q_loss = tf.reduce_mean(tf.square(td_error) *
                                        isw)  # [B, 1] => 1

                # https://github.com/jeanharb/option_critic/blob/5d6c81a650a8f452bc8ad3250f1f211d317fde8c/neural_net.py#L130
                if self.use_baseline:
                    adv = tf.stop_gradient(qu_target - qu_eval)
                else:
                    adv = tf.stop_gradient(qu_target)
                options_onehot_expanded = tf.expand_dims(
                    options_onehot, axis=-1)  # [B, P] => [B, P, 1]
                pi = tf.reduce_sum(pi * options_onehot_expanded,
                                   axis=1)  # [B, P, A] => [B, A]
                if self.is_continuous:
                    log_std = tf.gather(self.log_std, options)
                    mu = tf.math.tanh(pi)
                    log_p = gaussian_likelihood_sum(a, mu, log_std)
                    entropy = gaussian_entropy(log_std)
                else:
                    pi = pi / self.boltzmann_temperature
                    log_pi = tf.nn.log_softmax(pi, axis=-1)  # [B, A]
                    entropy = -tf.reduce_sum(tf.exp(log_pi) * log_pi,
                                             axis=1,
                                             keepdims=True)  # [B, 1]
                    log_p = tf.reduce_sum(a * log_pi, axis=-1,
                                          keepdims=True)  # [B, 1]
                pi_loss = tf.reduce_mean(
                    -(log_p * adv + self.ent_coff * entropy)
                )  # [B, 1] * [B, 1] => [B, 1] => 1

                last_options_onehot = tf.one_hot(
                    last_options, self.options_num,
                    dtype=tf.float32)  # [B,] => [B, P]
                beta_s = tf.reduce_sum(beta * last_options_onehot,
                                       axis=-1,
                                       keepdims=True)  # [B, 1]
                if self.use_eps_greedy:
                    v_s = tf.reduce_max(
                        q, axis=-1,
                        keepdims=True) - self.termination_regularizer  # [B, 1]
                else:
                    v_s = (1 - beta_s) * q_s + beta_s * tf.reduce_max(
                        q, axis=-1, keepdims=True)  # [B, 1]
                    # v_s = tf.reduce_mean(q, axis=-1, keepdims=True)   # [B, 1]
                beta_loss = beta_s * tf.stop_gradient(q_s - v_s)  # [B, 1]
                # https://github.com/lweitkamp/option-critic-pytorch/blob/0c57da7686f8903ed2d8dded3fae832ee9defd1a/option_critic.py#L238
                if self.terminal_mask:
                    beta_loss *= (1 - done)
                beta_loss = tf.reduce_mean(beta_loss)  # [B, 1] => 1

            q_grads = tape.gradient(q_loss, self.q_net.trainable_variables)
            intra_option_grads = tape.gradient(pi_loss, self.actor_tv)
            termination_grads = tape.gradient(
                beta_loss, self.termination_net.trainable_variables)
            self.q_optimizer.apply_gradients(
                zip(q_grads, self.q_net.trainable_variables))
            self.intra_option_optimizer.apply_gradients(
                zip(intra_option_grads, self.actor_tv))
            self.termination_optimizer.apply_gradients(
                zip(termination_grads,
                    self.termination_net.trainable_variables))
            self.global_step.assign_add(1)
            return td_error, dict(
                [['LOSS/q_loss', tf.reduce_mean(q_loss)],
                 ['LOSS/pi_loss', tf.reduce_mean(pi_loss)],
                 ['LOSS/beta_loss',
                  tf.reduce_mean(beta_loss)],
                 ['Statistics/q_option_max',
                  tf.reduce_max(q_s)],
                 ['Statistics/q_option_min',
                  tf.reduce_min(q_s)],
                 ['Statistics/q_option_mean',
                  tf.reduce_mean(q_s)]])

Exemple #12

0

Afficher le fichier

    def train(self, memories, isw, crsty_loss, cell_state):
        """Run one soft actor-critic update: critics, then actor, then alpha.

        Three separate gradient tapes are used, so the critic step is applied
        before the actor loss is evaluated and the actor step is applied
        before the temperature loss is evaluated.

        Args:
            memories: Tuple ``(ss, vvss, a, r, done)``. ``ss`` and ``vvss`` are
                handed to ``self.get_feature`` (presumably stacked current and
                next observations -- confirm against the caller).
            isw: Weights multiplied into the squared TD errors.
            crsty_loss: Extra loss term added to the critic loss.
            cell_state: Forwarded to ``self.get_feature``.

        Returns:
            ``(td_error, summaries)``: the average of both critics' TD errors
            and a dict mapping summary tags to tensors.
        """
        ss, vvss, a, r, done = memories
        batch_size = tf.shape(a)[0]
        with tf.device(self.device):
            # ---- critic update -------------------------------------------
            with tf.GradientTape() as tape:
                feat, feat_ = self.get_feature(ss,
                                               vvss,
                                               cell_state=cell_state,
                                               s_and_s_=True)
                if self.is_continuous:
                    target_mu, target_log_std = self.actor_net(feat_)
                    # Use the same log-std bounds as every other actor
                    # evaluation in this method, so the bootstrap action is
                    # drawn from the same (clipped) policy that is trained.
                    target_log_std = clip_nn_log_std(target_log_std,
                                                     self.log_std_min,
                                                     self.log_std_max)
                    target_pi, target_log_pi = squash_rsample(
                        target_mu, target_log_std)
                else:
                    target_logits = self.actor_net(feat_)
                    target_cate_dist = tfp.distributions.Categorical(
                        target_logits)
                    target_pi = target_cate_dist.sample()
                    target_log_pi = target_cate_dist.log_prob(target_pi)
                    target_pi = tf.one_hot(target_pi,
                                           self.a_dim,
                                           dtype=tf.float32)
                q1, q2 = self.critic_net(feat, a)
                q1_target, q2_target = self.critic_target_net(feat_, target_pi)
                # Each critic bootstraps from its own target head; the
                # entropy bonus -alpha * log_pi is part of the soft target.
                dc_r_q1 = tf.stop_gradient(
                    r + self.gamma * (1 - done) *
                    (q1_target - self.alpha * target_log_pi))
                dc_r_q2 = tf.stop_gradient(
                    r + self.gamma * (1 - done) *
                    (q2_target - self.alpha * target_log_pi))
                td_error1 = q1 - dc_r_q1
                td_error2 = q2 - dc_r_q2
                q1_loss = tf.reduce_mean(tf.square(td_error1) * isw)
                q2_loss = tf.reduce_mean(tf.square(td_error2) * isw)
                critic_loss = 0.5 * q1_loss + 0.5 * q2_loss + crsty_loss
            critic_grads = tape.gradient(critic_loss, self.critic_tv)
            self.optimizer_critic.apply_gradients(
                zip(critic_grads, self.critic_tv))

            # ---- actor update --------------------------------------------
            with tf.GradientTape() as tape:
                if self.is_continuous:
                    mu, log_std = self.actor_net(feat)
                    log_std = clip_nn_log_std(log_std, self.log_std_min,
                                              self.log_std_max)
                    pi, log_pi = squash_rsample(mu, log_std)
                    entropy = gaussian_entropy(log_std)
                else:
                    logits = self.actor_net(feat)
                    logp_all = tf.nn.log_softmax(logits)
                    gumbel_noise = tf.cast(self.gumbel_dist.sample(
                        [batch_size, self.a_dim]),
                                           dtype=tf.float32)
                    _pi = tf.nn.softmax(
                        (logp_all + gumbel_noise) / self.discrete_tau)
                    _pi_true_one_hot = tf.one_hot(tf.argmax(_pi, axis=-1),
                                                  self.a_dim)
                    # Straight-through estimator: the forward value is the
                    # one-hot action, the gradient flows through the softmax.
                    _pi_diff = tf.stop_gradient(_pi_true_one_hot - _pi)
                    pi = _pi_diff + _pi
                    log_pi = tf.reduce_sum(tf.multiply(logp_all, pi),
                                           axis=1,
                                           keepdims=True)
                    entropy = -tf.reduce_mean(
                        tf.reduce_sum(tf.exp(logp_all) * logp_all,
                                      axis=1,
                                      keepdims=True))
                q_s_pi = self.critic_net.get_min(feat, pi)
                actor_loss = -tf.reduce_mean(q_s_pi - self.alpha * log_pi)
            actor_grads = tape.gradient(actor_loss, self.actor_tv)
            self.optimizer_actor.apply_gradients(
                zip(actor_grads, self.actor_tv))

            # ---- temperature (alpha) update ------------------------------
            if self.auto_adaption:
                with tf.GradientTape() as tape:
                    if self.is_continuous:
                        mu, log_std = self.actor_net(feat)
                        log_std = clip_nn_log_std(log_std, self.log_std_min,
                                                  self.log_std_max)
                        norm_dist = tfp.distributions.Normal(
                            loc=mu, scale=tf.exp(log_std))
                        log_pi = tf.reduce_sum(norm_dist.log_prob(
                            norm_dist.sample()),
                                               axis=-1)
                    else:
                        logits = self.actor_net(feat)
                        cate_dist = tfp.distributions.Categorical(logits)
                        log_pi = cate_dist.log_prob(cate_dist.sample())
                    # $J(\alpha)=\mathbb{E}_{\mathbf{a}_{t} \sim \pi_{t}}\left[-\alpha \log \pi_{t}\left(\mathbf{a}_{t} | \mathbf{s}_{t}\right)-\alpha \overline{\mathcal{H}}\right]$
                    # \overline{\mathcal{H}} is negative
                    alpha_loss = -tf.reduce_mean(
                        self.alpha *
                        tf.stop_gradient(log_pi + self.target_entropy))
                alpha_grad = tape.gradient(alpha_loss, self.log_alpha)
                self.optimizer_alpha.apply_gradients([(alpha_grad,
                                                       self.log_alpha)])
            self.global_step.assign_add(1)
            summaries = dict(
                [['LOSS/actor_loss', actor_loss], ['LOSS/q1_loss', q1_loss],
                 ['LOSS/q2_loss', q2_loss], ['LOSS/critic_loss', critic_loss],
                 ['Statistics/log_alpha', self.log_alpha],
                 ['Statistics/alpha', self.alpha],
                 ['Statistics/entropy', entropy],
                 ['Statistics/q_min',
                  tf.reduce_min(tf.minimum(q1, q2))],
                 ['Statistics/q_mean',
                  tf.reduce_mean(tf.minimum(q1, q2))],
                 ['Statistics/q_max',
                  tf.reduce_max(tf.maximum(q1, q2))]])
            if self.auto_adaption:
                summaries.update({'LOSS/alpha_loss': alpha_loss})
            return (td_error1 + td_error2) / 2., summaries

Exemple #13

0

Afficher le fichier

Fichier : sac_v.py Projet : ncepuwwy97/RLs

 def train_persistent(self, memories, isw, crsty_loss, cell_state):
     """Single-tape SAC-V update of the actor, the Q/V critics and alpha.

     All losses are recorded on one persistent gradient tape; the actor,
     critic and (when ``self.auto_adaption`` is set) temperature optimizers
     are then stepped in that order.

     Args:
         memories: Tuple of five items, unpacked as vector observations,
             visual observations, actions, rewards and done flags.
         isw: Weights multiplied into every squared error term.
         crsty_loss: Extra loss term added to the critic loss.
         cell_state: Forwarded to ``self.get_feature``.

     Returns:
         ``(td_error, summaries)``: the average of the two Q residuals and a
         dict mapping summary tags to tensors.
     """
     obs_vec, obs_vis, act, rew, terminal = memories
     n_batch = tf.shape(act)[0]
     with tf.device(self.device):
         with tf.GradientTape(persistent=True) as tape:
             cur_feat, next_feat = self.get_feature(
                 obs_vec, obs_vis, cell_state=cell_state, s_and_s_=True)
             if not self.is_continuous:
                 all_logp = tf.nn.log_softmax(self.actor_net(cur_feat))
                 noise = tf.cast(
                     self.gumbel_dist.sample([n_batch, self.a_dim]),
                     dtype=tf.float32)
                 soft_act = tf.nn.softmax(
                     (all_logp + noise) / self.discrete_tau)
                 hard_act = tf.one_hot(tf.argmax(soft_act, axis=-1),
                                       self.a_dim)
                 # Straight-through: one-hot forward value, softmax gradient.
                 new_act = tf.stop_gradient(hard_act - soft_act) + soft_act
                 new_logp = tf.reduce_sum(
                     tf.multiply(all_logp, new_act), axis=1, keepdims=True)
                 per_sample_neg_ent = tf.reduce_sum(
                     tf.exp(all_logp) * all_logp, axis=1, keepdims=True)
                 entropy = -tf.reduce_mean(per_sample_neg_ent)
             else:
                 mean, raw_log_std = self.actor_net(cur_feat)
                 bounded_log_std = clip_nn_log_std(
                     raw_log_std, self.log_std_min, self.log_std_max)
                 new_act, new_logp = squash_rsample(mean, bounded_log_std)
                 entropy = gaussian_entropy(bounded_log_std)

             q1, q2 = self.q_net(cur_feat, act)
             v = self.v_net(cur_feat)
             q1_new, q2_new = self.q_net(cur_feat, new_act)
             next_v = self.v_target_net(next_feat)

             # Regression targets; neither receives gradient.
             q_backup = tf.stop_gradient(
                 rew + self.gamma * next_v * (1 - terminal))
             v_backup = tf.stop_gradient(
                 tf.minimum(q1_new, q2_new) - self.alpha * new_logp)

             v_residual = v - v_backup
             q1_residual = q1 - q_backup
             q2_residual = q2 - q_backup
             q1_loss = tf.reduce_mean(tf.square(q1_residual) * isw)
             q2_loss = tf.reduce_mean(tf.square(q2_residual) * isw)
             v_loss = tf.reduce_mean(tf.square(v_residual) * isw)
             critic_loss = 0.5 * q1_loss + 0.5 * q2_loss + 0.5 * v_loss + crsty_loss
             actor_loss = -tf.reduce_mean(q1_new - self.alpha * new_logp)
             if self.auto_adaption:
                 alpha_loss = -tf.reduce_mean(
                     self.alpha *
                     tf.stop_gradient(new_logp + self.target_entropy))

         # Optimizer steps: actor first, then critic, then temperature.
         self.optimizer_actor.apply_gradients(
             zip(tape.gradient(actor_loss, self.actor_tv), self.actor_tv))
         self.optimizer_critic.apply_gradients(
             zip(tape.gradient(critic_loss, self.critic_tv), self.critic_tv))
         if self.auto_adaption:
             alpha_grad = tape.gradient(alpha_loss, self.log_alpha)
             self.optimizer_alpha.apply_gradients(
                 [(alpha_grad, self.log_alpha)])
         self.global_step.assign_add(1)

         summaries = {
             'LOSS/actor_loss': actor_loss,
             'LOSS/q1_loss': q1_loss,
             'LOSS/q2_loss': q2_loss,
             'LOSS/v_loss': v_loss,
             'LOSS/critic_loss': critic_loss,
             'Statistics/log_alpha': self.log_alpha,
             'Statistics/alpha': self.alpha,
             'Statistics/entropy': entropy,
             'Statistics/q_min': tf.reduce_min(tf.minimum(q1, q2)),
             'Statistics/q_mean': tf.reduce_mean(tf.minimum(q1, q2)),
             'Statistics/q_max': tf.reduce_max(tf.maximum(q1, q2)),
             'Statistics/v_mean': tf.reduce_mean(v),
         }
         if self.auto_adaption:
             summaries['LOSS/alpha_loss'] = alpha_loss
         return (q1_residual + q2_residual) / 2, summaries

Exemple #14

0

Afficher le fichier

    def share(self, BATCH, cell_state, kl_coef):
        """Run one joint update of the shared option-critic network.

        A single loss combines the clipped-surrogate intra-option policy loss
        (plus KL penalties), the clipped value loss, the clipped-surrogate
        option-policy loss, the termination loss and two entropy bonuses, and
        is minimised with ``self.optimizer`` over ``self.net_tv``.

        Args:
            BATCH: Batch object; this method reads ``obs``, ``action``,
                ``log_prob``, ``value``, ``gae_adv``, ``discounted_reward``,
                ``beta_advantage``, ``o_log_prob``, ``options``,
                ``last_options`` and, when ``self.terminal_mask`` is set,
                ``done``.
            cell_state: Forwarded to ``self.net``.
            kl_coef: Coefficient of the KL penalty term.

        Returns:
            Tuple ``(loss, pi_loss, q_loss, o_loss, beta_loss, entropy,
            o_entropy, kl)``.
        """
        last_options = tf.cast(BATCH.last_options, tf.int32)  # [B,]
        options = tf.cast(BATCH.options, tf.int32)
        with tf.device(self.device):
            with tf.GradientTape() as tape:
                (q, pi, beta, o), cell_state = self.net(
                    BATCH.obs,
                    cell_state=cell_state)  # [B, P], [B, P, A], [B, P], [B, P]

                options_onehot = tf.one_hot(options,
                                            self.options_num,
                                            dtype=tf.float32)  # [B, P]
                options_onehot_expanded = tf.expand_dims(options_onehot,
                                                         axis=-1)  # [B, P, 1]
                last_options_onehot = tf.one_hot(
                    last_options, self.options_num,
                    dtype=tf.float32)  # [B,] => [B, P]

                # Select the outputs belonging to the option actually used.
                pi = tf.reduce_sum(pi * options_onehot_expanded,
                                   axis=1)  # [B, P, A] => [B, A]
                value = tf.reduce_sum(q * options_onehot,
                                      axis=1,
                                      keepdims=True)  # [B, 1]

                if self.is_continuous:
                    log_std = tf.gather(self.log_std, options)
                    mu = pi  # [B, A]
                    new_log_prob = gaussian_likelihood_sum(
                        BATCH.action, mu, log_std)
                    entropy = gaussian_entropy(log_std)
                else:
                    logits = pi  # [B, A]
                    logp_all = tf.nn.log_softmax(logits)
                    new_log_prob = tf.reduce_sum(BATCH.action * logp_all,
                                                 axis=1,
                                                 keepdims=True)
                    entropy = -tf.reduce_mean(
                        tf.reduce_sum(tf.exp(logp_all) * logp_all,
                                      axis=1,
                                      keepdims=True))
                ratio = tf.exp(new_log_prob - BATCH.log_prob)

                if self.kl_reverse:
                    kl = tf.reduce_mean(new_log_prob - BATCH.log_prob)
                else:
                    kl = tf.reduce_mean(
                        BATCH.log_prob - new_log_prob
                    )  # a sample estimate for KL-divergence, easy to compute
                surrogate = ratio * BATCH.gae_adv

                value_clip = BATCH.value + tf.clip_by_value(
                    value - BATCH.value, -self.value_epsilon,
                    self.value_epsilon)
                td_error = BATCH.discounted_reward - value
                td_error_clip = BATCH.discounted_reward - value_clip
                td_square = tf.maximum(tf.square(td_error),
                                       tf.square(td_error_clip))

                pi_loss = -tf.reduce_mean(
                    tf.minimum(
                        surrogate,
                        tf.clip_by_value(ratio, 1.0 - self.epsilon,
                                         1.0 + self.epsilon) * BATCH.gae_adv))
                kl_loss = kl_coef * kl
                extra_loss = 1000.0 * tf.square(
                    tf.maximum(0., kl - self.kl_cutoff))
                pi_loss = pi_loss + kl_loss + extra_loss
                q_loss = 0.5 * tf.reduce_mean(td_square)

                beta_s = tf.reduce_sum(beta * last_options_onehot,
                                       axis=-1,
                                       keepdims=True)  # [B, 1]
                beta_loss = beta_s * BATCH.beta_advantage  # per-sample term
                if self.terminal_mask:
                    # Mask terminal transitions per sample *before* averaging:
                    # masking after the mean would broadcast the scalar back
                    # to a batch-shaped tensor and make `loss` non-scalar.
                    # NOTE(review): assumes BATCH.done has the same shape as
                    # beta_s ([B, 1]) -- confirm against the batch definition.
                    beta_loss *= (1 - BATCH.done)
                beta_loss = tf.reduce_mean(beta_loss)  # => scalar

                o_log_prob = tf.reduce_sum(o * options_onehot,
                                           axis=-1,
                                           keepdims=True)  # [B, 1]
                o_ratio = tf.exp(o_log_prob - BATCH.o_log_prob)
                o_entropy = -tf.reduce_mean(
                    tf.reduce_sum(tf.exp(o) * o, axis=1, keepdims=True))
                o_loss = -tf.reduce_mean(
                    tf.minimum(
                        o_ratio * BATCH.gae_adv,
                        tf.clip_by_value(o_ratio, 1.0 - self.epsilon,
                                         1.0 + self.epsilon) * BATCH.gae_adv))

                loss = pi_loss + 1.0 * q_loss + o_loss + beta_loss - self.pi_beta * entropy - self.o_beta * o_entropy
            loss_grads = tape.gradient(loss, self.net_tv)
            self.optimizer.apply_gradients(zip(loss_grads, self.net_tv))
            self.global_step.assign_add(1)
            return loss, pi_loss, q_loss, o_loss, beta_loss, entropy, o_entropy, kl

Exemple #15

0

Afficher le fichier

Fichier : tac.py Projet : wyz1074152339/RLs

    def train(self, memories, isw, cell_state):
        """Single-tape update of the twin critics, the actor and alpha.

        All losses are recorded on one persistent gradient tape; the critic,
        actor and (when ``self.auto_adaption`` is set) temperature optimizers
        are then stepped in that order.

        Args:
            memories: Tuple of seven items, unpacked as stacked vector and
                visual observations, actions, rewards, done flags, and the
                next-step vector and visual observations.
            isw: Weights multiplied into the squared TD errors.
            cell_state: Forwarded to the representation and target networks.

        Returns:
            ``(td_error, summaries)``: the average of both critics' TD errors
            and a dict mapping summary tags to tensors.
        """
        obs_vec, obs_vis, act, rew, terminal, next_vec, next_vis = memories
        with tf.device(self.device):
            with tf.GradientTape(persistent=True) as tape:
                (cur_feat, next_feat), _ = self._representation_net(
                    obs_vec, obs_vis, cell_state=cell_state, need_split=True)
                if not self.is_continuous:
                    all_logp = tf.nn.log_softmax(
                        self.actor_net.value_net(cur_feat))
                    noise = tf.cast(self.gumbel_dist.sample(act.shape),
                                    dtype=tf.float32)
                    soft_act = tf.nn.softmax(
                        (all_logp + noise) / self.discrete_tau)
                    hard_act = tf.one_hot(tf.argmax(soft_act, axis=-1),
                                          self.a_dim)
                    # Straight-through: one-hot forward, softmax gradient.
                    new_act = tf.stop_gradient(hard_act - soft_act) + soft_act
                    new_logp = tf.reduce_sum(
                        tf.multiply(all_logp, new_act), axis=1, keepdims=True)
                    per_sample_neg_ent = tf.reduce_sum(
                        tf.exp(all_logp) * all_logp, axis=1, keepdims=True)
                    entropy = -tf.reduce_mean(per_sample_neg_ent)

                    next_logits = self.actor_net.value_net(next_feat)
                    next_dist = tfp.distributions.Categorical(
                        logits=tf.nn.log_softmax(next_logits))
                    next_idx = next_dist.sample()
                    next_logp = next_dist.log_prob(next_idx)
                    next_act = tf.one_hot(next_idx,
                                          self.a_dim,
                                          dtype=tf.float32)
                else:
                    mean, raw_log_std = self.actor_net.value_net(cur_feat)
                    bounded_log_std = clip_nn_log_std(
                        raw_log_std, self.log_std_min, self.log_std_max)
                    new_act, new_logp = tsallis_squash_rsample(
                        mean, bounded_log_std, self.entropic_index)
                    entropy = gaussian_entropy(bounded_log_std)

                    next_mean, next_raw_log_std = self.actor_net.value_net(
                        next_feat)
                    next_bounded_log_std = clip_nn_log_std(
                        next_raw_log_std, self.log_std_min, self.log_std_max)
                    next_act, next_logp = tsallis_squash_rsample(
                        next_mean, next_bounded_log_std, self.entropic_index)

                q1, q2 = self.critic_net.get_value(cur_feat, act)
                q_new_min = self.critic_net.get_min(cur_feat, new_act)

                q1_next, q2_next, _ = self.critic_target_net(
                    next_vec, next_vis, next_act, cell_state=cell_state)
                q_next_min = tf.minimum(q1_next, q2_next)
                # Soft Bellman backup; treated as a constant regression target.
                backup = tf.stop_gradient(
                    rew + self.gamma * (1 - terminal) *
                    (q_next_min - self.alpha * next_logp))
                q1_residual = q1 - backup
                q2_residual = q2 - backup
                q1_loss = tf.reduce_mean(tf.square(q1_residual) * isw)
                q2_loss = tf.reduce_mean(tf.square(q2_residual) * isw)
                critic_loss = 0.5 * q1_loss + 0.5 * q2_loss
                actor_loss = -tf.reduce_mean(q_new_min - self.alpha * new_logp)
                if self.auto_adaption:
                    alpha_loss = -tf.reduce_mean(
                        self.alpha *
                        tf.stop_gradient(new_logp + self.target_entropy))

            # Optimizer steps: critic first, then actor, then temperature.
            critic_vars = self.critic_net.trainable_variables
            self.optimizer_critic.apply_gradients(
                zip(tape.gradient(critic_loss, critic_vars), critic_vars))
            actor_vars = self.actor_net.trainable_variables
            self.optimizer_actor.apply_gradients(
                zip(tape.gradient(actor_loss, actor_vars), actor_vars))
            if self.auto_adaption:
                alpha_grad = tape.gradient(alpha_loss, self.log_alpha)
                self.optimizer_alpha.apply_gradients(
                    [(alpha_grad, self.log_alpha)])
            self.global_step.assign_add(1)

            summaries = {
                'LOSS/actor_loss': actor_loss,
                'LOSS/q1_loss': q1_loss,
                'LOSS/q2_loss': q2_loss,
                'LOSS/critic_loss': critic_loss,
                'Statistics/log_alpha': self.log_alpha,
                'Statistics/alpha': self.alpha,
                'Statistics/entropy': entropy,
                'Statistics/q_min': tf.reduce_min(tf.minimum(q1, q2)),
                'Statistics/q_mean': tf.reduce_mean(tf.minimum(q1, q2)),
                'Statistics/q_max': tf.reduce_max(tf.maximum(q1, q2)),
            }
            if self.auto_adaption:
                summaries['LOSS/alpha_loss'] = alpha_loss
            return (q1_residual + q2_residual) / 2, summaries

Exemple #16

0

Afficher le fichier

Fichier : ppo.py Projet : zhijie-ai/RLs

    def train_share(self, BATCH, cell_state, kl_coef):
        """Run one PPO update on the shared actor-critic network.

        The loss is the clipped-surrogate policy loss (optionally dual-clipped
        and with KL / extra penalties) minus an entropy bonus, plus
        ``self.vf_coef`` times the (optionally clipped) value loss.

        Args:
            BATCH: Batch object; this method reads ``obs``, ``action``,
                ``log_prob``, ``gae_adv``, ``discounted_reward`` and, when
                ``self.use_vclip`` is set, ``value``.
            cell_state: Forwarded to ``self.net``.
            kl_coef: Coefficient of the KL penalty when ``self.use_kl_loss``
                is set.

        Returns:
            Tuple ``(actor_loss, value_loss, entropy, kl)``.
        """
        with tf.device(self.device):
            with tf.GradientTape() as tape:
                output, cell_state = self.net(BATCH.obs, cell_state=cell_state)
                if self.is_continuous:
                    mu, log_std, value = output
                    new_log_prob = gaussian_likelihood_sum(
                        BATCH.action, mu, log_std)
                    entropy = gaussian_entropy(log_std)
                else:
                    logits, value = output
                    logp_all = tf.nn.log_softmax(logits)
                    new_log_prob = tf.reduce_sum(BATCH.action * logp_all,
                                                 axis=1,
                                                 keepdims=True)
                    entropy = -tf.reduce_mean(
                        tf.reduce_sum(tf.exp(logp_all) * logp_all,
                                      axis=1,
                                      keepdims=True))
                ratio = tf.exp(new_log_prob - BATCH.log_prob)
                surrogate = ratio * BATCH.gae_adv
                clipped_surrogate = tf.minimum(
                    surrogate,
                    tf.clip_by_value(ratio, 1.0 - self.epsilon,
                                     1.0 + self.epsilon) * BATCH.gae_adv)
                # ref: https://github.com/thu-ml/tianshou/blob/c97aa4065ee8464bd5897bb86f1f81abd8e2cff9/tianshou/policy/modelfree/ppo.py#L159
                if self.use_duel_clip:
                    dual_clipped = tf.maximum(clipped_surrogate,
                                              (1.0 + self.duel_epsilon) *
                                              BATCH.gae_adv)
                    # The dual-clip lower bound is only meaningful for
                    # negative advantages. For positive advantages the bound
                    # (1 + duel_epsilon) * adv would win the maximum, and it
                    # does not depend on the policy, so those samples would
                    # contribute no gradient.
                    clipped_surrogate = tf.where(BATCH.gae_adv < 0,
                                                 dual_clipped,
                                                 clipped_surrogate)
                actor_loss = -(tf.reduce_mean(clipped_surrogate) +
                               self.ent_coef * entropy)

                # ref: https://github.com/joschu/modular_rl/blob/6970cde3da265cf2a98537250fea5e0c0d9a7639/modular_rl/ppo.py#L40
                # ref: https://github.com/hill-a/stable-baselines/blob/b3f414f4f2900403107357a2206f80868af16da3/stable_baselines/ppo2/ppo2.py#L185
                # NOTE: the estimate is a squared difference, so both
                # branches yield the same value.
                if self.kl_reverse:
                    kl = .5 * tf.reduce_mean(
                        tf.square(new_log_prob - BATCH.log_prob))
                else:
                    kl = .5 * tf.reduce_mean(
                        tf.square(BATCH.log_prob - new_log_prob)
                    )  # a sample estimate for KL-divergence, easy to compute

                td_error = BATCH.discounted_reward - value
                if self.use_vclip:
                    # ref: https://github.com/llSourcell/OpenAI_Five_vs_Dota2_Explained/blob/c5def7e57aa70785c2394ea2eeb3e5f66ad59a53/train.py#L154
                    # ref: https://github.com/hill-a/stable-baselines/blob/b3f414f4f2900403107357a2206f80868af16da3/stable_baselines/ppo2/ppo2.py#L172
                    value_clip = BATCH.value + tf.clip_by_value(
                        value - BATCH.value, -self.value_epsilon,
                        self.value_epsilon)
                    td_error_clip = BATCH.discounted_reward - value_clip
                    # Pessimistic choice between clipped and unclipped error.
                    td_square = tf.maximum(tf.square(td_error),
                                           tf.square(td_error_clip))
                else:
                    td_square = tf.square(td_error)

                if self.use_kl_loss:
                    kl_loss = kl_coef * kl
                    actor_loss += kl_loss

                if self.use_extra_loss:
                    # Quadratic penalty once the KL estimate exceeds the cutoff.
                    extra_loss = self.extra_coef * tf.square(
                        tf.maximum(0., kl - self.kl_cutoff))
                    actor_loss += extra_loss
                value_loss = 0.5 * tf.reduce_mean(td_square)
                loss = actor_loss + self.vf_coef * value_loss
            loss_grads = tape.gradient(loss, self.net.trainable_variables)
            self.optimizer.apply_gradients(
                zip(loss_grads, self.net.trainable_variables))
            self.global_step.assign_add(1)
            return actor_loss, value_loss, entropy, kl

Exemple #17

0

Afficher le fichier

    def train(self, BATCH, isw, cell_state, visual, visual_, pos):
        with tf.device(self.device):
            with tf.GradientTape(persistent=True) as tape:
                vis_feat = self.encoder(visual)
                vis_feat_ = self.encoder(visual_)
                target_vis_feat_ = self.encoder_target(visual_)
                feat = tf.concat(
                    [vis_feat, BATCH.obs.flatten_vector()], axis=-1)
                feat_ = tf.concat(
                    [vis_feat_, BATCH.obs_.flatten_vector()], axis=-1)
                target_feat_ = tf.concat(
                    [target_vis_feat_,
                     BATCH.obs_.flatten_vector()], axis=-1)
                if self.is_continuous:
                    target_mu, target_log_std = self.actor_net.value_net(feat_)
                    target_pi, target_log_pi = squash_rsample(
                        target_mu, target_log_std)
                else:
                    target_logits = self.actor_net.value_net(feat_)
                    target_cate_dist = tfp.distributions.Categorical(
                        logits=tf.nn.log_softmax(target_logits))
                    target_pi = target_cate_dist.sample()
                    target_log_pi = target_cate_dist.log_prob(target_pi)
                    target_pi = tf.one_hot(target_pi,
                                           self.a_dim,
                                           dtype=tf.float32)
                q1, q2 = self.critic_net.value_net(feat, BATCH.action)
                q1_target, q2_target = self.critic_target_net.value_net(
                    feat_, target_pi)
                q_target = tf.minimum(q1_target, q2_target)
                dc_r = tf.stop_gradient(
                    BATCH.reward + self.gamma * (1 - BATCH.done) *
                    (q_target - self.alpha * target_log_pi))
                td_error1 = q1 - dc_r
                td_error2 = q2 - dc_r
                q1_loss = tf.reduce_mean(tf.square(td_error1) * isw)
                q2_loss = tf.reduce_mean(tf.square(td_error2) * isw)
                critic_loss = 0.5 * q1_loss + 0.5 * q2_loss

                z_a = vis_feat  # [B, N]
                z_out = self.encoder_target(pos)
                logits = tf.matmul(
                    z_a, tf.matmul(self.curl_w, tf.transpose(z_out, [1, 0])))
                logits -= tf.reduce_max(logits, axis=-1, keepdims=True)
                curl_loss = tf.reduce_mean(
                    tf.keras.losses.sparse_categorical_crossentropy(
                        tf.range(self.batch_size), logits))
            critic_grads = tape.gradient(critic_loss, self.critic_tv)
            self.optimizer_critic.apply_gradients(
                zip(critic_grads, self.critic_tv))
            curl_grads = tape.gradient(curl_loss, [self.curl_w] +
                                       self.encoder.trainable_variables)
            self.optimizer_curl.apply_gradients(
                zip(curl_grads,
                    [self.curl_w] + self.encoder.trainable_variables))

            with tf.GradientTape() as tape:
                if self.is_continuous:
                    mu, log_std = self.actor_net.value_net(feat)
                    pi, log_pi = squash_rsample(mu, log_std)
                    entropy = gaussian_entropy(log_std)
                else:
                    logits = self.actor_net.value_net(feat)
                    logp_all = tf.nn.log_softmax(logits)
                    gumbel_noise = tf.cast(self.gumbel_dist.sample(
                        BATCH.action.shape),
                                           dtype=tf.float32)
                    _pi = tf.nn.softmax(
                        (logp_all + gumbel_noise) / self.discrete_tau)
                    _pi_true_one_hot = tf.one_hot(tf.argmax(_pi, axis=-1),
                                                  self.a_dim)
                    _pi_diff = tf.stop_gradient(_pi_true_one_hot - _pi)
                    pi = _pi_diff + _pi
                    log_pi = tf.reduce_sum(tf.multiply(logp_all, pi),
                                           axis=1,
                                           keepdims=True)
                    entropy = -tf.reduce_mean(
                        tf.reduce_sum(tf.exp(logp_all) * logp_all,
                                      axis=1,
                                      keepdims=True))
                q_s_pi = self.critic_net.get_min(feat, pi)
                actor_loss = -tf.reduce_mean(q_s_pi - self.alpha * log_pi)
            actor_grads = tape.gradient(actor_loss,
                                        self.actor_net.trainable_variables)
            self.optimizer_actor.apply_gradients(
                zip(actor_grads, self.actor_net.trainable_variables))

            if self.auto_adaption:
                with tf.GradientTape() as tape:
                    if self.is_continuous:
                        mu, log_std = self.actor_net.value_net(feat)
                        norm_dist = tfp.distributions.Normal(
                            loc=mu, scale=tf.exp(log_std))
                        log_pi = tf.reduce_sum(norm_dist.log_prob(
                            norm_dist.sample()),
                                               axis=-1,
                                               keep_dims=True)  # [B, 1]
                    else:
                        logits = self.actor_net.value_net(feat)
                        norm_dist = tfp.distributions.Categorical(
                            logits=tf.nn.log_softmax(logits))
                        log_pi = norm_dist.log_prob(cate_dist.sample())
                    alpha_loss = -tf.reduce_mean(
                        self.alpha *
                        tf.stop_gradient(log_pi + self.target_entropy))
                alpha_grad = tape.gradient(alpha_loss, self.log_alpha)
                self.optimizer_alpha.apply_gradients([(alpha_grad,
                                                       self.log_alpha)])
            self.global_step.assign_add(1)
            summaries = dict(
                [['LOSS/actor_loss', actor_loss], ['LOSS/q1_loss', q1_loss],
                 ['LOSS/q2_loss', q2_loss], ['LOSS/critic_loss', critic_loss],
                 ['LOSS/curl_loss', curl_loss],
                 ['Statistics/log_alpha', self.log_alpha],
                 ['Statistics/alpha', self.alpha],
                 ['Statistics/entropy', entropy],
                 ['Statistics/q_min',
                  tf.reduce_min(tf.minimum(q1, q2))],
                 ['Statistics/q_mean',
                  tf.reduce_mean(tf.minimum(q1, q2))],
                 ['Statistics/q_max',
                  tf.reduce_max(tf.maximum(q1, q2))]])
            if self.auto_adaption:
                summaries.update({'LOSS/alpha_loss': alpha_loss})
            return (td_error1 + td_error2) / 2., summaries

Exemple #18

0

Afficher le fichier

    def train(self, memories, kl_coef):
        """Run one joint gradient step on the policy, value and termination heads.

        Args:
            memories: 11-tuple unpacked as (s, visual_s, a, dc_r, old_log_prob,
                advantage, old_value, beta_advantage, last_options, options,
                cell_state).
            kl_coef: multiplier applied to the sampled KL estimate in the
                policy loss.

        Returns:
            Tuple ``(loss, pi_loss, q_loss, beta_loss, entropy, kl)``.
        """
        (s, visual_s, a, dc_r, old_log_prob, advantage, old_value,
         beta_advantage, last_options, options, cell_state) = memories

        # Option indices are cast to int32 and flattened: [B, 1] => [B,]
        prev_opt_idx = tf.reshape(tf.cast(last_options, tf.int32), (-1, ))
        opt_idx = tf.reshape(tf.cast(options, tf.int32), (-1, ))

        with tf.device(self.device):
            with tf.GradientTape() as tape:
                # [B, P], [B, P, A], [B, P]; the returned cell state is not used here.
                (q, pi, beta), _ = self.net(s, visual_s, cell_state=cell_state)

                opt_mask = tf.one_hot(opt_idx,
                                      self.options_num,
                                      dtype=tf.float32)  # [B, P]
                prev_opt_mask = tf.one_hot(prev_opt_idx,
                                           self.options_num,
                                           dtype=tf.float32)  # [B, P]

                # Keep only the entries that belong to the option in use.
                policy_out = tf.reduce_sum(
                    pi * tf.expand_dims(opt_mask, axis=-1),
                    axis=1)  # [B, P, A] => [B, A]
                value = tf.reduce_sum(q * opt_mask, axis=1,
                                      keepdims=True)  # [B, 1]

                if self.is_continuous:
                    # policy_out is the Gaussian mean; log-std is looked up per option.
                    log_std = tf.gather(self.log_std, opt_idx)
                    new_log_prob = gaussian_likelihood_sum(
                        a, policy_out, log_std)
                    entropy = gaussian_entropy(log_std)
                else:
                    # policy_out holds the logits of a categorical policy.
                    logp_all = tf.nn.log_softmax(policy_out)
                    new_log_prob = tf.reduce_sum(a * logp_all,
                                                 axis=1,
                                                 keepdims=True)
                    entropy = -tf.reduce_mean(
                        tf.reduce_sum(tf.exp(logp_all) * logp_all,
                                      axis=1,
                                      keepdims=True))

                log_ratio = new_log_prob - old_log_prob
                ratio = tf.exp(log_ratio)

                # Sample-based estimate of the KL divergence, cheap to compute.
                if self.kl_reverse:
                    kl = tf.reduce_mean(log_ratio)
                else:
                    kl = tf.reduce_mean(old_log_prob - new_log_prob)

                # Clipped surrogate objective plus the two KL penalty terms.
                clipped_ratio = tf.clip_by_value(ratio, 1.0 - self.epsilon,
                                                 1.0 + self.epsilon)
                clip_objective = -tf.reduce_mean(
                    tf.minimum(ratio * advantage, clipped_ratio * advantage))
                kl_penalty = kl_coef * kl
                cutoff_penalty = 1000.0 * tf.square(
                    tf.maximum(0., kl - self.kl_cutoff))
                pi_loss = clip_objective + kl_penalty + cutoff_penalty

                # Value loss: the larger of the clipped / unclipped squared errors.
                value_clip = old_value + tf.clip_by_value(
                    value - old_value, -self.value_epsilon, self.value_epsilon)
                td_square = tf.maximum(tf.square(dc_r - value),
                                       tf.square(dc_r - value_clip))
                q_loss = 0.5 * tf.reduce_mean(td_square)

                # Termination loss uses beta of the previously active option.
                beta_s = tf.reduce_sum(beta * prev_opt_mask,
                                       axis=-1,
                                       keepdims=True)  # [B, 1]
                beta_loss = tf.reduce_mean(beta_s * beta_advantage)
                if self.terminal_mask:
                    # NOTE(review): `done` is never bound inside this method and
                    # is not part of `memories`; unless a module-level `done`
                    # exists this branch raises NameError. Confirm where the
                    # terminal flags should come from before enabling it.
                    beta_loss = beta_loss * (1 - done)

                loss = pi_loss + 1.0 * q_loss + beta_loss - self.pi_beta * entropy

            grads = tape.gradient(loss, self.net_tv)
            self.optimizer.apply_gradients(zip(grads, self.net_tv))
            self.global_step.assign_add(1)
            return loss, pi_loss, q_loss, beta_loss, entropy, kl

Exemple #19

0

Afficher le fichier

Fichier : ac.py Projet : ncepuwwy97/RLs

 def train(self, memories, isw, crsty_loss, cell_state):
     """Update the critic, then the actor, on one batch.

     Args:
         memories: 6-tuple unpacked as (ss, vvss, a, r, done, old_log_prob).
         isw: factor multiplied element-wise into the squared TD error
             (presumably importance-sampling weights -- confirm with caller).
         crsty_loss: extra term added to the critic loss.
         cell_state: forwarded to ``self.get_feature``.

     Returns:
         Tuple of the TD error from the critic step and a dict of summary
         tensors keyed by tag.
     """
     ss, vvss, a, r, done, old_log_prob = memories
     with tf.device(self.device):
         # ---- critic step ----
         with tf.GradientTape() as critic_tape:
             feat, feat_ = self.get_feature(ss,
                                            vvss,
                                            cell_state=cell_state,
                                            s_and_s_=True)
             if self.is_continuous:
                 next_action = self.actor_net(feat_)
             else:
                 # Greedy action under the current actor, one-hot encoded.
                 greedy_idx = tf.argmax(self.actor_net(feat_), axis=1)
                 next_action = tf.one_hot(greedy_idx,
                                          self.a_dim,
                                          dtype=tf.float32)
             # The bootstrap value must not receive gradients.
             max_q_next = tf.stop_gradient(
                 self.critic_net(feat_, next_action))
             q = self.critic_net(feat, a)
             td_target = r + self.gamma * (1 - done) * max_q_next
             td_error = q - td_target
             critic_loss = tf.reduce_mean(
                 tf.square(td_error) * isw) + crsty_loss
         critic_grads = critic_tape.gradient(critic_loss, self.critic_tv)
         self.optimizer_critic.apply_gradients(
             zip(critic_grads, self.critic_tv))

         # ---- actor step (reuses `feat` computed above) ----
         with tf.GradientTape() as actor_tape:
             if self.is_continuous:
                 mu = self.actor_net(feat)
                 log_prob = gaussian_likelihood_sum(a, mu, self.log_std)
                 entropy = gaussian_entropy(self.log_std)
             else:
                 logp_all = tf.nn.log_softmax(self.actor_net(feat))
                 log_prob = tf.reduce_sum(tf.multiply(logp_all, a),
                                          axis=1,
                                          keepdims=True)
                 entropy = -tf.reduce_mean(
                     tf.reduce_sum(tf.exp(logp_all) * logp_all,
                                   axis=1,
                                   keepdims=True))
             # Re-evaluated after the critic weights were just updated.
             q = self.critic_net(feat, a)
             # Ratio and Q act as constant weights on the log-probability.
             ratio = tf.stop_gradient(tf.exp(log_prob - old_log_prob))
             q_value = tf.stop_gradient(q)
             actor_loss = -tf.reduce_mean(ratio * log_prob * q_value)
         actor_grads = actor_tape.gradient(actor_loss, self.actor_tv)
         self.optimizer_actor.apply_gradients(
             zip(actor_grads, self.actor_tv))

         self.global_step.assign_add(1)
         summaries = {
             'LOSS/actor_loss': actor_loss,
             'LOSS/critic_loss': critic_loss,
             'Statistics/q_max': tf.reduce_max(q),
             'Statistics/q_min': tf.reduce_min(q),
             'Statistics/q_mean': tf.reduce_mean(q),
             'Statistics/ratio': tf.reduce_mean(ratio),
             'Statistics/entropy': entropy,
         }
         return td_error, summaries