Example 1
 def call(self, x):
     x = self.share(x)
     mu = self.mu(x)
     log_std = self.log_std(x)
     if self.soft_clip:
         log_std = tf.tanh(log_std)
         log_std = clip_nn_log_std(log_std, self.log_std_min,
                                   self.log_std_max)
     else:
         log_std = tf.clip_by_value(log_std, self.log_std_min,
                                    self.log_std_max)
     return (mu, log_std)
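
The helper clip_nn_log_std is used throughout these examples but not defined in them. Judging from the soft-clip branch above (tanh first, then clip_nn_log_std), it plausibly rescales a value already squashed into [-1, 1] onto [log_std_min, log_std_max]. The sketch below is only an assumption along those lines; the function body and the default bounds are not taken from the project:

    # Hypothetical sketch: affine rescale of a tanh-squashed log_std from
    # [-1, 1] onto [log_std_min, log_std_max]. The defaults are assumed,
    # not the project's actual values.
    def clip_nn_log_std(log_std, log_std_min=-20.0, log_std_max=2.0):
        return log_std_min + 0.5 * (log_std_max - log_std_min) * (log_std + 1.0)
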
Example 2
 def _get_action(self, s, visual_s):
     with tf.device(self.device):
         feat = tf.concat([self.encoder(visual_s), s], axis=-1)
         if self.is_continuous:
             mu, log_std = self.actor_net(feat)
             log_std = clip_nn_log_std(log_std, self.log_std_min,
                                       self.log_std_max)
             pi, _ = squash_rsample(mu, log_std)
             mu = tf.tanh(mu)  # squash mu
         else:
             logits = self.actor_net(feat)
             mu = tf.argmax(logits, axis=1)
             cate_dist = tfp.distributions.Categorical(logits=logits)
             pi = cate_dist.sample()
         return mu, pi
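
squash_rsample is likewise referenced but not shown. For tanh-squashed Gaussian policies (as in SAC) it is typically a reparameterized sample from N(mu, exp(log_std)) pushed through tanh, with the log-probability corrected for the change of variables. The sketch below follows that standard construction; the exact signature and the epsilon constant are assumptions, not the project's code:

    import numpy as np
    import tensorflow as tf

    # Assumed implementation of a tanh-squashed, reparameterized Gaussian sample.
    def squash_rsample(mu, log_std, eps=1e-6):
        std = tf.exp(log_std)
        raw_action = mu + tf.random.normal(tf.shape(mu)) * std   # reparameterization trick
        # Diagonal-Gaussian log-density of the pre-squash sample, summed over action dims.
        log_pi = tf.reduce_sum(
            -0.5 * ((raw_action - mu) / (std + eps)) ** 2 - log_std - 0.5 * np.log(2.0 * np.pi),
            axis=-1, keepdims=True)
        action = tf.tanh(raw_action)                              # squash into (-1, 1)
        # Change-of-variables correction for the tanh squash.
        log_pi -= tf.reduce_sum(tf.math.log(1.0 - action ** 2 + eps),
                                axis=-1, keepdims=True)
        return action, log_pi
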
Example 3
 def _get_action(self, s, visual_s, cell_state):
     with tf.device(self.device):
         feat, cell_state = self._representation_net(s,
                                                     visual_s,
                                                     cell_state=cell_state)
         if self.is_continuous:
             mu, log_std = self.actor_net.value_net(feat)
             log_std = clip_nn_log_std(log_std, self.log_std_min,
                                       self.log_std_max)
             pi, _ = squash_rsample(mu, log_std)
             mu = tf.tanh(mu)  # squash mu
         else:
             logits = self.actor_net.value_net(feat)
             mu = tf.argmax(logits, axis=1)
             cate_dist = tfp.distributions.Categorical(
                 logits=tf.nn.log_softmax(logits))
             pi = cate_dist.sample()
         return mu, pi, cell_state
Example 4
 def _get_action(self, s, visual_s, cell_state):
     with tf.device(self.device):
         feat, cell_state = self.get_feature(s,
                                             visual_s,
                                             cell_state=cell_state,
                                             record_cs=True)
         if self.is_continuous:
             mu, log_std = self.actor_net(feat)
             log_std = clip_nn_log_std(log_std, self.log_std_min,
                                       self.log_std_max)
             pi, _ = tsallis_squash_rsample(mu, log_std,
                                            self.entropic_index)
             mu = tf.tanh(mu)  # squash mu
         else:
             logits = self.actor_net(feat)
             mu = tf.argmax(logits, axis=1)
             cate_dist = tfp.distributions.Categorical(logits=logits)
             pi = cate_dist.sample()
         return mu, pi, cell_state
Example 5
    def train(self, memories, isw, crsty_loss, cell_state):
        s, visual_s, a, r, s_, visual_s_, done, pos = memories
        batch_size = tf.shape(a)[0]
        with tf.device(self.device):
            with tf.GradientTape(persistent=True) as tape:
                vis_feat = self.encoder(visual_s)
                vis_feat_ = self.encoder(visual_s_)
                target_vis_feat_ = self.encoder_target(visual_s_)
                feat = tf.concat([vis_feat, s], axis=-1)
                feat_ = tf.concat([vis_feat_, s_], axis=-1)
                target_feat_ = tf.concat([target_vis_feat_, s_], axis=-1)
                if self.is_continuous:
                    target_mu, target_log_std = self.actor_net(feat_)
                    target_log_std = clip_nn_log_std(target_log_std,
                                                     self.log_std_min,
                                                     self.log_std_max)
                    target_pi, target_log_pi = squash_rsample(
                        target_mu, target_log_std)
                else:
                    target_logits = self.actor_net(feat_)
                    target_cate_dist = tfp.distributions.Categorical(
                        logits=target_logits)
                    target_pi = target_cate_dist.sample()
                    target_log_pi = target_cate_dist.log_prob(target_pi)
                    target_pi = tf.one_hot(target_pi,
                                           self.a_dim,
                                           dtype=tf.float32)
                q1, q2 = self.critic_net(feat, a)
                q1_target, q2_target = self.critic_target_net(feat_, target_pi)
                dc_r_q1 = tf.stop_gradient(
                    r + self.gamma * (1 - done) *
                    (q1_target - self.alpha * target_log_pi))
                dc_r_q2 = tf.stop_gradient(
                    r + self.gamma * (1 - done) *
                    (q2_target - self.alpha * target_log_pi))
                td_error1 = q1 - dc_r_q1
                td_error2 = q2 - dc_r_q2
                q1_loss = tf.reduce_mean(tf.square(td_error1) * isw)
                q2_loss = tf.reduce_mean(tf.square(td_error2) * isw)
                critic_loss = 0.5 * q1_loss + 0.5 * q2_loss + crsty_loss

                z_a = vis_feat  # [B, N]
                z_out = self.encoder_target(pos)
                logits = tf.matmul(
                    z_a, tf.matmul(self.curl_w, tf.transpose(z_out, [1, 0])))
                logits -= tf.reduce_max(logits, axis=-1, keepdims=True)
                curl_loss = tf.reduce_mean(
                    tf.keras.losses.sparse_categorical_crossentropy(
                        tf.range(self.batch_size), logits))
            critic_grads = tape.gradient(critic_loss, self.critic_tv)
            self.optimizer_critic.apply_gradients(
                zip(critic_grads, self.critic_tv))
            curl_grads = tape.gradient(curl_loss, [self.curl_w] +
                                       self.encoder.trainable_variables)
            self.optimizer_curl.apply_gradients(
                zip(curl_grads,
                    [self.curl_w] + self.encoder.trainable_variables))

            with tf.GradientTape() as tape:
                if self.is_continuous:
                    mu, log_std = self.actor_net(feat)
                    log_std = clip_nn_log_std(log_std, self.log_std_min,
                                              self.log_std_max)
                    pi, log_pi = squash_rsample(mu, log_std)
                    entropy = gaussian_entropy(log_std)
                else:
                    logits = self.actor_net(feat)
                    logp_all = tf.nn.log_softmax(logits)
                    gumbel_noise = tf.cast(self.gumbel_dist.sample(
                        [batch_size, self.a_dim]),
                                           dtype=tf.float32)
                    _pi = tf.nn.softmax(
                        (logp_all + gumbel_noise) / self.discrete_tau)
                    _pi_true_one_hot = tf.one_hot(tf.argmax(_pi, axis=-1),
                                                  self.a_dim)
                    _pi_diff = tf.stop_gradient(_pi_true_one_hot - _pi)
                    pi = _pi_diff + _pi
                    log_pi = tf.reduce_sum(tf.multiply(logp_all, pi),
                                           axis=1,
                                           keepdims=True)
                    entropy = -tf.reduce_mean(
                        tf.reduce_sum(tf.exp(logp_all) * logp_all,
                                      axis=1,
                                      keepdims=True))
                q_s_pi = self.critic_net.get_min(feat, pi)
                actor_loss = -tf.reduce_mean(q_s_pi - self.alpha * log_pi)
            actor_grads = tape.gradient(actor_loss, self.actor_tv)
            self.optimizer_actor.apply_gradients(
                zip(actor_grads, self.actor_tv))

            if self.auto_adaption:
                with tf.GradientTape() as tape:
                    if self.is_continuous:
                        mu, log_std = self.actor_net(feat)
                        log_std = clip_nn_log_std(log_std, self.log_std_min,
                                                  self.log_std_max)
                        norm_dist = tfp.distributions.Normal(
                            loc=mu, scale=tf.exp(log_std))
                        log_pi = tf.reduce_sum(norm_dist.log_prob(
                            norm_dist.sample()),
                                               axis=-1)
                    else:
                        logits = self.actor_net(feat)
                        cate_dist = tfp.distributions.Categorical(logits=logits)
                        log_pi = cate_dist.log_prob(cate_dist.sample())
                    alpha_loss = -tf.reduce_mean(
                        self.alpha *
                        tf.stop_gradient(log_pi + self.target_entropy))
                alpha_grad = tape.gradient(alpha_loss, self.log_alpha)
                self.optimizer_alpha.apply_gradients([(alpha_grad,
                                                       self.log_alpha)])
            self.global_step.assign_add(1)
            summaries = dict(
                [['LOSS/actor_loss', actor_loss], ['LOSS/q1_loss', q1_loss],
                 ['LOSS/q2_loss', q2_loss], ['LOSS/critic_loss', critic_loss],
                 ['LOSS/curl_loss', curl_loss],
                 ['Statistics/log_alpha', self.log_alpha],
                 ['Statistics/alpha', self.alpha],
                 ['Statistics/entropy', entropy],
                 ['Statistics/q_min',
                  tf.reduce_min(tf.minimum(q1, q2))],
                 ['Statistics/q_mean',
                  tf.reduce_mean(tf.minimum(q1, q2))],
                 ['Statistics/q_max',
                  tf.reduce_max(tf.maximum(q1, q2))]])
            if self.auto_adaption:
                summaries.update({'LOSS/alpha_loss': alpha_loss})
            return (td_error1 + td_error2) / 2., summaries
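
The curl_loss block in this example is the CURL contrastive (InfoNCE) objective: each anchor encoding z_i is scored against every target encoding of the positive crops through a learned bilinear product, and the matching index is treated as the correct class,

\[
\mathcal{L}_{\text{CURL}} = -\frac{1}{B}\sum_{i=1}^{B} \log \frac{\exp\!\left(z_i^{\top} W \bar z_i\right)}{\sum_{j=1}^{B} \exp\!\left(z_i^{\top} W \bar z_j\right)},
\]

where \(\bar z_j\) comes from the target encoder and \(W\) is the learned matrix curl_w; the reduce_max subtraction only stabilizes the softmax numerically.
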
Example 6
    def train(self, memories, isw, cell_state):
        ss, vvss, a, r, done, s_, visual_s_ = memories
        with tf.device(self.device):
            with tf.GradientTape(persistent=True) as tape:
                (feat,
                 feat_), _ = self._representation_net(ss,
                                                      vvss,
                                                      cell_state=cell_state,
                                                      need_split=True)
                if self.is_continuous:
                    mu, log_std = self.actor_net.value_net(feat)
                    log_std = clip_nn_log_std(log_std, self.log_std_min,
                                              self.log_std_max)
                    pi, log_pi = tsallis_squash_rsample(
                        mu, log_std, self.entropic_index)
                    entropy = gaussian_entropy(log_std)
                    target_mu, target_log_std = self.actor_net.value_net(feat_)
                    target_log_std = clip_nn_log_std(target_log_std,
                                                     self.log_std_min,
                                                     self.log_std_max)
                    target_pi, target_log_pi = tsallis_squash_rsample(
                        target_mu, target_log_std, self.entropic_index)
                else:
                    logits = self.actor_net.value_net(feat)
                    logp_all = tf.nn.log_softmax(logits)
                    gumbel_noise = tf.cast(self.gumbel_dist.sample(a.shape),
                                           dtype=tf.float32)
                    _pi = tf.nn.softmax(
                        (logp_all + gumbel_noise) / self.discrete_tau)
                    _pi_true_one_hot = tf.one_hot(tf.argmax(_pi, axis=-1),
                                                  self.a_dim)
                    _pi_diff = tf.stop_gradient(_pi_true_one_hot - _pi)
                    pi = _pi_diff + _pi
                    log_pi = tf.reduce_sum(tf.multiply(logp_all, pi),
                                           axis=1,
                                           keepdims=True)
                    entropy = -tf.reduce_mean(
                        tf.reduce_sum(tf.exp(logp_all) * logp_all,
                                      axis=1,
                                      keepdims=True))

                    target_logits = self.actor_net.value_net(feat_)
                    target_cate_dist = tfp.distributions.Categorical(
                        logits=tf.nn.log_softmax(target_logits))
                    target_pi = target_cate_dist.sample()
                    target_log_pi = target_cate_dist.log_prob(target_pi)
                    target_pi = tf.one_hot(target_pi,
                                           self.a_dim,
                                           dtype=tf.float32)
                q1, q2 = self.critic_net.get_value(feat, a)
                q_s_pi = self.critic_net.get_min(feat, pi)

                q1_target, q2_target, _ = self.critic_target_net(
                    s_, visual_s_, target_pi, cell_state=cell_state)
                q_target = tf.minimum(q1_target, q2_target)
                dc_r = tf.stop_gradient(
                    r + self.gamma * (1 - done) *
                    (q_target - self.alpha * target_log_pi))
                td_error1 = q1 - dc_r
                td_error2 = q2 - dc_r
                q1_loss = tf.reduce_mean(tf.square(td_error1) * isw)
                q2_loss = tf.reduce_mean(tf.square(td_error2) * isw)
                critic_loss = 0.5 * q1_loss + 0.5 * q2_loss
                actor_loss = -tf.reduce_mean(q_s_pi - self.alpha * log_pi)
                if self.auto_adaption:
                    alpha_loss = -tf.reduce_mean(
                        self.alpha *
                        tf.stop_gradient(log_pi + self.target_entropy))
            critic_grads = tape.gradient(critic_loss,
                                         self.critic_net.trainable_variables)
            self.optimizer_critic.apply_gradients(
                zip(critic_grads, self.critic_net.trainable_variables))
            actor_grads = tape.gradient(actor_loss,
                                        self.actor_net.trainable_variables)
            self.optimizer_actor.apply_gradients(
                zip(actor_grads, self.actor_net.trainable_variables))
            if self.auto_adaption:
                alpha_grad = tape.gradient(alpha_loss, self.log_alpha)
                self.optimizer_alpha.apply_gradients([(alpha_grad,
                                                       self.log_alpha)])
            self.global_step.assign_add(1)
            summaries = dict(
                [['LOSS/actor_loss', actor_loss], ['LOSS/q1_loss', q1_loss],
                 ['LOSS/q2_loss', q2_loss], ['LOSS/critic_loss', critic_loss],
                 ['Statistics/log_alpha', self.log_alpha],
                 ['Statistics/alpha', self.alpha],
                 ['Statistics/entropy', entropy],
                 ['Statistics/q_min',
                  tf.reduce_min(tf.minimum(q1, q2))],
                 ['Statistics/q_mean',
                  tf.reduce_mean(tf.minimum(q1, q2))],
                 ['Statistics/q_max',
                  tf.reduce_max(tf.maximum(q1, q2))]])
            if self.auto_adaption:
                summaries.update({'LOSS/alpha_loss': alpha_loss})
            return (td_error1 + td_error2) / 2, summaries
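
For reference, the target assembled as dc_r above is the usual soft Bellman backup for twin critics,

\[
y = r + \gamma\,(1 - d)\,\bigl(\min\bigl(Q'_1(s', a'),\, Q'_2(s', a')\bigr) - \alpha \log \pi(a' \mid s')\bigr), \qquad a' \sim \pi(\cdot \mid s'),
\]

and critic_loss is the importance-weighted mean of the two squared TD errors against that single target.
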
Example 7
    def train(self, memories, isw, crsty_loss, cell_state):
        ss, vvss, a, r, done = memories
        batch_size = tf.shape(a)[0]
        with tf.device(self.device):
            with tf.GradientTape() as tape:
                feat, feat_ = self.get_feature(ss,
                                               vvss,
                                               cell_state=cell_state,
                                               s_and_s_=True)
                if self.is_continuous:
                    target_mu, target_log_std = self.actor_net(feat_)
                    target_log_std = clip_nn_log_std(target_log_std,
                                                     self.log_std_min,
                                                     self.log_std_max)
                    target_pi, target_log_pi = squash_rsample(
                        target_mu, target_log_std)
                else:
                    target_logits = self.actor_net(feat_)
                    target_cate_dist = tfp.distributions.Categorical(
                        logits=target_logits)
                    target_pi = target_cate_dist.sample()
                    target_log_pi = target_cate_dist.log_prob(target_pi)
                    target_pi = tf.one_hot(target_pi,
                                           self.a_dim,
                                           dtype=tf.float32)
                q1, q2 = self.critic_net(feat, a)
                q1_target, q2_target = self.critic_target_net(feat_, target_pi)
                dc_r_q1 = tf.stop_gradient(
                    r + self.gamma * (1 - done) *
                    (q1_target - self.alpha * target_log_pi))
                dc_r_q2 = tf.stop_gradient(
                    r + self.gamma * (1 - done) *
                    (q2_target - self.alpha * target_log_pi))
                td_error1 = q1 - dc_r_q1
                td_error2 = q2 - dc_r_q2
                q1_loss = tf.reduce_mean(tf.square(td_error1) * isw)
                q2_loss = tf.reduce_mean(tf.square(td_error2) * isw)
                critic_loss = 0.5 * q1_loss + 0.5 * q2_loss + crsty_loss
            critic_grads = tape.gradient(critic_loss, self.critic_tv)
            self.optimizer_critic.apply_gradients(
                zip(critic_grads, self.critic_tv))

            with tf.GradientTape() as tape:
                if self.is_continuous:
                    mu, log_std = self.actor_net(feat)
                    log_std = clip_nn_log_std(log_std, self.log_std_min,
                                              self.log_std_max)
                    pi, log_pi = squash_rsample(mu, log_std)
                    entropy = gaussian_entropy(log_std)
                else:
                    logits = self.actor_net(feat)
                    logp_all = tf.nn.log_softmax(logits)
                    gumbel_noise = tf.cast(self.gumbel_dist.sample(
                        [batch_size, self.a_dim]),
                                           dtype=tf.float32)
                    _pi = tf.nn.softmax(
                        (logp_all + gumbel_noise) / self.discrete_tau)
                    _pi_true_one_hot = tf.one_hot(tf.argmax(_pi, axis=-1),
                                                  self.a_dim)
                    _pi_diff = tf.stop_gradient(_pi_true_one_hot - _pi)
                    pi = _pi_diff + _pi
                    log_pi = tf.reduce_sum(tf.multiply(logp_all, pi),
                                           axis=1,
                                           keepdims=True)
                    entropy = -tf.reduce_mean(
                        tf.reduce_sum(tf.exp(logp_all) * logp_all,
                                      axis=1,
                                      keepdims=True))
                q_s_pi = self.critic_net.get_min(feat, pi)
                actor_loss = -tf.reduce_mean(q_s_pi - self.alpha * log_pi)
            actor_grads = tape.gradient(actor_loss, self.actor_tv)
            self.optimizer_actor.apply_gradients(
                zip(actor_grads, self.actor_tv))

            if self.auto_adaption:
                with tf.GradientTape() as tape:
                    if self.is_continuous:
                        mu, log_std = self.actor_net(feat)
                        log_std = clip_nn_log_std(log_std, self.log_std_min,
                                                  self.log_std_max)
                        norm_dist = tfp.distributions.Normal(
                            loc=mu, scale=tf.exp(log_std))
                        log_pi = tf.reduce_sum(norm_dist.log_prob(
                            norm_dist.sample()),
                                               axis=-1)
                    else:
                        logits = self.actor_net(feat)
                        cate_dist = tfp.distributions.Categorical(logits=logits)
                        log_pi = cate_dist.log_prob(cate_dist.sample())
                    # $J(\alpha)=\mathbb{E}_{\mathbf{a}_{t} \sim \pi_{t}}\left[-\alpha \log \pi_{t}\left(\mathbf{a}_{t} \mid \mathbf{s}_{t}\right)-\alpha \overline{\mathcal{H}}\right]$
                    # \overline{\mathcal{H}} is negative
                    alpha_loss = -tf.reduce_mean(
                        self.alpha *
                        tf.stop_gradient(log_pi + self.target_entropy))
                alpha_grad = tape.gradient(alpha_loss, self.log_alpha)
                self.optimizer_alpha.apply_gradients([(alpha_grad,
                                                       self.log_alpha)])
            self.global_step.assign_add(1)
            summaries = dict(
                [['LOSS/actor_loss', actor_loss], ['LOSS/q1_loss', q1_loss],
                 ['LOSS/q2_loss', q2_loss], ['LOSS/critic_loss', critic_loss],
                 ['Statistics/log_alpha', self.log_alpha],
                 ['Statistics/alpha', self.alpha],
                 ['Statistics/entropy', entropy],
                 ['Statistics/q_min',
                  tf.reduce_min(tf.minimum(q1, q2))],
                 ['Statistics/q_mean',
                  tf.reduce_mean(tf.minimum(q1, q2))],
                 ['Statistics/q_max',
                  tf.reduce_max(tf.maximum(q1, q2))]])
            if self.auto_adaption:
                summaries.update({'LOSS/alpha_loss': alpha_loss})
            return (td_error1 + td_error2) / 2., summaries
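
The discrete branches in these training methods sample actions with a straight-through Gumbel-Softmax: a relaxed sample is snapped to one-hot in the forward pass, while the backward pass keeps the gradient of the relaxed sample through the stop_gradient trick. A minimal, self-contained sketch of that pattern (function name, temperature, and test input are illustrative only, not from the project):

    import tensorflow as tf
    import tensorflow_probability as tfp

    def st_gumbel_softmax_sample(logits, tau=1.0):
        gumbel = tfp.distributions.Gumbel(loc=0.0, scale=1.0)
        noise = tf.cast(gumbel.sample(tf.shape(logits)), logits.dtype)
        soft = tf.nn.softmax((tf.nn.log_softmax(logits) + noise) / tau)    # relaxed sample
        hard = tf.one_hot(tf.argmax(soft, axis=-1), tf.shape(logits)[-1])  # forward: one-hot
        # Backward: gradients flow through the relaxed sample only.
        return tf.stop_gradient(hard - soft) + soft

    sample = st_gumbel_softmax_sample(tf.math.log([[0.1, 0.2, 0.7]]))
    print(sample)  # one-hot in value, differentiable with respect to the logits
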
Example 8
 def train_persistent(self, memories, isw, crsty_loss, cell_state):
     ss, vvss, a, r, done = memories
     batch_size = tf.shape(a)[0]
     with tf.device(self.device):
         with tf.GradientTape(persistent=True) as tape:
             feat, feat_ = self.get_feature(ss,
                                            vvss,
                                            cell_state=cell_state,
                                            s_and_s_=True)
             if self.is_continuous:
                 mu, log_std = self.actor_net(feat)
                 log_std = clip_nn_log_std(log_std, self.log_std_min,
                                           self.log_std_max)
                 pi, log_pi = squash_rsample(mu, log_std)
                 entropy = gaussian_entropy(log_std)
             else:
                 logits = self.actor_net(feat)
                 logp_all = tf.nn.log_softmax(logits)
                 gumbel_noise = tf.cast(self.gumbel_dist.sample(
                     [batch_size, self.a_dim]),
                                        dtype=tf.float32)
                 _pi = tf.nn.softmax(
                     (logp_all + gumbel_noise) / self.discrete_tau)
                 _pi_true_one_hot = tf.one_hot(tf.argmax(_pi, axis=-1),
                                               self.a_dim)
                 _pi_diff = tf.stop_gradient(_pi_true_one_hot - _pi)
                 pi = _pi_diff + _pi
                 log_pi = tf.reduce_sum(tf.multiply(logp_all, pi),
                                        axis=1,
                                        keepdims=True)
                 entropy = -tf.reduce_mean(
                     tf.reduce_sum(tf.exp(logp_all) * logp_all,
                                   axis=1,
                                   keepdims=True))
             q1, q2 = self.q_net(feat, a)
             v = self.v_net(feat)
             q1_pi, q2_pi = self.q_net(feat, pi)
             v_target = self.v_target_net(feat_)
             dc_r = tf.stop_gradient(r + self.gamma * v_target * (1 - done))
             v_from_q_stop = tf.stop_gradient(
                 tf.minimum(q1_pi, q2_pi) - self.alpha * log_pi)
             td_v = v - v_from_q_stop
             td_error1 = q1 - dc_r
             td_error2 = q2 - dc_r
             q1_loss = tf.reduce_mean(tf.square(td_error1) * isw)
             q2_loss = tf.reduce_mean(tf.square(td_error2) * isw)
             v_loss_stop = tf.reduce_mean(tf.square(td_v) * isw)
             critic_loss = 0.5 * q1_loss + 0.5 * q2_loss + 0.5 * v_loss_stop + crsty_loss
             actor_loss = -tf.reduce_mean(q1_pi - self.alpha * log_pi)
             if self.auto_adaption:
                 alpha_loss = -tf.reduce_mean(
                     self.alpha *
                     tf.stop_gradient(log_pi + self.target_entropy))
         actor_grads = tape.gradient(actor_loss, self.actor_tv)
         self.optimizer_actor.apply_gradients(
             zip(actor_grads, self.actor_tv))
         critic_grads = tape.gradient(critic_loss, self.critic_tv)
         self.optimizer_critic.apply_gradients(
             zip(critic_grads, self.critic_tv))
         if self.auto_adaption:
             alpha_grad = tape.gradient(alpha_loss, self.log_alpha)
             self.optimizer_alpha.apply_gradients([(alpha_grad,
                                                    self.log_alpha)])
         self.global_step.assign_add(1)
         summaries = dict(
             [['LOSS/actor_loss', actor_loss], ['LOSS/q1_loss', q1_loss],
              ['LOSS/q2_loss', q2_loss], ['LOSS/v_loss', v_loss_stop],
              ['LOSS/critic_loss', critic_loss],
              ['Statistics/log_alpha', self.log_alpha],
              ['Statistics/alpha', self.alpha],
              ['Statistics/entropy', entropy],
              ['Statistics/q_min',
               tf.reduce_min(tf.minimum(q1, q2))],
              ['Statistics/q_mean',
               tf.reduce_mean(tf.minimum(q1, q2))],
              ['Statistics/q_max',
               tf.reduce_max(tf.maximum(q1, q2))],
              ['Statistics/v_mean', tf.reduce_mean(v)]])
         if self.auto_adaption:
             summaries.update({'LOSS/alpha_loss': alpha_loss})
         return (td_error1 + td_error2) / 2, summaries
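
Unlike the previous examples, this variant keeps a separate state-value network (the original SAC formulation), so two regression targets are built: the value network is pushed toward

\[
\hat V(s) = \min\bigl(Q_1(s, \tilde a),\, Q_2(s, \tilde a)\bigr) - \alpha \log \pi(\tilde a \mid s), \qquad \tilde a \sim \pi(\cdot \mid s),
\]

the Q networks toward \(y_Q = r + \gamma\,(1 - d)\,V_{\text{target}}(s')\), and the actor maximizes \(Q_1(s, \tilde a) - \alpha \log \pi(\tilde a \mid s)\).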