Beispiel #1
0
    def train(self, training_num):
        total_a_loss = 0
        total_c_loss = 0

        for i in range(training_num):
            self.current_step += 1
            s, a, r, ns, d = self.buffer.sample(self.batch_size)

            value_next = tf.stop_gradient(
                self.target_critic(ns, self.target_actor(ns)))
            target_value = r + (1 - d) * self.gamma * value_next

            with tf.GradientTape(persistent=True) as tape:
                critic_loss = 0.5 * tf.reduce_mean(
                    tf.square(target_value - self.critic(s, a)))
                actor_loss = -tf.reduce_mean(self.critic(s, self.actor(s)))

            critic_grad = tape.gradient(critic_loss,
                                        self.critic.trainable_variables)
            self.critic_optimizer.apply_gradients(
                zip(critic_grad, self.critic.trainable_variables))

            actor_grad = tape.gradient(actor_loss,
                                       self.actor.trainable_variables)
            self.actor_optimizer.apply_gradients(
                (zip(actor_grad, self.actor.trainable_variables)))

            soft_update(self.actor, self.target_actor, self.tau)
            soft_update(self.critic, self.target_critic, self.tau)

            del tape

            total_a_loss += actor_loss.numpy()
            total_c_loss += critic_loss.numpy()

        return [['Loss/Actor', total_a_loss], ['Loss/Critic', total_c_loss]]
Beispiel #2
0
    def train(self, training_num):
        total_a_loss = 0
        total_c1_loss, total_c2_loss = 0, 0
        for i in range(training_num):
            self.current_step += 1
            s, a, r, ns, d = self.buffer.sample(self.batch_size)

            target_action = tf.clip_by_value(
                self.target_actor(ns) + tf.clip_by_value(
                    tf.random.normal(shape=self.target_actor(ns).shape,
                                     mean=0,
                                     stddev=self.target_noise),
                    -self.noise_clip, self.noise_clip), -1, 1)

            target_value = tf.stop_gradient(
                r + self.gamma *
                (1 - d) * tf.minimum(self.target_critic1(ns, target_action),
                                     self.target_critic2(ns, target_action)))

            with tf.GradientTape(persistent=True) as tape:
                critic1_loss = 0.5 * tf.reduce_mean(
                    tf.square(target_value - self.critic1(s, a)))
                critic2_loss = 0.5 * tf.reduce_mean(
                    tf.square(target_value - self.critic2(s, a)))

            critic1_grad = tape.gradient(critic1_loss,
                                         self.critic1.trainable_variables)
            self.critic1_optimizer.apply_gradients(
                zip(critic1_grad, self.critic1.trainable_variables))

            critic2_grad = tape.gradient(critic2_loss,
                                         self.critic2.trainable_variables)
            self.critic2_optimizer.apply_gradients(
                zip(critic2_grad, self.critic2.trainable_variables))

            if self.current_step % self.policy_delay == 0:
                with tf.GradientTape() as tape2:
                    actor_loss = -tf.reduce_mean(self.critic1(
                        s, self.actor(s)))

                actor_grad = tape2.gradient(actor_loss,
                                            self.actor.trainable_variables)
                self.actor_optimizer.apply_gradients(
                    zip(actor_grad, self.actor.trainable_variables))

                soft_update(self.actor, self.target_actor, self.tau)
                soft_update(self.critic1, self.target_critic1, self.tau)
                soft_update(self.critic2, self.target_critic2, self.tau)

            del tape, tape2
            total_a_loss += actor_loss.numpy()
            total_c1_loss += critic1_loss.numpy()
            total_c2_loss += critic2_loss.numpy()

        return [['Loss/Actor', total_a_loss], ['Loss/Critic1', total_c1_loss],
                ['Loss/Critic2', total_c2_loss]]
Beispiel #3
0
    def train(self, training_num):
        total_a_loss = 0
        total_c1_loss, total_c2_loss = 0, 0
        total_alpha_loss = 0

        for i in range(training_num):
            self.current_step += 1
            s, a, r, ns, d = self.buffer.sample(self.batch_size)

            ns_action, ns_logpi = self.actor(ns)

            target_min_aq = tf.minimum(self.target_critic1(ns, ns_action),
                                       self.target_critic2(ns, ns_action))

            target_q = tf.stop_gradient(
                r + self.gamma * (1 - d) *
                (target_min_aq - self.alpha.numpy() * ns_logpi))

            with tf.GradientTape(persistent=True) as tape1:

                critic1_loss = 0.5 * tf.reduce_mean(
                    tf.square(self.critic1(s, a) - target_q))
                critic2_loss = 0.5 * tf.reduce_mean(
                    tf.square(self.critic2(s, a) - target_q))

            critic1_gradients = tape1.gradient(
                critic1_loss, self.critic1.trainable_variables)
            self.critic1_optimizer.apply_gradients(
                zip(critic1_gradients, self.critic1.trainable_variables))
            critic2_gradients = tape1.gradient(
                critic2_loss, self.critic2.trainable_variables)
            self.critic2_optimizer.apply_gradients(
                zip(critic2_gradients, self.critic2.trainable_variables))

            del tape1

            with tf.GradientTape() as tape2:
                s_action, s_logpi = self.actor(s)

                min_aq_rep = tf.minimum(self.critic1(s, s_action),
                                        self.critic2(s, s_action))

                actor_loss = 0.5 * tf.reduce_mean(self.alpha.numpy() *
                                                  s_logpi - min_aq_rep)

            actor_gradients = tape2.gradient(actor_loss,
                                             self.actor.trainable_variables)
            self.actor_optimizer.apply_gradients(
                zip(actor_gradients, self.actor.trainable_variables))

            del tape2

            if self.train_alpha == True:
                with tf.GradientTape() as tape3:
                    _, s_logpi = self.actor(s)
                    alpha_loss = -(
                        tf.exp(self.log_alpha) *
                        (tf.stop_gradient(s_logpi + self.target_entropy)))
                    alpha_loss = tf.nn.compute_average_loss(
                        alpha_loss)  #from softlearning package

                alpha_grad = tape3.gradient(alpha_loss, [self.log_alpha])
                self.alpha_optimizer.apply_gradients(
                    zip(alpha_grad, [self.log_alpha]))

                del tape3

            if self.current_step % self.critic_update == 0:
                soft_update(self.critic1, self.target_critic1, self.tau)
                soft_update(self.critic2, self.target_critic2, self.tau)

            total_a_loss += actor_loss.numpy()
            total_c1_loss += critic1_loss.numpy()
            total_c2_loss += critic2_loss.numpy()
            if self.train_alpha == True:
                total_alpha_loss += alpha_loss.numpy()

        return [['Loss/Actor', total_a_loss], ['Loss/Critic1', total_c1_loss],
                ['Loss/Critic2', total_c2_loss],
                ['Loss/alpha', total_alpha_loss],
                ['Alpha', tf.exp(self.log_alpha).numpy()]]
Beispiel #4
0
    def train(self, local_step):
        self.current_step += 1

        total_a_loss = 0
        total_c1_loss, total_c2_loss = 0, 0
        total_alpha_loss = 0
        loss_list = []

        s, a, r, ns, d = self.buffer.rad_sample(self.batch_size, self.aug_funcs, self.pre_image_size)

        ns_action, ns_logpi = self.actor(self.encoder(ns))

        target_min_aq = tf.minimum(self.target_critic1(self.target_encoder(ns), ns_action),
                                   self.target_critic2(self.target_encoder(ns), ns_action))

        target_q = tf.stop_gradient(r + self.gamma * (1 - d) * (
                target_min_aq - self.alpha.numpy() * ns_logpi))

        with tf.GradientTape(persistent=True) as tape1:
            critic1_loss = tf.reduce_mean(tf.square(self.critic1(self.encoder(s), a) - target_q))
            critic2_loss = tf.reduce_mean(tf.square(self.critic2(self.encoder(s), a) - target_q))

        critic1_gradients = tape1.gradient(critic1_loss,
                                           self.encoder.trainable_variables + self.critic1.trainable_variables)
        self.critic1_optimizer.apply_gradients(
            zip(critic1_gradients, self.encoder.trainable_variables + self.critic1.trainable_variables))

        critic2_gradients = tape1.gradient(critic2_loss,
                                           self.encoder.trainable_variables + self.critic2.trainable_variables)
        self.critic2_optimizer.apply_gradients(
            zip(critic2_gradients, self.encoder.trainable_variables + self.critic2.trainable_variables))

        del tape1

        with tf.GradientTape() as tape2:

            s_action, s_logpi = self.actor(tf.stop_gradient(self.encoder(s)))

            min_aq_rep = tf.minimum(self.critic1(tf.stop_gradient(self.encoder(s)), s_action),
                                    self.critic2(tf.stop_gradient(self.encoder(s)), s_action))

            actor_loss = tf.reduce_mean(self.alpha.numpy() * s_logpi - min_aq_rep)

        actor_gradients = tape2.gradient(actor_loss, self.actor.trainable_variables)
        self.actor_optimizer.apply_gradients(zip(actor_gradients, self.actor.trainable_variables))
        del tape2

        if self.train_alpha == True:
            with tf.GradientTape() as tape3:
                _, s_logpi = self.actor(self.encoder(s))
                alpha_loss = -tf.exp(self.log_alpha) * tf.stop_gradient(s_logpi + self.target_entropy)
                alpha_loss = tf.nn.compute_average_loss(alpha_loss)
                #alpha_loss = tf.reduce_mean(alpha_loss)

            log_alpha_gradients = tape3.gradient(alpha_loss, [self.log_alpha])
            self.log_alpha_optimizer.apply_gradients(zip(log_alpha_gradients, [self.log_alpha]))

            del tape3

        if self.current_step % self.critic_update == 0:
            soft_update(self.critic1, self.target_critic1, self.tau)
            soft_update(self.critic2, self.target_critic2, self.tau)
            soft_update(self.encoder, self.target_encoder, self.encoder_tau)


        total_c1_loss += critic1_loss.numpy()
        total_c2_loss += critic2_loss.numpy()

        loss_list.append(['Loss/Critic1', total_c1_loss])
        loss_list.append(['Loss/Critic2', total_c2_loss])

        total_a_loss += actor_loss.numpy()
        loss_list.append(['Loss/Actor', total_a_loss])

        if self.train_alpha == True:
            total_alpha_loss += alpha_loss.numpy()
            loss_list.append(['Loss/Alpha', total_alpha_loss])

        loss_list.append(['Alpha', tf.exp(self.log_alpha).numpy()])

        return loss_list
Beispiel #5
0
    def train(self, local_step):
        self.current_step += 1

        total_a_loss = 0
        total_c1_loss, total_c2_loss = 0, 0
        total_cpc_loss = 0
        loss_list = []
        s, a, r, ns, d, cpc_kwargs = self.buffer.cpc_sample(
            self.batch_size, self.image_size)

        obs_anchor, obs_pos = cpc_kwargs["obs_anchor"], cpc_kwargs["obs_pos"]

        with tf.GradientTape(persistent=True) as tape:
            z_a = self.encoder(obs_anchor)
            z_pos = tf.stop_gradient(self.target_encoder(obs_pos))
            logits = self.curl.compute_logits(z_a, z_pos)
            labels = tf.range(logits.shape[0], dtype='int64')

            cpc_loss = tf.reduce_mean(
                tf.nn.sparse_softmax_cross_entropy_with_logits(labels, logits))

        cpc_gradients = tape.gradient(cpc_loss, self.curl.trainable_variables)
        self.cpc_optimizer.apply_gradients(
            zip(cpc_gradients, self.curl.trainable_variables))

        encoder_gradients = tape.gradient(cpc_loss,
                                          self.encoder.trainable_variables)
        self.encoder_optimizer.apply_gradients(
            zip(encoder_gradients, self.encoder.trainable_variables))

        total_cpc_loss += cpc_loss.numpy()
        loss_list.append(['Loss/CPC', total_cpc_loss])

        del tape

        if self.current_step % 2 == 0:
            target_action = tf.clip_by_value(
                self.target_actor(self.target_encoder(ns)) + tf.clip_by_value(
                    tf.random.normal(shape=self.target_actor(
                        self.target_encoder(ns)).shape,
                                     mean=0,
                                     stddev=self.target_noise),
                    -self.noise_clip, self.noise_clip), -1, 1)

            target_value = tf.stop_gradient(r + self.gamma * (
                1 - d
            ) * tf.minimum(
                self.target_critic1(self.target_encoder(ns), target_action),
                self.target_critic2(self.target_encoder(ns), target_action)))

            with tf.GradientTape(persistent=True) as tape:
                critic1_loss = 0.5 * tf.reduce_mean(
                    tf.square(target_value - self.critic1(self.encoder(s), a)))
                critic2_loss = 0.5 * tf.reduce_mean(
                    tf.square(target_value - self.critic2(self.encoder(s), a)))

            critic1_grad = tape.gradient(
                critic1_loss, self.encoder.trainable_variables +
                self.critic1.trainable_variables)
            self.critic1_optimizer.apply_gradients(
                zip(
                    critic1_grad, self.encoder.trainable_variables +
                    self.critic1.trainable_variables))

            critic2_grad = tape.gradient(
                critic2_loss, self.encoder.trainable_variables +
                self.critic2.trainable_variables)
            self.critic2_optimizer.apply_gradients(
                zip(
                    critic2_grad, self.encoder.trainable_variables +
                    self.critic2.trainable_variables))

            if self.current_step % (2 * self.policy_delay) == 0:
                with tf.GradientTape() as tape2:
                    actor_loss = -tf.reduce_mean(
                        self.critic1(
                            tf.stop_gradient(self.encoder(s)),
                            self.actor(tf.stop_gradient(self.encoder(s)))))

                actor_grad = tape2.gradient(actor_loss,
                                            self.actor.trainable_variables)
                self.actor_optimizer.apply_gradients(
                    zip(actor_grad, self.actor.trainable_variables))

                soft_update(self.actor, self.target_actor, self.tau)
                soft_update(self.critic1, self.target_critic1, self.tau)
                soft_update(self.critic2, self.target_critic2, self.tau)
                soft_update(self.encoder, self.target_encoder,
                            self.encoder_tau)

                total_a_loss += actor_loss.numpy()
                loss_list.append(['Loss/Actor', total_a_loss])

            total_c1_loss += critic1_loss.numpy()
            total_c2_loss += critic2_loss.numpy()

            loss_list.append(['Loss/Critic1', total_c1_loss])
            loss_list.append(['Loss/Critic2', total_c2_loss])

        return loss_list
Beispiel #6
0
    def train(self, local_step):
        self.current_step += 1

        total_a_loss = 0
        total_c1_loss, total_c2_loss = 0, 0
        total_cpc_loss = 0
        total_alpha_loss = 0
        loss_list = []

        s, a, r, ns, d, cpc_kwargs = self.buffer.cpc_sample(
            self.batch_size, self.image_size)

        obs_anchor, obs_pos = cpc_kwargs["obs_anchor"], cpc_kwargs["obs_pos"]

        ns_action, ns_logpi = self.actor(self.encoder(ns))

        target_min_aq = tf.minimum(
            self.target_critic1(self.target_encoder(ns), ns_action),
            self.target_critic2(self.target_encoder(ns), ns_action))

        target_q = tf.stop_gradient(
            r + self.gamma * (1 - d) *
            (target_min_aq - self.alpha.numpy() * ns_logpi))

        with tf.GradientTape(persistent=True) as tape1:
            critic1_loss = tf.reduce_mean(
                tf.square(self.critic1(self.encoder(s), a) - target_q))
            critic2_loss = tf.reduce_mean(
                tf.square(self.critic2(self.encoder(s), a) - target_q))

        critic1_gradients = tape1.gradient(
            critic1_loss, self.encoder.trainable_variables +
            self.critic1.trainable_variables)
        self.critic1_optimizer.apply_gradients(
            zip(
                critic1_gradients, self.encoder.trainable_variables +
                self.critic1.trainable_variables))

        critic2_gradients = tape1.gradient(
            critic2_loss, self.encoder.trainable_variables +
            self.critic2.trainable_variables)
        self.critic2_optimizer.apply_gradients(
            zip(
                critic2_gradients, self.encoder.trainable_variables +
                self.critic2.trainable_variables))

        del tape1

        with tf.GradientTape() as tape2:

            s_action, s_logpi = self.actor(tf.stop_gradient(self.encoder(s)))

            min_aq_rep = tf.minimum(
                self.critic1(tf.stop_gradient(self.encoder(s)), s_action),
                self.critic2(tf.stop_gradient(self.encoder(s)), s_action))

            actor_loss = tf.reduce_mean(self.alpha.numpy() * s_logpi -
                                        min_aq_rep)

        actor_gradients = tape2.gradient(actor_loss,
                                         self.actor.trainable_variables)
        self.actor_optimizer.apply_gradients(
            zip(actor_gradients, self.actor.trainable_variables))

        del tape2

        if self.train_alpha == True:
            with tf.GradientTape() as tape3:
                _, s_logpi = self.actor(self.encoder(s))
                alpha_loss = -(tf.exp(self.log_alpha) *
                               tf.stop_gradient(s_logpi + self.target_entropy))
                alpha_loss = tf.nn.compute_average_loss(alpha_loss)
                #alpha_loss = tf.reduce_mean(alpha_loss)

            log_alpha_gradients = tape3.gradient(alpha_loss, [self.log_alpha])
            self.log_alpha_optimizer.apply_gradients(
                zip(log_alpha_gradients, [self.log_alpha]))

            del tape3

        if self.current_step % self.critic_update == 0:
            soft_update(self.critic1, self.target_critic1, self.tau)
            soft_update(self.critic2, self.target_critic2, self.tau)
            soft_update(self.encoder, self.target_encoder, self.encoder_tau)

        with tf.GradientTape(persistent=True) as tape4:
            z_a = self.encoder(obs_anchor)
            z_pos = tf.stop_gradient(self.target_encoder(obs_pos))
            logits = self.curl.compute_logits(z_a, z_pos)
            labels = tf.range(logits.shape[0], dtype='int64')

            cpc_loss = tf.reduce_mean(
                tf.nn.sparse_softmax_cross_entropy_with_logits(labels, logits))

        cpc_gradients = tape4.gradient(cpc_loss, self.curl.trainable_variables)
        self.cpc_optimizer.apply_gradients(
            zip(cpc_gradients, self.curl.trainable_variables))

        encoder_gradients = tape4.gradient(cpc_loss,
                                           self.encoder.trainable_variables)
        self.encoder_optimizer.apply_gradients(
            zip(encoder_gradients, self.encoder.trainable_variables))

        del tape4

        total_c1_loss += critic1_loss.numpy()
        total_c2_loss += critic2_loss.numpy()

        loss_list.append(['Loss/Critic1', total_c1_loss])
        loss_list.append(['Loss/Critic2', total_c2_loss])

        total_a_loss += actor_loss.numpy()
        loss_list.append(['Loss/Actor', total_a_loss])

        total_cpc_loss += cpc_loss.numpy()
        loss_list.append(['Loss/CPC', total_cpc_loss])

        if self.train_alpha == True:
            total_alpha_loss += alpha_loss.numpy()
            loss_list.append(['Loss/Alpha', total_alpha_loss])

        loss_list.append(['Alpha', tf.exp(self.log_alpha).numpy()])

        return loss_list
Beispiel #7
0
    def train(self, training_step):
        total_a_loss = 0
        total_c1_loss, total_c2_loss = 0, 0
        total_v_loss = 0
        total_cpc_loss = 0
        loss_list = []

        self.current_step += 1

        s, a, r, ns, d, cpc_kwargs = self.buffer.cpc_sample(
            self.batch_size, self.image_size)

        obs_anchor, obs_pos = cpc_kwargs["obs_anchor"], cpc_kwargs["obs_pos"]

        s_action, s_logpi = self.actor(self.encoder(s))

        min_aq = tf.minimum(self.critic1(self.encoder(s), s_action),
                            self.critic2(self.encoder(s), s_action))
        target_v = tf.stop_gradient(min_aq - self.alpha * s_logpi)

        with tf.GradientTape() as tape1:
            v_loss = 0.5 * tf.reduce_mean(
                tf.square(
                    self.v_network(tf.stop_gradient(self.encoder(s))) -
                    target_v))

        v_gradients = tape1.gradient(v_loss,
                                     self.v_network.trainable_variables)
        self.v_network_optimizer.apply_gradients(
            zip(v_gradients, self.v_network.trainable_variables))

        del tape1

        target_q = tf.stop_gradient(
            r + self.gamma *
            (1 - d) * self.target_v_network(self.target_encoder(ns)))

        with tf.GradientTape(persistent=True) as tape2:
            critic1_loss = 0.5 * tf.reduce_mean(
                tf.square(self.critic1(self.encoder(s), a) - target_q))
            critic2_loss = 0.5 * tf.reduce_mean(
                tf.square(self.critic2(self.encoder(s), a) - target_q))

        critic1_gradients = tape2.gradient(
            critic1_loss, self.encoder.trainable_variables +
            self.critic1.trainable_variables)

        critic2_gradients = tape2.gradient(
            critic2_loss, self.encoder.trainable_variables +
            self.critic2.trainable_variables)

        self.critic1_optimizer.apply_gradients(
            zip(
                critic1_gradients, self.encoder.trainable_variables +
                self.critic1.trainable_variables))

        self.critic2_optimizer.apply_gradients(
            zip(
                critic2_gradients, self.encoder.trainable_variables +
                self.critic2.trainable_variables))

        del tape2

        with tf.GradientTape() as tape3:
            s_action, s_logpi = self.actor(tf.stop_gradient(self.encoder(s)))

            min_aq_rep = tf.minimum(
                self.critic1(tf.stop_gradient(self.encoder(s)), s_action),
                self.critic2(tf.stop_gradient(self.encoder(s)), s_action))

            actor_loss = tf.reduce_mean(self.alpha * s_logpi - min_aq_rep)

        actor_gradients = tape3.gradient(actor_loss,
                                         self.actor.trainable_variables)
        self.actor_optimizer.apply_gradients(
            zip(actor_gradients, self.actor.trainable_variables))

        soft_update(self.v_network, self.target_v_network, self.tau)

        with tf.GradientTape(persistent=True) as tape4:
            z_a = self.encoder(obs_anchor)
            z_pos = tf.stop_gradient(self.target_encoder(obs_pos))

            logits = self.curl.compute_logits(z_a, z_pos)
            labels = tf.range(logits.shape[0], dtype='int64')

            cpc_loss = tf.reduce_mean(
                tf.nn.sparse_softmax_cross_entropy_with_logits(labels, logits))

        cpc_gradients = tape4.gradient(cpc_loss, self.curl.trainable_variables)
        self.cpc_optimizer.apply_gradients(
            (zip(cpc_gradients, self.curl.trainable_variables)))

        encoder_gradients = tape4.gradient(cpc_loss,
                                           self.encoder.trainable_variables)
        self.encoder_optimizer.apply_gradients(
            zip(encoder_gradients, self.encoder.trainable_variables))

        soft_update(self.encoder, self.target_encoder, self.encoder_tau)

        del tape4

        total_v_loss += v_loss.numpy()
        loss_list.append(['Loss/V', total_v_loss])

        total_c1_loss += critic1_loss.numpy()
        total_c2_loss += critic2_loss.numpy()

        loss_list.append(['Loss/Critic1', total_c1_loss])
        loss_list.append(['Loss/Critic2', total_c2_loss])

        total_a_loss += actor_loss.numpy()
        loss_list.append(['Loss/Actor', total_a_loss])

        total_cpc_loss += cpc_loss.numpy()
        loss_list.append(['Loss/CPC', total_cpc_loss])

        return loss_list
Beispiel #8
0
    def train(self,
              local_step):  #critic -> transition -> reward -> encoder -> actor
        set1, set2 = self.buffer.dbc_sample(self.batch_size)

        s, a, r, ns, d = set1
        s2, a2, r2, ns2, d2 = set2

        target_action = tf.clip_by_value(
            self.target_actor(self.target_encoder(ns)) + tf.clip_by_value(
                tf.random.normal(shape=self.target_actor(
                    self.target_encoder(ns)).shape,
                                 mean=0,
                                 stddev=self.target_noise), -self.noise_clip,
                self.noise_clip), -1, 1)

        target_value = tf.stop_gradient(r + self.gamma * (1 - d) * tf.minimum(
            self.target_critic1(self.target_encoder(ns), target_action),
            self.target_critic2(self.target_encoder(ns), target_action)))

        with tf.GradientTape(persistent=True) as tape1:
            critic1_loss = 0.5 * tf.reduce_mean(
                tf.square(target_value - self.critic1(self.encoder(s), a)))
            critic2_loss = 0.5 * tf.reduce_mean(
                tf.square(target_value - self.critic2(self.encoder(s), a)))

        critic1_grad = tape1.gradient(
            critic1_loss, self.encoder.trainable_variables +
            self.critic1.trainable_variables)
        self.critic1_optimizer.apply_gradients(
            zip(
                critic1_grad, self.encoder.trainable_variables +
                self.critic1.trainable_variables))

        critic2_grad = tape1.gradient(
            critic2_loss, self.encoder.trainable_variables +
            self.critic2.trainable_variables)
        self.critic2_optimizer.apply_gradients(
            zip(
                critic2_grad, self.encoder.trainable_variables +
                self.critic2.trainable_variables))

        del tape1

        #train dynamics
        with tf.GradientTape() as tape2:
            feature = self.encoder(s)
            next_feature = self.encoder(ns)
            mu, sigma = self.dynamics_model(tf.concat([feature, a], axis=1))

            if (sigma[0][0].numpy() == 0):
                sigma = tf.ones_like(mu)
            diff = (mu - tf.stop_gradient(next_feature)) / sigma
            dynamics_loss = tf.reduce_mean(0.5 * tf.square(diff) +
                                           tf.math.log(sigma))

        dynamics_gradients = tape2.gradient(
            dynamics_loss, self.encoder.trainable_variables +
            self.dynamics_model.trainable_variables)
        self.dynamics_optimizer.apply_gradients(
            zip(
                dynamics_gradients, self.encoder.trainable_variables +
                self.dynamics_model.trainable_variables))

        #dynamics_gradients = tape2.gradient(dynamics_loss, self.dynamics_model.trainable_variables)
        #self.dynamics_optimizer.apply_gradients(zip(dynamics_gradients, self.dynamics_model.trainable_variables))

        del tape2

        #train reward
        with tf.GradientTape() as tape3:
            feature = self.encoder(s)
            sample_dynamics = self.dynamics_model.sample(
                tf.concat([feature, a], axis=1))
            reward_prediction = self.reward_model(sample_dynamics)

            reward_loss = tf.reduce_mean(tf.square(reward_prediction - (r)))

        reward_gradients = tape3.gradient(
            reward_loss, self.encoder.trainable_variables +
            self.reward_model.trainable_variables)
        self.reward_optimizer.apply_gradients(
            zip(
                reward_gradients, self.encoder.trainable_variables +
                self.reward_model.trainable_variables))

        #reward_gradients = tape3.gradient(reward_loss, self.reward_model.trainable_variables)
        #self.reward_optimizer.apply_gradients(zip(reward_gradients, self.reward_model.trainable_variables))

        del tape3

        #train encoder
        with tf.GradientTape() as tape4:
            feature1 = self.encoder(s)
            feature2 = self.encoder(s2)

            mu1, sigma1 = self.dynamics_model(tf.concat([feature1, a], axis=1))
            mu2, sigma2 = self.dynamics_model(tf.concat([feature2, a2],
                                                        axis=1))

            z_dist = tf.abs(feature1 - feature2)
            r_dist = tf.abs(r - r2)

            transition_dist = tf.sqrt(
                tf.square(tf.abs(mu1 - mu2)) +
                tf.square(tf.abs(sigma1 - sigma2)))
            bisimilarity = tf.stop_gradient(
                tf.cast(r_dist, tf.float32) +
                self.gamma * tf.cast(transition_dist, tf.float32))
            encoder_loss = self.bisim_coef * tf.reduce_mean(
                tf.square(z_dist - bisimilarity))

        encoder_gradients = tape4.gradient(encoder_loss,
                                           self.encoder.trainable_variables)
        self.encoder_optimizer.apply_gradients(
            zip(encoder_gradients, self.encoder.trainable_variables))

        del tape4

        if local_step % (self.policy_delay) == 0:
            with tf.GradientTape() as tape5:
                actor_loss = -tf.reduce_mean(
                    self.critic1(tf.stop_gradient(self.encoder(s)),
                                 self.actor(tf.stop_gradient(
                                     self.encoder(s)))))

            actor_grad = tape5.gradient(actor_loss,
                                        self.actor.trainable_variables)
            self.actor_optimizer.apply_gradients(
                zip(actor_grad, self.actor.trainable_variables))

            del tape5

            soft_update(self.actor, self.target_actor, self.tau)
            soft_update(self.critic1, self.target_critic1, self.tau)
            soft_update(self.critic2, self.target_critic2, self.tau)
            soft_update(self.encoder, self.target_encoder, self.encoder_tau)
Beispiel #9
0
    def train(self, local_step):
        self.current_step += 1
        total_a_loss = 0
        total_c1_loss, total_c2_loss = 0, 0
        total_alpha_loss = 0
        total_encoder_loss = 0
        total_dynamics_loss = 0
        total_reward_loss = 0
        loss_list = []
        s, a, r, ns, d = self.buffer.sample(self.batch_size)

        ns_action, ns_logpi = self.actor(self.encoder(ns))

        target_min_aq = tf.minimum(
            self.target_critic1(self.target_encoder(ns), ns_action),
            self.target_critic2(self.target_encoder(ns), ns_action))

        target_q = tf.stop_gradient(
            r + self.gamma * (1 - d) *
            (target_min_aq - self.alpha.numpy() * ns_logpi))

        with tf.GradientTape(persistent=True) as tape1:
            critic1_loss = tf.reduce_mean(
                tf.square(self.critic1(self.encoder(s), a) - target_q))
            critic2_loss = tf.reduce_mean(
                tf.square(self.critic2(self.encoder(s), a) - target_q))

        critic1_gradients = tape1.gradient(
            critic1_loss, self.encoder.trainable_variables +
            self.critic1.trainable_variables)
        self.critic1_optimizer.apply_gradients(
            zip(
                critic1_gradients, self.encoder.trainable_variables +
                self.critic1.trainable_variables))

        critic2_gradients = tape1.gradient(
            critic2_loss, self.encoder.trainable_variables +
            self.critic2.trainable_variables)
        self.critic2_optimizer.apply_gradients(
            zip(
                critic2_gradients, self.encoder.trainable_variables +
                self.critic2.trainable_variables))

        del tape1

        if self.current_step % self.actor_update == 0:
            with tf.GradientTape() as tape2:

                s_action, s_logpi = self.actor(
                    tf.stop_gradient(self.encoder(s)))

                min_aq_rep = tf.minimum(
                    self.critic1(tf.stop_gradient(self.encoder(s)), s_action),
                    self.critic2(tf.stop_gradient(self.encoder(s)), s_action))

                actor_loss = tf.reduce_mean(self.alpha.numpy() * s_logpi -
                                            min_aq_rep)

            actor_gradients = tape2.gradient(actor_loss,
                                             self.actor.trainable_variables)
            self.actor_optimizer.apply_gradients(
                zip(actor_gradients, self.actor.trainable_variables))

            del tape2

            if self.train_alpha == True:
                with tf.GradientTape() as tape3:
                    _, s_logpi = self.actor(self.encoder(s))
                    alpha_loss = -(
                        tf.exp(self.log_alpha) *
                        tf.stop_gradient(s_logpi + self.target_entropy))
                    alpha_loss = tf.nn.compute_average_loss(alpha_loss)
                    #alpha_loss = tf.reduce_mean(alpha_loss)

                log_alpha_gradients = tape3.gradient(alpha_loss,
                                                     [self.log_alpha])
                self.log_alpha_optimizer.apply_gradients(
                    zip(log_alpha_gradients, [self.log_alpha]))

                del tape3

        if self.current_step % self.critic_update == 0:
            soft_update(self.critic1, self.target_critic1, self.tau)
            soft_update(self.critic2, self.target_critic2, self.tau)
            soft_update(self.encoder, self.target_encoder, self.encoder_tau)

        #train encoder
        with tf.GradientTape() as tape4:
            new_ids = np.arange(len(s))
            np.random.shuffle(new_ids)
            s2 = tf.gather(s, new_ids)

            feature = self.encoder(s)
            #feature2 = tf.gather(feature, new_ids)
            feature2 = self.encoder(s2)

            reward = self.reward_model(tf.stop_gradient(feature))
            #reward2 = tf.gather(reward, new_ids)
            reward2 = self.reward_model(tf.stop_gradient(feature2))

            feature_action, _ = self.actor(tf.stop_gradient(feature), True)
            feature2_action, _ = self.actor(tf.stop_gradient(feature2), True)

            mu, sigma = self.dynamics_model(tf.stop_gradient(feature),
                                            feature_action)
            mu2, sigma2 = self.dynamics_model(tf.stop_gradient(feature2),
                                              feature2_action)

            z_dist = tf.reshape(tf.keras.losses.huber(feature, feature2),
                                shape=[-1, 1])
            r_dist = tf.reshape(tf.keras.losses.huber(reward, reward2),
                                shape=[-1, 1])
            transition_dist = tf.sqrt(
                tf.square(mu - mu2) + tf.square(sigma - sigma2))

            bisimilarity = r_dist + self.gamma * transition_dist
            encoder_loss = tf.reduce_mean(tf.square(z_dist - bisimilarity))

        encoder_gradients = tape4.gradient(encoder_loss,
                                           self.encoder.trainable_variables)
        self.encoder_optimizer.apply_gradients(
            zip(encoder_gradients, self.encoder.trainable_variables))

        #train dynamics
        with tf.GradientTape() as tape5:
            feature = self.encoder(s)
            mu, sigma = self.dynamics_model(feature, a)

            if (sigma[0][0].numpy() == 0):
                if self.dynamics_model.deterministic == False:
                    print("error")
                sigma = tf.ones_like(mu)

            next_feature = self.encoder(ns)
            diff = (mu - tf.stop_gradient(next_feature)) / sigma

            dynamics_loss = tf.reduce_mean(0.5 * tf.square(diff) +
                                           tf.math.log(sigma))

        dynamics_gradients = tape5.gradient(
            dynamics_loss, self.encoder.trainable_variables +
            self.dynamics_model.trainable_variables)
        self.dynamics_optimizer.apply_gradients(
            zip(
                dynamics_gradients, self.encoder.trainable_variables +
                self.dynamics_model.trainable_variables))

        #train reward
        with tf.GradientTape() as tape6:
            feature = self.encoder(s)
            sample_dynamics = self.dynamics_model.sample(feature, a)
            reward_prediction = self.reward_model(sample_dynamics)

            reward_loss = tf.reduce_mean(tf.square(reward_prediction - r))

        reward_gradients = tape6.gradient(
            reward_loss, self.encoder.trainable_variables +
            self.reward_model.trainable_variables)
        self.reward_optimizer.apply_gradients(
            zip(
                reward_gradients, self.encoder.trainable_variables +
                self.reward_model.trainable_variables))

        total_c1_loss += critic1_loss.numpy()
        total_c2_loss += critic2_loss.numpy()

        loss_list.append(['Loss/Critic1', total_c1_loss])
        loss_list.append(['Loss/Critic2', total_c2_loss])

        if self.current_step % self.actor_update == 0:
            total_a_loss += actor_loss.numpy()
            loss_list.append(['Loss/Actor', total_a_loss])

        total_encoder_loss += encoder_loss.numpy()
        loss_list.append(['Loss/Encoder', total_encoder_loss])

        total_dynamics_loss += dynamics_loss.numpy()
        loss_list.append(['Loss/Dynamics', total_dynamics_loss])

        total_reward_loss += reward_loss.numpy()
        loss_list.append(['Loss/Reward', total_reward_loss])

        if self.current_step % self.actor_update == 0 and self.train_alpha == True:
            total_alpha_loss += alpha_loss.numpy()
            loss_list.append(['Loss/Alpha', total_alpha_loss])

        loss_list.append(['Alpha', tf.exp(self.log_alpha).numpy()])

        return loss_list
Beispiel #10
0
    def train(self, local_step):
        set1, set2 = self.buffer.dbc_sample(self.batch_size)

        s, a, r, ns, d = set1
        s2, a2, r2, ns2, d2 = set2

        target_min_aq = tf.minimum(
            self.target_critic1(self.target_encoder(ns),
                                self.actor(self.encoder(ns))),
            self.target_critic2(self.target_encoder(ns),
                                self.actor(self.encoder(ns))))

        target_q = tf.stop_gradient(r + self.gamma * (1 - d) *
                                    (target_min_aq - self.alpha.numpy() *
                                     self.actor.log_pi(self.encoder(ns))))

        with tf.GradientTape(persistent=True) as tape1:
            critic1_loss = tf.reduce_mean(
                tf.square(self.critic1(self.encoder(s), a) - target_q))
            critic2_loss = tf.reduce_mean(
                tf.square(self.critic2(self.encoder(s), a) - target_q))

        critic1_gradients = tape1.gradient(
            critic1_loss, self.encoder.trainable_variables +
            self.critic1.trainable_variables)
        self.critic1_optimizer.apply_gradients(
            zip(
                critic1_gradients, self.encoder.trainable_variables +
                self.critic1.trainable_variables))

        critic2_gradients = tape1.gradient(
            critic2_loss, self.encoder.trainable_variables +
            self.critic2.trainable_variables)
        self.critic2_optimizer.apply_gradients(
            zip(
                critic2_gradients, self.encoder.trainable_variables +
                self.critic2.trainable_variables))

        del tape1

        #train dynamics(encoder used together)
        next_feature = self.encoder(ns)
        with tf.GradientTape() as tape2:
            feature = self.encoder(s)

            mu, sigma = self.dynamics_model(tf.concat([feature, a], axis=1))

            if (sigma[0][0].numpy() == 0):
                if self.dynamics_model.deterministic == False:
                    print("error")
                sigma = tf.ones_like(mu)

            diff = (mu - tf.stop_gradient(next_feature)) / sigma
            dynamics_loss = tf.reduce_mean(0.5 * tf.square(diff) +
                                           tf.math.log(sigma))

        dynamics_gradients = tape2.gradient(
            dynamics_loss, self.encoder.trainable_variables +
            self.dynamics_model.trainable_variables)
        self.dynamics_optimizer.apply_gradients(
            zip(
                dynamics_gradients, self.encoder.trainable_variables +
                self.dynamics_model.trainable_variables))

        del tape2

        #train rewards(encoder used together)
        with tf.GradientTape() as tape3:
            feature = self.encoder(s)
            sample_dynamics = self.dynamics_model.sample(
                tf.concat([feature, a], axis=1))
            reward_prediction = self.reward_model(sample_dynamics)

            reward_loss = tf.reduce_mean(tf.square(reward_prediction - r))

        reward_gradients = tape3.gradient(
            reward_loss, self.encoder.trainable_variables +
            self.reward_model.trainable_variables)
        self.reward_optimizer.apply_gradients(
            zip(
                reward_gradients, self.encoder.trainable_variables +
                self.reward_model.trainable_variables))

        del tape3

        # train encoder
        with tf.GradientTape() as tape4:
            feature1 = self.encoder(s)
            feature2 = self.encoder(s2)

            mu1, sigma1 = self.dynamics_model(tf.concat([feature1, a], axis=1))
            mu2, sigma2 = self.dynamics_model(tf.concat([feature2, a2],
                                                        axis=1))

            z_dist = tf.abs(feature1 - feature2)
            r_dist = tf.abs(r - r2)

            transition_dist = tf.sqrt(
                tf.square(tf.abs(mu1 - mu2)) +
                tf.square(tf.abs(sigma1 - sigma2)))
            bisimilarity = (
                tf.cast(r_dist, tf.float32) +
                self.gamma * tf.cast(transition_dist, tf.float32)).numpy()
            encoder_loss = self.bisim_coef * tf.reduce_mean(
                tf.square(z_dist - bisimilarity))

        encoder_gradients = tape4.gradient(encoder_loss,
                                           self.encoder.trainable_variables)
        self.encoder_optimizer.apply_gradients(
            zip(encoder_gradients, self.encoder.trainable_variables))

        del tape4

        if local_step % 2 == 0:
            with tf.GradientTape() as tape5:
                mu, sigma = self.actor.mu_sigma(
                    tf.stop_gradient(self.encoder(s)))
                output = mu + tf.random.normal(shape=mu.shape) * sigma

                min_aq_rep = tf.minimum(
                    self.critic1(tf.stop_gradient(self.encoder(s)), output),
                    self.critic2(tf.stop_gradient(self.encoder(s)), output))

                actor_loss = tf.reduce_mean(
                    self.alpha.numpy() *
                    self.actor.log_pi(tf.stop_gradient(self.encoder(s))) -
                    min_aq_rep)

            actor_gradients = tape5.gradient(actor_loss,
                                             self.actor.trainable_variables)
            self.actor_optimizer.apply_gradients(
                zip(actor_gradients, self.actor.trainable_variables))

            del tape5

            if self.train_alpha == True:
                with tf.GradientTape() as tape6:
                    alpha_loss = -(tf.exp(self.log_alpha) * tf.stop_gradient(
                        self.actor.log_pi(self.encoder(s)) +
                        self.target_entropy))
                    alpha_loss = tf.nn.compute_average_loss(alpha_loss)

                log_alpha_gradients = tape6.gradient(alpha_loss,
                                                     [self.log_alpha])
                self.log_alpha_optimizer.apply_gradients(
                    zip(log_alpha_gradients, [self.log_alpha]))

                del tape6

            soft_update(self.critic1, self.target_critic1, self.tau)
            soft_update(self.critic2, self.target_critic2, self.tau)
            soft_update(self.encoder, self.target_encoder, self.encoder_tau)
Beispiel #11
0
    def train(self, local_step):
        self.current_step += 1

        total_a_loss = 0
        total_c1_loss, total_c2_loss = 0, 0
        total_alpha_loss = 0
        total_ae_loss = 0
        loss_list = []

        s, a, r, ns, d = self.buffer.sample(self.batch_size)

        ns_action, ns_logpi = self.actor(self.encoder(ns))

        target_min_aq = tf.minimum(
            self.target_critic1(self.target_encoder(ns), ns_action),
            self.target_critic2(self.target_encoder(ns), ns_action))

        target_q = tf.stop_gradient(
            r + self.gamma * (1 - d) *
            (target_min_aq - self.alpha.numpy() * ns_logpi))
        #critic update
        with tf.GradientTape(persistent=True) as tape1:
            critic1_loss = tf.reduce_mean(
                tf.square(self.critic1(self.encoder(s), a) - target_q))
            critic2_loss = tf.reduce_mean(
                tf.square(self.critic2(self.encoder(s), a) - target_q))

        critic1_gradients = tape1.gradient(
            critic1_loss, self.encoder.trainable_variables +
            self.critic1.trainable_variables)
        self.critic1_optimizer.apply_gradients(
            zip(
                critic1_gradients, self.encoder.trainable_variables +
                self.critic1.trainable_variables))

        critic2_gradients = tape1.gradient(
            critic2_loss, self.encoder.trainable_variables +
            self.critic2.trainable_variables)
        self.critic2_optimizer.apply_gradients(
            zip(
                critic2_gradients, self.encoder.trainable_variables +
                self.critic2.trainable_variables))

        del tape1

        #actor update
        if self.current_step % self.actor_update == 0:
            with tf.GradientTape() as tape2:

                s_action, s_logpi = self.actor(
                    tf.stop_gradient(self.encoder(s)))

                min_aq_rep = tf.minimum(
                    self.critic1(tf.stop_gradient(self.encoder(s)), s_action),
                    self.critic2(tf.stop_gradient(self.encoder(s)), s_action))

                actor_loss = tf.reduce_mean(self.alpha.numpy() * s_logpi -
                                            min_aq_rep)

            actor_gradients = tape2.gradient(actor_loss,
                                             self.actor.trainable_variables)
            self.actor_optimizer.apply_gradients(
                zip(actor_gradients, self.actor.trainable_variables))

            del tape2
            #alpha update
            if self.train_alpha == True:
                with tf.GradientTape() as tape3:
                    _, s_logpi = self.actor(self.encoder(s))
                    alpha_loss = -(
                        tf.exp(self.log_alpha) *
                        tf.stop_gradient(s_logpi + self.target_entropy))
                    alpha_loss = tf.nn.compute_average_loss(alpha_loss)

                log_alpha_gradients = tape3.gradient(alpha_loss,
                                                     [self.log_alpha])
                self.log_alpha_optimizer.apply_gradients(
                    zip(log_alpha_gradients, [self.log_alpha]))

                del tape3

        if self.current_step % self.decoder_update == 0:
            #encoder, decoder update
            with tf.GradientTape(persistent=True) as tape4:
                feature = self.encoder(s)
                recovered_s = self.decoder(feature)
                real_s = preprocess_obs(s)

                rec_loss = tf.reduce_mean(tf.square(recovered_s - real_s))
                latent_loss = tf.reduce_mean(
                    0.5 * tf.reduce_sum(tf.square(feature), axis=1))

                ae_loss = rec_loss + self.decoder_latent_lambda * latent_loss

            encoder_gradients = tape4.gradient(
                ae_loss, self.encoder.trainable_variables)
            decoder_gradients = tape4.gradient(
                ae_loss, self.decoder.trainable_variables)

            self.encoder_optimizer.apply_gradients(
                zip(encoder_gradients, self.encoder.trainable_variables))
            self.decoder_optimizer.apply_gradients(
                zip(decoder_gradients, self.decoder.trainable_variables))

        if self.current_step % self.critic_update == 0:

            soft_update(self.critic1, self.target_critic1, self.tau)
            soft_update(self.critic2, self.target_critic2, self.tau)
            soft_update(self.encoder, self.target_encoder, self.encoder_tau)

        del tape4

        total_c1_loss += critic1_loss.numpy()
        total_c2_loss += critic2_loss.numpy()

        loss_list.append(['Loss/Critic1', total_c1_loss])
        loss_list.append(['Loss/Critic2', total_c2_loss])

        if self.current_step % self.decoder_update == 0:
            total_ae_loss += ae_loss.numpy()
            loss_list.append(['Loss/AutoEncoder', total_ae_loss])

        if self.current_step % self.actor_update == 0:
            total_a_loss += actor_loss.numpy()
            loss_list.append(['Loss/Actor', total_a_loss])
            if self.train_alpha == True:
                total_alpha_loss += alpha_loss.numpy()
                loss_list.append(['Loss/Alpha', total_alpha_loss])

        loss_list.append(['Alpha', tf.exp(self.log_alpha).numpy()])

        return loss_list
Beispiel #12
0
    def train(self, training_num):
        for i in range(training_num):
            s, a, r, ns, d = self.buffer.sample(self.batch_size)

            target_min_aq = tf.minimum(self.target_critic1(ns, self.actor(ns)),
                                       self.target_critic2(ns, self.actor(ns)))

            target_q = tf.stop_gradient(
                r + self.gamma * (1 - d) *
                (target_min_aq - self.alpha.numpy() * self.actor.log_pi(ns)))

            #critic training
            with tf.GradientTape(persistent=True) as tape1:
                critic1_loss = tf.reduce_mean(
                    tf.square(self.critic1(s, a) - target_q))
                critic2_loss = tf.reduce_mean(
                    tf.square(self.critic2(s, a) - target_q))

            critic1_gradients = tape1.gradient(
                critic1_loss, self.critic1.trainable_variables)
            self.critic1_optimizer.apply_gradients(
                zip(critic1_gradients, self.critic1.trainable_variables))
            critic2_gradients = tape1.gradient(
                critic2_loss, self.critic2.trainable_variables)
            self.critic2_optimizer.apply_gradients(
                zip(critic2_gradients, self.critic2.trainable_variables))

            del tape1

            #actor training
            with tf.GradientTape() as tape2:
                mu, sigma = self.actor.mu_sigma(s)
                output = mu + tf.random.normal(shape=mu.shape) * sigma

                min_aq_rep = tf.minimum(self.critic1(s, output),
                                        self.critic2(s, output))

                actor_loss = tf.reduce_mean(self.alpha.numpy() *
                                            self.actor.log_pi(s) - min_aq_rep)

            actor_gradients = tape2.gradient(actor_loss,
                                             self.actor.trainable_variables)
            self.actor_optimizer.apply_gradients(
                zip(actor_gradients, self.actor.trainable_variables))

            del tape2

            #alpha(temperature) training
            if self.train_alpha == True:
                with tf.GradientTape() as tape3:
                    alpha_loss = -(tf.exp(self.log_alpha) * (tf.stop_gradient(
                        self.actor.log_pi(s) + self.target_entropy)))
                    alpha_loss = tf.nn.compute_average_loss(
                        alpha_loss)  #from softlearning package

                alpha_grad = tape3.gradient(alpha_loss, [self.log_alpha])
                self.alpha_optimizer.apply_gradients(
                    zip(alpha_grad, [self.log_alpha]))

                del tape3

            soft_update(self.critic1, self.target_critic1, self.tau)
            soft_update(self.critic2, self.target_critic2, self.tau)
Beispiel #13
0
    def train(self, training_num):
        for i in range(training_num):
            s, a, r, ns, d = self.buffer.sample(self.batch_size)

            min_aq = tf.minimum(self.critic1(s, self.actor(s)),
                                self.critic2(s, self.actor(s)))

            target_v = tf.stop_gradient(min_aq -
                                        self.alpha * self.actor.log_pi(s))
            #v_network training
            with tf.GradientTape(persistent=True) as tape1:
                v_loss = 0.5 * tf.reduce_mean(
                    tf.square(self.v_network(s) - target_v))

            v_gradients = tape1.gradient(v_loss,
                                         self.v_network.trainable_variables)
            self.v_network_optimizer.apply_gradients(
                zip(v_gradients, self.v_network.trainable_variables))

            del tape1

            target_q = tf.stop_gradient(r + self.gamma *
                                        (1 - d) * self.target_v_network(ns))
            #critic training
            with tf.GradientTape(persistent=True) as tape2:

                critic1_loss = 0.5 * tf.reduce_mean(
                    tf.square(self.critic1(s, a) - target_q))
                critic2_loss = 0.5 * tf.reduce_mean(
                    tf.square(self.critic2(s, a) - target_q))

            critic1_gradients = tape2.gradient(
                critic1_loss, self.critic1.trainable_variables)
            self.critic1_optimizer.apply_gradients(
                zip(critic1_gradients, self.critic1.trainable_variables))

            critic2_gradients = tape2.gradient(
                critic2_loss, self.critic2.trainable_variables)
            self.critic2_optimizer.apply_gradients(
                zip(critic2_gradients, self.critic2.trainable_variables))

            del tape2
            #actor training
            with tf.GradientTape() as tape3:
                mu, sigma = self.actor.mu_sigma(s)
                output = mu + tf.random.normal(shape=sigma.shape) * sigma

                min_aq_rep = tf.minimum(self.critic1(s, output),
                                        self.critic2(s, output))

                actor_loss = tf.reduce_mean(self.alpha * self.actor.log_pi(s) -
                                            min_aq_rep)

            actor_grad = tape3.gradient(actor_loss,
                                        self.actor.trainable_variables)
            self.actor_optimizer.apply_gradients(
                zip(actor_grad, self.actor.trainable_variables))

            del tape3

            soft_update(self.v_network, self.target_v_network, self.tau)
Beispiel #14
0
    def train(self, training_num):
        total_a_loss = 0
        total_c1_loss, total_c2_loss = 0, 0
        total_v_loss = 0
        for i in range(training_num):
            self.current_step += 1

            s, a, r, ns, d = self.buffer.sample(self.batch_size)
            s_action, s_logpi = self.actor(s)
            min_aq = tf.minimum(self.critic1(s, s_action),
                                self.critic2(s, s_action))
            target_v = tf.stop_gradient(min_aq - self.alpha * s_logpi)

            with tf.GradientTape() as tape1:
                v_loss = 0.5 * tf.reduce_mean(
                    tf.square(self.v_network(s) - target_v))

            v_gradients = tape1.gradient(v_loss,
                                         self.v_network.trainable_variables)
            self.v_network_optimizer.apply_gradients(
                zip(v_gradients, self.v_network.trainable_variables))

            target_q = tf.stop_gradient(r + self.gamma *
                                        (1 - d) * self.target_v_network(ns))

            with tf.GradientTape(persistent=True) as tape2:
                critic1_loss = 0.5 * tf.reduce_mean(
                    tf.square(self.critic1(s, a) - target_q))
                critic2_loss = 0.5 * tf.reduce_mean(
                    tf.square(self.critic2(s, a) - target_q))

            critic1_gradients = tape2.gradient(
                critic1_loss, self.critic1.trainable_variables)
            self.critic1_optimizer.apply_gradients(
                zip(critic1_gradients, self.critic1.trainable_variables))

            critic2_gradients = tape2.gradient(
                critic2_loss, self.critic2.trainable_variables)
            self.critic2_optimizer.apply_gradients(
                zip(critic2_gradients, self.critic2.trainable_variables))

            with tf.GradientTape() as tape3:
                s_action, s_logpi = self.actor(s)

                min_aq_rep = tf.minimum(self.critic1(s, s_action),
                                        self.critic2(s, s_action))
                actor_loss = tf.reduce_mean(self.alpha * s_logpi - min_aq_rep)

            actor_grad = tape3.gradient(actor_loss,
                                        self.actor.trainable_variables)
            self.actor_optimizer.apply_gradients(
                zip(actor_grad, self.actor.trainable_variables))

            soft_update(self.v_network, self.target_v_network, self.tau)

            del tape1, tape2, tape3

            total_a_loss += actor_loss.numpy()
            total_c1_loss += critic1_loss.numpy()
            total_c2_loss += critic2_loss.numpy()
            total_v_loss += v_loss.numpy()

        return [['Loss/Actor', total_a_loss], ['Loss/Critic1', total_c1_loss],
                ['Loss/Critic2', total_c2_loss], ['Loss/V', total_v_loss]]