예제 #1
0
파일: CURL.py 프로젝트: Cerphilly/SimpleRL
class CURL_SACv2:
    def __init__(self, obs_dim, action_dim, args):

        self.buffer = Buffer(args.buffer_size)

        self.obs_dim = obs_dim
        self.action_dim = action_dim
        self.image_size = obs_dim[-1]
        self.current_step = 0

        self.log_alpha = tf.Variable(initial_value=tf.math.log(args.alpha),
                                     trainable=True)
        self.target_entropy = -action_dim
        self.gamma = args.gamma

        self.batch_size = args.batch_size
        self.feature_dim = args.feature_dim
        self.curl_latent_dim = args.curl_latent_dim

        self.layer_num = args.layer_num
        self.filter_num = args.filter_num
        self.tau = args.tau
        self.encoder_tau = args.encoder_tau
        self.critic_update = args.critic_update

        self.training_start = args.training_start
        self.training_step = args.training_step
        self.train_alpha = args.train_alpha

        self.actor = Squashed_Gaussian_Actor(self.feature_dim, self.action_dim,
                                             args.hidden_dim, args.log_std_min,
                                             args.log_std_max)
        self.critic1 = Q_network(self.feature_dim, self.action_dim,
                                 args.hidden_dim)
        self.critic2 = Q_network(self.feature_dim, self.action_dim,
                                 args.hidden_dim)
        self.target_critic1 = Q_network(self.feature_dim, self.action_dim,
                                        args.hidden_dim)
        self.target_critic2 = Q_network(self.feature_dim, self.action_dim,
                                        args.hidden_dim)

        self.encoder = PixelEncoder(self.obs_dim, self.feature_dim,
                                    self.layer_num, self.filter_num)
        self.target_encoder = PixelEncoder(self.obs_dim, self.feature_dim,
                                           self.layer_num, self.filter_num)

        copy_weight(self.critic1, self.target_critic1)
        copy_weight(self.critic2, self.target_critic2)
        copy_weight(self.encoder, self.target_encoder)

        self.curl = CURL(self.feature_dim, self.curl_latent_dim)

        self.actor_optimizer = tf.keras.optimizers.Adam(args.actor_lr)
        self.critic1_optimizer = tf.keras.optimizers.Adam(args.critic_lr)
        self.critic2_optimizer = tf.keras.optimizers.Adam(args.critic_lr)

        self.encoder_optimizer = tf.keras.optimizers.Adam(args.encoder_lr)
        self.cpc_optimizer = tf.keras.optimizers.Adam(args.cpc_lr)
        self.log_alpha_optimizer = tf.keras.optimizers.Adam(args.alpha_lr,
                                                            beta_1=0.5)

        self.network_list = {
            'Actor': self.actor,
            'Critic1': self.critic1,
            'Critic2': self.critic2,
            'Target_Critic1': self.target_critic1,
            'Target_Critic2': self.target_critic2,
            'Curl': self.curl,
            'Encoder': self.encoder,
            'Target_Encoder': self.target_encoder
        }

        self.name = 'CURL_SACv2'

    @property
    def alpha(self):
        return tf.exp(self.log_alpha)

    def get_action(self, obs):

        if obs.shape[-1] != self.image_size:
            obs = center_crop_image(obs, self.image_size)

        obs = np.expand_dims(np.array(obs), axis=0)
        feature = self.encoder(obs)
        action, _ = self.actor(feature)
        action = action.numpy()[0]

        return action

    def eval_action(self, obs):

        if obs.shape[-1] != self.image_size:
            obs = center_crop_image(obs, self.image_size)

        obs = np.expand_dims(np.array(obs), axis=0)
        feature = self.encoder(obs)
        action, _ = self.actor(feature, deterministic=True)
        action = action.numpy()[0]

        return action

    def train(self, local_step):
        self.current_step += 1

        total_a_loss = 0
        total_c1_loss, total_c2_loss = 0, 0
        total_cpc_loss = 0
        total_alpha_loss = 0
        loss_list = []

        s, a, r, ns, d, cpc_kwargs = self.buffer.cpc_sample(
            self.batch_size, self.image_size)

        obs_anchor, obs_pos = cpc_kwargs["obs_anchor"], cpc_kwargs["obs_pos"]

        ns_action, ns_logpi = self.actor(self.encoder(ns))

        target_min_aq = tf.minimum(
            self.target_critic1(self.target_encoder(ns), ns_action),
            self.target_critic2(self.target_encoder(ns), ns_action))

        target_q = tf.stop_gradient(
            r + self.gamma * (1 - d) *
            (target_min_aq - self.alpha.numpy() * ns_logpi))

        with tf.GradientTape(persistent=True) as tape1:
            critic1_loss = tf.reduce_mean(
                tf.square(self.critic1(self.encoder(s), a) - target_q))
            critic2_loss = tf.reduce_mean(
                tf.square(self.critic2(self.encoder(s), a) - target_q))

        critic1_gradients = tape1.gradient(
            critic1_loss, self.encoder.trainable_variables +
            self.critic1.trainable_variables)
        self.critic1_optimizer.apply_gradients(
            zip(
                critic1_gradients, self.encoder.trainable_variables +
                self.critic1.trainable_variables))

        critic2_gradients = tape1.gradient(
            critic2_loss, self.encoder.trainable_variables +
            self.critic2.trainable_variables)
        self.critic2_optimizer.apply_gradients(
            zip(
                critic2_gradients, self.encoder.trainable_variables +
                self.critic2.trainable_variables))

        del tape1

        with tf.GradientTape() as tape2:

            s_action, s_logpi = self.actor(tf.stop_gradient(self.encoder(s)))

            min_aq_rep = tf.minimum(
                self.critic1(tf.stop_gradient(self.encoder(s)), s_action),
                self.critic2(tf.stop_gradient(self.encoder(s)), s_action))

            actor_loss = tf.reduce_mean(self.alpha.numpy() * s_logpi -
                                        min_aq_rep)

        actor_gradients = tape2.gradient(actor_loss,
                                         self.actor.trainable_variables)
        self.actor_optimizer.apply_gradients(
            zip(actor_gradients, self.actor.trainable_variables))

        del tape2

        if self.train_alpha == True:
            with tf.GradientTape() as tape3:
                _, s_logpi = self.actor(self.encoder(s))
                alpha_loss = -(tf.exp(self.log_alpha) *
                               tf.stop_gradient(s_logpi + self.target_entropy))
                alpha_loss = tf.nn.compute_average_loss(alpha_loss)
                #alpha_loss = tf.reduce_mean(alpha_loss)

            log_alpha_gradients = tape3.gradient(alpha_loss, [self.log_alpha])
            self.log_alpha_optimizer.apply_gradients(
                zip(log_alpha_gradients, [self.log_alpha]))

            del tape3

        if self.current_step % self.critic_update == 0:
            soft_update(self.critic1, self.target_critic1, self.tau)
            soft_update(self.critic2, self.target_critic2, self.tau)
            soft_update(self.encoder, self.target_encoder, self.encoder_tau)

        with tf.GradientTape(persistent=True) as tape4:
            z_a = self.encoder(obs_anchor)
            z_pos = tf.stop_gradient(self.target_encoder(obs_pos))
            logits = self.curl.compute_logits(z_a, z_pos)
            labels = tf.range(logits.shape[0], dtype='int64')

            cpc_loss = tf.reduce_mean(
                tf.nn.sparse_softmax_cross_entropy_with_logits(labels, logits))

        cpc_gradients = tape4.gradient(cpc_loss, self.curl.trainable_variables)
        self.cpc_optimizer.apply_gradients(
            zip(cpc_gradients, self.curl.trainable_variables))

        encoder_gradients = tape4.gradient(cpc_loss,
                                           self.encoder.trainable_variables)
        self.encoder_optimizer.apply_gradients(
            zip(encoder_gradients, self.encoder.trainable_variables))

        del tape4

        total_c1_loss += critic1_loss.numpy()
        total_c2_loss += critic2_loss.numpy()

        loss_list.append(['Loss/Critic1', total_c1_loss])
        loss_list.append(['Loss/Critic2', total_c2_loss])

        total_a_loss += actor_loss.numpy()
        loss_list.append(['Loss/Actor', total_a_loss])

        total_cpc_loss += cpc_loss.numpy()
        loss_list.append(['Loss/CPC', total_cpc_loss])

        if self.train_alpha == True:
            total_alpha_loss += alpha_loss.numpy()
            loss_list.append(['Loss/Alpha', total_alpha_loss])

        loss_list.append(['Alpha', tf.exp(self.log_alpha).numpy()])

        return loss_list
예제 #2
0
파일: CURL.py 프로젝트: Cerphilly/SimpleRL
class CURL_TD3:
    def __init__(self, obs_dim, action_dim, args):

        self.buffer = Buffer(args.buffer_size)

        self.obs_dim = obs_dim
        self.action_dim = action_dim
        self.image_size = obs_dim[-1]

        self.current_step = 0

        self.gamma = args.gamma

        self.batch_size = args.batch_size
        self.feature_dim = args.feature_dim
        self.curl_latent_dim = args.curl_latent_dim

        self.layer_num = args.layer_num
        self.filter_num = args.filter_num
        self.tau = args.tau
        self.encoder_tau = args.encoder_tau

        self.policy_delay = args.policy_delay
        self.actor_noise = args.actor_noise
        self.target_noise = args.target_noise
        self.noise_clip = args.noise_clip

        self.training_start = args.training_start
        self.training_step = args.training_step

        self.actor = Policy_network(self.feature_dim, self.action_dim,
                                    args.hidden_dim)
        self.target_actor = Policy_network(self.feature_dim, self.action_dim,
                                           args.hidden_dim)
        self.critic1 = Q_network(self.feature_dim, self.action_dim,
                                 args.hidden_dim)
        self.critic2 = Q_network(self.feature_dim, self.action_dim,
                                 args.hidden_dim)
        self.target_critic1 = Q_network(self.feature_dim, self.action_dim,
                                        args.hidden_dim)
        self.target_critic2 = Q_network(self.feature_dim, self.action_dim,
                                        args.hidden_dim)

        self.encoder = PixelEncoder(self.obs_dim, self.feature_dim,
                                    self.layer_num, self.filter_num)
        self.target_encoder = PixelEncoder(self.obs_dim, self.feature_dim,
                                           self.layer_num, self.filter_num)

        copy_weight(self.actor, self.target_actor)
        copy_weight(self.critic1, self.target_critic1)
        copy_weight(self.critic2, self.target_critic2)
        copy_weight(self.encoder, self.target_encoder)

        self.curl = CURL(self.feature_dim, self.curl_latent_dim)

        self.actor_optimizer = tf.keras.optimizers.Adam(args.actor_lr)
        self.critic1_optimizer = tf.keras.optimizers.Adam(args.critic_lr)
        self.critic2_optimizer = tf.keras.optimizers.Adam(args.critic_lr)

        self.encoder_optimizer = tf.keras.optimizers.Adam(args.encoder_lr)
        self.cpc_optimizer = tf.keras.optimizers.Adam(args.cpc_lr)

        self.network_list = {
            'Actor': self.actor,
            'Critic1': self.critic1,
            'Critic2': self.critic2,
            'Target_Critic1': self.target_critic1,
            'Target_Critic2': self.target_critic2,
            'Curl': self.curl,
            'Encoder': self.encoder,
            'Target_Encoder': self.target_encoder
        }

        self.name = 'CURL_TD3'

    def get_action(self, obs):

        if obs.shape[-1] != self.image_size:
            obs = center_crop_image(obs, self.image_size)

        obs = np.expand_dims(np.array(obs), axis=0)
        noise = np.random.normal(loc=0,
                                 scale=self.actor_noise,
                                 size=self.action_dim)
        feature = self.encoder(obs)
        action = self.actor(feature).numpy()[0] + noise
        action = np.clip(action, -1, 1)

        return action

    def eval_action(self, obs):

        if obs.shape[-1] != self.image_size:
            obs = center_crop_image(obs, self.image_size)

        obs = np.expand_dims(np.array(obs), axis=0)
        feature = self.encoder(obs)
        action = self.actor(feature).numpy()[0]
        action = np.clip(action, -1, 1)

        return action

    def train(self, local_step):
        self.current_step += 1

        total_a_loss = 0
        total_c1_loss, total_c2_loss = 0, 0
        total_cpc_loss = 0
        loss_list = []
        s, a, r, ns, d, cpc_kwargs = self.buffer.cpc_sample(
            self.batch_size, self.image_size)

        obs_anchor, obs_pos = cpc_kwargs["obs_anchor"], cpc_kwargs["obs_pos"]

        with tf.GradientTape(persistent=True) as tape:
            z_a = self.encoder(obs_anchor)
            z_pos = tf.stop_gradient(self.target_encoder(obs_pos))
            logits = self.curl.compute_logits(z_a, z_pos)
            labels = tf.range(logits.shape[0], dtype='int64')

            cpc_loss = tf.reduce_mean(
                tf.nn.sparse_softmax_cross_entropy_with_logits(labels, logits))

        cpc_gradients = tape.gradient(cpc_loss, self.curl.trainable_variables)
        self.cpc_optimizer.apply_gradients(
            zip(cpc_gradients, self.curl.trainable_variables))

        encoder_gradients = tape.gradient(cpc_loss,
                                          self.encoder.trainable_variables)
        self.encoder_optimizer.apply_gradients(
            zip(encoder_gradients, self.encoder.trainable_variables))

        total_cpc_loss += cpc_loss.numpy()
        loss_list.append(['Loss/CPC', total_cpc_loss])

        del tape

        if self.current_step % 2 == 0:
            target_action = tf.clip_by_value(
                self.target_actor(self.target_encoder(ns)) + tf.clip_by_value(
                    tf.random.normal(shape=self.target_actor(
                        self.target_encoder(ns)).shape,
                                     mean=0,
                                     stddev=self.target_noise),
                    -self.noise_clip, self.noise_clip), -1, 1)

            target_value = tf.stop_gradient(r + self.gamma * (
                1 - d
            ) * tf.minimum(
                self.target_critic1(self.target_encoder(ns), target_action),
                self.target_critic2(self.target_encoder(ns), target_action)))

            with tf.GradientTape(persistent=True) as tape:
                critic1_loss = 0.5 * tf.reduce_mean(
                    tf.square(target_value - self.critic1(self.encoder(s), a)))
                critic2_loss = 0.5 * tf.reduce_mean(
                    tf.square(target_value - self.critic2(self.encoder(s), a)))

            critic1_grad = tape.gradient(
                critic1_loss, self.encoder.trainable_variables +
                self.critic1.trainable_variables)
            self.critic1_optimizer.apply_gradients(
                zip(
                    critic1_grad, self.encoder.trainable_variables +
                    self.critic1.trainable_variables))

            critic2_grad = tape.gradient(
                critic2_loss, self.encoder.trainable_variables +
                self.critic2.trainable_variables)
            self.critic2_optimizer.apply_gradients(
                zip(
                    critic2_grad, self.encoder.trainable_variables +
                    self.critic2.trainable_variables))

            if self.current_step % (2 * self.policy_delay) == 0:
                with tf.GradientTape() as tape2:
                    actor_loss = -tf.reduce_mean(
                        self.critic1(
                            tf.stop_gradient(self.encoder(s)),
                            self.actor(tf.stop_gradient(self.encoder(s)))))

                actor_grad = tape2.gradient(actor_loss,
                                            self.actor.trainable_variables)
                self.actor_optimizer.apply_gradients(
                    zip(actor_grad, self.actor.trainable_variables))

                soft_update(self.actor, self.target_actor, self.tau)
                soft_update(self.critic1, self.target_critic1, self.tau)
                soft_update(self.critic2, self.target_critic2, self.tau)
                soft_update(self.encoder, self.target_encoder,
                            self.encoder_tau)

                total_a_loss += actor_loss.numpy()
                loss_list.append(['Loss/Actor', total_a_loss])

            total_c1_loss += critic1_loss.numpy()
            total_c2_loss += critic2_loss.numpy()

            loss_list.append(['Loss/Critic1', total_c1_loss])
            loss_list.append(['Loss/Critic2', total_c2_loss])

        return loss_list
예제 #3
0
파일: CURL.py 프로젝트: Cerphilly/SimpleRL
class CURL_SACv1:
    def __init__(self, obs_dim, action_dim, args):

        self.buffer = Buffer(args.buffer_size)

        self.obs_dim = obs_dim
        self.action_dim = action_dim
        self.image_size = obs_dim[-1]

        self.gamma = args.gamma
        self.alpha = args.alpha

        self.batch_size = args.batch_size
        self.feature_dim = args.feature_dim
        self.curl_latent_dim = args.curl_latent_dim

        self.layer_num = args.layer_num
        self.filter_num = args.filter_num
        self.tau = args.tau
        self.encoder_tau = args.encoder_tau

        self.training_start = args.training_start
        self.training_step = args.training_step

        self.encoder = PixelEncoder(self.obs_dim, self.feature_dim,
                                    self.layer_num, self.filter_num)
        self.target_encoder = PixelEncoder(self.obs_dim, self.feature_dim,
                                           self.layer_num, self.filter_num)

        self.actor = Squashed_Gaussian_Actor(
            self.feature_dim,
            self.action_dim,
            args.hidden_dim,
            args.log_std_min,
            args.log_std_max,
            kernel_initializer=tf.keras.initializers.orthogonal())
        self.critic1 = Q_network(
            self.feature_dim,
            self.action_dim,
            args.hidden_dim,
            kernel_initializer=tf.keras.initializers.orthogonal())
        self.critic2 = Q_network(
            self.feature_dim,
            self.action_dim,
            args.hidden_dim,
            kernel_initializer=tf.keras.initializers.orthogonal())
        self.v_network = V_network(
            self.feature_dim,
            args.hidden_dim,
            kernel_initializer=tf.keras.initializers.orthogonal())
        self.target_v_network = V_network(
            self.feature_dim,
            args.hidden_dim,
            kernel_initializer=tf.keras.initializers.orthogonal())

        self.curl = CURL(self.feature_dim, self.curl_latent_dim)

        copy_weight(self.v_network, self.target_v_network)
        copy_weight(self.encoder, self.target_encoder)

        self.actor_optimizer = tf.keras.optimizers.Adam(args.actor_lr)
        self.critic1_optimizer = tf.keras.optimizers.Adam(args.critic_lr)
        self.critic2_optimizer = tf.keras.optimizers.Adam(args.critic_lr)
        self.v_network_optimizer = tf.keras.optimizers.Adam(args.v_lr)

        self.encoder_optimizer = tf.keras.optimizers.Adam(args.encoder_lr)
        self.cpc_optimizer = tf.keras.optimizers.Adam(args.cpc_lr)

        self.current_step = 0

        self.network_list = {
            'Actor': self.actor,
            'Critic1': self.critic1,
            'Critic2': self.critic2,
            'V_network': self.v_network,
            'Target_V_network': self.target_v_network,
            'Curl': self.curl,
            'Encoder': self.encoder,
            'Target_Encoder': self.target_encoder
        }

        self.name = 'CURL_SACv1'

    def get_action(self, obs):

        if obs.shape[-1] != self.image_size:
            obs = center_crop_image(obs, self.image_size)

        obs = np.expand_dims(np.array(obs), axis=0)
        feature = self.encoder(obs)
        action, _ = self.actor(feature)
        action = action.numpy()[0]

        return action

    def eval_action(self, obs):

        if obs.shape[-1] != self.image_size:
            obs = center_crop_image(obs, self.image_size)

        obs = np.expand_dims(np.array(obs), axis=0)
        feature = self.encoder(obs)
        action, _ = self.actor(feature, deterministic=True)
        action = action.numpy()[0]

        return action

    def train(self, training_step):
        total_a_loss = 0
        total_c1_loss, total_c2_loss = 0, 0
        total_v_loss = 0
        total_cpc_loss = 0
        loss_list = []

        self.current_step += 1

        s, a, r, ns, d, cpc_kwargs = self.buffer.cpc_sample(
            self.batch_size, self.image_size)

        obs_anchor, obs_pos = cpc_kwargs["obs_anchor"], cpc_kwargs["obs_pos"]

        s_action, s_logpi = self.actor(self.encoder(s))

        min_aq = tf.minimum(self.critic1(self.encoder(s), s_action),
                            self.critic2(self.encoder(s), s_action))
        target_v = tf.stop_gradient(min_aq - self.alpha * s_logpi)

        with tf.GradientTape() as tape1:
            v_loss = 0.5 * tf.reduce_mean(
                tf.square(
                    self.v_network(tf.stop_gradient(self.encoder(s))) -
                    target_v))

        v_gradients = tape1.gradient(v_loss,
                                     self.v_network.trainable_variables)
        self.v_network_optimizer.apply_gradients(
            zip(v_gradients, self.v_network.trainable_variables))

        del tape1

        target_q = tf.stop_gradient(
            r + self.gamma *
            (1 - d) * self.target_v_network(self.target_encoder(ns)))

        with tf.GradientTape(persistent=True) as tape2:
            critic1_loss = 0.5 * tf.reduce_mean(
                tf.square(self.critic1(self.encoder(s), a) - target_q))
            critic2_loss = 0.5 * tf.reduce_mean(
                tf.square(self.critic2(self.encoder(s), a) - target_q))

        critic1_gradients = tape2.gradient(
            critic1_loss, self.encoder.trainable_variables +
            self.critic1.trainable_variables)

        critic2_gradients = tape2.gradient(
            critic2_loss, self.encoder.trainable_variables +
            self.critic2.trainable_variables)

        self.critic1_optimizer.apply_gradients(
            zip(
                critic1_gradients, self.encoder.trainable_variables +
                self.critic1.trainable_variables))

        self.critic2_optimizer.apply_gradients(
            zip(
                critic2_gradients, self.encoder.trainable_variables +
                self.critic2.trainable_variables))

        del tape2

        with tf.GradientTape() as tape3:
            s_action, s_logpi = self.actor(tf.stop_gradient(self.encoder(s)))

            min_aq_rep = tf.minimum(
                self.critic1(tf.stop_gradient(self.encoder(s)), s_action),
                self.critic2(tf.stop_gradient(self.encoder(s)), s_action))

            actor_loss = tf.reduce_mean(self.alpha * s_logpi - min_aq_rep)

        actor_gradients = tape3.gradient(actor_loss,
                                         self.actor.trainable_variables)
        self.actor_optimizer.apply_gradients(
            zip(actor_gradients, self.actor.trainable_variables))

        soft_update(self.v_network, self.target_v_network, self.tau)

        with tf.GradientTape(persistent=True) as tape4:
            z_a = self.encoder(obs_anchor)
            z_pos = tf.stop_gradient(self.target_encoder(obs_pos))

            logits = self.curl.compute_logits(z_a, z_pos)
            labels = tf.range(logits.shape[0], dtype='int64')

            cpc_loss = tf.reduce_mean(
                tf.nn.sparse_softmax_cross_entropy_with_logits(labels, logits))

        cpc_gradients = tape4.gradient(cpc_loss, self.curl.trainable_variables)
        self.cpc_optimizer.apply_gradients(
            (zip(cpc_gradients, self.curl.trainable_variables)))

        encoder_gradients = tape4.gradient(cpc_loss,
                                           self.encoder.trainable_variables)
        self.encoder_optimizer.apply_gradients(
            zip(encoder_gradients, self.encoder.trainable_variables))

        soft_update(self.encoder, self.target_encoder, self.encoder_tau)

        del tape4

        total_v_loss += v_loss.numpy()
        loss_list.append(['Loss/V', total_v_loss])

        total_c1_loss += critic1_loss.numpy()
        total_c2_loss += critic2_loss.numpy()

        loss_list.append(['Loss/Critic1', total_c1_loss])
        loss_list.append(['Loss/Critic2', total_c2_loss])

        total_a_loss += actor_loss.numpy()
        loss_list.append(['Loss/Actor', total_a_loss])

        total_cpc_loss += cpc_loss.numpy()
        loss_list.append(['Loss/CPC', total_cpc_loss])

        return loss_list