Example #1
class DBC_TD3:
    def __init__(self,
                 obs_dim,
                 action_dim,
                 hidden_dim=512,
                 gamma=0.99,
                 learning_rate=0.001,
                 batch_size=512,
                 policy_delay=2,
                 actor_noise=0.1,
                 target_noise=0.2,
                 noise_clip=0.5,
                 buffer_size=1e6,
                 feature_dim=50,
                 layer_num=4,
                 filter_num=32,
                 tau=0.005,
                 encoder_tau=0.005,
                 bisim_coef=0.5,
                 training_start=1000):

        self.buffer = Buffer(buffer_size)

        self.obs_dim = obs_dim
        self.action_dim = action_dim

        self.hidden_dim = hidden_dim
        self.gamma = gamma
        self.learning_rate = learning_rate

        self.batch_size = batch_size
        self.feature_dim = feature_dim

        self.layer_num = layer_num
        self.filter_num = filter_num
        self.tau = tau
        self.encoder_tau = encoder_tau
        self.bisim_coef = bisim_coef

        self.policy_delay = policy_delay
        self.actor_noise = actor_noise
        self.target_noise = target_noise
        self.noise_clip = noise_clip

        self.training_start = training_start

        self.actor = Policy_network(feature_dim, action_dim,
                                    (hidden_dim, hidden_dim))
        self.target_actor = Policy_network(feature_dim, action_dim,
                                           (hidden_dim, hidden_dim))
        self.critic1 = Q_network(feature_dim, action_dim,
                                 (hidden_dim, hidden_dim))
        self.critic2 = Q_network(feature_dim, action_dim,
                                 (hidden_dim, hidden_dim))
        self.target_critic1 = Q_network(feature_dim, action_dim,
                                        (hidden_dim, hidden_dim))
        self.target_critic2 = Q_network(feature_dim, action_dim,
                                        (hidden_dim, hidden_dim))

        self.encoder = PixelEncoder(self.obs_dim, feature_dim, layer_num,
                                    filter_num)
        self.target_encoder = PixelEncoder(self.obs_dim, feature_dim,
                                           layer_num, filter_num)

        self.dynamics_model = Transition_Network(feature_dim, action_dim)
        self.reward_model = Reward_Network(feature_dim)

        copy_weight(self.actor, self.target_actor)
        copy_weight(self.critic1, self.target_critic1)
        copy_weight(self.critic2, self.target_critic2)
        copy_weight(self.encoder, self.target_encoder)

        self.actor_optimizer = tf.keras.optimizers.Adam(learning_rate)
        self.critic1_optimizer = tf.keras.optimizers.Adam(learning_rate)
        self.critic2_optimizer = tf.keras.optimizers.Adam(learning_rate)

        self.encoder_optimizer = tf.keras.optimizers.Adam(learning_rate)
        self.dynamics_optimizer = tf.keras.optimizers.Adam(learning_rate)
        self.reward_optimizer = tf.keras.optimizers.Adam(learning_rate)

        self.name = 'DBC_TD3'

    def get_action(self, obs):
        obs = np.expand_dims(np.array(obs), axis=0)
        feature = self.encoder(obs)
        action = self.actor(feature).numpy()[0]

        return action

    def train(self, local_step):
        # update order: critic -> transition -> reward -> encoder -> actor
        set1, set2 = self.buffer.dbc_sample(self.batch_size)

        s, a, r, ns, d = set1
        s2, a2, r2, ns2, d2 = set2

        # TD3 target policy smoothing: perturb the target action with clipped
        # Gaussian noise, then clip to the valid action range
        target_feature = self.target_encoder(ns)
        target_action = self.target_actor(target_feature)
        noise = tf.clip_by_value(
            tf.random.normal(shape=target_action.shape,
                             mean=0,
                             stddev=self.target_noise), -self.noise_clip,
            self.noise_clip)
        target_action = tf.clip_by_value(target_action + noise, -1, 1)

        # clipped double-Q target; (1 - d) zeroes the bootstrap at terminals
        target_value = tf.stop_gradient(r + self.gamma * (1 - d) * tf.minimum(
            self.target_critic1(target_feature, target_action),
            self.target_critic2(target_feature, target_action)))

        with tf.GradientTape(persistent=True) as tape1:
            critic1_loss = 0.5 * tf.reduce_mean(
                tf.square(target_value - self.critic1(self.encoder(s), a)))
            critic2_loss = 0.5 * tf.reduce_mean(
                tf.square(target_value - self.critic2(self.encoder(s), a)))

        critic1_grad = tape1.gradient(
            critic1_loss, self.encoder.trainable_variables +
            self.critic1.trainable_variables)
        self.critic1_optimizer.apply_gradients(
            zip(
                critic1_grad, self.encoder.trainable_variables +
                self.critic1.trainable_variables))

        critic2_grad = tape1.gradient(
            critic2_loss, self.encoder.trainable_variables +
            self.critic2.trainable_variables)
        self.critic2_optimizer.apply_gradients(
            zip(
                critic2_grad, self.encoder.trainable_variables +
                self.critic2.trainable_variables))

        del tape1

        #train dynamics
        with tf.GradientTape() as tape2:
            feature = self.encoder(s)
            next_feature = self.encoder(ns)
            mu, sigma = self.dynamics_model(tf.concat([feature, a], axis=1))

            # a deterministic transition model outputs sigma = 0; fall back to
            # unit variance before computing the Gaussian negative log-likelihood
            if sigma[0][0].numpy() == 0:
                sigma = tf.ones_like(mu)
            diff = (mu - tf.stop_gradient(next_feature)) / sigma
            dynamics_loss = tf.reduce_mean(0.5 * tf.square(diff) +
                                           tf.math.log(sigma))

        dynamics_gradients = tape2.gradient(
            dynamics_loss, self.encoder.trainable_variables +
            self.dynamics_model.trainable_variables)
        self.dynamics_optimizer.apply_gradients(
            zip(
                dynamics_gradients, self.encoder.trainable_variables +
                self.dynamics_model.trainable_variables))

        #dynamics_gradients = tape2.gradient(dynamics_loss, self.dynamics_model.trainable_variables)
        #self.dynamics_optimizer.apply_gradients(zip(dynamics_gradients, self.dynamics_model.trainable_variables))

        del tape2

        #train reward
        with tf.GradientTape() as tape3:
            feature = self.encoder(s)
            sample_dynamics = self.dynamics_model.sample(
                tf.concat([feature, a], axis=1))
            reward_prediction = self.reward_model(sample_dynamics)

            reward_loss = tf.reduce_mean(tf.square(reward_prediction - r))

        reward_gradients = tape3.gradient(
            reward_loss, self.encoder.trainable_variables +
            self.reward_model.trainable_variables)
        self.reward_optimizer.apply_gradients(
            zip(
                reward_gradients, self.encoder.trainable_variables +
                self.reward_model.trainable_variables))

        #reward_gradients = tape3.gradient(reward_loss, self.reward_model.trainable_variables)
        #self.reward_optimizer.apply_gradients(zip(reward_gradients, self.reward_model.trainable_variables))

        del tape3

        #train encoder
        with tf.GradientTape() as tape4:
            feature1 = self.encoder(s)
            feature2 = self.encoder(s2)

            mu1, sigma1 = self.dynamics_model(tf.concat([feature1, a], axis=1))
            mu2, sigma2 = self.dynamics_model(tf.concat([feature2, a2],
                                                        axis=1))

            z_dist = tf.abs(feature1 - feature2)
            r_dist = tf.abs(r - r2)

            transition_dist = tf.sqrt(
                tf.square(tf.abs(mu1 - mu2)) +
                tf.square(tf.abs(sigma1 - sigma2)))
            bisimilarity = tf.stop_gradient(
                tf.cast(r_dist, tf.float32) +
                self.gamma * tf.cast(transition_dist, tf.float32))
            encoder_loss = self.bisim_coef * tf.reduce_mean(
                tf.square(z_dist - bisimilarity))

        encoder_gradients = tape4.gradient(encoder_loss,
                                           self.encoder.trainable_variables)
        self.encoder_optimizer.apply_gradients(
            zip(encoder_gradients, self.encoder.trainable_variables))

        del tape4

        # delayed policy update and target network soft updates (TD3)
        if local_step % self.policy_delay == 0:
            with tf.GradientTape() as tape5:
                actor_loss = -tf.reduce_mean(
                    self.critic1(tf.stop_gradient(self.encoder(s)),
                                 self.actor(tf.stop_gradient(
                                     self.encoder(s)))))

            actor_grad = tape5.gradient(actor_loss,
                                        self.actor.trainable_variables)
            self.actor_optimizer.apply_gradients(
                zip(actor_grad, self.actor.trainable_variables))

            del tape5

            soft_update(self.actor, self.target_actor, self.tau)
            soft_update(self.critic1, self.target_critic1, self.tau)
            soft_update(self.critic2, self.target_critic2, self.tau)
            soft_update(self.encoder, self.target_encoder, self.encoder_tau)
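
For orientation, below is a minimal interaction-loop sketch showing how an agent like DBC_TD3 could be driven. It assumes a Gym-style pixel environment; make_pixel_env and agent.buffer.add(...) are hypothetical placeholders, since the project's environment wrapper and the Buffer storage interface are not shown in this listing.

import numpy as np

env = make_pixel_env("cartpole_swingup")            # hypothetical env factory
agent = DBC_TD3(obs_dim=env.observation_space.shape,
                action_dim=env.action_space.shape[0])

obs = env.reset()
for step in range(100000):
    if step < agent.training_start:
        action = env.action_space.sample()          # random warm-up actions
    else:
        # TD3-style exploration: Gaussian noise on the deterministic policy
        action = agent.get_action(obs)
        action = np.clip(
            action + agent.actor_noise * np.random.randn(*action.shape), -1, 1)

    next_obs, reward, done, _ = env.step(action)
    agent.buffer.add(obs, action, reward, next_obs, done)   # hypothetical API
    obs = env.reset() if done else next_obs

    if step >= agent.training_start:
        agent.train(local_step=step)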
Example #2
File: DBC.py  Project: Cerphilly/SimpleRL
class DBC_SACv2:
    def __init__(self,
                 obs_dim,
                 action_dim,
                 hidden_dim=256,
                 gamma=0.99,
                 learning_rate=1e-5,
                 batch_size=128,
                 buffer_size=1e6,
                 feature_dim=50,
                 layer_num=4,
                 filter_num=32,
                 tau=0.005,
                 encoder_tau=0.005,
                 bisim_coef=0.5,
                 training_start=1000,
                 train_alpha=True,
                 alpha=0.1):

        self.buffer = Buffer(buffer_size)

        self.obs_dim = obs_dim
        self.action_dim = action_dim

        self.log_alpha = tf.Variable(initial_value=tf.math.log(alpha),
                                     trainable=True)
        self.target_entropy = -action_dim
        self.hidden_dim = hidden_dim
        self.gamma = gamma
        self.learning_rate = learning_rate
        self.bisim_coef = bisim_coef

        self.batch_size = batch_size
        self.feature_dim = feature_dim

        self.layer_num = layer_num
        self.filter_num = filter_num
        self.tau = tau
        self.encoder_tau = encoder_tau

        self.training_start = training_start
        self.train_alpha = train_alpha

        self.actor = Squashed_Gaussian_Actor(feature_dim, action_dim,
                                             (hidden_dim, hidden_dim))
        self.critic1 = Q_network(feature_dim, action_dim,
                                 (hidden_dim, hidden_dim))
        self.critic2 = Q_network(feature_dim, action_dim,
                                 (hidden_dim, hidden_dim))
        self.target_critic1 = Q_network(feature_dim, action_dim,
                                        (hidden_dim, hidden_dim))
        self.target_critic2 = Q_network(feature_dim, action_dim,
                                        (hidden_dim, hidden_dim))

        self.encoder = PixelEncoder(self.obs_dim, feature_dim, layer_num,
                                    filter_num)
        self.target_encoder = PixelEncoder(self.obs_dim, feature_dim,
                                           layer_num, filter_num)

        self.dynamics_model = Transition_Network(feature_dim,
                                                 action_dim,
                                                 deterministic=False)
        self.reward_model = Reward_Network(feature_dim)

        copy_weight(self.critic1, self.target_critic1)
        copy_weight(self.critic2, self.target_critic2)
        copy_weight(self.encoder, self.target_encoder)

        self.actor_optimizer = tf.keras.optimizers.Adam(learning_rate)
        self.critic1_optimizer = tf.keras.optimizers.Adam(learning_rate)
        self.critic2_optimizer = tf.keras.optimizers.Adam(learning_rate)

        self.encoder_optimizer = tf.keras.optimizers.Adam(learning_rate)
        self.log_alpha_optimizer = tf.keras.optimizers.Adam(10 * learning_rate)

        self.dynamics_optimizer = tf.keras.optimizers.Adam(learning_rate)
        self.reward_optimizer = tf.keras.optimizers.Adam(learning_rate)

        self.name = 'DBC_SACv2'

    @property
    def alpha(self):
        return tf.exp(self.log_alpha)

    def get_action(self, obs):
        obs = np.expand_dims(np.array(obs), axis=0)
        feature = self.encoder(obs)
        action = self.actor(feature).numpy()[0]

        return action

    def train(self, local_step):
        set1, set2 = self.buffer.dbc_sample(self.batch_size)

        s, a, r, ns, d = set1
        s2, a2, r2, ns2, d2 = set2

        target_min_aq = tf.minimum(
            self.target_critic1(self.target_encoder(ns),
                                self.actor(self.encoder(ns))),
            self.target_critic2(self.target_encoder(ns),
                                self.actor(self.encoder(ns))))

        target_q = tf.stop_gradient(r + self.gamma * (1 - d) *
                                    (target_min_aq - self.alpha.numpy() *
                                     self.actor.log_pi(self.encoder(ns))))

        with tf.GradientTape(persistent=True) as tape1:
            critic1_loss = tf.reduce_mean(
                tf.square(self.critic1(self.encoder(s), a) - target_q))
            critic2_loss = tf.reduce_mean(
                tf.square(self.critic2(self.encoder(s), a) - target_q))

        critic1_gradients = tape1.gradient(
            critic1_loss, self.encoder.trainable_variables +
            self.critic1.trainable_variables)
        self.critic1_optimizer.apply_gradients(
            zip(
                critic1_gradients, self.encoder.trainable_variables +
                self.critic1.trainable_variables))

        critic2_gradients = tape1.gradient(
            critic2_loss, self.encoder.trainable_variables +
            self.critic2.trainable_variables)
        self.critic2_optimizer.apply_gradients(
            zip(
                critic2_gradients, self.encoder.trainable_variables +
                self.critic2.trainable_variables))

        del tape1

        #train dynamics(encoder used together)
        next_feature = self.encoder(ns)
        with tf.GradientTape() as tape2:
            feature = self.encoder(s)

            mu, sigma = self.dynamics_model(tf.concat([feature, a], axis=1))

            # a stochastic transition model should never output sigma = 0;
            # warn, then fall back to unit variance so the loss stays finite
            if sigma[0][0].numpy() == 0:
                if not self.dynamics_model.deterministic:
                    print("error: stochastic dynamics model returned sigma = 0")
                sigma = tf.ones_like(mu)

            diff = (mu - tf.stop_gradient(next_feature)) / sigma
            dynamics_loss = tf.reduce_mean(0.5 * tf.square(diff) +
                                           tf.math.log(sigma))

        dynamics_gradients = tape2.gradient(
            dynamics_loss, self.encoder.trainable_variables +
            self.dynamics_model.trainable_variables)
        self.dynamics_optimizer.apply_gradients(
            zip(
                dynamics_gradients, self.encoder.trainable_variables +
                self.dynamics_model.trainable_variables))

        del tape2

        #train rewards(encoder used together)
        with tf.GradientTape() as tape3:
            feature = self.encoder(s)
            sample_dynamics = self.dynamics_model.sample(
                tf.concat([feature, a], axis=1))
            reward_prediction = self.reward_model(sample_dynamics)

            reward_loss = tf.reduce_mean(tf.square(reward_prediction - r))

        reward_gradients = tape3.gradient(
            reward_loss, self.encoder.trainable_variables +
            self.reward_model.trainable_variables)
        self.reward_optimizer.apply_gradients(
            zip(
                reward_gradients, self.encoder.trainable_variables +
                self.reward_model.trainable_variables))

        del tape3

        # train encoder
        with tf.GradientTape() as tape4:
            feature1 = self.encoder(s)
            feature2 = self.encoder(s2)

            mu1, sigma1 = self.dynamics_model(tf.concat([feature1, a], axis=1))
            mu2, sigma2 = self.dynamics_model(tf.concat([feature2, a2],
                                                        axis=1))

            z_dist = tf.abs(feature1 - feature2)
            r_dist = tf.abs(r - r2)

            transition_dist = tf.sqrt(
                tf.square(tf.abs(mu1 - mu2)) +
                tf.square(tf.abs(sigma1 - sigma2)))
            bisimilarity = tf.stop_gradient(
                tf.cast(r_dist, tf.float32) +
                self.gamma * tf.cast(transition_dist, tf.float32))
            encoder_loss = self.bisim_coef * tf.reduce_mean(
                tf.square(z_dist - bisimilarity))

        encoder_gradients = tape4.gradient(encoder_loss,
                                           self.encoder.trainable_variables)
        self.encoder_optimizer.apply_gradients(
            zip(encoder_gradients, self.encoder.trainable_variables))

        del tape4

        # delayed actor, alpha, and target updates (every other training step)
        if local_step % 2 == 0:
            with tf.GradientTape() as tape5:
                mu, sigma = self.actor.mu_sigma(
                    tf.stop_gradient(self.encoder(s)))
                output = mu + tf.random.normal(shape=mu.shape) * sigma

                min_aq_rep = tf.minimum(
                    self.critic1(tf.stop_gradient(self.encoder(s)), output),
                    self.critic2(tf.stop_gradient(self.encoder(s)), output))

                actor_loss = tf.reduce_mean(
                    self.alpha.numpy() *
                    self.actor.log_pi(tf.stop_gradient(self.encoder(s))) -
                    min_aq_rep)

            actor_gradients = tape5.gradient(actor_loss,
                                             self.actor.trainable_variables)
            self.actor_optimizer.apply_gradients(
                zip(actor_gradients, self.actor.trainable_variables))

            del tape5

            if self.train_alpha:
                with tf.GradientTape() as tape6:
                    alpha_loss = -(tf.exp(self.log_alpha) * tf.stop_gradient(
                        self.actor.log_pi(self.encoder(s)) +
                        self.target_entropy))
                    alpha_loss = tf.nn.compute_average_loss(alpha_loss)

                log_alpha_gradients = tape6.gradient(alpha_loss,
                                                     [self.log_alpha])
                self.log_alpha_optimizer.apply_gradients(
                    zip(log_alpha_gradients, [self.log_alpha]))

                del tape6

            soft_update(self.critic1, self.target_critic1, self.tau)
            soft_update(self.critic2, self.target_critic2, self.tau)
            soft_update(self.encoder, self.target_encoder, self.encoder_tau)
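
In both agents, the encoder update regresses the L1 distance between two latent codes toward an approximate bisimulation metric: the absolute reward difference plus gamma times the 2-Wasserstein distance between the predicted Gaussian next-latent distributions, which for diagonal Gaussians reduces to sqrt((mu1 - mu2)^2 + (sigma1 - sigma2)^2). A standalone sketch of that loss, written against plain tensors rather than the project's classes (the function name and argument layout are illustrative, not part of SimpleRL):

import tensorflow as tf

def bisimulation_encoder_loss(z1, z2, r1, r2, mu1, sigma1, mu2, sigma2,
                              gamma=0.99, bisim_coef=0.5):
    # distance in latent space that the encoder can shape
    z_dist = tf.abs(z1 - z2)
    # reward term of the bisimulation metric
    r_dist = tf.abs(r1 - r2)
    # W2 distance between the diagonal Gaussian next-latent predictions
    w2_dist = tf.sqrt(tf.square(mu1 - mu2) + tf.square(sigma1 - sigma2))
    # the target metric is treated as a constant so only the encoder is trained
    target = tf.stop_gradient(tf.cast(r_dist, tf.float32) +
                              gamma * tf.cast(w2_dist, tf.float32))
    return bisim_coef * tf.reduce_mean(tf.square(z_dist - target))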