Example 1
    def __init__(self, obs_dim, action_dim, args):

        self.buffer = Buffer(args.buffer_size)

        self.obs_dim = obs_dim
        self.action_dim = action_dim

        self.log_alpha = tf.Variable(initial_value=tf.math.log(args.alpha),
                                     trainable=True)
        self.target_entropy = -action_dim
        self.gamma = args.gamma

        self.batch_size = args.batch_size
        self.feature_dim = args.feature_dim

        self.layer_num = args.layer_num
        self.filter_num = args.filter_num
        self.tau = args.tau
        self.encoder_tau = args.encoder_tau

        self.actor_update = args.actor_update
        self.critic_update = args.critic_update

        self.training_start = args.training_start
        self.training_step = args.training_step
        self.train_alpha = args.train_alpha

        self.actor = Squashed_Gaussian_Actor(self.feature_dim, self.action_dim,
                                             args.hidden_dim, args.log_std_min,
                                             args.log_std_max)
        self.critic1 = Q_network(self.feature_dim, self.action_dim,
                                 args.hidden_dim)
        self.critic2 = Q_network(self.feature_dim, self.action_dim,
                                 args.hidden_dim)
        self.target_critic1 = Q_network(self.feature_dim, self.action_dim,
                                        args.hidden_dim)
        self.target_critic2 = Q_network(self.feature_dim, self.action_dim,
                                        args.hidden_dim)

        self.encoder = PixelEncoder(self.obs_dim, self.feature_dim,
                                    self.layer_num, self.filter_num)
        self.target_encoder = PixelEncoder(self.obs_dim, self.feature_dim,
                                           self.layer_num, self.filter_num)

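        # Latent dynamics and reward models used by the bisimulation (DBC) objective.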
        self.dynamics_model = Transition_Network(self.feature_dim,
                                                 action_dim,
                                                 deterministic=False)
        self.reward_model = Reward_Network(self.feature_dim)

        copy_weight(self.critic1, self.target_critic1)
        copy_weight(self.critic2, self.target_critic2)
        copy_weight(self.encoder, self.target_encoder)

        self.actor_optimizer = tf.keras.optimizers.Adam(args.actor_lr)
        self.critic1_optimizer = tf.keras.optimizers.Adam(args.critic_lr)
        self.critic2_optimizer = tf.keras.optimizers.Adam(args.critic_lr)

        self.encoder_optimizer = tf.keras.optimizers.Adam(args.encoder_lr)
        self.log_alpha_optimizer = tf.keras.optimizers.Adam(args.alpha_lr)

        self.dynamics_optimizer = tf.keras.optimizers.Adam(args.decoder_lr)
        self.reward_optimizer = tf.keras.optimizers.Adam(args.decoder_lr)

        self.current_step = 0

        self.network_list = {
            'Actor': self.actor,
            'Critic1': self.critic1,
            'Critic2': self.critic2,
            'Target_Critic1': self.target_critic1,
            'Target_Critic2': self.target_critic2,
            'Encoder': self.encoder,
            'Target_Encoder': self.target_encoder,
            'Dynamics': self.dynamics_model,
            'Reward': self.reward_model
        }

        self.name = 'DBC_SACv2'
Example 2
    def __init__(self, obs_dim, action_dim, args):

        self.buffer = Buffer(args.buffer_size)

        self.obs_dim = obs_dim
        self.action_dim = action_dim
        self.image_size = obs_dim[-1]

        self.gamma = args.gamma
        self.alpha = args.alpha

        self.batch_size = args.batch_size
        self.feature_dim = args.feature_dim
        self.curl_latent_dim = args.curl_latent_dim

        self.layer_num = args.layer_num
        self.filter_num = args.filter_num
        self.tau = args.tau
        self.encoder_tau = args.encoder_tau

        self.training_start = args.training_start
        self.training_step = args.training_step

        self.encoder = PixelEncoder(self.obs_dim, self.feature_dim,
                                    self.layer_num, self.filter_num)
        self.target_encoder = PixelEncoder(self.obs_dim, self.feature_dim,
                                           self.layer_num, self.filter_num)

        self.actor = Squashed_Gaussian_Actor(
            self.feature_dim,
            self.action_dim,
            args.hidden_dim,
            args.log_std_min,
            args.log_std_max,
            kernel_initializer=tf.keras.initializers.orthogonal())
        self.critic1 = Q_network(
            self.feature_dim,
            self.action_dim,
            args.hidden_dim,
            kernel_initializer=tf.keras.initializers.orthogonal())
        self.critic2 = Q_network(
            self.feature_dim,
            self.action_dim,
            args.hidden_dim,
            kernel_initializer=tf.keras.initializers.orthogonal())
        self.v_network = V_network(
            self.feature_dim,
            args.hidden_dim,
            kernel_initializer=tf.keras.initializers.orthogonal())
        self.target_v_network = V_network(
            self.feature_dim,
            args.hidden_dim,
            kernel_initializer=tf.keras.initializers.orthogonal())

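        # Contrastive module for the CURL auxiliary representation-learning task.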
        self.curl = CURL(self.feature_dim, self.curl_latent_dim)

        copy_weight(self.v_network, self.target_v_network)
        copy_weight(self.encoder, self.target_encoder)

        self.actor_optimizer = tf.keras.optimizers.Adam(args.actor_lr)
        self.critic1_optimizer = tf.keras.optimizers.Adam(args.critic_lr)
        self.critic2_optimizer = tf.keras.optimizers.Adam(args.critic_lr)
        self.v_network_optimizer = tf.keras.optimizers.Adam(args.v_lr)

        self.encoder_optimizer = tf.keras.optimizers.Adam(args.encoder_lr)
        self.cpc_optimizer = tf.keras.optimizers.Adam(args.cpc_lr)

        self.current_step = 0

        self.network_list = {
            'Actor': self.actor,
            'Critic1': self.critic1,
            'Critic2': self.critic2,
            'V_network': self.v_network,
            'Target_V_network': self.target_v_network,
            'Curl': self.curl,
            'Encoder': self.encoder,
            'Target_Encoder': self.target_encoder
        }

        self.name = 'CURL_SACv1'
Example 3
    def __init__(self,
                 obs_dim,
                 action_dim,
                 hidden_dim=256,
                 gamma=0.99,
                 learning_rate=1e-5,
                 batch_size=128,
                 buffer_size=1e6,
                 feature_dim=50,
                 layer_num=4,
                 filter_num=32,
                 tau=0.005,
                 encoder_tau=0.005,
                 bisim_coef=0.5,
                 training_start=1000,
                 train_alpha=True,
                 alpha=0.1):

        self.buffer = Buffer(buffer_size)

        self.obs_dim = obs_dim
        self.action_dim = action_dim

        self.log_alpha = tf.Variable(initial_value=tf.math.log(alpha),
                                     trainable=True)
        self.target_entropy = -action_dim
        self.hidden_dim = hidden_dim
        self.gamma = gamma
        self.learning_rate = learning_rate
        self.bisim_coef = bisim_coef

        self.batch_size = batch_size
        self.feature_dim = feature_dim

        self.layer_num = layer_num
        self.filter_num = filter_num
        self.tau = tau
        self.encoder_tau = encoder_tau

        self.training_start = training_start
        self.train_alpha = train_alpha

        self.actor = Squashed_Gaussian_Actor(feature_dim, action_dim,
                                             (hidden_dim, hidden_dim))
        self.critic1 = Q_network(feature_dim, action_dim,
                                 (hidden_dim, hidden_dim))
        self.critic2 = Q_network(feature_dim, action_dim,
                                 (hidden_dim, hidden_dim))
        self.target_critic1 = Q_network(feature_dim, action_dim,
                                        (hidden_dim, hidden_dim))
        self.target_critic2 = Q_network(feature_dim, action_dim,
                                        (hidden_dim, hidden_dim))

        self.encoder = PixelEncoder(self.obs_dim, feature_dim, layer_num,
                                    filter_num)
        self.target_encoder = PixelEncoder(self.obs_dim, feature_dim,
                                           layer_num, filter_num)

        self.dynamics_model = Transition_Network(feature_dim,
                                                 action_dim,
                                                 deterministic=False)
        self.reward_model = Reward_Network(feature_dim)

        copy_weight(self.critic1, self.target_critic1)
        copy_weight(self.critic2, self.target_critic2)
        copy_weight(self.encoder, self.target_encoder)

        self.actor_optimizer = tf.keras.optimizers.Adam(learning_rate)
        self.critic1_optimizer = tf.keras.optimizers.Adam(learning_rate)
        self.critic2_optimizer = tf.keras.optimizers.Adam(learning_rate)

        self.encoder_optimizer = tf.keras.optimizers.Adam(learning_rate)
        self.log_alpha_optimizer = tf.keras.optimizers.Adam(10 * learning_rate)

        self.dynamics_optimizer = tf.keras.optimizers.Adam(learning_rate)
        self.reward_optimizer = tf.keras.optimizers.Adam(learning_rate)

        self.name = 'DBC_SACv2'
Example 4
class DBC_SACv2:
    def __init__(self,
                 obs_dim,
                 action_dim,
                 hidden_dim=256,
                 gamma=0.99,
                 learning_rate=1e-5,
                 batch_size=128,
                 buffer_size=1e6,
                 feature_dim=50,
                 layer_num=4,
                 filter_num=32,
                 tau=0.005,
                 encoder_tau=0.005,
                 bisim_coef=0.5,
                 training_start=1000,
                 train_alpha=True,
                 alpha=0.1):

        self.buffer = Buffer(buffer_size)

        self.obs_dim = obs_dim
        self.action_dim = action_dim

        self.log_alpha = tf.Variable(initial_value=tf.math.log(alpha),
                                     trainable=True)
        self.target_entropy = -action_dim
        self.hidden_dim = hidden_dim
        self.gamma = gamma
        self.learning_rate = learning_rate
        self.bisim_coef = bisim_coef

        self.batch_size = batch_size
        self.feature_dim = feature_dim

        self.layer_num = layer_num
        self.filter_num = filter_num
        self.tau = tau
        self.encoder_tau = encoder_tau

        self.training_start = training_start
        self.train_alpha = train_alpha

        self.actor = Squashed_Gaussian_Actor(feature_dim, action_dim,
                                             (hidden_dim, hidden_dim))
        self.critic1 = Q_network(feature_dim, action_dim,
                                 (hidden_dim, hidden_dim))
        self.critic2 = Q_network(feature_dim, action_dim,
                                 (hidden_dim, hidden_dim))
        self.target_critic1 = Q_network(feature_dim, action_dim,
                                        (hidden_dim, hidden_dim))
        self.target_critic2 = Q_network(feature_dim, action_dim,
                                        (hidden_dim, hidden_dim))

        self.encoder = PixelEncoder(self.obs_dim, feature_dim, layer_num,
                                    filter_num)
        self.target_encoder = PixelEncoder(self.obs_dim, feature_dim,
                                           layer_num, filter_num)

        self.dynamics_model = Transition_Network(feature_dim,
                                                 action_dim,
                                                 deterministic=False)
        self.reward_model = Reward_Network(feature_dim)

        copy_weight(self.critic1, self.target_critic1)
        copy_weight(self.critic2, self.target_critic2)
        copy_weight(self.encoder, self.target_encoder)

        self.actor_optimizer = tf.keras.optimizers.Adam(learning_rate)
        self.critic1_optimizer = tf.keras.optimizers.Adam(learning_rate)
        self.critic2_optimizer = tf.keras.optimizers.Adam(learning_rate)

        self.encoder_optimizer = tf.keras.optimizers.Adam(learning_rate)
        self.log_alpha_optimizer = tf.keras.optimizers.Adam(10 * learning_rate)

        self.dynamics_optimizer = tf.keras.optimizers.Adam(learning_rate)
        self.reward_optimizer = tf.keras.optimizers.Adam(learning_rate)

        self.name = 'DBC_SACv2'

    @property
    def alpha(self):
        return tf.exp(self.log_alpha)

    def get_action(self, obs):
        obs = np.expand_dims(np.array(obs), axis=0)
        feature = self.encoder(obs)
        action = self.actor(feature).numpy()[0]

        return action

    def train(self, local_step):
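        # Two independent batches are sampled; their pairwise differences feed the
        # bisimulation (encoder) loss computed further below.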
        set1, set2 = self.buffer.dbc_sample(self.batch_size)

        s, a, r, ns, d = set1
        s2, a2, r2, ns2, d2 = set2

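        # Critic target: clipped double Q on the target networks plus the SAC
        # entropy term, detached from the graph with stop_gradient.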
        target_min_aq = tf.minimum(
            self.target_critic1(self.target_encoder(ns),
                                self.actor(self.encoder(ns))),
            self.target_critic2(self.target_encoder(ns),
                                self.actor(self.encoder(ns))))

        target_q = tf.stop_gradient(r + self.gamma * (1 - d) *
                                    (target_min_aq - self.alpha.numpy() *
                                     self.actor.log_pi(self.encoder(ns))))

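        # Update both critics; the encoder is trained jointly through each critic loss.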
        with tf.GradientTape(persistent=True) as tape1:
            critic1_loss = tf.reduce_mean(
                tf.square(self.critic1(self.encoder(s), a) - target_q))
            critic2_loss = tf.reduce_mean(
                tf.square(self.critic2(self.encoder(s), a) - target_q))

        critic1_gradients = tape1.gradient(
            critic1_loss, self.encoder.trainable_variables +
            self.critic1.trainable_variables)
        self.critic1_optimizer.apply_gradients(
            zip(
                critic1_gradients, self.encoder.trainable_variables +
                self.critic1.trainable_variables))

        critic2_gradients = tape1.gradient(
            critic2_loss, self.encoder.trainable_variables +
            self.critic2.trainable_variables)
        self.critic2_optimizer.apply_gradients(
            zip(
                critic2_gradients, self.encoder.trainable_variables +
                self.critic2.trainable_variables))

        del tape1

        # Train the latent dynamics model (the encoder is updated jointly).
        next_feature = self.encoder(ns)
        with tf.GradientTape() as tape2:
            feature = self.encoder(s)

            mu, sigma = self.dynamics_model(tf.concat([feature, a], axis=1))

            if sigma[0][0].numpy() == 0:
                # A zero sigma is only expected from a deterministic dynamics
                # model; fall back to unit variance so the loss stays finite.
                if not self.dynamics_model.deterministic:
                    print("error")
                sigma = tf.ones_like(mu)

            diff = (mu - tf.stop_gradient(next_feature)) / sigma
            dynamics_loss = tf.reduce_mean(0.5 * tf.square(diff) +
                                           tf.math.log(sigma))

        dynamics_gradients = tape2.gradient(
            dynamics_loss, self.encoder.trainable_variables +
            self.dynamics_model.trainable_variables)
        self.dynamics_optimizer.apply_gradients(
            zip(
                dynamics_gradients, self.encoder.trainable_variables +
                self.dynamics_model.trainable_variables))

        del tape2

        # Train the reward model (the encoder is updated jointly).
        with tf.GradientTape() as tape3:
            feature = self.encoder(s)
            sample_dynamics = self.dynamics_model.sample(
                tf.concat([feature, a], axis=1))
            reward_prediction = self.reward_model(sample_dynamics)

            reward_loss = tf.reduce_mean(tf.square(reward_prediction - r))

        reward_gradients = tape3.gradient(
            reward_loss, self.encoder.trainable_variables +
            self.reward_model.trainable_variables)
        self.reward_optimizer.apply_gradients(
            zip(
                reward_gradients, self.encoder.trainable_variables +
                self.reward_model.trainable_variables))

        del tape3

        # Train the encoder with the bisimulation (DBC) metric loss.
        with tf.GradientTape() as tape4:
            feature1 = self.encoder(s)
            feature2 = self.encoder(s2)

            mu1, sigma1 = self.dynamics_model(tf.concat([feature1, a], axis=1))
            mu2, sigma2 = self.dynamics_model(tf.concat([feature2, a2],
                                                        axis=1))

            z_dist = tf.abs(feature1 - feature2)
            r_dist = tf.abs(r - r2)

            transition_dist = tf.sqrt(
                tf.square(tf.abs(mu1 - mu2)) +
                tf.square(tf.abs(sigma1 - sigma2)))
            bisimilarity = (
                tf.cast(r_dist, tf.float32) +
                self.gamma * tf.cast(transition_dist, tf.float32)).numpy()
            encoder_loss = self.bisim_coef * tf.reduce_mean(
                tf.square(z_dist - bisimilarity))

        encoder_gradients = tape4.gradient(encoder_loss,
                                           self.encoder.trainable_variables)
        self.encoder_optimizer.apply_gradients(
            zip(encoder_gradients, self.encoder.trainable_variables))

        del tape4

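        # Delayed updates: the actor, the temperature, and the target networks
        # are refreshed every other training step.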
        if local_step % 2 == 0:
            with tf.GradientTape() as tape5:
                mu, sigma = self.actor.mu_sigma(
                    tf.stop_gradient(self.encoder(s)))
                output = mu + tf.random.normal(shape=mu.shape) * sigma

                min_aq_rep = tf.minimum(
                    self.critic1(tf.stop_gradient(self.encoder(s)), output),
                    self.critic2(tf.stop_gradient(self.encoder(s)), output))

                actor_loss = tf.reduce_mean(
                    self.alpha.numpy() *
                    self.actor.log_pi(tf.stop_gradient(self.encoder(s))) -
                    min_aq_rep)

            actor_gradients = tape5.gradient(actor_loss,
                                             self.actor.trainable_variables)
            self.actor_optimizer.apply_gradients(
                zip(actor_gradients, self.actor.trainable_variables))

            del tape5

            if self.train_alpha:
                with tf.GradientTape() as tape6:
                    alpha_loss = -(tf.exp(self.log_alpha) * tf.stop_gradient(
                        self.actor.log_pi(self.encoder(s)) +
                        self.target_entropy))
                    alpha_loss = tf.nn.compute_average_loss(alpha_loss)

                log_alpha_gradients = tape6.gradient(alpha_loss,
                                                     [self.log_alpha])
                self.log_alpha_optimizer.apply_gradients(
                    zip(log_alpha_gradients, [self.log_alpha]))

                del tape6

            soft_update(self.critic1, self.target_critic1, self.tau)
            soft_update(self.critic2, self.target_critic2, self.tau)
            soft_update(self.encoder, self.target_encoder, self.encoder_tau)
Example 5
    def __init__(self, obs_dim, action_dim, args):

        self.buffer = Buffer(args.buffer_size)

        self.obs_dim = obs_dim
        self.action_dim = action_dim
        self.image_size = args.image_size
        self.current_step = 0

        self.log_alpha = tf.Variable(initial_value=tf.math.log(args.alpha),
                                     trainable=True)
        self.target_entropy = -action_dim
        self.gamma = args.gamma

        self.batch_size = args.batch_size
        self.feature_dim = args.feature_dim

        self.layer_num = args.layer_num
        self.filter_num = args.filter_num
        self.tau = args.tau
        self.encoder_tau = args.encoder_tau
        self.actor_update = args.actor_update
        self.critic_update = args.critic_update
        self.decoder_update = args.decoder_update
        self.decoder_latent_lambda = args.decoder_latent_lambda
        self.decoder_weight_lambda = args.decoder_weight_lambda

        self.training_start = args.training_start
        self.training_step = args.training_step
        self.train_alpha = args.train_alpha

        self.actor = Squashed_Gaussian_Actor(self.feature_dim, self.action_dim,
                                             args.hidden_dim, args.log_std_min,
                                             args.log_std_max)
        self.critic1 = Q_network(self.feature_dim, self.action_dim,
                                 args.hidden_dim)
        self.critic2 = Q_network(self.feature_dim, self.action_dim,
                                 args.hidden_dim)
        self.target_critic1 = Q_network(self.feature_dim, self.action_dim,
                                        args.hidden_dim)
        self.target_critic2 = Q_network(self.feature_dim, self.action_dim,
                                        args.hidden_dim)

        self.encoder = PixelEncoder(self.obs_dim, self.feature_dim,
                                    self.layer_num, self.filter_num)
        self.target_encoder = PixelEncoder(self.obs_dim, self.feature_dim,
                                           self.layer_num, self.filter_num)
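        # Pixel decoder for the autoencoder branch (SACv2_AE).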
        self.decoder = PixelDecoder(self.obs_dim, self.feature_dim,
                                    self.layer_num, self.filter_num)

        copy_weight(self.critic1, self.target_critic1)
        copy_weight(self.critic2, self.target_critic2)
        copy_weight(self.encoder, self.target_encoder)

        self.actor_optimizer = tf.keras.optimizers.Adam(args.actor_lr)
        self.critic1_optimizer = tf.keras.optimizers.Adam(args.critic_lr)
        self.critic2_optimizer = tf.keras.optimizers.Adam(args.critic_lr)

        self.encoder_optimizer = tf.keras.optimizers.Adam(args.encoder_lr)
        self.decoder_optimizer = tfa.optimizers.AdamW(
            weight_decay=self.decoder_weight_lambda,
            learning_rate=args.decoder_lr)

        self.log_alpha_optimizer = tf.keras.optimizers.Adam(args.alpha_lr,
                                                            beta_1=0.5)

        self.network_list = {
            'Actor': self.actor,
            'Critic1': self.critic1,
            'Critic2': self.critic2,
            'Target_Critic1': self.target_critic1,
            'Target_Critic2': self.target_critic2,
            'Encoder': self.encoder,
            'Target_Encoder': self.target_encoder,
            'Decoder': self.decoder
        }
        self.name = 'SACv2_AE'
Example 6
class SAC_v2:
    def __init__(self,
                 state_dim,
                 action_dim,
                 hidden_dim=256,
                 training_step=1,
                 alpha=0.1,
                 train_alpha=True,
                 batch_size=128,
                 buffer_size=1e6,
                 tau=0.005,
                 learning_rate=0.0003,
                 gamma=0.99,
                 reward_scale=1,
                 training_start=500):

        self.buffer = Buffer(buffer_size)

        self.actor_optimizer = tf.keras.optimizers.Adam(learning_rate)
        self.critic1_optimizer = tf.keras.optimizers.Adam(learning_rate)
        self.critic2_optimizer = tf.keras.optimizers.Adam(learning_rate)

        self.state_dim = state_dim
        self.action_dim = action_dim

        self.batch_size = batch_size
        self.tau = tau
        self.gamma = gamma
        self.reward_scale = reward_scale
        self.training_start = training_start
        self.training_step = training_step

        self.log_alpha = tf.Variable(np.log(alpha),
                                     dtype=tf.float32,
                                     trainable=True)
        self.target_entropy = -action_dim
        self.alpha_optimizer = tf.keras.optimizers.Adam(learning_rate)
        self.train_alpha = train_alpha

        self.actor = Squashed_Gaussian_Actor(self.state_dim, self.action_dim,
                                             (hidden_dim, hidden_dim))
        self.critic1 = Q_network(self.state_dim, self.action_dim,
                                 (hidden_dim, hidden_dim))
        self.target_critic1 = Q_network(self.state_dim, self.action_dim,
                                        (hidden_dim, hidden_dim))
        self.critic2 = Q_network(self.state_dim, self.action_dim,
                                 (hidden_dim, hidden_dim))
        self.target_critic2 = Q_network(self.state_dim, self.action_dim,
                                        (hidden_dim, hidden_dim))

        copy_weight(self.critic1, self.target_critic1)
        copy_weight(self.critic2, self.target_critic2)

        self.network_list = {
            'Actor': self.actor,
            'Critic1': self.critic1,
            'Critic2': self.critic2,
            'Target_Critic1': self.target_critic1,
            'Target_Critic2': self.target_critic2
        }
        self.name = 'SAC_v2'

    @property
    def alpha(self):
        return tf.exp(self.log_alpha)

    def get_action(self, state):
        state = np.expand_dims(np.array(state), axis=0)

        action = self.actor(state).numpy()[0]

        return action

    def train(self, training_num):
        for i in range(training_num):
            s, a, r, ns, d = self.buffer.sample(self.batch_size)

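            # Clipped double-Q target with the SAC entropy bonus.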
            target_min_aq = tf.minimum(self.target_critic1(ns, self.actor(ns)),
                                       self.target_critic2(ns, self.actor(ns)))

            target_q = tf.stop_gradient(
                r + self.gamma * (1 - d) *
                (target_min_aq - self.alpha.numpy() * self.actor.log_pi(ns)))

            # Critic training
            with tf.GradientTape(persistent=True) as tape1:
                critic1_loss = tf.reduce_mean(
                    tf.square(self.critic1(s, a) - target_q))
                critic2_loss = tf.reduce_mean(
                    tf.square(self.critic2(s, a) - target_q))

            critic1_gradients = tape1.gradient(
                critic1_loss, self.critic1.trainable_variables)
            self.critic1_optimizer.apply_gradients(
                zip(critic1_gradients, self.critic1.trainable_variables))
            critic2_gradients = tape1.gradient(
                critic2_loss, self.critic2.trainable_variables)
            self.critic2_optimizer.apply_gradients(
                zip(critic2_gradients, self.critic2.trainable_variables))

            del tape1

            # Actor training
            with tf.GradientTape() as tape2:
                mu, sigma = self.actor.mu_sigma(s)
                output = mu + tf.random.normal(shape=mu.shape) * sigma

                min_aq_rep = tf.minimum(self.critic1(s, output),
                                        self.critic2(s, output))

                actor_loss = tf.reduce_mean(self.alpha.numpy() *
                                            self.actor.log_pi(s) - min_aq_rep)

            actor_gradients = tape2.gradient(actor_loss,
                                             self.actor.trainable_variables)
            self.actor_optimizer.apply_gradients(
                zip(actor_gradients, self.actor.trainable_variables))

            del tape2

            # Alpha (temperature) training
            if self.train_alpha:
                with tf.GradientTape() as tape3:
                    alpha_loss = -(tf.exp(self.log_alpha) * (tf.stop_gradient(
                        self.actor.log_pi(s) + self.target_entropy)))
                    alpha_loss = tf.nn.compute_average_loss(
                        alpha_loss)  #from softlearning package

                alpha_grad = tape3.gradient(alpha_loss, [self.log_alpha])
                self.alpha_optimizer.apply_gradients(
                    zip(alpha_grad, [self.log_alpha]))

                del tape3

            soft_update(self.critic1, self.target_critic1, self.tau)
            soft_update(self.critic2, self.target_critic2, self.tau)
Example 7
class SAC_v1:
    def __init__(self,
                 state_dim,
                 action_dim,
                 hidden_dim=256,
                 training_step=1,
                 batch_size=128,
                 buffer_size=1e6,
                 tau=0.005,
                 learning_rate=0.0003,
                 gamma=0.99,
                 alpha=0.2,
                 reward_scale=1,
                 training_start=500):

        self.buffer = Buffer(buffer_size)

        self.actor_optimizer = tf.keras.optimizers.Adam(learning_rate)
        self.critic1_optimizer = tf.keras.optimizers.Adam(learning_rate)
        self.critic2_optimizer = tf.keras.optimizers.Adam(learning_rate)
        self.v_network_optimizer = tf.keras.optimizers.Adam(learning_rate)

        self.state_dim = state_dim
        self.action_dim = action_dim

        self.batch_size = batch_size
        self.tau = tau
        self.gamma = gamma
        self.alpha = alpha
        self.reward_scale = reward_scale
        self.training_start = training_start
        self.training_step = training_step

        self.actor = Squashed_Gaussian_Actor(self.state_dim, self.action_dim,
                                             (hidden_dim, hidden_dim))
        self.critic1 = Q_network(self.state_dim, self.action_dim,
                                 (hidden_dim, hidden_dim))
        self.critic2 = Q_network(self.state_dim, self.action_dim,
                                 (hidden_dim, hidden_dim))
        self.v_network = V_network(self.state_dim, (hidden_dim, hidden_dim))
        self.target_v_network = V_network(self.state_dim,
                                          (hidden_dim, hidden_dim))

        copy_weight(self.v_network, self.target_v_network)

        self.network_list = {
            'Actor': self.actor,
            'Critic1': self.critic1,
            'Critic2': self.critic2,
            'V_network': self.v_network,
            'Target_V_network': self.target_v_network
        }
        self.name = 'SAC_v1'

    def get_action(self, state):
        state = np.expand_dims(np.array(state), axis=0)

        action = self.actor(state).numpy()[0]

        return action

    def train(self, training_num):
        for i in range(training_num):
            s, a, r, ns, d = self.buffer.sample(self.batch_size)

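            # Soft state-value target: min of both Q estimates minus the entropy term.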
            min_aq = tf.minimum(self.critic1(s, self.actor(s)),
                                self.critic2(s, self.actor(s)))

            target_v = tf.stop_gradient(min_aq -
                                        self.alpha * self.actor.log_pi(s))
            # V-network training
            with tf.GradientTape(persistent=True) as tape1:
                v_loss = 0.5 * tf.reduce_mean(
                    tf.square(self.v_network(s) - target_v))

            v_gradients = tape1.gradient(v_loss,
                                         self.v_network.trainable_variables)
            self.v_network_optimizer.apply_gradients(
                zip(v_gradients, self.v_network.trainable_variables))

            del tape1

            target_q = tf.stop_gradient(r + self.gamma *
                                        (1 - d) * self.target_v_network(ns))
            # Critic training
            with tf.GradientTape(persistent=True) as tape2:

                critic1_loss = 0.5 * tf.reduce_mean(
                    tf.square(self.critic1(s, a) - target_q))
                critic2_loss = 0.5 * tf.reduce_mean(
                    tf.square(self.critic2(s, a) - target_q))

            critic1_gradients = tape2.gradient(
                critic1_loss, self.critic1.trainable_variables)
            self.critic1_optimizer.apply_gradients(
                zip(critic1_gradients, self.critic1.trainable_variables))

            critic2_gradients = tape2.gradient(
                critic2_loss, self.critic2.trainable_variables)
            self.critic2_optimizer.apply_gradients(
                zip(critic2_gradients, self.critic2.trainable_variables))

            del tape2
            # Actor training
            with tf.GradientTape() as tape3:
                mu, sigma = self.actor.mu_sigma(s)
                output = mu + tf.random.normal(shape=sigma.shape) * sigma

                min_aq_rep = tf.minimum(self.critic1(s, output),
                                        self.critic2(s, output))

                actor_loss = tf.reduce_mean(self.alpha * self.actor.log_pi(s) -
                                            min_aq_rep)

            actor_grad = tape3.gradient(actor_loss,
                                        self.actor.trainable_variables)
            self.actor_optimizer.apply_gradients(
                zip(actor_grad, self.actor.trainable_variables))

            del tape3

            soft_update(self.v_network, self.target_v_network, self.tau)