Example #1
0
    def __init__(self,
                 state_dim,
                 action_dim,
                 hidden_dim=256,
                 training_step=1,
                 alpha=0.1,
                 train_alpha=True,
                 batch_size=128,
                 buffer_size=1e6,
                 tau=0.005,
                 learning_rate=0.0003,
                 gamma=0.99,
                 reward_scale=1,
                 training_start=500):

        self.buffer = Buffer(buffer_size)

        self.actor_optimizer = tf.keras.optimizers.Adam(learning_rate)
        self.critic1_optimizer = tf.keras.optimizers.Adam(learning_rate)
        self.critic2_optimizer = tf.keras.optimizers.Adam(learning_rate)

        self.state_dim = state_dim
        self.action_dim = action_dim

        self.batch_size = batch_size
        self.tau = tau
        self.gamma = gamma
        self.reward_scale = reward_scale
        self.training_start = training_start
        self.training_step = training_step

        self.log_alpha = tf.Variable(np.log(alpha),
                                     dtype=tf.float32,
                                     trainable=True)
        self.target_entropy = -action_dim
        self.alpha_optimizer = tf.keras.optimizers.Adam(learning_rate)
        self.train_alpha = train_alpha

        self.actor = Squashed_Gaussian_Actor(self.state_dim, self.action_dim,
                                             (hidden_dim, hidden_dim))
        self.critic1 = Q_network(self.state_dim, self.action_dim,
                                 (hidden_dim, hidden_dim))
        self.target_critic1 = Q_network(self.state_dim, self.action_dim,
                                        (hidden_dim, hidden_dim))
        self.critic2 = Q_network(self.state_dim, self.action_dim,
                                 (hidden_dim, hidden_dim))
        self.target_critic2 = Q_network(self.state_dim, self.action_dim,
                                        (hidden_dim, hidden_dim))

        copy_weight(self.critic1, self.target_critic1)
        copy_weight(self.critic2, self.target_critic2)

        self.network_list = {
            'Actor': self.actor,
            'Critic1': self.critic1,
            'Critic2': self.critic2,
            'Target_Critic1': self.target_critic1,
            'Target_Critic2': self.target_critic2
        }
        self.name = 'SAC_v2'
Example #2
0
    def __init__(self, state_dim, action_dim, args):

        self.buffer = Buffer(args.buffer_size)

        self.optimizer = tf.keras.optimizers.Adam(args.learning_rate)

        self.state_dim = state_dim
        self.action_dim = action_dim

        self.batch_size = args.batch_size
        self.gamma = args.gamma
        self.lr = args.learning_rate
        self.epsilon = args.epsilon
        self.training_start = args.training_start
        self.training_step = args.training_step
        self.current_step = 0
        self.copy_iter = args.copy_iter

        self.network = Policy_network(self.state_dim, self.action_dim,
                                      args.hidden_dim)
        self.target_network = Policy_network(self.state_dim, self.action_dim,
                                             args.hidden_dim)

        copy_weight(self.network, self.target_network)

        self.network_list = {
            'Network': self.network,
            'Target_Network': self.target_network
        }
        self.name = 'Double DQN'
Example #3
0
    def __init__(self,
                 state_dim,
                 action_dim,
                 hidden_dim=256,
                 training_step=1,
                 batch_size=128,
                 buffer_size=1e6,
                 tau=0.005,
                 learning_rate=0.0003,
                 gamma=0.99,
                 alpha=0.2,
                 reward_scale=1,
                 training_start=500):

        self.buffer = Buffer(buffer_size)

        self.actor_optimizer = tf.keras.optimizers.Adam(learning_rate)
        self.critic1_optimizer = tf.keras.optimizers.Adam(learning_rate)
        self.critic2_optimizer = tf.keras.optimizers.Adam(learning_rate)
        self.v_network_optimizer = tf.keras.optimizers.Adam(learning_rate)

        self.state_dim = state_dim
        self.action_dim = action_dim

        self.batch_size = batch_size
        self.tau = tau
        self.gamma = gamma
        self.alpha = alpha
        self.reward_scale = reward_scale
        self.training_start = training_start
        self.training_step = training_step

        self.actor = Squashed_Gaussian_Actor(self.state_dim, self.action_dim,
                                             (hidden_dim, hidden_dim))
        self.critic1 = Q_network(self.state_dim, self.action_dim,
                                 (hidden_dim, hidden_dim))
        self.critic2 = Q_network(self.state_dim, self.action_dim,
                                 (hidden_dim, hidden_dim))
        self.v_network = V_network(self.state_dim, (hidden_dim, hidden_dim))
        self.target_v_network = V_network(self.state_dim,
                                          (hidden_dim, hidden_dim))

        copy_weight(self.v_network, self.target_v_network)

        self.network_list = {
            'Actor': self.actor,
            'Critic1': self.critic1,
            'Critic2': self.critic2,
            'V_network': self.v_network,
            'Target_V_network': self.target_v_network
        }
        self.name = 'SAC_v1'
Example #4
0
    def __init__(self, state_dim, action_dim, args):

        self.buffer = Buffer(args.buffer_size)

        self.actor_optimizer = tf.keras.optimizers.Adam(args.actor_lr)
        self.critic1_optimizer = tf.keras.optimizers.Adam(args.critic_lr)
        self.critic2_optimizer = tf.keras.optimizers.Adam(args.critic_lr)

        self.state_dim = state_dim
        self.action_dim = action_dim

        self.batch_size = args.batch_size
        self.gamma = args.gamma
        self.tau = args.tau
        self.actor_lr = args.actor_lr
        self.critic_lr = args.critic_lr
        self.policy_delay = args.policy_delay
        self.actor_noise = args.actor_noise
        self.target_noise = args.target_noise
        self.noise_clip = args.noise_clip
        self.training_start = args.training_start
        self.training_step = args.training_step
        self.current_step = 0

        self.actor = Policy_network(self.state_dim, self.action_dim,
                                    args.hidden_dim)
        self.target_actor = Policy_network(self.state_dim, self.action_dim,
                                           args.hidden_dim)
        self.critic1 = Q_network(self.state_dim, self.action_dim,
                                 args.hidden_dim)
        self.target_critic1 = Q_network(self.state_dim, self.action_dim,
                                        args.hidden_dim)
        self.critic2 = Q_network(self.state_dim, self.action_dim,
                                 args.hidden_dim)
        self.target_critic2 = Q_network(self.state_dim, self.action_dim,
                                        args.hidden_dim)

        copy_weight(self.actor, self.target_actor)
        copy_weight(self.critic1, self.target_critic1)
        copy_weight(self.critic2, self.target_critic2)

        self.network_list = {
            'Actor': self.actor,
            'Critic1': self.critic1,
            'Critic2': self.critic2,
            'Target_Critic1': self.target_critic1,
            'Target_Critic2': self.target_critic2
        }
        self.name = 'TD3'
Example #5
0
    def __init__(self, state_dim, action_dim, args):

        self.buffer = Buffer(args.buffer_size)

        self.actor_optimizer = tf.keras.optimizers.Adam(args.actor_lr)
        self.critic1_optimizer = tf.keras.optimizers.Adam(args.critic_lr)
        self.critic2_optimizer = tf.keras.optimizers.Adam(args.critic_lr)

        self.state_dim = state_dim
        self.action_dim = action_dim

        self.batch_size = args.batch_size
        self.tau = args.tau
        self.gamma = args.gamma
        self.training_start = args.training_start
        self.training_step = args.training_step
        self.current_step = 0
        self.critic_update = args.critic_update

        self.log_alpha = tf.Variable(np.log(args.alpha),
                                     dtype=tf.float32,
                                     trainable=True)
        self.target_entropy = -action_dim
        self.alpha_optimizer = tf.keras.optimizers.Adam(args.alpha_lr)
        self.train_alpha = args.train_alpha

        self.actor = Squashed_Gaussian_Actor(self.state_dim, self.action_dim,
                                             args.hidden_dim, args.log_std_min,
                                             args.log_std_max)
        self.critic1 = Q_network(self.state_dim, self.action_dim,
                                 args.hidden_dim)
        self.target_critic1 = Q_network(self.state_dim, self.action_dim,
                                        args.hidden_dim)
        self.critic2 = Q_network(self.state_dim, self.action_dim,
                                 args.hidden_dim)
        self.target_critic2 = Q_network(self.state_dim, self.action_dim,
                                        args.hidden_dim)

        copy_weight(self.critic1, self.target_critic1)
        copy_weight(self.critic2, self.target_critic2)

        self.network_list = {
            'Actor': self.actor,
            'Critic1': self.critic1,
            'Critic2': self.critic2,
            'Target_Critic1': self.target_critic1,
            'Target_Critic2': self.target_critic2
        }
        self.name = 'SAC_v2'
Example #6
0
    def __init__(self, obs_dim, action_dim, args):

        self.buffer = Buffer(args.buffer_size)
        self.optimizer = tf.keras.optimizers.Adam(args.learning_rate)

        self.obs_dim = obs_dim
        self.action_dim = action_dim

        self.feature_dim = args.feature_dim

        self.batch_size = args.batch_size
        self.gamma = args.gamma
        self.learning_rate = args.learning_rate
        self.epsilon = args.epsilon
        self.training_start = args.training_start
        self.training_step = args.training_step
        self.current_step = 0
        self.copy_iter = args.copy_iter

        self.layer_num = args.layer_num
        self.filter_num = args.filter_num

        self.network = Policy_network(self.feature_dim, self.action_dim,
                                      args.hidden_dim)
        self.target_network = Policy_network(self.feature_dim, self.action_dim,
                                             args.hidden_dim)

        self.encoder = PixelEncoder(self.obs_dim, self.feature_dim,
                                    self.layer_num, self.filter_num,
                                    'channels_last')
        self.target_encoder = PixelEncoder(self.obs_dim, self.feature_dim,
                                           self.layer_num, self.filter_num,
                                           'channels_last')

        copy_weight(self.network, self.target_network)
        copy_weight(self.encoder, self.target_encoder)

        self.network_list = {
            'Network': self.network,
            'Target_Network': self.target_network
        }
        self.name = 'ImageDQN'
Example #7
0
    def __init__(self, state_dim, action_dim, args):

        self.buffer = Buffer(args.buffer_size)

        self.actor_optimizer = tf.keras.optimizers.Adam(args.actor_lr)
        self.critic1_optimizer = tf.keras.optimizers.Adam(args.critic_lr)
        self.critic2_optimizer = tf.keras.optimizers.Adam(args.critic_lr)
        self.v_network_optimizer = tf.keras.optimizers.Adam(args.v_lr)

        self.state_dim = state_dim
        self.action_dim = action_dim

        self.batch_size = args.batch_size
        self.tau = args.tau
        self.gamma = args.gamma
        self.alpha = args.alpha
        self.training_start = args.training_start
        self.training_step = args.training_step
        self.current_step = 0

        self.actor = Squashed_Gaussian_Actor(self.state_dim, self.action_dim,
                                             args.hidden_dim, args.log_std_min,
                                             args.log_std_max)
        self.critic1 = Q_network(self.state_dim, self.action_dim,
                                 args.hidden_dim)
        self.critic2 = Q_network(self.state_dim, self.action_dim,
                                 args.hidden_dim)
        self.v_network = V_network(self.state_dim, args.hidden_dim)
        self.target_v_network = V_network(self.state_dim, args.hidden_dim)

        copy_weight(self.v_network, self.target_v_network)

        self.network_list = {
            'Actor': self.actor,
            'Critic1': self.critic1,
            'Critic2': self.critic2,
            'V_network': self.v_network,
            'Target_V_network': self.target_v_network
        }
        self.name = 'SAC_v1'
Example #8
0
    def __init__(self, state_dim, action_dim, args):

        self.buffer = Buffer(args.buffer_size)

        self.actor_optimizer = tf.keras.optimizers.Adam(args.actor_lr)
        self.critic_optimizer = tf.keras.optimizers.Adam(args.critic_lr)

        self.state_dim = state_dim
        self.action_dim = action_dim

        self.batch_size = args.batch_size
        self.gamma = args.gamma
        self.tau = args.tau
        self.noise_scale = args.noise_scale
        self.training_start = args.training_start
        self.training_step = args.training_step
        self.current_step = 0

        self.actor = Policy_network(self.state_dim, self.action_dim,
                                    args.hidden_dim)
        self.target_actor = Policy_network(self.state_dim, self.action_dim,
                                           args.hidden_dim)
        self.critic = Q_network(self.state_dim, self.action_dim,
                                args.hidden_dim)
        self.target_critic = Q_network(self.state_dim, self.action_dim,
                                       args.hidden_dim)

        copy_weight(self.actor, self.target_actor)
        copy_weight(self.critic, self.target_critic)

        self.network_list = {
            'Actor': self.actor,
            'Target_Actor': self.target_actor,
            'Critic': self.critic,
            'Target_Critic': self.target_critic
        }
        self.name = 'DDPG'
Example #9
0
    def __init__(self, state_dim, action_dim, args):

        self.discrete = args.discrete

        self.buffer = Buffer(args.buffer_size)

        self.gamma = args.gamma
        self.lambda_gae = args.lambda_gae
        self.batch_size = args.batch_size
        self.backtrack_iter = args.backtrack_iter
        self.backtrack_coeff = args.backtrack_coeff
        self.delta = args.delta

        self.actor_optimizer = tf.keras.optimizers.Adam(args.actor_lr)
        self.critic_optimizer = tf.keras.optimizers.Adam(args.critic_lr)

        self.state_dim = state_dim
        self.action_dim = action_dim
        self.training_start = 0
        self.training_step = args.training_step

        if self.discrete == True:
            self.actor = Policy_network(self.state_dim, self.action_dim,
                                        args.hidden_dim)
            self.backup_actor = Policy_network(self.state_dim, self.action_dim,
                                               args.hidden_dim)
        else:
            self.actor = Gaussian_Actor(self.state_dim, self.action_dim,
                                        args.hidden_dim)
            self.backup_actor = Gaussian_Actor(self.state_dim, self.action_dim,
                                               args.hidden_dim)

        self.critic = V_network(self.state_dim)

        self.network_list = {'Actor': self.actor, 'Critic': self.critic}
        self.name = 'TRPO'
Example #10
0
    def __init__(self, obs_dim, action_dim, args):

        self.buffer = Buffer(args.buffer_size)

        self.obs_dim = obs_dim
        self.action_dim = action_dim
        self.image_size = obs_dim[-1]

        self.current_step = 0

        self.gamma = args.gamma

        self.batch_size = args.batch_size
        self.feature_dim = args.feature_dim
        self.curl_latent_dim = args.curl_latent_dim

        self.layer_num = args.layer_num
        self.filter_num = args.filter_num
        self.tau = args.tau
        self.encoder_tau = args.encoder_tau

        self.policy_delay = args.policy_delay
        self.actor_noise = args.actor_noise
        self.target_noise = args.target_noise
        self.noise_clip = args.noise_clip

        self.training_start = args.training_start
        self.training_step = args.training_step

        self.actor = Policy_network(self.feature_dim, self.action_dim,
                                    args.hidden_dim)
        self.target_actor = Policy_network(self.feature_dim, self.action_dim,
                                           args.hidden_dim)
        self.critic1 = Q_network(self.feature_dim, self.action_dim,
                                 args.hidden_dim)
        self.critic2 = Q_network(self.feature_dim, self.action_dim,
                                 args.hidden_dim)
        self.target_critic1 = Q_network(self.feature_dim, self.action_dim,
                                        args.hidden_dim)
        self.target_critic2 = Q_network(self.feature_dim, self.action_dim,
                                        args.hidden_dim)

        self.encoder = PixelEncoder(self.obs_dim, self.feature_dim,
                                    self.layer_num, self.filter_num)
        self.target_encoder = PixelEncoder(self.obs_dim, self.feature_dim,
                                           self.layer_num, self.filter_num)

        copy_weight(self.actor, self.target_actor)
        copy_weight(self.critic1, self.target_critic1)
        copy_weight(self.critic2, self.target_critic2)
        copy_weight(self.encoder, self.target_encoder)

        self.curl = CURL(self.feature_dim, self.curl_latent_dim)

        self.actor_optimizer = tf.keras.optimizers.Adam(args.actor_lr)
        self.critic1_optimizer = tf.keras.optimizers.Adam(args.critic_lr)
        self.critic2_optimizer = tf.keras.optimizers.Adam(args.critic_lr)

        self.encoder_optimizer = tf.keras.optimizers.Adam(args.encoder_lr)
        self.cpc_optimizer = tf.keras.optimizers.Adam(args.cpc_lr)

        self.network_list = {
            'Actor': self.actor,
            'Critic1': self.critic1,
            'Critic2': self.critic2,
            'Target_Critic1': self.target_critic1,
            'Target_Critic2': self.target_critic2,
            'Curl': self.curl,
            'Encoder': self.encoder,
            'Target_Encoder': self.target_encoder
        }

        self.name = 'CURL_TD3'
Example #11
0
class CURL_TD3:
    def __init__(self, obs_dim, action_dim, args):

        self.buffer = Buffer(args.buffer_size)

        self.obs_dim = obs_dim
        self.action_dim = action_dim
        self.image_size = obs_dim[-1]

        self.current_step = 0

        self.gamma = args.gamma

        self.batch_size = args.batch_size
        self.feature_dim = args.feature_dim
        self.curl_latent_dim = args.curl_latent_dim

        self.layer_num = args.layer_num
        self.filter_num = args.filter_num
        self.tau = args.tau
        self.encoder_tau = args.encoder_tau

        self.policy_delay = args.policy_delay
        self.actor_noise = args.actor_noise
        self.target_noise = args.target_noise
        self.noise_clip = args.noise_clip

        self.training_start = args.training_start
        self.training_step = args.training_step

        self.actor = Policy_network(self.feature_dim, self.action_dim,
                                    args.hidden_dim)
        self.target_actor = Policy_network(self.feature_dim, self.action_dim,
                                           args.hidden_dim)
        self.critic1 = Q_network(self.feature_dim, self.action_dim,
                                 args.hidden_dim)
        self.critic2 = Q_network(self.feature_dim, self.action_dim,
                                 args.hidden_dim)
        self.target_critic1 = Q_network(self.feature_dim, self.action_dim,
                                        args.hidden_dim)
        self.target_critic2 = Q_network(self.feature_dim, self.action_dim,
                                        args.hidden_dim)

        self.encoder = PixelEncoder(self.obs_dim, self.feature_dim,
                                    self.layer_num, self.filter_num)
        self.target_encoder = PixelEncoder(self.obs_dim, self.feature_dim,
                                           self.layer_num, self.filter_num)

        copy_weight(self.actor, self.target_actor)
        copy_weight(self.critic1, self.target_critic1)
        copy_weight(self.critic2, self.target_critic2)
        copy_weight(self.encoder, self.target_encoder)

        self.curl = CURL(self.feature_dim, self.curl_latent_dim)

        self.actor_optimizer = tf.keras.optimizers.Adam(args.actor_lr)
        self.critic1_optimizer = tf.keras.optimizers.Adam(args.critic_lr)
        self.critic2_optimizer = tf.keras.optimizers.Adam(args.critic_lr)

        self.encoder_optimizer = tf.keras.optimizers.Adam(args.encoder_lr)
        self.cpc_optimizer = tf.keras.optimizers.Adam(args.cpc_lr)

        self.network_list = {
            'Actor': self.actor,
            'Critic1': self.critic1,
            'Critic2': self.critic2,
            'Target_Critic1': self.target_critic1,
            'Target_Critic2': self.target_critic2,
            'Curl': self.curl,
            'Encoder': self.encoder,
            'Target_Encoder': self.target_encoder
        }

        self.name = 'CURL_TD3'

    def get_action(self, obs):

        if obs.shape[-1] != self.image_size:
            obs = center_crop_image(obs, self.image_size)

        obs = np.expand_dims(np.array(obs), axis=0)
        noise = np.random.normal(loc=0,
                                 scale=self.actor_noise,
                                 size=self.action_dim)
        feature = self.encoder(obs)
        action = self.actor(feature).numpy()[0] + noise
        action = np.clip(action, -1, 1)

        return action

    def eval_action(self, obs):

        if obs.shape[-1] != self.image_size:
            obs = center_crop_image(obs, self.image_size)

        obs = np.expand_dims(np.array(obs), axis=0)
        feature = self.encoder(obs)
        action = self.actor(feature).numpy()[0]
        action = np.clip(action, -1, 1)

        return action

    def train(self, local_step):
        self.current_step += 1

        total_a_loss = 0
        total_c1_loss, total_c2_loss = 0, 0
        total_cpc_loss = 0
        loss_list = []
        s, a, r, ns, d, cpc_kwargs = self.buffer.cpc_sample(
            self.batch_size, self.image_size)

        obs_anchor, obs_pos = cpc_kwargs["obs_anchor"], cpc_kwargs["obs_pos"]

        with tf.GradientTape(persistent=True) as tape:
            z_a = self.encoder(obs_anchor)
            z_pos = tf.stop_gradient(self.target_encoder(obs_pos))
            logits = self.curl.compute_logits(z_a, z_pos)
            labels = tf.range(logits.shape[0], dtype='int64')

            cpc_loss = tf.reduce_mean(
                tf.nn.sparse_softmax_cross_entropy_with_logits(labels, logits))

        cpc_gradients = tape.gradient(cpc_loss, self.curl.trainable_variables)
        self.cpc_optimizer.apply_gradients(
            zip(cpc_gradients, self.curl.trainable_variables))

        encoder_gradients = tape.gradient(cpc_loss,
                                          self.encoder.trainable_variables)
        self.encoder_optimizer.apply_gradients(
            zip(encoder_gradients, self.encoder.trainable_variables))

        total_cpc_loss += cpc_loss.numpy()
        loss_list.append(['Loss/CPC', total_cpc_loss])

        del tape

        if self.current_step % 2 == 0:
            target_action = tf.clip_by_value(
                self.target_actor(self.target_encoder(ns)) + tf.clip_by_value(
                    tf.random.normal(shape=self.target_actor(
                        self.target_encoder(ns)).shape,
                                     mean=0,
                                     stddev=self.target_noise),
                    -self.noise_clip, self.noise_clip), -1, 1)

            target_value = tf.stop_gradient(r + self.gamma * (
                1 - d
            ) * tf.minimum(
                self.target_critic1(self.target_encoder(ns), target_action),
                self.target_critic2(self.target_encoder(ns), target_action)))

            with tf.GradientTape(persistent=True) as tape:
                critic1_loss = 0.5 * tf.reduce_mean(
                    tf.square(target_value - self.critic1(self.encoder(s), a)))
                critic2_loss = 0.5 * tf.reduce_mean(
                    tf.square(target_value - self.critic2(self.encoder(s), a)))

            critic1_grad = tape.gradient(
                critic1_loss, self.encoder.trainable_variables +
                self.critic1.trainable_variables)
            self.critic1_optimizer.apply_gradients(
                zip(
                    critic1_grad, self.encoder.trainable_variables +
                    self.critic1.trainable_variables))

            critic2_grad = tape.gradient(
                critic2_loss, self.encoder.trainable_variables +
                self.critic2.trainable_variables)
            self.critic2_optimizer.apply_gradients(
                zip(
                    critic2_grad, self.encoder.trainable_variables +
                    self.critic2.trainable_variables))

            if self.current_step % (2 * self.policy_delay) == 0:
                with tf.GradientTape() as tape2:
                    actor_loss = -tf.reduce_mean(
                        self.critic1(
                            tf.stop_gradient(self.encoder(s)),
                            self.actor(tf.stop_gradient(self.encoder(s)))))

                actor_grad = tape2.gradient(actor_loss,
                                            self.actor.trainable_variables)
                self.actor_optimizer.apply_gradients(
                    zip(actor_grad, self.actor.trainable_variables))

                soft_update(self.actor, self.target_actor, self.tau)
                soft_update(self.critic1, self.target_critic1, self.tau)
                soft_update(self.critic2, self.target_critic2, self.tau)
                soft_update(self.encoder, self.target_encoder,
                            self.encoder_tau)

                total_a_loss += actor_loss.numpy()
                loss_list.append(['Loss/Actor', total_a_loss])

            total_c1_loss += critic1_loss.numpy()
            total_c2_loss += critic2_loss.numpy()

            loss_list.append(['Loss/Critic1', total_c1_loss])
            loss_list.append(['Loss/Critic2', total_c2_loss])

        return loss_list
Example #12
0
    def __init__(self, obs_dim, action_dim, args):

        self.buffer = Buffer(args.buffer_size)

        self.obs_dim = obs_dim
        self.action_dim = action_dim
        self.image_size = obs_dim[-1]
        self.current_step = 0

        self.log_alpha = tf.Variable(initial_value=tf.math.log(args.alpha),
                                     trainable=True)
        self.target_entropy = -action_dim
        self.gamma = args.gamma

        self.batch_size = args.batch_size
        self.feature_dim = args.feature_dim
        self.curl_latent_dim = args.curl_latent_dim

        self.layer_num = args.layer_num
        self.filter_num = args.filter_num
        self.tau = args.tau
        self.encoder_tau = args.encoder_tau
        self.critic_update = args.critic_update

        self.training_start = args.training_start
        self.training_step = args.training_step
        self.train_alpha = args.train_alpha

        self.actor = Squashed_Gaussian_Actor(self.feature_dim, self.action_dim,
                                             args.hidden_dim, args.log_std_min,
                                             args.log_std_max)
        self.critic1 = Q_network(self.feature_dim, self.action_dim,
                                 args.hidden_dim)
        self.critic2 = Q_network(self.feature_dim, self.action_dim,
                                 args.hidden_dim)
        self.target_critic1 = Q_network(self.feature_dim, self.action_dim,
                                        args.hidden_dim)
        self.target_critic2 = Q_network(self.feature_dim, self.action_dim,
                                        args.hidden_dim)

        self.encoder = PixelEncoder(self.obs_dim, self.feature_dim,
                                    self.layer_num, self.filter_num)
        self.target_encoder = PixelEncoder(self.obs_dim, self.feature_dim,
                                           self.layer_num, self.filter_num)

        copy_weight(self.critic1, self.target_critic1)
        copy_weight(self.critic2, self.target_critic2)
        copy_weight(self.encoder, self.target_encoder)

        self.curl = CURL(self.feature_dim, self.curl_latent_dim)

        self.actor_optimizer = tf.keras.optimizers.Adam(args.actor_lr)
        self.critic1_optimizer = tf.keras.optimizers.Adam(args.critic_lr)
        self.critic2_optimizer = tf.keras.optimizers.Adam(args.critic_lr)

        self.encoder_optimizer = tf.keras.optimizers.Adam(args.encoder_lr)
        self.cpc_optimizer = tf.keras.optimizers.Adam(args.cpc_lr)
        self.log_alpha_optimizer = tf.keras.optimizers.Adam(args.alpha_lr,
                                                            beta_1=0.5)

        self.network_list = {
            'Actor': self.actor,
            'Critic1': self.critic1,
            'Critic2': self.critic2,
            'Target_Critic1': self.target_critic1,
            'Target_Critic2': self.target_critic2,
            'Curl': self.curl,
            'Encoder': self.encoder,
            'Target_Encoder': self.target_encoder
        }

        self.name = 'CURL_SACv2'
Example #13
0
class CURL_SACv2:
    def __init__(self, obs_dim, action_dim, args):

        self.buffer = Buffer(args.buffer_size)

        self.obs_dim = obs_dim
        self.action_dim = action_dim
        self.image_size = obs_dim[-1]
        self.current_step = 0

        self.log_alpha = tf.Variable(initial_value=tf.math.log(args.alpha),
                                     trainable=True)
        self.target_entropy = -action_dim
        self.gamma = args.gamma

        self.batch_size = args.batch_size
        self.feature_dim = args.feature_dim
        self.curl_latent_dim = args.curl_latent_dim

        self.layer_num = args.layer_num
        self.filter_num = args.filter_num
        self.tau = args.tau
        self.encoder_tau = args.encoder_tau
        self.critic_update = args.critic_update

        self.training_start = args.training_start
        self.training_step = args.training_step
        self.train_alpha = args.train_alpha

        self.actor = Squashed_Gaussian_Actor(self.feature_dim, self.action_dim,
                                             args.hidden_dim, args.log_std_min,
                                             args.log_std_max)
        self.critic1 = Q_network(self.feature_dim, self.action_dim,
                                 args.hidden_dim)
        self.critic2 = Q_network(self.feature_dim, self.action_dim,
                                 args.hidden_dim)
        self.target_critic1 = Q_network(self.feature_dim, self.action_dim,
                                        args.hidden_dim)
        self.target_critic2 = Q_network(self.feature_dim, self.action_dim,
                                        args.hidden_dim)

        self.encoder = PixelEncoder(self.obs_dim, self.feature_dim,
                                    self.layer_num, self.filter_num)
        self.target_encoder = PixelEncoder(self.obs_dim, self.feature_dim,
                                           self.layer_num, self.filter_num)

        copy_weight(self.critic1, self.target_critic1)
        copy_weight(self.critic2, self.target_critic2)
        copy_weight(self.encoder, self.target_encoder)

        self.curl = CURL(self.feature_dim, self.curl_latent_dim)

        self.actor_optimizer = tf.keras.optimizers.Adam(args.actor_lr)
        self.critic1_optimizer = tf.keras.optimizers.Adam(args.critic_lr)
        self.critic2_optimizer = tf.keras.optimizers.Adam(args.critic_lr)

        self.encoder_optimizer = tf.keras.optimizers.Adam(args.encoder_lr)
        self.cpc_optimizer = tf.keras.optimizers.Adam(args.cpc_lr)
        self.log_alpha_optimizer = tf.keras.optimizers.Adam(args.alpha_lr,
                                                            beta_1=0.5)

        self.network_list = {
            'Actor': self.actor,
            'Critic1': self.critic1,
            'Critic2': self.critic2,
            'Target_Critic1': self.target_critic1,
            'Target_Critic2': self.target_critic2,
            'Curl': self.curl,
            'Encoder': self.encoder,
            'Target_Encoder': self.target_encoder
        }

        self.name = 'CURL_SACv2'

    @property
    def alpha(self):
        return tf.exp(self.log_alpha)

    def get_action(self, obs):

        if obs.shape[-1] != self.image_size:
            obs = center_crop_image(obs, self.image_size)

        obs = np.expand_dims(np.array(obs), axis=0)
        feature = self.encoder(obs)
        action, _ = self.actor(feature)
        action = action.numpy()[0]

        return action

    def eval_action(self, obs):

        if obs.shape[-1] != self.image_size:
            obs = center_crop_image(obs, self.image_size)

        obs = np.expand_dims(np.array(obs), axis=0)
        feature = self.encoder(obs)
        action, _ = self.actor(feature, deterministic=True)
        action = action.numpy()[0]

        return action

    def train(self, local_step):
        self.current_step += 1

        total_a_loss = 0
        total_c1_loss, total_c2_loss = 0, 0
        total_cpc_loss = 0
        total_alpha_loss = 0
        loss_list = []

        s, a, r, ns, d, cpc_kwargs = self.buffer.cpc_sample(
            self.batch_size, self.image_size)

        obs_anchor, obs_pos = cpc_kwargs["obs_anchor"], cpc_kwargs["obs_pos"]

        ns_action, ns_logpi = self.actor(self.encoder(ns))

        target_min_aq = tf.minimum(
            self.target_critic1(self.target_encoder(ns), ns_action),
            self.target_critic2(self.target_encoder(ns), ns_action))

        target_q = tf.stop_gradient(
            r + self.gamma * (1 - d) *
            (target_min_aq - self.alpha.numpy() * ns_logpi))

        with tf.GradientTape(persistent=True) as tape1:
            critic1_loss = tf.reduce_mean(
                tf.square(self.critic1(self.encoder(s), a) - target_q))
            critic2_loss = tf.reduce_mean(
                tf.square(self.critic2(self.encoder(s), a) - target_q))

        critic1_gradients = tape1.gradient(
            critic1_loss, self.encoder.trainable_variables +
            self.critic1.trainable_variables)
        self.critic1_optimizer.apply_gradients(
            zip(
                critic1_gradients, self.encoder.trainable_variables +
                self.critic1.trainable_variables))

        critic2_gradients = tape1.gradient(
            critic2_loss, self.encoder.trainable_variables +
            self.critic2.trainable_variables)
        self.critic2_optimizer.apply_gradients(
            zip(
                critic2_gradients, self.encoder.trainable_variables +
                self.critic2.trainable_variables))

        del tape1

        with tf.GradientTape() as tape2:

            s_action, s_logpi = self.actor(tf.stop_gradient(self.encoder(s)))

            min_aq_rep = tf.minimum(
                self.critic1(tf.stop_gradient(self.encoder(s)), s_action),
                self.critic2(tf.stop_gradient(self.encoder(s)), s_action))

            actor_loss = tf.reduce_mean(self.alpha.numpy() * s_logpi -
                                        min_aq_rep)

        actor_gradients = tape2.gradient(actor_loss,
                                         self.actor.trainable_variables)
        self.actor_optimizer.apply_gradients(
            zip(actor_gradients, self.actor.trainable_variables))

        del tape2

        if self.train_alpha == True:
            with tf.GradientTape() as tape3:
                _, s_logpi = self.actor(self.encoder(s))
                alpha_loss = -(tf.exp(self.log_alpha) *
                               tf.stop_gradient(s_logpi + self.target_entropy))
                alpha_loss = tf.nn.compute_average_loss(alpha_loss)
                #alpha_loss = tf.reduce_mean(alpha_loss)

            log_alpha_gradients = tape3.gradient(alpha_loss, [self.log_alpha])
            self.log_alpha_optimizer.apply_gradients(
                zip(log_alpha_gradients, [self.log_alpha]))

            del tape3

        if self.current_step % self.critic_update == 0:
            soft_update(self.critic1, self.target_critic1, self.tau)
            soft_update(self.critic2, self.target_critic2, self.tau)
            soft_update(self.encoder, self.target_encoder, self.encoder_tau)

        with tf.GradientTape(persistent=True) as tape4:
            z_a = self.encoder(obs_anchor)
            z_pos = tf.stop_gradient(self.target_encoder(obs_pos))
            logits = self.curl.compute_logits(z_a, z_pos)
            labels = tf.range(logits.shape[0], dtype='int64')

            cpc_loss = tf.reduce_mean(
                tf.nn.sparse_softmax_cross_entropy_with_logits(labels, logits))

        cpc_gradients = tape4.gradient(cpc_loss, self.curl.trainable_variables)
        self.cpc_optimizer.apply_gradients(
            zip(cpc_gradients, self.curl.trainable_variables))

        encoder_gradients = tape4.gradient(cpc_loss,
                                           self.encoder.trainable_variables)
        self.encoder_optimizer.apply_gradients(
            zip(encoder_gradients, self.encoder.trainable_variables))

        del tape4

        total_c1_loss += critic1_loss.numpy()
        total_c2_loss += critic2_loss.numpy()

        loss_list.append(['Loss/Critic1', total_c1_loss])
        loss_list.append(['Loss/Critic2', total_c2_loss])

        total_a_loss += actor_loss.numpy()
        loss_list.append(['Loss/Actor', total_a_loss])

        total_cpc_loss += cpc_loss.numpy()
        loss_list.append(['Loss/CPC', total_cpc_loss])

        if self.train_alpha == True:
            total_alpha_loss += alpha_loss.numpy()
            loss_list.append(['Loss/Alpha', total_alpha_loss])

        loss_list.append(['Alpha', tf.exp(self.log_alpha).numpy()])

        return loss_list
Example #14
0
    def __init__(self,
                 obs_dim,
                 action_dim,
                 hidden_dim=256,
                 gamma=0.99,
                 learning_rate=1e-5,
                 batch_size=128,
                 buffer_size=1e6,
                 feature_dim=50,
                 layer_num=4,
                 filter_num=32,
                 tau=0.005,
                 encoder_tau=0.005,
                 bisim_coef=0.5,
                 training_start=1000,
                 train_alpha=True,
                 alpha=0.1):

        self.buffer = Buffer(buffer_size)

        self.obs_dim = obs_dim
        self.action_dim = action_dim

        self.log_alpha = tf.Variable(initial_value=tf.math.log(alpha),
                                     trainable=True)
        self.target_entropy = -action_dim
        self.hidden_dim = hidden_dim
        self.gamma = gamma
        self.learning_rate = learning_rate
        self.bisim_coef = bisim_coef

        self.batch_size = batch_size
        self.feature_dim = feature_dim

        self.layer_num = layer_num
        self.filter_num = filter_num
        self.tau = tau
        self.encoder_tau = encoder_tau

        self.training_start = training_start
        self.train_alpha = train_alpha

        self.actor = Squashed_Gaussian_Actor(feature_dim, action_dim,
                                             (hidden_dim, hidden_dim))
        self.critic1 = Q_network(feature_dim, action_dim,
                                 (hidden_dim, hidden_dim))
        self.critic2 = Q_network(feature_dim, action_dim,
                                 (hidden_dim, hidden_dim))
        self.target_critic1 = Q_network(feature_dim, action_dim,
                                        (hidden_dim, hidden_dim))
        self.target_critic2 = Q_network(feature_dim, action_dim,
                                        (hidden_dim, hidden_dim))

        self.encoder = PixelEncoder(self.obs_dim, feature_dim, layer_num,
                                    filter_num)
        self.target_encoder = PixelEncoder(self.obs_dim, feature_dim,
                                           layer_num, filter_num)

        self.dynamics_model = Transition_Network(feature_dim,
                                                 action_dim,
                                                 deterministic=False)
        self.reward_model = Reward_Network(feature_dim)

        copy_weight(self.critic1, self.target_critic1)
        copy_weight(self.critic2, self.target_critic2)
        copy_weight(self.encoder, self.target_encoder)

        self.actor_optimizer = tf.keras.optimizers.Adam(learning_rate)
        self.critic1_optimizer = tf.keras.optimizers.Adam(learning_rate)
        self.critic2_optimizer = tf.keras.optimizers.Adam(learning_rate)

        self.encoder_optimizer = tf.keras.optimizers.Adam(learning_rate)
        self.log_alpha_optimizer = tf.keras.optimizers.Adam(10 * learning_rate)

        self.dynamics_optimizer = tf.keras.optimizers.Adam(learning_rate)
        self.reward_optimizer = tf.keras.optimizers.Adam(learning_rate)

        self.name = 'DBC_SACv2'
Example #15
0
class SAC_v2:
    def __init__(self, state_dim, action_dim, args):

        self.buffer = Buffer(args.buffer_size)

        self.actor_optimizer = tf.keras.optimizers.Adam(args.actor_lr)
        self.critic1_optimizer = tf.keras.optimizers.Adam(args.critic_lr)
        self.critic2_optimizer = tf.keras.optimizers.Adam(args.critic_lr)

        self.state_dim = state_dim
        self.action_dim = action_dim

        self.batch_size = args.batch_size
        self.tau = args.tau
        self.gamma = args.gamma
        self.training_start = args.training_start
        self.training_step = args.training_step
        self.current_step = 0
        self.critic_update = args.critic_update

        self.log_alpha = tf.Variable(np.log(args.alpha),
                                     dtype=tf.float32,
                                     trainable=True)
        self.target_entropy = -action_dim
        self.alpha_optimizer = tf.keras.optimizers.Adam(args.alpha_lr)
        self.train_alpha = args.train_alpha

        self.actor = Squashed_Gaussian_Actor(self.state_dim, self.action_dim,
                                             args.hidden_dim, args.log_std_min,
                                             args.log_std_max)
        self.critic1 = Q_network(self.state_dim, self.action_dim,
                                 args.hidden_dim)
        self.target_critic1 = Q_network(self.state_dim, self.action_dim,
                                        args.hidden_dim)
        self.critic2 = Q_network(self.state_dim, self.action_dim,
                                 args.hidden_dim)
        self.target_critic2 = Q_network(self.state_dim, self.action_dim,
                                        args.hidden_dim)

        copy_weight(self.critic1, self.target_critic1)
        copy_weight(self.critic2, self.target_critic2)

        self.network_list = {
            'Actor': self.actor,
            'Critic1': self.critic1,
            'Critic2': self.critic2,
            'Target_Critic1': self.target_critic1,
            'Target_Critic2': self.target_critic2
        }
        self.name = 'SAC_v2'

    @property
    def alpha(self):
        return tf.exp(self.log_alpha)

    def get_action(self, state):
        state = np.expand_dims(np.array(state), axis=0)
        action, _ = self.actor(state)
        action = np.clip(action.numpy()[0], -1, 1)

        return action

    def eval_action(self, state):
        state = np.expand_dims(np.array(state), axis=0)
        action, _ = self.actor(state, deterministic=True)
        action = np.clip(action.numpy()[0], -1, 1)

        return action

    def train(self, training_num):
        total_a_loss = 0
        total_c1_loss, total_c2_loss = 0, 0
        total_alpha_loss = 0

        for i in range(training_num):
            self.current_step += 1
            s, a, r, ns, d = self.buffer.sample(self.batch_size)

            ns_action, ns_logpi = self.actor(ns)

            target_min_aq = tf.minimum(self.target_critic1(ns, ns_action),
                                       self.target_critic2(ns, ns_action))

            target_q = tf.stop_gradient(
                r + self.gamma * (1 - d) *
                (target_min_aq - self.alpha.numpy() * ns_logpi))

            with tf.GradientTape(persistent=True) as tape1:

                critic1_loss = 0.5 * tf.reduce_mean(
                    tf.square(self.critic1(s, a) - target_q))
                critic2_loss = 0.5 * tf.reduce_mean(
                    tf.square(self.critic2(s, a) - target_q))

            critic1_gradients = tape1.gradient(
                critic1_loss, self.critic1.trainable_variables)
            self.critic1_optimizer.apply_gradients(
                zip(critic1_gradients, self.critic1.trainable_variables))
            critic2_gradients = tape1.gradient(
                critic2_loss, self.critic2.trainable_variables)
            self.critic2_optimizer.apply_gradients(
                zip(critic2_gradients, self.critic2.trainable_variables))

            del tape1

            with tf.GradientTape() as tape2:
                s_action, s_logpi = self.actor(s)

                min_aq_rep = tf.minimum(self.critic1(s, s_action),
                                        self.critic2(s, s_action))

                actor_loss = 0.5 * tf.reduce_mean(self.alpha.numpy() *
                                                  s_logpi - min_aq_rep)

            actor_gradients = tape2.gradient(actor_loss,
                                             self.actor.trainable_variables)
            self.actor_optimizer.apply_gradients(
                zip(actor_gradients, self.actor.trainable_variables))

            del tape2

            if self.train_alpha == True:
                with tf.GradientTape() as tape3:
                    _, s_logpi = self.actor(s)
                    alpha_loss = -(
                        tf.exp(self.log_alpha) *
                        (tf.stop_gradient(s_logpi + self.target_entropy)))
                    alpha_loss = tf.nn.compute_average_loss(
                        alpha_loss)  #from softlearning package

                alpha_grad = tape3.gradient(alpha_loss, [self.log_alpha])
                self.alpha_optimizer.apply_gradients(
                    zip(alpha_grad, [self.log_alpha]))

                del tape3

            if self.current_step % self.critic_update == 0:
                soft_update(self.critic1, self.target_critic1, self.tau)
                soft_update(self.critic2, self.target_critic2, self.tau)

            total_a_loss += actor_loss.numpy()
            total_c1_loss += critic1_loss.numpy()
            total_c2_loss += critic2_loss.numpy()
            if self.train_alpha == True:
                total_alpha_loss += alpha_loss.numpy()

        return [['Loss/Actor', total_a_loss], ['Loss/Critic1', total_c1_loss],
                ['Loss/Critic2', total_c2_loss],
                ['Loss/alpha', total_alpha_loss],
                ['Alpha', tf.exp(self.log_alpha).numpy()]]
Example #16
0
class TD3:
    def __init__(self, state_dim, action_dim, args):

        self.buffer = Buffer(args.buffer_size)

        self.actor_optimizer = tf.keras.optimizers.Adam(args.actor_lr)
        self.critic1_optimizer = tf.keras.optimizers.Adam(args.critic_lr)
        self.critic2_optimizer = tf.keras.optimizers.Adam(args.critic_lr)

        self.state_dim = state_dim
        self.action_dim = action_dim

        self.batch_size = args.batch_size
        self.gamma = args.gamma
        self.tau = args.tau
        self.actor_lr = args.actor_lr
        self.critic_lr = args.critic_lr
        self.policy_delay = args.policy_delay
        self.actor_noise = args.actor_noise
        self.target_noise = args.target_noise
        self.noise_clip = args.noise_clip
        self.training_start = args.training_start
        self.training_step = args.training_step
        self.current_step = 0

        self.actor = Policy_network(self.state_dim, self.action_dim,
                                    args.hidden_dim)
        self.target_actor = Policy_network(self.state_dim, self.action_dim,
                                           args.hidden_dim)
        self.critic1 = Q_network(self.state_dim, self.action_dim,
                                 args.hidden_dim)
        self.target_critic1 = Q_network(self.state_dim, self.action_dim,
                                        args.hidden_dim)
        self.critic2 = Q_network(self.state_dim, self.action_dim,
                                 args.hidden_dim)
        self.target_critic2 = Q_network(self.state_dim, self.action_dim,
                                        args.hidden_dim)

        copy_weight(self.actor, self.target_actor)
        copy_weight(self.critic1, self.target_critic1)
        copy_weight(self.critic2, self.target_critic2)

        self.network_list = {
            'Actor': self.actor,
            'Critic1': self.critic1,
            'Critic2': self.critic2,
            'Target_Critic1': self.target_critic1,
            'Target_Critic2': self.target_critic2
        }
        self.name = 'TD3'

    def get_action(self, state):
        state = np.expand_dims(np.array(state), axis=0)
        noise = np.random.normal(loc=0,
                                 scale=self.actor_noise,
                                 size=self.action_dim)
        action = self.actor(state).numpy()[0] + noise
        action = np.clip(action, -1, 1)

        return action

    def eval_action(self, state):
        state = np.expand_dims(np.array(state), axis=0)
        action = self.actor(state).numpy()[0]
        action = np.clip(action, -1, 1)

        return action

    def train(self, training_num):
        total_a_loss = 0
        total_c1_loss, total_c2_loss = 0, 0
        for i in range(training_num):
            self.current_step += 1
            s, a, r, ns, d = self.buffer.sample(self.batch_size)

            target_action = tf.clip_by_value(
                self.target_actor(ns) + tf.clip_by_value(
                    tf.random.normal(shape=self.target_actor(ns).shape,
                                     mean=0,
                                     stddev=self.target_noise),
                    -self.noise_clip, self.noise_clip), -1, 1)

            target_value = tf.stop_gradient(
                r + self.gamma *
                (1 - d) * tf.minimum(self.target_critic1(ns, target_action),
                                     self.target_critic2(ns, target_action)))

            with tf.GradientTape(persistent=True) as tape:
                critic1_loss = 0.5 * tf.reduce_mean(
                    tf.square(target_value - self.critic1(s, a)))
                critic2_loss = 0.5 * tf.reduce_mean(
                    tf.square(target_value - self.critic2(s, a)))

            critic1_grad = tape.gradient(critic1_loss,
                                         self.critic1.trainable_variables)
            self.critic1_optimizer.apply_gradients(
                zip(critic1_grad, self.critic1.trainable_variables))

            critic2_grad = tape.gradient(critic2_loss,
                                         self.critic2.trainable_variables)
            self.critic2_optimizer.apply_gradients(
                zip(critic2_grad, self.critic2.trainable_variables))

            if self.current_step % self.policy_delay == 0:
                with tf.GradientTape() as tape2:
                    actor_loss = -tf.reduce_mean(self.critic1(
                        s, self.actor(s)))

                actor_grad = tape2.gradient(actor_loss,
                                            self.actor.trainable_variables)
                self.actor_optimizer.apply_gradients(
                    zip(actor_grad, self.actor.trainable_variables))

                soft_update(self.actor, self.target_actor, self.tau)
                soft_update(self.critic1, self.target_critic1, self.tau)
                soft_update(self.critic2, self.target_critic2, self.tau)

            del tape, tape2
            total_a_loss += actor_loss.numpy()
            total_c1_loss += critic1_loss.numpy()
            total_c2_loss += critic2_loss.numpy()

        return [['Loss/Actor', total_a_loss], ['Loss/Critic1', total_c1_loss],
                ['Loss/Critic2', total_c2_loss]]
Example #17
0
class TRPO:
    def __init__(self, state_dim, action_dim, args):

        self.discrete = args.discrete

        self.buffer = Buffer(args.buffer_size)

        self.gamma = args.gamma
        self.lambda_gae = args.lambda_gae
        self.batch_size = args.batch_size
        self.backtrack_iter = args.backtrack_iter
        self.backtrack_coeff = args.backtrack_coeff
        self.delta = args.delta

        self.actor_optimizer = tf.keras.optimizers.Adam(args.actor_lr)
        self.critic_optimizer = tf.keras.optimizers.Adam(args.critic_lr)

        self.state_dim = state_dim
        self.action_dim = action_dim
        self.training_start = 0
        self.training_step = args.training_step

        if self.discrete == True:
            self.actor = Policy_network(self.state_dim, self.action_dim,
                                        args.hidden_dim)
            self.backup_actor = Policy_network(self.state_dim, self.action_dim,
                                               args.hidden_dim)
        else:
            self.actor = Gaussian_Actor(self.state_dim, self.action_dim,
                                        args.hidden_dim)
            self.backup_actor = Gaussian_Actor(self.state_dim, self.action_dim,
                                               args.hidden_dim)

        self.critic = V_network(self.state_dim)

        self.network_list = {'Actor': self.actor, 'Critic': self.critic}
        self.name = 'TRPO'

    def get_action(self, state):
        state = np.expand_dims(np.array(state), axis=0)

        if self.discrete == True:
            policy = self.actor(state, activation='softmax').numpy()[0]
            action = np.random.choice(self.action_dim, 1, p=policy)[0]

        else:
            action = self.actor(state).numpy()[0]

        return action

    def eval_action(self, state):
        state = np.expand_dims(np.array(state), axis=0)

        if self.discrete == True:
            policy = self.actor(state, activation='softmax').numpy()[0]
            action = np.argmax(policy)

        else:
            action = self.actor(state, deterministic=True).numpy()[0]

        return action

    def fisher_vector_product(self, states, p):
        with tf.GradientTape() as tape2:
            with tf.GradientTape() as tape1:
                if self.discrete == True:
                    kl_divergence = tfp.distributions.kl_divergence(
                        tfp.distributions.Categorical(
                            probs=self.actor(states, activation='softmax')),
                        tfp.distributions.Categorical(probs=self.backup_actor(
                            states, activation='softmax')))
                else:
                    dist = self.actor.dist(states)
                    backup_dist = self.backup_actor.dist(states)

                    kl_divergence = tfp.distributions.kl_divergence(
                        dist, backup_dist)
                kl_divergence = tf.reduce_mean(kl_divergence)
            kl_grad = tape1.gradient(kl_divergence,
                                     self.actor.trainable_variables)

            flatten_kl_grad = tf.concat(
                [tf.reshape(grad, [-1]) for grad in kl_grad], axis=0)
            kl_grad_p = tf.reduce_sum(flatten_kl_grad * p)

        kl_hessian_p = tape2.gradient(kl_grad_p,
                                      self.actor.trainable_variables)
        flatten_kl_hessian_p = tf.concat(
            [tf.reshape(hessian, [-1]) for hessian in kl_hessian_p],
            axis=0).numpy()

        return flatten_kl_hessian_p + 0.1 * p

    def conjugate_gradient(self, states, b, nsteps):
        x = np.zeros_like(b)
        r = copy.deepcopy(b)
        p = copy.deepcopy(r)
        rdotr = np.dot(r, r)

        for i in range(nsteps):
            _Avp = self.fisher_vector_product(states, p)
            alpha = rdotr / (np.dot(p, _Avp) + 1e-8)
            x += alpha * p
            r -= alpha * _Avp
            new_rdotr = np.dot(r, r)
            beta = new_rdotr / (rdotr + 1e-8)
            p = r + beta * p
            rdotr = new_rdotr

        return x

    def update_model(self, model, new_variables):
        index = 0
        for variable in model.trainable_variables:
            variable_length = len(tf.reshape(variable, [-1]))
            new_variable = new_variables[index:index + variable_length]
            new_variable = tf.reshape(new_variable, tf.shape(variable))
            variable.assign(new_variable)

            index += variable_length

    def train(self, training_num):
        total_c_loss = 0
        s, a, r, ns, d = self.buffer.all_sample()

        old_values = self.critic(s)

        returns = np.zeros_like(r.numpy())
        advantages = np.zeros_like(returns)

        running_return = np.zeros(1)
        previous_value = np.zeros(1)
        running_advantage = np.zeros(1)

        for t in reversed(range(len(r))):  #General Advantage Estimation
            running_return = (r[t] + self.gamma * running_return *
                              (1 - d[t])).numpy()
            running_tderror = (r[t] + self.gamma * previous_value *
                               (1 - d[t]) - old_values[t]).numpy()
            running_advantage = (
                running_tderror +
                (self.gamma * self.lambda_gae) * running_advantage *
                (1 - d[t])).numpy()

            returns[t] = running_return
            previous_value = old_values[t]
            advantages[t] = running_advantage

        if self.discrete == True:
            old_policy = self.actor(s, activation='softmax')
            old_a_one_hot = tf.squeeze(tf.one_hot(tf.cast(a, tf.int32),
                                                  depth=self.action_dim),
                                       axis=1)
            old_log_policy = tf.reduce_sum(tf.math.log(old_policy) *
                                           tf.stop_gradient(old_a_one_hot),
                                           axis=1,
                                           keepdims=True)

        else:
            old_dist = self.actor.dist(s)
            old_log_policy = old_dist.log_prob(a)
            old_log_policy = tf.expand_dims(old_log_policy, axis=1)

        flattened_actor = tf.concat([
            tf.reshape(variable, [-1])
            for variable in self.actor.trainable_variables
        ],
                                    axis=0)
        self.update_model(self.backup_actor, flattened_actor)

        with tf.GradientTape() as tape:
            if self.discrete == True:
                policy = self.actor(s, activation='softmax')
                a_one_hot = tf.squeeze(tf.one_hot(tf.cast(a, tf.int32),
                                                  depth=self.action_dim),
                                       axis=1)
                log_policy = tf.reduce_sum(tf.math.log(policy) *
                                           tf.stop_gradient(a_one_hot),
                                           axis=1,
                                           keepdims=True)

                surrogate = tf.reduce_mean(
                    tf.exp(log_policy - tf.stop_gradient(old_log_policy)) *
                    advantages)

            else:
                dist = self.actor.dist(s)
                log_policy = dist.log_prob(a)
                log_policy = tf.expand_dims(log_policy, axis=1)

                surrogate = tf.reduce_mean(
                    tf.exp(log_policy - tf.stop_gradient(old_log_policy)) *
                    advantages)

        policy_grad = tape.gradient(surrogate, self.actor.trainable_variables)
        flatten_policy_grad = tf.concat(
            [tf.reshape(grad, [-1]) for grad in policy_grad], axis=0)

        step_dir = self.conjugate_gradient(s, flatten_policy_grad.numpy(), 10)

        shs = 0.5 * tf.reduce_sum(
            step_dir * self.fisher_vector_product(s, step_dir), axis=0)
        step_size = 1 / tf.sqrt(shs / self.delta)
        full_step = step_size * step_dir

        expected_improve = tf.reduce_sum(flatten_policy_grad * full_step,
                                         axis=0)

        flag = False
        fraction = 1.0

        for i in range(self.backtrack_iter):
            new_flattened_actor = flattened_actor + fraction * full_step
            self.update_model(self.actor, new_flattened_actor)

            if self.discrete == True:
                new_policy = self.actor(s, activation='softmax')
                new_a_one_hot = tf.squeeze(tf.one_hot(tf.cast(a, tf.int32),
                                                      depth=self.action_dim),
                                           axis=1)
                new_log_policy = tf.reduce_sum(tf.math.log(new_policy) *
                                               tf.stop_gradient(new_a_one_hot),
                                               axis=1,
                                               keepdims=True)
            else:
                new_dist = self.actor.dist(s)
                new_log_policy = new_dist.log_prob(a)
                new_log_policy = tf.expand_dims(new_log_policy, axis=1)

            new_surrogate = tf.reduce_mean(
                tf.exp(new_log_policy - old_log_policy) * advantages)

            loss_improve = new_surrogate - surrogate
            expected_improve *= fraction

            if self.discrete == True:
                new_kl_divergence = tfp.distributions.kl_divergence(
                    tfp.distributions.Categorical(
                        probs=self.actor(s, activation='softmax')),
                    tfp.distributions.Categorical(
                        probs=self.backup_actor(s, activation='softmax')))
            else:
                new_dist = self.actor.dist(s)
                backup_dist = self.backup_actor.dist(s)

                new_kl_divergence = tfp.distributions.kl_divergence(
                    new_dist, backup_dist)

            new_kl_divergence = tf.reduce_mean(new_kl_divergence)

            #print('kl: {:.4f}  loss improve: {:.4f}  expected improve: {:.4f}  ' 'number of line search: {}'.format(new_kl_divergence.numpy(), loss_improve, expected_improve, i))

            if new_kl_divergence.numpy(
            ) <= self.delta and loss_improve >= expected_improve:
                flag = True
                break

            fraction *= self.backtrack_coeff

        if not flag:
            self.update_model(self.actor, flattened_actor)
            print("Policy update failed")

        #critic_train

        n = len(s)
        arr = np.arange(n)

        for epoch in range(self.training_step):

            if n // self.batch_size > 0:
                np.random.shuffle(arr)
                batch_index = arr[:self.batch_size]
                batch_index.sort()
            else:
                batch_index = arr

            batch_s = s.numpy()[batch_index]
            batch_returns = returns[batch_index]

            with tf.GradientTape() as tape:
                critic_loss = 0.5 * tf.reduce_mean(
                    tf.square(
                        tf.stop_gradient(batch_returns) -
                        self.critic(batch_s)))

            critic_variables = self.critic.trainable_variables
            critic_gradients = tape.gradient(critic_loss, critic_variables)
            self.critic_optimizer.apply_gradients(
                zip(critic_gradients, critic_variables))

            total_c_loss += critic_loss.numpy()

        self.buffer.delete()
        return [['Loss/Critic', total_c_loss]]
Example #18
0
class SACv2_AE:
    def __init__(self, obs_dim, action_dim, args):

        self.buffer = Buffer(args.buffer_size)

        self.obs_dim = obs_dim
        self.action_dim = action_dim
        self.image_size = args.image_size
        self.current_step = 0

        self.log_alpha = tf.Variable(initial_value=tf.math.log(args.alpha),
                                     trainable=True)
        self.target_entropy = -action_dim
        self.gamma = args.gamma

        self.batch_size = args.batch_size
        self.feature_dim = args.feature_dim

        self.layer_num = args.layer_num
        self.filter_num = args.filter_num
        self.tau = args.tau
        self.encoder_tau = args.encoder_tau
        self.actor_update = args.actor_update
        self.critic_update = args.critic_update
        self.decoder_update = args.decoder_update
        self.decoder_latent_lambda = args.decoder_latent_lambda
        self.decoder_weight_lambda = args.decoder_weight_lambda

        self.training_start = args.training_start
        self.training_step = args.training_step
        self.train_alpha = args.train_alpha

        self.actor = Squashed_Gaussian_Actor(self.feature_dim, self.action_dim,
                                             args.hidden_dim, args.log_std_min,
                                             args.log_std_max)
        self.critic1 = Q_network(self.feature_dim, self.action_dim,
                                 args.hidden_dim)
        self.critic2 = Q_network(self.feature_dim, self.action_dim,
                                 args.hidden_dim)
        self.target_critic1 = Q_network(self.feature_dim, self.action_dim,
                                        args.hidden_dim)
        self.target_critic2 = Q_network(self.feature_dim, self.action_dim,
                                        args.hidden_dim)

        self.encoder = PixelEncoder(self.obs_dim, self.feature_dim,
                                    self.layer_num, self.filter_num)
        self.target_encoder = PixelEncoder(self.obs_dim, self.feature_dim,
                                           self.layer_num, self.filter_num)
        self.decoder = PixelDecoder(self.obs_dim, self.feature_dim,
                                    self.layer_num, self.filter_num)

        copy_weight(self.critic1, self.target_critic1)
        copy_weight(self.critic2, self.target_critic2)
        copy_weight(self.encoder, self.target_encoder)

        self.actor_optimizer = tf.keras.optimizers.Adam(args.actor_lr)
        self.critic1_optimizer = tf.keras.optimizers.Adam(args.critic_lr)
        self.critic2_optimizer = tf.keras.optimizers.Adam(args.critic_lr)

        self.encoder_optimizer = tf.keras.optimizers.Adam(args.encoder_lr)
        self.decoder_optimizer = tfa.optimizers.AdamW(
            weight_decay=self.decoder_weight_lambda,
            learning_rate=args.decoder_lr)

        self.log_alpha_optimizer = tf.keras.optimizers.Adam(args.alpha_lr,
                                                            beta_1=0.5)

        self.network_list = {
            'Actor': self.actor,
            'Critic1': self.critic1,
            'Critic2': self.critic2,
            'Target_Critic1': self.target_critic1,
            'Target_Critic2': self.target_critic2,
            'Encoder': self.encoder,
            'Target_Encoder': self.target_encoder,
            'Decoder': self.decoder
        }
        self.name = 'SACv2_AE'

    @property
    def alpha(self):
        return tf.exp(self.log_alpha)

    def get_action(self, obs):

        obs = np.expand_dims(np.array(obs), axis=0)
        feature = self.encoder(obs)
        action, _ = self.actor(feature)
        action = action.numpy()[0]

        return action

    def eval_action(self, obs):

        obs = np.expand_dims(np.array(obs), axis=0)
        feature = self.encoder(obs)
        action, _ = self.actor(feature, deterministic=True)
        action = action.numpy()[0]

        return action

    def train(self, local_step):
        self.current_step += 1

        total_a_loss = 0
        total_c1_loss, total_c2_loss = 0, 0
        total_alpha_loss = 0
        total_ae_loss = 0
        loss_list = []

        s, a, r, ns, d = self.buffer.sample(self.batch_size)

        ns_action, ns_logpi = self.actor(self.encoder(ns))

        target_min_aq = tf.minimum(
            self.target_critic1(self.target_encoder(ns), ns_action),
            self.target_critic2(self.target_encoder(ns), ns_action))

        target_q = tf.stop_gradient(
            r + self.gamma * (1 - d) *
            (target_min_aq - self.alpha.numpy() * ns_logpi))
        #critic update
        with tf.GradientTape(persistent=True) as tape1:
            critic1_loss = tf.reduce_mean(
                tf.square(self.critic1(self.encoder(s), a) - target_q))
            critic2_loss = tf.reduce_mean(
                tf.square(self.critic2(self.encoder(s), a) - target_q))

        critic1_gradients = tape1.gradient(
            critic1_loss, self.encoder.trainable_variables +
            self.critic1.trainable_variables)
        self.critic1_optimizer.apply_gradients(
            zip(
                critic1_gradients, self.encoder.trainable_variables +
                self.critic1.trainable_variables))

        critic2_gradients = tape1.gradient(
            critic2_loss, self.encoder.trainable_variables +
            self.critic2.trainable_variables)
        self.critic2_optimizer.apply_gradients(
            zip(
                critic2_gradients, self.encoder.trainable_variables +
                self.critic2.trainable_variables))

        del tape1

        #actor update
        if self.current_step % self.actor_update == 0:
            with tf.GradientTape() as tape2:

                s_action, s_logpi = self.actor(
                    tf.stop_gradient(self.encoder(s)))

                min_aq_rep = tf.minimum(
                    self.critic1(tf.stop_gradient(self.encoder(s)), s_action),
                    self.critic2(tf.stop_gradient(self.encoder(s)), s_action))

                actor_loss = tf.reduce_mean(self.alpha.numpy() * s_logpi -
                                            min_aq_rep)

            actor_gradients = tape2.gradient(actor_loss,
                                             self.actor.trainable_variables)
            self.actor_optimizer.apply_gradients(
                zip(actor_gradients, self.actor.trainable_variables))

            del tape2
            #alpha update
            if self.train_alpha == True:
                with tf.GradientTape() as tape3:
                    _, s_logpi = self.actor(self.encoder(s))
                    alpha_loss = -(
                        tf.exp(self.log_alpha) *
                        tf.stop_gradient(s_logpi + self.target_entropy))
                    alpha_loss = tf.nn.compute_average_loss(alpha_loss)

                log_alpha_gradients = tape3.gradient(alpha_loss,
                                                     [self.log_alpha])
                self.log_alpha_optimizer.apply_gradients(
                    zip(log_alpha_gradients, [self.log_alpha]))

                del tape3

        if self.current_step % self.decoder_update == 0:
            #encoder, decoder update
            with tf.GradientTape(persistent=True) as tape4:
                feature = self.encoder(s)
                recovered_s = self.decoder(feature)
                real_s = preprocess_obs(s)

                rec_loss = tf.reduce_mean(tf.square(recovered_s - real_s))
                latent_loss = tf.reduce_mean(
                    0.5 * tf.reduce_sum(tf.square(feature), axis=1))

                ae_loss = rec_loss + self.decoder_latent_lambda * latent_loss

            encoder_gradients = tape4.gradient(
                ae_loss, self.encoder.trainable_variables)
            decoder_gradients = tape4.gradient(
                ae_loss, self.decoder.trainable_variables)

            self.encoder_optimizer.apply_gradients(
                zip(encoder_gradients, self.encoder.trainable_variables))
            self.decoder_optimizer.apply_gradients(
                zip(decoder_gradients, self.decoder.trainable_variables))

        if self.current_step % self.critic_update == 0:

            soft_update(self.critic1, self.target_critic1, self.tau)
            soft_update(self.critic2, self.target_critic2, self.tau)
            soft_update(self.encoder, self.target_encoder, self.encoder_tau)

        del tape4

        total_c1_loss += critic1_loss.numpy()
        total_c2_loss += critic2_loss.numpy()

        loss_list.append(['Loss/Critic1', total_c1_loss])
        loss_list.append(['Loss/Critic2', total_c2_loss])

        if self.current_step % self.decoder_update == 0:
            total_ae_loss += ae_loss.numpy()
            loss_list.append(['Loss/AutoEncoder', total_ae_loss])

        if self.current_step % self.actor_update == 0:
            total_a_loss += actor_loss.numpy()
            loss_list.append(['Loss/Actor', total_a_loss])
            if self.train_alpha == True:
                total_alpha_loss += alpha_loss.numpy()
                loss_list.append(['Loss/Alpha', total_alpha_loss])

        loss_list.append(['Alpha', tf.exp(self.log_alpha).numpy()])

        return loss_list
Example #19
0
class DBC_TD3:
    def __init__(self,
                 obs_dim,
                 action_dim,
                 hidden_dim=512,
                 gamma=0.99,
                 learning_rate=0.001,
                 batch_size=512,
                 policy_delay=2,
                 actor_noise=0.1,
                 target_noise=0.2,
                 noise_clip=0.5,
                 buffer_size=1e6,
                 feature_dim=50,
                 layer_num=4,
                 filter_num=32,
                 tau=0.005,
                 encoder_tau=0.005,
                 bisim_coef=0.5,
                 training_start=1000):

        self.buffer = Buffer(buffer_size)

        self.obs_dim = obs_dim
        self.action_dim = action_dim

        self.hidden_dim = hidden_dim
        self.gamma = gamma
        self.learning_rate = learning_rate

        self.batch_size = batch_size
        self.feature_dim = feature_dim

        self.layer_num = layer_num
        self.filter_num = filter_num
        self.tau = tau
        self.encoder_tau = encoder_tau
        self.bisim_coef = bisim_coef

        self.policy_delay = policy_delay
        self.actor_noise = actor_noise
        self.target_noise = target_noise
        self.noise_clip = noise_clip

        self.training_start = training_start

        self.actor = Policy_network(feature_dim, action_dim,
                                    (hidden_dim, hidden_dim))
        self.target_actor = Policy_network(feature_dim, action_dim,
                                           (hidden_dim, hidden_dim))
        self.critic1 = Q_network(feature_dim, action_dim,
                                 (hidden_dim, hidden_dim))
        self.critic2 = Q_network(feature_dim, action_dim,
                                 (hidden_dim, hidden_dim))
        self.target_critic1 = Q_network(feature_dim, action_dim,
                                        (hidden_dim, hidden_dim))
        self.target_critic2 = Q_network(feature_dim, action_dim,
                                        (hidden_dim, hidden_dim))

        self.encoder = PixelEncoder(self.obs_dim, feature_dim, layer_num,
                                    filter_num)
        self.target_encoder = PixelEncoder(self.obs_dim, feature_dim,
                                           layer_num, filter_num)

        self.dynamics_model = Transition_Network(feature_dim, action_dim)
        self.reward_model = Reward_Network(feature_dim)

        copy_weight(self.actor, self.target_actor)
        copy_weight(self.critic1, self.target_critic1)
        copy_weight(self.critic2, self.target_critic2)
        copy_weight(self.encoder, self.target_encoder)

        self.actor_optimizer = tf.keras.optimizers.Adam(learning_rate)
        self.critic1_optimizer = tf.keras.optimizers.Adam(learning_rate)
        self.critic2_optimizer = tf.keras.optimizers.Adam(learning_rate)

        self.encoder_optimizer = tf.keras.optimizers.Adam(learning_rate)
        self.dynamics_optimizer = tf.keras.optimizers.Adam(learning_rate)
        self.reward_optimizer = tf.keras.optimizers.Adam(learning_rate)

        self.name = 'DBC_TD3'

    def get_action(self, obs):
        obs = np.expand_dims(np.array(obs), axis=0)
        feature = self.encoder(obs)
        action = self.actor(feature).numpy()[0]

        return action

    def train(self,
              local_step):  #critic -> transition -> reward -> encoder -> actor
        set1, set2 = self.buffer.dbc_sample(self.batch_size)

        s, a, r, ns, d = set1
        s2, a2, r2, ns2, d2 = set2

        target_action = tf.clip_by_value(
            self.target_actor(self.target_encoder(ns)) + tf.clip_by_value(
                tf.random.normal(shape=self.target_actor(
                    self.target_encoder(ns)).shape,
                                 mean=0,
                                 stddev=self.target_noise), -self.noise_clip,
                self.noise_clip), -1, 1)

        target_value = tf.stop_gradient(r + self.gamma * (1 - d) * tf.minimum(
            self.target_critic1(self.target_encoder(ns), target_action),
            self.target_critic2(self.target_encoder(ns), target_action)))

        with tf.GradientTape(persistent=True) as tape1:
            critic1_loss = 0.5 * tf.reduce_mean(
                tf.square(target_value - self.critic1(self.encoder(s), a)))
            critic2_loss = 0.5 * tf.reduce_mean(
                tf.square(target_value - self.critic2(self.encoder(s), a)))

        critic1_grad = tape1.gradient(
            critic1_loss, self.encoder.trainable_variables +
            self.critic1.trainable_variables)
        self.critic1_optimizer.apply_gradients(
            zip(
                critic1_grad, self.encoder.trainable_variables +
                self.critic1.trainable_variables))

        critic2_grad = tape1.gradient(
            critic2_loss, self.encoder.trainable_variables +
            self.critic2.trainable_variables)
        self.critic2_optimizer.apply_gradients(
            zip(
                critic2_grad, self.encoder.trainable_variables +
                self.critic2.trainable_variables))

        del tape1

        #train dynamics
        with tf.GradientTape() as tape2:
            feature = self.encoder(s)
            next_feature = self.encoder(ns)
            mu, sigma = self.dynamics_model(tf.concat([feature, a], axis=1))

            if (sigma[0][0].numpy() == 0):
                sigma = tf.ones_like(mu)
            diff = (mu - tf.stop_gradient(next_feature)) / sigma
            dynamics_loss = tf.reduce_mean(0.5 * tf.square(diff) +
                                           tf.math.log(sigma))

        dynamics_gradients = tape2.gradient(
            dynamics_loss, self.encoder.trainable_variables +
            self.dynamics_model.trainable_variables)
        self.dynamics_optimizer.apply_gradients(
            zip(
                dynamics_gradients, self.encoder.trainable_variables +
                self.dynamics_model.trainable_variables))

        #dynamics_gradients = tape2.gradient(dynamics_loss, self.dynamics_model.trainable_variables)
        #self.dynamics_optimizer.apply_gradients(zip(dynamics_gradients, self.dynamics_model.trainable_variables))

        del tape2

        #train reward
        with tf.GradientTape() as tape3:
            feature = self.encoder(s)
            sample_dynamics = self.dynamics_model.sample(
                tf.concat([feature, a], axis=1))
            reward_prediction = self.reward_model(sample_dynamics)

            reward_loss = tf.reduce_mean(tf.square(reward_prediction - (r)))

        reward_gradients = tape3.gradient(
            reward_loss, self.encoder.trainable_variables +
            self.reward_model.trainable_variables)
        self.reward_optimizer.apply_gradients(
            zip(
                reward_gradients, self.encoder.trainable_variables +
                self.reward_model.trainable_variables))

        #reward_gradients = tape3.gradient(reward_loss, self.reward_model.trainable_variables)
        #self.reward_optimizer.apply_gradients(zip(reward_gradients, self.reward_model.trainable_variables))

        del tape3

        #train encoder
        with tf.GradientTape() as tape4:
            feature1 = self.encoder(s)
            feature2 = self.encoder(s2)

            mu1, sigma1 = self.dynamics_model(tf.concat([feature1, a], axis=1))
            mu2, sigma2 = self.dynamics_model(tf.concat([feature2, a2],
                                                        axis=1))

            z_dist = tf.abs(feature1 - feature2)
            r_dist = tf.abs(r - r2)

            transition_dist = tf.sqrt(
                tf.square(tf.abs(mu1 - mu2)) +
                tf.square(tf.abs(sigma1 - sigma2)))
            bisimilarity = tf.stop_gradient(
                tf.cast(r_dist, tf.float32) +
                self.gamma * tf.cast(transition_dist, tf.float32))
            encoder_loss = self.bisim_coef * tf.reduce_mean(
                tf.square(z_dist - bisimilarity))

        encoder_gradients = tape4.gradient(encoder_loss,
                                           self.encoder.trainable_variables)
        self.encoder_optimizer.apply_gradients(
            zip(encoder_gradients, self.encoder.trainable_variables))

        del tape4

        if local_step % (self.policy_delay) == 0:
            with tf.GradientTape() as tape5:
                actor_loss = -tf.reduce_mean(
                    self.critic1(tf.stop_gradient(self.encoder(s)),
                                 self.actor(tf.stop_gradient(
                                     self.encoder(s)))))

            actor_grad = tape5.gradient(actor_loss,
                                        self.actor.trainable_variables)
            self.actor_optimizer.apply_gradients(
                zip(actor_grad, self.actor.trainable_variables))

            del tape5

            soft_update(self.actor, self.target_actor, self.tau)
            soft_update(self.critic1, self.target_critic1, self.tau)
            soft_update(self.critic2, self.target_critic2, self.tau)
            soft_update(self.encoder, self.target_encoder, self.encoder_tau)
Example #20
0
    def __init__(self, obs_dim, action_dim, args):

        self.buffer = Buffer(args.buffer_size)

        self.obs_dim = obs_dim
        self.action_dim = action_dim

        self.log_alpha = tf.Variable(initial_value=tf.math.log(args.alpha),
                                     trainable=True)
        self.target_entropy = -action_dim
        self.gamma = args.gamma

        self.batch_size = args.batch_size
        self.feature_dim = args.feature_dim

        self.layer_num = args.layer_num
        self.filter_num = args.filter_num
        self.tau = args.tau
        self.encoder_tau = args.encoder_tau

        self.actor_update = args.actor_update
        self.critic_update = args.critic_update

        self.training_start = args.training_start
        self.training_step = args.training_step
        self.train_alpha = args.train_alpha

        self.actor = Squashed_Gaussian_Actor(self.feature_dim, self.action_dim,
                                             args.hidden_dim, args.log_std_min,
                                             args.log_std_max)
        self.critic1 = Q_network(self.feature_dim, self.action_dim,
                                 args.hidden_dim)
        self.critic2 = Q_network(self.feature_dim, self.action_dim,
                                 args.hidden_dim)
        self.target_critic1 = Q_network(self.feature_dim, self.action_dim,
                                        args.hidden_dim)
        self.target_critic2 = Q_network(self.feature_dim, self.action_dim,
                                        args.hidden_dim)

        self.encoder = PixelEncoder(self.obs_dim, self.feature_dim,
                                    self.layer_num, self.filter_num)
        self.target_encoder = PixelEncoder(self.obs_dim, self.feature_dim,
                                           self.layer_num, self.filter_num)

        self.dynamics_model = Transition_Network(self.feature_dim,
                                                 action_dim,
                                                 deterministic=False)
        self.reward_model = Reward_Network(self.feature_dim)

        copy_weight(self.critic1, self.target_critic1)
        copy_weight(self.critic2, self.target_critic2)
        copy_weight(self.encoder, self.target_encoder)

        self.actor_optimizer = tf.keras.optimizers.Adam(args.actor_lr)
        self.critic1_optimizer = tf.keras.optimizers.Adam(args.critic_lr)
        self.critic2_optimizer = tf.keras.optimizers.Adam(args.critic_lr)

        self.encoder_optimizer = tf.keras.optimizers.Adam(args.encoder_lr)
        self.log_alpha_optimizer = tf.keras.optimizers.Adam(args.alpha_lr)

        self.dynamics_optimizer = tf.keras.optimizers.Adam(args.decoder_lr)
        self.reward_optimizer = tf.keras.optimizers.Adam(args.decoder_lr)

        self.current_step = 0

        self.network_list = {
            'Actor': self.actor,
            'Critic1': self.critic1,
            'Critic2': self.critic2,
            'Target_Critic1': self.target_critic1,
            'Target_Critic2': self.target_critic2,
            'Encoder': self.encoder,
            'Target_Encoder': self.target_encoder,
            'Dynamics': self.dynamics_model,
            'Reward': self.reward_model
        }

        self.name = 'DBC_SACv2'
Example #21
0
class DBC_SACv2:
    def __init__(self, obs_dim, action_dim, args):

        self.buffer = Buffer(args.buffer_size)

        self.obs_dim = obs_dim
        self.action_dim = action_dim

        self.log_alpha = tf.Variable(initial_value=tf.math.log(args.alpha),
                                     trainable=True)
        self.target_entropy = -action_dim
        self.gamma = args.gamma

        self.batch_size = args.batch_size
        self.feature_dim = args.feature_dim

        self.layer_num = args.layer_num
        self.filter_num = args.filter_num
        self.tau = args.tau
        self.encoder_tau = args.encoder_tau

        self.actor_update = args.actor_update
        self.critic_update = args.critic_update

        self.training_start = args.training_start
        self.training_step = args.training_step
        self.train_alpha = args.train_alpha

        self.actor = Squashed_Gaussian_Actor(self.feature_dim, self.action_dim,
                                             args.hidden_dim, args.log_std_min,
                                             args.log_std_max)
        self.critic1 = Q_network(self.feature_dim, self.action_dim,
                                 args.hidden_dim)
        self.critic2 = Q_network(self.feature_dim, self.action_dim,
                                 args.hidden_dim)
        self.target_critic1 = Q_network(self.feature_dim, self.action_dim,
                                        args.hidden_dim)
        self.target_critic2 = Q_network(self.feature_dim, self.action_dim,
                                        args.hidden_dim)

        self.encoder = PixelEncoder(self.obs_dim, self.feature_dim,
                                    self.layer_num, self.filter_num)
        self.target_encoder = PixelEncoder(self.obs_dim, self.feature_dim,
                                           self.layer_num, self.filter_num)

        self.dynamics_model = Transition_Network(self.feature_dim,
                                                 action_dim,
                                                 deterministic=False)
        self.reward_model = Reward_Network(self.feature_dim)

        copy_weight(self.critic1, self.target_critic1)
        copy_weight(self.critic2, self.target_critic2)
        copy_weight(self.encoder, self.target_encoder)

        self.actor_optimizer = tf.keras.optimizers.Adam(args.actor_lr)
        self.critic1_optimizer = tf.keras.optimizers.Adam(args.critic_lr)
        self.critic2_optimizer = tf.keras.optimizers.Adam(args.critic_lr)

        self.encoder_optimizer = tf.keras.optimizers.Adam(args.encoder_lr)
        self.log_alpha_optimizer = tf.keras.optimizers.Adam(args.alpha_lr)

        self.dynamics_optimizer = tf.keras.optimizers.Adam(args.decoder_lr)
        self.reward_optimizer = tf.keras.optimizers.Adam(args.decoder_lr)

        self.current_step = 0

        self.network_list = {
            'Actor': self.actor,
            'Critic1': self.critic1,
            'Critic2': self.critic2,
            'Target_Critic1': self.target_critic1,
            'Target_Critic2': self.target_critic2,
            'Encoder': self.encoder,
            'Target_Encoder': self.target_encoder,
            'Dynamics': self.dynamics_model,
            'Reward': self.reward_model
        }

        self.name = 'DBC_SACv2'

    @property
    def alpha(self):
        return tf.exp(self.log_alpha)

    def get_action(self, obs):
        obs = np.expand_dims(np.array(obs), axis=0)
        feature = self.encoder(obs)
        action, _ = self.actor(feature)
        action = action.numpy()[0]

        return action

    def eval_action(self, obs):
        obs = np.expand_dims(np.array(obs), axis=0)
        feature = self.encoder(obs)
        action, _ = self.actor(feature, deterministic=True)
        action = action.numpy()[0]

        return action

    def train(self, local_step):
        self.current_step += 1
        total_a_loss = 0
        total_c1_loss, total_c2_loss = 0, 0
        total_alpha_loss = 0
        total_encoder_loss = 0
        total_dynamics_loss = 0
        total_reward_loss = 0
        loss_list = []
        s, a, r, ns, d = self.buffer.sample(self.batch_size)

        ns_action, ns_logpi = self.actor(self.encoder(ns))

        target_min_aq = tf.minimum(
            self.target_critic1(self.target_encoder(ns), ns_action),
            self.target_critic2(self.target_encoder(ns), ns_action))

        target_q = tf.stop_gradient(
            r + self.gamma * (1 - d) *
            (target_min_aq - self.alpha.numpy() * ns_logpi))

        with tf.GradientTape(persistent=True) as tape1:
            critic1_loss = tf.reduce_mean(
                tf.square(self.critic1(self.encoder(s), a) - target_q))
            critic2_loss = tf.reduce_mean(
                tf.square(self.critic2(self.encoder(s), a) - target_q))

        critic1_gradients = tape1.gradient(
            critic1_loss, self.encoder.trainable_variables +
            self.critic1.trainable_variables)
        self.critic1_optimizer.apply_gradients(
            zip(
                critic1_gradients, self.encoder.trainable_variables +
                self.critic1.trainable_variables))

        critic2_gradients = tape1.gradient(
            critic2_loss, self.encoder.trainable_variables +
            self.critic2.trainable_variables)
        self.critic2_optimizer.apply_gradients(
            zip(
                critic2_gradients, self.encoder.trainable_variables +
                self.critic2.trainable_variables))

        del tape1

        if self.current_step % self.actor_update == 0:
            with tf.GradientTape() as tape2:

                s_action, s_logpi = self.actor(
                    tf.stop_gradient(self.encoder(s)))

                min_aq_rep = tf.minimum(
                    self.critic1(tf.stop_gradient(self.encoder(s)), s_action),
                    self.critic2(tf.stop_gradient(self.encoder(s)), s_action))

                actor_loss = tf.reduce_mean(self.alpha.numpy() * s_logpi -
                                            min_aq_rep)

            actor_gradients = tape2.gradient(actor_loss,
                                             self.actor.trainable_variables)
            self.actor_optimizer.apply_gradients(
                zip(actor_gradients, self.actor.trainable_variables))

            del tape2

            if self.train_alpha == True:
                with tf.GradientTape() as tape3:
                    _, s_logpi = self.actor(self.encoder(s))
                    alpha_loss = -(
                        tf.exp(self.log_alpha) *
                        tf.stop_gradient(s_logpi + self.target_entropy))
                    alpha_loss = tf.nn.compute_average_loss(alpha_loss)
                    #alpha_loss = tf.reduce_mean(alpha_loss)

                log_alpha_gradients = tape3.gradient(alpha_loss,
                                                     [self.log_alpha])
                self.log_alpha_optimizer.apply_gradients(
                    zip(log_alpha_gradients, [self.log_alpha]))

                del tape3

        if self.current_step % self.critic_update == 0:
            soft_update(self.critic1, self.target_critic1, self.tau)
            soft_update(self.critic2, self.target_critic2, self.tau)
            soft_update(self.encoder, self.target_encoder, self.encoder_tau)

        #train encoder
        with tf.GradientTape() as tape4:
            new_ids = np.arange(len(s))
            np.random.shuffle(new_ids)
            s2 = tf.gather(s, new_ids)

            feature = self.encoder(s)
            #feature2 = tf.gather(feature, new_ids)
            feature2 = self.encoder(s2)

            reward = self.reward_model(tf.stop_gradient(feature))
            #reward2 = tf.gather(reward, new_ids)
            reward2 = self.reward_model(tf.stop_gradient(feature2))

            feature_action, _ = self.actor(tf.stop_gradient(feature), True)
            feature2_action, _ = self.actor(tf.stop_gradient(feature2), True)

            mu, sigma = self.dynamics_model(tf.stop_gradient(feature),
                                            feature_action)
            mu2, sigma2 = self.dynamics_model(tf.stop_gradient(feature2),
                                              feature2_action)

            z_dist = tf.reshape(tf.keras.losses.huber(feature, feature2),
                                shape=[-1, 1])
            r_dist = tf.reshape(tf.keras.losses.huber(reward, reward2),
                                shape=[-1, 1])
            transition_dist = tf.sqrt(
                tf.square(mu - mu2) + tf.square(sigma - sigma2))

            bisimilarity = r_dist + self.gamma * transition_dist
            encoder_loss = tf.reduce_mean(tf.square(z_dist - bisimilarity))

        encoder_gradients = tape4.gradient(encoder_loss,
                                           self.encoder.trainable_variables)
        self.encoder_optimizer.apply_gradients(
            zip(encoder_gradients, self.encoder.trainable_variables))

        #train dynamics
        with tf.GradientTape() as tape5:
            feature = self.encoder(s)
            mu, sigma = self.dynamics_model(feature, a)

            if (sigma[0][0].numpy() == 0):
                if self.dynamics_model.deterministic == False:
                    print("error")
                sigma = tf.ones_like(mu)

            next_feature = self.encoder(ns)
            diff = (mu - tf.stop_gradient(next_feature)) / sigma

            dynamics_loss = tf.reduce_mean(0.5 * tf.square(diff) +
                                           tf.math.log(sigma))

        dynamics_gradients = tape5.gradient(
            dynamics_loss, self.encoder.trainable_variables +
            self.dynamics_model.trainable_variables)
        self.dynamics_optimizer.apply_gradients(
            zip(
                dynamics_gradients, self.encoder.trainable_variables +
                self.dynamics_model.trainable_variables))

        #train reward
        with tf.GradientTape() as tape6:
            feature = self.encoder(s)
            sample_dynamics = self.dynamics_model.sample(feature, a)
            reward_prediction = self.reward_model(sample_dynamics)

            reward_loss = tf.reduce_mean(tf.square(reward_prediction - r))

        reward_gradients = tape6.gradient(
            reward_loss, self.encoder.trainable_variables +
            self.reward_model.trainable_variables)
        self.reward_optimizer.apply_gradients(
            zip(
                reward_gradients, self.encoder.trainable_variables +
                self.reward_model.trainable_variables))

        total_c1_loss += critic1_loss.numpy()
        total_c2_loss += critic2_loss.numpy()

        loss_list.append(['Loss/Critic1', total_c1_loss])
        loss_list.append(['Loss/Critic2', total_c2_loss])

        if self.current_step % self.actor_update == 0:
            total_a_loss += actor_loss.numpy()
            loss_list.append(['Loss/Actor', total_a_loss])

        total_encoder_loss += encoder_loss.numpy()
        loss_list.append(['Loss/Encoder', total_encoder_loss])

        total_dynamics_loss += dynamics_loss.numpy()
        loss_list.append(['Loss/Dynamics', total_dynamics_loss])

        total_reward_loss += reward_loss.numpy()
        loss_list.append(['Loss/Reward', total_reward_loss])

        if self.current_step % self.actor_update == 0 and self.train_alpha == True:
            total_alpha_loss += alpha_loss.numpy()
            loss_list.append(['Loss/Alpha', total_alpha_loss])

        loss_list.append(['Alpha', tf.exp(self.log_alpha).numpy()])

        return loss_list
Example #22
0
class DDQN:
    def __init__(self, state_dim, action_dim, args):

        self.buffer = Buffer(args.buffer_size)

        self.optimizer = tf.keras.optimizers.Adam(args.learning_rate)

        self.state_dim = state_dim
        self.action_dim = action_dim

        self.batch_size = args.batch_size
        self.gamma = args.gamma
        self.lr = args.learning_rate
        self.epsilon = args.epsilon
        self.training_start = args.training_start
        self.training_step = args.training_step
        self.current_step = 0
        self.copy_iter = args.copy_iter

        self.network = Policy_network(self.state_dim, self.action_dim,
                                      args.hidden_dim)
        self.target_network = Policy_network(self.state_dim, self.action_dim,
                                             args.hidden_dim)

        copy_weight(self.network, self.target_network)

        self.network_list = {
            'Network': self.network,
            'Target_Network': self.target_network
        }
        self.name = 'Double DQN'

    def get_action(self, state):
        state = np.expand_dims(np.array(state), axis=0)

        q_value = self.network(state, activation='linear').numpy()
        best_action = np.argmax(q_value, axis=1)[0]

        if np.random.random() < self.epsilon:
            return np.random.randint(low=0, high=self.action_dim)
        else:
            return best_action

    def eval_action(self, state):
        state = np.expand_dims(np.array(state), axis=0)

        q_value = self.network(state, activation='linear').numpy()
        best_action = np.argmax(q_value, axis=1)[0]

        return best_action

    def train(self, training_num):
        total_loss = 0

        for i in range(training_num):
            self.current_step += 1
            s, a, r, ns, d = self.buffer.sample(self.batch_size)

            q_value = tf.expand_dims(tf.argmax(self.network(
                ns, activation='linear'),
                                               axis=1,
                                               output_type=tf.int32),
                                     axis=1)
            q_value_one = tf.squeeze(tf.one_hot(q_value,
                                                depth=self.action_dim),
                                     axis=1)

            target_value = r + self.gamma * (1 - d) * tf.reduce_sum(
                self.target_network(ns, activation='linear') * q_value_one,
                axis=1,
                keepdims=True)
            target_value = tf.stop_gradient(target_value)

            with tf.GradientTape() as tape:
                selected_values = tf.reduce_sum(
                    self.network(s, activation='linear') * tf.squeeze(
                        tf.one_hot(tf.cast(a, tf.int32), self.action_dim),
                        axis=1),
                    axis=1,
                    keepdims=True)
                loss = 0.5 * tf.math.reduce_mean(
                    tf.square(target_value - selected_values))

            variables = self.network.trainable_variables
            gradients = tape.gradient(loss, variables)
            self.optimizer.apply_gradients(zip(gradients, variables))

            if self.current_step % self.copy_iter == 0:
                copy_weight(self.network, self.target_network)

            total_loss += loss.numpy()

        return [['Loss/Loss', total_loss]]
Example #23
0
class ImageDQN:
    def __init__(self, obs_dim, action_dim, args):

        self.buffer = Buffer(args.buffer_size)
        self.optimizer = tf.keras.optimizers.Adam(args.learning_rate)

        self.obs_dim = obs_dim
        self.action_dim = action_dim

        self.feature_dim = args.feature_dim

        self.batch_size = args.batch_size
        self.gamma = args.gamma
        self.learning_rate = args.learning_rate
        self.epsilon = args.epsilon
        self.training_start = args.training_start
        self.training_step = args.training_step
        self.current_step = 0
        self.copy_iter = args.copy_iter

        self.layer_num = args.layer_num
        self.filter_num = args.filter_num

        self.network = Policy_network(self.feature_dim, self.action_dim,
                                      args.hidden_dim)
        self.target_network = Policy_network(self.feature_dim, self.action_dim,
                                             args.hidden_dim)

        self.encoder = PixelEncoder(self.obs_dim, self.feature_dim,
                                    self.layer_num, self.filter_num,
                                    'channels_last')
        self.target_encoder = PixelEncoder(self.obs_dim, self.feature_dim,
                                           self.layer_num, self.filter_num,
                                           'channels_last')

        copy_weight(self.network, self.target_network)
        copy_weight(self.encoder, self.target_encoder)

        self.network_list = {
            'Network': self.network,
            'Target_Network': self.target_network
        }
        self.name = 'ImageDQN'

    def get_action(self, obs):
        if np.random.random() < self.epsilon:
            return np.random.randint(low=0, high=self.action_dim)
        else:
            obs = np.expand_dims(np.array(obs), axis=0)
            feature = self.encoder(obs)
            q_value = self.network(feature, activation='linear').numpy()
            best_action = np.argmax(q_value, axis=1)[0]
            return best_action

    def eval_action(self, obs):
        obs = np.expand_dims(np.array(obs), axis=0)
        feature = self.encoder(obs)
        q_value = self.network(feature, activation='linear').numpy()
        best_action = np.argmax(q_value, axis=1)[0]

        return best_action

    def train(self, training_num):
        total_loss = 0
        for i in range(training_num):
            self.current_step += 1
            s, a, r, ns, d = self.buffer.sample(self.batch_size)

            target_q = tf.reduce_max(self.target_network(
                self.target_encoder(ns), activation='linear'),
                                     axis=1,
                                     keepdims=True)
            target_value = r + self.gamma * (1 - d) * target_q
            target_value = tf.stop_gradient(target_value)

            with tf.GradientTape() as tape:
                selected_values = tf.reduce_sum(
                    self.network(self.encoder(s), activation='linear') *
                    tf.squeeze(tf.one_hot(tf.cast(a, tf.int32),
                                          self.action_dim),
                               axis=1),
                    axis=1,
                    keepdims=True)
                loss = 0.5 * tf.reduce_mean(
                    tf.square(target_value - selected_values))

            gradients = tape.gradient(
                loss, self.encoder.trainable_variables +
                self.network.trainable_variables)

            self.optimizer.apply_gradients(
                zip(
                    gradients, self.encoder.trainable_variables +
                    self.network.trainable_variables))

            if self.current_step % self.copy_iter == 0:
                copy_weight(self.network, self.target_network)
                copy_weight(self.encoder, self.target_encoder)

            total_loss += loss.numpy()

            del tape

        return [['Loss/Loss', total_loss]]
Example #24
0
class RAD_SACv2:
    def __init__(self, obs_dim, action_dim, args):

        self.buffer = Buffer(args.buffer_size)

        self.obs_dim = obs_dim
        self.action_dim = action_dim
        self.image_size = args.image_size
        self.pre_image_size = args.pre_image_size
        self.current_step = 0

        self.log_alpha = tf.Variable(initial_value=tf.math.log(args.alpha), trainable=True)
        self.target_entropy = -action_dim
        self.gamma = args.gamma

        self.batch_size = args.batch_size
        self.feature_dim = args.feature_dim

        self.layer_num = args.layer_num
        self.filter_num = args.filter_num
        self.tau = args.tau
        self.encoder_tau = args.encoder_tau
        self.critic_update = args.critic_update

        self.training_start = args.training_start
        self.training_step = args.training_step
        self.train_alpha = args.train_alpha

        self.actor = Squashed_Gaussian_Actor(self.feature_dim, self.action_dim, args.hidden_dim, args.log_std_min, args.log_std_max)
        self.critic1 = Q_network(self.feature_dim, self.action_dim, args.hidden_dim)
        self.critic2 = Q_network(self.feature_dim, self.action_dim, args.hidden_dim)
        self.target_critic1 = Q_network(self.feature_dim, self.action_dim, args.hidden_dim)
        self.target_critic2 = Q_network(self.feature_dim, self.action_dim, args.hidden_dim)

        self.encoder = PixelEncoder(self.obs_dim, self.feature_dim, self.layer_num, self.filter_num)
        self.target_encoder = PixelEncoder(self.obs_dim, self.feature_dim, self.layer_num, self.filter_num)

        copy_weight(self.critic1, self.target_critic1)
        copy_weight(self.critic2, self.target_critic2)
        copy_weight(self.encoder, self.target_encoder)

        self.actor_optimizer = tf.keras.optimizers.Adam(args.actor_lr)
        self.critic1_optimizer = tf.keras.optimizers.Adam(args.critic_lr)
        self.critic2_optimizer = tf.keras.optimizers.Adam(args.critic_lr)

        self.log_alpha_optimizer = tf.keras.optimizers.Adam(args.alpha_lr, beta_1=0.5)

        self.network_list = {'Actor': self.actor, 'Critic1': self.critic1, 'Critic2': self.critic2,
                             'Target_Critic1': self.target_critic1, 'Target_Critic2': self.target_critic2, 'Encoder': self.encoder, 'Target_Encoder': self.target_encoder}



        self.aug_funcs = {}
        self.aug_list = {
            'crop': rad.crop,
            'grayscale': rad.random_grayscale(),
            'cutout': rad.random_cutout(),
            'cutout_color': rad.random_cutout_color(),
            'flip': rad.random_flip(),
            'rotate': rad.random_rotation(),
            'rand_conv': rad.random_convolution(),
            'color_jitter': rad.random_color_jitter(),
            'no_aug': rad.no_aug
        }
        for aug_name in args.data_augs.split('-'):
            assert aug_name in self.aug_list
            self.aug_funcs[aug_name] = self.aug_list[aug_name]

        self.name = 'RAD_SACv2'

    @property
    def alpha(self):
        return tf.exp(self.log_alpha)


    def get_action(self, obs):
        if obs.shape[-1] != self.image_size:
            obs = center_crop_image(obs, self.image_size)
        obs = np.expand_dims(np.array(obs), axis=0)
        feature = self.encoder(obs)
        action, _ = self.actor(feature)
        action = action.numpy()[0]

        return action

    def eval_action(self, obs):

        if obs.shape[-1] != self.image_size:
            obs = center_crop_image(obs, self.image_size)

        obs = np.expand_dims(np.array(obs), axis=0)
        feature = self.encoder(obs)
        action, _ = self.actor(feature, deterministic=True)
        action = action.numpy()[0]

        return action

    def train(self, local_step):
        self.current_step += 1

        total_a_loss = 0
        total_c1_loss, total_c2_loss = 0, 0
        total_alpha_loss = 0
        loss_list = []

        s, a, r, ns, d = self.buffer.rad_sample(self.batch_size, self.aug_funcs, self.pre_image_size)

        ns_action, ns_logpi = self.actor(self.encoder(ns))

        target_min_aq = tf.minimum(self.target_critic1(self.target_encoder(ns), ns_action),
                                   self.target_critic2(self.target_encoder(ns), ns_action))

        target_q = tf.stop_gradient(r + self.gamma * (1 - d) * (
                target_min_aq - self.alpha.numpy() * ns_logpi))

        with tf.GradientTape(persistent=True) as tape1:
            critic1_loss = tf.reduce_mean(tf.square(self.critic1(self.encoder(s), a) - target_q))
            critic2_loss = tf.reduce_mean(tf.square(self.critic2(self.encoder(s), a) - target_q))

        critic1_gradients = tape1.gradient(critic1_loss,
                                           self.encoder.trainable_variables + self.critic1.trainable_variables)
        self.critic1_optimizer.apply_gradients(
            zip(critic1_gradients, self.encoder.trainable_variables + self.critic1.trainable_variables))

        critic2_gradients = tape1.gradient(critic2_loss,
                                           self.encoder.trainable_variables + self.critic2.trainable_variables)
        self.critic2_optimizer.apply_gradients(
            zip(critic2_gradients, self.encoder.trainable_variables + self.critic2.trainable_variables))

        del tape1

        with tf.GradientTape() as tape2:

            s_action, s_logpi = self.actor(tf.stop_gradient(self.encoder(s)))

            min_aq_rep = tf.minimum(self.critic1(tf.stop_gradient(self.encoder(s)), s_action),
                                    self.critic2(tf.stop_gradient(self.encoder(s)), s_action))

            actor_loss = tf.reduce_mean(self.alpha.numpy() * s_logpi - min_aq_rep)

        actor_gradients = tape2.gradient(actor_loss, self.actor.trainable_variables)
        self.actor_optimizer.apply_gradients(zip(actor_gradients, self.actor.trainable_variables))
        del tape2

        if self.train_alpha == True:
            with tf.GradientTape() as tape3:
                _, s_logpi = self.actor(self.encoder(s))
                alpha_loss = -tf.exp(self.log_alpha) * tf.stop_gradient(s_logpi + self.target_entropy)
                alpha_loss = tf.nn.compute_average_loss(alpha_loss)
                #alpha_loss = tf.reduce_mean(alpha_loss)

            log_alpha_gradients = tape3.gradient(alpha_loss, [self.log_alpha])
            self.log_alpha_optimizer.apply_gradients(zip(log_alpha_gradients, [self.log_alpha]))

            del tape3

        if self.current_step % self.critic_update == 0:
            soft_update(self.critic1, self.target_critic1, self.tau)
            soft_update(self.critic2, self.target_critic2, self.tau)
            soft_update(self.encoder, self.target_encoder, self.encoder_tau)


        total_c1_loss += critic1_loss.numpy()
        total_c2_loss += critic2_loss.numpy()

        loss_list.append(['Loss/Critic1', total_c1_loss])
        loss_list.append(['Loss/Critic2', total_c2_loss])

        total_a_loss += actor_loss.numpy()
        loss_list.append(['Loss/Actor', total_a_loss])

        if self.train_alpha == True:
            total_alpha_loss += alpha_loss.numpy()
            loss_list.append(['Loss/Alpha', total_alpha_loss])

        loss_list.append(['Alpha', tf.exp(self.log_alpha).numpy()])

        return loss_list
Example #25
0
    def __init__(self, obs_dim, action_dim, args):

        self.buffer = Buffer(args.buffer_size)

        self.obs_dim = obs_dim
        self.action_dim = action_dim
        self.image_size = args.image_size
        self.pre_image_size = args.pre_image_size
        self.current_step = 0

        self.log_alpha = tf.Variable(initial_value=tf.math.log(args.alpha), trainable=True)
        self.target_entropy = -action_dim
        self.gamma = args.gamma

        self.batch_size = args.batch_size
        self.feature_dim = args.feature_dim

        self.layer_num = args.layer_num
        self.filter_num = args.filter_num
        self.tau = args.tau
        self.encoder_tau = args.encoder_tau
        self.critic_update = args.critic_update

        self.training_start = args.training_start
        self.training_step = args.training_step
        self.train_alpha = args.train_alpha

        self.actor = Squashed_Gaussian_Actor(self.feature_dim, self.action_dim, args.hidden_dim, args.log_std_min, args.log_std_max)
        self.critic1 = Q_network(self.feature_dim, self.action_dim, args.hidden_dim)
        self.critic2 = Q_network(self.feature_dim, self.action_dim, args.hidden_dim)
        self.target_critic1 = Q_network(self.feature_dim, self.action_dim, args.hidden_dim)
        self.target_critic2 = Q_network(self.feature_dim, self.action_dim, args.hidden_dim)

        self.encoder = PixelEncoder(self.obs_dim, self.feature_dim, self.layer_num, self.filter_num)
        self.target_encoder = PixelEncoder(self.obs_dim, self.feature_dim, self.layer_num, self.filter_num)

        copy_weight(self.critic1, self.target_critic1)
        copy_weight(self.critic2, self.target_critic2)
        copy_weight(self.encoder, self.target_encoder)

        self.actor_optimizer = tf.keras.optimizers.Adam(args.actor_lr)
        self.critic1_optimizer = tf.keras.optimizers.Adam(args.critic_lr)
        self.critic2_optimizer = tf.keras.optimizers.Adam(args.critic_lr)

        self.log_alpha_optimizer = tf.keras.optimizers.Adam(args.alpha_lr, beta_1=0.5)

        self.network_list = {'Actor': self.actor, 'Critic1': self.critic1, 'Critic2': self.critic2,
                             'Target_Critic1': self.target_critic1, 'Target_Critic2': self.target_critic2, 'Encoder': self.encoder, 'Target_Encoder': self.target_encoder}



        self.aug_funcs = {}
        self.aug_list = {
            'crop': rad.crop,
            'grayscale': rad.random_grayscale(),
            'cutout': rad.random_cutout(),
            'cutout_color': rad.random_cutout_color(),
            'flip': rad.random_flip(),
            'rotate': rad.random_rotation(),
            'rand_conv': rad.random_convolution(),
            'color_jitter': rad.random_color_jitter(),
            'no_aug': rad.no_aug
        }
        for aug_name in args.data_augs.split('-'):
            assert aug_name in self.aug_list
            self.aug_funcs[aug_name] = self.aug_list[aug_name]

        self.name = 'RAD_SACv2'
Example #26
0
class CURL_SACv1:
    def __init__(self, obs_dim, action_dim, args):

        self.buffer = Buffer(args.buffer_size)

        self.obs_dim = obs_dim
        self.action_dim = action_dim
        self.image_size = obs_dim[-1]

        self.gamma = args.gamma
        self.alpha = args.alpha

        self.batch_size = args.batch_size
        self.feature_dim = args.feature_dim
        self.curl_latent_dim = args.curl_latent_dim

        self.layer_num = args.layer_num
        self.filter_num = args.filter_num
        self.tau = args.tau
        self.encoder_tau = args.encoder_tau

        self.training_start = args.training_start
        self.training_step = args.training_step

        self.encoder = PixelEncoder(self.obs_dim, self.feature_dim,
                                    self.layer_num, self.filter_num)
        self.target_encoder = PixelEncoder(self.obs_dim, self.feature_dim,
                                           self.layer_num, self.filter_num)

        self.actor = Squashed_Gaussian_Actor(
            self.feature_dim,
            self.action_dim,
            args.hidden_dim,
            args.log_std_min,
            args.log_std_max,
            kernel_initializer=tf.keras.initializers.orthogonal())
        self.critic1 = Q_network(
            self.feature_dim,
            self.action_dim,
            args.hidden_dim,
            kernel_initializer=tf.keras.initializers.orthogonal())
        self.critic2 = Q_network(
            self.feature_dim,
            self.action_dim,
            args.hidden_dim,
            kernel_initializer=tf.keras.initializers.orthogonal())
        self.v_network = V_network(
            self.feature_dim,
            args.hidden_dim,
            kernel_initializer=tf.keras.initializers.orthogonal())
        self.target_v_network = V_network(
            self.feature_dim,
            args.hidden_dim,
            kernel_initializer=tf.keras.initializers.orthogonal())

        self.curl = CURL(self.feature_dim, self.curl_latent_dim)

        copy_weight(self.v_network, self.target_v_network)
        copy_weight(self.encoder, self.target_encoder)

        self.actor_optimizer = tf.keras.optimizers.Adam(args.actor_lr)
        self.critic1_optimizer = tf.keras.optimizers.Adam(args.critic_lr)
        self.critic2_optimizer = tf.keras.optimizers.Adam(args.critic_lr)
        self.v_network_optimizer = tf.keras.optimizers.Adam(args.v_lr)

        self.encoder_optimizer = tf.keras.optimizers.Adam(args.encoder_lr)
        self.cpc_optimizer = tf.keras.optimizers.Adam(args.cpc_lr)

        self.current_step = 0

        self.network_list = {
            'Actor': self.actor,
            'Critic1': self.critic1,
            'Critic2': self.critic2,
            'V_network': self.v_network,
            'Target_V_network': self.target_v_network,
            'Curl': self.curl,
            'Encoder': self.encoder,
            'Target_Encoder': self.target_encoder
        }

        self.name = 'CURL_SACv1'

    def get_action(self, obs):

        if obs.shape[-1] != self.image_size:
            obs = center_crop_image(obs, self.image_size)

        obs = np.expand_dims(np.array(obs), axis=0)
        feature = self.encoder(obs)
        action, _ = self.actor(feature)
        action = action.numpy()[0]

        return action

    def eval_action(self, obs):

        if obs.shape[-1] != self.image_size:
            obs = center_crop_image(obs, self.image_size)

        obs = np.expand_dims(np.array(obs), axis=0)
        feature = self.encoder(obs)
        action, _ = self.actor(feature, deterministic=True)
        action = action.numpy()[0]

        return action

    def train(self, training_step):
        total_a_loss = 0
        total_c1_loss, total_c2_loss = 0, 0
        total_v_loss = 0
        total_cpc_loss = 0
        loss_list = []

        self.current_step += 1

        s, a, r, ns, d, cpc_kwargs = self.buffer.cpc_sample(
            self.batch_size, self.image_size)

        obs_anchor, obs_pos = cpc_kwargs["obs_anchor"], cpc_kwargs["obs_pos"]

        s_action, s_logpi = self.actor(self.encoder(s))

        min_aq = tf.minimum(self.critic1(self.encoder(s), s_action),
                            self.critic2(self.encoder(s), s_action))
        target_v = tf.stop_gradient(min_aq - self.alpha * s_logpi)

        with tf.GradientTape() as tape1:
            v_loss = 0.5 * tf.reduce_mean(
                tf.square(
                    self.v_network(tf.stop_gradient(self.encoder(s))) -
                    target_v))

        v_gradients = tape1.gradient(v_loss,
                                     self.v_network.trainable_variables)
        self.v_network_optimizer.apply_gradients(
            zip(v_gradients, self.v_network.trainable_variables))

        del tape1

        target_q = tf.stop_gradient(
            r + self.gamma *
            (1 - d) * self.target_v_network(self.target_encoder(ns)))

        with tf.GradientTape(persistent=True) as tape2:
            critic1_loss = 0.5 * tf.reduce_mean(
                tf.square(self.critic1(self.encoder(s), a) - target_q))
            critic2_loss = 0.5 * tf.reduce_mean(
                tf.square(self.critic2(self.encoder(s), a) - target_q))

        critic1_gradients = tape2.gradient(
            critic1_loss, self.encoder.trainable_variables +
            self.critic1.trainable_variables)

        critic2_gradients = tape2.gradient(
            critic2_loss, self.encoder.trainable_variables +
            self.critic2.trainable_variables)

        self.critic1_optimizer.apply_gradients(
            zip(
                critic1_gradients, self.encoder.trainable_variables +
                self.critic1.trainable_variables))

        self.critic2_optimizer.apply_gradients(
            zip(
                critic2_gradients, self.encoder.trainable_variables +
                self.critic2.trainable_variables))

        del tape2

        with tf.GradientTape() as tape3:
            s_action, s_logpi = self.actor(tf.stop_gradient(self.encoder(s)))

            min_aq_rep = tf.minimum(
                self.critic1(tf.stop_gradient(self.encoder(s)), s_action),
                self.critic2(tf.stop_gradient(self.encoder(s)), s_action))

            actor_loss = tf.reduce_mean(self.alpha * s_logpi - min_aq_rep)

        actor_gradients = tape3.gradient(actor_loss,
                                         self.actor.trainable_variables)
        self.actor_optimizer.apply_gradients(
            zip(actor_gradients, self.actor.trainable_variables))

        soft_update(self.v_network, self.target_v_network, self.tau)

        with tf.GradientTape(persistent=True) as tape4:
            z_a = self.encoder(obs_anchor)
            z_pos = tf.stop_gradient(self.target_encoder(obs_pos))

            logits = self.curl.compute_logits(z_a, z_pos)
            labels = tf.range(logits.shape[0], dtype='int64')

            cpc_loss = tf.reduce_mean(
                tf.nn.sparse_softmax_cross_entropy_with_logits(labels, logits))

        cpc_gradients = tape4.gradient(cpc_loss, self.curl.trainable_variables)
        self.cpc_optimizer.apply_gradients(
            (zip(cpc_gradients, self.curl.trainable_variables)))

        encoder_gradients = tape4.gradient(cpc_loss,
                                           self.encoder.trainable_variables)
        self.encoder_optimizer.apply_gradients(
            zip(encoder_gradients, self.encoder.trainable_variables))

        soft_update(self.encoder, self.target_encoder, self.encoder_tau)

        del tape4

        total_v_loss += v_loss.numpy()
        loss_list.append(['Loss/V', total_v_loss])

        total_c1_loss += critic1_loss.numpy()
        total_c2_loss += critic2_loss.numpy()

        loss_list.append(['Loss/Critic1', total_c1_loss])
        loss_list.append(['Loss/Critic2', total_c2_loss])

        total_a_loss += actor_loss.numpy()
        loss_list.append(['Loss/Actor', total_a_loss])

        total_cpc_loss += cpc_loss.numpy()
        loss_list.append(['Loss/CPC', total_cpc_loss])

        return loss_list
Example #27
0
class SAC_v2:
    def __init__(self,
                 state_dim,
                 action_dim,
                 hidden_dim=256,
                 training_step=1,
                 alpha=0.1,
                 train_alpha=True,
                 batch_size=128,
                 buffer_size=1e6,
                 tau=0.005,
                 learning_rate=0.0003,
                 gamma=0.99,
                 reward_scale=1,
                 training_start=500):

        self.buffer = Buffer(buffer_size)

        self.actor_optimizer = tf.keras.optimizers.Adam(learning_rate)
        self.critic1_optimizer = tf.keras.optimizers.Adam(learning_rate)
        self.critic2_optimizer = tf.keras.optimizers.Adam(learning_rate)

        self.state_dim = state_dim
        self.action_dim = action_dim

        self.batch_size = batch_size
        self.tau = tau
        self.gamma = gamma
        self.reward_scale = reward_scale
        self.training_start = training_start
        self.training_step = training_step

        self.log_alpha = tf.Variable(np.log(alpha),
                                     dtype=tf.float32,
                                     trainable=True)
        self.target_entropy = -action_dim
        self.alpha_optimizer = tf.keras.optimizers.Adam(learning_rate)
        self.train_alpha = train_alpha

        self.actor = Squashed_Gaussian_Actor(self.state_dim, self.action_dim,
                                             (hidden_dim, hidden_dim))
        self.critic1 = Q_network(self.state_dim, self.action_dim,
                                 (hidden_dim, hidden_dim))
        self.target_critic1 = Q_network(self.state_dim, self.action_dim,
                                        (hidden_dim, hidden_dim))
        self.critic2 = Q_network(self.state_dim, self.action_dim,
                                 (hidden_dim, hidden_dim))
        self.target_critic2 = Q_network(self.state_dim, self.action_dim,
                                        (hidden_dim, hidden_dim))

        copy_weight(self.critic1, self.target_critic1)
        copy_weight(self.critic2, self.target_critic2)

        self.network_list = {
            'Actor': self.actor,
            'Critic1': self.critic1,
            'Critic2': self.critic2,
            'Target_Critic1': self.target_critic1,
            'Target_Critic2': self.target_critic2
        }
        self.name = 'SAC_v2'

    @property
    def alpha(self):
        return tf.exp(self.log_alpha)

    def get_action(self, state):
        state = np.expand_dims(np.array(state), axis=0)

        action = self.actor(state).numpy()[0]

        return action

    def train(self, training_num):
        for i in range(training_num):
            s, a, r, ns, d = self.buffer.sample(self.batch_size)

            target_min_aq = tf.minimum(self.target_critic1(ns, self.actor(ns)),
                                       self.target_critic2(ns, self.actor(ns)))

            target_q = tf.stop_gradient(
                r + self.gamma * (1 - d) *
                (target_min_aq - self.alpha.numpy() * self.actor.log_pi(ns)))

            #critic training
            with tf.GradientTape(persistent=True) as tape1:
                critic1_loss = tf.reduce_mean(
                    tf.square(self.critic1(s, a) - target_q))
                critic2_loss = tf.reduce_mean(
                    tf.square(self.critic2(s, a) - target_q))

            critic1_gradients = tape1.gradient(
                critic1_loss, self.critic1.trainable_variables)
            self.critic1_optimizer.apply_gradients(
                zip(critic1_gradients, self.critic1.trainable_variables))
            critic2_gradients = tape1.gradient(
                critic2_loss, self.critic2.trainable_variables)
            self.critic2_optimizer.apply_gradients(
                zip(critic2_gradients, self.critic2.trainable_variables))

            del tape1

            #actor training
            with tf.GradientTape() as tape2:
                mu, sigma = self.actor.mu_sigma(s)
                output = mu + tf.random.normal(shape=mu.shape) * sigma

                min_aq_rep = tf.minimum(self.critic1(s, output),
                                        self.critic2(s, output))

                actor_loss = tf.reduce_mean(self.alpha.numpy() *
                                            self.actor.log_pi(s) - min_aq_rep)

            actor_gradients = tape2.gradient(actor_loss,
                                             self.actor.trainable_variables)
            self.actor_optimizer.apply_gradients(
                zip(actor_gradients, self.actor.trainable_variables))

            del tape2

            #alpha(temperature) training
            if self.train_alpha == True:
                with tf.GradientTape() as tape3:
                    alpha_loss = -(tf.exp(self.log_alpha) * (tf.stop_gradient(
                        self.actor.log_pi(s) + self.target_entropy)))
                    alpha_loss = tf.nn.compute_average_loss(
                        alpha_loss)  #from softlearning package

                alpha_grad = tape3.gradient(alpha_loss, [self.log_alpha])
                self.alpha_optimizer.apply_gradients(
                    zip(alpha_grad, [self.log_alpha]))

                del tape3

            soft_update(self.critic1, self.target_critic1, self.tau)
            soft_update(self.critic2, self.target_critic2, self.tau)
Example #28
0
    def __init__(self, obs_dim, action_dim, args):

        self.buffer = Buffer(args.buffer_size)

        self.obs_dim = obs_dim
        self.action_dim = action_dim
        self.image_size = obs_dim[-1]

        self.gamma = args.gamma
        self.alpha = args.alpha

        self.batch_size = args.batch_size
        self.feature_dim = args.feature_dim
        self.curl_latent_dim = args.curl_latent_dim

        self.layer_num = args.layer_num
        self.filter_num = args.filter_num
        self.tau = args.tau
        self.encoder_tau = args.encoder_tau

        self.training_start = args.training_start
        self.training_step = args.training_step

        self.encoder = PixelEncoder(self.obs_dim, self.feature_dim,
                                    self.layer_num, self.filter_num)
        self.target_encoder = PixelEncoder(self.obs_dim, self.feature_dim,
                                           self.layer_num, self.filter_num)

        self.actor = Squashed_Gaussian_Actor(
            self.feature_dim,
            self.action_dim,
            args.hidden_dim,
            args.log_std_min,
            args.log_std_max,
            kernel_initializer=tf.keras.initializers.orthogonal())
        self.critic1 = Q_network(
            self.feature_dim,
            self.action_dim,
            args.hidden_dim,
            kernel_initializer=tf.keras.initializers.orthogonal())
        self.critic2 = Q_network(
            self.feature_dim,
            self.action_dim,
            args.hidden_dim,
            kernel_initializer=tf.keras.initializers.orthogonal())
        self.v_network = V_network(
            self.feature_dim,
            args.hidden_dim,
            kernel_initializer=tf.keras.initializers.orthogonal())
        self.target_v_network = V_network(
            self.feature_dim,
            args.hidden_dim,
            kernel_initializer=tf.keras.initializers.orthogonal())

        self.curl = CURL(self.feature_dim, self.curl_latent_dim)

        copy_weight(self.v_network, self.target_v_network)
        copy_weight(self.encoder, self.target_encoder)

        self.actor_optimizer = tf.keras.optimizers.Adam(args.actor_lr)
        self.critic1_optimizer = tf.keras.optimizers.Adam(args.critic_lr)
        self.critic2_optimizer = tf.keras.optimizers.Adam(args.critic_lr)
        self.v_network_optimizer = tf.keras.optimizers.Adam(args.v_lr)

        self.encoder_optimizer = tf.keras.optimizers.Adam(args.encoder_lr)
        self.cpc_optimizer = tf.keras.optimizers.Adam(args.cpc_lr)

        self.current_step = 0

        self.network_list = {
            'Actor': self.actor,
            'Critic1': self.critic1,
            'Critic2': self.critic2,
            'V_network': self.v_network,
            'Target_V_network': self.target_v_network,
            'Curl': self.curl,
            'Encoder': self.encoder,
            'Target_Encoder': self.target_encoder
        }

        self.name = 'CURL_SACv1'
Example #29
0
    def __init__(self,
                 obs_dim,
                 action_dim,
                 hidden_dim=512,
                 gamma=0.99,
                 learning_rate=0.001,
                 batch_size=512,
                 policy_delay=2,
                 actor_noise=0.1,
                 target_noise=0.2,
                 noise_clip=0.5,
                 buffer_size=1e6,
                 feature_dim=50,
                 layer_num=4,
                 filter_num=32,
                 tau=0.005,
                 encoder_tau=0.005,
                 bisim_coef=0.5,
                 training_start=1000):

        self.buffer = Buffer(buffer_size)

        self.obs_dim = obs_dim
        self.action_dim = action_dim

        self.hidden_dim = hidden_dim
        self.gamma = gamma
        self.learning_rate = learning_rate

        self.batch_size = batch_size
        self.feature_dim = feature_dim

        self.layer_num = layer_num
        self.filter_num = filter_num
        self.tau = tau
        self.encoder_tau = encoder_tau
        self.bisim_coef = bisim_coef

        self.policy_delay = policy_delay
        self.actor_noise = actor_noise
        self.target_noise = target_noise
        self.noise_clip = noise_clip

        self.training_start = training_start

        self.actor = Policy_network(feature_dim, action_dim,
                                    (hidden_dim, hidden_dim))
        self.target_actor = Policy_network(feature_dim, action_dim,
                                           (hidden_dim, hidden_dim))
        self.critic1 = Q_network(feature_dim, action_dim,
                                 (hidden_dim, hidden_dim))
        self.critic2 = Q_network(feature_dim, action_dim,
                                 (hidden_dim, hidden_dim))
        self.target_critic1 = Q_network(feature_dim, action_dim,
                                        (hidden_dim, hidden_dim))
        self.target_critic2 = Q_network(feature_dim, action_dim,
                                        (hidden_dim, hidden_dim))

        self.encoder = PixelEncoder(self.obs_dim, feature_dim, layer_num,
                                    filter_num)
        self.target_encoder = PixelEncoder(self.obs_dim, feature_dim,
                                           layer_num, filter_num)

        self.dynamics_model = Transition_Network(feature_dim, action_dim)
        self.reward_model = Reward_Network(feature_dim)

        copy_weight(self.actor, self.target_actor)
        copy_weight(self.critic1, self.target_critic1)
        copy_weight(self.critic2, self.target_critic2)
        copy_weight(self.encoder, self.target_encoder)

        self.actor_optimizer = tf.keras.optimizers.Adam(learning_rate)
        self.critic1_optimizer = tf.keras.optimizers.Adam(learning_rate)
        self.critic2_optimizer = tf.keras.optimizers.Adam(learning_rate)

        self.encoder_optimizer = tf.keras.optimizers.Adam(learning_rate)
        self.dynamics_optimizer = tf.keras.optimizers.Adam(learning_rate)
        self.reward_optimizer = tf.keras.optimizers.Adam(learning_rate)

        self.name = 'DBC_TD3'
Example #30
0
class DBC_SACv2:
    def __init__(self,
                 obs_dim,
                 action_dim,
                 hidden_dim=256,
                 gamma=0.99,
                 learning_rate=1e-5,
                 batch_size=128,
                 buffer_size=1e6,
                 feature_dim=50,
                 layer_num=4,
                 filter_num=32,
                 tau=0.005,
                 encoder_tau=0.005,
                 bisim_coef=0.5,
                 training_start=1000,
                 train_alpha=True,
                 alpha=0.1):

        self.buffer = Buffer(buffer_size)

        self.obs_dim = obs_dim
        self.action_dim = action_dim

        self.log_alpha = tf.Variable(initial_value=tf.math.log(alpha),
                                     trainable=True)
        self.target_entropy = -action_dim
        self.hidden_dim = hidden_dim
        self.gamma = gamma
        self.learning_rate = learning_rate
        self.bisim_coef = bisim_coef

        self.batch_size = batch_size
        self.feature_dim = feature_dim

        self.layer_num = layer_num
        self.filter_num = filter_num
        self.tau = tau
        self.encoder_tau = encoder_tau

        self.training_start = training_start
        self.train_alpha = train_alpha

        self.actor = Squashed_Gaussian_Actor(feature_dim, action_dim,
                                             (hidden_dim, hidden_dim))
        self.critic1 = Q_network(feature_dim, action_dim,
                                 (hidden_dim, hidden_dim))
        self.critic2 = Q_network(feature_dim, action_dim,
                                 (hidden_dim, hidden_dim))
        self.target_critic1 = Q_network(feature_dim, action_dim,
                                        (hidden_dim, hidden_dim))
        self.target_critic2 = Q_network(feature_dim, action_dim,
                                        (hidden_dim, hidden_dim))

        self.encoder = PixelEncoder(self.obs_dim, feature_dim, layer_num,
                                    filter_num)
        self.target_encoder = PixelEncoder(self.obs_dim, feature_dim,
                                           layer_num, filter_num)

        self.dynamics_model = Transition_Network(feature_dim,
                                                 action_dim,
                                                 deterministic=False)
        self.reward_model = Reward_Network(feature_dim)

        copy_weight(self.critic1, self.target_critic1)
        copy_weight(self.critic2, self.target_critic2)
        copy_weight(self.encoder, self.target_encoder)

        self.actor_optimizer = tf.keras.optimizers.Adam(learning_rate)
        self.critic1_optimizer = tf.keras.optimizers.Adam(learning_rate)
        self.critic2_optimizer = tf.keras.optimizers.Adam(learning_rate)

        self.encoder_optimizer = tf.keras.optimizers.Adam(learning_rate)
        self.log_alpha_optimizer = tf.keras.optimizers.Adam(10 * learning_rate)

        self.dynamics_optimizer = tf.keras.optimizers.Adam(learning_rate)
        self.reward_optimizer = tf.keras.optimizers.Adam(learning_rate)

        self.name = 'DBC_SACv2'

    @property
    def alpha(self):
        return tf.exp(self.log_alpha)

    def get_action(self, obs):
        obs = np.expand_dims(np.array(obs), axis=0)
        feature = self.encoder(obs)
        action = self.actor(feature).numpy()[0]

        return action

    def train(self, local_step):
        set1, set2 = self.buffer.dbc_sample(self.batch_size)

        s, a, r, ns, d = set1
        s2, a2, r2, ns2, d2 = set2

        target_min_aq = tf.minimum(
            self.target_critic1(self.target_encoder(ns),
                                self.actor(self.encoder(ns))),
            self.target_critic2(self.target_encoder(ns),
                                self.actor(self.encoder(ns))))

        target_q = tf.stop_gradient(r + self.gamma * (1 - d) *
                                    (target_min_aq - self.alpha.numpy() *
                                     self.actor.log_pi(self.encoder(ns))))

        with tf.GradientTape(persistent=True) as tape1:
            critic1_loss = tf.reduce_mean(
                tf.square(self.critic1(self.encoder(s), a) - target_q))
            critic2_loss = tf.reduce_mean(
                tf.square(self.critic2(self.encoder(s), a) - target_q))

        critic1_gradients = tape1.gradient(
            critic1_loss, self.encoder.trainable_variables +
            self.critic1.trainable_variables)
        self.critic1_optimizer.apply_gradients(
            zip(
                critic1_gradients, self.encoder.trainable_variables +
                self.critic1.trainable_variables))

        critic2_gradients = tape1.gradient(
            critic2_loss, self.encoder.trainable_variables +
            self.critic2.trainable_variables)
        self.critic2_optimizer.apply_gradients(
            zip(
                critic2_gradients, self.encoder.trainable_variables +
                self.critic2.trainable_variables))

        del tape1

        #train dynamics(encoder used together)
        next_feature = self.encoder(ns)
        with tf.GradientTape() as tape2:
            feature = self.encoder(s)

            mu, sigma = self.dynamics_model(tf.concat([feature, a], axis=1))

            if (sigma[0][0].numpy() == 0):
                if self.dynamics_model.deterministic == False:
                    print("error")
                sigma = tf.ones_like(mu)

            diff = (mu - tf.stop_gradient(next_feature)) / sigma
            dynamics_loss = tf.reduce_mean(0.5 * tf.square(diff) +
                                           tf.math.log(sigma))

        dynamics_gradients = tape2.gradient(
            dynamics_loss, self.encoder.trainable_variables +
            self.dynamics_model.trainable_variables)
        self.dynamics_optimizer.apply_gradients(
            zip(
                dynamics_gradients, self.encoder.trainable_variables +
                self.dynamics_model.trainable_variables))

        del tape2

        #train rewards(encoder used together)
        with tf.GradientTape() as tape3:
            feature = self.encoder(s)
            sample_dynamics = self.dynamics_model.sample(
                tf.concat([feature, a], axis=1))
            reward_prediction = self.reward_model(sample_dynamics)

            reward_loss = tf.reduce_mean(tf.square(reward_prediction - r))

        reward_gradients = tape3.gradient(
            reward_loss, self.encoder.trainable_variables +
            self.reward_model.trainable_variables)
        self.reward_optimizer.apply_gradients(
            zip(
                reward_gradients, self.encoder.trainable_variables +
                self.reward_model.trainable_variables))

        del tape3

        # train encoder
        with tf.GradientTape() as tape4:
            feature1 = self.encoder(s)
            feature2 = self.encoder(s2)

            mu1, sigma1 = self.dynamics_model(tf.concat([feature1, a], axis=1))
            mu2, sigma2 = self.dynamics_model(tf.concat([feature2, a2],
                                                        axis=1))

            z_dist = tf.abs(feature1 - feature2)
            r_dist = tf.abs(r - r2)

            transition_dist = tf.sqrt(
                tf.square(tf.abs(mu1 - mu2)) +
                tf.square(tf.abs(sigma1 - sigma2)))
            bisimilarity = (
                tf.cast(r_dist, tf.float32) +
                self.gamma * tf.cast(transition_dist, tf.float32)).numpy()
            encoder_loss = self.bisim_coef * tf.reduce_mean(
                tf.square(z_dist - bisimilarity))

        encoder_gradients = tape4.gradient(encoder_loss,
                                           self.encoder.trainable_variables)
        self.encoder_optimizer.apply_gradients(
            zip(encoder_gradients, self.encoder.trainable_variables))

        del tape4

        if local_step % 2 == 0:
            with tf.GradientTape() as tape5:
                mu, sigma = self.actor.mu_sigma(
                    tf.stop_gradient(self.encoder(s)))
                output = mu + tf.random.normal(shape=mu.shape) * sigma

                min_aq_rep = tf.minimum(
                    self.critic1(tf.stop_gradient(self.encoder(s)), output),
                    self.critic2(tf.stop_gradient(self.encoder(s)), output))

                actor_loss = tf.reduce_mean(
                    self.alpha.numpy() *
                    self.actor.log_pi(tf.stop_gradient(self.encoder(s))) -
                    min_aq_rep)

            actor_gradients = tape5.gradient(actor_loss,
                                             self.actor.trainable_variables)
            self.actor_optimizer.apply_gradients(
                zip(actor_gradients, self.actor.trainable_variables))

            del tape5

            if self.train_alpha == True:
                with tf.GradientTape() as tape6:
                    alpha_loss = -(tf.exp(self.log_alpha) * tf.stop_gradient(
                        self.actor.log_pi(self.encoder(s)) +
                        self.target_entropy))
                    alpha_loss = tf.nn.compute_average_loss(alpha_loss)

                log_alpha_gradients = tape6.gradient(alpha_loss,
                                                     [self.log_alpha])
                self.log_alpha_optimizer.apply_gradients(
                    zip(log_alpha_gradients, [self.log_alpha]))

                del tape6

            soft_update(self.critic1, self.target_critic1, self.tau)
            soft_update(self.critic2, self.target_critic2, self.tau)
            soft_update(self.encoder, self.target_encoder, self.encoder_tau)