Example #1
    def __init__(self,
                 state_dim,
                 action_dim,
                 name="SAC",
                 max_action=1.,
                 lr=3e-4,
                 actor_units=[256, 256],
                 tau=0.005,
                 scale_reward=5.,
                 n_warmup=int(1e4),
                 memory_capacity=int(1e6),
                 **kwargs):
        super().__init__(name=name,
                         memory_capacity=memory_capacity,
                         n_warmup=n_warmup,
                         **kwargs)

        self.actor = GaussianActor(state_dim, action_dim, max_action)
        self.actor_optimizer = tf.train.AdamOptimizer(learning_rate=lr)

        self.vf = CriticV(state_dim)
        self.vf_target = CriticV(state_dim)
        update_target_variables(self.vf_target.weights,
                                self.vf.weights,
                                tau=1.)
        self.vf_optimizer = tf.train.AdamOptimizer(learning_rate=lr)

        self.qf1 = CriticQ(state_dim, action_dim, name="qf1")
        self.qf2 = CriticQ(state_dim, action_dim, name="qf2")
        self.qf1_optimizer = tf.train.AdamOptimizer(learning_rate=lr)
        self.qf2_optimizer = tf.train.AdamOptimizer(learning_rate=lr)

        # Set hyper-parameters
        self.tau = tau
        self.scale_reward = scale_reward
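All of these examples rely on update_target_variables(target_weights, source_weights, tau) to initialize or track target networks. As a reference point, the following is a minimal sketch of that soft-update rule, assuming plain lists of tf.Variable weights; the helper name soft_update is illustrative, and the real tf2rl helper may perform extra validation. It implements target = (1 - tau) * target + tau * source, the formula quoted in the TD3 docstring of Example #17, so tau=1. gives a hard copy and a small tau gives Polyak averaging.

import tensorflow as tf


def soft_update(target_variables, source_variables, tau=1.0):
    """Minimal sketch of a Polyak (soft) target-weight update.

    tau=1. copies the source weights into the target (hard update);
    tau<1. moves the target a small step toward the source:
    target = (1 - tau) * target + tau * source.
    """
    for target_var, source_var in zip(target_variables, source_variables):
        target_var.assign((1.0 - tau) * target_var + tau * source_var)

Seen through this rule, the tau=1. calls in the constructors make the freshly built target networks exact copies, while the self.tau calls inside the _train_body and _update_critic methods let the targets lag slowly behind the online networks.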
Example #2
 def _setup_critic_v(self, state_shape, critic_units, lr):
     self.vf = CriticV(state_shape, critic_units)
     self.vf_target = CriticV(state_shape, critic_units)
     update_target_variables(self.vf_target.weights,
                             self.vf.weights,
                             tau=1.)
     self.vf_optimizer = tf.keras.optimizers.Adam(learning_rate=lr)
Example #3
    def __init__(self,
                 state_shape,
                 action_dim,
                 name="TD3",
                 actor_update_freq=2,
                 policy_noise=0.2,
                 noise_clip=0.5,
                 actor_units=[400, 300],
                 critic_units=[400, 300],
                 lr_critic=0.001,
                 **kwargs):
        super().__init__(name=name,
                         state_shape=state_shape,
                         action_dim=action_dim,
                         actor_units=actor_units,
                         critic_units=critic_units,
                         lr_critic=lr_critic,
                         **kwargs)

        self.critic = Critic(state_shape, action_dim, critic_units)
        self.critic_target = Critic(state_shape, action_dim, critic_units)
        update_target_variables(self.critic_target.weights,
                                self.critic.weights,
                                tau=1.)
        self.critic_optimizer = tf.keras.optimizers.Adam(
            learning_rate=lr_critic)

        self._policy_noise = policy_noise
        self._noise_clip = noise_clip

        self._actor_update_freq = actor_update_freq
        self._it = tf.Variable(0, dtype=tf.int32)
Example #4
    def _update_critic(self, states, actions, next_states, rewards, dones, weights):
        with tf.device(self.device):
            assert len(dones.shape) == 2
            assert len(rewards.shape) == 2
            rewards = tf.squeeze(rewards, axis=1)
            dones = tf.squeeze(dones, axis=1)

            not_dones = 1. - tf.cast(dones, dtype=tf.float32)

            with tf.GradientTape(persistent=True) as tape:
                # Compute loss of critic Q
                next_actions, next_logps = self.actor(next_states)
                next_target_q1 = tf.stop_gradient(self.qf1_target(next_states, next_actions))
                next_target_q2 = tf.stop_gradient(self.qf2_target(next_states, next_actions))
                min_next_target_q = tf.minimum(next_target_q1, next_target_q2)

                target_q = tf.stop_gradient(
                    rewards + not_dones * self.discount * (min_next_target_q - self.alpha * next_logps))

                current_q1 = self.qf1(states, actions)
                current_q2 = self.qf2(states, actions)
                td_loss_q1 = tf.reduce_mean((target_q - current_q1) ** 2)
                td_loss_q2 = tf.reduce_mean((target_q - current_q2) ** 2)  # Eq.(6)

            q1_grad = tape.gradient(td_loss_q1, self.qf1.trainable_variables)
            self.qf1_optimizer.apply_gradients(
                zip(q1_grad, self.qf1.trainable_variables))
            q2_grad = tape.gradient(td_loss_q2, self.qf2.trainable_variables)
            self.qf2_optimizer.apply_gradients(
                zip(q2_grad, self.qf2.trainable_variables))
            update_target_variables(self.qf1_target.weights, self.qf1.weights, self.tau)
            update_target_variables(self.qf2_target.weights, self.qf2.weights, self.tau)

        return td_loss_q1 + td_loss_q2, td_loss_q1
Example #5
    def _train_body(self, states, actions, next_states, rewards, done,
                    weights):
        with tf.device(self.device):
            with tf.GradientTape() as tape:
                td_errors = self._compute_td_error_body(
                    states, actions, next_states, rewards, done)
                critic_loss = tf.reduce_mean(
                    huber_loss(td_errors, delta=self.max_grad) * weights)

            critic_grad = tape.gradient(critic_loss,
                                        self.critic.trainable_variables)
            critic_grad = [(tf.clip_by_value(
                grad, tf.constant(-self.max_grad, dtype=tf.float32),
                tf.constant(self.max_grad, dtype=tf.float32)))
                           for grad in critic_grad]
            self.critic_optimizer.apply_gradients(
                zip(critic_grad, self.critic.trainable_variables))

            with tf.GradientTape() as tape:
                next_action = self.actor(states)
                actor_loss = -tf.reduce_mean(
                    self.critic([states, next_action]))

            actor_grad = tape.gradient(actor_loss,
                                       self.actor.trainable_variables)
            self.actor_optimizer.apply_gradients(
                zip(actor_grad, self.actor.trainable_variables))

            # Update target networks
            update_target_variables(self.critic_target.weights,
                                    self.critic.weights, self.tau)
            update_target_variables(self.actor_target.weights,
                                    self.actor.weights, self.tau)

            return actor_loss, critic_loss, td_errors
Example #6
    def _train_body(self, states, actions, next_states, rewards, done,
                    weights):
        with tf.device(self.device):
            with tf.GradientTape() as tape:
                td_errors = self._compute_td_error_body(
                    states, actions, next_states, rewards, done)
                critic_loss = tf.reduce_mean(
                    tf.square(td_errors) * weights * 0.5)

            critic_grad = tape.gradient(critic_loss,
                                        self.critic.trainable_variables)
            self.critic_optimizer.apply_gradients(
                zip(critic_grad, self.critic.trainable_variables))

            with tf.GradientTape() as tape:
                next_action = self.actor(states)
                actor_loss = -tf.reduce_mean(
                    self.critic([states, next_action]))

            actor_grad = tape.gradient(actor_loss,
                                       self.actor.trainable_variables)
            self.actor_optimizer.apply_gradients(
                zip(actor_grad, self.actor.trainable_variables))

            # Update target networks
            update_target_variables(self.critic_target.weights,
                                    self.critic.weights, self.tau)
            update_target_variables(self.actor_target.weights,
                                    self.actor.weights, self.tau)

            return actor_loss, critic_loss, td_errors
Example #7
    def __init__(self,
                 state_shape,
                 action_dim,
                 name="TD3",
                 actor_update_freq=2,
                 policy_noise=0.2,
                 noise_clip=0.5,
                 critic_units=(400, 300),
                 **kwargs):
        super().__init__(name=name,
                         state_shape=state_shape,
                         action_dim=action_dim,
                         **kwargs)

        self.critic = Critic(state_shape, action_dim, critic_units)
        self.critic_target = Critic(state_shape, action_dim, critic_units)
        update_target_variables(self.critic_target.weights,
                                self.critic.weights,
                                tau=1.)

        self._policy_noise = policy_noise
        self._noise_clip = noise_clip

        self._actor_update_freq = actor_update_freq
        self._it = tf.Variable(0, dtype=tf.int32)
Example #8
    def _update_encoder(self, obses_anchor, obses_negative):
        with tf.device(self.device):
            with tf.GradientTape(persistent=True) as tape:
                # Compute loss of CURL
                z_anchor = self._encoder(obses_anchor)
                z_negatives = self._encoder_target(obses_negative)
                # Compute similarities with bilinear products
                logits = tf.matmul(
                    z_anchor,
                    tf.matmul(self._curl_w, tf.transpose(z_negatives, [1, 0])))
                logits -= tf.reduce_max(
                    logits, axis=-1, keepdims=True)  # (batch_size, batch_size)
                curl_loss = tf.reduce_mean(
                    tf.keras.losses.sparse_categorical_crossentropy(
                        tf.range(self.batch_size), logits,
                        from_logits=True))  # Eq.4

            curl_grads = tape.gradient(curl_loss, [self._curl_w] +
                                       self._encoder.trainable_variables)
            self._encoder_optimizer.apply_gradients(
                zip(curl_grads,
                    [self._curl_w] + self._encoder.trainable_variables))
            update_target_variables(self._encoder_target.weights,
                                    self._encoder.weights, self._tau_encoder)

        return curl_loss, tf.reduce_mean(tf.abs(self._curl_w)), tf.reduce_mean(
            tf.abs(z_anchor)), tf.reduce_mean(logits)
Example #9
    def _train_body(self, states, actions, next_states, rewards, done,
                    weights):
        with tf.device(self.device):
            with tf.GradientTape() as tape:
                td_error1, td_error2 = self._compute_td_error_body(
                    states, actions, next_states, rewards, done)
                critic_loss = tf.reduce_mean(huber_loss(td_error1, delta=self.max_grad) * weights) + \
                              tf.reduce_mean(huber_loss(td_error2, delta=self.max_grad) * weights)

            critic_grad = tape.gradient(critic_loss,
                                        self.critic.trainable_variables)
            self.critic_optimizer.apply_gradients(
                zip(critic_grad, self.critic.trainable_variables))

            self._it.assign_add(1)
            with tf.GradientTape() as tape:
                next_actions = self.actor(states)
                actor_loss = -tf.reduce_mean(
                    self.critic([states, next_actions]))

            if tf.math.equal(self._it % self._actor_update_freq, 0):
                actor_grad = tape.gradient(actor_loss,
                                           self.actor.trainable_variables)
                self.actor_optimizer.apply_gradients(
                    zip(actor_grad, self.actor.trainable_variables))

            # Update target networks
            update_target_variables(self.critic_target.weights,
                                    self.critic.weights, self.tau)
            update_target_variables(self.actor_target.weights,
                                    self.actor.weights, self.tau)

            return actor_loss, critic_loss, tf.abs(td_error1) + tf.abs(
                td_error2)
Example #10
    def train(self, states, actions, next_states, rewards, done, weights=None):
        if weights is None:
            weights = np.ones_like(rewards)
        td_errors, q_func_loss = self._train_body(states, actions, next_states,
                                                  rewards, done, weights)

        tf.summary.scalar(name=self.policy_name + "/q_func_Loss",
                          data=q_func_loss)

        # TODO: Remove following by using tf.global_step
        self.n_update += 1
        # Update target networks
        if self.n_update % self.target_replace_interval == 0:
            update_target_variables(self.q_func_target.weights,
                                    self.q_func.weights,
                                    tau=1.)

        # Update exploration rate
        self.epsilon = max(
            self.epsilon - self.epsilon_decay_rate * self.update_interval,
            self.epsilon_min)
        tf.summary.scalar(name=self.policy_name + "/epsilon",
                          data=self.epsilon)

        return td_errors
Example #11
 def _setup_critic_q(self, state_shape, action_dim, critic_units, lr):
     self.qf1 = self.critic_fn(state_shape,
                               action_dim,
                               critic_units,
                               name="qf1")
     self.qf2 = self.critic_fn(state_shape,
                               action_dim,
                               critic_units,
                               name="qf2")
     self.qf1_target = self.critic_fn(state_shape,
                                      action_dim,
                                      critic_units,
                                      name="qf1_target")
     self.qf2_target = self.critic_fn(state_shape,
                                      action_dim,
                                      critic_units,
                                      name="qf2_target")
     update_target_variables(self.qf1_target.weights,
                             self.qf1.weights,
                             tau=1.)
     update_target_variables(self.qf2_target.weights,
                             self.qf2.weights,
                             tau=1.)
     self.qf1_optimizer = tf.keras.optimizers.Adam(learning_rate=lr)
     self.qf2_optimizer = tf.keras.optimizers.Adam(learning_rate=lr)
Example #12
    def _update_encoder(self, obses):
        with tf.device(self.device):
            with tf.GradientTape(persistent=True) as tape:
                # Encode observations
                obs_features = self._encoder(obses,
                                             stop_q_grad=self._stop_q_grad)

                # Compute loss of AE
                rec_obses = self._decoder(obs_features)
                true_obses = preprocess_img(obses)
                rec_loss = tf.reduce_mean(
                    tf.keras.losses.MSE(true_obses, rec_obses))
                latent_loss = tf.reduce_mean(
                    0.5 * tf.reduce_sum(tf.math.pow(obs_features, 2), axis=1))
                ae_loss = rec_loss + self._lambda_latent_val * latent_loss

            encoder_grads = tape.gradient(ae_loss,
                                          self._encoder.trainable_variables)
            self._encoder_optimizer.apply_gradients(
                zip(encoder_grads, self._encoder.trainable_variables))
            decoder_grads = tape.gradient(ae_loss,
                                          self._decoder.trainable_variables)
            self._decoder_optimizer.apply_gradients(
                zip(decoder_grads, self._decoder.trainable_variables))
            update_target_variables(self._encoder_target.weights,
                                    self._encoder.weights, self._tau_encoder)

        return rec_loss, latent_loss
Example #13
    def __init__(self,
                 action_dim,
                 obs_shape=(84, 84, 9),
                 n_conv_layers=4,
                 n_conv_filters=32,
                 feature_dim=50,
                 tau_encoder=0.05,
                 tau_critic=0.01,
                 auto_alpha=True,
                 lr_sac=1e-3,
                 lr_encoder=1e-3,
                 lr_decoder=1e-3,
                 update_critic_target_freq=2,
                 update_actor_freq=2,
                 lr_alpha=1e-4,
                 init_temperature=0.1,
                 stop_q_grad=False,
                 lambda_latent_val=1e-06,
                 decoder_weight_lambda=1e-07,
                 skip_making_decoder=False,
                 name="SACAE",
                 **kwargs):
        super().__init__(state_shape=(feature_dim, ),
                         action_dim=action_dim,
                         name=name,
                         lr=lr_sac,
                         lr_alpha=lr_alpha,
                         tau=tau_critic,
                         auto_alpha=auto_alpha,
                         init_temperature=init_temperature,
                         **kwargs)
        self._encoder = Encoder(obs_shape=obs_shape,
                                feature_dim=feature_dim,
                                n_conv_layers=n_conv_layers,
                                n_conv_filters=n_conv_filters,
                                name="encoder")
        self._encoder_target = Encoder(obs_shape=obs_shape,
                                       feature_dim=feature_dim,
                                       n_conv_layers=n_conv_layers,
                                       n_conv_filters=n_conv_filters,
                                       name="encoder_target")
        update_target_variables(self._encoder_target.weights,
                                self._encoder.weights,
                                tau=1.)

        self._encoder_optimizer = tf.keras.optimizers.Adam(
            learning_rate=lr_encoder)
        if not skip_making_decoder:
            self._decoder = Decoder()
            self._lambda_latent_val = lambda_latent_val
            self._decoder_optimizer = tfa.optimizers.AdamW(
                learning_rate=lr_decoder, weight_decay=decoder_weight_lambda)

        self._stop_q_grad = stop_q_grad
        self._input_img_size = obs_shape[0]
        self._tau_encoder = tau_encoder
        self._n_update = 0
        self._update_critic_target_freq = update_critic_target_freq
        self._update_actor_freq = update_actor_freq
        self._feature_dim = feature_dim
        self.state_ndim = 3
Example #14
 def _setup_critic_q(self, state_shape, action_dim, lr):
     self.qf1 = CriticQ(state_shape, action_dim, name="qf1")
     self.qf2 = CriticQ(state_shape, action_dim, name="qf2")
     # Hard-sync the V-function target here; the twin Q networks above keep
     # no separate target copies in this variant.
     update_target_variables(self.vf_target.weights,
                             self.vf.weights,
                             tau=1.)
     self.qf1_optimizer = tf.keras.optimizers.Adam(learning_rate=lr)
     self.qf2_optimizer = tf.keras.optimizers.Adam(learning_rate=lr)
Example #15
    def train(self,
              states,
              actions,
              next_states,
              rewards,
              dones,
              weights=None):
        if weights is None:
            weights = np.ones_like(rewards)

        obses_anchor = random_crop(states, self._input_img_size)
        next_obses_anchor = random_crop(next_states, self._input_img_size)
        obses_negative = random_crop(states, self._input_img_size)

        # Update critic
        td_errors, qf_loss = self._update_critic(obses_anchor, actions,
                                                 next_obses_anchor, rewards,
                                                 dones, weights)
        tf.summary.scalar(name=self.policy_name + "/critic_loss", data=qf_loss)
        if self._n_update % self._update_critic_target_freq == 0:
            update_target_variables(self.qf1_target.weights, self.qf1.weights,
                                    self.tau)
            update_target_variables(self.qf2_target.weights, self.qf2.weights,
                                    self.tau)

        # Update actor
        if self._n_update % self._update_actor_freq == 0:
            obs_features = self._encoder(obses_anchor)
            actor_loss, logp_min, logp_max, logp_mean, alpha_loss = self._update_actor(
                obs_features)
            tf.summary.scalar(name=self.policy_name + "/actor_loss",
                              data=actor_loss)
            tf.summary.scalar(name=self.policy_name + "/logp_min",
                              data=logp_min)
            tf.summary.scalar(name=self.policy_name + "/logp_max",
                              data=logp_max)
            tf.summary.scalar(name=self.policy_name + "/logp_mean",
                              data=logp_mean)
            if self.auto_alpha:
                tf.summary.scalar(name=self.policy_name + "/log_ent",
                                  data=self.log_alpha)
                tf.summary.scalar(name=self.policy_name + "/logp_mean+target",
                                  data=logp_mean + self.target_alpha)
            tf.summary.scalar(name=self.policy_name + "/ent", data=self.alpha)
            tf.summary.scalar(name=self.policy_name + "/alpha_loss",
                              data=alpha_loss)

        # Update encoder
        curl_loss, w, z_anchor, logits = self._update_encoder(
            obses_anchor, obses_negative)
        tf.summary.scalar(name="encoder/curl_loss", data=curl_loss)
        tf.summary.scalar(name="encoder/latent_vars", data=z_anchor)
        tf.summary.scalar(name="encoder/w", data=w)
        tf.summary.scalar(name="encoder/logits", data=logits)

        self._n_update += 1

        return td_errors
Example #16
    def train(self,
              states,
              actions,
              next_states,
              rewards,
              dones,
              weights=None):
        if weights is None:
            weights = np.ones_like(rewards)

        # Update critic
        td_errors, qf_loss = self._update_critic(states, actions, next_states,
                                                 rewards, dones, weights)
        tf.summary.scalar(name=self.policy_name + "/critic_loss", data=qf_loss)
        if self._n_update % self._update_critic_target_freq == 0:
            update_target_variables(self.qf1_target.weights, self.qf1.weights,
                                    self.tau)
            update_target_variables(self.qf2_target.weights, self.qf2.weights,
                                    self.tau)

        # Update actor
        if self._n_update % self._update_actor_freq == 0:
            obs_features = self._encoder(states)
            actor_loss, logp_min, logp_max, logp_mean, alpha_loss = self._update_actor(
                obs_features)
            tf.summary.scalar(name=self.policy_name + "/actor_loss",
                              data=actor_loss)
            tf.summary.scalar(name=self.policy_name + "/logp_min",
                              data=logp_min)
            tf.summary.scalar(name=self.policy_name + "/logp_max",
                              data=logp_max)
            tf.summary.scalar(name=self.policy_name + "/logp_mean",
                              data=logp_mean)
            if self.auto_alpha:
                tf.summary.scalar(name=self.policy_name + "/log_ent",
                                  data=self.log_alpha)
                tf.summary.scalar(name=self.policy_name + "/logp_mean+target",
                                  data=logp_mean + self.target_alpha)
            tf.summary.scalar(name=self.policy_name + "/ent", data=self.alpha)
            tf.summary.scalar(name=self.policy_name + "/alpha_loss",
                              data=alpha_loss)

        # Update encoder/decoder
        rec_loss, latent_loss = self._update_encoder(states)
        tf.summary.scalar(name=self.policy_name + "/rec_loss", data=rec_loss)
        tf.summary.scalar(name=self.policy_name + "/latent_loss",
                          data=latent_loss)

        self._n_update += 1

        return qf_loss
Example #17
File: td3.py Project: ymd-h/tf2rl
    def __init__(self,
                 state_shape,
                 action_dim,
                 name="TD3",
                 actor_update_freq=2,
                 policy_noise=0.2,
                 noise_clip=0.5,
                 critic_units=(400, 300),
                 **kwargs):
        """
        Initialize TD3

        Args:
            state_shape (iterable of int): Observation state shape
            action_dim (int): Action dimension
            name (str): Network name. The default is ``"TD3"``.
            actor_update_freq (int): Number of critic updates per actor update. The default is ``2``.
            policy_noise (float): Standard deviation of the noise added to target-policy actions. The default is ``0.2``.
            noise_clip (float): Range to which the target-policy noise is clipped. The default is ``0.5``.
            critic_units (iterable of int): Number of units in the hidden layers of the critic. The default is ``(400, 300)``.
            max_action (float): Size of maximum action (``-max_action`` <= action <= ``max_action``). The default is ``1``.
            lr_actor (float): Learning rate for the actor network. The default is ``0.001``.
            lr_critic (float): Learning rate for the critic network. The default is ``0.001``.
            actor_units (iterable of int): Number of units in the hidden layers of the actor.
            sigma (float): Standard deviation of Gaussian noise. The default is ``0.1``.
            tau (float): Weight update ratio for the target network: ``target = (1-tau)*target + tau*network``. The default is ``0.005``.
            n_warmup (int): Number of warmup steps before training. The default is ``1e4``.
            memory_capacity (int): Replay buffer size. The default is ``1e6``.
            batch_size (int): Batch size. The default is ``256``.
            discount (float): Discount factor. The default is ``0.99``.
            max_grad (float): Maximum gradient. The default is ``10``.
            gpu (int): GPU id. ``-1`` disables GPU. The default is ``0``.
        """
        super().__init__(name=name,
                         state_shape=state_shape,
                         action_dim=action_dim,
                         **kwargs)

        self.critic = Critic(state_shape, action_dim, critic_units)
        self.critic_target = Critic(state_shape, action_dim, critic_units)
        update_target_variables(self.critic_target.weights,
                                self.critic.weights,
                                tau=1.)

        self._policy_noise = policy_noise
        self._noise_clip = noise_clip

        self._actor_update_freq = actor_update_freq
        self._it = tf.Variable(0, dtype=tf.int32)
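For context, a construction call that matches the docstring above might look like the sketch below; the import path, the gym environment id, and the keyword values are illustrative assumptions rather than part of the example itself.

import gym
from tf2rl.algos.td3 import TD3  # assumed import path for the ymd-h/tf2rl project

env = gym.make("Pendulum-v1")  # any continuous-control task; id depends on the gym version
agent = TD3(
    state_shape=env.observation_space.shape,
    action_dim=env.action_space.high.size,
    max_action=env.action_space.high[0],
    actor_update_freq=2,       # delayed actor updates (default above)
    policy_noise=0.2,
    noise_clip=0.5,
    critic_units=(400, 300),
    gpu=-1)                    # -1 disables the GPU per the docstring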
Example #18
File: dqn.py Project: Wshoway/tf2rl
    def train(self, states, actions, next_states, rewards, done, weights=None):
        if weights is None:
            weights = np.ones_like(rewards)
        td_error, q_func_loss = self._train_body(states, actions, next_states,
                                                 rewards, done, weights)

        tf.contrib.summary.scalar(name="QFuncLoss",
                                  tensor=q_func_loss,
                                  family="loss")

        # Remove following by using tf.global_step
        self.n_update += 1
        # Update target networks
        if self.n_update % self.target_replace_interval == 0:
            update_target_variables(self.q_func_target.weights,
                                    self.q_func.weights,
                                    tau=1.)

        return td_error
Example #19
    def __init__(self, env, params, **kwargs):
        """Initializes a DDPG agent"""

        super().__init__(name=params["agent"]["name"],
                         memory_capacity=params["agent"]["memory_capacity"],
                         n_warmup=params["agent"]["n_warmup"],
                         gpu=params["agent"]["gpu"],
                         batch_size=params["agent"]["batch_size"],
                         update_interval=params["agent"]["update_interval"],
                         **kwargs)

        # Define and initialize Actor network
        self.actor = Actor(state_shape=env.observation_space.shape,
                           action_space=env.action_space,
                           params=params)
        self.actor_target = Actor(state_shape=env.observation_space.shape,
                                  action_space=env.action_space,
                                  params=params)
        self.actor_optimizer = tf.keras.optimizers.Adam(
            learning_rate=params["agent"]["lr_actor"])
        update_target_variables(self.actor_target.weights,
                                self.actor.weights,
                                tau=1.)

        # Define and initialize Critic network
        self.critic = Critic(state_shape=env.observation_space.shape,
                             action_dim=env.action_space.high.size,
                             params=params)
        self.critic_target = Critic(state_shape=env.observation_space.shape,
                                    action_dim=env.action_space.high.size,
                                    params=params)
        self.critic_optimizer = tf.keras.optimizers.Adam(
            learning_rate=params["agent"]["lr_critic"])
        update_target_variables(self.critic_target.weights,
                                self.critic.weights,
                                tau=1.)

        # Set hyperparameters
        self.sigma = params["agent"]["sigma"]
        self.tau = params["agent"]["tau"]

        # in evaluation mode the action of the agent is deterministic, not stochastic.
        self.eval_mode = False
Example #20
    def _train_body(self, states, actions, next_states, rewards, done,
                    weights):
        with tf.device(self.device):
            with tf.GradientTape() as tape:
                td_error1, td_error2 = self._compute_td_error_body(
                    states, actions, next_states, rewards, done)
                critic_loss = (
                    tf.reduce_mean(
                        huber_loss(td_error1, delta=self.max_grad) * weights) +
                    tf.reduce_mean(
                        huber_loss(td_error2, delta=self.max_grad) * weights))

            critic_grad = tape.gradient(critic_loss,
                                        self.critic.trainable_variables)
            self.critic_optimizer.apply_gradients(
                zip(critic_grad, self.critic.trainable_variables))

            self._it.assign_add(1)
            with tf.GradientTape() as tape:
                next_actions = self.actor(states)
                actor_loss = -tf.reduce_mean(self.critic(states, next_actions))

            remainder = tf.math.mod(self._it, self._actor_update_freq)

            def optimize_actor():
                actor_grad = tape.gradient(actor_loss,
                                           self.actor.trainable_variables)
                return self.actor_optimizer.apply_gradients(
                    zip(actor_grad, self.actor.trainable_variables))

            tf.cond(pred=tf.equal(remainder, 0),
                    true_fn=optimize_actor,
                    false_fn=tf.no_op)
            # Update target networks
            update_target_variables(self.critic_target.weights,
                                    self.critic.weights, self.tau)
            update_target_variables(self.actor_target.weights,
                                    self.actor.weights, self.tau)

            return actor_loss, critic_loss, tf.abs(td_error1) + tf.abs(
                td_error2)
Example #21
    def __init__(self,
                 state_shape,
                 action_dim,
                 name="DDPG",
                 max_action=1.,
                 lr_actor=0.001,
                 lr_critic=0.001,
                 actor_units=(400, 300),
                 critic_units=(400, 300),
                 sigma=0.1,
                 tau=0.005,
                 n_warmup=int(1e4),
                 memory_capacity=int(1e6),
                 **kwargs):
        super().__init__(name=name,
                         memory_capacity=memory_capacity,
                         n_warmup=n_warmup,
                         **kwargs)

        # Define and initialize Actor network
        self.actor = Actor(state_shape, action_dim, max_action, actor_units)
        self.actor_target = Actor(state_shape, action_dim, max_action,
                                  actor_units)
        self.actor_optimizer = tf.keras.optimizers.Adam(learning_rate=lr_actor)
        update_target_variables(self.actor_target.weights,
                                self.actor.weights,
                                tau=1.)

        # Define and initialize Critic network
        self.critic = Critic(state_shape, action_dim, critic_units)
        self.critic_target = Critic(state_shape, action_dim, critic_units)
        self.critic_optimizer = tf.keras.optimizers.Adam(
            learning_rate=lr_critic)
        update_target_variables(self.critic_target.weights,
                                self.critic.weights,
                                tau=1.)

        # Set hyperparameters
        self.sigma = sigma
        self.tau = tau
Example #22
def set_weights_fn(policy, weights):
    actor_weights, critic_weights, critic_target_weights = weights
    update_target_variables(policy.actor.weights, actor_weights, tau=1.)
    update_target_variables(policy.critic.weights, critic_weights, tau=1.)
    update_target_variables(policy.critic_target.weights,
                            critic_target_weights,
                            tau=1.)
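This setter implies a matching getter that packs the same three weight lists in the same order. A minimal sketch is shown below; the name get_weights_fn is chosen here for illustration and is not taken from the source.

def get_weights_fn(policy):
    # Pack weights in the order expected by set_weights_fn above:
    # actor, critic, then critic target.
    return (policy.actor.weights,
            policy.critic.weights,
            policy.critic_target.weights)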
Example #23
File: dqn.py Project: Wshoway/tf2rl
    def __init__(self,
                 state_shape,
                 action_dim,
                 q_func=None,
                 name="DQN",
                 lr=0.001,
                 units=[32, 32],
                 epsilon=0.1,
                 n_warmup=int(1e4),
                 target_replace_interval=int(5e3),
                 memory_capacity=int(1e6),
                 enable_double_dqn=False,
                 enable_dueling_dqn=False,
                 **kwargs):
        super().__init__(name=name,
                         memory_capacity=memory_capacity,
                         n_warmup=n_warmup,
                         **kwargs)

        q_func = q_func if q_func is not None else QFunc
        # Define and initialize Q-function network
        self.q_func = q_func(state_shape, action_dim, units)
        self.q_func_target = q_func(state_shape, action_dim, units)
        self.q_func_optimizer = tf.train.AdamOptimizer(learning_rate=lr)
        update_target_variables(self.q_func_target.weights,
                                self.q_func.weights,
                                tau=1.)

        self._action_dim = action_dim

        # Set hyperparameters
        self.epsilon = epsilon
        self.target_replace_interval = target_replace_interval
        self.n_update = 0

        # DQN variants
        self._enable_double_dqn = enable_double_dqn
        self._enable_dueling_dqn = enable_dueling_dqn
Example #24
    def _train_body(self, states, actions, next_states, rewards, done,
                    weights):
        with tf.device(self.device):
            with tf.GradientTape() as tape:
                td_error1, td_error2 = self._compute_td_error_body(
                    states, actions, next_states, rewards, done)
                critic_loss = tf.reduce_mean(
                    tf.square(td_error1) * weights * 0.5 + \
                    tf.square(td_error2) * weights * 0.5)

            critic_grad = tape.gradient(critic_loss,
                                        self.critic.trainable_variables)
            self.critic_optimizer.apply_gradients(
                zip(critic_grad, self.critic.trainable_variables))

            actor_loss = None
            # TODO: Update actor and target networks at specified frequency
            # tf.assign(self._it, self._it+1)
            # if tf.mod(self._it, self._actor_update_freq) == 0:
            with tf.GradientTape() as tape:
                next_actions = self.actor(states)
                actor_loss = -tf.reduce_mean(
                    self.critic([states, next_actions]))

            actor_grad = tape.gradient(actor_loss,
                                       self.actor.trainable_variables)
            self.actor_optimizer.apply_gradients(
                zip(actor_grad, self.actor.trainable_variables))

            # Update target networks
            update_target_variables(self.critic_target.weights,
                                    self.critic.weights, self.tau)
            update_target_variables(self.actor_target.weights,
                                    self.actor.weights, self.tau)

            return actor_loss, critic_loss, tf.abs(td_error1) + tf.abs(
                td_error2)
Example #25
    def _train_body(self,
                    states,
                    actions,
                    next_states,
                    rewards,
                    done,
                    weights=None):
        with tf.device(self.device):
            batch_size = states.shape[0]
            not_dones = 1. - tf.cast(done, dtype=tf.float32)
            actions = tf.cast(actions, dtype=tf.int32)

            indices = tf.concat(
                values=[tf.expand_dims(tf.range(batch_size), axis=1), actions],
                axis=1)

            with tf.GradientTape(persistent=True) as tape:
                # Compute critic loss
                _, _, next_action_param = self.actor(next_states)
                next_action_prob = next_action_param["prob"]
                next_action_logp = tf.math.log(next_action_prob + 1e-8)
                next_q = tf.minimum(self.qf1_target(next_states),
                                    self.qf2_target(next_states))

                target_q = tf.expand_dims(tf.einsum(
                    'ij,ij->i', next_action_prob,
                    next_q - self.alpha * next_action_logp),
                                          axis=1)  # Eq.(10)
                target_q = tf.stop_gradient(rewards + not_dones *
                                            self.discount * target_q)

                current_q1 = self.qf1(states)
                current_q2 = self.qf2(states)

                td_loss1 = tf.reduce_mean(
                    huber_loss(target_q - tf.expand_dims(
                        tf.gather_nd(current_q1, indices), axis=1),
                               delta=self.max_grad))
                td_loss2 = tf.reduce_mean(
                    huber_loss(target_q - tf.expand_dims(
                        tf.gather_nd(current_q2, indices), axis=1),
                               delta=self.max_grad))  # Eq.(7)

                # Compute actor loss
                _, _, current_action_param = self.actor(states)
                current_action_prob = current_action_param["prob"]
                current_action_logp = tf.math.log(current_action_prob + 1e-8)

                policy_loss = tf.reduce_mean(
                    tf.einsum(
                        'ij,ij->i', current_action_prob,
                        self.alpha * current_action_logp - tf.stop_gradient(
                            tf.minimum(current_q1, current_q2))))  # Eq.(12)
                mean_ent = tf.reduce_mean(
                    tf.einsum('ij,ij->i', current_action_prob,
                              current_action_logp)) * (-1)

            q1_grad = tape.gradient(td_loss1, self.qf1.trainable_variables)
            self.qf1_optimizer.apply_gradients(
                zip(q1_grad, self.qf1.trainable_variables))
            q2_grad = tape.gradient(td_loss2, self.qf2.trainable_variables)
            self.qf2_optimizer.apply_gradients(
                zip(q2_grad, self.qf2.trainable_variables))

            update_target_variables(self.qf1_target.weights,
                                    self.qf1.weights,
                                    tau=self.tau)
            update_target_variables(self.qf2_target.weights,
                                    self.qf2.weights,
                                    tau=self.tau)

            actor_grad = tape.gradient(policy_loss,
                                       self.actor.trainable_variables)
            self.actor_optimizer.apply_gradients(
                zip(actor_grad, self.actor.trainable_variables))

        return (td_loss1 + td_loss2) / 2., policy_loss, mean_ent, \
               tf.reduce_min(current_action_logp), tf.reduce_max(current_action_logp)
Example #26
    def __init__(self,
                 state_shape,
                 action_dim,
                 q_func=None,
                 name="DQN",
                 lr=0.001,
                 units=[32, 32],
                 epsilon=0.1,
                 epsilon_min=None,
                 epsilon_decay_step=int(1e6),
                 n_warmup=int(1e4),
                 target_replace_interval=int(5e3),
                 memory_capacity=int(1e6),
                 optimizer=None,
                 enable_double_dqn=False,
                 enable_dueling_dqn=False,
                 enable_noisy_dqn=False,
                 enable_categorical_dqn=False,
                 **kwargs):
        super().__init__(name=name,
                         memory_capacity=memory_capacity,
                         n_warmup=n_warmup,
                         **kwargs)

        q_func = q_func if q_func is not None else QFunc
        # Define and initialize Q-function network
        kwargs_dqn = {
            "state_shape": state_shape,
            "action_dim": action_dim,
            "units": units,
            "enable_dueling_dqn": enable_dueling_dqn,
            "enable_noisy_dqn": enable_noisy_dqn,
            "enable_categorical_dqn": enable_categorical_dqn
        }
        self.q_func = q_func(**kwargs_dqn)
        self.q_func_target = q_func(**kwargs_dqn)
        self.q_func_optimizer = optimizer if optimizer is not None else \
            tf.keras.optimizers.Adam(learning_rate=lr)
        update_target_variables(self.q_func_target.weights,
                                self.q_func.weights,
                                tau=1.)

        self._action_dim = action_dim
        # This is used to check if input state to `get_action` is multiple (batch) or single
        self._state_ndim = np.array(state_shape).shape[0]

        # Distributional DQN
        if enable_categorical_dqn:
            self._v_max, self._v_min = 10., -10.
            self._delta_z = (self._v_max - self._v_min) / \
                (self.q_func._n_atoms - 1)
            self._z_list = tf.constant([
                self._v_min + i * self._delta_z
                for i in range(self.q_func._n_atoms)
            ],
                                       dtype=tf.float32)
            self._z_list_broadcasted = tf.tile(
                tf.reshape(self._z_list, [1, self.q_func._n_atoms]),
                tf.constant([self._action_dim, 1]))

        # Set hyper-parameters
        if epsilon_min is not None and not enable_noisy_dqn:
            assert epsilon > epsilon_min
            self.epsilon_min = epsilon_min
            self.epsilon_decay_rate = (epsilon -
                                       epsilon_min) / epsilon_decay_step
            self.epsilon = max(
                epsilon - self.epsilon_decay_rate * self.n_warmup,
                self.epsilon_min)
        else:
            epsilon = epsilon if not enable_noisy_dqn else 0.
            self.epsilon = epsilon
            self.epsilon_min = epsilon
            self.epsilon_decay_rate = 0.
        self.target_replace_interval = target_replace_interval
        self.n_update = 0

        # DQN variants
        self._enable_double_dqn = enable_double_dqn
        self._enable_noisy_dqn = enable_noisy_dqn
        self._enable_categorical_dqn = enable_categorical_dqn
Example #27
    def _train_body(self, states, actions, next_states, rewards, dones, weights):
        with tf.device(self.device):
            if tf.rank(rewards) == 2:
                rewards = tf.squeeze(rewards, axis=1)
            not_dones = 1. - tf.cast(dones, dtype=tf.float32)

            with tf.GradientTape(persistent=True) as tape:
                # Compute loss of critic Q
                current_q1 = self.qf1([states, actions])
                current_q2 = self.qf2([states, actions])
                vf_next_target = self.vf_target(next_states)

                target_q = tf.stop_gradient(
                    rewards + not_dones * self.discount * vf_next_target)

                td_loss_q1 = tf.reduce_mean(huber_loss(
                    target_q - current_q1, delta=self.max_grad) * weights)
                td_loss_q2 = tf.reduce_mean(huber_loss(
                    target_q - current_q2, delta=self.max_grad) * weights)  # Eq.(7)

                # Compute loss of critic V
                current_v = self.vf(states)

                sample_actions, logp, _ = self.actor(states)  # Resample actions to update V
                current_q1 = self.qf1([states, sample_actions])
                current_q2 = self.qf2([states, sample_actions])
                current_min_q = tf.minimum(current_q1, current_q2)

                target_v = tf.stop_gradient(
                    current_min_q - self.alpha * logp)
                td_errors = target_v - current_v
                td_loss_v = tf.reduce_mean(
                    huber_loss(td_errors, delta=self.max_grad) * weights)  # Eq.(5)

                # Compute loss of policy
                policy_loss = tf.reduce_mean(
                    (self.alpha * logp - current_min_q) * weights)  # Eq.(12)

                # Compute loss of temperature parameter for entropy
                if self.auto_alpha:
                    alpha_loss = -tf.reduce_mean(
                        (self.log_alpha * tf.stop_gradient(logp + self.target_alpha)))

            q1_grad = tape.gradient(td_loss_q1, self.qf1.trainable_variables)
            self.qf1_optimizer.apply_gradients(
                zip(q1_grad, self.qf1.trainable_variables))
            q2_grad = tape.gradient(td_loss_q2, self.qf2.trainable_variables)
            self.qf2_optimizer.apply_gradients(
                zip(q2_grad, self.qf2.trainable_variables))

            vf_grad = tape.gradient(td_loss_v, self.vf.trainable_variables)
            self.vf_optimizer.apply_gradients(
                zip(vf_grad, self.vf.trainable_variables))
            update_target_variables(
                self.vf_target.weights, self.vf.weights, self.tau)

            actor_grad = tape.gradient(
                policy_loss, self.actor.trainable_variables)
            self.actor_optimizer.apply_gradients(
                zip(actor_grad, self.actor.trainable_variables))

            if self.auto_alpha:
                alpha_grad = tape.gradient(alpha_loss, [self.log_alpha])
                self.alpha_optimizer.apply_gradients(
                    zip(alpha_grad, [self.log_alpha]))
                self.alpha.assign(tf.exp(self.log_alpha))

            del tape

        return td_errors, policy_loss, td_loss_v, td_loss_q1, tf.reduce_min(logp), tf.reduce_max(logp), tf.reduce_mean(logp)
Example #28
    def _train_body(self,
                    states,
                    actions,
                    next_states,
                    rewards,
                    done,
                    weights=None):
        with tf.device(self.device):
            rewards = tf.squeeze(rewards, axis=1)
            not_done = 1. - tf.cast(done, dtype=tf.float32)

            # Update Critic
            with tf.GradientTape(persistent=True) as tape:
                current_Q1 = self.qf1([states, actions])
                current_Q2 = self.qf2([states, actions])
                vf_next_target = self.vf_target(next_states)

                target_Q = tf.stop_gradient(self.scale_reward * rewards +
                                            not_done * self.discount *
                                            vf_next_target)

                td_loss1 = tf.reduce_mean(
                    huber_loss(target_Q - current_Q1, delta=self.max_grad))
                td_loss2 = tf.reduce_mean(
                    huber_loss(target_Q - current_Q2, delta=self.max_grad))

            q1_grad = tape.gradient(td_loss1, self.qf1.trainable_variables)
            self.qf1_optimizer.apply_gradients(
                zip(q1_grad, self.qf1.trainable_variables))
            q2_grad = tape.gradient(td_loss2, self.qf2.trainable_variables)
            self.qf2_optimizer.apply_gradients(
                zip(q2_grad, self.qf2.trainable_variables))

            del tape

            with tf.GradientTape(persistent=True) as tape:
                current_V = self.vf(states)
                sample_actions, logp = self.actor(states)

                current_Q1 = self.qf1([states, sample_actions])
                current_Q2 = self.qf2([states, sample_actions])
                current_Q = tf.minimum(current_Q1, current_Q2)

                target_V = tf.stop_gradient(current_Q - logp)
                td_errors = target_V - current_V
                vf_loss_t = tf.reduce_mean(
                    huber_loss(td_errors, delta=self.max_grad) * weights)

                # TODO: Add regularizer
                policy_loss = tf.reduce_mean(logp - current_Q1)

            vf_grad = tape.gradient(vf_loss_t, self.vf.trainable_variables)
            self.vf_optimizer.apply_gradients(
                zip(vf_grad, self.vf.trainable_variables))
            update_target_variables(self.vf_target.weights, self.vf.weights,
                                    self.tau)

            actor_grad = tape.gradient(policy_loss,
                                       self.actor.trainable_variables)
            self.actor_optimizer.apply_gradients(
                zip(actor_grad, self.actor.trainable_variables))

            del tape

        return td_errors, policy_loss, vf_loss_t, td_loss1, tf.reduce_min(
            logp), tf.reduce_max(logp)
Example #29
File: dqn.py Project: ymd-h/tf2rl
    def __init__(
            self,
            state_shape,
            action_dim,
            q_func=None,
            name="DQN",
            lr=0.001,
            adam_eps=1e-07,
            units=(32, 32),
            epsilon=0.1,
            epsilon_min=None,
            epsilon_decay_step=int(1e6),
            n_warmup=int(1e4),
            target_replace_interval=int(5e3),
            memory_capacity=int(1e6),
            enable_double_dqn=False,
            enable_dueling_dqn=False,
            enable_noisy_dqn=False,
            optimizer=None,
            **kwargs):
        """
        Initialize DQN agent

        Args:
            state_shape (iterable of int): Observation space shape
            action_dim (int): Dimension of discrete action
            q_func (QFunc): Custom Q function class. If ``None`` (default), Q function is constructed with ``QFunc``.
            name (str): Name of agent. The default is ``"DQN"``
            lr (float): Learning rate. The default is ``0.001``.
            adam_eps (float): Epsilon for Adam. The default is ``1e-7``
            units (iterable of int): Units of hidden layers. The default is ``(32, 32)``
            epsilon (float): Initial epsilon for epsilon-greedy exploration. The default is ``0.1``
            epsilon_min (float): Minimum epsilon after decay.
            epsilon_decay_step (int): Number of steps over which epsilon decays. The default is ``1e6``
            n_warmup (int): Number of warmup steps before training. The default is ``1e4``
            target_replace_interval (int): Number of steps between target network updates. The default is ``5e3``
            memory_capacity (int): Size of replay buffer. The default is ``1e6``
            enable_double_dqn (bool): Whether to use Double DQN. The default is ``False``
            enable_dueling_dqn (bool): Whether to use a Dueling network. The default is ``False``
            enable_noisy_dqn (bool): Whether to use a noisy network. The default is ``False``
            optimizer (tf.keras.optimizers.Optimizer): Custom optimizer
            batch_size (int): Batch size. The default is ``256``.
            discount (float): Discount factor. The default is ``0.99``.
            max_grad (float): Maximum gradient. The default is ``10``.
            gpu (int): GPU id. ``-1`` disables GPU. The default is ``0``.
        """
        super().__init__(name=name, memory_capacity=memory_capacity, n_warmup=n_warmup, **kwargs)

        q_func = q_func if q_func is not None else QFunc
        # Define and initialize Q-function network
        kwargs_dqn = {
            "state_shape": state_shape,
            "action_dim": action_dim,
            "units": units,
            "enable_dueling_dqn": enable_dueling_dqn,
            "enable_noisy_dqn": enable_noisy_dqn}
        self.q_func = q_func(**kwargs_dqn)
        self.q_func_target = q_func(**kwargs_dqn)
        self.q_func_optimizer = optimizer or tf.keras.optimizers.Adam(learning_rate=lr, epsilon=adam_eps)
        update_target_variables(self.q_func_target.weights,
                                self.q_func.weights, tau=1.)

        self._action_dim = action_dim
        # This is used to check if input state to `get_action` is multiple (batch) or single
        self._state_ndim = np.array(state_shape).shape[0]

        # Set hyper-parameters
        if epsilon_min is not None and not enable_noisy_dqn:
            assert epsilon > epsilon_min
            self.epsilon_min = epsilon_min
            self.epsilon_decay_rate = (epsilon - epsilon_min) / epsilon_decay_step
            self.epsilon = max(epsilon - self.epsilon_decay_rate * self.n_warmup,
                               self.epsilon_min)
        else:
            epsilon = epsilon if not enable_noisy_dqn else 0.
            self.epsilon = epsilon
            self.epsilon_min = epsilon
            self.epsilon_decay_rate = 0.
        self.target_replace_interval = target_replace_interval
        self.n_update = 0

        # DQN variants
        self._enable_double_dqn = enable_double_dqn
        self._enable_noisy_dqn = enable_noisy_dqn
Example #30
    def _train_body(self, states, actions, next_states, rewards, dones,
                    weights):
        with tf.device(self.device):
            batch_size = states.shape[0]
            not_dones = 1. - tf.cast(dones, dtype=tf.float32)
            actions = tf.cast(actions, dtype=tf.int32)

            indices = tf.concat(
                values=[tf.expand_dims(tf.range(batch_size), axis=1), actions],
                axis=1)

            with tf.GradientTape(persistent=True) as tape:
                # Compute critic loss
                next_action_prob = self.actor(next_states)
                next_action_logp = tf.math.log(next_action_prob + 1e-8)
                next_q = tf.minimum(self.qf1_target(next_states),
                                    self.qf2_target(next_states))

                # Compute state value function V by directly computing the expectation
                target_q = tf.expand_dims(tf.einsum(
                    'ij,ij->i', next_action_prob,
                    next_q - self.alpha * next_action_logp),
                                          axis=1)  # Eq.(10)
                target_q = tf.stop_gradient(rewards + not_dones *
                                            self.discount * target_q)

                current_q1 = self.qf1(states)

                current_q2 = self.qf2(states)

                td_loss1 = tf.reduce_mean(
                    huber_loss(target_q - tf.expand_dims(
                        tf.gather_nd(current_q1, indices), axis=1),
                               delta=self.max_grad) * weights)
                td_loss2 = tf.reduce_mean(
                    huber_loss(target_q - tf.expand_dims(
                        tf.gather_nd(current_q2, indices), axis=1),
                               delta=self.max_grad) * weights)  # Eq.(7)

                # Compute actor loss
                current_action_prob = self.actor(states)
                current_action_logp = tf.math.log(current_action_prob + 1e-8)

                policy_loss = tf.reduce_mean(
                    tf.einsum(
                        'ij,ij->i', current_action_prob,
                        self.alpha * current_action_logp -
                        tf.stop_gradient(tf.minimum(current_q1, current_q2))) *
                    weights)  # Eq.(12)
                mean_ent = tf.reduce_mean(
                    tf.einsum('ij,ij->i', current_action_prob,
                              current_action_logp)) * (-1)

                if self.auto_alpha:
                    alpha_loss = -tf.reduce_mean(
                        (self.log_alpha *
                         tf.stop_gradient(current_action_logp +
                                          self.target_alpha)))

            q1_grad = tape.gradient(td_loss1, self.qf1.trainable_variables)
            self.qf1_optimizer.apply_gradients(
                zip(q1_grad, self.qf1.trainable_variables))
            q2_grad = tape.gradient(td_loss2, self.qf2.trainable_variables)
            self.qf2_optimizer.apply_gradients(
                zip(q2_grad, self.qf2.trainable_variables))

            if self.target_hard_update:
                if self.n_training % self.target_update_interval == 0:
                    update_target_variables(self.qf1_target.weights,
                                            self.qf1.weights,
                                            tau=1.)
                    update_target_variables(self.qf2_target.weights,
                                            self.qf2.weights,
                                            tau=1.)
            else:
                update_target_variables(self.qf1_target.weights,
                                        self.qf1.weights,
                                        tau=self.tau)
                update_target_variables(self.qf2_target.weights,
                                        self.qf2.weights,
                                        tau=self.tau)

            actor_grad = tape.gradient(policy_loss,
                                       self.actor.trainable_variables)
            self.actor_optimizer.apply_gradients(
                zip(actor_grad, self.actor.trainable_variables))

            if self.auto_alpha:
                alpha_grad = tape.gradient(alpha_loss, [self.log_alpha])
                self.alpha_optimizer.apply_gradients(
                    zip(alpha_grad, [self.log_alpha]))
                self.alpha.assign(tf.exp(self.log_alpha))

        return (td_loss1 + td_loss2) / 2., policy_loss, mean_ent, \
            tf.reduce_min(current_action_logp), tf.reduce_max(current_action_logp), \
            tf.reduce_mean(current_action_logp)