Example 1
    def _build_l(self, name="lyapunov_critic", trainable=True, seed=None):
        """Sets up the Lyapunov Critic network.

        Args:
            name (str, optional): Network name. Defaults to "lyapunov_critic".

            trainable (bool, optional): Whether the weights of the network layers should
                be trainable. Defaults to True.

            seed (int, optional): The seed used for the weight initialization. Defaults
                to None.

        Returns:
            LyapunovCritic: The Lyapunov Critic network.
        """

        # Return the Lyapunov Critic
        # TODO: Check if trainable is needed
        return LyapunovCritic(
            obs_dim=self.s_dim,
            act_dim=self.a_dim,
            hidden_sizes=self.network_structure["critic"],
            name=name,
            trainable=trainable,
            seed=seed,
        )
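
A minimal, hypothetical usage sketch (it assumes an agent instance that, like the class this method belongs to, defines `s_dim`, `a_dim` and `network_structure["critic"]`; the argument values are illustrative only):

    # e.g. inside the agent's network-construction code:
    self.lc = self._build_l(name="lyapunov_critic", trainable=True, seed=0)
    self.lc_ = self._build_l(name="lyapunov_critic_target", trainable=False, seed=0)
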
Example 2
class LAC(tf.Module):
    """The Lyapunov actor critic.

    Attributes:
        ga (tf.keras.Model): The Squashed Gaussian Actor network.

        ga_ (tf.keras.Model): The Squashed Gaussian Actor target network.

        lc (tf.keras.Model): The Lyapunov Critic network.

        lc_ (tf.keras.Model): The Lyapunov Critic target network.

        q_1 (tf.keras.Model): The first Q-Critic network.

        q_2 (tf.keras.Model): The second Q-Critic network.

        q_1_ (tf.keras.Model): The first Q-Critic target network.

        q_2_ (tf.keras.Model): The second Q-Critic target network.

        log_alpha (tf.Variable): The temperature Lagrange multiplier.

        log_labda (tf.Variable): The Lyapunov Lagrange multiplier.

        target_entropy (int): The target entropy.

        device (str): The device the networks are placed on (CPU or GPU).

        use_lyapunov (bool): Whether the Lyapunov Critic is used (use_lyapunov=True) or
            the regular Q-critic (use_lyapunov=False).
    """
    def __init__(self, a_dim, s_dim, act_limits=None):
        """Initiates object state.

        Args:
            a_dim (int): Action space dimension.

            s_dim (int): Observation space dimension.

            act_limits (dict, optional): The "high" and "low" action bounds of the
                environment. Used for rescaling the actions that come out of the
                network from (-1, 1) to (low, high). Defaults to (-1, 1).
        """

        # Display information about the algorithm being used (LAC or SAC)
        if ALG_PARAMS["use_lyapunov"]:
            print(
                colorize("INFO: You are using the LAC algorithm.",
                         "green",
                         bold=True))
        else:
            print(
                colorize("WARN: You are using the SAC algorithm.",
                         "yellow",
                         bold=True))

        # Save action and observation space as members
        self._a_dim = a_dim
        self._s_dim = s_dim
        self._act_limits = act_limits

        # Save algorithm parameters as class objects
        self.use_lyapunov = ALG_PARAMS["use_lyapunov"]
        self._network_structure = ALG_PARAMS["network_structure"]
        self._polyak = 1 - ALG_PARAMS["tau"]
        self._gamma = ALG_PARAMS["gamma"]
        self._alpha_3 = ALG_PARAMS["alpha3"]

        # Determine target entropy
        # NOTE (rickstaa): If not defined we use the Lower bound of the policy entropy
        if ALG_PARAMS["target_entropy"] is None:
            self.target_entropy = -self._a_dim
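            # e.g. for a 3-dimensional action space this heuristic gives a
            # target entropy of -3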
        else:
            self.target_entropy = ALG_PARAMS["target_entropy"]

        # Create Learning rate placeholders
        self._lr_a = tf.Variable(ALG_PARAMS["lr_a"], name="LR_A")
        if self.use_lyapunov:
            self._lr_lag = tf.Variable(ALG_PARAMS["lr_a"], name="LR_lag")
            self._lr_l = tf.Variable(ALG_PARAMS["lr_l"], name="LR_L")
        else:
            self._lr_c = tf.Variable(ALG_PARAMS["lr_c"], name="LR_C")

        # Make sure alpha and labda are not zero
        # NOTE (rickstaa): This is needed to prevent log_alpha/log_lambda from becoming
        # -np.inf
        ALG_PARAMS["alpha"] = (1e-37 if ALG_PARAMS["alpha"] == 0.0 else
                               ALG_PARAMS["alpha"])
        ALG_PARAMS["labda"] = (1e-37 if ALG_PARAMS["labda"] == 0.0 else
                               ALG_PARAMS["labda"])

        # Create placeholders for the Lagrange multipliers
        self.log_alpha = tf.Variable(tf.math.log(ALG_PARAMS["alpha"]),
                                     name="log_alpha")
        if self.use_lyapunov:
            self.log_labda = tf.Variable(tf.math.log(ALG_PARAMS["labda"]),
                                         name="log_lambda")

        # Create Gaussian Actor (GA) and Lyapunov critic (LC) or Q-Critic (QC) networks
        self.ga = SquashedGaussianActor(
            obs_dim=self._s_dim,
            act_dim=self._a_dim,
            hidden_sizes=self._network_structure["actor"],
            act_limits=self.act_limits,
        )
        if self.use_lyapunov:
            self.lc = LyapunovCritic(
                obs_dim=self._s_dim,
                act_dim=self._a_dim,
                hidden_sizes=self._network_structure["critic"],
            )
        else:
            # NOTE (rickstaa): We create two Q-critics so we can use the Clipped
            # double-Q trick.
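            # (In this implementation the "clipping" ends up being a maximum over
            # the two target Q-values, see learn() below, since the return is
            # minimized rather than maximized.)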
            self.q_1 = QCritic(
                obs_dim=self._s_dim,
                act_dim=self._a_dim,
                hidden_sizes=self._network_structure["q_critic"],
            )
            self.q_2 = QCritic(
                obs_dim=self._s_dim,
                act_dim=self._a_dim,
                hidden_sizes=self._network_structure["q_critic"],
            )

        # Create GA, LC and QC target networks
        # Don't get optimized but get updated according to the EMA of the main
        # networks
        self.ga_ = SquashedGaussianActor(
            obs_dim=self._s_dim,
            act_dim=self._a_dim,
            hidden_sizes=self._network_structure["actor"],
            act_limits=self.act_limits,
        )
        if self.use_lyapunov:
            self.lc_ = LyapunovCritic(
                obs_dim=self._s_dim,
                act_dim=self._a_dim,
                hidden_sizes=self._network_structure["critic"],
            )
        else:
            self.q_1_ = QCritic(
                obs_dim=self._s_dim,
                act_dim=self._a_dim,
                hidden_sizes=self._network_structure["q_critic"],
            )
            self.q_2_ = QCritic(
                obs_dim=self._s_dim,
                act_dim=self._a_dim,
                hidden_sizes=self._network_structure["q_critic"],
            )

        self._init_targets()

        # Create optimizers
        # NOTE (rickstaa): We here optimize for log_alpha and log_labda instead of
        # alpha and labda because it is more numerically stable (see:
        # https://github.com/rail-berkeley/softlearning/issues/136)
        self._a_train = tf.keras.optimizers.Adam(learning_rate=self._lr_a)
        self._alpha_train = tf.keras.optimizers.Adam(learning_rate=self._lr_a)
        if self.use_lyapunov:
            self._lambda_train = tf.keras.optimizers.Adam(
                learning_rate=self._lr_lag)
            self._l_train = tf.keras.optimizers.Adam(learning_rate=self._lr_l)
        else:
            self._main_q_vars = (self.q_1.trainable_variables +
                                 self.q_2.trainable_variables
                                 )  # Chain parameters of the two Q-critics
            self._q_train = tf.keras.optimizers.Adam(learning_rate=self._lr_c)

        # Create model save dict
        if self.use_lyapunov:
            self._save_dict = {
                "gaussian_actor": self.ga,
                "lyapunov_critic": self.lc,
                "log_alpha": self.log_alpha,
                "log_labda": self.log_labda,
                "use_lyapunov": self.use_lyapunov,
            }
        else:
            self._save_dict = {
                "gaussian_actor": self.ga,
                "q_critic_1": self.q_1,
                "q_critic_2": self.q_2,
                "log_alpha": self.log_alpha,
                "use_lyapunov": self.use_lyapunov,
            }

    @tf.function
    def choose_action(self, s, evaluation=False):
        """Returns the current action of the policy.

        Args:
            s (numpy.ndarray): The current state.

            evaluation (bool, optional): Whether to return a deterministic action.
                Defaults to False.

        Returns:
            numpy.ndarray: The current action.
        """

        # Make sure s is float32 tensorflow tensor
        if not isinstance(s, tf.Tensor):
            s = tf.convert_to_tensor(s, dtype=tf.float32)
        elif s.dtype != tf.float32:
            s = tf.cast(s, dtype=tf.float32)

        # Get current best action
        if evaluation is True:
            try:
                det_a, _ = self.ga(tf.reshape(s, (1, -1)), deterministic=True)
                return det_a[0]
            except ValueError:
                return
        else:
            a, _ = self.ga(tf.reshape(s, (1, -1)))
            return a[0]

    @tf.function
    def learn(self, lr_a, lr_l, lr_lag, lr_c, batch):
        """Runs SGD to update all the optimizable parameters.

        Args:
            lr_a (float): Current actor learning rate.

            lr_l (float): Lyapunov critic learning rate.

            lr_c (float): Q-Critic learning rate.

            lr_lag (float): Lyapunov constraint Lagrange multiplier learning rate.

            batch (dict): The batch of experiences.

        Returns:
            tuple: Tuple with some diagnostics about the training.
        """

        # Adjust optimizer learning rates (decay)
        self._set_learning_rates(lr_a=lr_a,
                                 lr_alpha=lr_a,
                                 lr_l=lr_l,
                                 lr_labda=lr_lag,
                                 lr_c=lr_c)

        ################################################
        # Optimize (Lyapunov/Q) critic #################
        ################################################
        if self.use_lyapunov:
            # Get target Lyapunov value (Bellman-backup)
            a2_, _ = self.ga_(
                batch["s_"]
            )  # NOTE (rickstaa): Target actions come from *current* *target* policy
            l_pi_targ = self.lc_(batch["s_"], a2_)
            l_backup = (batch["r"] + self._gamma *
                        (1 - batch["terminal"]) * l_pi_targ
                        )  # The Lyapunov candidate
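            # i.e. L_backup = r + gamma * (1 - d) * L'(s', a'),
            #      with a' sampled from the target policy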

            # Compute Lyapunov Critic error gradients
            with tf.GradientTape() as l_tape:

                # Get current Lyapunov value
                l1 = self.lc(batch["s"], batch["a"])

                # Calculate Lyapunov *CRITIC* error
                # NOTE (rickstaa): The 0.5 multiplication factor was added to make the
                # derivation cleaner and can be safely removed without influencing the
                # minimization. We kept it here for consistency.
                l_error = 0.5 * tf.reduce_mean((l1 - l_backup)**2)  # See eq. 7

            # Perform one gradient descent step for the Lyapunov critic
            l_grads = l_tape.gradient(l_error, self.lc.trainable_variables)
            self._l_train.apply_gradients(
                zip(l_grads, self.lc.trainable_variables))
        else:

            # Get target Q values (Bellman-backup)
            # NOTE (rickstaa): Here we use max-clipping instead of min-clipping used
            # in the SAC algorithm since we want to minimize the return.
            a2, logp_a2 = self.ga(
                batch["s_"]
            )  # NOTE (rickstaa): Target actions come from *current* policy
            q1_pi_targ = self.q_1_(batch["s_"], a2)
            q2_pi_targ = self.q_2_(batch["s_"], a2)
            q_pi_targ = tf.maximum(
                q1_pi_targ, q2_pi_targ
            )  # Use max clipping  to prevent overestimation bias.
            q_backup = batch["r"] + self._gamma * (1 - batch["terminal"]) * (
                q_pi_targ - self.alpha * logp_a2)
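            # i.e. Q_backup = r + gamma * (1 - d)
            #        * (max(Q1', Q2')(s', a') - alpha * log pi(a'|s')),
            #      with a' sampled from the current policy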

            # Compute the Q-Critic loss gradients
            with tf.GradientTape() as q_tape:
                # Get the current Q values
                q1 = self.q_1(batch["s"], batch["a"])
                q2 = self.q_2(batch["s"], batch["a"])

                # Calculate Q-critic loss
                loss_q1 = 0.5 * tf.reduce_mean(
                    (q1 - q_backup)**2)  # See Haarnoja eq. 5
                loss_q2 = 0.5 * tf.reduce_mean((q2 - q_backup)**2)
                loss_q = loss_q1 + loss_q2

            # Perform one gradient descent step for the Q-critic
            q_grads = q_tape.gradient(loss_q, self._main_q_vars)
            self._q_train.apply_gradients(zip(q_grads, self._main_q_vars))

        ################################################
        # Optimize Gaussian actor ######################
        ################################################

        # Compute actor loss gradients
        with tf.GradientTape() as a_tape:

            # Retrieve log probabilities of batch observations based on *current*
            # policy
            pi, log_pis = self.ga(batch["s"])

            # Compute actor loss
            if self.use_lyapunov:
                # Calculate the target Lyapunov value
                a2, _ = self.ga(
                    batch["s_"]
                )  # NOTE (rickstaa): Target actions come from *current* policy
                lya_l_ = self.lc(batch["s_"], a2)

                # Compute Lyapunov Actor error
                self.l_delta = tf.reduce_mean(
                    lya_l_ - tf.stop_gradient(l1) +
                    self._alpha_3 * batch["r"])  # See eq. 11
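                # i.e. Delta_L = E[L(s', a'(s')) - L(s, a) + alpha_3 * r],
                #      with a'(s') sampled from the current policy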

                # Calculate actor loss
                a_loss = tf.stop_gradient(
                    self.labda) * self.l_delta + tf.stop_gradient(
                        self.alpha) * tf.reduce_mean(log_pis)  # See eq. 12
            else:

                # Retrieve the current Q values
                q1_pi = self.q_1(
                    batch["s"],
                    pi)  # NOTE (rickstaa): Actions come from *current* policy
                q2_pi = self.q_2(
                    batch["s"],
                    pi)  # NOTE (rickstaa): Actions come from *current* policy
                q_pi = tf.maximum(q1_pi, q2_pi)

                # Calculate actor loss
                a_loss = tf.reduce_mean(
                    tf.stop_gradient(self.alpha) * log_pis -
                    q_pi)  # See Haarnoja eq. 7

        # Perform one gradient descent step for the Gaussian Actor
        a_grads = a_tape.gradient(a_loss, self.ga.trainable_variables)
        self._a_train.apply_gradients(zip(a_grads,
                                          self.ga.trainable_variables))

        ################################################
        # Optimize alpha (Entropy temperature) #########
        ################################################

        # Compute alpha loss gradients
        with tf.GradientTape() as alpha_tape:

            # Calculate alpha loss
            alpha_loss = -tf.reduce_mean(self.alpha * tf.stop_gradient(
                log_pis + self.target_entropy))  # See Haarnoja eq. 17

        # Perform one gradient descent step for alpha
        alpha_grads = alpha_tape.gradient(alpha_loss, [self.log_alpha])
        self._alpha_train.apply_gradients(zip(alpha_grads, [self.log_alpha]))

        ################################################
        # Optimize labda (Lyapunov temperature) ########
        ################################################
        if self.use_lyapunov:

            # Compute labda loss gradients
            with tf.GradientTape() as lambda_tape:

                # Calculate labda loss
                # NOTE (rickstaa): Log_labda is used in the lambda_loss function
                # because using lambda caused the gradients to vanish. This happens
                # because we restrict lambda to the [0, 1] range using the clipping
                # function (see #38). Using log_lambda is also more numerically
                # stable.
                labda_loss = -tf.reduce_mean(self.log_labda * tf.stop_gradient(
                    self.l_delta))  # See formulas under eq. 14

            # Perform one gradient descent step for labda
            lambda_grads = lambda_tape.gradient(labda_loss, [self.log_labda])
            self._lambda_train.apply_gradients(
                zip(lambda_grads, [self.log_labda]))

        ################################################
        # Update target networks and return ############
        # diagnostics. #################################
        ################################################

        # Update target networks
        self._update_targets()

        # Return diagnostics
        if self.use_lyapunov:
            return (
                self.labda,
                self.alpha,
                l_error,
                tf.reduce_mean(tf.stop_gradient(-log_pis)),
                a_loss,
                alpha_loss,
                labda_loss,
            )
        else:
            return (
                self.alpha,
                loss_q,
                tf.reduce_mean(tf.stop_gradient(-log_pis)),
                a_loss,
                alpha_loss,
            )

    def save_result(self, path):
        """Saves current policy.

        Args:
            path (str): The path where you want to save the policy.
        """

        # Make save path absolute
        save_path = osp.abspath(osp.join(path, "policy"))

        # Create folder if it does not yet exist
        if not os.path.exists(save_path):
            os.makedirs(save_path)

        # Save all models/tensors in the _save_dict
        vars_dict = {}
        for name, item in self._save_dict.items():
            if issubclass(item.__class__, tf.keras.Model):
                item.save_weights(osp.join(save_path, name))
                print(
                    colorize(
                        f"Saved '{name}' weights to path: {save_path}",
                        "cyan",
                        bold=True,
                    ))
            elif issubclass(item.__class__, tf.Variable):
                vars_dict[name] = item.numpy()
            else:
                vars_dict[name] = item

        # Save vars dictionary
        with open(osp.join(save_path, "vars.json"), "w") as fp:
            vars_dict = convert_json(vars_dict)  # Convert to json format
            json_data = json.dumps(vars_dict,
                                   separators=(",", ":\t"),
                                   indent=4,
                                   sort_keys=True)
            fp.write(json_data)
            print(colorize("Saving other vars:\n", color="cyan", bold=True))
            print(colorize(json_data, "cyan", bold=True))

    def restore(self, path, restore_lagrance_multipliers=True):
        """Restores policy.

        Args:
            path (str): The path from which you want to load the policy.

            restore_lagrance_multipliers (bool, optional): Whether you want to restore
                the Lagrange multipliers. Defaults to True.

        Returns:
            bool: Boolean specifying whether the policy was loaded successfully.
        """

        # Create load path
        load_path = osp.abspath(path)

        # Load train configuration
        try:
            with open(osp.join(load_path, "vars.json"), "r") as f:
                train_config = json.load(f)
        except (FileNotFoundError, NotADirectoryError):
            success_load = False
            return success_load

        # Throw a warning if the restored model differs from the model used now
        if self.use_lyapunov != train_config["use_lyapunov"]:
            alg_strings = [
                "LAC" if self.use_lyapunov else "SAC",
                "LAC" if train_config["use_lyapunov"] else "SAC",
            ]
            if TRAIN_PARAMS["continue_training"]:
                warn_str = colorize(
                    (f"ERROR: You tried to load a {alg_strings[1]} model while the "
                     f"`variant.py` file specifies you want to train it as a "
                     f"{alg_strings[0]} model. Shutting down training as this is "
                     "not yet supported."),
                    "red",
                    bold=True,
                )
                print(warn_str)
                sys.exit(0)
            else:
                warn_str = colorize(
                    (f"WARN: You tried to load a {alg_strings[1]} model while the "
                     f"`variant.py` file specifies you want to use it in the "
                     f"inference as a {alg_strings[0]} model. As a result the "
                     "`variant.py` will be ignored."),
                    "yellow",
                    bold=True,
                )
                print(warn_str)
                self.__reload_critic_networks(
                    use_lyapunov=train_config["use_lyapunov"])

        # Check if the models exist
        try:
            checkpoints = [
                f.replace(".index", "") for f in os.listdir(load_path)
                if f.endswith(".index")
            ]
        except (FileNotFoundError, NotADirectoryError):
            success_load = False
            return success_load

        # Check if any checkpoints were found
        if not checkpoints:
            success_load = False
            return success_load

        # Restore network parameters
        try:
            if train_config["use_lyapunov"]:
                self.ga.load_weights(load_path + "/gaussian_actor")
                self.lc.load_weights(load_path + "/lyapunov_critic")
                if restore_lagrance_multipliers:
                    self.log_alpha = train_config["log_alpha"]
                    self.log_labda = train_config["log_labda"]
            else:
                self.ga.load_weights(load_path + "/gaussian_actor")
                self.q_1.load_weights(load_path + "/q_critic_1")
                self.q_2.load_weights(load_path + "/q_critic_2")
                if restore_lagrance_multipliers:
                    self.log_alpha = train_config["log_alpha"]
        except (KeyError, AttributeError):
            alg_string = "LAC" if train_config["use_lyapunov"] else "SAC"
            print(
                colorize(
                    ("ERROR: Something went wrong while trying to load the "
                     f"{alg_string} model. Shutting down the training."),
                    "red",
                    bold=True,
                ))
            sys.exit(0)

        # Return result
        success_load = True
        return success_load

    def __reload_critic_networks(self, use_lyapunov):
        """Function used to reload the right networks when the loaded model type
        differs from the type set in the `variant.py` file. Currently only used during
        inference.

        Args:
            use_lyapunov (bool): Whether the new setup should use the Lyapunov Critic
                or not.
        """
        # Create required networks
        if use_lyapunov:  # LAC

            # Print reload message
            print(
                colorize(
                    "INFO: You switched to using the LAC algorithm.",
                    "green",
                    bold=True,
                ))

            # Create log_labda
            self.log_labda = tf.Variable(tf.math.log(ALG_PARAMS["labda"]),
                                         name="log_lambda")

            # Create main and target Lyapunov Critic networks
            self.lc = LyapunovCritic(
                obs_dim=self._s_dim,
                act_dim=self._a_dim,
                hidden_sizes=self._network_structure["critic"],
            )
            self.lc_ = LyapunovCritic(
                obs_dim=self._s_dim,
                act_dim=self._a_dim,
                hidden_sizes=self._network_structure["critic"],
            )

            # Remove main and target Q-Critic networks
            # NOTE (rickstaa): Removed to make sure we notice if something goes wrong.
            delattr(self, "q_1")
            delattr(self, "q_2")
            delattr(self, "q_1_")
            delattr(self, "q_2_")
        else:  # SAC

            # Print reload message
            print(
                colorize(
                    "WARN: You switched to using the SAC algorithm.",
                    "yellow",
                    bold=True,
                ))

            # Create main and target Q-Critic networks
            self.q_1 = QCritic(
                obs_dim=self._s_dim,
                act_dim=self._a_dim,
                hidden_sizes=self._network_structure["q_critic"],
            )
            self.q_2 = QCritic(
                obs_dim=self._s_dim,
                act_dim=self._a_dim,
                hidden_sizes=self._network_structure["q_critic"],
            )
            self.q_1_ = QCritic(
                obs_dim=self._s_dim,
                act_dim=self._a_dim,
                hidden_sizes=self._network_structure["q_critic"],
            )
            self.q_2_ = QCritic(
                obs_dim=self._s_dim,
                act_dim=self._a_dim,
                hidden_sizes=self._network_structure["q_critic"],
            )

            # Remove main and target Lyapunov Critic networks
            delattr(self, "lc")
            delattr(self, "lc_")

    def _set_learning_rates(self,
                            lr_a=None,
                            lr_alpha=None,
                            lr_l=None,
                            lr_labda=None,
                            lr_c=None):
        """Adjusts the learning rates of the optimizers.

        Args:
            lr_a (float, optional): The learning rate of the actor optimizer. Defaults
                to None.

            lr_alpha (float, optional): The learning rate of the temperature optimizer.
                Defaults to None.

            lr_l (float, optional): The learning rate of the Lyapunov critic. Defaults
                to None.

            lr_labda (float, optional): The learning rate of the Lyapunov Lagrange
                multiplier optimizer. Defaults to None.

            lr_c (float, optional): The learning rate of the Q-Critic optimizer.
                Defaults to None.
        """
        if lr_a:
            self._a_train.lr.assign(lr_a)
        if lr_alpha:
            self._alpha_train.lr.assign(lr_alpha)
        if self.use_lyapunov:
            if lr_l:
                self._l_train.lr.assign(lr_l)
            if lr_labda:
                self._lambda_train.lr.assign(lr_labda)
        else:
            if lr_c:
                self._q_train.lr.assign(lr_c)

    @tf.function
    def _init_targets(self):
        """Updates the target network weights to the main network weights."""
        for ga_main, ga_targ in zip(self.ga.variables, self.ga_.variables):
            ga_targ.assign(ga_main)
        if self.use_lyapunov:
            for lc_main, lc_targ in zip(self.lc.variables, self.lc_.variables):
                lc_targ.assign(lc_main)
        else:
            for q_1_main, q_1_targ in zip(self.q_1.variables,
                                          self.q_1_.variables):
                q_1_targ.assign(q_1_main)
            for q_2_main, q_2_targ in zip(self.q_2.variables,
                                          self.q_2_.variables):
                q_2_targ.assign(q_2_main)

    @tf.function
    def _update_targets(self):
        """Updates the target networks based on an exponential moving average
        (Polyak averaging).
        """
        for ga_main, ga_targ in zip(self.ga.variables, self.ga_.variables):
            ga_targ.assign(self._polyak * ga_targ +
                           (1 - self._polyak) * ga_main)
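        # i.e. every target weight follows
        # theta_targ <- polyak * theta_targ + (1 - polyak) * theta_main, with
        # polyak = 1 - tau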
        if self.use_lyapunov:
            for lc_main, lc_targ in zip(self.lc.variables, self.lc_.variables):
                lc_targ.assign(self._polyak * lc_targ +
                               (1 - self._polyak) * lc_main)
        else:
            for q_1_main, q_1_targ in zip(self.q_1.variables,
                                          self.q_1_.variables):
                q_1_targ.assign(self._polyak * q_1_targ +
                                (1 - self._polyak) * q_1_main)
            for q_2_main, q_2_targ in zip(self.q_2.variables,
                                          self.q_2_.variables):
                q_2_targ.assign(self._polyak * q_2_targ +
                                (1 - self._polyak) * q_2_main)

    @property
    def alpha(self):
        """Property used to clip alpha so that it is greater than or equal to 0.0,
        which prevents it from becoming nan when log_alpha becomes -inf. No upper
        bound is used for alpha.
        """
        return tf.clip_by_value(tf.exp(self.log_alpha), *SCALE_ALPHA_MIN_MAX)

    @property
    def labda(self):
        """Property used to clip lambda so that it is greater than or equal to 0.0,
        which prevents it from becoming nan when log_labda becomes -inf. It is further
        clipped to be less than or equal to 1.0 to prevent lambda from exploding when
        the hyperparameters are chosen badly.
        """
        return tf.clip_by_value(tf.exp(self.log_labda), *SCALE_LAMBDA_MIN_MAX)

    @property
    def act_limits(self):
        return self._act_limits

    @act_limits.setter
    def act_limits(self, act_limits):
        """Sets the action limits that are used for scaling the actions that are
        returned from the Gaussian policy.
        """

        # Validate input
        missing_keys = [
            key for key in ["low", "high"] if key not in act_limits.keys()
        ]
        if missing_keys:
            warn_string = "WARN: act_limits could not be set as {} not found.".format(
                f"keys {missing_keys} were"
                if len(missing_keys) > 1 else f"key {missing_keys} was")
            print(colorize(warn_string, "yellow"))
        invalid_length = [
            key for key, val in act_limits.items() if len(val) != self._a_dim
        ]
        if invalid_length:
            warn_string = (
                f"WARN: act_limits could not be set as the length of {invalid_length} "
                + "{}".format("were" if len(invalid_length) > 1 else "was") +
                f" unequal to the dimension of the action space (dim={self._a_dim})."
            )
            print(colorize(warn_string, "yellow"))

        # Set action limits
        self._act_limits = {
            "low": act_limits["low"],
            "high": act_limits["high"]
        }
        self.ga.act_limits = self._act_limits
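
A hypothetical end-to-end usage sketch for the TensorFlow `LAC` class above (it assumes the class and its dependencies are importable from this repository; the dimensions, learning rates and the single-sample batch are illustrative stand-ins for a real environment and replay buffer):

import numpy as np

policy = LAC(a_dim=3, s_dim=8,
             act_limits={"low": [-1.0, -1.0, -1.0], "high": [1.0, 1.0, 1.0]})

s = np.zeros(8, dtype=np.float32)
a = policy.choose_action(s)  # Stochastic action used during training
a_det = policy.choose_action(s, evaluation=True)  # Deterministic action

batch = {  # Normally sampled from a replay buffer
    "s": np.zeros((1, 8), dtype=np.float32),
    "a": np.zeros((1, 3), dtype=np.float32),
    "r": np.zeros((1, 1), dtype=np.float32),
    "terminal": np.zeros((1, 1), dtype=np.float32),
    "s_": np.zeros((1, 8), dtype=np.float32),
}
diagnostics = policy.learn(lr_a=1e-4, lr_l=3e-4, lr_lag=1e-4, lr_c=3e-4, batch=batch)
policy.save_result("./results")  # Weights are written to ./results/policy/
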
Example 3
    def __reload_critic_networks(self, use_lyapunov):
        """Function used to reload the right networks when the loaded model type
        differs from the type set in the `variant.py` file. Currently only used during
        inference.

        Args:
            use_lyapunov (bool): Whether the new setup should use the Lyapunov Critic
                or not.
        """
        # Create required networks
        if use_lyapunov:  # LAC

            # Print reload message
            print(
                colorize(
                    "INFO: You switched to using the LAC algorithm.",
                    "green",
                    bold=True,
                ))

            # Create log_labda
            self.log_labda = tf.Variable(tf.math.log(ALG_PARAMS["labda"]),
                                         name="log_lambda")

            # Create main and target Lyapunov Critic networks
            self.lc = LyapunovCritic(
                obs_dim=self._s_dim,
                act_dim=self._a_dim,
                hidden_sizes=self._network_structure["critic"],
            )
            self.lc_ = LyapunovCritic(
                obs_dim=self._s_dim,
                act_dim=self._a_dim,
                hidden_sizes=self._network_structure["critic"],
            )

            # Remove main and target Q-Critic networks
            # NOTE (rickstaa): Removed to make sure we notice if something goes wrong.
            delattr(self, "q_1")
            delattr(self, "q_2")
            delattr(self, "q_1_")
            delattr(self, "q_2_")
        else:  # SAC

            # Print reload message
            print(
                colorize(
                    "WARN: You switched to using the SAC algorithm.",
                    "yellow",
                    bold=True,
                ))

            # Create main and target Q-Critic networks
            self.q_1 = QCritic(
                obs_dim=self._s_dim,
                act_dim=self._a_dim,
                hidden_sizes=self._network_structure["q_critic"],
            )
            self.q_2 = QCritic(
                obs_dim=self._s_dim,
                act_dim=self._a_dim,
                hidden_sizes=self._network_structure["q_critic"],
            )
            self.q_1_ = QCritic(
                obs_dim=self._s_dim,
                act_dim=self._a_dim,
                hidden_sizes=self._network_structure["q_critic"],
            )
            self.q_2_ = QCritic(
                obs_dim=self._s_dim,
                act_dim=self._a_dim,
                hidden_sizes=self._network_structure["q_critic"],
            )

            # Remove main and target Lyapunov Critic networks
            delattr(self, "lc")
            delattr(self, "lc_")
Example 4
    def __init__(self, a_dim, s_dim, act_limits=None):
        """Initiates object state.

        Args:
            a_dim (int): Action space dimension.

            s_dim (int): Observation space dimension.

            act_limits (dict, optional): The "high" and "low" action bounds of the
                environment. Used for rescaling the actions that come out of the
                network from (-1, 1) to (low, high). Defaults to (-1, 1).
        """

        # Display information about the algorithm being used (LAC or SAC)
        if ALG_PARAMS["use_lyapunov"]:
            print(
                colorize("INFO: You are using the LAC algorithm.",
                         "green",
                         bold=True))
        else:
            print(
                colorize("WARN: You are using the SAC algorithm.",
                         "yellow",
                         bold=True))

        # Save action and observation space as members
        self._a_dim = a_dim
        self._s_dim = s_dim
        self._act_limits = act_limits

        # Save algorithm parameters as class objects
        self.use_lyapunov = ALG_PARAMS["use_lyapunov"]
        self._network_structure = ALG_PARAMS["network_structure"]
        self._polyak = 1 - ALG_PARAMS["tau"]
        self._gamma = ALG_PARAMS["gamma"]
        self._alpha_3 = ALG_PARAMS["alpha3"]

        # Determine target entropy
        # NOTE (rickstaa): If not defined we use the Lower bound of the policy entropy
        if ALG_PARAMS["target_entropy"] is None:
            self.target_entropy = -self._a_dim
        else:
            self.target_entropy = ALG_PARAMS["target_entropy"]

        # Create Learning rate placeholders
        self._lr_a = tf.Variable(ALG_PARAMS["lr_a"], name="LR_A")
        if self.use_lyapunov:
            self._lr_lag = tf.Variable(ALG_PARAMS["lr_a"], name="LR_lag")
            self._lr_l = tf.Variable(ALG_PARAMS["lr_l"], name="LR_L")
        else:
            self._lr_c = tf.Variable(ALG_PARAMS["lr_c"], name="LR_C")

        # Make sure alpha and labda are not zero
        # NOTE (rickstaa): This is needed to prevent log_alpha/log_lambda from becoming
        # -np.inf
        ALG_PARAMS["alpha"] = (1e-37 if ALG_PARAMS["alpha"] == 0.0 else
                               ALG_PARAMS["alpha"])
        ALG_PARAMS["labda"] = (1e-37 if ALG_PARAMS["labda"] == 0.0 else
                               ALG_PARAMS["labda"])

        # Create placeholders for the Lagrange multipliers
        self.log_alpha = tf.Variable(tf.math.log(ALG_PARAMS["alpha"]),
                                     name="log_alpha")
        if self.use_lyapunov:
            self.log_labda = tf.Variable(tf.math.log(ALG_PARAMS["labda"]),
                                         name="log_lambda")

        # Create Gaussian Actor (GA) and Lyapunov critic (LC) or Q-Critic (QC) networks
        self.ga = SquashedGaussianActor(
            obs_dim=self._s_dim,
            act_dim=self._a_dim,
            hidden_sizes=self._network_structure["actor"],
            act_limits=self.act_limits,
        )
        if self.use_lyapunov:
            self.lc = LyapunovCritic(
                obs_dim=self._s_dim,
                act_dim=self._a_dim,
                hidden_sizes=self._network_structure["critic"],
            )
        else:
            # NOTE (rickstaa): We create two Q-critics so we can use the Clipped
            # double-Q trick.
            self.q_1 = QCritic(
                obs_dim=self._s_dim,
                act_dim=self._a_dim,
                hidden_sizes=self._network_structure["q_critic"],
            )
            self.q_2 = QCritic(
                obs_dim=self._s_dim,
                act_dim=self._a_dim,
                hidden_sizes=self._network_structure["q_critic"],
            )

        # Create GA, LC and QC target networks
        # Don't get optimized but get updated according to the EMA of the main
        # networks
        self.ga_ = SquashedGaussianActor(
            obs_dim=self._s_dim,
            act_dim=self._a_dim,
            hidden_sizes=self._network_structure["actor"],
            act_limits=self.act_limits,
        )
        if self.use_lyapunov:
            self.lc_ = LyapunovCritic(
                obs_dim=self._s_dim,
                act_dim=self._a_dim,
                hidden_sizes=self._network_structure["critic"],
            )
        else:
            self.q_1_ = QCritic(
                obs_dim=self._s_dim,
                act_dim=self._a_dim,
                hidden_sizes=self._network_structure["q_critic"],
            )
            self.q_2_ = QCritic(
                obs_dim=self._s_dim,
                act_dim=self._a_dim,
                hidden_sizes=self._network_structure["q_critic"],
            )

        self._init_targets()

        # Create optimizers
        # NOTE (rickstaa): We here optimize for log_alpha and log_labda instead of
        # alpha and labda because it is more numerically stable (see:
        # https://github.com/rail-berkeley/softlearning/issues/136)
        self._a_train = tf.keras.optimizers.Adam(learning_rate=self._lr_a)
        self._alpha_train = tf.keras.optimizers.Adam(learning_rate=self._lr_a)
        if self.use_lyapunov:
            self._lambda_train = tf.keras.optimizers.Adam(
                learning_rate=self._lr_lag)
            self._l_train = tf.keras.optimizers.Adam(learning_rate=self._lr_l)
        else:
            self._main_q_vars = (self.q_1.trainable_variables +
                                 self.q_2.trainable_variables
                                 )  # Chain parameters of the two Q-critics
            self._q_train = tf.keras.optimizers.Adam(learning_rate=self._lr_c)

        # Create model save dict
        if self.use_lyapunov:
            self._save_dict = {
                "gaussian_actor": self.ga,
                "lyapunov_critic": self.lc,
                "log_alpha": self.log_alpha,
                "log_labda": self.log_labda,
                "use_lyapunov": self.use_lyapunov,
            }
        else:
            self._save_dict = {
                "gaussian_actor": self.ga,
                "q_critic_1": self.q_1,
                "q_critic_2": self.q_2,
                "log_alpha": self.log_alpha,
                "use_lyapunov": self.use_lyapunov,
            }
Example 5
class LAC(object):
    """The Lyapunov actor critic.

    Attributes:
        ga (torch.nn.Module): The Squashed Gaussian Actor network.

        ga_ (torch.nn.Module): The Squashed Gaussian Actor target network.

        lc (torch.nn.Module): The Lyapunov Critic network.

        lc_ (torch.nn.Module): The Lyapunov Critic target network.

        q_1 (torch.nn.Module): The first Q-Critic network.

        q_2 (torch.nn.Module): The second Q-Critic network.

        q_1_ (torch.nn.Module): The first Q-Critic target network.

        q_2_ (torch.nn.Module): The second Q-Critic target network.

        log_alpha (torch.Tensor): The temperature Lagrange multiplier.

        log_labda (torch.Tensor): The Lyapunov Lagrange multiplier.

        target_entropy (int): The target entropy.

        device (str): The device the networks are placed on (CPU or GPU).

        use_lyapunov (bool): Whether the Lyapunov Critic is used (use_lyapunov=True) or
            the regular Q-critic (use_lyapunov=False).
    """
    def __init__(self, a_dim, s_dim, act_limits=None):
        """Initiates object state.

        Args:
            a_dim (int): Action space dimension.

            s_dim (int): Observation space dimension.

            act_limits (dict, optional): The "high" and "low" action bounds of the
                environment. Used for rescaling the actions that come out of the
                network from (-1, 1) to (low, high). Defaults to (-1, 1).
        """

        # Display information about the algorithm being used (LAC or SAC)
        if ALG_PARAMS["use_lyapunov"]:
            print(
                colorize("INFO: You are using the LAC algorithm.",
                         "green",
                         bold=True))
        else:
            print(
                colorize("WARN: You are using the SAC algorithm.",
                         "yellow",
                         bold=True))

        # Set the computational device
        self.device = DEVICE

        # Save action and observation space as members
        self._a_dim = a_dim
        self._s_dim = s_dim
        self._act_limits = act_limits

        # Save algorithm parameters as class objects
        self.use_lyapunov = ALG_PARAMS["use_lyapunov"]
        self._network_structure = ALG_PARAMS["network_structure"]
        self._polyak = 1 - ALG_PARAMS["tau"]
        self._gamma = ALG_PARAMS["gamma"]
        self._alpha_3 = ALG_PARAMS["alpha3"]

        # Determine target entropy
        # NOTE (rickstaa): If not defined we use the Lower bound of the policy entropy
        if ALG_PARAMS["target_entropy"] is None:
            self.target_entropy = -self._a_dim
        else:
            self.target_entropy = ALG_PARAMS["target_entropy"]

        # Create Learning rate placeholders
        self._lr_a = ALG_PARAMS["lr_a"]
        if self.use_lyapunov:
            self._lr_lag = ALG_PARAMS["lr_a"]
            self._lr_l = ALG_PARAMS["lr_l"]
        else:
            self._lr_c = ALG_PARAMS["lr_c"]

        # Make sure alpha and labda are not zero
        # NOTE (rickstaa): This is needed to prevent log_alpha/log_lambda from becoming
        # -np.inf
        ALG_PARAMS["alpha"] = (1e-37 if ALG_PARAMS["alpha"] == 0.0 else
                               ALG_PARAMS["alpha"])
        ALG_PARAMS["labda"] = (1e-37 if ALG_PARAMS["labda"] == 0.0 else
                               ALG_PARAMS["labda"])

        # Create variables for the Lagrange multipliers
        self.log_alpha = torch.tensor(ALG_PARAMS["alpha"],
                                      dtype=torch.float32).log()
        self.log_alpha.requires_grad = True
        if self.use_lyapunov:
            self.log_labda = torch.tensor(ALG_PARAMS["labda"],
                                          dtype=torch.float32).log()
            self.log_labda.requires_grad = True
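        # (Setting requires_grad on these leaf tensors makes them trainable by
        # the Adam optimizers created below; they play the same role as the
        # tf.Variable Lagrange-multiplier placeholders in the TensorFlow version
        # above.)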

        # Create Gaussian Actor (GA) and Lyapunov critic (LC) or Q-Critic (QC) networks
        # NOTE (rickstaa): Pytorch currently uses kaiming initialization for the
        # biases; in the future this will change to zero initialization
        # (https://github.com/pytorch/pytorch/issues/18182). This however does not
        # influence the results.
        self.ga = SquashedGaussianActor(
            obs_dim=self._s_dim,
            act_dim=self._a_dim,
            hidden_sizes=self._network_structure["actor"],
            act_limits=self.act_limits,
        ).to(self.device)
        if self.use_lyapunov:
            self.lc = LyapunovCritic(
                obs_dim=self._s_dim,
                act_dim=self._a_dim,
                hidden_sizes=self._network_structure["critic"],
            ).to(self.device)
        else:
            # NOTE (rickstaa): We create two Q-critics so we can use the Clipped
            # double-Q trick.
            self.q_1 = QCritic(
                obs_dim=self._s_dim,
                act_dim=self._a_dim,
                hidden_sizes=self._network_structure["q_critic"],
            ).to(self.device)
            self.q_2 = QCritic(
                obs_dim=self._s_dim,
                act_dim=self._a_dim,
                hidden_sizes=self._network_structure["q_critic"],
            ).to(self.device)

        # Create GA, LC and QC target networks
        # Don't get optimized but get updated according to the EMA of the main
        # networks
        self.ga_ = deepcopy(self.ga).to(self.device)
        if self.use_lyapunov:
            self.lc_ = deepcopy(self.lc).to(self.device)
        else:
            self.q_1_ = deepcopy(self.q_1).to(self.device)
            self.q_2_ = deepcopy(self.q_2).to(self.device)

        # Freeze target networks
        for p in self.ga_.parameters():
            p.requires_grad = False
        if self.use_lyapunov:
            for p in self.lc_.parameters():
                p.requires_grad = False
        else:
            for p in self.q_1_.parameters():
                p.requires_grad = False
            for p in self.q_2_.parameters():
                p.requires_grad = False
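        # (The target networks are never optimized directly; they are only
        # updated through the exponential moving average of the main networks,
        # so disabling gradient tracking for their parameters avoids unnecessary
        # autograd work.)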

        # Create optimizers
        # NOTE (rickstaa): We here optimize for log_alpha and log_labda instead of
        # alpha and labda because it is more numerically stable (see:
        # https://github.com/rail-berkeley/softlearning/issues/136)
        self._alpha_train = Adam([self.log_alpha], lr=self._lr_a)
        self._a_train = Adam(self.ga.parameters(), lr=self._lr_a)
        if self.use_lyapunov:
            self._lambda_train = Adam([self.log_labda], lr=self._lr_lag)
            self._l_train = Adam(self.lc.parameters(), lr=self._lr_l)
        else:
            q_params = itertools.chain(
                self.q_1.parameters(),
                self.q_2.parameters())  # Chain parameters of the two Q-critics
            self._q_train = Adam(q_params, lr=self._lr_c)

    def choose_action(self, s, evaluation=False):
        """Returns the current action of the policy.

        Args:
            s (numpy.ndarray): The current state.

            evaluation (bool, optional): Whether to return a deterministic action.
                Defaults to False.

        Returns:
            numpy.ndarray: The current action.
        """

        # Make sure s is float32 torch tensor
        s = torch.as_tensor(s, dtype=torch.float32).to(self.device)

        # Get current best action
        if evaluation is True:
            try:
                with torch.no_grad():
                    det_a, _ = self.ga(s.unsqueeze(0), deterministic=True)
                    return det_a[0].cpu().numpy()
            except ValueError:
                return
        else:
            with torch.no_grad():
                a, _ = self.ga(s.unsqueeze(0))
                return a[0].cpu().numpy()

    def learn(self, lr_a, lr_l, lr_c, lr_lag, batch):
        """Runs SGD to update all the optimizable parameters.

        Args:
            lr_a (float): Current actor learning rate.

            lr_l (float): Lyapunov critic learning rate.

            lr_c (float): Q-Critic learning rate.

            lr_lag (float): Lyapunov constraint Lagrange multiplier learning rate.

            batch (dict): The batch of experiences.

        Returns:
            tuple: Tuple with some diagnostics about the training.
        """

        # Adjust optimizer learning rates (decay)
        self._set_learning_rates(lr_a=lr_a,
                                 lr_alpha=lr_a,
                                 lr_l=lr_l,
                                 lr_labda=lr_lag,
                                 lr_c=lr_c)

        ################################################
        # Optimize (Lyapunov/Q) critic #################
        ################################################
        if self.use_lyapunov:

            # Zero gradients on the L-critic
            self._l_train.zero_grad()

            # Get target Lyapunov value (Bellman-backup)
            with torch.no_grad():
                a2_, _ = self.ga_(
                    batch["s_"]
                )  # NOTE (rickstaa): Target actions come from *current* *target* policy
                l_pi_targ = self.lc_(batch["s_"], a2_)
                l_backup = (batch["r"] + self._gamma *
                            (1 - batch["terminal"]) * l_pi_targ.detach())

            # Get current Lyapunov value
            l1 = self.lc(batch["s"], batch["a"])

            # Calculate Lyapunov *CRITIC* error
            # NOTE (rickstaa): The 0.5 multiplication factor was added to make the
            # derivation cleaner and can be safely removed without influencing the
            # minimization. We kept it here for consistency.
            # NOTE (rickstaa): I use a manual implementation instead of using
            # F.mse_loss as this is 2 times faster. This can be changed back to
            # F.mse_loss if Torchscript is used.
            l_error = 0.5 * ((l1 - l_backup)**2).mean()  # See eq. 7

            # Perform one gradient descent step for the Lyapunov critic
            l_error.backward()
            self._l_train.step()
        else:

            # Zero gradients on the Q-critic
            self._q_train.zero_grad()

            # Get target Q values (Bellman-backup)
            # NOTE (rickstaa): Here we use max-clipping instead of min-clipping used
            # in the SAC algorithm since we want to minimize the return.
            with torch.no_grad():
                a2, logp_a2 = self.ga(
                    batch["s_"]
                )  # NOTE (rickstaa): Target actions come from *current* policy
                q1_pi_targ = self.q_1_(batch["s_"], a2)
                q2_pi_targ = self.q_2_(batch["s_"], a2)
                q_pi_targ = torch.max(
                    q1_pi_targ,
                    q2_pi_targ,
                )  # Use max clipping  to prevent overestimation bias.
                q_backup = batch["r"] + self._gamma * (
                    1 - batch["terminal"]) * (q_pi_targ - self.alpha * logp_a2)

            # Get the current Q values
            q1 = self.q_1(batch["s"], batch["a"])
            q2 = self.q_2(batch["s"], batch["a"])

            # Calculate Q-critic loss
            loss_q1 = 0.5 * ((q1 - q_backup)**2).mean()  # See Haarnoja eq. 5
            loss_q2 = 0.5 * ((q2 - q_backup)**2).mean()
            loss_q = loss_q1 + loss_q2

            # Perform one gradient descent step for the Q-critic
            loss_q.backward()
            self._q_train.step()

        ################################################
        # Optimize Gaussian actor ######################
        ################################################

        # Zero gradients on the actor
        self._a_train.zero_grad()

        # Retrieve log probabilities of batch observations based on *current* policy
        pi, log_pis = self.ga(batch["s"])

        # Compute actor loss
        if self.use_lyapunov:
            # Calculate the target Lyapunov value
            a2, _ = self.ga(
                batch["s_"]
            )  # NOTE (rickstaa): Target actions come from *current* policy
            lya_l_ = self.lc(batch["s_"], a2)

            # Compute Lyapunov Actor error
            self.l_delta = torch.mean(lya_l_ - l1.detach() +
                                      self._alpha_3 * batch["r"])  # See eq. 11

            # Compute actor loss
            a_loss = (self.labda.detach() * self.l_delta +
                      self.alpha.detach() * log_pis.mean())  # See eq. 12
        else:

            # Retrieve the current Q values
            q1_pi = self.q_1(
                batch["s"],
                pi)  # NOTE (rickstaa): Actions come from *current* policy
            q2_pi = self.q_2(
                batch["s"],
                pi)  # NOTE (rickstaa): Actions come from *current* policy
            q_pi = torch.max(q1_pi, q2_pi)

            # Compute actor loss
            a_loss = (self.alpha.detach() * log_pis -
                      q_pi).mean()  # See Haarnoja eq. 7

        # Perform one gradient descent step for the Gaussian Actor
        a_loss.backward()
        self._a_train.step()

        ################################################
        # Optimize alpha (Entropy temperature) #########
        ################################################

        # Zero gradients on alpha
        self._alpha_train.zero_grad()

        # Calculate alpha loss
        alpha_loss = -(self.alpha * (log_pis + self.target_entropy).detach()
                       ).mean()  # See Haarnoja eq. 17

        # Perform one gradient descent step for alpha
        alpha_loss.backward()
        self._alpha_train.step()

        ################################################
        # Optimize labda (Lyapunov temperature) ########
        ################################################
        if self.use_lyapunov:

            # Zero gradients on labda
            self._lambda_train.zero_grad()

            # Calculate labda loss
            # NOTE (rickstaa): Log_labda is used in the lambda_loss function because
            # using lambda caused the gradients to vanish. This happens because we
            # restrict lambda to the [0, 1] range using the clamp function (see #38).
            # Using log_lambda is also more numerically stable.
            labda_loss = -(self.log_labda * self.l_delta.detach()
                           ).mean()  # See formulas under eq. 14

            # Perform one gradient descent step for labda
            labda_loss.backward()
            self._lambda_train.step()

        ################################################
        # Update target networks and return ############
        # diagnostics. #################################
        ################################################

        # Update target networks
        self._update_targets()

        # Return diagnostics
        if self.use_lyapunov:
            return (
                self.labda.cpu().detach(),
                self.alpha.cpu().detach(),
                l_error.cpu().detach(),
                torch.mean(-log_pis.cpu().detach()),
                a_loss.cpu().detach(),
                alpha_loss.cpu().detach(),
                labda_loss.cpu().detach(),
            )
        else:
            return (
                self.alpha.cpu().detach(),
                loss_q.cpu().detach(),
                torch.mean(-log_pis.cpu().detach()),
                a_loss.cpu().detach(),
                alpha_loss.cpu().detach(),
            )

    def save_result(self, path):
        """Saves current policy.

        Args:
            path (str): The path where you want to save the policy.
        """

        # Make the save path absolute
        save_path = osp.abspath(osp.join(path, "policy/model.pth"))

        # Create the folder if it does not yet exist
        if osp.exists(osp.dirname(save_path)):
            print(
                colorize(
                    ("WARN: Log dir %s already exists! Storing info there anyway."
                     % osp.dirname(save_path)),
                    "red",
                    bold=True,
                ))
        else:
            os.makedirs(osp.dirname(save_path))

        # Create models state dictionary
        if self.use_lyapunov:
            models_state_save_dict = {
                "use_lyapunov": self.use_lyapunov,
                "ga_state_dict": self.ga.state_dict(),
                "lc_state_dict": self.lc.state_dict(),
                "ga_targ_state_dict": self.ga_.state_dict(),
                "lc_targ_state_dict": self.lc_.state_dict(),
                "log_alpha": self.log_alpha,
                "log_labda": self.log_labda,
            }
        else:
            models_state_save_dict = {
                "use_lyapunov": self.use_lyapunov,
                "ga_state_dict": self.ga.state_dict(),
                "ga_targ_state_dict": self.ga_.state_dict(),
                "q1_state_dict": self.q_1.state_dict(),
                "q2_state_dict": self.q_2.state_dict(),
                "q1_targ_state_dict": self.q_1_.state_dict(),
                "q2_targ_state_dict": self.q_2_.state_dict(),
                "log_alpha": self.log_alpha,
            }

        # Save model state dictionary
        torch.save(models_state_save_dict, save_path)
        print(colorize(f"INFO: Save to path: {save_path}", "cyan", bold=True))
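        # Example usage (hypothetical log directory; a minimal sketch):
        #   policy.save_result("./log/run_1")
        #   # -> stores the state dictionaries at ./log/run_1/policy/model.pth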

    def restore(self, path, restore_lagrance_multipliers=True):
        """Restores policy.

        Args:
            path (str): The path from which you want to restore the policy.

            restore_lagrance_multipliers (bool, optional): Whether you want to restore
                the Lagrange multipliers. Defaults to True.

        Returns:
            bool: Boolean specifying whether the policy was loaded successfully.
        """

        # Create load path
        load_path = osp.abspath(osp.join(path, "model.pth"))

        # Load the model state
        try:
            models_state_dict = torch.load(load_path)
        except (FileNotFoundError, NotADirectoryError):
            success_load = False
            return success_load

        # Throw a warning if the restored model type differs from the one used now
        if self.use_lyapunov != models_state_dict["use_lyapunov"]:
            alg_strings = [
                "LAC" if self.use_lyapunov else "SAC",
                "LAC" if models_state_dict["use_lyapunov"] else "SAC",
            ]
            if TRAIN_PARAMS["continue_training"]:
                warn_str = colorize(
                    (f"ERROR: You tried to load a {alg_strings[1]} model while the "
                     f"`variant.py` file specifies you want to train it as a "
                     f"{alg_strings[0]} model. Shutting down training as this is "
                     "not yet supported."),
                    "red",
                    bold=True,
                )
                print(warn_str)
                sys.exit(0)
            else:
                warn_str = colorize(
                    (f"WARN: You tried to load a {alg_strings[1]} model while the "
                     f"`variant.py` file specifies you want to use it during "
                     f"inference as a {alg_strings[0]} model. As a result, the "
                     "`variant.py` setting will be ignored."),
                    "yellow",
                    bold=True,
                )
                print(warn_str)
                self.__reload_critic_networks(
                    use_lyapunov=models_state_dict["use_lyapunov"])

        # Restore network parameters
        try:
            if models_state_dict["use_lyapunov"]:
                self.use_lyapunov = models_state_dict["use_lyapunov"]
                self.ga.load_state_dict(models_state_dict["ga_state_dict"])
                self.lc.load_state_dict(models_state_dict["lc_state_dict"])
                self.ga_.load_state_dict(
                    models_state_dict["ga_targ_state_dict"])
                self.lc_.load_state_dict(
                    models_state_dict["lc_targ_state_dict"])
                if restore_lagrance_multipliers:
                    self.log_alpha = models_state_dict["log_alpha"]
                    self.log_labda = models_state_dict["log_labda"]
            else:
                self.use_lyapunov = models_state_dict["use_lyapunov"]
                self.ga.load_state_dict(models_state_dict["ga_state_dict"])
                self.ga_.load_state_dict(
                    models_state_dict["ga_targ_state_dict"])
                self.q_1.load_state_dict(models_state_dict["q1_state_dict"])
                self.q_2.load_state_dict(models_state_dict["q2_state_dict"])
                self.q_1_.load_state_dict(
                    models_state_dict["q1_targ_state_dict"])
                self.q_2_.load_state_dict(
                    models_state_dict["q2_targ_state_dict"])
                if restore_lagrance_multipliers:
                    self.log_alpha = models_state_dict["log_alpha"]
        except (KeyError, AttributeError):
            alg_string = "LAC" if models_state_dict["use_lyapunov"] else "SAC"
            print(
                colorize(
                    ("ERROR: Something went wrong while trying to load the "
                     f"{alg_string} model. Shutting down the training."),
                    "red",
                    bold=True,
                ))
            sys.exit(0)

        # Return result
        success_load = True
        return success_load
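        # Example usage (hypothetical path; a minimal sketch):
        #   if not policy.restore("./log/run_1/policy", restore_lagrance_multipliers=False):
        #       print("WARN: No policy found, training from scratch.")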

    def __reload_critic_networks(self, use_lyapunov):
        """Function used to reload the right networks when the loaded model type
        differs from the type set in the `variant.py` file. Currently only used during
        inference.

        Args:
            use_lyapunov (bool): Whether the new setup should use lyapunov or not.
        """
        # Create required networks
        if use_lyapunov:  # LAC

            # Print reload message
            print(
                colorize(
                    "INFO: You switched to using the LAC algorithm.",
                    "green",
                    bold=True,
                ))

            # Create log_labda
            self.log_labda = torch.tensor(ALG_PARAMS["labda"],
                                          dtype=torch.float32).log()
            self.log_labda.requires_grad = True

            # Create main and target Lyapunov Critic networks
            self.lc = LyapunovCritic(
                obs_dim=self._s_dim,
                act_dim=self._a_dim,
                hidden_sizes=self._network_structure["critic"],
            ).to(self.device)
            self.lc_ = deepcopy(self.lc).to(self.device)

            # Remove main and target Q-Critic networks
            # NOTE (rickstaa): Removed to make sure we notice if something goes wrong.
            delattr(self, "q_1")
            delattr(self, "q_2")
            delattr(self, "q_1_")
            delattr(self, "q_2_")
        else:  # SAC

            # Print reload message
            print(
                colorize(
                    "WARN: You switched to using the SAC algorithm.",
                    "yellow",
                    bold=True,
                ))

            # Create main and target Q-Critic networks
            self.q_1 = QCritic(
                obs_dim=self._s_dim,
                act_dim=self._a_dim,
                hidden_sizes=self._network_structure["q_critic"],
            ).to(self.device)
            self.q_2 = QCritic(
                obs_dim=self._s_dim,
                act_dim=self._a_dim,
                hidden_sizes=self._network_structure["q_critic"],
            ).to(self.device)
            self.q_1_ = deepcopy(self.q_1).to(self.device)
            self.q_2_ = deepcopy(self.q_2).to(self.device)

            # Remove main and target Lyapunov Critic networks
            delattr(self, "lc")
            delattr(self, "lc_")

    def _set_learning_rates(self,
                            lr_a=None,
                            lr_alpha=None,
                            lr_l=None,
                            lr_labda=None,
                            lr_c=None):
        """Adjusts the learning rates of the optimizers.

        Args:
            lr_a (float, optional): The learning rate of the actor optimizer. Defaults
                to None.

            lr_alpha (float, optional): The learning rate of the temperature optimizer.
                Defaults to None.

            lr_l (float, optional): The learning rate of the Lyapunov critic. Defaults
                to None.

            lr_labda (float, optional): The learning rate of the Lyapunov Lagrange
                multiplier optimizer. Defaults to None.

            lr_c (float, optional): The learning rate of the Q-Critic optimizer.
                Defaults to None.
        """
        if lr_a:
            for param_group in self._a_train.param_groups:
                param_group["lr"] = lr_a
        if lr_alpha:
            for param_group in self._alpha_train.param_groups:
                param_group["lr"] = lr_alpha
        if self.use_lyapunov:
            if lr_l:
                for param_group in self._l_train.param_groups:
                    param_group["lr"] = lr_l
            if lr_labda:
                for param_group in self._lambda_train.param_groups:
                    param_group["lr"] = lr_labda
        else:
            if lr_c:
                for param_group in self._q_train.param_groups:
                    param_group["lr"] = lr_c
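        # Example (hypothetical linear-decay schedule; a minimal sketch):
        #   frac = 1.0 - episode / total_episodes
        #   self._set_learning_rates(lr_a=1e-4 * frac, lr_l=3e-4 * frac)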

    def _update_targets(self):
        """Updates the target networks based on an exponential moving average
        (Polyak averaging).
        """
        with torch.no_grad():
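            # Polyak (soft) update: theta_targ <- polyak * theta_targ + (1 - polyak) * theta,
            # where polyak = 1 - tau (see the constructor).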
            for ga_main, ga_targ in zip(self.ga.parameters(),
                                        self.ga_.parameters()):
                ga_targ.data.mul_(self._polyak)
                ga_targ.data.add_((1 - self._polyak) * ga_main.data)
            if self.use_lyapunov:
                for lc_main, lc_targ in zip(self.lc.parameters(),
                                            self.lc_.parameters()):
                    lc_targ.data.mul_(self._polyak)
                    lc_targ.data.add_((1 - self._polyak) * lc_main.data)
            else:
                for q_1_main, q_1_targ in zip(self.q_1.parameters(),
                                              self.q_1_.parameters()):
                    q_1_targ.data.mul_(self._polyak)
                    q_1_targ.data.add_((1 - self._polyak) * q_1_main.data)
                for q_2_main, q_2_targ in zip(self.q_2.parameters(),
                                              self.q_2_.parameters()):
                    q_2_targ.data.mul_(self._polyak)
                    q_2_targ.data.add_((1 - self._polyak) * q_2_main.data)

    @property
    def alpha(self):
        """Property that clips alpha to be greater than or equal to 0.0 to prevent it
        from becoming nan when log_alpha becomes -inf. No upper bound is used for alpha.
        """
        return torch.clamp(self.log_alpha.exp(), *SCALE_ALPHA_MIN_MAX)

    @property
    def labda(self):
        """Property that clips lambda to be greater than or equal to 0.0 to prevent it
        from becoming nan when log_labda becomes -inf. It is further clipped to be less
        than or equal to 1.0 to prevent lambda from exploding when the hyperparameters
        are chosen badly.
        """
        return torch.clamp(self.log_labda.exp(), *SCALE_LAMBDA_MIN_MAX)

    @property
    def act_limits(self):
        return self._act_limits

    @act_limits.setter
    def act_limits(self, act_limits):
        """Sets the action limits that are used for scaling the actions that are
        returned by the Gaussian policy.
        """

        # Validate input
        missing_keys = [
            key for key in ["low", "high"] if key not in act_limits.keys()
        ]
        if missing_keys:
            warn_string = "WARN: act_limits could not be set as {} not found.".format(
                f"keys {missing_keys} were"
                if len(missing_keys) > 1 else f"key {missing_keys} was")
            print(colorize(warn_string, "yellow"))
            return
        invalid_length = [
            key for key, val in act_limits.items() if len(val) != self._a_dim
        ]
        if invalid_length:
            warn_string = (
                f"WARN: act_limits could not be set as the length of {invalid_length} "
                + "{}".format("were" if len(invalid_length) > 1 else "was") +
                f" unequal to the dimension of the action space (dim={self._a_dim})."
            )
            print(colorize(warn_string, "yellow"))
            return

        # Set action limits
        self._act_limits = {
            "low": act_limits["low"],
            "high": act_limits["high"]
        }
        self.ga.act_limits = self._act_limits
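        # Example (assuming a Gym-style environment; hypothetical usage):
        #   policy.act_limits = {"low": env.action_space.low, "high": env.action_space.high}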
Example n. 6
0
    def __init__(self, a_dim, s_dim, act_limits=None):
        """Initiates object state.

        Args:
            a_dim (int): Action space dimension.

            s_dim (int): Observation space dimension.

            act_limits (dict, optional): The "high" and "low" action bounds of the
                environment. Used for rescaling the actions that come out of the
                network from (-1, 1) to (low, high). Defaults to (-1, 1).
        """

        # Display information about the algorithm being used (LAC or SAC)
        if ALG_PARAMS["use_lyapunov"]:
            print(
                colorize("INFO: You are using the LAC algorithm.",
                         "green",
                         bold=True))
        else:
            print(
                colorize("WARN: You are using the SAC algorithm.",
                         "yellow",
                         bold=True))

        # Set the computational device
        self.device = DEVICE

        # Save action and observation space as members
        self._a_dim = a_dim
        self._s_dim = s_dim
        self._act_limits = act_limits

        # Save algorithm parameters as class objects
        self.use_lyapunov = ALG_PARAMS["use_lyapunov"]
        self._network_structure = ALG_PARAMS["network_structure"]
        self._polyak = 1 - ALG_PARAMS["tau"]
        self._gamma = ALG_PARAMS["gamma"]
        self._alpha_3 = ALG_PARAMS["alpha3"]

        # Determine target entropy
        # NOTE (rickstaa): If not defined, we use the lower bound of the policy entropy
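        # (i.e. the -dim(A) heuristic used for automatic entropy tuning in SAC)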
        if ALG_PARAMS["target_entropy"] is None:
            self.target_entropy = -self._a_dim
        else:
            self.target_entropy = ALG_PARAMS["target_entropy"]

        # Create Learning rate placeholders
        self._lr_a = ALG_PARAMS["lr_a"]
        if self.use_lyapunov:
            self._lr_lag = ALG_PARAMS["lr_a"]
            self._lr_l = ALG_PARAMS["lr_l"]
        else:
            self._lr_c = ALG_PARAMS["lr_c"]

        # Make sure alpha and labda are not zero
        # NOTE (rickstaa): This is needed to prevent log_alpha/log_lambda from becoming
        # -np.inf
        ALG_PARAMS["alpha"] = (1e-37 if ALG_PARAMS["alpha"] == 0.0 else
                               ALG_PARAMS["alpha"])
        ALG_PARAMS["labda"] = (1e-37 if ALG_PARAMS["labda"] == 0.0 else
                               ALG_PARAMS["labda"])

        # Create variables for the Lagrange multipliers
        self.log_alpha = torch.tensor(ALG_PARAMS["alpha"],
                                      dtype=torch.float32).log()
        self.log_alpha.requires_grad = True
        if self.use_lyapunov:
            self.log_labda = torch.tensor(ALG_PARAMS["labda"],
                                          dtype=torch.float32).log()
            self.log_labda.requires_grad = True

        # Create Gaussian Actor (GA) and Lyapunov critic (LC) or Q-Critic (QC) networks
        # NOTE (rickstaa): PyTorch currently uses Kaiming initialization for the biases;
        # in the future this will change to zero initialization
        # (https://github.com/pytorch/pytorch/issues/18182). This, however, does not
        # influence the results.
        self.ga = SquashedGaussianActor(
            obs_dim=self._s_dim,
            act_dim=self._a_dim,
            hidden_sizes=self._network_structure["actor"],
            act_limits=self.act_limits,
        ).to(self.device)
        if self.use_lyapunov:
            self.lc = LyapunovCritic(
                obs_dim=self._s_dim,
                act_dim=self._a_dim,
                hidden_sizes=self._network_structure["critic"],
            ).to(self.device)
        else:
            # NOTE (rickstaa): We create two Q-critics so we can use the Clipped
            # double-Q trick.
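            # The Bellman backup target then uses min(Q1_targ, Q2_targ) to reduce
            # overestimation bias.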
            self.q_1 = QCritic(
                obs_dim=self._s_dim,
                act_dim=self._a_dim,
                hidden_sizes=self._network_structure["q_critic"],
            ).to(self.device)
            self.q_2 = QCritic(
                obs_dim=self._s_dim,
                act_dim=self._a_dim,
                hidden_sizes=self._network_structure["q_critic"],
            ).to(self.device)

        # Create GA, LC and QC target networks
        # These are not optimized directly but are updated according to the EMA of the
        # main networks.
        self.ga_ = deepcopy(self.ga).to(self.device)
        if self.use_lyapunov:
            self.lc_ = deepcopy(self.lc).to(self.device)
        else:
            self.q_1_ = deepcopy(self.q_1).to(self.device)
            self.q_2_ = deepcopy(self.q_2).to(self.device)

        # Freeze target networks
        for p in self.ga_.parameters():
            p.requires_grad = False
        if self.use_lyapunov:
            for p in self.lc_.parameters():
                p.requires_grad = False
        else:
            for p in self.q_1_.parameters():
                p.requires_grad = False
            for p in self.q_2_.parameters():
                p.requires_grad = False

        # Create optimizers
        # NOTE (rickstaa): We optimize log_alpha and log_labda here instead of
        # alpha and labda because this is more numerically stable (see:
        # https://github.com/rail-berkeley/softlearning/issues/136).
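        # The concrete alpha/labda values are obtained elsewhere by exponentiating (and
        # clamping) these log variables; see the `alpha` and `labda` properties above.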
        self._alpha_train = Adam([self.log_alpha], lr=self._lr_a)
        self._a_train = Adam(self.ga.parameters(), lr=self._lr_a)
        if self.use_lyapunov:
            self._lambda_train = Adam([self.log_labda], lr=self._lr_lag)
            self._l_train = Adam(self.lc.parameters(), lr=self._lr_l)
        else:
            q_params = itertools.chain(
                self.q_1.parameters(),
                self.q_2.parameters())  # Chain parameters of the two Q-critics
            self._q_train = Adam(q_params, lr=self._lr_c)