def _build_l(self, name="lyapunov_critic", trainable=True, seed=None):
    """Sets up the Lyapunov critic graph.

    Args:
        name (str, optional): Network name. Defaults to "lyapunov_critic".
        trainable (bool, optional): Whether the weights of the network layers should
            be trainable. Defaults to True.
        seed (int, optional): The seed used for the weight initialization. Defaults
            to None.

    Returns:
        LyapunovCritic: The Lyapunov critic network.
    """
    # Return the Lyapunov Critic (LC) network
    # TODO: Check if trainable is needed
    return LyapunovCritic(
        obs_dim=self.s_dim,
        act_dim=self.a_dim,
        hidden_sizes=self.network_structure["critic"],
        name=name,
        trainable=trainable,
        seed=seed,
    )
class LAC(tf.Module):
    """The Lyapunov Actor-Critic.

    Attributes:
        ga (tf.keras.Model): The Squashed Gaussian Actor network.
        ga_ (tf.keras.Model): The Squashed Gaussian Actor target network.
        lc (tf.keras.Model): The Lyapunov Critic network.
        lc_ (tf.keras.Model): The Lyapunov Critic target network.
        q_1 (tf.keras.Model): The first Q-Critic network.
        q_2 (tf.keras.Model): The second Q-Critic network.
        q_1_ (tf.keras.Model): The first Q-Critic target network.
        q_2_ (tf.keras.Model): The second Q-Critic target network.
        log_alpha (tf.Variable): The temperature Lagrange multiplier.
        log_labda (tf.Variable): The Lyapunov Lagrange multiplier.
        target_entropy (int): The target entropy.
        device (str): The device the networks are placed on (CPU or GPU).
        use_lyapunov (bool): Whether the Lyapunov Critic is used (use_lyapunov=True)
            or the regular Q-Critic (use_lyapunov=False).
    """

    def __init__(self, a_dim, s_dim, act_limits=None):
        """Initialises the object state.

        Args:
            a_dim (int): Action space dimension.
            s_dim (int): Observation space dimension.
            act_limits (dict, optional): The "high" and "low" action bounds of the
                environment. Used for rescaling the actions that come out of the
                network from (-1, 1) to (low, high). Defaults to (-1, 1).
        """
        # Display information about the algorithm being used (LAC or SAC)
        if ALG_PARAMS["use_lyapunov"]:
            print(colorize("INFO: You are using the LAC algorithm.", "green", bold=True))
        else:
            print(colorize("WARN: You are using the SAC algorithm.", "yellow", bold=True))

        # Save action and observation space dimensions as members
        self._a_dim = a_dim
        self._s_dim = s_dim
        self._act_limits = act_limits

        # Save algorithm parameters as class attributes
        self.use_lyapunov = ALG_PARAMS["use_lyapunov"]
        self._network_structure = ALG_PARAMS["network_structure"]
        self._polyak = 1 - ALG_PARAMS["tau"]
        self._gamma = ALG_PARAMS["gamma"]
        self._alpha_3 = ALG_PARAMS["alpha3"]

        # Determine target entropy
        # NOTE (rickstaa): If not defined we use the lower bound of the policy entropy
        if ALG_PARAMS["target_entropy"] is None:
            self.target_entropy = -self._a_dim
        else:
            self.target_entropy = ALG_PARAMS["target_entropy"]

        # Create learning rate variables
        self._lr_a = tf.Variable(ALG_PARAMS["lr_a"], name="LR_A")
        if self.use_lyapunov:
            self._lr_lag = tf.Variable(ALG_PARAMS["lr_a"], name="LR_lag")
            self._lr_l = tf.Variable(ALG_PARAMS["lr_l"], name="LR_L")
        else:
            self._lr_c = tf.Variable(ALG_PARAMS["lr_c"], name="LR_C")

        # Make sure alpha and labda are not zero
        # NOTE (rickstaa): This is needed to prevent log_alpha/log_lambda from becoming
        # -np.inf
        ALG_PARAMS["alpha"] = (
            1e-37 if ALG_PARAMS["alpha"] == 0.0 else ALG_PARAMS["alpha"]
        )
        ALG_PARAMS["labda"] = (
            1e-37 if ALG_PARAMS["labda"] == 0.0 else ALG_PARAMS["labda"]
        )

        # Create variables for the Lagrange multipliers
        self.log_alpha = tf.Variable(tf.math.log(ALG_PARAMS["alpha"]), name="log_alpha")
        if self.use_lyapunov:
            self.log_labda = tf.Variable(
                tf.math.log(ALG_PARAMS["labda"]), name="log_lambda"
            )

        # Create Gaussian Actor (GA) and Lyapunov Critic (LC) or Q-Critic (QC) networks
        self.ga = SquashedGaussianActor(
            obs_dim=self._s_dim,
            act_dim=self._a_dim,
            hidden_sizes=self._network_structure["actor"],
            act_limits=self.act_limits,
        )
        if self.use_lyapunov:
            self.lc = LyapunovCritic(
                obs_dim=self._s_dim,
                act_dim=self._a_dim,
                hidden_sizes=self._network_structure["critic"],
            )
        else:
            # NOTE (rickstaa): We create two Q-critics so we can use the clipped
            # double-Q trick.
            self.q_1 = QCritic(
                obs_dim=self._s_dim,
                act_dim=self._a_dim,
                hidden_sizes=self._network_structure["q_critic"],
            )
            self.q_2 = QCritic(
                obs_dim=self._s_dim,
                act_dim=self._a_dim,
                hidden_sizes=self._network_structure["q_critic"],
            )

        # Create GA, LC and QC target networks
        # These don't get optimized but are updated according to the EMA of the main
        # networks
        self.ga_ = SquashedGaussianActor(
            obs_dim=self._s_dim,
            act_dim=self._a_dim,
            hidden_sizes=self._network_structure["actor"],
            act_limits=self.act_limits,
        )
        if self.use_lyapunov:
            self.lc_ = LyapunovCritic(
                obs_dim=self._s_dim,
                act_dim=self._a_dim,
                hidden_sizes=self._network_structure["critic"],
            )
        else:
            self.q_1_ = QCritic(
                obs_dim=self._s_dim,
                act_dim=self._a_dim,
                hidden_sizes=self._network_structure["q_critic"],
            )
            self.q_2_ = QCritic(
                obs_dim=self._s_dim,
                act_dim=self._a_dim,
                hidden_sizes=self._network_structure["q_critic"],
            )
        self._init_targets()

        # Create optimizers
        # NOTE (rickstaa): We here optimize for log_alpha and log_labda instead of
        # alpha and labda because it is more numerically stable (see:
        # https://github.com/rail-berkeley/softlearning/issues/136)
        self._a_train = tf.keras.optimizers.Adam(learning_rate=self._lr_a)
        self._alpha_train = tf.keras.optimizers.Adam(learning_rate=self._lr_a)
        if self.use_lyapunov:
            self._lambda_train = tf.keras.optimizers.Adam(learning_rate=self._lr_lag)
            self._l_train = tf.keras.optimizers.Adam(learning_rate=self._lr_l)
        else:
            self._main_q_vars = (
                self.q_1.trainable_variables + self.q_2.trainable_variables
            )  # Chain parameters of the two Q-critics
            self._q_train = tf.keras.optimizers.Adam(learning_rate=self._lr_c)

        # Create model save dict
        if self.use_lyapunov:
            self._save_dict = {
                "gaussian_actor": self.ga,
                "lyapunov_critic": self.lc,
                "log_alpha": self.log_alpha,
                "log_labda": self.log_labda,
                "use_lyapunov": self.use_lyapunov,
            }
        else:
            self._save_dict = {
                "gaussian_actor": self.ga,
                "q_critic_1": self.q_1,
                "q_critic_2": self.q_2,
                "log_alpha": self.log_alpha,
                "use_lyapunov": self.use_lyapunov,
            }

    @tf.function
    def choose_action(self, s, evaluation=False):
        """Returns the current action of the policy.

        Args:
            s (numpy.ndarray): The current state.
            evaluation (bool, optional): Whether to return a deterministic action.
                Defaults to False.

        Returns:
            numpy.ndarray: The current action.
        """
        # Make sure s is a float32 TensorFlow tensor
        if not isinstance(s, tf.Tensor):
            s = tf.convert_to_tensor(s, dtype=tf.float32)
        elif s.dtype != tf.float32:
            s = tf.cast(s, dtype=tf.float32)

        # Get current best action
        if evaluation is True:
            try:
                det_a, _ = self.ga(tf.reshape(s, (1, -1)), deterministic=True)
                return det_a[0]
            except ValueError:
                return
        else:
            a, _ = self.ga(tf.reshape(s, (1, -1)))
            return a[0]

    @tf.function
    def learn(self, lr_a, lr_l, lr_lag, lr_c, batch):
        """Runs the SGD to update all the optimized parameters.

        Args:
            lr_a (float): Current actor learning rate.
            lr_l (float): Lyapunov critic learning rate.
            lr_lag (float): Lyapunov constraint Lagrange multiplier learning rate.
            lr_c (float): Q-Critic learning rate.
            batch (dict): The batch of experiences.

        Returns:
            tuple: Tuple with some diagnostics about the training.
        """
        # Adjust optimizer learning rates (decay)
        self._set_learning_rates(
            lr_a=lr_a, lr_alpha=lr_a, lr_l=lr_l, lr_labda=lr_lag, lr_c=lr_c
        )

        ################################################
        # Optimize (Lyapunov/Q) critic #################
        ################################################
        if self.use_lyapunov:
            # Get target Lyapunov value (Bellman-backup)
            a2_, _ = self.ga_(
                batch["s_"]
            )  # NOTE (rickstaa): Target actions come from the *current* *target* policy
            l_pi_targ = self.lc_(batch["s_"], a2_)
            l_backup = (
                batch["r"] + self._gamma * (1 - batch["terminal"]) * l_pi_targ
            )  # The Lyapunov candidate

            # Compute Lyapunov Critic error gradients
            with tf.GradientTape() as l_tape:
                # Get current Lyapunov value
                l1 = self.lc(batch["s"], batch["a"])

                # Calculate Lyapunov *CRITIC* error
                # NOTE (rickstaa): The 0.5 multiplication factor was added to make the
                # derivation cleaner and can be safely removed without influencing the
                # minimization. We kept it here for consistency.
                l_error = 0.5 * tf.reduce_mean((l1 - l_backup) ** 2)  # See eq. 7

            # Perform one gradient descent step for the Lyapunov critic
            l_grads = l_tape.gradient(l_error, self.lc.trainable_variables)
            self._l_train.apply_gradients(zip(l_grads, self.lc.trainable_variables))
        else:
            # Get target Q values (Bellman-backup)
            # NOTE (rickstaa): Here we use max-clipping instead of the min-clipping
            # used in the SAC algorithm since we want to minimize the return.
            a2, logp_a2 = self.ga(
                batch["s_"]
            )  # NOTE (rickstaa): Target actions come from the *current* policy
            q1_pi_targ = self.q_1_(batch["s_"], a2)
            q2_pi_targ = self.q_2_(batch["s_"], a2)
            q_pi_targ = tf.maximum(
                q1_pi_targ, q2_pi_targ
            )  # Use max clipping to prevent overestimation bias.
            q_backup = batch["r"] + self._gamma * (1 - batch["terminal"]) * (
                q_pi_targ - self.alpha * logp_a2
            )

            # Compute the Q-Critic loss gradients
            with tf.GradientTape() as q_tape:
                # Get the current Q values
                q1 = self.q_1(batch["s"], batch["a"])
                q2 = self.q_2(batch["s"], batch["a"])

                # Calculate Q-critic loss
                loss_q1 = 0.5 * tf.reduce_mean((q1 - q_backup) ** 2)  # See Haarnoja eq. 5
                loss_q2 = 0.5 * tf.reduce_mean((q2 - q_backup) ** 2)
                loss_q = loss_q1 + loss_q2

            # Perform one gradient descent step for the Q-critic
            q_grads = q_tape.gradient(loss_q, self._main_q_vars)
            self._q_train.apply_gradients(zip(q_grads, self._main_q_vars))

        ################################################
        # Optimize Gaussian actor ######################
        ################################################
        # Compute actor loss gradients
        with tf.GradientTape() as a_tape:
            # Retrieve log probabilities of batch observations based on the *current*
            # policy
            pi, log_pis = self.ga(batch["s"])

            # Compute actor loss
            if self.use_lyapunov:
                # Calculate the target Lyapunov value
                a2, _ = self.ga(
                    batch["s_"]
                )  # NOTE (rickstaa): Target actions come from the *current* policy
                lya_l_ = self.lc(batch["s_"], a2)

                # Compute Lyapunov Actor error
                self.l_delta = tf.reduce_mean(
                    lya_l_ - tf.stop_gradient(l1) + self._alpha_3 * batch["r"]
                )  # See eq. 11

                # Calculate actor loss
                a_loss = (
                    tf.stop_gradient(self.labda) * self.l_delta
                    + tf.stop_gradient(self.alpha) * tf.reduce_mean(log_pis)
                )  # See eq. 12
            else:
                # Retrieve the current Q values
                q1_pi = self.q_1(
                    batch["s"], pi
                )  # NOTE (rickstaa): Actions come from the *current* policy
                q2_pi = self.q_2(
                    batch["s"], pi
                )  # NOTE (rickstaa): Actions come from the *current* policy
                q_pi = tf.maximum(q1_pi, q2_pi)

                # Calculate actor loss
                a_loss = tf.reduce_mean(
                    tf.stop_gradient(self.alpha) * log_pis - q_pi
                )  # See Haarnoja eq. 7

        # Perform one gradient descent step for the Gaussian Actor
        a_grads = a_tape.gradient(a_loss, self.ga.trainable_variables)
        self._a_train.apply_gradients(zip(a_grads, self.ga.trainable_variables))

        ################################################
        # Optimize alpha (Entropy temperature) #########
        ################################################
        # Compute alpha loss gradients
        with tf.GradientTape() as alpha_tape:
            # Calculate alpha loss
            alpha_loss = -tf.reduce_mean(
                self.alpha * tf.stop_gradient(log_pis + self.target_entropy)
            )  # See Haarnoja eq. 17

        # Perform one gradient descent step for alpha
        alpha_grads = alpha_tape.gradient(alpha_loss, [self.log_alpha])
        self._alpha_train.apply_gradients(zip(alpha_grads, [self.log_alpha]))

        ################################################
        # Optimize labda (Lyapunov temperature) ########
        ################################################
        if self.use_lyapunov:
            # Compute labda loss gradients
            with tf.GradientTape() as lambda_tape:
                # Calculate labda loss
                # NOTE (rickstaa): Log_labda was used in the lambda_loss function
                # because using lambda caused the gradients to vanish. This happens
                # because we restrict lambda to the 0-1.0 range using the clamp
                # function (see #38). Using log_lambda is also more numerically stable.
                labda_loss = -tf.reduce_mean(
                    self.log_labda * tf.stop_gradient(self.l_delta)
                )  # See formulas under eq. 14

            # Perform one gradient descent step for labda
            lambda_grads = lambda_tape.gradient(labda_loss, [self.log_labda])
            self._lambda_train.apply_gradients(zip(lambda_grads, [self.log_labda]))

        ################################################
        # Update target networks and return ############
        # diagnostics. #################################
        ################################################
        # Update target networks
        self._update_targets()

        # Return diagnostics
        if self.use_lyapunov:
            return (
                self.labda,
                self.alpha,
                l_error,
                tf.reduce_mean(tf.stop_gradient(-log_pis)),
                a_loss,
                alpha_loss,
                labda_loss,
            )
        else:
            return (
                self.alpha,
                loss_q,
                tf.reduce_mean(tf.stop_gradient(-log_pis)),
                a_loss,
                alpha_loss,
            )

    def save_result(self, path):
        """Saves the current policy.

        Args:
            path (str): The path where you want to save the policy.
        """
        # Make save path absolute
        save_path = osp.abspath(osp.join(path, "policy"))

        # Create folder if it does not yet exist
        if not os.path.exists(save_path):
            os.makedirs(save_path)

        # Save all models/tensors in the _save_dict
        vars_dict = {}
        for name, item in self._save_dict.items():
            if issubclass(item.__class__, tf.keras.Model):
                item.save_weights(osp.join(save_path, name))
                print(
                    colorize(
                        f"Saved '{name}' weights to path: {save_path}",
                        "cyan",
                        bold=True,
                    )
                )
            elif issubclass(item.__class__, tf.Variable):
                vars_dict[name] = item.numpy()
            else:
                vars_dict[name] = item

        # Save vars dictionary
        with open(osp.join(save_path, "vars.json"), "w") as fp:
            vars_dict = convert_json(vars_dict)  # Convert to json format
            json_data = json.dumps(
                vars_dict, separators=(",", ":\t"), indent=4, sort_keys=True
            )
            fp.write(json_data)
        print(colorize("Saving other vars:\n", color="cyan", bold=True))
        print(colorize(json_data, "cyan", bold=True))

    def restore(self, path, restore_lagrance_multipliers=True):
        """Restores the policy.

        Args:
            path (str): The path where the policy you want to load is stored.
            restore_lagrance_multipliers (bool, optional): Whether you want to restore
                the Lagrange multipliers. Defaults to True.

        Returns:
            bool: Boolean specifying whether the policy was loaded successfully.
        """
        # Create load path
        load_path = osp.abspath(path)

        # Load train configuration
        try:
            with open(osp.join(load_path, "vars.json"), "r") as f:
                train_config = json.load(f)
        except (FileNotFoundError, NotADirectoryError):
            success_load = False
            return success_load

        # Throw warning if the restored model is different from the model used now
        if self.use_lyapunov != train_config["use_lyapunov"]:
            alg_strings = [
                "LAC" if self.use_lyapunov else "SAC",
                "LAC" if train_config["use_lyapunov"] else "SAC",
            ]
            if TRAIN_PARAMS["continue_training"]:
                warn_str = colorize(
                    (
                        f"ERROR: You tried to load a {alg_strings[1]} model while the "
                        f"`variant.py` file specifies you want to train it as a "
                        f"{alg_strings[0]} model. Shutting down training as this is "
                        "not yet supported."
                    ),
                    "red",
                    bold=True,
                )
                print(warn_str)
                sys.exit(0)
            else:
                warn_str = colorize(
                    (
                        f"ERROR: You tried to load a {alg_strings[1]} model while the "
                        f"`variant.py` file specifies you want to use it in the "
                        f"inference as a {alg_strings[0]} model. As a result the "
                        "`variant.py` will be ignored."
                    ),
                    "yellow",
                    bold=True,
                )
                print(warn_str)
                self.__reload_critic_networks(use_lyapunov=train_config["use_lyapunov"])

        # Check if the models exist
        try:
            checkpoints = [
                f.replace(".index", "")
                for f in os.listdir(load_path)
                if f.endswith(".index")
            ]
        except (FileNotFoundError, NotADirectoryError):
            success_load = False
            return success_load

        # Check if any checkpoints were found
        if not checkpoints:
            success_load = False
            return success_load

        # Restore network parameters
        try:
            if train_config["use_lyapunov"]:
                self.ga.load_weights(load_path + "/gaussian_actor")
                self.lc.load_weights(load_path + "/lyapunov_critic")
                if restore_lagrance_multipliers:
                    self.log_alpha = train_config["log_alpha"]
                    self.log_labda = train_config["log_labda"]
            else:
                self.ga.load_weights(load_path + "/gaussian_actor")
                self.q_1.load_weights(load_path + "/q_critic_1")
                self.q_2.load_weights(load_path + "/q_critic_2")
                if restore_lagrance_multipliers:
                    self.log_alpha = train_config["log_alpha"]
        except (KeyError, AttributeError):
            alg_string = "LAC" if train_config["use_lyapunov"] else "SAC"
            print(
                colorize(
                    (
                        "ERROR: Something went wrong while trying to load the "
                        f"{alg_string} model. Shutting down the training."
                    ),
                    "red",
                    bold=True,
                )
            )
            sys.exit(0)

        # Return result
        success_load = True
        return success_load

    def __reload_critic_networks(self, use_lyapunov):
        """Reloads the right critic networks when the loaded model type differs from
        the type set in the `variant.py` file. Currently only used during inference.

        Args:
            use_lyapunov (bool): Whether the new setup should use Lyapunov or not.
        """
        # Create required networks
        if use_lyapunov:  # LAC
            # Print reload message
            print(
                colorize(
                    "INFO: You switched to using the LAC algorithm.",
                    "green",
                    bold=True,
                )
            )

            # Create log_labda
            self.log_labda = tf.Variable(
                tf.math.log(ALG_PARAMS["labda"]), name="log_lambda"
            )

            # Create main and target Lyapunov Critic networks
            self.lc = LyapunovCritic(
                obs_dim=self._s_dim,
                act_dim=self._a_dim,
                hidden_sizes=self._network_structure["critic"],
            )
            self.lc_ = LyapunovCritic(
                obs_dim=self._s_dim,
                act_dim=self._a_dim,
                hidden_sizes=self._network_structure["critic"],
            )

            # Remove main and target Q-Critic networks
            # NOTE (rickstaa): Removed to make sure we notice if something goes wrong.
            delattr(self, "q_1")
            delattr(self, "q_2")
            delattr(self, "q_1_")
            delattr(self, "q_2_")
        else:  # SAC
            # Print reload message
            print(
                colorize(
                    "WARN: You switched to using the SAC algorithm.",
                    "yellow",
                    bold=True,
                )
            )

            # Create main and target Q-Critic networks
            self.q_1 = QCritic(
                obs_dim=self._s_dim,
                act_dim=self._a_dim,
                hidden_sizes=self._network_structure["q_critic"],
            )
            self.q_2 = QCritic(
                obs_dim=self._s_dim,
                act_dim=self._a_dim,
                hidden_sizes=self._network_structure["q_critic"],
            )
            self.q_1_ = QCritic(
                obs_dim=self._s_dim,
                act_dim=self._a_dim,
                hidden_sizes=self._network_structure["q_critic"],
            )
            self.q_2_ = QCritic(
                obs_dim=self._s_dim,
                act_dim=self._a_dim,
                hidden_sizes=self._network_structure["q_critic"],
            )

            # Remove main and target Lyapunov Critic networks
            delattr(self, "lc")
            delattr(self, "lc_")

    def _set_learning_rates(
        self, lr_a=None, lr_alpha=None, lr_l=None, lr_labda=None, lr_c=None
    ):
        """Adjusts the learning rates of the optimizers.

        Args:
            lr_a (float, optional): The learning rate of the actor optimizer. Defaults
                to None.
            lr_alpha (float, optional): The learning rate of the temperature
                optimizer. Defaults to None.
            lr_l (float, optional): The learning rate of the Lyapunov critic. Defaults
                to None.
            lr_labda (float, optional): The learning rate of the Lyapunov Lagrange
                multiplier optimizer. Defaults to None.
            lr_c (float, optional): The learning rate of the Q-Critic optimizer.
                Defaults to None.
        """
        if lr_a:
            self._a_train.lr.assign(lr_a)
        if lr_alpha:
            self._alpha_train.lr.assign(lr_alpha)
        if self.use_lyapunov:
            if lr_l:
                self._l_train.lr.assign(lr_l)
            if lr_labda:
                self._lambda_train.lr.assign(lr_labda)
        else:
            if lr_c:
                self._q_train.lr.assign(lr_c)

    @tf.function
    def _init_targets(self):
        """Updates the target network weights to the main network weights."""
        for ga_main, ga_targ in zip(self.ga.variables, self.ga_.variables):
            ga_targ.assign(ga_main)
        if self.use_lyapunov:
            for lc_main, lc_targ in zip(self.lc.variables, self.lc_.variables):
                lc_targ.assign(lc_main)
        else:
            for q_1_main, q_1_targ in zip(self.q_1.variables, self.q_1_.variables):
                q_1_targ.assign(q_1_main)
            for q_2_main, q_2_targ in zip(self.q_2.variables, self.q_2_.variables):
                q_2_targ.assign(q_2_main)

    @tf.function
    def _update_targets(self):
        """Updates the target networks based on an exponential moving average
        (Polyak averaging).
        """
        for ga_main, ga_targ in zip(self.ga.variables, self.ga_.variables):
            ga_targ.assign(self._polyak * ga_targ + (1 - self._polyak) * ga_main)
        if self.use_lyapunov:
            for lc_main, lc_targ in zip(self.lc.variables, self.lc_.variables):
                lc_targ.assign(self._polyak * lc_targ + (1 - self._polyak) * lc_main)
        else:
            for q_1_main, q_1_targ in zip(self.q_1.variables, self.q_1_.variables):
                q_1_targ.assign(
                    self._polyak * q_1_targ + (1 - self._polyak) * q_1_main
                )
            for q_2_main, q_2_targ in zip(self.q_2.variables, self.q_2_.variables):
                q_2_targ.assign(
                    self._polyak * q_2_targ + (1 - self._polyak) * q_2_main
                )

    @property
    def alpha(self):
        """Property used to clip alpha to be equal to or bigger than 0.0 to prevent it
        from becoming nan when log_alpha becomes -inf. For alpha no upper bound is
        used.
        """
        return tf.clip_by_value(tf.exp(self.log_alpha), *SCALE_ALPHA_MIN_MAX)

    @property
    def labda(self):
        """Property used to clip lambda to be equal to or bigger than 0.0 in order to
        prevent it from becoming nan when log_labda becomes -inf. Further we clip it
        to be lower than or equal to 1.0 in order to prevent lambda from exploding
        when the hyperparameters are chosen badly.
        """
        return tf.clip_by_value(tf.exp(self.log_labda), *SCALE_LAMBDA_MIN_MAX)

    @property
    def act_limits(self):
        """The action limits used for scaling the actions returned by the Gaussian
        policy.
        """
        return self._act_limits

    @act_limits.setter
    def act_limits(self, act_limits):
        """Sets the action limits that are used for scaling the actions that are
        returned from the Gaussian policy.
        """
        # Validate input
        missing_keys = [key for key in ["low", "high"] if key not in act_limits.keys()]
        if missing_keys:
            warn_string = "WARN: act_limits could not be set as {} not found.".format(
                f"keys {missing_keys} were"
                if len(missing_keys) > 1
                else f"key {missing_keys} was"
            )
            print(colorize(warn_string, "yellow"))
        invalid_length = [
            key for key, val in act_limits.items() if len(val) != self._a_dim
        ]
        if invalid_length:
            warn_string = (
                f"WARN: act_limits could not be set as the length of {invalid_length} "
                + "{}".format("were" if len(invalid_length) > 1 else "was")
                + f" unequal to the dimension of the action space (dim={self._a_dim})."
            )
            print(colorize(warn_string, "yellow"))

        # Set action limits
        self._act_limits = {"low": act_limits["low"], "high": act_limits["high"]}
        self.ga.act_limits = self._act_limits
class LAC(object):
    """The Lyapunov Actor-Critic.

    Attributes:
        ga (torch.nn.Module): The Squashed Gaussian Actor network.
        ga_ (torch.nn.Module): The Squashed Gaussian Actor target network.
        lc (torch.nn.Module): The Lyapunov Critic network.
        lc_ (torch.nn.Module): The Lyapunov Critic target network.
        q_1 (torch.nn.Module): The first Q-Critic network.
        q_2 (torch.nn.Module): The second Q-Critic network.
        q_1_ (torch.nn.Module): The first Q-Critic target network.
        q_2_ (torch.nn.Module): The second Q-Critic target network.
        log_alpha (torch.Tensor): The temperature Lagrange multiplier.
        log_labda (torch.Tensor): The Lyapunov Lagrange multiplier.
        target_entropy (int): The target entropy.
        device (str): The device the networks are placed on (CPU or GPU).
        use_lyapunov (bool): Whether the Lyapunov Critic is used (use_lyapunov=True)
            or the regular Q-Critic (use_lyapunov=False).
    """

    def __init__(self, a_dim, s_dim, act_limits=None):
        """Initialises the object state.

        Args:
            a_dim (int): Action space dimension.
            s_dim (int): Observation space dimension.
            act_limits (dict, optional): The "high" and "low" action bounds of the
                environment. Used for rescaling the actions that come out of the
                network from (-1, 1) to (low, high). Defaults to (-1, 1).
        """
        # Display information about the algorithm being used (LAC or SAC)
        if ALG_PARAMS["use_lyapunov"]:
            print(colorize("INFO: You are using the LAC algorithm.", "green", bold=True))
        else:
            print(colorize("WARN: You are using the SAC algorithm.", "yellow", bold=True))

        # Set the computational device
        self.device = DEVICE

        # Save action and observation space dimensions as members
        self._a_dim = a_dim
        self._s_dim = s_dim
        self._act_limits = act_limits

        # Save algorithm parameters as class attributes
        self.use_lyapunov = ALG_PARAMS["use_lyapunov"]
        self._network_structure = ALG_PARAMS["network_structure"]
        self._polyak = 1 - ALG_PARAMS["tau"]
        self._gamma = ALG_PARAMS["gamma"]
        self._alpha_3 = ALG_PARAMS["alpha3"]

        # Determine target entropy
        # NOTE (rickstaa): If not defined we use the lower bound of the policy entropy
        if ALG_PARAMS["target_entropy"] is None:
            self.target_entropy = -self._a_dim
        else:
            self.target_entropy = ALG_PARAMS["target_entropy"]

        # Store the initial learning rates
        self._lr_a = ALG_PARAMS["lr_a"]
        if self.use_lyapunov:
            self._lr_lag = ALG_PARAMS["lr_a"]
            self._lr_l = ALG_PARAMS["lr_l"]
        else:
            self._lr_c = ALG_PARAMS["lr_c"]

        # Make sure alpha and labda are not zero
        # NOTE (rickstaa): This is needed to prevent log_alpha/log_lambda from becoming
        # -np.inf
        ALG_PARAMS["alpha"] = (
            1e-37 if ALG_PARAMS["alpha"] == 0.0 else ALG_PARAMS["alpha"]
        )
        ALG_PARAMS["labda"] = (
            1e-37 if ALG_PARAMS["labda"] == 0.0 else ALG_PARAMS["labda"]
        )

        # Create variables for the Lagrange multipliers
        self.log_alpha = torch.tensor(ALG_PARAMS["alpha"], dtype=torch.float32).log()
        self.log_alpha.requires_grad = True
        if self.use_lyapunov:
            self.log_labda = torch.tensor(
                ALG_PARAMS["labda"], dtype=torch.float32
            ).log()
            self.log_labda.requires_grad = True

        # Create Gaussian Actor (GA) and Lyapunov Critic (LC) or Q-Critic (QC) networks
        # NOTE (rickstaa): Pytorch currently uses Kaiming initialization for the
        # biases; in the future this will change to zero initialization
        # (https://github.com/pytorch/pytorch/issues/18182). This however does not
        # influence the results.
        self.ga = SquashedGaussianActor(
            obs_dim=self._s_dim,
            act_dim=self._a_dim,
            hidden_sizes=self._network_structure["actor"],
            act_limits=self.act_limits,
        ).to(self.device)
        if self.use_lyapunov:
            self.lc = LyapunovCritic(
                obs_dim=self._s_dim,
                act_dim=self._a_dim,
                hidden_sizes=self._network_structure["critic"],
            ).to(self.device)
        else:
            # NOTE (rickstaa): We create two Q-critics so we can use the clipped
            # double-Q trick.
            self.q_1 = QCritic(
                obs_dim=self._s_dim,
                act_dim=self._a_dim,
                hidden_sizes=self._network_structure["q_critic"],
            ).to(self.device)
            self.q_2 = QCritic(
                obs_dim=self._s_dim,
                act_dim=self._a_dim,
                hidden_sizes=self._network_structure["q_critic"],
            ).to(self.device)

        # Create GA, LC and QC target networks
        # These don't get optimized but are updated according to the EMA of the main
        # networks
        self.ga_ = deepcopy(self.ga).to(self.device)
        if self.use_lyapunov:
            self.lc_ = deepcopy(self.lc).to(self.device)
        else:
            self.q_1_ = deepcopy(self.q_1).to(self.device)
            self.q_2_ = deepcopy(self.q_2).to(self.device)

        # Freeze target networks
        for p in self.ga_.parameters():
            p.requires_grad = False
        if self.use_lyapunov:
            for p in self.lc_.parameters():
                p.requires_grad = False
        else:
            for p in self.q_1_.parameters():
                p.requires_grad = False
            for p in self.q_2_.parameters():
                p.requires_grad = False

        # Create optimizers
        # NOTE (rickstaa): We here optimize for log_alpha and log_labda instead of
        # alpha and labda because it is more numerically stable (see:
        # https://github.com/rail-berkeley/softlearning/issues/136)
        self._alpha_train = Adam([self.log_alpha], lr=self._lr_a)
        self._a_train = Adam(self.ga.parameters(), lr=self._lr_a)
        if self.use_lyapunov:
            self._lambda_train = Adam([self.log_labda], lr=self._lr_lag)
            self._l_train = Adam(self.lc.parameters(), lr=self._lr_l)
        else:
            q_params = itertools.chain(
                self.q_1.parameters(), self.q_2.parameters()
            )  # Chain parameters of the two Q-critics
            self._q_train = Adam(q_params, lr=self._lr_c)

    def choose_action(self, s, evaluation=False):
        """Returns the current action of the policy.

        Args:
            s (numpy.ndarray): The current state.
            evaluation (bool, optional): Whether to return a deterministic action.
                Defaults to False.

        Returns:
            numpy.ndarray: The current action.
        """
        # Make sure s is a float32 torch tensor
        s = torch.as_tensor(s, dtype=torch.float32).to(self.device)

        # Get current best action
        if evaluation is True:
            try:
                with torch.no_grad():
                    det_a, _ = self.ga(s.unsqueeze(0), deterministic=True)
                    return det_a[0].cpu().numpy()
            except ValueError:
                return
        else:
            with torch.no_grad():
                a, _ = self.ga(s.unsqueeze(0))
                return a[0].cpu().numpy()

    def learn(self, lr_a, lr_l, lr_c, lr_lag, batch):
        """Runs the SGD to update all the optimized parameters.

        Args:
            lr_a (float): Current actor learning rate.
            lr_l (float): Lyapunov critic learning rate.
            lr_c (float): Q-Critic learning rate.
            lr_lag (float): Lyapunov constraint Lagrange multiplier learning rate.
            batch (dict): The batch of experiences.

        Returns:
            tuple: Tuple with some diagnostics about the training.
        """
        # Adjust optimizer learning rates (decay)
        self._set_learning_rates(
            lr_a=lr_a, lr_alpha=lr_a, lr_l=lr_l, lr_labda=lr_lag, lr_c=lr_c
        )

        ################################################
        # Optimize (Lyapunov/Q) critic #################
        ################################################
        if self.use_lyapunov:
            # Zero gradients on the L-critic
            self._l_train.zero_grad()

            # Get target Lyapunov value (Bellman-backup)
            with torch.no_grad():
                a2_, _ = self.ga_(
                    batch["s_"]
                )  # NOTE (rickstaa): Target actions come from the *current* *target* policy
                l_pi_targ = self.lc_(batch["s_"], a2_)
                l_backup = (
                    batch["r"]
                    + self._gamma * (1 - batch["terminal"]) * l_pi_targ.detach()
                )

            # Get current Lyapunov value
            l1 = self.lc(batch["s"], batch["a"])

            # Calculate Lyapunov *CRITIC* error
            # NOTE (rickstaa): The 0.5 multiplication factor was added to make the
            # derivation cleaner and can be safely removed without influencing the
            # minimization. We kept it here for consistency.
            # NOTE (rickstaa): I use a manual implementation instead of using
            # F.mse_loss as this is 2 times faster. This can be changed back to
            # F.mse_loss if Torchscript is used.
            l_error = 0.5 * ((l1 - l_backup) ** 2).mean()  # See eq. 7

            # Perform one gradient descent step for the Lyapunov critic
            l_error.backward()
            self._l_train.step()
        else:
            # Zero gradients on the Q-critic
            self._q_train.zero_grad()

            # Get target Q values (Bellman-backup)
            # NOTE (rickstaa): Here we use max-clipping instead of the min-clipping
            # used in the SAC algorithm since we want to minimize the return.
            with torch.no_grad():
                a2, logp_a2 = self.ga(
                    batch["s_"]
                )  # NOTE (rickstaa): Target actions come from the *current* policy
                q1_pi_targ = self.q_1_(batch["s_"], a2)
                q2_pi_targ = self.q_2_(batch["s_"], a2)
                q_pi_targ = torch.max(
                    q1_pi_targ,
                    q2_pi_targ,
                )  # Use max clipping to prevent overestimation bias.
                q_backup = batch["r"] + self._gamma * (1 - batch["terminal"]) * (
                    q_pi_targ - self.alpha * logp_a2
                )

            # Get the current Q values
            q1 = self.q_1(batch["s"], batch["a"])
            q2 = self.q_2(batch["s"], batch["a"])

            # Calculate Q-critic loss
            loss_q1 = 0.5 * ((q1 - q_backup) ** 2).mean()  # See Haarnoja eq. 5
            loss_q2 = 0.5 * ((q2 - q_backup) ** 2).mean()
            loss_q = loss_q1 + loss_q2

            # Perform one gradient descent step for the Q-critic
            loss_q.backward()
            self._q_train.step()

        ################################################
        # Optimize Gaussian actor ######################
        ################################################
        # Zero gradients on the actor
        self._a_train.zero_grad()

        # Retrieve log probabilities of batch observations based on the *current*
        # policy
        pi, log_pis = self.ga(batch["s"])

        # Compute actor loss
        if self.use_lyapunov:
            # Calculate the target Lyapunov value
            a2, _ = self.ga(
                batch["s_"]
            )  # NOTE (rickstaa): Target actions come from the *current* policy
            lya_l_ = self.lc(batch["s_"], a2)

            # Compute Lyapunov Actor error
            self.l_delta = torch.mean(
                lya_l_ - l1.detach() + self._alpha_3 * batch["r"]
            )  # See eq. 11

            # Compute actor loss
            a_loss = (
                self.labda.detach() * self.l_delta
                + self.alpha.detach() * log_pis.mean()
            )  # See eq. 12
        else:
            # Retrieve the current Q values
            q1_pi = self.q_1(
                batch["s"], pi
            )  # NOTE (rickstaa): Actions come from the *current* policy
            q2_pi = self.q_2(
                batch["s"], pi
            )  # NOTE (rickstaa): Actions come from the *current* policy
            q_pi = torch.max(q1_pi, q2_pi)

            # Compute actor loss
            a_loss = (self.alpha.detach() * log_pis - q_pi).mean()  # See Haarnoja eq. 7

        # Perform one gradient descent step for the Gaussian Actor
        a_loss.backward()
        self._a_train.step()

        ################################################
        # Optimize alpha (Entropy temperature) #########
        ################################################
        # Zero gradients on alpha
        self._alpha_train.zero_grad()

        # Calculate alpha loss
        alpha_loss = -(
            self.alpha * (log_pis + self.target_entropy).detach()
        ).mean()  # See Haarnoja eq. 17

        # Perform one gradient descent step for alpha
        alpha_loss.backward()
        self._alpha_train.step()

        ################################################
        # Optimize labda (Lyapunov temperature) ########
        ################################################
        if self.use_lyapunov:
            # Zero gradients on labda
            self._lambda_train.zero_grad()

            # Calculate labda loss
            # NOTE (rickstaa): Log_labda was used in the lambda_loss function because
            # using lambda caused the gradients to vanish. This happens because we
            # restrict lambda to the 0-1.0 range using the clamp function (see #38).
            # Using log_lambda is also more numerically stable.
            labda_loss = -(
                self.log_labda * self.l_delta.detach()
            ).mean()  # See formulas under eq. 14

            # Perform one gradient descent step for labda
            labda_loss.backward()
            self._lambda_train.step()

        ################################################
        # Update target networks and return ############
        # diagnostics. #################################
        ################################################
        # Update target networks
        self._update_targets()

        # Return diagnostics
        if self.use_lyapunov:
            return (
                self.labda.cpu().detach(),
                self.alpha.cpu().detach(),
                l_error.cpu().detach(),
                torch.mean(-log_pis.cpu().detach()),
                a_loss.cpu().detach(),
                alpha_loss.cpu().detach(),
                labda_loss.cpu().detach(),
            )
        else:
            return (
                self.alpha.cpu().detach(),
                loss_q.cpu().detach(),
                torch.mean(-log_pis.cpu().detach()),
                a_loss.cpu().detach(),
                alpha_loss.cpu().detach(),
            )

    def save_result(self, path):
        """Saves the current policy.

        Args:
            path (str): The path where you want to save the policy.
        """
        # Retrieve the save path
        save_path = osp.abspath(osp.join(path, "policy/model.pth"))

        # Create the folder if it does not exist
        if osp.exists(osp.dirname(save_path)):
            print(
                colorize(
                    (
                        "WARN: Log dir %s already exists! Storing info there anyway."
                        % osp.dirname(save_path)
                    ),
                    "red",
                    bold=True,
                )
            )
        else:
            os.makedirs(osp.dirname(save_path))

        # Create the models state dictionary
        if self.use_lyapunov:
            models_state_save_dict = {
                "use_lyapunov": self.use_lyapunov,
                "ga_state_dict": self.ga.state_dict(),
                "lc_state_dict": self.lc.state_dict(),
                "ga_targ_state_dict": self.ga_.state_dict(),
                "lc_targ_state_dict": self.lc_.state_dict(),
                "log_alpha": self.log_alpha,
                "log_labda": self.log_labda,
            }
        else:
            models_state_save_dict = {
                "use_lyapunov": self.use_lyapunov,
                "ga_state_dict": self.ga.state_dict(),
                "ga_targ_state_dict": self.ga_.state_dict(),
                "q1_state_dict": self.q_1.state_dict(),
                "q2_state_dict": self.q_2.state_dict(),
                "q1_targ_state_dict": self.q_1_.state_dict(),
                "q2_targ_state_dict": self.q_2_.state_dict(),
                "log_alpha": self.log_alpha,
            }

        # Save the model state dictionary
        torch.save(models_state_save_dict, save_path)
        print(colorize(f"INFO: Saved model to path: {save_path}", "cyan", bold=True))

    def restore(self, path, restore_lagrance_multipliers=True):
        """Restores the policy.

        Args:
            path (str): The path where the policy you want to load is stored.
            restore_lagrance_multipliers (bool, optional): Whether you want to restore
                the Lagrange multipliers. Defaults to True.

        Returns:
            bool: Boolean specifying whether the policy was loaded successfully.
        """
        # Create load path
        load_path = osp.abspath(osp.join(path, "model.pth"))

        # Load the model state
        try:
            models_state_dict = torch.load(load_path)
        except (FileNotFoundError, NotADirectoryError):
            success_load = False
            return success_load

        # Throw warning if the restored model is different from the model used now
        if self.use_lyapunov != models_state_dict["use_lyapunov"]:
            alg_strings = [
                "LAC" if self.use_lyapunov else "SAC",
                "LAC" if models_state_dict["use_lyapunov"] else "SAC",
            ]
            if TRAIN_PARAMS["continue_training"]:
                warn_str = colorize(
                    (
                        f"ERROR: You tried to load a {alg_strings[1]} model while the "
                        f"`variant.py` file specifies you want to train it as a "
                        f"{alg_strings[0]} model. Shutting down training as this is "
                        "not yet supported."
                    ),
                    "red",
                    bold=True,
                )
                print(warn_str)
                sys.exit(0)
            else:
                warn_str = colorize(
                    (
                        f"ERROR: You tried to load a {alg_strings[1]} model while the "
                        f"`variant.py` file specifies you want to use it in the "
                        f"inference as a {alg_strings[0]} model. As a result the "
                        "`variant.py` will be ignored."
                    ),
                    "yellow",
                    bold=True,
                )
                print(warn_str)
                self.__reload_critic_networks(
                    use_lyapunov=models_state_dict["use_lyapunov"]
                )

        # Restore network parameters
        try:
            if models_state_dict["use_lyapunov"]:
                self.use_lyapunov = models_state_dict["use_lyapunov"]
                self.ga.load_state_dict(models_state_dict["ga_state_dict"])
                self.lc.load_state_dict(models_state_dict["lc_state_dict"])
                self.ga_.load_state_dict(models_state_dict["ga_targ_state_dict"])
                self.lc_.load_state_dict(models_state_dict["lc_targ_state_dict"])
                if restore_lagrance_multipliers:
                    self.log_alpha = models_state_dict["log_alpha"]
                    self.log_labda = models_state_dict["log_labda"]
            else:
                self.use_lyapunov = models_state_dict["use_lyapunov"]
                self.ga.load_state_dict(models_state_dict["ga_state_dict"])
                self.ga_.load_state_dict(models_state_dict["ga_targ_state_dict"])
                self.q_1.load_state_dict(models_state_dict["q1_state_dict"])
                self.q_2.load_state_dict(models_state_dict["q2_state_dict"])
                self.q_1_.load_state_dict(models_state_dict["q1_targ_state_dict"])
                self.q_2_.load_state_dict(models_state_dict["q2_targ_state_dict"])
                if restore_lagrance_multipliers:
                    self.log_alpha = models_state_dict["log_alpha"]
        except (KeyError, AttributeError):
            alg_string = "LAC" if models_state_dict["use_lyapunov"] else "SAC"
            print(
                colorize(
                    (
                        "ERROR: Something went wrong while trying to load the "
                        f"{alg_string} model. Shutting down the training."
                    ),
                    "red",
                    bold=True,
                )
            )
            sys.exit(0)

        # Return result
        success_load = True
        return success_load

    def __reload_critic_networks(self, use_lyapunov):
        """Reloads the right critic networks when the loaded model type differs from
        the type set in the `variant.py` file. Currently only used during inference.

        Args:
            use_lyapunov (bool): Whether the new setup should use Lyapunov or not.
        """
        # Create required networks
        if use_lyapunov:  # LAC
            # Print reload message
            print(
                colorize(
                    "INFO: You switched to using the LAC algorithm.",
                    "green",
                    bold=True,
                )
            )

            # Create log_labda
            self.log_labda = torch.tensor(
                ALG_PARAMS["labda"], dtype=torch.float32
            ).log()
            self.log_labda.requires_grad = True

            # Create main and target Lyapunov Critic networks
            self.lc = LyapunovCritic(
                obs_dim=self._s_dim,
                act_dim=self._a_dim,
                hidden_sizes=self._network_structure["critic"],
            ).to(self.device)
            self.lc_ = deepcopy(self.lc).to(self.device)

            # Remove main and target Q-Critic networks
            # NOTE (rickstaa): Removed to make sure we notice if something goes wrong.
            delattr(self, "q_1")
            delattr(self, "q_2")
            delattr(self, "q_1_")
            delattr(self, "q_2_")
        else:  # SAC
            # Print reload message
            print(
                colorize(
                    "WARN: You switched to using the SAC algorithm.",
                    "yellow",
                    bold=True,
                )
            )

            # Create main and target Q-Critic networks
            self.q_1 = QCritic(
                obs_dim=self._s_dim,
                act_dim=self._a_dim,
                hidden_sizes=self._network_structure["q_critic"],
            ).to(self.device)
            self.q_2 = QCritic(
                obs_dim=self._s_dim,
                act_dim=self._a_dim,
                hidden_sizes=self._network_structure["q_critic"],
            ).to(self.device)
            self.q_1_ = deepcopy(self.q_1).to(self.device)
            self.q_2_ = deepcopy(self.q_2).to(self.device)

            # Remove main and target Lyapunov Critic networks
            delattr(self, "lc")
            delattr(self, "lc_")

    def _set_learning_rates(
        self, lr_a=None, lr_alpha=None, lr_l=None, lr_labda=None, lr_c=None
    ):
        """Adjusts the learning rates of the optimizers.

        Args:
            lr_a (float, optional): The learning rate of the actor optimizer. Defaults
                to None.
            lr_alpha (float, optional): The learning rate of the temperature
                optimizer. Defaults to None.
            lr_l (float, optional): The learning rate of the Lyapunov critic. Defaults
                to None.
            lr_labda (float, optional): The learning rate of the Lyapunov Lagrange
                multiplier optimizer. Defaults to None.
            lr_c (float, optional): The learning rate of the Q-Critic optimizer.
                Defaults to None.
        """
        if lr_a:
            for param_group in self._a_train.param_groups:
                param_group["lr"] = lr_a
        if lr_alpha:
            for param_group in self._alpha_train.param_groups:
                param_group["lr"] = lr_alpha
        if self.use_lyapunov:
            if lr_l:
                for param_group in self._l_train.param_groups:
                    param_group["lr"] = lr_l
            if lr_labda:
                for param_group in self._lambda_train.param_groups:
                    param_group["lr"] = lr_labda
        else:
            if lr_c:
                for param_group in self._q_train.param_groups:
                    param_group["lr"] = lr_c

    def _update_targets(self):
        """Updates the target networks based on an exponential moving average
        (Polyak averaging).
        """
        with torch.no_grad():
            for ga_main, ga_targ in zip(self.ga.parameters(), self.ga_.parameters()):
                ga_targ.data.mul_(self._polyak)
                ga_targ.data.add_((1 - self._polyak) * ga_main.data)
            if self.use_lyapunov:
                for lc_main, lc_targ in zip(
                    self.lc.parameters(), self.lc_.parameters()
                ):
                    lc_targ.data.mul_(self._polyak)
                    lc_targ.data.add_((1 - self._polyak) * lc_main.data)
            else:
                for q_1_main, q_1_targ in zip(
                    self.q_1.parameters(), self.q_1_.parameters()
                ):
                    q_1_targ.data.mul_(self._polyak)
                    q_1_targ.data.add_((1 - self._polyak) * q_1_main.data)
                for q_2_main, q_2_targ in zip(
                    self.q_2.parameters(), self.q_2_.parameters()
                ):
                    q_2_targ.data.mul_(self._polyak)
                    q_2_targ.data.add_((1 - self._polyak) * q_2_main.data)

    @property
    def alpha(self):
        """Property used to clip alpha to be equal to or bigger than 0.0 to prevent it
        from becoming nan when log_alpha becomes -inf. For alpha no upper bound is
        used.
        """
        return torch.clamp(self.log_alpha.exp(), *SCALE_ALPHA_MIN_MAX)

    @property
    def labda(self):
        """Property used to clip lambda to be equal to or bigger than 0.0 in order to
        prevent it from becoming nan when log_labda becomes -inf. Further we clip it
        to be lower than or equal to 1.0 in order to prevent lambda from exploding
        when the hyperparameters are chosen badly.
        """
        return torch.clamp(self.log_labda.exp(), *SCALE_LAMBDA_MIN_MAX)

    @property
    def act_limits(self):
        """The action limits used for scaling the actions returned by the Gaussian
        policy.
        """
        return self._act_limits

    @act_limits.setter
    def act_limits(self, act_limits):
        """Sets the action limits that are used for scaling the actions that are
        returned from the Gaussian policy.
        """
        # Validate input
        missing_keys = [key for key in ["low", "high"] if key not in act_limits.keys()]
        if missing_keys:
            warn_string = "WARN: act_limits could not be set as {} not found.".format(
                f"keys {missing_keys} were"
                if len(missing_keys) > 1
                else f"key {missing_keys} was"
            )
            print(colorize(warn_string, "yellow"))
        invalid_length = [
            key for key, val in act_limits.items() if len(val) != self._a_dim
        ]
        if invalid_length:
            warn_string = (
                f"WARN: act_limits could not be set as the length of {invalid_length} "
                + "{}".format("were" if len(invalid_length) > 1 else "was")
                + f" unequal to the dimension of the action space (dim={self._a_dim})."
            )
            print(colorize(warn_string, "yellow"))

        # Set action limits
        self._act_limits = {"low": act_limits["low"], "high": act_limits["high"]}
        self.ga.act_limits = self._act_limits