def gradients(self, optimizer, loss):
    if self.config["grad_norm_clipping"] is not None:
        actor_grads_and_vars = minimize_and_clip(
            self._actor_optimizer,
            self.actor_loss,
            var_list=self.policy_vars,
            clip_val=self.config["grad_norm_clipping"])
        critic_grads_and_vars = minimize_and_clip(
            self._critic_optimizer,
            self.critic_loss,
            var_list=self.q_func_vars + self.twin_q_func_vars
            if self.config["twin_q"] else self.q_func_vars,
            clip_val=self.config["grad_norm_clipping"])
    else:
        actor_grads_and_vars = self._actor_optimizer.compute_gradients(
            self.actor_loss, var_list=self.policy_vars)
        if self.config["twin_q"]:
            critic_vars = self.q_func_vars + self.twin_q_func_vars
        else:
            critic_vars = self.q_func_vars
        critic_grads_and_vars = self._critic_optimizer.compute_gradients(
            self.critic_loss, var_list=critic_vars)
    # Save these for later use in build_apply_op.
    self._actor_grads_and_vars = [(g, v) for (g, v) in actor_grads_and_vars
                                  if g is not None]
    self._critic_grads_and_vars = [(g, v) for (g, v) in critic_grads_and_vars
                                   if g is not None]
    grads_and_vars = (self._actor_grads_and_vars +
                      self._critic_grads_and_vars)
    return grads_and_vars

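# Every variant in this file delegates clipped gradient computation to a
# `minimize_and_clip` helper whose definition is not shown. Below is a
# minimal sketch of what the call sites assume it does: compute the
# grads-and-vars list, then clip each gradient individually by L2 norm.
# The default clip_val and the per-gradient tf.clip_by_norm are
# assumptions, not taken from this source.
import tensorflow as tf


def minimize_and_clip(optimizer, objective, var_list, clip_val=10.0):
    # Like optimizer.compute_gradients(), but with each non-None gradient
    # clipped to at most `clip_val` in L2 norm.
    grads_and_vars = optimizer.compute_gradients(objective, var_list=var_list)
    return [(tf.clip_by_norm(g, clip_val) if g is not None else None, v)
            for (g, v) in grads_and_vars]
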
def gradients(policy, optimizer, loss):
    if policy.config["grad_norm_clipping"] is not None:
        actor_grads_and_vars = minimize_and_clip(
            policy._actor_optimizer,
            policy.actor_loss,
            var_list=policy.model.policy_variables(),
            clip_val=policy.config["grad_norm_clipping"])
        critic_grads_and_vars = minimize_and_clip(
            policy._critic_optimizer,
            policy.critic_loss,
            var_list=policy.model.q_variables(),
            clip_val=policy.config["grad_norm_clipping"])
        alpha_grads_and_vars = minimize_and_clip(
            policy._alpha_optimizer,
            policy.alpha_loss,
            var_list=policy.model.alpha,
            clip_val=policy.config["grad_norm_clipping"])
    else:
        actor_grads_and_vars = policy._actor_optimizer.compute_gradients(
            policy.actor_loss, var_list=policy.model.policy_variables())
        critic_grads_and_vars = policy._critic_optimizer.compute_gradients(
            policy.critic_loss, var_list=policy.model.q_variables())
        alpha_grads_and_vars = policy._alpha_optimizer.compute_gradients(
            policy.alpha_loss, var_list=policy.model.alpha)
    # Save these for later use in build_apply_op.
    policy._actor_grads_and_vars = [(g, v) for (g, v) in actor_grads_and_vars
                                    if g is not None]
    policy._critic_grads_and_vars = [(g, v)
                                     for (g, v) in critic_grads_and_vars
                                     if g is not None]
    policy._alpha_grads_and_vars = [(g, v) for (g, v) in alpha_grads_and_vars
                                    if g is not None]
    grads_and_vars = (policy._actor_grads_and_vars +
                      policy._critic_grads_and_vars +
                      policy._alpha_grads_and_vars)
    return grads_and_vars

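# The `policy._*_grads_and_vars` lists cached above are consumed when the
# apply op is built. A minimal sketch of a matching counterpart, assuming
# one optimizer per loss as in the function above; the function name and
# the tf.group() grouping are illustrative, not taken from this source.
import tensorflow as tf


def apply_gradients(policy, optimizer, grads_and_vars):
    actor_apply_op = policy._actor_optimizer.apply_gradients(
        policy._actor_grads_and_vars)
    critic_apply_op = policy._critic_optimizer.apply_gradients(
        policy._critic_grads_and_vars)
    alpha_apply_op = policy._alpha_optimizer.apply_gradients(
        policy._alpha_grads_and_vars)
    # Run all three updates together as a single training op.
    return tf.group(actor_apply_op, critic_apply_op, alpha_apply_op)
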
def clip_gradients(policy: Policy, optimizer: "tf.keras.optimizers.Optimizer",
                   loss: TensorType) -> ModelGradients:
    if not hasattr(policy, "q_func_vars"):
        policy.q_func_vars = policy.model.variables()
    return minimize_and_clip(
        optimizer,
        loss,
        var_list=policy.q_func_vars,
        clip_val=policy.config["grad_clip"])

def clip_gradients(policy, optimizer, loss):
    if policy.config["grad_clip"] is not None:
        grads_and_vars = minimize_and_clip(
            optimizer,
            loss,
            var_list=policy.q_func_vars,
            clip_val=policy.config["grad_clip"])
    else:
        grads_and_vars = optimizer.compute_gradients(
            loss, var_list=policy.q_func_vars)
    grads_and_vars = [(g, v) for (g, v) in grads_and_vars if g is not None]
    return grads_and_vars

def clip_gradients(policy: Policy, optimizer: "tf.keras.optimizers.Optimizer",
                   loss: TensorType) -> ModelGradients:
    if policy.config["grad_clip"] is not None:
        grads_and_vars = minimize_and_clip(
            optimizer,
            loss,
            var_list=policy.q_func_vars,
            clip_val=policy.config["grad_clip"])
    else:
        grads_and_vars = optimizer.compute_gradients(
            loss, var_list=policy.q_func_vars)
    grads_and_vars = [(g, v) for (g, v) in grads_and_vars if g is not None]
    return grads_and_vars

def clip_gradients(policy: Policy, optimizer: "tf.keras.optimizers.Optimizer",
                   loss: TensorType) -> ModelGradients:
    return minimize_and_clip(
        optimizer,
        loss,
        var_list=policy.q_func_vars,
        clip_val=policy.config["grad_clip"])

def dice_sac_gradient(policy, optimizer, loss):
    if policy.config["grad_norm_clipping"] is not None:
        actor_grads_and_vars = minimize_and_clip(
            optimizer,
            policy.actor_loss,
            var_list=policy.model.policy_variables(),
            clip_val=policy.config["grad_norm_clipping"])
        if policy.config["twin_q"]:
            q_variables = policy.model.q_variables()
            half_cutoff = len(q_variables) // 2
            critic_grads_and_vars = []
            critic_grads_and_vars += minimize_and_clip(
                optimizer,
                policy.critic_loss[0],
                var_list=q_variables[:half_cutoff],
                clip_val=policy.config["grad_norm_clipping"])
            critic_grads_and_vars += minimize_and_clip(
                optimizer,
                policy.critic_loss[1],
                var_list=q_variables[half_cutoff:],
                clip_val=policy.config["grad_norm_clipping"])
        else:
            critic_grads_and_vars = minimize_and_clip(
                optimizer,
                policy.critic_loss[0],
                var_list=policy.model.q_variables(),
                clip_val=policy.config["grad_norm_clipping"])
        alpha_grads_and_vars = minimize_and_clip(
            optimizer,
            policy.alpha_loss,
            var_list=[policy.model.log_alpha],
            clip_val=policy.config["grad_norm_clipping"])
    else:
        actor_grads_and_vars = policy._actor_optimizer.compute_gradients(
            policy.actor_loss, var_list=policy.model.policy_variables())
        if policy.config["twin_q"]:
            q_variables = policy.model.q_variables()
            half_cutoff = len(q_variables) // 2
            base_q_optimizer, twin_q_optimizer = policy._critic_optimizer
            critic_grads_and_vars = base_q_optimizer.compute_gradients(
                policy.critic_loss[0], var_list=q_variables[:half_cutoff]
            ) + twin_q_optimizer.compute_gradients(
                policy.critic_loss[1], var_list=q_variables[half_cutoff:])
        else:
            critic_grads_and_vars = policy._critic_optimizer[
                0].compute_gradients(
                    policy.critic_loss[0],
                    var_list=policy.model.q_variables())
        alpha_grads_and_vars = policy._alpha_optimizer.compute_gradients(
            policy.alpha_loss, var_list=[policy.model.log_alpha])

    # Compute the diversity gradients.
    if policy.config["grad_norm_clipping"] is not None:
        diversity_actor_grads_and_vars = minimize_and_clip(
            optimizer,
            policy.diversity_actor_loss,
            var_list=policy.model.policy_variables(),
            clip_val=policy.config["grad_norm_clipping"])
        diversity_critic_grads_and_vars = minimize_and_clip(
            optimizer,
            policy.diversity_critic_loss[0],
            var_list=policy.model.diversity_q_variables(),
            clip_val=policy.config["grad_norm_clipping"])
    else:
        diversity_actor_grads_and_vars = (
            policy._actor_optimizer.compute_gradients(
                policy.diversity_actor_loss,
                var_list=policy.model.policy_variables()))
        diversity_critic_grads_and_vars = policy._critic_optimizer[
            0].compute_gradients(
                policy.diversity_critic_loss[0],
                var_list=policy.model.diversity_q_variables())

    policy_grad = actor_grads_and_vars
    diversity_grad = diversity_actor_grads_and_vars

    return_gradients = {}
    policy_grad_flatten = []
    policy_grad_info = []
    diversity_grad_flatten = []
    diversity_grad_info = []

    # First, flatten the task gradient and the diversity gradient into two
    # vectors.
    for (pg, var), (ng, var2) in zip(policy_grad, diversity_grad):
        assert var == var2
        if pg is None:
            return_gradients[var] = (ng, var2)
            continue
        if ng is None:
            return_gradients[var] = (pg, var)
            continue
        pg_flat, pg_shape, pg_flat_shape = _flatten(pg)
        policy_grad_flatten.append(pg_flat)
        policy_grad_info.append((pg_flat_shape, pg_shape, var))
        ng_flat, ng_shape, ng_flat_shape = _flatten(ng)
        diversity_grad_flatten.append(ng_flat)
        diversity_grad_info.append((ng_flat_shape, ng_shape))

    policy_grad_flatten = tf.concat(policy_grad_flatten, 0)
    diversity_grad_flatten = tf.concat(diversity_grad_flatten, 0)

    # Second, normalize the two gradients.
    policy_grad_norm = tf.linalg.l2_normalize(policy_grad_flatten)
    diversity_grad_norm = tf.linalg.l2_normalize(diversity_grad_flatten)

    # Third, compute the bisector.
    final_grad = tf.linalg.l2_normalize(policy_grad_norm +
                                        diversity_grad_norm)

    # Fourth, compute the length of the final gradient.
    pg_length = tf.norm(tf.multiply(policy_grad_flatten, final_grad))
    ng_length = tf.norm(tf.multiply(diversity_grad_flatten, final_grad))
    if policy.config[CLIP_DIVERSITY_GRADIENT]:
        ng_length = tf.minimum(pg_length, ng_length)
    tg_length = (pg_length + ng_length) / 2
    final_grad = final_grad * tg_length

    # Add some stats.
    policy.gradient_cosine_similarity = tf.reduce_sum(
        tf.multiply(policy_grad_norm, diversity_grad_norm))
    policy.policy_grad_norm = tf.norm(policy_grad_flatten)
    policy.diversity_grad_norm = tf.norm(diversity_grad_flatten)

    # Fifth, split the flat vector back into the original shapes to form
    # the final gradients.
    count = 0
    for flat_shape, org_shape, var in policy_grad_info:
        assert flat_shape is not None
        size = flat_shape.as_list()[0]
        grad = final_grad[count:count + size]
        return_gradients[var] = (tf.reshape(grad, org_shape), var)
        count += size

    if policy.config["grad_clip"] is not None:
        ret_grads = [return_gradients[var][0] for _, var in policy_grad]
        clipped_grads, _ = tf.clip_by_global_norm(
            ret_grads, policy.config["grad_clip"])
        actor_grads_and_vars_fused = [
            (g, return_gradients[var][1])
            for g, (_, var) in zip(clipped_grads, policy_grad)
        ]
    else:
        actor_grads_and_vars_fused = [
            return_gradients[var] for _, var in policy_grad
        ]

    # Save these for later use in build_apply_op.
    policy._actor_grads_and_vars = [
        (g, v) for (g, v) in actor_grads_and_vars_fused if g is not None
    ]
    policy._critic_grads_and_vars = [
        (g, v) for (g, v) in critic_grads_and_vars if g is not None
    ]
    policy._diversity_critic_grads_and_vars = [
        (g, v) for (g, v) in diversity_critic_grads_and_vars
        if g is not None
    ]
    policy._alpha_grads_and_vars = [
        (g, v) for (g, v) in alpha_grads_and_vars if g is not None
    ]
    grads_and_vars = (
        policy._actor_grads_and_vars + policy._critic_grads_and_vars +
        policy._diversity_critic_grads_and_vars +
        policy._alpha_grads_and_vars)
    return grads_and_vars

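# The fusion logic above relies on a `_flatten` helper that is not shown.
# A minimal sketch reconstructed from its call sites: it must return the
# flat tensor, the original shape (later passed to tf.reshape), and the
# flat shape (whose first dimension is read via .as_list()[0]). This
# reconstruction is an assumption.
import tensorflow as tf


def _flatten(tensor):
    # Flatten `tensor` to 1-D and keep both shapes so the caller can
    # split and reshape the fused gradient back into its original form.
    flat = tf.reshape(tensor, shape=[-1])
    return flat, tensor.shape, flat.shape
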
def gradients(policy, optimizer, loss):
    if policy.config["grad_norm_clipping"]:
        actor_grads_and_vars = minimize_and_clip(
            optimizer,
            policy.actor_loss,
            var_list=policy.model.policy_variables(),
            clip_val=policy.config["grad_norm_clipping"])
        if policy.config["twin_q"]:
            q_variables = policy.model.q_variables()
            half_cutoff = len(q_variables) // 2
            critic_grads_and_vars = []
            critic_grads_and_vars += minimize_and_clip(
                optimizer,
                policy.critic_loss[0],
                var_list=q_variables[:half_cutoff],
                clip_val=policy.config["grad_norm_clipping"])
            critic_grads_and_vars += minimize_and_clip(
                optimizer,
                policy.critic_loss[1],
                var_list=q_variables[half_cutoff:],
                clip_val=policy.config["grad_norm_clipping"])
        else:
            critic_grads_and_vars = minimize_and_clip(
                optimizer,
                policy.critic_loss[0],
                var_list=policy.model.q_variables(),
                clip_val=policy.config["grad_norm_clipping"])
        alpha_grads_and_vars = minimize_and_clip(
            optimizer,
            policy.alpha_loss,
            var_list=[policy.model.log_alpha],
            clip_val=policy.config["grad_norm_clipping"])
    else:
        actor_grads_and_vars = policy._actor_optimizer.compute_gradients(
            policy.actor_loss, var_list=policy.model.policy_variables())
        if policy.config["twin_q"]:
            q_variables = policy.model.q_variables()
            half_cutoff = len(q_variables) // 2
            base_q_optimizer, twin_q_optimizer = policy._critic_optimizer
            critic_grads_and_vars = base_q_optimizer.compute_gradients(
                policy.critic_loss[0], var_list=q_variables[:half_cutoff]
            ) + twin_q_optimizer.compute_gradients(
                policy.critic_loss[1], var_list=q_variables[half_cutoff:])
        else:
            critic_grads_and_vars = policy._critic_optimizer[
                0].compute_gradients(
                    policy.critic_loss[0],
                    var_list=policy.model.q_variables())
        alpha_grads_and_vars = policy._alpha_optimizer.compute_gradients(
            policy.alpha_loss, var_list=[policy.model.log_alpha])
    # Save these for later use in build_apply_op.
    policy._actor_grads_and_vars = [(g, v) for (g, v) in actor_grads_and_vars
                                    if g is not None]
    policy._critic_grads_and_vars = [(g, v)
                                     for (g, v) in critic_grads_and_vars
                                     if g is not None]
    policy._alpha_grads_and_vars = [(g, v) for (g, v) in alpha_grads_and_vars
                                    if g is not None]
    grads_and_vars = (policy._actor_grads_and_vars +
                      policy._critic_grads_and_vars +
                      policy._alpha_grads_and_vars)
    return grads_and_vars