def _setup_deterministic_optimizer(self, action, scope=None):
    """Create the loss and optimizer of a deterministic policy."""
    scope_name = 'model/pi/'
    if scope is not None:
        scope_name = scope + '/' + scope_name

    if self.verbose >= 2:
        print('setting up optimizer')
        print_params_shape(scope_name, "policy")

    # Choose the loss function.
    if self.use_huber:
        loss_fn = tf.compat.v1.losses.huber_loss
    else:
        loss_fn = tf.compat.v1.losses.mean_squared_error

    # Define the loss function.
    self.loss = loss_fn(action, self.policy)

    # Create an optimizer object.
    optimizer = tf.compat.v1.train.AdamOptimizer(self.learning_rate)

    # Create the optimizer operation.
    self.optimizer = optimizer.minimize(
        loss=self.loss,
        var_list=get_trainable_vars(scope_name))
def _setup_critic_optimizer(self, scope):
    """Create minimization operations for the critic Q-functions.

    Create a `tf.optimizer.minimize` operation for updating the critic
    Q-functions with gradient descent. See Equations (5, 6) in [1] for
    further information on the Q-function update rule.
    """
    scope_name = 'model/value_fns'
    if scope is not None:
        scope_name = scope + '/' + scope_name

    if self.verbose >= 2:
        print('setting up critic optimizer')
        for name in ['qf1', 'qf2', 'vf']:
            scope_i = '{}/{}'.format(scope_name, name)
            print_params_shape(scope_i, name)

    # Take the min of the two Q-values (Double-Q Learning).
    min_qf_pi = tf.minimum(self.qf1_pi, self.qf2_pi)

    # Target for Q-value regression.
    q_backup = tf.stop_gradient(
        self.rew_ph +
        (1 - self.terminals1) * self.gamma * self.value_target)

    # Choose the loss function.
    if self.use_huber:
        loss_fn = tf.compat.v1.losses.huber_loss
    else:
        loss_fn = tf.compat.v1.losses.mean_squared_error

    # Compute the Q-function losses.
    qf1_loss = loss_fn(q_backup, self.qf1)
    qf2_loss = loss_fn(q_backup, self.qf2)

    # Target for value function regression.
    # We update the vf towards the min of two Q-functions in order to
    # reduce overestimation bias from function approximation error.
    v_backup = tf.stop_gradient(min_qf_pi - self.alpha * self.logp_pi)
    value_loss = loss_fn(self.value_fn, v_backup)

    self.critic_loss = (qf1_loss, qf2_loss, value_loss)

    # Combine the loss functions for the optimizer.
    critic_loss = qf1_loss + qf2_loss + value_loss

    # Create the critic train op.
    critic_optimizer = tf.compat.v1.train.AdamOptimizer(self.critic_lr)
    self.critic_optimizer = critic_optimizer.minimize(
        critic_loss,
        var_list=get_trainable_vars(scope_name))
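# ------------------------------------------------------------------------- #
# Minimal NumPy sketch of the two regression targets built above, assuming
# batched arrays of rewards, done flags, target values, twin Q-values,
# log-probabilities, and a fixed temperature. The function name and
# arguments below are illustrative only and are not part of the class.
# ------------------------------------------------------------------------- #
import numpy as np


def sac_critic_targets(rewards, dones, value_target, qf1_pi, qf2_pi,
                       logp_pi, alpha=0.2, gamma=0.99):
    """Return (q_backup, v_backup) for a batch of transitions."""
    # Bootstrapped target shared by the two Q-functions.
    q_backup = rewards + (1. - dones) * gamma * value_target
    # Value target: min of the twin Q-values minus the entropy term.
    v_backup = np.minimum(qf1_pi, qf2_pi) - alpha * logp_pi
    return q_backup, v_backup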
def _setup_actor_optimizer(self, scope): """Create the actor loss, gradient, and optimizer.""" scope_name = 'model/pi/' if scope is not None: scope_name = scope + '/' + scope_name if self.verbose >= 2: print('setting up actor optimizer') print_params_shape(scope_name, "actor") # compute the actor loss self.actor_loss = -tf.reduce_mean(self.critic_with_actor_tf[0]) # create an optimizer object optimizer = tf.compat.v1.train.AdamOptimizer(self.actor_lr) self.actor_optimizer = optimizer.minimize( self.actor_loss, var_list=get_trainable_vars(scope_name))
def _setup_stochastic_optimizer(self, scope):
    """Create the loss and optimizer of a stochastic policy."""
    scope_name = 'model/pi/'
    if scope is not None:
        scope_name = scope + '/' + scope_name

    if self.verbose >= 2:
        print('setting up optimizer')
        print_params_shape(scope_name, "policy")

    # Define the loss function.
    self.loss = -tf.reduce_mean(self.logp_ac)

    # Create an optimizer object.
    optimizer = tf.compat.v1.train.AdamOptimizer(self.learning_rate)

    # Create the optimizer operation.
    self.optimizer = optimizer.minimize(
        loss=self.loss,
        var_list=get_trainable_vars(scope_name))
def _setup_actor_optimizer(self, scope): """Create minimization operations for policy and entropy. Creates a `tf.optimizer.minimize` operations for updating policy and entropy with gradient descent. See Section 4.2 in [1], for further information of the policy update, and Section 5 in [1] for further information of the entropy update. """ scope_name = 'model/pi/' if scope is not None: scope_name = scope + '/' + scope_name if self.verbose >= 2: print('setting up actor and alpha optimizers') print_params_shape(scope_name, "actor") # Take the min of the two Q-Values (Double-Q Learning) min_qf_pi = tf.minimum(self.qf1_pi, self.qf2_pi) # Compute the entropy temperature loss. self.alpha_loss = -tf.reduce_mean( self.log_alpha * tf.stop_gradient(self.logp_pi + self.target_entropy)) alpha_optimizer = tf.compat.v1.train.AdamOptimizer(self.actor_lr) self.alpha_optimizer = alpha_optimizer.minimize( self.alpha_loss, var_list=self.log_alpha) # Compute the policy loss self.actor_loss = tf.reduce_mean(self.alpha * self.logp_pi - min_qf_pi) # Add a regularization penalty. self.actor_loss += self._l2_loss(self.l2_penalty, scope_name) # Policy train op (has to be separate from value train op, because # min_qf_pi appears in policy_loss) actor_optimizer = tf.compat.v1.train.AdamOptimizer(self.actor_lr) self.actor_optimizer = actor_optimizer.minimize( self.actor_loss, var_list=get_trainable_vars(scope_name))
def _setup_critic_optimizer(self, critic_target, scope):
    """Create the critic loss, gradient, and optimizer."""
    if self.verbose >= 2:
        print('setting up critic optimizer')

    # Compute the target critic term.
    with tf.compat.v1.variable_scope("loss", reuse=False):
        q_obs1 = tf.minimum(critic_target[0], critic_target[1])
        target_q = tf.stop_gradient(
            self.rew_ph + (1. - self.terminals1) * self.gamma * q_obs1)

        tf.compat.v1.summary.scalar(
            'critic_target', tf.reduce_mean(target_q))

    # Choose the loss function.
    if self.use_huber:
        loss_fn = tf.compat.v1.losses.huber_loss
    else:
        loss_fn = tf.compat.v1.losses.mean_squared_error

    self.critic_loss = [loss_fn(q, target_q) for q in self.critic_tf]

    self.critic_optimizer = []

    for i, critic_loss in enumerate(self.critic_loss):
        scope_name = 'model/qf_{}/'.format(i)
        if scope is not None:
            scope_name = scope + '/' + scope_name

        if self.verbose >= 2:
            print_params_shape(scope_name, "critic {}".format(i))

        # Create an optimizer object.
        optimizer = tf.compat.v1.train.AdamOptimizer(self.critic_lr)

        # Create the optimizer operation.
        self.critic_optimizer.append(
            optimizer.minimize(
                loss=critic_loss,
                var_list=get_trainable_vars(scope_name)))
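# ------------------------------------------------------------------------- #
# Hedged NumPy sketch of the bootstrapped target above, assuming the twin
# target-critic outputs are already available as arrays. The function name
# and arguments are illustrative and not part of the class.
# ------------------------------------------------------------------------- #
import numpy as np


def clipped_double_q_target(rewards, dones, target_q1, target_q2,
                            gamma=0.99):
    """Return the regression target shared by both critics."""
    # Clipped double-Q: take the minimum of the two target critics.
    q_obs1 = np.minimum(target_q1, target_q2)
    return rewards + (1. - dones) * gamma * q_obs1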
def _setup_optimizers(self, scope):
    """Create the actor and critic optimizers."""
    scope_name = 'model/'
    old_scope_name = "oldpi/"
    if scope is not None:
        scope_name = scope + '/' + scope_name
        old_scope_name = scope + '/' + old_scope_name

    if self.verbose >= 2:
        print('setting up actor optimizer')
        print_params_shape("{}pi/".format(scope_name), "actor")
        print('setting up critic optimizer')
        print_params_shape("{}vf/".format(scope_name), "critic")

    # =================================================================== #
    # Create the policy loss and optimizers.                              #
    # =================================================================== #

    with tf.compat.v1.variable_scope("loss", reuse=False):
        # Compute the KL divergence between the old and new policies.
        kloldnew = tf.reduce_sum(
            self.pi_logstd - self.old_pi_logstd +
            (tf.square(self.old_pi_std) +
             tf.square(self.old_pi_mean - self.pi_mean))
            / (2.0 * tf.square(self.pi_std)) - 0.5,
            axis=-1)
        meankl = tf.reduce_mean(kloldnew)

        # Compute the entropy bonus.
        entropy = tf.reduce_sum(
            self.pi_logstd + .5 * np.log(2.0 * np.pi * np.e),
            axis=-1)
        meanent = tf.reduce_mean(entropy)
        entbonus = self.ent_coef * meanent

        # advantage * pnew / pold
        ratio = tf.exp(
            self.logp(self.action_ph, old=False) -
            self.logp(self.action_ph, old=True))
        surrgain = tf.reduce_mean(ratio * self.advs_ph)

        optimgain = surrgain + entbonus
        self.losses = [optimgain, meankl, entbonus, surrgain, meanent]

        all_var_list = get_trainable_vars(scope_name)
        var_list = [
            v for v in all_var_list
            if "/vf" not in v.name and "/q/" not in v.name]
        vf_var_list = [
            v for v in all_var_list
            if "/pi" not in v.name and "/logstd" not in v.name]

        self.get_flat = GetFlat(var_list, sess=self.sess)
        self.set_from_flat = SetFromFlat(var_list, sess=self.sess)

        # Split the flat tangent vector back into per-variable tensors.
        klgrads = tf.gradients(meankl, var_list)
        shapes = [var.get_shape().as_list() for var in var_list]
        start = 0
        tangents = []
        for shape in shapes:
            var_size = int(np.prod(shape))
            tangents.append(
                tf.reshape(self.flat_tangent[start:start + var_size],
                           shape))
            start += var_size
        gvp = tf.add_n([
            tf.reduce_sum(grad * tangent)
            for (grad, tangent) in zip(klgrads, tangents)])

        # Fisher vector products.
        self.fvp = flatgrad(gvp, var_list)

    # =================================================================== #
    # Update the old model to match the new one.                          #
    # =================================================================== #

    self.assign_old_eq_new = tf.group(*[
        tf.compat.v1.assign(oldv, newv) for (oldv, newv) in
        zip(get_globals_vars(old_scope_name),
            get_globals_vars(scope_name))])

    # =================================================================== #
    # Create the value function optimizer.                                #
    # =================================================================== #

    vferr = tf.reduce_mean(tf.square(self.value_flat - self.ret_ph))
    optimizer = tf.compat.v1.train.AdamOptimizer(self.vf_stepsize)
    self.vf_optimizer = optimizer.minimize(
        vferr,
        var_list=vf_var_list,
    )

    # Initialize the model parameters and optimizers.
    with self.sess.as_default():
        self.sess.run(tf.compat.v1.global_variables_initializer())

    th_init = self.get_flat()
    self.set_from_flat(th_init)

    self.grad = flatgrad(optimgain, var_list)
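# ------------------------------------------------------------------------- #
# Hedged NumPy sketch of how the flat tangent vector is split back into
# per-variable tensors before forming the Fisher-vector product above. The
# helper name and arguments are illustrative only.
# ------------------------------------------------------------------------- #
import numpy as np


def unflatten_tangent(flat_tangent, shapes):
    """Split a flat vector into arrays matching the given shapes."""
    tangents, start = [], 0
    for shape in shapes:
        size = int(np.prod(shape))
        tangents.append(flat_tangent[start:start + size].reshape(shape))
        start += size
    return tangents


# Example: two variables of shape (3, 2) and (2,) consume 8 entries.
# unflatten_tangent(np.arange(8.), [(3, 2), (2,)])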
def _setup_optimizers(self, scope):
    """Create the actor and critic optimizers."""
    scope_name = 'model/'
    if scope is not None:
        scope_name = scope + '/' + scope_name

    if self.verbose >= 2:
        print('setting up actor optimizer')
        print_params_shape("{}pi/".format(scope_name), "actor")
        print('setting up critic optimizer')
        print_params_shape("{}vf/".format(scope_name), "critic")

    neglogpac = self._neglogp(self.action_ph)
    self.entropy = tf.reduce_sum(
        tf.reshape(self.pi_logstd, [-1]) +
        .5 * np.log(2.0 * np.pi * np.e),
        axis=-1)

    # Value function clipping: not present in the original PPO.
    if self.cliprange_vf is None:
        # Default behavior (legacy from OpenAI baselines): use the same
        # clipping as for the policy.
        self.cliprange_vf = self.cliprange

    if self.cliprange_vf < 0:
        # Original PPO implementation: no value function clipping.
        vpred_clipped = self.value_flat
    else:
        # Clip the difference between the old and new value predictions.
        # NOTE: this depends on the reward scaling.
        vpred_clipped = self.old_vpred_ph + tf.clip_by_value(
            self.value_flat - self.old_vpred_ph,
            -self.cliprange_vf, self.cliprange_vf)

    vf_losses1 = tf.square(self.value_flat - self.rew_ph)
    vf_losses2 = tf.square(vpred_clipped - self.rew_ph)
    self.vf_loss = .5 * tf.reduce_mean(tf.maximum(vf_losses1, vf_losses2))

    ratio = tf.exp(self.old_neglog_pac_ph - neglogpac)
    pg_losses = -self.advs_ph * ratio
    pg_losses2 = -self.advs_ph * tf.clip_by_value(
        ratio, 1.0 - self.cliprange, 1.0 + self.cliprange)
    self.pg_loss = tf.reduce_mean(tf.maximum(pg_losses, pg_losses2))
    self.approxkl = .5 * tf.reduce_mean(
        tf.square(neglogpac - self.old_neglog_pac_ph))
    self.clipfrac = tf.reduce_mean(tf.cast(
        tf.greater(tf.abs(ratio - 1.0), self.cliprange), tf.float32))

    self.loss = self.pg_loss - self.entropy * self.ent_coef \
        + self.vf_loss * self.vf_coef

    # Compute the gradients of the loss.
    var_list = get_trainable_vars(scope_name)
    grads = tf.gradients(self.loss, var_list)

    # Perform gradient clipping if requested.
    if self.max_grad_norm is not None:
        grads, _grad_norm = tf.clip_by_global_norm(
            grads, self.max_grad_norm)
    grads = list(zip(grads, var_list))

    # Create the operation that applies the gradients.
    self.optimizer = tf.compat.v1.train.AdamOptimizer(
        learning_rate=self.learning_rate,
        epsilon=1e-5,
    ).apply_gradients(grads)
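# ------------------------------------------------------------------------- #
# Hedged NumPy sketch of the clipped surrogate objective above. The function
# name and arguments are illustrative and not part of the class.
# ------------------------------------------------------------------------- #
import numpy as np


def ppo_clipped_pg_loss(old_neglogp, neglogp, advantages, cliprange=0.2):
    """Return the clipped policy-gradient loss for a batch."""
    # Importance ratio pi_new(a|s) / pi_old(a|s).
    ratio = np.exp(old_neglogp - neglogp)
    # Unclipped and clipped surrogate losses; keep the pessimistic one.
    pg_losses = -advantages * ratio
    pg_losses2 = -advantages * np.clip(
        ratio, 1.0 - cliprange, 1.0 + cliprange)
    return float(np.mean(np.maximum(pg_losses, pg_losses2)))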