def _v_train(self, batch, weights=None):
    vf_variables = self._vf.trainable_variables
    with tf.GradientTape(watch_accessed_variables=False) as tape:
        assert vf_variables, 'No vf variables to optimize.'
        tape.watch(vf_variables)
        vf_loss = self.value_function_loss(
            batch['observations'], batch['annealing'], weights=weights)
    tf.debugging.check_numerics(vf_loss, 'vf loss is inf or nan.')
    vf_grads = tape.gradient(vf_loss, vf_variables)
    tf_utils.apply_gradients(vf_grads, vf_variables, self._vf_optimizer,
                             self._gradient_clipping)
    return vf_loss
def _p_train(self, batch, weights=None):
    actor_variables = self._policy.trainable_variables
    with tf.GradientTape(watch_accessed_variables=False) as tape:
        assert actor_variables, 'No actor variables to optimize.'
        tape.watch(actor_variables)
        actor_loss = self.policy_loss(
            batch['observations'], batch['annealing'], weights=weights)
    tf.debugging.check_numerics(actor_loss, 'Actor loss is inf or nan.')
    actor_grads = tape.gradient(actor_loss, actor_variables)
    tf_utils.apply_gradients(actor_grads, actor_variables,
                             self._policy_optimizer, self._gradient_clipping)
    return actor_loss
def _q_train(self, batch, weights=None):
    for qf, qf_optimizer in zip(self._qfs, self._qfs_optimizers):
        qf_variables = qf.trainable_variables
        with tf.GradientTape(watch_accessed_variables=False) as tape:
            assert qf_variables, 'No qf variables to optimize.'
            tape.watch(qf_variables)
            qf_loss = self.q_function_loss(
                qf,
                batch['observations'],
                batch['actions'],
                batch['rewards'],
                batch['next_observations'],
                batch['terminals'],
                weights=weights)
        tf.debugging.check_numerics(qf_loss, 'qf loss is inf or nan.')
        qf_grads = tape.gradient(qf_loss, qf_variables)
        tf_utils.apply_gradients(qf_grads, qf_variables, qf_optimizer,
                                 self._gradient_clipping)
    # Only the loss of the last Q-function in the ensemble is returned.
    return qf_loss
def _prior_train(self, batch):
    prior_variables = self._prior.trainable_variables
    with tf.GradientTape(watch_accessed_variables=False) as tape:
        assert prior_variables, 'No prior variables to optimize.'
        tape.watch(prior_variables)
        prior_loss = self.prior_loss(batch['recent_observations'],
                                     batch['recent_opponent_actions'])
    tf.debugging.check_numerics(prior_loss, 'prior loss is inf or nan.')
    prior_grads = tape.gradient(prior_loss, prior_variables)
    tf_utils.apply_gradients(prior_grads, prior_variables,
                             self._opponent_prior_optimizer,
                             self._gradient_clipping)
    return prior_loss
def _critic_train(self, batch, weights=None):
    critic_variables = self._qf.trainable_variables
    with tf.GradientTape(watch_accessed_variables=False) as tape:
        assert critic_variables, 'No qf variables to optimize.'
        tape.watch(critic_variables)
        critic_loss = self.critic_loss(
            batch['observations'],
            batch['actions'],
            batch['rewards'],
            batch['next_observations'],
            weights=weights)
    tf.debugging.check_numerics(critic_loss, 'qf loss is inf or nan.')
    critic_grads = tape.gradient(critic_loss, critic_variables)
    tf_utils.apply_gradients(critic_grads, critic_variables,
                             self._qf_optimizer, self._gradient_clipping)
    return critic_loss
def _opponent_train(self, batch):
    opponent_policy_variables = self._opponent_policy.trainable_variables
    with tf.GradientTape(watch_accessed_variables=False) as tape:
        assert opponent_policy_variables, 'No opponent policy variables to optimize.'
        tape.watch(opponent_policy_variables)
        opponent_policy_loss = self.opponent_policy_loss(
            batch['observations'], batch['annealing'])
    tf.debugging.check_numerics(opponent_policy_loss,
                                'opponent policy loss is inf or nan.')
    opponent_policy_grads = tape.gradient(opponent_policy_loss,
                                          opponent_policy_variables)
    tf_utils.apply_gradients(opponent_policy_grads, opponent_policy_variables,
                             self._opponent_policy_optimizer,
                             self._gradient_clipping)
    return opponent_policy_loss
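# NOTE: every training step above and below hands its gradients to
# tf_utils.apply_gradients rather than calling the optimizer directly. That
# helper is defined outside this section; the sketch below is only an
# assumption about its behavior (global-norm clipping when a clipping value
# is given, using the module-level `import tensorflow as tf`), not the
# project's actual implementation.
def apply_gradients(grads, variables, optimizer, gradient_clipping=None):
    """Hypothetical sketch: optionally clip gradients, then apply them."""
    if gradient_clipping is not None:
        # Clip jointly by global norm across all gradient tensors.
        grads, _ = tf.clip_by_global_norm(grads, gradient_clipping)
    optimizer.apply_gradients(zip(grads, variables))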
def _actor_train(self, batch, weights=None):
    actor_variables = self._policy.trainable_variables
    with tf.GradientTape(watch_accessed_variables=False) as tape:
        assert actor_variables, "No actor variables to optimize."
        tape.watch(actor_variables)
        actor_loss = self.actor_loss(batch["observations"],
                                     batch["opponent_actions"],
                                     weights=weights)
    tf.debugging.check_numerics(actor_loss, "Actor loss is inf or nan.")
    actor_grads = tape.gradient(actor_loss, actor_variables)
    tf_utils.apply_gradients(
        actor_grads,
        actor_variables,
        self._policy_optimizer,
        self._gradient_clipping,
    )
    return actor_loss
def _critic_train(self, batch, weights=None):
    critic_variables = self._qf.trainable_variables
    with tf.GradientTape(watch_accessed_variables=False) as tape:
        assert critic_variables, "No qf variables to optimize."
        tape.watch(critic_variables)
        critic_loss = self.critic_loss(
            batch["observations"],
            batch["actions"],
            batch["opponent_actions"],
            batch["rewards"],
            batch["next_observations"],
            batch["terminals"],
            batch["annealing"],
            weights=weights,
        )
    tf.debugging.check_numerics(critic_loss, "qf loss is inf or nan.")
    critic_grads = tape.gradient(critic_loss, critic_variables)
    tf_utils.apply_gradients(
        critic_grads,
        critic_variables,
        self._critic_optimizer,
        self._gradient_clipping,
    )
    return critic_loss
def _train(self, batch, weights=None):
    opponent_policy_variables = self._opponent_policy.trainable_variables
    with tf.GradientTape(watch_accessed_variables=False) as tape:
        assert opponent_policy_variables, 'No opponent policy variables to optimize.'
        tape.watch(opponent_policy_variables)
        opponent_policy_loss = self.opponent_loss(
            batch['recent_observations'], batch['recent_opponent_actions'])
    tf.debugging.check_numerics(opponent_policy_loss,
                                'opponent policy loss is inf or nan.')
    opponent_policy_grads = tape.gradient(opponent_policy_loss,
                                          opponent_policy_variables)
    tf_utils.apply_gradients(opponent_policy_grads, opponent_policy_variables,
                             self._opponent_policy_optimizer,
                             self._gradient_clipping)

    critic_variables = self._qf.trainable_variables
    with tf.GradientTape(watch_accessed_variables=False) as tape:
        assert critic_variables, 'No qf variables to optimize.'
        tape.watch(critic_variables)
        critic_loss = self.critic_loss(
            batch['observations'],
            batch['actions'],
            batch['opponent_actions'],
            batch['rewards'],
            batch['next_observations'],
            weights=weights)
    tf.debugging.check_numerics(critic_loss, 'qf loss is inf or nan.')
    critic_grads = tape.gradient(critic_loss, critic_variables)
    tf_utils.apply_gradients(critic_grads, critic_variables,
                             self._qf_optimizer, self._gradient_clipping)

    actor_variables = self._policy.trainable_variables
    with tf.GradientTape(watch_accessed_variables=False) as tape:
        assert actor_variables, 'No actor variables to optimize.'
        tape.watch(actor_variables)
        actor_loss = self.actor_loss(batch['observations'],
                                     batch['opponent_actions'],
                                     weights=weights)
    tf.debugging.check_numerics(actor_loss, 'Actor loss is inf or nan.')
    actor_grads = tape.gradient(actor_loss, actor_variables)
    tf_utils.apply_gradients(actor_grads, actor_variables,
                             self._policy_optimizer, self._gradient_clipping)

    self._train_step += 1
    if self._train_step % self._target_update_period == 0:
        self._update_target()

    losses = {
        'pg_loss': actor_loss.numpy(),
        'critic_loss': critic_loss.numpy(),
        'opponent_policy_loss': opponent_policy_loss.numpy(),
    }
    return losses
def _train(self, batch, weights=None):
    opponent_policy_variables = self._opponent_policy.trainable_variables
    with tf.GradientTape(watch_accessed_variables=False) as tape:
        assert opponent_policy_variables, "No opponent policy variables to optimize."
        tape.watch(opponent_policy_variables)
        opponent_policy_loss = self.opponent_policy_loss(
            batch["recent_observations"],
            batch["recent_actions"],
            batch["recent_opponent_actions"],
        )
    tf.debugging.check_numerics(
        opponent_policy_loss, "opponent policy loss is inf or nan."
    )
    opponent_policy_grads = tape.gradient(
        opponent_policy_loss, opponent_policy_variables
    )
    tf_utils.apply_gradients(
        opponent_policy_grads,
        opponent_policy_variables,
        self._opponent_policy_optimizer,
        self._gradient_clipping,
    )

    critic_variables = self._qf.trainable_variables
    with tf.GradientTape(watch_accessed_variables=False) as tape:
        assert critic_variables, "No qf variables to optimize."
        tape.watch(critic_variables)
        critic_loss = self.critic_loss(
            batch["observations"],
            batch["actions"],
            batch["opponent_actions"],
            batch["rewards"],
            batch["next_observations"],
            batch["terminals"],
            batch["annealing"],
            weights=weights,
        )
    tf.debugging.check_numerics(critic_loss, "qf loss is inf or nan.")
    critic_grads = tape.gradient(critic_loss, critic_variables)
    tf_utils.apply_gradients(
        critic_grads,
        critic_variables,
        self._critic_optimizer,
        self._gradient_clipping,
    )

    actor_variables = self._policy.trainable_variables
    with tf.GradientTape(watch_accessed_variables=False) as tape:
        assert actor_variables, "No actor variables to optimize."
        tape.watch(actor_variables)
        actor_loss = self.actor_loss(
            batch["observations"], batch["annealing"], weights=weights
        )
    tf.debugging.check_numerics(actor_loss, "Actor loss is inf or nan.")
    actor_grads = tape.gradient(actor_loss, actor_variables)
    tf_utils.apply_gradients(
        actor_grads, actor_variables, self._actor_optimizer, self._gradient_clipping
    )

    self._train_step += 1
    if self._train_step % self._target_update_period == 0:
        self._update_target()

    losses = {
        "pg_loss": actor_loss.numpy(),
        "critic_loss": critic_loss.numpy(),
        "opponent_policy_loss": opponent_policy_loss.numpy(),
    }
    return losses
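# NOTE: both _train variants above call self._update_target() every
# _target_update_period steps, but the target update itself is not part of
# this section. The sketch below is a guess at the usual pattern (Polyak
# averaging of online weights into a target Q-network); the attribute names
# self._target_qf and self._tau are hypothetical, not taken from the source.
def _update_target(self):
    """Hypothetical sketch: soft-update target Q-network weights."""
    tau = getattr(self, '_tau', 1.0)  # tau == 1.0 amounts to a hard copy.
    for online_var, target_var in zip(self._qf.variables,
                                      self._target_qf.variables):
        target_var.assign(tau * online_var + (1.0 - tau) * target_var)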