Example #1
 def _v_train(self, batch, weights=None):
     vf_variables = self._vf.trainable_variables
     with tf.GradientTape(watch_accessed_variables=False) as tape:
         assert vf_variables, 'No vf variables to optimize.'
         tape.watch(vf_variables)
         vf_loss = self.value_function_loss(batch['observations'], batch['annealing'], weights=weights)
     tf.debugging.check_numerics(vf_loss, 'vf loss is inf or nan.')
     vf_grads = tape.gradient(vf_loss, vf_variables)
     tf_utils.apply_gradients(vf_grads, vf_variables, self._vf_optimizer, self._gradient_clipping)
     return vf_loss
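
The tf_utils.apply_gradients helper called in each of these examples is not shown here. A minimal sketch consistent with its call sites (gradients, variables, optimizer, gradient-clipping value), assuming clipping by global norm, might look like the following; the real helper may differ:

 import tensorflow as tf

 def apply_gradients(gradients, variables, optimizer, gradient_clipping=None):
     # Hypothetical stand-in for tf_utils.apply_gradients, sketched from its call sites.
     if gradient_clipping is not None and gradient_clipping > 0:
         # Clip by global norm across all gradients before the optimizer step.
         gradients, _ = tf.clip_by_global_norm(gradients, gradient_clipping)
     optimizer.apply_gradients(zip(gradients, variables))
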
Example #2
 def _p_train(self, batch, weights=None):
     actor_variables = self._policy.trainable_variables
     with tf.GradientTape(watch_accessed_variables=False) as tape:
         assert actor_variables, 'No actor variables to optimize.'
         tape.watch(actor_variables)
         actor_loss = self.policy_loss(batch['observations'], batch['annealing'], weights=weights)
     tf.debugging.check_numerics(actor_loss, 'Actor loss is inf or nan.')
     actor_grads = tape.gradient(actor_loss, actor_variables)
     tf_utils.apply_gradients(actor_grads, actor_variables, self._policy_optimizer, self._gradient_clipping)
     return actor_loss
Example #3
 def _q_train(self, batch, weights=None):
     for qf, qf_optimizer in zip(self._qfs, self._qfs_optimizers):
         qf_variables = qf.trainable_variables
         with tf.GradientTape(watch_accessed_variables=False) as tape:
             assert qf_variables, 'No qf variables to optimize.'
             tape.watch(qf_variables)
             qf_loss = self.q_function_loss(qf, batch['observations'], batch['actions'], batch['rewards'],
                                            batch['next_observations'], batch['terminals'], weights=weights)
         tf.debugging.check_numerics(qf_loss, 'qf loss is inf or nan.')
         qf_grads = tape.gradient(qf_loss, qf_variables)
         tf_utils.apply_gradients(qf_grads, qf_variables, qf_optimizer, self._gradient_clipping)
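     # Note: every Q-function in self._qfs is optimized in the loop above,
     # but only the last qf_loss is returned.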
     return qf_loss
Example #4
 def _prior_train(self, batch):
     prior_variable = self._prior.trainable_variables
     with tf.GradientTape(watch_accessed_variables=False) as tape:
         assert prior_variable, 'No prior variables to optimize.'
         tape.watch(prior_variable)
         prior_loss = self.prior_loss(batch['recent_observations'],
                                      batch['recent_opponent_actions'])
     tf.debugging.check_numerics(prior_loss, 'prior loss is inf or nan.')
     prior_grads = tape.gradient(prior_loss, prior_variable)
     tf_utils.apply_gradients(prior_grads, prior_variable,
                              self._opponent_prior_optimizer,
                              self._gradient_clipping)
     return prior_loss
Example #5
 def _critic_train(self, batch, weights=None):
     critic_variables = self._qf.trainable_variables
     with tf.GradientTape(watch_accessed_variables=False) as tape:
         assert critic_variables, 'No qf variables to optimize.'
         tape.watch(critic_variables)
         critic_loss = self.critic_loss(batch['observations'],
                                        batch['actions'],
                                        batch['rewards'],
                                        batch['next_observations'],
                                        weights=weights)
     tf.debugging.check_numerics(critic_loss, 'qf loss is inf or nan.')
     critic_grads = tape.gradient(critic_loss, critic_variables)
     tf_utils.apply_gradients(critic_grads, critic_variables,
                              self._qf_optimizer, self._gradient_clipping)
     return critic_loss
Example #6
 def _opponent_train(self, batch):
     opponent_policy_variable = self._opponent_policy.trainable_variables
     with tf.GradientTape(watch_accessed_variables=False) as tape:
         assert opponent_policy_variable, 'No opponent policy variables to optimize.'
         tape.watch(opponent_policy_variable)
         opponent_policy_loss = self.opponent_policy_loss(
             batch['observations'], batch['annealing'])
     tf.debugging.check_numerics(opponent_policy_loss,
                                 'opponent policy loss is inf or nan.')
     opponent_policy_grads = tape.gradient(opponent_policy_loss,
                                           opponent_policy_variable)
     tf_utils.apply_gradients(opponent_policy_grads,
                              opponent_policy_variable,
                              self._opponent_policy_optimizer,
                              self._gradient_clipping)
     return opponent_policy_loss
Example #7
 def _actor_train(self, batch, weights=None):
     actor_variables = self._policy.trainable_variables
     with tf.GradientTape(watch_accessed_variables=False) as tape:
         assert actor_variables, "No actor variables to optimize."
         tape.watch(actor_variables)
         actor_loss = self.actor_loss(batch["observations"],
                                      batch["opponent_actions"],
                                      weights=weights)
     tf.debugging.check_numerics(actor_loss, "Actor loss is inf or nan.")
     actor_grads = tape.gradient(actor_loss, actor_variables)
     tf_utils.apply_gradients(
         actor_grads,
         actor_variables,
         self._policy_optimizer,
         self._gradient_clipping,
     )
     return actor_loss
Example #8
 def _critic_train(self, batch, weights=None):
     critic_variables = self._qf.trainable_variables
     with tf.GradientTape(watch_accessed_variables=False) as tape:
         assert critic_variables, "No qf variables to optimize."
         tape.watch(critic_variables)
         critic_loss = self.critic_loss(
             batch["observations"],
             batch["actions"],
             batch["opponent_actions"],
             batch["rewards"],
             batch["next_observations"],
             batch["terminals"],
             batch["annealing"],
             weights=weights,
         )
     tf.debugging.check_numerics(critic_loss, "qf loss is inf or nan.")
     critic_grads = tape.gradient(critic_loss, critic_variables)
     tf_utils.apply_gradients(
         critic_grads,
         critic_variables,
         self._critic_optimizer,
         self._gradient_clipping,
     )
     return critic_loss
Example #9
    def _train(self, batch, weights=None):
        opponent_policy_variable = self._opponent_policy.trainable_variables
        with tf.GradientTape(watch_accessed_variables=False) as tape:
            assert opponent_policy_variable, 'No opponent policy variables to optimize.'
            tape.watch(opponent_policy_variable)
            opponent_policy_loss = self.opponent_loss(
                batch['recent_observations'], batch['recent_opponent_actions'])
        tf.debugging.check_numerics(opponent_policy_loss,
                                    'opponent policy loss is inf or nan.')
        opponent_policy_grads = tape.gradient(opponent_policy_loss,
                                              opponent_policy_variable)
        tf_utils.apply_gradients(opponent_policy_grads,
                                 opponent_policy_variable,
                                 self._opponent_policy_optimizer,
                                 self._gradient_clipping)

        critic_variables = self._qf.trainable_variables
        with tf.GradientTape(watch_accessed_variables=False) as tape:
            assert critic_variables, 'No qf variables to optimize.'
            tape.watch(critic_variables)
            critic_loss = self.critic_loss(batch['observations'],
                                           batch['actions'],
                                           batch['opponent_actions'],
                                           batch['rewards'],
                                           batch['next_observations'],
                                           weights=weights)
        tf.debugging.check_numerics(critic_loss, 'qf loss is inf or nan.')
        critic_grads = tape.gradient(critic_loss, critic_variables)
        tf_utils.apply_gradients(critic_grads, critic_variables,
                                 self._qf_optimizer, self._gradient_clipping)

        actor_variables = self._policy.trainable_variables
        with tf.GradientTape(watch_accessed_variables=False) as tape:
            assert actor_variables, 'No actor variables to optimize.'
            tape.watch(actor_variables)
            actor_loss = self.actor_loss(batch['observations'],
                                         batch['opponent_actions'],
                                         weights=weights)
        tf.debugging.check_numerics(actor_loss, 'Actor loss is inf or nan.')
        actor_grads = tape.gradient(actor_loss, actor_variables)
        tf_utils.apply_gradients(actor_grads, actor_variables,
                                 self._policy_optimizer,
                                 self._gradient_clipping)
        self._train_step += 1

        if self._train_step % self._target_update_period == 0:
            self._update_target()

        losses = {
            'pg_loss': actor_loss.numpy(),
            'critic_loss': critic_loss.numpy(),
            'opponent_policy_loss': opponent_policy_loss.numpy(),
        }

        return losses
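
Examples #9 and #10 call self._update_target() every self._target_update_period training steps, but the update itself is not shown. A common implementation, sketched here as an assumption, copies or Polyak-averages the online Q-network weights into the target network:

 def update_target(source_variables, target_variables, tau=1.0):
     # Hypothetical target update: tau=1.0 performs a hard copy every period,
     # while tau < 1.0 performs Polyak (soft) averaging.
     for source, target in zip(source_variables, target_variables):
         target.assign(tau * source + (1.0 - tau) * target)
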
Example #10
    def _train(self, batch, weights=None):

        opponent_policy_variable = self._opponent_policy.trainable_variables
        with tf.GradientTape(watch_accessed_variables=False) as tape:
            assert opponent_policy_variable, "No opponent policy variables to optimize."
            tape.watch(opponent_policy_variable)
            opponent_policy_loss = self.opponent_policy_loss(
                batch["recent_observations"],
                batch["recent_actions"],
                batch["recent_opponent_actions"],
            )
        tf.debugging.check_numerics(
            opponent_policy_loss, "opponent policy loss is inf or nan."
        )
        opponent_policy_grads = tape.gradient(
            opponent_policy_loss, opponent_policy_variable
        )
        tf_utils.apply_gradients(
            opponent_policy_grads,
            opponent_policy_variable,
            self._opponent_policy_optimizer,
            self._gradient_clipping,
        )

        critic_variables = self._qf.trainable_variables
        with tf.GradientTape(watch_accessed_variables=False) as tape:
            assert critic_variables, "No qf variables to optimize."
            tape.watch(critic_variables)
            critic_loss = self.critic_loss(
                batch["observations"],
                batch["actions"],
                batch["opponent_actions"],
                batch["rewards"],
                batch["next_observations"],
                batch["terminals"],
                batch["annealing"],
                weights=weights,
            )
        tf.debugging.check_numerics(critic_loss, "qf loss is inf or nan.")
        critic_grads = tape.gradient(critic_loss, critic_variables)
        tf_utils.apply_gradients(
            critic_grads,
            critic_variables,
            self._critic_optimizer,
            self._gradient_clipping,
        )

        actor_variables = self._policy.trainable_variables
        with tf.GradientTape(watch_accessed_variables=False) as tape:
            assert actor_variables, "No actor variables to optimize."
            tape.watch(actor_variables)
            actor_loss = self.actor_loss(
                batch["observations"], batch["annealing"], weights=weights
            )
        tf.debugging.check_numerics(actor_loss, "Actor loss is inf or nan.")
        actor_grads = tape.gradient(actor_loss, actor_variables)
        tf_utils.apply_gradients(
            actor_grads, actor_variables, self._actor_optimizer, self._gradient_clipping
        )
        self._train_step += 1

        if self._train_step % self._target_update_period == 0:
            self._update_target()

        losses = {
            "pg_loss": actor_loss.numpy(),
            "critic_loss": critic_loss.numpy(),
            "opponent_policy_loss": opponent_policy_loss.numpy(),
        }

        return losses
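
All ten examples follow the same tape pattern: disable automatic variable watching, explicitly watch only the module being optimized, assert the variable list is non-empty, compute the loss inside the tape, check the loss for inf/NaN, and apply (clipped) gradients. A self-contained toy version of that pattern, using a plain Dense layer and Adam as stand-ins for the networks and optimizers above, runs as follows:

 import tensorflow as tf

 # Toy stand-ins for the networks, data batch, and optimizer used above.
 model = tf.keras.layers.Dense(1)
 optimizer = tf.keras.optimizers.Adam(1e-3)
 x = tf.random.normal([32, 4])
 y = tf.random.normal([32, 1])
 model(x)  # build the layer so trainable_variables is populated

 def train_step(gradient_clipping=1.0):
     variables = model.trainable_variables
     with tf.GradientTape(watch_accessed_variables=False) as tape:
         assert variables, 'No variables to optimize.'
         tape.watch(variables)
         loss = tf.reduce_mean(tf.square(model(x) - y))
     tf.debugging.check_numerics(loss, 'loss is inf or nan.')
     grads = tape.gradient(loss, variables)
     grads, _ = tf.clip_by_global_norm(grads, gradient_clipping)
     optimizer.apply_gradients(zip(grads, variables))
     return loss

 print(float(train_step()))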