Example #1
  def _train(self, experience, weights):
    with tf.GradientTape() as tape:
      loss_info = self._loss(
          experience,
          td_errors_loss_fn=self._td_errors_loss_fn,
          gamma=self._gamma,
          reward_scale_factor=self._reward_scale_factor,
          weights=weights)
    tf.debugging.check_numerics(loss_info[0], 'Loss is inf or nan')
    variables_to_train = self._q_network.trainable_weights
    non_trainable_weights = self._q_network.non_trainable_weights
    assert list(variables_to_train), "No variables in the agent's q_network."
    grads = tape.gradient(loss_info.loss, variables_to_train)
    # Materialize zip into a list; in py3 zip is a lazy iterator consumed once.
    grads_and_vars = list(zip(grads, variables_to_train))
    if self._gradient_clipping is not None:
      grads_and_vars = eager_utils.clip_gradient_norms(grads_and_vars,
                                                       self._gradient_clipping)

    if self._summarize_grads_and_vars:
      grads_and_vars_with_non_trainable = (
          grads_and_vars + [(None, v) for v in non_trainable_weights])
      eager_utils.add_variables_summaries(grads_and_vars_with_non_trainable,
                                          self.train_step_counter)
      eager_utils.add_gradients_summaries(grads_and_vars,
                                          self.train_step_counter)
    training.apply_gradients(
        self._optimizer, grads_and_vars, global_step=self.train_step_counter)

    self._update_target()

    return loss_info
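
The pattern in Example #1 is the standard eager training step: compute the loss inside a tf.GradientTape, verify it is finite, optionally clip the gradients, then apply them with the optimizer. The sketch below reproduces that skeleton in plain TensorFlow with a generic Keras model and optimizer; the names are illustrative rather than TF-Agents internals, the per-gradient clipping only approximates eager_utils.clip_gradient_norms, and the target-network update is omitted.

import tensorflow as tf

# Illustrative stand-ins for the agent's network, optimizer, and loss.
model = tf.keras.Sequential([tf.keras.layers.Dense(4, activation='relu'),
                             tf.keras.layers.Dense(1)])
optimizer = tf.keras.optimizers.Adam(1e-3)
loss_fn = tf.keras.losses.MeanSquaredError()

def train_step(x, y, clip_norm=None):
    with tf.GradientTape() as tape:
        loss = loss_fn(y, model(x, training=True))
    tf.debugging.check_numerics(loss, 'Loss is inf or nan')
    variables = model.trainable_weights
    grads = tape.gradient(loss, variables)
    if clip_norm is not None:
        # Clip each gradient's norm, roughly what clip_gradient_norms does.
        grads = [tf.clip_by_norm(g, clip_norm) for g in grads]
    optimizer.apply_gradients(zip(grads, variables))
    return loss

loss = train_step(tf.random.uniform([8, 4]), tf.random.uniform([8, 1]),
                  clip_norm=1.0)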
Example #2
 def train_single_net(self, net, individual_iql_time_step,
                      individual_iql_next_time_step, time_steps, actions,
                      next_time_steps, i, t):
     variables_to_train = net.agent._q_network.trainable_weights
     assert list(
         variables_to_train), "No variables in the agent's QMIX network."
     with tf.GradientTape(watch_accessed_variables=False) as tape:
         tape.watch(variables_to_train)
         loss_info = self._loss(
             net,
             individual_iql_time_step,
             individual_iql_next_time_step,
             time_steps,
             actions,
             next_time_steps,
             i,
             t,
             td_errors_loss_fn=net.agent._td_errors_loss_fn,
             gamma=net.agent._gamma,
             training=True)
     tf.debugging.check_numerics(loss_info.loss, 'Loss is inf or nan')
     grads = tape.gradient(loss_info.loss, variables_to_train)
     grads_and_vars = list(zip(grads, variables_to_train))
     training_lib.apply_gradients(net.agent._optimizer,
                                  grads_and_vars,
                                  global_step=net.agent.train_step_counter)
     net.agent._update_target()
     return loss_info
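
Example #2 opens the tape with watch_accessed_variables=False and then watches one network's weights explicitly, so only that agent's Q-network receives gradients even if other variables are read while computing the loss. A small stand-alone illustration of that mechanism, using hypothetical Dense layers in place of the per-agent networks:

import tensorflow as tf

net_a = tf.keras.layers.Dense(3)   # the network we want to update
net_b = tf.keras.layers.Dense(3)   # a second network that should stay frozen
x = tf.ones([2, 5])
net_a.build(x.shape)
net_b.build(x.shape)

with tf.GradientTape(watch_accessed_variables=False, persistent=True) as tape:
    tape.watch(net_a.trainable_weights)
    loss = tf.reduce_sum(net_a(x)) + tf.reduce_sum(net_b(x))

grads_a = tape.gradient(loss, net_a.trainable_weights)  # real gradients
grads_b = tape.gradient(loss, net_b.trainable_weights)  # [None, None]: never watched
del tape  # persistent tapes should be released explicitly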
Example #3
    def _train(self, experience, weights):
        rewards, _ = nest_utils.flatten_multi_batched_nested_tensors(
            experience.reward, self._time_step_spec.reward)
        actions, _ = nest_utils.flatten_multi_batched_nested_tensors(
            experience.action, self._action_spec)
        observations, _ = nest_utils.flatten_multi_batched_nested_tensors(
            experience.observation, self.training_data_spec.observation)
        if self._observation_and_action_constraint_splitter is not None:
            observations, _ = self._observation_and_action_constraint_splitter(
                observations)
        if self._accepts_per_arm_features:
            # The arm observation we train on needs to be copied from the respective
            # policy info field to the per arm observation field. Pretending there was
            # only one action, we fill the action field with zeros.
            chosen_action, _ = nest_utils.flatten_multi_batched_nested_tensors(
                experience.policy_info.chosen_arm_features,
                self.policy.info_spec.chosen_arm_features)
            observations[
                bandit_spec_utils.PER_ARM_FEATURE_KEY] = tf.expand_dims(
                    chosen_action, axis=1)
            actions = tf.zeros_like(actions)

        with tf.GradientTape() as tape:
            loss_info = self.loss(observations,
                                  actions,
                                  rewards,
                                  weights=weights,
                                  training=True)

        self.compute_summaries(loss_info.loss)
        variables_to_train = self._reward_network.trainable_weights
        if not variables_to_train:
            logging.info('No variable to train in the agent.')
            return loss_info

        grads = tape.gradient(loss_info.loss, variables_to_train)
        # Tuple is used for py3, where zip is a generator producing values once.
        grads_and_vars = tuple(zip(grads, variables_to_train))
        if self._gradient_clipping is not None:
            grads_and_vars = eager_utils.clip_gradient_norms(
                grads_and_vars, self._gradient_clipping)

        if self._summarize_grads_and_vars:
            eager_utils.add_variables_summaries(grads_and_vars,
                                                self.train_step_counter)
            eager_utils.add_gradients_summaries(grads_and_vars,
                                                self.train_step_counter)

        training_lib.apply_gradients(self._optimizer,
                                     grads_and_vars,
                                     global_step=self.train_step_counter)

        return loss_info
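
Before the loss is computed, Examples #3 and #6 collapse the outer [batch, time] trajectory dimensions into a single batch dimension. The helper below is a rough single-tensor stand-in for what nest_utils.flatten_multi_batched_nested_tensors does; the real utility also validates shapes against a spec and handles arbitrary nests, and the names here are purely illustrative.

import tensorflow as tf

def flatten_outer_dims(tensor, num_outer_dims=2):
    # Merge the leading dims, e.g. [B, T, ...] -> [B * T, ...].
    shape = tf.shape(tensor)
    flat_shape = tf.concat([[-1], shape[num_outer_dims:]], axis=0)
    return tf.reshape(tensor, flat_shape)

observations = tf.random.uniform([4, 3, 8])   # batch=4, time=3, feature=8
rewards = tf.random.uniform([4, 3])
flat_obs = flatten_outer_dims(observations)   # shape [12, 8]
flat_rewards = flatten_outer_dims(rewards)    # shape [12]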
Example #4
    def compute_loss_using_reward_layer(self,
                                        observation,
                                        action,
                                        reward,
                                        weights,
                                        training=False):
        """Computes loss using the reward layer.

    Args:
      observation: A batch of observations.
      action: A batch of actions.
      reward: A batch of rewards.
      weights: Optional scalar or elementwise (per-batch-entry) importance
        weights.  The output batch loss will be scaled by these weights, and
        the final scalar loss is the mean of these values.
      training: Whether the loss is being used for training.

    Returns:
      loss: A `LossInfo` containing the loss for the training step.
    """
        # Update the neural network params.
        with tf.GradientTape() as tape:
            loss_info = self.loss(observation,
                                  action,
                                  reward,
                                  weights,
                                  training=training)
        tf.debugging.check_numerics(loss_info[0], 'Loss is inf or nan')
        if self._summarize_grads_and_vars:
            self.compute_summaries(loss_info.loss)
        variables_to_train = (self._encoding_network.trainable_weights +
                              self._reward_layer.trainable_weights)
        if not variables_to_train:
            raise ValueError('No variable to train in the agent.')

        grads = tape.gradient(loss_info.loss, variables_to_train)
        grads_and_vars = tuple(zip(grads, variables_to_train))
        if self._gradient_clipping is not None:
            grads_and_vars = eager_utils.clip_gradient_norms(
                grads_and_vars, self._gradient_clipping)

        if self._summarize_grads_and_vars:
            with tf.name_scope('Reward_network/'):
                eager_utils.add_variables_summaries(grads_and_vars,
                                                    self.train_step_counter)
                eager_utils.add_gradients_summaries(grads_and_vars,
                                                    self.train_step_counter)

        training_lib.apply_gradients(self._optimizer,
                                     grads_and_vars,
                                     global_step=self.train_step_counter)
        return loss_info
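
The weights argument documented above scales each per-example loss before the batch is reduced to the scalar stored in LossInfo. A minimal sketch of that convention with a hypothetical squared-error reward loss:

import tensorflow as tf

def weighted_reward_loss(predicted, reward, weights=None):
    # Per-example squared error, optionally scaled by importance weights;
    # the reported scalar is the mean of the scaled values.
    per_example = tf.math.squared_difference(predicted, reward)
    if weights is not None:
        per_example *= weights
    return tf.reduce_mean(per_example)

predicted = tf.constant([1.0, 2.0, 3.0])
reward = tf.constant([1.5, 2.0, 2.0])
weights = tf.constant([1.0, 0.5, 2.0])
loss = weighted_reward_loss(predicted, reward, weights)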
Example #5
 def train(self, experience, agents, nameDict, networkDict):
     """QMIX - get the Q values from the target network and main network of all the agents"""
     time_steps, policy_steps, next_time_steps = (
         trajectory.experience_to_transitions(experience,
                                              squeeze_time_dim=True))
     variables_to_train = getTrainableVariables(networkDict)
     variables_to_train.append(self.QMIXNet.trainable_weights)
     variables_to_train = tf.nest.flatten(variables_to_train)
     assert list(
         variables_to_train), "No variables in the agent's QMIX network."
     with tf.GradientTape(watch_accessed_variables=False) as tape:
         tape.watch(variables_to_train)
         loss_info = self._loss(time_steps,
                                policy_steps,
                                next_time_steps,
                                agents,
                                nameDict,
                                networkDict,
                                td_errors_loss_fn=self._td_errors_loss_fn,
                                gamma=self._gamma,
                                training=True)
     tf.debugging.check_numerics(loss_info.loss, 'Loss is inf or nan')
     grads = tape.gradient(loss_info.loss, variables_to_train)
     grads_and_vars = list(zip(grads, variables_to_train))
     training_lib.apply_gradients(self._optimizer,
                                  grads_and_vars,
                                  global_step=self.train_step_counter)
     self._update_target()
     return loss_info
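
Unlike the single-agent updates above, Example #5 backpropagates one joint loss through every agent network plus the QMIX mixing network, so all of their variables are first gathered into a single flat list. A self-contained sketch of that pattern with hypothetical Dense stand-ins for the networks:

import tensorflow as tf

agent_nets = [tf.keras.layers.Dense(2) for _ in range(3)]   # per-agent Q-nets
mixing_net = tf.keras.layers.Dense(1)                       # mixing network
obs = tf.ones([4, 6])
for net in agent_nets:
    net.build(obs.shape)
mixing_net.build([4, 2 * len(agent_nets)])

# Gather every network's weights into one flat list, as Example #5 does with
# getTrainableVariables followed by tf.nest.flatten.
variables_to_train = tf.nest.flatten(
    [net.trainable_weights for net in agent_nets] +
    [mixing_net.trainable_weights])

with tf.GradientTape(watch_accessed_variables=False) as tape:
    tape.watch(variables_to_train)
    agent_qs = tf.concat([net(obs) for net in agent_nets], axis=-1)
    loss = tf.reduce_mean(tf.square(mixing_net(agent_qs)))   # toy joint loss

grads = tape.gradient(loss, variables_to_train)
tf.keras.optimizers.SGD(0.1).apply_gradients(zip(grads, variables_to_train))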
Example #6
    def _train(self, experience, weights):
        rewards, _ = nest_utils.flatten_multi_batched_nested_tensors(
            experience.reward, self._time_step_spec.reward)
        actions, _ = nest_utils.flatten_multi_batched_nested_tensors(
            experience.action, self._action_spec)
        observations, _ = nest_utils.flatten_multi_batched_nested_tensors(
            experience.observation, self._time_step_spec.observation)
        if self._observation_and_action_constraint_splitter is not None:
            observations, _ = self._observation_and_action_constraint_splitter(
                observations)

        with tf.GradientTape() as tape:
            loss_info = self.loss(observations,
                                  actions,
                                  rewards,
                                  weights=weights,
                                  training=True)
        tf.debugging.check_numerics(loss_info[0], 'Loss is inf or nan')
        self.compute_summaries(loss_info.loss)
        variables_to_train = self._reward_network.trainable_weights
        if not variables_to_train:
            logging.info('No variable to train in the agent.')
            return loss_info

        grads = tape.gradient(loss_info.loss, variables_to_train)
        # Tuple is used for py3, where zip is a generator producing values once.
        grads_and_vars = tuple(zip(grads, variables_to_train))
        if self._gradient_clipping is not None:
            grads_and_vars = eager_utils.clip_gradient_norms(
                grads_and_vars, self._gradient_clipping)

        if self._summarize_grads_and_vars:
            eager_utils.add_variables_summaries(grads_and_vars,
                                                self.train_step_counter)
            eager_utils.add_gradients_summaries(grads_and_vars,
                                                self.train_step_counter)

        training_lib.apply_gradients(self._optimizer,
                                     grads_and_vars,
                                     global_step=self.train_step_counter)

        return loss_info
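
When _summarize_grads_and_vars is enabled, the examples above record variable and gradient summaries keyed by train_step_counter. The snippet below is a rough stand-in for eager_utils.add_variables_summaries and add_gradients_summaries using the plain tf.summary API (the real TF-Agents helpers are more elaborate); the writer path and names are illustrative.

import tensorflow as tf

writer = tf.summary.create_file_writer('/tmp/train_summaries')
step = tf.Variable(0, dtype=tf.int64)

def summarize_grads_and_vars(grads_and_vars):
    # Histogram of each variable and, when present, its gradient.
    with writer.as_default():
        for grad, var in grads_and_vars:
            name = var.name.replace(':', '_')
            tf.summary.histogram(name + '/value', var, step=step)
            if grad is not None:
                tf.summary.histogram(name + '/gradient', grad, step=step)

v = tf.Variable(tf.ones([3]), name='example_var')
summarize_grads_and_vars([(tf.ones([3]), v)])
step.assign_add(1)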