def testProcessExperienceGlobalFeatures(self):
     observation_spec = {
         'f1': tf.TensorSpec(shape=(5, ), dtype=tf.string),
         'f2': tf.TensorSpec(shape=(5, 2), dtype=tf.int32)
     }
     time_step_spec = time_step.time_step_spec(observation_spec)
     training_data_spec = trajectory.Trajectory(
         step_type=time_step_spec.step_type,
         observation=time_step_spec.observation,
         action=tensor_spec.BoundedTensorSpec(shape=(),
                                              minimum=0,
                                              maximum=4,
                                              dtype=tf.int32),
         policy_info=(),
         next_step_type=time_step_spec.step_type,
         reward=tensor_spec.BoundedTensorSpec(shape=(),
                                              minimum=0,
                                              maximum=2,
                                              dtype=tf.float32),
         discount=time_step_spec.discount)
     experience = tensor_spec.sample_spec_nest(training_data_spec,
                                               outer_dims=(7, 2))
     observation, action, reward = utils.process_experience_for_neural_agents(
         experience, False, training_data_spec)
     self.assertAllEqual(observation['f1'][0],
                         experience.observation['f1'][0, 0])
     self.assertEqual(action[0], experience.action[0, 0])
     self.assertEqual(reward[0], experience.reward[0, 0])
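
# Standalone sketch (plain TensorFlow, not the tf_agents implementation) of the
# [batch, time] -> [batch * time] flattening that process_experience_for_neural_agents
# performs on the trajectory, which is why reward[0] above lines up with
# experience.reward[0, 0].
import tensorflow as tf

reward = tf.random.uniform((7, 2))       # same outer_dims=(7, 2) as the sampled experience
flat_reward = tf.reshape(reward, [-1])   # shape [14]: the time dimension folds into the batch
assert flat_reward.shape == (14,)
assert flat_reward[0] == reward[0, 0]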
Example #2
    def _train(self, experience, weights):
        (observations, actions,
         rewards) = bandit_utils.process_experience_for_neural_agents(
             experience, self._observation_and_action_constraint_splitter,
             self._accepts_per_arm_features, self.training_data_spec)

        with tf.GradientTape() as tape:
            loss_info = self.loss(observations,
                                  actions,
                                  rewards,
                                  weights=weights,
                                  training=True)

        variables_to_train = self._variables_to_train()
        if not variables_to_train:
            logging.info('No variable to train in the agent.')
            return loss_info

        grads = tape.gradient(loss_info.loss, variables_to_train)
        # Materialize as a tuple: in Python 3, zip returns a single-use iterator.
        grads_and_vars = tuple(zip(grads, variables_to_train))
        if self._gradient_clipping is not None:
            grads_and_vars = eager_utils.clip_gradient_norms(
                grads_and_vars, self._gradient_clipping)

        if self._summarize_grads_and_vars:
            eager_utils.add_variables_summaries(grads_and_vars,
                                                self.train_step_counter)
            eager_utils.add_gradients_summaries(grads_and_vars,
                                                self.train_step_counter)

        self._optimizer.apply_gradients(grads_and_vars)
        self.train_step_counter.assign_add(1)

        return loss_info
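
# Minimal standalone sketch of the train-step pattern above (GradientTape ->
# gradients -> optional per-gradient norm clipping -> apply), with a plain Keras
# layer standing in for the agent's networks; the clip_norm value is illustrative,
# not taken from the source.
import tensorflow as tf

model = tf.keras.layers.Dense(1)
optimizer = tf.keras.optimizers.SGD(learning_rate=0.1)
x, y = tf.random.uniform((8, 4)), tf.random.uniform((8, 1))
clip_norm = 1.0

with tf.GradientTape() as tape:
    loss = tf.reduce_mean(tf.square(model(x, training=True) - y))

variables = model.trainable_variables
grads = tape.gradient(loss, variables)
# Per-gradient clipping, analogous in spirit to eager_utils.clip_gradient_norms.
clipped_grads = [tf.clip_by_norm(g, clip_norm) for g in grads]
optimizer.apply_gradients(zip(clipped_grads, variables))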
    def _loss(self,
              experience: types.NestedTensor,
              weights: Optional[types.Float] = None,
              training: bool = False) -> tf_agent.LossInfo:
        """Computes loss for training the reward and constraint networks.

        Args:
          experience: A batch of experience data in the form of a `Trajectory` or
            `Transition`.
          weights: Optional scalar or elementwise (per-batch-entry) importance
            weights.  The output batch loss will be scaled by these weights, and
            the final scalar loss is the mean of these values.
          training: Whether the loss is being used for training.

        Returns:
          loss: A `LossInfo` containing the loss for the training step.
        Raises:
          ValueError:
            If the number of actions is greater than 1.
        """
        (observations, actions,
         rewards) = bandit_utils.process_experience_for_neural_agents(
             experience, self._accepts_per_arm_features,
             self.training_data_spec)
        if self._observation_and_action_constraint_splitter is not None:
            observations, _ = self._observation_and_action_constraint_splitter(
                observations)

        if self._constraints:
            rewards_tensor = rewards[bandit_spec_utils.REWARD_SPEC_KEY]
        else:
            rewards_tensor = rewards
        reward_loss = self.reward_loss(observations, actions, rewards_tensor,
                                       weights, training)

        constraint_loss = tf.constant(0.0)
        for i, c in enumerate(self._constraints, 0):
            if self._time_step_spec.reward[
                    bandit_spec_utils.CONSTRAINTS_SPEC_KEY].shape.rank > 1:
                constraint_targets = rewards[
                    bandit_spec_utils.CONSTRAINTS_SPEC_KEY][:, i]
            else:
                constraint_targets = rewards[
                    bandit_spec_utils.CONSTRAINTS_SPEC_KEY]
            constraint_loss += c.compute_loss(observations, actions,
                                              constraint_targets, weights,
                                              training)

        self.compute_summaries(
            reward_loss,
            constraint_loss=(constraint_loss if self._constraints else None))

        total_loss = reward_loss
        if self._constraints:
            total_loss += constraint_loss
        return tf_agent.LossInfo(total_loss, extra=())
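
# Plain-TF sketch of the constraint-target selection above: with a rank-2
# constraints entry each constraint reads its own column, otherwise all
# constraints share the same vector. Plain string keys stand in for the
# bandit_spec_utils constants.
import tensorflow as tf

rewards = {
    'reward': tf.random.uniform((8,)),
    'constraint': tf.random.uniform((8, 3)),   # [batch_size, num_constraints]
}
constraint_targets = [
    rewards['constraint'][:, i] if rewards['constraint'].shape.rank > 1
    else rewards['constraint']
    for i in range(rewards['constraint'].shape[1])
]   # one target vector of shape [batch_size] per constraint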
  def _train(self, experience, weights=None):
    """Updates the policy based on the data in `experience`.

    Note that `experience` should only contain data points that this agent has
    not previously seen. If `experience` comes from a replay buffer, this buffer
    should be cleared between each call to `train`.

    Args:
      experience: A batch of experience data in the form of a `Trajectory`.
      weights: (optional) sample weights.

    Returns:
        A `LossInfo` containing the loss *before* the training step is taken.
        In most cases, if `weights` is provided, the entries of this tuple will
        have been calculated with the weights.  Note that each Agent chooses
        its own method of applying weights.
    """
    experience = self._as_trajectory(experience)

    (observation, action,
     reward) = bandit_utils.process_experience_for_neural_agents(
         experience, self._accepts_per_arm_features, self.training_data_spec)
    if self._observation_and_action_constraint_splitter is not None:
      observation, _ = self._observation_and_action_constraint_splitter(
          observation)
    reward = tf.cast(reward, self._dtype)

    if tf.distribute.has_strategy():
      if self._distributed_train_encoding_network:
        loss_info = self.compute_loss_using_reward_layer(
            observation, action, reward, weights, training=True)
      else:
        loss_info = self.compute_loss_using_linucb_distributed(
            observation, action, reward, weights, training=True)
      return loss_info

    tf.compat.v1.assign(
        self.actions_from_reward_layer,
        tf.less(self._train_step_counter,
                self._encoding_network_num_train_steps))

    def use_actions_from_reward_layer():
      return self.compute_loss_using_reward_layer(
          observation, action, reward, weights, training=True)

    def no_actions_from_reward_layer():
      return self.compute_loss_using_linucb(
          observation, action, reward, weights, training=True)

    loss_info = tf.cond(
        self.actions_from_reward_layer,
        use_actions_from_reward_layer,
        no_actions_from_reward_layer)
    return loss_info
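
# Standalone sketch of the switching logic above: a boolean variable records
# whether the encoding network is still being trained, and tf.cond picks the
# corresponding loss path; the constants stand in for the two compute_loss_* calls.
import tensorflow as tf

train_step_counter = tf.Variable(0, dtype=tf.int64)
encoding_network_num_train_steps = 100
actions_from_reward_layer = tf.Variable(True)

actions_from_reward_layer.assign(
    train_step_counter < encoding_network_num_train_steps)
loss = tf.cond(actions_from_reward_layer,
               lambda: tf.constant(1.0),   # stand-in for compute_loss_using_reward_layer
               lambda: tf.constant(2.0))   # stand-in for compute_loss_using_linucb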
    def _loss(self,
              experience: types.NestedTensor,
              weights: Optional[types.Tensor] = None,
              training: bool = False) -> tf_agent.LossInfo:
        """Computes loss for training the objective networks.

        Args:
          experience: A batch of experience data in the form of a `Trajectory` or
            `Transition`.
          weights: Optional scalar or elementwise (per-batch-entry) importance
            weights.  The output batch loss will be scaled by these weights, and the
            final scalar loss is the mean of these values.
          training: Whether the loss is being used for training.

        Returns:
          loss: A `LossInfo` containing the loss for the training step.
        Raises:
          ValueError:
            - If the number of actions is greater than 1.
            - If `objectives` is not rank-2.
            - If the number of columns in `objectives` does not equal
              `self._num_objectives`.
        """
        (observations, actions,
         objective_values) = bandit_utils.process_experience_for_neural_agents(
             experience, self._accepts_per_arm_features,
             self.training_data_spec)
        if self._observation_and_action_constraint_splitter is not None:
            observations, _ = self._observation_and_action_constraint_splitter(
                observations)
        if objective_values.shape.rank != 2:
            raise ValueError(
                'The objectives tensor should be rank-2 [batch_size, num_objectives],'
                ' but found to be rank-{}'.format(objective_values.shape.rank))
        if objective_values.shape[1] != self._num_objectives:
            raise ValueError(
                'The number of objectives in the objective_values tensor: {} '
                'is different from the number of objective networks: {}.'.
                format(objective_values.shape[1], self._num_objectives))

        objective_losses = []
        for idx in range(self._num_objectives):
            single_objective_values = objective_values[:, idx]
            objective_losses.append(
                self._single_objective_loss(idx, observations, actions,
                                            single_objective_values, weights,
                                            training))

        self.compute_summaries(objective_losses)
        total_loss = tf.reduce_sum(objective_losses)
        return tf_agent.LossInfo(total_loss, extra=())
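
# Standalone sketch of the per-objective loop above: slice one column per objective
# from a rank-2 [batch_size, num_objectives] tensor, score it, and sum the results;
# mse_against_zero is a hypothetical stand-in for self._single_objective_loss.
import tensorflow as tf

objective_values = tf.random.uniform((8, 3))   # rank-2, as the validation above requires
mse_against_zero = lambda values: tf.reduce_mean(tf.square(values))
objective_losses = [mse_against_zero(objective_values[:, idx])
                    for idx in range(objective_values.shape[1])]
total_loss = tf.reduce_sum(objective_losses)   # scalar total, as wrapped in LossInfo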
    def _train(self, experience: types.NestedTensor,
               weights: types.Tensor) -> tf_agent.LossInfo:
        (observations, actions,
         objective_values) = bandit_utils.process_experience_for_neural_agents(
             experience, self._accepts_per_arm_features,
             self.training_data_spec)
        if self._observation_and_action_constraint_splitter is not None:
            observations, _ = self._observation_and_action_constraint_splitter(
                observations)
        if objective_values.shape.rank != 2:
            raise ValueError(
                'The objectives tensor should be rank-2 [batch_size, num_objectives],'
                ' but found to be rank-{}'.format(objective_values.shape.rank))
        if objective_values.shape[1] != self._num_objectives:
            raise ValueError(
                'The number of objectives in the objective_values tensor: {} '
                'is different from the number of objective networks: {}.'.
                format(objective_values.shape[1], self._num_objectives))

        with tf.GradientTape() as tape:
            loss_info = self.loss(observations,
                                  actions,
                                  objective_values,
                                  weights=weights,
                                  training=True)

        variables_to_train = self._variables_to_train()
        if not variables_to_train:
            logging.info('No variable to train in the agent.')
            return loss_info

        grads = tape.gradient(loss_info.loss, variables_to_train)
        # Materialize as a tuple: in Python 3, zip returns a single-use iterator.
        grads_and_vars = tuple(zip(grads, variables_to_train))
        if self._gradient_clipping is not None:
            grads_and_vars = eager_utils.clip_gradient_norms(
                grads_and_vars, self._gradient_clipping)

        if self._summarize_grads_and_vars:
            eager_utils.add_variables_summaries(grads_and_vars,
                                                self.train_step_counter)
            eager_utils.add_gradients_summaries(grads_and_vars,
                                                self.train_step_counter)

        self._optimizer.apply_gradients(grads_and_vars)
        self.train_step_counter.assign_add(1)

        return loss_info
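
# Hedged stand-in for the summarize_grads_and_vars branch above: write a global
# gradient-norm scalar to a tf.summary writer keyed by the train step counter.
# The log directory and summary tag are illustrative only.
import tensorflow as tf

writer = tf.summary.create_file_writer('/tmp/bandit_grad_summaries')
train_step_counter = tf.Variable(0, dtype=tf.int64)
grads = [tf.random.uniform((3,)), tf.random.uniform((2, 2))]
with writer.as_default(step=train_step_counter):
    tf.summary.scalar('gradient_global_norm', tf.linalg.global_norm(grads))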
Example #7
def testProcessExperiencePerArmFeaturesWithMask(self):
     mask_spec = tensor_spec.BoundedTensorSpec(shape=(5, ),
                                               minimum=0,
                                               maximum=1,
                                               dtype=tf.int32)
     observation_spec = ({
         'global':
         tf.TensorSpec(shape=(4, ), dtype=tf.float32),
         'per_arm': {
             'f1': tf.TensorSpec(shape=(5, ), dtype=tf.string),
             'f2': tf.TensorSpec(shape=(5, 2), dtype=tf.int32)
         }
     }, mask_spec)
     time_step_spec = time_step.time_step_spec(observation_spec)
     policy_info_spec = policy_utilities.PerArmPolicyInfo(
         chosen_arm_features={
             'f1': tf.TensorSpec(shape=(), dtype=tf.string),
             'f2': tf.TensorSpec(shape=(2, ), dtype=tf.int32)
         })
     training_data_spec = trajectory.Trajectory(
         step_type=time_step_spec.step_type,
         observation=time_step_spec.observation,
         action=tensor_spec.BoundedTensorSpec(shape=(),
                                              minimum=0,
                                              maximum=4,
                                              dtype=tf.int32),
         policy_info=policy_info_spec,
         next_step_type=time_step_spec.step_type,
         reward=tensor_spec.BoundedTensorSpec(shape=(),
                                              minimum=0,
                                              maximum=2,
                                              dtype=tf.float32),
         discount=time_step_spec.discount)
     experience = tensor_spec.sample_spec_nest(training_data_spec,
                                               outer_dims=(7, 2))
     observation, action, reward = utils.process_experience_for_neural_agents(
         experience, lambda x: (x[0], x[1]), True, training_data_spec)
     self.assertEqual(
         observation['per_arm']['f1'][0],
         experience.policy_info.chosen_arm_features['f1'][0, 0])
     self.assertAllEqual(action, tf.zeros(14, dtype=tf.int32))
     self.assertEqual(reward[0], experience.reward[0, 0])
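
# Plain-TF sketch (not the library code) of what the per-arm test above checks:
# after processing, the per-arm observation carries only the chosen arm's features
# and every action index collapses to zero, since that single arm now sits at slot 0.
import tensorflow as tf

batch_size = 14                                  # the (7, 2) outer dims, flattened
chosen_arm_features = {
    'f1': tf.fill([batch_size], 'arm'),
    'f2': tf.zeros([batch_size, 2], dtype=tf.int32),
}
observation = {'global': tf.zeros([batch_size, 4]),
               'per_arm': chosen_arm_features}
action = tf.zeros([batch_size], dtype=tf.int32)  # matches assertAllEqual(action, tf.zeros(14, ...))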