def __init__(
            self,
            time_step_spec,
            action_spec,
            reward_network,
            optimizer,
            observation_and_action_constraint_splitter=None,
            # Params for training.
            error_loss_fn=tf.compat.v1.losses.mean_squared_error,
            gradient_clipping=None,
            # Params for debugging.
            debug_summaries=False,
            summarize_grads_and_vars=False,
            enable_summaries=True,
            expose_predicted_rewards=False,
            train_step_counter=None,
            name=None):
        """Creates a Greedy Reward Network Prediction Agent.

    Args:
      time_step_spec: A `TimeStep` spec of the expected time_steps.
      action_spec: A nest of `BoundedTensorSpec` representing the actions.
      reward_network: A `tf_agents.network.Network` to be used by the agent. The
        network will be called with call(observation, step_type) and it is
        expected to provide a reward prediction for all actions.
      optimizer: The optimizer to use for training.
      observation_and_action_constraint_splitter: A function used for masking
        valid/invalid actions with each state of the environment. The function
        takes in a full observation and returns a tuple consisting of 1) the
        part of the observation intended as input to the bandit agent and
        policy, and 2) the boolean mask. This function should also work with a
        `TensorSpec` as input, and should output `TensorSpec` objects for the
        observation and mask.
      error_loss_fn: A function for computing the error loss, taking parameters
        labels, predictions, and weights (any function from tf.losses would
        work). The default is `tf.losses.mean_squared_error`.
      gradient_clipping: A float representing the norm length to clip gradients
        (or `None` for no clipping).
      debug_summaries: A Python bool, default False. When True, debug summaries
        are gathered.
      summarize_grads_and_vars: A Python bool, default False. When True,
        gradients and network variable summaries are written during training.
      enable_summaries: A Python bool, default True. When False, no summaries
        (debug or otherwise) are written.
      expose_predicted_rewards: (bool) Whether to expose the predicted rewards
        in the policy info field under the name 'predicted_rewards'.
      train_step_counter: An optional `tf.Variable` to increment every time the
        train op is run.  Defaults to the `global_step`.
      name: Python str name of this agent. All variables in this module will
        fall under that name. Defaults to the class name.

    Raises:
      ValueError: If the action spec contains more than one action or if it is
        not a bounded scalar int32 spec with minimum 0.
    """
        tf.Module.__init__(self, name=name)
        self._observation_and_action_constraint_splitter = (
            observation_and_action_constraint_splitter)
        self._num_actions = bandit_utils.get_num_actions_from_tensor_spec(
            action_spec)

        self._reward_network = reward_network
        self._optimizer = optimizer
        self._error_loss_fn = error_loss_fn
        self._gradient_clipping = gradient_clipping

        policy = greedy_reward_policy.GreedyRewardPredictionPolicy(
            time_step_spec,
            action_spec,
            reward_network,
            observation_and_action_constraint_splitter,
            expose_predicted_rewards=expose_predicted_rewards)

        super(GreedyRewardPredictionAgent,
              self).__init__(time_step_spec,
                             action_spec,
                             policy,
                             collect_policy=policy,
                             train_sequence_length=None,
                             debug_summaries=debug_summaries,
                             summarize_grads_and_vars=summarize_grads_and_vars,
                             enable_summaries=enable_summaries,
                             train_step_counter=train_step_counter)
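
# --- Hedged illustration (not part of the original source) ---
# A minimal sketch of an `observation_and_action_constraint_splitter`, assuming
# the environment emits a dict observation with hypothetical keys 'observation'
# and 'mask'. The same callable must also accept the matching spec nest and
# return the observation and mask specs, which these key lookups already do.
def example_constraint_splitter(obs):
    # First element feeds the bandit policy / reward network; second is the
    # boolean mask of currently valid actions.
    return obs['observation'], obs['mask']
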
    def __init__(
            self,
            time_step_spec,
            action_spec,
            reward_network,
            optimizer,
            # Params for training.
            error_loss_fn=tf.compat.v1.losses.mean_squared_error,
            gradient_clipping=None,
            # Params for debugging.
            debug_summaries=False,
            summarize_grads_and_vars=False,
            train_step_counter=None,
            name=None):
        """Creates a Greedy Reward Network Prediction Agent.

    Args:
      time_step_spec: A `TimeStep` spec of the expected time_steps.
      action_spec: A nest of `BoundedTensorSpec` representing the actions.
      reward_network: A `tf_agents.network.Network` to be used by the agent. The
        network will be called with call(observation, step_type) and it is
        expected to provide a reward prediction for all actions.
      optimizer: The optimizer to use for training.
      error_loss_fn: A function for computing the error loss, taking parameters
        labels, predictions, and weights (any function from tf.losses would
        work). The default is `tf.losses.mean_squared_error`.
      gradient_clipping: A float representing the norm length to clip gradients
        (or `None` for no clipping).
      debug_summaries: A Python bool, default False. When True, debug summaries
        are gathered.
      summarize_grads_and_vars: A Python bool, default False. When True,
        gradients and network variable summaries are written during training.
      train_step_counter: An optional `tf.Variable` to increment every time the
        train op is run.  Defaults to the `global_step`.
      name: Python str name of this agent. All variables in this module will
        fall under that name. Defaults to the class name.

    Raises:
      ValueError: If the action spec contains more than one action or if it is
        not a bounded scalar int32 spec with minimum 0.
    """
        tf.Module.__init__(self, name=name)
        self._num_actions = bandit_utils.get_num_actions_from_tensor_spec(
            action_spec)

        self._reward_network = reward_network
        self._optimizer = optimizer
        self._error_loss_fn = error_loss_fn
        self._gradient_clipping = gradient_clipping

        policy = greedy_reward_policy.GreedyRewardPredictionPolicy(
            time_step_spec, action_spec, reward_network=self._reward_network)

        super(GreedyRewardPredictionAgent,
              self).__init__(time_step_spec,
                             action_spec,
                             policy,
                             collect_policy=policy,
                             train_sequence_length=None,
                             debug_summaries=debug_summaries,
                             summarize_grads_and_vars=summarize_grads_and_vars,
                             train_step_counter=train_step_counter)
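
# --- Hedged usage sketch (not part of the original source) ---
# Assumes the surrounding module exposes GreedyRewardPredictionAgent and that a
# plain QNetwork works as reward_network, since it outputs one value per action.
# The layer sizes and learning rate below are arbitrary illustration values.
import tensorflow as tf
from tf_agents.networks import q_network
from tf_agents.specs import tensor_spec
from tf_agents.trajectories import time_step as ts

observation_spec = tensor_spec.TensorSpec([4], tf.float32)
example_action_spec = tensor_spec.BoundedTensorSpec(
    (), tf.int32, minimum=0, maximum=2)
example_reward_net = q_network.QNetwork(
    observation_spec, example_action_spec, fc_layer_params=(16,))
example_agent = GreedyRewardPredictionAgent(
    ts.time_step_spec(observation_spec),
    example_action_spec,
    reward_network=example_reward_net,
    optimizer=tf.compat.v1.train.AdamOptimizer(learning_rate=0.01))
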
    def __init__(
            self,
            time_step_spec,
            action_spec,
            reward_network,
            optimizer,
            observation_and_action_constraint_splitter=None,
            accepts_per_arm_features=False,
            constraints=(),
            # Params for training.
            error_loss_fn=tf.compat.v1.losses.mean_squared_error,
            gradient_clipping=None,
            # Params for debugging.
            debug_summaries=False,
            summarize_grads_and_vars=False,
            enable_summaries=True,
            emit_policy_info=(),
            train_step_counter=None,
            laplacian_matrix=None,
            laplacian_smoothing_weight=0.001,
            name=None):
        """Creates a Greedy Reward Network Prediction Agent.

     In some use cases, the actions are not independent and they are related to
     each other (e.g., when the actions are ordinal integers). Assuming that
     the relations between arms can be modeled by a graph, we may want to
     enforce that the estimated reward function is smooth over the graph. This
     implies that the estimated rewards `r_i` and `r_j` for two related actions
     `i` and `j`, should be close to each other. To quantify this smoothness
     criterion we use the Laplacian matrix `L` of the graph over the actions.
     When Laplacian smoothing is enabled, the loss is extended to:
     ```
       Loss_new := Loss + lambda * r^T * L * r,
     ```
     where `r` is the estimated reward vector for all actions. The second
     term is the Laplacian smoothing regularization term and `lambda` is the
     weight that determines how strongly we enforce the regularization.
     For more details, please see:
     "Bandits on graphs and structures", Michal Valko
     https://hal.inria.fr/tel-01359757/document

    Args:
      time_step_spec: A `TimeStep` spec of the expected time_steps.
      action_spec: A nest of `BoundedTensorSpec` representing the actions.
      reward_network: A `tf_agents.network.Network` to be used by the agent. The
        network will be called with call(observation, step_type) and it is
        expected to provide a reward prediction for all actions.
      optimizer: The optimizer to use for training.
      observation_and_action_constraint_splitter: A function used for masking
        valid/invalid actions with each state of the environment. The function
        takes in a full observation and returns a tuple consisting of 1) the
        part of the observation intended as input to the bandit agent and
        policy, and 2) the boolean mask. This function should also work with a
        `TensorSpec` as input, and should output `TensorSpec` objects for the
        observation and mask.
      accepts_per_arm_features: (bool) Whether the policy accepts per-arm
        features.
      constraints: iterable of constraints objects that are instances of
        `tf_agents.bandits.agents.NeuralConstraint`.
      error_loss_fn: A function for computing the error loss, taking parameters
        labels, predictions, and weights (any function from tf.losses would
        work). The default is `tf.losses.mean_squared_error`.
      gradient_clipping: A float representing the norm length to clip gradients
        (or `None` for no clipping).
      debug_summaries: A Python bool, default False. When True, debug summaries
        are gathered.
      summarize_grads_and_vars: A Python bool, default False. When True,
        gradients and network variable summaries are written during training.
      enable_summaries: A Python bool, default True. When False, no summaries
        (debug or otherwise) are written.
      emit_policy_info: (tuple of strings) what side information we want to get
        as part of the policy info. Allowed values can be found in
        `policy_utilities.PolicyInfo`.
      train_step_counter: An optional `tf.Variable` to increment every time the
        train op is run.  Defaults to the `global_step`.
      laplacian_matrix: A float `Tensor` or a numpy array shaped
        `[num_actions, num_actions]`. This holds the Laplacian matrix used to
        regularize the smoothness of the estimated expected reward function.
        This only applies to problems where the actions have a graph structure.
        If `None`, the regularization is not applied.
      laplacian_smoothing_weight: A float that determines the weight of the
        regularization term. Note that this has no effect if `laplacian_matrix`
        above is `None`.
      name: Python str name of this agent. All variables in this module will
        fall under that name. Defaults to the class name.

    Raises:
      ValueError: If the action spec contains more than one action or if it is
        not a bounded scalar int32 spec with minimum 0.
      InvalidArgumentError: If the provided Laplacian matrix is not `None` and
        is not valid (its rows and columns must sum to zero).
    """
        tf.Module.__init__(self, name=name)
        common.tf_agents_gauge.get_cell('TFABandit').set(True)
        self._observation_and_action_constraint_splitter = (
            observation_and_action_constraint_splitter)
        self._num_actions = bandit_utils.get_num_actions_from_tensor_spec(
            action_spec)
        self._accepts_per_arm_features = accepts_per_arm_features
        self._constraints = constraints

        reward_network.create_variables()
        self._reward_network = reward_network
        self._optimizer = optimizer
        self._error_loss_fn = error_loss_fn
        self._gradient_clipping = gradient_clipping
        self._heteroscedastic = isinstance(
            reward_network, heteroscedastic_q_network.HeteroscedasticQNetwork)
        self._laplacian_matrix = None
        if laplacian_matrix is not None:
            self._laplacian_matrix = tf.convert_to_tensor(laplacian_matrix,
                                                          dtype=tf.float32)
            # Check the validity of the laplacian matrix.
            tf.debugging.assert_near(
                0.0, tf.norm(tf.reduce_sum(self._laplacian_matrix, 1)))
            tf.debugging.assert_near(
                0.0, tf.norm(tf.reduce_sum(self._laplacian_matrix, 0)))
        self._laplacian_smoothing_weight = laplacian_smoothing_weight

        policy = greedy_reward_policy.GreedyRewardPredictionPolicy(
            time_step_spec,
            action_spec,
            reward_network,
            observation_and_action_constraint_splitter,
            constraints=constraints,
            accepts_per_arm_features=accepts_per_arm_features,
            emit_policy_info=emit_policy_info)
        training_data_spec = None
        if accepts_per_arm_features:
            training_data_spec = bandit_spec_utils.drop_arm_observation(
                policy.trajectory_spec,
                observation_and_action_constraint_splitter)

        super(GreedyRewardPredictionAgent,
              self).__init__(time_step_spec,
                             action_spec,
                             policy,
                             collect_policy=policy,
                             train_sequence_length=None,
                             training_data_spec=training_data_spec,
                             debug_summaries=debug_summaries,
                             summarize_grads_and_vars=summarize_grads_and_vars,
                             enable_summaries=enable_summaries,
                             train_step_counter=train_step_counter)
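
# --- Hedged illustration (not part of the original source) ---
# A standalone sketch of the Laplacian smoothing term `lambda * r^T * L * r`
# described in the docstring above, for a 3-action chain graph 0 - 1 - 2. Every
# row and column of a valid graph Laplacian sums to zero, which is exactly what
# the `assert_near` checks in `__init__` verify.
import tensorflow as tf

example_laplacian = tf.constant([[ 1., -1.,  0.],
                                 [-1.,  2., -1.],
                                 [ 0., -1.,  1.]])
estimated_rewards = tf.constant([[0.2], [0.5], [0.9]])  # shape [num_actions, 1]
smoothing_weight = 0.001
smoothing_penalty = smoothing_weight * tf.squeeze(
    tf.matmul(estimated_rewards,
              tf.matmul(example_laplacian, estimated_rewards),
              transpose_a=True))
# `smoothing_penalty` is added to the prediction loss; it stays small when the
# estimated rewards of neighboring actions are close to each other.
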
  def testBuild(self):
    policy = greedy_reward_policy.GreedyRewardPredictionPolicy(
        self._time_step_spec, self._action_spec, reward_network=DummyNet())

    self.assertEqual(policy.time_step_spec, self._time_step_spec)
    self.assertEqual(policy.action_spec, self._action_spec)
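
# --- Hedged illustration (not part of the original source) ---
# `DummyNet` is a test fixture defined elsewhere in the test module; a minimal
# reward network playing the same role could look roughly like this sketch: it
# only has to map observations to one predicted reward per action.
import tensorflow as tf
from tf_agents.networks import network

class SketchRewardNet(network.Network):

    def __init__(self, observation_spec, num_actions, name='SketchRewardNet'):
        super(SketchRewardNet, self).__init__(
            input_tensor_spec=observation_spec, state_spec=(), name=name)
        self._values = tf.keras.layers.Dense(num_actions)

    def call(self, observation, step_type=None, network_state=()):
        # One predicted reward per action, plus the (empty) network state.
        return self._values(tf.cast(observation, tf.float32)), network_state
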
    def testPerArmRewardsSparseObs(self):
        tf.compat.v1.set_random_seed(3000)
        obs_spec = {
            'global': {
                'sport': tensor_spec.TensorSpec((), tf.string)
            },
            'per_arm': {
                'name': tensor_spec.TensorSpec((3, ), tf.string),
                'fruit': tensor_spec.TensorSpec((3, ), tf.string)
            }
        }
        columns_a = tf.feature_column.indicator_column(
            tf.feature_column.categorical_column_with_vocabulary_list(
                'name', ['bob', 'george', 'wanda']))
        columns_b = tf.feature_column.indicator_column(
            tf.feature_column.categorical_column_with_vocabulary_list(
                'fruit', ['banana', 'kiwi', 'pear']))
        columns_c = tf.feature_column.indicator_column(
            tf.feature_column.categorical_column_with_vocabulary_list(
                'sport', ['bridge', 'chess', 'snooker']))

        reward_network = (
            global_and_arm_feature_network.create_feed_forward_common_tower_network(
                observation_spec=obs_spec,
                global_layers=(4, 3, 2),
                arm_layers=(6, 5, 4),
                common_layers=(7, 6, 5),
                global_preprocessing_combiner=(
                    tf.compat.v2.keras.layers.DenseFeatures([columns_c])),
                arm_preprocessing_combiner=(
                    tf.compat.v2.keras.layers.DenseFeatures(
                        [columns_a, columns_b]))))

        time_step_spec = ts.time_step_spec(obs_spec)
        action_spec = tensor_spec.BoundedTensorSpec((), tf.int32, 0, 2)
        policy = greedy_reward_policy.GreedyRewardPredictionPolicy(
            time_step_spec,
            action_spec,
            reward_network=reward_network,
            accepts_per_arm_features=True,
            emit_policy_info=('predicted_rewards_mean', ))
        observations = {
            'global': {
                'sport': tf.constant(['snooker', 'chess'])
            },
            'per_arm': {
                'name':
                tf.constant([['george', 'george', 'george'],
                             ['bob', 'bob', 'bob']]),
                'fruit':
                tf.constant([['banana', 'banana', 'banana'],
                             ['kiwi', 'kiwi', 'kiwi']])
            }
        }
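        # Note: the literal 'global' and 'per_arm' keys above match
        # bandit_spec_utils.GLOBAL_FEATURE_KEY and
        # bandit_spec_utils.PER_ARM_FEATURE_KEY, which is why the lookup on
        # observations[bandit_spec_utils.PER_ARM_FEATURE_KEY]['name'] below works.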

        time_step = ts.restart(observations, batch_size=2)
        action_step = policy.action(time_step, seed=1)
        self.assertEqual(action_step.action.shape.as_list(), [2])
        self.assertEqual(action_step.action.dtype, tf.int32)
        # Initialize all variables
        self.evaluate([
            tf.compat.v1.global_variables_initializer(),
            tf.compat.v1.tables_initializer()
        ])
        action, p_info, first_arm_name_feature = self.evaluate([
            action_step.action, action_step.info,
            observations[bandit_spec_utils.PER_ARM_FEATURE_KEY]['name'][0]
        ])
        self.assertAllEqual(action.shape, [2])
        self.assertAllEqual(p_info.predicted_rewards_mean.shape, [2, 3])
        self.assertAllEqual(p_info.chosen_arm_features['name'].shape, [2])
        self.assertAllEqual(p_info.chosen_arm_features['fruit'].shape, [2])
        first_action = action[0]
        self.assertAllEqual(p_info.chosen_arm_features['name'][0],
                            first_arm_name_feature[first_action])
    def testPerArmRewards(self):
        tf.compat.v1.set_random_seed(3000)
        obs_spec = bandit_spec_utils.create_per_arm_observation_spec(2, 3, 4)
        time_step_spec = ts.time_step_spec(obs_spec)
        action_spec = tensor_spec.BoundedTensorSpec((), tf.int32, 0, 3)
        reward_network = (
            global_and_arm_feature_network.create_feed_forward_common_tower_network(
                obs_spec, (4, 3), (3, 4), (4, 2)))

        policy = greedy_reward_policy.GreedyRewardPredictionPolicy(
            time_step_spec,
            action_spec,
            reward_network=reward_network,
            accepts_per_arm_features=True,
            emit_policy_info=('predicted_rewards_mean', ))
        action_feature = tf.cast(tf.reshape(tf.random.shuffle(tf.range(24)),
                                            shape=[2, 4, 3]),
                                 dtype=tf.float32)
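        # Shape note: create_per_arm_observation_spec(2, 3, 4) above declares a
        # 2-dim global feature, 3-dim per-arm features and 4 arms, so the per-arm
        # features are reshaped to [batch_size=2, num_actions=4, per_arm_dim=3],
        # matching the action spec's maximum of 3 (actions 0..3).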
        observations = {
            bandit_spec_utils.GLOBAL_FEATURE_KEY:
            tf.constant([[1, 2], [3, 4]], dtype=tf.float32),
            bandit_spec_utils.PER_ARM_FEATURE_KEY:
            action_feature
        }
        time_step = ts.restart(observations, batch_size=2)
        action_step = policy.action(time_step, seed=1)
        self.assertEqual(action_step.action.shape.as_list(), [2])
        self.assertEqual(action_step.action.dtype, tf.int32)
        # Initialize all variables
        self.evaluate(tf.compat.v1.global_variables_initializer())
        action, p_info, first_arm_features = self.evaluate([
            action_step.action, action_step.info,
            observations[bandit_spec_utils.PER_ARM_FEATURE_KEY][0]
        ])
        self.assertAllEqual(action.shape, [2])
        self.assertAllEqual(p_info.predicted_rewards_mean.shape, [2, 4])
        self.assertAllEqual(p_info.chosen_arm_features.shape, [2, 3])
        first_action = action[0]
        self.assertAllEqual(p_info.chosen_arm_features[0],
                            first_arm_features[first_action])

        # Check that zeroing out some of the actions does not affect the predicted
        # rewards for unchanged actions. This is to make sure that action feature
        # padding does not influence the behavior.

        if not tf.executing_eagerly():
            # The comparison below only works in TF2 (eager mode), because in TF1
            # graph mode the random per-arm observations get re-drawn on every
            # session run.
            return
        padded_action_feature = tf.concat(
            [action_feature[:, 0:1, :],
             tf.zeros(shape=[2, 3, 3], dtype=tf.float32)],
            axis=1)
        observations = {
            bandit_spec_utils.GLOBAL_FEATURE_KEY:
            tf.constant([[1, 2], [3, 4]], dtype=tf.float32),
            bandit_spec_utils.PER_ARM_FEATURE_KEY:
            padded_action_feature
        }
        time_step = ts.restart(observations, batch_size=2)
        padded_action_step = policy.action(time_step, seed=1)
        padded_p_info = self.evaluate(padded_action_step.info)
        self.assertAllEqual(p_info.predicted_rewards_mean[:, 0],
                            padded_p_info.predicted_rewards_mean[:, 0])