Example #1
    def testMaskedAction(self):
        tf.compat.v1.set_random_seed(1)
        action_spec = tensor_spec.BoundedTensorSpec((), tf.int32, 0, 2)
        observation_spec = (tensor_spec.TensorSpec([2], tf.float32),
                            tensor_spec.TensorSpec([3], tf.int32))
        time_step_spec = ts.time_step_spec(observation_spec)

        def split_fn(obs):
            return obs[0], obs[1]

        policy = boltzmann_reward_policy.BoltzmannRewardPredictionPolicy(
            time_step_spec,
            action_spec,
            reward_network=DummyNet(observation_spec[0]),
            observation_and_action_constraint_splitter=split_fn)

        observations = (tf.constant([[1, 2], [3, 4]], dtype=tf.float32),
                        tf.constant([[0, 0, 1], [0, 1, 0]], dtype=tf.int32))
        time_step = ts.restart(observations, batch_size=2)
        action_step = policy.action(time_step, seed=1)
        self.assertEqual(action_step.action.shape.as_list(), [2])
        self.assertEqual(action_step.action.dtype, tf.int32)
        # Initialize all variables
        self.evaluate(tf.compat.v1.global_variables_initializer())
        self.assertAllEqual(self.evaluate(action_step.action), [2, 1])
Example #2
 def testBoltzmannGumbelPredictedRewards(self):
   tf.compat.v1.set_random_seed(1)
   num_samples_list = []
   for k in range(3):
     num_samples_list.append(
         tf.compat.v2.Variable(
             tf.zeros([], dtype=tf.int32), name='num_samples_{}'.format(k)))
   num_samples_list[0].assign_add(2)
   num_samples_list[1].assign_add(4)
   num_samples_list[2].assign_add(1)
   policy = boltzmann_reward_policy.BoltzmannRewardPredictionPolicy(
       self._time_step_spec,
       self._action_spec,
       reward_network=DummyNet(self._obs_spec),
       boltzmann_gumbel_exploration_constant=10.0,
       emit_policy_info=('predicted_rewards_mean',),
       num_samples_list=num_samples_list)
   observations = tf.constant([[1, 2], [3, 4]], dtype=tf.float32)
   time_step = ts.restart(observations, batch_size=2)
   action_step = policy.action(time_step, seed=1)
   self.assertEqual(action_step.action.shape.as_list(), [2])
   self.assertEqual(action_step.action.dtype, tf.int32)
   # Initialize all variables
   self.evaluate(tf.compat.v1.global_variables_initializer())
   p_info = self.evaluate(action_step.info)
   self.assertAllEqual(p_info.predicted_rewards_mean.shape, [2, 3])
Example #3
    def testBuild(self):
        policy = boltzmann_reward_policy.BoltzmannRewardPredictionPolicy(
            self._time_step_spec,
            self._action_spec,
            reward_network=DummyNet(self._obs_spec))

        self.assertEqual(policy.time_step_spec, self._time_step_spec)
        self.assertEqual(policy.action_spec, self._action_spec)
Example #4
 def testMultipleActionsRaiseError(self):
     action_spec = [tensor_spec.BoundedTensorSpec((), tf.int32, 0, 2)] * 2
     with self.assertRaisesRegexp(
             NotImplementedError,
             'action_spec can only contain a single BoundedTensorSpec'):
         boltzmann_reward_policy.BoltzmannRewardPredictionPolicy(
             self._time_step_spec,
             action_spec,
             reward_network=DummyNet(self._obs_spec))
Example #5
 def testWrongActionsRaiseError(self):
   action_spec = tensor_spec.BoundedTensorSpec((5, 6, 7), tf.float32, 0, 2)
   with self.assertRaisesRegexp(
       NotImplementedError,
       'action_spec must be a BoundedTensorSpec of type int32.*'):
     boltzmann_reward_policy.BoltzmannRewardPredictionPolicy(
         self._time_step_spec,
         action_spec,
         reward_network=DummyNet(self._obs_spec))
Example #6
 def testActionHeteroscedastic(self):
   tf.compat.v1.set_random_seed(1)
   policy = boltzmann_reward_policy.BoltzmannRewardPredictionPolicy(
       self._time_step_spec, self._action_spec,
       reward_network=HeteroscedasticDummyNet())
   observations = tf.constant([[1, 2], [3, 4]], dtype=tf.float32)
   time_step = ts.restart(observations, batch_size=2)
   action_step = policy.action(time_step, seed=1)
   self.assertEqual(action_step.action.shape.as_list(), [2])
   self.assertEqual(action_step.action.dtype, tf.int32)
   # Initialize all variables
   self.evaluate(tf.compat.v1.global_variables_initializer())
   self.assertAllInSet(self.evaluate(action_step.action), [0, 1, 2])
Example #7
 def testWrongOutputLayerRaiseError(self):
     tf.compat.v1.set_random_seed(1)
     action_spec = tensor_spec.BoundedTensorSpec((), tf.int32, 10, 20)
     policy = boltzmann_reward_policy.BoltzmannRewardPredictionPolicy(
         self._time_step_spec,
         action_spec,
         reward_network=DummyNet(self._obs_spec))
     observations = tf.constant([[1, 2], [3, 4]], dtype=tf.float32)
     time_step = ts.restart(observations, batch_size=2)
     with self.assertRaisesRegexp(
             ValueError,
             r'The number of actions \(11\) does not match the reward_network output'
             r' size \(3\)\.'):
         policy.action(time_step, seed=1)
Example #8
 def testPredictedRewards(self):
     tf.compat.v1.set_random_seed(1)
     policy = boltzmann_reward_policy.BoltzmannRewardPredictionPolicy(
         self._time_step_spec,
         self._action_spec,
         reward_network=DummyNet(self._obs_spec),
         emit_policy_info=('predicted_rewards_mean', ))
     observations = tf.constant([[1, 2], [3, 4]], dtype=tf.float32)
     time_step = ts.restart(observations, batch_size=2)
     action_step = policy.action(time_step, seed=1)
     self.assertEqual(action_step.action.shape.as_list(), [2])
     self.assertEqual(action_step.action.dtype, tf.int32)
     # Initialize all variables
     self.evaluate(tf.compat.v1.global_variables_initializer())
     p_info = self.evaluate(action_step.info)
     self.assertAllEqual(p_info.predicted_rewards_mean.shape, [2, 3])
Example #9
    def testActionScalarSpecWithShift(self):
        tf.compat.v1.set_random_seed(1)
        action_spec = tensor_spec.BoundedTensorSpec((), tf.int32, 10, 12)
        policy = boltzmann_reward_policy.BoltzmannRewardPredictionPolicy(
            self._time_step_spec,
            action_spec,
            reward_network=DummyNet(self._obs_spec))

        observations = tf.constant([[1, 2], [3, 4]], dtype=tf.float32)
        time_step = ts.restart(observations, batch_size=2)
        action_step = policy.action(time_step, seed=1)
        self.assertEqual(action_step.action.shape.as_list(), [2])
        self.assertEqual(action_step.action.dtype, tf.int32)
        # Initialize all variables
        self.evaluate(tf.compat.v1.global_variables_initializer())
        self.assertAllInSet(self.evaluate(action_step.action), [10, 11, 12])
Example #10
    def testPerArmRewardsVariableNumActions(self):
        tf.compat.v1.set_random_seed(3000)
        obs_spec = bandit_spec_utils.create_per_arm_observation_spec(
            2, 3, 4, add_num_actions_feature=True)
        time_step_spec = ts.time_step_spec(obs_spec)
        action_spec = tensor_spec.BoundedTensorSpec((), tf.int32, 0, 3)
        reward_network = (global_and_arm_feature_network.
                          create_feed_forward_common_tower_network(
                              obs_spec, (4, 3), (3, 4), (4, 2)))

        policy = boltzmann_reward_policy.BoltzmannRewardPredictionPolicy(
            time_step_spec,
            action_spec,
            reward_network=reward_network,
            accepts_per_arm_features=True,
            emit_policy_info=('predicted_rewards_mean', ))
        action_feature = tf.cast(tf.reshape(tf.random.shuffle(tf.range(24)),
                                            shape=[2, 4, 3]),
                                 dtype=tf.float32)
        observations = {
            bandit_spec_utils.GLOBAL_FEATURE_KEY:
            tf.constant([[1, 2], [3, 4]], dtype=tf.float32),
            bandit_spec_utils.PER_ARM_FEATURE_KEY:
            action_feature,
            bandit_spec_utils.NUM_ACTIONS_FEATURE_KEY:
            tf.constant([2, 3], dtype=tf.int32)
        }
        time_step = ts.restart(observations, batch_size=2)
        action_step = policy.action(time_step, seed=1)
        self.assertEqual(action_step.action.shape.as_list(), [2])
        self.assertEqual(action_step.action.dtype, tf.int32)
        # Initialize all variables
        self.evaluate(tf.compat.v1.global_variables_initializer())
        action, p_info, first_arm_features = self.evaluate([
            action_step.action, action_step.info,
            observations[bandit_spec_utils.PER_ARM_FEATURE_KEY][0]
        ])
        self.assertAllEqual(action.shape, [2])
        self.assertAllEqual(p_info.predicted_rewards_mean.shape, [2, 4])
        self.assertAllEqual(p_info.chosen_arm_features.shape, [2, 3])
        first_action = action[0]
        self.assertAllEqual(p_info.chosen_arm_features[0],
                            first_arm_features[first_action])
Example #11
    def __init__(
            self,
            time_step_spec: types.TimeStep,
            action_spec: types.BoundedTensorSpec,
            reward_network: types.Network,
            optimizer: types.Optimizer,
            temperature: types.FloatOrReturningFloat = 1.0,
            observation_and_action_constraint_splitter: Optional[
                types.Splitter] = None,
            accepts_per_arm_features: bool = False,
            constraints: Iterable[constr.NeuralConstraint] = (),
            # Params for training.
            error_loss_fn: types.LossFn = tf.compat.v1.losses.mean_squared_error,
            gradient_clipping: Optional[float] = None,
            # Params for debugging.
            debug_summaries: bool = False,
            summarize_grads_and_vars: bool = False,
            enable_summaries: bool = True,
            emit_policy_info: Tuple[Text, ...] = (),
            train_step_counter: Optional[tf.Variable] = None,
            name: Optional[Text] = None):
        """Creates a Neural Boltzmann Agent.

    Args:
      time_step_spec: A `TimeStep` spec of the expected time_steps.
      action_spec: A nest of `BoundedTensorSpec` representing the actions.
      reward_network: A `tf_agents.network.Network` to be used by the agent. The
        network will be called with call(observation, step_type) and it is
        expected to provide a reward prediction for all actions.
        *Note*: when using `observation_and_action_constraint_splitter`, make
        sure the `reward_network` is compatible with the network-specific half
        of the output of the `observation_and_action_constraint_splitter`. In
        particular, `observation_and_action_constraint_splitter` will be called
        on the observation before passing to the network.
      optimizer: The optimizer to use for training.
      temperature: float or callable that returns a float. The temperature used
        in the Boltzmann exploration.
      observation_and_action_constraint_splitter: A function used for masking
        valid/invalid actions with each state of the environment. The function
        takes in a full observation and returns a tuple consisting of 1) the
        part of the observation intended as input to the bandit agent and
        policy, and 2) the boolean mask. This function should also work with a
        `TensorSpec` as input, and should output `TensorSpec` objects for the
        observation and mask.
      accepts_per_arm_features: (bool) Whether the policy accepts per-arm
        features.
      constraints: iterable of constraints objects that are instances of
        `tf_agents.bandits.agents.NeuralConstraint`.
      error_loss_fn: A function for computing the error loss, taking parameters
        labels, predictions, and weights (any function from tf.losses would
        work). The default is `tf.losses.mean_squared_error`.
      gradient_clipping: A float representing the norm length to clip gradients
        (or None for no clipping).
      debug_summaries: A Python bool, default False. When True, debug summaries
        are gathered.
      summarize_grads_and_vars: A Python bool, default False. When True,
        gradients and network variable summaries are written during training.
      enable_summaries: A Python bool, default True. When False, all summaries
        (debug or otherwise) should not be written.
      emit_policy_info: (tuple of strings) what side information we want to get
        as part of the policy info. Allowed values can be found in
        `policy_utilities.PolicyInfo`.
      train_step_counter: An optional `tf.Variable` to increment every time the
        train op is run.  Defaults to the `global_step`.
      name: Python str name of this agent. All variables in this module will
        fall under that name. Defaults to the class name.

    Raises:
      ValueError: If the action spec contains more than one action or if it is
        not a bounded scalar int32 spec with minimum 0.
    """
        super(NeuralBoltzmannAgent,
              self).__init__(time_step_spec=time_step_spec,
                             action_spec=action_spec,
                             reward_network=reward_network,
                             optimizer=optimizer,
                             observation_and_action_constraint_splitter=(
                                 observation_and_action_constraint_splitter),
                             accepts_per_arm_features=accepts_per_arm_features,
                             constraints=constraints,
                             error_loss_fn=error_loss_fn,
                             gradient_clipping=gradient_clipping,
                             debug_summaries=debug_summaries,
                             summarize_grads_and_vars=summarize_grads_and_vars,
                             enable_summaries=enable_summaries,
                             emit_policy_info=emit_policy_info,
                             train_step_counter=train_step_counter,
                             name=name)
        self._policy = boltzmann_policy.BoltzmannRewardPredictionPolicy(
            time_step_spec,
            action_spec,
            reward_network,
            temperature,
            observation_and_action_constraint_splitter,
            constraints=constraints,
            accepts_per_arm_features=accepts_per_arm_features,
            emit_policy_info=emit_policy_info)

        self._collect_policy = self._policy
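
The constructor above plugs the Boltzmann reward-prediction policy into a neural bandit agent. The following is a minimal usage sketch, not taken from the examples on this page: it assumes the class is exposed as `tf_agents.bandits.agents.neural_boltzmann_agent.NeuralBoltzmannAgent` and uses a small `QNetwork` purely for illustration, since any network that emits one predicted reward per action should be acceptable here.

import tensorflow as tf
from tf_agents.bandits.agents import neural_boltzmann_agent
from tf_agents.networks import q_network
from tf_agents.specs import tensor_spec
from tf_agents.trajectories import time_step as ts

# Scalar int32 action spec with minimum 0, as the Raises section requires.
obs_spec = tensor_spec.TensorSpec([2], tf.float32)
action_spec = tensor_spec.BoundedTensorSpec((), tf.int32, 0, 2)

# Illustrative reward model: a QNetwork outputs one value per action, which
# the policy interprets as predicted rewards.
reward_net = q_network.QNetwork(obs_spec, action_spec, fc_layer_params=(8,))

agent = neural_boltzmann_agent.NeuralBoltzmannAgent(
    time_step_spec=ts.time_step_spec(obs_spec),
    action_spec=action_spec,
    reward_network=reward_net,
    optimizer=tf.keras.optimizers.Adam(learning_rate=0.01),
    # Lower temperature behaves more greedily; higher approaches uniform.
    temperature=0.5)

# Sample Boltzmann-distributed actions for a batch of two observations.
time_step = ts.restart(tf.constant([[1., 2.], [3., 4.]]), batch_size=2)
action_step = agent.collect_policy.action(time_step, seed=1)

Training would then proceed by calling `agent.train` on batched experience, as with the other reward-prediction bandit agents in TF-Agents.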
Example #12
    def testPerArmRewardsSparseObs(self):
        tf.compat.v1.set_random_seed(3000)
        obs_spec = {
            'global': {
                'sport': tensor_spec.TensorSpec((), tf.string)
            },
            'per_arm': {
                'name': tensor_spec.TensorSpec((3, ), tf.string),
                'fruit': tensor_spec.TensorSpec((3, ), tf.string)
            }
        }
        columns_a = tf.feature_column.indicator_column(
            tf.feature_column.categorical_column_with_vocabulary_list(
                'name', ['bob', 'george', 'wanda']))
        columns_b = tf.feature_column.indicator_column(
            tf.feature_column.categorical_column_with_vocabulary_list(
                'fruit', ['banana', 'kiwi', 'pear']))
        columns_c = tf.feature_column.indicator_column(
            tf.feature_column.categorical_column_with_vocabulary_list(
                'sport', ['bridge', 'chess', 'snooker']))

        reward_network = (global_and_arm_feature_network.
                          create_feed_forward_common_tower_network(
                              observation_spec=obs_spec,
                              global_layers=(4, 3, 2),
                              arm_layers=(6, 5, 4),
                              common_layers=(7, 6, 5),
                              global_preprocessing_combiner=(
                                  tf.compat.v2.keras.layers.DenseFeatures(
                                      [columns_c])),
                              arm_preprocessing_combiner=tf.compat.v2.keras.
                              layers.DenseFeatures([columns_a, columns_b])))

        time_step_spec = ts.time_step_spec(obs_spec)
        action_spec = tensor_spec.BoundedTensorSpec((), tf.int32, 0, 2)
        policy = boltzmann_reward_policy.BoltzmannRewardPredictionPolicy(
            time_step_spec,
            action_spec,
            reward_network=reward_network,
            accepts_per_arm_features=True,
            emit_policy_info=('predicted_rewards_mean',))
        observations = {
            'global': {
                'sport': tf.constant(['snooker', 'chess'])
            },
            'per_arm': {
                'name':
                tf.constant([['george', 'george', 'george'],
                             ['bob', 'bob', 'bob']]),
                'fruit':
                tf.constant([['banana', 'banana', 'banana'],
                             ['kiwi', 'kiwi', 'kiwi']])
            }
        }

        time_step = ts.restart(observations, batch_size=2)
        action_step = policy.action(time_step, seed=1)
        self.assertEqual(action_step.action.shape.as_list(), [2])
        self.assertEqual(action_step.action.dtype, tf.int32)
        # Initialize all variables
        self.evaluate([
            tf.compat.v1.global_variables_initializer(),
            tf.compat.v1.tables_initializer()
        ])
        action, p_info, first_arm_name_feature = self.evaluate([
            action_step.action, action_step.info,
            observations[bandit_spec_utils.PER_ARM_FEATURE_KEY]['name'][0]
        ])
        self.assertAllEqual(action.shape, [2])
        self.assertAllEqual(p_info.predicted_rewards_mean.shape, [2, 3])
        self.assertAllEqual(p_info.chosen_arm_features['name'].shape, [2])
        self.assertAllEqual(p_info.chosen_arm_features['fruit'].shape, [2])
        first_action = action[0]
        self.assertAllEqual(p_info.chosen_arm_features['name'][0],
                            first_arm_name_feature[first_action])
Example #13
    def testPerArmRewards(self):
        tf.compat.v1.set_random_seed(3000)
        obs_spec = bandit_spec_utils.create_per_arm_observation_spec(2, 3, 4)
        time_step_spec = ts.time_step_spec(obs_spec)
        action_spec = tensor_spec.BoundedTensorSpec((), tf.int32, 0, 3)
        reward_network = (global_and_arm_feature_network.
                          create_feed_forward_common_tower_network(
                              obs_spec, (4, 3), (3, 4), (4, 2)))

        policy = boltzmann_reward_policy.BoltzmannRewardPredictionPolicy(
            time_step_spec,
            action_spec,
            reward_network=reward_network,
            accepts_per_arm_features=True,
            emit_policy_info=('predicted_rewards_mean', ))
        action_feature = tf.cast(tf.reshape(tf.random.shuffle(tf.range(24)),
                                            shape=[2, 4, 3]),
                                 dtype=tf.float32)
        observations = {
            bandit_spec_utils.GLOBAL_FEATURE_KEY:
            tf.constant([[1, 2], [3, 4]], dtype=tf.float32),
            bandit_spec_utils.PER_ARM_FEATURE_KEY:
            action_feature
        }
        time_step = ts.restart(observations, batch_size=2)
        action_step = policy.action(time_step, seed=1)
        self.assertEqual(action_step.action.shape.as_list(), [2])
        self.assertEqual(action_step.action.dtype, tf.int32)
        # Initialize all variables
        self.evaluate(tf.compat.v1.global_variables_initializer())
        action, p_info, first_arm_features = self.evaluate([
            action_step.action, action_step.info,
            observations[bandit_spec_utils.PER_ARM_FEATURE_KEY][0]
        ])
        self.assertAllEqual(action.shape, [2])
        self.assertAllEqual(p_info.predicted_rewards_mean.shape, [2, 4])
        self.assertAllEqual(p_info.chosen_arm_features.shape, [2, 3])
        first_action = action[0]
        self.assertAllEqual(p_info.chosen_arm_features[0],
                            first_arm_features[first_action])

        # Check that zeroing out some of the actions does not affect the predicted
        # rewards for unchanged actions. This is to make sure that action feature
        # padding does not influence the behavior.

        if not tf.executing_eagerly():
            # The below comparison will only work in tf2 because the random
            # per-arm observations get re-drawn in tf1.
            return
        padded_action_feature = tf.concat(
            [action_feature[:, 0:1, :],
             tf.zeros(shape=[2, 3, 3], dtype=tf.float32)],
            axis=1)
        observations = {
            bandit_spec_utils.GLOBAL_FEATURE_KEY:
            tf.constant([[1, 2], [3, 4]], dtype=tf.float32),
            bandit_spec_utils.PER_ARM_FEATURE_KEY:
            padded_action_feature
        }
        time_step = ts.restart(observations, batch_size=2)
        padded_action_step = policy.action(time_step, seed=1)
        padded_p_info = self.evaluate(padded_action_step.info)
        self.assertAllEqual(p_info.predicted_rewards_mean[:, 0],
                            padded_p_info.predicted_rewards_mean[:, 0])