Example #1
    def __init__(self,
                 time_step_spec,
                 action_spec,
                 gamma=1.0,
                 dtype=tf.float32,
                 name=None):
        """Initialize an instance of `LinearThompsonSamplingAgent`.

        Args:
          time_step_spec: A `TimeStep` spec describing the expected `TimeStep`s.
          action_spec: A scalar `BoundedTensorSpec` with `int32` or `int64` dtype
            describing the number of actions for this agent.
          gamma: a float forgetting factor in [0.0, 1.0]. When set to
            1.0, the algorithm does not forget.
          dtype: The type of the parameters stored and updated by the agent. Should
            be one of `tf.float32` and `tf.float64`. Defaults to `tf.float32`.
          name: a name for this instance of `LinearThompsonSamplingAgent`.

        Raises:
          ValueError: If dtype is not one of `tf.float32` or `tf.float64`.
        """
        tf.Module.__init__(self, name=name)
        self._num_actions = bandit_utils.get_num_actions_from_tensor_spec(
            action_spec)
        self._context_dim = int(time_step_spec.observation.shape[0])
        self._gamma = gamma
        if self._gamma < 0.0 or self._gamma > 1.0:
            raise ValueError(
                'Forgetting factor `gamma` must be in [0.0, 1.0].')

        self._weight_covariances = []
        self._parameter_estimators = []
        self._dtype = dtype
        if dtype not in (tf.float32, tf.float64):
            raise ValueError(
                'Agent dtype should be either `tf.float32` or `tf.float64`.')

        for k in range(self._num_actions):
            self._weight_covariances.append(
                tf.compat.v2.Variable(tf.eye(self._context_dim, dtype=dtype),
                                      name='a_' + str(k)))
            self._parameter_estimators.append(
                tf.compat.v2.Variable(tf.zeros(self._context_dim, dtype=dtype),
                                      name='b_' + str(k)))

        policy = ts_policy.LinearThompsonSamplingPolicy(
            action_spec, self._weight_covariances, self._parameter_estimators)
        super(LinearThompsonSamplingAgent,
              self).__init__(time_step_spec=time_step_spec,
                             action_spec=policy.action_spec,
                             policy=policy,
                             collect_policy=policy,
                             train_sequence_length=None)
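
A minimal usage sketch (not part of the listing above): constructing the agent from hand-built specs. The module paths are assumed to be the standard TF-Agents ones, and context_dim / num_actions are made-up values for illustration.

import tensorflow as tf
from tf_agents.bandits.agents import linear_thompson_sampling_agent
from tf_agents.specs import tensor_spec
from tf_agents.trajectories import time_step as ts

context_dim = 4   # dimension of the observation vector seen by the agent
num_actions = 3   # number of bandit arms

observation_spec = tensor_spec.TensorSpec([context_dim], tf.float32)
action_spec = tensor_spec.BoundedTensorSpec(
    shape=(), dtype=tf.int32, minimum=0, maximum=num_actions - 1)

agent = linear_thompson_sampling_agent.LinearThompsonSamplingAgent(
    time_step_spec=ts.time_step_spec(observation_spec),
    action_spec=action_spec,
    gamma=0.95,  # forget old observations; gamma=1.0 keeps all history
    dtype=tf.float32)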
Example #2
    def testActionBatch(self, batch_size, num_actions):
        action_spec = tensor_spec.BoundedTensorSpec(shape=(),
                                                    minimum=0,
                                                    maximum=num_actions - 1,
                                                    dtype=tf.int32,
                                                    name='action')
        policy = lin_ts.LinearThompsonSamplingPolicy(
            action_spec, self._weight_covariance_matrices(num_actions),
            self._parameter_estimators(num_actions), self._time_step_spec)

        action_step = policy.action(
            self._time_step_batch(batch_size, num_actions))

        self.assertEqual(action_step.action.shape.as_list(), [batch_size])
        self.assertEqual(action_step.action.dtype, tf.int32)
        actions = self.evaluate(action_step.action)
        self.assertAllGreaterEqual(actions, 0)
        self.assertAllLessEqual(actions, num_actions - 1)

    def testPredictedRewards(self):
        num_actions = 2
        batch_size = 7
        parameter_estimators = tf.unstack(
            tf.constant([[1, 2], [30, 40]], dtype=tf.float32))
        weight_covariance_matrices = tf.unstack(
            tf.constant([[[1, 0], [0, 1]], [[.5, 0], [0, .5]]],
                        dtype=tf.float32))
        action_spec = tensor_spec.BoundedTensorSpec(shape=(),
                                                    minimum=0,
                                                    maximum=num_actions - 1,
                                                    dtype=tf.int32,
                                                    name='action')
        policy = lin_ts.LinearThompsonSamplingPolicy(
            action_spec,
            self._time_step_spec,
            weight_covariance_matrices,
            parameter_estimators,
            emit_policy_info=('predicted_rewards_mean',
                              'predicted_rewards_sampled'))

        observation = tf.constant([6, 7] * batch_size,
                                  dtype=tf.float32,
                                  shape=[batch_size, 2],
                                  name='observation')
        input_time_step = ts.restart(observation, batch_size=batch_size)
        action_step = policy.action(input_time_step)
        p_info = self.evaluate(action_step.info)

        self.assertEqual(p_info.predicted_rewards_sampled.shape[0], batch_size)
        self.assertEqual(p_info.predicted_rewards_sampled.shape[1],
                         num_actions)

        # Check the predicted rewards means.
        expected_means = [[20, 920]] * batch_size
        self.assertAllClose(p_info.predicted_rewards_mean, expected_means)

        # Check that the returned action is at the argmax of the sampled rewards.
        expected_actions = np.argmax(p_info.predicted_rewards_sampled, axis=-1)
        self.assertAllEqual(self.evaluate(action_step.action),
                            expected_actions)

    def testMaskedActions(self, batch_size, num_actions):
        action_spec = tensor_spec.BoundedTensorSpec(shape=(),
                                                    minimum=0,
                                                    maximum=num_actions - 1,
                                                    dtype=tf.int32,
                                                    name='action')
        obs_spec = (tensor_spec.TensorSpec(self._obs_dim, tf.float32),
                    tensor_spec.TensorSpec(num_actions, tf.int32))
        policy = lin_ts.LinearThompsonSamplingPolicy(
            action_spec,
            ts.time_step_spec(obs_spec),
            self._weight_covariance_matrices(num_actions),
            self._parameter_estimators(num_actions),
            observation_and_action_constraint_splitter=lambda x: (x[0], x[1]))

        action_step = policy.action(
            self._time_step_batch_with_action_mask(batch_size, num_actions))

        self.assertEqual(action_step.action.shape.as_list(), [batch_size])
        self.assertEqual(action_step.action.dtype, tf.int32)
        actions = self.evaluate(action_step.action)
        self.assertAllEqual(actions, range(batch_size))
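
The expected means in testPredictedRewards follow directly from the linear model: for each arm the parameter estimate is theta_k = Cov_k^{-1} b_k, and the predicted mean reward is theta_k . x for the observation x = [6, 7]. A small NumPy check of that arithmetic (a sketch, independent of TF-Agents):

import numpy as np

x = np.array([6., 7.])                            # observation used in the test
b = [np.array([1., 2.]), np.array([30., 40.])]    # parameter estimators per arm
cov = [np.eye(2), 0.5 * np.eye(2)]                # weight covariance matrices per arm

# theta_k = Cov_k^{-1} b_k; mean reward for arm k is theta_k . x
means = [np.linalg.solve(c, b_k).dot(x) for c, b_k in zip(cov, b)]
print(means)  # [20.0, 920.0], matching expected_means above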
Example #5
  def __init__(self,
               time_step_spec,
               action_spec,
               gamma=1.0,
               observation_and_action_constraint_splitter=None,
               dtype=tf.float32,
               name=None):
    """Initialize an instance of `LinearThompsonSamplingAgent`.

    Args:
      time_step_spec: A `TimeStep` spec describing the expected `TimeStep`s.
      action_spec: A scalar `BoundedTensorSpec` with `int32` or `int64` dtype
        describing the number of actions for this agent.
      gamma: a float forgetting factor in [0.0, 1.0]. When set to
        1.0, the algorithm does not forget.
      observation_and_action_constraint_splitter: A function used for masking
        valid/invalid actions with each state of the environment. The function
        takes in a full observation and returns a tuple consisting of 1) the
        part of the observation intended as input to the bandit agent and
        policy, and 2) the boolean mask. This function should also work with a
        `TensorSpec` as input, and should output `TensorSpec` objects for the
        observation and mask.
      dtype: The type of the parameters stored and updated by the agent. Should
        be one of `tf.float32` and `tf.float64`. Defaults to `tf.float32`.
      name: a name for this instance of `LinearThompsonSamplingAgent`.

    Raises:
      ValueError: If dtype is not one of `tf.float32` or `tf.float64`.
    """
    tf.Module.__init__(self, name=name)
    self._num_actions = bandit_utils.get_num_actions_from_tensor_spec(
        action_spec)
    self._observation_and_action_constraint_splitter = (
        observation_and_action_constraint_splitter)
    if observation_and_action_constraint_splitter:
      context_shape = observation_and_action_constraint_splitter(
          time_step_spec.observation)[0].shape.as_list()
    else:
      context_shape = time_step_spec.observation.shape.as_list()
    self._context_dim = (
        tf.compat.dimension_value(context_shape[0]) if context_shape else 1)
    self._gamma = gamma
    if self._gamma < 0.0 or self._gamma > 1.0:
      raise ValueError('Forgetting factor `gamma` must be in [0.0, 1.0].')

    self._weight_covariances = []
    self._parameter_estimators = []
    self._dtype = dtype
    if dtype not in (tf.float32, tf.float64):
      raise ValueError(
          'Agent dtype should be either `tf.float32` or `tf.float64`.')

    for k in range(self._num_actions):
      self._weight_covariances.append(
          tf.compat.v2.Variable(
              tf.eye(self._context_dim, dtype=dtype), name='a_' + str(k)))
      self._parameter_estimators.append(
          tf.compat.v2.Variable(
              tf.zeros(self._context_dim, dtype=dtype), name='b_' + str(k)))

    policy = ts_policy.LinearThompsonSamplingPolicy(
        action_spec,
        time_step_spec,
        self._weight_covariances,
        self._parameter_estimators,
        observation_and_action_constraint_splitter=(
            observation_and_action_constraint_splitter))
    super(LinearThompsonSamplingAgent, self).__init__(
        time_step_spec=time_step_spec,
        action_spec=policy.action_spec,
        policy=policy,
        collect_policy=policy,
        train_sequence_length=None)
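
For masked actions, the splitter receives the full observation and returns a (context, mask) pair; the same callable must also work on the corresponding TensorSpecs. A hedged sketch of wiring it up, with the (context, mask) tuple layout, the module paths, and the dimensions chosen purely for illustration:

import tensorflow as tf
from tf_agents.bandits.agents import linear_thompson_sampling_agent
from tf_agents.specs import tensor_spec
from tf_agents.trajectories import time_step as ts

context_dim = 4
num_actions = 3

# Observation is a (context, mask) pair; 1 marks an allowed arm, 0 a masked one.
observation_spec = (
    tensor_spec.TensorSpec([context_dim], tf.float32),
    tensor_spec.TensorSpec([num_actions], tf.int32))

def splitter(obs):
  # Works on tensors and on the spec tuple alike.
  return obs[0], obs[1]

agent = linear_thompson_sampling_agent.LinearThompsonSamplingAgent(
    time_step_spec=ts.time_step_spec(observation_spec),
    action_spec=tensor_spec.BoundedTensorSpec(
        shape=(), dtype=tf.int32, minimum=0, maximum=num_actions - 1),
    observation_and_action_constraint_splitter=splitter)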