def __init__(self,
             time_step_spec,
             action_spec,
             gamma=1.0,
             dtype=tf.float32,
             name=None):
  """Initialize an instance of `LinearThompsonSamplingAgent`.

  Args:
    time_step_spec: A `TimeStep` spec describing the expected `TimeStep`s.
    action_spec: A scalar `BoundedTensorSpec` with `int32` or `int64` dtype
      describing the number of actions for this agent.
    gamma: a float forgetting factor in [0.0, 1.0]. When set to 1.0, the
      algorithm does not forget.
    dtype: The type of the parameters stored and updated by the agent. Should
      be one of `tf.float32` and `tf.float64`. Defaults to `tf.float32`.
    name: a name for this instance of `LinearThompsonSamplingAgent`.

  Raises:
    ValueError: if dtype is not one of `tf.float32` or `tf.float64`.
  """
  tf.Module.__init__(self, name=name)
  self._num_actions = bandit_utils.get_num_actions_from_tensor_spec(
      action_spec)
  self._context_dim = int(time_step_spec.observation.shape[0])
  self._gamma = gamma
  if self._gamma < 0.0 or self._gamma > 1.0:
    raise ValueError('Forgetting factor `gamma` must be in [0.0, 1.0].')
  self._weight_covariances = []
  self._parameter_estimators = []
  self._dtype = dtype
  if dtype not in (tf.float32, tf.float64):
    raise ValueError(
        'Agent dtype should be either `tf.float32` or `tf.float64`.')
  for k in range(self._num_actions):
    self._weight_covariances.append(
        tf.compat.v2.Variable(
            tf.eye(self._context_dim, dtype=dtype), name='a_' + str(k)))
    self._parameter_estimators.append(
        tf.compat.v2.Variable(
            tf.zeros(self._context_dim, dtype=dtype), name='b_' + str(k)))
  policy = ts_policy.LinearThompsonSamplingPolicy(action_spec,
                                                  self._weight_covariances,
                                                  self._parameter_estimators)
  super(LinearThompsonSamplingAgent, self).__init__(
      time_step_spec=time_step_spec,
      action_spec=policy.action_spec,
      policy=policy,
      collect_policy=policy,
      train_sequence_length=None)
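# Illustrative usage sketch for the constructor above (not part of the agent
# source). The module aliases below (`tensor_spec`, `ts`, `lin_ts_agent`)
# follow the usual TF-Agents import layout and are assumptions here.
import tensorflow as tf
from tf_agents.bandits.agents import linear_thompson_sampling_agent as lin_ts_agent
from tf_agents.specs import tensor_spec
from tf_agents.trajectories import time_step as ts

observation_spec = tensor_spec.TensorSpec([4], tf.float32)  # 4-dim context.
action_spec = tensor_spec.BoundedTensorSpec(
    shape=(), dtype=tf.int32, minimum=0, maximum=2)  # 3-armed bandit.
agent = lin_ts_agent.LinearThompsonSamplingAgent(
    time_step_spec=ts.time_step_spec(observation_spec),
    action_spec=action_spec,
    gamma=0.95,  # Slowly forget old observations.
    dtype=tf.float32)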
def testActionBatch(self, batch_size, num_actions):
  action_spec = tensor_spec.BoundedTensorSpec(
      shape=(),
      minimum=0,
      maximum=num_actions - 1,
      dtype=tf.int32,
      name='action')
  policy = lin_ts.LinearThompsonSamplingPolicy(
      action_spec,
      self._weight_covariance_matrices(num_actions),
      self._parameter_estimators(num_actions),
      self._time_step_spec)
  action_step = policy.action(self._time_step_batch(batch_size, num_actions))
  self.assertEqual(action_step.action.shape.as_list(), [batch_size])
  self.assertEqual(action_step.action.dtype, tf.int32)
  actions = self.evaluate(action_step.action)
  self.assertAllGreaterEqual(actions, 0)
  self.assertAllLessEqual(actions, num_actions - 1)
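# Conceptual sketch of the per-step Thompson sampling selection the test above
# exercises (plain numpy, under standard linear TS assumptions; not the
# library's exact implementation). For each arm k, the posterior mean is
# inv(A_k) @ b_k with covariance inv(A_k); a weight vector is sampled from
# that posterior and the arm with the largest sampled reward x @ theta_k wins.
import numpy as np


def sample_action(x, weight_covariances, parameter_estimators, rng=None):
  rng = rng or np.random.default_rng()
  sampled_rewards = []
  for a_k, b_k in zip(weight_covariances, parameter_estimators):
    cov = np.linalg.inv(a_k)
    theta_k = rng.multivariate_normal(cov @ b_k, cov)
    sampled_rewards.append(x @ theta_k)
  return int(np.argmax(sampled_rewards))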
def testPredictedRewards(self):
  num_actions = 2
  batch_size = 7
  parameter_estimators = tf.unstack(
      tf.constant([[1, 2], [30, 40]], dtype=tf.float32))
  weight_covariance_matrices = tf.unstack(
      tf.constant([[[1, 0], [0, 1]], [[.5, 0], [0, .5]]], dtype=tf.float32))
  action_spec = tensor_spec.BoundedTensorSpec(
      shape=(),
      minimum=0,
      maximum=num_actions - 1,
      dtype=tf.int32,
      name='action')
  policy = lin_ts.LinearThompsonSamplingPolicy(
      action_spec,
      self._time_step_spec,
      weight_covariance_matrices,
      parameter_estimators,
      emit_policy_info=('predicted_rewards_mean', 'predicted_rewards_sampled'))
  observation = tf.constant(
      [6, 7] * batch_size,
      dtype=tf.float32,
      shape=[batch_size, 2],
      name='observation')
  input_time_step = ts.restart(observation, batch_size=batch_size)
  action_step = policy.action(input_time_step)
  p_info = self.evaluate(action_step.info)
  self.assertEqual(p_info.predicted_rewards_sampled.shape[0], batch_size)
  self.assertEqual(p_info.predicted_rewards_sampled.shape[1], num_actions)

  # Check the predicted reward means.
  expected_means = [[20, 920]] * batch_size
  self.assertAllClose(p_info.predicted_rewards_mean, expected_means)

  # Check that the returned action is the argmax of the sampled rewards.
  expected_actions = np.argmax(p_info.predicted_rewards_sampled, axis=-1)
  self.assertAllEqual(self.evaluate(action_step.action), expected_actions)
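# The `expected_means` above can be reproduced by hand: with theta_k =
# inv(A_k) @ b_k and context x, the predicted mean reward for arm k is
# x @ theta_k. A minimal numpy check of the test's numbers, assuming this is
# the quantity reported in `predicted_rewards_mean`:
import numpy as np

a = np.array([[[1., 0.], [0., 1.]], [[.5, 0.], [0., .5]]])
b = np.array([[1., 2.], [30., 40.]])
x = np.array([6., 7.])
theta = np.stack([np.linalg.solve(a[k], b[k]) for k in range(2)])
print(x @ theta.T)  # -> [ 20. 920.], matching `expected_means`.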
def testMaskedActions(self, batch_size, num_actions):
  action_spec = tensor_spec.BoundedTensorSpec(
      shape=(),
      minimum=0,
      maximum=num_actions - 1,
      dtype=tf.int32,
      name='action')
  obs_spec = (tensor_spec.TensorSpec(self._obs_dim, tf.float32),
              tensor_spec.TensorSpec(num_actions, tf.int32))
  policy = lin_ts.LinearThompsonSamplingPolicy(
      action_spec,
      ts.time_step_spec(obs_spec),
      self._weight_covariance_matrices(num_actions),
      self._parameter_estimators(num_actions),
      observation_and_action_constraint_splitter=lambda x: (x[0], x[1]))
  action_step = policy.action(
      self._time_step_batch_with_action_mask(batch_size, num_actions))
  self.assertEqual(action_step.action.shape.as_list(), [batch_size])
  self.assertEqual(action_step.action.dtype, tf.int32)
  actions = self.evaluate(action_step.action)
  self.assertAllEqual(actions, range(batch_size))
def __init__(self,
             time_step_spec,
             action_spec,
             gamma=1.0,
             observation_and_action_constraint_splitter=None,
             dtype=tf.float32,
             name=None):
  """Initialize an instance of `LinearThompsonSamplingAgent`.

  Args:
    time_step_spec: A `TimeStep` spec describing the expected `TimeStep`s.
    action_spec: A scalar `BoundedTensorSpec` with `int32` or `int64` dtype
      describing the number of actions for this agent.
    gamma: a float forgetting factor in [0.0, 1.0]. When set to 1.0, the
      algorithm does not forget.
    observation_and_action_constraint_splitter: A function used for masking
      valid/invalid actions with each state of the environment. The function
      takes in a full observation and returns a tuple consisting of 1) the
      part of the observation intended as input to the bandit agent and
      policy, and 2) the boolean mask. This function should also work with a
      `TensorSpec` as input, and should output `TensorSpec` objects for the
      observation and mask.
    dtype: The type of the parameters stored and updated by the agent. Should
      be one of `tf.float32` and `tf.float64`. Defaults to `tf.float32`.
    name: a name for this instance of `LinearThompsonSamplingAgent`.

  Raises:
    ValueError: if dtype is not one of `tf.float32` or `tf.float64`.
  """
  tf.Module.__init__(self, name=name)
  self._num_actions = bandit_utils.get_num_actions_from_tensor_spec(
      action_spec)
  self._observation_and_action_constraint_splitter = (
      observation_and_action_constraint_splitter)
  if observation_and_action_constraint_splitter:
    context_shape = observation_and_action_constraint_splitter(
        time_step_spec.observation)[0].shape.as_list()
  else:
    context_shape = time_step_spec.observation.shape.as_list()
  self._context_dim = (
      tf.compat.dimension_value(context_shape[0]) if context_shape else 1)
  self._gamma = gamma
  if self._gamma < 0.0 or self._gamma > 1.0:
    raise ValueError('Forgetting factor `gamma` must be in [0.0, 1.0].')
  self._weight_covariances = []
  self._parameter_estimators = []
  self._dtype = dtype
  if dtype not in (tf.float32, tf.float64):
    raise ValueError(
        'Agent dtype should be either `tf.float32` or `tf.float64`.')
  for k in range(self._num_actions):
    self._weight_covariances.append(
        tf.compat.v2.Variable(
            tf.eye(self._context_dim, dtype=dtype), name='a_' + str(k)))
    self._parameter_estimators.append(
        tf.compat.v2.Variable(
            tf.zeros(self._context_dim, dtype=dtype), name='b_' + str(k)))
  policy = ts_policy.LinearThompsonSamplingPolicy(
      action_spec,
      time_step_spec,
      self._weight_covariances,
      self._parameter_estimators,
      observation_and_action_constraint_splitter=(
          observation_and_action_constraint_splitter))
  super(LinearThompsonSamplingAgent, self).__init__(
      time_step_spec=time_step_spec,
      action_spec=policy.action_spec,
      policy=policy,
      collect_policy=policy,
      train_sequence_length=None)
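# Illustrative sketch of constructing the agent with an action mask (not part
# of the agent source). The observation is assumed to be a (context, mask)
# tuple, and the module aliases follow the usual TF-Agents import layout.
import tensorflow as tf
from tf_agents.bandits.agents import linear_thompson_sampling_agent as lin_ts_agent
from tf_agents.specs import tensor_spec
from tf_agents.trajectories import time_step as ts

num_actions = 5
obs_spec = (tensor_spec.TensorSpec([4], tf.float32),          # Context.
            tensor_spec.TensorSpec([num_actions], tf.int32))  # 0/1 action mask.
action_spec = tensor_spec.BoundedTensorSpec(
    shape=(), dtype=tf.int32, minimum=0, maximum=num_actions - 1)
agent = lin_ts_agent.LinearThompsonSamplingAgent(
    time_step_spec=ts.time_step_spec(obs_spec),
    action_spec=action_spec,
    observation_and_action_constraint_splitter=lambda obs: (obs[0], obs[1]))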