def testActionBatchWithVariablesAndPolicyUpdate(self, batch_size):
    """Checks that two policies agree after `new_policy.update(policy)`.

    Two `LinearUCBPolicy` instances are built from separate per-arm variable
    sets whose initial values differ by `_POLICY_VARIABLES_OFFSET`. Once the
    update op has been evaluated, both policies must produce actions with
    the same shape, dtype and values for equally sized batches.

    Args:
      batch_size: Size of the time step batch fed to both policies.
    """
    cov_vars, cov_vars_new = [], []
    data_vars, data_vars_new = [], []
    count_vars, count_vars_new = [], []

    for arm in range(1, self._num_actions + 1):
        diagonal = 2 * arm + 1
        off_diagonal = arm + 1

        # Variables of the original policy.
        cov_init = tf.constant(
            [[diagonal, off_diagonal], [off_diagonal, diagonal]],
            dtype=tf.float32)
        cov_vars.append(tf.compat.v2.Variable(cov_init))

        data_init = tf.constant([arm, arm], dtype=tf.float32)
        data_vars.append(tf.compat.v2.Variable(data_init))

        count_init = tf.constant([1], dtype=tf.float32)
        count_vars.append(tf.compat.v2.Variable(count_init))

        # Variables of the new policy: same values shifted by an offset.
        cov_vars_new.append(
            tf.compat.v2.Variable(cov_init + _POLICY_VARIABLES_OFFSET))
        data_vars_new.append(
            tf.compat.v2.Variable(data_init + _POLICY_VARIABLES_OFFSET))
        count_vars_new.append(
            tf.compat.v2.Variable(count_init + _POLICY_VARIABLES_OFFSET))

    self.evaluate(tf.compat.v1.global_variables_initializer())

    # Three variables (a, b, num_samples) are expected per arm.
    expected_num_variables = 3 * self._num_actions

    policy = lin_ucb_policy.LinearUCBPolicy(
        self._action_spec, cov_vars, data_vars, count_vars,
        self._time_step_spec)
    self.assertLen(policy.variables(), expected_num_variables)

    new_policy = lin_ucb_policy.LinearUCBPolicy(
        self._action_spec, cov_vars_new, data_vars_new, count_vars_new,
        self._time_step_spec)
    self.assertLen(new_policy.variables(), expected_num_variables)

    self.evaluate(new_policy.update(policy))

    action_step = policy.action(
        self._time_step_batch(batch_size=batch_size))
    new_action_step = new_policy.action(
        self._time_step_batch(batch_size=batch_size))

    old_action = action_step.action
    new_action = new_action_step.action
    self.assertEqual(old_action.shape, new_action.shape)
    self.assertEqual(old_action.dtype, new_action.dtype)

    actions_, new_actions_ = self.evaluate([old_action, new_action])
    self.assertAllEqual(actions_, new_actions_)
def testObservationShapeMismatch(self, batch_size):
    """Checks that an observation with a wrong shape raises `ValueError`.

    The time step carries an observation of shape
    `[batch_size, self._obs_dim + 1]`, i.e. one feature more than the policy
    is built for, and `policy.action` must fail with the expected message.

    Args:
      batch_size: Size of the time step batch fed to the policy.
    """
    policy = lin_ucb_policy.LinearUCBPolicy(
        self._action_spec,
        self._a,
        self._b,
        self._num_samples_per_arm,
        self._time_step_spec)

    current_time_step = ts.TimeStep(
        tf.constant(
            ts.StepType.FIRST,
            dtype=tf.int32,
            shape=[batch_size],
            name='step_type'),
        tf.constant(
            0.0, dtype=tf.float32, shape=[batch_size], name='reward'),
        tf.constant(
            1.0, dtype=tf.float32, shape=[batch_size], name='discount'),
        tf.constant(
            np.array(range(batch_size * (self._obs_dim + 1))),
            dtype=tf.float32,
            shape=[batch_size, self._obs_dim + 1],
            name='observation'))

    # `assertRaisesRegexp` is a deprecated alias that was removed in
    # Python 3.12; `assertRaisesRegex` is the supported spelling.
    with self.assertRaisesRegex(
        ValueError,
        r'Observation shape is expected to be \[None, 2\].'
        r' Got \[%d, 3\].' % batch_size):
        policy.action(current_time_step)
def testBuild(self):
    """Checks that a built policy exposes the time step spec it was given."""
    built_policy = lin_ucb_policy.LinearUCBPolicy(
        self._action_spec,
        self._a,
        self._b,
        self._num_samples_per_arm,
        self._time_step_spec)

    self.assertEqual(built_policy.time_step_spec, self._time_step_spec)
def testActionBatch(self, batch_size):
    """Checks shape, dtype and bounds of actions for a batch of time steps.

    Args:
      batch_size: Size of the time step batch fed to the policy.
    """
    policy = lin_ucb_policy.LinearUCBPolicy(
        self._action_spec,
        self._a,
        self._b,
        self._num_samples_per_arm,
        self._time_step_spec)
    time_steps = self._time_step_batch(batch_size=batch_size)
    action_tensor = policy.action(time_steps).action

    # Static properties of the action tensor.
    self.assertEqual(action_tensor.shape.as_list(), [batch_size])
    self.assertEqual(action_tensor.dtype, tf.int32)

    # Every evaluated action must lie within the bounds of the action spec.
    actions_ = self.evaluate(action_tensor)
    self.assertAllGreaterEqual(actions_, self._action_spec.minimum)
    self.assertAllLessEqual(actions_, self._action_spec.maximum)
def testActionBatchWithMask(self, batch_size):
    """Checks the actions chosen when observations carry an action mask.

    The policy is built with a splitter that separates the first and second
    elements of the observation, and the evaluated actions are expected to
    equal `range(batch_size)`.

    Args:
      batch_size: Size of the time step batch fed to the policy.
    """

    def split_fn(obs):
        # First element is handed on as the observation, second as the mask.
        observation, mask = obs[0], obs[1]
        return observation, mask

    policy = lin_ucb_policy.LinearUCBPolicy(
        self._action_spec,
        self._a,
        self._b,
        self._num_samples_per_arm,
        self._time_step_spec_with_mask,
        observation_and_action_constraint_splitter=split_fn)
    masked_time_steps = self._time_step_batch_with_mask(batch_size=batch_size)
    action_tensor = policy.action(masked_time_steps).action

    self.assertEqual(action_tensor.shape.as_list(), [batch_size])
    self.assertEqual(action_tensor.dtype, tf.int32)

    actions_ = self.evaluate(action_tensor)
    self.assertAllEqual(actions_, range(batch_size))
def testPredictedRewards(self, batch_size):
    """Checks the predicted mean rewards emitted in the policy info.

    The expected value for arm `k` is computed with NumPy as
    `observation . inv(A_k + I) . b_k` and compared with the
    `predicted_rewards_mean` field of the evaluated policy info.

    Args:
      batch_size: Size of the time step batch fed to the policy.
    """
    policy = lin_ucb_policy.LinearUCBPolicy(
        self._action_spec,
        self._a,
        self._b,
        self._num_samples_per_arm,
        self._time_step_spec,
        emit_policy_info=(
            policy_utilities.InfoFields.PREDICTED_REWARDS_MEAN,))
    action_step = policy.action(
        self._time_step_batch(batch_size=batch_size))
    self.assertEqual(action_step.action.shape.as_list(), [batch_size])
    self.assertEqual(action_step.action.dtype, tf.int32)

    observation_numpy = np.array(
        range(batch_size * self._obs_dim), dtype=np.float32).reshape(
            [batch_size, self._obs_dim])

    # Only the estimated mean reward is asserted below, so the confidence
    # intervals and upper confidence bounds are intentionally not computed.
    predicted_rewards_expected = []
    for k in range(self._num_actions):
        a_inv = np.linalg.inv(self._a_numpy[k] + np.eye(self._obs_dim))
        theta = np.matmul(
            a_inv, self._b_numpy[k].reshape([self._obs_dim, 1]))
        predicted_rewards_expected.append(
            np.matmul(observation_numpy, theta))

    # Each per-arm estimate is [batch_size, 1]; stack them along a new last
    # axis and flatten to [batch_size, num_actions].
    predicted_rewards_expected_array = np.stack(
        predicted_rewards_expected, axis=-1).reshape(
            batch_size, self._num_actions)

    p_info = self.evaluate(action_step.info)
    self.assertAllClose(p_info.predicted_rewards_mean,
                        predicted_rewards_expected_array)
def testComparisonWithNumpy(self, batch_size, use_decomposition=False):
    """Compares the policy's actions with a NumPy upper-confidence-bound argmax.

    Args:
      batch_size: Size of the time step batch fed to the policy.
      use_decomposition: When true, the eigendecomposition of `self._a[0]` is
        computed and passed to the policy for every arm; otherwise empty
        tuples are passed.
    """
    eig_matrix_list = ()
    eig_vals_list = ()
    if use_decomposition:
        eig_vals_one_arm, eig_matrix_one_arm = tf.linalg.eigh(self._a[0])
        eig_vals_list = [eig_vals_one_arm] * self._num_actions
        eig_matrix_list = [eig_matrix_one_arm] * self._num_actions

    policy = lin_ucb_policy.LinearUCBPolicy(
        self._action_spec,
        self._a,
        self._b,
        self._num_samples_per_arm,
        self._time_step_spec,
        eig_vals=eig_vals_list,
        eig_matrix=eig_matrix_list)
    action_tensor = policy.action(
        self._time_step_batch(batch_size=batch_size)).action
    self.assertEqual(action_tensor.shape.as_list(), [batch_size])
    self.assertEqual(action_tensor.dtype, tf.int32)
    actions_ = self.evaluate(action_tensor)

    # Reference computation in NumPy.
    observation_numpy = np.array(
        range(batch_size * self._obs_dim), dtype=np.float32)
    observation_numpy = observation_numpy.reshape(
        [batch_size, self._obs_dim])

    p_values = []
    for arm in range(self._num_actions):
        a_inv = np.linalg.inv(self._a_numpy[arm] + np.eye(self._obs_dim))
        b_column = self._b_numpy[arm].reshape([self._obs_dim, 1])
        theta = np.matmul(a_inv, b_column)

        # Diagonal of x . inv(A + I) . x^T, one entry per batch row.
        quadratic_form = np.matmul(
            observation_numpy,
            np.matmul(a_inv, np.transpose(observation_numpy)))
        confidence_intervals = np.sqrt(np.diag(quadratic_form))

        mean_reward = np.matmul(observation_numpy, theta)
        p_values.append(
            mean_reward + self._alpha * confidence_intervals.reshape([-1, 1]))

    stacked_p_values = np.stack(p_values, axis=-1)
    actions_numpy = np.argmax(stacked_p_values, axis=-1).reshape([batch_size])
    self.assertAllEqual(actions_.reshape([batch_size]), actions_numpy)
def testActionBatchWithBias(self, batch_size):
    """Checks shape, dtype and bounds of actions when `add_bias=True`.

    The per-arm parameters are three-dimensional here: every arm shares one
    3x3 matrix, and arm `r` gets the vector `[r, r, r]`.

    Args:
      batch_size: Size of the time step batch fed to the policy.
    """
    shared_matrix = tf.constant(
        [[4, 1, 2], [1, 5, 3], [2, 3, 6]], dtype=tf.float32)
    a = [shared_matrix] * self._num_actions
    b = []
    for r in range(self._num_actions):
        b.append(tf.constant([r] * 3, dtype=tf.float32))

    policy = lin_ucb_policy.LinearUCBPolicy(
        self._action_spec,
        a,
        b,
        self._num_samples_per_arm,
        self._time_step_spec,
        add_bias=True)
    time_steps = self._time_step_batch(batch_size=batch_size)
    action_tensor = policy.action(time_steps).action

    self.assertEqual(action_tensor.shape.as_list(), [batch_size])
    self.assertEqual(action_tensor.dtype, tf.int32)

    # Every evaluated action must lie within the bounds of the action spec.
    actions_ = self.evaluate(action_tensor)
    self.assertAllGreaterEqual(actions_, self._action_spec.minimum)
    self.assertAllLessEqual(actions_, self._action_spec.maximum)
def __init__(self,
             time_step_spec,
             action_spec,
             alpha=1.0,
             gamma=1.0,
             use_eigendecomp=False,
             tikhonov_weight=1.0,
             emit_log_probability=False,
             dtype=tf.float32,
             name=None):
    """Creates a `LinearUCBAgent` together with its per-arm variables.

    For every arm the agent owns a matrix `a_k`, a vector `b_k`, a sample
    counter and eigendecomposition variables. A `LinearUCBPolicy` built on
    those variables serves as both the policy and the collect policy.

    Args:
      time_step_spec: A `TimeStep` spec describing the expected `TimeStep`s.
      action_spec: A scalar `BoundedTensorSpec` with `int32` or `int64` dtype
        describing the number of actions for this agent.
      alpha: (float) positive scalar; the exploration parameter that
        multiplies the confidence intervals.
      gamma: (float) forgetting factor in [0.0, 1.0]. With 1.0 the algorithm
        does not forget.
      use_eigendecomp: Whether to use eigen-decomposition. The default solver
        is Conjugate Gradient.
      tikhonov_weight: (float) Tikhonov regularization term.
      emit_log_probability: Whether the LinearUCBPolicy emits
        log-probabilities. Since the policy is deterministic, the probability
        is just 1.
      dtype: Type of the parameters stored and updated by the agent; one of
        `tf.float32` and `tf.float64`. Defaults to `tf.float32`.
      name: A name for this instance of `LinearUCBAgent`.

    Raises:
      ValueError: If `gamma` lies outside [0.0, 1.0], or if `dtype` is not
        one of `tf.float32` or `tf.float64`.
    """
    tf.Module.__init__(self, name=name)
    self._num_actions = bandit_utils.get_num_actions_from_tensor_spec(
        action_spec)
    observation_shape = time_step_spec.observation.shape
    self._context_dim = int(observation_shape[0])
    self._alpha = alpha

    # Per-arm variable containers, filled in the loop below.
    self._cov_matrix_list = []
    self._data_vector_list = []
    self._eig_matrix_list = []
    self._eig_vals_list = []
    # Number of samples seen so far, tracked separately for each arm.
    self._num_samples_list = []

    self._gamma = gamma
    if self._gamma < 0.0 or self._gamma > 1.0:
        raise ValueError(
            'Forgetting factor `gamma` must be in [0.0, 1.0].')

    self._dtype = dtype
    supported_dtypes = (tf.float32, tf.float64)
    if dtype not in supported_dtypes:
        raise ValueError(
            'Agent dtype should be either `tf.float32 or `tf.float64`.')

    self._use_eigendecomp = use_eigendecomp
    self._tikhonov_weight = tikhonov_weight

    for k in range(self._num_actions):
        suffix = str(k)

        cov_matrix = tf.compat.v2.Variable(
            tf.eye(self._context_dim, dtype=dtype), name='a_' + suffix)
        self._cov_matrix_list.append(cov_matrix)

        data_vector = tf.compat.v2.Variable(
            tf.zeros(self._context_dim, dtype=dtype), name='b_' + suffix)
        self._data_vector_list.append(data_vector)

        num_samples = tf.compat.v2.Variable(
            tf.zeros([], dtype=dtype), name='num_samples_' + suffix)
        self._num_samples_list.append(num_samples)

        # Without eigendecomposition the eig variables are created empty.
        eig_matrix_init = (
            tf.eye(self._context_dim, dtype=dtype)
            if self._use_eigendecomp else tf.constant([], dtype=dtype))
        self._eig_matrix_list.append(
            tf.compat.v2.Variable(
                eig_matrix_init, name='eig_matrix' + suffix))

        eig_vals_init = (
            tf.ones([self._context_dim], dtype=dtype)
            if self._use_eigendecomp else tf.constant([], dtype=dtype))
        self._eig_vals_list.append(
            tf.compat.v2.Variable(eig_vals_init, name='eig_vals' + suffix))

    # The policy only receives the eig variables when they are in use.
    if self._use_eigendecomp:
        policy_eig_vals = self._eig_vals_list
        policy_eig_matrix = self._eig_matrix_list
    else:
        policy_eig_vals = ()
        policy_eig_matrix = ()

    policy = lin_ucb_policy.LinearUCBPolicy(
        action_spec=action_spec,
        cov_matrix=self._cov_matrix_list,
        data_vector=self._data_vector_list,
        num_samples=self._num_samples_list,
        time_step_spec=time_step_spec,
        alpha=alpha,
        eig_vals=policy_eig_vals,
        eig_matrix=policy_eig_matrix,
        tikhonov_weight=self._tikhonov_weight,
        emit_log_probability=emit_log_probability)

    super(LinearUCBAgent, self).__init__(
        time_step_spec=time_step_spec,
        action_spec=action_spec,
        policy=policy,
        collect_policy=policy,
        train_sequence_length=None)
def __init__(self,
             time_step_spec,
             action_spec,
             alpha=1.0,
             gamma=1.0,
             use_eigendecomp=False,
             tikhonov_weight=1.0,
             add_bias=False,
             emit_policy_info=(),
             emit_log_probability=False,
             observation_and_action_constraint_splitter=None,
             debug_summaries=False,
             summarize_grads_and_vars=False,
             enable_summaries=True,
             dtype=tf.float32,
             name=None):
    """Creates a `LinearUCBAgent` together with its per-arm variables.

    For every arm the agent owns a matrix `a_k`, a vector `b_k`, a sample
    counter and eigendecomposition variables. A `LinearUCBPolicy` built on
    those variables serves as both the policy and the collect policy.

    Args:
      time_step_spec: A `TimeStep` spec describing the expected `TimeStep`s.
      action_spec: A scalar `BoundedTensorSpec` with `int32` or `int64` dtype
        describing the number of actions for this agent.
      alpha: (float) positive scalar; the exploration parameter that
        multiplies the confidence intervals.
      gamma: (float) forgetting factor in [0.0, 1.0]. With 1.0 the algorithm
        does not forget.
      use_eigendecomp: Whether to use eigen-decomposition. The default solver
        is Conjugate Gradient.
      tikhonov_weight: (float) Tikhonov regularization term.
      add_bias: If true, a bias term is added to the linear reward
        estimation.
      emit_policy_info: (tuple of strings) the side information to return as
        part of the policy info. Allowed values can be found in
        `policy_utilities.PolicyInfo`.
      emit_log_probability: Whether the LinearUCBPolicy emits
        log-probabilities. Since the policy is deterministic, the probability
        is just 1.
      observation_and_action_constraint_splitter: A function used for masking
        valid/invalid actions with each state of the environment. It takes a
        full observation and returns a tuple of 1) the part of the
        observation intended as input to the bandit agent and policy, and
        2) the boolean mask. It should also accept a `TensorSpec` and then
        return `TensorSpec` objects for the observation and mask.
      debug_summaries: A Python bool, default False. When True, debug
        summaries are gathered.
      summarize_grads_and_vars: A Python bool, default False. When True,
        gradients and network variable summaries are written during training.
      enable_summaries: A Python bool, default True. When False, all summaries
        (debug or otherwise) should not be written.
      dtype: Type of the parameters stored and updated by the agent; one of
        `tf.float32` and `tf.float64`. Defaults to `tf.float32`.
      name: A name for this instance of `LinearUCBAgent`.

    Raises:
      ValueError: If `gamma` lies outside [0.0, 1.0], or if `dtype` is not
        one of `tf.float32` or `tf.float64`.
    """
    tf.Module.__init__(self, name=name)
    self._num_actions = bandit_utils.get_num_actions_from_tensor_spec(
        action_spec)

    # With a splitter, the context is the first element of the split spec.
    if observation_and_action_constraint_splitter is None:
        context_shape = time_step_spec.observation.shape.as_list()
    else:
        context_spec = observation_and_action_constraint_splitter(
            time_step_spec.observation)[0]
        context_shape = context_spec.shape.as_list()

    self._add_bias = add_bias
    if context_shape:
        self._context_dim = tf.compat.dimension_value(context_shape[0])
    else:
        # An empty shape list is treated as a one-dimensional context.
        self._context_dim = 1
    if self._add_bias:
        # The bias is added via a constant 1 feature.
        self._context_dim += 1
    self._alpha = alpha

    # Per-arm variable containers, filled in the loop below.
    self._cov_matrix_list = []
    self._data_vector_list = []
    self._eig_matrix_list = []
    self._eig_vals_list = []
    # Number of samples seen so far, tracked separately for each arm.
    self._num_samples_list = []

    self._gamma = gamma
    if self._gamma < 0.0 or self._gamma > 1.0:
        raise ValueError(
            'Forgetting factor `gamma` must be in [0.0, 1.0].')

    self._dtype = dtype
    supported_dtypes = (tf.float32, tf.float64)
    if dtype not in supported_dtypes:
        raise ValueError(
            'Agent dtype should be either `tf.float32 or `tf.float64`.')

    self._use_eigendecomp = use_eigendecomp
    self._tikhonov_weight = tikhonov_weight
    self._observation_and_action_constraint_splitter = (
        observation_and_action_constraint_splitter)

    for k in range(self._num_actions):
        suffix = str(k)

        cov_matrix = tf.compat.v2.Variable(
            tf.eye(self._context_dim, dtype=dtype), name='a_' + suffix)
        self._cov_matrix_list.append(cov_matrix)

        data_vector = tf.compat.v2.Variable(
            tf.zeros(self._context_dim, dtype=dtype), name='b_' + suffix)
        self._data_vector_list.append(data_vector)

        num_samples = tf.compat.v2.Variable(
            tf.zeros([], dtype=dtype), name='num_samples_' + suffix)
        self._num_samples_list.append(num_samples)

        # Without eigendecomposition the eig variables are created empty.
        eig_matrix_init = (
            tf.eye(self._context_dim, dtype=dtype)
            if self._use_eigendecomp else tf.constant([], dtype=dtype))
        self._eig_matrix_list.append(
            tf.compat.v2.Variable(
                eig_matrix_init, name='eig_matrix' + suffix))

        eig_vals_init = (
            tf.ones([self._context_dim], dtype=dtype)
            if self._use_eigendecomp else tf.constant([], dtype=dtype))
        self._eig_vals_list.append(
            tf.compat.v2.Variable(eig_vals_init, name='eig_vals' + suffix))

    # The policy only receives the eig variables when they are in use.
    if self._use_eigendecomp:
        policy_eig_vals = self._eig_vals_list
        policy_eig_matrix = self._eig_matrix_list
    else:
        policy_eig_vals = ()
        policy_eig_matrix = ()

    policy = lin_ucb_policy.LinearUCBPolicy(
        action_spec=action_spec,
        cov_matrix=self._cov_matrix_list,
        data_vector=self._data_vector_list,
        num_samples=self._num_samples_list,
        time_step_spec=time_step_spec,
        alpha=alpha,
        eig_vals=policy_eig_vals,
        eig_matrix=policy_eig_matrix,
        tikhonov_weight=self._tikhonov_weight,
        add_bias=add_bias,
        emit_policy_info=emit_policy_info,
        emit_log_probability=emit_log_probability,
        observation_and_action_constraint_splitter=(
            observation_and_action_constraint_splitter))

    super(LinearUCBAgent, self).__init__(
        time_step_spec=time_step_spec,
        action_spec=action_spec,
        policy=policy,
        collect_policy=policy,
        debug_summaries=debug_summaries,
        summarize_grads_and_vars=summarize_grads_and_vars,
        enable_summaries=enable_summaries,
        train_sequence_length=None)