def testActionBatchWithMask(self, batch_size, actions_from_reward_layer):
  obs_spec = (tensor_spec.TensorSpec([self._obs_dim], tf.float32),
              tensor_spec.TensorSpec([self._num_actions], tf.int32))
  time_step_spec = ts.time_step_spec(obs_spec)
  policy = neural_linucb_policy.NeuralLinUCBPolicy(
      DummyNet(obs_spec[0]),
      self._encoding_dim,
      get_reward_layer(),
      actions_from_reward_layer=tf.constant(
          actions_from_reward_layer, dtype=tf.bool),
      cov_matrix=self._a,
      data_vector=self._b,
      num_samples=self._num_samples_per_arm,
      epsilon_greedy=0.5,
      time_step_spec=time_step_spec,
      observation_and_action_constraint_splitter=lambda x: (x[0], x[1]))

  action_fn = common.function_in_tf1()(policy.action)
  action_step = action_fn(
      self._time_step_batch_with_mask(batch_size=batch_size))
  self.assertEqual(action_step.action.shape.as_list(), [batch_size])
  self.assertEqual(action_step.action.dtype, tf.int32)
  self.evaluate(tf.compat.v1.global_variables_initializer())
  actions = self.evaluate(action_step.action)
  self.assertAllEqual(actions, range(batch_size))
def testObservationShapeMismatch(self, batch_size, actions_from_reward_layer):
  policy = neural_linucb_policy.NeuralLinUCBPolicy(
      DummyNet(),
      self._encoding_dim,
      get_reward_layer(),
      actions_from_reward_layer=actions_from_reward_layer,
      cov_matrix=self._a,
      data_vector=self._b,
      num_samples=self._num_samples_per_arm,
      epsilon_greedy=0.0,
      time_step_spec=self._time_step_spec)

  current_time_step = ts.TimeStep(
      tf.constant(
          ts.StepType.FIRST, dtype=tf.int32, shape=[batch_size],
          name='step_type'),
      tf.constant(0.0, dtype=tf.float32, shape=[batch_size], name='reward'),
      tf.constant(1.0, dtype=tf.float32, shape=[batch_size], name='discount'),
      tf.constant(np.array(range(batch_size * (self._obs_dim + 1))),
                  dtype=tf.float32,
                  shape=[batch_size, self._obs_dim + 1],
                  name='observation'))
  with self.assertRaisesRegexp(
      ValueError,
      r'Observation shape is expected to be \[None, 2\].'
      r' Got \[%d, 3\].' % batch_size):
    policy.action(current_time_step)
def testObservationShapeMismatch(self, batch_size, actions_from_reward_layer):
  policy = neural_linucb_policy.NeuralLinUCBPolicy(
      DummyNet(self._obs_spec),
      self._encoding_dim,
      get_reward_layer(),
      actions_from_reward_layer=actions_from_reward_layer,
      cov_matrix=self._a,
      data_vector=self._b,
      num_samples=self._num_samples_per_arm,
      epsilon_greedy=0.0,
      time_step_spec=self._time_step_spec)

  current_time_step = ts.TimeStep(
      tf.constant(
          ts.StepType.FIRST, dtype=tf.int32, shape=[batch_size],
          name='step_type'),
      tf.constant(0.0, dtype=tf.float32, shape=[batch_size], name='reward'),
      tf.constant(1.0, dtype=tf.float32, shape=[batch_size], name='discount'),
      tf.constant(np.array(range(batch_size * (self._obs_dim + 1))),
                  dtype=tf.float32,
                  shape=[batch_size, self._obs_dim + 1],
                  name='observation'))
  if tf.executing_eagerly():
    error_type = tf.errors.InvalidArgumentError
    regexp = r'Matrix size-incompatible: In\[0\]: \[%d,3\]' % batch_size
  else:
    error_type = ValueError
    regexp = r'with shape \[%d, 3\]' % batch_size
  with self.assertRaisesRegex(error_type, regexp):
    policy.action(current_time_step)
def testPerArmObservation(self, batch_size, actions_from_reward_layer):
  global_obs_dim = 7
  arm_obs_dim = 3
  obs_spec = bandit_spec_utils.create_per_arm_observation_spec(
      global_obs_dim, arm_obs_dim, self._num_actions)
  time_step_spec = ts.time_step_spec(obs_spec)
  dummy_net = arm_network.create_feed_forward_common_tower_network(
      obs_spec,
      global_layers=(3, 4, 5),
      arm_layers=(3, 2),
      common_layers=(4, 3),
      output_dim=self._encoding_dim)
  reward_layer = get_per_arm_reward_layer(encoding_dim=self._encoding_dim)

  policy = neural_linucb_policy.NeuralLinUCBPolicy(
      dummy_net,
      self._encoding_dim,
      reward_layer,
      actions_from_reward_layer=tf.constant(
          actions_from_reward_layer, dtype=tf.bool),
      cov_matrix=self._a[0:1],
      data_vector=self._b[0:1],
      num_samples=self._num_samples_per_arm[0:1],
      epsilon_greedy=0.0,
      time_step_spec=time_step_spec,
      accepts_per_arm_features=True,
      emit_policy_info=('predicted_rewards_mean',))

  current_time_step = self._per_arm_time_step_batch(
      batch_size=batch_size,
      global_obs_dim=global_obs_dim,
      arm_obs_dim=arm_obs_dim)
  action_step = policy.action(current_time_step)
  self.assertEqual(action_step.action.dtype, tf.int32)
  self.evaluate(tf.compat.v1.global_variables_initializer())
  action_fn = common.function_in_tf1()(policy.action)
  action_step = action_fn(current_time_step)

  input_observation = current_time_step.observation
  encoded_observation, _ = dummy_net(input_observation)

  if actions_from_reward_layer:
    predicted_rewards_from_reward_layer = reward_layer(encoded_observation)
    predicted_rewards_expected = self.evaluate(
        predicted_rewards_from_reward_layer).reshape((-1, self._num_actions))
  else:
    observation_numpy = self.evaluate(encoded_observation)
    predicted_rewards_expected = (
        self._get_predicted_rewards_from_per_arm_linucb(
            observation_numpy, batch_size))

  p_info = self.evaluate(action_step.info)
  self.assertEqual(p_info.predicted_rewards_mean.dtype, np.float32)
  self.assertAllClose(p_info.predicted_rewards_mean,
                      predicted_rewards_expected)
def testBuild(self, batch_size, actions_from_reward_layer):
  policy = neural_linucb_policy.NeuralLinUCBPolicy(
      DummyNet(),
      self._encoding_dim,
      get_reward_layer(),
      actions_from_reward_layer=actions_from_reward_layer,
      cov_matrix=self._a,
      data_vector=self._b,
      num_samples=self._num_samples_per_arm,
      epsilon_greedy=0.0,
      time_step_spec=self._time_step_spec)

  self.assertEqual(policy.time_step_spec, self._time_step_spec)
def testPredictedRewards(self, batch_size, actions_from_reward_layer):
  dummy_net = DummyNet(self._obs_spec)
  reward_layer = get_reward_layer()
  policy = neural_linucb_policy.NeuralLinUCBPolicy(
      dummy_net,
      self._encoding_dim,
      reward_layer,
      actions_from_reward_layer=tf.constant(
          actions_from_reward_layer, dtype=tf.bool),
      cov_matrix=self._a,
      data_vector=self._b,
      num_samples=self._num_samples_per_arm,
      epsilon_greedy=0.0,
      time_step_spec=self._time_step_spec,
      emit_policy_info=('predicted_rewards_mean',))

  current_time_step = self._time_step_batch(batch_size=batch_size)
  action_step = policy.action(current_time_step)
  self.assertEqual(action_step.action.dtype, tf.int32)
  self.evaluate(tf.compat.v1.global_variables_initializer())
  action_fn = common.function_in_tf1()(policy.action)
  action_step = action_fn(current_time_step)

  input_observation = current_time_step.observation
  encoded_observation, _ = dummy_net(input_observation)
  predicted_rewards_from_reward_layer = reward_layer(encoded_observation)

  if actions_from_reward_layer:
    predicted_rewards_expected = self.evaluate(
        predicted_rewards_from_reward_layer)
  else:
    observation_numpy = self.evaluate(encoded_observation)
    predicted_rewards_expected = self._get_predicted_rewards_from_linucb(
        observation_numpy, batch_size)

  p_info = self.evaluate(action_step.info)
  self.assertEqual(p_info.predicted_rewards_mean.dtype, np.float32)
  self.assertAllClose(p_info.predicted_rewards_mean,
                      predicted_rewards_expected)
def testActionBatch(self, batch_size, actions_from_reward_layer):
  policy = neural_linucb_policy.NeuralLinUCBPolicy(
      DummyNet(self._obs_spec),
      self._encoding_dim,
      get_reward_layer(),
      actions_from_reward_layer=tf.constant(
          actions_from_reward_layer, dtype=tf.bool),
      cov_matrix=self._a,
      data_vector=self._b,
      num_samples=self._num_samples_per_arm,
      epsilon_greedy=0.0,
      time_step_spec=self._time_step_spec)

  action_step = policy.action(self._time_step_batch(batch_size=batch_size))
  self.assertEqual(action_step.action.dtype, tf.int32)
  self.evaluate(tf.compat.v1.global_variables_initializer())
  action_fn = common.function_in_tf1()(policy.action)
  action_step = action_fn(self._time_step_batch(batch_size=batch_size))
  actions_ = self.evaluate(action_step.action)
  self.assertAllGreaterEqual(actions_, self._action_spec.minimum)
  self.assertAllLessEqual(actions_, self._action_spec.maximum)
def __init__(
    self,
    time_step_spec,
    action_spec,
    encoding_network,
    encoding_network_num_train_steps,
    encoding_dim,
    optimizer,
    variable_collection=None,
    alpha=1.0,
    gamma=1.0,
    epsilon_greedy=0.0,
    observation_and_action_constraint_splitter=None,
    # Params for training.
    error_loss_fn=tf.compat.v1.losses.mean_squared_error,
    gradient_clipping=None,
    # Params for debugging.
    debug_summaries=False,
    summarize_grads_and_vars=False,
    train_step_counter=None,
    emit_policy_info=(),
    emit_log_probability=False,
    dtype=tf.float64,
    name=None):
  """Initialize an instance of `NeuralLinUCBAgent`.

  Args:
    time_step_spec: A `TimeStep` spec describing the expected `TimeStep`s.
    action_spec: A scalar `BoundedTensorSpec` with `int32` or `int64` dtype
      describing the number of actions for this agent.
    encoding_network: a Keras network that encodes the observations.
    encoding_network_num_train_steps: how many training steps to run for
      training the encoding network before switching to LinUCB. If negative,
      the encoding network is assumed to be already trained.
    encoding_dim: the dimension of encoded observations.
    optimizer: The optimizer to use for training.
    variable_collection: Instance of `NeuralLinUCBVariableCollection`.
      Collection of variables to be updated by the agent. If `None`, a new
      instance of `NeuralLinUCBVariableCollection` will be created. Note that
      this collection excludes the variables owned by the encoding network.
    alpha: (float) positive scalar. This is the exploration parameter that
      multiplies the confidence intervals.
    gamma: a float forgetting factor in [0.0, 1.0]. When set to 1.0, the
      algorithm does not forget.
    epsilon_greedy: A float representing the probability of choosing a random
      action instead of the greedy action.
    observation_and_action_constraint_splitter: A function used for masking
      valid/invalid actions with each state of the environment. The function
      takes in a full observation and returns a tuple consisting of 1) the
      part of the observation intended as input to the bandit agent and
      policy, and 2) the boolean mask. This function should also work with a
      `TensorSpec` as input, and should output `TensorSpec` objects for the
      observation and mask.
    error_loss_fn: A function for computing the error loss, taking parameters
      labels, predictions, and weights (any function from tf.losses would
      work). The default is `tf.losses.mean_squared_error`.
    gradient_clipping: A float representing the norm length to clip gradients
      (or None for no clipping.)
    debug_summaries: A Python bool, default False. When True, debug summaries
      are gathered.
    summarize_grads_and_vars: A Python bool, default False. When True,
      gradients and network variable summaries are written during training.
    train_step_counter: An optional `tf.Variable` to increment every time the
      train op is run. Defaults to the `global_step`.
    emit_policy_info: (tuple of strings) what side information we want to get
      as part of the policy info. Allowed values can be found in
      `policy_utilities.PolicyInfo`.
    emit_log_probability: Whether the NeuralLinUCBPolicy emits
      log-probabilities or not. Since the policy is deterministic, the
      probability is just 1.
    dtype: The type of the parameters stored and updated by the agent. Should
      be one of `tf.float32` and `tf.float64`. Defaults to `tf.float64`.
    name: a name for this instance of `NeuralLinUCBAgent`.

  Raises:
    TypeError if variable_collection is not an instance of
      `NeuralLinUCBVariableCollection`.
    ValueError if dtype is not one of `tf.float32` or `tf.float64`.
  """
  tf.Module.__init__(self, name=name)
  common.tf_agents_gauge.get_cell('TFABandit').set(True)
  self._num_actions = bandit_utils.get_num_actions_from_tensor_spec(
      action_spec)
  self._observation_and_action_constraint_splitter = (
      observation_and_action_constraint_splitter)
  if observation_and_action_constraint_splitter is not None:
    context_shape = observation_and_action_constraint_splitter(
        time_step_spec.observation)[0].shape.as_list()
  else:
    context_shape = time_step_spec.observation.shape.as_list()
  self._context_dim = (
      tf.compat.dimension_value(context_shape[0]) if context_shape else 1)
  self._alpha = alpha
  if variable_collection is None:
    variable_collection = NeuralLinUCBVariableCollection(
        self._num_actions, encoding_dim, dtype)
  elif not isinstance(variable_collection, NeuralLinUCBVariableCollection):
    raise TypeError('Parameter `variable_collection` should be '
                    'of type `NeuralLinUCBVariableCollection`.')
  self._variable_collection = variable_collection
  self._gamma = gamma
  if self._gamma < 0.0 or self._gamma > 1.0:
    raise ValueError('Forgetting factor `gamma` must be in [0.0, 1.0].')
  self._dtype = dtype
  if dtype not in (tf.float32, tf.float64):
    raise ValueError(
        'Agent dtype should be either `tf.float32` or `tf.float64`.')
  self._epsilon_greedy = epsilon_greedy

  reward_layer = tf.keras.layers.Dense(
      self._num_actions,
      kernel_initializer=tf.compat.v1.initializers.random_uniform(
          minval=-0.03, maxval=0.03),
      use_bias=False,
      activation=None,
      name='reward_layer')

  self._encoding_network = encoding_network
  self._reward_layer = reward_layer
  self._encoding_network_num_train_steps = encoding_network_num_train_steps
  self._encoding_dim = encoding_dim
  self._optimizer = optimizer
  self._error_loss_fn = error_loss_fn
  self._gradient_clipping = gradient_clipping
  train_step_counter = tf.compat.v1.train.get_or_create_global_step()

  policy = neural_linucb_policy.NeuralLinUCBPolicy(
      encoding_network=self._encoding_network,
      encoding_dim=self._encoding_dim,
      reward_layer=self._reward_layer,
      epsilon_greedy=self._epsilon_greedy,
      actions_from_reward_layer=self.actions_from_reward_layer,
      cov_matrix=self.cov_matrix,
      data_vector=self.data_vector,
      num_samples=self.num_samples,
      time_step_spec=time_step_spec,
      alpha=alpha,
      emit_policy_info=emit_policy_info,
      emit_log_probability=emit_log_probability,
      observation_and_action_constraint_splitter=(
          observation_and_action_constraint_splitter))

  super(NeuralLinUCBAgent, self).__init__(
      time_step_spec=time_step_spec,
      action_spec=policy.action_spec,
      policy=policy,
      collect_policy=policy,
      train_sequence_length=None,
      debug_summaries=debug_summaries,
      summarize_grads_and_vars=summarize_grads_and_vars,
      train_step_counter=train_step_counter)
def testActionBatchWithVariablesAndPolicyUpdate(self, batch_size,
                                                actions_from_reward_layer):
  a_list = []
  a_new_list = []
  b_list = []
  b_new_list = []
  num_samples_list = []
  num_samples_new_list = []
  for k in range(1, self._num_actions + 1):
    a_initial_value = k + 1 + 2 * k * tf.eye(
        self._encoding_dim, dtype=tf.float32)
    a_for_one_arm = tf.compat.v2.Variable(a_initial_value)
    a_list.append(a_for_one_arm)
    b_initial_value = tf.constant(
        k * np.ones(self._encoding_dim), dtype=tf.float32)
    b_for_one_arm = tf.compat.v2.Variable(b_initial_value)
    b_list.append(b_for_one_arm)
    num_samples_initial_value = tf.constant([1], dtype=tf.float32)
    num_samples_for_one_arm = tf.compat.v2.Variable(num_samples_initial_value)
    num_samples_list.append(num_samples_for_one_arm)

    # Variables for the new policy (they differ by an offset).
    a_new_for_one_arm = tf.compat.v2.Variable(
        a_initial_value + _POLICY_VARIABLES_OFFSET)
    a_new_list.append(a_new_for_one_arm)
    b_new_for_one_arm = tf.compat.v2.Variable(
        b_initial_value + _POLICY_VARIABLES_OFFSET)
    b_new_list.append(b_new_for_one_arm)
    num_samples_for_one_arm_new = tf.compat.v2.Variable(
        num_samples_initial_value + _POLICY_VARIABLES_OFFSET)
    num_samples_new_list.append(num_samples_for_one_arm_new)

  policy = neural_linucb_policy.NeuralLinUCBPolicy(
      encoding_network=DummyNet(),
      encoding_dim=self._encoding_dim,
      reward_layer=get_reward_layer(),
      actions_from_reward_layer=tf.constant(
          actions_from_reward_layer, dtype=tf.bool),
      cov_matrix=a_list,
      data_vector=b_list,
      num_samples=num_samples_list,
      epsilon_greedy=0.0,
      time_step_spec=self._time_step_spec)
  new_policy = neural_linucb_policy.NeuralLinUCBPolicy(
      encoding_network=DummyNet(),
      encoding_dim=self._encoding_dim,
      reward_layer=get_reward_layer(),
      actions_from_reward_layer=tf.constant(
          actions_from_reward_layer, dtype=tf.bool),
      cov_matrix=a_new_list,
      data_vector=b_new_list,
      num_samples=num_samples_new_list,
      epsilon_greedy=0.0,
      time_step_spec=self._time_step_spec)

  action_step = policy.action(self._time_step_batch(batch_size=batch_size))
  new_action_step = new_policy.action(
      self._time_step_batch(batch_size=batch_size))
  self.assertEqual(action_step.action.shape, new_action_step.action.shape)
  self.assertEqual(action_step.action.dtype, new_action_step.action.dtype)

  self.evaluate(tf.compat.v1.global_variables_initializer())
  self.evaluate(new_policy.update(policy))

  action_fn = common.function_in_tf1()(policy.action)
  action_step = action_fn(self._time_step_batch(batch_size=batch_size))
  new_action_fn = common.function_in_tf1()(new_policy.action)
  new_action_step = new_action_fn(
      self._time_step_batch(batch_size=batch_size))
  actions_, new_actions_ = self.evaluate(
      [action_step.action, new_action_step.action])
  self.assertAllEqual(actions_, new_actions_)
def __init__(
    self,
    time_step_spec,
    action_spec,
    encoding_network,
    encoding_network_num_train_steps,
    encoding_dim,
    optimizer,
    alpha=1.0,
    gamma=1.0,
    epsilon_greedy=0.0,
    # Params for training.
    error_loss_fn=tf.compat.v1.losses.mean_squared_error,
    gradient_clipping=None,
    # Params for debugging.
    debug_summaries=False,
    summarize_grads_and_vars=False,
    train_step_counter=None,
    emit_log_probability=False,
    dtype=tf.float64,
    name=None):
  """Initialize an instance of `NeuralLinUCBAgent`.

  Args:
    time_step_spec: A `TimeStep` spec describing the expected `TimeStep`s.
    action_spec: A scalar `BoundedTensorSpec` with `int32` or `int64` dtype
      describing the number of actions for this agent.
    encoding_network: a Keras network that encodes the observations.
    encoding_network_num_train_steps: how many training steps to run for
      training the encoding network before switching to LinUCB. If negative,
      the encoding network is assumed to be already trained.
    encoding_dim: the dimension of encoded observations.
    optimizer: The optimizer to use for training.
    alpha: (float) positive scalar. This is the exploration parameter that
      multiplies the confidence intervals.
    gamma: a float forgetting factor in [0.0, 1.0]. When set to 1.0, the
      algorithm does not forget.
    epsilon_greedy: A float representing the probability of choosing a random
      action instead of the greedy action.
    error_loss_fn: A function for computing the error loss, taking parameters
      labels, predictions, and weights (any function from tf.losses would
      work). The default is `tf.losses.mean_squared_error`.
    gradient_clipping: A float representing the norm length to clip gradients
      (or None for no clipping.)
    debug_summaries: A Python bool, default False. When True, debug summaries
      are gathered.
    summarize_grads_and_vars: A Python bool, default False. When True,
      gradients and network variable summaries are written during training.
    train_step_counter: An optional `tf.Variable` to increment every time the
      train op is run. Defaults to the `global_step`.
    emit_log_probability: Whether the NeuralLinUCBPolicy emits
      log-probabilities or not. Since the policy is deterministic, the
      probability is just 1.
    dtype: The type of the parameters stored and updated by the agent. Should
      be one of `tf.float32` and `tf.float64`. Defaults to `tf.float64`.
    name: a name for this instance of `NeuralLinUCBAgent`.

  Raises:
    ValueError if dtype is not one of `tf.float32` or `tf.float64`.
  """
  tf.Module.__init__(self, name=name)
  self._num_actions = bandit_utils.get_num_actions_from_tensor_spec(
      action_spec)
  self._context_dim = int(time_step_spec.observation.shape[0])
  self._alpha = alpha
  self._cov_matrix_list = []
  self._data_vector_list = []
  # We keep track of the number of samples per arm.
  self._num_samples_list = []
  self._gamma = gamma
  if self._gamma < 0.0 or self._gamma > 1.0:
    raise ValueError('Forgetting factor `gamma` must be in [0.0, 1.0].')
  self._dtype = dtype
  if dtype not in (tf.float32, tf.float64):
    raise ValueError(
        'Agent dtype should be either `tf.float32` or `tf.float64`.')
  self._epsilon_greedy = epsilon_greedy

  reward_layer = tf.keras.layers.Dense(
      self._num_actions,
      kernel_initializer=tf.compat.v1.initializers.random_uniform(
          minval=-0.03, maxval=0.03),
      bias_initializer=tf.compat.v1.initializers.constant(-0.2),
      activation=None,
      name='reward_layer')

  self._encoding_network = encoding_network
  self._reward_layer = reward_layer
  self._encoding_network_num_train_steps = encoding_network_num_train_steps
  self._encoding_dim = encoding_dim
  self._optimizer = optimizer
  self._error_loss_fn = error_loss_fn
  self._gradient_clipping = gradient_clipping
  train_step_counter = tf.compat.v1.train.get_or_create_global_step()
  self._actions_from_reward_layer = tf.compat.v2.Variable(True, dtype=tf.bool)

  for k in range(self._num_actions):
    self._cov_matrix_list.append(
        tf.compat.v2.Variable(
            tf.eye(self._encoding_dim, dtype=dtype), name='a_' + str(k)))
    self._data_vector_list.append(
        tf.compat.v2.Variable(
            tf.zeros(self._encoding_dim, dtype=dtype), name='b_' + str(k)))
    self._num_samples_list.append(
        tf.compat.v2.Variable(
            tf.zeros([], dtype=dtype), name='num_samples_' + str(k)))

  policy = neural_linucb_policy.NeuralLinUCBPolicy(
      encoding_network=self._encoding_network,
      encoding_dim=self._encoding_dim,
      reward_layer=self._reward_layer,
      epsilon_greedy=self._epsilon_greedy,
      actions_from_reward_layer=self._actions_from_reward_layer,
      cov_matrix=self._cov_matrix_list,
      data_vector=self._data_vector_list,
      num_samples=self._num_samples_list,
      time_step_spec=time_step_spec,
      alpha=alpha,
      emit_log_probability=emit_log_probability)

  super(NeuralLinUCBAgent, self).__init__(
      time_step_spec=time_step_spec,
      action_spec=policy.action_spec,
      policy=policy,
      collect_policy=policy,
      train_sequence_length=None,
      debug_summaries=debug_summaries,
      summarize_grads_and_vars=summarize_grads_and_vars,
      train_step_counter=train_step_counter)
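# Schematic of the per-arm LinUCB bookkeeping that the `a_k` (cov_matrix),
# `b_k` (data_vector), `alpha`, and `gamma` variables above implement. This is
# the standard LinUCB recursion written in NumPy for clarity; it is a sketch
# of the idea only, not the agent's actual training code, which additionally
# handles batching, dtypes, and in-place variable updates.
#
#   import numpy as np
#
#   def linucb_arm_update_and_score(a, b, x, r, alpha, gamma=1.0):
#     # a: [d, d] covariance statistic, b: [d] reward-weighted feature sum,
#     # x: [d] encoded observation, r: scalar observed reward for this arm.
#     a = gamma * a + np.outer(x, x)   # gamma < 1.0 forgets old statistics.
#     b = gamma * b + r * x
#     a_inv = np.linalg.inv(a)
#     theta = a_inv.dot(b)             # per-arm linear reward estimate.
#     # UCB score: estimated reward plus alpha-scaled confidence width.
#     score = theta.dot(x) + alpha * np.sqrt(x.dot(a_inv).dot(x))
#     return a, b, score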
def testSparseObs(self, batch_size, actions_from_reward_layer):
  obs_spec = {
      'global': {'sport': tensor_spec.TensorSpec((), tf.string)},
      'per_arm': {
          'name': tensor_spec.TensorSpec((3,), tf.string),
          'fruit': tensor_spec.TensorSpec((3,), tf.string)
      }
  }
  columns_a = tf.feature_column.indicator_column(
      tf.feature_column.categorical_column_with_vocabulary_list(
          'name', ['bob', 'george', 'wanda']))
  columns_b = tf.feature_column.indicator_column(
      tf.feature_column.categorical_column_with_vocabulary_list(
          'fruit', ['banana', 'kiwi', 'pear']))
  columns_c = tf.feature_column.indicator_column(
      tf.feature_column.categorical_column_with_vocabulary_list(
          'sport', ['bridge', 'chess', 'snooker']))

  dummy_net = arm_network.create_feed_forward_common_tower_network(
      obs_spec,
      global_layers=(3, 4, 5),
      arm_layers=(3, 2),
      common_layers=(4, 3),
      output_dim=self._encoding_dim,
      global_preprocessing_combiner=(
          tf.compat.v2.keras.layers.DenseFeatures([columns_c])),
      arm_preprocessing_combiner=tf.compat.v2.keras.layers.DenseFeatures(
          [columns_a, columns_b]))
  time_step_spec = ts.time_step_spec(obs_spec)
  reward_layer = get_per_arm_reward_layer(encoding_dim=self._encoding_dim)
  policy = neural_linucb_policy.NeuralLinUCBPolicy(
      dummy_net,
      self._encoding_dim,
      reward_layer,
      actions_from_reward_layer=tf.constant(
          actions_from_reward_layer, dtype=tf.bool),
      cov_matrix=self._a[0:1],
      data_vector=self._b[0:1],
      num_samples=self._num_samples_per_arm[0:1],
      epsilon_greedy=0.0,
      time_step_spec=time_step_spec,
      accepts_per_arm_features=True,
      emit_policy_info=('predicted_rewards_mean',))

  observations = {
      'global': {'sport': tf.constant(['snooker', 'chess'])},
      'per_arm': {
          'name': tf.constant([['george', 'george', 'george'],
                               ['bob', 'bob', 'bob']]),
          'fruit': tf.constant([['banana', 'banana', 'banana'],
                                ['kiwi', 'kiwi', 'kiwi']])
      }
  }
  time_step = ts.restart(observations, batch_size=2)
  action_fn = common.function_in_tf1()(policy.action)
  action_step = action_fn(time_step, seed=1)
  self.assertEqual(action_step.action.shape.as_list(), [2])
  self.assertEqual(action_step.action.dtype, tf.int32)

  # Initialize all variables.
  self.evaluate([
      tf.compat.v1.global_variables_initializer(),
      tf.compat.v1.tables_initializer()
  ])
  action = self.evaluate(action_step.action)
  self.assertAllEqual(action.shape, [2])
  p_info = self.evaluate(action_step.info)
  self.assertAllEqual(p_info.predicted_rewards_mean.shape, [2, 3])
  self.assertAllEqual(p_info.chosen_arm_features['name'].shape, [2])
  self.assertAllEqual(p_info.chosen_arm_features['fruit'].shape, [2])
  first_action = action[0]
  first_arm_name_feature = observations[
      bandit_spec_utils.PER_ARM_FEATURE_KEY]['name'][0]
  self.assertAllEqual(p_info.chosen_arm_features['name'][0],
                      first_arm_name_feature[first_action])