def _create_arm_policy_and_observations(
    self
) -> Tuple[greedy_multi_objective_policy.GreedyMultiObjectiveNeuralPolicy,
           Dict[Text, tf.Tensor]]:
  """Returns a 3-objective per-arm policy and a batch of 2 observations."""
  obs_spec = bandit_spec_utils.create_per_arm_observation_spec(2, 3, 4)
  time_step_spec = ts.time_step_spec(obs_spec)
  action_spec = tensor_spec.BoundedTensorSpec((), tf.int32, 0, 3)
  objective_networks = [
      global_and_arm_feature_network.create_feed_forward_common_tower_network(
          obs_spec, (4, 3), (3, 4), (4, 2)) for _ in range(3)
  ]
  policy = greedy_multi_objective_policy.GreedyMultiObjectiveNeuralPolicy(
      time_step_spec,
      action_spec,
      self._scalarizer,
      objective_networks,
      accepts_per_arm_features=True,
      emit_policy_info=('predicted_rewards_mean',))
  # Four arms with 3-dimensional features each, drawn as a random permutation
  # of 0..23 so that every arm's feature vector is distinct.
  action_feature = tf.cast(
      tf.reshape(tf.random.shuffle(tf.range(24)), shape=[2, 4, 3]),
      dtype=tf.float32)
  observations = {
      bandit_spec_utils.GLOBAL_FEATURE_KEY:
          tf.constant([[1, 2], [2, 1]], dtype=tf.float32),
      bandit_spec_utils.PER_ARM_FEATURE_KEY:
          action_feature,
  }
  return policy, observations
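# A minimal usage sketch for the helper above (illustrative only; the tests
# in this file construct their own inputs): wrap the returned observations in
# a batched TimeStep and query the policy for actions.
#
#   policy, observations = self._create_arm_policy_and_observations()
#   time_step = ts.restart(observations, batch_size=2)
#   action_step = policy.action(time_step)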
def testComputeLossWithArmFeatures(self):
  obs_spec = bandit_spec_utils.create_per_arm_observation_spec(
      global_dim=2, per_arm_dim=3, num_actions=3)
  time_step_spec = ts.time_step_spec(obs_spec)
  constraint_net = (
      global_and_arm_feature_network.create_feed_forward_common_tower_network(
          obs_spec, global_layers=(4,), arm_layers=(4,), common_layers=(4,)))
  neural_constraint = constraints.NeuralConstraint(
      time_step_spec, self._action_spec, constraint_network=constraint_net)

  observations = {
      bandit_spec_utils.GLOBAL_FEATURE_KEY:
          tf.constant([[1, 2], [3, 4]], dtype=tf.float32),
      bandit_spec_utils.PER_ARM_FEATURE_KEY:
          tf.cast(tf.reshape(tf.range(18), shape=[2, 3, 3]), dtype=tf.float32)
  }
  actions = tf.constant([0, 1], dtype=tf.int32)
  rewards = tf.constant([0.5, 3.0], dtype=tf.float32)

  init_op = neural_constraint.initialize()
  if not tf.executing_eagerly():
    with self.cached_session() as sess:
      common.initialize_uninitialized_variables(sess)
      self.assertIsNone(sess.run(init_op))
  loss = neural_constraint.compute_loss(observations, actions, rewards)
  self.assertGreater(self.evaluate(loss), 0.0)
def testTrainPerArmAgent(self):
  obs_spec = bandit_spec_utils.create_per_arm_observation_spec(2, 3, 3)
  time_step_spec = ts.time_step_spec(obs_spec)
  reward_net = (
      global_and_arm_feature_network.create_feed_forward_common_tower_network(
          obs_spec, (4, 3), (3, 4), (4, 2)))
  optimizer = tf.compat.v1.train.GradientDescentOptimizer(learning_rate=0.1)
  agent = neural_epsilon_greedy_agent.NeuralEpsilonGreedyAgent(
      time_step_spec,
      self._action_spec,
      reward_network=reward_net,
      optimizer=optimizer,
      epsilon=0.1,
      accepts_per_arm_features=True)
  observations = {
      bandit_spec_utils.GLOBAL_FEATURE_KEY:
          tf.constant([[1, 2], [3, 4]], dtype=tf.float32),
      bandit_spec_utils.PER_ARM_FEATURE_KEY:
          tf.cast(tf.reshape(tf.range(18), shape=[2, 3, 3]), dtype=tf.float32)
  }
  time_steps = ts.restart(observations, batch_size=2)
  policy = agent.policy
  action_step = policy.action(time_steps)
  self.evaluate(tf.compat.v1.initialize_all_variables())
  actions = self.evaluate(action_step.action)
  self.assertAllEqual(actions.shape, (2,))
def testTrainPerArmAgent(self):
  obs_spec = bandit_spec_utils.create_per_arm_observation_spec(2, 3, 4)
  time_step_spec = ts.time_step_spec(obs_spec)
  reward_net = (
      global_and_arm_feature_network.create_feed_forward_per_arm_network(
          obs_spec, (4, 3), (3, 4), (4, 2)))
  optimizer = tf.compat.v1.train.GradientDescentOptimizer(learning_rate=0.1)
  agent = greedy_agent.GreedyRewardPredictionAgent(
      time_step_spec,
      self._action_spec,
      reward_network=reward_net,
      accepts_per_arm_features=True,
      optimizer=optimizer)
  observations = {
      bandit_spec_utils.GLOBAL_FEATURE_KEY:
          tf.constant([[1, 2], [3, 4]], dtype=tf.float32),
      bandit_spec_utils.PER_ARM_FEATURE_KEY:
          tf.cast(tf.reshape(tf.range(24), shape=[2, 4, 3]), dtype=tf.float32)
  }
  actions = np.array([0, 3], dtype=np.int32)
  rewards = np.array([0.5, 3.0], dtype=np.float32)
  initial_step, final_step = _get_initial_and_final_steps(
      observations, rewards)
  action_step = policy_step.PolicyStep(
      action=tf.convert_to_tensor(actions),
      info=policy_utilities.PerArmPolicyInfo(
          chosen_arm_features=np.array([[1, 2, 3], [3, 2, 1]],
                                       dtype=np.float32)))
  experience = _get_experience(initial_step, action_step, final_step)
  agent.train(experience, None)
  self.evaluate(tf.compat.v1.initialize_all_variables())
def testPerArmObservation(self, batch_size, actions_from_reward_layer):
  global_obs_dim = 7
  arm_obs_dim = 3
  obs_spec = bandit_spec_utils.create_per_arm_observation_spec(
      global_obs_dim, arm_obs_dim, self._num_actions)
  time_step_spec = ts.time_step_spec(obs_spec)
  dummy_net = arm_network.create_feed_forward_common_tower_network(
      obs_spec,
      global_layers=(3, 4, 5),
      arm_layers=(3, 2),
      common_layers=(4, 3),
      output_dim=self._encoding_dim)
  reward_layer = get_per_arm_reward_layer(encoding_dim=self._encoding_dim)

  policy = neural_linucb_policy.NeuralLinUCBPolicy(
      dummy_net,
      self._encoding_dim,
      reward_layer,
      actions_from_reward_layer=tf.constant(
          actions_from_reward_layer, dtype=tf.bool),
      cov_matrix=self._a[0:1],
      data_vector=self._b[0:1],
      num_samples=self._num_samples_per_arm[0:1],
      epsilon_greedy=0.0,
      time_step_spec=time_step_spec,
      accepts_per_arm_features=True,
      emit_policy_info=('predicted_rewards_mean',))

  current_time_step = self._per_arm_time_step_batch(
      batch_size=batch_size,
      global_obs_dim=global_obs_dim,
      arm_obs_dim=arm_obs_dim)
  action_step = policy.action(current_time_step)
  self.assertEqual(action_step.action.dtype, tf.int32)
  self.evaluate(tf.compat.v1.global_variables_initializer())
  action_fn = common.function_in_tf1()(policy.action)
  action_step = action_fn(current_time_step)

  input_observation = current_time_step.observation
  encoded_observation, _ = dummy_net(input_observation)

  if actions_from_reward_layer:
    predicted_rewards_from_reward_layer = reward_layer(encoded_observation)
    predicted_rewards_expected = self.evaluate(
        predicted_rewards_from_reward_layer).reshape((-1, self._num_actions))
  else:
    observation_numpy = self.evaluate(encoded_observation)
    predicted_rewards_expected = (
        self._get_predicted_rewards_from_per_arm_linucb(
            observation_numpy, batch_size))

  p_info = self.evaluate(action_step.info)
  self.assertEqual(p_info.predicted_rewards_mean.dtype, np.float32)
  self.assertAllClose(p_info.predicted_rewards_mean,
                      predicted_rewards_expected)
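# For reference, the `else` branch above checks against the usual LinUCB
# mean-reward estimate, recomputed in numpy by the test helper. A sketch of
# that convention (illustrative, not the library code): with sufficient
# statistics A (covariance matrix) and b (reward-weighted data vector) over
# encoded features x,
#
#   theta = inverse(A) @ b    and    predicted_reward = dot(theta, x).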
def testTrainPerArmAgentVariableActions(self):
  num_actions = 5
  obs_spec = bandit_spec_utils.create_per_arm_observation_spec(
      2, 3, num_actions, add_num_actions_feature=True)
  time_step_spec = time_step.time_step_spec(obs_spec)
  action_spec = tensor_spec.BoundedTensorSpec(
      dtype=tf.int32, shape=(), minimum=0, maximum=num_actions - 1)
  encoding_dim = 10
  encoder = (
      global_and_arm_feature_network.create_feed_forward_common_tower_network(
          obs_spec, (4, 3), (3, 4), (4, 2), encoding_dim))
  agent = neural_linucb_agent.NeuralLinUCBAgent(
      time_step_spec=time_step_spec,
      action_spec=action_spec,
      encoding_network=encoder,
      encoding_network_num_train_steps=10,
      encoding_dim=encoding_dim,
      accepts_per_arm_features=True,
      optimizer=tf.compat.v1.train.AdamOptimizer(learning_rate=0.001))
  observations = {
      bandit_spec_utils.GLOBAL_FEATURE_KEY:
          tf.constant([[1, 2], [3, 4]], dtype=tf.float32),
      bandit_spec_utils.PER_ARM_FEATURE_KEY:
          tf.cast(tf.reshape(tf.range(30), shape=[2, 5, 3]), dtype=tf.float32),
      # Only the first 3 (resp. 4) of the 5 arms are valid in each batch
      # element.
      bandit_spec_utils.NUM_ACTIONS_FEATURE_KEY:
          tf.constant([3, 4], dtype=tf.int32)
  }
  actions = np.array([0, 3], dtype=np.int32)
  rewards = np.array([0.5, 3.0], dtype=np.float32)
  initial_step = time_step.TimeStep(
      tf.constant(
          time_step.StepType.FIRST,
          dtype=tf.int32,
          shape=[2],
          name='step_type'),
      tf.constant(0.0, dtype=tf.float32, shape=[2], name='reward'),
      tf.constant(1.0, dtype=tf.float32, shape=[2], name='discount'),
      observations)
  final_step = time_step.TimeStep(
      tf.constant(
          time_step.StepType.LAST,
          dtype=tf.int32,
          shape=[2],
          name='step_type'),
      tf.constant(rewards, dtype=tf.float32, name='reward'),
      tf.constant(1.0, dtype=tf.float32, shape=[2], name='discount'),
      observations)
  action_step = policy_step.PolicyStep(
      action=tf.convert_to_tensor(actions),
      info=policy_utilities.PerArmPolicyInfo(
          chosen_arm_features=np.array([[1, 2, 3], [3, 2, 1]],
                                       dtype=np.float32)))
  experience = _get_experience(initial_step, action_step, final_step)
  loss_info, _ = agent.train(experience, None)
  self.evaluate(tf.compat.v1.initialize_all_variables())
  loss_value = self.evaluate(loss_info)
  self.assertGreater(loss_value, 0.0)
def testLinearAgentUpdatePerArmFeatures(self,
                                        batch_size,
                                        context_dim,
                                        exploration_policy,
                                        dtype,
                                        use_eigendecomp=False):
  """Check that the agent updates for specified actions and rewards."""
  # Construct a `Trajectory` for the given action, observation, reward.
  num_actions = 5
  global_context_dim = context_dim
  arm_context_dim = 3
  initial_step, final_step = (
      _get_initial_and_final_steps_with_per_arm_features(
          batch_size, global_context_dim, num_actions, arm_context_dim))
  action = np.random.randint(num_actions, size=batch_size, dtype=np.int32)
  action_step = policy_step.PolicyStep(
      action=tf.convert_to_tensor(action),
      info=policy_utilities.PerArmPolicyInfo(
          chosen_arm_features=np.arange(
              batch_size * arm_context_dim,
              dtype=np.float32).reshape([batch_size, arm_context_dim])))
  experience = _get_experience(initial_step, action_step, final_step)

  # Construct an agent and perform the update.
  observation_spec = bandit_spec_utils.create_per_arm_observation_spec(
      context_dim, arm_context_dim, num_actions)
  time_step_spec = time_step.time_step_spec(observation_spec)
  action_spec = tensor_spec.BoundedTensorSpec(
      dtype=tf.int32, shape=(), minimum=0, maximum=num_actions - 1)
  agent = linear_agent.LinearBanditAgent(
      exploration_policy=exploration_policy,
      time_step_spec=time_step_spec,
      action_spec=action_spec,
      use_eigendecomp=use_eigendecomp,
      accepts_per_arm_features=True,
      dtype=dtype)
  self.evaluate(agent.initialize())
  loss_info = agent.train(experience)
  self.evaluate(loss_info)
  final_a = self.evaluate(agent.cov_matrix)
  final_b = self.evaluate(agent.data_vector)

  # Compute the expected updated estimates.
  global_observation = experience.observation[
      bandit_spec_utils.GLOBAL_FEATURE_KEY]
  arm_observation = experience.policy_info.chosen_arm_features
  overall_observation = tf.squeeze(
      tf.concat([global_observation, arm_observation], axis=-1), axis=1)
  rewards = tf.squeeze(experience.reward, axis=1)

  expected_a_new = tf.matmul(
      overall_observation, overall_observation, transpose_a=True)
  expected_b_new = bandit_utils.sum_reward_weighted_observations(
      rewards, overall_observation)
  self.assertAllClose(expected_a_new, final_a[0])
  self.assertAllClose(expected_b_new, final_b[0])
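# For reference, a sketch of the update the assertions above encode
# (illustrative, not the library implementation): with feature rows x_t (the
# global context concatenated with the chosen arm's features) and rewards r_t,
# a linear bandit accumulates the sufficient statistics
#
#   A <- A + sum_t x_t x_t^T    and    b <- b + sum_t r_t * x_t.
#
# For a fresh agent and a batch X, `tf.matmul(X, X, transpose_a=True)`
# computes the first sum and `bandit_utils.sum_reward_weighted_observations`
# computes the second, which is exactly what the test compares against.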
def testCreateFeedForwardDotProductNetwork(self, batch_size, feature_dim,
                                           num_actions):
  obs_spec = bandit_spec_utils.create_per_arm_observation_spec(
      7, feature_dim, num_actions)
  net = gafn.create_feed_forward_dot_product_network(
      obs_spec, (4, 3, 4), (6, 5, 4))
  input_nest = tensor_spec.sample_spec_nest(
      obs_spec, outer_dims=(batch_size,))
  output, _ = net(input_nest)
  # Initialize the network variables before evaluating the output; without
  # this, the evaluation fails in TF1 graph mode.
  self.evaluate(tf.compat.v1.global_variables_initializer())
  output = self.evaluate(output)
  self.assertAllEqual(output.shape, (batch_size, num_actions))
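# Note on the layer arguments above: in a dot-product network the per-arm
# reward prediction is the inner product of the global-tower and arm-tower
# encodings, so the two towers must end in layers of equal width (here the
# global layers (4, 3, 4) and the arm layers (6, 5, 4) both end in 4).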
def testCreateFeedForwardCommonTowerNetwork(self, batch_size, feature_dim,
                                            num_actions):
  obs_spec = bandit_spec_utils.create_per_arm_observation_spec(
      7, feature_dim, num_actions)
  net = gafn.create_feed_forward_common_tower_network(
      obs_spec, (4, 3, 2), (6, 5, 4), (7, 6, 5))
  input_nest = tensor_spec.sample_spec_nest(
      obs_spec, outer_dims=(batch_size,))
  output, _ = net(input_nest)
  self.evaluate(tf.compat.v1.global_variables_initializer())
  output = self.evaluate(output)
  self.assertAllEqual(output.shape, (batch_size, num_actions))
def testNoneTimeStepSpecForPerArmFeaturesRaisesError(self):
  obs_spec = bandit_spec_utils.create_per_arm_observation_spec(2, 3, 4)
  action_spec = tensor_spec.BoundedTensorSpec((), tf.int32, 0, 3)
  objective_networks = [
      global_and_arm_feature_network.create_feed_forward_common_tower_network(
          obs_spec, (4, 3), (3, 4), (4, 2)) for _ in range(3)
  ]
  with self.assertRaisesRegexp(
      ValueError,
      'time_step_spec should not be None for per-arm-features policies'):
    greedy_multi_objective_policy.GreedyMultiObjectiveNeuralPolicy(
        None,
        action_spec,
        self._scalarizer,
        objective_networks,
        accepts_per_arm_features=True,
        emit_policy_info=('predicted_rewards_mean',))
def testPerArmRewardsVariableNumActions(self):
  tf.compat.v1.set_random_seed(3000)
  obs_spec = bandit_spec_utils.create_per_arm_observation_spec(
      2, 3, 4, add_num_actions_feature=True)
  time_step_spec = ts.time_step_spec(obs_spec)
  action_spec = tensor_spec.BoundedTensorSpec((), tf.int32, 0, 3)
  reward_network = (
      global_and_arm_feature_network.create_feed_forward_common_tower_network(
          obs_spec, (4, 3), (3, 4), (4, 2)))
  policy = greedy_reward_policy.GreedyRewardPredictionPolicy(
      time_step_spec,
      action_spec,
      reward_network=reward_network,
      accepts_per_arm_features=True,
      emit_policy_info=('predicted_rewards_mean',))
  action_feature = tf.cast(
      tf.reshape(tf.random.shuffle(tf.range(24)), shape=[2, 4, 3]),
      dtype=tf.float32)
  observations = {
      bandit_spec_utils.GLOBAL_FEATURE_KEY:
          tf.constant([[1, 2], [3, 4]], dtype=tf.float32),
      bandit_spec_utils.PER_ARM_FEATURE_KEY:
          action_feature,
      bandit_spec_utils.NUM_ACTIONS_FEATURE_KEY:
          tf.constant([2, 3], dtype=tf.int32)
  }
  time_step = ts.restart(observations, batch_size=2)
  action_step = policy.action(time_step, seed=1)
  self.assertEqual(action_step.action.shape.as_list(), [2])
  self.assertEqual(action_step.action.dtype, tf.int32)
  # Initialize all variables.
  self.evaluate(tf.compat.v1.global_variables_initializer())
  action, p_info, first_arm_features = self.evaluate([
      action_step.action, action_step.info,
      observations[bandit_spec_utils.PER_ARM_FEATURE_KEY][0]
  ])
  self.assertAllEqual(action.shape, [2])
  self.assertAllEqual(p_info.predicted_rewards_mean.shape, [2, 4])
  self.assertAllEqual(p_info.chosen_arm_features.shape, [2, 3])
  first_action = action[0]
  self.assertAllEqual(p_info.chosen_arm_features[0],
                      first_arm_features[first_action])
def testTrainPerArmAgent(self):
  obs_spec = bandit_spec_utils.create_per_arm_observation_spec(
      2, 3, 4, add_num_actions_feature=True)
  time_step_spec = ts.time_step_spec(
      observation_spec=obs_spec,
      reward_spec=tensor_spec.TensorSpec([3], tf.float32))
  action_spec = tensor_spec.BoundedTensorSpec((), tf.int32, 0, 3)
  networks_and_loss_fns = [
      (global_and_arm_feature_network.create_feed_forward_common_tower_network(
          obs_spec, (4, 3), (3, 4), (4, 2)),
       tf.compat.v1.losses.mean_squared_error) for _ in range(3)
  ]
  optimizer = tf.compat.v1.train.GradientDescentOptimizer(learning_rate=0.01)
  agent = greedy_multi_objective_agent.GreedyMultiObjectiveNeuralAgent(
      time_step_spec,
      action_spec,
      self._scalarizer,
      objective_network_and_loss_fn_sequence=networks_and_loss_fns,
      accepts_per_arm_features=True,
      optimizer=optimizer)
  observations = {
      bandit_spec_utils.GLOBAL_FEATURE_KEY:
          tf.constant([[1, 2], [3, 4]], dtype=tf.float32),
      bandit_spec_utils.PER_ARM_FEATURE_KEY:
          tf.cast(tf.reshape(tf.range(24), shape=[2, 4, 3]), dtype=tf.float32),
      bandit_spec_utils.NUM_ACTIONS_FEATURE_KEY:
          tf.ones([2], dtype=tf.int32)
  }
  actions = np.array([0, 3], dtype=np.int32)
  # One reward per objective for each of the two batch elements.
  objectives = np.array([[1, 2, 3], [6, 5, 4]], dtype=np.float32)
  initial_step, final_step = _get_initial_and_final_steps(
      observations, objectives)
  action_step = policy_step.PolicyStep(
      action=tf.convert_to_tensor(actions),
      info=policy_utilities.PerArmPolicyInfo(
          chosen_arm_features=np.array([[1, 2, 3], [3, 2, 1]],
                                       dtype=np.float32)))
  experience = _get_experience(initial_step, action_step, final_step)
  agent.train(experience, None)
  self.evaluate(tf.compat.v1.initialize_all_variables())
def testComputeMaskFromMultipleSourcesMask(self):
  observation_spec = bandit_spec_utils.create_per_arm_observation_spec(
      4, 5, 6)
  time_step_spec = ts.time_step_spec(observation_spec)
  action_spec = tensor_spec.BoundedTensorSpec((), tf.int32, 0, 5)
  constraint_net = (
      global_and_arm_feature_network.create_feed_forward_common_tower_network(
          observation_spec, (3, 4), (4, 3), (2, 3)))
  neural_constraint = constraints.NeuralConstraint(
      time_step_spec, action_spec, constraint_network=constraint_net)
  original_mask = [[1, 1, 1, 1, 0, 0], [1, 1, 1, 0, 0, 0]]
  observations = ({
      'global':
          tf.constant([[1, 2, 3, 4], [5, 6, 7, 8]], dtype=tf.float32),
      'per_arm':
          tf.reshape(tf.range(60, dtype=tf.float32), shape=[2, 6, 5]),
  }, original_mask)
  mask = constraints.construct_mask_from_multiple_sources(
      observations, lambda x: (x[0], x[1]), [neural_constraint], 6)
  # The combined mask may only remove actions allowed by the original mask,
  # never add new ones.
  self.assertAllGreaterEqual(original_mask - mask, 0)
def testTrainPerArmAgentWithMask(self):
  num_actions = 4
  obs_spec = bandit_spec_utils.create_per_arm_observation_spec(
      2, 3, num_actions)
  mask_obs_spec = (obs_spec,
                   tensor_spec.BoundedTensorSpec(
                       shape=[num_actions],
                       minimum=0,
                       maximum=1,
                       dtype=tf.float32))
  time_step_spec = ts.time_step_spec(mask_obs_spec)
  reward_net = (
      global_and_arm_feature_network.create_feed_forward_common_tower_network(
          obs_spec, (4, 3), (3, 4), (4, 2)))
  optimizer = tf.compat.v1.train.GradientDescentOptimizer(learning_rate=0.1)
  agent = greedy_agent.GreedyRewardPredictionAgent(
      time_step_spec,
      self._action_spec,
      reward_network=reward_net,
      observation_and_action_constraint_splitter=lambda x: [x[0], x[1]],
      accepts_per_arm_features=True,
      optimizer=optimizer)
  observations = ({
      bandit_spec_utils.GLOBAL_FEATURE_KEY:
          tf.constant([[1, 2], [3, 4]], dtype=tf.float32),
      bandit_spec_utils.PER_ARM_FEATURE_KEY:
          tf.cast(tf.reshape(tf.range(24), shape=[2, 4, 3]), dtype=tf.float32)
  }, tf.ones([2, num_actions]))
  actions = np.array([0, 3], dtype=np.int32)
  rewards = np.array([0.5, 3.0], dtype=np.float32)
  initial_step, final_step = _get_initial_and_final_steps_with_action_mask(
      observations, rewards)
  action_step = policy_step.PolicyStep(
      action=tf.convert_to_tensor(actions),
      info=policy_utilities.PerArmPolicyInfo(
          chosen_arm_features=np.array([[1, 2, 3], [3, 2, 1]],
                                       dtype=np.float32)))
  experience = _get_experience(initial_step, action_step, final_step)
  agent.train(experience, None)
  self.evaluate(tf.compat.v1.initialize_all_variables())
def setUp(self):
  super(LinearBanditPolicyTest, self).setUp()
  self._obs_dim = 2
  self._num_actions = 5
  self._obs_spec = tensor_spec.TensorSpec([self._obs_dim], tf.float32)
  self._obs_spec_with_mask = (
      tensor_spec.TensorSpec([self._obs_dim], tf.float32),
      tensor_spec.TensorSpec([self._num_actions], tf.int32))
  self._per_arm_obs_spec = bandit_spec_utils.create_per_arm_observation_spec(
      self._obs_dim, 4, self._num_actions, add_num_actions_feature=True)
  self._time_step_spec = ts.time_step_spec(self._obs_spec)
  self._time_step_spec_with_mask = ts.time_step_spec(
      self._obs_spec_with_mask)
  self._per_arm_time_step_spec = ts.time_step_spec(self._per_arm_obs_spec)
  self._alpha = 1.0
  self._action_spec = tensor_spec.BoundedTensorSpec(
      shape=(),
      dtype=tf.int32,
      minimum=0,
      maximum=self._num_actions - 1,
      name='action')
def testComputeMaskFromMultipleSourcesNumActionsFeature(self):
  observation_spec = bandit_spec_utils.create_per_arm_observation_spec(
      4, 5, 6, add_num_actions_feature=True)
  time_step_spec = ts.time_step_spec(observation_spec)
  action_spec = tensor_spec.BoundedTensorSpec((), tf.int32, 0, 5)
  constraint_net = (
      global_and_arm_feature_network.create_feed_forward_common_tower_network(
          observation_spec, (3, 4), (4, 3), (2, 3)))
  neural_constraint = constraints.NeuralConstraint(
      time_step_spec, action_spec, constraint_network=constraint_net)
  observations = {
      'global':
          tf.constant([[1, 2, 3, 4], [5, 6, 7, 8]], dtype=tf.float32),
      'per_arm':
          tf.reshape(tf.range(60, dtype=tf.float32), shape=[2, 6, 5]),
      'num_actions':
          tf.constant([4, 3], dtype=tf.int32)
  }
  mask = constraints.construct_mask_from_multiple_sources(
      observations, None, [neural_constraint], 6)
  # A `num_actions` feature of [4, 3] implies that only the first 4 (resp. 3)
  # of the 6 arms are valid.
  implied_mask = [[1, 1, 1, 1, 0, 0], [1, 1, 1, 0, 0, 0]]
  self.assertAllGreaterEqual(implied_mask - mask, 0)
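# A sketch of the combination rule that the two
# testComputeMaskFromMultipleSources* tests exercise, as implied by their
# assertions (not a verbatim excerpt of the library): each source, whether an
# explicit mask from the observation splitter, a `num_actions` feature, or a
# constraint in the list, yields a 0/1 feasibility mask over the arms, and
# the combined mask keeps an action only if every source allows it, so it can
# only be more restrictive than any single source:
#
#   combined_mask = splitter_mask * num_actions_mask * constraint_mask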
def testPerArmRewards(self):
  tf.compat.v1.set_random_seed(3000)
  obs_spec = bandit_spec_utils.create_per_arm_observation_spec(2, 3, 4)
  time_step_spec = ts.time_step_spec(obs_spec)
  action_spec = tensor_spec.BoundedTensorSpec((), tf.int32, 0, 3)
  reward_network = (
      global_and_arm_feature_network.create_feed_forward_common_tower_network(
          obs_spec, (4, 3), (3, 4), (4, 2)))
  policy = greedy_reward_policy.GreedyRewardPredictionPolicy(
      time_step_spec,
      action_spec,
      reward_network=reward_network,
      accepts_per_arm_features=True,
      emit_policy_info=('predicted_rewards_mean',))
  action_feature = tf.cast(
      tf.reshape(tf.random.shuffle(tf.range(24)), shape=[2, 4, 3]),
      dtype=tf.float32)
  observations = {
      bandit_spec_utils.GLOBAL_FEATURE_KEY:
          tf.constant([[1, 2], [3, 4]], dtype=tf.float32),
      bandit_spec_utils.PER_ARM_FEATURE_KEY:
          action_feature
  }
  time_step = ts.restart(observations, batch_size=2)
  action_step = policy.action(time_step, seed=1)
  self.assertEqual(action_step.action.shape.as_list(), [2])
  self.assertEqual(action_step.action.dtype, tf.int32)
  # Initialize all variables.
  self.evaluate(tf.compat.v1.global_variables_initializer())
  action, p_info, first_arm_features = self.evaluate([
      action_step.action, action_step.info,
      observations[bandit_spec_utils.PER_ARM_FEATURE_KEY][0]
  ])
  self.assertAllEqual(action.shape, [2])
  self.assertAllEqual(p_info.predicted_rewards_mean.shape, [2, 4])
  self.assertAllEqual(p_info.chosen_arm_features.shape, [2, 3])
  first_action = action[0]
  self.assertAllEqual(p_info.chosen_arm_features[0],
                      first_arm_features[first_action])

  # Check that zeroing out some of the actions does not affect the predicted
  # rewards for unchanged actions. This is to make sure that action feature
  # padding does not influence the behavior.
  if not tf.executing_eagerly():
    # The comparison below only works in tf2, because in tf1 the random
    # per-arm observations get re-drawn on each evaluation.
    return
  padded_action_feature = tf.concat(
      [
          action_feature[:, 0:1, :],
          tf.zeros(shape=[2, 3, 3], dtype=tf.float32)
      ],
      axis=1)
  observations = {
      bandit_spec_utils.GLOBAL_FEATURE_KEY:
          tf.constant([[1, 2], [3, 4]], dtype=tf.float32),
      bandit_spec_utils.PER_ARM_FEATURE_KEY:
          padded_action_feature
  }
  time_step = ts.restart(observations, batch_size=2)
  padded_action_step = policy.action(time_step, seed=1)
  padded_p_info = self.evaluate(padded_action_step.info)
  self.assertAllEqual(p_info.predicted_rewards_mean[:, 0],
                      padded_p_info.predicted_rewards_mean[:, 0])