def _create_arm_policy_and_observations(
     self
 ) -> Tuple[greedy_multi_objective_policy.GreedyMultiObjectiveNeuralPolicy,
            Dict[Text, tf.Tensor]]:
     obs_spec = bandit_spec_utils.create_per_arm_observation_spec(2, 3, 4)
     time_step_spec = ts.time_step_spec(obs_spec)
     action_spec = tensor_spec.BoundedTensorSpec((), tf.int32, 0, 3)
     objective_networks = [
         global_and_arm_feature_network.
         create_feed_forward_common_tower_network(obs_spec, (4, 3), (3, 4),
                                                  (4, 2)) for _ in range(3)
     ]
     policy = greedy_multi_objective_policy.GreedyMultiObjectiveNeuralPolicy(
         time_step_spec,
         action_spec,
         self._scalarizer,
         objective_networks,
         accepts_per_arm_features=True,
         emit_policy_info=('predicted_rewards_mean', ))
     action_feature = tf.cast(tf.reshape(tf.random.shuffle(tf.range(24)),
                                         shape=[2, 4, 3]),
                              dtype=tf.float32)
     observations = {
         bandit_spec_utils.GLOBAL_FEATURE_KEY:
         tf.constant([[1, 2], [2, 1]], dtype=tf.float32),
         bandit_spec_utils.PER_ARM_FEATURE_KEY:
         action_feature
     }
     return policy, observations
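These snippets are lifted from the TF-Agents bandit test suites and drop their import blocks. A minimal preamble along the following lines is assumed throughout; it is a sketch based on recent TF-Agents releases, and exact module paths may differ between versions.

# Hedged sketch of the imports the excerpts rely on; adjust paths to your
# installed TF-Agents version if needed.
import numpy as np
import tensorflow as tf

from tf_agents.bandits.networks import global_and_arm_feature_network
from tf_agents.bandits.policies import policy_utilities
from tf_agents.bandits.specs import utils as bandit_spec_utils
from tf_agents.specs import tensor_spec
from tf_agents.trajectories import policy_step
from tf_agents.trajectories import time_step as ts
from tf_agents.utils import common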
Example #2
    def testComputeLossWithArmFeatures(self):
        obs_spec = bandit_spec_utils.create_per_arm_observation_spec(
            global_dim=2, per_arm_dim=3, num_actions=3)
        time_step_spec = ts.time_step_spec(obs_spec)
        constraint_net = (global_and_arm_feature_network.
                          create_feed_forward_common_tower_network(
                              obs_spec,
                              global_layers=(4, ),
                              arm_layers=(4, ),
                              common_layers=(4, )))
        neural_constraint = constraints.NeuralConstraint(
            time_step_spec,
            self._action_spec,
            constraint_network=constraint_net)

        observations = {
            bandit_spec_utils.GLOBAL_FEATURE_KEY:
            tf.constant([[1, 2], [3, 4]], dtype=tf.float32),
            bandit_spec_utils.PER_ARM_FEATURE_KEY:
            tf.cast(tf.reshape(tf.range(18), shape=[2, 3, 3]),
                    dtype=tf.float32)
        }
        actions = tf.constant([0, 1], dtype=tf.int32)
        rewards = tf.constant([0.5, 3.0], dtype=tf.float32)

        init_op = neural_constraint.initialize()
        if not tf.executing_eagerly():
            with self.cached_session() as sess:
                common.initialize_uninitialized_variables(sess)
                self.assertIsNone(sess.run(init_op))
        loss = neural_constraint.compute_loss(observations, actions, rewards)
        self.assertGreater(self.evaluate(loss), 0.0)
Example #3
 def testTrainPerArmAgent(self):
     obs_spec = bandit_spec_utils.create_per_arm_observation_spec(2, 3, 3)
     time_step_spec = ts.time_step_spec(obs_spec)
     reward_net = (global_and_arm_feature_network.
                   create_feed_forward_common_tower_network(
                       obs_spec, (4, 3), (3, 4), (4, 2)))
     optimizer = tf.compat.v1.train.GradientDescentOptimizer(
         learning_rate=0.1)
     agent = neural_epsilon_greedy_agent.NeuralEpsilonGreedyAgent(
         time_step_spec,
         self._action_spec,
         reward_network=reward_net,
         optimizer=optimizer,
         epsilon=0.1,
         accepts_per_arm_features=True)
     observations = {
         bandit_spec_utils.GLOBAL_FEATURE_KEY:
         tf.constant([[1, 2], [3, 4]], dtype=tf.float32),
         bandit_spec_utils.PER_ARM_FEATURE_KEY:
         tf.cast(tf.reshape(tf.range(18), shape=[2, 3, 3]),
                 dtype=tf.float32)
     }
     time_steps = ts.restart(observations, batch_size=2)
     policy = agent.policy
     action_step = policy.action(time_steps)
     self.evaluate(tf.compat.v1.initialize_all_variables())
     actions = self.evaluate(action_step.action)
     self.assertAllEqual(actions.shape, (2, ))
 def testTrainPerArmAgent(self):
     obs_spec = bandit_spec_utils.create_per_arm_observation_spec(2, 3, 4)
     time_step_spec = ts.time_step_spec(obs_spec)
     reward_net = (
         global_and_arm_feature_network.create_feed_forward_per_arm_network(
             obs_spec, (4, 3), (3, 4), (4, 2)))
     optimizer = tf.compat.v1.train.GradientDescentOptimizer(
         learning_rate=0.1)
     agent = greedy_agent.GreedyRewardPredictionAgent(
         time_step_spec,
         self._action_spec,
         reward_network=reward_net,
         accepts_per_arm_features=True,
         optimizer=optimizer)
     observations = {
         bandit_spec_utils.GLOBAL_FEATURE_KEY:
         tf.constant([[1, 2], [3, 4]], dtype=tf.float32),
         bandit_spec_utils.PER_ARM_FEATURE_KEY:
         tf.cast(tf.reshape(tf.range(24), shape=[2, 4, 3]),
                 dtype=tf.float32)
     }
     actions = np.array([0, 3], dtype=np.int32)
     rewards = np.array([0.5, 3.0], dtype=np.float32)
     initial_step, final_step = _get_initial_and_final_steps(
         observations, rewards)
     action_step = policy_step.PolicyStep(
         action=tf.convert_to_tensor(actions),
         info=policy_utilities.PerArmPolicyInfo(
             chosen_arm_features=np.array([[1, 2, 3], [3, 2, 1]],
                                          dtype=np.float32)))
     experience = _get_experience(initial_step, action_step, final_step)
     agent.train(experience, None)
     self.evaluate(tf.compat.v1.initialize_all_variables())
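Several of the training examples call _get_initial_and_final_steps and _get_experience without showing them. The sketch below illustrates what such helpers can look like, mirroring the inline TimeStep construction in the NeuralLinUCB example later in this listing; it assumes driver_utils.trajectory_for_bandit from tf_agents.bandits.drivers and is an illustration, not the verbatim helpers of the original test files.

from tf_agents.bandits.drivers import driver_utils


def _get_initial_and_final_steps(observations, rewards):
  # Wrap a batch of observations and rewards into FIRST/LAST time steps.
  batch_size = tf.nest.flatten(observations)[0].shape[0]
  initial_step = ts.TimeStep(
      tf.constant(ts.StepType.FIRST, dtype=tf.int32, shape=[batch_size]),
      tf.zeros([batch_size], dtype=tf.float32),
      tf.ones([batch_size], dtype=tf.float32),
      observations)
  final_step = ts.TimeStep(
      tf.constant(ts.StepType.LAST, dtype=tf.int32, shape=[batch_size]),
      tf.constant(rewards, dtype=tf.float32),
      tf.ones([batch_size], dtype=tf.float32),
      observations)
  return initial_step, final_step


def _get_experience(initial_step, action_step, final_step):
  # Build a single-step bandit trajectory and add a time dimension of size 1,
  # which is the batched format the agents' train() method expects.
  single_experience = driver_utils.trajectory_for_bandit(
      initial_step, action_step, final_step)
  return tf.nest.map_structure(
      lambda x: tf.expand_dims(tf.convert_to_tensor(x), 1), single_experience)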
Example #5
    def testPerArmObservation(self, batch_size, actions_from_reward_layer):
        global_obs_dim = 7
        arm_obs_dim = 3
        obs_spec = bandit_spec_utils.create_per_arm_observation_spec(
            global_obs_dim, arm_obs_dim, self._num_actions)
        time_step_spec = ts.time_step_spec(obs_spec)
        dummy_net = arm_network.create_feed_forward_common_tower_network(
            obs_spec,
            global_layers=(3, 4, 5),
            arm_layers=(3, 2),
            common_layers=(4, 3),
            output_dim=self._encoding_dim)
        reward_layer = get_per_arm_reward_layer(
            encoding_dim=self._encoding_dim)

        policy = neural_linucb_policy.NeuralLinUCBPolicy(
            dummy_net,
            self._encoding_dim,
            reward_layer,
            actions_from_reward_layer=tf.constant(actions_from_reward_layer,
                                                  dtype=tf.bool),
            cov_matrix=self._a[0:1],
            data_vector=self._b[0:1],
            num_samples=self._num_samples_per_arm[0:1],
            epsilon_greedy=0.0,
            time_step_spec=time_step_spec,
            accepts_per_arm_features=True,
            emit_policy_info=('predicted_rewards_mean', ))

        current_time_step = self._per_arm_time_step_batch(
            batch_size=batch_size,
            global_obs_dim=global_obs_dim,
            arm_obs_dim=arm_obs_dim)
        action_step = policy.action(current_time_step)
        self.assertEqual(action_step.action.dtype, tf.int32)
        self.evaluate(tf.compat.v1.global_variables_initializer())
        action_fn = common.function_in_tf1()(policy.action)
        action_step = action_fn(current_time_step)

        input_observation = current_time_step.observation
        encoded_observation, _ = dummy_net(input_observation)

        if actions_from_reward_layer:
            predicted_rewards_from_reward_layer = reward_layer(
                encoded_observation)
            predicted_rewards_expected = self.evaluate(
                predicted_rewards_from_reward_layer).reshape(
                    (-1, self._num_actions))
        else:
            observation_numpy = self.evaluate(encoded_observation)
            predicted_rewards_expected = (
                self._get_predicted_rewards_from_per_arm_linucb(
                    observation_numpy, batch_size))

        p_info = self.evaluate(action_step.info)
        self.assertEqual(p_info.predicted_rewards_mean.dtype, np.float32)
        self.assertAllClose(p_info.predicted_rewards_mean,
                            predicted_rewards_expected)
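The get_per_arm_reward_layer helper referenced in this example is not part of the listing. With a per-arm setup it only has to map the common-tower encoding, shaped [batch_size, num_actions, encoding_dim], to one scalar reward per arm, so a single shared linear head suffices. One plausible, hypothetical definition:

def get_per_arm_reward_layer(encoding_dim):
  # A single linear head shared across arms; Keras infers the input width
  # (encoding_dim) when the layer is first called on the encoded features.
  del encoding_dim
  return tf.keras.layers.Dense(units=1, activation=None, use_bias=False)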
Example #6
 def testTrainPerArmAgentVariableActions(self):
   num_actions = 5
   obs_spec = bandit_spec_utils.create_per_arm_observation_spec(
       2, 3, num_actions, add_num_actions_feature=True)
   time_step_spec = time_step.time_step_spec(obs_spec)
   action_spec = tensor_spec.BoundedTensorSpec(
       dtype=tf.int32, shape=(), minimum=0, maximum=num_actions - 1)
   encoding_dim = 10
   encoder = (
       global_and_arm_feature_network.create_feed_forward_common_tower_network(
           obs_spec, (4, 3), (3, 4), (4, 2), encoding_dim))
   agent = neural_linucb_agent.NeuralLinUCBAgent(
       time_step_spec=time_step_spec,
       action_spec=action_spec,
       encoding_network=encoder,
       encoding_network_num_train_steps=10,
       encoding_dim=encoding_dim,
       accepts_per_arm_features=True,
       optimizer=tf.compat.v1.train.AdamOptimizer(learning_rate=0.001))
   observations = {
       bandit_spec_utils.GLOBAL_FEATURE_KEY:
           tf.constant([[1, 2], [3, 4]], dtype=tf.float32),
       bandit_spec_utils.PER_ARM_FEATURE_KEY:
           tf.cast(
               tf.reshape(tf.range(30), shape=[2, 5, 3]), dtype=tf.float32),
       bandit_spec_utils.NUM_ACTIONS_FEATURE_KEY:
           tf.constant([3, 4], dtype=tf.int32)
   }
   actions = np.array([0, 3], dtype=np.int32)
   rewards = np.array([0.5, 3.0], dtype=np.float32)
   initial_step = time_step.TimeStep(
       tf.constant(
           time_step.StepType.FIRST,
           dtype=tf.int32,
           shape=[2],
           name='step_type'),
       tf.constant(0.0, dtype=tf.float32, shape=[2], name='reward'),
       tf.constant(1.0, dtype=tf.float32, shape=[2], name='discount'),
       observations)
   final_step = time_step.TimeStep(
       tf.constant(
           time_step.StepType.LAST,
           dtype=tf.int32,
           shape=[2],
           name='step_type'),
       tf.constant(rewards, dtype=tf.float32, name='reward'),
       tf.constant(1.0, dtype=tf.float32, shape=[2], name='discount'),
       observations)
   action_step = policy_step.PolicyStep(
       action=tf.convert_to_tensor(actions),
       info=policy_utilities.PerArmPolicyInfo(
           chosen_arm_features=np.array([[1, 2, 3], [3, 2, 1]],
                                        dtype=np.float32)))
   experience = _get_experience(initial_step, action_step, final_step)
   loss_info, _ = agent.train(experience, None)
   self.evaluate(tf.compat.v1.initialize_all_variables())
   loss_value = self.evaluate(loss_info)
   self.assertGreater(loss_value, 0.0)
Example #7
  def testLinearAgentUpdatePerArmFeatures(self,
                                          batch_size,
                                          context_dim,
                                          exploration_policy,
                                          dtype,
                                          use_eigendecomp=False):
    """Check that the agent updates for specified actions and rewards."""

    # Construct a `Trajectory` for the given action, observation, reward.
    num_actions = 5
    global_context_dim = context_dim
    arm_context_dim = 3
    initial_step, final_step = (
        _get_initial_and_final_steps_with_per_arm_features(
            batch_size, global_context_dim, num_actions, arm_context_dim))
    action = np.random.randint(num_actions, size=batch_size, dtype=np.int32)
    action_step = policy_step.PolicyStep(
        action=tf.convert_to_tensor(action),
        info=policy_utilities.PerArmPolicyInfo(
            chosen_arm_features=np.arange(
                batch_size * arm_context_dim, dtype=np.float32).reshape(
                    [batch_size, arm_context_dim])))
    experience = _get_experience(initial_step, action_step, final_step)

    # Construct an agent and perform the update.
    observation_spec = bandit_spec_utils.create_per_arm_observation_spec(
        context_dim, arm_context_dim, num_actions)
    time_step_spec = time_step.time_step_spec(observation_spec)
    action_spec = tensor_spec.BoundedTensorSpec(
        dtype=tf.int32, shape=(), minimum=0, maximum=num_actions - 1)
    agent = linear_agent.LinearBanditAgent(
        exploration_policy=exploration_policy,
        time_step_spec=time_step_spec,
        action_spec=action_spec,
        use_eigendecomp=use_eigendecomp,
        accepts_per_arm_features=True,
        dtype=dtype)
    self.evaluate(agent.initialize())
    loss_info = agent.train(experience)
    self.evaluate(loss_info)
    final_a = self.evaluate(agent.cov_matrix)
    final_b = self.evaluate(agent.data_vector)

    # Compute the expected updated estimates.
    global_observation = experience.observation[
        bandit_spec_utils.GLOBAL_FEATURE_KEY]
    arm_observation = experience.policy_info.chosen_arm_features
    overall_observation = tf.squeeze(
        tf.concat([global_observation, arm_observation], axis=-1), axis=1)
    rewards = tf.squeeze(experience.reward, axis=1)

    expected_a_new = tf.matmul(
        overall_observation, overall_observation, transpose_a=True)
    expected_b_new = bandit_utils.sum_reward_weighted_observations(
        rewards, overall_observation)
    self.assertAllClose(expected_a_new, final_a[0])
    self.assertAllClose(expected_b_new, final_b[0])
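The expected statistics checked above are the standard LinUCB updates: with per-example features x_i and rewards r_i, the covariance matrix accumulates sum_i x_i x_i^T and the data vector accumulates sum_i r_i x_i. A minimal sketch of sum_reward_weighted_observations under that reading (an illustration, not necessarily the exact bandit_utils implementation):

def sum_reward_weighted_observations(r, x):
  # r: [batch_size] rewards; x: [batch_size, feature_dim] observations.
  # Returns b = sum_i r_i * x_i, with shape [feature_dim].
  return tf.reduce_sum(x * tf.expand_dims(r, axis=-1), axis=0)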
 def testCreateFeedForwardDotProductNetwork(self, batch_size, feature_dim,
                                            num_actions):
     obs_spec = bandit_spec_utils.create_per_arm_observation_spec(
         7, feature_dim, num_actions)
     net = gafn.create_feed_forward_dot_product_network(
         obs_spec, (4, 3, 4), (6, 5, 4))
     input_nest = tensor_spec.sample_spec_nest(obs_spec,
                                               outer_dims=(batch_size, ))
     output, _ = self.evaluate(net(input_nest))
     self.assertAllEqual(output.shape, (batch_size, num_actions))
 def testCreateFeedForwardCommonTowerNetwork(self, batch_size, feature_dim,
                                             num_actions):
     obs_spec = bandit_spec_utils.create_per_arm_observation_spec(
         7, feature_dim, num_actions)
     net = gafn.create_feed_forward_common_tower_network(
         obs_spec, (4, 3, 2), (6, 5, 4), (7, 6, 5))
     input_nest = tensor_spec.sample_spec_nest(obs_spec,
                                               outer_dims=(batch_size, ))
     output, _ = net(input_nest)
     self.evaluate(tf.compat.v1.global_variables_initializer())
     output = self.evaluate(output)
     self.assertAllEqual(output.shape, (batch_size, num_actions))
 def testNoneTimeStepSpecForPerArmFeaturesRaisesError(self):
     obs_spec = bandit_spec_utils.create_per_arm_observation_spec(2, 3, 4)
     action_spec = tensor_spec.BoundedTensorSpec((), tf.int32, 0, 3)
     objective_networks = [
         global_and_arm_feature_network.
         create_feed_forward_common_tower_network(obs_spec, (4, 3), (3, 4),
                                                  (4, 2)) for _ in range(3)
     ]
     with self.assertRaisesRegexp(
             ValueError,
             'time_step_spec should not be None for per-arm-features policies'
     ):
         greedy_multi_objective_policy.GreedyMultiObjectiveNeuralPolicy(
             None,
             action_spec,
             self._scalarizer,
             objective_networks,
             accepts_per_arm_features=True,
             emit_policy_info=('predicted_rewards_mean', ))
  def testPerArmRewardsVariableNumActions(self):
    tf.compat.v1.set_random_seed(3000)
    obs_spec = bandit_spec_utils.create_per_arm_observation_spec(
        2, 3, 4, add_num_actions_feature=True)
    time_step_spec = ts.time_step_spec(obs_spec)
    action_spec = tensor_spec.BoundedTensorSpec((), tf.int32, 0, 3)
    reward_network = (
        global_and_arm_feature_network.create_feed_forward_common_tower_network(
            obs_spec, (4, 3), (3, 4), (4, 2)))

    policy = greedy_reward_policy.GreedyRewardPredictionPolicy(
        time_step_spec,
        action_spec,
        reward_network=reward_network,
        accepts_per_arm_features=True,
        emit_policy_info=('predicted_rewards_mean',))
    action_feature = tf.cast(
        tf.reshape(tf.random.shuffle(tf.range(24)), shape=[2, 4, 3]),
        dtype=tf.float32)
    observations = {
        bandit_spec_utils.GLOBAL_FEATURE_KEY:
            tf.constant([[1, 2], [3, 4]], dtype=tf.float32),
        bandit_spec_utils.PER_ARM_FEATURE_KEY:
            action_feature,
        bandit_spec_utils.NUM_ACTIONS_FEATURE_KEY:
            tf.constant([2, 3], dtype=tf.int32)
    }
    time_step = ts.restart(observations, batch_size=2)
    action_step = policy.action(time_step, seed=1)
    self.assertEqual(action_step.action.shape.as_list(), [2])
    self.assertEqual(action_step.action.dtype, tf.int32)
    # Initialize all variables
    self.evaluate(tf.compat.v1.global_variables_initializer())
    action, p_info, first_arm_features = self.evaluate([
        action_step.action, action_step.info,
        observations[bandit_spec_utils.PER_ARM_FEATURE_KEY][0]
    ])
    self.assertAllEqual(action.shape, [2])
    self.assertAllEqual(p_info.predicted_rewards_mean.shape, [2, 4])
    self.assertAllEqual(p_info.chosen_arm_features.shape, [2, 3])
    first_action = action[0]
    self.assertAllEqual(p_info.chosen_arm_features[0],
                        first_arm_features[first_action])
Example #12
 def testTrainPerArmAgent(self):
     obs_spec = bandit_spec_utils.create_per_arm_observation_spec(
         2, 3, 4, add_num_actions_feature=True)
     time_step_spec = ts.time_step_spec(observation_spec=obs_spec,
                                        reward_spec=tensor_spec.TensorSpec(
                                            [3], tf.float32))
     action_spec = tensor_spec.BoundedTensorSpec((), tf.int32, 0, 3)
     networks_and_loss_fns = [
         (global_and_arm_feature_network.
          create_feed_forward_common_tower_network(obs_spec, (4, 3), (3, 4),
                                                   (4, 2)),
          tf.compat.v1.losses.mean_squared_error) for _ in range(3)
     ]
     optimizer = tf.compat.v1.train.GradientDescentOptimizer(
         learning_rate=0.01)
     agent = greedy_multi_objective_agent.GreedyMultiObjectiveNeuralAgent(
         time_step_spec,
         action_spec,
         self._scalarizer,
         objective_network_and_loss_fn_sequence=networks_and_loss_fns,
         accepts_per_arm_features=True,
         optimizer=optimizer)
     observations = {
         bandit_spec_utils.GLOBAL_FEATURE_KEY:
         tf.constant([[1, 2], [3, 4]], dtype=tf.float32),
         bandit_spec_utils.PER_ARM_FEATURE_KEY:
         tf.cast(tf.reshape(tf.range(24), shape=[2, 4, 3]),
                 dtype=tf.float32),
         bandit_spec_utils.NUM_ACTIONS_FEATURE_KEY:
         tf.ones([2], dtype=tf.int32)
     }
     actions = np.array([0, 3], dtype=np.int32)
     objectives = np.array([[1, 2, 3], [6, 5, 4]], dtype=np.float32)
     initial_step, final_step = _get_initial_and_final_steps(
         observations, objectives)
     action_step = policy_step.PolicyStep(
         action=tf.convert_to_tensor(actions),
         info=policy_utilities.PerArmPolicyInfo(
             chosen_arm_features=np.array([[1, 2, 3], [3, 2, 1]],
                                          dtype=np.float32)))
     experience = _get_experience(initial_step, action_step, final_step)
     agent.train(experience, None)
     self.evaluate(tf.compat.v1.initialize_all_variables())
Example #13
 def testComputeMaskFromMultipleSourcesMask(self):
   observation_spec = bandit_spec_utils.create_per_arm_observation_spec(
       4, 5, 6)
   time_step_spec = ts.time_step_spec(observation_spec)
   action_spec = tensor_spec.BoundedTensorSpec((), tf.int32, 0, 5)
   constraint_net = (
       global_and_arm_feature_network.create_feed_forward_common_tower_network(
           observation_spec, (3, 4), (4, 3), (2, 3)))
   neural_constraint = constraints.NeuralConstraint(
       time_step_spec,
       action_spec,
       constraint_network=constraint_net)
   original_mask = [[1, 1, 1, 1, 0, 0], [1, 1, 1, 0, 0, 0]]
   observations = ({
       'global': tf.constant([[1, 2, 3, 4], [5, 6, 7, 8]], dtype=tf.float32),
       'per_arm': tf.reshape(tf.range(60, dtype=tf.float32), shape=[2, 6, 5]),
   }, original_mask)
   mask = constraints.construct_mask_from_multiple_sources(
       observations, lambda x: (x[0], x[1]), [neural_constraint], 6)
   self.assertAllGreaterEqual(original_mask - mask, 0)
 def testTrainPerArmAgentWithMask(self):
     num_actions = 4
     obs_spec = bandit_spec_utils.create_per_arm_observation_spec(
         2, 3, num_actions)
     mask_obs_spec = (obs_spec,
                      tensor_spec.BoundedTensorSpec(shape=[num_actions],
                                                    minimum=0,
                                                    maximum=1,
                                                    dtype=tf.float32))
     time_step_spec = ts.time_step_spec(mask_obs_spec)
     reward_net = (global_and_arm_feature_network.
                   create_feed_forward_common_tower_network(
                       obs_spec, (4, 3), (3, 4), (4, 2)))
     optimizer = tf.compat.v1.train.GradientDescentOptimizer(
         learning_rate=0.1)
     agent = greedy_agent.GreedyRewardPredictionAgent(
         time_step_spec,
         self._action_spec,
         reward_network=reward_net,
         observation_and_action_constraint_splitter=lambda x: [x[0], x[1]],
         accepts_per_arm_features=True,
         optimizer=optimizer)
     observations = ({
         bandit_spec_utils.GLOBAL_FEATURE_KEY:
         tf.constant([[1, 2], [3, 4]], dtype=tf.float32),
         bandit_spec_utils.PER_ARM_FEATURE_KEY:
         tf.cast(tf.reshape(tf.range(24), shape=[2, 4, 3]),
                 dtype=tf.float32)
     }, tf.ones([2, num_actions]))
     actions = np.array([0, 3], dtype=np.int32)
     rewards = np.array([0.5, 3.0], dtype=np.float32)
     initial_step, final_step = _get_initial_and_final_steps_with_action_mask(
         observations, rewards)
     action_step = policy_step.PolicyStep(
         action=tf.convert_to_tensor(actions),
         info=policy_utilities.PerArmPolicyInfo(
             chosen_arm_features=np.array([[1, 2, 3], [3, 2, 1]],
                                          dtype=np.float32)))
     experience = _get_experience(initial_step, action_step, final_step)
     agent.train(experience, None)
     self.evaluate(tf.compat.v1.initialize_all_variables())
Example #15
 def setUp(self):
   super(LinearBanditPolicyTest, self).setUp()
   self._obs_dim = 2
   self._num_actions = 5
   self._obs_spec = tensor_spec.TensorSpec([self._obs_dim], tf.float32)
   self._obs_spec_with_mask = (tensor_spec.TensorSpec([self._obs_dim],
                                                      tf.float32),
                               tensor_spec.TensorSpec([self._num_actions],
                                                      tf.int32))
   self._per_arm_obs_spec = bandit_spec_utils.create_per_arm_observation_spec(
       self._obs_dim, 4, self._num_actions, add_num_actions_feature=True)
   self._time_step_spec = ts.time_step_spec(self._obs_spec)
   self._time_step_spec_with_mask = ts.time_step_spec(self._obs_spec_with_mask)
   self._per_arm_time_step_spec = ts.time_step_spec(self._per_arm_obs_spec)
   self._alpha = 1.0
   self._action_spec = tensor_spec.BoundedTensorSpec(
       shape=(),
       dtype=tf.int32,
       minimum=0,
       maximum=self._num_actions - 1,
       name='action')
Example #16
  def testComputeMaskFromMultipleSourcesNumActionsFeature(self):
    observation_spec = bandit_spec_utils.create_per_arm_observation_spec(
        4, 5, 6, add_num_actions_feature=True)
    time_step_spec = ts.time_step_spec(observation_spec)
    action_spec = tensor_spec.BoundedTensorSpec((), tf.int32, 0, 5)
    constraint_net = (
        global_and_arm_feature_network.create_feed_forward_common_tower_network(
            observation_spec, (3, 4), (4, 3), (2, 3)))
    neural_constraint = constraints.NeuralConstraint(
        time_step_spec,
        action_spec,
        constraint_network=constraint_net)

    observations = {
        'global': tf.constant([[1, 2, 3, 4], [5, 6, 7, 8]], dtype=tf.float32),
        'per_arm': tf.reshape(tf.range(60, dtype=tf.float32), shape=[2, 6, 5]),
        'num_actions': tf.constant([4, 3], dtype=tf.int32)
    }
    mask = constraints.construct_mask_from_multiple_sources(
        observations, None, [neural_constraint], 6)
    implied_mask = [[1, 1, 1, 1, 0, 0], [1, 1, 1, 0, 0, 0]]
    self.assertAllGreaterEqual(implied_mask - mask, 0)
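The implied_mask above follows directly from the num_actions feature: with 6 arm slots and per-example action counts [4, 3], only the leading slots hold valid arms. The same mask can be computed with tf.sequence_mask (a hedged sketch of the idea, not necessarily how construct_mask_from_multiple_sources is implemented internally):

num_actions_feature = tf.constant([4, 3], dtype=tf.int32)
implied_mask = tf.sequence_mask(num_actions_feature, maxlen=6, dtype=tf.int32)
# -> [[1, 1, 1, 1, 0, 0], [1, 1, 1, 0, 0, 0]]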
  def testPerArmRewards(self):
    tf.compat.v1.set_random_seed(3000)
    obs_spec = bandit_spec_utils.create_per_arm_observation_spec(2, 3, 4)
    time_step_spec = ts.time_step_spec(obs_spec)
    action_spec = tensor_spec.BoundedTensorSpec((), tf.int32, 0, 3)
    reward_network = (
        global_and_arm_feature_network.create_feed_forward_common_tower_network(
            obs_spec, (4, 3), (3, 4), (4, 2)))

    policy = greedy_reward_policy.GreedyRewardPredictionPolicy(
        time_step_spec,
        action_spec,
        reward_network=reward_network,
        accepts_per_arm_features=True,
        emit_policy_info=('predicted_rewards_mean',))
    action_feature = tf.cast(
        tf.reshape(tf.random.shuffle(tf.range(24)), shape=[2, 4, 3]),
        dtype=tf.float32)
    observations = {
        bandit_spec_utils.GLOBAL_FEATURE_KEY:
            tf.constant([[1, 2], [3, 4]], dtype=tf.float32),
        bandit_spec_utils.PER_ARM_FEATURE_KEY: action_feature
    }
    time_step = ts.restart(observations, batch_size=2)
    action_step = policy.action(time_step, seed=1)
    self.assertEqual(action_step.action.shape.as_list(), [2])
    self.assertEqual(action_step.action.dtype, tf.int32)
    # Initialize all variables
    self.evaluate(tf.compat.v1.global_variables_initializer())
    action, p_info, first_arm_features = self.evaluate([
        action_step.action, action_step.info,
        observations[bandit_spec_utils.PER_ARM_FEATURE_KEY][0]
    ])
    self.assertAllEqual(action.shape, [2])
    self.assertAllEqual(p_info.predicted_rewards_mean.shape, [2, 4])
    self.assertAllEqual(p_info.chosen_arm_features.shape, [2, 3])
    first_action = action[0]
    self.assertAllEqual(p_info.chosen_arm_features[0],
                        first_arm_features[first_action])

    # Check that zeroing out some of the actions does not affect the predicted
    # rewards for unchanged actions. This is to make sure that action feature
    # padding does not influence the behavior.

    if not tf.executing_eagerly():
      # The comparison below only works in TF2, because in TF1 the random
      # per-arm observations get re-drawn on every session run.
      return
    padded_action_feature = tf.concat(
        [action_feature[:, 0:1, :],
         tf.zeros(shape=[2, 3, 3], dtype=tf.float32)],
        axis=1)
    observations = {
        bandit_spec_utils.GLOBAL_FEATURE_KEY:
            tf.constant([[1, 2], [3, 4]], dtype=tf.float32),
        bandit_spec_utils.PER_ARM_FEATURE_KEY: padded_action_feature
    }
    time_step = ts.restart(observations, batch_size=2)
    padded_action_step = policy.action(time_step, seed=1)
    padded_p_info = self.evaluate(padded_action_step.info)
    self.assertAllEqual(p_info.predicted_rewards_mean[:, 0],
                        padded_p_info.predicted_rewards_mean[:, 0])