Example 1
  def testTrainPerArmAgentWithConstraint(self):
    obs_spec = bandit_spec_utils.create_per_arm_observation_spec(2, 3, 4)
    reward_spec = {
        'reward': tensor_spec.TensorSpec(
            shape=(), dtype=tf.float32, name='reward'),
        'constraint': tensor_spec.TensorSpec(
            shape=(), dtype=tf.float32, name='constraint')
    }
    time_step_spec = ts.time_step_spec(obs_spec, reward_spec)
    reward_net = (
        global_and_arm_feature_network.create_feed_forward_common_tower_network(
            obs_spec, (4, 3), (3, 4), (4, 2)))
    optimizer = tf.compat.v1.train.GradientDescentOptimizer(learning_rate=0.1)
    constraint_net = (
        global_and_arm_feature_network.create_feed_forward_common_tower_network(
            obs_spec, (4, 3), (3, 4), (4, 2)))
    neural_constraint = constraints.NeuralConstraint(
        time_step_spec,
        self._action_spec,
        constraint_network=constraint_net)

    agent = greedy_agent.GreedyRewardPredictionAgent(
        time_step_spec,
        self._action_spec,
        reward_network=reward_net,
        accepts_per_arm_features=True,
        optimizer=optimizer,
        constraints=[neural_constraint])
    observations = {
        bandit_spec_utils.GLOBAL_FEATURE_KEY:
            tf.constant([[1, 2], [3, 4]], dtype=tf.float32),
        bandit_spec_utils.PER_ARM_FEATURE_KEY:
            tf.cast(
                tf.reshape(tf.range(24), shape=[2, 4, 3]), dtype=tf.float32)
    }
    actions = np.array([0, 3], dtype=np.int32)
    rewards = {
        'reward': np.array([0.5, 3.0], dtype=np.float32),
        'constraint': np.array([6.0, 4.0], dtype=np.float32)
    }
    initial_step, final_step = _get_initial_and_final_steps_nested_rewards(
        observations, rewards)
    action_step = policy_step.PolicyStep(
        action=tf.convert_to_tensor(actions),
        info=policy_utilities.PerArmPolicyInfo(
            chosen_arm_features=np.array([[1, 2, 3], [3, 2, 1]],
                                         dtype=np.float32)))
    experience = _get_experience(initial_step, action_step, final_step)
    agent.train(experience, None)
    self.evaluate(tf.compat.v1.initialize_all_variables())
 def testTrainPerArmAgent(self):
   obs_spec = bandit_spec_utils.create_per_arm_observation_spec(2, 3, 3)
   time_step_spec = ts.time_step_spec(obs_spec)
   reward_net = (
       global_and_arm_feature_network.create_feed_forward_common_tower_network(
           obs_spec, (4, 3), (3, 4), (4, 2)))
   optimizer = tf.compat.v1.train.GradientDescentOptimizer(learning_rate=0.1)
   agent = neural_epsilon_greedy_agent.NeuralEpsilonGreedyAgent(
       time_step_spec,
       self._action_spec,
       reward_network=reward_net,
       optimizer=optimizer,
       epsilon=0.1,
       accepts_per_arm_features=True)
   observations = {
       bandit_spec_utils.GLOBAL_FEATURE_KEY:
           tf.constant([[1, 2], [3, 4]], dtype=tf.float32),
       bandit_spec_utils.PER_ARM_FEATURE_KEY:
           tf.cast(
               tf.reshape(tf.range(18), shape=[2, 3, 3]), dtype=tf.float32)
   }
   time_steps = ts.restart(observations, batch_size=2)
   policy = agent.policy
   action_step = policy.action(time_steps)
   self.evaluate(tf.compat.v1.initialize_all_variables())
   actions = self.evaluate(action_step.action)
   self.assertAllEqual(actions.shape, (2,))
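
Both tests above hand-build the same kind of batched per-arm observation nest. Below is a minimal illustrative sketch of its layout, assuming batch_size=2, global_dim=2, per_arm_dim=3 and max_num_actions=4, with the key names written out as the plain strings that bandit_spec_utils.GLOBAL_FEATURE_KEY and PER_ARM_FEATURE_KEY resolve to elsewhere in these examples.

import tensorflow as tf

# Illustrative sketch only: the observation nest the tests above construct.
batch_size, global_dim, per_arm_dim, max_num_actions = 2, 2, 3, 4
observations = {
    # [batch_size, global_dim] global (context) features.
    'global': tf.constant([[1., 2.], [3., 4.]]),
    # [batch_size, max_num_actions, per_arm_dim] per-arm features.
    'per_arm': tf.reshape(
        tf.range(batch_size * max_num_actions * per_arm_dim, dtype=tf.float32),
        [batch_size, max_num_actions, per_arm_dim]),
}
print(observations['global'].shape)   # (2, 2)
print(observations['per_arm'].shape)  # (2, 4, 3)
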
Example 3
  def testComputeLossWithArmFeatures(self):
    obs_spec = bandit_spec_utils.create_per_arm_observation_spec(
        global_dim=2, per_arm_dim=3, max_num_actions=3)
    time_step_spec = ts.time_step_spec(obs_spec)
    constraint_net = (
        global_and_arm_feature_network.create_feed_forward_common_tower_network(
            obs_spec,
            global_layers=(4,),
            arm_layers=(4,),
            common_layers=(4,)))
    neural_constraint = constraints.NeuralConstraint(
        time_step_spec,
        self._action_spec,
        constraint_network=constraint_net)

    observations = {
        bandit_spec_utils.GLOBAL_FEATURE_KEY:
            tf.constant([[1, 2], [3, 4]], dtype=tf.float32),
        bandit_spec_utils.PER_ARM_FEATURE_KEY:
            tf.cast(
                tf.reshape(tf.range(18), shape=[2, 3, 3]), dtype=tf.float32)
    }
    actions = tf.constant([0, 1], dtype=tf.int32)
    rewards = tf.constant([0.5, 3.0], dtype=tf.float32)

    init_op = neural_constraint.initialize()
    if not tf.executing_eagerly():
      with self.cached_session() as sess:
        common.initialize_uninitialized_variables(sess)
        self.assertIsNone(sess.run(init_op))
    loss = neural_constraint.compute_loss(
        observations,
        actions,
        rewards)
    self.assertGreater(self.evaluate(loss), 0.0)
 def testTrainPerArmAgentWithMask(self):
   num_actions = 4
   obs_spec = bandit_spec_utils.create_per_arm_observation_spec(
       2, 3, num_actions, add_action_mask=True)
   time_step_spec = ts.time_step_spec(obs_spec)
   reward_net = (
       global_and_arm_feature_network.create_feed_forward_common_tower_network(
           obs_spec[0], (4, 3), (3, 4), (4, 2)))
   optimizer = tf.compat.v1.train.GradientDescentOptimizer(learning_rate=0.1)
   agent = greedy_agent.GreedyRewardPredictionAgent(
       time_step_spec,
       self._action_spec,
       reward_network=reward_net,
       observation_and_action_constraint_splitter=lambda x: [x[0], x[1]],
       accepts_per_arm_features=True,
       optimizer=optimizer)
   observations = ({
       bandit_spec_utils.GLOBAL_FEATURE_KEY:
           tf.constant([[1, 2], [3, 4]], dtype=tf.float32),
       bandit_spec_utils.PER_ARM_FEATURE_KEY:
           tf.cast(
               tf.reshape(tf.range(24), shape=[2, 4, 3]), dtype=tf.float32)
   }, tf.ones([2, num_actions], dtype=tf.int32))
   actions = np.array([0, 3], dtype=np.int32)
   rewards = np.array([0.5, 3.0], dtype=np.float32)
   initial_step, final_step = _get_initial_and_final_steps_with_action_mask(
       observations, rewards)
   action_step = policy_step.PolicyStep(
       action=tf.convert_to_tensor(actions),
       info=policy_utilities.PerArmPolicyInfo(
           chosen_arm_features=np.array([[1, 2, 3], [3, 2, 1]],
                                        dtype=np.float32)))
   experience = _get_experience(initial_step, action_step, final_step)
   agent.train(experience, None)
   self.evaluate(tf.compat.v1.initialize_all_variables())
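
With add_action_mask=True the observation is a (features, mask) tuple, and the observation_and_action_constraint_splitter above simply returns its two elements. A minimal sketch of that structure, using the same shapes as the test:

import tensorflow as tf

# Illustrative sketch only: the (features, mask) observation used with an
# action mask, and the splitter that separates them.
num_actions = 4
features = {
    'global': tf.constant([[1., 2.], [3., 4.]]),
    'per_arm': tf.reshape(tf.range(24, dtype=tf.float32), [2, num_actions, 3]),
}
mask = tf.ones([2, num_actions], dtype=tf.int32)  # 1 means the action is allowed
observation = (features, mask)

splitter = lambda obs: (obs[0], obs[1])  # same as the lambda passed to the agent
features_only, mask_only = splitter(observation)
print(mask_only.shape)  # (2, 4)
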
    def testCreateFeedForwardCommonTowerNetworkWithFeatureColumns(
            self, batch_size=2, feature_dim=4, num_actions=3):
        obs_spec = {
            'global': {
                'dense':
                tensor_spec.TensorSpec(shape=(feature_dim, ),
                                       dtype=tf.float32),
                'composer':
                tensor_spec.TensorSpec((), tf.string)
            },
            'per_arm': {
                'name': tensor_spec.TensorSpec((num_actions, ), tf.string),
                'fruit': tensor_spec.TensorSpec((num_actions, ), tf.string)
            }
        }
        columns_dense = tf.feature_column.numeric_column('dense',
                                                         shape=(feature_dim, ))
        columns_composer = tf.feature_column.indicator_column(
            tf.feature_column.categorical_column_with_vocabulary_list(
                'composer', ['wolfgang', 'amadeus', 'mozart']))

        columns_name = tf.feature_column.indicator_column(
            tf.feature_column.categorical_column_with_vocabulary_list(
                'name', ['bob', 'george', 'wanda']))
        columns_fruit = tf.feature_column.indicator_column(
            tf.feature_column.categorical_column_with_vocabulary_list(
                'fruit', ['banana', 'kiwi', 'pear']))

        net = gafn.create_feed_forward_common_tower_network(
            observation_spec=obs_spec,
            global_layers=(4, 3, 2),
            arm_layers=(6, 5, 4),
            common_layers=(7, 6, 5),
            global_preprocessing_combiner=tf.compat.v2.keras.layers.
            DenseFeatures([columns_dense, columns_composer]),
            arm_preprocessing_combiner=tf.compat.v2.keras.layers.DenseFeatures(
                [columns_name, columns_fruit]))
        input_nest = {
            'global': {
                'dense': tf.constant(np.random.rand(batch_size, feature_dim)),
                'composer': tf.constant(['wolfgang', 'mozart'])
            },
            'per_arm': {
                'name':
                tf.constant([[['george'], ['george'], ['george']],
                             [['bob'], ['bob'], ['bob']]]),
                'fruit':
                tf.constant([[['banana'], ['banana'], ['banana']],
                             [['kiwi'], ['kiwi'], ['kiwi']]])
            }
        }

        output, _ = net(input_nest)
        self.evaluate([
            tf.compat.v1.global_variables_initializer(),
            tf.compat.v1.tables_initializer()
        ])
        output = self.evaluate(output)
        self.assertAllEqual(output.shape, (batch_size, num_actions))
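
The preprocessing combiners above are standard Keras DenseFeatures layers; each indicator column turns a string feature into a one-hot vector before the towers see it. A small self-contained sketch of that conversion for the 'fruit' column (the other columns are handled the same way):

import tensorflow as tf

# Illustrative sketch only: one indicator column plus DenseFeatures applied to
# a batch of string features.
fruit_column = tf.feature_column.indicator_column(
    tf.feature_column.categorical_column_with_vocabulary_list(
        'fruit', ['banana', 'kiwi', 'pear']))
combiner = tf.keras.layers.DenseFeatures([fruit_column])
one_hot = combiner({'fruit': tf.constant([['banana'], ['pear']])})
print(one_hot.numpy())  # [[1. 0. 0.]
                        #  [0. 0. 1.]]
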
Example 6
    def testPerArmObservation(self, batch_size, actions_from_reward_layer):
        global_obs_dim = 7
        arm_obs_dim = 3
        obs_spec = bandit_spec_utils.create_per_arm_observation_spec(
            global_obs_dim, arm_obs_dim, self._num_actions)
        time_step_spec = ts.time_step_spec(obs_spec)
        dummy_net = arm_network.create_feed_forward_common_tower_network(
            obs_spec,
            global_layers=(3, 4, 5),
            arm_layers=(3, 2),
            common_layers=(4, 3),
            output_dim=self._encoding_dim)
        reward_layer = get_per_arm_reward_layer(
            encoding_dim=self._encoding_dim)

        policy = neural_linucb_policy.NeuralLinUCBPolicy(
            dummy_net,
            self._encoding_dim,
            reward_layer,
            actions_from_reward_layer=tf.constant(actions_from_reward_layer,
                                                  dtype=tf.bool),
            cov_matrix=self._a[0:1],
            data_vector=self._b[0:1],
            num_samples=self._num_samples_per_arm[0:1],
            epsilon_greedy=0.0,
            time_step_spec=time_step_spec,
            accepts_per_arm_features=True,
            emit_policy_info=('predicted_rewards_mean', ))

        current_time_step = self._per_arm_time_step_batch(
            batch_size=batch_size,
            global_obs_dim=global_obs_dim,
            arm_obs_dim=arm_obs_dim)
        action_step = policy.action(current_time_step)
        self.assertEqual(action_step.action.dtype, tf.int32)
        self.evaluate(tf.compat.v1.global_variables_initializer())
        action_fn = common.function_in_tf1()(policy.action)
        action_step = action_fn(current_time_step)

        input_observation = current_time_step.observation
        encoded_observation, _ = dummy_net(input_observation)

        if actions_from_reward_layer:
            predicted_rewards_from_reward_layer = reward_layer(
                encoded_observation)
            predicted_rewards_expected = self.evaluate(
                predicted_rewards_from_reward_layer).reshape(
                    (-1, self._num_actions))
        else:
            observation_numpy = self.evaluate(encoded_observation)
            predicted_rewards_expected = (
                self._get_predicted_rewards_from_per_arm_linucb(
                    observation_numpy, batch_size))

        p_info = self.evaluate(action_step.info)
        self.assertEqual(p_info.predicted_rewards_mean.dtype, np.float32)
        self.assertAllClose(p_info.predicted_rewards_mean,
                            predicted_rewards_expected)
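
When actions_from_reward_layer is false, the expected values come from _get_predicted_rewards_from_per_arm_linucb, a helper defined elsewhere in the test class and not shown here. For orientation only, the textbook per-arm LinUCB point estimate it is presumably based on looks like the sketch below; whether the helper adds regularization or an exploration term is not visible from this snippet.

import numpy as np

# Illustrative sketch only: textbook LinUCB point estimate for one arm.
encoding_dim = 3
A = 2.0 * np.eye(encoding_dim)   # plays the role of cov_matrix above
b = np.array([1.0, 0.0, 2.0])    # plays the role of data_vector above
x = np.array([0.5, 1.0, -1.0])   # encoded observation for this arm
theta = np.linalg.solve(A, b)    # theta = A^{-1} b
predicted_reward = float(theta @ x)
print(predicted_reward)
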
Example 7
 def testTrainPerArmAgentVariableActions(self):
   num_actions = 5
   obs_spec = bandit_spec_utils.create_per_arm_observation_spec(
       2, 3, num_actions, add_num_actions_feature=True)
   time_step_spec = time_step.time_step_spec(obs_spec)
   action_spec = tensor_spec.BoundedTensorSpec(
       dtype=tf.int32, shape=(), minimum=0, maximum=num_actions - 1)
   encoding_dim = 10
   encoder = (
       global_and_arm_feature_network.create_feed_forward_common_tower_network(
           obs_spec, (4, 3), (3, 4), (4, 2), encoding_dim))
   agent = neural_linucb_agent.NeuralLinUCBAgent(
       time_step_spec=time_step_spec,
       action_spec=action_spec,
       encoding_network=encoder,
       encoding_network_num_train_steps=10,
       encoding_dim=encoding_dim,
       accepts_per_arm_features=True,
       optimizer=tf.compat.v1.train.AdamOptimizer(learning_rate=0.001))
   observations = {
       bandit_spec_utils.GLOBAL_FEATURE_KEY:
           tf.constant([[1, 2], [3, 4]], dtype=tf.float32),
       bandit_spec_utils.PER_ARM_FEATURE_KEY:
           tf.cast(
               tf.reshape(tf.range(30), shape=[2, 5, 3]), dtype=tf.float32),
       bandit_spec_utils.NUM_ACTIONS_FEATURE_KEY:
           tf.constant([3, 4], dtype=tf.int32)
   }
   actions = np.array([0, 3], dtype=np.int32)
   rewards = np.array([0.5, 3.0], dtype=np.float32)
   initial_step = time_step.TimeStep(
       tf.constant(
           time_step.StepType.FIRST,
           dtype=tf.int32,
           shape=[2],
           name='step_type'),
       tf.constant(0.0, dtype=tf.float32, shape=[2], name='reward'),
       tf.constant(1.0, dtype=tf.float32, shape=[2], name='discount'),
       observations)
   final_step = time_step.TimeStep(
       tf.constant(
           time_step.StepType.LAST,
           dtype=tf.int32,
           shape=[2],
           name='step_type'),
       tf.constant(rewards, dtype=tf.float32, name='reward'),
       tf.constant(1.0, dtype=tf.float32, shape=[2], name='discount'),
       observations)
   action_step = policy_step.PolicyStep(
       action=tf.convert_to_tensor(actions),
       info=policy_utilities.PerArmPolicyInfo(
           chosen_arm_features=np.array([[1, 2, 3], [3, 2, 1]],
                                        dtype=np.float32)))
   experience = _get_experience(initial_step, action_step, final_step)
   loss_info, _ = agent.train(experience, None)
   self.evaluate(tf.compat.v1.initialize_all_variables())
   loss_value = self.evaluate(loss_info)
   self.assertGreater(loss_value, 0.0)
 def testCreateFeedForwardCommonTowerNetwork(self, batch_size, feature_dim,
                                             num_actions):
     obs_spec = bandit_spec_utils.create_per_arm_observation_spec(
         7, feature_dim, num_actions)
     net = gafn.create_feed_forward_common_tower_network(
         obs_spec, (4, 3, 2), (6, 5, 4), (7, 6, 5))
     input_nest = tensor_spec.sample_spec_nest(obs_spec,
                                               outer_dims=(batch_size, ))
     output, _ = net(input_nest)
     self.evaluate(tf.compat.v1.global_variables_initializer())
     output = self.evaluate(output)
     self.assertAllEqual(output.shape, (batch_size, num_actions))
Example 9
 def testCreateFeedForwardCommonTowerNetworkWithEmptyLayers(
         self, batch_size, feature_dim, num_actions):
     obs_spec = bandit_spec_utils.create_per_arm_observation_spec(
         7, feature_dim, num_actions)
     net = gafn.create_feed_forward_common_tower_network(obs_spec,
                                                         global_layers=(),
                                                         arm_layers=(),
                                                         common_layers=())
     input_nest = tensor_spec.sample_spec_nest(obs_spec,
                                               outer_dims=(batch_size, ))
     output, _ = net(input_nest)
     self.evaluate(tf.compat.v1.global_variables_initializer())
     output = self.evaluate(output)
     self.assertAllEqual(output.shape, (batch_size, num_actions))
  def testPerArmRewardsVariableNumActions(self):
    tf.compat.v1.set_random_seed(3000)
    obs_spec = bandit_spec_utils.create_per_arm_observation_spec(
        2, 3, 4, add_num_actions_feature=True)
    time_step_spec = ts.time_step_spec(obs_spec)
    action_spec = tensor_spec.BoundedTensorSpec((), tf.int32, 0, 3)
    reward_network = (
        global_and_arm_feature_network.create_feed_forward_common_tower_network(
            obs_spec, (4, 3), (3, 4), (4, 2)))

    policy = greedy_reward_policy.GreedyRewardPredictionPolicy(
        time_step_spec,
        action_spec,
        reward_network=reward_network,
        accepts_per_arm_features=True,
        emit_policy_info=('predicted_rewards_mean',))
    action_feature = tf.cast(
        tf.reshape(tf.random.shuffle(tf.range(24)), shape=[2, 4, 3]),
        dtype=tf.float32)
    observations = {
        bandit_spec_utils.GLOBAL_FEATURE_KEY:
            tf.constant([[1, 2], [3, 4]], dtype=tf.float32),
        bandit_spec_utils.PER_ARM_FEATURE_KEY:
            action_feature,
        bandit_spec_utils.NUM_ACTIONS_FEATURE_KEY:
            tf.constant([2, 3], dtype=tf.int32)
    }
    time_step = ts.restart(observations, batch_size=2)
    action_step = policy.action(time_step, seed=1)
    self.assertEqual(action_step.action.shape.as_list(), [2])
    self.assertEqual(action_step.action.dtype, tf.int32)
    # Initialize all variables
    self.evaluate(tf.compat.v1.global_variables_initializer())
    action, p_info, first_arm_features = self.evaluate([
        action_step.action, action_step.info,
        observations[bandit_spec_utils.PER_ARM_FEATURE_KEY][0]
    ])
    self.assertAllEqual(action.shape, [2])
    self.assertAllEqual(p_info.predicted_rewards_mean.shape, [2, 4])
    self.assertAllEqual(p_info.chosen_arm_features.shape, [2, 3])
    first_action = action[0]
    self.assertAllEqual(p_info.chosen_arm_features[0],
                        first_arm_features[first_action])
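
The final assertion checks that the policy reports, for each batch element, the feature row of the arm it actually chose. The same relationship, written out directly in NumPy:

import numpy as np

# Illustrative sketch only: chosen_arm_features is the per-arm feature row
# selected by the emitted action, batch element by batch element.
arm_features = np.arange(24, dtype=np.float32).reshape(2, 4, 3)
actions = np.array([1, 3])
chosen_arm_features = arm_features[np.arange(2), actions]  # shape (2, 3)
assert np.array_equal(chosen_arm_features[0], arm_features[0, actions[0]])
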
Example 11
 def testComputeMaskFromMultipleSourcesMask(self):
   observation_spec = bandit_spec_utils.create_per_arm_observation_spec(
       4, 5, 6)
   time_step_spec = ts.time_step_spec(observation_spec)
   action_spec = tensor_spec.BoundedTensorSpec((), tf.int32, 0, 5)
   constraint_net = (
       global_and_arm_feature_network.create_feed_forward_common_tower_network(
           observation_spec, (3, 4), (4, 3), (2, 3)))
   neural_constraint = constraints.NeuralConstraint(
       time_step_spec,
       action_spec,
       constraint_network=constraint_net)
   original_mask = [[1, 1, 1, 1, 0, 0], [1, 1, 1, 0, 0, 0]]
   observations = ({
       'global': tf.constant([[1, 2, 3, 4], [5, 6, 7, 8]], dtype=tf.float32),
       'per_arm': tf.reshape(tf.range(60, dtype=tf.float32), shape=[2, 6, 5]),
   }, original_mask)
   mask = constraints.construct_mask_from_multiple_sources(
       observations, lambda x: (x[0], x[1]), [neural_constraint], 6)
   self.assertAllGreaterEqual(original_mask - mask, 0)
 def testTrainPerArmAgent(self):
   obs_spec = bandit_spec_utils.create_per_arm_observation_spec(
       2, 3, 4, add_num_actions_feature=True)
   time_step_spec = ts.time_step_spec(
       observation_spec=obs_spec,
       reward_spec=tensor_spec.TensorSpec([3], tf.float32))
   action_spec = tensor_spec.BoundedTensorSpec((), tf.int32, 0, 3)
   objective_networks = [
       global_and_arm_feature_network.create_feed_forward_common_tower_network(
           obs_spec, (4, 3), (3, 4), (4, 2)) for _ in range(3)
   ]
   optimizer = tf.compat.v1.train.GradientDescentOptimizer(learning_rate=0.01)
   agent = greedy_multi_objective_agent.GreedyMultiObjectiveNeuralAgent(
       time_step_spec,
       action_spec,
       self._scalarizer,
       objective_networks=objective_networks,
       accepts_per_arm_features=True,
       optimizer=optimizer)
   observations = {
       bandit_spec_utils.GLOBAL_FEATURE_KEY:
           tf.constant([[1, 2], [3, 4]], dtype=tf.float32),
       bandit_spec_utils.PER_ARM_FEATURE_KEY:
           tf.cast(
               tf.reshape(tf.range(24), shape=[2, 4, 3]), dtype=tf.float32),
       bandit_spec_utils.NUM_ACTIONS_FEATURE_KEY:
           tf.ones([2], dtype=tf.int32)
   }
   actions = np.array([0, 3], dtype=np.int32)
   objectives = np.array([[1, 2, 3], [6, 5, 4]], dtype=np.float32)
   initial_step, final_step = _get_initial_and_final_steps(
       observations, objectives)
   action_step = policy_step.PolicyStep(
       action=tf.convert_to_tensor(actions),
       info=policy_utilities.PerArmPolicyInfo(
           chosen_arm_features=np.array([[1, 2, 3], [3, 2, 1]],
                                        dtype=np.float32)))
   experience = _get_experience(initial_step, action_step, final_step)
   agent.train(experience, None)
   self.evaluate(tf.compat.v1.initialize_all_variables())
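
self._scalarizer is created in the test's setUp and is not shown here; it reduces the three per-step objective values to a single scalar the greedy agent can rank arms by. A linear scalarization with hypothetical weights, sketched below, is the simplest example of such a reduction; the scalarizer the test actually uses may differ.

import numpy as np

# Illustrative sketch only: linear scalarization of three objectives with
# hypothetical weights; not necessarily the scalarizer used by the test.
objectives = np.array([[1., 2., 3.], [6., 5., 4.]], dtype=np.float32)
weights = np.array([0.5, 0.3, 0.2], dtype=np.float32)
scalarized = objectives @ weights  # shape (2,), one scalar per batch element
print(scalarized)
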
Example 13
  def testComputeMaskFromMultipleSourcesNumActionsFeature(self):
    observation_spec = bandit_spec_utils.create_per_arm_observation_spec(
        4, 5, 6, add_num_actions_feature=True)
    time_step_spec = ts.time_step_spec(observation_spec)
    action_spec = tensor_spec.BoundedTensorSpec((), tf.int32, 0, 5)
    constraint_net = (
        global_and_arm_feature_network.create_feed_forward_common_tower_network(
            observation_spec, (3, 4), (4, 3), (2, 3)))
    neural_constraint = constraints.NeuralConstraint(
        time_step_spec,
        action_spec,
        constraint_network=constraint_net)

    observations = {
        'global': tf.constant([[1, 2, 3, 4], [5, 6, 7, 8]], dtype=tf.float32),
        'per_arm': tf.reshape(tf.range(60, dtype=tf.float32), shape=[2, 6, 5]),
        'num_actions': tf.constant([4, 3], dtype=tf.int32)
    }
    mask = constraints.construct_mask_from_multiple_sources(
        observations, None, [neural_constraint], 6)
    implied_mask = [[1, 1, 1, 1, 0, 0], [1, 1, 1, 0, 0, 0]]
    self.assertAllGreaterEqual(implied_mask - mask, 0)
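
The implied mask in the assertion is just the 'num_actions' feature expanded into a fixed-width 0/1 mask. One way to reproduce those expected values directly:

import tensorflow as tf

# Illustrative sketch only: expanding a per-example action count into a 0/1
# mask over max_num_actions slots, matching the expected values above.
num_actions = tf.constant([4, 3])
max_num_actions = 6
implied_mask = tf.cast(tf.sequence_mask(num_actions, max_num_actions), tf.int32)
print(implied_mask.numpy())  # [[1 1 1 1 0 0]
                             #  [1 1 1 0 0 0]]
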
  def testPerArmRewardsSparseObs(self):
    tf.compat.v1.set_random_seed(3000)
    obs_spec = {
        'global': {'sport': tensor_spec.TensorSpec((), tf.string)},
        'per_arm': {
            'name': tensor_spec.TensorSpec((3,), tf.string),
            'fruit': tensor_spec.TensorSpec((3,), tf.string)
        }
    }
    columns_a = tf.feature_column.indicator_column(
        tf.feature_column.categorical_column_with_vocabulary_list(
            'name', ['bob', 'george', 'wanda']))
    columns_b = tf.feature_column.indicator_column(
        tf.feature_column.categorical_column_with_vocabulary_list(
            'fruit', ['banana', 'kiwi', 'pear']))
    columns_c = tf.feature_column.indicator_column(
        tf.feature_column.categorical_column_with_vocabulary_list(
            'sport', ['bridge', 'chess', 'snooker']))

    reward_network = (
        global_and_arm_feature_network.create_feed_forward_common_tower_network(
            observation_spec=obs_spec,
            global_layers=(4, 3, 2),
            arm_layers=(6, 5, 4),
            common_layers=(7, 6, 5),
            global_preprocessing_combiner=(
                tf.compat.v2.keras.layers.DenseFeatures([columns_c])),
            arm_preprocessing_combiner=tf.compat.v2.keras.layers.DenseFeatures(
                [columns_a, columns_b])))

    time_step_spec = ts.time_step_spec(obs_spec)
    action_spec = tensor_spec.BoundedTensorSpec((), tf.int32, 0, 2)
    policy = greedy_reward_policy.GreedyRewardPredictionPolicy(
        time_step_spec,
        action_spec,
        reward_network=reward_network,
        accepts_per_arm_features=True,
        emit_policy_info=('predicted_rewards_mean',))
    observations = {
        'global': {
            'sport': tf.constant(['snooker', 'chess'])
        },
        'per_arm': {
            'name':
                tf.constant([['george', 'george', 'george'],
                             ['bob', 'bob', 'bob']]),
            'fruit':
                tf.constant([['banana', 'banana', 'banana'],
                             ['kiwi', 'kiwi', 'kiwi']])
        }
    }

    time_step = ts.restart(observations, batch_size=2)
    action_step = policy.action(time_step, seed=1)
    self.assertEqual(action_step.action.shape.as_list(), [2])
    self.assertEqual(action_step.action.dtype, tf.int32)
    # Initialize all variables
    self.evaluate([
        tf.compat.v1.global_variables_initializer(),
        tf.compat.v1.tables_initializer()
    ])
    action, p_info, first_arm_name_feature = self.evaluate([
        action_step.action, action_step.info,
        observations[bandit_spec_utils.PER_ARM_FEATURE_KEY]['name'][0]
    ])
    self.assertAllEqual(action.shape, [2])
    self.assertAllEqual(p_info.predicted_rewards_mean.shape, [2, 3])
    self.assertAllEqual(p_info.chosen_arm_features['name'].shape, [2])
    self.assertAllEqual(p_info.chosen_arm_features['fruit'].shape, [2])
    first_action = action[0]
    self.assertAllEqual(p_info.chosen_arm_features['name'][0],
                        first_arm_name_feature[first_action])
  def testPerArmRewards(self):
    tf.compat.v1.set_random_seed(3000)
    obs_spec = bandit_spec_utils.create_per_arm_observation_spec(2, 3, 4)
    time_step_spec = ts.time_step_spec(obs_spec)
    action_spec = tensor_spec.BoundedTensorSpec((), tf.int32, 0, 3)
    reward_network = (
        global_and_arm_feature_network.create_feed_forward_common_tower_network(
            obs_spec, (4, 3), (3, 4), (4, 2)))

    policy = greedy_reward_policy.GreedyRewardPredictionPolicy(
        time_step_spec,
        action_spec,
        reward_network=reward_network,
        accepts_per_arm_features=True,
        emit_policy_info=('predicted_rewards_mean',))
    action_feature = tf.cast(
        tf.reshape(tf.random.shuffle(tf.range(24)), shape=[2, 4, 3]),
        dtype=tf.float32)
    observations = {
        bandit_spec_utils.GLOBAL_FEATURE_KEY:
            tf.constant([[1, 2], [3, 4]], dtype=tf.float32),
        bandit_spec_utils.PER_ARM_FEATURE_KEY: action_feature
    }
    time_step = ts.restart(observations, batch_size=2)
    action_step = policy.action(time_step, seed=1)
    self.assertEqual(action_step.action.shape.as_list(), [2])
    self.assertEqual(action_step.action.dtype, tf.int32)
    # Initialize all variables
    self.evaluate(tf.compat.v1.global_variables_initializer())
    action, p_info, first_arm_features = self.evaluate([
        action_step.action, action_step.info,
        observations[bandit_spec_utils.PER_ARM_FEATURE_KEY][0]
    ])
    self.assertAllEqual(action.shape, [2])
    self.assertAllEqual(p_info.predicted_rewards_mean.shape, [2, 4])
    self.assertAllEqual(p_info.chosen_arm_features.shape, [2, 3])
    first_action = action[0]
    self.assertAllEqual(p_info.chosen_arm_features[0],
                        first_arm_features[first_action])

    # Check that zeroing out some of the actions does not affect the predicted
    # rewards for unchanged actions. This is to make sure that action feature
    # padding does not influence the behavior.

    if not tf.executing_eagerly():
      # The comparison below only works in TF2, because in TF1 the random
      # per-arm observations get re-drawn on each session run.
      return
    padded_action_feature = tf.concat(
        [action_feature[:, 0:1, :],
         tf.zeros(shape=[2, 3, 3], dtype=tf.float32)],
        axis=1)
    observations = {
        bandit_spec_utils.GLOBAL_FEATURE_KEY:
            tf.constant([[1, 2], [3, 4]], dtype=tf.float32),
        bandit_spec_utils.PER_ARM_FEATURE_KEY: padded_action_feature
    }
    time_step = ts.restart(observations, batch_size=2)
    padded_action_step = policy.action(time_step, seed=1)
    padded_p_info = self.evaluate(padded_action_step.info)
    self.assertAllEqual(p_info.predicted_rewards_mean[:, 0],
                        padded_p_info.predicted_rewards_mean[:, 0])
Example 16
def main(unused_argv):
  tf.compat.v1.enable_v2_behavior()  # The trainer only runs with V2 enabled.

  feature_dict = np.array([str(i) for i in range(DICTIONARY_SIZE)])
  def _global_context_sampling_fn():
    """Generates one sample of global features.

    It generates a dictionary with `NUM_GLOBAL_FEATURES` entries, of the
    following form:

    {...,
     'global_feature_4': ['43'],
     ...
    }

    That is, the values are one-element numpy arrays of strings.

    Returns:
      A dictionary with string keys and numpy string array values.
    """
    generated_features = feature_dict[np.random.randint(0, DICTIONARY_SIZE,
                                                        [NUM_GLOBAL_FEATURES])]
    global_features = {
        'global_feature_{}'.format(i): generated_features[[i]]
        for i in range(NUM_GLOBAL_FEATURES)
    }
    return global_features

  def _arm_context_sampling_fn():
    """Generates one sample of arm features.

    It generates a dictionary with `NUM_ARM_FEATURES` entries, of the
    following form:

    {...,
     'arm_feature_7': ['29'],
     ...
    }

    That is, the values are one-element numpy arrays of strings. Note that the
    output sample is for one arm and one non-batched time step.

    Returns:
      A dictionary with string keys and numpy string array values.
    """
    generated_features = feature_dict[np.random.randint(
        0, DICTIONARY_SIZE, [NUM_ARM_FEATURES])]
    arm_features = {
        'arm_feature_{}'.format(i): generated_features[[i]]
        for i in range(NUM_ARM_FEATURES)
    }
    return arm_features

  def _reward_fn(global_features, arm_features):
    """Outputs a [0, 1] float given a sample.

    The output reward is generated by hashing the concatenation of each feature
    key and value, summing the hashes, taking the sum modulo 1000, and dividing
    by 1000.

    Args:
      global_features: A dictionary with string keys and 1d string numpy array
        values.
      arm_features: A dictionary with string keys and 1d string numpy array
        values.

    Returns:
      A float value between 0 and 1.
    """
    hashed_global = 0
    for x, y in global_features.items():
      hashed_global += hash(x + y[0])
    hashed_arm = 0
    for x, y in arm_features.items():
      hashed_arm += hash(x + y[0])
    return (hashed_global + hashed_arm) % 1000 / 1000

  env = sspe.StationaryStochasticStructuredPyEnvironment(
      _global_context_sampling_fn,
      _arm_context_sampling_fn,
      NUM_ACTIONS,
      _reward_fn,
      batch_size=BATCH_SIZE)
  environment = tf_py_environment.TFPyEnvironment(env)

  def make_string_feature(name):
    return tf.feature_column.indicator_column(
        tf.feature_column.categorical_column_with_vocabulary_list(
            name, feature_dict))

  global_columns = [
      make_string_feature('global_feature_{}'.format(i))
      for i in range(NUM_GLOBAL_FEATURES)
  ]
  arm_columns = [
      make_string_feature('arm_feature_{}'.format(i))
      for i in range(NUM_ARM_FEATURES)
  ]
  obs_spec = environment.observation_spec()
  if FLAGS.agent == 'epsGreedy':
    network = (
        global_and_arm_feature_network.create_feed_forward_common_tower_network(
            obs_spec, (4, 3), (3, 4), (4, 2),
            global_preprocessing_combiner=tf.compat.v2.keras.layers
            .DenseFeatures(global_columns),
            arm_preprocessing_combiner=tf.compat.v2.keras.layers.DenseFeatures(
                arm_columns)))
    agent = neural_epsilon_greedy_agent.NeuralEpsilonGreedyAgent(
        time_step_spec=environment.time_step_spec(),
        action_spec=environment.action_spec(),
        reward_network=network,
        optimizer=tf.compat.v1.train.AdamOptimizer(learning_rate=LR),
        epsilon=EPSILON,
        accepts_per_arm_features=True,
        emit_policy_info=policy_utilities.InfoFields.PREDICTED_REWARDS_MEAN)
  elif FLAGS.agent == 'NeuralLinUCB':
    network = (
        global_and_arm_feature_network.create_feed_forward_common_tower_network(
            obs_spec, (40, 30), (30, 40), (40, 20),
            ENCODING_DIM,
            global_preprocessing_combiner=tf.compat.v2.keras.layers
            .DenseFeatures(global_columns),
            arm_preprocessing_combiner=tf.compat.v2.keras.layers.DenseFeatures(
                arm_columns)))
    agent = neural_linucb_agent.NeuralLinUCBAgent(
        time_step_spec=environment.time_step_spec(),
        action_spec=environment.action_spec(),
        encoding_network=network,
        encoding_network_num_train_steps=EPS_PHASE_STEPS,
        encoding_dim=ENCODING_DIM,
        optimizer=tf.compat.v1.train.AdamOptimizer(learning_rate=LR),
        alpha=1.0,
        gamma=1.0,
        epsilon_greedy=EPSILON,
        accepts_per_arm_features=True,
        debug_summaries=True,
        summarize_grads_and_vars=True,
        emit_policy_info=policy_utilities.InfoFields.PREDICTED_REWARDS_MEAN)

  if FLAGS.drop_arm_obs:
    drop_arm_feature_fn = bandit_spec_utils.drop_arm_observation
  else:
    drop_arm_feature_fn = None
  trainer.train(
      root_dir=FLAGS.root_dir,
      agent=agent,
      environment=environment,
      training_loops=TRAINING_LOOPS,
      steps_per_loop=STEPS_PER_LOOP,
      training_data_spec_transformation_fn=drop_arm_feature_fn)
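
main() refers to FLAGS and to several module-level constants (BATCH_SIZE, NUM_ACTIONS, LR, EPSILON, ENCODING_DIM, and so on) that are defined elsewhere in the example file. A minimal sketch of that scaffolding follows; the flag names come from the references in main(), while every default and constant value is a placeholder rather than the value used in the real example.

# Illustrative sketch only: placeholder flag definitions and constants assumed
# by main() above.
from absl import app, flags

flags.DEFINE_string('root_dir', '/tmp/per_arm_features', 'Output directory.')
flags.DEFINE_string('agent', 'epsGreedy', "One of 'epsGreedy' or 'NeuralLinUCB'.")
flags.DEFINE_bool('drop_arm_obs', False, 'Drop arm features before training.')
FLAGS = flags.FLAGS

BATCH_SIZE = 8
NUM_ACTIONS = 10
NUM_GLOBAL_FEATURES = 5
NUM_ARM_FEATURES = 5
DICTIONARY_SIZE = 100
TRAINING_LOOPS = 100
STEPS_PER_LOOP = 2
LR = 0.001
EPSILON = 0.05
ENCODING_DIM = 20
EPS_PHASE_STEPS = 1000

if __name__ == '__main__':
  app.run(main)
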
Example 17
    def testSparseObs(self, batch_size, actions_from_reward_layer):
        obs_spec = {
            'global': {
                'sport': tensor_spec.TensorSpec((), tf.string)
            },
            'per_arm': {
                'name': tensor_spec.TensorSpec((3, ), tf.string),
                'fruit': tensor_spec.TensorSpec((3, ), tf.string)
            }
        }
        columns_a = tf.feature_column.indicator_column(
            tf.feature_column.categorical_column_with_vocabulary_list(
                'name', ['bob', 'george', 'wanda']))
        columns_b = tf.feature_column.indicator_column(
            tf.feature_column.categorical_column_with_vocabulary_list(
                'fruit', ['banana', 'kiwi', 'pear']))
        columns_c = tf.feature_column.indicator_column(
            tf.feature_column.categorical_column_with_vocabulary_list(
                'sport', ['bridge', 'chess', 'snooker']))

        dummy_net = arm_network.create_feed_forward_common_tower_network(
            obs_spec,
            global_layers=(3, 4, 5),
            arm_layers=(3, 2),
            common_layers=(4, 3),
            output_dim=self._encoding_dim,
            global_preprocessing_combiner=(
                tf.compat.v2.keras.layers.DenseFeatures([columns_c])),
            arm_preprocessing_combiner=tf.compat.v2.keras.layers.DenseFeatures(
                [columns_a, columns_b]))
        time_step_spec = ts.time_step_spec(obs_spec)
        reward_layer = get_per_arm_reward_layer(
            encoding_dim=self._encoding_dim)
        policy = neural_linucb_policy.NeuralLinUCBPolicy(
            dummy_net,
            self._encoding_dim,
            reward_layer,
            actions_from_reward_layer=tf.constant(actions_from_reward_layer,
                                                  dtype=tf.bool),
            cov_matrix=self._a[0:1],
            data_vector=self._b[0:1],
            num_samples=self._num_samples_per_arm[0:1],
            epsilon_greedy=0.0,
            time_step_spec=time_step_spec,
            accepts_per_arm_features=True,
            emit_policy_info=('predicted_rewards_mean', ))
        observations = {
            'global': {
                'sport': tf.constant(['snooker', 'chess'])
            },
            'per_arm': {
                'name':
                tf.constant([['george', 'george', 'george'],
                             ['bob', 'bob', 'bob']]),
                'fruit':
                tf.constant([['banana', 'banana', 'banana'],
                             ['kiwi', 'kiwi', 'kiwi']])
            }
        }

        time_step = ts.restart(observations, batch_size=2)
        action_fn = common.function_in_tf1()(policy.action)
        action_step = action_fn(time_step, seed=1)
        self.assertEqual(action_step.action.shape.as_list(), [2])
        self.assertEqual(action_step.action.dtype, tf.int32)
        # Initialize all variables
        self.evaluate([
            tf.compat.v1.global_variables_initializer(),
            tf.compat.v1.tables_initializer()
        ])
        action = self.evaluate(action_step.action)
        self.assertAllEqual(action.shape, [2])
        p_info = self.evaluate(action_step.info)
        self.assertAllEqual(p_info.predicted_rewards_mean.shape, [2, 3])
        self.assertAllEqual(p_info.chosen_arm_features['name'].shape, [2])
        self.assertAllEqual(p_info.chosen_arm_features['fruit'].shape, [2])
        first_action = action[0]
        first_arm_name_feature = observations[
            bandit_spec_utils.PER_ARM_FEATURE_KEY]['name'][0]
        self.assertAllEqual(p_info.chosen_arm_features['name'][0],
                            first_arm_name_feature[first_action])