Example #1
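This example is a test method, so it relies on module-level imports and a spec-checking helper defined in the surrounding test file. A minimal sketch of what those might look like, with the module aliases and the helper signature inferred from the code below, is:

import numpy as np

from tf_agents.bandits.environments import stationary_stochastic_structured_py_environment as ssspe
from tf_agents.policies import random_py_policy
from tf_agents.specs import array_spec


def check_unbatched_time_step_spec(time_step, time_step_spec, batch_size):
    """Presumed test-module helper that validates `time_step` against the spec."""
    ...
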
    def test_with_random_policy(self):
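        # Each sampling function below draws one unbatched sample; the
        # environment calls them repeatedly to assemble batched observations
        # with `batch_size` global contexts and one context per arm.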
        def _global_context_sampling_fn():
            abc = np.array(['a', 'b', 'c'])
            return {
                'global1': np.random.randint(-2, 3, [3, 4]),
                'global2': abc[np.random.randint(0, 3, [1])]
            }

        def _arm_context_sampling_fn():
            aabbcc = np.array(['aa', 'bb', 'cc'])
            return {
                'arm1': np.random.randint(-3, 4, [5]),
                'arm2': np.random.randint(-3, 4, [3, 1]),
                'arm3': aabbcc[np.random.randint(0, 3, [1])]
            }

        def _reward_fn(global_obs, arm_obs):
            return global_obs['global1'][2, 1] + arm_obs['arm1'][4]

        env = ssspe.StationaryStochasticStructuredPyEnvironment(
            _global_context_sampling_fn,
            _arm_context_sampling_fn,
            6,  # num_actions: the environment has 6 arms.
            _reward_fn,
            batch_size=2)
        time_step_spec = env.time_step_spec()
        action_spec = array_spec.BoundedArraySpec(shape=(),
                                                  minimum=0,
                                                  maximum=5,
                                                  dtype=np.int32)

        random_policy = random_py_policy.RandomPyPolicy(
            time_step_spec=time_step_spec, action_spec=action_spec)

        for _ in range(5):
            time_step = env.reset()
            self.assertTrue(
                check_unbatched_time_step_spec(time_step=time_step,
                                               time_step_spec=time_step_spec,
                                               batch_size=env.batch_size))

            action = random_policy.action(time_step).action
            self.assertAllEqual(action.shape, [2])
            self.assertAllGreaterEqual(action, 0)
            self.assertAllLess(action, 6)
            time_step = env.step(action)
            self.assertEqual(time_step.reward.shape, (2, ))

Example #2
def main(unused_argv):
    tf.compat.v1.enable_v2_behavior()  # The trainer only runs with V2 enabled.

    feature_dict = np.array([str(i) for i in range(DICTIONARY_SIZE)])

    def _global_context_sampling_fn():
        """Generates one sample of global features.

        It generates a dictionary with `NUM_GLOBAL_FEATURES` entries of the
        following form:

        {...,
         'global_feature_4': ['43'],
         ...
        }

        That is, the values are one-element numpy arrays of strings.

        Returns:
          A dictionary with string keys and numpy string array values.
        """
        generated_features = feature_dict[np.random.randint(
            0, DICTIONARY_SIZE, [NUM_GLOBAL_FEATURES])]
        global_features = {
            'global_feature_{}'.format(i): generated_features[[i]]
            for i in range(NUM_GLOBAL_FEATURES)
        }
        return global_features

    def _arm_context_sampling_fn():
        """Generates one sample of arm features.

        It generates a dictionary with `NUM_ARM_FEATURES` entries of the
        following form:

        {...,
         'arm_feature_7': ['29'],
         ...
        }

        That is, the values are one-element numpy arrays of strings. Note that
        the output sample is for a single arm and a single unbatched time step.

        Returns:
          A dictionary with string keys and numpy string array values.
        """
        generated_features = feature_dict[np.random.randint(
            0, DICTIONARY_SIZE, [NUM_ARM_FEATURES])]
        arm_features = {
            'arm_feature_{}'.format(i): generated_features[[i]]
            for i in range(NUM_ARM_FEATURES)
        }
        return arm_features

    def _reward_fn(global_features, arm_features):
        """Outputs a [0, 1] float given a sample.

        The reward is computed by hashing the concatenation of every feature
        key and its value, summing the hashes, reducing the sum modulo 1000,
        and normalizing to [0, 1).

        Args:
          global_features: A dictionary with string keys and 1d string numpy
            array values.
          arm_features: A dictionary with string keys and 1d string numpy
            array values.

        Returns:
          A float value in [0, 1).
        """
        hashed_global = 0
        for x, y in global_features.items():
            hashed_global += hash(x + y[0])
        hashed_arm = 0
        for x, y in arm_features.items():
            hashed_arm += hash(x + y[0])
        return (hashed_global + hashed_arm) % 1000 / 1000
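    # For a fixed sample the reward is deterministic: the summed hashes are
    # reduced modulo 1000 and divided by 1000, so every reward lies in
    # [0, 0.999]. Note that Python salts `str` hashes per process, so rewards
    # are only reproducible across runs if PYTHONHASHSEED is fixed.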

    env = sspe.StationaryStochasticStructuredPyEnvironment(
        _global_context_sampling_fn,
        _arm_context_sampling_fn,
        NUM_ACTIONS,
        _reward_fn,
        batch_size=BATCH_SIZE)
    environment = tf_py_environment.TFPyEnvironment(env)

    def make_string_feature(name):
        return tf.feature_column.indicator_column(
            tf.feature_column.categorical_column_with_vocabulary_list(
                name, feature_dict))

    global_columns = [
        make_string_feature('global_feature_{}'.format(i))
        for i in range(NUM_GLOBAL_FEATURES)
    ]
    arm_columns = [
        make_string_feature('arm_feature_{}'.format(i))
        for i in range(NUM_ARM_FEATURES)
    ]
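    # The indicator columns above one-hot encode each string feature against
    # `feature_dict`; wrapped in `DenseFeatures` layers below, they act as
    # preprocessing combiners that turn the observation dictionaries into
    # dense tensors before they reach the common-tower network.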
    obs_spec = environment.observation_spec()
    if FLAGS.agent == 'epsGreedy':
        network = global_and_arm_feature_network.create_feed_forward_common_tower_network(
            obs_spec, (4, 3), (3, 4), (4, 2),
            global_preprocessing_combiner=tf.compat.v2.keras.layers.DenseFeatures(
                global_columns),
            arm_preprocessing_combiner=tf.compat.v2.keras.layers.DenseFeatures(
                arm_columns))
        agent = neural_epsilon_greedy_agent.NeuralEpsilonGreedyAgent(
            time_step_spec=environment.time_step_spec(),
            action_spec=environment.action_spec(),
            reward_network=network,
            optimizer=tf.compat.v1.train.AdamOptimizer(learning_rate=LR),
            epsilon=EPSILON,
            accepts_per_arm_features=True,
            emit_policy_info=(
                policy_utilities.InfoFields.PREDICTED_REWARDS_MEAN,))
    elif FLAGS.agent == 'NeuralLinUCB':
        network = global_and_arm_feature_network.create_feed_forward_common_tower_network(
            obs_spec, (40, 30), (30, 40), (40, 20), ENCODING_DIM,
            global_preprocessing_combiner=tf.compat.v2.keras.layers.DenseFeatures(
                global_columns),
            arm_preprocessing_combiner=tf.compat.v2.keras.layers.DenseFeatures(
                arm_columns))
        agent = neural_linucb_agent.NeuralLinUCBAgent(
            time_step_spec=environment.time_step_spec(),
            action_spec=environment.action_spec(),
            encoding_network=network,
            encoding_network_num_train_steps=EPS_PHASE_STEPS,
            encoding_dim=ENCODING_DIM,
            optimizer=tf.compat.v1.train.AdamOptimizer(learning_rate=LR),
            alpha=1.0,
            gamma=1.0,
            epsilon_greedy=EPSILON,
            accepts_per_arm_features=True,
            debug_summaries=True,
            summarize_grads_and_vars=True,
            emit_policy_info=(
                policy_utilities.InfoFields.PREDICTED_REWARDS_MEAN,))

    if FLAGS.drop_arm_obs:
        drop_arm_feature_fn = bandit_spec_utils.drop_arm_observation
    else:
        drop_arm_feature_fn = None
    trainer.train(root_dir=FLAGS.root_dir,
                  agent=agent,
                  environment=environment,
                  training_loops=TRAINING_LOOPS,
                  steps_per_loop=STEPS_PER_LOOP,
                  training_data_spec_transformation_fn=drop_arm_feature_fn)
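
The script above also references flags and module-level constants that are defined elsewhere in the example file. A minimal sketch of the definitions it appears to assume is shown below; the names are taken from the code, but the values and flag descriptions are illustrative only:

from absl import app
from absl import flags
import numpy as np
import tensorflow as tf

from tf_agents.bandits.agents import neural_epsilon_greedy_agent
from tf_agents.bandits.agents import neural_linucb_agent
from tf_agents.bandits.agents.examples.v2 import trainer
from tf_agents.bandits.environments import stationary_stochastic_structured_py_environment as sspe
from tf_agents.bandits.networks import global_and_arm_feature_network
from tf_agents.bandits.policies import policy_utilities
from tf_agents.bandits.specs import utils as bandit_spec_utils
from tf_agents.environments import tf_py_environment

flags.DEFINE_string('root_dir', '/tmp/structured_bandit', 'Output directory.')
flags.DEFINE_enum('agent', 'epsGreedy', ['epsGreedy', 'NeuralLinUCB'],
                  'Which agent to train.')
flags.DEFINE_bool('drop_arm_obs', False,
                  'Whether to drop arm observations from the training data.')
FLAGS = flags.FLAGS

# Illustrative values only; the original example may use different settings.
DICTIONARY_SIZE = 100
NUM_GLOBAL_FEATURES = 10
NUM_ARM_FEATURES = 3
NUM_ACTIONS = 20
BATCH_SIZE = 8
LR = 0.05
EPSILON = 0.05
ENCODING_DIM = 20
EPS_PHASE_STEPS = 1000
TRAINING_LOOPS = 100
STEPS_PER_LOOP = 2

if __name__ == '__main__':
    app.run(main)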