def test_with_random_policy(self):

  def _global_context_sampling_fn():
    # One unbatched global context sample: an integer feature and a
    # one-element string feature.
    abc = np.array(['a', 'b', 'c'])
    return {
        'global1': np.random.randint(-2, 3, [3, 4]),
        # Sample an index in [0, 3) so all three values can appear.
        'global2': abc[np.random.randint(0, 3, [1])]
    }

  def _arm_context_sampling_fn():
    # One unbatched context sample for a single arm.
    aabbcc = np.array(['aa', 'bb', 'cc'])
    return {
        'arm1': np.random.randint(-3, 4, [5]),
        'arm2': np.random.randint(-3, 4, [3, 1]),
        # Sample an index in [0, 3) so all three values can appear.
        'arm3': aabbcc[np.random.randint(0, 3, [1])]
    }

  def _reward_fn(global_obs, arm_obs):
    # Deterministic reward from one global and one per-arm entry.
    return global_obs['global1'][2, 1] + arm_obs['arm1'][4]

  env = ssspe.StationaryStochasticStructuredPyEnvironment(
      _global_context_sampling_fn,
      _arm_context_sampling_fn,
      6,
      _reward_fn,
      batch_size=2)
  time_step_spec = env.time_step_spec()
  action_spec = array_spec.BoundedArraySpec(
      shape=(), minimum=0, maximum=5, dtype=np.int32)

  random_policy = random_py_policy.RandomPyPolicy(
      time_step_spec=time_step_spec, action_spec=action_spec)

  for _ in range(5):
    time_step = env.reset()
    self.assertTrue(
        check_unbatched_time_step_spec(
            time_step=time_step,
            time_step_spec=time_step_spec,
            batch_size=env.batch_size))

    action = random_policy.action(time_step).action
    self.assertAllEqual(action.shape, [2])
    self.assertAllGreaterEqual(action, 0)
    self.assertAllLess(action, 6)
    time_step = env.step(action)
    self.assertEqual(time_step.reward.shape, (2,))
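
# ---------------------------------------------------------------------------
# A minimal standalone sketch of the environment exercised by the test above.
# The test relies on its surrounding test module for the `ssspe`, `array_spec`
# and `random_py_policy` aliases and for the `check_unbatched_time_step_spec`
# helper; the import paths below are assumptions based on the usual TF-Agents
# source layout.
# ---------------------------------------------------------------------------
import numpy as np

from tf_agents.bandits.environments import stationary_stochastic_structured_py_environment as ssspe
from tf_agents.policies import random_py_policy
from tf_agents.specs import array_spec


def _global_sampling_fn():
  # One unbatched global context sample.
  return {'global1': np.random.randint(-2, 3, [3, 4])}


def _arm_sampling_fn():
  # One unbatched context sample for a single arm.
  return {'arm1': np.random.randint(-3, 4, [5])}


def _reward_fn(global_obs, arm_obs):
  # Deterministic reward computed from one global and one per-arm entry.
  return float(global_obs['global1'][2, 1] + arm_obs['arm1'][4])


env = ssspe.StationaryStochasticStructuredPyEnvironment(
    _global_sampling_fn,
    _arm_sampling_fn,
    6,  # Number of arms.
    _reward_fn,
    batch_size=2)

policy = random_py_policy.RandomPyPolicy(
    time_step_spec=env.time_step_spec(),
    action_spec=array_spec.BoundedArraySpec(
        shape=(), minimum=0, maximum=5, dtype=np.int32))

time_step = env.reset()
for _ in range(3):
  action = policy.action(time_step).action  # Shape [2]: one action per batch row.
  time_step = env.step(action)
  print('rewards:', time_step.reward)  # Shape (2,).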
def main(unused_argv):
  tf.compat.v1.enable_v2_behavior()  # The trainer only runs with V2 enabled.

  # Vocabulary of string features: '0', '1', ..., str(DICTIONARY_SIZE - 1).
  feature_dict = np.array([str(i) for i in range(DICTIONARY_SIZE)])

  def _global_context_sampling_fn():
    """Generates one sample of global features.

    It generates a dictionary of size `NUM_GLOBAL_FEATURES`, with the
    following syntax:

    {..., 'global_feature_4': ['43'], ...}

    That is, the values are one-element numpy arrays of strings.

    Returns:
      A dictionary with string keys and numpy string array values.
    """
    generated_features = feature_dict[np.random.randint(
        0, DICTIONARY_SIZE, [NUM_GLOBAL_FEATURES])]
    global_features = {
        'global_feature_{}'.format(i): generated_features[[i]]
        for i in range(NUM_GLOBAL_FEATURES)
    }
    return global_features

  def _arm_context_sampling_fn():
    """Generates one sample of arm features.

    It generates a dictionary of size `NUM_ARM_FEATURES`, with the following
    syntax:

    {..., 'arm_feature_7': ['29'], ...}

    That is, the values are one-element numpy arrays of strings. Note that
    the output sample is for one arm and one non-batched time step.

    Returns:
      A dictionary with string keys and numpy string array values.
    """
    generated_features = feature_dict[np.random.randint(
        0, DICTIONARY_SIZE, [NUM_ARM_FEATURES])]
    arm_features = {
        'arm_feature_{}'.format(i): generated_features[[i]]
        for i in range(NUM_ARM_FEATURES)
    }
    return arm_features

  def _reward_fn(global_features, arm_features):
    """Outputs a [0, 1] float given a sample.

    The output reward is generated by hashing the concatenation of feature
    keys and values, summing the hashes, taking the sum modulo 1000, and
    normalizing.

    Args:
      global_features: A dictionary with string keys and 1d string numpy
        array values.
      arm_features: A dictionary with string keys and 1d string numpy array
        values.

    Returns:
      A float value between 0 and 1.
    """
    hashed_global = 0
    for x, y in global_features.items():
      hashed_global += hash(x + y[0])
    hashed_arm = 0
    for x, y in arm_features.items():
      hashed_arm += hash(x + y[0])
    return (hashed_global + hashed_arm) % 1000 / 1000

  env = sspe.StationaryStochasticStructuredPyEnvironment(
      _global_context_sampling_fn,
      _arm_context_sampling_fn,
      NUM_ACTIONS,
      _reward_fn,
      batch_size=BATCH_SIZE)
  environment = tf_py_environment.TFPyEnvironment(env)

  def make_string_feature(name):
    # One-hot encodes a string feature over the shared vocabulary.
    return tf.feature_column.indicator_column(
        tf.feature_column.categorical_column_with_vocabulary_list(
            name, feature_dict))

  global_columns = [
      make_string_feature('global_feature_{}'.format(i))
      for i in range(NUM_GLOBAL_FEATURES)
  ]
  arm_columns = [
      make_string_feature('arm_feature_{}'.format(i))
      for i in range(NUM_ARM_FEATURES)
  ]
  obs_spec = environment.observation_spec()
  if FLAGS.agent == 'epsGreedy':
    network = (
        global_and_arm_feature_network
        .create_feed_forward_common_tower_network(
            obs_spec, (4, 3), (3, 4), (4, 2),
            global_preprocessing_combiner=tf.compat.v2.keras.layers
            .DenseFeatures(global_columns),
            arm_preprocessing_combiner=tf.compat.v2.keras.layers.DenseFeatures(
                arm_columns)))
    agent = neural_epsilon_greedy_agent.NeuralEpsilonGreedyAgent(
        time_step_spec=environment.time_step_spec(),
        action_spec=environment.action_spec(),
        reward_network=network,
        optimizer=tf.compat.v1.train.AdamOptimizer(learning_rate=LR),
        epsilon=EPSILON,
        accepts_per_arm_features=True,
        emit_policy_info=policy_utilities.InfoFields.PREDICTED_REWARDS_MEAN)
  elif FLAGS.agent == 'NeuralLinUCB':
    network = (
        global_and_arm_feature_network
        .create_feed_forward_common_tower_network(
            obs_spec, (40, 30), (30, 40), (40, 20), ENCODING_DIM,
            global_preprocessing_combiner=tf.compat.v2.keras.layers
            .DenseFeatures(global_columns),
            arm_preprocessing_combiner=tf.compat.v2.keras.layers.DenseFeatures(
                arm_columns)))
    agent = neural_linucb_agent.NeuralLinUCBAgent(
        time_step_spec=environment.time_step_spec(),
        action_spec=environment.action_spec(),
        encoding_network=network,
        encoding_network_num_train_steps=EPS_PHASE_STEPS,
        encoding_dim=ENCODING_DIM,
        optimizer=tf.compat.v1.train.AdamOptimizer(learning_rate=LR),
        alpha=1.0,
        gamma=1.0,
        epsilon_greedy=EPSILON,
        accepts_per_arm_features=True,
        debug_summaries=True,
        summarize_grads_and_vars=True,
        emit_policy_info=policy_utilities.InfoFields.PREDICTED_REWARDS_MEAN)

  if FLAGS.drop_arm_obs:
    drop_arm_feature_fn = bandit_spec_utils.drop_arm_observation
  else:
    drop_arm_feature_fn = None
  trainer.train(
      root_dir=FLAGS.root_dir,
      agent=agent,
      environment=environment,
      training_loops=TRAINING_LOOPS,
      steps_per_loop=STEPS_PER_LOOP,
      training_data_spec_transformation_fn=drop_arm_feature_fn)
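
# ---------------------------------------------------------------------------
# `main` above references module-level imports, flags and constants defined
# elsewhere in the example script. The sketch below reconstructs them; the
# import paths follow the usual TF-Agents layout, while the flag descriptions
# and hyperparameter values are illustrative assumptions, not the original
# settings.
# ---------------------------------------------------------------------------
from absl import app
from absl import flags
import numpy as np
import tensorflow as tf

from tf_agents.bandits.agents import neural_epsilon_greedy_agent
from tf_agents.bandits.agents import neural_linucb_agent
from tf_agents.bandits.agents.examples.v2 import trainer
from tf_agents.bandits.environments import stationary_stochastic_structured_py_environment as sspe
from tf_agents.bandits.networks import global_and_arm_feature_network
from tf_agents.bandits.policies import policy_utilities
from tf_agents.bandits.specs import utils as bandit_spec_utils
from tf_agents.environments import tf_py_environment

flags.DEFINE_string('root_dir', '/tmp/structured_bandit',
                    'Directory for checkpoints and summaries.')
flags.DEFINE_enum('agent', 'epsGreedy', ['epsGreedy', 'NeuralLinUCB'],
                  'Which agent to train.')
flags.DEFINE_bool('drop_arm_obs', False,
                  'Whether to drop arm observations from the training data.')
FLAGS = flags.FLAGS

# Illustrative values only; tune for real experiments.
DICTIONARY_SIZE = 100
NUM_GLOBAL_FEATURES = 10
NUM_ARM_FEATURES = 10
NUM_ACTIONS = 20
BATCH_SIZE = 8
TRAINING_LOOPS = 2000
STEPS_PER_LOOP = 2
LR = 0.005
EPSILON = 0.05
ENCODING_DIM = 20
EPS_PHASE_STEPS = 1000

if __name__ == '__main__':
  app.run(main)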