def testNeuralLinUCBUpdateNumTrainSteps10(self, batch_size=1, context_dim=10):
  """Check NeuralLinUCBAgent updates when behaving like eps-greedy."""

  # Construct a `Trajectory` for the given action, observation, reward.
  num_actions = 5
  initial_step, final_step = _get_initial_and_final_steps(
      batch_size, context_dim)
  action = np.random.randint(num_actions, size=batch_size, dtype=np.int32)
  action_step = _get_action_step(action)
  experience = _get_experience(initial_step, action_step, final_step)

  # Construct an agent and perform the update.
  observation_spec = tensor_spec.TensorSpec([context_dim], tf.float32)
  time_step_spec = time_step.time_step_spec(observation_spec)
  action_spec = tensor_spec.BoundedTensorSpec(
      dtype=tf.int32, shape=(), minimum=0, maximum=num_actions - 1)
  encoder = DummyNet(observation_spec)
  encoding_dim = 10
  variable_collection = neural_linucb_agent.NeuralLinUCBVariableCollection(
      num_actions, encoding_dim)
  agent = neural_linucb_agent.NeuralLinUCBAgent(
      time_step_spec=time_step_spec,
      action_spec=action_spec,
      encoding_network=encoder,
      encoding_network_num_train_steps=10,
      encoding_dim=encoding_dim,
      variable_collection=variable_collection,
      optimizer=tf.compat.v1.train.AdamOptimizer(learning_rate=0.001))
  loss_info, _ = agent.train(experience)
  self.evaluate(agent.initialize())
  self.evaluate(tf.compat.v1.global_variables_initializer())
  loss_value = self.evaluate(loss_info)
  self.assertGreater(loss_value, 0.0)
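# `DummyNet`, `_get_initial_and_final_steps`, `_get_action_step`, and
# `_get_experience` are referenced by the tests in this file but elided from
# this excerpt. The definitions below are a minimal sketch of what they
# plausibly look like, assuming the usual TF-Agents bandit test conventions
# (`driver_utils.trajectory_for_bandit` from `tf_agents.bandits.drivers`);
# the deterministic initializers and the +100.0 observation offset are
# illustrative choices, not the originals.
from tf_agents.bandits.drivers import driver_utils
from tf_agents.networks import network


class DummyNet(network.Network):
  """A one-layer encoder with deterministic weights for reproducible tests."""

  def __init__(self, observation_spec, encoding_dim=10, name='DummyNet'):
    super(DummyNet, self).__init__(
        input_tensor_spec=observation_spec, state_spec=(), name=name)
    self._dense = tf.keras.layers.Dense(
        encoding_dim,
        kernel_initializer=tf.constant_initializer(1.0),
        bias_initializer=tf.constant_initializer(0.0))

  def call(self, inputs, step_type=None, network_state=()):
    del step_type  # Unused by this stateless encoder.
    return self._dense(inputs), network_state


def _get_initial_and_final_steps(batch_size, context_dim):
  observation = np.arange(
      batch_size * context_dim, dtype=np.float32).reshape(
          [batch_size, context_dim])
  reward = np.random.uniform(0.0, 1.0, [batch_size]).astype(np.float32)
  initial_step = time_step.TimeStep(
      tf.constant(time_step.StepType.FIRST, dtype=tf.int32,
                  shape=[batch_size], name='step_type'),
      tf.zeros(shape=[batch_size], dtype=tf.float32, name='reward'),
      tf.ones(shape=[batch_size], dtype=tf.float32, name='discount'),
      tf.constant(observation, name='observation'))
  final_step = time_step.TimeStep(
      tf.constant(time_step.StepType.LAST, dtype=tf.int32,
                  shape=[batch_size], name='step_type'),
      tf.constant(reward, name='reward'),
      tf.ones(shape=[batch_size], dtype=tf.float32, name='discount'),
      tf.constant(observation + 100.0, name='observation'))
  return initial_step, final_step


def _get_action_step(action):
  return policy_step.PolicyStep(
      action=tf.convert_to_tensor(action), info=policy_utilities.PolicyInfo())


def _get_experience(initial_step, action_step, final_step):
  single_experience = driver_utils.trajectory_for_bandit(
      initial_step, action_step, final_step)
  # The agent expects batched experience with an explicit time dimension.
  return tf.nest.map_structure(
      lambda x: tf.expand_dims(tf.convert_to_tensor(x), 1), single_experience)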
def main(unused_argv):
  tf.compat.v1.enable_v2_behavior()  # The trainer only runs with V2 enabled.

  class LinearNormalReward(object):

    def __init__(self, theta):
      self.theta = theta

    def __call__(self, x):
      mu = np.dot(x, self.theta)
      return np.random.normal(mu, 1)

  def _global_context_sampling_fn():
    return np.random.randint(-10, 10, [4]).astype(np.float32)

  def _arm_context_sampling_fn():
    return np.random.randint(-2, 3, [5]).astype(np.float32)

  reward_fn = LinearNormalReward(HIDDEN_PARAM)

  observation_and_action_constraint_splitter = None
  num_actions_fn = None
  variable_action_method = bandit_spec_utils.VariableActionMethod.FIXED
  if FLAGS.add_num_actions_feature:
    num_actions_fn = lambda: NUM_ACTIONS
    variable_action_method = (
        bandit_spec_utils.VariableActionMethod.NUM_ACTIONS_FEATURE)

  env = sspe.StationaryStochasticPerArmPyEnvironment(
      _global_context_sampling_fn,
      _arm_context_sampling_fn,
      NUM_ACTIONS,
      reward_fn,
      num_actions_fn,
      batch_size=BATCH_SIZE,
      variable_action_method=variable_action_method)
  environment = tf_py_environment.TFPyEnvironment(env)

  if FLAGS.agent == 'LinUCB':
    agent = lin_ucb_agent.LinearUCBAgent(
        time_step_spec=environment.time_step_spec(),
        action_spec=environment.action_spec(),
        alpha=AGENT_ALPHA,
        accepts_per_arm_features=True,
        dtype=tf.float32)
  elif FLAGS.agent == 'LinTS':
    agent = lin_ts_agent.LinearThompsonSamplingAgent(
        time_step_spec=environment.time_step_spec(),
        action_spec=environment.action_spec(),
        alpha=AGENT_ALPHA,
        observation_and_action_constraint_splitter=(
            observation_and_action_constraint_splitter),
        accepts_per_arm_features=True,
        dtype=tf.float32)
  elif FLAGS.agent == 'epsGreedy':
    obs_spec = environment.observation_spec()
    if FLAGS.network == 'commontower':
      network = (
          global_and_arm_feature_network
          .create_feed_forward_common_tower_network(obs_spec, (40, 30),
                                                    (30, 40), (40, 20)))
    elif FLAGS.network == 'dotproduct':
      network = (
          global_and_arm_feature_network
          .create_feed_forward_dot_product_network(obs_spec, (4, 3, 6),
                                                   (3, 4, 6)))
    agent = neural_epsilon_greedy_agent.NeuralEpsilonGreedyAgent(
        time_step_spec=environment.time_step_spec(),
        action_spec=environment.action_spec(),
        reward_network=network,
        optimizer=tf.compat.v1.train.AdamOptimizer(learning_rate=LR),
        epsilon=EPSILON,
        observation_and_action_constraint_splitter=(
            observation_and_action_constraint_splitter),
        accepts_per_arm_features=True,
        emit_policy_info=policy_utilities.InfoFields.PREDICTED_REWARDS_MEAN)
  elif FLAGS.agent == 'NeuralLinUCB':
    obs_spec = environment.observation_spec()
    network = (
        global_and_arm_feature_network
        .create_feed_forward_common_tower_network(obs_spec, (40, 30), (30, 40),
                                                  (40, 20), ENCODING_DIM))
    agent = neural_linucb_agent.NeuralLinUCBAgent(
        time_step_spec=environment.time_step_spec(),
        action_spec=environment.action_spec(),
        encoding_network=network,
        encoding_network_num_train_steps=EPS_PHASE_STEPS,
        encoding_dim=ENCODING_DIM,
        optimizer=tf.compat.v1.train.AdamOptimizer(learning_rate=LR),
        alpha=1.0,
        gamma=1.0,
        epsilon_greedy=EPSILON,
        accepts_per_arm_features=True,
        debug_summaries=True,
        summarize_grads_and_vars=True,
        emit_policy_info=policy_utilities.InfoFields.PREDICTED_REWARDS_MEAN)

  def _all_rewards(observation, hidden_param):
    """Outputs rewards for all actions, given an observation."""
    hidden_param = tf.cast(hidden_param, dtype=tf.float32)
    global_obs = observation[bandit_spec_utils.GLOBAL_FEATURE_KEY]
    per_arm_obs = observation[bandit_spec_utils.PER_ARM_FEATURE_KEY]
    num_actions = tf.shape(per_arm_obs)[1]
    tiled_global = tf.tile(
        tf.expand_dims(global_obs, axis=1), [1, num_actions, 1])
    concatenated = tf.concat([tiled_global, per_arm_obs], axis=-1)
    rewards = tf.linalg.matvec(concatenated, hidden_param)
    return rewards

  def optimal_reward(observation, hidden_param):
    return tf.reduce_max(_all_rewards(observation, hidden_param), axis=1)

  def optimal_action(observation, hidden_param):
    return tf.argmax(
        _all_rewards(observation, hidden_param), axis=1, output_type=tf.int32)

  optimal_reward_fn = functools.partial(
      optimal_reward, hidden_param=HIDDEN_PARAM)
  optimal_action_fn = functools.partial(
      optimal_action, hidden_param=HIDDEN_PARAM)
  regret_metric = tf_bandit_metrics.RegretMetric(optimal_reward_fn)
  suboptimal_arms_metric = tf_bandit_metrics.SuboptimalArmsMetric(
      optimal_action_fn)

  if FLAGS.drop_arm_obs:
    drop_arm_feature_fn = bandit_spec_utils.drop_arm_observation
  else:
    drop_arm_feature_fn = None

  trainer.train(
      root_dir=FLAGS.root_dir,
      agent=agent,
      environment=environment,
      training_loops=TRAINING_LOOPS,
      steps_per_loop=STEPS_PER_LOOP,
      additional_metrics=[regret_metric, suboptimal_arms_metric],
      training_data_spec_transformation_fn=drop_arm_feature_fn)
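# `main` above depends on module-level flags and constants defined at the top
# of the example script and elided from this excerpt. The sketch below mirrors
# the names exactly as used in `main`; every concrete value is a placeholder,
# not the original setting.
from absl import flags

flags.DEFINE_string('root_dir', '/tmp/per_arm_bandit',
                    'Root directory for writing logs/summaries/checkpoints.')
flags.DEFINE_enum('agent', 'LinUCB',
                  ['LinUCB', 'LinTS', 'epsGreedy', 'NeuralLinUCB'],
                  'Which agent to train.')
flags.DEFINE_enum('network', 'commontower', ['commontower', 'dotproduct'],
                  'Which reward network the epsGreedy agent uses.')
flags.DEFINE_bool('add_num_actions_feature', False,
                  'Whether the environment exposes a num_actions feature.')
flags.DEFINE_bool('drop_arm_obs', False,
                  'Whether to drop arm observations from the training data.')
FLAGS = flags.FLAGS

BATCH_SIZE = 8           # placeholder
NUM_ACTIONS = 70         # placeholder
TRAINING_LOOPS = 2000    # placeholder
STEPS_PER_LOOP = 2       # placeholder
AGENT_ALPHA = 10.0       # placeholder
EPSILON = 0.05           # placeholder
LR = 0.05                # placeholder
EPS_PHASE_STEPS = 1000   # placeholder
ENCODING_DIM = 9         # placeholder
# The linear reward parameter must match the concatenated feature dimension
# used by `_all_rewards`: 4 global features + 5 per-arm features = 9.
HIDDEN_PARAM = np.random.uniform(-1.0, 1.0, [9])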
def testTrainPerArmAgent(self):
  num_actions = 5
  mask_spec = tensor_spec.BoundedTensorSpec(
      dtype=tf.int32, shape=(num_actions,), minimum=0, maximum=1)
  obs_spec = (bandit_spec_utils.create_per_arm_observation_spec(
      2, 3, num_actions), mask_spec)
  time_step_spec = time_step.time_step_spec(obs_spec)
  action_spec = tensor_spec.BoundedTensorSpec(
      dtype=tf.int32, shape=(), minimum=0, maximum=num_actions - 1)
  encoding_dim = 10
  encoder = (
      global_and_arm_feature_network.create_feed_forward_common_tower_network(
          obs_spec[0], (4, 3), (3, 4), (4, 2), encoding_dim))
  agent = neural_linucb_agent.NeuralLinUCBAgent(
      time_step_spec=time_step_spec,
      action_spec=action_spec,
      encoding_network=encoder,
      encoding_network_num_train_steps=10,
      encoding_dim=encoding_dim,
      observation_and_action_constraint_splitter=lambda x: (x[0], x[1]),
      accepts_per_arm_features=True,
      optimizer=tf.compat.v1.train.AdamOptimizer(learning_rate=0.001))
  observations = ({
      bandit_spec_utils.GLOBAL_FEATURE_KEY:
          tf.constant([[1, 2], [3, 4]], dtype=tf.float32),
      bandit_spec_utils.PER_ARM_FEATURE_KEY:
          tf.cast(tf.reshape(tf.range(30), shape=[2, 5, 3]), dtype=tf.float32)
  }, tf.ones(shape=(2, num_actions), dtype=tf.int32))
  actions = np.array([0, 3], dtype=np.int32)
  rewards = np.array([0.5, 3.0], dtype=np.float32)
  initial_step = time_step.TimeStep(
      tf.constant(
          time_step.StepType.FIRST, dtype=tf.int32, shape=[2],
          name='step_type'),
      tf.constant(0.0, dtype=tf.float32, shape=[2], name='reward'),
      tf.constant(1.0, dtype=tf.float32, shape=[2], name='discount'),
      observations)
  final_step = time_step.TimeStep(
      tf.constant(
          time_step.StepType.LAST, dtype=tf.int32, shape=[2],
          name='step_type'),
      tf.constant(rewards, dtype=tf.float32, name='reward'),
      tf.constant(1.0, dtype=tf.float32, shape=[2], name='discount'),
      observations)
  action_step = policy_step.PolicyStep(
      action=tf.convert_to_tensor(actions),
      info=policy_utilities.PerArmPolicyInfo(
          chosen_arm_features=np.array([[1, 2, 3], [3, 2, 1]],
                                       dtype=np.float32)))
  experience = _get_experience(initial_step, action_step, final_step)
  loss_info, _ = agent.train(experience, None)
  self.evaluate(tf.compat.v1.global_variables_initializer())
  loss_value = self.evaluate(loss_info)
  self.assertGreater(loss_value, 0.0)
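# A self-contained sketch of what the `observation_and_action_constraint_splitter`
# lambda above does: it separates the (observation, mask) tuple that a masked
# per-arm environment emits. Shapes follow the test's batch of 2 and 5 actions;
# the 'global'/'per_arm' keys match the `bandit_spec_utils` feature keys.
def _splitter_demo():
  num_actions = 5
  observation_and_mask = (
      {
          'global': tf.zeros([2, 2]),
          'per_arm': tf.zeros([2, num_actions, 3]),
      },
      tf.ones([2, num_actions], dtype=tf.int32))
  splitter = lambda x: (x[0], x[1])
  observation, mask = splitter(observation_and_mask)
  print(observation['per_arm'].shape)  # (2, 5, 3)
  print(mask.shape)                    # (2, 5)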
def testNeuralLinUCBUpdateDistributed(self, batch_size=1, context_dim=10):
  """Same as above, but uses the distributed LinUCB update."""

  # Construct a `Trajectory` for the given action, observation, reward.
  num_actions = 5
  initial_step, final_step = _get_initial_and_final_steps(
      batch_size, context_dim)
  action = np.random.randint(num_actions, size=batch_size, dtype=np.int32)
  action_step = _get_action_step(action)
  experience = _get_experience(initial_step, action_step, final_step)

  # Construct an agent and perform the update.
  observation_spec = tensor_spec.TensorSpec([context_dim], tf.float32)
  time_step_spec = time_step.time_step_spec(observation_spec)
  action_spec = tensor_spec.BoundedTensorSpec(
      dtype=tf.int32, shape=(), minimum=0, maximum=num_actions - 1)
  encoder = DummyNet(observation_spec)
  encoding_dim = 10
  agent = neural_linucb_agent.NeuralLinUCBAgent(
      time_step_spec=time_step_spec,
      action_spec=action_spec,
      encoding_network=encoder,
      encoding_network_num_train_steps=0,
      encoding_dim=encoding_dim,
      optimizer=tf.compat.v1.train.AdamOptimizer(learning_rate=1e-2))

  self.evaluate(agent.initialize())
  self.evaluate(tf.compat.v1.global_variables_initializer())

  # Call the distributed LinUCB training instead of agent.train().
  train_fn = common.function_in_tf1()(
      agent.compute_loss_using_linucb_distributed)
  reward = tf.cast(experience.reward, agent._dtype)
  loss_info = train_fn(experience.observation, action, reward, weights=None)
  self.evaluate(loss_info)
  final_a = self.evaluate(agent.cov_matrix)
  final_b = self.evaluate(agent.data_vector)

  # Compute the expected updated estimates.
  observations_list = tf.dynamic_partition(
      data=tf.reshape(
          tf.cast(experience.observation, tf.float64),
          [batch_size, context_dim]),
      partitions=tf.convert_to_tensor(action),
      num_partitions=num_actions)
  rewards_list = tf.dynamic_partition(
      data=tf.reshape(tf.cast(experience.reward, tf.float64), [batch_size]),
      partitions=tf.convert_to_tensor(action),
      num_partitions=num_actions)
  expected_a_updated_list = []
  expected_b_updated_list = []
  for observations_for_arm, rewards_for_arm in zip(observations_list,
                                                   rewards_list):
    encoded_observations_for_arm, _ = encoder(observations_for_arm)
    encoded_observations_for_arm = tf.cast(
        encoded_observations_for_arm, dtype=tf.float64)
    num_samples_for_arm = tf.cast(tf.shape(rewards_for_arm)[0], tf.float64)

    # pylint: disable=cell-var-from-loop
    def true_fn():
      a_new = tf.matmul(
          encoded_observations_for_arm,
          encoded_observations_for_arm,
          transpose_a=True)
      b_new = bandit_utils.sum_reward_weighted_observations(
          rewards_for_arm, encoded_observations_for_arm)
      return a_new, b_new

    def false_fn():
      return (tf.zeros([encoding_dim, encoding_dim], dtype=tf.float64),
              tf.zeros([encoding_dim], dtype=tf.float64))

    a_new, b_new = tf.cond(
        tf.squeeze(num_samples_for_arm) > 0, true_fn, false_fn)
    expected_a_updated_list.append(self.evaluate(a_new))
    expected_b_updated_list.append(self.evaluate(b_new))

  # Check that the actual updated estimates match the expectations.
  self.assertAllClose(expected_a_updated_list, final_a)
  self.assertAllClose(expected_b_updated_list, final_b)
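# The expectations computed above are the standard per-arm LinUCB statistics:
# for each arm k with encoded observation matrix X_k (one row per sample) and
# reward vector r_k,
#
#   A_k <- A_k + X_k^T X_k    and    b_k <- b_k + X_k^T r_k.
#
# A tiny NumPy check of the same arithmetic (illustrative values;
# `bandit_utils.sum_reward_weighted_observations` computes the X^T r term):
def _linucb_update_demo():
  encoded = np.array([[1.0, 2.0], [3.0, 4.0]])  # 2 samples, encoding_dim = 2
  rewards = np.array([0.5, 1.0])
  a_update = encoded.T @ encoded  # == tf.matmul(x, x, transpose_a=True)
  b_update = encoded.T @ rewards  # == sum_i r_i * x_i
  print(a_update)  # [[10. 14.] [14. 20.]]
  print(b_update)  # [3.5 5. ]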
def main(unused_argv):
  tf.compat.v1.enable_v2_behavior()  # The trainer only runs with V2 enabled.

  feature_dict = np.array([str(i) for i in range(DICTIONARY_SIZE)])

  def _global_context_sampling_fn():
    """Generates one sample of global features.

    It generates a dictionary of size `NUM_GLOBAL_FEATURES`, with the
    following syntax:

    {..., 'global_feature_4': ['43'], ...}

    That is, the values are one-element numpy arrays of strings.

    Returns:
      A dictionary with string keys and numpy string array values.
    """
    generated_features = feature_dict[np.random.randint(
        0, DICTIONARY_SIZE, [NUM_GLOBAL_FEATURES])]
    global_features = {
        'global_feature_{}'.format(i): generated_features[[i]]
        for i in range(NUM_GLOBAL_FEATURES)
    }
    return global_features

  def _arm_context_sampling_fn():
    """Generates one sample of arm features.

    It generates a dictionary of size `NUM_ARM_FEATURES`, with the following
    syntax:

    {..., 'arm_feature_7': ['29'], ...}

    That is, the values are one-element numpy arrays of strings. Note that
    the output sample is for one arm and one non-batched time step.

    Returns:
      A dictionary with string keys and numpy string array values.
    """
    generated_features = feature_dict[np.random.randint(
        0, DICTIONARY_SIZE, [NUM_ARM_FEATURES])]
    arm_features = {
        'arm_feature_{}'.format(i): generated_features[[i]]
        for i in range(NUM_ARM_FEATURES)
    }
    return arm_features

  def _reward_fn(global_features, arm_features):
    """Outputs a [0, 1] float given a sample.

    The reward is generated by hashing the concatenation of each feature key
    and its value, summing the hashes, taking the sum modulo 1000, and
    normalizing the result to [0, 1].

    Args:
      global_features: A dictionary with string keys and 1d string numpy
        array values.
      arm_features: A dictionary with string keys and 1d string numpy array
        values.

    Returns:
      A float value between 0 and 1.
    """
    hashed_global = 0
    for x, y in global_features.items():
      hashed_global += hash(x + y[0])
    hashed_arm = 0
    for x, y in arm_features.items():
      hashed_arm += hash(x + y[0])
    return (hashed_global + hashed_arm) % 1000 / 1000

  env = sspe.StationaryStochasticStructuredPyEnvironment(
      _global_context_sampling_fn,
      _arm_context_sampling_fn,
      NUM_ACTIONS,
      _reward_fn,
      batch_size=BATCH_SIZE)
  environment = tf_py_environment.TFPyEnvironment(env)

  def make_string_feature(name):
    return tf.feature_column.indicator_column(
        tf.feature_column.categorical_column_with_vocabulary_list(
            name, feature_dict))

  global_columns = [
      make_string_feature('global_feature_{}'.format(i))
      for i in range(NUM_GLOBAL_FEATURES)
  ]
  arm_columns = [
      make_string_feature('arm_feature_{}'.format(i))
      for i in range(NUM_ARM_FEATURES)
  ]
  obs_spec = environment.observation_spec()
  if FLAGS.agent == 'epsGreedy':
    network = (
        global_and_arm_feature_network
        .create_feed_forward_common_tower_network(
            obs_spec, (4, 3), (3, 4), (4, 2),
            global_preprocessing_combiner=tf.compat.v2.keras.layers
            .DenseFeatures(global_columns),
            arm_preprocessing_combiner=tf.compat.v2.keras.layers.DenseFeatures(
                arm_columns)))
    agent = neural_epsilon_greedy_agent.NeuralEpsilonGreedyAgent(
        time_step_spec=environment.time_step_spec(),
        action_spec=environment.action_spec(),
        reward_network=network,
        optimizer=tf.compat.v1.train.AdamOptimizer(learning_rate=LR),
        epsilon=EPSILON,
        accepts_per_arm_features=True,
        emit_policy_info=policy_utilities.InfoFields.PREDICTED_REWARDS_MEAN)
  elif FLAGS.agent == 'NeuralLinUCB':
    network = (
        global_and_arm_feature_network
        .create_feed_forward_common_tower_network(
            obs_spec, (40, 30), (30, 40), (40, 20), ENCODING_DIM,
            global_preprocessing_combiner=tf.compat.v2.keras.layers
            .DenseFeatures(global_columns),
            arm_preprocessing_combiner=tf.compat.v2.keras.layers.DenseFeatures(
                arm_columns)))
    agent = neural_linucb_agent.NeuralLinUCBAgent(
        time_step_spec=environment.time_step_spec(),
        action_spec=environment.action_spec(),
        encoding_network=network,
        encoding_network_num_train_steps=EPS_PHASE_STEPS,
        encoding_dim=ENCODING_DIM,
        optimizer=tf.compat.v1.train.AdamOptimizer(learning_rate=LR),
        alpha=1.0,
        gamma=1.0,
        epsilon_greedy=EPSILON,
        accepts_per_arm_features=True,
        debug_summaries=True,
        summarize_grads_and_vars=True,
        emit_policy_info=policy_utilities.InfoFields.PREDICTED_REWARDS_MEAN)

  if FLAGS.drop_arm_obs:
    drop_arm_feature_fn = bandit_spec_utils.drop_arm_observation
  else:
    drop_arm_feature_fn = None

  trainer.train(
      root_dir=FLAGS.root_dir,
      agent=agent,
      environment=environment,
      training_loops=TRAINING_LOOPS,
      steps_per_loop=STEPS_PER_LOOP,
      training_data_spec_transformation_fn=drop_arm_feature_fn)
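# Both example `main`s above are absl apps; the original scripts close with
# the standard entry point shown below. (The structured example additionally
# assumes module-level DICTIONARY_SIZE, NUM_GLOBAL_FEATURES, and
# NUM_ARM_FEATURES constants, elided from this excerpt.)
from absl import app

if __name__ == '__main__':
  app.run(main)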