def get_agent_by_name(agent_name, time_step_spec, action_spec):
  """Constructs the bandit agent registered under `agent_name`.

  Args:
    agent_name: Name of the agent to build. Supported values are 'LinUCB',
      'LinTS', 'epsGreedy' and 'mix'.
    time_step_spec: Time step spec handed to every constructed agent.
    action_spec: Action spec handed to every constructed agent.

  Returns:
    The constructed agent. For 'mix' this is an `Exp3MixtureAgent` over four
    sub-agents, in this order: epsilon-greedy (epsilon=0.1), LinUCB,
    epsilon-greedy with epsilon=1.0 and epsilon-greedy with epsilon=0.5.

  Raises:
    ValueError: If `agent_name` is not one of the supported names.
  """
  if agent_name == 'LinUCB':
    return lin_ucb_agent.LinearUCBAgent(
        time_step_spec=time_step_spec,
        action_spec=action_spec,
        dtype=tf.float32)

  if agent_name == 'LinTS':
    return lin_ts_agent.LinearThompsonSamplingAgent(
        time_step_spec=time_step_spec,
        action_spec=action_spec,
        dtype=tf.float32)

  if agent_name == 'epsGreedy':
    network = q_network.QNetwork(
        input_tensor_spec=time_step_spec.observation,
        action_spec=action_spec,
        fc_layer_params=(50, 50, 50))
    return neural_epsilon_greedy_agent.NeuralEpsilonGreedyAgent(
        time_step_spec=time_step_spec,
        action_spec=action_spec,
        reward_network=network,
        optimizer=tf.compat.v1.train.AdamOptimizer(learning_rate=0.05),
        epsilon=0.1)

  if agent_name == 'mix':
    emit_policy_info = (policy_utilities.InfoFields.PREDICTED_REWARDS_MEAN,)
    # A single reward network instance is shared by all three epsilon-greedy
    # sub-agents; each sub-agent gets its own optimizer instance.
    network = q_network.QNetwork(
        input_tensor_spec=time_step_spec.observation,
        action_spec=action_spec,
        fc_layer_params=(50, 50, 50))

    def _make_eps_greedy(epsilon):
      """Builds an epsilon-greedy sub-agent on the shared network."""
      return neural_epsilon_greedy_agent.NeuralEpsilonGreedyAgent(
          time_step_spec=time_step_spec,
          action_spec=action_spec,
          reward_network=network,
          optimizer=tf.compat.v1.train.AdamOptimizer(learning_rate=0.05),
          emit_policy_info=emit_policy_info,
          epsilon=epsilon)

    # Construction order is kept the same as the order in the mixture.
    agent_epsgreedy = _make_eps_greedy(0.1)
    agent_linucb = lin_ucb_agent.LinearUCBAgent(
        time_step_spec=time_step_spec,
        action_spec=action_spec,
        emit_policy_info=emit_policy_info,
        dtype=tf.float32)
    agent_random = _make_eps_greedy(1.)
    agent_halfrandom = _make_eps_greedy(0.5)
    return exp3_mixture_agent.Exp3MixtureAgent(
        (agent_epsgreedy, agent_linucb, agent_random, agent_halfrandom))

  # Fail loudly instead of implicitly returning None for a mistyped name.
  raise ValueError(
      'Unknown agent name {!r}; expected one of '
      "'LinUCB', 'LinTS', 'epsGreedy', 'mix'.".format(agent_name))
def testUCBandThompsonSamplingShareVariables(self):
  """Trains LinUCB and LinTS agents that were built on one variable collection.

  After training each agent, the first covariance matrix / data vector seen
  through both agents is compared for equality. Skipped outside eager mode.
  """
  if not tf.executing_eagerly():
    self.skipTest('Test only works in eager mode.')

  context_dim, num_actions, batch_size = 9, 4, 7

  obs_spec = tensor_spec.TensorSpec([context_dim], tf.float32)
  ts_spec = time_step.time_step_spec(obs_spec)
  act_spec = tensor_spec.BoundedTensorSpec(
      dtype=tf.int32, shape=(), minimum=0, maximum=num_actions - 1)

  # Both agents receive the very same collection object.
  shared_variables = linear_agent.LinearBanditVariableCollection(
      context_dim=context_dim, num_models=num_actions)
  agent_kwargs = dict(
      time_step_spec=ts_spec,
      action_spec=act_spec,
      variable_collection=shared_variables)
  ucb_agent = lin_ucb_agent.LinearUCBAgent(**agent_kwargs)
  ts_agent = linear_thompson_sampling_agent.LinearThompsonSamplingAgent(
      **agent_kwargs)

  first_step, last_step = _get_initial_and_final_steps(
      batch_size, context_dim)
  chosen_actions = np.random.randint(
      num_actions, size=batch_size, dtype=np.int32)
  experience = _get_experience(
      first_step, _get_action_step(chosen_actions), last_step)

  self.evaluate(ucb_agent.train(experience))
  self.assertAllEqual(
      ucb_agent._variable_collection.cov_matrix_list[0],
      ts_agent._variable_collection.cov_matrix_list[0])

  self.evaluate(ts_agent.train(experience))
  self.assertAllEqual(
      ucb_agent._variable_collection.data_vector_list[0],
      ts_agent._variable_collection.data_vector_list[0])
def testAgentUpdate(self, batch_size, context_dim, num_agents):
  """Runs one training step of a `StaticMixtureAgent` over LinUCB sub-agents.

  The mixture uses equal weights (`[1] * num_agents`); the test only checks
  that initialization and training evaluate without error.
  """
  num_actions = 5
  obs_spec = tensor_spec.TensorSpec([context_dim], tf.float32)
  ts_spec = time_step.time_step_spec(obs_spec)
  act_spec = tensor_spec.BoundedTensorSpec(
      dtype=tf.int32, shape=(), minimum=0, maximum=num_actions - 1)

  sub_agents = [
      lin_ucb_agent.LinearUCBAgent(
          ts_spec,
          act_spec,
          emit_policy_info=(
              policy_utilities.InfoFields.PREDICTED_REWARDS_MEAN,))
      for _ in range(num_agents)
  ]
  mixture_agent = static_mixture_agent.StaticMixtureAgent(
      [1] * num_agents, sub_agents)

  first_step, last_step = _get_initial_and_final_steps(
      batch_size, context_dim)
  chosen_actions = np.random.randint(
      num_actions, size=batch_size, dtype=np.int32)
  policy_step = _get_action_step(chosen_actions, num_agents, num_actions)
  experience = _get_experience(first_step, policy_step, last_step)

  # Sub-agents are initialized first, then the mixture itself.
  for sub_agent in sub_agents:
    self.evaluate(sub_agent.initialize())
  self.evaluate(mixture_agent.initialize())

  self.evaluate(mixture_agent.train(experience))
def main(unused_argv):
  """Trains the agent selected by `FLAGS.agent` on the MovieLens environment.

  Args:
    unused_argv: Unused positional command-line arguments.

  Raises:
    ValueError: If `FLAGS.data_path` is empty, or if `FLAGS.agent` is not one
      of 'LinUCB', 'LinTS', 'epsGreedy' or 'DropoutTS'.
  """
  tf.compat.v1.enable_v2_behavior()  # The trainer only runs with V2 enabled.

  data_path = FLAGS.data_path
  if not data_path:
    raise ValueError('Please specify the location of the data file.')
  env = movielens_py_environment.MovieLensPyEnvironment(
      data_path, RANK_K, BATCH_SIZE, num_movies=20)
  environment = tf_py_environment.TFPyEnvironment(env)

  optimal_reward_fn = functools.partial(
      environment_utilities.compute_optimal_reward_with_movielens_environment,
      environment=environment)
  optimal_action_fn = functools.partial(
      environment_utilities.compute_optimal_action_with_movielens_environment,
      environment=environment)

  time_step_spec = environment.time_step_spec()
  action_spec = environment.action_spec()

  if FLAGS.agent == 'LinUCB':
    agent = lin_ucb_agent.LinearUCBAgent(
        time_step_spec=time_step_spec,
        action_spec=action_spec,
        tikhonov_weight=0.001,
        alpha=AGENT_ALPHA,
        dtype=tf.float32)
  elif FLAGS.agent == 'LinTS':
    agent = lin_ts_agent.LinearThompsonSamplingAgent(
        time_step_spec=time_step_spec,
        action_spec=action_spec,
        dtype=tf.float32)
  elif FLAGS.agent == 'epsGreedy':
    network = q_network.QNetwork(
        input_tensor_spec=time_step_spec.observation,
        action_spec=action_spec,
        fc_layer_params=LAYERS)
    agent = eps_greedy_agent.NeuralEpsilonGreedyAgent(
        time_step_spec=time_step_spec,
        action_spec=action_spec,
        reward_network=network,
        optimizer=tf.compat.v1.train.AdamOptimizer(learning_rate=LR),
        epsilon=EPSILON)
  elif FLAGS.agent == 'DropoutTS':
    agent = dropout_ts_agent.DropoutThompsonSamplingAgent(
        time_step_spec=time_step_spec,
        action_spec=action_spec,
        dropout_rate=DROPOUT_RATE,
        network_layers=LAYERS,
        optimizer=tf.compat.v1.train.AdamOptimizer(learning_rate=LR))
  else:
    # Without this branch `agent` would be unbound at `trainer.train`.
    raise ValueError('Unsupported agent: {!r}'.format(FLAGS.agent))

  regret_metric = tf_bandit_metrics.RegretMetric(optimal_reward_fn)
  suboptimal_arms_metric = tf_bandit_metrics.SuboptimalArmsMetric(
      optimal_action_fn)

  trainer.train(
      root_dir=FLAGS.root_dir,
      agent=agent,
      environment=environment,
      training_loops=TRAINING_LOOPS,
      steps_per_loop=STEPS_PER_LOOP,
      additional_metrics=[regret_metric, suboptimal_arms_metric])
def testMixtureUpdate(self, batch_size, context_dim, num_agents):
  """Checks the reward aggregates of an `Exp3MixtureAgent` after one update.

  All aggregates but the last are expected to equal 0.999; the last one is
  expected to be `(1 - batch_size * 0.5 * num_agents) * 0.999`.
  """
  num_actions = 5
  obs_spec = tensor_spec.TensorSpec([context_dim], tf.float32)
  ts_spec = time_step.time_step_spec(obs_spec)
  act_spec = tensor_spec.BoundedTensorSpec(
      dtype=tf.int32, shape=(), minimum=0, maximum=num_actions - 1)

  sub_agents = [
      lin_ucb_agent.LinearUCBAgent(
          ts_spec,
          act_spec,
          emit_policy_info=(
              policy_utilities.InfoFields.PREDICTED_REWARDS_MEAN,))
      for _ in range(num_agents)
  ]
  mixed_agent = exp3_mixture_agent.Exp3MixtureAgent(sub_agents)

  first_step, last_step = _get_initial_and_final_steps(
      batch_size, context_dim)
  chosen_actions = np.random.randint(
      num_actions, size=batch_size, dtype=np.int32)
  policy_step = _get_action_step(chosen_actions, num_agents, num_actions)
  experience = _get_experience(first_step, policy_step, last_step)

  self.evaluate(mixed_agent.initialize())
  # The aggregates are read once before training; the value is discarded.
  self.evaluate(mixed_agent._variable_collection.reward_aggregates)
  self.evaluate(mixed_agent.train(experience))
  aggregates_after = self.evaluate(
      mixed_agent._variable_collection.reward_aggregates)

  self.assertAllInSet(aggregates_after[:num_agents - 1], [0.999])

  # Expected value of the last aggregate, built up step by step.
  agent_prob = 1 / num_agents
  per_step_update = 0.5 / agent_prob
  last_agent_update = 1 - batch_size * per_step_update
  self.assertAllClose(aggregates_after[-1], last_agent_update * 0.999)
def testAgentWithDifferentSubagentsUpdate(self):
  """Trains a mixture made of one LinUCB and one epsilon-greedy sub-agent.

  The mixture distribution is a categorical over `[0., 1.]`; the test only
  checks that initialization and one training step evaluate without error.
  """
  num_actions, context_dim, batch_size = 3, 2, 7
  info_fields = (policy_utilities.InfoFields.PREDICTED_REWARDS_MEAN,)

  obs_spec = tensor_spec.TensorSpec([context_dim], tf.float32)
  ts_spec = time_step.time_step_spec(obs_spec)
  act_spec = tensor_spec.BoundedTensorSpec(
      dtype=tf.int32, shape=(), minimum=0, maximum=num_actions - 1)

  linear_agent_ = lin_ucb_agent.LinearUCBAgent(
      ts_spec, act_spec, emit_policy_info=info_fields)

  reward_net = q_network.QNetwork(
      input_tensor_spec=obs_spec,
      action_spec=act_spec,
      fc_layer_params=(4, 3, 2))
  neural_agent = neural_epsilon_greedy_agent.NeuralEpsilonGreedyAgent(
      ts_spec,
      act_spec,
      reward_network=reward_net,
      emit_policy_info=info_fields,
      optimizer=tf.compat.v1.train.GradientDescentOptimizer(
          learning_rate=0.1),
      epsilon=0.1)

  mixture_distribution = tfd.Categorical(probs=tf.Variable([0., 1.]))
  mixed_agent = WeightRotatingMixtureAgent(
      mixture_distribution, [linear_agent_, neural_agent])

  first_step, last_step = _get_initial_and_final_steps(
      batch_size, context_dim)
  chosen_actions = np.random.randint(
      num_actions, size=batch_size, dtype=np.int32)
  policy_step = _get_action_step(chosen_actions, 2, num_actions)
  experience = _get_experience(first_step, policy_step, last_step)

  self.evaluate(mixed_agent.initialize())
  self.evaluate(mixed_agent.train(experience))
def main(unused_argv):
  """Trains the agent selected by `FLAGS.agent` on the covertype dataset.

  The dataset read from `FLAGS.covertype_csv` is wrapped in a
  `ClassificationBanditEnvironment` with a deterministic 7x7 identity reward.

  Args:
    unused_argv: Unused positional command-line arguments.

  Raises:
    ValueError: If `FLAGS.agent` is not 'LinUCB', 'LinTS' or 'epsGreedy'.
  """
  tf.compat.v1.enable_v2_behavior()  # The trainer only runs with V2 enabled.

  with tf.device('/CPU:0'):  # due to b/128333994
    covertype_dataset = dataset_utilities.convert_covertype_dataset(
        FLAGS.covertype_csv)
    covertype_reward_distribution = tfd.Independent(
        tfd.Deterministic(tf.eye(7)), reinterpreted_batch_ndims=2)
    environment = ce.ClassificationBanditEnvironment(
        covertype_dataset, covertype_reward_distribution, BATCH_SIZE)

    optimal_reward_fn = functools.partial(
        env_util.compute_optimal_reward_with_classification_environment,
        environment=environment)
    optimal_action_fn = functools.partial(
        env_util.compute_optimal_action_with_classification_environment,
        environment=environment)

    time_step_spec = environment.time_step_spec()
    action_spec = environment.action_spec()

    if FLAGS.agent == 'LinUCB':
      agent = lin_ucb_agent.LinearUCBAgent(
          time_step_spec=time_step_spec,
          action_spec=action_spec,
          alpha=AGENT_ALPHA,
          emit_log_probability=False,
          dtype=tf.float32)
    elif FLAGS.agent == 'LinTS':
      agent = lin_ts_agent.LinearThompsonSamplingAgent(
          time_step_spec=time_step_spec,
          action_spec=action_spec,
          alpha=AGENT_ALPHA,
          dtype=tf.float32)
    elif FLAGS.agent == 'epsGreedy':
      network = q_network.QNetwork(
          input_tensor_spec=time_step_spec.observation,
          action_spec=action_spec,
          fc_layer_params=LAYERS)
      agent = eps_greedy_agent.NeuralEpsilonGreedyAgent(
          time_step_spec=time_step_spec,
          action_spec=action_spec,
          reward_network=network,
          optimizer=tf.compat.v1.train.AdamOptimizer(learning_rate=LR),
          epsilon=EPSILON)
    else:
      # Without this branch `agent` would be unbound at `trainer.train`.
      raise ValueError('Unsupported agent: {!r}'.format(FLAGS.agent))

    regret_metric = tf_bandit_metrics.RegretMetric(optimal_reward_fn)
    suboptimal_arms_metric = tf_bandit_metrics.SuboptimalArmsMetric(
        optimal_action_fn)

    trainer.train(
        root_dir=FLAGS.root_dir,
        agent=agent,
        environment=environment,
        training_loops=TRAINING_LOOPS,
        steps_per_loop=STEPS_PER_LOOP,
        additional_metrics=[regret_metric, suboptimal_arms_metric])
def testInitializeAgent(
    self, batch_size, context_dim, dtype, use_eigendecomp=False):
  """Checks that a `LinearUCBAgent` can be constructed and initialized.

  Args:
    batch_size: Supplied by the test parameterization; not used here.
    context_dim: Size of the observation vector.
    dtype: Dtype passed to the agent.
    use_eigendecomp: Forwarded to the agent so that parameterizations that
      enable it actually exercise that configuration.
  """
  num_actions = 5
  observation_spec = tensor_spec.TensorSpec([context_dim], tf.float32)
  time_step_spec = time_step.time_step_spec(observation_spec)
  action_spec = tensor_spec.BoundedTensorSpec(
      dtype=tf.int32, shape=(), minimum=0, maximum=num_actions - 1)
  agent = lin_ucb_agent.LinearUCBAgent(
      time_step_spec=time_step_spec,
      action_spec=action_spec,
      dtype=dtype,
      use_eigendecomp=use_eigendecomp)
  self.evaluate(agent.initialize())
def main(unused_argv):
  """Trains the agent selected by `FLAGS.agent` on the wheel bandit.

  Args:
    unused_argv: Unused positional command-line arguments.

  Raises:
    ValueError: If `FLAGS.agent` is not 'LinUCB', 'LinTS' or 'epsGreedy'.
  """
  # Use the compat.v1 endpoint: the bare `tf.enable_resource_variables`
  # symbol is not available under the TF2 API.
  tf.compat.v1.enable_resource_variables()

  with tf.device('/CPU:0'):  # due to b/128333994
    env = wheel_py_environment.WheelPyEnvironment(
        DELTA, MU_BASE, STD_BASE, MU_HIGH, STD_HIGH, BATCH_SIZE)
    environment = tf_py_environment.TFPyEnvironment(env)

    optimal_reward_fn = functools.partial(
        environment_utilities.tf_wheel_bandit_compute_optimal_reward,
        delta=DELTA,
        mu_inside=MU_BASE[0],
        mu_high=MU_HIGH)
    optimal_action_fn = functools.partial(
        environment_utilities.tf_wheel_bandit_compute_optimal_action,
        delta=DELTA)

    time_step_spec = environment.time_step_spec()
    action_spec = environment.action_spec()

    if FLAGS.agent == 'LinUCB':
      agent = lin_ucb_agent.LinearUCBAgent(
          time_step_spec=time_step_spec,
          action_spec=action_spec,
          alpha=AGENT_ALPHA,
          dtype=tf.float32)
    elif FLAGS.agent == 'LinTS':
      agent = lin_ts_agent.LinearThompsonSamplingAgent(
          time_step_spec=time_step_spec,
          action_spec=action_spec,
          alpha=AGENT_ALPHA,
          dtype=tf.float32)
    elif FLAGS.agent == 'epsGreedy':
      network = q_network.QNetwork(
          input_tensor_spec=time_step_spec.observation,
          action_spec=action_spec,
          fc_layer_params=LAYERS)
      agent = eps_greedy_agent.NeuralEpsilonGreedyAgent(
          time_step_spec=time_step_spec,
          action_spec=action_spec,
          reward_network=network,
          optimizer=tf.compat.v1.train.AdamOptimizer(learning_rate=LR),
          epsilon=EPSILON)
    else:
      # Without this branch `agent` would be unbound at `trainer.train`.
      raise ValueError('Unsupported agent: {!r}'.format(FLAGS.agent))

    regret_metric = tf_bandit_metrics.RegretMetric(optimal_reward_fn)
    suboptimal_arms_metric = tf_bandit_metrics.SuboptimalArmsMetric(
        optimal_action_fn)

    trainer.train(
        root_dir=FLAGS.root_dir,
        agent=agent,
        environment=environment,
        training_loops=TRAINING_LOOPS,
        steps_per_loop=STEPS_PER_LOOP,
        additional_metrics=[regret_metric, suboptimal_arms_metric])
def testInitializeAgentEmptyObservationSpec(self):
  """Initializes a float32 `LinearUCBAgent` whose observation spec is scalar."""
  num_actions = 5
  scalar_obs_spec = tensor_spec.TensorSpec((), tf.float32)
  act_spec = tensor_spec.BoundedTensorSpec(
      dtype=tf.int32, shape=(), minimum=0, maximum=num_actions - 1)
  agent = lin_ucb_agent.LinearUCBAgent(
      time_step_spec=time_step.time_step_spec(scalar_obs_spec),
      action_spec=act_spec,
      dtype=tf.float32)
  self.evaluate(agent.initialize())
def main(unused_argv):
  """Trains the agent selected by `FLAGS.agent` on the mushroom dataset.

  The dataset read from `FLAGS.mushroom_csv` is wrapped in a
  `ClassificationBanditEnvironment` using the mushroom reward distribution.

  Args:
    unused_argv: Unused positional command-line arguments.

  Raises:
    ValueError: If `FLAGS.agent` is neither 'LinUCB' nor 'LinTS'.
  """
  tf.compat.v1.enable_v2_behavior()  # The trainer only runs with V2 enabled.

  with tf.device('/CPU:0'):  # due to b/128333994
    mushroom_reward_distribution = (
        dataset_utilities.mushroom_reward_distribution(
            r_noeat=0.0,
            r_eat_safe=5.0,
            r_eat_poison_bad=-35.0,
            r_eat_poison_good=5.0,
            prob_poison_bad=0.5))
    mushroom_dataset = (
        dataset_utilities.convert_mushroom_csv_to_tf_dataset(
            FLAGS.mushroom_csv))
    environment = ce.ClassificationBanditEnvironment(
        mushroom_dataset, mushroom_reward_distribution, BATCH_SIZE)

    optimal_reward_fn = functools.partial(
        env_util.compute_optimal_reward_with_classification_environment,
        environment=environment)
    optimal_action_fn = functools.partial(
        env_util.compute_optimal_action_with_classification_environment,
        environment=environment)

    time_step_spec = environment.time_step_spec()
    action_spec = environment.action_spec()

    if FLAGS.agent == 'LinUCB':
      agent = lin_ucb_agent.LinearUCBAgent(
          time_step_spec=time_step_spec,
          action_spec=action_spec,
          alpha=AGENT_ALPHA,
          gamma=0.95,
          emit_log_probability=False,
          dtype=tf.float32)
    elif FLAGS.agent == 'LinTS':
      agent = lin_ts_agent.LinearThompsonSamplingAgent(
          time_step_spec=time_step_spec,
          action_spec=action_spec,
          alpha=AGENT_ALPHA,
          gamma=0.95,
          dtype=tf.float32)
    else:
      # Without this branch `agent` would be unbound at `trainer.train`.
      raise ValueError('Unsupported agent: {!r}'.format(FLAGS.agent))

    regret_metric = tf_bandit_metrics.RegretMetric(optimal_reward_fn)
    suboptimal_arms_metric = tf_bandit_metrics.SuboptimalArmsMetric(
        optimal_action_fn)

    trainer.train(
        root_dir=FLAGS.root_dir,
        agent=agent,
        environment=environment,
        training_loops=TRAINING_LOOPS,
        steps_per_loop=STEPS_PER_LOOP,
        additional_metrics=[regret_metric, suboptimal_arms_metric])
def testInitializeAgent(self, batch_size, context_dim, num_agents,
                        emit_policy_info):
  """Initializes an `Exp3MixtureAgent` built from `num_agents` LinUCB agents.

  `batch_size` and `emit_policy_info` are accepted from the test
  parameterization but are not referenced in the body.
  """
  num_actions = 7
  obs_spec = tensor_spec.TensorSpec([context_dim], tf.float32)
  ts_spec = time_step.time_step_spec(obs_spec)
  act_spec = tensor_spec.BoundedTensorSpec(
      dtype=tf.int32, shape=(), minimum=0, maximum=num_actions - 1)

  sub_agents = []
  for _ in range(num_agents):
    sub_agents.append(lin_ucb_agent.LinearUCBAgent(ts_spec, act_spec))

  mixed_agent = exp3_mixture_agent.Exp3MixtureAgent(sub_agents)
  self.evaluate(mixed_agent.initialize())
def main(unused_argv):
  """Trains the agent selected by `FLAGS.agent` on a drifting linear bandit.

  Builds a `NonStationaryStochasticEnvironment` on top of
  `DriftingLinearDynamics` and uses the dynamics object to compute the
  optimal reward and action for the regret metrics.

  Args:
    unused_argv: Unused positional command-line arguments.

  Raises:
    ValueError: If `FLAGS.agent` is neither 'LinUCB' nor 'LinTS'.
  """
  tf.compat.v1.enable_v2_behavior()  # The trainer only runs with V2 enabled.

  with tf.device('/CPU:0'):  # due to b/128333994
    observation_shape = [CONTEXT_DIM]
    overall_shape = [BATCH_SIZE] + observation_shape
    observation_distribution = tfd.Normal(
        loc=tf.zeros(overall_shape), scale=tf.ones(overall_shape))

    action_shape = [NUM_ACTIONS]
    observation_to_reward_shape = observation_shape + action_shape
    observation_to_reward_distribution = tfd.Normal(
        loc=tf.zeros(observation_to_reward_shape),
        scale=tf.ones(observation_to_reward_shape))

    # NOTE(review): the *_VARIANCE constants are passed as `scale` — confirm
    # that this is the intended parameterization.
    drift_distribution = tfd.Normal(loc=DRIFT_MEAN, scale=DRIFT_VARIANCE)
    additive_reward_distribution = tfd.Normal(
        loc=tf.zeros(action_shape),
        scale=(REWARD_NOISE_VARIANCE * tf.ones(action_shape)))

    environment_dynamics = dle.DriftingLinearDynamics(
        observation_distribution,
        observation_to_reward_distribution,
        drift_distribution,
        additive_reward_distribution)
    environment = nse.NonStationaryStochasticEnvironment(
        environment_dynamics)

    time_step_spec = environment.time_step_spec()
    action_spec = environment.action_spec()

    if FLAGS.agent == 'LinUCB':
      agent = lin_ucb_agent.LinearUCBAgent(
          time_step_spec=time_step_spec,
          action_spec=action_spec,
          alpha=AGENT_ALPHA,
          gamma=0.95,
          emit_log_probability=False,
          dtype=tf.float32)
    elif FLAGS.agent == 'LinTS':
      agent = lin_ts_agent.LinearThompsonSamplingAgent(
          time_step_spec=time_step_spec,
          action_spec=action_spec,
          alpha=AGENT_ALPHA,
          gamma=0.95,
          dtype=tf.float32)
    else:
      # Without this branch `agent` would be unbound at `trainer.train`.
      raise ValueError('Unsupported agent: {!r}'.format(FLAGS.agent))

    regret_metric = tf_bandit_metrics.RegretMetric(
        environment.environment_dynamics.compute_optimal_reward)
    suboptimal_arms_metric = tf_bandit_metrics.SuboptimalArmsMetric(
        environment.environment_dynamics.compute_optimal_action)

    trainer.train(
        root_dir=FLAGS.root_dir,
        agent=agent,
        environment=environment,
        training_loops=TRAINING_LOOPS,
        steps_per_loop=STEPS_PER_LOOP,
        additional_metrics=[regret_metric, suboptimal_arms_metric])
def testInitializeAgent(self, batch_size, context_dim, num_agents):
  """Initializes a `WeightRotatingMixtureAgent` over LinUCB sub-agents.

  The mixture distribution is a categorical whose `probs` variable is
  `range(num_agents)` as float32. `batch_size` is not referenced in the body.
  """
  num_actions = 7
  obs_spec = tensor_spec.TensorSpec([context_dim], tf.float32)
  ts_spec = time_step.time_step_spec(obs_spec)
  act_spec = tensor_spec.BoundedTensorSpec(
      dtype=tf.int32, shape=(), minimum=0, maximum=num_actions - 1)

  sub_agents = []
  for _ in range(num_agents):
    sub_agents.append(lin_ucb_agent.LinearUCBAgent(ts_spec, act_spec))

  mixture_probs = tf.Variable(tf.range(num_agents, dtype=tf.float32))
  mixture_distribution = tfd.Categorical(probs=mixture_probs)
  mixed_agent = WeightRotatingMixtureAgent(mixture_distribution, sub_agents)
  self.evaluate(mixed_agent.initialize())
def main(unused_argv):
  """Trains the agent selected by `FLAGS.agent` on a stationary linear bandit.

  Rewards come from `sliding_linear_reward_fn_generator`; the same reward
  functions are used to compute the optimal reward and action for metrics.

  Args:
    unused_argv: Unused positional command-line arguments.

  Raises:
    ValueError: If `FLAGS.agent` is neither 'LinUCB' nor 'LinTS'.
  """
  tf.compat.v1.enable_resource_variables()

  with tf.device('/CPU:0'):  # due to b/128333994
    action_reward_fns = (
        environment_utilities.sliding_linear_reward_fn_generator(
            CONTEXT_DIM, NUM_ACTIONS, REWARD_NOISE_VARIANCE))

    env = sspe.StationaryStochasticPyEnvironment(
        functools.partial(
            environment_utilities.context_sampling_fn,
            batch_size=BATCH_SIZE,
            context_dim=CONTEXT_DIM),
        action_reward_fns,
        batch_size=BATCH_SIZE)
    environment = tf_py_environment.TFPyEnvironment(env)

    optimal_reward_fn = functools.partial(
        environment_utilities.tf_compute_optimal_reward,
        per_action_reward_fns=action_reward_fns)
    optimal_action_fn = functools.partial(
        environment_utilities.tf_compute_optimal_action,
        per_action_reward_fns=action_reward_fns)

    time_step_spec = environment.time_step_spec()
    action_spec = environment.action_spec()

    if FLAGS.agent == 'LinUCB':
      agent = lin_ucb_agent.LinearUCBAgent(
          time_step_spec=time_step_spec,
          action_spec=action_spec,
          alpha=AGENT_ALPHA,
          dtype=tf.float32)
    elif FLAGS.agent == 'LinTS':
      agent = lin_ts_agent.LinearThompsonSamplingAgent(
          time_step_spec=time_step_spec,
          action_spec=action_spec,
          alpha=AGENT_ALPHA,
          dtype=tf.float32)
    else:
      # Without this branch `agent` would be unbound at `trainer.train`.
      raise ValueError('Unsupported agent: {!r}'.format(FLAGS.agent))

    regret_metric = tf_bandit_metrics.RegretMetric(optimal_reward_fn)
    suboptimal_arms_metric = tf_bandit_metrics.SuboptimalArmsMetric(
        optimal_action_fn)

    trainer.train(
        root_dir=FLAGS.root_dir,
        agent=agent,
        environment=environment,
        training_loops=TRAINING_LOOPS,
        steps_per_loop=STEPS_PER_LOOP,
        additional_metrics=[regret_metric, suboptimal_arms_metric])
def main(unused_argv):
  """Trains the agent selected by `FLAGS.agent` on the wheel bandit.

  Supported agents are 'LinUCB', 'LinTS', 'epsGreedy', 'random' (an
  epsilon-greedy agent with epsilon=1) and 'Mix' (an `Exp3MixtureAgent` over
  epsilon-greedy, LinUCB, random and half-random sub-agents).

  Args:
    unused_argv: Unused positional command-line arguments.

  Raises:
    ValueError: If `FLAGS.agent` is not one of the supported names.
  """
  tf.compat.v1.enable_v2_behavior()  # The trainer only runs with V2 enabled.

  with tf.device('/CPU:0'):  # due to b/128333994
    env = wheel_py_environment.WheelPyEnvironment(
        DELTA, MU_BASE, STD_BASE, MU_HIGH, STD_HIGH, BATCH_SIZE)
    environment = tf_py_environment.TFPyEnvironment(env)

    optimal_reward_fn = functools.partial(
        environment_utilities.tf_wheel_bandit_compute_optimal_reward,
        delta=DELTA,
        mu_inside=MU_BASE[0],
        mu_high=MU_HIGH)
    optimal_action_fn = functools.partial(
        environment_utilities.tf_wheel_bandit_compute_optimal_action,
        delta=DELTA)

    time_step_spec = environment.time_step_spec()
    action_spec = environment.action_spec()

    # The network is built up front and reused by every neural agent below.
    network = q_network.QNetwork(
        input_tensor_spec=time_step_spec.observation,
        action_spec=action_spec,
        fc_layer_params=LAYERS)

    if FLAGS.agent == 'LinUCB':
      agent = lin_ucb_agent.LinearUCBAgent(
          time_step_spec=time_step_spec,
          action_spec=action_spec,
          alpha=AGENT_ALPHA,
          dtype=tf.float32)
    elif FLAGS.agent == 'LinTS':
      agent = lin_ts_agent.LinearThompsonSamplingAgent(
          time_step_spec=time_step_spec,
          action_spec=action_spec,
          alpha=AGENT_ALPHA,
          dtype=tf.float32)
    elif FLAGS.agent == 'epsGreedy':
      agent = eps_greedy_agent.NeuralEpsilonGreedyAgent(
          time_step_spec=time_step_spec,
          action_spec=action_spec,
          reward_network=network,
          optimizer=tf.compat.v1.train.AdamOptimizer(learning_rate=LR),
          epsilon=EPSILON)
    elif FLAGS.agent == 'random':
      agent = eps_greedy_agent.NeuralEpsilonGreedyAgent(
          time_step_spec=time_step_spec,
          action_spec=action_spec,
          reward_network=network,
          optimizer=tf.compat.v1.train.AdamOptimizer(learning_rate=LR),
          epsilon=1.)
    elif FLAGS.agent == 'Mix':
      emit_policy_info = (policy_utilities.InfoFields.PREDICTED_REWARDS_MEAN,)
      agent_epsgreedy = eps_greedy_agent.NeuralEpsilonGreedyAgent(
          time_step_spec=time_step_spec,
          action_spec=action_spec,
          reward_network=network,
          optimizer=tf.compat.v1.train.AdamOptimizer(learning_rate=LR),
          emit_policy_info=emit_policy_info,
          epsilon=EPSILON)
      agent_linucb = lin_ucb_agent.LinearUCBAgent(
          time_step_spec=time_step_spec,
          action_spec=action_spec,
          alpha=AGENT_ALPHA,
          emit_policy_info=emit_policy_info,
          dtype=tf.float32)
      agent_random = eps_greedy_agent.NeuralEpsilonGreedyAgent(
          time_step_spec=time_step_spec,
          action_spec=action_spec,
          reward_network=network,
          optimizer=tf.compat.v1.train.AdamOptimizer(learning_rate=LR),
          emit_policy_info=emit_policy_info,
          epsilon=1.)
      agent_halfrandom = eps_greedy_agent.NeuralEpsilonGreedyAgent(
          time_step_spec=time_step_spec,
          action_spec=action_spec,
          reward_network=network,
          optimizer=tf.compat.v1.train.AdamOptimizer(learning_rate=LR),
          emit_policy_info=emit_policy_info,
          epsilon=0.5)
      agent = exp3_mixture_agent.Exp3MixtureAgent(
          (agent_epsgreedy, agent_linucb, agent_random, agent_halfrandom))
    else:
      # Without this branch `agent` would be unbound at `trainer.train`.
      raise ValueError('Unsupported agent: {!r}'.format(FLAGS.agent))

    regret_metric = tf_bandit_metrics.RegretMetric(optimal_reward_fn)
    suboptimal_arms_metric = tf_bandit_metrics.SuboptimalArmsMetric(
        optimal_action_fn)

    trainer.train(
        root_dir=FLAGS.root_dir,
        agent=agent,
        environment=environment,
        training_loops=TRAINING_LOOPS,
        steps_per_loop=STEPS_PER_LOOP,
        additional_metrics=[regret_metric, suboptimal_arms_metric])
def main(unused_argv):
  """Trains the agent selected by `FLAGS.agent` on a structured linear bandit.

  Rewards come from `structured_linear_reward_fn_generator`. The 'epsGreedy'
  agent is additionally given a Laplacian matrix built over the ordinal
  integer actions, with a smoothing weight of 0.01.

  Args:
    unused_argv: Unused positional command-line arguments.

  Raises:
    ValueError: If `FLAGS.agent` is not 'LinUCB', 'epsGreedy' or 'LinTS'.
  """
  tf.compat.v1.enable_v2_behavior()  # The trainer only runs with V2 enabled.

  with tf.device('/CPU:0'):  # due to b/128333994
    action_reward_fns = (
        environment_utilities.structured_linear_reward_fn_generator(
            CONTEXT_DIM, NUM_ACTIONS, REWARD_NOISE_VARIANCE))

    env = sspe.StationaryStochasticPyEnvironment(
        functools.partial(
            environment_utilities.context_sampling_fn,
            batch_size=BATCH_SIZE,
            context_dim=CONTEXT_DIM),
        action_reward_fns,
        batch_size=BATCH_SIZE)
    environment = tf_py_environment.TFPyEnvironment(env)

    optimal_reward_fn = functools.partial(
        environment_utilities.tf_compute_optimal_reward,
        per_action_reward_fns=action_reward_fns)
    optimal_action_fn = functools.partial(
        environment_utilities.tf_compute_optimal_action,
        per_action_reward_fns=action_reward_fns)

    time_step_spec = environment.time_step_spec()
    action_spec = environment.action_spec()

    if FLAGS.agent == 'LinUCB':
      agent = lin_ucb_agent.LinearUCBAgent(
          time_step_spec=time_step_spec,
          action_spec=action_spec,
          alpha=AGENT_ALPHA,
          dtype=tf.float32)
    elif FLAGS.agent == 'epsGreedy':
      laplacian_matrix = utils.build_laplacian_over_ordinal_integer_actions(
          action_spec)
      network = q_network.QNetwork(
          input_tensor_spec=time_step_spec.observation,
          action_spec=action_spec,
          fc_layer_params=REWARD_NETWORK_LAYER_PARAMS)
      agent = eps_greedy_agent.NeuralEpsilonGreedyAgent(
          time_step_spec=time_step_spec,
          action_spec=action_spec,
          reward_network=network,
          optimizer=tf.compat.v1.train.AdamOptimizer(
              learning_rate=NN_LEARNING_RATE),
          epsilon=EPSILON,
          laplacian_matrix=laplacian_matrix,
          laplacian_smoothing_weight=0.01)
    elif FLAGS.agent == 'LinTS':
      agent = lin_ts_agent.LinearThompsonSamplingAgent(
          time_step_spec=time_step_spec,
          action_spec=action_spec,
          alpha=AGENT_ALPHA,
          dtype=tf.float32)
    else:
      # Without this branch `agent` would be unbound at `trainer.train`.
      raise ValueError('Unsupported agent: {!r}'.format(FLAGS.agent))

    regret_metric = tf_bandit_metrics.RegretMetric(optimal_reward_fn)
    suboptimal_arms_metric = tf_bandit_metrics.SuboptimalArmsMetric(
        optimal_action_fn)

    trainer.train(
        root_dir=FLAGS.root_dir,
        agent=agent,
        environment=environment,
        training_loops=TRAINING_LOOPS,
        steps_per_loop=STEPS_PER_LOOP,
        additional_metrics=[regret_metric, suboptimal_arms_metric])
def main(unused_argv):
  """Trains the agent selected by `FLAGS.agent` on a per-arm-feature bandit.

  The environment samples a 4-dimensional global context and a 5-dimensional
  context per arm; rewards are normal with unit scale around the dot product
  of the concatenated features with `HIDDEN_PARAM`.

  Args:
    unused_argv: Unused positional command-line arguments.

  Raises:
    ValueError: If `FLAGS.agent` is not 'LinUCB', 'LinTS', 'epsGreedy' or
      'NeuralLinUCB', or if 'epsGreedy' is selected and `FLAGS.network` is
      neither 'commontower' nor 'dotproduct'.
  """
  tf.compat.v1.enable_v2_behavior()  # The trainer only runs with V2 enabled.

  class LinearNormalReward(object):
    """Samples a reward from a unit-scale normal centered at `x . theta`."""

    def __init__(self, theta):
      self.theta = theta

    def __call__(self, x):
      mu = np.dot(x, self.theta)
      return np.random.normal(mu, 1)

  def _global_context_sampling_fn():
    return np.random.randint(-10, 10, [4]).astype(np.float32)

  def _arm_context_sampling_fn():
    return np.random.randint(-2, 3, [5]).astype(np.float32)

  reward_fn = LinearNormalReward(HIDDEN_PARAM)
  observation_and_action_constraint_splitter = None
  num_actions_fn = None
  variable_action_method = bandit_spec_utils.VariableActionMethod.FIXED
  if FLAGS.add_num_actions_feature:
    num_actions_fn = lambda: NUM_ACTIONS
    variable_action_method = (
        bandit_spec_utils.VariableActionMethod.NUM_ACTIONS_FEATURE)

  env = sspe.StationaryStochasticPerArmPyEnvironment(
      _global_context_sampling_fn,
      _arm_context_sampling_fn,
      NUM_ACTIONS,
      reward_fn,
      num_actions_fn,
      batch_size=BATCH_SIZE,
      variable_action_method=variable_action_method)
  environment = tf_py_environment.TFPyEnvironment(env)

  # Passed as a tuple of field names, matching how the other agents in this
  # code base are configured, rather than as a bare string.
  emit_policy_info = (policy_utilities.InfoFields.PREDICTED_REWARDS_MEAN,)

  if FLAGS.agent == 'LinUCB':
    agent = lin_ucb_agent.LinearUCBAgent(
        time_step_spec=environment.time_step_spec(),
        action_spec=environment.action_spec(),
        alpha=AGENT_ALPHA,
        accepts_per_arm_features=True,
        dtype=tf.float32)
  elif FLAGS.agent == 'LinTS':
    agent = lin_ts_agent.LinearThompsonSamplingAgent(
        time_step_spec=environment.time_step_spec(),
        action_spec=environment.action_spec(),
        alpha=AGENT_ALPHA,
        observation_and_action_constraint_splitter=(
            observation_and_action_constraint_splitter),
        accepts_per_arm_features=True,
        dtype=tf.float32)
  elif FLAGS.agent == 'epsGreedy':
    obs_spec = environment.observation_spec()
    if FLAGS.network == 'commontower':
      network = (
          global_and_arm_feature_network
          .create_feed_forward_common_tower_network(
              obs_spec, (40, 30), (30, 40), (40, 20)))
    elif FLAGS.network == 'dotproduct':
      network = (
          global_and_arm_feature_network
          .create_feed_forward_dot_product_network(
              obs_spec, (4, 3, 6), (3, 4, 6)))
    else:
      # Without this branch `network` would be unbound below.
      raise ValueError('Unsupported network: {!r}'.format(FLAGS.network))
    agent = neural_epsilon_greedy_agent.NeuralEpsilonGreedyAgent(
        time_step_spec=environment.time_step_spec(),
        action_spec=environment.action_spec(),
        reward_network=network,
        optimizer=tf.compat.v1.train.AdamOptimizer(learning_rate=LR),
        epsilon=EPSILON,
        observation_and_action_constraint_splitter=(
            observation_and_action_constraint_splitter),
        accepts_per_arm_features=True,
        emit_policy_info=emit_policy_info)
  elif FLAGS.agent == 'NeuralLinUCB':
    obs_spec = environment.observation_spec()
    network = (
        global_and_arm_feature_network
        .create_feed_forward_common_tower_network(
            obs_spec, (40, 30), (30, 40), (40, 20), ENCODING_DIM))
    agent = neural_linucb_agent.NeuralLinUCBAgent(
        time_step_spec=environment.time_step_spec(),
        action_spec=environment.action_spec(),
        encoding_network=network,
        encoding_network_num_train_steps=EPS_PHASE_STEPS,
        encoding_dim=ENCODING_DIM,
        optimizer=tf.compat.v1.train.AdamOptimizer(learning_rate=LR),
        alpha=1.0,
        gamma=1.0,
        epsilon_greedy=EPSILON,
        accepts_per_arm_features=True,
        debug_summaries=True,
        summarize_grads_and_vars=True,
        emit_policy_info=emit_policy_info)
  else:
    # Without this branch `agent` would be unbound at `trainer.train`.
    raise ValueError('Unsupported agent: {!r}'.format(FLAGS.agent))

  def _all_rewards(observation, hidden_param):
    """Outputs rewards for all actions, given an observation."""
    hidden_param = tf.cast(hidden_param, dtype=tf.float32)
    global_obs = observation[bandit_spec_utils.GLOBAL_FEATURE_KEY]
    per_arm_obs = observation[bandit_spec_utils.PER_ARM_FEATURE_KEY]
    num_actions = tf.shape(per_arm_obs)[1]
    # Repeat the global features once per arm so they can be concatenated
    # with the per-arm features along the last axis.
    tiled_global = tf.tile(
        tf.expand_dims(global_obs, axis=1), [1, num_actions, 1])
    concatenated = tf.concat([tiled_global, per_arm_obs], axis=-1)
    rewards = tf.linalg.matvec(concatenated, hidden_param)
    return rewards

  def optimal_reward(observation, hidden_param):
    return tf.reduce_max(_all_rewards(observation, hidden_param), axis=1)

  def optimal_action(observation, hidden_param):
    return tf.argmax(
        _all_rewards(observation, hidden_param),
        axis=1,
        output_type=tf.int32)

  optimal_reward_fn = functools.partial(
      optimal_reward, hidden_param=HIDDEN_PARAM)
  optimal_action_fn = functools.partial(
      optimal_action, hidden_param=HIDDEN_PARAM)
  regret_metric = tf_bandit_metrics.RegretMetric(optimal_reward_fn)
  suboptimal_arms_metric = tf_bandit_metrics.SuboptimalArmsMetric(
      optimal_action_fn)

  if FLAGS.drop_arm_obs:
    drop_arm_feature_fn = functools.partial(
        bandit_spec_utils.drop_arm_observation)
  else:
    drop_arm_feature_fn = None

  trainer.train(
      root_dir=FLAGS.root_dir,
      agent=agent,
      environment=environment,
      training_loops=TRAINING_LOOPS,
      steps_per_loop=STEPS_PER_LOOP,
      additional_metrics=[regret_metric, suboptimal_arms_metric],
      training_data_spec_transformation_fn=drop_arm_feature_fn)
def main(unused_argv):
  """Trains the agent selected by `FLAGS.agent` on a stationary linear bandit.

  Reward functions are normalized when `FLAGS.normalize_reward_fns` is set.
  When `FLAGS.num_disabled_actions > 0` the environment is wrapped with
  `ExtraDisabledActionsWrapper`; observations then become a pair whose first
  element is the actual observation, so the optimal-reward/action functions
  and the network input spec are adapted to look at element 0 only.

  Args:
    unused_argv: Unused positional command-line arguments.

  Raises:
    ValueError: If `FLAGS.agent` is not 'LinUCB', 'LinTS', 'epsGreedy' or
      'Mix'.
    AssertionError: If 'Mix' is selected with disabled actions.
  """
  tf.compat.v1.enable_v2_behavior()  # The trainer only runs with V2 enabled.

  with tf.device('/CPU:0'):  # due to b/128333994
    if FLAGS.normalize_reward_fns:
      action_reward_fns = (
          environment_utilities.normalized_sliding_linear_reward_fn_generator(
              CONTEXT_DIM, NUM_ACTIONS, REWARD_NOISE_VARIANCE))
    else:
      action_reward_fns = (
          environment_utilities.sliding_linear_reward_fn_generator(
              CONTEXT_DIM, NUM_ACTIONS, REWARD_NOISE_VARIANCE))

    env = sspe.StationaryStochasticPyEnvironment(
        functools.partial(
            environment_utilities.context_sampling_fn,
            batch_size=BATCH_SIZE,
            context_dim=CONTEXT_DIM),
        action_reward_fns,
        batch_size=BATCH_SIZE)

    mask_split_fn = None
    if FLAGS.num_disabled_actions > 0:
      mask_split_fn = lambda x: (x[0], x[1])
      env = wrappers.ExtraDisabledActionsWrapper(
          env, FLAGS.num_disabled_actions)
    environment = tf_py_environment.TFPyEnvironment(env)

    optimal_reward_fn = functools.partial(
        environment_utilities.tf_compute_optimal_reward,
        per_action_reward_fns=action_reward_fns)
    optimal_action_fn = functools.partial(
        environment_utilities.tf_compute_optimal_action,
        per_action_reward_fns=action_reward_fns)

    network_input_spec = environment.time_step_spec().observation
    if FLAGS.num_disabled_actions > 0:

      def _apply_only_to_observation(fn):
        """Wraps `fn` so it receives only element 0 of the observation."""

        def result_fn(obs):
          return fn(obs[0])

        return result_fn

      optimal_action_fn = _apply_only_to_observation(optimal_action_fn)
      optimal_reward_fn = _apply_only_to_observation(optimal_reward_fn)
      network_input_spec = network_input_spec[0]

    network = q_network.QNetwork(
        input_tensor_spec=network_input_spec,
        action_spec=environment.action_spec(),
        fc_layer_params=LAYERS)

    if FLAGS.agent == 'LinUCB':
      agent = lin_ucb_agent.LinearUCBAgent(
          time_step_spec=environment.time_step_spec(),
          action_spec=environment.action_spec(),
          alpha=AGENT_ALPHA,
          dtype=tf.float32,
          observation_and_action_constraint_splitter=mask_split_fn)
    elif FLAGS.agent == 'LinTS':
      agent = lin_ts_agent.LinearThompsonSamplingAgent(
          time_step_spec=environment.time_step_spec(),
          action_spec=environment.action_spec(),
          alpha=AGENT_ALPHA,
          dtype=tf.float32,
          observation_and_action_constraint_splitter=mask_split_fn)
    elif FLAGS.agent == 'epsGreedy':
      agent = neural_epsilon_greedy_agent.NeuralEpsilonGreedyAgent(
          time_step_spec=environment.time_step_spec(),
          action_spec=environment.action_spec(),
          reward_network=network,
          optimizer=tf.compat.v1.train.AdamOptimizer(learning_rate=LR),
          epsilon=EPSILON,
          observation_and_action_constraint_splitter=mask_split_fn)
    elif FLAGS.agent == 'Mix':
      assert FLAGS.num_disabled_actions == 0, (
          'Extra actions with mixture agent not supported.')
      # Passed as a tuple of field names rather than as a bare string.
      emit_policy_info = (policy_utilities.InfoFields.PREDICTED_REWARDS_MEAN,)
      agent_linucb = lin_ucb_agent.LinearUCBAgent(
          time_step_spec=environment.time_step_spec(),
          action_spec=environment.action_spec(),
          emit_policy_info=emit_policy_info,
          alpha=AGENT_ALPHA,
          dtype=tf.float32)
      agent_lints = lin_ts_agent.LinearThompsonSamplingAgent(
          time_step_spec=environment.time_step_spec(),
          action_spec=environment.action_spec(),
          emit_policy_info=emit_policy_info,
          alpha=AGENT_ALPHA,
          dtype=tf.float32)
      agent_epsgreedy = neural_epsilon_greedy_agent.NeuralEpsilonGreedyAgent(
          time_step_spec=environment.time_step_spec(),
          action_spec=environment.action_spec(),
          reward_network=network,
          optimizer=tf.compat.v1.train.AdamOptimizer(learning_rate=LR),
          emit_policy_info=emit_policy_info,
          epsilon=EPSILON)
      agent = exp3_mixture_agent.Exp3MixtureAgent(
          (agent_linucb, agent_lints, agent_epsgreedy))
    else:
      # Without this branch `agent` would be unbound at `trainer.train`.
      raise ValueError('Unsupported agent: {!r}'.format(FLAGS.agent))

    regret_metric = tf_bandit_metrics.RegretMetric(optimal_reward_fn)
    suboptimal_arms_metric = tf_bandit_metrics.SuboptimalArmsMetric(
        optimal_action_fn)

    trainer.train(
        root_dir=FLAGS.root_dir,
        agent=agent,
        environment=environment,
        training_loops=TRAINING_LOOPS,
        steps_per_loop=STEPS_PER_LOOP,
        additional_metrics=[regret_metric, suboptimal_arms_metric])
def testLinearUCBUpdateWithForgetting(
    self, batch_size, context_dim, dtype, use_eigendecomp=False):
  """Checks one LinearUCB training step when the agent is built with `gamma`.

  A batch of random actions is turned into an experience, the agent is
  trained on it once, and its `cov_matrix`, `data_vector` and `eig_vals` are
  compared against per-arm values recomputed here from the same experience.

  Args:
    self: The test case instance.
    batch_size: Number of samples in the experience.
    context_dim: Dimension of the observation vectors.
    dtype: Dtype passed to the agent and used for the empty eigen tensors.
    use_eigendecomp: Forwarded to the agent; when true the expected
      eigenvalues are recomputed with `tf.linalg.eigh`.
  """
  gamma = 0.9
  num_actions = 5

  # Assemble the experience from random per-sample actions.
  first_step, last_step = _get_initial_and_final_steps(
      batch_size, context_dim)
  action = np.random.randint(num_actions, size=batch_size, dtype=np.int32)
  policy_step = _get_action_step(action)
  experience = _get_experience(first_step, policy_step, last_step)

  # Build the agent and run a single training step.
  agent = lin_ucb_agent.LinearUCBAgent(
      time_step_spec=time_step.time_step_spec(
          tensor_spec.TensorSpec([context_dim], tf.float32)),
      action_spec=tensor_spec.BoundedTensorSpec(
          dtype=tf.int32, shape=(), minimum=0, maximum=num_actions - 1),
      gamma=gamma,
      dtype=dtype,
      use_eigendecomp=use_eigendecomp)
  self.evaluate(tf.compat.v1.global_variables_initializer())
  self.evaluate(agent.train(experience))
  actual_a = self.evaluate(agent.cov_matrix)
  actual_b = self.evaluate(agent.data_vector)
  actual_eig_vals = self.evaluate(agent.eig_vals)

  # Split observations and rewards by the arm that was played.
  arm_index = tf.convert_to_tensor(action)
  observations_by_arm = tf.dynamic_partition(
      data=tf.reshape(experience.observation, [batch_size, context_dim]),
      partitions=arm_index,
      num_partitions=num_actions)
  rewards_by_arm = tf.dynamic_partition(
      data=tf.reshape(experience.reward, [batch_size]),
      partitions=arm_index,
      num_partitions=num_actions)

  expected_a = []
  expected_b = []
  expected_eig_vals = []
  for arm_observations, arm_rewards in zip(
      observations_by_arm, rewards_by_arm):
    arm_sample_count = tf.cast(tf.shape(arm_rewards)[0], tf.float32)

    # pylint: disable=cell-var-from-loop
    def arm_was_played():
      """Expected values for an arm that received at least one sample."""
      cov = gamma * tf.eye(context_dim) + tf.matmul(
          arm_observations, arm_observations, transpose_a=True)
      data_vec = bandit_utils.sum_reward_weighted_observations(
          arm_rewards, arm_observations)
      if use_eigendecomp:
        eig_vals, eig_matrix = tf.linalg.eigh(cov)
      else:
        eig_matrix = tf.constant([], dtype=dtype)
        eig_vals = tf.constant([], dtype=dtype)
      return cov, data_vec, eig_vals, eig_matrix

    def arm_was_not_played():
      """Expected values for an arm that received no sample."""
      if use_eigendecomp:
        eig_vals = tf.ones([context_dim])
        eig_matrix = tf.eye(context_dim)
      else:
        eig_vals = tf.constant([], dtype=dtype)
        eig_matrix = tf.constant([], dtype=dtype)
      return (tf.eye(context_dim), tf.zeros([context_dim]), eig_vals,
              eig_matrix)

    cov, data_vec, eig_vals, _ = tf.cond(
        tf.squeeze(arm_sample_count) > 0,
        arm_was_played,
        arm_was_not_played)

    expected_a.append(self.evaluate(cov))
    expected_b.append(self.evaluate(data_vec))
    expected_eig_vals.append(self.evaluate(eig_vals))

  # The agent's state must agree with the recomputed per-arm values.
  self.assertAllClose(expected_a, actual_a)
  self.assertAllClose(expected_b, actual_b)
  self.assertAllClose(
      expected_eig_vals, actual_eig_vals, atol=1e-4, rtol=1e-4)
def main(unused_argv):
  """Trains the agent selected by `FLAGS.agent` on a stationary linear bandit.

  Builds a `StationaryStochasticPyEnvironment` whose per-action rewards come
  from the sliding linear reward generator (the normalized variant when
  `FLAGS.normalize_reward_fns` is set). When `FLAGS.num_disabled_actions` is
  positive the environment is wrapped in `ExtraDisabledActionsWrapper`; from
  then on observations are treated as a pair and only element 0 is fed to the
  reward network and to the optimal reward/action functions. The chosen agent
  is then trained with `trainer.train`, reporting regret and suboptimal-arm
  metrics.

  Args:
    unused_argv: Unused.

  Raises:
    ValueError: If `FLAGS.agent` is not one of 'LinUCB', 'LinTS', 'epsGreedy',
      'Boltzmann', 'BoltzmannGumbel' or 'Mix', or if 'Mix' is combined with a
      positive `FLAGS.num_disabled_actions`.
  """
  tf.compat.v1.enable_v2_behavior()  # The trainer only runs with V2 enabled.

  with tf.device('/CPU:0'):  # due to b/128333994
    if FLAGS.normalize_reward_fns:
      action_reward_fns = (
          environment_utilities.normalized_sliding_linear_reward_fn_generator(
              CONTEXT_DIM, NUM_ACTIONS, REWARD_NOISE_VARIANCE))
    else:
      action_reward_fns = (
          environment_utilities.sliding_linear_reward_fn_generator(
              CONTEXT_DIM, NUM_ACTIONS, REWARD_NOISE_VARIANCE))

    env = sspe.StationaryStochasticPyEnvironment(
        functools.partial(
            environment_utilities.context_sampling_fn,
            batch_size=BATCH_SIZE,
            context_dim=CONTEXT_DIM),
        action_reward_fns,
        batch_size=BATCH_SIZE)

    # With disabled actions the agents need a splitter that separates the
    # two elements of the wrapped observation.
    mask_split_fn = None
    if FLAGS.num_disabled_actions > 0:
      mask_split_fn = lambda x: (x[0], x[1])
      env = wrappers.ExtraDisabledActionsWrapper(
          env, FLAGS.num_disabled_actions)
    environment = tf_py_environment.TFPyEnvironment(env)

    optimal_reward_fn = functools.partial(
        environment_utilities.tf_compute_optimal_reward,
        per_action_reward_fns=action_reward_fns)
    optimal_action_fn = functools.partial(
        environment_utilities.tf_compute_optimal_action,
        per_action_reward_fns=action_reward_fns)

    network_input_spec = environment.time_step_spec().observation
    if FLAGS.num_disabled_actions > 0:

      def _apply_only_to_observation(fn):
        """Wraps `fn` so that it only receives element 0 of its argument."""

        def result_fn(obs):
          return fn(obs[0])

        return result_fn

      optimal_action_fn = _apply_only_to_observation(optimal_action_fn)
      optimal_reward_fn = _apply_only_to_observation(optimal_reward_fn)
      network_input_spec = network_input_spec[0]

    network = q_network.QNetwork(
        input_tensor_spec=network_input_spec,
        action_spec=environment.action_spec(),
        fc_layer_params=LAYERS)

    if FLAGS.agent == 'LinUCB':
      agent = lin_ucb_agent.LinearUCBAgent(
          time_step_spec=environment.time_step_spec(),
          action_spec=environment.action_spec(),
          alpha=AGENT_ALPHA,
          dtype=tf.float32,
          observation_and_action_constraint_splitter=mask_split_fn)
    elif FLAGS.agent == 'LinTS':
      agent = lin_ts_agent.LinearThompsonSamplingAgent(
          time_step_spec=environment.time_step_spec(),
          action_spec=environment.action_spec(),
          alpha=AGENT_ALPHA,
          dtype=tf.float32,
          observation_and_action_constraint_splitter=mask_split_fn)
    elif FLAGS.agent == 'epsGreedy':
      agent = neural_epsilon_greedy_agent.NeuralEpsilonGreedyAgent(
          time_step_spec=environment.time_step_spec(),
          action_spec=environment.action_spec(),
          reward_network=network,
          optimizer=tf.compat.v1.train.AdamOptimizer(learning_rate=LR),
          epsilon=EPSILON,
          observation_and_action_constraint_splitter=mask_split_fn)
    elif FLAGS.agent == 'Boltzmann':
      train_step_counter = tf.compat.v1.train.get_or_create_global_step()
      # Piecewise-constant temperature: 1000.0 up to step 500, then
      # TEMPERATURE.
      boundaries = [500]
      temp_values = [1000.0, TEMPERATURE]
      temp_schedule = tf.keras.optimizers.schedules.PiecewiseConstantDecay(
          boundaries, temp_values)

      def _temperature_fn():
        # Any variable used in the function needs to be saved in the policy.
        # This is true by default for the `train_step_counter`.
        return temp_schedule(train_step_counter)

      agent = neural_boltzmann_agent.NeuralBoltzmannAgent(
          time_step_spec=environment.time_step_spec(),
          action_spec=environment.action_spec(),
          reward_network=network,
          temperature=_temperature_fn,
          optimizer=tf.compat.v1.train.AdamOptimizer(learning_rate=LR),
          observation_and_action_constraint_splitter=mask_split_fn,
          train_step_counter=train_step_counter)
      # This is needed, otherwise the PolicySaver complains.
      agent.policy.step = train_step_counter
    elif FLAGS.agent == 'BoltzmannGumbel':
      # One integer counter variable per action, handed to the agent.
      num_samples_list = [
          tf.compat.v2.Variable(
              0, dtype=tf.int32, name='num_samples_{}'.format(k))
          for k in range(NUM_ACTIONS)
      ]
      agent = neural_boltzmann_agent.NeuralBoltzmannAgent(
          time_step_spec=environment.time_step_spec(),
          action_spec=environment.action_spec(),
          reward_network=network,
          boltzmann_gumbel_exploration_constant=250.0,
          optimizer=tf.compat.v1.train.AdamOptimizer(learning_rate=LR),
          observation_and_action_constraint_splitter=mask_split_fn,
          num_samples_list=num_samples_list)
    elif FLAGS.agent == 'Mix':
      # Raise explicitly: an `assert` would be stripped under `python -O`.
      if FLAGS.num_disabled_actions != 0:
        raise ValueError('Extra actions with mixture agent not supported.')

      # A tuple of field names rather than a bare string, so that membership
      # tests on it compare whole names instead of matching substrings.
      emit_policy_info = (
          policy_utilities.InfoFields.PREDICTED_REWARDS_MEAN,)
      agent_linucb = lin_ucb_agent.LinearUCBAgent(
          time_step_spec=environment.time_step_spec(),
          action_spec=environment.action_spec(),
          emit_policy_info=emit_policy_info,
          alpha=AGENT_ALPHA,
          dtype=tf.float32)
      agent_lints = lin_ts_agent.LinearThompsonSamplingAgent(
          time_step_spec=environment.time_step_spec(),
          action_spec=environment.action_spec(),
          emit_policy_info=emit_policy_info,
          alpha=AGENT_ALPHA,
          dtype=tf.float32)
      agent_epsgreedy = neural_epsilon_greedy_agent.NeuralEpsilonGreedyAgent(
          time_step_spec=environment.time_step_spec(),
          action_spec=environment.action_spec(),
          reward_network=network,
          optimizer=tf.compat.v1.train.AdamOptimizer(learning_rate=LR),
          emit_policy_info=emit_policy_info,
          epsilon=EPSILON)
      agent = exp3_mixture_agent.Exp3MixtureAgent(
          (agent_linucb, agent_lints, agent_epsgreedy))
    else:
      # Without this branch `agent` would be unbound further down.
      raise ValueError('Unsupported agent: {}'.format(FLAGS.agent))

    regret_metric = tf_bandit_metrics.RegretMetric(optimal_reward_fn)
    suboptimal_arms_metric = tf_bandit_metrics.SuboptimalArmsMetric(
        optimal_action_fn)

    trainer.train(
        root_dir=FLAGS.root_dir,
        agent=agent,
        environment=environment,
        training_loops=TRAINING_LOOPS,
        steps_per_loop=STEPS_PER_LOOP,
        additional_metrics=[regret_metric, suboptimal_arms_metric])
def testLinearUCBUpdateWithMaskedActions(
    self, batch_size, context_dim, dtype, use_eigendecomp=False):
  """Checks one LinearUCB training step when observations carry a mask.

  The observation spec is a pair (context, per-action int32 vector) and the
  agent is given a splitter that returns the two elements. After a single
  training step the agent's `cov_matrix` and `data_vector` are compared
  against per-arm values recomputed here from element 0 of the observation.

  Args:
    self: The test case instance.
    batch_size: Number of samples in the experience.
    context_dim: Dimension of the context vectors.
    dtype: Dtype passed to the agent.
    use_eigendecomp: Not referenced in this test.
  """
  num_actions = 5

  # Assemble the experience from random per-sample actions.
  first_step, last_step = _get_initial_and_final_steps_with_action_mask(
      batch_size, context_dim, num_actions=num_actions)
  action = np.random.randint(num_actions, size=batch_size, dtype=np.int32)
  policy_step = _get_action_step(action)
  experience = _get_experience(first_step, policy_step, last_step)

  def observation_and_action_constraint_splitter(obs):
    return obs[0], obs[1]

  # Build the agent and run a single training step.
  context_and_mask_spec = (
      tensor_spec.TensorSpec([context_dim], tf.float32),
      tensor_spec.TensorSpec([num_actions], tf.int32))
  agent = lin_ucb_agent.LinearUCBAgent(
      time_step_spec=time_step.time_step_spec(context_and_mask_spec),
      action_spec=tensor_spec.BoundedTensorSpec(
          dtype=tf.int32, shape=(), minimum=0, maximum=num_actions - 1),
      observation_and_action_constraint_splitter=(
          observation_and_action_constraint_splitter),
      dtype=dtype)
  self.evaluate(agent.initialize())
  self.evaluate(agent.train(experience))
  actual_a = self.evaluate(agent.cov_matrix)
  actual_b = self.evaluate(agent.data_vector)

  # Split contexts and rewards by the arm that was played.
  contexts, _ = observation_and_action_constraint_splitter(
      experience.observation)
  arm_index = tf.convert_to_tensor(action)
  observations_by_arm = tf.dynamic_partition(
      data=tf.reshape(contexts, [batch_size, -1]),
      partitions=arm_index,
      num_partitions=num_actions)
  rewards_by_arm = tf.dynamic_partition(
      data=tf.reshape(experience.reward, [batch_size]),
      partitions=arm_index,
      num_partitions=num_actions)

  expected_a = []
  expected_b = []
  for arm_observations, arm_rewards in zip(
      observations_by_arm, rewards_by_arm):
    arm_sample_count = tf.cast(tf.shape(arm_rewards)[0], tf.float32)

    # pylint: disable=cell-var-from-loop
    def arm_was_played():
      """Expected values for an arm that received at least one sample."""
      cov = tf.eye(context_dim) + tf.matmul(
          arm_observations, arm_observations, transpose_a=True)
      data_vec = bandit_utils.sum_reward_weighted_observations(
          arm_rewards, arm_observations)
      return cov, data_vec

    def arm_was_not_played():
      """Expected values for an arm that received no sample."""
      return tf.eye(context_dim), tf.zeros([context_dim])

    cov, data_vec = tf.cond(
        tf.squeeze(arm_sample_count) > 0,
        arm_was_played,
        arm_was_not_played)

    expected_a.append(self.evaluate(cov))
    expected_b.append(self.evaluate(data_vec))

  # The agent's state must agree with the recomputed per-arm values.
  self.assertAllClose(expected_a, actual_a)
  self.assertAllClose(expected_b, actual_b)
def main(unused_argv):
  """Trains the agent selected by `FLAGS.agent` on a MovieLens environment.

  Reads the data file named by `FLAGS.data_path` into either the per-arm or
  the plain MovieLens environment (chosen by `FLAGS.per_arm`), builds the
  requested agent, and trains it with `trainer.train`, reporting regret and
  suboptimal-arm metrics.

  Args:
    unused_argv: Unused.

  Raises:
    ValueError: If `FLAGS.data_path` is empty, or if `FLAGS.agent` is not one
      of 'LinUCB', 'LinTS', 'epsGreedy' or 'DropoutTS'.
  """
  tf.compat.v1.enable_v2_behavior()  # The trainer only runs with V2 enabled.

  data_path = FLAGS.data_path
  if not data_path:
    raise ValueError('Please specify the location of the data file.')

  if FLAGS.per_arm:
    env = movielens_per_arm_py_environment.MovieLensPerArmPyEnvironment(
        data_path,
        RANK_K,
        BATCH_SIZE,
        num_actions=NUM_ACTIONS,
        csv_delimiter='\t')
  else:
    env = movielens_py_environment.MovieLensPyEnvironment(
        data_path,
        RANK_K,
        BATCH_SIZE,
        num_movies=NUM_ACTIONS,
        csv_delimiter='\t')
  environment = tf_py_environment.TFPyEnvironment(env)

  optimal_reward_fn = functools.partial(
      environment_utilities.compute_optimal_reward_with_movielens_environment,
      environment=environment)
  optimal_action_fn = functools.partial(
      environment_utilities.compute_optimal_action_with_movielens_environment,
      environment=environment)

  if FLAGS.agent == 'LinUCB':
    agent = lin_ucb_agent.LinearUCBAgent(
        time_step_spec=environment.time_step_spec(),
        action_spec=environment.action_spec(),
        tikhonov_weight=0.001,
        alpha=AGENT_ALPHA,
        dtype=tf.float32,
        accepts_per_arm_features=FLAGS.per_arm)
  elif FLAGS.agent == 'LinTS':
    agent = lin_ts_agent.LinearThompsonSamplingAgent(
        time_step_spec=environment.time_step_spec(),
        action_spec=environment.action_spec(),
        dtype=tf.float32,
        accepts_per_arm_features=FLAGS.per_arm)
  elif FLAGS.agent == 'epsGreedy':
    if FLAGS.per_arm:
      network = (
          global_and_arm_feature_network
          .create_feed_forward_dot_product_network(
              environment.time_step_spec().observation,
              global_layers=LAYERS,
              arm_layers=LAYERS))
    else:
      network = q_network.QNetwork(
          input_tensor_spec=environment.time_step_spec().observation,
          action_spec=environment.action_spec(),
          fc_layer_params=LAYERS)
    agent = eps_greedy_agent.NeuralEpsilonGreedyAgent(
        time_step_spec=environment.time_step_spec(),
        action_spec=environment.action_spec(),
        reward_network=network,
        optimizer=tf.compat.v1.train.AdamOptimizer(learning_rate=LR),
        epsilon=EPSILON,
        # A tuple of field names rather than a bare string, matching the
        # list passed as `info_fields_to_inherit_from_greedy` below.
        emit_policy_info=('predicted_rewards_mean',),
        info_fields_to_inherit_from_greedy=['predicted_rewards_mean'])
  elif FLAGS.agent == 'DropoutTS':
    train_step_counter = tf.compat.v1.train.get_or_create_global_step()

    def dropout_fn():
      """Returns max(1 / (1.01 + train step), 0.0003) as the dropout rate."""
      return tf.math.maximum(
          tf.math.reciprocal_no_nan(
              1.01 + tf.cast(train_step_counter, tf.float32)),
          0.0003)

    agent = dropout_ts_agent.DropoutThompsonSamplingAgent(
        time_step_spec=environment.time_step_spec(),
        action_spec=environment.action_spec(),
        dropout_rate=dropout_fn,
        network_layers=LAYERS,
        optimizer=tf.compat.v1.train.AdamOptimizer(learning_rate=LR))
  else:
    # Without this branch `agent` would be unbound further down.
    raise ValueError('Unsupported agent: {}'.format(FLAGS.agent))

  regret_metric = tf_bandit_metrics.RegretMetric(optimal_reward_fn)
  suboptimal_arms_metric = tf_bandit_metrics.SuboptimalArmsMetric(
      optimal_action_fn)

  trainer.train(
      root_dir=FLAGS.root_dir,
      agent=agent,
      environment=environment,
      training_loops=TRAINING_LOOPS,
      steps_per_loop=STEPS_PER_LOOP,
      additional_metrics=[regret_metric, suboptimal_arms_metric])
def testDistributedLinearUCBUpdate(
    self, batch_size, context_dim, dtype, use_eigendecomp=False):
  """Checks a LinearUCB update made through `_distributed_train_step`.

  A batch of random actions is turned into an experience and fed once to the
  agent's `_distributed_train_step` (wrapped with `common.function_in_tf1`).
  The agent's `cov_matrix` and `data_vector` are then compared against
  per-arm values recomputed here: X^T X and the reward-weighted observation
  sum for arms that received samples, zeros for arms that did not.

  Only the covariance matrices and data vectors are asserted; no weight
  vector is recomputed or checked by this test.

  Args:
    self: The test case instance.
    batch_size: Number of samples in the experience.
    context_dim: Dimension of the observation vectors.
    dtype: Dtype passed to the agent.
    use_eigendecomp: Not referenced in this test.
  """
  # Construct a `Trajectory` for the given action, observation, reward.
  num_actions = 5
  initial_step, final_step = _get_initial_and_final_steps(
      batch_size, context_dim)
  action = np.random.randint(num_actions, size=batch_size, dtype=np.int32)
  action_step = _get_action_step(action)
  experience = _get_experience(initial_step, action_step, final_step)

  # Construct an agent and perform the update.
  observation_spec = tensor_spec.TensorSpec([context_dim], tf.float32)
  time_step_spec = time_step.time_step_spec(observation_spec)
  action_spec = tensor_spec.BoundedTensorSpec(
      dtype=tf.int32, shape=(), minimum=0, maximum=num_actions - 1)
  agent = lin_ucb_agent.LinearUCBAgent(
      time_step_spec=time_step_spec,
      action_spec=action_spec,
      dtype=dtype)
  self.evaluate(agent.initialize())
  train_fn = common.function_in_tf1()(agent._distributed_train_step)
  loss_info = train_fn(experience=experience)
  self.evaluate(loss_info)
  final_a = self.evaluate(agent.cov_matrix)
  final_b = self.evaluate(agent.data_vector)

  # Compute the expected updated estimates.
  observations_list = tf.dynamic_partition(
      data=tf.reshape(experience.observation, [batch_size, context_dim]),
      partitions=tf.convert_to_tensor(action),
      num_partitions=num_actions)
  rewards_list = tf.dynamic_partition(
      data=tf.reshape(experience.reward, [batch_size]),
      partitions=tf.convert_to_tensor(action),
      num_partitions=num_actions)

  expected_a_updated_list = []
  expected_b_updated_list = []
  for observations_for_arm, rewards_for_arm in zip(
      observations_list, rewards_list):
    num_samples_for_arm = tf.cast(tf.shape(rewards_for_arm)[0], tf.float32)

    # pylint: disable=cell-var-from-loop
    def true_fn():
      a_new = tf.matmul(
          observations_for_arm, observations_for_arm, transpose_a=True)
      b_new = bandit_utils.sum_reward_weighted_observations(
          rewards_for_arm, observations_for_arm)
      return a_new, b_new

    def false_fn():
      return tf.zeros([context_dim, context_dim]), tf.zeros([context_dim])

    a_new, b_new = tf.cond(
        tf.squeeze(num_samples_for_arm) > 0, true_fn, false_fn)

    expected_a_updated_list.append(self.evaluate(a_new))
    expected_b_updated_list.append(self.evaluate(b_new))

  # Check that the actual updated estimates match the expectations.
  self.assertAllClose(expected_a_updated_list, final_a)
  self.assertAllClose(expected_b_updated_list, final_b)
def get_agent_by_name(agent_name, time_step_spec, action_spec):
  """Helper function that outputs an agent.

  Per-arm features are considered enabled when the observation spec is a dict
  containing the key 'per_arm'; in that case the linear and epsilon-greedy
  agents are built with `accepts_per_arm_features=True`.

  Args:
    agent_name: The name (string) of the desired agent. One of 'LinUCB',
      'LinTS', 'epsGreedy' or 'mix'.
    time_step_spec: The time step spec of the environment on which the agent
      acts.
    action_spec: The action spec on which the agent acts.

  Returns:
    The desired agent.

  Raises:
    AssertionError: If 'mix' is requested with per-arm observations.
    ValueError: If `agent_name` is not one of the supported names.
  """
  accepts_per_arm_features = (
      isinstance(time_step_spec.observation, dict) and
      'per_arm' in time_step_spec.observation)

  if agent_name == 'LinUCB':
    return lin_ucb_agent.LinearUCBAgent(
        time_step_spec=time_step_spec,
        action_spec=action_spec,
        dtype=tf.float32,
        accepts_per_arm_features=accepts_per_arm_features)

  if agent_name == 'LinTS':
    return lin_ts_agent.LinearThompsonSamplingAgent(
        time_step_spec=time_step_spec,
        action_spec=action_spec,
        dtype=tf.float32,
        accepts_per_arm_features=accepts_per_arm_features)

  if agent_name == 'epsGreedy':
    if accepts_per_arm_features:
      network = (
          global_and_arm_feature_network
          .create_feed_forward_common_tower_network(
              time_step_spec.observation, (20, 20), (20, 20), (20, 20)))
    else:
      network = q_network.QNetwork(
          input_tensor_spec=time_step_spec.observation,
          action_spec=action_spec,
          fc_layer_params=(50, 50, 50))
    return neural_epsilon_greedy_agent.NeuralEpsilonGreedyAgent(
        time_step_spec=time_step_spec,
        action_spec=action_spec,
        reward_network=network,
        optimizer=tf.compat.v1.train.AdamOptimizer(learning_rate=0.05),
        epsilon=0.1,
        accepts_per_arm_features=accepts_per_arm_features)

  if agent_name == 'mix':
    assert not accepts_per_arm_features, 'Per-arm mixture agent not supported.'
    emit_policy_info = (policy_utilities.InfoFields.PREDICTED_REWARDS_MEAN,)
    # The same network object is handed to all three epsilon-greedy agents;
    # they differ only in `epsilon`.
    network = q_network.QNetwork(
        input_tensor_spec=time_step_spec.observation,
        action_spec=action_spec,
        fc_layer_params=(50, 50, 50))
    agent_epsgreedy = neural_epsilon_greedy_agent.NeuralEpsilonGreedyAgent(
        time_step_spec=time_step_spec,
        action_spec=action_spec,
        reward_network=network,
        optimizer=tf.compat.v1.train.AdamOptimizer(learning_rate=0.05),
        emit_policy_info=emit_policy_info,
        epsilon=0.1)
    agent_linucb = lin_ucb_agent.LinearUCBAgent(
        time_step_spec=time_step_spec,
        action_spec=action_spec,
        emit_policy_info=emit_policy_info,
        dtype=tf.float32)
    agent_random = neural_epsilon_greedy_agent.NeuralEpsilonGreedyAgent(
        time_step_spec=time_step_spec,
        action_spec=action_spec,
        reward_network=network,
        optimizer=tf.compat.v1.train.AdamOptimizer(learning_rate=0.05),
        emit_policy_info=emit_policy_info,
        epsilon=1.)
    agent_halfrandom = neural_epsilon_greedy_agent.NeuralEpsilonGreedyAgent(
        time_step_spec=time_step_spec,
        action_spec=action_spec,
        reward_network=network,
        optimizer=tf.compat.v1.train.AdamOptimizer(learning_rate=0.05),
        emit_policy_info=emit_policy_info,
        epsilon=0.5)
    return exp3_mixture_agent.Exp3MixtureAgent(
        (agent_epsgreedy, agent_linucb, agent_random, agent_halfrandom))

  # Falling off the end would silently hand the caller `None`.
  raise ValueError('Unknown agent name: {}'.format(agent_name))
def main(unused_argv):
  """Trains the agent selected by `FLAGS.agent` on a per-arm linear bandit.

  Builds a `StationaryStochasticPerArmPyEnvironment` with random integer
  global (length 4) and arm (length 5) contexts and a reward drawn from a
  unit-variance normal centred on the dot product of the input with
  `HIDDEN_PARAM`. The chosen agent is trained with `trainer.train`, reporting
  regret and suboptimal-arm metrics. When `FLAGS.drop_arm_obs` is set,
  `bandit_spec_utils.drop_arm_observation` is passed as the training data spec
  transformation.

  Args:
    unused_argv: Unused.

  Raises:
    ValueError: If `FLAGS.agent` is not one of 'LinUCB', 'LinTS' or
      'epsGreedy', or if 'epsGreedy' is requested with a `FLAGS.network` other
      than 'commontower' or 'dotproduct'.
  """
  tf.compat.v1.enable_v2_behavior()  # The trainer only runs with V2 enabled.

  class LinearNormalReward(object):
    """Callable returning a normal sample with mean `x . theta` and scale 1."""

    def __init__(self, theta):
      self.theta = theta

    def __call__(self, x):
      mu = np.dot(x, self.theta)
      return np.random.normal(mu, 1)

  def _global_context_sampling_fn():
    return np.random.randint(-10, 10, [4]).astype(np.float32)

  def _arm_context_sampling_fn():
    return np.random.randint(-2, 3, [5]).astype(np.float32)

  reward_fn = LinearNormalReward(HIDDEN_PARAM)

  env = sspe.StationaryStochasticPerArmPyEnvironment(
      _global_context_sampling_fn,
      _arm_context_sampling_fn,
      NUM_ACTIONS,
      reward_fn,
      batch_size=BATCH_SIZE)
  environment = tf_py_environment.TFPyEnvironment(env)

  obs_spec = environment.observation_spec()
  drop_arm_feature_fn = (
      bandit_spec_utils.drop_arm_observation if FLAGS.drop_arm_obs else None)

  if FLAGS.agent == 'LinUCB':
    agent = lin_ucb_agent.LinearUCBAgent(
        time_step_spec=environment.time_step_spec(),
        action_spec=environment.action_spec(),
        alpha=AGENT_ALPHA,
        accepts_per_arm_features=True,
        drop_arm_features=FLAGS.drop_arm_obs,
        dtype=tf.float32)
  elif FLAGS.agent == 'LinTS':
    agent = lin_ts_agent.LinearThompsonSamplingAgent(
        time_step_spec=environment.time_step_spec(),
        action_spec=environment.action_spec(),
        alpha=AGENT_ALPHA,
        accepts_per_arm_features=True,
        drop_arm_features=FLAGS.drop_arm_obs,
        dtype=tf.float32)
  elif FLAGS.agent == 'epsGreedy':
    if FLAGS.network == 'commontower':
      network = (
          global_and_arm_feature_network
          .create_feed_forward_common_tower_network(
              obs_spec, (4, 3), (3, 4), (4, 2)))
    elif FLAGS.network == 'dotproduct':
      network = (
          global_and_arm_feature_network
          .create_feed_forward_dot_product_network(
              obs_spec, (4, 3, 6), (3, 4, 6)))
    else:
      # Without this branch `network` would be unbound just below.
      raise ValueError('Unsupported network: {}'.format(FLAGS.network))
    agent = neural_epsilon_greedy_agent.NeuralEpsilonGreedyAgent(
        time_step_spec=environment.time_step_spec(),
        action_spec=environment.action_spec(),
        reward_network=network,
        optimizer=tf.compat.v1.train.AdamOptimizer(learning_rate=LR),
        epsilon=EPSILON,
        accepts_per_arm_features=True,
        training_data_spec_transformation_fn=drop_arm_feature_fn,
        # A tuple of field names rather than a bare string, so that
        # membership tests on it compare whole names, not substrings.
        emit_policy_info=(
            policy_utilities.InfoFields.PREDICTED_REWARDS_MEAN,))
  else:
    # Without this branch `agent` would be unbound further down.
    raise ValueError('Unsupported agent: {}'.format(FLAGS.agent))

  optimal_reward_fn = functools.partial(
      optimal_reward, hidden_param=HIDDEN_PARAM)
  optimal_action_fn = functools.partial(
      optimal_action, hidden_param=HIDDEN_PARAM)
  regret_metric = tf_bandit_metrics.RegretMetric(optimal_reward_fn)
  suboptimal_arms_metric = tf_bandit_metrics.SuboptimalArmsMetric(
      optimal_action_fn)

  trainer.train(
      root_dir=FLAGS.root_dir,
      agent=agent,
      environment=environment,
      training_loops=TRAINING_LOOPS,
      steps_per_loop=STEPS_PER_LOOP,
      additional_metrics=[regret_metric, suboptimal_arms_metric],
      training_data_spec_transformation_fn=drop_arm_feature_fn)