def test_mu_base_out_of_bound_parameter(self):
  mu_base = [1.2, 1.0, 1.0, 1.0, 1.0, 1.0]
  with self.assertRaisesRegex(
      ValueError, 'The length of \'mu_base\' must be 5, but saw '
      '\'mu_base\':.*'):
    wheel_py_environment.WheelPyEnvironment(
        delta=0.5,
        mu_base=mu_base,
        std_base=0.01 * np.ones(5),
        mu_high=50.0,
        std_high=0.01)
def test_std_base_out_of_bound_parameter(self):
  with self.assertRaisesRegex(
      ValueError, r'The length of \'std_base\' must be 5\.'):
    wheel_py_environment.WheelPyEnvironment(
        delta=0.5,
        mu_base=[1.2, 1.0, 1.0, 1.0, 1.0],
        std_base=0.01 * np.ones(6),
        mu_high=50.0,
        std_high=0.01)
def test_rewards_validity(self, batch_size):
  """Tests that the rewards are valid."""
  env = wheel_py_environment.WheelPyEnvironment(
      delta=0.5,
      mu_base=[1.2, 1.0, 1.0, 1.0, 1.0],
      std_base=0.01 * np.ones(5),
      mu_high=50.0,
      std_high=0.01,
      batch_size=batch_size)
  time_step = env.reset()
  time_step = env.step(np.arange(batch_size))
  self.assertEqual(time_step.reward.shape, (batch_size,))
def test_delta_out_of_bound_parameter(self, delta):
  with self.assertRaisesRegex(
      ValueError, r'Delta must be in \(0, 1\), but saw delta: %g' % delta):
    wheel_py_environment.WheelPyEnvironment(
        delta=delta,
        mu_base=[1.2, 1.0, 1.0, 1.0, 1.0],
        std_base=0.01 * np.ones(5),
        mu_high=50.0,
        std_high=0.01)
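# The tests above take extra arguments (`delta`, `batch_size`) beyond `self`,
# which implies a parameterized test harness. A minimal sketch of how the
# cases could be wired up with absl's `parameterized`; the parameter values
# in the decorators are illustrative assumptions, not the original ones.
from absl.testing import parameterized
import tensorflow as tf


class WheelPyEnvironmentTest(parameterized.TestCase, tf.test.TestCase):

  @parameterized.parameters(0.0, -0.5, 1.0, 1.5)
  def test_delta_out_of_bound_parameter(self, delta):
    ...  # Body as defined above.

  @parameterized.parameters(1, 2, 8)
  def test_rewards_validity(self, batch_size):
    ...  # Body as defined above.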
def main(unused_argv):
  tf.compat.v1.enable_resource_variables()
  with tf.device('/CPU:0'):  # due to b/128333994
    env = wheel_py_environment.WheelPyEnvironment(DELTA, MU_BASE, STD_BASE,
                                                  MU_HIGH, STD_HIGH, BATCH_SIZE)
    environment = tf_py_environment.TFPyEnvironment(env)

    optimal_reward_fn = functools.partial(
        environment_utilities.tf_wheel_bandit_compute_optimal_reward,
        delta=DELTA,
        mu_inside=MU_BASE[0],
        mu_high=MU_HIGH)
    optimal_action_fn = functools.partial(
        environment_utilities.tf_wheel_bandit_compute_optimal_action,
        delta=DELTA)

    if FLAGS.agent == 'LinUCB':
      agent = lin_ucb_agent.LinearUCBAgent(
          time_step_spec=environment.time_step_spec(),
          action_spec=environment.action_spec(),
          alpha=AGENT_ALPHA,
          dtype=tf.float32)
    elif FLAGS.agent == 'LinTS':
      agent = lin_ts_agent.LinearThompsonSamplingAgent(
          time_step_spec=environment.time_step_spec(),
          action_spec=environment.action_spec(),
          alpha=AGENT_ALPHA,
          dtype=tf.float32)
    elif FLAGS.agent == 'epsGreedy':
      network = q_network.QNetwork(
          input_tensor_spec=environment.time_step_spec().observation,
          action_spec=environment.action_spec(),
          fc_layer_params=LAYERS)
      agent = eps_greedy_agent.NeuralEpsilonGreedyAgent(
          time_step_spec=environment.time_step_spec(),
          action_spec=environment.action_spec(),
          reward_network=network,
          optimizer=tf.compat.v1.train.AdamOptimizer(learning_rate=LR),
          epsilon=EPSILON)

    regret_metric = tf_bandit_metrics.RegretMetric(optimal_reward_fn)
    suboptimal_arms_metric = tf_bandit_metrics.SuboptimalArmsMetric(
        optimal_action_fn)

    trainer.train(
        root_dir=FLAGS.root_dir,
        agent=agent,
        environment=environment,
        training_loops=TRAINING_LOOPS,
        steps_per_loop=STEPS_PER_LOOP,
        additional_metrics=[regret_metric, suboptimal_arms_metric])
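# The main() above relies on module-level flags and hyperparameter constants
# defined elsewhere in the example script. A minimal sketch of what those
# definitions could look like; the numeric values below are illustrative
# assumptions, not the original configuration.
from absl import flags
import numpy as np

flags.DEFINE_string('root_dir', '/tmp/wheel_bandit',
                    'Root directory for checkpoints and summaries.')
flags.DEFINE_string('agent', 'LinUCB',
                    'Which agent to train: `LinUCB`, `LinTS` or `epsGreedy`.')
FLAGS = flags.FLAGS

BATCH_SIZE = 8
TRAINING_LOOPS = 2000
STEPS_PER_LOOP = 2

# Wheel environment parameters.
DELTA = 0.5
MU_BASE = [1.2, 1.0, 1.0, 1.0, 1.0]
STD_BASE = 0.01 * np.ones(5)
MU_HIGH = 50.0
STD_HIGH = 0.01

# Agent hyperparameters.
AGENT_ALPHA = 10.0     # LinUCB / LinTS exploration coefficient.
EPSILON = 0.05         # Exploration rate of the epsilon-greedy agent.
LAYERS = (50, 50, 50)  # Hidden fully-connected layers of the reward network.
LR = 0.005             # Learning rate of the Adam optimizer.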
def get_environment_and_optimal_functions_by_name(environment_name,
                                                  batch_size):
  """Helper function that outputs an environment and related functions.

  Args:
    environment_name: The (string) name of the desired environment.
    batch_size: The batch size of the environment.

  Returns:
    A tuple of (environment, optimal_reward_fn, optimal_action_fn), where the
    latter two functions are for calculating the regret and the suboptimal
    arms metrics.
  """
  if environment_name == 'stationary_stochastic':
    context_dim = 7
    num_actions = 5
    action_reward_fns = (
        environment_utilities.sliding_linear_reward_fn_generator(
            context_dim, num_actions, 0.1))
    py_env = (
        stationary_stochastic_py_environment.StationaryStochasticPyEnvironment(
            functools.partial(
                environment_utilities.context_sampling_fn,
                batch_size=batch_size,
                context_dim=context_dim),
            action_reward_fns,
            batch_size=batch_size))
    optimal_reward_fn = functools.partial(
        environment_utilities.tf_compute_optimal_reward,
        per_action_reward_fns=action_reward_fns)
    optimal_action_fn = functools.partial(
        environment_utilities.tf_compute_optimal_action,
        per_action_reward_fns=action_reward_fns)
    environment = tf_py_environment.TFPyEnvironment(py_env)
  elif environment_name == 'wheel':
    delta = 0.5
    mu_base = [0.05, 0.01, 0.011, 0.009, 0.012]
    std_base = [0.001] * 5
    mu_high = 0.5
    std_high = 0.001
    py_env = wheel_py_environment.WheelPyEnvironment(delta, mu_base, std_base,
                                                     mu_high, std_high,
                                                     batch_size)
    environment = tf_py_environment.TFPyEnvironment(py_env)
    optimal_reward_fn = functools.partial(
        environment_utilities.tf_wheel_bandit_compute_optimal_reward,
        delta=delta,
        mu_inside=mu_base[0],
        mu_high=mu_high)
    optimal_action_fn = functools.partial(
        environment_utilities.tf_wheel_bandit_compute_optimal_action,
        delta=delta)
  return (environment, optimal_reward_fn, optimal_action_fn)
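# Example use of the helper above, wiring its outputs into the regret and
# suboptimal-arms metrics; `tf_bandit_metrics` is assumed to alias
# tf_agents.bandits.metrics.tf_metrics, consistent with how the returned
# functions are consumed elsewhere in this section.
from tf_agents.bandits.metrics import tf_metrics as tf_bandit_metrics

environment, optimal_reward_fn, optimal_action_fn = (
    get_environment_and_optimal_functions_by_name('wheel', batch_size=8))
regret_metric = tf_bandit_metrics.RegretMetric(optimal_reward_fn)
suboptimal_arms_metric = tf_bandit_metrics.SuboptimalArmsMetric(
    optimal_action_fn)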
def test_observation_validity(self, batch_size):
  """Tests that the observations fall into the unit circle."""
  env = wheel_py_environment.WheelPyEnvironment(
      delta=0.5,
      mu_base=[1.2, 1.0, 1.0, 1.0, 1.0],
      std_base=0.01 * np.ones(5),
      mu_high=50.0,
      std_high=0.01,
      batch_size=batch_size)
  for _ in range(5):
    observation = env.reset().observation
    self.assertEqual(
        list(observation.shape),
        [batch_size] + list(env.observation_spec().shape))
    for i in range(batch_size):
      self.assertLessEqual(np.linalg.norm(observation[i, :]), 1)
def main(unused_argv):
  tf.compat.v1.enable_v2_behavior()  # The trainer only runs with V2 enabled.
  with tf.device('/CPU:0'):  # due to b/128333994
    env = wheel_py_environment.WheelPyEnvironment(DELTA, MU_BASE, STD_BASE,
                                                  MU_HIGH, STD_HIGH, BATCH_SIZE)
    environment = tf_py_environment.TFPyEnvironment(env)

    optimal_reward_fn = functools.partial(
        environment_utilities.tf_wheel_bandit_compute_optimal_reward,
        delta=DELTA,
        mu_inside=MU_BASE[0],
        mu_high=MU_HIGH)
    optimal_action_fn = functools.partial(
        environment_utilities.tf_wheel_bandit_compute_optimal_action,
        delta=DELTA)

    network = q_network.QNetwork(
        input_tensor_spec=environment.time_step_spec().observation,
        action_spec=environment.action_spec(),
        fc_layer_params=LAYERS)

    if FLAGS.agent == 'LinUCB':
      agent = lin_ucb_agent.LinearUCBAgent(
          time_step_spec=environment.time_step_spec(),
          action_spec=environment.action_spec(),
          alpha=AGENT_ALPHA,
          dtype=tf.float32)
    elif FLAGS.agent == 'LinTS':
      agent = lin_ts_agent.LinearThompsonSamplingAgent(
          time_step_spec=environment.time_step_spec(),
          action_spec=environment.action_spec(),
          alpha=AGENT_ALPHA,
          dtype=tf.float32)
    elif FLAGS.agent == 'epsGreedy':
      agent = eps_greedy_agent.NeuralEpsilonGreedyAgent(
          time_step_spec=environment.time_step_spec(),
          action_spec=environment.action_spec(),
          reward_network=network,
          optimizer=tf.compat.v1.train.AdamOptimizer(learning_rate=LR),
          epsilon=EPSILON)
    elif FLAGS.agent == 'random':
      agent = eps_greedy_agent.NeuralEpsilonGreedyAgent(
          time_step_spec=environment.time_step_spec(),
          action_spec=environment.action_spec(),
          reward_network=network,
          optimizer=tf.compat.v1.train.AdamOptimizer(learning_rate=LR),
          epsilon=1.)
    elif FLAGS.agent == 'Mix':
      emit_policy_info = (
          policy_utilities.InfoFields.PREDICTED_REWARDS_MEAN,)
      agent_epsgreedy = eps_greedy_agent.NeuralEpsilonGreedyAgent(
          time_step_spec=environment.time_step_spec(),
          action_spec=environment.action_spec(),
          reward_network=network,
          optimizer=tf.compat.v1.train.AdamOptimizer(learning_rate=LR),
          emit_policy_info=emit_policy_info,
          epsilon=EPSILON)
      agent_linucb = lin_ucb_agent.LinearUCBAgent(
          time_step_spec=environment.time_step_spec(),
          action_spec=environment.action_spec(),
          alpha=AGENT_ALPHA,
          emit_policy_info=emit_policy_info,
          dtype=tf.float32)
      agent_random = eps_greedy_agent.NeuralEpsilonGreedyAgent(
          time_step_spec=environment.time_step_spec(),
          action_spec=environment.action_spec(),
          reward_network=network,
          optimizer=tf.compat.v1.train.AdamOptimizer(learning_rate=LR),
          emit_policy_info=emit_policy_info,
          epsilon=1.)
      agent_halfrandom = eps_greedy_agent.NeuralEpsilonGreedyAgent(
          time_step_spec=environment.time_step_spec(),
          action_spec=environment.action_spec(),
          reward_network=network,
          optimizer=tf.compat.v1.train.AdamOptimizer(learning_rate=LR),
          emit_policy_info=emit_policy_info,
          epsilon=0.5)
      agent = exp3_mixture_agent.Exp3MixtureAgent(
          (agent_epsgreedy, agent_linucb, agent_random, agent_halfrandom))

    regret_metric = tf_bandit_metrics.RegretMetric(optimal_reward_fn)
    suboptimal_arms_metric = tf_bandit_metrics.SuboptimalArmsMetric(
        optimal_action_fn)

    trainer.train(
        root_dir=FLAGS.root_dir,
        agent=agent,
        environment=environment,
        training_loops=TRAINING_LOOPS,
        steps_per_loop=STEPS_PER_LOOP,
        additional_metrics=[regret_metric, suboptimal_arms_metric])
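# A minimal sketch of the script entry point that would invoke main() through
# absl, following the convention of TF-Agents example scripts; `app` is
# assumed to be absl.app.
from absl import app

if __name__ == '__main__':
  app.run(main)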