Example #1
 def test_mu_base_out_of_bound_parameter(self):
   mu_base = [1.2, 1.0, 1.0, 1.0, 1.0, 1.0]
   with self.assertRaisesRegexp(
       ValueError, 'The length of \'mu_base\' must be 5, but saw '
       '\'mu_base\':.*'):
     wheel_py_environment.WheelPyEnvironment(
         delta=0.5, mu_base=mu_base,
         std_base=0.01 * np.ones(5), mu_high=50.0, std_high=0.01)
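For contrast with the failure case above, here is a minimal sketch of a construction that passes the length and range checks; the parameter values and the batch size are illustrative, not taken from the library's own tests.

import numpy as np
from tf_agents.bandits.environments import wheel_py_environment

# Illustrative valid construction: 'mu_base' and 'std_base' both have length 5
# and 'delta' lies strictly inside (0, 1), so no ValueError is raised.
env = wheel_py_environment.WheelPyEnvironment(
    delta=0.5,
    mu_base=[1.2, 1.0, 1.0, 1.0, 1.0],
    std_base=0.01 * np.ones(5),
    mu_high=50.0,
    std_high=0.01,
    batch_size=2)
time_step = env.reset()
# Observations are batched 2-D points inside the unit circle (see Example #7).
print(time_step.observation.shape)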
Example #2
 def test_std_base_out_of_bound_parameter(self):
     with self.assertRaisesRegexp(
             ValueError, r'The length of \'std_base\' must be 5\.'):
         wheel_py_environment.WheelPyEnvironment(
             delta=0.5,
             mu_base=[1.2, 1.0, 1.0, 1.0, 1.0],
             std_base=0.01 * np.ones(6),
             mu_high=50.0,
             std_high=0.01)
Example #3
 def test_rewards_validity(self, batch_size):
   """Tests that the rewards are valid."""
   env = wheel_py_environment.WheelPyEnvironment(
       delta=0.5, mu_base=[1.2, 1.0, 1.0, 1.0, 1.0],
       std_base=0.01 * np.ones(5), mu_high=50.0, std_high=0.01,
       batch_size=batch_size)
   time_step = env.reset()
   time_step = env.step(np.arange(batch_size))
   self.assertEqual(time_step.reward.shape, (batch_size,))
Example #4
 def test_delta_out_of_bound_parameter(self, delta):
     with self.assertRaisesRegexp(
             ValueError,
             r'Delta must be in \(0, 1\)\, but saw delta: %g' % delta):
         wheel_py_environment.WheelPyEnvironment(
             delta=delta,
             mu_base=[1.2, 1.0, 1.0, 1.0, 1.0],
             std_base=0.01 * np.ones(5),
             mu_high=50.0,
             std_high=0.01)
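The test methods in Examples #3, #4, and #7 take extra arguments (batch_size, delta) that are presumably filled in by a parameterization decorator stripped from these snippets. Below is a minimal sketch of how such a test could be wired up with absl's parameterized helpers; the test-case names and the out-of-range delta values are assumptions for illustration.

import numpy as np
import tensorflow as tf
from absl.testing import parameterized
from tf_agents.bandits.environments import wheel_py_environment


class WheelPyEnvironmentTest(parameterized.TestCase, tf.test.TestCase):

  # Each named tuple supplies the `delta` argument of the test method below.
  @parameterized.named_parameters(
      ('_too_small', 0.0),
      ('_too_large', 1.5))
  def test_delta_out_of_bound_parameter(self, delta):
    with self.assertRaisesRegex(
        ValueError, r'Delta must be in \(0, 1\)'):
      wheel_py_environment.WheelPyEnvironment(
          delta=delta,
          mu_base=[1.2, 1.0, 1.0, 1.0, 1.0],
          std_base=0.01 * np.ones(5),
          mu_high=50.0,
          std_high=0.01)


if __name__ == '__main__':
  tf.test.main()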
Example #5
def main(unused_argv):
    tf.enable_resource_variables()

    with tf.device('/CPU:0'):  # due to b/128333994
        env = wheel_py_environment.WheelPyEnvironment(DELTA, MU_BASE, STD_BASE,
                                                      MU_HIGH, STD_HIGH,
                                                      BATCH_SIZE)
        environment = tf_py_environment.TFPyEnvironment(env)

        optimal_reward_fn = functools.partial(
            environment_utilities.tf_wheel_bandit_compute_optimal_reward,
            delta=DELTA,
            mu_inside=MU_BASE[0],
            mu_high=MU_HIGH)
        optimal_action_fn = functools.partial(
            environment_utilities.tf_wheel_bandit_compute_optimal_action,
            delta=DELTA)

        if FLAGS.agent == 'LinUCB':
            agent = lin_ucb_agent.LinearUCBAgent(
                time_step_spec=environment.time_step_spec(),
                action_spec=environment.action_spec(),
                alpha=AGENT_ALPHA,
                dtype=tf.float32)
        elif FLAGS.agent == 'LinTS':
            agent = lin_ts_agent.LinearThompsonSamplingAgent(
                time_step_spec=environment.time_step_spec(),
                action_spec=environment.action_spec(),
                alpha=AGENT_ALPHA,
                dtype=tf.float32)
        elif FLAGS.agent == 'epsGreedy':
            network = q_network.QNetwork(
                input_tensor_spec=environment.time_step_spec().observation,
                action_spec=environment.action_spec(),
                fc_layer_params=LAYERS)
            agent = eps_greedy_agent.NeuralEpsilonGreedyAgent(
                time_step_spec=environment.time_step_spec(),
                action_spec=environment.action_spec(),
                reward_network=network,
                optimizer=tf.compat.v1.train.AdamOptimizer(learning_rate=LR),
                epsilon=EPSILON)

        regret_metric = tf_bandit_metrics.RegretMetric(optimal_reward_fn)
        suboptimal_arms_metric = tf_bandit_metrics.SuboptimalArmsMetric(
            optimal_action_fn)

        trainer.train(
            root_dir=FLAGS.root_dir,
            agent=agent,
            environment=environment,
            training_loops=TRAINING_LOOPS,
            steps_per_loop=STEPS_PER_LOOP,
            additional_metrics=[regret_metric, suboptimal_arms_metric])
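This training script references module-level flags and constants (FLAGS, DELTA, MU_BASE, STD_BASE, MU_HIGH, STD_HIGH, BATCH_SIZE, AGENT_ALPHA, LAYERS, LR, EPSILON, TRAINING_LOOPS, STEPS_PER_LOOP) that the snippet omits. A minimal sketch of what those definitions might look like with absl flags follows; every value below is an illustrative assumption, not the original configuration.

from absl import flags

flags.DEFINE_string('root_dir', '/tmp/wheel_bandit/',
                    'Root directory for checkpoints and metrics.')
flags.DEFINE_string('agent', 'LinUCB',
                    'Agent to train: LinUCB, LinTS, or epsGreedy.')
FLAGS = flags.FLAGS

# Wheel-bandit environment parameters (illustrative values only).
BATCH_SIZE = 8
DELTA = 0.5
MU_BASE = [1.2, 1.0, 1.0, 1.0, 1.0]
STD_BASE = [0.01] * 5
MU_HIGH = 50.0
STD_HIGH = 0.01

# Agent and training hyperparameters (illustrative values only).
AGENT_ALPHA = 10.0
EPSILON = 0.05
LAYERS = (50, 50, 50)
LR = 0.005
TRAINING_LOOPS = 200
STEPS_PER_LOOP = 2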
Example #6
def get_environment_and_optimal_functions_by_name(environment_name,
                                                  batch_size):
    """Helper function that outputs an environment and related functions.

  Args:
    environment_name: The (string) name of the desired environment.
    batch_size: The batch_size

  Returns:
    A tuple of (environment, optimal_reward_fn, optimal_action_fn), where the
    latter two functions are for calculating regret and the suboptimal actions
    metrics.
  """
    if environment_name == 'stationary_stochastic':
        context_dim = 7
        num_actions = 5
        action_reward_fns = (
            environment_utilities.sliding_linear_reward_fn_generator(
                context_dim, num_actions, 0.1))
        py_env = (
            stationary_stochastic_py_environment
            .StationaryStochasticPyEnvironment(
                functools.partial(
                    environment_utilities.context_sampling_fn,
                    batch_size=batch_size,
                    context_dim=context_dim),
                action_reward_fns,
                batch_size=batch_size))
        optimal_reward_fn = functools.partial(
            environment_utilities.tf_compute_optimal_reward,
            per_action_reward_fns=action_reward_fns)

        optimal_action_fn = functools.partial(
            environment_utilities.tf_compute_optimal_action,
            per_action_reward_fns=action_reward_fns)
        environment = tf_py_environment.TFPyEnvironment(py_env)
    elif environment_name == 'wheel':
        delta = 0.5
        mu_base = [0.05, 0.01, 0.011, 0.009, 0.012]
        std_base = [0.001] * 5
        mu_high = 0.5
        std_high = 0.001
        py_env = wheel_py_environment.WheelPyEnvironment(
            delta, mu_base, std_base, mu_high, std_high, batch_size)
        environment = tf_py_environment.TFPyEnvironment(py_env)
        optimal_reward_fn = functools.partial(
            environment_utilities.tf_wheel_bandit_compute_optimal_reward,
            delta=delta,
            mu_inside=mu_base[0],
            mu_high=mu_high)
        optimal_action_fn = functools.partial(
            environment_utilities.tf_wheel_bandit_compute_optimal_action,
            delta=delta)
    return (environment, optimal_reward_fn, optimal_action_fn)
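A hypothetical usage sketch for this helper, feeding its outputs into the same regret and suboptimal-arms metrics used by the training scripts above; the metrics import alias and the batch size are assumptions.

# The metric classes are assumed to live in tf_agents.bandits.metrics.tf_metrics,
# imported here under the alias used by the training scripts above.
from tf_agents.bandits.metrics import tf_metrics as tf_bandit_metrics

environment, optimal_reward_fn, optimal_action_fn = (
    get_environment_and_optimal_functions_by_name('wheel', batch_size=8))

regret_metric = tf_bandit_metrics.RegretMetric(optimal_reward_fn)
suboptimal_arms_metric = tf_bandit_metrics.SuboptimalArmsMetric(
    optimal_action_fn)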
Example #7
  def test_observation_validity(self, batch_size):
    """Tests that the observations fall into the unit circle."""
    env = wheel_py_environment.WheelPyEnvironment(
        delta=0.5, mu_base=[1.2, 1.0, 1.0, 1.0, 1.0],
        std_base=0.01 * np.ones(5), mu_high=50.0, std_high=0.01,
        batch_size=batch_size)

    for _ in range(5):
      observation = env.reset().observation
      self.assertEqual(list(observation.shape),
                       [batch_size] + list(env.observation_spec().shape))
      for i in range(batch_size):
        self.assertLessEqual(np.linalg.norm(observation[i, :]), 1)
Example #8
def get_environment_and_optimal_functions_by_name(environment_name, batch_size):
  if environment_name == 'stationary_stochastic':
    context_dim = 7
    num_actions = 5
    action_reward_fns = (
        environment_utilities.sliding_linear_reward_fn_generator(
            context_dim, num_actions, 0.1))
    py_env = (
        stationary_stochastic_py_environment
        .StationaryStochasticPyEnvironment(
            functools.partial(
                environment_utilities.context_sampling_fn,
                batch_size=batch_size,
                context_dim=context_dim),
            action_reward_fns,
            batch_size=batch_size))
    optimal_reward_fn = functools.partial(
        environment_utilities.tf_compute_optimal_reward,
        per_action_reward_fns=action_reward_fns)

    optimal_action_fn = functools.partial(
        environment_utilities.tf_compute_optimal_action,
        per_action_reward_fns=action_reward_fns)
    environment = tf_py_environment.TFPyEnvironment(py_env)
  elif environment_name == 'wheel':
    delta = 0.5
    mu_base = [0.05, 0.01, 0.011, 0.009, 0.012]
    std_base = [0.001] * 5
    mu_high = 0.5
    std_high = 0.001
    py_env = wheel_py_environment.WheelPyEnvironment(delta, mu_base, std_base,
                                                     mu_high, std_high,
                                                     batch_size)
    environment = tf_py_environment.TFPyEnvironment(py_env)
    optimal_reward_fn = functools.partial(
        environment_utilities.tf_wheel_bandit_compute_optimal_reward,
        delta=delta,
        mu_inside=mu_base[0],
        mu_high=mu_high)
    optimal_action_fn = functools.partial(
        environment_utilities.tf_wheel_bandit_compute_optimal_action,
        delta=delta)
  return (environment, optimal_reward_fn, optimal_action_fn)
Example #9
def main(unused_argv):
    tf.compat.v1.enable_v2_behavior()  # The trainer only runs with V2 enabled.

    with tf.device('/CPU:0'):  # due to b/128333994
        env = wheel_py_environment.WheelPyEnvironment(DELTA, MU_BASE, STD_BASE,
                                                      MU_HIGH, STD_HIGH,
                                                      BATCH_SIZE)
        environment = tf_py_environment.TFPyEnvironment(env)

        optimal_reward_fn = functools.partial(
            environment_utilities.tf_wheel_bandit_compute_optimal_reward,
            delta=DELTA,
            mu_inside=MU_BASE[0],
            mu_high=MU_HIGH)
        optimal_action_fn = functools.partial(
            environment_utilities.tf_wheel_bandit_compute_optimal_action,
            delta=DELTA)
        network = q_network.QNetwork(
            input_tensor_spec=environment.time_step_spec().observation,
            action_spec=environment.action_spec(),
            fc_layer_params=LAYERS)

        if FLAGS.agent == 'LinUCB':
            agent = lin_ucb_agent.LinearUCBAgent(
                time_step_spec=environment.time_step_spec(),
                action_spec=environment.action_spec(),
                alpha=AGENT_ALPHA,
                dtype=tf.float32)
        elif FLAGS.agent == 'LinTS':
            agent = lin_ts_agent.LinearThompsonSamplingAgent(
                time_step_spec=environment.time_step_spec(),
                action_spec=environment.action_spec(),
                alpha=AGENT_ALPHA,
                dtype=tf.float32)
        elif FLAGS.agent == 'epsGreedy':
            agent = eps_greedy_agent.NeuralEpsilonGreedyAgent(
                time_step_spec=environment.time_step_spec(),
                action_spec=environment.action_spec(),
                reward_network=network,
                optimizer=tf.compat.v1.train.AdamOptimizer(learning_rate=LR),
                epsilon=EPSILON)
        elif FLAGS.agent == 'random':
            agent = eps_greedy_agent.NeuralEpsilonGreedyAgent(
                time_step_spec=environment.time_step_spec(),
                action_spec=environment.action_spec(),
                reward_network=network,
                optimizer=tf.compat.v1.train.AdamOptimizer(learning_rate=LR),
                epsilon=1.)
        elif FLAGS.agent == 'Mix':
            emit_policy_info = (
                policy_utilities.InfoFields.PREDICTED_REWARDS_MEAN, )
            agent_epsgreedy = eps_greedy_agent.NeuralEpsilonGreedyAgent(
                time_step_spec=environment.time_step_spec(),
                action_spec=environment.action_spec(),
                reward_network=network,
                optimizer=tf.compat.v1.train.AdamOptimizer(learning_rate=LR),
                emit_policy_info=emit_policy_info,
                epsilon=EPSILON)
            agent_linucb = lin_ucb_agent.LinearUCBAgent(
                time_step_spec=environment.time_step_spec(),
                action_spec=environment.action_spec(),
                alpha=AGENT_ALPHA,
                emit_policy_info=emit_policy_info,
                dtype=tf.float32)
            agent_random = eps_greedy_agent.NeuralEpsilonGreedyAgent(
                time_step_spec=environment.time_step_spec(),
                action_spec=environment.action_spec(),
                reward_network=network,
                optimizer=tf.compat.v1.train.AdamOptimizer(learning_rate=LR),
                emit_policy_info=emit_policy_info,
                epsilon=1.)
            agent_halfrandom = eps_greedy_agent.NeuralEpsilonGreedyAgent(
                time_step_spec=environment.time_step_spec(),
                action_spec=environment.action_spec(),
                reward_network=network,
                optimizer=tf.compat.v1.train.AdamOptimizer(learning_rate=LR),
                emit_policy_info=emit_policy_info,
                epsilon=0.5)
            agent = exp3_mixture_agent.Exp3MixtureAgent(
                (agent_epsgreedy, agent_linucb, agent_random,
                 agent_halfrandom))

        regret_metric = tf_bandit_metrics.RegretMetric(optimal_reward_fn)
        suboptimal_arms_metric = tf_bandit_metrics.SuboptimalArmsMetric(
            optimal_action_fn)

        trainer.train(
            root_dir=FLAGS.root_dir,
            agent=agent,
            environment=environment,
            training_loops=TRAINING_LOOPS,
            steps_per_loop=STEPS_PER_LOOP,
            additional_metrics=[regret_metric, suboptimal_arms_metric])