Esempio n. 1
0
    def test_with_uniform_context_and_normal_mu_reward(self):
        """Resets and steps an environment five times, checking specs.

        Contexts are sampled with `np.random.randint(-10, 10, [1, 4])` and
        three reward functions are built by passing a 4-element theta to
        `LinearNormalReward`. A random policy picks the action. Both the
        time step returned by `reset()` and the one returned by `step()`
        are validated against the environment's time step spec.
        """
        def _context_sampling_fn():
            return np.random.randint(-10, 10, [1, 4])

        reward_fns = [
            LinearNormalReward(theta)
            for theta in ([0, 1, 2, 3], [3, 2, 1, 0], [-1, -2, -3, -4])
        ]

        env = sspe.StationaryStochasticPyEnvironment(_context_sampling_fn,
                                                     reward_fns)
        time_step_spec = env.time_step_spec()
        action_spec = env.action_spec()

        random_policy = random_py_policy.RandomPyPolicy(
            time_step_spec=time_step_spec, action_spec=action_spec)

        def _assert_matches_spec(time_step):
            # Shared check so reset and step results are held to the same
            # standard.
            self.assertTrue(
                check_unbatched_time_step_spec(time_step=time_step,
                                               time_step_spec=time_step_spec,
                                               batch_size=env.batch_size))

        for _ in range(5):
            time_step = env.reset()
            _assert_matches_spec(time_step)

            action = random_policy.action(time_step).action
            # The stepped time step (which carries the reward) must conform
            # to the spec as well, otherwise stepping is exercised but never
            # verified.
            time_step = env.step(action)
            _assert_matches_spec(time_step)
    def test_non_scalar_rewards(self):
        """Checks vector-valued rewards for a batched, deterministic setup.

        The environment is built from four reward functions (one per arm),
        each created from a 2x2 theta, and runs with batch size 3 on a fixed
        context batch. The test verifies the observation, the rewards of two
        different action batches, and that the reward spec's leading
        dimension is 2.
        """
        fixed_contexts = [[4, 3], [4, 3], [5, 6]]

        def _context_sampling_fn():
            return np.array(fixed_contexts)

        # One 2x2 parameter matrix per arm: 4 arms, 2-dimensional rewards.
        thetas = (
            np.array([[0, 1], [1, 0]]),
            np.array([[1, 2], [2, 1]]),
            np.array([[2, 3], [3, 2]]),
            np.array([[3, 4], [4, 3]]),
        )
        reward_fns = [LinearDeterministicMultipleRewards(t) for t in thetas]

        env = sspe.StationaryStochasticPyEnvironment(
            _context_sampling_fn, reward_fns, batch_size=3)

        first_step = env.reset()
        self.assertAllEqual(first_step.observation, fixed_contexts)

        rewards_a = env.step([0, 1, 2]).reward
        self.assertAllEqual(rewards_a, [[3., 4.], [10., 11.], [28., 27.]])

        env.reset()
        rewards_b = env.step([2, 3, 0]).reward
        self.assertAllEqual(rewards_b, [[17., 18.], [24., 25.], [6., 5.]])

        # The reward entry of the time step spec must describe 2-vectors.
        reward_spec = env.time_step_spec().reward
        self.assertEqual(reward_spec.shape[0], 2)
Esempio n. 3
0
  def test_with_normal_context_and_normal_reward(self):
    """Resets and steps a single-arm environment five times, checking specs.

    Contexts are sampled with `np.random.normal(0, 3, [1, 2])`; the only
    reward function draws from a normal distribution whose mean is
    `2 * x[0]` and whose scale is `abs(x[1]) + 1`. Both the time step
    returned by `reset()` and the one returned by `step()` are validated
    against the environment's time step spec.
    """

    def _context_sampling_fn():
      return np.random.normal(0, 3, [1, 2])

    def _reward_fn(x):
      return np.random.normal(2 * x[0], abs(x[1]) + 1)

    env = sspe.StationaryStochasticPyEnvironment(_context_sampling_fn,
                                                 [_reward_fn])
    time_step_spec = env.time_step_spec()
    action_spec = env.action_spec()

    random_policy = random_py_policy.RandomPyPolicy(
        time_step_spec=time_step_spec, action_spec=action_spec)

    def _assert_matches_spec(time_step):
      # Shared check so reset and step results are held to the same standard.
      self.assertTrue(
          check_unbatched_time_step_spec(
              time_step=time_step,
              time_step_spec=time_step_spec,
              batch_size=env.batch_size))

    for _ in range(5):
      time_step = env.reset()
      _assert_matches_spec(time_step)

      action = random_policy.action(time_step).action
      # The stepped time step (which carries the reward) must conform to the
      # spec as well, otherwise stepping is exercised but never verified.
      time_step = env.step(action)
      _assert_matches_spec(time_step)
Esempio n. 4
0
def main(unused_argv):
    """Trains a DQN agent on a stationary stochastic bandit environment.

    Builds sliding linear reward functions, wraps the resulting Python
    environment for TensorFlow, creates a Q-network backed `DqnAgent`, and
    runs the trainer with regret and suboptimal-arm metrics attached.

    Args:
      unused_argv: Command-line arguments; not used.
    """
    tf.compat.v1.enable_v2_behavior()  # The trainer only runs with V2 enabled.

    with tf.device('/CPU:0'):  # due to b/128333994
        action_reward_fns = (
            environment_utilities.sliding_linear_reward_fn_generator(
                CONTEXT_DIM, NUM_ACTIONS, REWARD_NOISE_VARIANCE))

        # Context sampler with batch size and dimensionality pre-bound.
        sample_contexts = functools.partial(
            environment_utilities.context_sampling_fn,
            batch_size=BATCH_SIZE,
            context_dim=CONTEXT_DIM)
        py_env = sspe.StationaryStochasticPyEnvironment(
            sample_contexts, action_reward_fns, batch_size=BATCH_SIZE)
        environment = tf_py_environment.TFPyEnvironment(py_env)

        # Oracles used by the metrics below; both see the true reward
        # functions.
        best_reward_fn = functools.partial(
            environment_utilities.tf_compute_optimal_reward,
            per_action_reward_fns=action_reward_fns)
        best_action_fn = functools.partial(
            environment_utilities.tf_compute_optimal_action,
            per_action_reward_fns=action_reward_fns)

        q_net = q_network.QNetwork(
            environment.observation_spec(),
            environment.action_spec(),
            fc_layer_params=(50, 50))

        adam = tf.compat.v1.train.AdamOptimizer(learning_rate=1e-2)
        agent = dqn_agent.DqnAgent(
            environment.time_step_spec(),
            environment.action_spec(),
            q_network=q_net,
            epsilon_greedy=0.1,
            target_update_tau=0.05,
            target_update_period=5,
            optimizer=adam,
            td_errors_loss_fn=common.element_wise_squared_loss)

        metrics = [
            tf_bandit_metrics.RegretMetric(best_reward_fn),
            tf_bandit_metrics.SuboptimalArmsMetric(best_action_fn),
        ]

        trainer.train(
            root_dir=FLAGS.root_dir,
            agent=agent,
            environment=environment,
            training_loops=TRAINING_LOOPS,
            steps_per_loop=STEPS_PER_LOOP,
            additional_metrics=metrics)
Esempio n. 5
0
def main(unused_argv):
    """Trains a LinUCB or Linear Thompson Sampling agent on a bandit task.

    Builds sliding linear reward functions, wraps the resulting stationary
    stochastic Python environment for TensorFlow, creates the agent selected
    by `FLAGS.agent`, and runs the trainer with regret and suboptimal-arm
    metrics attached.

    Args:
      unused_argv: Command-line arguments; not used.

    Raises:
      ValueError: If `FLAGS.agent` is neither 'LinUCB' nor 'LinTS'.
    """
    tf.compat.v1.enable_resource_variables()

    with tf.device('/CPU:0'):  # due to b/128333994
        action_reward_fns = (
            environment_utilities.sliding_linear_reward_fn_generator(
                CONTEXT_DIM, NUM_ACTIONS, REWARD_NOISE_VARIANCE))

        env = sspe.StationaryStochasticPyEnvironment(
            functools.partial(
                environment_utilities.context_sampling_fn,
                batch_size=BATCH_SIZE,
                context_dim=CONTEXT_DIM),
            action_reward_fns,
            batch_size=BATCH_SIZE)
        environment = tf_py_environment.TFPyEnvironment(env)

        # Oracles used by the metrics; both see the true reward functions.
        optimal_reward_fn = functools.partial(
            environment_utilities.tf_compute_optimal_reward,
            per_action_reward_fns=action_reward_fns)

        optimal_action_fn = functools.partial(
            environment_utilities.tf_compute_optimal_action,
            per_action_reward_fns=action_reward_fns)

        if FLAGS.agent == 'LinUCB':
            agent = lin_ucb_agent.LinearUCBAgent(
                time_step_spec=environment.time_step_spec(),
                action_spec=environment.action_spec(),
                alpha=AGENT_ALPHA,
                dtype=tf.float32)
        elif FLAGS.agent == 'LinTS':
            agent = lin_ts_agent.LinearThompsonSamplingAgent(
                time_step_spec=environment.time_step_spec(),
                action_spec=environment.action_spec(),
                alpha=AGENT_ALPHA,
                dtype=tf.float32)
        else:
            # Fail with a clear message instead of leaving `agent` unbound,
            # which would only surface later as an UnboundLocalError.
            raise ValueError(
                'Unsupported agent: {!r}'.format(FLAGS.agent))

        regret_metric = tf_bandit_metrics.RegretMetric(optimal_reward_fn)
        suboptimal_arms_metric = tf_bandit_metrics.SuboptimalArmsMetric(
            optimal_action_fn)

        trainer.train(
            root_dir=FLAGS.root_dir,
            agent=agent,
            environment=environment,
            training_loops=TRAINING_LOOPS,
            steps_per_loop=STEPS_PER_LOOP,
            additional_metrics=[regret_metric, suboptimal_arms_metric])
Esempio n. 6
0
    def test_deterministic_with_batch_2(self):
        """Checks observations and rewards for a fixed context, batch size 2.

        Four reward functions are built by passing a 2-element theta to
        `LinearDeterministicReward`; the context batch is constant, so the
        rewards for each action batch can be asserted exactly.
        """
        fixed_contexts = [[4, 3], [4, 3]]

        def _context_sampling_fn():
            return np.array(fixed_contexts)

        thetas = ([0, 1], [1, 2], [2, 3], [3, 4])
        reward_fns = [LinearDeterministicReward(t) for t in thetas]

        env = sspe.StationaryStochasticPyEnvironment(
            _context_sampling_fn, reward_fns, batch_size=2)

        initial_step = env.reset()
        self.assertAllEqual(initial_step.observation, fixed_contexts)

        # Arms 0 and 1 on the two batch entries.
        self.assertAllEqual(env.step([0, 1]).reward, [3, 10])

        # Fresh round, now with arms 2 and 3.
        env.reset()
        self.assertAllEqual(env.step([2, 3]).reward, [17, 24])
Esempio n. 7
0
    def testBanditEnvironment(self):
        """Runs an episode driver on a bandit environment and checks step types.

        A batch-size-2 environment with two reward functions is wrapped for
        TensorFlow and driven by a random policy; trajectories are collected
        in a uniform replay buffer. Every stored trajectory is expected to
        have `step_type` 0 and `next_step_type` 2 (presumably the FIRST and
        LAST step types -- confirm against the StepType definition).
        """
        batch_size = 2
        steps_per_loop = 4

        def _context_sampling_fn():
            return np.array([[5, -5], [2, -2]])

        # sigma=0.0 is passed to each reward function.
        reward_fns = []
        for theta in ([1, 0], [0, 1]):
            reward_fns.append(
                environment_utilities.LinearNormalReward(theta, sigma=0.0))

        py_env = sspe.StationaryStochasticPyEnvironment(
            _context_sampling_fn, reward_fns, batch_size=batch_size)
        env = tf_py_environment.TFPyEnvironment(py_env)
        policy = random_tf_policy.RandomTFPolicy(
            env.time_step_spec(), env.action_spec())

        replay_buffer = tf_uniform_replay_buffer.TFUniformReplayBuffer(
            data_spec=policy.trajectory_spec,
            batch_size=batch_size,
            max_length=steps_per_loop)

        driver = dynamic_episode_driver.DynamicEpisodeDriver(
            env,
            policy,
            num_episodes=steps_per_loop * batch_size,
            observers=[replay_buffer.add_batch])

        # Build both ops before evaluating anything.
        run_op = driver.run()
        gather_op = replay_buffer.gather_all()

        self.evaluate(tf.compat.v1.global_variables_initializer())
        self.evaluate(run_op)
        collected = self.evaluate(gather_op)

        expected_step_type = [[0, 0, 0, 0], [0, 0, 0, 0]]
        expected_next_step_type = [[2, 2, 2, 2], [2, 2, 2, 2]]
        self.assertAllEqual(collected.step_type, expected_step_type)
        self.assertAllEqual(collected.next_step_type, expected_next_step_type)
def main(unused_argv):
  """Trains the bandit agent selected by `FLAGS.agent`.

  Builds (optionally normalized) sliding linear reward functions, wraps the
  stationary stochastic Python environment for TensorFlow, optionally adds
  extra disabled actions, creates the requested agent ('LinUCB', 'LinTS',
  'epsGreedy' or 'Mix'), and runs the trainer with regret and
  suboptimal-arm metrics attached.

  Args:
    unused_argv: Command-line arguments; not used.

  Raises:
    ValueError: If `FLAGS.agent` is not one of the supported names, or if
      the 'Mix' agent is requested together with disabled actions.
  """
  tf.compat.v1.enable_v2_behavior()  # The trainer only runs with V2 enabled.

  with tf.device('/CPU:0'):  # due to b/128333994
    if FLAGS.normalize_reward_fns:
      action_reward_fns = (
          environment_utilities.normalized_sliding_linear_reward_fn_generator(
              CONTEXT_DIM, NUM_ACTIONS, REWARD_NOISE_VARIANCE))
    else:
      action_reward_fns = (
          environment_utilities.sliding_linear_reward_fn_generator(
              CONTEXT_DIM, NUM_ACTIONS, REWARD_NOISE_VARIANCE))

    env = sspe.StationaryStochasticPyEnvironment(
        functools.partial(
            environment_utilities.context_sampling_fn,
            batch_size=BATCH_SIZE,
            context_dim=CONTEXT_DIM),
        action_reward_fns,
        batch_size=BATCH_SIZE)
    mask_split_fn = None
    if FLAGS.num_disabled_actions > 0:
      # With the wrapper the observation is indexable; element 0 is handed
      # to the network/metrics and element 1 is passed on as the constraint.
      mask_split_fn = lambda x: (x[0], x[1])
      env = wrappers.ExtraDisabledActionsWrapper(env,
                                                 FLAGS.num_disabled_actions)
    environment = tf_py_environment.TFPyEnvironment(env)

    # Oracles used by the metrics; both see the true reward functions.
    optimal_reward_fn = functools.partial(
        environment_utilities.tf_compute_optimal_reward,
        per_action_reward_fns=action_reward_fns)

    optimal_action_fn = functools.partial(
        environment_utilities.tf_compute_optimal_action,
        per_action_reward_fns=action_reward_fns)

    network_input_spec = environment.time_step_spec().observation
    if FLAGS.num_disabled_actions > 0:

      def _apply_only_to_observation(fn):
        """Wraps `fn` so it only receives the first observation element."""
        def result_fn(obs):
          return fn(obs[0])
        return result_fn

      optimal_action_fn = _apply_only_to_observation(optimal_action_fn)
      optimal_reward_fn = _apply_only_to_observation(optimal_reward_fn)
      network_input_spec = network_input_spec[0]

    network = q_network.QNetwork(
        input_tensor_spec=network_input_spec,
        action_spec=environment.action_spec(),
        fc_layer_params=LAYERS)

    if FLAGS.agent == 'LinUCB':
      agent = lin_ucb_agent.LinearUCBAgent(
          time_step_spec=environment.time_step_spec(),
          action_spec=environment.action_spec(),
          alpha=AGENT_ALPHA,
          dtype=tf.float32,
          observation_and_action_constraint_splitter=mask_split_fn)
    elif FLAGS.agent == 'LinTS':
      agent = lin_ts_agent.LinearThompsonSamplingAgent(
          time_step_spec=environment.time_step_spec(),
          action_spec=environment.action_spec(),
          alpha=AGENT_ALPHA,
          dtype=tf.float32,
          observation_and_action_constraint_splitter=mask_split_fn)
    elif FLAGS.agent == 'epsGreedy':
      agent = neural_epsilon_greedy_agent.NeuralEpsilonGreedyAgent(
          time_step_spec=environment.time_step_spec(),
          action_spec=environment.action_spec(),
          reward_network=network,
          optimizer=tf.compat.v1.train.AdamOptimizer(learning_rate=LR),
          epsilon=EPSILON,
          observation_and_action_constraint_splitter=mask_split_fn)
    elif FLAGS.agent == 'Mix':
      # Explicit raise rather than `assert`, so the check survives `-O`.
      if FLAGS.num_disabled_actions != 0:
        raise ValueError('Extra actions with mixture agent not supported.')

      emit_policy_info = policy_utilities.InfoFields.PREDICTED_REWARDS_MEAN
      agent_linucb = lin_ucb_agent.LinearUCBAgent(
          time_step_spec=environment.time_step_spec(),
          action_spec=environment.action_spec(),
          emit_policy_info=emit_policy_info,
          alpha=AGENT_ALPHA,
          dtype=tf.float32)
      agent_lints = lin_ts_agent.LinearThompsonSamplingAgent(
          time_step_spec=environment.time_step_spec(),
          action_spec=environment.action_spec(),
          emit_policy_info=emit_policy_info,
          alpha=AGENT_ALPHA,
          dtype=tf.float32)
      agent_epsgreedy = neural_epsilon_greedy_agent.NeuralEpsilonGreedyAgent(
          time_step_spec=environment.time_step_spec(),
          action_spec=environment.action_spec(),
          reward_network=network,
          optimizer=tf.compat.v1.train.AdamOptimizer(learning_rate=LR),
          emit_policy_info=emit_policy_info,
          epsilon=EPSILON)
      agent = exp3_mixture_agent.Exp3MixtureAgent(
          (agent_linucb, agent_lints, agent_epsgreedy))
    else:
      # Fail with a clear message instead of leaving `agent` unbound, which
      # would only surface later as an UnboundLocalError.
      raise ValueError('Unsupported agent: {!r}'.format(FLAGS.agent))

    regret_metric = tf_bandit_metrics.RegretMetric(optimal_reward_fn)
    suboptimal_arms_metric = tf_bandit_metrics.SuboptimalArmsMetric(
        optimal_action_fn)

    trainer.train(
        root_dir=FLAGS.root_dir,
        agent=agent,
        environment=environment,
        training_loops=TRAINING_LOOPS,
        steps_per_loop=STEPS_PER_LOOP,
        additional_metrics=[regret_metric, suboptimal_arms_metric])
Esempio n. 9
0
def main(unused_argv):
    """Trains the bandit agent selected by `FLAGS.agent`.

    Builds (optionally normalized) sliding linear reward functions, wraps
    the stationary stochastic Python environment for TensorFlow, optionally
    adds extra disabled actions, creates the requested agent ('LinUCB',
    'LinTS', 'epsGreedy', 'Boltzmann', 'BoltzmannGumbel' or 'Mix'), and runs
    the trainer with regret and suboptimal-arm metrics attached.

    Args:
      unused_argv: Command-line arguments; not used.

    Raises:
      ValueError: If `FLAGS.agent` is not one of the supported names, or if
        the 'Mix' agent is requested together with disabled actions.
    """
    tf.compat.v1.enable_v2_behavior()  # The trainer only runs with V2 enabled.

    with tf.device('/CPU:0'):  # due to b/128333994
        if FLAGS.normalize_reward_fns:
            action_reward_fns = (environment_utilities.
                                 normalized_sliding_linear_reward_fn_generator(
                                     CONTEXT_DIM, NUM_ACTIONS,
                                     REWARD_NOISE_VARIANCE))
        else:
            action_reward_fns = (
                environment_utilities.sliding_linear_reward_fn_generator(
                    CONTEXT_DIM, NUM_ACTIONS, REWARD_NOISE_VARIANCE))

        env = sspe.StationaryStochasticPyEnvironment(
            functools.partial(
                environment_utilities.context_sampling_fn,
                batch_size=BATCH_SIZE,
                context_dim=CONTEXT_DIM),
            action_reward_fns,
            batch_size=BATCH_SIZE)
        mask_split_fn = None
        if FLAGS.num_disabled_actions > 0:
            # With the wrapper the observation is indexable; element 0 is
            # handed to the network/metrics and element 1 is passed on as
            # the constraint.
            mask_split_fn = lambda x: (x[0], x[1])
            env = wrappers.ExtraDisabledActionsWrapper(
                env, FLAGS.num_disabled_actions)
        environment = tf_py_environment.TFPyEnvironment(env)

        # Oracles used by the metrics; both see the true reward functions.
        optimal_reward_fn = functools.partial(
            environment_utilities.tf_compute_optimal_reward,
            per_action_reward_fns=action_reward_fns)

        optimal_action_fn = functools.partial(
            environment_utilities.tf_compute_optimal_action,
            per_action_reward_fns=action_reward_fns)

        network_input_spec = environment.time_step_spec().observation
        if FLAGS.num_disabled_actions > 0:

            def _apply_only_to_observation(fn):
                """Wraps `fn` so it only gets the first observation element."""
                def result_fn(obs):
                    return fn(obs[0])

                return result_fn

            optimal_action_fn = _apply_only_to_observation(optimal_action_fn)
            optimal_reward_fn = _apply_only_to_observation(optimal_reward_fn)
            network_input_spec = network_input_spec[0]

        network = q_network.QNetwork(input_tensor_spec=network_input_spec,
                                     action_spec=environment.action_spec(),
                                     fc_layer_params=LAYERS)

        if FLAGS.agent == 'LinUCB':
            agent = lin_ucb_agent.LinearUCBAgent(
                time_step_spec=environment.time_step_spec(),
                action_spec=environment.action_spec(),
                alpha=AGENT_ALPHA,
                dtype=tf.float32,
                observation_and_action_constraint_splitter=mask_split_fn)
        elif FLAGS.agent == 'LinTS':
            agent = lin_ts_agent.LinearThompsonSamplingAgent(
                time_step_spec=environment.time_step_spec(),
                action_spec=environment.action_spec(),
                alpha=AGENT_ALPHA,
                dtype=tf.float32,
                observation_and_action_constraint_splitter=mask_split_fn)
        elif FLAGS.agent == 'epsGreedy':
            agent = neural_epsilon_greedy_agent.NeuralEpsilonGreedyAgent(
                time_step_spec=environment.time_step_spec(),
                action_spec=environment.action_spec(),
                reward_network=network,
                optimizer=tf.compat.v1.train.AdamOptimizer(learning_rate=LR),
                epsilon=EPSILON,
                observation_and_action_constraint_splitter=mask_split_fn)
        elif FLAGS.agent == 'Boltzmann':
            train_step_counter = tf.compat.v1.train.get_or_create_global_step()
            # Temperature is 1000.0 up to step 500, then TEMPERATURE.
            boundaries = [500]
            temp_values = [1000.0, TEMPERATURE]
            temp_schedule = tf.keras.optimizers.schedules.PiecewiseConstantDecay(
                boundaries, temp_values)

            def _temperature_fn():
                # Any variable used in the function needs to be saved in the policy.
                # This is true by default for the `train_step_counter`.
                return temp_schedule(train_step_counter)

            agent = neural_boltzmann_agent.NeuralBoltzmannAgent(
                time_step_spec=environment.time_step_spec(),
                action_spec=environment.action_spec(),
                reward_network=network,
                temperature=_temperature_fn,
                optimizer=tf.compat.v1.train.AdamOptimizer(learning_rate=LR),
                observation_and_action_constraint_splitter=mask_split_fn,
                train_step_counter=train_step_counter)
            # This is needed, otherwise the PolicySaver complains.
            agent.policy.step = train_step_counter
        elif FLAGS.agent == 'BoltzmannGumbel':
            # One int32 counter variable per action, initialised to zero.
            num_samples_list = [
                tf.compat.v2.Variable(0,
                                      dtype=tf.int32,
                                      name='num_samples_{}'.format(k))
                for k in range(NUM_ACTIONS)
            ]
            agent = neural_boltzmann_agent.NeuralBoltzmannAgent(
                time_step_spec=environment.time_step_spec(),
                action_spec=environment.action_spec(),
                reward_network=network,
                boltzmann_gumbel_exploration_constant=250.0,
                optimizer=tf.compat.v1.train.AdamOptimizer(learning_rate=LR),
                observation_and_action_constraint_splitter=mask_split_fn,
                num_samples_list=num_samples_list)
        elif FLAGS.agent == 'Mix':
            # Explicit raise rather than `assert`, so the check survives `-O`.
            if FLAGS.num_disabled_actions != 0:
                raise ValueError(
                    'Extra actions with mixture agent not supported.')

            emit_policy_info = policy_utilities.InfoFields.PREDICTED_REWARDS_MEAN
            agent_linucb = lin_ucb_agent.LinearUCBAgent(
                time_step_spec=environment.time_step_spec(),
                action_spec=environment.action_spec(),
                emit_policy_info=emit_policy_info,
                alpha=AGENT_ALPHA,
                dtype=tf.float32)
            agent_lints = lin_ts_agent.LinearThompsonSamplingAgent(
                time_step_spec=environment.time_step_spec(),
                action_spec=environment.action_spec(),
                emit_policy_info=emit_policy_info,
                alpha=AGENT_ALPHA,
                dtype=tf.float32)
            agent_epsgreedy = neural_epsilon_greedy_agent.NeuralEpsilonGreedyAgent(
                time_step_spec=environment.time_step_spec(),
                action_spec=environment.action_spec(),
                reward_network=network,
                optimizer=tf.compat.v1.train.AdamOptimizer(learning_rate=LR),
                emit_policy_info=emit_policy_info,
                epsilon=EPSILON)
            agent = exp3_mixture_agent.Exp3MixtureAgent(
                (agent_linucb, agent_lints, agent_epsgreedy))
        else:
            # Fail with a clear message instead of leaving `agent` unbound,
            # which would only surface later as an UnboundLocalError.
            raise ValueError(
                'Unsupported agent: {!r}'.format(FLAGS.agent))

        regret_metric = tf_bandit_metrics.RegretMetric(optimal_reward_fn)
        suboptimal_arms_metric = tf_bandit_metrics.SuboptimalArmsMetric(
            optimal_action_fn)

        trainer.train(
            root_dir=FLAGS.root_dir,
            agent=agent,
            environment=environment,
            training_loops=TRAINING_LOOPS,
            steps_per_loop=STEPS_PER_LOOP,
            additional_metrics=[regret_metric, suboptimal_arms_metric])
def main(unused_argv):
    """Trains a bandit agent on an environment with structured linear rewards.

    Builds structured linear reward functions, wraps the stationary
    stochastic Python environment for TensorFlow, creates the agent selected
    by `FLAGS.agent` ('LinUCB', 'epsGreedy' or 'LinTS'), and runs the
    trainer with regret and suboptimal-arm metrics attached. The 'epsGreedy'
    agent is additionally given a Laplacian matrix built from the action
    spec, with a smoothing weight of 0.01.

    Args:
      unused_argv: Command-line arguments; not used.

    Raises:
      ValueError: If `FLAGS.agent` is not one of the supported names.
    """
    tf.compat.v1.enable_v2_behavior()  # The trainer only runs with V2 enabled.

    with tf.device('/CPU:0'):  # due to b/128333994
        action_reward_fns = (
            environment_utilities.structured_linear_reward_fn_generator(
                CONTEXT_DIM, NUM_ACTIONS, REWARD_NOISE_VARIANCE))

        env = sspe.StationaryStochasticPyEnvironment(
            functools.partial(
                environment_utilities.context_sampling_fn,
                batch_size=BATCH_SIZE,
                context_dim=CONTEXT_DIM),
            action_reward_fns,
            batch_size=BATCH_SIZE)
        environment = tf_py_environment.TFPyEnvironment(env)

        # Oracles used by the metrics; both see the true reward functions.
        optimal_reward_fn = functools.partial(
            environment_utilities.tf_compute_optimal_reward,
            per_action_reward_fns=action_reward_fns)

        optimal_action_fn = functools.partial(
            environment_utilities.tf_compute_optimal_action,
            per_action_reward_fns=action_reward_fns)

        if FLAGS.agent == 'LinUCB':
            agent = lin_ucb_agent.LinearUCBAgent(
                time_step_spec=environment.time_step_spec(),
                action_spec=environment.action_spec(),
                alpha=AGENT_ALPHA,
                dtype=tf.float32)
        elif FLAGS.agent == 'epsGreedy':
            laplacian_matrix = utils.build_laplacian_over_ordinal_integer_actions(
                environment.action_spec())

            network = q_network.QNetwork(
                input_tensor_spec=environment.time_step_spec().observation,
                action_spec=environment.action_spec(),
                fc_layer_params=REWARD_NETWORK_LAYER_PARAMS)
            agent = eps_greedy_agent.NeuralEpsilonGreedyAgent(
                time_step_spec=environment.time_step_spec(),
                action_spec=environment.action_spec(),
                reward_network=network,
                optimizer=tf.compat.v1.train.AdamOptimizer(
                    learning_rate=NN_LEARNING_RATE),
                epsilon=EPSILON,
                laplacian_matrix=laplacian_matrix,
                laplacian_smoothing_weight=0.01)
        elif FLAGS.agent == 'LinTS':
            agent = lin_ts_agent.LinearThompsonSamplingAgent(
                time_step_spec=environment.time_step_spec(),
                action_spec=environment.action_spec(),
                alpha=AGENT_ALPHA,
                dtype=tf.float32)
        else:
            # Fail with a clear message instead of leaving `agent` unbound,
            # which would only surface later as an UnboundLocalError.
            raise ValueError(
                'Unsupported agent: {!r}'.format(FLAGS.agent))

        regret_metric = tf_bandit_metrics.RegretMetric(optimal_reward_fn)
        suboptimal_arms_metric = tf_bandit_metrics.SuboptimalArmsMetric(
            optimal_action_fn)

        trainer.train(
            root_dir=FLAGS.root_dir,
            agent=agent,
            environment=environment,
            training_loops=TRAINING_LOOPS,
            steps_per_loop=STEPS_PER_LOOP,
            additional_metrics=[regret_metric, suboptimal_arms_metric])