Code example #1
 def testUCBandThompsonSamplingShareVariables(self):
     if not tf.executing_eagerly():
         self.skipTest('Test only works in eager mode.')
     context_dim = 9
     num_actions = 4
     batch_size = 7
     variable_collection = linear_agent.LinearBanditVariableCollection(
         context_dim=context_dim, num_models=num_actions)
     observation_spec = tensor_spec.TensorSpec([context_dim], tf.float32)
     time_step_spec = time_step.time_step_spec(observation_spec)
     action_spec = tensor_spec.BoundedTensorSpec(dtype=tf.int32,
                                                 shape=(),
                                                 minimum=0,
                                                 maximum=num_actions - 1)
     ucb_agent = lin_ucb_agent.LinearUCBAgent(
         time_step_spec=time_step_spec,
         action_spec=action_spec,
         variable_collection=variable_collection)
     ts_agent = linear_thompson_sampling_agent.LinearThompsonSamplingAgent(
         time_step_spec=time_step_spec,
         action_spec=action_spec,
         variable_collection=variable_collection)
     initial_step, final_step = _get_initial_and_final_steps(
         batch_size, context_dim)
     action = np.random.randint(num_actions,
                                size=batch_size,
                                dtype=np.int32)
     action_step = _get_action_step(action)
     experience = _get_experience(initial_step, action_step, final_step)
     # Training either agent updates the shared variable collection, so the
     # statistics visible through both agents remain identical.
     self.evaluate(ucb_agent.train(experience))
     self.assertAllEqual(ucb_agent._variable_collection.cov_matrix_list[0],
                         ts_agent._variable_collection.cov_matrix_list[0])
     self.evaluate(ts_agent.train(experience))
     self.assertAllEqual(ucb_agent._variable_collection.data_vector_list[0],
                         ts_agent._variable_collection.data_vector_list[0])
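All four examples rely on imports and helper functions that are not shown. They appear to come from the linear bandit agent tests in TF-Agents; the sketch below is a minimal, assumed reconstruction of those pieces (module paths and helper details may differ from the real test file), included only so the snippets can be read and run in isolation.

import os

import numpy as np
import tensorflow as tf

from tf_agents.bandits.agents import lin_ucb_agent
from tf_agents.bandits.agents import linear_bandit_agent as linear_agent
from tf_agents.bandits.agents import linear_thompson_sampling_agent
from tf_agents.bandits.agents import utils as bandit_utils
from tf_agents.specs import tensor_spec
from tf_agents.trajectories import policy_step
from tf_agents.trajectories import time_step
from tf_agents.trajectories import trajectory


def _get_initial_and_final_steps(batch_size, context_dim,
                                 use_constant_observations=False):
  # Build a batched (initial, final) pair of TimeSteps with random rewards.
  if use_constant_observations:
    observation = np.ones([batch_size, context_dim], dtype=np.float32)
  else:
    observation = np.random.uniform(
        size=[batch_size, context_dim]).astype(np.float32)
  reward = np.random.uniform(size=[batch_size]).astype(np.float32)
  initial_step = time_step.TimeStep(
      step_type=tf.constant(
          time_step.StepType.FIRST, dtype=tf.int32, shape=[batch_size]),
      reward=tf.zeros([batch_size], dtype=tf.float32),
      discount=tf.ones([batch_size], dtype=tf.float32),
      observation=tf.constant(observation))
  final_step = time_step.TimeStep(
      step_type=tf.constant(
          time_step.StepType.LAST, dtype=tf.int32, shape=[batch_size]),
      reward=tf.constant(reward),
      discount=tf.zeros([batch_size], dtype=tf.float32),
      observation=tf.constant(observation))
  return initial_step, final_step


def _get_action_step(action):
  # Wrap the chosen actions in a PolicyStep.
  return policy_step.PolicyStep(action=tf.convert_to_tensor(action))


def _get_experience(initial_step, action_step, final_step):
  # Pack one transition into the Trajectory format the agents train on,
  # adding the time dimension of size 1 that the agents expect.
  single_experience = trajectory.from_transition(
      initial_step, action_step, final_step)
  return tf.nest.map_structure(lambda x: tf.expand_dims(x, axis=1),
                               single_experience)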
Code example #2
  def testLinearAgentFinalTheta(self, set_example_weights):
    num_actions = 1
    context_dim = 1
    batch_size = 10
    use_eigendecomp = False
    dtype = tf.float32
    # The observation consists of a single constant feature.
    initial_step, final_step = _get_initial_and_final_steps(
        batch_size, context_dim, use_constant_observations=True)
    action = [0] * batch_size
    action_step = _get_action_step(action)
    experience = _get_experience(initial_step, action_step, final_step)

    # Construct an agent and perform the update.
    observation_spec = tensor_spec.TensorSpec([context_dim], tf.float32)
    time_step_spec = time_step.time_step_spec(observation_spec)
    action_spec = tensor_spec.BoundedTensorSpec(
        dtype=tf.int32, shape=(), minimum=0, maximum=num_actions - 1)
    variable_collection = linear_agent.LinearBanditVariableCollection(
        context_dim, num_actions, use_eigendecomp=use_eigendecomp, dtype=dtype)

    agent = linear_agent.LinearBanditAgent(
        exploration_policy=linear_agent.ExplorationPolicy.linear_ucb_policy,
        time_step_spec=time_step_spec,
        action_spec=action_spec,
        variable_collection=variable_collection,
        use_eigendecomp=use_eigendecomp,
        tikhonov_weight=0.0,
        dtype=dtype)
    self.evaluate(agent.initialize())

    reward = tf.reshape(experience.reward, [batch_size])

    weights = (
        tf.linspace(start=1.5, stop=10.5, num=batch_size)
        if set_example_weights else None)
    self.evaluate(agent.train(experience, weights=weights))
    final_theta = self.evaluate(agent.theta)
    self.assertAllClose(tf.shape(final_theta), [1, 1])
    # Because the observation consists of a single constant feature and the
    # agent uses zero regularization for training (`tikhonov_weight` set to 0),
    # the final theta is expected to be the average reward when the weights are
    # unset, and the weighted average reward when the weights are set.
    if weights is None:
      self.assertAllClose(final_theta[0, 0], tf.reduce_mean(reward))
    else:
      self.assertAllClose(
          final_theta[0, 0],
          tf.reduce_sum(weights * reward) / tf.reduce_sum(weights))
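As a quick standalone check of the reasoning in the comment above (not part of the original test), the ridgeless least-squares solution with a single constant feature reduces exactly to the weighted mean reward:

import numpy as np

rewards = np.array([1.0, 3.0, 5.0])
weights = np.array([1.0, 2.0, 1.0])
x = np.ones_like(rewards)           # single constant feature
a = np.sum(weights * x * x)         # weighted Gram matrix (a scalar here)
b = np.sum(weights * x * rewards)   # weighted data vector (a scalar here)
theta = b / a                       # no Tikhonov term, i.e. tikhonov_weight=0
assert np.isclose(theta, np.average(rewards, weights=weights))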
Code example #3
  def testInitializeRestoreVariableCollection(self):
    if not tf.executing_eagerly():
      self.skipTest('Test only works in eager mode.')
    context_dim = 7
    num_actions = 5
    variable_collection = linear_agent.LinearBanditVariableCollection(
        context_dim=context_dim, num_models=num_actions)
    self.evaluate(tf.compat.v1.global_variables_initializer())
    self.evaluate(variable_collection.num_samples_list)
    checkpoint = tf.train.Checkpoint(variable_collection=variable_collection)
    checkpoint_dir = self.get_temp_dir()
    checkpoint_prefix = os.path.join(checkpoint_dir, 'checkpoint')
    checkpoint.save(file_prefix=checkpoint_prefix)

    # Mutate one of the saved variables; restoring the checkpoint below should
    # bring back the value that was stored at save time (zero).
    variable_collection.num_samples_list[2].assign(14)

    latest_checkpoint = tf.train.latest_checkpoint(checkpoint_dir)
    checkpoint_load_status = checkpoint.restore(latest_checkpoint)
    self.evaluate(checkpoint_load_status.initialize_or_restore())
    self.assertEqual(self.evaluate(variable_collection.num_samples_list[2]), 0)
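Outside the test harness, the same mechanism can persist the shared statistics between training runs. A minimal sketch, assuming the TF-Agents module path used above and an illustrative checkpoint directory:

import tensorflow as tf
from tf_agents.bandits.agents import linear_bandit_agent as linear_agent

variable_collection = linear_agent.LinearBanditVariableCollection(
    context_dim=7, num_models=5)
checkpoint = tf.train.Checkpoint(variable_collection=variable_collection)
manager = tf.train.CheckpointManager(
    checkpoint, directory='/tmp/linear_bandit_ckpt', max_to_keep=3)
checkpoint.restore(manager.latest_checkpoint)  # no-op on the first run
# ... train one or more agents that share `variable_collection` ...
manager.save()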
Code example #4
    def testLinearAgentUpdateWithBias(self,
                                      batch_size,
                                      context_dim,
                                      exploration_policy,
                                      dtype,
                                      use_eigendecomp=False):
        """Check that the agent updates for specified actions and rewards."""

        # Construct a `Trajectory` for the given action, observation, reward.
        num_actions = 5
        initial_step, final_step = _get_initial_and_final_steps(
            batch_size, context_dim)
        action = np.random.randint(num_actions,
                                   size=batch_size,
                                   dtype=np.int32)
        action_step = _get_action_step(action)
        experience = _get_experience(initial_step, action_step, final_step)

        # Construct an agent and perform the update.
        observation_spec = tensor_spec.TensorSpec([context_dim], tf.float32)
        time_step_spec = time_step.time_step_spec(observation_spec)
        action_spec = tensor_spec.BoundedTensorSpec(dtype=tf.int32,
                                                    shape=(),
                                                    minimum=0,
                                                    maximum=num_actions - 1)
        variable_collection = linear_agent.LinearBanditVariableCollection(
            context_dim + 1, num_actions, use_eigendecomp, dtype)
        agent = linear_agent.LinearBanditAgent(
            exploration_policy=exploration_policy,
            time_step_spec=time_step_spec,
            action_spec=action_spec,
            variable_collection=variable_collection,
            use_eigendecomp=use_eigendecomp,
            add_bias=True,
            dtype=dtype)
        self.evaluate(agent.initialize())
        loss_info = agent.train(experience)
        self.evaluate(loss_info)
        final_a = self.evaluate(agent.cov_matrix)
        final_b = self.evaluate(agent.data_vector)
        final_theta = self.evaluate(agent.theta)

        # Compute the expected updated estimates.
        observations_list = tf.dynamic_partition(
            data=tf.reshape(experience.observation, [batch_size, context_dim]),
            partitions=tf.convert_to_tensor(action),
            num_partitions=num_actions)
        rewards_list = tf.dynamic_partition(
            data=tf.reshape(experience.reward, [batch_size]),
            partitions=tf.convert_to_tensor(action),
            num_partitions=num_actions)
        expected_a_updated_list = []
        expected_b_updated_list = []
        expected_theta_updated_list = []
        # Recompute the expected per-arm statistics, appending a constant bias
        # feature to each observation (the agent was built with add_bias=True).
        for observations_for_arm, rewards_for_arm in zip(
                observations_list, rewards_list):
            observations_for_arm = tf.concat(
                [observations_for_arm,
                 tf.ones_like(observations_for_arm[:, 0:1])],
                axis=1)
            num_samples_for_arm_current = tf.cast(
                tf.shape(rewards_for_arm)[0], tf.float32)
            num_samples_for_arm_total = num_samples_for_arm_current

            # pylint: disable=cell-var-from-loop
            def true_fn():
                a_new = tf.matmul(observations_for_arm,
                                  observations_for_arm,
                                  transpose_a=True)
                b_new = bandit_utils.sum_reward_weighted_observations(
                    rewards_for_arm, observations_for_arm)
                return a_new, b_new

            def false_fn():
                return (tf.zeros([context_dim + 1, context_dim + 1]),
                        tf.zeros([context_dim + 1]))

            a_new, b_new = tf.cond(
                tf.squeeze(num_samples_for_arm_total) > 0, true_fn, false_fn)
            # The added identity mirrors the agent's default Tikhonov
            # regularization when solving for theta.
            theta_new = tf.squeeze(
                tf.linalg.solve(a_new + tf.eye(context_dim + 1),
                                tf.expand_dims(b_new, axis=-1)),
                axis=-1)

            expected_a_updated_list.append(self.evaluate(a_new))
            expected_b_updated_list.append(self.evaluate(b_new))
            expected_theta_updated_list.append(self.evaluate(theta_new))

        # Check that the actual updated estimates match the expectations.
        self.assertAllClose(expected_a_updated_list, final_a)
        self.assertAllClose(expected_b_updated_list, final_b)
        self.assertAllClose(
            self.evaluate(tf.stack(expected_theta_updated_list)),
            final_theta,
            atol=0.1,
            rtol=0.05)
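For reference, the per-arm arithmetic the test recomputes above can be stated compactly in NumPy. This is only a restatement of the expected update under the same assumptions (bias feature appended, identity regularizer), not TF-Agents API:

import numpy as np

def expected_arm_update(contexts, rewards, tikhonov_weight=1.0):
  # Append the constant bias feature, accumulate A = X^T X and b = X^T r for
  # one arm, then solve (A + tikhonov_weight * I) theta = b.
  x = np.concatenate([contexts, np.ones((contexts.shape[0], 1))], axis=1)
  a = x.T @ x
  b = x.T @ rewards
  theta = np.linalg.solve(a + tikhonov_weight * np.eye(x.shape[1]), b)
  return a, b, theta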