def testUCBandThompsonSamplingShareVariables(self):
  if not tf.executing_eagerly():
    self.skipTest('Test only works in eager mode.')
  context_dim = 9
  num_actions = 4
  batch_size = 7
  variable_collection = linear_agent.LinearBanditVariableCollection(
      context_dim=context_dim, num_models=num_actions)
  observation_spec = tensor_spec.TensorSpec([context_dim], tf.float32)
  time_step_spec = time_step.time_step_spec(observation_spec)
  action_spec = tensor_spec.BoundedTensorSpec(
      dtype=tf.int32, shape=(), minimum=0, maximum=num_actions - 1)
  ucb_agent = lin_ucb_agent.LinearUCBAgent(
      time_step_spec=time_step_spec,
      action_spec=action_spec,
      variable_collection=variable_collection)
  ts_agent = linear_thompson_sampling_agent.LinearThompsonSamplingAgent(
      time_step_spec=time_step_spec,
      action_spec=action_spec,
      variable_collection=variable_collection)
  initial_step, final_step = _get_initial_and_final_steps(
      batch_size, context_dim)
  action = np.random.randint(num_actions, size=batch_size, dtype=np.int32)
  action_step = _get_action_step(action)
  experience = _get_experience(initial_step, action_step, final_step)
  self.evaluate(ucb_agent.train(experience))
  self.assertAllEqual(ucb_agent._variable_collection.cov_matrix_list[0],
                      ts_agent._variable_collection.cov_matrix_list[0])
  self.evaluate(ts_agent.train(experience))
  self.assertAllEqual(ucb_agent._variable_collection.data_vector_list[0],
                      ts_agent._variable_collection.data_vector_list[0])
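
# Both agents above share one `LinearBanditVariableCollection`, so a `train`
# call on either agent mutates the same per-arm sufficient statistics, which
# is what the assertions verify. A minimal numpy sketch of that shared update
# (illustrative names and shapes; not the agents' actual implementation):
#
#   import numpy as np
#   context_dim, batch_size = 9, 7
#   cov = np.zeros((context_dim, context_dim))  # one arm's covariance A
#   data_vec = np.zeros(context_dim)            # one arm's data vector b
#   x = np.random.randn(batch_size, context_dim)
#   r = np.random.randn(batch_size)
#   cov += x.T @ x                              # A <- A + X^T X
#   data_vec += x.T @ r                         # b <- b + X^T r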
def testLinearAgentFinalTheta(self, set_example_weights):
  num_actions = 1
  context_dim = 1
  batch_size = 10
  use_eigendecomp = False
  dtype = tf.float32
  # The observation consists of a single constant feature.
  initial_step, final_step = _get_initial_and_final_steps(
      batch_size, context_dim, use_constant_observations=True)
  action = [0] * batch_size
  action_step = _get_action_step(action)
  experience = _get_experience(initial_step, action_step, final_step)
  # Construct an agent and perform the update.
  observation_spec = tensor_spec.TensorSpec([context_dim], tf.float32)
  time_step_spec = time_step.time_step_spec(observation_spec)
  action_spec = tensor_spec.BoundedTensorSpec(
      dtype=tf.int32, shape=(), minimum=0, maximum=num_actions - 1)
  variable_collection = linear_agent.LinearBanditVariableCollection(
      context_dim, num_actions, use_eigendecomp=use_eigendecomp, dtype=dtype)
  agent = linear_agent.LinearBanditAgent(
      exploration_policy=linear_agent.ExplorationPolicy.linear_ucb_policy,
      time_step_spec=time_step_spec,
      action_spec=action_spec,
      variable_collection=variable_collection,
      use_eigendecomp=use_eigendecomp,
      tikhonov_weight=0.0,
      dtype=dtype)
  self.evaluate(agent.initialize())
  reward = tf.reshape(experience.reward, [batch_size])
  weights = tf.linspace(
      start=1.5, stop=10.5, num=batch_size) if set_example_weights else None
  self.evaluate(agent.train(experience, weights=weights))
  final_theta = self.evaluate(agent.theta)
  self.assertAllClose(tf.shape(final_theta), [1, 1])
  # Because the observation consists of a single constant feature and the
  # agent uses zero regularization for training (`tikhonov_weight` set to 0),
  # the final theta is expected to be the average reward when the weights are
  # unset, and the weighted average reward when the weights are set.
  if weights is None:
    self.assertAllClose(final_theta[0, 0], tf.reduce_mean(reward))
  else:
    self.assertAllClose(
        final_theta[0, 0],
        tf.reduce_sum(weights * reward) / tf.reduce_sum(weights))
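
# A standalone sanity check (assumed snippet, not part of the test file) of
# why zero Tikhonov regularization plus a constant feature yields the weighted
# mean: with X an all-ones column, theta = (X^T diag(w) X)^{-1} X^T diag(w) r
# reduces to sum(w * r) / sum(w).
#
#   import numpy as np
#   w = np.linspace(1.5, 10.5, num=10)
#   r = np.random.randn(10)
#   x = np.ones((10, 1))
#   theta = np.linalg.solve(x.T @ (w[:, None] * x), x.T @ (w * r))
#   np.testing.assert_allclose(theta[0], np.sum(w * r) / np.sum(w))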
def testInitializeRestoreVariableCollection(self):
  if not tf.executing_eagerly():
    self.skipTest('Test only works in eager mode.')
  context_dim = 7
  num_actions = 5
  variable_collection = linear_agent.LinearBanditVariableCollection(
      context_dim=context_dim, num_models=num_actions)
  self.evaluate(tf.compat.v1.global_variables_initializer())
  self.evaluate(variable_collection.num_samples_list)
  checkpoint = tf.train.Checkpoint(variable_collection=variable_collection)
  checkpoint_dir = self.get_temp_dir()
  checkpoint_prefix = os.path.join(checkpoint_dir, 'checkpoint')
  checkpoint.save(file_prefix=checkpoint_prefix)
  variable_collection.num_samples_list[2].assign(14)
  latest_checkpoint = tf.train.latest_checkpoint(checkpoint_dir)
  checkpoint_load_status = checkpoint.restore(latest_checkpoint)
  self.evaluate(checkpoint_load_status.initialize_or_restore())
  self.assertEqual(self.evaluate(variable_collection.num_samples_list[2]), 0)
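
# The round trip above follows the standard `tf.train.Checkpoint` pattern; a
# minimal sketch with a bare variable (hypothetical setup and path, shown only
# to isolate the save/mutate/restore sequence):
#
#   v = tf.Variable(0)
#   ckpt = tf.train.Checkpoint(v=v)
#   prefix = ckpt.save('/tmp/ckpt/demo')  # serializes v == 0
#   v.assign(14)                          # mutate after saving
#   ckpt.restore(prefix)                  # v reads as 0 again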
def testLinearAgentUpdateWithBias(self,
                                  batch_size,
                                  context_dim,
                                  exploration_policy,
                                  dtype,
                                  use_eigendecomp=False):
  """Check that the agent updates for specified actions and rewards."""
  # Construct a `Trajectory` for the given action, observation, reward.
  num_actions = 5
  initial_step, final_step = _get_initial_and_final_steps(
      batch_size, context_dim)
  action = np.random.randint(num_actions, size=batch_size, dtype=np.int32)
  action_step = _get_action_step(action)
  experience = _get_experience(initial_step, action_step, final_step)
  # Construct an agent and perform the update.
  observation_spec = tensor_spec.TensorSpec([context_dim], tf.float32)
  time_step_spec = time_step.time_step_spec(observation_spec)
  action_spec = tensor_spec.BoundedTensorSpec(
      dtype=tf.int32, shape=(), minimum=0, maximum=num_actions - 1)
  variable_collection = linear_agent.LinearBanditVariableCollection(
      context_dim + 1, num_actions, use_eigendecomp, dtype)
  agent = linear_agent.LinearBanditAgent(
      exploration_policy=exploration_policy,
      time_step_spec=time_step_spec,
      action_spec=action_spec,
      variable_collection=variable_collection,
      use_eigendecomp=use_eigendecomp,
      add_bias=True,
      dtype=dtype)
  self.evaluate(agent.initialize())
  loss_info = agent.train(experience)
  self.evaluate(loss_info)
  final_a = self.evaluate(agent.cov_matrix)
  final_b = self.evaluate(agent.data_vector)
  final_theta = self.evaluate(agent.theta)

  # Compute the expected updated estimates by partitioning the batch by arm.
  observations_list = tf.dynamic_partition(
      data=tf.reshape(experience.observation, [batch_size, context_dim]),
      partitions=tf.convert_to_tensor(action),
      num_partitions=num_actions)
  rewards_list = tf.dynamic_partition(
      data=tf.reshape(experience.reward, [batch_size]),
      partitions=tf.convert_to_tensor(action),
      num_partitions=num_actions)
  expected_a_updated_list = []
  expected_b_updated_list = []
  expected_theta_updated_list = []
  for observations_for_arm, rewards_for_arm in zip(observations_list,
                                                   rewards_list):
    # Append the constant bias feature to each observation.
    observations_for_arm = tf.concat(
        [observations_for_arm,
         tf.ones_like(observations_for_arm[:, 0:1])],
        axis=1)
    num_samples_for_arm_current = tf.cast(
        tf.shape(rewards_for_arm)[0], tf.float32)
    num_samples_for_arm_total = num_samples_for_arm_current

    # pylint: disable=cell-var-from-loop
    def true_fn():
      a_new = tf.matmul(
          observations_for_arm, observations_for_arm, transpose_a=True)
      b_new = bandit_utils.sum_reward_weighted_observations(
          rewards_for_arm, observations_for_arm)
      return a_new, b_new

    def false_fn():
      return (tf.zeros([context_dim + 1, context_dim + 1]),
              tf.zeros([context_dim + 1]))

    a_new, b_new = tf.cond(
        tf.squeeze(num_samples_for_arm_total) > 0, true_fn, false_fn)
    theta_new = tf.squeeze(
        tf.linalg.solve(a_new + tf.eye(context_dim + 1),
                        tf.expand_dims(b_new, axis=-1)),
        axis=-1)

    expected_a_updated_list.append(self.evaluate(a_new))
    expected_b_updated_list.append(self.evaluate(b_new))
    expected_theta_updated_list.append(self.evaluate(theta_new))

  # Check that the actual updated estimates match the expectations.
  self.assertAllClose(expected_a_updated_list, final_a)
  self.assertAllClose(expected_b_updated_list, final_b)
  self.assertAllClose(
      self.evaluate(tf.stack(expected_theta_updated_list)),
      final_theta,
      atol=0.1,
      rtol=0.05)
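
# The expected per-arm estimate above is ridge regression with a unit
# Tikhonov term: theta = (A + I)^{-1} b with A = X^T X and b = X^T r, where X
# carries a trailing all-ones bias column. A numpy sketch under those
# assumptions (illustrative names only, not the agent's implementation):
#
#   import numpy as np
#   batch_size, context_dim = 8, 3
#   x = np.random.randn(batch_size, context_dim)
#   x = np.concatenate([x, np.ones((batch_size, 1))], axis=1)  # bias column
#   r = np.random.randn(batch_size)
#   a = x.T @ x
#   b = x.T @ r
#   theta = np.linalg.solve(a + np.eye(context_dim + 1), b)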