def testInitializeAgent(self,
                        batch_size,
                        context_dim,
                        exploration_policy,
                        dtype,
                        use_eigendecomp=False,
                        set_example_weights=False):
  del batch_size, use_eigendecomp, set_example_weights  # Unused in this test.
  num_actions = 5
  observation_spec = tensor_spec.TensorSpec([context_dim], tf.float32)
  time_step_spec = time_step.time_step_spec(observation_spec)
  action_spec = tensor_spec.BoundedTensorSpec(
      dtype=tf.int32, shape=(), minimum=0, maximum=num_actions - 1)
  agent = linear_agent.LinearBanditAgent(
      exploration_policy=exploration_policy,
      time_step_spec=time_step_spec,
      action_spec=action_spec,
      dtype=dtype)
  self.evaluate(agent.initialize())

def testDistributedLinearAgentUpdate(self,
                                     batch_size,
                                     context_dim,
                                     exploration_policy,
                                     dtype,
                                     use_eigendecomp=False):
  """Same as above, but uses the distributed train function of the agent."""
  # Construct a `Trajectory` for the given action, observation, reward.
  num_actions = 5
  initial_step, final_step = _get_initial_and_final_steps(
      batch_size, context_dim)
  action = np.random.randint(num_actions, size=batch_size, dtype=np.int32)
  action_step = _get_action_step(action)
  experience = _get_experience(initial_step, action_step, final_step)

  # Construct an agent and perform the update.
  observation_spec = tensor_spec.TensorSpec([context_dim], tf.float32)
  time_step_spec = time_step.time_step_spec(observation_spec)
  action_spec = tensor_spec.BoundedTensorSpec(
      dtype=tf.int32, shape=(), minimum=0, maximum=num_actions - 1)
  agent = linear_agent.LinearBanditAgent(
      exploration_policy=exploration_policy,
      time_step_spec=time_step_spec,
      action_spec=action_spec,
      dtype=dtype)
  self.evaluate(agent.initialize())
  train_fn = common.function_in_tf1()(agent._distributed_train_step)
  loss_info = train_fn(experience=experience)
  self.evaluate(loss_info)
  final_a = self.evaluate(agent.cov_matrix)
  final_b = self.evaluate(agent.data_vector)

  # Compute the expected updated estimates.
  observations_list = tf.dynamic_partition(
      data=tf.reshape(experience.observation, [batch_size, context_dim]),
      partitions=tf.convert_to_tensor(action),
      num_partitions=num_actions)
  rewards_list = tf.dynamic_partition(
      data=tf.reshape(experience.reward, [batch_size]),
      partitions=tf.convert_to_tensor(action),
      num_partitions=num_actions)
  expected_a_updated_list = []
  expected_b_updated_list = []
  expected_theta_updated_list = []
  for observations_for_arm, rewards_for_arm in zip(observations_list,
                                                   rewards_list):
    num_samples_for_arm_current = tf.cast(
        tf.shape(rewards_for_arm)[0], tf.float32)
    num_samples_for_arm_total = num_samples_for_arm_current

    # pylint: disable=cell-var-from-loop
    def true_fn():
      a_new = tf.matmul(
          observations_for_arm, observations_for_arm, transpose_a=True)
      b_new = bandit_utils.sum_reward_weighted_observations(
          rewards_for_arm, observations_for_arm)
      return a_new, b_new

    def false_fn():
      return tf.zeros([context_dim, context_dim]), tf.zeros([context_dim])

    a_new, b_new = tf.cond(
        tf.squeeze(num_samples_for_arm_total) > 0, true_fn, false_fn)
    theta_new = tf.squeeze(
        tf.linalg.solve(a_new + tf.eye(context_dim),
                        tf.expand_dims(b_new, axis=-1)),
        axis=-1)

    expected_a_updated_list.append(self.evaluate(a_new))
    expected_b_updated_list.append(self.evaluate(b_new))
    expected_theta_updated_list.append(self.evaluate(theta_new))

  # Check that the actual updated estimates match the expectations.
  self.assertAllClose(expected_a_updated_list, final_a)
  self.assertAllClose(expected_b_updated_list, final_b)
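
# For reference, the per-arm statistics the checks above reconstruct: given
# the (n, d) matrix X of observations routed to an arm and the length-n
# reward vector r, the agent is expected to accumulate A = X^T X and
# b = X^T r, with the parameter estimate solving the ridge system
# (A + I) theta = b. A minimal NumPy sketch of the same computation; the
# helper name is illustrative and not part of the agent's API:
def _expected_linear_update(observations_for_arm, rewards_for_arm):
  """Returns (A, b, theta) for one arm, mirroring the asserts above."""
  x = np.asarray(observations_for_arm, np.float64)  # Shape (n, d).
  r = np.asarray(rewards_for_arm, np.float64)  # Shape (n,).
  a = x.T @ x  # Gram matrix of the arm's observations.
  b = x.T @ r  # Reward-weighted sum of observations.
  theta = np.linalg.solve(a + np.eye(x.shape[1]), b)  # Ridge solution.
  return a, b, theta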

def testLinearAgentUpdateWithForgetting(self,
                                        batch_size,
                                        context_dim,
                                        exploration_policy,
                                        dtype,
                                        use_eigendecomp=False):
  """Check that the agent updates for specified actions and rewards."""
  # We should rewrite this test as it currently does not depend on the value
  # of `gamma`. To properly test the forgetting factor, we need to call
  # `train` twice.
  gamma = 0.9

  # Construct a `Trajectory` for the given action, observation, reward.
  num_actions = 5
  initial_step, final_step = _get_initial_and_final_steps(
      batch_size, context_dim)
  action = np.random.randint(num_actions, size=batch_size, dtype=np.int32)
  action_step = _get_action_step(action)
  experience = _get_experience(initial_step, action_step, final_step)

  # Construct an agent and perform the update.
  observation_spec = tensor_spec.TensorSpec([context_dim], tf.float32)
  time_step_spec = time_step.time_step_spec(observation_spec)
  action_spec = tensor_spec.BoundedTensorSpec(
      dtype=tf.int32, shape=(), minimum=0, maximum=num_actions - 1)
  agent = linear_agent.LinearBanditAgent(
      exploration_policy=exploration_policy,
      time_step_spec=time_step_spec,
      action_spec=action_spec,
      gamma=gamma,
      dtype=dtype,
      use_eigendecomp=use_eigendecomp)
  self.evaluate(tf.compat.v1.global_variables_initializer())
  loss_info = agent.train(experience)
  self.evaluate(loss_info)
  final_a = self.evaluate(agent.cov_matrix)
  final_b = self.evaluate(agent.data_vector)
  final_eig_vals = self.evaluate(agent.eig_vals)

  # Compute the expected updated estimates.
  observations_list = tf.dynamic_partition(
      data=tf.reshape(experience.observation, [batch_size, context_dim]),
      partitions=tf.convert_to_tensor(action),
      num_partitions=num_actions)
  rewards_list = tf.dynamic_partition(
      data=tf.reshape(experience.reward, [batch_size]),
      partitions=tf.convert_to_tensor(action),
      num_partitions=num_actions)
  expected_a_updated_list = []
  expected_b_updated_list = []
  expected_eigvals_updated_list = []
  for observations_for_arm, rewards_for_arm in zip(observations_list,
                                                   rewards_list):
    num_samples_for_arm_current = tf.cast(
        tf.shape(rewards_for_arm)[0], tf.float32)
    num_samples_for_arm_total = num_samples_for_arm_current

    # pylint: disable=cell-var-from-loop
    def true_fn():
      a_new = tf.matmul(
          observations_for_arm, observations_for_arm, transpose_a=True)
      b_new = bandit_utils.sum_reward_weighted_observations(
          rewards_for_arm, observations_for_arm)
      eigmatrix_new = tf.constant([], dtype=dtype)
      eigvals_new = tf.constant([], dtype=dtype)
      if use_eigendecomp:
        eigvals_new, eigmatrix_new = tf.linalg.eigh(a_new)
      return a_new, b_new, eigvals_new, eigmatrix_new

    def false_fn():
      if use_eigendecomp:
        return (tf.zeros([context_dim, context_dim]), tf.zeros([context_dim]),
                tf.ones([context_dim]), tf.eye(context_dim))
      else:
        return (tf.zeros([context_dim, context_dim]), tf.zeros([context_dim]),
                tf.constant([], dtype=dtype), tf.constant([], dtype=dtype))

    a_new, b_new, eig_vals_new, _ = tf.cond(
        tf.squeeze(num_samples_for_arm_total) > 0, true_fn, false_fn)

    expected_a_updated_list.append(self.evaluate(a_new))
    expected_b_updated_list.append(self.evaluate(b_new))
    expected_eigvals_updated_list.append(self.evaluate(eig_vals_new))

  # Check that the actual updated estimates match the expectations.
  self.assertAllClose(expected_a_updated_list, final_a)
  self.assertAllClose(expected_b_updated_list, final_b)
  self.assertAllClose(
      expected_eigvals_updated_list, final_eig_vals, atol=1e-4, rtol=1e-4)
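
# The comment above notes that a single `train` call cannot exercise `gamma`.
# If, as assumed here, the agent discounts its statistics once per train step
# (A <- gamma * A + X^T X, and likewise for b), a two-call test could compare
# against the values produced by this illustrative NumPy sketch (a
# hypothetical helper, not yet used by any test):
def _expected_stats_after_two_steps(x1, r1, x2, r2, gamma):
  """Returns (A, b) after two gamma-discounted updates for one arm."""
  x1, x2 = np.asarray(x1, np.float64), np.asarray(x2, np.float64)
  r1, r2 = np.asarray(r1, np.float64), np.asarray(r2, np.float64)
  a = np.zeros([x1.shape[1], x1.shape[1]])
  b = np.zeros(x1.shape[1])
  for x, r in ((x1, r1), (x2, r2)):
    a = gamma * a + x.T @ x  # Decay old evidence, then add the new batch.
    b = gamma * b + x.T @ r
  return a, b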

def testLinearAgentUpdateWithMaskedActions(self,
                                           batch_size,
                                           context_dim,
                                           exploration_policy,
                                           dtype,
                                           use_eigendecomp=False):
  """Check that the agent updates for specified actions and rewards."""
  # Construct a `Trajectory` for the given action, observation, reward.
  num_actions = 5
  initial_step, final_step = _get_initial_and_final_steps_with_action_mask(
      batch_size, context_dim, num_actions=num_actions)
  action = np.random.randint(num_actions, size=batch_size, dtype=np.int32)
  action_step = _get_action_step(action)
  experience = _get_experience(initial_step, action_step, final_step)

  # Construct an agent and perform the update.
  observation_spec = (tensor_spec.TensorSpec([context_dim], tf.float32),
                      tensor_spec.TensorSpec([num_actions], tf.int32))
  time_step_spec = time_step.time_step_spec(observation_spec)
  action_spec = tensor_spec.BoundedTensorSpec(
      dtype=tf.int32, shape=(), minimum=0, maximum=num_actions - 1)

  def observation_and_action_constraint_splitter(obs):
    return obs[0], obs[1]

  agent = linear_agent.LinearBanditAgent(
      exploration_policy=exploration_policy,
      time_step_spec=time_step_spec,
      action_spec=action_spec,
      observation_and_action_constraint_splitter=(
          observation_and_action_constraint_splitter),
      dtype=dtype)
  self.evaluate(agent.initialize())
  loss_info = agent.train(experience)
  self.evaluate(loss_info)
  final_a = self.evaluate(agent.cov_matrix)
  final_b = self.evaluate(agent.data_vector)

  # Compute the expected updated estimates.
  observations_list = tf.dynamic_partition(
      data=tf.reshape(
          observation_and_action_constraint_splitter(
              experience.observation)[0], [batch_size, -1]),
      partitions=tf.convert_to_tensor(action),
      num_partitions=num_actions)
  rewards_list = tf.dynamic_partition(
      data=tf.reshape(experience.reward, [batch_size]),
      partitions=tf.convert_to_tensor(action),
      num_partitions=num_actions)
  expected_a_updated_list = []
  expected_b_updated_list = []
  for observations_for_arm, rewards_for_arm in zip(observations_list,
                                                   rewards_list):
    num_samples_for_arm_current = tf.cast(
        tf.shape(rewards_for_arm)[0], tf.float32)
    num_samples_for_arm_total = num_samples_for_arm_current

    # pylint: disable=cell-var-from-loop
    def true_fn():
      a_new = tf.matmul(
          observations_for_arm, observations_for_arm, transpose_a=True)
      b_new = bandit_utils.sum_reward_weighted_observations(
          rewards_for_arm, observations_for_arm)
      return a_new, b_new

    def false_fn():
      return tf.zeros([context_dim, context_dim]), tf.zeros([context_dim])

    a_new, b_new = tf.cond(
        tf.squeeze(num_samples_for_arm_total) > 0, true_fn, false_fn)

    expected_a_updated_list.append(self.evaluate(a_new))
    expected_b_updated_list.append(self.evaluate(b_new))

  # Check that the actual updated estimates match the expectations.
  self.assertAllClose(expected_a_updated_list, final_a)
  self.assertAllClose(expected_b_updated_list, final_b)
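
# The splitter contract used above: the packed observation is a (context,
# mask) tuple, and the splitter returns the model input and the per-action
# 0/1 mask separately. An illustrative example of the layout with
# num_actions = 5 and context_dim = 4 (the values below are made up):
def _example_masked_observation():
  """Builds a (context, mask) observation like the one the test trains on."""
  context = np.zeros([2, 4], dtype=np.float32)  # Batch of 2, context_dim 4.
  mask = np.array([[1, 0, 1, 0, 1],
                   [0, 1, 0, 1, 0]], dtype=np.int32)  # 1 = action allowed.
  obs = (context, mask)
  # Same split performed by observation_and_action_constraint_splitter above.
  return obs[0], obs[1]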

def testLinearAgentUpdateWithBias(self,
                                  batch_size,
                                  context_dim,
                                  exploration_policy,
                                  dtype,
                                  use_eigendecomp=False):
  """Check that the agent updates for specified actions and rewards."""
  # Construct a `Trajectory` for the given action, observation, reward.
  num_actions = 5
  initial_step, final_step = _get_initial_and_final_steps(
      batch_size, context_dim)
  action = np.random.randint(num_actions, size=batch_size, dtype=np.int32)
  action_step = _get_action_step(action)
  experience = _get_experience(initial_step, action_step, final_step)

  # Construct an agent and perform the update.
  observation_spec = tensor_spec.TensorSpec([context_dim], tf.float32)
  time_step_spec = time_step.time_step_spec(observation_spec)
  action_spec = tensor_spec.BoundedTensorSpec(
      dtype=tf.int32, shape=(), minimum=0, maximum=num_actions - 1)
  variable_collection = linear_agent.LinearBanditVariableCollection(
      context_dim + 1, num_actions, use_eigendecomp, dtype)
  agent = linear_agent.LinearBanditAgent(
      exploration_policy=exploration_policy,
      time_step_spec=time_step_spec,
      action_spec=action_spec,
      variable_collection=variable_collection,
      use_eigendecomp=use_eigendecomp,
      add_bias=True,
      dtype=dtype)
  self.evaluate(agent.initialize())
  loss_info = agent.train(experience)
  self.evaluate(loss_info)
  final_a = self.evaluate(agent.cov_matrix)
  final_b = self.evaluate(agent.data_vector)
  final_theta = self.evaluate(agent.theta)

  # Compute the expected updated estimates.
  observations_list = tf.dynamic_partition(
      data=tf.reshape(experience.observation, [batch_size, context_dim]),
      partitions=tf.convert_to_tensor(action),
      num_partitions=num_actions)
  rewards_list = tf.dynamic_partition(
      data=tf.reshape(experience.reward, [batch_size]),
      partitions=tf.convert_to_tensor(action),
      num_partitions=num_actions)
  expected_a_updated_list = []
  expected_b_updated_list = []
  expected_theta_updated_list = []
  for observations_for_arm, rewards_for_arm in zip(observations_list,
                                                   rewards_list):
    observations_for_arm = tf.concat(
        [observations_for_arm,
         tf.ones_like(observations_for_arm[:, 0:1])],
        axis=1)
    num_samples_for_arm_current = tf.cast(
        tf.shape(rewards_for_arm)[0], tf.float32)
    num_samples_for_arm_total = num_samples_for_arm_current

    # pylint: disable=cell-var-from-loop
    def true_fn():
      a_new = tf.matmul(
          observations_for_arm, observations_for_arm, transpose_a=True)
      b_new = bandit_utils.sum_reward_weighted_observations(
          rewards_for_arm, observations_for_arm)
      return a_new, b_new

    def false_fn():
      return (tf.zeros([context_dim + 1, context_dim + 1]),
              tf.zeros([context_dim + 1]))

    a_new, b_new = tf.cond(
        tf.squeeze(num_samples_for_arm_total) > 0, true_fn, false_fn)
    theta_new = tf.squeeze(
        tf.linalg.solve(a_new + tf.eye(context_dim + 1),
                        tf.expand_dims(b_new, axis=-1)),
        axis=-1)

    expected_a_updated_list.append(self.evaluate(a_new))
    expected_b_updated_list.append(self.evaluate(b_new))
    expected_theta_updated_list.append(self.evaluate(theta_new))

  # Check that the actual updated estimates match the expectations.
  self.assertAllClose(expected_a_updated_list, final_a)
  self.assertAllClose(expected_b_updated_list, final_b)
  self.assertAllClose(
      self.evaluate(tf.stack(expected_theta_updated_list)),
      final_theta,
      atol=0.1,
      rtol=0.05)
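
# With `add_bias=True` the agent is expected to learn an intercept by
# appending a constant 1 feature to every context, which is why the expected
# statistics above are built from [x, 1] and have dimension context_dim + 1.
# A minimal NumPy sketch of the augmentation (the helper name is
# illustrative):
def _append_bias_column(observations):
  """Appends a constant 1 feature to a batch of contexts."""
  x = np.asarray(observations, np.float64)  # Shape (n, d).
  ones = np.ones([x.shape[0], 1])  # The bias feature.
  return np.concatenate([x, ones], axis=1)  # Shape (n, d + 1).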

def testLinearAgentUpdatePerArmFeatures(self,
                                        batch_size,
                                        context_dim,
                                        exploration_policy,
                                        dtype,
                                        use_eigendecomp=False):
  """Check that the agent updates for specified actions and rewards."""
  # Construct a `Trajectory` for the given action, observation, reward.
  num_actions = 5
  global_context_dim = context_dim
  arm_context_dim = 3
  initial_step, final_step = (
      _get_initial_and_final_steps_with_per_arm_features(
          batch_size, global_context_dim, num_actions, arm_context_dim))
  action = np.random.randint(num_actions, size=batch_size, dtype=np.int32)
  action_step = policy_step.PolicyStep(
      action=tf.convert_to_tensor(action),
      info=policy_utilities.PerArmPolicyInfo(
          chosen_arm_features=np.arange(
              batch_size * arm_context_dim, dtype=np.float32).reshape(
                  [batch_size, arm_context_dim])))
  experience = _get_experience(initial_step, action_step, final_step)

  # Construct an agent and perform the update.
  observation_spec = bandit_spec_utils.create_per_arm_observation_spec(
      context_dim, arm_context_dim, num_actions)
  time_step_spec = time_step.time_step_spec(observation_spec)
  action_spec = tensor_spec.BoundedTensorSpec(
      dtype=tf.int32, shape=(), minimum=0, maximum=num_actions - 1)
  agent = linear_agent.LinearBanditAgent(
      exploration_policy=exploration_policy,
      time_step_spec=time_step_spec,
      action_spec=action_spec,
      use_eigendecomp=use_eigendecomp,
      accepts_per_arm_features=True,
      dtype=dtype)
  self.evaluate(agent.initialize())
  loss_info = agent.train(experience)
  self.evaluate(loss_info)
  final_a = self.evaluate(agent.cov_matrix)
  final_b = self.evaluate(agent.data_vector)

  # Compute the expected updated estimates.
  global_observation = experience.observation[
      bandit_spec_utils.GLOBAL_FEATURE_KEY]
  arm_observation = experience.policy_info.chosen_arm_features
  overall_observation = tf.squeeze(
      tf.concat([global_observation, arm_observation], axis=-1), axis=1)
  rewards = tf.squeeze(experience.reward, axis=1)

  expected_a_new = tf.matmul(
      overall_observation, overall_observation, transpose_a=True)
  expected_b_new = bandit_utils.sum_reward_weighted_observations(
      rewards, overall_observation)
  self.assertAllClose(expected_a_new, final_a[0])
  self.assertAllClose(expected_b_new, final_b[0])
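
# In the per-arm-features setting a single reward model is shared across all
# actions: each training example concatenates the global context with the
# features of the arm that was actually chosen, so only one (A, b) pair is
# accumulated (hence the final_a[0] / final_b[0] lookups above). A minimal
# NumPy sketch of the expected update (the helper name is illustrative):
def _expected_per_arm_update(global_obs, chosen_arm_features, rewards):
  """Returns (A, b) for the shared per-arm reward model."""
  x = np.concatenate([np.asarray(global_obs, np.float64),
                      np.asarray(chosen_arm_features, np.float64)],
                     axis=-1)  # Shape (n, global_dim + arm_dim).
  r = np.asarray(rewards, np.float64)  # Shape (n,).
  return x.T @ x, x.T @ r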