def update_a_and_b_with_forgetting(
    a_prev, b_prev, r, x, gamma, compute_eigendecomp=False):
  r"""Updates the covariance matrix `a` and the weighted sum of rewards `b`.

  This function updates the covariance matrix `a` and the sum of weighted
  rewards `b` using a forgetting factor `gamma`.

  Args:
    a_prev: previous estimate of `a`.
    b_prev: previous estimate of `b`.
    r: a `Tensor` of shape [`batch_size`]. This is the rewards of the batched
      observations.
    x: a `Tensor` of shape [`batch_size`, `context_dim`]. This is the matrix
      with the (batched) observations.
    gamma: a float forgetting factor in [0.0, 1.0].
    compute_eigendecomp: whether to compute the eigen-decomposition of the new
      covariance matrix.

  Returns:
    The updated estimates of `a` and `b`, and optionally the eigenvalues and
    eigenvectors of `a`.
  """
  a_new = gamma * a_prev + tf.matmul(x, x, transpose_a=True)
  b_new = gamma * b_prev + bandit_utils.sum_reward_weighted_observations(r, x)
  eig_vals = tf.constant([], dtype=a_new.dtype)
  eig_matrix = tf.constant([], dtype=a_new.dtype)
  if compute_eigendecomp:
    eig_vals, eig_matrix = tf.linalg.eigh(a_new)
  return a_new, b_new, eig_vals, eig_matrix
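# A minimal usage sketch for `update_a_and_b_with_forgetting` (hypothetical
# shapes and values; assumes `tf` and `bandit_utils` are imported as in this
# module). It illustrates the update rule
#     a_new = gamma * a_prev + x^T x,
#     b_new = gamma * b_prev + sum_t r_t * x_t,
# which with gamma = 1.0 reduces to plain accumulation.
def _example_forgetting_update():
  context_dim = 3
  a_prev = tf.eye(context_dim)
  b_prev = tf.zeros([context_dim])
  x = tf.constant([[1., 0., 2.],
                   [0., 1., 1.]])  # [batch_size=2, context_dim=3]
  r = tf.constant([0.5, 1.0])      # [batch_size=2]
  a_new, b_new, _, _ = update_a_and_b_with_forgetting(
      a_prev, b_prev, r, x, gamma=0.9)
  # a_new == 0.9 * I + x^T x
  # b_new == 0.9 * 0 + 0.5 * [1, 0, 2] + 1.0 * [0, 1, 1] == [0.5, 1.0, 2.0]
  return a_new, b_new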
def true_fn():
  a_new = tf.matmul(
      observations_for_arm, observations_for_arm, transpose_a=True)
  b_new = bandit_utils.sum_reward_weighted_observations(
      rewards_for_arm, observations_for_arm)
  return a_new, b_new
def _train(self, experience, weights=None):
  """Updates the policy based on the data in `experience`.

  Note that `experience` should only contain data points that this agent has
  not previously seen. If `experience` comes from a replay buffer, this buffer
  should be cleared between each call to `train`.

  Args:
    experience: A batch of experience data in the form of a `Trajectory`.
    weights: Unused.

  Returns:
    A `LossInfo` containing the loss *before* the training step is taken.
  """
  del weights  # unused

  # If the experience comes from a replay buffer, the reward has shape:
  #     [batch_size, time_steps]
  # where `time_steps` is the number of driver steps executed in each
  # training loop. We flatten the tensors below in order to reflect the
  # effective batch size.
  reward, _ = nest_utils.flatten_multi_batched_nested_tensors(
      experience.reward, self._time_step_spec.reward)
  action, _ = nest_utils.flatten_multi_batched_nested_tensors(
      experience.action, self._action_spec)
  observation, _ = nest_utils.flatten_multi_batched_nested_tensors(
      experience.observation, self._time_step_spec.observation)
  if self._observation_and_action_constraint_splitter is not None:
    observation, _ = self._observation_and_action_constraint_splitter(
        observation)
  observation = tf.cast(observation, self._dtype)
  reward = tf.cast(reward, self._dtype)

  for k in range(self._num_actions):
    diag_mask = tf.linalg.tensor_diag(
        tf.cast(tf.equal(action, k), self._dtype))
    observations_for_arm = tf.matmul(diag_mask, observation)
    rewards_for_arm = tf.matmul(diag_mask, tf.reshape(reward, [-1, 1]))
    tf.compat.v1.assign(
        self._weight_covariances[k],
        self._gamma * self._weight_covariances[k] + tf.matmul(
            observations_for_arm, observations_for_arm, transpose_a=True))
    tf.compat.v1.assign(
        self._parameter_estimators[k],
        self._gamma * self._parameter_estimators[k] +
        bandit_utils.sum_reward_weighted_observations(
            rewards_for_arm, observations_for_arm))

  batch_size = tf.cast(
      tf.compat.dimension_value(tf.shape(reward)[0]), dtype=tf.int64)
  self._train_step_counter.assign_add(batch_size)

  loss_info = tf_agent.LossInfo(
      loss=(-1. * tf.reduce_sum(experience.reward)), extra=())
  return loss_info
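# A small self-contained illustration (hypothetical values) of the per-arm
# masking used in `_train` above: multiplying `observation` by a {0, 1}
# diagonal matrix zeroes out every row whose action is not `k`, so the
# subsequent x^T x and reward-weighted sums accumulate data from arm `k` only.
def _example_diag_mask():
  action = tf.constant([0, 2, 0])  # actions taken by a batch of 3
  observation = tf.constant([[1., 2.],
                             [3., 4.],
                             [5., 6.]])
  k = 0
  diag_mask = tf.linalg.tensor_diag(
      tf.cast(tf.equal(action, k), tf.float32))
  # Rows with action != 0 become all-zero:
  # [[1., 2.], [0., 0.], [5., 6.]]
  return tf.matmul(diag_mask, observation)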
def true_fn():
  a_new = tf.eye(encoding_dim, dtype=tf.float64) + tf.matmul(
      encoded_observations_for_arm,
      encoded_observations_for_arm,
      transpose_a=True)
  b_new = bandit_utils.sum_reward_weighted_observations(
      rewards_for_arm, encoded_observations_for_arm)
  return a_new, b_new
def testLinearAgentUpdatePerArmFeatures(
    self, batch_size, context_dim, exploration_policy, dtype,
    use_eigendecomp=False):
  """Check that the agent updates for specified actions and rewards."""
  # Construct a `Trajectory` for the given action, observation, reward.
  num_actions = 5
  global_context_dim = context_dim
  arm_context_dim = 3
  initial_step, final_step = (
      _get_initial_and_final_steps_with_per_arm_features(
          batch_size, global_context_dim, num_actions, arm_context_dim))
  action = np.random.randint(num_actions, size=batch_size, dtype=np.int32)
  action_step = policy_step.PolicyStep(
      action=tf.convert_to_tensor(action),
      info=policy_utilities.PerArmPolicyInfo(
          chosen_arm_features=np.arange(
              batch_size * arm_context_dim, dtype=np.float32).reshape(
                  [batch_size, arm_context_dim])))
  experience = _get_experience(initial_step, action_step, final_step)

  # Construct an agent and perform the update.
  observation_spec = bandit_spec_utils.create_per_arm_observation_spec(
      context_dim, arm_context_dim, num_actions)
  time_step_spec = time_step.time_step_spec(observation_spec)
  action_spec = tensor_spec.BoundedTensorSpec(
      dtype=tf.int32, shape=(), minimum=0, maximum=num_actions - 1)
  agent = linear_agent.LinearBanditAgent(
      exploration_policy=exploration_policy,
      time_step_spec=time_step_spec,
      action_spec=action_spec,
      use_eigendecomp=use_eigendecomp,
      accepts_per_arm_features=True,
      dtype=dtype)
  self.evaluate(agent.initialize())
  loss_info = agent.train(experience)
  self.evaluate(loss_info)
  final_a = self.evaluate(agent.cov_matrix)
  final_b = self.evaluate(agent.data_vector)

  # Compute the expected updated estimates.
  global_observation = experience.observation[
      bandit_spec_utils.GLOBAL_FEATURE_KEY]
  arm_observation = experience.policy_info.chosen_arm_features
  overall_observation = tf.squeeze(
      tf.concat([global_observation, arm_observation], axis=-1), axis=1)
  rewards = tf.squeeze(experience.reward, axis=1)
  expected_a_new = tf.matmul(
      overall_observation, overall_observation, transpose_a=True)
  expected_b_new = bandit_utils.sum_reward_weighted_observations(
      rewards, overall_observation)
  self.assertAllClose(expected_a_new, final_a[0])
  self.assertAllClose(expected_b_new, final_b[0])
def true_fn():
  a_new = gamma * tf.eye(context_dim) + tf.matmul(
      observations_for_arm, observations_for_arm, transpose_a=True)
  b_new = bandit_utils.sum_reward_weighted_observations(
      rewards_for_arm, observations_for_arm)
  eigmatrix_new = tf.constant([], dtype=dtype)
  eigvals_new = tf.constant([], dtype=dtype)
  if use_eigendecomp:
    eigvals_new, eigmatrix_new = tf.linalg.eigh(a_new)
  return a_new, b_new, eigvals_new, eigmatrix_new
def _distributed_train_step(self, experience, weights=None):
  """Distributed train fn to be passed as input to run()."""
  del weights  # unused
  reward, action, observation, batch_size = self._process_experience(
      experience)
  self._train_step_counter.assign_add(batch_size)

  for k in range(self._num_models):
    diag_mask = tf.linalg.tensor_diag(
        tf.cast(tf.equal(action, k), self._dtype))
    observations_for_arm = tf.matmul(diag_mask, observation)
    rewards_for_arm = tf.matmul(diag_mask, tf.reshape(reward, [-1, 1]))

    # Compute local updates for the matrix A and vector b of this arm.
    cov_matrix_local_update = tf.matmul(
        observations_for_arm, observations_for_arm, transpose_a=True)
    data_vector_local_update = bandit_utils.sum_reward_weighted_observations(
        rewards_for_arm, observations_for_arm)

    def _merge_fn(strategy, per_replica_cov_matrix_update,
                  per_replica_data_vector_update):
      """Merges the per-replica updates."""
      # Reduce the per-replica updates using SUM.
      # pylint: disable=cell-var-from-loop
      updates_and_vars = [
          (per_replica_cov_matrix_update, self._cov_matrix_list[k]),
          (per_replica_data_vector_update, self._data_vector_list[k])
      ]
      reduced_updates = strategy.extended.batch_reduce_to(
          tf.distribute.ReduceOp.SUM, updates_and_vars)

      # Update the model variables.
      self._cov_matrix_list[k].assign_add(reduced_updates[0])
      self._data_vector_list[k].assign_add(reduced_updates[1])

      # Compute the eigendecomposition, if needed.
      if self._use_eigendecomp:
        eig_vals, eig_matrix = tf.linalg.eigh(self._cov_matrix_list[k])
        self._eig_vals_list[k].assign(eig_vals)
        self._eig_matrix_list[k].assign(eig_matrix)

    # Passes the local updates to `_merge_fn()` above, which performs custom
    # computation on the per-replica values. All replicas pause their
    # execution until `merge_call()` is done, and then execution is resumed.
    replica_context = tf.distribute.get_replica_context()
    replica_context.merge_call(
        _merge_fn, args=(cov_matrix_local_update, data_vector_local_update))

  loss = -1. * tf.reduce_sum(reward)
  return tf_agent.LossInfo(loss=(loss), extra=())
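# A minimal, self-contained sketch of the `merge_call` pattern used in
# `_distributed_train_step` above. The strategy, variable, and shapes here
# are assumptions for illustration, not the agent's actual setup. Each
# replica computes a local update; `_merge` then runs once in cross-replica
# context, sums the per-replica values with `batch_reduce_to`, and applies
# the result to the shared variable.
def _example_merge_call():
  strategy = tf.distribute.MirroredStrategy()
  with strategy.scope():
    total = tf.Variable(tf.zeros([2, 2]))

  def step():
    local_update = tf.eye(2)  # stand-in for a per-replica A update

    def _merge(strategy, per_replica_update):
      reduced = strategy.extended.batch_reduce_to(
          tf.distribute.ReduceOp.SUM, [(per_replica_update, total)])
      total.assign_add(reduced[0])

    tf.distribute.get_replica_context().merge_call(
        _merge, args=(local_update,))

  strategy.run(step)
  return total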
def testBUpdate(self, batch_size, context_dim):
  b_array = np.array(range(context_dim))
  r_array = np.array(range(batch_size)).reshape((batch_size, 1))
  x_array = np.array(range(batch_size * context_dim)).reshape(
      (batch_size, context_dim))
  rx = r_array * x_array
  expected_b_updated_array = b_array + np.sum(rx, axis=0)

  b = tf.constant(b_array, dtype=tf.float32, shape=[context_dim])
  r = tf.constant(r_array, dtype=tf.float32, shape=[batch_size])
  x = tf.constant(x_array, dtype=tf.float32, shape=[batch_size, context_dim])
  b_update = utils.sum_reward_weighted_observations(r, x)
  self.assertAllClose(expected_b_updated_array, self.evaluate(b + b_update))
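# The test above pins down the semantics of
# `utils.sum_reward_weighted_observations`: given rewards `r` of shape
# [batch_size] and observations `x` of shape [batch_size, context_dim], it
# returns sum_t r_t * x_t as a [context_dim] vector. A minimal reference
# sketch (not the library implementation):
def _sum_reward_weighted_observations_sketch(r, x):
  return tf.reduce_sum(tf.reshape(r, [-1, 1]) * x, axis=0)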
def _distributed_train_step(self, experience, weights=None):
  """Distributed train fn to be passed as input to run()."""
  del weights  # unused
  reward, _ = nest_utils.flatten_multi_batched_nested_tensors(
      experience.reward, self._time_step_spec.reward)
  action, _ = nest_utils.flatten_multi_batched_nested_tensors(
      experience.action, self._action_spec)
  observation, _ = nest_utils.flatten_multi_batched_nested_tensors(
      experience.observation, self._time_step_spec.observation)
  if self._observation_and_action_constraint_splitter is not None:
    observation, _ = self._observation_and_action_constraint_splitter(
        observation)
  observation = tf.reshape(observation, [-1, self._context_dim])
  observation = tf.cast(observation, self._dtype)
  reward = tf.cast(reward, self._dtype)

  # Increase the step counter.
  batch_size = tf.cast(
      tf.compat.dimension_value(tf.shape(reward)[0]), dtype=tf.int64)
  self._train_step_counter.assign_add(batch_size)

  for k in range(self._num_actions):
    diag_mask = tf.linalg.tensor_diag(
        tf.cast(tf.equal(action, k), self._dtype))
    observations_for_arm = tf.matmul(diag_mask, observation)
    rewards_for_arm = tf.matmul(diag_mask, tf.reshape(reward, [-1, 1]))

    # Compute local updates for the matrix A and vector b of this arm.
    cov_matrix_local_update = tf.matmul(
        observations_for_arm, observations_for_arm, transpose_a=True)
    data_vector_local_update = bandit_utils.sum_reward_weighted_observations(
        rewards_for_arm, observations_for_arm)

    def _merge_fn(strategy, per_replica_cov_matrix_update,
                  per_replica_data_vector_update):
      """Merges the per-replica updates."""
      # Reduce the per-replica updates using SUM.
      reduced_cov_matrix_updates = strategy.reduce(
          tf.distribute.ReduceOp.SUM, per_replica_cov_matrix_update,
          axis=None)
      reduced_data_vector_updates = strategy.reduce(
          tf.distribute.ReduceOp.SUM, per_replica_data_vector_update,
          axis=None)

      def update_fn(v, t):
        v.assign(v + t)

      def assign_fn(v, t):
        v.assign(t)

      # Update the model variables.
      # pylint: disable=cell-var-from-loop
      strategy.extended.update(
          self._cov_matrix_list[k], update_fn,
          args=(reduced_cov_matrix_updates,))
      strategy.extended.update(
          self._data_vector_list[k], update_fn,
          args=(reduced_data_vector_updates,))

      # Compute the eigendecomposition, if needed.
      if self._use_eigendecomp:
        eig_vals, eig_matrix = tf.linalg.eigh(self._cov_matrix_list[k])
        strategy.extended.update(
            self._eig_vals_list[k], assign_fn, args=(eig_vals,))
        strategy.extended.update(
            self._eig_matrix_list[k], assign_fn, args=(eig_matrix,))

    # Passes the local updates to `_merge_fn()` above, which performs custom
    # computation on the per-replica values. All replicas pause their
    # execution until `merge_call()` is done, and then execution is resumed.
    replica_context = tf.distribute.get_replica_context()
    replica_context.merge_call(
        _merge_fn, args=(cov_matrix_local_update, data_vector_local_update))

  loss = -1. * tf.reduce_sum(experience.reward)
  return tf_agent.LossInfo(loss=(loss), extra=())
def testLinearThompsonSamplingUpdateWithForgetting(
    self, batch_size, context_dim, dtype):
  """Check forgetting agent updates for specified actions and rewards."""
  gamma = 0.9

  # Construct a `Trajectory` for the given action, observation, reward.
  num_actions = 5
  initial_step, final_step = _get_initial_and_final_steps(
      batch_size, context_dim)
  action = np.random.randint(num_actions, size=batch_size, dtype=np.int32)
  action_step = _get_action_step(action)
  experience = _get_experience(initial_step, action_step, final_step)

  # Construct an agent and perform the update. Record initial and final
  # weights.
  observation_spec = tensor_spec.TensorSpec([context_dim], tf.float32)
  time_step_spec = time_step.time_step_spec(observation_spec)
  action_spec = tensor_spec.BoundedTensorSpec(
      dtype=tf.int32, shape=(), minimum=0, maximum=num_actions - 1)
  agent = lin_ts_agent.LinearThompsonSamplingAgent(
      time_step_spec=time_step_spec,
      action_spec=action_spec,
      gamma=gamma,
      dtype=dtype)
  self.evaluate(tf.compat.v1.global_variables_initializer())
  initial_weight_covariances = self.evaluate(agent._weight_covariances)
  initial_parameter_estimators = self.evaluate(agent._parameter_estimators)
  loss_info = agent.train(experience)
  self.evaluate(loss_info)
  final_weight_covariances = self.evaluate(agent.weight_covariances)
  final_parameter_estimators = self.evaluate(agent.parameter_estimators)

  # Compute the expected updates.
  observations_list = tf.dynamic_partition(
      data=tf.reshape(experience.observation, [batch_size, context_dim]),
      partitions=tf.convert_to_tensor(action),
      num_partitions=num_actions)
  rewards_list = tf.dynamic_partition(
      data=tf.reshape(experience.reward, [batch_size]),
      partitions=tf.convert_to_tensor(action),
      num_partitions=num_actions)
  expected_weight_covariances_update = []
  expected_parameter_estimators_update = []
  for k, (observations_for_arm, rewards_for_arm) in enumerate(
      zip(observations_list, rewards_list)):
    expected_weight_covariances_update.append(
        self.evaluate(gamma * initial_weight_covariances[k] + tf.matmul(
            observations_for_arm, observations_for_arm, transpose_a=True)))
    expected_parameter_estimators_update.append(
        self.evaluate(gamma * initial_parameter_estimators[k] +
                      bandit_utils.sum_reward_weighted_observations(
                          rewards_for_arm, observations_for_arm)))
  self.assertAllClose(expected_weight_covariances_update,
                      final_weight_covariances)
  self.assertAllClose(expected_parameter_estimators_update,
                      final_parameter_estimators)
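# A small illustration (hypothetical values) of the `tf.dynamic_partition`
# grouping used in the test above: rows of `data` are routed to one output
# per arm, so partition `k` holds exactly the observations of arm `k`.
def _example_dynamic_partition():
  data = tf.constant([[1., 1.],
                      [2., 2.],
                      [3., 3.]])
  actions = tf.constant([1, 0, 1])
  parts = tf.dynamic_partition(
      data=data, partitions=actions, num_partitions=2)
  # parts[0] == [[2., 2.]]
  # parts[1] == [[1., 1.], [3., 3.]]
  return parts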
def compute_loss_using_linucb_distributed(
    self,
    observation: types.NestedTensor,
    action: types.Tensor,
    reward: types.Tensor,
    weights: Optional[types.Float] = None,
    training: bool = False) -> tf_agent.LossInfo:
  """Computes the loss using LinUCB distributively.

  Args:
    observation: A batch of observations.
    action: A batch of actions.
    reward: A batch of rewards.
    weights: unused weights.
    training: Whether the loss is being used to train.

  Returns:
    loss: A `LossInfo` containing the loss for the training step.
  """
  del weights  # unused

  # The network is trained now. Update the covariance matrix.
  encoded_observation, _ = self._encoding_network(
      observation, training=training)
  encoded_observation = tf.cast(encoded_observation, dtype=self._dtype)
  encoded_observation = tf.reshape(
      encoded_observation, shape=[-1, self._encoding_dim])

  self._train_step_counter.assign_add(1)

  for k in range(self._num_models):
    diag_mask = tf.linalg.tensor_diag(
        tf.cast(tf.equal(action, k), self._dtype))
    observations_for_arm = tf.matmul(diag_mask, encoded_observation)
    rewards_for_arm = tf.matmul(diag_mask, tf.reshape(reward, [-1, 1]))

    # Compute local updates for the matrix A and vector b of this arm.
    cov_matrix_local_update = tf.matmul(
        observations_for_arm, observations_for_arm, transpose_a=True)
    data_vector_local_update = bandit_utils.sum_reward_weighted_observations(
        rewards_for_arm, observations_for_arm)

    def _merge_fn(strategy, per_replica_cov_matrix_update,
                  per_replica_data_vector_update):
      """Merges the per-replica updates."""
      # Reduce the per-replica updates using SUM.
      # pylint: disable=cell-var-from-loop
      updates_and_vars = [
          (per_replica_cov_matrix_update, self.cov_matrix[k]),
          (per_replica_data_vector_update, self.data_vector[k])
      ]
      reduced_updates = strategy.extended.batch_reduce_to(
          tf.distribute.ReduceOp.SUM, updates_and_vars)

      # Update the model variables, applying the forgetting factor `gamma`.
      self.cov_matrix[k].assign(
          self._gamma * self.cov_matrix[k] + reduced_updates[0])
      self.data_vector[k].assign(
          self._gamma * self.data_vector[k] + reduced_updates[1])

    # Passes the local updates to `_merge_fn()` above, which performs custom
    # computation on the per-replica values. All replicas pause their
    # execution until `merge_call()` is done, and then execution is resumed.
    replica_context = tf.distribute.get_replica_context()
    replica_context.merge_call(
        _merge_fn, args=(cov_matrix_local_update, data_vector_local_update))

  loss = -1. * tf.reduce_sum(reward)
  return tf_agent.LossInfo(loss=(loss), extra=())
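# Once A (`cov_matrix`) and b (`data_vector`) are maintained as above, the
# standard LinUCB parameter estimate for arm k is the ridge-regression
# solution theta_k = A_k^{-1} b_k. A minimal sketch under assumed shapes
# (A_k: [encoding_dim, encoding_dim], b_k: [encoding_dim]); this is an
# illustration of how the statistics are typically consumed, not this
# agent's own code.
def _example_linucb_theta(cov_matrix_k, data_vector_k):
  # Solve A theta = b instead of forming an explicit inverse.
  return tf.squeeze(
      tf.linalg.solve(cov_matrix_k, tf.expand_dims(data_vector_k, axis=-1)),
      axis=-1)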
def testBUpdateEmptyObservations(self, batch_size, context_dim):
  r = tf.constant([], dtype=tf.float32, shape=[0, 1])
  x = tf.constant([], dtype=tf.float32, shape=[0, context_dim])
  b_update = utils.sum_reward_weighted_observations(r, x)
  expected_b_update_array = np.zeros([context_dim], dtype=np.float32)
  self.assertAllClose(expected_b_update_array, self.evaluate(b_update))