Example 1
def update_a_and_b_with_forgetting(a_prev,
                                   b_prev,
                                   r,
                                   x,
                                   gamma,
                                   compute_eigendecomp=False):
    r"""Update the covariance matrix `a` and the weighted sum of rewards `b`.

    This function updates the covariance matrix `a` and the sum of weighted
    rewards `b` using a forgetting factor `gamma`.

    Args:
      a_prev: previous estimate of `a`.
      b_prev: previous estimate of `b`.
      r: a `Tensor` of shape [`batch_size`] containing the rewards of the
        batched observations.
      x: a `Tensor` of shape [`batch_size`, `context_dim`] containing the
        batched observations.
      gamma: a float forgetting factor in [0.0, 1.0].
      compute_eigendecomp: whether to compute the eigen-decomposition of the
        new covariance matrix.

    Returns:
      The updated estimates of `a` and `b` and, optionally, the eigenvalues
      and eigenvectors of `a`.
    """
    a_new = gamma * a_prev + tf.matmul(x, x, transpose_a=True)
    b_new = gamma * b_prev + bandit_utils.sum_reward_weighted_observations(
        r, x)

    eig_vals = tf.constant([], dtype=a_new.dtype)
    eig_matrix = tf.constant([], dtype=a_new.dtype)
    if compute_eigendecomp:
        eig_vals, eig_matrix = tf.linalg.eigh(a_new)
    return a_new, b_new, eig_vals, eig_matrix
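For orientation, here is a minimal usage sketch of the update above. It assumes `tf` is TensorFlow and that `bandit_utils.sum_reward_weighted_observations` (the TF-Agents bandit helper used in the function) is importable; the shapes and values are made up.

import tensorflow as tf

# Illustrative shapes: batch_size = 2, context_dim = 3.
a_prev = tf.eye(3)                             # previous covariance estimate
b_prev = tf.zeros([3])                         # previous reward-weighted sum
x = tf.constant([[1., 0., 2.], [0., 1., 1.]])  # batched observations
r = tf.constant([1., 2.])                      # batched rewards

a_new, b_new, eig_vals, eig_matrix = update_a_and_b_with_forgetting(
    a_prev, b_prev, r, x, gamma=0.9, compute_eigendecomp=True)
# a_new == 0.9 * a_prev + x^T x
# b_new == 0.9 * b_prev + sum_i r_i * x_i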
Example 2
def true_fn():
    a_new = tf.matmul(observations_for_arm,
                      observations_for_arm,
                      transpose_a=True)
    b_new = bandit_utils.sum_reward_weighted_observations(
        rewards_for_arm, observations_for_arm)
    return a_new, b_new
    def _train(self, experience, weights=None):
        """Updates the policy based on the data in `experience`.

        Note that `experience` should only contain data points that this agent
        has not previously seen. If `experience` comes from a replay buffer,
        this buffer should be cleared between each call to `train`.

        Args:
          experience: A batch of experience data in the form of a `Trajectory`.
          weights: Unused.

        Returns:
          A `LossInfo` containing the loss *before* the training step is taken.
        """
        del weights  # unused

        # If the experience comes from a replay buffer, the reward has shape:
        #     [batch_size, time_steps]
        # where `time_steps` is the number of driver steps executed in each
        # training loop.
        # We flatten the tensors below in order to reflect the effective batch size.

        reward, _ = nest_utils.flatten_multi_batched_nested_tensors(
            experience.reward, self._time_step_spec.reward)
        action, _ = nest_utils.flatten_multi_batched_nested_tensors(
            experience.action, self._action_spec)
        observation, _ = nest_utils.flatten_multi_batched_nested_tensors(
            experience.observation, self._time_step_spec.observation)
        if self._observation_and_action_constraint_splitter is not None:
            observation, _ = self._observation_and_action_constraint_splitter(
                observation)
        observation = tf.cast(observation, self._dtype)
        reward = tf.cast(reward, self._dtype)

        for k in range(self._num_actions):
            diag_mask = tf.linalg.tensor_diag(
                tf.cast(tf.equal(action, k), self._dtype))
            observations_for_arm = tf.matmul(diag_mask, observation)
            rewards_for_arm = tf.matmul(diag_mask, tf.reshape(reward, [-1, 1]))
            tf.compat.v1.assign(
                self._weight_covariances[k],
                self._gamma * self._weight_covariances[k] +
                tf.matmul(observations_for_arm,
                          observations_for_arm,
                          transpose_a=True))
            tf.compat.v1.assign(
                self._parameter_estimators[k],
                self._gamma * self._parameter_estimators[k] +
                bandit_utils.sum_reward_weighted_observations(
                    rewards_for_arm, observations_for_arm))

        batch_size = tf.cast(tf.compat.dimension_value(tf.shape(reward)[0]),
                             dtype=tf.int64)
        self._train_step_counter.assign_add(batch_size)

        loss_info = tf_agent.LossInfo(loss=(-1. *
                                            tf.reduce_sum(experience.reward)),
                                      extra=())
        return loss_info
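The loop above relies on a diagonal-mask trick to select the rows belonging to arm `k` without ragged indexing. A small standalone illustration (not library code), with made-up values:

import tensorflow as tf

action = tf.constant([0, 2, 0, 1])            # arm played for each row
observation = tf.constant([[1., 2.],
                           [3., 4.],
                           [5., 6.],
                           [7., 8.]])
k = 0
diag_mask = tf.linalg.tensor_diag(tf.cast(tf.equal(action, k), tf.float32))
observations_for_arm = tf.matmul(diag_mask, observation)
# Rows whose action != 0 are zeroed out:
# [[1., 2.], [0., 0.], [5., 6.], [0., 0.]]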
Example 4
def true_fn():
    a_new = tf.eye(encoding_dim, dtype=tf.float64) + tf.matmul(
        encoded_observations_for_arm,
        encoded_observations_for_arm,
        transpose_a=True)
    b_new = bandit_utils.sum_reward_weighted_observations(
        rewards_for_arm, encoded_observations_for_arm)
    return a_new, b_new
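A side note on the `tf.eye` term above: starting from the identity acts as a ridge-style prior that keeps `a_new` invertible even before any data has been seen for the arm. A tiny illustration under assumed shapes:

import tensorflow as tf

encoding_dim = 2
# No observations for this arm yet: an empty [0, encoding_dim] matrix.
encoded_observations_for_arm = tf.zeros([0, encoding_dim], dtype=tf.float64)
a_new = tf.eye(encoding_dim, dtype=tf.float64) + tf.matmul(
    encoded_observations_for_arm, encoded_observations_for_arm, transpose_a=True)
# a_new equals the identity, so tf.linalg.inv(a_new) is well defined.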
Example 5
  def testLinearAgentUpdatePerArmFeatures(self,
                                          batch_size,
                                          context_dim,
                                          exploration_policy,
                                          dtype,
                                          use_eigendecomp=False):
    """Check that the agent updates for specified actions and rewards."""

    # Construct a `Trajectory` for the given action, observation, reward.
    num_actions = 5
    global_context_dim = context_dim
    arm_context_dim = 3
    initial_step, final_step = (
        _get_initial_and_final_steps_with_per_arm_features(
            batch_size, global_context_dim, num_actions, arm_context_dim))
    action = np.random.randint(num_actions, size=batch_size, dtype=np.int32)
    action_step = policy_step.PolicyStep(
        action=tf.convert_to_tensor(action),
        info=policy_utilities.PerArmPolicyInfo(
            chosen_arm_features=np.arange(
                batch_size * arm_context_dim, dtype=np.float32).reshape(
                    [batch_size, arm_context_dim])))
    experience = _get_experience(initial_step, action_step, final_step)

    # Construct an agent and perform the update.
    observation_spec = bandit_spec_utils.create_per_arm_observation_spec(
        context_dim, arm_context_dim, num_actions)
    time_step_spec = time_step.time_step_spec(observation_spec)
    action_spec = tensor_spec.BoundedTensorSpec(
        dtype=tf.int32, shape=(), minimum=0, maximum=num_actions - 1)
    agent = linear_agent.LinearBanditAgent(
        exploration_policy=exploration_policy,
        time_step_spec=time_step_spec,
        action_spec=action_spec,
        use_eigendecomp=use_eigendecomp,
        accepts_per_arm_features=True,
        dtype=dtype)
    self.evaluate(agent.initialize())
    loss_info = agent.train(experience)
    self.evaluate(loss_info)
    final_a = self.evaluate(agent.cov_matrix)
    final_b = self.evaluate(agent.data_vector)

    # Compute the expected updated estimates.
    global_observation = experience.observation[
        bandit_spec_utils.GLOBAL_FEATURE_KEY]
    arm_observation = experience.policy_info.chosen_arm_features
    overall_observation = tf.squeeze(
        tf.concat([global_observation, arm_observation], axis=-1), axis=1)
    rewards = tf.squeeze(experience.reward, axis=1)

    expected_a_new = tf.matmul(
        overall_observation, overall_observation, transpose_a=True)
    expected_b_new = bandit_utils.sum_reward_weighted_observations(
        rewards, overall_observation)
    self.assertAllClose(expected_a_new, final_a[0])
    self.assertAllClose(expected_b_new, final_b[0])
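For clarity, the per-arm feature layout assumed by the test above: each training example concatenates the global context with the features of the arm that was actually chosen. A shape-only sketch (the time dimension that the test squeezes out is omitted here):

import tensorflow as tf

batch_size, global_context_dim, arm_context_dim = 2, 4, 3
global_obs = tf.zeros([batch_size, global_context_dim])
chosen_arm_features = tf.ones([batch_size, arm_context_dim])
overall_observation = tf.concat([global_obs, chosen_arm_features], axis=-1)
# overall_observation.shape == [batch_size, global_context_dim + arm_context_dim]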
Example 6
def true_fn():
  a_new = gamma * tf.eye(context_dim) + tf.matmul(
      observations_for_arm, observations_for_arm, transpose_a=True)
  b_new = bandit_utils.sum_reward_weighted_observations(
      rewards_for_arm, observations_for_arm)
  eigmatrix_new = tf.constant([], dtype=dtype)
  eigvals_new = tf.constant([], dtype=dtype)
  if use_eigendecomp:
    eigvals_new, eigmatrix_new = tf.linalg.eigh(a_new)
  return a_new, b_new, eigvals_new, eigmatrix_new
Example 7
    def _distributed_train_step(self, experience, weights=None):
        """Distributed train fn to be passed as input to run()."""
        del weights  # unused
        reward, action, observation, batch_size = self._process_experience(
            experience)
        self._train_step_counter.assign_add(batch_size)

        for k in range(self._num_models):
            diag_mask = tf.linalg.tensor_diag(
                tf.cast(tf.equal(action, k), self._dtype))
            observations_for_arm = tf.matmul(diag_mask, observation)
            rewards_for_arm = tf.matmul(diag_mask, tf.reshape(reward, [-1, 1]))

            # Compute local updates for the matrix A and b of this arm.
            cov_matrix_local_update = tf.matmul(observations_for_arm,
                                                observations_for_arm,
                                                transpose_a=True)
            data_vector_local_update = bandit_utils.sum_reward_weighted_observations(
                rewards_for_arm, observations_for_arm)

            def _merge_fn(strategy, per_replica_cov_matrix_update,
                          per_replica_data_vector_update):
                """Merge the per-replica-updates."""
                # Reduce the per-replica-updates using SUM.
                # pylint: disable=cell-var-from-loop
                updates_and_vars = [
                    (per_replica_cov_matrix_update, self._cov_matrix_list[k]),
                    (per_replica_data_vector_update, self._data_vector_list[k])
                ]

                reduced_updates = strategy.extended.batch_reduce_to(
                    tf.distribute.ReduceOp.SUM, updates_and_vars)

                # Update the model variables.
                self._cov_matrix_list[k].assign_add(reduced_updates[0])
                self._data_vector_list[k].assign_add(reduced_updates[1])

                # Compute the eigendecomposition, if needed.
                if self._use_eigendecomp:
                    eig_vals, eig_matrix = tf.linalg.eigh(
                        self._cov_matrix_list[k])
                    self._eig_vals_list[k].assign(eig_vals)
                    self._eig_matrix_list[k].assign(eig_matrix)

            # Pass the local updates to _merge_fn() above, which performs
            # custom computation on the per-replica values.
            # All replicas pause their execution until merge_call() is done,
            # and then execution resumes.
            replica_context = tf.distribute.get_replica_context()
            replica_context.merge_call(_merge_fn,
                                       args=(cov_matrix_local_update,
                                             data_vector_local_update))

        loss = -1. * tf.reduce_sum(reward)
        return tf_agent.LossInfo(loss=(loss), extra=())
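A hedged sketch of how a step like this is usually driven; `strategy`, `agent`, and `experience` are assumptions here, not names from the snippet above. `strategy.run` executes the function on every replica, and the `merge_call` above then folds the per-replica A/b updates into the shared variables once per step.

import tensorflow as tf

strategy = tf.distribute.MirroredStrategy()
# `agent` is assumed to be constructed under strategy.scope() and to expose
# the _distributed_train_step defined above.

@tf.function
def train_one_batch(experience):
  per_replica_loss_info = strategy.run(
      agent._distributed_train_step, args=(experience,))
  return strategy.reduce(
      tf.distribute.ReduceOp.MEAN, per_replica_loss_info.loss, axis=None)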
Example 8
  def testBUpdate(self, batch_size, context_dim):
    b_array = np.array(range(context_dim))
    r_array = np.array(range(batch_size)).reshape((batch_size, 1))
    x_array = np.array(range(batch_size * context_dim)).reshape(
        (batch_size, context_dim))
    rx = r_array * x_array
    expected_b_updated_array = b_array + np.sum(rx, axis=0)

    b = tf.constant(b_array, dtype=tf.float32, shape=[context_dim])
    r = tf.constant(r_array, dtype=tf.float32, shape=[batch_size])
    x = tf.constant(x_array, dtype=tf.float32, shape=[batch_size, context_dim])
    b_update = utils.sum_reward_weighted_observations(r, x)
    self.assertAllClose(expected_b_updated_array, self.evaluate(b + b_update))
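The quantity being tested is simply the reward-weighted sum of observations, sum_i r_i * x_i. A minimal reference implementation under the same shape assumptions (`r` of shape `[batch_size]`, `x` of shape `[batch_size, context_dim]`); the real helper lives in the bandit `utils` module:

import tensorflow as tf

def sum_reward_weighted_observations_ref(r, x):
  """Returns sum_i r_i * x_i as a [context_dim] tensor."""
  return tf.reduce_sum(tf.expand_dims(r, axis=-1) * x, axis=0)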
Example 9
    def _distributed_train_step(self, experience, weights=None):
        """Distributed train fn to be passed as input to run()."""
        del weights  # unused
        reward, _ = nest_utils.flatten_multi_batched_nested_tensors(
            experience.reward, self._time_step_spec.reward)
        action, _ = nest_utils.flatten_multi_batched_nested_tensors(
            experience.action, self._action_spec)
        observation, _ = nest_utils.flatten_multi_batched_nested_tensors(
            experience.observation, self._time_step_spec.observation)

        if self._observation_and_action_constraint_splitter is not None:
            observation, _ = self._observation_and_action_constraint_splitter(
                observation)
        observation = tf.reshape(observation, [-1, self._context_dim])
        observation = tf.cast(observation, self._dtype)
        reward = tf.cast(reward, self._dtype)

        # Increase the step counter.
        batch_size = tf.cast(tf.compat.dimension_value(tf.shape(reward)[0]),
                             dtype=tf.int64)
        self._train_step_counter.assign_add(batch_size)

        for k in range(self._num_actions):
            diag_mask = tf.linalg.tensor_diag(
                tf.cast(tf.equal(action, k), self._dtype))
            observations_for_arm = tf.matmul(diag_mask, observation)
            rewards_for_arm = tf.matmul(diag_mask, tf.reshape(reward, [-1, 1]))

            # Compute local updates for the matrix A and b of this arm.
            cov_matrix_local_update = tf.matmul(observations_for_arm,
                                                observations_for_arm,
                                                transpose_a=True)
            data_vector_local_update = bandit_utils.sum_reward_weighted_observations(
                rewards_for_arm, observations_for_arm)

            def _merge_fn(strategy, per_replica_cov_matrix_update,
                          per_replica_data_vector_update):
                """Merge the per-replica-updates."""
                # Reduce the per-replica-updates using SUM.
                reduced_cov_matrix_updates = strategy.reduce(
                    tf.distribute.ReduceOp.SUM,
                    per_replica_cov_matrix_update,
                    axis=None)
                reduced_data_vector_updates = strategy.reduce(
                    tf.distribute.ReduceOp.SUM,
                    per_replica_data_vector_update,
                    axis=None)

                def update_fn(v, t):
                    v.assign(v + t)

                def assign_fn(v, t):
                    v.assign(t)

                # Update the model variables.
                # pylint: disable=cell-var-from-loop
                strategy.extended.update(self._cov_matrix_list[k],
                                         update_fn,
                                         args=(reduced_cov_matrix_updates, ))
                strategy.extended.update(self._data_vector_list[k],
                                         update_fn,
                                         args=(reduced_data_vector_updates, ))
                # Compute the eigendecomposition, if needed.
                if self._use_eigendecomp:
                    eig_vals, eig_matrix = tf.linalg.eigh(
                        self._cov_matrix_list[k])
                    strategy.extended.update(self._eig_vals_list[k],
                                             assign_fn,
                                             args=(eig_vals, ))
                    strategy.extended.update(self._eig_matrix_list[k],
                                             assign_fn,
                                             args=(eig_matrix, ))

            # Pass the local updates to _merge_fn() above, which performs
            # custom computation on the per-replica values.
            # All replicas pause their execution until merge_call() is done,
            # and then execution resumes.
            replica_context = tf.distribute.get_replica_context()
            replica_context.merge_call(_merge_fn,
                                       args=(cov_matrix_local_update,
                                             data_vector_local_update))

        loss = -1. * tf.reduce_sum(experience.reward)
        return tf_agent.LossInfo(loss=(loss), extra=())
Example 10
    def testLinearThompsonSamplingUpdateWithForgetting(self, batch_size,
                                                       context_dim, dtype):
        """Check forgetting agent updates for specified actions and rewards."""
        gamma = 0.9

        # Construct a `Trajectory` for the given action, observation, reward.
        num_actions = 5
        initial_step, final_step = _get_initial_and_final_steps(
            batch_size, context_dim)
        action = np.random.randint(num_actions,
                                   size=batch_size,
                                   dtype=np.int32)
        action_step = _get_action_step(action)
        experience = _get_experience(initial_step, action_step, final_step)

        # Construct an agent and perform the update. Record initial and final
        # weights.
        observation_spec = tensor_spec.TensorSpec([context_dim], tf.float32)
        time_step_spec = time_step.time_step_spec(observation_spec)
        action_spec = tensor_spec.BoundedTensorSpec(dtype=tf.int32,
                                                    shape=(),
                                                    minimum=0,
                                                    maximum=num_actions - 1)
        agent = lin_ts_agent.LinearThompsonSamplingAgent(
            time_step_spec=time_step_spec,
            action_spec=action_spec,
            gamma=gamma,
            dtype=dtype)
        self.evaluate(tf.compat.v1.global_variables_initializer())
        initial_weight_covariances = self.evaluate(agent._weight_covariances)
        initial_parameter_estimators = self.evaluate(
            agent._parameter_estimators)

        loss_info = agent.train(experience)
        self.evaluate(loss_info)
        final_weight_covariances = self.evaluate(agent.weight_covariances)
        final_parameter_estimators = self.evaluate(agent.parameter_estimators)

        # Compute the expected updates.
        observations_list = tf.dynamic_partition(
            data=tf.reshape(experience.observation, [batch_size, context_dim]),
            partitions=tf.convert_to_tensor(action),
            num_partitions=num_actions)
        rewards_list = tf.dynamic_partition(
            data=tf.reshape(experience.reward, [batch_size]),
            partitions=tf.convert_to_tensor(action),
            num_partitions=num_actions)
        expected_weight_covariances_update = []
        expected_parameter_estimators_update = []
        for k, (observations_for_arm, rewards_for_arm) in enumerate(
                zip(observations_list, rewards_list)):
            expected_weight_covariances_update.append(
                self.evaluate(gamma * initial_weight_covariances[k] +
                              tf.matmul(observations_for_arm,
                                        observations_for_arm,
                                        transpose_a=True)))
            expected_parameter_estimators_update.append(
                self.evaluate(gamma * initial_parameter_estimators[k] +
                              bandit_utils.sum_reward_weighted_observations(
                                  rewards_for_arm, observations_for_arm)))
        self.assertAllClose(expected_weight_covariances_update,
                            final_weight_covariances)
        self.assertAllClose(expected_parameter_estimators_update,
                            final_parameter_estimators)
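A small standalone illustration of the `tf.dynamic_partition` call used above to group observations by the arm that was played; the values are made up.

import tensorflow as tf

data = tf.constant([[1., 1.], [2., 2.], [3., 3.]])
action = tf.constant([1, 0, 1])
parts = tf.dynamic_partition(data=data, partitions=action, num_partitions=2)
# parts[0] == [[2., 2.]]               rows where action == 0
# parts[1] == [[1., 1.], [3., 3.]]     rows where action == 1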
Example 11
    def compute_loss_using_linucb_distributed(
            self,
            observation: types.NestedTensor,
            action: types.Tensor,
            reward: types.Tensor,
            weights: Optional[types.Float] = None,
            training: bool = False) -> tf_agent.LossInfo:
        """Computes the loss using LinUCB distributively.

        Args:
          observation: A batch of observations.
          action: A batch of actions.
          reward: A batch of rewards.
          weights: unused weights.
          training: Whether the loss is being used to train.

        Returns:
          loss: A `LossInfo` containing the loss for the training step.
        """
        del weights  # unused

        # The network is trained now. Update the covariance matrix.
        encoded_observation, _ = self._encoding_network(observation,
                                                        training=training)
        encoded_observation = tf.cast(encoded_observation, dtype=self._dtype)
        encoded_observation = tf.reshape(encoded_observation,
                                         shape=[-1, self._encoding_dim])

        self._train_step_counter.assign_add(1)

        for k in range(self._num_models):
            diag_mask = tf.linalg.tensor_diag(
                tf.cast(tf.equal(action, k), self._dtype))
            observations_for_arm = tf.matmul(diag_mask, encoded_observation)
            rewards_for_arm = tf.matmul(diag_mask, tf.reshape(reward, [-1, 1]))

            # Compute local updates for the matrix A and b of this arm.
            cov_matrix_local_update = tf.matmul(observations_for_arm,
                                                observations_for_arm,
                                                transpose_a=True)
            data_vector_local_update = bandit_utils.sum_reward_weighted_observations(
                rewards_for_arm, observations_for_arm)

            def _merge_fn(strategy, per_replica_cov_matrix_update,
                          per_replica_data_vector_update):
                """Merge the per-replica-updates."""
                # Reduce the per-replica-updates using SUM.
                # pylint: disable=cell-var-from-loop
                updates_and_vars = [
                    (per_replica_cov_matrix_update, self.cov_matrix[k]),
                    (per_replica_data_vector_update, self.data_vector[k])
                ]

                reduced_updates = strategy.extended.batch_reduce_to(
                    tf.distribute.ReduceOp.SUM, updates_and_vars)

                # Update the model variables.
                self.cov_matrix[k].assign(self._gamma * self.cov_matrix[k] +
                                          reduced_updates[0])
                self.data_vector[k].assign(self._gamma * self.data_vector[k] +
                                           reduced_updates[1])

            # Pass the local updates to _merge_fn() above, which performs
            # custom computation on the per-replica values.
            # All replicas pause their execution until merge_call() is done,
            # and then execution resumes.
            replica_context = tf.distribute.get_replica_context()
            replica_context.merge_call(_merge_fn,
                                       args=(cov_matrix_local_update,
                                             data_vector_local_update))

        loss = -1. * tf.reduce_sum(reward)
        return tf_agent.LossInfo(loss=(loss), extra=())
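Not part of the snippet above, but for orientation: once `cov_matrix[k]` (A) and `data_vector[k]` (b) are maintained this way, a LinUCB-style policy typically derives its per-arm estimate and score roughly as below. This is a generic sketch, not the library's policy code; `alpha` and the function name are assumptions.

import tensorflow as tf

def linucb_score_sketch(a, b, context, alpha=1.0):
  """Rough LinUCB score for one arm: x^T theta + alpha * sqrt(x^T A^{-1} x)."""
  a_inv = tf.linalg.inv(a)                      # A^{-1}
  theta = tf.linalg.matvec(a_inv, b)            # theta = A^{-1} b
  mean = tf.tensordot(context, theta, axes=1)   # x^T theta
  width = tf.sqrt(tf.tensordot(context, tf.linalg.matvec(a_inv, context), axes=1))
  return mean + alpha * width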
Example 12
  def testBUpdateEmptyObservations(self, batch_size, context_dim):
    r = tf.constant([], dtype=tf.float32, shape=[0, 1])
    x = tf.constant([], dtype=tf.float32, shape=[0, context_dim])
    b_update = utils.sum_reward_weighted_observations(r, x)
    expected_b_update_array = np.zeros([context_dim], dtype=np.float32)
    self.assertAllClose(expected_b_update_array, self.evaluate(b_update))