Example #1
    def testConjugateGradientMultipleRHSPlaceholders(self, n, rhs):
        # Test the case where a_mat and b_mat are placeholders whose dimension
        # values are unknown at graph-construction time.

        if tf.executing_eagerly():
            # Placeholders only exist in graph mode, so skip under eager.
            return

        x_obs = tf.constant(np.random.rand(n, 2),
                            dtype=tf.float32,
                            shape=[n, 2])
        a_mat = tf.eye(n) + tf.matmul(x_obs, tf.linalg.matrix_transpose(x_obs))
        a_mat_ph = tf.compat.v1.placeholder(tf.float32, shape=(None, None))
        a_mat_value = self.evaluate(a_mat)

        x_exact = tf.constant(np.random.rand(n, rhs),
                              dtype=tf.float32,
                              shape=[n, rhs])
        b_mat = tf.matmul(a_mat, x_exact)
        b_mat_ph = tf.compat.v1.placeholder(tf.float32, shape=(None, None))
        b_mat_value = self.evaluate(b_mat)

        x_exact_numpy = self.evaluate(x_exact)
        with self.cached_session() as sess:
            x_approx = linalg.conjugate_gradient(a_mat_ph, b_mat_ph)
            x_approx_value = sess.run(x_approx,
                                      feed_dict={
                                          a_mat_ph: a_mat_value,
                                          b_mat_ph: b_mat_value
                                      })
            self.assertAllClose(x_exact_numpy,
                                x_approx_value,
                                rtol=1e-4,
                                atol=1e-4)
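
For readers who want to try the call outside the test harness, here is a minimal eager-mode sketch of the same multi-right-hand-side solve, assuming `linalg` refers to tf_agents.bandits.policies.linalg (the module these tests appear to exercise):

import numpy as np
import tensorflow as tf
from tf_agents.bandits.policies import linalg  # assumed source module

n, rhs = 8, 3
x_obs = tf.constant(np.random.rand(n, 2), dtype=tf.float32)
# I + X X^T is symmetric positive definite, as conjugate gradient requires.
a_mat = tf.eye(n) + tf.matmul(x_obs, x_obs, transpose_b=True)
x_exact = tf.constant(np.random.rand(n, rhs), dtype=tf.float32)
b_mat = tf.matmul(a_mat, x_exact)
x_approx = linalg.conjugate_gradient(a_mat, b_mat)
np.testing.assert_allclose(x_exact.numpy(), x_approx.numpy(),
                           rtol=1e-4, atol=1e-4)
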
Example #2
    def testConjugateGradientBasic(self, n, rhs):
        x_obs = tf.constant(np.random.rand(n, 2),
                            dtype=tf.float32,
                            shape=[n, 2])
        a_mat = tf.eye(n) + tf.matmul(x_obs, tf.linalg.matrix_transpose(x_obs))
        x_exact = tf.constant(np.random.rand(n),
                              dtype=tf.float32,
                              shape=[n, 1])
        b = tf.matmul(a_mat, x_exact)
        x_approx = self.evaluate(linalg.conjugate_gradient(a_mat, b))
        x_exact_numpy = self.evaluate(x_exact)
        self.assertAllClose(x_exact_numpy, x_approx, rtol=1e-4, atol=1e-4)
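
Conjugate gradient is only guaranteed to converge on symmetric positive-definite systems, which is why both tests build a_mat as I + X Xᵀ: the Gram term X Xᵀ is positive semi-definite, so every eigenvalue of the sum is at least 1. A quick sketch checking that property with plain TensorFlow:

import numpy as np
import tensorflow as tf

n = 8
x_obs = tf.constant(np.random.rand(n, 2), dtype=tf.float32)
a_mat = tf.eye(n) + tf.matmul(x_obs, x_obs, transpose_b=True)
# eigvalsh applies because a_mat is symmetric; all eigenvalues are >= 1.
assert bool(tf.reduce_all(tf.linalg.eigvalsh(a_mat) >= 1.0 - 1e-5))
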
Example #3
    def theta(self):
        """Returns the matrix of per-arm feature weights.

        The returned matrix has shape (num_actions, context_dim).
        It's equivalent to a stacking of theta vectors from the paper.
        """
        thetas = []
        for k in range(self._num_models):
            thetas.append(
                tf.squeeze(linalg.conjugate_gradient(
                    self._cov_matrix_list[k] + self._tikhonov_weight *
                    tf.eye(self._overall_context_dim, dtype=self._dtype),
                    tf.expand_dims(self._data_vector_list[k], axis=-1)),
                           axis=-1))

        return tf.stack(thetas, axis=0)
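
    # A worked equation for the solve above (annotation only, not library
    # code): with per-arm covariance S_k and data vector d_k, theta() returns
    #     theta_k = (S_k + tikhonov_weight * I)^{-1} d_k,
    # computed iteratively by conjugate gradient, which converges because the
    # Tikhonov-regularized covariance is symmetric positive definite.
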
    def _get_actions_from_linucb(
        self, encoded_observation: types.Float, mask: Optional[types.Tensor]
    ) -> Tuple[types.Int, types.Float, types.Float]:
        encoded_observation = tf.cast(encoded_observation, dtype=self._dtype)

        p_values = []
        est_rewards = []
        for k in range(self._num_actions):
            encoded_observation_for_arm = self._get_encoded_observation_for_arm(
                encoded_observation, k)
            model_index = policy_utilities.get_model_index(
                k, self._accepts_per_arm_features)
            a_inv_x = linalg.conjugate_gradient(
                self._cov_matrix[model_index] +
                tf.eye(self._encoding_dim, dtype=self._dtype),
                tf.linalg.matrix_transpose(encoded_observation_for_arm))
            mean_reward_est = tf.einsum('j,jk->k',
                                        self._data_vector[model_index],
                                        a_inv_x)
            est_rewards.append(mean_reward_est)

            ci = tf.reshape(
                tf.linalg.tensor_diag_part(
                    tf.matmul(encoded_observation_for_arm, a_inv_x)), [-1, 1])
            p_values.append(
                tf.reshape(mean_reward_est, [-1, 1]) +
                self._alpha * tf.sqrt(ci))

        stacked_p_values = tf.squeeze(tf.stack(p_values, axis=-1), axis=[1])
        if mask is None:
            chosen_actions = tf.argmax(stacked_p_values,
                                       axis=-1,
                                       output_type=tf.int32)
        else:
            chosen_actions = policy_utilities.masked_argmax(
                stacked_p_values, mask, output_type=tf.int32)

        est_mean_reward = tf.cast(tf.stack(est_rewards, axis=-1), tf.float32)
        return chosen_actions, est_mean_reward, tf.cast(
            stacked_p_values, tf.float32)
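
    # Worked equation for the scores above (annotation only, not library
    # code): with A_k = S_k + I and encoded observation x, the LinUCB score is
    #     p_k(x) = d_k^T A_k^{-1} x + alpha * sqrt(x^T A_k^{-1} x),
    # so the single conjugate-gradient solve a_inv_x = A_k^{-1} x is reused by
    # both the reward estimate and the confidence interval.
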
    def _distribution(self, time_step, policy_state):
        observation = time_step.observation
        if self.observation_and_action_constraint_splitter is not None:
            observation, _ = self.observation_and_action_constraint_splitter(
                observation)
        observation = tf.nest.map_structure(
            lambda o: tf.cast(o, dtype=self._dtype), observation)
        global_observation, arm_observations = self._split_observation(
            observation)

        if self._add_bias:
            # The bias is added via a constant 1 feature.
            global_observation = tf.concat(
                [
                    global_observation,
                    tf.ones([tf.shape(global_observation)[0], 1],
                            dtype=self._dtype)
                ],
                axis=1)
        # Check the shape of the observation matrix. The observations can be
        # batched.
        if not global_observation.shape.is_compatible_with(
            [None, self._global_context_dim]):
            raise ValueError(
                'Global observation shape is expected to be {}. Got {}.'.format(
                    [None, self._global_context_dim],
                    global_observation.shape.as_list()))
        global_observation = tf.reshape(global_observation,
                                        [-1, self._global_context_dim])

        est_rewards = []
        confidence_intervals = []
        for k in range(self._num_actions):
            current_observation = self._get_current_observation(
                global_observation, arm_observations, k)
            model_index = policy_utilities.get_model_index(
                k, self._accepts_per_arm_features)
            if self._use_eigendecomp:
                q_t_b = tf.matmul(
                    self._eig_matrix[model_index],
                    tf.linalg.matrix_transpose(current_observation),
                    transpose_a=True)
                lambda_inv = tf.divide(
                    tf.ones_like(self._eig_vals[model_index]),
                    self._eig_vals[model_index] + self._tikhonov_weight)
                a_inv_x = tf.matmul(self._eig_matrix[model_index],
                                    tf.einsum('j,jk->jk', lambda_inv, q_t_b))
            else:
                a_inv_x = linalg.conjugate_gradient(
                    self._cov_matrix[model_index] + self._tikhonov_weight *
                    tf.eye(self._overall_context_dim, dtype=self._dtype),
                    tf.linalg.matrix_transpose(current_observation))
            est_mean_reward = tf.einsum('j,jk->k',
                                        self._data_vector[model_index],
                                        a_inv_x)
            est_rewards.append(est_mean_reward)

            ci = tf.reshape(
                tf.linalg.tensor_diag_part(
                    tf.matmul(current_observation, a_inv_x)), [-1, 1])
            confidence_intervals.append(ci)

        if self._exploration_strategy == ExplorationStrategy.optimistic:
            optimistic_estimates = [
                tf.reshape(mean_reward, [-1, 1]) +
                self._alpha * tf.sqrt(confidence)
                for mean_reward, confidence in zip(est_rewards,
                                                   confidence_intervals)
            ]
            # Keeping the batch dimension during the squeeze, even if batch_size == 1.
            rewards_for_argmax = tf.squeeze(tf.stack(optimistic_estimates,
                                                     axis=-1),
                                            axis=[1])
        elif self._exploration_strategy == ExplorationStrategy.sampling:
            mu_sampler = tfd.Normal(
                loc=tf.stack(est_rewards, axis=-1),
                scale=self._alpha * tf.sqrt(
                    tf.squeeze(tf.stack(confidence_intervals, axis=-1),
                               axis=1)))
            rewards_for_argmax = mu_sampler.sample()
        else:
            raise ValueError('Exploration strategy %s not implemented.' %
                             self._exploration_strategy)

        mask = constraints.construct_mask_from_multiple_sources(
            time_step.observation,
            self._observation_and_action_constraint_splitter, (),
            self._num_actions)
        if mask is not None:
            chosen_actions = policy_utilities.masked_argmax(
                rewards_for_argmax,
                mask,
                output_type=tf.nest.flatten(self._action_spec)[0].dtype)
        else:
            chosen_actions = tf.argmax(rewards_for_argmax,
                                       axis=-1,
                                       output_type=tf.nest.flatten(
                                           self._action_spec)[0].dtype)

        action_distributions = tfp.distributions.Deterministic(
            loc=chosen_actions)

        policy_info = policy_utilities.populate_policy_info(
            arm_observations, chosen_actions, rewards_for_argmax,
            tf.stack(est_rewards, axis=-1), self._emit_policy_info,
            self._accepts_per_arm_features)

        return policy_step.PolicyStep(action_distributions, policy_state,
                                      policy_info)
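
The `_use_eigendecomp` branch above replaces the conjugate-gradient solve with A⁻¹ = Q (Λ + λ)⁻¹ Qᵀ, which amortizes well when the same covariance serves many observations. A hedged numpy sketch (all variables hypothetical) showing the two paths agree:

import numpy as np

dim, lam = 4, 0.5
cov = np.random.rand(dim, dim)
cov = cov @ cov.T  # symmetric PSD, so eigh applies
x = np.random.rand(dim)

eig_vals, eig_matrix = np.linalg.eigh(cov)
# Mirrors the q_t_b / lambda_inv path: Q @ ((Q^T x) / (Lambda + lam)).
a_inv_x_eig = eig_matrix @ ((eig_matrix.T @ x) / (eig_vals + lam))
# A direct solve stands in for the conjugate-gradient branch.
a_inv_x_direct = np.linalg.solve(cov + lam * np.eye(dim), x)
np.testing.assert_allclose(a_inv_x_eig, a_inv_x_direct, rtol=1e-6)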