def testActionBatchWithVariablesAndPolicyUpdate(self, batch_size):
        a_list = []
        a_new_list = []
        b_list = []
        b_new_list = []
        num_samples_list = []
        num_samples_new_list = []
        for k in range(1, self._num_actions + 1):
            a_initial_value = tf.constant(
                [[2 * k + 1, k + 1], [k + 1, 2 * k + 1]], dtype=tf.float32)
            a_for_one_arm = tf.compat.v2.Variable(a_initial_value)
            a_list.append(a_for_one_arm)
            b_initial_value = tf.constant([k, k], dtype=tf.float32)
            b_for_one_arm = tf.compat.v2.Variable(b_initial_value)
            b_list.append(b_for_one_arm)
            num_samples_initial_value = tf.constant([1], dtype=tf.float32)
            num_samples_for_one_arm = tf.compat.v2.Variable(
                num_samples_initial_value)
            num_samples_list.append(num_samples_for_one_arm)

            # Variables for the new policy (they differ by an offset).
            a_new_for_one_arm = tf.compat.v2.Variable(a_initial_value +
                                                      _POLICY_VARIABLES_OFFSET)
            a_new_list.append(a_new_for_one_arm)
            b_new_for_one_arm = tf.compat.v2.Variable(b_initial_value +
                                                      _POLICY_VARIABLES_OFFSET)
            b_new_list.append(b_new_for_one_arm)
            num_samples_for_one_arm_new = tf.compat.v2.Variable(
                num_samples_initial_value + _POLICY_VARIABLES_OFFSET)
            num_samples_new_list.append(num_samples_for_one_arm_new)

        self.evaluate(tf.compat.v1.global_variables_initializer())

        policy = lin_ucb_policy.LinearUCBPolicy(self._action_spec, a_list,
                                                b_list, num_samples_list,
                                                self._time_step_spec)
        self.assertLen(policy.variables(), 3 * self._num_actions)

        new_policy = lin_ucb_policy.LinearUCBPolicy(self._action_spec,
                                                    a_new_list, b_new_list,
                                                    num_samples_new_list,
                                                    self._time_step_spec)
        self.assertLen(new_policy.variables(), 3 * self._num_actions)

        self.evaluate(new_policy.update(policy))

        action_step = policy.action(
            self._time_step_batch(batch_size=batch_size))
        new_action_step = new_policy.action(
            self._time_step_batch(batch_size=batch_size))
        self.assertEqual(action_step.action.shape,
                         new_action_step.action.shape)
        self.assertEqual(action_step.action.dtype,
                         new_action_step.action.dtype)
        actions_, new_actions_ = self.evaluate(
            [action_step.action, new_action_step.action])
        self.assertAllEqual(actions_, new_actions_)
Example #2
    def testObservationShapeMismatch(self, batch_size):
        policy = lin_ucb_policy.LinearUCBPolicy(self._action_spec, self._a,
                                                self._b,
                                                self._num_samples_per_arm,
                                                self._time_step_spec)

        current_time_step = ts.TimeStep(
            tf.constant(ts.StepType.FIRST,
                        dtype=tf.int32,
                        shape=[batch_size],
                        name='step_type'),
            tf.constant(0.0,
                        dtype=tf.float32,
                        shape=[batch_size],
                        name='reward'),
            tf.constant(1.0,
                        dtype=tf.float32,
                        shape=[batch_size],
                        name='discount'),
            tf.constant(np.array(range(batch_size * (self._obs_dim + 1))),
                        dtype=tf.float32,
                        shape=[batch_size, self._obs_dim + 1],
                        name='observation'))
        with self.assertRaisesRegex(
                ValueError, r'Observation shape is expected to be \[None, 2\].'
                r' Got \[%d, 3\].' % batch_size):
            policy.action(current_time_step)
Example #3
    def testBuild(self):
        policy = lin_ucb_policy.LinearUCBPolicy(self._action_spec, self._a,
                                                self._b,
                                                self._num_samples_per_arm,
                                                self._time_step_spec)

        self.assertEqual(policy.time_step_spec, self._time_step_spec)
Example #4
    def testActionBatch(self, batch_size):
        policy = lin_ucb_policy.LinearUCBPolicy(self._action_spec, self._a,
                                                self._b,
                                                self._num_samples_per_arm,
                                                self._time_step_spec)

        action_step = policy.action(
            self._time_step_batch(batch_size=batch_size))
        self.assertEqual(action_step.action.shape.as_list(), [batch_size])
        self.assertEqual(action_step.action.dtype, tf.int32)
        actions_ = self.evaluate(action_step.action)
        self.assertAllGreaterEqual(actions_, self._action_spec.minimum)
        self.assertAllLessEqual(actions_, self._action_spec.maximum)
Example #5
    def testActionBatchWithMask(self, batch_size):
        def split_fn(obs):
            return obs[0], obs[1]

        policy = lin_ucb_policy.LinearUCBPolicy(
            self._action_spec,
            self._a,
            self._b,
            self._num_samples_per_arm,
            self._time_step_spec_with_mask,
            observation_and_action_constraint_splitter=split_fn)

        action_step = policy.action(
            self._time_step_batch_with_mask(batch_size=batch_size))
        self.assertEqual(action_step.action.shape.as_list(), [batch_size])
        self.assertEqual(action_step.action.dtype, tf.int32)
        actions_ = self.evaluate(action_step.action)
        self.assertAllEqual(actions_, range(batch_size))
Example #6
    def testPredictedRewards(self, batch_size):
        policy = lin_ucb_policy.LinearUCBPolicy(
            self._action_spec,
            self._a,
            self._b,
            self._num_samples_per_arm,
            self._time_step_spec,
            emit_policy_info=(
                policy_utilities.InfoFields.PREDICTED_REWARDS_MEAN, ))

        action_step = policy.action(
            self._time_step_batch(batch_size=batch_size))
        self.assertEqual(action_step.action.shape.as_list(), [batch_size])
        self.assertEqual(action_step.action.dtype, tf.int32)

        observation_numpy = np.array(range(batch_size * self._obs_dim),
                                     dtype=np.float32).reshape(
                                         [batch_size, self._obs_dim])

        p_values = []
        predicted_rewards_expected = []
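        # Recompute the expected mean rewards in NumPy: for each arm k,
        # theta_k = (A_k + I)^{-1} b_k, and the estimated mean reward for an
        # observation x is x . theta_k, mirroring the policy's computation.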
        for k in range(self._num_actions):
            a_inv = np.linalg.inv(self._a_numpy[k] + np.eye(self._obs_dim))
            theta = np.matmul(a_inv,
                              self._b_numpy[k].reshape([self._obs_dim, 1]))
            confidence_intervals = np.sqrt(
                np.diag(
                    np.matmul(
                        observation_numpy,
                        np.matmul(a_inv, np.transpose(observation_numpy)))))
            est_mean_reward = np.matmul(observation_numpy, theta)
            predicted_rewards_expected.append(est_mean_reward)
            p_value = (est_mean_reward +
                       self._alpha * confidence_intervals.reshape([-1, 1]))
            p_values.append(p_value)

        predicted_rewards_expected_array = np.stack(predicted_rewards_expected,
                                                    axis=-1).reshape(
                                                        batch_size,
                                                        self._num_actions)
        p_info = self.evaluate(action_step.info)
        self.assertAllClose(p_info.predicted_rewards_mean,
                            predicted_rewards_expected_array)
Example #7
    def testComparisonWithNumpy(self, batch_size, use_decomposition=False):
        eig_matrix_list = ()
        eig_vals_list = ()
        if use_decomposition:
            eig_vals_one_arm, eig_matrix_one_arm = tf.linalg.eigh(self._a[0])
            eig_vals_list = [eig_vals_one_arm] * self._num_actions
            eig_matrix_list = [eig_matrix_one_arm] * self._num_actions

        policy = lin_ucb_policy.LinearUCBPolicy(self._action_spec,
                                                self._a,
                                                self._b,
                                                self._num_samples_per_arm,
                                                self._time_step_spec,
                                                eig_vals=eig_vals_list,
                                                eig_matrix=eig_matrix_list)

        action_step = policy.action(
            self._time_step_batch(batch_size=batch_size))
        self.assertEqual(action_step.action.shape.as_list(), [batch_size])
        self.assertEqual(action_step.action.dtype, tf.int32)
        actions_ = self.evaluate(action_step.action)

        observation_numpy = np.array(range(batch_size * self._obs_dim),
                                     dtype=np.float32).reshape(
                                         [batch_size, self._obs_dim])

        p_values = []
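        # Recompute the LinUCB scores in NumPy: for each arm k the score is
        # x . theta_k + alpha * sqrt(x (A_k + I)^{-1} x^T), and the policy is
        # expected to pick the arm with the largest score.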
        for k in range(self._num_actions):
            a_inv = np.linalg.inv(self._a_numpy[k] + np.eye(self._obs_dim))
            theta = np.matmul(a_inv,
                              self._b_numpy[k].reshape([self._obs_dim, 1]))
            confidence_intervals = np.sqrt(
                np.diag(
                    np.matmul(
                        observation_numpy,
                        np.matmul(a_inv, np.transpose(observation_numpy)))))
            p_value = (np.matmul(observation_numpy, theta) +
                       self._alpha * confidence_intervals.reshape([-1, 1]))
            p_values.append(p_value)

        actions_numpy = np.argmax(np.stack(p_values, axis=-1),
                                  axis=-1).reshape([batch_size])
        self.assertAllEqual(actions_.reshape([batch_size]), actions_numpy)
Example #8
    def testActionBatchWithBias(self, batch_size):
        a = [tf.constant([[4, 1, 2], [1, 5, 3], [2, 3, 6]], dtype=tf.float32)
             ] * self._num_actions
        b = [
            tf.constant([r, r, r], dtype=tf.float32)
            for r in range(self._num_actions)
        ]
        policy = lin_ucb_policy.LinearUCBPolicy(self._action_spec,
                                                a,
                                                b,
                                                self._num_samples_per_arm,
                                                self._time_step_spec,
                                                add_bias=True)

        action_step = policy.action(
            self._time_step_batch(batch_size=batch_size))
        self.assertEqual(action_step.action.shape.as_list(), [batch_size])
        self.assertEqual(action_step.action.dtype, tf.int32)
        actions_ = self.evaluate(action_step.action)
        self.assertAllGreaterEqual(actions_, self._action_spec.minimum)
        self.assertAllLessEqual(actions_, self._action_spec.maximum)
Example #9
    def __init__(self,
                 time_step_spec,
                 action_spec,
                 alpha=1.0,
                 gamma=1.0,
                 use_eigendecomp=False,
                 tikhonov_weight=1.0,
                 emit_log_probability=False,
                 dtype=tf.float32,
                 name=None):
        """Initialize an instance of `LinearUCBAgent`.

    Args:
      time_step_spec: A `TimeStep` spec describing the expected `TimeStep`s.
      action_spec: A scalar `BoundedTensorSpec` with `int32` or `int64` dtype
        describing the number of actions for this agent.
      alpha: (float) positive scalar. This is the exploration parameter that
        multiplies the confidence intervals.
      gamma: a float forgetting factor in [0.0, 1.0]. When set to
        1.0, the algorithm does not forget.
      use_eigendecomp: whether to use eigen-decomposition or not. The default
        solver is Conjugate Gradient.
      tikhonov_weight: (float) Tikhonov regularization term.
      emit_log_probability: Whether the LinearUCBPolicy emits log-probabilities
        or not. Since the policy is deterministic, the probability is just 1.
      dtype: The type of the parameters stored and updated by the agent. Should
        be one of `tf.float32` and `tf.float64`. Defaults to `tf.float32`.
      name: a name for this instance of `LinearUCBAgent`.

    Raises:
      ValueError: If `dtype` is not one of `tf.float32` or `tf.float64`.
    """
        tf.Module.__init__(self, name=name)
        self._num_actions = bandit_utils.get_num_actions_from_tensor_spec(
            action_spec)
        self._context_dim = int(time_step_spec.observation.shape[0])
        self._alpha = alpha
        self._cov_matrix_list = []
        self._data_vector_list = []
        self._eig_matrix_list = []
        self._eig_vals_list = []
        # We keep track of the number of samples per arm.
        self._num_samples_list = []
        self._gamma = gamma
        if self._gamma < 0.0 or self._gamma > 1.0:
            raise ValueError(
                'Forgetting factor `gamma` must be in [0.0, 1.0].')
        self._dtype = dtype
        if dtype not in (tf.float32, tf.float64):
            raise ValueError(
                'Agent dtype should be either `tf.float32` or `tf.float64`.')
        self._use_eigendecomp = use_eigendecomp
        self._tikhonov_weight = tikhonov_weight

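        # Per-arm sufficient statistics: the covariance matrix `a_k` starts
        # as the identity, the data vector `b_k` as zeros, and the sample
        # count at zero; the eigendecomposition variables are only populated
        # when `use_eigendecomp` is True.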
        for k in range(self._num_actions):
            self._cov_matrix_list.append(
                tf.compat.v2.Variable(tf.eye(self._context_dim, dtype=dtype),
                                      name='a_' + str(k)))
            self._data_vector_list.append(
                tf.compat.v2.Variable(tf.zeros(self._context_dim, dtype=dtype),
                                      name='b_' + str(k)))
            self._num_samples_list.append(
                tf.compat.v2.Variable(tf.zeros([], dtype=dtype),
                                      name='num_samples_' + str(k)))
            if self._use_eigendecomp:
                self._eig_matrix_list.append(
                    tf.compat.v2.Variable(tf.eye(self._context_dim,
                                                 dtype=dtype),
                                          name='eig_matrix' + str(k)))
                self._eig_vals_list.append(
                    tf.compat.v2.Variable(tf.ones([self._context_dim],
                                                  dtype=dtype),
                                          name='eig_vals' + str(k)))
            else:
                self._eig_matrix_list.append(
                    tf.compat.v2.Variable(tf.constant([], dtype=dtype),
                                          name='eig_matrix' + str(k)))
                self._eig_vals_list.append(
                    tf.compat.v2.Variable(tf.constant([], dtype=dtype),
                                          name='eig_vals' + str(k)))

        policy = lin_ucb_policy.LinearUCBPolicy(
            action_spec=action_spec,
            cov_matrix=self._cov_matrix_list,
            data_vector=self._data_vector_list,
            num_samples=self._num_samples_list,
            time_step_spec=time_step_spec,
            alpha=alpha,
            eig_vals=self._eig_vals_list if self._use_eigendecomp else (),
            eig_matrix=self._eig_matrix_list if self._use_eigendecomp else (),
            tikhonov_weight=self._tikhonov_weight,
            emit_log_probability=emit_log_probability)
        super(LinearUCBAgent, self).__init__(time_step_spec=time_step_spec,
                                             action_spec=action_spec,
                                             policy=policy,
                                             collect_policy=policy,
                                             train_sequence_length=None)
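
# A minimal construction sketch for the `LinearUCBAgent` defined above. The
# spec helpers below (`tensor_spec`, `ts.time_step_spec`) are assumed to come
# from the standard tf_agents modules and are not part of this snippet; the
# observation and action sizes are illustrative only.
observation_spec = tensor_spec.TensorSpec([2], tf.float32)
time_step_spec = ts.time_step_spec(observation_spec)
action_spec = tensor_spec.BoundedTensorSpec(
    shape=(), dtype=tf.int32, minimum=0, maximum=2)
agent = LinearUCBAgent(
    time_step_spec=time_step_spec,
    action_spec=action_spec,
    alpha=1.0,
    gamma=1.0,
    use_eigendecomp=False,
    dtype=tf.float32)
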
Example #10
    def __init__(self,
                 time_step_spec,
                 action_spec,
                 alpha=1.0,
                 gamma=1.0,
                 use_eigendecomp=False,
                 tikhonov_weight=1.0,
                 add_bias=False,
                 emit_policy_info=(),
                 emit_log_probability=False,
                 observation_and_action_constraint_splitter=None,
                 debug_summaries=False,
                 summarize_grads_and_vars=False,
                 enable_summaries=True,
                 dtype=tf.float32,
                 name=None):
        """Initialize an instance of `LinearUCBAgent`.

    Args:
      time_step_spec: A `TimeStep` spec describing the expected `TimeStep`s.
      action_spec: A scalar `BoundedTensorSpec` with `int32` or `int64` dtype
        describing the number of actions for this agent.
      alpha: (float) positive scalar. This is the exploration parameter that
        multiplies the confidence intervals.
      gamma: a float forgetting factor in [0.0, 1.0]. When set to
        1.0, the algorithm does not forget.
      use_eigendecomp: whether to use eigen-decomposition or not. The default
        solver is Conjugate Gradient.
      tikhonov_weight: (float) Tikhonov regularization term.
      add_bias: If true, a bias term will be added to the linear reward
        estimation.
      emit_policy_info: (tuple of strings) what side information we want to get
        as part of the policy info. Allowed values can be found in
        `policy_utilities.PolicyInfo`.
      emit_log_probability: Whether the LinearUCBPolicy emits log-probabilities
        or not. Since the policy is deterministic, the probability is just 1.
      observation_and_action_constraint_splitter: A function used for masking
        valid/invalid actions with each state of the environment. The function
        takes in a full observation and returns a tuple consisting of 1) the
        part of the observation intended as input to the bandit agent and
        policy, and 2) the boolean mask. This function should also work with a
        `TensorSpec` as input, and should output `TensorSpec` objects for the
        observation and mask.
      debug_summaries: A Python bool, default False. When True, debug summaries
        are gathered.
      summarize_grads_and_vars: A Python bool, default False. When True,
        gradients and network variable summaries are written during training.
      enable_summaries: A Python bool, default True. When False, all summaries
        (debug or otherwise) should not be written.
      dtype: The type of the parameters stored and updated by the agent. Should
        be one of `tf.float32` and `tf.float64`. Defaults to `tf.float32`.
      name: a name for this instance of `LinearUCBAgent`.

    Raises:
      ValueError: If `dtype` is not one of `tf.float32` or `tf.float64`.
    """
        tf.Module.__init__(self, name=name)
        self._num_actions = bandit_utils.get_num_actions_from_tensor_spec(
            action_spec)
        if observation_and_action_constraint_splitter is not None:
            context_shape = observation_and_action_constraint_splitter(
                time_step_spec.observation)[0].shape.as_list()
        else:
            context_shape = time_step_spec.observation.shape.as_list()
        self._add_bias = add_bias
        self._context_dim = (tf.compat.dimension_value(context_shape[0])
                             if context_shape else 1)
        if self._add_bias:
            # The bias is added via a constant 1 feature.
            self._context_dim += 1
        self._alpha = alpha
        self._cov_matrix_list = []
        self._data_vector_list = []
        self._eig_matrix_list = []
        self._eig_vals_list = []
        # We keep track of the number of samples per arm.
        self._num_samples_list = []
        self._gamma = gamma
        if self._gamma < 0.0 or self._gamma > 1.0:
            raise ValueError(
                'Forgetting factor `gamma` must be in [0.0, 1.0].')
        self._dtype = dtype
        if dtype not in (tf.float32, tf.float64):
            raise ValueError(
                'Agent dtype should be either `tf.float32` or `tf.float64`.')
        self._use_eigendecomp = use_eigendecomp
        self._tikhonov_weight = tikhonov_weight
        self._observation_and_action_constraint_splitter = (
            observation_and_action_constraint_splitter)

        for k in range(self._num_actions):
            self._cov_matrix_list.append(
                tf.compat.v2.Variable(tf.eye(self._context_dim, dtype=dtype),
                                      name='a_' + str(k)))
            self._data_vector_list.append(
                tf.compat.v2.Variable(tf.zeros(self._context_dim, dtype=dtype),
                                      name='b_' + str(k)))
            self._num_samples_list.append(
                tf.compat.v2.Variable(tf.zeros([], dtype=dtype),
                                      name='num_samples_' + str(k)))
            if self._use_eigendecomp:
                self._eig_matrix_list.append(
                    tf.compat.v2.Variable(tf.eye(self._context_dim,
                                                 dtype=dtype),
                                          name='eig_matrix' + str(k)))
                self._eig_vals_list.append(
                    tf.compat.v2.Variable(tf.ones([self._context_dim],
                                                  dtype=dtype),
                                          name='eig_vals' + str(k)))
            else:
                self._eig_matrix_list.append(
                    tf.compat.v2.Variable(tf.constant([], dtype=dtype),
                                          name='eig_matrix' + str(k)))
                self._eig_vals_list.append(
                    tf.compat.v2.Variable(tf.constant([], dtype=dtype),
                                          name='eig_vals' + str(k)))

        policy = lin_ucb_policy.LinearUCBPolicy(
            action_spec=action_spec,
            cov_matrix=self._cov_matrix_list,
            data_vector=self._data_vector_list,
            num_samples=self._num_samples_list,
            time_step_spec=time_step_spec,
            alpha=alpha,
            eig_vals=self._eig_vals_list if self._use_eigendecomp else (),
            eig_matrix=self._eig_matrix_list if self._use_eigendecomp else (),
            tikhonov_weight=self._tikhonov_weight,
            add_bias=add_bias,
            emit_policy_info=emit_policy_info,
            emit_log_probability=emit_log_probability,
            observation_and_action_constraint_splitter=
            observation_and_action_constraint_splitter)
        super(LinearUCBAgent,
              self).__init__(time_step_spec=time_step_spec,
                             action_spec=action_spec,
                             policy=policy,
                             collect_policy=policy,
                             debug_summaries=debug_summaries,
                             summarize_grads_and_vars=summarize_grads_and_vars,
                             enable_summaries=enable_summaries,
                             train_sequence_length=None)
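
# A usage sketch for the extended constructor above, with a bias feature,
# policy-info emission, and action masking. The split function, spec helpers,
# and sizes are illustrative assumptions, not part of this snippet; the
# observation is assumed to be a (context, mask) tuple as in the masked tests
# earlier in this file.
def split_observation_and_mask(obs):
    return obs[0], obs[1]

observation_spec = (tensor_spec.TensorSpec([2], tf.float32),   # context
                    tensor_spec.TensorSpec([3], tf.int32))     # per-action mask
time_step_spec = ts.time_step_spec(observation_spec)
action_spec = tensor_spec.BoundedTensorSpec(
    shape=(), dtype=tf.int32, minimum=0, maximum=2)
agent = LinearUCBAgent(
    time_step_spec=time_step_spec,
    action_spec=action_spec,
    alpha=1.0,
    add_bias=True,
    emit_policy_info=(policy_utilities.InfoFields.PREDICTED_REWARDS_MEAN,),
    observation_and_action_constraint_splitter=split_observation_and_mask)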