Example 1
    def testActionBatchWithMask(self, batch_size, actions_from_reward_layer):
        obs_spec = (tensor_spec.TensorSpec([self._obs_dim], tf.float32),
                    tensor_spec.TensorSpec([self._num_actions], tf.int32))
        time_step_spec = ts.time_step_spec(obs_spec)
        policy = neural_linucb_policy.NeuralLinUCBPolicy(
            DummyNet(obs_spec[0]),
            self._encoding_dim,
            get_reward_layer(),
            actions_from_reward_layer=tf.constant(actions_from_reward_layer,
                                                  dtype=tf.bool),
            cov_matrix=self._a,
            data_vector=self._b,
            num_samples=self._num_samples_per_arm,
            epsilon_greedy=0.5,
            time_step_spec=time_step_spec,
            observation_and_action_constraint_splitter=lambda x: (x[0], x[1]))

        action_fn = common.function_in_tf1()(policy.action)
        action_step = action_fn(
            self._time_step_batch_with_mask(batch_size=batch_size))
        self.assertEqual(action_step.action.shape.as_list(), [batch_size])
        self.assertEqual(action_step.action.dtype, tf.int32)
        self.evaluate(tf.compat.v1.global_variables_initializer())
        actions = self.evaluate(action_step.action)
        self.assertAllEqual(actions, range(batch_size))
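
The `_time_step_batch_with_mask` helper used above is not part of this excerpt. Purely as an illustrative assumption (not the test's actual helper), a batched time step whose observation is an `(observation, mask)` tuple, matching the splitter passed to the policy, could be built like this:

import tensorflow as tf
from tf_agents.trajectories import time_step as ts

def example_time_step_batch_with_mask(batch_size, obs_dim, num_actions):
    """Hypothetical helper: batched time step with an (observation, mask) tuple."""
    observation = tf.ones([batch_size, obs_dim], dtype=tf.float32)
    # One-hot mask that allows only action i for batch element i (assumes
    # batch_size <= num_actions), consistent with the assertion above.
    mask = tf.eye(num_rows=batch_size, num_columns=num_actions, dtype=tf.int32)
    return ts.restart((observation, mask), batch_size=batch_size)
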
Example 2
    def testObservationShapeMismatch(self, batch_size,
                                     actions_from_reward_layer):
        policy = neural_linucb_policy.NeuralLinUCBPolicy(
            DummyNet(),
            self._encoding_dim,
            get_reward_layer(),
            actions_from_reward_layer=actions_from_reward_layer,
            cov_matrix=self._a,
            data_vector=self._b,
            num_samples=self._num_samples_per_arm,
            epsilon_greedy=0.0,
            time_step_spec=self._time_step_spec)

        current_time_step = ts.TimeStep(
            tf.constant(ts.StepType.FIRST,
                        dtype=tf.int32,
                        shape=[batch_size],
                        name='step_type'),
            tf.constant(0.0,
                        dtype=tf.float32,
                        shape=[batch_size],
                        name='reward'),
            tf.constant(1.0,
                        dtype=tf.float32,
                        shape=[batch_size],
                        name='discount'),
            tf.constant(np.array(range(batch_size * (self._obs_dim + 1))),
                        dtype=tf.float32,
                        shape=[batch_size, self._obs_dim + 1],
                        name='observation'))
        with self.assertRaisesRegex(
                ValueError, r'Observation shape is expected to be \[None, 2\].'
                r' Got \[%d, 3\].' % batch_size):
            policy.action(current_time_step)
Example 3
  def testObservationShapeMismatch(self, batch_size, actions_from_reward_layer):
    policy = neural_linucb_policy.NeuralLinUCBPolicy(
        DummyNet(self._obs_spec),
        self._encoding_dim,
        get_reward_layer(),
        actions_from_reward_layer=actions_from_reward_layer,
        cov_matrix=self._a,
        data_vector=self._b,
        num_samples=self._num_samples_per_arm,
        epsilon_greedy=0.0,
        time_step_spec=self._time_step_spec)

    current_time_step = ts.TimeStep(
        tf.constant(
            ts.StepType.FIRST, dtype=tf.int32, shape=[batch_size],
            name='step_type'),
        tf.constant(0.0, dtype=tf.float32, shape=[batch_size], name='reward'),
        tf.constant(1.0, dtype=tf.float32, shape=[batch_size], name='discount'),
        tf.constant(np.array(range(batch_size * (self._obs_dim + 1))),
                    dtype=tf.float32, shape=[batch_size, self._obs_dim + 1],
                    name='observation'))
    if tf.executing_eagerly():
      error_type = tf.errors.InvalidArgumentError
      regexp = r'Matrix size-incompatible: In\[0\]: \[%d,3\]' % batch_size
    else:
      error_type = ValueError
      regexp = r'with shape \[%d, 3\]' % batch_size
    with self.assertRaisesRegex(error_type, regexp):
      policy.action(current_time_step)
Example 4
    def testPerArmObservation(self, batch_size, actions_from_reward_layer):
        global_obs_dim = 7
        arm_obs_dim = 3
        obs_spec = bandit_spec_utils.create_per_arm_observation_spec(
            global_obs_dim, arm_obs_dim, self._num_actions)
        time_step_spec = ts.time_step_spec(obs_spec)
        dummy_net = arm_network.create_feed_forward_common_tower_network(
            obs_spec,
            global_layers=(3, 4, 5),
            arm_layers=(3, 2),
            common_layers=(4, 3),
            output_dim=self._encoding_dim)
        reward_layer = get_per_arm_reward_layer(
            encoding_dim=self._encoding_dim)

        policy = neural_linucb_policy.NeuralLinUCBPolicy(
            dummy_net,
            self._encoding_dim,
            reward_layer,
            actions_from_reward_layer=tf.constant(actions_from_reward_layer,
                                                  dtype=tf.bool),
            cov_matrix=self._a[0:1],
            data_vector=self._b[0:1],
            num_samples=self._num_samples_per_arm[0:1],
            epsilon_greedy=0.0,
            time_step_spec=time_step_spec,
            accepts_per_arm_features=True,
            emit_policy_info=('predicted_rewards_mean', ))

        current_time_step = self._per_arm_time_step_batch(
            batch_size=batch_size,
            global_obs_dim=global_obs_dim,
            arm_obs_dim=arm_obs_dim)
        action_step = policy.action(current_time_step)
        self.assertEqual(action_step.action.dtype, tf.int32)
        self.evaluate(tf.compat.v1.global_variables_initializer())
        action_fn = common.function_in_tf1()(policy.action)
        action_step = action_fn(current_time_step)

        input_observation = current_time_step.observation
        encoded_observation, _ = dummy_net(input_observation)

        if actions_from_reward_layer:
            predicted_rewards_from_reward_layer = reward_layer(
                encoded_observation)
            predicted_rewards_expected = self.evaluate(
                predicted_rewards_from_reward_layer).reshape(
                    (-1, self._num_actions))
        else:
            observation_numpy = self.evaluate(encoded_observation)
            predicted_rewards_expected = (
                self._get_predicted_rewards_from_per_arm_linucb(
                    observation_numpy, batch_size))

        p_info = self.evaluate(action_step.info)
        self.assertEqual(p_info.predicted_rewards_mean.dtype, np.float32)
        self.assertAllClose(p_info.predicted_rewards_mean,
                            predicted_rewards_expected)
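
The `_per_arm_time_step_batch` helper is also not shown in this excerpt. As a sketch only, assuming the usual `GLOBAL_FEATURE_KEY`/`PER_ARM_FEATURE_KEY` constants from `bandit_spec_utils`, a per-arm observation batch matching the spec above is a nested dict with a global feature tensor and a per-arm feature tensor:

import tensorflow as tf
from tf_agents.bandits.specs import utils as bandit_spec_utils
from tf_agents.trajectories import time_step as ts

def example_per_arm_time_step_batch(batch_size, global_obs_dim, arm_obs_dim,
                                    num_actions):
    """Hypothetical helper: batched per-arm time step (values are placeholders)."""
    observations = {
        bandit_spec_utils.GLOBAL_FEATURE_KEY:
            tf.zeros([batch_size, global_obs_dim], dtype=tf.float32),
        bandit_spec_utils.PER_ARM_FEATURE_KEY:
            tf.zeros([batch_size, num_actions, arm_obs_dim], dtype=tf.float32),
    }
    return ts.restart(observations, batch_size=batch_size)
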
Example 5
    def testBuild(self, batch_size, actions_from_reward_layer):
        policy = neural_linucb_policy.NeuralLinUCBPolicy(
            DummyNet(),
            self._encoding_dim,
            get_reward_layer(),
            actions_from_reward_layer=actions_from_reward_layer,
            cov_matrix=self._a,
            data_vector=self._b,
            num_samples=self._num_samples_per_arm,
            epsilon_greedy=0.0,
            time_step_spec=self._time_step_spec)

        self.assertEqual(policy.time_step_spec, self._time_step_spec)
Example 6
    def testPredictedRewards(self, batch_size, actions_from_reward_layer):
        dummy_net = DummyNet(self._obs_spec)
        reward_layer = get_reward_layer()

        policy = neural_linucb_policy.NeuralLinUCBPolicy(
            dummy_net,
            self._encoding_dim,
            reward_layer,
            actions_from_reward_layer=tf.constant(actions_from_reward_layer,
                                                  dtype=tf.bool),
            cov_matrix=self._a,
            data_vector=self._b,
            num_samples=self._num_samples_per_arm,
            epsilon_greedy=0.0,
            time_step_spec=self._time_step_spec,
            emit_policy_info=('predicted_rewards_mean', ))

        current_time_step = self._time_step_batch(batch_size=batch_size)
        action_step = policy.action(current_time_step)
        self.assertEqual(action_step.action.dtype, tf.int32)
        self.evaluate(tf.compat.v1.global_variables_initializer())
        action_fn = common.function_in_tf1()(policy.action)
        action_step = action_fn(current_time_step)

        input_observation = current_time_step.observation
        encoded_observation, _ = dummy_net(input_observation)
        predicted_rewards_from_reward_layer = reward_layer(encoded_observation)

        if actions_from_reward_layer:
            predicted_rewards_expected = self.evaluate(
                predicted_rewards_from_reward_layer)
        else:
            observation_numpy = self.evaluate(encoded_observation)
            predicted_rewards_expected = self._get_predicted_rewards_from_linucb(
                observation_numpy, batch_size)

        p_info = self.evaluate(action_step.info)
        self.assertEqual(p_info.predicted_rewards_mean.dtype, np.float32)
        self.assertAllClose(p_info.predicted_rewards_mean,
                            predicted_rewards_expected)
Example 7
  def testActionBatch(self, batch_size, actions_from_reward_layer):

    policy = neural_linucb_policy.NeuralLinUCBPolicy(
        DummyNet(self._obs_spec),
        self._encoding_dim,
        get_reward_layer(),
        actions_from_reward_layer=tf.constant(
            actions_from_reward_layer, dtype=tf.bool),
        cov_matrix=self._a,
        data_vector=self._b,
        num_samples=self._num_samples_per_arm,
        epsilon_greedy=0.0,
        time_step_spec=self._time_step_spec)

    action_step = policy.action(self._time_step_batch(batch_size=batch_size))
    self.assertEqual(action_step.action.dtype, tf.int32)
    self.evaluate(tf.compat.v1.global_variables_initializer())
    action_fn = common.function_in_tf1()(policy.action)
    action_step = action_fn(self._time_step_batch(batch_size=batch_size))
    actions_ = self.evaluate(action_step.action)
    self.assertAllGreaterEqual(actions_, self._action_spec.minimum)
    self.assertAllLessEqual(actions_, self._action_spec.maximum)
Example 8
  def __init__(
      self,
      time_step_spec,
      action_spec,
      encoding_network,
      encoding_network_num_train_steps,
      encoding_dim,
      optimizer,
      variable_collection=None,
      alpha=1.0,
      gamma=1.0,
      epsilon_greedy=0.0,
      observation_and_action_constraint_splitter=None,
      # Params for training.
      error_loss_fn=tf.compat.v1.losses.mean_squared_error,
      gradient_clipping=None,
      # Params for debugging.
      debug_summaries=False,
      summarize_grads_and_vars=False,
      train_step_counter=None,
      emit_policy_info=(),
      emit_log_probability=False,
      dtype=tf.float64,
      name=None):
    """Initialize an instance of `NeuralLinUCBAgent`.

    Args:
      time_step_spec: A `TimeStep` spec describing the expected `TimeStep`s.
      action_spec: A scalar `BoundedTensorSpec` with `int32` or `int64` dtype
        describing the number of actions for this agent.
      encoding_network: a Keras network that encodes the observations.
      encoding_network_num_train_steps: how many training steps to run for
        training the encoding network before switching to LinUCB. If negative,
        the encoding network is assumed to be already trained.
      encoding_dim: the dimension of encoded observations.
      optimizer: The optimizer to use for training.
      variable_collection: Instance of `NeuralLinUCBVariableCollection`.
        Collection of variables to be updated by the agent. If `None`, a new
        instance of `NeuralLinUCBVariableCollection` will be created. Note
        that this collection excludes the variables owned by the encoding
        network.
      alpha: (float) positive scalar. This is the exploration parameter that
        multiplies the confidence intervals.
      gamma: a float forgetting factor in [0.0, 1.0]. When set to
        1.0, the algorithm does not forget.
      epsilon_greedy: A float representing the probability of choosing a random
        action instead of the greedy action.
      observation_and_action_constraint_splitter: A function used for masking
        valid/invalid actions with each state of the environment. The function
        takes in a full observation and returns a tuple consisting of 1) the
        part of the observation intended as input to the bandit agent and
        policy, and 2) the boolean mask. This function should also work with a
        `TensorSpec` as input, and should output `TensorSpec` objects for the
        observation and mask.
      error_loss_fn: A function for computing the error loss, taking parameters
        labels, predictions, and weights (any function from tf.losses would
        work). The default is `tf.losses.mean_squared_error`.
      gradient_clipping: A float representing the norm length to clip gradients
        (or None for no clipping.)
      debug_summaries: A Python bool, default False. When True, debug summaries
        are gathered.
      summarize_grads_and_vars: A Python bool, default False. When True,
        gradients and network variable summaries are written during training.
      train_step_counter: An optional `tf.Variable` to increment every time the
        train op is run.  Defaults to the `global_step`.
      emit_policy_info: (tuple of strings) what side information we want to get
        as part of the policy info. Allowed values can be found in
        `policy_utilities.PolicyInfo`.
      emit_log_probability: Whether the NeuralLinUCBPolicy emits
        log-probabilities or not. Since the policy is deterministic, the
        probability is just 1.
      dtype: The type of the parameters stored and updated by the agent. Should
        be one of `tf.float32` and `tf.float64`. Defaults to `tf.float64`.
      name: a name for this instance of `NeuralLinUCBAgent`.

    Raises:
      TypeError if variable_collection is not an instance of
        `NeuralLinUCBVariableCollection`.
      ValueError if dtype is not one of `tf.float32` or `tf.float64`.
    """
    tf.Module.__init__(self, name=name)
    common.tf_agents_gauge.get_cell('TFABandit').set(True)
    self._num_actions = bandit_utils.get_num_actions_from_tensor_spec(
        action_spec)
    self._observation_and_action_constraint_splitter = (
        observation_and_action_constraint_splitter)
    if observation_and_action_constraint_splitter is not None:
      context_shape = observation_and_action_constraint_splitter(
          time_step_spec.observation)[0].shape.as_list()
    else:
      context_shape = time_step_spec.observation.shape.as_list()
    self._context_dim = (
        tf.compat.dimension_value(context_shape[0]) if context_shape else 1)
    self._alpha = alpha
    if variable_collection is None:
      variable_collection = NeuralLinUCBVariableCollection(
          self._num_actions, encoding_dim, dtype)
    elif not isinstance(variable_collection, NeuralLinUCBVariableCollection):
      raise TypeError('Parameter `variable_collection` should be '
                      'of type `NeuralLinUCBVariableCollection`.')
    self._variable_collection = variable_collection
    self._gamma = gamma
    if self._gamma < 0.0 or self._gamma > 1.0:
      raise ValueError('Forgetting factor `gamma` must be in [0.0, 1.0].')
    self._dtype = dtype
    if dtype not in (tf.float32, tf.float64):
      raise ValueError(
          'Agent dtype should be either `tf.float32` or `tf.float64`.')
    self._epsilon_greedy = epsilon_greedy

    reward_layer = tf.keras.layers.Dense(
        self._num_actions,
        kernel_initializer=tf.compat.v1.initializers.random_uniform(
            minval=-0.03, maxval=0.03),
        use_bias=False,
        activation=None,
        name='reward_layer')

    self._encoding_network = encoding_network
    self._reward_layer = reward_layer
    self._encoding_network_num_train_steps = encoding_network_num_train_steps
    self._encoding_dim = encoding_dim
    self._optimizer = optimizer
    self._error_loss_fn = error_loss_fn
    self._gradient_clipping = gradient_clipping
    train_step_counter = tf.compat.v1.train.get_or_create_global_step()

    policy = neural_linucb_policy.NeuralLinUCBPolicy(
        encoding_network=self._encoding_network,
        encoding_dim=self._encoding_dim,
        reward_layer=self._reward_layer,
        epsilon_greedy=self._epsilon_greedy,
        actions_from_reward_layer=self.actions_from_reward_layer,
        cov_matrix=self.cov_matrix,
        data_vector=self.data_vector,
        num_samples=self.num_samples,
        time_step_spec=time_step_spec,
        alpha=alpha,
        emit_policy_info=emit_policy_info,
        emit_log_probability=emit_log_probability,
        observation_and_action_constraint_splitter=(
            observation_and_action_constraint_splitter))

    super(NeuralLinUCBAgent, self).__init__(
        time_step_spec=time_step_spec,
        action_spec=policy.action_spec,
        policy=policy,
        collect_policy=policy,
        train_sequence_length=None,
        debug_summaries=debug_summaries,
        summarize_grads_and_vars=summarize_grads_and_vars,
        train_step_counter=train_step_counter)
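
For reference, a minimal construction sketch for the agent whose constructor is documented above. The specs, layer sizes, and optimizer settings below are illustrative assumptions, not values taken from this code:

import tensorflow as tf
from tf_agents.bandits.agents import neural_linucb_agent
from tf_agents.networks import encoding_network
from tf_agents.specs import tensor_spec
from tf_agents.trajectories import time_step as ts

observation_spec = tensor_spec.TensorSpec([10], tf.float32)  # 10-dim context
action_spec = tensor_spec.BoundedTensorSpec(
    shape=(), dtype=tf.int32, minimum=0, maximum=4)  # 5 arms
encoding_dim = 8

# Any network whose output has `encoding_dim` units can serve as the encoder;
# an EncodingNetwork with a final dense layer of that size is one option.
encoder = encoding_network.EncodingNetwork(
    input_tensor_spec=observation_spec,
    fc_layer_params=(16, encoding_dim))

agent = neural_linucb_agent.NeuralLinUCBAgent(
    time_step_spec=ts.time_step_spec(observation_spec),
    action_spec=action_spec,
    encoding_network=encoder,
    encoding_network_num_train_steps=100,
    encoding_dim=encoding_dim,
    optimizer=tf.compat.v1.train.AdamOptimizer(learning_rate=0.01),
    alpha=1.0,
    gamma=1.0,
    epsilon_greedy=0.05,
    dtype=tf.float32)
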
Example 9
    def testActionBatchWithVariablesAndPolicyUpdate(self, batch_size,
                                                    actions_from_reward_layer):

        a_list = []
        a_new_list = []
        b_list = []
        b_new_list = []
        num_samples_list = []
        num_samples_new_list = []
        for k in range(1, self._num_actions + 1):
            a_initial_value = k + 1 + 2 * k * tf.eye(self._encoding_dim,
                                                     dtype=tf.float32)
            a_for_one_arm = tf.compat.v2.Variable(a_initial_value)
            a_list.append(a_for_one_arm)
            b_initial_value = tf.constant(k * np.ones(self._encoding_dim),
                                          dtype=tf.float32)
            b_for_one_arm = tf.compat.v2.Variable(b_initial_value)
            b_list.append(b_for_one_arm)
            num_samples_initial_value = tf.constant([1], dtype=tf.float32)
            num_samples_for_one_arm = tf.compat.v2.Variable(
                num_samples_initial_value)
            num_samples_list.append(num_samples_for_one_arm)

            # Variables for the new policy (they differ by an offset).
            a_new_for_one_arm = tf.compat.v2.Variable(a_initial_value +
                                                      _POLICY_VARIABLES_OFFSET)
            a_new_list.append(a_new_for_one_arm)
            b_new_for_one_arm = tf.compat.v2.Variable(b_initial_value +
                                                      _POLICY_VARIABLES_OFFSET)
            b_new_list.append(b_new_for_one_arm)
            num_samples_for_one_arm_new = tf.compat.v2.Variable(
                num_samples_initial_value + _POLICY_VARIABLES_OFFSET)
            num_samples_new_list.append(num_samples_for_one_arm_new)

        policy = neural_linucb_policy.NeuralLinUCBPolicy(
            encoding_network=DummyNet(),
            encoding_dim=self._encoding_dim,
            reward_layer=get_reward_layer(),
            actions_from_reward_layer=tf.constant(actions_from_reward_layer,
                                                  dtype=tf.bool),
            cov_matrix=a_list,
            data_vector=b_list,
            num_samples=num_samples_list,
            epsilon_greedy=0.0,
            time_step_spec=self._time_step_spec)

        new_policy = neural_linucb_policy.NeuralLinUCBPolicy(
            encoding_network=DummyNet(),
            encoding_dim=self._encoding_dim,
            reward_layer=get_reward_layer(),
            actions_from_reward_layer=tf.constant(actions_from_reward_layer,
                                                  dtype=tf.bool),
            cov_matrix=a_new_list,
            data_vector=b_new_list,
            num_samples=num_samples_new_list,
            epsilon_greedy=0.0,
            time_step_spec=self._time_step_spec)

        action_step = policy.action(
            self._time_step_batch(batch_size=batch_size))
        new_action_step = new_policy.action(
            self._time_step_batch(batch_size=batch_size))
        self.assertEqual(action_step.action.shape,
                         new_action_step.action.shape)
        self.assertEqual(action_step.action.dtype,
                         new_action_step.action.dtype)

        self.evaluate(tf.compat.v1.global_variables_initializer())
        self.evaluate(new_policy.update(policy))

        action_fn = common.function_in_tf1()(policy.action)
        action_step = action_fn(self._time_step_batch(batch_size=batch_size))
        new_action_fn = common.function_in_tf1()(new_policy.action)
        new_action_step = new_action_fn(
            self._time_step_batch(batch_size=batch_size))

        actions_, new_actions_ = self.evaluate(
            [action_step.action, new_action_step.action])
        self.assertAllEqual(actions_, new_actions_)
Example 10
    def __init__(
            self,
            time_step_spec,
            action_spec,
            encoding_network,
            encoding_network_num_train_steps,
            encoding_dim,
            optimizer,
            alpha=1.0,
            gamma=1.0,
            epsilon_greedy=0.0,
            # Params for training.
            error_loss_fn=tf.compat.v1.losses.mean_squared_error,
            gradient_clipping=None,
            # Params for debugging.
            debug_summaries=False,
            summarize_grads_and_vars=False,
            train_step_counter=None,
            emit_log_probability=False,
            dtype=tf.float64,
            name=None):
        """Initialize an instance of `NeuralLinUCBAgent`.

        Args:
          time_step_spec: A `TimeStep` spec describing the expected `TimeStep`s.
          action_spec: A scalar `BoundedTensorSpec` with `int32` or `int64` dtype
            describing the number of actions for this agent.
          encoding_network: a Keras network that encodes the observations.
          encoding_network_num_train_steps: how many training steps to run for
            training the encoding network before switching to LinUCB. If negative,
            the encoding network is assumed to be already trained.
          encoding_dim: the dimension of encoded observations.
          optimizer: The optimizer to use for training.
          alpha: (float) positive scalar. This is the exploration parameter that
            multiplies the confidence intervals.
          gamma: a float forgetting factor in [0.0, 1.0]. When set to
            1.0, the algorithm does not forget.
          epsilon_greedy: A float representing the probability of choosing a random
            action instead of the greedy action.
          error_loss_fn: A function for computing the error loss, taking parameters
            labels, predictions, and weights (any function from tf.losses would
            work). The default is `tf.losses.mean_squared_error`.
          gradient_clipping: A float representing the norm length to clip gradients
            (or None for no clipping.)
          debug_summaries: A Python bool, default False. When True, debug summaries
            are gathered.
          summarize_grads_and_vars: A Python bool, default False. When True,
            gradients and network variable summaries are written during training.
          train_step_counter: An optional `tf.Variable` to increment every time the
            train op is run.  Defaults to the `global_step`.
          emit_log_probability: Whether the NeuralLinUCBPolicy emits
            log-probabilities or not. Since the policy is deterministic, the
            probability is just 1.
          dtype: The type of the parameters stored and updated by the agent. Should
            be one of `tf.float32` and `tf.float64`. Defaults to `tf.float64`.
          name: a name for this instance of `NeuralLinUCBAgent`.

        Raises:
          ValueError if dtype is not one of `tf.float32` or `tf.float64`.
        """
        tf.Module.__init__(self, name=name)
        self._num_actions = bandit_utils.get_num_actions_from_tensor_spec(
            action_spec)
        self._context_dim = int(time_step_spec.observation.shape[0])
        self._alpha = alpha
        self._cov_matrix_list = []
        self._data_vector_list = []
        # We keep track of the number of samples per arm.
        self._num_samples_list = []
        self._gamma = gamma
        if self._gamma < 0.0 or self._gamma > 1.0:
            raise ValueError(
                'Forgetting factor `gamma` must be in [0.0, 1.0].')
        self._dtype = dtype
        if dtype not in (tf.float32, tf.float64):
            raise ValueError(
                'Agent dtype should be either `tf.float32` or `tf.float64`.')
        self._epsilon_greedy = epsilon_greedy

        reward_layer = tf.keras.layers.Dense(
            self._num_actions,
            kernel_initializer=tf.compat.v1.initializers.random_uniform(
                minval=-0.03, maxval=0.03),
            bias_initializer=tf.compat.v1.initializers.constant(-0.2),
            activation=None,
            name='reward_layer')

        self._encoding_network = encoding_network
        self._reward_layer = reward_layer
        self._encoding_network_num_train_steps = encoding_network_num_train_steps
        self._encoding_dim = encoding_dim
        self._optimizer = optimizer
        self._error_loss_fn = error_loss_fn
        self._gradient_clipping = gradient_clipping
        train_step_counter = tf.compat.v1.train.get_or_create_global_step()
        self._actions_from_reward_layer = tf.compat.v2.Variable(True,
                                                                dtype=tf.bool)

        for k in range(self._num_actions):
            self._cov_matrix_list.append(
                tf.compat.v2.Variable(tf.eye(self._encoding_dim, dtype=dtype),
                                      name='a_' + str(k)))
            self._data_vector_list.append(
                tf.compat.v2.Variable(tf.zeros(self._encoding_dim,
                                               dtype=dtype),
                                      name='b_' + str(k)))
            self._num_samples_list.append(
                tf.compat.v2.Variable(tf.zeros([], dtype=dtype),
                                      name='num_samples_' + str(k)))

        policy = neural_linucb_policy.NeuralLinUCBPolicy(
            encoding_network=self._encoding_network,
            encoding_dim=self._encoding_dim,
            reward_layer=self._reward_layer,
            epsilon_greedy=self._epsilon_greedy,
            actions_from_reward_layer=self._actions_from_reward_layer,
            cov_matrix=self._cov_matrix_list,
            data_vector=self._data_vector_list,
            num_samples=self._num_samples_list,
            time_step_spec=time_step_spec,
            alpha=alpha,
            emit_log_probability=emit_log_probability)

        super(NeuralLinUCBAgent,
              self).__init__(time_step_spec=time_step_spec,
                             action_spec=policy.action_spec,
                             policy=policy,
                             collect_policy=policy,
                             train_sequence_length=None,
                             debug_summaries=debug_summaries,
                             summarize_grads_and_vars=summarize_grads_and_vars,
                             train_step_counter=train_step_counter)
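
The per-arm variables created above (`a_k`, `b_k`) and the `alpha` parameter correspond to the usual LinUCB quantities. A conceptual NumPy sketch of the textbook scoring rule follows; it is not the library's exact implementation, which additionally handles the forgetting factor `gamma`, batching, and dtypes:

import numpy as np

def linucb_scores(x, cov_matrices, data_vectors, alpha=1.0):
    """Textbook LinUCB scores for one encoded observation x."""
    scores = []
    for a_k, b_k in zip(cov_matrices, data_vectors):
        a_inv = np.linalg.inv(a_k)
        theta_k = a_inv.dot(b_k)                       # per-arm weight estimate
        mean_reward = theta_k.dot(x)                   # estimated reward
        bonus = alpha * np.sqrt(x.dot(a_inv).dot(x))   # confidence width
        scores.append(mean_reward + bonus)
    return np.array(scores)  # the policy picks the argmax over arms
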
Example 11
    def testSparseObs(self, batch_size, actions_from_reward_layer):
        obs_spec = {
            'global': {
                'sport': tensor_spec.TensorSpec((), tf.string)
            },
            'per_arm': {
                'name': tensor_spec.TensorSpec((3, ), tf.string),
                'fruit': tensor_spec.TensorSpec((3, ), tf.string)
            }
        }
        columns_a = tf.feature_column.indicator_column(
            tf.feature_column.categorical_column_with_vocabulary_list(
                'name', ['bob', 'george', 'wanda']))
        columns_b = tf.feature_column.indicator_column(
            tf.feature_column.categorical_column_with_vocabulary_list(
                'fruit', ['banana', 'kiwi', 'pear']))
        columns_c = tf.feature_column.indicator_column(
            tf.feature_column.categorical_column_with_vocabulary_list(
                'sport', ['bridge', 'chess', 'snooker']))

        dummy_net = arm_network.create_feed_forward_common_tower_network(
            obs_spec,
            global_layers=(3, 4, 5),
            arm_layers=(3, 2),
            common_layers=(4, 3),
            output_dim=self._encoding_dim,
            global_preprocessing_combiner=(
                tf.compat.v2.keras.layers.DenseFeatures([columns_c])),
            arm_preprocessing_combiner=tf.compat.v2.keras.layers.DenseFeatures(
                [columns_a, columns_b]))
        time_step_spec = ts.time_step_spec(obs_spec)
        reward_layer = get_per_arm_reward_layer(
            encoding_dim=self._encoding_dim)
        policy = neural_linucb_policy.NeuralLinUCBPolicy(
            dummy_net,
            self._encoding_dim,
            reward_layer,
            actions_from_reward_layer=tf.constant(actions_from_reward_layer,
                                                  dtype=tf.bool),
            cov_matrix=self._a[0:1],
            data_vector=self._b[0:1],
            num_samples=self._num_samples_per_arm[0:1],
            epsilon_greedy=0.0,
            time_step_spec=time_step_spec,
            accepts_per_arm_features=True,
            emit_policy_info=('predicted_rewards_mean', ))
        observations = {
            'global': {
                'sport': tf.constant(['snooker', 'chess'])
            },
            'per_arm': {
                'name':
                tf.constant([['george', 'george', 'george'],
                             ['bob', 'bob', 'bob']]),
                'fruit':
                tf.constant([['banana', 'banana', 'banana'],
                             ['kiwi', 'kiwi', 'kiwi']])
            }
        }

        time_step = ts.restart(observations, batch_size=2)
        action_fn = common.function_in_tf1()(policy.action)
        action_step = action_fn(time_step, seed=1)
        self.assertEqual(action_step.action.shape.as_list(), [2])
        self.assertEqual(action_step.action.dtype, tf.int32)
        # Initialize all variables
        self.evaluate([
            tf.compat.v1.global_variables_initializer(),
            tf.compat.v1.tables_initializer()
        ])
        action = self.evaluate(action_step.action)
        self.assertAllEqual(action.shape, [2])
        p_info = self.evaluate(action_step.info)
        self.assertAllEqual(p_info.predicted_rewards_mean.shape, [2, 3])
        self.assertAllEqual(p_info.chosen_arm_features['name'].shape, [2])
        self.assertAllEqual(p_info.chosen_arm_features['fruit'].shape, [2])
        first_action = action[0]
        first_arm_name_feature = observations[
            bandit_spec_utils.PER_ARM_FEATURE_KEY]['name'][0]
        self.assertAllEqual(p_info.chosen_arm_features['name'][0],
                            first_arm_name_feature[first_action])
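
As a side note on the preprocessing above, a `DenseFeatures` layer with an indicator column simply turns the string features into one-hot vectors over the vocabulary. An illustrative sketch (in graph mode the tables initializer must be run first, as the test does):

import tensorflow as tf

sport_column = tf.feature_column.indicator_column(
    tf.feature_column.categorical_column_with_vocabulary_list(
        'sport', ['bridge', 'chess', 'snooker']))
global_combiner = tf.compat.v2.keras.layers.DenseFeatures([sport_column])
# In eager mode this yields a [2, 3] one-hot tensor:
# [[0., 0., 1.], [0., 1., 0.]]
one_hot = global_combiner({'sport': tf.constant(['snooker', 'chess'])})
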