Code Example #1
 def testOuterDimsNestRemovesDimensionsFromSpecs(self, dtype):
     if dtype == tf.string:
         self.skipTest("Not compatible with string type.")
     nested_spec = example_nested_tensor_spec(dtype)
     larger_spec = tensor_spec.add_outer_dims_nest(nested_spec, (3, 4))
     removed_spec = tensor_spec.remove_outer_dims_nest(larger_spec, 2)
     self.assertEqual(nested_spec, removed_spec)
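The round trip in this test is easy to reproduce standalone. A minimal sketch, assuming only `tf_agents.specs.tensor_spec`; the nested spec below (names and shapes) is invented in place of `example_nested_tensor_spec(dtype)`:

import tensorflow as tf
from tf_agents.specs import tensor_spec

# An invented nested spec standing in for example_nested_tensor_spec(dtype).
nested_spec = {
    'observation': tf.TensorSpec(shape=(5,), dtype=tf.float32),
    'mask': tf.TensorSpec(shape=(7,), dtype=tf.int32),
}
# Prepend two outer dimensions to every spec in the nest ...
larger_spec = tensor_spec.add_outer_dims_nest(nested_spec, (3, 4))
# larger_spec['observation'].shape == (3, 4, 5)
# ... and strip them off again.
removed_spec = tensor_spec.remove_outer_dims_nest(larger_spec, 2)
assert removed_spec['observation'].shape == nested_spec['observation'].shape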
Code Example #2
def create_feed_forward_common_tower_network(observation_spec,
                                             global_layers,
                                             arm_layers,
                                             common_layers,
                                             output_dim=1,
                                             arm_preprocessing_combiner=None):
    """Creates a common tower network with feedforward towers.

  The network produced by this function can be used either in
  `GreedyRewardPredictionPolicy`, or `NeuralLinUCBPolicy`.
  In the former case, the network must have `output_dim=1`; it will be an
  instance of `QNetwork` and will be used in the policy as a reward prediction
  network.
  In the latter case, the network will be an encoding network with its output
  consumed by a reward layer or a LinUCB method. The specified `output_dim` will
  be the encoding dimension.

  Args:
    observation_spec: A nested tensor spec containing the specs for global as
      well as per-arm observations.
    global_layers: Iterable of ints. Specifies the layers of the global tower.
    arm_layers: Iterable of ints. Specifies the layers of the arm tower.
    common_layers: Iterable of ints. Specifies the layers of the common tower.
    output_dim: The output dimension of the network. If 1, the common tower will
      be a QNetwork. Otherwise, the common tower will be an encoding network
      with the specified output dimension.
    arm_preprocessing_combiner: Preprocessing combiner for the arm features.

  Returns:
    A network that takes observations adhering to `observation_spec` and
    outputs reward estimates for every action.
  """
    global_network = encoding_network.EncodingNetwork(
        input_tensor_spec=observation_spec[
            bandit_spec_utils.GLOBAL_FEATURE_KEY],
        fc_layer_params=global_layers)

    arm_feature_spec = tensor_spec.remove_outer_dims_nest(
        observation_spec[bandit_spec_utils.PER_ARM_FEATURE_KEY], 1)
    arm_network = encoding_network.EncodingNetwork(
        input_tensor_spec=arm_feature_spec,
        fc_layer_params=arm_layers,
        preprocessing_combiner=arm_preprocessing_combiner)
    common_input_dim = global_layers[-1] + arm_layers[-1]
    common_input_spec = tensor_spec.TensorSpec(shape=(common_input_dim, ),
                                               dtype=tf.float32)
    if output_dim == 1:
        common_network = q_network.QNetwork(
            input_tensor_spec=common_input_spec,
            action_spec=tensor_spec.BoundedTensorSpec(shape=(),
                                                      minimum=0,
                                                      maximum=0,
                                                      dtype=tf.int32),
            fc_layer_params=common_layers)
    else:
        common_network = encoding_network.EncodingNetwork(
            input_tensor_spec=common_input_spec,
            fc_layer_params=list(common_layers) + [output_dim])
    return GlobalAndArmCommonTowerNetwork(observation_spec, global_network,
                                          arm_network, common_network)
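A hypothetical way to drive this factory. The module and import paths below follow TF-Agents conventions but are assumptions, and the feature dimensions and layer sizes are made up; treat this as a sketch rather than the canonical usage:

import tensorflow as tf
from tf_agents.bandits.networks import global_and_arm_feature_network
from tf_agents.bandits.specs import utils as bandit_spec_utils

num_actions, global_dim, arm_dim = 5, 4, 3
observation_spec = {
    bandit_spec_utils.GLOBAL_FEATURE_KEY:
        tf.TensorSpec(shape=(global_dim,), dtype=tf.float32),
    bandit_spec_utils.PER_ARM_FEATURE_KEY:
        tf.TensorSpec(shape=(num_actions, arm_dim), dtype=tf.float32),
}
# With output_dim=1 the common tower is a QNetwork usable in
# GreedyRewardPredictionPolicy; a larger output_dim yields an encoding
# network suitable for NeuralLinUCBPolicy.
net = global_and_arm_feature_network.create_feed_forward_common_tower_network(
    observation_spec,
    global_layers=(8, 8),
    arm_layers=(8, 8),
    common_layers=(16,),
    output_dim=1)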
Code Example #3
File: policy_saver.py Project: tensorflow/agents
def _check_compatible(spec, tensor, ignore_outer_dims=True):
    """Checks if `spec` is compatible with `tensor`, maybe ignoring outer dims."""
    if ignore_outer_dims:
        tensor = tensor_spec.remove_outer_dims_nest(
            tensor, tensor.shape.ndims - spec.shape.ndims)
    if not spec.is_compatible_with(tensor):
        raise ValueError('Tensor is incompatible with spec. spec = {0}, '
                         'tensor = {1}'.format(spec, tensor))
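The same pattern works outside `policy_saver`: strip the batch dimension from a value's type spec and compare what remains against the saved spec. A small sketch with invented shapes:

import tensorflow as tf
from tf_agents.specs import tensor_spec

saved_spec = tf.TensorSpec(shape=(4,), dtype=tf.float32)
batched_value = tf.zeros([32, 4], dtype=tf.float32)  # leading batch dim of 32

value_spec = tf.type_spec_from_value(batched_value)                 # shape (32, 4)
unbatched_spec = tensor_spec.remove_outer_dims_nest(value_spec, 1)  # shape (4,)
assert saved_spec.is_compatible_with(unbatched_spec)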
Code Example #4
 def _maybe_convert_to_spec(p):
     if isinstance(p, distribution_utils.Params):
         return _convert_to_spec_and_remove_singleton_batch_dim(
             p, outer_ndim)
     elif tf.is_tensor(p):
         return tensor_spec.remove_outer_dims_nest(
             tf.type_spec_from_value(p), num_outer_dims=outer_ndim)
     else:
         return p
Code Example #5
 def _calc_unbatched_spec(x):
     if isinstance(x, tfp.distributions.Distribution):
         parameters = distribution_utils.get_parameters(x)
         parameter_specs = _convert_to_spec_and_remove_singleton_batch_dim(
             parameters, outer_ndim=outer_ndim)
         return distribution_utils.DistributionSpecV2(
             event_shape=x.event_shape,
             dtype=x.dtype,
             parameters=parameter_specs)
     else:
         return tensor_spec.remove_outer_dims_nest(
             tf.type_spec_from_value(x), num_outer_dims=outer_ndim)
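For the tensor branch above, a whole nest can be unbatched in one call: map `tf.type_spec_from_value` over the nest of tensors and hand the resulting nest of specs to `remove_outer_dims_nest`. A sketch with invented output names and shapes:

import tensorflow as tf
from tf_agents.specs import tensor_spec

outer_ndim = 1
batched_outputs = {
    'logits': tf.zeros([8, 5]),  # batch of 8, 5 classes
    'value': tf.zeros([8]),      # batch of 8 scalars
}
batched_specs = tf.nest.map_structure(tf.type_spec_from_value, batched_outputs)
unbatched_specs = tensor_spec.remove_outer_dims_nest(
    batched_specs, num_outer_dims=outer_ndim)
# unbatched_specs['logits'].shape == (5,), unbatched_specs['value'].shape == ()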
Code Example #6
File: policy_utilities.py Project: zircote/agents
def create_chosen_arm_features_info_spec(
        observation_spec, observation_and_action_constraint_splitter):
    """Creates the chosen arm features info spec from the arm observation spec."""
    if observation_and_action_constraint_splitter is not None:
        observation_spec = observation_and_action_constraint_splitter(
            observation_spec)[0]
        if bandit_spec_utils.NUM_ACTIONS_FEATURE_KEY in observation_spec:
            raise ValueError('Variable number of actions and action masking '
                             'should not be used together.')
        logging.warning(
            'Action masking with per-arm features is discouraged. '
            'Instead, use variable number of actions via the `%s` feature key.',
            bandit_spec_utils.NUM_ACTIONS_FEATURE_KEY)
    arm_spec = observation_spec[bandit_spec_utils.PER_ARM_FEATURE_KEY]
    return tensor_spec.remove_outer_dims_nest(arm_spec, 1)
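Concretely, the per-arm spec carries a leading `num_actions` dimension, and the chosen-arm info spec is the same spec with that dimension dropped. A sketch with made-up dimensions:

import tensorflow as tf
from tf_agents.specs import tensor_spec

per_arm_spec = tf.TensorSpec(shape=(10, 6), dtype=tf.float32)  # [num_actions, arm_dim]
chosen_arm_spec = tensor_spec.remove_outer_dims_nest(per_arm_spec, 1)
# chosen_arm_spec.shape == (6,): the features of one arm, no num_actions axis.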
Code Example #7
def _remove_num_actions_dim_from_spec(observation_spec):
  """Removes the extra `num_actions` dimension from the observation spec."""
  obs_spec_no_num_actions = {
      bandit_spec_utils.GLOBAL_FEATURE_KEY:
          observation_spec[bandit_spec_utils.GLOBAL_FEATURE_KEY],
      bandit_spec_utils.PER_ARM_FEATURE_KEY:
          tensor_spec.remove_outer_dims_nest(
              observation_spec[bandit_spec_utils.PER_ARM_FEATURE_KEY], 1)
  }
  if bandit_spec_utils.NUM_ACTIONS_FEATURE_KEY in observation_spec:
    obs_spec_no_num_actions.update({
        bandit_spec_utils.NUM_ACTIONS_FEATURE_KEY:
            observation_spec[bandit_spec_utils.NUM_ACTIONS_FEATURE_KEY]
    })
  return obs_spec_no_num_actions
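A sketch of the transformation performed by the helper above, assuming the standard bandit spec keys and invented dimensions (the num-actions spec below is a guess at a plausible shape): only the per-arm entry loses its leading `num_actions` dimension, and the other entries are carried over untouched.

import tensorflow as tf
from tf_agents.bandits.specs import utils as bandit_spec_utils

observation_spec = {
    bandit_spec_utils.GLOBAL_FEATURE_KEY:
        tf.TensorSpec(shape=(4,), dtype=tf.float32),
    bandit_spec_utils.PER_ARM_FEATURE_KEY:
        tf.TensorSpec(shape=(10, 6), dtype=tf.float32),
    bandit_spec_utils.NUM_ACTIONS_FEATURE_KEY:
        tf.TensorSpec(shape=(), dtype=tf.int32),
}
squeezed = _remove_num_actions_dim_from_spec(observation_spec)
# squeezed[PER_ARM_FEATURE_KEY].shape == (6,); the other two entries are unchanged.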
Code Example #8
        def _calc_unbatched_spec(x):
            """Build Network output spec by removing previously added batch dimension.

      Args:
        x: tfp.distributions.Distribution or Tensor.
      Returns:
        Specs without batch dimension representing x.
      """
            if isinstance(x, tfp.distributions.Distribution):
                parameters = distribution_utils.get_parameters(x)
                parameter_specs = _convert_to_spec_and_remove_singleton_batch_dim(
                    parameters, outer_ndim=1)
                return distribution_utils.DistributionSpecV2(
                    event_shape=x.event_shape,
                    dtype=x.dtype,
                    parameters=parameter_specs)
            else:
                return tensor_spec.remove_outer_dims_nest(
                    tf.type_spec_from_value(x), num_outer_dims=1)
Code Example #9
 def testOuterDimsNestRemovesDimensionsFromSpecsThrows(self, dtype):
     if dtype == tf.string:
         self.skipTest("Not compatible with string type.")
     nested_spec = example_nested_tensor_spec(dtype)
     with self.assertRaises(ValueError):
         tensor_spec.remove_outer_dims_nest(nested_spec, 10)
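The failure mode this test exercises is easy to trigger directly: asking `remove_outer_dims_nest` to strip more dimensions than a spec in the nest has raises `ValueError`. A standalone sketch:

import tensorflow as tf
from tf_agents.specs import tensor_spec

spec = tf.TensorSpec(shape=(3, 4), dtype=tf.float32)  # only two dimensions
try:
    tensor_spec.remove_outer_dims_nest(spec, 10)
except ValueError:
    print('Cannot remove 10 outer dims from a rank-2 spec.')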
Code Example #10
  def __init__(self,
               encoding_network,
               encoding_dim,
               reward_layer,
               epsilon_greedy,
               actions_from_reward_layer,
               cov_matrix,
               data_vector,
               num_samples,
               time_step_spec=None,
               alpha=1.0,
               emit_policy_info=(),
               emit_log_probability=False,
               accepts_per_arm_features=False,
               distributed_use_reward_layer=False,
               observation_and_action_constraint_splitter=None,
               name=None):
    """Initializes `NeuralLinUCBPolicy`.

    Args:
      encoding_network: network that encodes the observations.
      encoding_dim: (int) dimension of the encoded observations.
      reward_layer: final layer that predicts the expected reward per arm. In
        case the policy accepts per-arm features, the output of this layer has
        to be a scalar. This is because in the per-arm case, all encoded
        observations have to go through the same computation to get the reward
        estimates. The `num_actions` dimension of the encoded observation is
        treated as a batch dimension in the reward layer.
      epsilon_greedy: (float) representing the probability of choosing a random
        action instead of the greedy action.
      actions_from_reward_layer: (boolean variable) whether to get actions from
        the reward layer or from LinUCB.
      cov_matrix: list of the covariance matrices. There exists one covariance
        matrix per arm, unless the policy accepts per-arm features, in which
        case this list must have a single element.
      data_vector: list of the data vectors. A data vector is a weighted sum
        of the observations, where the weight is the corresponding reward. Each
        arm has its own data vector, unless the policy accepts per-arm features,
        in which case this list must have a single element.
      num_samples: list of number of samples per arm. If the policy accepts per-
        arm features, this is a single-element list counting the number of
        steps.
      time_step_spec: A `TimeStep` spec of the expected time_steps.
      alpha: (float) non-negative weight multiplying the confidence intervals.
      emit_policy_info: (tuple of strings) what side information we want to get
        as part of the policy info. Allowed values can be found in
        `policy_utilities.PolicyInfo`.
      emit_log_probability: (bool) whether to emit log probabilities.
      accepts_per_arm_features: (bool) Whether the policy accepts per-arm
        features.
      distributed_use_reward_layer: (bool) Whether to pick the actions using
        the network or use LinUCB. This applies only in distributed training
        setting and has a similar role to the `actions_from_reward_layer`
        mentioned above.
      observation_and_action_constraint_splitter: A function used for masking
        valid/invalid actions with each state of the environment. The function
        takes in a full observation and returns a tuple consisting of 1) the
        part of the observation intended as input to the bandit policy and 2)
        the mask. The mask should be a 0-1 `Tensor` of shape
        `[batch_size, num_actions]`. This function should also work with a
        `TensorSpec` as input, and should output `TensorSpec` objects for the
        observation and mask.
      name: The name of this policy.
    """
    encoding_network.create_variables()
    self._encoding_network = encoding_network
    self._reward_layer = reward_layer
    self._encoding_dim = encoding_dim

    if accepts_per_arm_features and reward_layer.units != 1:
      raise ValueError('The output dimension of the reward layer must be 1, got'
                       ' {}'.format(reward_layer.units))

    if not isinstance(cov_matrix, (list, tuple)):
      raise ValueError('cov_matrix must be a list of matrices (Tensors).')
    self._cov_matrix = cov_matrix

    if not isinstance(data_vector, (list, tuple)):
      raise ValueError('data_vector must be a list of vectors (Tensors).')
    self._data_vector = data_vector

    if not isinstance(num_samples, (list, tuple)):
      raise ValueError('num_samples must be a list of vectors (Tensors).')
    self._num_samples = num_samples

    self._alpha = alpha
    self._actions_from_reward_layer = actions_from_reward_layer
    self._epsilon_greedy = epsilon_greedy
    self._dtype = self._data_vector[0].dtype
    self._distributed_use_reward_layer = distributed_use_reward_layer

    if len(cov_matrix) != len(data_vector):
      raise ValueError('The size of list cov_matrix must match the size of '
                       'list data_vector. Got {} for cov_matrix and {} '
                       'for data_vector'.format(
                           len(self._cov_matrix), len(data_vector)))
    if len(num_samples) != len(cov_matrix):
      raise ValueError('The size of num_samples must match the size of '
                       'list cov_matrix. Got {} for num_samples and {} '
                       'for cov_matrix'.format(
                           len(self._num_samples), len(cov_matrix)))

    self._accepts_per_arm_features = accepts_per_arm_features
    if observation_and_action_constraint_splitter is not None:
      context_spec, _ = observation_and_action_constraint_splitter(
          time_step_spec.observation)
    else:
      context_spec = time_step_spec.observation
    if accepts_per_arm_features:
      self._num_actions = tf.nest.flatten(context_spec[
          bandit_spec_utils.PER_ARM_FEATURE_KEY])[0].shape.as_list()[0]
      self._num_models = 1
    else:
      self._num_actions = len(cov_matrix)
      self._num_models = self._num_actions
    cov_matrix_dim = tf.compat.dimension_value(cov_matrix[0].shape[0])
    if self._encoding_dim != cov_matrix_dim:
      raise ValueError('The dimension of matrix `cov_matrix` must match '
                       'encoding dimension {}. '
                       'Got {} for `cov_matrix`.'.format(
                           self._encoding_dim, cov_matrix_dim))
    data_vector_dim = tf.compat.dimension_value(data_vector[0].shape[0])
    if self._encoding_dim != data_vector_dim:
      raise ValueError('The dimension of vector `data_vector` must match '
                       'encoding dimension {}. '
                       'Got {} for `data_vector`.'.format(
                           self._encoding_dim, data_vector_dim))
    action_spec = tensor_spec.BoundedTensorSpec(
        shape=(),
        dtype=tf.int32,
        minimum=0,
        maximum=self._num_actions - 1,
        name='action')

    self._emit_policy_info = emit_policy_info
    predicted_rewards_mean = ()
    if policy_utilities.InfoFields.PREDICTED_REWARDS_MEAN in emit_policy_info:
      predicted_rewards_mean = tensor_spec.TensorSpec(
          [self._num_actions],
          dtype=tf.float32)
    predicted_rewards_optimistic = ()
    if (policy_utilities.InfoFields.PREDICTED_REWARDS_OPTIMISTIC in
        emit_policy_info):
      predicted_rewards_optimistic = tensor_spec.TensorSpec(
          [self._num_actions],
          dtype=tf.float32)
    if accepts_per_arm_features:
      arm_spec = context_spec[bandit_spec_utils.PER_ARM_FEATURE_KEY]
      chosen_arm_features_info_spec = tensor_spec.remove_outer_dims_nest(
          arm_spec, 1)
      info_spec = policy_utilities.PerArmPolicyInfo(
          predicted_rewards_mean=predicted_rewards_mean,
          predicted_rewards_optimistic=predicted_rewards_optimistic,
          chosen_arm_features=chosen_arm_features_info_spec)
    else:
      info_spec = policy_utilities.PolicyInfo(
          predicted_rewards_mean=predicted_rewards_mean,
          predicted_rewards_optimistic=predicted_rewards_optimistic)

    super(NeuralLinUCBPolicy, self).__init__(
        time_step_spec=time_step_spec,
        action_spec=action_spec,
        emit_log_probability=emit_log_probability,
        observation_and_action_constraint_splitter=(
            observation_and_action_constraint_splitter),
        info_spec=info_spec,
        name=name)
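Two pieces of the constructor above are worth seeing with concrete shapes: `num_actions` is read off the leading dimension of the per-arm spec, and the chosen-arm info spec is that spec minus its leading dimension. A hedged sketch (the feature-key import path and the dimensions are assumptions):

import tensorflow as tf
from tf_agents.specs import tensor_spec
from tf_agents.bandits.specs import utils as bandit_spec_utils

num_actions, arm_dim = 7, 3
context_spec = {
    bandit_spec_utils.GLOBAL_FEATURE_KEY:
        tf.TensorSpec(shape=(4,), dtype=tf.float32),
    bandit_spec_utils.PER_ARM_FEATURE_KEY:
        tf.TensorSpec(shape=(num_actions, arm_dim), dtype=tf.float32),
}
per_arm_spec = context_spec[bandit_spec_utils.PER_ARM_FEATURE_KEY]
# The leading dimension of the per-arm spec gives the number of actions ...
assert tf.nest.flatten(per_arm_spec)[0].shape.as_list()[0] == num_actions
# ... and dropping it yields the chosen_arm_features info spec, shape (3,).
chosen_arm_spec = tensor_spec.remove_outer_dims_nest(per_arm_spec, 1)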
Code Example #11
File: utils.py Project: tensorflow/agents
def create_chosen_arm_features_info_spec(
    observation_spec: types.NestedTensorSpec) -> types.NestedTensorSpec:
  """Creates the chosen arm features info spec from the arm observation spec."""
  arm_spec = observation_spec[bandit_spec_utils.PER_ARM_FEATURE_KEY]
  return tensor_spec.remove_outer_dims_nest(arm_spec, 1)
Code Example #12
    def __init__(self,
                 time_step_spec=None,
                 action_spec=None,
                 reward_network=None,
                 observation_and_action_constraint_splitter=None,
                 accepts_per_arm_features=False,
                 constraints=(),
                 emit_policy_info=(),
                 name=None):
        """Builds a GreedyRewardPredictionPolicy given a reward tf_agents.Network.

    This policy takes a tf_agents.Network predicting rewards and generates the
    action corresponding to the largest predicted reward.

    Args:
      time_step_spec: A `TimeStep` spec of the expected time_steps.
      action_spec: A nest of BoundedTensorSpec representing the actions.
      reward_network: An instance of a `tf_agents.network.Network`,
        callable via `network(observation, step_type) -> (output, final_state)`.
      observation_and_action_constraint_splitter: A function used for masking
        valid/invalid actions with each state of the environment. The function
        takes in a full observation and returns a tuple consisting of 1) the
        part of the observation intended as input to the network and 2) the
        mask.  The mask should be a 0-1 `Tensor` of shape
        `[batch_size, num_actions]`. This function should also work with a
        `TensorSpec` as input, and should output `TensorSpec` objects for the
        observation and mask.
      accepts_per_arm_features: (bool) Whether the policy accepts per-arm
        features.
      constraints: iterable of constraints objects that are instances of
        `tf_agents.bandits.agents.NeuralConstraint`.
      emit_policy_info: (tuple of strings) what side information we want to get
        as part of the policy info. Allowed values can be found in
        `policy_utilities.PolicyInfo`.
      name: The name of this policy. All variables in this module will fall
        under that name. Defaults to the class name.

    Raises:
      NotImplementedError: If `action_spec` contains more than one
        `BoundedTensorSpec` or the `BoundedTensorSpec` is not valid.
    """
        flat_action_spec = tf.nest.flatten(action_spec)
        if len(flat_action_spec) > 1:
            raise NotImplementedError(
                'action_spec can only contain a single BoundedTensorSpec.')

        action_spec = flat_action_spec[0]
        if (not tensor_spec.is_bounded(action_spec)
                or not tensor_spec.is_discrete(action_spec)
                or action_spec.shape.rank > 1
                or action_spec.shape.num_elements() != 1):
            raise NotImplementedError(
                'action_spec must be a BoundedTensorSpec of type int32 and shape (). '
                'Found {}.'.format(action_spec))
        self._expected_num_actions = action_spec.maximum - action_spec.minimum + 1
        self._action_offset = action_spec.minimum
        reward_network.create_variables()
        self._reward_network = reward_network
        self._constraints = constraints

        self._emit_policy_info = emit_policy_info
        predicted_rewards_mean = ()
        if policy_utilities.InfoFields.PREDICTED_REWARDS_MEAN in emit_policy_info:
            predicted_rewards_mean = tensor_spec.TensorSpec(
                [self._expected_num_actions])
        bandit_policy_type = ()
        if policy_utilities.InfoFields.BANDIT_POLICY_TYPE in emit_policy_info:
            bandit_policy_type = (
                policy_utilities.create_bandit_policy_type_tensor_spec(
                    shape=[1]))
        if accepts_per_arm_features:
            # The features of the chosen arm are saved to policy_info.
            observation = time_step_spec.observation
            if observation_and_action_constraint_splitter is not None:
                observation = observation_and_action_constraint_splitter(
                    observation)[0]
            arm_spec = observation[bandit_spec_utils.PER_ARM_FEATURE_KEY]
            chosen_arm_features_info = tensor_spec.remove_outer_dims_nest(
                arm_spec, 1)
            info_spec = policy_utilities.PerArmPolicyInfo(
                predicted_rewards_mean=predicted_rewards_mean,
                bandit_policy_type=bandit_policy_type,
                chosen_arm_features=chosen_arm_features_info)
        else:
            info_spec = policy_utilities.PolicyInfo(
                predicted_rewards_mean=predicted_rewards_mean,
                bandit_policy_type=bandit_policy_type)

        self._accepts_per_arm_features = accepts_per_arm_features

        super(GreedyRewardPredictionPolicy,
              self).__init__(time_step_spec,
                             action_spec,
                             policy_state_spec=reward_network.state_spec,
                             clip=False,
                             info_spec=info_spec,
                             observation_and_action_constraint_splitter=(
                                 observation_and_action_constraint_splitter),
                             name=name)
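The action-spec validation at the top of this constructor boils down to: a single bounded, discrete, effectively scalar spec whose range determines the number of arms. A small sketch with an invented five-arm spec:

import tensorflow as tf
from tf_agents.specs import tensor_spec

action_spec = tensor_spec.BoundedTensorSpec(
    shape=(), dtype=tf.int32, minimum=0, maximum=4)  # five arms: 0..4

assert tensor_spec.is_bounded(action_spec)
assert tensor_spec.is_discrete(action_spec)
expected_num_actions = action_spec.maximum - action_spec.minimum + 1  # 5
action_offset = action_spec.minimum                                   # 0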