def testOuterDimsNestRemovesDimensionsFromSpecs(self, dtype):
  if dtype == tf.string:
    self.skipTest("Not compatible with string type.")
  nested_spec = example_nested_tensor_spec(dtype)
  larger_spec = tensor_spec.add_outer_dims_nest(nested_spec, (3, 4))
  removed_spec = tensor_spec.remove_outer_dims_nest(larger_spec, 2)
  self.assertEqual(nested_spec, removed_spec)

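# A minimal sketch (not part of the original tests) of the round trip this
# test exercises: `add_outer_dims_nest` prepends outer dims (3, 4), and
# `remove_outer_dims_nest` with num_outer_dims=2 strips them again, restoring
# the original nested spec. The dict spec here is an illustrative stand-in
# for `example_nested_tensor_spec`.
import tensorflow as tf
from tf_agents.specs import tensor_spec

spec = {'obs': tf.TensorSpec(shape=(5,), dtype=tf.float32)}
larger = tensor_spec.add_outer_dims_nest(spec, (3, 4))    # shape becomes (3, 4, 5)
restored = tensor_spec.remove_outer_dims_nest(larger, 2)  # back to (5,)
assert restored == spec
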
def create_feed_forward_common_tower_network(observation_spec,
                                             global_layers,
                                             arm_layers,
                                             common_layers,
                                             output_dim=1,
                                             arm_preprocessing_combiner=None):
  """Creates a common tower network with feedforward towers.

  The network produced by this function can be used either in
  `GreedyRewardPredictionPolicy`, or `NeuralLinUCBPolicy`. In the former case,
  the network must have `output_dim=1`, it is going to be an instance of
  `QNetwork`, and used in the policy as a reward prediction network. In the
  latter case, the network will be an encoding network with its output
  consumed by a reward layer or a LinUCB method. The specified `output_dim`
  will be the encoding dimension.

  Args:
    observation_spec: A nested tensor spec containing the specs for global as
      well as per-arm observations.
    global_layers: Iterable of ints. Specifies the layers of the global tower.
    arm_layers: Iterable of ints. Specifies the layers of the arm tower.
    common_layers: Iterable of ints. Specifies the layers of the common tower.
    output_dim: The output dimension of the network. If 1, the common tower
      will be a QNetwork. Otherwise, the common tower will be an encoding
      network with the specified output dimension.
    arm_preprocessing_combiner: Preprocessing combiner for the arm features.

  Returns:
    A network that takes observations adhering to `observation_spec` and
    outputs reward estimates for every action.
  """
  global_network = encoding_network.EncodingNetwork(
      input_tensor_spec=observation_spec[bandit_spec_utils.GLOBAL_FEATURE_KEY],
      fc_layer_params=global_layers)
  arm_feature_spec = tensor_spec.remove_outer_dims_nest(
      observation_spec[bandit_spec_utils.PER_ARM_FEATURE_KEY], 1)
  arm_network = encoding_network.EncodingNetwork(
      input_tensor_spec=arm_feature_spec,
      fc_layer_params=arm_layers,
      preprocessing_combiner=arm_preprocessing_combiner)
  common_input_dim = global_layers[-1] + arm_layers[-1]
  common_input_spec = tensor_spec.TensorSpec(
      shape=(common_input_dim,), dtype=tf.float32)
  if output_dim == 1:
    common_network = q_network.QNetwork(
        input_tensor_spec=common_input_spec,
        action_spec=tensor_spec.BoundedTensorSpec(
            shape=(), minimum=0, maximum=0, dtype=tf.int32),
        fc_layer_params=common_layers)
  else:
    common_network = encoding_network.EncodingNetwork(
        input_tensor_spec=common_input_spec,
        fc_layer_params=list(common_layers) + [output_dim])
  return GlobalAndArmCommonTowerNetwork(observation_spec, global_network,
                                        arm_network, common_network)

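# A minimal usage sketch (not from the source). It assumes the standard
# bandit feature keys ('global' and 'per_arm') and an example spec with 7
# global features and 10 arms of 4 features each; inside the function,
# `remove_outer_dims_nest` strips the per-arm num_actions dimension before
# building the arm tower.
import tensorflow as tf
from tf_agents.bandits.specs import utils as bandit_spec_utils

observation_spec = {
    bandit_spec_utils.GLOBAL_FEATURE_KEY:
        tf.TensorSpec(shape=(7,), dtype=tf.float32),
    bandit_spec_utils.PER_ARM_FEATURE_KEY:
        tf.TensorSpec(shape=(10, 4), dtype=tf.float32),
}
network = create_feed_forward_common_tower_network(
    observation_spec,
    global_layers=(16, 8),
    arm_layers=(16, 8),
    common_layers=(8,))
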
def _check_compatible(spec, tensor, ignore_outer_dims=True):
  """Checks if `spec` is compatible with `tensor`, maybe ignoring outer dims."""
  if ignore_outer_dims:
    tensor = tensor_spec.remove_outer_dims_nest(
        tensor, tensor.shape.ndims - spec.shape.ndims)
  if not spec.is_compatible_with(tensor):
    raise ValueError('Tensor is incompatible with spec. spec = {0}, '
                     'tensor = {1}'.format(spec, tensor))

def _maybe_convert_to_spec(p):
  if isinstance(p, distribution_utils.Params):
    return _convert_to_spec_and_remove_singleton_batch_dim(p, outer_ndim)
  elif tf.is_tensor(p):
    return tensor_spec.remove_outer_dims_nest(
        tf.type_spec_from_value(p), num_outer_dims=outer_ndim)
  else:
    return p

def _calc_unbatched_spec(x):
  if isinstance(x, tfp.distributions.Distribution):
    parameters = distribution_utils.get_parameters(x)
    parameter_specs = _convert_to_spec_and_remove_singleton_batch_dim(
        parameters, outer_ndim=outer_ndim)
    return distribution_utils.DistributionSpecV2(
        event_shape=x.event_shape, dtype=x.dtype, parameters=parameter_specs)
  else:
    return tensor_spec.remove_outer_dims_nest(
        tf.type_spec_from_value(x), num_outer_dims=outer_ndim)

def create_chosen_arm_features_info_spec(
    observation_spec, observation_and_action_constraint_splitter):
  """Creates the chosen arm features info spec from the arm observation spec."""
  if observation_and_action_constraint_splitter is not None:
    observation_spec = observation_and_action_constraint_splitter(
        observation_spec)[0]
    if bandit_spec_utils.NUM_ACTIONS_FEATURE_KEY in observation_spec:
      raise ValueError('Variable number of actions and action masking '
                       'should not be used together.')
    logging.warning(
        'Action masking with per-arm features is discouraged. '
        'Instead, use variable number of actions via the `%s` feature key.',
        bandit_spec_utils.NUM_ACTIONS_FEATURE_KEY)
  arm_spec = observation_spec[bandit_spec_utils.PER_ARM_FEATURE_KEY]
  return tensor_spec.remove_outer_dims_nest(arm_spec, 1)

def _remove_num_actions_dim_from_spec(observation_spec):
  """Removes the extra `num_actions` dimension from the observation spec."""
  obs_spec_no_num_actions = {
      bandit_spec_utils.GLOBAL_FEATURE_KEY:
          observation_spec[bandit_spec_utils.GLOBAL_FEATURE_KEY],
      bandit_spec_utils.PER_ARM_FEATURE_KEY:
          tensor_spec.remove_outer_dims_nest(
              observation_spec[bandit_spec_utils.PER_ARM_FEATURE_KEY], 1)
  }
  if bandit_spec_utils.NUM_ACTIONS_FEATURE_KEY in observation_spec:
    obs_spec_no_num_actions.update({
        bandit_spec_utils.NUM_ACTIONS_FEATURE_KEY:
            observation_spec[bandit_spec_utils.NUM_ACTIONS_FEATURE_KEY]
    })
  return obs_spec_no_num_actions

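# A quick sketch (not from the source) of the spec transformation above,
# assuming per-arm features of shape (num_actions, arm_dim) = (10, 4): the
# per-arm entry loses its leading num_actions dimension while the global
# entry passes through unchanged.
import tensorflow as tf
from tf_agents.bandits.specs import utils as bandit_spec_utils

obs_spec = {
    bandit_spec_utils.GLOBAL_FEATURE_KEY:
        tf.TensorSpec(shape=(7,), dtype=tf.float32),
    bandit_spec_utils.PER_ARM_FEATURE_KEY:
        tf.TensorSpec(shape=(10, 4), dtype=tf.float32),
}
squeezed = _remove_num_actions_dim_from_spec(obs_spec)
# squeezed[bandit_spec_utils.PER_ARM_FEATURE_KEY].shape == (4,)
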
def _calc_unbatched_spec(x):
  """Builds the network output spec by removing the added batch dimension.

  Args:
    x: A `tfp.distributions.Distribution` or a `Tensor`.

  Returns:
    A spec, without the batch dimension, representing `x`.
  """
  if isinstance(x, tfp.distributions.Distribution):
    parameters = distribution_utils.get_parameters(x)
    parameter_specs = _convert_to_spec_and_remove_singleton_batch_dim(
        parameters, outer_ndim=1)
    return distribution_utils.DistributionSpecV2(
        event_shape=x.event_shape, dtype=x.dtype, parameters=parameter_specs)
  else:
    return tensor_spec.remove_outer_dims_nest(
        tf.type_spec_from_value(x), num_outer_dims=1)

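# A minimal sketch (not from the source) of the non-distribution branch:
# `tf.type_spec_from_value` yields the batched spec, and
# `remove_outer_dims_nest` strips the singleton batch dimension again.
import tensorflow as tf
from tf_agents.specs import tensor_spec

batched = tf.zeros([1, 5])
unbatched_spec = tensor_spec.remove_outer_dims_nest(
    tf.type_spec_from_value(batched), num_outer_dims=1)
# unbatched_spec == tf.TensorSpec(shape=(5,), dtype=tf.float32)
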
def testOuterDimsNestRemovesDimensionsFromSpecsThrows(self, dtype):
  if dtype == tf.string:
    self.skipTest("Not compatible with string type.")
  nested_spec = example_nested_tensor_spec(dtype)
  with self.assertRaises(ValueError):
    tensor_spec.remove_outer_dims_nest(nested_spec, 10)

def __init__(self,
             encoding_network,
             encoding_dim,
             reward_layer,
             epsilon_greedy,
             actions_from_reward_layer,
             cov_matrix,
             data_vector,
             num_samples,
             time_step_spec=None,
             alpha=1.0,
             emit_policy_info=(),
             emit_log_probability=False,
             accepts_per_arm_features=False,
             distributed_use_reward_layer=False,
             observation_and_action_constraint_splitter=None,
             name=None):
  """Initializes `NeuralLinUCBPolicy`.

  Args:
    encoding_network: Network that encodes the observations.
    encoding_dim: (int) Dimension of the encoded observations.
    reward_layer: Final layer that predicts the expected reward per arm. In
      case the policy accepts per-arm features, the output of this layer has
      to be a scalar. This is because in the per-arm case, all encoded
      observations have to go through the same computation to get the reward
      estimates. The `num_actions` dimension of the encoded observation is
      treated as a batch dimension in the reward layer.
    epsilon_greedy: (float) The probability of choosing a random action
      instead of the greedy action.
    actions_from_reward_layer: (boolean variable) Whether to get actions from
      the reward layer or from LinUCB.
    cov_matrix: List of the covariance matrices. There exists one covariance
      matrix per arm, unless the policy accepts per-arm features, in which
      case this list must have a single element.
    data_vector: List of the data vectors. A data vector is a weighted sum of
      the observations, where the weight is the corresponding reward. Each
      arm has its own data vector, unless the policy accepts per-arm
      features, in which case this list must have a single element.
    num_samples: List of number of samples per arm. If the policy accepts
      per-arm features, this is a single-element list counting the number of
      steps.
    time_step_spec: A `TimeStep` spec of the expected time_steps.
    alpha: (float) Non-negative weight multiplying the confidence intervals.
    emit_policy_info: (tuple of strings) What side information we want to get
      as part of the policy info. Allowed values can be found in
      `policy_utilities.PolicyInfo`.
    emit_log_probability: (bool) Whether to emit log probabilities.
    accepts_per_arm_features: (bool) Whether the policy accepts per-arm
      features.
    distributed_use_reward_layer: (bool) Whether to pick the actions using
      the network or use LinUCB. This applies only in the distributed
      training setting and has a similar role to the
      `actions_from_reward_layer` mentioned above.
    observation_and_action_constraint_splitter: A function used for masking
      valid/invalid actions with each state of the environment. The function
      takes in a full observation and returns a tuple consisting of 1) the
      part of the observation intended as input to the bandit policy and 2)
      the mask. The mask should be a 0-1 `Tensor` of shape `[batch_size,
      num_actions]`. This function should also work with a `TensorSpec` as
      input, and should output `TensorSpec` objects for the observation and
      mask.
    name: The name of this policy.
  """
""" encoding_network.create_variables() self._encoding_network = encoding_network self._reward_layer = reward_layer self._encoding_dim = encoding_dim if accepts_per_arm_features and reward_layer.units != 1: raise ValueError('The output dimension of the reward layer must be 1, got' ' {}'.format(reward_layer.units)) if not isinstance(cov_matrix, (list, tuple)): raise ValueError('cov_matrix must be a list of matrices (Tensors).') self._cov_matrix = cov_matrix if not isinstance(data_vector, (list, tuple)): raise ValueError('data_vector must be a list of vectors (Tensors).') self._data_vector = data_vector if not isinstance(num_samples, (list, tuple)): raise ValueError('num_samples must be a list of vectors (Tensors).') self._num_samples = num_samples self._alpha = alpha self._actions_from_reward_layer = actions_from_reward_layer self._epsilon_greedy = epsilon_greedy self._dtype = self._data_vector[0].dtype self._distributed_use_reward_layer = distributed_use_reward_layer if len(cov_matrix) != len(data_vector): raise ValueError('The size of list cov_matrix must match the size of ' 'list data_vector. Got {} for cov_matrix and {} ' 'for data_vector'.format( len(self._cov_matrix), len((data_vector)))) if len(num_samples) != len(cov_matrix): raise ValueError('The size of num_samples must match the size of ' 'list cov_matrix. Got {} for num_samples and {} ' 'for cov_matrix'.format( len(self._num_samples), len((cov_matrix)))) self._accepts_per_arm_features = accepts_per_arm_features if observation_and_action_constraint_splitter is not None: context_spec, _ = observation_and_action_constraint_splitter( time_step_spec.observation) else: context_spec = time_step_spec.observation if accepts_per_arm_features: self._num_actions = tf.nest.flatten(context_spec[ bandit_spec_utils.PER_ARM_FEATURE_KEY])[0].shape.as_list()[0] self._num_models = 1 else: self._num_actions = len(cov_matrix) self._num_models = self._num_actions cov_matrix_dim = tf.compat.dimension_value(cov_matrix[0].shape[0]) if self._encoding_dim != cov_matrix_dim: raise ValueError('The dimension of matrix `cov_matrix` must match ' 'encoding dimension {}.' 'Got {} for `cov_matrix`.'.format( self._encoding_dim, cov_matrix_dim)) data_vector_dim = tf.compat.dimension_value(data_vector[0].shape[0]) if self._encoding_dim != data_vector_dim: raise ValueError('The dimension of vector `data_vector` must match ' 'encoding dimension {}. 
' 'Got {} for `data_vector`.'.format( self._encoding_dim, data_vector_dim)) action_spec = tensor_spec.BoundedTensorSpec( shape=(), dtype=tf.int32, minimum=0, maximum=self._num_actions - 1, name='action') self._emit_policy_info = emit_policy_info predicted_rewards_mean = () if policy_utilities.InfoFields.PREDICTED_REWARDS_MEAN in emit_policy_info: predicted_rewards_mean = tensor_spec.TensorSpec( [self._num_actions], dtype=tf.float32) predicted_rewards_optimistic = () if (policy_utilities.InfoFields.PREDICTED_REWARDS_OPTIMISTIC in emit_policy_info): predicted_rewards_optimistic = tensor_spec.TensorSpec( [self._num_actions], dtype=tf.float32) if accepts_per_arm_features: arm_spec = context_spec[bandit_spec_utils.PER_ARM_FEATURE_KEY] chosen_arm_features_info_spec = tensor_spec.remove_outer_dims_nest( arm_spec, 1) info_spec = policy_utilities.PerArmPolicyInfo( predicted_rewards_mean=predicted_rewards_mean, predicted_rewards_optimistic=predicted_rewards_optimistic, chosen_arm_features=chosen_arm_features_info_spec) else: info_spec = policy_utilities.PolicyInfo( predicted_rewards_mean=predicted_rewards_mean, predicted_rewards_optimistic=predicted_rewards_optimistic) super(NeuralLinUCBPolicy, self).__init__( time_step_spec=time_step_spec, action_spec=action_spec, emit_log_probability=emit_log_probability, observation_and_action_constraint_splitter=( observation_and_action_constraint_splitter), info_spec=info_spec, name=name)
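# A construction sketch (not from the source), assuming a flat observation
# spec and three independent arms; the LinUCB statistics start from identity
# covariance matrices, zero data vectors, and zero sample counts, and the
# reward layer predicts one reward per arm in the non-per-arm case.
import tensorflow as tf
from tf_agents.networks import encoding_network
from tf_agents.specs import tensor_spec
from tf_agents.trajectories import time_step as ts

encoding_dim = 8
num_actions = 3
obs_spec = tensor_spec.TensorSpec(shape=(5,), dtype=tf.float32)
policy = NeuralLinUCBPolicy(
    encoding_network=encoding_network.EncodingNetwork(
        input_tensor_spec=obs_spec, fc_layer_params=(encoding_dim,)),
    encoding_dim=encoding_dim,
    reward_layer=tf.keras.layers.Dense(num_actions),
    epsilon_greedy=0.1,
    actions_from_reward_layer=tf.constant(True),
    cov_matrix=[tf.eye(encoding_dim) for _ in range(num_actions)],
    data_vector=[tf.zeros([encoding_dim]) for _ in range(num_actions)],
    num_samples=[tf.zeros([], dtype=tf.int64) for _ in range(num_actions)],
    time_step_spec=ts.time_step_spec(obs_spec))
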
def create_chosen_arm_features_info_spec(
    observation_spec: types.NestedTensorSpec) -> types.NestedTensorSpec:
  """Creates the chosen arm features info spec from the arm observation spec."""
  arm_spec = observation_spec[bandit_spec_utils.PER_ARM_FEATURE_KEY]
  return tensor_spec.remove_outer_dims_nest(arm_spec, 1)

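# A one-line usage sketch (not from the source): with per-arm features of
# shape (num_arms, arm_dim) = (10, 4), the chosen-arm info spec keeps only
# the trailing per-arm shape (4,).
import tensorflow as tf
from tf_agents.bandits.specs import utils as bandit_spec_utils

chosen_spec = create_chosen_arm_features_info_spec({
    bandit_spec_utils.PER_ARM_FEATURE_KEY:
        tf.TensorSpec(shape=(10, 4), dtype=tf.float32)
})
# chosen_spec.shape == (4,)
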
def __init__(self,
             time_step_spec=None,
             action_spec=None,
             reward_network=None,
             observation_and_action_constraint_splitter=None,
             accepts_per_arm_features=False,
             constraints=(),
             emit_policy_info=(),
             name=None):
  """Builds a GreedyRewardPredictionPolicy given a reward tf_agents.Network.

  This policy takes a tf_agents.Network predicting rewards and generates the
  action corresponding to the largest predicted reward.

  Args:
    time_step_spec: A `TimeStep` spec of the expected time_steps.
    action_spec: A nest of `BoundedTensorSpec` representing the actions.
    reward_network: An instance of a `tf_agents.network.Network`, callable
      via `network(observation, step_type) -> (output, final_state)`.
    observation_and_action_constraint_splitter: A function used for masking
      valid/invalid actions with each state of the environment. The function
      takes in a full observation and returns a tuple consisting of 1) the
      part of the observation intended as input to the network and 2) the
      mask. The mask should be a 0-1 `Tensor` of shape `[batch_size,
      num_actions]`. This function should also work with a `TensorSpec` as
      input, and should output `TensorSpec` objects for the observation and
      mask.
    accepts_per_arm_features: (bool) Whether the policy accepts per-arm
      features.
    constraints: Iterable of constraints objects that are instances of
      `tf_agents.bandits.agents.NeuralConstraint`.
    emit_policy_info: (tuple of strings) What side information we want to get
      as part of the policy info. Allowed values can be found in
      `policy_utilities.PolicyInfo`.
    name: The name of this policy. All variables in this module will fall
      under that name. Defaults to the class name.

  Raises:
    NotImplementedError: If `action_spec` contains more than one
      `BoundedTensorSpec` or the `BoundedTensorSpec` is not valid.
  """
  flat_action_spec = tf.nest.flatten(action_spec)
  if len(flat_action_spec) > 1:
    raise NotImplementedError(
        'action_spec can only contain a single BoundedTensorSpec.')

  action_spec = flat_action_spec[0]
  if (not tensor_spec.is_bounded(action_spec) or
      not tensor_spec.is_discrete(action_spec) or
      action_spec.shape.rank > 1 or
      action_spec.shape.num_elements() != 1):
    raise NotImplementedError(
        'action_spec must be a BoundedTensorSpec of type int32 and shape (). '
        'Found {}.'.format(action_spec))
  self._expected_num_actions = action_spec.maximum - action_spec.minimum + 1
  self._action_offset = action_spec.minimum
  reward_network.create_variables()
  self._reward_network = reward_network
  self._constraints = constraints

  self._emit_policy_info = emit_policy_info
  predicted_rewards_mean = ()
  if policy_utilities.InfoFields.PREDICTED_REWARDS_MEAN in emit_policy_info:
    predicted_rewards_mean = tensor_spec.TensorSpec(
        [self._expected_num_actions])
  bandit_policy_type = ()
  if policy_utilities.InfoFields.BANDIT_POLICY_TYPE in emit_policy_info:
    bandit_policy_type = (
        policy_utilities.create_bandit_policy_type_tensor_spec(shape=[1]))
  if accepts_per_arm_features:
    # The features for the chosen arm are saved to policy_info.
    observation = time_step_spec.observation
    if observation_and_action_constraint_splitter is not None:
      observation = observation_and_action_constraint_splitter(observation)[0]
    arm_spec = observation[bandit_spec_utils.PER_ARM_FEATURE_KEY]
    chosen_arm_features_info = tensor_spec.remove_outer_dims_nest(arm_spec, 1)
    info_spec = policy_utilities.PerArmPolicyInfo(
        predicted_rewards_mean=predicted_rewards_mean,
        bandit_policy_type=bandit_policy_type,
        chosen_arm_features=chosen_arm_features_info)
  else:
    info_spec = policy_utilities.PolicyInfo(
        predicted_rewards_mean=predicted_rewards_mean,
        bandit_policy_type=bandit_policy_type)
  self._accepts_per_arm_features = accepts_per_arm_features

  super(GreedyRewardPredictionPolicy, self).__init__(
      time_step_spec,
      action_spec,
      policy_state_spec=reward_network.state_spec,
      clip=False,
      info_spec=info_spec,
      observation_and_action_constraint_splitter=(
          observation_and_action_constraint_splitter),
      name=name)

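# A minimal construction sketch (not from the source): a `QNetwork` over a
# flat observation spec serves as the reward network, and the policy greedily
# picks the action with the highest predicted reward.
import tensorflow as tf
from tf_agents.networks import q_network
from tf_agents.specs import tensor_spec
from tf_agents.trajectories import time_step as ts

obs_spec = tensor_spec.TensorSpec(shape=(5,), dtype=tf.float32)
action_spec = tensor_spec.BoundedTensorSpec(
    shape=(), dtype=tf.int32, minimum=0, maximum=2)
policy = GreedyRewardPredictionPolicy(
    time_step_spec=ts.time_step_spec(obs_spec),
    action_spec=action_spec,
    reward_network=q_network.QNetwork(obs_spec, action_spec))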