def __init__( self, time_step_spec: types.TimeStep, action_spec: types.BoundedTensorSpec, constraint_network: types.Network, error_loss_fn: types.LossFn = tf.compat.v1.losses.mean_squared_error, name: Optional[Text] = 'NeuralConstraint'): """Creates a trainable constraint using a neural network. Args: time_step_spec: A `TimeStep` spec of the expected time_steps. action_spec: A nest of `BoundedTensorSpec` representing the actions. constraint_network: An instance of `tf_agents.network.Network` used to provide estimates of action feasibility. The input structure should be consistent with the `observation_spec`. error_loss_fn: A function for computing the loss used to train the constraint network. The default is `tf.losses.mean_squared_error`. name: Python str name of this constraint. All variables in this module will fall under that name. Defaults to the class name. """ super(NeuralConstraint, self).__init__( time_step_spec, action_spec, name) self._num_actions = policy_utilities.get_num_actions_from_tensor_spec( action_spec) with self.name_scope: constraint_network.create_variables() self._constraint_network = constraint_network self._error_loss_fn = error_loss_fn
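# Illustrative sketch only (not part of the library): constructing a
# `NeuralConstraint` whose network predicts per-action feasibility from the
# observation. Assumes `tensor_spec` (tf_agents.specs), `ts`
# (tf_agents.trajectories.time_step) and `q_network`
# (tf_agents.networks.q_network) are imported; the function and spec names
# below are hypothetical.
def _example_neural_constraint():
  observation_spec = tensor_spec.TensorSpec([4], tf.float32)
  time_step_spec = ts.time_step_spec(observation_spec)
  action_spec = tensor_spec.BoundedTensorSpec(
      dtype=tf.int32, shape=(), minimum=0, maximum=2)
  # A network emitting one feasibility estimate per action.
  constraint_net = q_network.QNetwork(
      observation_spec, action_spec, fc_layer_params=(16,))
  return NeuralConstraint(
      time_step_spec=time_step_spec,
      action_spec=action_spec,
      constraint_network=constraint_net)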
def build_laplacian_over_ordinal_integer_actions( action_spec: types.BoundedTensorSpec) -> types.Tensor: """Build the unnormalized Laplacian matrix over ordinal integer actions. Assuming integer actions, this function builds the (unnormalized) Laplacian matrix of the graph implied over the action space. The graph vertices are the integers {0, ..., action_spec.maximum}. Two vertices are adjacent if they correspond to consecutive integer actions. The `action_spec` must specify a scalar int32 or int64 with minimum zero. Args: action_spec: a `BoundedTensorSpec`. Returns: The graph Laplacian matrix (float tensor) of size equal to the number of actions. The diagonal elements equal the vertex degrees (2 for interior actions, 1 for the two endpoint actions), the entries for adjacent actions are equal to -1, and all other entries are 0. Raises: ValueError: if `action_spec` is not a bounded scalar int32 or int64 spec with minimum 0. """ num_actions = policy_utilities.get_num_actions_from_tensor_spec( action_spec) adjacency_matrix = np.zeros([num_actions, num_actions]) for i in range(num_actions - 1): adjacency_matrix[i, i + 1] = 1.0 adjacency_matrix[i + 1, i] = 1.0 laplacian_matrix = np.diag(np.sum(adjacency_matrix, axis=0)) - adjacency_matrix return laplacian_matrix
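# Illustrative sketch only (not part of the library): the matrix produced for a
# 4-action ordinal spec is the path-graph Laplacian, with vertex degrees on the
# diagonal and -1 between consecutive actions. Assumes `tensor_spec`
# (tf_agents.specs) is imported; the spec and function name below are
# hypothetical.
def _example_laplacian_for_four_actions():
  example_spec = tensor_spec.BoundedTensorSpec(
      dtype=tf.int32, shape=(), minimum=0, maximum=3)
  laplacian = build_laplacian_over_ordinal_integer_actions(example_spec)
  # laplacian ==
  # [[ 1., -1.,  0.,  0.],
  #  [-1.,  2., -1.,  0.],
  #  [ 0., -1.,  2., -1.],
  #  [ 0.,  0., -1.,  1.]]
  return laplacian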
def __init__(self, time_step_spec: types.TimeStep, action_spec: types.BoundedTensorSpec, input_network: Optional[types.Network] = None, name: Optional[Text] = 'InputNetworkConstraint'): """Creates a constraint using an input network. Args: time_step_spec: A `TimeStep` spec of the expected time_steps. action_spec: A nest of `BoundedTensorSpec` representing the actions. input_network: An instance of `tf_agents.network.Network` used to provide estimates of action feasibility. name: Python str name of this constraint. All variables in this module will fall under that name. Defaults to the class name. """ super(InputNetworkConstraint, self).__init__(time_step_spec, action_spec, name) self._num_actions = policy_utilities.get_num_actions_from_tensor_spec( action_spec) self._network = input_network
def __init__( self, original_environment: bandit_tf_environment.BanditTFEnvironment, action_constraint_join_fn: Callable[ [types.TensorSpec, types.TensorSpec], types.TensorSpec], action_probability: float): """Initializes a `BernoulliActionMaskTFEnvironment`. Args: original_environment: Instance of `BanditTFEnvironment`. This environment will be wrapped. action_constraint_join_fn: A function that joins the observation from the original environment with the generated masks. action_probability: The probability that any action in the action space is allowed by the generated mask. """ self._original_environment = original_environment assert isinstance( original_environment, bandit_tf_environment.BanditTFEnvironment ), 'The wrapped environment needs to be a `BanditTFEnvironment`.' self._action_constraint_join_fn = action_constraint_join_fn self._action_probability = action_probability self._batch_size = self._original_environment.batch_size action_spec = self._original_environment.action_spec() observation_spec_without_mask = ( self._original_environment.time_step_spec().observation) self._num_actions = policy_utilities.get_num_actions_from_tensor_spec( action_spec) mask_spec = tf.TensorSpec([self._num_actions], dtype=tf.int32) joined_observation_spec = self._action_constraint_join_fn( observation_spec_without_mask, mask_spec) time_step_spec = ts.time_step_spec(joined_observation_spec) self._current_mask = tf.compat.v2.Variable( tf.ones([self.batch_size, self._num_actions], dtype=tf.int32)) super(BernoulliActionMaskTFEnvironment, self).__init__(time_step_spec=time_step_spec, action_spec=action_spec, batch_size=self._batch_size)
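# Illustrative sketch only (not part of the library): a minimal
# `action_constraint_join_fn` that pairs the original observation with the
# generated mask. It works both on Tensors at step time and on TensorSpecs at
# construction time, since a tuple of specs is itself a valid nested spec. The
# matching agent-side `observation_and_action_constraint_splitter` would simply
# return the two tuple elements. All names below are hypothetical.
def _example_join_observation_and_mask(observation, mask):
  return (observation, mask)


def _example_observation_and_mask_splitter(observation_and_mask):
  observation, mask = observation_and_mask
  return observation, mask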
def __init__(self, time_step_spec: types.TimeStep, action_spec: types.BoundedTensorSpec, learning_rate: float, name: Optional[Text] = None): """Initialize an instance of `Exp3Agent`. Args: time_step_spec: A `TimeStep` spec describing the expected `TimeStep`s. action_spec: A scalar `BoundedTensorSpec` with `int32` or `int64` dtype describing the number of actions for this agent. learning_rate: A float valued scalar. A higher value will force the agent to converge on a single action more quickly. A lower value will encourage more exploration. This value corresponds to the `inverse_temperature` argument passed to `CategoricalPolicy`. name: a name for this instance of `Exp3Agent`. """ tf.Module.__init__(self, name=name) common.tf_agents_gauge.get_cell('TFABandit').set(True) self._num_actions = policy_utilities.get_num_actions_from_tensor_spec( action_spec) self._weights = tf.compat.v2.Variable(tf.zeros(self._num_actions), name='weights') self._learning_rate = tf.compat.v2.Variable(learning_rate, name='learning_rate') policy = categorical_policy.CategoricalPolicy( weights=self._weights, time_step_spec=time_step_spec, action_spec=action_spec, inverse_temperature=self._learning_rate) # TODO(b/127462472): consider policy=GreedyPolicy(collect_policy). super(Exp3Agent, self).__init__(time_step_spec=time_step_spec, action_spec=policy.action_spec, policy=policy, collect_policy=policy, train_sequence_length=None) self._as_trajectory = data_converter.AsTrajectory(self.data_context, sequence_length=None)
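# Hypothetical usage sketch (not part of the library): an `Exp3Agent` over a
# 5-armed bandit with a single dummy float observation. Assumes `tensor_spec`
# (tf_agents.specs) and `ts` (tf_agents.trajectories.time_step) are imported;
# the function name is made up for illustration.
def _example_exp3_agent():
  observation_spec = tensor_spec.TensorSpec([1], tf.float32)
  time_step_spec = ts.time_step_spec(observation_spec)
  action_spec = tensor_spec.BoundedTensorSpec(
      dtype=tf.int32, shape=(), minimum=0, maximum=4)
  # A larger learning rate concentrates on a single arm faster; a smaller one
  # keeps exploring longer.
  return Exp3Agent(
      time_step_spec=time_step_spec,
      action_spec=action_spec,
      learning_rate=1.0)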
def testNumActionsFromTensorSpecWrongRank(self): action_spec = tensor_spec.BoundedTensorSpec( dtype=tf.int32, shape=(2, 3), minimum=0, maximum=15) with self.assertRaisesRegex(ValueError, r'Action spec must be a scalar'): policy_utilities.get_num_actions_from_tensor_spec(action_spec)
def testNumActionsFromTensorSpecGoodSpec(self): action_spec = tensor_spec.BoundedTensorSpec( dtype=tf.int32, shape=(), minimum=0, maximum=15) num_actions = policy_utilities.get_num_actions_from_tensor_spec(action_spec) self.assertEqual(num_actions, 16)
def __init__(self, exploration_policy, time_step_spec: types.TimeStep, action_spec: types.BoundedTensorSpec, variable_collection: Optional[ LinearBanditVariableCollection] = None, alpha: float = 1.0, gamma: float = 1.0, use_eigendecomp: bool = False, tikhonov_weight: float = 1.0, add_bias: bool = False, emit_policy_info: Sequence[Text] = (), emit_log_probability: bool = False, observation_and_action_constraint_splitter: Optional[ types.Splitter] = None, accepts_per_arm_features: bool = False, debug_summaries: bool = False, summarize_grads_and_vars: bool = False, enable_summaries: bool = True, dtype: tf.DType = tf.float32, name: Optional[Text] = None): """Initialize an instance of `LinearBanditAgent`. Args: exploration_policy: An Enum of type `ExplorationPolicy`. The kind of policy we use for exploration. Currently supported policies are `LinUCBPolicy` and `LinearThompsonSamplingPolicy`. time_step_spec: A `TimeStep` spec describing the expected `TimeStep`s. action_spec: A scalar `BoundedTensorSpec` with `int32` or `int64` dtype describing the number of actions for this agent. variable_collection: Instance of `LinearBanditVariableCollection`. Collection of variables to be updated by the agent. If `None`, a new instance of `LinearBanditVariableCollection` will be created. alpha: (float) positive scalar. This is the exploration parameter that multiplies the confidence intervals. gamma: a float forgetting factor in [0.0, 1.0]. When set to 1.0, the algorithm does not forget. use_eigendecomp: whether to use eigen-decomposition or not. The default solver is Conjugate Gradient. tikhonov_weight: (float) tikhonov regularization term. add_bias: If true, a bias term will be added to the linear reward estimation. emit_policy_info: (tuple of strings) what side information we want to get as part of the policy info. Allowed values can be found in `policy_utilities.PolicyInfo`. emit_log_probability: Whether the policy emits log-probabilities or not. Since the policy is deterministic, the probability is just 1. observation_and_action_constraint_splitter: A function used for masking valid/invalid actions with each state of the environment. The function takes in a full observation and returns a tuple consisting of 1) the part of the observation intended as input to the bandit agent and policy, and 2) the boolean mask. This function should also work with a `TensorSpec` as input, and should output `TensorSpec` objects for the observation and mask. accepts_per_arm_features: (bool) Whether the agent accepts per-arm features. debug_summaries: A Python bool, default False. When True, debug summaries are gathered. summarize_grads_and_vars: A Python bool, default False. When True, gradients and network variable summaries are written during training. enable_summaries: A Python bool, default True. When False, all summaries (debug or otherwise) should not be written. dtype: The type of the parameters stored and updated by the agent. Should be one of `tf.float32` and `tf.float64`. Defaults to `tf.float32`. name: a name for this instance of `LinearBanditAgent`. Raises: ValueError if dtype is not one of `tf.float32` or `tf.float64`. TypeError if variable_collection is not an instance of `LinearBanditVariableCollection`. 
""" tf.Module.__init__(self, name=name) common.tf_agents_gauge.get_cell('TFABandit').set(True) self._num_actions = policy_utilities.get_num_actions_from_tensor_spec( action_spec) self._num_models = 1 if accepts_per_arm_features else self._num_actions self._observation_and_action_constraint_splitter = ( observation_and_action_constraint_splitter) self._time_step_spec = time_step_spec self._accepts_per_arm_features = accepts_per_arm_features self._add_bias = add_bias if observation_and_action_constraint_splitter is not None: context_spec, _ = observation_and_action_constraint_splitter( time_step_spec.observation) else: context_spec = time_step_spec.observation (self._global_context_dim, self._arm_context_dim) = bandit_spec_utils.get_context_dims_from_spec( context_spec, accepts_per_arm_features) if self._add_bias: # The bias is added via a constant 1 feature. self._global_context_dim += 1 self._overall_context_dim = self._global_context_dim + self._arm_context_dim self._alpha = alpha if variable_collection is None: variable_collection = LinearBanditVariableCollection( context_dim=self._overall_context_dim, num_models=self._num_models, use_eigendecomp=use_eigendecomp, dtype=dtype) elif not isinstance(variable_collection, LinearBanditVariableCollection): raise TypeError('Parameter `variable_collection` should be ' 'of type `LinearBanditVariableCollection`.') self._variable_collection = variable_collection self._cov_matrix_list = variable_collection.cov_matrix_list self._data_vector_list = variable_collection.data_vector_list self._eig_matrix_list = variable_collection.eig_matrix_list self._eig_vals_list = variable_collection.eig_vals_list # We keep track of the number of samples per arm. self._num_samples_list = variable_collection.num_samples_list self._gamma = gamma if self._gamma < 0.0 or self._gamma > 1.0: raise ValueError( 'Forgetting factor `gamma` must be in [0.0, 1.0].') self._dtype = dtype if dtype not in (tf.float32, tf.float64): raise ValueError( 'Agent dtype should be either `tf.float32 or `tf.float64`.') self._use_eigendecomp = use_eigendecomp self._tikhonov_weight = tikhonov_weight if exploration_policy == ExplorationPolicy.linear_ucb_policy: exploration_strategy = lin_policy.ExplorationStrategy.optimistic elif exploration_policy == ( ExplorationPolicy.linear_thompson_sampling_policy): exploration_strategy = lin_policy.ExplorationStrategy.sampling else: raise ValueError( 'Linear bandit agent with policy %s not implemented' % exploration_policy) policy = lin_policy.LinearBanditPolicy( action_spec=action_spec, cov_matrix=self._cov_matrix_list, data_vector=self._data_vector_list, num_samples=self._num_samples_list, time_step_spec=time_step_spec, exploration_strategy=exploration_strategy, alpha=alpha, eig_vals=self._eig_vals_list if self._use_eigendecomp else (), eig_matrix=self._eig_matrix_list if self._use_eigendecomp else (), tikhonov_weight=self._tikhonov_weight, add_bias=add_bias, emit_policy_info=emit_policy_info, emit_log_probability=emit_log_probability, accepts_per_arm_features=accepts_per_arm_features, observation_and_action_constraint_splitter=( observation_and_action_constraint_splitter)) training_data_spec = None if accepts_per_arm_features: training_data_spec = bandit_spec_utils.drop_arm_observation( policy.trajectory_spec) super(LinearBanditAgent, self).__init__(time_step_spec=time_step_spec, action_spec=action_spec, policy=policy, collect_policy=policy, training_data_spec=training_data_spec, debug_summaries=debug_summaries, 
summarize_grads_and_vars=summarize_grads_and_vars, enable_summaries=enable_summaries, train_sequence_length=None) self._as_trajectory = data_converter.AsTrajectory(self.data_context, sequence_length=None)
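# Hypothetical usage sketch (not part of the library): a LinUCB-flavored
# `LinearBanditAgent` over a 3-armed bandit with a 4-dimensional context.
# Assumes `tensor_spec` (tf_agents.specs) and `ts`
# (tf_agents.trajectories.time_step) are imported; the function name is made
# up for illustration.
def _example_linear_bandit_agent():
  observation_spec = tensor_spec.TensorSpec([4], tf.float32)
  time_step_spec = ts.time_step_spec(observation_spec)
  action_spec = tensor_spec.BoundedTensorSpec(
      dtype=tf.int32, shape=(), minimum=0, maximum=2)
  return LinearBanditAgent(
      exploration_policy=ExplorationPolicy.linear_ucb_policy,
      time_step_spec=time_step_spec,
      action_spec=action_spec,
      alpha=1.0,  # Exploration strength multiplying the confidence intervals.
      gamma=1.0,  # No forgetting.
      dtype=tf.float32)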
def __init__( self, time_step_spec: types.TimeStep, action_spec: types.BoundedTensorSpec, reward_network: types.Network, optimizer: types.Optimizer, observation_and_action_constraint_splitter: Optional[ types.Splitter] = None, accepts_per_arm_features: bool = False, constraints: Iterable[constr.BaseConstraint] = (), # Params for training. error_loss_fn: types.LossFn = tf.compat.v1.losses.mean_squared_error, gradient_clipping: Optional[float] = None, # Params for debugging. debug_summaries: bool = False, summarize_grads_and_vars: bool = False, enable_summaries: bool = True, emit_policy_info: Tuple[Text, ...] = (), train_step_counter: Optional[tf.Variable] = None, laplacian_matrix: Optional[types.Float] = None, laplacian_smoothing_weight: float = 0.001, name: Optional[Text] = None): """Creates a Greedy Reward Network Prediction Agent. In some use cases, the actions are not independent and they are related to each other (e.g., when the actions are ordinal integers). Assuming that the relations between arms can be modeled by a graph, we may want to enforce that the estimated reward function is smooth over the graph. This implies that the estimated rewards `r_i` and `r_j` for two related actions `i` and `j`, should be close to each other. To quantify this smoothness criterion we use the Laplacian matrix `L` of the graph over the actions. When the laplacian smoothing is enabled, the loss is extended to: ``` Loss_new := Loss + lambda r^T * L * r, ``` where `r` is the estimated reward vector for all actions. The second term is the laplacian smoothing regularization term and `lambda` is the weight that determines how strongly we enforce the regularization. For more details, please see: "Bandits on graphs and structures", Michal Valko https://hal.inria.fr/tel-01359757/document Args: time_step_spec: A `TimeStep` spec of the expected time_steps. action_spec: A nest of `BoundedTensorSpec` representing the actions. reward_network: A `tf_agents.network.Network` to be used by the agent. The network will be called with call(observation, step_type) and it is expected to provide a reward prediction for all actions. optimizer: The optimizer to use for training. observation_and_action_constraint_splitter: A function used for masking valid/invalid actions with each state of the environment. The function takes in a full observation and returns a tuple consisting of 1) the part of the observation intended as input to the bandit agent and policy, and 2) the boolean mask. This function should also work with a `TensorSpec` as input, and should output `TensorSpec` objects for the observation and mask. accepts_per_arm_features: (bool) Whether the policy accepts per-arm features. constraints: iterable of constraints objects that are instances of `tf_agents.bandits.agents.NeuralConstraint`. error_loss_fn: A function for computing the error loss, taking parameters labels, predictions, and weights (any function from tf.losses would work). The default is `tf.losses.mean_squared_error`. gradient_clipping: A float representing the norm length to clip gradients (or None for no clipping.) debug_summaries: A Python bool, default False. When True, debug summaries are gathered. summarize_grads_and_vars: A Python bool, default False. When True, gradients and network variable summaries are written during training. enable_summaries: A Python bool, default True. When False, all summaries (debug or otherwise) should not be written. emit_policy_info: (tuple of strings) what side information we want to get as part of the policy info. 
Allowed values can be found in `policy_utilities.PolicyInfo`. train_step_counter: An optional `tf.Variable` to increment every time the train op is run. Defaults to the `global_step`. laplacian_matrix: A float `Tensor` or a numpy array shaped `[num_actions, num_actions]`. This holds the Laplacian matrix used to regularize the smoothness of the estimated expected reward function. This only applies to problems where the actions have a graph structure. If `None`, the regularization is not applied. laplacian_smoothing_weight: A float that determines the weight of the regularization term. Note that this has no effect if `laplacian_matrix` above is `None`. name: Python str name of this agent. All variables in this module will fall under that name. Defaults to the class name. Raises: ValueError: If the action spec contains more than one action or it is not a bounded scalar int32 spec with minimum 0. InvalidArgumentError: if the Laplacian provided is not None and not valid. """ tf.Module.__init__(self, name=name) common.tf_agents_gauge.get_cell('TFABandit').set(True) self._observation_and_action_constraint_splitter = ( observation_and_action_constraint_splitter) self._num_actions = policy_utilities.get_num_actions_from_tensor_spec( action_spec) self._accepts_per_arm_features = accepts_per_arm_features self._constraints = constraints reward_network.create_variables() self._reward_network = reward_network self._optimizer = optimizer self._error_loss_fn = error_loss_fn self._gradient_clipping = gradient_clipping self._heteroscedastic = isinstance( reward_network, heteroscedastic_q_network.HeteroscedasticQNetwork) self._laplacian_matrix = None if laplacian_matrix is not None: self._laplacian_matrix = tf.convert_to_tensor( laplacian_matrix, dtype=tf.float32) # Check the validity of the laplacian matrix. tf.debugging.assert_near( 0.0, tf.norm(tf.reduce_sum(self._laplacian_matrix, 1))) tf.debugging.assert_near( 0.0, tf.norm(tf.reduce_sum(self._laplacian_matrix, 0))) self._laplacian_smoothing_weight = laplacian_smoothing_weight policy = greedy_reward_policy.GreedyRewardPredictionPolicy( time_step_spec, action_spec, reward_network, observation_and_action_constraint_splitter, constraints=constraints, accepts_per_arm_features=accepts_per_arm_features, emit_policy_info=emit_policy_info) training_data_spec = None if accepts_per_arm_features: training_data_spec = bandit_spec_utils.drop_arm_observation( policy.trajectory_spec) super(GreedyRewardPredictionAgent, self).__init__( time_step_spec, action_spec, policy, collect_policy=policy, train_sequence_length=None, training_data_spec=training_data_spec, debug_summaries=debug_summaries, summarize_grads_and_vars=summarize_grads_and_vars, enable_summaries=enable_summaries, train_step_counter=train_step_counter) self._as_trajectory = data_converter.AsTrajectory( self.data_context, sequence_length=None)
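# Illustrative sketch only (not part of the library): the Laplacian smoothing
# term added to the loss, lambda * r^T L r, for a path graph over 3 ordinal
# actions. For this graph it equals lambda times the sum of squared differences
# between the estimated rewards of adjacent actions. All names below are
# hypothetical.
def _example_laplacian_smoothing_penalty():
  laplacian = tf.constant(
      [[1., -1., 0.], [-1., 2., -1.], [0., -1., 1.]], dtype=tf.float32)
  estimated_rewards = tf.constant([[0.1], [0.9], [0.2]], dtype=tf.float32)
  smoothing_weight = 0.001
  # r^T L r == (0.1 - 0.9)^2 + (0.9 - 0.2)^2 == 1.13 for this example.
  penalty = smoothing_weight * tf.matmul(
      estimated_rewards, tf.matmul(laplacian, estimated_rewards),
      transpose_a=True)
  return penalty  # A [1, 1] tensor holding 0.001 * 1.13.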
def __init__( self, time_step_spec: Optional[ts.TimeStep], action_spec: Optional[types.NestedBoundedTensorSpec], scalarizer: multi_objective_scalarizer.Scalarizer, objective_network_and_loss_fn_sequence: Sequence[Tuple[ Network, Callable[..., tf.Tensor]]], optimizer: tf.keras.optimizers.Optimizer, observation_and_action_constraint_splitter: types.Splitter = None, accepts_per_arm_features: bool = False, # Params for training. gradient_clipping: Optional[float] = None, # Params for debugging. debug_summaries: bool = False, summarize_grads_and_vars: bool = False, enable_summaries: bool = True, emit_policy_info: Tuple[Text, ...] = (), train_step_counter: Optional[tf.Variable] = None, name: Optional[Text] = None): """Creates a Greedy Multi-objective Neural Agent. Args: time_step_spec: A `TimeStep` spec of the expected time_steps. action_spec: A nest of `BoundedTensorSpec` representing the actions. scalarizer: A `tf_agents.bandits.multi_objective.multi_objective_scalarizer.Scalarizer` object that implements scalarization of multiple objectives into a single scalar reward. objective_network_and_loss_fn_sequence: A Sequence of Tuples (`tf_agents.network.Network`, error loss function) to be used by the agent. Each network `net` will be called as `net(observation, training=...)` and is expected to output a `tf.Tensor` of predicted values for a specific objective for all actions, shaped as [batch-size, number-of-actions]. Each network will be trained via minimizing the accompanying error loss function, which takes parameters labels, predictions, and weights (any function from tf.losses would work). optimizer: A `tf.keras.optimizers.Optimizer` object, the optimizer to use for training. observation_and_action_constraint_splitter: A function used for masking valid/invalid actions with each state of the environment. The function takes in a full observation and returns a tuple consisting of 1) the part of the observation intended as input to the bandit agent and policy, and 2) the boolean mask of shape `[batch_size, num_actions]`. This function should also work with a `TensorSpec` as input, and should output `TensorSpec` objects for the observation and mask. accepts_per_arm_features: (bool) Whether the agent accepts per-arm features. gradient_clipping: A float representing the norm length to clip gradients (or None for no clipping.) debug_summaries: A Python bool, default False. When True, debug summaries are gathered. summarize_grads_and_vars: A Python bool, default False. When True, gradients and network variable summaries are written during training. enable_summaries: A Python bool, default True. When False, all summaries (debug or otherwise) should not be written. emit_policy_info: (tuple of strings) what side information we want to get as part of the policy info. Allowed values can be found in `policy_utilities.PolicyInfo`. train_step_counter: An optional `tf.Variable` to increment every time the train op is run. Defaults to the `global_step`. name: Python str name of this agent. All variables in this module will fall under that name. Defaults to the class name. Raises: ValueError: - If the action spec contains more than one action or it is not a bounded scalar int32 spec with minimum 0. - If the length of `objective_network_and_loss_fn_sequence` is less than two.
""" tf.Module.__init__(self, name=name) common.tf_agents_gauge.get_cell('TFABandit').set(True) self._observation_and_action_constraint_splitter = ( observation_and_action_constraint_splitter) self._num_actions = policy_utilities.get_num_actions_from_tensor_spec( action_spec) self._accepts_per_arm_features = accepts_per_arm_features self._num_objectives = len(objective_network_and_loss_fn_sequence) if self._num_objectives < 2: raise ValueError( 'Number of objectives should be at least two, but found to be {}' .format(self._num_objectives)) self._objective_networks, self._error_loss_fns = tuple( zip(*objective_network_and_loss_fn_sequence)) self._optimizer = optimizer self._gradient_clipping = gradient_clipping self._heteroscedastic = [ isinstance(network, heteroscedastic_q_network.HeteroscedasticQNetwork) for network in self._objective_networks ] policy = greedy_multi_objective_policy.GreedyMultiObjectiveNeuralPolicy( time_step_spec, action_spec, scalarizer, self._objective_networks, observation_and_action_constraint_splitter, accepts_per_arm_features=accepts_per_arm_features, emit_policy_info=emit_policy_info) training_data_spec = None if accepts_per_arm_features: training_data_spec = bandit_spec_utils.drop_arm_observation( policy.trajectory_spec) super(GreedyMultiObjectiveNeuralAgent, self).__init__(time_step_spec, action_spec, policy, collect_policy=policy, train_sequence_length=None, training_data_spec=training_data_spec, debug_summaries=debug_summaries, summarize_grads_and_vars=summarize_grads_and_vars, enable_summaries=enable_summaries, train_step_counter=train_step_counter) self._as_trajectory = data_converter.AsTrajectory(self.data_context, sequence_length=None)
def __init__( self, time_step_spec: types.TimeStep, action_spec: types.BoundedTensorSpec, encoding_network: types.Network, encoding_network_num_train_steps: int, encoding_dim: int, optimizer: types.Optimizer, variable_collection: Optional[ NeuralLinUCBVariableCollection] = None, alpha: float = 1.0, gamma: float = 1.0, epsilon_greedy: float = 0.0, observation_and_action_constraint_splitter: Optional[ types.Splitter] = None, accepts_per_arm_features: bool = False, distributed_train_encoding_network: bool = False, # Params for training. error_loss_fn: types.LossFn = tf.compat.v1.losses.mean_squared_error, gradient_clipping: Optional[float] = None, # Params for debugging. debug_summaries: bool = False, summarize_grads_and_vars: bool = False, train_step_counter: Optional[tf.Variable] = None, emit_policy_info: Sequence[Text] = (), emit_log_probability: bool = False, dtype: tf.DType = tf.float64, name: Optional[Text] = 'neural_linucb_agent'): """Initialize an instance of `NeuralLinUCBAgent`. Args: time_step_spec: A `TimeStep` spec describing the expected `TimeStep`s. action_spec: A scalar `BoundedTensorSpec` with `int32` or `int64` dtype describing the number of actions for this agent. encoding_network: a Keras network that encodes the observations. encoding_network_num_train_steps: how many training steps to run for training the encoding network before switching to LinUCB. If negative, the encoding network is assumed to be already trained. encoding_dim: the dimension of encoded observations. optimizer: The optimizer to use for training. variable_collection: Instance of `NeuralLinUCBVariableCollection`. Collection of variables to be updated by the agent. If `None`, a new instance of `NeuralLinUCBVariableCollection` will be created. Note that this collection excludes the variables owned by the encoding network. alpha: (float) positive scalar. This is the exploration parameter that multiplies the confidence intervals. gamma: a float forgetting factor in [0.0, 1.0]. When set to 1.0, the algorithm does not forget. epsilon_greedy: A float representing the probability of choosing a random action instead of the greedy action. observation_and_action_constraint_splitter: A function used for masking valid/invalid actions with each state of the environment. The function takes in a full observation and returns a tuple consisting of 1) the part of the observation intended as input to the bandit agent and policy, and 2) the boolean mask. This function should also work with a `TensorSpec` as input, and should output `TensorSpec` objects for the observation and mask. accepts_per_arm_features: (bool) Whether the policy accepts per-arm features. distributed_train_encoding_network: (bool) whether to train the encoding network or not. This applies only in a distributed training setting. When set to true this agent will train the encoding network. Otherwise, it will assume the encoding network is already trained and will train LinUCB on top of it. error_loss_fn: A function for computing the error loss, taking parameters labels, predictions, and weights (any function from tf.losses would work). The default is `tf.losses.mean_squared_error`. gradient_clipping: A float representing the norm length to clip gradients (or None for no clipping.) debug_summaries: A Python bool, default False. When True, debug summaries are gathered. summarize_grads_and_vars: A Python bool, default False. When True, gradients and network variable summaries are written during training.
train_step_counter: An optional `tf.Variable` to increment every time the train op is run. Defaults to the `global_step`. emit_policy_info: (tuple of strings) what side information we want to get as part of the policy info. Allowed values can be found in `policy_utilities.PolicyInfo`. emit_log_probability: Whether the NeuralLinUCBPolicy emits log-probabilities or not. Since the policy is deterministic, the probability is just 1. dtype: The type of the parameters stored and updated by the agent. Should be one of `tf.float32` and `tf.float64`. Defaults to `tf.float64`. name: a name for this instance of `NeuralLinUCBAgent`. Raises: TypeError if variable_collection is not an instance of `NeuralLinUCBVariableCollection`. ValueError if dtype is not one of `tf.float32` or `tf.float64`. """ tf.Module.__init__(self, name=name) common.tf_agents_gauge.get_cell('TFABandit').set(True) self._num_actions = policy_utilities.get_num_actions_from_tensor_spec( action_spec) self._num_models = 1 if accepts_per_arm_features else self._num_actions self._observation_and_action_constraint_splitter = ( observation_and_action_constraint_splitter) self._accepts_per_arm_features = accepts_per_arm_features self._alpha = alpha if variable_collection is None: variable_collection = NeuralLinUCBVariableCollection( self._num_models, encoding_dim, dtype) elif not isinstance(variable_collection, NeuralLinUCBVariableCollection): raise TypeError('Parameter `variable_collection` should be ' 'of type `NeuralLinUCBVariableCollection`.') self._variable_collection = variable_collection self._gamma = gamma if self._gamma < 0.0 or self._gamma > 1.0: raise ValueError( 'Forgetting factor `gamma` must be in [0.0, 1.0].') self._dtype = dtype if dtype not in (tf.float32, tf.float64): raise ValueError( 'Agent dtype should be either `tf.float32` or `tf.float64`.') self._epsilon_greedy = epsilon_greedy reward_layer = tf.keras.layers.Dense( self._num_models, kernel_initializer=tf.random_uniform_initializer(minval=-0.03, maxval=0.03), use_bias=False, activation=None, name='reward_layer') encoding_network.create_variables() self._encoding_network = encoding_network reward_layer.build(input_shape=tf.TensorShape([None, encoding_dim])) self._reward_layer = reward_layer self._encoding_network_num_train_steps = encoding_network_num_train_steps self._encoding_dim = encoding_dim self._optimizer = optimizer self._error_loss_fn = error_loss_fn self._gradient_clipping = gradient_clipping if train_step_counter is None: train_step_counter = tf.compat.v1.train.get_or_create_global_step() self._distributed_train_encoding_network = ( distributed_train_encoding_network) policy = neural_linucb_policy.NeuralLinUCBPolicy( encoding_network=self._encoding_network, encoding_dim=self._encoding_dim, reward_layer=self._reward_layer, epsilon_greedy=self._epsilon_greedy, actions_from_reward_layer=self.actions_from_reward_layer, cov_matrix=self.cov_matrix, data_vector=self.data_vector, num_samples=self.num_samples, time_step_spec=time_step_spec, alpha=alpha, emit_policy_info=emit_policy_info, emit_log_probability=emit_log_probability, accepts_per_arm_features=accepts_per_arm_features, distributed_use_reward_layer=distributed_train_encoding_network, observation_and_action_constraint_splitter=( observation_and_action_constraint_splitter)) training_data_spec = None if accepts_per_arm_features: training_data_spec = bandit_spec_utils.drop_arm_observation( policy.trajectory_spec) super(NeuralLinUCBAgent, self).__init__(time_step_spec=time_step_spec, action_spec=policy.action_spec, policy=policy,
collect_policy=policy, train_sequence_length=None, training_data_spec=training_data_spec, debug_summaries=debug_summaries, summarize_grads_and_vars=summarize_grads_and_vars, train_step_counter=train_step_counter) self._as_trajectory = data_converter.AsTrajectory(self.data_context, sequence_length=None)
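# Hypothetical usage sketch (not part of the library): a `NeuralLinUCBAgent`
# over a 3-armed bandit, where the encoder's last layer width matches
# `encoding_dim`. Assumes `tensor_spec` (tf_agents.specs), `ts`
# (tf_agents.trajectories.time_step) and `encoding_network`
# (tf_agents.networks.encoding_network) are imported; the function name is
# made up for illustration.
def _example_neural_linucb_agent():
  observation_spec = tensor_spec.TensorSpec([8], tf.float32)
  time_step_spec = ts.time_step_spec(observation_spec)
  action_spec = tensor_spec.BoundedTensorSpec(
      dtype=tf.int32, shape=(), minimum=0, maximum=2)
  encoder = encoding_network.EncodingNetwork(
      input_tensor_spec=observation_spec, fc_layer_params=(16, 4))
  return NeuralLinUCBAgent(
      time_step_spec=time_step_spec,
      action_spec=action_spec,
      encoding_network=encoder,
      encoding_network_num_train_steps=100,  # Then switch to LinUCB updates.
      encoding_dim=4,
      optimizer=tf.compat.v1.train.AdamOptimizer(learning_rate=0.01))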
def __init__(self, time_step_spec: types.TimeStep, action_spec: types.BoundedTensorSpec, variable_collection: Optional[ BernoulliBanditVariableCollection] = None, dtype: tf.DType = tf.float32, batch_size: Optional[int] = 1, observation_and_action_constraint_splitter: Optional[ types.Splitter] = None, emit_policy_info: Sequence[Text] = (), name: Optional[Text] = None): """Creates a Bernoulli Thompson Sampling Agent. Args: time_step_spec: A `TimeStep` spec of the expected time_steps. action_spec: A nest of `BoundedTensorSpec` representing the actions. variable_collection: Instance of `BernoulliBanditVariableCollection`. Collection of variables to be updated by the agent. If `None`, a new instance of `BernoulliBanditVariableCollection` will be created. dtype: The type of the variables. Should be one of `tf.float32` or `tf.float64`. batch_size: optional int with the batch size. It defaults to 1. observation_and_action_constraint_splitter: A function used for masking valid/invalid actions with each state of the environment. The function takes in a full observation and returns a tuple consisting of 1) the part of the observation intended as input to the bandit agent and policy, and 2) the boolean mask. This function should also work with a `TensorSpec` as input, and should output `TensorSpec` objects for the observation and mask. emit_policy_info: (tuple of strings) what side information we want to get as part of the policy info. Allowed values can be found in `policy_utilities.PolicyInfo`. name: Python str name of this agent. All variables in this module will fall under that name. Defaults to the class name. Raises: ValueError: If the action spec contains more than one action or it is not a bounded scalar int32 spec with minimum 0. TypeError: if variable_collection is not an instance of `BernoulliBanditVariableCollection`. """ tf.Module.__init__(self, name=name) common.tf_agents_gauge.get_cell('TFABandit').set(True) self._observation_and_action_constraint_splitter = ( observation_and_action_constraint_splitter) self._num_actions = policy_utilities.get_num_actions_from_tensor_spec( action_spec) self._dtype = dtype if variable_collection is None: variable_collection = BernoulliBanditVariableCollection( num_actions=self._num_actions, dtype=dtype) elif not isinstance(variable_collection, BernoulliBanditVariableCollection): raise TypeError('Parameter `variable_collection` should be ' 'of type `BernoulliBanditVariableCollection`.') self._variable_collection = variable_collection self._alpha = variable_collection.alpha self._beta = variable_collection.beta self._batch_size = batch_size policy = bernoulli_policy.BernoulliThompsonSamplingPolicy( time_step_spec, action_spec, self._alpha, self._beta, observation_and_action_constraint_splitter, emit_policy_info=emit_policy_info) super(BernoulliThompsonSamplingAgent, self).__init__(time_step_spec, action_spec, policy, collect_policy=policy, train_sequence_length=None) self._as_trajectory = data_converter.AsTrajectory(self.data_context, sequence_length=None)
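# Hypothetical usage sketch (not part of the library): a
# `BernoulliThompsonSamplingAgent` over a 4-armed Bernoulli bandit with a
# trivial observation. Assumes `tensor_spec` (tf_agents.specs) and `ts`
# (tf_agents.trajectories.time_step) are imported; the function name is made
# up for illustration.
def _example_bernoulli_thompson_sampling_agent():
  observation_spec = tensor_spec.TensorSpec([1], tf.float32)
  time_step_spec = ts.time_step_spec(observation_spec)
  action_spec = tensor_spec.BoundedTensorSpec(
      dtype=tf.int32, shape=(), minimum=0, maximum=3)
  return BernoulliThompsonSamplingAgent(
      time_step_spec=time_step_spec,
      action_spec=action_spec,
      batch_size=2)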