def testMaskedAction(self):
  tf.compat.v1.set_random_seed(1)
  action_spec = tensor_spec.BoundedTensorSpec((), tf.int32, 0, 2)
  observation_spec = (tensor_spec.TensorSpec([2], tf.float32),
                      tensor_spec.TensorSpec([3], tf.int32))
  time_step_spec = ts.time_step_spec(observation_spec)

  def split_fn(obs):
    return obs[0], obs[1]

  policy = boltzmann_reward_policy.BoltzmannRewardPredictionPolicy(
      time_step_spec,
      action_spec,
      reward_network=DummyNet(observation_spec[0]),
      observation_and_action_constraint_splitter=split_fn)

  observations = (tf.constant([[1, 2], [3, 4]], dtype=tf.float32),
                  tf.constant([[0, 0, 1], [0, 1, 0]], dtype=tf.int32))
  time_step = ts.restart(observations, batch_size=2)
  action_step = policy.action(time_step, seed=1)
  self.assertEqual(action_step.action.shape.as_list(), [2])
  self.assertEqual(action_step.action.dtype, tf.int32)
  # Initialize all variables
  self.evaluate(tf.compat.v1.global_variables_initializer())
  self.assertAllEqual(self.evaluate(action_step.action), [2, 1])
def testBoltzmannGumbelPredictedRewards(self):
  tf.compat.v1.set_random_seed(1)
  num_samples_list = []
  for k in range(3):
    num_samples_list.append(
        tf.compat.v2.Variable(
            tf.zeros([], dtype=tf.int32), name='num_samples_{}'.format(k)))
  num_samples_list[0].assign_add(2)
  num_samples_list[1].assign_add(4)
  num_samples_list[2].assign_add(1)
  policy = boltzmann_reward_policy.BoltzmannRewardPredictionPolicy(
      self._time_step_spec,
      self._action_spec,
      reward_network=DummyNet(self._obs_spec),
      boltzmann_gumbel_exploration_constant=10.0,
      emit_policy_info=('predicted_rewards_mean',),
      num_samples_list=num_samples_list)
  observations = tf.constant([[1, 2], [3, 4]], dtype=tf.float32)
  time_step = ts.restart(observations, batch_size=2)
  action_step = policy.action(time_step, seed=1)
  self.assertEqual(action_step.action.shape.as_list(), [2])
  self.assertEqual(action_step.action.dtype, tf.int32)
  # Initialize all variables
  self.evaluate(tf.compat.v1.global_variables_initializer())
  p_info = self.evaluate(action_step.info)
  self.assertAllEqual(p_info.predicted_rewards_mean.shape, [2, 3])
def testBuild(self):
  policy = boltzmann_reward_policy.BoltzmannRewardPredictionPolicy(
      self._time_step_spec,
      self._action_spec,
      reward_network=DummyNet(self._obs_spec))
  self.assertEqual(policy.time_step_spec, self._time_step_spec)
  self.assertEqual(policy.action_spec, self._action_spec)
def testMultipleActionsRaiseError(self):
  action_spec = [tensor_spec.BoundedTensorSpec((), tf.int32, 0, 2)] * 2
  with self.assertRaisesRegexp(
      NotImplementedError,
      'action_spec can only contain a single BoundedTensorSpec'):
    boltzmann_reward_policy.BoltzmannRewardPredictionPolicy(
        self._time_step_spec,
        action_spec,
        reward_network=DummyNet(self._obs_spec))
def testWrongActionsRaiseError(self):
  action_spec = tensor_spec.BoundedTensorSpec((5, 6, 7), tf.float32, 0, 2)
  with self.assertRaisesRegexp(
      NotImplementedError,
      'action_spec must be a BoundedTensorSpec of type int32.*'):
    boltzmann_reward_policy.BoltzmannRewardPredictionPolicy(
        self._time_step_spec,
        action_spec,
        reward_network=DummyNet(self._obs_spec))
def testActionHeteroscedastic(self):
  tf.compat.v1.set_random_seed(1)
  policy = boltzmann_reward_policy.BoltzmannRewardPredictionPolicy(
      self._time_step_spec,
      self._action_spec,
      reward_network=HeteroscedasticDummyNet())
  observations = tf.constant([[1, 2], [3, 4]], dtype=tf.float32)
  time_step = ts.restart(observations, batch_size=2)
  action_step = policy.action(time_step, seed=1)
  self.assertEqual(action_step.action.shape.as_list(), [2])
  self.assertEqual(action_step.action.dtype, tf.int32)
  # Initialize all variables
  self.evaluate(tf.compat.v1.global_variables_initializer())
  self.assertAllInSet(self.evaluate(action_step.action), [0, 1, 2])
def testWrongOutputLayerRaiseError(self):
  tf.compat.v1.set_random_seed(1)
  action_spec = tensor_spec.BoundedTensorSpec((), tf.int32, 10, 20)
  policy = boltzmann_reward_policy.BoltzmannRewardPredictionPolicy(
      self._time_step_spec,
      action_spec,
      reward_network=DummyNet(self._obs_spec))
  observations = tf.constant([[1, 2], [3, 4]], dtype=tf.float32)
  time_step = ts.restart(observations, batch_size=2)
  with self.assertRaisesRegexp(
      ValueError,
      r'The number of actions \(11\) does not match the reward_network output'
      r' size \(3\)\.'):
    policy.action(time_step, seed=1)
def testPredictedRewards(self):
  tf.compat.v1.set_random_seed(1)
  policy = boltzmann_reward_policy.BoltzmannRewardPredictionPolicy(
      self._time_step_spec,
      self._action_spec,
      reward_network=DummyNet(self._obs_spec),
      emit_policy_info=('predicted_rewards_mean',))
  observations = tf.constant([[1, 2], [3, 4]], dtype=tf.float32)
  time_step = ts.restart(observations, batch_size=2)
  action_step = policy.action(time_step, seed=1)
  self.assertEqual(action_step.action.shape.as_list(), [2])
  self.assertEqual(action_step.action.dtype, tf.int32)
  # Initialize all variables
  self.evaluate(tf.compat.v1.global_variables_initializer())
  p_info = self.evaluate(action_step.info)
  self.assertAllEqual(p_info.predicted_rewards_mean.shape, [2, 3])
def testActionScalarSpecWithShift(self):
  tf.compat.v1.set_random_seed(1)
  action_spec = tensor_spec.BoundedTensorSpec((), tf.int32, 10, 12)
  policy = boltzmann_reward_policy.BoltzmannRewardPredictionPolicy(
      self._time_step_spec,
      action_spec,
      reward_network=DummyNet(self._obs_spec))
  observations = tf.constant([[1, 2], [3, 4]], dtype=tf.float32)
  time_step = ts.restart(observations, batch_size=2)
  action_step = policy.action(time_step, seed=1)
  self.assertEqual(action_step.action.shape.as_list(), [2])
  self.assertEqual(action_step.action.dtype, tf.int32)
  # Initialize all variables
  self.evaluate(tf.compat.v1.global_variables_initializer())
  self.assertAllInSet(self.evaluate(action_step.action), [10, 11, 12])
def testPerArmRewardsVariableNumActions(self):
  tf.compat.v1.set_random_seed(3000)
  obs_spec = bandit_spec_utils.create_per_arm_observation_spec(
      2, 3, 4, add_num_actions_feature=True)
  time_step_spec = ts.time_step_spec(obs_spec)
  action_spec = tensor_spec.BoundedTensorSpec((), tf.int32, 0, 3)
  reward_network = (
      global_and_arm_feature_network.create_feed_forward_common_tower_network(
          obs_spec, (4, 3), (3, 4), (4, 2)))
  policy = boltzmann_reward_policy.BoltzmannRewardPredictionPolicy(
      time_step_spec,
      action_spec,
      reward_network=reward_network,
      accepts_per_arm_features=True,
      emit_policy_info=('predicted_rewards_mean',))
  action_feature = tf.cast(
      tf.reshape(tf.random.shuffle(tf.range(24)), shape=[2, 4, 3]),
      dtype=tf.float32)
  observations = {
      bandit_spec_utils.GLOBAL_FEATURE_KEY:
          tf.constant([[1, 2], [3, 4]], dtype=tf.float32),
      bandit_spec_utils.PER_ARM_FEATURE_KEY:
          action_feature,
      bandit_spec_utils.NUM_ACTIONS_FEATURE_KEY:
          tf.constant([2, 3], dtype=tf.int32)
  }
  time_step = ts.restart(observations, batch_size=2)
  action_step = policy.action(time_step, seed=1)
  self.assertEqual(action_step.action.shape.as_list(), [2])
  self.assertEqual(action_step.action.dtype, tf.int32)
  # Initialize all variables
  self.evaluate(tf.compat.v1.global_variables_initializer())
  action, p_info, first_arm_features = self.evaluate([
      action_step.action, action_step.info,
      observations[bandit_spec_utils.PER_ARM_FEATURE_KEY][0]
  ])
  self.assertAllEqual(action.shape, [2])
  self.assertAllEqual(p_info.predicted_rewards_mean.shape, [2, 4])
  self.assertAllEqual(p_info.chosen_arm_features.shape, [2, 3])
  first_action = action[0]
  self.assertAllEqual(p_info.chosen_arm_features[0],
                      first_arm_features[first_action])
def __init__(
    self,
    time_step_spec: types.TimeStep,
    action_spec: types.BoundedTensorSpec,
    reward_network: types.Network,
    optimizer: types.Optimizer,
    temperature: types.FloatOrReturningFloat = 1.0,
    observation_and_action_constraint_splitter: Optional[
        types.Splitter] = None,
    accepts_per_arm_features: bool = False,
    constraints: Iterable[constr.NeuralConstraint] = (),
    # Params for training.
    error_loss_fn: types.LossFn = tf.compat.v1.losses.mean_squared_error,
    gradient_clipping: Optional[float] = None,
    # Params for debugging.
    debug_summaries: bool = False,
    summarize_grads_and_vars: bool = False,
    enable_summaries: bool = True,
    emit_policy_info: Tuple[Text, ...] = (),
    train_step_counter: Optional[tf.Variable] = None,
    name: Optional[Text] = None):
  """Creates a Neural Boltzmann Agent.

  Args:
    time_step_spec: A `TimeStep` spec of the expected time_steps.
    action_spec: A nest of `BoundedTensorSpec` representing the actions.
    reward_network: A `tf_agents.network.Network` to be used by the agent.
      The network will be called with call(observation, step_type) and is
      expected to provide a reward prediction for all actions. *Note*: when
      using `observation_and_action_constraint_splitter`, make sure the
      `reward_network` is compatible with the network-specific half of the
      output of the `observation_and_action_constraint_splitter`. In
      particular, `observation_and_action_constraint_splitter` will be
      called on the observation before passing to the network.
    optimizer: The optimizer to use for training.
    temperature: A float or a callable that returns a float. The temperature
      used in the Boltzmann exploration.
    observation_and_action_constraint_splitter: A function used for masking
      valid/invalid actions with each state of the environment. The function
      takes in a full observation and returns a tuple consisting of 1) the
      part of the observation intended as input to the bandit agent and
      policy, and 2) the boolean mask. This function should also work with a
      `TensorSpec` as input, and should output `TensorSpec` objects for the
      observation and mask.
    accepts_per_arm_features: (bool) Whether the policy accepts per-arm
      features.
    constraints: Iterable of constraint objects that are instances of
      `tf_agents.bandits.agents.NeuralConstraint`.
    error_loss_fn: A function for computing the error loss, taking
      parameters labels, predictions, and weights (any function from
      tf.losses would work). The default is `tf.losses.mean_squared_error`.
    gradient_clipping: A float representing the norm length to clip
      gradients (or None for no clipping).
    debug_summaries: A Python bool, default False. When True, debug
      summaries are gathered.
    summarize_grads_and_vars: A Python bool, default False. When True,
      gradient and network variable summaries are written during training.
    enable_summaries: A Python bool, default True. When False, all summaries
      (debug or otherwise) are not written.
    emit_policy_info: (tuple of strings) What side information we want to
      get as part of the policy info. Allowed values can be found in
      `policy_utilities.PolicyInfo`.
    train_step_counter: An optional `tf.Variable` to increment every time
      the train op is run. Defaults to the `global_step`.
    name: Python str name of this agent. All variables in this module will
      fall under that name. Defaults to the class name.

  Raises:
    ValueError: If the action spec contains more than one action, or it is
      not a bounded scalar int32 spec with minimum 0.
""" super(NeuralBoltzmannAgent, self).__init__(time_step_spec=time_step_spec, action_spec=action_spec, reward_network=reward_network, optimizer=optimizer, observation_and_action_constraint_splitter=( observation_and_action_constraint_splitter), accepts_per_arm_features=accepts_per_arm_features, constraints=constraints, error_loss_fn=error_loss_fn, gradient_clipping=gradient_clipping, debug_summaries=debug_summaries, summarize_grads_and_vars=summarize_grads_and_vars, enable_summaries=enable_summaries, emit_policy_info=emit_policy_info, train_step_counter=train_step_counter, name=name) self._policy = boltzmann_policy.BoltzmannRewardPredictionPolicy( time_step_spec, action_spec, reward_network, temperature, observation_and_action_constraint_splitter, constraints=constraints, accepts_per_arm_features=accepts_per_arm_features, emit_policy_info=emit_policy_info) self._collect_policy = self._policy
def testPerArmRewardsSparseObs(self):
  tf.compat.v1.set_random_seed(3000)
  obs_spec = {
      'global': {'sport': tensor_spec.TensorSpec((), tf.string)},
      'per_arm': {
          'name': tensor_spec.TensorSpec((3,), tf.string),
          'fruit': tensor_spec.TensorSpec((3,), tf.string)
      }
  }
  columns_a = tf.feature_column.indicator_column(
      tf.feature_column.categorical_column_with_vocabulary_list(
          'name', ['bob', 'george', 'wanda']))
  columns_b = tf.feature_column.indicator_column(
      tf.feature_column.categorical_column_with_vocabulary_list(
          'fruit', ['banana', 'kiwi', 'pear']))
  columns_c = tf.feature_column.indicator_column(
      tf.feature_column.categorical_column_with_vocabulary_list(
          'sport', ['bridge', 'chess', 'snooker']))

  reward_network = (
      global_and_arm_feature_network.create_feed_forward_common_tower_network(
          observation_spec=obs_spec,
          global_layers=(4, 3, 2),
          arm_layers=(6, 5, 4),
          common_layers=(7, 6, 5),
          global_preprocessing_combiner=(
              tf.compat.v2.keras.layers.DenseFeatures([columns_c])),
          arm_preprocessing_combiner=tf.compat.v2.keras.layers.DenseFeatures(
              [columns_a, columns_b])))

  time_step_spec = ts.time_step_spec(obs_spec)
  action_spec = tensor_spec.BoundedTensorSpec((), tf.int32, 0, 2)
  policy = boltzmann_reward_policy.BoltzmannRewardPredictionPolicy(
      time_step_spec,
      action_spec,
      reward_network=reward_network,
      accepts_per_arm_features=True,
      emit_policy_info=('predicted_rewards_mean',))
  observations = {
      'global': {'sport': tf.constant(['snooker', 'chess'])},
      'per_arm': {
          'name':
              tf.constant([['george', 'george', 'george'],
                           ['bob', 'bob', 'bob']]),
          'fruit':
              tf.constant([['banana', 'banana', 'banana'],
                           ['kiwi', 'kiwi', 'kiwi']])
      }
  }
  time_step = ts.restart(observations, batch_size=2)
  action_step = policy.action(time_step, seed=1)
  self.assertEqual(action_step.action.shape.as_list(), [2])
  self.assertEqual(action_step.action.dtype, tf.int32)
  # Initialize all variables
  self.evaluate([
      tf.compat.v1.global_variables_initializer(),
      tf.compat.v1.tables_initializer()
  ])
  action, p_info, first_arm_name_feature = self.evaluate([
      action_step.action, action_step.info,
      observations[bandit_spec_utils.PER_ARM_FEATURE_KEY]['name'][0]
  ])
  self.assertAllEqual(action.shape, [2])
  self.assertAllEqual(p_info.predicted_rewards_mean.shape, [2, 3])
  self.assertAllEqual(p_info.chosen_arm_features['name'].shape, [2])
  self.assertAllEqual(p_info.chosen_arm_features['fruit'].shape, [2])
  first_action = action[0]
  self.assertAllEqual(p_info.chosen_arm_features['name'][0],
                      first_arm_name_feature[first_action])
def testPerArmRewards(self):
  tf.compat.v1.set_random_seed(3000)
  obs_spec = bandit_spec_utils.create_per_arm_observation_spec(2, 3, 4)
  time_step_spec = ts.time_step_spec(obs_spec)
  action_spec = tensor_spec.BoundedTensorSpec((), tf.int32, 0, 3)
  reward_network = (
      global_and_arm_feature_network.create_feed_forward_common_tower_network(
          obs_spec, (4, 3), (3, 4), (4, 2)))
  policy = boltzmann_reward_policy.BoltzmannRewardPredictionPolicy(
      time_step_spec,
      action_spec,
      reward_network=reward_network,
      accepts_per_arm_features=True,
      emit_policy_info=('predicted_rewards_mean',))
  action_feature = tf.cast(
      tf.reshape(tf.random.shuffle(tf.range(24)), shape=[2, 4, 3]),
      dtype=tf.float32)
  observations = {
      bandit_spec_utils.GLOBAL_FEATURE_KEY:
          tf.constant([[1, 2], [3, 4]], dtype=tf.float32),
      bandit_spec_utils.PER_ARM_FEATURE_KEY:
          action_feature
  }
  time_step = ts.restart(observations, batch_size=2)
  action_step = policy.action(time_step, seed=1)
  self.assertEqual(action_step.action.shape.as_list(), [2])
  self.assertEqual(action_step.action.dtype, tf.int32)
  # Initialize all variables
  self.evaluate(tf.compat.v1.global_variables_initializer())
  action, p_info, first_arm_features = self.evaluate([
      action_step.action, action_step.info,
      observations[bandit_spec_utils.PER_ARM_FEATURE_KEY][0]
  ])
  self.assertAllEqual(action.shape, [2])
  self.assertAllEqual(p_info.predicted_rewards_mean.shape, [2, 4])
  self.assertAllEqual(p_info.chosen_arm_features.shape, [2, 3])
  first_action = action[0]
  self.assertAllEqual(p_info.chosen_arm_features[0],
                      first_arm_features[first_action])

  # Check that zeroing out some of the actions does not affect the predicted
  # rewards for unchanged actions. This is to make sure that action feature
  # padding does not influence the behavior.
  if not tf.executing_eagerly():
    # The below comparison only works in tf2, because the random per-arm
    # observations get re-drawn in tf1.
    return
  padded_action_feature = tf.concat(
      [action_feature[:, 0:1, :],
       tf.zeros(shape=[2, 3, 3], dtype=tf.float32)],
      axis=1)
  observations = {
      bandit_spec_utils.GLOBAL_FEATURE_KEY:
          tf.constant([[1, 2], [3, 4]], dtype=tf.float32),
      bandit_spec_utils.PER_ARM_FEATURE_KEY:
          padded_action_feature
  }
  time_step = ts.restart(observations, batch_size=2)
  padded_action_step = policy.action(time_step, seed=1)
  padded_p_info = self.evaluate(padded_action_step.info)
  self.assertAllEqual(p_info.predicted_rewards_mean[:, 0],
                      padded_p_info.predicted_rewards_mean[:, 0])