def setUp(self):
  super(TemporalActionSmoothingTest, self).setUp()
  self._obs_spec = tensor_spec.TensorSpec([2], tf.float32)
  self._time_step_spec = ts.time_step_spec(self._obs_spec)
  self._action_spec = tensor_spec.BoundedTensorSpec([1], tf.float32, 0, 10)
def testDistributedLinearAgentUpdate(self, batch_size, context_dim, exploration_policy, dtype, use_eigendecomp=False): """Same as above, but uses the distributed train function of the agent.""" # Construct a `Trajectory` for the given action, observation, reward. num_actions = 5 initial_step, final_step = _get_initial_and_final_steps( batch_size, context_dim) action = np.random.randint(num_actions, size=batch_size, dtype=np.int32) action_step = _get_action_step(action) experience = _get_experience(initial_step, action_step, final_step) # Construct an agent and perform the update. observation_spec = tensor_spec.TensorSpec([context_dim], tf.float32) time_step_spec = time_step.time_step_spec(observation_spec) action_spec = tensor_spec.BoundedTensorSpec(dtype=tf.int32, shape=(), minimum=0, maximum=num_actions - 1) agent = linear_agent.LinearBanditAgent( exploration_policy=exploration_policy, time_step_spec=time_step_spec, action_spec=action_spec, dtype=dtype) self.evaluate(agent.initialize()) train_fn = common.function_in_tf1()(agent._distributed_train_step) loss_info = train_fn(experience=experience) self.evaluate(loss_info) final_a = self.evaluate(agent.cov_matrix) final_b = self.evaluate(agent.data_vector) # Compute the expected updated estimates. observations_list = tf.dynamic_partition( data=tf.reshape(experience.observation, [batch_size, context_dim]), partitions=tf.convert_to_tensor(action), num_partitions=num_actions) rewards_list = tf.dynamic_partition( data=tf.reshape(experience.reward, [batch_size]), partitions=tf.convert_to_tensor(action), num_partitions=num_actions) expected_a_updated_list = [] expected_b_updated_list = [] expected_theta_updated_list = [] for _, (observations_for_arm, rewards_for_arm) in enumerate( zip(observations_list, rewards_list)): num_samples_for_arm_current = tf.cast( tf.shape(rewards_for_arm)[0], tf.float32) num_samples_for_arm_total = num_samples_for_arm_current # pylint: disable=cell-var-from-loop def true_fn(): a_new = tf.matmul(observations_for_arm, observations_for_arm, transpose_a=True) b_new = bandit_utils.sum_reward_weighted_observations( rewards_for_arm, observations_for_arm) return a_new, b_new def false_fn(): return tf.zeros([context_dim, context_dim]), tf.zeros([context_dim]) a_new, b_new = tf.cond( tf.squeeze(num_samples_for_arm_total) > 0, true_fn, false_fn) theta_new = tf.squeeze(tf.linalg.solve( a_new + tf.eye(context_dim), tf.expand_dims(b_new, axis=-1)), axis=-1) expected_a_updated_list.append(self.evaluate(a_new)) expected_b_updated_list.append(self.evaluate(b_new)) expected_theta_updated_list.append(self.evaluate(theta_new)) # Check that the actual updated estimates match the expectations. self.assertAllClose(expected_a_updated_list, final_a) self.assertAllClose(expected_b_updated_list, final_b)
def testSerialization(self):
  desc = tensor_spec.BoundedTensorSpec([1, 5], tf.float32, -1, 1, "test")
  self.assertEqual(pickle.loads(pickle.dumps(desc)), desc)
def setUp(self):
  super(QPolicyTest, self).setUp()
  self._obs_spec = tensor_spec.TensorSpec([2], tf.float32)
  self._time_step_spec = ts.time_step_spec(self._obs_spec)
  self._action_spec = tensor_spec.BoundedTensorSpec([1], tf.int32, 0, 1)
def testLinearAgentUpdateWithMaskedActions(self, batch_size, context_dim, exploration_policy, dtype, use_eigendecomp=False): """Check that the agent updates for specified actions and rewards.""" # Construct a `Trajectory` for the given action, observation, reward. num_actions = 5 initial_step, final_step = _get_initial_and_final_steps_with_action_mask( batch_size, context_dim, num_actions=num_actions) action = np.random.randint(num_actions, size=batch_size, dtype=np.int32) action_step = _get_action_step(action) experience = _get_experience(initial_step, action_step, final_step) # Construct an agent and perform the update. observation_spec = (tensor_spec.TensorSpec([context_dim], tf.float32), tensor_spec.TensorSpec([num_actions], tf.int32)) time_step_spec = time_step.time_step_spec(observation_spec) action_spec = tensor_spec.BoundedTensorSpec(dtype=tf.int32, shape=(), minimum=0, maximum=num_actions - 1) def observation_and_action_constraint_splitter(obs): return obs[0], obs[1] agent = linear_agent.LinearBanditAgent( exploration_policy=exploration_policy, time_step_spec=time_step_spec, action_spec=action_spec, observation_and_action_constraint_splitter=( observation_and_action_constraint_splitter), dtype=dtype) self.evaluate(agent.initialize()) loss_info = agent.train(experience) self.evaluate(loss_info) final_a = self.evaluate(agent.cov_matrix) final_b = self.evaluate(agent.data_vector) # Compute the expected updated estimates. observations_list = tf.dynamic_partition( data=tf.reshape( observation_and_action_constraint_splitter( experience.observation)[0], [batch_size, -1]), partitions=tf.convert_to_tensor(action), num_partitions=num_actions) rewards_list = tf.dynamic_partition( data=tf.reshape(experience.reward, [batch_size]), partitions=tf.convert_to_tensor(action), num_partitions=num_actions) expected_a_updated_list = [] expected_b_updated_list = [] for _, (observations_for_arm, rewards_for_arm) in enumerate( zip(observations_list, rewards_list)): num_samples_for_arm_current = tf.cast( tf.shape(rewards_for_arm)[0], tf.float32) num_samples_for_arm_total = num_samples_for_arm_current # pylint: disable=cell-var-from-loop def true_fn(): a_new = tf.matmul(observations_for_arm, observations_for_arm, transpose_a=True) b_new = bandit_utils.sum_reward_weighted_observations( rewards_for_arm, observations_for_arm) return a_new, b_new def false_fn(): return tf.zeros([context_dim, context_dim]), tf.zeros([context_dim]) a_new, b_new = tf.cond( tf.squeeze(num_samples_for_arm_total) > 0, true_fn, false_fn) expected_a_updated_list.append(self.evaluate(a_new)) expected_b_updated_list.append(self.evaluate(b_new)) # Check that the actual updated estimates match the expectations. self.assertAllClose(expected_a_updated_list, final_a) self.assertAllClose(expected_b_updated_list, final_b)
def setUp(self):
  super(DqnAgentTest, self).setUp()
  self._observation_spec = tensor_spec.TensorSpec([2], tf.float32)
  self._time_step_spec = ts.time_step_spec(self._observation_spec)
  self._action_spec = tensor_spec.BoundedTensorSpec((), tf.int32, 0, 1)
def testNeuralLinUCBUpdateDistributed(self, batch_size=1, context_dim=10): """Same as above but with distributed LinUCB updates.""" # Construct a `Trajectory` for the given action, observation, reward. num_actions = 5 initial_step, final_step = _get_initial_and_final_steps( batch_size, context_dim) action = np.random.randint(num_actions, size=batch_size, dtype=np.int32) action_step = _get_action_step(action) experience = _get_experience(initial_step, action_step, final_step) # Construct an agent and perform the update. observation_spec = tensor_spec.TensorSpec([context_dim], tf.float32) time_step_spec = time_step.time_step_spec(observation_spec) action_spec = tensor_spec.BoundedTensorSpec( dtype=tf.int32, shape=(), minimum=0, maximum=num_actions - 1) encoder = DummyNet(observation_spec) encoding_dim = 10 agent = neural_linucb_agent.NeuralLinUCBAgent( time_step_spec=time_step_spec, action_spec=action_spec, encoding_network=encoder, encoding_network_num_train_steps=0, encoding_dim=encoding_dim, optimizer=tf.compat.v1.train.AdamOptimizer(learning_rate=1e-2)) self.evaluate(agent.initialize()) self.evaluate(tf.compat.v1.global_variables_initializer()) # Call the distributed LinUCB training instead of agent.train(). train_fn = common.function_in_tf1()( agent.compute_loss_using_linucb_distributed) reward = tf.cast(experience.reward, agent._dtype) loss_info = train_fn( experience.observation, action, reward, weights=None) self.evaluate(loss_info) final_a = self.evaluate(agent.cov_matrix) final_b = self.evaluate(agent.data_vector) # Compute the expected updated estimates. observations_list = tf.dynamic_partition( data=tf.reshape(experience.observation, [batch_size, context_dim]), partitions=tf.convert_to_tensor(action), num_partitions=num_actions) rewards_list = tf.dynamic_partition( data=tf.reshape(experience.reward, [batch_size]), partitions=tf.convert_to_tensor(action), num_partitions=num_actions) expected_a_updated_list = [] expected_b_updated_list = [] for _, (observations_for_arm, rewards_for_arm) in enumerate(zip( observations_list, rewards_list)): encoded_observations_for_arm, _ = encoder(observations_for_arm) num_samples_for_arm_current = tf.cast( tf.shape(rewards_for_arm)[0], tf.float32) num_samples_for_arm_total = num_samples_for_arm_current # pylint: disable=cell-var-from-loop def true_fn(): a_new = tf.matmul( encoded_observations_for_arm, encoded_observations_for_arm, transpose_a=True) b_new = bandit_utils.sum_reward_weighted_observations( rewards_for_arm, encoded_observations_for_arm) return a_new, b_new def false_fn(): return (tf.zeros([encoding_dim, encoding_dim], dtype=tf.float32), tf.zeros([encoding_dim], dtype=tf.float32)) a_new, b_new = tf.cond( tf.squeeze(num_samples_for_arm_total) > 0, true_fn, false_fn) expected_a_updated_list.append(self.evaluate(a_new)) expected_b_updated_list.append(self.evaluate(b_new)) # Check that the actual updated estimates match the expectations. self.assertAllClose(expected_a_updated_list, final_a) self.assertAllClose(expected_b_updated_list, final_b)
def create_bandit_policy_type_tensor_spec(shape):
  """Create tensor spec for bandit policy type."""
  return tensor_spec.BoundedTensorSpec(
      shape=shape,
      dtype=tf.int32,
      minimum=BanditPolicyType.UNKNOWN,
      maximum=BanditPolicyType.UNIFORM)
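# Usage sketch (illustrative shape, not from the original file): the spec
# produced above is an ordinary integer `BoundedTensorSpec`, so it can be
# sampled or validated like any other spec. `sample_spec_nest` is the same
# helper used by the sampling tests later in this file.
policy_type_spec = create_bandit_policy_type_tensor_spec(shape=[1])
sample = tensor_spec.sample_spec_nest(policy_type_spec)
# `sample` is an int32 tensor whose values lie between BanditPolicyType.UNKNOWN
# and BanditPolicyType.UNIFORM.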
def __init__(self, encoding_network, encoding_dim, reward_layer, epsilon_greedy, actions_from_reward_layer, cov_matrix, data_vector, num_samples, time_step_spec=None, alpha=1.0, emit_log_probability=False, name=None): """Initializes `NeuralLinUCBPolicy`. Args: encoding_network: network that encodes the observations. encoding_dim: (int) dimension of the encoded observations. reward_layer: final layer that predicts the expected reward per arm. epsilon_greedy: (float) representing the probability of choosing a random action instead of the greedy action. actions_from_reward_layer: (bool) whether to get actions from the reward layer or from LinUCB. cov_matrix: list of the covariance matrices. There exists one covariance matrix per arm. data_vector: list of the data vectors. A data vector is a weighted sum of the observations, where the weight is the corresponding reward. Each arm has its own data vector. num_samples: list of number of samples per arm. time_step_spec: A `TimeStep` spec of the expected time_steps. alpha: (float) non-negative weight multiplying the confidence intervals. emit_log_probability: (bool) whether to emit log probabilities. name: The name of this policy. """ self._encoding_network = encoding_network self._reward_layer = reward_layer self._encoding_dim = encoding_dim if not isinstance(cov_matrix, (list, tuple)): raise ValueError( 'cov_matrix must be a list of matrices (Tensors).') self._cov_matrix = cov_matrix if not isinstance(data_vector, (list, tuple)): raise ValueError( 'data_vector must be a list of vectors (Tensors).') self._data_vector = data_vector if not isinstance(num_samples, (list, tuple)): raise ValueError( 'num_samples must be a list of vectors (Tensors).') self._num_samples = num_samples self._alpha = alpha self._actions_from_reward_layer = actions_from_reward_layer self._epsilon_greedy = epsilon_greedy self._dtype = self._data_vector[0].dtype if len(cov_matrix) != len(data_vector): raise ValueError( 'The size of list cov_matrix must match the size of ' 'list data_vector. Got {} for cov_matrix and {} ' 'for data_vector'.format(len(self._cov_matrix), len((data_vector)))) if len(num_samples) != len(cov_matrix): raise ValueError('The size of num_samples must match the size of ' 'list cov_matrix. Got {} for num_samples and {} ' 'for cov_matrix'.format(len(self._num_samples), len((cov_matrix)))) self._num_actions = len(cov_matrix) assert self._num_actions self._observation_dim = tf.compat.dimension_value( time_step_spec.observation.shape[0]) cov_matrix_dim = tf.compat.dimension_value(cov_matrix[0].shape[0]) if self._encoding_dim != cov_matrix_dim: raise ValueError('The dimension of matrix `cov_matrix` must match ' 'encoding dimension {}.' 'Got {} for `cov_matrix`.'.format( self._encoding_dim, cov_matrix_dim)) data_vector_dim = tf.compat.dimension_value(data_vector[0].shape[0]) if self._encoding_dim != data_vector_dim: raise ValueError( 'The dimension of vector `data_vector` must match ' 'encoding dimension {}. ' 'Got {} for `data_vector`.'.format(self._encoding_dim, data_vector_dim)) action_spec = tensor_spec.BoundedTensorSpec(shape=(), dtype=tf.int32, minimum=0, maximum=self._num_actions - 1, name='action') super(NeuralLinUCBPolicy, self).__init__(time_step_spec=time_step_spec, action_spec=action_spec, emit_log_probability=emit_log_probability, name=name)
def __init__(self,
             time_step_spec,
             action_spec,
             policy_state_spec=(),
             info_spec=(),
             clip=True,
             emit_log_probability=False,
             automatic_state_reset=True,
             observation_and_action_constraint_splitter=None,
             name=None):
  """Initialization of Base class.

  Args:
    time_step_spec: A `TimeStep` spec of the expected time_steps. Usually
      provided by the user to the subclass.
    action_spec: A nest of `BoundedTensorSpec` representing the actions.
      Usually provided by the user to the subclass.
    policy_state_spec: A nest of `TensorSpec` representing the policy_state.
      Provided by the subclass, not directly by the user.
    info_spec: A nest of `TensorSpec` representing the policy info. Provided
      by the subclass, not directly by the user.
    clip: Whether to clip actions to spec before returning them. Default
      True. Most policy-based algorithms (PCL, PPO, REINFORCE) use unclipped
      continuous actions for training.
    emit_log_probability: Emit log-probabilities of actions, if supported. If
      True, policy_step.info will have CommonFields.LOG_PROBABILITY set.
      Please consult utility methods provided in policy_step for setting and
      retrieving these. When working with custom policies, either provide a
      dictionary info_spec or a namedtuple with the field 'log_probability'.
    automatic_state_reset: If `True`, then `get_initial_policy_state` is used
      to clear state in `action()` and `distribution()` for time steps where
      `time_step.is_first()`.
    observation_and_action_constraint_splitter: A function used to process
      observations with action constraints. These constraints can indicate,
      for example, a mask of valid/invalid actions for a given state of the
      environment. The function takes in a full observation and returns a
      tuple consisting of 1) the part of the observation intended as input to
      the network and 2) the constraint. An example
      `observation_and_action_constraint_splitter` could be as simple as:
      ```
      def observation_and_action_constraint_splitter(observation):
        return observation['network_input'], observation['constraint']
      ```
      *Note*: when using `observation_and_action_constraint_splitter`, make
      sure the provided `q_network` is compatible with the network-specific
      half of the output of the `observation_and_action_constraint_splitter`.
      In particular, `observation_and_action_constraint_splitter` will be
      called on the observation before passing to the network. If
      `observation_and_action_constraint_splitter` is None, action
      constraints are not applied.
    name: A name for this module. Defaults to the class name.
  """
  super(Base, self).__init__(name=name)
  common.check_tf1_allowed()
  common.tf_agents_gauge.get_cell('TFAPolicy').set(True)
  common.assert_members_are_not_overridden(base_cls=Base, instance=self)
  if not isinstance(time_step_spec, ts.TimeStep):
    raise ValueError(
        'The `time_step_spec` must be an instance of `TimeStep`, but is '
        '`{}`.'.format(type(time_step_spec)))

  self._time_step_spec = time_step_spec
  self._action_spec = action_spec
  self._policy_state_spec = policy_state_spec
  self._emit_log_probability = emit_log_probability
  if emit_log_probability:
    log_probability_spec = tensor_spec.BoundedTensorSpec(
        shape=(),
        dtype=tf.float32,
        maximum=0,
        minimum=-float('inf'),
        name='log_probability')
    log_probability_spec = tf.nest.map_structure(
        lambda _: log_probability_spec, action_spec)
    info_spec = policy_step.set_log_probability(info_spec,
                                                log_probability_spec)

  self._info_spec = info_spec
  self._setup_specs()
  self._clip = clip
  self._action_fn = common.function_in_tf1()(self._action)
  self._automatic_state_reset = automatic_state_reset
  self._observation_and_action_constraint_splitter = (
      observation_and_action_constraint_splitter)
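# A minimal sketch (hypothetical spec sizes) of the masked-observation pattern
# the docstring above describes: the observation nest carries both the network
# input and a 0-1 action mask, and the splitter separates the two before the
# policy sees them. Only `tensor_spec` and `tf` names already used in this file
# are assumed; the splitter body is taken directly from the docstring example.
num_actions = 4
masked_obs_spec = {
    'network_input': tensor_spec.TensorSpec([7], tf.float32),
    'constraint': tensor_spec.BoundedTensorSpec([num_actions], tf.int32, 0, 1),
}

def observation_and_action_constraint_splitter(observation):
  return observation['network_input'], observation['constraint']

# `masked_obs_spec` and the splitter would then be passed together to a policy
# or agent constructor via `observation_and_action_constraint_splitter=...`.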
def create_feed_forward_common_tower_network(
    observation_spec: types.NestedTensorSpec,
    global_layers: Sequence[int],
    arm_layers: Sequence[int],
    common_layers: Sequence[int],
    output_dim: int = 1,
    global_preprocessing_combiner: Optional[Callable[...,
                                                     types.Tensor]] = None,
    arm_preprocessing_combiner: Optional[Callable[..., types.Tensor]] = None,
    activation_fn: Callable[[types.Tensor],
                            types.Tensor] = tf.keras.activations.relu
) -> types.Network:
  """Creates a common tower network with feedforward towers.

  The network produced by this function can be used either in
  `GreedyRewardPredictionPolicy`, or `NeuralLinUCBPolicy`. In the former case,
  the network must have `output_dim=1`, it is going to be an instance of
  `QNetwork`, and used in the policy as a reward prediction network. In the
  latter case, the network will be an encoding network with its output
  consumed by a reward layer or a LinUCB method. The specified `output_dim`
  will be the encoding dimension.

  Args:
    observation_spec: A nested tensor spec containing the specs for global as
      well as per-arm observations.
    global_layers: Iterable of ints. Specifies the layers of the global tower.
    arm_layers: Iterable of ints. Specifies the layers of the arm tower.
    common_layers: Iterable of ints. Specifies the layers of the common tower.
    output_dim: The output dimension of the network. If 1, the common tower
      will be a QNetwork. Otherwise, the common tower will be an encoding
      network with the specified output dimension.
    global_preprocessing_combiner: Preprocessing combiner for global features.
    arm_preprocessing_combiner: Preprocessing combiner for the arm features.
    activation_fn: A keras activation, specifying the activation function
      used in all layers. Defaults to relu.

  Returns:
    A network that takes observations adhering to `observation_spec` and
    outputs reward estimates for every action.
  """
  obs_spec_no_num_actions = _remove_num_actions_dim_from_spec(observation_spec)
  global_network = encoding_network.EncodingNetwork(
      input_tensor_spec=obs_spec_no_num_actions[
          bandit_spec_utils.GLOBAL_FEATURE_KEY],
      fc_layer_params=global_layers,
      activation_fn=activation_fn,
      preprocessing_combiner=global_preprocessing_combiner)
  arm_network = encoding_network.EncodingNetwork(
      input_tensor_spec=obs_spec_no_num_actions[
          bandit_spec_utils.PER_ARM_FEATURE_KEY],
      fc_layer_params=arm_layers,
      activation_fn=activation_fn,
      preprocessing_combiner=arm_preprocessing_combiner)

  # When `global_layers` or `arm_layers` are empty, the corresponding encoding
  # networks simply pass the inputs forward, so in such cases we get the output
  # dimensions from the respective observation specs.
  global_network_out_dim = global_layers[
      -1] if global_layers else obs_spec_no_num_actions[
          bandit_spec_utils.GLOBAL_FEATURE_KEY].shape[-1]
  arm_network_out_dim = arm_layers[
      -1] if arm_layers else obs_spec_no_num_actions[
          bandit_spec_utils.PER_ARM_FEATURE_KEY].shape[-1]
  common_input_spec = tensor_spec.TensorSpec(
      shape=(global_network_out_dim + arm_network_out_dim,), dtype=tf.float32)
  if output_dim == 1:
    common_network = q_network.QNetwork(
        input_tensor_spec=common_input_spec,
        action_spec=tensor_spec.BoundedTensorSpec(
            shape=(), minimum=0, maximum=0, dtype=tf.int32),
        fc_layer_params=common_layers,
        activation_fn=activation_fn)
  else:
    common_network = encoding_network.EncodingNetwork(
        input_tensor_spec=common_input_spec,
        fc_layer_params=list(common_layers) + [output_dim],
        activation_fn=activation_fn)
  return GlobalAndArmCommonTowerNetwork(obs_spec_no_num_actions,
                                        global_network, arm_network,
                                        common_network)
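# A small usage sketch, mirroring how the per-arm agent tests below build
# their encoders: a per-arm observation spec with global dimension 2, arm
# dimension 3 and 5 arms, fed to the factory above with `output_dim` serving
# as the LinUCB encoding dimension. The layer sizes are arbitrary
# illustration values.
obs_spec = bandit_spec_utils.create_per_arm_observation_spec(2, 3, 5)
encoder = create_feed_forward_common_tower_network(
    obs_spec,
    global_layers=(4, 3),
    arm_layers=(3, 4),
    common_layers=(4, 2),
    output_dim=10)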
def testBuildsRnn(self, lstm_size, rnn_construction_fn):
  observation_spec = tensor_spec.BoundedTensorSpec((8, 8, 3), tf.float32, 0, 1)
  time_step_spec = ts.time_step_spec(observation_spec)
  time_step = tensor_spec.sample_spec_nest(time_step_spec, outer_dims=(1,))
  action_spec = [
      tensor_spec.BoundedTensorSpec((2,), tf.float32, 2, 3),
      tensor_spec.BoundedTensorSpec((3,), tf.int32, 0, 3)
  ]
  net = actor_distribution_rnn_network.ActorDistributionRnnNetwork(
      observation_spec,
      action_spec,
      conv_layer_params=[(4, 2, 2)],
      input_fc_layer_params=(5,),
      output_fc_layer_params=(5,),
      lstm_size=lstm_size,
      rnn_construction_fn=rnn_construction_fn,
      rnn_construction_kwargs={'lstm_size': 3})
  action_distributions, network_state = net(
      time_step.observation, time_step.step_type,
      net.get_initial_state(batch_size=1))
  self.evaluate(tf.compat.v1.global_variables_initializer())
  self.assertEqual([1, 2], action_distributions[0].mode().shape.as_list())
  self.assertEqual([1, 3], action_distributions[1].mode().shape.as_list())

  self.assertLen(net.variables, 14)
  # Conv net kernel and bias.
  self.assertEqual((2, 2, 3, 4), net.variables[0].shape)
  self.assertEqual((4,), net.variables[1].shape)
  # Input FC kernel and bias.
  self.assertEqual((64, 5), net.variables[2].shape)
  self.assertEqual((5,), net.variables[3].shape)
  # RNN cell kernel, recurrent kernel and bias.
  self.assertEqual((5, 3), net.variables[4].shape)
  self.assertEqual((3, 3), net.variables[5].shape)
  self.assertEqual((3,), net.variables[6].shape)
  # Output FC kernel and bias.
  self.assertEqual((3, 5), net.variables[7].shape)
  self.assertEqual((5,), net.variables[8].shape)
  # Normal projection kernel, bias and STD bias layer.
  self.assertEqual((5, 2), net.variables[9].shape)
  self.assertEqual((2,), net.variables[10].shape)
  self.assertEqual((2,), net.variables[11].shape)
  # Categorical projection kernel and bias.
  self.assertEqual((5, 12), net.variables[12].shape)
  self.assertEqual((12,), net.variables[13].shape)

  # Assert the RNN cell state is created.
  self.assertEqual((3,), network_state[0].shape)
def __init__(self,
             encoding_network,
             encoding_dim,
             reward_layer,
             epsilon_greedy,
             actions_from_reward_layer,
             cov_matrix,
             data_vector,
             num_samples,
             time_step_spec=None,
             alpha=1.0,
             emit_log_probability=False,
             observation_and_action_constraint_splitter=None,
             name=None):
  """Initializes `NeuralLinUCBPolicy`.

  Args:
    encoding_network: network that encodes the observations.
    encoding_dim: (int) dimension of the encoded observations.
    reward_layer: final layer that predicts the expected reward per arm.
    epsilon_greedy: (float) representing the probability of choosing a random
      action instead of the greedy action.
    actions_from_reward_layer: (bool) whether to get actions from the reward
      layer or from LinUCB.
    cov_matrix: list of the covariance matrices. There exists one covariance
      matrix per arm.
    data_vector: list of the data vectors. A data vector is a weighted sum of
      the observations, where the weight is the corresponding reward. Each
      arm has its own data vector.
    num_samples: list of number of samples per arm.
    time_step_spec: A `TimeStep` spec of the expected time_steps.
    alpha: (float) non-negative weight multiplying the confidence intervals.
    emit_log_probability: (bool) whether to emit log probabilities.
    observation_and_action_constraint_splitter: A function used for masking
      valid/invalid actions with each state of the environment. The function
      takes in a full observation and returns a tuple consisting of 1) the
      part of the observation intended as input to the bandit policy and 2)
      the mask. The mask should be a 0-1 `Tensor` of shape `[batch_size,
      num_actions]`. This function should also work with a `TensorSpec` as
      input, and should output `TensorSpec` objects for the observation and
      mask.
    name: The name of this policy.
  """
  self._encoding_network = encoding_network
  self._reward_layer = reward_layer
  self._encoding_dim = encoding_dim

  if not isinstance(cov_matrix, (list, tuple)):
    raise ValueError('cov_matrix must be a list of matrices (Tensors).')
  self._cov_matrix = cov_matrix

  if not isinstance(data_vector, (list, tuple)):
    raise ValueError('data_vector must be a list of vectors (Tensors).')
  self._data_vector = data_vector

  if not isinstance(num_samples, (list, tuple)):
    raise ValueError('num_samples must be a list of vectors (Tensors).')
  self._num_samples = num_samples

  self._alpha = alpha
  self._actions_from_reward_layer = actions_from_reward_layer
  self._epsilon_greedy = epsilon_greedy
  self._dtype = self._data_vector[0].dtype

  if len(cov_matrix) != len(data_vector):
    raise ValueError('The size of list cov_matrix must match the size of '
                     'list data_vector. Got {} for cov_matrix and {} '
                     'for data_vector'.format(
                         len(self._cov_matrix), len(data_vector)))
  if len(num_samples) != len(cov_matrix):
    raise ValueError('The size of num_samples must match the size of '
                     'list cov_matrix. Got {} for num_samples and {} '
                     'for cov_matrix'.format(
                         len(self._num_samples), len(cov_matrix)))

  self._num_actions = len(cov_matrix)
  assert self._num_actions
  if observation_and_action_constraint_splitter is not None:
    context_shape = observation_and_action_constraint_splitter(
        time_step_spec.observation)[0].shape.as_list()
  else:
    context_shape = time_step_spec.observation.shape.as_list()
  self._context_dim = (
      tf.compat.dimension_value(context_shape[0]) if context_shape else 1)
  cov_matrix_dim = tf.compat.dimension_value(cov_matrix[0].shape[0])
  if self._encoding_dim != cov_matrix_dim:
    raise ValueError('The dimension of matrix `cov_matrix` must match '
                     'encoding dimension {}. '
                     'Got {} for `cov_matrix`.'.format(
                         self._encoding_dim, cov_matrix_dim))
  data_vector_dim = tf.compat.dimension_value(data_vector[0].shape[0])
  if self._encoding_dim != data_vector_dim:
    raise ValueError('The dimension of vector `data_vector` must match '
                     'encoding dimension {}. '
                     'Got {} for `data_vector`.'.format(
                         self._encoding_dim, data_vector_dim))

  action_spec = tensor_spec.BoundedTensorSpec(
      shape=(),
      dtype=tf.int32,
      minimum=0,
      maximum=self._num_actions - 1,
      name='action')

  super(NeuralLinUCBPolicy, self).__init__(
      time_step_spec=time_step_spec,
      action_spec=action_spec,
      emit_log_probability=emit_log_probability,
      observation_and_action_constraint_splitter=(
          observation_and_action_constraint_splitter),
      name=name)
def __init__(self,
             encoding_network: types.Network,
             encoding_dim: int,
             reward_layer: tf.keras.layers.Dense,
             epsilon_greedy: float,
             actions_from_reward_layer: types.Bool,
             cov_matrix: Sequence[types.Float],
             data_vector: Sequence[types.Float],
             num_samples: Sequence[types.Int],
             time_step_spec: types.TimeStep,
             alpha: float = 1.0,
             emit_policy_info: Sequence[Text] = (),
             emit_log_probability: bool = False,
             accepts_per_arm_features: bool = False,
             distributed_use_reward_layer: bool = False,
             observation_and_action_constraint_splitter: Optional[
                 types.Splitter] = None,
             name: Optional[Text] = None):
  """Initializes `NeuralLinUCBPolicy`.

  Args:
    encoding_network: network that encodes the observations.
    encoding_dim: (int) dimension of the encoded observations.
    reward_layer: final layer that predicts the expected reward per arm. In
      case the policy accepts per-arm features, the output of this layer has
      to be a scalar. This is because in the per-arm case, all encoded
      observations have to go through the same computation to get the reward
      estimates. The `num_actions` dimension of the encoded observation is
      treated as a batch dimension in the reward layer.
    epsilon_greedy: (float) representing the probability of choosing a random
      action instead of the greedy action.
    actions_from_reward_layer: (boolean variable) whether to get actions from
      the reward layer or from LinUCB.
    cov_matrix: list of the covariance matrices. There exists one covariance
      matrix per arm, unless the policy accepts per-arm features, in which
      case this list must have a single element.
    data_vector: list of the data vectors. A data vector is a weighted sum of
      the observations, where the weight is the corresponding reward. Each
      arm has its own data vector, unless the policy accepts per-arm
      features, in which case this list must have a single element.
    num_samples: list of number of samples per arm. If the policy accepts
      per-arm features, this is a single-element list counting the number of
      steps.
    time_step_spec: A `TimeStep` spec of the expected time_steps.
    alpha: (float) non-negative weight multiplying the confidence intervals.
    emit_policy_info: (tuple of strings) what side information we want to get
      as part of the policy info. Allowed values can be found in
      `policy_utilities.PolicyInfo`.
    emit_log_probability: (bool) whether to emit log probabilities.
    accepts_per_arm_features: (bool) Whether the policy accepts per-arm
      features.
    distributed_use_reward_layer: (bool) Whether to pick the actions using
      the network or use LinUCB. This applies only in distributed training
      setting and has a similar role to the `actions_from_reward_layer`
      mentioned above.
    observation_and_action_constraint_splitter: A function used for masking
      valid/invalid actions with each state of the environment. The function
      takes in a full observation and returns a tuple consisting of 1) the
      part of the observation intended as input to the bandit policy and 2)
      the mask. The mask should be a 0-1 `Tensor` of shape `[batch_size,
      num_actions]`. This function should also work with a `TensorSpec` as
      input, and should output `TensorSpec` objects for the observation and
      mask.
    name: The name of this policy.
  """
  policy_utilities.check_no_mask_with_arm_features(
      accepts_per_arm_features, observation_and_action_constraint_splitter)
  encoding_network.create_variables()
  self._encoding_network = encoding_network
  self._reward_layer = reward_layer
  self._encoding_dim = encoding_dim

  if accepts_per_arm_features and reward_layer.units != 1:
    raise ValueError('The output dimension of the reward layer must be 1, '
                     'got {}'.format(reward_layer.units))

  if not isinstance(cov_matrix, (list, tuple)):
    raise ValueError('cov_matrix must be a list of matrices (Tensors).')
  self._cov_matrix = cov_matrix

  if not isinstance(data_vector, (list, tuple)):
    raise ValueError('data_vector must be a list of vectors (Tensors).')
  self._data_vector = data_vector

  if not isinstance(num_samples, (list, tuple)):
    raise ValueError('num_samples must be a list of vectors (Tensors).')
  self._num_samples = num_samples

  self._alpha = alpha
  self._actions_from_reward_layer = actions_from_reward_layer
  self._epsilon_greedy = epsilon_greedy
  self._dtype = self._data_vector[0].dtype
  self._distributed_use_reward_layer = distributed_use_reward_layer

  if len(cov_matrix) != len(data_vector):
    raise ValueError('The size of list cov_matrix must match the size of '
                     'list data_vector. Got {} for cov_matrix and {} '
                     'for data_vector'.format(
                         len(self._cov_matrix), len(data_vector)))
  if len(num_samples) != len(cov_matrix):
    raise ValueError('The size of num_samples must match the size of '
                     'list cov_matrix. Got {} for num_samples and {} '
                     'for cov_matrix'.format(
                         len(self._num_samples), len(cov_matrix)))

  self._accepts_per_arm_features = accepts_per_arm_features
  if observation_and_action_constraint_splitter is not None:
    context_spec, _ = observation_and_action_constraint_splitter(
        time_step_spec.observation)
  else:
    context_spec = time_step_spec.observation
  if accepts_per_arm_features:
    self._num_actions = tf.nest.flatten(
        context_spec[bandit_spec_utils.PER_ARM_FEATURE_KEY])[0].shape.as_list(
        )[0]
    self._num_models = 1
  else:
    self._num_actions = len(cov_matrix)
    self._num_models = self._num_actions
  cov_matrix_dim = tf.compat.dimension_value(cov_matrix[0].shape[0])
  if self._encoding_dim != cov_matrix_dim:
    raise ValueError('The dimension of matrix `cov_matrix` must match '
                     'encoding dimension {}. '
                     'Got {} for `cov_matrix`.'.format(
                         self._encoding_dim, cov_matrix_dim))
  data_vector_dim = tf.compat.dimension_value(data_vector[0].shape[0])
  if self._encoding_dim != data_vector_dim:
    raise ValueError('The dimension of vector `data_vector` must match '
                     'encoding dimension {}. '
                     'Got {} for `data_vector`.'.format(
                         self._encoding_dim, data_vector_dim))

  action_spec = tensor_spec.BoundedTensorSpec(
      shape=(),
      dtype=tf.int32,
      minimum=0,
      maximum=self._num_actions - 1,
      name='action')

  self._emit_policy_info = emit_policy_info
  predicted_rewards_mean = ()
  if policy_utilities.InfoFields.PREDICTED_REWARDS_MEAN in emit_policy_info:
    predicted_rewards_mean = tensor_spec.TensorSpec([self._num_actions],
                                                    dtype=tf.float32)
  predicted_rewards_optimistic = ()
  if (policy_utilities.InfoFields.PREDICTED_REWARDS_OPTIMISTIC in
      emit_policy_info):
    predicted_rewards_optimistic = tensor_spec.TensorSpec([self._num_actions],
                                                          dtype=tf.float32)
  if accepts_per_arm_features:
    chosen_arm_features_info_spec = (
        policy_utilities.create_chosen_arm_features_info_spec(
            time_step_spec.observation))
    info_spec = policy_utilities.PerArmPolicyInfo(
        predicted_rewards_mean=predicted_rewards_mean,
        predicted_rewards_optimistic=predicted_rewards_optimistic,
        chosen_arm_features=chosen_arm_features_info_spec)
  else:
    info_spec = policy_utilities.PolicyInfo(
        predicted_rewards_mean=predicted_rewards_mean,
        predicted_rewards_optimistic=predicted_rewards_optimistic)

  super(NeuralLinUCBPolicy, self).__init__(
      time_step_spec=time_step_spec,
      action_spec=action_spec,
      emit_log_probability=emit_log_probability,
      observation_and_action_constraint_splitter=(
          observation_and_action_constraint_splitter),
      info_spec=info_spec,
      name=name)
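# A minimal construction sketch (illustrative sizes, not from the original
# file): the LinUCB statistics passed to `NeuralLinUCBPolicy` are plain
# per-model tensors whose dimensions must equal `encoding_dim`, i.e. one
# `[encoding_dim, encoding_dim]` covariance matrix, one `[encoding_dim]` data
# vector and one sample count per arm (or a single element of each when
# `accepts_per_arm_features=True`). The scalar sample count is an assumption
# for illustration only.
encoding_dim = 10
num_arms = 5
cov_matrix = [tf.zeros([encoding_dim, encoding_dim]) for _ in range(num_arms)]
data_vector = [tf.zeros([encoding_dim]) for _ in range(num_arms)]
num_samples = [tf.zeros([], dtype=tf.float32) for _ in range(num_arms)]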
def setUp(self):
  super(BehavioralCloningAgentTest, self).setUp()
  self._obs_spec = [tensor_spec.TensorSpec([2], tf.float32)]
  self._time_step_spec = ts.time_step_spec(self._obs_spec)
  self._action_spec = [tensor_spec.BoundedTensorSpec([], tf.int32, 0, 1)]
  self._observation_spec = self._time_step_spec.observation
def __init__(self):
  info_spec = {"test": tensor_spec.BoundedTensorSpec([1], tf.int64, 0, 1)}
  super(TfDictInfoAndLogProbs, self).__init__(info_spec=info_spec)
def testTrainWithSparseTensorAndDenseFeaturesLayer(self, agent_class): obs_spec = { 'dense': tensor_spec.BoundedTensorSpec(dtype=tf.float32, shape=[3], minimum=-10.0, maximum=10.0), 'sparse_terms': tf.SparseTensorSpec(dtype=tf.string, shape=[4]), 'sparse_frequencies': tf.SparseTensorSpec(dtype=tf.float32, shape=[4]), } cat_column = ( tf.compat.v2.feature_column.categorical_column_with_hash_bucket( 'sparse_terms', hash_bucket_size=5)) weighted_cat_column = ( tf.compat.v2.feature_column.weighted_categorical_column( cat_column, weight_feature_key='sparse_frequencies')) feature_columns = [ tf.compat.v2.feature_column.numeric_column('dense', [3]), tf.compat.v2.feature_column.embedding_column( weighted_cat_column, 3), ] dense_features_layer = tf.compat.v2.keras.layers.DenseFeatures( feature_columns) time_step_spec = ts.time_step_spec(obs_spec) q_net = q_network.QNetwork(time_step_spec.observation, self._action_spec, preprocessing_combiner=dense_features_layer) agent = agent_class(time_step_spec, self._action_spec, q_network=q_net, optimizer=tf.compat.v1.train.AdamOptimizer()) observations = tensor_spec.sample_spec_nest(obs_spec, outer_dims=[5, 2]) # sparse_terms and sparse_frequencies must be defined on matching indices. observations['sparse_terms'] = tf.SparseTensor( indices=observations['sparse_frequencies'].indices, values=tf.as_string( tf.math.round(observations['sparse_frequencies'].values)), dense_shape=observations['sparse_frequencies'].dense_shape) if not tf.executing_eagerly(): # Mimic unknown inner dims on the SparseTensor def _unknown_inner_shape(t): if not isinstance(t, tf.SparseTensor): return t return tf.SparseTensor( indices=t.indices, values=t.values, dense_shape=tf.compat.v1.placeholder_with_default( t.dense_shape, shape=t.dense_shape.shape)) observations = tf.nest.map_structure(_unknown_inner_shape, observations) self.assertIsNone( tf.get_static_value(observations['sparse_terms'].dense_shape)) time_step = ts.restart(observations, batch_size=[5, 2]) action_step = tensor_spec.sample_spec_nest(self._action_spec, outer_dims=[5, 2]) p_step = policy_step.PolicyStep(action=action_step, state=(), info=()) traj = trajectory.from_transition(time_step, p_step, time_step) loss_info = agent.train(traj) self.evaluate(tf.compat.v1.global_variables_initializer()) loss_info = self.evaluate(loss_info) self.assertGreater(loss_info.loss, 0)
def __init__(self):
  observation_spec = tensor_spec.TensorSpec([2, 2], tf.float32)
  time_step_spec = ts.time_step_spec(observation_spec)
  action_spec = tensor_spec.BoundedTensorSpec([1], tf.int32, 0, 1)
  super(TFPolicyMismatchedDtypes, self).__init__(time_step_spec, action_spec)
def testBoundedTensorSpecSample(self, dtype):
  spec = tensor_spec.BoundedTensorSpec((2, 3), dtype, 2, 7)
  sample = tensor_spec.sample_spec_nest(spec)
  sample_ = self.evaluate(sample)
  self.assertTrue(np.all(sample_ >= 2))
  self.assertTrue(np.all(sample_ <= 7))
def testPerArmRewardsSparseObs(self): obs_spec = { 'global': { 'sport': tensor_spec.TensorSpec((), tf.string) }, 'per_arm': { 'name': tensor_spec.TensorSpec((3, ), tf.string), 'fruit': tensor_spec.TensorSpec((3, ), tf.string) } } columns_a = tf.feature_column.indicator_column( tf.feature_column.categorical_column_with_vocabulary_list( 'name', ['bob', 'george', 'wanda'])) columns_b = tf.feature_column.indicator_column( tf.feature_column.categorical_column_with_vocabulary_list( 'fruit', ['banana', 'kiwi', 'pear'])) columns_c = tf.feature_column.indicator_column( tf.feature_column.categorical_column_with_vocabulary_list( 'sport', ['bridge', 'chess', 'snooker'])) objective_networks = [] for _ in range(3): objective_networks.append( global_and_arm_feature_network. create_feed_forward_common_tower_network( observation_spec=obs_spec, global_layers=(4, 3, 2), arm_layers=(6, 5, 4), common_layers=(7, 6, 5), global_preprocessing_combiner=( tf.compat.v2.keras.layers.DenseFeatures([columns_c])), arm_preprocessing_combiner=tf.compat.v2.keras.layers. DenseFeatures([columns_a, columns_b]))) time_step_spec = ts.time_step_spec(obs_spec) action_spec = tensor_spec.BoundedTensorSpec((), tf.int32, 0, 2) policy = greedy_multi_objective_policy.GreedyMultiObjectiveNeuralPolicy( time_step_spec, action_spec, self._scalarizer, objective_networks, accepts_per_arm_features=True, emit_policy_info=('predicted_rewards_mean', )) observations = { 'global': { 'sport': tf.constant(['snooker', 'chess']) }, 'per_arm': { 'name': tf.constant([['george', 'george', 'george'], ['bob', 'bob', 'bob']]), 'fruit': tf.constant([['banana', 'banana', 'banana'], ['kiwi', 'kiwi', 'kiwi']]) } } time_step = ts.restart(observations, batch_size=2) action_step = policy.action(time_step) self.assertEqual(action_step.action.shape.as_list(), [2]) self.assertEqual(action_step.action.dtype, tf.int32) # Initialize all variables self.evaluate([ tf.compat.v1.global_variables_initializer(), tf.compat.v1.tables_initializer() ]) action, p_info, first_arm_name_feature = self.evaluate([ action_step.action, action_step.info, observations[bandit_spec_utils.PER_ARM_FEATURE_KEY]['name'][0] ]) self.assertAllEqual(action.shape, [2]) self.assertAllEqual(p_info.predicted_rewards_mean.shape, [2, 3, 3]) self.assertAllEqual(p_info.chosen_arm_features['name'].shape, [2]) self.assertAllEqual(p_info.chosen_arm_features['fruit'].shape, [2]) first_action = action[0] self.assertAllEqual(p_info.chosen_arm_features['name'][0], first_arm_name_feature[first_action])
def create_ppo_agent_and_dataset_fn(action_spec, time_step_spec, train_step, batch_size): """Builds and returns a dummy PPO Agent, dataset and dataset function.""" del action_spec # Unused. del time_step_spec # Unused. del batch_size # Unused. # No arbitrary spec supported. obs_spec = tensor_spec.TensorSpec([2], tf.float32) ts_spec = ts.time_step_spec(obs_spec) act_spec = tensor_spec.BoundedTensorSpec([1], tf.float32, -1, 1) actor_net = actor_distribution_network.ActorDistributionNetwork( obs_spec, act_spec, fc_layer_params=(100, ), activation_fn=tf.keras.activations.tanh) value_net = value_network.ValueNetwork( obs_spec, fc_layer_params=(100, ), activation_fn=tf.keras.activations.tanh) agent = ppo_clip_agent.PPOClipAgent( ts_spec, act_spec, optimizer=tf.keras.optimizers.Adam(learning_rate=0.001), actor_net=actor_net, value_net=value_net, entropy_regularization=0.0, importance_ratio_clipping=0.2, normalize_observations=False, normalize_rewards=False, use_gae=False, use_td_lambda_return=False, num_epochs=1, debug_summaries=False, summarize_grads_and_vars=False, train_step_counter=train_step, compute_value_and_advantage_in_train=False) def _create_experience(_): observations = tf.constant([ [[1, 2], [3, 4], [5, 6]], [[1, 2], [3, 4], [5, 6]], ], dtype=tf.float32) mid_time_step_val = ts.StepType.MID.tolist() time_steps = ts.TimeStep(step_type=tf.constant( [[mid_time_step_val] * 3] * 2, dtype=tf.int32), reward=tf.constant([[1] * 3] * 2, dtype=tf.float32), discount=tf.constant([[1] * 3] * 2, dtype=tf.float32), observation=observations) actions = tf.constant([[[0], [1], [1]], [[0], [1], [1]]], dtype=tf.float32) action_distribution_parameters = { 'loc': tf.constant([[[0.0]] * 3] * 2, dtype=tf.float32), 'scale': tf.constant([[[1.0]] * 3] * 2, dtype=tf.float32), } value_preds = tf.constant([[9., 15., 21.], [9., 15., 21.]], dtype=tf.float32) policy_info = { 'dist_params': action_distribution_parameters, } policy_info['value_prediction'] = value_preds experience = trajectory.Trajectory(time_steps.step_type, observations, actions, policy_info, time_steps.step_type, time_steps.reward, time_steps.discount) return agent._preprocess(experience) # pylint: disable=protected-access dataset = tf.data.Dataset.from_tensor_slices([[i] for i in range(100) ]).map(_create_experience) dataset = tf.data.Dataset.zip((dataset, tf.data.experimental.Counter())) dataset_fn = lambda: dataset return agent, dataset, dataset_fn, agent.training_data_spec
def testLinearAgentUpdatePerArmFeatures(self, batch_size, context_dim, exploration_policy, dtype, use_eigendecomp=False): """Check that the agent updates for specified actions and rewards.""" # Construct a `Trajectory` for the given action, observation, reward. num_actions = 5 global_context_dim = context_dim arm_context_dim = 3 initial_step, final_step = ( _get_initial_and_final_steps_with_per_arm_features( batch_size, global_context_dim, num_actions, arm_context_dim, num_actions_feature=True)) action = np.random.randint(num_actions, size=batch_size, dtype=np.int32) action_step = policy_step.PolicyStep( action=tf.convert_to_tensor(action), info=policy_utilities.PerArmPolicyInfo( chosen_arm_features=np.arange( batch_size * arm_context_dim, dtype=np.float32).reshape( [batch_size, arm_context_dim]))) experience = _get_experience(initial_step, action_step, final_step) # Construct an agent and perform the update. observation_spec = bandit_spec_utils.create_per_arm_observation_spec( context_dim, arm_context_dim, num_actions, add_num_actions_feature=True) time_step_spec = time_step.time_step_spec(observation_spec) action_spec = tensor_spec.BoundedTensorSpec(dtype=tf.int32, shape=(), minimum=0, maximum=num_actions - 1) agent = linear_agent.LinearBanditAgent( exploration_policy=exploration_policy, time_step_spec=time_step_spec, action_spec=action_spec, use_eigendecomp=use_eigendecomp, accepts_per_arm_features=True, dtype=dtype) self.evaluate(agent.initialize()) loss_info = agent.train(experience) self.evaluate(loss_info) final_a = self.evaluate(agent.cov_matrix) final_b = self.evaluate(agent.data_vector) # Compute the expected updated estimates. global_observation = experience.observation[ bandit_spec_utils.GLOBAL_FEATURE_KEY] arm_observation = experience.policy_info.chosen_arm_features overall_observation = tf.squeeze(tf.concat( [global_observation, arm_observation], axis=-1), axis=1) rewards = tf.squeeze(experience.reward, axis=1) expected_a_new = tf.matmul(overall_observation, overall_observation, transpose_a=True) expected_b_new = bandit_utils.sum_reward_weighted_observations( rewards, overall_observation) self.assertAllClose(expected_a_new, final_a[0]) self.assertAllClose(expected_b_new, final_b[0])
def testLinearAgentUpdateWithBias(self, batch_size, context_dim, exploration_policy, dtype, use_eigendecomp=False): """Check that the agent updates for specified actions and rewards.""" # Construct a `Trajectory` for the given action, observation, reward. num_actions = 5 initial_step, final_step = _get_initial_and_final_steps( batch_size, context_dim) action = np.random.randint(num_actions, size=batch_size, dtype=np.int32) action_step = _get_action_step(action) experience = _get_experience(initial_step, action_step, final_step) # Construct an agent and perform the update. observation_spec = tensor_spec.TensorSpec([context_dim], tf.float32) time_step_spec = time_step.time_step_spec(observation_spec) action_spec = tensor_spec.BoundedTensorSpec(dtype=tf.int32, shape=(), minimum=0, maximum=num_actions - 1) variable_collection = linear_agent.LinearBanditVariableCollection( context_dim + 1, num_actions, use_eigendecomp, dtype) agent = linear_agent.LinearBanditAgent( exploration_policy=exploration_policy, time_step_spec=time_step_spec, action_spec=action_spec, variable_collection=variable_collection, use_eigendecomp=use_eigendecomp, add_bias=True, dtype=dtype) self.evaluate(agent.initialize()) loss_info = agent.train(experience) self.evaluate(loss_info) final_a = self.evaluate(agent.cov_matrix) final_b = self.evaluate(agent.data_vector) final_theta = self.evaluate(agent.theta) # Compute the expected updated estimates. observations_list = tf.dynamic_partition( data=tf.reshape(experience.observation, [batch_size, context_dim]), partitions=tf.convert_to_tensor(action), num_partitions=num_actions) rewards_list = tf.dynamic_partition( data=tf.reshape(experience.reward, [batch_size]), partitions=tf.convert_to_tensor(action), num_partitions=num_actions) expected_a_updated_list = [] expected_b_updated_list = [] expected_theta_updated_list = [] for _, (observations_for_arm, rewards_for_arm) in enumerate( zip(observations_list, rewards_list)): observations_for_arm = tf.concat([ observations_for_arm, tf.ones_like(observations_for_arm[:, 0:1]) ], axis=1) num_samples_for_arm_current = tf.cast( tf.shape(rewards_for_arm)[0], tf.float32) num_samples_for_arm_total = num_samples_for_arm_current # pylint: disable=cell-var-from-loop def true_fn(): a_new = tf.matmul(observations_for_arm, observations_for_arm, transpose_a=True) b_new = bandit_utils.sum_reward_weighted_observations( rewards_for_arm, observations_for_arm) return a_new, b_new def false_fn(): return tf.zeros([context_dim + 1, context_dim + 1]), tf.zeros([context_dim + 1]) a_new, b_new = tf.cond( tf.squeeze(num_samples_for_arm_total) > 0, true_fn, false_fn) theta_new = tf.squeeze(tf.linalg.solve( a_new + tf.eye(context_dim + 1), tf.expand_dims(b_new, axis=-1)), axis=-1) expected_a_updated_list.append(self.evaluate(a_new)) expected_b_updated_list.append(self.evaluate(b_new)) expected_theta_updated_list.append(self.evaluate(theta_new)) # Check that the actual updated estimates match the expectations. self.assertAllClose(expected_a_updated_list, final_a) self.assertAllClose(expected_b_updated_list, final_b) self.assertAllClose(self.evaluate( tf.stack(expected_theta_updated_list)), final_theta, atol=0.1, rtol=0.05)
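# A worked sketch (illustrative numbers) of the closed-form update the bias
# test above checks for a single arm: append a constant-1 bias feature to each
# observation, then A = X^T X, b = sum_i r_i x_i, and theta = (A + I)^{-1} b,
# matching the `tf.linalg.solve` call in the expected-value computation.
import numpy as np

x = np.array([[1.0, 2.0], [3.0, 4.0]])            # two observations for one arm
x = np.concatenate([x, np.ones((2, 1))], axis=1)  # add the bias column
r = np.array([1.0, 0.5])                          # rewards for the same arm
a = x.T @ x                                       # covariance-style statistic
b = (r[:, None] * x).sum(axis=0)                  # reward-weighted observations
theta = np.linalg.solve(a + np.eye(3), b)         # regularized least-squares estimate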
def testTrainPerArmAgent(self): num_actions = 5 mask_spec = tensor_spec.BoundedTensorSpec(dtype=tf.int32, shape=(num_actions, ), minimum=0, maximum=1) obs_spec = (bandit_spec_utils.create_per_arm_observation_spec( 2, 3, num_actions), mask_spec) time_step_spec = time_step.time_step_spec(obs_spec) action_spec = tensor_spec.BoundedTensorSpec(dtype=tf.int32, shape=(), minimum=0, maximum=num_actions - 1) encoding_dim = 10 encoder = (global_and_arm_feature_network. create_feed_forward_common_tower_network( obs_spec[0], (4, 3), (3, 4), (4, 2), encoding_dim)) agent = neural_linucb_agent.NeuralLinUCBAgent( time_step_spec=time_step_spec, action_spec=action_spec, encoding_network=encoder, encoding_network_num_train_steps=10, encoding_dim=encoding_dim, observation_and_action_constraint_splitter=lambda x: (x[0], x[1]), accepts_per_arm_features=True, optimizer=tf.compat.v1.train.AdamOptimizer(learning_rate=0.001)) observations = ({ bandit_spec_utils.GLOBAL_FEATURE_KEY: tf.constant([[1, 2], [3, 4]], dtype=tf.float32), bandit_spec_utils.PER_ARM_FEATURE_KEY: tf.cast(tf.reshape(tf.range(30), shape=[2, 5, 3]), dtype=tf.float32) }, tf.ones(shape=(2, num_actions), dtype=tf.int32)) actions = np.array([0, 3], dtype=np.int32) rewards = np.array([0.5, 3.0], dtype=np.float32) initial_step = time_step.TimeStep( tf.constant(time_step.StepType.FIRST, dtype=tf.int32, shape=[2], name='step_type'), tf.constant(0.0, dtype=tf.float32, shape=[2], name='reward'), tf.constant(1.0, dtype=tf.float32, shape=[2], name='discount'), observations) final_step = time_step.TimeStep( tf.constant(time_step.StepType.LAST, dtype=tf.int32, shape=[2], name='step_type'), tf.constant(rewards, dtype=tf.float32, name='reward'), tf.constant(1.0, dtype=tf.float32, shape=[2], name='discount'), observations) action_step = policy_step.PolicyStep( action=tf.convert_to_tensor(actions), info=policy_utilities.PerArmPolicyInfo( chosen_arm_features=np.array([[1, 2, 3], [3, 2, 1]], dtype=np.float32))) experience = _get_experience(initial_step, action_step, final_step) loss_info, _ = agent.train(experience, None) self.evaluate(tf.compat.v1.initialize_all_variables()) loss_value = self.evaluate(loss_info) self.assertGreater(loss_value, 0.0)
def testLinearAgentUpdateWithForgetting(self, batch_size, context_dim, exploration_policy, dtype, use_eigendecomp=False): """Check that the agent updates for specified actions and rewards.""" # We should rewrite this test as it currently does not depend on # the value of `gamma`. To properly test the forgetting factor, we need to # call `train` twice. gamma = 0.9 # Construct a `Trajectory` for the given action, observation, reward. num_actions = 5 initial_step, final_step = _get_initial_and_final_steps( batch_size, context_dim) action = np.random.randint(num_actions, size=batch_size, dtype=np.int32) action_step = _get_action_step(action) experience = _get_experience(initial_step, action_step, final_step) # Construct an agent and perform the update. observation_spec = tensor_spec.TensorSpec([context_dim], tf.float32) time_step_spec = time_step.time_step_spec(observation_spec) action_spec = tensor_spec.BoundedTensorSpec(dtype=tf.int32, shape=(), minimum=0, maximum=num_actions - 1) agent = linear_agent.LinearBanditAgent( exploration_policy=exploration_policy, time_step_spec=time_step_spec, action_spec=action_spec, gamma=gamma, dtype=dtype, use_eigendecomp=use_eigendecomp) self.evaluate(tf.compat.v1.global_variables_initializer()) loss_info = agent.train(experience) self.evaluate(loss_info) final_a = self.evaluate(agent.cov_matrix) final_b = self.evaluate(agent.data_vector) final_eig_vals = self.evaluate(agent.eig_vals) # Compute the expected updated estimates. observations_list = tf.dynamic_partition( data=tf.reshape(experience.observation, [batch_size, context_dim]), partitions=tf.convert_to_tensor(action), num_partitions=num_actions) rewards_list = tf.dynamic_partition( data=tf.reshape(experience.reward, [batch_size]), partitions=tf.convert_to_tensor(action), num_partitions=num_actions) expected_a_updated_list = [] expected_b_updated_list = [] expected_eigvals_updated_list = [] for _, (observations_for_arm, rewards_for_arm) in enumerate( zip(observations_list, rewards_list)): num_samples_for_arm_current = tf.cast( tf.shape(rewards_for_arm)[0], tf.float32) num_samples_for_arm_total = num_samples_for_arm_current # pylint: disable=cell-var-from-loop def true_fn(): a_new = tf.matmul(observations_for_arm, observations_for_arm, transpose_a=True) b_new = bandit_utils.sum_reward_weighted_observations( rewards_for_arm, observations_for_arm) eigmatrix_new = tf.constant([], dtype=dtype) eigvals_new = tf.constant([], dtype=dtype) if use_eigendecomp: eigvals_new, eigmatrix_new = tf.linalg.eigh(a_new) return a_new, b_new, eigvals_new, eigmatrix_new def false_fn(): if use_eigendecomp: return (tf.zeros([context_dim, context_dim]), tf.zeros([context_dim]), tf.ones([context_dim]), tf.eye(context_dim)) else: return (tf.zeros([context_dim, context_dim]), tf.zeros([context_dim]), tf.constant([], dtype=dtype), tf.constant([], dtype=dtype)) a_new, b_new, eig_vals_new, _ = tf.cond( tf.squeeze(num_samples_for_arm_total) > 0, true_fn, false_fn) expected_a_updated_list.append(self.evaluate(a_new)) expected_b_updated_list.append(self.evaluate(b_new)) expected_eigvals_updated_list.append(self.evaluate(eig_vals_new)) # Check that the actual updated estimates match the expectations. self.assertAllClose(expected_a_updated_list, final_a) self.assertAllClose(expected_b_updated_list, final_b) self.assertAllClose(expected_eigvals_updated_list, final_eig_vals, atol=1e-4, rtol=1e-4)
def setUp(self):
  super(SacAgentTest, self).setUp()
  self._obs_spec = tensor_spec.TensorSpec([2], tf.float32)
  self._time_step_spec = ts.time_step_spec(self._obs_spec)
  self._action_spec = tensor_spec.BoundedTensorSpec([1], tf.float32, -1, 1)
def testInvalidMaximum(self):
  with self.assertRaisesRegexp(ValueError, "not compatible"):
    tensor_spec.BoundedTensorSpec((3, 5), tf.uint8, 0, (1, 1, 1))
def testMixturePolicyDynamicBatchSize(self):
  context_dim = 35
  observation_spec = tensor_spec.TensorSpec([context_dim], tf.float32)
  time_step_spec = ts.time_step_spec(observation_spec)
  action_spec = tensor_spec.BoundedTensorSpec(
      shape=(), dtype=tf.int32, minimum=0, maximum=9, name='action')
  sub_policies = [
      ConstantPolicy(action_spec, time_step_spec, i) for i in range(10)
  ]
  weights = [0, 0, 0.2, 0, 0, 0.3, 0, 0, 0.5, 0]
  dist = tfd.Categorical(probs=weights)
  policy = mixture_policy.MixturePolicy(dist, sub_policies)

  batch_size = tf.random.uniform(
      shape=(), minval=10, maxval=15, dtype=tf.int32)
  time_step = ts.TimeStep(
      tf.fill(
          tf.expand_dims(batch_size, axis=0),
          ts.StepType.FIRST,
          name='step_type'),
      tf.zeros(shape=[batch_size], dtype=tf.float32, name='reward'),
      tf.ones(shape=[batch_size], dtype=tf.float32, name='discount'),
      tf.reshape(
          tf.range(
              tf.cast(batch_size * context_dim, dtype=tf.float32),
              dtype=tf.float32),
          shape=[-1, context_dim],
          name='observation'))
  action_step = policy.action(time_step)
  actions, bsize = self.evaluate([action_step.action, batch_size])
  self.assertAllEqual(actions.shape, [bsize])
  self.assertAllInSet(actions, [2, 5, 8])

  train_step = tf.compat.v1.train.get_or_create_global_step()
  saver = policy_saver.PolicySaver(policy, train_step=train_step)
  location = os.path.join(self.get_temp_dir(), 'saved_policy')
  if not tf.executing_eagerly():
    with self.cached_session():
      self.evaluate(tf.compat.v1.global_variables_initializer())
      saver.save(location)
  else:
    saver.save(location)

  loaded_policy = tf.compat.v2.saved_model.load(location)
  new_batch_size = 3
  new_time_step = ts.TimeStep(
      tf.fill(
          tf.expand_dims(new_batch_size, axis=0),
          ts.StepType.FIRST,
          name='step_type'),
      tf.zeros(shape=[new_batch_size], dtype=tf.float32, name='reward'),
      tf.ones(shape=[new_batch_size], dtype=tf.float32, name='discount'),
      tf.reshape(
          tf.range(
              tf.cast(new_batch_size * context_dim, dtype=tf.float32),
              dtype=tf.float32),
          shape=[-1, context_dim],
          name='observation'))
  new_action = self.evaluate(loaded_policy.action(new_time_step).action)
  self.assertAllEqual(new_action.shape, [new_batch_size])
  self.assertAllInSet(new_action, [2, 5, 8])
def testUint8IncludeMaxOfDtype(self):
  spec = tensor_spec.BoundedTensorSpec((2, 3), tf.uint8, 255, 255)
  sample = tensor_spec.sample_spec_nest(spec)
  sample_ = self.evaluate(sample)
  self.assertTrue(np.all(sample_ == 255))
def setUp(self):
  super(GreedyRewardPredictionPolicyTest, self).setUp()
  self._obs_spec = tensor_spec.TensorSpec([2], tf.float32)
  self._time_step_spec = ts.time_step_spec(self._obs_spec)
  self._action_spec = tensor_spec.BoundedTensorSpec((), tf.int32, 0, 2)