def testAcceptsTensorShape(self):
  desc = tensor_spec.TensorSpec(tf.TensorShape([1]), tf.float32)
  self.assertEqual(desc.shape, tf.TensorShape([1]))
def testErrorOnWrongActionSpecWhenCreatingAgent(self):
  time_step_spec = ts.time_step_spec(tensor_spec.TensorSpec([2], tf.float32))
  wrong_action_spec = array_spec.BoundedArraySpec([1], np.float32, -1, 1)
  with self.assertRaisesRegex(
      TypeError, 'action_spec has to contain BoundedTensorSpec'):
    tf_agent.TFAgent(time_step_spec, wrong_action_spec, None, None, None)
def testGetOuterShapeTwoDims(self):
  tensor = tf.zeros([7, 5, 2, 3], dtype=tf.float32)
  spec = tensor_spec.TensorSpec([2, 3], dtype=tf.float32)
  batch_dim = nest_utils.get_outer_shape(tensor, spec)
  self.assertAllEqual(self.evaluate(batch_dim), [7, 5])
def setUp(self):
  super(SacAgentTest, self).setUp()
  self._obs_spec = tensor_spec.TensorSpec([2], tf.float32)
  self._time_step_spec = ts.time_step_spec(self._obs_spec)
  self._action_spec = tensor_spec.BoundedTensorSpec([1], tf.float32, -1, 1)
def setUp(self):
  super(BehavioralCloningAgentTest, self).setUp()
  self._obs_spec = tensor_spec.TensorSpec([2], tf.float32)
  self._time_step_spec = ts.time_step_spec(self._obs_spec)
  self._action_spec = tensor_spec.BoundedTensorSpec([], tf.int32, 0, 1)
  self._observation_spec = self._time_step_spec.observation
def testMixturePolicyDynamicBatchSize(self):
  context_dim = 35
  observation_spec = tensor_spec.TensorSpec([context_dim], tf.float32)
  time_step_spec = ts.time_step_spec(observation_spec)
  action_spec = tensor_spec.BoundedTensorSpec(
      shape=(), dtype=tf.int32, minimum=0, maximum=9, name='action')
  sub_policies = [
      ConstantPolicy(action_spec, time_step_spec, i) for i in range(10)
  ]
  weights = [0, 0, 0.2, 0, 0, 0.3, 0, 0, 0.5, 0]
  dist = tfd.Categorical(probs=weights)
  policy = mixture_policy.MixturePolicy(dist, sub_policies)
  batch_size = tf.random.uniform(
      shape=(), minval=10, maxval=15, dtype=tf.int32)
  time_step = ts.TimeStep(
      tf.fill(
          tf.expand_dims(batch_size, axis=0),
          ts.StepType.FIRST,
          name='step_type'),
      tf.zeros(shape=[batch_size], dtype=tf.float32, name='reward'),
      tf.ones(shape=[batch_size], dtype=tf.float32, name='discount'),
      tf.reshape(
          tf.range(
              tf.cast(batch_size * context_dim, dtype=tf.float32),
              dtype=tf.float32),
          shape=[-1, context_dim],
          name='observation'))
  action_step = policy.action(time_step)
  actions, bsize = self.evaluate([action_step.action, batch_size])
  self.assertAllEqual(actions.shape, [bsize])
  self.assertAllInSet(actions, [2, 5, 8])

  train_step = tf.compat.v1.train.get_or_create_global_step()
  saver = policy_saver.PolicySaver(policy, train_step=train_step)
  location = os.path.join(self.get_temp_dir(), 'saved_policy')
  if not tf.executing_eagerly():
    with self.cached_session():
      self.evaluate(tf.compat.v1.global_variables_initializer())
      saver.save(location)
  else:
    saver.save(location)

  loaded_policy = tf.compat.v2.saved_model.load(location)
  new_batch_size = 3
  new_time_step = ts.TimeStep(
      tf.fill(
          tf.expand_dims(new_batch_size, axis=0),
          ts.StepType.FIRST,
          name='step_type'),
      tf.zeros(shape=[new_batch_size], dtype=tf.float32, name='reward'),
      tf.ones(shape=[new_batch_size], dtype=tf.float32, name='discount'),
      tf.reshape(
          tf.range(
              tf.cast(new_batch_size * context_dim, dtype=tf.float32),
              dtype=tf.float32),
          shape=[-1, context_dim],
          name='observation'))
  new_action = self.evaluate(loaded_policy.action(new_time_step).action)
  self.assertAllEqual(new_action.shape, [new_batch_size])
  self.assertAllInSet(new_action, [2, 5, 8])
def testNeuralLinUCBUpdateNumTrainSteps0(self, batch_size=1, context_dim=10):
  """Check NeuralLinUCBAgent updates when behaving like LinUCB."""
  # Construct a `Trajectory` for the given action, observation, reward.
  num_actions = 5
  initial_step, final_step = _get_initial_and_final_steps(
      batch_size, context_dim)
  action = np.random.randint(num_actions, size=batch_size, dtype=np.int32)
  action_step = _get_action_step(action)
  experience = _get_experience(initial_step, action_step, final_step)

  # Construct an agent and perform the update.
  observation_spec = tensor_spec.TensorSpec([context_dim], tf.float32)
  time_step_spec = time_step.time_step_spec(observation_spec)
  action_spec = tensor_spec.BoundedTensorSpec(
      dtype=tf.int32, shape=(), minimum=0, maximum=num_actions - 1)
  encoder = DummyNet(obs_dim=context_dim)
  encoding_dim = 10
  agent = neural_linucb_agent.NeuralLinUCBAgent(
      time_step_spec=time_step_spec,
      action_spec=action_spec,
      encoding_network=encoder,
      encoding_network_num_train_steps=0,
      encoding_dim=encoding_dim,
      optimizer=tf.compat.v1.train.AdamOptimizer(learning_rate=1e-2))
  loss_info = agent.train(experience)
  self.evaluate(agent.initialize())
  self.evaluate(tf.compat.v1.global_variables_initializer())
  self.evaluate(loss_info)
  final_a = self.evaluate(agent.cov_matrix)
  final_b = self.evaluate(agent.data_vector)

  # Compute the expected updated estimates.
  observations_list = tf.dynamic_partition(
      data=tf.reshape(
          tf.cast(experience.observation, tf.float64),
          [batch_size, context_dim]),
      partitions=tf.convert_to_tensor(action),
      num_partitions=num_actions)
  rewards_list = tf.dynamic_partition(
      data=tf.reshape(tf.cast(experience.reward, tf.float64), [batch_size]),
      partitions=tf.convert_to_tensor(action),
      num_partitions=num_actions)
  expected_a_updated_list = []
  expected_b_updated_list = []
  for observations_for_arm, rewards_for_arm in zip(observations_list,
                                                   rewards_list):
    encoded_observations_for_arm, _ = encoder(observations_for_arm)
    encoded_observations_for_arm = tf.cast(
        encoded_observations_for_arm, dtype=tf.float64)
    num_samples_for_arm_current = tf.cast(
        tf.shape(rewards_for_arm)[0], tf.float64)
    num_samples_for_arm_total = num_samples_for_arm_current

    # pylint: disable=cell-var-from-loop
    def true_fn():
      a_new = tf.eye(encoding_dim, dtype=tf.float64) + tf.matmul(
          encoded_observations_for_arm,
          encoded_observations_for_arm,
          transpose_a=True)
      b_new = bandit_utils.sum_reward_weighted_observations(
          rewards_for_arm, encoded_observations_for_arm)
      return a_new, b_new

    def false_fn():
      return (tf.eye(encoding_dim, dtype=tf.float64),
              tf.zeros([encoding_dim], dtype=tf.float64))

    a_new, b_new = tf.cond(
        tf.squeeze(num_samples_for_arm_total) > 0, true_fn, false_fn)
    expected_a_updated_list.append(self.evaluate(a_new))
    expected_b_updated_list.append(self.evaluate(b_new))

  # Check that the actual updated estimates match the expectations.
  self.assertAllClose(expected_a_updated_list, final_a)
  self.assertAllClose(expected_b_updated_list, final_b)
def testIsDiscrete(self, dtype):
  spec = tensor_spec.TensorSpec((2, 3), dtype=dtype)
  self.assertIs(spec.is_discrete(), dtype.is_integer)
def testIsContinuous(self, dtype):
  spec = tensor_spec.TensorSpec((2, 3), dtype=dtype)
  self.assertIs(spec.is_continuous(), dtype.is_floating)
def testFromTensorSpec(self):
  spec_1 = tensor_spec.TensorSpec((1, 2), tf.int32)
  spec_2 = tensor_spec.TensorSpec.from_spec(spec_1)
  self.assertEqual(spec_1, spec_2)
def testIsDiscrete(self):
  discrete_spec = tensor_spec.TensorSpec((1, 2), tf.int32)
  continuous_spec = tensor_spec.TensorSpec((1, 2), tf.float32)
  self.assertTrue(discrete_spec.is_discrete())
  self.assertFalse(continuous_spec.is_discrete())
def testName(self):
  desc = tensor_spec.TensorSpec([1], tf.float32, name="beep")
  self.assertEqual(desc.name, "beep")
def testTypeCompatibility(self):
  floats = tf.compat.v1.placeholder(tf.float32, shape=[10, 10])
  ints = tf.compat.v1.placeholder(tf.int32, shape=[10, 10])
  desc = tensor_spec.TensorSpec(shape=(10, 10), dtype=tf.float32)
  self.assertTrue(desc.is_compatible_with(floats))
  self.assertFalse(desc.is_compatible_with(ints))
def testUnknownShape(self):
  desc = tensor_spec.TensorSpec(shape=None, dtype=tf.float32)
  self.assertEqual(desc.shape, tf.TensorShape(None))
def __init__(self,
             dataset: tf.data.Dataset,
             reward_distribution: types.Distribution,
             batch_size: types.Int,
             label_dtype_cast: Optional[tf.DType] = None,
             shuffle_buffer_size: Optional[types.Int] = None,
             repeat_dataset: Optional[bool] = True,
             prefetch_size: Optional[types.Int] = None,
             seed: Optional[types.Int] = None):
  """Initialize `ClassificationBanditEnvironment`.

  Args:
    dataset: a `tf.data.Dataset` consisting of two `Tensor`s, [inputs, labels]
      where inputs can be of any shape, while labels are integer class labels.
      The label tensor can be of any rank as long as it has 1 element.
    reward_distribution: a `tfd.Distribution` with event_shape
      `[num_classes, num_actions]`. Entry `[i, j]` is the reward for taking
      action `j` for an instance of class `i`.
    batch_size: if `dataset` is batched, this is the size of the batches.
    label_dtype_cast: if not None, casts dataset labels to this dtype.
    shuffle_buffer_size: If None, do not shuffle. Otherwise, a shuffle buffer
      of the specified size is used in the environment's `dataset`.
    repeat_dataset: If True, repeats the `dataset`, avoiding
      `OutOfRangeError: End of sequence` errors when the environment is
      stepped past the end of the `dataset`.
    prefetch_size: If None, do not prefetch. Otherwise, a prefetch buffer of
      the specified size is used in the environment's `dataset`.
    seed: Used to make results deterministic.

  Raises:
    ValueError: if `reward_distribution` does not have an event shape with
      rank 2.
  """
  # Computing `action_spec`.
  event_shape = reward_distribution.event_shape
  if len(event_shape) != 2:
    raise ValueError(
        'reward_distribution must have event shape of rank 2; '
        'got event shape {}'.format(event_shape))
  _, num_actions = event_shape
  action_spec = tensor_spec.BoundedTensorSpec(
      shape=(), dtype=tf.int32, minimum=0, maximum=num_actions - 1,
      name='action')
  output_shapes = tf.compat.v1.data.get_output_shapes(dataset)

  # Computing `time_step_spec`.
  if len(output_shapes) != 2:
    raise ValueError(
        'Dataset must have exactly two outputs; got {}'.format(
            len(output_shapes)))
  context_shape = output_shapes[0]
  context_dtype, lbl_dtype = tf.compat.v1.data.get_output_types(dataset)
  if label_dtype_cast:
    lbl_dtype = label_dtype_cast
  observation_spec = tensor_spec.TensorSpec(
      shape=context_shape, dtype=context_dtype)
  time_step_spec = time_step.time_step_spec(observation_spec)

  super(ClassificationBanditEnvironment, self).__init__(
      action_spec=action_spec,
      time_step_spec=time_step_spec,
      batch_size=batch_size)

  if shuffle_buffer_size:
    dataset = dataset.shuffle(
        buffer_size=shuffle_buffer_size,
        seed=seed,
        reshuffle_each_iteration=True)
  if repeat_dataset:
    dataset = dataset.repeat()
  dataset = dataset.batch(batch_size, drop_remainder=True)
  if prefetch_size:
    dataset = dataset.prefetch(prefetch_size)
  self._data_iterator = eager_utils.dataset_iterator(dataset)
  self._current_label = tf.compat.v2.Variable(
      tf.zeros(batch_size, dtype=lbl_dtype))
  self._previous_label = tf.compat.v2.Variable(
      tf.zeros(batch_size, dtype=lbl_dtype))
  self._reward_distribution = reward_distribution
  self._label_dtype = lbl_dtype

  reward_means = self._reward_distribution.mean()
  self._optimal_action_table = tf.argmax(
      reward_means, axis=1, output_type=self._action_spec.dtype)
  self._optimal_reward_table = tf.reduce_max(reward_means, axis=1)
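# Usage sketch for the constructor above. This snippet is not from the source;
# the dataset, reward table, and sizes are made-up assumptions purely for
# illustration, and it assumes `ClassificationBanditEnvironment` is the class
# defined in this module plus standard TF / TFP imports.
import tensorflow as tf
import tensorflow_probability as tfp

tfd = tfp.distributions

# 100 examples with 4-dim contexts and integer class labels in {0, 1, 2}.
example_contexts = tf.random.uniform([100, 4])
example_labels = tf.random.uniform([100], maxval=3, dtype=tf.int32)
example_dataset = tf.data.Dataset.from_tensor_slices(
    (example_contexts, example_labels))

# Deterministic reward table with event_shape [num_classes=3, num_actions=2]:
# entry [i, j] is the reward for taking action j on an example of class i.
example_rewards = tfd.Independent(
    tfd.Deterministic(tf.constant([[1., 0.], [0., 1.], [1., 1.]])),
    reinterpreted_batch_ndims=2)

example_env = ClassificationBanditEnvironment(
    dataset=example_dataset,
    reward_distribution=example_rewards,
    batch_size=8,
    shuffle_buffer_size=64)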
def testExclusive(self, dtype):
  spec = tensor_spec.TensorSpec((2, 3), dtype=dtype)
  self.assertIs(spec.is_discrete() ^ spec.is_continuous(), True)
def setUp(self):
  super(ActorPolicyTest, self).setUp()
  self._obs_spec = tensor_spec.TensorSpec([2], tf.float32)
  self._time_step_spec = ts.time_step_spec(self._obs_spec)
  self._action_spec = tensor_spec.BoundedTensorSpec([1], tf.float32, 2, 3)
def testCreatePlaceholderWithNameScope(self):
  obs_spec = tensor_spec.TensorSpec([2], tf.float32, "obs")
  time_step_spec = ts.time_step_spec(obs_spec)
  ph = tensor_spec.to_nest_placeholder(
      time_step_spec, name_scope="action")
  self.assertEqual(ph.observation.name, "action/obs:0")
def setUp(self):
  super(ReinforceAgentTest, self).setUp()
  tf.compat.v1.enable_resource_variables()
  self._obs_spec = tensor_spec.TensorSpec([2], tf.float32)
  self._time_step_spec = ts.time_step_spec(self._obs_spec)
  self._action_spec = tensor_spec.BoundedTensorSpec([1], tf.float32, -1, 1)
def testAcceptsNumpyDType(self):
  desc = tensor_spec.TensorSpec([1], np.float32)
  self.assertEqual(desc.dtype, tf.float32)
def __init__(self, action_spec, time_step_spec, action):
  self._constant_action = action
  super(ConstantPolicy, self).__init__(
      time_step_spec=time_step_spec,
      action_spec=action_spec,
      info_spec={'a': tensor_spec.TensorSpec(shape=(), dtype=tf.int32)})
def __init__(self):
  observation_spec = tensor_spec.TensorSpec([2, 2], tf.float32)
  time_step_spec = ts.time_step_spec(observation_spec)
  action_spec = tensor_spec.BoundedTensorSpec([1], tf.int32, 0, 1)
  super(TFPolicyMismatchedDtypes, self).__init__(time_step_spec, action_spec)
def setUp(self):
  super(TemporalActionSmoothingTest, self).setUp()
  self._obs_spec = tensor_spec.TensorSpec([2], tf.float32)
  self._time_step_spec = ts.time_step_spec(self._obs_spec)
  self._action_spec = tensor_spec.BoundedTensorSpec([1], tf.float32, 0, 10)
def __init__(self,
             encoding_network,
             encoding_dim,
             reward_layer,
             epsilon_greedy,
             actions_from_reward_layer,
             cov_matrix,
             data_vector,
             num_samples,
             time_step_spec=None,
             alpha=1.0,
             emit_policy_info=(),
             emit_log_probability=False,
             accepts_per_arm_features=False,
             distributed_use_reward_layer=False,
             observation_and_action_constraint_splitter=None,
             name=None):
  """Initializes `NeuralLinUCBPolicy`.

  Args:
    encoding_network: network that encodes the observations.
    encoding_dim: (int) dimension of the encoded observations.
    reward_layer: final layer that predicts the expected reward per arm. In
      case the policy accepts per-arm features, the output of this layer has
      to be a scalar. This is because in the per-arm case, all encoded
      observations have to go through the same computation to get the reward
      estimates. The `num_actions` dimension of the encoded observation is
      treated as a batch dimension in the reward layer.
    epsilon_greedy: (float) representing the probability of choosing a random
      action instead of the greedy action.
    actions_from_reward_layer: (boolean variable) whether to get actions from
      the reward layer or from LinUCB.
    cov_matrix: list of the covariance matrices. There exists one covariance
      matrix per arm, unless the policy accepts per-arm features, in which
      case this list must have a single element.
    data_vector: list of the data vectors. A data vector is a weighted sum of
      the observations, where the weight is the corresponding reward. Each
      arm has its own data vector, unless the policy accepts per-arm
      features, in which case this list must have a single element.
    num_samples: list of number of samples per arm. If the policy accepts
      per-arm features, this is a single-element list counting the number of
      steps.
    time_step_spec: A `TimeStep` spec of the expected time_steps.
    alpha: (float) non-negative weight multiplying the confidence intervals.
    emit_policy_info: (tuple of strings) what side information we want to get
      as part of the policy info. Allowed values can be found in
      `policy_utilities.PolicyInfo`.
    emit_log_probability: (bool) whether to emit log probabilities.
    accepts_per_arm_features: (bool) Whether the policy accepts per-arm
      features.
    distributed_use_reward_layer: (bool) Whether to pick the actions using
      the network or use LinUCB. This applies only in the distributed
      training setting and has a similar role to the
      `actions_from_reward_layer` mentioned above.
    observation_and_action_constraint_splitter: A function used for masking
      valid/invalid actions with each state of the environment. The function
      takes in a full observation and returns a tuple consisting of 1) the
      part of the observation intended as input to the bandit policy and 2)
      the mask. The mask should be a 0-1 `Tensor` of shape
      `[batch_size, num_actions]`. This function should also work with a
      `TensorSpec` as input, and should output `TensorSpec` objects for the
      observation and mask.
    name: The name of this policy.
  """
  encoding_network.create_variables()
  self._encoding_network = encoding_network
  self._reward_layer = reward_layer
  self._encoding_dim = encoding_dim

  if accepts_per_arm_features and reward_layer.units != 1:
    raise ValueError('The output dimension of the reward layer must be 1, got'
                     ' {}'.format(reward_layer.units))

  if not isinstance(cov_matrix, (list, tuple)):
    raise ValueError('cov_matrix must be a list of matrices (Tensors).')
  self._cov_matrix = cov_matrix

  if not isinstance(data_vector, (list, tuple)):
    raise ValueError('data_vector must be a list of vectors (Tensors).')
  self._data_vector = data_vector

  if not isinstance(num_samples, (list, tuple)):
    raise ValueError('num_samples must be a list of vectors (Tensors).')
  self._num_samples = num_samples

  self._alpha = alpha
  self._actions_from_reward_layer = actions_from_reward_layer
  self._epsilon_greedy = epsilon_greedy
  self._dtype = self._data_vector[0].dtype
  self._distributed_use_reward_layer = distributed_use_reward_layer

  if len(cov_matrix) != len(data_vector):
    raise ValueError('The size of list cov_matrix must match the size of '
                     'list data_vector. Got {} for cov_matrix and {} '
                     'for data_vector'.format(
                         len(self._cov_matrix), len(data_vector)))
  if len(num_samples) != len(cov_matrix):
    raise ValueError('The size of num_samples must match the size of '
                     'list cov_matrix. Got {} for num_samples and {} '
                     'for cov_matrix'.format(
                         len(self._num_samples), len(cov_matrix)))

  self._accepts_per_arm_features = accepts_per_arm_features
  if observation_and_action_constraint_splitter is not None:
    context_spec, _ = observation_and_action_constraint_splitter(
        time_step_spec.observation)
  else:
    context_spec = time_step_spec.observation
  if accepts_per_arm_features:
    self._num_actions = tf.nest.flatten(
        context_spec[bandit_spec_utils.PER_ARM_FEATURE_KEY])[0].shape.as_list()[0]
    self._num_models = 1
  else:
    self._num_actions = len(cov_matrix)
    self._num_models = self._num_actions

  cov_matrix_dim = tf.compat.dimension_value(cov_matrix[0].shape[0])
  if self._encoding_dim != cov_matrix_dim:
    raise ValueError('The dimension of matrix `cov_matrix` must match '
                     'encoding dimension {}. '
                     'Got {} for `cov_matrix`.'.format(
                         self._encoding_dim, cov_matrix_dim))
  data_vector_dim = tf.compat.dimension_value(data_vector[0].shape[0])
  if self._encoding_dim != data_vector_dim:
    raise ValueError('The dimension of vector `data_vector` must match '
                     'encoding dimension {}. '
                     'Got {} for `data_vector`.'.format(
                         self._encoding_dim, data_vector_dim))

  action_spec = tensor_spec.BoundedTensorSpec(
      shape=(),
      dtype=tf.int32,
      minimum=0,
      maximum=self._num_actions - 1,
      name='action')

  self._emit_policy_info = emit_policy_info
  predicted_rewards_mean = ()
  if policy_utilities.InfoFields.PREDICTED_REWARDS_MEAN in emit_policy_info:
    predicted_rewards_mean = tensor_spec.TensorSpec(
        [self._num_actions], dtype=tf.float32)
  predicted_rewards_optimistic = ()
  if (policy_utilities.InfoFields.PREDICTED_REWARDS_OPTIMISTIC in
      emit_policy_info):
    predicted_rewards_optimistic = tensor_spec.TensorSpec(
        [self._num_actions], dtype=tf.float32)
  if accepts_per_arm_features:
    chosen_arm_features_info_spec = (
        policy_utilities.create_chosen_arm_features_info_spec(
            time_step_spec.observation,
            observation_and_action_constraint_splitter))
    info_spec = policy_utilities.PerArmPolicyInfo(
        predicted_rewards_mean=predicted_rewards_mean,
        predicted_rewards_optimistic=predicted_rewards_optimistic,
        chosen_arm_features=chosen_arm_features_info_spec)
  else:
    info_spec = policy_utilities.PolicyInfo(
        predicted_rewards_mean=predicted_rewards_mean,
        predicted_rewards_optimistic=predicted_rewards_optimistic)

  super(NeuralLinUCBPolicy, self).__init__(
      time_step_spec=time_step_spec,
      action_spec=action_spec,
      emit_log_probability=emit_log_probability,
      observation_and_action_constraint_splitter=(
          observation_and_action_constraint_splitter),
      info_spec=info_spec,
      name=name)
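# Construction sketch for the policy above. The shapes, layer sizes, and the
# encoder below are illustrative assumptions (not values from the source);
# it assumes `NeuralLinUCBPolicy` is the class defined in this module and the
# standard tf_agents networks/specs.
import tensorflow as tf
from tf_agents.networks import encoding_network
from tf_agents.specs import tensor_spec
from tf_agents.trajectories import time_step as ts

context_dim, encoding_dim, num_actions = 10, 8, 3
example_observation_spec = tensor_spec.TensorSpec([context_dim], tf.float32)
example_encoder = encoding_network.EncodingNetwork(
    input_tensor_spec=example_observation_spec,
    fc_layer_params=(encoding_dim,))

example_policy = NeuralLinUCBPolicy(
    encoding_network=example_encoder,
    encoding_dim=encoding_dim,
    # One reward estimate per arm in the non-per-arm-features case.
    reward_layer=tf.keras.layers.Dense(num_actions),
    epsilon_greedy=0.05,
    actions_from_reward_layer=tf.constant(True),
    # One LinUCB model per arm: identity priors and empty data vectors.
    cov_matrix=[tf.eye(encoding_dim, dtype=tf.float64)] * num_actions,
    data_vector=[tf.zeros([encoding_dim], dtype=tf.float64)] * num_actions,
    num_samples=[tf.zeros([], dtype=tf.float64)] * num_actions,
    time_step_spec=ts.time_step_spec(example_observation_spec))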
def create_feed_forward_common_tower_network(observation_spec,
                                             global_layers,
                                             arm_layers,
                                             common_layers,
                                             output_dim=1,
                                             global_preprocessing_combiner=None,
                                             arm_preprocessing_combiner=None):
  """Creates a common tower network with feedforward towers.

  The network produced by this function can be used either in
  `GreedyRewardPredictionPolicy`, or `NeuralLinUCBPolicy`. In the former case,
  the network must have `output_dim=1`, it is going to be an instance of
  `QNetwork`, and used in the policy as a reward prediction network. In the
  latter case, the network will be an encoding network with its output
  consumed by a reward layer or a LinUCB method. The specified `output_dim`
  will be the encoding dimension.

  Args:
    observation_spec: A nested tensor spec containing the specs for global as
      well as per-arm observations.
    global_layers: Iterable of ints. Specifies the layers of the global tower.
    arm_layers: Iterable of ints. Specifies the layers of the arm tower.
    common_layers: Iterable of ints. Specifies the layers of the common tower.
    output_dim: The output dimension of the network. If 1, the common tower
      will be a QNetwork. Otherwise, the common tower will be an encoding
      network with the specified output dimension.
    global_preprocessing_combiner: Preprocessing combiner for global features.
    arm_preprocessing_combiner: Preprocessing combiner for the arm features.

  Returns:
    A network that takes observations adhering to `observation_spec` and
    outputs reward estimates for every action.
  """
  global_network = encoding_network.EncodingNetwork(
      input_tensor_spec=observation_spec[bandit_spec_utils.GLOBAL_FEATURE_KEY],
      fc_layer_params=global_layers,
      preprocessing_combiner=global_preprocessing_combiner)

  arm_feature_spec = tensor_spec.remove_outer_dims_nest(
      observation_spec[bandit_spec_utils.PER_ARM_FEATURE_KEY], 1)
  arm_network = encoding_network.EncodingNetwork(
      input_tensor_spec=arm_feature_spec,
      fc_layer_params=arm_layers,
      preprocessing_combiner=arm_preprocessing_combiner)

  common_input_dim = global_layers[-1] + arm_layers[-1]
  common_input_spec = tensor_spec.TensorSpec(
      shape=(common_input_dim,), dtype=tf.float32)
  if output_dim == 1:
    common_network = q_network.QNetwork(
        input_tensor_spec=common_input_spec,
        action_spec=tensor_spec.BoundedTensorSpec(
            shape=(), minimum=0, maximum=0, dtype=tf.int32),
        fc_layer_params=common_layers)
  else:
    common_network = encoding_network.EncodingNetwork(
        input_tensor_spec=common_input_spec,
        fc_layer_params=list(common_layers) + [output_dim])
  return GlobalAndArmCommonTowerNetwork(observation_spec, global_network,
                                        arm_network, common_network)
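# Example call for the factory above. The dimensions are made-up assumptions;
# it relies on the `bandit_spec_utils` and `tensor_spec` modules already used
# in this file and on the global/per-arm observation layout the function
# expects.
example_observation_spec = {
    bandit_spec_utils.GLOBAL_FEATURE_KEY:
        tensor_spec.TensorSpec([4], tf.float32),
    bandit_spec_utils.PER_ARM_FEATURE_KEY:
        tensor_spec.TensorSpec([3, 5], tf.float32),  # 3 arms, 5 features each
}
# With the default output_dim=1 the common tower is a QNetwork that emits one
# reward estimate per action.
example_network = create_feed_forward_common_tower_network(
    observation_spec=example_observation_spec,
    global_layers=(8, 6),
    arm_layers=(8, 6),
    common_layers=(4,))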
def observation_spec(self):
  return tensor_spec.TensorSpec(
      shape=[3], dtype=tf.float32, name='observation_spec')
def testGetOuterShapeOneDim(self):
  tensor = tf.zeros([5, 2, 3], dtype=tf.float32)
  spec = tensor_spec.TensorSpec([2, 3], dtype=tf.float32)
  batch_size = nest_utils.get_outer_shape(tensor, spec)
  self.assertEqual(self.evaluate(batch_size), [5])
def __init__(self,
             time_step_spec,
             action_spec,
             optimizer=None,
             actor_net=None,
             value_net=None,
             importance_ratio_clipping=0.0,
             lambda_value=0.95,
             discount_factor=0.99,
             entropy_regularization=0.0,
             policy_l2_reg=0.0,
             value_function_l2_reg=0.0,
             value_pred_loss_coef=0.5,
             num_epochs=25,
             use_gae=False,
             use_td_lambda_return=False,
             normalize_rewards=True,
             reward_norm_clipping=10.0,
             normalize_observations=True,
             log_prob_clipping=0.0,
             kl_cutoff_factor=2.0,
             kl_cutoff_coef=1000.0,
             initial_adaptive_kl_beta=1.0,
             adaptive_kl_target=0.01,
             adaptive_kl_tolerance=0.3,
             gradient_clipping=None,
             check_numerics=False,
             debug_summaries=False,
             summarize_grads_and_vars=False,
             train_step_counter=None,
             name=None):
  """Creates a PPO Agent.

  Args:
    time_step_spec: A `TimeStep` spec of the expected time_steps.
    action_spec: A nest of BoundedTensorSpec representing the actions.
    optimizer: Optimizer to use for the agent.
    actor_net: A function actor_net(observations, action_spec) that returns
      tensor of action distribution params for each observation. Takes nested
      observation and returns nested action.
    value_net: A function value_net(time_steps) that returns value tensor from
      neural net predictions for each observation. Takes nested observation
      and returns batch of value_preds.
    importance_ratio_clipping: Epsilon in clipped, surrogate PPO objective.
      For more detail, see explanation at the top of the doc.
    lambda_value: Lambda parameter for TD-lambda computation.
    discount_factor: Discount factor for return computation.
    entropy_regularization: Coefficient for entropy regularization loss term.
    policy_l2_reg: Coefficient for l2 regularization of policy weights.
    value_function_l2_reg: Coefficient for l2 regularization of value function
      weights.
    value_pred_loss_coef: Multiplier for value prediction loss to balance with
      policy gradient loss.
    num_epochs: Number of epochs for computing policy updates.
    use_gae: If True (default False), uses generalized advantage estimation
      for computing per-timestep advantage. Else, just subtracts value
      predictions from empirical return.
    use_td_lambda_return: If True (default False), uses td_lambda_return for
      training value function. (td_lambda_return = gae_advantage +
      value_predictions)
    normalize_rewards: If true, keeps moving variance of rewards and
      normalizes incoming rewards.
    reward_norm_clipping: Value above and below to clip normalized reward.
    normalize_observations: If true, keeps moving mean and variance of
      observations and normalizes incoming observations.
    log_prob_clipping: +/- value for clipping log probs to prevent inf / NaN
      values. Default: no clipping.
    kl_cutoff_factor: If policy KL changes more than this much for any single
      timestep, adds a squared KL penalty to loss function.
    kl_cutoff_coef: Loss coefficient for kl cutoff term.
    initial_adaptive_kl_beta: Initial value for beta coefficient of adaptive
      kl penalty.
    adaptive_kl_target: Desired kl target for policy updates. If actual kl is
      far from this target, adaptive_kl_beta will be updated.
    adaptive_kl_tolerance: A tolerance for adaptive_kl_beta. Mean KL above
      (1 + tol) * adaptive_kl_target, or below
      (1 - tol) * adaptive_kl_target, will cause adaptive_kl_beta to be
      updated.
    gradient_clipping: Norm length to clip gradients. Default: no clipping.
    check_numerics: If true, adds tf.debugging.check_numerics to help find
      NaN / Inf values. For debugging only.
    debug_summaries: A bool to gather debug summaries.
    summarize_grads_and_vars: If true, gradient summaries will be written.
    train_step_counter: An optional counter to increment every time the train
      op is run. Defaults to the global_step.
    name: The name of this agent. All variables in this module will fall
      under that name. Defaults to the class name.

  Raises:
    ValueError: If the actor_net is not a DistributionNetwork.
  """
  if not isinstance(actor_net, network.DistributionNetwork):
    raise ValueError(
        'actor_net must be an instance of a DistributionNetwork.')

  tf.Module.__init__(self, name=name)

  self._optimizer = optimizer
  self._actor_net = actor_net
  self._value_net = value_net
  self._importance_ratio_clipping = importance_ratio_clipping
  self._lambda = lambda_value
  self._discount_factor = discount_factor
  self._entropy_regularization = entropy_regularization
  self._policy_l2_reg = policy_l2_reg
  self._value_function_l2_reg = value_function_l2_reg
  self._value_pred_loss_coef = value_pred_loss_coef
  self._num_epochs = num_epochs
  self._use_gae = use_gae
  self._use_td_lambda_return = use_td_lambda_return
  self._reward_norm_clipping = reward_norm_clipping
  self._log_prob_clipping = log_prob_clipping
  self._kl_cutoff_factor = kl_cutoff_factor
  self._kl_cutoff_coef = kl_cutoff_coef
  self._adaptive_kl_target = adaptive_kl_target
  self._adaptive_kl_tolerance = adaptive_kl_tolerance
  self._gradient_clipping = gradient_clipping or 0.0
  self._check_numerics = check_numerics

  if initial_adaptive_kl_beta > 0.0:
    # TODO(kbanoop): Rename create_variable.
    self._adaptive_kl_beta = common.create_variable(
        'adaptive_kl_beta', initial_adaptive_kl_beta, dtype=tf.float32)
  else:
    self._adaptive_kl_beta = None

  self._reward_normalizer = None
  if normalize_rewards:
    self._reward_normalizer = tensor_normalizer.StreamingTensorNormalizer(
        tensor_spec.TensorSpec([], tf.float32), scope='normalize_reward')

  self._observation_normalizer = None
  if normalize_observations:
    self._observation_normalizer = (
        tensor_normalizer.StreamingTensorNormalizer(
            time_step_spec.observation, scope='normalize_observations'))

  policy = greedy_policy.GreedyPolicy(
      ppo_policy.PPOPolicy(
          time_step_spec=time_step_spec,
          action_spec=action_spec,
          actor_network=actor_net,
          value_network=value_net,
          observation_normalizer=self._observation_normalizer,
          clip=False,
          collect=False))

  collect_policy = ppo_policy.PPOPolicy(
      time_step_spec=time_step_spec,
      action_spec=action_spec,
      actor_network=actor_net,
      value_network=value_net,
      observation_normalizer=self._observation_normalizer,
      clip=False,
      collect=True)

  self._action_distribution_spec = self._actor_net.output_spec

  super(PPOAgent, self).__init__(
      time_step_spec,
      action_spec,
      policy,
      collect_policy,
      train_sequence_length=None,
      debug_summaries=debug_summaries,
      summarize_grads_and_vars=summarize_grads_and_vars,
      train_step_counter=train_step_counter)
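# Minimal construction sketch for the agent above. The specs, layer sizes,
# and optimizer are illustrative assumptions; it uses the standard tf_agents
# actor/value networks and assumes `PPOAgent` is the class defined here.
import tensorflow as tf
from tf_agents.networks import actor_distribution_network
from tf_agents.networks import value_network
from tf_agents.specs import tensor_spec
from tf_agents.trajectories import time_step as ts

example_observation_spec = tensor_spec.TensorSpec([4], tf.float32)
example_time_step_spec = ts.time_step_spec(example_observation_spec)
example_action_spec = tensor_spec.BoundedTensorSpec([2], tf.float32, -1, 1)

# PPOAgent requires a DistributionNetwork actor and a value network.
example_actor_net = actor_distribution_network.ActorDistributionNetwork(
    example_observation_spec, example_action_spec, fc_layer_params=(32,))
example_value_net = value_network.ValueNetwork(
    example_observation_spec, fc_layer_params=(32,))

example_agent = PPOAgent(
    example_time_step_spec,
    example_action_spec,
    optimizer=tf.compat.v1.train.AdamOptimizer(learning_rate=1e-3),
    actor_net=example_actor_net,
    value_net=example_value_net,
    num_epochs=2)
example_agent.initialize()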
def testGetOuterShapeDynamicShapeBatched(self):
  spec = tensor_spec.TensorSpec([1], dtype=tf.float32)
  tensor = tf.convert_to_tensor(value=[[0.0]] * 8)
  batch_size = self.evaluate(nest_utils.get_outer_shape(tensor, spec))
  self.assertAllEqual(batch_size, [8])
def setUp(self):
  super(DqnAgentTest, self).setUp()
  self._observation_spec = tensor_spec.TensorSpec([2], tf.float32)
  self._time_step_spec = ts.time_step_spec(self._observation_spec)
  self._action_spec = tensor_spec.BoundedTensorSpec((), tf.int32, 0, 1)