Example #1
 def testAcceptsTensorShape(self):
   desc = tensor_spec.TensorSpec(tf.TensorShape([1]), tf.float32)
   self.assertEqual(desc.shape, tf.TensorShape([1]))
Example #2
 def testErrorOnWrongActionSpecWhenCreatingAgent(self):
   time_step_spec = ts.time_step_spec(tensor_spec.TensorSpec([2], tf.float32))
   wrong_action_spec = array_spec.BoundedArraySpec([1], np.float32, -1, 1)
   with self.assertRaisesRegex(
       TypeError, 'action_spec has to contain BoundedTensorSpec'):
     tf_agent.TFAgent(time_step_spec, wrong_action_spec, None, None, None)
Example #3
 def testGetOuterShapeTwoDims(self):
     tensor = tf.zeros([7, 5, 2, 3], dtype=tf.float32)
     spec = tensor_spec.TensorSpec([2, 3], dtype=tf.float32)
     batch_dim = nest_utils.get_outer_shape(tensor, spec)
     self.assertAllEqual(self.evaluate(batch_dim), [7, 5])
Example #4
 def setUp(self):
   super(SacAgentTest, self).setUp()
   self._obs_spec = tensor_spec.TensorSpec([2], tf.float32)
   self._time_step_spec = ts.time_step_spec(self._obs_spec)
   self._action_spec = tensor_spec.BoundedTensorSpec([1], tf.float32, -1, 1)
Example #5
 def setUp(self):
     super(BehavioralCloningAgentTest, self).setUp()
     self._obs_spec = tensor_spec.TensorSpec([2], tf.float32)
     self._time_step_spec = ts.time_step_spec(self._obs_spec)
     self._action_spec = tensor_spec.BoundedTensorSpec([], tf.int32, 0, 1)
     self._observation_spec = self._time_step_spec.observation
Example #6
    def testMixturePolicyDynamicBatchSize(self):
        context_dim = 35
        observation_spec = tensor_spec.TensorSpec([context_dim], tf.float32)
        time_step_spec = ts.time_step_spec(observation_spec)
        action_spec = tensor_spec.BoundedTensorSpec(shape=(),
                                                    dtype=tf.int32,
                                                    minimum=0,
                                                    maximum=9,
                                                    name='action')
        sub_policies = [
            ConstantPolicy(action_spec, time_step_spec, i) for i in range(10)
        ]
        weights = [0, 0, 0.2, 0, 0, 0.3, 0, 0, 0.5, 0]
        dist = tfd.Categorical(probs=weights)

        policy = mixture_policy.MixturePolicy(dist, sub_policies)
        batch_size = tf.random.uniform(shape=(),
                                       minval=10,
                                       maxval=15,
                                       dtype=tf.int32)
        time_step = ts.TimeStep(
            tf.fill(tf.expand_dims(batch_size, axis=0),
                    ts.StepType.FIRST,
                    name='step_type'),
            tf.zeros(shape=[batch_size], dtype=tf.float32, name='reward'),
            tf.ones(shape=[batch_size], dtype=tf.float32, name='discount'),
            tf.reshape(tf.range(tf.cast(batch_size * context_dim,
                                        dtype=tf.float32),
                                dtype=tf.float32),
                       shape=[-1, context_dim],
                       name='observation'))
        action_step = policy.action(time_step)
        actions, bsize = self.evaluate([action_step.action, batch_size])
        self.assertAllEqual(actions.shape, [bsize])
        self.assertAllInSet(actions, [2, 5, 8])

        train_step = tf.compat.v1.train.get_or_create_global_step()
        saver = policy_saver.PolicySaver(policy, train_step=train_step)
        location = os.path.join(self.get_temp_dir(), 'saved_policy')
        if not tf.executing_eagerly():
            with self.cached_session():
                self.evaluate(tf.compat.v1.global_variables_initializer())
                saver.save(location)
        else:
            saver.save(location)
        loaded_policy = tf.compat.v2.saved_model.load(location)
        new_batch_size = 3
        new_time_step = ts.TimeStep(
            tf.fill(tf.expand_dims(new_batch_size, axis=0),
                    ts.StepType.FIRST,
                    name='step_type'),
            tf.zeros(shape=[new_batch_size], dtype=tf.float32, name='reward'),
            tf.ones(shape=[new_batch_size], dtype=tf.float32, name='discount'),
            tf.reshape(tf.range(tf.cast(new_batch_size * context_dim,
                                        dtype=tf.float32),
                                dtype=tf.float32),
                       shape=[-1, context_dim],
                       name='observation'))
        new_action = self.evaluate(loaded_policy.action(new_time_step).action)
        self.assertAllEqual(new_action.shape, [new_batch_size])
        self.assertAllInSet(new_action, [2, 5, 8])
Example #7
    def testNeuralLinUCBUpdateNumTrainSteps0(self,
                                             batch_size=1,
                                             context_dim=10):
        """Check NeuralLinUCBAgent updates when behaving like LinUCB."""

        # Construct a `Trajectory` for the given action, observation, reward.
        num_actions = 5
        initial_step, final_step = _get_initial_and_final_steps(
            batch_size, context_dim)
        action = np.random.randint(num_actions,
                                   size=batch_size,
                                   dtype=np.int32)
        action_step = _get_action_step(action)
        experience = _get_experience(initial_step, action_step, final_step)

        # Construct an agent and perform the update.
        observation_spec = tensor_spec.TensorSpec([context_dim], tf.float32)
        time_step_spec = time_step.time_step_spec(observation_spec)
        action_spec = tensor_spec.BoundedTensorSpec(dtype=tf.int32,
                                                    shape=(),
                                                    minimum=0,
                                                    maximum=num_actions - 1)
        encoder = DummyNet(obs_dim=context_dim)
        encoding_dim = 10
        agent = neural_linucb_agent.NeuralLinUCBAgent(
            time_step_spec=time_step_spec,
            action_spec=action_spec,
            encoding_network=encoder,
            encoding_network_num_train_steps=0,
            encoding_dim=encoding_dim,
            optimizer=tf.compat.v1.train.AdamOptimizer(learning_rate=1e-2))

        loss_info = agent.train(experience)
        self.evaluate(agent.initialize())
        self.evaluate(tf.compat.v1.global_variables_initializer())
        self.evaluate(loss_info)
        final_a = self.evaluate(agent.cov_matrix)
        final_b = self.evaluate(agent.data_vector)

        # Compute the expected updated estimates.
        observations_list = tf.dynamic_partition(
            data=tf.reshape(tf.cast(experience.observation, tf.float64),
                            [batch_size, context_dim]),
            partitions=tf.convert_to_tensor(action),
            num_partitions=num_actions)
        rewards_list = tf.dynamic_partition(
            data=tf.reshape(tf.cast(experience.reward, tf.float64),
                            [batch_size]),
            partitions=tf.convert_to_tensor(action),
            num_partitions=num_actions)
        expected_a_updated_list = []
        expected_b_updated_list = []
        for _, (observations_for_arm, rewards_for_arm) in enumerate(
                zip(observations_list, rewards_list)):

            encoded_observations_for_arm, _ = encoder(observations_for_arm)
            encoded_observations_for_arm = tf.cast(
                encoded_observations_for_arm, dtype=tf.float64)

            num_samples_for_arm_current = tf.cast(
                tf.shape(rewards_for_arm)[0], tf.float64)
            num_samples_for_arm_total = num_samples_for_arm_current

            # pylint: disable=cell-var-from-loop
            def true_fn():
                a_new = tf.eye(encoding_dim, dtype=tf.float64) + tf.matmul(
                    encoded_observations_for_arm,
                    encoded_observations_for_arm,
                    transpose_a=True)
                b_new = bandit_utils.sum_reward_weighted_observations(
                    rewards_for_arm, encoded_observations_for_arm)
                return a_new, b_new

            def false_fn():
                return (tf.eye(encoding_dim, dtype=tf.float64),
                        tf.zeros([encoding_dim], dtype=tf.float64))

            a_new, b_new = tf.cond(
                tf.squeeze(num_samples_for_arm_total) > 0, true_fn, false_fn)

            expected_a_updated_list.append(self.evaluate(a_new))
            expected_b_updated_list.append(self.evaluate(b_new))

        # Check that the actual updated estimates match the expectations.
        self.assertAllClose(expected_a_updated_list, final_a)
        self.assertAllClose(expected_b_updated_list, final_b)
Example #8
 def testIsDiscrete(self, dtype):
   spec = tensor_spec.TensorSpec((2, 3), dtype=dtype)
   self.assertIs(spec.is_discrete(), dtype.is_integer)
Example #9
 def testIsContinuous(self, dtype):
   spec = tensor_spec.TensorSpec((2, 3), dtype=dtype)
   self.assertIs(spec.is_continuous(), dtype.is_floating)
Example #10
 def testFromTensorSpec(self):
   spec_1 = tensor_spec.TensorSpec((1, 2), tf.int32)
   spec_2 = tensor_spec.TensorSpec.from_spec(spec_1)
   self.assertEqual(spec_1, spec_2)
Example #11
 def testIsDiscrete(self):
   discrete_spec = tensor_spec.TensorSpec((1, 2), tf.int32)
   continuous_spec = tensor_spec.TensorSpec((1, 2), tf.float32)
   self.assertTrue(discrete_spec.is_discrete())
   self.assertFalse(continuous_spec.is_discrete())
Example #12
 def testName(self):
   desc = tensor_spec.TensorSpec([1], tf.float32, name="beep")
   self.assertEqual(desc.name, "beep")
Example #13
 def testTypeCompatibility(self):
   floats = tf.compat.v1.placeholder(tf.float32, shape=[10, 10])
   ints = tf.compat.v1.placeholder(tf.int32, shape=[10, 10])
   desc = tensor_spec.TensorSpec(shape=(10, 10), dtype=tf.float32)
   self.assertTrue(desc.is_compatible_with(floats))
   self.assertFalse(desc.is_compatible_with(ints))
Example #14
 def testUnknownShape(self):
   desc = tensor_spec.TensorSpec(shape=None, dtype=tf.float32)
   self.assertEqual(desc.shape, tf.TensorShape(None))
Example #15
    def __init__(self,
                 dataset: tf.data.Dataset,
                 reward_distribution: types.Distribution,
                 batch_size: types.Int,
                 label_dtype_cast: Optional[tf.DType] = None,
                 shuffle_buffer_size: Optional[types.Int] = None,
                 repeat_dataset: Optional[bool] = True,
                 prefetch_size: Optional[types.Int] = None,
                 seed: Optional[types.Int] = None):
        """Initialize `ClassificationBanditEnvironment`.

    Args:
      dataset: a `tf.data.Dataset` consisting of two `Tensor`s, [inputs, labels]
        where inputs can be of any shape, while labels are integer class labels.
        The label tensor can be of any rank as long as it has 1 element.
      reward_distribution: a `tfd.Distribution` with event_shape
        `[num_classes, num_actions]`. Entry `[i, j]` is the reward for taking
        action `j` for an instance of class `i`.
      batch_size: if `dataset` is batched, this is the size of the batches.
      label_dtype_cast: if not None, casts dataset labels to this dtype.
      shuffle_buffer_size: If None, do not shuffle.  Otherwise, a shuffle buffer
        of the specified size is used in the environment's `dataset`.
      repeat_dataset: If True, repeats the `dataset` so that stepping the
        environment past the end of the `dataset` does not raise
        `OutOfRangeError: End of sequence`.
      prefetch_size: If None, do not prefetch.  Otherwise, a prefetch buffer
        of the specified size is used in the environment's `dataset`.
      seed: Used to make results deterministic.
    Raises:
      ValueError: if `reward_distribution` does not have an event shape with
        rank 2.
    """

        # Computing `action_spec`.
        event_shape = reward_distribution.event_shape
        if len(event_shape) != 2:
            raise ValueError(
                'reward_distribution must have event shape of rank 2; '
                'got event shape {}'.format(event_shape))
        _, num_actions = event_shape
        action_spec = tensor_spec.BoundedTensorSpec(shape=(),
                                                    dtype=tf.int32,
                                                    minimum=0,
                                                    maximum=num_actions - 1,
                                                    name='action')
        output_shapes = tf.compat.v1.data.get_output_shapes(dataset)

        # Computing `time_step_spec`.
        if len(output_shapes) != 2:
            raise ValueError(
                'Dataset must have exactly two outputs; got {}'.format(
                    len(output_shapes)))
        context_shape = output_shapes[0]
        context_dtype, lbl_dtype = tf.compat.v1.data.get_output_types(dataset)
        if label_dtype_cast:
            lbl_dtype = label_dtype_cast
        observation_spec = tensor_spec.TensorSpec(shape=context_shape,
                                                  dtype=context_dtype)
        time_step_spec = time_step.time_step_spec(observation_spec)

        super(ClassificationBanditEnvironment,
              self).__init__(action_spec=action_spec,
                             time_step_spec=time_step_spec,
                             batch_size=batch_size)

        if shuffle_buffer_size:
            dataset = dataset.shuffle(buffer_size=shuffle_buffer_size,
                                      seed=seed,
                                      reshuffle_each_iteration=True)
        if repeat_dataset:
            dataset = dataset.repeat()
        dataset = dataset.batch(batch_size, drop_remainder=True)
        if prefetch_size:
            dataset = dataset.prefetch(prefetch_size)
        self._data_iterator = eager_utils.dataset_iterator(dataset)
        self._current_label = tf.compat.v2.Variable(
            tf.zeros(batch_size, dtype=lbl_dtype))
        self._previous_label = tf.compat.v2.Variable(
            tf.zeros(batch_size, dtype=lbl_dtype))
        self._reward_distribution = reward_distribution
        self._label_dtype = lbl_dtype

        reward_means = self._reward_distribution.mean()
        self._optimal_action_table = tf.argmax(
            reward_means, axis=1, output_type=self._action_spec.dtype)
        self._optimal_reward_table = tf.reduce_max(reward_means, axis=1)
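
For reference, a minimal construction sketch for this environment follows. The import paths, dataset contents, and reward table below are illustrative assumptions; only the constructor arguments come from the code above.

import tensorflow as tf
import tensorflow_probability as tfp

tfd = tfp.distributions

# `ClassificationBanditEnvironment` is the class defined above; its import
# path is omitted and assumed to be available.
num_classes, num_actions = 3, 3
contexts = tf.random.uniform([30, 4])                                  # inputs of any shape
labels = tf.random.uniform([30], maxval=num_classes, dtype=tf.int32)   # integer class labels
dataset = tf.data.Dataset.from_tensor_slices((contexts, labels))

# Reward 1 when the action matches the class, 0 otherwise. `Independent`
# turns the [num_classes, num_actions] batch shape of `Deterministic` into
# the required rank-2 event shape.
reward_distribution = tfd.Independent(
    tfd.Deterministic(loc=tf.eye(num_classes, num_actions)),
    reinterpreted_batch_ndims=2)

env = ClassificationBanditEnvironment(dataset, reward_distribution, batch_size=2)
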
Example #16
 def testExclusive(self, dtype):
   spec = tensor_spec.TensorSpec((2, 3), dtype=dtype)
   self.assertIs(spec.is_discrete() ^ spec.is_continuous(), True)
Example #17
 def setUp(self):
     super(ActorPolicyTest, self).setUp()
     self._obs_spec = tensor_spec.TensorSpec([2], tf.float32)
     self._time_step_spec = ts.time_step_spec(self._obs_spec)
     self._action_spec = tensor_spec.BoundedTensorSpec([1], tf.float32, 2,
                                                       3)
Example #18
 def testCreatePlaceholderWithNameScope(self):
   obs_spec = tensor_spec.TensorSpec([2], tf.float32, "obs")
   time_step_spec = ts.time_step_spec(obs_spec)
   ph = tensor_spec.to_nest_placeholder(
       time_step_spec, name_scope="action")
   self.assertEqual(ph.observation.name, "action/obs:0")
Example #19
 def setUp(self):
   super(ReinforceAgentTest, self).setUp()
   tf.compat.v1.enable_resource_variables()
   self._obs_spec = tensor_spec.TensorSpec([2], tf.float32)
   self._time_step_spec = ts.time_step_spec(self._obs_spec)
   self._action_spec = tensor_spec.BoundedTensorSpec([1], tf.float32, -1, 1)
Example #20
 def testAcceptsNumpyDType(self):
   desc = tensor_spec.TensorSpec([1], np.float32)
   self.assertEqual(desc.dtype, tf.float32)
Example #21
 def __init__(self, action_spec, time_step_spec, action):
     self._constant_action = action
     super(ConstantPolicy, self).__init__(
         time_step_spec=time_step_spec,
         action_spec=action_spec,
         info_spec={'a': tensor_spec.TensorSpec(shape=(), dtype=tf.int32)})
Example #22
 def __init__(self):
     observation_spec = tensor_spec.TensorSpec([2, 2], tf.float32)
     time_step_spec = ts.time_step_spec(observation_spec)
     action_spec = tensor_spec.BoundedTensorSpec([1], tf.int32, 0, 1)
     super(TFPolicyMismatchedDtypes, self).__init__(time_step_spec,
                                                    action_spec)
Example #23
 def setUp(self):
     super(TemporalActionSmoothingTest, self).setUp()
     self._obs_spec = tensor_spec.TensorSpec([2], tf.float32)
     self._time_step_spec = ts.time_step_spec(self._obs_spec)
     self._action_spec = tensor_spec.BoundedTensorSpec([1], tf.float32, 0,
                                                       10)
Example #24
  def __init__(self,
               encoding_network,
               encoding_dim,
               reward_layer,
               epsilon_greedy,
               actions_from_reward_layer,
               cov_matrix,
               data_vector,
               num_samples,
               time_step_spec=None,
               alpha=1.0,
               emit_policy_info=(),
               emit_log_probability=False,
               accepts_per_arm_features=False,
               distributed_use_reward_layer=False,
               observation_and_action_constraint_splitter=None,
               name=None):
    """Initializes `NeuralLinUCBPolicy`.

    Args:
      encoding_network: network that encodes the observations.
      encoding_dim: (int) dimension of the encoded observations.
      reward_layer: final layer that predicts the expected reward per arm. In
        case the policy accepts per-arm features, the output of this layer has
        to be a scalar. This is because in the per-arm case, all encoded
        observations have to go through the same computation to get the reward
        estimates. The `num_actions` dimension of the encoded observation is
        treated as a batch dimension in the reward layer.
      epsilon_greedy: (float) representing the probability of choosing a random
        action instead of the greedy action.
      actions_from_reward_layer: (boolean variable) whether to get actions from
        the reward layer or from LinUCB.
      cov_matrix: list of the covariance matrices. There exists one covariance
        matrix per arm, unless the policy accepts per-arm features, in which
        case this list must have a single element.
      data_vector: list of the data vectors. A data vector is a weighted sum
        of the observations, where the weight is the corresponding reward. Each
        arm has its own data vector, unless the policy accepts per-arm features,
        in which case this list must have a single element.
      num_samples: list of number of samples per arm. If the policy accepts per-
        arm features, this is a single-element list counting the number of
        steps.
      time_step_spec: A `TimeStep` spec of the expected time_steps.
      alpha: (float) non-negative weight multiplying the confidence intervals.
      emit_policy_info: (tuple of strings) what side information we want to get
        as part of the policy info. Allowed values can be found in
        `policy_utilities.PolicyInfo`.
      emit_log_probability: (bool) whether to emit log probabilities.
      accepts_per_arm_features: (bool) Whether the policy accepts per-arm
        features.
      distributed_use_reward_layer: (bool) Whether to pick the actions using
        the network or use LinUCB. This applies only in distributed training
        setting and has a similar role to the `actions_from_reward_layer`
        mentioned above.
      observation_and_action_constraint_splitter: A function used for masking
        valid/invalid actions with each state of the environment. The function
        takes in a full observation and returns a tuple consisting of 1) the
        part of the observation intended as input to the bandit policy and 2)
        the mask. The mask should be a 0-1 `Tensor` of shape
        `[batch_size, num_actions]`. This function should also work with a
        `TensorSpec` as input, and should output `TensorSpec` objects for the
        observation and mask.
      name: The name of this policy.
    """
    encoding_network.create_variables()
    self._encoding_network = encoding_network
    self._reward_layer = reward_layer
    self._encoding_dim = encoding_dim

    if accepts_per_arm_features and reward_layer.units != 1:
      raise ValueError('The output dimension of the reward layer must be 1, got'
                       ' {}'.format(reward_layer.units))

    if not isinstance(cov_matrix, (list, tuple)):
      raise ValueError('cov_matrix must be a list of matrices (Tensors).')
    self._cov_matrix = cov_matrix

    if not isinstance(data_vector, (list, tuple)):
      raise ValueError('data_vector must be a list of vectors (Tensors).')
    self._data_vector = data_vector

    if not isinstance(num_samples, (list, tuple)):
      raise ValueError('num_samples must be a list of vectors (Tensors).')
    self._num_samples = num_samples

    self._alpha = alpha
    self._actions_from_reward_layer = actions_from_reward_layer
    self._epsilon_greedy = epsilon_greedy
    self._dtype = self._data_vector[0].dtype
    self._distributed_use_reward_layer = distributed_use_reward_layer

    if len(cov_matrix) != len(data_vector):
      raise ValueError('The size of list cov_matrix must match the size of '
                       'list data_vector. Got {} for cov_matrix and {} '
                       'for data_vector'.format(
                           len(self._cov_matrix), len(data_vector)))
    if len(num_samples) != len(cov_matrix):
      raise ValueError('The size of num_samples must match the size of '
                       'list cov_matrix. Got {} for num_samples and {} '
                       'for cov_matrix'.format(
                           len(self._num_samples), len(cov_matrix)))

    self._accepts_per_arm_features = accepts_per_arm_features
    if observation_and_action_constraint_splitter is not None:
      context_spec, _ = observation_and_action_constraint_splitter(
          time_step_spec.observation)
    else:
      context_spec = time_step_spec.observation
    if accepts_per_arm_features:
      self._num_actions = tf.nest.flatten(context_spec[
          bandit_spec_utils.PER_ARM_FEATURE_KEY])[0].shape.as_list()[0]
      self._num_models = 1
    else:
      self._num_actions = len(cov_matrix)
      self._num_models = self._num_actions
    cov_matrix_dim = tf.compat.dimension_value(cov_matrix[0].shape[0])
    if self._encoding_dim != cov_matrix_dim:
      raise ValueError('The dimension of matrix `cov_matrix` must match '
                       'encoding dimension {}. '
                       'Got {} for `cov_matrix`.'.format(
                           self._encoding_dim, cov_matrix_dim))
    data_vector_dim = tf.compat.dimension_value(data_vector[0].shape[0])
    if self._encoding_dim != data_vector_dim:
      raise ValueError('The dimension of vector `data_vector` must match '
                       'encoding dimension {}. '
                       'Got {} for `data_vector`.'.format(
                           self._encoding_dim, data_vector_dim))
    action_spec = tensor_spec.BoundedTensorSpec(
        shape=(),
        dtype=tf.int32,
        minimum=0,
        maximum=self._num_actions - 1,
        name='action')

    self._emit_policy_info = emit_policy_info
    predicted_rewards_mean = ()
    if policy_utilities.InfoFields.PREDICTED_REWARDS_MEAN in emit_policy_info:
      predicted_rewards_mean = tensor_spec.TensorSpec(
          [self._num_actions],
          dtype=tf.float32)
    predicted_rewards_optimistic = ()
    if (policy_utilities.InfoFields.PREDICTED_REWARDS_OPTIMISTIC in
        emit_policy_info):
      predicted_rewards_optimistic = tensor_spec.TensorSpec(
          [self._num_actions],
          dtype=tf.float32)
    if accepts_per_arm_features:
      chosen_arm_features_info_spec = (
          policy_utilities.create_chosen_arm_features_info_spec(
              time_step_spec.observation,
              observation_and_action_constraint_splitter))
      info_spec = policy_utilities.PerArmPolicyInfo(
          predicted_rewards_mean=predicted_rewards_mean,
          predicted_rewards_optimistic=predicted_rewards_optimistic,
          chosen_arm_features=chosen_arm_features_info_spec)
    else:
      info_spec = policy_utilities.PolicyInfo(
          predicted_rewards_mean=predicted_rewards_mean,
          predicted_rewards_optimistic=predicted_rewards_optimistic)

    super(NeuralLinUCBPolicy, self).__init__(
        time_step_spec=time_step_spec,
        action_spec=action_spec,
        emit_log_probability=emit_log_probability,
        observation_and_action_constraint_splitter=(
            observation_and_action_constraint_splitter),
        info_spec=info_spec,
        name=name)
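
The consistency checks in this constructor pin down the shapes of the per-arm inputs. Below is a minimal sketch of inputs that would pass those checks (sizes illustrative; in the per-arm-features case each list would instead hold a single element):

import tensorflow as tf

encoding_dim = 10
num_actions = 5

# One covariance matrix, data vector, and sample counter per arm. All three
# lists must have the same length, and the matrix/vector dimensions must equal
# `encoding_dim`; the dtype of `data_vector[0]` becomes the policy dtype.
cov_matrix = [tf.eye(encoding_dim, dtype=tf.float64) for _ in range(num_actions)]
data_vector = [tf.zeros([encoding_dim], dtype=tf.float64) for _ in range(num_actions)]
num_samples = [tf.zeros([], dtype=tf.float64) for _ in range(num_actions)]
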
Example #25
def create_feed_forward_common_tower_network(
        observation_spec,
        global_layers,
        arm_layers,
        common_layers,
        output_dim=1,
        global_preprocessing_combiner=None,
        arm_preprocessing_combiner=None):
    """Creates a common tower network with feedforward towers.

  The network produced by this function can be used either in
  `GreedyRewardPredictionPolicy`, or `NeuralLinUCBPolicy`.
  In the former case, the network must have `output_dim=1`; it will be an
  instance of `QNetwork` and will be used by the policy as a reward prediction
  network.
  In the latter case, the network will be an encoding network with its output
  consumed by a reward layer or a LinUCB method. The specified `output_dim` will
  be the encoding dimension.

  Args:
    observation_spec: A nested tensor spec containing the specs for global as
      well as per-arm observations.
    global_layers: Iterable of ints. Specifies the layers of the global tower.
    arm_layers: Iterable of ints. Specifies the layers of the arm tower.
    common_layers: Iterable of ints. Specifies the layers of the common tower.
    output_dim: The output dimension of the network. If 1, the common tower will
      be a QNetwork. Otherwise, the common tower will be an encoding network
      with the specified output dimension.
    global_preprocessing_combiner: Preprocessing combiner for global features.
    arm_preprocessing_combiner: Preprocessing combiner for the arm features.

  Returns:
    A network that takes observations adhering to `observation_spec` and outputs
    reward estimates for every action.
  """
    global_network = encoding_network.EncodingNetwork(
        input_tensor_spec=observation_spec[
            bandit_spec_utils.GLOBAL_FEATURE_KEY],
        fc_layer_params=global_layers,
        preprocessing_combiner=global_preprocessing_combiner)

    arm_feature_spec = tensor_spec.remove_outer_dims_nest(
        observation_spec[bandit_spec_utils.PER_ARM_FEATURE_KEY], 1)
    arm_network = encoding_network.EncodingNetwork(
        input_tensor_spec=arm_feature_spec,
        fc_layer_params=arm_layers,
        preprocessing_combiner=arm_preprocessing_combiner)
    common_input_dim = global_layers[-1] + arm_layers[-1]
    common_input_spec = tensor_spec.TensorSpec(shape=(common_input_dim, ),
                                               dtype=tf.float32)
    if output_dim == 1:
        common_network = q_network.QNetwork(
            input_tensor_spec=common_input_spec,
            action_spec=tensor_spec.BoundedTensorSpec(shape=(),
                                                      minimum=0,
                                                      maximum=0,
                                                      dtype=tf.int32),
            fc_layer_params=common_layers)
    else:
        common_network = encoding_network.EncodingNetwork(
            input_tensor_spec=common_input_spec,
            fc_layer_params=list(common_layers) + [output_dim])
    return GlobalAndArmCommonTowerNetwork(observation_spec, global_network,
                                          arm_network, common_network)
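
A usage sketch for this factory. The observation-spec layout (a dict keyed by the global and per-arm feature keys referenced in the body) follows the code above; the import paths, feature dimensions, and layer sizes are assumptions:

import tensorflow as tf
from tf_agents.bandits.specs import utils as bandit_spec_utils  # assumed path
from tf_agents.specs import tensor_spec

num_actions = 4
observation_spec = {
    bandit_spec_utils.GLOBAL_FEATURE_KEY:
        tensor_spec.TensorSpec([7], tf.float32),
    bandit_spec_utils.PER_ARM_FEATURE_KEY:
        tensor_spec.TensorSpec([num_actions, 5], tf.float32),
}
# output_dim=1, so the common tower is a QNetwork used for reward prediction.
net = create_feed_forward_common_tower_network(
    observation_spec,
    global_layers=(16,),
    arm_layers=(16,),
    common_layers=(8,),
    output_dim=1)
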
Example #26
 def observation_spec(self):
     return tensor_spec.TensorSpec(shape=[3],
                                   dtype=tf.float32,
                                   name='observation_spec')
Example #27
 def testGetOuterShapeOneDim(self):
     tensor = tf.zeros([5, 2, 3], dtype=tf.float32)
     spec = tensor_spec.TensorSpec([2, 3], dtype=tf.float32)
     batch_size = nest_utils.get_outer_shape(tensor, spec)
     self.assertEqual(self.evaluate(batch_size), [5])
Example #28
    def __init__(self,
                 time_step_spec,
                 action_spec,
                 optimizer=None,
                 actor_net=None,
                 value_net=None,
                 importance_ratio_clipping=0.0,
                 lambda_value=0.95,
                 discount_factor=0.99,
                 entropy_regularization=0.0,
                 policy_l2_reg=0.0,
                 value_function_l2_reg=0.0,
                 value_pred_loss_coef=0.5,
                 num_epochs=25,
                 use_gae=False,
                 use_td_lambda_return=False,
                 normalize_rewards=True,
                 reward_norm_clipping=10.0,
                 normalize_observations=True,
                 log_prob_clipping=0.0,
                 kl_cutoff_factor=2.0,
                 kl_cutoff_coef=1000.0,
                 initial_adaptive_kl_beta=1.0,
                 adaptive_kl_target=0.01,
                 adaptive_kl_tolerance=0.3,
                 gradient_clipping=None,
                 check_numerics=False,
                 debug_summaries=False,
                 summarize_grads_and_vars=False,
                 train_step_counter=None,
                 name=None):
        """Creates a PPO Agent.

    Args:
      time_step_spec: A `TimeStep` spec of the expected time_steps.
      action_spec: A nest of BoundedTensorSpec representing the actions.
      optimizer: Optimizer to use for the agent.
      actor_net: A function actor_net(observations, action_spec) that returns
        tensor of action distribution params for each observation. Takes nested
        observation and returns nested action.
      value_net: A function value_net(time_steps) that returns value tensor from
        neural net predictions for each observation. Takes nested observation
        and returns batch of value_preds.
      importance_ratio_clipping: Epsilon in clipped, surrogate PPO objective.
        For more detail, see explanation at the top of the doc.
      lambda_value: Lambda parameter for TD-lambda computation.
      discount_factor: Discount factor for return computation.
      entropy_regularization: Coefficient for entropy regularization loss term.
      policy_l2_reg: Coefficient for l2 regularization of policy weights.
      value_function_l2_reg: Coefficient for l2 regularization of value function
        weights.
      value_pred_loss_coef: Multiplier for value prediction loss to balance with
        policy gradient loss.
      num_epochs: Number of epochs for computing policy updates.
      use_gae: If True (default False), uses generalized advantage estimation
        for computing per-timestep advantage. Else, just subtracts value
        predictions from empirical return.
      use_td_lambda_return: If True (default False), uses td_lambda_return for
        training value function. (td_lambda_return = gae_advantage +
        value_predictions)
      normalize_rewards: If true, keeps moving variance of rewards and
        normalizes incoming rewards.
      reward_norm_clipping: Value above and below to clip normalized reward.
      normalize_observations: If true, keeps moving mean and variance of
        observations and normalizes incoming observations.
      log_prob_clipping: +/- value for clipping log probs to prevent inf / NaN
        values.  Default: no clipping.
      kl_cutoff_factor: If policy KL changes more than this much for any single
        timestep, adds a squared KL penalty to loss function.
      kl_cutoff_coef: Loss coefficient for kl cutoff term.
      initial_adaptive_kl_beta: Initial value for beta coefficient of adaptive
        kl penalty.
      adaptive_kl_target: Desired kl target for policy updates. If actual kl is
        far from this target, adaptive_kl_beta will be updated.
      adaptive_kl_tolerance: A tolerance for adaptive_kl_beta. Mean KL above (1
        + tol) * adaptive_kl_target, or below (1 - tol) * adaptive_kl_target,
        will cause adaptive_kl_beta to be updated.
      gradient_clipping: Norm length to clip gradients.  Default: no clipping.
      check_numerics: If true, adds tf.debugging.check_numerics to help find NaN
        / Inf values. For debugging only.
      debug_summaries: A bool to gather debug summaries.
      summarize_grads_and_vars: If true, gradient summaries will be written.
      train_step_counter: An optional counter to increment every time the train
        op is run.  Defaults to the global_step.
      name: The name of this agent. All variables in this module will fall under
        that name. Defaults to the class name.

    Raises:
      ValueError: If the actor_net is not a DistributionNetwork.
    """
        if not isinstance(actor_net, network.DistributionNetwork):
            raise ValueError(
                'actor_net must be an instance of a DistributionNetwork.')

        tf.Module.__init__(self, name=name)

        self._optimizer = optimizer
        self._actor_net = actor_net
        self._value_net = value_net
        self._importance_ratio_clipping = importance_ratio_clipping
        self._lambda = lambda_value
        self._discount_factor = discount_factor
        self._entropy_regularization = entropy_regularization
        self._policy_l2_reg = policy_l2_reg
        self._value_function_l2_reg = value_function_l2_reg
        self._value_pred_loss_coef = value_pred_loss_coef
        self._num_epochs = num_epochs
        self._use_gae = use_gae
        self._use_td_lambda_return = use_td_lambda_return
        self._reward_norm_clipping = reward_norm_clipping
        self._log_prob_clipping = log_prob_clipping
        self._kl_cutoff_factor = kl_cutoff_factor
        self._kl_cutoff_coef = kl_cutoff_coef
        self._adaptive_kl_target = adaptive_kl_target
        self._adaptive_kl_tolerance = adaptive_kl_tolerance
        self._gradient_clipping = gradient_clipping or 0.0
        self._check_numerics = check_numerics

        if initial_adaptive_kl_beta > 0.0:
            # TODO(kbanoop): Rename create_variable.
            self._adaptive_kl_beta = common.create_variable(
                'adaptive_kl_beta', initial_adaptive_kl_beta, dtype=tf.float32)
        else:
            self._adaptive_kl_beta = None

        self._reward_normalizer = None
        if normalize_rewards:
            self._reward_normalizer = tensor_normalizer.StreamingTensorNormalizer(
                tensor_spec.TensorSpec([], tf.float32),
                scope='normalize_reward')

        self._observation_normalizer = None
        if normalize_observations:
            self._observation_normalizer = (
                tensor_normalizer.StreamingTensorNormalizer(
                    time_step_spec.observation,
                    scope='normalize_observations'))

        policy = greedy_policy.GreedyPolicy(
            ppo_policy.PPOPolicy(
                time_step_spec=time_step_spec,
                action_spec=action_spec,
                actor_network=actor_net,
                value_network=value_net,
                observation_normalizer=self._observation_normalizer,
                clip=False,
                collect=False))

        collect_policy = ppo_policy.PPOPolicy(
            time_step_spec=time_step_spec,
            action_spec=action_spec,
            actor_network=actor_net,
            value_network=value_net,
            observation_normalizer=self._observation_normalizer,
            clip=False,
            collect=True)

        self._action_distribution_spec = (self._actor_net.output_spec)

        super(PPOAgent,
              self).__init__(time_step_spec,
                             action_spec,
                             policy,
                             collect_policy,
                             train_sequence_length=None,
                             debug_summaries=debug_summaries,
                             summarize_grads_and_vars=summarize_grads_and_vars,
                             train_step_counter=train_step_counter)
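
A minimal construction sketch for this agent. The spec shapes and layer sizes are arbitrary, and the actor/value network classes are standard TF-Agents modules assumed here for illustration (any `DistributionNetwork`-based actor would do):

import tensorflow as tf
from tf_agents.networks import actor_distribution_network, value_network
from tf_agents.specs import tensor_spec
from tf_agents.trajectories import time_step as ts

obs_spec = tensor_spec.TensorSpec([4], tf.float32)
time_step_spec = ts.time_step_spec(obs_spec)
action_spec = tensor_spec.BoundedTensorSpec([1], tf.float32, -1, 1)

# ActorDistributionNetwork subclasses DistributionNetwork, as required above.
actor_net = actor_distribution_network.ActorDistributionNetwork(
    obs_spec, action_spec, fc_layer_params=(32,))
value_net = value_network.ValueNetwork(obs_spec, fc_layer_params=(32,))

agent = PPOAgent(                      # the class whose __init__ is shown above
    time_step_spec,
    action_spec,
    optimizer=tf.compat.v1.train.AdamOptimizer(learning_rate=1e-3),
    actor_net=actor_net,
    value_net=value_net,
    num_epochs=2)
agent.initialize()
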
Example #29
 def testGetOuterShapeDynamicShapeBatched(self):
     spec = tensor_spec.TensorSpec([1], dtype=tf.float32)
     tensor = tf.convert_to_tensor(value=[[0.0]] * 8)
     batch_size = self.evaluate(nest_utils.get_outer_shape(tensor, spec))
     self.assertAllEqual(batch_size, [8])
Example #30
 def setUp(self):
   super(DqnAgentTest, self).setUp()
   self._observation_spec = tensor_spec.TensorSpec([2], tf.float32)
   self._time_step_spec = ts.time_step_spec(self._observation_spec)
   self._action_spec = tensor_spec.BoundedTensorSpec((), tf.int32, 0, 1)