def testSimple(self):
    converter = data_converter.AsTrajectory(self._data_context)
    traj = tensor_spec.sample_spec_nest(self._data_context.trajectory_spec,
                                        outer_dims=[2, 3])
    converted = converter(traj)
    (traj, converted) = self.evaluate((traj, converted))
    tf.nest.map_structure(self.assertAllEqual, converted, traj)

def testFromBatchTimeTransition(self):
    converter = data_converter.AsTrajectory(self._data_context)
    traj = tensor_spec.sample_spec_nest(self._data_context.trajectory_spec,
                                        outer_dims=[2, 3])
    transition = trajectory.to_transition(traj, traj)
    converted = converter(transition)
    (traj, converted) = self.evaluate((traj, converted))
    tf.nest.map_structure(self.assertAllEqual, converted, traj)

def testNoTimeDimensionRaises(self):
    converter = data_converter.AsTrajectory(self._data_context)
    traj = tensor_spec.sample_spec_nest(self._data_context.trajectory_spec,
                                        outer_dims=[3])
    with self.assertRaisesRegex(
            ValueError,
            r'must have two outer dimensions: batch size and time'):
        converter(traj)

def testNoTimeDimensionRaisesShapeMessage(self):
    # Same missing-time-dimension failure as above, but checks the
    # shape-format variant of the error message.
    converter = data_converter.AsTrajectory(self._data_context)
    traj = tensor_spec.sample_spec_nest(self._data_context.trajectory_spec,
                                        outer_dims=[3])
    with self.assertRaisesRegex(
            ValueError,
            r'tensors must have shape \`\[B, T\] \+ spec.shape\`'):
        converter(traj)

def testInvalidTimeDimensionRaises(self):
    converter = data_converter.AsTrajectory(self._data_context,
                                            sequence_length=4)
    traj = tensor_spec.sample_spec_nest(self._data_context.trajectory_spec,
                                        outer_dims=[2, 3])
    with self.assertRaisesRegex(
            ValueError,
            r'has a time axis dim value \'3\' vs the expected \'4\''):
        converter(traj)

def testPrunes(self):
    # Entries missing from the data context's spec (here `action2`) are
    # pruned from the converted trajectory.
    converter = data_converter.AsTrajectory(self._data_context)
    my_spec = self._data_context.trajectory_spec.replace(
        action={
            'action1': tf.TensorSpec((), tf.float32),
            'action2': tf.TensorSpec([4], tf.int32)
        })
    traj = tensor_spec.sample_spec_nest(my_spec, outer_dims=[2, 3])
    converted = converter(traj)
    expected = tf.nest.map_structure(lambda x: x, traj)
    del expected.action['action2']
    (expected, converted) = self.evaluate((expected, converted))
    tf.nest.map_structure(self.assertAllEqual, converted, expected)
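
The tests above rely on a `self._data_context` created in the test's `setUp`. Below is a minimal, hedged sketch of how such a context and converter can be built outside a test; it assumes `data_converter.DataContext` accepts `time_step_spec`, `action_spec`, and `info_spec` (check the signature in your TF-Agents version), and the spec shapes are illustrative.

import tensorflow as tf

from tf_agents.agents import data_converter
from tf_agents.specs import tensor_spec
from tf_agents.trajectories import time_step as ts

# Toy specs: a scalar float observation and a scalar discrete action.
observation_spec = {'obs': tf.TensorSpec([], tf.float32)}
time_step_spec = ts.time_step_spec(observation_spec)
action_spec = tensor_spec.BoundedTensorSpec([], tf.int32, minimum=0, maximum=3)

# Assumed DataContext signature; some versions take additional arguments.
data_context = data_converter.DataContext(
    time_step_spec=time_step_spec, action_spec=action_spec, info_spec=())
converter = data_converter.AsTrajectory(data_context)

# Sample a [batch=2, time=3] trajectory matching the spec and convert it.
traj = tensor_spec.sample_spec_nest(data_context.trajectory_spec,
                                    outer_dims=[2, 3])
converted = converter(traj)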
Example #7
 def __init__(self,
              time_step_spec=None,
              action_spec=None,
              training_data_spec=None,
              train_sequence_length=None):
     if time_step_spec is None:
         obs_spec = {'obs': tf.TensorSpec([], tf.float32)}
         time_step_spec = ts.time_step_spec(obs_spec)
     action_spec = action_spec or ()
     policy = random_tf_policy.RandomTFPolicy(time_step_spec, action_spec)
     super(MyAgent,
           self).__init__(time_step_spec=time_step_spec,
                          action_spec=action_spec,
                          policy=policy,
                          collect_policy=policy,
                          train_sequence_length=train_sequence_length,
                          training_data_spec=training_data_spec)
     self._as_trajectory = data_converter.AsTrajectory(
         self.data_context, sequence_length=train_sequence_length)
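
The constructor above is only half of the picture: `MyAgent` still needs a `_train` implementation before `train()` can be called. The sketch below shows the usual pattern of running incoming experience through the stored converter first; the subclass name, the zero placeholder loss, and the `tf_agent` import are illustrative assumptions rather than part of the original example.

import tensorflow as tf

from tf_agents.agents import tf_agent


class MyTrainableAgent(MyAgent):

    def _train(self, experience, weights=None):
        # Validate and prune the incoming experience nest so that it matches
        # self.data_context (and the configured sequence_length, if any).
        experience = self._as_trajectory(experience)
        # A real agent would compute and apply a loss here; this placeholder
        # just reports a zero loss.
        return tf_agent.LossInfo(loss=tf.constant(0.0), extra=())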
Example #8
    def __init__(self,
                 mixture_distribution: types.Distribution,
                 agents: Sequence[tf_agent.TFAgent],
                 name: Optional[Text] = None):
        """Initializes an instance of `MixtureAgent`.

    Args:
      mixture_distribution: An instance of `tfd.Categorical` distribution. This
        distribution is used to draw sub-policies by the mixture policy. The
        parameters of the distribution are trained by the mixture agent.
      agents: List of instances of TF-Agents bandit agents. These agents will be
        trained and used to select actions. The length of this list should match
        the number of categories of `mixture_distribution`.
      name: The name of this instance of `MixtureAgent`.
    """
        tf.Module.__init__(self, name=name)
        time_step_spec = agents[0].time_step_spec
        action_spec = agents[0].action_spec
        self._original_info_spec = agents[0].policy.info_spec
        error_message = None
        for agent in agents[1:]:
            if action_spec != agent.action_spec:
                error_message = 'Inconsistent action specs.'
            if time_step_spec != agent.time_step_spec:
                error_message = 'Inconsistent time step specs.'
            if self._original_info_spec != agent.policy.info_spec:
                error_message = 'Inconsistent info specs.'
        if error_message is not None:
            raise ValueError(error_message)
        self._agents = agents
        self._num_agents = len(agents)
        self._mixture_distribution = mixture_distribution
        policies = [agent.collect_policy for agent in agents]
        policy = mixture_policy.MixturePolicy(mixture_distribution, policies)
        super(MixtureAgent, self).__init__(time_step_spec,
                                           action_spec,
                                           policy,
                                           policy,
                                           train_sequence_length=None)
        self._as_trajectory = data_converter.AsTrajectory(self.data_context,
                                                          sequence_length=None)
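
For reference, the trainable `mixture_distribution` described in the docstring is typically a categorical over the sub-agents whose logits are stored in a `tf.Variable` so the mixture agent can update them; a minimal sketch (the variable name and the number of agents are illustrative):

import tensorflow as tf
import tensorflow_probability as tfp

tfd = tfp.distributions

num_agents = 3  # should match len(agents) handed to the mixture agent
mixture_logits = tf.Variable(tf.zeros([num_agents]), name='mixture_logits')
mixture_distribution = tfd.Categorical(logits=mixture_logits)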
Example #9
    def __init__(self,
                 time_step_spec: types.TimeStep,
                 action_spec: types.BoundedTensorSpec,
                 learning_rate: float,
                 name: Optional[Text] = None):
        """Initialize an instance of `Exp3Agent`.

    Args:
      time_step_spec: A `TimeStep` spec describing the expected `TimeStep`s.
      action_spec: A scalar `BoundedTensorSpec` with `int32` or `int64` dtype
        describing the number of actions for this agent.
      learning_rate: A float valued scalar. A higher value will force the agent
        to converge on a single action more quickly. A lower value will
        encourage more exploration. This value corresponds to the
        `inverse_temperature` argument passed to `CategoricalPolicy`.
      name: a name for this instance of `Exp3Agent`.
    """
        tf.Module.__init__(self, name=name)
        common.tf_agents_gauge.get_cell('TFABandit').set(True)
        self._num_actions = policy_utilities.get_num_actions_from_tensor_spec(
            action_spec)
        self._weights = tf.compat.v2.Variable(tf.zeros(self._num_actions),
                                              name='weights')
        self._learning_rate = tf.compat.v2.Variable(learning_rate,
                                                    name='learning_rate')
        policy = categorical_policy.CategoricalPolicy(
            weights=self._weights,
            time_step_spec=time_step_spec,
            action_spec=action_spec,
            inverse_temperature=self._learning_rate)
        # TODO(b/127462472): consider policy=GreedyPolicy(collect_policy).
        super(Exp3Agent, self).__init__(time_step_spec=time_step_spec,
                                        action_spec=policy.action_spec,
                                        policy=policy,
                                        collect_policy=policy,
                                        train_sequence_length=None,
                                        validate_args=False)
        self._as_trajectory = data_converter.AsTrajectory(self.data_context,
                                                          sequence_length=None)
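
A hedged construction sketch for `Exp3Agent` above; the module path and spec values are assumptions, and the key requirement from the docstring is a scalar bounded integer action spec with minimum 0:

import tensorflow as tf

# Module path assumed; adjust to where Exp3Agent lives in your TF-Agents version.
from tf_agents.bandits.agents import exp3_agent
from tf_agents.specs import tensor_spec
from tf_agents.trajectories import time_step as ts

time_step_spec = ts.time_step_spec(tf.TensorSpec([4], tf.float32))
# Scalar int32 spec with minimum 0, describing 5 arms.
action_spec = tensor_spec.BoundedTensorSpec([], tf.int32, minimum=0, maximum=4)

agent = exp3_agent.Exp3Agent(time_step_spec=time_step_spec,
                             action_spec=action_spec,
                             learning_rate=1.0)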
    def __init__(self,
                 time_step_spec: types.TimeStep,
                 action_spec: types.BoundedTensorSpec,
                 variable_collection: Optional[
                     BernoulliBanditVariableCollection] = None,
                 dtype: tf.DType = tf.float32,
                 batch_size: Optional[int] = 1,
                 observation_and_action_constraint_splitter: Optional[
                     types.Splitter] = None,
                 emit_policy_info: Sequence[Text] = (),
                 name: Optional[Text] = None):
        """Creates a Bernoulli Thompson Sampling Agent.

    Args:
      time_step_spec: A `TimeStep` spec of the expected time_steps.
      action_spec: A nest of `BoundedTensorSpec` representing the actions.
      variable_collection: Instance of `BernoulliBanditVariableCollection`.
        Collection of variables to be updated by the agent. If `None`, a new
        instance of `BernoulliBanditVariableCollection` will be created.
      dtype: The type of the variables. Should be one of `tf.float32` or
        `tf.float64`.
      batch_size: optional int with the batch size. It defaults to 1.
      observation_and_action_constraint_splitter: A function used for masking
        valid/invalid actions with each state of the environment. The function
        takes in a full observation and returns a tuple consisting of 1) the
        part of the observation intended as input to the bandit agent and
        policy, and 2) the boolean mask. This function should also work with a
        `TensorSpec` as input, and should output `TensorSpec` objects for the
        observation and mask.
      emit_policy_info: (tuple of strings) what side information we want to get
        as part of the policy info. Allowed values can be found in
        `policy_utilities.PolicyInfo`.
      name: Python str name of this agent. All variables in this module will
        fall under that name. Defaults to the class name.

    Raises:
      ValueError: If the action spec contains more than one action or it is
        not a bounded scalar int32 spec with minimum 0.
      TypeError: if variable_collection is not an instance of
        `BernoulliBanditVariableCollection`.
    """
        tf.Module.__init__(self, name=name)
        common.tf_agents_gauge.get_cell('TFABandit').set(True)
        self._observation_and_action_constraint_splitter = (
            observation_and_action_constraint_splitter)
        self._num_actions = policy_utilities.get_num_actions_from_tensor_spec(
            action_spec)

        self._dtype = dtype
        if variable_collection is None:
            variable_collection = BernoulliBanditVariableCollection(
                num_actions=self._num_actions, dtype=dtype)
        elif not isinstance(variable_collection,
                            BernoulliBanditVariableCollection):
            raise TypeError('Parameter `variable_collection` should be '
                            'of type `BernoulliBanditVariableCollection`.')
        self._variable_collection = variable_collection
        self._alpha = variable_collection.alpha
        self._beta = variable_collection.beta
        self._batch_size = batch_size
        policy = bernoulli_policy.BernoulliThompsonSamplingPolicy(
            time_step_spec,
            action_spec,
            self._alpha,
            self._beta,
            observation_and_action_constraint_splitter,
            emit_policy_info=emit_policy_info)

        super(BernoulliThompsonSamplingAgent,
              self).__init__(time_step_spec,
                             action_spec,
                             policy,
                             collect_policy=policy,
                             train_sequence_length=None)
        self._as_trajectory = data_converter.AsTrajectory(self.data_context,
                                                          sequence_length=None)
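
A hedged construction sketch for the Bernoulli Thompson sampling agent above; the module path is an assumption, and since the agent is non-contextual the observation spec is only a placeholder:

import tensorflow as tf

# Module path assumed; check tf_agents/bandits/agents/ in your version.
from tf_agents.bandits.agents import bernoulli_thompson_sampling_agent
from tf_agents.specs import tensor_spec
from tf_agents.trajectories import time_step as ts

time_step_spec = ts.time_step_spec(tf.TensorSpec([1], tf.float32))
action_spec = tensor_spec.BoundedTensorSpec([], tf.int32, minimum=0, maximum=2)

agent = bernoulli_thompson_sampling_agent.BernoulliThompsonSamplingAgent(
    time_step_spec=time_step_spec,
    action_spec=action_spec,
    batch_size=2)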
Example #11
    def __init__(self,
                 exploration_policy,
                 time_step_spec: types.TimeStep,
                 action_spec: types.BoundedTensorSpec,
                 variable_collection: Optional[
                     LinearBanditVariableCollection] = None,
                 alpha: float = 1.0,
                 gamma: float = 1.0,
                 use_eigendecomp: bool = False,
                 tikhonov_weight: float = 1.0,
                 add_bias: bool = False,
                 emit_policy_info: Sequence[Text] = (),
                 emit_log_probability: bool = False,
                 observation_and_action_constraint_splitter: Optional[
                     types.Splitter] = None,
                 accepts_per_arm_features: bool = False,
                 debug_summaries: bool = False,
                 summarize_grads_and_vars: bool = False,
                 enable_summaries: bool = True,
                 dtype: tf.DType = tf.float32,
                 name: Optional[Text] = None):
        """Initialize an instance of `LinearBanditAgent`.

    Args:
      exploration_policy: An Enum of type `ExplorationPolicy`. The kind of
        policy we use for exploration. Currently supported policies are
        `LinUCBPolicy` and `LinearThompsonSamplingPolicy`.
      time_step_spec: A `TimeStep` spec describing the expected `TimeStep`s.
      action_spec: A scalar `BoundedTensorSpec` with `int32` or `int64` dtype
        describing the number of actions for this agent.
      variable_collection: Instance of `LinearBanditVariableCollection`.
        Collection of variables to be updated by the agent. If `None`, a new
        instance of `LinearBanditVariableCollection` will be created.
      alpha: (float) positive scalar. This is the exploration parameter that
        multiplies the confidence intervals.
      gamma: a float forgetting factor in [0.0, 1.0]. When set to 1.0, the
        algorithm does not forget.
      use_eigendecomp: whether to use eigen-decomposition or not. The default
        solver is Conjugate Gradient.
      tikhonov_weight: (float) tikhonov regularization term.
      add_bias: If true, a bias term will be added to the linear reward
        estimation.
      emit_policy_info: (tuple of strings) what side information we want to get
        as part of the policy info. Allowed values can be found in
        `policy_utilities.PolicyInfo`.
      emit_log_probability: Whether the policy emits log-probabilities or not.
        Since the policy is deterministic, the probability is just 1.
      observation_and_action_constraint_splitter: A function used for masking
        valid/invalid actions with each state of the environment. The function
        takes in a full observation and returns a tuple consisting of 1) the
        part of the observation intended as input to the bandit agent and
        policy, and 2) the boolean mask. This function should also work with a
        `TensorSpec` as input, and should output `TensorSpec` objects for the
        observation and mask.
      accepts_per_arm_features: (bool) Whether the agent accepts per-arm
        features.
      debug_summaries: A Python bool, default False. When True, debug summaries
        are gathered.
      summarize_grads_and_vars: A Python bool, default False. When True,
        gradients and network variable summaries are written during training.
      enable_summaries: A Python bool, default True. When False, all summaries
        (debug or otherwise) should not be written.
      dtype: The type of the parameters stored and updated by the agent. Should
        be one of `tf.float32` and `tf.float64`. Defaults to `tf.float32`.
      name: a name for this instance of `LinearBanditAgent`.

    Raises:
      ValueError if dtype is not one of `tf.float32` or `tf.float64`.
      TypeError if variable_collection is not an instance of
        `LinearBanditVariableCollection`.
    """
        tf.Module.__init__(self, name=name)
        common.tf_agents_gauge.get_cell('TFABandit').set(True)
        self._num_actions = policy_utilities.get_num_actions_from_tensor_spec(
            action_spec)
        self._num_models = 1 if accepts_per_arm_features else self._num_actions
        self._observation_and_action_constraint_splitter = (
            observation_and_action_constraint_splitter)
        self._time_step_spec = time_step_spec
        self._accepts_per_arm_features = accepts_per_arm_features
        self._add_bias = add_bias
        if observation_and_action_constraint_splitter is not None:
            context_spec, _ = observation_and_action_constraint_splitter(
                time_step_spec.observation)
        else:
            context_spec = time_step_spec.observation

        (self._global_context_dim,
         self._arm_context_dim) = bandit_spec_utils.get_context_dims_from_spec(
             context_spec, accepts_per_arm_features)
        if self._add_bias:
            # The bias is added via a constant 1 feature.
            self._global_context_dim += 1
        self._overall_context_dim = self._global_context_dim + self._arm_context_dim

        self._alpha = alpha
        if variable_collection is None:
            variable_collection = LinearBanditVariableCollection(
                context_dim=self._overall_context_dim,
                num_models=self._num_models,
                use_eigendecomp=use_eigendecomp,
                dtype=dtype)
        elif not isinstance(variable_collection,
                            LinearBanditVariableCollection):
            raise TypeError('Parameter `variable_collection` should be '
                            'of type `LinearBanditVariableCollection`.')
        self._variable_collection = variable_collection
        self._cov_matrix_list = variable_collection.cov_matrix_list
        self._data_vector_list = variable_collection.data_vector_list
        self._eig_matrix_list = variable_collection.eig_matrix_list
        self._eig_vals_list = variable_collection.eig_vals_list
        # We keep track of the number of samples per arm.
        self._num_samples_list = variable_collection.num_samples_list
        self._gamma = gamma
        if self._gamma < 0.0 or self._gamma > 1.0:
            raise ValueError(
                'Forgetting factor `gamma` must be in [0.0, 1.0].')
        self._dtype = dtype
        if dtype not in (tf.float32, tf.float64):
            raise ValueError(
                'Agent dtype should be either `tf.float32` or `tf.float64`.')
        self._use_eigendecomp = use_eigendecomp
        self._tikhonov_weight = tikhonov_weight

        if exploration_policy == ExplorationPolicy.linear_ucb_policy:
            exploration_strategy = lin_policy.ExplorationStrategy.optimistic
        elif exploration_policy == (
                ExplorationPolicy.linear_thompson_sampling_policy):
            exploration_strategy = lin_policy.ExplorationStrategy.sampling
        else:
            raise ValueError(
                'Linear bandit agent with policy %s not implemented' %
                exploration_policy)
        policy = lin_policy.LinearBanditPolicy(
            action_spec=action_spec,
            cov_matrix=self._cov_matrix_list,
            data_vector=self._data_vector_list,
            num_samples=self._num_samples_list,
            time_step_spec=time_step_spec,
            exploration_strategy=exploration_strategy,
            alpha=alpha,
            eig_vals=self._eig_vals_list if self._use_eigendecomp else (),
            eig_matrix=self._eig_matrix_list if self._use_eigendecomp else (),
            tikhonov_weight=self._tikhonov_weight,
            add_bias=add_bias,
            emit_policy_info=emit_policy_info,
            emit_log_probability=emit_log_probability,
            accepts_per_arm_features=accepts_per_arm_features,
            observation_and_action_constraint_splitter=(
                observation_and_action_constraint_splitter))

        training_data_spec = None
        if accepts_per_arm_features:
            training_data_spec = bandit_spec_utils.drop_arm_observation(
                policy.trajectory_spec)
        super(LinearBanditAgent,
              self).__init__(time_step_spec=time_step_spec,
                             action_spec=action_spec,
                             policy=policy,
                             collect_policy=policy,
                             training_data_spec=training_data_spec,
                             debug_summaries=debug_summaries,
                             summarize_grads_and_vars=summarize_grads_and_vars,
                             enable_summaries=enable_summaries,
                             train_sequence_length=None)
        self._as_trajectory = data_converter.AsTrajectory(self.data_context,
                                                          sequence_length=None)
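
In practice this linear bandit base class is usually reached through a subclass that fixes the exploration policy; a hedged sketch using the LinUCB entry point (module path and spec shapes are assumptions):

import tensorflow as tf

from tf_agents.bandits.agents import lin_ucb_agent
from tf_agents.specs import tensor_spec
from tf_agents.trajectories import time_step as ts

context_dim = 10
time_step_spec = ts.time_step_spec(tf.TensorSpec([context_dim], tf.float32))
action_spec = tensor_spec.BoundedTensorSpec([], tf.int32, minimum=0, maximum=4)

# LinearUCBAgent wires ExplorationPolicy.linear_ucb_policy into the base class.
agent = lin_ucb_agent.LinearUCBAgent(time_step_spec=time_step_spec,
                                     action_spec=action_spec,
                                     alpha=1.0,
                                     dtype=tf.float32)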
    def __init__(
            self,
            time_step_spec: Optional[ts.TimeStep],
            action_spec: Optional[types.NestedBoundedTensorSpec],
            scalarizer: multi_objective_scalarizer.Scalarizer,
            objective_network_and_loss_fn_sequence: Sequence[Tuple[
                Network, Callable[..., tf.Tensor]]],
            optimizer: tf.keras.optimizers.Optimizer,
            observation_and_action_constraint_splitter: types.Splitter = None,
            accepts_per_arm_features: bool = False,
            # Params for training.
            gradient_clipping: Optional[float] = None,
            # Params for debugging.
            debug_summaries: bool = False,
            summarize_grads_and_vars: bool = False,
            enable_summaries: bool = True,
            emit_policy_info: Tuple[Text] = (),
            train_step_counter: Optional[tf.Variable] = None,
            name: Optional[Text] = None):
        """Creates a Greedy Multi-objective Neural Agent.

    Args:
      time_step_spec: A `TimeStep` spec of the expected time_steps.
      action_spec: A nest of `BoundedTensorSpec` representing the actions.
      scalarizer: A
       `tf_agents.bandits.multi_objective.multi_objective_scalarizer.Scalarizer`
        object that implements scalarization of multiple objectives into a
        single scalar reward.
      objective_network_and_loss_fn_sequence: A Sequence of Tuples
        (`tf_agents.network.Network`, error loss function) to be used by the
        agent. Each network `net` will be called as
        `net(observation, training=...)` and is expected to output a
        `tf.Tensor` of predicted values for a specific objective for all
        actions, shaped as [batch-size, number-of-actions]. Each network will be
        trained via minimizing the accompanying error loss function, which takes
        parameters labels, predictions, and weights (any function from tf.losses
        would work).
      optimizer: A 'tf.keras.optimizers.Optimizer' object, the optimizer to use
        for training.
      observation_and_action_constraint_splitter: A function used for masking
        valid/invalid actions with each state of the environment. The function
        takes in a full observation and returns a tuple consisting of 1) the
        part of the observation intended as input to the bandit agent and
        policy, and 2) the boolean mask of shape `[batch_size, num_actions]`.
        This function should also work with a `TensorSpec` as input, and should
        output `TensorSpec` objects for the observation and mask.
      accepts_per_arm_features: (bool) Whether the agent accepts per-arm
        features.
      gradient_clipping: A float representing the norm length to clip gradients
        (or None for no clipping.)
      debug_summaries: A Python bool, default False. When True, debug summaries
        are gathered.
      summarize_grads_and_vars: A Python bool, default False. When True,
        gradients and network variable summaries are written during training.
      enable_summaries: A Python bool, default True. When False, all summaries
        (debug or otherwise) should not be written.
      emit_policy_info: (tuple of strings) what side information we want to get
        as part of the policy info. Allowed values can be found in
        `policy_utilities.PolicyInfo`.
      train_step_counter: An optional `tf.Variable` to increment every time the
        train op is run.  Defaults to the `global_step`.
      name: Python str name of this agent. All variables in this module will
        fall under that name. Defaults to the class name.

    Raises:
      ValueError:
        - If the action spec contains more than one action or it is not a
          bounded scalar int32 spec with minimum 0.
        - If the length of `objective_network_and_loss_fn_sequence` is less than
          two.
    """
        tf.Module.__init__(self, name=name)
        common.tf_agents_gauge.get_cell('TFABandit').set(True)
        self._observation_and_action_constraint_splitter = (
            observation_and_action_constraint_splitter)
        self._num_actions = policy_utilities.get_num_actions_from_tensor_spec(
            action_spec)
        self._accepts_per_arm_features = accepts_per_arm_features

        self._num_objectives = len(objective_network_and_loss_fn_sequence)
        if self._num_objectives < 2:
            raise ValueError(
                'Number of objectives should be at least two, but found to be {}'
                .format(self._num_objectives))
        self._objective_networks, self._error_loss_fns = tuple(
            zip(*objective_network_and_loss_fn_sequence))
        self._optimizer = optimizer
        self._gradient_clipping = gradient_clipping
        self._heteroscedastic = [
            isinstance(network,
                       heteroscedastic_q_network.HeteroscedasticQNetwork)
            for network in self._objective_networks
        ]

        policy = greedy_multi_objective_policy.GreedyMultiObjectiveNeuralPolicy(
            time_step_spec,
            action_spec,
            scalarizer,
            self._objective_networks,
            observation_and_action_constraint_splitter,
            accepts_per_arm_features=accepts_per_arm_features,
            emit_policy_info=emit_policy_info)
        training_data_spec = None
        if accepts_per_arm_features:
            training_data_spec = bandit_spec_utils.drop_arm_observation(
                policy.trajectory_spec)

        super(GreedyMultiObjectiveNeuralAgent,
              self).__init__(time_step_spec,
                             action_spec,
                             policy,
                             collect_policy=policy,
                             train_sequence_length=None,
                             training_data_spec=training_data_spec,
                             debug_summaries=debug_summaries,
                             summarize_grads_and_vars=summarize_grads_and_vars,
                             enable_summaries=enable_summaries,
                             train_step_counter=train_step_counter,
                             validate_args=False)
        self._as_trajectory = data_converter.AsTrajectory(self.data_context,
                                                          sequence_length=None)
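
The `objective_network_and_loss_fn_sequence` argument pairs one value network with one error loss per objective. A hedged sketch of building such a sequence with two objectives (network sizes and the use of `QNetwork` are illustrative choices):

import tensorflow as tf

from tf_agents.networks import q_network
from tf_agents.specs import tensor_spec
from tf_agents.trajectories import time_step as ts

observation_spec = tf.TensorSpec([8], tf.float32)
time_step_spec = ts.time_step_spec(observation_spec)
action_spec = tensor_spec.BoundedTensorSpec([], tf.int32, minimum=0, maximum=3)

# One (network, loss_fn) pair per objective; each network is expected to output
# [batch_size, num_actions] predicted values for its objective.
objective_network_and_loss_fn_sequence = [
    (q_network.QNetwork(observation_spec, action_spec, fc_layer_params=(32,)),
     tf.compat.v1.losses.mean_squared_error)
    for _ in range(2)
]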
Example #13
    def __init__(
            self,
            time_step_spec: ts.TimeStep,
            action_spec: types.NestedTensorSpec,
            cloning_network: network.Network,
            optimizer: types.Optimizer,
            num_outer_dims: Literal[1, 2] = 1,  # pylint: disable=bad-whitespace
            epsilon_greedy: types.Float = 0.1,
            loss_fn: Optional[Callable[[types.NestedTensor, bool],
                                       types.Tensor]] = None,
            gradient_clipping: Optional[types.Float] = None,
            # Params for debugging.
            debug_summaries: bool = False,
            summarize_grads_and_vars: bool = False,
            train_step_counter: Optional[tf.Variable] = None,
            name: Optional[Text] = None):
        """Creates an instance of a Behavioral Cloning agent.

    Args:
      time_step_spec: A `TimeStep` spec of the expected time_steps.
      action_spec: A nest of BoundedTensorSpec representing the actions.
      cloning_network: A `tf_agents.networks.Network` to be used by the agent.
        The network will be called as

          ```
          network(observation, step_type=step_type, network_state=initial_state)
          ```
        and must return a 2-tuple with elements `(output, next_network_state)`
      optimizer: The optimizer to use for training.
      num_outer_dims: The number of outer dimensions for the agent. Must be
        either 1 or 2. If 2, training will require both a batch_size and time
        dimension on every Tensor; if 1, training will require only a batch_size
        outer dimension.
      epsilon_greedy: probability of choosing a random action in the default
        epsilon-greedy collect policy (used only if actions are discrete)
      loss_fn: A function for computing the error between the output of the
        cloning network and the action that was taken. If None, the loss
        depends on the action dtype. The `loss_fn` is called with parameters:
        `(experience, training)`, and must return a loss value for each element
        of the batch.
      gradient_clipping: Norm length to clip gradients.
      debug_summaries: A bool to gather debug summaries.
      summarize_grads_and_vars: If True, gradient and network variable summaries
        will be written during training.
      train_step_counter: An optional counter to increment every time the train
        op is run.  Defaults to the global_step.
      name: The name of this agent. All variables in this module will fall
        under that name. Defaults to the class name.
    """
        tf.Module.__init__(self, name=name)
        self._cloning_network = cloning_network
        self._optimizer = optimizer
        self._gradient_clipping = gradient_clipping

        action_spec = tensor_spec.from_spec(action_spec)
        flat_action_spec = tf.nest.flatten(action_spec)
        continuous_specs = [
            tensor_spec.is_continuous(s) for s in flat_action_spec
        ]

        if not flat_action_spec:
            raise ValueError(
                'The `action_spec` must contain at least one action.')

        single_discrete_scalar_action = (
            len(flat_action_spec) == 1 and flat_action_spec[0].shape.rank == 0
            and not tensor_spec.is_continuous(flat_action_spec[0]))
        single_continuous_action = (len(flat_action_spec) == 1
                                    and tensor_spec.is_continuous(
                                        flat_action_spec[0]))

        if (not loss_fn and not single_discrete_scalar_action
                and not single_continuous_action):
            raise ValueError(
                'A `loss_fn` must be provided unless there is a single, scalar '
                'discrete action or a single (scalar or non-scalar) continuous '
                'action.')

        self._network_output_spec = cloning_network.create_variables(
            time_step_spec.observation)

        # If there is a mix of continuous and discrete actions, use an actor
        # policy (via `_setup_as_continuous`); this relies on the custom
        # `loss_fn` whose presence was verified above.
        if any(continuous_specs):
            policy, collect_policy = self._setup_as_continuous(
                time_step_spec, action_spec, loss_fn)
        else:
            policy, collect_policy = self._setup_as_discrete(
                time_step_spec, action_spec, loss_fn, epsilon_greedy)

        super(BehavioralCloningAgent,
              self).__init__(time_step_spec,
                             action_spec,
                             policy,
                             collect_policy,
                             train_sequence_length=None,
                             debug_summaries=debug_summaries,
                             summarize_grads_and_vars=summarize_grads_and_vars,
                             train_step_counter=train_step_counter)

        self._as_trajectory = data_converter.AsTrajectory(
            self.data_context,
            sequence_length=None,
            num_outer_dims=num_outer_dims)
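
A hedged construction sketch for the behavioral cloning agent above, using a single scalar discrete action so that no custom `loss_fn` is needed; the module path and network/spec choices are assumptions:

import tensorflow as tf

from tf_agents.agents.behavioral_cloning import behavioral_cloning_agent
from tf_agents.networks import q_network
from tf_agents.specs import tensor_spec
from tf_agents.trajectories import time_step as ts

observation_spec = tf.TensorSpec([4], tf.float32)
time_step_spec = ts.time_step_spec(observation_spec)
action_spec = tensor_spec.BoundedTensorSpec([], tf.int32, minimum=0, maximum=1)

cloning_network = q_network.QNetwork(observation_spec, action_spec)

agent = behavioral_cloning_agent.BehavioralCloningAgent(
    time_step_spec,
    action_spec,
    cloning_network=cloning_network,
    optimizer=tf.keras.optimizers.Adam(learning_rate=1e-3),
    # Expect [batch, time] outer dimensions on training tensors.
    num_outer_dims=2)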
Example #14
  def __init__(
      self,
      time_step_spec: types.TimeStep,
      action_spec: types.BoundedTensorSpec,
      encoding_network: types.Network,
      encoding_network_num_train_steps: int,
      encoding_dim: int,
      optimizer: types.Optimizer,
      variable_collection: Optional[NeuralLinUCBVariableCollection] = None,
      alpha: float = 1.0,
      gamma: float = 1.0,
      epsilon_greedy: float = 0.0,
      observation_and_action_constraint_splitter: Optional[
          types.Splitter] = None,
      accepts_per_arm_features: bool = False,
      distributed_train_encoding_network: bool = False,
      # Params for training.
      error_loss_fn: types.LossFn = tf.compat.v1.losses.mean_squared_error,
      gradient_clipping: Optional[float] = None,
      # Params for debugging.
      debug_summaries: bool = False,
      summarize_grads_and_vars: bool = False,
      train_step_counter: Optional[tf.Variable] = None,
      emit_policy_info: Sequence[Text] = (),
      emit_log_probability: bool = False,
      dtype: tf.DType = tf.float64,
      name: Optional[Text] = None):
    """Initialize an instance of `NeuralLinUCBAgent`.

    Args:
      time_step_spec: A `TimeStep` spec describing the expected `TimeStep`s.
      action_spec: A scalar `BoundedTensorSpec` with `int32` or `int64` dtype
        describing the number of actions for this agent.
      encoding_network: a Keras network that encodes the observations.
      encoding_network_num_train_steps: how many training steps to run for
        training the encoding network before switching to LinUCB. If negative,
        the encoding network is assumed to be already trained.
      encoding_dim: the dimension of encoded observations.
      optimizer: The optimizer to use for training.
      variable_collection: Instance of `NeuralLinUCBVariableCollection`.
        Collection of variables to be updated by the agent. If `None`, a new
        instance of `NeuralLinUCBVariableCollection` will be created. Note that
        this collection excludes the variables owned by the encoding network.
      alpha: (float) positive scalar. This is the exploration parameter that
        multiplies the confidence intervals.
      gamma: a float forgetting factor in [0.0, 1.0]. When set to
        1.0, the algorithm does not forget.
      epsilon_greedy: A float representing the probability of choosing a random
        action instead of the greedy action.
      observation_and_action_constraint_splitter: A function used for masking
        valid/invalid actions with each state of the environment. The function
        takes in a full observation and returns a tuple consisting of 1) the
        part of the observation intended as input to the bandit agent and
        policy, and 2) the boolean mask. This function should also work with a
        `TensorSpec` as input, and should output `TensorSpec` objects for the
        observation and mask.
      accepts_per_arm_features: (bool) Whether the policy accepts per-arm
        features.
      distributed_train_encoding_network: (bool) whether to train the encoding
        network or not. This applies only in a distributed training setting. When
        set to `True`, this agent will train the encoding network. Otherwise, it
        will assume the encoding network is already trained and will train
        LinUCB on top of it.
      error_loss_fn: A function for computing the error loss, taking parameters
        labels, predictions, and weights (any function from tf.losses would
        work). The default is `tf.losses.mean_squared_error`.
      gradient_clipping: A float representing the norm length to clip gradients
        (or None for no clipping.)
      debug_summaries: A Python bool, default False. When True, debug summaries
        are gathered.
      summarize_grads_and_vars: A Python bool, default False. When True,
        gradients and network variable summaries are written during training.
      train_step_counter: An optional `tf.Variable` to increment every time the
        train op is run.  Defaults to the `global_step`.
      emit_policy_info: (tuple of strings) what side information we want to get
        as part of the policy info. Allowed values can be found in
        `policy_utilities.PolicyInfo`.
      emit_log_probability: Whether the NeuralLinUCBPolicy emits
        log-probabilities or not. Since the policy is deterministic, the
        probability is just 1.
      dtype: The type of the parameters stored and updated by the agent. Should
        be one of `tf.float32` and `tf.float64`. Defaults to `tf.float64`.
      name: a name for this instance of `NeuralLinUCBAgent`.

    Raises:
      TypeError if variable_collection is not an instance of
        `NeuralLinUCBVariableCollection`.
      ValueError if dtype is not one of `tf.float32` or `tf.float64`.
    """
    tf.Module.__init__(self, name=name)
    common.tf_agents_gauge.get_cell('TFABandit').set(True)
    self._num_actions = policy_utilities.get_num_actions_from_tensor_spec(
        action_spec)
    self._num_models = 1 if accepts_per_arm_features else self._num_actions
    self._observation_and_action_constraint_splitter = (
        observation_and_action_constraint_splitter)
    self._accepts_per_arm_features = accepts_per_arm_features
    self._alpha = alpha
    if variable_collection is None:
      variable_collection = NeuralLinUCBVariableCollection(
          self._num_models, encoding_dim, dtype)
    elif not isinstance(variable_collection, NeuralLinUCBVariableCollection):
      raise TypeError('Parameter `variable_collection` should be '
                      'of type `NeuralLinUCBVariableCollection`.')
    self._variable_collection = variable_collection
    self._gamma = gamma
    if self._gamma < 0.0 or self._gamma > 1.0:
      raise ValueError('Forgetting factor `gamma` must be in [0.0, 1.0].')
    self._dtype = dtype
    if dtype not in (tf.float32, tf.float64):
      raise ValueError(
          'Agent dtype should be either `tf.float32` or `tf.float64`.')
    self._epsilon_greedy = epsilon_greedy

    reward_layer = tf.keras.layers.Dense(
        self._num_models,
        kernel_initializer=tf.random_uniform_initializer(
            minval=-0.03, maxval=0.03),
        use_bias=False,
        activation=None,
        name='reward_layer')

    encoding_network.create_variables()
    self._encoding_network = encoding_network
    reward_layer.build(input_shape=tf.TensorShape([None, encoding_dim]))
    self._reward_layer = reward_layer
    self._encoding_network_num_train_steps = encoding_network_num_train_steps
    self._encoding_dim = encoding_dim
    self._optimizer = optimizer
    self._error_loss_fn = error_loss_fn
    self._gradient_clipping = gradient_clipping
    train_step_counter = tf.compat.v1.train.get_or_create_global_step()
    self._distributed_train_encoding_network = (
        distributed_train_encoding_network)

    policy = neural_linucb_policy.NeuralLinUCBPolicy(
        encoding_network=self._encoding_network,
        encoding_dim=self._encoding_dim,
        reward_layer=self._reward_layer,
        epsilon_greedy=self._epsilon_greedy,
        actions_from_reward_layer=self.actions_from_reward_layer,
        cov_matrix=self.cov_matrix,
        data_vector=self.data_vector,
        num_samples=self.num_samples,
        time_step_spec=time_step_spec,
        alpha=alpha,
        emit_policy_info=emit_policy_info,
        emit_log_probability=emit_log_probability,
        accepts_per_arm_features=accepts_per_arm_features,
        distributed_use_reward_layer=distributed_train_encoding_network,
        observation_and_action_constraint_splitter=(
            observation_and_action_constraint_splitter))

    training_data_spec = None
    if accepts_per_arm_features:
      training_data_spec = bandit_spec_utils.drop_arm_observation(
          policy.trajectory_spec)
    super(NeuralLinUCBAgent, self).__init__(
        time_step_spec=time_step_spec,
        action_spec=policy.action_spec,
        policy=policy,
        collect_policy=policy,
        train_sequence_length=None,
        training_data_spec=training_data_spec,
        debug_summaries=debug_summaries,
        summarize_grads_and_vars=summarize_grads_and_vars,
        train_step_counter=train_step_counter,
        validate_args=False)

    self._as_trajectory = data_converter.AsTrajectory(
        self.data_context, sequence_length=None)
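
The `encoding_network` is expected to map observations to an `encoding_dim`-sized feature vector that LinUCB then runs on top of. A hedged sketch using `EncodingNetwork`, assuming the last entry of `fc_layer_params` fixes the output width (layer sizes are illustrative):

import tensorflow as tf

from tf_agents.networks import encoding_network

encoding_dim = 16
observation_spec = tf.TensorSpec([10], tf.float32)

# The final fully connected layer has width `encoding_dim`, so the encoder's
# output matches the dimension expected by the agent.
obs_encoder = encoding_network.EncodingNetwork(
    input_tensor_spec=observation_spec,
    fc_layer_params=(64, encoding_dim))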
Example #15
  def __init__(
      self,
      time_step_spec: types.TimeStep,
      action_spec: types.BoundedTensorSpec,
      reward_network: types.Network,
      optimizer: types.Optimizer,
      observation_and_action_constraint_splitter: Optional[
          types.Splitter] = None,
      accepts_per_arm_features: bool = False,
      constraints: Iterable[constr.BaseConstraint] = (),
      # Params for training.
      error_loss_fn: types.LossFn = tf.compat.v1.losses.mean_squared_error,
      gradient_clipping: Optional[float] = None,
      # Params for debugging.
      debug_summaries: bool = False,
      summarize_grads_and_vars: bool = False,
      enable_summaries: bool = True,
      emit_policy_info: Tuple[Text, ...] = (),
      train_step_counter: Optional[tf.Variable] = None,
      laplacian_matrix: Optional[types.Float] = None,
      laplacian_smoothing_weight: float = 0.001,
      name: Optional[Text] = None):
    """Creates a Greedy Reward Network Prediction Agent.

     In some use cases, the actions are not independent and they are related to
     each other (e.g., when the actions are ordinal integers). Assuming that
     the relations between arms can be modeled by a graph, we may want to
     enforce that the estimated reward function is smooth over the graph. This
     implies that the estimated rewards `r_i` and `r_j` for two related actions
     `i` and `j`, should be close to each other. To quantify this smoothness
     criterion we use the Laplacian matrix `L` of the graph over the actions.
     When the laplacian smoothing is enabled, the loss is extended to:
     ```
       Loss_new := Loss + lambda r^T * L * r,
     ```
     where `r` is the estimated reward vector for all actions. The second
     term is the laplacian smoothing regularization term and `lambda` is the
     weight that determines how strongly we enforce the regularization.
     For more details, please see:
     "Bandits on graphs and structures", Michal Valko
     https://hal.inria.fr/tel-01359757/document

    Args:
      time_step_spec: A `TimeStep` spec of the expected time_steps.
      action_spec: A nest of `BoundedTensorSpec` representing the actions.
      reward_network: A `tf_agents.network.Network` to be used by the agent. The
        network will be called with call(observation, step_type) and it is
        expected to provide a reward prediction for all actions.
      optimizer: The optimizer to use for training.
      observation_and_action_constraint_splitter: A function used for masking
        valid/invalid actions with each state of the environment. The function
        takes in a full observation and returns a tuple consisting of 1) the
        part of the observation intended as input to the bandit agent and
        policy, and 2) the boolean mask. This function should also work with a
        `TensorSpec` as input, and should output `TensorSpec` objects for the
        observation and mask.
      accepts_per_arm_features: (bool) Whether the policy accepts per-arm
        features.
      constraints: iterable of constraints objects that are instances of
        `tf_agents.bandits.agents.NeuralConstraint`.
      error_loss_fn: A function for computing the error loss, taking parameters
        labels, predictions, and weights (any function from tf.losses would
        work). The default is `tf.losses.mean_squared_error`.
      gradient_clipping: A float representing the norm length to clip gradients
        (or None for no clipping.)
      debug_summaries: A Python bool, default False. When True, debug summaries
        are gathered.
      summarize_grads_and_vars: A Python bool, default False. When True,
        gradients and network variable summaries are written during training.
      enable_summaries: A Python bool, default True. When False, all summaries
        (debug or otherwise) should not be written.
      emit_policy_info: (tuple of strings) what side information we want to get
        as part of the policy info. Allowed values can be found in
        `policy_utilities.PolicyInfo`.
      train_step_counter: An optional `tf.Variable` to increment every time the
        train op is run.  Defaults to the `global_step`.
      laplacian_matrix: A float `Tensor` or a numpy array shaped
        `[num_actions, num_actions]`. This holds the Laplacian matrix used to
        regularize the smoothness of the estimated expected reward function.
        This only applies to problems where the actions have a graph structure.
        If `None`, the regularization is not applied.
      laplacian_smoothing_weight: A float that determines the weight of the
        regularization term. Note that this has no effect if `laplacian_matrix`
        above is `None`.
      name: Python str name of this agent. All variables in this module will
        fall under that name. Defaults to the class name.

    Raises:
      ValueError: If the action spec contains more than one action or it is
        not a bounded scalar int32 spec with minimum 0.
      InvalidArgumentError: if the Laplacian provided is not None and not valid.
    """
    tf.Module.__init__(self, name=name)
    common.tf_agents_gauge.get_cell('TFABandit').set(True)
    self._observation_and_action_constraint_splitter = (
        observation_and_action_constraint_splitter)
    self._num_actions = policy_utilities.get_num_actions_from_tensor_spec(
        action_spec)
    self._accepts_per_arm_features = accepts_per_arm_features
    self._constraints = constraints

    reward_network.create_variables()
    self._reward_network = reward_network
    self._optimizer = optimizer
    self._error_loss_fn = error_loss_fn
    self._gradient_clipping = gradient_clipping
    self._heteroscedastic = isinstance(
        reward_network, heteroscedastic_q_network.HeteroscedasticQNetwork)
    self._laplacian_matrix = None
    if laplacian_matrix is not None:
      self._laplacian_matrix = tf.convert_to_tensor(
          laplacian_matrix, dtype=tf.float32)
      # Check the validity of the laplacian matrix.
      tf.debugging.assert_near(
          0.0, tf.norm(tf.reduce_sum(self._laplacian_matrix, 1)))
      tf.debugging.assert_near(
          0.0, tf.norm(tf.reduce_sum(self._laplacian_matrix, 0)))
    self._laplacian_smoothing_weight = laplacian_smoothing_weight

    policy = greedy_reward_policy.GreedyRewardPredictionPolicy(
        time_step_spec,
        action_spec,
        reward_network,
        observation_and_action_constraint_splitter,
        constraints=constraints,
        accepts_per_arm_features=accepts_per_arm_features,
        emit_policy_info=emit_policy_info)
    training_data_spec = None
    if accepts_per_arm_features:
      training_data_spec = bandit_spec_utils.drop_arm_observation(
          policy.trajectory_spec)

    super(GreedyRewardPredictionAgent, self).__init__(
        time_step_spec,
        action_spec,
        policy,
        collect_policy=policy,
        train_sequence_length=None,
        training_data_spec=training_data_spec,
        debug_summaries=debug_summaries,
        summarize_grads_and_vars=summarize_grads_and_vars,
        enable_summaries=enable_summaries,
        train_step_counter=train_step_counter)
    self._as_trajectory = data_converter.AsTrajectory(
        self.data_context, sequence_length=None)
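
The Laplacian described in the docstring must have rows and columns that sum to zero, which is exactly what the `assert_near` checks above verify. A small sketch for three ordinal actions arranged in a path graph, where only neighboring actions are considered related:

import numpy as np

# Path graph over 3 ordinal actions: 0 -- 1 -- 2.
# The diagonal holds node degrees, off-diagonal entries are -1 for connected
# pairs, and every row and column sums to zero.
laplacian_matrix = np.array([[1.0, -1.0, 0.0],
                             [-1.0, 2.0, -1.0],
                             [0.0, -1.0, 1.0]], dtype=np.float32)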
Example #16
    def __init__(self,
                 time_step_spec: ts.TimeStep,
                 action_spec: types.TensorSpec,
                 actor_network: network.Network,
                 optimizer: types.Optimizer,
                 value_network: Optional[network.Network] = None,
                 value_estimation_loss_coef: types.Float = 0.2,
                 advantage_fn: Optional[AdvantageFnType] = None,
                 use_advantage_loss: bool = True,
                 gamma: types.Float = 1.0,
                 normalize_returns: bool = True,
                 gradient_clipping: Optional[types.Float] = None,
                 debug_summaries: bool = False,
                 summarize_grads_and_vars: bool = False,
                 entropy_regularization: Optional[types.Float] = None,
                 train_step_counter: Optional[tf.Variable] = None,
                 name: Optional[Text] = None):
        """Creates a REINFORCE Agent.

    Args:
      time_step_spec: A `TimeStep` spec of the expected time_steps.
      action_spec: A nest of BoundedTensorSpec representing the actions.
      actor_network: A tf_agents.network.Network to be used by the agent. The
        network will be called with call(observation, step_type).
      optimizer: Optimizer for the actor network.
      value_network: (Optional) A `tf_agents.network.Network` to be used by the
        agent. The network will be called with call(observation, step_type) and
        returns a floating point value tensor.
      value_estimation_loss_coef: (Optional) Multiplier for value prediction
        loss to balance with policy gradient loss.
      advantage_fn: A function `A(returns, value_preds)` that takes returns and
        value function predictions as input and returns advantages. The default
        is `A(returns, value_preds) = returns - value_preds` if a value network
        is specified and `use_advantage_loss=True`, otherwise `A(returns,
        value_preds) = returns`.
      use_advantage_loss: Whether to use value function predictions for
        computing returns. `use_advantage_loss=False` is equivalent to setting
        `advantage_fn=lambda returns, value_preds: returns`.
      gamma: A discount factor for future rewards.
      normalize_returns: Whether to normalize returns across episodes when
        computing the loss.
      gradient_clipping: Norm length to clip gradients.
      debug_summaries: A bool to gather debug summaries.
      summarize_grads_and_vars: If True, gradient and network variable summaries
        will be written during training.
      entropy_regularization: Coefficient for entropy regularization loss term.
      train_step_counter: An optional counter to increment every time the train
        op is run. Defaults to the global_step.
      name: The name of this agent. All variables in this module will fall under
        that name. Defaults to the class name.
    """
        tf.Module.__init__(self, name=name)

        actor_network.create_variables()
        self._actor_network = actor_network
        if value_network:
            value_network.create_variables()
        self._value_network = value_network

        collect_policy = actor_policy.ActorPolicy(
            time_step_spec=time_step_spec,
            action_spec=action_spec,
            actor_network=self._actor_network,
            clip=True)

        policy = greedy_policy.GreedyPolicy(collect_policy)

        self._optimizer = optimizer
        self._gamma = gamma
        self._normalize_returns = normalize_returns
        self._gradient_clipping = gradient_clipping
        self._entropy_regularization = entropy_regularization
        self._value_estimation_loss_coef = value_estimation_loss_coef
        self._baseline = self._value_network is not None
        self._advantage_fn = advantage_fn
        if self._advantage_fn is None:
            if use_advantage_loss and self._baseline:
                self._advantage_fn = lambda returns, value_preds: returns - value_preds
            else:
                self._advantage_fn = lambda returns, _: returns

        super(ReinforceAgent,
              self).__init__(time_step_spec,
                             action_spec,
                             policy,
                             collect_policy,
                             train_sequence_length=None,
                             debug_summaries=debug_summaries,
                             summarize_grads_and_vars=summarize_grads_and_vars,
                             train_step_counter=train_step_counter)
        self._as_trajectory = data_converter.AsTrajectory(self.data_context)
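
A hedged construction sketch for the REINFORCE agent above (module paths, layer sizes, and spec shapes are assumptions):

import tensorflow as tf

from tf_agents.agents.reinforce import reinforce_agent
from tf_agents.networks import actor_distribution_network
from tf_agents.specs import tensor_spec
from tf_agents.trajectories import time_step as ts

observation_spec = tf.TensorSpec([4], tf.float32)
time_step_spec = ts.time_step_spec(observation_spec)
action_spec = tensor_spec.BoundedTensorSpec([], tf.int32, minimum=0, maximum=1)

actor_net = actor_distribution_network.ActorDistributionNetwork(
    observation_spec, action_spec, fc_layer_params=(32,))

agent = reinforce_agent.ReinforceAgent(
    time_step_spec,
    action_spec,
    actor_network=actor_net,
    optimizer=tf.keras.optimizers.Adam(learning_rate=1e-3),
    normalize_returns=True)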
    def __init__(
            self,
            time_step_spec,
            action_spec,
            optimizer=None,
            actor_net=None,
            value_net=None,
            importance_ratio_clipping=0.0,
            lambda_value=0.95,
            discount_factor=0.99,
            entropy_regularization=0.0,
            policy_l2_reg=0.0,
            value_function_l2_reg=0.0,
            shared_vars_l2_reg=0.0,
            value_pred_loss_coef=0.5,
            num_epochs=25,
            use_gae=False,
            use_td_lambda_return=False,
            normalize_rewards=True,
            reward_norm_clipping=10.0,
            normalize_observations=True,
            log_prob_clipping=0.0,
            kl_cutoff_factor=0.0,
            kl_cutoff_coef=0.0,
            initial_adaptive_kl_beta=0.0,
            adaptive_kl_target=0.0,
            adaptive_kl_tolerance=0.0,
            gradient_clipping=None,
            value_clipping=None,
            check_numerics=False,
            # TODO(b/150244758): Change the default to False once we move
            # clients onto Reverb.
            compute_value_and_advantage_in_train=True,
            update_normalizers_in_train=True,
            debug_summaries=False,
            summarize_grads_and_vars=False,
            train_step_counter=None,
            name='AttentionPPOAgent'):
        """Creates a PPO Agent.

    Args:
      time_step_spec: A `TimeStep` spec of the expected time_steps.
      action_spec: A nest of `BoundedTensorSpec` representing the actions.
      optimizer: Optimizer to use for the agent, default to using
        `tf.compat.v1.train.AdamOptimizer`.
      actor_net: A `network.DistributionNetwork` which maps observations to
        action distributions. Commonly, it is set to
        `actor_distribution_network.ActorDistributionNetwork`.
      value_net: A `Network` which returns the value prediction for input
        states, with `call(observation, step_type, network_state)`. Commonly, it
        is set to `value_network.ValueNetwork`.
      importance_ratio_clipping: Epsilon in clipped, surrogate PPO objective.
        For more detail, see explanation at the top of the doc.
      lambda_value: Lambda parameter for TD-lambda computation.
      discount_factor: Discount factor for return computation. Default to `0.99`
        which is the value used for all environments from (Schulman, 2017).
      entropy_regularization: Coefficient for entropy regularization loss term.
        Default to `0.0` because no entropy bonus was used in (Schulman, 2017).
      policy_l2_reg: Coefficient for L2 regularization of unshared actor_net
        weights. Default to `0.0` because no L2 regularization was applied on
        the policy network weights in (Schulman, 2017).
      value_function_l2_reg: Coefficient for l2 regularization of unshared value
        function weights. Default to `0.0` because no L2 regularization was
        applied on the policy network weights in (Schulman, 2017).
      shared_vars_l2_reg: Coefficient for l2 regularization of weights shared
        between actor_net and value_net. Default to `0.0` because no L2
        regularization was applied on the policy network or value network
        weights in (Schulman, 2017).
      value_pred_loss_coef: Multiplier for value prediction loss to balance with
        policy gradient loss. Defaults to `0.5`, which was used for all
        environments in the OpenAI baseline implementation. This parameter is
        irrelevant unless you are sharing part of actor_net and value_net. In
        that case, you would want to tune this coefficient, whose value depends
        on the network architecture of your choice.
      num_epochs: Number of epochs for computing policy updates. (Schulman, 2017)
        sets this to 10 for Mujoco, 15 for Roboschool and 3 for Atari.
      use_gae: If True (default False), uses generalized advantage estimation
        for computing per-timestep advantage. Else, just subtracts value
        predictions from empirical return.
      use_td_lambda_return: If True (default False), uses td_lambda_return for
        training value function; here: `td_lambda_return = gae_advantage +
          value_predictions`. `use_gae` must be set to `True` as well to enable
          TD -lambda returns. If `use_td_lambda_return` is set to True while
          `use_gae` is False, the empirical return will be used and a warning
          will be logged.
      normalize_rewards: If `True`, keeps a moving variance of rewards and
        normalizes incoming rewards. While not mentioned directly in
        (Schulman, 2017), reward normalization was implemented in OpenAI
        baselines, and (Ilyas et al., 2018) pointed out that it largely
        improves performance. See Figure 1 of
        https://arxiv.org/pdf/1811.02553.pdf for a comparison with and without
        reward scaling.
      reward_norm_clipping: Value at which to clip the normalized reward,
        symmetrically above and below zero. An additional optimization proposed
        in (Ilyas et al., 2018), typically set to `5` or `10`.
      normalize_observations: If `True`, keeps a moving mean and variance of
        observations and normalizes incoming observations. Additional
        optimization proposed in (Ilyas et al., 2018). If `True` and the
        observation spec is not tf.float32 (such as Atari), please manually
        convert the observation spec received from the environment to
        tf.float32 before creating the networks. Otherwise, the normalized
        input to the network (float32) will have a different dtype from what
        the network expects, resulting in a mismatch error.
        Example usage:

        ```python
        observation_tensor_spec, action_spec, time_step_tensor_spec = (
            spec_utils.get_tensor_specs(env))
        normalized_observation_tensor_spec = tf.nest.map_structure(
            lambda s: tf.TensorSpec(dtype=tf.float32, shape=s.shape, name=s.name),
            observation_tensor_spec)
        actor_net = actor_distribution_network.ActorDistributionNetwork(
            normalized_observation_tensor_spec, ...)
        value_net = value_network.ValueNetwork(
            normalized_observation_tensor_spec, ...)
        # Note that the agent still uses the original time_step_tensor_spec
        # from the environment.
        agent = ppo_clip_agent.PPOClipAgent(
            time_step_tensor_spec, action_spec, actor_net, value_net, ...)
        ```
      log_prob_clipping: +/- value for clipping log probs to prevent inf / NaN
        values.  Default: no clipping.
      kl_cutoff_factor: Only meaningful when `kl_cutoff_coef > 0.0`. A
        multiplier used for calculating the KL cutoff
        (`= kl_cutoff_factor * adaptive_kl_target`). If the policy KL averaged
        across the batch exceeds this cutoff, a squared cutoff loss is added to
        the loss function.
      kl_cutoff_coef: `kl_cutoff_coef` and `kl_cutoff_factor` are additional
        parameters for using a KL cutoff loss term in addition to the adaptive
        KL loss term. Defaults to `0.0`, which disables the KL cutoff loss
        term, as it was not used in the paper. `kl_cutoff_coef` is the
        coefficient by which the KL cutoff loss term is multiplied before being
        added to the total loss.
      initial_adaptive_kl_beta: Initial value for beta coefficient of adaptive
        KL penalty. This initial value is not important in practice because the
        algorithm quickly adjusts to it. A common default is 1.0.
      adaptive_kl_target: Desired KL target for policy updates. If actual KL is
        far from this target, adaptive_kl_beta will be updated. You should tune
        this for your environment. 0.01 was found to perform well for Mujoco.
      adaptive_kl_tolerance: A tolerance for adaptive_kl_beta. Mean KL above `(1
        + tol) * adaptive_kl_target`, or below `(1 - tol) * adaptive_kl_target`,
        will cause `adaptive_kl_beta` to be updated. `0.5` was chosen
        heuristically in the paper, but the algorithm is not very sensitive to
        it.
      gradient_clipping: Norm length to clip gradients.  Default: no clipping.
      value_clipping: Differences between new and old value predictions are
        clipped to this threshold. Value clipping can be helpful when training
        very deep networks. Default: no clipping.
      check_numerics: If `True`, adds `tf.debugging.check_numerics` to help
        find NaN / Inf values. For debugging only.
      compute_value_and_advantage_in_train: A bool to indicate where value
        prediction and advantage calculation happen.  If True, both happen in
        agent.train(). If False, value prediction is computed during data
        collection. This argument must be set to `False` if mini batch learning
        is enabled.
      update_normalizers_in_train: A bool to indicate whether normalizers are
        updated as part of the `train` method. Set to `False` if mini batch
        learning is enabled, or if `train` is called on multiple iterations of
        the same trajectories. In that case, you would need to use `PPOLearner`
        (which updates all the normalizers outside of the agent). This ensures
        that normalizers are updated in the same way as in (Schulman, 2017).
      debug_summaries: A bool to gather debug summaries.
      summarize_grads_and_vars: If `True`, gradient and variable summaries will
        be written.
      train_step_counter: An optional counter to increment every time the train
        op is run.  Defaults to the global_step.
      name: The name of this agent. All variables in this module will fall under
        that name. Defaults to the class name.

    Raises:
      TypeError: if `actor_net` or `value_net` is not of type
        `tf_agents.networks.Network`.
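
    For reference, a minimal sketch of how `importance_ratio_clipping` enters
    the clipped surrogate objective (illustrative only: `new_log_prob`,
    `old_log_prob`, and `advantages` are placeholders, not part of this API,
    and this is not the agent's actual loss implementation):

    ```python
    ratio = tf.exp(new_log_prob - old_log_prob)
    clipped_ratio = tf.clip_by_value(ratio,
                                     1.0 - importance_ratio_clipping,
                                     1.0 + importance_ratio_clipping)
    surrogate_loss = -tf.reduce_mean(
        tf.minimum(ratio * advantages, clipped_ratio * advantages))
    ```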
    """
        if not isinstance(actor_net, network.Network):
            raise TypeError(
                'actor_net must be an instance of a network.Network.')
        if not isinstance(value_net, network.Network):
            raise TypeError(
                'value_net must be an instance of a network.Network.')

        # PPOPolicy validates these specs; here we only make sure the
        # networks' variables are created.
        actor_net.create_variables(time_step_spec.observation)
        value_net.create_variables(time_step_spec.observation)

        tf.Module.__init__(self, name=name)

        self._optimizer = optimizer
        self._actor_net = actor_net
        self._value_net = value_net
        self._importance_ratio_clipping = importance_ratio_clipping
        self._lambda = lambda_value
        self._discount_factor = discount_factor
        self._entropy_regularization = entropy_regularization
        self._policy_l2_reg = policy_l2_reg
        self._value_function_l2_reg = value_function_l2_reg
        self._shared_vars_l2_reg = shared_vars_l2_reg
        self._value_pred_loss_coef = value_pred_loss_coef
        self._num_epochs = num_epochs
        self._use_gae = use_gae
        self._use_td_lambda_return = use_td_lambda_return
        self._reward_norm_clipping = reward_norm_clipping
        self._log_prob_clipping = log_prob_clipping
        self._kl_cutoff_factor = kl_cutoff_factor
        self._kl_cutoff_coef = kl_cutoff_coef
        self._adaptive_kl_target = adaptive_kl_target
        self._adaptive_kl_tolerance = adaptive_kl_tolerance
        self._gradient_clipping = gradient_clipping or 0.0
        self._value_clipping = value_clipping or 0.0
        self._check_numerics = check_numerics
        self._compute_value_and_advantage_in_train = (
            compute_value_and_advantage_in_train)
        self.update_normalizers_in_train = update_normalizers_in_train
        if not isinstance(self._optimizer, tf.keras.optimizers.Optimizer):
            logging.warning(
                'Only tf.keras.optimizers.Optimizers are well supported, got a '
                'non-TF2 optimizer: %s', self._optimizer)

        self._initial_adaptive_kl_beta = initial_adaptive_kl_beta
        if initial_adaptive_kl_beta > 0.0:
            self._adaptive_kl_beta = common.create_variable(
                'adaptive_kl_beta', initial_adaptive_kl_beta, dtype=tf.float32)
        else:
            self._adaptive_kl_beta = None
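        # When enabled, `adaptive_kl_beta` is adjusted during training so that
        # the mean policy KL tracks `adaptive_kl_target`: it is increased when
        # the KL exceeds the target by more than `adaptive_kl_tolerance` and
        # decreased when the KL falls below it (see the Args docstring above).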

        self._reward_normalizer = None
        if normalize_rewards:
            self._reward_normalizer = tensor_normalizer.StreamingTensorNormalizer(
                tensor_spec.TensorSpec([], tf.float32),
                scope='normalize_reward')

        self._observation_normalizer = None
        if normalize_observations:
            self._observation_normalizer = (
                tensor_normalizer.StreamingTensorNormalizer(
                    time_step_spec.observation,
                    scope='normalize_observations'))

        self._advantage_normalizer = tensor_normalizer.StreamingTensorNormalizer(
            tensor_spec.TensorSpec([], tf.float32),
            scope='normalize_advantages')
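        # Unlike the reward and observation normalizers above, the advantage
        # normalizer is always created.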

        policy = greedy_policy.GreedyPolicy(
            attention_ppo_policy.AttentionPPOPolicy(
                time_step_spec=time_step_spec,
                action_spec=action_spec,
                actor_network=actor_net,
                value_network=value_net,
                observation_normalizer=self._observation_normalizer,
                clip=False,
                collect=False))

        collect_policy = attention_ppo_policy.AttentionPPOPolicy(
            time_step_spec=time_step_spec,
            action_spec=action_spec,
            actor_network=actor_net,
            value_network=value_net,
            observation_normalizer=self._observation_normalizer,
            clip=False,
            collect=True,
            compute_value_and_advantage_in_train=(
                self._compute_value_and_advantage_in_train),
        )
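        # Note: `policy` (greedy) is used for evaluation / inference, while
        # `collect_policy` samples from the action distribution during data
        # collection.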

        if isinstance(self._actor_net, network.DistributionNetwork):
            # Legacy behavior: DistributionNetworks expose the action
            # distribution spec directly as `output_spec`; other networks
            # report it from `create_variables()`.
            self._action_distribution_spec = self._actor_net.output_spec
        else:
            self._action_distribution_spec = self._actor_net.create_variables(
                time_step_spec.observation)

        # Set training_data_spec to collect_data_spec with augmented policy info,
        # iff return and normalized advantage are saved in preprocess_sequence.
        if self._compute_value_and_advantage_in_train:
            training_data_spec = None
        else:
            training_policy_info = collect_policy.trajectory_spec.policy_info.copy(
            )
            training_policy_info.update({
                'value_prediction':
                collect_policy.trajectory_spec.policy_info['value_prediction'],
                'return':
                tensor_spec.TensorSpec(shape=[], dtype=tf.float32),
                'advantage':
                tensor_spec.TensorSpec(shape=[], dtype=tf.float32),
            })
            training_data_spec = collect_policy.trajectory_spec.replace(
                policy_info=training_policy_info)

        super(ppo_agent.PPOAgent,
              self).__init__(time_step_spec,
                             action_spec,
                             policy,
                             collect_policy,
                             train_sequence_length=None,
                             training_data_spec=training_data_spec,
                             debug_summaries=debug_summaries,
                             summarize_grads_and_vars=summarize_grads_and_vars,
                             train_step_counter=train_step_counter)

        # This must be built after super() which sets up self.data_context.
        self._collected_as_transition = data_converter.AsTransition(
            self.collect_data_context, squeeze_time_dim=False)

        self._as_trajectory = data_converter.AsTrajectory(self.data_context,
                                                          sequence_length=None)
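        # `_as_trajectory` converts experience passed to `train` into the
        # trajectory format, while `_collected_as_transition` converts
        # collected data into transitions without squeezing the time dimension.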