Example #1
def project_to_output_distribution(inputs,
                                   output_spec,
                                   project_to_discrete,
                                   project_to_continuous,
                                   outer_rank=1,
                                   scope='project_to_output'):
  """Project a batch of inputs to a distribution object.

  Args:
    inputs: An input Tensor of shape [batch_size, None].
    output_spec: A single output spec.
    project_to_discrete: The method to use for projecting a discrete output.
    project_to_continuous: The method to use for projecting a continuous output.
    outer_rank: The number of outer dimensions of inputs to consider as batch
      dimensions and to treat as batch dimensions of the output distribution.
    scope: The variable scope.

  Returns:
    A distribution object corresponding to the arguments and output spec
      provided.

  Raises:
    ValueError: If the distribution type of output_spec is unclear.
  """
  with tf.variable_scope(scope):
    if tensor_spec.is_discrete(output_spec):
      return project_to_discrete(inputs, output_spec, outer_rank=outer_rank)
    elif tensor_spec.is_continuous(output_spec):
      return project_to_continuous(inputs, output_spec, outer_rank=outer_rank)
    else:
      raise ValueError('Output spec corresponds to unknown distribution.')
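A minimal sketch of the dispatch criterion used above (assuming tensor_spec is tf_agents.specs.tensor_spec): is_discrete() is true for integer dtypes and is_continuous() for floating dtypes, so exactly one branch is taken for any numeric spec.

import tensorflow as tf
from tf_agents.specs import tensor_spec

int_spec = tensor_spec.TensorSpec((4,), tf.int32)
float_spec = tensor_spec.TensorSpec((4,), tf.float32)

# Integer dtype -> discrete projection; floating dtype -> continuous projection.
assert tensor_spec.is_discrete(int_spec) and not tensor_spec.is_continuous(int_spec)
assert tensor_spec.is_continuous(float_spec) and not tensor_spec.is_discrete(float_spec)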
Example #2
 def testExclusive(self, dtype):
     if dtype == tf.string:
         self.skipTest("Not compatible with string type.")
     spec = tensor_spec.TensorSpec((2, 3), dtype=dtype)
     self.assertIs(
         tensor_spec.is_discrete(spec) ^ tensor_spec.is_continuous(spec),
         True)
Example #3
def calc_default_target_entropy(spec):
    """Calc default target entropy
    Args:
        spec (TensorSpec): action spec
    Returns:
    """
    dims = np.product(spec.shape.as_list())
    if tensor_spec.is_continuous(spec):
        e = -1
    else:
        min_prob = 0.01
        p = min_prob
        q = 1 - p
        e = -p * np.log(p) - q * np.log(q)
    return e * dims
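A hedged usage sketch of the function above (the specs are hypothetical; assumes numpy and tf_agents.specs.tensor_spec are the np/tensor_spec modules it imports): a continuous action contributes -1 nat per dimension, while a discrete one contributes -0.01*log(0.01) - 0.99*log(0.99), roughly 0.056 nats per dimension.

import tensorflow as tf
from tf_agents.specs import tensor_spec

# Hypothetical specs: a 3-dim continuous action and a scalar discrete action.
cont_spec = tensor_spec.BoundedTensorSpec((3,), tf.float32, minimum=-1.0, maximum=1.0)
disc_spec = tensor_spec.BoundedTensorSpec((), tf.int64, minimum=0, maximum=4)

print(calc_default_target_entropy(cont_spec))  # -1 * 3 = -3.0
print(calc_default_target_entropy(disc_spec))  # ~= 0.056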
Example #4
def calc_default_target_entropy(spec):
    """Calc default target entropy
    Args:
        spec (TensorSpec): action spec
    Returns:
    """
    zeros = np.zeros(spec.shape)
    min_max = np.broadcast(spec.minimum, spec.maximum, zeros)
    cont = tensor_spec.is_continuous(spec)
    min_prob = 0.01
    log_mp = np.log(min_prob)
    # continuous: suppose the prob concentrates on a delta of 0.01*(M-m)
    # discrete: ignore the entry of 0.99 and uniformly distribute probs on rest
    e = np.sum([(np.log(M - m) + log_mp
                 if cont else min_prob * (np.log(M - m) - log_mp))
                for m, M, _ in min_max])
    return e
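A worked check of this bounds-based variant (the bounds are hypothetical): a single continuous action bounded in [-1, 1] gets log(M - m) + log(0.01) = log(2) - log(100), roughly -3.91 nats, i.e. the entropy of a uniform distribution over a slice of width 0.01 * (M - m); a single discrete action with values 0..4 gets 0.01 * (log(4) - log(0.01)), roughly 0.06 nats.

import numpy as np

# Continuous action, m = -1, M = 1:
print(np.log(2.0) + np.log(0.01))            # ~ -3.91
# Discrete action, m = 0, M = 4:
print(0.01 * (np.log(4.0) - np.log(0.01)))   # ~ 0.06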
Example #5
def calc_default_max_entropy(spec, fraction=0.8):
    """Calc default max entropy
    Args:
        spec (TensorSpec): action spec
        fraction (float): this fraction of the theoretical entropy upper bound
            will be used as the max entropy
    Returns:
        A default max entropy for adjusting the entropy weight
    """
    assert fraction <= 1.0 and fraction > 0
    zeros = np.zeros(spec.shape)
    min_max = np.broadcast(spec.minimum, spec.maximum, zeros)
    cont = tensor_spec.is_continuous(spec)
    # use uniform distributions to compute upper bounds
    e = np.sum([(np.log(M - m) * (fraction if M - m > 1 else 1.0 / fraction)
                 if cont else np.log(M - m + 1) * fraction)
                for m, M, _ in min_max])
    return e
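A worked check of the upper bound above (hypothetical bounds, fraction = 0.8): a continuous action bounded in [-1, 1] contributes log(2) * 0.8, roughly 0.55 nats (uniform over the range), while a discrete action with the 5 values 0..4 contributes log(5) * 0.8, roughly 1.29 nats.

import numpy as np

fraction = 0.8
print(np.log(1.0 - (-1.0)) * fraction)   # continuous: ~ 0.55
print(np.log(4 - 0 + 1) * fraction)      # discrete:   ~ 1.29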
Example #6
    def __init__(
            self,
            time_step_spec: ts.TimeStep,
            action_spec: types.NestedTensorSpec,
            cloning_network: network.Network,
            optimizer: types.Optimizer,
            num_outer_dims: Literal[1, 2] = 1,  # pylint: disable=bad-whitespace
            epsilon_greedy: types.Float = 0.1,
            loss_fn: Optional[Callable[[types.NestedTensor, bool],
                                       types.Tensor]] = None,
            gradient_clipping: Optional[types.Float] = None,
            # Params for debugging.
            debug_summaries: bool = False,
            summarize_grads_and_vars: bool = False,
            train_step_counter: Optional[tf.Variable] = None,
            name: Optional[Text] = None):
        """Creates an instance of a Behavioral Cloning agent.

    Args:
      time_step_spec: A `TimeStep` spec of the expected time_steps.
      action_spec: A nest of BoundedTensorSpec representing the actions.
      cloning_network: A `tf_agents.networks.Network` to be used by the agent.
        The network will be called as

          ```
          network(observation, step_type=step_type, network_state=initial_state)
          ```
        and must return a 2-tuple with elements `(output, next_network_state)`.
      optimizer: The optimizer to use for training.
      num_outer_dims: The number of outer dimensions for the agent. Must be
        either 1 or 2. If 2, training will require both a batch_size and time
        dimension on every Tensor; if 1, training will require only a batch_size
        outer dimension.
      epsilon_greedy: probability of choosing a random action in the default
        epsilon-greedy collect policy (used only if actions are discrete)
      loss_fn: A function for computing the error between the output of the
        cloning network and the action that was taken. If None, the loss
        depends on the action dtype. The `loss_fn` is called with parameters:
        `(experience, training)`, and must return a loss value for each element
        of the batch.
      gradient_clipping: Norm length to clip gradients.
      debug_summaries: A bool to gather debug summaries.
      summarize_grads_and_vars: If True, gradient and network variable summaries
        will be written during training.
      train_step_counter: An optional counter to increment every time the train
        op is run.  Defaults to the global_step.
      name: The name of this agent. All variables in this module will fall
        under that name. Defaults to the class name.
    """
        tf.Module.__init__(self, name=name)
        self._cloning_network = cloning_network
        self._optimizer = optimizer
        self._gradient_clipping = gradient_clipping

        action_spec = tensor_spec.from_spec(action_spec)
        flat_action_spec = tf.nest.flatten(action_spec)
        continuous_specs = [
            tensor_spec.is_continuous(s) for s in flat_action_spec
        ]

        if not flat_action_spec:
            raise ValueError(
                'The `action_spec` must contain at least one action.')

        single_discrete_scalar_action = (
            len(flat_action_spec) == 1 and flat_action_spec[0].shape.rank == 0
            and not tensor_spec.is_continuous(flat_action_spec[0]))
        single_continuous_action = (len(flat_action_spec) == 1
                                    and tensor_spec.is_continuous(
                                        flat_action_spec[0]))

        if (not loss_fn and not single_discrete_scalar_action
                and not single_continuous_action):
            raise ValueError(
                'A `loss_fn` must be provided unless there is a single, scalar '
                'discrete action or a single (scalar or non-scalar) continuous '
                'action.')

        self._network_output_spec = cloning_network.create_variables(
            time_step_spec.observation)

        # If there is a mix of continuous and discrete actions we want to use
        # an actor policy, so we go through `_setup_as_continuous`; this is
        # valid as long as the user provided a custom loss_fn, which we
        # verified above.
        if any(continuous_specs):
            policy, collect_policy = self._setup_as_continuous(
                time_step_spec, action_spec, loss_fn)
        else:
            policy, collect_policy = self._setup_as_discrete(
                time_step_spec, action_spec, loss_fn, epsilon_greedy)

        super(BehavioralCloningAgent,
              self).__init__(time_step_spec,
                             action_spec,
                             policy,
                             collect_policy,
                             train_sequence_length=None,
                             debug_summaries=debug_summaries,
                             summarize_grads_and_vars=summarize_grads_and_vars,
                             train_step_counter=train_step_counter)

        self._as_trajectory = data_converter.AsTrajectory(
            self.data_context,
            sequence_length=None,
            num_outer_dims=num_outer_dims)
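A minimal construction sketch for the agent above (the specs, network, and learning rate are hypothetical; assumes the class above is TF-Agents' BehavioralCloningAgent, and uses a single scalar discrete action with a QNetwork so that no custom loss_fn is required):

import tensorflow as tf
from tf_agents.networks import q_network
from tf_agents.specs import tensor_spec
from tf_agents.trajectories import time_step as ts

observation_spec = tensor_spec.TensorSpec((4,), tf.float32)
action_spec = tensor_spec.BoundedTensorSpec((), tf.int64, minimum=0, maximum=2)
time_step_spec = ts.time_step_spec(observation_spec)

# A Q-network maps observations to one logit per discrete action.
cloning_net = q_network.QNetwork(observation_spec, action_spec)
agent = BehavioralCloningAgent(
    time_step_spec,
    action_spec,
    cloning_network=cloning_net,
    optimizer=tf.keras.optimizers.Adam(1e-3))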
Example #7
 def testExclusive(self, dtype):
     spec = tensor_spec.TensorSpec((2, 3), dtype=dtype)
     self.assertIs(
         tensor_spec.is_discrete(spec) ^ tensor_spec.is_continuous(spec),
         True)
Example #8
 def testIsContinuous(self, dtype):
     spec = tensor_spec.TensorSpec((2, 3), dtype=dtype)
     self.assertIs(tensor_spec.is_continuous(spec), dtype.is_floating)
Example #9
    def __init__(self,
                 action_spec,
                 actor_network: Network,
                 critic_network: Network,
                 critic_loss=None,
                 target_entropy=None,
                 initial_log_alpha=0.0,
                 target_update_tau=0.05,
                 target_update_period=1,
                 dqda_clipping=None,
                 actor_optimizer=None,
                 critic_optimizer=None,
                 alpha_optimizer=None,
                 gradient_clipping=None,
                 train_step_counter=None,
                 debug_summaries=False,
                 name="SacAlgorithm"):
        """Create a SacAlgorithm

        Args:
            action_spec (nested BoundedTensorSpec): representing the actions.
            actor_network (Network): The network will be called with
                call(observation, step_type).
            critic_network (Network): The network will be called with
                call(observation, action, step_type).
            critic_loss (None|OneStepTDLoss): an object for calculating critic loss.
                If None, a default OneStepTDLoss will be used.
            target_entropy (float|None): The target average policy entropy, for
                updating alpha. If None, a default value is computed from the
                action spec.
            initial_log_alpha (float): initial value for variable log_alpha
            target_update_tau (float): Factor for soft update of the target
                networks.
            target_update_period (int): Period for soft update of the target
                networks.
            dqda_clipping (float): when computing the actor loss, clips the
                gradient dqda element-wise between [-dqda_clipping, dqda_clipping].
                Does not perform clipping if dqda_clipping == 0.
            actor_optimizer (tf.optimizers.Optimizer): The optimizer for actor.
            critic_optimizer (tf.optimizers.Optimizer): The optimizer for critic.
            alpha_optimizer (tf.optimizers.Optimizer): The optimizer for alpha.
            gradient_clipping (float): Norm length to clip gradients.
            train_step_counter (tf.Variable): An optional counter to increment
                every time a new iteration is started. If None, it will use
                tf.summary.experimental.get_step(). If this is still None, a
                counter will be created.
            debug_summaries (bool): True if debug summaries should be created.
            name (str): The name of this algorithm.
        """
        critic_network1 = critic_network
        critic_network2 = critic_network.copy(name='CriticNetwork2')
        log_alpha = tfa_common.create_variable(name='log_alpha',
                                               initial_value=initial_log_alpha,
                                               dtype=tf.float32,
                                               trainable=True)
        super().__init__(
            action_spec,
            train_state_spec=SacState(
                share=SacShareState(actor=actor_network.state_spec),
                actor=SacActorState(critic1=critic_network.state_spec,
                                    critic2=critic_network.state_spec),
                critic=SacCriticState(
                    critic1=critic_network.state_spec,
                    critic2=critic_network.state_spec,
                    target_critic1=critic_network.state_spec,
                    target_critic2=critic_network.state_spec)),
            action_distribution_spec=actor_network.output_spec,
            predict_state_spec=actor_network.state_spec,
            optimizer=[actor_optimizer, critic_optimizer, alpha_optimizer],
            get_trainable_variables_func=[
                lambda: actor_network.trainable_variables, lambda:
                (critic_network1.trainable_variables + critic_network2.
                 trainable_variables), lambda: [log_alpha]
            ],
            gradient_clipping=gradient_clipping,
            train_step_counter=train_step_counter,
            debug_summaries=debug_summaries,
            name=name)

        self._log_alpha = log_alpha
        self._actor_network = actor_network
        self._critic_network1 = critic_network1
        self._critic_network2 = critic_network2
        self._target_critic_network1 = self._critic_network1.copy(
            name='TargetCriticNetwork1')
        self._target_critic_network2 = self._critic_network2.copy(
            name='TargetCriticNetwork2')
        self._actor_optimizer = actor_optimizer
        self._critic_optimizer = critic_optimizer
        self._alpha_optimizer = alpha_optimizer

        if critic_loss is None:
            critic_loss = OneStepTDLoss(debug_summaries=debug_summaries)
        self._critic_loss = critic_loss

        flat_action_spec = tf.nest.flatten(self._action_spec)
        self._is_continuous = tensor_spec.is_continuous(flat_action_spec[0])
        if target_entropy is None:
            target_entropy = np.sum(
                list(
                    map(dist_utils.calc_default_target_entropy,
                        flat_action_spec)))
        self._target_entropy = target_entropy

        self._dqda_clipping = dqda_clipping

        self._update_target = common.get_target_updater(
            models=[self._critic_network1, self._critic_network2],
            target_models=[
                self._target_critic_network1, self._target_critic_network2
            ],
            tau=target_update_tau,
            period=target_update_period)

        tfa_common.soft_variables_update(
            self._critic_network1.variables,
            self._target_critic_network1.variables,
            tau=1.0)

        tfa_common.soft_variables_update(
            self._critic_network2.variables,
            self._target_critic_network2.variables,
            tau=1.0)
Example #10
def create_sac_algorithm(env,
                         actor_fc_layers=(100, 100),
                         critic_fc_layers=(100, 100),
                         use_rnns=False,
                         alpha_learning_rate=5e-3,
                         actor_learning_rate=5e-3,
                         critic_learning_rate=5e-3,
                         debug_summaries=False):
    """Create a simple SacAlgorithm.

    Args:
        env (TFEnvironment): A TFEnvironment
        actor_fc_layers (list[int]): list of fc layers parameters for actor network
        critic_fc_layers (list[int]): list of fc layers parameters for critic network
        use_rnns (bool): True if rnn should be used
        alpha_learning_rate (float): learning rate for alpha
        actor_learning_rate (float) : learning rate for actor network
        critic_learning_rate (float) : learning rate for critic network
        debug_summaries (bool): True if debug summaries should be created
    """

    observation_spec = env.observation_spec()
    action_spec = env.action_spec()

    is_continuous = tensor_spec.is_continuous(tf.nest.flatten(action_spec)[0])
    if use_rnns:
        actor_net = ActorDistributionRnnNetwork(
            observation_spec,
            action_spec,
            input_fc_layer_params=actor_fc_layers,
            output_fc_layer_params=())
        if is_continuous:
            critic_net = CriticRnnNetwork(
                (observation_spec, action_spec),
                observation_fc_layer_params=(),
                action_fc_layer_params=(),
                output_fc_layer_params=(),
                joint_fc_layer_params=critic_fc_layers)
        else:
            critic_net = QRnnNetwork(observation_spec,
                                     action_spec,
                                     output_fc_layer_params=(),
                                     input_fc_layer_params=critic_fc_layers)
    else:
        actor_net = ActorDistributionNetwork(observation_spec,
                                             action_spec,
                                             fc_layer_params=actor_fc_layers)
        if is_continuous:
            critic_net = CriticNetwork((observation_spec, action_spec),
                                       joint_fc_layer_params=critic_fc_layers)
        else:
            critic_net = QNetwork(observation_spec,
                                  action_spec,
                                  fc_layer_params=critic_fc_layers)

    actor_optimizer = tf.optimizers.Adam(learning_rate=actor_learning_rate)
    critic_optimizer = tf.optimizers.Adam(learning_rate=critic_learning_rate)
    alpha_optimizer = tf.optimizers.Adam(learning_rate=alpha_learning_rate)
    return SacAlgorithm(action_spec=action_spec,
                        actor_network=actor_net,
                        critic_network=critic_net,
                        actor_optimizer=actor_optimizer,
                        critic_optimizer=critic_optimizer,
                        alpha_optimizer=alpha_optimizer,
                        debug_summaries=debug_summaries)
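A hedged usage sketch (the environment choice is illustrative; assumes the tf_agents Gym suite is available):

from tf_agents.environments import suite_gym, tf_py_environment

env = tf_py_environment.TFPyEnvironment(suite_gym.load('CartPole-v0'))
algorithm = create_sac_algorithm(env, use_rnns=False, debug_summaries=False)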
Example #11
 def testIsContinuous(self, dtype):
     spec = array_spec.ArraySpec((2, 3), dtype=dtype)
     self.assertIs(tensor_spec.is_continuous(spec),
                   issubclass(np.dtype(dtype).type, np.floating))
Example #12
 def _validate_action_spec(action_spec):
     if not tensor_spec.is_continuous(action_spec):
         raise ValueError(
             'OU Noise is applicable only to continuous actions.')
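A quick check of the guard above (treating _validate_action_spec as a standalone helper for illustration; the specs are hypothetical):

import tensorflow as tf
from tf_agents.specs import tensor_spec

# Floating-point (continuous) action spec: passes.
_validate_action_spec(
    tensor_spec.BoundedTensorSpec((2,), tf.float32, minimum=-1.0, maximum=1.0))
# An integer (discrete) spec such as BoundedTensorSpec((), tf.int32, 0, 3)
# would raise ValueError.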
Example #13
def normal(inputs,
           output_spec,
           outer_rank=1,
           projection_layer=default_fully_connected,
           mean_transform=tanh_squash_to_spec,
           std_initializer=tf.zeros_initializer(),
           std_transform=tf.exp,
           distribution_cls=tfp.distributions.Normal):
    """Project a batch of inputs to a batch of means and standard deviations.

  Given an output spec for a single tensor continuous action, produces a
  neural net layer converting inputs to a normal distribution matching
  the spec.  The mean is derived from a fully connected linear layer as
  mean_transform(layer_output, output_spec).  The std is fixed to a single
  trainable tensor (thus independent of the inputs).  Specifically, std is
  parameterized as std_transform(variable).

  Args:
    inputs: An input Tensor of shape [batch_size, ?].
    output_spec: An output spec (either BoundedArraySpec or BoundedTensorSpec).
    outer_rank: The number of outer dimensions of inputs to consider as batch
      dimensions and to treat as batch dimensions of the output distribution.
    projection_layer: Function taking in inputs, num_elements, scope and
      returning a projection of inputs to a Tensor of width num_elements.
    mean_transform: A function taking in layer output and the output_spec,
      returning the means.  Defaults to tanh_squash_to_spec.
    std_initializer: Initializer for std_dev variables.
    std_transform: The function applied to the trainable std variable. For
      example, tf.exp (default), tf.nn.softplus.
    distribution_cls: The distribution class to use for output distribution.
      Default is tfp.distributions.Normal.

  Returns:
    A distribution_cls instance (tfp.distributions.Normal by default) in which
      the standard deviation does not depend on the input.

  Raises:
    ValueError: If output_spec is invalid.
  """
    if not tensor_spec.is_bounded(output_spec):
        raise ValueError('Input output_spec is of invalid type '
                         '%s.' % type(output_spec))
    if not tensor_spec.is_continuous(output_spec):
        raise ValueError('Output is not continuous.')

    batch_squash = utils.BatchSquash(outer_rank)
    inputs = batch_squash.flatten(inputs)
    means = projection_layer(inputs,
                             output_spec.shape.num_elements(),
                             scope='means')
    stds = tf.contrib.layers.bias_add(
        tf.zeros_like(means),  # Independent of inputs.
        initializer=std_initializer,
        scope='stds',
        activation_fn=None)

    means = tf.reshape(means, [-1] + output_spec.shape.as_list())
    means = mean_transform(means, output_spec)
    means = tf.cast(means, output_spec.dtype)

    stds = tf.reshape(stds, [-1] + output_spec.shape.as_list())
    stds = std_transform(stds)
    stds = tf.cast(stds, output_spec.dtype)

    means, stds = batch_squash.unflatten(means), batch_squash.unflatten(stds)
    return distribution_cls(means, stds)
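A hedged TF2/TFP sketch of the same parameterization (not the code above, which relies on tf.contrib): the mean comes from a dense layer squashed to hypothetical bounds, while the std is a single trainable variable transformed by tf.exp, so it does not depend on the input.

import tensorflow as tf
import tensorflow_probability as tfp

num_elements = 3                                  # hypothetical action size
mean_layer = tf.keras.layers.Dense(num_elements)  # produces the (pre-squash) means
log_std = tf.Variable(tf.zeros([num_elements]))   # input-independent std parameter

def make_normal(inputs, low=-1.0, high=1.0):
    # Squash the linear output into [low, high], mimicking tanh_squash_to_spec.
    means = (high + low) / 2.0 + (high - low) / 2.0 * tf.tanh(mean_layer(inputs))
    stds = tf.exp(log_std) * tf.ones_like(means)
    return tfp.distributions.Normal(means, stds)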
Example #14
    def __init__(self,
                 x_spec,
                 y_spec,
                 model=None,
                 fc_layers=(256, ),
                 sampler='buffer',
                 buffer_size=65536,
                 optimizer: tf.optimizers.Optimizer = None,
                 estimator_type='DV',
                 averager=ScalarAdaptiveAverager(),
                 name="MIEstimator"):
        """Create a MIEstimator.

        Args:
            x_spec (nested TensorSpec): spec of x
            y_spec (nested TensorSpec): spec of y
            model (Network): can be called as model([x, y]) and return a Tensor
                with shape=[batch_size, 1]. If None, a default MLP with
                fc_layers will be created.
            fc_layers (tuple[int]): size of hidden layers. Only used if model is
                None.
            sampler (str): type of sampler used to get samples from marginal
                distribution, should be one of ['buffer', 'double_buffer',
                'shuffle', 'shift']
            buffer_size (int): capacity of buffer for storing y for sampler
                'buffer' and 'double_buffer'
            optimizer (tf.optimizers.Optimizer): optimizer
            estimator_type (str): one of 'ML', 'DV', 'KLD' or 'JSD'
            averager (EMAverager): averager used to maintain a moving average
                of exp(T). Only used for 'DV' estimator
            name (str): name of this estimator
        """
        assert estimator_type in ['ML', 'DV', 'KLD', 'JSD'
                                  ], "Wrong estimator_type %s" % estimator_type
        super().__init__(train_state_spec=(), optimizer=optimizer, name=name)
        self._x_spec = x_spec
        self._y_spec = y_spec
        if model is None:
            if estimator_type == 'ML':
                model = TFAEncodingNetwork(
                    name="MIEstimator",
                    input_tensor_spec=x_spec,
                    fc_layer_params=fc_layers,
                    preprocessing_combiner=NestConcatenate(axis=-1))
            else:
                model = EncodingNetwork(
                    name="MIEstimator",
                    input_tensor_spec=[x_spec, y_spec],
                    fc_layer_params=fc_layers,
                    last_layer_size=1)
        self._model = model
        self._type = estimator_type
        if sampler == 'buffer':
            self._y_buffer = DataBuffer(y_spec, capacity=buffer_size)
            self._sampler = self._buffer_sampler
        elif sampler == 'double_buffer':
            self._x_buffer = DataBuffer(x_spec, capacity=buffer_size)
            self._y_buffer = DataBuffer(y_spec, capacity=buffer_size)
            self._sampler = self._double_buffer_sampler
        elif sampler == 'shuffle':
            self._sampler = self._shuffle_sampler
        elif sampler == 'shift':
            self._sampler = self._shift_sampler
        else:
            raise TypeError("Wrong type for sampler %s" % sampler)

        if estimator_type == 'DV':
            self._mean_averager = averager
        if estimator_type == 'ML':
            assert isinstance(
                y_spec,
                tf.TensorSpec), ("Currently, 'ML' does "
                                 "not support nested y_spec: %s" % y_spec)
            assert tensor_spec.is_continuous(y_spec), (
                "Currently, 'ML' does "
                "not support discreted y_spec: %s" % y_spec)
            self._delta_loc_layer = tf.keras.layers.Dense(
                y_spec.shape[-1],
                kernel_initializer=tf.initializers.Zeros(),
                bias_initializer=tf.initializers.Zeros(),
                name='delta_loc_layer')
            self._delta_scale_layer = tf.keras.layers.Dense(
                y_spec.shape[-1],
                kernel_initializer=tf.initializers.Zeros(),
                bias_initializer=tf.keras.initializers.Constant(
                    value=math.log(math.e - 1)),
                name='delta_scale_layer')
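A hypothetical construction sketch (the specs and learning rate are illustrative; it only uses arguments whose handling is visible in the __init__ above):

import tensorflow as tf

x_spec = tf.TensorSpec((16,), tf.float32)
y_spec = tf.TensorSpec((8,), tf.float32)
estimator = MIEstimator(
    x_spec, y_spec,
    fc_layers=(256,),
    sampler='buffer',
    estimator_type='DV',
    optimizer=tf.optimizers.Adam(learning_rate=1e-4))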