Example #1
def _generate_time_step(batched,
                        observation,
                        step_type,
                        discount,
                        prev_action=None,
                        action_spec=None,
                        reward=None,
                        reward_spec=ts.TensorSpec(()),
                        env_id=None,
                        env_info={}):

    flat_observation = nest.flatten(observation)

    if all(map(_is_numpy_array, flat_observation)):
        md = np
        if reward is not None:
            reward = np.float32(reward)
        discount = np.float32(discount)
    else:
        assert all(
            map(torch.is_tensor,
                flat_observation)), ("Elements in observation must be Tensor")
        md = torch
        if reward is not None:
            reward = to_tensor(reward, dtype=torch.float32)
        discount = to_tensor(discount, dtype=torch.float32)

    if batched:
        batch_size = flat_observation[0].shape[0]
        outer_dims = (batch_size, )
        if env_id is None:
            env_id = md.arange(batch_size, dtype=md.int32)
        if reward is not None:
            assert reward.shape[:1] == outer_dims
        if prev_action is not None:
            flat_action = nest.flatten(prev_action)
            assert flat_action[0].shape[:1] == outer_dims
    else:
        outer_dims = ()
        if env_id is None:
            env_id = md.zeros((), dtype=md.int32)

    step_type = md.full(outer_dims, step_type, dtype=md.int32)
    if reward is None:
        reward = md.zeros(outer_dims + reward_spec.shape, dtype=md.float32)
    discount = md.ones(outer_dims, dtype=md.float32) * discount
    if prev_action is None:
        prev_action = nest.map_structure(
            lambda spec: md.zeros(outer_dims + spec.shape,
                                  dtype=getattr(
                                      md, ts.torch_dtype_to_str(spec.dtype))),
            action_spec)

    return TimeStep(step_type,
                    reward,
                    discount,
                    observation,
                    prev_action,
                    env_id,
                    env_info=env_info)
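For orientation, a minimal sketch of how this helper might be invoked with NumPy inputs (the observation value and action spec below are hypothetical, and ``StepType`` is assumed to be in scope):

obs = np.zeros(3, dtype=np.float32)
time_step = _generate_time_step(
    batched=False,
    observation=obs,
    step_type=StepType.MID,
    discount=1.0,
    action_spec=ts.TensorSpec((), torch.int32),
    reward=0.5)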
Example #2
 def decode_step(self, latent_vector, observations):
     """Calculate decoding loss."""
     decoders = flatten(self._decoders)
     observations = flatten(observations)
     decoder_losses = [
         decoder.train_step((latent_vector, obs)).info
         for decoder, obs in zip(decoders, observations)
     ]
     loss = math_ops.add_n(
         [decoder_loss.loss for decoder_loss in decoder_losses])
     decoder_losses = alf.nest.pack_sequence_as(self._decoders,
                                                decoder_losses)
     return LossInfo(loss=loss, extra=decoder_losses)
Example #3
def time_step_spec(observation_spec, action_spec, reward_spec):
    """Returns a ``TimeStep`` spec given the ``observation_spec`` and the
    ``action_spec``.
    """
    def is_valid_tensor_spec(spec):
        return isinstance(spec, ts.TensorSpec)

    assert all(map(is_valid_tensor_spec, nest.flatten(observation_spec)))
    assert all(map(is_valid_tensor_spec, nest.flatten(action_spec)))
    return TimeStep(step_type=ts.TensorSpec([], torch.int32),
                    reward=reward_spec,
                    discount=ts.BoundedTensorSpec([],
                                                  torch.float32,
                                                  minimum=0.0,
                                                  maximum=1.0),
                    observation=observation_spec,
                    prev_action=action_spec,
                    env_id=ts.TensorSpec([], torch.int32))
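A small usage sketch (the specs below are hypothetical):

obs_spec = ts.TensorSpec((4, ), torch.float32)
act_spec = ts.BoundedTensorSpec((), torch.int64, minimum=0, maximum=1)
spec = time_step_spec(obs_spec, act_spec, reward_spec=ts.TensorSpec(()))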
Example #4
    def _worker(self, conn, env_constructor, env_id=None, flatten=False):
        """The process waits for actions and sends back environment results.

        Args:
            conn (multiprocessing.connection): Connection for communication to the main process.
            env_constructor (Callable): callable environment creator.
            env_id (int): (optional) ID of the environment, passed through to
                ``env_constructor``.
            flatten (bool): whether to assume flattened actions and time_steps
                during communication to avoid overhead.

        Raises:
            KeyError: When receiving a message of unknown type.
        """
        try:
            alf.set_default_device("cpu")
            env = env_constructor(env_id)
            action_spec = env.action_spec()
            conn.send(self._READY)  # Ready.
            while True:
                try:
                    # Block only briefly so that KeyboardInterrupt can be raised promptly.
                    if not conn.poll(0.1):
                        continue
                    message, payload = conn.recv()
                except (EOFError, KeyboardInterrupt):
                    break
                if message == self._ACCESS:
                    name = payload
                    result = getattr(env, name)
                    conn.send((self._RESULT, result))
                    continue
                if message == self._CALL:
                    name, args, kwargs = payload
                    if flatten and name == 'step':
                        args = [nest.pack_sequence_as(action_spec, args[0])]
                    result = getattr(env, name)(*args, **kwargs)
                    if flatten and name in ['step', 'reset']:
                        result = nest.flatten(result)
                        assert all([
                            not isinstance(x, torch.Tensor) for x in result
                        ]), ("Tensor result is not allowed: %s" % name)
                    conn.send((self._RESULT, result))
                    continue
                if message == self._CLOSE:
                    assert payload is None
                    env.close()
                    break
                raise KeyError(
                    'Received message of unknown type {}'.format(message))
        except Exception:  # pylint: disable=broad-except
            etype, evalue, tb = sys.exc_info()
            stacktrace = ''.join(traceback.format_exception(etype, evalue, tb))
            message = 'Error in environment process: {}'.format(stacktrace)
            logging.error(message)
            conn.send((self._EXCEPTION, stacktrace))
        finally:
            conn.close()
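The parent-process side is not shown here; below is a hedged sketch of what a ``step`` request could look like, based only on the message format the worker expects above (``conn`` is the parent's end of the pipe, and ``self._CALL``/``self._EXCEPTION`` are the same class-level message tags used by the worker):

conn.send((self._CALL, ('step', [action], {})))
message, payload = conn.recv()
if message == self._EXCEPTION:
    raise RuntimeError(payload)  # payload is the worker's stacktrace
result = payload  # a flattened time_step if flatten=True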
Example #5
def rsample_action_distribution(nested_distributions):
    """Sample actions from distributions with reparameterization-based sampling
        (rsample) to enable backpropagation.
    Args:
        nested_distributions (nested Distribution): action distributions.
    Returns:
        rsampled actions
    """
    assert all(nest.flatten(nest.map_structure(lambda d: d.has_rsample,
                nested_distributions))), \
            ("all the distributions need to support rsample in order to enable "
            "backpropagation")
    return nest.map_structure(lambda d: d.rsample(), nested_distributions)
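A small usage example with a distribution that implements ``rsample`` (``torch.distributions.Normal`` does):

import torch
import torch.distributions as td

loc = torch.zeros(3, requires_grad=True)
dist = td.Normal(loc, torch.ones(3))
action = rsample_action_distribution(dist)  # gradients flow back to loc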
Example #6
 def _unstack_actions(self, batched_actions):
     """Returns a list of actions from potentially nested batch of actions."""
     batched_actions = nest.map_structure(lambda x: x.cpu(),
                                          batched_actions)
     flattened_actions = nest.flatten(batched_actions)
     if self._flatten:
         unstacked_actions = zip(*flattened_actions)
     else:
         unstacked_actions = [
             nest.pack_sequence_as(batched_actions, actions)
             for actions in zip(*flattened_actions)
         ]
     return unstacked_actions
Example #7
def entropy_with_fallback(distributions):
    r"""Computes total entropy of nested distribution.
    If ``entropy()`` of a distribution is not implemented, this function will
    fall back to sampling to calculate the entropy. It returns two values:
    ``(entropy, entropy_for_gradient)``.

    There are two situations:

    - ``entropy()`` is implemented, and it is the same as ``entropy_for_gradient``.
    - ``entropy()`` is not implemented. We use sampling to calculate entropy. The
      unbiased estimator for entropy is :math:`-\log(p(x))`. However, the gradient
      of :math:`-\log(p(x))` is not an unbiased estimator of the gradient of
      entropy. So we also calculate a value whose gradient is an unbiased
      estimator of the gradient of entropy. See ``estimated_entropy()`` for detail.

    Examples:

    .. code-block:: python

        ent, ent_for_grad = entropy_with_fallback(dist)
        alf.summary.scalar("entropy", ent)
        ent_for_grad.backward()

    Args:
        distributions (nested Distribution): A possibly batched tuple of
            distributions.

    Returns:
        tuple:
        - entropy
        - entropy_for_gradient: You should use ``entropy`` in situations where its
          value is needed, and ``entropy_for_gradient`` where you need to calculate the
          gradient of entropy.
    """

    def _compute_entropy(dist: td.Distribution):
        if isinstance(dist, td.TransformedDistribution):
            # TransformedDistribution is used by NormalProjectionNetwork with
            # scale_distribution=True, in which case we estimate with sampling.
            entropy, entropy_for_gradient = estimated_entropy(dist)
        else:
            entropy = dist.entropy()
            entropy_for_gradient = entropy
        return entropy, entropy_for_gradient

    entropies = list(map(_compute_entropy, nest.flatten(distributions)))
    entropies, entropies_for_gradient = zip(*entropies)

    return sum(entropies), sum(entropies_for_gradient)
Example #8
def compute_entropy(distributions):
    """Computes total entropy of nested distribution.
    Args:
        distributions (nested Distribution): A possibly batched tuple of
            distributions.
    Returns:
        entropy
    """
    def _compute_entropy(dist: td.Distribution):
        entropy = dist.entropy()
        return entropy

    entropies = nest.map_structure(_compute_entropy, distributions)
    total_entropies = sum(nest.flatten(entropies))
    return total_entropies
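A small usage sketch with a batched categorical distribution:

import torch
import torch.distributions as td

dist = td.Categorical(logits=torch.zeros(4, 2))
entropy = compute_entropy(dist)  # shape [4]; each entry is log(2)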
Example #9
def compute_log_probability(distributions, actions):
    """Computes log probability of actions given distribution.

    Args:
        distributions: A possibly batched tuple of distributions.
        actions: A possibly batched action tuple.

    Returns:
        Tensor: the log probability summed over actions in the batch.
    """
    def _compute_log_prob(single_distribution, single_action):
        single_log_prob = single_distribution.log_prob(single_action)
        return single_log_prob

    nest.assert_same_structure(distributions, actions)
    log_probs = nest.map_structure(_compute_log_prob, distributions, actions)
    total_log_probs = sum(nest.flatten(log_probs))
    return total_log_probs
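A small usage sketch with a nested (tuple) distribution; the per-component log probabilities are summed element-wise:

import torch
import torch.distributions as td

dists = (td.Normal(torch.zeros(5), torch.ones(5)),
         td.Categorical(logits=torch.zeros(5, 3)))
actions = (torch.zeros(5), torch.zeros(5, dtype=torch.int64))
log_probs = compute_log_probability(dists, actions)  # shape [5]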
Example #10
    def _create_projection_net(self, discrete_projection_net_ctor,
                               continuous_projection_net_ctor):
        """If there are :math:`N` action specs, then create :math:`N` projection
        networks which can be a mixture of categoricals and normals.
        """
        def _create(spec):
            if spec.is_discrete:
                net = discrete_projection_net_ctor(
                    input_size=self._encoding_net.output_spec.shape[0],
                    action_spec=spec)
            else:
                net = continuous_projection_net_ctor(
                    input_size=self._encoding_net.output_spec.shape[0],
                    action_spec=spec)
            return net

        self._projection_net = nest.map_structure(_create, self._action_spec)
        if nest.is_nested(self._projection_net):
            # need this for torch to pick up the parameters of all the modules
            self._projection_net_module_list = nn.ModuleList(
                nest.flatten(self._projection_net))
Example #11
def _check_action_specs_for_critic_networks(action_spec,
                                            action_input_processors,
                                            action_preprocessing_combiner):

    if len(nest.flatten(action_spec)) > 1:
        assert action_preprocessing_combiner is not None, (
            "An action combiner is needed when there are multiple action specs:"
            " {}".format(action_spec))

    def _check_individual(spec, proc):
        if spec.is_discrete:
            assert proc is not None, (
                'CriticNetwork only supports continuous actions. One of given '
                + 'action specs {} is discrete. Use QNetwork instead. '.format(
                    spec) +
                'Alternatively, specify `action_input_processors` to transform '
                + 'discrete actions to continuous action embeddings first.')

    if action_input_processors is None:
        action_input_processors = nest.map_structure(lambda _: None,
                                                     action_spec)

    nest.map_structure(_check_individual, action_spec, action_input_processors)
Example #12
    def __init__(self,
                 input_tensor_spec: TensorSpec,
                 action_spec: BoundedTensorSpec,
                 input_preprocessors=None,
                 preprocessing_combiner=None,
                 conv_layer_params=None,
                 fc_layer_params=None,
                 activation=torch.relu_,
                 kernel_initializer=None,
                 name="QNetwork"):
        """Creates an instance of ``QNetwork`` for estimating action-value of
        discrete actions. The action-value is defined as the expected return
        starting from the given input observation and taking the given action.
        It takes observation as input and outputs an action-value tensor with
        the shape of ``[batch_size, num_of_actions]``.

        Args:
            input_tensor_spec (TensorSpec): the tensor spec of the input
            action_spec (TensorSpec): the tensor spec of the action
            input_preprocessors (nested InputPreprocessor): a nest of
                ``InputPreprocessor``, each of which will be applied to the
                corresponding input. If not None, then it must
                have the same structure as ``input_tensor_spec`` (after reshaping).
                If any element is None, then it will be treated as ``math_ops.identity``.
                This arg is helpful if you want to have separate preprocessings
                for different inputs by configuring a gin file without changing
                the code. For example, embedding a discrete input before concatenating
                it to another continuous vector.
            preprocessing_combiner (NestCombiner): preprocessing called on
                complex inputs. Note that this combiner must also accept
                ``input_tensor_spec`` as the input to compute the processed
                tensor spec. For example, see ``alf.nest.utils.NestConcat``. This
                arg is helpful if you want to combine inputs by configuring a
                gin file without changing the code.
            conv_layer_params (tuple[tuple]): a tuple of tuples where each
                tuple takes a format ``(filters, kernel_size, strides, padding)``,
                where ``padding`` is optional.
            fc_layer_params (tuple[int]): a tuple of integers representing hidden
                FC layer sizes.
            activation (nn.functional): activation used for hidden layers. The
                last layer will not be activated.
            kernel_initializer (Callable): initializer for all the layers but
                the last layer. If none is provided a default ``variance_scaling_initializer``
                will be used.
        """
        super(QNetwork, self).__init__(input_tensor_spec, name=name)

        assert len(nest.flatten(action_spec)) == 1, (
            "Currently only support a single discrete action! Use "
            "CriticNetwork instead for multiple actions.")

        num_actions = action_spec.maximum - action_spec.minimum + 1
        self._output_spec = TensorSpec((num_actions, ))

        self._encoding_net = EncodingNetwork(
            input_tensor_spec=input_tensor_spec,
            input_preprocessors=input_preprocessors,
            preprocessing_combiner=preprocessing_combiner,
            conv_layer_params=conv_layer_params,
            fc_layer_params=fc_layer_params,
            activation=activation,
            kernel_initializer=kernel_initializer)

        last_kernel_initializer = functools.partial(
            torch.nn.init.uniform_, a=-0.003, b=0.003)

        self._final_layer = layers.FC(
            self._encoding_net.output_spec.shape[0],
            num_actions,
            activation=math_ops.identity,
            kernel_initializer=last_kernel_initializer,
            bias_init_value=-0.2)
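A hedged usage sketch (the specs are hypothetical, and it assumes the ALF convention that a network's forward pass returns an ``(output, state)`` tuple):

obs_spec = TensorSpec((10, ))
act_spec = BoundedTensorSpec((), torch.int64, minimum=0, maximum=3)
q_net = QNetwork(obs_spec, act_spec, fc_layer_params=(64, 64))
q_values, _ = q_net(obs_spec.zeros(outer_dims=(32, )))  # [32, 4]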
Example #13
    def __init__(self,
                 input_tensor_spec,
                 action_qt: ActionQuantizer = None,
                 num_critic_replicas=2,
                 obs_encoding_layer_params=None,
                 pre_encoding_layer_params=None,
                 mid_encoding_layer_params=None,
                 post_encoding_layer_params=None,
                 free_form_fc_layer_params=None,
                 activation=torch.relu_,
                 kernel_initializer=None,
                 debug_summaries=False,
                 name="MdqCriticNetwork"):
        """Creates an instance of `MdqCriticNetwork` for estimating action-value
        of continuous actions and action sampling.

        Currently there are two branches of networks:
            - free-form branch: a plain MLP for Q-learning
            - adv-form branch: an advantage form of network for action
                generation. It is trained by a target from the free-form net.

        The adv-form branch has the following structures for flexibility:
            obs -> [obs_encoding_net] -> encoded_obs
            encoded_obs, action ->
                                [pre_encoding_nets] ->
                                [mid_shared_encoding_nets] ->
                                [post_encoding_nets] -> outputs
            where the pre_encoding_nets and post_encoding_nets do not share
            parameters across action dimensions while mid_shared_encoding_nets
            shares parameters across action dimensions.
            If the encoding_layer_params for a sub-net is None, that sub-net is
            effectively neglected.

        Furthermore, to enable parallel computation across action dimensions
        for value computation, we have both parallel and individual
        versions of the nets without parameter sharing. For example, for
        post_encoding_nets, we also have post_encoding_parallel_net, which is
        essentially the equivalent form of post_encoding_nets but supports
        parallel forwarding. The parameters of the two versions are synced.
        The partial actions (a[0:i]) are zero-padded for both parallel and
        individual networks to enable parallel computation.


        For conciseness purpose, the following notations will be used when
        convenient:
            - B: batch size
            - d: dimensionality of feature
            - n: number of network replica
            - action_dim: the dimensionality of actions
            - action_bin: number of discrete bins for each action dim

        Args:
            input_tensor_spec: A tuple of TensorSpecs (observation_spec, action_spec)
                representing the inputs.
            action_qt (ActionQuantizer): action quantization module
            num_critic_replicas (int): number of critic networks
            obs_encoding_layer_params (tuple[int]): a tuple of integers
                representing hidden FC layer sizes for encoding observations.
            pre_encoding_layer_params (tuple[int]): a tuple of integers
                representing hidden FC layer sizes for encoding concatenated
                [encoded_observation, actions]. Parameters are not shared across
                action dimensions
            mid_encoding_layer_params (tuple[int]): a tuple of integers
                representing hidden FC layer sizes for further encoding the
                outputs from pre_encoding_net. The parameters are shared across
                action dimensions.
            post_encoding_layer_params (tuple[int]): a tuple of integers
                representing hidden FC layer sizes for further encoding the
                outputs from mid_encoding_net. The parameters are not shared
                across action dimensions.
            free_form_fc_layer_params (tuple[int]): a tuple of integers
                representing hidden FC layer sizes for Q-learning. We refer to
                it as the free form to differentiate it from the mdq-form of
                network, which is structured.
            activation (nn.functional): activation used for hidden layers. The
                last layer will not be activated.
            kernel_initializer (Callable): initializer for all the layers but
                the last layer. If none is provided a variance_scaling_initializer
                with uniform distribution will be used.
            name (str): name of this network.
        """

        super().__init__(input_tensor_spec, name=name)

        observation_spec, action_spec = input_tensor_spec

        flat_action_spec = nest.flatten(action_spec)
        if len(flat_action_spec) > 1:
            raise ValueError(
                'Only a single action is supported by this network')

        self._single_action_spec = flat_action_spec[0]

        if action_qt is None:
            action_qt = ActionQuantizer(action_spec, "uniform", 15)
        self._action_qt = action_qt
        self._action_bins = self._action_qt._action_bins

        # the logpi of the uniform prior used for KL computation
        self._log_pi_uniform_prior = -np.log(self._action_bins)

        self._action_dim = action_spec.shape[0]  # control vector dim
        self._num_critic_replicas = num_critic_replicas

        self._obs_encoding_net = ParallelEncodingNetwork(
            observation_spec,
            self._num_critic_replicas,
            fc_layer_params=obs_encoding_layer_params,
            activation=activation,
            kernel_initializer=kernel_initializer)

        last_activation = math_ops.identity
        last_kernel_initializer = functools.partial(
            torch.nn.init.uniform_, a=-0.003, b=0.003)

        in_size = self._action_dim

        self._pre_encoding_nets = []
        for i in range(self._action_dim):
            # output_spec.shape: [n, d]
            self._pre_encoding_nets.append(
                ParallelEncodingNetwork(
                    TensorSpec((self._obs_encoding_net.output_spec.shape[-1] +
                                in_size, )),
                    self._num_critic_replicas,
                    fc_layer_params=pre_encoding_layer_params,
                    activation=activation,
                    kernel_initializer=kernel_initializer))

        # parallel along both critic and action dims without sharing parameters
        # for each action dimension.
        # input: [B, action_dim*n, d]: need to stack over dim1
        # output: [B, action_dim*n, d']: need to unstack over dim1 for
        # splitting over networks
        self._pre_encoding_parallel_net = ParallelEncodingNetwork(
            TensorSpec(
                (self._obs_encoding_net.output_spec.shape[-1] + in_size, )),
            self._num_critic_replicas * self._action_dim,
            fc_layer_params=pre_encoding_layer_params,
            activation=activation,
            kernel_initializer=kernel_initializer)

        # parallel along both critic and action dims with sharing parameters
        # for each action dimension.
        # input: [action_dim*B, n, d]: need to stack over dim0
        # output: [action_dim*B, n, d']: need to unstack over dim0 for
        # splitting over networks
        self._mid_shared_encoding_nets = ParallelEncodingNetwork(
            TensorSpec(
                (self._pre_encoding_parallel_net.output_spec.shape[-1], )),
            self._num_critic_replicas,
            fc_layer_params=mid_encoding_layer_params,
            activation=activation,
            kernel_initializer=kernel_initializer)
        out_size = self._mid_shared_encoding_nets.output_spec.shape[-1]

        post_enc_out_size = self._action_qt.action_bins

        self._post_encoding_nets = []
        for i in range(self._action_dim):
            self._post_encoding_nets.append(
                ParallelEncodingNetwork(
                    TensorSpec((out_size, )),
                    self._num_critic_replicas,
                    fc_layer_params=post_encoding_layer_params,
                    activation=activation,
                    kernel_initializer=kernel_initializer,
                    last_layer_size=post_enc_out_size,
                    last_activation=last_activation,
                    last_kernel_initializer=last_kernel_initializer))

        # parallel along both critic and action dims without sharing parameters
        # for each action dimension.
        # input: [B, action_dim*n, d]: need to stack over dim1
        # output: [B, action_dim*n, d']: need to unstack over dim1 for
        # splitting over networks
        self._post_encoding_parallel_net = ParallelEncodingNetwork(
            TensorSpec((out_size, )),
            self._num_critic_replicas * self._action_dim,
            fc_layer_params=post_encoding_layer_params,
            activation=activation,
            kernel_initializer=kernel_initializer,
            last_layer_size=post_enc_out_size,
            last_activation=last_activation,
            last_kernel_initializer=last_kernel_initializer)

        assert free_form_fc_layer_params is not None

        self._free_form_q_net = ParallelEncodingNetwork(
            TensorSpec((observation_spec.shape[-1] + self._action_dim, )),
            self._num_critic_replicas,
            fc_layer_params=free_form_fc_layer_params,
            activation=activation,
            kernel_initializer=kernel_initializer,
            last_layer_size=1,
            last_activation=math_ops.identity,
            last_kernel_initializer=last_kernel_initializer)

        MdqCriticNetwork._parallel_to_individual_network_sync(
            self._pre_encoding_parallel_net,
            self._pre_encoding_nets,
            step=self._num_critic_replicas)

        MdqCriticNetwork._parallel_to_individual_network_sync(
            self._post_encoding_parallel_net,
            self._post_encoding_nets,
            step=self._num_critic_replicas)

        self._output_spec = TensorSpec(())

        self._debug_summaries = debug_summaries
Example #14
    def __init__(self,
                 input_tensor_spec: TensorSpec,
                 action_spec: BoundedTensorSpec,
                 input_preprocessors=None,
                 preprocessing_combiner=None,
                 conv_layer_params=None,
                 fc_layer_params=None,
                 activation=torch.relu_,
                 squashing_func=torch.tanh,
                 kernel_initializer=None,
                 name="ActorNetwork"):
        """Creates an instance of ``ActorNetwork``, which maps the inputs to
        actions (single or nested) through a sequence of deterministic layers.

        Args:
            input_tensor_spec (TensorSpec): the tensor spec of the input.
            action_spec (BoundedTensorSpec): the tensor spec of the action.
            input_preprocessors (nested InputPreprocessor): a nest of
                ``InputPreprocessor``, each of which will be applied to the
                corresponding input. If not None, then it must
                have the same structure as ``input_tensor_spec`` (after reshaping).
                If any element is None, then it will be treated as ``math_ops.identity``.
                This arg is helpful if you want to have separate preprocessings
                for different inputs by configuring a gin file without changing
                the code. For example, embedding a discrete input before concatenating
                it to another continuous vector.
            preprocessing_combiner (NestCombiner): preprocessing called on
                complex inputs. Note that this combiner must also accept
                ``input_tensor_spec`` as the input to compute the processed
                tensor spec. For example, see ``alf.nest.utils.NestConcat``. This
                arg is helpful if you want to combine inputs by configuring a
                gin file without changing the code.
            conv_layer_params (tuple[tuple]): a tuple of tuples where each
                tuple takes a format ``(filters, kernel_size, strides, padding)``,
                where ``padding`` is optional.
            fc_layer_params (tuple[int]): a tuple of integers representing hidden
                FC layer sizes.
            activation (nn.functional): activation used for hidden layers. The
                last layer will not be activated.
            squashing_func (Callable): the activation function used to squash
                the output to the range :math:`(-1, 1)`. Defaults to ``tanh``.
            kernel_initializer (Callable): initializer for all the layers but
                the last layer. If none is provided a ``variance_scaling_initializer``
                with uniform distribution will be used.
            name (str): name of the network
        """
        super(ActorNetwork, self).__init__(input_tensor_spec,
                                           input_preprocessors,
                                           preprocessing_combiner,
                                           name=name)

        if kernel_initializer is None:
            kernel_initializer = functools.partial(variance_scaling_init,
                                                   gain=math.sqrt(1.0 / 3),
                                                   mode='fan_in',
                                                   distribution='uniform')

        self._action_spec = action_spec
        flat_action_spec = nest.flatten(action_spec)
        self._flat_action_spec = flat_action_spec

        is_continuous = [
            single_action_spec.is_continuous
            for single_action_spec in flat_action_spec
        ]

        assert all(is_continuous), "only continuous action is supported"

        self._encoding_net = EncodingNetwork(
            input_tensor_spec=self._processed_input_tensor_spec,
            conv_layer_params=conv_layer_params,
            fc_layer_params=fc_layer_params,
            activation=activation,
            kernel_initializer=kernel_initializer,
            name=self.name + ".encoding_net")

        last_kernel_initializer = functools.partial(
            torch.nn.init.uniform_, a=-0.003, b=0.003)
        self._action_layers = nn.ModuleList()
        self._squashing_func = squashing_func
        for single_action_spec in flat_action_spec:
            self._action_layers.append(
                layers.FC(self._encoding_net.output_spec.shape[0],
                          single_action_spec.shape[0],
                          kernel_initializer=last_kernel_initializer))
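A hedged usage sketch (specs hypothetical; assumes the forward pass returns an ``(action, state)`` tuple following the same network convention):

obs_spec = TensorSpec((10, ))
act_spec = BoundedTensorSpec((2, ), minimum=-1.0, maximum=1.0)
actor = ActorNetwork(obs_spec, act_spec, fc_layer_params=(64, 64))
action, _ = actor(obs_spec.zeros(outer_dims=(32, )))  # [32, 2], in (-1, 1)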
Example #15
def termination(observation, prev_action, reward, env_id=None, env_info={}):
    """Returns a ``TimeStep`` with ``step_type`` set to ``StepType.LAST``.

    Called by ``env.step()`` if the episode is 'Done'. ``discount`` should not
    be passed in; it will be set to 0.

    Args:
        observation (nested tensors): current observations of the env.
        prev_action (nested tensors): previous actions to the env.
        reward (float): A scalar, or 1D NumPy array, or tensor.
        env_id (torch.int32): (optional) A scalar or 1D tensor of the environment
            ID(s).
        env_info (dict): extra info returned by the environment.

    Returns:
        TimeStep:

    Raises:
        ValueError: If observations are tensors but reward's statically known rank
            is not 0 or 1.
    """
    flat_observation = nest.flatten(observation)
    if all(map(_is_numpy_array, flat_observation)):
        reward = np.float32(reward)
        if env_id is None:
            env_id = np.int32(0)
        step_type = StepType.LAST
        discount = np.float32(0.0)
        return TimeStep(step_type,
                        reward,
                        discount,
                        observation,
                        prev_action,
                        env_id,
                        env_info=env_info)
    else:
        assert all(
            map(torch.is_tensor,
                flat_observation)), ("Elements in observation must be Tensor")

        reward = to_tensor(reward, dtype=torch.float32)
        assert reward.dim() <= 1, "Expected reward to be a scalar or vector."
        if reward.dim() == 0:
            shape = []
            if env_id is None:
                env_id = torch.tensor(0, dtype=torch.int32)
        else:
            flat_action = nest.flatten(prev_action)
            assert flat_observation[0].shape[:1] == reward.shape
            assert flat_action[0].shape[:1] == reward.shape
            shape = reward.shape
            env_id = torch.arange(shape[0], dtype=torch.int32)
        step_type = torch.full(shape, StepType.LAST, dtype=torch.int32)
        discount = torch.full(shape, 0.0, dtype=torch.float32)
        return TimeStep(step_type,
                        reward,
                        discount,
                        observation,
                        prev_action,
                        env_id,
                        env_info=env_info)
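A small sketch with NumPy inputs (the values are hypothetical):

obs = np.zeros(3, dtype=np.float32)
prev_action = np.int64(1)
step = termination(obs, prev_action, reward=1.0)
# step.step_type == StepType.LAST and step.discount == 0.0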
Example #16
def transition(observation,
               prev_action,
               reward,
               discount=1.0,
               env_id=None,
               env_info={}):
    """Returns a ``TimeStep`` with ``step_type`` set equal to ``StepType.MID``.

    Called by ``env.step()`` if the episode is not 'Done'.

    The batch size is inferred from the shape of ``reward``.

    If ``discount`` is a scalar, and ``observation`` contains tensors,
    then ``discount`` will be broadcasted to match ``reward.shape``.

    Args:
        observation (nested tensors): current observations of the env.
        prev_action (nested tensors): previous actions to the env.
        reward (float): A scalar, or 1D NumPy array, or tensor.
        discount (float): (optional) A scalar, or 1D NumPy array, or tensor.
        env_id (torch.int32): (optional) A scalar or 1D tensor of the environment
            ID(s).
        env_info (dict): extra info returned by the environment.

    Returns:
        TimeStep:

    Raises:
        ValueError: If observations are tensors but reward's rank
            is not 0 or 1.
    """
    flat_observation = nest.flatten(observation)
    if all(map(_is_numpy_array, flat_observation)):
        reward = np.float32(reward)
        if env_id is None:
            env_id = np.int32(0)
        step_type = StepType.MID
        discount = np.float32(discount)
        return TimeStep(step_type,
                        reward,
                        discount,
                        observation,
                        prev_action,
                        env_id,
                        env_info=env_info)
    else:
        assert all(
            map(torch.is_tensor,
                flat_observation)), ("Elements in observation must be Tensor")

        # TODO: If reward.shape.rank == 2, and static
        # batch sizes are available for both flat_observation and reward,
        # check that these match.
        reward = to_tensor(reward, dtype=torch.float32)
        assert reward.dim() <= 1, "Expected reward to be a scalar or vector."
        if reward.dim() == 0:
            shape = []
            if env_id is None:
                env_id = torch.tensor(0, dtype=torch.int32)
        else:
            flat_action = nest.flatten(prev_action)
            assert flat_observation[0].shape[:1] == reward.shape
            assert flat_action[0].shape[:1] == reward.shape
            shape = reward.shape
            env_id = torch.arange(shape[0], dtype=torch.int32)
        step_type = torch.full(shape, StepType.MID, dtype=torch.int32)
        discount = to_tensor(discount, dtype=torch.float32)

        if discount.dim() == 0:
            discount = torch.full(shape, discount, dtype=torch.float32)
        else:
            assert reward.shape == discount.shape
        return TimeStep(step_type,
                        reward,
                        discount,
                        observation,
                        prev_action,
                        env_id,
                        env_info=env_info)
Example #17
def restart(observation, action_spec, env_id=None, env_info={}, batched=False):
    """Returns a ``TimeStep`` with ``step_type`` set equal to ``StepType.FIRST``.

    Called by ``env.reset()``.

    Args:
        observation (nested tensors): observations of the env.
        action_spec (nested TensorSpec): tensor spec of actions.
        env_id (batched or scalar torch.int32): (optional) ID of the env.
        env_info (dict): extra info returned by the environment.
        batched (bool): (optional) whether batched envs or not.

    Returns:
        TimeStep:
    """

    first_observation = nest.flatten(observation)

    if all(map(_is_numpy_array, first_observation)):
        step_type = StepType.FIRST
        if batched:
            batch_size = first_observation[0].shape[0]
            reward = np.zeros((batch_size, ), dtype=np.float32)
            discount = np.ones((batch_size, ), dtype=np.float32)
            prev_action = nest.map_structure(
                lambda spec: spec.numpy_zeros(outer_dims=(batch_size, )),
                action_spec)
            if env_id is None:
                env_id = np.arange(batch_size, dtype=np.int32)
        else:
            reward = np.float32(0.0)
            discount = np.float32(1.0)
            prev_action = nest.map_structure(lambda spec: spec.numpy_zeros(),
                                             action_spec)
            if env_id is None:
                env_id = np.int32(0)
        return TimeStep(step_type,
                        reward,
                        discount,
                        observation,
                        prev_action,
                        env_id,
                        env_info=env_info)
    else:
        assert all(
            map(torch.is_tensor,
                first_observation)), ("Elements in observation must be Tensor")

        # TODO: Check leading dimension of first_observation
        # against batch_size if all are known statically.
        if batched:
            batch_size = first_observation[0].shape[0]
            step_type = torch.full((batch_size, ),
                                   StepType.FIRST,
                                   dtype=torch.int32)
            reward = torch.full((batch_size, ), 0.0, dtype=torch.float32)
            discount = torch.full((batch_size, ), 1.0, dtype=torch.float32)
            prev_action = nest.map_structure(
                lambda spec: spec.zeros(outer_dims=(batch_size, )),
                action_spec)
            if env_id is None:
                env_id = torch.arange(batch_size, dtype=torch.int32)
        else:
            step_type = torch.full((), StepType.FIRST, dtype=torch.int32)
            reward = torch.tensor(0.0, dtype=torch.float32)
            discount = torch.tensor(1.0, dtype=torch.float32)
            prev_action = nest.map_structure(lambda spec: spec.zeros(),
                                             action_spec)
            if env_id is None:
                env_id = torch.tensor(0, dtype=torch.int32)
        return TimeStep(step_type,
                        reward,
                        discount,
                        observation,
                        prev_action,
                        env_id,
                        env_info=env_info)
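A small sketch with tensor observations (the action spec is hypothetical):

obs = torch.zeros(3)
act_spec = ts.BoundedTensorSpec((), torch.int64, minimum=0, maximum=1)
step = restart(obs, act_spec)
# step.step_type == StepType.FIRST and step.discount == 1.0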
Example #18
    def __init__(self,
                 input_tensor_spec,
                 output_tensor_spec,
                 joint_fc_layer_params=None,
                 activation=torch.relu_,
                 kernel_initializer=None,
                 prob=False,
                 continuous_projection_net_ctor=NormalProjectionNetwork,
                 name="DynamicsNetwork"):
        """Creates an instance of `DynamicsNetwork` for predicting the next
        observation given current observation and action.

        Args:
            input_tensor_spec: A tuple of TensorSpecs (observation_spec, action_spec)
                representing the inputs.
            output_tensor_spec (TensorSpec): the tensor spec of the predicted
                output (e.g., the next observation).
            joint_fc_layer_params (tuple[int]): a tuple of integers representing
                hidden FC layer sizes for the layers after merging observations
                and actions.
            activation (nn.functional): activation used for hidden layers. The
                last layer will not be activated.
            kernel_initializer (Callable): initializer for all the layers but
                the last layer. If none is provided a variance_scaling_initializer
                with uniform distribution will be used.
            prob (bool): If True, use the probabilistic mode of the network;
                otherwise, use the deterministic mode.
            continuous_projection_net_ctor (ProjectionNetwork): constructor that
                generates a continuous projection network that outputs
                a distribution.
            name (str): name of this network.
        """
        super().__init__(input_tensor_spec, name=name)

        observation_spec, action_spec = input_tensor_spec
        out_size = output_tensor_spec.shape[0]

        flat_action_spec = nest.flatten(action_spec)
        if len(flat_action_spec) > 1:
            raise ValueError(
                'Only a single action is supported by this network')

        if kernel_initializer is None:
            kernel_initializer = functools.partial(
                variance_scaling_init,
                gain=1.0 / 2.0,
                mode='fan_in',
                distribution='truncated_normal',
                nonlinearity=math_ops.identity)

        self._single_action_spec = flat_action_spec[0]

        self._prob = prob
        if self._prob:
            self._joint_encoder = EncodingNetwork(
                TensorSpec(
                    (observation_spec.shape[0] + action_spec.shape[0], )),
                fc_layer_params=joint_fc_layer_params,
                activation=activation,
                kernel_initializer=kernel_initializer)

            # the output spec is named as ``action_spec`` in projection_net
            self._projection_net = continuous_projection_net_ctor(
                # note that in the multi-replica case, [-1] should be used
                input_size=self._joint_encoder.output_spec.shape[-1],
                action_spec=output_tensor_spec,
                squash_mean=False,
                scale_distribution=False,
                state_dependent_std=True)
        else:
            self._joint_encoder = EncodingNetwork(
                TensorSpec(
                    (observation_spec.shape[0] + action_spec.shape[0], )),
                fc_layer_params=joint_fc_layer_params,
                activation=activation,
                kernel_initializer=kernel_initializer,
                last_activation=math_ops.identity,
                last_layer_size=out_size)
            self._projection_net = None

        self._output_spec = TensorSpec((out_size, ))
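A hedged construction sketch (the specs are hypothetical):

obs_spec = TensorSpec((8, ))
act_spec = BoundedTensorSpec((2, ), minimum=-1.0, maximum=1.0)
dynamics_net = DynamicsNetwork((obs_spec, act_spec),
                               output_tensor_spec=obs_spec,
                               joint_fc_layer_params=(64, 64))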