def _generate_time_step(batched,
                        observation,
                        step_type,
                        discount,
                        prev_action=None,
                        action_spec=None,
                        reward=None,
                        reward_spec=ts.TensorSpec(()),
                        env_id=None,
                        env_info={}):
    """Build a ``TimeStep`` with numpy or torch, chosen from ``observation``.

    If every leaf of ``observation`` is a numpy array, all generated fields
    are created with numpy. Otherwise every leaf must be a torch tensor and
    the fields are created with torch.

    Args:
        batched (bool): if True, the size of the first dimension of the first
            observation leaf is used as the batch size and becomes the
            leading dimension of every generated field.
        observation (nested array or Tensor): stored in the result unchanged.
        step_type: value used to fill the int32 ``step_type`` field.
        discount: converted to float32 and multiplied onto a float32 array
            of ones with the outer shape.
        prev_action (nest): previous action. If None, zeros are generated
            from ``action_spec``.
        action_spec (nested TensorSpec): only read when ``prev_action`` is
            None.
        reward: converted to float32 when given. If None, float32 zeros of
            shape ``outer_dims + reward_spec.shape`` are used.
        reward_spec (TensorSpec): only read when ``reward`` is None.
        env_id: if None, ``arange(batch_size)`` is used when ``batched`` and
            a zero scalar otherwise (both int32).
        env_info (dict): forwarded to ``TimeStep`` unchanged.

    Returns:
        TimeStep:
    """
    leaves = nest.flatten(observation)
    use_numpy = all(map(_is_numpy_array, leaves))
    if not use_numpy:
        assert all(map(torch.is_tensor,
                       leaves)), ("Elements in observation must be Tensor")
    backend = np if use_numpy else torch

    def _as_float32(value):
        # Same conversions the two backends need for reward and discount.
        if use_numpy:
            return np.float32(value)
        return to_tensor(value, dtype=torch.float32)

    if reward is not None:
        reward = _as_float32(reward)
    discount = _as_float32(discount)

    outer_dims = (leaves[0].shape[0], ) if batched else ()

    if env_id is None:
        if batched:
            env_id = backend.arange(outer_dims[0], dtype=backend.int32)
        else:
            env_id = backend.zeros((), dtype=backend.int32)

    if batched:
        # User-supplied fields must agree with the observation's batch size.
        if reward is not None:
            assert reward.shape[:1] == outer_dims
        if prev_action is not None:
            first_action = nest.flatten(prev_action)[0]
            assert first_action.shape[:1] == outer_dims

    step_type = backend.full(outer_dims, step_type, dtype=backend.int32)
    if reward is None:
        reward = backend.zeros(
            outer_dims + reward_spec.shape, dtype=backend.float32)
    discount = backend.ones(outer_dims, dtype=backend.float32) * discount

    if prev_action is None:

        def _zero_action(spec):
            full_shape = outer_dims + spec.shape
            # Look up the backend dtype whose name matches the spec's dtype.
            dtype = getattr(backend, ts.torch_dtype_to_str(spec.dtype))
            return backend.zeros(full_shape, dtype=dtype)

        prev_action = nest.map_structure(_zero_action, action_spec)

    return TimeStep(
        step_type,
        reward,
        discount,
        observation,
        prev_action,
        env_id,
        env_info=env_info)
def decode_step(self, latent_vector, observations):
    """Run every decoder on its observation and sum the decoding losses.

    ``self._decoders`` and ``observations`` are flattened and paired in
    order. Each decoder's ``train_step`` receives the tuple
    ``(latent_vector, obs)`` and the ``info`` of its result is collected.

    Args:
        latent_vector: latent input handed to every decoder.
        observations (nest): decoding targets, paired with the flattened
            decoders.

    Returns:
        LossInfo: ``loss`` is the ``math_ops.add_n`` sum of the ``loss``
        field of each decoder info; ``extra`` holds the decoder infos packed
        into the structure of ``self._decoders``.
    """
    flat_decoders = flatten(self._decoders)
    flat_observations = flatten(observations)

    infos = []
    for decoder, obs in zip(flat_decoders, flat_observations):
        step = decoder.train_step((latent_vector, obs))
        infos.append(step.info)

    total_loss = math_ops.add_n([info.loss for info in infos])
    packed_infos = alf.nest.pack_sequence_as(self._decoders, infos)
    return LossInfo(loss=total_loss, extra=packed_infos)
def time_step_spec(observation_spec, action_spec, reward_spec):
    """Assemble the ``TimeStep`` spec from the individual specs.

    Args:
        observation_spec (nested TensorSpec): spec of the observation. Every
            leaf must be a ``TensorSpec``.
        action_spec (nested TensorSpec): spec of the action, used for the
            ``prev_action`` field. Every leaf must be a ``TensorSpec``.
        reward_spec: spec used as-is for the ``reward`` field. It is not
            validated here.

    Returns:
        TimeStep: a ``TimeStep`` whose fields are specs. ``step_type`` and
        ``env_id`` are scalar int32 specs and ``discount`` is a scalar
        float32 spec bounded to ``[0, 1]``.
    """
    for nested_spec in (observation_spec, action_spec):
        assert all(
            isinstance(leaf, ts.TensorSpec)
            for leaf in nest.flatten(nested_spec))

    step_type_spec = ts.TensorSpec([], torch.int32)
    discount_spec = ts.BoundedTensorSpec(
        [], torch.float32, minimum=0.0, maximum=1.0)
    env_id_spec = ts.TensorSpec([], torch.int32)

    return TimeStep(
        step_type=step_type_spec,
        reward=reward_spec,
        discount=discount_spec,
        observation=observation_spec,
        prev_action=action_spec,
        env_id=env_id_spec)
def _worker(self, conn, env_constructor, env_id=None, flatten=False):
    """Serve environment requests arriving over ``conn`` until told to stop.

    The environment is created on the CPU default device. ``self._READY`` is
    sent once it exists, and the loop then answers ``(message, payload)``
    requests:

    - ``self._ACCESS``: payload is an attribute name; the attribute value is
      sent back.
    - ``self._CALL``: payload is ``(name, args, kwargs)``; the method is
      called and its result is sent back.
    - ``self._CLOSE``: the environment is closed and the loop ends.

    Any exception is logged and its stack trace is sent back tagged with
    ``self._EXCEPTION``. The connection is always closed on exit.

    Args:
        conn (multiprocessing.connection): Connection for communication to
            the main process.
        env_constructor (Callable): callable environment creator. It is
            called with ``env_id`` as its only argument.
        env_id: value forwarded to ``env_constructor``.
        flatten (bool): whether to assume flattened actions and time_steps
            during communication to avoid overhead.

    Raises:
        KeyError: When receiving a message of unknown type. It is raised
            inside the worker and reported through the exception path
            described above.
    """
    try:
        alf.set_default_device("cpu")
        env = env_constructor(env_id)
        action_spec = env.action_spec()
        conn.send(self._READY)  # Ready.
        while True:
            try:
                # Poll with a short timeout instead of blocking in recv() so
                # that keyboard exceptions can be raised.
                if not conn.poll(0.1):
                    continue
                message, payload = conn.recv()
            except (EOFError, KeyboardInterrupt):
                break

            if message == self._ACCESS:
                conn.send((self._RESULT, getattr(env, payload)))
            elif message == self._CALL:
                name, args, kwargs = payload
                if flatten and name == 'step':
                    # The action arrives flattened; restore its structure.
                    args = [nest.pack_sequence_as(action_spec, args[0])]
                result = getattr(env, name)(*args, **kwargs)
                if flatten and name in ('step', 'reset'):
                    result = nest.flatten(result)
                    assert not any(
                        isinstance(x, torch.Tensor) for x in result), (
                            "Tensor result is not allowed: %s" % name)
                conn.send((self._RESULT, result))
            elif message == self._CLOSE:
                assert payload is None
                env.close()
                break
            else:
                raise KeyError(
                    'Received message of unknown type {}'.format(message))
    except Exception:  # pylint: disable=broad-except
        stacktrace = traceback.format_exc()
        message = 'Error in environment process: {}'.format(stacktrace)
        logging.error(message)
        conn.send((self._EXCEPTION, stacktrace))
    finally:
        conn.close()
def rsample_action_distribution(nested_distributions):
    """Draw reparameterized samples from every distribution in a nest.

    Reparameterization-based sampling (``rsample``) keeps the samples
    differentiable, which enables backpropagation through them.

    Args:
        nested_distributions (nested Distribution): action distributions.
            Each one must report ``has_rsample`` as true.

    Returns:
        rsampled actions, in the same structure as ``nested_distributions``.
    """

    def _supports_rsample(dist):
        return dist.has_rsample

    def _draw(dist):
        return dist.rsample()

    support_flags = nest.map_structure(_supports_rsample,
                                       nested_distributions)
    assert all(nest.flatten(support_flags)), \
        ("all the distributions need to support rsample in order to enable "
         "backpropagation")
    return nest.map_structure(_draw, nested_distributions)
def _unstack_actions(self, batched_actions):
    """Split a potentially nested batch of actions into per-item actions.

    Every leaf is first moved with ``.cpu()``. The flattened leaves are then
    iterated in lockstep along their first dimension.

    Args:
        batched_actions (nest): batched actions whose leaves provide
            ``.cpu()`` and can be iterated along the batch dimension.

    Returns:
        If ``self._flatten`` is set, a ``zip`` iterator yielding one tuple of
        flattened leaves per batch item. Otherwise a list with one nest per
        batch item, packed like ``batched_actions``.
    """
    cpu_actions = nest.map_structure(lambda leaf: leaf.cpu(), batched_actions)
    per_item_leaves = zip(*nest.flatten(cpu_actions))
    if self._flatten:
        return per_item_leaves
    return [
        nest.pack_sequence_as(cpu_actions, leaves)
        for leaves in per_item_leaves
    ]
def entropy_with_fallback(distributions):
    r"""Computes total entropy of nested distribution.

    A ``TransformedDistribution`` is handled by sampling through
    ``estimated_entropy()``; every other distribution uses its own
    ``entropy()``. Two values are returned,
    ``(entropy, entropy_for_gradient)``:

    - When ``entropy()`` is used, both values are the same.
    - When sampling is used, the unbiased estimator of the entropy is
      :math:`-\log(p(x))`. Its gradient, however, is not an unbiased
      estimator of the gradient of the entropy, so a second value is
      returned whose gradient is. See ``estimated_entropy()`` for detail.

    Examples:

    .. code-block:: python

        ent, ent_for_grad = entropy_with_fallback(dist)
        alf.summary.scalar("entropy", ent)
        ent_for_grad.backward()

    Args:
        distributions (nested Distribution): A possibly batched tuple of
            distributions.

    Returns:
        tuple:
        - entropy
        - entropy_for_gradient: You should use ``entropy`` in situations
          where its value is needed, and ``entropy_for_gradient`` where you
          need to calculate the gradient of entropy.
    """

    def _entropy_pair(dist: td.Distribution):
        if not isinstance(dist, td.TransformedDistribution):
            exact = dist.entropy()
            return exact, exact
        # TransformedDistribution is used by NormalProjectionNetwork with
        # scale_distribution=True, in which case we estimate with sampling.
        return estimated_entropy(dist)

    pairs = [_entropy_pair(dist) for dist in nest.flatten(distributions)]
    values, values_for_gradient = zip(*pairs)
    return sum(values), sum(values_for_gradient)
def compute_entropy(distributions):
    """Sum the entropies of all distributions in a nest.

    Args:
        distributions (nested Distribution): A possibly batched tuple of
            distributions. Each one must implement ``entropy()``.

    Returns:
        entropy: the sum of ``entropy()`` over every distribution.
    """
    per_distribution = nest.map_structure(lambda dist: dist.entropy(),
                                          distributions)
    return sum(nest.flatten(per_distribution))
def compute_log_probability(distributions, actions):
    """Sum the log probabilities of ``actions`` under ``distributions``.

    Both nests must have the same structure; each action is scored by the
    distribution at the same position.

    Args:
        distributions: A possibly batched tuple of distributions.
        actions: A possibly batched action tuple.

    Returns:
        Tensor: the sum of ``log_prob`` over all (distribution, action)
        pairs.
    """
    nest.assert_same_structure(distributions, actions)
    per_action = nest.map_structure(
        lambda dist, action: dist.log_prob(action), distributions, actions)
    return sum(nest.flatten(per_action))
def _create_projection_net(self, discrete_projection_net_ctor,
                           continuous_projection_net_ctor):
    """Create one projection network per action spec.

    If there are :math:`N` action specs, then :math:`N` projection networks
    are created. Discrete specs use ``discrete_projection_net_ctor`` and all
    others use ``continuous_projection_net_ctor``, so the result can be a
    mixture of both kinds.

    Args:
        discrete_projection_net_ctor (Callable): called with ``input_size``
            and ``action_spec`` for each discrete action spec.
        continuous_projection_net_ctor (Callable): called with
            ``input_size`` and ``action_spec`` for each other action spec.
    """

    def _build(spec):
        if spec.is_discrete:
            ctor = discrete_projection_net_ctor
        else:
            ctor = continuous_projection_net_ctor
        return ctor(
            input_size=self._encoding_net.output_spec.shape[0],
            action_spec=spec)

    self._projection_net = nest.map_structure(_build, self._action_spec)
    if nest.is_nested(self._projection_net):
        # need this for torch to pickup the parameters of all the modules
        flat_nets = nest.flatten(self._projection_net)
        self._projection_net_module_list = nn.ModuleList(flat_nets)
def _check_action_specs_for_critic_networks(
        action_spec, action_input_processors, action_preprocessing_combiner):
    """Assert that the action inputs are usable by a critic network.

    Two conditions are asserted:

    - with more than one action spec, a combiner must be provided;
    - every discrete action spec must have a non-None input processor.

    Args:
        action_spec (nested spec): action specs; each leaf exposes
            ``is_discrete``.
        action_input_processors (nest|None): processors with the same
            structure as ``action_spec``. None is treated as "no processor
            for any action".
        action_preprocessing_combiner: combiner for multiple actions, or
            None.
    """
    has_multiple_actions = len(nest.flatten(action_spec)) > 1
    assert not has_multiple_actions or (
        action_preprocessing_combiner is not None), (
            "An action combiner is needed when there are multiple action specs:"
            " {}".format(action_spec))

    def _validate(spec, proc):
        assert not spec.is_discrete or proc is not None, (
            'CriticNetwork only supports continuous actions. One of given ' +
            'action specs {} is discrete. Use QNetwork instead. '.format(spec)
            + 'Alternatively, specify `action_input_processors` to transform '
            + 'discrete actions to continuous action embeddings first.')

    processors = action_input_processors
    if processors is None:
        # Give every action spec an explicit "no processor" entry.
        processors = nest.map_structure(lambda _: None, action_spec)

    nest.map_structure(_validate, action_spec, processors)
def __init__(self,
             input_tensor_spec: TensorSpec,
             action_spec: BoundedTensorSpec,
             input_preprocessors=None,
             preprocessing_combiner=None,
             conv_layer_params=None,
             fc_layer_params=None,
             activation=torch.relu_,
             kernel_initializer=None,
             name="QNetwork"):
    """Creates an instance of ``QNetwork`` for estimating action-value of
    discrete actions.

    The action-value is the expected return when starting from the given
    input observation and taking the given action. The network takes an
    observation and outputs an action-value tensor with the shape of
    ``[batch_size, num_of_actions]``, where the number of actions is
    ``action_spec.maximum - action_spec.minimum + 1``.

    Args:
        input_tensor_spec (TensorSpec): the tensor spec of the input
        action_spec (BoundedTensorSpec): the tensor spec of the action. It
            must flatten to exactly one spec.
        input_preprocessors (nested InputPreprocessor): a nest of
            ``InputPreprocessor``, each of which will be applied to the
            corresponding input. If not None, then it must have the same
            structure with ``input_tensor_spec`` (after reshaping). If any
            element is None, then it will be treated as
            ``math_ops.identity``. This arg is helpful if you want to have
            separate preprocessings for different inputs by configuring a
            gin file without changing the code. For example, embedding a
            discrete input before concatenating it to another continuous
            vector.
        preprocessing_combiner (NestCombiner): preprocessing called on
            complex inputs. Note that this combiner must also accept
            ``input_tensor_spec`` as the input to compute the processed
            tensor spec. For example, see ``alf.nest.utils.NestConcat``.
            This arg is helpful if you want to combine inputs by
            configuring a gin file without changing the code.
        conv_layer_params (tuple[tuple]): a tuple of tuples where each
            tuple takes a format
            ``(filters, kernel_size, strides, padding)``, where ``padding``
            is optional.
        fc_layer_params (tuple[int]): a tuple of integers representing
            hidden FC layer sizes.
        activation (nn.functional): activation used for hidden layers. The
            last layer will not be activated.
        kernel_initializer (Callable): initializer for all the layers but
            the last layer. If none is provided a default
            ``variance_scaling_initializer`` will be used.
        name (str): name of the network
    """
    super(QNetwork, self).__init__(input_tensor_spec, name=name)

    assert len(nest.flatten(action_spec)) == 1, (
        "Currently only support a single discrete action! Use "
        "CriticNetwork instead for multiple actions.")

    num_actions = action_spec.maximum - action_spec.minimum + 1
    self._output_spec = TensorSpec((num_actions, ))

    encoder_kwargs = dict(
        input_tensor_spec=input_tensor_spec,
        input_preprocessors=input_preprocessors,
        preprocessing_combiner=preprocessing_combiner,
        conv_layer_params=conv_layer_params,
        fc_layer_params=fc_layer_params,
        activation=activation,
        kernel_initializer=kernel_initializer)
    self._encoding_net = EncodingNetwork(**encoder_kwargs)

    # The output layer gets its own small uniform initialization and no
    # activation.
    last_kernel_initializer = functools.partial(
        torch.nn.init.uniform_, a=-0.003, b=0.003)
    encoding_size = self._encoding_net.output_spec.shape[0]
    self._final_layer = layers.FC(
        encoding_size,
        num_actions,
        activation=math_ops.identity,
        kernel_initializer=last_kernel_initializer,
        bias_init_value=-0.2)
def __init__(self,
             input_tensor_spec,
             action_qt: ActionQuantizer = None,
             num_critic_replicas=2,
             obs_encoding_layer_params=None,
             pre_encoding_layer_params=None,
             mid_encoding_layer_params=None,
             post_encoding_layer_params=None,
             free_form_fc_layer_params=None,
             activation=torch.relu_,
             kernel_initializer=None,
             debug_summaries=False,
             name="MdqCriticNetwork"):
    """Creates an instance of `MdqCriticNetwork` for estimating action-value
    of continuous actions and action sampling.

    Currently there are two branches of networks:

    - free-form branch: a plain MLP for Q-learning
    - adv-form branch: an advantage form of network for action generation.
      It is trained by a target from the free-form net.

    The adv-form branch has the following structures for flexibility:

        obs -> [obs_encoding_net] -> encoded_obs
        encoded_obs, action ->
                    [pre_encoding_nets] ->
                    [mid_shared_encoding_nets] ->
                    [post_encoding_nets] -> outputs

    The pre_encoding_nets and post_encoding_nets do not share parameters
    across action dimensions, while mid_shared_encoding_nets shares
    parameters across action dimensions. If the encoding_layer_params for a
    sub-net is None, that sub-net is effectively neglected.

    Furthermore, to enable parallel computation across action dimension in
    the case of value computation, there are both parallel and individual
    versions of the nets without parameter sharing. For example, for
    post_encoding_nets there is also post_encoding_parallel_net, which is
    the equivalent form of post_encoding_nets but supports parallel
    forwarding. The parameters of the two versions are synced. The partial
    actions (a[0:i]) are zero-padded for both parallel and individual
    networks to enable parallel computation.

    For conciseness purpose, the following notations will be used when
    convenient:

    - B: batch size
    - d: dimensionality of feature
    - n: number of network replica
    - action_dim: the dimensionality of actions
    - action_bin: number of discrete bins for each action dim

    Args:
        input_tensor_spec: A tuple of TensorSpecs
            (observation_spec, action_spec) representing the inputs.
        action_qt (ActionQuantizer): action quantization module. If None,
            ``ActionQuantizer(action_spec, "uniform", 15)`` is created.
        num_critic_replicas (int): number of critic networks
        obs_encoding_layer_params (tuple[int]): a tuple of integers
            representing hidden FC layer sizes for encoding observations.
        pre_encoding_layer_params (tuple[int]): a tuple of integers
            representing hidden FC layer sizes for encoding concatenated
            [encoded_observation, actions]. Parameters are not shared
            across action dimensions.
        mid_encoding_layer_params (tuple[int]): a tuple of integers
            representing hidden FC layer for further encoding the outputs
            from pre_encoding_net. The parameters are shared across action
            dimensions.
        post_encoding_layer_params (tuple[int]): a tuple of integers
            representing hidden FC layer for further encoding the outputs
            from mid_encoding_net. The parameters are not shared across
            action dimensions.
        free_form_fc_layer_params (tuple[int]): a tuple of integers
            representing hidden FC layer for Q-learning. It is called the
            free form to differentiate it from the mdq-form of network,
            which is structured. Must not be None.
        activation (nn.functional): activation used for hidden layers. The
            last layer will not be activated.
        kernel_initializer (Callable): initializer for all the layers but
            the last layer. If none is provided a
            variance_scaling_initializer with uniform distribution will be
            used.
        debug_summaries (bool): stored on the instance as
            ``_debug_summaries``; it is not otherwise used in this method.
        name (str):
    """
    super().__init__(input_tensor_spec, name=name)

    observation_spec, action_spec = input_tensor_spec

    flat_action_spec = nest.flatten(action_spec)
    if len(flat_action_spec) > 1:
        raise ValueError('Only a single action is supported by this network')
    self._single_action_spec = flat_action_spec[0]

    if action_qt is None:
        action_qt = ActionQuantizer(action_spec, "uniform", 15)
    self._action_qt = action_qt
    self._action_bins = self._action_qt._action_bins

    # the logpi of the uniform prior used for KL computation
    self._log_pi_uniform_prior = -np.log(self._action_bins)

    self._action_dim = action_spec.shape[0]  # control vector dim
    self._num_critic_replicas = num_critic_replicas
    n = self._num_critic_replicas

    # Arguments shared by every sub-network's hidden layers.
    hidden_kwargs = dict(
        activation=activation, kernel_initializer=kernel_initializer)

    self._obs_encoding_net = ParallelEncodingNetwork(
        observation_spec,
        n,
        fc_layer_params=obs_encoding_layer_params,
        **hidden_kwargs)

    last_activation = math_ops.identity
    last_kernel_initializer = functools.partial(
        torch.nn.init.uniform_, a=-0.003, b=0.003)

    in_size = self._action_dim
    pre_input_dim = self._obs_encoding_net.output_spec.shape[-1] + in_size

    # One individual pre-encoding net per action dimension.
    # output_spec.shape: [n, d]
    self._pre_encoding_nets = [
        ParallelEncodingNetwork(
            TensorSpec((pre_input_dim, )),
            n,
            fc_layer_params=pre_encoding_layer_params,
            **hidden_kwargs) for _ in range(self._action_dim)
    ]

    # parallel along both critic and action dims without sharing parameters
    # for each action dimension.
    # input: [B, action_dim*n, d]: need to stack over dim1
    # output: [B, action_dim*n, d']: need to unstack over dim1 for
    # splitting over networks
    self._pre_encoding_parallel_net = ParallelEncodingNetwork(
        TensorSpec((pre_input_dim, )),
        n * self._action_dim,
        fc_layer_params=pre_encoding_layer_params,
        **hidden_kwargs)

    # parallel along both critic and action dims with sharing parameters
    # for each action dimension.
    # input: [action_dim*B, n, d]: need to stack over dim0
    # output: [action_dim*B, n, d']: need to unstack over dim0 for
    # splitting over networks
    mid_input_dim = self._pre_encoding_parallel_net.output_spec.shape[-1]
    self._mid_shared_encoding_nets = ParallelEncodingNetwork(
        TensorSpec((mid_input_dim, )),
        n,
        fc_layer_params=mid_encoding_layer_params,
        **hidden_kwargs)

    out_size = self._mid_shared_encoding_nets.output_spec.shape[-1]
    post_enc_out_size = self._action_qt.action_bins

    # Output-layer arguments shared by the post-encoding nets.
    post_last_kwargs = dict(
        last_layer_size=post_enc_out_size,
        last_activation=last_activation,
        last_kernel_initializer=last_kernel_initializer)

    # One individual post-encoding net per action dimension.
    self._post_encoding_nets = [
        ParallelEncodingNetwork(
            TensorSpec((out_size, )),
            n,
            fc_layer_params=post_encoding_layer_params,
            **hidden_kwargs,
            **post_last_kwargs) for _ in range(self._action_dim)
    ]

    # parallel along both critic and action dims without sharing parameters
    # for each action dimension.
    # input: [B, action_dim*n, d]: need to stack over dim1
    # output: [B, action_dim*n, d']: need to unstack over dim1 for
    # splitting over networks
    self._post_encoding_parallel_net = ParallelEncodingNetwork(
        TensorSpec((out_size, )),
        n * self._action_dim,
        fc_layer_params=post_encoding_layer_params,
        **hidden_kwargs,
        **post_last_kwargs)

    assert free_form_fc_layer_params is not None

    free_form_input_dim = observation_spec.shape[-1] + self._action_dim
    self._free_form_q_net = ParallelEncodingNetwork(
        TensorSpec((free_form_input_dim, )),
        n,
        fc_layer_params=free_form_fc_layer_params,
        last_layer_size=1,
        last_activation=math_ops.identity,
        last_kernel_initializer=last_kernel_initializer,
        **hidden_kwargs)

    # Sync the parameters of each parallel net to its individual nets.
    MdqCriticNetwork._parallel_to_individual_network_sync(
        self._pre_encoding_parallel_net, self._pre_encoding_nets, step=n)

    MdqCriticNetwork._parallel_to_individual_network_sync(
        self._post_encoding_parallel_net, self._post_encoding_nets, step=n)

    self._output_spec = TensorSpec(())

    self._debug_summaries = debug_summaries
def __init__(self,
             input_tensor_spec: TensorSpec,
             action_spec: BoundedTensorSpec,
             input_preprocessors=None,
             preprocessing_combiner=None,
             conv_layer_params=None,
             fc_layer_params=None,
             activation=torch.relu_,
             squashing_func=torch.tanh,
             kernel_initializer=None,
             name="ActorNetwork"):
    """Creates an instance of ``ActorNetwork``, which maps the inputs to
    actions (single or nested) through a sequence of deterministic layers.

    One output FC layer is created for every spec in the flattened
    ``action_spec``, each sized by the first dimension of that spec's shape.

    Args:
        input_tensor_spec (TensorSpec): the tensor spec of the input.
        action_spec (BoundedTensorSpec): the tensor spec of the action.
            Every flattened spec must be continuous.
        input_preprocessors (nested InputPreprocessor): a nest of
            ``InputPreprocessor``, each of which will be applied to the
            corresponding input. If not None, then it must have the same
            structure with ``input_tensor_spec`` (after reshaping). If any
            element is None, then it will be treated as
            ``math_ops.identity``. This arg is helpful if you want to have
            separate preprocessings for different inputs by configuring a
            gin file without changing the code. For example, embedding a
            discrete input before concatenating it to another continuous
            vector.
        preprocessing_combiner (NestCombiner): preprocessing called on
            complex inputs. Note that this combiner must also accept
            ``input_tensor_spec`` as the input to compute the processed
            tensor spec. For example, see ``alf.nest.utils.NestConcat``.
            This arg is helpful if you want to combine inputs by
            configuring a gin file without changing the code.
        conv_layer_params (tuple[tuple]): a tuple of tuples where each
            tuple takes a format
            ``(filters, kernel_size, strides, padding)``, where ``padding``
            is optional.
        fc_layer_params (tuple[int]): a tuple of integers representing
            hidden FC layer sizes.
        activation (nn.functional): activation used for hidden layers. The
            last layer will not be activated.
        squashing_func (Callable): the activation function used to squash
            the output to the range :math:`(-1, 1)`. Default to ``tanh``.
        kernel_initializer (Callable): initializer for all the layers but
            the last layer. If none is provided a
            ``variance_scaling_initializer`` with uniform distribution will
            be used.
        name (str): name of the network
    """
    super(ActorNetwork, self).__init__(
        input_tensor_spec,
        input_preprocessors,
        preprocessing_combiner,
        name=name)

    if kernel_initializer is None:
        kernel_initializer = functools.partial(
            variance_scaling_init,
            gain=math.sqrt(1.0 / 3),
            mode='fan_in',
            distribution='uniform')

    self._action_spec = action_spec
    flat_action_spec = nest.flatten(action_spec)
    self._flat_action_spec = flat_action_spec

    assert all(spec.is_continuous for spec in flat_action_spec), (
        "only continuous action is supported")

    self._encoding_net = EncodingNetwork(
        input_tensor_spec=self._processed_input_tensor_spec,
        conv_layer_params=conv_layer_params,
        fc_layer_params=fc_layer_params,
        activation=activation,
        kernel_initializer=kernel_initializer,
        name=self.name + ".encoding_net")

    # Output layers use a small uniform initialization.
    last_kernel_initializer = functools.partial(
        torch.nn.init.uniform_, a=-0.003, b=0.003)

    # One output layer per flattened action spec, in flattening order.
    self._action_layers = nn.ModuleList([
        layers.FC(
            self._encoding_net.output_spec.shape[0],
            spec.shape[0],
            kernel_initializer=last_kernel_initializer)
        for spec in flat_action_spec
    ])
    self._squashing_func = squashing_func
def termination(observation, prev_action, reward, env_id=None, env_info={}):
    """Returns a ``TimeStep`` with ``step_type`` set to ``StepType.LAST``.

    Called by ``env.step()`` if 'Done'. ``discount`` should not be sent in
    and will be set as 0.

    If all observation leaves are numpy arrays, the fields are built with
    numpy and no batch handling is done. Otherwise the leaves must be torch
    tensors, and a 1D ``reward`` marks a batch whose size is
    ``reward.shape[0]``.

    Args:
        observation (nested tensors): current observations of the env.
        prev_action (nested tensors): previous actions to the the env.
        reward (float): A scalar, or 1D NumPy array, or tensor.
        env_id (torch.int32): (optional) A scalar or 1D tensor of the
            environment ID(s). When omitted it defaults to 0 for a scalar
            reward and to ``arange(batch_size)`` for a 1D tensor reward. A
            provided value is always kept.
        env_info (dict): extra info returned by the environment. Note that
            the default is one dict object shared by all calls that omit
            this argument, so it must not be mutated.

    Returns:
        TimeStep:

    Raises:
        AssertionError: If observations are tensors but the rank of
            ``reward`` is not 0 or 1, or if the leading dimension of the
            first observation or first action leaf does not match a 1D
            ``reward``.
    """
    flat_observation = nest.flatten(observation)
    if all(map(_is_numpy_array, flat_observation)):
        reward = np.float32(reward)
        if env_id is None:
            env_id = np.int32(0)
        step_type = StepType.LAST
        discount = np.float32(0.0)
        return TimeStep(
            step_type,
            reward,
            discount,
            observation,
            prev_action,
            env_id,
            env_info=env_info)

    assert all(map(torch.is_tensor, flat_observation)), (
        "Elements in observation must be Tensor")

    reward = to_tensor(reward, dtype=torch.float32)
    assert reward.dim() <= 1, "Expected reward to be a scalar or vector."
    if reward.dim() == 0:
        shape = []
        if env_id is None:
            env_id = torch.tensor(0, dtype=torch.int32)
    else:
        flat_action = nest.flatten(prev_action)
        assert flat_observation[0].shape[:1] == reward.shape
        assert flat_action[0].shape[:1] == reward.shape
        shape = reward.shape
        # Only generate default IDs when the caller did not supply any;
        # previously a supplied env_id was silently overwritten here.
        if env_id is None:
            env_id = torch.arange(shape[0], dtype=torch.int32)
    step_type = torch.full(shape, StepType.LAST, dtype=torch.int32)
    discount = torch.full(shape, 0.0, dtype=torch.float32)
    return TimeStep(
        step_type,
        reward,
        discount,
        observation,
        prev_action,
        env_id,
        env_info=env_info)
def transition(observation,
               prev_action,
               reward,
               discount=1.0,
               env_id=None,
               env_info={}):
    """Returns a ``TimeStep`` with ``step_type`` set equal to
    ``StepType.MID``.

    Called by ``env.step()`` if not 'Done'.

    The batch size is inferred from the shape of ``reward``. If ``discount``
    is a scalar, and ``observation`` contains tensors, then ``discount``
    will be broadcasted to match ``reward.shape``.

    If all observation leaves are numpy arrays, the fields are built with
    numpy and no batch handling is done.

    Args:
        observation (nested tensors): current observations of the env.
        prev_action (nested tensors): previous actions to the the env.
        reward (float): A scalar, or 1D NumPy array, or tensor.
        discount (float): (optional) A scalar, or 1D NumPy array, or tensor.
        env_id (torch.int32): (optional) A scalar or 1D tensor of the
            environment ID(s). When omitted it defaults to 0 for a scalar
            reward and to ``arange(batch_size)`` for a 1D tensor reward. A
            provided value is always kept.
        env_info (dict): extra info returned by the environment. Note that
            the default is one dict object shared by all calls that omit
            this argument, so it must not be mutated.

    Returns:
        TimeStep:

    Raises:
        AssertionError: If observations are tensors but the rank of
            ``reward`` is not 0 or 1, if the leading dimension of the first
            observation or first action leaf does not match a 1D
            ``reward``, or if a non-scalar ``discount`` does not have the
            shape of ``reward``.
    """
    flat_observation = nest.flatten(observation)
    if all(map(_is_numpy_array, flat_observation)):
        reward = np.float32(reward)
        if env_id is None:
            env_id = np.int32(0)
        step_type = StepType.MID
        discount = np.float32(discount)
        return TimeStep(
            step_type,
            reward,
            discount,
            observation,
            prev_action,
            env_id,
            env_info=env_info)

    assert all(map(torch.is_tensor, flat_observation)), (
        "Elements in observation must be Tensor")

    # TODO: If reward.shape.rank == 2, and static
    # batch sizes are available for both flat_observation and reward,
    # check that these match.
    reward = to_tensor(reward, dtype=torch.float32)
    assert reward.dim() <= 1, "Expected reward to be a scalar or vector."
    if reward.dim() == 0:
        shape = []
        if env_id is None:
            env_id = torch.tensor(0, dtype=torch.int32)
    else:
        flat_action = nest.flatten(prev_action)
        assert flat_observation[0].shape[:1] == reward.shape
        assert flat_action[0].shape[:1] == reward.shape
        shape = reward.shape
        # Only generate default IDs when the caller did not supply any;
        # previously a supplied env_id was silently overwritten here.
        if env_id is None:
            env_id = torch.arange(shape[0], dtype=torch.int32)
    step_type = torch.full(shape, StepType.MID, dtype=torch.int32)
    discount = to_tensor(discount, dtype=torch.float32)

    if discount.dim() == 0:
        # Broadcast a scalar discount to the shape of the reward.
        discount = torch.full(shape, discount, dtype=torch.float32)
    else:
        assert reward.shape == discount.shape
    return TimeStep(
        step_type,
        reward,
        discount,
        observation,
        prev_action,
        env_id,
        env_info=env_info)
def restart(observation, action_spec, env_id=None, env_info={},
            batched=False):
    """Returns a ``TimeStep`` with ``step_type`` set equal to
    ``StepType.FIRST``.

    Called by ``env.reset()``. The reward is zero, the discount is one and
    the previous action is all zeros generated from ``action_spec``. If all
    observation leaves are numpy arrays the fields are numpy values;
    otherwise the leaves must be torch tensors and the fields are tensors.

    Args:
        observation (nested tensors): observations of the env.
        action_spec (nested TensorSpec): tensor spec of actions.
        env_id (batched or scalar torch.int32): (optional) ID of the env.
            When omitted it defaults to ``arange(batch_size)`` if
            ``batched`` and to 0 otherwise. A provided value is always
            kept.
        env_info (dict): extra info returned by the environment. Note that
            the default is one dict object shared by all calls that omit
            this argument, so it must not be mutated.
        batched (bool): (optional) whether batched envs or not. If True, the
            batch size is the first dimension of the first observation
            leaf.

    Returns:
        TimeStep:

    Raises:
        AssertionError: If the observation leaves are neither all numpy
            arrays nor all torch tensors.
    """
    first_observation = nest.flatten(observation)
    if all(map(_is_numpy_array, first_observation)):
        step_type = StepType.FIRST
        if batched:
            batch_size = first_observation[0].shape[0]
            reward = np.zeros((batch_size, ), dtype=np.float32)
            discount = np.ones((batch_size, ), dtype=np.float32)
            prev_action = nest.map_structure(
                lambda spec: spec.numpy_zeros(outer_dims=(batch_size, )),
                action_spec)
            if env_id is None:
                # Was ``np.arrange``, which does not exist and raised
                # AttributeError on this path.
                env_id = np.arange(batch_size, dtype=np.int32)
        else:
            reward = np.float32(0.0)
            discount = np.float32(1.0)
            prev_action = nest.map_structure(lambda spec: spec.numpy_zeros(),
                                             action_spec)
            if env_id is None:
                env_id = np.int32(0)

        return TimeStep(
            step_type,
            reward,
            discount,
            observation,
            prev_action,
            env_id,
            env_info=env_info)

    assert all(map(torch.is_tensor, first_observation)), (
        "Elements in observation must be Tensor")

    # TODO: Check leading dimension of first_observation
    # against batch_size if all are known statically.
    if batched:
        batch_size = first_observation[0].shape[0]
        outer_dims = (batch_size, )
        step_type = torch.full(
            outer_dims, StepType.FIRST, dtype=torch.int32)
        reward = torch.full(outer_dims, 0.0, dtype=torch.float32)
        discount = torch.full(outer_dims, 1.0, dtype=torch.float32)
        prev_action = nest.map_structure(
            lambda spec: spec.zeros(outer_dims=outer_dims), action_spec)
        # Keep a caller-supplied env_id, matching the numpy branch above;
        # previously it was silently overwritten here.
        if env_id is None:
            env_id = torch.arange(batch_size, dtype=torch.int32)
    else:
        step_type = torch.full((), StepType.FIRST, dtype=torch.int32)
        reward = torch.tensor(0.0, dtype=torch.float32)
        discount = torch.tensor(1.0, dtype=torch.float32)
        prev_action = nest.map_structure(lambda spec: spec.zeros(),
                                         action_spec)
        if env_id is None:
            env_id = torch.tensor(0, dtype=torch.int32)

    return TimeStep(
        step_type,
        reward,
        discount,
        observation,
        prev_action,
        env_id,
        env_info=env_info)
def __init__(self,
             input_tensor_spec,
             output_tensor_spec,
             joint_fc_layer_params=None,
             activation=torch.relu_,
             kernel_initializer=None,
             prob=False,
             continuous_projection_net_ctor=NormalProjectionNetwork,
             name="DynamicsNetwork"):
    """Creates an instance of `DynamicsNetwork` for predicting the next
    observation given current observation and action.

    The joint encoder takes an input whose size is the sum of the first
    dimensions of the observation and action specs. In deterministic mode
    the encoder itself ends in an un-activated layer of the output size. In
    probabilistic mode a projection network on top of the encoder produces
    the output.

    Args:
        input_tensor_spec: A tuple of TensorSpecs
            (observation_spec, action_spec) representing the inputs.
        output_tensor_spec (TensorSpec): spec of the output. The first
            dimension of its shape is the output size, and it is passed to
            the projection network as its ``action_spec``.
        joint_fc_layer_params (tuple[int]): a tuple of integers representing
            hidden FC layer sizes FC layers after merging observations and
            actions.
        activation (nn.functional): activation used for hidden layers. The
            last layer will not be activated.
        kernel_initializer (Callable): initializer for all the layers but
            the last layer. If none is provided a
            variance_scaling_initializer with truncated normal distribution
            will be used.
        prob (bool): If True, use the probabilistic mode of network;
            otherwise, use the deterministic mode of network.
        continuous_projection_net_ctor (ProjectionNetwork): constructor that
            generates a continuous projection network that outputs a
            distribution.
        name (str):

    Raises:
        ValueError: If ``action_spec`` flattens to more than one spec.
    """
    super().__init__(input_tensor_spec, name=name)

    observation_spec, action_spec = input_tensor_spec
    out_size = output_tensor_spec.shape[0]

    flat_action_spec = nest.flatten(action_spec)
    if len(flat_action_spec) > 1:
        raise ValueError('Only a single action is supported by this network')

    if kernel_initializer is None:
        kernel_initializer = functools.partial(
            variance_scaling_init,
            gain=1.0 / 2.0,
            mode='fan_in',
            distribution='truncated_normal',
            nonlinearity=math_ops.identity)

    self._single_action_spec = flat_action_spec[0]
    self._prob = prob

    encoder_kwargs = dict(
        fc_layer_params=joint_fc_layer_params,
        activation=activation,
        kernel_initializer=kernel_initializer)
    if not self._prob:
        # Deterministic mode: the encoder directly emits the prediction
        # through an un-activated last layer.
        encoder_kwargs.update(
            last_activation=math_ops.identity, last_layer_size=out_size)

    joint_input_size = observation_spec.shape[0] + action_spec.shape[0]
    self._joint_encoder = EncodingNetwork(
        TensorSpec((joint_input_size, )), **encoder_kwargs)

    if self._prob:
        # the output spec is named as ``action_spec`` in projection_net
        self._projection_net = continuous_projection_net_ctor(
            # note that in the case of multi-replica, should use [-1]
            input_size=self._joint_encoder.output_spec.shape[-1],
            action_spec=output_tensor_spec,
            squash_mean=False,
            scale_distribution=False,
            state_dependent_std=True)
    else:
        self._projection_net = None

    self._output_spec = TensorSpec((out_size, ))