def _set_default_specs(self):
    self.observation_spec = ts.TensorSpec((3, 3), torch.float32)
    self.action_spec = ts.BoundedTensorSpec([7],
                                            dtype=torch.float32,
                                            minimum=-1.0,
                                            maximum=1.0)
    self.time_step_spec = ds.time_step_spec(self.observation_spec,
                                            self.action_spec,
                                            ts.TensorSpec(()))
def restart(observation,
            action_spec,
            reward_spec=ts.TensorSpec(()),
            env_id=None,
            env_info={},
            batched=False):
    """Returns a ``TimeStep`` with ``step_type`` set equal to ``StepType.FIRST``.

    Called by ``env.reset()``.

    Args:
        observation (nested tensors): observations of the env.
        action_spec (nested TensorSpec): tensor spec of actions.
        reward_spec (TensorSpec): a rank-1 or rank-0 (default) tensor spec.
        env_id (batched or scalar torch.int32): (optional) ID of the env.
        env_info (dict): extra info returned by the environment.
        batched (bool): (optional) whether ``observation`` comes from batched
            envs.

    Returns:
        TimeStep:
    """
    return _generate_time_step(
        batched=batched,
        observation=observation,
        step_type=StepType.FIRST,
        discount=1.,
        action_spec=action_spec,
        reward_spec=reward_spec,
        env_id=env_id,
        env_info=env_info)
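
# A minimal usage sketch (illustrative values, assuming the ``ts``, ``torch``
# and ``StepType`` names used throughout this module): for an unbatched torch
# env, ``restart()`` fills in a zero reward and a zero "previous action"
# matching ``action_spec``:
#
#   action_spec = ts.BoundedTensorSpec([2], torch.float32,
#                                      minimum=-1.0, maximum=1.0)
#   step = restart(observation=torch.zeros(3, 3), action_spec=action_spec)
#   assert step.step_type == StepType.FIRST
#   assert step.prev_action.shape == (2, )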
def _generate_time_step(batched,
                        observation,
                        step_type,
                        discount,
                        prev_action=None,
                        action_spec=None,
                        reward=None,
                        reward_spec=ts.TensorSpec(()),
                        env_id=None,
                        env_info={}):
    flat_observation = nest.flatten(observation)
    if all(map(_is_numpy_array, flat_observation)):
        # numpy observations: dispatch math ops through numpy
        md = np
        if reward is not None:
            reward = np.float32(reward)
        discount = np.float32(discount)
    else:
        assert all(map(torch.is_tensor, flat_observation)), (
            "Elements in observation must be Tensor")
        # tensor observations: dispatch math ops through torch
        md = torch
        if reward is not None:
            reward = to_tensor(reward, dtype=torch.float32)
        discount = to_tensor(discount, dtype=torch.float32)

    if batched:
        batch_size = flat_observation[0].shape[0]
        outer_dims = (batch_size, )
        if env_id is None:
            env_id = md.arange(batch_size, dtype=md.int32)
        if reward is not None:
            assert reward.shape[:1] == outer_dims
        if prev_action is not None:
            flat_action = nest.flatten(prev_action)
            assert flat_action[0].shape[:1] == outer_dims
    else:
        outer_dims = ()
        if env_id is None:
            env_id = md.zeros((), dtype=md.int32)

    step_type = md.full(outer_dims, step_type, dtype=md.int32)
    if reward is None:
        reward = md.zeros(outer_dims + reward_spec.shape, dtype=md.float32)
    discount = md.ones(outer_dims, dtype=md.float32) * discount
    if prev_action is None:
        # Fill in zero "previous actions" shaped by action_spec.
        prev_action = nest.map_structure(
            lambda spec: md.zeros(
                outer_dims + spec.shape,
                dtype=getattr(md, ts.torch_dtype_to_str(spec.dtype))),
            action_spec)

    return TimeStep(
        step_type,
        reward,
        discount,
        observation,
        prev_action,
        env_id,
        env_info=env_info)
def time_step_spec(observation_spec, action_spec, reward_spec):
    """Returns a ``TimeStep`` spec given the ``observation_spec`` and the
    ``action_spec``.
    """

    def is_valid_tensor_spec(spec):
        return isinstance(spec, ts.TensorSpec)

    assert all(map(is_valid_tensor_spec, nest.flatten(observation_spec)))
    assert all(map(is_valid_tensor_spec, nest.flatten(action_spec)))
    return TimeStep(
        step_type=ts.TensorSpec([], torch.int32),
        reward=reward_spec,
        discount=ts.BoundedTensorSpec([],
                                      torch.float32,
                                      minimum=0.0,
                                      maximum=1.0),
        observation=observation_spec,
        prev_action=action_spec,
        env_id=ts.TensorSpec([], torch.int32))
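
# An illustrative sketch (specs chosen to match the ones used elsewhere in
# this file): the returned value is itself a ``TimeStep`` whose fields are
# specs rather than tensors, so it nests and flattens like any time step:
#
#   spec = time_step_spec(
#       ts.TensorSpec((3, 3), torch.float32),
#       ts.BoundedTensorSpec([1], torch.float32, minimum=-1.0, maximum=1.0),
#       ts.TensorSpec(()))
#   assert spec.observation.shape == (3, 3)
#   assert spec.discount.maximum == 1.0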
def test_close_no_hang_after_init(self):
    constructor = functools.partial(
        RandomAlfEnvironment,
        ts.TensorSpec((3, 3), torch.float32),
        ts.BoundedTensorSpec([1], torch.float32, minimum=-1.0, maximum=1.0),
        episode_end_probability=0,
        min_duration=2,
        max_duration=2)
    env = ProcessEnvironment(constructor)
    env.start()
    env.close()
def __init__(self, crash_at_step, env_id=None):
    super(MockEnvironmentCrashInStep, self).__init__(
        observation_spec=ts.TensorSpec((3, 3), torch.float32),
        action_spec=ts.BoundedTensorSpec([1],
                                         torch.float32,
                                         minimum=-1.0,
                                         maximum=1.0),
        env_id=env_id,
        episode_end_probability=0,
        min_duration=crash_at_step + 1,
        max_duration=crash_at_step + 1)
    self._crash_at_step = crash_at_step
    self._steps = 0
def transition(observation,
               prev_action,
               reward,
               reward_spec=ts.TensorSpec(()),
               discount=1.0,
               env_id=None,
               env_info={}):
    """Returns a ``TimeStep`` with ``step_type`` set equal to ``StepType.MID``.

    Called by ``env.step()`` if not 'Done'.

    The batch size is inferred from the shape of ``reward``. If ``discount``
    is a scalar, and ``observation`` contains tensors, then ``discount`` will
    be broadcast to match ``reward.shape``.

    Args:
        observation (nested tensors): current observations of the env.
        prev_action (nested tensors): previous actions to the env.
        reward (float): A scalar, or 1D NumPy array, or tensor.
        reward_spec (TensorSpec): a rank-1 or rank-0 (default) tensor spec.
            Used to tell if the transition is batched or not.
        discount (float): (optional) A scalar, or 1D NumPy array, or tensor.
        env_id (torch.int32): (optional) A scalar or 1D tensor of the
            environment ID(s).
        env_info (dict): extra info returned by the environment.

    Returns:
        TimeStep:

    Raises:
        ValueError: If observations are tensors but reward's rank is not
            0 or 1.
    """
    return _generate_time_step(
        batched=torch.as_tensor(reward).ndim > len(reward_spec.shape),
        observation=observation,
        step_type=StepType.MID,
        discount=discount,
        prev_action=prev_action,
        reward=reward,
        reward_spec=reward_spec,
        env_id=env_id,
        env_info=env_info)
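
# A hedged usage sketch: with the default rank-0 ``reward_spec``, a scalar
# reward produces an unbatched step (``batched`` is decided by comparing the
# reward's rank against the spec's rank):
#
#   step = transition(observation=torch.zeros(3, 3),
#                     prev_action=torch.zeros(1),
#                     reward=0.5)
#   assert step.step_type == StepType.MID
#   assert step.discount == 1.0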
def termination(observation,
                prev_action,
                reward,
                reward_spec=ts.TensorSpec(()),
                env_id=None,
                env_info={}):
    """Returns a ``TimeStep`` with ``step_type`` set to ``StepType.LAST``.

    Called by ``env.step()`` if 'Done'. ``discount`` should not be passed in;
    it will be set to 0.

    Args:
        observation (nested tensors): current observations of the env.
        prev_action (nested tensors): previous actions to the env.
        reward (float): A scalar, or 1D NumPy array, or tensor.
        reward_spec (TensorSpec): a rank-1 or rank-0 (default) tensor spec.
            Used to tell if the termination is batched or not.
        env_id (torch.int32): (optional) A scalar or 1D tensor of the
            environment ID(s).
        env_info (dict): extra info returned by the environment.

    Returns:
        TimeStep:

    Raises:
        ValueError: If observations are tensors but reward's statically known
            rank is not 0 or 1.
    """
    return _generate_time_step(
        batched=torch.as_tensor(reward).ndim > len(reward_spec.shape),
        observation=observation,
        step_type=StepType.LAST,
        discount=0.,
        prev_action=prev_action,
        reward=reward,
        reward_spec=reward_spec,
        env_id=env_id,
        env_info=env_info)
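
# A companion sketch to the one after ``transition()``: the same call shape,
# but the resulting step carries ``StepType.LAST`` and a zero discount:
#
#   step = termination(observation=torch.zeros(3, 3),
#                      prev_action=torch.zeros(1),
#                      reward=1.0)
#   assert step.step_type == StepType.LAST
#   assert step.discount == 0.0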
def __init__(self,
             observation_spec,
             action_spec,
             env_id=None,
             episode_end_probability=0.1,
             discount=1.0,
             reward_fn=None,
             batch_size=None,
             seed=42,
             render_size=(2, 2, 3),
             min_duration=0,
             max_duration=None):
    """Initializes the environment.

    Args:
        observation_spec (nested TensorSpec): tensor spec for observations.
        action_spec (nested TensorSpec): tensor spec for actions.
        env_id (int): (optional) ID of the environment.
        episode_end_probability (float): Probability an episode will end when
            the environment is stepped.
        discount (float): Discount to set in time_steps.
        reward_fn (Callable): Callable that takes in step_type, action, and
            observation(s), and returns a tensor of rewards.
        batch_size (int): (Optional) Number of observations generated per
            call. If this value is not `None`, then all actions are expected
            to have an additional major axis of size `batch_size`, and all
            outputs will have an additional major axis of size `batch_size`.
        seed (int): Seed for the rng used in observation generation.
        render_size (tuple of ints): Size of the random render image to
            return when calling render.
        min_duration (int): Number of steps at the beginning of the episode
            during which the episode cannot terminate.
        max_duration (int): Optional number of steps after which the episode
            terminates regardless of the termination probability.

    Raises:
        ValueError: If the batch_size argument is not None and does not match
            the shape of discount.
    """
    self._batch_size = batch_size
    self._observation_spec = observation_spec
    self._action_spec = action_spec
    self._time_step_spec = ds.time_step_spec(self._observation_spec,
                                             action_spec, ts.TensorSpec(()))
    self._episode_end_probability = episode_end_probability
    discount = np.asarray(discount, dtype=np.float32)
    if env_id is None:
        self._env_id = np.int32(0)
    else:
        self._env_id = np.int32(env_id)

    if self._batch_size:
        if not discount.shape:
            discount = np.tile(discount, self._batch_size)
        if self._batch_size != len(discount):
            raise ValueError('Size of discounts must equal the batch size.')
    self._discount = discount

    if reward_fn is None:
        # Return a reward whose size matches the batch size
        if self._batch_size is None:
            self._reward_fn = lambda *_: np.float32(0)
        else:
            self._reward_fn = (
                lambda *_: np.zeros(self._batch_size, dtype=np.float32))
    else:
        self._reward_fn = reward_fn

    self._done = True
    self._num_steps = 0
    self._min_duration = min_duration
    self._max_duration = max_duration
    self._rng = np.random.RandomState(seed)
    self._render_size = render_size
    super(RandomAlfEnvironment, self).__init__()
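
# A minimal construction sketch (hypothetical argument values; the specs
# mirror the ones used in this file's tests). Per ``restart()``'s docstring,
# ``env.reset()`` produces the ``StepType.FIRST`` time step:
#
#   env = RandomAlfEnvironment(
#       ts.TensorSpec((3, 3), torch.float32),
#       ts.BoundedTensorSpec([1], torch.float32, minimum=-1.0, maximum=1.0),
#       episode_end_probability=0.0,
#       min_duration=2,
#       max_duration=4)
#   first = env.reset()  # TimeStep with step_type == StepType.FIRST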
def reward_spec(self):
    return ts.TensorSpec(())