    def reset(self, no_observation: bool = False) -> tuple:
        """
        Set the batch of environments to their initial states
        :param no_observation: When set to True, the reset function does not return an observation (None).
                       This option exists because the planner does not require observations, and rendering
                       slows it down considerably
        :return: a 3-tuple consisting of:
                    - a torch.Tensor containing a batch of initial observations
                      Shape: (batch_size,) + observation_shape
                    - a torch.Tensor containing a batch of boolean flags indicating whether the environments terminated
                    - a tuple of dicts possibly containing additional information for each environment
        """
        # Set internal time step counter to 0
        self._t = 0
        # Reset all environments
        results = [env.reset() for env in self._envs]
        # Process the control suite results
        results = [self._process_result(result) for result in results]
        # Drop the reward entries (no reward is obtained on reset)
        results = [(o, t, info) for o, r, t, info in results]

        # Don't return an observation if no_observation flag is set
        if no_observation:
            # Unzip all tuples into 3 tuples containing the observations, flags and info dicts, respectively
            results = [*zip(*results)]
            # No observation is returned
            results[0] = None
            # Merge all flags to one tensor
            results[1] = batch_tensors(*results[1])
            # Return all results as a tuple
            return tuple(results)
        # If required, set observation to image observations
        elif not self._state_obs:
            # Get raw pixel observations
            pixels_tuple = self._pixels()
            # Preprocess all observations
            results = [(preprocess_observation(image, self._bit_depth,
                                               self._observation_size), t,
                        info)
                       for image, (_, t, info) in zip(pixels_tuple, results)]

            # Add raw pixels to all info dicts
            for pixels, (_, _, info) in zip(pixels_tuple, results):
                info['pixels'] = pixels

        # Unzip all tuples into 3 tuples containing the observations, flags and info dicts, respectively
        results = [*zip(*results)]
        # Merge all observations to one tensor
        results[0] = batch_tensors(*results[0])
        # Merge all flags to one tensor
        results[1] = batch_tensors(*results[1])

        # Return all results as a tuple
        return tuple(results)
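The batch_tensors helper used throughout these snippets is not shown. Judging from the comments ("merge all observations to one tensor" along a newly created batch dimension), a minimal sketch of the assumed behaviour could be:

import torch

def batch_tensors(*tensors: torch.Tensor) -> torch.Tensor:
    # Assumed behaviour: stack N equally-shaped tensors along a new leading (batch) dimension
    return torch.stack(tensors, dim=0)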

    @staticmethod
    def _batch_samples(samples: tuple) -> tuple:
        """
        Transform a tuple of sample tuples to a single batched sample tuple

        The batch size is the length of the tuple of samples

        :param samples: a tuple of individual samples
            Each individual sample is a tuple consisting of:
            - o: an observation tensor (at time t)          (dtype: float, shape: observation_shape)
            - a: an action tensor (at time t)               (dtype: float, shape: action_shape)
            - r: a reward tensor (at time t + 1)            (dtype: float, shape: (1,) )
            - o': an observation tensor (at time t + 1)     (dtype: float, shape: observation_shape)
            - a': an action tensor (at time t + 1)          (dtype: float, shape: action_shape)

        :return: a single sample batch tuple consisting of:
            - o: an observation tensor (at time t)          (dtype: float, shape: (batch_size,) + observation_shape)
            - a: an action tensor (at time t)               (dtype: float, shape: (batch_size,) + action_shape)
            - r: a reward tensor (at time t + 1)            (dtype: float, shape: (batch_size,) )
            - o': an observation tensor (at time t + 1)     (dtype: float, shape: (batch_size,) + observation_shape)
            - a': an action tensor (at time t + 1)          (dtype: float, shape: (batch_size,) + action_shape)
        """
        # Separate the samples into five tuples of all (o, a, r, o', a')
        o, a, r, o_, a_ = zip(*samples)
        # For all five tuples, concatenate the entries over one batch dimension
        o, a, r, o_, a_ = tuple(batch_tensors(*ts) for ts in (o, a, r, o_, a_))
        # Return as a single batched sample tuple
        return o, a, r, o_, a_
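A small usage sketch of _batch_samples with dummy transitions. It is called here as if it were in scope (in the source it appears to be a static method of its containing class), and the shapes are purely illustrative:

import torch

# Three dummy (o, a, r, o', a') transitions with a 3x64x64 observation and a 1-D action
samples = tuple(
    (torch.zeros(3, 64, 64), torch.zeros(1), torch.zeros(1),
     torch.zeros(3, 64, 64), torch.zeros(1))
    for _ in range(3)
)
o, a, r, o_, a_ = _batch_samples(samples)
# Each result is batched along a new leading dimension of size 3, e.g. o.shape == (3, 3, 64, 64)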
    def sample_random_action(self) -> torch.Tensor:
        """
        :return: a batch of actions sampled uniformly from each environment's action spec
        """
        actions = []
        for env in self._envs:
            spec = env.action_spec()
            action = np.random.uniform(spec.minimum, spec.maximum, spec.shape)
            actions += [torch.from_numpy(action).to(torch.float32)]
        actions = batch_tensors(*actions)
        return actions
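A usage sketch; env_batch is a placeholder name for an instance of the batched-environment wrapper these methods belong to:

actions = env_batch.sample_random_action()
# One random action per environment, stacked along the batch dimension:
# actions.shape == (batch_size,) + action_shape
observations, rewards, flags, infos = env_batch.step(actions)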
Example 4
    def sample_random_action(self) -> torch.Tensor:
        """
        :return: a uniformly sampled random action from the action space
        """
        actions = [env.action_space.sample() for env in self._envs]
        actions = [torch.from_numpy(a) for a in actions]
        actions = batch_tensors(*actions)
        return actions
Example 5
    def get_rewards_as_tensor(self) -> torch.Tensor:
        """
        Get all rewards in one single tensor
            shape: (episode_length,)

        :return: a torch.FloatTensor containing all obtained rewards
        """
        # Concatenate or 'batch' the rewards along a newly created episode_length dimension
        return batch_tensors(*self.rewards)
Example 6
    def get_actions_as_tensor(self) -> torch.Tensor:
        """
        Get all actions in one single tensor
            shape: (episode_length,) + action_shape

        :return: a torch.FloatTensor containing all performed actions
        """
        # Concatenate or 'batch' the actions along a newly created episode_length dimension
        return batch_tensors(*self.actions)
Example 7
    def get_observations_as_tensor(self) -> torch.Tensor:
        """
        Get all observations in one single tensor
            shape: (episode_length + 1,) + observation_shape

        The +1 is due to the initial observation

        :return: a torch.FloatTensor containing all obtained observations
        """
        # Concatenate or 'batch' the observations along a newly created episode_length dimension
        return batch_tensors(*self.observations)
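Assuming the three getters above live on the same episode container (episode is a placeholder name), the returned tensors are expected to line up as follows, per their docstrings:

observations = episode.get_observations_as_tensor()  # (episode_length + 1,) + observation_shape
actions = episode.get_actions_as_tensor()            # (episode_length,) + action_shape
rewards = episode.get_rewards_as_tensor()            # (episode_length,)

# The extra observation is the initial one, so the tensors line up as
assert observations.size(0) == rewards.size(0) + 1
assert actions.size(0) == rewards.size(0)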
Example 8
    def as_dataset(self) -> TensorDataset:
        """
        Convert the dataset to a torch.utils.data.TensorDataset
        Iterating through this dataset will give the individual (o, a, r, o', a') samples
        :return: the entire dataset as a torch.utils.data.TensorDataset
        """
        # Separate the dataset into five tuples of all (o, a, r, o', a')
        o, a, r, o_, a_ = (*zip(*self._data), )
        # For all five tuples, concatenate the entries over one batch dimension
        # All tuples are now single tensors containing all data (with the data set size as batch size)
        o, a, r, o_, a_ = tuple(batch_tensors(*ts) for ts in (o, a, r, o_, a_))
        # Use these tensors to create a TensorDataset
        return TensorDataset(o, a, r, o_, a_)
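A usage sketch that wraps the resulting TensorDataset in a standard DataLoader to draw minibatches of transitions; dataset and the batch size are placeholders:

from torch.utils.data import DataLoader

# dataset is a placeholder for an instance of the container that defines as_dataset()
loader = DataLoader(dataset.as_dataset(), batch_size=32, shuffle=True)
for o, a, r, o_, a_ in loader:
    # Each element is batched along a leading dimension of (at most) 32
    pass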
    def as_episode_dataset(self) -> TensorDataset:
        """
        Get the dataset as a PyTorch TensorDataset containing episodes of data

        Each entry in the dataset is a 5-tuple consisting of:
            - an observation tensor containing all observations obtained during the episode (at some time t)
                shape: (episode_length,) + observation_shape
            - an action tensor containing all actions performed during the episode (at some time t)
                shape: (episode_length,) + action_shape
            - a reward tensor containing all rewards obtained during the episode (at some time t + 1)
                shape: (episode_length,)
            - an observation tensor containing all subsequent observations obtained during the episode
              (at some time t + 1)
                shape: (episode_length,) + observation shape
            - an action tensor containing all subsequent actions performed during the episode (at some time t + 1)
                shape: (episode_length,) + action_shape

        The two observation tensors and two action tensors have shared storage

        All tensors are ordered in the way that data was collected

        :return: a TensorDataset containing all episode data
        """
        # Group episode data
        episode_data = []
        for episode in self._data:

            # Get all episode data in single tensors
            observations = episode.get_observations_as_tensor()
            actions = episode.get_actions_as_tensor()
            rewards = episode.get_rewards_as_tensor()

            num_samples = rewards.size(0) - 1

            # Slice the corresponding tensors
            o = observations.narrow(dim=0, start=0, length=num_samples)
            a = actions.narrow(dim=0, start=0, length=num_samples)
            r = rewards.narrow(dim=0, start=0, length=num_samples)
            o_ = observations.narrow(dim=0, start=1, length=num_samples)
            a_ = actions.narrow(dim=0, start=1, length=num_samples)

            # Preprocess the images
            o = preprocess_observation_tensor(o, self._bit_depth)
            o_ = preprocess_observation_tensor(o_, self._bit_depth)

            episode_data.append((o, a, r, o_, a_))

        # Concatenate all episode data over a new dataset dimension
        tensors = [batch_tensors(*ts) for ts in (*zip(*episode_data), )]
        # Use these tensors to create a TensorDataset
        return TensorDataset(*tensors)
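A usage sketch for drawing batches of whole episodes; dataset and the batch size are placeholders. Batching episodes this way yields tensors shaped (batch_size, episode_length, ...), which is what the _compute_loss method further below expects:

from torch.utils.data import DataLoader

# dataset is a placeholder for an instance of the container that defines as_episode_dataset()
episode_loader = DataLoader(dataset.as_episode_dataset(), batch_size=8, shuffle=True)
for o, a, r, o_, a_ in episode_loader:
    # Each tensor now has a leading batch dimension of (at most) 8 episodes
    pass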
Example 10
    def reset(self, no_observation: bool = False) -> tuple:
        """
        Set the batch of environments to their initial states
        :param no_observation: When set to True, the reset function does not return an observation (None).
                       This option exists because the planner does not require observations, and rendering
                       slows it down considerably
        :return: a 3-tuple consisting of:
                    - a torch.Tensor containing a batch of initial observations
                      Shape: (batch_size,) + observation_shape
                    - a torch.Tensor containing a batch of boolean flags indicating whether the environments terminated
                    - a tuple of dicts possibly containing additional information for each environment
        """
        # Set internal time step counter to 0
        self._t = 0
        # Get all initial observations from resetting the environment batch
        observations = [env.reset() for env in self._envs]
        # Create a flag tensor
        flags = torch.zeros(self.batch_size, dtype=torch.bool)
        # Create an info dict for each environment
        infos = tuple([dict() for _ in range(len(self._envs))])
        # Don't return an observation if no_observation flag is set
        if no_observation:
            return None, flags, infos
        elif not self._state_obs:
            # Get raw pixel observations of the environments
            pixels_tuple = self._pixels()
            # Process the image observations
            observations = [
                preprocess_observation(o, self._bit_depth,
                                       self._observation_size)
                for o in pixels_tuple
            ]
            # observations = [self._process_image(o) for o in pixels_tuple]
            # Add raw pixels to the info dict
            for info, pixels in zip(infos, pixels_tuple):
                info['pixels'] = pixels

        # Cast all observations to tensors
        # observations = [torch.from_numpy(o).to(dtype=torch.float) for o in observations]
        # Concatenate all tensors in a newly created batch dimension
        # Results in a single observation tensor of shape: (batch_size,) + observation_shape
        # TODO -- cast states to tensors!
        observations = batch_tensors(*observations)
        # Return the results
        return observations, flags, infos
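A usage sketch of the no_observation option described in the docstring; env_batch is a placeholder name:

# Full reset, including (possibly image-based) observations
observations, flags, infos = env_batch.reset()

# Planner-style reset that skips rendering and returns None for the observation
_, flags, infos = env_batch.reset(no_observation=True)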
    def __getitem__(self, index):
        """
        Get the (o, a, r, o', a') tensors of the episode stored at the given index
        """
        episode = self._data[index]

        # Get all episode data in single tensors
        observations = episode.get_observations_as_tensor()
        actions = episode.get_actions_as_tensor()
        rewards = episode.get_rewards_as_tensor()

        num_samples = rewards.size(0) - 1

        # Slice the corresponding tensors
        o = observations.narrow(dim=0, start=0, length=num_samples)
        a = actions.narrow(dim=0, start=0, length=num_samples)
        r = rewards.narrow(dim=0, start=0, length=num_samples)
        o_ = observations.narrow(dim=0, start=1, length=num_samples)
        a_ = actions.narrow(dim=0, start=1, length=num_samples)

        # Preprocess the images
        o = preprocess_observation_tensor(o, self._bit_depth)
        o_ = preprocess_observation_tensor(o_, self._bit_depth)

        return tuple(batch_tensors(*ts) for ts in (o, a, r, o_, a_))
Example 12
    def step(self,
             action: torch.Tensor,
             no_observation: bool = False) -> tuple:
        """
        Perform an action in the environment. Returns a reward and observation
        :param action: a Tensor representation of the action that should be performed in the environment
                        Shape: (batch_size,) + action_shape
        :param no_observation: When set to True, the step function does not return an observation (None).
                               This option exists because the planner does not require observations, and
                               rendering slows it down considerably
        :return: a 4-tuple consisting of:
                    - a torch.Tensor observation
                      Shape: (batch_size,) + observation_shape
                    - a torch.Tensor reward
                      Shape: (batch_size,)
                    - a torch.Tensor boolean flag indicating whether the environment has terminated
                      Shape: (batch_size,)
                    - a tuple of dicts possibly containing additional information
        """
        # Increment the internal time step counter
        self._t += 1
        # Convert the tensor to suitable input
        action = action.detach().numpy()
        # Execute the actions in the environments
        results = [env.step(a) for a, env in zip(action, self._envs)]
        # Process the control suite results
        results = [self._process_result(result) for result in results]

        # Don't return an observation if no_observation flag is set
        if no_observation:
            # Unzip all tuples into 3 tuples containing the observations, flags and info dicts, respectively
            results = [*zip(*results)]
            # No observation is returned
            results[0] = None
            # Merge all rewards to one tensor
            results[1] = batch_tensors(*results[1])
            # Merge all flags to one tensor
            results[2] = batch_tensors(*results[2])
            # Return all results as a tuple
            return tuple(results)
        # If required, set observation to image observations
        elif not self._state_obs:
            # Get raw pixels from all environments
            pixels_tuple = self._pixels()
            # Convert them to suitable observations
            observations = [
                preprocess_observation(o, self._bit_depth,
                                       self._observation_size)
                for o in pixels_tuple
            ]
            # Merge the observations in the results
            results = [(o, ) + result[1:]
                       for o, result in zip(observations, results)]

            # Add all raw pixel observations to the info dicts
            for result, pixels in zip(results, pixels_tuple):
                result[3]['pixels'] = pixels

        # Unzip all tuples into 4 tuples containing the observations, rewards, flags and info dicts, respectively
        results = [*zip(*results)]
        # Merge all observations to one tensor
        results[0] = batch_tensors(*results[0])
        # Merge all rewards to one tensor
        results[1] = batch_tensors(*results[1])
        # Merge all flags to one tensor
        results[2] = batch_tensors(*results[2])

        # Check the max episode length condition. Update flags if required
        if self._t >= self._max_t:
            # Set all flags to True if the max episode length has been reached
            results[2] |= True

        # Return all results as a tuple
        return tuple(results)
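A sketch of a random rollout that combines reset, sample_random_action and step; env_batch is a placeholder name, and the loop relies on step forcing the flags to True once the maximum episode length is reached:

observations, flags, infos = env_batch.reset()
step_rewards = []
while not bool(flags.all()):
    actions = env_batch.sample_random_action()
    observations, rewards, flags, infos = env_batch.step(actions)
    step_rewards.append(rewards)
# step_rewards now holds one (batch_size,)-shaped reward tensor per time step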
Example 13
    def _compute_loss(self, model: RSSM, episode: tuple) -> tuple:
        """
        Compute the loss of the RSSM over a single batch of episodes
        :param model: the RSSM model
        :param episode: a five-tuple consisting of:
                - a tensor containing all observations obtained during the episode
                    shape: (batch_size, episode_length,) + observation_shape
                - a tensor containing all actions performed during the episode
                    shape: (batch_size, episode_length, action_size)
                - a tensor containing all rewards obtained during the episode
                    shape: (batch_size, episode_length)
                - a tensor containing all subsequent observations obtained during the episode
                    (has shared storage with the first observation tensor)
                    shape: (batch_size, episode_length,) + observation_shape
                - a tensor containing all subsequent actions performed during the episode
                    (has shared storage with the first action tensor)
                    shape: (batch_size, episode_length, action_size)
        :return: a two-tuple containing:
                    - a loss tensor
                        shape: (1,)
                    - a dict containing info about the loss computation
        """
        # Keep info dict
        info = {}

        # Get the batch size used
        batch_size = episode[0].size(0)

        # Reset RSSM initial state
        model.reset(batch_size=batch_size)

        # Average losses over all time steps
        reward_losses = []
        observation_losses = []
        belief_losses = []
        value_losses = []

        # Switch the episode and batch dimensions
        episode = tuple([Trainer._switch_dims(xs) for xs in episode])

        # Apply state-action augmentations
        for aug in self._state_action_augmentations:
            episode = from_keyword(aug)(*episode)

        # Apply data augmentation to the observation tensor
        o_augmented = episode[3]  # Shape: (T, batch_size,) + observation_shape
        for aug in self._data_augmentations:
            o_augmented = from_keyword(aug)(o_augmented)  # Shape (T, batch_size,) + augmented_observation_shape

        # Simulate the trajectory in the model
        for t, (o, a, r, o_, a_, o_aug) in enumerate(zip(*episode, o_augmented)):
            # Optionally, get a value function estimate
            predicted_v = model.state_action_value(a) if isinstance(model, ERSSM) else None

            # Get prediction (distributions) from the environment model
            predicted_o_, predicted_r, predicted_s, _, _ = model.simulate_step(a)

            # Only observation sample is required, omit dist params
            predicted_o_, _, _ = predicted_o_
            # Only reward sample is required, omit dist params
            predicted_r, _, _ = predicted_r
            # Get belief distribution parameters
            predicted_s, state_prior_mean, state_prior_std = predicted_s

            # Compute reward loss
            reward_loss = F.mse_loss(predicted_r, r)
            reward_losses.append(reward_loss)

            # Compute observation loss
            observation_loss = F.mse_loss(predicted_o_, o_, reduction='none').sum(dim=(1, 2, 3))
            observation_losses.append(observation_loss)

            # Get the prior and posterior belief distributions
            prior = Normal(state_prior_mean, state_prior_std)
            # Get an estimate of the posterior belief using the encoder
            _, state_posterior_mean, state_posterior_std = model.posterior_state_belief(o_aug)
            posterior = Normal(state_posterior_mean, state_posterior_std)

            # Allowed deviation in KL divergence
            free_nats = torch.ones(1, dtype=torch.float32, device=o.device) * self._kl_free_nats

            # Compute KL loss
            belief_loss = kl_divergence(posterior, prior).sum(dim=1)
            # Bound by free nats
            belief_loss = torch.max(belief_loss, free_nats)
            # Add to all losses
            belief_losses.append(belief_loss)

            # Optionally, train the value function
            if isinstance(model, ERSSM):
                # Compute the target by bootstrapping
                with torch.no_grad():
                    value_target = r + model.state_action_value(a_)
                # Compute value loss
                value_loss = F.mse_loss(predicted_v, value_target)
                value_losses.append(value_loss)

        # Compute total loss components
        reward_loss = torch.mean(batch_tensors(*reward_losses)) * self._c_r_loss
        observation_loss = torch.mean(batch_tensors(*observation_losses)) * self._c_o_loss
        belief_loss = torch.mean(batch_tensors(*belief_losses)) * self._c_kl_loss
        if len(value_losses) > 0:
            value_loss = torch.mean(batch_tensors(*value_losses)) * self._c_v_loss
        else:
            value_loss = torch.zeros(1, device=reward_loss.device)
        # Compute total loss
        loss = reward_loss + observation_loss + belief_loss + value_loss

        # Store relevant information in info dict
        info['reward_loss'] = reward_loss.item()
        info['observation_loss'] = observation_loss.item()
        info['belief_loss'] = belief_loss.item()
        info['value_loss'] = value_loss.item()
        info['loss'] = loss.item()

        return loss, info
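A sketch of how _compute_loss would typically be driven from a training loop; trainer, model and episode_loader are placeholder names, and the optimizer and gradient-clipping settings are assumptions rather than values taken from the project:

import torch

optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

for episode in episode_loader:  # batches shaped (batch_size, episode_length, ...)
    loss, info = trainer._compute_loss(model, tuple(episode))
    optimizer.zero_grad()
    loss.backward()
    torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=100.0)
    optimizer.step()
    print(info['loss'], info['reward_loss'], info['observation_loss'], info['belief_loss'])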