def reset(self, no_observation: bool = False) -> tuple:
    """
    Set the batch of environments to their initial states
    :param no_observation: When set to True, the reset function does not return an observation (None)
                           This option exists, since the planner does not require observations,
                           but rendering slows it down a lot
    :return: a 3-tuple consisting of:
                - a torch.Tensor containing a batch of initial observations
                    Shape: (batch_size,) + observation_shape
                - a torch.Tensor containing a batch of boolean flags indicating whether the environments terminated
                - a tuple of dicts possibly containing additional information for each environment
    """
    # Set the internal time step counter to 0
    self._t = 0
    # Reset all environments
    results = [env.reset() for env in self._envs]
    # Process the control suite results
    results = [self._process_result(result) for result in results]
    # Filter out the non-existent rewards
    results = [(o, t, info) for o, r, t, info in results]
    # Don't return an observation if the no_observation flag is set
    if no_observation:
        # Unzip all tuples into 3 tuples containing the observations, flags and info dicts, respectively
        results = [*zip(*results)]
        # No observation is returned
        results[0] = None
        # Merge all flags into one tensor
        results[1] = batch_tensors(*results[1])
        # Return all results as a tuple
        return tuple(results)
    # If required, set the observations to image observations
    elif not self._state_obs:
        # Get raw pixel observations
        pixels_tuple = self._pixels()
        # Preprocess all observations
        results = [(preprocess_observation(image, self._bit_depth, self._observation_size), t, info)
                   for image, (_, t, info) in zip(pixels_tuple, results)]
        # Add the raw pixels to all info dicts
        for pixels, (_, _, info) in zip(pixels_tuple, results):
            info['pixels'] = pixels
    # Unzip all tuples into 3 tuples containing the observations, flags and info dicts, respectively
    results = [*zip(*results)]
    # Merge all observations into one tensor
    results[0] = batch_tensors(*results[0])
    # Merge all flags into one tensor
    results[1] = batch_tensors(*results[1])
    # Return all results as a tuple
    return tuple(results)
def _batch_samples(samples: tuple) -> tuple:
    """
    Transform a tuple of sample tuples to a single batched sample tuple
    The batch size is the length of the tuple of samples
    :param samples: a tuple of individual samples
                    Each individual sample is a tuple consisting of:
                        - o:  an observation tensor (at time t)      (dtype: float, shape: observation_shape)
                        - a:  an action tensor (at time t)           (dtype: float, shape: action_shape)
                        - r:  a reward tensor (at time t + 1)        (dtype: float, shape: (1,))
                        - o': an observation tensor (at time t + 1)  (dtype: float, shape: observation_shape)
                        - a': an action tensor (at time t + 1)       (dtype: float, shape: action_shape)
    :return: a single sample batch tuple consisting of:
                - o:  an observation tensor (at time t)      (dtype: float, shape: (batch_size,) + observation_shape)
                - a:  an action tensor (at time t)           (dtype: float, shape: (batch_size,) + action_shape)
                - r:  a reward tensor (at time t + 1)        (dtype: float, shape: (batch_size,))
                - o': an observation tensor (at time t + 1)  (dtype: float, shape: (batch_size,) + observation_shape)
                - a': an action tensor (at time t + 1)       (dtype: float, shape: (batch_size,) + action_shape)
    """
    # Separate the samples into five tuples of all (o, a, r, o', a')
    o, a, r, o_, a_ = zip(*samples)
    # For all five tuples, concatenate the entries over one batch dimension
    o, a, r, o_, a_ = tuple(batch_tensors(*ts) for ts in (o, a, r, o_, a_))
    # Return as a single batched sample tuple
    return o, a, r, o_, a_
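The functions in this section rely on a batch_tensors helper that is not defined here. Based on the surrounding docstrings ("concatenate the entries over one batch dimension"), it is assumed to stack tensors along a newly created leading dimension. A minimal sketch under that assumption (the project's actual implementation may differ):

import torch

def batch_tensors(*tensors: torch.Tensor) -> torch.Tensor:
    # Stack the given tensors along a new leading (batch) dimension
    return torch.stack(tensors, dim=0)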
def sample_random_action(self) -> torch.Tensor:
    """
    :return: a batch of uniformly sampled random actions from the action specifications of the environments
             Shape: (batch_size,) + action_shape
    """
    actions = []
    for env in self._envs:
        # Sample a uniformly random action within the bounds of the action specification
        spec = env.action_spec()
        action = np.random.uniform(spec.minimum, spec.maximum, spec.shape)
        actions += [torch.from_numpy(action).to(torch.float32)]
    # Concatenate all actions in a newly created batch dimension
    actions = batch_tensors(*actions)
    return actions
def sample_random_action(self) -> torch.Tensor:
    """
    :return: a uniformly sampled random action from the action space
    """
    actions = [env.action_space.sample() for env in self._envs]
    actions = [torch.from_numpy(a) for a in actions]
    actions = batch_tensors(*actions)
    return actions
def get_rewards_as_tensor(self) -> torch.Tensor:
    """
    Get all rewards in one single tensor
        shape: (episode_length,)
    :return: a torch.FloatTensor containing all obtained rewards
    """
    # Concatenate or 'batch' the rewards along a newly created episode_length dimension
    return batch_tensors(*self.rewards)
def get_actions_as_tensor(self) -> torch.Tensor:
    """
    Get all actions in one single tensor
        shape: (episode_length,) + action_shape
    :return: a torch.FloatTensor containing all performed actions
    """
    # Concatenate or 'batch' the actions along a newly created episode_length dimension
    return batch_tensors(*self.actions)
def get_observations_as_tensor(self) -> torch.Tensor:
    """
    Get all observations in one single tensor
        shape: (episode_length + 1,) + observation_shape
    The +1 is due to the initial observation
    :return: a torch.FloatTensor containing all obtained observations
    """
    # Concatenate or 'batch' the observations along a newly created episode_length dimension
    return batch_tensors(*self.observations)
def as_dataset(self) -> TensorDataset:
    """
    Convert the dataset to a torch.utils.data.TensorDataset
    Iterating through this dataset will give the individual (o, a, r, o', a') samples
    :return: the entire dataset as a torch.utils.data.TensorDataset
    """
    # Separate the dataset into five tuples of all (o, a, r, o', a')
    o, a, r, o_, a_ = zip(*self._data)
    # For all five tuples, concatenate the entries over one batch dimension
    # All tuples are now single tensors containing all data (with the dataset size as batch size)
    o, a, r, o_, a_ = tuple(batch_tensors(*ts) for ts in (o, a, r, o_, a_))
    # Use these tensors to create a TensorDataset
    return TensorDataset(o, a, r, o_, a_)
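For illustration, the resulting TensorDataset can be consumed with a standard torch DataLoader. The sketch below is a hypothetical usage example; the variable name dataset stands for any object exposing as_dataset and is not part of the original code:

from torch.utils.data import DataLoader

# Hypothetical usage: iterate over mini-batches of (o, a, r, o', a') transitions
loader = DataLoader(dataset.as_dataset(), batch_size=32, shuffle=True)
for o, a, r, o_, a_ in loader:
    pass  # e.g. feed the transition batch into a model update step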
def as_episode_dataset(self) -> TensorDataset:
    """
    Get the dataset as a PyTorch TensorDataset containing episodes of data
    Each entry in the dataset is a 5-tuple consisting of:
        - an observation tensor containing all observations obtained during the episode (at some time t)
            shape: (episode_length,) + observation_shape
        - an action tensor containing all actions performed during the episode (at some time t)
            shape: (episode_length,) + action_shape
        - a reward tensor containing all rewards obtained during the episode (at some time t + 1)
            shape: (episode_length,)
        - an observation tensor containing all subsequent observations obtained during the episode (at some time t + 1)
            shape: (episode_length,) + observation_shape
        - an action tensor containing all subsequent actions performed during the episode (at some time t + 1)
            shape: (episode_length,) + action_shape
    The two observation tensors and two action tensors have shared storage
    All tensors are ordered in the way that the data was collected
    :return: a TensorDataset containing all episode data
    """
    # Group the episode data
    episode_data = []
    for episode in self._data:
        # Get all episode data in single tensors
        observations = episode.get_observations_as_tensor()
        actions = episode.get_actions_as_tensor()
        rewards = episode.get_rewards_as_tensor()
        num_samples = rewards.size(0) - 1
        # Slice the corresponding tensors
        o = observations.narrow(dim=0, start=0, length=num_samples)
        a = actions.narrow(dim=0, start=0, length=num_samples)
        r = rewards.narrow(dim=0, start=0, length=num_samples)
        o_ = observations.narrow(dim=0, start=1, length=num_samples)
        a_ = actions.narrow(dim=0, start=1, length=num_samples)
        # Preprocess the images
        o = preprocess_observation_tensor(o, self._bit_depth)
        o_ = preprocess_observation_tensor(o_, self._bit_depth)
        episode_data.append((o, a, r, o_, a_))
    # Concatenate all episode data over a new dataset dimension
    tensors = [batch_tensors(*ts) for ts in zip(*episode_data)]
    # Use these tensors to create a TensorDataset
    return TensorDataset(*tensors)
def reset(self, no_observation: bool = False) -> tuple:
    """
    Set the batch of environments to their initial states
    :param no_observation: When set to True, the reset function does not return an observation (None)
                           This option exists, since the planner does not require observations,
                           but rendering slows it down a lot
    :return: a 3-tuple consisting of:
                - a torch.Tensor containing a batch of initial observations
                    Shape: (batch_size,) + observation_shape
                - a torch.Tensor containing a batch of boolean flags indicating whether the environments terminated
                - a tuple of dicts possibly containing additional information for each environment
    """
    # Set the internal time step counter to 0
    self._t = 0
    # Get all initial observations by resetting the environment batch
    observations = [env.reset() for env in self._envs]
    # Create a flag tensor
    flags = torch.zeros(self.batch_size, dtype=torch.bool)
    # Create an info dict for each environment
    infos = tuple(dict() for _ in range(len(self._envs)))
    # Don't return an observation if the no_observation flag is set
    if no_observation:
        return None, flags, infos
    # If required, set the observations to image observations
    elif not self._state_obs:
        # Get raw pixel observations of the environments
        pixels_tuple = self._pixels()
        # Preprocess the image observations
        observations = [preprocess_observation(o, self._bit_depth, self._observation_size) for o in pixels_tuple]
        # Add the raw pixels to the info dicts
        for info, pixels in zip(infos, pixels_tuple):
            info['pixels'] = pixels
    # Concatenate all observations in a newly created batch dimension
    # Results in a single observation tensor of shape: (batch_size,) + observation_shape
    observations = batch_tensors(*observations)  # TODO -- cast states to tensors!
    # Return the results
    return observations, flags, infos
def __getitem__(self, index):
    """
    Get the data of a single episode as a 5-tuple of tensors (o, a, r, o', a')
    """
    episode = self._data[index]
    # Get all episode data in single tensors
    observations = episode.get_observations_as_tensor()
    actions = episode.get_actions_as_tensor()
    rewards = episode.get_rewards_as_tensor()
    num_samples = rewards.size(0) - 1
    # Slice the corresponding tensors
    o = observations.narrow(dim=0, start=0, length=num_samples)
    a = actions.narrow(dim=0, start=0, length=num_samples)
    r = rewards.narrow(dim=0, start=0, length=num_samples)
    o_ = observations.narrow(dim=0, start=1, length=num_samples)
    a_ = actions.narrow(dim=0, start=1, length=num_samples)
    # Preprocess the images
    o = preprocess_observation_tensor(o, self._bit_depth)
    o_ = preprocess_observation_tensor(o_, self._bit_depth)
    return tuple(batch_tensors(*ts) for ts in (o, a, r, o_, a_))
def step(self, action: torch.Tensor, no_observation: bool = False) -> tuple:
    """
    Perform an action in the environment. Returns a reward and observation
    :param action: a Tensor representation of the action that should be performed in the environment
                   Shape: (batch_size,) + action_shape
    :param no_observation: When set to True, the step function does not return an observation (None)
                           This option exists, since the planner does not require observations,
                           but rendering slows it down a lot
    :return: a 4-tuple consisting of:
                - a torch.Tensor observation
                    Shape: (batch_size,) + observation_shape
                - a torch.Tensor reward
                    Shape: (batch_size,)
                - a torch.Tensor boolean flag indicating whether the environment has terminated
                    Shape: (batch_size,)
                - a tuple of dicts possibly containing additional information
    """
    # Increment the internal time step counter
    self._t += 1
    # Convert the tensor to suitable input
    action = action.detach().numpy()
    # Execute the actions in the environments
    results = [env.step(a) for a, env in zip(action, self._envs)]
    # Process the control suite results
    results = [self._process_result(result) for result in results]
    # Don't return an observation if the no_observation flag is set
    if no_observation:
        # Unzip all tuples into 4 tuples containing the observations, rewards, flags and info dicts, respectively
        results = [*zip(*results)]
        # No observation is returned
        results[0] = None
        # Merge all rewards into one tensor
        results[1] = batch_tensors(*results[1])
        # Merge all flags into one tensor
        results[2] = batch_tensors(*results[2])
        # Return all results as a tuple
        return tuple(results)
    # If required, set the observations to image observations
    elif not self._state_obs:
        # Get raw pixels from all environments
        pixels_tuple = self._pixels()
        # Convert them to suitable observations
        observations = [preprocess_observation(o, self._bit_depth, self._observation_size) for o in pixels_tuple]
        # Merge the observations into the results
        results = [(o,) + result[1:] for o, result in zip(observations, results)]
        # Add all raw pixel observations to the info dicts
        for result, pixels in zip(results, pixels_tuple):
            result[3]['pixels'] = pixels
    # Unzip all tuples into 4 tuples containing the observations, rewards, flags and info dicts, respectively
    results = [*zip(*results)]
    # Merge all observations into one tensor
    results[0] = batch_tensors(*results[0])
    # Merge all rewards into one tensor
    results[1] = batch_tensors(*results[1])
    # Merge all flags into one tensor
    results[2] = batch_tensors(*results[2])
    # Check the max episode length condition. Update the flags if required
    if self._t >= self._max_t:
        # Set all flags to True if the maximum episode length is reached
        results[2] |= True
    # Return all results as a tuple
    return tuple(results)
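For context, a minimal interaction-loop sketch with the batched environment wrapper. The instance name env_batch is illustrative only; the call pattern follows the reset, sample_random_action and step signatures shown above:

# Hypothetical interaction loop with a batched environment wrapper instance `env_batch`
observations, flags, infos = env_batch.reset()
while not flags.all():
    # Sample a random action batch and advance every environment by one step
    action = env_batch.sample_random_action()
    observations, rewards, flags, infos = env_batch.step(action)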
def _compute_loss(self, model: RSSM, episode: tuple) -> tuple:
    """
    Compute the loss of the RSSM over a single batch of episodes
    :param model: the RSSM model
    :param episode: a five-tuple consisting of:
                - a tensor containing all observations obtained during the episode
                    shape: (batch_size, episode_length,) + observation_shape
                - a tensor containing all actions performed during the episode
                    shape: (batch_size, episode_length, action_size)
                - a tensor containing all rewards obtained during the episode
                    shape: (batch_size, episode_length)
                - a tensor containing all subsequent observations obtained during the episode
                  (has shared storage with the first observation tensor)
                    shape: (batch_size, episode_length,) + observation_shape
                - a tensor containing all subsequent actions performed during the episode
                  (has shared storage with the first action tensor)
                    shape: (batch_size, episode_length, action_size)
    :return: a two-tuple containing:
                - a loss tensor
                    shape: (1,)
                - a dict containing info about the loss computation
    """
    # Keep an info dict
    info = {}
    # Get the batch size used
    batch_size = episode[0].size(0)
    # Reset the RSSM initial state
    model.reset(batch_size=batch_size)
    # Keep the losses of all time steps, to be averaged later
    reward_losses = []
    observation_losses = []
    belief_losses = []
    value_losses = []
    # Switch the episode and batch dimensions
    episode = tuple(Trainer._switch_dims(xs) for xs in episode)
    # Apply state-action augmentations
    for aug in self._state_action_augmentations:
        episode = from_keyword(aug)(*episode)
    # Apply data augmentation to the observation tensor
    o_augmented = episode[3]  # Shape: (T, batch_size,) + observation_shape
    for aug in self._data_augmentations:
        o_augmented = from_keyword(aug)(o_augmented)  # Shape: (T, batch_size,) + augmented_observation_shape
    # Simulate the trajectory in the model
    for t, (o, a, r, o_, a_, o_aug) in enumerate(zip(*episode, o_augmented)):
        # Optionally, get a value function estimate
        predicted_v = model.state_action_value(a) if isinstance(model, ERSSM) else None
        # Get prediction (distributions) from the environment model
        predicted_o_, predicted_r, predicted_s, _, _ = model.simulate_step(a)
        # Only the observation sample is required, omit the distribution parameters
        predicted_o_, _, _ = predicted_o_
        # Only the reward sample is required, omit the distribution parameters
        predicted_r, _, _ = predicted_r
        # Get the belief distribution parameters
        predicted_s, state_prior_mean, state_prior_std = predicted_s
        # Compute the reward loss
        reward_loss = F.mse_loss(predicted_r, r)
        reward_losses.append(reward_loss)
        # Compute the observation loss
        observation_loss = F.mse_loss(predicted_o_, o_, reduction='none').sum(dim=(1, 2, 3))
        observation_losses.append(observation_loss)
        # Get the prior belief distribution
        prior = Normal(state_prior_mean, state_prior_std)
        # Get an estimate of the posterior belief using the encoder
        _, state_posterior_mean, state_posterior_std = model.posterior_state_belief(o_aug)
        posterior = Normal(state_posterior_mean, state_posterior_std)
        # Allowed deviation in KL divergence
        free_nats = torch.ones(1, dtype=torch.float32, device=o.device) * self._kl_free_nats
        # Compute the KL loss
        belief_loss = kl_divergence(posterior, prior).sum(dim=1)
        # Bound it from below by the free nats
        belief_loss = torch.max(belief_loss, free_nats)
        # Add to all losses
        belief_losses.append(belief_loss)
        # Optionally, train the value function
        if isinstance(model, ERSSM):
            # Compute the target by bootstrapping
            with torch.no_grad():
                value_target = r + model.state_action_value(a_)
            # Compute the value loss
            value_loss = F.mse_loss(predicted_v, value_target)
            value_losses.append(value_loss)
    # Compute the total loss components
    reward_loss = torch.mean(batch_tensors(*reward_losses)) * self._c_r_loss
    observation_loss = torch.mean(batch_tensors(*observation_losses)) * self._c_o_loss
    belief_loss = torch.mean(batch_tensors(*belief_losses)) * self._c_kl_loss
    if len(value_losses) > 0:
        value_loss = torch.mean(batch_tensors(*value_losses)) * self._c_v_loss
    else:
        value_loss = torch.zeros(1, device=reward_loss.device)
    # Compute the total loss
    loss = reward_loss + observation_loss + belief_loss + value_loss
    # Store relevant information in the info dict
    info['reward_loss'] = reward_loss.item()
    info['observation_loss'] = observation_loss.item()
    info['belief_loss'] = belief_loss.item()
    info['value_loss'] = value_loss.item()
    info['loss'] = loss.item()

    return loss, info
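In summary, _compute_loss optimizes a weighted sum of per-time-step averages; paraphrasing the code above (the free-nats bound is applied to the KL term before averaging, and the value term is only present for ERSSM models):

    loss = c_r  * mean_t( MSE(predicted_r, r) )
         + c_o  * mean_t( MSE(predicted_o', o') summed over image dims )
         + c_kl * mean_t( max( KL(posterior || prior), free_nats ) )
         + c_v  * mean_t( MSE(predicted_v, r + V(s', a')) )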