def _preprocess_trajectories(self, trajectories):
  (_, reward_mask, observations, actions, rewards, infos) = (
      ppo.pad_trajectories(trajectories, boundary=self._max_timestep))
  assert self.train_env.observation_space.shape == observations.shape[2:]
  if not self._serialized_sequence_policy:
    # Add one timestep at the end, so it's compatible with
    # self._rewards_to_actions.
    pad_width = ((0, 0), (0, 1)) + ((0, 0),) * (actions.ndim - 2)
    actions = np.pad(actions, pad_width)
    actions = np.reshape(actions, (actions.shape[0], -1))
  else:
    (observations, actions) = self._serialize_trajectories(
        observations, actions, reward_mask)
  return (observations, actions, rewards, reward_mask, infos)

def _preprocess_trajectories(self, trajectories):
  (_, reward_mask, observations, actions, rewards, infos) = (
      ppo.pad_trajectories(trajectories, boundary=self._max_timestep))
  # Zero out any rewards that fall outside the env's declared reward range.
  # (jax.ops.index_update was removed in newer JAX releases; the equivalent
  # there is rewards.at[outside].set(0).)
  (low, high) = self.train_env.reward_range
  outside = np.logical_or(rewards < low, rewards > high)
  rewards = jax.ops.index_update(rewards, jax.ops.index[outside], 0)
  assert self.train_env.observation_space.shape == observations.shape[2:]
  if self._policy_and_value_vocab_size is None:
    # Add one timestep at the end, so it's compatible with
    # self._rewards_to_actions.
    pad_width = ((0, 0), (0, 1)) + ((0, 0),) * (actions.ndim - 2)
    actions = np.pad(actions, pad_width)
    actions = np.reshape(actions, (actions.shape[0], -1))
  else:
    (observations, actions) = self._serialize_trajectories(
        observations, actions, reward_mask)
  return (observations, actions, rewards, reward_mask, infos)

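# A minimal self-contained sketch (plain NumPy, hypothetical shapes; not part
# of the trainer above) of the pad-and-flatten step in the vocab-less branch:
# one timestep is appended on the time axis so the actions line up with
# self._rewards_to_actions, then everything after the batch dim is flattened.
import numpy as onp


def _pad_and_flatten_actions_sketch():
  actions = onp.zeros((2, 3), dtype=onp.int32)  # (batch, time)
  pad_width = ((0, 0), (0, 1)) + ((0, 0),) * (actions.ndim - 2)
  padded = onp.pad(actions, pad_width, mode='constant')  # -> (2, 4)
  return onp.reshape(padded, (padded.shape[0], -1))  # -> (2, 4)
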
def test_pad_trajectories(self):
  observation_shape = (2, 3, 4)
  trajectories = []
  n_trajectories = 7
  n_actions = 10

  # Time-steps are between [min_allowable_time_step, max_allowable_time_step]
  max_allowable_time_step = 19
  min_allowable_time_step = 5

  # The actual max we see in the data.
  max_time_step = -1

  # Bucket length.
  bucket_length = 15

  # Make `n_trajectories` random trajectories.
  for i in range(n_trajectories):
    time_steps = np.random.randint(min_allowable_time_step,
                                   max_allowable_time_step + 1)
    if time_steps > max_time_step:
      max_time_step = time_steps
    observations = np.random.randint(
        0, 255, size=(time_steps + 1,) + observation_shape).astype(np.uint8)
    rewards = np.random.uniform(size=(time_steps,)).astype(np.float32)
    actions = np.random.randint(
        0, n_actions, size=(time_steps,)).astype(np.int32)
    infos = {
        'a': np.random.uniform(size=(time_steps,)).astype(np.float32),
        'b': np.random.uniform(size=(time_steps,)).astype(np.float32)
    }
    trajectories.append((observations, rewards, actions, infos))

  # Now pad these trajectories.
  padded_trajectories = ppo.pad_trajectories(
      trajectories, boundary=bucket_length)

  # Expected padding.
  i = 1
  while i * bucket_length < max_time_step:
    i += 1
  expected_padding = i * bucket_length

  # Get the padded objects.
  (pad_lengths, reward_mask, padded_observations, padded_actions,
   padded_rewards, padded_infos) = padded_trajectories

  # Expectations on the padded shapes.
  self.assertEqual(padded_observations.shape, (
      n_trajectories,
      expected_padding + 1,
  ) + observation_shape)
  self.assertEqual(padded_actions.shape, (n_trajectories, expected_padding))
  self.assertEqual(padded_rewards.shape, (n_trajectories, expected_padding))
  self.assertEqual(reward_mask.shape, (n_trajectories, expected_padding))
  self.assertEqual(padded_infos['a'].shape,
                   (n_trajectories, expected_padding))
  self.assertEqual(padded_infos['b'].shape,
                   (n_trajectories, expected_padding))

  # Assert that the padding lengths and reward mask are consistent.
  self.assertAllEqual(
      np.full((n_trajectories,), expected_padding),
      np.array(np.sum(reward_mask, axis=1)) + pad_lengths)

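# A small sketch (hypothetical numbers) of what the `while` loop in the test
# above computes: for positive lengths, the expected padding is the smallest
# multiple of the bucket boundary that is >= the longest trajectory, i.e. a
# ceiling division.
import math


def _expected_padding_sketch(max_time_step=19, bucket_length=15):
  expected_padding = (
      int(math.ceil(max_time_step / bucket_length)) * bucket_length)
  assert expected_padding == 30  # 19 steps round up to two 15-step buckets.
  return expected_padding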