def test_step(self):
  bt = trajectory.BatchTrajectory(batch_size=self.BATCH_SIZE)
  indices = np.arange(self.BATCH_SIZE)
  observations, _, _, _ = self.get_random_observations_rewards_actions_dones()

  # Have to call reset first.
  bt.reset(indices, observations)

  # Create some fake data for calling step.
  new_observations, raw_rewards, actions, dones = (
      self.get_random_observations_rewards_actions_dones())
  processed_rewards = raw_rewards.astype(np.int64)

  # Force mark the first one as done anyway, so that there is something to
  # test.
  dones[0] = True

  num_done = sum(dones)
  self.assertLessEqual(1, num_done)  # i.e. num_done is at least 1.

  num_not_done = len(dones) - num_done

  # Finally call step.
  bt.step(new_observations, raw_rewards, processed_rewards, dones, actions)

  # Expect to see `num_done` number of completed trajectories.
  self.assertEqual(num_done, bt.num_completed_trajectories)

  # Expect to see that the rest are marked as active.
  num_active = sum(t.is_active for t in bt.trajectories)
  self.assertEqual(num_not_done, num_active)

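# The tests in this listing lean on a `get_random_observations_rewards_actions_dones`
# helper that isn't shown here. Below is a minimal sketch of what it plausibly
# looks like, inferred from how the tests consume its return values (random
# observations via np.random.rand, float rewards/actions, dones derived from
# the rewards); the exact implementation lives on the test class.
def get_random_observations_rewards_actions_dones(self, batch_size=None):
  if batch_size is None:
    batch_size = self.BATCH_SIZE
  # Observations of shape (batch_size,) + OBSERVATION_SHAPE.
  observations = np.random.rand(*((batch_size,) + self.OBSERVATION_SHAPE))
  # Random float rewards and actions; dones is a mutable boolean array.
  raw_rewards = np.random.randn(batch_size)
  actions = np.random.randn(batch_size)
  dones = raw_rewards > 0.5
  return observations, raw_rewards, actions, dones
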
def initialize_environments(self, batch_size=1, parallelism=1, **kwargs):
  """Initializes the environments.

  Args:
    batch_size: (int) Number of `self.base_env_name` envs to initialize.
    parallelism: (int) If this is greater than one, then we run the envs in
      parallel using multi-threading.
    **kwargs: (dict) Kwargs to pass to gym.make.
  """
  assert batch_size >= 1
  self._envs = [
      gym.make(self.base_env_name, **kwargs) for _ in range(batch_size)
  ]
  self._parallelism = parallelism
  self._pool = multiprocessing.pool.ThreadPool(self._parallelism)
  if self._env_wrapper_fn is not None:
    self._envs = list(map(self._env_wrapper_fn, self._envs))

  self._verify_same_spaces()

  # If self.reward_range is None, it means that we should take the reward
  # range of the env.
  if self.reward_range is None:
    self._reward_range = self._envs[0].reward_range

  # This data structure stores the history of each env.
  #
  # NOTE: Even if the env is a NN and can step all batches concurrently, it
  # is still valuable to store the trajectories separately.
  self._trajectories = trajectory.BatchTrajectory(batch_size=batch_size)

def initialize_environments(self, batch_size=1, max_episode_steps=-1,
                            max_and_skip_env=False):
  """Initializes the environments and trajectories.

  Subclasses can override this if they don't want the default implementation,
  which initializes `batch_size` environments, but they must take care to
  initialize self._trajectories (this is checked in __init__ anyway).

  Args:
    batch_size: (int) Number of `self.base_env_name` envs to initialize.
    max_episode_steps: (int) Passed on to `gym_utils.make_gym_env`.
    max_and_skip_env: (boolean) Passed on to `gym_utils.make_gym_env`.
  """
  assert batch_size >= 1
  self._batch_size = batch_size

  # pylint: disable=g-complex-comprehension
  self._envs = [
      gym_utils.make_gym_env(
          self.base_env_name,
          rl_env_max_episode_steps=max_episode_steps,
          maxskip_env=max_and_skip_env) for _ in range(batch_size)
  ]

  # If self.observation_space and self.action_space aren't None, then this is
  # a re-initialization of this class; in that case make sure the spaces
  # match our previous behaviour.
  if self._observation_space:
    assert str(self._observation_space) == str(
        self._envs[0].observation_space)
  else:
    # This means that we are initializing this class for the first time.
    #
    # We set this equal to the first env's observation space; later on we'll
    # verify that all envs have the same observation space.
    self._observation_space = self._envs[0].observation_space

  # Similarly for action_space.
  if self._action_space:
    assert str(self._action_space) == str(self._envs[0].action_space)
  else:
    self._action_space = self._envs[0].action_space

  self._verify_same_spaces()

  # If self.reward_range is None, it means that we should take the reward
  # range of the env.
  if self.reward_range is None:
    self._reward_range = self._envs[0].reward_range

  # This data structure stores the history of each env.
  #
  # NOTE: Even if the env is a NN and can step all batches concurrently, it
  # is still valuable to store the trajectories separately.
  self._trajectories = trajectory.BatchTrajectory(batch_size=batch_size)

def initialize_environments(self, batch_size=1, parallelism=1, **env_kwargs):
  """Initializes the environments and trajectories.

  Subclasses can override this if they don't want the default implementation,
  which initializes `batch_size` environments, but they must take care to
  initialize self._trajectories (this is checked in __init__ anyway).

  Args:
    batch_size: (int) Number of `self.base_env_name` envs to initialize.
    parallelism: (int) If this is greater than one, then we run the envs in
      parallel using multi-threading.
    **env_kwargs: (dict) Kwargs to pass to gym.make.
  """
  assert batch_size >= 1
  self._batch_size = batch_size

  self._envs = [
      gym.make(self.base_env_name, **env_kwargs) for _ in range(batch_size)
  ]
  self._parallelism = parallelism
  self._pool = multiprocessing.pool.ThreadPool(self._parallelism)
  if self._env_wrapper_fn is not None:
    self._envs = list(map(self._env_wrapper_fn, self._envs))

  # If self.observation_space and self.action_space aren't None, then this is
  # a re-initialization of this class; in that case make sure the spaces
  # match our previous behaviour.
  if self._observation_space:
    assert str(self._observation_space) == str(
        self._envs[0].observation_space)
  else:
    # This means that we are initializing this class for the first time.
    #
    # We set this equal to the first env's observation space; later on we'll
    # verify that all envs have the same observation space.
    self._observation_space = self._envs[0].observation_space

  # Similarly for action_space.
  if self._action_space:
    assert str(self._action_space) == str(self._envs[0].action_space)
  else:
    self._action_space = self._envs[0].action_space

  self._verify_same_spaces()

  # If self.reward_range is None, it means that we should take the reward
  # range of the env.
  if self.reward_range is None:
    self._reward_range = self._envs[0].reward_range

  # This data structure stores the history of each env.
  #
  # NOTE: Even if the env is a NN and can step all batches concurrently, it
  # is still valuable to store the trajectories separately.
  self._trajectories = trajectory.BatchTrajectory(batch_size=batch_size)

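# The step method itself isn't part of this listing; below is a hedged sketch
# of how the thread pool set up above might be used to step the envs in
# parallel. The method name `_step_parallel` and the per-env tuple unpacking
# are assumptions for illustration, not the library's actual API.
def _step_parallel(self, actions):
  def step_one_env(env_and_action):
    env, action = env_and_action
    return env.step(action)
  # ThreadPool.map blocks until every env has stepped, so callers get the
  # full batch of (observation, reward, done, info) tuples back at once.
  return self._pool.map(step_one_env, zip(self._envs, actions))
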
def test_desired_placement_of_rewards_and_actions(self):
  batch_size = 1
  bt = trajectory.BatchTrajectory(batch_size=batch_size)
  indices = np.arange(batch_size)
  observations, _, _, _ = self.get_random_observations_rewards_actions_dones(
      batch_size=batch_size)

  # Have to call reset first.
  bt.reset(indices, observations)

  # Create some fake data for calling step.
  new_observations, raw_rewards, actions, _ = (
      self.get_random_observations_rewards_actions_dones(
          batch_size=batch_size))
  processed_rewards = raw_rewards.astype(np.int64)
  dones = np.full(batch_size, False)

  # Call step.
  bt.step(new_observations, raw_rewards, processed_rewards, dones, actions)

  # Assert that nothing is done, since `dones` is all False.
  self.assertEqual(0, bt.num_completed_trajectories)

  # The only trajectory is active.
  self.assertEqual(batch_size, len(bt.trajectories))
  t = bt.trajectories[0]
  self.assertTrue(t.is_active)
  self.assertEqual(2, t.num_time_steps)

  ts = t.time_steps

  # Now assert on placements, i.e. the old observation/done comes first and
  # the new one comes later.
  self.assertAllEqual(observations[0], ts[0].observation)
  self.assertAllEqual(new_observations[0], ts[1].observation)
  self.assertEqual(False, ts[0].done)
  self.assertEqual(False, ts[1].done)

  # Similarly, the action went to the first time-step.
  self.assertEqual(actions[0], ts[0].action)
  self.assertIsNone(ts[1].action)

  # However, make sure the reward went into the second time-step and not the
  # first.
  self.assertNear(raw_rewards[0], ts[1].raw_reward, 1e-6)
  self.assertIsNone(ts[0].raw_reward)

  # Similarly with processed_rewards.
  self.assertEqual(processed_rewards[0], ts[1].processed_reward)
  self.assertIsNone(ts[0].processed_reward)

def initialize_environments(self,
                            batch_size=1,
                            parallelism=1,
                            per_env_kwargs=None,
                            **kwargs):
  """Initializes the environments.

  Args:
    batch_size: (int) Number of `self.base_env_name` envs to initialize.
    parallelism: (int) If this is greater than one, then we run the envs in
      parallel using multi-threading.
    per_env_kwargs: (list or None) An optional list of dictionaries to pass
      to gym.make. If not None, its length should match `batch_size`.
    **kwargs: (dict) Kwargs to pass to gym.make.
  """
  assert batch_size >= 1

  if per_env_kwargs is not None:
    assert batch_size == len(per_env_kwargs)
  else:
    per_env_kwargs = [{} for _ in range(batch_size)]

  # By now `per_env_kwargs` is a list of `batch_size` dictionaries; the
  # individual dictionaries may be empty.

  def union_dicts(dict1, dict2):
    """Unions `dict1` and `dict2`, with `dict2` taking precedence."""
    copy_dict1 = copy.copy(dict1)
    copy_dict1.update(dict2)
    return copy_dict1

  self._envs = [
      gym.make(self.base_env_name, **union_dicts(kwargs, env_kwarg))
      for env_kwarg in per_env_kwargs
  ]
  self._parallelism = parallelism
  self._pool = multiprocessing.pool.ThreadPool(self._parallelism)
  if self._env_wrapper_fn is not None:
    self._envs = list(map(self._env_wrapper_fn, self._envs))

  self._verify_same_spaces()

  # If self.reward_range is None, it means that we should take the reward
  # range of the env.
  if self.reward_range is None:
    self._reward_range = self._envs[0].reward_range

  # This data structure stores the history of each env.
  #
  # NOTE: Even if the env is a NN and can step all batches concurrently, it
  # is still valuable to store the trajectories separately.
  self._trajectories = trajectory.BatchTrajectory(batch_size=batch_size)

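# A small usage note on the kwargs merging above: per-env kwargs take
# precedence over the shared **kwargs because `dict2` is applied last in
# `union_dicts`. A self-contained sketch of that behaviour (the kwarg names
# are made up for illustration):
import copy

def _union_dicts(dict1, dict2):  # same logic as the nested helper above
  merged = copy.copy(dict1)
  merged.update(dict2)
  return merged

assert _union_dicts({"frameskip": 4, "difficulty": 0},
                    {"frameskip": 1}) == {"frameskip": 1, "difficulty": 0}
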
def test_reset_all(self):
  bt = trajectory.BatchTrajectory(batch_size=self.BATCH_SIZE)
  indices = np.arange(self.BATCH_SIZE)
  observations, _, _, _ = self.get_random_observations_rewards_actions_dones()

  # Call reset.
  bt.reset(indices, observations)

  # Assert that all trajectories are active and not done (reset never marks
  # anything as done).
  self.assertTrue(all(t.is_active for t in bt.trajectories))
  self.assertEqual(0, bt.num_completed_trajectories)

def test_reset_all(self):
  bt = trajectory.BatchTrajectory(batch_size=self.BATCH_SIZE)
  indices = np.arange(self.BATCH_SIZE)
  observations = np.random.rand(*((self.BATCH_SIZE,) +
                                  self.OBSERVATION_SHAPE))

  # Call reset.
  bt.reset(indices, observations)

  # Assert that all trajectories are active and not done (reset never marks
  # anything as done).
  self.assertTrue(all(t.is_active() for t in bt.trajectories))
  self.assertEqual(0, len(bt.completed_trajectories))

def initialize(self, batch_size=1, **kwargs):
  self.initialize_environments(batch_size=batch_size, **kwargs)
  self._batch_size = batch_size

  # This data structure stores the history of each env.
  #
  # NOTE: Even if the env is a NN and can step all batches concurrently, it
  # is still valuable to store the trajectories separately.
  self._trajectories = trajectory.BatchTrajectory(batch_size=batch_size)

  # Assert that *all* of the above are now set; we should do this since
  # subclasses can override `initialize_environments`.
  self.assert_common_preconditions()
  assert self.observation_space is not None
  assert self.action_space is not None
  assert self.reward_range is not None

def test_reset_some(self):
  bt = trajectory.BatchTrajectory(batch_size=self.BATCH_SIZE)
  indices = np.arange(self.BATCH_SIZE // 2)
  observations, _, _, _ = self.get_random_observations_rewards_actions_dones(
      batch_size=self.BATCH_SIZE // 2)

  # Just reset the first half.
  bt.reset(indices, observations)

  # So the first half are active, the rest aren't.
  self.assertTrue(
      all(t.is_active for t in bt.trajectories[:self.BATCH_SIZE // 2]))
  self.assertTrue(
      all(not t.is_active for t in bt.trajectories[self.BATCH_SIZE // 2:]))

  # Nothing is done anyway.
  self.assertEqual(0, bt.num_completed_trajectories)

def test_reset_some(self):
  bt = trajectory.BatchTrajectory(batch_size=self.BATCH_SIZE)
  indices = np.arange(self.BATCH_SIZE // 2)
  observations = np.random.rand(*((self.BATCH_SIZE // 2,) +
                                  self.OBSERVATION_SHAPE))

  # Just reset the first half.
  bt.reset(indices, observations)

  # So the first half are active, the rest aren't.
  self.assertTrue(
      all(t.is_active() for t in bt.trajectories[:self.BATCH_SIZE // 2]))
  self.assertTrue(
      all(not t.is_active() for t in bt.trajectories[self.BATCH_SIZE // 2:]))

  # Nothing is done anyway.
  self.assertEqual(0, len(bt.completed_trajectories))

def test_truncate(self):
  batch_size = 1
  bt = trajectory.BatchTrajectory(batch_size=batch_size)
  indices = np.arange(batch_size)
  observations, _, _, _ = self.get_random_observations_rewards_actions_dones(
      batch_size=batch_size)

  # Have to call reset first.
  bt.reset(indices, observations)

  # Take a few steps.
  ts = 5
  for _ in range(ts):
    (observations, rewards, actions,
     dones) = self.get_random_observations_rewards_actions_dones(
         batch_size=batch_size)
    dones[...] = False
    bt.step(observations, rewards, rewards, dones, actions)

  self.assertEqual(0, bt.num_completed_trajectories)

  num_to_keep = 2
  bt.truncate_trajectories(indices, num_to_keep=num_to_keep)

  self.assertEqual(batch_size, bt.num_completed_trajectories)

  # Assert that they are all active, since the last `num_to_keep`
  # observations were duplicated into new active trajectories.
  self.assertTrue(all(t.is_active for t in bt.trajectories))

  orig_obs = bt.completed_trajectories[0].observations_np
  # + 1 because of the initial reset.
  self.assertEqual(ts + 1, orig_obs.shape[0])

  trunc_obs = bt.trajectories[0].observations_np
  self.assertEqual(num_to_keep, trunc_obs.shape[0])
  self.assertEqual(num_to_keep, bt.trajectories[0].num_time_steps)

  # Test that the observations are the same.
  self.assertAllEqual(orig_obs[-num_to_keep:, ...], trunc_obs)

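# A self-contained sketch of the truncation semantics asserted above: the old
# trajectory moves to the completed set in full, and a new active trajectory
# is seeded with copies of its last `num_to_keep` observations. The import
# path and the (1, 4) observation shape are assumptions for illustration.
import numpy as np
from tensor2tensor.envs import trajectory  # assumed import path

bt = trajectory.BatchTrajectory(batch_size=1)
bt.reset(np.arange(1), np.random.rand(1, 4))
for _ in range(5):
  bt.step(np.random.rand(1, 4), np.zeros(1), np.zeros(1, dtype=np.int64),
          np.full(1, False), np.zeros(1))
bt.truncate_trajectories(np.arange(1), num_to_keep=2)
# The finished trajectory keeps all 5 + 1 observations, while the new active
# one starts from the last 2, so stepping can resume with recent context.
assert bt.completed_trajectories[0].observations_np.shape[0] == 6
assert bt.trajectories[0].observations_np.shape[0] == 2
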
def test_step(self):
  bt = trajectory.BatchTrajectory(batch_size=self.BATCH_SIZE)
  indices = np.arange(self.BATCH_SIZE)
  observations = np.random.rand(*((self.BATCH_SIZE,) +
                                  self.OBSERVATION_SHAPE))

  # Have to call reset first.
  bt.reset(indices, observations)

  # Create some fake data for calling step.
  new_observations = np.random.rand(*((self.BATCH_SIZE,) +
                                      self.OBSERVATION_SHAPE))
  raw_rewards = actions = np.random.randn(self.BATCH_SIZE)
  processed_rewards = raw_rewards.astype(np.int64)
  dones = raw_rewards > 0.5

  # Force mark the first one as done anyway, so that there is something to
  # test.
  dones[0] = True

  num_done = sum(dones)
  self.assertLessEqual(1, num_done)  # i.e. num_done is at least 1.

  num_not_done = len(dones) - num_done

  # Finally call step.
  bt.step(new_observations, raw_rewards, processed_rewards, dones, actions)

  # Expect to see `num_done` number of completed trajectories.
  self.assertEqual(num_done, len(bt.completed_trajectories))

  # Expect to see that the rest are marked as active.
  num_active = sum(t.is_active() for t in bt.trajectories)
  self.assertEqual(num_not_done, num_active)

def test_observations_np(self):
  bt = trajectory.BatchTrajectory(batch_size=self.BATCH_SIZE)
  indices = np.arange(self.BATCH_SIZE)
  observations, _, _, _ = self.get_random_observations_rewards_actions_dones()

  # Have to call reset first.
  bt.reset(indices, observations)

  # Number of time-steps now looks like the following:
  # (1, 1, 1, 1, 1, 1, 1, 1, 1, 1)
  lengths = np.full((self.BATCH_SIZE,), 1)

  ts = 5
  for _ in range(ts):
    (observations, rewards, actions,
     dones) = self.get_random_observations_rewards_actions_dones()
    dones[...] = False
    bt.step(observations, rewards, rewards, dones, actions)

  # Number of time-steps now looks like the following:
  # (6, 6, 6, 6, 6, 6, 6, 6, 6, 6)
  lengths = lengths + ts

  # Now let's mark the first two as done.
  observations, _, _, _ = self.get_random_observations_rewards_actions_dones(
      batch_size=2)
  bt.reset(np.array([0, 1]), observations)

  # Number of time-steps now looks like the following:
  # (1, 1, 6, 6, 6, 6, 6, 6, 6, 6)
  lengths[0] = lengths[1] = 1

  for _ in range(ts):
    (observations, rewards, actions,
     dones) = self.get_random_observations_rewards_actions_dones()
    dones[...] = False
    bt.step(observations, rewards, rewards, dones, actions)

  # Number of time-steps now looks like the following:
  # (6, 6, 11, 11, 11, 11, 11, 11, 11, 11)
  lengths = lengths + ts

  boundary = 20
  len_history_for_policy = 40
  padded_obs_np, padded_lengths = bt.observations_np(
      boundary=boundary, len_history_for_policy=len_history_for_policy)

  # The lengths are what we expect them to be.
  self.assertAllEqual(lengths, padded_lengths)

  # The padded observations have the shape we expect them to have.
  self.assertEqual((self.BATCH_SIZE, boundary + 1) + self.OBSERVATION_SHAPE,
                   padded_obs_np.shape)

  # Let's now request the last n steps of history, for n in [1, 2 * boundary).
  for len_history_for_policy in range(1, 2 * boundary):
    # The expected lengths will now be:
    truncated_lengths = [min(l, len_history_for_policy) for l in lengths]

    padded_obs_np, padded_lengths = bt.observations_np(
        boundary=boundary, len_history_for_policy=len_history_for_policy)

    self.assertAllEqual(truncated_lengths, padded_lengths)
    # This shouldn't change, since even if we request lengths > boundary + 1
    # there are no trajectories that long.
    self.assertEqual((self.BATCH_SIZE, boundary + 1) + self.OBSERVATION_SHAPE,
                     padded_obs_np.shape)

  # Let's do 10 more steps (to go to the other side of the boundary).
  ts = 10
  for _ in range(ts):
    (observations, rewards, actions,
     dones) = self.get_random_observations_rewards_actions_dones()
    dones[...] = False
    bt.step(observations, rewards, rewards, dones, actions)

  # Number of time-steps now looks like the following:
  # (16, 16, 21, 21, 21, 21, 21, 21, 21, 21)
  lengths = lengths + ts

  len_history_for_policy = 40
  padded_obs_np, padded_lengths = bt.observations_np(
      boundary=boundary, len_history_for_policy=len_history_for_policy)

  # The lengths are what we expect them to be.
  self.assertAllEqual(lengths, padded_lengths)

  # The padded observations have the shape we expect them to have.
  self.assertEqual(
      (self.BATCH_SIZE, (2 * boundary) + 1) + self.OBSERVATION_SHAPE,
      padded_obs_np.shape)

  # Test that the padding is the only part that is all 0s.
  # NOTE: There is almost 0 probability that a random observation is all 0s.
  zero_obs = np.full(self.OBSERVATION_SHAPE, 0.)
  for b in range(self.BATCH_SIZE):
    # The first lengths[b] time-steps are actual data; the rest is 0s.
    for ts in range(lengths[b]):
      self.assertFalse(np.all(zero_obs == padded_obs_np[b][ts]))
    for ts in range(lengths[b], len(padded_obs_np[b])):
      self.assertAllEqual(zero_obs, padded_obs_np[b][ts])

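# The shapes asserted in the test above suggest the padding rule: pad to the
# smallest multiple of `boundary` that covers the longest trajectory, plus
# one extra slot for the observation produced by the initial reset. Below is
# a sketch of that rule, inferred from the assertions rather than copied from
# the implementation:
import math

def _padded_length(max_len, boundary):
  return boundary * int(math.ceil(float(max_len) / boundary)) + 1

assert _padded_length(11, 20) == 21  # all lengths fit within boundary + 1
assert _padded_length(21, 20) == 41  # one bucket more -> (2 * boundary) + 1
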
def test_num_time_steps(self):
  bt = trajectory.BatchTrajectory(batch_size=self.BATCH_SIZE)
  self.assertEqual(0, bt.num_completed_time_steps)
  self.assertEqual(0, bt.num_time_steps)

def test_creation(self):
  bt = trajectory.BatchTrajectory(batch_size=self.BATCH_SIZE)
  self.assertEqual(self.BATCH_SIZE, len(bt.trajectories))
  self.assertEqual(0, bt.num_completed_trajectories)