Example #1
    def test_change_last_time_step(self):
        t = trajectory.Trajectory()
        t.add_time_step(observation=1, done=False)
        t.add_time_step(observation=1, done=True)
        self.assertTrue(t.is_active)

        num_ts_old = t.num_time_steps
        self.assertEqual(2, num_ts_old)

        # Assert on what the last time-step is currently.
        ts = t.last_time_step
        self.assertEqual(1, ts.observation)
        self.assertTrue(ts.done)
        self.assertEqual(None, ts.action)

        # Change the last time-step.
        t.change_last_time_step(done=False, action=5)

        # Assert that it changed.
        ts = t.last_time_step
        self.assertEqual(
            1, ts.observation)  # unchanged, since we didn't change it.
        self.assertFalse(ts.done)  # was True earlier
        self.assertEqual(5, ts.action)  # was None earlier

        # Assert on the number of steps remaining the same as before.
        self.assertEqual(num_ts_old, t.num_time_steps)
Example #2
 def _make_trajectory(self, observations, actions):
     assert len(observations) == len(actions) + 1
     t = trajectory.Trajectory()
     for (obs, act) in zip(observations, actions):
         t.add_time_step(observation=obs, action=act, done=False)
     t.add_time_step(observation=observations[-1], done=True)
     return t
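A brief usage sketch for this helper, assuming it is called from inside the same test class (the values are illustrative, not taken from the tests):

# Hypothetical call: three observations and two actions produce a
# three-step trajectory whose final time-step has no action and is done.
t = self._make_trajectory([1, 2, 3], [0, 1])
assert t.num_time_steps == 3
assert t.last_time_step.done
assert t.last_time_step.action is None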
Example #3
    def test_truncate_and_last_n_observations_np(self):
        t = trajectory.Trajectory()
        ts = 5
        shape = (3, 4)
        for _ in range(ts):
            t.add_time_step(observation=np.random.uniform(size=shape),
                            done=False)

        original_obs = np.copy(t.observations_np)
        self.assertEqual((ts,) + shape, original_obs.shape)

        # Now let's just get the observations from the last 2 steps.
        num_to_keep = 2
        truncated_original_obs = original_obs[-num_to_keep:, ...]

        # Let's get the last `num_to_keep` observations
        last_n_observations_np = np.copy(
            t.last_n_observations_np(n=num_to_keep))

        # Now truncate the trajectory and get the same.
        _ = t.truncate(num_to_keep=num_to_keep)
        truncated_np = np.copy(t.observations_np)

        # Both should have the expected shape.
        self.assertEqual((num_to_keep,) + shape, last_n_observations_np.shape)
        self.assertEqual((num_to_keep,) + shape, truncated_np.shape)

        # Test the last `num_to_keep` are the same.
        self.assertAllEqual(truncated_np, truncated_original_obs)
        self.assertAllEqual(last_n_observations_np, truncated_original_obs)
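For context, a hedged sketch of the contract this test relies on, reusing the same `trajectory` and `np` imports the test assumes:

# Sketch: peek at the last two observations, then truncate the trajectory
# down to those two steps; both paths should expose the same array.
t2 = trajectory.Trajectory()
for _ in range(5):
    t2.add_time_step(observation=np.random.uniform(size=(3, 4)), done=False)
tail = np.copy(t2.last_n_observations_np(n=2))
t2.truncate(num_to_keep=2)
assert t2.observations_np.shape == (2, 3, 4)
assert np.array_equal(tail, t2.observations_np)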
Example #4
  def test_as_numpy(self):
    t = trajectory.Trajectory()
    shape = (3, 4)

    # We'll have `ts` observations and `ts-1` actions and rewards.
    ts = 5
    num_actions = 6
    observations = np.random.uniform(size=(ts,) + shape)
    actions = np.random.choice(range(num_actions), size=(ts-1,))
    rewards = np.random.choice([-1, 0, 1], size=(ts-1,))

    # First time-step has no reward.
    t.add_time_step(observation=observations[0],
                    done=False,
                    action=actions[0])
    for i in range(1, ts - 1):
      t.add_time_step(observation=observations[i],
                      done=False,
                      raw_reward=rewards[i-1],
                      processed_reward=rewards[i-1],
                      action=actions[i])
    # Last time-step has no action.
    t.add_time_step(observation=observations[-1],
                    done=False,
                    raw_reward=rewards[-1],
                    processed_reward=rewards[-1])

    traj_np = t.as_numpy

    self.assertAllEqual(observations, traj_np[0])
    self.assertAllEqual(actions, traj_np[1])
    self.assertAllEqual(rewards, traj_np[2])
Example #5
  def test_observation_np(self):
    t = trajectory.Trajectory()
    ts = 5
    shape = (3, 4)
    for _ in range(ts):
      t.add_time_step(observation=np.random.uniform(size=shape), done=False)

    self.assertEqual((ts,) + shape, t.observations_np.shape)
Example #6
 def _make_trajectory(self, observations=None, actions=None):
   t = trajectory.Trajectory()
   if observations is None:
     observations = itertools.repeat(None)
   if actions is None:
     actions = itertools.repeat(None)
   for (observation, action) in zip(observations, actions):
     t.add_time_step(observation=observation, action=action)
   return t
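A hedged usage note for this variant: when `actions` is omitted, `itertools.repeat(None)` pairs a `None` action with every observation, and no final `done` time-step is added (the sample values below are hypothetical):

# Hypothetical call: only observations are supplied, so every step gets action=None.
t = self._make_trajectory(observations=[1, 2, 3])
assert t.num_time_steps == 3
assert t.last_time_step.action is None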
Example #7
  def test_reward(self):
    t = trajectory.Trajectory()
    # The first time-step has no rewards, since rewards are given on entering a state.
    t.add_time_step(
        observation=1, raw_reward=None, processed_reward=None, done=False)
    t.add_time_step(
        observation=2, raw_reward=2, processed_reward=200, done=False)
    t.add_time_step(
        observation=3, raw_reward=3, processed_reward=300, done=True)

    raw_reward, processed_reward = t.reward

    self.assertEqual(5, raw_reward)  # 2 + 3
    self.assertEqual(500, processed_reward)  # 200 + 300
Example #8
    def test_add_time_step(self):
        t = trajectory.Trajectory()
        t.add_time_step(observation=1, done=True)

        # Test that the trajectory is now active.
        self.assertTrue(t.is_active)

        added_t = t.last_time_step
        self.assertEqual(1, added_t.observation)
        self.assertTrue(added_t.done)
        self.assertIsNone(added_t.raw_reward)
        self.assertIsNone(added_t.processed_reward)
        self.assertIsNone(added_t.action)

        self.assertEqual(1, t.num_time_steps)
Example #9
  def test_as_numpy(self):
    t = trajectory.Trajectory()
    shape = (3, 4)

    # We'll have `ts` observations and `ts-1` actions and rewards.
    ts = 5
    num_actions = 6
    observations = np.random.uniform(size=(ts,) + shape)
    actions = np.random.choice(range(num_actions), size=(ts - 1,))
    rewards = np.random.choice([-1, 0, 1], size=(ts - 1,))
    squares = np.arange(ts - 1)**2
    cubes = np.arange(ts - 1)**3

    def get_info(i):
      return {"sq": squares[i], "cu": cubes[i]}

    # First time-step has no reward.
    t.add_time_step(
        observation=observations[0],
        done=False,
        action=actions[0],
        info=get_info(0))
    for i in range(1, ts - 1):
      t.add_time_step(
          observation=observations[i],
          done=False,
          raw_reward=rewards[i - 1],
          processed_reward=rewards[i - 1],
          action=actions[i],
          info=get_info(i))
    # Last time-step has no action.
    t.add_time_step(
        observation=observations[-1],
        done=False,
        raw_reward=rewards[-1],
        processed_reward=rewards[-1])

    traj_np = t.as_numpy

    self.assertAllEqual(observations, traj_np[0])
    self.assertAllEqual(actions, traj_np[1])
    self.assertAllEqual(rewards, traj_np[2])

    self.assertAllEqual(squares, traj_np[4]["sq"])
    self.assertAllEqual(cubes, traj_np[4]["cu"])
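Judging from the indexing in the two `test_as_numpy` variants, `as_numpy` returns a tuple whose position 0 holds the stacked observations, 1 the actions, 2 the rewards, and 4 the batched per-step `info` values; a minimal sketch of that unpacking (position 3 is not exercised by these tests, so nothing is claimed about it):

# Sketch: unpack only the tuple positions the assertions above rely on.
traj_np = t.as_numpy
observations_np = traj_np[0]  # shape (ts,) + observation shape
actions_np = traj_np[1]       # shape (ts - 1,)
rewards_np = traj_np[2]       # shape (ts - 1,)
infos_np = traj_np[4]         # dict of stacked per-step info values, e.g. infos_np["sq"]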
Example #10
 def get_random_trajectory(self,
                           max_time_step=None,
                           obs_shape=(2, 2)) -> trajectory.Trajectory:
     t = trajectory.Trajectory()
     max_time_step = max_time_step or np.random.randint(2, 10)
     for _ in range(max_time_step):
         r = float(np.random.uniform(size=()))
         t.add_time_step(observation=np.random.uniform(size=obs_shape),
                         done=False,
                         raw_reward=r,
                         processed_reward=r,
                         action=int(np.random.choice(10, ())),
                         info={
                             replay_buffer.ReplayBuffer.LOGPS_KEY_TRAJ:
                             float(np.random.uniform(low=-10, high=0))
                         })
     t.change_last_time_step(done=True)
     return t
Example #11
def play_env_problem(env, policy_fn):
    """Plays an EnvProblem using a given policy function."""
    trajectories = [trajectory.Trajectory() for _ in range(env.batch_size)]
    observations = env.reset()
    for (traj, observation) in zip(trajectories, observations):
        traj.add_time_step(observation=observation)

    done_so_far = np.array([False] * env.batch_size)
    while not np.all(done_so_far):
        padded_observations, _ = env.trajectories.observations_np(
            len_history_for_policy=None)
        actions = policy_fn(padded_observations)
        (observations, rewards, dones, _) = env.step(actions)
        for (traj, observation, action, reward,
             done) in zip(trajectories, observations, actions, rewards, dones):
            if not traj.done:
                traj.change_last_time_step(action=action)
                traj.add_time_step(observation=observation,
                                   raw_reward=reward,
                                   done=done)
        # Reset the sub-environments that are done so the next step starts fresh.
        env.reset(indices=env_problem_utils.done_indices(dones))
        done_so_far = np.logical_or(done_so_far, dones)
    return trajectories
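A hedged usage sketch: `play_env_problem` only needs a callable that maps padded observations to a batch of actions, so a uniformly random policy is enough for smoke-testing (`env` and `num_actions` are placeholders, not defined in the snippet above):

# Hypothetical driver: pick a random action for every trajectory in the batch.
def random_policy(padded_observations):
    batch_size = padded_observations.shape[0]
    return np.random.randint(num_actions, size=(batch_size,))

completed = play_env_problem(env, random_policy)
assert len(completed) == env.batch_size
assert all(traj.done for traj in completed)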
Example #12
 def _make_singleton_trajectory(self, observation):
   t = trajectory.Trajectory()
   t.add_time_step(observation=observation)
   return t
Example #13
 def test_empty_trajectory(self):
     t = trajectory.Trajectory()
     self.assertFalse(t.is_active)
     self.assertEqual(0, t.num_time_steps)
     self.assertFalse(t.done)
Example #14
    def test_load_from_directory(self):
        output_dir = self.get_temp_dir()

        epochs = [0, 1, 2]
        env_ids = [0, 1, 2]
        temperatures = [0.5, 1.0]
        random_strings = ["a", "b"]

        # Write some trajectories.
        # There are 3x3x2x2 (36) trajectories, and of them 3x2x2 (12) are done.
        for epoch in epochs:
            for env_id in env_ids:
                for temperature in temperatures:
                    for random_string in random_strings:
                        traj = trajectory.Trajectory(time_steps=[
                            time_step.TimeStep(observation=epoch,
                                               done=(epoch == 0),
                                               raw_reward=1.0,
                                               processed_reward=1.0,
                                               action=env_id,
                                               info={})
                        ])

                        trajectory_file_name = trajectory.TRAJECTORY_FILE_FORMAT.format(
                            epoch=epoch,
                            env_id=env_id,
                            temperature=temperature,
                            r=random_string)

                        with gfile.GFile(
                                os.path.join(output_dir, trajectory_file_name),
                                "w") as f:
                            trajectory.get_pickle_module().dump(traj, f)

        # Load everything and check.
        bt = trajectory.BatchTrajectory.load_from_directory(output_dir)

        self.assertIsInstance(bt, trajectory.BatchTrajectory)
        self.assertEqual(36, bt.num_completed_trajectories)
        self.assertEqual(36, bt.batch_size)

        bt = trajectory.BatchTrajectory.load_from_directory(output_dir,
                                                            epoch=0)
        self.assertEqual(12, bt.num_completed_trajectories)
        self.assertEqual(12, bt.batch_size)

        # Ask for 100 trajectories, but there aren't that many.
        bt = trajectory.BatchTrajectory.load_from_directory(output_dir,
                                                            epoch=0,
                                                            n_trajectories=100,
                                                            max_tries=0)
        self.assertIsNone(bt)

        bt = trajectory.BatchTrajectory.load_from_directory(output_dir,
                                                            epoch=0,
                                                            temperature=0.5)
        self.assertEqual(6, bt.num_completed_trajectories)
        self.assertEqual(6, bt.batch_size)

        bt = trajectory.BatchTrajectory.load_from_directory(output_dir,
                                                            epoch=1)
        self.assertEqual(12, bt.num_completed_trajectories)
        self.assertEqual(12, bt.batch_size)

        # Constraints cannot be satisfied.
        bt = trajectory.BatchTrajectory.load_from_directory(output_dir,
                                                            epoch=1,
                                                            n_trajectories=100,
                                                            up_sample=False,
                                                            max_tries=0)
        self.assertIsNone(bt)

        # Constraints can be satisfied.
        bt = trajectory.BatchTrajectory.load_from_directory(output_dir,
                                                            epoch=1,
                                                            n_trajectories=100,
                                                            up_sample=True,
                                                            max_tries=0)
        self.assertEqual(100, bt.num_completed_trajectories)
        self.assertEqual(100, bt.batch_size)

        bt = trajectory.BatchTrajectory.load_from_directory(output_dir,
                                                            epoch=1,
                                                            n_trajectories=10)
        self.assertEqual(10, bt.num_completed_trajectories)
        self.assertEqual(10, bt.batch_size)

        gfile.rmtree(output_dir)