Ejemplo n.º 1
0
    def compute_steps(
            self, gamma, lam, horizon, min_steps_per_task,
            observation_filter, reward_filter):
        """Gather rollouts until enough not-done steps have been collected.

        Rollouts are produced one at a time by ``self.compute_trajectory``.
        At least one rollout is always computed; collection stops as soon as
        the number of kept steps reaches ``min_steps_per_task``.

        Args:
            gamma: MDP discount factor, forwarded to ``compute_trajectory``.
            lam: GAE(lambda) parameter, forwarded to ``compute_trajectory``.
            horizon: Number of steps after which a rollout gets cut,
                forwarded to ``compute_trajectory``.
            min_steps_per_task: Lower bound on the number of steps to keep
                before returning.
            observation_filter: Observation filter; a copy of it (via
                ``.copy()``) replaces ``self.observation_filter``.
            reward_filter: Reward filter; a copy of it (via ``.copy()``)
                replaces ``self.reward_filter``.

        Returns:
            A 5-tuple of:
              * the result of ``concatenate`` over all kept rollouts,
              * a list with one entry per rollout: ``"raw_rewards"`` summed
                along axis 0, then averaged,
              * a list with one entry per rollout: the count of not-done
                entries along axis 0, then averaged,
              * ``self.observation_filter``,
              * ``self.reward_filter``.
        """

        # Swap in copies of the filters we were handed.
        self.observation_filter = observation_filter.copy()
        self.reward_filter = reward_filter.copy()

        kept_rollouts = []
        total_rewards = []
        trajectory_lengths = []
        steps_collected = 0
        enough_steps = False
        while not enough_steps:
            rollout = self.compute_trajectory(gamma, lam, horizon)

            # Per-rollout statistics are taken before flattening, while the
            # arrays still have their batched layout.
            reward_sums = rollout["raw_rewards"].sum(axis=0)
            total_rewards.append(reward_sums.mean())
            alive_counts = np.logical_not(rollout["dones"]).sum(axis=0)
            trajectory_lengths.append(alive_counts.mean())

            flat_rollout = flatten(rollout)
            keep_mask = np.logical_not(flat_rollout["dones"])
            # Rollouts are batched and only cut once every trajectory in the
            # batch has terminated, so entries already marked done can be
            # dropped here.
            kept = {}
            for name, values in flat_rollout.items():
                kept[name] = values[keep_mask]

            kept_rollouts.append(kept)
            steps_collected += kept["raw_rewards"].shape[0]
            enough_steps = steps_collected >= min_steps_per_task

        merged = concatenate(kept_rollouts)
        return (
            merged,
            total_rewards,
            trajectory_lengths,
            self.observation_filter,
            self.reward_filter)
Ejemplo n.º 2
0
 def testFlatten(self):
     """Check that ``flatten`` merges the first two axes in row-major order.

     Entry ``[i][j]`` of each input array is expected at row ``2 * i + j``
     of the corresponding flattened array.
     """
     states = np.array([[[1, -1], [2, -2]], [[3, -3], [4, -4]]])
     actions = np.array([[[5], [-5]], [[6], [-6]]])
     d = {"s": states, "a": actions}

     # A shallow copy is passed so that ``d`` keeps referring to the
     # original arrays for the comparisons below.
     flat = flatten(d.copy(), start=0, stop=2)

     # (i, j) positions in the order they should appear after flattening.
     index_pairs = ((0, 0), (0, 1), (1, 0), (1, 1))
     for row, (i, j) in enumerate(index_pairs):
         assert_allclose(d["s"][i][j][:], flat["s"][row][:])
     for row, (i, j) in enumerate(index_pairs):
         assert_allclose(d["a"][i][j], flat["a"][row])