Example 1
    def testConcatenate(self):
        d1 = {"s": np.array([0, 1]), "a": np.array([2, 3])}
        d2 = {"s": np.array([4, 5]), "a": np.array([6, 7])}
        d = concatenate([d1, d2])
        assert_allclose(d["s"], np.array([0, 1, 4, 5]))
        assert_allclose(d["a"], np.array([2, 3, 6, 7]))

        D = concatenate([d])
        assert_allclose(D["s"], np.array([0, 1, 4, 5]))
        assert_allclose(D["a"], np.array([2, 3, 6, 7]))
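The concatenate helper exercised by this test is not shown here. A minimal sketch of what it plausibly does, assuming it simply applies np.concatenate key-wise to a list of dicts of arrays (the behavior the assertions above check):

import numpy as np


def concatenate(dicts):
    """Concatenate a list of dicts of numpy arrays, key by key.

    Assumes every dict shares the same keys; each output value is the
    concatenation of the corresponding arrays along axis 0.
    """
    return {key: np.concatenate([d[key] for d in dicts])
            for key in dicts[0]}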
Example 2
    def compute_steps(self, config, obs_filter, rew_filter):
        """Compute multiple rollouts and concatenate the results.

        Args:
            config: Configuration parameters.
            obs_filter: Function that is applied to each of the
                observations.
            rew_filter: Function that is applied to each of the rewards.

        Returns:
            trajectories: Concatenated trajectory data.
            total_rewards: Total rewards of the trajectories.
            trajectory_lengths: Lengths of the trajectories.
            updated_obs_filter: Observation filter flushed from the sampler.
            reward_filter: The updated reward filter.
        """
        num_steps_so_far = 0
        trajectories = []
        self.update_filters(obs_filter, rew_filter)

        while num_steps_so_far < config["min_steps_per_task"]:
            rollout = self.sampler.get_data()
            trajectory = process_rollout(
                rollout, self.reward_filter, config["gamma"],
                config["lambda"], use_gae=config["use_gae"])
            num_steps_so_far += trajectory["rewards"].shape[0]
            trajectories.append(trajectory)
        metrics = self.sampler.get_metrics()
        total_rewards, trajectory_lengths = zip(*[
            (c.episode_reward, c.episode_length) for c in metrics])
        updated_obs_filter = self.sampler.get_obs_filter(flush=True)
        return (
            concatenate(trajectories),
            total_rewards,
            trajectory_lengths,
            updated_obs_filter,
            self.reward_filter)
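process_rollout is not included in this example. When use_gae is set, it presumably performs a generalized advantage estimation (GAE) pass over the rollout using config["gamma"] and config["lambda"]. A rough, self-contained sketch of that computation, assuming per-step rewards and value predictions are available as flat arrays and ignoring episode boundaries for brevity (the names below are illustrative, not taken from the library):

import numpy as np


def gae_advantages(rewards, vf_preds, last_value, gamma, lam):
    """Generalized advantage estimation over a single rollout.

    rewards, vf_preds: 1-D arrays of per-step rewards and value predictions.
    last_value: bootstrap value for the state following the last step.
    """
    values = np.append(vf_preds, last_value)
    # One-step temporal-difference errors.
    deltas = rewards + gamma * values[1:] - values[:-1]
    advantages = np.zeros_like(rewards, dtype=np.float64)
    running = 0.0
    # Accumulate discounted TD errors backwards in time.
    for t in reversed(range(len(rewards))):
        running = deltas[t] + gamma * lam * running
        advantages[t] = running
    # Value targets are the advantages plus the value baseline.
    return advantages, advantages + vf_preds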
Example 3
def collect_samples(agents,
                    config,
                    observation_filter,
                    reward_filter):
    num_timesteps_so_far = 0
    trajectories = []
    total_rewards = []
    trajectory_lengths = []
    # This dict maps the object IDs of in-flight trajectory computations to
    # the agent computing them; we start one initial task per agent here.
    agent_dict = {agent.compute_steps.remote(
                      config, observation_filter, reward_filter):
                  agent for agent in agents}
    while num_timesteps_so_far < config["timesteps_per_batch"]:
        # TODO(pcm): Make wait support arbitrary iterators and remove the
        # conversion to list here.
        [next_trajectory], _ = ray.wait(list(agent_dict))
        agent = agent_dict.pop(next_trajectory)
        # Start a new task on the now-idle agent and record it in the
        # dictionary.
        agent_dict[agent.compute_steps.remote(
                      config, observation_filter, reward_filter)] = agent
        trajectory, rewards, lengths, obs_f, rew_f = ray.get(next_trajectory)
        total_rewards.extend(rewards)
        trajectory_lengths.extend(lengths)
        num_timesteps_so_far += sum(lengths)
        trajectories.append(trajectory)
        observation_filter.update(obs_f)
        reward_filter.update(rew_f)
    return (concatenate(trajectories), np.mean(total_rewards),
            np.mean(trajectory_lengths))
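The driver above keeps exactly one compute_steps task in flight per agent: ray.wait blocks until any task finishes, and a replacement task is immediately submitted to the agent that produced it. A stripped-down, self-contained sketch of that pattern with a trivial stand-in task instead of an RL agent:

import ray

ray.init()


@ray.remote
class Worker(object):
    def compute(self, step):
        # Stand-in for an expensive rollout computation.
        return step * step


workers = [Worker.remote() for _ in range(4)]
# One in-flight task per worker, keyed by the returned object ID.
in_flight = {w.compute.remote(0): w for w in workers}
results = []
step = 0
while len(results) < 20:
    # Block until any single task finishes.
    [ready], _ = ray.wait(list(in_flight))
    worker = in_flight.pop(ready)
    # Resubmit work to the now-idle worker before consuming the result.
    step += 1
    in_flight[worker.compute.remote(step)] = worker
    results.append(ray.get(ready))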
Example 4
def collect_samples(agents,
                    config,
                    observation_filter=NoFilter(),
                    reward_filter=NoFilter()):
    num_timesteps_so_far = 0
    trajectories = []
    total_rewards = []
    trajectory_lengths = []
    # This dict maps the object IDs of in-flight trajectory computations to
    # the agent computing them; we start one initial task per agent here.
    agent_dict = {agent.compute_steps.remote(
                      config["gamma"], config["lambda"],
                      config["horizon"], config["min_steps_per_task"]):
                  agent for agent in agents}
    while num_timesteps_so_far < config["timesteps_per_batch"]:
        # TODO(pcm): Make wait support arbitrary iterators and remove the
        # conversion to list here.
        [next_trajectory], waiting_trajectories = ray.wait(
            list(agent_dict.keys()))
        agent = agent_dict.pop(next_trajectory)
        # Start a new task on the now-idle agent and record it in the
        # dictionary.
        agent_dict[agent.compute_steps.remote(
                       config["gamma"], config["lambda"],
                       config["horizon"], config["min_steps_per_task"])] = (
            agent)
        trajectory, rewards, lengths = ray.get(next_trajectory)
        total_rewards.extend(rewards)
        trajectory_lengths.extend(lengths)
        num_timesteps_so_far += len(trajectory["dones"])
        trajectories.append(trajectory)
    return (concatenate(trajectories), np.mean(total_rewards),
            np.mean(trajectory_lengths))
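Neither NoFilter nor the other filter objects used throughout these examples is shown. Judging from how they are used elsewhere (applied to observations and rewards, copied, and merged with update), a no-op filter could look roughly like the following; this is an assumption about the interface, not the library's actual implementation:

class NoFilter(object):
    """Identity filter: passes values through and tracks no statistics."""

    def __call__(self, x):
        return x

    def copy(self):
        return self

    def update(self, other):
        # A stateless filter has nothing to merge.
        pass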
Example 5
    def compute_steps(
            self, gamma, lam, horizon, min_steps_per_task,
            observation_filter, reward_filter):
        """Compute multiple rollouts and concatenate the results.

        Args:
            gamma: MDP discount factor
            lam: GAE(lambda) parameter
            horizon: Number of steps after which a rollout gets cut
            min_steps_per_task: Lower bound on the number of states to be
                collected.
            observation_filter: Function that is applied to each of the
                observations.
            reward_filter: Function that is applied to each of the rewards.

        Returns:
            trajectories: Concatenated trajectory data.
            total_rewards: Total rewards of the trajectories.
            trajectory_lengths: Lengths of the trajectories.
            observation_filter: The updated local observation filter.
            reward_filter: The updated local reward filter.
        """

        # Update our local filters
        self.observation_filter = observation_filter.copy()
        self.reward_filter = reward_filter.copy()

        num_steps_so_far = 0
        trajectories = []
        total_rewards = []
        trajectory_lengths = []
        while True:
            trajectory = self.compute_trajectory(gamma, lam, horizon)
            total_rewards.append(
                trajectory["raw_rewards"].sum(axis=0).mean())
            trajectory_lengths.append(
                np.logical_not(trajectory["dones"]).sum(axis=0).mean())
            trajectory = flatten(trajectory)
            not_done = np.logical_not(trajectory["dones"])
            # Filter out states that are already done. Trajectories are
            # batched and only cut once every trajectory in the batch has
            # terminated, so some of those states can be discarded here.
            trajectory = {key: val[not_done]
                          for key, val in trajectory.items()}
            num_steps_so_far += trajectory["raw_rewards"].shape[0]
            trajectories.append(trajectory)
            if num_steps_so_far >= min_steps_per_task:
                break
        return (
            concatenate(trajectories),
            total_rewards,
            trajectory_lengths,
            self.observation_filter,
            self.reward_filter)
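The masking step near the end of this example relies on applying the same boolean index to every array in the flattened trajectory dict. A small self-contained illustration of that step:

import numpy as np

# A toy flattened trajectory in which two of five states are already done.
trajectory = {
    "dones": np.array([False, False, True, False, True]),
    "raw_rewards": np.array([1.0, 0.5, 0.0, 2.0, 0.0]),
}
not_done = np.logical_not(trajectory["dones"])
# Keep only the rows that correspond to states that are not yet done.
trajectory = {key: val[not_done] for key, val in trajectory.items()}
assert trajectory["raw_rewards"].tolist() == [1.0, 0.5, 2.0]
assert trajectory["dones"].tolist() == [False, False, False]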