Example #1
def collect_samples(agents, config, local_evaluator):
    num_timesteps_so_far = 0
    trajectories = []
    # This variable maps the object IDs of trajectories that are currently
    # computed to the agent that they are computed on; we start some initial
    # tasks here.

    agent_dict = {}

    for agent in agents:
        fut_sample = agent.sample.remote()
        agent_dict[fut_sample] = agent

    while num_timesteps_so_far < config["timesteps_per_batch"]:
        # TODO(pcm): Make wait support arbitrary iterators and remove the
        # conversion to list here.
        [fut_sample], _ = ray.wait(list(agent_dict))
        agent = agent_dict.pop(fut_sample)
        # Start task with next trajectory and record it in the dictionary.
        fut_sample2 = agent.sample.remote()
        agent_dict[fut_sample2] = agent

        next_sample = ray.get(fut_sample)
        num_timesteps_so_far += next_sample.count
        trajectories.append(next_sample)
    return SampleBatch.concat_samples(trajectories)
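For readers unfamiliar with the ray.wait pipelining idiom used above, here is a minimal, self-contained sketch of the same pattern with a toy remote function standing in for the agent actors. The function toy_sample, the worker count, and the step counts are made up for illustration and are not part of the RLlib API; only ray.init, ray.remote, ray.wait, and ray.get are real Ray calls.

import ray

ray.init(ignore_reinit_error=True)

@ray.remote
def toy_sample(worker_id):
    # Stand-in for agent.sample.remote(): pretend we collected 10 timesteps.
    return {"worker": worker_id, "count": 10}

# Keep one task in flight per worker; whenever a task finishes,
# immediately launch a replacement so the pipeline stays full.
in_flight = {toy_sample.remote(i): i for i in range(4)}
timesteps, batches = 0, []
while timesteps < 100:
    [ready], _ = ray.wait(list(in_flight))
    worker_id = in_flight.pop(ready)
    in_flight[toy_sample.remote(worker_id)] = worker_id
    batch = ray.get(ready)
    timesteps += batch["count"]
    batches.append(batch)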
Example #2
    def compute_steps(self, config, obs_filter, rew_filter):
        """Compute multiple rollouts and concatenate the results.

        Args:
            config: Configuration parameters.
            obs_filter: Function that is applied to each of the
                observations.
            rew_filter: Function that is applied to each of the rewards.

        Returns:
            A tuple of (concatenated SampleBatch of the trajectories,
            total rewards of the trajectories, trajectory lengths,
            updated observation filter, updated reward filter).
        """
        num_steps_so_far = 0
        trajectories = []
        self.update_filters(obs_filter, rew_filter)

        while num_steps_so_far < config["min_steps_per_task"]:
            rollout = self.sampler.get_data()
            trajectory = process_rollout(rollout,
                                         self.reward_filter,
                                         config["gamma"],
                                         config["lambda"],
                                         use_gae=config["use_gae"])
            num_steps_so_far += trajectory["rewards"].shape[0]
            trajectories.append(trajectory)
        metrics = self.sampler.get_metrics()
        total_rewards, trajectory_lengths = zip(*[(c.episode_reward,
                                                   c.episode_length)
                                                  for c in metrics])
        updated_obs_filter = self.sampler.get_obs_filter(flush=True)
        return (SampleBatch.concat_samples(trajectories), total_rewards,
                trajectory_lengths, updated_obs_filter, self.reward_filter)
Example #3
def collect_samples(agents, config, observation_filter, reward_filter):
    num_timesteps_so_far = 0
    trajectories = []
    total_rewards = []
    trajectory_lengths = []
    # This variable maps the object IDs of trajectories that are currently
    # computed to the agent that they are computed on; we start some initial
    # tasks here.
    agent_dict = {
        agent.compute_steps.remote(config, observation_filter, reward_filter):
        agent
        for agent in agents
    }
    while num_timesteps_so_far < config["timesteps_per_batch"]:
        # TODO(pcm): Make wait support arbitrary iterators and remove the
        # conversion to list here.
        [next_trajectory], _ = ray.wait(list(agent_dict))
        agent = agent_dict.pop(next_trajectory)
        # Start task with next trajectory and record it in the dictionary.
        agent_dict[agent.compute_steps.remote(config, observation_filter,
                                              reward_filter)] = agent
        trajectory, rewards, lengths, obs_f, rew_f = ray.get(next_trajectory)
        total_rewards.extend(rewards)
        trajectory_lengths.extend(lengths)
        num_timesteps_so_far += sum(lengths)
        trajectories.append(trajectory)
        observation_filter.update(obs_f)
        reward_filter.update(rew_f)
    return (SampleBatch.concat_samples(trajectories), np.mean(total_rewards),
            np.mean(trajectory_lengths))
Example #4
    def testConcat(self):
        b1 = SampleBatch({"a": np.array([1, 2, 3]), "b": np.array([4, 5, 6])})
        b2 = SampleBatch({"a": np.array([1]), "b": np.array([4])})
        b3 = SampleBatch({"a": np.array([1]), "b": np.array([5])})
        b12 = b1.concat(b2)
        self.assertEqual(b12["a"].tolist(), [1, 2, 3, 1])
        self.assertEqual(b12["b"].tolist(), [4, 5, 6, 4])
        b = SampleBatch.concat_samples([b1, b2, b3])
        self.assertEqual(b["a"].tolist(), [1, 2, 3, 1, 1])
        self.assertEqual(b["b"].tolist(), [4, 5, 6, 4, 5])
Example #5
    def testConcat(self):
        b1 = SampleBatch({"a": np.array([1, 2, 3]), "b": np.array([4, 5, 6])})
        b2 = SampleBatch({"a": np.array([1]), "b": np.array([4])})
        b3 = SampleBatch({"a": np.array([1]), "b": np.array([5])})
        b12 = b1.concat(b2)
        self.assertEqual(b12.data["a"].tolist(), [1, 2, 3, 1])
        self.assertEqual(b12.data["b"].tolist(), [4, 5, 6, 4])
        b = SampleBatch.concat_samples([b1, b2, b3])
        self.assertEqual(b.data["a"].tolist(), [1, 2, 3, 1, 1])
        self.assertEqual(b.data["b"].tolist(), [4, 5, 6, 4, 5])
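The two test variants above check the same column-wise behavior; one indexes the batch directly while the other goes through its .data attribute. As a rough illustration only, and not the actual SampleBatch implementation, the concatenation they exercise amounts to joining each named column with numpy:

import numpy as np

def concat_columns(batches):
    # Concatenate each named column across all batches (illustrative only).
    return {k: np.concatenate([b[k] for b in batches]) for k in batches[0]}

b1 = {"a": np.array([1, 2, 3]), "b": np.array([4, 5, 6])}
b2 = {"a": np.array([1]), "b": np.array([4])}
merged = concat_columns([b1, b2])
assert merged["a"].tolist() == [1, 2, 3, 1]
assert merged["b"].tolist() == [4, 5, 6, 4]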
Example #6
    def sample(self):
        """Returns experience samples from this Evaluator. Observation
        filter and reward filters are flushed here.

        Returns:
            SampleBatch: A columnar batch of experiences.
        """
        num_steps_so_far = 0
        all_samples = []

        while num_steps_so_far < self.config["min_steps_per_task"]:
            rollout = self.sampler.get_data()
            samples = process_rollout(
                rollout, self.rew_filter, self.config["gamma"],
                self.config["lambda"], use_gae=self.config["use_gae"])
            num_steps_so_far += samples.count
            all_samples.append(samples)
        return SampleBatch.concat_samples(all_samples)
Example #7
    def sample(self):
        """Returns experience samples from this Evaluator. Observation
        filter and reward filters are flushed here.

        Returns:
            SampleBatch: A columnar batch of experiences.
        """
        num_steps_so_far = 0
        all_samples = []

        while num_steps_so_far < self.config["min_steps_per_task"]:
            rollout = self.sampler.get_data()
            last_r = 0.0  # note: not needed since we don't truncate rollouts
            samples = compute_advantages(
                rollout, last_r, self.config["gamma"],
                self.config["lambda"], use_gae=self.config["use_gae"])
            num_steps_so_far += samples.count
            all_samples.append(samples)
        return SampleBatch.concat_samples(all_samples)
Example #8
    def sample(self):
        """Evaluate the current policies and return a batch of experiences.

        Returns:
            SampleBatch from evaluating the current policies.
        """

        batches = [self.sampler.get_data()]
        steps_so_far = batches[0].count
        while steps_so_far < self.batch_steps:
            batch = self.sampler.get_data()
            steps_so_far += batch.count
            batches.append(batch)
        batch = SampleBatch.concat_samples(batches)

        if self.compress_observations:
            batch["obs"] = [pack(o) for o in batch["obs"]]
            batch["new_obs"] = [pack(o) for o in batch["new_obs"]]

        return batch
Example #9
    def sample(self, no_replay=False):
        # First seed the replay buffer with a few new samples
        if self.workers:
            weights = ray.put(self.get_weights())
            for w in self.workers:
                w.set_weights.remote(weights)
            samples = ray.get([w.sample.remote() for w in self.workers])
        else:
            samples = [DQNEvaluator.sample(self)]

        for s in samples:
            for row in s.rows():
                self.replay_buffer.add(
                    row["obs"], row["actions"], row["rewards"], row["new_obs"],
                    row["dones"])

        if no_replay:
            return SampleBatch.concat_samples(samples)

        # Then return a batch sampled from the buffer
        if self.config["prioritized_replay"]:
            (obses_t, actions, rewards, obses_tp1,
                dones, weights, batch_indexes) = self.replay_buffer.sample(
                    self.config["train_batch_size"],
                    beta=self.beta_schedule.value(self.global_timestep))
            self._update_priorities_if_needed()
            batch = SampleBatch({
                "obs": obses_t, "actions": actions, "rewards": rewards,
                "new_obs": obses_tp1, "dones": dones, "weights": weights,
                "batch_indexes": batch_indexes})
            self.samples_to_prioritize = batch
        else:
            obses_t, actions, rewards, obses_tp1, dones = \
                self.replay_buffer.sample(self.config["train_batch_size"])
            batch = SampleBatch({
                "obs": obses_t, "actions": actions, "rewards": rewards,
                "new_obs": obses_tp1, "dones": dones,
                "weights": np.ones_like(rewards)})
        return batch
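The example above mixes two paths: fresh samples always seed the replay buffer row by row, and the returned training batch either comes straight from those samples (no_replay) or is drawn from the buffer. A toy sketch of that seeding-and-resampling pattern, using a plain list in place of the real replay buffer and made-up row values, could look like this:

import random
import numpy as np

replay_buffer = []

def add_rows(sample_rows):
    # Each row is a dict such as {"obs": ..., "actions": ..., "rewards": ...}.
    replay_buffer.extend(sample_rows)

def sample_train_batch(batch_size):
    # Draw rows uniformly with replacement, then re-assemble them into a
    # columnar batch, mirroring SampleBatch's column layout.
    rows = random.choices(replay_buffer, k=batch_size)
    return {k: np.array([r[k] for r in rows]) for k in rows[0]}

add_rows([{"obs": 0.1, "actions": 1, "rewards": 1.0},
          {"obs": 0.2, "actions": 0, "rewards": 0.0}])
batch = sample_train_batch(batch_size=4)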