Example 1
    def test_ray_sampling(self):
        """
        Tests the sampling performance of Ray's prioritized replay memory.
        """
        assert get_distributed_backend() == "ray"
        memory = PrioritizedReplayBuffer(
            size=self.capacity,
            alpha=1.0,
            clip_rewards=True
        )
        records = [self.record_space.sample(size=1) for _ in range_(self.inserts)]
        for record in records:
            memory.add(
                obs_t=ray_compress(record['states']),
                action=record['actions'],
                reward=record['reward'],
                obs_tp1=ray_compress(record['states']),
                done=record['terminals'],
                weight=None
            )
        start = time.monotonic()
        for _ in range_(self.samples):
            batch_tuple = memory.sample(self.sample_batch_size, beta=1.0)
        end = time.monotonic() - start
        tp = self.samples / end
        print('#### Testing Ray Prioritized Replay memory ####')
        print('Testing sampling performance:')
        print('Sampled {} batches, throughput: {} samples/s, total time: {} s'.format(
            self.samples, tp, end
        ))
Example 2
    def test_add(self):
        memory = PrioritizedReplayBuffer(
            size=2,
            alpha=self.alpha,
        )

        # Assert indices 0 before insert.
        self.assertEqual(len(memory), 0)
        self.assertEqual(memory._next_idx, 0)

        # Insert single record.
        data = self._generate_data()
        memory.add(*data, weight=0.5)
        self.assertTrue(len(memory) == 1)
        self.assertTrue(memory._next_idx == 1)

        # Insert single record.
        data = self._generate_data()
        memory.add(*data, weight=0.1)
        self.assertTrue(len(memory) == 2)
        self.assertTrue(memory._next_idx == 0)

        # Insert over capacity.
        data = self._generate_data()
        memory.add(*data, weight=1.0)
        self.assertTrue(len(memory) == 2)
        self.assertTrue(memory._next_idx == 1)
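The wraparound asserted above is plain ring-buffer arithmetic: once the buffer reaches capacity, the write cursor returns to slot 0 and the oldest record is overwritten. A minimal standalone sketch of that invariant (illustrative only, not the library's implementation):

class RingIndex:
    # Toy ring buffer that tracks only the write cursor.
    def __init__(self, size):
        self.size = size
        self.storage = []
        self.next_idx = 0

    def add(self, item):
        if self.next_idx >= len(self.storage):
            self.storage.append(item)            # still filling up
        else:
            self.storage[self.next_idx] = item   # overwrite oldest slot
        self.next_idx = (self.next_idx + 1) % self.size

ring = RingIndex(size=2)
for step, expected in enumerate([1, 0, 1]):
    ring.add(step)
    assert ring.next_idx == expected  # mirrors the assertions in test_add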
Example 3
    def __init__(self, num_shards, learning_starts, buffer_size,
                 train_batch_size, prioritized_replay_alpha,
                 prioritized_replay_beta, prioritized_replay_eps):
        self.replay_starts = learning_starts // num_shards
        self.buffer_size = buffer_size // num_shards
        self.train_batch_size = train_batch_size
        self.prioritized_replay_beta = prioritized_replay_beta
        self.prioritized_replay_eps = prioritized_replay_eps

        self.replay_buffer = PrioritizedReplayBuffer(
            self.buffer_size, alpha=prioritized_replay_alpha)

        # Metrics
        self.add_batch_timer = TimerStat()
        self.replay_timer = TimerStat()
        self.update_priorities_timer = TimerStat()
Example 4
    def _init(self,
              learning_starts=1000,
              buffer_size=10000,
              prioritized_replay=True,
              prioritized_replay_alpha=0.6,
              prioritized_replay_beta=0.4,
              prioritized_replay_eps=1e-6,
              train_batch_size=32,
              sample_batch_size=4):

        self.replay_starts = learning_starts
        self.prioritized_replay_beta = prioritized_replay_beta
        self.prioritized_replay_eps = prioritized_replay_eps
        self.train_batch_size = train_batch_size

        # Stats
        self.update_weights_timer = TimerStat()
        self.sample_timer = TimerStat()
        self.replay_timer = TimerStat()
        self.grad_timer = TimerStat()
        self.throughput = RunningStat()

        # Set up replay buffer
        if prioritized_replay:
            self.replay_buffer = PrioritizedReplayBuffer(
                buffer_size, alpha=prioritized_replay_alpha)
        else:
            self.replay_buffer = ReplayBuffer(buffer_size)

        assert buffer_size >= self.replay_starts
Example 5
def new_buffer():
    return PrioritizedReplayBuffer(
        self.buffer_size,
        alpha=self.prioritized_replay_alpha,
        dynamic_experience_replay=self.dynamic_experience_replay,
        demonstration_zone_percentage=self.demonstration_zone_percentage)
Example 6
    def test_ray_prioritized_replay_insert(self):
        """
        Tests the insert performance of Ray's prioritized replay memory.
        """
        assert get_distributed_backend() == "ray"
        memory = PrioritizedReplayBuffer(
            size=self.capacity,
            alpha=1.0,
            clip_rewards=True
        )
        # Test individual inserts.
        records = [self.record_space.sample(size=1) for _ in range_(self.inserts)]

        start = time.monotonic()
        for record in records:
            memory.add(
                obs_t=record['states'],
                action=record['actions'],
                reward=record['reward'],
                obs_tp1=record['states'],
                done=record['terminals'],
                weight=None
            )
        end = time.monotonic() - start
        tp = len(records) / end
        print('#### Testing Ray Prioritized Replay memory ####')
        print('Testing insert performance:')
        print('Inserted {} separate records, throughput: {} records/s, total time: {} s'.format(
            len(records), tp, end
        ))

        memory = PrioritizedReplayBuffer(
            size=self.capacity,
            alpha=1.0,
            clip_rewards=True
        )

        # Test chunked inserts -> done via external for loop in Ray.
        chunks = int(self.inserts / self.chunksize)
        records = [self.record_space.sample(size=self.chunksize) for _ in range_(chunks)]
        start = time.monotonic()
        for chunk in records:
            for i in range_(self.chunksize):
                memory.add(
                    obs_t=chunk['states'][i],
                    action=chunk['actions'][i],
                    reward=chunk['reward'][i],
                    obs_tp1=chunk['states'][i],
                    done=chunk['terminals'][i],
                    weight=None
                )
        end = time.monotonic() - start
        tp = len(records) * self.chunksize / end
        print('Testing chunked insert performance:')
        print('Inserted {} chunks, throughput: {} records/s, total time: {} s'.format(
            len(records), tp, end
        ))
Example 7
def new_buffer():
    return PrioritizedReplayBuffer(
        self.buffer_size,
        alpha=self.prioritized_replay_alpha,
        permanent_data_length=len(human_data),
        human_data=human_data,
        dynamic_experience_replay=self.dynamic_experience_replay,
        demonstration_zone_percentage=self.demonstration_zone_percentage)
Example 8
    def test_ray_updating(self):
        """
        Tests the priority-update performance of Ray's prioritized replay memory.
        """
        assert get_distributed_backend() == "ray"
        memory = PrioritizedReplayBuffer(
            size=self.capacity,
            alpha=1.0,
            clip_rewards=True
        )
        records = [self.record_space.sample(size=1) for _ in range_(self.inserts)]
        for record in records:
            memory.add(
                obs_t=record['states'],
                action=record['actions'],
                reward=record['reward'],
                obs_tp1=record['states'],
                done=record['terminals'],
                weight=None
            )
        loss_values = [np.random.random(size=self.sample_batch_size) for _ in range_(self.samples)]
        indices = [np.random.randint(low=0, high=self.inserts, size=self.sample_batch_size) for _
                   in range_(self.samples)]

        start = time.monotonic()
        for index, loss in zip(indices, loss_values):
            memory.update_priorities(index, loss)
        end = time.monotonic() - start
        tp = len(indices) / end
        print('#### Testing Ray Prioritized Replay memory ####')
        print('Testing updating performance:')
        print('Updated {} loss batches, throughput: {} updates/s, total time: {} s'.format(
            len(indices), tp, end
        ))
Example 9
    def test_alpha_parameter(self):
        # Test sampling from a prioritized replay buffer with a very small
        # alpha (should behave just like a regular ReplayBuffer).
        memory = PrioritizedReplayBuffer(size=self.capacity, alpha=0.01)

        # Insert n samples.
        num_records = 5
        for i in range(num_records):
            data = self._generate_data()
            memory.add(*data, weight=np.random.rand())
            self.assertTrue(len(memory) == i + 1)
            self.assertTrue(memory._next_idx == i + 1)

        # Fetch records, their indices and weights.
        _, _, _, _, _, weights, indices = \
            memory.sample(1000, beta=self.beta)
        counts = Counter()
        for i in indices:
            counts[i] += 1
        print(counts)
        # Expect an approximately uniform distribution of indices.
        for i in counts.values():
            self.assertTrue(100 < i < 300)
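The near-uniform counts follow from the standard prioritized-replay formula: index i is drawn with probability p_i^alpha / sum_j p_j^alpha, so as alpha approaches 0 every priority collapses toward 1 and sampling becomes uniform. A quick numerical check of that formula (assumed here to be what the buffer computes internally):

import numpy as np

priorities = np.array([0.9, 0.1, 0.5, 0.05, 0.7])
for alpha in (1.0, 0.01):
    probs = priorities ** alpha
    probs = probs / probs.sum()
    print(alpha, probs)
# alpha=1.0  -> probabilities proportional to the raw priorities
# alpha=0.01 -> all five probabilities close to 0.2, i.e. uniform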
Example 10
    def test_ray_combined_ops(self):
        """
        Tests a combined workflow of insert, sample, update on the prioritized replay memory.
        """
        assert get_distributed_backend() == "ray"
        memory = PrioritizedReplayBuffer(
            size=self.capacity,
            alpha=1.0,
            clip_rewards=True
        )
        chunksize = 32

        # Test chunked inserts -> done via external for loop in Ray.
        chunks = int(self.inserts / chunksize)
        records = [self.record_space.sample(size=chunksize) for _ in range_(chunks)]
        loss_values = [np.random.random(size=self.sample_batch_size) for _ in range_(chunks)]
        start = time.monotonic()

        for chunk, loss in zip(records, loss_values):
            # Insert.
            for i in range_(chunksize):
                memory.add(
                    obs_t=ray_compress(chunk['states'][i]),
                    action=chunk['actions'][i],
                    reward=chunk['reward'][i],
                    obs_tp1=ray_compress(chunk['states'][i]),
                    done=chunk['terminals'][i],
                    weight=None
                )
            # Sample.
            batch_tuple = memory.sample(self.sample_batch_size, beta=1.0)
            indices = batch_tuple[-1]
            # Update
            memory.update_priorities(indices, loss)

        end = time.monotonic() - start
        tp = len(records) / end
        print('Ray: testing combined insert/sample/update performance:')
        print('Ran {} combined ops, throughput: {} combined ops/s, total time: {} s'.format(
            len(records), tp, end
        ))
Example 11
def new_buffer():
    return PrioritizedReplayBuffer(self.buffer_size,
                                   alpha=prioritized_replay_alpha)
Example 12
class ReplayActor(object):
    """A replay buffer shard.

    Ray actors are single-threaded, so for scalability multiple replay actors
    may be created to increase parallelism."""
    def __init__(self, num_shards, learning_starts, buffer_size,
                 train_batch_size, prioritized_replay_alpha,
                 prioritized_replay_beta, prioritized_replay_eps):
        self.replay_starts = learning_starts // num_shards
        self.buffer_size = buffer_size // num_shards
        self.train_batch_size = train_batch_size
        self.prioritized_replay_beta = prioritized_replay_beta
        self.prioritized_replay_eps = prioritized_replay_eps

        self.replay_buffer = PrioritizedReplayBuffer(
            self.buffer_size, alpha=prioritized_replay_alpha)

        # Metrics
        self.add_batch_timer = TimerStat()
        self.replay_timer = TimerStat()
        self.update_priorities_timer = TimerStat()

    def get_host(self):
        return os.uname()[1]

    def add_batch(self, batch):
        PolicyOptimizer._check_not_multiagent(batch)
        with self.add_batch_timer:
            for row in batch.rows():
                self.replay_buffer.add(row["obs"], row["actions"],
                                       row["rewards"], row["new_obs"],
                                       row["dones"], row["weights"])

    def replay(self):
        with self.replay_timer:
            if len(self.replay_buffer) < self.replay_starts:
                return None

            (obses_t, actions, rewards, obses_tp1, dones, weights,
             batch_indexes) = self.replay_buffer.sample(
                 self.train_batch_size, beta=self.prioritized_replay_beta)

            batch = SampleBatch({
                "obs": obses_t,
                "actions": actions,
                "rewards": rewards,
                "new_obs": obses_tp1,
                "dones": dones,
                "weights": weights,
                "batch_indexes": batch_indexes
            })
            return batch

    def update_priorities(self, batch_indexes, td_errors):
        with self.update_priorities_timer:
            new_priorities = (np.abs(td_errors) + self.prioritized_replay_eps)
            self.replay_buffer.update_priorities(batch_indexes, new_priorities)

    def stats(self):
        stat = {
            "add_batch_time_ms":
            round(1000 * self.add_batch_timer.mean, 3),
            "replay_time_ms":
            round(1000 * self.replay_timer.mean, 3),
            "update_priorities_time_ms":
            round(1000 * self.update_priorities_timer.mean, 3),
        }
        stat.update(self.replay_buffer.stats())
        return stat
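To make the docstring's scalability point concrete: each shard receives buffer_size // num_shards of the total capacity, so several single-threaded actors can serve adds and samples in parallel. A minimal sketch of creating and querying such shards (assuming ray.init() has been called and ReplayActor is wrapped as a Ray actor; argument values reuse the defaults from Example 4):

import ray

ray.init()
RemoteReplayActor = ray.remote(ReplayActor)
num_shards = 4
replay_actors = [
    RemoteReplayActor.remote(
        num_shards=num_shards,
        learning_starts=1000,
        buffer_size=10000,   # each shard keeps 10000 // 4 entries
        train_batch_size=32,
        prioritized_replay_alpha=0.6,
        prioritized_replay_beta=0.4,
        prioritized_replay_eps=1e-6)
    for _ in range(num_shards)
]
# Sample from all shards concurrently; each call may return None
# until a shard has seen learning_starts // num_shards records.
batches = ray.get([actor.replay.remote() for actor in replay_actors])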
Example 13
    def test_update_priorities(self):
        memory = PrioritizedReplayBuffer(size=self.capacity, alpha=self.alpha)

        # Insert n samples.
        num_records = 5
        for i in range(num_records):
            data = self._generate_data()
            memory.add(*data, weight=1.0)
            self.assertTrue(len(memory) == i + 1)
            self.assertTrue(memory._next_idx == i + 1)

        # Fetch records, their indices and weights.
        _, _, _, _, _, weights, indices = \
            memory.sample(3, beta=self.beta)
        check(weights, np.ones(shape=(3, )))
        self.assertEqual(3, len(indices))
        self.assertTrue(len(memory) == num_records)
        self.assertTrue(memory._next_idx == num_records)

        # Update weight of indices 0, 2, 3, 4 to very small.
        memory.update_priorities(np.array([0, 2, 3, 4]),
                                 np.array([0.01, 0.01, 0.01, 0.01]))
        # Expect to sample almost only index 1
        # (which still has a weight of 1.0).
        for _ in range(10):
            _, _, _, _, _, weights, indices = memory.sample(1000,
                                                            beta=self.beta)
            self.assertTrue(970 < np.sum(indices) < 1100)

        # Update weight of indices 0 and 1 to >> 0.01.
        # Expect to sample 0 and 1 equally (and some 2s, 3s, and 4s).
        for _ in range(10):
            rand = np.random.random() + 0.2
            memory.update_priorities(np.array([0, 1]), np.array([rand, rand]))
            _, _, _, _, _, _, indices = memory.sample(1000, beta=self.beta)
            # Expect biased to higher values due to some 2s, 3s, and 4s.
            # print(np.sum(indices))
            self.assertTrue(400 < np.sum(indices) < 800)

        # Update weights to be 1:2.
        # Expect to sample double as often index 1 over index 0
        # plus very few times indices 2, 3, or 4.
        for _ in range(10):
            rand = np.random.random() + 0.2
            memory.update_priorities(np.array([0, 1]),
                                     np.array([rand, rand * 2]))
            _, _, _, _, _, _, indices = memory.sample(1000, beta=self.beta)
            # print(np.sum(indices))
            self.assertTrue(600 < np.sum(indices) < 850)

        # Update weights to be 1:4.
        # Expect to sample quadruple as often index 1 over index 0
        # plus very few times indices 2, 3, or 4.
        for _ in range(10):
            rand = np.random.random() + 0.2
            memory.update_priorities(np.array([0, 1]),
                                     np.array([rand, rand * 4]))
            _, _, _, _, _, _, indices = memory.sample(1000, beta=self.beta)
            # print(np.sum(indices))
            self.assertTrue(750 < np.sum(indices) < 950)

        # Update weights to be 1:9.
        # Expect to sample 9 times as often index 1 over index 0.
        # plus very few times indices 2, 3, or 4.
        for _ in range(10):
            rand = np.random.random() + 0.2
            memory.update_priorities(np.array([0, 1]),
                                     np.array([rand, rand * 9]))
            _, _, _, _, _, _, indices = memory.sample(1000, beta=self.beta)
            # print(np.sum(indices))
            self.assertTrue(850 < np.sum(indices) < 1100)

        # Insert n more samples.
        num_records = 5
        for i in range(num_records):
            data = self._generate_data()
            memory.add(*data, weight=1.0)
            self.assertTrue(len(memory) == i + 6)
            self.assertTrue(memory._next_idx == (i + 6) % self.capacity)

        # Update all 10 weights to strictly increasing values and sample
        # several batches of 100 to 600 records.
        memory.update_priorities(
            np.array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9]),
            np.array([0.001, 0.1, 2., 8., 16., 32., 64., 128., 256., 512.]))
        counts = Counter()
        for _ in range(10):
            _, _, _, _, _, _, indices = memory.sample(
                np.random.randint(100, 600), beta=self.beta)
            for i in indices:
                counts[i] += 1
        print(counts)
        # Expect an approximately correct distribution of indices.
        self.assertTrue(
            counts[9] >= counts[8] >= counts[7] >= counts[6] >= counts[5] >=
            counts[4] >= counts[3] >= counts[2] >= counts[1] >= counts[0])
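The weights checked at the top of this test come from the usual importance-sampling correction, w_i = (N * P(i))^-beta normalized by the largest weight, which is why uniform priorities produce weights of exactly 1.0. A small sketch of that formula (assumed to match what sample() computes):

import numpy as np

def is_weights(probs, beta):
    # Importance-sampling weights for sampling probabilities `probs`.
    n = len(probs)
    w = (n * np.asarray(probs)) ** -beta
    return w / w.max()

print(is_weights([0.2] * 5, beta=0.4))                   # uniform -> all ones
print(is_weights([0.6, 0.1, 0.1, 0.1, 0.1], beta=0.4))   # low-probability items get the largest weights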
Example 14
class ReplayActor(object):
    def __init__(self, num_shards, learning_starts, buffer_size,
                 train_batch_size, prioritized_replay_alpha,
                 prioritized_replay_beta, prioritized_replay_eps):
        self.replay_starts = learning_starts // num_shards
        self.buffer_size = buffer_size // num_shards
        self.train_batch_size = train_batch_size
        self.prioritized_replay_beta = prioritized_replay_beta
        self.prioritized_replay_eps = prioritized_replay_eps

        self.replay_buffer = PrioritizedReplayBuffer(
            buffer_size, alpha=prioritized_replay_alpha)

        # Metrics
        self.add_batch_timer = TimerStat()
        self.replay_timer = TimerStat()
        self.update_priorities_timer = TimerStat()

    def get_host(self):
        return os.uname()[1]

    def add_batch(self, batch):
        with self.add_batch_timer:
            for row in batch.rows():
                self.replay_buffer.add(row["obs"], row["actions"],
                                       row["rewards"], row["new_obs"],
                                       row["dones"], row["weights"])

    def replay(self):
        with self.replay_timer:
            if len(self.replay_buffer) < self.replay_starts:
                return None

            (obses_t, actions, rewards, obses_tp1, dones, weights,
             batch_indexes) = self.replay_buffer.sample(
                 self.train_batch_size, beta=self.prioritized_replay_beta)

            batch = SampleBatch({
                "obs": obses_t,
                "actions": actions,
                "rewards": rewards,
                "new_obs": obses_tp1,
                "dones": dones,
                "weights": weights,
                "batch_indexes": batch_indexes
            })
            return batch

    def update_priorities(self, batch, td_errors):
        with self.update_priorities_timer:
            new_priorities = (np.abs(td_errors) + self.prioritized_replay_eps)
            self.replay_buffer.update_priorities(batch["batch_indexes"],
                                                 new_priorities)

    def stats(self):
        stat = {
            "add_batch_time_ms":
            round(1000 * self.add_batch_timer.mean, 3),
            "replay_time_ms":
            round(1000 * self.replay_timer.mean, 3),
            "update_priorities_time_ms":
            round(1000 * self.update_priorities_timer.mean, 3),
        }
        stat.update(self.replay_buffer.stats())
        return stat
Example 15
def new_buffer():
    return PrioritizedReplayBuffer(
        buffer_size,
        alpha=prioritized_replay_alpha,
        clip_rewards=clip_rewards)