def test_ray_updating(self):
    """
    Benchmarks `update_priorities` on Ray's prioritized replay buffer.

    The buffer is pre-filled with `self.inserts` single records; only the
    priority updates themselves are timed.
    """
    assert get_distributed_backend() == "ray"

    buffer = PrioritizedReplayBuffer(
        size=self.capacity,
        alpha=1.0,
        clip_rewards=True
    )

    # Fill phase (not timed).
    samples = [
        self.record_space.sample(size=1) for _ in range_(self.inserts)
    ]
    for sample in samples:
        buffer.add(
            obs_t=sample['states'],
            action=sample['actions'],
            reward=sample['reward'],
            obs_tp1=sample['states'],
            done=sample['terminals'],
            weight=None
        )

    # Pre-generate all update inputs so that only the buffer is measured.
    loss_batches = [
        np.random.random(size=self.sample_batch_size)
        for _ in range_(self.samples)
    ]
    index_batches = [
        np.random.randint(
            low=0, high=self.inserts, size=self.sample_batch_size
        )
        for _ in range_(self.samples)
    ]

    # Timed phase: one `update_priorities` call per (indices, losses) pair.
    t0 = time.monotonic()
    for batch_indices, batch_losses in zip(index_batches, loss_batches):
        buffer.update_priorities(batch_indices, batch_losses)
    elapsed = time.monotonic() - t0
    throughput = len(index_batches) / elapsed

    print('#### Testing Ray Prioritized Replay memory ####')
    print('Testing updating performance:')
    print('Updates {} loss batches, throughput: {} updates/s, total time: {} s'.format(
        len(index_batches), throughput, elapsed
    ))
def test_ray_sampling(self):
    """
    Benchmarks `sample` on Ray's prioritized replay buffer.

    The buffer is pre-filled with compressed observations; only the
    repeated sampling calls are timed.
    """
    assert get_distributed_backend() == "ray"

    buffer = PrioritizedReplayBuffer(
        size=self.capacity,
        alpha=1.0,
        clip_rewards=True
    )

    # Fill phase (not timed). The states are compressed separately for the
    # current and the next observation slot.
    samples = [
        self.record_space.sample(size=1) for _ in range_(self.inserts)
    ]
    for sample in samples:
        buffer.add(
            obs_t=ray_compress(sample['states']),
            action=sample['actions'],
            reward=sample['reward'],
            obs_tp1=ray_compress(sample['states']),
            done=sample['terminals'],
            weight=None
        )

    # Timed phase: the sampled batches themselves are discarded.
    t0 = time.monotonic()
    for _ in range_(self.samples):
        _ = buffer.sample(self.sample_batch_size, beta=1.0)
    elapsed = time.monotonic() - t0
    throughput = self.samples / elapsed

    print('#### Testing Ray Prioritized Replay memory ####')
    print('Testing sampling performance:')
    print('Sampled {} batches, throughput: {} samples/s, total time: {} s'.format(
        self.samples, throughput, elapsed
    ))
def test_ray_prioritized_replay_insert(self):
    """
    Benchmarks `add` on Ray's prioritized replay buffer.

    Two scenarios are measured, each on a fresh buffer: records sampled one
    at a time, and records sampled in chunks of `self.chunksize` that are
    then inserted row by row.
    """
    assert get_distributed_backend() == "ray"

    # --- Scenario 1: individually sampled records. ---
    buffer = PrioritizedReplayBuffer(
        size=self.capacity,
        alpha=1.0,
        clip_rewards=True
    )
    single_records = [
        self.record_space.sample(size=1) for _ in range_(self.inserts)
    ]

    t0 = time.monotonic()
    for rec in single_records:
        buffer.add(
            obs_t=rec['states'],
            action=rec['actions'],
            reward=rec['reward'],
            obs_tp1=rec['states'],
            done=rec['terminals'],
            weight=None
        )
    elapsed = time.monotonic() - t0
    throughput = len(single_records) / elapsed

    print('#### Testing Ray Prioritized Replay memory ####')
    print('Testing insert performance:')
    print('Inserted {} separate records, throughput: {} records/s, total time: {} s'.format(
        len(single_records), throughput, elapsed
    ))

    # --- Scenario 2: chunked records. ---
    # The buffer is fed one row per `add` call, so every chunk is unrolled
    # by an explicit inner loop.
    buffer = PrioritizedReplayBuffer(
        size=self.capacity,
        alpha=1.0,
        clip_rewards=True
    )
    num_chunks = int(self.inserts / self.chunksize)
    chunked_records = [
        self.record_space.sample(size=self.chunksize)
        for _ in range_(num_chunks)
    ]

    t0 = time.monotonic()
    for chunk in chunked_records:
        for row in range_(self.chunksize):
            buffer.add(
                obs_t=chunk['states'][row],
                action=chunk['actions'][row],
                reward=chunk['reward'][row],
                obs_tp1=chunk['states'][row],
                done=chunk['terminals'][row],
                weight=None
            )
    elapsed = time.monotonic() - t0
    throughput = len(chunked_records) * self.chunksize / elapsed

    print('Testing chunked insert performance:')
    print('Inserted {} chunks, throughput: {} records/s, total time: {} s'.format(
        len(chunked_records), throughput, elapsed
    ))
def test_ray_combined_ops(self):
    """
    Benchmarks a combined insert -> sample -> update cycle on Ray's
    prioritized replay buffer.

    One "combined op" consists of inserting a chunk of 32 records row by
    row, sampling one batch and updating the priorities of the sampled
    indices.
    """
    assert get_distributed_backend() == "ray"

    buffer = PrioritizedReplayBuffer(
        size=self.capacity,
        alpha=1.0,
        clip_rewards=True
    )
    rows_per_chunk = 32

    # Pre-generate all inputs so that only buffer operations are timed.
    num_chunks = int(self.inserts / rows_per_chunk)
    chunked_records = [
        self.record_space.sample(size=rows_per_chunk)
        for _ in range_(num_chunks)
    ]
    loss_batches = [
        np.random.random(size=self.sample_batch_size)
        for _ in range_(num_chunks)
    ]

    t0 = time.monotonic()
    for chunk, batch_losses in zip(chunked_records, loss_batches):
        # Insert the chunk one row at a time.
        for row in range_(rows_per_chunk):
            buffer.add(
                obs_t=ray_compress(chunk['states'][row]),
                action=chunk['actions'][row],
                reward=chunk['reward'][row],
                obs_tp1=ray_compress(chunk['states'][row]),
                done=chunk['terminals'][row],
                weight=None
            )

        # Sample; the last element of the returned tuple is used as the
        # indices of the sampled records.
        sampled = buffer.sample(self.sample_batch_size, beta=1.0)
        sampled_indices = sampled[-1]

        # Update the priorities of exactly those records.
        buffer.update_priorities(sampled_indices, batch_losses)
    elapsed = time.monotonic() - t0
    throughput = len(chunked_records) / elapsed

    print('Ray: testing combined insert/sample/update performance:')
    print('Ran {} combined ops, throughput: {} combined ops/s, total time: {} s'.format(
        len(chunked_records), throughput, elapsed
    ))
def test_add(self):
    """
    Checks the size and the write position of a capacity-2 buffer while
    records are added up to and beyond its capacity.
    """
    memory = PrioritizedReplayBuffer(
        size=2,
        alpha=self.alpha,
    )

    # Buffer starts empty, write position at slot 0.
    self.assertEqual(len(memory), 0)
    self.assertEqual(memory._next_idx, 0)

    # First record: one entry stored, write position advances to slot 1.
    data = self._generate_data()
    memory.add(*data, weight=0.5)
    self.assertEqual(len(memory), 1)
    self.assertEqual(memory._next_idx, 1)

    # Second record: buffer is full, write position wraps back to slot 0.
    data = self._generate_data()
    memory.add(*data, weight=0.1)
    self.assertEqual(len(memory), 2)
    self.assertEqual(memory._next_idx, 0)

    # Third record (over capacity): size stays at 2 while the write
    # position keeps advancing.
    data = self._generate_data()
    memory.add(*data, weight=1.0)
    self.assertEqual(len(memory), 2)
    self.assertEqual(memory._next_idx, 1)
def test_alpha_parameter(self):
    """
    Checks that a very small alpha makes sampling approximately uniform.

    Five records with random weights are inserted; with alpha=0.01 each of
    them is expected to be drawn roughly 200 times out of 1000 samples.
    """
    # With a very small alpha the buffer should behave just like a regular
    # (non-prioritized) ReplayBuffer.
    memory = PrioritizedReplayBuffer(size=self.capacity, alpha=0.01)

    # Insert n samples.
    num_records = 5
    for i in range(num_records):
        data = self._generate_data()
        memory.add(*data, weight=np.random.rand())
        self.assertEqual(len(memory), i + 1)
        self.assertEqual(memory._next_idx, i + 1)

    # Fetch records; only the sampled indices are of interest here.
    _, _, _, _, _, _, indices = memory.sample(1000, beta=self.beta)
    counts = Counter(indices)
    print(counts)

    # Expect an approximately uniform distribution of indices. Every
    # inserted index is checked explicitly: an index that was never sampled
    # has no entry in `counts` and would go unnoticed when only iterating
    # over the observed counts.
    for idx in range(num_records):
        self.assertTrue(
            100 < counts[idx] < 300,
            msg="index {} was sampled {} times".format(idx, counts[idx]))
class ReplayActor(object):
    """One shard of a prioritized replay buffer.

    A Ray actor is single-threaded, so several of these shards may be
    created side by side to increase parallelism.
    """

    def __init__(self, num_shards, learning_starts, buffer_size,
                 train_batch_size, prioritized_replay_alpha,
                 prioritized_replay_beta, prioritized_replay_eps):
        # Global settings are split evenly (integer division) across shards.
        self.replay_starts = learning_starts // num_shards
        self.buffer_size = buffer_size // num_shards

        self.train_batch_size = train_batch_size
        self.prioritized_replay_beta = prioritized_replay_beta
        self.prioritized_replay_eps = prioritized_replay_eps

        self.replay_buffer = PrioritizedReplayBuffer(
            self.buffer_size, alpha=prioritized_replay_alpha)

        # Metrics: one timer per public operation.
        self.add_batch_timer = TimerStat()
        self.replay_timer = TimerStat()
        self.update_priorities_timer = TimerStat()

    def get_host(self):
        """Return the node name reported by `os.uname()`."""
        return os.uname()[1]

    def add_batch(self, batch):
        """Store every row of `batch` in the replay buffer."""
        PolicyOptimizer._check_not_multiagent(batch)
        columns = ("obs", "actions", "rewards", "new_obs", "dones",
                   "weights")
        with self.add_batch_timer:
            for row in batch.rows():
                self.replay_buffer.add(*(row[name] for name in columns))

    def replay(self):
        """Sample one training batch.

        Returns None while the buffer holds fewer than `replay_starts`
        entries, otherwise a SampleBatch of `train_batch_size` samples.
        """
        with self.replay_timer:
            if len(self.replay_buffer) < self.replay_starts:
                return None

            sampled = self.replay_buffer.sample(
                self.train_batch_size, beta=self.prioritized_replay_beta)
            (obs, acts, rews, next_obs, terminals, is_weights,
             indexes) = sampled

            return SampleBatch({
                "obs": obs,
                "actions": acts,
                "rewards": rews,
                "new_obs": next_obs,
                "dones": terminals,
                "weights": is_weights,
                "batch_indexes": indexes
            })

    def update_priorities(self, batch_indexes, td_errors):
        """Set the priorities at `batch_indexes` to |td_error| + eps."""
        with self.update_priorities_timer:
            priorities = np.abs(td_errors) + self.prioritized_replay_eps
            self.replay_buffer.update_priorities(batch_indexes, priorities)

    def stats(self):
        """Return mean timings (in ms) merged with the buffer's own stats."""
        timers = (
            ("add_batch_time_ms", self.add_batch_timer),
            ("replay_time_ms", self.replay_timer),
            ("update_priorities_time_ms", self.update_priorities_timer),
        )
        result = {
            key: round(1000 * timer.mean, 3) for key, timer in timers
        }
        result.update(self.replay_buffer.stats())
        return result
def test_update_priorities(self):
    """
    Checks that `update_priorities` shifts the sampling distribution.

    Because sampled indices are small integers, the sum over a batch of
    1000 sampled indices is used as a cheap statistic for how often each
    index was drawn.
    """
    memory = PrioritizedReplayBuffer(size=self.capacity, alpha=self.alpha)

    # Insert five records, all with the same weight.
    num_records = 5
    for i in range(num_records):
        memory.add(*self._generate_data(), weight=1.0)
        self.assertTrue(len(memory) == i + 1)
        self.assertTrue(memory._next_idx == i + 1)

    # Sampling returns the requested number of records and leaves the
    # buffer itself untouched.
    sampled = memory.sample(3, beta=self.beta)
    weights, indices = sampled[5], sampled[6]
    check(weights, np.ones(shape=(3, )))
    self.assertEqual(3, len(indices))
    self.assertTrue(len(memory) == num_records)
    self.assertTrue(memory._next_idx == num_records)

    # Make indices 0, 2, 3 and 4 very unlikely.
    memory.update_priorities(np.array([0, 2, 3, 4]), np.full(4, 0.01))

    # Expect to sample almost only index 1 (still at its initial weight),
    # i.e. an index sum close to 1000.
    for _ in range(10):
        indices = memory.sample(1000, beta=self.beta)[6]
        self.assertTrue(970 < np.sum(indices) < 1100)

    # Give indices 0 and 1 priorities >> 0.01 with a fixed ratio of
    # 1:`ratio`. The larger the ratio, the more index 1 dominates; the sums
    # are biased slightly upwards by the occasional 2s, 3s and 4s.
    #   (ratio, lower bound, upper bound) of the index sum
    ratio_stages = (
        (1, 400, 800),   # 0 and 1 equally likely
        (2, 600, 850),   # 1 twice as likely as 0
        (4, 750, 950),   # 1 four times as likely as 0
        (9, 850, 1100),  # 1 nine times as likely as 0
    )
    for ratio, lower, upper in ratio_stages:
        for _ in range(10):
            base = np.random.random() + 0.2
            memory.update_priorities(
                np.array([0, 1]), np.array([base, base * ratio]))
            indices = memory.sample(1000, beta=self.beta)[6]
            self.assertTrue(lower < np.sum(indices) < upper)

    # Insert five more records.
    num_records = 5
    for i in range(num_records):
        memory.add(*self._generate_data(), weight=1.0)
        self.assertTrue(len(memory) == i + 6)
        self.assertTrue(memory._next_idx == (i + 6) % self.capacity)

    # Assign strictly increasing priorities (0.001 up to 512) to indices
    # 0..9 and draw ten batches of random size (100 to 599 samples each).
    memory.update_priorities(
        np.arange(10),
        np.array([0.001, 0.1, 2., 8., 16., 32., 64., 128., 256., 512.]))
    counts = Counter()
    for _ in range(10):
        batch_size = np.random.randint(100, 600)
        indices = memory.sample(batch_size, beta=self.beta)[6]
        counts.update(indices)
    print(counts)

    # A higher priority must never lead to fewer draws than a lower one.
    self.assertTrue(
        all(counts[k] >= counts[k - 1] for k in range(9, 0, -1)))
class ReplayActor(object):
    """A replay buffer shard backed by a PrioritizedReplayBuffer.

    `learning_starts` and `buffer_size` are global settings; each shard
    takes a `1 / num_shards` share of them (integer division).
    """

    def __init__(self, num_shards, learning_starts, buffer_size,
                 train_batch_size, prioritized_replay_alpha,
                 prioritized_replay_beta, prioritized_replay_eps):
        self.replay_starts = learning_starts // num_shards
        self.buffer_size = buffer_size // num_shards
        self.train_batch_size = train_batch_size
        self.prioritized_replay_beta = prioritized_replay_beta
        self.prioritized_replay_eps = prioritized_replay_eps

        # Size the buffer with the per-shard capacity. Passing the global
        # `buffer_size` here would make every shard allocate the capacity
        # meant for all shards together.
        self.replay_buffer = PrioritizedReplayBuffer(
            self.buffer_size, alpha=prioritized_replay_alpha)

        # Metrics
        self.add_batch_timer = TimerStat()
        self.replay_timer = TimerStat()
        self.update_priorities_timer = TimerStat()

    def get_host(self):
        """Return the node name reported by `os.uname()`."""
        return os.uname()[1]

    def add_batch(self, batch):
        """Store every row of `batch` in the replay buffer."""
        with self.add_batch_timer:
            for row in batch.rows():
                self.replay_buffer.add(row["obs"], row["actions"],
                                       row["rewards"], row["new_obs"],
                                       row["dones"], row["weights"])

    def replay(self):
        """Sample one training batch.

        Returns:
            None while the buffer holds fewer than `replay_starts` entries,
            otherwise a SampleBatch of `train_batch_size` samples that also
            carries their "weights" and "batch_indexes".
        """
        with self.replay_timer:
            if len(self.replay_buffer) < self.replay_starts:
                return None

            (obses_t, actions, rewards, obses_tp1, dones, weights,
             batch_indexes) = self.replay_buffer.sample(
                 self.train_batch_size, beta=self.prioritized_replay_beta)

            return SampleBatch({
                "obs": obses_t,
                "actions": actions,
                "rewards": rewards,
                "new_obs": obses_tp1,
                "dones": dones,
                "weights": weights,
                "batch_indexes": batch_indexes
            })

    def update_priorities(self, batch, td_errors):
        """Set new priorities for the samples referenced by `batch`.

        Args:
            batch: Mapping whose "batch_indexes" entry holds the buffer
                indices to update.
            td_errors: TD errors for those indices; the new priority is
                |td_error| + prioritized_replay_eps.
        """
        with self.update_priorities_timer:
            new_priorities = (
                np.abs(td_errors) + self.prioritized_replay_eps)
            self.replay_buffer.update_priorities(batch["batch_indexes"],
                                                 new_priorities)

    def stats(self):
        """Return mean timings (in ms) merged with the buffer's own stats."""
        stat = {
            "add_batch_time_ms": round(1000 * self.add_batch_timer.mean, 3),
            "replay_time_ms": round(1000 * self.replay_timer.mean, 3),
            "update_priorities_time_ms": round(
                1000 * self.update_priorities_timer.mean, 3),
        }
        stat.update(self.replay_buffer.stats())
        return stat