def test_ray_updating(self):
    """
    Tests update performance of Ray's prioritized replay memory.
    """
    assert get_distributed_backend() == "ray"
    memory = PrioritizedReplayBuffer(
        size=self.capacity,
        alpha=1.0,
        clip_rewards=True
    )
    records = [self.record_space.sample(size=1) for _ in range_(self.inserts)]
    for record in records:
        memory.add(
            obs_t=record['states'],
            action=record['actions'],
            reward=record['reward'],
            obs_tp1=record['states'],
            done=record['terminals'],
            weight=None
        )
    loss_values = [np.random.random(size=self.sample_batch_size)
                   for _ in range_(self.samples)]
    indices = [np.random.randint(low=0, high=self.inserts, size=self.sample_batch_size)
               for _ in range_(self.samples)]

    start = time.monotonic()
    for index, loss in zip(indices, loss_values):
        memory.update_priorities(index, loss)
    end = time.monotonic() - start
    tp = len(indices) / end
    print('#### Testing Ray Prioritized Replay memory ####')
    print('Testing updating performance:')
    print('Updated {} loss batches, throughput: {} updates/s, total time: {} s'.format(
        len(indices), tp, end
    ))
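# The test above depends on fixture attributes (self.capacity, self.inserts,
# self.sample_batch_size, self.samples, self.record_space) that the test
# class defines elsewhere. A self-contained stand-in for those fixtures is
# sketched below; all names, shapes, and values are illustrative
# assumptions, not the real setUp().
import numpy as np

capacity = 100000          # stands in for self.capacity
inserts = 10000            # stands in for self.inserts
sample_batch_size = 64     # stands in for self.sample_batch_size
samples = 100              # stands in for self.samples

class RecordSpace(object):
    # Stand-in for self.record_space: sampling returns a dict of arrays
    # keyed exactly like the records consumed by memory.add() above.
    def sample(self, size=1):
        return dict(
            states=np.random.random(size=(size, 4)),
            actions=np.random.randint(low=0, high=4, size=size),
            reward=np.random.random(size=size),
            terminals=np.zeros(size, dtype=bool),
        )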
def test_ray_combined_ops(self):
    """
    Tests a combined workflow of insert, sample, update on the
    prioritized replay memory.
    """
    assert get_distributed_backend() == "ray"
    memory = PrioritizedReplayBuffer(
        size=self.capacity,
        alpha=1.0,
        clip_rewards=True
    )
    chunksize = 32

    # Test chunked inserts -> done via external for loop in Ray.
    chunks = int(self.inserts / chunksize)
    records = [self.record_space.sample(size=chunksize) for _ in range_(chunks)]
    loss_values = [np.random.random(size=self.sample_batch_size)
                   for _ in range_(chunks)]

    start = time.monotonic()
    # Note: the loop variable must not shadow the `loss_values` list.
    for chunk, loss in zip(records, loss_values):
        # Insert.
        for i in range_(chunksize):
            memory.add(
                obs_t=ray_compress(chunk['states'][i]),
                action=chunk['actions'][i],
                reward=chunk['reward'][i],
                obs_tp1=ray_compress(chunk['states'][i]),
                done=chunk['terminals'][i],
                weight=None
            )
        # Sample.
        batch_tuple = memory.sample(self.sample_batch_size, beta=1.0)
        indices = batch_tuple[-1]
        # Update.
        memory.update_priorities(indices, loss)
    end = time.monotonic() - start
    tp = len(records) / end
    print('Ray: testing combined insert/sample/update performance:')
    print('Ran {} combined ops, throughput: {} combined ops/s, total time: {} s'.format(
        len(records), tp, end
    ))
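# `ray_compress` above is an imported helper that shrinks observations
# before they are stored in the buffer. A rough stand-in with the same
# contract (object in, compact bytes out) is sketched below; the real
# helper's codec is not taken from this source, so treat this as an
# assumption for illustration.
import pickle
import zlib

def ray_compress(data):
    # Serialize the observation, then compress it so large state arrays
    # occupy less memory inside the replay buffer.
    return zlib.compress(pickle.dumps(data))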
class ReplayActor(object):
    """A replay buffer shard.

    Ray actors are single-threaded, so for scalability multiple replay
    actors may be created to increase parallelism."""

    def __init__(self, num_shards, learning_starts, buffer_size,
                 train_batch_size, prioritized_replay_alpha,
                 prioritized_replay_beta, prioritized_replay_eps):
        self.replay_starts = learning_starts // num_shards
        self.buffer_size = buffer_size // num_shards
        self.train_batch_size = train_batch_size
        self.prioritized_replay_beta = prioritized_replay_beta
        self.prioritized_replay_eps = prioritized_replay_eps
        self.replay_buffer = PrioritizedReplayBuffer(
            self.buffer_size, alpha=prioritized_replay_alpha)

        # Metrics
        self.add_batch_timer = TimerStat()
        self.replay_timer = TimerStat()
        self.update_priorities_timer = TimerStat()

    def get_host(self):
        return os.uname()[1]

    def add_batch(self, batch):
        PolicyOptimizer._check_not_multiagent(batch)
        with self.add_batch_timer:
            for row in batch.rows():
                self.replay_buffer.add(row["obs"], row["actions"],
                                       row["rewards"], row["new_obs"],
                                       row["dones"], row["weights"])

    def replay(self):
        with self.replay_timer:
            if len(self.replay_buffer) < self.replay_starts:
                return None

            (obses_t, actions, rewards, obses_tp1, dones, weights,
             batch_indexes) = self.replay_buffer.sample(
                 self.train_batch_size, beta=self.prioritized_replay_beta)

            batch = SampleBatch({
                "obs": obses_t,
                "actions": actions,
                "rewards": rewards,
                "new_obs": obses_tp1,
                "dones": dones,
                "weights": weights,
                "batch_indexes": batch_indexes
            })
            return batch

    def update_priorities(self, batch_indexes, td_errors):
        with self.update_priorities_timer:
            new_priorities = (np.abs(td_errors) + self.prioritized_replay_eps)
            self.replay_buffer.update_priorities(batch_indexes, new_priorities)

    def stats(self):
        stat = {
            "add_batch_time_ms": round(1000 * self.add_batch_timer.mean, 3),
            "replay_time_ms": round(1000 * self.replay_timer.mean, 3),
            "update_priorities_time_ms": round(
                1000 * self.update_priorities_timer.mean, 3),
        }
        stat.update(self.replay_buffer.stats())
        return stat
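# A minimal sketch of how such replay shards might be created and used from
# a driver process. Wrapping a class with ray.remote() and calling
# .remote(...) is standard Ray usage; the shard count and hyperparameter
# values below are illustrative assumptions.
import ray

ray.init()
RemoteReplayActor = ray.remote(ReplayActor)
num_shards = 4
replay_actors = [
    # Args: num_shards, learning_starts, buffer_size, train_batch_size,
    # prioritized_replay_alpha, prioritized_replay_beta,
    # prioritized_replay_eps.
    RemoteReplayActor.remote(num_shards, 1000, 100000, 32, 0.6, 0.4, 1e-6)
    for _ in range(num_shards)
]
# Each shard then serves add_batch()/replay()/update_priorities() calls
# independently, so inserts and samples can proceed in parallel:
batches = ray.get([actor.replay.remote() for actor in replay_actors])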
def test_update_priorities(self):
    memory = PrioritizedReplayBuffer(size=self.capacity, alpha=self.alpha)

    # Insert n samples.
    num_records = 5
    for i in range(num_records):
        data = self._generate_data()
        memory.add(*data, weight=1.0)
        self.assertTrue(len(memory) == i + 1)
        self.assertTrue(memory._next_idx == i + 1)

    # Fetch records, their indices and weights.
    _, _, _, _, _, weights, indices = \
        memory.sample(3, beta=self.beta)
    check(weights, np.ones(shape=(3, )))
    self.assertEqual(3, len(indices))
    self.assertTrue(len(memory) == num_records)
    self.assertTrue(memory._next_idx == num_records)

    # Update weight of indices 0, 2, 3, 4 to very small.
    memory.update_priorities(
        np.array([0, 2, 3, 4]), np.array([0.01, 0.01, 0.01, 0.01]))
    # Expect to sample almost only index 1
    # (which still has a weight of 1.0).
    for _ in range(10):
        _, _, _, _, _, weights, indices = memory.sample(1000, beta=self.beta)
        self.assertTrue(970 < np.sum(indices) < 1100)

    # Update weight of indices 0 and 1 to >> 0.01.
    # Expect to sample 0 and 1 equally (and some 2s, 3s, and 4s).
    for _ in range(10):
        rand = np.random.random() + 0.2
        memory.update_priorities(np.array([0, 1]), np.array([rand, rand]))
        _, _, _, _, _, _, indices = memory.sample(1000, beta=self.beta)
        # Expect bias toward higher values due to some 2s, 3s, and 4s.
        self.assertTrue(400 < np.sum(indices) < 800)

    # Update weights to be 1:2.
    # Expect to sample index 1 twice as often as index 0,
    # plus very few times indices 2, 3, or 4.
    for _ in range(10):
        rand = np.random.random() + 0.2
        memory.update_priorities(
            np.array([0, 1]), np.array([rand, rand * 2]))
        _, _, _, _, _, _, indices = memory.sample(1000, beta=self.beta)
        self.assertTrue(600 < np.sum(indices) < 850)

    # Update weights to be 1:4.
    # Expect to sample index 1 four times as often as index 0,
    # plus very few times indices 2, 3, or 4.
    for _ in range(10):
        rand = np.random.random() + 0.2
        memory.update_priorities(
            np.array([0, 1]), np.array([rand, rand * 4]))
        _, _, _, _, _, _, indices = memory.sample(1000, beta=self.beta)
        self.assertTrue(750 < np.sum(indices) < 950)

    # Update weights to be 1:9.
    # Expect to sample index 1 nine times as often as index 0,
    # plus very few times indices 2, 3, or 4.
    for _ in range(10):
        rand = np.random.random() + 0.2
        memory.update_priorities(
            np.array([0, 1]), np.array([rand, rand * 9]))
        _, _, _, _, _, _, indices = memory.sample(1000, beta=self.beta)
        self.assertTrue(850 < np.sum(indices) < 1100)

    # Insert n more samples.
    num_records = 5
    for i in range(num_records):
        data = self._generate_data()
        memory.add(*data, weight=1.0)
        self.assertTrue(len(memory) == i + 6)
        self.assertTrue(memory._next_idx == (i + 6) % self.capacity)

    # Update all 10 weights to strictly increasing values and sample
    # batches of size >100.
    memory.update_priorities(
        np.array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9]),
        np.array([0.001, 0.1, 2., 8., 16., 32., 64., 128., 256., 512.]))
    counts = Counter()
    for _ in range(10):
        _, _, _, _, _, _, indices = memory.sample(
            np.random.randint(100, 600), beta=self.beta)
        for i in indices:
            counts[i] += 1
    print(counts)
    # Expect an approximately correct distribution of indices.
    self.assertTrue(
        counts[9] >= counts[8] >= counts[7] >= counts[6] >=
        counts[5] >= counts[4] >= counts[3] >= counts[2] >=
        counts[1] >= counts[0])
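# Why the 1:2 / 1:4 / 1:9 ratios above translate into the asserted sampling
# counts: with proportional prioritization, P(i) = p_i**alpha / sum_k
# p_k**alpha. A minimal numpy sketch of that relationship (alpha = 1.0
# assumed here for simplicity):
import numpy as np

priorities = np.array([1.0, 2.0, 0.01, 0.01, 0.01])
alpha = 1.0
probs = priorities ** alpha / np.sum(priorities ** alpha)
samples = np.random.choice(len(priorities), size=1000, p=probs)
# Index 1 shows up roughly twice as often as index 0, matching the
# expectations asserted in the test above.
print(np.bincount(samples, minlength=len(priorities)))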
class ReplayActor(object):
    def __init__(self, num_shards, learning_starts, buffer_size,
                 train_batch_size, prioritized_replay_alpha,
                 prioritized_replay_beta, prioritized_replay_eps):
        self.replay_starts = learning_starts // num_shards
        self.buffer_size = buffer_size // num_shards
        self.train_batch_size = train_batch_size
        self.prioritized_replay_beta = prioritized_replay_beta
        self.prioritized_replay_eps = prioritized_replay_eps
        # Use the per-shard buffer size, not the global one.
        self.replay_buffer = PrioritizedReplayBuffer(
            self.buffer_size, alpha=prioritized_replay_alpha)

        # Metrics
        self.add_batch_timer = TimerStat()
        self.replay_timer = TimerStat()
        self.update_priorities_timer = TimerStat()

    def get_host(self):
        return os.uname()[1]

    def add_batch(self, batch):
        with self.add_batch_timer:
            for row in batch.rows():
                self.replay_buffer.add(row["obs"], row["actions"],
                                       row["rewards"], row["new_obs"],
                                       row["dones"], row["weights"])

    def replay(self):
        with self.replay_timer:
            if len(self.replay_buffer) < self.replay_starts:
                return None

            (obses_t, actions, rewards, obses_tp1, dones, weights,
             batch_indexes) = self.replay_buffer.sample(
                 self.train_batch_size, beta=self.prioritized_replay_beta)

            batch = SampleBatch({
                "obs": obses_t,
                "actions": actions,
                "rewards": rewards,
                "new_obs": obses_tp1,
                "dones": dones,
                "weights": weights,
                "batch_indexes": batch_indexes
            })
            return batch

    def update_priorities(self, batch, td_errors):
        with self.update_priorities_timer:
            new_priorities = (np.abs(td_errors) + self.prioritized_replay_eps)
            self.replay_buffer.update_priorities(batch["batch_indexes"],
                                                 new_priorities)

    def stats(self):
        stat = {
            "add_batch_time_ms": round(1000 * self.add_batch_timer.mean, 3),
            "replay_time_ms": round(1000 * self.replay_timer.mean, 3),
            "update_priorities_time_ms": round(
                1000 * self.update_priorities_timer.mean, 3),
        }
        stat.update(self.replay_buffer.stats())
        return stat
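# In this variant, update_priorities() receives the whole SampleBatch and
# extracts "batch_indexes" itself, so the learner hands back exactly the
# batch it trained on. A sketch of that round trip, assuming a single local
# shard and using random placeholder TD-errors in place of a real learner:
replay_actor = ReplayActor(1, 1000, 100000, 32, 0.6, 0.4, 1e-6)
batch = replay_actor.replay()
if batch is not None:
    td_errors = np.random.random(replay_actor.train_batch_size)  # placeholder
    replay_actor.update_priorities(batch, td_errors)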