def test_ray_sampling(self):
    """
    Tests Ray's memory sampling performance.
    """
    assert get_distributed_backend() == "ray"
    memory = PrioritizedReplayBuffer(
        size=self.capacity,
        alpha=1.0,
        clip_rewards=True
    )
    records = [self.record_space.sample(size=1) for _ in range_(self.inserts)]
    for record in records:
        memory.add(
            obs_t=ray_compress(record['states']),
            action=record['actions'],
            reward=record['reward'],
            obs_tp1=ray_compress(record['states']),
            done=record['terminals'],
            weight=None
        )
    start = time.monotonic()
    for _ in range_(self.samples):
        batch_tuple = memory.sample(self.sample_batch_size, beta=1.0)
    end = time.monotonic() - start
    tp = self.samples / end
    print('#### Testing Ray Prioritized Replay memory ####')
    print('Testing sampling performance:')
    print('Sampled {} batches, throughput: {} batches/s, total time: {} s'.format(
        self.samples, tp, end
    ))
def _batch_process_sample(self, states, actions, rewards, next_states, terminals):
    """
    Batch post-processes a sample, e.g. by computing priority weights and
    compressing states.

    Args:
        states (list): List of states.
        actions (list, dict): List of actions, or dict of lists for container actions.
        rewards (list): List of rewards.
        next_states (list): List of next states.
        terminals (list): List of terminals.

    Returns:
        dict: Sample batch dict.
        int: Number of records in the batch.
    """
    weights = np.ones_like(rewards)

    # Compute loss-per-item.
    if self.worker_executes_postprocessing:
        # Next states were just collected, we batch-process them here.
        _, loss_per_item = self.agent.post_process(
            dict(states=states, actions=actions, rewards=rewards,
                 terminals=terminals, next_states=next_states,
                 importance_weights=weights)
        )
        weights = np.abs(loss_per_item) + SMALL_NUMBER

    env_dtype = self.vector_env.state_space.dtype
    compressed_states = [
        ray_compress(np.asarray(state, dtype=util.convert_dtype(dtype=env_dtype, to='np')))
        for state in states
    ]
    # For n-step returns, the next state of record i is the state of record
    # i + n, so reuse the already-compressed states and only compress the
    # trailing n next_states.
    compressed_next_states = compressed_states[self.n_step_adjustment:] + [
        ray_compress(np.asarray(next_s, dtype=util.convert_dtype(dtype=env_dtype, to='np')))
        for next_s in next_states[-self.n_step_adjustment:]
    ]

    if self.container_actions:
        for name in self.action_space.keys():
            actions[name] = np.array(actions[name])
    else:
        actions = np.array(actions)

    return dict(
        states=compressed_states,
        actions=actions,
        rewards=np.array(rewards),
        terminals=np.array(terminals),
        next_states=compressed_next_states,
        importance_weights=np.array(weights)
    ), len(rewards)
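# A minimal sketch of the next-state alignment performed above: with n-step
# returns, the next state of record i is the state of record i + n, so the
# already-compressed states can be reused and only the trailing n next_states
# need separate compression. Plain lists, purely illustrative:
n_step = 3
states = ['s0', 's1', 's2', 's3', 's4']
next_states = ['s3', 's4', 's5', 's6', 's7']  # states shifted by n_step
aligned = states[n_step:] + next_states[-n_step:]
assert aligned == next_states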
def test_rlgraph_combined_ops(self):
    """
    Tests a combined workflow of insert, sample, update on the prioritized replay memory.
    """
    memory = ApexMemory(
        capacity=self.capacity,
        alpha=1.0
    )
    chunksize = 32
    chunks = int(self.inserts / chunksize)
    records = [self.record_space.sample(size=chunksize) for _ in range_(chunks)]
    loss_values = [np.random.random(size=self.sample_batch_size) for _ in range_(chunks)]
    start = time.monotonic()
    for chunk, loss in zip(records, loss_values):
        # Each record now is a chunk.
        for i in range_(chunksize):
            memory.insert_records((
                ray_compress(chunk['states'][i]),
                chunk['actions'][i],
                chunk['reward'][i],
                chunk['terminals'][i],
                None
            ))
        batch, indices, weights = memory.get_records(self.sample_batch_size)
        memory.update_records(indices, loss)
    end = time.monotonic() - start
    tp = len(records) / end
    print('RLGraph: Testing combined op performance:')
    print('Ran {} combined ops, throughput: {} combined ops/s, total time: {} s'.format(
        len(records), tp, end
    ))
def test_rlgraph_sampling(self):
    """
    Tests RLgraph's sampling performance.
    """
    memory = ApexMemory(
        capacity=self.capacity,
        alpha=1.0
    )
    records = [self.record_space.sample(size=1) for _ in range_(self.inserts)]
    for record in records:
        memory.insert_records((
            ray_compress(record['states']),
            record['actions'],
            record['reward'],
            record['terminals'],
            None
        ))
    start = time.monotonic()
    for _ in range_(self.samples):
        batch_tuple = memory.get_records(self.sample_batch_size)
    end = time.monotonic() - start
    tp = self.samples / end
    print('#### Testing RLGraph Prioritized Replay memory ####')
    print('Testing sampling performance:')
    print('Sampled {} batches, throughput: {} batches/s, total time: {} s'.format(
        self.samples, tp, end
    ))
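# The benchmarks above assume a test fixture providing capacity, insert/sample
# counts and a record space. A hypothetical setup sketch; all names, sizes and
# space shapes below are illustrative assumptions, not the actual fixture:
from rlgraph.spaces import BoolBox, Dict, FloatBox, IntBox

class MemoryPerformanceFixture(object):
    capacity = 100000
    inserts = 100000
    samples = 10000
    sample_batch_size = 64
    record_space = Dict(
        states=FloatBox(shape=(4,)),
        actions=IntBox(2),
        reward=float,
        terminals=BoolBox(),
        add_batch_rank=True
    )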
def _process_policy_trajectories(self, states, actions, rewards, terminals, sequence_indices):
    """
    Post-processes policy trajectories.
    """
    if self.worker_executes_postprocessing:
        rewards = self.agent.post_process(
            dict(states=states, rewards=rewards, terminals=terminals,
                 sequence_indices=sequence_indices)
        )

    if self.compress:
        env_dtype = self.vector_env.state_space.dtype
        states = [
            ray_compress(np.asarray(state, dtype=util.convert_dtype(dtype=env_dtype, to='np')))
            for state in states
        ]

    return dict(states=states, actions=actions, rewards=rewards, terminals=terminals), len(rewards)
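# ray_compress is used throughout to shrink observations before storage or
# transfer. A minimal sketch of the idea only, assuming lz4 compression of the
# raw array bytes plus base64 encoding; this is an assumption about the
# helper's behavior, not its actual implementation:
import base64
import lz4.frame
import numpy as np

def compress_array(arr):
    # Compress the array's raw bytes and encode them for safe transport.
    return base64.b64encode(lz4.frame.compress(np.asarray(arr).tobytes()))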
def test_ray_combined_ops(self):
    """
    Tests a combined workflow of insert, sample, update on the prioritized replay memory.
    """
    assert get_distributed_backend() == "ray"
    memory = PrioritizedReplayBuffer(
        size=self.capacity,
        alpha=1.0,
        clip_rewards=True
    )
    chunksize = 32

    # Test chunked inserts -> done via external for-loop in Ray.
    chunks = int(self.inserts / chunksize)
    records = [self.record_space.sample(size=chunksize) for _ in range_(chunks)]
    loss_values = [np.random.random(size=self.sample_batch_size) for _ in range_(chunks)]
    start = time.monotonic()
    for chunk, loss in zip(records, loss_values):
        # Insert.
        for i in range_(chunksize):
            memory.add(
                obs_t=ray_compress(chunk['states'][i]),
                action=chunk['actions'][i],
                reward=chunk['reward'][i],
                obs_tp1=ray_compress(chunk['states'][i]),
                done=chunk['terminals'][i],
                weight=None
            )
        # Sample.
        batch_tuple = memory.sample(self.sample_batch_size, beta=1.0)
        indices = batch_tuple[-1]
        # Update.
        memory.update_priorities(indices, loss)
    end = time.monotonic() - start
    tp = len(records) / end
    print('Ray: testing combined insert/sample/update performance:')
    print('Ran {} combined ops, throughput: {} combined ops/s, total time: {} s'.format(
        len(records), tp, end
    ))
def _batch_process_sample(self, states, actions, rewards, next_states, terminals):
    """
    Batch post-processes a sample, e.g. by computing priority weights and
    compressing states.

    Args:
        states (list): List of states.
        actions (list): List of actions.
        rewards (list): List of rewards.
        next_states (list): List of next states.
        terminals (list): List of terminals.

    Returns:
        dict: Sample batch dict.
        int: Number of records in the batch.
    """
    weights = np.ones_like(rewards)

    # Compute loss-per-item.
    if self.worker_computes_weights:
        # Next states were just collected, we batch-process them here.
        # TODO: make generic agent method?
        _, loss_per_item = self.agent.get_td_loss(
            dict(states=states, actions=actions, rewards=rewards,
                 terminals=terminals, next_states=next_states,
                 importance_weights=weights)
        )
        weights = np.abs(loss_per_item) + SMALL_NUMBER

    compressed_states = [ray_compress(state) for state in states]
    # Reuse compressed states shifted by the n-step adjustment; only the last
    # n next_states need to be compressed separately.
    compressed_next_states = compressed_states[self.n_step_adjustment:] + [
        ray_compress(next_s) for next_s in next_states[-self.n_step_adjustment:]
    ]

    return dict(
        states=np.array(compressed_states),
        actions=np.array(actions),
        rewards=np.array(rewards),
        terminals=np.array(terminals),
        next_states=np.array(compressed_next_states),
        importance_weights=np.array(weights)
    ), len(rewards)
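# The weight computation above turns TD losses into sampling priorities:
# weight_i = |loss_i| + epsilon, where the epsilon keeps zero-loss transitions
# sampleable. A minimal sketch; the 1e-6 epsilon is an assumed value, the
# library defines its own SMALL_NUMBER constant:
import numpy as np

def loss_to_priority(loss_per_item, epsilon=1e-6):
    # Absolute TD error plus epsilon so every transition retains a nonzero
    # sampling priority.
    return np.abs(np.asarray(loss_per_item)) + epsilon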
def _process_policy_trajectories(self, states, actions, rewards, terminals, sequence_indices):
    """
    Post-processes policy trajectories.
    """
    if self.generalized_advantage_estimation:
        rewards = self.agent.post_process(
            dict(
                states=states,
                rewards=rewards,
                terminals=terminals,
                sequence_indices=sequence_indices
            )
        )

    if self.compress:
        states = [ray_compress(state) for state in states]

    return dict(
        states=states,
        actions=actions,
        rewards=rewards,
        terminals=terminals
    ), len(rewards)
def test_update_records(self):
    """
    Tests update-records logic.
    """
    memory = MemPrioritizedReplay(capacity=self.capacity, next_states=True)
    memory.create_variables(self.input_spaces)

    # Insert a few elements.
    observation = memory.record_space_flat.sample(size=2)
    memory.insert_records(observation)

    # Fetch elements and their indices.
    num_records = 2
    batch = memory.get_records(num_records)
    indices = batch[1]
    self.assertEqual(num_records, len(indices))

    # Does not return anything.
    memory.update_records(indices, np.asarray([0.1, 0.2]))

    # Test apex memory.
    memory = ApexMemory(capacity=self.capacity, alpha=self.alpha, beta=self.beta)
    observation = self.apex_space.sample(size=5)
    for i in range_(5):
        memory.insert_records((
            ray_compress(observation["states"][i]),
            observation["actions"][i],
            observation["reward"][i],
            observation["terminals"][i],
            observation["weights"][i]
        ))

    # Fetch elements and their indices.
    num_records = 5
    batch = memory.get_records(num_records)
    indices = batch[1]
    self.assertEqual(num_records, len(indices))

    # Does not return anything; one loss value per sampled index.
    memory.update_records(indices, np.random.uniform(size=num_records))
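# For reference, the math that alpha and beta parameterize in these memories:
# records are sampled with probability P(i) = p_i**alpha / sum_j p_j**alpha
# and corrected with importance weights w_i = (N * P(i))**-beta, normalized by
# their maximum. A minimal numpy sketch of that scheme, not the memory's
# actual sum-tree implementation:
import numpy as np

def sample_prioritized(priorities, batch_size, alpha=1.0, beta=1.0):
    # Convert raw priorities into a sampling distribution.
    probs = priorities ** alpha
    probs = probs / probs.sum()
    indices = np.random.choice(len(priorities), size=batch_size, p=probs)
    # Importance-sampling weights correct for the non-uniform sampling.
    weights = (len(priorities) * probs[indices]) ** (-beta)
    return indices, weights / weights.max()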