Example 1
    def test_ray_sampling(self):
        """
        Tests the sampling performance of Ray's prioritized replay memory.
        """
        assert get_distributed_backend() == "ray"
        memory = PrioritizedReplayBuffer(
            size=self.capacity,
            alpha=1.0,
            clip_rewards=True
        )
        records = [self.record_space.sample(size=1) for _ in range_(self.inserts)]
        for record in records:
            memory.add(
                obs_t=ray_compress(record['states']),
                action=record['actions'],
                reward=record['reward'],
                obs_tp1=ray_compress(record['states']),
                done=record['terminals'],
                weight=None
            )
        start = time.monotonic()
        for _ in range_(self.samples):
            batch_tuple = memory.sample(self.sample_batch_size, beta=1.0)
        end = time.monotonic() - start
        tp = self.samples / end
        print('#### Testing Ray Prioritized Replay memory ####')
        print('Testing sampling performance:')
        print('Sampled {} batches, throughput: {} batches/s, total time: {} s'.format(
            self.samples, tp, end
        ))
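As an aside, the sampled batch_tuple above is simply discarded while timing; when it is needed, it can be unpacked. The seven-field layout sketched below is an assumption modelled on the baselines-style buffer this API mirrors; only the position of the trailing indices is confirmed by Example 6, which takes batch_tuple[-1].

# Hedged sketch, continuing from the `memory` built above. The 7-tuple layout
# is an assumption (baselines-style buffer); only `indices` being last is
# confirmed elsewhere in these tests via batch_tuple[-1].
batch_tuple = memory.sample(32, beta=1.0)  # 32 used as an example batch size
obs_t, actions, rewards, obs_tp1, dones, weights, indices = batch_tuple
# `indices` can then be fed back via memory.update_priorities(indices, new_priorities).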
Example 2
    def _batch_process_sample(self, states, actions, rewards, next_states,
                              terminals):
        """
        Batch post-processes a sample, e.g. by computing priority weights and compressing states.

        Args:
            states (list): List of states.
            actions (list, dict): List of actions, or dict of lists for container actions.
            rewards (list): List of rewards.
            next_states (list): List of next states.
            terminals (list): List of terminals.

        Returns:
            dict: Sample batch dict.
        """
        weights = np.ones_like(rewards)

        # Compute loss-per-item.
        if self.worker_executes_postprocessing:
            # Next states were just collected; we batch-process them here.
            _, loss_per_item = self.agent.post_process(
                dict(states=states,
                     actions=actions,
                     rewards=rewards,
                     terminals=terminals,
                     next_states=next_states,
                     importance_weights=weights))
            weights = np.abs(loss_per_item) + SMALL_NUMBER
        env_dtype = self.vector_env.state_space.dtype
        compressed_states = [
            ray_compress(
                np.asarray(state,
                           dtype=util.convert_dtype(dtype=env_dtype, to='np')))
            for state in states
        ]

        compressed_next_states = compressed_states[self.n_step_adjustment:] + [
            ray_compress(np.asarray(next_s, dtype=util.convert_dtype(dtype=env_dtype, to='np')))
            for next_s in next_states[-self.n_step_adjustment:]
        ]
        if self.container_actions:
            for name in self.action_space.keys():
                actions[name] = np.array(actions[name])
        else:
            actions = np.array(actions)
        return dict(states=compressed_states,
                    actions=actions,
                    rewards=np.array(rewards),
                    terminals=np.array(terminals),
                    next_states=compressed_next_states,
                    importance_weights=np.array(weights)), len(rewards)
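The next_states reconstruction above is the subtle step: because next_states is just states shifted forward by n_step_adjustment positions, the first len(states) - n_step_adjustment compressed next states can be reused from the already-compressed states, and only the trailing n_step_adjustment entries need to be compressed fresh. A minimal, self-contained sketch of that slicing (plain strings standing in for compressed arrays, n_step_adjustment assumed to be 3):

# Toy illustration of the next-state alignment used above (no rlgraph needed).
n_step_adjustment = 3
states = ['s0', 's1', 's2', 's3', 's4']
next_states = ['s3', 's4', 's5', 's6', 's7']   # states shifted by n_step_adjustment

compress = lambda s: 'c(' + s + ')'            # stand-in for ray_compress
compressed_states = [compress(s) for s in states]
compressed_next_states = compressed_states[n_step_adjustment:] + \
    [compress(s) for s in next_states[-n_step_adjustment:]]

# Identical to compressing every next state directly, but avoids recompressing
# the (len(states) - n_step_adjustment) overlapping entries.
assert compressed_next_states == [compress(s) for s in next_states]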
Example 3
    def test_rlgraph_combined_ops(self):
        """
        Tests a combined workflow of insert, sample, update on the prioritized replay memory.
        """
        memory = ApexMemory(
            capacity=self.capacity,
            alpha=1.0
        )

        chunksize = 32
        chunks = int(self.inserts / chunksize)
        records = [self.record_space.sample(size=chunksize) for _ in range_(chunks)]
        loss_values = [np.random.random(size=self.sample_batch_size) for _ in range_(chunks)]

        start = time.monotonic()
        for chunk, loss_value in zip(records, loss_values):
            # Each record now is a chunk.
            for i in range_(chunksize):
                memory.insert_records((
                    ray_compress(chunk['states'][i]),
                    chunk['actions'][i],
                    chunk['reward'][i],
                    chunk['terminals'][i],
                    None
                ))
            batch, indices, weights = memory.get_records(self.sample_batch_size)
            memory.update_records(indices, loss_value)

        end = time.monotonic() - start
        tp = len(records) / end
        print('RLGraph: Testing combined op performance:')
        print('Ran {} combined ops, throughput: {} combined ops/s, total time: {} s'.format(
            len(records), tp, end
        ))
Example 4
    def test_rlgraph_sampling(self):
        """
        Tests RLgraph's sampling performance.
        """
        memory = ApexMemory(
            capacity=self.capacity,
            alpha=1.0
        )

        records = [self.record_space.sample(size=1) for _ in range_(self.inserts)]
        for record in records:
            memory.insert_records((
            ray_compress(record['states']),
            record['actions'],
            record['reward'],
            record['terminals'],
            None
            ))
        start = time.monotonic()
        for _ in range_(self.samples):
            batch_tuple = memory.get_records(self.sample_batch_size)
        end = time.monotonic() - start
        tp = self.samples / end
        print('#### Testing RLGraph Prioritized Replay memory ####')
        print('Testing sampling performance:')
        print('Sampled {} batches, throughput: {} batches/s, total time: {} s'.format(
            self.samples, tp, end
        ))
Example 5
    def _process_policy_trajectories(self, states, actions, rewards, terminals,
                                     sequence_indices):
        """
        Post-processes policy trajectories.
        """
        if self.worker_executes_postprocessing:
            rewards = self.agent.post_process(
                dict(states=states,
                     rewards=rewards,
                     terminals=terminals,
                     sequence_indices=sequence_indices))

        if self.compress:
            env_dtype = self.vector_env.state_space.dtype
            states = [
                ray_compress(
                    np.asarray(state,
                               dtype=util.convert_dtype(dtype=env_dtype,
                                                        to='np')))
                for state in states
            ]
        return dict(states=states,
                    actions=actions,
                    rewards=rewards,
                    terminals=terminals), len(rewards)
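Note the dtype cast before compression: states coming back from a vector env may arrive as Python lists or default float64 arrays, so they are first cast to the env's declared state dtype, likely to keep the compressed payload small and consistent. A self-contained sketch of that cast, using plain numpy in place of rlgraph's util.convert_dtype helper (float32 is an assumed example dtype):

import numpy as np

# Stand-in for util.convert_dtype(dtype=env_dtype, to='np'): resolve the env's
# declared dtype to a numpy dtype before compressing.
env_np_dtype = np.float32                      # assumed example dtype

state = [0.1, 0.2, 0.3, 0.4]                   # raw state as returned by the env
arr = np.asarray(state, dtype=env_np_dtype)

assert arr.dtype == np.float32                 # half the bytes of the float64 default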
Example 6
    def test_ray_combined_ops(self):
        """
        Tests a combined workflow of insert, sample, update on the prioritized replay memory.
        """
        assert get_distributed_backend() == "ray"
        memory = PrioritizedReplayBuffer(
            size=self.capacity,
            alpha=1.0,
            clip_rewards=True
        )
        chunksize = 32

        # Test chunked inserts -> done via external for loop in Ray.
        chunks = int(self.inserts / chunksize)
        records = [self.record_space.sample(size=chunksize) for _ in range_(chunks)]
        loss_values = [np.random.random(size=self.sample_batch_size) for _ in range_(chunks)]
        start = time.monotonic()

        for chunk, loss_value in zip(records, loss_values):
            # Insert.
            for i in range_(chunksize):
                memory.add(
                    obs_t=ray_compress(chunk['states'][i]),
                    action=chunk['actions'][i],
                    reward=chunk['reward'][i],
                    obs_tp1=ray_compress(chunk['states'][i]),
                    done=chunk['terminals'][i],
                    weight=None
                )
            # Sample.
            batch_tuple = memory.sample(self.sample_batch_size, beta=1.0)
            indices = batch_tuple[-1]
            # Update priorities with the new per-item loss values.
            memory.update_priorities(indices, loss_value)

        end = time.monotonic() - start
        tp = len(records) / end
        print('Ray: testing combined insert/sample/update performance:')
        print('Ran {} combined ops, throughput: {} combined ops/s, total time: {} s'.format(
            len(records), tp, end
        ))
Example 7
    def _batch_process_sample(self, states, actions, rewards, next_states,
                              terminals):
        """
        Batch post-processes a sample, e.g. by computing priority weights and compressing states.

        Args:
            states (list): List of states.
            actions (list): List of actions.
            rewards (list): List of rewards.
            next_states (list): List of next states.
            terminals (list): List of terminals.

        Returns:
            dict: Sample batch dict.
        """
        weights = np.ones_like(rewards)

        # Compute loss-per-item.
        if self.worker_computes_weights:
            # Next states were just collected; we batch-process them here.
            # TODO make generic agent method?
            _, loss_per_item = self.agent.get_td_loss(
                dict(states=states,
                     actions=actions,
                     rewards=rewards,
                     terminals=terminals,
                     next_states=next_states,
                     importance_weights=weights))
            weights = np.abs(loss_per_item) + SMALL_NUMBER

        compressed_states = [ray_compress(state) for state in states]
        compressed_next_states = compressed_states[self.n_step_adjustment:] + [
            ray_compress(next_s)
            for next_s in next_states[-self.n_step_adjustment:]
        ]
        return dict(states=np.array(compressed_states),
                    actions=np.array(actions),
                    rewards=np.array(rewards),
                    terminals=np.array(terminals),
                    next_states=np.array(compressed_next_states),
                    importance_weights=np.array(weights)), len(rewards)
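The priority weights computed above are simply the absolute per-item TD loss plus a small epsilon, so no transition ends up with a zero sampling priority. A quick, self-contained illustration (the concrete value of SMALL_NUMBER is an assumption; rlgraph defines it as a tiny positive constant):

import numpy as np

SMALL_NUMBER = 1e-6                            # assumed stand-in for rlgraph's constant

loss_per_item = np.array([0.5, -0.2, 0.0, 1.3])
weights = np.abs(loss_per_item) + SMALL_NUMBER

# Every weight stays strictly positive, so even zero-loss transitions retain a
# non-zero chance of being sampled again.
assert (weights > 0.0).all()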
Example 8
    def _process_policy_trajectories(self, states, actions, rewards, terminals, sequence_indices):
        """
        Post-processes policy trajectories.
        """
        if self.generalized_advantage_estimation:
            rewards = self.agent.post_process(
                dict(
                    states=states,
                    rewards=rewards,
                    terminals=terminals,
                    sequence_indices=sequence_indices
                )
            )

        if self.compress:
            states = [ray_compress(state) for state in states]
        return dict(
            states=states,
            actions=actions,
            rewards=rewards,
            terminals=terminals
        ), len(rewards)
Example 9
    def test_update_records(self):
        """
        Tests update records logic.
        """
        memory = MemPrioritizedReplay(capacity=self.capacity, next_states=True)
        memory.create_variables(self.input_spaces)

        # Insert a few elements.
        observation = memory.record_space_flat.sample(size=2)
        memory.insert_records(observation)

        # Fetch elements and their indices.
        num_records = 2
        batch = memory.get_records(num_records)
        indices = batch[1]
        self.assertEqual(num_records, len(indices))

        # Does not return anything.
        memory.update_records(indices, np.asarray([0.1, 0.2]))

        # Test apex memory.
        memory = ApexMemory(capacity=self.capacity,
                            alpha=self.alpha,
                            beta=self.beta)
        observation = self.apex_space.sample(size=5)
        for i in range_(5):
            memory.insert_records(
                (ray_compress(observation["states"][i]),
                 observation["actions"][i], observation["reward"][i],
                 observation["terminals"][i], observation["weights"][i]))

        # Fetch elements and their indices.
        num_records = 5
        batch = memory.get_records(num_records)
        indices = batch[1]
        self.assertEqual(num_records, len(indices))

        # Does not return anything.
        memory.update_records(indices, np.random.uniform(size=num_records))
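Putting the pieces from these tests together, here is a hedged sketch of one full prioritized-replay cycle with ApexMemory, using only the calls that appear above (insert_records with a (state, action, reward, terminal, weight) tuple, get_records returning batch/indices/weights, and update_records with fresh loss values); the record contents themselves are invented for illustration:

import numpy as np

# Sketch only: ApexMemory, ray_compress and the record tuple layout are taken
# from the examples above; capacity, shapes and values below are made up.
memory = ApexMemory(capacity=1000, alpha=1.0, beta=1.0)

for _ in range(64):
    state = np.random.rand(4).astype(np.float32)
    memory.insert_records((
        ray_compress(state),    # compressed observation
        0,                      # action
        1.0,                    # reward
        False,                  # terminal
        None                    # weight (left as None, as in the tests above)
    ))

# Sample a prioritized batch, then feed new per-item losses back as priorities.
batch, indices, weights = memory.get_records(32)
new_loss = np.random.random(size=32)
memory.update_records(indices, new_loss)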