Example #1
    def test_lockstep_mode(self):
        """Test the lockstep mode by only adding SampleBatches.

        Such SampleBatches are converted to MultiAgentBatches as if there
        were only one policy."""
        self.batch_id = 0
        batch_size = 5
        buffer_size = 30

        buffer = MultiAgentReplayBuffer(
            capacity=buffer_size,
            replay_mode="lockstep",
            learning_starts=0,
            num_shards=1,
        )

        # Test add/sample
        self._add_sample_batch_to_buffer(buffer, batch_size=batch_size, num_batches=1)

        # Sampling from it now should yield the first batch
        assert get_batch_id(buffer.sample(1)) == 0

        self._add_sample_batch_to_buffer(buffer, batch_size=batch_size, num_batches=2)

        # Sampling from it now should yield each of the three batches
        # 1/3 of the time
        num_sampled_dict = {_id: 0 for _id in range(self.batch_id)}
        num_samples = 200
        for i in range(num_samples):
            _id = get_batch_id(buffer.sample(1))
            num_sampled_dict[_id] += 1
        assert np.allclose(
            np.array(list(num_sampled_dict.values())) / num_samples,
            len(num_sampled_dict) * [1 / 3],
            atol=0.1,
        )
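The test above relies on two helpers that are not part of this listing: get_batch_id and _add_sample_batch_to_buffer. Below is a hypothetical reconstruction, inferred only from how they are called here; the real test file defines its own versions, so names and batch contents are assumptions.

import numpy as np
from ray.rllib.policy.sample_batch import SampleBatch


def get_batch_id(ma_batch, policy_index=0):
    # Read the "batch_id" column that the add-helpers write into every
    # SampleBatch; policy_index picks which policy batch to look at.
    policy_id = list(ma_batch.policy_batches.keys())[policy_index]
    return ma_batch.policy_batches[policy_id]["batch_id"][0]


    # (Method of the test case.)
    def _add_sample_batch_to_buffer(self, buffer, batch_size, num_batches=1):
        # Add plain SampleBatches, each tagged with a running batch_id; in
        # lockstep mode they are treated as belonging to a single policy.
        for _ in range(num_batches):
            buffer.add(
                SampleBatch(
                    {
                        "obs": np.random.random((batch_size, 4)),
                        "batch_id": np.full(batch_size, self.batch_id),
                    }
                )
            )
            self.batch_id += 1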
Example #2
    def test_independent_with_underlying_prioritized_replay_buffer(self):
        """Test this the buffer with different underlying buffers.

        Test if we can initialize a more complex underlying buffer with
        additional arguments and independent sampling.
        This does not test updating priorities and using weights as
        implemented in MultiAgentPrioritizedReplayBuffer.
        """
        # Test with PrioritizedReplayBuffer, args for c'tor, add and sample
        prioritized_replay_buffer_config = {
            "type": PrioritizedReplayBuffer,
            "alpha": 0.6,
            "beta": 0.4,
        }

        num_policies = 2
        buffer_size = 15
        num_batches = 1

        buffer = MultiAgentReplayBuffer(
            capacity=buffer_size,
            replay_mode="independent",
            learning_starts=0,
            num_shards=1,
            underlying_buffer_config=prioritized_replay_buffer_config,
        )

        self._add_multi_agent_batch_to_buffer(
            buffer, num_policies=num_policies, num_batches=num_batches
        )

        # Only test if we can sample from multiple policies
        sample = buffer.sample(2)
        assert len(sample) == 4
        assert len(sample.policy_batches) == 2
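The _add_multi_agent_batch_to_buffer helper used here and in the remaining examples is also not shown. A rough, hypothetical sketch of what it presumably does, based on how the tests use it (one small SampleBatch per policy, tagged with its policy_id and a running batch_id, optionally stored as a single length-two sequence), assuming the same imports as the sketch above plus MultiAgentBatch:

    # Assumes: from ray.rllib.policy.sample_batch import MultiAgentBatch
    def _add_multi_agent_batch_to_buffer(
        self, buffer, num_policies, num_batches, seq_lens=False
    ):
        # One MultiAgentBatch per iteration, holding one two-step SampleBatch
        # per policy; every SampleBatch gets its own running batch_id.
        for _ in range(num_batches):
            policy_batches = {}
            for policy_id in range(num_policies):
                batch = SampleBatch(
                    {
                        "obs": np.random.random((2, 4)),
                        "policy_id": 2 * [policy_id],
                        "batch_id": 2 * [self.batch_id],
                    }
                )
                if seq_lens:
                    # Mark the two timesteps as one sequence of length two.
                    batch[SampleBatch.SEQ_LENS] = np.array([2])
                policy_batches[policy_id] = batch
                self.batch_id += 1
            buffer.add(MultiAgentBatch(policy_batches, 2))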
Example #3
    def test_independent_mode_sequences_storage_unit(self):
        """Test the independent mode with sequences as a storage unit.

        Such SampleBatches are converted to MultiAgentBatches as if there
        was only one policy."""
        buffer_size = 15
        self.batch_id = 0

        buffer = MultiAgentReplayBuffer(
            capacity=buffer_size,
            replay_mode="independent",
            storage_unit="sequences",
            replay_sequence_length=2,
            learning_starts=0,
            num_shards=1,
        )

        # Test add/sample
        self._add_multi_agent_batch_to_buffer(buffer,
                                              num_policies=2,
                                              num_batches=1,
                                              seq_lens=True)

        # Sampling from it now should yield the first batch
        assert get_batch_id(buffer.sample(1), 0) == 0

        self._add_multi_agent_batch_to_buffer(buffer,
                                              num_policies=2,
                                              num_batches=2,
                                              seq_lens=True)

        # Sampling from it now should yield each batch that went into a
        # multiagent batch 1/6th of the time
        num_sampled_dict = {_id: 0 for _id in range(self.batch_id)}
        num_samples = 200
        for i in range(num_samples):
            sample = buffer.sample(1)
            # Count the batch id of one of the two policy batches
            _id = get_batch_id(sample, np.random.choice([0, 1]))
            num_sampled_dict[_id] += 1
            # See if a random batch has the desired sequence length of two
            assert len(sample.policy_batches[np.random.choice([0, 1])]) == 2
        assert np.allclose(
            np.array(list(num_sampled_dict.values())) / num_samples,
            len(num_sampled_dict) * [1 / 6],
            atol=0.1,
        )
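For context on the sequences storage unit: a minimal sketch of a batch carrying explicit sequence information. With storage_unit="sequences" the buffer is expected to store each sequence, rather than each timestep, as one item; the values below are made up for illustration.

import numpy as np
from ray.rllib.policy.sample_batch import SampleBatch

# Four timesteps grouped into two sequences of length two.
seq_batch = SampleBatch(
    {
        "obs": np.random.random((4, 3)),
        SampleBatch.SEQ_LENS: np.array([2, 2]),
    }
)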
Example #4
    def test_independent_mode_multiple_policies(self):
        """Test the lockstep mode by adding batches from multiple policies."""

        num_batches = 3
        buffer_size = 15
        num_policies = 2
        # Test independent mode with different policy ids using MultiAgentBatches

        self.batch_id = 0

        buffer = MultiAgentReplayBuffer(
            capacity=buffer_size,
            replay_mode="independent",
            learning_starts=0,
            num_shards=1,
        )

        self._add_multi_agent_batch_to_buffer(
            buffer, num_policies=num_policies, num_batches=num_batches
        )

        # Sample 4 SampleBatches from only one policy
        for _id in range(num_policies):
            for __id in buffer.sample(4, policy_id=_id).policy_batches[_id][
                "policy_id"
            ]:
                assert __id == _id

        # Sampling without specifying a policy should yield the same number
        # of batches from each policy
        num_sampled_dict = {_id: 0 for _id in range(num_policies)}
        num_samples = 200
        for i in range(num_samples):
            num_items = np.random.randint(0, 5)
            for _id, batch in buffer.sample(num_items=num_items).policy_batches.items():
                num_sampled_dict[_id] += 1
                assert len(batch) == num_items
        assert np.allclose(
            np.array(list(num_sampled_dict.values())),
            len(num_sampled_dict) * [200],
            atol=0.1,
        )
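Outside of these tests, policy ids are usually strings rather than integers. A short usage sketch of per-policy sampling with string ids, assuming the ray.rllib.utils.replay_buffers API these tests appear to use; buffer settings and batch contents are made up.

import numpy as np
from ray.rllib.policy.sample_batch import MultiAgentBatch, SampleBatch
from ray.rllib.utils.replay_buffers.multi_agent_replay_buffer import (
    MultiAgentReplayBuffer,
)

buffer = MultiAgentReplayBuffer(
    capacity=10, replay_mode="independent", learning_starts=0
)
buffer.add(
    MultiAgentBatch(
        {
            "policy_0": SampleBatch({"obs": np.random.random((3, 4))}),
            "policy_1": SampleBatch({"obs": np.random.random((3, 4))}),
        },
        3,
    )
)

# Sample two items for a single policy only ...
only_p0 = buffer.sample(2, policy_id="policy_0")
# ... or two items per policy at once.
both = buffer.sample(2)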
Example #5
    def test_lockstep_with_underlying_replay_buffer(self):
        """Test this the buffer with different underlying buffers.

        Test if we can initialize a simple underlying buffer without
        additional arguments and lockstep sampling.
        """
        # Test with ReplayBuffer, no args for c'tor, add and sample
        replay_buffer_config = {"type": ReplayBuffer}

        num_policies = 2
        buffer_size = 200
        num_batches = 20

        buffer = MultiAgentReplayBuffer(
            capacity=buffer_size,
            replay_mode="lockstep",
            learning_starts=0,
            num_shards=1,
            underlying_buffer_config=replay_buffer_config,
        )

        self._add_multi_agent_batch_to_buffer(
            buffer, num_policies=num_policies - 1, num_batches=num_batches
        )

        # Only test if we can sample and if samples belong to a single policy
        sample = buffer.sample(2)
        assert len(sample) == 2
        assert len(sample.policy_batches) == 1

        self._add_multi_agent_batch_to_buffer(
            buffer, num_policies=num_policies, num_batches=num_batches
        )

        # Only test that we can sample from multiple policies; out of 100
        # samples, some should come from each policy
        sample = buffer.sample(100)
        assert len(sample) == 100
        assert len(sample.policy_batches) == 2
Example #6
    def test_store_to_replay_local(self):
        buf = MultiAgentReplayBuffer(
            num_shards=1,
            learning_starts=200,
            capacity=1000,
            prioritized_replay_alpha=0.6,
            prioritized_replay_beta=0.4,
            prioritized_replay_eps=0.0001,
        )
        assert len(buf.sample(100)) == 0

        workers = make_workers(0)
        a = ParallelRollouts(workers, mode="bulk_sync")
        b = a.for_each(StoreToReplayBuffer(local_buffer=buf))

        next(b)
        assert len(buf.sample(100)) == 0  # learning hasn't started yet
        next(b)
        assert buf.sample(100).count == 100

        replay_op = Replay(local_buffer=buf, num_items_to_replay=100)
        assert next(replay_op).count == 100
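The make_workers, ParallelRollouts, and StoreToReplayBuffer helpers above come from RLlib's older execution-plan utilities and are not shown here. The learning_starts behavior they exercise can be sketched by feeding the buffer directly; this is an assumed equivalent, not part of the original test.

import numpy as np
from ray.rllib.policy.sample_batch import SampleBatch
from ray.rllib.utils.replay_buffers.multi_agent_replay_buffer import (
    MultiAgentReplayBuffer,
)

buf = MultiAgentReplayBuffer(num_shards=1, learning_starts=200, capacity=1000)
# Two 100-step batches reach the learning_starts threshold of 200.
buf.add(SampleBatch({"obs": np.random.random((100, 4))}))
buf.add(SampleBatch({"obs": np.random.random((100, 4))}))
# Only now does sampling return full batches (cf. the asserts above).
assert buf.sample(100).count == 100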
Example #7
    def test_policy_id_of_multi_agent_batches_independent(self):
        """Test if indepent sampling yields a MultiAgentBatch with the
        correct policy id."""
        self.batch_id = 0

        # Test independent mode with different policy ids using MultiAgentBatches
        buffer = MultiAgentReplayBuffer(
            capacity=10, replay_mode="independent", learning_starts=0, num_shards=1
        )

        self._add_multi_agent_batch_to_buffer(buffer, num_policies=1, num_batches=1)

        mabatch = buffer.sample(1)
        assert list(mabatch.policy_batches.keys())[0] == 0
Example #8
class MyTrainer(Trainer):
    @classmethod
    @override(Trainer)
    def get_default_config(cls) -> TrainerConfigDict:
        # Run this Trainer with new `training_iteration` API and set some PPO-specific
        # parameters.
        return with_common_config({
            "num_sgd_iter": 10,
            "sgd_minibatch_size": 128,
        })

    @override(Trainer)
    def setup(self, config):
        # Call super's `setup` to create rollout workers.
        super().setup(config)
        # Create local replay buffer.
        self.local_replay_buffer = MultiAgentReplayBuffer(num_shards=1,
                                                          learning_starts=1000,
                                                          capacity=50000)

    @override(Trainer)
    def training_iteration(self) -> ResultDict:
        # Generate common experiences, collect batch for PPO, store every (DQN) batch
        # into replay buffer.
        ppo_batches = []
        num_env_steps = 0
        # PPO batch size fixed at 200.
        while num_env_steps < 200:
            ma_batches = synchronous_parallel_sample(worker_set=self.workers,
                                                     concat=False)
            # Loop through the ma-batches (collected in parallel).
            for ma_batch in ma_batches:
                # Update sampled counters.
                self._counters[NUM_ENV_STEPS_SAMPLED] += ma_batch.count
                self._counters[
                    NUM_AGENT_STEPS_SAMPLED] += ma_batch.agent_steps()
                ppo_batch = ma_batch.policy_batches.pop("ppo_policy")
                # Add collected batches (only for DQN policy) to replay buffer.
                self.local_replay_buffer.add(ma_batch)

                ppo_batches.append(ppo_batch)
                num_env_steps += ppo_batch.count

        # DQN sub-flow.
        dqn_train_results = {}
        dqn_train_batch = self.local_replay_buffer.sample(num_items=64)
        if dqn_train_batch is not None:
            dqn_train_results = train_one_step(self, dqn_train_batch,
                                               ["dqn_policy"])
            self._counters[
                "agent_steps_trained_DQN"] += dqn_train_batch.agent_steps()
            print(
                "DQN policy learning on samples from",
                "agent steps trained",
                dqn_train_batch.agent_steps(),
            )
        # Update DQN's target net every 500 train steps.
        if (self._counters["agent_steps_trained_DQN"] -
                self._counters[LAST_TARGET_UPDATE_TS] >= 500):
            self.workers.local_worker().get_policy(
                "dqn_policy").update_target()
            self._counters[NUM_TARGET_UPDATES] += 1
            self._counters[LAST_TARGET_UPDATE_TS] = self._counters[
                "agent_steps_trained_DQN"]

        # PPO sub-flow.
        ppo_train_batch = SampleBatch.concat_samples(ppo_batches)
        self._counters[
            "agent_steps_trained_PPO"] += ppo_train_batch.agent_steps()
        # Standardize advantages.
        ppo_train_batch[Postprocessing.ADVANTAGES] = standardized(
            ppo_train_batch[Postprocessing.ADVANTAGES])
        print(
            "PPO policy learning on samples from",
            "agent steps trained",
            ppo_train_batch.agent_steps(),
        )
        ppo_train_batch = MultiAgentBatch({"ppo_policy": ppo_train_batch},
                                          ppo_train_batch.count)
        ppo_train_results = train_one_step(self, ppo_train_batch,
                                           ["ppo_policy"])

        # Combine results for PPO and DQN into one results dict.
        results = dict(ppo_train_results, **dqn_train_results)
        return results
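A sketch of how such a two-policy Trainer might be launched. The environment name, policy classes, mapping function, and import paths below are assumptions for illustration (they presume an RLlib version that still ships the Trainer API) and are not part of the example above.

import ray
from ray.rllib.agents.dqn.dqn_torch_policy import DQNTorchPolicy
from ray.rllib.agents.ppo.ppo_torch_policy import PPOTorchPolicy

config = {
    # Hypothetical multi-agent env, registered elsewhere via tune.register_env.
    "env": "multi_agent_cartpole",
    "framework": "torch",
    "multiagent": {
        # Observation/action spaces are None so they get inferred from the env.
        "policies": {
            "ppo_policy": (PPOTorchPolicy, None, None, {}),
            "dqn_policy": (DQNTorchPolicy, None, None, {}),
        },
        # Arbitrary mapping: even agent ids train with PPO, odd ones with DQN.
        "policy_mapping_fn": lambda agent_id, *args, **kwargs: (
            "ppo_policy" if agent_id % 2 == 0 else "dqn_policy"
        ),
    },
}

ray.init()
trainer = MyTrainer(config=config)
print(trainer.train())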