Example #1
    def test_lockstep_mode(self):
        """Test the lockstep mode by only adding SampleBatches.

        Such SampleBatches are converted to MultiAgentBatches as if there
        was only one policy."""
        self.batch_id = 0
        batch_size = 5
        buffer_size = 30

        buffer = MultiAgentReplayBuffer(
            capacity=buffer_size,
            replay_mode="lockstep",
            learning_starts=0,
            num_shards=1,
        )

        # Test add/sample
        self._add_sample_batch_to_buffer(buffer, batch_size=batch_size, num_batches=1)

        # Sampling from it now should yield the first batch
        assert get_batch_id(buffer.sample(1)) == 0

        self._add_sample_batch_to_buffer(buffer, batch_size=batch_size, num_batches=2)

        # Sampling from it now should yield our first batch 1/3 of the time
        num_sampled_dict = {_id: 0 for _id in range(self.batch_id)}
        num_samples = 200
        for i in range(num_samples):
            _id = get_batch_id(buffer.sample(1))
            num_sampled_dict[_id] += 1
        assert np.allclose(
            np.array(list(num_sampled_dict.values())) / num_samples,
            len(num_sampled_dict) * [1 / 3],
            atol=0.1,
        )
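
These tests rely on two helpers that are not shown in the snippets: `_add_sample_batch_to_buffer` and `get_batch_id`. A minimal sketch of what they might look like, assuming each added SampleBatch carries a "batch_id" column identifying the batch it came from (the column name and the policy-id key used for lockstep samples are assumptions, not the original test code):

# Sketch (not the original helpers). The "batch_id" column and the
# policy-id key under which lockstep samples appear are assumptions.
import numpy as np
from ray.rllib.policy.sample_batch import SampleBatch, DEFAULT_POLICY_ID

def _add_sample_batch_to_buffer(self, buffer, batch_size, num_batches=1):
    # Method of the test case; `self.batch_id` is the running batch counter.
    for _ in range(num_batches):
        batch = SampleBatch({
            SampleBatch.T: list(range(batch_size)),
            SampleBatch.ACTIONS: batch_size * [np.random.choice([0, 1])],
            SampleBatch.REWARDS: batch_size * [np.random.rand()],
            SampleBatch.DONES: batch_size * [np.random.choice([False, True])],
            "batch_id": batch_size * [self.batch_id],
        })
        buffer.add(batch)
        self.batch_id += 1

def get_batch_id(sampled, policy_id=DEFAULT_POLICY_ID):
    # `sampled` is the MultiAgentBatch returned by `buffer.sample()`.
    return sampled.policy_batches[policy_id]["batch_id"][0]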
Example #2
 def setup(self, config):
     # Call super's `setup` to create rollout workers.
     super().setup(config)
     # Create local replay buffer.
     self.local_replay_buffer = MultiAgentReplayBuffer(num_shards=1,
                                                       learning_starts=1000,
                                                       capacity=50000)
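
For context (not part of the original snippet): the buffer created in `setup()` is typically filled and sampled later from `training_iteration()`, as in Example #14 below. A minimal usage sketch, assuming `ma_batch` is a MultiAgentBatch collected from the rollout workers:

 # Usage sketch (assumes `ma_batch` was collected from the rollout workers,
 # e.g. via synchronous_parallel_sample(worker_set=self.workers)).
 self.local_replay_buffer.add(ma_batch)
 # Until `learning_starts` (here 1000) timesteps have been added, sampling
 # yields no data; afterwards it returns a train batch.
 train_batch = self.local_replay_buffer.sample(num_items=64)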
Example #3
    def test_independent_with_underlying_prioritized_replay_buffer(self):
        """Test this the buffer with different underlying buffers.

        Test if we can initialize a more complex underlying buffer with
        additional arguments and independent sampling.
        This does not test updating priorities and using weights as
        implemented in MultiAgentPrioritizedReplayBuffer.
        """
        # Test with PrioritizedReplayBuffer, args for c'tor, add and sample
        prioritized_replay_buffer_config = {
            "type": PrioritizedReplayBuffer,
            "alpha": 0.6,
            "beta": 0.4,
        }

        num_policies = 2
        buffer_size = 15
        num_batches = 1

        buffer = MultiAgentReplayBuffer(
            capacity=buffer_size,
            replay_mode="independent",
            learning_starts=0,
            num_shards=1,
            underlying_buffer_config=prioritized_replay_buffer_config,
        )

        self._add_multi_agent_batch_to_buffer(
            buffer, num_policies=num_policies, num_batches=num_batches
        )

        # Only test if we can sample from multiple policies
        sample = buffer.sample(2)
        assert len(sample) == 4
        assert len(sample.policy_batches) == 2
Example #4
    def set_state(self, state: Dict[str, Any]) -> None:
        """Restores all local state to the provided `state`.

        Args:
            state: The new state to set this buffer to. Can be obtained by
                calling `self.get_state()`.
        """
        self.last_added_batches = state["last_added_batches"]
        MultiAgentReplayBuffer.set_state(self, state)
Example #5
    def test_policy_id_of_multi_agent_batches_independent(self):
        """Test if indepent sampling yields a MultiAgentBatch with the
        correct policy id."""
        self.batch_id = 0

        # Test independent mode with different policy ids using MultiAgentBatches
        buffer = MultiAgentReplayBuffer(
            capacity=10, replay_mode="independent", learning_starts=0, num_shards=1
        )

        self._add_multi_agent_batch_to_buffer(buffer, num_policies=1, num_batches=1)

        mabatch = buffer.sample(1)
        assert list(mabatch.policy_batches.keys())[0] == 0
Example #6
    def test_independent_mode_sequences_storage_unit(self):
        """Test the independent mode with sequences as a storage unit.

        Such SampleBatches are converted to MultiAgentBatches as if there
        was only one policy."""
        buffer_size = 15
        self.batch_id = 0

        buffer = MultiAgentReplayBuffer(
            capacity=buffer_size,
            replay_mode="independent",
            storage_unit="sequences",
            replay_sequence_length=2,
            learning_starts=0,
            num_shards=1,
        )

        # Test add/sample
        self._add_multi_agent_batch_to_buffer(buffer,
                                              num_policies=2,
                                              num_batches=1,
                                              seq_lens=True)

        # Sampling from it now should yield the first batch
        assert get_batch_id(buffer.sample(1), 0) == 0

        self._add_multi_agent_batch_to_buffer(buffer,
                                              num_policies=2,
                                              num_batches=2,
                                              seq_lens=True)

        # Sampling from it now should yield each batch that went into a
        # multiagent batch 1/6th of the time
        num_sampled_dict = {_id: 0 for _id in range(self.batch_id)}
        num_samples = 200
        for i in range(num_samples):
            sample = buffer.sample(1)
            # Count one of both policy batches
            _id = get_batch_id(sample, np.random.choice([0, 1]))
            num_sampled_dict[_id] += 1
            # See if a random batch has the desired sequence length of two
            assert len(sample.policy_batches[np.random.choice([0, 1])]) == 2
        assert np.allclose(
            np.array(list(num_sampled_dict.values())) / num_samples,
            len(num_sampled_dict) * [1 / 6],
            atol=0.1,
        )
Example #7
    def test_set_get_state(self):
        num_policies = 2
        buffer_size = 15
        num_batches = 1

        buffer = MultiAgentReplayBuffer(
            capacity=buffer_size,
            replay_mode="independent",
            learning_starts=0,
            num_shards=1,
        )

        self._add_multi_agent_batch_to_buffer(
            buffer, num_policies=num_policies, num_batches=num_batches
        )

        state = buffer.get_state()

        another_buffer = MultiAgentReplayBuffer(
            capacity=buffer_size,
            replay_mode="independent",
            learning_starts=0,
            num_shards=1,
        )

        another_buffer.set_state(state)

        # State is equal to set of states of underlying buffers
        for _id, _buffer in buffer.replay_buffers.items():
            assert _buffer.get_state() == another_buffer.replay_buffers[_id].get_state()

        assert buffer._num_added == another_buffer._num_added
Example #8
    def test_independent_mode_multiple_policies(self):
        """Test the lockstep mode by adding batches from multiple policies."""

        num_batches = 3
        buffer_size = 15
        num_policies = 2
        # Test independent mode with different policy ids using MultiAgentBatches

        self.batch_id = 0

        buffer = MultiAgentReplayBuffer(
            capacity=buffer_size,
            replay_mode="independent",
            learning_starts=0,
            num_shards=1,
        )

        self._add_multi_agent_batch_to_buffer(
            buffer, num_policies=num_policies, num_batches=num_batches
        )

        # For each policy, sample 4 items from that policy only and check
        # their policy ids
        for _id in range(num_policies):
            for __id in buffer.sample(4, policy_id=_id).policy_batches[_id][
                "policy_id"
            ]:
                assert __id == _id

        # Sampling without specifying a policy should yield the same number
        # of batches from each policy
        num_sampled_dict = {_id: 0 for _id in range(num_policies)}
        num_samples = 200
        for i in range(num_samples):
            num_items = np.random.randint(0, 5)
            for _id, batch in buffer.sample(num_items=num_items).policy_batches.items():
                num_sampled_dict[_id] += 1
                assert len(batch) == num_items
        assert np.allclose(
            np.array(list(num_sampled_dict.values())),
            len(num_sampled_dict) * [200],
            atol=0.1,
        )
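
The helper `_add_multi_agent_batch_to_buffer` used in these tests is also not shown. A rough sketch consistent with the assertions above (integer policy ids, a "policy_id" column per policy batch, and a shared "batch_id" counter); the field names and batch layout are assumptions:

# Sketch (not the original helper): builds one MultiAgentBatch per iteration
# with one SampleBatch per integer policy id. Field names are assumptions.
import numpy as np
from ray.rllib.policy.sample_batch import SampleBatch, MultiAgentBatch

def _add_multi_agent_batch_to_buffer(
    self, buffer, num_policies, num_batches=1, seq_lens=False, batch_size=4
):
    # Method of the test case; `self.batch_id` is the running batch counter.
    def _generate_data(policy_id):
        data = {
            SampleBatch.T: list(range(batch_size)),
            SampleBatch.ACTIONS: batch_size * [np.random.choice([0, 1])],
            SampleBatch.REWARDS: batch_size * [np.random.rand()],
            SampleBatch.DONES: [False] * (batch_size - 1) + [True],
            "batch_id": batch_size * [self.batch_id],
            "policy_id": batch_size * [policy_id],
        }
        if seq_lens:
            # Must be consistent with the `replay_sequence_length` the test
            # constructs the buffer with (2 in the sequences test above).
            data[SampleBatch.SEQ_LENS] = [2] * (batch_size // 2)
        self.batch_id += 1
        return SampleBatch(data)

    for _ in range(num_batches):
        policy_batches = {i: _generate_data(i) for i in range(num_policies)}
        buffer.add(MultiAgentBatch(policy_batches, env_steps=batch_size))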
Example #9
    def get_state(self) -> Dict[str, Any]:
        """Returns all local state.

        Returns:
            The serializable local state.
        """
        data = {
            "last_added_batches": self.last_added_batches,
        }
        parent = MultiAgentReplayBuffer.get_state(self)
        parent.update(data)
        return parent
Example #10
    def test_lockstep_with_underlying_replay_buffer(self):
        """Test this the buffer with different underlying buffers.

        Test if we can initialize a simple underlying buffer without
        additional arguments and lockstep sampling.
        """
        # Test with ReplayBuffer, no args for c'tor, add and sample
        replay_buffer_config = {"type": ReplayBuffer}

        num_policies = 2
        buffer_size = 200
        num_batches = 20

        buffer = MultiAgentReplayBuffer(
            capacity=buffer_size,
            replay_mode="lockstep",
            learning_starts=0,
            num_shards=1,
            underlying_buffer_config=replay_buffer_config,
        )

        self._add_multi_agent_batch_to_buffer(
            buffer, num_policies=num_policies - 1, num_batches=num_batches
        )

        # Only test if we can sample and if samples belong to a single policy
        sample = buffer.sample(2)
        assert len(sample) == 2
        assert len(sample.policy_batches) == 1

        self._add_multi_agent_batch_to_buffer(
            buffer, num_policies=num_policies, num_batches=num_batches
        )

        # Only test if we can sample from multiple policies; out of 100
        # samples, some should come from each policy
        sample = buffer.sample(100)
        assert len(sample) == 100
        assert len(sample.policy_batches) == 2
Example #11
    def test_store_to_replay_local(self):
        buf = MultiAgentReplayBuffer(
            num_shards=1,
            learning_starts=200,
            capacity=1000,
            prioritized_replay_alpha=0.6,
            prioritized_replay_beta=0.4,
            prioritized_replay_eps=0.0001,
        )
        assert len(buf.sample(100)) == 0

        workers = make_workers(0)
        a = ParallelRollouts(workers, mode="bulk_sync")
        b = a.for_each(StoreToReplayBuffer(local_buffer=buf))

        next(b)
        assert len(buf.sample(100)) == 0  # learning hasn't started yet
        next(b)
        assert buf.sample(100).count == 100

        replay_op = Replay(local_buffer=buf, num_items_to_replay=100)
        assert next(replay_op).count == 100
Example #12
    def __init__(
        self,
        capacity: int = 10000,
        storage_unit: str = "timesteps",
        num_shards: int = 1,
        learning_starts: int = 1000,
        replay_batch_size: int = 1,
        prioritized_replay_alpha: float = 0.6,
        prioritized_replay_beta: float = 0.4,
        prioritized_replay_eps: float = 1e-6,
        replay_mode: str = "independent",
        replay_sequence_length: int = 1,
        replay_burn_in: int = 0,
        replay_zero_init_states: bool = True,
        replay_ratio: float = 0.66,
    ):
        """Initializes MixInMultiAgentReplayBuffer instance.

        Args:
            capacity: The capacity of the buffer (number of items to store
                in total). Note that when `replay_sequence_length` > 1, this
                is the number of sequences (not single timesteps) stored.
            storage_unit: Either 'sequences' or 'timesteps'. Specifies
                how experiences are stored.
            num_shards: The number of buffer shards that exist in total
                (including this one).
            learning_starts: Number of timesteps after which a call to
                `replay()` will yield samples (before that, `replay()` will
                return None).
            replay_batch_size: The batch size to be sampled (in timesteps).
                Note that if `replay_sequence_length` > 1,
                `self.replay_batch_size` will be set to the number of
                sequences sampled (B).
            prioritized_replay_alpha: Alpha parameter for a prioritized
                replay buffer. Use 0.0 for no prioritization.
            prioritized_replay_beta: Beta parameter for a prioritized
                replay buffer.
            prioritized_replay_eps: Epsilon parameter for a prioritized
                replay buffer.
            replay_mode: One of "independent" or "lockstep". Determines
                whether, in the multi-agent case, sampling is done across all
                agents/policies equally.
            replay_sequence_length: The sequence length (T) of a single
                sample. If > 1, we will sample B x T from this buffer.
            replay_burn_in: The burn-in length in case
                `replay_sequence_length` > 0. This is the number of timesteps
                each sequence overlaps with the previous one to generate a
                better internal state (=state after the burn-in), instead of
                starting from 0.0 for each RNN rollout.
            replay_zero_init_states: Whether the initial states in the
                buffer (if replay_sequence_length > 0) are always 0.0 or
                should be updated with the previous train_batch state outputs.
            replay_ratio: Ratio of replayed samples in the returned
                batches. E.g. a ratio of 0.0 means only return new samples
                (no replay), a ratio of 0.5 means always return newest sample
                plus one old one (1:1), a ratio of 0.66 means always return
                the newest sample plus 2 old (replayed) ones (1:2), etc...
        """
        if not 0 <= replay_ratio <= 1:
            raise ValueError("Replay ratio must be within [0, 1]")

        MultiAgentReplayBuffer.__init__(
            self,
            capacity,
            storage_unit,
            num_shards,
            learning_starts,
            replay_batch_size,
            prioritized_replay_alpha,
            prioritized_replay_beta,
            prioritized_replay_eps,
            replay_mode,
            replay_sequence_length,
            replay_burn_in,
            replay_zero_init_states,
        )

        self.replay_ratio = replay_ratio
        self.replay_proportion = None
        if self.replay_ratio != 1.0:
            self.replay_proportion = self.replay_ratio / (1.0 -
                                                          self.replay_ratio)

        # Last added batch(es).
        self.last_added_batches = collections.defaultdict(list)
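
As a quick illustration of the ratio-to-proportion arithmetic above (a standalone sketch, not part of the class):

# replay_proportion = replay_ratio / (1 - replay_ratio): number of old
# (replayed) samples returned per newly added sample.
for ratio in (0.0, 0.5, 0.66):
    proportion = ratio / (1.0 - ratio)
    print(f"replay_ratio={ratio} -> ~{proportion:.2f} replayed per new sample")
# replay_ratio=0.0  -> ~0.00 replayed per new sample (only new samples)
# replay_ratio=0.5  -> ~1.00 replayed per new sample (1:1)
# replay_ratio=0.66 -> ~1.94 replayed per new sample (roughly 1:2)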
Example #13
    def __init__(
        self,
        capacity: int = 10000,
        storage_unit: str = "timesteps",
        num_shards: int = 1,
        replay_batch_size: int = 1,
        learning_starts: int = 1000,
        replay_mode: str = "independent",
        replay_sequence_length: int = 1,
        replay_burn_in: int = 0,
        replay_zero_init_states: bool = True,
        prioritized_replay_alpha: float = 0.6,
        prioritized_replay_beta: float = 0.4,
        prioritized_replay_eps: float = 1e-6,
        underlying_buffer_config: dict = None,
        **kwargs
    ):
        """Initializes a MultiAgentReplayBuffer instance.

        Args:
            num_shards: The number of buffer shards that exist in total
                (including this one).
            storage_unit: Either 'timesteps', 'sequences' or
                'episodes'. Specifies how experiences are stored. If they
                are stored in episodes, replay_sequence_length is ignored.
            learning_starts: Number of timesteps after which a call to
                `replay()` will yield samples (before that, `replay()` will
                return None).
            capacity: The capacity of the buffer. Note that when
                `replay_sequence_length` > 1, this is the number of sequences
                (not single timesteps) stored.
            replay_batch_size: The batch size to be sampled (in timesteps).
                Note that if `replay_sequence_length` > 1,
                `self.replay_batch_size` will be set to the number of
                sequences sampled (B).
            prioritized_replay_alpha: Alpha parameter for a prioritized
                replay buffer. Use 0.0 for no prioritization.
            prioritized_replay_beta: Beta parameter for a prioritized
                replay buffer.
            prioritized_replay_eps: Epsilon parameter for a prioritized
                replay buffer.
            replay_sequence_length: The sequence length (T) of a single
                sample. If > 1, we will sample B x T from this buffer.
            replay_burn_in: The burn-in length in case
                `replay_sequence_length` > 0. This is the number of timesteps
                each sequence overlaps with the previous one to generate a
                better internal state (=state after the burn-in), instead of
                starting from 0.0 each RNN rollout.
            replay_zero_init_states: Whether the initial states in the
                buffer (if replay_sequence_length > 0) are always 0.0 or
                should be updated with the previous train_batch state outputs.
            underlying_buffer_config: A config that contains all necessary
                constructor arguments and arguments for methods to call on
                the underlying buffers. This replaces the standard behaviour
                of the underlying PrioritizedReplayBuffer. The config
                follows the conventions of the general
                replay_buffer_config. kwargs for subsequent calls of methods
                may also be included. Example:
                "replay_buffer_config": {"type": PrioritizedReplayBuffer,
                "capacity": 10, "storage_unit": "timesteps",
                "prioritized_replay_alpha": 0.5,
                "prioritized_replay_beta": 0.5,
                "prioritized_replay_eps": 0.5}
            **kwargs: Forward compatibility kwargs.
        """
        if "replay_mode" in kwargs and (
            kwargs["replay_mode"] == "lockstep"
            or kwargs["replay_mode"] == ReplayMode.LOCKSTEP
        ):
            if log_once("lockstep_mode_not_supported"):
                logger.error(
                    "Replay mode `lockstep` is not supported for "
                    "MultiAgentPrioritizedReplayBuffer. "
                    "This buffer will run in `independent` mode."
                )
            kwargs["replay_mode"] = "independent"

        if underlying_buffer_config is not None:
            if log_once("underlying_buffer_config_not_supported"):
                logger.info(
                    "PrioritizedMultiAgentReplayBuffer instantiated "
                    "with underlying_buffer_config. This will "
                    "overwrite the standard behaviour of the "
                    "underlying PrioritizedReplayBuffer."
                )
            prioritized_replay_buffer_config = underlying_buffer_config
        else:
            prioritized_replay_buffer_config = {
                "type": PrioritizedReplayBuffer,
                "alpha": prioritized_replay_alpha,
                "beta": prioritized_replay_beta,
            }

        shard_capacity = capacity // num_shards
        MultiAgentReplayBuffer.__init__(
            self,
            shard_capacity,
            storage_unit,
            **kwargs,
            underlying_buffer_config=prioritized_replay_buffer_config,
            replay_batch_size=replay_batch_size,
            learning_starts=learning_starts,
            replay_mode=replay_mode,
            replay_sequence_length=replay_sequence_length,
            replay_burn_in=replay_burn_in,
            replay_zero_init_states=replay_zero_init_states,
        )

        self.prioritized_replay_eps = prioritized_replay_eps
        self.update_priorities_timer = TimerStat()
Example #14
class MyTrainer(Trainer):
    @classmethod
    @override(Trainer)
    def get_default_config(cls) -> TrainerConfigDict:
        # Run this Trainer with the new `training_iteration` API and set some
        # PPO-specific parameters.
        return with_common_config({
            "num_sgd_iter": 10,
            "sgd_minibatch_size": 128,
        })

    @override(Trainer)
    def setup(self, config):
        # Call super's `setup` to create rollout workers.
        super().setup(config)
        # Create local replay buffer.
        self.local_replay_buffer = MultiAgentReplayBuffer(num_shards=1,
                                                          learning_starts=1000,
                                                          capacity=50000)

    @override(Trainer)
    def training_iteration(self) -> ResultDict:
        # Generate common experiences, collect batch for PPO, store every (DQN) batch
        # into replay buffer.
        ppo_batches = []
        num_env_steps = 0
        # PPO batch size fixed at 200.
        while num_env_steps < 200:
            ma_batches = synchronous_parallel_sample(worker_set=self.workers,
                                                     concat=False)
            # Loop through ma-batches (collected in parallel).
            for ma_batch in ma_batches:
                # Update sampled counters.
                self._counters[NUM_ENV_STEPS_SAMPLED] += ma_batch.count
                self._counters[
                    NUM_AGENT_STEPS_SAMPLED] += ma_batch.agent_steps()
                ppo_batch = ma_batch.policy_batches.pop("ppo_policy")
                # Add collected batches (only for DQN policy) to replay buffer.
                self.local_replay_buffer.add(ma_batch)

                ppo_batches.append(ppo_batch)
                num_env_steps += ppo_batch.count

        # DQN sub-flow.
        dqn_train_results = {}
        dqn_train_batch = self.local_replay_buffer.sample(num_items=64)
        if dqn_train_batch is not None:
            dqn_train_results = train_one_step(self, dqn_train_batch,
                                               ["dqn_policy"])
            self._counters[
                "agent_steps_trained_DQN"] += dqn_train_batch.agent_steps()
            print(
                "DQN policy learning on samples from",
                "agent steps trained",
                dqn_train_batch.agent_steps(),
            )
        # Update DQN's target net every 500 train steps.
        if (self._counters["agent_steps_trained_DQN"] -
                self._counters[LAST_TARGET_UPDATE_TS] >= 500):
            self.workers.local_worker().get_policy(
                "dqn_policy").update_target()
            self._counters[NUM_TARGET_UPDATES] += 1
            self._counters[LAST_TARGET_UPDATE_TS] = self._counters[
                "agent_steps_trained_DQN"]

        # PPO sub-flow.
        ppo_train_batch = SampleBatch.concat_samples(ppo_batches)
        self._counters[
            "agent_steps_trained_PPO"] += ppo_train_batch.agent_steps()
        # Standardize advantages.
        ppo_train_batch[Postprocessing.ADVANTAGES] = standardized(
            ppo_train_batch[Postprocessing.ADVANTAGES])
        print(
            "PPO policy learning on samples from",
            "agent steps trained",
            ppo_train_batch.agent_steps(),
        )
        ppo_train_batch = MultiAgentBatch({"ppo_policy": ppo_train_batch},
                                          ppo_train_batch.count)
        ppo_train_results = train_one_step(self, ppo_train_batch,
                                           ["ppo_policy"])

        # Combine results for PPO and DQN into one results dict.
        results = dict(ppo_train_results, **dqn_train_results)
        return results
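
A possible way to launch such a custom Trainer via Tune; the environment name, policy specs, and mapping function below are placeholders and not part of the original example (in practice, "ppo_policy" and "dqn_policy" would each be given their concrete policy class):

# Launch sketch -- all config values below are illustrative placeholders.
from ray import tune
from ray.rllib.policy.policy import PolicySpec

if __name__ == "__main__":
    config = {
        "env": "my_multi_agent_env",  # hypothetical registered env name
        "num_workers": 2,
        "multiagent": {
            # Placeholder specs; pass the actual PPO/DQN policy classes here.
            "policies": {
                "ppo_policy": PolicySpec(),
                "dqn_policy": PolicySpec(),
            },
            "policy_mapping_fn": (
                lambda agent_id, episode, worker, **kwargs: (
                    "ppo_policy" if agent_id == 0 else "dqn_policy"
                )
            ),
        },
    }
    tune.run(MyTrainer, config=config, stop={"training_iteration": 10})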