    def __init__(
        self, multi_gpu_learner_thread: MultiGPULearnerThread, share_stats: bool
    ):
        threading.Thread.__init__(self)
        self.multi_gpu_learner_thread = multi_gpu_learner_thread
        self.daemon = True
        if share_stats:
            self.queue_timer = multi_gpu_learner_thread.queue_timer
            self.load_timer = multi_gpu_learner_thread.load_timer
        else:
            self.queue_timer = _Timer()
            self.load_timer = _Timer()
Example #2
    def __init__(self, local_worker):
        threading.Thread.__init__(self)
        self.learner_queue_size = WindowStat("size", 50)
        self.local_worker = local_worker
        self.inqueue = queue.Queue(maxsize=LEARNER_QUEUE_MAX_SIZE)
        self.outqueue = queue.Queue()
        self.queue_timer = _Timer()
        self.grad_timer = _Timer()
        self.overall_timer = _Timer()
        self.daemon = True
        self.weights_updated = False
        self.stopped = False
        self.learner_info = {}
Example #3
    def __init__(
        self,
        capacity: int,
        replay_ratio: float,
        replay_mode: ReplayMode = ReplayMode.INDEPENDENT,
    ):
        """Initializes MixInReplay instance.

        Args:
            capacity: Number of batches to store in total.
            replay_ratio: Ratio of replayed samples in the returned
                batches. E.g. a ratio of 0.0 means only return new samples
                (no replay), a ratio of 0.5 means always return the newest
                sample plus one old (replayed) one (1:1), and a ratio of 0.66
                means always return the newest sample plus two old (replayed)
                ones (1:2), etc.
        """
        self.capacity = capacity
        self.replay_ratio = replay_ratio
        self.replay_proportion = None
        if self.replay_ratio != 1.0:
            self.replay_proportion = self.replay_ratio / (1.0 - self.replay_ratio)

        if replay_mode in ["lockstep", ReplayMode.LOCKSTEP]:
            self.replay_mode = ReplayMode.LOCKSTEP
        elif replay_mode in ["independent", ReplayMode.INDEPENDENT]:
            self.replay_mode = ReplayMode.INDEPENDENT
        else:
            raise ValueError("Unsupported replay mode: {}".format(replay_mode))

        def new_buffer():
            return SimpleReplayBuffer(num_slots=capacity)

        self.replay_buffers = collections.defaultdict(new_buffer)

        # Metrics.
        self.add_batch_timer = _Timer()
        self.replay_timer = _Timer()
        self.update_priorities_timer = _Timer()

        # Added timesteps over lifetime.
        self.num_added = 0

        # Last added batch(es).
        self.last_added_batches = collections.defaultdict(list)
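
As the docstring above explains, replay_ratio is converted into replay_proportion = ratio / (1 - ratio), i.e. the average number of old (replayed) samples returned per new sample. A minimal standalone sketch of that arithmetic; the helper name is hypothetical and not part of RLlib:

# Sketch of the replay_ratio -> replay_proportion arithmetic used above.
# `expected_old_per_new` is a hypothetical helper, not an RLlib API.
def expected_old_per_new(replay_ratio: float) -> float:
    """Average number of replayed (old) samples per new sample."""
    if replay_ratio == 1.0:
        return float("inf")  # only replayed samples would be returned
    return replay_ratio / (1.0 - replay_ratio)


if __name__ == "__main__":
    for ratio in (0.0, 0.5, 0.66):
        print(ratio, round(expected_old_per_new(ratio), 2))
    # 0.0 -> 0.0 (no replay), 0.5 -> 1.0 (1:1), 0.66 -> 1.94 (~1:2)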
Example #4
    def __init__(
        self,
        local_worker: RolloutWorker,
        minibatch_buffer_size: int,
        num_sgd_iter: int,
        learner_queue_size: int,
        learner_queue_timeout: int,
    ):
        """Initialize the learner thread.

        Args:
            local_worker: process local rollout worker holding
                policies this thread will call learn_on_batch() on
            minibatch_buffer_size: max number of train batches to store
                in the minibatching buffer
            num_sgd_iter: number of passes to learn on per train batch
            learner_queue_size: max size of queue of inbound
                train batches to this thread
            learner_queue_timeout: raise an exception if the queue has
                been empty for this long in seconds
        """
        threading.Thread.__init__(self)
        self.learner_queue_size = WindowStat("size", 50)
        self.local_worker = local_worker
        self.inqueue = queue.Queue(maxsize=learner_queue_size)
        self.outqueue = queue.Queue()
        self.minibatch_buffer = MinibatchBuffer(
            inqueue=self.inqueue,
            size=minibatch_buffer_size,
            timeout=learner_queue_timeout,
            num_passes=num_sgd_iter,
            init_num_passes=num_sgd_iter,
        )
        self.queue_timer = _Timer()
        self.grad_timer = _Timer()
        self.load_timer = _Timer()
        self.load_wait_timer = _Timer()
        self.daemon = True
        self.weights_updated = False
        self.learner_info = {}
        self.stopped = False
        self.num_steps = 0
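
The docstring above describes a producer/consumer setup: rollout batches arrive on inqueue, the MinibatchBuffer hands each batch out num_sgd_iter times, and results are placed on outqueue. Below is a simplified, self-contained sketch of that queue-plus-daemon-thread pattern, using hypothetical stand-ins (FakeLearnerThread, train_step) rather than RLlib classes:

import queue
import threading


def train_step(batch):
    # Hypothetical stand-in for local_worker.learn_on_batch(batch).
    return {"loss": sum(batch) / len(batch)}


class FakeLearnerThread(threading.Thread):
    def __init__(self, num_sgd_iter=2, learner_queue_timeout=1):
        super().__init__(daemon=True)
        self.inqueue = queue.Queue(maxsize=16)
        self.outqueue = queue.Queue()
        self.num_sgd_iter = num_sgd_iter
        self.timeout = learner_queue_timeout
        self.stopped = False

    def run(self):
        while not self.stopped:
            try:
                batch = self.inqueue.get(timeout=self.timeout)
            except queue.Empty:
                continue
            # Make several passes over the same batch, similar to what the
            # MinibatchBuffer does with num_passes=num_sgd_iter.
            for _ in range(self.num_sgd_iter):
                stats = train_step(batch)
            self.outqueue.put(stats)


if __name__ == "__main__":
    t = FakeLearnerThread()
    t.start()
    t.inqueue.put([1.0, 2.0, 3.0])
    print(t.outqueue.get())  # {'loss': 2.0}
    t.stopped = True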
Example #5
    def _update_policy(policy: Policy, replay_actor: ActorHandle, pid: PolicyID):
        if not hasattr(policy, "_target_and_kl_stats"):
            policy._target_and_kl_stats = {
                LAST_TARGET_UPDATE_TS: 0,
                NUM_TARGET_UPDATES: 0,
                NUM_AGENT_STEPS_TRAINED: 0,
                TARGET_NET_UPDATE_TIMER: _Timer(),
            }

        train_results = policy.learn_on_batch_from_replay_buffer(
            replay_actor=replay_actor, policy_id=pid
        )

        if not train_results:
            return train_results

        # Update target net and KL.
        with policy._target_and_kl_stats[TARGET_NET_UPDATE_TIMER]:
            policy._target_and_kl_stats[NUM_AGENT_STEPS_TRAINED] += train_results[
                NUM_AGENT_STEPS_TRAINED
            ]
            target_update_freq = (
                policy.config["num_sgd_iter"]
                * policy.config["replay_buffer_capacity"]
                * policy.config["train_batch_size"]
            )
            cur_ts = policy._target_and_kl_stats[NUM_AGENT_STEPS_TRAINED]
            last_update = policy._target_and_kl_stats[LAST_TARGET_UPDATE_TS]

            # Update target networks on all policy learners.
            if cur_ts - last_update > target_update_freq:
                policy._target_and_kl_stats[NUM_TARGET_UPDATES] += 1
                policy._target_and_kl_stats[LAST_TARGET_UPDATE_TS] = cur_ts
                policy.update_target()
                # Also update Policy's current KL coeff.
                if policy.config["use_kl_loss"]:
                    kl = train_results[LEARNER_STATS_KEY].get("kl")
                    assert kl is not None, train_results
                    # Make the actual `Policy.update_kl()` call.
                    policy.update_kl(kl)

        return train_results
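
The commented block above triggers a target-network update once the trained-step counter has advanced by more than target_update_freq since the last update. A standalone sketch of that cadence check, using hypothetical dictionary keys and a hypothetical helper, with no RLlib imports:

def maybe_update_target(stats, steps_trained_this_iter, target_update_freq):
    # Mirror of the counter bookkeeping above (hypothetical keys).
    stats["num_agent_steps_trained"] += steps_trained_this_iter
    cur_ts = stats["num_agent_steps_trained"]
    if cur_ts - stats["last_target_update_ts"] > target_update_freq:
        stats["num_target_updates"] += 1
        stats["last_target_update_ts"] = cur_ts
        return True  # the caller would run policy.update_target() here
    return False


if __name__ == "__main__":
    stats = {
        "num_agent_steps_trained": 0,
        "last_target_update_ts": 0,
        "num_target_updates": 0,
    }
    for _ in range(10):
        maybe_update_target(stats, steps_trained_this_iter=300,
                            target_update_freq=1000)
    print(stats["num_target_updates"])  # 2 (updates fire at 1200 and 2400 steps)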
Example #6
    def __init__(self,
                 capacity: int = 10000,
                 storage_unit: str = "timesteps",
                 num_shards: int = 1,
                 learning_starts: int = 1000,
                 replay_mode: str = "independent",
                 replay_sequence_override: bool = True,
                 replay_sequence_length: int = 1,
                 replay_burn_in: int = 0,
                 replay_zero_init_states: bool = True,
                 underlying_buffer_config: dict = None,
                 **kwargs):
        """Initializes a MultiAgentReplayBuffer instance.

        Args:
            capacity: The capacity of the buffer, measured in `storage_unit`.
            storage_unit: Either 'timesteps', 'sequences' or
                'episodes'. Specifies how experiences are stored. If they
                are stored in episodes, replay_sequence_length is ignored.
            num_shards: The number of buffer shards that exist in total
                (including this one).
            learning_starts: Number of timesteps after which a call to
                `sample()` will yield samples (before that, `sample()` will
                return None).
            replay_mode: One of "independent" or "lockstep". Determines
                whether batches are sampled independently or in equal
                amounts.
            replay_sequence_override: If True, ignore sequences found in incoming
                batches, slicing them into sequences as specified by
                `replay_sequence_length` and `replay_burn_in`. This only has
                an effect if storage_unit is `sequences`.
            replay_sequence_length: The sequence length (T) of a single
                sample. If > 1, we will sample B x T from this buffer. This
                only has an effect if storage_unit is 'timesteps'.
            replay_burn_in: This is the number of timesteps
                each sequence overlaps with the previous one to generate a
                better internal state (= the state after the burn-in), instead
                of starting every RNN rollout from an all-zeros state. This
                only has an effect if storage_unit is `sequences`.
            replay_zero_init_states: Whether the initial states in the
                buffer (if replay_sequence_length > 0) are always 0.0 or
                should be updated with the previous train_batch state outputs.
            underlying_buffer_config: A config that contains all necessary
                constructor arguments and arguments for methods to call on
                the underlying buffers.
            ``**kwargs``: Forward compatibility kwargs.
        """
        shard_capacity = capacity // num_shards
        ReplayBuffer.__init__(self, capacity, storage_unit)

        # If the user provides an underlying buffer config, we use it to
        # instantiate and interact with the underlying buffers.
        self.underlying_buffer_config = underlying_buffer_config
        if self.underlying_buffer_config is not None:
            self.underlying_buffer_call_args = self.underlying_buffer_config
        else:
            self.underlying_buffer_call_args = {}
        self.replay_sequence_override = replay_sequence_override
        self.replay_starts = learning_starts // num_shards
        self.replay_mode = replay_mode
        self.replay_sequence_length = replay_sequence_length
        self.replay_burn_in = replay_burn_in
        self.replay_zero_init_states = replay_zero_init_states

        if (replay_sequence_length > 1
                and self.storage_unit is not StorageUnit.SEQUENCES):
            logger.warning(
                "MultiAgentReplayBuffer configured with "
                "`replay_sequence_length={}`, but `storage_unit={}`. "
                "replay_sequence_length will be ignored and set to 1.".format(
                    replay_sequence_length, storage_unit))
            self.replay_sequence_length = 1

        if replay_sequence_length == 1 and self.storage_unit is StorageUnit.SEQUENCES:
            logger.warning(
                "MultiAgentReplayBuffer configured with "
                "`replay_sequence_length={}`, but `storage_unit={}`. "
                "This will result in sequences equal to timesteps.".format(
                    replay_sequence_length, storage_unit))

        if replay_mode in ["lockstep", ReplayMode.LOCKSTEP]:
            self.replay_mode = ReplayMode.LOCKSTEP
            if self.storage_unit in [
                    StorageUnit.EPISODES, StorageUnit.SEQUENCES
            ]:
                raise ValueError("MultiAgentReplayBuffer does not support "
                                 "lockstep mode with storage unit `episodes`"
                                 "or `sequences`.")
        elif replay_mode in ["independent", ReplayMode.INDEPENDENT]:
            self.replay_mode = ReplayMode.INDEPENDENT
        else:
            raise ValueError("Unsupported replay mode: {}".format(replay_mode))

        if self.underlying_buffer_config:
            ctor_args = {
                **{
                    "capacity": shard_capacity,
                    "storage_unit": StorageUnit.FRAGMENTS
                },
                **self.underlying_buffer_config,
            }

            def new_buffer():
                return from_config(self.underlying_buffer_config["type"],
                                   ctor_args)

        else:
            # Default case
            def new_buffer():
                self.underlying_buffer_call_args = {}
                return ReplayBuffer(
                    self.capacity,
                    storage_unit=StorageUnit.FRAGMENTS,
                )

        self.replay_buffers = collections.defaultdict(new_buffer)

        # Metrics.
        self.add_batch_timer = _Timer()
        self.replay_timer = _Timer()
        self._num_added = 0
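
collections.defaultdict(new_buffer) above lazily creates one underlying buffer per policy ID the first time that policy's samples are added. A self-contained sketch of the same pattern, with ToyBuffer as a hypothetical stand-in for the underlying ReplayBuffer:

import collections


class ToyBuffer:
    # Hypothetical stand-in for the underlying ReplayBuffer.
    def __init__(self, capacity):
        self.capacity = capacity
        self.items = []

    def add(self, item):
        # Keep only the most recent `capacity` items.
        self.items = (self.items + [item])[-self.capacity:]


def make_per_policy_buffers(capacity):
    def new_buffer():
        return ToyBuffer(capacity)

    # One buffer per policy id, created on first access.
    return collections.defaultdict(new_buffer)


if __name__ == "__main__":
    buffers = make_per_policy_buffers(capacity=3)
    buffers["policy_a"].add({"obs": 1})
    buffers["policy_b"].add({"obs": 2})
    print(sorted(buffers.keys()))  # ['policy_a', 'policy_b']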
Example #7
    def __init__(self,
                 capacity: int = 10000,
                 storage_unit: str = "timesteps",
                 num_shards: int = 1,
                 learning_starts: int = 1000,
                 replay_mode: str = "independent",
                 replay_sequence_length: int = 1,
                 replay_burn_in: int = 0,
                 replay_zero_init_states: bool = True,
                 prioritized_replay_alpha: float = 0.6,
                 prioritized_replay_beta: float = 0.4,
                 prioritized_replay_eps: float = 1e-6,
                 underlying_buffer_config: dict = None,
                 **kwargs):
        """Initializes a MultiAgentReplayBuffer instance.

        Args:
            num_shards: The number of buffer shards that exist in total
                (including this one).
            storage_unit: Either 'timesteps', 'sequences' or
                'episodes'. Specifies how experiences are stored. If they
                are stored in episodes, replay_sequence_length is ignored.
            learning_starts: Number of timesteps after which a call to
                `replay()` will yield samples (before that, `replay()` will
                return None).
            capacity: The capacity of the buffer, measured in `storage_unit`.
            prioritized_replay_alpha: Alpha parameter for a prioritized
                replay buffer. Use 0.0 for no prioritization.
            prioritized_replay_beta: Beta parameter for a prioritized
                replay buffer.
            prioritized_replay_eps: Epsilon parameter for a prioritized
                replay buffer.
            replay_sequence_length: The sequence length (T) of a single
                sample. If > 1, we will sample B x T from this buffer.
            replay_burn_in: The burn-in length in case
                `replay_sequence_length` > 0. This is the number of timesteps
                each sequence overlaps with the previous one to generate a
                better internal state (= the state after the burn-in), instead
                of starting every RNN rollout from an all-zeros state.
            replay_zero_init_states: Whether the initial states in the
                buffer (if replay_sequence_length > 0) are always 0.0 or
                should be updated with the previous train_batch state outputs.
            underlying_buffer_config: A config that contains all necessary
                constructor arguments and arguments for methods to call on
                the underlying buffers. This replaces the standard behaviour
                of the underlying PrioritizedReplayBuffer. The config
                follows the conventions of the general
                replay_buffer_config. kwargs for subsequent calls of methods
                may also be included. Example:
                "replay_buffer_config": {"type": PrioritizedReplayBuffer,
                "capacity": 10, "storage_unit": "timesteps",
                "prioritized_replay_alpha": 0.5, "prioritized_replay_beta": 0.5,
                "prioritized_replay_eps": 0.5}
            ``**kwargs``: Forward compatibility kwargs.
        """
        if "replay_mode" in kwargs and (kwargs["replay_mode"] == "lockstep"
                                        or kwargs["replay_mode"]
                                        == ReplayMode.LOCKSTEP):
            if log_once("lockstep_mode_not_supported"):
                logger.error("Replay mode `lockstep` is not supported for "
                             "MultiAgentPrioritizedReplayBuffer. "
                             "This buffer will run in `independent` mode.")
            kwargs["replay_mode"] = "independent"

        if underlying_buffer_config is not None:
            if log_once("underlying_buffer_config_not_supported"):
                logger.info("PrioritizedMultiAgentReplayBuffer instantiated "
                            "with underlying_buffer_config. This will "
                            "overwrite the standard behaviour of the "
                            "underlying PrioritizedReplayBuffer.")
            prioritized_replay_buffer_config = underlying_buffer_config
        else:
            prioritized_replay_buffer_config = {
                "type": PrioritizedReplayBuffer,
                "alpha": prioritized_replay_alpha,
                "beta": prioritized_replay_beta,
            }

        shard_capacity = capacity // num_shards
        MultiAgentReplayBuffer.__init__(
            self,
            shard_capacity,
            storage_unit,
            **kwargs,
            underlying_buffer_config=prioritized_replay_buffer_config,
            learning_starts=learning_starts,
            replay_mode=replay_mode,
            replay_sequence_length=replay_sequence_length,
            replay_burn_in=replay_burn_in,
            replay_zero_init_states=replay_zero_init_states,
        )

        self.prioritized_replay_eps = prioritized_replay_eps
        self.update_priorities_timer = _Timer()
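
prioritized_replay_eps is the small constant conventionally added to the absolute TD error so that no transition ends up with zero sampling priority, and prioritized_replay_alpha exponentiates the result. The sketch below illustrates that general convention only; it is not RLlib's exact priority-update code:

import numpy as np


def compute_priorities(td_errors, prioritized_replay_eps=1e-6,
                       prioritized_replay_alpha=0.6):
    # Common prioritized-replay convention: priority = (|td_error| + eps) ** alpha.
    # The eps term keeps every transition at a non-zero sampling probability.
    return (np.abs(td_errors) + prioritized_replay_eps) ** prioritized_replay_alpha


if __name__ == "__main__":
    print(compute_priorities(np.array([0.0, -0.5, 2.0])))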