def __init__(
    self, multi_gpu_learner_thread: MultiGPULearnerThread, share_stats: bool
):
    threading.Thread.__init__(self)
    self.multi_gpu_learner_thread = multi_gpu_learner_thread
    self.daemon = True
    if share_stats:
        # Share the parent learner thread's timers so stats are collected
        # in one place.
        self.queue_timer = multi_gpu_learner_thread.queue_timer
        self.load_timer = multi_gpu_learner_thread.load_timer
    else:
        self.queue_timer = _Timer()
        self.load_timer = _Timer()
def __init__(self, local_worker):
    threading.Thread.__init__(self)
    self.learner_queue_size = WindowStat("size", 50)
    self.local_worker = local_worker
    self.inqueue = queue.Queue(maxsize=LEARNER_QUEUE_MAX_SIZE)
    self.outqueue = queue.Queue()
    self.queue_timer = _Timer()
    self.grad_timer = _Timer()
    self.overall_timer = _Timer()
    self.daemon = True
    self.weights_updated = False
    self.stopped = False
    self.learner_info = {}
def __init__(
    self,
    capacity: int,
    replay_ratio: float,
    replay_mode: ReplayMode = ReplayMode.INDEPENDENT,
):
    """Initializes a MixInReplay instance.

    Args:
        capacity: Number of batches to store in total.
        replay_ratio: Ratio of replayed samples in the returned batches.
            E.g. a ratio of 0.0 means only return new samples (no replay),
            a ratio of 0.5 means always return the newest sample plus one
            old one (1:1), a ratio of 0.66 means always return the newest
            sample plus 2 old (replayed) ones (1:2), etc...
        replay_mode: One of "independent" or "lockstep".
    """
    self.capacity = capacity
    self.replay_ratio = replay_ratio
    self.replay_proportion = None
    if self.replay_ratio != 1.0:
        self.replay_proportion = self.replay_ratio / (1.0 - self.replay_ratio)

    if replay_mode in ["lockstep", ReplayMode.LOCKSTEP]:
        self.replay_mode = ReplayMode.LOCKSTEP
    elif replay_mode in ["independent", ReplayMode.INDEPENDENT]:
        self.replay_mode = ReplayMode.INDEPENDENT
    else:
        raise ValueError("Unsupported replay mode: {}".format(replay_mode))

    def new_buffer():
        return SimpleReplayBuffer(num_slots=capacity)

    self.replay_buffers = collections.defaultdict(new_buffer)

    # Metrics.
    self.add_batch_timer = _Timer()
    self.replay_timer = _Timer()
    self.update_priorities_timer = _Timer()

    # Added timesteps over lifetime.
    self.num_added = 0

    # Last added batch(es).
    self.last_added_batches = collections.defaultdict(list)
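# Worked example of the `replay_ratio` -> `replay_proportion` mapping used in
# the __init__ above. A standalone sketch: the `_replay_proportion` helper is
# hypothetical and only mirrors the arithmetic in __init__; a ratio of 1.0 has
# no finite proportion (replay only), hence the `None` sentinel there.
def _replay_proportion(replay_ratio: float) -> float:
    assert 0.0 <= replay_ratio < 1.0
    return replay_ratio / (1.0 - replay_ratio)


assert _replay_proportion(0.0) == 0.0  # Only new samples, no replay.
assert _replay_proportion(0.5) == 1.0  # One replayed sample per new one (1:1).
assert abs(_replay_proportion(0.66) - 1.94) < 0.01  # ~2 replayed per new (1:2).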
def __init__(
    self,
    local_worker: RolloutWorker,
    minibatch_buffer_size: int,
    num_sgd_iter: int,
    learner_queue_size: int,
    learner_queue_timeout: int,
):
    """Initialize the learner thread.

    Args:
        local_worker: Process-local rollout worker holding the policies
            this thread will call `learn_on_batch()` on.
        minibatch_buffer_size: Max number of train batches to store in the
            minibatching buffer.
        num_sgd_iter: Number of passes to learn on per train batch.
        learner_queue_size: Max size of the queue of inbound train batches
            to this thread.
        learner_queue_timeout: Raise an exception if the queue has been
            empty for this many seconds.
    """
    threading.Thread.__init__(self)
    self.learner_queue_size = WindowStat("size", 50)
    self.local_worker = local_worker
    self.inqueue = queue.Queue(maxsize=learner_queue_size)
    self.outqueue = queue.Queue()
    self.minibatch_buffer = MinibatchBuffer(
        inqueue=self.inqueue,
        size=minibatch_buffer_size,
        timeout=learner_queue_timeout,
        num_passes=num_sgd_iter,
        init_num_passes=num_sgd_iter,
    )
    self.queue_timer = _Timer()
    self.grad_timer = _Timer()
    self.load_timer = _Timer()
    self.load_wait_timer = _Timer()
    self.daemon = True
    self.weights_updated = False
    self.learner_info = {}
    self.stopped = False
    self.num_steps = 0
def _update_policy(policy: Policy, replay_actor: ActorHandle, pid: PolicyID):
    # Lazily attach target-update/KL bookkeeping to the policy object.
    if not hasattr(policy, "_target_and_kl_stats"):
        policy._target_and_kl_stats = {
            LAST_TARGET_UPDATE_TS: 0,
            NUM_TARGET_UPDATES: 0,
            NUM_AGENT_STEPS_TRAINED: 0,
            TARGET_NET_UPDATE_TIMER: _Timer(),
        }

    train_results = policy.learn_on_batch_from_replay_buffer(
        replay_actor=replay_actor, policy_id=pid
    )

    if not train_results:
        return train_results

    # Update target net and KL.
    with policy._target_and_kl_stats[TARGET_NET_UPDATE_TIMER]:
        policy._target_and_kl_stats[NUM_AGENT_STEPS_TRAINED] += train_results[
            NUM_AGENT_STEPS_TRAINED
        ]
        target_update_freq = (
            policy.config["num_sgd_iter"]
            * policy.config["replay_buffer_capacity"]
            * policy.config["train_batch_size"]
        )
        cur_ts = policy._target_and_kl_stats[NUM_AGENT_STEPS_TRAINED]
        last_update = policy._target_and_kl_stats[LAST_TARGET_UPDATE_TS]

        # Update target networks on all policy learners.
        if cur_ts - last_update > target_update_freq:
            policy._target_and_kl_stats[NUM_TARGET_UPDATES] += 1
            policy._target_and_kl_stats[LAST_TARGET_UPDATE_TS] = cur_ts
            policy.update_target()
            # Also update the Policy's current KL coeff.
            if policy.config["use_kl_loss"]:
                kl = train_results[LEARNER_STATS_KEY].get("kl")
                assert kl is not None, train_results
                # Make the actual `Policy.update_kl()` call.
                policy.update_kl(kl)

    return train_results
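# Illustrative arithmetic for the target-update cadence computed above. The
# config values below are made up for the example (not RLlib defaults); they
# only show how `target_update_freq` scales with the three config keys.
example_config = {
    "num_sgd_iter": 2,
    "replay_buffer_capacity": 4,
    "train_batch_size": 500,
}
target_update_freq = (
    example_config["num_sgd_iter"]
    * example_config["replay_buffer_capacity"]
    * example_config["train_batch_size"]
)
# The target network is refreshed once more than this many agent steps have
# been trained since the last update.
assert target_update_freq == 4000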
def __init__(
    self,
    capacity: int = 10000,
    storage_unit: str = "timesteps",
    num_shards: int = 1,
    learning_starts: int = 1000,
    replay_mode: str = "independent",
    replay_sequence_override: bool = True,
    replay_sequence_length: int = 1,
    replay_burn_in: int = 0,
    replay_zero_init_states: bool = True,
    underlying_buffer_config: dict = None,
    **kwargs,
):
    """Initializes a MultiAgentReplayBuffer instance.

    Args:
        capacity: The capacity of the buffer, measured in `storage_unit`.
        storage_unit: Either 'timesteps', 'sequences' or 'episodes'.
            Specifies how experiences are stored. If they are stored in
            episodes, replay_sequence_length is ignored.
        num_shards: The number of buffer shards that exist in total
            (including this one).
        learning_starts: Number of timesteps after which a call to
            `sample()` will yield samples (before that, `sample()` will
            return None).
        replay_mode: One of "independent" or "lockstep". Determines whether
            batches are sampled independently or to an equal amount.
        replay_sequence_override: If True, ignore sequences found in incoming
            batches, slicing them into sequences as specified by
            `replay_sequence_length` and `replay_sequence_burn_in`. This only
            has an effect if storage_unit is `sequences`.
        replay_sequence_length: The sequence length (T) of a single sample.
            If > 1, we will sample B x T from this buffer. This only has an
            effect if storage_unit is 'timesteps'.
        replay_burn_in: The number of timesteps each sequence overlaps with
            the previous one to generate a better internal state (=state
            after the burn-in), instead of starting from 0.0 each RNN
            rollout. This only has an effect if storage_unit is `sequences`.
        replay_zero_init_states: Whether the initial states in the buffer
            (if replay_sequence_length > 0) are always 0.0 or should be
            updated with the previous train_batch state outputs.
        underlying_buffer_config: A config that contains all necessary
            constructor arguments and arguments for methods to call on the
            underlying buffers.
        ``**kwargs``: Forward compatibility kwargs.
    """
    shard_capacity = capacity // num_shards
    ReplayBuffer.__init__(self, capacity, storage_unit)

    # If the user provides an underlying buffer config, we use it to
    # instantiate and interact with the underlying buffers.
    self.underlying_buffer_config = underlying_buffer_config
    if self.underlying_buffer_config is not None:
        self.underlying_buffer_call_args = self.underlying_buffer_config
    else:
        self.underlying_buffer_call_args = {}

    self.replay_sequence_override = replay_sequence_override
    self.replay_starts = learning_starts // num_shards
    self.replay_mode = replay_mode
    self.replay_sequence_length = replay_sequence_length
    self.replay_burn_in = replay_burn_in
    self.replay_zero_init_states = replay_zero_init_states

    if (
        replay_sequence_length > 1
        and self.storage_unit is not StorageUnit.SEQUENCES
    ):
        logger.warning(
            "MultiAgentReplayBuffer configured with "
            "`replay_sequence_length={}`, but `storage_unit={}`. "
            "replay_sequence_length will be ignored and set to 1.".format(
                replay_sequence_length, storage_unit
            )
        )
        self.replay_sequence_length = 1

    if replay_sequence_length == 1 and self.storage_unit is StorageUnit.SEQUENCES:
        logger.warning(
            "MultiAgentReplayBuffer configured with "
            "`replay_sequence_length={}`, but `storage_unit={}`. "
            "This will result in sequences equal to timesteps.".format(
                replay_sequence_length, storage_unit
            )
        )

    if replay_mode in ["lockstep", ReplayMode.LOCKSTEP]:
        self.replay_mode = ReplayMode.LOCKSTEP
        if self.storage_unit in [StorageUnit.EPISODES, StorageUnit.SEQUENCES]:
            raise ValueError(
                "MultiAgentReplayBuffer does not support lockstep mode "
                "with storage unit `episodes` or `sequences`."
            )
    elif replay_mode in ["independent", ReplayMode.INDEPENDENT]:
        self.replay_mode = ReplayMode.INDEPENDENT
    else:
        raise ValueError("Unsupported replay mode: {}".format(replay_mode))

    if self.underlying_buffer_config:
        ctor_args = {
            **{
                "capacity": shard_capacity,
                "storage_unit": StorageUnit.FRAGMENTS,
            },
            **self.underlying_buffer_config,
        }

        def new_buffer():
            return from_config(self.underlying_buffer_config["type"], ctor_args)

    else:
        # Default case: plain ReplayBuffers storing whole batch fragments.
        def new_buffer():
            self.underlying_buffer_call_args = {}
            return ReplayBuffer(
                self.capacity,
                storage_unit=StorageUnit.FRAGMENTS,
            )

    self.replay_buffers = collections.defaultdict(new_buffer)

    # Metrics.
    self.add_batch_timer = _Timer()
    self.replay_timer = _Timer()
    self._num_added = 0
def __init__(
    self,
    capacity: int = 10000,
    storage_unit: str = "timesteps",
    num_shards: int = 1,
    learning_starts: int = 1000,
    replay_mode: str = "independent",
    replay_sequence_length: int = 1,
    replay_burn_in: int = 0,
    replay_zero_init_states: bool = True,
    prioritized_replay_alpha: float = 0.6,
    prioritized_replay_beta: float = 0.4,
    prioritized_replay_eps: float = 1e-6,
    underlying_buffer_config: dict = None,
    **kwargs,
):
    """Initializes a MultiAgentPrioritizedReplayBuffer instance.

    Args:
        capacity: The capacity of the buffer, measured in `storage_unit`.
        storage_unit: Either 'timesteps', 'sequences' or 'episodes'.
            Specifies how experiences are stored. If they are stored in
            episodes, replay_sequence_length is ignored.
        num_shards: The number of buffer shards that exist in total
            (including this one).
        learning_starts: Number of timesteps after which a call to
            `sample()` will yield samples (before that, `sample()` will
            return None).
        prioritized_replay_alpha: Alpha parameter for a prioritized replay
            buffer. Use 0.0 for no prioritization.
        prioritized_replay_beta: Beta parameter for a prioritized replay
            buffer.
        prioritized_replay_eps: Epsilon parameter for a prioritized replay
            buffer.
        replay_sequence_length: The sequence length (T) of a single sample.
            If > 1, we will sample B x T from this buffer.
        replay_burn_in: The burn-in length in case `replay_sequence_length`
            > 0. This is the number of timesteps each sequence overlaps
            with the previous one to generate a better internal state
            (=state after the burn-in), instead of starting from 0.0 each
            RNN rollout.
        replay_zero_init_states: Whether the initial states in the buffer
            (if replay_sequence_length > 0) are always 0.0 or should be
            updated with the previous train_batch state outputs.
        underlying_buffer_config: A config that contains all necessary
            constructor arguments and arguments for methods to call on the
            underlying buffers. This replaces the standard behaviour of the
            underlying PrioritizedReplayBuffer. The config follows the
            conventions of the general replay_buffer_config. kwargs for
            subsequent calls of methods may also be included. Example:
            "replay_buffer_config": {"type": PrioritizedReplayBuffer,
            "capacity": 10, "storage_unit": "timesteps",
            "prioritized_replay_alpha": 0.5,
            "prioritized_replay_beta": 0.5,
            "prioritized_replay_eps": 0.5}
        ``**kwargs``: Forward compatibility kwargs.
    """
    if "replay_mode" in kwargs and (
        kwargs["replay_mode"] == "lockstep"
        or kwargs["replay_mode"] == ReplayMode.LOCKSTEP
    ):
        if log_once("lockstep_mode_not_supported"):
            logger.error(
                "Replay mode `lockstep` is not supported for "
                "MultiAgentPrioritizedReplayBuffer. "
                "This buffer will run in `independent` mode."
            )
        kwargs["replay_mode"] = "independent"

    if underlying_buffer_config is not None:
        if log_once("underlying_buffer_config_not_supported"):
            logger.info(
                "PrioritizedMultiAgentReplayBuffer instantiated with "
                "underlying_buffer_config. This will overwrite the standard "
                "behaviour of the underlying PrioritizedReplayBuffer."
            )
        prioritized_replay_buffer_config = underlying_buffer_config
    else:
        prioritized_replay_buffer_config = {
            "type": PrioritizedReplayBuffer,
            "alpha": prioritized_replay_alpha,
            "beta": prioritized_replay_beta,
        }

    shard_capacity = capacity // num_shards
    MultiAgentReplayBuffer.__init__(
        self,
        shard_capacity,
        storage_unit,
        **kwargs,
        underlying_buffer_config=prioritized_replay_buffer_config,
        learning_starts=learning_starts,
        replay_mode=replay_mode,
        replay_sequence_length=replay_sequence_length,
        replay_burn_in=replay_burn_in,
        replay_zero_init_states=replay_zero_init_states,
    )

    self.prioritized_replay_eps = prioritized_replay_eps
    self.update_priorities_timer = _Timer()
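# A minimal construction sketch for the prioritized variant defined above,
# relying on the default underlying PrioritizedReplayBuffer (i.e. no
# `underlying_buffer_config` override). The import path is an assumption
# (Ray RLlib 2.x layout); adjust it to your installed version.
from ray.rllib.utils.replay_buffers.multi_agent_prioritized_replay_buffer import (
    MultiAgentPrioritizedReplayBuffer,
)

prio_buffer = MultiAgentPrioritizedReplayBuffer(
    capacity=10000,
    storage_unit="timesteps",
    learning_starts=1000,
    prioritized_replay_alpha=0.6,  # 0.0 would disable prioritization.
    prioritized_replay_beta=0.4,
    prioritized_replay_eps=1e-6,
)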