def init_workers(self):
    """
    Initialize all types of workers and start their worker processes.
    """
    actor_queues = [MpQueue(2 * 1000 * 1000) for _ in range(self.cfg.num_workers)]

    policy_worker_queues = dict()
    for policy_id in range(self.cfg.num_policies):
        policy_worker_queues[policy_id] = []
        for i in range(self.cfg.policy_workers_per_policy):
            policy_worker_queues[policy_id].append(TorchJoinableQueue())

    log.info('Initializing learners...')
    policy_locks = [multiprocessing.Lock() for _ in range(self.cfg.num_policies)]
    resume_experience_collection_cv = [multiprocessing.Condition() for _ in range(self.cfg.num_policies)]

    learner_idx = 0
    for policy_id in range(self.cfg.num_policies):
        learner_worker = LearnerWorker(
            learner_idx, policy_id, self.cfg, self.obs_space, self.action_space, self.report_queue,
            policy_worker_queues[policy_id], self.traj_buffers,
            policy_locks[policy_id], resume_experience_collection_cv[policy_id],
        )
        learner_worker.start_process()
        learner_worker.init()

        self.learner_workers[policy_id] = learner_worker
        learner_idx += 1

    log.info('Initializing policy workers...')
    for policy_id in range(self.cfg.num_policies):
        self.policy_workers[policy_id] = []

        policy_queue = MpQueue()
        self.policy_queues[policy_id] = policy_queue

        for i in range(self.cfg.policy_workers_per_policy):
            policy_worker = PolicyWorker(
                i, policy_id, self.cfg, self.obs_space, self.action_space, self.traj_buffers,
                policy_queue, actor_queues, self.report_queue, policy_worker_queues[policy_id][i],
                policy_locks[policy_id], resume_experience_collection_cv[policy_id],
            )
            self.policy_workers[policy_id].append(policy_worker)
            policy_worker.start_process()

    log.info('Initializing actors...')

    # We support actor worker initialization in groups, which can be useful for some envs that
    # e.g. crash when too many environments are being initialized in parallel.
    # Currently the limit is not used since it is not required for any envs supported out of the box,
    # so we parallelize initialization as hard as we can.
    # If this is required for your environment, perhaps a better solution would be to use global locks,
    # like FileLock (see doom_gym.py)
    self.actor_workers = []
    max_parallel_init = int(1e9)  # might be useful to limit this for some envs
    worker_indices = list(range(self.cfg.num_workers))
    for i in range(0, self.cfg.num_workers, max_parallel_init):
        workers = self.init_subset(worker_indices[i:i + max_parallel_init], actor_queues)
        self.actor_workers.extend(workers)
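# Illustrative sketch (not part of the original code): how the grouped actor-worker
# initialization above behaves if `max_parallel_init` is lowered. The helper name
# `init_batches` and its arguments are hypothetical; the real loop calls self.init_subset()
# on each chunk of worker indices, and with max_parallel_init = int(1e9) there is a single chunk.
def init_batches(num_workers, max_parallel_init):
    worker_indices = list(range(num_workers))
    return [worker_indices[i:i + max_parallel_init] for i in range(0, num_workers, max_parallel_init)]

# e.g. init_batches(10, 4) -> [[0, 1, 2, 3], [4, 5, 6, 7], [8, 9]]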
def __init__(self, cfg):
    super().__init__(cfg)

    self.processes = []
    self.terminate = RawValue(ctypes.c_bool, False)

    self.start_event = multiprocessing.Event()
    self.start_event.clear()

    self.report_queue = MpQueue()
    self.report_every_sec = 1.0
    self.last_report = 0

    self.avg_stats_intervals = (1, 10, 60, 300, 600)
    self.fps_stats = deque([], maxlen=max(self.avg_stats_intervals))
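# Minimal sketch (an assumption, not the original reporting code) of how the bounded
# `fps_stats` deque above can be used: append one (timestamp, total_env_steps) sample per
# `report_every_sec`, then average throughput over the last N samples for each N in
# `avg_stats_intervals`. The helper name `recent_fps` is hypothetical.
def recent_fps(fps_stats, num_intervals, now, total_env_steps):
    # fall back to the oldest available sample if fewer than num_intervals samples were collected
    idx = max(0, len(fps_stats) - 1 - num_intervals)
    past_time, past_steps = fps_stats[idx]
    return (total_env_steps - past_steps) / max(1e-9, now - past_time)

# usage sketch:
#   fps_stats.append((time.time(), total_env_steps))
#   fps_last_minute = recent_fps(fps_stats, 60, time.time(), total_env_steps)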
def __init__(self, cfg):
    super().__init__(cfg)

    # we should not use CUDA in the main thread, only on the workers
    set_global_cuda_envvars(cfg)

    tmp_env = make_env_func(self.cfg, env_config=None)
    self.obs_space = tmp_env.observation_space
    self.action_space = tmp_env.action_space
    self.num_agents = tmp_env.num_agents

    self.reward_shaping_scheme = None
    if self.cfg.with_pbt:
        if hasattr(tmp_env.unwrapped, '_reward_shaping_wrapper'):
            # noinspection PyProtectedMember
            self.reward_shaping_scheme = tmp_env.unwrapped._reward_shaping_wrapper.reward_shaping_scheme
        else:
            try:
                from envs.doom.multiplayer.doom_multiagent_wrapper import MultiAgentEnv
                if isinstance(tmp_env.unwrapped, MultiAgentEnv):
                    self.reward_shaping_scheme = tmp_env.unwrapped.default_reward_shaping
            except ImportError:
                pass

    tmp_env.close()

    # shared memory allocation
    self.traj_buffers = SharedBuffers(self.cfg, self.num_agents, self.obs_space, self.action_space)

    self.actor_workers = None

    self.report_queue = MpQueue(20 * 1000 * 1000)
    self.policy_workers = dict()
    self.policy_queues = dict()

    self.learner_workers = dict()

    self.workers_by_handle = None

    self.policy_inputs = [[] for _ in range(self.cfg.num_policies)]
    self.policy_outputs = dict()
    for worker_idx in range(self.cfg.num_workers):
        for split_idx in range(self.cfg.worker_num_splits):
            self.policy_outputs[(worker_idx, split_idx)] = dict()

    self.policy_avg_stats = dict()
    self.policy_lag = [dict() for _ in range(self.cfg.num_policies)]

    self.last_timing = dict()
    self.env_steps = dict()
    self.samples_collected = [0 for _ in range(self.cfg.num_policies)]
    self.total_env_steps_since_resume = 0

    # currently this applies only to the current run, not experiment as a whole
    # to change this behavior we'd need to save the state of the main loop to a filesystem
    self.total_train_seconds = 0

    self.last_report = time.time()
    self.last_experiment_summaries = 0

    self.report_interval = 5.0  # sec
    self.experiment_summaries_interval = self.cfg.experiment_summaries_interval  # sec

    self.avg_stats_intervals = (2, 12, 60)  # 10 seconds, 1 minute, 5 minutes

    self.fps_stats = deque([], maxlen=max(self.avg_stats_intervals))
    self.throughput_stats = [deque([], maxlen=5) for _ in range(self.cfg.num_policies)]
    self.avg_stats = dict()
    self.stats = dict()  # regular (non-averaged) stats

    self.writers = dict()
    writer_keys = list(range(self.cfg.num_policies))
    for key in writer_keys:
        summary_dir = join(summaries_dir(experiment_dir(cfg=self.cfg)), str(key))
        summary_dir = ensure_dir_exists(summary_dir)
        self.writers[key] = SummaryWriter(summary_dir, flush_secs=20)

    self.pbt = PopulationBasedTraining(self.cfg, self.reward_shaping_scheme, self.writers)
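# Worked check (arithmetic only): with report_interval = 5.0 s, the windows implied by
# avg_stats_intervals = (2, 12, 60) are 2 * 5 = 10 s, 12 * 5 = 60 s and 60 * 5 = 300 s,
# which is what the "10 seconds, 1 minute, 5 minutes" comment above refers to.
assert [n * 5.0 for n in (2, 12, 60)] == [10.0, 60.0, 300.0]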
def __init__(
    self, worker_idx, policy_id, cfg, obs_space, action_space, report_queue, policy_worker_queues,
    shared_buffers, policy_lock, resume_experience_collection_cv,
):
    log.info('Initializing the learner %d for policy %d', worker_idx, policy_id)

    self.worker_idx = worker_idx
    self.policy_id = policy_id

    self.cfg = cfg

    # PBT-related stuff
    self.should_save_model = True  # set to true if we need to save the model to disk on the next training iteration
    self.load_policy_id = None  # non-None when we need to replace our parameters with another policy's parameters
    self.pbt_mutex = threading.Lock()
    self.new_cfg = None  # non-None when we need to update the learning hyperparameters

    self.terminate = False
    self.num_batches_processed = 0

    self.obs_space = obs_space
    self.action_space = action_space

    self.rollout_tensors = shared_buffers.tensor_trajectories
    self.traj_tensors_available = shared_buffers.is_traj_tensor_available
    self.policy_versions = shared_buffers.policy_versions
    self.stop_experience_collection = shared_buffers.stop_experience_collection

    self.stop_experience_collection_num_msgs = self.resume_experience_collection_num_msgs = 0

    self.device = None
    self.actor_critic = None
    self.optimizer = None
    self.policy_lock = policy_lock
    self.resume_experience_collection_cv = resume_experience_collection_cv

    self.task_queue = MpQueue()
    self.report_queue = report_queue

    self.initialized_event = MultiprocessingEvent()
    self.initialized_event.clear()

    self.model_saved_event = MultiprocessingEvent()
    self.model_saved_event.clear()

    # queues corresponding to policy workers using the same policy
    # we send weight updates via these queues
    self.policy_worker_queues = policy_worker_queues

    self.experience_buffer_queue = Queue()

    self.tensor_batch_pool = ObjectPool()
    self.tensor_batcher = TensorBatcher(self.tensor_batch_pool)

    self.with_training = True  # set to False for debugging no-training regime
    self.train_in_background = self.cfg.train_in_background_thread  # set to False for debugging

    self.training_thread = Thread(target=self._train_loop) if self.train_in_background else None
    self.train_thread_initialized = threading.Event()

    self.is_training = False

    self.train_step = self.env_steps = 0

    # decay rate at which summaries are collected
    # save summaries every 20 seconds in the beginning, but decay to every 4 minutes in the limit, because we
    # do not need frequent summaries for longer experiments
    self.summary_rate_decay_seconds = LinearDecay([(0, 20), (100000, 120), (1000000, 240)])
    self.last_summary_time = 0

    self.last_saved_time = self.last_milestone_time = 0

    self.discarded_experience_over_time = deque([], maxlen=30)
    self.discarded_experience_timer = time.time()
    self.num_discarded_rollouts = 0

    self.process = Process(target=self._run, daemon=True)

    if is_continuous_action_space(self.action_space) and self.cfg.exploration_loss == 'symmetric_kl':
        raise NotImplementedError('KL-divergence exploration loss is not supported with '
                                  'continuous action spaces. Use entropy exploration loss')

    if self.cfg.exploration_loss_coeff == 0.0:
        self.exploration_loss_func = lambda action_distr: 0.0
    elif self.cfg.exploration_loss == 'entropy':
        self.exploration_loss_func = self.entropy_exploration_loss
    elif self.cfg.exploration_loss == 'symmetric_kl':
        self.exploration_loss_func = self.symmetric_kl_exploration_loss
    else:
        raise NotImplementedError(f'{self.cfg.exploration_loss} not supported!')
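# Sketch of the assumed LinearDecay semantics behind summary_rate_decay_seconds above: the
# summary period is presumably interpolated piecewise-linearly between the (train_step,
# seconds) milestones, i.e. 20 s at step 0, 120 s at 100k steps, 240 s from 1M steps on.
# The helper name `summary_period` is hypothetical and not part of the original code.
def summary_period(step, milestones=((0, 20), (100000, 120), (1000000, 240))):
    for (x0, y0), (x1, y1) in zip(milestones, milestones[1:]):
        if step <= x1:
            return y0 + (step - x0) / (x1 - x0) * (y1 - y0)
    return milestones[-1][1]  # past the last milestone the period stays constant

# e.g. summary_period(0) == 20.0, summary_period(50000) == 70.0, summary_period(2_000_000) == 240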