def _batch_worker(self, worker_id: int, queue: mp.JoinableQueue, lock,
                  rx: Connection) -> None:
    """Worker entry point: read instances, batch them, and send the batches to ``queue``.

    Every item placed on ``queue`` is a ``(payload, error)`` pair:

    * ``(batch, None)`` for each batch that was produced,
    * ``(None, (repr(exc), formatted_traceback))`` if reading or batching raised,
    * ``(None, None)`` as the final "this worker is finished" marker.

    Parameters
    ----------
    worker_id:
        Index of this worker; passed to ``WorkerInfo`` and to ``_safe_queue_put``.
    queue:
        Joinable queue that receives the batches.
    lock:
        Lock handed to ``Tqdm.set_lock``.
    rx:
        Receiving end of a connection; forwarded to ``_safe_queue_put``, which
        uses it to decide whether the worker should stop.
    """
    Tqdm.set_lock(lock)

    try:
        self.reader._set_worker_info(WorkerInfo(self.num_workers, worker_id))
        instance_stream = self.reader.read(self.data_path)
        batch_stream = self._instances_to_batches(
            instance_stream, move_to_device=self._worker_cuda_safe)
        for batch in batch_stream:
            if not self._safe_queue_put(worker_id, (batch, None), queue, rx):
                # The put was refused, so stop right away: no "finished"
                # marker is sent and the queue is not joined.
                return
    except Exception as err:
        error_item = (None, (repr(err), traceback.format_exc()))
        if not self._safe_queue_put(worker_id, error_item, queue, rx):
            return

    # Tell the consumer that this worker has nothing more to send.
    queue.put((None, None))

    # Block until every item put on the queue has been marked as done, so the
    # process does not exit while the queue is still in use.
    queue.join()
def run_experiment(experiment_id, experiment_directory, run_id,
                   experiment_config, agents_config, seed):
    """Set up and launch one seeded run of an experiment.

    Seeds NumPy and PyTorch, makes sure the run's results directory exists,
    builds the training schemes, algorithms and fixed agents described by the
    configuration, creates the worker processes and hands them to
    ``run_processes``.

    Parameters
    ----------
    experiment_id:
        Not used inside this function; kept so existing callers keep working.
    experiment_directory:
        Directory under which the ``run-<run_id>`` results directory is created.
    run_id:
        Identifier of this run, used to name the results directory.
    experiment_config:
        Mapping read for the keys ``'environment'``,
        ``'checkpoint_at_iterations'``, ``'benchmarking_episodes'``,
        ``'self_play_training_schemes'`` and ``'fixed_agents'``.
    agents_config:
        Passed to ``util.experiment_parsing.initialize_algorithms``.
    seed:
        Seed for ``np.random`` and ``torch``; also forwarded to
        ``create_all_initial_processes``.
    """
    np.random.seed(seed)
    torch.manual_seed(seed)

    results_path = f'{experiment_directory}/run-{run_id}'
    # makedirs(exist_ok=True) replaces the exists()/mkdir() pair: it has no
    # check-then-create race and also creates missing parent directories.
    os.makedirs(results_path, exist_ok=True)

    createNewEnvironment = EnvironmentCreationFunction(
        experiment_config['environment'])

    checkpoint_at_iterations = [
        int(i) for i in experiment_config['checkpoint_at_iterations']
    ]
    benchmarking_episodes = int(experiment_config['benchmarking_episodes'])

    training_schemes = util.experiment_parsing.initialize_training_schemes(
        experiment_config['self_play_training_schemes'])
    algorithms = util.experiment_parsing.initialize_algorithms(
        createNewEnvironment(), agents_config)
    fixed_agents = util.experiment_parsing.initialize_fixed_agents(
        experiment_config['fixed_agents'])

    training_jobs = enumerate_training_jobs(training_schemes, algorithms)

    (initial_fixed_agents_to_benchmark,
     fixed_agents_for_confusion) = preprocess_fixed_agents(
         fixed_agents, checkpoint_at_iterations)

    agent_queue = JoinableQueue()
    benchmark_queue = JoinableQueue()
    matrix_queue = JoinableQueue()

    # Queue the initial fixed agents before any process is created.
    for fixed_agent in initial_fixed_agents_to_benchmark:
        agent_queue.put(fixed_agent)

    (training_processes, mm_process, benchmark_process,
     cfm_process) = create_all_initial_processes(
         training_jobs, createNewEnvironment, checkpoint_at_iterations,
         agent_queue, benchmark_queue, matrix_queue, benchmarking_episodes,
         fixed_agents_for_confusion, results_path, seed)

    run_processes(training_processes, mm_process, benchmark_process,
                  cfm_process)
def _instance_worker(self, worker_id: int, queue: mp.JoinableQueue, lock,
                     rx: Connection) -> None:
    """Worker entry point: read instances and send them one by one to ``queue``.

    Every item placed on ``queue`` is a ``(payload, error)`` pair:

    * ``(instance, None)`` for each instance that was read,
    * ``(None, (repr(exc), formatted_traceback))`` if reading raised, or if the
      first instance failed the token-indexer check below,
    * ``(None, None)`` as the final "this worker is finished" marker.

    Parameters
    ----------
    worker_id:
        Index of this worker; passed to ``WorkerInfo`` and to ``_safe_queue_put``.
    queue:
        Joinable queue that receives the instances.
    lock:
        Lock handed to ``Tqdm.set_lock``.
    rx:
        Receiving end of a connection; forwarded to ``_safe_queue_put``, which
        uses it to decide whether the worker should stop.
    """
    Tqdm.set_lock(lock)

    try:
        self.reader._set_worker_info(WorkerInfo(self.num_workers, worker_id))
        instance_stream = self.reader.read(self.data_path)

        for position, instance in enumerate(instance_stream):
            if position == 0:
                # Only the first instance is inspected.  A TextField that
                # already carries token_indexers would have them duplicated
                # when sent across processes, so that is treated as an error.
                for field_name, field in instance.fields.items():
                    if not isinstance(field, TextField):
                        continue
                    if field._token_indexers is not None:
                        raise ValueError(
                            f"Found a TextField ({field_name}) with token_indexers already "
                            "applied, but you're using num_workers > 0 in your data loader. "
                            "Make sure your dataset reader's text_to_instance() method doesn't "
                            "add any token_indexers to the TextFields it creates. Instead, the token_indexers "
                            "should be added to the instances in the apply_token_indexers() method of your "
                            "dataset reader (which you'll have to implement if you haven't done "
                            "so already).")

            if not self._safe_queue_put(worker_id, (instance, None), queue, rx):
                # The put was refused, so stop right away: no "finished"
                # marker is sent and the queue is not joined.
                return
    except Exception as err:
        error_item = (None, (repr(err), traceback.format_exc()))
        if not self._safe_queue_put(worker_id, error_item, queue, rx):
            return

    # Tell the consumer that this worker has nothing more to send.
    queue.put((None, None))

    # Block until every item put on the queue has been marked as done, so the
    # process does not exit while the queue is still in use.
    queue.join()
def _safe_queue_put(self, worker_id: int, item: Any, queue: mp.JoinableQueue,
                    rx: Connection) -> bool:
    """Put ``item`` on ``queue``, giving up if the parent appears to be gone.

    The put is retried with a 0.1 second timeout for as long as the queue is
    full.  Before each attempt two checks are made, because the parent process
    can exit or stop consuming without cleaning up its workers (for example
    when it is killed with ``kill -9``).

    Parameters
    ----------
    worker_id:
        Index of the calling worker; only used in log messages.
    item:
        Object to place on the queue.
    queue:
        Destination queue.
    rx:
        Receiving end of the connection to the parent process.

    Returns
    -------
    bool
        ``True`` once the item is on the queue.  ``False`` if a stop condition
        was detected; in that case ``queue.cancel_join_thread()`` has been
        called and the item was not queued.
    """
    while True:
        # Primary check: the parent sent something over the connection, which
        # is its way of telling the worker to stop.  This only helps when the
        # parent had a chance to send the notification, hence the backup below.
        if rx.poll():
            logger.warning(
                "worker %d received stop message from parent, exiting now",
                worker_id)
            queue.cancel_join_thread()
            return False

        # This is the backup check: a zero-timeout select on the connection's
        # file descriptor, which the original author relies on to become
        # readable once the parent process has exited.
        # NOTE (epwalsh): this doesn't work on Mac OS X with
        # `start_method == "fork"` for some reason, i.e. the file descriptor
        # doesn't show as readable after the parent process has died.
        readable = select.select([rx.fileno()], [], [], 0)[0]
        if readable:
            logger.warning(
                "worker %d parent process has died, exiting now", worker_id)
            queue.cancel_join_thread()
            return False

        # To the best of our knowledge the parent is still alive, so try to
        # enqueue.  The short timeout keeps the liveness checks running while
        # the queue is full.
        try:
            queue.put(item, block=True, timeout=0.1)
        except Full:
            pass
        else:
            return True
def _batch_worker(self, worker_id: int, queue: mp.JoinableQueue) -> None:
    """Worker entry point: read instances, batch them, and put the batches on ``queue``.

    Every item placed on ``queue`` is a ``(payload, error)`` pair:

    * ``(batch, None)`` for each batch that was produced,
    * ``(None, (repr(exc), formatted_traceback))`` if reading or batching raised,
    * ``(None, None)`` as the final "this worker is finished" marker.

    Parameters
    ----------
    worker_id:
        Index of this worker; passed to ``WorkerInfo``.
    queue:
        Joinable queue that receives the batches.
    """
    try:
        self.reader._set_worker_info(WorkerInfo(self.num_workers, worker_id))
        instance_stream = self.reader.read(self.data_path)
        batch_stream = self._instances_to_batches(
            instance_stream, move_to_device=self._worker_cuda_safe)
        for batch in batch_stream:
            queue.put((batch, None))
    except Exception as err:
        # Report the failure to the consumer instead of letting it propagate.
        queue.put((None, (repr(err), traceback.format_exc())))

    # Tell the consumer that this worker has nothing more to send.
    queue.put((None, None))

    # Block until every item put on the queue has been marked as done, so the
    # process does not exit while the queue is still in use.
    queue.join()
def _batch_worker(self, instance_queue: mp.JoinableQueue,
                  batch_queue: mp.JoinableQueue) -> None:
    """Worker entry point: turn queued instances into chunks of batches.

    Instances are gathered from ``instance_queue``, converted to batches, and
    the batches are grouped with ``lazy_groups_of`` using
    ``self._batch_chunk_size``.  Every item placed on ``batch_queue`` is a
    ``(payload, error)`` pair:

    * ``(batch_chunk, None)`` for each group of batches,
    * ``(None, (exc, formatted_traceback))`` if anything raised (the exception
      object itself is sent, not its repr),
    * ``(None, None)`` as the final "this worker is finished" marker.

    Parameters
    ----------
    instance_queue:
        Queue handed to ``self._gather_instances``.
    batch_queue:
        Joinable queue that receives the batch chunks.
    """
    try:
        batch_stream = self._instances_to_batches(
            self._gather_instances(instance_queue))
        for chunk in lazy_groups_of(batch_stream, self._batch_chunk_size):
            batch_queue.put((chunk, None))
    except Exception as err:
        # Report the failure to the consumer instead of letting it propagate.
        batch_queue.put((None, (err, traceback.format_exc())))

    # Tell the consumer (main thread) that this worker has nothing more to send.
    batch_queue.put((None, None))

    # Block until the consumer has marked every batch chunk as done, so the
    # queue is not closed prematurely.
    batch_queue.join()
def _instance_worker(self, worker_id: int, queue: mp.JoinableQueue) -> None:
    """Worker entry point: read instances and put them on ``queue`` in chunks.

    Instances are grouped with ``lazy_groups_of`` using
    ``self._instance_chunk_size``.  Every item placed on ``queue`` is a
    ``(payload, error)`` pair:

    * ``(instances_chunk, None)`` for each group of instances,
    * ``(None, (exc, formatted_traceback))`` if reading raised, or if the first
      instance failed the token-indexer check below (the exception object
      itself is sent, not its repr),
    * ``(None, None)`` as the final "this worker is finished" marker.

    Parameters
    ----------
    worker_id:
        Index of this worker; passed to ``WorkerInfo``.
    queue:
        Joinable queue that receives the instance chunks.
    """
    try:
        self.reader._set_worker_info(WorkerInfo(self.num_workers, worker_id))
        instance_stream = self.reader.read(self.data_path)

        first_chunk_verified = False
        for instances_chunk in lazy_groups_of(instance_stream,
                                              self._instance_chunk_size):
            if not first_chunk_verified:
                # Only the very first instance is inspected.  A TextField that
                # already carries token_indexers would have them duplicated
                # when sent across processes, so that is treated as an error.
                field_name = next(
                    (name
                     for name, field in instances_chunk[0].fields.items()
                     if isinstance(field, TextField)
                     and field._token_indexers is not None),
                    None,
                )
                if field_name is not None:
                    raise ValueError(
                        f"Found a TextField ({field_name}) with token_indexers already "
                        "applied, but you're using num_workers > 0 in your data loader. "
                        "Make sure your dataset reader's text_to_instance() method doesn't "
                        "add any token_indexers to the TextFields it creates. The token_indexers "
                        "should be added to the instances in apply_token_indexers() method of your "
                        "dataset reader (which you'll have to implement if you haven't done "
                        "so already).")
                first_chunk_verified = True

            queue.put((instances_chunk, None))
    except Exception as err:
        # Report the failure to the consumer instead of letting it propagate.
        queue.put((None, (err, traceback.format_exc())))

    # Tell the consumer that this worker has nothing more to send.
    queue.put((None, None))

    # Block until the consumer has marked every chunk as done, so the queue is
    # not closed prematurely.
    queue.join()