    def _batch_worker(self, worker_id: int, queue: mp.JoinableQueue, lock,
                      rx: Connection) -> None:
        Tqdm.set_lock(lock)
        try:
            self.reader._set_worker_info(
                WorkerInfo(self.num_workers, worker_id))
            instances = self.reader.read(self.data_path)
            for batch in self._instances_to_batches(
                    instances, move_to_device=self._worker_cuda_safe):
                if self._safe_queue_put(worker_id, (batch, None), queue, rx):
                    continue
                else:
                    # Couldn't put item on queue because parent process has exited.
                    return
        except Exception as e:
            if not self._safe_queue_put(
                    worker_id,
                (None, (repr(e), traceback.format_exc())), queue, rx):
                return

        # Indicate to the consumer (main thread) that this worker is finished.
        queue.put((None, None))

        # Wait until this process can safely exit.
        queue.join()
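The (item, error) tuples and the final (None, None) sentinel imply a matching consumer loop on the parent side. The following is only a rough sketch of what that loop has to do (the real one lives in the data loader's iterator); `queue` and `num_workers` are assumed to be set up elsewhere:

def _consume(queue, num_workers):
    # Illustrative parent-side consumer, not the library's actual code.
    done_count = 0
    while done_count < num_workers:
        batch, error = queue.get()
        try:
            if error is not None:
                # A worker failed: error is (repr(exception), formatted traceback).
                e, tb = error
                raise RuntimeError(f"worker error: {e}\n{tb}")
            if batch is None:
                # (None, None) sentinel: one worker has finished.
                done_count += 1
            else:
                yield batch
        finally:
            # task_done() is what eventually unblocks the worker's queue.join().
            queue.task_done()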
Example #2
def run_experiment(experiment_id, experiment_directory, run_id,
                   experiment_config, agents_config, seed):
    np.random.seed(seed)
    torch.manual_seed(seed)

    results_path = f'{experiment_directory}/run-{run_id}'
    if not os.path.exists(results_path):
        os.mkdir(results_path)
    base_path = results_path

    createNewEnvironment = EnvironmentCreationFunction(
        experiment_config['environment'])

    checkpoint_at_iterations = [
        int(i) for i in experiment_config['checkpoint_at_iterations']
    ]
    benchmarking_episodes = int(experiment_config['benchmarking_episodes'])

    training_schemes = util.experiment_parsing.initialize_training_schemes(
        experiment_config['self_play_training_schemes'])
    algorithms = util.experiment_parsing.initialize_algorithms(
        createNewEnvironment(), agents_config)
    fixed_agents = util.experiment_parsing.initialize_fixed_agents(
        experiment_config['fixed_agents'])

    training_jobs = enumerate_training_jobs(training_schemes, algorithms)

    (initial_fixed_agents_to_benchmark,
     fixed_agents_for_confusion) = preprocess_fixed_agents(
         fixed_agents, checkpoint_at_iterations)
    agent_queue = JoinableQueue()
    benchmark_queue = JoinableQueue()
    matrix_queue = JoinableQueue()

    for fixed_agent in initial_fixed_agents_to_benchmark:
        agent_queue.put(fixed_agent)

    (training_processes, mm_process,
     benchmark_process, cfm_process) = create_all_initial_processes(
         training_jobs, createNewEnvironment, checkpoint_at_iterations,
         agent_queue, benchmark_queue, matrix_queue, benchmarking_episodes,
         fixed_agents_for_confusion, results_path, seed)

    run_processes(training_processes, mm_process, benchmark_process,
                  cfm_process)
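A minimal invocation sketch follows; only the config keys read by run_experiment above are real, while every concrete value, scheme name, and agent name is made up for illustration:

# All values below are hypothetical; run_experiment only requires these keys.
experiment_config = {
    'environment': 'RockPaperScissors-v0',              # assumed environment id
    'checkpoint_at_iterations': [100, 500, 1000],
    'benchmarking_episodes': 10,
    'self_play_training_schemes': ['naive_self_play'],  # assumed scheme name
    'fixed_agents': ['random'],                          # assumed agent name
}
agents_config = {'ppo': {'learning_rate': 3.0e-4}}       # assumed agent config

# experiment_directory must already exist; run_experiment creates run-<run_id> inside it.
run_experiment(experiment_id='example', experiment_directory='./experiments',
               run_id=0, experiment_config=experiment_config,
               agents_config=agents_config, seed=42)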
    def _instance_worker(self, worker_id: int, queue: mp.JoinableQueue, lock,
                         rx: Connection) -> None:
        Tqdm.set_lock(lock)
        try:
            self.reader._set_worker_info(
                WorkerInfo(self.num_workers, worker_id))
            instances = self.reader.read(self.data_path)
            checked_for_token_indexers: bool = False
            for instance in instances:
                # Check the first instance to make sure it doesn't contain any TextFields with
                # token_indexers because we don't want to be duplicating those by sending
                # them across processes.
                if not checked_for_token_indexers:
                    for field_name, field in instance.fields.items():
                        if isinstance(field, TextField
                                      ) and field._token_indexers is not None:
                            raise ValueError(
                                f"Found a TextField ({field_name}) with token_indexers already "
                                "applied, but you're using num_workers > 0 in your data loader. "
                                "Make sure your dataset reader's text_to_instance() method doesn't "
                                "add any token_indexers to the TextFields it creates. Instead, the token_indexers "
                                "should be added to the instances in the apply_token_indexers() method of your "
                                "dataset reader (which you'll have to implement if you haven't done "
                                "so already).")
                    checked_for_token_indexers = True
                if self._safe_queue_put(worker_id, (instance, None), queue,
                                        rx):
                    continue
                else:
                    # Couldn't put item on queue because parent process has exited.
                    return
        except Exception as e:
            if not self._safe_queue_put(
                    worker_id,
                (None, (repr(e), traceback.format_exc())), queue, rx):
                return

        # Indicate to the consumer that this worker is finished.
        queue.put((None, None))

        # Wait until this process can safely exit.
        queue.join()
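The ValueError above points at the intended pattern: leave token_indexers off the TextFields built in text_to_instance() and attach them in apply_token_indexers(). A minimal sketch of a reader following that pattern (the class, field name, and indexer choice are illustrative, assuming AllenNLP 2.x-style imports):

from allennlp.data import DatasetReader, Instance
from allennlp.data.fields import TextField
from allennlp.data.token_indexers import SingleIdTokenIndexer
from allennlp.data.tokenizers import WhitespaceTokenizer


class ExampleReader(DatasetReader):  # hypothetical reader, for illustration only
    def __init__(self, **kwargs):
        super().__init__(**kwargs)
        self._tokenizer = WhitespaceTokenizer()
        self._token_indexers = {"tokens": SingleIdTokenIndexer()}

    def _read(self, file_path):
        with open(file_path) as f:
            for line in f:
                yield self.text_to_instance(line.strip())

    def text_to_instance(self, text: str) -> Instance:
        # No token_indexers here, so instances stay cheap to send across processes.
        return Instance({"tokens": TextField(self._tokenizer.tokenize(text))})

    def apply_token_indexers(self, instance: Instance) -> None:
        # Indexers are attached later, in the main process.
        instance.fields["tokens"].token_indexers = self._token_indexers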
    def _safe_queue_put(self, worker_id: int, item: Any,
                        queue: mp.JoinableQueue, rx: Connection) -> bool:
        while True:
            # First we have to check to make sure the parent process is still alive
            # and consuming from the queue, because there are circumstances where the
            # parent process can exit or stop consuming without automatically cleaning up
            # its children (the workers).
            # For example, when the parent process is killed with `kill -9`.
            # So the first thing we do is check to see if the parent has notified
            # us (the worker) to stop through the rx (receiver) connection.
            # Of course this only works if the parent was able to send out a notification,
            # which may not always be the case. So we have a backup check below.
            if rx.poll():
                logger.warning(
                    "worker %d received stop message from parent, exiting now",
                    worker_id)
                queue.cancel_join_thread()
                return False
            # This is the backup check.
            # The file descriptor associated with the rx (receiver) connection will
            # be readable if and only if the parent process has exited.
            # NOTE (epwalsh): this doesn't work on Mac OS X with `start_method == "fork"`
            # for some reason, i.e. the file descriptor doesn't show as readable
            # after the parent process has died.
            fds, _, _ = select.select([rx.fileno()], [], [], 0)
            if fds:
                logger.warning(
                    "worker %d parent process has died, exiting now",
                    worker_id)
                queue.cancel_join_thread()
                return False
            # If we're down here, the parent process is still alive to the best of our
            # knowledge, so we can continue putting things on the queue.
            try:
                queue.put(item, True, 0.1)
                return True
            except Full:
                continue
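How rx gets wired up is not shown in this snippet. A plausible parent-side sketch (an assumption about the wiring, not the library's actual code) is to create one Pipe per worker, hand the read end to the worker, and send anything over the write end when workers should stop early:

import multiprocessing as mp

# Assumed setup: `loader` is the data loader instance whose methods are shown above.
ctx = mp.get_context("spawn")
queue = ctx.JoinableQueue(maxsize=100)
rx, tx = ctx.Pipe(duplex=False)   # rx goes to the worker, tx stays with the parent

worker = ctx.Process(target=loader._batch_worker,
                     args=(0, queue, ctx.RLock(), rx))
worker.start()

# ... consume batches from `queue` ...

# If the parent needs to shut down early, any message makes rx.poll() return True
# inside _safe_queue_put, so the worker exits cleanly:
tx.send("stop")
worker.join()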
Example #5
    def _batch_worker(self, worker_id: int, queue: mp.JoinableQueue) -> None:
        try:
            self.reader._set_worker_info(
                WorkerInfo(self.num_workers, worker_id))
            instances = self.reader.read(self.data_path)
            for batch in self._instances_to_batches(
                    instances, move_to_device=self._worker_cuda_safe):
                queue.put((batch, None))
        except Exception as e:
            queue.put((None, (repr(e), traceback.format_exc())))

        # Indicate to the consumer (main thread) that this worker is finished.
        queue.put((None, None))

        # Wait until this process can safely exit.
        queue.join()
Example #6
    def _batch_worker(self, instance_queue: mp.JoinableQueue,
                      batch_queue: mp.JoinableQueue) -> None:
        try:
            for batch_chunk in lazy_groups_of(
                    self._instances_to_batches(
                        self._gather_instances(instance_queue)),
                    self._batch_chunk_size,
            ):
                batch_queue.put((batch_chunk, None))
        except Exception as e:
            batch_queue.put((None, (e, traceback.format_exc())))

        # Indicate to the consumer (main thread) that this worker is finished.
        batch_queue.put((None, None))

        # Wait for the consumer (in the main process) to finish receiving all batch groups
        # to avoid prematurely closing the queue.
        batch_queue.join()
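lazy_groups_of lazily yields lists of at most group_size items from an iterable (in AllenNLP it lives in allennlp.common.util). A self-contained sketch of the behaviour the code above relies on:

from itertools import islice
from typing import Iterable, Iterator, List, TypeVar

T = TypeVar("T")


def lazy_groups_of(iterable: Iterable[T], group_size: int) -> Iterator[List[T]]:
    # Yield successive lists of up to group_size items without materializing
    # the whole iterable; the final group may be shorter.
    iterator = iter(iterable)
    while True:
        group = list(islice(iterator, group_size))
        if not group:
            return
        yield group


# e.g. list(lazy_groups_of(range(5), 2)) == [[0, 1], [2, 3], [4]]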
Example #7
    def _instance_worker(self, worker_id: int,
                         queue: mp.JoinableQueue) -> None:
        try:
            self.reader._set_worker_info(
                WorkerInfo(self.num_workers, worker_id))

            instances = self.reader.read(self.data_path)
            checked_for_token_indexers: bool = False

            for instances_chunk in lazy_groups_of(instances,
                                                  self._instance_chunk_size):
                # Check the first instance to make sure it doesn't contain any TextFields with
                # token_indexers because we don't want to be duplicating those by sending
                # them across processes.
                if not checked_for_token_indexers:
                    for field_name, field in instances_chunk[0].fields.items():
                        if isinstance(field, TextField
                                      ) and field._token_indexers is not None:
                            raise ValueError(
                                f"Found a TextField ({field_name}) with token_indexers already "
                                "applied, but you're using num_workers > 0 in your data loader. "
                                "Make sure your dataset reader's text_to_instance() method doesn't "
                                "add any token_indexers to the TextFields it creates. The token_indexers "
                                "should be added to the instances in apply_token_indexers() method of your "
                                "dataset reader (which you'll have to implement if you haven't done "
                                "so already).")
                    checked_for_token_indexers = True
                queue.put((instances_chunk, None))
        except Exception as e:
            queue.put((None, (e, traceback.format_exc())))

        # Indicate to the consumer that this worker is finished.
        queue.put((None, None))

        # Wait for consumer to finish to avoid prematurely closing the queue.
        queue.join()