Example 1
import pytest
import ray
from ray.exceptions import RayActorError
from ray.util.queue import Queue


def test_shutdown(ray_start_regular_shared):
    q = Queue()
    actor = q.actor
    q.shutdown()
    # Shutdown kills the backing actor and clears the Queue's handle.
    assert q.actor is None
    with pytest.raises(RayActorError):
        ray.get(actor.empty.remote())
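The test above only exercises teardown. For context, a minimal sketch of the ordinary Queue lifecycle being torn down (assuming ray.init() has already run; the values and maxsize are purely illustrative):

from ray.util.queue import Queue

q = Queue(maxsize=2)          # bounded queue backed by a dedicated Ray actor
q.put("a")                    # blocking put
q.put_nowait("b")             # non-blocking put; raises Full once at maxsize
assert q.get() == "a"         # blocking get, FIFO order
assert q.get_nowait() == "b"  # non-blocking get; raises Empty when empty
q.shutdown()                  # kills the backing actor, as the test asserts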
Example 2
    def execution_loop(self, trainer, tune_enabled: bool = True):
        """Main execution loop for training, testing, & prediction.

        Sets up the torch.distributed process group for each
        worker. Then trigger remote training/testing/eval via
        ``train_remote`` on each worker. If using with Ray Tune, create a
        communication queue to retrieve intermediate results, and process
        those results. Finally retrieve the training results from the rank 0
        worker and return."""

        # Sets environment variables for all workers.
        self._setup_env_vars()

        self.global_to_local = self.get_local_ranks()

        model = self._model
        model_ref = ray.put(model)
        # Don't pickle the model when training remotely.
        self._model = None

        queue = None
        if tune_enabled and TUNE_INSTALLED and is_session_enabled():
            # Create communication queue and send to all the workers.
            queue = Queue(actor_options={"num_cpus": 0})

        futures = [
            self.workers[i].execute.remote(self.execute_remote, model_ref, i,
                                           queue)
            for i in range(self.num_workers)
        ]

        results = process_results(futures, queue)
        # Get the results, checkpoint path, and model weights from worker 0.
        results, best_path, state_stream = results[0]
        state_dict = load_state_stream(state_stream, to_gpu=self.use_gpu)
        # Set the state for PTL using the output from remote training.
        self._results = results
        self._model = model
        self._model.load_state_dict(state_dict)
        if self.lightning_module.trainer.checkpoint_callback:
            self.lightning_module.trainer.checkpoint_callback \
                .best_model_path = best_path

        if queue:
            # Shutdown the queue.
            queue.shutdown()

        return results
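The queue created here is the driver-worker channel the docstring describes. A minimal sketch of that reporting pattern, with worker_loop and drain as illustrative names rather than ray_lightning internals:

from ray.util.queue import Queue

def worker_loop(queue, rank):
    # Workers push intermediate metrics; process_results (not shown)
    # drains the queue on the driver and reports rank 0's values to Tune.
    for step in range(3):
        if queue is not None and rank == 0:
            queue.put({"step": step, "loss": 1.0 / (step + 1)})

def drain(queue):
    # Driver side: poll the queue while the training futures are pending.
    while not queue.empty():
        print(queue.get_nowait())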
Example 3
def test_custom_resources(ray_start_regular_shared):
    current_resources = ray.available_resources()
    assert current_resources["CPU"] == 1.0

    # By default an actor should not reserve any resources.
    q = Queue()
    current_resources = ray.available_resources()
    assert current_resources["CPU"] == 1.0
    q.shutdown()

    # Specify resource requirement. The queue should now reserve 1 CPU.
    q = Queue(actor_options={"num_cpus": 1})

    def no_cpu_in_resources():
        return "CPU" not in ray.available_resources()

    wait_for_condition(no_cpu_in_resources)
    q.shutdown()
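Since actor_options is forwarded to the backing actor (via .options() in current Ray), any actor scheduling keyword can be passed, not just num_cpus. A small sketch with illustrative values:

from ray.util.queue import Queue

# A zero-CPU queue that restarts on actor crashes; Example 5 below uses
# the same mechanism ("resources") to pin the queue to the driver's node.
q = Queue(actor_options={"num_cpus": 0, "max_restarts": -1})
q.shutdown()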
Example 4
    def start_training(self, trainer):
        """Main training loop.

        Trigger remote training via ``train_remote`` on each
        worker. If using with Ray Tune, create a communication queue to
        retrieve intermediate results, and process those results. Finally
        retrieve the training results from the rank 0 worker and return."""
        model = self._model
        model_ref = ray.put(model)
        # Don't pickle the model when training remotely.
        self._model = None

        queue = None
        if TUNE_INSTALLED and is_session_enabled():
            # Create communication queue and send to all the workers.
            queue = Queue(actor_options={"num_cpus": 0})

        result_futures = self.executor.run_remote(self.train_remote,
                                                  args=[model_ref, queue])

        results = process_results(result_futures, queue)

        results, state_dict, best_path = results[0]
        self._results = results
        self._model = model
        self._model.load_state_dict(state_dict)
        self._model.trainer.accelerator.training_type_plugin = self
        if self.lightning_module.trainer.checkpoint_callback:
            self.lightning_module.trainer.checkpoint_callback \
                .best_model_path = best_path

        if queue:
            # Shutdown the queue.
            queue.shutdown()

        return results
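Both training loops share the same broadcast trick: ray.put() stores the model in the object store once, and only a lightweight ObjectRef travels to each worker. A standalone sketch of that pattern (train_task is an illustrative name):

import ray

@ray.remote
def train_task(model, rank):
    # Ray resolves the top-level ObjectRef argument, so each task receives
    # the deserialized model without the driver re-pickling it per task.
    return rank, type(model).__name__

ray.init()
model = {"weights": [0.0] * 1000}  # stand-in for a real model
model_ref = ray.put(model)         # serialize once into the object store
print(ray.get([train_task.remote(model_ref, i) for i in range(4)]))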
Example 5
    def run(self,
            worker_fn: Callable,
            callbacks: Optional[List[Callable]] = None) -> List[Any]:
        """Executes the provided function on all workers.

        Args:
            worker_fn: Target elastic function that can be executed.
            callbacks: List of callables. Each callback must either
                be a callable function or a class that implements __call__.
                Every callback will be invoked on every value logged
                by the rank 0 worker.

        Returns:
            List of return values from every completed worker.
        """
        return_values = []
        from ray.util.queue import Queue
        import inspect
        args = inspect.getfullargspec(Queue).args
        if "actor_options" not in args:
            # Ray 1.1 and less
            _queue = Queue()
        else:
            _queue = Queue(actor_options={
                "num_cpus": 0,
                "resources": {
                    ray.state.current_node_id(): 0.001
                }
            })
        self.driver.start(
            self.settings.num_proc,
            self._create_spawn_worker_fn(return_values, worker_fn, _queue))

        def _process_calls(queue, callbacks, event):
            if not callbacks:
                return
            while queue.actor:
                if not queue.empty():
                    result = queue.get_nowait()
                    for c in callbacks:
                        c(result)
                elif event.is_set():
                    break
                # Sleep between polls to avoid hammering the queue actor.
                time.sleep(0.1)

        try:
            event = threading.Event()
            _callback_thread = threading.Thread(target=_process_calls,
                                                args=(_queue, callbacks,
                                                      event),
                                                daemon=True)
            _callback_thread.start()
            res = self.driver.get_results()
            event.set()
            _callback_thread.join(timeout=60)
        finally:
            if hasattr(_queue, "shutdown"):
                _queue.shutdown()
            else:
                # Older Ray without Queue.shutdown(): ask the actor to
                # terminate, escalating to ray.kill() if it doesn't exit.
                done_ref = _queue.actor.__ray_terminate__.remote()
                done, not_done = ray.wait([done_ref], timeout=5)
                if not_done:
                    ray.kill(_queue.actor)
        self.driver.stop()

        if res.error_message is not None:
            raise RuntimeError(res.error_message)

        for name, value in sorted(res.worker_results.items(),
                                  key=lambda item: item[1][1]):
            exit_code, timestamp = value
            if exit_code != 0:
                raise RuntimeError(
                    'Horovod detected that one or more processes '
                    'exited with non-zero '
                    'status, thus causing the job to be terminated. '
                    'The first process '
                    'to do so was:\nProcess name: {name}\nExit code: {code}\n'.
                    format(name=name, code=exit_code))

        return_values = [
            value for k, value in sorted(return_values, key=lambda kv: kv[0])
        ]
        return return_values
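A callback for run() just has to be callable. A minimal sketch (PrintLogger, executor, and my_worker_fn are illustrative names):

class PrintLogger:
    """Receives every value logged by the rank 0 worker."""

    def __call__(self, value):
        print("rank 0 logged:", value)

# results = executor.run(my_worker_fn, callbacks=[PrintLogger()])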