Example #1
    def test_shutdown_on_initial_discovery_failure(self):
        """Tests that the driver will shutdown immediately if initial host discovery fails."""
        discovery = mock.Mock()
        discovery.find_available_hosts_and_slots.side_effect = RuntimeError()

        discover_hosts = ElasticDriver._discover_hosts

        def wrapped_discover_hosts(obj):
            try:
                discover_hosts(obj)
            except RuntimeError:
                # Suppress the error message from the background discovery thread to clean up unit tests
                pass

        try:
            ElasticDriver._discover_hosts = wrapped_discover_hosts
            driver = ElasticDriver(mock.Mock(), discovery, min_np=2, max_np=4)
            with pytest.raises(RuntimeError):
                driver.wait_for_available_slots(min_np=2)
            assert driver.finished()
        finally:
            ElasticDriver._discover_hosts = discover_hosts
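
The test above uses a patch-and-restore pattern: ElasticDriver._discover_hosts is temporarily swapped for a wrapper that swallows the RuntimeError raised on the background discovery thread, and the original method is put back in a finally block. Below is a minimal sketch of the same idea using unittest.mock.patch.object, which restores the original attribute automatically; the Worker class and flaky_call method are hypothetical stand-ins, not part of the Horovod code above.

from unittest import mock


class Worker:
    """Hypothetical stand-in for a class whose method fails on a background thread."""

    def flaky_call(self):
        raise RuntimeError("discovery failed")

    def run(self):
        self.flaky_call()


original_flaky_call = Worker.flaky_call


def quiet_flaky_call(self):
    # Swallow the error so it does not pollute test output,
    # mirroring wrapped_discover_hosts in the test above.
    try:
        original_flaky_call(self)
    except RuntimeError:
        pass


# patch.object puts the original method back when the block exits,
# replacing the manual try/finally bookkeeping used in the test.
with mock.patch.object(Worker, "flaky_call", quiet_flaky_call):
    Worker().run()  # the RuntimeError is suppressed while the patch is active
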
Example #2
class ElasticAdapter(Adapter):
    """Adapter for executing Ray calls for elastic Horovod jobs.

    Args:
        settings (horovod.Settings): Configuration for job setup. You can
            use a standard Horovod Settings object or create one directly
            from RayExecutor.create_settings.
        min_workers (int): Minimum number of processes running for
            training to continue. If number of available processes dips
            below this threshold, then training will wait for
            more instances to become available.
        max_workers (int): Maximum number of training processes,
            beyond which no additional processes will be created.
            If not specified, then will be unbounded.
        reset_limit (int): Maximum number of times that the training
            job can scale up or down the number of workers; once this
            limit is exceeded, the job is terminated.
        cooldown_range (Tuple[int, int]): Range (in seconds) that a failing
            host will remain in the blacklist.
            Example: cooldown_range=(10, 100)
            This sets the minimum cooldown period to 10 seconds,
            and the maximum cooldown period to 100 seconds.
        elastic_timeout (int): Timeout for elastic initialisation after
            re-scaling the cluster. The default value is 600 seconds.
            Alternatively, the environment variable
            HOROVOD_ELASTIC_TIMEOUT can also be used.
        cpus_per_worker (int): Number of CPU resources to allocate to
            each worker.
        use_gpu (bool): Whether to use GPU for allocation. TODO: this
            can be removed.
        gpus_per_worker (int): Number of GPU resources to allocate to
            each worker.
        override_discovery (bool): Whether the ElasticRayExecutor should
            automatically provide a discovery mechanism for ElasticSettings.

    """
    def __init__(self,
                 settings,
                 min_workers: int,
                 max_workers: Optional[int] = None,
                 use_gpu: bool = False,
                 cpus_per_worker: int = 1,
                 gpus_per_worker: Optional[int] = None,
                 override_discovery: bool = True,
                 reset_limit: int = None,
                 cooldown_range: Optional[Tuple[int, int]] = None,
                 elastic_timeout: int = 600):
        self.settings = settings
        if override_discovery:
            settings.discovery = RayHostDiscovery(
                use_gpu=use_gpu,
                cpus_per_worker=cpus_per_worker,
                gpus_per_worker=gpus_per_worker)
        self.cpus_per_worker = cpus_per_worker
        self.gpus_per_worker = gpus_per_worker
        self.use_gpu = use_gpu
        # moved from settings
        self.min_workers = min_workers
        self.max_workers = max_workers
        self.num_workers = min_workers
        self.reset_limit = reset_limit
        self.cooldown_range = cooldown_range
        self.elastic_timeout = elastic_timeout
        self.driver = None
        self.rendezvous = None

    def start(self,
              executable_cls: type = None,
              executable_args: Optional[List] = None,
              executable_kwargs: Optional[Dict] = None,
              extra_env_vars: Optional[Dict] = None):
        """Starts the Horovod driver and services.

        Args:
            executable_cls (type): The class that will be created within
                an actor (BaseHorovodWorker). This will allow Horovod
                to establish its connections and set env vars.
            executable_args (List): Arguments to be passed into the
                worker class upon initialization.
            executable_kwargs (Dict): Keyword arguments to be passed into the
                worker class upon initialization.
            extra_env_vars (Dict): Environment variables to be set
                on the actors (worker processes) before initialization.

        """

        self.rendezvous = RendezvousServer(self.settings.verbose)
        self.driver = ElasticDriver(rendezvous=self.rendezvous,
                                    discovery=self.settings.discovery,
                                    min_np=self.min_workers,
                                    max_np=self.max_workers,
                                    timeout=self.elastic_timeout,
                                    reset_limit=self.reset_limit,
                                    cooldown_range=self.cooldown_range,
                                    verbose=self.settings.verbose)
        handler = create_rendezvous_handler(self.driver)
        logger.debug("[ray] starting rendezvous")
        global_rendezv_port = self.rendezvous.start(handler)

        logger.debug(f"[ray] waiting for {self.num_workers} to start.")
        self.driver.wait_for_available_slots(self.num_workers)

        # Host-to-host common interface detection
        # requires at least 2 hosts in an elastic job.
        min_hosts = _get_min_start_hosts(self.settings)
        current_hosts = self.driver.wait_for_available_slots(
            self.num_workers, min_hosts=min_hosts)
        logger.debug("[ray] getting common interfaces")
        nics = detect_nics(
            self.settings,
            all_host_names=current_hosts.host_assignment_order,
        )
        logger.debug("[ray] getting driver IP")
        server_ip = socket.gethostbyname(socket.gethostname())
        self.run_env_vars = create_run_env_vars(server_ip,
                                                nics,
                                                global_rendezv_port,
                                                elastic=True)

        self.executable_cls = executable_cls
        self.executable_args = executable_args
        self.executable_kwargs = executable_kwargs
        self.env_vars = extra_env_vars or {}

    def _create_resources(self, hostname: str):
        resources = dict(num_cpus=self.cpus_per_worker,
                         num_gpus=int(self.use_gpu) * self.gpus_per_worker,
                         resources={f"node:{hostname}": 0.01})
        return resources

    def _create_remote_worker(self, slot_info, worker_env_vars):
        hostname = slot_info.hostname
        loaded_worker_cls = self.remote_worker_cls.options(
            **self._create_resources(hostname))

        worker = loaded_worker_cls.remote()
        worker.update_env_vars.remote(worker_env_vars)
        worker.update_env_vars.remote(create_slot_env_vars(slot_info))
        if self.use_gpu:
            visible_devices = ",".join(
                [str(i) for i in range(slot_info.local_size)])
            worker.update_env_vars.remote(
                {"CUDA_VISIBLE_DEVICES": visible_devices})
        return worker

    def _create_spawn_worker_fn(self, return_results: List,
                                worker_fn: Callable,
                                queue: "ray.util.Queue") -> Callable:
        self.remote_worker_cls = ray.remote(BaseHorovodWorker)
        # event = register_shutdown_event()
        worker_env_vars = {}
        worker_env_vars.update(self.run_env_vars.copy())
        worker_env_vars.update(self.env_vars.copy())
        worker_env_vars.update({"PYTHONUNBUFFERED": "1"})

        def worker_loop(slot_info, events):
            def ping_worker(worker):
                # There is an odd edge case where a node can be removed
                # before the remote worker is started, leading to a failure
                # in trying to create the horovod mesh.
                try:
                    ping = worker.execute.remote(lambda _: 1)
                    ray.get(ping, timeout=10)
                except Exception as e:
                    logger.error(f"{slot_info.hostname}: Ping failed - {e}")
                    return False
                return True

            worker = self._create_remote_worker(slot_info, worker_env_vars)
            if not ping_worker(worker):
                return 1, time.time()

            ray.get(worker.set_queue.remote(queue))
            future = worker.execute.remote(worker_fn)

            result = None
            while result is None:
                try:
                    #  TODO: make this event driven at some point.
                    retval = ray.get(future, timeout=0.1)
                    return_results.append((slot_info.rank, retval))
                    # Success
                    result = 0, time.time()
                except GetTimeoutError:
                    # Timeout
                    if any(e.is_set() for e in events):
                        ray.kill(worker)
                        result = 1, time.time()
                except Exception as e:
                    logger.error(f"{slot_info.hostname}[{slot_info.rank}]:{e}")
                    ray.kill(worker)
                    result = 1, time.time()
            logger.debug(f"Worker ({slot_info}) routine is done!")
            return result

        return worker_loop

    def run(self,
            fn: Callable[[Any], Any],
            args: Optional[List] = None,
            kwargs: Optional[Dict] = None,
            callbacks: Optional[List[Callable]] = None) -> List[Any]:
        """Executes the provided function on all workers.

        Args:
            fn: Target function that can be executed with arbitrary
                args and keyword arguments.
            args: List of arguments to be passed into the target function.
            kwargs: Dictionary of keyword arguments to be
                passed into the target function.
            callbacks: List of callables. Each callback must either
                be a callable function or a class that implements __call__.
                Every callback will be invoked on every value logged
                by the rank 0 worker.

        Returns:
            Deserialized return values from the target function.
        """
        args = args or []
        kwargs = kwargs or {}
        f = lambda _: fn(*args, **kwargs)
        return self._run_remote(f, callbacks=callbacks)

    def _run_remote(self,
                    worker_fn: Callable,
                    callbacks: Optional[List[Callable]] = None) -> List[Any]:
        """Executes the provided function on all workers.

        Args:
            worker_fn: Target elastic function that can be executed.
            callbacks: List of callables. Each callback must either
                be a callable function or a class that implements __call__.
                Every callback will be invoked on every value logged
                by the rank 0 worker.

        Returns:
            List of return values from every completed worker.
        """
        return_values = []
        from ray.util.queue import Queue
        import inspect
        args = inspect.getfullargspec(Queue).args
        if "actor_options" not in args:
            # Ray 1.1 and less
            _queue = Queue()
        else:
            _queue = Queue(actor_options={
                "num_cpus": 0,
                "resources": {
                    ray.state.current_node_id(): 0.001
                }
            })
        self.driver.start(
            self.num_workers,
            self._create_spawn_worker_fn(return_values, worker_fn, _queue))

        def _process_calls(queue, callbacks, event):
            if not callbacks:
                return
            while queue.actor:
                if not queue.empty():
                    result = queue.get_nowait()
                    for c in callbacks:
                        c(result)
                elif event.is_set():
                    break
                # avoid slamming the CI
                time.sleep(0.1)

        try:
            event = threading.Event()
            _callback_thread = threading.Thread(target=_process_calls,
                                                args=(_queue, callbacks,
                                                      event),
                                                daemon=True)
            _callback_thread.start()
            res = self.driver.get_results()
            event.set()
            if _callback_thread:
                _callback_thread.join(timeout=60)
        finally:
            if hasattr(_queue, "shutdown"):
                _queue.shutdown()
            else:
                done_ref = _queue.actor.__ray_terminate__.remote()
                done, not_done = ray.wait([done_ref], timeout=5)
                if not_done:
                    ray.kill(_queue.actor)
        self.driver.stop()

        if res.error_message is not None:
            raise RuntimeError(res.error_message)

        for name, value in sorted(res.worker_results.items(),
                                  key=lambda item: item[1][1]):
            exit_code, timestamp = value
            if exit_code != 0:
                raise RuntimeError(
                    'Horovod detected that one or more processes '
                    'exited with non-zero '
                    'status, thus causing the job to be terminated. '
                    'The first process '
                    'to do so was:\nProcess name: {name}\nExit code: {code}\n'.
                    format(name=name, code=exit_code))

        return_values = [
            value for k, value in sorted(return_values, key=lambda kv: kv[0])
        ]
        return return_values

    def run_remote(self, fn: Callable[[Any], Any]) -> List[Any]:
        raise NotImplementedError(
            "ObjectRefs cannot be returned from Elastic runs as the workers are ephemeral"
        )

    def execute(self,
                fn: Callable[["executable_cls"], Any],
                callbacks: Optional[List[Callable]] = None) -> List[Any]:
        """Executes the provided function on all workers.

        Args:
            fn: Target function to be invoked on every object.
            callbacks: List of callables. Each callback must either
                be a callable function or a class that implements __call__.
                Every callback will be invoked on every value logged
                by the rank 0 worker.
        Returns:
            Deserialized return values from the target function.
        """
        return ray.get(self._run_remote(fn, callbacks=callbacks))

    def execute_single(self, fn: Callable[["executable_cls"],
                                          Any]) -> List[Any]:
        """Executes the provided function on the rank 0 worker (chief).

        Args:
            fn: Target function to be invoked on the chief object.

        Returns:
            Deserialized return values from the target function.
        """
        raise NotImplementedError(
            "Elastic mode does not support execute_single. Please use the execute method instead"
        )

    def shutdown(self):
        """Destroys the driver."""
        if not self.driver:
            return
        assert self.driver.finished()
        self.driver = None
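
For context, a minimal usage sketch of the ElasticAdapter lifecycle (start, run, shutdown), assuming a running Ray cluster and that RayExecutor.create_settings produces a suitable settings object as the docstring above suggests; train_fn is a hypothetical placeholder rather than part of the code shown.

import ray
from horovod.ray import RayExecutor


def train_fn():
    # Hypothetical placeholder for an elastic Horovod training loop.
    return 42


ray.init(address="auto")  # connect to an existing Ray cluster
settings = RayExecutor.create_settings(timeout_s=30)

adapter = ElasticAdapter(settings,
                         min_workers=2,
                         max_workers=4,
                         use_gpu=False,
                         cpus_per_worker=1)
adapter.start()                   # starts the driver, rendezvous server, and NIC detection
results = adapter.run(train_fn)   # one return value per completed worker
adapter.shutdown()

Note that run_remote and execute_single deliberately raise NotImplementedError in elastic mode, so run (or execute) is the intended entry point for user functions.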