class BackendExecutor:
    """Executes a training function on a group of workers.

    Args:
        backend_config: Configuration for the training backend.
        num_workers: Number of workers to create in the worker group.
        num_cpus_per_worker: CPUs to reserve for each worker.
        num_gpus_per_worker: GPUs to reserve for each worker.
    """

    def __init__(self,
                 backend_config: BackendConfig,
                 num_workers: int = 1,
                 num_cpus_per_worker: int = 1,
                 num_gpus_per_worker: int = 0):
        self._backend_config = backend_config
        self._num_workers = num_workers
        self._num_cpus_per_worker = num_cpus_per_worker
        self._num_gpus_per_worker = num_gpus_per_worker

    def start(self):
        """Create the worker group backing this executor."""
        self.worker_group = WorkerGroup(self._num_workers,
                                        self._num_cpus_per_worker,
                                        self._num_gpus_per_worker)

    def execute(self, train_func: Callable):
        """Run ``train_func`` on all workers.

        Not yet implemented; currently returns ``None``.
        """
        pass

    def shutdown(self):
        """Tear down the worker group created by ``start``."""
        self.worker_group.shutdown()

    def run(self, train_func: Callable):
        """Runs the training function.

        1. Starts the executor.
        2. Executes the function.
        3. Shuts down the executor.
        4. Returns results of the function.

        Args:
            train_func: The training function to execute on each worker.

        Returns:
            Whatever ``execute`` returns for ``train_func``.
        """
        # Previously a stub; implemented to match the documented contract.
        # ``finally`` guarantees workers are torn down even if execution
        # raises.
        self.start()
        try:
            return self.execute(train_func)
        finally:
            self.shutdown()
def test_worker_creation_num_cpus(ray_start_2_cpus):
    """A single worker requesting 2 CPUs should consume the whole cluster."""
    assert ray.available_resources()["CPU"] == 2
    group = WorkerGroup(num_cpus_per_worker=2)
    # Give the actor a moment to be scheduled before inspecting resources.
    time.sleep(1)
    assert len(group.workers) == 1
    # Both cluster CPUs should now be held by that one actor.
    assert "CPU" not in ray.available_resources()
    group.shutdown()
def test_worker_restart(ray_start_2_cpus):
    """Tests that a worker group can be shut down and started again."""
    wg = WorkerGroup(num_workers=2)
    # The group starts its workers on construction (see the resource checks
    # in the shutdown test), so a second start() should raise.
    with pytest.raises(RuntimeError):
        wg.start()
    # Avoid race condition.
    time.sleep(1)
    wg.shutdown(0)
    # After a full shutdown the group can be restarted and used again.
    wg.start()
    wg.execute(lambda: 1)
def handle_failure(self, worker_group: WorkerGroup,
                   failed_worker_indexes: List[int],
                   backend_config: BackendConfig):
    """Logic for handling failures.

    By default, restart all workers.

    Args:
        worker_group: The worker group containing the failed workers.
        failed_worker_indexes: Indexes of the failed workers. Unused by
            this default implementation, which restarts the whole group
            rather than just the failed workers.
        backend_config: Configuration forwarded to ``on_start`` after the
            restart.
    """
    # Restart everything, then re-run backend start-up logic on the
    # fresh workers.
    worker_group.shutdown()
    worker_group.start()
    self.on_start(worker_group, backend_config)
def test_worker_shutdown(ray_start_2_cpus):
    """Workers release their resources and die when the group shuts down."""
    assert ray.available_resources()["CPU"] == 2
    group = WorkerGroup(num_workers=2)
    time.sleep(1)
    # Both CPUs are held by the two live worker actors.
    assert "CPU" not in ray.available_resources()
    assert len(ray.state.actors()) == 2
    group.shutdown()
    time.sleep(1)
    # Resources are returned to the cluster once the actors are gone.
    assert ray.available_resources()["CPU"] == 2
    # A dead group must refuse to execute anything.
    with pytest.raises(RuntimeError):
        group.execute(lambda: 1)