Example #1
    def _handle_supervisor_startup(self, job_id: str,
                                   result: Optional[Exception]):
        """Handle the result of starting a job supervisor actor.

        If started successfully, result should be None. Otherwise it should be
        an Exception.

        On failure, the job will be marked failed with a relevant error
        message.
        """
        if result is None:
            return
        elif isinstance(result, RuntimeEnvSetupError):
            logger.info(f"Failed to set up runtime_env for job {job_id}.")
            self._status_client.put_status(
                job_id,
                JobStatusInfo(status=JobStatus.FAILED,
                              message=(f"runtime_env setup failed: {result}")))
        elif isinstance(result, Exception):
            logger.error(
                f"Failed to start supervisor for job {job_id}: {result}.")
            self._status_client.put_status(
                job_id,
                JobStatusInfo(
                    status=JobStatus.FAILED,
                    message=f"Error occurred while starting the job: {result}")
            )
        else:
            assert False, "This should not be reached."
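
The caller that feeds this handler is not shown above. A minimal wiring sketch, assuming the supervisor exposes a ready() remote method and that the manager awaits it in a background coroutine (both names are assumptions, not confirmed by the snippet):

    async def _await_supervisor_startup(self, job_id: str,
                                        supervisor: ActorHandle):
        # Hypothetical helper: await the supervisor's readiness check and
        # forward the outcome (None on success, the Exception on failure).
        try:
            await supervisor.ready.remote()
            self._handle_supervisor_startup(job_id, None)
        except Exception as e:
            self._handle_supervisor_startup(job_id, e)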
Example #2
    async def _monitor_job(
        self, job_id: str, job_supervisor: Optional[ActorHandle] = None
    ):
        """Monitors the specified job until it enters a terminal state.

        This is necessary because we need to handle the case where the
        JobSupervisor dies unexpectedly.
        """
        is_alive = True
        if job_supervisor is None:
            job_supervisor = self._get_actor_for_job(job_id)

            if job_supervisor is None:
                logger.error(f"Failed to get job supervisor for job {job_id}.")
                self._status_client.put_status(
                    job_id,
                    JobStatusInfo(
                        status=JobStatus.FAILED,
                        message=(
                            "Unexpected error occurred: Failed to get job supervisor."
                        ),
                    ),
                )
                is_alive = False

        while is_alive:
            try:
                await job_supervisor.ping.remote()
                await asyncio.sleep(self.JOB_MONITOR_LOOP_PERIOD_S)
            except Exception as e:
                is_alive = False
                if self._status_client.get_status(job_id).status.is_terminal():
                    # If the job is already in a terminal state, then the actor
                    # exiting is expected.
                    pass
                elif isinstance(e, RuntimeEnvSetupError):
                    logger.info(f"Failed to set up runtime_env for job {job_id}.")
                    self._status_client.put_status(
                        job_id,
                        JobStatusInfo(
                            status=JobStatus.FAILED,
                            message=(f"runtime_env setup failed: {e}"),
                        ),
                    )
                else:
                    logger.warning(
                        f"Job supervisor for job {job_id} failed unexpectedly: {e}."
                    )
                    self._status_client.put_status(
                        job_id,
                        JobStatusInfo(
                            status=JobStatus.FAILED,
                            message=f"Unexpected error occurred: {e}",
                        ),
                    )

        # Kill the actor defensively to avoid leaking actors in unexpected error cases.
        if job_supervisor is not None:
            ray.kill(job_supervisor, no_restart=True)
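
The ping method called in the monitor loop is not part of the snippet; a minimal supervisor-side sketch, assuming it is nothing more than a liveness probe:

    async def ping(self):
        """No-op liveness check on the JobSupervisor actor.

        The remote call only raises if the actor has died or its runtime_env
        failed to set up, which is exactly what the monitor loop detects.
        """
        pass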
Example #3
    def get_job_status(
        self,
        job_id: str,
    ) -> JobStatusInfo:
        """Get the latest status of the job with the given ID via the HTTP API."""
        r = self._do_request("GET", f"/api/jobs/{job_id}")

        if r.status_code == 200:
            response = JobStatusResponse(**r.json())
            return JobStatusInfo(status=response.status, message=response.message)
        else:
            self._raise_error(r)
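
A small polling helper sketched on top of get_job_status (hypothetical; the terminal-state check, timeout, and one-second interval are assumptions, not part of the original client):

import time

def wait_until_finished(client, job_id: str,
                        timeout_s: float = 300.0) -> JobStatusInfo:
    # Poll the jobs API until the job reaches a terminal state or we time out.
    deadline = time.time() + timeout_s
    while time.time() < deadline:
        info = client.get_job_status(job_id)
        if info.status in (JobStatus.SUCCEEDED, JobStatus.FAILED,
                           JobStatus.STOPPED):
            return info
        time.sleep(1)
    raise TimeoutError(
        f"Job {job_id} did not reach a terminal state within {timeout_s}s")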
Example #4
    async def run(
        self,
        # Signal actor used in testing to capture PENDING -> RUNNING cases
        _start_signal_actor: Optional[ActorHandle] = None):
        """
        Stop and start both happen asynchrously, coordinated by asyncio event
        and coroutine, respectively.

        1) Sets job status as running
        2) Pass runtime env and metadata to subprocess as serialized env
            variables.
        3) Handle concurrent events of driver execution and
        """
        cur_status = self._get_status()
        assert cur_status.status == JobStatus.PENDING, (
            "Run should only be called once.")

        if _start_signal_actor:
            # Block in PENDING state until start signal received.
            await _start_signal_actor.wait.remote()

        self._status_client.put_status(self._job_id,
                                       JobStatusInfo(JobStatus.RUNNING))

        try:
            # Set JobConfig for the child process (runtime_env, metadata).
            os.environ[RAY_JOB_CONFIG_JSON_ENV_VAR] = json.dumps({
                "runtime_env":
                self._runtime_env,
                "metadata":
                self._metadata,
            })
            ray_redis_address = ray._private.services.find_redis_address_or_die(  # noqa: E501
            )
            os.environ[ray_constants.
                       RAY_ADDRESS_ENVIRONMENT_VARIABLE] = ray_redis_address

            log_path = self._log_client.get_log_file_path(self._job_id)
            child_process = self._exec_entrypoint(log_path)

            polling_task = create_task(self._polling(child_process))
            finished, _ = await asyncio.wait(
                [polling_task, self._stop_event.wait()],
                return_when=FIRST_COMPLETED)

            if self._stop_event.is_set():
                polling_task.cancel()
                # TODO (jiaodong): Improve this with SIGTERM then SIGKILL
                child_process.kill()
                self._status_client.put_status(self._job_id, JobStatus.STOPPED)
            else:
                # The child process finished execution and no stop event was
                # set at the same time.
                assert len(
                    finished) == 1, "Should have only one coroutine done"
                [child_process_task] = finished
                return_code = child_process_task.result()
                if return_code == 0:
                    self._status_client.put_status(self._job_id,
                                                   JobStatus.SUCCEEDED)
                else:
                    log_tail = self._log_client.tail_logs(self._job_id)
                    if log_tail is not None and log_tail != "":
                        message = ("Job failed due to an application error, "
                                   "last available logs:\n" + log_tail)
                    else:
                        message = None
                    self._status_client.put_status(
                        self._job_id,
                        JobStatusInfo(status=JobStatus.FAILED,
                                      message=message))
        except Exception:
            logger.error(
                "Got unexpected exception while trying to execute driver "
                f"command. {traceback.format_exc()}")
        finally:
            # clean up actor after tasks are finished
            ray.actor.exit_actor()
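
The _exec_entrypoint call above is not shown; a minimal sketch, assuming a module-level import of subprocess, a stored self._entrypoint command, and output redirected to the job's log file (all assumptions, not the original implementation):

    def _exec_entrypoint(self, log_path: str) -> subprocess.Popen:
        # Sketch only: spawn the user's entrypoint command in a shell. The
        # child inherits the environment prepared in run() (job config JSON
        # and RAY_ADDRESS) and appends stdout/stderr to the job's log file.
        log_file = open(log_path, "a")
        return subprocess.Popen(
            self._entrypoint,
            shell=True,
            stdout=log_file,
            stderr=subprocess.STDOUT,
        )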
Example #5
    def submit_job(
        self,
        *,
        entrypoint: str,
        job_id: Optional[str] = None,
        runtime_env: Optional[Dict[str, Any]] = None,
        metadata: Optional[Dict[str, str]] = None,
        _start_signal_actor: Optional[ActorHandle] = None,
    ) -> str:
        """
        Job execution happens asynchronously.

        1) Generates a new unique ID for this job submission; each call of
            this method is treated as an independent submission with its own
            new ID, job supervisor actor, and child process.
        2) Creates a new detached actor with the same runtime_env as the job
            spec.

        Setting up the runtime_env, creating the subprocess group, executing
        the driver command, cleaning up the subprocess, and updating the
        running status in the GCS are all handled by the job supervisor actor.

        Args:
            entrypoint: Driver command to execute in a subprocess shell.
                Represents the entrypoint to start the user application.
            runtime_env: Runtime environment used to execute the driver
                command; the driver may also call its own ray.init() to
                configure runtime envs at the cluster, task, and actor level.
            metadata: Arbitrary key-value data to pass to the driver command
                if needed.
            _start_signal_actor: Used in testing only to capture the
                PENDING -> RUNNING state transition. Regular users shouldn't
                need this.

        Returns:
            job_id: Generated UUID for further job management. Only valid
                within the same Ray cluster.
        """
        if job_id is None:
            job_id = generate_job_id()
        elif self._status_client.get_status(job_id) is not None:
            raise RuntimeError(f"Job {job_id} already exists.")

        logger.info(f"Starting job with job_id: {job_id}")
        self._status_client.put_status(job_id, JobStatus.PENDING)

        # Wait for the actor to start up asynchronously so this call always
        # returns immediately and we can catch errors with the actor starting
        # up.
        try:
            supervisor = self._supervisor_actor_cls.options(
                lifetime="detached",
                name=self.JOB_ACTOR_NAME.format(job_id=job_id),
                num_cpus=0,
                # Currently we assume the JobManager is created by the
                # dashboard server running on the head node; job supervisor
                # actors are scheduled on the same node.
                resources={
                    self._get_current_node_resource_key(): 0.001,
                },
                runtime_env=runtime_env,
            ).remote(job_id, entrypoint, metadata or {})
            supervisor.run.remote(_start_signal_actor=_start_signal_actor)

            # Monitor the job in the background so we can detect errors without
            # requiring a client to poll.
            create_task(self._monitor_job(job_id, job_supervisor=supervisor))
        except Exception as e:
            self._status_client.put_status(
                job_id,
                JobStatusInfo(
                    status=JobStatus.FAILED,
                    message=f"Failed to start job supervisor: {e}.",
                ),
            )

        return job_id
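
A usage sketch of submit_job from the process that owns the JobManager (the job_manager variable, entrypoint, and runtime_env contents are placeholders, not taken from the original code):

job_id = job_manager.submit_job(
    entrypoint="python my_script.py",           # shell command run as the driver
    runtime_env={"pip": ["requests==2.26.0"]},  # optional per-job environment
    metadata={"owner": "alice"},                # arbitrary key-value data
)
# The call returns immediately; progress is observed through the status
# client or the HTTP jobs API (see get_job_status above).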
Example #6
    async def run(
        self,
        # Signal actor used in testing to capture PENDING -> RUNNING cases
        _start_signal_actor: Optional[ActorHandle] = None,
    ):
        """
        Stop and start both happen asynchrously, coordinated by asyncio event
        and coroutine, respectively.

        1) Sets job status as running
        2) Pass runtime env and metadata to subprocess as serialized env
            variables.
        3) Handle concurrent events of driver execution and
        """
        curr_status = self._status_client.get_status(self._job_id)
        assert (
            curr_status.status == JobStatus.PENDING
        ), "Run should only be called once."

        if _start_signal_actor:
            # Block in PENDING state until start signal received.
            await _start_signal_actor.wait.remote()

        self._status_client.put_status(self._job_id, JobStatusInfo(JobStatus.RUNNING))

        try:
            # Set JobConfig for the child process (runtime_env, metadata).
            os.environ[RAY_JOB_CONFIG_JSON_ENV_VAR] = json.dumps(
                {
                    "runtime_env": self._runtime_env,
                    "metadata": self._metadata,
                }
            )
            # Always set RAY_ADDRESS to the find_bootstrap_address() result
            # for job submission. In local development, this prevents the user
            # from re-using http://{address}:{dashboard_port} (the address
            # used to interact with the jobs SDK) as the Ray address.
            # TODO:(mwtian) Check why "auto" does not work in entrypoint script
            os.environ[
                ray_constants.RAY_ADDRESS_ENVIRONMENT_VARIABLE
            ] = ray._private.services.find_bootstrap_address().pop()

            # Set PYTHONUNBUFFERED=1 to stream logs during the job instead of
            # only streaming them upon completion of the job.
            os.environ["PYTHONUNBUFFERED"] = "1"
            logger.info(
                "Submitting job with RAY_ADDRESS = "
                f"{os.environ[ray_constants.RAY_ADDRESS_ENVIRONMENT_VARIABLE]}"
            )
            log_path = self._log_client.get_log_file_path(self._job_id)
            child_process = self._exec_entrypoint(log_path)

            polling_task = create_task(self._polling(child_process))
            finished, _ = await asyncio.wait(
                [polling_task, self._stop_event.wait()], return_when=FIRST_COMPLETED
            )

            if self._stop_event.is_set():
                polling_task.cancel()
                # TODO (jiaodong): Improve this with SIGTERM then SIGKILL
                child_process.kill()
                self._status_client.put_status(self._job_id, JobStatus.STOPPED)
            else:
                # The child process finished execution and no stop event was
                # set at the same time.
                assert len(finished) == 1, "Should have only one coroutine done"
                [child_process_task] = finished
                return_code = child_process_task.result()
                if return_code == 0:
                    self._status_client.put_status(self._job_id, JobStatus.SUCCEEDED)
                else:
                    log_tail = self._log_client.get_last_n_log_lines(self._job_id)
                    if log_tail is not None and log_tail != "":
                        message = (
                            "Job failed due to an application error, "
                            "last available logs:\n" + log_tail
                        )
                    else:
                        message = None
                    self._status_client.put_status(
                        self._job_id,
                        JobStatusInfo(status=JobStatus.FAILED, message=message),
                    )
        except Exception:
            logger.error(
                "Got unexpected exception while trying to execute driver "
                f"command. {traceback.format_exc()}"
            )
        finally:
            # clean up actor after tasks are finished
            ray.actor.exit_actor()
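
The _polling coroutine awaited in both versions of run is not included above; a minimal sketch, assuming it simply waits for the child process to exit without blocking the event loop (the poll-period constant is an assumption):

    async def _polling(self, child_process: subprocess.Popen) -> int:
        # Sketch only: check the subprocess periodically without blocking the
        # event loop and return its exit code once it terminates.
        while child_process.poll() is None:
            await asyncio.sleep(self.SUBPROCESS_POLL_PERIOD_S)  # assumed name
        return child_process.returncode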