def ray_job_submit( script_name: str, head_service: str, k8s_namespace: str = "default", ray_dashboard_port: int = 8265, ) -> str: """Submits a Python script via the Ray Job Submission API, using the Python SDK. Waits for successful completion of the job and returns the job logs as a string. Uses `kubectl port-forward` to access the Ray head's dashboard port. Scripts live in `tests/kuberay/scripts`. This directory is used as the working dir for the job. Args: script_name: The name of the script to submit. head_service: The name of the Ray head K8s service. k8s_namespace: K8s namespace the Ray cluster belongs to. ray_dashboard_port: The port on which the Ray head is running the Ray dashboard. """ with _kubectl_port_forward(service=head_service, namespace=k8s_namespace, target_port=ray_dashboard_port) as local_port: # It takes a bit of time to establish the connection. # Try a few times to instantiate the JobSubmissionClient, as the client's # instantiation does not retry on connection errors. for trie in range(1, 7): time.sleep(5) try: client = JobSubmissionClient(f"http://127.0.0.1:{local_port}") except ConnectionError as e: if trie < 6: logger.info( "Job client connection failed. Retrying in 5 seconds.") else: raise e from None job_id = client.submit_job( entrypoint=f"python {script_name}", runtime_env={ "working_dir": SCRIPTS_DIR, # Throw in some extra data for fun, to validate runtime envs. "pip": ["pytest==6.0.0"], "env_vars": { "key_foo": "value_bar" }, }, ) # Wait for the job to complete successfully. # This logic is copied from the Job Submission docs. start = time.time() timeout = 60 while time.time() - start <= timeout: status = client.get_job_status(job_id) print(f"status: {status}") if status in { JobStatus.SUCCEEDED, JobStatus.STOPPED, JobStatus.FAILED }: break time.sleep(5) assert status == JobStatus.SUCCEEDED return client.get_job_logs(job_id)
def _check_job_stopped(client: JobSubmissionClient, job_id: str) -> bool: status = client.get_job_status(job_id) return status == JobStatus.STOPPED
def _check_job_succeeded(client: JobSubmissionClient, job_id: str) -> bool: status = client.get_job_status(job_id) if status == JobStatus.FAILED: logs = client.get_job_logs(job_id) raise RuntimeError(f"Job failed\nlogs:\n{logs}") return status == JobStatus.SUCCEEDED
def _check_job_failed(client: JobSubmissionClient, job_id: str) -> bool: status = client.get_job_status(job_id) return status == JobStatus.FAILED