def test_failed_job_status(ray_start_with_dashboard, disable_aiohttp_cache,
                           enable_test_module):
    """Submit a job that exits with code 1 and wait for a FAILED snapshot.

    The entrypoint starts Ray, sleeps for five seconds and then calls
    ``sys.exit(1)``. The snapshot is polled until the first job entry that
    carries a status reports ``"FAILED"``.
    """
    dashboard_url = ray_start_with_dashboard["webui_url"]
    assert wait_until_server_available(dashboard_url)
    address = format_web_url(dashboard_url)

    # Statements executed by the inline ``python -c`` script, in order.
    script_statements = (
        "import ray;",
        "ray.init();",
        "import time;",
        "time.sleep(5);",
        "import sys;",
        "sys.exit(1);",
    )
    entrypoint_cmd = "python -c\"" + "".join(script_statements) + "\""

    client = JobSubmissionClient(address)
    job_id = client.submit_job(entrypoint=entrypoint_cmd)

    def job_reported_failed():
        snapshot_jobs = _get_snapshot(address)["data"]["snapshot"]["jobs"]
        # Only the first entry that already has a status is inspected.
        entry = next(
            (candidate for candidate in snapshot_jobs.values()
             if candidate["status"] is not None),
            None,
        )
        if entry is None:
            return False

        submission_id = entry["config"]["metadata"]["jobSubmissionId"]
        assert submission_id == job_id
        assert entry["status"] in {"PENDING", "RUNNING", "FAILED"}
        assert entry["statusMessage"] is not None
        return entry["status"] == "FAILED"

    wait_for_condition(job_reported_failed, timeout=30)
def _get_sdk_client(address: Optional[str],
                    create_cluster_if_needed: bool = False
                    ) -> JobSubmissionClient:
    """Build a ``JobSubmissionClient`` for the given or configured address.

    Args:
        address: Address to connect to. When ``None``, the value of the
            ``RAY_ADDRESS`` environment variable is used instead.
        create_cluster_if_needed: Passed through unchanged as the second
            positional argument of ``JobSubmissionClient``.

    Returns:
        The constructed ``JobSubmissionClient``.

    Raises:
        ValueError: If ``address`` is ``None`` and ``RAY_ADDRESS`` is not
            set in the environment.
    """
    if address is not None:
        return JobSubmissionClient(address, create_cluster_if_needed)

    # Environment values are always strings, so None means "not set".
    env_address = os.environ.get("RAY_ADDRESS")
    if env_address is None:
        raise ValueError(
            "Address must be specified using either the --address flag "
            "or RAY_ADDRESS environment variable.")

    return JobSubmissionClient(env_address, create_cluster_if_needed)
def test_temporary_uri_reference(monkeypatch, expiration_s):
    """Check that temporary URI references are GC'ed after ``expiration_s``.

    A job with a local ``working_dir`` is submitted, then the test waits
    for ``check_internal_kv_gced`` to report that the internal KV entry is
    gone. For a positive ``expiration_s`` the entry must still exist right
    after the initial sleep and must disappear between ``expiration_s`` and
    ``2 * expiration_s`` seconds after submission started.
    """
    monkeypatch.setenv("RAY_RUNTIME_ENV_TEMPORARY_REFERENCE_EXPIRATION_S",
                       str(expiration_s))

    # A shared-runtime fixture cannot be used here: the expiration env var
    # has to be in place before Ray starts.
    with _ray_start(include_dashboard=True, num_cpus=1) as ctx:
        request_headers = {
            "Connection": "keep-alive",
            "Authorization": "TOK:<MY_TOKEN>"
        }
        dashboard_url = ctx.address_info["webui_url"]
        assert wait_until_server_available(dashboard_url)
        client = JobSubmissionClient(format_web_url(dashboard_url),
                                     headers=request_headers)

        with tempfile.TemporaryDirectory() as tmp_dir:
            # Put a single small file into the working directory.
            greeting_path = Path(tmp_dir) / "hi.txt"
            with greeting_path.open(mode="w") as handle:
                handle.write("hi\n")

            start = time.time()

            client.submit_job(entrypoint="echo hi",
                              runtime_env={"working_dir": tmp_dir})

            # Leave room for the deletion to happen when expiration_s is 0.
            time.sleep(2)
            # NOTE(review): the original carried a disabled
            # `ray.init(address="auto")` here, with a remark that a Ray
            # connection is needed to inspect internal_kv -- confirm that
            # check_internal_kv_gced works without it.

            print("Starting Internal KV checks at time ", time.time() - start)
            if expiration_s > 0:
                # The reference must outlive the initial sleep ...
                assert not check_internal_kv_gced()
                wait_for_condition(check_internal_kv_gced,
                                   timeout=2 * expiration_s)
                # ... and be collected inside the expected time window.
                assert expiration_s < time.time() - start < 2 * expiration_s
            else:
                wait_for_condition(check_internal_kv_gced)
            print("Internal KV was GC'ed at time ", time.time() - start)
def _log_job_status(client: JobSubmissionClient, job_id: str):
    """Fetch the status of ``job_id`` and report it through the CLI logger.

    SUCCEEDED and STOPPED each produce a single line. FAILED and every
    other status additionally print the status message when it is not
    ``None``.

    Args:
        client: Client used to query the job status.
        job_id: Identifier of the job to report on.
    """
    info = client.get_job_status(job_id)
    state = info.status

    if state == JobStatus.SUCCEEDED:
        _log_big_success_msg(f"Job '{job_id}' succeeded")
        return
    if state == JobStatus.STOPPED:
        cli_logger.warning(f"Job '{job_id}' was stopped")
        return

    if state == JobStatus.FAILED:
        _log_big_error_msg(f"Job '{job_id}' failed")
    else:
        # Catch-all for any status not handled explicitly above.
        cli_logger.print(f"Status for job '{job_id}': {state}")

    # Shared tail of the FAILED and catch-all branches.
    if info.message is not None:
        cli_logger.print(f"Status message: {info.message}")
async def _tail_logs(client: JobSubmissionClient, job_id: str):
    """Echo the job's tailed logs to stdout, then log its status.

    Args:
        client: Client whose ``tail_job_logs`` async iterator is consumed.
        job_id: Identifier of the job whose logs are tailed.
    """
    log_stream = client.tail_job_logs(job_id)
    async for chunk in log_stream:
        # Chunks are printed verbatim; no extra newline is appended.
        print(chunk, end="")

    # The log stream is exhausted; report the job's status.
    _log_job_status(client, job_id)
def _check_job_succeeded(client: JobSubmissionClient, job_id: str) -> bool:
    """Return whether ``job_id`` has succeeded, raising if it has failed.

    Args:
        client: Client used to query the job status and logs.
        job_id: Identifier of the job to check.

    Returns:
        ``True`` when the reported status equals ``JobStatus.SUCCEEDED``,
        ``False`` for any other non-failed status.

    Raises:
        RuntimeError: If the reported status equals ``JobStatus.FAILED``;
            the message embeds the job's stdout and stderr.
    """
    # NOTE(review): the value returned by get_job_status is compared
    # directly against JobStatus here, while other helpers compare its
    # `.status` attribute -- confirm which client API version applies.
    job_state = client.get_job_status(job_id)

    if job_state == JobStatus.FAILED:
        stdout, stderr = client.get_job_logs(job_id)
        failure_report = (
            f"Job failed\nstdout:\n{stdout}\nstderr:\n{stderr}")
        raise RuntimeError(failure_report)

    return job_state == JobStatus.SUCCEEDED
def job_sdk_client(ray_start_with_dashboard, disable_aiohttp_cache,
                   enable_test_module):
    """Yield a ``JobSubmissionClient`` for the provided cluster's dashboard.

    The dashboard URL is read from ``ray_start_with_dashboard["webui_url"]``
    and must become available before the client is created.
    ``disable_aiohttp_cache`` and ``enable_test_module`` are not read in the
    body; presumably they are requested only for their setup effects.
    """
    dashboard_url = ray_start_with_dashboard["webui_url"]
    assert wait_until_server_available(dashboard_url)

    sdk_client = JobSubmissionClient(format_web_url(dashboard_url))
    yield sdk_client
def _check_job_stopped(client: JobSubmissionClient, job_id: str) -> bool:
    """Return ``True`` when the status of ``job_id`` is ``STOPPED``.

    Args:
        client: Client used to query the job status.
        job_id: Identifier of the job to check.
    """
    reported_state = client.get_job_status(job_id).status
    return reported_state == JobStatus.STOPPED
def _check_job_failed(client: JobSubmissionClient, job_id: str) -> bool:
    """Return ``True`` when the status of ``job_id`` is ``FAILED``.

    Args:
        client: Client used to query the job status.
        job_id: Identifier of the job to check.
    """
    reported_state = client.get_job_status(job_id).status
    return reported_state == JobStatus.FAILED
def job_sdk_client(headers):
    """Start a one-CPU Ray instance with a dashboard and yield a client.

    The ``JobSubmissionClient`` is created with the given ``headers`` and
    is yielded inside the ``_ray_start`` context, so that context stays
    open until the generator is resumed or closed.
    """
    with _ray_start(include_dashboard=True, num_cpus=1) as address_info:
        dashboard_url = address_info["webui_url"]
        assert wait_until_server_available(dashboard_url)

        sdk_client = JobSubmissionClient(format_web_url(dashboard_url),
                                         headers=headers)
        yield sdk_client