def testStartClusterFails(self):
    """Cluster startup failures must map to the correct exit codes."""
    res = Result()
    self._succeed_until("cluster_env")

    # A faulty API response surfaces as a resource error.
    with self.assertRaises(ClusterCreationError):
        self._run(res)
    self.assertEqual(res.return_code, ExitCode.CLUSTER_RESOURCE_ERROR.value)

    self.cluster_manager_return["cluster_id"] = "valid"

    # Generic startup failure.
    self.cluster_manager_return["start_cluster"] = _fail_on_call(
        ClusterStartupError
    )
    with self.assertRaises(ClusterStartupError):
        self._run(res)
    self.assertEqual(res.return_code, ExitCode.CLUSTER_STARTUP_ERROR.value)
    # The cluster must have been torn down afterwards.
    self.assertGreaterEqual(self.sdk.call_counter["terminate_cluster"], 1)

    # Startup timeout.
    self.cluster_manager_return["start_cluster"] = _fail_on_call(
        ClusterStartupTimeout
    )
    with self.assertRaises(ClusterStartupTimeout):
        self._run(res)
    self.assertEqual(res.return_code, ExitCode.CLUSTER_STARTUP_TIMEOUT.value)
    # Again, the cluster must have been torn down.
    self.assertGreaterEqual(self.sdk.call_counter["terminate_cluster"], 1)
def testInvalidClusterCompute(self):
    """Broken cluster compute configs must surface as CONFIG_ERROR."""
    res = Result()

    # Loading the compute template itself blows up.
    with patch(
        "ray_release.glue.load_test_cluster_compute",
        _fail_on_call(ReleaseTestConfigError),
    ), self.assertRaises(ReleaseTestConfigError):
        self._run(res)
    self.assertEqual(res.return_code, ExitCode.CONFIG_ERROR.value)

    # The compute file is missing entirely.
    os.unlink(os.path.join(self.tempdir, "cluster_compute.yaml"))
    with self.assertRaisesRegex(ReleaseTestConfigError, "Path not found"):
        self._run(res)
    self.assertEqual(res.return_code, ExitCode.CONFIG_ERROR.value)

    # The file contains an invalid jinja template.
    self.writeClusterCompute("{{ INVALID")
    with self.assertRaisesRegex(ReleaseTestConfigError, "yaml template"):
        self._run(res)
    self.assertEqual(res.return_code, ExitCode.CONFIG_ERROR.value)

    # The file contains invalid json/yaml content.
    self.writeClusterCompute("{'test': true, 'fail}")
    with self.assertRaisesRegex(ReleaseTestConfigError, "quoted scalar"):
        self._run(res)
    self.assertEqual(res.return_code, ExitCode.CONFIG_ERROR.value)
def testBuildConfigFailsClusterEnv(self):
    """Cluster env creation/build failures map to the right exit codes."""
    res = Result()
    self._succeed_until("cluster_compute")

    # A faulty API response during creation.
    with self.assertRaisesRegex(ClusterEnvCreateError, "Unexpected"):
        self._run(res)
    self.assertEqual(res.return_code, ExitCode.CLUSTER_RESOURCE_ERROR.value)

    # Creation fails for some other (known) reason.
    self.cluster_manager_return["create_cluster_env"] = _fail_on_call(
        ClusterEnvCreateError, "Known"
    )
    with self.assertRaisesRegex(ClusterEnvCreateError, "Known"):
        self._run(res)
    self.assertEqual(res.return_code, ExitCode.CLUSTER_RESOURCE_ERROR.value)

    # Creation succeeds now, but the env build fails.
    self.cluster_manager_return["cluster_env_id"] = "valid"
    self.cluster_manager_return["create_cluster_env"] = None
    self.cluster_manager_return["build_cluster_env"] = _fail_on_call(
        ClusterEnvBuildError
    )
    with self.assertRaises(ClusterEnvBuildError):
        self._run(res)
    self.assertEqual(res.return_code, ExitCode.CLUSTER_ENV_BUILD_ERROR.value)

    # The env build times out.
    self.cluster_manager_return["build_cluster_env"] = _fail_on_call(
        ClusterEnvBuildTimeout
    )
    with self.assertRaises(ClusterEnvBuildTimeout):
        self._run(res)
    self.assertEqual(res.return_code, ExitCode.CLUSTER_ENV_BUILD_TIMEOUT.value)
def testDriverSetupFails(self):
    """A failing driver setup script yields LOCAL_ENV_SETUP_ERROR."""
    res = Result()
    self._succeed_until("local_env")
    with self.assertRaises(LocalEnvSetupError):
        self._run(res)
    self.assertEqual(res.return_code, ExitCode.LOCAL_ENV_SETUP_ERROR.value)
def main(
    test_name: str,
    test_collection_file: Optional[str] = None,
    smoke_test: bool = False,
    report: bool = False,
    ray_wheels: Optional[str] = None,
    cluster_id: Optional[str] = None,
    cluster_env_id: Optional[str] = None,
    no_terminate: bool = False,
):
    """CLI entry point: resolve a release test by name, run it, and exit
    with the result's return code."""
    # Fall back to the repository-level release test collection file.
    test_collection_file = test_collection_file or os.path.join(
        os.path.dirname(__file__), "..", "..", "release_tests.yaml")
    collection = read_and_validate_release_test_collection(
        test_collection_file)

    test = find_test(collection, test_name)
    if not test:
        raise ReleaseTestCLIError(
            f"Test `{test_name}` not found in collection file: "
            f"{test_collection_file}")
    if smoke_test:
        test = as_smoke_test(test)

    # Resolve (and possibly wait for) the Ray wheels to test against.
    ray_wheels_url = find_and_wait_for_ray_wheels_url(
        ray_wheels, timeout=DEFAULT_WHEEL_WAIT_TIMEOUT)

    anyscale_project = os.environ.get("ANYSCALE_PROJECT", None)
    if not anyscale_project:
        raise ReleaseTestCLIError(
            "You have to set the ANYSCALE_PROJECT environment variable!")
    maybe_fetch_api_token()

    test_result = Result()

    active_reporters = [LogReporter()]
    if report:
        active_reporters.append(LegacyRDSReporter())

    try:
        test_result = run_release_test(
            test,
            anyscale_project=anyscale_project,
            result=test_result,
            ray_wheels_url=ray_wheels_url,
            reporters=active_reporters,
            smoke_test=smoke_test,
            cluster_id=cluster_id,
            cluster_env_id=cluster_env_id,
            no_terminate=no_terminate,
        )
    except ReleaseTestError as e:
        # The result object is updated in place by the pipeline;
        # just log the failure here and fall through to sys.exit.
        logger.exception(e)

    logger.info(f"Release test pipeline for test {test['name']} completed. "
                f"Returning with exit code = {test_result.return_code}")
    sys.exit(test_result.return_code)
def testInvalidPrepareLocalEnv(self):
    """Errors while preparing the local env yield LOCAL_ENV_SETUP_ERROR."""
    res = Result()
    self.command_runner_return["prepare_local_env"] = _fail_on_call(
        LocalEnvSetupError
    )
    with self.assertRaises(LocalEnvSetupError):
        self._run(res)
    self.assertEqual(res.return_code, ExitCode.LOCAL_ENV_SETUP_ERROR.value)
def testHandleAlert(self):
    """handle_result rejects unknown suites, raises on alerts, passes otherwise."""
    # An unknown alert suite is a configuration error.
    with self.assertRaises(ReleaseTestConfigError):
        handle.handle_result(
            Test(name="unit_alert_test", alert="invalid"),
            Result(status="finished"),
        )
    # An unsuccessful run triggers the alert.
    with self.assertRaises(ResultsAlert):
        handle.handle_result(
            Test(name="unit_alert_test", alert="default"),
            Result(status="unsuccessful"),
        )
    # A finished run passes without raising.
    handle.handle_result(
        Test(name="unit_alert_test", alert="default"),
        Result(status="finished"),
    )
def testSmokeUnstableTest(self):
    """Smoke-testing an unstable test records both flags on the result."""
    res = Result()
    self._succeed_until("complete")
    self.test["stable"] = False
    self._run(res, smoke_test=True)
    # Both the stability flag and the smoke-test flag must be carried over.
    assert not res.stable
    assert res.smoke_test
def testFetchResultFails(self):
    """A failed result fetch is logged but does not fail the run."""
    res = Result()
    self._succeed_until("test_command")
    self.command_runner_return["fetch_results"] = _fail_on_call(ResultsError)
    with self.assertLogs(logger, "ERROR") as cm:
        self._run(res)
        self.assertTrue(any("Could not fetch results" in o for o in cm.output))
    # The run still counts as a success.
    self.assertEqual(res.return_code, ExitCode.SUCCESS.value)
    self.assertEqual(res.status, "finished")
    # The cluster must still have been torn down.
    self.assertGreaterEqual(self.sdk.call_counter["terminate_cluster"], 1)
def testPrepareRemoteEnvFails(self):
    """Remote env setup errors yield REMOTE_ENV_SETUP_ERROR and terminate."""
    res = Result()
    self._succeed_until("cluster_start")
    self.command_runner_return["prepare_remote_env"] = _fail_on_call(
        RemoteEnvSetupError
    )
    with self.assertRaises(RemoteEnvSetupError):
        self._run(res)
    self.assertEqual(res.return_code, ExitCode.REMOTE_ENV_SETUP_ERROR.value)
    # The cluster must have been torn down.
    self.assertGreaterEqual(self.sdk.call_counter["terminate_cluster"], 1)
def testWaitForNodesFails(self):
    """Node-wait timeouts yield CLUSTER_WAIT_TIMEOUT and terminate."""
    res = Result()
    self._succeed_until("remote_env")
    # Waiting for the requested node count times out.
    self.command_runner_return["wait_for_nodes"] = _fail_on_call(
        ClusterNodesWaitTimeout
    )
    with self.assertRaises(ClusterNodesWaitTimeout):
        self._run(res)
    self.assertEqual(res.return_code, ExitCode.CLUSTER_WAIT_TIMEOUT.value)
    # The cluster must have been torn down.
    self.assertGreaterEqual(self.sdk.call_counter["terminate_cluster"], 1)
def testAlertFails(self):
    """A raised alert marks the run as error with COMMAND_ALERT."""
    res = Result()
    self._succeed_until("get_last_logs")
    self.mock_alert_return = "Alert raised"
    with self.assertRaises(ResultsAlert):
        self._run(res)
    self.assertEqual(res.return_code, ExitCode.COMMAND_ALERT.value)
    self.assertEqual(res.status, "error")
    # The cluster must have been torn down.
    self.assertGreaterEqual(self.sdk.call_counter["terminate_cluster"], 1)
def testInvalidClusterIdOverride(self):
    """A bad cluster env override fails; a valid one proceeds further."""
    res = Result()
    self._succeed_until("driver_setup")

    # Looking up the overridden environment returns nothing.
    self.sdk.returns["get_cluster_environment"] = None
    with self.assertRaises(ClusterEnvCreateError):
        self._run(res, cluster_env_id="existing")

    # With a valid lookup the pipeline gets past env creation and fails
    # somewhere else instead.
    self.sdk.returns["get_cluster_environment"] = APIDict(
        result=APIDict(config_json={"overridden": True}))
    with self.assertRaises(Exception) as cm:  # Fail somewhere else
        self._run(res, cluster_env_id="existing")
    self.assertNotIsInstance(cm.exception, ClusterEnvCreateError)
def testReportFails(self):
    """A crashing reporter is logged but does not fail the run."""
    res = Result()
    self._succeed_until("complete")

    class BrokenReporter(Reporter):
        # Simulates a reporter that always blows up.
        def report_result(self, test: Test, result: Result):
            raise RuntimeError

    with self.assertLogs(logger, "ERROR") as cm:
        self._run(res, reporters=[BrokenReporter()])
        self.assertTrue(any("Error reporting results" in o for o in cm.output))
    # The run still counts as a success.
    self.assertEqual(res.return_code, ExitCode.SUCCESS.value)
    self.assertEqual(res.status, "finished")
    # The cluster must have been torn down.
    self.assertGreaterEqual(self.sdk.call_counter["terminate_cluster"], 1)
def testTestCommandTimeoutLongRunning(self):
    """Long-running tests tolerate command timeouts; regular tests do not."""
    res = Result()
    self._succeed_until("fetch_results")

    # A regular test fails on a command timeout.
    self.command_runner_return["run_command"] = _fail_on_call(CommandTimeout)
    with self.assertRaises(TestCommandTimeout):
        self._run(res)
    self.assertEqual(res.return_code, ExitCode.COMMAND_TIMEOUT.value)

    # A long-running test is allowed to time out.
    self.test["run"]["long_running"] = True
    self._run(res)  # Will not fail this time
    self.assertGreaterEqual(res.results["last_update_diff"], 60.0)

    # The cluster must have been torn down.
    self.assertGreaterEqual(self.sdk.call_counter["terminate_cluster"], 1)
def testTestCommandFails(self):
    """Test command errors and timeouts map to their exit codes."""
    res = Result()
    self._succeed_until("prepare_command")

    # The test command errors out.
    self.command_runner_return["run_command"] = _fail_on_call(CommandError)
    with self.assertRaises(TestCommandError):
        self._run(res)
    self.assertEqual(res.return_code, ExitCode.COMMAND_ERROR.value)

    # The test command times out.
    self.command_runner_return["run_command"] = _fail_on_call(CommandTimeout)
    with self.assertRaises(TestCommandTimeout):
        self._run(res)
    self.assertEqual(res.return_code, ExitCode.COMMAND_TIMEOUT.value)

    # The cluster must have been torn down.
    self.assertGreaterEqual(self.sdk.call_counter["terminate_cluster"], 1)
def testBuildConfigFailsClusterCompute(self):
    """Cluster compute creation failures map to CLUSTER_RESOURCE_ERROR."""
    res = Result()
    self._succeed_until("driver_setup")
    # Local env preparation should succeed here.
    self.command_runner_return["prepare_local_env"] = None

    # A faulty API response during compute creation.
    with self.assertRaisesRegex(ClusterComputeCreateError, "Unexpected"):
        self._run(res)
    self.assertEqual(res.return_code, ExitCode.CLUSTER_RESOURCE_ERROR.value)

    # Creation fails for some other (known) reason.
    self.cluster_manager_return["create_cluster_compute"] = _fail_on_call(
        ClusterComputeCreateError, "Known"
    )
    with self.assertRaisesRegex(ClusterComputeCreateError, "Known"):
        self._run(res)
    self.assertEqual(res.return_code, ExitCode.CLUSTER_RESOURCE_ERROR.value)
def testPrepareCommandFails(self):
    """Prepare command errors/timeouts map to their exit codes."""
    res = Result()
    self._succeed_until("wait_for_nodes")

    # The prepare command errors out.
    self.command_runner_return["run_prepare_command"] = _fail_on_call(CommandError)
    with self.assertRaises(PrepareCommandError):
        self._run(res)
    self.assertEqual(res.return_code, ExitCode.PREPARE_ERROR.value)

    # The prepare command times out.
    self.command_runner_return["run_prepare_command"] = _fail_on_call(
        CommandTimeout
    )
    with self.assertRaises(PrepareCommandTimeout):
        self._run(res)
    # Special case: Prepare commands are usually waiting for nodes
    # (this may change in the future!)
    self.assertEqual(res.return_code, ExitCode.CLUSTER_WAIT_TIMEOUT.value)

    # The cluster must have been torn down.
    self.assertGreaterEqual(self.sdk.call_counter["terminate_cluster"], 1)
def run_release_test(
    test: Test,
    anyscale_project: str,
    result: Result,
    ray_wheels_url: str,
    reporters: Optional[List[Reporter]] = None,
    smoke_test: bool = False,
    cluster_id: Optional[str] = None,
    cluster_env_id: Optional[str] = None,
    no_terminate: bool = False,
) -> Result:
    """Run a single release test end to end.

    Loads the test's cluster env/compute configs, prepares the local driver
    environment, builds and starts a cluster (unless an existing
    ``cluster_id`` is given), runs prepare and test commands, fetches
    results, terminates the cluster (unless ``no_terminate``), interprets
    the outcome and reports it via ``reporters``.

    ``result`` is mutated in place. Any pipeline error is re-raised at the
    end, after termination/reporting, so callers see both the exception and
    the populated result object.
    """
    validate_test(test)
    result.wheels_url = ray_wheels_url
    result.stable = test.get("stable", True)
    # Link the result back to the CI job that produced it, if any.
    buildkite_url = os.getenv("BUILDKITE_BUILD_URL", "")
    if buildkite_url:
        buildkite_url += "#" + os.getenv("BUILDKITE_JOB_ID", "")
    result.buildkite_url = buildkite_url
    # Commands run relative to the test's working directory; the original
    # cwd is restored near the end of the function.
    working_dir = test["working_dir"]
    old_wd = os.getcwd()
    new_wd = os.path.join(RELEASE_PACKAGE_DIR, working_dir)
    os.chdir(new_wd)
    start_time = time.monotonic()
    # Resolve the command runner / cluster manager / file manager classes
    # from the test's run type.
    run_type = test["run"].get("type", "sdk_command")
    command_runner_cls = type_str_to_command_runner.get(run_type)
    if not command_runner_cls:
        raise ReleaseTestConfigError(
            f"Unknown command runner type: {run_type}. Must be one of "
            f"{list(type_str_to_command_runner.keys())}")
    cluster_manager_cls = command_runner_to_cluster_manager[command_runner_cls]
    file_manager_str = test["run"].get("file_manager", None)
    if file_manager_str:
        if file_manager_str not in file_manager_str_to_file_manager:
            raise ReleaseTestConfigError(
                f"Unknown file manager: {file_manager_str}. Must be one of "
                f"{list(file_manager_str_to_file_manager.keys())}")
        file_manager_cls = file_manager_str_to_file_manager[file_manager_str]
    else:
        # Default file manager is derived from the command runner.
        file_manager_cls = command_runner_to_file_manager[command_runner_cls]
    # Instantiate managers and command runner
    try:
        cluster_manager = cluster_manager_cls(test["name"], anyscale_project)
        file_manager = file_manager_cls(cluster_manager=cluster_manager)
        command_runner = command_runner_cls(cluster_manager, file_manager,
                                            working_dir)
    except Exception as e:
        raise ReleaseTestSetupError(
            f"Error setting up release test: {e}") from e
    # Any failure from here on is captured and re-raised only after cleanup
    # and reporting have run.
    pipeline_exception = None
    try:
        # Load configs
        cluster_env = load_test_cluster_env(test, ray_wheels_url=ray_wheels_url)
        cluster_compute = load_test_cluster_compute(test)
        if cluster_env_id:
            # Re-use an already-built cluster environment instead of the
            # one from the test config.
            try:
                cluster_manager.cluster_env_id = cluster_env_id
                cluster_manager.build_cluster_env()
                cluster_manager.fetch_build_info()
                logger.info("Using overridden cluster environment with ID "
                            f"{cluster_env_id} and build ID "
                            f"{cluster_manager.cluster_env_build_id}")
            except Exception as e:
                raise ClusterEnvCreateError(
                    f"Could not get existing overridden cluster environment "
                    f"{cluster_env_id}: {e}") from e
        else:
            cluster_manager.set_cluster_env(cluster_env)
        cluster_manager.set_cluster_compute(cluster_compute)
        # Optional per-test driver setup script, run on the local machine.
        driver_setup_script = test.get("driver_setup", None)
        if driver_setup_script:
            try:
                run_bash_script(driver_setup_script)
            except Exception as e:
                raise LocalEnvSetupError(
                    f"Driver setup script failed: {e}") from e
        # Install local dependencies
        command_runner.prepare_local_env(ray_wheels_url)
        # Start session
        if cluster_id:
            # Re-use existing cluster ID for development
            cluster_manager.cluster_id = cluster_id
            cluster_manager.cluster_name = get_cluster_name(cluster_id)
        else:
            build_timeout = test["run"].get("build_timeout",
                                            DEFAULT_BUILD_TIMEOUT)
            if cluster_env_id:
                cluster_manager.cluster_env_id = cluster_env_id
            cluster_manager.build_configs(timeout=build_timeout)
            cluster_timeout = test["run"].get("session_timeout",
                                              DEFAULT_CLUSTER_TIMEOUT)
            autosuspend_mins = test["run"].get("autosuspend_mins", None)
            if autosuspend_mins:
                cluster_manager.autosuspend_minutes = autosuspend_mins
            cluster_manager.start_cluster(timeout=cluster_timeout)
        result.cluster_url = cluster_manager.get_cluster_url()
        # Upload files
        command_runner.prepare_remote_env()
        command_timeout = test["run"].get("timeout", DEFAULT_COMMAND_TIMEOUT)
        # Optionally wait until the requested number of nodes is up.
        wait_for_nodes = test["run"].get("wait_for_nodes", None)
        if wait_for_nodes:
            num_nodes = test["run"]["wait_for_nodes"]["num_nodes"]
            wait_timeout = test["run"]["wait_for_nodes"]["timeout"]
            command_runner.wait_for_nodes(num_nodes, wait_timeout)
        # Optional prepare command, run before the actual test script.
        prepare_cmd = test["run"].get("prepare", None)
        if prepare_cmd:
            prepare_timeout = test["run"].get("prepare_timeout",
                                              command_timeout)
            try:
                command_runner.run_prepare_command(prepare_cmd,
                                                   timeout=prepare_timeout)
            except CommandError as e:
                raise PrepareCommandError(e)
            except CommandTimeout as e:
                raise PrepareCommandTimeout(e)
        # Run the test command itself.
        command = test["run"]["script"]
        command_env = {}
        if smoke_test:
            command = f"{command} --smoke-test"
            command_env["IS_SMOKE_TEST"] = "1"
        try:
            command_runner.run_command(command, env=command_env,
                                       timeout=command_timeout)
        except CommandError as e:
            raise TestCommandError(e)
        except CommandTimeout as e:
            raise TestCommandTimeout(e)
        # Fetching results is best-effort; failure only logs an error.
        try:
            command_results = command_runner.fetch_results()
        except Exception as e:
            logger.error(f"Could not fetch results for test command: {e}")
            command_results = {}
        # Postprocess result:
        if "last_update" in command_results:
            command_results["last_update_diff"] = time.time(
            ) - command_results.get("last_update", 0.0)
        if smoke_test:
            command_results["smoke_test"] = True
        result.results = command_results
        result.status = "finished"
    except Exception as e:
        pipeline_exception = e
    # Always try to fetch the last logs, success or failure.
    try:
        last_logs = command_runner.get_last_logs()
    except Exception as e:
        logger.error(f"Error fetching logs: {e}")
        last_logs = "No logs could be retrieved."
    result.last_logs = last_logs
    # Terminate the cluster (best-effort) unless explicitly kept alive.
    if not no_terminate:
        try:
            cluster_manager.terminate_cluster(wait=False)
        except Exception as e:
            logger.error(f"Could not terminate cluster: {e}")
    time_taken = time.monotonic() - start_time
    result.runtime = time_taken
    os.chdir(old_wd)
    if not pipeline_exception:
        # Only handle results if we didn't run into issues earlier
        try:
            handle_result(test, result)
        except Exception as e:
            pipeline_exception = e
    if pipeline_exception:
        # Translate the exception into a return code / status for the result.
        exit_code, error_type, runtime = handle_exception(pipeline_exception)
        result.return_code = exit_code.value
        result.status = error_type
        if runtime is not None:
            result.runtime = runtime
    # Report results even on failure; reporter errors are only logged.
    reporters = reporters or []
    for reporter in reporters:
        try:
            reporter.report_result(test, result)
        except Exception as e:
            logger.error(f"Error reporting results via {type(reporter)}: {e}")
    if pipeline_exception:
        # Surface the original failure after reporting.
        raise pipeline_exception
    return result
def testDefaultAlert(self):
    """The default alert flags timed-out runs and passes finished ones."""
    # A timed-out run must produce an alert (truthy return value).
    self.assertTrue(
        default.handle_result(self.test, Result(status="timeout")))
    # A finished run must not raise an alert.
    self.assertFalse(
        default.handle_result(self.test, Result(status="finished")))
def run_release_test(
    test: Test,
    anyscale_project: str,
    result: Result,
    ray_wheels_url: str,
    reporters: Optional[List[Reporter]] = None,
    smoke_test: bool = False,
    cluster_id: Optional[str] = None,
    cluster_env_id: Optional[str] = None,
    no_terminate: bool = False,
) -> Result:
    """Run a single release test end to end (Buildkite-aware variant).

    Loads the test's cluster env/compute configs, prepares the local driver
    environment, builds and starts a cluster (unless an existing
    ``cluster_id`` is given), runs prepare and test commands, fetches
    results, terminates the cluster (unless ``no_terminate``), interprets
    the outcome and reports it via ``reporters``. Progress is grouped into
    collapsible Buildkite log sections via ``buildkite_group``.

    ``result`` is mutated in place. Any pipeline error is re-raised at the
    end, after termination/reporting, so callers see both the exception and
    the populated result object.
    """
    buildkite_group(":spiral_note_pad: Loading test configuration")
    validate_test(test)
    result.wheels_url = ray_wheels_url
    result.stable = test.get("stable", True)
    result.smoke_test = smoke_test
    # Link the result back to the CI job that produced it, if any.
    buildkite_url = os.getenv("BUILDKITE_BUILD_URL", "")
    if buildkite_url:
        buildkite_url += "#" + os.getenv("BUILDKITE_JOB_ID", "")
    result.buildkite_url = buildkite_url
    # Commands run relative to the test's working directory; the original
    # cwd is restored near the end of the function.
    working_dir = test["working_dir"]
    old_wd = os.getcwd()
    new_wd = os.path.join(RELEASE_PACKAGE_DIR, working_dir)
    os.chdir(new_wd)
    start_time = time.monotonic()
    # Resolve the command runner / cluster manager / file manager classes
    # from the test's run type.
    run_type = test["run"].get("type", "sdk_command")
    command_runner_cls = type_str_to_command_runner.get(run_type)
    if not command_runner_cls:
        raise ReleaseTestConfigError(
            f"Unknown command runner type: {run_type}. Must be one of "
            f"{list(type_str_to_command_runner.keys())}")
    cluster_manager_cls = command_runner_to_cluster_manager[command_runner_cls]
    file_manager_str = test["run"].get("file_manager", None)
    if file_manager_str:
        if file_manager_str not in file_manager_str_to_file_manager:
            raise ReleaseTestConfigError(
                f"Unknown file manager: {file_manager_str}. Must be one of "
                f"{list(file_manager_str_to_file_manager.keys())}")
        file_manager_cls = file_manager_str_to_file_manager[file_manager_str]
    else:
        # Default file manager is derived from the command runner.
        file_manager_cls = command_runner_to_file_manager[command_runner_cls]
    # Instantiate managers and command runner
    try:
        cluster_manager = cluster_manager_cls(
            test["name"], anyscale_project, smoke_test=smoke_test)
        file_manager = file_manager_cls(cluster_manager=cluster_manager)
        command_runner = command_runner_cls(cluster_manager, file_manager,
                                            working_dir)
    except Exception as e:
        raise ReleaseTestSetupError(
            f"Error setting up release test: {e}") from e
    # Any failure from here on is captured and re-raised only after cleanup
    # and reporting have run.
    pipeline_exception = None
    try:
        # Load configs
        cluster_env = load_test_cluster_env(test, ray_wheels_url=ray_wheels_url)
        cluster_compute = load_test_cluster_compute(test)
        if cluster_env_id:
            # Re-use an already-built cluster environment instead of the
            # one from the test config.
            try:
                cluster_manager.cluster_env_id = cluster_env_id
                cluster_manager.build_cluster_env()
                cluster_manager.fetch_build_info()
                logger.info("Using overridden cluster environment with ID "
                            f"{cluster_env_id} and build ID "
                            f"{cluster_manager.cluster_env_build_id}")
            except Exception as e:
                raise ClusterEnvCreateError(
                    f"Could not get existing overridden cluster environment "
                    f"{cluster_env_id}: {e}") from e
        else:
            cluster_manager.set_cluster_env(cluster_env)
        cluster_manager.set_cluster_compute(cluster_compute)
        buildkite_group(":nut_and_bolt: Setting up local environment")
        # Optional per-test driver setup script, run on the local machine.
        driver_setup_script = test.get("driver_setup", None)
        if driver_setup_script:
            try:
                run_bash_script(driver_setup_script)
            except Exception as e:
                raise LocalEnvSetupError(
                    f"Driver setup script failed: {e}") from e
        # Install local dependencies
        command_runner.prepare_local_env(ray_wheels_url)
        command_timeout = test["run"].get("timeout", DEFAULT_COMMAND_TIMEOUT)
        # Re-install anyscale package as local dependencies might have changed
        # from local env setup
        reinstall_anyscale_dependencies()
        # Print installed pip packages
        buildkite_group(":bulb: Local environment information")
        pip_packages = get_pip_packages()
        pip_package_string = "\n".join(pip_packages)
        logger.info(f"Installed python packages:\n{pip_package_string}")
        # Start cluster
        if cluster_id:
            buildkite_group(":rocket: Using existing cluster")
            # Re-use existing cluster ID for development
            cluster_manager.cluster_id = cluster_id
            cluster_manager.cluster_name = get_cluster_name(cluster_id)
        else:
            buildkite_group(":gear: Building cluster environment")
            build_timeout = test["run"].get("build_timeout",
                                            DEFAULT_BUILD_TIMEOUT)
            if cluster_env_id:
                cluster_manager.cluster_env_id = cluster_env_id
            cluster_manager.build_configs(timeout=build_timeout)
            cluster_timeout = test["run"].get("session_timeout",
                                              DEFAULT_CLUSTER_TIMEOUT)
            # Note: autosuspend comes from the "cluster" section here, not
            # "run".
            autosuspend_mins = test["cluster"].get("autosuspend_mins", None)
            if autosuspend_mins:
                cluster_manager.autosuspend_minutes = autosuspend_mins
            else:
                # Default: outlive the command timeout by 10 minutes, capped
                # at DEFAULT_AUTOSUSPEND_MINS.
                cluster_manager.autosuspend_minutes = min(
                    DEFAULT_AUTOSUSPEND_MINS, int(command_timeout / 60) + 10)
            buildkite_group(":rocket: Starting up cluster")
            cluster_manager.start_cluster(timeout=cluster_timeout)
        result.cluster_url = cluster_manager.get_cluster_url()
        # Upload files
        buildkite_group(":wrench: Preparing remote environment")
        command_runner.prepare_remote_env()
        # Optionally wait until the requested number of nodes is up.
        wait_for_nodes = test["run"].get("wait_for_nodes", None)
        if wait_for_nodes:
            buildkite_group(":stopwatch: Waiting for nodes to come up")
            num_nodes = test["run"]["wait_for_nodes"]["num_nodes"]
            wait_timeout = test["run"]["wait_for_nodes"].get(
                "timeout", DEFAULT_WAIT_FOR_NODES_TIMEOUT)
            command_runner.wait_for_nodes(num_nodes, wait_timeout)
        # Optional prepare command, run before the actual test script.
        prepare_cmd = test["run"].get("prepare", None)
        if prepare_cmd:
            prepare_timeout = test["run"].get("prepare_timeout",
                                              command_timeout)
            try:
                command_runner.run_prepare_command(prepare_cmd,
                                                   timeout=prepare_timeout)
            except CommandError as e:
                raise PrepareCommandError(e)
            except CommandTimeout as e:
                raise PrepareCommandTimeout(e)
        buildkite_group(":runner: Running test script")
        command = test["run"]["script"]
        command_env = {}
        if smoke_test:
            command = f"{command} --smoke-test"
            command_env["IS_SMOKE_TEST"] = "1"
        is_long_running = test["run"].get("long_running", False)
        try:
            command_runner.run_command(command, env=command_env,
                                       timeout=command_timeout)
        except CommandError as e:
            raise TestCommandError(e)
        except CommandTimeout as e:
            if not is_long_running:
                # Only raise error if command is not long running
                raise TestCommandTimeout(e)
        buildkite_group(":floppy_disk: Fetching results")
        # Fetching results is best-effort; failure only logs an error.
        try:
            command_results = command_runner.fetch_results()
        except Exception as e:
            logger.error("Could not fetch results for test command")
            logger.exception(e)
            command_results = {}
        # Postprocess result:
        if "last_update" in command_results:
            command_results["last_update_diff"] = time.time(
            ) - command_results.get("last_update", 0.0)
        if smoke_test:
            command_results["smoke_test"] = True
        result.results = command_results
        result.status = "finished"
    except Exception as e:
        logger.exception(e)
        buildkite_open_last()
        pipeline_exception = e
    # Always try to fetch the last logs, success or failure.
    try:
        last_logs = command_runner.get_last_logs()
    except Exception as e:
        logger.error(f"Error fetching logs: {e}")
        last_logs = "No logs could be retrieved."
    result.last_logs = last_logs
    # Terminate the cluster (best-effort) unless explicitly kept alive.
    if not no_terminate:
        buildkite_group(":earth_africa: Terminating cluster")
        try:
            cluster_manager.terminate_cluster(wait=False)
        except Exception as e:
            logger.error(f"Could not terminate cluster: {e}")
    time_taken = time.monotonic() - start_time
    result.runtime = time_taken
    os.chdir(old_wd)
    if not pipeline_exception:
        buildkite_group(":mag: Interpreting results")
        # Only handle results if we didn't run into issues earlier
        try:
            handle_result(test, result)
        except Exception as e:
            pipeline_exception = e
    if pipeline_exception:
        buildkite_group(":rotating_light: Handling errors")
        # Translate the exception into a return code / status for the result.
        exit_code, error_type, runtime = handle_exception(pipeline_exception)
        result.return_code = exit_code.value
        result.status = error_type
        if runtime is not None:
            result.runtime = runtime
    # Report results even on failure; reporter errors are only logged.
    buildkite_group(":memo: Reporting results", open=True)
    reporters = reporters or []
    for reporter in reporters:
        try:
            reporter.report_result(test, result)
        except Exception as e:
            logger.error(f"Error reporting results via {type(reporter)}: {e}")
    if pipeline_exception:
        # Surface the original failure after reporting.
        raise pipeline_exception
    return result
def main(
    test_name: str,
    test_collection_file: Optional[str] = None,
    smoke_test: bool = False,
    report: bool = False,
    ray_wheels: Optional[str] = None,
    cluster_id: Optional[str] = None,
    cluster_env_id: Optional[str] = None,
    env: Optional[str] = None,
    no_terminate: bool = False,
):
    """CLI entry point: resolve a release test by name, run it, and exit.

    Exits the process with the pipeline's return code. On a
    ``ReleaseTestError`` the exit code is taken from the exception's
    ``exit_code``.
    """
    # Fall back to the repository-level release test collection file.
    test_collection_file = test_collection_file or os.path.join(
        os.path.dirname(__file__), "..", "..", "release_tests.yaml")
    test_collection = read_and_validate_release_test_collection(
        test_collection_file)
    test = find_test(test_collection, test_name)

    if not test:
        raise ReleaseTestCLIError(
            f"Test `{test_name}` not found in collection file: "
            f"{test_collection_file}")

    if smoke_test:
        test = as_smoke_test(test)

    # Load and export the environment variables configured for this run.
    env_to_use = env or test.get("env", DEFAULT_ENVIRONMENT)
    env_dict = load_environment(env_to_use)
    populate_os_env(env_dict)

    # Tests may pin a specific python version for the wheel lookup.
    if "python" in test:
        python_version = parse_python_version(test["python"])
    else:
        python_version = DEFAULT_PYTHON_VERSION

    ray_wheels_url = find_and_wait_for_ray_wheels_url(
        ray_wheels,
        python_version=python_version,
        timeout=DEFAULT_WHEEL_WAIT_TIMEOUT)

    anyscale_project = os.environ.get("ANYSCALE_PROJECT", None)
    if not anyscale_project:
        raise ReleaseTestCLIError(
            "You have to set the ANYSCALE_PROJECT environment variable!")

    maybe_fetch_api_token()

    result = Result()

    reporters = [LogReporter()]
    if "BUILDKITE" in os.environ:
        reporters.append(ArtifactsReporter())
    if report:
        reporters.append(LegacyRDSReporter())
        reporters.append(DBReporter())

    try:
        result = run_release_test(
            test,
            anyscale_project=anyscale_project,
            result=result,
            ray_wheels_url=ray_wheels_url,
            reporters=reporters,
            smoke_test=smoke_test,
            cluster_id=cluster_id,
            cluster_env_id=cluster_env_id,
            no_terminate=no_terminate,
        )
        return_code = result.return_code
    except ReleaseTestError as e:
        logger.exception(e)
        return_code = e.exit_code.value

    logger.info(f"Release test pipeline for test {test['name']} completed. "
                f"Returning with exit code = {return_code}")
    # BUG FIX: exit with the locally computed return code. The previous
    # `sys.exit(result.return_code)` ignored `return_code` and used a stale
    # (default) value whenever a ReleaseTestError was raised before
    # `result.return_code` was populated (e.g. config/setup errors).
    sys.exit(return_code)