def report_result(self, test: Test, result: Result):
    """Send one release test result record to the Firehose delivery stream.

    The record combines metadata read from ``test`` with outcome fields read
    from ``result``. Delivery is best-effort: a failure while sending is
    logged and not propagated to the caller.

    Args:
        test: Test configuration; ``name``, ``group``, ``team`` and
            ``frequency`` are read with an empty-string fallback.
        result: Result object whose status, results, URLs, runtime,
            stability flag and return code are reported.
    """
    logger.info("Persisting result to the databricks delta lake...")

    # Keys are inserted in a fixed order so the serialized JSON is stable.
    record = {
        "_table": "release_test_result",
        "report_timestamp_ms": int(time.time() * 1000),
        "status": result.status or "",
        "results": result.results or {},
    }

    # Descriptive test metadata; missing keys are reported as empty strings.
    for field in ("name", "group", "team", "frequency"):
        record[field] = test.get(field, "")

    # Falsy URLs become "" and a falsy runtime becomes -1.0.
    record.update(
        cluster_url=result.cluster_url or "",
        wheel_url=result.wheels_url or "",
        buildkite_url=result.buildkite_url or "",
        runtime=result.runtime or -1.0,
        stable=result.stable,
        return_code=result.return_code,
    )

    logger.debug(f"Result json: {json.dumps(record)}")

    try:
        self.firehose.put_record(
            DeliveryStreamName="ray-ci-results",
            Record={"Data": json.dumps(record)},
        )
    except Exception:
        logger.exception(
            "Failed to persist result to the databricks delta lake")
        return

    logger.info("Result has been persisted to the databricks delta lake")
def main(
        test_name: str,
        test_collection_file: Optional[str] = None,
        smoke_test: bool = False,
        report: bool = False,
        ray_wheels: Optional[str] = None,
        cluster_id: Optional[str] = None,
        cluster_env_id: Optional[str] = None,
        no_terminate: bool = False,
):
    """Run a single release test by name and exit with its return code.

    Args:
        test_name: Name of the test to look up in the collection file.
        test_collection_file: Path to the test collection. When not given,
            ``release_tests.yaml`` two directories above this file is used.
        smoke_test: If set, the test is converted with ``as_smoke_test``
            and run as a smoke test.
        report: If set, a ``LegacyRDSReporter`` is added to the reporters.
        ray_wheels: Passed to ``find_and_wait_for_ray_wheels_url``.
        cluster_id: Forwarded to ``run_release_test``.
        cluster_env_id: Forwarded to ``run_release_test``.
        no_terminate: Forwarded to ``run_release_test``.

    Raises:
        ReleaseTestCLIError: If the test is not found or the
            ``ANYSCALE_PROJECT`` environment variable is not set.
    """
    if not test_collection_file:
        test_collection_file = os.path.join(
            os.path.dirname(__file__), "..", "..", "release_tests.yaml")

    collection = read_and_validate_release_test_collection(
        test_collection_file)

    test = find_test(collection, test_name)
    if not test:
        raise ReleaseTestCLIError(
            f"Test `{test_name}` not found in collection file: "
            f"{test_collection_file}")

    if smoke_test:
        test = as_smoke_test(test)

    ray_wheels_url = find_and_wait_for_ray_wheels_url(
        ray_wheels, timeout=DEFAULT_WHEEL_WAIT_TIMEOUT)

    anyscale_project = os.environ.get("ANYSCALE_PROJECT")
    if not anyscale_project:
        raise ReleaseTestCLIError(
            "You have to set the ANYSCALE_PROJECT environment variable!")

    maybe_fetch_api_token()

    # This object is handed to run_release_test, so it still carries the
    # outcome if the run raises instead of returning.
    result = Result()

    reporters = [LogReporter()]
    if report:
        reporters.append(LegacyRDSReporter())

    run_kwargs = dict(
        anyscale_project=anyscale_project,
        result=result,
        ray_wheels_url=ray_wheels_url,
        reporters=reporters,
        smoke_test=smoke_test,
        cluster_id=cluster_id,
        cluster_env_id=cluster_env_id,
        no_terminate=no_terminate,
    )

    try:
        result = run_release_test(test, **run_kwargs)
    except ReleaseTestError as e:
        logger.exception(e)

    exit_status = result.return_code
    logger.info(f"Release test pipeline for test {test['name']} completed. "
                f"Returning with exit code = {exit_status}")
    sys.exit(exit_status)
def prepare_remote_env(self):
    """Stage the cluster wait script locally and upload via the file manager.

    ``_wait_cluster.py`` (located next to this module) is hard-linked into
    the current working directory as ``wait_cluster.py``, replacing any file
    already present under that name. ``self.file_manager.upload()`` is then
    called.

    Raises:
        RemoteEnvSetupError: If the upload raises any exception.
    """
    link_name = "wait_cluster.py"
    script_source = os.path.join(
        os.path.dirname(__file__), "_wait_cluster.py")

    # Remove an existing file first: os.link fails if the target exists.
    if os.path.exists(link_name):
        os.unlink(link_name)
    os.link(script_source, link_name)

    try:
        self.file_manager.upload()
    except Exception as upload_error:
        logger.exception(upload_error)
        raise RemoteEnvSetupError(
            f"Error setting up remote environment: {upload_error}"
        ) from upload_error
def run_release_test(
        test: Test,
        anyscale_project: str,
        result: Result,
        ray_wheels_url: str,
        reporters: Optional[List[Reporter]] = None,
        smoke_test: bool = False,
        cluster_id: Optional[str] = None,
        cluster_env_id: Optional[str] = None,
        no_terminate: bool = False,
) -> Result:
    """Run the full release test pipeline for one test and report the outcome.

    The pipeline changes into the test's working directory, resolves and
    instantiates the cluster manager, file manager and command runner, sets
    up the local environment, builds/starts (or re-uses) a cluster, prepares
    the remote environment, runs the optional prepare command and the test
    script, fetches results, terminates the cluster, interprets the results
    and finally passes the result to every reporter.

    Args:
        test: Test configuration (mapping-like; ``name``, ``working_dir``,
            ``run`` and ``cluster`` entries are read).
        anyscale_project: Project passed to the cluster manager.
        result: Result object that is filled in place and returned.
        ray_wheels_url: Wheels URL recorded on the result and used for the
            cluster environment and local environment setup.
        reporters: Reporters invoked with ``(test, result)`` at the end.
            ``None`` is treated as an empty list.
        smoke_test: If set, ``--smoke-test`` is appended to the command and
            ``IS_SMOKE_TEST=1`` is passed in the command environment.
        cluster_id: If set, this existing cluster is re-used instead of
            building configs and starting a new cluster.
        cluster_env_id: If set, this existing cluster environment is used
            instead of the one loaded from the test configuration.
        no_terminate: If set, the cluster is not terminated afterwards.

    Returns:
        The ``result`` object passed in, populated with the outcome.

    Raises:
        ReleaseTestConfigError: If the command runner type or file manager
            named in the test configuration is unknown.
        ReleaseTestSetupError: If instantiating the managers or the command
            runner fails.
        Exception: Any exception captured while running the pipeline or
            interpreting results is re-raised after reporting.
    """
    buildkite_group(":spiral_note_pad: Loading test configuration")

    validate_test(test)

    result.wheels_url = ray_wheels_url
    result.stable = test.get("stable", True)
    result.smoke_test = smoke_test

    buildkite_url = os.getenv("BUILDKITE_BUILD_URL", "")
    if buildkite_url:
        buildkite_url += "#" + os.getenv("BUILDKITE_JOB_ID", "")
    result.buildkite_url = buildkite_url

    working_dir = test["working_dir"]

    old_wd = os.getcwd()
    new_wd = os.path.join(RELEASE_PACKAGE_DIR, working_dir)
    os.chdir(new_wd)

    start_time = time.monotonic()

    try:
        run_type = test["run"].get("type", "sdk_command")

        command_runner_cls = type_str_to_command_runner.get(run_type)
        if not command_runner_cls:
            raise ReleaseTestConfigError(
                f"Unknown command runner type: {run_type}. Must be one of "
                f"{list(type_str_to_command_runner.keys())}")

        cluster_manager_cls = command_runner_to_cluster_manager[
            command_runner_cls]

        file_manager_str = test["run"].get("file_manager", None)
        if file_manager_str:
            if file_manager_str not in file_manager_str_to_file_manager:
                raise ReleaseTestConfigError(
                    f"Unknown file manager: {file_manager_str}. Must be one of "
                    f"{list(file_manager_str_to_file_manager.keys())}")
            file_manager_cls = file_manager_str_to_file_manager[
                file_manager_str]
        else:
            file_manager_cls = command_runner_to_file_manager[
                command_runner_cls]

        # Instantiate managers and command runner
        try:
            cluster_manager = cluster_manager_cls(
                test["name"], anyscale_project, smoke_test=smoke_test)
            file_manager = file_manager_cls(cluster_manager=cluster_manager)
            command_runner = command_runner_cls(cluster_manager, file_manager,
                                                working_dir)
        except Exception as e:
            raise ReleaseTestSetupError(
                f"Error setting up release test: {e}") from e
    except Exception:
        # These early failures leave the function before the regular
        # `os.chdir(old_wd)` below is reached, so restore the caller's
        # working directory here before propagating the error.
        os.chdir(old_wd)
        raise

    # First exception raised by the pipeline (if any). It is kept so that
    # cleanup and reporting still run, and is re-raised at the very end.
    pipeline_exception = None

    try:
        # Load configs
        cluster_env = load_test_cluster_env(
            test, ray_wheels_url=ray_wheels_url)
        cluster_compute = load_test_cluster_compute(test)

        if cluster_env_id:
            try:
                cluster_manager.cluster_env_id = cluster_env_id
                cluster_manager.build_cluster_env()
                cluster_manager.fetch_build_info()
                logger.info("Using overridden cluster environment with ID "
                            f"{cluster_env_id} and build ID "
                            f"{cluster_manager.cluster_env_build_id}")
            except Exception as e:
                raise ClusterEnvCreateError(
                    f"Could not get existing overridden cluster environment "
                    f"{cluster_env_id}: {e}") from e
        else:
            cluster_manager.set_cluster_env(cluster_env)

        cluster_manager.set_cluster_compute(cluster_compute)

        buildkite_group(":nut_and_bolt: Setting up local environment")
        driver_setup_script = test.get("driver_setup", None)
        if driver_setup_script:
            try:
                run_bash_script(driver_setup_script)
            except Exception as e:
                raise LocalEnvSetupError(
                    f"Driver setup script failed: {e}") from e

        # Install local dependencies
        command_runner.prepare_local_env(ray_wheels_url)
        command_timeout = test["run"].get("timeout", DEFAULT_COMMAND_TIMEOUT)

        # Re-install anyscale package as local dependencies might have changed
        # from local env setup
        reinstall_anyscale_dependencies()

        # Print installed pip packages
        buildkite_group(":bulb: Local environment information")
        pip_packages = get_pip_packages()
        pip_package_string = "\n".join(pip_packages)
        logger.info(f"Installed python packages:\n{pip_package_string}")

        # Start cluster
        if cluster_id:
            buildkite_group(":rocket: Using existing cluster")
            # Re-use existing cluster ID for development
            cluster_manager.cluster_id = cluster_id
            cluster_manager.cluster_name = get_cluster_name(cluster_id)
        else:
            buildkite_group(":gear: Building cluster environment")
            build_timeout = test["run"].get("build_timeout",
                                            DEFAULT_BUILD_TIMEOUT)

            if cluster_env_id:
                cluster_manager.cluster_env_id = cluster_env_id

            cluster_manager.build_configs(timeout=build_timeout)

            cluster_timeout = test["run"].get("session_timeout",
                                              DEFAULT_CLUSTER_TIMEOUT)

            # Use the configured autosuspend if given; otherwise derive it
            # from the command timeout (in minutes, plus a 10 minute
            # margin), capped at DEFAULT_AUTOSUSPEND_MINS.
            autosuspend_mins = test["cluster"].get("autosuspend_mins", None)
            if autosuspend_mins:
                cluster_manager.autosuspend_minutes = autosuspend_mins
            else:
                cluster_manager.autosuspend_minutes = min(
                    DEFAULT_AUTOSUSPEND_MINS,
                    int(command_timeout / 60) + 10)

            buildkite_group(":rocket: Starting up cluster")
            cluster_manager.start_cluster(timeout=cluster_timeout)

        result.cluster_url = cluster_manager.get_cluster_url()

        # Upload files
        buildkite_group(":wrench: Preparing remote environment")
        command_runner.prepare_remote_env()

        wait_for_nodes = test["run"].get("wait_for_nodes", None)
        if wait_for_nodes:
            buildkite_group(":stopwatch: Waiting for nodes to come up")
            num_nodes = test["run"]["wait_for_nodes"]["num_nodes"]
            wait_timeout = test["run"]["wait_for_nodes"].get(
                "timeout", DEFAULT_WAIT_FOR_NODES_TIMEOUT)
            command_runner.wait_for_nodes(num_nodes, wait_timeout)

        prepare_cmd = test["run"].get("prepare", None)
        if prepare_cmd:
            prepare_timeout = test["run"].get("prepare_timeout",
                                              command_timeout)
            try:
                command_runner.run_prepare_command(
                    prepare_cmd, timeout=prepare_timeout)
            except CommandError as e:
                raise PrepareCommandError(e)
            except CommandTimeout as e:
                raise PrepareCommandTimeout(e)

        buildkite_group(":runner: Running test script")
        command = test["run"]["script"]
        command_env = {}

        if smoke_test:
            command = f"{command} --smoke-test"
            command_env["IS_SMOKE_TEST"] = "1"

        is_long_running = test["run"].get("long_running", False)

        try:
            command_runner.run_command(
                command, env=command_env, timeout=command_timeout)
        except CommandError as e:
            raise TestCommandError(e)
        except CommandTimeout as e:
            if not is_long_running:
                # Only raise error if command is not long running
                raise TestCommandTimeout(e)

        buildkite_group(":floppy_disk: Fetching results")
        try:
            command_results = command_runner.fetch_results()
        except Exception as e:
            # A failure to fetch results is logged and the pipeline
            # continues with empty results.
            logger.error("Could not fetch results for test command")
            logger.exception(e)
            command_results = {}

        # Postprocess result:
        if "last_update" in command_results:
            command_results["last_update_diff"] = time.time(
            ) - command_results.get("last_update", 0.0)
        if smoke_test:
            command_results["smoke_test"] = True

        result.results = command_results
        result.status = "finished"

    except Exception as e:
        logger.exception(e)
        buildkite_open_last()
        pipeline_exception = e

    # The steps below run whether or not the pipeline failed.
    try:
        last_logs = command_runner.get_last_logs()
    except Exception as e:
        logger.error(f"Error fetching logs: {e}")
        last_logs = "No logs could be retrieved."

    result.last_logs = last_logs

    if not no_terminate:
        buildkite_group(":earth_africa: Terminating cluster")
        try:
            cluster_manager.terminate_cluster(wait=False)
        except Exception as e:
            logger.error(f"Could not terminate cluster: {e}")

    time_taken = time.monotonic() - start_time
    result.runtime = time_taken

    os.chdir(old_wd)

    if not pipeline_exception:
        buildkite_group(":mag: Interpreting results")
        # Only handle results if we didn't run into issues earlier
        try:
            handle_result(test, result)
        except Exception as e:
            pipeline_exception = e

    if pipeline_exception:
        buildkite_group(":rotating_light: Handling errors")
        exit_code, error_type, runtime = handle_exception(pipeline_exception)

        result.return_code = exit_code.value
        result.status = error_type
        if runtime is not None:
            result.runtime = runtime

    buildkite_group(":memo: Reporting results", open=True)
    reporters = reporters or []
    for reporter in reporters:
        # A failing reporter must not prevent the remaining ones from running.
        try:
            reporter.report_result(test, result)
        except Exception as e:
            logger.error(f"Error reporting results via {type(reporter)}: {e}")

    if pipeline_exception:
        raise pipeline_exception

    return result
def main(
        test_name: str,
        test_collection_file: Optional[str] = None,
        smoke_test: bool = False,
        report: bool = False,
        ray_wheels: Optional[str] = None,
        cluster_id: Optional[str] = None,
        cluster_env_id: Optional[str] = None,
        env: Optional[str] = None,
        no_terminate: bool = False,
):
    """Run a single release test by name and exit with its return code.

    Args:
        test_name: Name of the test to look up in the collection file.
        test_collection_file: Path to the test collection. When not given,
            ``release_tests.yaml`` two directories above this file is used.
        smoke_test: If set, the test is converted with ``as_smoke_test``
            and run as a smoke test.
        report: If set, ``LegacyRDSReporter`` and ``DBReporter`` are added
            to the reporters.
        ray_wheels: Passed to ``find_and_wait_for_ray_wheels_url``.
        cluster_id: Forwarded to ``run_release_test``.
        cluster_env_id: Forwarded to ``run_release_test``.
        env: Name of the environment to load. Falls back to the test's
            ``env`` entry and then to ``DEFAULT_ENVIRONMENT``.
        no_terminate: Forwarded to ``run_release_test``.

    Raises:
        ReleaseTestCLIError: If the test is not found or the
            ``ANYSCALE_PROJECT`` environment variable is not set.
    """
    test_collection_file = test_collection_file or os.path.join(
        os.path.dirname(__file__), "..", "..", "release_tests.yaml")
    test_collection = read_and_validate_release_test_collection(
        test_collection_file)

    test = find_test(test_collection, test_name)

    if not test:
        raise ReleaseTestCLIError(
            f"Test `{test_name}` not found in collection file: "
            f"{test_collection_file}")

    if smoke_test:
        test = as_smoke_test(test)

    # The environment is loaded before ANYSCALE_PROJECT is read below.
    # NOTE(review): presumably populate_os_env exports these values into
    # os.environ - confirm against its definition.
    env_to_use = env or test.get("env", DEFAULT_ENVIRONMENT)
    env_dict = load_environment(env_to_use)
    populate_os_env(env_dict)

    if "python" in test:
        python_version = parse_python_version(test["python"])
    else:
        python_version = DEFAULT_PYTHON_VERSION

    ray_wheels_url = find_and_wait_for_ray_wheels_url(
        ray_wheels,
        python_version=python_version,
        timeout=DEFAULT_WHEEL_WAIT_TIMEOUT)

    anyscale_project = os.environ.get("ANYSCALE_PROJECT", None)
    if not anyscale_project:
        raise ReleaseTestCLIError(
            "You have to set the ANYSCALE_PROJECT environment variable!")

    maybe_fetch_api_token()

    result = Result()

    reporters = [LogReporter()]

    if "BUILDKITE" in os.environ:
        reporters.append(ArtifactsReporter())

    if report:
        reporters.append(LegacyRDSReporter())
        reporters.append(DBReporter())

    try:
        result = run_release_test(
            test,
            anyscale_project=anyscale_project,
            result=result,
            ray_wheels_url=ray_wheels_url,
            reporters=reporters,
            smoke_test=smoke_test,
            cluster_id=cluster_id,
            cluster_env_id=cluster_env_id,
            no_terminate=no_terminate,
        )
        return_code = result.return_code
    except ReleaseTestError as e:
        logger.exception(e)
        # `result` is not reassigned when the run raises, so take the exit
        # code from the exception itself.
        return_code = e.exit_code.value

    logger.info(f"Release test pipeline for test {test['name']} completed. "
                f"Returning with exit code = {return_code}")

    # Exit with the code that was just computed and logged. Previously this
    # used `result.return_code`, which could differ from the logged value
    # on the exception path.
    sys.exit(return_code)