def wait_ready(self, deadline):
    with cli_logger.group(
        "Waiting for SSH to become available", _numbered=("[]", 1, NUM_SETUP_STEPS)
    ):
        with LogTimer(self.log_prefix + "Got remote shell"):
            cli_logger.print("Running `{}` as a test.", cf.bold("uptime"))

            first_conn_refused_time = None
            while True:
                if time.time() > deadline:
                    raise Exception("wait_ready timeout exceeded.")
                if self.provider.is_terminated(self.node_id):
                    raise Exception(
                        "wait_ready aborting because node "
                        "detected as terminated."
                    )

                try:
                    # Run outside of the container
                    self.cmd_runner.run("uptime", timeout=5, run_env="host")
                    cli_logger.success("Success.")
                    return True
                except ProcessRunnerError as e:
                    first_conn_refused_time = cmd_output_util.handle_ssh_fails(
                        e,
                        first_conn_refused_time,
                        retry_interval=READY_CHECK_INTERVAL,
                    )
                    time.sleep(READY_CHECK_INTERVAL)
                except Exception as e:
                    # TODO(maximsmol): we should not be ignoring
                    # exceptions if they get filtered properly
                    # (new style log + non-interactive shells)
                    #
                    # however threading this configuration state
                    # is a pain and I'm leaving it for later

                    retry_str = "(" + str(e) + ")"
                    if hasattr(e, "cmd"):
                        if isinstance(e.cmd, str):
                            cmd_ = e.cmd
                        elif isinstance(e.cmd, list):
                            cmd_ = " ".join(e.cmd)
                        else:
                            logger.debug(
                                f"e.cmd type ({type(e.cmd)}) not list or str."
                            )
                            cmd_ = str(e.cmd)
                        retry_str = "(Exit Status {}): {}".format(
                            e.returncode, cmd_
                        )

                    cli_logger.print(
                        "SSH still not available {}, retrying in {} seconds.",
                        cf.dimmed(retry_str),
                        cf.bold(str(READY_CHECK_INTERVAL)),
                    )
                    time.sleep(READY_CHECK_INTERVAL)
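# A minimal standalone sketch of the deadline-polling pattern `wait_ready`
# implements above, assuming only the standard library. `check` stands in for
# the `uptime` probe; all names here are illustrative, not part of Ray.
import time


def poll_until_ready(check, deadline, interval=5):
    """Retry `check()` until it succeeds or `deadline` (epoch seconds) passes."""
    while True:
        if time.time() > deadline:
            raise TimeoutError("poll_until_ready: deadline exceeded")
        try:
            check()
            return True
        except Exception:
            # Swallow transient failures and retry after a short pause.
            time.sleep(interval)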
def run(
    config_or_import_path: str,
    runtime_env: str,
    runtime_env_json: str,
    working_dir: str,
    app_dir: str,
    address: str,
    host: str,
    port: int,
    blocking: bool,
):
    sys.path.insert(0, app_dir)

    final_runtime_env = parse_runtime_env_args(
        runtime_env=runtime_env,
        runtime_env_json=runtime_env_json,
        working_dir=working_dir,
    )

    if pathlib.Path(config_or_import_path).is_file():
        config_path = config_or_import_path
        cli_logger.print(f'Deploying from config file: "{config_path}".')

        with open(config_path, "r") as config_file:
            config = ServeApplicationSchema.parse_obj(yaml.safe_load(config_file))
        is_config = True
    else:
        import_path = config_or_import_path
        cli_logger.print(f'Deploying from import path: "{import_path}".')
        node = import_attr(import_path)
        is_config = False

    # Setting the runtime_env here will set defaults for the deployments.
    ray.init(
        address=address, namespace=SERVE_NAMESPACE, runtime_env=final_runtime_env
    )
    client = serve.start(detached=True)

    try:
        if is_config:
            client.deploy_app(config)
        else:
            serve.run(node, host=host, port=port)
        cli_logger.success("Deployed successfully.")

        if blocking:
            while True:
                # Block, letting Ray print logs to the terminal.
                time.sleep(10)
    except KeyboardInterrupt:
        cli_logger.info("Got KeyboardInterrupt, shutting down...")
        serve.shutdown()
        sys.exit()
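# Hypothetical import-path target for the `run` command above (e.g.
# `serve run my_module:app`). This sketches the smallest thing `import_attr`
# could resolve and `serve.run` could deploy; the module, class, and
# attribute names are made up for illustration.
from ray import serve


@serve.deployment
class Hello:
    def __call__(self, request):
        return "hello"


app = Hello.bind()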
def wait_ready(self, deadline): with cli_logger.group("Waiting for SSH to become available", _numbered=("[]", 1, 6)): with LogTimer(self.log_prefix + "Got remote shell"): cli_logger.old_info(logger, "{}Waiting for remote shell...", self.log_prefix) cli_logger.print("Running `{}` as a test.", cf.bold("uptime")) first_conn_refused_time = None while time.time() < deadline and \ not self.provider.is_terminated(self.node_id): try: cli_logger.old_debug(logger, "{}Waiting for remote shell...", self.log_prefix) # Run outside of the container self.cmd_runner.run("uptime", timeout=5, run_env="host") cli_logger.old_debug(logger, "Uptime succeeded.") cli_logger.success("Success.") return True except ProcessRunnerError as e: first_conn_refused_time = \ cmd_output_util.handle_ssh_fails( e, first_conn_refused_time, retry_interval=READY_CHECK_INTERVAL) time.sleep(READY_CHECK_INTERVAL) except Exception as e: # TODO(maximsmol): we should not be ignoring # exceptions if they get filtered properly # (new style log + non-interactive shells) # # however threading this configuration state # is a pain and I'm leaving it for later retry_str = str(e) if hasattr(e, "cmd"): retry_str = "(Exit Status {}): {}".format( e.returncode, " ".join(e.cmd)) cli_logger.print( "SSH still not available {}, " "retrying in {} seconds.", cf.dimmed(retry_str), cf.bold(str(READY_CHECK_INTERVAL))) cli_logger.old_debug(logger, "{}Node not up, retrying: {}", self.log_prefix, retry_str) time.sleep(READY_CHECK_INTERVAL) assert False, "Unable to connect to node"
def delete(address: str, yes: bool):
    if not yes:
        click.confirm(
            f"\nThis will shut down the Serve application at address "
            f'"{address}" and delete all deployments there. Do you '
            "want to continue?",
            abort=True,
        )

    ServeSubmissionClient(address).delete_application()

    cli_logger.newline()
    cli_logger.success("\nSent delete request successfully!\n")
    cli_logger.newline()
def deploy(config_file_name: str, address: str):
    with open(config_file_name, "r") as config_file:
        config = yaml.safe_load(config_file)

    # Schematize config to validate format.
    ServeApplicationSchema.parse_obj(config)

    ServeSubmissionClient(address).deploy_application(config)

    cli_logger.newline()
    cli_logger.success(
        "\nSent deploy request successfully!\n "
        "* Use `serve status` to check deployments' statuses.\n "
        "* Use `serve config` to see the running app's config.\n"
    )
    cli_logger.newline()
def run(
    config_or_import_path: str,
    runtime_env: str,
    runtime_env_json: str,
    working_dir: str,
    app_dir: str,
    address: str,
    host: str,
    port: int,
    blocking: bool,
):
    sys.path.insert(0, app_dir)

    final_runtime_env = parse_runtime_env_args(
        runtime_env=runtime_env,
        runtime_env_json=runtime_env_json,
        working_dir=working_dir,
    )

    app_or_node = None
    if pathlib.Path(config_or_import_path).is_file():
        config_path = config_or_import_path
        cli_logger.print(f"Loading app from config file: '{config_path}'.")
        with open(config_path, "r") as config_file:
            app_or_node = Application.from_yaml(config_file)
    else:
        import_path = config_or_import_path
        cli_logger.print(f"Loading app from import path: '{import_path}'.")
        app_or_node = import_attr(import_path)

    # Setting the runtime_env here will set defaults for the deployments.
    ray.init(address=address, namespace="serve", runtime_env=final_runtime_env)

    try:
        serve.run(app_or_node, host=host, port=port)
        cli_logger.success("Deployed successfully!\n")

        if blocking:
            while True:
                statuses = serve_application_status_to_schema(
                    get_deployment_statuses()
                ).json(indent=4)
                cli_logger.info(f"{statuses}")
                time.sleep(10)
    except KeyboardInterrupt:
        cli_logger.info("Got KeyboardInterrupt, shutting down...")
        serve.shutdown()
        sys.exit()
def deploy(config_file_name: str, address: str):
    full_address_path = f"{address}/api/serve/deployments/"

    with open(config_file_name, "r") as config_file:
        config = yaml.safe_load(config_file)

    # Generate a schema using the config to ensure its format is valid.
    ServeApplicationSchema.parse_obj(config)

    response = requests.put(full_address_path, json=config)

    if response.status_code == 200:
        cli_logger.newline()
        cli_logger.success(
            "\nSent deploy request successfully!\n "
            "* Use `serve status` to check your deployments' statuses.\n "
            "* Use `serve info` to see your running Serve "
            "application's configuration.\n")
        cli_logger.newline()
    else:
        log_failed_request(response, address)
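# Standalone sketch of the same REST call `deploy` makes above, assuming a
# Serve agent is reachable at the default dashboard agent address; the URL
# and the config file name are placeholders, not guaranteed defaults.
import requests
import yaml

with open("serve_config.yaml", "r") as f:
    config = yaml.safe_load(f)

response = requests.put(
    "http://127.0.0.1:52365/api/serve/deployments/", json=config)
response.raise_for_status()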
def _log_big_success_msg(success_msg):
    cli_logger.newline()
    cli_logger.success("-" * len(success_msg))
    cli_logger.success(success_msg)
    cli_logger.success("-" * len(success_msg))
    cli_logger.newline()
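# Hypothetical call site for the banner helper above: it frames the message
# between dash rules of matching length.
_log_big_success_msg("Ray runtime started.")
# Output (via cli_logger.success):
# --------------------
# Ray runtime started.
# --------------------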
def teardown_cluster(config_file: str, yes: bool, workers_only: bool,
                     override_cluster_name: Optional[str],
                     keep_min_workers: bool):
    """Destroys all nodes of a Ray cluster described by a config file."""
    config = yaml.safe_load(open(config_file).read())
    if override_cluster_name is not None:
        config["cluster_name"] = override_cluster_name
    config = prepare_config(config)
    validate_config(config)

    cli_logger.confirm(yes, "Destroying cluster.", _abort=True)
    cli_logger.old_confirm("This will destroy your cluster", yes)

    if not workers_only:
        try:
            exec_cluster(
                config_file,
                cmd="ray stop",
                run_env="auto",
                screen=False,
                tmux=False,
                stop=False,
                start=False,
                override_cluster_name=override_cluster_name,
                port_forward=None,
                with_output=False)
        except Exception as e:
            # todo: add better exception info
            cli_logger.verbose_error("{}", str(e))
            cli_logger.warning(
                "Exception occurred when stopping the cluster Ray runtime "
                "(use -v to dump teardown exceptions).")
            cli_logger.warning(
                "Ignoring the exception and "
                "attempting to shut down the cluster nodes anyway.")

            cli_logger.old_exception(
                logger, "Ignoring error attempting a clean shutdown.")

    provider = _get_node_provider(config["provider"], config["cluster_name"])
    try:

        def remaining_nodes():
            workers = provider.non_terminated_nodes(
                {TAG_RAY_NODE_KIND: NODE_KIND_WORKER})

            if keep_min_workers:
                min_workers = config.get("min_workers", 0)

                cli_logger.print(
                    "{} random worker nodes will not be shut down. " +
                    cf.dimmed("(due to {})"), cf.bold(min_workers),
                    cf.bold("--keep-min-workers"))
                cli_logger.old_info(logger,
                                    "teardown_cluster: Keeping {} nodes...",
                                    min_workers)

                workers = random.sample(workers, len(workers) - min_workers)

            # todo: it's weird to kill the head node but not all workers
            if workers_only:
                cli_logger.print(
                    "The head node will not be shut down. " +
                    cf.dimmed("(due to {})"), cf.bold("--workers-only"))

                return workers

            head = provider.non_terminated_nodes(
                {TAG_RAY_NODE_KIND: NODE_KIND_HEAD})

            return head + workers

        def run_docker_stop(node, container_name):
            try:
                updater = NodeUpdaterThread(
                    node_id=node,
                    provider_config=config["provider"],
                    provider=provider,
                    auth_config=config["auth"],
                    cluster_name=config["cluster_name"],
                    file_mounts=config["file_mounts"],
                    initialization_commands=[],
                    setup_commands=[],
                    ray_start_commands=[],
                    runtime_hash="",
                    file_mounts_contents_hash="",
                    is_head_node=False,
                    docker_config=config.get("docker"))
                _exec(
                    updater,
                    f"docker stop {container_name}",
                    False,
                    False,
                    run_env="host")
            except Exception:
                cli_logger.warning(f"Docker stop failed on {node}")
                cli_logger.old_warning(logger, f"Docker stop failed on {node}")

        # Loop here to check that both the head and worker nodes are actually
        # really gone
        A = remaining_nodes()

        container_name = config.get("docker", {}).get("container_name")
        if container_name:
            for node in A:
                run_docker_stop(node, container_name)

        with LogTimer("teardown_cluster: done."):
            while A:
                cli_logger.old_info(
                    logger, "teardown_cluster: "
                    "Shutting down {} nodes...", len(A))

                provider.terminate_nodes(A)

                cli_logger.print(
                    "Requested {} nodes to shut down.",
                    cf.bold(len(A)),
                    _tags=dict(interval="1s"))

                time.sleep(POLL_INTERVAL)  # todo: interval should be a variable
                A = remaining_nodes()
                cli_logger.print("{} nodes remaining after {} second(s).",
                                 cf.bold(len(A)), POLL_INTERVAL)
        cli_logger.success("No nodes remaining.")
    finally:
        provider.cleanup()
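# Hedged usage sketch for `teardown_cluster` above: destroy the cluster
# described by a config file without prompting. The file name is a
# placeholder; the keyword arguments match the signature above.
teardown_cluster(
    config_file="cluster.yaml",
    yes=True,
    workers_only=False,
    override_cluster_name=None,
    keep_min_workers=False,
)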
cli_logger.old_style = False
cli_logger.verbosity = 999
cli_logger.detect_colors()

cli_logger.print(
    cf.bold("Bold ") + cf.italic("Italic ") + cf.underlined("Underlined"))
cli_logger.labeled_value("Label", "value")
cli_logger.print("List: {}", cli_logger.render_list([1, 2, 3]))
cli_logger.newline()
cli_logger.very_verbose("Very verbose")
cli_logger.verbose("Verbose")
cli_logger.verbose_warning("Verbose warning")
cli_logger.verbose_error("Verbose error")
cli_logger.print("Info")
cli_logger.success("Success")
cli_logger.warning("Warning")
cli_logger.error("Error")
cli_logger.newline()
try:
    cli_logger.abort("Abort")
except Exception:
    pass
try:
    cli_logger.doassert(False, "Assert")
except Exception:
    pass
cli_logger.newline()
cli_logger.confirm(True, "example")
cli_logger.newline()
with cli_logger.indented():
    cli_logger.print("Indented")
def stop(force, verbose, log_style, log_color):
    """Stop Ray processes manually on the local machine."""
    cli_logger.configure(log_style, log_color, verbose)

    # Note that raylet needs to exit before object store, otherwise
    # it cannot exit gracefully.
    is_linux = sys.platform.startswith("linux")
    processes_to_kill = [
        # The first element is the substring to filter.
        # The second element, if True, is to filter ps results by command name
        # (only the first 15 characters of the executable name on Linux);
        # if False, is to filter ps results by command with all its arguments.
        # See STANDARD FORMAT SPECIFIERS section of
        # http://man7.org/linux/man-pages/man1/ps.1.html
        # about comm and args. This can help avoid killing non-ray processes.
        # Format:
        # Keyword to filter, filter by command (True)/filter by args (False)
        ["raylet", True],
        ["plasma_store", True],
        ["gcs_server", True],
        ["monitor.py", False],
        ["redis-server", False],
        ["default_worker.py", False],  # Python worker.
        ["ray::", True],  # Python worker. TODO(mehrdadn): Fix for Windows
        ["io.ray.runtime.runner.worker.DefaultWorker", False],  # Java worker.
        ["log_monitor.py", False],
        ["reporter.py", False],
        ["dashboard.py", False],
        ["new_dashboard/agent.py", False],
        ["ray_process_reaper.py", False],
    ]

    process_infos = []
    for proc in psutil.process_iter(["name", "cmdline"]):
        try:
            process_infos.append((proc, proc.name(), proc.cmdline()))
        except psutil.Error:
            pass

    total_found = 0
    total_stopped = 0
    for keyword, filter_by_cmd in processes_to_kill:
        if filter_by_cmd and is_linux and len(keyword) > 15:
            # getting here is an internal bug, so we do not use cli_logger
            msg = ("The filter string should not be more than {} "
                   "characters. Actual length: {}. Filter: {}").format(
                       15, len(keyword), keyword)
            raise ValueError(msg)

        found = []
        for candidate in process_infos:
            proc, proc_cmd, proc_args = candidate
            corpus = (proc_cmd
                      if filter_by_cmd else subprocess.list2cmdline(proc_args))
            if keyword in corpus:
                found.append(candidate)

        for proc, proc_cmd, proc_args in found:
            total_found += 1

            proc_string = str(subprocess.list2cmdline(proc_args))
            try:
                if force:
                    proc.kill()
                else:
                    # TODO(mehrdadn): On Windows, this is forceful termination.
                    # We don't want CTRL_BREAK_EVENT, because that would
                    # terminate the entire process group. What to do?
                    proc.terminate()

                if force:
                    cli_logger.verbose("Killed `{}` {} ",
                                       cf.bold(proc_string),
                                       cf.dimmed("(via SIGKILL)"))
                else:
                    cli_logger.verbose("Sent termination request to `{}` {}",
                                       cf.bold(proc_string),
                                       cf.dimmed("(via SIGTERM)"))

                total_stopped += 1
            except psutil.NoSuchProcess:
                cli_logger.verbose(
                    "Attempted to stop `{}`, but process was already dead.",
                    cf.bold(proc_string))
            except (psutil.Error, OSError) as ex:
                cli_logger.error("Could not terminate `{}` due to {}",
                                 cf.bold(proc_string), str(ex))

    if total_found == 0:
        cli_logger.print("Did not find any active Ray processes.")
    else:
        if total_stopped == total_found:
            cli_logger.success("Stopped all {} Ray processes.", total_stopped)
        else:
            cli_logger.warning(
                "Stopped only {} out of {} Ray processes. "
                "Set `{}` to see more details.", total_stopped, total_found,
                cf.bold("-v"))
            cli_logger.warning("Try running the command again, or use `{}`.",
                               cf.bold("--force"))
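# Standalone sketch of the keyword matching used by `stop` above, assuming
# psutil is installed. It mirrors the comm-vs-args distinction: match either
# the executable name or the full command line. The helper name is
# illustrative, not part of Ray.
import subprocess

import psutil


def find_matching_processes(keyword, filter_by_cmd):
    """Return psutil.Process objects whose name or command line contains keyword."""
    matches = []
    for proc in psutil.process_iter(["name", "cmdline"]):
        try:
            corpus = (proc.name() if filter_by_cmd else
                      subprocess.list2cmdline(proc.cmdline()))
        except psutil.Error:
            continue  # Process exited or access was denied; skip it.
        if keyword in corpus:
            matches.append(proc)
    return matches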
def start(node_ip_address, address, port, redis_password, redis_shard_ports,
          object_manager_port, node_manager_port, gcs_server_port,
          min_worker_port, max_worker_port, worker_port_list, memory,
          object_store_memory, redis_max_memory, num_cpus, num_gpus,
          resources, head, include_dashboard, dashboard_host, dashboard_port,
          block, plasma_directory, autoscaling_config,
          no_redirect_worker_output, no_redirect_output,
          plasma_store_socket_name, raylet_socket_name, temp_dir,
          java_worker_options, load_code_from_local, code_search_path,
          system_config, lru_evict, enable_object_reconstruction,
          metrics_export_port, log_style, log_color, verbose):
    """Start Ray processes manually on the local machine."""
    cli_logger.configure(log_style, log_color, verbose)

    if gcs_server_port and not head:
        raise ValueError(
            "gcs_server_port can only be assigned when you specify --head.")

    # Convert hostnames to numerical IP address.
    if node_ip_address is not None:
        node_ip_address = services.address_to_ip(node_ip_address)

    redis_address = None
    if address is not None:
        (redis_address, redis_address_ip,
         redis_address_port) = services.validate_redis_address(address)

    try:
        resources = json.loads(resources)
    except Exception:
        cli_logger.error("`{}` is not a valid JSON string.",
                         cf.bold("--resources"))
        cli_logger.abort(
            "Valid values look like this: `{}`",
            cf.bold("--resources='{\"CustomResource3\": 1, "
                    "\"CustomResource2\": 2}'"))

        raise Exception("Unable to parse the --resources argument using "
                        "json.loads. Try using a format like\n\n"
                        "    --resources='{\"CustomResource1\": 3, "
                        "\"CustomResource2\": 2}'")

    redirect_worker_output = None if not no_redirect_worker_output else True
    redirect_output = None if not no_redirect_output else True

    ray_params = ray.parameter.RayParams(
        node_ip_address=node_ip_address,
        min_worker_port=min_worker_port,
        max_worker_port=max_worker_port,
        worker_port_list=worker_port_list,
        object_manager_port=object_manager_port,
        node_manager_port=node_manager_port,
        gcs_server_port=gcs_server_port,
        memory=memory,
        object_store_memory=object_store_memory,
        redis_password=redis_password,
        redirect_worker_output=redirect_worker_output,
        redirect_output=redirect_output,
        num_cpus=num_cpus,
        num_gpus=num_gpus,
        resources=resources,
        plasma_directory=plasma_directory,
        huge_pages=False,
        plasma_store_socket_name=plasma_store_socket_name,
        raylet_socket_name=raylet_socket_name,
        temp_dir=temp_dir,
        include_dashboard=include_dashboard,
        dashboard_host=dashboard_host,
        dashboard_port=dashboard_port,
        java_worker_options=java_worker_options,
        load_code_from_local=load_code_from_local,
        code_search_path=code_search_path,
        _system_config=system_config,
        lru_evict=lru_evict,
        enable_object_reconstruction=enable_object_reconstruction,
        metrics_export_port=metrics_export_port)

    if head:
        # Use default if port is none, allocate an available port if port is 0
        if port is None:
            port = ray_constants.DEFAULT_PORT

        if port == 0:
            with socket() as s:
                s.bind(("", 0))
                port = s.getsockname()[1]

        num_redis_shards = None
        # Start Ray on the head node.
        if redis_shard_ports is not None:
            redis_shard_ports = redis_shard_ports.split(",")
            # Infer the number of Redis shards from the ports if the number
            # is not provided.
            num_redis_shards = len(redis_shard_ports)

        if redis_address is not None:
            cli_logger.abort(
                "`{}` starts a new Redis server, `{}` should not be set.",
                cf.bold("--head"), cf.bold("--address"))

            raise Exception("If --head is passed in, a Redis server will be "
                            "started, so a Redis address should not be "
                            "provided.")

        node_ip_address = services.get_node_ip_address()

        # Get the node IP address if one is not provided.
        ray_params.update_if_absent(node_ip_address=node_ip_address)
        cli_logger.labeled_value("Local node IP", ray_params.node_ip_address)
        ray_params.update_if_absent(
            redis_port=port,
            redis_shard_ports=redis_shard_ports,
            redis_max_memory=redis_max_memory,
            num_redis_shards=num_redis_shards,
            redis_max_clients=None,
            autoscaling_config=autoscaling_config,
        )

        # Fail early when starting a new cluster when one is already running
        if address is None:
            default_address = f"{node_ip_address}:{port}"
            redis_addresses = services.find_redis_address(default_address)
            if len(redis_addresses) > 0:
                raise ConnectionError(
                    f"Ray is already running at {default_address}. "
                    f"Please specify a different port using the `--port` "
                    f"flag of `ray start`.")

        node = ray.node.Node(
            ray_params, head=True, shutdown_at_exit=block, spawn_reaper=block)
        redis_address = node.redis_address

        # this is a noop if new-style is not set, so the old logger calls
        # are still in place
        cli_logger.newline()
        startup_msg = "Ray runtime started."
        cli_logger.success("-" * len(startup_msg))
        cli_logger.success(startup_msg)
        cli_logger.success("-" * len(startup_msg))
        cli_logger.newline()
        with cli_logger.group("Next steps"):
            cli_logger.print(
                "To connect to this Ray runtime from another node, run")
            cli_logger.print(
                cf.bold("  ray start --address='{}'{}"), redis_address,
                " --redis-password='******'" if redis_password else "")
            cli_logger.newline()
            cli_logger.print("Alternatively, use the following Python code:")
            with cli_logger.indented():
                with cf.with_style("monokai") as c:
                    cli_logger.print("{} ray", c.magenta("import"))
                    cli_logger.print(
                        "ray{}init(address{}{}{})", c.magenta("."),
                        c.magenta("="), c.yellow("'auto'"),
                        ", _redis_password{}{}".format(
                            c.magenta("="),
                            c.yellow("'" + redis_password + "'"))
                        if redis_password else "")
            cli_logger.newline()
            cli_logger.print(
                cf.underlined("If connection fails, check your "
                              "firewall settings and "
                              "network configuration."))
            cli_logger.newline()
            cli_logger.print("To terminate the Ray runtime, run")
            cli_logger.print(cf.bold("  ray stop"))
    else:
        # Start Ray on a non-head node.
        if port is not None:
            cli_logger.abort("`{}` should not be specified without `{}`.",
                             cf.bold("--port"), cf.bold("--head"))

            raise Exception("If --head is not passed in, --port is not "
                            "allowed.")
        if redis_shard_ports is not None:
            cli_logger.abort("`{}` should not be specified without `{}`.",
                             cf.bold("--redis-shard-ports"), cf.bold("--head"))

            raise Exception("If --head is not passed in, --redis-shard-ports "
                            "is not allowed.")
        if redis_address is None:
            cli_logger.abort("`{}` is required unless starting with `{}`.",
                             cf.bold("--address"), cf.bold("--head"))

            raise Exception("If --head is not passed in, --address must "
                            "be provided.")
        if include_dashboard:
            cli_logger.abort("`{}` should not be specified without `{}`.",
                             cf.bold("--include-dashboard"), cf.bold("--head"))

            raise ValueError(
                "If --head is not passed in, the --include-dashboard "
                "flag is not relevant.")

        # Wait for the Redis server to be started. And throw an exception if
        # we can't connect to it.
        services.wait_for_redis_to_start(
            redis_address_ip, redis_address_port, password=redis_password)

        # Create a Redis client.
        redis_client = services.create_redis_client(
            redis_address, password=redis_password)

        # Check that the version information on this node matches the version
        # information that the cluster was started with.
        services.check_version_info(redis_client)

        # Get the node IP address if one is not provided.
        ray_params.update_if_absent(
            node_ip_address=services.get_node_ip_address(redis_address))
        cli_logger.labeled_value("Local node IP", ray_params.node_ip_address)

        # Check that there aren't already Redis clients with the same IP
        # address connected with this Redis instance. This raises an exception
        # if the Redis server already has clients on this node.
        check_no_existing_redis_clients(ray_params.node_ip_address,
                                        redis_client)
        ray_params.update(redis_address=redis_address)
        node = ray.node.Node(
            ray_params, head=False, shutdown_at_exit=block, spawn_reaper=block)

        cli_logger.newline()
        startup_msg = "Ray runtime started."
        cli_logger.success("-" * len(startup_msg))
        cli_logger.success(startup_msg)
        cli_logger.success("-" * len(startup_msg))
        cli_logger.newline()
        cli_logger.print("To terminate the Ray runtime, run")
        cli_logger.print(cf.bold("  ray stop"))

    if block:
        cli_logger.newline()
        with cli_logger.group(cf.bold("--block")):
            cli_logger.print(
                "This command will now block until terminated by a signal.")
            cli_logger.print(
                "Running subprocesses are monitored and a message will be "
                "printed if any of them terminate unexpectedly.")

        while True:
            time.sleep(1)
            deceased = node.dead_processes()
            if len(deceased) > 0:
                cli_logger.newline()
                cli_logger.error("Some Ray subprocesses exited unexpectedly:")
                with cli_logger.indented():
                    for process_type, process in deceased:
                        cli_logger.error(
                            "{}",
                            cf.bold(str(process_type)),
                            _tags={"exit code": str(process.returncode)})

                # shutdown_at_exit will handle cleanup.
                cli_logger.newline()
                cli_logger.error("Remaining processes will be killed.")
                sys.exit(1)
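# The ephemeral-port trick from the head-node branch above, in isolation:
# binding to port 0 asks the OS to pick a free port, which is then read back.
# Note the port is only reserved while the socket is open, so a small race
# remains between closing it and reusing the number.
from socket import socket

with socket() as s:
    s.bind(("", 0))
    free_port = s.getsockname()[1]
print(free_port)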