def _check_if_container_restart_is_needed(
        self, image: str, cleaned_bind_mounts: Dict[str, str]) -> bool:
    re_init_required = False
    running_image = (self.run(
        check_docker_image(self.container_name, self.docker_cmd),
        with_output=True,
        run_env="host",
    ).decode("utf-8").strip())
    if running_image != image:
        cli_logger.error(
            "A container with name {} is running image {} instead " +
            "of {} (which was provided in the YAML)",
            self.container_name,
            running_image,
            image,
        )
    mounts = (self.run(
        check_bind_mounts_cmd(self.container_name, self.docker_cmd),
        with_output=True,
        run_env="host",
    ).decode("utf-8").strip())
    try:
        active_mounts = json.loads(mounts)
        active_remote_mounts = {
            mnt["Destination"].strip("/")
            for mnt in active_mounts
        }
        # Ignore ray bootstrap files.
        requested_remote_mounts = {
            self._docker_expand_user(remote).strip("/")
            for remote in cleaned_bind_mounts.keys()
        }
        unfulfilled_mounts = requested_remote_mounts - active_remote_mounts
        if unfulfilled_mounts:
            re_init_required = True
            cli_logger.warning(
                "This Docker Container is already running. "
                "Restarting the Docker container on "
                "this node to pick up the following file_mounts {}",
                unfulfilled_mounts,
            )
    except json.JSONDecodeError:
        cli_logger.verbose(
            "Unable to check if file_mounts specified in the YAML "
            "differ from those on the running container.")
    return re_init_required
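

# Hedged, self-contained sketch of the bind-mount comparison performed
# above, runnable without a Docker daemon. The sample JSON below stands in
# for the output of the check_bind_mounts_cmd inspection; the mount names
# are illustrative, not taken from a real cluster.
import json

sample_mounts_json = json.dumps([
    {"Source": "/tmp/ray_tmp_mount/app", "Destination": "/app"},
])
active_remote_mounts = {
    mnt["Destination"].strip("/") for mnt in json.loads(sample_mounts_json)
}
requested_remote_mounts = {"app", "data"}  # e.g. from cleaned bind mounts

# "data" was requested but is not mounted, so a restart would be required.
assert requested_remote_mounts - active_remote_mounts == {"data"}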
def handle_ssh_fails(e, first_conn_refused_time, retry_interval):
    """Handle SSH system failures coming from a subprocess.

    Args:
        e: The `ProcessRunnerException` to handle.
        first_conn_refused_time: The time (as reported by this function)
            or None, indicating the last time a CONN_REFUSED error was
            caught. After exceeding a patience value, the program will
            be aborted since SSH will likely never recover.
        retry_interval: The interval after which the command will be
            retried, used here just to inform the user.
    """
    if e.msg_type != "ssh_command_failed":
        return

    if e.special_case == "ssh_conn_refused":
        if (first_conn_refused_time is not None
                and time.time() - first_conn_refused_time
                > CONN_REFUSED_PATIENCE):
            cli_logger.error(
                "SSH connection was being refused "
                "for {} seconds. Head node assumed "
                "unreachable.", cf.bold(str(CONN_REFUSED_PATIENCE)))
            cli_logger.abort("Check the node's firewall settings "
                             "and the cloud network configuration.")

        cli_logger.warning("SSH connection was refused.")
        cli_logger.warning("This might mean that the SSH daemon is "
                           "still setting up, or that "
                           "the host is inaccessible (e.g. due to "
                           "a firewall).")

        return time.time()

    if e.special_case in ["ssh_timeout", "ssh_conn_refused"]:
        cli_logger.print(
            "SSH still not available, retrying in {} seconds.",
            cf.bold(str(retry_interval)))
    else:
        raise e

    return first_conn_refused_time
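

# A hedged sketch (not from the source) of the retry loop a caller might
# wrap around handle_ssh_fails. `run_ssh_command` is a hypothetical
# callable and ProcessRunnerException is the exception type named in the
# docstring; the point is that the returned timestamp is threaded back
# into the next call until CONN_REFUSED_PATIENCE triggers an abort.
import time

def run_with_ssh_retry(run_ssh_command, retry_interval=5):
    first_conn_refused_time = None
    while True:
        try:
            return run_ssh_command()
        except ProcessRunnerException as e:
            first_conn_refused_time = handle_ssh_fails(
                e, first_conn_refused_time, retry_interval)
            time.sleep(retry_interval)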
def run_docker_stop(node, container_name):
    try:
        updater = NodeUpdaterThread(
            node_id=node,
            provider_config=config["provider"],
            provider=provider,
            auth_config=config["auth"],
            cluster_name=config["cluster_name"],
            file_mounts=config["file_mounts"],
            initialization_commands=[],
            setup_commands=[],
            ray_start_commands=[],
            runtime_hash="",
            file_mounts_contents_hash="",
            is_head_node=False,
            docker_config=config.get("docker"))
        _exec(updater, cmd=f"docker stop {container_name}", run_env="host")
    except Exception:
        cli_logger.warning(f"Docker stop failed on {node}")
def warn_about_bad_start_command(start_commands: List[str],
                                 no_monitor_on_head: bool = False) -> None:
    ray_start_cmd = list(filter(lambda x: "ray start" in x, start_commands))
    if len(ray_start_cmd) == 0:
        cli_logger.warning(
            "Ray runtime will not be started because `{}` is not in `{}`.",
            cf.bold("ray start"), cf.bold("head_start_ray_commands"))

    autoscaling_config_in_ray_start_cmd = any(
        "autoscaling-config" in x for x in ray_start_cmd)
    if not (autoscaling_config_in_ray_start_cmd or no_monitor_on_head):
        cli_logger.warning(
            "The head node will not launch any workers because "
            "`{}` does not have `{}` set.\n"
            "Potential fix: add `{}` to the `{}` command under `{}`.",
            cf.bold("ray start"), cf.bold("--autoscaling-config"),
            cf.bold("--autoscaling-config=~/ray_bootstrap_config.yaml"),
            cf.bold("ray start"), cf.bold("head_start_ray_commands"))
def prepare_manual(config: Dict[str, Any]) -> Dict[str, Any]:
    """Validates and sets defaults for configs of manually managed
    on-prem clusters.

    - Checks for presence of the required `worker_ips` and `head_ip` fields.
    - Defaults min and max workers to the number of `worker_ips`.
    - Caps min and max workers at the number of `worker_ips`.
    - Writes min and max worker info into the single worker node type.
    """
    config = copy.deepcopy(config)

    if ("worker_ips" not in config["provider"]) or (
            "head_ip" not in config["provider"]):
        cli_logger.abort("Please supply a `head_ip` and list of `worker_ips`. "
                         "Alternatively, supply a `coordinator_address`.")

    num_ips = len(config["provider"]["worker_ips"])
    node_type = config["available_node_types"][LOCAL_CLUSTER_NODE_TYPE]

    # Default to keeping all provided ips in the cluster.
    config.setdefault("max_workers", num_ips)

    # The autoscaler no longer uses global `min_workers`.
    # We will move `min_workers` to the node_type config.
    min_workers = config.pop("min_workers", num_ips)
    max_workers = config["max_workers"]

    if min_workers > num_ips:
        cli_logger.warning(
            f"The value of `min_workers` supplied ({min_workers}) is greater"
            f" than the number of available worker ips ({num_ips})."
            f" Setting `min_workers={num_ips}`.")
        node_type["min_workers"] = num_ips
    else:
        node_type["min_workers"] = min_workers

    if max_workers > num_ips:
        cli_logger.warning(
            f"The value of `max_workers` supplied ({max_workers}) is greater"
            f" than the number of available worker ips ({num_ips})."
            f" Setting `max_workers={num_ips}`.")
        node_type["max_workers"] = num_ips
        config["max_workers"] = num_ips
    else:
        node_type["max_workers"] = max_workers

    return config
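

# Worked example (illustrative IPs) of what prepare_manual does to a
# minimal manual on-prem config: with two worker IPs and no min/max
# workers given, both default to 2 and are written into the node type.
example_config = {
    "provider": {
        "head_ip": "192.168.0.1",
        "worker_ips": ["192.168.0.2", "192.168.0.3"],
    },
    "available_node_types": {LOCAL_CLUSTER_NODE_TYPE: {}},
}
prepared = prepare_manual(example_config)
assert prepared["max_workers"] == 2
assert prepared["available_node_types"][LOCAL_CLUSTER_NODE_TYPE] == {
    "min_workers": 2,
    "max_workers": 2,
}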
def job_logs(address: Optional[str], job_id: str, follow: bool):
    """Gets the logs of a job.

    Example:
        >>> ray job logs <my_job_id>
    """
    client = _get_sdk_client(address)
    sdk_version = client.get_version()
    # sdk version 0 did not have log streaming
    if follow:
        if int(sdk_version) > 0:
            asyncio.get_event_loop().run_until_complete(
                _tail_logs(client, job_id))
        else:
            cli_logger.warning(
                "Tailing logs is not enabled for job sdk client version "
                f"{sdk_version}. Please upgrade your Ray to the latest "
                "version for this feature.")
    else:
        print(client.get_job_logs(job_id), end="")
def rsync_up(cluster_config_file, source, target, cluster_name, all_nodes,
             log_style, log_color, verbose):
    """Upload specific files to a Ray cluster."""
    cli_logger.configure(log_style, log_color, verbose)

    if all_nodes:
        cli_logger.warning(
            "WARNING: the `all_nodes` option is deprecated and will be "
            "removed in the future. "
            "Rsync to worker nodes is not reliable since workers may be "
            "added during autoscaling. Please use the `file_mounts` "
            "feature instead for consistent file sync in autoscaling "
            "clusters.")

    rsync(
        cluster_config_file,
        source,
        target,
        cluster_name,
        down=False,
        all_nodes=all_nodes)
def add_callback_handler(
        self,
        event: str,
        callback: Union[Callable[[Dict], None],
                        List[Callable[[Dict], None]]],
):
    """Stores callback handler for event.

    Args:
        event: Event that callback should be called on. See
            CreateClusterEvent for details on the events available to be
            registered against.
        callback (Callable[[Dict], None]): Callable object that is invoked
            when specified event occurs.
    """
    if event not in CreateClusterEvent.__members__.values():
        cli_logger.warning(f"{event} is not currently tracked, and this "
                           "callback will not be invoked.")
    self.callback_map.setdefault(event, []).extend(
        callback if isinstance(callback, list) else [callback])
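

# Usage sketch: registering a single handler and a list of handlers for
# the same event. `event_system` stands for whatever instance exposes
# add_callback_handler; CreateClusterEvent.up_started is one of the
# tracked events, and the handler bodies are illustrative.
def on_up_started(event_data: Dict) -> None:
    print("cluster startup began:", event_data)

event_system.add_callback_handler(CreateClusterEvent.up_started,
                                  on_up_started)
event_system.add_callback_handler(
    CreateClusterEvent.up_started,
    [lambda data: print("first"), lambda data: print("second")])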
def fill_node_type_max_workers(config):
    """Sets default per-node max workers to global max_workers.

    This is equivalent to setting the default per-node max workers to
    infinity, with the only upper constraint coming from the global
    max_workers.
    """
    assert "max_workers" in config, "Global max workers should be set."
    node_types = config["available_node_types"]
    for node_type_name in node_types:
        node_type_data = node_types[node_type_name]

        # Log a warning if the head node type's max_workers is absent.
        if (node_type_name == config["head_node_type"]
                and "max_workers" not in node_type_data):
            cli_logger.warning(
                HEAD_TYPE_MAX_WORKERS_WARN_TEMPLATE.format(
                    node_type=node_type_name,
                    max_workers=config["max_workers"],
                    version=ray.__version__))

        # The key part of this function:
        node_type_data.setdefault("max_workers", config["max_workers"])
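

# Worked example (illustrative values): the head node type keeps its
# explicit max_workers, while the worker type inherits the global value.
config = {
    "max_workers": 10,
    "head_node_type": "head",
    "available_node_types": {
        "head": {"max_workers": 0},
        "worker": {},
    },
}
fill_node_type_max_workers(config)
assert config["available_node_types"]["head"]["max_workers"] == 0
assert config["available_node_types"]["worker"]["max_workers"] == 10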
def logs(address: Optional[str], job_id: str, follow: bool):
    """Gets the logs of a job.

    Example:
        ray job logs <my_job_id>
    """
    client = _get_sdk_client(address)
    sdk_version = client.get_version()
    # sdk version 0 did not have log streaming
    if follow:
        if int(sdk_version) > 0:
            asyncio.get_event_loop().run_until_complete(
                _tail_logs(client, job_id))
        else:
            cli_logger.warning(
                "Tailing logs is not enabled for job sdk client version "
                f"{sdk_version}. Please upgrade your Ray to the latest "
                "version for this feature.")
    else:
        # Set no_format to True because the logs may have unescaped "{"
        # and "}" and the CLILogger calls str.format().
        cli_logger.print(client.get_job_logs(job_id), end="", no_format=True)
def _set_ssh_ip_if_required(self):
    if self.ssh_ip is not None:
        return

    # We assume that this never changes.
    # I think that's reasonable.
    deadline = time.time() + AUTOSCALER_NODE_START_WAIT_S
    with LogTimer(self.log_prefix + "Got IP"):
        ip = self._wait_for_ip(deadline)

        cli_logger.doassert(ip is not None,
                            "Could not get node IP.")  # todo: msg
        assert ip is not None, "Unable to find IP of node"

    self.ssh_ip = ip

    # This should run before any SSH commands and therefore ensure that
    # the ControlPath directory exists, allowing SSH to maintain
    # persistent sessions later on.
    try:
        os.makedirs(self.ssh_control_path, mode=0o700, exist_ok=True)
    except OSError as e:
        cli_logger.warning("{}", str(e))  # todo: msg
def _get_running_head_node(
        config: Dict[str, Any],
        printable_config_file: str,
        override_cluster_name: Optional[str],
        create_if_needed: bool = False,
        _provider: Optional[NodeProvider] = None) -> str:
    """Get a valid, running head node."""
    provider = _provider or _get_node_provider(config["provider"],
                                               config["cluster_name"])
    head_node_tags = {
        TAG_RAY_NODE_KIND: NODE_KIND_HEAD,
    }
    nodes = provider.non_terminated_nodes(head_node_tags)
    head_node = None
    for node in nodes:
        node_state = provider.node_tags(node).get(TAG_RAY_NODE_STATUS)
        if node_state == STATUS_UP_TO_DATE:
            head_node = node
        else:
            cli_logger.warning(f"Head node ({node}) is in state {node_state}.")

    if head_node is not None:
        return head_node
    elif create_if_needed:
        get_or_create_head_node(
            config,
            printable_config_file=printable_config_file,
            restart_only=False,
            no_restart=False,
            yes=True,
            override_cluster_name=override_cluster_name)
        return _get_running_head_node(
            config,
            printable_config_file,
            override_cluster_name,
            create_if_needed=False)
    else:
        raise RuntimeError("Head node of cluster ({}) not found!".format(
            config["cluster_name"]))
def _info_from_params(
        cluster: Optional[str] = None,
        host: Optional[str] = None,
        ssh_user: Optional[str] = None,
        ssh_key: Optional[str] = None,
        docker: Optional[str] = None,
):
    """Parse command line arguments.

    Note: This returns a list of hosts, not a comma separated string!
    """
    if not host and not cluster:
        bootstrap_config = os.path.expanduser("~/ray_bootstrap_config.yaml")
        if os.path.exists(bootstrap_config):
            cluster = bootstrap_config
            cli_logger.warning(f"Detected cluster config file at {cluster}. "
                               f"If this is incorrect, specify with "
                               f"`ray cluster-dump <config>`")
    elif cluster:
        cluster = os.path.expanduser(cluster)

    cluster_name = None

    if cluster:
        h, u, k, d, cluster_name = get_info_from_ray_cluster_config(cluster)

        ssh_user = ssh_user or u
        ssh_key = ssh_key or k
        docker = docker or d
        hosts = host.split(",") if host else h

        if not hosts:
            raise LocalCommandFailed(
                f"Invalid cluster file or cluster has no running nodes: "
                f"{cluster}")
    elif host:
        hosts = host.split(",")
    else:
        raise LocalCommandFailed(
            "You need to either specify a `<cluster_config>` or `--host`.")

    if not ssh_user:
        ssh_user = DEFAULT_SSH_USER
        cli_logger.warning(
            f"Using default SSH user `{ssh_user}`. "
            f"If this is incorrect, specify with `--ssh-user <user>`")

    if not ssh_key:
        for cand_key in DEFAULT_SSH_KEYS:
            cand_key_file = os.path.expanduser(cand_key)
            if os.path.exists(cand_key_file):
                ssh_key = cand_key_file
                cli_logger.warning(
                    f"Auto detected SSH key file: {ssh_key}. "
                    f"If this is incorrect, specify with `--ssh-key <key>`")
                break

    return cluster, hosts, ssh_user, ssh_key, docker, cluster_name
def warn_about_bad_start_command(start_commands):
    ray_start_cmd = list(filter(lambda x: "ray start" in x, start_commands))
    if len(ray_start_cmd) == 0:
        cli_logger.warning(
            "Ray runtime will not be started because `{}` is not in `{}`.",
            cf.bold("ray start"), cf.bold("head_start_ray_commands"))
        cli_logger.old_warning(
            logger,
            "Ray start is not included in the head_start_ray_commands "
            "section.")
    if not any("autoscaling-config" in x for x in ray_start_cmd):
        cli_logger.warning(
            "The head node will not launch any workers because "
            "`{}` does not have `{}` set.\n"
            "Potential fix: add `{}` to the `{}` command under `{}`.",
            cf.bold("ray start"), cf.bold("--autoscaling-config"),
            cf.bold("--autoscaling-config=~/ray_bootstrap_config.yaml"),
            cf.bold("ray start"), cf.bold("head_start_ray_commands"))
        cli_logger.old_warning(
            logger,
            "Ray start on the head node does not have the flag "
            "--autoscaling-config set. The head node will not launch "
            "workers. Add --autoscaling-config=~/ray_bootstrap_config.yaml "
            "to ray start in the head_start_ray_commands section.")
def up(cluster_config_file, min_workers, max_workers, no_restart,
       restart_only, yes, cluster_name, no_config_cache,
       redirect_command_output, use_login_shells, log_style, log_color,
       verbose):
    """Create or update a Ray cluster."""
    cli_logger.configure(log_style, log_color, verbose)

    if restart_only or no_restart:
        cli_logger.doassert(restart_only != no_restart,
                            "`{}` is incompatible with `{}`.",
                            cf.bold("--restart-only"),
                            cf.bold("--no-restart"))
        assert restart_only != no_restart, "Cannot set both 'restart_only' " \
            "and 'no_restart' at the same time!"

    if urllib.parse.urlparse(cluster_config_file).scheme in ("http", "https"):
        try:
            response = urllib.request.urlopen(cluster_config_file, timeout=5)
            content = response.read()
            file_name = cluster_config_file.split("/")[-1]
            with open(file_name, "wb") as f:
                f.write(content)
            cluster_config_file = file_name
        except urllib.error.HTTPError as e:
            cli_logger.warning("{}", str(e))
            cli_logger.warning(
                "Could not download remote cluster configuration file.")
            cli_logger.old_info(logger, "Error downloading file: ", e)

    create_or_update_cluster(
        config_file=cluster_config_file,
        override_min_workers=min_workers,
        override_max_workers=max_workers,
        no_restart=no_restart,
        restart_only=restart_only,
        yes=yes,
        override_cluster_name=cluster_name,
        no_config_cache=no_config_cache,
        redirect_command_output=redirect_command_output,
        use_login_shells=use_login_shells)
def check_legacy_fields(config: Dict[str, Any]) -> None:
    """For use in providers that have completed the migration to
    available_node_types.

    Warns the user that the head_node and worker_nodes fields are being
    ignored. Throws an error if available_node_types and head_node_type
    aren't specified.
    """
    # Log a warning if the head_node field is non-empty.
    if "head_node" in config and config["head_node"]:
        cli_logger.warning(
            "The `head_node` field is deprecated and will be ignored. "
            "Use `head_node_type` and `available_node_types` instead.")
    # Log a warning if the worker_nodes field is non-empty.
    if "worker_nodes" in config and config["worker_nodes"]:
        cli_logger.warning(
            "The `worker_nodes` field is deprecated and will be ignored. "
            "Use `available_node_types` instead.")
    if "available_node_types" not in config:
        cli_logger.error("`available_node_types` not specified in config")
        raise ValueError("`available_node_types` not specified in config")
    if "head_node_type" not in config:
        cli_logger.error("`head_node_type` not specified in config")
        raise ValueError("`head_node_type` not specified in config")
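

# Example (illustrative config): deprecated fields alongside the new ones
# only produce a warning; omitting head_node_type would raise ValueError.
legacy_config = {
    "head_node": {"InstanceType": "m5.large"},  # non-empty: warned, ignored
    "worker_nodes": {},                         # empty: no warning
    "available_node_types": {"default": {}},
    "head_node_type": "default",
}
check_legacy_fields(legacy_config)  # logs one warning, returns None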
def _bootstrap_config(config: Dict[str, Any],
                      no_config_cache: bool = False) -> Dict[str, Any]:
    config = prepare_config(config)

    hasher = hashlib.sha1()
    hasher.update(json.dumps([config], sort_keys=True).encode("utf-8"))
    cache_key = os.path.join(tempfile.gettempdir(),
                             "ray-config-{}".format(hasher.hexdigest()))

    if os.path.exists(cache_key) and not no_config_cache:
        cli_logger.old_info(logger, "Using cached config at {}", cache_key)

        config_cache = json.loads(open(cache_key).read())
        if config_cache.get("_version", -1) == CONFIG_CACHE_VERSION:
            # todo: is it fine to re-resolve? afaik it should be.
            # we can have migrations otherwise or something
            # but this seems overcomplicated given that resolving is
            # relatively cheap
            try_reload_log_state(config_cache["config"]["provider"],
                                 config_cache.get("provider_log_info"))

            if log_once("_printed_cached_config_warning"):
                cli_logger.verbose_warning(
                    "Loaded cached provider configuration "
                    "from " + cf.bold("{}"), cache_key)
                if cli_logger.verbosity == 0:
                    cli_logger.warning("Loaded cached provider configuration")
                cli_logger.warning(
                    "If you experience issues with "
                    "the cloud provider, try re-running "
                    "the command with {}.", cf.bold("--no-config-cache"))

            return config_cache["config"]
        else:
            cli_logger.warning(
                "Found cached cluster config "
                "but the version " + cf.bold("{}") + " "
                "(expected " + cf.bold("{}") + ") does not match.\n"
                "This is normal if cluster launcher was updated.\n"
                "Config will be re-resolved.",
                config_cache.get("_version", "none"), CONFIG_CACHE_VERSION)
    validate_config(config)

    importer = _NODE_PROVIDERS.get(config["provider"]["type"])
    if not importer:
        raise NotImplementedError("Unsupported provider {}".format(
            config["provider"]))

    provider_cls = importer(config["provider"])
def submit(cluster_config_file, screen, tmux, stop, start, cluster_name,
           no_config_cache, port_forward, script, args, script_args,
           log_style, log_color, verbose):
    """Uploads and runs a script on the specified cluster.

    The script is automatically synced to the following location:

        os.path.join("~", os.path.basename(script))

    Example:
        >>> ray submit [CLUSTER.YAML] experiment.py -- --smoke-test
    """
    cli_logger.configure(log_style, log_color, verbose)

    cli_logger.doassert(not (screen and tmux),
                        "`{}` and `{}` are incompatible.",
                        cf.bold("--screen"), cf.bold("--tmux"))
    cli_logger.doassert(
        not (script_args and args),
        "`{0}` and `{1}` are incompatible. Use only `{1}`.\n"
        "Example: `{2}`", cf.bold("--args"), cf.bold("-- <args ...>"),
        cf.bold("ray submit script.py -- --arg=123 --flag"))

    assert not (screen and tmux), \
        "Can specify only one of `screen` or `tmux`."
    assert not (script_args and args), "Use -- --arg1 --arg2 for script args."

    if args:
        cli_logger.warning(
            "`{}` is deprecated and will be removed in the future.",
            cf.bold("--args"))
        cli_logger.warning(
            "Use `{}` instead. Example: `{}`.", cf.bold("-- <args ...>"),
            cf.bold("ray submit script.py -- --arg=123 --flag"))
        cli_logger.newline()

    if start:
        create_or_update_cluster(
            config_file=cluster_config_file,
            override_min_workers=None,
            override_max_workers=None,
            no_restart=False,
            restart_only=False,
            yes=True,
            override_cluster_name=cluster_name,
            no_config_cache=no_config_cache,
            redirect_command_output=False,
            use_login_shells=True)
    target = os.path.basename(script)
    target = os.path.join("~", target)
    rsync(
        cluster_config_file,
        script,
        target,
        cluster_name,
        no_config_cache=no_config_cache,
        down=False)

    command_parts = ["python", target]
    if script_args:
        command_parts += list(script_args)
    elif args is not None:
        command_parts += [args]

    port_forward = [(port, port) for port in list(port_forward)]
    cmd = " ".join(command_parts)
    exec_cluster(
        cluster_config_file,
        cmd=cmd,
        run_env="docker",
        screen=screen,
        tmux=tmux,
        stop=stop,
        start=False,
        override_cluster_name=cluster_name,
        no_config_cache=no_config_cache,
        port_forward=port_forward)
def teardown_cluster(config_file: str, yes: bool, workers_only: bool,
                     override_cluster_name: Optional[str],
                     keep_min_workers: bool):
    """Destroys all nodes of a Ray cluster described by a config json."""
    config = yaml.safe_load(open(config_file).read())
    if override_cluster_name is not None:
        config["cluster_name"] = override_cluster_name
    config = prepare_config(config)
    validate_config(config)

    cli_logger.confirm(yes, "Destroying cluster.", _abort=True)
    cli_logger.old_confirm("This will destroy your cluster", yes)

    if not workers_only:
        try:
            exec_cluster(
                config_file,
                cmd="ray stop",
                run_env="auto",
                screen=False,
                tmux=False,
                stop=False,
                start=False,
                override_cluster_name=override_cluster_name,
                port_forward=None,
                with_output=False)
        except Exception as e:
            # todo: add better exception info
            cli_logger.verbose_error("{}", str(e))
            cli_logger.warning(
                "Exception occurred when stopping the cluster Ray runtime "
                "(use -v to dump teardown exceptions).")
            cli_logger.warning(
                "Ignoring the exception and "
                "attempting to shut down the cluster nodes anyway.")
            cli_logger.old_exception(
                logger, "Ignoring error attempting a clean shutdown.")

    provider = _get_node_provider(config["provider"], config["cluster_name"])
    try:

        def remaining_nodes():
            workers = provider.non_terminated_nodes(
                {TAG_RAY_NODE_KIND: NODE_KIND_WORKER})

            if keep_min_workers:
                min_workers = config.get("min_workers", 0)

                cli_logger.print(
                    "{} random worker nodes will not be shut down. " +
                    cf.dimmed("(due to {})"), cf.bold(min_workers),
                    cf.bold("--keep-min-workers"))
                cli_logger.old_info(logger,
                                    "teardown_cluster: Keeping {} nodes...",
                                    min_workers)

                workers = random.sample(workers, len(workers) - min_workers)

            # todo: it's weird to kill the head node but not all workers
            if workers_only:
                cli_logger.print(
                    "The head node will not be shut down. " +
                    cf.dimmed("(due to {})"), cf.bold("--workers-only"))

                return workers

            head = provider.non_terminated_nodes(
                {TAG_RAY_NODE_KIND: NODE_KIND_HEAD})

            return head + workers

        def run_docker_stop(node, container_name):
            try:
                updater = NodeUpdaterThread(
                    node_id=node,
                    provider_config=config["provider"],
                    provider=provider,
                    auth_config=config["auth"],
                    cluster_name=config["cluster_name"],
                    file_mounts=config["file_mounts"],
                    initialization_commands=[],
                    setup_commands=[],
                    ray_start_commands=[],
                    runtime_hash="",
                    file_mounts_contents_hash="",
                    is_head_node=False,
                    docker_config=config.get("docker"))
                _exec(
                    updater,
                    f"docker stop {container_name}",
                    False,
                    False,
                    run_env="host")
            except Exception:
                cli_logger.warning(f"Docker stop failed on {node}")
                cli_logger.old_warning(logger,
                                       f"Docker stop failed on {node}")

        # Loop here to check that both the head and worker nodes are
        # actually really gone.
        A = remaining_nodes()

        container_name = config.get("docker", {}).get("container_name")
        if container_name:
            for node in A:
                run_docker_stop(node, container_name)

        with LogTimer("teardown_cluster: done."):
            while A:
                cli_logger.old_info(
                    logger, "teardown_cluster: "
                    "Shutting down {} nodes...", len(A))

                provider.terminate_nodes(A)

                cli_logger.print(
                    "Requested {} nodes to shut down.",
                    cf.bold(len(A)),
                    _tags=dict(interval="1s"))

                time.sleep(POLL_INTERVAL)  # todo: interval should be a variable
                A = remaining_nodes()
                cli_logger.print("{} nodes remaining after {} second(s).",
                                 cf.bold(len(A)), POLL_INTERVAL)
        cli_logger.success("No nodes remaining.")
    finally:
        provider.cleanup()
def submit(
        address: Optional[str],
        job_id: Optional[str],
        runtime_env: Optional[str],
        runtime_env_json: Optional[str],
        working_dir: Optional[str],
        entrypoint: Tuple[str],
        no_wait: bool,
):
    """Submits a job to be run on the cluster.

    Example:
        ray job submit -- python my_script.py --arg=val
    """
    if ray_constants.RAY_JOB_SUBMIT_HOOK in os.environ:
        # Submit all args as **kwargs per the JOB_SUBMIT_HOOK contract.
        _load_class(os.environ[ray_constants.RAY_JOB_SUBMIT_HOOK])(
            address=address,
            job_id=job_id,
            runtime_env=runtime_env,
            runtime_env_json=runtime_env_json,
            working_dir=working_dir,
            entrypoint=entrypoint,
            no_wait=no_wait,
        )

    client = _get_sdk_client(address, create_cluster_if_needed=True)

    final_runtime_env = parse_runtime_env_args(
        runtime_env=runtime_env,
        runtime_env_json=runtime_env_json,
        working_dir=working_dir,
    )

    job_id = client.submit_job(
        entrypoint=list2cmdline(entrypoint),
        job_id=job_id,
        runtime_env=final_runtime_env,
    )

    _log_big_success_msg(f"Job '{job_id}' submitted successfully")

    with cli_logger.group("Next steps"):
        cli_logger.print("Query the logs of the job:")
        with cli_logger.indented():
            cli_logger.print(cf.bold(f"ray job logs {job_id}"))

        cli_logger.print("Query the status of the job:")
        with cli_logger.indented():
            cli_logger.print(cf.bold(f"ray job status {job_id}"))

        cli_logger.print("Request the job to be stopped:")
        with cli_logger.indented():
            cli_logger.print(cf.bold(f"ray job stop {job_id}"))

    cli_logger.newline()
    sdk_version = client.get_version()
    # sdk version 0 does not have log streaming
    if not no_wait:
        if int(sdk_version) > 0:
            cli_logger.print("Tailing logs until the job exits "
                             "(disable with --no-wait):")
            asyncio.get_event_loop().run_until_complete(
                _tail_logs(client, job_id))
        else:
            cli_logger.warning(
                "Tailing logs is not enabled for job sdk client version "
                f"{sdk_version}. Please upgrade your Ray to the latest "
                "version for this feature.")
cli_logger.old_style = False
cli_logger.verbosity = 999
cli_logger.detect_colors()

cli_logger.print(
    cf.bold("Bold ") + cf.italic("Italic ") + cf.underlined("Underlined"))
cli_logger.labeled_value("Label", "value")
cli_logger.print("List: {}", cli_logger.render_list([1, 2, 3]))
cli_logger.newline()
cli_logger.very_verbose("Very verbose")
cli_logger.verbose("Verbose")
cli_logger.verbose_warning("Verbose warning")
cli_logger.verbose_error("Verbose error")
cli_logger.print("Info")
cli_logger.success("Success")
cli_logger.warning("Warning")
cli_logger.error("Error")
cli_logger.newline()
try:
    cli_logger.abort("Abort")
except Exception:
    pass
try:
    cli_logger.doassert(False, "Assert")
except Exception:
    pass
cli_logger.newline()
cli_logger.confirm(True, "example")
cli_logger.newline()
with cli_logger.indented():
    cli_logger.print("Indented")
def stop(force, verbose, log_style, log_color):
    """Stop Ray processes manually on the local machine."""
    cli_logger.configure(log_style, log_color, verbose)

    # Note that raylet needs to exit before object store, otherwise
    # it cannot exit gracefully.
    is_linux = sys.platform.startswith("linux")
    processes_to_kill = [
        # The first element is the substring to filter.
        # The second element, if True, is to filter ps results by command name
        # (only the first 15 characters of the executable name on Linux);
        # if False, is to filter ps results by command with all its arguments.
        # See STANDARD FORMAT SPECIFIERS section of
        # http://man7.org/linux/man-pages/man1/ps.1.html
        # about comm and args. This can help avoid killing non-ray processes.
        # Format:
        # Keyword to filter, filter by command (True)/filter by args (False)
        ["raylet", True],
        ["plasma_store", True],
        ["gcs_server", True],
        ["monitor.py", False],
        ["redis-server", False],
        ["default_worker.py", False],  # Python worker.
        ["ray::", True],  # Python worker. TODO(mehrdadn): Fix for Windows
        ["io.ray.runtime.runner.worker.DefaultWorker", False],  # Java worker.
        ["log_monitor.py", False],
        ["reporter.py", False],
        ["dashboard.py", False],
        ["new_dashboard/agent.py", False],
        ["ray_process_reaper.py", False],
    ]

    process_infos = []
    for proc in psutil.process_iter(["name", "cmdline"]):
        try:
            process_infos.append((proc, proc.name(), proc.cmdline()))
        except psutil.Error:
            pass

    total_found = 0
    total_stopped = 0
    for keyword, filter_by_cmd in processes_to_kill:
        if filter_by_cmd and is_linux and len(keyword) > 15:
            # Getting here is an internal bug, so we do not use cli_logger.
            msg = ("The filter string should not be more than {} "
                   "characters. Actual length: {}. Filter: {}").format(
                       15, len(keyword), keyword)
            raise ValueError(msg)

        found = []
        for candidate in process_infos:
            proc, proc_cmd, proc_args = candidate
            corpus = (proc_cmd if filter_by_cmd else
                      subprocess.list2cmdline(proc_args))
            if keyword in corpus:
                found.append(candidate)

        for proc, proc_cmd, proc_args in found:
            total_found += 1

            proc_string = str(subprocess.list2cmdline(proc_args))
            try:
                if force:
                    proc.kill()
                else:
                    # TODO(mehrdadn): On Windows, this is forceful termination.
                    # We don't want CTRL_BREAK_EVENT, because that would
                    # terminate the entire process group. What to do?
                    proc.terminate()

                if force:
                    cli_logger.verbose("Killed `{}` {}",
                                       cf.bold(proc_string),
                                       cf.dimmed("(via SIGKILL)"))
                else:
                    cli_logger.verbose("Sent termination request to `{}` {}",
                                       cf.bold(proc_string),
                                       cf.dimmed("(via SIGTERM)"))

                total_stopped += 1
            except psutil.NoSuchProcess:
                cli_logger.verbose(
                    "Attempted to stop `{}`, but process was already dead.",
                    cf.bold(proc_string))
            except (psutil.Error, OSError) as ex:
                cli_logger.error("Could not terminate `{}` due to {}",
                                 cf.bold(proc_string), str(ex))

    if total_found == 0:
        cli_logger.print("Did not find any active Ray processes.")
    else:
        if total_stopped == total_found:
            cli_logger.success("Stopped all {} Ray processes.", total_stopped)
        else:
            cli_logger.warning(
                "Stopped only {} out of {} Ray processes. "
                "Set `{}` to see more details.", total_stopped, total_found,
                cf.bold("-v"))
            cli_logger.warning("Try running the command again, or use `{}`.",
                               cf.bold("--force"))
def _bootstrap_config(config: Dict[str, Any],
                      no_config_cache: bool = False) -> Dict[str, Any]:
    config = prepare_config(config)

    hasher = hashlib.sha1()
    hasher.update(json.dumps([config], sort_keys=True).encode("utf-8"))
    cache_key = os.path.join(tempfile.gettempdir(),
                             "ray-config-{}".format(hasher.hexdigest()))

    if os.path.exists(cache_key) and not no_config_cache:
        config_cache = json.loads(open(cache_key).read())
        if config_cache.get("_version", -1) == CONFIG_CACHE_VERSION:
            # todo: is it fine to re-resolve? afaik it should be.
            # we can have migrations otherwise or something
            # but this seems overcomplicated given that resolving is
            # relatively cheap
            try_reload_log_state(config_cache["config"]["provider"],
                                 config_cache.get("provider_log_info"))

            if log_once("_printed_cached_config_warning"):
                cli_logger.verbose_warning(
                    "Loaded cached provider configuration "
                    "from " + cf.bold("{}"), cache_key)
                if cli_logger.verbosity == 0:
                    cli_logger.warning("Loaded cached provider configuration")
                cli_logger.warning(
                    "If you experience issues with "
                    "the cloud provider, try re-running "
                    "the command with {}.", cf.bold("--no-config-cache"))

            return config_cache["config"]
        else:
            cli_logger.warning(
                "Found cached cluster config "
                "but the version " + cf.bold("{}") + " "
                "(expected " + cf.bold("{}") + ") does not match.\n"
                "This is normal if cluster launcher was updated.\n"
                "Config will be re-resolved.",
                config_cache.get("_version", "none"), CONFIG_CACHE_VERSION)

    importer = _NODE_PROVIDERS.get(config["provider"]["type"])
    if not importer:
        raise NotImplementedError("Unsupported provider {}".format(
            config["provider"]))

    provider_cls = importer(config["provider"])

    cli_logger.print("Checking {} environment settings",
                     _PROVIDER_PRETTY_NAMES.get(config["provider"]["type"]))
    try:
        config = provider_cls.fillout_available_node_types_resources(config)
    except Exception as exc:
        if cli_logger.verbosity > 2:
            logger.exception("Failed to autodetect node resources.")
        else:
            cli_logger.warning(
                f"Failed to autodetect node resources: {str(exc)}. "
                "You can see the full stack trace with higher verbosity.")

    # NOTE: if the `resources` field is missing, validate_config for
    # providers other than AWS and Kubernetes will fail (the schema error
    # will ask the user to manually fill the resources) as we currently
    # support autofilling resources for AWS and Kubernetes only.
    validate_config(config)
    resolved_config = provider_cls.bootstrap_config(config)

    if not no_config_cache:
        with open(cache_key, "w") as f:
            config_cache = {
                "_version": CONFIG_CACHE_VERSION,
                "provider_log_info": try_get_log_state(config["provider"]),
                "config": resolved_config
            }
            f.write(json.dumps(config_cache))
    return resolved_config
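

# Standalone restatement of the cache-key derivation above: the key is a
# SHA-1 of the prepared config, so any change to the config yields a new
# cache file under the system temp directory.
import hashlib
import json
import os
import tempfile

def config_cache_path(config: dict) -> str:
    hasher = hashlib.sha1()
    hasher.update(json.dumps([config], sort_keys=True).encode("utf-8"))
    return os.path.join(tempfile.gettempdir(),
                        "ray-config-{}".format(hasher.hexdigest()))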
def _create_node(self, node_config, tags, count):
    created_nodes_dict = {}

    tags = to_aws_format(tags)
    conf = node_config.copy()

    tag_pairs = [{
        "Key": TAG_RAY_CLUSTER_NAME,
        "Value": self.cluster_name,
    }]
    for k, v in tags.items():
        tag_pairs.append({
            "Key": k,
            "Value": v,
        })
    tag_specs = [{
        "ResourceType": "instance",
        "Tags": tag_pairs,
    }]
    user_tag_specs = conf.get("TagSpecifications", [])
    # Allow users to add tags and override values of existing
    # tags with their own. This only applies to the resource type
    # "instance". All other resource types are appended to the list of
    # tag specs.
    for user_tag_spec in user_tag_specs:
        if user_tag_spec["ResourceType"] == "instance":
            for user_tag in user_tag_spec["Tags"]:
                exists = False
                for tag in tag_specs[0]["Tags"]:
                    if user_tag["Key"] == tag["Key"]:
                        exists = True
                        tag["Value"] = user_tag["Value"]
                        break
                if not exists:
                    tag_specs[0]["Tags"] += [user_tag]
        else:
            tag_specs += [user_tag_spec]

    # SubnetIds is not a real config key: we must resolve to a
    # single SubnetId before invoking the AWS API.
    subnet_ids = conf.pop("SubnetIds")

    for attempt in range(1, BOTO_CREATE_MAX_RETRIES + 1):
        try:
            subnet_id = subnet_ids[self.subnet_idx % len(subnet_ids)]
            self.subnet_idx += 1
            conf.update({
                "MinCount": 1,
                "MaxCount": count,
                "SubnetId": subnet_id,
                "TagSpecifications": tag_specs
            })
            created = self.ec2_fail_fast.create_instances(**conf)
            created_nodes_dict = {n.id: n for n in created}

            # todo: timed?
            # todo: handle plurality?
            with cli_logger.group(
                    "Launched {} nodes",
                    count,
                    _tags=dict(subnet_id=subnet_id)):
                for instance in created:
                    # NOTE(maximsmol): This is needed for mocking
                    # boto3 for tests. This is likely a bug in moto
                    # but AWS docs don't seem to say.
                    # You can patch moto/ec2/responses/instances.py
                    # to fix this (add <stateReason> to EC2_RUN_INSTANCES)
                    # The correct value is technically
                    # {"code": "0", "Message": "pending"}
                    state_reason = instance.state_reason or {
                        "Message": "pending"
                    }

                    cli_logger.print(
                        "Launched instance {}",
                        instance.instance_id,
                        _tags=dict(
                            state=instance.state["Name"],
                            info=state_reason["Message"]))
            break
        except botocore.exceptions.ClientError as exc:
            if attempt == BOTO_CREATE_MAX_RETRIES:
                cli_logger.abort(
                    "Failed to launch instances. Max attempts exceeded.",
                    exc=exc,
                )
            else:
                cli_logger.warning(
                    "create_instances: Attempt failed with {}, retrying.",
                    exc)
    return created_nodes_dict
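

# Hedged sketch of the tag-merge semantics inlined above. The name
# merge_tag_specs is illustrative (the next revision calls a helper named
# AWSNodeProvider._merge_tag_specs, whose body is not shown here); this
# function simply mirrors the loop above.
def merge_tag_specs(tag_specs, user_tag_specs):
    for user_tag_spec in user_tag_specs:
        if user_tag_spec["ResourceType"] == "instance":
            # User tags override autoscaler tags with the same Key.
            for user_tag in user_tag_spec["Tags"]:
                for tag in tag_specs[0]["Tags"]:
                    if user_tag["Key"] == tag["Key"]:
                        tag["Value"] = user_tag["Value"]
                        break
                else:
                    tag_specs[0]["Tags"].append(user_tag)
        else:
            # Specs for other resource types are appended untouched.
            tag_specs.append(user_tag_spec)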
def _create_node(self, node_config, tags, count):
    created_nodes_dict = {}

    tags = to_aws_format(tags)
    conf = node_config.copy()

    tag_pairs = [{
        "Key": TAG_RAY_CLUSTER_NAME,
        "Value": self.cluster_name,
    }]
    for k, v in tags.items():
        tag_pairs.append({
            "Key": k,
            "Value": v,
        })
    tag_specs = [{
        "ResourceType": "instance",
        "Tags": tag_pairs,
    }]
    user_tag_specs = conf.get("TagSpecifications", [])
    AWSNodeProvider._merge_tag_specs(tag_specs, user_tag_specs)

    # SubnetIds is not a real config key: we must resolve to a
    # single SubnetId before invoking the AWS API.
    subnet_ids = conf.pop("SubnetIds")

    # Update the config with min/max node counts and tag specs.
    conf.update({
        "MinCount": 1,
        "MaxCount": count,
        "TagSpecifications": tag_specs
    })

    cli_logger_tags = {}
    for attempt in range(1, BOTO_CREATE_MAX_RETRIES + 1):
        try:
            if "NetworkInterfaces" in conf:
                net_ifs = conf["NetworkInterfaces"]
                # Remove security group IDs previously copied from network
                # interfaces (the create_instances call fails otherwise).
                conf.pop("SecurityGroupIds", None)
                cli_logger_tags["network_interfaces"] = str(net_ifs)
            else:
                subnet_id = subnet_ids[self.subnet_idx % len(subnet_ids)]
                self.subnet_idx += 1
                conf["SubnetId"] = subnet_id
                cli_logger_tags["subnet_id"] = subnet_id

            created = self.ec2_fail_fast.create_instances(**conf)
            created_nodes_dict = {n.id: n for n in created}

            # todo: timed?
            # todo: handle plurality?
            with cli_logger.group(
                    "Launched {} nodes", count, _tags=cli_logger_tags):
                for instance in created:
                    # NOTE(maximsmol): This is needed for mocking
                    # boto3 for tests. This is likely a bug in moto
                    # but AWS docs don't seem to say.
                    # You can patch moto/ec2/responses/instances.py
                    # to fix this (add <stateReason> to EC2_RUN_INSTANCES)
                    # The correct value is technically
                    # {"code": "0", "Message": "pending"}
                    state_reason = instance.state_reason or {
                        "Message": "pending"
                    }

                    cli_logger.print(
                        "Launched instance {}",
                        instance.instance_id,
                        _tags=dict(
                            state=instance.state["Name"],
                            info=state_reason["Message"]))
            break
        except botocore.exceptions.ClientError as exc:
            if attempt == BOTO_CREATE_MAX_RETRIES:
                cli_logger.abort(
                    "Failed to launch instances. Max attempts exceeded.",
                    exc=exc,
                )
            else:
                cli_logger.warning(
                    "create_instances: Attempt failed with {}, retrying.",
                    exc)
    return created_nodes_dict
def run_init(self, *, as_head: bool, file_mounts: Dict[str, str],
             sync_run_yet: bool):
    BOOTSTRAP_MOUNTS = [
        "~/ray_bootstrap_config.yaml", "~/ray_bootstrap_key.pem"
    ]

    specific_image = self.docker_config.get(
        f"{'head' if as_head else 'worker'}_image",
        self.docker_config.get("image"))

    self._check_docker_installed()
    if self.docker_config.get("pull_before_run", True):
        assert specific_image, ("Image must be included in config if "
                                "pull_before_run is specified")
        self.run("{} pull {}".format(self.docker_cmd, specific_image),
                 run_env="host")
    else:
        self.run(f"{self.docker_cmd} image inspect {specific_image} "
                 "1> /dev/null 2>&1 || "
                 f"{self.docker_cmd} pull {specific_image}")

    # Bootstrap files cannot be bind mounted because docker opens the
    # underlying inode. When the file is switched, docker becomes outdated.
    cleaned_bind_mounts = file_mounts.copy()
    for mnt in BOOTSTRAP_MOUNTS:
        cleaned_bind_mounts.pop(mnt, None)

    docker_run_executed = False

    container_running = self._check_container_status()
    requires_re_init = False
    if container_running:
        requires_re_init = self._check_if_container_restart_is_needed(
            specific_image, cleaned_bind_mounts)
        if requires_re_init:
            self.run(f"{self.docker_cmd} stop {self.container_name}",
                     run_env="host")

    if (not container_running) or requires_re_init:
        if not sync_run_yet:
            # Do not start the actual image as we need to run file_sync
            # first to ensure that all folders are created with the
            # correct ownership. Docker will create the folders with
            # `root` as the owner.
            return True
        # Get home directory
        image_env = self.ssh_command_runner.run(
            f"{self.docker_cmd} " + "inspect -f '{{json .Config.Env}}' " +
            specific_image,
            with_output=True).decode().strip()
        home_directory = "/root"
        for env_var in json.loads(image_env):
            if env_var.startswith("HOME="):
                home_directory = env_var.split("HOME=")[1]
                break

        user_docker_run_options = self.docker_config.get(
            "run_options", []) + self.docker_config.get(
                f"{'head' if as_head else 'worker'}_run_options", [])
        start_command = docker_start_cmds(
            self.ssh_command_runner.ssh_user, specific_image,
            cleaned_bind_mounts, self.container_name,
            self._configure_runtime(
                self._auto_configure_shm(user_docker_run_options)),
            self.ssh_command_runner.cluster_name, home_directory,
            self.docker_cmd)
        self.run(start_command, run_env="host")
        docker_run_executed = True

    # Explicitly copy in ray bootstrap files.
    for mount in BOOTSTRAP_MOUNTS:
        if mount in file_mounts:
            if not sync_run_yet:
                # NOTE(ilr) This rsync is needed because when starting from
                # a stopped instance, /tmp may be deleted and `run_init`
                # is called before the first `file_sync` happens
                self.run_rsync_up(file_mounts[mount], mount)
            self.ssh_command_runner.run(
                "{cmd} cp {src} {container}:{dst}".format(
                    cmd=self.docker_cmd,
                    src=os.path.join(
                        self._get_docker_host_mount_location(
                            self.ssh_command_runner.cluster_name), mount),
                    container=self.container_name,
                    dst=self._docker_expand_user(mount)))
            try:
                # Check if the current user has read permission.
                # If they do not, try to change ownership!
                self.run(f"cat {mount} >/dev/null 2>&1 || "
                         f"sudo chown $(id -u):$(id -g) {mount}")
            except Exception:
                lsl_string = self.run(
                    f"ls -l {mount}",
                    with_output=True).decode("utf-8").strip()
                # The string is of format <Permission> <Links>
                # <Owner> <Group> <Size> <Date> <Name>
                permissions = lsl_string.split(" ")[0]
                owner = lsl_string.split(" ")[2]
                group = lsl_string.split(" ")[3]
                current_user = self.run(
                    "whoami", with_output=True).decode("utf-8").strip()
                cli_logger.warning(
                    f"File ({mount}) is owned by user:{owner} and group:"
                    f"{group} with permissions ({permissions}). The "
                    f"current user ({current_user}) does not have "
                    "permission to read these files, and Ray may not be "
                    "able to autoscale. This can be resolved by "
                    "installing `sudo` in your container, or adding a "
                    f"command like 'chown {current_user} {mount}' to "
                    "your `setup_commands`.")
    self.initialized = True
    return docker_run_executed
def job_submit(
        address: Optional[str],
        job_id: Optional[str],
        runtime_env: Optional[str],
        runtime_env_json: Optional[str],
        working_dir: Optional[str],
        entrypoint: Tuple[str],
        no_wait: bool,
):
    """Submits a job to be run on the cluster.

    Example:
        >>> ray job submit -- python my_script.py --arg=val
    """
    client = _get_sdk_client(address, create_cluster_if_needed=True)

    final_runtime_env = {}
    if runtime_env is not None:
        if runtime_env_json is not None:
            raise ValueError("Only one of --runtime-env and "
                             "--runtime-env-json can be provided.")
        with open(runtime_env, "r") as f:
            final_runtime_env = yaml.safe_load(f)
    elif runtime_env_json is not None:
        final_runtime_env = json.loads(runtime_env_json)

    if working_dir is not None:
        if "working_dir" in final_runtime_env:
            cli_logger.warning(
                "Overriding runtime_env working_dir with --working-dir "
                "option")
        final_runtime_env["working_dir"] = working_dir

    job_id = client.submit_job(
        entrypoint=list2cmdline(entrypoint),
        job_id=job_id,
        runtime_env=final_runtime_env,
    )

    _log_big_success_msg(f"Job '{job_id}' submitted successfully")

    with cli_logger.group("Next steps"):
        cli_logger.print("Query the logs of the job:")
        with cli_logger.indented():
            cli_logger.print(cf.bold(f"ray job logs {job_id}"))

        cli_logger.print("Query the status of the job:")
        with cli_logger.indented():
            cli_logger.print(cf.bold(f"ray job status {job_id}"))

        cli_logger.print("Request the job to be stopped:")
        with cli_logger.indented():
            cli_logger.print(cf.bold(f"ray job stop {job_id}"))

    cli_logger.newline()
    sdk_version = client.get_version()
    # sdk version 0 does not have log streaming
    if not no_wait:
        if int(sdk_version) > 0:
            cli_logger.print("Tailing logs until the job exits "
                             "(disable with --no-wait):")
            asyncio.get_event_loop().run_until_complete(
                _tail_logs(client, job_id))
        else:
            cli_logger.warning(
                "Tailing logs is not enabled for job sdk client version "
                f"{sdk_version}. Please upgrade your Ray to the latest "
                "version for this feature.")
def get_cluster_dump_archive(cluster_config_file: Optional[str] = None,
                             host: Optional[str] = None,
                             ssh_user: Optional[str] = None,
                             ssh_key: Optional[str] = None,
                             docker: Optional[str] = None,
                             local: Optional[bool] = None,
                             output: Optional[str] = None,
                             logs: bool = True,
                             debug_state: bool = True,
                             pip: bool = True,
                             processes: bool = True,
                             processes_verbose: bool = False
                             ) -> Optional[str]:
    # Inform the user what kind of logs are collected (before actually
    # collecting, so they can abort)
    content_str = ""
    if logs:
        content_str += \
            " - The logfiles of your Ray session\n" \
            "   This usually includes Python outputs (stdout/stderr)\n"
    if debug_state:
        content_str += \
            " - Debug state information on your Ray cluster\n" \
            "   e.g. number of workers, drivers, objects, etc.\n"
    if pip:
        content_str += " - Your installed Python packages (`pip freeze`)\n"
    if processes:
        content_str += \
            " - Information on your running Ray processes\n" \
            "   This includes command line arguments\n"

    cli_logger.warning(
        "You are about to create a cluster dump. This will collect data "
        "from cluster nodes.\n\n"
        "The dump will contain this information:\n\n"
        f"{content_str}\n"
        "If you are concerned about leaking private information, extract "
        "the archive and inspect its contents before sharing it with "
        "anyone.")

    # Parse arguments (e.g. fetch info from cluster config)
    cluster_config_file, hosts, ssh_user, ssh_key, docker, cluster_name = \
        _info_from_params(cluster_config_file, host, ssh_user, ssh_key,
                          docker)

    nodes = [
        Node(
            host=h,
            ssh_user=ssh_user,
            ssh_key=ssh_key,
            docker_container=docker) for h in hosts
    ]

    if not nodes:
        cli_logger.error(
            "No nodes found. Specify with `--host` or by passing a ray "
            "cluster config to `--cluster`.")
        return None

    if cluster_config_file:
        nodes[0].is_head = True

    if local is None:
        # If called with a cluster config, this was probably started
        # from a laptop
        local = not bool(cluster_config_file)

    parameters = GetParameters(
        logs=logs,
        debug_state=debug_state,
        pip=pip,
        processes=processes,
        processes_verbose=processes_verbose)

    with Archive() as archive:
        if local:
            create_archive_for_local_and_remote_nodes(
                archive, remote_nodes=nodes, parameters=parameters)
        else:
            create_archive_for_remote_nodes(
                archive, remote_nodes=nodes, parameters=parameters)

    if not output:
        if cluster_name:
            filename = f"{cluster_name}_" \
                       f"{datetime.datetime.now():%Y-%m-%d_%H-%M-%S}.tar.gz"
        else:
            filename = "collected_logs_" \
                       f"{datetime.datetime.now():%Y-%m-%d_%H-%M-%S}.tar.gz"
        output = os.path.join(os.getcwd(), filename)
    else:
        output = os.path.expanduser(output)

    os.rename(archive.file, output)
    return output