def do_update(self):
    """Bring this node fully up to date and start the Ray runtime.

    Flow: mark the node WAITING_FOR_SSH and wait for it to become
    reachable, then (unless the runtime config hash is unchanged) sync
    file mounts and run initialization/setup commands, and finally run
    the ray start commands.  Node status tags are updated at each phase
    so the autoscaler/monitor can observe progress.

    Raises:
        click.ClickException: if an initialization, setup, or start
            command fails on the remote node.
    """
    self.provider.set_node_tags(
        self.node_id, {TAG_RAY_NODE_STATUS: STATUS_WAITING_FOR_SSH})
    cli_logger.labeled_value("New status", STATUS_WAITING_FOR_SSH)

    # Block until the node accepts commands (SSH up) or the deadline hits.
    deadline = time.time() + AUTOSCALER_NODE_START_WAIT_S
    self.wait_ready(deadline)
    global_event_system.execute_callback(
        CreateClusterEvent.ssh_control_acquired)

    node_tags = self.provider.node_tags(self.node_id)
    logger.debug("Node tags: {}".format(str(node_tags)))

    if node_tags.get(TAG_RAY_RUNTIME_CONFIG) == self.runtime_hash:
        # When resuming from a stopped instance the runtime_hash may be the
        # same, but the container will not be started.
        # run_init(sync_run_yet=False) reports whether a full (re)init is
        # still required; if so, invalidate the stored hash locally so the
        # setup path below is taken.
        init_required = self.cmd_runner.run_init(
            as_head=self.is_head_node,
            file_mounts=self.file_mounts,
            sync_run_yet=False)
        if init_required:
            node_tags[TAG_RAY_RUNTIME_CONFIG] += "-invalidate"
            # This ensures that `setup_commands` are not removed
            self.restart_only = False

    if self.restart_only:
        # Restart-only mode skips setup commands entirely; only the ray
        # start commands at the bottom of this method will run.
        self.setup_commands = []

    # runtime_hash will only change whenever the user restarts
    # or updates their cluster with `get_or_create_head_node`
    if node_tags.get(TAG_RAY_RUNTIME_CONFIG) == self.runtime_hash and (
            not self.file_mounts_contents_hash
            or node_tags.get(TAG_RAY_FILE_MOUNTS_CONTENTS) ==
            self.file_mounts_contents_hash):
        # todo: we lie in the confirmation message since
        # full setup might be cancelled here
        cli_logger.print(
            "Configuration already up to date, "
            "skipping file mounts, initalization and setup commands.",
            _numbered=("[]", "2-6", NUM_SETUP_STEPS))
    else:
        cli_logger.print(
            "Updating cluster configuration.",
            _tags=dict(hash=self.runtime_hash))

        self.provider.set_node_tags(
            self.node_id, {TAG_RAY_NODE_STATUS: STATUS_SYNCING_FILES})
        cli_logger.labeled_value("New status", STATUS_SYNCING_FILES)
        self.sync_file_mounts(
            self.rsync_up, step_numbers=(1, NUM_SETUP_STEPS))

        # Only run setup commands if runtime_hash has changed because
        # we don't want to run setup_commands every time the head node
        # file_mounts folders have changed.
        if node_tags.get(TAG_RAY_RUNTIME_CONFIG) != self.runtime_hash:
            # Run init commands
            self.provider.set_node_tags(
                self.node_id, {TAG_RAY_NODE_STATUS: STATUS_SETTING_UP})
            cli_logger.labeled_value("New status", STATUS_SETTING_UP)

            if self.initialization_commands:
                with cli_logger.group(
                        "Running initialization commands",
                        _numbered=("[]", 4, NUM_SETUP_STEPS)):
                    global_event_system.execute_callback(
                        CreateClusterEvent.run_initialization_cmd)
                    with LogTimer(
                            self.log_prefix + "Initialization commands",
                            show_status=True):
                        for cmd in self.initialization_commands:
                            global_event_system.execute_callback(
                                CreateClusterEvent.run_initialization_cmd,
                                {"command": cmd})
                            try:
                                # Overriding the existing SSHOptions class
                                # with a new SSHOptions class that uses
                                # this ssh_private_key as its only __init__
                                # argument.
                                # Run outside docker.
                                self.cmd_runner.run(
                                    cmd,
                                    ssh_options_override_ssh_key=self.
                                    auth_config.get("ssh_private_key"),
                                    run_env="host")
                            except ProcessRunnerError as e:
                                if e.msg_type == "ssh_command_failed":
                                    cli_logger.error("Failed.")
                                    cli_logger.error("See above for stderr.")

                                # `from None` suppresses the (already
                                # reported) ProcessRunnerError chain.
                                raise click.ClickException(
                                    "Initialization command failed.") from None
            else:
                cli_logger.print(
                    "No initialization commands to run.",
                    _numbered=("[]", 4, NUM_SETUP_STEPS))

            with cli_logger.group(
                    "Initalizing command runner",
                    # todo: fix command numbering
                    _numbered=("[]", 5, NUM_SETUP_STEPS)):
                # Now that file mounts are synced, the real init
                # (e.g. starting the docker container) can run.
                self.cmd_runner.run_init(
                    as_head=self.is_head_node,
                    file_mounts=self.file_mounts,
                    sync_run_yet=True)

            if self.setup_commands:
                with cli_logger.group(
                        "Running setup commands",
                        # todo: fix command numbering
                        _numbered=("[]", 6, NUM_SETUP_STEPS)):
                    global_event_system.execute_callback(
                        CreateClusterEvent.run_setup_cmd)
                    with LogTimer(
                            self.log_prefix + "Setup commands",
                            show_status=True):
                        total = len(self.setup_commands)
                        for i, cmd in enumerate(self.setup_commands):
                            global_event_system.execute_callback(
                                CreateClusterEvent.run_setup_cmd,
                                {"command": cmd})

                            # At verbosity 0 long commands are truncated
                            # to keep the numbered progress output tidy.
                            if cli_logger.verbosity == 0 and len(cmd) > 30:
                                cmd_to_print = cf.bold(cmd[:30]) + "..."
                            else:
                                cmd_to_print = cf.bold(cmd)

                            cli_logger.print(
                                "{}",
                                cmd_to_print,
                                _numbered=("()", i, total))

                            try:
                                # Runs in the container if docker is in use
                                self.cmd_runner.run(cmd, run_env="auto")
                            except ProcessRunnerError as e:
                                if e.msg_type == "ssh_command_failed":
                                    cli_logger.error("Failed.")
                                    cli_logger.error("See above for stderr.")

                                raise click.ClickException(
                                    "Setup command failed.")
            else:
                cli_logger.print(
                    "No setup commands to run.",
                    _numbered=("[]", 6, NUM_SETUP_STEPS))

    # Ray start commands always run, even when setup was skipped above.
    with cli_logger.group(
            "Starting the Ray runtime",
            _numbered=("[]", 7, NUM_SETUP_STEPS)):
        global_event_system.execute_callback(
            CreateClusterEvent.start_ray_runtime)
        with LogTimer(
                self.log_prefix + "Ray start commands", show_status=True):
            for cmd in self.ray_start_commands:
                # Pass the node's resource overrides to the start command
                # via an environment variable, if any were configured.
                if self.node_resources:
                    env_vars = {
                        RESOURCES_ENVIRONMENT_VARIABLE: self.node_resources
                    }
                else:
                    env_vars = {}
                try:
                    # Temporarily disable output redirection so the user
                    # sees ray start output, then restore the old setting.
                    old_redirected = cmd_output_util.is_output_redirected()
                    cmd_output_util.set_output_redirected(False)
                    # Runs in the container if docker is in use
                    self.cmd_runner.run(
                        cmd,
                        environment_variables=env_vars,
                        run_env="auto")
                    cmd_output_util.set_output_redirected(old_redirected)
                except ProcessRunnerError as e:
                    if e.msg_type == "ssh_command_failed":
                        cli_logger.error("Failed.")
                        cli_logger.error("See above for stderr.")

                    raise click.ClickException("Start command failed.")
    global_event_system.execute_callback(
        CreateClusterEvent.start_ray_runtime_completed)
def stop(force, verbose, log_style, log_color):
    """Stop Ray processes manually on the local machine.

    Scans the process table for known Ray process names/arguments and
    sends each match SIGTERM (default) or SIGKILL (``force``).

    Args:
        force: If truthy, kill processes (SIGKILL) instead of asking them
            to terminate (SIGTERM).
        verbose: Verbosity level for logging.
        log_style: cli_logger style ("auto"/"record"/...).
        log_color: cli_logger color mode.
    """
    cli_logger.configure(log_style, log_color, verbose)

    # Note that raylet needs to exit before object store, otherwise
    # it cannot exit gracefully.
    is_linux = sys.platform.startswith("linux")
    processes_to_kill = [
        # The first element is the substring to filter.
        # The second element, if True, is to filter ps results by command name
        # (only the first 15 characters of the executable name on Linux);
        # if False, is to filter ps results by command with all its arguments.
        # See STANDARD FORMAT SPECIFIERS section of
        # http://man7.org/linux/man-pages/man1/ps.1.html
        # about comm and args. This can help avoid killing non-ray processes.
        # Format:
        # Keyword to filter, filter by command (True)/filter by args (False)
        ["raylet", True],
        ["plasma_store", True],
        ["gcs_server", True],
        ["monitor.py", False],
        ["redis-server", False],
        ["default_worker.py", False],  # Python worker.
        ["ray::", True],  # Python worker. TODO(mehrdadn): Fix for Windows
        ["io.ray.runtime.runner.worker.DefaultWorker", False],  # Java worker.
        ["log_monitor.py", False],
        ["reporter.py", False],
        ["dashboard.py", False],
        ["ray_process_reaper.py", False],
    ]

    # Snapshot the process table once; individual processes may disappear
    # while we iterate, hence the psutil.Error guard.
    process_infos = []
    for proc in psutil.process_iter(["name", "cmdline"]):
        try:
            process_infos.append((proc, proc.name(), proc.cmdline()))
        except psutil.Error:
            pass

    total_found = 0
    total_stopped = 0
    for keyword, filter_by_cmd in processes_to_kill:
        if filter_by_cmd and is_linux and len(keyword) > 15:
            # getting here is an internal bug, so we do not use cli_logger
            msg = ("The filter string should not be more than {} "
                   "characters. Actual length: {}. Filter: {}").format(
                       15, len(keyword), keyword)
            raise ValueError(msg)

        found = []
        for candidate in process_infos:
            proc, proc_cmd, proc_args = candidate
            corpus = (proc_cmd
                      if filter_by_cmd else subprocess.list2cmdline(proc_args))
            if keyword in corpus:
                found.append(candidate)

        for proc, proc_cmd, proc_args in found:
            total_found += 1

            proc_string = str(subprocess.list2cmdline(proc_args))
            if verbose:
                # BUGFIX: labels were previously inverted ("Terminating"
                # for force/SIGKILL and "Killing" for SIGTERM), which
                # contradicted the verbose messages emitted below.
                operation = "Killing" if force else "Terminating"
                cli_logger.old_info(logger, "%s process %s: %s", operation,
                                    proc.pid, proc_string)
            try:
                if force:
                    proc.kill()
                else:
                    # TODO(mehrdadn): On Windows, this is forceful termination.
                    # We don't want CTRL_BREAK_EVENT, because that would
                    # terminate the entire process group. What to do?
                    proc.terminate()

                if force:
                    cli_logger.verbose("Killed `{}` {} ",
                                       cf.bold(proc_string),
                                       cf.dimmed("(via SIGKILL)"))
                else:
                    cli_logger.verbose("Send termination request to `{}` {}",
                                       cf.bold(proc_string),
                                       cf.dimmed("(via SIGTERM)"))

                total_stopped += 1
            except psutil.NoSuchProcess:
                # The process exited on its own between the scan and the
                # signal; nothing to do.
                cli_logger.verbose(
                    "Attempted to stop `{}`, but process was already dead.",
                    cf.bold(proc_string))
                pass
            except (psutil.Error, OSError) as ex:
                cli_logger.error("Could not terminate `{}` due to {}",
                                 cf.bold(proc_string), str(ex))
                cli_logger.old_error(logger, "Error: %s", ex)

    if total_found == 0:
        cli_logger.print("Did not find any active Ray processes.")
    else:
        if total_stopped == total_found:
            cli_logger.success("Stopped all {} Ray processes.", total_stopped)
        else:
            cli_logger.warning(
                "Stopped only {} out of {} Ray processes. "
                "Set `{}` to see more details.", total_stopped, total_found,
                cf.bold("-v"))
            cli_logger.warning("Try running the command again, or use `{}`.",
                               cf.bold("--force"))
def _read_subprocess_stream(f, output_file, is_stdout=False):
    """Read and process a subprocess output stream.

    The goal is to find error messages and respond to them in a clever way.
    Currently just used for SSH messages (CONN_REFUSED, TIMEOUT, etc.), so
    the user does not get confused by these.

    Ran in a thread each for both `stdout` and `stderr` to
    allow for cross-platform asynchronous IO.

    Note: `select`-based IO is another option, but
    Windows has no support for `select`ing pipes, and Linux support
    varies somewhat. Specifically, older *nix systems might also have
    quirks in how they handle `select` on pipes.

    Args:
        f: File object for the stream.
        output_file: File object to which filtered output is written.
        is_stdout (bool): When `is_stdout` is `False`, the stream is assumed
            to be `stderr`. Different error message detectors are used, and
            the output is displayed to the user unless it matches a special
            case (e.g. SSH timeout), in which case this is left up to the
            caller.

    Returns:
        The name of the detected special case ("ssh_timeout" or
        "ssh_conn_refused"), or None if none was seen.

    Raises:
        ValueError: if two conflicting special cases are detected on the
            same stream (an internal bug).
    """
    detected_special_case = None
    while True:
        # ! Readline here is crucial.
        # ! Normal `read()` will block until EOF instead of until
        # something is available.
        line = f.readline()
        if line is None or line == "":
            # EOF
            break

        if line[-1] == "\n":
            line = line[:-1]

        if not is_stdout:
            if _ssh_output_regexes["connection_closed"]\
                    .fullmatch(line) is not None:
                # Do not log "connection closed" messages which SSH
                # puts in stderr for no reason.
                #
                # They are never errors since the connection will
                # close no matter whether the command succeeds or not.
                continue

            if _ssh_output_regexes["timeout"].fullmatch(line) is not None:
                # Timeout is not really an error but rather a special
                # condition. It should be handled by the caller, since
                # network conditions/nodes in the early stages of boot
                # are expected to sometimes cause connection timeouts.
                if detected_special_case is not None:
                    # BUGFIX: message previously read "codition".
                    raise ValueError("Bug: ssh_timeout conflicts with another "
                                     "special condition: " +
                                     detected_special_case)

                detected_special_case = "ssh_timeout"
                continue

            if _ssh_output_regexes["conn_refused"]\
                    .fullmatch(line) is not None:
                # Connection refused is not really an error but
                # rather a special condition. It should be handled by
                # the caller, since network conditions/nodes in the
                # early stages of boot are expected to sometimes cause
                # CONN_REFUSED.
                if detected_special_case is not None:
                    # BUGFIX: message previously read "codition".
                    raise ValueError(
                        "Bug: ssh_conn_refused conflicts with another "
                        "special condition: " + detected_special_case)

                detected_special_case = "ssh_conn_refused"
                continue

            if _ssh_output_regexes["known_host_update"]\
                    .fullmatch(line) is not None:
                # Since we ignore SSH host control anyway
                # (-o UserKnownHostsFile=/dev/null),
                # we should silence the host control warnings.
                continue

            cli_logger.error(line)

        if output_file is not None and output_file != subprocess.DEVNULL:
            output_file.write(line + "\n")

    return detected_special_case
def run_init(self, *, as_head, file_mounts):
    """Pull the docker image and (re)start the Ray container on the node.

    Starts the container if it is not already running, warns when the
    running container's image or bind mounts differ from the YAML config,
    and copies the Ray bootstrap files into the container explicitly
    (they cannot be bind mounted — see comment below).

    Args:
        as_head: Whether this node is the head node (selects the
            head/worker-specific image and run options).
        file_mounts: Mapping of remote path -> local path to bind mount.
    """
    BOOTSTRAP_MOUNTS = [
        "~/ray_bootstrap_config.yaml", "~/ray_bootstrap_key.pem"
    ]

    # Head/worker-specific image overrides the generic "image" key.
    image = self.docker_config.get("image")
    image = self.docker_config.get(
        f"{'head' if as_head else 'worker'}_image", image)

    self._check_docker_installed()
    if self.docker_config.get("pull_before_run", True):
        assert image, "Image must be included in config if " + \
            "pull_before_run is specified"
        self.run("docker pull {}".format(image), run_env="host")

    # Bootstrap files cannot be bind mounted because docker opens the
    # underlying inode. When the file is switched, docker becomes outdated.
    cleaned_bind_mounts = file_mounts.copy()
    for mnt in BOOTSTRAP_MOUNTS:
        cleaned_bind_mounts.pop(mnt, None)

    start_command = docker_start_cmds(
        self.ssh_command_runner.ssh_user, image, cleaned_bind_mounts,
        self.container_name,
        self.docker_config.get("run_options", []) + self.docker_config.get(
            f"{'head' if as_head else 'worker'}_run_options", []))
    if not self._check_container_status():
        self.run(start_command, run_env="host")
    else:
        # Container already running: sanity-check that its image and
        # mounts match what the YAML requested; warn (don't restart).
        running_image = self.run(
            check_docker_image(self.container_name),
            with_output=True,
            run_env="host").decode("utf-8").strip()
        if running_image != image:
            # BUGFIX: message previously left the parenthesis unclosed.
            logger.error(f"A container with name {self.container_name} " +
                         f"is running image {running_image} instead " +
                         f"of {image} (which was provided in the YAML).")
        mounts = self.run(
            check_bind_mounts_cmd(self.container_name),
            with_output=True,
            run_env="host").decode("utf-8").strip()
        try:
            active_mounts = json.loads(mounts)
            active_remote_mounts = [
                mnt["Destination"] for mnt in active_mounts
            ]
            # Ignore ray bootstrap files.
            for remote, local in cleaned_bind_mounts.items():
                remote = self._docker_expand_user(remote)
                if remote not in active_remote_mounts:
                    cli_logger.error(
                        "Please ray stop & restart cluster to "
                        f"allow mount {remote}:{local} to take hold")
        except json.JSONDecodeError:
            cli_logger.verbose(
                "Unable to check if file_mounts specified in the YAML "
                "differ from those on the running container.")

    # Explicitly copy in ray bootstrap files.
    for mount in BOOTSTRAP_MOUNTS:
        if mount in file_mounts:
            self.ssh_command_runner.run(
                "docker cp {src} {container}:{dst}".format(
                    src=os.path.join(DOCKER_MOUNT_PREFIX, mount),
                    container=self.container_name,
                    dst=self._docker_expand_user(mount)))
    self.initialized = True
def start(node_ip_address, address, port, redis_password, redis_shard_ports,
          object_manager_port, node_manager_port, gcs_server_port,
          min_worker_port, max_worker_port, memory, object_store_memory,
          redis_max_memory, num_cpus, num_gpus, resources, head,
          include_dashboard, dashboard_host, dashboard_port, block,
          plasma_directory, autoscaling_config, no_redirect_worker_output,
          no_redirect_output, plasma_store_socket_name, raylet_socket_name,
          temp_dir, java_worker_options, load_code_from_local,
          code_search_path, system_config, lru_evict,
          enable_object_reconstruction, metrics_export_port, log_style,
          log_color, verbose):
    """Start Ray processes manually on the local machine.

    With ``head`` set, starts a new cluster (including a Redis server);
    otherwise joins an existing cluster at ``address``.  With ``block``
    set, stays in the foreground monitoring the started subprocesses.
    """
    cli_logger.configure(log_style, log_color, verbose)
    if gcs_server_port and not head:
        raise ValueError(
            "gcs_server_port can be only assigned when you specify --head.")

    # Convert hostnames to numerical IP address.
    if node_ip_address is not None:
        node_ip_address = services.address_to_ip(node_ip_address)

    redis_address = None
    if address is not None:
        (redis_address, redis_address_ip,
         redis_address_port) = services.validate_redis_address(address)
    try:
        resources = json.loads(resources)
    except Exception:
        cli_logger.error("`{}` is not a valid JSON string.",
                         cf.bold("--resources"))
        # BUGFIX: the example previously omitted the opening "{".
        cli_logger.abort(
            "Valid values look like this: `{}`",
            cf.bold("--resources='{\"CustomResource3\": 1, "
                    "\"CustomResource2\": 2}'"))

        # BUGFIX: "CustomReseource2" typo fixed in the example below.
        raise Exception("Unable to parse the --resources argument using "
                        "json.loads. Try using a format like\n\n"
                        " --resources='{\"CustomResource1\": 3, "
                        "\"CustomResource2\": 2}'")

    redirect_worker_output = None if not no_redirect_worker_output else True
    redirect_output = None if not no_redirect_output else True
    ray_params = ray.parameter.RayParams(
        node_ip_address=node_ip_address,
        min_worker_port=min_worker_port,
        max_worker_port=max_worker_port,
        object_manager_port=object_manager_port,
        node_manager_port=node_manager_port,
        gcs_server_port=gcs_server_port,
        memory=memory,
        object_store_memory=object_store_memory,
        redis_password=redis_password,
        redirect_worker_output=redirect_worker_output,
        redirect_output=redirect_output,
        num_cpus=num_cpus,
        num_gpus=num_gpus,
        resources=resources,
        plasma_directory=plasma_directory,
        huge_pages=False,
        plasma_store_socket_name=plasma_store_socket_name,
        raylet_socket_name=raylet_socket_name,
        temp_dir=temp_dir,
        include_dashboard=include_dashboard,
        dashboard_host=dashboard_host,
        dashboard_port=dashboard_port,
        java_worker_options=java_worker_options,
        load_code_from_local=load_code_from_local,
        code_search_path=code_search_path,
        _system_config=system_config,
        lru_evict=lru_evict,
        enable_object_reconstruction=enable_object_reconstruction,
        metrics_export_port=metrics_export_port)
    if head:
        num_redis_shards = None
        # Start Ray on the head node.
        if redis_shard_ports is not None:
            redis_shard_ports = redis_shard_ports.split(",")
            # Infer the number of Redis shards from the ports if the number is
            # not provided.
            num_redis_shards = len(redis_shard_ports)

        if redis_address is not None:
            cli_logger.abort(
                "`{}` starts a new Redis server, `{}` should not be set.",
                cf.bold("--head"), cf.bold("--address"))

            raise Exception("If --head is passed in, a Redis server will be "
                            "started, so a Redis address should not be "
                            "provided.")

        # Get the node IP address if one is not provided.
        ray_params.update_if_absent(
            node_ip_address=services.get_node_ip_address())
        cli_logger.labeled_value("Local node IP", ray_params.node_ip_address)
        cli_logger.old_info(logger, "Using IP address {} for this node.",
                            ray_params.node_ip_address)
        ray_params.update_if_absent(
            redis_port=port,
            redis_shard_ports=redis_shard_ports,
            redis_max_memory=redis_max_memory,
            num_redis_shards=num_redis_shards,
            redis_max_clients=None,
            autoscaling_config=autoscaling_config,
        )

        node = ray.node.Node(
            ray_params, head=True, shutdown_at_exit=block, spawn_reaper=block)
        redis_address = node.redis_address

        # this is a noop if new-style is not set, so the old logger calls
        # are still in place
        cli_logger.newline()
        startup_msg = "Ray runtime started."
        cli_logger.success("-" * len(startup_msg))
        cli_logger.success(startup_msg)
        cli_logger.success("-" * len(startup_msg))
        cli_logger.newline()
        with cli_logger.group("Next steps"):
            cli_logger.print(
                "To connect to this Ray runtime from another node, run")
            cli_logger.print(
                cf.bold(" ray start --address='{}'{}"), redis_address,
                f" --redis-password='******'" if redis_password else "")
            cli_logger.newline()
            cli_logger.print("Alternatively, use the following Python code:")
            with cli_logger.indented():
                with cf.with_style("monokai") as c:
                    cli_logger.print("{} ray", c.magenta("import"))
                    cli_logger.print(
                        "ray{}init(address{}{}{})", c.magenta("."),
                        c.magenta("="), c.yellow("'auto'"),
                        ", _redis_password{}{}".format(
                            c.magenta("="),
                            c.yellow("'" + redis_password + "'"))
                        if redis_password else "")
            cli_logger.newline()
            cli_logger.print(
                cf.underlined("If connection fails, check your "
                              "firewall settings and "
                              "network configuration."))
            cli_logger.newline()
            cli_logger.print("To terminate the Ray runtime, run")
            cli_logger.print(cf.bold(" ray stop"))
        cli_logger.old_info(
            logger,
            "\nStarted Ray on this node. You can add additional nodes to "
            "the cluster by calling\n\n"
            " ray start --address='{}'{}\n\n"
            "from the node you wish to add. You can connect a driver to the "
            "cluster from Python by running\n\n"
            " import ray\n"
            " ray.init(address='auto'{})\n\n"
            "If you have trouble connecting from a different machine, check "
            "that your firewall is configured properly. If you wish to "
            "terminate the processes that have been started, run\n\n"
            " ray stop".format(
                redis_address, " --redis-password='******'"
                if redis_password else "",
                ", _redis_password='******'"
                if redis_password else ""))
    else:
        # Start Ray on a non-head node.
        if port is not None:
            cli_logger.abort("`{}` should not be specified without `{}`.",
                             cf.bold("--port"), cf.bold("--head"))

            raise Exception("If --head is not passed in, --port is not "
                            "allowed.")
        if redis_shard_ports is not None:
            cli_logger.abort("`{}` should not be specified without `{}`.",
                             cf.bold("--redis-shard-ports"), cf.bold("--head"))

            raise Exception("If --head is not passed in, --redis-shard-ports "
                            "is not allowed.")
        if redis_address is None:
            cli_logger.abort("`{}` is required unless starting with `{}`.",
                             cf.bold("--address"), cf.bold("--head"))

            raise Exception("If --head is not passed in, --address must "
                            "be provided.")
        if include_dashboard:
            cli_logger.abort("`{}` should not be specified without `{}`.",
                             cf.bold("--include-dashboard"), cf.bold("--head"))

            # BUGFIX: missing space previously produced
            # "--include-dashboardflag".
            raise ValueError(
                "If --head is not passed in, the --include-dashboard "
                "flag is not relevant.")

        # Wait for the Redis server to be started. And throw an exception if we
        # can't connect to it.
        services.wait_for_redis_to_start(
            redis_address_ip, redis_address_port, password=redis_password)

        # Create a Redis client.
        redis_client = services.create_redis_client(
            redis_address, password=redis_password)

        # Check that the version information on this node matches the version
        # information that the cluster was started with.
        services.check_version_info(redis_client)

        # Get the node IP address if one is not provided.
        ray_params.update_if_absent(
            node_ip_address=services.get_node_ip_address(redis_address))
        cli_logger.labeled_value("Local node IP", ray_params.node_ip_address)
        cli_logger.old_info(logger, "Using IP address {} for this node.",
                            ray_params.node_ip_address)
        # Check that there aren't already Redis clients with the same IP
        # address connected with this Redis instance. This raises an exception
        # if the Redis server already has clients on this node.
        check_no_existing_redis_clients(ray_params.node_ip_address,
                                        redis_client)
        ray_params.update(redis_address=redis_address)
        node = ray.node.Node(
            ray_params, head=False, shutdown_at_exit=block, spawn_reaper=block)

        cli_logger.newline()
        startup_msg = "Ray runtime started."
        cli_logger.success("-" * len(startup_msg))
        cli_logger.success(startup_msg)
        cli_logger.success("-" * len(startup_msg))
        cli_logger.newline()
        cli_logger.print("To terminate the Ray runtime, run")
        cli_logger.print(cf.bold(" ray stop"))
        cli_logger.old_info(
            logger, "\nStarted Ray on this node. If you wish to terminate the "
            "processes that have been started, run\n\n"
            " ray stop")

    if block:
        cli_logger.newline()
        with cli_logger.group(cf.bold("--block")):
            cli_logger.print(
                "This command will now block until terminated by a signal.")
            # BUGFIX: "Runing" typo fixed.
            cli_logger.print(
                "Running subprocesses are monitored and a message will be "
                "printed if any of them terminate unexpectedly.")

        while True:
            time.sleep(1)
            deceased = node.dead_processes()
            if len(deceased) > 0:
                cli_logger.newline()
                # BUGFIX: "subprcesses" typo fixed.
                cli_logger.error("Some Ray subprocesses exited unexpectedly:")
                cli_logger.old_error(logger,
                                     "Ray processes died unexpectedly:")
                with cli_logger.indented():
                    for process_type, process in deceased:
                        cli_logger.error(
                            "{}",
                            cf.bold(str(process_type)),
                            _tags={"exit code": str(process.returncode)})
                        cli_logger.old_error(
                            logger, "\t{} died with exit code {}".format(
                                process_type, process.returncode))
                # shutdown_at_exit will handle cleanup.
                cli_logger.newline()
                cli_logger.error("Remaining processes will be killed.")
                cli_logger.old_error(
                    logger, "Killing remaining processes and exiting...")
                sys.exit(1)
# Demo/smoke-test script exercising the cli_logger API: styling helpers,
# labeled values, verbosity levels, abort/doassert (exceptions swallowed),
# confirm, indentation, and groups.
# NOTE(review): this fragment is truncated — the body of the final
# `with cli_logger.group("Group"):` lies beyond this chunk; left verbatim.
cli_logger.configure(log_style="auto", verbosity=999) cli_logger.print( cf.bold("Bold ") + cf.italic("Italic ") + cf.underlined("Underlined")) cli_logger.labeled_value("Label", "value") cli_logger.print("List: {}", cli_logger.render_list([1, 2, 3])) cli_logger.newline() cli_logger.very_verbose("Very verbose") cli_logger.verbose("Verbose") cli_logger.verbose_warning("Verbose warning") cli_logger.verbose_error("Verbose error") cli_logger.print("Info") cli_logger.success("Success") cli_logger.warning("Warning") cli_logger.error("Error") cli_logger.newline() try: cli_logger.abort("Abort") except Exception: pass try: cli_logger.doassert(False, "Assert") except Exception: pass cli_logger.newline() cli_logger.confirm(True, "example") cli_logger.newline() with cli_logger.indented(): cli_logger.print("Indented") with cli_logger.group("Group"):
def get_cluster_dump_archive(cluster_config_file: Optional[str] = None,
                             host: Optional[str] = None,
                             ssh_user: Optional[str] = None,
                             ssh_key: Optional[str] = None,
                             docker: Optional[str] = None,
                             local: Optional[bool] = None,
                             output: Optional[str] = None,
                             logs: bool = True,
                             debug_state: bool = True,
                             pip: bool = True,
                             processes: bool = True,
                             processes_verbose: bool = False) -> Optional[str]:
    """Create a debug-information archive from cluster nodes.

    Collects the selected data (logs, debug state, pip freeze, process
    info) from the cluster's nodes into a .tar.gz archive.

    Args:
        cluster_config_file: Path to a ray cluster config; used to derive
            hosts/credentials when hosts are not given explicitly.
        host: Explicit host(s) to collect from.
        ssh_user: SSH user for remote collection.
        ssh_key: SSH key for remote collection.
        docker: Docker container name to collect from, if any.
        local: Include the local node; defaults to True only when no
            cluster config was supplied (see inline comment).
        output: Target archive path; auto-generated when empty.
        logs: Include Ray session logfiles.
        debug_state: Include cluster debug state.
        pip: Include `pip freeze` output.
        processes: Include running Ray process info.
        processes_verbose: Include full process command lines.

    Returns:
        The path of the written archive, or None if no nodes were found.
    """
    # Inform the user what kind of logs are collected (before actually
    # collecting, so they can abort)
    content_str = ""
    if logs:
        content_str += \
            " - The logfiles of your Ray session\n" \
            " This usually includes Python outputs (stdout/stderr)\n"
    if debug_state:
        content_str += \
            " - Debug state information on your Ray cluster \n" \
            " e.g. number of workers, drivers, objects, etc.\n"
    if pip:
        content_str += " - Your installed Python packages (`pip freeze`)\n"
    if processes:
        content_str += \
            " - Information on your running Ray processes\n" \
            " This includes command line arguments\n"
    cli_logger.warning(
        "You are about to create a cluster dump. This will collect data from "
        "cluster nodes.\n\n"
        "The dump will contain this information:\n\n"
        f"{content_str}\n"
        f"If you are concerned about leaking private information, extract "
        f"the archive and inspect its contents before sharing it with "
        f"anyone.")

    # Parse arguments (e.g. fetch info from cluster config)
    cluster_config_file, hosts, ssh_user, ssh_key, docker, cluster_name = \
        _info_from_params(cluster_config_file, host, ssh_user, ssh_key,
                          docker)

    nodes = [
        Node(
            host=h,
            ssh_user=ssh_user,
            ssh_key=ssh_key,
            docker_container=docker) for h in hosts
    ]

    if not nodes:
        cli_logger.error(
            f"No nodes found. Specify with `--host` or by passing a ray "
            f"cluster config to `--cluster`.")
        return None

    if cluster_config_file:
        # With a cluster config, the first resolved host is the head node.
        nodes[0].is_head = True

    if local is None:
        # If called with a cluster config, this was probably started
        # from a laptop
        local = not bool(cluster_config_file)

    parameters = GetParameters(
        logs=logs,
        debug_state=debug_state,
        pip=pip,
        processes=processes,
        processes_verbose=processes_verbose)

    with Archive() as archive:
        if local:
            create_archive_for_local_and_remote_nodes(
                archive, remote_nodes=nodes, parameters=parameters)
        else:
            create_archive_for_remote_nodes(
                archive, remote_nodes=nodes, parameters=parameters)

    if not output:
        # Auto-name the archive with the cluster name (if known) and a
        # timestamp, placed in the current working directory.
        if cluster_name:
            filename = f"{cluster_name}_" \
                       f"{datetime.datetime.now():%Y-%m-%d_%H-%M-%S}.tar.gz"
        else:
            filename = f"collected_logs_" \
                       f"{datetime.datetime.now():%Y-%m-%d_%H-%M-%S}.tar.gz"
        output = os.path.join(os.getcwd(), filename)
    else:
        output = os.path.expanduser(output)

    os.rename(archive.file, output)
    return output
def _log_big_error_msg(success_msg):
    """Log *success_msg* at error level, framed by dashed rules and
    surrounded by blank lines for visual emphasis."""
    rule = "-" * len(success_msg)
    cli_logger.newline()
    for text in (rule, success_msg, rule):
        cli_logger.error(text)
    cli_logger.newline()
def run(self):
    """Drive the node update and record its outcome in the node's tags.

    Calls `do_update()`; on failure marks the node UPDATE_FAILED and
    re-raises (ClickExceptions are swallowed — see todo below), on
    success marks it UP_TO_DATE with the current runtime/file-mounts
    hashes and sets `self.exitcode` to 0.
    """
    cli_logger.old_info(logger, "{}Updating to {}", self.log_prefix,
                        self.runtime_hash)

    if cmd_output_util.does_allow_interactive(
    ) and cmd_output_util.is_output_redirected():
        # this is most probably a bug since the user has no control
        # over these settings
        msg = ("Output was redirected for an interactive command. "
               "Either do not pass `--redirect-command-output` "
               "or also pass in `--use-normal-shells`.")
        cli_logger.abort(msg)
        raise click.ClickException(msg)

    try:
        with LogTimer(self.log_prefix +
                      "Applied config {}".format(self.runtime_hash)):
            self.do_update()
    except Exception as e:
        error_str = str(e)
        # Subprocess-style exceptions carry `cmd`/`returncode`; prefer
        # those details over the bare message.
        if hasattr(e, "cmd"):
            error_str = "(Exit Status {}) {}".format(
                e.returncode, " ".join(e.cmd))

        self.provider.set_node_tags(
            self.node_id, {TAG_RAY_NODE_STATUS: STATUS_UPDATE_FAILED})
        cli_logger.error("New status: {}", cf.bold(STATUS_UPDATE_FAILED))
        cli_logger.old_error(logger, "{}Error executing: {}\n",
                             self.log_prefix, error_str)

        cli_logger.error("!!!")
        if hasattr(e, "cmd"):
            cli_logger.error(
                "Setup command `{}` failed with exit code {}. stderr:",
                cf.bold(e.cmd), e.returncode)
        else:
            cli_logger.verbose_error("{}", str(vars(e)))
            # todo: handle this better somehow?
            cli_logger.error("{}", str(e))
            # todo: print stderr here
        cli_logger.error("!!!")
        cli_logger.newline()

        if isinstance(e, click.ClickException):
            # todo: why do we ignore this here
            return
        raise

    # Success: record the applied hashes so a later update with the same
    # config can be skipped.
    tags_to_set = {
        TAG_RAY_NODE_STATUS: STATUS_UP_TO_DATE,
        TAG_RAY_RUNTIME_CONFIG: self.runtime_hash,
    }
    if self.file_mounts_contents_hash is not None:
        tags_to_set[
            TAG_RAY_FILE_MOUNTS_CONTENTS] = self.file_mounts_contents_hash

    self.provider.set_node_tags(self.node_id, tags_to_set)
    cli_logger.labeled_value("New status", STATUS_UP_TO_DATE)

    self.exitcode = 0
def run_init(
    self, *, as_head: bool, file_mounts: Dict[str, str], sync_run_yet: bool
) -> bool:
    """Ensure the Ray docker container is running and correctly set up.

    Pulls the configured image (or checks it exists locally), restarts
    the container if its configuration drifted, and copies the Ray
    bootstrap files into the container (they cannot be bind mounted —
    see comment below).

    Args:
        as_head: Whether this node is the head node (selects the
            head/worker-specific image and run options).
        file_mounts: Mapping of remote path -> local path to bind mount.
        sync_run_yet: Whether file syncing has already happened; when
            False this method may return early (see inline comment).

    Returns:
        True if `docker run` was executed (or still needs to happen
        after file sync), False otherwise.
    """
    BOOTSTRAP_MOUNTS = ["~/ray_bootstrap_config.yaml", "~/ray_bootstrap_key.pem"]

    # Head/worker-specific image overrides the generic "image" key.
    specific_image = self.docker_config.get(
        f"{'head' if as_head else 'worker'}_image", self.docker_config.get("image")
    )

    self._check_docker_installed()
    if self.docker_config.get("pull_before_run", True):
        assert specific_image, (
            "Image must be included in config if " + "pull_before_run is specified"
        )
        self.run(
            "{} pull {}".format(self.docker_cmd, specific_image), run_env="host"
        )
    else:
        # Only pull if the image is not already present locally.
        self.run(
            f"{self.docker_cmd} image inspect {specific_image} "
            "1> /dev/null 2>&1 || "
            f"{self.docker_cmd} pull {specific_image}"
        )

    # Bootstrap files cannot be bind mounted because docker opens the
    # underlying inode. When the file is switched, docker becomes outdated.
    cleaned_bind_mounts = file_mounts.copy()
    for mnt in BOOTSTRAP_MOUNTS:
        cleaned_bind_mounts.pop(mnt, None)

    docker_run_executed = False

    container_running = self._check_container_status()
    requires_re_init = False
    if container_running:
        # A running container whose image/mounts drifted from the config
        # must be stopped and recreated.
        requires_re_init = self._check_if_container_restart_is_needed(
            specific_image, cleaned_bind_mounts
        )
        if requires_re_init:
            self.run(
                f"{self.docker_cmd} stop {self.container_name}", run_env="host"
            )

    if (not container_running) or requires_re_init:
        if not sync_run_yet:
            # Do not start the actual image as we need to run file_sync
            # first to ensure that all folders are created with the
            # correct ownership. Docker will create the folders with
            # `root` as the owner.
            return True
        # Get home directory
        image_env = (
            self.ssh_command_runner.run(
                f"{self.docker_cmd} " + "inspect -f '{{json .Config.Env}}' " + specific_image,
                with_output=True,
            )
            .decode()
            .strip()
        )
        home_directory = "/root"
        try:
            # The image's HOME env var (if set) determines where mounts
            # with "~" are expanded inside the container.
            for env_var in json.loads(image_env):
                if env_var.startswith("HOME="):
                    home_directory = env_var.split("HOME=")[1]
                    break
        except json.JSONDecodeError as e:
            cli_logger.error(
                "Unable to deserialize `image_env` to Python object. "
                f"The `image_env` is:\n{image_env}"
            )
            raise e

        user_docker_run_options = self.docker_config.get(
            "run_options", []
        ) + self.docker_config.get(
            f"{'head' if as_head else 'worker'}_run_options", []
        )
        start_command = docker_start_cmds(
            self.ssh_command_runner.ssh_user,
            specific_image,
            cleaned_bind_mounts,
            self.container_name,
            self._configure_runtime(
                self._auto_configure_shm(user_docker_run_options)
            ),
            self.ssh_command_runner.cluster_name,
            home_directory,
            self.docker_cmd,
        )
        self.run(start_command, run_env="host")
        docker_run_executed = True

    # Explicitly copy in ray bootstrap files.
    for mount in BOOTSTRAP_MOUNTS:
        if mount in file_mounts:
            if not sync_run_yet:
                # NOTE(ilr) This rsync is needed because when starting from
                # a stopped instance, /tmp may be deleted and `run_init`
                # is called before the first `file_sync` happens
                self.run_rsync_up(file_mounts[mount], mount)
            self.ssh_command_runner.run(
                "rsync -e '{cmd} exec -i' -avz {src} {container}:{dst}".format(
                    cmd=self.docker_cmd,
                    src=os.path.join(
                        self._get_docker_host_mount_location(
                            self.ssh_command_runner.cluster_name
                        ),
                        mount,
                    ),
                    container=self.container_name,
                    dst=self._docker_expand_user(mount),
                )
            )
            try:
                # Check if the current user has read permission.
                # If they do not, try to change ownership!
                self.run(
                    f"cat {mount} >/dev/null 2>&1 || "
                    f"sudo chown $(id -u):$(id -g) {mount}"
                )
            except Exception:
                # chown failed (e.g. no sudo in container); inspect the
                # file's ownership to give the user an actionable warning.
                lsl_string = (
                    self.run(f"ls -l {mount}", with_output=True)
                    .decode("utf-8")
                    .strip()
                )
                # The string is of format <Permission> <Links>
                # <Owner> <Group> <Size> <Date> <Name>
                permissions = lsl_string.split(" ")[0]
                owner = lsl_string.split(" ")[2]
                group = lsl_string.split(" ")[3]
                current_user = (
                    self.run("whoami", with_output=True).decode("utf-8").strip()
                )
                cli_logger.warning(
                    f"File ({mount}) is owned by user:{owner} and group:"
                    f"{group} with permissions ({permissions}). The "
                    f"current user ({current_user}) does not have "
                    "permission to read these files, and Ray may not be "
                    "able to autoscale. This can be resolved by "
                    "installing `sudo` in your container, or adding a "
                    f"command like 'chown {current_user} {mount}' to "
                    "your `setup_commands`."
                )
    self.initialized = True
    return docker_run_executed