def _run_helper(self, final_cmd, with_output=False, exit_on_fail=False, silent=False): """Run a command that was already setup with SSH and `bash` settings. Args: cmd (List[str]): Full command to run. Should include SSH options and other processing that we do. with_output (bool): If `with_output` is `True`, command stdout and stderr will be captured and returned. exit_on_fail (bool): If `exit_on_fail` is `True`, the process will exit if the command fails (exits with a code other than 0). Raises: ProcessRunnerError if using new log style and disabled login shells. click.ClickException if using login shells. """ try: # For now, if the output is needed we just skip the new logic. # In the future we could update the new logic to support # capturing output, but it is probably not needed. if not cli_logger.old_style and not with_output: return run_cmd_redirected( final_cmd, process_runner=self.process_runner, silent=silent, use_login_shells=is_using_login_shells()) if with_output: return self.process_runner.check_output(final_cmd) else: return self.process_runner.check_call(final_cmd) except subprocess.CalledProcessError as e: quoted_cmd = " ".join(final_cmd[:-1] + [quote(final_cmd[-1])]) if not cli_logger.old_style and not is_using_login_shells(): raise ProcessRunnerError("Command failed", "ssh_command_failed", code=e.returncode, command=quoted_cmd) if exit_on_fail: raise click.ClickException( "Command failed:\n\n {}\n".format(quoted_cmd)) from None else: fail_msg = "SSH command failed." if is_output_redirected(): fail_msg += " See above for the output from the failure." raise click.ClickException(fail_msg) from None
def run(self): cli_logger.old_info(logger, "{}Updating to {}", self.log_prefix, self.runtime_hash) if cmd_output_util.does_allow_interactive( ) and cmd_output_util.is_output_redirected(): # this is most probably a bug since the user has no control # over these settings msg = ("Output was redirected for an interactive command. " "Either do not pass `--redirect-command-output` " "or also pass in `--use-normal-shells`.") cli_logger.abort(msg) raise click.ClickException(msg) try: with LogTimer(self.log_prefix + "Applied config {}".format(self.runtime_hash)): self.do_update() except Exception as e: error_str = str(e) if hasattr(e, "cmd"): error_str = "(Exit Status {}) {}".format( e.returncode, " ".join(e.cmd)) self.provider.set_node_tags( self.node_id, {TAG_RAY_NODE_STATUS: STATUS_UPDATE_FAILED}) cli_logger.error("New status: {}", cf.bold(STATUS_UPDATE_FAILED)) cli_logger.old_error(logger, "{}Error executing: {}\n", self.log_prefix, error_str) cli_logger.error("!!!") if hasattr(e, "cmd"): cli_logger.error( "Setup command `{}` failed with exit code {}. stderr:", cf.bold(e.cmd), e.returncode) else: cli_logger.verbose_error("{}", str(vars(e))) # todo: handle this better somehow? cli_logger.error("{}", str(e)) # todo: print stderr here cli_logger.error("!!!") cli_logger.newline() if isinstance(e, click.ClickException): # todo: why do we ignore this here return raise tags_to_set = { TAG_RAY_NODE_STATUS: STATUS_UP_TO_DATE, TAG_RAY_RUNTIME_CONFIG: self.runtime_hash, } if self.file_mounts_contents_hash is not None: tags_to_set[ TAG_RAY_FILE_MOUNTS_CONTENTS] = self.file_mounts_contents_hash self.provider.set_node_tags(self.node_id, tags_to_set) cli_logger.labeled_value("New status", STATUS_UP_TO_DATE) self.exitcode = 0
def do_update(self): self.provider.set_node_tags( self.node_id, {TAG_RAY_NODE_STATUS: STATUS_WAITING_FOR_SSH}) cli_logger.labeled_value("New status", STATUS_WAITING_FOR_SSH) deadline = time.time() + NODE_START_WAIT_S self.wait_ready(deadline) node_tags = self.provider.node_tags(self.node_id) logger.debug("Node tags: {}".format(str(node_tags))) # runtime_hash will only change whenever the user restarts # or updates their cluster with `get_or_create_head_node` if node_tags.get(TAG_RAY_RUNTIME_CONFIG) == self.runtime_hash and ( self.file_mounts_contents_hash is None or node_tags.get(TAG_RAY_FILE_MOUNTS_CONTENTS) == self.file_mounts_contents_hash): # todo: we lie in the confirmation message since # full setup might be cancelled here cli_logger.print( "Configuration already up to date, " "skipping file mounts, initalization and setup commands.", _numbered=("[]", "2-5", 6)) cli_logger.old_info(logger, "{}{} already up-to-date, skip to ray start", self.log_prefix, self.node_id) # When resuming from a stopped instance the runtime_hash may be the # same, but the container will not be started. self.cmd_runner.run_init(as_head=self.is_head_node, file_mounts=self.file_mounts) else: cli_logger.print("Updating cluster configuration.", _tags=dict(hash=self.runtime_hash)) self.provider.set_node_tags( self.node_id, {TAG_RAY_NODE_STATUS: STATUS_SYNCING_FILES}) cli_logger.labeled_value("New status", STATUS_SYNCING_FILES) self.sync_file_mounts(self.rsync_up, step_numbers=(2, 6)) # Only run setup commands if runtime_hash has changed because # we don't want to run setup_commands every time the head node # file_mounts folders have changed. if node_tags.get(TAG_RAY_RUNTIME_CONFIG) != self.runtime_hash: # Run init commands self.provider.set_node_tags( self.node_id, {TAG_RAY_NODE_STATUS: STATUS_SETTING_UP}) cli_logger.labeled_value("New status", STATUS_SETTING_UP) if self.initialization_commands: with cli_logger.group("Running initialization commands", _numbered=("[]", 3, 5)): with LogTimer(self.log_prefix + "Initialization commands", show_status=True): for cmd in self.initialization_commands: try: # Overriding the existing SSHOptions class # with a new SSHOptions class that uses # this ssh_private_key as its only __init__ # argument. # Run outside docker. self.cmd_runner.run( cmd, ssh_options_override_ssh_key=self. auth_config.get("ssh_private_key"), run_env="host") except ProcessRunnerError as e: if e.msg_type == "ssh_command_failed": cli_logger.error("Failed.") cli_logger.error( "See above for stderr.") raise click.ClickException( "Initialization command failed." ) from None else: cli_logger.print("No initialization commands to run.", _numbered=("[]", 3, 6)) self.cmd_runner.run_init(as_head=self.is_head_node, file_mounts=self.file_mounts) if self.setup_commands: with cli_logger.group( "Running setup commands", # todo: fix command numbering _numbered=("[]", 4, 6)): with LogTimer(self.log_prefix + "Setup commands", show_status=True): total = len(self.setup_commands) for i, cmd in enumerate(self.setup_commands): if cli_logger.verbosity == 0 and len(cmd) > 30: cmd_to_print = cf.bold(cmd[:30]) + "..." else: cmd_to_print = cf.bold(cmd) cli_logger.print("{}", cmd_to_print, _numbered=("()", i, total)) try: # Runs in the container if docker is in use self.cmd_runner.run(cmd, run_env="auto") except ProcessRunnerError as e: if e.msg_type == "ssh_command_failed": cli_logger.error("Failed.") cli_logger.error( "See above for stderr.") raise click.ClickException( "Setup command failed.") else: cli_logger.print("No setup commands to run.", _numbered=("[]", 4, 6)) with cli_logger.group("Starting the Ray runtime", _numbered=("[]", 6, 6)): with LogTimer(self.log_prefix + "Ray start commands", show_status=True): for cmd in self.ray_start_commands: if self.node_resources: env_vars = { ray_constants.RESOURCES_ENVIRONMENT_VARIABLE: self.node_resources } else: env_vars = {} try: old_redirected = cmd_output_util.is_output_redirected() cmd_output_util.set_output_redirected(False) # Runs in the container if docker is in use self.cmd_runner.run(cmd, environment_variables=env_vars, run_env="auto") cmd_output_util.set_output_redirected(old_redirected) except ProcessRunnerError as e: if e.msg_type == "ssh_command_failed": cli_logger.error("Failed.") cli_logger.error("See above for stderr.") raise click.ClickException("Start command failed.")
return self.process_runner.check_call(final_cmd) except subprocess.CalledProcessError as e: quoted_cmd = " ".join(final_cmd[:-1] + [quote(final_cmd[-1])]) if not cli_logger.old_style and not is_using_login_shells(): raise ProcessRunnerError( "Command failed", "ssh_command_failed", code=e.returncode, command=quoted_cmd) if exit_on_fail: raise click.ClickException( "Command failed:\n\n {}\n".format(quoted_cmd)) from None else: fail_msg = "SSH command failed." if is_output_redirected(): fail_msg += " See above for the output from the failure." raise click.ClickException(fail_msg) from None def run( self, cmd, timeout=120, exit_on_fail=False, port_forward=None, with_output=False, environment_variables: Dict[str, object] = None, run_env="auto", # Unused argument. ssh_options_override_ssh_key="", shutdown_after_run=False, ):