Ejemplo n.º 1
0
 def handle_yaml_error(e):
     """Report a cluster-config YAML loading failure and abort.

     ``config_file`` is not a parameter; it is resolved from an enclosing
     scope that is not shown here.

     Args:
         e: The error raised while loading the YAML file. It is logged
             inside the logger's "PyYAML error:" verbatim context.
     """
     headline = ("Cluster config invalid.\n"
                 "Failed to load YAML file " + cf.bold("{}"))
     cli_logger.error(headline, config_file)
     cli_logger.newline()

     # Show the loader's own error under a dedicated heading.
     pyyaml_error_ctx = cli_logger.verbatim_error_ctx("PyYAML error:")
     with pyyaml_error_ctx:
         cli_logger.error(e)

     cli_logger.abort()
Ejemplo n.º 2
0
Archivo: updater.py Proyecto: aeli0/ray
    def run(self):
        """Apply the node configuration and record the outcome in node tags.

        Runs ``self.do_update()`` under a ``LogTimer``. On success the node
        is tagged ``STATUS_UP_TO_DATE`` together with the runtime hash (and
        the file-mounts contents hash when one is set) and ``self.exitcode``
        is set to 0. On failure the node is tagged ``STATUS_UPDATE_FAILED``
        and the error is logged.

        Raises:
            Exception: Any exception from ``do_update`` other than
                ``click.ClickException`` is re-raised after logging. A
                ``click.ClickException`` is logged and swallowed; in that
                case this method returns without setting ``self.exitcode``.
        """
        cli_logger.old_info(logger, "{}Updating to {}", self.log_prefix,
                            self.runtime_hash)

        try:
            with LogTimer(self.log_prefix +
                          "Applied config {}".format(self.runtime_hash)):
                self.do_update()
        except Exception as e:
            error_str = str(e)
            if hasattr(e, "cmd"):
                # `cmd` may be a single string or a sequence of arguments.
                # Joining a plain string would put a space between every
                # character, and joining non-str items would raise here and
                # mask the original error.
                if isinstance(e.cmd, (str, bytes)):
                    cmd_str = e.cmd
                else:
                    cmd_str = " ".join(str(part) for part in e.cmd)
                error_str = "(Exit Status {}) {}".format(
                    e.returncode, cmd_str)

            self.provider.set_node_tags(
                self.node_id, {TAG_RAY_NODE_STATUS: STATUS_UPDATE_FAILED})
            cli_logger.error("New status: {}", cf.bold(STATUS_UPDATE_FAILED))

            cli_logger.old_error(logger, "{}Error executing: {}\n",
                                 self.log_prefix, error_str)

            cli_logger.error("!!!")
            if hasattr(e, "cmd"):
                cli_logger.error(
                    "Setup command `{}` failed with exit code {}. stderr:",
                    cf.bold(e.cmd), e.returncode)
            else:
                cli_logger.verbose_error("{}", str(vars(e)))
                # todo: handle this better somehow?
                cli_logger.error("{}", str(e))
            # todo: print stderr here
            cli_logger.error("!!!")
            cli_logger.newline()

            if isinstance(e, click.ClickException):
                # todo: why do we ignore this here
                return
            raise

        tags_to_set = {
            TAG_RAY_NODE_STATUS: STATUS_UP_TO_DATE,
            TAG_RAY_RUNTIME_CONFIG: self.runtime_hash,
        }
        # The file-mounts hash tag is only written when a hash was computed.
        if self.file_mounts_contents_hash is not None:
            tags_to_set[
                TAG_RAY_FILE_MOUNTS_CONTENTS] = self.file_mounts_contents_hash

        self.provider.set_node_tags(self.node_id, tags_to_set)
        cli_logger.labeled_value("New status", STATUS_UP_TO_DATE)

        self.exitcode = 0
Ejemplo n.º 3
0
    def run_init(self, *, as_head, file_mounts):
        """Make sure the Docker container for this node is running.

        The image is taken from ``docker_config["image"]`` and overridden by
        ``head_image`` / ``worker_image`` when present. Unless
        ``pull_before_run`` is disabled, the image is pulled first. If
        ``_check_container_status()`` is falsy the container is started;
        otherwise the running container is only inspected and mismatches
        between it and the config (image, bind mounts) are logged. The
        container is not restarted in that case.

        Args:
            as_head: Selects the ``head_*`` config keys when true, the
                ``worker_*`` keys otherwise.
            file_mounts: Mapping of remote path to local path. It is passed
                to ``docker_start_cmds`` and compared against the mounts of
                an already running container.

        Raises:
            AssertionError: If ``pull_before_run`` is enabled and no image
                is configured.
        """
        role = "head" if as_head else "worker"
        image = self.docker_config.get("image")
        image = self.docker_config.get(f"{role}_image", image)

        self._check_docker_installed()
        if self.docker_config.get("pull_before_run", True):
            assert image, "Image must be included in config if " + \
                "pull_before_run is specified"

            self.run("docker pull {}".format(image), run_env="host")

        start_command = docker_start_cmds(
            self.ssh_command_runner.ssh_user, image, file_mounts,
            self.container_name,
            self.docker_config.get("run_options", []) + self.docker_config.get(
                f"{role}_run_options", []))

        if not self._check_container_status():
            self.run(start_command, run_env="host")
        else:
            running_image = self.run(
                check_docker_image(self.container_name),
                with_output=True,
                run_env="host").decode("utf-8").strip()
            if running_image != image:
                # The original message was missing its closing parenthesis.
                logger.error(f"A container with name {self.container_name} " +
                             f"is running image {running_image} instead " +
                             f"of {image} (which was provided in the YAML)")
            mounts = self.run(
                check_bind_mounts_cmd(self.container_name),
                with_output=True,
                run_env="host").decode("utf-8").strip()
            try:
                active_mounts = json.loads(mounts)
                # A set gives constant-time membership tests in the loop.
                active_remote_mounts = {
                    mnt["Destination"]
                    for mnt in active_mounts
                }
                for remote, local in file_mounts.items():
                    remote = self._docker_expand_user(remote)
                    if remote not in active_remote_mounts:
                        cli_logger.error(
                            "Please ray stop & restart cluster to "
                            f"allow mount {remote}:{local} to take hold")
            except json.JSONDecodeError:
                # The mount check is best-effort; unparsable output is only
                # reported at verbose level.
                cli_logger.verbose(
                    "Unable to check if file_mounts specified in the YAML "
                    "differ from those on the running container.")
        self.initialized = True
Ejemplo n.º 4
0
def handle_ssh_fails(e, first_conn_refused_time, retry_interval):
    """Handle SSH system failures coming from a subprocess.

    Args:
        e: The `ProcessRunnerException` to handle.
        first_conn_refused_time:
            The value previously returned by this function, or None if no
            CONN_REFUSED error has been seen yet. It is the time at which
            the first CONN_REFUSED error was caught.

            After exceeding a patience value, the program will be aborted
            since SSH will likely never recover.
        retry_interval: The interval after which the command will be retried,
                        used here just to inform the user.

    Returns:
        None when `e` is not an SSH command failure. Otherwise the timestamp
        the caller should pass back as `first_conn_refused_time` on the next
        call.

    Raises:
        The exception `e` itself, when it is an SSH command failure that is
        neither a timeout nor a refused connection.
    """
    if e.msg_type != "ssh_command_failed":
        return

    if e.special_case == "ssh_conn_refused":
        if first_conn_refused_time is not None and \
            time.time() - first_conn_refused_time > \
                CONN_REFUSED_PATIENCE:
            cli_logger.error(
                "SSH connection was being refused "
                "for {} seconds. Head node assumed "
                "unreachable.", cf.bold(str(CONN_REFUSED_PATIENCE)))
            cli_logger.abort("Check the node's firewall settings "
                             "and the cloud network configuration.")

        cli_logger.warning("SSH connection was refused.")
        cli_logger.warning("This might mean that the SSH daemon is "
                           "still setting up, or that "
                           "the host is inaccessable (e.g. due to "
                           "a firewall).")

        # Keep the timestamp of the *first* refusal. Returning the current
        # time on every call would restart the patience window on each
        # retry, so the abort above could never trigger while refusals keep
        # arriving at the retry interval.
        if first_conn_refused_time is None:
            return time.time()
        return first_conn_refused_time

    # "ssh_conn_refused" has already returned above, so only a timeout can
    # reach the retry message here.
    if e.special_case == "ssh_timeout":
        cli_logger.print("SSH still not available, "
                         "retrying in {} seconds.",
                         cf.bold(str(retry_interval)))
    else:
        raise e

    return first_conn_refused_time
Ejemplo n.º 5
0
    def run_init(self, *, as_head, file_mounts):
        """Make sure the Docker container is running and holds bootstrap files.

        The image is taken from ``docker_config["image"]`` and overridden by
        ``head_image`` / ``worker_image`` when present. Unless
        ``pull_before_run`` is disabled, the image is pulled first. If
        ``_check_container_status()`` is falsy the container is started;
        otherwise the running container is only inspected and mismatches
        between it and the config (image, bind mounts) are logged. Finally
        the Ray bootstrap files present in ``file_mounts`` are copied into
        the container with ``docker cp``.

        Args:
            as_head: Selects the ``head_*`` config keys when true, the
                ``worker_*`` keys otherwise.
            file_mounts: Mapping of remote path to local path. It is not
                modified; a copy without the bootstrap files is used for
                bind mounts.

        Raises:
            AssertionError: If ``pull_before_run`` is enabled and no image
                is configured.
        """
        BOOTSTRAP_MOUNTS = [
            "~/ray_bootstrap_config.yaml", "~/ray_bootstrap_key.pem"
        ]

        role = "head" if as_head else "worker"
        image = self.docker_config.get("image")
        image = self.docker_config.get(f"{role}_image", image)

        self._check_docker_installed()
        if self.docker_config.get("pull_before_run", True):
            assert image, "Image must be included in config if " + \
                "pull_before_run is specified"

            self.run("docker pull {}".format(image), run_env="host")

        # Bootstrap files cannot be bind mounted because docker opens the
        # underlying inode. When the file is switched, docker becomes outdated.
        cleaned_bind_mounts = file_mounts.copy()
        for mnt in BOOTSTRAP_MOUNTS:
            cleaned_bind_mounts.pop(mnt, None)

        start_command = docker_start_cmds(
            self.ssh_command_runner.ssh_user, image, cleaned_bind_mounts,
            self.container_name,
            self.docker_config.get("run_options", []) + self.docker_config.get(
                f"{role}_run_options", []))

        if not self._check_container_status():
            self.run(start_command, run_env="host")
        else:
            running_image = self.run(check_docker_image(self.container_name),
                                     with_output=True,
                                     run_env="host").decode("utf-8").strip()
            if running_image != image:
                # The original message was missing its closing parenthesis.
                logger.error(f"A container with name {self.container_name} " +
                             f"is running image {running_image} instead " +
                             f"of {image} (which was provided in the YAML)")
            mounts = self.run(check_bind_mounts_cmd(self.container_name),
                              with_output=True,
                              run_env="host").decode("utf-8").strip()
            try:
                active_mounts = json.loads(mounts)
                # A set gives constant-time membership tests in the loop.
                active_remote_mounts = {
                    mnt["Destination"]
                    for mnt in active_mounts
                }
                # Ignore ray bootstrap files.
                for remote, local in cleaned_bind_mounts.items():
                    remote = self._docker_expand_user(remote)
                    if remote not in active_remote_mounts:
                        cli_logger.error(
                            "Please ray stop & restart cluster to "
                            f"allow mount {remote}:{local} to take hold")
            except json.JSONDecodeError:
                # The mount check is best-effort; unparsable output is only
                # reported at verbose level.
                cli_logger.verbose(
                    "Unable to check if file_mounts specified in the YAML "
                    "differ from those on the running container.")

        # Explicitly copy in ray bootstrap files.
        for mount in BOOTSTRAP_MOUNTS:
            if mount in file_mounts:
                self.ssh_command_runner.run(
                    "docker cp {src} {container}:{dst}".format(
                        src=os.path.join(DOCKER_MOUNT_PREFIX, mount),
                        container=self.container_name,
                        dst=self._docker_expand_user(mount)))
        self.initialized = True
Ejemplo n.º 6
0
    def do_update(self):
        """Bring the node up to date and run the Ray start commands.

        Steps, in order:

        1. Tag the node ``STATUS_WAITING_FOR_SSH`` and call ``wait_ready``
           with a deadline of ``NODE_START_WAIT_S`` seconds from now.
        2. If the node's runtime-config tag matches ``self.runtime_hash``
           (and the file-mounts hash tag matches, when a hash is set), skip
           syncing and setup and only call ``cmd_runner.run_init``.
        3. Otherwise sync file mounts and, if the runtime hash changed, run
           the initialization commands (on the host), ``run_init`` and the
           setup commands.
        4. Run every command in ``self.ray_start_commands``.

        Raises:
            click.ClickException: If an initialization, setup or start
                command raises ``ProcessRunnerError``.
        """
        self.provider.set_node_tags(
            self.node_id, {TAG_RAY_NODE_STATUS: STATUS_WAITING_FOR_SSH})
        cli_logger.labeled_value("New status", STATUS_WAITING_FOR_SSH)

        deadline = time.time() + NODE_START_WAIT_S
        self.wait_ready(deadline)

        node_tags = self.provider.node_tags(self.node_id)
        logger.debug("Node tags: {}".format(str(node_tags)))

        # runtime_hash will only change whenever the user restarts
        # or updates their cluster with `get_or_create_head_node`
        if node_tags.get(TAG_RAY_RUNTIME_CONFIG) == self.runtime_hash and (
                self.file_mounts_contents_hash is None
                or node_tags.get(TAG_RAY_FILE_MOUNTS_CONTENTS)
                == self.file_mounts_contents_hash):
            # todo: we lie in the confirmation message since
            # full setup might be cancelled here
            cli_logger.print(
                "Configuration already up to date, "
                "skipping file mounts, initalization and setup commands.",
                _numbered=("[]", "2-5", 6))
            cli_logger.old_info(logger,
                                "{}{} already up-to-date, skip to ray start",
                                self.log_prefix, self.node_id)

            # When resuming from a stopped instance the runtime_hash may be the
            # same, but the container will not be started.
            self.cmd_runner.run_init(as_head=self.is_head_node,
                                     file_mounts=self.file_mounts)

        else:
            cli_logger.print("Updating cluster configuration.",
                             _tags=dict(hash=self.runtime_hash))

            self.provider.set_node_tags(
                self.node_id, {TAG_RAY_NODE_STATUS: STATUS_SYNCING_FILES})
            cli_logger.labeled_value("New status", STATUS_SYNCING_FILES)
            self.sync_file_mounts(self.rsync_up, step_numbers=(2, 6))

            # Only run setup commands if runtime_hash has changed because
            # we don't want to run setup_commands every time the head node
            # file_mounts folders have changed.
            if node_tags.get(TAG_RAY_RUNTIME_CONFIG) != self.runtime_hash:
                # Run init commands
                self.provider.set_node_tags(
                    self.node_id, {TAG_RAY_NODE_STATUS: STATUS_SETTING_UP})
                cli_logger.labeled_value("New status", STATUS_SETTING_UP)

                if self.initialization_commands:
                    # NOTE(review): the step total is 5 here but 6 in every
                    # other step label of this method - confirm which one
                    # is intended.
                    with cli_logger.group("Running initialization commands",
                                          _numbered=("[]", 3, 5)):
                        with LogTimer(self.log_prefix +
                                      "Initialization commands",
                                      show_status=True):
                            for cmd in self.initialization_commands:
                                try:
                                    # Overriding the existing SSHOptions class
                                    # with a new SSHOptions class that uses
                                    # this ssh_private_key as its only __init__
                                    # argument.
                                    # Run outside docker.
                                    self.cmd_runner.run(
                                        cmd,
                                        ssh_options_override_ssh_key=self.
                                        auth_config.get("ssh_private_key"),
                                        run_env="host")
                                except ProcessRunnerError as e:
                                    if e.msg_type == "ssh_command_failed":
                                        cli_logger.error("Failed.")
                                        cli_logger.error(
                                            "See above for stderr.")

                                    raise click.ClickException(
                                        "Initialization command failed."
                                    ) from None
                else:
                    cli_logger.print("No initialization commands to run.",
                                     _numbered=("[]", 3, 6))
                self.cmd_runner.run_init(as_head=self.is_head_node,
                                         file_mounts=self.file_mounts)
                if self.setup_commands:
                    with cli_logger.group(
                            "Running setup commands",
                            # todo: fix command numbering
                            _numbered=("[]", 4, 6)):
                        with LogTimer(self.log_prefix + "Setup commands",
                                      show_status=True):

                            total = len(self.setup_commands)
                            for i, cmd in enumerate(self.setup_commands):
                                # At verbosity 0 long commands are shortened
                                # to their first 30 characters.
                                if cli_logger.verbosity == 0 and len(cmd) > 30:
                                    cmd_to_print = cf.bold(cmd[:30]) + "..."
                                else:
                                    cmd_to_print = cf.bold(cmd)

                                cli_logger.print("{}",
                                                 cmd_to_print,
                                                 _numbered=("()", i, total))

                                try:
                                    # Runs in the container if docker is in use
                                    self.cmd_runner.run(cmd, run_env="auto")
                                except ProcessRunnerError as e:
                                    if e.msg_type == "ssh_command_failed":
                                        cli_logger.error("Failed.")
                                        cli_logger.error(
                                            "See above for stderr.")

                                    raise click.ClickException(
                                        "Setup command failed.")
                else:
                    cli_logger.print("No setup commands to run.",
                                     _numbered=("[]", 4, 6))

        with cli_logger.group("Starting the Ray runtime",
                              _numbered=("[]", 6, 6)):
            with LogTimer(self.log_prefix + "Ray start commands",
                          show_status=True):
                for cmd in self.ray_start_commands:
                    if self.node_resources:
                        env_vars = {
                            ray_constants.RESOURCES_ENVIRONMENT_VARIABLE:
                            self.node_resources
                        }
                    else:
                        env_vars = {}

                    # Output redirection is switched off while the start
                    # command runs. The previous setting is restored in
                    # `finally` so that a failing command does not leave the
                    # process-wide flag stuck at False.
                    old_redirected = cmd_output_util.is_output_redirected()
                    cmd_output_util.set_output_redirected(False)
                    try:
                        # Runs in the container if docker is in use
                        self.cmd_runner.run(cmd,
                                            environment_variables=env_vars,
                                            run_env="auto")
                    except ProcessRunnerError as e:
                        if e.msg_type == "ssh_command_failed":
                            cli_logger.error("Failed.")
                            cli_logger.error("See above for stderr.")

                        raise click.ClickException("Start command failed.")
                    finally:
                        cmd_output_util.set_output_redirected(old_redirected)
Ejemplo n.º 7
0
    def run(self):
        """Apply the node configuration and record the outcome in node tags.

        First rejects the combination "interactive commands allowed" plus
        "output redirected". Then runs ``self.do_update()`` under a
        ``LogTimer``. On success the node is tagged ``STATUS_UP_TO_DATE``
        together with the runtime hash (and the file-mounts contents hash
        when one is set) and ``self.exitcode`` is set to 0. On failure the
        node is tagged ``STATUS_UPDATE_FAILED`` and the error is logged.

        Raises:
            click.ClickException: If output is redirected while interactive
                commands are allowed (raised after ``cli_logger.abort`` in
                case that call returns).
            Exception: Any exception from ``do_update`` other than
                ``click.ClickException`` is re-raised after logging. A
                ``click.ClickException`` from ``do_update`` is logged and
                swallowed; in that case this method returns without setting
                ``self.exitcode``.
        """
        cli_logger.old_info(logger, "{}Updating to {}", self.log_prefix,
                            self.runtime_hash)

        if cmd_output_util.does_allow_interactive(
        ) and cmd_output_util.is_output_redirected():
            # this is most probably a bug since the user has no control
            # over these settings
            msg = ("Output was redirected for an interactive command. "
                   "Either do not pass `--redirect-command-output` "
                   "or also pass in `--use-normal-shells`.")
            cli_logger.abort(msg)
            raise click.ClickException(msg)

        try:
            with LogTimer(self.log_prefix +
                          "Applied config {}".format(self.runtime_hash)):
                self.do_update()
        except Exception as e:
            error_str = str(e)
            if hasattr(e, "cmd"):
                # `cmd` may be a single string or a sequence of arguments.
                # Joining a plain string would put a space between every
                # character, and joining non-str items would raise here and
                # mask the original error.
                if isinstance(e.cmd, (str, bytes)):
                    cmd_str = e.cmd
                else:
                    cmd_str = " ".join(str(part) for part in e.cmd)
                error_str = "(Exit Status {}) {}".format(
                    e.returncode, cmd_str)

            self.provider.set_node_tags(
                self.node_id, {TAG_RAY_NODE_STATUS: STATUS_UPDATE_FAILED})
            cli_logger.error("New status: {}", cf.bold(STATUS_UPDATE_FAILED))

            cli_logger.old_error(logger, "{}Error executing: {}\n",
                                 self.log_prefix, error_str)

            cli_logger.error("!!!")
            if hasattr(e, "cmd"):
                cli_logger.error(
                    "Setup command `{}` failed with exit code {}. stderr:",
                    cf.bold(e.cmd), e.returncode)
            else:
                cli_logger.verbose_error("{}", str(vars(e)))
                # todo: handle this better somehow?
                cli_logger.error("{}", str(e))
            # todo: print stderr here
            cli_logger.error("!!!")
            cli_logger.newline()

            if isinstance(e, click.ClickException):
                # todo: why do we ignore this here
                return
            raise

        tags_to_set = {
            TAG_RAY_NODE_STATUS: STATUS_UP_TO_DATE,
            TAG_RAY_RUNTIME_CONFIG: self.runtime_hash,
        }
        # The file-mounts hash tag is only written when a hash was computed.
        if self.file_mounts_contents_hash is not None:
            tags_to_set[
                TAG_RAY_FILE_MOUNTS_CONTENTS] = self.file_mounts_contents_hash

        self.provider.set_node_tags(self.node_id, tags_to_set)
        cli_logger.labeled_value("New status", STATUS_UP_TO_DATE)

        self.exitcode = 0
Ejemplo n.º 8
0
def _read_subprocess_stream(f, output_file, is_stdout=False):
    """Read and process a subprocess output stream.

    The goal is to find error messages and respond to them in a clever way.
    Currently just used for SSH messages (CONN_REFUSED, TIMEOUT, etc.), so
    the user does not get confused by these.

    Ran in a thread each for both `stdout` and `stderr` to
    allow for cross-platform asynchronous IO.

    Note: `select`-based IO is another option, but Windows has
    no support for `select`ing pipes, and Linux support varies somewhat.
    Spefically, Older *nix systems might also have quirks in how they
    handle `select` on pipes.

    Args:
        f: File object for the stream.
        output_file: File object to which filtered output is written.
        is_stdout (bool):
            When `is_stdout` is `False`, the stream is assumed to
            be `stderr`. Different error message detectors are used,
            and the output is displayed to the user unless it matches
            a special case (e.g. SSH timeout), in which case this is
            left up to the caller.
    """

    detected_special_case = None
    while True:
        # ! Readline here is crucial.
        # ! Normal `read()` will block until EOF instead of until
        #   something is available.
        line = f.readline()

        if line is None or line == "":
            # EOF
            break

        if line[-1] == "\n":
            line = line[:-1]

        if not is_stdout:
            if _ssh_output_regexes["connection_closed"]\
                    .fullmatch(line) is not None:
                # Do not log "connection closed" messages which SSH
                # puts in stderr for no reason.
                #
                # They are never errors since the connection will
                # close no matter whether the command succeeds or not.
                continue

            if _ssh_output_regexes["timeout"].fullmatch(line) is not None:
                # Timeout is not really an error but rather a special
                # condition. It should be handled by the caller, since
                # network conditions/nodes in the early stages of boot
                # are expected to sometimes cause connection timeouts.
                if detected_special_case is not None:
                    raise ValueError("Bug: ssh_timeout conflicts with another "
                                     "special codition: " +
                                     detected_special_case)

                detected_special_case = "ssh_timeout"
                continue

            if _ssh_output_regexes["conn_refused"]\
                    .fullmatch(line) is not None:
                # Connection refused is not really an error but
                # rather a special condition. It should be handled by
                # the caller, since network conditions/nodes in the
                # early stages of boot are expected to sometimes cause
                # CONN_REFUSED.
                if detected_special_case is not None:
                    raise ValueError(
                        "Bug: ssh_conn_refused conflicts with another "
                        "special codition: " + detected_special_case)

                detected_special_case = "ssh_conn_refused"
                continue

            if _ssh_output_regexes["known_host_update"]\
                    .fullmatch(line) is not None:
                # Since we ignore SSH host control anyway
                # (-o UserKnownHostsFile=/dev/null),
                # we should silence the host control warnings.
                continue

            cli_logger.error(line)

        if output_file is not None:
            output_file.write(line + "\n")

    return detected_special_case
Ejemplo n.º 9
0
def handle_boto_error(exc, msg, *args, **kwargs):
    """Log a boto error with a helpful message and abort.

    Does nothing when ``cli_logger.old_style`` is set. Otherwise the error
    code is read from ``exc.response["Error"]["Code"]`` when available.
    For expired-credential codes, instructions for obtaining and exporting
    a new AWS session are passed to ``cli_logger.abort``. For every other
    error the formatted message, the error code and the exception itself
    are logged before ``cli_logger.abort()`` is called.

    Args:
        exc: The exception to report. A ``response`` attribute is used if
            present.
        msg: Format string describing what failed.
        *args: Positional arguments for ``msg.format``.
        **kwargs: Keyword arguments for ``msg.format``.
    """
    if cli_logger.old_style:
        # old-style logging doesn't do anything here
        # so we exit early
        return

    error_code = None
    error_info = None
    # todo: not sure if these exceptions always have response
    if hasattr(exc, "response"):
        error_info = exc.response.get("Error", None)
    if error_info is not None:
        error_code = error_info.get("Code", None)

    generic_message_args = [
        "{}\n"
        "Error code: {}",
        msg.format(*args, **kwargs),
        cf.bold(error_code)
    ]

    # apparently
    # ExpiredTokenException
    # ExpiredToken
    # RequestExpired
    # are all the same pretty much
    credentials_expiration_codes = [
        "ExpiredTokenException", "ExpiredToken", "RequestExpired"
    ]

    if error_code in credentials_expiration_codes:
        # "An error occurred (ExpiredToken) when calling the
        # GetInstanceProfile operation: The security token
        # included in the request is expired"

        # "An error occurred (RequestExpired) when calling the
        # DescribeKeyPairs operation: Request has expired."

        token_command = ("aws sts get-session-token "
                         "--serial-number arn:aws:iam::" +
                         cf.underlined("ROOT_ACCOUNT_ID") + ":mfa/" +
                         cf.underlined("AWS_USERNAME") + " --token-code " +
                         cf.underlined("TWO_FACTOR_AUTH_CODE"))

        # Shell assignments must not have spaces around `=`:
        # `export VAR = value` is rejected by the shell, so the suggested
        # commands are printed as `export VAR=value`.
        secret_key_var = ("export AWS_SECRET_ACCESS_KEY=" +
                          cf.underlined("REPLACE_ME") +
                          " # found at Credentials.SecretAccessKey")
        session_token_var = ("export AWS_SESSION_TOKEN=" +
                             cf.underlined("REPLACE_ME") +
                             " # found at Credentials.SessionToken")
        access_key_id_var = ("export AWS_ACCESS_KEY_ID=" +
                             cf.underlined("REPLACE_ME") +
                             " # found at Credentials.AccessKeyId")

        # fixme: replace with a Github URL that points
        # to our repo
        aws_session_script_url = ("https://gist.github.com/maximsmol/"
                                  "a0284e1d97b25d417bd9ae02e5f450cf")

        cli_logger.verbose_error(*generic_message_args)
        cli_logger.verbose(vars(exc))

        cli_logger.abort(
            "Your AWS session has expired.\n\n"
            "You can request a new one using\n{}\n"
            "then expose it to Ray by setting\n{}\n{}\n{}\n\n"
            "You can find a script that automates this at:\n{}",
            cf.bold(token_command), cf.bold(secret_key_var),
            cf.bold(session_token_var), cf.bold(access_key_id_var),
            cf.underlined(aws_session_script_url))

    # todo: any other errors that we should catch separately?

    cli_logger.error(*generic_message_args)
    cli_logger.newline()
    with cli_logger.verbatim_error_ctx("Boto3 error:"):
        cli_logger.verbose(vars(exc))
        cli_logger.error(exc)
    cli_logger.abort()
Ejemplo n.º 10
0
# Demo script: calls each cli_logger output helper once so the resulting
# formatting can be inspected by eye.

# Raise verbosity so the verbose / very-verbose calls below are shown
# (presumably 999 is above every verbosity threshold - TODO confirm).
cli_logger.verbosity = 999
cli_logger.detect_colors()

# Text styling helpers from `cf`, labeled values and list rendering.
cli_logger.print(
    cf.bold("Bold ") + cf.italic("Italic ") + cf.underlined("Underlined"))
cli_logger.labeled_value("Label", "value")
cli_logger.print("List: {}", cli_logger.render_list([1, 2, 3]))
cli_logger.newline()

# One call per message level.
cli_logger.very_verbose("Very verbose")
cli_logger.verbose("Verbose")
cli_logger.verbose_warning("Verbose warning")
cli_logger.verbose_error("Verbose error")
cli_logger.print("Info")
cli_logger.success("Success")
cli_logger.warning("Warning")
cli_logger.error("Error")
cli_logger.newline()

# Any exception raised by `abort` / `doassert` is deliberately swallowed so
# the demo keeps running after showing their output.
try:
    cli_logger.abort("Abort")
except Exception:
    pass
try:
    cli_logger.doassert(False, "Assert")
except Exception:
    pass
cli_logger.newline()
cli_logger.confirm(True, "example")
cli_logger.newline()

# Context manager for indented output.
with cli_logger.indented():
    cli_logger.print("Indented")
with cli_logger.group("Group"):
Ejemplo n.º 11
0
    def do_update(self):
        """Bring the node up to date and run the Ray start commands.

        Steps, in order:

        1. Tag the node ``STATUS_WAITING_FOR_SSH`` and call ``wait_ready``
           with a deadline of ``NODE_START_WAIT_S`` seconds from now.
        2. If the node's runtime-config tag matches ``self.runtime_hash``
           (and the file-mounts hash tag matches, when a hash is set), skip
           file syncing, initialization and setup.
        3. Otherwise sync file mounts and, if the runtime hash changed, run
           the initialization commands and then the setup commands.
        4. Run every command in ``self.ray_start_commands``.

        Raises:
            click.ClickException: If an initialization, setup or start
                command raises ``ProcessRunnerError``.
        """
        self.provider.set_node_tags(
            self.node_id, {TAG_RAY_NODE_STATUS: STATUS_WAITING_FOR_SSH})
        cli_logger.labeled_value("New status", STATUS_WAITING_FOR_SSH)

        deadline = time.time() + NODE_START_WAIT_S
        self.wait_ready(deadline)

        node_tags = self.provider.node_tags(self.node_id)
        logger.debug("Node tags: {}".format(str(node_tags)))

        # runtime_hash will only change whenever the user restarts
        # or updates their cluster with `get_or_create_head_node`
        if node_tags.get(TAG_RAY_RUNTIME_CONFIG) == self.runtime_hash and (
                self.file_mounts_contents_hash is None
                or node_tags.get(TAG_RAY_FILE_MOUNTS_CONTENTS)
                == self.file_mounts_contents_hash):
            # todo: we lie in the confirmation message since
            # full setup might be cancelled here
            cli_logger.print(
                "Configuration already up to date, "
                "skipping file mounts, initalization and setup commands.")
            cli_logger.old_info(logger,
                                "{}{} already up-to-date, skip to ray start",
                                self.log_prefix, self.node_id)

        else:
            cli_logger.print("Updating cluster configuration.",
                             _tags=dict(hash=self.runtime_hash))

            self.provider.set_node_tags(
                self.node_id, {TAG_RAY_NODE_STATUS: STATUS_SYNCING_FILES})
            cli_logger.labeled_value("New status", STATUS_SYNCING_FILES)
            self.sync_file_mounts(self.rsync_up)

            # Only run setup commands if runtime_hash has changed because
            # we don't want to run setup_commands every time the head node
            # file_mounts folders have changed.
            if node_tags.get(TAG_RAY_RUNTIME_CONFIG) != self.runtime_hash:
                # Run init commands
                self.provider.set_node_tags(
                    self.node_id, {TAG_RAY_NODE_STATUS: STATUS_SETTING_UP})
                cli_logger.labeled_value("New status", STATUS_SETTING_UP)

                if self.initialization_commands:
                    # NOTE(review): the step total is 5 here but 6 in every
                    # other step label of this method - confirm which one
                    # is intended.
                    with cli_logger.group("Running initialization commands",
                                          _numbered=("[]", 3, 5)):
                        with LogTimer(self.log_prefix +
                                      "Initialization commands",
                                      show_status=True):
                            for cmd in self.initialization_commands:
                                try:
                                    # Initialization commands run with SSH
                                    # options built from the configured
                                    # private key.
                                    self.cmd_runner.run(
                                        cmd,
                                        ssh_options_override=SSHOptions(
                                            self.auth_config.get(
                                                "ssh_private_key")))
                                except ProcessRunnerError as e:
                                    if e.msg_type == "ssh_command_failed":
                                        cli_logger.error("Failed.")
                                        cli_logger.error(
                                            "See above for stderr.")

                                    # Any runner error is surfaced to the
                                    # caller as a ClickException.
                                    raise click.ClickException(
                                        "Initialization command failed.")
                else:
                    cli_logger.print("No initialization commands to run.",
                                     _numbered=("[]", 3, 6))

                if self.setup_commands:
                    with cli_logger.group(
                            "Running setup commands",
                            # todo: fix command numbering
                            _numbered=("[]", 4, 6)):
                        with LogTimer(self.log_prefix + "Setup commands",
                                      show_status=True):

                            total = len(self.setup_commands)
                            for i, cmd in enumerate(self.setup_commands):
                                # At verbosity 0 long commands are shortened
                                # to their first 30 characters.
                                if cli_logger.verbosity == 0 and len(cmd) > 30:
                                    cmd_to_print = cf.bold(cmd[:30]) + "..."
                                else:
                                    cmd_to_print = cf.bold(cmd)

                                cli_logger.print("{}",
                                                 cmd_to_print,
                                                 _numbered=("()", i, total))

                                try:
                                    self.cmd_runner.run(cmd)
                                except ProcessRunnerError as e:
                                    if e.msg_type == "ssh_command_failed":
                                        cli_logger.error("Failed.")
                                        cli_logger.error(
                                            "See above for stderr.")

                                    raise click.ClickException(
                                        "Setup command failed.")
                else:
                    cli_logger.print("No setup commands to run.",
                                     _numbered=("[]", 4, 6))

        # The start commands run on both paths above, whether or not the
        # configuration was already up to date.
        with cli_logger.group("Starting the Ray runtime",
                              _numbered=("[]", 6, 6)):
            with LogTimer(self.log_prefix + "Ray start commands",
                          show_status=True):
                for cmd in self.ray_start_commands:
                    try:
                        self.cmd_runner.run(cmd)
                    except ProcessRunnerError as e:
                        if e.msg_type == "ssh_command_failed":
                            cli_logger.error("Failed.")
                            cli_logger.error("See above for stderr.")

                        raise click.ClickException("Start command failed.")