Exemple #1
0
        def do_sync(remote_path, local_path, allow_non_existing_paths=False):
            if allow_non_existing_paths and not os.path.exists(local_path):
                cli_logger.print("sync: {} does not exist. Skipping.",
                                 local_path)
                # Ignore missing source files. In the future we should support
                # the --delete-missing-args command to delete files that have
                # been removed
                return

            assert os.path.exists(local_path), local_path

            if os.path.isdir(local_path):
                if not local_path.endswith("/"):
                    local_path += "/"
                if not remote_path.endswith("/"):
                    remote_path += "/"

            with LogTimer(self.log_prefix +
                          "Synced {} to {}".format(local_path, remote_path)):
                is_docker = (self.docker_config
                             and self.docker_config["container_name"] != "")
                if not is_docker:
                    # The DockerCommandRunner handles this internally.
                    self.cmd_runner.run(
                        "mkdir -p {}".format(os.path.dirname(remote_path)),
                        run_env="host")
                sync_cmd(
                    local_path, remote_path, docker_mount_if_possible=True)

                if remote_path not in nolog_paths:
                    # todo: timed here?
                    cli_logger.print("{} from {}", cf.bold(remote_path),
                                     cf.bold(local_path))
Exemple #2
0
    def wait_ready(self, deadline):
        with cli_logger.group(
            "Waiting for SSH to become available", _numbered=("[]", 1, NUM_SETUP_STEPS)
        ):
            with LogTimer(self.log_prefix + "Got remote shell"):

                cli_logger.print("Running `{}` as a test.", cf.bold("uptime"))
                first_conn_refused_time = None
                while True:
                    if time.time() > deadline:
                        raise Exception("wait_ready timeout exceeded.")
                    if self.provider.is_terminated(self.node_id):
                        raise Exception(
                            "wait_ready aborting because node "
                            "detected as terminated."
                        )

                    try:
                        # Run outside of the container
                        self.cmd_runner.run("uptime", timeout=5, run_env="host")
                        cli_logger.success("Success.")
                        return True
                    except ProcessRunnerError as e:
                        first_conn_refused_time = cmd_output_util.handle_ssh_fails(
                            e,
                            first_conn_refused_time,
                            retry_interval=READY_CHECK_INTERVAL,
                        )
                        time.sleep(READY_CHECK_INTERVAL)
                    except Exception as e:
                        # TODO(maximsmol): we should not be ignoring
                        # exceptions if they get filtered properly
                        # (new style log + non-interactive shells)
                        #
                        # however threading this configuration state
                        # is a pain and I'm leaving it for later

                        retry_str = "(" + str(e) + ")"
                        if hasattr(e, "cmd"):
                            if isinstance(e.cmd, str):
                                cmd_ = e.cmd
                            elif isinstance(e.cmd, list):
                                cmd_ = " ".join(e.cmd)
                            else:
                                logger.debug(
                                    f"e.cmd type ({type(e.cmd)}) not list or str."
                                )
                                cmd_ = str(e.cmd)
                            retry_str = "(Exit Status {}): {}".format(
                                e.returncode, cmd_
                            )

                        cli_logger.print(
                            "SSH still not available {}, retrying in {} seconds.",
                            cf.dimmed(retry_str),
                            cf.bold(str(READY_CHECK_INTERVAL)),
                        )

                        time.sleep(READY_CHECK_INTERVAL)
Exemple #3
0
    def run(self):
        update_start_time = time.time()
        if (
            cmd_output_util.does_allow_interactive()
            and cmd_output_util.is_output_redirected()
        ):
            # this is most probably a bug since the user has no control
            # over these settings
            msg = (
                "Output was redirected for an interactive command. "
                "Either do not pass `--redirect-command-output` "
                "or also pass in `--use-normal-shells`."
            )
            cli_logger.abort(msg)

        try:
            with LogTimer(
                self.log_prefix + "Applied config {}".format(self.runtime_hash)
            ):
                self.do_update()
        except Exception as e:
            self.provider.set_node_tags(
                self.node_id, {TAG_RAY_NODE_STATUS: STATUS_UPDATE_FAILED}
            )
            cli_logger.error("New status: {}", cf.bold(STATUS_UPDATE_FAILED))

            cli_logger.error("!!!")
            if hasattr(e, "cmd"):
                cli_logger.error(
                    "Setup command `{}` failed with exit code {}. stderr:",
                    cf.bold(e.cmd),
                    e.returncode,
                )
            else:
                cli_logger.verbose_error("{}", str(vars(e)))
                # todo: handle this better somehow?
                cli_logger.error("{}", str(e))
            # todo: print stderr here
            cli_logger.error("!!!")
            cli_logger.newline()

            if isinstance(e, click.ClickException):
                # todo: why do we ignore this here
                return
            raise

        tags_to_set = {
            TAG_RAY_NODE_STATUS: STATUS_UP_TO_DATE,
            TAG_RAY_RUNTIME_CONFIG: self.runtime_hash,
        }
        if self.file_mounts_contents_hash is not None:
            tags_to_set[TAG_RAY_FILE_MOUNTS_CONTENTS] = self.file_mounts_contents_hash

        self.provider.set_node_tags(self.node_id, tags_to_set)
        cli_logger.labeled_value("New status", STATUS_UP_TO_DATE)

        self.update_time = time.time() - update_start_time
        self.exitcode = 0
Exemple #4
0
    def _create_tags(self, batch_updates):

        for (k, v), node_ids in batch_updates.items():
            m = "Set tag {}={} on {}".format(k, v, node_ids)
            with LogTimer("AliyunNodeProvider: {}".format(m)):
                if k == TAG_RAY_NODE_NAME:
                    k = "Name"

                self.acs.tag_resource(node_ids, [{"Key": k, "Value": v}])
Exemple #5
0
    def wait_ready(self, deadline):
        with cli_logger.group("Waiting for SSH to become available",
                              _numbered=("[]", 1, 6)):
            with LogTimer(self.log_prefix + "Got remote shell"):
                cli_logger.old_info(logger, "{}Waiting for remote shell...",
                                    self.log_prefix)

                cli_logger.print("Running `{}` as a test.", cf.bold("uptime"))
                first_conn_refused_time = None
                while time.time() < deadline and \
                        not self.provider.is_terminated(self.node_id):
                    try:
                        cli_logger.old_debug(logger,
                                             "{}Waiting for remote shell...",
                                             self.log_prefix)

                        # Run outside of the container
                        self.cmd_runner.run("uptime",
                                            timeout=5,
                                            run_env="host")
                        cli_logger.old_debug(logger, "Uptime succeeded.")
                        cli_logger.success("Success.")
                        return True
                    except ProcessRunnerError as e:
                        first_conn_refused_time = \
                            cmd_output_util.handle_ssh_fails(
                                e, first_conn_refused_time,
                                retry_interval=READY_CHECK_INTERVAL)
                        time.sleep(READY_CHECK_INTERVAL)
                    except Exception as e:
                        # TODO(maximsmol): we should not be ignoring
                        # exceptions if they get filtered properly
                        # (new style log + non-interactive shells)
                        #
                        # however threading this configuration state
                        # is a pain and I'm leaving it for later

                        retry_str = str(e)
                        if hasattr(e, "cmd"):
                            retry_str = "(Exit Status {}): {}".format(
                                e.returncode, " ".join(e.cmd))

                        cli_logger.print(
                            "SSH still not available {}, "
                            "retrying in {} seconds.", cf.dimmed(retry_str),
                            cf.bold(str(READY_CHECK_INTERVAL)))
                        cli_logger.old_debug(logger,
                                             "{}Node not up, retrying: {}",
                                             self.log_prefix, retry_str)

                        time.sleep(READY_CHECK_INTERVAL)

        assert False, "Unable to connect to node"
Exemple #6
0
 def _create_tags(self, batch_updates):
     for (k, v), node_ids in batch_updates.items():
         m = "Set tag {}={} on {}".format(k, v, node_ids)
         with LogTimer("AWSNodeProvider: {}".format(m)):
             if k == TAG_RAY_NODE_NAME:
                 k = "Name"
             self.ec2.meta.client.create_tags(
                 Resources=node_ids,
                 Tags=[{
                     "Key": k,
                     "Value": v
                 }],
             )
Exemple #7
0
    def _set_ssh_ip_if_required(self):
        if self.ssh_ip is not None:
            return

        # We assume that this never changes.
        #   I think that's reasonable.
        deadline = time.time() + AUTOSCALER_NODE_START_WAIT_S
        with LogTimer(self.log_prefix + "Got IP"):
            ip = self._wait_for_ip(deadline)

            cli_logger.doassert(ip is not None, "Could not get node IP.")  # todo: msg
            assert ip is not None, "Unable to find IP of node"

        self.ssh_ip = ip

        # This should run before any SSH commands and therefore ensure that
        #   the ControlPath directory exists, allowing SSH to maintain
        #   persistent sessions later on.
        try:
            os.makedirs(self.ssh_control_path, mode=0o700, exist_ok=True)
        except OSError as e:
            cli_logger.warning("{}", str(e))  # todo: msg
Exemple #8
0
    def _node_tag_update_loop(self):
        """Update the AWS tags for a cluster periodically.

        The purpose of this loop is to avoid excessive EC2 calls when a large
        number of nodes are being launched simultaneously.
        """
        while True:
            self.tag_cache_update_event.wait()
            self.tag_cache_update_event.clear()

            batch_updates = defaultdict(list)

            with self.tag_cache_lock:
                for node_id, tags in self.tag_cache_pending.items():
                    for x in tags.items():
                        batch_updates[x].append(node_id)
                    self.tag_cache[node_id].update(tags)

                self.tag_cache_pending = {}

            for (k, v), node_ids in batch_updates.items():
                m = "Set tag {}={} on {}".format(k, v, node_ids)
                with LogTimer("AWSNodeProvider: {}".format(m)):
                    if k == TAG_RAY_NODE_NAME:
                        k = "Name"
                    self.ec2.meta.client.create_tags(
                        Resources=node_ids,
                        Tags=[{
                            "Key": k,
                            "Value": v
                        }],
                    )

            self.tag_cache_kill_event.wait(timeout=5)
            if self.tag_cache_kill_event.is_set():
                return
Exemple #9
0
def teardown_cluster(config_file: str, yes: bool, workers_only: bool,
                     override_cluster_name: Optional[str],
                     keep_min_workers: bool):
    """Destroys all nodes of a Ray cluster described by a config json."""
    config = yaml.safe_load(open(config_file).read())
    if override_cluster_name is not None:
        config["cluster_name"] = override_cluster_name
    config = prepare_config(config)
    validate_config(config)

    cli_logger.confirm(yes, "Destroying cluster.", _abort=True)
    cli_logger.old_confirm("This will destroy your cluster", yes)

    if not workers_only:
        try:
            exec_cluster(config_file,
                         cmd="ray stop",
                         run_env="auto",
                         screen=False,
                         tmux=False,
                         stop=False,
                         start=False,
                         override_cluster_name=override_cluster_name,
                         port_forward=None,
                         with_output=False)
        except Exception as e:
            # todo: add better exception info
            cli_logger.verbose_error("{}", str(e))
            cli_logger.warning(
                "Exception occured when stopping the cluster Ray runtime "
                "(use -v to dump teardown exceptions).")
            cli_logger.warning(
                "Ignoring the exception and "
                "attempting to shut down the cluster nodes anyway.")

            cli_logger.old_exception(
                logger, "Ignoring error attempting a clean shutdown.")

    provider = _get_node_provider(config["provider"], config["cluster_name"])
    try:

        def remaining_nodes():
            workers = provider.non_terminated_nodes(
                {TAG_RAY_NODE_KIND: NODE_KIND_WORKER})

            if keep_min_workers:
                min_workers = config.get("min_workers", 0)

                cli_logger.print(
                    "{} random worker nodes will not be shut down. " +
                    cf.dimmed("(due to {})"), cf.bold(min_workers),
                    cf.bold("--keep-min-workers"))
                cli_logger.old_info(logger,
                                    "teardown_cluster: Keeping {} nodes...",
                                    min_workers)

                workers = random.sample(workers, len(workers) - min_workers)

            # todo: it's weird to kill the head node but not all workers
            if workers_only:
                cli_logger.print(
                    "The head node will not be shut down. " +
                    cf.dimmed("(due to {})"), cf.bold("--workers-only"))

                return workers

            head = provider.non_terminated_nodes(
                {TAG_RAY_NODE_KIND: NODE_KIND_HEAD})

            return head + workers

        def run_docker_stop(node, container_name):
            try:
                updater = NodeUpdaterThread(
                    node_id=node,
                    provider_config=config["provider"],
                    provider=provider,
                    auth_config=config["auth"],
                    cluster_name=config["cluster_name"],
                    file_mounts=config["file_mounts"],
                    initialization_commands=[],
                    setup_commands=[],
                    ray_start_commands=[],
                    runtime_hash="",
                    file_mounts_contents_hash="",
                    is_head_node=False,
                    docker_config=config.get("docker"))
                _exec(updater,
                      f"docker stop {container_name}",
                      False,
                      False,
                      run_env="host")
            except Exception:
                cli_logger.warning(f"Docker stop failed on {node}")
                cli_logger.old_warning(logger, f"Docker stop failed on {node}")

        # Loop here to check that both the head and worker nodes are actually
        #   really gone
        A = remaining_nodes()

        container_name = config.get("docker", {}).get("container_name")
        if container_name:
            for node in A:
                run_docker_stop(node, container_name)

        with LogTimer("teardown_cluster: done."):
            while A:
                cli_logger.old_info(
                    logger, "teardown_cluster: "
                    "Shutting down {} nodes...", len(A))

                provider.terminate_nodes(A)

                cli_logger.print("Requested {} nodes to shut down.",
                                 cf.bold(len(A)),
                                 _tags=dict(interval="1s"))

                time.sleep(
                    POLL_INTERVAL)  # todo: interval should be a variable
                A = remaining_nodes()
                cli_logger.print("{} nodes remaining after {} second(s).",
                                 cf.bold(len(A)), POLL_INTERVAL)
            cli_logger.success("No nodes remaining.")
    finally:
        provider.cleanup()
Exemple #10
0
    def do_update(self):
        self.provider.set_node_tags(
            self.node_id, {TAG_RAY_NODE_STATUS: STATUS_WAITING_FOR_SSH})
        cli_logger.labeled_value("New status", STATUS_WAITING_FOR_SSH)

        deadline = time.time() + NODE_START_WAIT_S
        self.wait_ready(deadline)

        node_tags = self.provider.node_tags(self.node_id)
        logger.debug("Node tags: {}".format(str(node_tags)))

        if node_tags.get(TAG_RAY_RUNTIME_CONFIG) == self.runtime_hash:
            # When resuming from a stopped instance the runtime_hash may be the
            # same, but the container will not be started.
            self.cmd_runner.run_init(
                as_head=self.is_head_node, file_mounts=self.file_mounts)

        # runtime_hash will only change whenever the user restarts
        # or updates their cluster with `get_or_create_head_node`
        if node_tags.get(TAG_RAY_RUNTIME_CONFIG) == self.runtime_hash and (
                not self.file_mounts_contents_hash
                or node_tags.get(TAG_RAY_FILE_MOUNTS_CONTENTS) ==
                self.file_mounts_contents_hash):
            # todo: we lie in the confirmation message since
            # full setup might be cancelled here
            cli_logger.print(
                "Configuration already up to date, "
                "skipping file mounts, initalization and setup commands.",
                _numbered=("[]", "2-5", 6))
            cli_logger.old_info(logger,
                                "{}{} already up-to-date, skip to ray start",
                                self.log_prefix, self.node_id)

        else:
            cli_logger.print(
                "Updating cluster configuration.",
                _tags=dict(hash=self.runtime_hash))

            self.provider.set_node_tags(
                self.node_id, {TAG_RAY_NODE_STATUS: STATUS_SYNCING_FILES})
            cli_logger.labeled_value("New status", STATUS_SYNCING_FILES)
            self.sync_file_mounts(self.rsync_up, step_numbers=(2, 6))

            # Only run setup commands if runtime_hash has changed because
            # we don't want to run setup_commands every time the head node
            # file_mounts folders have changed.
            if node_tags.get(TAG_RAY_RUNTIME_CONFIG) != self.runtime_hash:
                # Run init commands
                self.provider.set_node_tags(
                    self.node_id, {TAG_RAY_NODE_STATUS: STATUS_SETTING_UP})
                cli_logger.labeled_value("New status", STATUS_SETTING_UP)

                if self.initialization_commands:
                    with cli_logger.group(
                            "Running initialization commands",
                            _numbered=("[]", 3, 5)):
                        with LogTimer(
                                self.log_prefix + "Initialization commands",
                                show_status=True):
                            for cmd in self.initialization_commands:
                                try:
                                    # Overriding the existing SSHOptions class
                                    # with a new SSHOptions class that uses
                                    # this ssh_private_key as its only __init__
                                    # argument.
                                    # Run outside docker.
                                    self.cmd_runner.run(
                                        cmd,
                                        ssh_options_override_ssh_key=self.
                                        auth_config.get("ssh_private_key"),
                                        run_env="host")
                                except ProcessRunnerError as e:
                                    if e.msg_type == "ssh_command_failed":
                                        cli_logger.error("Failed.")
                                        cli_logger.error(
                                            "See above for stderr.")

                                    raise click.ClickException(
                                        "Initialization command failed."
                                    ) from None
                else:
                    cli_logger.print(
                        "No initialization commands to run.",
                        _numbered=("[]", 3, 6))
                self.cmd_runner.run_init(
                    as_head=self.is_head_node, file_mounts=self.file_mounts)
                if self.setup_commands:
                    with cli_logger.group(
                            "Running setup commands",
                            # todo: fix command numbering
                            _numbered=("[]", 4, 6)):
                        with LogTimer(
                                self.log_prefix + "Setup commands",
                                show_status=True):

                            total = len(self.setup_commands)
                            for i, cmd in enumerate(self.setup_commands):
                                if cli_logger.verbosity == 0 and len(cmd) > 30:
                                    cmd_to_print = cf.bold(cmd[:30]) + "..."
                                else:
                                    cmd_to_print = cf.bold(cmd)

                                cli_logger.print(
                                    "{}",
                                    cmd_to_print,
                                    _numbered=("()", i, total))

                                try:
                                    # Runs in the container if docker is in use
                                    self.cmd_runner.run(cmd, run_env="auto")
                                except ProcessRunnerError as e:
                                    if e.msg_type == "ssh_command_failed":
                                        cli_logger.error("Failed.")
                                        cli_logger.error(
                                            "See above for stderr.")

                                    raise click.ClickException(
                                        "Setup command failed.")
                else:
                    cli_logger.print(
                        "No setup commands to run.", _numbered=("[]", 4, 6))

        with cli_logger.group(
                "Starting the Ray runtime", _numbered=("[]", 6, 6)):
            with LogTimer(
                    self.log_prefix + "Ray start commands", show_status=True):
                for cmd in self.ray_start_commands:
                    if self.node_resources:
                        env_vars = {
                            RESOURCES_ENVIRONMENT_VARIABLE: self.node_resources
                        }
                    else:
                        env_vars = {}
                    try:
                        old_redirected = cmd_output_util.is_output_redirected()
                        cmd_output_util.set_output_redirected(False)
                        # Runs in the container if docker is in use
                        self.cmd_runner.run(
                            cmd,
                            environment_variables=env_vars,
                            run_env="auto")
                        cmd_output_util.set_output_redirected(old_redirected)
                    except ProcessRunnerError as e:
                        if e.msg_type == "ssh_command_failed":
                            cli_logger.error("Failed.")
                            cli_logger.error("See above for stderr.")

                        raise click.ClickException("Start command failed.")
Exemple #11
0
    def do_update(self):
        self.provider.set_node_tags(
            self.node_id, {TAG_RAY_NODE_STATUS: STATUS_WAITING_FOR_SSH})
        cli_logger.labeled_value("New status", STATUS_WAITING_FOR_SSH)

        deadline = time.time() + AUTOSCALER_NODE_START_WAIT_S
        self.wait_ready(deadline)
        global_event_system.execute_callback(
            CreateClusterEvent.ssh_control_acquired)

        node_tags = self.provider.node_tags(self.node_id)
        logger.debug("Node tags: {}".format(str(node_tags)))

        if self.provider_type == "aws" and self.provider.provider_config:
            from ray.autoscaler._private.aws.cloudwatch.cloudwatch_helper \
                import CloudwatchHelper
            CloudwatchHelper(self.provider.provider_config,
                             [self.node_id], self.provider.cluster_name). \
                update_from_config(self.is_head_node)

        if node_tags.get(TAG_RAY_RUNTIME_CONFIG) == self.runtime_hash:
            # When resuming from a stopped instance the runtime_hash may be the
            # same, but the container will not be started.
            init_required = self.cmd_runner.run_init(
                as_head=self.is_head_node,
                file_mounts=self.file_mounts,
                sync_run_yet=False)
            if init_required:
                node_tags[TAG_RAY_RUNTIME_CONFIG] += "-invalidate"
                # This ensures that `setup_commands` are not removed
                self.restart_only = False

        if self.restart_only:
            self.setup_commands = []

        # runtime_hash will only change whenever the user restarts
        # or updates their cluster with `get_or_create_head_node`
        if node_tags.get(TAG_RAY_RUNTIME_CONFIG) == self.runtime_hash and (
                not self.file_mounts_contents_hash
                or node_tags.get(TAG_RAY_FILE_MOUNTS_CONTENTS) ==
                self.file_mounts_contents_hash):
            # todo: we lie in the confirmation message since
            # full setup might be cancelled here
            cli_logger.print(
                "Configuration already up to date, "
                "skipping file mounts, initalization and setup commands.",
                _numbered=("[]", "2-6", NUM_SETUP_STEPS))

        else:
            cli_logger.print(
                "Updating cluster configuration.",
                _tags=dict(hash=self.runtime_hash))

            self.provider.set_node_tags(
                self.node_id, {TAG_RAY_NODE_STATUS: STATUS_SYNCING_FILES})
            cli_logger.labeled_value("New status", STATUS_SYNCING_FILES)
            self.sync_file_mounts(
                self.rsync_up, step_numbers=(1, NUM_SETUP_STEPS))

            # Only run setup commands if runtime_hash has changed because
            # we don't want to run setup_commands every time the head node
            # file_mounts folders have changed.
            if node_tags.get(TAG_RAY_RUNTIME_CONFIG) != self.runtime_hash:
                # Run init commands
                self.provider.set_node_tags(
                    self.node_id, {TAG_RAY_NODE_STATUS: STATUS_SETTING_UP})
                cli_logger.labeled_value("New status", STATUS_SETTING_UP)

                if self.initialization_commands:
                    with cli_logger.group(
                            "Running initialization commands",
                            _numbered=("[]", 4, NUM_SETUP_STEPS)):
                        global_event_system.execute_callback(
                            CreateClusterEvent.run_initialization_cmd)
                        with LogTimer(
                                self.log_prefix + "Initialization commands",
                                show_status=True):
                            for cmd in self.initialization_commands:
                                global_event_system.execute_callback(
                                    CreateClusterEvent.run_initialization_cmd,
                                    {"command": cmd})
                                try:
                                    # Overriding the existing SSHOptions class
                                    # with a new SSHOptions class that uses
                                    # this ssh_private_key as its only __init__
                                    # argument.
                                    # Run outside docker.
                                    self.cmd_runner.run(
                                        cmd,
                                        ssh_options_override_ssh_key=self.
                                        auth_config.get("ssh_private_key"),
                                        run_env="host")
                                except ProcessRunnerError as e:
                                    if e.msg_type == "ssh_command_failed":
                                        cli_logger.error("Failed.")
                                        cli_logger.error(
                                            "See above for stderr.")

                                    raise click.ClickException(
                                        "Initialization command failed."
                                    ) from None
                else:
                    cli_logger.print(
                        "No initialization commands to run.",
                        _numbered=("[]", 4, NUM_SETUP_STEPS))
                with cli_logger.group(
                        "Initalizing command runner",
                        # todo: fix command numbering
                        _numbered=("[]", 5, NUM_SETUP_STEPS)):
                    self.cmd_runner.run_init(
                        as_head=self.is_head_node,
                        file_mounts=self.file_mounts,
                        sync_run_yet=True)
                if self.setup_commands:
                    with cli_logger.group(
                            "Running setup commands",
                            # todo: fix command numbering
                            _numbered=("[]", 6, NUM_SETUP_STEPS)):
                        global_event_system.execute_callback(
                            CreateClusterEvent.run_setup_cmd)
                        with LogTimer(
                                self.log_prefix + "Setup commands",
                                show_status=True):

                            total = len(self.setup_commands)
                            for i, cmd in enumerate(self.setup_commands):
                                global_event_system.execute_callback(
                                    CreateClusterEvent.run_setup_cmd,
                                    {"command": cmd})
                                if cli_logger.verbosity == 0 and len(cmd) > 30:
                                    cmd_to_print = cf.bold(cmd[:30]) + "..."
                                else:
                                    cmd_to_print = cf.bold(cmd)

                                cli_logger.print(
                                    "{}",
                                    cmd_to_print,
                                    _numbered=("()", i, total))

                                try:
                                    # Runs in the container if docker is in use
                                    self.cmd_runner.run(cmd, run_env="auto")
                                except ProcessRunnerError as e:
                                    if e.msg_type == "ssh_command_failed":
                                        cli_logger.error("Failed.")
                                        cli_logger.error(
                                            "See above for stderr.")

                                    raise click.ClickException(
                                        "Setup command failed.")
                else:
                    cli_logger.print(
                        "No setup commands to run.",
                        _numbered=("[]", 6, NUM_SETUP_STEPS))

        with cli_logger.group(
                "Starting the Ray runtime", _numbered=("[]", 7,
                                                       NUM_SETUP_STEPS)):
            global_event_system.execute_callback(
                CreateClusterEvent.start_ray_runtime)
            with LogTimer(
                    self.log_prefix + "Ray start commands", show_status=True):
                for cmd in self.ray_start_commands:

                    # Add a resource override env variable if needed:
                    if self.provider_type == "local":
                        # Local NodeProvider doesn't need resource override.
                        env_vars = {}
                    elif self.node_resources:
                        env_vars = {
                            RESOURCES_ENVIRONMENT_VARIABLE: self.node_resources
                        }
                    else:
                        env_vars = {}

                    try:
                        old_redirected = cmd_output_util.is_output_redirected()
                        cmd_output_util.set_output_redirected(False)
                        # Runs in the container if docker is in use
                        self.cmd_runner.run(
                            cmd,
                            environment_variables=env_vars,
                            run_env="auto")
                        cmd_output_util.set_output_redirected(old_redirected)
                    except ProcessRunnerError as e:
                        if e.msg_type == "ssh_command_failed":
                            cli_logger.error("Failed.")
                            cli_logger.error("See above for stderr.")

                        raise click.ClickException("Start command failed.")
            global_event_system.execute_callback(
                CreateClusterEvent.start_ray_runtime_completed)
Exemple #12
0
            # Only run setup commands if runtime_hash has changed because
            # we don't want to run setup_commands every time the head node
            # file_mounts folders have changed.
            if node_tags.get(TAG_RAY_RUNTIME_CONFIG) != self.runtime_hash:
                # Run init commands
                self.provider.set_node_tags(
                    self.node_id, {TAG_RAY_NODE_STATUS: STATUS_SETTING_UP})
                cli_logger.labeled_value("New status", STATUS_SETTING_UP)

                if self.initialization_commands:
                    with cli_logger.group(
                            "Running initialization commands",
                            _numbered=("[]", 3, 5)):
                        with LogTimer(
                                self.log_prefix + "Initialization commands",
                                show_status=True):
                            for cmd in self.initialization_commands:
                                try:
                                    # Overriding the existing SSHOptions class
                                    # with a new SSHOptions class that uses
                                    # this ssh_private_key as its only __init__
                                    # argument.
                                    # Run outside docker.
                                    self.cmd_runner.run(
                                        cmd,
                                        ssh_options_override_ssh_key=self.
                                        auth_config.get("ssh_private_key"),
                                        run_env="host")
                                except ProcessRunnerError as e:
                                    if e.msg_type == "ssh_command_failed":