Example #1
def _check_if_container_restart_is_needed(
        self, image: str, cleaned_bind_mounts: Dict[str, str]) -> bool:
    re_init_required = False
    running_image = (self.run(
        check_docker_image(self.container_name, self.docker_cmd),
        with_output=True,
        run_env="host",
    ).decode("utf-8").strip())
    if running_image != image:
        cli_logger.error(
            "A container with name {} is running image {} instead " +
            "of {} (which was provided in the YAML)",
            self.container_name,
            running_image,
            image,
        )
    mounts = (self.run(
        check_bind_mounts_cmd(self.container_name, self.docker_cmd),
        with_output=True,
        run_env="host",
    ).decode("utf-8").strip())
    try:
        active_mounts = json.loads(mounts)
        active_remote_mounts = {
            mnt["Destination"].strip("/")
            for mnt in active_mounts
        }
        # Ignore ray bootstrap files.
        requested_remote_mounts = {
            self._docker_expand_user(remote).strip("/")
            for remote in cleaned_bind_mounts.keys()
        }
        unfulfilled_mounts = requested_remote_mounts - active_remote_mounts
        if unfulfilled_mounts:
            re_init_required = True
            cli_logger.warning(
                "This Docker Container is already running. "
                "Restarting the Docker container on "
                "this node to pick up the following file_mounts {}",
                unfulfilled_mounts,
            )
    except json.JSONDecodeError:
        cli_logger.verbose(
            "Unable to check if file_mounts specified in the YAML "
            "differ from those on the running container.")
    return re_init_required
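A note on the data shape: `check_bind_mounts_cmd` presumably wraps something
like `docker inspect -f '{{json .Mounts}}' <container>`, which prints a JSON
array of mount objects keyed by "Destination". A standalone sketch of the set
comparison above (the sample JSON is illustrative, not real inspect output):

import json

mounts_json = """[
  {"Type": "bind", "Source": "/tmp/ray_tmp_mount/app", "Destination": "/app"},
  {"Type": "bind", "Source": "/tmp/ray_tmp_mount/data", "Destination": "/data"}
]"""

requested = {"/app", "/data", "/models"}  # e.g. expanded file_mounts targets
active = {m["Destination"].strip("/") for m in json.loads(mounts_json)}
unfulfilled = {r.strip("/") for r in requested} - active
print(unfulfilled)  # {'models'} -> a container restart would be required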
Example #2
def handle_ssh_fails(e, first_conn_refused_time, retry_interval):
    """Handle SSH system failures coming from a subprocess.

    Args:
        e: The `ProcessRunnerException` to handle.
        first_conn_refused_time:
            The time (as reported by this function) or None,
            indicating the last time a CONN_REFUSED error was caught.

            After exceeding a patience value, the program will be aborted
            since SSH will likely never recover.
        retry_interval: The interval after which the command will be retried,
                        used here just to inform the user.
    """
    if e.msg_type != "ssh_command_failed":
        return

    if e.special_case == "ssh_conn_refused":
        if (first_conn_refused_time is not None and
                time.time() - first_conn_refused_time > CONN_REFUSED_PATIENCE):
            cli_logger.error(
                "SSH connection was being refused "
                "for {} seconds. Head node assumed "
                "unreachable.",
                cf.bold(str(CONN_REFUSED_PATIENCE)),
            )
            cli_logger.abort("Check the node's firewall settings "
                             "and the cloud network configuration.")

        cli_logger.warning("SSH connection was refused.")
        cli_logger.warning("This might mean that the SSH daemon is "
                           "still setting up, or that "
                           "the host is inaccessable (e.g. due to "
                           "a firewall).")

        return time.time()

    if e.special_case in ["ssh_timeout", "ssh_conn_refused"]:
        cli_logger.print(
            "SSH still not available, retrying in {} seconds.",
            cf.bold(str(retry_interval)),
        )
    else:
        raise e

    return first_conn_refused_time
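The contract is easy to miss from the signature alone: the caller keeps
whatever timestamp this function returns and passes it back in on the next
failure, so the helper can decide when the patience window is exhausted. A
hypothetical retry loop (`run_ssh_command` and the 5-second interval are
placeholders, not Ray APIs):

import time

first_conn_refused_time = None
retry_interval = 5  # seconds; placeholder value

while True:
    try:
        run_ssh_command()  # placeholder for the actual subprocess call
        break
    except ProcessRunnerException as e:
        # Feed the stored timestamp back in; handle_ssh_fails aborts the
        # program once CONN_REFUSED_PATIENCE is exceeded.
        first_conn_refused_time = handle_ssh_fails(
            e, first_conn_refused_time, retry_interval)
        time.sleep(retry_interval)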
Example #3
def run_docker_stop(node, container_name):
    try:
        updater = NodeUpdaterThread(node_id=node,
                                    provider_config=config["provider"],
                                    provider=provider,
                                    auth_config=config["auth"],
                                    cluster_name=config["cluster_name"],
                                    file_mounts=config["file_mounts"],
                                    initialization_commands=[],
                                    setup_commands=[],
                                    ray_start_commands=[],
                                    runtime_hash="",
                                    file_mounts_contents_hash="",
                                    is_head_node=False,
                                    docker_config=config.get("docker"))
        _exec(updater, cmd=f"docker stop {container_name}", run_env="host")
    except Exception:
        cli_logger.warning(f"Docker stop failed on {node}")
Example #4
def warn_about_bad_start_command(start_commands: List[str],
                                 no_monitor_on_head: bool = False) -> None:
    ray_start_cmd = list(filter(lambda x: "ray start" in x, start_commands))
    if len(ray_start_cmd) == 0:
        cli_logger.warning(
            "Ray runtime will not be started because `{}` is not in `{}`.",
            cf.bold("ray start"), cf.bold("head_start_ray_commands"))

    autoscaling_config_in_ray_start_cmd = any(
        "autoscaling-config" in x for x in ray_start_cmd)
    if not (autoscaling_config_in_ray_start_cmd or no_monitor_on_head):
        cli_logger.warning(
            "The head node will not launch any workers because "
            "`{}` does not have `{}` set.\n"
            "Potential fix: add `{}` to the `{}` command under `{}`.",
            cf.bold("ray start"), cf.bold("--autoscaling-config"),
            cf.bold("--autoscaling-config=~/ray_bootstrap_config.yaml"),
            cf.bold("ray start"), cf.bold("head_start_ray_commands"))
Example #5
def prepare_manual(config: Dict[str, Any]) -> Dict[str, Any]:
    """Validates and sets defaults for configs of manually managed on-prem
    clusters.

    - Checks for presence of required `worker_ips` and `head_ips` fields.
    - Defaults min and max workers to the number of `worker_ips`.
    - Caps min and max workers at the number of `worker_ips`.
    - Writes min and max worker info into the single worker node type.
    """
    config = copy.deepcopy(config)
    if ("worker_ips" not in config["provider"]) or ("head_ip"
                                                    not in config["provider"]):
        cli_logger.abort("Please supply a `head_ip` and list of `worker_ips`. "
                         "Alternatively, supply a `coordinator_address`.")
    num_ips = len(config["provider"]["worker_ips"])
    node_type = config["available_node_types"][LOCAL_CLUSTER_NODE_TYPE]
    # Default to keeping all provided ips in the cluster.
    config.setdefault("max_workers", num_ips)

    # The autoscaler no longer uses global `min_workers`.
    # We will move `min_workers` to the node_type config.
    min_workers = config.pop("min_workers", num_ips)
    max_workers = config["max_workers"]

    if min_workers > num_ips:
        cli_logger.warning(
            f"The value of `min_workers` supplied ({min_workers}) is greater"
            f" than the number of available worker ips ({num_ips})."
            f" Setting `min_workers={num_ips}`.")
        node_type["min_workers"] = num_ips
    else:
        node_type["min_workers"] = min_workers

    if max_workers > num_ips:
        cli_logger.warning(
            f"The value of `max_workers` supplied ({max_workers}) is greater"
            f" than the number of available worker ips ({num_ips})."
            f" Setting `max_workers={num_ips}`.")
        node_type["max_workers"] = num_ips
        config["max_workers"] = num_ips
    else:
        node_type["max_workers"] = max_workers

    return config
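A minimal on-prem config of the shape this function expects might look like
the following (values are illustrative, and the node type key is assumed to
match LOCAL_CLUSTER_NODE_TYPE):

config = {
    "cluster_name": "onprem-demo",
    "provider": {
        "type": "local",
        "head_ip": "192.168.0.10",
        "worker_ips": ["192.168.0.11", "192.168.0.12"],
    },
    "min_workers": 5,  # exceeds len(worker_ips): capped to 2 with a warning
    "available_node_types": {
        "local.cluster.node": {"node_config": {}, "resources": {}},
    },
    "head_node_type": "local.cluster.node",
}
prepared = prepare_manual(config)
# prepared["max_workers"] == 2, and the single node type now carries
# min_workers == 2 and max_workers == 2.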
Example #6
File: cli.py Project: scv119/ray
def job_logs(address: Optional[str], job_id: str, follow: bool):
    """Gets the logs of a job.

    Example:
        >>> ray job logs <my_job_id>
    """
    client = _get_sdk_client(address)
    sdk_version = client.get_version()
    # sdk version 0 did not have log streaming
    if follow:
        if int(sdk_version) > 0:
            asyncio.get_event_loop().run_until_complete(
                _tail_logs(client, job_id))
        else:
            cli_logger.warning(
                "Tailing logs is not enabled for job sdk client version "
                f"{sdk_version}. Please upgrade your ray to latest version "
                "for this feature.")
    else:
        print(client.get_job_logs(job_id), end="")
Example #7
def rsync_up(cluster_config_file, source, target, cluster_name, all_nodes,
             log_style, log_color, verbose):
    """Upload specific files to a Ray cluster."""
    cli_logger.configure(log_style, log_color, verbose)

    if all_nodes:
        cli_logger.warning(
            "WARNING: the `all_nodes` option is deprecated and will be "
            "removed in the future. "
            "Rsync to worker nodes is not reliable since workers may be "
            "added during autoscaling. Please use the `file_mounts` "
            "feature instead for consistent file sync in autoscaling clusters")

    rsync(
        cluster_config_file,
        source,
        target,
        cluster_name,
        down=False,
        all_nodes=all_nodes)
Example #8
    def add_callback_handler(
        self,
        event: str,
        callback: Union[Callable[[Dict], None], List[Callable[[Dict], None]]],
    ):
        """Stores callback handler for event.

        Args:
            event: Event that callback should be called on. See
                CreateClusterEvent for details on the events available to be
                registered against.
            callback (Callable[[Dict], None]): Callable object that is invoked
                when specified event occurs.
        """
        if event not in CreateClusterEvent.__members__.values():
            cli_logger.warning(f"{event} is not currently tracked, and this"
                               " callback will not be invoked.")

        self.callback_map.setdefault(
            event,
            []).extend([callback] if type(callback) is not list else callback)
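A usage sketch (hypothetical: `tracker` stands for whatever event-system
instance owns this method, and the CreateClusterEvent member name is
illustrative):

def on_cluster_up(event_data: dict) -> None:
    print("cluster is up:", event_data)

# Either a single callable or a list of callables may be registered.
tracker.add_callback_handler(CreateClusterEvent.cluster_booting_completed,
                             on_cluster_up)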
Example #9
def fill_node_type_max_workers(config):
    """Sets default per-node max workers to global max_workers.
    This is equivalent to setting the default per-node max workers to
    infinity, with the only upper constraint coming from the global
    max_workers.
    """
    assert "max_workers" in config, "Global max workers should be set."
    node_types = config["available_node_types"]
    for node_type_name in node_types:
        node_type_data = node_types[node_type_name]

        # Log a warning if head node type's max_workers is absent.
        if (node_type_name == config["head_node_type"]
                and "max_workers" not in node_type_data):
            cli_logger.warning(
                HEAD_TYPE_MAX_WORKERS_WARN_TEMPLATE.format(
                    node_type=node_type_name,
                    max_workers=config["max_workers"],
                    version=ray.__version__))

        # The key part of this function:
        node_type_data.setdefault("max_workers", config["max_workers"])
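A small before/after sketch of the defaulting (the config is illustrative;
with the head type's max_workers set explicitly, no warning is logged):

config = {
    "max_workers": 8,
    "head_node_type": "head",
    "available_node_types": {
        "head": {"max_workers": 0},  # explicit, so no warning; kept at 0
        "gpu_worker": {},            # absent, so defaulted to 8 below
    },
}
fill_node_type_max_workers(config)
assert config["available_node_types"]["gpu_worker"]["max_workers"] == 8
assert config["available_node_types"]["head"]["max_workers"] == 0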
Example #10
def logs(address: Optional[str], job_id: str, follow: bool):
    """Gets the logs of a job.

    Example:
        ray job logs <my_job_id>
    """
    client = _get_sdk_client(address)
    sdk_version = client.get_version()
    # sdk version 0 did not have log streaming
    if follow:
        if int(sdk_version) > 0:
            asyncio.get_event_loop().run_until_complete(
                _tail_logs(client, job_id))
        else:
            cli_logger.warning(
                "Tailing logs is not enabled for job sdk client version "
                f"{sdk_version}. Please upgrade your ray to latest version "
                "for this feature.")
    else:
        # Set no_format to True because the logs may have unescaped "{" and "}"
        # and the CLILogger calls str.format().
        cli_logger.print(client.get_job_logs(job_id), end="", no_format=True)
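The comment about `no_format` is worth unpacking: the CLILogger normally
passes the message through str.format(), so stray braces in job output would
be treated as replacement fields. A two-line illustration of the failure mode
being avoided:

log_line = 'error in {"handler": "main"}'  # braces are common in JSON logs
log_line.format()  # raises KeyError, which is why no_format=True is passed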
Example #11
    def _set_ssh_ip_if_required(self):
        if self.ssh_ip is not None:
            return

        # We assume that this never changes.
        #   I think that's reasonable.
        deadline = time.time() + AUTOSCALER_NODE_START_WAIT_S
        with LogTimer(self.log_prefix + "Got IP"):
            ip = self._wait_for_ip(deadline)

            cli_logger.doassert(ip is not None, "Could not get node IP.")  # todo: msg
            assert ip is not None, "Unable to find IP of node"

        self.ssh_ip = ip

        # This should run before any SSH commands and therefore ensure that
        #   the ControlPath directory exists, allowing SSH to maintain
        #   persistent sessions later on.
        try:
            os.makedirs(self.ssh_control_path, mode=0o700, exist_ok=True)
        except OSError as e:
            cli_logger.warning("{}", str(e))  # todo: msg
Example #12
def _get_running_head_node(config: Dict[str, Any],
                           printable_config_file: str,
                           override_cluster_name: Optional[str],
                           create_if_needed: bool = False,
                           _provider: Optional[NodeProvider] = None) -> str:
    """Get a valid, running head node"""
    provider = _provider or _get_node_provider(config["provider"],
                                               config["cluster_name"])
    head_node_tags = {
        TAG_RAY_NODE_KIND: NODE_KIND_HEAD,
    }
    nodes = provider.non_terminated_nodes(head_node_tags)
    head_node = None
    for node in nodes:
        node_state = provider.node_tags(node).get(TAG_RAY_NODE_STATUS)
        if node_state == STATUS_UP_TO_DATE:
            head_node = node
        else:
            cli_logger.warning(f"Head node ({node}) is in state {node_state}.")

    if head_node is not None:
        return head_node
    elif create_if_needed:
        get_or_create_head_node(
            config,
            printable_config_file=printable_config_file,
            restart_only=False,
            no_restart=False,
            yes=True,
            override_cluster_name=override_cluster_name)
        return _get_running_head_node(
            config,
            printable_config_file,
            override_cluster_name,
            create_if_needed=False)
    else:
        raise RuntimeError("Head node of cluster ({}) not found!".format(
            config["cluster_name"]))
Example #13
def _info_from_params(
    cluster: Optional[str] = None,
    host: Optional[str] = None,
    ssh_user: Optional[str] = None,
    ssh_key: Optional[str] = None,
    docker: Optional[str] = None,
):
    """Parse command line arguments.

    Note: This returns a list of hosts, not a comma separated string!
    """
    if not host and not cluster:
        bootstrap_config = os.path.expanduser("~/ray_bootstrap_config.yaml")
        if os.path.exists(bootstrap_config):
            cluster = bootstrap_config
            cli_logger.warning(f"Detected cluster config file at {cluster}. "
                               f"If this is incorrect, specify with "
                               f"`ray cluster-dump <config>`")
    elif cluster:
        cluster = os.path.expanduser(cluster)

    cluster_name = None

    if cluster:
        h, u, k, d, cluster_name = get_info_from_ray_cluster_config(cluster)

        ssh_user = ssh_user or u
        ssh_key = ssh_key or k
        docker = docker or d
        hosts = host.split(",") if host else h

        if not hosts:
            raise LocalCommandFailed(
                f"Invalid cluster file or cluster has no running nodes: "
                f"{cluster}")
    elif host:
        hosts = host.split(",")
    else:
        raise LocalCommandFailed(
            "You need to either specify a `<cluster_config>` or `--host`.")

    if not ssh_user:
        ssh_user = DEFAULT_SSH_USER
        cli_logger.warning(
            f"Using default SSH user `{ssh_user}`. "
            f"If this is incorrect, specify with `--ssh-user <user>`")

    if not ssh_key:
        for cand_key in DEFAULT_SSH_KEYS:
            cand_key_file = os.path.expanduser(cand_key)
            if os.path.exists(cand_key_file):
                ssh_key = cand_key_file
                cli_logger.warning(
                    f"Auto detected SSH key file: {ssh_key}. "
                    f"If this is incorrect, specify with `--ssh-key <key>`")
                break

    return cluster, hosts, ssh_user, ssh_key, docker, cluster_name
Example #14
def warn_about_bad_start_command(start_commands):
    ray_start_cmd = list(filter(lambda x: "ray start" in x, start_commands))
    if len(ray_start_cmd) == 0:
        cli_logger.warning(
            "Ray runtime will not be started because `{}` is not in `{}`.",
            cf.bold("ray start"), cf.bold("head_start_ray_commands"))
        cli_logger.old_warning(
            logger,
            "Ray start is not included in the head_start_ray_commands section."
        )
    if not any("autoscaling-config" in x for x in ray_start_cmd):
        cli_logger.warning(
            "The head node will not launch any workers because "
            "`{}` does not have `{}` set.\n"
            "Potential fix: add `{}` to the `{}` command under `{}`.",
            cf.bold("ray start"), cf.bold("--autoscaling-config"),
            cf.bold("--autoscaling-config=~/ray_bootstrap_config.yaml"),
            cf.bold("ray start"), cf.bold("head_start_ray_commands"))
        cli_logger.old_warning(
            logger, "Ray start on the head node does not have the flag"
            "--autoscaling-config set. The head node will not launch"
            "workers. Add --autoscaling-config=~/ray_bootstrap_config.yaml"
            "to ray start in the head_start_ray_commands section.")
Example #15
def up(cluster_config_file, min_workers, max_workers, no_restart, restart_only,
       yes, cluster_name, no_config_cache, redirect_command_output,
       use_login_shells, log_style, log_color, verbose):
    """Create or update a Ray cluster."""
    cli_logger.configure(log_style, log_color, verbose)

    if restart_only or no_restart:
        cli_logger.doassert(restart_only != no_restart,
                            "`{}` is incompatible with `{}`.",
                            cf.bold("--restart-only"), cf.bold("--no-restart"))
        assert restart_only != no_restart, "Cannot set both 'restart_only' " \
            "and 'no_restart' at the same time!"

    if urllib.parse.urlparse(cluster_config_file).scheme in ("http", "https"):
        try:
            response = urllib.request.urlopen(cluster_config_file, timeout=5)
            content = response.read()
            file_name = cluster_config_file.split("/")[-1]
            with open(file_name, "wb") as f:
                f.write(content)
            cluster_config_file = file_name
        except urllib.error.HTTPError as e:
            cli_logger.warning("{}", str(e))
            cli_logger.warning(
                "Could not download remote cluster configuration file.")
            cli_logger.old_info(logger, "Error downloading file: ", e)
    create_or_update_cluster(
        config_file=cluster_config_file,
        override_min_workers=min_workers,
        override_max_workers=max_workers,
        no_restart=no_restart,
        restart_only=restart_only,
        yes=yes,
        override_cluster_name=cluster_name,
        no_config_cache=no_config_cache,
        redirect_command_output=redirect_command_output,
        use_login_shells=use_login_shells)
Example #16
def check_legacy_fields(config: Dict[str, Any]) -> None:
    """For use in providers that have completed the migration to
    available_node_types.

    Warns user that head_node and worker_nodes fields are being ignored.
    Throws an error if available_node_types and head_node_type aren't
    specified.
    """
    # log warning if non-empty head_node field
    if "head_node" in config and config["head_node"]:
        cli_logger.warning(
            "The `head_node` field is deprecated and will be ignored. "
            "Use `head_node_type` and `available_node_types` instead.")
    # log warning if non-empty worker_nodes field
    if "worker_nodes" in config and config["worker_nodes"]:
        cli_logger.warning(
            "The `worker_nodes` field is deprecated and will be ignored. "
            "Use `available_node_types` instead.")
    if "available_node_types" not in config:
        cli_logger.error("`available_node_types` not specified in config")
        raise ValueError("`available_node_types` not specified in config")
    if "head_node_type" not in config:
        cli_logger.error("`head_node_type` not specified in config")
        raise ValueError("`head_node_type` not specified in config")
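A config that would trip both deprecation warnings yet still pass the
required-field checks (the shape is illustrative):

config = {
    "head_node": {"InstanceType": "m5.large"},     # deprecated: warns
    "worker_nodes": {"InstanceType": "m5.large"},  # deprecated: warns
    "available_node_types": {"head": {"node_config": {}, "resources": {}}},
    "head_node_type": "head",
}
check_legacy_fields(config)  # logs two warnings, raises nothing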
Example #17
def _bootstrap_config(config: Dict[str, Any],
                      no_config_cache: bool = False) -> Dict[str, Any]:
    config = prepare_config(config)

    hasher = hashlib.sha1()
    hasher.update(json.dumps([config], sort_keys=True).encode("utf-8"))
    cache_key = os.path.join(tempfile.gettempdir(),
                             "ray-config-{}".format(hasher.hexdigest()))

    if os.path.exists(cache_key) and not no_config_cache:
        cli_logger.old_info(logger, "Using cached config at {}", cache_key)

        config_cache = json.loads(open(cache_key).read())
        if config_cache.get("_version", -1) == CONFIG_CACHE_VERSION:
            # todo: is it fine to re-resolve? afaik it should be.
            # we can have migrations otherwise or something
            # but this seems overcomplicated given that resolving is
            # relatively cheap
            try_reload_log_state(config_cache["config"]["provider"],
                                 config_cache.get("provider_log_info"))

            if log_once("_printed_cached_config_warning"):
                cli_logger.verbose_warning(
                    "Loaded cached provider configuration "
                    "from " + cf.bold("{}"), cache_key)
                if cli_logger.verbosity == 0:
                    cli_logger.warning("Loaded cached provider configuration")
                cli_logger.warning(
                    "If you experience issues with "
                    "the cloud provider, try re-running "
                    "the command with {}.", cf.bold("--no-config-cache"))

            return config_cache["config"]
        else:
            cli_logger.warning(
                "Found cached cluster config "
                "but the version " + cf.bold("{}") + " "
                "(expected " + cf.bold("{}") + ") does not match.\n"
                "This is normal if cluster launcher was updated.\n"
                "Config will be re-resolved.",
                config_cache.get("_version", "none"), CONFIG_CACHE_VERSION)
    validate_config(config)

    importer = _NODE_PROVIDERS.get(config["provider"]["type"])
    if not importer:
        raise NotImplementedError("Unsupported provider {}".format(
            config["provider"]))

    provider_cls = importer(config["provider"])
Example #18
def submit(cluster_config_file, screen, tmux, stop, start, cluster_name,
           no_config_cache, port_forward, script, args, script_args, log_style,
           log_color, verbose):
    """Uploads and runs a script on the specified cluster.

    The script is automatically synced to the following location:

        os.path.join("~", os.path.basename(script))

    Example:
        >>> ray submit [CLUSTER.YAML] experiment.py -- --smoke-test
    """
    cli_logger.configure(log_style, log_color, verbose)

    cli_logger.doassert(not (screen and tmux),
                        "`{}` and `{}` are incompatible.", cf.bold("--screen"),
                        cf.bold("--tmux"))
    cli_logger.doassert(
        not (script_args and args),
        "`{0}` and `{1}` are incompatible. Use only `{1}`.\n"
        "Example: `{2}`", cf.bold("--args"), cf.bold("-- <args ...>"),
        cf.bold("ray submit script.py -- --arg=123 --flag"))

    assert not (screen and tmux), "Can specify only one of `screen` or `tmux`."
    assert not (script_args and args), "Use -- --arg1 --arg2 for script args."

    if args:
        cli_logger.warning(
            "`{}` is deprecated and will be removed in the future.",
            cf.bold("--args"))
        cli_logger.warning("Use `{}` instead. Example: `{}`.",
                           cf.bold("-- <args ...>"),
                           cf.bold("ray submit script.py -- --arg=123 --flag"))
        cli_logger.newline()

    if start:
        create_or_update_cluster(
            config_file=cluster_config_file,
            override_min_workers=None,
            override_max_workers=None,
            no_restart=False,
            restart_only=False,
            yes=True,
            override_cluster_name=cluster_name,
            no_config_cache=no_config_cache,
            redirect_command_output=False,
            use_login_shells=True)
    target = os.path.basename(script)
    target = os.path.join("~", target)
    rsync(
        cluster_config_file,
        script,
        target,
        cluster_name,
        no_config_cache=no_config_cache,
        down=False)

    command_parts = ["python", target]
    if script_args:
        command_parts += list(script_args)
    elif args is not None:
        command_parts += [args]

    port_forward = [(port, port) for port in list(port_forward)]
    cmd = " ".join(command_parts)
    exec_cluster(
        cluster_config_file,
        cmd=cmd,
        run_env="docker",
        screen=screen,
        tmux=tmux,
        stop=stop,
        start=False,
        override_cluster_name=cluster_name,
        no_config_cache=no_config_cache,
        port_forward=port_forward)
Example #19
def teardown_cluster(config_file: str, yes: bool, workers_only: bool,
                     override_cluster_name: Optional[str],
                     keep_min_workers: bool):
    """Destroys all nodes of a Ray cluster described by a config json."""
    config = yaml.safe_load(open(config_file).read())
    if override_cluster_name is not None:
        config["cluster_name"] = override_cluster_name
    config = prepare_config(config)
    validate_config(config)

    cli_logger.confirm(yes, "Destroying cluster.", _abort=True)
    cli_logger.old_confirm("This will destroy your cluster", yes)

    if not workers_only:
        try:
            exec_cluster(config_file,
                         cmd="ray stop",
                         run_env="auto",
                         screen=False,
                         tmux=False,
                         stop=False,
                         start=False,
                         override_cluster_name=override_cluster_name,
                         port_forward=None,
                         with_output=False)
        except Exception as e:
            # todo: add better exception info
            cli_logger.verbose_error("{}", str(e))
            cli_logger.warning(
                "Exception occured when stopping the cluster Ray runtime "
                "(use -v to dump teardown exceptions).")
            cli_logger.warning(
                "Ignoring the exception and "
                "attempting to shut down the cluster nodes anyway.")

            cli_logger.old_exception(
                logger, "Ignoring error attempting a clean shutdown.")

    provider = _get_node_provider(config["provider"], config["cluster_name"])
    try:

        def remaining_nodes():
            workers = provider.non_terminated_nodes(
                {TAG_RAY_NODE_KIND: NODE_KIND_WORKER})

            if keep_min_workers:
                min_workers = config.get("min_workers", 0)

                cli_logger.print(
                    "{} random worker nodes will not be shut down. " +
                    cf.dimmed("(due to {})"), cf.bold(min_workers),
                    cf.bold("--keep-min-workers"))
                cli_logger.old_info(logger,
                                    "teardown_cluster: Keeping {} nodes...",
                                    min_workers)

                workers = random.sample(workers, len(workers) - min_workers)

            # todo: it's weird to kill the head node but not all workers
            if workers_only:
                cli_logger.print(
                    "The head node will not be shut down. " +
                    cf.dimmed("(due to {})"), cf.bold("--workers-only"))

                return workers

            head = provider.non_terminated_nodes(
                {TAG_RAY_NODE_KIND: NODE_KIND_HEAD})

            return head + workers

        def run_docker_stop(node, container_name):
            try:
                updater = NodeUpdaterThread(
                    node_id=node,
                    provider_config=config["provider"],
                    provider=provider,
                    auth_config=config["auth"],
                    cluster_name=config["cluster_name"],
                    file_mounts=config["file_mounts"],
                    initialization_commands=[],
                    setup_commands=[],
                    ray_start_commands=[],
                    runtime_hash="",
                    file_mounts_contents_hash="",
                    is_head_node=False,
                    docker_config=config.get("docker"))
                _exec(updater,
                      f"docker stop {container_name}",
                      False,
                      False,
                      run_env="host")
            except Exception:
                cli_logger.warning(f"Docker stop failed on {node}")
                cli_logger.old_warning(logger, f"Docker stop failed on {node}")

        # Loop here to check that both the head and worker nodes are actually
        #   really gone
        A = remaining_nodes()

        container_name = config.get("docker", {}).get("container_name")
        if container_name:
            for node in A:
                run_docker_stop(node, container_name)

        with LogTimer("teardown_cluster: done."):
            while A:
                cli_logger.old_info(
                    logger, "teardown_cluster: "
                    "Shutting down {} nodes...", len(A))

                provider.terminate_nodes(A)

                cli_logger.print("Requested {} nodes to shut down.",
                                 cf.bold(len(A)),
                                 _tags=dict(interval="1s"))

                time.sleep(
                    POLL_INTERVAL)  # todo: interval should be a variable
                A = remaining_nodes()
                cli_logger.print("{} nodes remaining after {} second(s).",
                                 cf.bold(len(A)), POLL_INTERVAL)
            cli_logger.success("No nodes remaining.")
    finally:
        provider.cleanup()
Example #20
def submit(
    address: Optional[str],
    job_id: Optional[str],
    runtime_env: Optional[str],
    runtime_env_json: Optional[str],
    working_dir: Optional[str],
    entrypoint: Tuple[str],
    no_wait: bool,
):
    """Submits a job to be run on the cluster.

    Example:
        ray job submit -- python my_script.py --arg=val
    """

    if ray_constants.RAY_JOB_SUBMIT_HOOK in os.environ:
        # Submit all args as **kwargs per the JOB_SUBMIT_HOOK contract.
        _load_class(os.environ[ray_constants.RAY_JOB_SUBMIT_HOOK])(
            address=address,
            job_id=job_id,
            runtime_env=runtime_env,
            runtime_env_json=runtime_env_json,
            working_dir=working_dir,
            entrypoint=entrypoint,
            no_wait=no_wait,
        )

    client = _get_sdk_client(address, create_cluster_if_needed=True)

    final_runtime_env = parse_runtime_env_args(
        runtime_env=runtime_env,
        runtime_env_json=runtime_env_json,
        working_dir=working_dir,
    )

    job_id = client.submit_job(
        entrypoint=list2cmdline(entrypoint),
        job_id=job_id,
        runtime_env=final_runtime_env,
    )

    _log_big_success_msg(f"Job '{job_id}' submitted successfully")

    with cli_logger.group("Next steps"):
        cli_logger.print("Query the logs of the job:")
        with cli_logger.indented():
            cli_logger.print(cf.bold(f"ray job logs {job_id}"))

        cli_logger.print("Query the status of the job:")
        with cli_logger.indented():
            cli_logger.print(cf.bold(f"ray job status {job_id}"))

        cli_logger.print("Request the job to be stopped:")
        with cli_logger.indented():
            cli_logger.print(cf.bold(f"ray job stop {job_id}"))

    cli_logger.newline()
    sdk_version = client.get_version()
    # sdk version 0 does not have log streaming
    if not no_wait:
        if int(sdk_version) > 0:
            cli_logger.print("Tailing logs until the job exits "
                             "(disable with --no-wait):")
            asyncio.get_event_loop().run_until_complete(
                _tail_logs(client, job_id))
        else:
            cli_logger.warning(
                "Tailing logs is not enabled for job sdk client version "
                f"{sdk_version}. Please upgrade your ray to latest version "
                "for this feature.")
Example #21
cli_logger.old_style = False
cli_logger.verbosity = 999
cli_logger.detect_colors()

cli_logger.print(
    cf.bold("Bold ") + cf.italic("Italic ") + cf.underlined("Underlined"))
cli_logger.labeled_value("Label", "value")
cli_logger.print("List: {}", cli_logger.render_list([1, 2, 3]))
cli_logger.newline()
cli_logger.very_verbose("Very verbose")
cli_logger.verbose("Verbose")
cli_logger.verbose_warning("Verbose warning")
cli_logger.verbose_error("Verbose error")
cli_logger.print("Info")
cli_logger.success("Success")
cli_logger.warning("Warning")
cli_logger.error("Error")
cli_logger.newline()
try:
    cli_logger.abort("Abort")
except Exception:
    pass
try:
    cli_logger.doassert(False, "Assert")
except Exception:
    pass
cli_logger.newline()
cli_logger.confirm(True, "example")
cli_logger.newline()
with cli_logger.indented():
    cli_logger.print("Indented")
Example #22
def stop(force, verbose, log_style, log_color):
    """Stop Ray processes manually on the local machine."""

    cli_logger.configure(log_style, log_color, verbose)

    # Note that raylet needs to exit before object store, otherwise
    # it cannot exit gracefully.
    is_linux = sys.platform.startswith("linux")
    processes_to_kill = [
        # The first element is the substring to filter.
        # The second element, if True, is to filter ps results by command name
        # (only the first 15 characters of the executable name on Linux);
        # if False, is to filter ps results by command with all its arguments.
        # See STANDARD FORMAT SPECIFIERS section of
        # http://man7.org/linux/man-pages/man1/ps.1.html
        # about comm and args. This can help avoid killing non-ray processes.
        # Format:
        # Keyword to filter, filter by command (True)/filter by args (False)
        ["raylet", True],
        ["plasma_store", True],
        ["gcs_server", True],
        ["monitor.py", False],
        ["redis-server", False],
        ["default_worker.py", False],  # Python worker.
        ["ray::", True],  # Python worker. TODO(mehrdadn): Fix for Windows
        ["io.ray.runtime.runner.worker.DefaultWorker", False],  # Java worker.
        ["log_monitor.py", False],
        ["reporter.py", False],
        ["dashboard.py", False],
        ["new_dashboard/agent.py", False],
        ["ray_process_reaper.py", False],
    ]

    process_infos = []
    for proc in psutil.process_iter(["name", "cmdline"]):
        try:
            process_infos.append((proc, proc.name(), proc.cmdline()))
        except psutil.Error:
            pass

    total_found = 0
    total_stopped = 0
    for keyword, filter_by_cmd in processes_to_kill:
        if filter_by_cmd and is_linux and len(keyword) > 15:
            # getting here is an internal bug, so we do not use cli_logger
            msg = ("The filter string should not be more than {} "
                   "characters. Actual length: {}. Filter: {}").format(
                       15, len(keyword), keyword)
            raise ValueError(msg)

        found = []
        for candidate in process_infos:
            proc, proc_cmd, proc_args = candidate
            corpus = (proc_cmd
                      if filter_by_cmd else subprocess.list2cmdline(proc_args))
            if keyword in corpus:
                found.append(candidate)

        for proc, proc_cmd, proc_args in found:
            total_found += 1

            proc_string = str(subprocess.list2cmdline(proc_args))
            try:
                if force:
                    proc.kill()
                else:
                    # TODO(mehrdadn): On Windows, this is forceful termination.
                    # We don't want CTRL_BREAK_EVENT, because that would
                    # terminate the entire process group. What to do?
                    proc.terminate()

                if force:
                    cli_logger.verbose("Killed `{}` {} ", cf.bold(proc_string),
                                       cf.dimmed("(via SIGKILL)"))
                else:
                    cli_logger.verbose("Send termination request to `{}` {}",
                                       cf.bold(proc_string),
                                       cf.dimmed("(via SIGTERM)"))

                total_stopped += 1
            except psutil.NoSuchProcess:
                cli_logger.verbose(
                    "Attempted to stop `{}`, but process was already dead.",
                    cf.bold(proc_string))
                pass
            except (psutil.Error, OSError) as ex:
                cli_logger.error("Could not terminate `{}` due to {}",
                                 cf.bold(proc_string), str(ex))

    if total_found == 0:
        cli_logger.print("Did not find any active Ray processes.")
    else:
        if total_stopped == total_found:
            cli_logger.success("Stopped all {} Ray processes.", total_stopped)
        else:
            cli_logger.warning(
                "Stopped only {} out of {} Ray processes. "
                "Set `{}` to see more details.", total_stopped, total_found,
                cf.bold("-v"))
            cli_logger.warning("Try running the command again, or use `{}`.",
                               cf.bold("--force"))
Example #23
def _bootstrap_config(config: Dict[str, Any],
                      no_config_cache: bool = False) -> Dict[str, Any]:
    config = prepare_config(config)

    hasher = hashlib.sha1()
    hasher.update(json.dumps([config], sort_keys=True).encode("utf-8"))
    cache_key = os.path.join(tempfile.gettempdir(),
                             "ray-config-{}".format(hasher.hexdigest()))

    if os.path.exists(cache_key) and not no_config_cache:
        config_cache = json.loads(open(cache_key).read())
        if config_cache.get("_version", -1) == CONFIG_CACHE_VERSION:
            # todo: is it fine to re-resolve? afaik it should be.
            # we can have migrations otherwise or something
            # but this seems overcomplicated given that resolving is
            # relatively cheap
            try_reload_log_state(config_cache["config"]["provider"],
                                 config_cache.get("provider_log_info"))

            if log_once("_printed_cached_config_warning"):
                cli_logger.verbose_warning(
                    "Loaded cached provider configuration "
                    "from " + cf.bold("{}"), cache_key)
                if cli_logger.verbosity == 0:
                    cli_logger.warning("Loaded cached provider configuration")
                cli_logger.warning(
                    "If you experience issues with "
                    "the cloud provider, try re-running "
                    "the command with {}.", cf.bold("--no-config-cache"))

            return config_cache["config"]
        else:
            cli_logger.warning(
                "Found cached cluster config "
                "but the version " + cf.bold("{}") + " "
                "(expected " + cf.bold("{}") + ") does not match.\n"
                "This is normal if cluster launcher was updated.\n"
                "Config will be re-resolved.",
                config_cache.get("_version", "none"), CONFIG_CACHE_VERSION)

    importer = _NODE_PROVIDERS.get(config["provider"]["type"])
    if not importer:
        raise NotImplementedError("Unsupported provider {}".format(
            config["provider"]))

    provider_cls = importer(config["provider"])

    cli_logger.print("Checking {} environment settings",
                     _PROVIDER_PRETTY_NAMES.get(config["provider"]["type"]))
    try:
        config = provider_cls.fillout_available_node_types_resources(config)
    except Exception as exc:
        if cli_logger.verbosity > 2:
            logger.exception("Failed to autodetect node resources.")
        else:
            cli_logger.warning(
                f"Failed to autodetect node resources: {str(exc)}. "
                "You can see full stack trace with higher verbosity.")

    # NOTE: if `resources` field is missing, validate_config for providers
    # other than AWS and Kubernetes will fail (the schema error will ask the
    # user to manually fill the resources) as we currently support autofilling
    # resources for AWS and Kubernetes only.
    validate_config(config)
    resolved_config = provider_cls.bootstrap_config(config)

    if not no_config_cache:
        with open(cache_key, "w") as f:
            config_cache = {
                "_version": CONFIG_CACHE_VERSION,
                "provider_log_info": try_get_log_state(config["provider"]),
                "config": resolved_config
            }
            f.write(json.dumps(config_cache))
    return resolved_config
Example #24
    def _create_node(self, node_config, tags, count):
        created_nodes_dict = {}

        tags = to_aws_format(tags)
        conf = node_config.copy()

        tag_pairs = [{
            "Key": TAG_RAY_CLUSTER_NAME,
            "Value": self.cluster_name,
        }]
        for k, v in tags.items():
            tag_pairs.append({
                "Key": k,
                "Value": v,
            })
        tag_specs = [{
            "ResourceType": "instance",
            "Tags": tag_pairs,
        }]
        user_tag_specs = conf.get("TagSpecifications", [])
        # Allow users to add tags and override values of existing
        # tags with their own. This only applies to the resource type
        # "instance". All other resource types are appended to the list of
        # tag specs.
        for user_tag_spec in user_tag_specs:
            if user_tag_spec["ResourceType"] == "instance":
                for user_tag in user_tag_spec["Tags"]:
                    exists = False
                    for tag in tag_specs[0]["Tags"]:
                        if user_tag["Key"] == tag["Key"]:
                            exists = True
                            tag["Value"] = user_tag["Value"]
                            break
                    if not exists:
                        tag_specs[0]["Tags"] += [user_tag]
            else:
                tag_specs += [user_tag_spec]

        # SubnetIds is not a real config key: we must resolve to a
        # single SubnetId before invoking the AWS API.
        subnet_ids = conf.pop("SubnetIds")

        for attempt in range(1, BOTO_CREATE_MAX_RETRIES + 1):
            try:
                subnet_id = subnet_ids[self.subnet_idx % len(subnet_ids)]

                self.subnet_idx += 1
                conf.update({
                    "MinCount": 1,
                    "MaxCount": count,
                    "SubnetId": subnet_id,
                    "TagSpecifications": tag_specs
                })
                created = self.ec2_fail_fast.create_instances(**conf)
                created_nodes_dict = {n.id: n for n in created}

                # todo: timed?
                # todo: handle plurality?
                with cli_logger.group(
                        "Launched {} nodes",
                        count,
                        _tags=dict(subnet_id=subnet_id)):
                    for instance in created:
                        # NOTE(maximsmol): This is needed for mocking
                        # boto3 for tests. This is likely a bug in moto
                        # but AWS docs don't seem to say.
                        # You can patch moto/ec2/responses/instances.py
                        # to fix this (add <stateReason> to EC2_RUN_INSTANCES)

                        # The correct value is technically
                        # {"code": "0", "Message": "pending"}
                        state_reason = instance.state_reason or {
                            "Message": "pending"
                        }

                        cli_logger.print(
                            "Launched instance {}",
                            instance.instance_id,
                            _tags=dict(
                                state=instance.state["Name"],
                                info=state_reason["Message"]))
                break
            except botocore.exceptions.ClientError as exc:
                if attempt == BOTO_CREATE_MAX_RETRIES:
                    cli_logger.abort(
                        "Failed to launch instances. Max attempts exceeded.",
                        exc=exc,
                    )
                else:
                    cli_logger.warning(
                        "create_instances: Attempt failed with {}, retrying.",
                        exc)
        return created_nodes_dict
Example #25
    def _create_node(self, node_config, tags, count):
        created_nodes_dict = {}

        tags = to_aws_format(tags)
        conf = node_config.copy()

        tag_pairs = [{
            "Key": TAG_RAY_CLUSTER_NAME,
            "Value": self.cluster_name,
        }]
        for k, v in tags.items():
            tag_pairs.append({
                "Key": k,
                "Value": v,
            })
        tag_specs = [{
            "ResourceType": "instance",
            "Tags": tag_pairs,
        }]
        user_tag_specs = conf.get("TagSpecifications", [])
        AWSNodeProvider._merge_tag_specs(tag_specs, user_tag_specs)

        # SubnetIds is not a real config key: we must resolve to a
        # single SubnetId before invoking the AWS API.
        subnet_ids = conf.pop("SubnetIds")

        # update config with min/max node counts and tag specs
        conf.update({
            "MinCount": 1,
            "MaxCount": count,
            "TagSpecifications": tag_specs
        })

        cli_logger_tags = {}
        for attempt in range(1, BOTO_CREATE_MAX_RETRIES + 1):
            try:
                if "NetworkInterfaces" in conf:
                    net_ifs = conf["NetworkInterfaces"]
                    # remove security group IDs previously copied from network
                    # interfaces (create_instances call fails otherwise)
                    conf.pop("SecurityGroupIds", None)
                    cli_logger_tags["network_interfaces"] = str(net_ifs)
                else:
                    subnet_id = subnet_ids[self.subnet_idx % len(subnet_ids)]
                    self.subnet_idx += 1
                    conf["SubnetId"] = subnet_id
                    cli_logger_tags["subnet_id"] = subnet_id

                created = self.ec2_fail_fast.create_instances(**conf)
                created_nodes_dict = {n.id: n for n in created}

                # todo: timed?
                # todo: handle plurality?
                with cli_logger.group("Launched {} nodes",
                                      count,
                                      _tags=cli_logger_tags):
                    for instance in created:
                        # NOTE(maximsmol): This is needed for mocking
                        # boto3 for tests. This is likely a bug in moto
                        # but AWS docs don't seem to say.
                        # You can patch moto/ec2/responses/instances.py
                        # to fix this (add <stateReason> to EC2_RUN_INSTANCES)

                        # The correct value is technically
                        # {"code": "0", "Message": "pending"}
                        state_reason = instance.state_reason or {
                            "Message": "pending"
                        }

                        cli_logger.print("Launched instance {}",
                                         instance.instance_id,
                                         _tags=dict(
                                             state=instance.state["Name"],
                                             info=state_reason["Message"]))
                break
            except botocore.exceptions.ClientError as exc:
                if attempt == BOTO_CREATE_MAX_RETRIES:
                    cli_logger.abort(
                        "Failed to launch instances. Max attempts exceeded.",
                        exc=exc,
                    )
                else:
                    cli_logger.warning(
                        "create_instances: Attempt failed with {}, retrying.",
                        exc)
        return created_nodes_dict
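This variant factors Example #24's inline tag-merging loop into
`AWSNodeProvider._merge_tag_specs`. The helper's body is not shown on this
page; reconstructed from Example #24, it plausibly looks like the following
(a sketch, not the actual source):

@staticmethod
def _merge_tag_specs(tag_specs, user_tag_specs):
    # "instance" specs override or extend the default tags in tag_specs[0];
    # specs for any other resource type are appended unchanged.
    for user_tag_spec in user_tag_specs:
        if user_tag_spec["ResourceType"] == "instance":
            for user_tag in user_tag_spec["Tags"]:
                for tag in tag_specs[0]["Tags"]:
                    if user_tag["Key"] == tag["Key"]:
                        tag["Value"] = user_tag["Value"]
                        break
                else:
                    tag_specs[0]["Tags"].append(user_tag)
        else:
            tag_specs.append(user_tag_spec)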
Example #26
    def run_init(self, *, as_head: bool, file_mounts: Dict[str, str],
                 sync_run_yet: bool):
        BOOTSTRAP_MOUNTS = [
            "~/ray_bootstrap_config.yaml", "~/ray_bootstrap_key.pem"
        ]

        specific_image = self.docker_config.get(
            f"{'head' if as_head else 'worker'}_image",
            self.docker_config.get("image"))

        self._check_docker_installed()
        if self.docker_config.get("pull_before_run", True):
            assert specific_image, "Image must be included in config if " + \
                "pull_before_run is specified"
            self.run("{} pull {}".format(self.docker_cmd, specific_image),
                     run_env="host")
        else:
            self.run(f"{self.docker_cmd} image inspect {specific_image} "
                     "1> /dev/null  2>&1 || "
                     f"{self.docker_cmd} pull {specific_image}")

        # Bootstrap files cannot be bind mounted because docker opens the
        # underlying inode. When the file is switched, docker becomes outdated.
        cleaned_bind_mounts = file_mounts.copy()
        for mnt in BOOTSTRAP_MOUNTS:
            cleaned_bind_mounts.pop(mnt, None)

        docker_run_executed = False

        container_running = self._check_container_status()
        requires_re_init = False
        if container_running:
            requires_re_init = self._check_if_container_restart_is_needed(
                specific_image, cleaned_bind_mounts)
            if requires_re_init:
                self.run(f"{self.docker_cmd} stop {self.container_name}",
                         run_env="host")

        if (not container_running) or requires_re_init:
            if not sync_run_yet:
                # Do not start the actual image as we need to run file_sync
                # first to ensure that all folders are created with the
                # correct ownership. Docker will create the folders with
                # `root` as the owner.
                return True
            # Get home directory
            image_env = self.ssh_command_runner.run(
                f"{self.docker_cmd} " + "inspect -f '{{json .Config.Env}}' " +
                specific_image,
                with_output=True).decode().strip()
            home_directory = "/root"
            for env_var in json.loads(image_env):
                if env_var.startswith("HOME="):
                    home_directory = env_var.split("HOME=")[1]
                    break

            user_docker_run_options = self.docker_config.get(
                "run_options", []) + self.docker_config.get(
                    f"{'head' if as_head else 'worker'}_run_options", [])
            start_command = docker_start_cmds(
                self.ssh_command_runner.ssh_user, specific_image,
                cleaned_bind_mounts, self.container_name,
                self._configure_runtime(
                    self._auto_configure_shm(user_docker_run_options)),
                self.ssh_command_runner.cluster_name, home_directory,
                self.docker_cmd)
            self.run(start_command, run_env="host")
            docker_run_executed = True

        # Explicitly copy in ray bootstrap files.
        for mount in BOOTSTRAP_MOUNTS:
            if mount in file_mounts:
                if not sync_run_yet:
                    # NOTE(ilr) This rsync is needed because when starting from
                    # a stopped instance, /tmp may be deleted and `run_init`
                    # is called before the first `file_sync` happens
                    self.run_rsync_up(file_mounts[mount], mount)
                self.ssh_command_runner.run(
                    "{cmd} cp {src} {container}:{dst}".format(
                        cmd=self.docker_cmd,
                        src=os.path.join(
                            self._get_docker_host_mount_location(
                                self.ssh_command_runner.cluster_name), mount),
                        container=self.container_name,
                        dst=self._docker_expand_user(mount)))
                try:
                    # Check if the current user has read permission.
                    # If they do not, try to change ownership!
                    self.run(f"cat {mount} >/dev/null 2>&1 || "
                             f"sudo chown $(id -u):$(id -g) {mount}")
                except Exception:
                    lsl_string = self.run(
                        f"ls -l {mount}",
                        with_output=True).decode("utf-8").strip()
                    # The string is of format <Permission> <Links>
                    # <Owner> <Group> <Size> <Date> <Name>
                    permissions = lsl_string.split(" ")[0]
                    owner = lsl_string.split(" ")[2]
                    group = lsl_string.split(" ")[3]
                    current_user = self.run(
                        "whoami", with_output=True).decode("utf-8").strip()
                    cli_logger.warning(
                        f"File ({mount}) is owned by user:{owner} and group:"
                        f"{group} with permissions ({permissions}). The "
                        f"current user ({current_user}) does not have "
                        "permission to read these files, and Ray may not be "
                        "able to autoscale. This can be resolved by "
                        "installing `sudo` in your container, or adding a "
                        f"command like 'chown {current_user} {mount}' to "
                        "your `setup_commands`.")
        self.initialized = True
        return docker_run_executed
Example #27
File: cli.py Project: scv119/ray
def job_submit(
    address: Optional[str],
    job_id: Optional[str],
    runtime_env: Optional[str],
    runtime_env_json: Optional[str],
    working_dir: Optional[str],
    entrypoint: Tuple[str],
    no_wait: bool,
):
    """Submits a job to be run on the cluster.

    Example:
        >>> ray job submit -- python my_script.py --arg=val
    """
    client = _get_sdk_client(address, create_cluster_if_needed=True)

    final_runtime_env = {}
    if runtime_env is not None:
        if runtime_env_json is not None:
            raise ValueError("Only one of --runtime_env and "
                             "--runtime-env-json can be provided.")
        with open(runtime_env, "r") as f:
            final_runtime_env = yaml.safe_load(f)

    elif runtime_env_json is not None:
        final_runtime_env = json.loads(runtime_env_json)

    if working_dir is not None:
        if "working_dir" in final_runtime_env:
            cli_logger.warning(
                "Overriding runtime_env working_dir with --working-dir option")

        final_runtime_env["working_dir"] = working_dir

    job_id = client.submit_job(
        entrypoint=list2cmdline(entrypoint),
        job_id=job_id,
        runtime_env=final_runtime_env,
    )

    _log_big_success_msg(f"Job '{job_id}' submitted successfully")

    with cli_logger.group("Next steps"):
        cli_logger.print("Query the logs of the job:")
        with cli_logger.indented():
            cli_logger.print(cf.bold(f"ray job logs {job_id}"))

        cli_logger.print("Query the status of the job:")
        with cli_logger.indented():
            cli_logger.print(cf.bold(f"ray job status {job_id}"))

        cli_logger.print("Request the job to be stopped:")
        with cli_logger.indented():
            cli_logger.print(cf.bold(f"ray job stop {job_id}"))

    cli_logger.newline()
    sdk_version = client.get_version()
    # sdk version 0 does not have log streaming
    if not no_wait:
        if int(sdk_version) > 0:
            cli_logger.print("Tailing logs until the job exits "
                             "(disable with --no-wait):")
            asyncio.get_event_loop().run_until_complete(
                _tail_logs(client, job_id))
        else:
            cli_logger.warning(
                "Tailing logs is not enabled for job sdk client version "
                f"{sdk_version}. Please upgrade your ray to latest version "
                "for this feature.")
Example #28
def get_cluster_dump_archive(cluster_config_file: Optional[str] = None,
                             host: Optional[str] = None,
                             ssh_user: Optional[str] = None,
                             ssh_key: Optional[str] = None,
                             docker: Optional[str] = None,
                             local: Optional[bool] = None,
                             output: Optional[str] = None,
                             logs: bool = True,
                             debug_state: bool = True,
                             pip: bool = True,
                             processes: bool = True,
                             processes_verbose: bool = False) -> Optional[str]:

    # Inform the user what kind of logs are collected (before actually
    # collecting, so they can abort)
    content_str = ""
    if logs:
        content_str += \
            "  - The logfiles of your Ray session\n" \
            "    This usually includes Python outputs (stdout/stderr)\n"

    if debug_state:
        content_str += \
            "  - Debug state information on your Ray cluster \n" \
            "    e.g. number of workers, drivers, objects, etc.\n"

    if pip:
        content_str += "  - Your installed Python packages (`pip freeze`)\n"

    if processes:
        content_str += \
            "  - Information on your running Ray processes\n" \
            "    This includes command line arguments\n"

    cli_logger.warning(
        "You are about to create a cluster dump. This will collect data from "
        "cluster nodes.\n\n"
        "The dump will contain this information:\n\n"
        f"{content_str}\n"
        f"If you are concerned about leaking private information, extract "
        f"the archive and inspect its contents before sharing it with "
        f"anyone.")

    # Parse arguments (e.g. fetch info from cluster config)
    cluster_config_file, hosts, ssh_user, ssh_key, docker, cluster_name = \
        _info_from_params(cluster_config_file, host, ssh_user, ssh_key, docker)

    nodes = [
        Node(
            host=h,
            ssh_user=ssh_user,
            ssh_key=ssh_key,
            docker_container=docker) for h in hosts
    ]

    if not nodes:
        cli_logger.error(
            "No nodes found. Specify with `--host` or by passing a ray "
            "cluster config to `--cluster`.")
        return None

    if cluster_config_file:
        nodes[0].is_head = True

    if local is None:
        # If called with a cluster config, this was probably started
        # from a laptop
        local = not bool(cluster_config_file)

    parameters = GetParameters(
        logs=logs,
        debug_state=debug_state,
        pip=pip,
        processes=processes,
        processes_verbose=processes_verbose)

    with Archive() as archive:
        if local:
            create_archive_for_local_and_remote_nodes(
                archive, remote_nodes=nodes, parameters=parameters)
        else:
            create_archive_for_remote_nodes(
                archive, remote_nodes=nodes, parameters=parameters)

    if not output:
        if cluster_name:
            filename = f"{cluster_name}_" \
                       f"{datetime.datetime.now():%Y-%m-%d_%H-%M-%S}.tar.gz"
        else:
            filename = f"collected_logs_" \
                       f"{datetime.datetime.now():%Y-%m-%d_%H-%M-%S}.tar.gz"
        output = os.path.join(os.getcwd(), filename)
    else:
        output = os.path.expanduser(output)

    os.rename(archive.file, output)
    return output