def main():
    """Bring up the Ray cluster and start the autoscaling monitor.

    Steps, in order:
      1. Load the in-cluster Kubernetes configuration.
      2. Prepare the Ray cluster config file and create or update the
         cluster from it.
      3. Overwrite that file with the config returned by the update.
      4. Start the monitor against the head pod's Redis address, using the
         default port and default Redis password.
    """
    kubernetes.config.load_incluster_config()
    config_path = prepare_ray_cluster_config()

    # No worker-count overrides, no config cache; yes=True presumably
    # suppresses interactive confirmation -- TODO confirm.
    update_options = dict(
        override_min_workers=None,
        override_max_workers=None,
        no_restart=False,
        restart_only=False,
        yes=True,
        no_config_cache=True)
    resolved_config = create_or_update_cluster(config_path, **update_options)

    # Persist the returned config to the same path; the monitor started
    # below is pointed at this file via `autoscaling_config`.
    with open(config_path, "w") as config_stream:
        yaml.dump(resolved_config, config_stream)

    # TODO: Add support for user-specified redis port and password
    redis_address = services.address(
        get_ray_head_pod_ip(resolved_config), ray_constants.DEFAULT_PORT)
    # get_logs() yields the stderr handle first, then the stdout handle.
    log_err, log_out = get_logs()

    services.start_monitor(
        redis_address,
        stdout_file=log_out,
        stderr_file=log_err,
        autoscaling_config=config_path,
        redis_password=ray_constants.REDIS_DEFAULT_PASSWORD)
Beispiel #2
0
def create_or_update_cluster(cluster_config: Union[dict, str],
                             *,
                             no_restart: bool = False,
                             restart_only: bool = False,
                             no_config_cache: bool = False) -> Dict[str, Any]:
    """Create or update an autoscaling Ray cluster from a config.

    Thin wrapper around ``commands.create_or_update_cluster`` that accepts
    the config either as a dict or as a file path and pins every option not
    exposed here to a fixed value.

    Args:
        cluster_config (Union[str, dict]): Either the config dict of the
            cluster, or a path pointing to a file containing the config.
        no_restart (bool): Whether to skip restarting Ray services during the
            update. This avoids interrupting running jobs and can be used to
            dynamically adjust autoscaler configuration.
        restart_only (bool): Whether to skip running setup commands and only
            restart Ray. This cannot be used with 'no-restart'.
        no_config_cache (bool): Whether to disable the config cache and fully
            resolve all environment settings from the Cloud provider again.

    Returns:
        Whatever ``commands.create_or_update_cluster`` returns (annotated
        here as ``Dict[str, Any]``; presumably the resolved cluster config --
        TODO confirm).
    """
    # Options the wrapper does not expose: no worker-count or cluster-name
    # overrides, yes=True, default output redirection, login shells enabled.
    pinned_options = dict(
        override_min_workers=None,
        override_max_workers=None,
        yes=True,
        override_cluster_name=None,
        redirect_command_output=None,
        use_login_shells=True)

    # _as_config_file hands back something usable as `config_file` for the
    # duration of the call, whether a dict or a path was supplied.
    with _as_config_file(cluster_config) as config_path:
        return commands.create_or_update_cluster(
            config_file=config_path,
            no_restart=no_restart,
            restart_only=restart_only,
            no_config_cache=no_config_cache,
            **pinned_options)
Beispiel #3
0
 def start_head(self) -> None:
     """Create or update the cluster and store the resulting config.

     Writes the current config out, runs
     ``commands.create_or_update_cluster`` against ``self.config_path``,
     keeps the returned value in ``self.config`` and writes the config
     out again.
     """
     self.write_config()
     # No worker-count overrides, no config cache, yes=True.
     head_options = {
         "override_min_workers": None,
         "override_max_workers": None,
         "no_restart": False,
         "restart_only": False,
         "yes": True,
         "no_config_cache": True,
     }
     self.config = commands.create_or_update_cluster(
         self.config_path, **head_options)
     # Second write happens after self.config has been replaced by the
     # value returned from the update.
     self.write_config()
Beispiel #4
0
def up(cluster_config_file, min_workers, max_workers, no_restart, restart_only,
       yes, cluster_name, no_config_cache, redirect_command_output,
       use_login_shells, log_style, log_color, verbose):
    """Create or update a Ray cluster."""
    # NOTE(review): the docstring above is kept verbatim because it is
    # presumably surfaced as CLI help text -- confirm before rewording.
    #
    # Flow:
    #   1. Configure the CLI logger from log_style / log_color / verbose.
    #   2. Reject the combination of restart_only and no_restart.
    #   3. If cluster_config_file is an http(s) URL, download it into the
    #      current working directory and use the local copy instead.
    #   4. Delegate to create_or_update_cluster with the remaining options.
    cli_logger.configure(log_style, log_color, verbose)

    if restart_only or no_restart:
        cli_logger.doassert(restart_only != no_restart,
                            "`{}` is incompatible with `{}`.",
                            cf.bold("--restart-only"), cf.bold("--no-restart"))
        assert restart_only != no_restart, "Cannot set both 'restart_only' " \
            "and 'no_restart' at the same time!"

    if urllib.parse.urlparse(cluster_config_file).scheme in ("http", "https"):
        try:
            # Use the response as a context manager so the underlying
            # connection is closed even if read() raises.
            with urllib.request.urlopen(
                    cluster_config_file, timeout=5) as response:
                content = response.read()
            # The local file is named after the last path segment of the URL.
            file_name = cluster_config_file.split("/")[-1]
            with open(file_name, "wb") as f:
                f.write(content)
            cluster_config_file = file_name
        except urllib.error.HTTPError as e:
            # Best effort: only warn. cluster_config_file still holds the
            # URL here and is passed on unchanged below.
            cli_logger.warning("{}", str(e))
            cli_logger.warning(
                "Could not download remote cluster configuration file.")
            cli_logger.old_info(logger, "Error downloading file: ", e)
    create_or_update_cluster(
        config_file=cluster_config_file,
        override_min_workers=min_workers,
        override_max_workers=max_workers,
        no_restart=no_restart,
        restart_only=restart_only,
        yes=yes,
        override_cluster_name=cluster_name,
        no_config_cache=no_config_cache,
        redirect_command_output=redirect_command_output,
        use_login_shells=use_login_shells)
Beispiel #5
0
 def start_head(self, restart_ray: bool = False) -> None:
     """Create or update the cluster head and record the resulting config.

     Args:
         restart_ray: When False (the default) the update is run with
             ``no_restart=True``; when True it is run with
             ``no_restart=False``.
     """
     self.write_config()
     head_options = {
         "override_min_workers": None,
         "override_max_workers": None,
         # Ray on the head is left running unless a restart was requested
         # (e.g. when recovering from failure).
         "no_restart": not restart_ray,
         "restart_only": False,
         "yes": True,
         "no_config_cache": True,
         "no_monitor_on_head": True,
     }
     # The call's return value replaces self.config, capturing any changes
     # the update made to the config.
     self.config = commands.create_or_update_cluster(
         self.config_path, **head_options)
     # Persist the updated config for use by the autoscaling monitor.
     self.write_config()
Beispiel #6
0
def submit(cluster_config_file, screen, tmux, stop, start, cluster_name,
           no_config_cache, port_forward, script, args, script_args, log_style,
           log_color, verbose):
    """Uploads and runs a script on the specified cluster.

    The script is automatically synced to the following location:

        os.path.join("~", os.path.basename(script))

    Example:
        >>> ray submit [CLUSTER.YAML] experiment.py -- --smoke-test
    """
    # NOTE(review): the docstring above is kept verbatim because it is
    # presumably surfaced as CLI help text -- confirm before rewording.
    cli_logger.configure(log_style, log_color, verbose)

    # Two mutually exclusive option pairs: screen/tmux and --args/`-- ...`.
    multiplexer_conflict = bool(screen and tmux)
    args_conflict = bool(script_args and args)
    example_cmd = "ray submit script.py -- --arg=123 --flag"

    cli_logger.doassert(not multiplexer_conflict,
                        "`{}` and `{}` are incompatible.", cf.bold("--screen"),
                        cf.bold("--tmux"))
    cli_logger.doassert(
        not args_conflict,
        "`{0}` and `{1}` are incompatible. Use only `{1}`.\n"
        "Example: `{2}`", cf.bold("--args"), cf.bold("-- <args ...>"),
        cf.bold(example_cmd))

    assert not multiplexer_conflict, \
        "Can specify only one of `screen` or `tmux`."
    assert not args_conflict, "Use -- --arg1 --arg2 for script args."

    if args:
        # --args still works but is flagged as deprecated.
        cli_logger.warning(
            "`{}` is deprecated and will be removed in the future.",
            cf.bold("--args"))
        cli_logger.warning("Use `{}` instead. Example: `{}`.",
                           cf.bold("-- <args ...>"),
                           cf.bold(example_cmd))
        cli_logger.newline()

    if start:
        # Bring the cluster up first, with no worker-count overrides.
        create_or_update_cluster(
            config_file=cluster_config_file,
            override_min_workers=None,
            override_max_workers=None,
            no_restart=False,
            restart_only=False,
            yes=True,
            override_cluster_name=cluster_name,
            no_config_cache=no_config_cache,
            redirect_command_output=False,
            use_login_shells=True)

    # The script is synced to "~/<basename of script>".
    remote_script = os.path.join("~", os.path.basename(script))
    rsync(
        cluster_config_file,
        script,
        remote_script,
        cluster_name,
        no_config_cache=no_config_cache,
        down=False)

    # Arguments given after `--` take precedence; otherwise the --args
    # string is appended as a single element.
    if script_args:
        extra_parts = list(script_args)
    elif args is not None:
        extra_parts = [args]
    else:
        extra_parts = []
    command_parts = ["python", remote_script] + extra_parts

    # Each requested port is forwarded to the same port number.
    forward_pairs = [(port, port) for port in port_forward]
    exec_cluster(
        cluster_config_file,
        cmd=" ".join(command_parts),
        run_env="docker",
        screen=screen,
        tmux=tmux,
        stop=stop,
        start=False,
        override_cluster_name=cluster_name,
        no_config_cache=no_config_cache,
        port_forward=forward_pairs)
"""
Ray operator for Kubernetes.

Reads ray cluster config from a k8s ConfigMap, starts a ray head node pod using
create_or_update_cluster(), then runs an autoscaling loop in the operator pod
executing this script. Writes autoscaling logs to the directory
/root/ray-operator-logs.

In this setup, the Ray head node does not run an autoscaler. It is important
NOT to supply an --autoscaling-config argument to the head node's `ray start`
command in the cluster config when using this operator.

To run, first create a ConfigMap named ray-operator-configmap from a ray
cluster config. Then apply the manifest at python/ray/autoscaler/kubernetes/operator_configs/operator_config.yaml

For example:
kubectl create namespace raytest
kubectl -n raytest create configmap ray-operator-configmap --from-file=python/ray/autoscaler/kubernetes/operator_configs/test_cluster_config.yaml
kubectl -n raytest apply -f python/ray/autoscaler/kubernetes/operator_configs/operator_config.yaml
""" # noqa
import os
from typing import Any, Dict, IO, Tuple

import kubernetes
import yaml

from ray._private import services
from ray.autoscaler._private.commands import create_or_update_cluster
from ray.autoscaler._private.kubernetes import core_api
from ray.utils import open_log