def main():
    # Running inside the operator pod: authenticate to the Kubernetes API
    # using the pod's service account credentials.
    kubernetes.config.load_incluster_config()
    # Materialize the user's cluster config (read from the ConfigMap) as a
    # file on disk; returns the path to that file.
    cluster_config_path = prepare_ray_cluster_config()
    # Launch (or update) the Ray head node pod. The return value is the
    # fully resolved cluster config after defaults are filled in.
    config = create_or_update_cluster(
        cluster_config_path,
        override_min_workers=None,
        override_max_workers=None,
        no_restart=False,
        restart_only=False,
        yes=True,
        no_config_cache=True)
    # Persist the resolved config back to the same path so the autoscaling
    # monitor started below reads exactly the settings that were applied.
    with open(cluster_config_path, "w") as file:
        yaml.dump(config, file)
    ray_head_pod_ip = get_ray_head_pod_ip(config)
    # TODO: Add support for user-specified redis port and password
    redis_address = services.address(ray_head_pod_ip,
                                     ray_constants.DEFAULT_PORT)
    stderr_file, stdout_file = get_logs()
    # Run the autoscaling monitor loop in this (operator) pod; per the module
    # docstring, the head node itself does not run an autoscaler.
    services.start_monitor(
        redis_address,
        stdout_file=stdout_file,
        stderr_file=stderr_file,
        autoscaling_config=cluster_config_path,
        redis_password=ray_constants.REDIS_DEFAULT_PASSWORD)
def create_or_update_cluster(cluster_config: Union[dict, str],
                             *,
                             no_restart: bool = False,
                             restart_only: bool = False,
                             no_config_cache: bool = False) -> Dict[str, Any]:
    """Create a new autoscaling Ray cluster, or update a running one.

    Thin SDK wrapper over ``commands.create_or_update_cluster`` with
    non-interactive defaults (``yes=True``, no cluster-name override, no
    worker-count overrides).

    Args:
        cluster_config (Union[str, dict]): Either the config dict of the
            cluster, or a path pointing to a file containing the config.
        no_restart (bool): Whether to skip restarting Ray services during the
            update. This avoids interrupting running jobs and can be used to
            dynamically adjust autoscaler configuration.
        restart_only (bool): Whether to skip running setup commands and only
            restart Ray. This cannot be used with 'no-restart'.
        no_config_cache (bool): Whether to disable the config cache and fully
            resolve all environment settings from the Cloud provider again.
    """
    # Fixed (non-interactive) options passed through to the commands layer.
    launch_options = dict(
        override_min_workers=None,
        override_max_workers=None,
        no_restart=no_restart,
        restart_only=restart_only,
        yes=True,
        override_cluster_name=None,
        no_config_cache=no_config_cache,
        redirect_command_output=None,
        use_login_shells=True)
    with _as_config_file(cluster_config) as config_file:
        return commands.create_or_update_cluster(
            config_file=config_file, **launch_options)
def start_head(self) -> None:
    """Launch the head node and record the resolved cluster config."""
    self.write_config()
    # Non-interactive launch options; always restart and never use the
    # cached config.
    launch_options = {
        "override_min_workers": None,
        "override_max_workers": None,
        "no_restart": False,
        "restart_only": False,
        "yes": True,
        "no_config_cache": True,
    }
    self.config = commands.create_or_update_cluster(self.config_path,
                                                    **launch_options)
    # Persist the resolved config returned by the launch.
    self.write_config()
def up(cluster_config_file, min_workers, max_workers, no_restart, restart_only,
       yes, cluster_name, no_config_cache, redirect_command_output,
       use_login_shells, log_style, log_color, verbose):
    """Create or update a Ray cluster.

    Accepts either a local path or an http(s) URL for the cluster config;
    remote configs are downloaded into the working directory first.
    """
    cli_logger.configure(log_style, log_color, verbose)

    if restart_only or no_restart:
        # --restart-only and --no-restart are mutually exclusive.
        cli_logger.doassert(restart_only != no_restart,
                            "`{}` is incompatible with `{}`.",
                            cf.bold("--restart-only"), cf.bold("--no-restart"))
        assert restart_only != no_restart, "Cannot set both 'restart_only' " \
            "and 'no_restart' at the same time!"

    if urllib.parse.urlparse(cluster_config_file).scheme in ("http", "https"):
        try:
            response = urllib.request.urlopen(cluster_config_file, timeout=5)
            content = response.read()
            # Save the remote config under its basename in the cwd and use
            # the local copy from here on.
            file_name = cluster_config_file.split("/")[-1]
            with open(file_name, "wb") as f:
                f.write(content)
            cluster_config_file = file_name
        # Fix: catch URLError (superclass of HTTPError) so DNS failures and
        # refused connections take the warning path instead of crashing with
        # an unhandled traceback.
        except urllib.error.URLError as e:
            cli_logger.warning("{}", str(e))
            cli_logger.warning(
                "Could not download remote cluster configuration file.")
            cli_logger.old_info(logger, "Error downloading file: ", e)

    create_or_update_cluster(
        config_file=cluster_config_file,
        override_min_workers=min_workers,
        override_max_workers=max_workers,
        no_restart=no_restart,
        restart_only=restart_only,
        yes=yes,
        override_cluster_name=cluster_name,
        no_config_cache=no_config_cache,
        redirect_command_output=redirect_command_output,
        use_login_shells=use_login_shells)
def start_head(self, restart_ray: bool = False) -> None:
    """Create or update the cluster head node.

    Args:
        restart_ray: Restart Ray on the head node. Only set when recovering
            from a failure; otherwise running Ray is left undisturbed.
    """
    self.write_config()
    # The head is launched without its own monitor (no_monitor_on_head=True),
    # and Ray is restarted only when recovering from failure.
    self.config = commands.create_or_update_cluster(
        self.config_path,
        override_min_workers=None,
        override_max_workers=None,
        no_restart=not restart_ray,
        restart_only=False,
        yes=True,
        no_config_cache=True,
        no_monitor_on_head=True)
    # Write the resulting config for use by the autoscaling monitor.
    self.write_config()
def submit(cluster_config_file, screen, tmux, stop, start, cluster_name,
           no_config_cache, port_forward, script, args, script_args, log_style,
           log_color, verbose):
    """Uploads and runs a script on the specified cluster.

    The script is automatically synced to the following location:

        os.path.join("~", os.path.basename(script))

    Example:
        >>> ray submit [CLUSTER.YAML] experiment.py -- --smoke-test
    """
    cli_logger.configure(log_style, log_color, verbose)

    # --screen and --tmux are mutually exclusive; likewise the deprecated
    # --args and the `-- <args ...>` form.
    cli_logger.doassert(not (screen and tmux),
                        "`{}` and `{}` are incompatible.",
                        cf.bold("--screen"), cf.bold("--tmux"))
    cli_logger.doassert(
        not (script_args and args),
        "`{0}` and `{1}` are incompatible. Use only `{1}`.\n"
        "Example: `{2}`", cf.bold("--args"), cf.bold("-- <args ...>"),
        cf.bold("ray submit script.py -- --arg=123 --flag"))

    assert not (screen and tmux), "Can specify only one of `screen` or `tmux`."
    assert not (script_args and args), "Use -- --arg1 --arg2 for script args."

    if args:
        # Deprecation notice for the legacy --args option.
        cli_logger.warning(
            "`{}` is deprecated and will be removed in the future.",
            cf.bold("--args"))
        cli_logger.warning(
            "Use `{}` instead. \nExample: `{}`.", cf.bold("-- <args ...>"),
            cf.bold("ray submit script.py -- --arg=123 --flag"))
        cli_logger.newline()

    if start:
        # Bring the cluster up (or update it) before syncing the script.
        create_or_update_cluster(
            config_file=cluster_config_file,
            override_min_workers=None,
            override_max_workers=None,
            no_restart=False,
            restart_only=False,
            yes=True,
            override_cluster_name=cluster_name,
            no_config_cache=no_config_cache,
            redirect_command_output=False,
            use_login_shells=True)

    # Sync the script to the home directory on the cluster.
    target = os.path.basename(script)
    target = os.path.join("~", target)
    rsync(
        cluster_config_file,
        script,
        target,
        cluster_name,
        no_config_cache=no_config_cache,
        down=False)

    command_parts = ["python", target]
    if script_args:
        command_parts += list(script_args)
    elif args is not None:
        # Legacy --args: a single pre-joined argument string.
        command_parts += [args]

    # Forward each requested port to the same port number locally.
    port_forward = [(port, port) for port in list(port_forward)]
    cmd = " ".join(command_parts)
    # start=False: the cluster was already started above when requested.
    exec_cluster(
        cluster_config_file,
        cmd=cmd,
        run_env="docker",
        screen=screen,
        tmux=tmux,
        stop=stop,
        start=False,
        override_cluster_name=cluster_name,
        no_config_cache=no_config_cache,
        port_forward=port_forward)
""" Ray operator for Kubernetes. Reads ray cluster config from a k8s ConfigMap, starts a ray head node pod using create_or_update_cluster(), then runs an autoscaling loop in the operator pod executing this script. Writes autoscaling logs to the directory /root/ray-operator-logs. In this setup, the ray head node does not run an autoscaler. It is important NOT to supply an --autoscaling-config argument to head node's ray start command in the cluster config when using this operator. To run, first create a ConfigMap named ray-operator-configmap from a ray cluster config. Then apply the manifest at python/ray/autoscaler/kubernetes/operator_configs/operator_config.yaml For example: kubectl create namespace raytest kubectl -n raytest create configmap ray-operator-configmap --from-file=python/ray/autoscaler/kubernetes/operator_configs/test_cluster_config.yaml kubectl -n raytest apply -f python/ray/autoscaler/kubernetes/operator_configs/operator_config.yaml """ # noqa import os from typing import Any, Dict, IO, Tuple import kubernetes import yaml from ray._private import services from ray.autoscaler._private.commands import create_or_update_cluster from ray.autoscaler._private.kubernetes import core_api from ray.utils import open_log