def _get_head_node(config: Dict[str, Any],
                   printable_config_file: str,
                   override_cluster_name: Optional[str],
                   create_if_needed: bool = False) -> str:
    provider = _get_node_provider(config["provider"], config["cluster_name"])
    head_node_tags = {
        TAG_RAY_NODE_KIND: NODE_KIND_HEAD,
    }
    nodes = provider.non_terminated_nodes(head_node_tags)
    if len(nodes) > 0:
        head_node = nodes[0]
        return head_node
    elif create_if_needed:
        get_or_create_head_node(
            config,
            printable_config_file=printable_config_file,
            restart_only=False,
            no_restart=False,
            yes=True,
            override_cluster_name=override_cluster_name)
        return _get_head_node(
            config,
            printable_config_file,
            override_cluster_name,
            create_if_needed=False)
    else:
        raise RuntimeError("Head node of cluster ({}) not found!".format(
            config["cluster_name"]))
def test_network_interfaces(
    ec2_client_stub,
    iam_client_stub,
    ec2_client_stub_fail_fast,
    ec2_client_stub_max_retries,
):
    # use default stubs to skip ahead to subnet configuration
    stubs.configure_iam_role_default(iam_client_stub)
    stubs.configure_key_pair_default(ec2_client_stub)

    # given the security groups associated with our network interfaces...
    sgids = ["sg-00000000", "sg-11111111", "sg-22222222", "sg-33333333"]
    security_groups = []
    suffix = 0
    for sgid in sgids:
        sg = copy.deepcopy(DEFAULT_SG)
        sg["GroupName"] += f"-{suffix}"
        sg["GroupId"] = sgid
        security_groups.append(sg)
        suffix += 1
    # expect to describe all security groups to ensure they share the same VPC
    stubs.describe_sgs_by_id(ec2_client_stub, sgids, security_groups)

    # use a default stub to skip subnet configuration
    stubs.configure_subnet_default(ec2_client_stub)

    # given our mocks and an example config file as input...
    # expect the config to be loaded, validated, and bootstrapped successfully
    config = helpers.bootstrap_aws_example_config_file(
        "example-network-interfaces.yaml")

    # instantiate a new node provider
    new_provider = _get_node_provider(
        config["provider"],
        DEFAULT_CLUSTER_NAME,
        False,
    )

    for name, node_type in config["available_node_types"].items():
        node_cfg = node_type["node_config"]
        tags = helpers.node_provider_tags(config, name)

        # given our bootstrapped node config as input to create a new node...
        # expect to first describe all stopped instances that could be reused
        stubs.describe_instances_with_any_filter_consumer(
            ec2_client_stub_max_retries)

        # given no stopped EC2 instances to reuse...
        # expect to create new nodes with the given network interface config
        stubs.run_instances_with_network_interfaces_consumer(
            ec2_client_stub_fail_fast,
            node_cfg["NetworkInterfaces"],
        )

        new_provider.create_node(node_cfg, tags, 1)

    iam_client_stub.assert_no_pending_responses()
    ec2_client_stub.assert_no_pending_responses()
    ec2_client_stub_fail_fast.assert_no_pending_responses()
    ec2_client_stub_max_retries.assert_no_pending_responses()
def test_launch_templates(ec2_client_stub, ec2_client_stub_fail_fast,
                          ec2_client_stub_max_retries):
    # given the launch template associated with our default head node type...
    # expect to first describe the default launch template by ID
    stubs.describe_launch_template_versions_by_id_default(
        ec2_client_stub, ["$Latest"])

    # given the launch template associated with our default worker node type...
    # expect to next describe the same default launch template by name
    stubs.describe_launch_template_versions_by_name_default(
        ec2_client_stub, ["2"])

    # use default stubs to skip ahead to subnet configuration
    stubs.configure_key_pair_default(ec2_client_stub)

    # given the security groups associated with our launch template...
    sgids = [DEFAULT_SG["GroupId"]]
    security_groups = [DEFAULT_SG]
    # expect to describe all security groups to ensure they share the same VPC
    stubs.describe_sgs_by_id(ec2_client_stub, sgids, security_groups)

    # use a default stub to skip subnet configuration
    stubs.configure_subnet_default(ec2_client_stub)

    # given our mocks and an example config file as input...
    # expect the config to be loaded, validated, and bootstrapped successfully
    config = helpers.bootstrap_aws_example_config_file(
        "example-launch-templates.yaml")

    # instantiate a new node provider
    new_provider = _get_node_provider(
        config["provider"],
        DEFAULT_CLUSTER_NAME,
        False,
    )

    max_count = 1
    for name, node_type in config["available_node_types"].items():
        # given our bootstrapped node config as input to create a new node...
        # expect to first describe all stopped instances that could be reused
        stubs.describe_instances_with_any_filter_consumer(
            ec2_client_stub_max_retries)

        # given no stopped EC2 instances to reuse...
        # expect to create new nodes with the given launch template config
        node_cfg = node_type["node_config"]
        stubs.run_instances_with_launch_template_consumer(
            ec2_client_stub_fail_fast,
            config,
            node_cfg,
            name,
            DEFAULT_LT["LaunchTemplateData"],
            max_count,
        )

        tags = helpers.node_provider_tags(config, name)
        new_provider.create_node(node_cfg, tags, max_count)

    ec2_client_stub.assert_no_pending_responses()
    ec2_client_stub_fail_fast.assert_no_pending_responses()
    ec2_client_stub_max_retries.assert_no_pending_responses()
def _get_worker_nodes(config: Dict[str, Any],
                      override_cluster_name: Optional[str]) -> List[str]:
    """Returns worker node ids for given configuration."""
    # todo: technically could be reused in get_worker_node_ips
    if override_cluster_name is not None:
        config["cluster_name"] = override_cluster_name

    provider = _get_node_provider(config["provider"], config["cluster_name"])
    return provider.non_terminated_nodes({
        TAG_RAY_NODE_KIND: NODE_KIND_WORKER
    })
def count_non_terminated_nodes() -> int:
    """Get the count of non terminated nodes for the Ray cluster
    raycluster-complete in namespace default.
    """
    provider_config = _generate_provider_config(
        ray_cluster_namespace="default")
    kuberay_node_provider = _get_node_provider(
        provider_config=provider_config, cluster_name="raycluster-complete")
    nodes = kuberay_node_provider.non_terminated_nodes({})
    return len(nodes)
def _get_worker_nodes(config, override_cluster_name):
    """Returns worker node ids for given configuration."""
    # todo: technically could be reused in get_worker_node_ips
    if override_cluster_name is not None:
        config["cluster_name"] = override_cluster_name

    provider = _get_node_provider(config["provider"], config["cluster_name"])
    try:
        return provider.non_terminated_nodes(
            {TAG_RAY_NODE_KIND: NODE_KIND_WORKER})
    finally:
        provider.cleanup()
def reset(self, errors_fatal=False):
    sync_continuously = False
    if hasattr(self, "config"):
        sync_continuously = self.config.get(
            "file_mounts_sync_continuously", False)
    try:
        with open(self.config_path) as f:
            new_config = yaml.safe_load(f.read())
        if new_config != getattr(self, "config", None):
            try:
                validate_config(new_config)
            except Exception as e:
                logger.debug(
                    "Cluster config validation failed. The version of "
                    "the ray CLI you launched this cluster with may "
                    "be higher than the version of ray being run on "
                    "the cluster. Some new features may not be "
                    "available until you upgrade ray on your cluster.",
                    exc_info=e)

        (new_runtime_hash,
         new_file_mounts_contents_hash) = hash_runtime_conf(
             new_config["file_mounts"],
             new_config["cluster_synced_files"],
             [
                 new_config["worker_setup_commands"],
                 new_config["worker_start_ray_commands"],
             ],
             generate_file_mounts_contents_hash=sync_continuously,
         )
        self.config = new_config
        self.runtime_hash = new_runtime_hash
        self.file_mounts_contents_hash = new_file_mounts_contents_hash
        if not self.provider:
            self.provider = _get_node_provider(self.config["provider"],
                                               self.config["cluster_name"])
        # Check whether we can enable the resource demand scheduler.
        if "available_node_types" in self.config:
            self.available_node_types = self.config["available_node_types"]
            self.resource_demand_scheduler = ResourceDemandScheduler(
                self.provider, self.available_node_types,
                self.config["max_workers"])
        else:
            self.available_node_types = None
            self.resource_demand_scheduler = None

    except Exception as e:
        if errors_fatal:
            raise e
        else:
            logger.exception("StandardAutoscaler: "
                             "Error parsing config.")
def kill_node(config_file, yes, hard, override_cluster_name):
    """Kills a random Raylet worker."""
    config = yaml.safe_load(open(config_file).read())
    if override_cluster_name is not None:
        config["cluster_name"] = override_cluster_name
    config = _bootstrap_config(config)

    cli_logger.confirm(yes, "A random node will be killed.")
    cli_logger.old_confirm("This will kill a node in your cluster", yes)

    provider = _get_node_provider(config["provider"], config["cluster_name"])
    try:
        nodes = provider.non_terminated_nodes({
            TAG_RAY_NODE_KIND: NODE_KIND_WORKER
        })
        node = random.choice(nodes)
        cli_logger.print("Shutdown " + cf.bold("{}"), node)
        cli_logger.old_info(logger, "kill_node: Shutdown worker {}", node)
        if hard:
            provider.terminate_node(node)
        else:
            updater = NodeUpdaterThread(
                node_id=node,
                provider_config=config["provider"],
                provider=provider,
                auth_config=config["auth"],
                cluster_name=config["cluster_name"],
                file_mounts=config["file_mounts"],
                initialization_commands=[],
                setup_commands=[],
                ray_start_commands=[],
                runtime_hash="",
                file_mounts_contents_hash="",
                is_head_node=False,
                docker_config=config.get("docker"))

            _exec(updater, "ray stop", False, False)

        time.sleep(POLL_INTERVAL)

        if config.get("provider", {}).get("use_internal_ips", False) is True:
            node_ip = provider.internal_ip(node)
        else:
            node_ip = provider.external_ip(node)
    finally:
        provider.cleanup()

    return node_ip
def get_head_node_ip(config_file: str,
                     override_cluster_name: Optional[str] = None) -> str:
    """Returns head node IP for given configuration file if exists."""
    config = yaml.safe_load(open(config_file).read())
    if override_cluster_name is not None:
        config["cluster_name"] = override_cluster_name

    provider = _get_node_provider(config["provider"], config["cluster_name"])
    head_node = _get_head_node(config, config_file, override_cluster_name)
    if config.get("provider", {}).get("use_internal_ips", False):
        head_node_ip = provider.internal_ip(head_node)
    else:
        head_node_ip = provider.external_ip(head_node)

    return head_node_ip
def get_worker_node_ips(
        config_file: str,
        override_cluster_name: Optional[str] = None) -> List[str]:
    """Returns worker node IPs for given configuration file."""
    config = yaml.safe_load(open(config_file).read())
    if override_cluster_name is not None:
        config["cluster_name"] = override_cluster_name

    provider = _get_node_provider(config["provider"], config["cluster_name"])
    nodes = provider.non_terminated_nodes(
        {TAG_RAY_NODE_KIND: NODE_KIND_WORKER})

    if config.get("provider", {}).get("use_internal_ips", False) is True:
        return [provider.internal_ip(node) for node in nodes]
    else:
        return [provider.external_ip(node) for node in nodes]
def get_info_from_ray_cluster_config(
    cluster_config: str
) -> Tuple[List[str], str, str, Optional[str], Optional[str]]:
    """Get information from Ray cluster config.

    Return list of host IPs, ssh user, ssh key file, optional docker
    container, and optional cluster name.

    Args:
        cluster_config (str): Path to ray cluster config.

    Returns:
        Tuple of list of host IPs, ssh user name, ssh key file path,
            optional docker container name, optional cluster name.
    """
    from ray.autoscaler._private.commands import _bootstrap_config

    cli_logger.print(f"Retrieving cluster information from ray cluster file: "
                     f"{cluster_config}")

    cluster_config = os.path.expanduser(cluster_config)
    config = yaml.safe_load(open(cluster_config).read())
    config = _bootstrap_config(config, no_config_cache=True)

    provider = _get_node_provider(config["provider"], config["cluster_name"])
    head_nodes = provider.non_terminated_nodes({
        TAG_RAY_NODE_KIND: NODE_KIND_HEAD
    })
    worker_nodes = provider.non_terminated_nodes({
        TAG_RAY_NODE_KIND: NODE_KIND_WORKER
    })

    hosts = [provider.external_ip(node) for node in head_nodes + worker_nodes]
    ssh_user = config["auth"]["ssh_user"]
    ssh_key = config["auth"]["ssh_private_key"]

    docker = None
    docker_config = config.get("docker", None)
    if docker_config:
        docker = docker_config.get("container_name", None)

    cluster_name = config.get("cluster_name", None)

    return hosts, ssh_user, ssh_key, docker, cluster_name
def reset(self, errors_fatal=False):
    sync_continuously = False
    if hasattr(self, "config"):
        sync_continuously = self.config.get(
            "file_mounts_sync_continuously", False)
    try:
        with open(self.config_path) as f:
            new_config = yaml.safe_load(f.read())
        validate_config(new_config)
        (new_runtime_hash,
         new_file_mounts_contents_hash) = hash_runtime_conf(
             new_config["file_mounts"],
             new_config["cluster_synced_files"],
             [
                 new_config["worker_setup_commands"],
                 new_config["worker_start_ray_commands"],
             ],
             generate_file_mounts_contents_hash=sync_continuously,
         )
        self.config = new_config
        self.runtime_hash = new_runtime_hash
        self.file_mounts_contents_hash = new_file_mounts_contents_hash
        if not self.provider:
            self.provider = _get_node_provider(self.config["provider"],
                                               self.config["cluster_name"])
        # Check whether we can enable the resource demand scheduler.
        if "available_node_types" in self.config:
            self.available_node_types = self.config["available_node_types"]
            self.resource_demand_scheduler = ResourceDemandScheduler(
                self.provider, self.available_node_types,
                self.config["max_workers"])
        else:
            self.available_node_types = None
            self.resource_demand_scheduler = None

    except Exception as e:
        if errors_fatal:
            raise e
        else:
            logger.exception("StandardAutoscaler: "
                             "Error parsing config.")
def _get_running_head_node(
        config: Dict[str, Any],
        printable_config_file: str,
        override_cluster_name: Optional[str],
        create_if_needed: bool = False,
        _provider: Optional[NodeProvider] = None) -> str:
    """Get a valid, running head node"""
    provider = _provider or _get_node_provider(config["provider"],
                                               config["cluster_name"])
    head_node_tags = {
        TAG_RAY_NODE_KIND: NODE_KIND_HEAD,
    }
    nodes = provider.non_terminated_nodes(head_node_tags)
    head_node = None
    for node in nodes:
        node_state = provider.node_tags(node).get(TAG_RAY_NODE_STATUS)
        if node_state == STATUS_UP_TO_DATE:
            head_node = node
        else:
            cli_logger.warning(f"Head node ({node}) is in state {node_state}.")

    if head_node is not None:
        return head_node
    elif create_if_needed:
        get_or_create_head_node(
            config,
            printable_config_file=printable_config_file,
            restart_only=False,
            no_restart=False,
            yes=True,
            override_cluster_name=override_cluster_name)
        return _get_running_head_node(
            config,
            printable_config_file,
            override_cluster_name,
            create_if_needed=False)
    else:
        raise RuntimeError("Head node of cluster ({}) not found!".format(
            config["cluster_name"]))
def reset(self, errors_fatal=False):
    sync_continuously = False
    if hasattr(self, "config"):
        sync_continuously = self.config.get(
            "file_mounts_sync_continuously", False)
    try:
        with open(self.config_path) as f:
            new_config = yaml.safe_load(f.read())
        if new_config != getattr(self, "config", None):
            try:
                validate_config(new_config)
            except Exception as e:
                logger.debug(
                    "Cluster config validation failed. The version of "
                    "the ray CLI you launched this cluster with may "
                    "be higher than the version of ray being run on "
                    "the cluster. Some new features may not be "
                    "available until you upgrade ray on your cluster.",
                    exc_info=e)

        (new_runtime_hash,
         new_file_mounts_contents_hash) = hash_runtime_conf(
             new_config["file_mounts"],
             new_config["cluster_synced_files"],
             [
                 new_config["worker_setup_commands"],
                 new_config["worker_start_ray_commands"],
             ],
             generate_file_mounts_contents_hash=sync_continuously,
         )
        self.config = new_config
        self.runtime_hash = new_runtime_hash
        self.file_mounts_contents_hash = new_file_mounts_contents_hash
        if not self.provider:
            self.provider = _get_node_provider(self.config["provider"],
                                               self.config["cluster_name"])

        self.available_node_types = self.config["available_node_types"]
        upscaling_speed = self.config.get("upscaling_speed")
        aggressive = self.config.get("autoscaling_mode") == "aggressive"
        target_utilization_fraction = self.config.get(
            "target_utilization_fraction")
        if upscaling_speed:
            upscaling_speed = float(upscaling_speed)
        # TODO(ameer): consider adding (if users ask) an option of
        # initial_upscaling_num_workers.
        elif aggressive:
            upscaling_speed = 99999
            logger.warning(
                "Legacy aggressive autoscaling mode "
                "detected. Replacing it by setting upscaling_speed to "
                "99999.")
        elif target_utilization_fraction:
            upscaling_speed = (
                1 / max(target_utilization_fraction, 0.001) - 1)
            logger.warning(
                "Legacy target_utilization_fraction config "
                "detected. Replacing it by setting upscaling_speed to " +
                "1 / target_utilization_fraction - 1.")
        else:
            upscaling_speed = 1.0
        if self.resource_demand_scheduler:
            # The node types are autofilled internally for legacy yamls,
            # overwriting the class will remove the inferred node resources
            # for legacy yamls.
            self.resource_demand_scheduler.reset_config(
                self.provider, self.available_node_types,
                self.config["max_workers"], self.config["head_node_type"],
                upscaling_speed)
        else:
            self.resource_demand_scheduler = ResourceDemandScheduler(
                self.provider, self.available_node_types,
                self.config["max_workers"], self.config["head_node_type"],
                upscaling_speed)

    except Exception as e:
        if errors_fatal:
            raise e
        else:
            logger.exception("StandardAutoscaler: "
                             "Error parsing config.")
def teardown_cluster(config_file: str, yes: bool, workers_only: bool,
                     override_cluster_name: Optional[str],
                     keep_min_workers: bool) -> None:
    """Destroys all nodes of a Ray cluster described by a config json."""
    config = yaml.safe_load(open(config_file).read())
    if override_cluster_name is not None:
        config["cluster_name"] = override_cluster_name
    config = _bootstrap_config(config)

    cli_logger.confirm(yes, "Destroying cluster.", _abort=True)

    if not workers_only:
        try:
            exec_cluster(
                config_file,
                cmd="ray stop",
                run_env="auto",
                screen=False,
                tmux=False,
                stop=False,
                start=False,
                override_cluster_name=override_cluster_name,
                port_forward=None,
                with_output=False)
        except Exception as e:
            # todo: add better exception info
            cli_logger.verbose_error("{}", str(e))
            cli_logger.warning(
                "Exception occurred when stopping the cluster Ray runtime "
                "(use -v to dump teardown exceptions).")
            cli_logger.warning(
                "Ignoring the exception and "
                "attempting to shut down the cluster nodes anyway.")

    provider = _get_node_provider(config["provider"], config["cluster_name"])

    def remaining_nodes():
        workers = provider.non_terminated_nodes({
            TAG_RAY_NODE_KIND: NODE_KIND_WORKER
        })

        if keep_min_workers:
            min_workers = config.get("min_workers", 0)
            cli_logger.print(
                "{} random worker nodes will not be shut down. " +
                cf.dimmed("(due to {})"), cf.bold(min_workers),
                cf.bold("--keep-min-workers"))

            workers = random.sample(workers, len(workers) - min_workers)

        # todo: it's weird to kill the head node but not all workers
        if workers_only:
            cli_logger.print(
                "The head node will not be shut down. " +
                cf.dimmed("(due to {})"), cf.bold("--workers-only"))

            return workers

        head = provider.non_terminated_nodes({
            TAG_RAY_NODE_KIND: NODE_KIND_HEAD
        })

        return head + workers

    def run_docker_stop(node, container_name):
        try:
            updater = NodeUpdaterThread(
                node_id=node,
                provider_config=config["provider"],
                provider=provider,
                auth_config=config["auth"],
                cluster_name=config["cluster_name"],
                file_mounts=config["file_mounts"],
                initialization_commands=[],
                setup_commands=[],
                ray_start_commands=[],
                runtime_hash="",
                file_mounts_contents_hash="",
                is_head_node=False,
                docker_config=config.get("docker"))
            _exec(
                updater,
                f"docker stop {container_name}",
                with_output=False,
                run_env="host")
        except Exception:
            cli_logger.warning(f"Docker stop failed on {node}")

    # Loop here to check that both the head and worker nodes are actually
    # really gone
    A = remaining_nodes()

    container_name = config.get("docker", {}).get("container_name")
    if container_name:
        # This is to ensure that the parallel SSH calls below do not mess with
        # the users terminal.
        output_redir = cmd_output_util.is_output_redirected()
        cmd_output_util.set_output_redirected(True)
        allow_interactive = cmd_output_util.does_allow_interactive()
        cmd_output_util.set_allow_interactive(False)

        with ThreadPoolExecutor(
                max_workers=MAX_PARALLEL_SHUTDOWN_WORKERS) as executor:
            for node in A:
                executor.submit(
                    run_docker_stop, node=node, container_name=container_name)
        cmd_output_util.set_output_redirected(output_redir)
        cmd_output_util.set_allow_interactive(allow_interactive)

    with LogTimer("teardown_cluster: done."):
        while A:
            provider.terminate_nodes(A)

            cli_logger.print(
                "Requested {} nodes to shut down.",
                cf.bold(len(A)),
                _tags=dict(interval="1s"))

            time.sleep(POLL_INTERVAL)  # todo: interval should be a variable
            A = remaining_nodes()
            cli_logger.print("{} nodes remaining after {} second(s).",
                             cf.bold(len(A)), POLL_INTERVAL)
        cli_logger.success("No nodes remaining.")
cli_logger.abort("Cannot provide both ip_address and 'all_nodes'.") config = yaml.safe_load(open(config_file).read()) if override_cluster_name is not None: config["cluster_name"] = override_cluster_name config = _bootstrap_config(config, no_config_cache=no_config_cache) is_file_mount = False if source and target: for remote_mount in config.get("file_mounts", {}).keys(): if (source if down else target).startswith(remote_mount): is_file_mount = True break <<<<<<< HEAD:python/ray/autoscaler/_private/commands.py provider = _get_node_provider(config["provider"], config["cluster_name"]) def rsync_to_node(node_id, is_head_node): updater = NodeUpdaterThread( node_id=node_id, provider_config=config["provider"], provider=provider, auth_config=config["auth"], cluster_name=config["cluster_name"], file_mounts=config["file_mounts"], initialization_commands=[], setup_commands=[], ray_start_commands=[], runtime_hash="", use_internal_ip=use_internal_ip, process_runner=_runner,
def rsync(config_file: str,
          source: Optional[str],
          target: Optional[str],
          override_cluster_name: Optional[str],
          down: bool,
          ip_address: Optional[str] = None,
          use_internal_ip: bool = False,
          no_config_cache: bool = False,
          all_nodes: bool = False,
          _runner: ModuleType = subprocess) -> None:
    """Rsyncs files.

    Arguments:
        config_file: path to the cluster yaml
        source: source dir
        target: target dir
        override_cluster_name: set the name of the cluster
        down: whether we're syncing remote -> local
        ip_address (str): Address of node. Raise Exception
            if both ip_address and 'all_nodes' are provided.
        use_internal_ip (bool): Whether the provided ip_address is
            public or private.
        all_nodes: whether to sync worker nodes in addition to the head node
    """
    if bool(source) != bool(target):
        cli_logger.abort(
            "Expected either both a source and a target, or neither.")

    assert bool(source) == bool(target), (
        "Must either provide both or neither source and target.")

    if ip_address and all_nodes:
        cli_logger.abort("Cannot provide both ip_address and 'all_nodes'.")

    config = yaml.safe_load(open(config_file).read())
    if override_cluster_name is not None:
        config["cluster_name"] = override_cluster_name
    config = _bootstrap_config(config, no_config_cache=no_config_cache)

    is_file_mount = False
    if source and target:
        for remote_mount in config.get("file_mounts", {}).keys():
            if (source if down else target).startswith(remote_mount):
                is_file_mount = True
                break

    provider = _get_node_provider(config["provider"], config["cluster_name"])

    def rsync_to_node(node_id, is_head_node):
        updater = NodeUpdaterThread(
            node_id=node_id,
            provider_config=config["provider"],
            provider=provider,
            auth_config=config["auth"],
            cluster_name=config["cluster_name"],
            file_mounts=config["file_mounts"],
            initialization_commands=[],
            setup_commands=[],
            ray_start_commands=[],
            runtime_hash="",
            use_internal_ip=use_internal_ip,
            process_runner=_runner,
            file_mounts_contents_hash="",
            is_head_node=is_head_node,
            rsync_options={
                "rsync_exclude": config.get("rsync_exclude"),
                "rsync_filter": config.get("rsync_filter")
            },
            docker_config=config.get("docker"))
        if down:
            rsync = updater.rsync_down
        else:
            rsync = updater.rsync_up

        if source and target:
            # print rsync progress for single file rsync
            cmd_output_util.set_output_redirected(False)
            set_rsync_silent(False)
            rsync(source, target, is_file_mount)
        else:
            updater.sync_file_mounts(rsync)

    nodes = []
    head_node = _get_head_node(
        config, config_file, override_cluster_name, create_if_needed=False)
    if ip_address:
        nodes = [
            provider.get_node_id(ip_address, use_internal_ip=use_internal_ip)
        ]
    else:
        nodes = [head_node]
        if all_nodes:
            nodes.extend(_get_worker_nodes(config, override_cluster_name))

    for node_id in nodes:
        rsync_to_node(node_id, is_head_node=(node_id == head_node))
def exec_cluster(config_file: str,
                 *,
                 cmd: str = None,
                 run_env: str = "auto",
                 screen: bool = False,
                 tmux: bool = False,
                 stop: bool = False,
                 start: bool = False,
                 override_cluster_name: Optional[str] = None,
                 no_config_cache: bool = False,
                 port_forward: Optional[Port_forward] = None,
                 with_output: bool = False) -> str:
    """Runs a command on the specified cluster.

    Arguments:
        config_file: path to the cluster yaml
        cmd: command to run
        run_env: whether to run the command on the host or in a container.
            Select between "auto", "host" and "docker"
        screen: whether to run in a screen
        tmux: whether to run in a tmux session
        stop: whether to stop the cluster after command run
        start: whether to start the cluster if it isn't up
        override_cluster_name: set the name of the cluster
        port_forward ( (int, int) or list[(int, int)] ): port(s) to forward
    """
    assert not (screen and tmux), "Can specify only one of `screen` or `tmux`."
    assert run_env in RUN_ENV_TYPES, "--run_env must be in {}".format(
        RUN_ENV_TYPES)
    # TODO(rliaw): We default this to True to maintain backwards-compat.
    # In the future we would want to support disabling login-shells
    # and interactivity.
    cmd_output_util.set_allow_interactive(True)

    config = yaml.safe_load(open(config_file).read())
    if override_cluster_name is not None:
        config["cluster_name"] = override_cluster_name
    config = _bootstrap_config(config, no_config_cache=no_config_cache)

    head_node = _get_head_node(
        config, config_file, override_cluster_name, create_if_needed=start)

    provider = _get_node_provider(config["provider"], config["cluster_name"])
    updater = NodeUpdaterThread(
        node_id=head_node,
        provider_config=config["provider"],
        provider=provider,
        auth_config=config["auth"],
        cluster_name=config["cluster_name"],
        file_mounts=config["file_mounts"],
        initialization_commands=[],
        setup_commands=[],
        ray_start_commands=[],
        runtime_hash="",
        file_mounts_contents_hash="",
        is_head_node=True,
        rsync_options={
            "rsync_exclude": config.get("rsync_exclude"),
            "rsync_filter": config.get("rsync_filter")
        },
        docker_config=config.get("docker"))
    shutdown_after_run = False
    if cmd and stop:
        # Append the shutdown commands after the user command, joined by "; ".
        cmd = "; ".join([
            cmd, "ray stop",
            "ray teardown ~/ray_bootstrap_config.yaml --yes --workers-only"
        ])
        shutdown_after_run = True

    result = _exec(
        updater,
        cmd,
        screen,
        tmux,
        port_forward=port_forward,
        with_output=with_output,
        run_env=run_env,
        shutdown_after_run=shutdown_after_run)
    if tmux or screen:
        attach_command_parts = ["ray attach", config_file]
        if override_cluster_name is not None:
            attach_command_parts.append(
                "--cluster-name={}".format(override_cluster_name))
        if tmux:
            attach_command_parts.append("--tmux")
        elif screen:
            attach_command_parts.append("--screen")

        attach_command = " ".join(attach_command_parts)
        cli_logger.print("Run `{}` to check command status.",
                         cf.bold(attach_command))
    return result
def get_or_create_head_node(config: Dict[str, Any],
                            config_file: str,
                            no_restart: bool,
                            restart_only: bool,
                            yes: bool,
                            override_cluster_name: Optional[str],
                            _provider: Optional[NodeProvider] = None,
                            _runner: ModuleType = subprocess) -> None:
    """Create the cluster head node, which in turn creates the workers."""
    provider = (_provider or _get_node_provider(config["provider"],
                                                config["cluster_name"]))

    config = copy.deepcopy(config)
    config_file = os.path.abspath(config_file)

    head_node_tags = {
        TAG_RAY_NODE_KIND: NODE_KIND_HEAD,
    }
    nodes = provider.non_terminated_nodes(head_node_tags)
    if len(nodes) > 0:
        head_node = nodes[0]
    else:
        head_node = None

    if not head_node:
        cli_logger.confirm(
            yes,
            "No head node found. "
            "Launching a new cluster.",
            _abort=True)

    if head_node:
        if restart_only:
            cli_logger.confirm(
                yes,
                "Updating cluster configuration and "
                "restarting the cluster Ray runtime. "
                "Setup commands will not be run due to `{}`.\n",
                cf.bold("--restart-only"),
                _abort=True)
        elif no_restart:
            cli_logger.print(
                "Cluster Ray runtime will not be restarted due "
                "to `{}`.", cf.bold("--no-restart"))
            cli_logger.confirm(
                yes,
                "Updating cluster configuration and "
                "running setup commands.",
                _abort=True)
        else:
            cli_logger.print(
                "Updating cluster configuration and running full setup.")
            cli_logger.confirm(
                yes,
                cf.bold("Cluster Ray runtime will be restarted."),
                _abort=True)

    cli_logger.newline()

    # TODO(ekl) this logic is duplicated in node_launcher.py (keep in sync)
    head_node_config = copy.deepcopy(config["head_node"])
    if "head_node_type" in config:
        head_node_tags[TAG_RAY_USER_NODE_TYPE] = config["head_node_type"]
        head_node_config.update(config["available_node_types"][config[
            "head_node_type"]]["node_config"])

    launch_hash = hash_launch_conf(head_node_config, config["auth"])
    if head_node is None or provider.node_tags(head_node).get(
            TAG_RAY_LAUNCH_CONFIG) != launch_hash:
        with cli_logger.group("Acquiring an up-to-date head node"):
            if head_node is not None:
                cli_logger.print(
                    "Currently running head node is out-of-date with "
                    "cluster configuration")
                cli_logger.print(
                    "hash is {}, expected {}",
                    cf.bold(
                        provider.node_tags(head_node)
                        .get(TAG_RAY_LAUNCH_CONFIG)), cf.bold(launch_hash))
                cli_logger.confirm(yes, "Relaunching it.", _abort=True)

                provider.terminate_node(head_node)
                cli_logger.print("Terminated head node {}", head_node)

            head_node_tags[TAG_RAY_LAUNCH_CONFIG] = launch_hash
            head_node_tags[TAG_RAY_NODE_NAME] = "ray-{}-head".format(
                config["cluster_name"])
            provider.create_node(head_node_config, head_node_tags, 1)
            cli_logger.print("Launched a new head node")

            start = time.time()
            head_node = None
            with cli_logger.group("Fetching the new head node"):
                while True:
                    if time.time() - start > 50:
                        cli_logger.abort(
                            "Head node fetch timed out.")  # todo: msg
                        raise RuntimeError("Failed to create head node.")
                    nodes = provider.non_terminated_nodes(head_node_tags)
                    if len(nodes) == 1:
                        head_node = nodes[0]
                        break
                    time.sleep(POLL_INTERVAL)
            cli_logger.newline()

    with cli_logger.group(
            "Setting up head node",
            _numbered=("<>", 1, 1),
            # cf.bold(provider.node_tags(head_node)[TAG_RAY_NODE_NAME]),
            _tags=dict()):  # add id, ARN to tags?

        # TODO(ekl) right now we always update the head node even if the
        # hash matches.
        # We could prompt the user for what they want to do here.
        # No need to pass in cluster_sync_files because we use this
        # hash to set up the head node
        (runtime_hash, file_mounts_contents_hash) = hash_runtime_conf(
            config["file_mounts"], None, config)

        # Rewrite the auth config so that the head
        # node can update the workers
        remote_config = copy.deepcopy(config)

        # drop proxy options if they exist, otherwise
        # head node won't be able to connect to workers
        remote_config["auth"].pop("ssh_proxy_command", None)

        if "ssh_private_key" in config["auth"]:
            remote_key_path = "~/ray_bootstrap_key.pem"
            remote_config["auth"]["ssh_private_key"] = remote_key_path

        # Adjust for new file locations
        new_mounts = {}
        for remote_path in config["file_mounts"]:
            new_mounts[remote_path] = remote_path
        remote_config["file_mounts"] = new_mounts
        remote_config["no_restart"] = no_restart

        remote_config = provider.prepare_for_head_node(remote_config)

        # Now inject the rewritten config and SSH key into the head node
        remote_config_file = tempfile.NamedTemporaryFile(
            "w", prefix="ray-bootstrap-")
        remote_config_file.write(json.dumps(remote_config))
        remote_config_file.flush()
        config["file_mounts"].update({
            "~/ray_bootstrap_config.yaml": remote_config_file.name
        })

        if "ssh_private_key" in config["auth"]:
            config["file_mounts"].update({
                remote_key_path: config["auth"]["ssh_private_key"],
            })
        cli_logger.print("Prepared bootstrap config")

        if restart_only:
            setup_commands = []
            ray_start_commands = config["head_start_ray_commands"]
        elif no_restart:
            setup_commands = config["head_setup_commands"]
            ray_start_commands = []
        else:
            setup_commands = config["head_setup_commands"]
            ray_start_commands = config["head_start_ray_commands"]

        if not no_restart:
            warn_about_bad_start_command(ray_start_commands)

        updater = NodeUpdaterThread(
            node_id=head_node,
            provider_config=config["provider"],
            provider=provider,
            auth_config=config["auth"],
            cluster_name=config["cluster_name"],
            file_mounts=config["file_mounts"],
            initialization_commands=config["initialization_commands"],
            setup_commands=setup_commands,
            ray_start_commands=ray_start_commands,
            process_runner=_runner,
            runtime_hash=runtime_hash,
            file_mounts_contents_hash=file_mounts_contents_hash,
            is_head_node=True,
            rsync_options={
                "rsync_exclude": config.get("rsync_exclude"),
                "rsync_filter": config.get("rsync_filter")
            },
            docker_config=config.get("docker"))
        updater.start()
        updater.join()

        # Refresh the node cache so we see the external ip if available
        provider.non_terminated_nodes(head_node_tags)

        if updater.exitcode != 0:
            # todo: this does not follow the mockup and is not good enough
            cli_logger.abort("Failed to setup head node.")
            sys.exit(1)

    monitor_str = "tail -n 100 -f /tmp/ray/session_latest/logs/monitor*"
    if override_cluster_name:
        modifiers = " --cluster-name={}".format(quote(override_cluster_name))
    else:
        modifiers = ""

    cli_logger.newline()
    with cli_logger.group("Useful commands"):
        cli_logger.print("Monitor autoscaling with")
        cli_logger.print(
            cf.bold(" ray exec {}{} {}"), config_file, modifiers,
            quote(monitor_str))

        cli_logger.print("Connect to a terminal on the cluster head:")
        cli_logger.print(cf.bold(" ray attach {}{}"), config_file, modifiers)

        remote_shell_str = updater.cmd_runner.remote_shell_command_str()
        cli_logger.print("Get a remote shell to the cluster manually:")
        cli_logger.print(" {}", remote_shell_str.strip())
def teardown_cluster(config_file: str, yes: bool, workers_only: bool,
                     override_cluster_name: Optional[str],
                     keep_min_workers: bool) -> None:
    """Destroys all nodes of a Ray cluster described by a config json."""
    config = yaml.safe_load(open(config_file).read())
    if override_cluster_name is not None:
        config["cluster_name"] = override_cluster_name
    config = prepare_config(config)
    validate_config(config)

    cli_logger.confirm(yes, "Destroying cluster.", _abort=True)

    if not workers_only:
        try:
            exec_cluster(
                config_file,
                cmd="ray stop",
                run_env="auto",
                screen=False,
                tmux=False,
                stop=False,
                start=False,
                override_cluster_name=override_cluster_name,
                port_forward=None,
                with_output=False)
        except Exception as e:
            # todo: add better exception info
            cli_logger.verbose_error("{}", str(e))
            cli_logger.warning(
                "Exception occurred when stopping the cluster Ray runtime "
                "(use -v to dump teardown exceptions).")
            cli_logger.warning(
                "Ignoring the exception and "
                "attempting to shut down the cluster nodes anyway.")

    provider = _get_node_provider(config["provider"], config["cluster_name"])

    def remaining_nodes():
        workers = provider.non_terminated_nodes({
            TAG_RAY_NODE_KIND: NODE_KIND_WORKER
        })

        if keep_min_workers:
            min_workers = config.get("min_workers", 0)
            cli_logger.print(
                "{} random worker nodes will not be shut down. " +
                cf.dimmed("(due to {})"), cf.bold(min_workers),
                cf.bold("--keep-min-workers"))

            workers = random.sample(workers, len(workers) - min_workers)

        # todo: it's weird to kill the head node but not all workers
        if workers_only:
            cli_logger.print(
                "The head node will not be shut down. " +
                cf.dimmed("(due to {})"), cf.bold("--workers-only"))

            return workers

        head = provider.non_terminated_nodes({
            TAG_RAY_NODE_KIND: NODE_KIND_HEAD
        })

        return head + workers

    def run_docker_stop(node, container_name):
        try:
            exec_cluster(
                config_file,
                cmd=f"docker stop {container_name}",
                run_env="host",
                screen=False,
                tmux=False,
                stop=False,
                start=False,
                override_cluster_name=override_cluster_name,
                port_forward=None,
                with_output=False)
        except Exception:
            cli_logger.warning(f"Docker stop failed on {node}")

    # Loop here to check that both the head and worker nodes are actually
    # really gone
    A = remaining_nodes()

    container_name = config.get("docker", {}).get("container_name")
    if container_name:
        for node in A:
            run_docker_stop(node, container_name)

    with LogTimer("teardown_cluster: done."):
        while A:
            provider.terminate_nodes(A)

            cli_logger.print(
                "Requested {} nodes to shut down.",
                cf.bold(len(A)),
                _tags=dict(interval="1s"))

            time.sleep(POLL_INTERVAL)  # todo: interval should be a variable
            A = remaining_nodes()
            cli_logger.print("{} nodes remaining after {} second(s).",
                             cf.bold(len(A)), POLL_INTERVAL)
        cli_logger.success("No nodes remaining.")
def testClusterStateInit(self):
    """Check ClusterState __init__ func generates correct state file.

    Test the general use case and if num_workers increase/decrease.
    """
    # Use a random head_ip so that the state file is regenerated each time
    # this test is run. (Otherwise the test will fail spuriously when run a
    # second time.)
    self._monkeypatch.setenv("RAY_TMPDIR", self._tmpdir)
    # ensure that a new cluster can start up if RAY_TMPDIR doesn't exist yet
    assert not os.path.exists(get_ray_temp_dir())
    head_ip = ".".join(str(random.randint(0, 255)) for _ in range(4))
    cluster_config = {
        "cluster_name": "random_name",
        "min_workers": 0,
        "max_workers": 0,
        "provider": {
            "type": "local",
            "head_ip": head_ip,
            "worker_ips": ["0.0.0.0:1"],
            "external_head_ip": "0.0.0.0.3",
        },
    }
    provider_config = cluster_config["provider"]
    node_provider = _get_node_provider(
        provider_config, cluster_config["cluster_name"], use_cache=False
    )
    assert os.path.exists(get_ray_temp_dir())
    assert node_provider.external_ip(head_ip) == "0.0.0.0.3"
    assert isinstance(node_provider, LocalNodeProvider)
    expected_workers = {}
    expected_workers[provider_config["head_ip"]] = {
        "tags": {TAG_RAY_NODE_KIND: NODE_KIND_HEAD},
        "state": "terminated",
        "external_ip": "0.0.0.0.3",
    }
    expected_workers[provider_config["worker_ips"][0]] = {
        "tags": {TAG_RAY_NODE_KIND: NODE_KIND_WORKER},
        "state": "terminated",
    }

    state_save_path = local_config.get_state_path(cluster_config["cluster_name"])

    assert os.path.exists(state_save_path)
    workers = json.loads(open(state_save_path).read())
    assert workers == expected_workers

    # Test removing workers updates the cluster state.
    del expected_workers[provider_config["worker_ips"][0]]
    removed_ip = provider_config["worker_ips"].pop()
    node_provider = _get_node_provider(
        provider_config, cluster_config["cluster_name"], use_cache=False
    )
    workers = json.loads(open(state_save_path).read())
    assert workers == expected_workers

    # Test adding back workers updates the cluster state.
    expected_workers[removed_ip] = {
        "tags": {TAG_RAY_NODE_KIND: NODE_KIND_WORKER},
        "state": "terminated",
    }
    provider_config["worker_ips"].append(removed_ip)
    node_provider = _get_node_provider(
        provider_config, cluster_config["cluster_name"], use_cache=False
    )
    workers = json.loads(open(state_save_path).read())
    assert workers == expected_workers

    # Test record_local_head_state_if_needed
    head_ip = cluster_config["provider"]["head_ip"]
    cluster_name = cluster_config["cluster_name"]
    node_provider = _get_node_provider(
        provider_config, cluster_config["cluster_name"], use_cache=False
    )
    assert head_ip not in node_provider.non_terminated_nodes({})
    record_local_head_state_if_needed(node_provider)
    assert head_ip in node_provider.non_terminated_nodes({})
    expected_head_tags = {
        TAG_RAY_NODE_KIND: NODE_KIND_HEAD,
        TAG_RAY_USER_NODE_TYPE: local_config.LOCAL_CLUSTER_NODE_TYPE,
        TAG_RAY_NODE_NAME: "ray-{}-head".format(cluster_name),
        TAG_RAY_NODE_STATUS: STATUS_UP_TO_DATE,
    }
    assert node_provider.node_tags(head_ip) == expected_head_tags
    # Repeat and verify nothing has changed.
    record_local_head_state_if_needed(node_provider)
    assert head_ip in node_provider.non_terminated_nodes({})
    assert node_provider.node_tags(head_ip) == expected_head_tags
def testCoordinatorSenderNodeProvider(self):
    """Integration test of CoordinatorSenderNodeProvider."""
    cluster_config = {
        "cluster_name": "random_name",
        "min_workers": 0,
        "max_workers": 0,
        "provider": {
            "type": "local",
            "coordinator_address": self.coordinator_address,
        },
        "head_node": {},
        "worker_nodes": {},
    }
    provider_config = cluster_config["provider"]
    node_provider_1 = _get_node_provider(
        provider_config, cluster_config["cluster_name"], use_cache=False
    )
    assert isinstance(node_provider_1, CoordinatorSenderNodeProvider)

    assert not node_provider_1.non_terminated_nodes({})
    assert not node_provider_1.is_running(self.list_of_node_ips[0])
    assert node_provider_1.is_terminated(self.list_of_node_ips[0])
    assert not node_provider_1.node_tags(self.list_of_node_ips[0])
    head_node_tags = {
        TAG_RAY_NODE_KIND: NODE_KIND_HEAD,
    }
    assert not node_provider_1.non_terminated_nodes(head_node_tags)
    head_node_tags[TAG_RAY_NODE_NAME] = "ray-{}-head".format(
        cluster_config["cluster_name"]
    )
    node_provider_1.create_node(cluster_config["head_node"], head_node_tags, 1)
    assert node_provider_1.non_terminated_nodes({}) == [self.list_of_node_ips[0]]
    head_node_tags[TAG_RAY_CLUSTER_NAME] = cluster_config["cluster_name"]
    assert node_provider_1.node_tags(self.list_of_node_ips[0]) == head_node_tags
    assert node_provider_1.is_running(self.list_of_node_ips[0])
    assert not node_provider_1.is_terminated(self.list_of_node_ips[0])

    # Add another cluster.
    cluster_config["cluster_name"] = "random_name_2"
    provider_config = cluster_config["provider"]
    node_provider_2 = _get_node_provider(
        provider_config, cluster_config["cluster_name"], use_cache=False
    )
    assert not node_provider_2.non_terminated_nodes({})
    assert not node_provider_2.is_running(self.list_of_node_ips[1])
    assert node_provider_2.is_terminated(self.list_of_node_ips[1])
    assert not node_provider_2.node_tags(self.list_of_node_ips[1])
    assert not node_provider_2.non_terminated_nodes(head_node_tags)
    head_node_tags[TAG_RAY_NODE_NAME] = "ray-{}-head".format(
        cluster_config["cluster_name"]
    )
    node_provider_2.create_node(cluster_config["head_node"], head_node_tags, 1)
    assert node_provider_2.non_terminated_nodes({}) == [self.list_of_node_ips[1]]
    head_node_tags[TAG_RAY_CLUSTER_NAME] = cluster_config["cluster_name"]
    assert node_provider_2.node_tags(self.list_of_node_ips[1]) == head_node_tags
    assert node_provider_2.is_running(self.list_of_node_ips[1])
    assert not node_provider_2.is_terminated(self.list_of_node_ips[1])

    # Add another cluster (should fail because we only have two nodes).
    cluster_config["cluster_name"] = "random_name_3"
    provider_config = cluster_config["provider"]
    node_provider_3 = _get_node_provider(
        provider_config, cluster_config["cluster_name"], use_cache=False
    )
    assert not node_provider_3.non_terminated_nodes(head_node_tags)
    head_node_tags[TAG_RAY_NODE_NAME] = "ray-{}-head".format(
        cluster_config["cluster_name"]
    )
    node_provider_3.create_node(cluster_config["head_node"], head_node_tags, 1)
    assert not node_provider_3.non_terminated_nodes({})

    # Terminate all nodes.
    node_provider_1.terminate_node(self.list_of_node_ips[0])
    assert not node_provider_1.non_terminated_nodes({})
    node_provider_2.terminate_node(self.list_of_node_ips[1])
    assert not node_provider_2.non_terminated_nodes({})

    # Check if now we can create more clusters/nodes.
    node_provider_3.create_node(cluster_config["head_node"], head_node_tags, 1)
    worker_node_tags = {
        TAG_RAY_NODE_NAME: "ray-{}-worker".format(cluster_config["cluster_name"]),
        TAG_RAY_NODE_KIND: NODE_KIND_WORKER,
    }
    node_provider_3.create_node(cluster_config["worker_nodes"], worker_node_tags, 1)
    assert node_provider_3.non_terminated_nodes({}) == self.list_of_node_ips
    worker_filter = {TAG_RAY_NODE_KIND: NODE_KIND_WORKER}
    assert node_provider_3.non_terminated_nodes(worker_filter) == [
        self.list_of_node_ips[1]
    ]
    head_filter = {TAG_RAY_NODE_KIND: NODE_KIND_HEAD}
    assert node_provider_3.non_terminated_nodes(head_filter) == [
        self.list_of_node_ips[0]
    ]
def testClusterStateInit(self):
    """Check ClusterState __init__ func generates correct state file.

    Test the general use case and if num_workers increase/decrease.
    """
    cluster_config = {
        "cluster_name": "random_name",
        "min_workers": 0,
        "max_workers": 0,
        "initial_workers": 0,
        "provider": {
            "type": "local",
            "head_ip": "0.0.0.0:2",
            "worker_ips": ["0.0.0.0:1"]
        },
    }
    provider_config = cluster_config["provider"]
    node_provider = _get_node_provider(provider_config,
                                       cluster_config["cluster_name"],
                                       use_cache=False)
    assert isinstance(node_provider, LocalNodeProvider)
    expected_workers = {}
    expected_workers[provider_config["head_ip"]] = {
        "tags": {
            TAG_RAY_NODE_KIND: NODE_KIND_HEAD
        },
        "state": "terminated",
    }
    expected_workers[provider_config["worker_ips"][0]] = {
        "tags": {
            TAG_RAY_NODE_KIND: NODE_KIND_WORKER
        },
        "state": "terminated",
    }

    state_save_path = "/tmp/cluster-{}.state".format(
        cluster_config["cluster_name"])

    assert os.path.exists(state_save_path)
    workers = json.loads(open(state_save_path).read())
    assert workers == expected_workers

    # Test removing workers updates the cluster state.
    del expected_workers[provider_config["worker_ips"][0]]
    removed_ip = provider_config["worker_ips"].pop()
    node_provider = _get_node_provider(provider_config,
                                       cluster_config["cluster_name"],
                                       use_cache=False)
    workers = json.loads(open(state_save_path).read())
    assert workers == expected_workers

    # Test adding back workers updates the cluster state.
    expected_workers[removed_ip] = {
        "tags": {
            TAG_RAY_NODE_KIND: NODE_KIND_WORKER
        },
        "state": "terminated",
    }
    provider_config["worker_ips"].append(removed_ip)
    node_provider = _get_node_provider(provider_config,
                                       cluster_config["cluster_name"],
                                       use_cache=False)
    workers = json.loads(open(state_save_path).read())
    assert workers == expected_workers
def get_or_create_head_node(config: Dict[str, Any],
                            printable_config_file: str,
                            no_restart: bool,
                            restart_only: bool,
                            yes: bool,
                            override_cluster_name: Optional[str],
                            no_monitor_on_head: bool = False,
                            _provider: Optional[NodeProvider] = None,
                            _runner: ModuleType = subprocess) -> None:
    """Create the cluster head node, which in turn creates the workers."""
    global_event_system.execute_callback(
        CreateClusterEvent.cluster_booting_started)
    provider = (_provider or _get_node_provider(config["provider"],
                                                config["cluster_name"]))

    config = copy.deepcopy(config)
    head_node_tags = {
        TAG_RAY_NODE_KIND: NODE_KIND_HEAD,
    }
    nodes = provider.non_terminated_nodes(head_node_tags)
    if len(nodes) > 0:
        head_node = nodes[0]
    else:
        head_node = None

    if not head_node:
        cli_logger.confirm(
            yes,
            "No head node found. "
            "Launching a new cluster.",
            _abort=True)

    if head_node:
        if restart_only:
            cli_logger.confirm(
                yes,
                "Updating cluster configuration and "
                "restarting the cluster Ray runtime. "
                "Setup commands will not be run due to `{}`.\n",
                cf.bold("--restart-only"),
                _abort=True)
        elif no_restart:
            cli_logger.print(
                "Cluster Ray runtime will not be restarted due "
                "to `{}`.", cf.bold("--no-restart"))
            cli_logger.confirm(
                yes,
                "Updating cluster configuration and "
                "running setup commands.",
                _abort=True)
        else:
            cli_logger.print(
                "Updating cluster configuration and running full setup.")
            cli_logger.confirm(
                yes,
                cf.bold("Cluster Ray runtime will be restarted."),
                _abort=True)

    cli_logger.newline()

    # TODO(ekl) this logic is duplicated in node_launcher.py (keep in sync)
    head_node_config = copy.deepcopy(config["head_node"])
    head_node_resources = None
    if "head_node_type" in config:
        head_node_type = config["head_node_type"]
        head_node_tags[TAG_RAY_USER_NODE_TYPE] = head_node_type
        head_config = config["available_node_types"][head_node_type]
        head_node_config.update(head_config["node_config"])

        # Not necessary to keep in sync with node_launcher.py
        # Keep in sync with autoscaler.py _node_resources
        head_node_resources = head_config.get("resources")

    launch_hash = hash_launch_conf(head_node_config, config["auth"])
    if head_node is None or provider.node_tags(head_node).get(
            TAG_RAY_LAUNCH_CONFIG) != launch_hash:
        with cli_logger.group("Acquiring an up-to-date head node"):
            global_event_system.execute_callback(
                CreateClusterEvent.acquiring_new_head_node)
            if head_node is not None:
                cli_logger.print(
                    "Currently running head node is out-of-date with "
                    "cluster configuration")
                cli_logger.print(
                    "hash is {}, expected {}",
                    cf.bold(
                        provider.node_tags(head_node)
                        .get(TAG_RAY_LAUNCH_CONFIG)), cf.bold(launch_hash))
                cli_logger.confirm(yes, "Relaunching it.", _abort=True)

                provider.terminate_node(head_node)
                cli_logger.print("Terminated head node {}", head_node)

            head_node_tags[TAG_RAY_LAUNCH_CONFIG] = launch_hash
            head_node_tags[TAG_RAY_NODE_NAME] = "ray-{}-head".format(
                config["cluster_name"])
            head_node_tags[TAG_RAY_NODE_STATUS] = STATUS_UNINITIALIZED
            provider.create_node(head_node_config, head_node_tags, 1)
            cli_logger.print("Launched a new head node")

            start = time.time()
            head_node = None
            with cli_logger.group("Fetching the new head node"):
                while True:
                    if time.time() - start > 50:
                        cli_logger.abort(
                            "Head node fetch timed out.")  # todo: msg
                        raise RuntimeError("Failed to create head node.")
                    nodes = provider.non_terminated_nodes(head_node_tags)
                    if len(nodes) == 1:
                        head_node = nodes[0]
                        break
                    time.sleep(POLL_INTERVAL)
            cli_logger.newline()

    global_event_system.execute_callback(CreateClusterEvent.head_node_acquired)

    with cli_logger.group(
            "Setting up head node",
            _numbered=("<>", 1, 1),
            # cf.bold(provider.node_tags(head_node)[TAG_RAY_NODE_NAME]),
            _tags=dict()):  # add id, ARN to tags?

        # TODO(ekl) right now we always update the head node even if the
        # hash matches.
        # We could prompt the user for what they want to do here.

        # No need to pass in cluster_sync_files because we use this
        # hash to set up the head node
        (runtime_hash, file_mounts_contents_hash) = hash_runtime_conf(
            config["file_mounts"], None, config)

        if not no_monitor_on_head:
            # Return remote_config_file to avoid prematurely closing it.
            config, remote_config_file = _set_up_config_for_head_node(
                config, provider, no_restart)
            cli_logger.print("Prepared bootstrap config")

        if restart_only:
            # Docker may re-launch nodes, requiring setup
            # commands to be rerun.
            if config.get("docker", {}).get("container_name"):
                setup_commands = config["head_setup_commands"]
            else:
                setup_commands = []
            ray_start_commands = config["head_start_ray_commands"]
        elif no_restart:
            setup_commands = config["head_setup_commands"]
            ray_start_commands = []
        else:
            setup_commands = config["head_setup_commands"]
            ray_start_commands = config["head_start_ray_commands"]

        if not no_restart:
            warn_about_bad_start_command(ray_start_commands,
                                         no_monitor_on_head)

        updater = NodeUpdaterThread(
            node_id=head_node,
            provider_config=config["provider"],
            provider=provider,
            auth_config=config["auth"],
            cluster_name=config["cluster_name"],
            file_mounts=config["file_mounts"],
            initialization_commands=config["initialization_commands"],
            setup_commands=setup_commands,
            ray_start_commands=ray_start_commands,
            process_runner=_runner,
            runtime_hash=runtime_hash,
            file_mounts_contents_hash=file_mounts_contents_hash,
            is_head_node=True,
            node_resources=head_node_resources,
            rsync_options={
                "rsync_exclude": config.get("rsync_exclude"),
                "rsync_filter": config.get("rsync_filter")
            },
            docker_config=config.get("docker"),
            restart_only=restart_only)
        updater.start()
        updater.join()

        # Refresh the node cache so we see the external ip if available
        provider.non_terminated_nodes(head_node_tags)

        if updater.exitcode != 0:
            # todo: this does not follow the mockup and is not good enough
            cli_logger.abort("Failed to setup head node.")
            sys.exit(1)

    global_event_system.execute_callback(
        CreateClusterEvent.cluster_booting_completed, {
            "head_node_id": head_node,
        })

    monitor_str = "tail -n 100 -f /tmp/ray/session_latest/logs/monitor*"
    if override_cluster_name:
        modifiers = " --cluster-name={}".format(quote(override_cluster_name))
    else:
        modifiers = ""

    cli_logger.newline()
    with cli_logger.group("Useful commands"):
        printable_config_file = os.path.abspath(printable_config_file)
        cli_logger.print("Monitor autoscaling with")
        cli_logger.print(
            cf.bold(" ray exec {}{} {}"), printable_config_file, modifiers,
            quote(monitor_str))

        cli_logger.print("Connect to a terminal on the cluster head:")
        cli_logger.print(
            cf.bold(" ray attach {}{}"), printable_config_file, modifiers)

        remote_shell_str = updater.cmd_runner.remote_shell_command_str()
        cli_logger.print("Get a remote shell to the cluster manually:")
        cli_logger.print(" {}", remote_shell_str.strip())