def reset(self, errors_fatal=False):
    sync_continuously = False
    if hasattr(self, "config"):
        sync_continuously = self.config.get(
            "file_mounts_sync_continuously", False)
    try:
        with open(self.config_path) as f:
            new_config = yaml.safe_load(f.read())
        if new_config != getattr(self, "config", None):
            try:
                validate_config(new_config)
            except Exception as e:
                logger.debug(
                    "Cluster config validation failed. The version of "
                    "the ray CLI you launched this cluster with may "
                    "be higher than the version of ray being run on "
                    "the cluster. Some new features may not be "
                    "available until you upgrade ray on your cluster.",
                    exc_info=e)
        (new_runtime_hash,
         new_file_mounts_contents_hash) = hash_runtime_conf(
             new_config["file_mounts"],
             new_config["cluster_synced_files"],
             [
                 new_config["worker_setup_commands"],
                 new_config["worker_start_ray_commands"],
             ],
             generate_file_mounts_contents_hash=sync_continuously,
         )
        self.config = new_config
        self.runtime_hash = new_runtime_hash
        self.file_mounts_contents_hash = new_file_mounts_contents_hash
        if not self.provider:
            self.provider = _get_node_provider(self.config["provider"],
                                               self.config["cluster_name"])
        # Check whether we can enable the resource demand scheduler.
        if "available_node_types" in self.config:
            self.available_node_types = self.config["available_node_types"]
            self.resource_demand_scheduler = ResourceDemandScheduler(
                self.provider, self.available_node_types,
                self.config["max_workers"])
        else:
            self.available_node_types = None
            self.resource_demand_scheduler = None
    except Exception as e:
        if errors_fatal:
            raise e
        else:
            logger.exception("StandardAutoscaler: "
                             "Error parsing config.")
def reset(self, errors_fatal=False):
    sync_continuously = False
    if hasattr(self, "config"):
        sync_continuously = self.config.get(
            "file_mounts_sync_continuously", False)
    try:
        with open(self.config_path) as f:
            new_config = yaml.safe_load(f.read())
        validate_config(new_config)
        (new_runtime_hash,
         new_file_mounts_contents_hash) = hash_runtime_conf(
             new_config["file_mounts"],
             new_config["cluster_synced_files"],
             [
                 new_config["worker_setup_commands"],
                 new_config["worker_start_ray_commands"],
             ],
             generate_file_mounts_contents_hash=sync_continuously,
         )
        self.config = new_config
        self.runtime_hash = new_runtime_hash
        self.file_mounts_contents_hash = new_file_mounts_contents_hash
        if not self.provider:
            self.provider = _get_node_provider(self.config["provider"],
                                               self.config["cluster_name"])
        # Check whether we can enable the resource demand scheduler.
        if "available_node_types" in self.config:
            self.available_node_types = self.config["available_node_types"]
            self.resource_demand_scheduler = ResourceDemandScheduler(
                self.provider, self.available_node_types,
                self.config["max_workers"])
        else:
            self.available_node_types = None
            self.resource_demand_scheduler = None
    except Exception as e:
        if errors_fatal:
            raise e
        else:
            logger.exception("StandardAutoscaler: "
                             "Error parsing config.")
def reset(self, errors_fatal=False):
    sync_continuously = False
    if hasattr(self, "config"):
        sync_continuously = self.config.get(
            "file_mounts_sync_continuously", False)
    try:
        with open(self.config_path) as f:
            new_config = yaml.safe_load(f.read())
        if new_config != getattr(self, "config", None):
            try:
                validate_config(new_config)
            except Exception as e:
                logger.debug(
                    "Cluster config validation failed. The version of "
                    "the ray CLI you launched this cluster with may "
                    "be higher than the version of ray being run on "
                    "the cluster. Some new features may not be "
                    "available until you upgrade ray on your cluster.",
                    exc_info=e)
        (new_runtime_hash,
         new_file_mounts_contents_hash) = hash_runtime_conf(
             new_config["file_mounts"],
             new_config["cluster_synced_files"],
             [
                 new_config["worker_setup_commands"],
                 new_config["worker_start_ray_commands"],
             ],
             generate_file_mounts_contents_hash=sync_continuously,
         )
        self.config = new_config
        self.runtime_hash = new_runtime_hash
        self.file_mounts_contents_hash = new_file_mounts_contents_hash
        if not self.provider:
            self.provider = _get_node_provider(self.config["provider"],
                                               self.config["cluster_name"])
        self.available_node_types = self.config["available_node_types"]
        upscaling_speed = self.config.get("upscaling_speed")
        aggressive = self.config.get("autoscaling_mode") == "aggressive"
        target_utilization_fraction = self.config.get(
            "target_utilization_fraction")
        if upscaling_speed:
            upscaling_speed = float(upscaling_speed)
        # TODO(ameer): consider adding (if users ask) an option of
        # initial_upscaling_num_workers.
        elif aggressive:
            upscaling_speed = 99999
            logger.warning(
                "Legacy aggressive autoscaling mode "
                "detected. Replacing it by setting upscaling_speed to "
                "99999.")
        elif target_utilization_fraction:
            upscaling_speed = (
                1 / max(target_utilization_fraction, 0.001) - 1)
            logger.warning(
                "Legacy target_utilization_fraction config "
                "detected. Replacing it by setting upscaling_speed to "
                "1 / target_utilization_fraction - 1.")
        else:
            upscaling_speed = 1.0
        if self.resource_demand_scheduler:
            # The node types are autofilled internally for legacy yamls,
            # overwriting the class will remove the inferred node resources
            # for legacy yamls.
            self.resource_demand_scheduler.reset_config(
                self.provider, self.available_node_types,
                self.config["max_workers"], self.config["head_node_type"],
                upscaling_speed)
        else:
            self.resource_demand_scheduler = ResourceDemandScheduler(
                self.provider, self.available_node_types,
                self.config["max_workers"], self.config["head_node_type"],
                upscaling_speed)
    except Exception as e:
        if errors_fatal:
            raise e
        else:
            logger.exception("StandardAutoscaler: "
                             "Error parsing config.")
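
# For reference, the legacy-config handling in the latest reset() above
# collapses to a single mapping from old autoscaling options onto
# upscaling_speed. A minimal standalone sketch of that mapping follows;
# the helper name resolve_upscaling_speed is hypothetical, not part of Ray.


def resolve_upscaling_speed(config: dict) -> float:
    """Mirror the precedence used in reset() above.

    An explicit upscaling_speed wins, then legacy aggressive mode,
    then legacy target_utilization_fraction, then the default of 1.0.
    """
    if config.get("upscaling_speed"):
        return float(config["upscaling_speed"])
    if config.get("autoscaling_mode") == "aggressive":
        return 99999
    fraction = config.get("target_utilization_fraction")
    if fraction:
        # E.g. a target utilization of 0.8 yields 1 / 0.8 - 1 = 0.25.
        return 1 / max(fraction, 0.001) - 1
    return 1.0


# Example: a legacy yaml targeting 80% utilization maps to a speed of 0.25.
assert resolve_upscaling_speed({"target_utilization_fraction": 0.8}) == 0.25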
def get_or_create_head_node(config, config_file, no_restart, restart_only,
                            yes, override_cluster_name, _provider=None,
                            _runner=subprocess):
    """Create the cluster head node, which in turn creates the workers."""
    provider = (_provider or _get_node_provider(config["provider"],
                                                config["cluster_name"]))
    config = copy.deepcopy(config)
    config_file = os.path.abspath(config_file)
    try:
        head_node_tags = {
            TAG_RAY_NODE_KIND: NODE_KIND_HEAD,
        }
        nodes = provider.non_terminated_nodes(head_node_tags)
        if len(nodes) > 0:
            head_node = nodes[0]
        else:
            head_node = None

        if not head_node:
            cli_logger.confirm(
                yes,
                "No head node found. "
                "Launching a new cluster.",
                _abort=True)
            cli_logger.old_confirm("This will create a new cluster", yes)
        elif not no_restart:
            cli_logger.old_confirm("This will restart cluster services", yes)

        if head_node:
            if restart_only:
                cli_logger.confirm(
                    yes,
                    "Updating cluster configuration and "
                    "restarting the cluster Ray runtime. "
                    "Setup commands will not be run due to `{}`.\n",
                    cf.bold("--restart-only"),
                    _abort=True)
            elif no_restart:
                cli_logger.print(
                    "Cluster Ray runtime will not be restarted due "
                    "to `{}`.", cf.bold("--no-restart"))
                cli_logger.confirm(
                    yes,
                    "Updating cluster configuration and "
                    "running setup commands.",
                    _abort=True)
            else:
                cli_logger.print(
                    "Updating cluster configuration and running full setup.")
                cli_logger.confirm(
                    yes,
                    cf.bold("Cluster Ray runtime will be restarted."),
                    _abort=True)
        cli_logger.newline()

        # TODO(ekl) this logic is duplicated in node_launcher.py (keep in
        # sync)
        head_node_config = copy.deepcopy(config["head_node"])
        if "head_node_type" in config:
            head_node_tags[TAG_RAY_USER_NODE_TYPE] = config["head_node_type"]
            head_node_config.update(config["available_node_types"][
                config["head_node_type"]]["node_config"])

        launch_hash = hash_launch_conf(head_node_config, config["auth"])
        if head_node is None or provider.node_tags(head_node).get(
                TAG_RAY_LAUNCH_CONFIG) != launch_hash:
            with cli_logger.group("Acquiring an up-to-date head node"):
                if head_node is not None:
                    cli_logger.print(
                        "Currently running head node is out-of-date with "
                        "cluster configuration")
                    cli_logger.print(
                        "hash is {}, expected {}",
                        cf.bold(
                            provider.node_tags(head_node).get(
                                TAG_RAY_LAUNCH_CONFIG)),
                        cf.bold(launch_hash))
                    cli_logger.confirm(yes, "Relaunching it.", _abort=True)
                    cli_logger.old_confirm(
                        "Head node config out-of-date. It will be terminated",
                        yes)

                    cli_logger.old_info(
                        logger, "get_or_create_head_node: "
                        "Shutting down outdated head node {}", head_node)

                    provider.terminate_node(head_node)
                    cli_logger.print("Terminated head node {}", head_node)

                cli_logger.old_info(
                    logger,
                    "get_or_create_head_node: Launching new head node...")

                head_node_tags[TAG_RAY_LAUNCH_CONFIG] = launch_hash
                head_node_tags[TAG_RAY_NODE_NAME] = "ray-{}-head".format(
                    config["cluster_name"])
                provider.create_node(head_node_config, head_node_tags, 1)
                cli_logger.print("Launched a new head node")

                start = time.time()
                head_node = None
                with cli_logger.timed("Fetching the new head node"):
                    while True:
                        if time.time() - start > 50:
                            cli_logger.abort(
                                "Head node fetch timed out.")  # todo: msg
                            raise RuntimeError("Failed to create head node.")
                        nodes = provider.non_terminated_nodes(head_node_tags)
                        if len(nodes) == 1:
                            head_node = nodes[0]
                            break
                        time.sleep(POLL_INTERVAL)
                cli_logger.newline()

        with cli_logger.group(
                "Setting up head node",
                _numbered=("<>", 1, 1),
                # cf.bold(provider.node_tags(head_node)[TAG_RAY_NODE_NAME]),
                _tags=dict()):  # add id, ARN to tags?
            # TODO(ekl) right now we always update the head node even if the
            # hash matches.
            # We could prompt the user for what they want to do here.
            # No need to pass in cluster_sync_files because we use this
            # hash to set up the head node
            (runtime_hash, file_mounts_contents_hash) = hash_runtime_conf(
                config["file_mounts"], None, config)

            cli_logger.old_info(
                logger,
                "get_or_create_head_node: Updating files on head node...")

            # Rewrite the auth config so that the head
            # node can update the workers
            remote_config = copy.deepcopy(config)

            # drop proxy options if they exist, otherwise
            # head node won't be able to connect to workers
            remote_config["auth"].pop("ssh_proxy_command", None)

            if "ssh_private_key" in config["auth"]:
                remote_key_path = "~/ray_bootstrap_key.pem"
                remote_config["auth"]["ssh_private_key"] = remote_key_path

            # Adjust for new file locations
            new_mounts = {}
            for remote_path in config["file_mounts"]:
                new_mounts[remote_path] = remote_path
            remote_config["file_mounts"] = new_mounts
            remote_config["no_restart"] = no_restart

            remote_config = provider.prepare_for_head_node(remote_config)

            # Now inject the rewritten config and SSH key into the head node
            remote_config_file = tempfile.NamedTemporaryFile(
                "w", prefix="ray-bootstrap-")
            remote_config_file.write(json.dumps(remote_config))
            remote_config_file.flush()
            config["file_mounts"].update(
                {"~/ray_bootstrap_config.yaml": remote_config_file.name})

            if "ssh_private_key" in config["auth"]:
                config["file_mounts"].update({
                    remote_key_path: config["auth"]["ssh_private_key"],
                })
            cli_logger.print("Prepared bootstrap config")

            if restart_only:
                setup_commands = []
                ray_start_commands = config["head_start_ray_commands"]
            elif no_restart:
                setup_commands = config["head_setup_commands"]
                ray_start_commands = []
            else:
                setup_commands = config["head_setup_commands"]
                ray_start_commands = config["head_start_ray_commands"]

            if not no_restart:
                warn_about_bad_start_command(ray_start_commands)

            updater = NodeUpdaterThread(
                node_id=head_node,
                provider_config=config["provider"],
                provider=provider,
                auth_config=config["auth"],
                cluster_name=config["cluster_name"],
                file_mounts=config["file_mounts"],
                initialization_commands=config["initialization_commands"],
                setup_commands=setup_commands,
                ray_start_commands=ray_start_commands,
                process_runner=_runner,
                runtime_hash=runtime_hash,
                file_mounts_contents_hash=file_mounts_contents_hash,
                is_head_node=True,
                docker_config=config.get("docker"))
            updater.start()
            updater.join()

            # Refresh the node cache so we see the external ip if available
            provider.non_terminated_nodes(head_node_tags)

            if config.get("provider", {}).get("use_internal_ips",
                                              False) is True:
                head_node_ip = provider.internal_ip(head_node)
            else:
                head_node_ip = provider.external_ip(head_node)

            if updater.exitcode != 0:
                # todo: this does not follow the mockup and is not good enough
                cli_logger.abort("Failed to setup head node.")

                cli_logger.old_error(
                    logger, "get_or_create_head_node: "
                    "Updating {} failed", head_node_ip)
                sys.exit(1)

            cli_logger.old_info(
                logger, "get_or_create_head_node: "
                "Head node up-to-date, IP address is: {}", head_node_ip)

        monitor_str = "tail -n 100 -f /tmp/ray/session_latest/logs/monitor*"
        if override_cluster_name:
            modifiers = " --cluster-name={}".format(
                quote(override_cluster_name))
        else:
            modifiers = ""

        if cli_logger.old_style:
            print("To monitor autoscaling activity, you can run:\n\n"
                  "  ray exec {} {}{}\n".format(config_file,
                                                quote(monitor_str),
                                                modifiers))
            print("To open a console on the cluster:\n\n"
                  "  ray attach {}{}\n".format(config_file, modifiers))

            print("To get a remote shell to the cluster manually, run:\n\n"
                  "  {}\n".format(
                      updater.cmd_runner.remote_shell_command_str()))

        cli_logger.newline()
        with cli_logger.group("Useful commands"):
            cli_logger.print("Monitor autoscaling with")
            cli_logger.print(
                cf.bold("  ray exec {}{} {}"), config_file, modifiers,
                quote(monitor_str))

            cli_logger.print("Connect to a terminal on the cluster head:")
            cli_logger.print(
                cf.bold("  ray attach {}{}"), config_file, modifiers)

            remote_shell_str = updater.cmd_runner.remote_shell_command_str()
            cli_logger.print("Get a remote shell to the cluster manually:")
            cli_logger.print("  {}", remote_shell_str.strip())
    finally:
        provider.cleanup()
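
# A caller of the function above would look roughly like the following.
# This is a sketch, not Ray's actual `ray up` entry point: the driver name
# `up` and the `_bootstrap_config` fill-in step are assumptions standing in
# for the provider-specific bootstrapping the real CLI performs.


def up(cluster_config_file, yes=False, override_cluster_name=None):
    import yaml  # local import to keep the sketch self-contained
    with open(cluster_config_file) as f:
        config = yaml.safe_load(f.read())
    if override_cluster_name is not None:
        config["cluster_name"] = override_cluster_name
    config = _bootstrap_config(config)  # hypothetical provider fill-in
    get_or_create_head_node(
        config,
        cluster_config_file,
        no_restart=False,
        restart_only=False,
        yes=yes,
        override_cluster_name=override_cluster_name)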
def get_or_create_head_node(config: Dict[str, Any],
                            printable_config_file: str,
                            no_restart: bool,
                            restart_only: bool,
                            yes: bool,
                            override_cluster_name: Optional[str],
                            no_monitor_on_head: bool = False,
                            _provider: Optional[NodeProvider] = None,
                            _runner: ModuleType = subprocess) -> None:
    """Create the cluster head node, which in turn creates the workers."""
    global_event_system.execute_callback(
        CreateClusterEvent.cluster_booting_started)
    provider = (_provider or _get_node_provider(config["provider"],
                                                config["cluster_name"]))

    config = copy.deepcopy(config)
    head_node_tags = {
        TAG_RAY_NODE_KIND: NODE_KIND_HEAD,
    }
    nodes = provider.non_terminated_nodes(head_node_tags)
    if len(nodes) > 0:
        head_node = nodes[0]
    else:
        head_node = None

    if not head_node:
        cli_logger.confirm(
            yes,
            "No head node found. "
            "Launching a new cluster.",
            _abort=True)

    if head_node:
        if restart_only:
            cli_logger.confirm(
                yes,
                "Updating cluster configuration and "
                "restarting the cluster Ray runtime. "
                "Setup commands will not be run due to `{}`.\n",
                cf.bold("--restart-only"),
                _abort=True)
        elif no_restart:
            cli_logger.print(
                "Cluster Ray runtime will not be restarted due "
                "to `{}`.", cf.bold("--no-restart"))
            cli_logger.confirm(
                yes,
                "Updating cluster configuration and "
                "running setup commands.",
                _abort=True)
        else:
            cli_logger.print(
                "Updating cluster configuration and running full setup.")
            cli_logger.confirm(
                yes,
                cf.bold("Cluster Ray runtime will be restarted."),
                _abort=True)

    cli_logger.newline()

    # TODO(ekl) this logic is duplicated in node_launcher.py (keep in sync)
    head_node_config = copy.deepcopy(config["head_node"])
    head_node_resources = None
    if "head_node_type" in config:
        head_node_type = config["head_node_type"]
        head_node_tags[TAG_RAY_USER_NODE_TYPE] = head_node_type
        head_config = config["available_node_types"][head_node_type]
        head_node_config.update(head_config["node_config"])

        # Not necessary to keep in sync with node_launcher.py
        # Keep in sync with autoscaler.py _node_resources
        head_node_resources = head_config.get("resources")

    launch_hash = hash_launch_conf(head_node_config, config["auth"])
    if head_node is None or provider.node_tags(head_node).get(
            TAG_RAY_LAUNCH_CONFIG) != launch_hash:
        with cli_logger.group("Acquiring an up-to-date head node"):
            global_event_system.execute_callback(
                CreateClusterEvent.acquiring_new_head_node)
            if head_node is not None:
                cli_logger.print(
                    "Currently running head node is out-of-date with "
                    "cluster configuration")
                cli_logger.print(
                    "hash is {}, expected {}",
                    cf.bold(
                        provider.node_tags(head_node)
                        .get(TAG_RAY_LAUNCH_CONFIG)), cf.bold(launch_hash))
                cli_logger.confirm(yes, "Relaunching it.", _abort=True)

                provider.terminate_node(head_node)
                cli_logger.print("Terminated head node {}", head_node)

            head_node_tags[TAG_RAY_LAUNCH_CONFIG] = launch_hash
            head_node_tags[TAG_RAY_NODE_NAME] = "ray-{}-head".format(
                config["cluster_name"])
            head_node_tags[TAG_RAY_NODE_STATUS] = STATUS_UNINITIALIZED
            provider.create_node(head_node_config, head_node_tags, 1)
            cli_logger.print("Launched a new head node")

            start = time.time()
            head_node = None
            with cli_logger.group("Fetching the new head node"):
                while True:
                    if time.time() - start > 50:
                        cli_logger.abort(
                            "Head node fetch timed out.")  # todo: msg
                        raise RuntimeError("Failed to create head node.")
                    nodes = provider.non_terminated_nodes(head_node_tags)
                    if len(nodes) == 1:
                        head_node = nodes[0]
                        break
                    time.sleep(POLL_INTERVAL)
            cli_logger.newline()

    global_event_system.execute_callback(
        CreateClusterEvent.head_node_acquired)

    with cli_logger.group(
            "Setting up head node",
            _numbered=("<>", 1, 1),
            # cf.bold(provider.node_tags(head_node)[TAG_RAY_NODE_NAME]),
            _tags=dict()):  # add id, ARN to tags?
        # TODO(ekl) right now we always update the head node even if the
        # hash matches.
        # We could prompt the user for what they want to do here.
        # No need to pass in cluster_sync_files because we use this
        # hash to set up the head node
        (runtime_hash, file_mounts_contents_hash) = hash_runtime_conf(
            config["file_mounts"], None, config)

        if not no_monitor_on_head:
            # Return remote_config_file to avoid prematurely closing it.
            config, remote_config_file = _set_up_config_for_head_node(
                config, provider, no_restart)
            cli_logger.print("Prepared bootstrap config")

        if restart_only:
            # Docker may re-launch nodes, requiring setup
            # commands to be rerun.
            if config.get("docker", {}).get("container_name"):
                setup_commands = config["head_setup_commands"]
            else:
                setup_commands = []
            ray_start_commands = config["head_start_ray_commands"]
        elif no_restart:
            setup_commands = config["head_setup_commands"]
            ray_start_commands = []
        else:
            setup_commands = config["head_setup_commands"]
            ray_start_commands = config["head_start_ray_commands"]

        if not no_restart:
            warn_about_bad_start_command(ray_start_commands,
                                         no_monitor_on_head)

        updater = NodeUpdaterThread(
            node_id=head_node,
            provider_config=config["provider"],
            provider=provider,
            auth_config=config["auth"],
            cluster_name=config["cluster_name"],
            file_mounts=config["file_mounts"],
            initialization_commands=config["initialization_commands"],
            setup_commands=setup_commands,
            ray_start_commands=ray_start_commands,
            process_runner=_runner,
            runtime_hash=runtime_hash,
            file_mounts_contents_hash=file_mounts_contents_hash,
            is_head_node=True,
            node_resources=head_node_resources,
            rsync_options={
                "rsync_exclude": config.get("rsync_exclude"),
                "rsync_filter": config.get("rsync_filter")
            },
            docker_config=config.get("docker"),
            restart_only=restart_only)
        updater.start()
        updater.join()

        # Refresh the node cache so we see the external ip if available
        provider.non_terminated_nodes(head_node_tags)

        if updater.exitcode != 0:
            # todo: this does not follow the mockup and is not good enough
            cli_logger.abort("Failed to setup head node.")
            sys.exit(1)

    global_event_system.execute_callback(
        CreateClusterEvent.cluster_booting_completed, {
            "head_node_id": head_node,
        })

    monitor_str = "tail -n 100 -f /tmp/ray/session_latest/logs/monitor*"
    if override_cluster_name:
        modifiers = " --cluster-name={}".format(quote(override_cluster_name))
    else:
        modifiers = ""

    cli_logger.newline()
    with cli_logger.group("Useful commands"):
        printable_config_file = os.path.abspath(printable_config_file)
        cli_logger.print("Monitor autoscaling with")
        cli_logger.print(
            cf.bold("  ray exec {}{} {}"), printable_config_file, modifiers,
            quote(monitor_str))

        cli_logger.print("Connect to a terminal on the cluster head:")
        cli_logger.print(
            cf.bold("  ray attach {}{}"), printable_config_file, modifiers)

        remote_shell_str = updater.cmd_runner.remote_shell_command_str()
        cli_logger.print("Get a remote shell to the cluster manually:")
        cli_logger.print("  {}", remote_shell_str.strip())