def terminate_nodes(self, node_ids):
    if not node_ids:
        return
    if self.cache_stopped_nodes:
        spot_ids = []
        on_demand_ids = []

        for node_id in node_ids:
            if self._get_cached_node(node_id).spot_instance_request_id:
                spot_ids += [node_id]
            else:
                on_demand_ids += [node_id]

        if on_demand_ids:
            # todo: show node names?
            cli_logger.print(
                "Stopping instances {} " + cf.dimmed(
                    "(to terminate instead, "
                    "set `cache_stopped_nodes: False` "
                    "under `provider` in the cluster configuration)"),
                cli_logger.render_list(on_demand_ids))
            self.ec2.meta.client.stop_instances(InstanceIds=on_demand_ids)
        if spot_ids:
            cli_logger.print(
                "Terminating instances {} " +
                cf.dimmed("(cannot stop spot instances, only terminate)"),
                cli_logger.render_list(spot_ids))
            self.ec2.meta.client.terminate_instances(InstanceIds=spot_ids)
    else:
        self.ec2.meta.client.terminate_instances(InstanceIds=node_ids)
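# A minimal, self-contained sketch of the stop-vs-terminate split above,
# assuming a hypothetical `is_spot` predicate in place of the cached EC2
# node lookup: spot instances cannot be stopped, so they are always
# terminated, while on-demand instances are stopped for later reuse.
def partition_nodes(node_ids, is_spot):
    spot_ids, on_demand_ids = [], []
    for node_id in node_ids:
        (spot_ids if is_spot(node_id) else on_demand_ids).append(node_id)
    return spot_ids, on_demand_ids


# Illustrative only: ids prefixed "spot-" stand in for instances that
# carry a spot instance request id.
spot, on_demand = partition_nodes(
    ["i-1", "spot-2", "i-3"], is_spot=lambda n: n.startswith("spot-"))
assert spot == ["spot-2"] and on_demand == ["i-1", "i-3"]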
def print_info(resource_string: str,
               key: str,
               src_key: str,
               allowed_tags: Optional[List[str]] = None,
               list_value: bool = False) -> None:
    if allowed_tags is None:
        allowed_tags = ["default"]

    node_tags = {}

    # set of configurations corresponding to `key`
    unique_settings = set()

    for node_type_key, node_type in config["available_node_types"].items():
        node_tags[node_type_key] = {}
        tag = _log_info[src_key][node_type_key]
        if tag in allowed_tags:
            node_tags[node_type_key][tag] = True
        setting = node_type["node_config"].get(key)

        if list_value:
            unique_settings.add(tuple(setting))
        else:
            unique_settings.add(setting)

    head_value_str = head_node_config[key]
    if list_value:
        head_value_str = cli_logger.render_list(head_value_str)

    if len(unique_settings) == 1:
        # all node types are configured the same, condense
        # log output
        cli_logger.labeled_value(
            resource_string + " (all available node types)",
            "{}",
            head_value_str,
            _tags=node_tags[config["head_node_type"]])
    else:
        # do head node type first
        cli_logger.labeled_value(
            resource_string + f" ({head_node_type})",
            "{}",
            head_value_str,
            _tags=node_tags[head_node_type])

        # go through remaining types
        for node_type_key, node_type in config[
                "available_node_types"].items():
            if node_type_key == head_node_type:
                continue
            workers_value_str = node_type["node_config"][key]
            if list_value:
                workers_value_str = cli_logger.render_list(workers_value_str)
            cli_logger.labeled_value(
                resource_string + f" ({node_type_key})",
                "{}",
                workers_value_str,
                _tags=node_tags[node_type_key])
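# Sketch of the condensation check above, under the assumption that each
# setting is either a hashable scalar or a list (hashed as a tuple when
# `list_value` is set). One unique setting across all node types means
# the log output can be collapsed to a single labeled line.
def unique_settings(settings, list_value=False):
    return {tuple(s) if list_value else s for s in settings}


assert len(unique_settings(["sg-1", "sg-1"])) == 1           # condensed
assert len(unique_settings([["a"], ["a", "b"]], True)) == 2  # per type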
def terminate_nodes(self, node_ids):
    if not node_ids:
        return
    terminate_instances_func = self.ec2.meta.client.terminate_instances
    stop_instances_func = self.ec2.meta.client.stop_instances

    # In some cases, this function stops some nodes, but terminates others.
    # Each of these requires a different EC2 API call. So, we use the
    # "nodes_to_terminate" dict below to keep track of exactly which API
    # call will be used to stop/terminate which set of nodes. The key is
    # the function to use, and the value is the list of nodes to terminate
    # with that function.
    nodes_to_terminate = {
        terminate_instances_func: [],
        stop_instances_func: [],
    }

    if self.cache_stopped_nodes:
        spot_ids = []
        on_demand_ids = []

        for node_id in node_ids:
            if self._get_cached_node(node_id).spot_instance_request_id:
                spot_ids += [node_id]
            else:
                on_demand_ids += [node_id]

        if on_demand_ids:
            # todo: show node names?
            cli_logger.print(
                "Stopping instances {} " + cf.dimmed(
                    "(to terminate instead, "
                    "set `cache_stopped_nodes: False` "
                    "under `provider` in the cluster configuration)"),
                cli_logger.render_list(on_demand_ids),
            )
        if spot_ids:
            cli_logger.print(
                "Terminating instances {} " +
                cf.dimmed("(cannot stop spot instances, only terminate)"),
                cli_logger.render_list(spot_ids),
            )

        nodes_to_terminate[stop_instances_func] = on_demand_ids
        nodes_to_terminate[terminate_instances_func] = spot_ids
    else:
        nodes_to_terminate[terminate_instances_func] = node_ids

    max_terminate_nodes = (self.max_terminate_nodes
                           if self.max_terminate_nodes is not None else
                           len(node_ids))
    for terminate_func, nodes in nodes_to_terminate.items():
        for start in range(0, len(nodes), max_terminate_nodes):
            terminate_func(
                InstanceIds=nodes[start:start + max_terminate_nodes])
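# Standalone sketch of the chunking loop above: `max_terminate_nodes`
# caps how many instance ids go into a single stop/terminate API call,
# so each node list is issued in slices of at most that size. The
# function name here is illustrative, not the provider's actual API.
def batched(ids, max_batch):
    for start in range(0, len(ids), max_batch):
        yield ids[start:start + max_batch]


assert list(batched([1, 2, 3, 4, 5], max_batch=2)) == [[1, 2], [3, 4], [5]]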
def terminate_nodes(self, node_ids):
    if not node_ids:
        return
    if self.cache_stopped_nodes:
        spot_ids = []
        on_demand_ids = []

        for node_id in node_ids:
            if self._get_cached_node(node_id).spot_instance_request_id:
                spot_ids += [node_id]
            else:
                on_demand_ids += [node_id]

        if on_demand_ids:
            # todo: show node names?
            cli_logger.print(
                "Stopping instances {} " + cf.dimmed(
                    "(to terminate instead, "
                    "set `cache_stopped_nodes: False` "
                    "under `provider` in the cluster configuration)"),
                cli_logger.render_list(on_demand_ids))
            cli_logger.old_info(
                logger,
                "AWSNodeProvider: stopping nodes {}. To terminate nodes "
                "on stop, set 'cache_stopped_nodes: False' in the "
                "provider config.", on_demand_ids)

            self.ec2.meta.client.stop_instances(InstanceIds=on_demand_ids)
        if spot_ids:
            cli_logger.print(
                "Terminating instances {} " +
                cf.dimmed("(cannot stop spot instances, only terminate)"),
                cli_logger.render_list(spot_ids))
            cli_logger.old_info(
                logger,
                "AWSNodeProvider: terminating nodes {} (spot nodes cannot "
                "be stopped, only terminated)", spot_ids)

            self.ec2.meta.client.terminate_instances(InstanceIds=spot_ids)
    else:
        self.ec2.meta.client.terminate_instances(InstanceIds=node_ids)

    for node_id in node_ids:
        self.tag_cache.pop(node_id, None)
        self.tag_cache_pending.pop(node_id, None)
def print_info(resource_string,
               key,
               head_src_key,
               workers_src_key,
               allowed_tags=None,
               list_value=False):
    if allowed_tags is None:
        allowed_tags = ["default"]

    head_tags = {}
    workers_tags = {}

    if _log_info[head_src_key] in allowed_tags:
        head_tags[_log_info[head_src_key]] = True
    if _log_info[workers_src_key] in allowed_tags:
        workers_tags[_log_info[workers_src_key]] = True

    head_value_str = config["head_node"][key]
    if list_value:
        head_value_str = cli_logger.render_list(head_value_str)

    if same_everywhere(key):
        cli_logger.labeled_value(  # todo: handle plural vs singular?
            resource_string + " (head & workers)",
            "{}",
            head_value_str,
            _tags=head_tags)
    else:
        workers_value_str = config["worker_nodes"][key]
        if list_value:
            workers_value_str = cli_logger.render_list(workers_value_str)

        cli_logger.labeled_value(
            resource_string + " (head)",
            "{}",
            head_value_str,
            _tags=head_tags)
        cli_logger.labeled_value(
            resource_string + " (workers)",
            "{}",
            workers_value_str,
            _tags=workers_tags)
def create_or_update_cluster(config_file: str,
                             override_min_workers: Optional[int],
                             override_max_workers: Optional[int],
                             no_restart: bool,
                             restart_only: bool,
                             yes: bool,
                             override_cluster_name: Optional[str],
                             no_config_cache: bool = False,
                             redirect_command_output: Optional[bool] = False,
                             use_login_shells: bool = True) -> None:
    """Creates or updates an autoscaling Ray cluster from a config json."""
    set_using_login_shells(use_login_shells)
    if not use_login_shells:
        cmd_output_util.set_allow_interactive(False)
    if redirect_command_output is None:
        # Do not redirect by default.
        cmd_output_util.set_output_redirected(False)
    else:
        cmd_output_util.set_output_redirected(redirect_command_output)

    def handle_yaml_error(e):
        cli_logger.error("Cluster config invalid")
        cli_logger.newline()
        cli_logger.error("Failed to load YAML file " + cf.bold("{}"),
                         config_file)
        cli_logger.newline()
        with cli_logger.verbatim_error_ctx("PyYAML error:"):
            cli_logger.error(e)
        cli_logger.abort()

    try:
        config = yaml.safe_load(open(config_file).read())
    except FileNotFoundError:
        cli_logger.abort(
            "Provided cluster configuration file ({}) does not exist",
            cf.bold(config_file))
        raise
    except yaml.parser.ParserError as e:
        handle_yaml_error(e)
        raise
    except yaml.scanner.ScannerError as e:
        handle_yaml_error(e)
        raise

    # todo: validate file_mounts, ssh keys, etc.

    importer = _NODE_PROVIDERS.get(config["provider"]["type"])
    if not importer:
        cli_logger.abort(
            "Unknown provider type " + cf.bold("{}") + "\n"
            "Available providers are: {}", config["provider"]["type"],
            cli_logger.render_list([
                k for k in _NODE_PROVIDERS.keys()
                if _NODE_PROVIDERS[k] is not None
            ]))
        raise NotImplementedError("Unsupported provider {}".format(
            config["provider"]))

    printed_overrides = False

    def handle_cli_override(key, override):
        if override is not None:
            if key in config:
                nonlocal printed_overrides
                printed_overrides = True
                cli_logger.warning(
                    "`{}` override provided on the command line.\n"
                    "  Using " + cf.bold("{}") + cf.dimmed(
                        " [configuration file has " + cf.bold("{}") + "]"),
                    key, override, config[key])
            config[key] = override

    handle_cli_override("min_workers", override_min_workers)
    handle_cli_override("max_workers", override_max_workers)
    handle_cli_override("cluster_name", override_cluster_name)

    if printed_overrides:
        cli_logger.newline()

    cli_logger.labeled_value("Cluster", config["cluster_name"])

    # disable the cli_logger here if needed
    # because it only supports aws
    if config["provider"]["type"] != "aws":
        cli_logger.old_style = True

    cli_logger.newline()
    config = _bootstrap_config(config, no_config_cache=no_config_cache)

    try_logging_config(config)
    get_or_create_head_node(config, config_file, no_restart, restart_only,
                            yes, override_cluster_name)
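# Minimal sketch of the `handle_cli_override` pattern above, using a
# plain dict in place of the loaded YAML config: a command-line value
# always wins, and a warning is emitted when it shadows a value that was
# present in the configuration file.
def handle_cli_override(config, key, override):
    if override is not None:
        if key in config:
            print(f"`{key}` override provided on the command line. "
                  f"Using {override} (configuration file has {config[key]})")
        config[key] = override


config = {"max_workers": 4}
handle_cli_override(config, "max_workers", 10)
handle_cli_override(config, "min_workers", None)  # no-op: no override given
assert config == {"max_workers": 10}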
def create_node(self, node_config, tags, count):
    tags = copy.deepcopy(tags)

    # Try to reuse previously stopped nodes with compatible configs
    if self.cache_stopped_nodes:
        # TODO(ekl) this is breaking the abstraction boundary a little by
        # peeking into the tag set.
        filters = [
            {
                "Name": "instance-state-name",
                "Values": ["stopped", "stopping"],
            },
            {
                "Name": "tag:{}".format(TAG_RAY_CLUSTER_NAME),
                "Values": [self.cluster_name],
            },
            {
                "Name": "tag:{}".format(TAG_RAY_NODE_KIND),
                "Values": [tags[TAG_RAY_NODE_KIND]],
            },
            {
                "Name": "tag:{}".format(TAG_RAY_LAUNCH_CONFIG),
                "Values": [tags[TAG_RAY_LAUNCH_CONFIG]],
            },
        ]
        # This tag may not always be present.
        if TAG_RAY_USER_NODE_TYPE in tags:
            filters.append({
                "Name": "tag:{}".format(TAG_RAY_USER_NODE_TYPE),
                "Values": [tags[TAG_RAY_USER_NODE_TYPE]],
            })

        reuse_nodes = list(self.ec2.instances.filter(Filters=filters))[:count]
        reuse_node_ids = [n.id for n in reuse_nodes]
        if reuse_nodes:
            cli_logger.print(
                # todo: handle plural vs singular?
                "Reusing nodes {}. "
                "To disable reuse, set `cache_stopped_nodes: False` "
                "under `provider` in the cluster configuration.",
                cli_logger.render_list(reuse_node_ids))
            cli_logger.old_info(
                logger, "AWSNodeProvider: reusing instances {}. "
                "To disable reuse, set "
                "'cache_stopped_nodes: False' in the provider "
                "config.", reuse_node_ids)

            # todo: timed?
            with cli_logger.group("Stopping instances to reuse"):
                for node in reuse_nodes:
                    self.tag_cache[node.id] = from_aws_format(
                        {x["Key"]: x["Value"] for x in node.tags})
                    if node.state["Name"] == "stopping":
                        cli_logger.print("Waiting for instance {} to stop",
                                         node.id)
                        cli_logger.old_info(
                            logger, "AWSNodeProvider: waiting for instance "
                            "{} to fully stop...", node.id)
                        node.wait_until_stopped()

            self.ec2.meta.client.start_instances(InstanceIds=reuse_node_ids)
            for node_id in reuse_node_ids:
                self.set_node_tags(node_id, tags)
            count -= len(reuse_node_ids)

    if count:
        self._create_node(node_config, tags, count)
# This is an executable script that runs an example of every single
# CliLogger function for demonstration purposes. Primarily useful for
# tuning color and other formatting.

from ray.autoscaler._private.cli_logger import cli_logger
import colorful as cf

cli_logger.old_style = False
cli_logger.verbosity = 999
cli_logger.detect_colors()

cli_logger.print(
    cf.bold("Bold ") + cf.italic("Italic ") + cf.underlined("Underlined"))
cli_logger.labeled_value("Label", "value")
cli_logger.print("List: {}", cli_logger.render_list([1, 2, 3]))
cli_logger.newline()
cli_logger.very_verbose("Very verbose")
cli_logger.verbose("Verbose")
cli_logger.verbose_warning("Verbose warning")
cli_logger.verbose_error("Verbose error")
cli_logger.print("Info")
cli_logger.success("Success")
cli_logger.warning("Warning")
cli_logger.error("Error")
cli_logger.newline()

try:
    cli_logger.abort("Abort")
except Exception:
    pass
def create_node(self, node_config, tags, count) -> Dict[str, Any]:
    """Creates instances.

    Returns dict mapping instance id to ec2.Instance object for the created
    instances.
    """
    # sort tags by key to support deterministic unit test stubbing
    tags = OrderedDict(sorted(copy.deepcopy(tags).items()))

    reused_nodes_dict = {}
    # Try to reuse previously stopped nodes with compatible configs
    if self.cache_stopped_nodes:
        # TODO(ekl) this is breaking the abstraction boundary a little by
        # peeking into the tag set.
        filters = [
            {
                "Name": "instance-state-name",
                "Values": ["stopped", "stopping"],
            },
            {
                "Name": "tag:{}".format(TAG_RAY_CLUSTER_NAME),
                "Values": [self.cluster_name],
            },
            {
                "Name": "tag:{}".format(TAG_RAY_NODE_KIND),
                "Values": [tags[TAG_RAY_NODE_KIND]],
            },
            {
                "Name": "tag:{}".format(TAG_RAY_LAUNCH_CONFIG),
                "Values": [tags[TAG_RAY_LAUNCH_CONFIG]],
            },
        ]
        # This tag may not always be present.
        if TAG_RAY_USER_NODE_TYPE in tags:
            filters.append({
                "Name": "tag:{}".format(TAG_RAY_USER_NODE_TYPE),
                "Values": [tags[TAG_RAY_USER_NODE_TYPE]],
            })

        reuse_nodes = list(self.ec2.instances.filter(Filters=filters))[:count]
        reuse_node_ids = [n.id for n in reuse_nodes]
        reused_nodes_dict = {n.id: n for n in reuse_nodes}
        if reuse_nodes:
            cli_logger.print(
                # todo: handle plural vs singular?
                "Reusing nodes {}. "
                "To disable reuse, set `cache_stopped_nodes: False` "
                "under `provider` in the cluster configuration.",
                cli_logger.render_list(reuse_node_ids))

            # todo: timed?
            with cli_logger.group("Stopping instances to reuse"):
                for node in reuse_nodes:
                    self.tag_cache[node.id] = from_aws_format(
                        {x["Key"]: x["Value"] for x in node.tags})
                    if node.state["Name"] == "stopping":
                        cli_logger.print("Waiting for instance {} to stop",
                                         node.id)
                        node.wait_until_stopped()

            self.ec2.meta.client.start_instances(InstanceIds=reuse_node_ids)
            for node_id in reuse_node_ids:
                self.set_node_tags(node_id, tags)
            count -= len(reuse_node_ids)

    created_nodes_dict = {}
    if count:
        created_nodes_dict = self._create_node(node_config, tags, count)

    all_created_nodes = reused_nodes_dict
    all_created_nodes.update(created_nodes_dict)
    return all_created_nodes
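# Sketch of the reuse query above, assuming boto3's standard
# `Filters=[{"Name": ..., "Values": [...]}]` shape. The tag names below
# are illustrative stand-ins for the TAG_RAY_* constants; only tags that
# are actually present are added, mirroring the optional user-node-type
# filter.
def reuse_filters(cluster_name, tags, tag_keys):
    filters = [
        {"Name": "instance-state-name", "Values": ["stopped", "stopping"]},
        {"Name": "tag:ray-cluster-name", "Values": [cluster_name]},
    ]
    for tag_key in tag_keys:
        if tag_key in tags:
            filters.append({
                "Name": "tag:{}".format(tag_key),
                "Values": [tags[tag_key]],
            })
    return filters


filters = reuse_filters("demo", {"ray-node-kind": "worker"},
                        tag_keys=["ray-node-kind", "ray-user-node-type"])
assert len(filters) == 3  # the absent user-node-type tag adds no filter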
def create_or_update_cluster(
        config_file: str,
        override_min_workers: Optional[int],
        override_max_workers: Optional[int],
        no_restart: bool,
        restart_only: bool,
        yes: bool,
        override_cluster_name: Optional[str] = None,
        no_config_cache: bool = False,
        redirect_command_output: Optional[bool] = False,
        use_login_shells: bool = True,
        no_monitor_on_head: bool = False) -> Dict[str, Any]:
    """Creates or updates an autoscaling Ray cluster from a config json."""
    # no_monitor_on_head is an internal flag used by the Ray K8s operator.
    # If True, prevents autoscaling config sync to the Ray head during
    # cluster creation. See https://github.com/ray-project/ray/pull/13720.
    set_using_login_shells(use_login_shells)
    if not use_login_shells:
        cmd_output_util.set_allow_interactive(False)
    if redirect_command_output is None:
        # Do not redirect by default.
        cmd_output_util.set_output_redirected(False)
    else:
        cmd_output_util.set_output_redirected(redirect_command_output)

    def handle_yaml_error(e):
        cli_logger.error("Cluster config invalid")
        cli_logger.newline()
        cli_logger.error("Failed to load YAML file " + cf.bold("{}"),
                         config_file)
        cli_logger.newline()
        with cli_logger.verbatim_error_ctx("PyYAML error:"):
            cli_logger.error(e)
        cli_logger.abort()

    try:
        config = yaml.safe_load(open(config_file).read())
    except FileNotFoundError:
        cli_logger.abort(
            "Provided cluster configuration file ({}) does not exist",
            cf.bold(config_file))
        raise
    except yaml.parser.ParserError as e:
        handle_yaml_error(e)
        raise
    except yaml.scanner.ScannerError as e:
        handle_yaml_error(e)
        raise

    global_event_system.execute_callback(CreateClusterEvent.up_started,
                                         {"cluster_config": config})

    # todo: validate file_mounts, ssh keys, etc.

    importer = _NODE_PROVIDERS.get(config["provider"]["type"])
    if not importer:
        cli_logger.abort(
            "Unknown provider type " + cf.bold("{}") + "\n"
            "Available providers are: {}", config["provider"]["type"],
            cli_logger.render_list([
                k for k in _NODE_PROVIDERS.keys()
                if _NODE_PROVIDERS[k] is not None
            ]))
        raise NotImplementedError("Unsupported provider {}".format(
            config["provider"]))

    printed_overrides = False

    def handle_cli_override(key, override):
        if override is not None:
            if key in config:
                nonlocal printed_overrides
                printed_overrides = True
                cli_logger.warning(
                    "`{}` override provided on the command line.\n"
                    "  Using " + cf.bold("{}") + cf.dimmed(
                        " [configuration file has " + cf.bold("{}") + "]"),
                    key, override, config[key])
            config[key] = override

    handle_cli_override("min_workers", override_min_workers)
    handle_cli_override("max_workers", override_max_workers)
    handle_cli_override("cluster_name", override_cluster_name)

    if printed_overrides:
        cli_logger.newline()

    cli_logger.labeled_value("Cluster", config["cluster_name"])

    cli_logger.newline()
    config = _bootstrap_config(config, no_config_cache=no_config_cache)

    try_logging_config(config)
    get_or_create_head_node(config, config_file, no_restart, restart_only,
                            yes, override_cluster_name, no_monitor_on_head)
    return config
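# Hedged sketch of the event hook introduced above: a minimal callback
# registry standing in for `global_event_system`, assumed to fan each
# cluster lifecycle event out to its registered handlers.
from collections import defaultdict


class EventSystem:
    def __init__(self):
        self._handlers = defaultdict(list)

    def add_callback_handler(self, event, handler):
        self._handlers[event].append(handler)

    def execute_callback(self, event, payload=None):
        for handler in self._handlers[event]:
            handler(payload)


events = EventSystem()
seen = []
events.add_callback_handler("up_started", seen.append)
events.execute_callback("up_started", {"cluster_config": {}})
assert seen == [{"cluster_config": {}}]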