def _derive_autoscaling_config_from_ray_cr(ray_cr: Dict[str, Any]) -> Dict[str, Any]:
    """Translate a RayCluster custom resource dict into an autoscaler config dict."""
    provider_config = _generate_provider_config(ray_cr["metadata"]["namespace"])
    node_types = _generate_available_node_types_from_ray_cr_spec(ray_cr["spec"])

    # The autoscaler expects a global max workers field. We set it to the sum of
    # node type max workers.
    total_max_workers = sum(group["max_workers"] for group in node_types.values())

    # Legacy autoscaling fields carry no information but are required for compatibility.
    legacy_fields = _generate_legacy_autoscaling_config_fields()

    derived = {
        "provider": provider_config,
        "cluster_name": ray_cr["metadata"]["name"],
        "head_node_type": _HEAD_GROUP_NAME,
        "available_node_types": node_types,
        "max_workers": total_max_workers,
        # Should consider exposing `idleTimeoutMinutes` in the RayCluster CRD,
        # under an `autoscaling` field.
        "idle_timeout_minutes": 5,
        # Should consider exposing `upscalingSpeed` in the RayCluster CRD,
        # under an `autoscaling` field.
        "upscaling_speed": 1,
        **legacy_fields,
    }

    # Make sure the config is readable by the autoscaler.
    validate_config(derived)

    return derived
def testValidateDefaultConfig(self):
    """Validate every bundled example config supported by `ray up`."""
    skip_markers = (
        # aws tested in testValidateDefaultConfigAWSMultiNodeTypes.
        "aws/example-multi-node-type.yaml",
        # local tested in testValidateLocal
        "local",
        # not supported with ray up
        "fake_multi_node",
        # not supported with ray up
        "kuberay",
    )
    for config_path in CONFIG_PATHS:
        try:
            if any(marker in config_path for marker in skip_markers):
                continue
            with open(config_path) as f:
                config = yaml.safe_load(f)
            config = prepare_config(config)
            if config["provider"]["type"] == "kubernetes":
                KubernetesNodeProvider.fillout_available_node_types_resources(
                    config)
            validate_config(config)
        except Exception:
            logging.exception("")
            self.fail(f"Config {config_path} did not pass validation test!")
def _test_invalid_config(self, config_path):
    """Assert that the config at `config_path` is rejected by schema validation."""
    with open(os.path.join(RAY_PATH, config_path)) as f:
        bad_config = yaml.safe_load(f)
    # A jsonschema.ValidationError is the expected outcome; a clean pass
    # means the negative test itself failed.
    try:
        validate_config(bad_config)
    except jsonschema.ValidationError:
        return
    self.fail("Expected validation to fail for {}".format(config_path))
def testValidateDefaultConfig(self):
    """Check that every bundled example config passes schema validation.

    Each config is normalized with `prepare_config` (fills defaults)
    before running `validate_config`.
    """
    for config_path in CONFIG_PATHS:
        with open(config_path) as f:
            config = yaml.safe_load(f)
        config = prepare_config(config)
        try:
            validate_config(config)
        except Exception:
            # Include the offending path so a failure is actionable;
            # the generic message previously gave no hint which of the
            # CONFIG_PATHS was broken.
            self.fail(f"Config {config_path} did not pass validation test!")
def testValidateDefaultConfig(self):
    """Check that bundled example configs pass schema validation.

    The AWS multi-node-type example is skipped here because it needs
    mocked boto3 calls and is covered by a dedicated test.
    """
    for config_path in CONFIG_PATHS:
        if "aws/example-multi-node-type.yaml" in config_path:
            # aws is tested in testValidateDefaultConfigAWSMultiNodeTypes.
            continue
        with open(config_path) as f:
            config = yaml.safe_load(f)
        config = prepare_config(config)
        try:
            validate_config(config)
        except Exception:
            # Include the offending path so a failure is actionable;
            # the generic message previously gave no hint which of the
            # CONFIG_PATHS was broken.
            self.fail(f"Config {config_path} did not pass validation test!")
def reset(self, errors_fatal=False):
    # Reload the cluster config from self.config_path and rebuild derived
    # state: runtime hashes, the node provider, and (when node types are
    # present) the resource demand scheduler.
    sync_continuously = False
    if hasattr(self, "config"):
        sync_continuously = self.config.get(
            "file_mounts_sync_continuously", False)
    try:
        with open(self.config_path) as f:
            new_config = yaml.safe_load(f.read())
        if new_config != getattr(self, "config", None):
            try:
                validate_config(new_config)
            except Exception as e:
                # Non-fatal: the config on disk may have been written by a
                # newer CLI than the ray version running on the cluster.
                logger.debug(
                    "Cluster config validation failed. The version of "
                    "the ray CLI you launched this cluster with may "
                    "be higher than the version of ray being run on "
                    "the cluster. Some new features may not be "
                    "available until you upgrade ray on your cluster.",
                    exc_info=e)
        (new_runtime_hash,
         new_file_mounts_contents_hash) = hash_runtime_conf(
             new_config["file_mounts"],
             new_config["cluster_synced_files"],
             [
                 new_config["worker_setup_commands"],
                 new_config["worker_start_ray_commands"],
             ],
             generate_file_mounts_contents_hash=sync_continuously,
         )
        self.config = new_config
        self.runtime_hash = new_runtime_hash
        self.file_mounts_contents_hash = new_file_mounts_contents_hash
        if not self.provider:
            self.provider = _get_node_provider(self.config["provider"],
                                               self.config["cluster_name"])
        # Check whether we can enable the resource demand scheduler.
        if "available_node_types" in self.config:
            self.available_node_types = self.config["available_node_types"]
            self.resource_demand_scheduler = ResourceDemandScheduler(
                self.provider, self.available_node_types,
                self.config["max_workers"])
        else:
            self.available_node_types = None
            self.resource_demand_scheduler = None
    except Exception as e:
        if errors_fatal:
            raise e
        else:
            logger.exception("StandardAutoscaler: "
                             "Error parsing config.")
def testValidateNetworkConfig(self):
    """Fetch the hosted AWS example config over the network and validate it."""
    web_yaml = ("https://raw.githubusercontent.com/ray-project/ray/"
                "master/python/ray/autoscaler/aws/example-full.yaml")
    response = urllib.request.urlopen(web_yaml, timeout=5)
    content = response.read()
    # Round-trip through a temp file before parsing, mirroring how a user
    # would download and load the config.
    with tempfile.TemporaryFile() as f:
        f.write(content)
        f.seek(0)
        config = yaml.safe_load(f)
        config = prepare_config(config)
        try:
            validate_config(config)
        except Exception:
            self.fail("Config did not pass validation test!")
def _bootstrap_config(config: Dict[str, Any],
                      no_config_cache: bool = False) -> Dict[str, Any]:
    # Prepare and validate a cluster config, reusing a cached resolved
    # config (keyed by a hash of the prepared config) when available.
    config = prepare_config(config)

    # Cache key is derived from the full prepared config contents, so any
    # edit to the config invalidates the cache.
    hasher = hashlib.sha1()
    hasher.update(json.dumps([config], sort_keys=True).encode("utf-8"))
    cache_key = os.path.join(tempfile.gettempdir(),
                             "ray-config-{}".format(hasher.hexdigest()))
    if os.path.exists(cache_key) and not no_config_cache:
        cli_logger.old_info(logger, "Using cached config at {}", cache_key)

        config_cache = json.loads(open(cache_key).read())
        if config_cache.get("_version", -1) == CONFIG_CACHE_VERSION:
            # todo: is it fine to re-resolve? afaik it should be.
            # we can have migrations otherwise or something
            # but this seems overcomplicated given that resolving is
            # relatively cheap
            try_reload_log_state(config_cache["config"]["provider"],
                                 config_cache.get("provider_log_info"))

            if log_once("_printed_cached_config_warning"):
                cli_logger.verbose_warning(
                    "Loaded cached provider configuration "
                    "from " + cf.bold("{}"), cache_key)
                if cli_logger.verbosity == 0:
                    cli_logger.warning("Loaded cached provider configuration")
                cli_logger.warning(
                    "If you experience issues with "
                    "the cloud provider, try re-running "
                    "the command with {}.", cf.bold("--no-config-cache"))

            return config_cache["config"]
        else:
            # Version mismatch: fall through and re-resolve from scratch.
            cli_logger.warning(
                "Found cached cluster config "
                "but the version " + cf.bold("{}") + " "
                "(expected " + cf.bold("{}") + ") does not match.\n"
                "This is normal if cluster launcher was updated.\n"
                "Config will be re-resolved.",
                config_cache.get("_version", "none"), CONFIG_CACHE_VERSION)
    validate_config(config)

    importer = _NODE_PROVIDERS.get(config["provider"]["type"])
    if not importer:
        raise NotImplementedError("Unsupported provider {}".format(
            config["provider"]))

    provider_cls = importer(config["provider"])
def testFaultyResourceValidation(self):
    """Checks that schema validation catches invalid node type resource field.

    Demonstrates a fix in https://github.com/ray-project/ray/pull/16691."""
    path = os.path.join(RAY_PATH, "autoscaler", "aws", "example-full.yaml")
    config = yaml.safe_load(open(path).read())
    head_type = config["available_node_types"]["ray.head.default"]

    # Invalid `resources` field, say user entered `resources: `.
    head_type["resources"] = None
    with pytest.raises(jsonschema.exceptions.ValidationError):
        validate_config(config)

    # Invalid value in resource dict.
    head_type["resources"] = {"CPU": "a string is not valid here"}
    with pytest.raises(jsonschema.exceptions.ValidationError):
        validate_config(config)
def testValidateDefaultConfig(self):
    """Validate all bundled example configs, autofilling k8s resources first."""
    for config_path in CONFIG_PATHS:
        if "aws/example-multi-node-type.yaml" in config_path:
            # aws is tested in testValidateDefaultConfigAWSMultiNodeTypes.
            continue
        with open(config_path) as f:
            raw_config = yaml.safe_load(f)
        config = prepare_config(raw_config)
        if config["provider"]["type"] == "kubernetes":
            KubernetesNodeProvider.fillout_available_node_types_resources(
                config)
        try:
            validate_config(config)
        except Exception:
            self.fail(
                f"Config {config_path} did not pass validation test!")
def testValidateLocal(self):
    """
    Tests local node provider config validation for the most common use
    case of bootstrapping a cluster at a static set of ips.
    """
    local_config_path = os.path.join(
        RAY_PATH, "autoscaler/local/example-minimal-manual.yaml")
    base_config = yaml.safe_load(open(local_config_path).read())
    base_config["provider"]["head_ip"] = "xxx.yyy"
    base_config["provider"]["worker_ips"] = [
        "aaa.bbb", "ccc.ddd", "eee.fff"
    ]
    base_config["auth"]["ssh_user"] = "******"
    base_config["auth"]["ssh_private_key"] = "~/.ssh/id_rsa"

    # A well-formed config should prepare and validate cleanly.
    test_prepare_config = copy.deepcopy(base_config)
    prepared_config = prepare_config(test_prepare_config)
    try:
        validate_config(prepared_config)
    except Exception:
        self.fail("Failed to validate local/example-minimal-manual.yaml")
    expected_prepared = yaml.safe_load(EXPECTED_LOCAL_CONFIG_STR)
    assert prepared_config == expected_prepared

    # Missing worker_ips must be rejected.
    no_worker_config = copy.deepcopy(base_config)
    del no_worker_config["provider"]["worker_ips"]
    with pytest.raises(ClickException):
        prepare_config(no_worker_config)
    # Missing head_ip must be rejected.
    no_head_config = copy.deepcopy(base_config)
    del no_head_config["provider"]["head_ip"]
    with pytest.raises(ClickException):
        prepare_config(no_head_config)
    # Fields that don't belong in a local manual config must be rejected.
    for field in "head_node", "worker_nodes", "available_node_types":
        faulty_config = copy.deepcopy(base_config)
        faulty_config[field] = "This field shouldn't be in here."
        with pytest.raises(ClickException):
            prepare_config(faulty_config)

    too_many_workers_config = copy.deepcopy(base_config)

    # More workers requested than the three available ips.
    too_many_workers_config["max_workers"] = 10
    too_many_workers_config["min_workers"] = 10
    prepared_config = prepare_config(too_many_workers_config)

    # Check that worker config numbers were clipped to 3.
    assert prepared_config == expected_prepared
def _derive_autoscaling_config_from_ray_cr(
        ray_cr: Dict[str, Any]) -> Dict[str, Any]:
    # Translate a RayCluster custom resource dict into an autoscaling
    # config dict understood by the Ray autoscaler.
    provider_config = _generate_provider_config(
        ray_cr["metadata"]["namespace"])
    available_node_types = _generate_available_node_types_from_ray_cr_spec(
        ray_cr["spec"])
    # The autoscaler expects a global max workers field. We set it to the sum of
    # node type max workers.
    global_max_workers = sum(node_type["max_workers"]
                             for node_type in available_node_types.values())
    # Legacy autoscaling fields carry no information but are required for compatibility.
    legacy_autoscaling_fields = _generate_legacy_autoscaling_config_fields()

    # Process autoscaler options.
    autoscaler_options = ray_cr["spec"].get(AUTOSCALER_OPTIONS_KEY, {})
    if IDLE_SECONDS_KEY in autoscaler_options:
        # The CR exposes idle timeout in seconds; the autoscaler wants minutes.
        idle_timeout_minutes = autoscaler_options[IDLE_SECONDS_KEY] / 60.0
    else:
        idle_timeout_minutes = 5.0
    if autoscaler_options.get(UPSCALING_KEY) == UPSCALING_VALUE_AGGRESSIVE:
        upscaling_speed = 1000  # i.e. big
    else:
        upscaling_speed = 1

    autoscaling_config = {
        "provider": provider_config,
        "cluster_name": ray_cr["metadata"]["name"],
        "head_node_type": _HEAD_GROUP_NAME,
        "available_node_types": available_node_types,
        "max_workers": global_max_workers,
        # Derived from the CR's autoscaler options above (default: 5 min).
        "idle_timeout_minutes": idle_timeout_minutes,
        # Derived from the CR's autoscaler options above (default: 1).
        "upscaling_speed": upscaling_speed,
        **legacy_autoscaling_fields,
    }

    # Make sure the config is readable by the autoscaler.
    validate_config(autoscaling_config)

    return autoscaling_config
def reset(self, errors_fatal=False):
    # Reload the cluster config from self.config_path and rebuild derived
    # state. Unlike later revisions, validation failure here is fatal to
    # the reload (it is caught by the outer handler below).
    sync_continuously = False
    if hasattr(self, "config"):
        sync_continuously = self.config.get(
            "file_mounts_sync_continuously", False)
    try:
        with open(self.config_path) as f:
            new_config = yaml.safe_load(f.read())
        validate_config(new_config)
        (new_runtime_hash,
         new_file_mounts_contents_hash) = hash_runtime_conf(
             new_config["file_mounts"],
             new_config["cluster_synced_files"],
             [
                 new_config["worker_setup_commands"],
                 new_config["worker_start_ray_commands"],
             ],
             generate_file_mounts_contents_hash=sync_continuously,
         )
        self.config = new_config
        self.runtime_hash = new_runtime_hash
        self.file_mounts_contents_hash = new_file_mounts_contents_hash
        if not self.provider:
            self.provider = _get_node_provider(self.config["provider"],
                                               self.config["cluster_name"])
        # Check whether we can enable the resource demand scheduler.
        if "available_node_types" in self.config:
            self.available_node_types = self.config["available_node_types"]
            self.resource_demand_scheduler = ResourceDemandScheduler(
                self.provider, self.available_node_types,
                self.config["max_workers"])
        else:
            self.available_node_types = None
            self.resource_demand_scheduler = None
    except Exception as e:
        if errors_fatal:
            raise e
        else:
            logger.exception("StandardAutoscaler: "
                             "Error parsing config.")
def testValidateCustomSecurityGroupConfig(self):
    # Custom security-group overrides in the provider section should
    # survive prepare_config + validate_config unchanged.
    aws_config_path = os.path.join(RAY_PATH,
                                   "autoscaler/aws/example-minimal.yaml")
    with open(aws_config_path) as f:
        config = yaml.safe_load(f)

    # Test validate security group with custom permissions
    ip_permissions = [{
        "FromPort": port,
        "ToPort": port,
        "IpProtocol": "TCP",
        "IpRanges": [{
            "CidrIp": "0.0.0.0/0"
        }],
    } for port in [80, 443, 8265]]
    config["provider"].update({
        "security_group": {
            "IpPermissions": ip_permissions
        }
    })
    config = prepare_config(copy.deepcopy(config))
    try:
        validate_config(config)
        assert config["provider"]["security_group"][
            "IpPermissions"] == ip_permissions
    except Exception:
        self.fail(
            "Failed to validate config with security group in bound rules!"
        )

    # Test validate security group with custom name
    group_name = "test_security_group_name"
    config["provider"]["security_group"].update({"GroupName": group_name})
    try:
        validate_config(config)
        assert config["provider"]["security_group"][
            "GroupName"] == group_name
    except Exception:
        self.fail("Failed to validate config with security group name!")
def testValidateDefaultConfigMinMaxWorkers(self):
    # Validation must enforce that global max_workers covers the sum of
    # node-type min_workers; check failure at 0 and success at exactly 1.
    aws_config_path = os.path.join(
        RAY_PATH, "autoscaler/aws/example-multi-node-type.yaml")
    with open(aws_config_path) as f:
        config = yaml.safe_load(f)
    config = prepare_config(config)
    # Ensure every node type has a (possibly empty) resources dict so
    # validation doesn't fail on missing resources.
    for node_type in config["available_node_types"]:
        config["available_node_types"][node_type]["resources"] = config[
            "available_node_types"][node_type].get("resources", {})
    try:
        validate_config(config)
    except Exception:
        self.fail("Config did not pass validation test!")

    config["max_workers"] = 0  # the sum of min_workers is 1.
    with pytest.raises(ValueError):
        validate_config(config)

    # make sure edge case of exactly 1 passes too.
    config["max_workers"] = 1
    try:
        validate_config(config)
    except Exception:
        self.fail("Config did not pass validation test!")
def reset(self, errors_fatal=False):
    # Reload the cluster config and rebuild provider, node types, and the
    # resource demand scheduler; also translates legacy autoscaling
    # options (aggressive mode, target_utilization_fraction) into the
    # modern upscaling_speed setting.
    sync_continuously = False
    if hasattr(self, "config"):
        sync_continuously = self.config.get(
            "file_mounts_sync_continuously", False)
    try:
        with open(self.config_path) as f:
            new_config = yaml.safe_load(f.read())
        if new_config != getattr(self, "config", None):
            try:
                validate_config(new_config)
            except Exception as e:
                # Non-fatal: the config on disk may have been written by a
                # newer CLI than the ray version running on the cluster.
                logger.debug(
                    "Cluster config validation failed. The version of "
                    "the ray CLI you launched this cluster with may "
                    "be higher than the version of ray being run on "
                    "the cluster. Some new features may not be "
                    "available until you upgrade ray on your cluster.",
                    exc_info=e)
        (new_runtime_hash,
         new_file_mounts_contents_hash) = hash_runtime_conf(
             new_config["file_mounts"],
             new_config["cluster_synced_files"],
             [
                 new_config["worker_setup_commands"],
                 new_config["worker_start_ray_commands"],
             ],
             generate_file_mounts_contents_hash=sync_continuously,
         )
        self.config = new_config
        self.runtime_hash = new_runtime_hash
        self.file_mounts_contents_hash = new_file_mounts_contents_hash
        if not self.provider:
            self.provider = _get_node_provider(self.config["provider"],
                                               self.config["cluster_name"])

        self.available_node_types = self.config["available_node_types"]
        upscaling_speed = self.config.get("upscaling_speed")
        aggressive = self.config.get("autoscaling_mode") == "aggressive"
        target_utilization_fraction = self.config.get(
            "target_utilization_fraction")
        if upscaling_speed:
            upscaling_speed = float(upscaling_speed)
        # TODO(ameer): consider adding (if users ask) an option of
        # initial_upscaling_num_workers.
        elif aggressive:
            upscaling_speed = 99999
            logger.warning(
                "Legacy aggressive autoscaling mode "
                "detected. Replacing it by setting upscaling_speed to "
                "99999.")
        elif target_utilization_fraction:
            upscaling_speed = (
                1 / max(target_utilization_fraction, 0.001) - 1)
            logger.warning(
                "Legacy target_utilization_fraction config "
                "detected. Replacing it by setting upscaling_speed to " +
                "1 / target_utilization_fraction - 1.")
        else:
            upscaling_speed = 1.0
        if self.resource_demand_scheduler:
            # The node types are autofilled internally for legacy yamls,
            # overwriting the class will remove the inferred node resources
            # for legacy yamls.
            self.resource_demand_scheduler.reset_config(
                self.provider, self.available_node_types,
                self.config["max_workers"], self.config["head_node_type"],
                upscaling_speed)
        else:
            self.resource_demand_scheduler = ResourceDemandScheduler(
                self.provider, self.available_node_types,
                self.config["max_workers"], self.config["head_node_type"],
                upscaling_speed)
    except Exception as e:
        if errors_fatal:
            raise e
        else:
            logger.exception("StandardAutoscaler: "
                             "Error parsing config.")
def testValidateLocal(self):
    """
    Tests local node provider config validation for the most common use
    case of bootstrapping a cluster at a static set of ips.
    """
    local_config_path = os.path.join(
        RAY_PATH, "autoscaler/local/example-minimal-manual.yaml")
    base_config = yaml.safe_load(open(local_config_path).read())
    base_config["provider"]["head_ip"] = "xxx.yyy"
    base_config["provider"]["worker_ips"] = [
        "aaa.bbb", "ccc.ddd", "eee.fff"
    ]
    base_config["auth"]["ssh_user"] = "******"
    base_config["auth"]["ssh_private_key"] = "~/.ssh/id_rsa"

    # A well-formed config should prepare and validate cleanly.
    test_prepare_config = copy.deepcopy(base_config)
    prepared_config = prepare_config(test_prepare_config)
    try:
        validate_config(prepared_config)
    except Exception:
        self.fail("Failed to validate local/example-minimal-manual.yaml")
    expected_prepared = yaml.safe_load(EXPECTED_LOCAL_CONFIG_STR)
    assert prepared_config == expected_prepared

    # Missing worker_ips must be rejected.
    no_worker_config = copy.deepcopy(base_config)
    del no_worker_config["provider"]["worker_ips"]
    with pytest.raises(ClickException):
        prepare_config(no_worker_config)
    # Missing head_ip must be rejected.
    no_head_config = copy.deepcopy(base_config)
    del no_head_config["provider"]["head_ip"]
    with pytest.raises(ClickException):
        prepare_config(no_head_config)
    # Fields that don't belong in a local manual config must be rejected.
    for field in "head_node", "worker_nodes", "available_node_types":
        faulty_config = copy.deepcopy(base_config)
        faulty_config[field] = "This field shouldn't be in here."
        with pytest.raises(ClickException):
            prepare_config(faulty_config)

    too_many_workers_config = copy.deepcopy(base_config)

    # More workers requested than the three available ips.
    too_many_workers_config["max_workers"] = 10
    too_many_workers_config["min_workers"] = 10
    prepared_config = prepare_config(too_many_workers_config)

    # Check that worker config numbers were clipped to 3.
    assert prepared_config == expected_prepared

    not_enough_workers_config = copy.deepcopy(base_config)

    # Max workers is less than the three available ips.
    # The user has probably made an error. Make sure we log a warning.
    not_enough_workers_config["max_workers"] = 0
    not_enough_workers_config["min_workers"] = 0
    with mock.patch(
            "ray.autoscaler._private.local.config.cli_logger.warning"
    ) as warning:
        prepared_config = prepare_config(not_enough_workers_config)
        warning.assert_called_with(
            "The value of `max_workers` supplied (0) is less"
            " than the number of available worker ips (3)."
            " At most 0 Ray worker nodes will connect to the cluster.")
    expected_prepared = yaml.safe_load(EXPECTED_LOCAL_CONFIG_STR)
    # We logged a warning.
    # However, prepare_config does not repair the strange config setting:
    expected_prepared["max_workers"] = 0
    expected_prepared["available_node_types"]["local.cluster.node"][
        "max_workers"] = 0
    expected_prepared["available_node_types"]["local.cluster.node"][
        "min_workers"] = 0
    assert prepared_config == expected_prepared
def _bootstrap_config(config: Dict[str, Any],
                      no_config_cache: bool = False) -> Dict[str, Any]:
    # Prepare, resource-autofill, validate, and provider-bootstrap a
    # cluster config, caching the resolved result keyed by a hash of the
    # prepared config contents.
    config = prepare_config(config)

    hasher = hashlib.sha1()
    hasher.update(json.dumps([config], sort_keys=True).encode("utf-8"))
    cache_key = os.path.join(tempfile.gettempdir(),
                             "ray-config-{}".format(hasher.hexdigest()))

    if os.path.exists(cache_key) and not no_config_cache:
        config_cache = json.loads(open(cache_key).read())
        if config_cache.get("_version", -1) == CONFIG_CACHE_VERSION:
            # todo: is it fine to re-resolve? afaik it should be.
            # we can have migrations otherwise or something
            # but this seems overcomplicated given that resolving is
            # relatively cheap
            try_reload_log_state(config_cache["config"]["provider"],
                                 config_cache.get("provider_log_info"))
            if log_once("_printed_cached_config_warning"):
                cli_logger.verbose_warning(
                    "Loaded cached provider configuration "
                    "from " + cf.bold("{}"), cache_key)
                if cli_logger.verbosity == 0:
                    cli_logger.warning("Loaded cached provider configuration")
                cli_logger.warning(
                    "If you experience issues with "
                    "the cloud provider, try re-running "
                    "the command with {}.", cf.bold("--no-config-cache"))
            return config_cache["config"]
        else:
            # Version mismatch: fall through and re-resolve from scratch.
            cli_logger.warning(
                "Found cached cluster config "
                "but the version " + cf.bold("{}") + " "
                "(expected " + cf.bold("{}") + ") does not match.\n"
                "This is normal if cluster launcher was updated.\n"
                "Config will be re-resolved.",
                config_cache.get("_version", "none"), CONFIG_CACHE_VERSION)

    importer = _NODE_PROVIDERS.get(config["provider"]["type"])
    if not importer:
        raise NotImplementedError("Unsupported provider {}".format(
            config["provider"]))

    provider_cls = importer(config["provider"])

    cli_logger.print("Checking {} environment settings",
                     _PROVIDER_PRETTY_NAMES.get(config["provider"]["type"]))
    # Autodetection of node resources is best-effort; failure only warns.
    try:
        config = provider_cls.fillout_available_node_types_resources(config)
    except Exception as exc:
        if cli_logger.verbosity > 2:
            logger.exception("Failed to autodetect node resources.")
        else:
            cli_logger.warning(
                f"Failed to autodetect node resources: {str(exc)}. "
                "You can see full stack trace with higher verbosity.")

    # NOTE: if `resources` field is missing, validate_config for providers
    # other than AWS and Kubernetes will fail (the schema error will ask the
    # user to manually fill the resources) as we currently support autofilling
    # resources for AWS and Kubernetes only.
    validate_config(config)
    resolved_config = provider_cls.bootstrap_config(config)

    if not no_config_cache:
        with open(cache_key, "w") as f:
            config_cache = {
                "_version": CONFIG_CACHE_VERSION,
                "provider_log_info": try_get_log_state(config["provider"]),
                "config": resolved_config
            }
            f.write(json.dumps(config_cache))
    return resolved_config
def testValidateDefaultConfigAWSMultiNodeTypes(self):
    """Check resource autofill + validation for the AWS multi-node-type
    example, with boto3 instance-type lookups mocked out.

    Fix: the final expected-vs-actual comparison was a bare `==`
    expression whose result was discarded, so a mismatch could never
    fail the test; it is now asserted.
    """
    aws_config_path = os.path.join(
        RAY_PATH, "autoscaler/aws/example-multi-node-type.yaml")
    with open(aws_config_path) as f:
        config = yaml.safe_load(f)
    new_config = copy.deepcopy(config)
    # modify it here
    new_config["available_node_types"] = {
        "cpu_4_ondemand": new_config["available_node_types"][
            "cpu_4_ondemand"],
        "cpu_16_spot": new_config["available_node_types"]["cpu_16_spot"],
        "gpu_8_ondemand": new_config["available_node_types"][
            "gpu_8_ondemand"]
    }
    orig_new_config = copy.deepcopy(new_config)
    expected_available_node_types = orig_new_config["available_node_types"]
    expected_available_node_types["cpu_4_ondemand"]["resources"] = {
        "CPU": 4
    }
    expected_available_node_types["cpu_16_spot"]["resources"] = {
        "CPU": 16,
        "Custom1": 1,
        "is_spot": 1
    }
    expected_available_node_types["gpu_8_ondemand"]["resources"] = {
        "CPU": 32,
        "GPU": 4,
        "accelerator_type:V100": 1
    }

    # Canned describe_instance_types response covering the three types.
    boto3_dict = {
        "InstanceTypes": [{
            "InstanceType": "m4.xlarge",
            "VCpuInfo": {
                "DefaultVCpus": 4
            }
        }, {
            "InstanceType": "m4.4xlarge",
            "VCpuInfo": {
                "DefaultVCpus": 16
            }
        }, {
            "InstanceType": "p3.8xlarge",
            "VCpuInfo": {
                "DefaultVCpus": 32
            },
            "GpuInfo": {
                "Gpus": [{
                    "Name": "V100",
                    "Count": 4
                }]
            }
        }]
    }
    boto3_mock = Mock()
    describe_instance_types_mock = Mock()
    describe_instance_types_mock.describe_instance_types = MagicMock(
        return_value=boto3_dict)
    boto3_mock.client = MagicMock(
        return_value=describe_instance_types_mock)
    with patch.multiple(
            "ray.autoscaler._private.aws.node_provider",
            boto3=boto3_mock,
    ):
        new_config = prepare_config(new_config)
        try:
            validate_config(new_config)
            # Previously a discarded `==` expression; assert it so an
            # autofill mismatch actually fails the test.
            assert expected_available_node_types == \
                new_config["available_node_types"]
        except Exception:
            self.fail("Config did not pass multi node types auto fill test!")
def teardown_cluster(config_file: str, yes: bool, workers_only: bool,
                     override_cluster_name: Optional[str],
                     keep_min_workers: bool):
    """Destroys all nodes of a Ray cluster described by a config json.

    Args:
        config_file: Path to the cluster yaml.
        yes: Skip the interactive confirmation prompt.
        workers_only: Terminate workers but leave the head node running.
        override_cluster_name: If set, replaces the config's cluster_name.
        keep_min_workers: Keep `min_workers` random workers alive.
    """
    config = yaml.safe_load(open(config_file).read())
    if override_cluster_name is not None:
        config["cluster_name"] = override_cluster_name
    config = prepare_config(config)
    validate_config(config)

    cli_logger.confirm(yes, "Destroying cluster.", _abort=True)
    cli_logger.old_confirm("This will destroy your cluster", yes)

    if not workers_only:
        # Best-effort `ray stop` on the cluster before terminating nodes;
        # failures are logged and termination proceeds anyway.
        try:
            exec_cluster(
                config_file,
                cmd="ray stop",
                run_env="auto",
                screen=False,
                tmux=False,
                stop=False,
                start=False,
                override_cluster_name=override_cluster_name,
                port_forward=None,
                with_output=False)
        except Exception as e:
            # todo: add better exception info
            cli_logger.verbose_error("{}", str(e))
            cli_logger.warning(
                "Exception occured when stopping the cluster Ray runtime "
                "(use -v to dump teardown exceptions).")
            cli_logger.warning(
                "Ignoring the exception and "
                "attempting to shut down the cluster nodes anyway.")

            cli_logger.old_exception(
                logger, "Ignoring error attempting a clean shutdown.")

    provider = _get_node_provider(config["provider"], config["cluster_name"])
    try:

        def remaining_nodes():
            # Nodes still pending termination on this pass, honoring the
            # --keep-min-workers and --workers-only flags.
            workers = provider.non_terminated_nodes(
                {TAG_RAY_NODE_KIND: NODE_KIND_WORKER})

            if keep_min_workers:
                min_workers = config.get("min_workers", 0)

                cli_logger.print(
                    "{} random worker nodes will not be shut down. " +
                    cf.dimmed("(due to {})"), cf.bold(min_workers),
                    cf.bold("--keep-min-workers"))
                cli_logger.old_info(logger,
                                    "teardown_cluster: Keeping {} nodes...",
                                    min_workers)

                workers = random.sample(workers, len(workers) - min_workers)

            # todo: it's weird to kill the head node but not all workers
            if workers_only:
                cli_logger.print(
                    "The head node will not be shut down. " +
                    cf.dimmed("(due to {})"), cf.bold("--workers-only"))

                return workers

            head = provider.non_terminated_nodes(
                {TAG_RAY_NODE_KIND: NODE_KIND_HEAD})

            return head + workers

        def run_docker_stop(node, container_name):
            # Best-effort `docker stop` on a node before it is terminated.
            try:
                updater = NodeUpdaterThread(
                    node_id=node,
                    provider_config=config["provider"],
                    provider=provider,
                    auth_config=config["auth"],
                    cluster_name=config["cluster_name"],
                    file_mounts=config["file_mounts"],
                    initialization_commands=[],
                    setup_commands=[],
                    ray_start_commands=[],
                    runtime_hash="",
                    file_mounts_contents_hash="",
                    is_head_node=False,
                    docker_config=config.get("docker"))
                _exec(
                    updater,
                    f"docker stop {container_name}",
                    False,
                    False,
                    run_env="host")
            except Exception:
                cli_logger.warning(f"Docker stop failed on {node}")
                cli_logger.old_warning(logger,
                                       f"Docker stop failed on {node}")

        # Loop here to check that both the head and worker nodes are actually
        # really gone
        A = remaining_nodes()

        container_name = config.get("docker", {}).get("container_name")
        if container_name:
            for node in A:
                run_docker_stop(node, container_name)

        with LogTimer("teardown_cluster: done."):
            while A:
                cli_logger.old_info(
                    logger, "teardown_cluster: "
                    "Shutting down {} nodes...", len(A))

                provider.terminate_nodes(A)

                cli_logger.print(
                    "Requested {} nodes to shut down.",
                    cf.bold(len(A)),
                    _tags=dict(interval="1s"))

                time.sleep(POLL_INTERVAL)  # todo: interval should be a variable
                A = remaining_nodes()
                cli_logger.print("{} nodes remaining after {} second(s).",
                                 cf.bold(len(A)), POLL_INTERVAL)
        cli_logger.success("No nodes remaining.")
    finally:
        # Always release provider resources, even if termination failed.
        provider.cleanup()
def testValidateDefaultConfigAWSMultiNodeTypes(self):
    """Validate the AWS multi-node-type example with resource autofill.

    boto3's `describe_instance_types` is mocked via the provider module's
    client cache so CPU/GPU/memory resources can be filled offline.

    Fix: the expected-vs-filled comparison was a bare `==` expression
    whose result was discarded, so a mismatch could never fail the test;
    it is now asserted.
    """
    aws_config_path = os.path.join(
        RAY_PATH, "autoscaler/aws/example-multi-node-type.yaml")
    with open(aws_config_path) as f:
        config = yaml.safe_load(f)
    new_config = copy.deepcopy(config)
    # modify it here
    new_config["available_node_types"] = {
        "cpu_4_ondemand": new_config["available_node_types"][
            "cpu_4_ondemand"],
        "cpu_16_spot": new_config["available_node_types"]["cpu_16_spot"],
        "gpu_8_ondemand": new_config["available_node_types"][
            "gpu_8_ondemand"]
    }
    orig_new_config = copy.deepcopy(new_config)
    expected_available_node_types = orig_new_config["available_node_types"]
    expected_available_node_types["cpu_4_ondemand"]["resources"] = {
        "CPU": 4
    }
    expected_available_node_types["cpu_16_spot"]["resources"] = {
        "CPU": 16,
        "memory": 41231686041,
        "Custom1": 1,
        "is_spot": 1
    }
    expected_available_node_types["gpu_8_ondemand"]["resources"] = {
        "CPU": 32,
        "memory": 157195803033,
        "GPU": 4,
        "accelerator_type:V100": 1
    }

    # Canned describe_instance_types response covering the three types.
    boto3_dict = {
        "InstanceTypes": [{
            "InstanceType": "m4.xlarge",
            "VCpuInfo": {
                "DefaultVCpus": 4
            },
            "MemoryInfo": {
                "SizeInMiB": 16384
            }
        }, {
            "InstanceType": "m4.4xlarge",
            "VCpuInfo": {
                "DefaultVCpus": 16
            },
            "MemoryInfo": {
                "SizeInMiB": 65536
            }
        }, {
            "InstanceType": "p3.8xlarge",
            "VCpuInfo": {
                "DefaultVCpus": 32
            },
            "MemoryInfo": {
                "SizeInMiB": 249856
            },
            "GpuInfo": {
                "Gpus": [{
                    "Name": "V100",
                    "Count": 4
                }]
            }
        }]
    }
    describe_instance_types_mock = Mock()
    describe_instance_types_mock.describe_instance_types = MagicMock(
        return_value=boto3_dict)
    client_cache_mock = MagicMock(
        return_value=describe_instance_types_mock)
    with patch.multiple(
            "ray.autoscaler._private.aws.node_provider",
            client_cache=client_cache_mock,
    ):
        new_config = prepare_config(new_config)
        importer = _NODE_PROVIDERS.get(new_config["provider"]["type"])
        provider_cls = importer(new_config["provider"])

        try:
            new_config = \
                provider_cls.fillout_available_node_types_resources(
                    new_config)
            validate_config(new_config)
            # Previously a discarded `==` expression; assert it so an
            # autofill mismatch actually fails the test.
            assert expected_available_node_types == new_config[
                "available_node_types"]
        except Exception:
            self.fail(
                "Config did not pass multi node types auto fill test!")