def testImportingCorrectClass(self):
    """Check correct import when coordinator_address is in config yaml."""
    provider_config = {"coordinator_address": "fake_address:1234"}
    coordinator_node_provider = _NODE_PROVIDERS.get("local")(provider_config)
    assert coordinator_node_provider is CoordinatorSenderNodeProvider
    local_node_provider = _NODE_PROVIDERS.get("local")({})
    assert local_node_provider is LocalNodeProvider
def _fillout_available_node_types_resources(
        cluster_config: Dict[str, Any]) -> Dict[str, Any]:
    """Fills out missing "resources" field for available_node_types."""
    if "available_node_types" in cluster_config:
        importer = _NODE_PROVIDERS.get(cluster_config["provider"]["type"])
        if importer is not None:
            provider_cls = importer(cluster_config["provider"])
            return provider_cls.fillout_available_node_types_resources(
                cluster_config)
    return cluster_config
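# A minimal usage sketch (not part of the source): the config below is a
# hypothetical two-key cluster config, and provider type "aws" is assumed to
# be registered in _NODE_PROVIDERS. Calling this against a real provider
# would reach out to the cloud API, so this is illustrative only.
example_config = {
    "provider": {"type": "aws", "region": "us-west-2"},
    "available_node_types": {
        "cpu_4_ondemand": {
            "node_config": {"InstanceType": "m4.xlarge"},
            "max_workers": 2,
        },
    },
}
filled = _fillout_available_node_types_resources(example_config)
# For AWS, "resources" would then include autodetected fields such as
# {"CPU": 4} for m4.xlarge.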
def _bootstrap_config(config: Dict[str, Any],
                      no_config_cache: bool = False) -> Dict[str, Any]:
    config = prepare_config(config)

    hasher = hashlib.sha1()
    hasher.update(json.dumps([config], sort_keys=True).encode("utf-8"))
    cache_key = os.path.join(tempfile.gettempdir(),
                             "ray-config-{}".format(hasher.hexdigest()))

    if os.path.exists(cache_key) and not no_config_cache:
        cli_logger.old_info(logger, "Using cached config at {}", cache_key)

        config_cache = json.loads(open(cache_key).read())
        if config_cache.get("_version", -1) == CONFIG_CACHE_VERSION:
            # todo: is it fine to re-resolve? afaik it should be.
            # we can have migrations otherwise or something
            # but this seems overcomplicated given that resolving is
            # relatively cheap
            try_reload_log_state(config_cache["config"]["provider"],
                                 config_cache.get("provider_log_info"))

            if log_once("_printed_cached_config_warning"):
                cli_logger.verbose_warning(
                    "Loaded cached provider configuration "
                    "from " + cf.bold("{}"), cache_key)
                if cli_logger.verbosity == 0:
                    cli_logger.warning("Loaded cached provider configuration")
                cli_logger.warning(
                    "If you experience issues with "
                    "the cloud provider, try re-running "
                    "the command with {}.", cf.bold("--no-config-cache"))

            return config_cache["config"]
        else:
            cli_logger.warning(
                "Found cached cluster config "
                "but the version " + cf.bold("{}") + " "
                "(expected " + cf.bold("{}") + ") does not match.\n"
                "This is normal if cluster launcher was updated.\n"
                "Config will be re-resolved.",
                config_cache.get("_version", "none"), CONFIG_CACHE_VERSION)

    validate_config(config)

    importer = _NODE_PROVIDERS.get(config["provider"]["type"])
    if not importer:
        raise NotImplementedError("Unsupported provider {}".format(
            config["provider"]))

    provider_cls = importer(config["provider"])
def testValidateDefaultConfigAWSMultiNodeTypes(self):
    aws_config_path = os.path.join(
        RAY_PATH, "autoscaler/aws/example-multi-node-type.yaml")
    with open(aws_config_path) as f:
        config = yaml.safe_load(f)
    new_config = copy.deepcopy(config)
    # modify it here
    new_config["available_node_types"] = {
        "cpu_4_ondemand": new_config["available_node_types"][
            "cpu_4_ondemand"],
        "cpu_16_spot": new_config["available_node_types"]["cpu_16_spot"],
        "gpu_8_ondemand": new_config["available_node_types"][
            "gpu_8_ondemand"]
    }
    orig_new_config = copy.deepcopy(new_config)
    expected_available_node_types = orig_new_config["available_node_types"]
    expected_available_node_types["cpu_4_ondemand"]["resources"] = {
        "CPU": 4
    }
    expected_available_node_types["cpu_16_spot"]["resources"] = {
        "CPU": 16,
        "memory": 41231686041,
        "Custom1": 1,
        "is_spot": 1
    }
    expected_available_node_types["gpu_8_ondemand"]["resources"] = {
        "CPU": 32,
        "memory": 157195803033,
        "GPU": 4,
        "accelerator_type:V100": 1
    }

    boto3_dict = {
        "InstanceTypes": [{
            "InstanceType": "m4.xlarge",
            "VCpuInfo": {
                "DefaultVCpus": 4
            },
            "MemoryInfo": {
                "SizeInMiB": 16384
            }
        }, {
            "InstanceType": "m4.4xlarge",
            "VCpuInfo": {
                "DefaultVCpus": 16
            },
            "MemoryInfo": {
                "SizeInMiB": 65536
            }
        }, {
            "InstanceType": "p3.8xlarge",
            "VCpuInfo": {
                "DefaultVCpus": 32
            },
            "MemoryInfo": {
                "SizeInMiB": 249856
            },
            "GpuInfo": {
                "Gpus": [{
                    "Name": "V100",
                    "Count": 4
                }]
            }
        }]
    }
    describe_instance_types_mock = Mock()
    describe_instance_types_mock.describe_instance_types = MagicMock(
        return_value=boto3_dict)
    client_cache_mock = MagicMock(return_value=describe_instance_types_mock)
    with patch.multiple(
            "ray.autoscaler._private.aws.node_provider",
            client_cache=client_cache_mock,
    ):
        new_config = prepare_config(new_config)
        importer = _NODE_PROVIDERS.get(new_config["provider"]["type"])
        provider_cls = importer(new_config["provider"])

        try:
            new_config = \
                provider_cls.fillout_available_node_types_resources(
                    new_config)
            validate_config(new_config)
            assert expected_available_node_types == new_config[
                "available_node_types"]
        except Exception:
            self.fail(
                "Config did not pass multi node types auto fill test!")
def create_or_update_cluster(config_file: str,
                             override_min_workers: Optional[int],
                             override_max_workers: Optional[int],
                             no_restart: bool,
                             restart_only: bool,
                             yes: bool,
                             override_cluster_name: Optional[str] = None,
                             no_config_cache: bool = False,
                             redirect_command_output: Optional[bool] = False,
                             use_login_shells: bool = True) -> None:
    """Creates or updates an autoscaling Ray cluster from a config yaml."""
    set_using_login_shells(use_login_shells)
    if not use_login_shells:
        cmd_output_util.set_allow_interactive(False)
    if redirect_command_output is None:
        # Do not redirect by default.
        cmd_output_util.set_output_redirected(False)
    else:
        cmd_output_util.set_output_redirected(redirect_command_output)

    def handle_yaml_error(e):
        cli_logger.error("Cluster config invalid")
        cli_logger.newline()
        cli_logger.error("Failed to load YAML file " + cf.bold("{}"),
                         config_file)
        cli_logger.newline()
        with cli_logger.verbatim_error_ctx("PyYAML error:"):
            cli_logger.error(e)
        cli_logger.abort()

    try:
        config = yaml.safe_load(open(config_file).read())
    except FileNotFoundError:
        cli_logger.abort(
            "Provided cluster configuration file ({}) does not exist",
            cf.bold(config_file))
        raise
    except yaml.parser.ParserError as e:
        handle_yaml_error(e)
        raise
    except yaml.scanner.ScannerError as e:
        handle_yaml_error(e)
        raise

    # todo: validate file_mounts, ssh keys, etc.

    importer = _NODE_PROVIDERS.get(config["provider"]["type"])
    if not importer:
        cli_logger.abort(
            "Unknown provider type " + cf.bold("{}") + "\n"
            "Available providers are: {}", config["provider"]["type"],
            cli_logger.render_list([
                k for k in _NODE_PROVIDERS.keys()
                if _NODE_PROVIDERS[k] is not None
            ]))
        raise NotImplementedError("Unsupported provider {}".format(
            config["provider"]))

    printed_overrides = False

    def handle_cli_override(key, override):
        if override is not None:
            if key in config:
                nonlocal printed_overrides
                printed_overrides = True
                cli_logger.warning(
                    "`{}` override provided on the command line.\n"
                    " Using " + cf.bold("{}") + cf.dimmed(
                        " [configuration file has " + cf.bold("{}") + "]"),
                    key, override, config[key])
            config[key] = override

    handle_cli_override("min_workers", override_min_workers)
    handle_cli_override("max_workers", override_max_workers)
    handle_cli_override("cluster_name", override_cluster_name)

    if printed_overrides:
        cli_logger.newline()

    cli_logger.labeled_value("Cluster", config["cluster_name"])
    cli_logger.newline()

    config = _bootstrap_config(config, no_config_cache=no_config_cache)

    try_logging_config(config)
    get_or_create_head_node(config, config_file, no_restart, restart_only,
                            yes, override_cluster_name)
def _bootstrap_config(config: Dict[str, Any],
                      no_config_cache: bool = False) -> Dict[str, Any]:
    config = prepare_config(config)

    hasher = hashlib.sha1()
    hasher.update(json.dumps([config], sort_keys=True).encode("utf-8"))
    cache_key = os.path.join(tempfile.gettempdir(),
                             "ray-config-{}".format(hasher.hexdigest()))

    if os.path.exists(cache_key) and not no_config_cache:
        config_cache = json.loads(open(cache_key).read())
        if config_cache.get("_version", -1) == CONFIG_CACHE_VERSION:
            # todo: is it fine to re-resolve? afaik it should be.
            # we can have migrations otherwise or something
            # but this seems overcomplicated given that resolving is
            # relatively cheap
            try_reload_log_state(config_cache["config"]["provider"],
                                 config_cache.get("provider_log_info"))
            if log_once("_printed_cached_config_warning"):
                cli_logger.verbose_warning(
                    "Loaded cached provider configuration "
                    "from " + cf.bold("{}"), cache_key)
                if cli_logger.verbosity == 0:
                    cli_logger.warning("Loaded cached provider configuration")
                cli_logger.warning(
                    "If you experience issues with "
                    "the cloud provider, try re-running "
                    "the command with {}.", cf.bold("--no-config-cache"))
            return config_cache["config"]
        else:
            cli_logger.warning(
                "Found cached cluster config "
                "but the version " + cf.bold("{}") + " "
                "(expected " + cf.bold("{}") + ") does not match.\n"
                "This is normal if cluster launcher was updated.\n"
                "Config will be re-resolved.",
                config_cache.get("_version", "none"), CONFIG_CACHE_VERSION)

    importer = _NODE_PROVIDERS.get(config["provider"]["type"])
    if not importer:
        raise NotImplementedError("Unsupported provider {}".format(
            config["provider"]))

    provider_cls = importer(config["provider"])

    cli_logger.print("Checking {} environment settings",
                     _PROVIDER_PRETTY_NAMES.get(config["provider"]["type"]))
    try:
        config = provider_cls.fillout_available_node_types_resources(config)
    except Exception as exc:
        if cli_logger.verbosity > 2:
            logger.exception("Failed to autodetect node resources.")
        else:
            cli_logger.warning(
                f"Failed to autodetect node resources: {str(exc)}. "
                "You can see full stack trace with higher verbosity.")

    # NOTE: if `resources` field is missing, validate_config for providers
    # other than AWS and Kubernetes will fail (the schema error will ask the
    # user to manually fill the resources) as we currently support autofilling
    # resources for AWS and Kubernetes only.
    validate_config(config)

    resolved_config = provider_cls.bootstrap_config(config)

    if not no_config_cache:
        with open(cache_key, "w") as f:
            config_cache = {
                "_version": CONFIG_CACHE_VERSION,
                "provider_log_info": try_get_log_state(config["provider"]),
                "config": resolved_config
            }
            f.write(json.dumps(config_cache))
    return resolved_config
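# A standalone sketch of the caching scheme used above (helper name is
# hypothetical, extracted for illustration): the cache key is a SHA-1 digest
# of the prepared config, so any change to the config produces a different
# cache file under the system temp directory, and stale entries are simply
# never read again.
import hashlib
import json
import os
import tempfile


def example_cache_key(config: dict) -> str:
    hasher = hashlib.sha1()
    hasher.update(json.dumps([config], sort_keys=True).encode("utf-8"))
    return os.path.join(tempfile.gettempdir(),
                        "ray-config-{}".format(hasher.hexdigest()))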
def create_or_update_cluster(
        config_file: str,
        override_min_workers: Optional[int],
        override_max_workers: Optional[int],
        no_restart: bool,
        restart_only: bool,
        yes: bool,
        override_cluster_name: Optional[str] = None,
        no_config_cache: bool = False,
        redirect_command_output: Optional[bool] = False,
        use_login_shells: bool = True,
        no_monitor_on_head: bool = False) -> Dict[str, Any]:
    """Creates or updates an autoscaling Ray cluster from a config yaml."""
    # no_monitor_on_head is an internal flag used by the Ray K8s operator.
    # If True, prevents autoscaling config sync to the Ray head during cluster
    # creation. See https://github.com/ray-project/ray/pull/13720.
    set_using_login_shells(use_login_shells)
    if not use_login_shells:
        cmd_output_util.set_allow_interactive(False)
    if redirect_command_output is None:
        # Do not redirect by default.
        cmd_output_util.set_output_redirected(False)
    else:
        cmd_output_util.set_output_redirected(redirect_command_output)

    def handle_yaml_error(e):
        cli_logger.error("Cluster config invalid")
        cli_logger.newline()
        cli_logger.error("Failed to load YAML file " + cf.bold("{}"),
                         config_file)
        cli_logger.newline()
        with cli_logger.verbatim_error_ctx("PyYAML error:"):
            cli_logger.error(e)
        cli_logger.abort()

    try:
        config = yaml.safe_load(open(config_file).read())
    except FileNotFoundError:
        cli_logger.abort(
            "Provided cluster configuration file ({}) does not exist",
            cf.bold(config_file))
    except yaml.parser.ParserError as e:
        handle_yaml_error(e)
        raise
    except yaml.scanner.ScannerError as e:
        handle_yaml_error(e)
        raise
    global_event_system.execute_callback(CreateClusterEvent.up_started,
                                         {"cluster_config": config})

    # todo: validate file_mounts, ssh keys, etc.

    importer = _NODE_PROVIDERS.get(config["provider"]["type"])
    if not importer:
        cli_logger.abort(
            "Unknown provider type " + cf.bold("{}") + "\n"
            "Available providers are: {}", config["provider"]["type"],
            cli_logger.render_list([
                k for k in _NODE_PROVIDERS.keys()
                if _NODE_PROVIDERS[k] is not None
            ]))

    printed_overrides = False

    def handle_cli_override(key, override):
        if override is not None:
            if key in config:
                nonlocal printed_overrides
                printed_overrides = True
                cli_logger.warning(
                    "`{}` override provided on the command line.\n"
                    " Using " + cf.bold("{}") + cf.dimmed(
                        " [configuration file has " + cf.bold("{}") + "]"),
                    key, override, config[key])
            config[key] = override

    handle_cli_override("min_workers", override_min_workers)
    handle_cli_override("max_workers", override_max_workers)
    handle_cli_override("cluster_name", override_cluster_name)

    if printed_overrides:
        cli_logger.newline()

    cli_logger.labeled_value("Cluster", config["cluster_name"])

    cli_logger.newline()
    config = _bootstrap_config(config, no_config_cache=no_config_cache)

    try_logging_config(config)
    get_or_create_head_node(config, config_file, no_restart, restart_only,
                            yes, override_cluster_name, no_monitor_on_head)
    return config
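# A hedged invocation sketch: roughly how a caller (e.g. the `ray up` code
# path) might drive the function. "cluster.yaml" and the flag values below
# are illustrative choices, not the CLI's exact defaults.
resolved_config = create_or_update_cluster(
    config_file="cluster.yaml",
    override_min_workers=None,
    override_max_workers=None,
    no_restart=False,
    restart_only=False,
    yes=True,
    no_config_cache=True,  # skip the cached provider config for this run
)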