Example #1
    def testFillEdgeLegacyConfigs(self):
        # Test edge cases: legacy configs which specify workers but not head
        # or vice-versa.
        no_head = load_test_config("test_no_head.yaml")
        aws_defaults = _get_default_config(no_head["provider"])
        head_prepared = prepare_config(no_head)
        assert head_prepared["available_node_types"][
            "ray-legacy-head-node-type"]["node_config"] ==\
            aws_defaults["available_node_types"][
                "ray.head.default"]["node_config"]
        assert head_prepared["head_node"] == {}
        # Custom worker config preserved
        node_types = head_prepared["available_node_types"]
        worker_type = node_types["ray-legacy-worker-node-type"]
        assert worker_type["node_config"] == head_prepared["worker_nodes"] == {
            "foo": "bar"
        }

        no_workers = load_test_config("test_no_workers.yaml")
        workers_prepared = prepare_config(no_workers)
        assert workers_prepared["available_node_types"][
            "ray-legacy-worker-node-type"]["node_config"] ==\
            aws_defaults["available_node_types"][
                "ray.worker.default"]["node_config"]
        assert workers_prepared["worker_nodes"] == {}
        # Custom head config preserved
        node_types = workers_prepared["available_node_types"]
        head_type = node_types["ray-legacy-head-node-type"]
        assert head_type["node_config"] == workers_prepared["head_node"] == {
            "baz": "qux"
        }
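
For readers unfamiliar with the legacy format, the sketch below contrasts the two config shapes this test relies on. It is illustrative only (no Ray API calls); the InstanceType value is a hypothetical placeholder, while the legacy type names and the {"foo": "bar"} worker config are the ones asserted above.

# Illustrative contrast: a legacy config keys head/worker settings at the top
# level, while prepare_config re-expresses them under available_node_types
# using the legacy type names asserted in the test above.
legacy_style = {
    "head_node": {"InstanceType": "m5.large"},   # hypothetical node_config
    "worker_nodes": {"foo": "bar"},
}
node_type_style = {
    "available_node_types": {
        "ray-legacy-head-node-type": {
            "node_config": {"InstanceType": "m5.large"}},
        "ray-legacy-worker-node-type": {"node_config": {"foo": "bar"}},
    },
}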
Example #2
    def testMaxWorkerDefault(self):
        # Load config, call prepare config, check that default max_workers
        # is filled correctly for node types that don't specify it.
        # Check that max_workers is untouched for node types
        # that do specify it.
        config = load_test_config("test_multi_node.yaml")
        node_types = config["available_node_types"]

        # Max workers initially absent for this node type.
        assert "max_workers" not in node_types["worker_node_max_unspecified"]
        # Max workers specified for this node type.
        assert "max_workers" in node_types["worker_node_max_specified"]

        prepared_config = prepare_config(config)
        prepared_node_types = prepared_config["available_node_types"]
        # Max workers unchanged.
        assert (node_types["worker_node_max_specified"]["max_workers"] ==
                prepared_node_types["worker_node_max_specified"]["max_workers"]
                == 3)
        # Max workers auto-filled with specified cluster-wide value of 5.
        assert (
            config["max_workers"] ==
            prepared_node_types["worker_node_max_unspecified"]["max_workers"]
            == 5)

        # Repeat with a config that doesn't specify global max workers.
        # Default value of 2 should be pulled in for global max workers.
        config = load_test_config("test_multi_node.yaml")
        # Delete global max_workers so it can be autofilled with default of 2.
        del config["max_workers"]
        node_types = config["available_node_types"]

        # Max workers initially absent for this node type.
        assert "max_workers" not in node_types["worker_node_max_unspecified"]
        # Max workers specified for this node type.
        assert "max_workers" in node_types["worker_node_max_specified"]

        prepared_config = prepare_config(config)
        prepared_node_types = prepared_config["available_node_types"]
        # Max workers unchanged.
        assert (node_types["worker_node_max_specified"]["max_workers"] ==
                prepared_node_types["worker_node_max_specified"]["max_workers"]
                == 3)
        # Max workers auto-filled with default cluster-wide value of 2.
        assert (
            prepared_config["max_workers"] ==
            prepared_node_types["worker_node_max_unspecified"]["max_workers"]
            == 2)
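
The fill rule exercised above can be summarized with a small self-contained sketch. This is an illustration of the expected behavior, not Ray's implementation; the default of 2 is the value the second half of the test expects, and the dict below is hypothetical.

# Self-contained sketch of the max_workers fill rule (illustration only):
# a per-node-type max_workers is left untouched if present, otherwise the
# cluster-wide value (default 2) is copied in.
config = {
    "max_workers": 5,
    "available_node_types": {
        "worker_node_max_specified": {"max_workers": 3},
        "worker_node_max_unspecified": {},
    },
}
cluster_max = config.get("max_workers", 2)
for node_type in config["available_node_types"].values():
    node_type.setdefault("max_workers", cluster_max)

node_types = config["available_node_types"]
assert node_types["worker_node_max_specified"]["max_workers"] == 3
assert node_types["worker_node_max_unspecified"]["max_workers"] == 5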
Example #3
    def testLegacyYaml(self):
        # Test correct default-merging behavior for legacy yamls.
        providers = ["aws", "azure"]
        for provider in providers:
            path = os.path.join(RAY_PATH, "autoscaler", provider,
                                "example-full-legacy.yaml")
            legacy_config = yaml.safe_load(open(path).read())
            # custom head and workers
            legacy_config["head_node"] = {"blahblah": 0}
            legacy_config["worker_nodes"] = {"halbhalhb": 0}
            legacy_config_copy = copy.deepcopy(legacy_config)
            prepared_legacy = prepare_config(legacy_config_copy)
            assert prepared_legacy["available_node_types"][
                NODE_TYPE_LEGACY_HEAD]["max_workers"] == 0
            assert prepared_legacy["available_node_types"][
                NODE_TYPE_LEGACY_HEAD]["min_workers"] == 0
            assert prepared_legacy["available_node_types"][
                NODE_TYPE_LEGACY_HEAD]["node_config"] == legacy_config[
                    "head_node"]

            assert prepared_legacy["available_node_types"][
                NODE_TYPE_LEGACY_WORKER]["max_workers"] == 2
            assert prepared_legacy["available_node_types"][
                NODE_TYPE_LEGACY_WORKER]["min_workers"] == 0
            assert prepared_legacy["available_node_types"][
                NODE_TYPE_LEGACY_WORKER]["node_config"] == legacy_config[
                    "worker_nodes"]
Example #4
    def testValidateDefaultConfig(self):
        for config_path in CONFIG_PATHS:
            try:
                if "aws/example-multi-node-type.yaml" in config_path:
                    # aws tested in testValidateDefaultConfigAWSMultiNodeTypes.
                    continue
                if "local" in config_path:
                    # local tested in testValidateLocal
                    continue
                if "fake_multi_node" in config_path:
                    # not supported with ray up
                    continue
                if "kuberay" in config_path:
                    # not supported with ray up
                    continue
                with open(config_path) as f:
                    config = yaml.safe_load(f)
                config = prepare_config(config)
                if config["provider"]["type"] == "kubernetes":
                    KubernetesNodeProvider.\
                        fillout_available_node_types_resources(config)
                validate_config(config)
            except Exception:
                logging.exception("")
                self.fail(
                    f"Config {config_path} did not pass validation test!")
Example #5
    def testValidateDefaultConfig(self):
        for config_path in CONFIG_PATHS:
            with open(config_path) as f:
                config = yaml.safe_load(f)
            config = prepare_config(config)
            try:
                validate_config(config)
            except Exception:
                self.fail("Config did not pass validation test!")
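
Outside the test harness, the same prepare/validate flow looks roughly like the sketch below. The import location matches what these tests use internally but may differ across Ray versions, and "my-cluster.yaml" is a hypothetical path.

# Minimal sketch of the flow the tests above exercise (assumes Ray is
# installed; the config path is hypothetical and the import location may
# vary across Ray versions).
import yaml

from ray.autoscaler._private.util import prepare_config, validate_config

with open("my-cluster.yaml") as f:      # hypothetical cluster config
    config = yaml.safe_load(f)

config = prepare_config(config)         # fill defaults and node types
validate_config(config)                 # raises if the config is invalid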
Example #6
    def testValidateDefaultConfig(self):
        for config_path in CONFIG_PATHS:
            if "aws/example-multi-node-type.yaml" in config_path:
                # aws is tested in testValidateDefaultConfigAWSMultiNodeTypes.
                continue
            with open(config_path) as f:
                config = yaml.safe_load(f)
            config = prepare_config(config)
            try:
                validate_config(config)
            except Exception:
                self.fail("Config did not pass validation test!")
Example #7
    def testExampleFull(self):
        """
        Test that example-full yamls are unmodified by prepare_config,
        except possibly by having setup_commands merged.
        """
        providers = ["aws", "gcp", "azure"]
        for provider in providers:
            path = os.path.join(RAY_PATH, "autoscaler", provider,
                                "example-full.yaml")
            config = yaml.safe_load(open(path).read())
            config_copy = copy.deepcopy(config)
            merge_setup_commands(config_copy)
            assert config_copy == prepare_config(config)
Example #8
    def testValidateNetworkConfig(self):
        web_yaml = "https://raw.githubusercontent.com/ray-project/ray/" \
            "master/python/ray/autoscaler/aws/example-full.yaml"
        response = urllib.request.urlopen(web_yaml, timeout=5)
        content = response.read()
        with tempfile.TemporaryFile() as f:
            f.write(content)
            f.seek(0)
            config = yaml.safe_load(f)
        config = prepare_config(config)
        try:
            validate_config(config)
        except Exception:
            self.fail("Config did not pass validation test!")
Example #9
    def testValidateLocal(self):
        """
        Tests local node provider config validation for the most common use
        case of bootstrapping a cluster at a static set of ips.
        """
        local_config_path = os.path.join(
            RAY_PATH, "autoscaler/local/example-minimal-manual.yaml")
        base_config = yaml.safe_load(open(local_config_path).read())
        base_config["provider"]["head_ip"] = "xxx.yyy"
        base_config["provider"]["worker_ips"] = [
            "aaa.bbb", "ccc.ddd", "eee.fff"
        ]
        base_config["auth"]["ssh_user"] = "******"
        base_config["auth"]["ssh_private_key"] = "~/.ssh/id_rsa"

        test_prepare_config = copy.deepcopy(base_config)
        prepared_config = prepare_config(test_prepare_config)
        try:
            validate_config(prepared_config)
        except Exception:
            self.fail("Failed to validate local/example-minimal-manual.yaml")
        expected_prepared = yaml.safe_load(EXPECTED_LOCAL_CONFIG_STR)
        assert prepared_config == expected_prepared

        no_worker_config = copy.deepcopy(base_config)
        del no_worker_config["provider"]["worker_ips"]
        with pytest.raises(ClickException):
            prepare_config(no_worker_config)
        no_head_config = copy.deepcopy(base_config)
        del no_head_config["provider"]["head_ip"]
        with pytest.raises(ClickException):
            prepare_config(no_head_config)
        for field in "head_node", "worker_nodes", "available_node_types":
            faulty_config = copy.deepcopy(base_config)
            faulty_config[field] = "This field shouldn't be in here."
            with pytest.raises(ClickException):
                prepare_config(faulty_config)

        too_many_workers_config = copy.deepcopy(base_config)

        # More workers requested than the three available ips.
        too_many_workers_config["max_workers"] = 10
        too_many_workers_config["min_workers"] = 10
        prepared_config = prepare_config(too_many_workers_config)

        # Check that worker config numbers were clipped to 3.
        assert prepared_config == expected_prepared
Example #10
def _bootstrap_config(config: Dict[str, Any],
                      no_config_cache: bool = False) -> Dict[str, Any]:
    config = prepare_config(config)

    hasher = hashlib.sha1()
    hasher.update(json.dumps([config], sort_keys=True).encode("utf-8"))
    cache_key = os.path.join(tempfile.gettempdir(),
                             "ray-config-{}".format(hasher.hexdigest()))

    if os.path.exists(cache_key) and not no_config_cache:
        cli_logger.old_info(logger, "Using cached config at {}", cache_key)

        config_cache = json.loads(open(cache_key).read())
        if config_cache.get("_version", -1) == CONFIG_CACHE_VERSION:
            # todo: is it fine to re-resolve? It should be; the alternative
            # would be config migrations, which seems overcomplicated given
            # that resolving is relatively cheap.
            try_reload_log_state(config_cache["config"]["provider"],
                                 config_cache.get("provider_log_info"))

            if log_once("_printed_cached_config_warning"):
                cli_logger.verbose_warning(
                    "Loaded cached provider configuration "
                    "from " + cf.bold("{}"), cache_key)
                if cli_logger.verbosity == 0:
                    cli_logger.warning("Loaded cached provider configuration")
                cli_logger.warning(
                    "If you experience issues with "
                    "the cloud provider, try re-running "
                    "the command with {}.", cf.bold("--no-config-cache"))

            return config_cache["config"]
        else:
            cli_logger.warning(
                "Found cached cluster config "
                "but the version " + cf.bold("{}") + " "
                "(expected " + cf.bold("{}") + ") does not match.\n"
                "This is normal if cluster launcher was updated.\n"
                "Config will be re-resolved.",
                config_cache.get("_version", "none"), CONFIG_CACHE_VERSION)
    validate_config(config)

    importer = _NODE_PROVIDERS.get(config["provider"]["type"])
    if not importer:
        raise NotImplementedError("Unsupported provider {}".format(
            config["provider"]))

    provider_cls = importer(config["provider"])
Example #11
    def testValidateDefaultConfig(self):
        for config_path in CONFIG_PATHS:
            if "aws/example-multi-node-type.yaml" in config_path:
                # aws is tested in testValidateDefaultConfigAWSMultiNodeTypes.
                continue
            with open(config_path) as f:
                config = yaml.safe_load(f)
            config = prepare_config(config)
            if config["provider"]["type"] == "kubernetes":
                KubernetesNodeProvider.fillout_available_node_types_resources(
                    config)
            try:
                validate_config(config)
            except Exception:
                self.fail(
                    f"Config {config_path} did not pass validation test!")
Example #12
    def testValidateCustomSecurityGroupConfig(self):
        aws_config_path = os.path.join(RAY_PATH,
                                       "autoscaler/aws/example-minimal.yaml")
        with open(aws_config_path) as f:
            config = yaml.safe_load(f)

        # Test validate security group with custom permissions
        ip_permissions = [{
            "FromPort": port,
            "ToPort": port,
            "IpProtocol": "TCP",
            "IpRanges": [{
                "CidrIp": "0.0.0.0/0"
            }],
        } for port in [80, 443, 8265]]
        config["provider"].update({
            "security_group": {
                "IpPermissions": ip_permissions
            }
        })
        config = prepare_config(copy.deepcopy(config))
        try:
            validate_config(config)
            assert config["provider"]["security_group"][
                "IpPermissions"] == ip_permissions
        except Exception:
            self.fail(
                "Failed to validate config with security group in bound rules!"
            )

        # Test validate security group with custom name
        group_name = "test_security_group_name"
        config["provider"]["security_group"].update({"GroupName": group_name})

        try:
            validate_config(config)
            assert config["provider"]["security_group"][
                "GroupName"] == group_name
        except Exception:
            self.fail("Failed to validate config with security group name!")
Example #13
    def testValidateDefaultConfigMinMaxWorkers(self):
        aws_config_path = os.path.join(
            RAY_PATH, "autoscaler/aws/example-multi-node-type.yaml")
        with open(aws_config_path) as f:
            config = yaml.safe_load(f)
        config = prepare_config(config)
        for node_type in config["available_node_types"]:
            config["available_node_types"][node_type]["resources"] = config[
                "available_node_types"][node_type].get("resources", {})
        try:
            validate_config(config)
        except Exception:
            self.fail("Config did not pass validation test!")

        config["max_workers"] = 0  # the sum of min_workers is 1.
        with pytest.raises(ValueError):
            validate_config(config)

        # make sure edge case of exactly 1 passes too.
        config["max_workers"] = 1
        try:
            validate_config(config)
        except Exception:
            self.fail("Config did not pass validation test!")
Example #14
    def testValidateLocal(self):
        """
        Tests local node provider config validation for the most common use
        case of bootstrapping a cluster at a static set of ips.
        """
        local_config_path = os.path.join(
            RAY_PATH, "autoscaler/local/example-minimal-manual.yaml")
        base_config = yaml.safe_load(open(local_config_path).read())
        base_config["provider"]["head_ip"] = "xxx.yyy"
        base_config["provider"]["worker_ips"] = [
            "aaa.bbb", "ccc.ddd", "eee.fff"
        ]
        base_config["auth"]["ssh_user"] = "******"
        base_config["auth"]["ssh_private_key"] = "~/.ssh/id_rsa"

        test_prepare_config = copy.deepcopy(base_config)
        prepared_config = prepare_config(test_prepare_config)
        try:
            validate_config(prepared_config)
        except Exception:
            self.fail("Failed to validate local/example-minimal-manual.yaml")
        expected_prepared = yaml.safe_load(EXPECTED_LOCAL_CONFIG_STR)
        assert prepared_config == expected_prepared

        no_worker_config = copy.deepcopy(base_config)
        del no_worker_config["provider"]["worker_ips"]
        with pytest.raises(ClickException):
            prepare_config(no_worker_config)
        no_head_config = copy.deepcopy(base_config)
        del no_head_config["provider"]["head_ip"]
        with pytest.raises(ClickException):
            prepare_config(no_head_config)
        for field in "head_node", "worker_nodes", "available_node_types":
            faulty_config = copy.deepcopy(base_config)
            faulty_config[field] = "This field shouldn't be in here."
            with pytest.raises(ClickException):
                prepare_config(faulty_config)

        too_many_workers_config = copy.deepcopy(base_config)

        # More workers requested than the three available ips.
        too_many_workers_config["max_workers"] = 10
        too_many_workers_config["min_workers"] = 10
        prepared_config = prepare_config(too_many_workers_config)

        # Check that worker config numbers were clipped to 3.
        assert prepared_config == expected_prepared

        not_enough_workers_config = copy.deepcopy(base_config)

        # Max workers is less than the three available ips.
        # The user has probably made an error. Make sure we log a warning.
        not_enough_workers_config["max_workers"] = 0
        not_enough_workers_config["min_workers"] = 0
        with mock.patch(
                "ray.autoscaler._private.local.config.cli_logger.warning"
        ) as warning:
            prepared_config = prepare_config(not_enough_workers_config)
            warning.assert_called_with(
                "The value of `max_workers` supplied (0) is less"
                " than the number of available worker ips (3)."
                " At most 0 Ray worker nodes will connect to the cluster.")
        expected_prepared = yaml.safe_load(EXPECTED_LOCAL_CONFIG_STR)
        # We logged a warning.
        # However, prepare_config does not repair the strange config setting:
        expected_prepared["max_workers"] = 0
        expected_prepared["available_node_types"]["local.cluster.node"][
            "max_workers"] = 0
        expected_prepared["available_node_types"]["local.cluster.node"][
            "min_workers"] = 0
        assert prepared_config == expected_prepared
Example #15
    def testValidateDefaultConfigAWSMultiNodeTypes(self):
        aws_config_path = os.path.join(
            RAY_PATH, "autoscaler/aws/example-multi-node-type.yaml")
        with open(aws_config_path) as f:
            config = yaml.safe_load(f)
        new_config = copy.deepcopy(config)
        # modify it here
        new_config["available_node_types"] = {
            "cpu_4_ondemand":
            new_config["available_node_types"]["cpu_4_ondemand"],
            "cpu_16_spot": new_config["available_node_types"]["cpu_16_spot"],
            "gpu_8_ondemand":
            new_config["available_node_types"]["gpu_8_ondemand"]
        }
        orig_new_config = copy.deepcopy(new_config)
        expected_available_node_types = orig_new_config["available_node_types"]
        expected_available_node_types["cpu_4_ondemand"]["resources"] = {
            "CPU": 4
        }
        expected_available_node_types["cpu_16_spot"]["resources"] = {
            "CPU": 16,
            "memory": 41231686041,
            "Custom1": 1,
            "is_spot": 1
        }
        expected_available_node_types["gpu_8_ondemand"]["resources"] = {
            "CPU": 32,
            "memory": 157195803033,
            "GPU": 4,
            "accelerator_type:V100": 1
        }

        boto3_dict = {
            "InstanceTypes": [{
                "InstanceType": "m4.xlarge",
                "VCpuInfo": {
                    "DefaultVCpus": 4
                },
                "MemoryInfo": {
                    "SizeInMiB": 16384
                }
            }, {
                "InstanceType": "m4.4xlarge",
                "VCpuInfo": {
                    "DefaultVCpus": 16
                },
                "MemoryInfo": {
                    "SizeInMiB": 65536
                }
            }, {
                "InstanceType": "p3.8xlarge",
                "VCpuInfo": {
                    "DefaultVCpus": 32
                },
                "MemoryInfo": {
                    "SizeInMiB": 249856
                },
                "GpuInfo": {
                    "Gpus": [{
                        "Name": "V100",
                        "Count": 4
                    }]
                }
            }]
        }
        describe_instance_types_mock = Mock()
        describe_instance_types_mock.describe_instance_types = MagicMock(
            return_value=boto3_dict)
        client_cache_mock = MagicMock(
            return_value=describe_instance_types_mock)
        with patch.multiple(
                "ray.autoscaler._private.aws.node_provider",
                client_cache=client_cache_mock,
        ):
            new_config = prepare_config(new_config)
            importer = _NODE_PROVIDERS.get(new_config["provider"]["type"])
            provider_cls = importer(new_config["provider"])

            try:
                new_config = \
                    provider_cls.fillout_available_node_types_resources(
                        new_config)
                validate_config(new_config)
                assert expected_available_node_types == new_config[
                    "available_node_types"]
            except Exception:
                self.fail(
                    "Config did not pass multi node types auto fill test!")
Example #16
def teardown_cluster(config_file: str, yes: bool, workers_only: bool,
                     override_cluster_name: Optional[str],
                     keep_min_workers: bool):
    """Destroys all nodes of a Ray cluster described by a config json."""
    config = yaml.safe_load(open(config_file).read())
    if override_cluster_name is not None:
        config["cluster_name"] = override_cluster_name
    config = prepare_config(config)
    validate_config(config)

    cli_logger.confirm(yes, "Destroying cluster.", _abort=True)
    cli_logger.old_confirm("This will destroy your cluster", yes)

    if not workers_only:
        try:
            exec_cluster(config_file,
                         cmd="ray stop",
                         run_env="auto",
                         screen=False,
                         tmux=False,
                         stop=False,
                         start=False,
                         override_cluster_name=override_cluster_name,
                         port_forward=None,
                         with_output=False)
        except Exception as e:
            # todo: add better exception info
            cli_logger.verbose_error("{}", str(e))
            cli_logger.warning(
                "Exception occured when stopping the cluster Ray runtime "
                "(use -v to dump teardown exceptions).")
            cli_logger.warning(
                "Ignoring the exception and "
                "attempting to shut down the cluster nodes anyway.")

            cli_logger.old_exception(
                logger, "Ignoring error attempting a clean shutdown.")

    provider = _get_node_provider(config["provider"], config["cluster_name"])
    try:

        def remaining_nodes():
            workers = provider.non_terminated_nodes(
                {TAG_RAY_NODE_KIND: NODE_KIND_WORKER})

            if keep_min_workers:
                min_workers = config.get("min_workers", 0)

                cli_logger.print(
                    "{} random worker nodes will not be shut down. " +
                    cf.dimmed("(due to {})"), cf.bold(min_workers),
                    cf.bold("--keep-min-workers"))
                cli_logger.old_info(logger,
                                    "teardown_cluster: Keeping {} nodes...",
                                    min_workers)

                workers = random.sample(workers, len(workers) - min_workers)

            # todo: it's weird to kill the head node but not all workers
            if workers_only:
                cli_logger.print(
                    "The head node will not be shut down. " +
                    cf.dimmed("(due to {})"), cf.bold("--workers-only"))

                return workers

            head = provider.non_terminated_nodes(
                {TAG_RAY_NODE_KIND: NODE_KIND_HEAD})

            return head + workers

        def run_docker_stop(node, container_name):
            try:
                updater = NodeUpdaterThread(
                    node_id=node,
                    provider_config=config["provider"],
                    provider=provider,
                    auth_config=config["auth"],
                    cluster_name=config["cluster_name"],
                    file_mounts=config["file_mounts"],
                    initialization_commands=[],
                    setup_commands=[],
                    ray_start_commands=[],
                    runtime_hash="",
                    file_mounts_contents_hash="",
                    is_head_node=False,
                    docker_config=config.get("docker"))
                _exec(updater,
                      f"docker stop {container_name}",
                      False,
                      False,
                      run_env="host")
            except Exception:
                cli_logger.warning(f"Docker stop failed on {node}")
                cli_logger.old_warning(logger, f"Docker stop failed on {node}")

        # Loop here to check that both the head and worker nodes are
        # actually gone.
        A = remaining_nodes()

        container_name = config.get("docker", {}).get("container_name")
        if container_name:
            for node in A:
                run_docker_stop(node, container_name)

        with LogTimer("teardown_cluster: done."):
            while A:
                cli_logger.old_info(
                    logger, "teardown_cluster: "
                    "Shutting down {} nodes...", len(A))

                provider.terminate_nodes(A)

                cli_logger.print("Requested {} nodes to shut down.",
                                 cf.bold(len(A)),
                                 _tags=dict(interval="1s"))

                time.sleep(
                    POLL_INTERVAL)  # todo: interval should be a variable
                A = remaining_nodes()
                cli_logger.print("{} nodes remaining after {} second(s).",
                                 cf.bold(len(A)), POLL_INTERVAL)
            cli_logger.success("No nodes remaining.")
    finally:
        provider.cleanup()
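
A hypothetical invocation of teardown_cluster, equivalent in spirit to running `ray down my-cluster.yaml -y`. The argument names come from the signature above; the config path is a placeholder.

# Hypothetical invocation (the path is a placeholder).
teardown_cluster(
    config_file="my-cluster.yaml",
    yes=True,                     # skip the interactive confirmation
    workers_only=False,           # also shut down the head node
    override_cluster_name=None,
    keep_min_workers=False)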
Example #17
def _bootstrap_config(config: Dict[str, Any],
                      no_config_cache: bool = False) -> Dict[str, Any]:
    config = prepare_config(config)

    hasher = hashlib.sha1()
    hasher.update(json.dumps([config], sort_keys=True).encode("utf-8"))
    cache_key = os.path.join(tempfile.gettempdir(),
                             "ray-config-{}".format(hasher.hexdigest()))

    if os.path.exists(cache_key) and not no_config_cache:
        config_cache = json.loads(open(cache_key).read())
        if config_cache.get("_version", -1) == CONFIG_CACHE_VERSION:
            # todo: is it fine to re-resolve? It should be; the alternative
            # would be config migrations, which seems overcomplicated given
            # that resolving is relatively cheap.
            try_reload_log_state(config_cache["config"]["provider"],
                                 config_cache.get("provider_log_info"))

            if log_once("_printed_cached_config_warning"):
                cli_logger.verbose_warning(
                    "Loaded cached provider configuration "
                    "from " + cf.bold("{}"), cache_key)
                if cli_logger.verbosity == 0:
                    cli_logger.warning("Loaded cached provider configuration")
                cli_logger.warning(
                    "If you experience issues with "
                    "the cloud provider, try re-running "
                    "the command with {}.", cf.bold("--no-config-cache"))

            return config_cache["config"]
        else:
            cli_logger.warning(
                "Found cached cluster config "
                "but the version " + cf.bold("{}") + " "
                "(expected " + cf.bold("{}") + ") does not match.\n"
                "This is normal if cluster launcher was updated.\n"
                "Config will be re-resolved.",
                config_cache.get("_version", "none"), CONFIG_CACHE_VERSION)

    importer = _NODE_PROVIDERS.get(config["provider"]["type"])
    if not importer:
        raise NotImplementedError("Unsupported provider {}".format(
            config["provider"]))

    provider_cls = importer(config["provider"])

    cli_logger.print("Checking {} environment settings",
                     _PROVIDER_PRETTY_NAMES.get(config["provider"]["type"]))
    try:
        config = provider_cls.fillout_available_node_types_resources(config)
    except Exception as exc:
        if cli_logger.verbosity > 2:
            logger.exception("Failed to autodetect node resources.")
        else:
            cli_logger.warning(
                f"Failed to autodetect node resources: {str(exc)}. "
                "You can see full stack trace with higher verbosity.")

    # NOTE: if `resources` field is missing, validate_config for providers
    # other than AWS and Kubernetes will fail (the schema error will ask the
    # user to manually fill the resources) as we currently support autofilling
    # resources for AWS and Kubernetes only.
    validate_config(config)
    resolved_config = provider_cls.bootstrap_config(config)

    if not no_config_cache:
        with open(cache_key, "w") as f:
            config_cache = {
                "_version": CONFIG_CACHE_VERSION,
                "provider_log_info": try_get_log_state(config["provider"]),
                "config": resolved_config
            }
            f.write(json.dumps(config_cache))
    return resolved_config
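
A sketch of how a caller might drive `_bootstrap_config` directly. It is a private helper of the cluster launcher, so this is illustrative only; the YAML path below is hypothetical.

# Illustrative only: _bootstrap_config is a private helper; the path is
# hypothetical.
import yaml

with open("my-cluster.yaml") as f:
    config = yaml.safe_load(f)

# Skip the on-disk cache so the provider config is always re-resolved.
resolved = _bootstrap_config(config, no_config_cache=True)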
Example #18
    def testValidateDefaultConfigAWSMultiNodeTypes(self):
        aws_config_path = os.path.join(
            RAY_PATH, "autoscaler/aws/example-multi-node-type.yaml")
        with open(aws_config_path) as f:
            config = yaml.safe_load(f)
        new_config = copy.deepcopy(config)
        # modify it here
        new_config["available_node_types"] = {
            "cpu_4_ondemand": new_config["available_node_types"][
                "cpu_4_ondemand"],
            "cpu_16_spot": new_config["available_node_types"]["cpu_16_spot"],
            "gpu_8_ondemand": new_config["available_node_types"][
                "gpu_8_ondemand"]
        }
        orig_new_config = copy.deepcopy(new_config)
        expected_available_node_types = orig_new_config["available_node_types"]
        expected_available_node_types["cpu_4_ondemand"]["resources"] = {
            "CPU": 4
        }
        expected_available_node_types["cpu_16_spot"]["resources"] = {
            "CPU": 16,
            "Custom1": 1,
            "is_spot": 1
        }
        expected_available_node_types["gpu_8_ondemand"]["resources"] = {
            "CPU": 32,
            "GPU": 4,
            "accelerator_type:V100": 1
        }

        boto3_dict = {
            "InstanceTypes": [{
                "InstanceType": "m4.xlarge",
                "VCpuInfo": {
                    "DefaultVCpus": 4
                }
            }, {
                "InstanceType": "m4.4xlarge",
                "VCpuInfo": {
                    "DefaultVCpus": 16
                }
            }, {
                "InstanceType": "p3.8xlarge",
                "VCpuInfo": {
                    "DefaultVCpus": 32
                },
                "GpuInfo": {
                    "Gpus": [{
                        "Name": "V100",
                        "Count": 4
                    }]
                }
            }]
        }
        boto3_mock = Mock()
        describe_instance_types_mock = Mock()
        describe_instance_types_mock.describe_instance_types = MagicMock(
            return_value=boto3_dict)
        boto3_mock.client = MagicMock(
            return_value=describe_instance_types_mock)
        with patch.multiple(
                "ray.autoscaler._private.aws.node_provider",
                boto3=boto3_mock,
        ):
            new_config = prepare_config(new_config)

        try:
            validate_config(new_config)
            assert (expected_available_node_types ==
                    new_config["available_node_types"])
        except Exception:
            self.fail("Config did not pass multi node types auto fill test!")