Example #1
    def terminate_node(self, node_id):
        node = self._get_cached_node(node_id)
        if self.cache_stopped_nodes:
            if node.spot_instance_request_id:
                cli_logger.print(
                    "Terminating instance {} " +
                    cf.dimmed("(cannot stop spot instances, only terminate)"),
                    node_id)  # todo: show node name?
                node.terminate()
            else:
                cli_logger.print("Stopping instance {} " + cf.dimmed(
                    "(to terminate instead, "
                    "set `cache_stopped_nodes: False` "
                    "under `provider` in the cluster configuration)"),
                                 node_id)  # todo: show node name?
                node.stop()
        else:
            node.terminate()

        # TODO (Alex): We are leaking the tag cache here. Naively, we would
        # want to just remove the cache entry here, but terminating can be
        # asynchronous or error out, which would result in a use-after-free
        # error. If this leak becomes bad, we can garbage collect the tag
        # cache when the node cache is updated.
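
The stop-vs-terminate decision above reduces to two flags. Below is a minimal, runnable sketch of that logic; `PlanNode` and `plan_termination` are illustrative stand-ins, not the real boto3 objects or Ray APIs.

# Hedged sketch of the decision in terminate_node above.
from dataclasses import dataclass

@dataclass
class PlanNode:
    node_id: str
    spot_instance_request_id: str = ""  # empty for on-demand nodes

def plan_termination(node: PlanNode, cache_stopped_nodes: bool) -> str:
    if not cache_stopped_nodes:
        return "terminate"
    if node.spot_instance_request_id:
        # Spot instances cannot be stopped, only terminated.
        return "terminate"
    return "stop"

assert plan_termination(PlanNode("i-1", "sir-123"), True) == "terminate"
assert plan_termination(PlanNode("i-2"), True) == "stop"
assert plan_termination(PlanNode("i-2"), False) == "terminate"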
Example #2
        def do_sync(remote_path, local_path, allow_non_existing_paths=False):
            if allow_non_existing_paths and not os.path.exists(local_path):
                cli_logger.print("sync: {} does not exist. Skipping.",
                                 local_path)
                # Ignore missing source files. In the future we should support
                # rsync's --delete-missing-args option to delete files that
                # have been removed.
                return

            assert os.path.exists(local_path), local_path

            if os.path.isdir(local_path):
                if not local_path.endswith("/"):
                    local_path += "/"
                if not remote_path.endswith("/"):
                    remote_path += "/"

            with LogTimer(self.log_prefix +
                          "Synced {} to {}".format(local_path, remote_path)):
                is_docker = (self.docker_config
                             and self.docker_config["container_name"] != "")
                if not is_docker:
                    # The DockerCommandRunner handles this internally.
                    self.cmd_runner.run("mkdir -p {}".format(
                        os.path.dirname(remote_path)),
                                        run_env="host")
                sync_cmd(local_path,
                         remote_path,
                         docker_mount_if_possible=True)

                if remote_path not in nolog_paths:
                    # todo: timed here?
                    cli_logger.print("{} from {}", cf.bold(remote_path),
                                     cf.bold(local_path))
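
The trailing-slash handling above matters because rsync treats `src/` as "sync the directory's contents" while `src` would nest the directory inside the destination. A small sketch of just that normalization, with a quick self-check (names here are illustrative):

# Hedged sketch of the path normalization in do_sync above.
import os
import tempfile

def normalize_dir_paths(local_path, remote_path):
    # rsync semantics: "src/" mirrors contents, "src" nests the dir.
    if os.path.isdir(local_path):
        if not local_path.endswith("/"):
            local_path += "/"
        if not remote_path.endswith("/"):
            remote_path += "/"
    return local_path, remote_path

with tempfile.TemporaryDirectory() as d:
    assert normalize_dir_paths(d, "/remote/dest") == (d + "/", "/remote/dest/")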
Example #3
    def terminate_nodes(self, node_ids):
        if not node_ids:
            return
        if self.cache_stopped_nodes:
            spot_ids = []
            on_demand_ids = []

            for node_id in node_ids:
                if self._get_cached_node(node_id).spot_instance_request_id:
                    spot_ids += [node_id]
                else:
                    on_demand_ids += [node_id]

            if on_demand_ids:
                # todo: show node names?
                cli_logger.print(
                    "Stopping instances {} " + cf.dimmed(
                        "(to terminate instead, "
                        "set `cache_stopped_nodes: False` "
                        "under `provider` in the cluster configuration)"),
                    cli_logger.render_list(on_demand_ids))

                self.ec2.meta.client.stop_instances(InstanceIds=on_demand_ids)
            if spot_ids:
                cli_logger.print(
                    "Terminating instances {} " +
                    cf.dimmed("(cannot stop spot instances, only terminate)"),
                    cli_logger.render_list(spot_ids))

                self.ec2.meta.client.terminate_instances(InstanceIds=spot_ids)
        else:
            self.ec2.meta.client.terminate_instances(InstanceIds=node_ids)
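
Partitioning the ids first lets each group go out as one batched EC2 call instead of one call per node. A standalone sketch of the split, with `is_spot` standing in for the cached-node lookup:

# Hedged sketch of the spot/on-demand split in terminate_nodes above.
from typing import Callable, List, Tuple

def partition_node_ids(
        node_ids: List[str],
        is_spot: Callable[[str], bool]) -> Tuple[List[str], List[str]]:
    spot_ids: List[str] = []
    on_demand_ids: List[str] = []
    for node_id in node_ids:
        (spot_ids if is_spot(node_id) else on_demand_ids).append(node_id)
    return spot_ids, on_demand_ids

spot, on_demand = partition_node_ids(["i-1", "i-2", "i-3"],
                                     lambda n: n == "i-2")
assert spot == ["i-2"] and on_demand == ["i-1", "i-3"]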
Example #4
def teardown_cluster(config_file: str, yes: bool,
                     override_cluster_name: Optional[str]) -> None:
    with open(config_file) as f:
        config = yaml.safe_load(f)
    if override_cluster_name is not None:
        config["cluster_name"] = override_cluster_name

    config = _bootstrap_config(config)

    cli_logger.confirm(yes, "Destroying cluster.", _abort=True)

    provider = _get_node_provider(config["provider"], config["cluster_name"])

    remaining = provider.non_terminated_nodes(
        {TAG_NODE_KIND: NODE_KIND_WORKER})
    with LogTimer("teardown_cluster: done."):
        while remaining:
            provider.terminate_nodes(remaining)

            cli_logger.print("Requested {} nodes to shut down.",
                             cf.bold(len(remaining)),
                             _tags=dict(interval="1s"))

            time.sleep(POLL_INTERVAL)  # todo: make the interval configurable
            remaining = provider.non_terminated_nodes(
                {TAG_NODE_KIND: NODE_KIND_WORKER})
            cli_logger.print("{} nodes remaining after {} second(s).",
                             cf.bold(len(remaining)), POLL_INTERVAL)
        cli_logger.success("No nodes remaining.")
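
The teardown loop is a plain poll-until-empty pattern: request termination, sleep, re-list, repeat. Stripped of logging it looks like the sketch below; `list_nodes` and `terminate` are hypothetical stand-ins for the provider calls.

# Hedged sketch of the polling loop in teardown_cluster above.
import time

def drain_nodes(list_nodes, terminate, poll_interval=5.0):
    remaining = list_nodes()
    while remaining:
        terminate(remaining)
        time.sleep(poll_interval)
        remaining = list_nodes()

# Toy usage: a fake provider whose node list shrinks on each poll.
pending = [["i-1", "i-2"], ["i-1"], []]
drain_nodes(lambda: pending.pop(0), lambda ids: None, poll_interval=0)
assert pending == []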
Example #5
    def wait_ready(self, deadline):
        with cli_logger.group("Waiting for SSH to become available",
                              _numbered=("[]", 1, NUM_SETUP_STEPS)):
            with LogTimer(self.log_prefix + "Got remote shell"):

                cli_logger.print("Running `{}` as a test.", cf.bold("uptime"))
                first_conn_refused_time = None
                while time.time() < deadline and \
                        not self.provider.is_terminated(self.node_id):
                    try:
                        # Run outside of the container
                        self.cmd_runner.run("uptime",
                                            timeout=5,
                                            run_env="host")
                        cli_logger.success("Success.")
                        return True
                    except ProcessRunnerError as e:
                        first_conn_refused_time = \
                            cmd_output_util.handle_ssh_fails(
                                e, first_conn_refused_time,
                                retry_interval=READY_CHECK_INTERVAL)
                        time.sleep(READY_CHECK_INTERVAL)
                    except Exception as e:
                        # TODO(maximsmol): we should not be ignoring
                        # exceptions if they get filtered properly
                        # (new style log + non-interactive shells)
                        #
                        # however threading this configuration state
                        # is a pain and I'm leaving it for later

                        retry_str = "(" + str(e) + ")"
                        if hasattr(e, "cmd"):
                            if isinstance(e.cmd, str):
                                cmd_ = e.cmd
                            elif isinstance(e.cmd, list):
                                cmd_ = " ".join(e.cmd)
                            else:
                                logger.debug(f"e.cmd type ({type(e.cmd)}) not "
                                             "list or str.")
                                cmd_ = str(e.cmd)
                            retry_str = "(Exit Status {}): {}".format(
                                e.returncode, cmd_)

                        cli_logger.print(
                            "SSH still not available {}, "
                            "retrying in {} seconds.", cf.dimmed(retry_str),
                            cf.bold(str(READY_CHECK_INTERVAL)))

                        time.sleep(READY_CHECK_INTERVAL)

        assert False, "Unable to connect to node"
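
Stripped of the SSH-specific error handling, `wait_ready` is a retry-until-deadline loop. A minimal sketch, with `probe` and `is_terminated` as hypothetical stand-ins:

# Hedged sketch of the deadline loop in wait_ready above.
import time

def wait_until_ready(probe, is_terminated, deadline, interval=5.0):
    while time.time() < deadline and not is_terminated():
        try:
            probe()  # e.g. run `uptime` over SSH
            return True
        except Exception:
            time.sleep(interval)
    raise AssertionError("Unable to connect to node")

attempts = iter([Exception("refused"), None])
def probe():
    e = next(attempts)
    if e is not None:
        raise e
assert wait_until_ready(probe, lambda: False, time.time() + 60, interval=0)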
Example #6
def handle_ssh_fails(e, first_conn_refused_time, retry_interval):
    """Handle SSH system failures coming from a subprocess.

    Args:
        e: The `ProcessRunnerException` to handle.
        first_conn_refused_time:
            The time (as reported by this function) or None,
            indicating the first time a CONN_REFUSED error was caught.

            After exceeding a patience value, the program will be aborted
            since SSH will likely never recover.
        retry_interval: The interval after which the command will be retried,
                        used here just to inform the user.
    """
    if e.msg_type != "ssh_command_failed":
        return

    if e.special_case == "ssh_conn_refused":
        if first_conn_refused_time is not None and \
            time.time() - first_conn_refused_time > \
                CONN_REFUSED_PATIENCE:
            cli_logger.error(
                "SSH connection was being refused "
                "for {} seconds. Head node assumed "
                "unreachable.", cf.bold(str(CONN_REFUSED_PATIENCE)))
            cli_logger.abort("Check the node's firewall settings "
                             "and the cloud network configuration.")

        cli_logger.warning("SSH connection was refused.")
        cli_logger.warning("This might mean that the SSH daemon is "
                           "still setting up, or that "
                           "the host is inaccessable (e.g. due to "
                           "a firewall).")

        return time.time()

    if e.special_case in ["ssh_timeout", "ssh_conn_refused"]:
        cli_logger.print("SSH still not available, "
                         "retrying in {} seconds.",
                         cf.bold(str(retry_interval)))
    else:
        raise e

    return first_conn_refused_time
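
The `first_conn_refused_time` plumbing implements a patience window: remember when refusals started and abort once they have continued for too long. The core of that pattern, without the logging:

# Hedged sketch of the patience window in handle_ssh_fails above.
import time

CONN_REFUSED_PATIENCE_S = 30.0

def note_refusal(first_refused_at):
    now = time.time()
    if (first_refused_at is not None
            and now - first_refused_at > CONN_REFUSED_PATIENCE_S):
        raise RuntimeError("refused for too long; node assumed unreachable")
    # Preserve the first timestamp so the window measures total time.
    return first_refused_at if first_refused_at is not None else now

t = note_refusal(None)
assert note_refusal(t) == t  # later refusals keep the original start time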
Example #7
    def _wait_for_ip(self, deadline):
        # If we already have the IP, skip the waiting message.
        ip = self._get_node_ip()
        if ip is not None:
            cli_logger.labeled_value("Fetched IP", ip)
            return ip

        interval = 10
        with cli_logger.group("Waiting for IP"):
            while time.time() < deadline and \
                    not self.provider.is_terminated(self.node_id):
                ip = self._get_node_ip()
                if ip is not None:
                    cli_logger.labeled_value("Received", ip)
                    return ip
                cli_logger.print("Not yet available, retrying in {} seconds",
                                 cf.bold(str(interval)))
                time.sleep(interval)

        return None
Example #8
    def do_update(self):
        self.provider.set_node_tags(self.node_id,
                                    {TAG_NODE_STATUS: STATUS_WAITING_FOR_SSH})
        cli_logger.labeled_value("New status", STATUS_WAITING_FOR_SSH)

        deadline = time.time() + AUTOSCALER_NODE_START_WAIT_S
        self.wait_ready(deadline)
        global_event_system.execute_callback(
            CreateClusterEvent.ssh_control_acquired)

        node_tags = self.provider.node_tags(self.node_id)
        logger.debug("Node tags: {}".format(str(node_tags)))

        if node_tags.get(TAG_RUNTIME_CONFIG) == self.runtime_hash:
            # When resuming from a stopped instance the runtime_hash may be the
            # same, but the container will not be started.
            init_required = self.cmd_runner.run_init(
                as_head=self.is_head_node,
                file_mounts=self.file_mounts,
                sync_run_yet=False)
            if init_required:
                node_tags[TAG_RUNTIME_CONFIG] += "-invalidate"
                # This ensures that `setup_commands` are not removed
                self.restart_only = False

        if self.restart_only:
            self.setup_commands = []

        # runtime_hash will only change whenever the user restarts
        # or updates their cluster with `get_or_create_head_node`
        if node_tags.get(TAG_RUNTIME_CONFIG) == self.runtime_hash and (
                not self.file_mounts_contents_hash
                or node_tags.get(TAG_FILE_MOUNTS_CONTENTS)
                == self.file_mounts_contents_hash):
            # todo: we lie in the confirmation message since
            # full setup might be cancelled here
            cli_logger.print(
                "Configuration already up to date, "
                "skipping file mounts, initalization and setup commands.",
                _numbered=("[]", "2-6", NUM_SETUP_STEPS))

        else:
            cli_logger.print("Updating cluster configuration.",
                             _tags=dict(hash=self.runtime_hash))

            self.provider.set_node_tags(
                self.node_id, {TAG_NODE_STATUS: STATUS_SYNCING_FILES})
            cli_logger.labeled_value("New status", STATUS_SYNCING_FILES)
            self.sync_file_mounts(self.rsync_up,
                                  step_numbers=(1, NUM_SETUP_STEPS))

            # Only run setup commands if runtime_hash has changed because
            # we don't want to run setup_commands every time the head node
            # file_mounts folders have changed.
            if node_tags.get(TAG_RUNTIME_CONFIG) != self.runtime_hash:
                # Run init commands
                self.provider.set_node_tags(
                    self.node_id, {TAG_NODE_STATUS: STATUS_SETTING_UP})
                cli_logger.labeled_value("New status", STATUS_SETTING_UP)

                if self.initialization_commands:
                    with cli_logger.group("Running initialization commands",
                                          _numbered=("[]", 4,
                                                     NUM_SETUP_STEPS)):
                        global_event_system.execute_callback(
                            CreateClusterEvent.run_initialization_cmd)
                        with LogTimer(self.log_prefix +
                                      "Initialization commands",
                                      show_status=True):
                            for cmd in self.initialization_commands:
                                global_event_system.execute_callback(
                                    CreateClusterEvent.run_initialization_cmd,
                                    {"command": cmd})
                                try:
                                    # Overriding the existing SSHOptions class
                                    # with a new SSHOptions class that uses
                                    # this ssh_private_key as its only __init__
                                    # argument.
                                    # Run outside docker.
                                    self.cmd_runner.run(
                                        cmd,
                                        ssh_options_override_ssh_key=self.
                                        auth_config.get("ssh_private_key"),
                                        run_env="host")
                                except ProcessRunnerError as e:
                                    if e.msg_type == "ssh_command_failed":
                                        cli_logger.error("Failed.")
                                        cli_logger.error(
                                            "See above for stderr.")

                                    raise click.ClickException(
                                        "Initialization command failed."
                                    ) from None
                else:
                    cli_logger.print("No initialization commands to run.",
                                     _numbered=("[]", 4, NUM_SETUP_STEPS))
                with cli_logger.group(
                        "Initalizing command runner",
                        # todo: fix command numbering
                        _numbered=("[]", 5, NUM_SETUP_STEPS)):
                    self.cmd_runner.run_init(as_head=self.is_head_node,
                                             file_mounts=self.file_mounts,
                                             sync_run_yet=True)
                if self.setup_commands:
                    with cli_logger.group(
                            "Running setup commands",
                            # todo: fix command numbering
                            _numbered=("[]", 6, NUM_SETUP_STEPS)):
                        global_event_system.execute_callback(
                            CreateClusterEvent.run_setup_cmd)
                        with LogTimer(self.log_prefix + "Setup commands",
                                      show_status=True):

                            total = len(self.setup_commands)
                            for i, cmd in enumerate(self.setup_commands):
                                global_event_system.execute_callback(
                                    CreateClusterEvent.run_setup_cmd,
                                    {"command": cmd})
                                if cli_logger.verbosity == 0 and len(cmd) > 30:
                                    cmd_to_print = cf.bold(cmd[:30]) + "..."
                                else:
                                    cmd_to_print = cf.bold(cmd)

                                cli_logger.print("{}",
                                                 cmd_to_print,
                                                 _numbered=("()", i, total))
                                try:
                                    # Runs in the container if docker is in use
                                    self.cmd_runner.run(cmd, run_env="auto")
                                except ProcessRunnerError as e:
                                    if e.msg_type == "ssh_command_failed":
                                        cli_logger.error("Failed.")
                                        cli_logger.error(
                                            "See above for stderr.")

                                    raise click.ClickException(
                                        "Setup command failed.")
                else:
                    cli_logger.print("No setup commands to run.",
                                     _numbered=("[]", 6, NUM_SETUP_STEPS))
Example #9
    def sync_file_mounts(self, sync_cmd, step_numbers=(0, 2)):
        # step_numbers is (# of previous steps, total steps)
        previous_steps, total_steps = step_numbers

        nolog_paths = []
        if cli_logger.verbosity == 0:
            nolog_paths = [
                "~/ray_bootstrap_key.pem", "~/ray_bootstrap_config.yaml"
            ]

        def do_sync(remote_path, local_path, allow_non_existing_paths=False):
            if allow_non_existing_paths and not os.path.exists(local_path):
                cli_logger.print("sync: {} does not exist. Skipping.",
                                 local_path)
                # Ignore missing source files. In the future we should support
                # rsync's --delete-missing-args option to delete files that
                # have been removed.
                return

            assert os.path.exists(local_path), local_path

            if os.path.isdir(local_path):
                if not local_path.endswith("/"):
                    local_path += "/"
                if not remote_path.endswith("/"):
                    remote_path += "/"

            with LogTimer(self.log_prefix +
                          "Synced {} to {}".format(local_path, remote_path)):
                is_docker = (self.docker_config
                             and self.docker_config["container_name"] != "")
                if not is_docker:
                    # The DockerCommandRunner handles this internally.
                    self.cmd_runner.run("mkdir -p {}".format(
                        os.path.dirname(remote_path)),
                                        run_env="host")
                sync_cmd(local_path,
                         remote_path,
                         docker_mount_if_possible=True)

                if remote_path not in nolog_paths:
                    # todo: timed here?
                    cli_logger.print("{} from {}", cf.bold(remote_path),
                                     cf.bold(local_path))

        # Rsync file mounts
        with cli_logger.group("Processing file mounts",
                              _numbered=("[]", previous_steps + 1,
                                         total_steps)):
            for remote_path, local_path in self.file_mounts.items():
                do_sync(remote_path, local_path)
            previous_steps += 1

        if self.cluster_synced_files:
            with cli_logger.group("Processing worker file mounts",
                                  _numbered=("[]", previous_steps + 1,
                                             total_steps)):
                cli_logger.print("synced files: {}",
                                 str(self.cluster_synced_files))
                for path in self.cluster_synced_files:
                    do_sync(path, path, allow_non_existing_paths=True)
                previous_steps += 1
        else:
            cli_logger.print("No worker file mounts to sync",
                             _numbered=("[]", previous_steps + 1, total_steps))
Example #10
    def _create_node(self, node_config, tags, count):
        created_nodes_dict = {}

        tags = to_aws_format(tags)
        conf = node_config.copy()

        tag_pairs = [{
            "Key": TAG_CLUSTER_NAME,
            "Value": self.cluster_name,
        }]
        for k, v in tags.items():
            tag_pairs.append({
                "Key": k,
                "Value": v,
            })
        tag_specs = [{
            "ResourceType": "instance",
            "Tags": tag_pairs,
        }]
        user_tag_specs = conf.get("TagSpecifications", [])
        # Allow users to add tags and override values of existing
        # tags with their own. This only applies to the resource type
        # "instance". All other resource types are appended to the list of
        # tag specs.
        for user_tag_spec in user_tag_specs:
            if user_tag_spec["ResourceType"] == "instance":
                for user_tag in user_tag_spec["Tags"]:
                    exists = False
                    for tag in tag_specs[0]["Tags"]:
                        if user_tag["Key"] == tag["Key"]:
                            exists = True
                            tag["Value"] = user_tag["Value"]
                            break
                    if not exists:
                        tag_specs[0]["Tags"] += [user_tag]
            else:
                tag_specs += [user_tag_spec]

        # SubnetIds is not a real config key: we must resolve to a
        # single SubnetId before invoking the AWS API.
        subnet_ids = conf.pop("SubnetIds")

        for attempt in range(1, BOTO_CREATE_MAX_RETRIES + 1):
            try:
                subnet_id = subnet_ids[self.subnet_idx % len(subnet_ids)]

                self.subnet_idx += 1
                conf.update({
                    "MinCount": 1,
                    "MaxCount": count,
                    "SubnetId": subnet_id,
                    "TagSpecifications": tag_specs
                })
                created = self.ec2_fail_fast.create_instances(**conf)
                created_nodes_dict = {n.id: n for n in created}

                # todo: timed?
                # todo: handle plurality?
                with cli_logger.group("Launched {} nodes",
                                      count,
                                      _tags=dict(subnet_id=subnet_id)):
                    for instance in created:
                        # NOTE(maximsmol): This is needed for mocking
                        # boto3 for tests. This is likely a bug in moto
                        # but AWS docs don't seem to say.
                        # You can patch moto/ec2/responses/instances.py
                        # to fix this (add <stateReason> to EC2_RUN_INSTANCES)

                        # The correct value is technically
                        # {"code": "0", "Message": "pending"}
                        state_reason = instance.state_reason or {
                            "Message": "pending"
                        }

                        cli_logger.print("Launched instance {}",
                                         instance.instance_id,
                                         _tags=dict(
                                             state=instance.state["Name"],
                                             info=state_reason["Message"]))
                break
            except botocore.exceptions.ClientError as exc:
                if attempt == BOTO_CREATE_MAX_RETRIES:
                    # todo: err msg
                    cli_logger.abort(
                        "Failed to launch instances. Max attempts exceeded.")
                    raise exc
                else:
                    cli_logger.print(
                        "create_instances: Attempt failed with {}, retrying.",
                        exc)
        return created_nodes_dict
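
The user-tag merge above is "override by Key for the instance resource type, pass everything else through". A compact sketch of the per-key override; the tag keys here are illustrative:

# Hedged sketch of the instance-tag merge in _create_node above.
def merge_instance_tags(default_tags, user_tags):
    merged = {t["Key"]: t["Value"] for t in default_tags}
    merged.update({t["Key"]: t["Value"] for t in user_tags})
    return [{"Key": k, "Value": v} for k, v in merged.items()]

out = merge_instance_tags(
    [{"Key": "cluster-name", "Value": "default"}],
    [{"Key": "cluster-name", "Value": "mine"},
     {"Key": "team", "Value": "ml"}])
assert {"Key": "cluster-name", "Value": "mine"} in out
assert {"Key": "team", "Value": "ml"} in out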
Example #11
    def create_node(self, node_config, tags, count) -> Dict[str, Any]:
        """Creates instances.

        Returns dict mapping instance id to ec2.Instance object for the created
        instances.
        """
        tags = copy.deepcopy(tags)

        reused_nodes_dict = {}
        # Try to reuse previously stopped nodes with compatible configs
        if self.cache_stopped_nodes:
            # TODO(ekl) this is breaking the abstraction boundary a little by
            # peeking into the tag set.
            filters = [
                {
                    "Name": "instance-state-name",
                    "Values": ["stopped", "stopping"],
                },
                {
                    "Name": "tag:{}".format(TAG_CLUSTER_NAME),
                    "Values": [self.cluster_name],
                },
                {
                    "Name": "tag:{}".format(TAG_NODE_KIND),
                    "Values": [tags[TAG_NODE_KIND]],
                },
                {
                    "Name": "tag:{}".format(TAG_LAUNCH_CONFIG),
                    "Values": [tags[TAG_LAUNCH_CONFIG]],
                },
            ]
            # This tag may not always be present.
            if TAG_USER_NODE_TYPE in tags:
                filters.append({
                    "Name": "tag:{}".format(TAG_USER_NODE_TYPE),
                    "Values": [tags[TAG_USER_NODE_TYPE]],
                })

            reuse_nodes = list(
                self.ec2.instances.filter(Filters=filters))[:count]
            reuse_node_ids = [n.id for n in reuse_nodes]
            reused_nodes_dict = {n.id: n for n in reuse_nodes}
            if reuse_nodes:
                cli_logger.print(
                    # todo: handle plural vs singular?
                    "Reusing nodes {}. "
                    "To disable reuse, set `cache_stopped_nodes: False` "
                    "under `provider` in the cluster configuration.",
                    cli_logger.render_list(reuse_node_ids))

                # todo: timed?
                with cli_logger.group("Stopping instances to reuse"):
                    for node in reuse_nodes:
                        self.tag_cache[node.id] = from_aws_format(
                            {x["Key"]: x["Value"]
                             for x in node.tags})
                        if node.state["Name"] == "stopping":
                            cli_logger.print("Waiting for instance {} to stop",
                                             node.id)
                            node.wait_until_stopped()

                self.ec2.meta.client.start_instances(
                    InstanceIds=reuse_node_ids)
                for node_id in reuse_node_ids:
                    self.set_node_tags(node_id, tags)
                count -= len(reuse_node_ids)

        created_nodes_dict = {}
        if count:
            created_nodes_dict = self._create_node(node_config, tags, count)

        all_created_nodes = reused_nodes_dict
        all_created_nodes.update(created_nodes_dict)
        return all_created_nodes
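
Building the reuse filters is plain data construction, independent of boto3, which makes it easy to test on its own. A sketch with illustrative tag keys (the real keys are the `TAG_*` constants above):

# Hedged sketch of the stopped-node filter list in create_node above.
def stopped_node_filters(cluster_name, node_kind, launch_hash,
                         user_node_type=None):
    filters = [
        {"Name": "instance-state-name", "Values": ["stopped", "stopping"]},
        {"Name": "tag:cluster-name", "Values": [cluster_name]},
        {"Name": "tag:node-kind", "Values": [node_kind]},
        {"Name": "tag:launch-config", "Values": [launch_hash]},
    ]
    if user_node_type is not None:  # this tag may not always be present
        filters.append(
            {"Name": "tag:user-node-type", "Values": [user_node_type]})
    return filters

assert len(stopped_node_filters("c", "worker", "abc")) == 4
assert len(stopped_node_filters("c", "worker", "abc", "m5.large")) == 5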
Example #12
def create_nodes(config: Dict[str, Any],
                 yes: bool,
                 _provider: Optional[NodeProvider] = None,
                 _runner: ModuleType = subprocess) -> None:
    provider = (_provider or _get_node_provider(config["provider"],
                                                config["cluster_name"]))

    worker_filter = {TAG_NODE_KIND: NODE_KIND_WORKER}
    launch_config = copy.deepcopy(config["worker_nodes"])
    launch_hash = hash_launch_conf(launch_config, config["auth"])
    count = int(config["num_workers"])
    cli_logger.print("Launching {} nodes.".format(count))
    node_config = copy.deepcopy(config["worker_nodes"])
    node_tags = {
        TAG_NODE_NAME: "cls-{}-worker".format(config["cluster_name"]),
        TAG_NODE_KIND: NODE_KIND_WORKER,
        TAG_NODE_STATUS: STATUS_UNINITIALIZED,
        TAG_LAUNCH_CONFIG: launch_hash,
    }
    provider.create_node(node_config, node_tags, count)
    start = time.time()
    workers = []
    prev = start
    with cli_logger.group("Fetching the new worker node"):
        while True:
            nodes = provider.non_terminated_nodes(worker_filter)
            cur = time.time()
            if cur - prev > 50:
                prev = cur
            if len(nodes) >= count:
                workers = nodes
                break
            time.sleep(POLL_INTERVAL)
    cli_logger.newline()
    updaters = []
    (runtime_hash,
     file_mounts_contents_hash) = hash_runtime_conf(config["file_mounts"],
                                                    None, config)
    for worker in workers:
        updater = NodeUpdaterThread(
            node_id=worker,
            provider_config=config["provider"],
            provider=provider,
            auth_config=config['auth'],
            cluster_name=config['cluster_name'],
            file_mounts=config['file_mounts'],
            initialization_commands=config["initialization_commands"],
            setup_commands=config['worker_setup_commands'],
            process_runner=_runner,
            runtime_hash=runtime_hash,
            is_head_node=False,
            file_mounts_contents_hash=file_mounts_contents_hash,
            rsync_options={
                "rsync_exclude": config.get("rsync_exclude"),
                "rsync_filter": config.get("rsync_filter")
            },
        )
        updater.start()
        updaters.append(updater)
    for up in updaters:
        up.join()
        provider.non_terminated_nodes(worker_filter)
        if up.exitcode != 0:
            cli_logger.abort("Fail to setup worker node. ")
Example #13
def _bootstrap_config(config: Dict[str, Any],
                      no_config_cache: bool = False) -> Dict[str, Any]:
    config = prepare_config(config)

    hasher = hashlib.sha1()
    hasher.update(json.dumps([config], sort_keys=True).encode("utf-8"))
    cache_key = os.path.join(tempfile.gettempdir(),
                             "ray-config-{}".format(hasher.hexdigest()))

    if os.path.exists(cache_key) and not no_config_cache:
        with open(cache_key) as f:
            config_cache = json.load(f)
        if config_cache.get("_version", -1) == CONFIG_CACHE_VERSION:
            # todo: is it fine to re-resolve? afaik it should be.
            # we can have migrations otherwise or something
            # but this seems overcomplicated given that resolving is
            # relatively cheap
            try_reload_log_state(config_cache["config"]["provider"],
                                 config_cache.get("provider_log_info"))

            if log_once("_printed_cached_config_warning"):
                cli_logger.verbose_warning(
                    "Loaded cached provider configuration "
                    "from " + cf.bold("{}"), cache_key)
                if cli_logger.verbosity == 0:
                    cli_logger.warning("Loaded cached provider configuration")
                cli_logger.warning(
                    "If you experience issues with "
                    "the cloud provider, try re-running "
                    "the command with {}.", cf.bold("--no-config-cache"))

            return config_cache["config"]
        else:
            cli_logger.warning(
                "Found cached cluster config "
                "but the version " + cf.bold("{}") + " "
                "(expected " + cf.bold("{}") + ") does not match.\n"
                "This is normal if cluster launcher was updated.\n"
                "Config will be re-resolved.",
                config_cache.get("_version", "none"), CONFIG_CACHE_VERSION)

    importer = _NODE_PROVIDERS.get(config["provider"]["type"])
    if not importer:
        raise NotImplementedError("Unsupported provider {}".format(
            config["provider"]))

    provider_cls = importer(config["provider"])

    cli_logger.print("Checking {} environment settings",
                     _PROVIDER_PRETTY_NAMES.get(config["provider"]["type"]))
    try:
        config = provider_cls.fillout_available_node_types_resources(config)
    except Exception as exc:
        if cli_logger.verbosity > 2:
            logger.exception("Failed to autodetect node resources.")
        else:
            cli_logger.warning(
                f"Failed to autodetect node resources: {str(exc)}. "
                "You can see full stack trace with higher verbosity.")

    # NOTE: if `resources` field is missing, validate_config for providers
    # other than AWS and Kubernetes will fail (the schema error will ask the
    # user to manually fill the resources) as we currently support autofilling
    # resources for AWS and Kubernetes only.
    validate_config(config)
    resolved_config = provider_cls.bootstrap_config(config)

    if not no_config_cache:
        with open(cache_key, "w") as f:
            config_cache = {
                "_version": CONFIG_CACHE_VERSION,
                "provider_log_info": try_get_log_state(config["provider"]),
                "config": resolved_config
            }
            f.write(json.dumps(config_cache))
    return resolved_config
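
The cache key above is derived from the prepared config itself, so any config change yields a fresh key and stale cache files are simply never read again. A self-contained sketch of just that scheme:

# Hedged sketch of the cache-key derivation in _bootstrap_config above.
import hashlib
import json
import os
import tempfile

def config_cache_path(config):
    digest = hashlib.sha1(
        json.dumps([config], sort_keys=True).encode("utf-8")).hexdigest()
    return os.path.join(tempfile.gettempdir(),
                        "ray-config-{}".format(digest))

assert config_cache_path({"a": 1}) == config_cache_path({"a": 1})
assert config_cache_path({"a": 1}) != config_cache_path({"a": 2})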