Example #1
def teardown_cluster(config_file: str, yes: bool,
                     override_cluster_name: Optional[str]) -> None:
    with open(config_file) as f:
        config = yaml.safe_load(f)
    if override_cluster_name is not None:
        config["cluster_name"] = override_cluster_name

    config = _bootstrap_config(config)

    cli_logger.confirm(yes, "Destroying cluster.", _abort=True)

    provider = _get_node_provider(config["provider"], config["cluster_name"])

    nodes = provider.non_terminated_nodes({TAG_NODE_KIND: NODE_KIND_WORKER})
    with LogTimer("teardown_cluster: done."):
        while nodes:
            provider.terminate_nodes(nodes)

            cli_logger.print("Requested {} nodes to shut down.",
                             cf.bold(len(nodes)),
                             _tags=dict(interval="1s"))

            time.sleep(POLL_INTERVAL)
            nodes = provider.non_terminated_nodes(
                {TAG_NODE_KIND: NODE_KIND_WORKER})
            cli_logger.print("{} nodes remaining after {} second(s).",
                             cf.bold(len(nodes)), POLL_INTERVAL)
        cli_logger.success("No nodes remaining.")
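A minimal invocation sketch, assuming a cluster YAML at ./cluster.yaml; the path and flag values are illustrative, not part of the example above:

# Hypothetical call: destroy every worker described by ./cluster.yaml,
# skipping the interactive confirmation prompt.
teardown_cluster("./cluster.yaml", yes=True, override_cluster_name=None)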
Example #2
def _get_worker_nodes(config: Dict[str, Any],
                      override_cluster_name: Optional[str]) -> List[str]:
    """Returns worker node ids for given configuration."""
    # todo: technically could be reused in get_worker_node_ips
    if override_cluster_name is not None:
        config["cluster_name"] = override_cluster_name

    provider = _get_node_provider(config["provider"], config["cluster_name"])
    return provider.non_terminated_nodes({TAG_NODE_KIND: NODE_KIND_WORKER})
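A brief usage sketch, assuming config has already been loaded and bootstrapped as in Example #1:

# Hypothetical call site: collect the ids of all live worker nodes.
worker_ids = _get_worker_nodes(config, override_cluster_name=None)
for node_id in worker_ids:
    print(node_id)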
Example #3
def get_worker_node_ips(
        config_file: str,
        override_cluster_name: Optional[str] = None) -> List[str]:
    """Returns worker node IPs for given configuration file."""

    with open(config_file) as f:
        config = yaml.safe_load(f)
    if override_cluster_name is not None:
        config["cluster_name"] = override_cluster_name

    provider = _get_node_provider(config["provider"], config["cluster_name"])
    nodes = provider.non_terminated_nodes({TAG_NODE_KIND: NODE_KIND_WORKER})

    if config.get("provider", {}).get("use_internal_ips", False) is True:
        return [provider.internal_ip(node) for node in nodes]
    else:
        return [provider.external_ip(node) for node in nodes]
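A possible call, again assuming ./cluster.yaml; whether internal or external addresses come back depends on provider.use_internal_ips in that file:

# Hypothetical: print one worker IP per line.
for ip in get_worker_node_ips("./cluster.yaml"):
    print(ip)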
Example #4
def create_nodes(config: Dict[str, Any],
                 yes: bool,
                 _provider: Optional[NodeProvider] = None,
                 _runner: ModuleType = subprocess) -> None:
    provider = (_provider or _get_node_provider(config["provider"],
                                                config["cluster_name"]))

    worker_filter = {TAG_NODE_KIND: NODE_KIND_WORKER}
    launch_config = copy.deepcopy(config["worker_nodes"])
    launch_hash = hash_launch_conf(launch_config, config["auth"])
    count = int(config["num_workers"])
    cli_logger.print("Launching {} nodes.".format(count))
    node_config = copy.deepcopy(config["worker_nodes"])
    node_tags = {
        TAG_NODE_NAME: "cls-{}-worker".format(config["cluster_name"]),
        TAG_NODE_KIND: NODE_KIND_WORKER,
        TAG_NODE_STATUS: STATUS_UNINITIALIZED,
        TAG_LAUNCH_CONFIG: launch_hash,
    }
    provider.create_node(node_config, node_tags, count)
    start = time.time()
    workers = []
    prev = start
    with cli_logger.group("Fetching new worker nodes"):
        while True:
            nodes = provider.non_terminated_nodes(worker_filter)
            cur = time.time()
            # Log progress at most once every 50 seconds while waiting.
            if cur - prev > 50:
                prev = cur
                cli_logger.print("{} of {} workers ready so far...",
                                 cf.bold(len(nodes)), cf.bold(count))
            if len(nodes) >= count:
                workers = nodes
                break
            time.sleep(POLL_INTERVAL)
    cli_logger.newline()
    updaters = []
    (runtime_hash,
     file_mounts_contents_hash) = hash_runtime_conf(config["file_mounts"],
                                                    None, config)
    for worker in workers:
        updater = NodeUpdaterThread(
            node_id=worker,
            provider_config=config["provider"],
            provider=provider,
            auth_config=config["auth"],
            cluster_name=config["cluster_name"],
            file_mounts=config["file_mounts"],
            initialization_commands=config["initialization_commands"],
            setup_commands=config["worker_setup_commands"],
            process_runner=_runner,
            runtime_hash=runtime_hash,
            is_head_node=False,
            file_mounts_contents_hash=file_mounts_contents_hash,
            rsync_options={
                "rsync_exclude": config.get("rsync_exclude"),
                "rsync_filter": config.get("rsync_filter")
            },
        )
        updater.start()
        updaters.append(updater)
    for up in updaters:
        up.join()
        provider.non_terminated_nodes(worker_filter)
        if up.exitcode != 0:
            cli_logger.abort("Fail to setup worker node. ")
Example #5
def rsync(config_file: str,
          source: Optional[str],
          target: Optional[str],
          override_cluster_name: Optional[str],
          down: bool,
          ip_address: Optional[str] = None,
          use_internal_ip: bool = False,
          no_config_cache: bool = False,
          all_nodes: bool = False,
          _runner: ModuleType = subprocess) -> None:
    """Rsyncs files.

    Arguments:
        config_file: path to the cluster yaml
        source: source dir
        target: target dir
        override_cluster_name: set the name of the cluster
        down: whether we're syncing remote -> local
        ip_address: address of the node to sync with; providing both
            ip_address and all_nodes raises an exception
        use_internal_ip: whether ip_address is a private (internal)
            address rather than a public one
        all_nodes: whether to sync worker nodes in addition to the head node
    """
    if bool(source) != bool(target):
        cli_logger.abort(
            "Expected either both a source and a target, or neither.")

    if ip_address and all_nodes:
        cli_logger.abort("Cannot provide both ip_address and 'all_nodes'.")

    with open(config_file) as f:
        config = yaml.safe_load(f)
    if override_cluster_name is not None:
        config["cluster_name"] = override_cluster_name
    config = _bootstrap_config(config, no_config_cache=no_config_cache)

    is_file_mount = False
    if source and target:
        for remote_mount in config.get("file_mounts", {}).keys():
            if (source if down else target).startswith(remote_mount):
                is_file_mount = True
                break

    provider = _get_node_provider(config["provider"], config["cluster_name"])

    def rsync_to_node(node_id, is_head_node):
        updater = NodeUpdaterThread(node_id=node_id,
                                    provider_config=config["provider"],
                                    provider=provider,
                                    auth_config=config["auth"],
                                    cluster_name=config["cluster_name"],
                                    file_mounts=config["file_mounts"],
                                    initialization_commands=[],
                                    setup_commands=[],
                                    runtime_hash="",
                                    use_internal_ip=use_internal_ip,
                                    process_runner=_runner,
                                    file_mounts_contents_hash="",
                                    is_head_node=is_head_node,
                                    rsync_options={
                                        "rsync_exclude":
                                        config.get("rsync_exclude"),
                                        "rsync_filter":
                                        config.get("rsync_filter")
                                    },
                                    docker_config=config.get("docker"))
        # Use a distinct local name so we don't shadow the rsync() function.
        if down:
            rsync_fn = updater.rsync_down
        else:
            rsync_fn = updater.rsync_up

        if source and target:
            # print rsync progress for single file rsync
            if cli_logger.verbosity > 0:
                cmd_output_util.set_output_redirected(False)
                set_rsync_silent(False)
            rsync_fn(source, target, is_file_mount)
        else:
            updater.sync_file_mounts(rsync_fn)

    nodes = _get_worker_nodes(config, override_cluster_name)

    for node_id in nodes:
        rsync_to_node(node_id, is_head_node=False)
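A hedged usage sketch; the local directory ./app and remote path /home/ubuntu/app are illustrative placeholders:

# Hypothetical: push ./app to /home/ubuntu/app on every worker node;
# down=True would sync in the opposite (remote -> local) direction.
rsync("./cluster.yaml", source="./app", target="/home/ubuntu/app",
      override_cluster_name=None, down=False)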