def teardown_cluster(config_file: str, yes: bool,
                     override_cluster_name: Optional[str]) -> None:
    config = yaml.safe_load(open(config_file).read())
    if override_cluster_name is not None:
        config["cluster_name"] = override_cluster_name
    config = _bootstrap_config(config)

    cli_logger.confirm(yes, "Destroying cluster.", _abort=True)

    provider = _get_node_provider(config["provider"], config["cluster_name"])

    A = provider.non_terminated_nodes({TAG_NODE_KIND: NODE_KIND_WORKER})
    with LogTimer("teardown_cluster: done."):
        while A:
            provider.terminate_nodes(A)

            cli_logger.print(
                "Requested {} nodes to shut down.",
                cf.bold(len(A)),
                _tags=dict(interval="1s"))

            time.sleep(POLL_INTERVAL)  # todo: interval should be a variable
            A = provider.non_terminated_nodes(
                {TAG_NODE_KIND: NODE_KIND_WORKER})
            cli_logger.print("{} nodes remaining after {} second(s).",
                             cf.bold(len(A)), POLL_INTERVAL)
        cli_logger.success("No nodes remaining.")
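
# Usage sketch (illustrative only; "cluster.yaml" is a hypothetical config
# path). Terminates all worker nodes and polls until none remain:
#
#     teardown_cluster("cluster.yaml", yes=True, override_cluster_name=None)
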
def _get_worker_nodes(config: Dict[str, Any],
                      override_cluster_name: Optional[str]) -> List[str]:
    """Returns worker node ids for given configuration."""
    # todo: technically could be reused in get_worker_node_ips
    if override_cluster_name is not None:
        config["cluster_name"] = override_cluster_name

    provider = _get_node_provider(config["provider"], config["cluster_name"])
    return provider.non_terminated_nodes({TAG_NODE_KIND: NODE_KIND_WORKER})
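
# Usage sketch (hypothetical values): given an already-bootstrapped config
# dict, this helper returns raw provider node ids, not IPs:
#
#     worker_ids = _get_worker_nodes(config, override_cluster_name=None)
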
def get_worker_node_ips(
        config_file: str,
        override_cluster_name: Optional[str] = None) -> List[str]:
    """Returns worker node IPs for given configuration file."""
    config = yaml.safe_load(open(config_file).read())
    if override_cluster_name is not None:
        config["cluster_name"] = override_cluster_name

    provider = _get_node_provider(config["provider"], config["cluster_name"])
    nodes = provider.non_terminated_nodes({TAG_NODE_KIND: NODE_KIND_WORKER})

    if config.get("provider", {}).get("use_internal_ips", False) is True:
        return [provider.internal_ip(node) for node in nodes]
    else:
        return [provider.external_ip(node) for node in nodes]
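
# Usage sketch ("cluster.yaml" is a hypothetical path): returns internal or
# external IPs depending on the provider's "use_internal_ips" setting:
#
#     ips = get_worker_node_ips("cluster.yaml")
#     print("\n".join(ips))
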
def create_nodes(config: Dict[str, Any],
                 yes: bool,
                 _provider: Optional[NodeProvider] = None,
                 _runner: ModuleType = subprocess) -> None:
    provider = (_provider or _get_node_provider(config["provider"],
                                                config["cluster_name"]))
    worker_filter = {TAG_NODE_KIND: NODE_KIND_WORKER}

    launch_config = copy.deepcopy(config["worker_nodes"])
    launch_hash = hash_launch_conf(launch_config, config["auth"])
    count = int(config["num_workers"])
    cli_logger.print("Launching {} nodes.".format(count))

    node_config = copy.deepcopy(config["worker_nodes"])
    node_tags = {
        TAG_NODE_NAME: "cls-{}-worker".format(config["cluster_name"]),
        TAG_NODE_KIND: NODE_KIND_WORKER,
        TAG_NODE_STATUS: STATUS_UNINITIALIZED,
        TAG_LAUNCH_CONFIG: launch_hash,
    }
    provider.create_node(node_config, node_tags, count)

    # Poll the provider until all requested workers show up as non-terminated.
    start = time.time()
    workers = []
    prev = start
    with cli_logger.group("Fetching the new worker nodes"):
        while True:
            nodes = provider.non_terminated_nodes(worker_filter)
            cur = time.time()
            if cur - prev > 50:
                prev = cur
            if len(nodes) >= count:
                workers = nodes
                break
            time.sleep(POLL_INTERVAL)
    cli_logger.newline()

    # Run file mounts, initialization and setup commands on each new worker.
    updaters = []
    (runtime_hash,
     file_mounts_contents_hash) = hash_runtime_conf(config["file_mounts"],
                                                    None, config)
    for worker in workers:
        updater = NodeUpdaterThread(
            node_id=worker,
            provider_config=config["provider"],
            provider=provider,
            auth_config=config["auth"],
            cluster_name=config["cluster_name"],
            file_mounts=config["file_mounts"],
            initialization_commands=config["initialization_commands"],
            setup_commands=config["worker_setup_commands"],
            process_runner=_runner,
            runtime_hash=runtime_hash,
            is_head_node=False,
            file_mounts_contents_hash=file_mounts_contents_hash,
            rsync_options={
                "rsync_exclude": config.get("rsync_exclude"),
                "rsync_filter": config.get("rsync_filter")
            },
        )
        updater.start()
        updaters.append(updater)

    for up in updaters:
        up.join()
        provider.non_terminated_nodes(worker_filter)
        if up.exitcode != 0:
            cli_logger.abort("Failed to set up worker node.")
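
# Usage sketch (illustrative only; assumes "cluster.yaml" exists and that
# _bootstrap_config fills in provider defaults, as in teardown_cluster above):
#
#     config = yaml.safe_load(open("cluster.yaml").read())
#     config = _bootstrap_config(config)
#     create_nodes(config, yes=True)
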
def rsync(config_file: str,
          source: Optional[str],
          target: Optional[str],
          override_cluster_name: Optional[str],
          down: bool,
          ip_address: Optional[str] = None,
          use_internal_ip: bool = False,
          no_config_cache: bool = False,
          all_nodes: bool = False,
          _runner: ModuleType = subprocess) -> None:
    """Rsyncs files.

    Arguments:
        config_file: path to the cluster yaml
        source: source dir
        target: target dir
        override_cluster_name: set the name of the cluster
        down: whether we're syncing remote -> local
        ip_address (str): Address of node. Raise Exception if both
            ip_address and 'all_nodes' are provided.
        use_internal_ip (bool): Whether the provided ip_address is
            public or private.
        all_nodes: whether to sync worker nodes in addition to the head node
    """
    if bool(source) != bool(target):
        cli_logger.abort(
            "Expected either both a source and a target, or neither.")

    assert bool(source) == bool(target), (
        "Must either provide both or neither source and target.")

    if ip_address and all_nodes:
        cli_logger.abort("Cannot provide both ip_address and 'all_nodes'.")

    config = yaml.safe_load(open(config_file).read())
    if override_cluster_name is not None:
        config["cluster_name"] = override_cluster_name
    config = _bootstrap_config(config, no_config_cache=no_config_cache)

    is_file_mount = False
    if source and target:
        for remote_mount in config.get("file_mounts", {}).keys():
            if (source if down else target).startswith(remote_mount):
                is_file_mount = True
                break

    provider = _get_node_provider(config["provider"], config["cluster_name"])

    def rsync_to_node(node_id, is_head_node):
        updater = NodeUpdaterThread(
            node_id=node_id,
            provider_config=config["provider"],
            provider=provider,
            auth_config=config["auth"],
            cluster_name=config["cluster_name"],
            file_mounts=config["file_mounts"],
            initialization_commands=[],
            setup_commands=[],
            runtime_hash="",
            use_internal_ip=use_internal_ip,
            process_runner=_runner,
            file_mounts_contents_hash="",
            is_head_node=is_head_node,
            rsync_options={
                "rsync_exclude": config.get("rsync_exclude"),
                "rsync_filter": config.get("rsync_filter")
            },
            docker_config=config.get("docker"))
        if down:
            rsync = updater.rsync_down
        else:
            rsync = updater.rsync_up

        if source and target:
            # print rsync progress for single file rsync
            if cli_logger.verbosity > 0:
                cmd_output_util.set_output_redirected(False)
                set_rsync_silent(False)
            rsync(source, target, is_file_mount)
        else:
            updater.sync_file_mounts(rsync)

    nodes = _get_worker_nodes(config, override_cluster_name)
    for node_id in nodes:
        rsync_to_node(node_id, is_head_node=False)
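
# Usage sketch (hypothetical paths): push a local directory to every worker
# node, or pull it back with down=True:
#
#     rsync("cluster.yaml", "./data/", "~/data/", override_cluster_name=None,
#           down=False)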