def run(self):
    update_start_time = time.time()
    if (
        cmd_output_util.does_allow_interactive()
        and cmd_output_util.is_output_redirected()
    ):
        # this is most probably a bug since the user has no control
        # over these settings
        msg = (
            "Output was redirected for an interactive command. "
            "Either do not pass `--redirect-command-output` "
            "or also pass in `--use-normal-shells`."
        )
        cli_logger.abort(msg)

    try:
        with LogTimer(
            self.log_prefix + "Applied config {}".format(self.runtime_hash)
        ):
            self.do_update()
    except Exception as e:
        self.provider.set_node_tags(
            self.node_id, {TAG_RAY_NODE_STATUS: STATUS_UPDATE_FAILED}
        )
        cli_logger.error("New status: {}", cf.bold(STATUS_UPDATE_FAILED))

        cli_logger.error("!!!")
        if hasattr(e, "cmd"):
            cli_logger.error(
                "Setup command `{}` failed with exit code {}. stderr:",
                cf.bold(e.cmd),
                e.returncode,
            )
        else:
            cli_logger.verbose_error("{}", str(vars(e)))
            # todo: handle this better somehow?
            cli_logger.error("{}", str(e))
            # todo: print stderr here
        cli_logger.error("!!!")
        cli_logger.newline()

        if isinstance(e, click.ClickException):
            # todo: why do we ignore this here
            return
        raise

    tags_to_set = {
        TAG_RAY_NODE_STATUS: STATUS_UP_TO_DATE,
        TAG_RAY_RUNTIME_CONFIG: self.runtime_hash,
    }
    if self.file_mounts_contents_hash is not None:
        tags_to_set[TAG_RAY_FILE_MOUNTS_CONTENTS] = self.file_mounts_contents_hash

    self.provider.set_node_tags(self.node_id, tags_to_set)
    cli_logger.labeled_value("New status", STATUS_UP_TO_DATE)

    self.update_time = time.time() - update_start_time
    self.exitcode = 0
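# Usage sketch (illustrative, not part of the original module). run() is the
# body executed by the updater's worker thread: NodeUpdaterThread combines
# this updater with threading.Thread, so a caller typically start()s the
# thread, join()s it, and then inspects `exitcode`, which run() sets to 0 only
# on success. The helper name `wait_for_update` and its error message are
# assumptions added for illustration.
def wait_for_update(updater):
    # Run the node update on the updater's own thread and block until done.
    updater.start()
    updater.join()
    # On failure run() tags the node STATUS_UPDATE_FAILED and re-raises, so a
    # non-zero exitcode means the node never reached STATUS_UP_TO_DATE.
    if updater.exitcode != 0:
        raise RuntimeError(
            "Node update failed for {}".format(updater.node_id))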
def teardown_cluster(config_file: str, yes: bool, workers_only: bool,
                     override_cluster_name: Optional[str],
                     keep_min_workers: bool) -> None:
    """Destroys all nodes of a Ray cluster described by a config yaml."""
    config = yaml.safe_load(open(config_file).read())
    if override_cluster_name is not None:
        config["cluster_name"] = override_cluster_name
    config = _bootstrap_config(config)

    cli_logger.confirm(yes, "Destroying cluster.", _abort=True)

    if not workers_only:
        try:
            exec_cluster(
                config_file,
                cmd="ray stop",
                run_env="auto",
                screen=False,
                tmux=False,
                stop=False,
                start=False,
                override_cluster_name=override_cluster_name,
                port_forward=None,
                with_output=False)
        except Exception as e:
            # todo: add better exception info
            cli_logger.verbose_error("{}", str(e))
            cli_logger.warning(
                "Exception occurred when stopping the cluster Ray runtime "
                "(use -v to dump teardown exceptions).")
            cli_logger.warning(
                "Ignoring the exception and "
                "attempting to shut down the cluster nodes anyway.")

    provider = _get_node_provider(config["provider"], config["cluster_name"])

    def remaining_nodes():
        workers = provider.non_terminated_nodes({
            TAG_RAY_NODE_KIND: NODE_KIND_WORKER
        })

        if keep_min_workers:
            min_workers = config.get("min_workers", 0)
            cli_logger.print(
                "{} random worker nodes will not be shut down. " +
                cf.dimmed("(due to {})"), cf.bold(min_workers),
                cf.bold("--keep-min-workers"))

            workers = random.sample(workers, len(workers) - min_workers)

        # todo: it's weird to kill the head node but not all workers
        if workers_only:
            cli_logger.print(
                "The head node will not be shut down. " +
                cf.dimmed("(due to {})"), cf.bold("--workers-only"))

            return workers

        head = provider.non_terminated_nodes({
            TAG_RAY_NODE_KIND: NODE_KIND_HEAD
        })

        return head + workers

    def run_docker_stop(node, container_name):
        try:
            updater = NodeUpdaterThread(
                node_id=node,
                provider_config=config["provider"],
                provider=provider,
                auth_config=config["auth"],
                cluster_name=config["cluster_name"],
                file_mounts=config["file_mounts"],
                initialization_commands=[],
                setup_commands=[],
                ray_start_commands=[],
                runtime_hash="",
                file_mounts_contents_hash="",
                is_head_node=False,
                docker_config=config.get("docker"))
            _exec(
                updater,
                f"docker stop {container_name}",
                with_output=False,
                run_env="host")
        except Exception:
            cli_logger.warning(f"Docker stop failed on {node}")

    # Loop here to check that both the head and worker nodes are actually
    # really gone
    A = remaining_nodes()

    container_name = config.get("docker", {}).get("container_name")
    if container_name:
        # This is to ensure that the parallel SSH calls below do not mess with
        # the user's terminal.
        output_redir = cmd_output_util.is_output_redirected()
        cmd_output_util.set_output_redirected(True)
        allow_interactive = cmd_output_util.does_allow_interactive()
        cmd_output_util.set_allow_interactive(False)

        with ThreadPoolExecutor(
                max_workers=MAX_PARALLEL_SHUTDOWN_WORKERS) as executor:
            for node in A:
                executor.submit(
                    run_docker_stop, node=node, container_name=container_name)

        cmd_output_util.set_output_redirected(output_redir)
        cmd_output_util.set_allow_interactive(allow_interactive)

    with LogTimer("teardown_cluster: done."):
        while A:
            provider.terminate_nodes(A)

            cli_logger.print(
                "Requested {} nodes to shut down.",
                cf.bold(len(A)),
                _tags=dict(interval="1s"))

            time.sleep(POLL_INTERVAL)  # todo: interval should be a variable
            A = remaining_nodes()
            cli_logger.print("{} nodes remaining after {} second(s).",
                             cf.bold(len(A)), POLL_INTERVAL)
        cli_logger.success("No nodes remaining.")
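# Usage sketch (illustrative, not part of the original module). This is the
# function the `ray down` CLI path ultimately invokes; the snippet below shows
# a direct programmatic call. The config filename is an assumption.
if __name__ == "__main__":
    teardown_cluster(
        config_file="cluster.yaml",    # autoscaler config describing the cluster
        yes=True,                      # skip the interactive confirmation prompt
        workers_only=False,            # also terminate the head node
        override_cluster_name=None,    # keep the cluster_name from the config
        keep_min_workers=False)        # shut down every worker, incl. min_workers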