def run(self):
    update_start_time = time.time()
    if (
        cmd_output_util.does_allow_interactive()
        and cmd_output_util.is_output_redirected()
    ):
        # this is most probably a bug since the user has no control
        # over these settings
        msg = (
            "Output was redirected for an interactive command. "
            "Either do not pass `--redirect-command-output` "
            "or also pass in `--use-normal-shells`."
        )
        cli_logger.abort(msg)

    try:
        with LogTimer(
            self.log_prefix + "Applied config {}".format(self.runtime_hash)
        ):
            self.do_update()
    except Exception as e:
        self.provider.set_node_tags(
            self.node_id, {TAG_RAY_NODE_STATUS: STATUS_UPDATE_FAILED}
        )
        cli_logger.error("New status: {}", cf.bold(STATUS_UPDATE_FAILED))

        cli_logger.error("!!!")
        if hasattr(e, "cmd"):
            cli_logger.error(
                "Setup command `{}` failed with exit code {}. stderr:",
                cf.bold(e.cmd),
                e.returncode,
            )
        else:
            cli_logger.verbose_error("{}", str(vars(e)))
            # todo: handle this better somehow?
            cli_logger.error("{}", str(e))
            # todo: print stderr here
        cli_logger.error("!!!")
        cli_logger.newline()

        if isinstance(e, click.ClickException):
            # todo: why do we ignore this here
            return
        raise

    tags_to_set = {
        TAG_RAY_NODE_STATUS: STATUS_UP_TO_DATE,
        TAG_RAY_RUNTIME_CONFIG: self.runtime_hash,
    }
    if self.file_mounts_contents_hash is not None:
        tags_to_set[
            TAG_RAY_FILE_MOUNTS_CONTENTS] = self.file_mounts_contents_hash

    self.provider.set_node_tags(self.node_id, tags_to_set)
    cli_logger.labeled_value("New status", STATUS_UP_TO_DATE)

    self.update_time = time.time() - update_start_time
    self.exitcode = 0
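
# Usage sketch (not part of the original code): judging by the name used in
# teardown_cluster() below, NodeUpdaterThread appears to mix this run() logic
# into a threading.Thread, so a caller would start it, wait for it, and then
# inspect the `exitcode` attribute that run() sets to 0 on success. `updater`
# is assumed to be constructed with the same keyword arguments shown in
# run_docker_stop() further below.
updater.start()
updater.join()
if updater.exitcode != 0:
    # run() leaves exitcode unset (None) or nonzero when do_update() failed
    cli_logger.error("Updater for node {} failed.", cf.bold(updater.node_id))
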
def teardown_cluster(config_file: str, yes: bool, workers_only: bool,
                     override_cluster_name: Optional[str],
                     keep_min_workers: bool):
    """Destroys all nodes of a Ray cluster described by a config json."""
    with open(config_file) as f:
        config = yaml.safe_load(f.read())
    if override_cluster_name is not None:
        config["cluster_name"] = override_cluster_name
    config = prepare_config(config)
    validate_config(config)

    cli_logger.confirm(yes, "Destroying cluster.", _abort=True)
    cli_logger.old_confirm("This will destroy your cluster", yes)

    if not workers_only:
        try:
            exec_cluster(
                config_file,
                cmd="ray stop",
                run_env="auto",
                screen=False,
                tmux=False,
                stop=False,
                start=False,
                override_cluster_name=override_cluster_name,
                port_forward=None,
                with_output=False)
        except Exception as e:
            # todo: add better exception info
            cli_logger.verbose_error("{}", str(e))
            cli_logger.warning(
                "Exception occurred when stopping the cluster Ray runtime "
                "(use -v to dump teardown exceptions).")
            cli_logger.warning(
                "Ignoring the exception and "
                "attempting to shut down the cluster nodes anyway.")
            cli_logger.old_exception(
                logger, "Ignoring error attempting a clean shutdown.")

    provider = _get_node_provider(config["provider"], config["cluster_name"])
    try:

        def remaining_nodes():
            workers = provider.non_terminated_nodes(
                {TAG_RAY_NODE_KIND: NODE_KIND_WORKER})

            if keep_min_workers:
                min_workers = config.get("min_workers", 0)
                cli_logger.print(
                    "{} random worker nodes will not be shut down. " +
                    cf.dimmed("(due to {})"), cf.bold(min_workers),
                    cf.bold("--keep-min-workers"))
                cli_logger.old_info(logger,
                                    "teardown_cluster: Keeping {} nodes...",
                                    min_workers)

                workers = random.sample(workers, len(workers) - min_workers)

            # todo: it's weird to kill the head node but not all workers
            if workers_only:
                cli_logger.print(
                    "The head node will not be shut down. " +
                    cf.dimmed("(due to {})"), cf.bold("--workers-only"))

                return workers

            head = provider.non_terminated_nodes(
                {TAG_RAY_NODE_KIND: NODE_KIND_HEAD})

            return head + workers

        def run_docker_stop(node, container_name):
            try:
                updater = NodeUpdaterThread(
                    node_id=node,
                    provider_config=config["provider"],
                    provider=provider,
                    auth_config=config["auth"],
                    cluster_name=config["cluster_name"],
                    file_mounts=config["file_mounts"],
                    initialization_commands=[],
                    setup_commands=[],
                    ray_start_commands=[],
                    runtime_hash="",
                    file_mounts_contents_hash="",
                    is_head_node=False,
                    docker_config=config.get("docker"))
                _exec(
                    updater,
                    f"docker stop {container_name}",
                    False,
                    False,
                    run_env="host")
            except Exception:
                cli_logger.warning(f"Docker stop failed on {node}")
                cli_logger.old_warning(logger, f"Docker stop failed on {node}")

        # Loop here to check that both the head and worker nodes are
        # actually gone
        A = remaining_nodes()

        container_name = config.get("docker", {}).get("container_name")
        if container_name:
            for node in A:
                run_docker_stop(node, container_name)

        with LogTimer("teardown_cluster: done."):
            while A:
                cli_logger.old_info(
                    logger, "teardown_cluster: "
                    "Shutting down {} nodes...", len(A))

                provider.terminate_nodes(A)

                cli_logger.print(
                    "Requested {} nodes to shut down.",
                    cf.bold(len(A)),
                    _tags=dict(interval="1s"))

                time.sleep(POLL_INTERVAL)  # todo: interval should be a variable
                A = remaining_nodes()
                cli_logger.print("{} nodes remaining after {} second(s).",
                                 cf.bold(len(A)), POLL_INTERVAL)
        cli_logger.success("No nodes remaining.")
    finally:
        provider.cleanup()
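
# Usage sketch (not part of the original code): a direct call to
# teardown_cluster() with flag values mirroring the CLI options referenced
# above. "cluster.yaml" is a hypothetical config path.
teardown_cluster(
    config_file="cluster.yaml",   # hypothetical path to the cluster config
    yes=True,                     # skip the interactive confirmation
    workers_only=False,           # shut down the head node as well
    override_cluster_name=None,   # keep the name from the config file
    keep_min_workers=False)       # terminate all workers, ignoring min_workers
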
from ray.autoscaler._private.cli_logger import cli_logger
import colorful as cf

cli_logger.old_style = False
cli_logger.verbosity = 999
cli_logger.detect_colors()

cli_logger.print(
    cf.bold("Bold ") + cf.italic("Italic ") + cf.underlined("Underlined"))
cli_logger.labeled_value("Label", "value")
cli_logger.print("List: {}", cli_logger.render_list([1, 2, 3]))
cli_logger.newline()
cli_logger.very_verbose("Very verbose")
cli_logger.verbose("Verbose")
cli_logger.verbose_warning("Verbose warning")
cli_logger.verbose_error("Verbose error")
cli_logger.print("Info")
cli_logger.success("Success")
cli_logger.warning("Warning")
cli_logger.error("Error")
cli_logger.newline()
try:
    cli_logger.abort("Abort")
except Exception:
    pass
try:
    cli_logger.doassert(False, "Assert")
except Exception:
    pass
cli_logger.newline()
cli_logger.confirm(True, "example")
def handle_boto_error(exc, msg, *args, **kwargs):
    if cli_logger.old_style:
        # old-style logging doesn't do anything here
        # so we exit early
        return

    error_code = None
    error_info = None
    # todo: not sure if these exceptions always have response
    if hasattr(exc, "response"):
        error_info = exc.response.get("Error", None)
    if error_info is not None:
        error_code = error_info.get("Code", None)

    generic_message_args = [
        "{}\n"
        "Error code: {}",
        msg.format(*args, **kwargs),
        cf.bold(error_code)
    ]

    # apparently
    # ExpiredTokenException
    # ExpiredToken
    # RequestExpired
    # are all the same pretty much
    credentials_expiration_codes = [
        "ExpiredTokenException", "ExpiredToken", "RequestExpired"
    ]

    if error_code in credentials_expiration_codes:
        # "An error occurred (ExpiredToken) when calling the
        # GetInstanceProfile operation: The security token
        # included in the request is expired"

        # "An error occurred (RequestExpired) when calling the
        # DescribeKeyPairs operation: Request has expired."

        token_command = (
            "aws sts get-session-token "
            "--serial-number arn:aws:iam::" +
            cf.underlined("ROOT_ACCOUNT_ID") + ":mfa/" +
            cf.underlined("AWS_USERNAME") + " --token-code " +
            cf.underlined("TWO_FACTOR_AUTH_CODE"))

        secret_key_var = (
            "export AWS_SECRET_ACCESS_KEY=" + cf.underlined("REPLACE_ME") +
            "  # found at Credentials.SecretAccessKey")
        session_token_var = (
            "export AWS_SESSION_TOKEN=" + cf.underlined("REPLACE_ME") +
            "  # found at Credentials.SessionToken")
        access_key_id_var = (
            "export AWS_ACCESS_KEY_ID=" + cf.underlined("REPLACE_ME") +
            "  # found at Credentials.AccessKeyId")

        # fixme: replace with a Github URL that points
        # to our repo
        aws_session_script_url = ("https://gist.github.com/maximsmol/"
                                  "a0284e1d97b25d417bd9ae02e5f450cf")

        cli_logger.verbose_error(*generic_message_args)
        cli_logger.verbose("{}", str(vars(exc)))

        cli_logger.panic("Your AWS session has expired.")
        cli_logger.newline()
        cli_logger.panic("You can request a new one using")
        cli_logger.panic(cf.bold(token_command))
        cli_logger.panic("then expose it to Ray by setting")
        cli_logger.panic(cf.bold(secret_key_var))
        cli_logger.panic(cf.bold(session_token_var))
        cli_logger.panic(cf.bold(access_key_id_var))
        cli_logger.newline()
        cli_logger.panic("You can find a script that automates this at:")
        cli_logger.panic(cf.underlined(aws_session_script_url))

        # Do not re-raise the exception here because it looks awful
        # and we already print all the info in verbose
        cli_logger.abort()

    # todo: any other errors that we should catch separately?

    cli_logger.panic(*generic_message_args)
    cli_logger.newline()
    with cli_logger.verbatim_error_ctx("Boto3 error:"):
        cli_logger.verbose("{}", str(vars(exc)))
        cli_logger.panic("{}", str(exc))
    cli_logger.abort()
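
# Usage sketch (not part of the original code): handle_boto_error() is meant
# to wrap a failing boto3 call. It aborts with a friendly message in
# new-style logging and returns early in old-style mode, so callers re-raise
# afterwards. The boto3/botocore names are standard API; this helper function
# is a hypothetical example.
import botocore.exceptions

def describe_key_pairs(ec2_client):
    try:
        return ec2_client.describe_key_pairs()
    except botocore.exceptions.ClientError as exc:
        handle_boto_error(exc, "Failed to fetch key pairs from {}.",
                          cf.bold("EC2"))
        # only reached in old-style mode, where handle_boto_error returns
        raise exc
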