Example 1
def handle_yaml_error(e):
    # `config_file` is a closure variable; see Example 5 for the
    # enclosing create_or_update_cluster() where it is defined.
    cli_logger.error("Cluster config invalid")
    cli_logger.newline()
    cli_logger.error("Failed to load YAML file " + cf.bold("{}"),
                     config_file)
    cli_logger.newline()
    with cli_logger.verbatim_error_ctx("PyYAML error:"):
        cli_logger.error(e)
    cli_logger.abort()
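Example 1 relies on Ray's internal cli_logger: templates use str.format-style placeholders, and the arguments are passed to the logger separately so that styling from cf (colorful) can be applied to the template itself. A minimal sketch of the same pattern with a hypothetical report_bad_config helper, assuming Ray's private module path (it may move between releases):

from ray.autoscaler._private.cli_logger import cli_logger, cf

def report_bad_config(path, err):
    # The logger fills in the placeholder, so cf.bold decorates the
    # "{}" template rather than the already-formatted value.
    cli_logger.error("Failed to load YAML file " + cf.bold("{}"), path)
    with cli_logger.verbatim_error_ctx("PyYAML error:"):
        cli_logger.error(err)
    cli_logger.abort()  # raises internally, terminating the command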
Example 2
    def run(self):
        if (cmd_output_util.does_allow_interactive()
                and cmd_output_util.is_output_redirected()):
            # this is most probably a bug since the user has no control
            # over these settings
            msg = ("Output was redirected for an interactive command. "
                   "Either do not pass `--redirect-command-output` "
                   "or also pass in `--use-normal-shells`.")
            cli_logger.abort(msg)
            raise click.ClickException(msg)

        try:
            with LogTimer(self.log_prefix +
                          "Applied config {}".format(self.runtime_hash)):
                self.do_update()
        except Exception as e:
            self.provider.set_node_tags(
                self.node_id, {TAG_NODE_STATUS: STATUS_UPDATE_FAILED})
            cli_logger.error("New status: {}", cf.bold(STATUS_UPDATE_FAILED))

            cli_logger.error("!!!")
            if hasattr(e, "cmd"):
                cli_logger.error(
                    "Setup command `{}` failed with exit code {}. stderr:",
                    cf.bold(e.cmd), e.returncode)
            else:
                cli_logger.verbose_error("{}", str(vars(e)))
                # todo: handle this better somehow?
                cli_logger.error("{}", str(e))
            # todo: print stderr here
            cli_logger.error("!!!")
            cli_logger.newline()

            if isinstance(e, click.ClickException):
                # todo: why do we ignore this here
                return
            raise

        tags_to_set = {
            TAG_NODE_STATUS: STATUS_UP_TO_DATE,
            TAG_RUNTIME_CONFIG: self.runtime_hash,
        }
        if self.file_mounts_contents_hash is not None:
            tags_to_set[
                TAG_FILE_MOUNTS_CONTENTS] = self.file_mounts_contents_hash

        self.provider.set_node_tags(self.node_id, tags_to_set)
        cli_logger.labeled_value("New status", STATUS_UP_TO_DATE)

        self.exitcode = 0
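run() brackets the update with node-status tags: the node is marked STATUS_UPDATE_FAILED before the exception propagates, and STATUS_UP_TO_DATE (plus the runtime hash) only once do_update() succeeds. A condensed sketch of that lifecycle, using stand-in constants and a hypothetical apply_update helper rather than the real tag definitions:

TAG_NODE_STATUS = "node-status"          # stand-in tag key
STATUS_UPDATE_FAILED = "update-failed"   # stand-in status values
STATUS_UP_TO_DATE = "up-to-date"

def apply_update(provider, node_id, do_update):
    try:
        do_update()
    except Exception:
        # Tag first, then re-raise, so whoever polls node tags can
        # detect the failed update.
        provider.set_node_tags(node_id,
                               {TAG_NODE_STATUS: STATUS_UPDATE_FAILED})
        raise
    provider.set_node_tags(node_id, {TAG_NODE_STATUS: STATUS_UP_TO_DATE})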
Example 3
def handle_boto_error(exc, msg, *args, **kwargs):
    error_code = None
    error_info = None
    # todo: not sure if these exceptions always have response
    if hasattr(exc, "response"):
        error_info = exc.response.get("Error", None)
    if error_info is not None:
        error_code = error_info.get("Code", None)

    generic_message_args = [
        "{}\n"
        "Error code: {}",
        msg.format(*args, **kwargs),
        cf.bold(error_code)
    ]

    # apparently
    # ExpiredTokenException
    # ExpiredToken
    # RequestExpired
    # are all the same pretty much
    credentials_expiration_codes = [
        "ExpiredTokenException", "ExpiredToken", "RequestExpired"
    ]

    if error_code in credentials_expiration_codes:
        # "An error occurred (ExpiredToken) when calling the
        # GetInstanceProfile operation: The security token
        # included in the request is expired"

        # "An error occurred (RequestExpired) when calling the
        # DescribeKeyPairs operation: Request has expired."

        token_command = (
            "aws sts get-session-token "
            "--serial-number arn:aws:iam::" + cf.underlined("ROOT_ACCOUNT_ID")
            + ":mfa/" + cf.underlined("AWS_USERNAME") + " --token-code " +
            cf.underlined("TWO_FACTOR_AUTH_CODE"))

        secret_key_var = (
            "export AWS_SECRET_ACCESS_KEY=" + cf.underlined("REPLACE_ME") +
            " # found at Credentials.SecretAccessKey")
        session_token_var = (
            "export AWS_SESSION_TOKEN=" + cf.underlined("REPLACE_ME") +
            " # found at Credentials.SessionToken")
        access_key_id_var = (
            "export AWS_ACCESS_KEY_ID=" + cf.underlined("REPLACE_ME") +
            " # found at Credentials.AccessKeyId")

        # fixme: replace with a Github URL that points
        # to our repo
        aws_session_script_url = ("https://gist.github.com/maximsmol/"
                                  "a0284e1d97b25d417bd9ae02e5f450cf")

        cli_logger.verbose_error(*generic_message_args)
        cli_logger.verbose(vars(exc))

        cli_logger.panic("Your AWS session has expired.")
        cli_logger.newline()
        cli_logger.panic("You can request a new one using")
        cli_logger.panic(cf.bold(token_command))
        cli_logger.panic("then expose it to Ray by setting")
        cli_logger.panic(cf.bold(secret_key_var))
        cli_logger.panic(cf.bold(session_token_var))
        cli_logger.panic(cf.bold(access_key_id_var))
        cli_logger.newline()
        cli_logger.panic("You can find a script that automates this at:")
        cli_logger.panic(cf.underlined(aws_session_script_url))
        # Do not re-raise the exception here because it looks awful
        # and we already print all the info in verbose
        cli_logger.abort()

    # todo: any other errors that we should catch separately?

    cli_logger.panic(*generic_message_args)
    cli_logger.newline()
    with cli_logger.verbatim_error_ctx("Boto3 error:"):
        cli_logger.verbose("{}", str(vars(exc)))
        cli_logger.panic("{}", str(exc))
    cli_logger.abort()
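botocore's ClientError carries a response dict shaped like {"Error": {"Code": ..., "Message": ...}}, which is exactly what the hasattr/get chain at the top of handle_boto_error probes. The same probe as a small self-contained helper (function names are illustrative):

def boto_error_code(exc):
    # Not every boto-related exception has `response`, hence the guards.
    error_info = getattr(exc, "response", {}) or {}
    return (error_info.get("Error") or {}).get("Code")

def credentials_expired(exc):
    # Per the comment in handle_boto_error, these three codes are
    # effectively interchangeable expiry signals.
    return boto_error_code(exc) in (
        "ExpiredTokenException", "ExpiredToken", "RequestExpired")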
Example 4
def create_nodes(config: Dict[str, Any],
                 yes: bool,
                 _provider: Optional[NodeProvider] = None,
                 _runner: ModuleType = subprocess) -> None:
    provider = (_provider or _get_node_provider(config["provider"],
                                                config["cluster_name"]))

    worker_filter = {TAG_NODE_KIND: NODE_KIND_WORKER}
    launch_config = copy.deepcopy(config["worker_nodes"])
    launch_hash = hash_launch_conf(launch_config, config["auth"])
    count = int(config["num_workers"])
    cli_logger.print("Launching {} nodes.", count)
    node_config = copy.deepcopy(config["worker_nodes"])
    node_tags = {
        TAG_NODE_NAME: "cls-{}-worker".format(config["cluster_name"]),
        TAG_NODE_KIND: NODE_KIND_WORKER,
        TAG_NODE_STATUS: STATUS_UNINITIALIZED,
        TAG_LAUNCH_CONFIG: launch_hash,
    }
    provider.create_node(node_config, node_tags, count)
    start = time.time()
    workers = []
    prev = start
    with cli_logger.group("Fetching the new worker nodes"):
        while True:
            nodes = provider.non_terminated_nodes(worker_filter)
            cur = time.time()
            if cur - prev > 50:
                # Note: this branch only resets the timer and currently
                # has no visible effect.
                prev = cur
            if len(nodes) >= count:
                workers = nodes
                break
            time.sleep(POLL_INTERVAL)
    cli_logger.newline()
    updaters = []
    (runtime_hash,
     file_mounts_contents_hash) = hash_runtime_conf(config["file_mounts"],
                                                    None, config)
    for worker in workers:
        updater = NodeUpdaterThread(
            node_id=worker,
            provider_config=config["provider"],
            provider=provider,
            auth_config=config["auth"],
            cluster_name=config["cluster_name"],
            file_mounts=config["file_mounts"],
            initialization_commands=config["initialization_commands"],
            setup_commands=config["worker_setup_commands"],
            process_runner=_runner,
            runtime_hash=runtime_hash,
            is_head_node=False,
            file_mounts_contents_hash=file_mounts_contents_hash,
            rsync_options={
                "rsync_exclude": config.get("rsync_exclude"),
                "rsync_filter": config.get("rsync_filter")
            },
        )
        updater.start()
        updaters.append(updater)
    for up in updaters:
        up.join()
        provider.non_terminated_nodes(worker_filter)
        if up.exitcode != 0:
            cli_logger.abort("Fail to setup worker node. ")
Example 5
def create_or_update_cluster(
    config_file: str,
    yes: bool,
    override_num_workers: Optional[int],
    override_cluster_name: Optional[str] = None,
    no_config_cache: bool = False,
    redirect_command_output: Optional[bool] = False,
    use_login_shells: bool = True,
):
    set_using_login_shells(use_login_shells)
    if not use_login_shells:
        cmd_output_util.set_allow_interactive(False)
    if redirect_command_output is None:
        # Do not redirect by default.
        cmd_output_util.set_output_redirected(False)
    else:
        cmd_output_util.set_output_redirected(redirect_command_output)

    def handle_yaml_error(e):
        cli_logger.error("Cluster config invalid")
        cli_logger.newline()
        cli_logger.error("Failed to load YAML file " + cf.bold("{}"),
                         config_file)
        cli_logger.newline()
        with cli_logger.verbatim_error_ctx("PyYAML error:"):
            cli_logger.error(e)
        cli_logger.abort()

    try:
        with open(config_file) as f:
            config = yaml.safe_load(f)
    except FileNotFoundError:
        cli_logger.abort(
            "Provided cluster configuration file ({}) does not exist",
            cf.bold(config_file))
        raise
    except yaml.parser.ParserError as e:
        handle_yaml_error(e)
        raise
    except yaml.scanner.ScannerError as e:
        handle_yaml_error(e)
        raise
    global_event_system.execute_callback(CreateClusterEvent.up_started,
                                         {"cluster_config": config})
    importer = _NODE_PROVIDERS.get(config["provider"]["type"])
    if not importer:
        cli_logger.abort(
            "Unknown provider type " + cf.bold("{}") + "\n"
            "Available providers are: {}", config["provider"]["type"],
            cli_logger.render_list([
                k for k in _NODE_PROVIDERS.keys()
                if _NODE_PROVIDERS[k] is not None
            ]))
        raise NotImplementedError("Unsupported provider {}".format(
            config["provider"]))

    printed_overrides = False

    def handle_cli_override(key, override):
        if override is not None:
            if key in config:
                nonlocal printed_overrides
                printed_overrides = True
                cli_logger.warning(
                    "`{}` override provided on the command line.\n"
                    "  Using " + cf.bold("{}") +
                    cf.dimmed(" [configuration file has " + cf.bold("{}") +
                              "]"), key, override, config[key])
            config[key] = override

    handle_cli_override("num_workers", override_num_workers)
    handle_cli_override("cluster_name", override_cluster_name)
    if printed_overrides:
        cli_logger.newline()

    cli_logger.labeled_value("Cluster", config["cluster_name"])

    cli_logger.newline()
    config = _bootstrap_config(config, no_config_cache=no_config_cache)

    try_logging_config(config)
    create_nodes(config, yes)
    return config
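handle_cli_override is the general recipe for reconciling command-line flags with a config file: only flags that were actually passed (non-None) are applied, and a warning is shown only when the flag shadows a value already present in the file. Stripped of the logging, the merge amounts to the following (merge_override is a hypothetical name):

from typing import Any, Dict, Optional

def merge_override(config: Dict[str, Any], key: str,
                   override: Optional[Any]) -> bool:
    """Apply a CLI override; return True if it shadowed a file value."""
    if override is None:
        return False  # flag not passed; keep the file's value
    shadowed = key in config
    config[key] = override
    return shadowed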