def rsync_down(self, source, target, docker_mount_if_possible=False):
    options = {}
    options["docker_mount_if_possible"] = docker_mount_if_possible
    options["rsync_exclude"] = self.rsync_options.get("rsync_exclude")
    options["rsync_filter"] = self.rsync_options.get("rsync_filter")
    self.cmd_runner.run_rsync_down(source, target, options=options)
    cli_logger.verbose("`rsync`ed {} (remote) to {} (local)",
                       cf.bold(source), cf.bold(target))
def _configure_iam_role(config):
    if "IamInstanceProfile" in config["head_node"]:
        _set_config_info(head_instance_profile_src="config")
        return config
    _set_config_info(head_instance_profile_src="default")

    profile = _get_instance_profile(DEFAULT_RAY_INSTANCE_PROFILE, config)

    if profile is None:
        cli_logger.verbose(
            "Creating new IAM instance profile {} for use as the default.",
            cf.bold(DEFAULT_RAY_INSTANCE_PROFILE))
        client = _client("iam", config)
        client.create_instance_profile(
            InstanceProfileName=DEFAULT_RAY_INSTANCE_PROFILE)
        profile = _get_instance_profile(DEFAULT_RAY_INSTANCE_PROFILE, config)
        time.sleep(15)  # wait for propagation

    cli_logger.doassert(profile is not None,
                        "Failed to create instance profile.")  # todo: err msg
    assert profile is not None, "Failed to create instance profile"

    if not profile.roles:
        role = _get_role(DEFAULT_RAY_IAM_ROLE, config)
        if role is None:
            cli_logger.verbose(
                "Creating new IAM role {} for "
                "use as the default instance role.",
                cf.bold(DEFAULT_RAY_IAM_ROLE))
            iam = _resource("iam", config)
            iam.create_role(
                RoleName=DEFAULT_RAY_IAM_ROLE,
                AssumeRolePolicyDocument=json.dumps({
                    "Statement": [
                        {
                            "Effect": "Allow",
                            "Principal": {
                                "Service": "ec2.amazonaws.com"
                            },
                            "Action": "sts:AssumeRole",
                        },
                    ],
                }))
            role = _get_role(DEFAULT_RAY_IAM_ROLE, config)

            cli_logger.doassert(role is not None,
                                "Failed to create role.")  # todo: err msg
            assert role is not None, "Failed to create role"

        role.attach_policy(
            PolicyArn="arn:aws:iam::aws:policy/AmazonEC2FullAccess")
        role.attach_policy(
            PolicyArn="arn:aws:iam::aws:policy/AmazonS3FullAccess")

        profile.add_role(RoleName=role.name)
        time.sleep(15)  # wait for propagation

    config["head_node"]["IamInstanceProfile"] = {"Arn": profile.arn}

    return config
def rsync_down(self, source, target, file_mount=False):
    cli_logger.old_info(logger, "{}Syncing {} from {}...", self.log_prefix,
                        source, target)
    options = {}
    options["file_mount"] = file_mount
    self.cmd_runner.run_rsync_down(source, target, options=options)
    cli_logger.verbose("`rsync`ed {} (remote) to {} (local)",
                       cf.bold(source), cf.bold(target))
def _check_if_container_restart_is_needed(
    self, image: str, cleaned_bind_mounts: Dict[str, str]
) -> bool:
    re_init_required = False
    running_image = (
        self.run(
            check_docker_image(self.container_name, self.docker_cmd),
            with_output=True,
            run_env="host",
        )
        .decode("utf-8")
        .strip()
    )
    if running_image != image:
        cli_logger.error(
            "A container with name {} is running image {} instead "
            "of {} (which was provided in the YAML)",
            self.container_name,
            running_image,
            image,
        )
    mounts = (
        self.run(
            check_bind_mounts_cmd(self.container_name, self.docker_cmd),
            with_output=True,
            run_env="host",
        )
        .decode("utf-8")
        .strip()
    )
    try:
        active_mounts = json.loads(mounts)
        active_remote_mounts = {
            mnt["Destination"].strip("/") for mnt in active_mounts
        }
        # Ignore ray bootstrap files.
        requested_remote_mounts = {
            self._docker_expand_user(remote).strip("/")
            for remote in cleaned_bind_mounts.keys()
        }
        unfulfilled_mounts = requested_remote_mounts - active_remote_mounts
        if unfulfilled_mounts:
            re_init_required = True
            cli_logger.warning(
                "This Docker Container is already running. "
                "Restarting the Docker container on "
                "this node to pick up the following file_mounts {}",
                unfulfilled_mounts,
            )
    except json.JSONDecodeError:
        cli_logger.verbose(
            "Unable to check if file_mounts specified in the YAML "
            "differ from those on the running container."
        )
    return re_init_required
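# Illustrative only (the payload below is made up): check_bind_mounts_cmd above
# queries `docker inspect` for the container's mounts, so the JSON being parsed
# is a list of mount records, each carrying a "Destination" key, roughly like:
import json

example_mounts = json.loads(
    '[{"Source": "/tmp/app", "Destination": "/home/ray/app/"}]')
print({m["Destination"].strip("/") for m in example_mounts})  # {'home/ray/app'}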
def rsync_up(self, source, target, docker_mount_if_possible=False):
    cli_logger.old_info(logger, "{}Syncing {} to {}...", self.log_prefix,
                        source, target)
    options = {}
    options["docker_mount_if_possible"] = docker_mount_if_possible
    options["rsync_exclude"] = self.rsync_options.get("rsync_exclude")
    options["rsync_filter"] = self.rsync_options.get("rsync_filter")
    self.cmd_runner.run_rsync_up(source, target, options=options)
    cli_logger.verbose("`rsync`ed {} (local) to {} (remote)",
                       cf.bold(source), cf.bold(target))
def run_rsync_up(self, source, target, options=None):
    self._set_ssh_ip_if_required()
    command = [
        "rsync", "--rsh",
        subprocess.list2cmdline(
            ["ssh"] + self.ssh_options.to_ssh_options_list(timeout=120)),
        "-avz", source, "{}@{}:{}".format(self.ssh_user, self.ssh_ip, target)
    ]
    cli_logger.verbose("Running `{}`", cf.bold(" ".join(command)))
    self._run_helper(command, silent=is_rsync_silent())
def resource_cache(name, region, max_retries=BOTO_MAX_RETRIES, **kwargs):
    cli_logger.verbose("Creating AWS resource `{}` in `{}`", cf.bold(name),
                       cf.bold(region))
    kwargs.setdefault(
        "config",
        Config(retries={"max_attempts": max_retries}),
    )
    return boto3.resource(
        name,
        region,
        **kwargs,
    )
def run_cmd_redirected(cmd,
                       process_runner=subprocess,
                       silent=False,
                       use_login_shells=False):
    """Run a command and optionally redirect output to a file.

    Args:
        cmd (List[str]): Command to run.
        process_runner: Process runner used for executing commands.
        silent: If true, the command output will be silenced completely
            (redirected to /dev/null), unless verbose logging is enabled.
            Use this for running utility commands like rsync.
        use_login_shells: If true, run the command in a login shell.
    """
    if silent and cli_logger.verbosity < 1:
        return _run_and_process_output(
            cmd,
            process_runner=process_runner,
            stdout_file=process_runner.DEVNULL,
            stderr_file=process_runner.DEVNULL,
            use_login_shells=use_login_shells,
        )

    if not is_output_redirected():
        return _run_and_process_output(
            cmd,
            process_runner=process_runner,
            stdout_file=sys.stdout,
            stderr_file=sys.stderr,
            use_login_shells=use_login_shells,
        )
    else:
        tmpfile_path = os.path.join(
            tempfile.gettempdir(),
            "ray-up-{}-{}.txt".format(cmd[0], time.time()))
        with open(
                tmpfile_path,
                mode="w",
                # line buffering
                buffering=1,
        ) as tmp:
            cli_logger.verbose("Command stdout is redirected to {}",
                               cf.bold(tmp.name))

            return _run_and_process_output(
                cmd,
                process_runner=process_runner,
                stdout_file=tmp,
                stderr_file=tmp,
                use_login_shells=use_login_shells,
            )
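# A minimal usage sketch (the echo command is illustrative): with silent=True
# and cli_logger.verbosity at 0, output is discarded to /dev/null; raising the
# verbosity streams it to the terminal or to the temp file instead.
import subprocess

run_cmd_redirected(["echo", "hello"], process_runner=subprocess, silent=True)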
def _create_security_group(config, vpc_id, group_name):
    client = _client("ec2", config)
    client.create_security_group(
        Description="Auto-created security group for Ray workers",
        GroupName=group_name,
        VpcId=vpc_id)
    security_group = _get_security_group(config, vpc_id, group_name)

    cli_logger.doassert(security_group,
                        "Failed to create security group")  # err msg
    assert security_group, "Failed to create security group"

    cli_logger.verbose(
        "Created new security group {}",
        cf.bold(security_group.group_name),
        _tags=dict(id=security_group.id))
    return security_group
def client_cache(name, region, max_retries=BOTO_MAX_RETRIES, **kwargs):
    try:
        # try to re-use a client from the resource cache first
        return resource_cache(name, region, max_retries, **kwargs).meta.client
    except ResourceNotExistsError:
        # fall back for clients without an associated resource
        cli_logger.verbose("Creating AWS client `{}` in `{}`", cf.bold(name),
                           cf.bold(region))
        kwargs.setdefault(
            "config",
            Config(retries={"max_attempts": max_retries}),
        )
        return boto3.client(
            name,
            region,
            **kwargs,
        )
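# Hypothetical usage (service and region names are illustrative): "ec2" has a
# boto3 resource API, so its client is borrowed from the cached resource;
# "sts" has no resource API, so the except branch builds a bare client.
ec2_client = client_cache("ec2", "us-west-2")
sts_client = client_cache("sts", "us-west-2")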
# other formatting.
from ray.autoscaler._private.cli_logger import cli_logger
import colorful as cf

cli_logger.old_style = False
cli_logger.verbosity = 999
cli_logger.detect_colors()

cli_logger.print(
    cf.bold("Bold ") + cf.italic("Italic ") + cf.underlined("Underlined"))
cli_logger.labeled_value("Label", "value")
cli_logger.print("List: {}", cli_logger.render_list([1, 2, 3]))
cli_logger.newline()
cli_logger.very_verbose("Very verbose")
cli_logger.verbose("Verbose")
cli_logger.verbose_warning("Verbose warning")
cli_logger.verbose_error("Verbose error")
cli_logger.print("Info")
cli_logger.success("Success")
cli_logger.warning("Warning")
cli_logger.error("Error")
cli_logger.newline()

try:
    cli_logger.abort("Abort")
except Exception:
    pass

try:
    cli_logger.doassert(False, "Assert")
except Exception:
    pass
def run(
        self,
        cmd,
        timeout=120,
        exit_on_fail=False,
        port_forward=None,
        with_output=False,
        environment_variables: Dict[str, object] = None,
        run_env="auto",  # Unused argument.
        ssh_options_override_ssh_key="",
        shutdown_after_run=False,
        silent=False):
    if shutdown_after_run:
        cmd += "; sudo shutdown -h now"
    if ssh_options_override_ssh_key:
        ssh_options = SSHOptions(ssh_options_override_ssh_key)
    else:
        ssh_options = self.ssh_options

    assert isinstance(
        ssh_options, SSHOptions
    ), "ssh_options must be of type SSHOptions, got {}".format(
        type(ssh_options))

    self._set_ssh_ip_if_required()

    if is_using_login_shells():
        ssh = ["ssh", "-tt"]
    else:
        ssh = ["ssh"]

    if port_forward:
        with cli_logger.group("Forwarding ports"):
            if not isinstance(port_forward, list):
                port_forward = [port_forward]
            for local, remote in port_forward:
                cli_logger.verbose(
                    "Forwarding port {} to port {} on localhost.",
                    cf.bold(local), cf.bold(remote))  # todo: msg
                ssh += ["-L", "{}:localhost:{}".format(remote, local)]

    final_cmd = ssh + ssh_options.to_ssh_options_list(
        timeout=timeout) + ["{}@{}".format(self.ssh_user, self.ssh_ip)]
    if cmd:
        if environment_variables:
            cmd = _with_environment_variables(cmd, environment_variables)
        if is_using_login_shells():
            final_cmd += _with_interactive(cmd)
        else:
            final_cmd += [cmd]
    else:
        # We do this because `-o ControlMaster` causes the `-N` flag to
        # still create an interactive shell in some ssh versions.
        final_cmd.append("while true; do sleep 86400; done")

    cli_logger.verbose("Running `{}`", cf.bold(cmd))
    with cli_logger.indented():
        cli_logger.very_verbose("Full command is `{}`",
                                cf.bold(" ".join(final_cmd)))

    if cli_logger.verbosity > 0:
        with cli_logger.indented():
            return self._run_helper(
                final_cmd, with_output, exit_on_fail, silent=silent)
    else:
        return self._run_helper(
            final_cmd, with_output, exit_on_fail, silent=silent)
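# Illustrative call (assumes `runner` is an instance of the class above; the
# command and ports are made up): each (local, remote) pair in port_forward
# becomes an `ssh -L remote:localhost:local` argument on the final command.
runner.run("python -m http.server 8000", port_forward=[(8000, 8000)])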
def _configure_key_pair(config):
    if "ssh_private_key" in config["auth"]:
        _set_config_info(keypair_src="config")

        # If the key is not configured via the cloudinit
        # UserData, it should be configured via KeyName or
        # else we will risk starting a node that we cannot
        # SSH into:

        if "UserData" not in config["head_node"]:
            cli_logger.doassert(  # todo: verify schema beforehand?
                "KeyName" in config["head_node"],
                "`KeyName` missing for head node.")  # todo: err msg
            assert "KeyName" in config["head_node"]

        if "UserData" not in config["worker_nodes"]:
            cli_logger.doassert(
                "KeyName" in config["worker_nodes"],
                "`KeyName` missing for worker nodes.")  # todo: err msg
            assert "KeyName" in config["worker_nodes"]

        return config

    _set_config_info(keypair_src="default")

    ec2 = _resource("ec2", config)

    # Writing the new ssh key to the filesystem fails if the ~/.ssh
    # directory doesn't already exist.
    os.makedirs(os.path.expanduser("~/.ssh"), exist_ok=True)

    # Try a few times to get or create a good key pair.
    MAX_NUM_KEYS = 30
    for i in range(MAX_NUM_KEYS):

        key_name = config["provider"].get("key_pair", {}).get("key_name")

        key_name, key_path = key_pair(i, config["provider"]["region"],
                                      key_name)
        key = _get_key(key_name, config)

        # Found a good key.
        if key and os.path.exists(key_path):
            break

        # We can safely create a new key.
        if not key and not os.path.exists(key_path):
            cli_logger.verbose(
                "Creating new key pair {} for use as the default.",
                cf.bold(key_name))
            key = ec2.create_key_pair(KeyName=key_name)

            # We need to make sure to _create_ the file with the right
            # permissions. In order to do that we need to change the default
            # os.open behavior to include the mode we want.
            with open(key_path, "w",
                      opener=partial(os.open, mode=0o600)) as f:
                f.write(key.key_material)
            break

    if not key:
        cli_logger.abort(
            "No matching local key file for any of the key pairs in this "
            "account with ids from 0..{}. "
            "Consider deleting some unused key pairs from your account.",
            key_name)

    cli_logger.doassert(
        os.path.exists(key_path), "Private key file " + cf.bold("{}") +
        " not found for " + cf.bold("{}"), key_path, key_name)  # todo: err msg
    assert os.path.exists(key_path), \
        "Private key file {} not found for {}".format(key_path, key_name)

    config["auth"]["ssh_private_key"] = key_path
    config["head_node"]["KeyName"] = key_name
    config["worker_nodes"]["KeyName"] = key_name

    return config
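# Minimal sketch of the opener trick used above (the path is illustrative):
# os.open honors `mode` when the file is created, so the key file is born
# with 0600 permissions instead of being chmod'ed after a racy window.
import os
from functools import partial

with open("/tmp/example_key.pem", "w",
          opener=partial(os.open, mode=0o600)) as f:
    f.write("example secret material")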
def stop(force, verbose, log_style, log_color):
    """Stop Ray processes manually on the local machine."""
    cli_logger.configure(log_style, log_color, verbose)

    # Note that raylet needs to exit before object store, otherwise
    # it cannot exit gracefully.
    is_linux = sys.platform.startswith("linux")
    processes_to_kill = [
        # The first element is the substring to filter.
        # The second element, if True, is to filter ps results by command name
        # (only the first 15 characters of the executable name on Linux);
        # if False, is to filter ps results by command with all its arguments.
        # See STANDARD FORMAT SPECIFIERS section of
        # http://man7.org/linux/man-pages/man1/ps.1.html
        # about comm and args. This can help avoid killing non-ray processes.
        # Format:
        # Keyword to filter, filter by command (True)/filter by args (False)
        ["raylet", True],
        ["plasma_store", True],
        ["gcs_server", True],
        ["monitor.py", False],
        ["redis-server", False],
        ["default_worker.py", False],  # Python worker.
        ["ray::", True],  # Python worker. TODO(mehrdadn): Fix for Windows
        ["io.ray.runtime.runner.worker.DefaultWorker", False],  # Java worker.
        ["log_monitor.py", False],
        ["reporter.py", False],
        ["dashboard.py", False],
        ["new_dashboard/agent.py", False],
        ["ray_process_reaper.py", False],
    ]

    process_infos = []
    for proc in psutil.process_iter(["name", "cmdline"]):
        try:
            process_infos.append((proc, proc.name(), proc.cmdline()))
        except psutil.Error:
            pass

    total_found = 0
    total_stopped = 0
    for keyword, filter_by_cmd in processes_to_kill:
        if filter_by_cmd and is_linux and len(keyword) > 15:
            # getting here is an internal bug, so we do not use cli_logger
            msg = ("The filter string should not be more than {} "
                   "characters. Actual length: {}. Filter: {}").format(
                       15, len(keyword), keyword)
            raise ValueError(msg)

        found = []
        for candidate in process_infos:
            proc, proc_cmd, proc_args = candidate
            corpus = (proc_cmd if filter_by_cmd else
                      subprocess.list2cmdline(proc_args))
            if keyword in corpus:
                found.append(candidate)

        for proc, proc_cmd, proc_args in found:
            total_found += 1

            proc_string = str(subprocess.list2cmdline(proc_args))
            try:
                if force:
                    proc.kill()
                else:
                    # TODO(mehrdadn): On Windows, this is forceful termination.
                    # We don't want CTRL_BREAK_EVENT, because that would
                    # terminate the entire process group. What to do?
                    proc.terminate()

                if force:
                    cli_logger.verbose("Killed `{}` {} ",
                                       cf.bold(proc_string),
                                       cf.dimmed("(via SIGKILL)"))
                else:
                    cli_logger.verbose("Send termination request to `{}` {}",
                                       cf.bold(proc_string),
                                       cf.dimmed("(via SIGTERM)"))

                total_stopped += 1
            except psutil.NoSuchProcess:
                cli_logger.verbose(
                    "Attempted to stop `{}`, but process was already dead.",
                    cf.bold(proc_string))
            except (psutil.Error, OSError) as ex:
                cli_logger.error("Could not terminate `{}` due to {}",
                                 cf.bold(proc_string), str(ex))

    if total_found == 0:
        cli_logger.print("Did not find any active Ray processes.")
    else:
        if total_stopped == total_found:
            cli_logger.success("Stopped all {} Ray processes.", total_stopped)
        else:
            cli_logger.warning(
                "Stopped only {} out of {} Ray processes. "
                "Set `{}` to see more details.", total_stopped, total_found,
                cf.bold("-v"))
            cli_logger.warning("Try running the command again, or use `{}`.",
                               cf.bold("--force"))
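# Why the 15-character cap above: on Linux, a process's `comm` name is
# truncated to 15 characters (TASK_COMM_LEN - 1), so a longer command-name
# filter could never match. A quick illustration, assuming psutil is installed:
import psutil

me = psutil.Process()
print(me.name())  # on Linux this mirrors /proc/<pid>/comm, capped at 15 chars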
def _configure_iam_role(config):
    head_node_type = config["head_node_type"]
    head_node_config = config["available_node_types"][head_node_type][
        "node_config"]
    if "IamInstanceProfile" in head_node_config:
        _set_config_info(head_instance_profile_src="config")
        return config
    _set_config_info(head_instance_profile_src="default")

    instance_profile_name = cwh.resolve_instance_profile_name(
        config["provider"],
        DEFAULT_RAY_INSTANCE_PROFILE,
    )
    profile = _get_instance_profile(instance_profile_name, config)

    if profile is None:
        cli_logger.verbose(
            "Creating new IAM instance profile {} for use as the default.",
            cf.bold(instance_profile_name),
        )
        client = _client("iam", config)
        client.create_instance_profile(
            InstanceProfileName=instance_profile_name)
        profile = _get_instance_profile(instance_profile_name, config)
        time.sleep(15)  # wait for propagation

    cli_logger.doassert(
        profile is not None, "Failed to create instance profile."
    )  # todo: err msg
    assert profile is not None, "Failed to create instance profile"

    if not profile.roles:
        role_name = cwh.resolve_iam_role_name(config["provider"],
                                              DEFAULT_RAY_IAM_ROLE)
        role = _get_role(role_name, config)
        if role is None:
            cli_logger.verbose(
                "Creating new IAM role {} for use as the default instance "
                "role.",
                cf.bold(role_name),
            )
            iam = _resource("iam", config)
            policy_doc = {
                "Statement": [
                    {
                        "Effect": "Allow",
                        "Principal": {"Service": "ec2.amazonaws.com"},
                        "Action": "sts:AssumeRole",
                    },
                ]
            }
            attach_policy_arns = cwh.resolve_policy_arns(
                config["provider"],
                iam,
                [
                    "arn:aws:iam::aws:policy/AmazonEC2FullAccess",
                    "arn:aws:iam::aws:policy/AmazonS3FullAccess",
                ],
            )

            iam.create_role(
                RoleName=role_name,
                AssumeRolePolicyDocument=json.dumps(policy_doc))
            role = _get_role(role_name, config)
            cli_logger.doassert(
                role is not None, "Failed to create role."
            )  # todo: err msg
            assert role is not None, "Failed to create role"

            for policy_arn in attach_policy_arns:
                role.attach_policy(PolicyArn=policy_arn)

        profile.add_role(RoleName=role.name)
        time.sleep(15)  # wait for propagation

    # Add IAM role to "head_node" field so that it is applied only to
    # the head node -- not to workers with the same node type as the head.
    config["head_node"]["IamInstanceProfile"] = {"Arn": profile.arn}

    return config
def run_init(self, *, as_head, file_mounts):
    BOOTSTRAP_MOUNTS = [
        "~/ray_bootstrap_config.yaml", "~/ray_bootstrap_key.pem"
    ]

    image = self.docker_config.get("image")
    image = self.docker_config.get(
        f"{'head' if as_head else 'worker'}_image", image)

    self._check_docker_installed()
    if self.docker_config.get("pull_before_run", True):
        assert image, "Image must be included in config if " + \
            "pull_before_run is specified"
        self.run("docker pull {}".format(image), run_env="host")

    # Bootstrap files cannot be bind mounted because docker opens the
    # underlying inode. When the file is switched, docker becomes outdated.
    cleaned_bind_mounts = file_mounts.copy()
    for mnt in BOOTSTRAP_MOUNTS:
        cleaned_bind_mounts.pop(mnt, None)

    if not self._check_container_status():
        # Get home directory
        image_env = self.ssh_command_runner.run(
            "docker inspect -f '{{json .Config.Env}}' " + image,
            with_output=True).decode().strip()
        home_directory = "/root"
        for env_var in json.loads(image_env):
            if env_var.startswith("HOME="):
                home_directory = env_var.split("HOME=")[1]
                break

        start_command = docker_start_cmds(
            self.ssh_command_runner.ssh_user, image, cleaned_bind_mounts,
            self.container_name,
            self.docker_config.get("run_options", []) +
            self.docker_config.get(
                f"{'head' if as_head else 'worker'}_run_options", []) +
            self._configure_runtime() + self._auto_configure_shm(),
            self.ssh_command_runner.cluster_name, home_directory)
        self.run(start_command, run_env="host")
    else:
        running_image = self.run(
            check_docker_image(self.container_name),
            with_output=True,
            run_env="host").decode("utf-8").strip()
        if running_image != image:
            logger.error(f"A container with name {self.container_name} " +
                         f"is running image {running_image} instead " +
                         f"of {image} (which was provided in the YAML)")
        mounts = self.run(
            check_bind_mounts_cmd(self.container_name),
            with_output=True,
            run_env="host").decode("utf-8").strip()
        try:
            active_mounts = json.loads(mounts)
            active_remote_mounts = [
                mnt["Destination"] for mnt in active_mounts
            ]
            # Ignore ray bootstrap files.
            for remote, local in cleaned_bind_mounts.items():
                remote = self._docker_expand_user(remote)
                if remote not in active_remote_mounts:
                    cli_logger.error(
                        "Please ray stop & restart cluster to "
                        f"allow mount {remote}:{local} to take hold")
        except json.JSONDecodeError:
            cli_logger.verbose(
                "Unable to check if file_mounts specified in the YAML "
                "differ from those on the running container.")

    # Explicitly copy in ray bootstrap files.
    for mount in BOOTSTRAP_MOUNTS:
        if mount in file_mounts:
            self.ssh_command_runner.run(
                "docker cp {src} {container}:{dst}".format(
                    src=os.path.join(
                        self._get_docker_host_mount_location(
                            self.ssh_command_runner.cluster_name), mount),
                    container=self.container_name,
                    dst=self._docker_expand_user(mount)))
    self.initialized = True
def handle_boto_error(exc, msg, *args, **kwargs):
    if cli_logger.old_style:
        # old-style logging doesn't do anything here
        # so we exit early
        return

    error_code = None
    error_info = None
    # todo: not sure if these exceptions always have response
    if hasattr(exc, "response"):
        error_info = exc.response.get("Error", None)
    if error_info is not None:
        error_code = error_info.get("Code", None)

    generic_message_args = [
        "{}\n"
        "Error code: {}",
        msg.format(*args, **kwargs),
        cf.bold(error_code)
    ]

    # apparently
    # ExpiredTokenException
    # ExpiredToken
    # RequestExpired
    # are all the same pretty much
    credentials_expiration_codes = [
        "ExpiredTokenException", "ExpiredToken", "RequestExpired"
    ]

    if error_code in credentials_expiration_codes:
        # "An error occurred (ExpiredToken) when calling the
        # GetInstanceProfile operation: The security token
        # included in the request is expired"

        # "An error occurred (RequestExpired) when calling the
        # DescribeKeyPairs operation: Request has expired."

        token_command = (
            "aws sts get-session-token "
            "--serial-number arn:aws:iam::" +
            cf.underlined("ROOT_ACCOUNT_ID") + ":mfa/" +
            cf.underlined("AWS_USERNAME") + " --token-code " +
            cf.underlined("TWO_FACTOR_AUTH_CODE"))

        secret_key_var = (
            "export AWS_SECRET_ACCESS_KEY=" + cf.underlined("REPLACE_ME") +
            "  # found at Credentials.SecretAccessKey")
        session_token_var = (
            "export AWS_SESSION_TOKEN=" + cf.underlined("REPLACE_ME") +
            "  # found at Credentials.SessionToken")
        access_key_id_var = (
            "export AWS_ACCESS_KEY_ID=" + cf.underlined("REPLACE_ME") +
            "  # found at Credentials.AccessKeyId")

        # fixme: replace with a Github URL that points
        # to our repo
        aws_session_script_url = ("https://gist.github.com/maximsmol/"
                                  "a0284e1d97b25d417bd9ae02e5f450cf")

        cli_logger.verbose_error(*generic_message_args)
        cli_logger.verbose(vars(exc))

        cli_logger.panic("Your AWS session has expired.")
        cli_logger.newline()
        cli_logger.panic("You can request a new one using")
        cli_logger.panic(cf.bold(token_command))
        cli_logger.panic("then expose it to Ray by setting")
        cli_logger.panic(cf.bold(secret_key_var))
        cli_logger.panic(cf.bold(session_token_var))
        cli_logger.panic(cf.bold(access_key_id_var))
        cli_logger.newline()
        cli_logger.panic("You can find a script that automates this at:")
        cli_logger.panic(cf.underlined(aws_session_script_url))
        # Do not re-raise the exception here because it looks awful
        # and we already print all the info in verbose
        cli_logger.abort()

    # todo: any other errors that we should catch separately?

    cli_logger.panic(*generic_message_args)
    cli_logger.newline()
    with cli_logger.verbatim_error_ctx("Boto3 error:"):
        cli_logger.verbose("{}", str(vars(exc)))
        cli_logger.panic("{}", str(exc))
    cli_logger.abort()
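# Hedged usage sketch (the wrapper and its message are illustrative, not from
# the source): callers catch botocore's ClientError and hand it to
# handle_boto_error, which pretty-prints the failure and aborts.
from botocore.exceptions import ClientError

def describe_instances_or_abort(ec2_client):
    try:
        return ec2_client.describe_instances()
    except ClientError as exc:
        handle_boto_error(exc, "Failed to describe EC2 instances.")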