def get_or_create_head_node(config,
                            config_file,
                            no_restart,
                            restart_only,
                            yes,
                            override_cluster_name,
                            _provider=None,
                            _runner=subprocess):
    """Create the cluster head node, which in turn creates the workers.

    Looks up a running head node, relaunches it when its launch-config hash
    differs from the current configuration (or launches one if none exists),
    then runs a ``NodeUpdaterThread`` against it and prints follow-up commands.

    Args:
        config: Cluster configuration dict. It is deep-copied before being
            modified. Keys read here include ``provider``, ``cluster_name``,
            ``head_node``, ``auth``, ``file_mounts``,
            ``initialization_commands``, ``head_setup_commands`` and
            ``head_start_ray_commands``.
        config_file: Path of the cluster config file; made absolute and used
            in the commands printed at the end.
        no_restart: If true, the Ray start commands are not run.
        restart_only: If true, the setup commands are not run.
        yes: Forwarded to the confirmation prompts.
        override_cluster_name: If truthy, appended as ``--cluster-name=...``
            to the commands printed at the end.
        _provider: Node provider to use instead of constructing one from
            ``config["provider"]``.
        _runner: Passed to ``NodeUpdaterThread`` as ``process_runner``.

    Raises:
        RuntimeError: If the newly created head node is not reported by the
            provider within 50 seconds.
        SystemExit: With code 1 if the updater finishes with a non-zero
            exit code.
    """
    provider = (_provider or _get_node_provider(config["provider"],
                                                config["cluster_name"]))

    config = copy.deepcopy(config)
    config_file = os.path.abspath(config_file)
    # Assigned once the bootstrap config is written; closed in ``finally`` so
    # the temporary file is removed on every exit path.
    remote_config_file = None
    try:
        head_node_tags = {
            TAG_RAY_NODE_KIND: NODE_KIND_HEAD,
        }
        nodes = provider.non_terminated_nodes(head_node_tags)
        head_node = nodes[0] if len(nodes) > 0 else None

        if not head_node:
            cli_logger.confirm(
                yes,
                "No head node found. "
                "Launching a new cluster.",
                _abort=True)
            cli_logger.old_confirm("This will create a new cluster", yes)
        elif not no_restart:
            cli_logger.old_confirm("This will restart cluster services", yes)

        if head_node:
            if restart_only:
                cli_logger.confirm(
                    yes,
                    "Updating cluster configuration and "
                    "restarting the cluster Ray runtime. "
                    "Setup commands will not be run due to `{}`.\n",
                    cf.bold("--restart-only"),
                    _abort=True)
            elif no_restart:
                cli_logger.print(
                    "Cluster Ray runtime will not be restarted due "
                    "to `{}`.", cf.bold("--no-restart"))
                cli_logger.confirm(
                    yes,
                    "Updating cluster configuration and "
                    "running setup commands.",
                    _abort=True)
            else:
                cli_logger.print(
                    "Updating cluster configuration and running full setup.")
                cli_logger.confirm(
                    yes,
                    cf.bold("Cluster Ray runtime will be restarted."),
                    _abort=True)
        cli_logger.newline()

        # TODO(ekl) this logic is duplicated in node_launcher.py (keep in sync)
        head_node_config = copy.deepcopy(config["head_node"])
        if "head_node_type" in config:
            head_node_tags[TAG_RAY_USER_NODE_TYPE] = config["head_node_type"]
            head_node_config.update(config["available_node_types"][config[
                "head_node_type"]]["node_config"])

        launch_hash = hash_launch_conf(head_node_config, config["auth"])
        # Look the current hash up once; it is needed both for the staleness
        # check and for the message shown to the user.
        current_launch_hash = None
        if head_node is not None:
            current_launch_hash = provider.node_tags(head_node).get(
                TAG_RAY_LAUNCH_CONFIG)

        if head_node is None or current_launch_hash != launch_hash:
            with cli_logger.group("Acquiring an up-to-date head node"):
                if head_node is not None:
                    cli_logger.print(
                        "Currently running head node is out-of-date with "
                        "cluster configuration")
                    cli_logger.print("hash is {}, expected {}",
                                     cf.bold(current_launch_hash),
                                     cf.bold(launch_hash))
                    cli_logger.confirm(yes, "Relaunching it.", _abort=True)
                    cli_logger.old_confirm(
                        "Head node config out-of-date. It will be terminated",
                        yes)

                    cli_logger.old_info(
                        logger, "get_or_create_head_node: "
                        "Shutting down outdated head node {}", head_node)

                    provider.terminate_node(head_node)
                    cli_logger.print("Terminated head node {}", head_node)

                cli_logger.old_info(
                    logger,
                    "get_or_create_head_node: Launching new head node...")

                head_node_tags[TAG_RAY_LAUNCH_CONFIG] = launch_hash
                head_node_tags[TAG_RAY_NODE_NAME] = "ray-{}-head".format(
                    config["cluster_name"])
                provider.create_node(head_node_config, head_node_tags, 1)
                cli_logger.print("Launched a new head node")

                # Poll until the provider reports exactly one node carrying
                # the new head tags, giving up after 50 seconds.
                start = time.time()
                head_node = None
                with cli_logger.timed("Fetching the new head node"):
                    while True:
                        if time.time() - start > 50:
                            cli_logger.abort(
                                "Head node fetch timed out.")  # todo: msg
                            raise RuntimeError("Failed to create head node.")
                        nodes = provider.non_terminated_nodes(head_node_tags)
                        if len(nodes) == 1:
                            head_node = nodes[0]
                            break
                        time.sleep(POLL_INTERVAL)
                cli_logger.newline()

        with cli_logger.group(
                "Setting up head node",
                _numbered=("<>", 1, 1),
                # cf.bold(provider.node_tags(head_node)[TAG_RAY_NODE_NAME]),
                _tags=dict()):  # add id, ARN to tags?

            # TODO(ekl) right now we always update the head node even if the
            # hash matches.
            # We could prompt the user for what they want to do here.
            # No need to pass in cluster_sync_files because we use this
            # hash to set up the head node
            (runtime_hash, file_mounts_contents_hash) = hash_runtime_conf(
                config["file_mounts"], None, config)

            cli_logger.old_info(
                logger,
                "get_or_create_head_node: Updating files on head node...")

            # Rewrite the auth config so that the head
            # node can update the workers
            remote_config = copy.deepcopy(config)

            # drop proxy options if they exist, otherwise
            # head node won't be able to connect to workers
            remote_config["auth"].pop("ssh_proxy_command", None)

            if "ssh_private_key" in config["auth"]:
                remote_key_path = "~/ray_bootstrap_key.pem"
                remote_config["auth"]["ssh_private_key"] = remote_key_path

            # Adjust for new file locations: on the head node every mount
            # maps onto itself.
            remote_config["file_mounts"] = {
                remote_path: remote_path
                for remote_path in config["file_mounts"]
            }
            remote_config["no_restart"] = no_restart

            remote_config = provider.prepare_for_head_node(remote_config)

            # Now inject the rewritten config and SSH key into the head node
            remote_config_file = tempfile.NamedTemporaryFile(
                "w", prefix="ray-bootstrap-")
            remote_config_file.write(json.dumps(remote_config))
            remote_config_file.flush()
            config["file_mounts"].update({
                "~/ray_bootstrap_config.yaml": remote_config_file.name
            })

            if "ssh_private_key" in config["auth"]:
                config["file_mounts"].update({
                    remote_key_path: config["auth"]["ssh_private_key"],
                })
            cli_logger.print("Prepared bootstrap config")

            if restart_only:
                setup_commands = []
                ray_start_commands = config["head_start_ray_commands"]
            elif no_restart:
                setup_commands = config["head_setup_commands"]
                ray_start_commands = []
            else:
                setup_commands = config["head_setup_commands"]
                ray_start_commands = config["head_start_ray_commands"]

            if not no_restart:
                warn_about_bad_start_command(ray_start_commands)

            updater = NodeUpdaterThread(
                node_id=head_node,
                provider_config=config["provider"],
                provider=provider,
                auth_config=config["auth"],
                cluster_name=config["cluster_name"],
                file_mounts=config["file_mounts"],
                initialization_commands=config["initialization_commands"],
                setup_commands=setup_commands,
                ray_start_commands=ray_start_commands,
                process_runner=_runner,
                runtime_hash=runtime_hash,
                file_mounts_contents_hash=file_mounts_contents_hash,
                is_head_node=True,
                docker_config=config.get("docker"))
            updater.start()
            updater.join()

            # Refresh the node cache so we see the external ip if available
            provider.non_terminated_nodes(head_node_tags)

            if config.get("provider", {}).get("use_internal_ips",
                                              False) is True:
                head_node_ip = provider.internal_ip(head_node)
            else:
                head_node_ip = provider.external_ip(head_node)

            if updater.exitcode != 0:
                # todo: this does not follow the mockup and is not good enough
                cli_logger.abort("Failed to setup head node.")

                cli_logger.old_error(
                    logger, "get_or_create_head_node: "
                    "Updating {} failed", head_node_ip)
                sys.exit(1)

            cli_logger.old_info(
                logger, "get_or_create_head_node: "
                "Head node up-to-date, IP address is: {}", head_node_ip)

        monitor_str = "tail -n 100 -f /tmp/ray/session_latest/logs/monitor*"
        if override_cluster_name:
            modifiers = " --cluster-name={}".format(
                quote(override_cluster_name))
        else:
            modifiers = ""

        if cli_logger.old_style:
            print("To monitor autoscaling activity, you can run:\n\n"
                  " ray exec {} {}{}\n".format(config_file,
                                               quote(monitor_str), modifiers))
            print("To open a console on the cluster:\n\n"
                  " ray attach {}{}\n".format(config_file, modifiers))

            print("To get a remote shell to the cluster manually, run:\n\n"
                  " {}\n".format(
                      updater.cmd_runner.remote_shell_command_str()))

        cli_logger.newline()
        with cli_logger.group("Useful commands"):
            cli_logger.print("Monitor autoscaling with")
            cli_logger.print(
                cf.bold(" ray exec {}{} {}"), config_file, modifiers,
                quote(monitor_str))

            cli_logger.print("Connect to a terminal on the cluster head:")
            cli_logger.print(
                cf.bold(" ray attach {}{}"), config_file, modifiers)

            remote_shell_str = updater.cmd_runner.remote_shell_command_str()
            cli_logger.print("Get a remote shell to the cluster manually:")
            cli_logger.print(" {}", remote_shell_str.strip())
    finally:
        try:
            if remote_config_file is not None:
                remote_config_file.close()
        finally:
            # Provider cleanup must run even if closing the file fails.
            provider.cleanup()
def _create_node(self, node_config, tags, count):
    """Launch up to ``count`` EC2 instances from ``node_config``.

    Builds the instance tag specification from the cluster name, ``tags``
    and any user-supplied ``TagSpecifications``, then calls
    ``create_instances`` on one of the configured subnets. A
    ``botocore`` ``ClientError`` is retried up to
    ``BOTO_CREATE_MAX_RETRIES`` attempts, moving to the next subnet each
    time; the last failure is re-raised.
    """
    tags = to_aws_format(tags)
    conf = node_config.copy()

    # Delete unsupported keys from the node config
    conf.pop("Resources", None)

    instance_tags = [{
        "Key": TAG_RAY_CLUSTER_NAME,
        "Value": self.cluster_name,
    }]
    instance_tags.extend({
        "Key": key,
        "Value": value,
    } for key, value in tags.items())
    tag_specs = [{
        "ResourceType": "instance",
        "Tags": instance_tags,
    }]

    # Allow users to add tags and override values of existing
    # tags with their own. This only applies to the resource type
    # "instance". All other resource types are appended to the list of
    # tag specs.
    for user_spec in conf.get("TagSpecifications", []):
        if user_spec["ResourceType"] != "instance":
            tag_specs.append(user_spec)
            continue
        for user_tag in user_spec["Tags"]:
            # Only the first tag with a matching key is overridden.
            match = next((tag for tag in instance_tags
                          if tag["Key"] == user_tag["Key"]), None)
            if match is None:
                instance_tags.append(user_tag)
            else:
                match["Value"] = user_tag["Value"]

    # SubnetIds is not a real config key: we must resolve to a
    # single SubnetId before invoking the AWS API.
    subnet_ids = conf.pop("SubnetIds")

    for attempt in range(1, BOTO_CREATE_MAX_RETRIES + 1):
        try:
            subnet_id = subnet_ids[self.subnet_idx % len(subnet_ids)]

            cli_logger.old_info(
                logger, "NodeProvider: calling create_instances "
                "with {} (count={}).", subnet_id, count)
            self.subnet_idx += 1
            conf.update(
                MinCount=1,
                MaxCount=count,
                SubnetId=subnet_id,
                TagSpecifications=tag_specs)
            created = self.ec2_fail_fast.create_instances(**conf)

            # todo: timed?
            # todo: handle plurality?
            with cli_logger.group(
                    "Launched {} nodes",
                    count,
                    _tags=dict(subnet_id=subnet_id)):
                for instance in created:
                    # NOTE(maximsmol): This is needed for mocking
                    # boto3 for tests. This is likely a bug in moto
                    # but AWS docs don't seem to say.
                    # You can patch moto/ec2/responses/instances.py
                    # to fix this (add <stateReason> to EC2_RUN_INSTANCES)
                    # The correct value is technically
                    # {"code": "0", "Message": "pending"}
                    state_reason = instance.state_reason or {
                        "Message": "pending"
                    }
                    instance_id = instance.instance_id
                    state_name = instance.state["Name"]
                    info = state_reason["Message"]

                    cli_logger.print(
                        "Launched instance {}",
                        instance_id,
                        _tags=dict(state=state_name, info=info))
                    cli_logger.old_info(
                        logger, "NodeProvider: Created instance "
                        "[id={}, name={}, info={}]", instance_id, state_name,
                        info)
        except botocore.exceptions.ClientError as exc:
            if attempt != BOTO_CREATE_MAX_RETRIES:
                cli_logger.print(
                    "create_instances: Attempt failed with {}, retrying.",
                    exc)
                cli_logger.old_error(logger, exc)
                continue

            # todo: err msg
            cli_logger.abort(
                "Failed to launch instances. Max attempts exceeded.")
            cli_logger.old_error(
                logger, "create_instances: Max attempts ({}) exceeded.",
                BOTO_CREATE_MAX_RETRIES)
            raise exc
        else:
            break
def stop(force, verbose, log_style, log_color):
    """Stop Ray processes manually on the local machine.

    Scans the running processes once, then for each known Ray process
    keyword signals every match: ``proc.kill()`` when ``force`` is true,
    ``proc.terminate()`` otherwise. A summary of how many processes were
    found and stopped is printed at the end.

    Args:
        force: If true, kill matching processes instead of requesting
            termination.
        verbose: Forwarded to ``cli_logger.configure``; when truthy each
            signalled process is also reported through the old-style logger.
        log_style: Forwarded to ``cli_logger.configure``.
        log_color: Forwarded to ``cli_logger.configure``.

    Raises:
        ValueError: On Linux, if a command-name filter is longer than 15
            characters (an internal bug in the keyword table).
    """

    cli_logger.configure(log_style, log_color, verbose)
    # Note that raylet needs to exit before object store, otherwise
    # it cannot exit gracefully.
    is_linux = sys.platform.startswith("linux")
    processes_to_kill = [
        # The first element is the substring to filter.
        # The second element, if True, is to filter ps results by command name
        # (only the first 15 characters of the executable name on Linux);
        # if False, is to filter ps results by command with all its arguments.
        # See STANDARD FORMAT SPECIFIERS section of
        # http://man7.org/linux/man-pages/man1/ps.1.html
        # about comm and args. This can help avoid killing non-ray processes.
        # Format:
        # Keyword to filter, filter by command (True)/filter by args (False)
        ["raylet", True],
        ["plasma_store", True],
        ["gcs_server", True],
        ["monitor.py", False],
        ["redis-server", False],
        ["default_worker.py", False],  # Python worker.
        ["ray::", True],  # Python worker. TODO(mehrdadn): Fix for Windows
        ["io.ray.runtime.runner.worker.DefaultWorker", False],  # Java worker.
        ["log_monitor.py", False],
        ["reporter.py", False],
        ["dashboard.py", False],
        ["ray_process_reaper.py", False],
    ]

    # Snapshot (process, name, cmdline) once; processes that vanish or cannot
    # be inspected while being listed are skipped.
    process_infos = []
    for proc in psutil.process_iter(["name", "cmdline"]):
        try:
            process_infos.append((proc, proc.name(), proc.cmdline()))
        except psutil.Error:
            pass

    total_found = 0
    total_stopped = 0
    for keyword, filter_by_cmd in processes_to_kill:
        if filter_by_cmd and is_linux and len(keyword) > 15:
            # getting here is an internal bug, so we do not use cli_logger
            msg = ("The filter string should not be more than {} "
                   "characters. Actual length: {}. Filter: {}").format(
                       15, len(keyword), keyword)
            raise ValueError(msg)

        found = []
        for candidate in process_infos:
            proc, proc_cmd, proc_args = candidate
            corpus = (proc_cmd
                      if filter_by_cmd else subprocess.list2cmdline(proc_args))
            if keyword in corpus:
                found.append(candidate)

        for proc, proc_cmd, proc_args in found:
            total_found += 1

            proc_string = str(subprocess.list2cmdline(proc_args))
            if verbose:
                # ``force`` maps to kill(), the default to terminate(); the
                # label must describe the call actually made below.
                operation = "Killing" if force else "Terminating"
                # NOTE(review): other old_info calls in this file use "{}"
                # placeholders; confirm "%s" is formatted as intended here.
                cli_logger.old_info(logger, "%s process %s: %s", operation,
                                    proc.pid, proc_string)
            try:
                if force:
                    proc.kill()
                else:
                    # TODO(mehrdadn): On Windows, this is forceful termination.
                    # We don't want CTRL_BREAK_EVENT, because that would
                    # terminate the entire process group. What to do?
                    proc.terminate()

                if force:
                    cli_logger.verbose("Killed `{}` {} ", cf.bold(proc_string),
                                       cf.dimmed("(via SIGKILL)"))
                else:
                    cli_logger.verbose("Send termination request to `{}` {}",
                                       cf.bold(proc_string),
                                       cf.dimmed("(via SIGTERM)"))

                total_stopped += 1
            except psutil.NoSuchProcess:
                cli_logger.verbose(
                    "Attempted to stop `{}`, but process was already dead.",
                    cf.bold(proc_string))
            except (psutil.Error, OSError) as ex:
                cli_logger.error("Could not terminate `{}` due to {}",
                                 cf.bold(proc_string), str(ex))
                cli_logger.old_error(logger, "Error: %s", ex)

    if total_found == 0:
        cli_logger.print("Did not find any active Ray processes.")
    else:
        if total_stopped == total_found:
            cli_logger.success("Stopped all {} Ray processes.", total_stopped)
        else:
            cli_logger.warning(
                "Stopped only {} out of {} Ray processes. "
                "Set `{}` to see more details.", total_stopped, total_found,
                cf.bold("-v"))
            cli_logger.warning("Try running the command again, or use `{}`.",
                               cf.bold("--force"))
def run(self):
    """Apply the runtime configuration to the node and record the outcome.

    Calls ``self.do_update()``. On success the node is tagged
    ``STATUS_UP_TO_DATE`` together with the runtime hash (and the file
    mounts contents hash, when set) and ``self.exitcode`` is set to 0.
    On failure the node is tagged ``STATUS_UPDATE_FAILED``, the error is
    reported, and the exception is re-raised unless it is a
    ``click.ClickException``, in which case the method returns without
    setting ``self.exitcode``.

    Raises:
        click.ClickException: If interactive commands are allowed while
            command output is redirected.
        Exception: Any non-click exception raised by ``do_update``.
    """
    cli_logger.old_info(logger, "{}Updating to {}", self.log_prefix,
                        self.runtime_hash)

    if cmd_output_util.does_allow_interactive(
    ) and cmd_output_util.is_output_redirected():
        # this is most probably a bug since the user has no control
        # over these settings
        msg = ("Output was redirected for an interactive command. "
               "Either do not pass `--redirect-command-output` "
               "or also pass in `--use-normal-shells`.")
        cli_logger.abort(msg)
        raise click.ClickException(msg)

    try:
        with LogTimer(self.log_prefix +
                      "Applied config {}".format(self.runtime_hash)):
            self.do_update()
    except Exception as e:
        error_str = str(e)
        if hasattr(e, "cmd"):
            # ``cmd`` may be a single string or a sequence of arguments.
            # Joining a string would split it into characters, and a
            # failure here would hide the original error and skip the
            # status tag update below.
            cmd = e.cmd
            if isinstance(cmd, (list, tuple)):
                cmd_str = " ".join(map(str, cmd))
            else:
                cmd_str = str(cmd)
            error_str = "(Exit Status {}) {}".format(e.returncode, cmd_str)

        self.provider.set_node_tags(
            self.node_id, {TAG_RAY_NODE_STATUS: STATUS_UPDATE_FAILED})
        cli_logger.error("New status: {}", cf.bold(STATUS_UPDATE_FAILED))

        cli_logger.old_error(logger, "{}Error executing: {}\n",
                             self.log_prefix, error_str)

        cli_logger.error("!!!")
        if hasattr(e, "cmd"):
            cli_logger.error(
                "Setup command `{}` failed with exit code {}. stderr:",
                cf.bold(e.cmd), e.returncode)
        else:
            cli_logger.verbose_error("{}", str(vars(e)))
            # todo: handle this better somehow?
            cli_logger.error("{}", str(e))
        # todo: print stderr here
        cli_logger.error("!!!")
        cli_logger.newline()

        if isinstance(e, click.ClickException):
            # todo: why do we ignore this here
            return
        raise

    tags_to_set = {
        TAG_RAY_NODE_STATUS: STATUS_UP_TO_DATE,
        TAG_RAY_RUNTIME_CONFIG: self.runtime_hash,
    }
    if self.file_mounts_contents_hash is not None:
        tags_to_set[
            TAG_RAY_FILE_MOUNTS_CONTENTS] = self.file_mounts_contents_hash

    self.provider.set_node_tags(self.node_id, tags_to_set)
    cli_logger.labeled_value("New status", STATUS_UP_TO_DATE)

    self.exitcode = 0
def start(node_ip_address, address, port, redis_password, redis_shard_ports,
          object_manager_port, node_manager_port, gcs_server_port,
          min_worker_port, max_worker_port, memory, object_store_memory,
          redis_max_memory, num_cpus, num_gpus, resources, head,
          include_dashboard, dashboard_host, dashboard_port, block,
          plasma_directory, autoscaling_config, no_redirect_worker_output,
          no_redirect_output, plasma_store_socket_name, raylet_socket_name,
          temp_dir, java_worker_options, load_code_from_local,
          code_search_path, system_config, lru_evict,
          enable_object_reconstruction, metrics_export_port, log_style,
          log_color, verbose):
    """Start Ray processes manually on the local machine.

    Validates the option combination, builds a ``ray.parameter.RayParams``
    from the arguments and starts a ``ray.node.Node`` either as a head node
    or as a node joining the cluster at ``address``. With ``block`` set, the
    call then loops forever, exiting with status 1 as soon as any started
    process is reported dead.

    Args:
        node_ip_address: Optional host name or IP for this node; resolved
            with ``services.address_to_ip`` when given.
        address: Redis address of an existing cluster. Required without
            ``head``; must not be given with ``head``.
        port: Redis port for a head node; only allowed with ``head``.
        redis_password: Password used for the Redis connection.
        redis_shard_ports: Comma-separated ports; only allowed with
            ``head``. The number of shards is the number of ports.
        resources: JSON string parsed with ``json.loads``.
        head: Whether to start a head node.
        include_dashboard: Only allowed with ``head``.
        block: If true, block after startup and monitor the processes.
        autoscaling_config: Applied to the parameters only for a head node.
        no_redirect_worker_output, no_redirect_output: When true, the
            corresponding redirect parameter is set to ``True``, otherwise
            to ``None``.
        log_style, log_color, verbose: Forwarded to ``cli_logger.configure``.

        The remaining arguments are passed through to
        ``ray.parameter.RayParams`` unchanged.

    Raises:
        ValueError: If ``gcs_server_port`` is given without ``head``, or
            ``include_dashboard`` is set without ``head``.
        Exception: If ``resources`` cannot be parsed, or for the other
            invalid option combinations checked below.
        SystemExit: With code 1 when blocking and a process has died.
    """
    cli_logger.configure(log_style, log_color, verbose)

    if gcs_server_port and not head:
        raise ValueError(
            "gcs_server_port can be only assigned when you specify --head.")

    # Convert hostnames to numerical IP address.
    if node_ip_address is not None:
        node_ip_address = services.address_to_ip(node_ip_address)

    redis_address = None
    if address is not None:
        (redis_address, redis_address_ip,
         redis_address_port) = services.validate_redis_address(address)

    try:
        resources = json.loads(resources)
    except Exception:
        cli_logger.error("`{}` is not a valid JSON string.",
                         cf.bold("--resources"))
        # The example shown to the user must itself be valid JSON.
        cli_logger.abort(
            "Valid values look like this: `{}`",
            cf.bold("--resources='{\"CustomResource3\": 1, "
                    "\"CustomResource2\": 2}'"))

        raise Exception("Unable to parse the --resources argument using "
                        "json.loads. Try using a format like\n\n"
                        " --resources='{\"CustomResource1\": 3, "
                        "\"CustomResource2\": 2}'")

    redirect_worker_output = None if not no_redirect_worker_output else True
    redirect_output = None if not no_redirect_output else True
    ray_params = ray.parameter.RayParams(
        node_ip_address=node_ip_address,
        min_worker_port=min_worker_port,
        max_worker_port=max_worker_port,
        object_manager_port=object_manager_port,
        node_manager_port=node_manager_port,
        gcs_server_port=gcs_server_port,
        memory=memory,
        object_store_memory=object_store_memory,
        redis_password=redis_password,
        redirect_worker_output=redirect_worker_output,
        redirect_output=redirect_output,
        num_cpus=num_cpus,
        num_gpus=num_gpus,
        resources=resources,
        plasma_directory=plasma_directory,
        huge_pages=False,
        plasma_store_socket_name=plasma_store_socket_name,
        raylet_socket_name=raylet_socket_name,
        temp_dir=temp_dir,
        include_dashboard=include_dashboard,
        dashboard_host=dashboard_host,
        dashboard_port=dashboard_port,
        java_worker_options=java_worker_options,
        load_code_from_local=load_code_from_local,
        code_search_path=code_search_path,
        _system_config=system_config,
        lru_evict=lru_evict,
        enable_object_reconstruction=enable_object_reconstruction,
        metrics_export_port=metrics_export_port)
    if head:
        num_redis_shards = None
        # Start Ray on the head node.
        if redis_shard_ports is not None:
            redis_shard_ports = redis_shard_ports.split(",")
            # Infer the number of Redis shards from the ports if the number is
            # not provided.
            num_redis_shards = len(redis_shard_ports)

        if redis_address is not None:
            cli_logger.abort(
                "`{}` starts a new Redis server, `{}` should not be set.",
                cf.bold("--head"), cf.bold("--address"))

            raise Exception("If --head is passed in, a Redis server will be "
                            "started, so a Redis address should not be "
                            "provided.")

        # Get the node IP address if one is not provided.
        ray_params.update_if_absent(
            node_ip_address=services.get_node_ip_address())
        cli_logger.labeled_value("Local node IP", ray_params.node_ip_address)
        cli_logger.old_info(logger, "Using IP address {} for this node.",
                            ray_params.node_ip_address)
        ray_params.update_if_absent(
            redis_port=port,
            redis_shard_ports=redis_shard_ports,
            redis_max_memory=redis_max_memory,
            num_redis_shards=num_redis_shards,
            redis_max_clients=None,
            autoscaling_config=autoscaling_config,
        )

        node = ray.node.Node(
            ray_params, head=True, shutdown_at_exit=block, spawn_reaper=block)
        redis_address = node.redis_address

        # this is a noop if new-style is not set, so the old logger calls
        # are still in place
        cli_logger.newline()
        startup_msg = "Ray runtime started."
        cli_logger.success("-" * len(startup_msg))
        cli_logger.success(startup_msg)
        cli_logger.success("-" * len(startup_msg))
        cli_logger.newline()
        with cli_logger.group("Next steps"):
            cli_logger.print(
                "To connect to this Ray runtime from another node, run")
            cli_logger.print(
                cf.bold(" ray start --address='{}'{}"), redis_address,
                " --redis-password='******'" if redis_password else "")
            cli_logger.newline()
            cli_logger.print("Alternatively, use the following Python code:")
            with cli_logger.indented():
                with cf.with_style("monokai") as c:
                    cli_logger.print("{} ray", c.magenta("import"))
                    cli_logger.print(
                        "ray{}init(address{}{}{})", c.magenta("."),
                        c.magenta("="), c.yellow("'auto'"),
                        ", _redis_password{}{}".format(
                            c.magenta("="),
                            c.yellow("'" + redis_password + "'"))
                        if redis_password else "")
            cli_logger.newline()
            cli_logger.print(
                cf.underlined("If connection fails, check your "
                              "firewall settings and "
                              "network configuration."))
            cli_logger.newline()
            cli_logger.print("To terminate the Ray runtime, run")
            cli_logger.print(cf.bold(" ray stop"))

        cli_logger.old_info(
            logger,
            "\nStarted Ray on this node. You can add additional nodes to "
            "the cluster by calling\n\n"
            " ray start --address='{}'{}\n\n"
            "from the node you wish to add. You can connect a driver to the "
            "cluster from Python by running\n\n"
            " import ray\n"
            " ray.init(address='auto'{})\n\n"
            "If you have trouble connecting from a different machine, check "
            "that your firewall is configured properly. If you wish to "
            "terminate the processes that have been started, run\n\n"
            " ray stop".format(
                redis_address, " --redis-password='******'"
                if redis_password else "", ", _redis_password='******'"
                if redis_password else ""))
    else:
        # Start Ray on a non-head node.
        if not (port is None):
            cli_logger.abort("`{}` should not be specified without `{}`.",
                             cf.bold("--port"), cf.bold("--head"))

            raise Exception("If --head is not passed in, --port is not "
                            "allowed.")
        if redis_shard_ports is not None:
            cli_logger.abort("`{}` should not be specified without `{}`.",
                             cf.bold("--redis-shard-ports"), cf.bold("--head"))

            raise Exception("If --head is not passed in, --redis-shard-ports "
                            "is not allowed.")
        if redis_address is None:
            cli_logger.abort("`{}` is required unless starting with `{}`.",
                             cf.bold("--address"), cf.bold("--head"))

            raise Exception("If --head is not passed in, --address must "
                            "be provided.")
        if include_dashboard:
            cli_logger.abort("`{}` should not be specified without `{}`.",
                             cf.bold("--include-dashboard"), cf.bold("--head"))

            raise ValueError(
                "If --head is not passed in, the --include-dashboard "
                "flag is not relevant.")

        # Wait for the Redis server to be started. And throw an exception if we
        # can't connect to it.
        services.wait_for_redis_to_start(
            redis_address_ip, redis_address_port, password=redis_password)

        # Create a Redis client.
        redis_client = services.create_redis_client(
            redis_address, password=redis_password)

        # Check that the version information on this node matches the version
        # information that the cluster was started with.
        services.check_version_info(redis_client)

        # Get the node IP address if one is not provided.
        ray_params.update_if_absent(
            node_ip_address=services.get_node_ip_address(redis_address))

        cli_logger.labeled_value("Local node IP", ray_params.node_ip_address)
        cli_logger.old_info(logger, "Using IP address {} for this node.",
                            ray_params.node_ip_address)

        # Check that there aren't already Redis clients with the same IP
        # address connected with this Redis instance. This raises an exception
        # if the Redis server already has clients on this node.
        check_no_existing_redis_clients(ray_params.node_ip_address,
                                        redis_client)
        ray_params.update(redis_address=redis_address)
        node = ray.node.Node(
            ray_params, head=False, shutdown_at_exit=block, spawn_reaper=block)

        cli_logger.newline()
        startup_msg = "Ray runtime started."
        cli_logger.success("-" * len(startup_msg))
        cli_logger.success(startup_msg)
        cli_logger.success("-" * len(startup_msg))
        cli_logger.newline()
        cli_logger.print("To terminate the Ray runtime, run")
        cli_logger.print(cf.bold(" ray stop"))

        cli_logger.old_info(
            logger, "\nStarted Ray on this node. If you wish to terminate the "
            "processes that have been started, run\n\n"
            " ray stop")

    if block:
        cli_logger.newline()
        with cli_logger.group(cf.bold("--block")):
            cli_logger.print(
                "This command will now block until terminated by a signal.")
            cli_logger.print(
                "Running subprocesses are monitored and a message will be "
                "printed if any of them terminate unexpectedly.")

        # Poll once per second; the first time any process is reported
        # dead, report all of them and exit.
        while True:
            time.sleep(1)
            deceased = node.dead_processes()
            if len(deceased) > 0:
                cli_logger.newline()
                cli_logger.error("Some Ray subprocesses exited unexpectedly:")
                cli_logger.old_error(logger,
                                     "Ray processes died unexpectedly:")

                with cli_logger.indented():
                    for process_type, process in deceased:
                        cli_logger.error(
                            "{}",
                            cf.bold(str(process_type)),
                            _tags={"exit code": str(process.returncode)})
                        cli_logger.old_error(
                            logger, "\t{} died with exit code {}".format(
                                process_type, process.returncode))

                # shutdown_at_exit will handle cleanup.
                cli_logger.newline()
                cli_logger.error("Remaining processes will be killed.")
                cli_logger.old_error(
                    logger, "Killing remaining processes and exiting...")
                sys.exit(1)