def _get_head_node(config,
                   config_file,
                   override_cluster_name,
                   create_if_needed=False):
    """Return the id of the cluster's head node.

    Args:
        config: cluster config dict; "provider" and "cluster_name" are read.
        config_file: path of the cluster config, forwarded to
            get_or_create_head_node when a head node has to be created.
        override_cluster_name: forwarded to get_or_create_head_node.
        create_if_needed: when True and no head node is found, create one
            and look it up again.

    Raises:
        RuntimeError: if no head node exists and create_if_needed is False.
    """
    provider = get_node_provider(config["provider"], config["cluster_name"])
    try:
        head_filter = {TAG_RAY_NODE_TYPE: "head"}
        candidates = provider.non_terminated_nodes(head_filter)
    finally:
        # The provider is released before anything else happens, including
        # the optional creation of a new head node below.
        provider.cleanup()

    if len(candidates) > 0:
        return candidates[0]

    if not create_if_needed:
        raise RuntimeError("Head node of cluster ({}) not found!".format(
            config["cluster_name"]))

    get_or_create_head_node(
        config,
        config_file,
        restart_only=False,
        no_restart=False,
        yes=True,
        override_cluster_name=override_cluster_name)
    # Second lookup must not try to create again.
    return _get_head_node(
        config, config_file, override_cluster_name, create_if_needed=False)
def __init__(self,
             node_id,
             provider_config,
             auth_config,
             cluster_name,
             file_mounts,
             setup_cmds,
             runtime_hash,
             redirect_output=True,
             process_runner=subprocess):
    """Set up the state needed to update a single node.

    Args:
        node_id: id of the node to update; its external IP is looked up
            through the provider.
        provider_config: provider section handed to get_node_provider.
        auth_config: mapping with "ssh_private_key" and "ssh_user".
        cluster_name: cluster name handed to get_node_provider.
        file_mounts: stored as-is on the instance.
        setup_cmds: stored as-is on the instance.
        runtime_hash: stored as-is on the instance.
        redirect_output: when True, stdout/stderr point at a fresh
            "node-updater-" temp file that is not deleted on close;
            otherwise they point at the process's own stdout/stderr.
        process_runner: stored as-is on the instance.
    """
    self.daemon = True
    self.process_runner = process_runner

    # Connection details.
    self.provider = get_node_provider(provider_config, cluster_name)
    self.ssh_private_key = auth_config["ssh_private_key"]
    self.ssh_user = auth_config["ssh_user"]
    self.ssh_ip = self.provider.external_ip(node_id)

    # What to apply to the node.
    self.node_id = node_id
    self.file_mounts = file_mounts
    self.setup_cmds = setup_cmds
    self.runtime_hash = runtime_hash

    # Output destination.
    if not redirect_output:
        self.logfile = None
        self.output_name = "(console)"
        self.stdout = sys.stdout
        self.stderr = sys.stderr
    else:
        log_file = tempfile.NamedTemporaryFile(
            mode="w", prefix="node-updater-", delete=False)
        self.logfile = log_file
        self.output_name = log_file.name
        self.stdout = log_file
        self.stderr = log_file
def __init__(self,
             config_path,
             load_metrics,
             max_concurrent_launches=AUTOSCALER_MAX_CONCURRENT_LAUNCHES,
             max_failures=AUTOSCALER_MAX_NUM_FAILURES,
             process_runner=subprocess,
             verbose_updates=False,
             node_updater_cls=NodeUpdaterProcess,
             update_interval_s=AUTOSCALER_UPDATE_INTERVAL_S):
    """Load the cluster config and initialise the bookkeeping state.

    The config at config_path is loaded via reload_config with
    errors_fatal=True, then the node provider is built from its
    "provider" and "cluster_name" entries. The remaining arguments are
    stored on the instance unchanged.

    Raises:
        AssertionError: if a local path in config["file_mounts"] does not
            exist.
    """
    self.config_path = config_path
    self.reload_config(errors_fatal=True)
    self.load_metrics = load_metrics
    self.provider = get_node_provider(self.config["provider"],
                                      self.config["cluster_name"])

    # Tunables supplied by the caller.
    self.max_failures = max_failures
    self.max_concurrent_launches = max_concurrent_launches
    self.verbose_updates = verbose_updates
    self.process_runner = process_runner
    self.node_updater_cls = node_updater_cls
    self.update_interval_s = update_interval_s

    # Map from node_id to NodeUpdater processes
    self.updaters = {}
    self.num_failed_updates = defaultdict(int)
    self.num_successful_updates = defaultdict(int)
    self.num_failures = 0
    self.last_update_time = 0.0

    # Every local side of a file mount has to exist up front.
    for mount_source in self.config["file_mounts"].values():
        assert os.path.exists(mount_source)

    print("StandardAutoscaler: {}".format(self.config))
def __init__(self,
             config_path,
             load_metrics,
             max_launch_batch=AUTOSCALER_MAX_LAUNCH_BATCH,
             max_concurrent_launches=AUTOSCALER_MAX_CONCURRENT_LAUNCHES,
             max_failures=AUTOSCALER_MAX_NUM_FAILURES,
             process_runner=subprocess,
             update_interval_s=AUTOSCALER_UPDATE_INTERVAL_S):
    """Load the cluster config, start the node launchers, check mounts.

    The config at config_path is loaded via reload_config with
    errors_fatal=True. One NodeLauncher is started per launch batch, i.e.
    ceil(max_concurrent_launches / max_launch_batch) of them, all sharing
    a single launch queue and pending-launch counter.

    Raises:
        AssertionError: if a local path in config["file_mounts"] does not
            exist after "~" expansion.
    """
    self.config_path = config_path
    self.reload_config(errors_fatal=True)
    self.load_metrics = load_metrics
    self.provider = get_node_provider(self.config["provider"],
                                      self.config["cluster_name"])

    self.max_failures = max_failures
    self.max_launch_batch = max_launch_batch
    self.max_concurrent_launches = max_concurrent_launches
    self.process_runner = process_runner

    # Map from node_id to NodeUpdater processes
    self.updaters = {}
    self.num_failed_updates = defaultdict(int)
    self.num_successful_updates = defaultdict(int)
    self.num_failures = 0
    self.last_update_time = 0.0
    self.update_interval_s = update_interval_s
    self.bringup = True

    # Node launchers
    self.launch_queue = queue.Queue()
    self.num_launches_pending = ConcurrentCounter()
    launcher_count = int(
        math.ceil(max_concurrent_launches / float(max_launch_batch)))
    for _ in range(launcher_count):
        launcher = NodeLauncher(
            provider=self.provider,
            queue=self.launch_queue,
            pending=self.num_launches_pending)
        launcher.daemon = True
        launcher.start()

    # Expand local file_mounts to allow ~ in the paths. This can't be done
    # earlier when the config is written since we might be on different
    # platform and the expansion would result in wrong path.
    expanded_mounts = {}
    for remote, local in self.config["file_mounts"].items():
        expanded_mounts[remote] = os.path.expanduser(local)
    self.config["file_mounts"] = expanded_mounts

    for mount_source in expanded_mounts.values():
        assert os.path.exists(mount_source)

    logger.info("StandardAutoscaler: {}".format(self.config))
def get_head_node_ip(config_file):
    """Returns head node IP for given configuration file if exists.

    Args:
        config_file: path to the cluster YAML config.

    Returns:
        The external IP of the first node tagged as "Head".

    Prints an error and exits the process with status 1 when no head node
    is found.
    """
    # Close the config file deterministically instead of leaking the handle.
    # safe_load is used because a cluster config is plain YAML: it avoids
    # arbitrary object construction, and recent PyYAML releases reject
    # yaml.load() without an explicit Loader.
    with open(config_file) as f:
        config = yaml.safe_load(f)

    provider = get_node_provider(config["provider"], config["cluster_name"])
    head_node_tags = {
        TAG_RAY_NODE_TYPE: "Head",
    }
    nodes = provider.nodes(head_node_tags)
    if len(nodes) > 0:
        head_node = nodes[0]
        return provider.external_ip(head_node)
    else:
        print("Head node of cluster ({}) not found!".format(
            config["cluster_name"]))
        sys.exit(1)
def get_worker_node_ips(config_file, override_cluster_name):
    """Returns worker node IPs for given configuration file.

    Args:
        config_file: path to the cluster YAML config.
        override_cluster_name: if not None, replaces the config's
            "cluster_name".

    Returns:
        A list with one IP per non-terminated worker node: internal IPs
        when the provider config sets "use_internal_ips" to True, external
        IPs otherwise.
    """
    # Close the config file deterministically instead of leaking the handle.
    # safe_load is used because a cluster config is plain YAML: it avoids
    # arbitrary object construction, and recent PyYAML releases reject
    # yaml.load() without an explicit Loader.
    with open(config_file) as f:
        config = yaml.safe_load(f)
    if override_cluster_name is not None:
        config["cluster_name"] = override_cluster_name

    provider = get_node_provider(config["provider"], config["cluster_name"])
    try:
        nodes = provider.non_terminated_nodes({TAG_RAY_NODE_TYPE: "worker"})

        if config.get("provider", {}).get("use_internal_ips", False) is True:
            return [provider.internal_ip(node) for node in nodes]
        else:
            return [provider.external_ip(node) for node in nodes]
    finally:
        provider.cleanup()
def get_head_node_ip(config_file, override_cluster_name):
    """Returns head node IP for given configuration file if exists.

    Args:
        config_file: path to the cluster YAML config.
        override_cluster_name: if not None, replaces the config's
            "cluster_name".

    Returns:
        The head node's internal IP when the provider config sets
        "use_internal_ips" to True, its external IP otherwise.
    """
    # Close the config file deterministically instead of leaking the handle.
    # safe_load is used because a cluster config is plain YAML: it avoids
    # arbitrary object construction, and recent PyYAML releases reject
    # yaml.load() without an explicit Loader.
    with open(config_file) as f:
        config = yaml.safe_load(f)
    if override_cluster_name is not None:
        config["cluster_name"] = override_cluster_name

    provider = get_node_provider(config["provider"], config["cluster_name"])
    try:
        head_node = _get_head_node(config, config_file, override_cluster_name)
        if config.get("provider", {}).get("use_internal_ips", False) is True:
            head_node_ip = provider.internal_ip(head_node)
        else:
            head_node_ip = provider.external_ip(head_node)
    finally:
        provider.cleanup()

    return head_node_ip
def kill_node(config_file, yes, hard, override_cluster_name):
    """Kills a random Raylet worker.

    Args:
        config_file: path to the cluster YAML config.
        yes: forwarded to confirm() together with the warning prompt.
        hard: when True the chosen node is terminated through the provider;
            otherwise "ray stop" is executed on it.
        override_cluster_name: if not None, replaces the config's
            "cluster_name".

    Returns:
        The IP of the chosen node: internal when the provider config sets
        "use_internal_ips" to True, external otherwise.

    Raises:
        IndexError: from random.choice when there are no worker nodes.
    """
    # Close the config file deterministically instead of leaking the handle.
    with open(config_file) as f:
        config = yaml.safe_load(f)
    if override_cluster_name is not None:
        config["cluster_name"] = override_cluster_name
    config = _bootstrap_config(config)

    confirm("This will kill a node in your cluster", yes)

    provider = get_node_provider(config["provider"], config["cluster_name"])
    try:
        nodes = provider.non_terminated_nodes({
            TAG_RAY_NODE_TYPE: NODE_TYPE_WORKER
        })
        node = random.choice(nodes)
        logger.info("kill_node: Shutdown worker {}".format(node))
        if hard:
            provider.terminate_node(node)
        else:
            updater = NodeUpdaterThread(
                node_id=node,
                provider_config=config["provider"],
                provider=provider,
                auth_config=config["auth"],
                cluster_name=config["cluster_name"],
                file_mounts=config["file_mounts"],
                initialization_commands=[],
                setup_commands=[],
                ray_start_commands=[],
                runtime_hash="")

            _exec(updater, "ray stop", False, False)

        # Pause before querying the node's IP.
        time.sleep(5)

        if config.get("provider", {}).get("use_internal_ips", False) is True:
            node_ip = provider.internal_ip(node)
        else:
            node_ip = provider.external_ip(node)
    finally:
        provider.cleanup()

    return node_ip
def teardown_cluster(config_file, yes, workers_only, override_cluster_name):
    """Destroys all nodes of a Ray cluster described by a config file.

    Args:
        config_file: path to the cluster YAML config.
        yes: forwarded to confirm() together with the warning prompt.
        workers_only: when True the head node is left alone.
        override_cluster_name: if not None, replaces the config's
            "cluster_name".
    """
    # Close the config file deterministically instead of leaking the handle.
    # safe_load is used because a cluster config is plain YAML: it avoids
    # arbitrary object construction, and recent PyYAML releases reject
    # yaml.load() without an explicit Loader.
    with open(config_file) as f:
        config = yaml.safe_load(f)
    if override_cluster_name is not None:
        config["cluster_name"] = override_cluster_name
    validate_config(config)
    config = fillout_defaults(config)

    confirm("This will destroy your cluster", yes)

    provider = get_node_provider(config["provider"], config["cluster_name"])

    try:

        def remaining_nodes():
            """List the head (unless workers_only) and worker node ids."""
            remaining = []
            if not workers_only:
                remaining.extend(provider.nodes({TAG_RAY_NODE_TYPE: "head"}))
            remaining.extend(provider.nodes({TAG_RAY_NODE_TYPE: "worker"}))
            return remaining

        # Loop here to check that both the head and worker nodes are actually
        # really gone
        A = remaining_nodes()
        with LogTimer("teardown_cluster: Termination done."):
            while A:
                logger.info("teardown_cluster: "
                            "Terminating {} nodes...".format(len(A)))
                provider.terminate_nodes(A)
                time.sleep(1)
                A = remaining_nodes()
    finally:
        provider.cleanup()
def teardown_cluster(config_file, yes, workers_only, override_cluster_name):
    """Destroys all nodes of a Ray cluster described by a config file.

    Args:
        config_file: path to the cluster YAML config.
        yes: forwarded to confirm() together with the warning prompt.
        workers_only: when True the head node is left alone.
        override_cluster_name: if not None, replaces the config's
            "cluster_name".
    """
    # Close the config file deterministically instead of leaking the handle.
    # safe_load is used because a cluster config is plain YAML: it avoids
    # arbitrary object construction, and recent PyYAML releases reject
    # yaml.load() without an explicit Loader.
    with open(config_file) as f:
        config = yaml.safe_load(f)
    if override_cluster_name is not None:
        config["cluster_name"] = override_cluster_name
    validate_config(config)
    config = fillout_defaults(config)

    confirm("This will destroy your cluster", yes)

    provider = get_node_provider(config["provider"], config["cluster_name"])

    try:

        def remaining_nodes():
            """List non-terminated head (unless workers_only) and workers."""
            remaining = []
            if not workers_only:
                remaining.extend(
                    provider.non_terminated_nodes({
                        TAG_RAY_NODE_TYPE: "head"
                    }))
            remaining.extend(
                provider.non_terminated_nodes({
                    TAG_RAY_NODE_TYPE: "worker"
                }))
            return remaining

        # Loop here to check that both the head and worker nodes are actually
        # really gone
        A = remaining_nodes()
        with LogTimer("teardown_cluster: Termination done."):
            while A:
                logger.info("teardown_cluster: "
                            "Terminating {} nodes...".format(len(A)))
                provider.terminate_nodes(A)
                time.sleep(1)
                A = remaining_nodes()
    finally:
        provider.cleanup()
def _launch_node(self, config, count):
    """Ask the provider to create `count` worker nodes.

    The provider is created lazily from `config` the first time it is
    needed. The worker list is read before and after the create call; if
    the second read contains no node that was absent from the first, an
    error is logged.
    """
    if self.provider is None:
        self.provider = get_node_provider(config["provider"],
                                          config["cluster_name"])

    worker_filter = {TAG_RAY_NODE_TYPE: "worker"}
    workers_before = self.provider.nodes(tag_filters=worker_filter)

    launch_hash = hash_launch_conf(config["worker_nodes"], config["auth"])
    worker_name = "ray-{}-worker".format(config["cluster_name"])
    node_tags = {
        TAG_RAY_NODE_NAME: worker_name,
        TAG_RAY_NODE_TYPE: "worker",
        TAG_RAY_NODE_STATUS: "uninitialized",
        TAG_RAY_LAUNCH_CONFIG: launch_hash,
    }
    self.provider.create_node(config["worker_nodes"], node_tags, count)

    workers_after = self.provider.nodes(tag_filters=worker_filter)
    newly_seen = set(workers_after).difference(workers_before)
    if not newly_seen:
        logger.error("No new nodes reported after node creation")
def get_head_node_ip(config_file: str,
                     override_cluster_name: Optional[str]) -> str:
    """Returns head node IP for given configuration file if exists.

    Args:
        config_file: path to the cluster YAML config.
        override_cluster_name: if not None, replaces the config's
            "cluster_name".

    Returns:
        The head node's internal IP when the provider config sets
        "use_internal_ips" to True, its external IP otherwise.
    """
    # Close the config file deterministically instead of leaking the handle.
    with open(config_file) as f:
        config = yaml.safe_load(f)
    if override_cluster_name is not None:
        config["cluster_name"] = override_cluster_name

    provider = get_node_provider(config["provider"], config["cluster_name"])
    try:
        head_node = _get_head_node(config, config_file, override_cluster_name)
        if config.get("provider", {}).get("use_internal_ips", False) is True:
            head_node_ip = provider.internal_ip(head_node)
        else:
            head_node_ip = provider.external_ip(head_node)
    finally:
        provider.cleanup()

    return head_node_ip
def get_worker_node_ips(config_file, override_cluster_name):
    """Returns worker node IPs for given configuration file.

    Args:
        config_file: path to the cluster YAML config.
        override_cluster_name: if not None, replaces the config's
            "cluster_name".

    Returns:
        A list with one IP per non-terminated worker node: internal IPs
        when the provider config sets "use_internal_ips" to True, external
        IPs otherwise.
    """
    # Close the config file deterministically instead of leaking the handle.
    with open(config_file) as f:
        config = yaml.safe_load(f)
    if override_cluster_name is not None:
        config["cluster_name"] = override_cluster_name

    provider = get_node_provider(config["provider"], config["cluster_name"])
    try:
        nodes = provider.non_terminated_nodes({
            TAG_RAY_NODE_TYPE: NODE_TYPE_WORKER
        })

        if config.get("provider", {}).get("use_internal_ips", False) is True:
            return [provider.internal_ip(node) for node in nodes]
        else:
            return [provider.external_ip(node) for node in nodes]
    finally:
        provider.cleanup()
def teardown_cluster(config_file):
    """Destroys all nodes of a Ray cluster described by a config file.

    Nodes tagged as "Head" are terminated first. After that every node the
    provider still reports is terminated, re-checking every 5 seconds
    until the provider reports none.

    Args:
        config_file: path to the cluster YAML config.
    """
    # Close the config file deterministically instead of leaking the handle.
    # safe_load is used because a cluster config is plain YAML: it avoids
    # arbitrary object construction, and recent PyYAML releases reject
    # yaml.load() without an explicit Loader.
    with open(config_file) as f:
        config = yaml.safe_load(f)

    validate_config(config)
    provider = get_node_provider(config["provider"], config["cluster_name"])
    head_node_tags = {
        TAG_RAY_NODE_TYPE: "Head",
    }
    for node in provider.nodes(head_node_tags):
        print("Terminating head node {}".format(node))
        provider.terminate_node(node)

    # An empty tag filter is passed, so everything still listed is treated
    # as a worker here.
    nodes = provider.nodes({})
    while nodes:
        for node in nodes:
            print("Terminating worker {}".format(node))
            provider.terminate_node(node)
        time.sleep(5)
        nodes = provider.nodes({})
def _get_head_node(config, config_file, create_if_needed=False):
    """Return the id of the cluster's head node.

    Args:
        config: cluster config dict; "provider" and "cluster_name" are read.
        config_file: path of the cluster config, forwarded to
            get_or_create_head_node when a head node has to be created.
        create_if_needed: when True and no head node is found, create one
            and look it up again.

    Prints an error and exits the process with status 1 when no head node
    exists and create_if_needed is False.
    """
    provider = get_node_provider(config["provider"], config["cluster_name"])
    head_filter = {TAG_RAY_NODE_TYPE: "head"}
    candidates = provider.nodes(head_filter)

    if len(candidates) > 0:
        return candidates[0]

    if not create_if_needed:
        print("Head node of cluster ({}) not found!".format(
            config["cluster_name"]))
        sys.exit(1)

    get_or_create_head_node(
        config, config_file, restart_only=False, no_restart=False, yes=True)
    # Second lookup must not try to create again.
    return _get_head_node(config, config_file, create_if_needed=False)
def __init__(self,
             config_path,
             load_metrics,
             max_concurrent_launches=AUTOSCALER_MAX_CONCURRENT_LAUNCHES,
             max_failures=AUTOSCALER_MAX_NUM_FAILURES,
             process_runner=subprocess,
             verbose_updates=True,
             node_updater_cls=NodeUpdaterProcess,
             update_interval_s=AUTOSCALER_UPDATE_INTERVAL_S):
    """Load the cluster config and initialise the bookkeeping state.

    The config at config_path is loaded via reload_config with
    errors_fatal=True, then the node provider is built from its
    "provider" and "cluster_name" entries. The remaining arguments are
    stored on the instance unchanged.

    Raises:
        AssertionError: if a local path in config["file_mounts"] does not
            exist after "~" expansion.
    """
    self.config_path = config_path
    self.reload_config(errors_fatal=True)
    self.load_metrics = load_metrics
    self.provider = get_node_provider(self.config["provider"],
                                      self.config["cluster_name"])

    # Tunables supplied by the caller.
    self.max_failures = max_failures
    self.max_concurrent_launches = max_concurrent_launches
    self.verbose_updates = verbose_updates
    self.process_runner = process_runner
    self.node_updater_cls = node_updater_cls
    self.update_interval_s = update_interval_s

    # Map from node_id to NodeUpdater processes
    self.updaters = {}
    self.num_failed_updates = defaultdict(int)
    self.num_successful_updates = defaultdict(int)
    self.num_failures = 0
    self.last_update_time = 0.0

    # Expand local file_mounts to allow ~ in the paths. This can't be done
    # earlier when the config is written since we might be on different
    # platform and the expansion would result in wrong path.
    expanded_mounts = {}
    for remote, local in self.config["file_mounts"].items():
        expanded_mounts[remote] = os.path.expanduser(local)
    self.config["file_mounts"] = expanded_mounts

    for mount_source in expanded_mounts.values():
        assert os.path.exists(mount_source)

    print("StandardAutoscaler: {}".format(self.config))
def __init__(self,
             node_id,
             provider_config,
             auth_config,
             cluster_name,
             file_mounts,
             setup_cmds,
             runtime_hash,
             redirect_output=True,
             process_runner=subprocess,
             use_internal_ip=False):
    """Set up the state needed to update a single node.

    Args:
        node_id: id of the node to update.
        provider_config: provider section handed to get_node_provider; its
            "use_internal_ips" entry can also switch on internal IPs.
        auth_config: mapping with "ssh_private_key" and "ssh_user".
        cluster_name: cluster name handed to get_node_provider.
        file_mounts: remote path -> local path; local paths get "~"
            expanded.
        setup_cmds: stored as-is on the instance.
        runtime_hash: stored as-is on the instance.
        redirect_output: when True, stdout/stderr and a logging handler
            point at a fresh "node-updater-" temp file that is not deleted
            on close; otherwise the process's own stdout/stderr are used.
        process_runner: stored as-is on the instance.
        use_internal_ip: request the node's internal IP explicitly.
    """
    self.daemon = True
    self.process_runner = process_runner
    self.node_id = node_id

    internal_from_config = provider_config.get("use_internal_ips", False)
    self.use_internal_ip = use_internal_ip or internal_from_config

    self.provider = get_node_provider(provider_config, cluster_name)
    self.ssh_private_key = auth_config["ssh_private_key"]
    self.ssh_user = auth_config["ssh_user"]
    self.ssh_ip = self.get_node_ip()

    expanded_mounts = {}
    for remote, local in file_mounts.items():
        expanded_mounts[remote] = os.path.expanduser(local)
    self.file_mounts = expanded_mounts

    self.setup_cmds = setup_cmds
    self.runtime_hash = runtime_hash
    # Child logger named after the node id.
    self.logger = logger.getChild(str(node_id))

    if not redirect_output:
        self.logfile = None
        self.output_name = "(console)"
        self.stdout = sys.stdout
        self.stderr = sys.stderr
    else:
        self.logfile = tempfile.NamedTemporaryFile(
            mode="w", prefix="node-updater-", delete=False)
        # Route this node's log records into the same file.
        file_handler = logging.StreamHandler(stream=self.logfile)
        file_handler.setLevel(logging.INFO)
        self.logger.addHandler(file_handler)

        self.output_name = self.logfile.name
        self.stdout = self.logfile
        self.stderr = self.logfile
def reset(self, errors_fatal=False):
    """Reload the config from self.config_path and refresh derived state.

    On success this replaces self.config, self.runtime_hash and
    self.file_mounts_contents_hash, creates the provider if it is still
    unset, and enables the resource demand scheduler only when the config
    defines "available_node_types".

    Args:
        errors_fatal: when True any exception is re-raised; otherwise it
            is logged and the previous state is kept.
    """
    # The flag comes from the previously loaded config, if there is one.
    sync_continuously = (self.config.get("file_mounts_sync_continuously",
                                         False)
                         if hasattr(self, "config") else False)
    try:
        with open(self.config_path) as f:
            loaded = yaml.safe_load(f.read())
        validate_config(loaded)

        worker_commands = [
            loaded["worker_setup_commands"],
            loaded["worker_start_ray_commands"],
        ]
        hashes = hash_runtime_conf(
            loaded["file_mounts"],
            loaded["cluster_synced_files"],
            worker_commands,
            generate_file_mounts_contents_hash=sync_continuously,
        )
        (runtime_hash, mounts_contents_hash) = hashes

        self.config = loaded
        self.runtime_hash = runtime_hash
        self.file_mounts_contents_hash = mounts_contents_hash

        if not self.provider:
            self.provider = get_node_provider(self.config["provider"],
                                              self.config["cluster_name"])

        # Check whether we can enable the resource demand scheduler.
        if "available_node_types" not in self.config:
            self.available_node_types = None
            self.resource_demand_scheduler = None
        else:
            self.available_node_types = self.config["available_node_types"]
            self.resource_demand_scheduler = ResourceDemandScheduler(
                self.provider, self.available_node_types,
                self.config["max_workers"])
    except Exception as e:
        if not errors_fatal:
            logger.exception("StandardAutoscaler: Error parsing config.")
        else:
            raise e
def rsync(config_file, source, target, override_cluster_name, down):
    """Rsyncs files.

    Arguments:
        config_file: path to the cluster yaml
        source: source dir
        target: target dir
        override_cluster_name: set the name of the cluster
        down: whether we're syncing remote -> local
    """
    # Close the config file deterministically instead of leaking the handle.
    # safe_load is used because a cluster config is plain YAML: it avoids
    # arbitrary object construction, and recent PyYAML releases reject
    # yaml.load() without an explicit Loader.
    with open(config_file) as f:
        config = yaml.safe_load(f)
    if override_cluster_name is not None:
        config["cluster_name"] = override_cluster_name
    config = _bootstrap_config(config)
    head_node = _get_head_node(
        config, config_file, override_cluster_name, create_if_needed=False)

    provider = get_node_provider(config["provider"], config["cluster_name"])
    try:
        updater = NodeUpdaterThread(
            node_id=head_node,
            provider_config=config["provider"],
            provider=provider,
            auth_config=config["auth"],
            cluster_name=config["cluster_name"],
            file_mounts=config["file_mounts"],
            initialization_commands=[],
            setup_commands=[],
            runtime_hash="",
        )
        # Named so it does not shadow this function's own name.
        sync_fn = updater.rsync_down if down else updater.rsync_up
        sync_fn(source, target, check_error=False)
    finally:
        provider.cleanup()
def kill_node(config_file, yes, override_cluster_name):
    """Kills a random Raylet worker.

    Runs "ray stop" on a randomly chosen non-terminated worker node.

    Args:
        config_file: path to the cluster YAML config.
        yes: forwarded to confirm() together with the warning prompt.
        override_cluster_name: if not None, replaces the config's
            "cluster_name".

    Returns:
        The IP of the chosen node: internal when the provider config sets
        "use_internal_ips" to True, external otherwise.

    Raises:
        IndexError: from random.choice when there are no worker nodes.
    """
    # Close the config file deterministically instead of leaking the handle.
    # safe_load is used because a cluster config is plain YAML: it avoids
    # arbitrary object construction, and recent PyYAML releases reject
    # yaml.load() without an explicit Loader.
    with open(config_file) as f:
        config = yaml.safe_load(f)
    if override_cluster_name is not None:
        config["cluster_name"] = override_cluster_name
    config = _bootstrap_config(config)

    confirm("This will kill a node in your cluster", yes)

    provider = get_node_provider(config["provider"], config["cluster_name"])
    try:
        nodes = provider.non_terminated_nodes({TAG_RAY_NODE_TYPE: "worker"})
        node = random.choice(nodes)
        logger.info("kill_node: Terminating worker {}".format(node))

        updater = NodeUpdaterThread(
            node_id=node,
            provider_config=config["provider"],
            provider=provider,
            auth_config=config["auth"],
            cluster_name=config["cluster_name"],
            file_mounts=config["file_mounts"],
            initialization_commands=[],
            setup_commands=[],
            runtime_hash="")

        _exec(updater, "ray stop", False, False)

        # Pause before querying the node's IP.
        time.sleep(5)

        if config.get("provider", {}).get("use_internal_ips", False) is True:
            node_ip = provider.internal_ip(node)
        else:
            node_ip = provider.external_ip(node)
    finally:
        provider.cleanup()

    return node_ip
def rsync(config_file, source, target, override_cluster_name, down):
    """Rsyncs files.

    Arguments:
        config_file: path to the cluster yaml
        source: source dir
        target: target dir
        override_cluster_name: set the name of the cluster
        down: whether we're syncing remote -> local
    """
    # Close the config file deterministically instead of leaking the handle.
    # safe_load is used because a cluster config is plain YAML: it avoids
    # arbitrary object construction, and recent PyYAML releases reject
    # yaml.load() without an explicit Loader.
    with open(config_file) as f:
        config = yaml.safe_load(f)
    if override_cluster_name is not None:
        config["cluster_name"] = override_cluster_name
    config = _bootstrap_config(config)
    head_node = _get_head_node(
        config, config_file, override_cluster_name, create_if_needed=False)

    provider = get_node_provider(config["provider"], config["cluster_name"])
    try:
        updater = NodeUpdaterThread(
            node_id=head_node,
            provider_config=config["provider"],
            provider=provider,
            auth_config=config["auth"],
            cluster_name=config["cluster_name"],
            file_mounts=config["file_mounts"],
            initialization_commands=[],
            setup_commands=[],
            runtime_hash="",
        )
        # Named so it does not shadow this function's own name.
        sync_fn = updater.rsync_down if down else updater.rsync_up
        sync_fn(source, target, check_error=False)
    finally:
        provider.cleanup()
def teardown_cluster(config_file, yes):
    """Destroys all nodes of a Ray cluster described by a config file.

    Nodes tagged as "Head" are terminated first. After that every node the
    provider still reports is terminated, re-checking every 5 seconds
    until the provider reports none.

    Args:
        config_file: path to the cluster YAML config.
        yes: forwarded to confirm() together with the warning prompt.
    """
    # Close the config file deterministically instead of leaking the handle.
    # safe_load is used because a cluster config is plain YAML: it avoids
    # arbitrary object construction, and recent PyYAML releases reject
    # yaml.load() without an explicit Loader.
    with open(config_file) as f:
        config = yaml.safe_load(f)

    validate_config(config)
    config = fillout_defaults(config)

    confirm("This will destroy your cluster", yes)

    provider = get_node_provider(config["provider"], config["cluster_name"])
    head_node_tags = {
        TAG_RAY_NODE_TYPE: "Head",
    }
    for node in provider.nodes(head_node_tags):
        print("Terminating head node {}".format(node))
        provider.terminate_node(node)

    # An empty tag filter is passed, so everything still listed is treated
    # as a worker here.
    nodes = provider.nodes({})
    while nodes:
        for node in nodes:
            print("Terminating worker {}".format(node))
            provider.terminate_node(node)
        time.sleep(5)
        nodes = provider.nodes({})
def get_or_create_head_node(config,
                            config_file,
                            no_restart,
                            restart_only,
                            yes,
                            override_cluster_name,
                            _provider=None,
                            _runner=subprocess):
    """Create the cluster head node, which in turn creates the workers.

    Looks up the existing head node, relaunches it when its launch-config
    hash differs from the one computed from `config`, then runs a
    NodeUpdaterThread against it and prints follow-up commands.

    Args:
        config: cluster config dict. It is deep-copied first, so the
            caller's dict is not modified.
        config_file: path to the cluster config; the value as given is
            used in the "Useful commands" output, the absolute path in the
            old-style output.
        no_restart: when True, setup commands run but the Ray start
            commands do not.
        restart_only: when True, the Ray start commands run but setup
            commands do not. Checked before no_restart.
        yes: forwarded to the confirmation prompts.
        override_cluster_name: when truthy, added as a --cluster-name
            modifier to the printed commands.
        _provider: provider to use instead of building one from `config`.
        _runner: passed to the updater as its process_runner.

    Raises:
        RuntimeError: if a newly created head node is not reported by the
            provider within 50 seconds.

    Exits the process with status 1 if the updater's exit code is non-zero.
    The provider's cleanup() is always called on the way out.
    """
    provider = (_provider or get_node_provider(config["provider"],
                                               config["cluster_name"]))

    config = copy.deepcopy(config)
    raw_config_file = config_file  # used for printing to the user
    config_file = os.path.abspath(config_file)
    try:
        head_node_tags = {
            TAG_RAY_NODE_KIND: NODE_KIND_HEAD,
        }
        nodes = provider.non_terminated_nodes(head_node_tags)
        if len(nodes) > 0:
            head_node = nodes[0]
        else:
            head_node = None

        if not head_node:
            cli_logger.confirm(
                yes,
                "No head node found. "
                "Launching a new cluster.",
                _abort=True)
            cli_logger.old_confirm("This will create a new cluster", yes)
        elif not no_restart:
            cli_logger.old_confirm("This will restart cluster services", yes)

        # Tell the user which of the three update modes applies to the
        # existing head node.
        if head_node:
            if restart_only:
                cli_logger.confirm(
                    yes,
                    "Updating cluster configuration and "
                    "restarting the cluster Ray runtime. "
                    "Setup commands will not be run due to `{}`.\n",
                    cf.bold("--restart-only"),
                    _abort=True)
            elif no_restart:
                cli_logger.print(
                    "Cluster Ray runtime will not be restarted due "
                    "to `{}`.", cf.bold("--no-restart"))
                cli_logger.confirm(
                    yes,
                    "Updating cluster configuration and "
                    "running setup commands.",
                    _abort=True)
            else:
                cli_logger.print(
                    "Updating cluster configuration and running full setup.")
                cli_logger.confirm(
                    yes,
                    cf.bold("Cluster Ray runtime will be restarted."),
                    _abort=True)
        cli_logger.newline()

        # TODO(ekl) this logic is duplicated in node_launcher.py (keep in sync)
        head_node_config = copy.deepcopy(config["head_node"])
        if "head_node_type" in config:
            head_node_tags[TAG_RAY_USER_NODE_TYPE] = config["head_node_type"]
            head_node_config.update(config["available_node_types"][
                config["head_node_type"]]["node_config"])

        # A missing head node, or one whose stored launch hash differs from
        # the current one, is (re)created.
        launch_hash = hash_launch_conf(head_node_config, config["auth"])
        if head_node is None or provider.node_tags(head_node).get(
                TAG_RAY_LAUNCH_CONFIG) != launch_hash:
            with cli_logger.group("Acquiring an up-to-date head node"):
                if head_node is not None:
                    cli_logger.print(
                        "Currently running head node is out-of-date with "
                        "cluster configuration")
                    cli_logger.print(
                        "hash is {}, expected {}",
                        cf.bold(
                            provider.node_tags(head_node).get(
                                TAG_RAY_LAUNCH_CONFIG)), cf.bold(launch_hash))
                    cli_logger.confirm(yes, "Relaunching it.", _abort=True)
                    cli_logger.old_confirm(
                        "Head node config out-of-date. It will be terminated",
                        yes)

                    cli_logger.old_info(
                        logger, "get_or_create_head_node: "
                        "Shutting down outdated head node {}", head_node)

                    provider.terminate_node(head_node)
                    cli_logger.print("Terminated head node {}", head_node)

                cli_logger.old_info(
                    logger,
                    "get_or_create_head_node: Launching new head node...")

                head_node_tags[TAG_RAY_LAUNCH_CONFIG] = launch_hash
                head_node_tags[TAG_RAY_NODE_NAME] = "ray-{}-head".format(
                    config["cluster_name"])
                provider.create_node(head_node_config, head_node_tags, 1)
                cli_logger.print("Launched a new head node")

                # Poll once per second, for at most 50 seconds, until the
                # provider reports exactly one node with the new tags.
                start = time.time()
                head_node = None
                with cli_logger.timed("Fetching the new head node"):
                    while True:
                        if time.time() - start > 50:
                            cli_logger.abort(
                                "Head node fetch timed out.")  # todo: msg
                            raise RuntimeError("Failed to create head node.")
                        nodes = provider.non_terminated_nodes(head_node_tags)
                        if len(nodes) == 1:
                            head_node = nodes[0]
                            break
                        time.sleep(1)
                cli_logger.newline()

        with cli_logger.group(
                "Setting up head node",
                _numbered=("<>", 1, 1),
                # cf.bold(provider.node_tags(head_node)[TAG_RAY_NODE_NAME]),
                _tags=dict()):  # add id, ARN to tags?

            # TODO(ekl) right now we always update the head node even if the
            # hash matches.
            # We could prompt the user for what they want to do here.
            # No need to pass in cluster_sync_files because we use this
            # hash to set up the head node
            (runtime_hash, file_mounts_contents_hash) = hash_runtime_conf(
                config["file_mounts"], None, config)

            cli_logger.old_info(
                logger,
                "get_or_create_head_node: Updating files on head node...")

            # Rewrite the auth config so that the head
            # node can update the workers
            remote_config = copy.deepcopy(config)

            # drop proxy options if they exist, otherwise
            # head node won't be able to connect to workers
            remote_config["auth"].pop("ssh_proxy_command", None)

            if "ssh_private_key" in config["auth"]:
                remote_key_path = "~/ray_bootstrap_key.pem"
                remote_config["auth"]["ssh_private_key"] = remote_key_path

            # Adjust for new file locations
            new_mounts = {}
            for remote_path in config["file_mounts"]:
                new_mounts[remote_path] = remote_path
            remote_config["file_mounts"] = new_mounts
            remote_config["no_restart"] = no_restart

            # Now inject the rewritten config and SSH key into the head node
            # NOTE(review): the temp file is never closed explicitly here;
            # presumably it must stay on disk until the updater has synced
            # the file mounts — confirm before changing.
            remote_config_file = tempfile.NamedTemporaryFile(
                "w", prefix="ray-bootstrap-")
            remote_config_file.write(json.dumps(remote_config))
            remote_config_file.flush()
            config["file_mounts"].update(
                {"~/ray_bootstrap_config.yaml": remote_config_file.name})

            # remote_key_path is only bound when the same condition held
            # above.
            if "ssh_private_key" in config["auth"]:
                config["file_mounts"].update({
                    remote_key_path: config["auth"]["ssh_private_key"],
                })
            cli_logger.print("Prepared bootstrap config")

            if restart_only:
                setup_commands = []
                ray_start_commands = config["head_start_ray_commands"]
            elif no_restart:
                setup_commands = config["head_setup_commands"]
                ray_start_commands = []
            else:
                setup_commands = config["head_setup_commands"]
                ray_start_commands = config["head_start_ray_commands"]

            if not no_restart:
                warn_about_bad_start_command(ray_start_commands)

            updater = NodeUpdaterThread(
                node_id=head_node,
                provider_config=config["provider"],
                provider=provider,
                auth_config=config["auth"],
                cluster_name=config["cluster_name"],
                file_mounts=config["file_mounts"],
                initialization_commands=config["initialization_commands"],
                setup_commands=setup_commands,
                ray_start_commands=ray_start_commands,
                process_runner=_runner,
                runtime_hash=runtime_hash,
                file_mounts_contents_hash=file_mounts_contents_hash,
                is_head_node=True,
                docker_config=config.get("docker"))
            # The update runs to completion before continuing.
            updater.start()
            updater.join()

            # Refresh the node cache so we see the external ip if available
            provider.non_terminated_nodes(head_node_tags)

            if config.get("provider", {}).get("use_internal_ips",
                                              False) is True:
                head_node_ip = provider.internal_ip(head_node)
            else:
                head_node_ip = provider.external_ip(head_node)

            if updater.exitcode != 0:
                # todo: this does not follow the mockup and is not good enough
                cli_logger.abort("Failed to setup head node.")

                cli_logger.old_error(
                    logger, "get_or_create_head_node: "
                    "Updating {} failed", head_node_ip)
                sys.exit(1)

            cli_logger.old_info(
                logger, "get_or_create_head_node: "
                "Head node up-to-date, IP address is: {}", head_node_ip)

        monitor_str = "tail -n 100 -f /tmp/ray/session_*/logs/monitor*"
        if override_cluster_name:
            modifiers = " --cluster-name={}".format(
                quote(override_cluster_name))
        else:
            modifiers = ""

        # Old-style output is plain print(); it uses the absolute config
        # path.
        if cli_logger.old_style:
            print("To monitor autoscaling activity, you can run:\n\n"
                  " ray exec {} {}{}\n".format(config_file,
                                               quote(monitor_str), modifiers))
            print("To open a console on the cluster:\n\n"
                  " ray attach {}{}\n".format(config_file, modifiers))

            print("To get a remote shell to the cluster manually, run:\n\n"
                  " {}\n".format(
                      updater.cmd_runner.remote_shell_command_str()))

        cli_logger.newline()
        with cli_logger.group("Useful commands"):
            cli_logger.print("Monitor autoscaling with")
            cli_logger.print(
                cf.bold(" ray exec {}{} {}"), raw_config_file, modifiers,
                quote(monitor_str))

            cli_logger.print("Connect to a terminal on the cluster head")
            cli_logger.print(
                cf.bold(" ray attach {}{}"), raw_config_file, modifiers)
    finally:
        provider.cleanup()
def teardown_cluster(config_file: str, yes: bool, workers_only: bool,
                     override_cluster_name: Optional[str],
                     keep_min_workers: bool):
    """Destroys all nodes of a Ray cluster described by a config file.

    Unless `workers_only` is set, "ray stop" is first attempted on the
    cluster; a failure there is logged and ignored. If the config names a
    docker container, "docker stop" is attempted on every node about to be
    shut down. Nodes are then terminated in a loop, re-checking once per
    second, until none remain to be shut down.

    Args:
        config_file: path to the cluster YAML config.
        yes: forwarded to the confirmation prompts.
        workers_only: when True the head node is left running.
        override_cluster_name: if not None, replaces the config's
            "cluster_name".
        keep_min_workers: when True, config["min_workers"] randomly chosen
            workers are spared from each termination round.
    """
    # Close the config file deterministically instead of leaking the handle.
    with open(config_file) as f:
        config = yaml.safe_load(f)
    if override_cluster_name is not None:
        config["cluster_name"] = override_cluster_name
    config = prepare_config(config)
    validate_config(config)

    cli_logger.confirm(yes, "Destroying cluster.", _abort=True)
    cli_logger.old_confirm("This will destroy your cluster", yes)

    if not workers_only:
        try:
            exec_cluster(
                config_file,
                cmd="ray stop",
                run_env="auto",
                screen=False,
                tmux=False,
                stop=False,
                start=False,
                override_cluster_name=override_cluster_name,
                port_forward=None,
                with_output=False)
        except Exception as e:
            # Deliberately best-effort: a failed clean shutdown must not
            # prevent the nodes from being terminated below.
            # todo: add better exception info
            cli_logger.verbose_error("{}", str(e))
            cli_logger.warning(
                "Exception occured when stopping the cluster Ray runtime "
                "(use -v to dump teardown exceptions).")
            cli_logger.warning(
                "Ignoring the exception and "
                "attempting to shut down the cluster nodes anyway.")

            cli_logger.old_exception(
                logger, "Ignoring error attempting a clean shutdown.")

    provider = get_node_provider(config["provider"], config["cluster_name"])
    try:

        def remaining_nodes():
            """Return the node ids that should still be shut down."""
            workers = provider.non_terminated_nodes({
                TAG_RAY_NODE_KIND: NODE_KIND_WORKER
            })

            if keep_min_workers:
                min_workers = config.get("min_workers", 0)

                cli_logger.print(
                    "{} random worker nodes will not be shut down. " +
                    cf.dimmed("(due to {})"), cf.bold(min_workers),
                    cf.bold("--keep-min-workers"))
                cli_logger.old_info(logger,
                                    "teardown_cluster: Keeping {} nodes...",
                                    min_workers)

                # Clamp at zero: with fewer live workers than min_workers
                # the difference is negative and random.sample would raise
                # ValueError. In that case no worker is shut down.
                num_to_stop = max(len(workers) - min_workers, 0)
                workers = random.sample(workers, num_to_stop)

            # todo: it's weird to kill the head node but not all workers
            if workers_only:
                cli_logger.print(
                    "The head node will not be shut down. " +
                    cf.dimmed("(due to {})"), cf.bold("--workers-only"))

                return workers

            head = provider.non_terminated_nodes({
                TAG_RAY_NODE_KIND: NODE_KIND_HEAD
            })

            return head + workers

        def run_docker_stop(node, container_name):
            """Best-effort "docker stop" of the container on `node`."""
            try:
                updater = NodeUpdaterThread(
                    node_id=node,
                    provider_config=config["provider"],
                    provider=provider,
                    auth_config=config["auth"],
                    cluster_name=config["cluster_name"],
                    file_mounts=config["file_mounts"],
                    initialization_commands=[],
                    setup_commands=[],
                    ray_start_commands=[],
                    runtime_hash="",
                    file_mounts_contents_hash="",
                    is_head_node=False,
                    docker_config=config.get("docker"))
                _exec(
                    updater,
                    f"docker stop {container_name}",
                    False,
                    False,
                    run_env="host")
            except Exception:
                # The node is terminated afterwards regardless.
                cli_logger.warning(f"Docker stop failed on {node}")
                cli_logger.old_warning(logger,
                                       f"Docker stop failed on {node}")

        # Loop here to check that both the head and worker nodes are actually
        # really gone
        A = remaining_nodes()

        container_name = config.get("docker", {}).get("container_name")
        if container_name:
            for node in A:
                run_docker_stop(node, container_name)

        with LogTimer("teardown_cluster: done."):
            while A:
                cli_logger.old_info(
                    logger, "teardown_cluster: "
                    "Shutting down {} nodes...", len(A))

                provider.terminate_nodes(A)

                cli_logger.print(
                    "Requested {} nodes to shut down.",
                    cf.bold(len(A)),
                    _tags=dict(interval="1s"))

                time.sleep(1)  # todo: interval should be a variable
                A = remaining_nodes()
                cli_logger.print("{} nodes remaining after 1 second.",
                                 cf.bold(len(A)))
            cli_logger.success("No nodes remaining.")
    finally:
        provider.cleanup()
def testClusterStateInit(self):
    """Check ClusterState __init__ func generates correct state file.

    Test the general use case and if num_workers increase/decrease.
    """
    cluster_config = {
        "cluster_name": "random_name",
        "min_workers": 0,
        "max_workers": 0,
        "initial_workers": 0,
        "provider": {
            "type": "local",
            "head_ip": "0.0.0.0:2",
            "worker_ips": ["0.0.0.0:1"]
        },
    }
    provider_config = cluster_config["provider"]
    state_save_path = "/tmp/cluster-{}.state".format(
        cluster_config["cluster_name"])

    def load_state():
        """Read the saved cluster state, closing the file afterwards."""
        with open(state_save_path) as state_file:
            return json.load(state_file)

    node_provider = get_node_provider(provider_config,
                                      cluster_config["cluster_name"])
    assert isinstance(node_provider, LocalNodeProvider)

    expected_workers = {}
    expected_workers[provider_config["head_ip"]] = {
        "tags": {
            TAG_RAY_NODE_TYPE: NODE_TYPE_HEAD
        },
        "state": "terminated",
    }
    expected_workers[provider_config["worker_ips"][0]] = {
        "tags": {
            TAG_RAY_NODE_TYPE: NODE_TYPE_WORKER
        },
        "state": "terminated",
    }

    assert os.path.exists(state_save_path)
    assert load_state() == expected_workers

    # Test removing workers updates the cluster state.
    del expected_workers[provider_config["worker_ips"][0]]
    removed_ip = provider_config["worker_ips"].pop()
    # The provider is rebuilt only so that the state file gets rewritten;
    # the returned object itself is not needed.
    get_node_provider(provider_config, cluster_config["cluster_name"])
    assert load_state() == expected_workers

    # Test adding back workers updates the cluster state.
    expected_workers[removed_ip] = {
        "tags": {
            TAG_RAY_NODE_TYPE: NODE_TYPE_WORKER
        },
        "state": "terminated",
    }
    provider_config["worker_ips"].append(removed_ip)
    get_node_provider(provider_config, cluster_config["cluster_name"])
    assert load_state() == expected_workers
def get_or_create_head_node(config, no_restart, yes):
    """Create the cluster head node, which in turn creates the workers.

    Arguments:
        config: cluster config dict. ``config["file_mounts"]`` is updated
            in place with the bootstrap key and bootstrap config entries.
        no_restart: if true, the ``head_start_ray_commands`` are skipped
            and no restart confirmation is requested.
        yes: forwarded to every ``confirm`` prompt.

    Exits the process with status 1 if the head node update fails.
    """
    provider = get_node_provider(config["provider"], config["cluster_name"])
    head_node_tags = {
        TAG_RAY_NODE_TYPE: "Head",
    }
    nodes = provider.nodes(head_node_tags)
    head_node = nodes[0] if len(nodes) > 0 else None

    if not head_node:
        confirm("This will create a new cluster", yes)
    elif not no_restart:
        confirm("This will restart cluster services", yes)

    launch_hash = hash_launch_conf(config["head_node"], config["auth"])
    if head_node is None or provider.node_tags(head_node).get(
            TAG_RAY_LAUNCH_CONFIG) != launch_hash:
        if head_node is not None:
            confirm("Head node config out-of-date. It will be terminated", yes)
            print("Terminating outdated head node {}".format(head_node))
            provider.terminate_node(head_node)
        print("Launching new head node...")
        head_node_tags[TAG_RAY_LAUNCH_CONFIG] = launch_hash
        head_node_tags[TAG_NAME] = "ray-{}-head".format(config["cluster_name"])
        provider.create_node(config["head_node"], head_node_tags, 1)

    nodes = provider.nodes(head_node_tags)
    assert len(nodes) == 1, "Failed to create head node."
    head_node = nodes[0]

    # TODO(ekl) right now we always update the head node even if the hash
    # matches. We could prompt the user for what they want to do in this case.
    runtime_hash = hash_runtime_conf(config["file_mounts"], config)
    print("Updating files on head node...")

    # Rewrite the auth config so that the head node can update the workers
    remote_key_path = "~/ray_bootstrap_key.pem"
    remote_config = copy.deepcopy(config)
    remote_config["auth"]["ssh_private_key"] = remote_key_path

    # Adjust for new file locations
    remote_config["file_mounts"] = {
        remote_path: remote_path
        for remote_path in config["file_mounts"]
    }
    remote_config["no_restart"] = no_restart

    if no_restart:
        init_commands = (
            config["setup_commands"] + config["head_setup_commands"])
    else:
        init_commands = (
            config["setup_commands"] + config["head_setup_commands"] +
            config["head_start_ray_commands"])

    # Now inject the rewritten config and SSH key into the head node. The
    # temporary file must stay open (it is deleted on close) until the
    # updater has finished; the ``with`` block then closes it.
    with tempfile.NamedTemporaryFile(
            "w", prefix="ray-bootstrap-") as remote_config_file:
        remote_config_file.write(json.dumps(remote_config))
        remote_config_file.flush()
        config["file_mounts"].update({
            remote_key_path: config["auth"]["ssh_private_key"],
            "~/ray_bootstrap_config.yaml": remote_config_file.name
        })

        updater = NodeUpdaterProcess(
            head_node,
            config["provider"],
            config["auth"],
            config["cluster_name"],
            config["file_mounts"],
            init_commands,
            runtime_hash,
            redirect_output=False)
        updater.start()
        updater.join()

    # Refresh the node cache so we see the external ip if available
    provider.nodes(head_node_tags)
    head_node_ip = provider.external_ip(head_node)

    if updater.exitcode != 0:
        print("Error: updating {} failed".format(head_node_ip))
        sys.exit(1)
    print("Head node up-to-date, IP address is: {}".format(head_node_ip))

    monitor_str = "tail -f /tmp/raylogs/monitor-*"
    # Wrap the monitor command in ``docker exec`` at most once, even if
    # several init commands start Ray inside the container.
    ray_runs_in_docker = any(
        "ray start" in s and "docker exec" in s
        and "--autoscaling-config" in s for s in init_commands)
    if ray_runs_in_docker:
        monitor_str = "docker exec {} /bin/sh -c {}".format(
            config["docker"]["container_name"], quote(monitor_str))
    print(
        "To monitor auto-scaling activity, you can run:\n\n"
        " ssh -i {} {}@{} {}\n".format(
            config["auth"]["ssh_private_key"], config["auth"]["ssh_user"],
            head_node_ip, quote(monitor_str)))
    print(
        "To login to the cluster, run:\n\n"
        " ssh -i {} {}@{}\n".format(
            config["auth"]["ssh_private_key"], config["auth"]["ssh_user"],
            head_node_ip))
def exec_cluster(config_file, cmd, docker, screen, tmux, stop, start,
                 override_cluster_name, port_forward):
    """Runs a command on the specified cluster.

    Arguments:
        config_file: path to the cluster yaml
        cmd: command to run
        docker: whether to run command in docker container of config
        screen: whether to run in a screen
        tmux: whether to run in a tmux session
        stop: whether to stop the cluster after command run
        start: whether to start the cluster if it isn't up
        override_cluster_name: set the name of the cluster
        port_forward: port to forward
    """
    assert not (screen and tmux), "Can specify only one of `screen` or `tmux`."

    # safe_load avoids constructing arbitrary Python objects from the YAML
    # file, and the context manager closes the file handle promptly.
    with open(config_file) as config_stream:
        config = yaml.safe_load(config_stream)
    if override_cluster_name is not None:
        config["cluster_name"] = override_cluster_name
    config = _bootstrap_config(config)

    head_node = _get_head_node(
        config, config_file, override_cluster_name, create_if_needed=start)

    provider = get_node_provider(config["provider"], config["cluster_name"])
    try:
        updater = NodeUpdaterThread(
            node_id=head_node,
            provider_config=config["provider"],
            provider=provider,
            auth_config=config["auth"],
            cluster_name=config["cluster_name"],
            file_mounts=config["file_mounts"],
            initialization_commands=[],
            setup_commands=[],
            runtime_hash="",
        )

        def wrap_docker(command):
            """Wrap ``command`` so it runs in the configured container."""
            container_name = config["docker"]["container_name"]
            if not container_name:
                raise ValueError("Docker container not specified in config.")
            return with_docker_exec(
                [command], container_name=container_name)[0]

        cmd = wrap_docker(cmd) if docker else cmd

        if stop:
            # Tear down the workers from the head node, then power it off.
            shutdown_cmd = (
                "ray stop; ray teardown ~/ray_bootstrap_config.yaml "
                "--yes --workers-only")
            if docker:
                shutdown_cmd = wrap_docker(shutdown_cmd)
            cmd += ("; {}; sudo shutdown -h now".format(shutdown_cmd))

        _exec(
            updater,
            cmd,
            screen,
            tmux,
            expect_error=stop,
            port_forward=port_forward)

        if tmux or screen:
            attach_command_parts = ["ray attach", config_file]
            if override_cluster_name is not None:
                attach_command_parts.append(
                    "--cluster-name={}".format(override_cluster_name))
            if tmux:
                attach_command_parts.append("--tmux")
            elif screen:
                attach_command_parts.append("--screen")

            attach_command = " ".join(attach_command_parts)
            attach_info = "Use `{}` to check on command status.".format(
                attach_command)
            logger.info(attach_info)
    finally:
        provider.cleanup()
def get_or_create_head_node(config, config_file, no_restart, restart_only,
                            yes, override_cluster_name):
    """Create the cluster head node, which in turn creates the workers.

    Arguments:
        config: cluster config dict. ``config["file_mounts"]`` is updated
            in place with the bootstrap key and bootstrap config entries.
        config_file: path to the cluster yaml; only used in the printed
            usage hints
        no_restart: skip the ``head_start_ray_commands``
        restart_only: run only the ``head_start_ray_commands``
        yes: forwarded to every ``confirm`` prompt
        override_cluster_name: if set, added as ``--cluster-name`` to the
            printed usage hints

    Exits the process with status 1 if the head node update fails.
    """
    provider = get_node_provider(config["provider"], config["cluster_name"])
    try:
        head_node_tags = {
            TAG_RAY_NODE_TYPE: "head",
        }
        nodes = provider.non_terminated_nodes(head_node_tags)
        head_node = nodes[0] if len(nodes) > 0 else None

        if not head_node:
            confirm("This will create a new cluster", yes)
        elif not no_restart:
            confirm("This will restart cluster services", yes)

        launch_hash = hash_launch_conf(config["head_node"], config["auth"])
        if head_node is None or provider.node_tags(head_node).get(
                TAG_RAY_LAUNCH_CONFIG) != launch_hash:
            if head_node is not None:
                confirm("Head node config out-of-date. It will be terminated",
                        yes)
                logger.info(
                    "get_or_create_head_node: "
                    "Terminating outdated head node {}".format(head_node))
                provider.terminate_node(head_node)
            logger.info("get_or_create_head_node: Launching new head node...")
            head_node_tags[TAG_RAY_LAUNCH_CONFIG] = launch_hash
            head_node_tags[TAG_RAY_NODE_NAME] = "ray-{}-head".format(
                config["cluster_name"])
            provider.create_node(config["head_node"], head_node_tags, 1)

        nodes = provider.non_terminated_nodes(head_node_tags)
        assert len(nodes) == 1, "Failed to create head node."
        head_node = nodes[0]

        # TODO(ekl) right now we always update the head node even if the hash
        # matches. We could prompt the user for what they want to do here.
        runtime_hash = hash_runtime_conf(config["file_mounts"], config)
        logger.info("get_or_create_head_node: Updating files on head node...")

        # Rewrite the auth config so that the head node can update the workers
        remote_key_path = "~/ray_bootstrap_key.pem"
        remote_config = copy.deepcopy(config)
        remote_config["auth"]["ssh_private_key"] = remote_key_path

        # Adjust for new file locations
        remote_config["file_mounts"] = {
            remote_path: remote_path
            for remote_path in config["file_mounts"]
        }
        remote_config["no_restart"] = no_restart

        if restart_only:
            init_commands = config["head_start_ray_commands"]
        elif no_restart:
            init_commands = (
                config["setup_commands"] + config["head_setup_commands"])
        else:
            init_commands = (
                config["setup_commands"] + config["head_setup_commands"] +
                config["head_start_ray_commands"])

        # Now inject the rewritten config and SSH key into the head node.
        # The temporary file must stay open (it is deleted on close) until
        # the updater has finished; the ``with`` block then closes it.
        with tempfile.NamedTemporaryFile(
                "w", prefix="ray-bootstrap-") as remote_config_file:
            remote_config_file.write(json.dumps(remote_config))
            remote_config_file.flush()
            config["file_mounts"].update({
                remote_key_path: config["auth"]["ssh_private_key"],
                "~/ray_bootstrap_config.yaml": remote_config_file.name
            })

            updater = NodeUpdaterThread(
                node_id=head_node,
                provider_config=config["provider"],
                provider=provider,
                auth_config=config["auth"],
                cluster_name=config["cluster_name"],
                file_mounts=config["file_mounts"],
                initialization_commands=config["initialization_commands"],
                setup_commands=init_commands,
                runtime_hash=runtime_hash,
            )
            updater.start()
            updater.join()

        # Refresh the node cache so we see the external ip if available
        provider.non_terminated_nodes(head_node_tags)

        if config.get("provider", {}).get("use_internal_ips", False) is True:
            head_node_ip = provider.internal_ip(head_node)
        else:
            head_node_ip = provider.external_ip(head_node)

        if updater.exitcode != 0:
            logger.error("get_or_create_head_node: "
                         "Updating {} failed".format(head_node_ip))
            sys.exit(1)
        logger.info(
            "get_or_create_head_node: "
            "Head node up-to-date, IP address is: {}".format(head_node_ip))

        monitor_str = "tail -n 100 -f /tmp/ray/session_*/logs/monitor*"
        use_docker = bool(config["docker"]["container_name"])
        if override_cluster_name:
            modifiers = " --cluster-name={}".format(
                quote(override_cluster_name))
        else:
            modifiers = ""
        print("To monitor auto-scaling activity, you can run:\n\n"
              " ray exec {} {}{}{}\n".format(
                  config_file, "--docker " if use_docker else " ",
                  quote(monitor_str), modifiers))
        print("To open a console on the cluster:\n\n"
              " ray attach {}{}\n".format(config_file, modifiers))
        print("To ssh manually to the cluster, run:\n\n"
              " ssh -i {} {}@{}\n".format(config["auth"]["ssh_private_key"],
                                          config["auth"]["ssh_user"],
                                          head_node_ip))
    finally:
        provider.cleanup()
def __init__(self,
             config_path,
             load_metrics,
             max_launch_batch=AUTOSCALER_MAX_LAUNCH_BATCH,
             max_concurrent_launches=AUTOSCALER_MAX_CONCURRENT_LAUNCHES,
             max_failures=AUTOSCALER_MAX_NUM_FAILURES,
             process_runner=subprocess,
             update_interval_s=AUTOSCALER_UPDATE_INTERVAL_S):
    """Load the config, build the node provider and start node launchers.

    One daemon ``NodeLauncher`` is started per launch batch, i.e.
    ``ceil(max_concurrent_launches / max_launch_batch)`` of them. Local
    ``file_mounts`` paths are ``~``-expanded and asserted to exist.
    """
    self.config_path = config_path
    self.reload_config(errors_fatal=True)
    self.load_metrics = load_metrics
    self.provider = get_node_provider(self.config["provider"],
                                      self.config["cluster_name"])

    # The resource demand scheduler is only available when the config
    # declares its node types.
    node_types = None
    demand_scheduler = None
    if "available_node_types" in self.config:
        node_types = self.config["available_node_types"]
        demand_scheduler = ResourceDemandScheduler(
            self.provider, node_types, self.config["max_workers"])
    self.available_node_types = node_types
    self.resource_demand_scheduler = demand_scheduler

    self.max_failures = max_failures
    self.max_launch_batch = max_launch_batch
    self.max_concurrent_launches = max_concurrent_launches
    self.process_runner = process_runner

    # Bookkeeping for NodeUpdater processes, keyed by node_id.
    self.updaters = {}
    self.num_failed_updates = defaultdict(int)
    self.num_successful_updates = defaultdict(int)
    self.num_failures = 0
    self.last_update_time = 0.0
    self.update_interval_s = update_interval_s
    self.bringup = True

    # Background launchers that consume the shared launch queue.
    self.launch_queue = queue.Queue()
    self.pending_launches = ConcurrentCounter()
    launcher_count = int(
        math.ceil(max_concurrent_launches / float(max_launch_batch)))
    for launcher_index in range(launcher_count):
        launcher = NodeLauncher(
            provider=self.provider,
            queue=self.launch_queue,
            index=launcher_index,
            pending=self.pending_launches,
            node_types=self.available_node_types,
        )
        launcher.daemon = True
        launcher.start()

    # ``~`` in local file_mounts is expanded here rather than when the
    # config is written, because the config may have been produced on a
    # different platform where the expansion would give a wrong path.
    expanded_mounts = {}
    for remote, local in self.config["file_mounts"].items():
        expanded_mounts[remote] = os.path.expanduser(local)
    self.config["file_mounts"] = expanded_mounts

    for mount_source in self.config["file_mounts"].values():
        assert os.path.exists(mount_source)

    # Aggregate resources the user is requesting of the cluster.
    self.resource_requests = defaultdict(int)
    # List of resource bundles the user is requesting of the cluster.
    self.resource_demand_vector = None

    logger.info("StandardAutoscaler: {}".format(self.config))
def teardown_cluster(config_file: str, yes: bool, workers_only: bool,
                     override_cluster_name: Optional[str],
                     keep_min_workers: bool, log_old_style: bool,
                     log_color: str, verbose: int):
    """Destroys all nodes of a Ray cluster described by a config file.

    Arguments:
        config_file: path to the cluster yaml
        yes: forwarded to the confirmation prompts
        workers_only: only terminate worker nodes; the head node is kept
            and no ``ray stop`` is sent to it
        override_cluster_name: set the name of the cluster
        keep_min_workers: keep ``min_workers`` (from the config) randomly
            chosen workers alive
        log_old_style: value assigned to ``cli_logger.old_style``
        log_color: value assigned to ``cli_logger.color_mode``
        verbose: value assigned to ``cli_logger.verbosity``; command
            output is dumped when it equals 3
    """
    cli_logger.old_style = log_old_style
    cli_logger.color_mode = log_color
    cli_logger.verbosity = verbose
    cli_logger.dump_command_output = verbose == 3  # todo: add a separate flag?

    # Close the config file handle deterministically.
    with open(config_file) as config_stream:
        config = yaml.safe_load(config_stream)
    if override_cluster_name is not None:
        config["cluster_name"] = override_cluster_name
    config = prepare_config(config)
    validate_config(config)

    cli_logger.confirm(yes, "Destroying cluster.", _abort=True)
    cli_logger.old_confirm("This will destroy your cluster", yes)

    if not workers_only:
        # Best effort: a failed clean shutdown must not block node teardown.
        try:
            exec_cluster(
                config_file,
                cmd="ray stop",
                run_env="auto",
                screen=False,
                tmux=False,
                stop=False,
                start=False,
                override_cluster_name=override_cluster_name,
                port_forward=None,
                with_output=False)
        except Exception as e:
            # todo: add better exception info
            cli_logger.verbose_error("{}", str(e))
            cli_logger.warning(
                "Exception occured when stopping the cluster Ray runtime "
                "(use -v to dump teardown exceptions).")
            cli_logger.warning(
                "Ignoring the exception and "
                "attempting to shut down the cluster nodes anyway.")
            cli_logger.old_exception(
                logger, "Ignoring error attempting a clean shutdown.")

    provider = get_node_provider(config["provider"], config["cluster_name"])
    try:

        def remaining_nodes():
            """Return the nodes that still have to be terminated."""
            workers = provider.non_terminated_nodes({
                TAG_RAY_NODE_TYPE: NODE_TYPE_WORKER
            })

            if keep_min_workers:
                min_workers = config.get("min_workers", 0)

                cli_logger.print(
                    "{} random worker nodes will not be shut down. " +
                    cf.gray("(due to {})"), cf.bold(min_workers),
                    cf.bold("--keep-min-workers"))
                cli_logger.old_info(logger,
                                    "teardown_cluster: Keeping {} nodes...",
                                    min_workers)

                # Clamp at zero: random.sample raises ValueError for a
                # negative size, which happened whenever fewer than
                # min_workers workers were alive.
                # NOTE(review): the sample is redrawn on every poll, so
                # the set of kept workers may change between iterations.
                num_to_terminate = max(len(workers) - min_workers, 0)
                workers = random.sample(workers, num_to_terminate)

            # todo: it's weird to kill the head node but not all workers
            if workers_only:
                cli_logger.print(
                    "The head node will not be shut down. " +
                    cf.gray("(due to {})"), cf.bold("--workers-only"))

                return workers

            head = provider.non_terminated_nodes({
                TAG_RAY_NODE_TYPE: NODE_TYPE_HEAD
            })

            return head + workers

        # Loop here to check that both the head and worker nodes are actually
        # really gone
        A = remaining_nodes()
        with LogTimer("teardown_cluster: done."):
            while A:
                cli_logger.old_info(
                    logger, "teardown_cluster: "
                    "Shutting down {} nodes...", len(A))

                provider.terminate_nodes(A)

                cli_logger.print(
                    "Requested {} nodes to shut down.",
                    cf.bold(len(A)),
                    _tags=dict(interval="1s"))

                time.sleep(1)  # todo: interval should be a variable
                A = remaining_nodes()
                cli_logger.print("{} nodes remaining after 1 second.",
                                 cf.bold(len(A)))
    finally:
        provider.cleanup()
def testCoordinatorSenderNodeProvider(self):
    """Integration test of CoordinatorSenderNodeProvider.

    Three clusters are created against the same coordinator. With only
    two node IPs available the third cluster cannot get a node until the
    first two clusters have terminated theirs.
    """
    cluster_config = {
        "cluster_name": "random_name",
        "min_workers": 0,
        "max_workers": 0,
        "initial_workers": 0,
        "provider": {
            "type": "local",
            "coordinator_address": self.coordinator_address,
        },
        "head_node": {},
        "worker_nodes": {},
    }
    # Shared across all clusters and mutated as the test progresses.
    head_node_tags = {
        TAG_RAY_NODE_TYPE: NODE_TYPE_HEAD,
    }

    def node_ip(index):
        return self.list_of_node_ips[index]

    def provider_for_current_cluster():
        return get_node_provider(cluster_config["provider"],
                                 cluster_config["cluster_name"])

    def head_name_for_current_cluster():
        return "ray-{}-head".format(cluster_config["cluster_name"])

    def check_nothing_allocated(provider, ip):
        assert not provider.non_terminated_nodes({})
        assert not provider.is_running(ip)
        assert provider.is_terminated(ip)
        assert not provider.node_tags(ip)

    def create_head_and_check(provider, ip):
        assert not provider.non_terminated_nodes(head_node_tags)
        head_node_tags[TAG_RAY_NODE_NAME] = head_name_for_current_cluster()
        provider.create_node(cluster_config["head_node"], head_node_tags, 1)
        assert provider.non_terminated_nodes({}) == [ip]
        head_node_tags[TAG_RAY_CLUSTER_NAME] = cluster_config["cluster_name"]
        assert provider.node_tags(ip) == head_node_tags
        assert provider.is_running(ip)
        assert not provider.is_terminated(ip)

    # First cluster takes the first node.
    first_provider = provider_for_current_cluster()
    assert isinstance(first_provider, CoordinatorSenderNodeProvider)
    check_nothing_allocated(first_provider, node_ip(0))
    create_head_and_check(first_provider, node_ip(0))

    # Add another cluster.
    cluster_config["cluster_name"] = "random_name_2"
    second_provider = provider_for_current_cluster()
    check_nothing_allocated(second_provider, node_ip(1))
    create_head_and_check(second_provider, node_ip(1))

    # Add another cluster (should fail because we only have two nodes).
    cluster_config["cluster_name"] = "random_name_3"
    third_provider = provider_for_current_cluster()
    assert not third_provider.non_terminated_nodes(head_node_tags)
    head_node_tags[TAG_RAY_NODE_NAME] = head_name_for_current_cluster()
    third_provider.create_node(cluster_config["head_node"], head_node_tags,
                               1)
    assert not third_provider.non_terminated_nodes({})

    # Terminate all nodes.
    first_provider.terminate_node(node_ip(0))
    assert not first_provider.non_terminated_nodes({})
    second_provider.terminate_node(node_ip(1))
    assert not second_provider.non_terminated_nodes({})

    # Check if now we can create more clusters/nodes.
    third_provider.create_node(cluster_config["head_node"], head_node_tags,
                               1)
    worker_node_tags = {
        TAG_RAY_NODE_NAME: "ray-{}-worker".format(
            cluster_config["cluster_name"]),
        TAG_RAY_NODE_TYPE: NODE_TYPE_WORKER
    }
    third_provider.create_node(cluster_config["worker_nodes"],
                               worker_node_tags, 1)
    assert third_provider.non_terminated_nodes(
        {}) == self.list_of_node_ips

    only_workers = {TAG_RAY_NODE_TYPE: NODE_TYPE_WORKER}
    assert third_provider.non_terminated_nodes(only_workers) == [node_ip(1)]
    only_heads = {TAG_RAY_NODE_TYPE: NODE_TYPE_HEAD}
    assert third_provider.non_terminated_nodes(only_heads) == [node_ip(0)]
def get_or_create_head_node(config):
    """Create the cluster head node, which in turn creates the workers.

    The head node is (re)launched when it is missing or when its launch
    config hash is stale. Its files are updated only when its runtime
    config hash differs from the current one.

    Arguments:
        config: cluster config dict. When an update runs,
            ``config["file_mounts"]`` is updated in place with the
            bootstrap key and bootstrap config entries.

    Exits the process with status 1 if the head node update fails.
    """
    provider = get_node_provider(config["provider"], config["cluster_name"])
    head_node_tags = {
        TAG_RAY_NODE_TYPE: "Head",
    }
    nodes = provider.nodes(head_node_tags)
    head_node = nodes[0] if len(nodes) > 0 else None

    launch_hash = hash_launch_conf(config["head_node"], config["auth"])
    if head_node is None or provider.node_tags(head_node).get(
            TAG_RAY_LAUNCH_CONFIG) != launch_hash:
        if head_node is not None:
            print("Terminating outdated head node {}".format(head_node))
            provider.terminate_node(head_node)
        print("Launching new head node...")
        head_node_tags[TAG_RAY_LAUNCH_CONFIG] = launch_hash
        head_node_tags[TAG_NAME] = "ray-{}-head".format(config["cluster_name"])
        provider.create_node(config["head_node"], head_node_tags, 1)

    nodes = provider.nodes(head_node_tags)
    assert len(nodes) == 1, "Failed to create head node."
    head_node = nodes[0]

    runtime_hash = hash_runtime_conf(config["file_mounts"], config)

    if provider.node_tags(head_node).get(
            TAG_RAY_RUNTIME_CONFIG) != runtime_hash:
        print("Updating files on head node...")

        # Rewrite the auth config so that the head node can update the workers
        remote_key_path = "~/ray_bootstrap_key.pem"
        remote_config = copy.deepcopy(config)
        remote_config["auth"]["ssh_private_key"] = remote_key_path

        # Adjust for new file locations
        remote_config["file_mounts"] = {
            remote_path: remote_path
            for remote_path in config["file_mounts"]
        }

        # Now inject the rewritten config and SSH key into the head node.
        # The temporary file must stay open (it is deleted on close) until
        # the updater has finished; the ``with`` block then closes it.
        with tempfile.NamedTemporaryFile(
                "w", prefix="ray-bootstrap-") as remote_config_file:
            remote_config_file.write(json.dumps(remote_config))
            remote_config_file.flush()
            config["file_mounts"].update({
                remote_key_path: config["auth"]["ssh_private_key"],
                "~/ray_bootstrap_config.yaml": remote_config_file.name
            })

            updater = NodeUpdaterProcess(
                head_node,
                config["provider"],
                config["auth"],
                config["cluster_name"],
                config["file_mounts"],
                config["head_init_commands"],
                runtime_hash,
                redirect_output=False)
            updater.start()
            updater.join()

        if updater.exitcode != 0:
            print("Error: updating {} failed".format(
                provider.external_ip(head_node)))
            sys.exit(1)

    print(
        "Head node up-to-date, IP address is: {}".format(
            provider.external_ip(head_node)))
    print(
        "To monitor auto-scaling activity, you can run:\n\n"
        " ssh -i {} {}@{} 'tail -f /tmp/raylogs/monitor-*'\n".format(
            config["auth"]["ssh_private_key"], config["auth"]["ssh_user"],
            provider.external_ip(head_node)))
def exec_cluster(config_file,
                 cmd=None,
                 docker=False,
                 screen=False,
                 tmux=False,
                 stop=False,
                 start=False,
                 override_cluster_name=None,
                 port_forward=None,
                 with_output=False):
    """Runs a command on the specified cluster.

    Arguments:
        config_file: path to the cluster yaml
        cmd: command to run
        docker: whether to run command in docker container of config
        screen: whether to run in a screen
        tmux: whether to run in a tmux session
        stop: whether to stop the cluster after command run
        start: whether to start the cluster if it isn't up
        override_cluster_name: set the name of the cluster
        port_forward (int or list[int]): port(s) to forward
        with_output: forwarded to ``_exec``

    Returns:
        Whatever ``_exec`` returns for the command.
    """
    assert not (screen and tmux), "Can specify only one of `screen` or `tmux`."

    # Close the config file handle deterministically.
    with open(config_file) as config_stream:
        config = yaml.safe_load(config_stream)
    if override_cluster_name is not None:
        config["cluster_name"] = override_cluster_name
    config = _bootstrap_config(config)

    head_node = _get_head_node(
        config, config_file, override_cluster_name, create_if_needed=start)

    provider = get_node_provider(config["provider"], config["cluster_name"])
    try:
        updater = NodeUpdaterThread(
            node_id=head_node,
            provider_config=config["provider"],
            provider=provider,
            auth_config=config["auth"],
            cluster_name=config["cluster_name"],
            file_mounts=config["file_mounts"],
            initialization_commands=[],
            setup_commands=[],
            ray_start_commands=[],
            runtime_hash="",
        )

        def wrap_docker(command):
            """Wrap ``command`` so it runs in the configured container."""
            container_name = config["docker"]["container_name"]
            if not container_name:
                raise ValueError("Docker container not specified in config.")
            return with_docker_exec(
                [command], container_name=container_name)[0]

        if cmd:
            cmd = wrap_docker(cmd) if docker else cmd

            if stop:
                # Tear down the workers from the head node, then power it
                # off.
                shutdown_cmd = (
                    "ray stop; ray teardown ~/ray_bootstrap_config.yaml "
                    "--yes --workers-only")
                if docker:
                    shutdown_cmd = wrap_docker(shutdown_cmd)
                cmd += ("; {}; sudo shutdown -h now".format(shutdown_cmd))

        result = _exec(
            updater,
            cmd,
            screen,
            tmux,
            port_forward=port_forward,
            with_output=with_output)

        if tmux or screen:
            attach_command_parts = ["ray attach", config_file]
            if override_cluster_name is not None:
                attach_command_parts.append(
                    "--cluster-name={}".format(override_cluster_name))
            if tmux:
                attach_command_parts.append("--tmux")
            elif screen:
                attach_command_parts.append("--screen")

            attach_command = " ".join(attach_command_parts)
            attach_info = "Use `{}` to check on command status.".format(
                attach_command)
            logger.info(attach_info)
        return result
    finally:
        provider.cleanup()
def exec_cluster(config_file: str,
                 *,
                 cmd: Any = None,
                 run_env: str = "auto",
                 screen: bool = False,
                 tmux: bool = False,
                 stop: bool = False,
                 start: bool = False,
                 override_cluster_name: Optional[str] = None,
                 no_config_cache: bool = False,
                 port_forward: Any = None,
                 with_output: bool = False):
    """Runs a command on the specified cluster.

    Arguments:
        config_file: path to the cluster yaml
        cmd: command to run
        run_env: whether to run the command on the host or in a container.
            Select between "auto", "host" and "docker"
        screen: whether to run in a screen
        tmux: whether to run in a tmux session
        stop: whether to stop the cluster after command run
        start: whether to start the cluster if it isn't up
        override_cluster_name: set the name of the cluster
        no_config_cache: forwarded to ``_bootstrap_config``
        port_forward (int or list[int]): port(s) to forward
        with_output: forwarded to ``_exec``

    Returns:
        Whatever ``_exec`` returns for the command.
    """
    assert not (screen and tmux), "Can specify only one of `screen` or `tmux`."
    assert run_env in RUN_ENV_TYPES, "--run_env must be in {}".format(
        RUN_ENV_TYPES)
    # TODO(rliaw): We default this to True to maintain backwards-compat.
    # In the future we would want to support disabling login-shells
    # and interactivity.
    cmd_output_util.set_allow_interactive(True)

    # Close the config file handle deterministically.
    with open(config_file) as config_stream:
        config = yaml.safe_load(config_stream)
    if override_cluster_name is not None:
        config["cluster_name"] = override_cluster_name
    config = _bootstrap_config(config, no_config_cache=no_config_cache)

    head_node = _get_head_node(
        config, config_file, override_cluster_name, create_if_needed=start)

    provider = get_node_provider(config["provider"], config["cluster_name"])
    try:
        updater = NodeUpdaterThread(
            node_id=head_node,
            provider_config=config["provider"],
            provider=provider,
            auth_config=config["auth"],
            cluster_name=config["cluster_name"],
            file_mounts=config["file_mounts"],
            initialization_commands=[],
            setup_commands=[],
            ray_start_commands=[],
            runtime_hash="",
            file_mounts_contents_hash="",
            is_head_node=True,
            docker_config=config.get("docker"))

        is_docker = isinstance(updater.cmd_runner, DockerCommandRunner)

        if cmd and stop:
            # Join the user command WITH the shutdown commands. The
            # previous ``cmd += "; ".join([...])`` appended "ray stop"
            # directly to the user command with no separator.
            cmd = "; ".join([
                cmd, "ray stop",
                "ray teardown ~/ray_bootstrap_config.yaml --yes --workers-only"
            ])
            if is_docker and run_env == "docker":
                updater.cmd_runner.shutdown_after_next_cmd()
            else:
                cmd += "; sudo shutdown -h now"

        result = _exec(
            updater,
            cmd,
            screen,
            tmux,
            port_forward=port_forward,
            with_output=with_output,
            run_env=run_env)

        if tmux or screen:
            attach_command_parts = ["ray attach", config_file]
            if override_cluster_name is not None:
                attach_command_parts.append(
                    "--cluster-name={}".format(override_cluster_name))
            if tmux:
                attach_command_parts.append("--tmux")
            elif screen:
                attach_command_parts.append("--screen")

            attach_command = " ".join(attach_command_parts)
            cli_logger.print("Run `{}` to check command status.",
                             cf.bold(attach_command))

            attach_info = "Use `{}` to check on command status.".format(
                attach_command)
            cli_logger.old_info(logger, attach_info)
        return result
    finally:
        provider.cleanup()
def rsync(config_file: str,
          source: Optional[str],
          target: Optional[str],
          override_cluster_name: Optional[str],
          down: bool,
          no_config_cache: bool = False,
          all_nodes: bool = False):
    """Rsyncs files.

    With both ``source`` and ``target`` given, that single path is synced;
    with neither, the configured file mounts are synced instead.

    Arguments:
        config_file: path to the cluster yaml
        source: source dir
        target: target dir
        override_cluster_name: set the name of the cluster
        down: whether we're syncing remote -> local
        no_config_cache: forwarded to ``_bootstrap_config``
        all_nodes: whether to sync worker nodes in addition to the head node
    """
    if bool(source) != bool(target):
        cli_logger.abort(
            "Expected either both a source and a target, or neither.")

    assert bool(source) == bool(target), (
        "Must either provide both or neither source and target.")

    # Close the config file handle deterministically.
    with open(config_file) as config_stream:
        config = yaml.safe_load(config_stream)
    if override_cluster_name is not None:
        config["cluster_name"] = override_cluster_name
    config = _bootstrap_config(config, no_config_cache=no_config_cache)

    # The remote side is ``source`` when syncing down and ``target`` when
    # syncing up. Only test the mounts when a path was given: with neither
    # source nor target, ``mount in None`` used to raise TypeError.
    remote_path = source if down else target
    is_file_mount = bool(remote_path) and any(
        remote_mount in remote_path
        for remote_mount in config.get("file_mounts", {}))

    provider = get_node_provider(config["provider"], config["cluster_name"])
    try:
        nodes = []
        if all_nodes:
            # technically we re-open the provider for no reason
            # in get_worker_nodes but it's cleaner this way
            # and _get_head_node does this too
            nodes = _get_worker_nodes(config, override_cluster_name)

        head_node = _get_head_node(
            config, config_file, override_cluster_name, create_if_needed=False)

        nodes += [head_node]

        for node_id in nodes:
            updater = NodeUpdaterThread(
                node_id=node_id,
                provider_config=config["provider"],
                provider=provider,
                auth_config=config["auth"],
                cluster_name=config["cluster_name"],
                file_mounts=config["file_mounts"],
                initialization_commands=[],
                setup_commands=[],
                ray_start_commands=[],
                runtime_hash="",
                file_mounts_contents_hash="",
                is_head_node=(node_id == head_node),
                docker_config=config.get("docker"))
            # Named ``rsync_fn`` so it does not shadow this function's name.
            rsync_fn = updater.rsync_down if down else updater.rsync_up

            if source and target:
                # print rsync progress for single file rsync
                cmd_output_util.set_output_redirected(False)
                set_rsync_silent(False)

                rsync_fn(source, target, is_file_mount)
            else:
                updater.sync_file_mounts(rsync_fn)

    finally:
        provider.cleanup()
def teardown_cluster(config_file, yes, workers_only, override_cluster_name,
                     keep_min_workers):
    """Destroys all nodes of a Ray cluster described by a config file.

    Arguments:
        config_file: path to the cluster yaml
        yes: forwarded to the confirmation prompt
        workers_only: only terminate worker nodes; the head node is kept
            and no ``ray stop`` is sent to it
        override_cluster_name: set the name of the cluster
        keep_min_workers: keep ``min_workers`` (from the config) randomly
            chosen workers alive
    """
    # Close the config file handle deterministically.
    with open(config_file) as config_stream:
        config = yaml.safe_load(config_stream)
    if override_cluster_name is not None:
        config["cluster_name"] = override_cluster_name
    config = prepare_config(config)
    validate_config(config)

    confirm("This will destroy your cluster", yes)

    if not workers_only:
        # Best effort: a failed clean shutdown must not block node teardown.
        try:
            exec_cluster(
                config_file,
                cmd="ray stop",
                run_env="auto",
                screen=False,
                tmux=False,
                stop=False,
                start=False,
                override_cluster_name=override_cluster_name,
                port_forward=None,
                with_output=False)
        except Exception:
            logger.exception("Ignoring error attempting a clean shutdown.")

    provider = get_node_provider(config["provider"], config["cluster_name"])
    try:

        def remaining_nodes():
            """Return the nodes that still have to be terminated."""
            workers = provider.non_terminated_nodes({
                TAG_RAY_NODE_TYPE: NODE_TYPE_WORKER
            })

            if keep_min_workers:
                min_workers = config.get("min_workers", 0)
                logger.info("teardown_cluster: "
                            "Keeping {} nodes...".format(min_workers))
                # Clamp at zero: random.sample raises ValueError for a
                # negative size, which happened whenever fewer than
                # min_workers workers were alive.
                num_to_terminate = max(len(workers) - min_workers, 0)
                workers = random.sample(workers, num_to_terminate)

            if workers_only:
                return workers

            head = provider.non_terminated_nodes({
                TAG_RAY_NODE_TYPE: NODE_TYPE_HEAD
            })

            return head + workers

        # Loop here to check that both the head and worker nodes are actually
        # really gone
        A = remaining_nodes()
        with LogTimer("teardown_cluster: done."):
            while A:
                logger.info("teardown_cluster: "
                            "Shutting down {} nodes...".format(len(A)))
                provider.terminate_nodes(A)
                time.sleep(1)
                A = remaining_nodes()
    finally:
        provider.cleanup()
def get_or_create_head_node(config, config_file, no_restart, restart_only, yes,
                            override_cluster_name):
    """Create the cluster head node, which in turn creates the workers.

    Looks up a non-terminated head node, (re)launches it when it is missing
    or its launch-config tag differs from the current launch hash, then runs
    a `NodeUpdaterThread` against it and prints follow-up commands.

    Arguments:
        config: cluster config dict; its "file_mounts" entry is extended in
            place with the bootstrap config (and SSH key, unless the provider
            type is "kubernetes")
        config_file: path to the cluster yaml; made absolute before use
        no_restart: if set, the ray start commands are not run
        restart_only: if set, the head setup commands are not run
        yes: forwarded to every `confirm` prompt
        override_cluster_name: if truthy, appended as `--cluster-name` to the
            printed `ray exec` / `ray attach` commands

    Raises:
        RuntimeError: if exactly one head node is not visible within the
            wait window after launch.
    """
    provider = get_node_provider(config["provider"], config["cluster_name"])
    config_file = os.path.abspath(config_file)
    try:
        head_node_tags = {TAG_RAY_NODE_TYPE: NODE_TYPE_HEAD}
        existing = provider.non_terminated_nodes(head_node_tags)
        head_node = existing[0] if len(existing) > 0 else None

        if not head_node:
            confirm("This will create a new cluster", yes)
        elif not no_restart:
            confirm("This will restart cluster services", yes)

        launch_hash = hash_launch_conf(config["head_node"], config["auth"])
        # A launch is needed when there is no head node at all or when the
        # existing one was started from a different launch configuration.
        needs_launch = head_node is None
        if not needs_launch:
            current_hash = provider.node_tags(head_node).get(
                TAG_RAY_LAUNCH_CONFIG)
            needs_launch = current_hash != launch_hash

        if needs_launch:
            if head_node is not None:
                confirm("Head node config out-of-date. It will be terminated",
                        yes)
                logger.info(
                    "get_or_create_head_node: "
                    "Shutting down outdated head node {}".format(head_node))
                provider.terminate_node(head_node)
            logger.info("get_or_create_head_node: Launching new head node...")
            head_node_tags[TAG_RAY_LAUNCH_CONFIG] = launch_hash
            head_node_tags[TAG_RAY_NODE_NAME] = "ray-{}-head".format(
                config["cluster_name"])
            provider.create_node(config["head_node"], head_node_tags, 1)

        # Poll until exactly one node carries the head tags, or give up.
        wait_started = time.time()
        head_node = None
        while True:
            if time.time() - wait_started > 5:
                raise RuntimeError("Failed to create head node.")
            candidates = provider.non_terminated_nodes(head_node_tags)
            if len(candidates) == 1:
                head_node = candidates[0]
                break
            time.sleep(1)

        # TODO(ekl) right now we always update the head node even if the hash
        # matches. We could prompt the user for what they want to do here.
        runtime_hash = hash_runtime_conf(config["file_mounts"], config)
        logger.info("get_or_create_head_node: Updating files on head node...")

        # Rewrite the auth config so that the head node can update the workers
        remote_config = copy.deepcopy(config)
        uses_ssh_key = config["provider"]["type"] != "kubernetes"
        if uses_ssh_key:
            remote_key_path = "~/ray_bootstrap_key.pem"
            remote_config["auth"]["ssh_private_key"] = remote_key_path

        # Adjust for new file locations: on the head node every mount is
        # already at its remote path, so map each path onto itself.
        remote_config["file_mounts"] = {
            remote_path: remote_path
            for remote_path in config["file_mounts"]
        }
        remote_config["no_restart"] = no_restart

        # Now inject the rewritten config and SSH key into the head node.
        # The temporary file object stays referenced until this function
        # returns, so the file exists while the updater runs.
        bootstrap_file = tempfile.NamedTemporaryFile(
            "w", prefix="ray-bootstrap-")
        bootstrap_file.write(json.dumps(remote_config))
        bootstrap_file.flush()
        config["file_mounts"].update({
            "~/ray_bootstrap_config.yaml": bootstrap_file.name
        })
        if uses_ssh_key:
            config["file_mounts"].update({
                remote_key_path: config["auth"]["ssh_private_key"],
            })

        if restart_only:
            init_commands, ray_start_commands = (
                [], config["head_start_ray_commands"])
        elif no_restart:
            init_commands, ray_start_commands = (
                config["head_setup_commands"], [])
        else:
            init_commands = config["head_setup_commands"]
            ray_start_commands = config["head_start_ray_commands"]

        if not no_restart:
            warn_about_bad_start_command(ray_start_commands)

        updater = NodeUpdaterThread(
            node_id=head_node,
            provider_config=config["provider"],
            provider=provider,
            auth_config=config["auth"],
            cluster_name=config["cluster_name"],
            file_mounts=config["file_mounts"],
            initialization_commands=config["initialization_commands"],
            setup_commands=init_commands,
            ray_start_commands=ray_start_commands,
            runtime_hash=runtime_hash,
        )
        updater.start()
        updater.join()

        # Refresh the node cache so we see the external ip if available
        provider.non_terminated_nodes(head_node_tags)

        prefers_internal_ip = config.get("provider", {}).get(
            "use_internal_ips", False) is True
        head_node_ip = (provider.internal_ip(head_node)
                        if prefers_internal_ip else
                        provider.external_ip(head_node))

        if updater.exitcode != 0:
            logger.error("get_or_create_head_node: "
                         "Updating {} failed".format(head_node_ip))
            sys.exit(1)
        logger.info(
            "get_or_create_head_node: "
            "Head node up-to-date, IP address is: {}".format(head_node_ip))

        monitor_str = "tail -n 100 -f /tmp/ray/session_*/logs/monitor*"
        use_docker = "docker" in config and bool(
            config["docker"]["container_name"])
        modifiers = (" --cluster-name={}".format(quote(override_cluster_name))
                     if override_cluster_name else "")
        docker_flag = "--docker " if use_docker else ""

        print("To monitor auto-scaling activity, you can run:\n\n"
              " ray exec {} {}{}{}\n".format(config_file, docker_flag,
                                             quote(monitor_str), modifiers))
        print("To open a console on the cluster:\n\n"
              " ray attach {}{}\n".format(config_file, modifiers))

        print("To get a remote shell to the cluster manually, run:\n\n"
              " {}\n".format(updater.cmd_runner.remote_shell_command_str()))
    finally:
        provider.cleanup()
def exec_cluster(config_file,
                 *,
                 cmd=None,
                 run_env="auto",
                 screen=False,
                 tmux=False,
                 stop=False,
                 start=False,
                 override_cluster_name=None,
                 port_forward=None,
                 with_output=False):
    """Runs a command on the specified cluster.

    Arguments:
        config_file: path to the cluster yaml
        cmd: command to run
        run_env: whether to run the command on the host or in a container.
            Select between "auto", "host" and "docker"
        screen: whether to run in a screen
        tmux: whether to run in a tmux session
        stop: whether to stop the cluster after command run
        start: whether to start the cluster if it isn't up
        override_cluster_name: set the name of the cluster
        port_forward (int or list[int]): port(s) to forward
        with_output: forwarded to `_exec`

    Returns:
        Whatever `_exec` returns for the command.
    """
    assert not (screen and tmux), "Can specify only one of `screen` or `tmux`."
    assert run_env in RUN_ENV_TYPES, "--run_env must be in {}".format(
        RUN_ENV_TYPES)
    # Use a context manager so the config file handle is always closed.
    with open(config_file) as config_stream:
        config = yaml.safe_load(config_stream.read())
    if override_cluster_name is not None:
        config["cluster_name"] = override_cluster_name
    config = _bootstrap_config(config)

    head_node = _get_head_node(config,
                               config_file,
                               override_cluster_name,
                               create_if_needed=start)

    provider = get_node_provider(config["provider"], config["cluster_name"])
    try:
        updater = NodeUpdaterThread(node_id=head_node,
                                    provider_config=config["provider"],
                                    provider=provider,
                                    auth_config=config["auth"],
                                    cluster_name=config["cluster_name"],
                                    file_mounts=config["file_mounts"],
                                    initialization_commands=[],
                                    setup_commands=[],
                                    ray_start_commands=[],
                                    runtime_hash="",
                                    docker_config=config.get("docker"))

        is_docker = isinstance(updater.cmd_runner, DockerCommandRunner)

        if cmd and stop:
            # Join the user command WITH the shutdown steps. Appending the
            # joined steps directly to `cmd` would glue "ray stop" onto the
            # end of the user command without any separator.
            cmd = "; ".join([
                cmd, "ray stop",
                "ray teardown ~/ray_bootstrap_config.yaml --yes --workers-only"
            ])
            if is_docker and run_env == "docker":
                updater.cmd_runner.shutdown_after_next_cmd()
            else:
                cmd += "; sudo shutdown -h now"

        result = _exec(updater,
                       cmd,
                       screen,
                       tmux,
                       port_forward=port_forward,
                       with_output=with_output,
                       run_env=run_env)

        if tmux or screen:
            # Tell the user how to re-attach to the detached session.
            attach_command_parts = ["ray attach", config_file]
            if override_cluster_name is not None:
                attach_command_parts.append(
                    "--cluster-name={}".format(override_cluster_name))
            if tmux:
                attach_command_parts.append("--tmux")
            elif screen:
                attach_command_parts.append("--screen")

            attach_command = " ".join(attach_command_parts)
            attach_info = "Use `{}` to check on command status.".format(
                attach_command)
            logger.info(attach_info)
        return result
    finally:
        provider.cleanup()
def rsync(config_file,
          source,
          target,
          override_cluster_name,
          down,
          all_nodes=False):
    """Rsyncs files.

    When both `source` and `target` are given, that single path is synced
    on every selected node. When neither is given, each node updater's
    `sync_file_mounts` is invoked instead.

    Arguments:
        config_file: path to the cluster yaml
        source: source dir
        target: target dir
        override_cluster_name: set the name of the cluster
        down: whether we're syncing remote -> local
        all_nodes: whether to sync worker nodes in addition to the head node
    """
    assert bool(source) == bool(target), (
        "Must either provide both or neither source and target.")

    # Use a context manager so the config file handle is always closed.
    with open(config_file) as config_stream:
        config = yaml.safe_load(config_stream.read())
    if override_cluster_name is not None:
        config["cluster_name"] = override_cluster_name
    config = _bootstrap_config(config)

    provider = get_node_provider(config["provider"], config["cluster_name"])
    try:
        nodes = []
        if all_nodes:
            # technically we re-open the provider for no reason
            # in get_worker_nodes but it's cleaner this way
            # and _get_head_node does this too
            nodes = _get_worker_nodes(config, override_cluster_name)

        nodes += [
            _get_head_node(
                config,
                config_file,
                override_cluster_name,
                create_if_needed=False)
        ]

        for node_id in nodes:
            updater = NodeUpdaterThread(
                node_id=node_id,
                provider_config=config["provider"],
                provider=provider,
                auth_config=config["auth"],
                cluster_name=config["cluster_name"],
                file_mounts=config["file_mounts"],
                initialization_commands=[],
                setup_commands=[],
                ray_start_commands=[],
                runtime_hash="",
            )
            # Named `rsync_fn` so it does not shadow this function's name.
            rsync_fn = updater.rsync_down if down else updater.rsync_up

            if source and target:
                rsync_fn(source, target)
            else:
                updater.sync_file_mounts(rsync_fn)

    finally:
        provider.cleanup()
def get_or_create_head_node(config, no_restart):
    """Create the cluster head node, which in turn creates the workers.

    Looks up a node tagged as head, (re)launches it when it is missing or
    its launch-config tag differs from the current launch hash, runs a
    `NodeUpdaterProcess` against it and prints how to reach the node.

    Arguments:
        config: cluster config dict; its "file_mounts" entry is extended in
            place with the bootstrap SSH key and bootstrap config
        no_restart: if set, the head start-ray commands are left out of the
            commands handed to the updater
    """
    provider = get_node_provider(config["provider"], config["cluster_name"])
    head_node_tags = {TAG_RAY_NODE_TYPE: "Head"}
    existing = provider.nodes(head_node_tags)
    head_node = existing[0] if len(existing) > 0 else None

    if not head_node:
        confirm("This will create a new cluster")
    elif not no_restart:
        confirm("This will restart your cluster")

    launch_hash = hash_launch_conf(config["head_node"], config["auth"])
    # A launch is needed when there is no head node at all or when the
    # existing one was started from a different launch configuration.
    needs_launch = head_node is None
    if not needs_launch:
        current_hash = provider.node_tags(head_node).get(
            TAG_RAY_LAUNCH_CONFIG)
        needs_launch = current_hash != launch_hash

    if needs_launch:
        if head_node is not None:
            print("Terminating outdated head node {}".format(head_node))
            provider.terminate_node(head_node)
        print("Launching new head node...")
        head_node_tags[TAG_RAY_LAUNCH_CONFIG] = launch_hash
        head_node_tags[TAG_NAME] = "ray-{}-head".format(config["cluster_name"])
        provider.create_node(config["head_node"], head_node_tags, 1)

    matching = provider.nodes(head_node_tags)
    assert len(matching) == 1, "Failed to create head node."
    head_node = matching[0]

    # TODO(ekl) right now we always update the head node even if the hash
    # matches. We could prompt the user for what they want to do in this case.
    runtime_hash = hash_runtime_conf(config["file_mounts"], config)

    print("Updating files on head node...")

    # Rewrite the auth config so that the head node can update the workers
    remote_key_path = "~/ray_bootstrap_key.pem"
    remote_config = copy.deepcopy(config)
    remote_config["auth"]["ssh_private_key"] = remote_key_path

    # Adjust for new file locations: on the head node every mount is already
    # at its remote path, so map each path onto itself.
    remote_config["file_mounts"] = {
        remote_path: remote_path
        for remote_path in config["file_mounts"]
    }
    remote_config["no_restart"] = no_restart

    # Now inject the rewritten config and SSH key into the head node.
    # The temporary file object stays referenced until this function
    # returns, so the file exists while the updater runs.
    bootstrap_file = tempfile.NamedTemporaryFile(
        "w", prefix="ray-bootstrap-")
    bootstrap_file.write(json.dumps(remote_config))
    bootstrap_file.flush()
    config["file_mounts"].update({
        remote_key_path: config["auth"]["ssh_private_key"],
        "~/ray_bootstrap_config.yaml": bootstrap_file.name
    })

    # Setup always runs; starting ray is skipped when no_restart is set.
    init_commands = config["setup_commands"] + config["head_setup_commands"]
    if not no_restart:
        init_commands = init_commands + config["head_start_ray_commands"]

    updater = NodeUpdaterProcess(
        head_node,
        config["provider"],
        config["auth"],
        config["cluster_name"],
        config["file_mounts"],
        init_commands,
        runtime_hash,
        redirect_output=False)
    updater.start()
    updater.join()

    # Refresh the node cache so we see the external ip if available
    provider.nodes(head_node_tags)

    if updater.exitcode != 0:
        print("Error: updating {} failed".format(
            provider.external_ip(head_node)))
        sys.exit(1)

    print("Head node up-to-date, IP address is: {}".format(
        provider.external_ip(head_node)))

    auth = config["auth"]
    ssh_key = auth["ssh_private_key"]
    ssh_user = auth["ssh_user"]
    print("To monitor auto-scaling activity, you can run:\n\n"
          " ssh -i {} {}@{} 'tail -f /tmp/raylogs/monitor-*'\n".format(
              ssh_key, ssh_user, provider.external_ip(head_node)))
    print("To login to the cluster, run:\n\n"
          " ssh -i {} {}@{}\n".format(ssh_key, ssh_user,
                                      provider.external_ip(head_node)))
def get_or_create_head_node(config, config_file, no_restart, restart_only, yes,
                            override_cluster_name):
    """Create the cluster head node, which in turn creates the workers.

    Looks up a node tagged as head, (re)launches it when it is missing or
    its launch-config tag differs from the current launch hash, runs a
    `NodeUpdaterProcess` against it and prints follow-up commands.

    Arguments:
        config: cluster config dict; its "file_mounts" entry is extended in
            place with the bootstrap SSH key and bootstrap config
        config_file: path to the cluster yaml, echoed in the printed
            `ray exec` / `ray attach` commands
        no_restart: if set, the head start-ray commands are not run
        restart_only: if set, only the head start-ray commands are run
        yes: forwarded to every `confirm` prompt
        override_cluster_name: if truthy, appended as `--cluster-name` to the
            printed `ray exec` / `ray attach` commands
    """
    provider = get_node_provider(config["provider"], config["cluster_name"])
    head_node_tags = {
        TAG_RAY_NODE_TYPE: "head",
    }
    nodes = provider.nodes(head_node_tags)
    if len(nodes) > 0:
        head_node = nodes[0]
    else:
        head_node = None

    if not head_node:
        confirm("This will create a new cluster", yes)
    elif not no_restart:
        confirm("This will restart cluster services", yes)

    launch_hash = hash_launch_conf(config["head_node"], config["auth"])
    if head_node is None or provider.node_tags(head_node).get(
            TAG_RAY_LAUNCH_CONFIG) != launch_hash:
        if head_node is not None:
            confirm("Head node config out-of-date. It will be terminated",
                    yes)
            logger.info("Terminating outdated head node {}".format(head_node))
            provider.terminate_node(head_node)
        logger.info("Launching new head node...")
        head_node_tags[TAG_RAY_LAUNCH_CONFIG] = launch_hash
        head_node_tags[TAG_RAY_NODE_NAME] = "ray-{}-head".format(
            config["cluster_name"])
        provider.create_node(config["head_node"], head_node_tags, 1)

    nodes = provider.nodes(head_node_tags)
    assert len(nodes) == 1, "Failed to create head node."
    head_node = nodes[0]

    # TODO(ekl) right now we always update the head node even if the hash
    # matches. We could prompt the user for what they want to do in this case.
    runtime_hash = hash_runtime_conf(config["file_mounts"], config)
    logger.info("Updating files on head node...")

    # Rewrite the auth config so that the head node can update the workers
    remote_key_path = "~/ray_bootstrap_key.pem"
    remote_config = copy.deepcopy(config)
    remote_config["auth"]["ssh_private_key"] = remote_key_path

    # Adjust for new file locations
    new_mounts = {}
    for remote_path in config["file_mounts"]:
        new_mounts[remote_path] = remote_path
    remote_config["file_mounts"] = new_mounts
    remote_config["no_restart"] = no_restart

    # Now inject the rewritten config and SSH key into the head node.
    # The temporary file object stays referenced until this function
    # returns, so the file exists while the updater runs.
    remote_config_file = tempfile.NamedTemporaryFile(
        "w", prefix="ray-bootstrap-")
    remote_config_file.write(json.dumps(remote_config))
    remote_config_file.flush()
    config["file_mounts"].update({
        remote_key_path: config["auth"]["ssh_private_key"],
        "~/ray_bootstrap_config.yaml": remote_config_file.name
    })

    if restart_only:
        init_commands = config["head_start_ray_commands"]
    elif no_restart:
        init_commands = (
            config["setup_commands"] + config["head_setup_commands"])
    else:
        init_commands = (
            config["setup_commands"] + config["head_setup_commands"] +
            config["head_start_ray_commands"])

    updater = NodeUpdaterProcess(
        head_node,
        config["provider"],
        config["auth"],
        config["cluster_name"],
        config["file_mounts"],
        init_commands,
        runtime_hash,
        redirect_output=False)
    updater.start()
    updater.join()

    # Refresh the node cache so we see the external ip if available
    provider.nodes(head_node_tags)

    if updater.exitcode != 0:
        logger.error("Updating {} failed".format(
            provider.external_ip(head_node)))
        sys.exit(1)
    logger.info("Head node up-to-date, IP address is: {}".format(
        provider.external_ip(head_node)))

    monitor_str = "tail -n 100 -f /tmp/ray/session_*/logs/monitor*"
    # If any init command starts the autoscaler via `docker exec`, the
    # monitor command has to run through `docker exec` too. Wrap it exactly
    # once: wrapping per matching command would nest `docker exec` calls
    # whenever more than one command matches.
    starts_ray_in_docker = any(
        "ray start" in s and "docker exec" in s
        and "--autoscaling-config" in s for s in init_commands)
    if starts_ray_in_docker:
        monitor_str = "docker exec {} /bin/sh -c {}".format(
            config["docker"]["container_name"], quote(monitor_str))
    if override_cluster_name:
        modifiers = " --cluster-name={}".format(quote(override_cluster_name))
    else:
        modifiers = ""
    print("To monitor auto-scaling activity, you can run:\n\n"
          " ray exec {} {}{}\n".format(config_file, quote(monitor_str),
                                       modifiers))
    print("To open a console on the cluster:\n\n"
          " ray attach {}{}\n".format(config_file, modifiers))
    print("To ssh manually to the cluster, run:\n\n"
          " ssh -i {} {}@{}\n".format(config["auth"]["ssh_private_key"],
                                      config["auth"]["ssh_user"],
                                      provider.external_ip(head_node)))
def exec_cluster(config_file, cmd, screen, tmux, stop, start,
                 override_cluster_name, port_forward):
    """Runs a command on the specified cluster.

    Arguments:
        config_file: path to the cluster yaml
        cmd: command to run
        screen: whether to run in a screen
        tmux: whether to run in a tmux session
        stop: whether to stop the cluster after command run
        start: whether to start the cluster if it isn't up
        override_cluster_name: set the name of the cluster
        port_forward: port to forward
    """
    assert not (screen and tmux), "Can specify only one of `screen` or `tmux`."

    # safe_load instead of bare yaml.load: a cluster config is plain data,
    # and yaml.load without an explicit Loader can construct arbitrary
    # objects (newer PyYAML rejects the call outright). The context manager
    # makes sure the file handle is closed.
    with open(config_file) as config_stream:
        config = yaml.safe_load(config_stream.read())
    if override_cluster_name is not None:
        config["cluster_name"] = override_cluster_name
    config = _bootstrap_config(config)

    head_node = _get_head_node(
        config, config_file, override_cluster_name, create_if_needed=start)

    provider = get_node_provider(config["provider"], config["cluster_name"])
    try:
        updater = NodeUpdaterThread(
            head_node,
            config["provider"],
            provider,
            config["auth"],
            config["cluster_name"],
            config["file_mounts"],
            [],
            "",
        )
        if stop:
            # Chain the cluster shutdown after the user command.
            cmd += (
                "; ray stop; ray teardown ~/ray_bootstrap_config.yaml --yes "
                "--workers-only; sudo shutdown -h now")

        # `expect_error=stop` is forwarded as-is to `_exec`.
        _exec(
            updater,
            cmd,
            screen,
            tmux,
            expect_error=stop,
            port_forward=port_forward)

        if tmux or screen:
            # Tell the user how to re-attach to the detached session.
            attach_command_parts = ["ray attach", config_file]
            if override_cluster_name is not None:
                attach_command_parts.append(
                    "--cluster-name={}".format(override_cluster_name))
            if tmux:
                attach_command_parts.append("--tmux")
            elif screen:
                attach_command_parts.append("--screen")

            attach_command = " ".join(attach_command_parts)
            attach_info = "Use `{}` to check on command status.".format(
                attach_command)
            logger.info(attach_info)
    finally:
        provider.cleanup()