def do_update(self):
    self.provider.set_node_tags(self.node_id,
                                {TAG_RAY_NODE_STATUS: "waiting-for-ssh"})
    deadline = time.time() + NODE_START_WAIT_S
    self.set_ssh_ip_if_required()

    # Wait for SSH access
    with LogTimer("NodeUpdater: "
                  "{}: Got SSH".format(self.node_id)):
        ssh_ok = self.wait_for_ssh(deadline)
        assert ssh_ok, "Unable to SSH to node"

    self.provider.set_node_tags(self.node_id,
                                {TAG_RAY_NODE_STATUS: "syncing-files"})
    self.sync_file_mounts(self.rsync_up)

    # Run init commands
    self.provider.set_node_tags(self.node_id,
                                {TAG_RAY_NODE_STATUS: "setting-up"})
    m = "{}: Initialization commands completed".format(self.node_id)
    with LogTimer("NodeUpdater: {}".format(m)):
        for cmd in self.initialization_commands:
            self.ssh_cmd(cmd)

    m = "{}: Setup commands completed".format(self.node_id)
    with LogTimer("NodeUpdater: {}".format(m)):
        for cmd in self.setup_commands:
            self.ssh_cmd(cmd)
def do_update(self):
    self.provider.set_node_tags(
        self.node_id, {TAG_RAY_NODE_STATUS: STATUS_WAITING_FOR_SSH})
    deadline = time.time() + NODE_START_WAIT_S
    self.wait_ready(deadline)

    node_tags = self.provider.node_tags(self.node_id)
    logger.debug("Node tags: {}".format(str(node_tags)))
    if node_tags.get(TAG_RAY_RUNTIME_CONFIG) == self.runtime_hash:
        logger.info(self.log_prefix +
                    "{} already up-to-date, skip to ray start".format(
                        self.node_id))
    else:
        self.provider.set_node_tags(
            self.node_id, {TAG_RAY_NODE_STATUS: STATUS_SYNCING_FILES})
        self.sync_file_mounts(self.rsync_up)

        # Run init commands
        self.provider.set_node_tags(
            self.node_id, {TAG_RAY_NODE_STATUS: STATUS_SETTING_UP})
        with LogTimer(
                self.log_prefix + "Initialization commands completed"):
            for cmd in self.initialization_commands:
                self.cmd_runner.run(cmd)

        with LogTimer(self.log_prefix + "Setup commands completed"):
            for cmd in self.setup_commands:
                self.cmd_runner.run(cmd)

    with LogTimer(self.log_prefix + "Ray start commands completed"):
        for cmd in self.ray_start_commands:
            self.cmd_runner.run(cmd)
def do_update(self):
    self.provider.set_node_tags(self.node_id,
                                {TAG_RAY_NODE_STATUS: "waiting-for-ssh"})
    deadline = time.time() + NODE_START_WAIT_S
    self.set_ssh_ip_if_required()

    # Wait for SSH access
    with LogTimer("NodeUpdater: "
                  "{}: Got SSH".format(self.node_id)):
        ssh_ok = self.wait_for_ssh(deadline)
        assert ssh_ok, "Unable to SSH to node"

    # Rsync file mounts
    self.provider.set_node_tags(self.node_id,
                                {TAG_RAY_NODE_STATUS: "syncing-files"})
    for remote_path, local_path in self.file_mounts.items():
        logger.info("NodeUpdater: "
                    "{}: Syncing {} to {}...".format(
                        self.node_id, local_path, remote_path))
        assert os.path.exists(local_path), local_path
        if os.path.isdir(local_path):
            if not local_path.endswith("/"):
                local_path += "/"
            if not remote_path.endswith("/"):
                remote_path += "/"

        m = "{}: Synced {} to {}".format(self.node_id, local_path,
                                         remote_path)
        with LogTimer("NodeUpdater {}".format(m)):
            with open("/dev/null", "w") as redirect:
                self.ssh_cmd(
                    "mkdir -p {}".format(os.path.dirname(remote_path)),
                    redirect=redirect,
                )
                self.rsync_up(local_path, remote_path, redirect=redirect)

    # Run init commands
    self.provider.set_node_tags(self.node_id,
                                {TAG_RAY_NODE_STATUS: "setting-up"})

    m = "{}: Initialization commands completed".format(self.node_id)
    with LogTimer("NodeUpdater: {}".format(m)):
        with open("/dev/null", "w") as redirect:
            for cmd in self.initialization_commands:
                self.ssh_cmd(cmd, redirect=redirect)

    m = "{}: Setup commands completed".format(self.node_id)
    with LogTimer("NodeUpdater: {}".format(m)):
        with open("/dev/null", "w") as redirect:
            for cmd in self.setup_commands:
                self.ssh_cmd(cmd, redirect=redirect)
def set_ssh_ip_if_required(self):
    if self.ssh_ip is not None:
        return

    # We assume that this never changes.
    #   I think that's reasonable.
    deadline = time.time() + NODE_START_WAIT_S
    with LogTimer("NodeUpdater: {}: Got IP".format(self.node_id)):
        ip = self.wait_for_ip(deadline)
        assert ip is not None, "Unable to find IP of node"

    self.ssh_ip = ip

    # This should run before any SSH commands and therefore ensure that
    #   the ControlPath directory exists, allowing SSH to maintain
    #   persistent sessions later on.
    with open("/dev/null", "w") as redirect:
        self.get_caller(False)(
            ["mkdir", "-p", self.ssh_control_path],
            stdout=redirect,
            stderr=redirect)

        self.get_caller(False)(
            ["chmod", "0700", self.ssh_control_path],
            stdout=redirect,
            stderr=redirect)
def sync_file_mounts(self, sync_cmd):
    nolog_paths = []
    if cli_logger.verbosity == 0:
        nolog_paths = [
            "~/ray_bootstrap_key.pem", "~/ray_bootstrap_config.yaml"
        ]

    # Rsync file mounts
    with cli_logger.group(
            "Processing file mounts", _numbered=("[]", 2, 5)):
        for remote_path, local_path in self.file_mounts.items():
            assert os.path.exists(local_path), local_path
            if os.path.isdir(local_path):
                if not local_path.endswith("/"):
                    local_path += "/"
                if not remote_path.endswith("/"):
                    remote_path += "/"

            with LogTimer(self.log_prefix + "Synced {} to {}".format(
                    local_path, remote_path)):
                self.cmd_runner.run("mkdir -p {}".format(
                    os.path.dirname(remote_path)))
                sync_cmd(local_path, remote_path)

                if remote_path not in nolog_paths:
                    # todo: timed here?
                    cli_logger.print("{} from {}", cf.bold(remote_path),
                                     cf.bold(local_path))
def do_sync(remote_path, local_path, allow_non_existing_paths=False):
    if allow_non_existing_paths and not os.path.exists(local_path):
        # Ignore missing source files. In the future we should support
        # the --delete-missing-args command to delete files that have
        # been removed
        return

    assert os.path.exists(local_path), local_path

    if os.path.isdir(local_path):
        if not local_path.endswith("/"):
            local_path += "/"
        if not remote_path.endswith("/"):
            remote_path += "/"

    with LogTimer(self.log_prefix +
                  "Synced {} to {}".format(local_path, remote_path)):
        self.cmd_runner.run(
            "mkdir -p {}".format(os.path.dirname(remote_path)),
            run_env="host")
        sync_cmd(local_path, remote_path)

        if remote_path not in nolog_paths:
            # todo: timed here?
            cli_logger.print("{} from {}", cf.bold(remote_path),
                             cf.bold(local_path))
def wait_ready(self, deadline): with LogTimer(self.log_prefix + "Got remote shell"): logger.info(self.log_prefix + "Waiting for remote shell...") while time.time() < deadline and \ not self.provider.is_terminated(self.node_id): try: logger.debug(self.log_prefix + "Waiting for remote shell...") # Setting redirect=False allows the user to see errors like # unix_listener: path "/tmp/rkn_ray_ssh_sockets/..." too # long for Unix domain socket. self.cmd_runner.run("uptime", timeout=5, redirect=False) return True except Exception as e: retry_str = str(e) if hasattr(e, "cmd"): retry_str = "(Exit Status {}): {}".format( e.returncode, " ".join(e.cmd)) logger.debug(self.log_prefix + "Node not up, retrying: {}".format(retry_str)) time.sleep(READY_CHECK_INTERVAL) assert False, "Unable to connect to node"
def set_ssh_ip_if_required(self):
    if self.ssh_ip is not None:
        return

    # We assume that this never changes.
    #   I think that's reasonable.
    deadline = time.time() + NODE_START_WAIT_S
    with LogTimer(self.log_prefix + "Got IP"):
        ip = self.wait_for_ip(deadline)
        assert ip is not None, "Unable to find IP of node"

    self.ssh_ip = ip

    # This should run before any SSH commands and therefore ensure that
    #   the ControlPath directory exists, allowing SSH to maintain
    #   persistent sessions later on.
    try:
        self.process_runner.check_call(
            ["mkdir", "-p", self.ssh_control_path])
    except subprocess.CalledProcessError as e:
        logger.warning(e)
    try:
        self.process_runner.check_call(
            ["chmod", "0700", self.ssh_control_path])
    except subprocess.CalledProcessError as e:
        logger.warning(self.log_prefix + str(e))
def run(self):
    logger.info(self.log_prefix +
                "Updating to {}".format(self.runtime_hash))
    try:
        with LogTimer(self.log_prefix +
                      "Applied config {}".format(self.runtime_hash)):
            self.do_update()
    except Exception as e:
        error_str = str(e)
        if hasattr(e, "cmd"):
            error_str = "(Exit Status {}) {}".format(
                e.returncode, " ".join(e.cmd))
        logger.error(self.log_prefix +
                     "Error updating {}".format(error_str))
        self.provider.set_node_tags(
            self.node_id, {TAG_RAY_NODE_STATUS: STATUS_UPDATE_FAILED})
        raise e

    self.provider.set_node_tags(
        self.node_id, {
            TAG_RAY_NODE_STATUS: STATUS_UP_TO_DATE,
            TAG_RAY_RUNTIME_CONFIG: self.runtime_hash
        })

    self.exitcode = 0
def run(self): logger.info("NodeUpdater: " "{}: Updating to {}".format(self.node_id, self.runtime_hash)) try: m = "{}: Applied config {}".format(self.node_id, self.runtime_hash) with LogTimer("NodeUpdater: {}".format(m)): self.do_update() except Exception as e: error_str = str(e) if hasattr(e, "cmd"): error_str = "(Exit Status {}) {}".format( e.returncode, " ".join(e.cmd)) logger.error("NodeUpdater: " "{}: Error updating {}".format( self.node_id, error_str)) self.provider.set_node_tags(self.node_id, {TAG_RAY_NODE_STATUS: "update-failed"}) raise e self.provider.set_node_tags( self.node_id, { TAG_RAY_NODE_STATUS: "up-to-date", TAG_RAY_RUNTIME_CONFIG: self.runtime_hash }) self.exitcode = 0
def do_sync(remote_path, local_path, allow_non_existing_paths=False):
    if allow_non_existing_paths and not os.path.exists(local_path):
        cli_logger.print("sync: {} does not exist. Skipping.", local_path)
        # Ignore missing source files. In the future we should support
        # the --delete-missing-args command to delete files that have
        # been removed
        return

    assert os.path.exists(local_path), local_path

    if os.path.isdir(local_path):
        if not local_path.endswith("/"):
            local_path += "/"
        if not remote_path.endswith("/"):
            remote_path += "/"

    with LogTimer(self.log_prefix +
                  "Synced {} to {}".format(local_path, remote_path)):
        is_docker = (self.docker_config
                     and self.docker_config["container_name"] != "")
        if not is_docker:
            # The DockerCommandRunner handles this internally.
            self.cmd_runner.run(
                "mkdir -p {}".format(os.path.dirname(remote_path)),
                run_env="host")
        sync_cmd(local_path, remote_path, file_mount=True)

        if remote_path not in nolog_paths:
            # todo: timed here?
            cli_logger.print("{} from {}", cf.bold(remote_path),
                             cf.bold(local_path))
def teardown_cluster(config_file, yes, workers_only, override_cluster_name,
                     keep_min_workers):
    """Destroys all nodes of a Ray cluster described by a config json."""
    config = yaml.safe_load(open(config_file).read())
    if override_cluster_name is not None:
        config["cluster_name"] = override_cluster_name
    config = prepare_config(config)
    validate_config(config)

    confirm("This will destroy your cluster", yes)

    if not workers_only:
        try:
            exec_cluster(config_file, "ray stop", False, False, False, False,
                         False, override_cluster_name, None, False)
        except Exception:
            logger.exception("Ignoring error attempting a clean shutdown.")

    provider = get_node_provider(config["provider"], config["cluster_name"])
    try:

        def remaining_nodes():
            workers = provider.non_terminated_nodes(
                {TAG_RAY_NODE_TYPE: NODE_TYPE_WORKER})

            if keep_min_workers:
                min_workers = config.get("min_workers", 0)
                logger.info("teardown_cluster: "
                            "Keeping {} nodes...".format(min_workers))
                workers = random.sample(workers, len(workers) - min_workers)

            if workers_only:
                return workers

            head = provider.non_terminated_nodes(
                {TAG_RAY_NODE_TYPE: NODE_TYPE_HEAD})

            return head + workers

        # Loop here to check that both the head and worker nodes are actually
        #   really gone
        A = remaining_nodes()
        with LogTimer("teardown_cluster: done."):
            while A:
                logger.info("teardown_cluster: "
                            "Shutting down {} nodes...".format(len(A)))
                provider.terminate_nodes(A)
                time.sleep(1)
                A = remaining_nodes()
    finally:
        provider.cleanup()
def do_update(self):
    self.provider.set_node_tags(
        self.node_id, {TAG_RAY_NODE_STATUS: STATUS_WAITING_FOR_SSH})
    deadline = time.time() + NODE_START_WAIT_S
    self.set_ssh_ip_if_required()

    # Wait for SSH access
    with LogTimer("NodeUpdater: "
                  "{}: Got SSH".format(self.node_id)):
        ssh_ok = self.wait_for_ssh(deadline)
        assert ssh_ok, "Unable to SSH to node"

    node_tags = self.provider.node_tags(self.node_id)
    if node_tags.get(TAG_RAY_RUNTIME_CONFIG) == self.runtime_hash:
        logger.info(
            "NodeUpdater: {} already up-to-date, skip to ray start".format(
                self.node_id))
    else:
        self.provider.set_node_tags(
            self.node_id, {TAG_RAY_NODE_STATUS: STATUS_SYNCING_FILES})
        self.sync_file_mounts(self.rsync_up)

        # Run init commands
        self.provider.set_node_tags(
            self.node_id, {TAG_RAY_NODE_STATUS: STATUS_SETTING_UP})
        m = "{}: Initialization commands completed".format(self.node_id)
        with LogTimer("NodeUpdater: {}".format(m)):
            for cmd in self.initialization_commands:
                self.ssh_cmd(cmd)

        m = "{}: Setup commands completed".format(self.node_id)
        with LogTimer("NodeUpdater: {}".format(m)):
            for cmd in self.setup_commands:
                self.ssh_cmd(cmd)

    m = "{}: Ray start commands completed".format(self.node_id)
    with LogTimer("NodeUpdater: {}".format(m)):
        for cmd in self.ray_start_commands:
            self.ssh_cmd(cmd)
def wait_ready(self, deadline): with cli_logger.group("Waiting for SSH to become available", _numbered=("[]", 1, 6)): with LogTimer(self.log_prefix + "Got remote shell"): cli_logger.old_info(logger, "{}Waiting for remote shell...", self.log_prefix) cli_logger.print("Running `{}` as a test.", cf.bold("uptime")) first_conn_refused_time = None while time.time() < deadline and \ not self.provider.is_terminated(self.node_id): try: cli_logger.old_debug(logger, "{}Waiting for remote shell...", self.log_prefix) # Run outside of the container self.cmd_runner.run("uptime", run_env="host") cli_logger.old_debug(logger, "Uptime succeeded.") cli_logger.success("Success.") return True except ProcessRunnerError as e: first_conn_refused_time = \ cmd_output_util.handle_ssh_fails( e, first_conn_refused_time, retry_interval=READY_CHECK_INTERVAL) time.sleep(READY_CHECK_INTERVAL) except Exception as e: # TODO(maximsmol): we should not be ignoring # exceptions if they get filtered properly # (new style log + non-interactive shells) # # however threading this configuration state # is a pain and I'm leaving it for later retry_str = str(e) if hasattr(e, "cmd"): retry_str = "(Exit Status {}): {}".format( e.returncode, " ".join(e.cmd)) cli_logger.print( "SSH still not available {}, " "retrying in {} seconds.", cf.dimmed(retry_str), cf.bold(str(READY_CHECK_INTERVAL))) cli_logger.old_debug(logger, "{}Node not up, retrying: {}", self.log_prefix, retry_str) time.sleep(READY_CHECK_INTERVAL) assert False, "Unable to connect to node"
def run(self): cli_logger.old_info(logger, "{}Updating to {}", self.log_prefix, self.runtime_hash) try: with LogTimer(self.log_prefix + "Applied config {}".format(self.runtime_hash)): self.do_update() except Exception as e: error_str = str(e) if hasattr(e, "cmd"): error_str = "(Exit Status {}) {}".format( e.returncode, " ".join(e.cmd)) self.provider.set_node_tags( self.node_id, {TAG_RAY_NODE_STATUS: STATUS_UPDATE_FAILED}) cli_logger.error("New status: {}", cf.bold(STATUS_UPDATE_FAILED)) cli_logger.old_error(logger, "{}Error executing: {}\n", self.log_prefix, error_str) cli_logger.error("!!!") if hasattr(e, "cmd"): cli_logger.error( "Setup command `{}` failed with exit code {}. stderr:", cf.bold(e.cmd), e.returncode) else: cli_logger.verbose_error("{}", str(vars(e))) # todo: handle this better somehow? cli_logger.error("{}", str(e)) # todo: print stderr here cli_logger.error("!!!") cli_logger.newline() if isinstance(e, click.ClickException): # todo: why do we ignore this here return raise tags_to_set = { TAG_RAY_NODE_STATUS: STATUS_UP_TO_DATE, TAG_RAY_RUNTIME_CONFIG: self.runtime_hash, } if self.file_mounts_contents_hash is not None: tags_to_set[ TAG_RAY_FILE_MOUNTS_CONTENTS] = self.file_mounts_contents_hash self.provider.set_node_tags(self.node_id, tags_to_set) cli_logger.labeled_value("New status", STATUS_UP_TO_DATE) self.exitcode = 0
def sync_file_mounts(self, sync_cmd): # Rsync file mounts for remote_path, local_path in self.file_mounts.items(): assert os.path.exists(local_path), local_path if os.path.isdir(local_path): if not local_path.endswith("/"): local_path += "/" if not remote_path.endswith("/"): remote_path += "/" with LogTimer(self.log_prefix + "Synced {} to {}".format(local_path, remote_path)): self.cmd_runner.run("mkdir -p {}".format( os.path.dirname(remote_path))) sync_cmd(local_path, remote_path)
def sync_file_mounts(self, sync_cmd): # Rsync file mounts for remote_path, local_path in self.file_mounts.items(): assert os.path.exists(local_path), local_path if os.path.isdir(local_path): if not local_path.endswith("/"): local_path += "/" if not remote_path.endswith("/"): remote_path += "/" m = "{}: Synced {} to {}".format(self.node_id, local_path, remote_path) with LogTimer("NodeUpdater {}".format(m)): self.ssh_cmd( "mkdir -p {}".format(os.path.dirname(remote_path)), redirect=None, ) sync_cmd(local_path, remote_path, redirect=None)
def teardown_cluster(config_file, yes, workers_only, override_cluster_name):
    """Destroys all nodes of a Ray cluster described by a config json."""
    config = yaml.load(open(config_file).read())
    if override_cluster_name is not None:
        config["cluster_name"] = override_cluster_name
    validate_config(config)
    config = fillout_defaults(config)

    confirm("This will destroy your cluster", yes)

    provider = get_node_provider(config["provider"], config["cluster_name"])

    try:

        def remaining_nodes():
            if workers_only:
                A = []
            else:
                A = [
                    node_id for node_id in provider.non_terminated_nodes({
                        TAG_RAY_NODE_TYPE: "head"
                    })
                ]

            A += [
                node_id for node_id in provider.non_terminated_nodes({
                    TAG_RAY_NODE_TYPE: "worker"
                })
            ]
            return A

        # Loop here to check that both the head and worker nodes are actually
        #   really gone
        A = remaining_nodes()
        with LogTimer("teardown_cluster: Termination done."):
            while A:
                logger.info("teardown_cluster: "
                            "Terminating {} nodes...".format(len(A)))
                provider.terminate_nodes(A)
                time.sleep(1)
                A = remaining_nodes()
    finally:
        provider.cleanup()
def set_ssh_ip_if_required(self):
    if self.ssh_ip is not None:
        return

    # We assume that this never changes.
    #   I think that's reasonable.
    deadline = time.time() + NODE_START_WAIT_S
    with LogTimer(self.log_prefix + "Got IP"):
        ip = self.wait_for_ip(deadline)
        assert ip is not None, "Unable to find IP of node"

    self.ssh_ip = ip

    # This should run before any SSH commands and therefore ensure that
    #   the ControlPath directory exists, allowing SSH to maintain
    #   persistent sessions later on.
    try:
        os.makedirs(self.ssh_control_path, mode=0o700, exist_ok=True)
    except OSError as e:
        logger.warning(e)
def _node_tag_update_loop(self):
    """Update the AWS tags for a cluster periodically.

    The purpose of this loop is to avoid excessive EC2 calls when a large
    number of nodes are being launched simultaneously.
    """
    while True:
        self.tag_cache_update_event.wait()
        self.tag_cache_update_event.clear()

        batch_updates = defaultdict(list)

        with self.tag_cache_lock:
            for node_id, tags in self.tag_cache_pending.items():
                for x in tags.items():
                    batch_updates[x].append(node_id)
                self.tag_cache[node_id].update(tags)

            self.tag_cache_pending = {}

        for (k, v), node_ids in batch_updates.items():
            m = "Set tag {}={} on {}".format(k, v, node_ids)
            with LogTimer("AWSNodeProvider: {}".format(m)):
                if k == TAG_RAY_NODE_NAME:
                    k = "Name"
                self.ec2.meta.client.create_tags(
                    Resources=node_ids,
                    Tags=[{
                        "Key": k,
                        "Value": v
                    }],
                )

        self.tag_cache_kill_event.wait(timeout=5)
        if self.tag_cache_kill_event.is_set():
            return
def wait_ready(self, deadline):
    with cli_logger.group(
            "Waiting for SSH to become available", _numbered=("[]", 1, 6)):
        with LogTimer(self.log_prefix + "Got remote shell"):
            cli_logger.old_info(logger, "{}Waiting for remote shell...",
                                self.log_prefix)

            cli_logger.print("Running `{}` as a test.", cf.bold("uptime"))

            while time.time() < deadline and \
                    not self.provider.is_terminated(self.node_id):
                try:
                    cli_logger.old_debug(logger,
                                         "{}Waiting for remote shell...",
                                         self.log_prefix)

                    self.cmd_runner.run("uptime")
                    cli_logger.old_debug(logger, "Uptime succeeded.")
                    cli_logger.success("Success.")
                    return True
                except Exception as e:
                    retry_str = str(e)
                    if hasattr(e, "cmd"):
                        retry_str = "(Exit Status {}): {}".format(
                            e.returncode, " ".join(e.cmd))

                    cli_logger.print(
                        "SSH still not available {}, "
                        "retrying in {} seconds.", cf.gray(retry_str),
                        cf.bold(str(READY_CHECK_INTERVAL)))
                    cli_logger.old_debug(logger,
                                         "{}Node not up, retrying: {}",
                                         self.log_prefix, retry_str)

                    time.sleep(READY_CHECK_INTERVAL)

    assert False, "Unable to connect to node"
def wait_ready(self, deadline): with LogTimer(self.log_prefix + "Got remote shell"): logger.info(self.log_prefix + "Waiting for remote shell...") while time.time() < deadline and \ not self.provider.is_terminated(self.node_id): try: logger.debug(self.log_prefix + "Waiting for remote shell...") self.cmd_runner.run("uptime", timeout=5) logger.debug("Uptime succeeded.") return True except Exception as e: retry_str = str(e) if hasattr(e, "cmd"): retry_str = "(Exit Status {}): {}".format( e.returncode, " ".join(e.cmd)) logger.debug(self.log_prefix + "Node not up, retrying: {}".format(retry_str)) time.sleep(READY_CHECK_INTERVAL) assert False, "Unable to connect to node"
def do_update(self):
    self.provider.set_node_tags(
        self.node_id, {TAG_RAY_NODE_STATUS: STATUS_WAITING_FOR_SSH})
    cli_logger.labeled_value("New status", STATUS_WAITING_FOR_SSH)

    deadline = time.time() + NODE_START_WAIT_S
    self.wait_ready(deadline)

    node_tags = self.provider.node_tags(self.node_id)
    logger.debug("Node tags: {}".format(str(node_tags)))

    # runtime_hash will only change whenever the user restarts
    # or updates their cluster with `get_or_create_head_node`
    if node_tags.get(TAG_RAY_RUNTIME_CONFIG) == self.runtime_hash and (
            self.file_mounts_contents_hash is None
            or node_tags.get(TAG_RAY_FILE_MOUNTS_CONTENTS) ==
            self.file_mounts_contents_hash):
        # todo: we lie in the confirmation message since
        # full setup might be cancelled here
        cli_logger.print(
            "Configuration already up to date, "
            "skipping file mounts, initialization and setup commands.")
        cli_logger.old_info(logger,
                            "{}{} already up-to-date, skip to ray start",
                            self.log_prefix, self.node_id)
    else:
        cli_logger.print(
            "Updating cluster configuration.",
            _tags=dict(hash=self.runtime_hash))

        self.provider.set_node_tags(
            self.node_id, {TAG_RAY_NODE_STATUS: STATUS_SYNCING_FILES})
        cli_logger.labeled_value("New status", STATUS_SYNCING_FILES)
        self.sync_file_mounts(self.rsync_up)

        # Only run setup commands if runtime_hash has changed because
        # we don't want to run setup_commands every time the head node
        # file_mounts folders have changed.
        if node_tags.get(TAG_RAY_RUNTIME_CONFIG) != self.runtime_hash:
            # Run init commands
            self.provider.set_node_tags(
                self.node_id, {TAG_RAY_NODE_STATUS: STATUS_SETTING_UP})
            cli_logger.labeled_value("New status", STATUS_SETTING_UP)

            if self.initialization_commands:
                with cli_logger.group(
                        "Running initialization commands",
                        _numbered=("[]", 4, 6)):  # todo: fix command numbering
                    with LogTimer(
                            self.log_prefix + "Initialization commands",
                            show_status=True):
                        for cmd in self.initialization_commands:
                            self.cmd_runner.run(
                                cmd,
                                ssh_options_override=SSHOptions(
                                    self.auth_config.get(
                                        "ssh_private_key")))
            else:
                cli_logger.print(
                    "No initialization commands to run.",
                    _numbered=("[]", 4, 6))

            if self.setup_commands:
                with cli_logger.group(
                        "Running setup commands",
                        _numbered=("[]", 5, 6)):  # todo: fix command numbering
                    with LogTimer(
                            self.log_prefix + "Setup commands",
                            show_status=True):
                        total = len(self.setup_commands)
                        for i, cmd in enumerate(self.setup_commands):
                            if cli_logger.verbosity == 0:
                                cmd_to_print = cf.bold(cmd[:30]) + "..."
                            else:
                                cmd_to_print = cf.bold(cmd)

                            cli_logger.print(
                                "{}", cmd_to_print, _numbered=("()", i, total))

                            self.cmd_runner.run(cmd)
            else:
                cli_logger.print(
                    "No setup commands to run.", _numbered=("[]", 5, 6))

    with cli_logger.group(
            "Starting the Ray runtime", _numbered=("[]", 6, 6)):
        with LogTimer(
                self.log_prefix + "Ray start commands", show_status=True):
            for cmd in self.ray_start_commands:
                self.cmd_runner.run(cmd)
def teardown_cluster(config_file: str, yes: bool, workers_only: bool,
                     override_cluster_name: Optional[str],
                     keep_min_workers: bool):
    """Destroys all nodes of a Ray cluster described by a config json."""
    config = yaml.safe_load(open(config_file).read())
    if override_cluster_name is not None:
        config["cluster_name"] = override_cluster_name
    config = prepare_config(config)
    validate_config(config)

    cli_logger.confirm(yes, "Destroying cluster.", _abort=True)
    cli_logger.old_confirm("This will destroy your cluster", yes)

    if not workers_only:
        try:
            exec_cluster(
                config_file,
                cmd="ray stop",
                run_env="auto",
                screen=False,
                tmux=False,
                stop=False,
                start=False,
                override_cluster_name=override_cluster_name,
                port_forward=None,
                with_output=False)
        except Exception as e:
            # todo: add better exception info
            cli_logger.verbose_error("{}", str(e))
            cli_logger.warning(
                "Exception occurred when stopping the cluster Ray runtime "
                "(use -v to dump teardown exceptions).")
            cli_logger.warning(
                "Ignoring the exception and "
                "attempting to shut down the cluster nodes anyway.")

            cli_logger.old_exception(
                logger, "Ignoring error attempting a clean shutdown.")

    provider = get_node_provider(config["provider"], config["cluster_name"])
    try:

        def remaining_nodes():
            workers = provider.non_terminated_nodes(
                {TAG_RAY_NODE_KIND: NODE_KIND_WORKER})

            if keep_min_workers:
                min_workers = config.get("min_workers", 0)

                cli_logger.print(
                    "{} random worker nodes will not be shut down. " +
                    cf.dimmed("(due to {})"), cf.bold(min_workers),
                    cf.bold("--keep-min-workers"))
                cli_logger.old_info(logger,
                                    "teardown_cluster: Keeping {} nodes...",
                                    min_workers)

                workers = random.sample(workers, len(workers) - min_workers)

            # todo: it's weird to kill the head node but not all workers
            if workers_only:
                cli_logger.print(
                    "The head node will not be shut down. " +
                    cf.dimmed("(due to {})"), cf.bold("--workers-only"))

                return workers

            head = provider.non_terminated_nodes(
                {TAG_RAY_NODE_KIND: NODE_KIND_HEAD})

            return head + workers

        def run_docker_stop(node, container_name):
            try:
                updater = NodeUpdaterThread(
                    node_id=node,
                    provider_config=config["provider"],
                    provider=provider,
                    auth_config=config["auth"],
                    cluster_name=config["cluster_name"],
                    file_mounts=config["file_mounts"],
                    initialization_commands=[],
                    setup_commands=[],
                    ray_start_commands=[],
                    runtime_hash="",
                    file_mounts_contents_hash="",
                    is_head_node=False,
                    docker_config=config.get("docker"))
                _exec(
                    updater,
                    f"docker stop {container_name}",
                    False,
                    False,
                    run_env="host")
            except Exception:
                cli_logger.warning(f"Docker stop failed on {node}")
                cli_logger.old_warning(logger,
                                       f"Docker stop failed on {node}")

        # Loop here to check that both the head and worker nodes are actually
        #   really gone
        A = remaining_nodes()

        container_name = config.get("docker", {}).get("container_name")
        if container_name:
            for node in A:
                run_docker_stop(node, container_name)

        with LogTimer("teardown_cluster: done."):
            while A:
                cli_logger.old_info(
                    logger, "teardown_cluster: "
                    "Shutting down {} nodes...", len(A))

                provider.terminate_nodes(A)

                cli_logger.print(
                    "Requested {} nodes to shut down.",
                    cf.bold(len(A)),
                    _tags=dict(interval="1s"))

                time.sleep(1)  # todo: interval should be a variable
                A = remaining_nodes()

                cli_logger.print("{} nodes remaining after 1 second.",
                                 cf.bold(len(A)))
            cli_logger.success("No nodes remaining.")
    finally:
        provider.cleanup()
def do_update(self):
    self.provider.set_node_tags(
        self.node_id, {TAG_RAY_NODE_STATUS: STATUS_WAITING_FOR_SSH})
    cli_logger.labeled_value("New status", STATUS_WAITING_FOR_SSH)

    deadline = time.time() + NODE_START_WAIT_S
    self.wait_ready(deadline)

    node_tags = self.provider.node_tags(self.node_id)
    logger.debug("Node tags: {}".format(str(node_tags)))

    # runtime_hash will only change whenever the user restarts
    # or updates their cluster with `get_or_create_head_node`
    if node_tags.get(TAG_RAY_RUNTIME_CONFIG) == self.runtime_hash and (
            self.file_mounts_contents_hash is None
            or node_tags.get(TAG_RAY_FILE_MOUNTS_CONTENTS) ==
            self.file_mounts_contents_hash):
        # todo: we lie in the confirmation message since
        # full setup might be cancelled here
        cli_logger.print(
            "Configuration already up to date, "
            "skipping file mounts, initialization and setup commands.",
            _numbered=("[]", "2-5", 6))
        cli_logger.old_info(logger,
                            "{}{} already up-to-date, skip to ray start",
                            self.log_prefix, self.node_id)

        # When resuming from a stopped instance the runtime_hash may be the
        # same, but the container will not be started.
        self.cmd_runner.run_init(
            as_head=self.is_head_node, file_mounts=self.file_mounts)
    else:
        cli_logger.print(
            "Updating cluster configuration.",
            _tags=dict(hash=self.runtime_hash))

        self.provider.set_node_tags(
            self.node_id, {TAG_RAY_NODE_STATUS: STATUS_SYNCING_FILES})
        cli_logger.labeled_value("New status", STATUS_SYNCING_FILES)
        self.sync_file_mounts(self.rsync_up, step_numbers=(2, 6))

        # Only run setup commands if runtime_hash has changed because
        # we don't want to run setup_commands every time the head node
        # file_mounts folders have changed.
        if node_tags.get(TAG_RAY_RUNTIME_CONFIG) != self.runtime_hash:
            # Run init commands
            self.provider.set_node_tags(
                self.node_id, {TAG_RAY_NODE_STATUS: STATUS_SETTING_UP})
            cli_logger.labeled_value("New status", STATUS_SETTING_UP)

            if self.initialization_commands:
                with cli_logger.group(
                        "Running initialization commands",
                        _numbered=("[]", 3, 5)):
                    with LogTimer(
                            self.log_prefix + "Initialization commands",
                            show_status=True):
                        for cmd in self.initialization_commands:
                            try:
                                # Overriding the existing SSHOptions class
                                # with a new SSHOptions class that uses
                                # this ssh_private_key as its only __init__
                                # argument.
                                # Run outside docker.
                                self.cmd_runner.run(
                                    cmd,
                                    ssh_options_override_ssh_key=self.
                                    auth_config.get("ssh_private_key"),
                                    run_env="host")
                            except ProcessRunnerError as e:
                                if e.msg_type == "ssh_command_failed":
                                    cli_logger.error("Failed.")
                                    cli_logger.error(
                                        "See above for stderr.")

                                raise click.ClickException(
                                    "Initialization command failed."
                                ) from None
            else:
                cli_logger.print(
                    "No initialization commands to run.",
                    _numbered=("[]", 3, 6))

            self.cmd_runner.run_init(
                as_head=self.is_head_node, file_mounts=self.file_mounts)

            if self.setup_commands:
                with cli_logger.group(
                        "Running setup commands",
                        # todo: fix command numbering
                        _numbered=("[]", 4, 6)):
                    with LogTimer(
                            self.log_prefix + "Setup commands",
                            show_status=True):
                        total = len(self.setup_commands)
                        for i, cmd in enumerate(self.setup_commands):
                            if cli_logger.verbosity == 0 and len(cmd) > 30:
                                cmd_to_print = cf.bold(cmd[:30]) + "..."
                            else:
                                cmd_to_print = cf.bold(cmd)

                            cli_logger.print(
                                "{}", cmd_to_print, _numbered=("()", i, total))

                            try:
                                # Runs in the container if docker is in use
                                self.cmd_runner.run(cmd, run_env="auto")
                            except ProcessRunnerError as e:
                                if e.msg_type == "ssh_command_failed":
                                    cli_logger.error("Failed.")
                                    cli_logger.error(
                                        "See above for stderr.")

                                raise click.ClickException(
                                    "Setup command failed.")
            else:
                cli_logger.print(
                    "No setup commands to run.", _numbered=("[]", 4, 6))

    with cli_logger.group(
            "Starting the Ray runtime", _numbered=("[]", 6, 6)):
        with LogTimer(
                self.log_prefix + "Ray start commands", show_status=True):
            for cmd in self.ray_start_commands:
                if self.node_resources:
                    env_vars = {
                        ray_constants.RESOURCES_ENVIRONMENT_VARIABLE:
                        self.node_resources
                    }
                else:
                    env_vars = {}
                try:
                    old_redirected = cmd_output_util.is_output_redirected()
                    cmd_output_util.set_output_redirected(False)
                    # Runs in the container if docker is in use
                    self.cmd_runner.run(
                        cmd,
                        environment_variables=env_vars,
                        run_env="auto")
                    cmd_output_util.set_output_redirected(old_redirected)
                except ProcessRunnerError as e:
                    if e.msg_type == "ssh_command_failed":
                        cli_logger.error("Failed.")
                        cli_logger.error("See above for stderr.")

                    raise click.ClickException("Start command failed.")
def run(self): cli_logger.old_info(logger, "{}Updating to {}", self.log_prefix, self.runtime_hash) if cmd_output_util.does_allow_interactive( ) and cmd_output_util.is_output_redirected(): # this is most probably a bug since the user has no control # over these settings msg = ("Output was redirected for an interactive command. " "Either do not pass `--redirect-command-output` " "or also pass in `--use-normal-shells`.") cli_logger.abort(msg) raise click.ClickException(msg) try: with LogTimer(self.log_prefix + "Applied config {}".format(self.runtime_hash)): self.do_update() except Exception as e: error_str = str(e) if hasattr(e, "cmd"): error_str = "(Exit Status {}) {}".format( e.returncode, " ".join(e.cmd)) self.provider.set_node_tags( self.node_id, {TAG_RAY_NODE_STATUS: STATUS_UPDATE_FAILED}) cli_logger.error("New status: {}", cf.bold(STATUS_UPDATE_FAILED)) cli_logger.old_error(logger, "{}Error executing: {}\n", self.log_prefix, error_str) cli_logger.error("!!!") if hasattr(e, "cmd"): cli_logger.error( "Setup command `{}` failed with exit code {}. stderr:", cf.bold(e.cmd), e.returncode) else: cli_logger.verbose_error("{}", str(vars(e))) # todo: handle this better somehow? cli_logger.error("{}", str(e)) # todo: print stderr here cli_logger.error("!!!") cli_logger.newline() if isinstance(e, click.ClickException): # todo: why do we ignore this here return raise tags_to_set = { TAG_RAY_NODE_STATUS: STATUS_UP_TO_DATE, TAG_RAY_RUNTIME_CONFIG: self.runtime_hash, } if self.file_mounts_contents_hash is not None: tags_to_set[ TAG_RAY_FILE_MOUNTS_CONTENTS] = self.file_mounts_contents_hash self.provider.set_node_tags(self.node_id, tags_to_set) cli_logger.labeled_value("New status", STATUS_UP_TO_DATE) self.exitcode = 0
def teardown_cluster(config_file: str, yes: bool, workers_only: bool,
                     override_cluster_name: Optional[str],
                     keep_min_workers: bool, log_old_style: bool,
                     log_color: str, verbose: int):
    """Destroys all nodes of a Ray cluster described by a config json."""
    cli_logger.old_style = log_old_style
    cli_logger.color_mode = log_color
    cli_logger.verbosity = verbose
    cli_logger.dump_command_output = verbose == 3  # todo: add a separate flag?

    config = yaml.safe_load(open(config_file).read())
    if override_cluster_name is not None:
        config["cluster_name"] = override_cluster_name
    config = prepare_config(config)
    validate_config(config)

    cli_logger.confirm(yes, "Destroying cluster.", _abort=True)
    cli_logger.old_confirm("This will destroy your cluster", yes)

    if not workers_only:
        try:
            exec_cluster(
                config_file,
                cmd="ray stop",
                run_env="auto",
                screen=False,
                tmux=False,
                stop=False,
                start=False,
                override_cluster_name=override_cluster_name,
                port_forward=None,
                with_output=False)
        except Exception as e:
            # todo: add better exception info
            cli_logger.verbose_error("{}", str(e))
            cli_logger.warning(
                "Exception occurred when stopping the cluster Ray runtime "
                "(use -v to dump teardown exceptions).")
            cli_logger.warning(
                "Ignoring the exception and "
                "attempting to shut down the cluster nodes anyway.")

            cli_logger.old_exception(
                logger, "Ignoring error attempting a clean shutdown.")

    provider = get_node_provider(config["provider"], config["cluster_name"])
    try:

        def remaining_nodes():
            workers = provider.non_terminated_nodes(
                {TAG_RAY_NODE_TYPE: NODE_TYPE_WORKER})

            if keep_min_workers:
                min_workers = config.get("min_workers", 0)

                cli_logger.print(
                    "{} random worker nodes will not be shut down. " +
                    cf.gray("(due to {})"), cf.bold(min_workers),
                    cf.bold("--keep-min-workers"))
                cli_logger.old_info(logger,
                                    "teardown_cluster: Keeping {} nodes...",
                                    min_workers)

                workers = random.sample(workers, len(workers) - min_workers)

            # todo: it's weird to kill the head node but not all workers
            if workers_only:
                cli_logger.print(
                    "The head node will not be shut down. " +
                    cf.gray("(due to {})"), cf.bold("--workers-only"))

                return workers

            head = provider.non_terminated_nodes(
                {TAG_RAY_NODE_TYPE: NODE_TYPE_HEAD})

            return head + workers

        # Loop here to check that both the head and worker nodes are actually
        #   really gone
        A = remaining_nodes()
        with LogTimer("teardown_cluster: done."):
            while A:
                cli_logger.old_info(
                    logger, "teardown_cluster: "
                    "Shutting down {} nodes...", len(A))

                provider.terminate_nodes(A)

                cli_logger.print(
                    "Requested {} nodes to shut down.",
                    cf.bold(len(A)),
                    _tags=dict(interval="1s"))

                time.sleep(1)  # todo: interval should be a variable
                A = remaining_nodes()

                cli_logger.print("{} nodes remaining after 1 second.",
                                 cf.bold(len(A)))
    finally:
        provider.cleanup()