def terminate_nodes(self, node_ids):
    if not node_ids:
        return
    if self.cache_stopped_nodes:
        spot_ids = []
        on_demand_ids = []

        for node_id in node_ids:
            if self._get_cached_node(node_id).spot_instance_request_id:
                spot_ids += [node_id]
            else:
                on_demand_ids += [node_id]

        if on_demand_ids:
            # todo: show node names?
            cli_logger.print(
                "Stopping instances {} " + cf.dimmed(
                    "(to terminate instead, "
                    "set `cache_stopped_nodes: False` "
                    "under `provider` in the cluster configuration)"),
                cli_logger.render_list(on_demand_ids))

            self.ec2.meta.client.stop_instances(InstanceIds=on_demand_ids)
        if spot_ids:
            cli_logger.print(
                "Terminating instances {} " +
                cf.dimmed("(cannot stop spot instances, only terminate)"),
                cli_logger.render_list(spot_ids))

            self.ec2.meta.client.terminate_instances(InstanceIds=spot_ids)
    else:
        self.ec2.meta.client.terminate_instances(InstanceIds=node_ids)
def remaining_nodes():
    workers = provider.non_terminated_nodes(
        {TAG_RAY_NODE_KIND: NODE_KIND_WORKER})

    if keep_min_workers:
        min_workers = config.get("min_workers", 0)

        cli_logger.print(
            "{} random worker nodes will not be shut down. " +
            cf.dimmed("(due to {})"), cf.bold(min_workers),
            cf.bold("--keep-min-workers"))

        workers = random.sample(workers, len(workers) - min_workers)

    # todo: it's weird to kill the head node but not all workers
    if workers_only:
        cli_logger.print(
            "The head node will not be shut down. " +
            cf.dimmed("(due to {})"), cf.bold("--workers-only"))

        return workers

    head = provider.non_terminated_nodes({TAG_RAY_NODE_KIND: NODE_KIND_HEAD})

    return head + workers
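
# A minimal, self-contained sketch (not Ray code) of the sampling step above:
# since remaining_nodes() returns the nodes that will be shut down,
# `random.sample(workers, len(workers) - min_workers)` picks the nodes to
# kill, leaving exactly `min_workers` survivors. The worker IDs here are
# hypothetical.
import random

workers = ["node-1", "node-2", "node-3", "node-4", "node-5"]
min_workers = 2

# Sample the nodes to terminate; the two left over are kept alive.
to_terminate = random.sample(workers, len(workers) - min_workers)
kept = [w for w in workers if w not in to_terminate]
assert len(kept) == min_workers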
def terminate_node(self, node_id):
    node = self._get_cached_node(node_id)
    if self.cache_stopped_nodes:
        if node.spot_instance_request_id:
            cli_logger.print(
                "Terminating instance {} " +
                cf.dimmed("(cannot stop spot instances, only terminate)"),
                node_id)  # todo: show node name?
            cli_logger.old_info(
                logger,
                "AWSNodeProvider: terminating node {} (spot nodes cannot "
                "be stopped, only terminated)", node_id)

            node.terminate()
        else:
            cli_logger.print("Stopping instance {} " + cf.dimmed(
                "(to terminate instead, "
                "set `cache_stopped_nodes: False` "
                "under `provider` in the cluster configuration)"),
                             node_id)  # todo: show node name?
            cli_logger.old_info(
                logger,
                "AWSNodeProvider: stopping node {}. To terminate nodes "
                "on stop, set 'cache_stopped_nodes: False' in the "
                "provider config.".format(node_id))

            node.stop()
    else:
        node.terminate()

    self.tag_cache.pop(node_id, None)
    self.tag_cache_pending.pop(node_id, None)
def terminate_node(self, node_id):
    node = self._get_cached_node(node_id)
    if self.cache_stopped_nodes:
        if node.spot_instance_request_id:
            cli_logger.print(
                "Terminating instance {} " +
                cf.dimmed("(cannot stop spot instances, only terminate)"),
                node_id)  # todo: show node name?

            node.terminate()
        else:
            cli_logger.print("Stopping instance {} " + cf.dimmed(
                "(to terminate instead, "
                "set `cache_stopped_nodes: False` "
                "under `provider` in the cluster configuration)"),
                             node_id)  # todo: show node name?

            node.stop()
    else:
        node.terminate()

    # TODO (Alex): We are leaking the tag cache here. Naively, we would
    # want to just remove the cache entry here, but terminating can be
    # asynchronous or error, which would result in a use-after-free error.
    # If this leak becomes bad, we can garbage collect the tag cache when
    # the node cache is updated.
    pass
def terminate_nodes(self, node_ids):
    if not node_ids:
        return
    terminate_instances_func = self.ec2.meta.client.terminate_instances
    stop_instances_func = self.ec2.meta.client.stop_instances

    # In some cases, this function stops some nodes, but terminates others.
    # Each of these requires a different EC2 API call. So, we use the
    # "nodes_to_terminate" dict below to keep track of exactly which API
    # call will be used to stop/terminate which set of nodes. The key is
    # the function to use, and the value is the list of nodes to terminate
    # with that function.
    nodes_to_terminate = {
        terminate_instances_func: [],
        stop_instances_func: []
    }

    if self.cache_stopped_nodes:
        spot_ids = []
        on_demand_ids = []

        for node_id in node_ids:
            if self._get_cached_node(node_id).spot_instance_request_id:
                spot_ids += [node_id]
            else:
                on_demand_ids += [node_id]

        if on_demand_ids:
            # todo: show node names?
            cli_logger.print(
                "Stopping instances {} " + cf.dimmed(
                    "(to terminate instead, "
                    "set `cache_stopped_nodes: False` "
                    "under `provider` in the cluster configuration)"),
                cli_logger.render_list(on_demand_ids),
            )

        if spot_ids:
            cli_logger.print(
                "Terminating instances {} " +
                cf.dimmed("(cannot stop spot instances, only terminate)"),
                cli_logger.render_list(spot_ids),
            )

        nodes_to_terminate[stop_instances_func] = on_demand_ids
        nodes_to_terminate[terminate_instances_func] = spot_ids
    else:
        nodes_to_terminate[terminate_instances_func] = node_ids

    max_terminate_nodes = (self.max_terminate_nodes
                           if self.max_terminate_nodes is not None
                           else len(node_ids))

    for terminate_func, nodes in nodes_to_terminate.items():
        for start in range(0, len(nodes), max_terminate_nodes):
            terminate_func(
                InstanceIds=nodes[start:start + max_terminate_nodes])
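
# A standalone sketch of the batching loop at the end of terminate_nodes,
# using only the standard library. The provider slices instance IDs into
# chunks of at most `max_terminate_nodes` per request, since a single
# stop/terminate API call should only be handed a bounded number of IDs.
# The instance IDs below are hypothetical.
def chunked(ids, chunk_size):
    """Yield successive slices of `ids` of at most `chunk_size` elements."""
    for start in range(0, len(ids), chunk_size):
        yield ids[start:start + chunk_size]

# Hypothetical usage: seven IDs and a chunk size of 3 produce three calls.
for batch in chunked([f"i-{n:017x}" for n in range(7)], 3):
    print(batch)  # stand-in for terminate_func(InstanceIds=batch)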
def wait_ready(self, deadline):
    with cli_logger.group(
            "Waiting for SSH to become available",
            _numbered=("[]", 1, NUM_SETUP_STEPS)):
        with LogTimer(self.log_prefix + "Got remote shell"):
            cli_logger.print("Running `{}` as a test.", cf.bold("uptime"))
            first_conn_refused_time = None

            while True:
                if time.time() > deadline:
                    raise Exception("wait_ready timeout exceeded.")
                if self.provider.is_terminated(self.node_id):
                    raise Exception("wait_ready aborting because node "
                                    "detected as terminated.")

                try:
                    # Run outside of the container
                    self.cmd_runner.run("uptime", timeout=5, run_env="host")
                    cli_logger.success("Success.")
                    return True
                except ProcessRunnerError as e:
                    first_conn_refused_time = \
                        cmd_output_util.handle_ssh_fails(
                            e,
                            first_conn_refused_time,
                            retry_interval=READY_CHECK_INTERVAL)
                    time.sleep(READY_CHECK_INTERVAL)
                except Exception as e:
                    # TODO(maximsmol): we should not be ignoring
                    # exceptions if they get filtered properly
                    # (new style log + non-interactive shells)
                    #
                    # however threading this configuration state
                    # is a pain and I'm leaving it for later
                    retry_str = "(" + str(e) + ")"
                    if hasattr(e, "cmd"):
                        if isinstance(e.cmd, str):
                            cmd_ = e.cmd
                        elif isinstance(e.cmd, list):
                            cmd_ = " ".join(e.cmd)
                        else:
                            logger.debug(
                                f"e.cmd type ({type(e.cmd)}) not list or str.")
                            cmd_ = str(e.cmd)
                        retry_str = "(Exit Status {}): {}".format(
                            e.returncode, cmd_)

                    cli_logger.print(
                        "SSH still not available {}, retrying in {} seconds.",
                        cf.dimmed(retry_str),
                        cf.bold(str(READY_CHECK_INTERVAL)))

                    time.sleep(READY_CHECK_INTERVAL)
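
# The readiness loop above is an instance of a generic poll-until-deadline
# pattern: check the deadline first, then probe, then sleep and retry on
# failure. A minimal sketch of that pattern, with a hypothetical `check()`
# probe standing in for the SSH `uptime` test:
import time

def poll_until(check, deadline, interval):
    """Retry `check()` every `interval` seconds until it succeeds or the
    wall-clock `deadline` (a time.time() value) passes."""
    while True:
        if time.time() > deadline:
            raise TimeoutError("deadline exceeded")
        try:
            return check()
        except Exception:
            time.sleep(interval)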
def terminate_nodes(self, node_ids):
    if not node_ids:
        return
    if self.cache_stopped_nodes:
        spot_ids = []
        on_demand_ids = []

        for node_id in node_ids:
            if self._get_cached_node(node_id).spot_instance_request_id:
                spot_ids += [node_id]
            else:
                on_demand_ids += [node_id]

        if on_demand_ids:
            # todo: show node names?
            cli_logger.print(
                "Stopping instances {} " + cf.dimmed(
                    "(to terminate instead, "
                    "set `cache_stopped_nodes: False` "
                    "under `provider` in the cluster configuration)"),
                cli_logger.render_list(on_demand_ids))
            cli_logger.old_info(
                logger,
                "AWSNodeProvider: stopping nodes {}. To terminate nodes "
                "on stop, set 'cache_stopped_nodes: False' in the "
                "provider config.", on_demand_ids)

            self.ec2.meta.client.stop_instances(InstanceIds=on_demand_ids)
        if spot_ids:
            cli_logger.print(
                "Terminating instances {} " +
                cf.dimmed("(cannot stop spot instances, only terminate)"),
                cli_logger.render_list(spot_ids))
            cli_logger.old_info(
                logger,
                "AWSNodeProvider: terminating nodes {} (spot nodes cannot "
                "be stopped, only terminated)", spot_ids)

            self.ec2.meta.client.terminate_instances(InstanceIds=spot_ids)
    else:
        self.ec2.meta.client.terminate_instances(InstanceIds=node_ids)

    for node_id in node_ids:
        self.tag_cache.pop(node_id, None)
        self.tag_cache_pending.pop(node_id, None)
def handle_cli_override(key, override):
    if override is not None:
        if key in config:
            nonlocal printed_overrides
            printed_overrides = True
            cli_logger.warning(
                "`{}` override provided on the command line.\n"
                " Using " + cf.bold("{}") + cf.dimmed(
                    " [configuration file has " + cf.bold("{}") + "]"),
                key, override, config[key])
        config[key] = override
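
# handle_cli_override is written as a closure: `config` and
# `printed_overrides` live in an enclosing function's scope, which is why it
# declares `nonlocal printed_overrides` instead of taking them as arguments.
# A self-contained sketch of that shape, with hypothetical config keys and
# CLI values (not taken from Ray):
def apply_overrides(config, cli_args):
    printed_overrides = False

    def handle(key, override):
        nonlocal printed_overrides
        if override is not None:
            if key in config:
                printed_overrides = True
                print(f"`{key}` overridden: {config[key]} -> {override}")
            config[key] = override

    for key, override in cli_args.items():
        handle(key, override)
    return printed_overrides

# Hypothetical usage: a None override is ignored, a real one wins.
apply_overrides({"min_workers": 2}, {"min_workers": 5, "max_workers": None})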
def terminate_node(self, node_id):
    node = self._get_cached_node(node_id)
    if self.cache_stopped_nodes:
        if node.spot_instance_request_id:
            cli_logger.print(
                "Terminating instance {} " +
                cf.dimmed("(cannot stop spot instances, only terminate)"),
                node_id)  # todo: show node name?

            node.terminate()
        else:
            cli_logger.print("Stopping instance {} " + cf.dimmed(
                "(to terminate instead, "
                "set `cache_stopped_nodes: False` "
                "under `provider` in the cluster configuration)"),
                             node_id)  # todo: show node name?

            node.stop()
    else:
        node.terminate()

    self.tag_cache.pop(node_id, None)
    self.tag_cache_pending.pop(node_id, None)
def wait_ready(self, deadline):
    with cli_logger.group(
            "Waiting for SSH to become available",
            _numbered=("[]", 1, 6)):
        with LogTimer(self.log_prefix + "Got remote shell"):
            cli_logger.old_info(logger, "{}Waiting for remote shell...",
                                self.log_prefix)

            cli_logger.print("Running `{}` as a test.", cf.bold("uptime"))
            first_conn_refused_time = None

            while time.time() < deadline and \
                    not self.provider.is_terminated(self.node_id):
                try:
                    cli_logger.old_debug(logger,
                                         "{}Waiting for remote shell...",
                                         self.log_prefix)

                    # Run outside of the container
                    self.cmd_runner.run("uptime", timeout=5, run_env="host")
                    cli_logger.old_debug(logger, "Uptime succeeded.")
                    cli_logger.success("Success.")
                    return True
                except ProcessRunnerError as e:
                    first_conn_refused_time = \
                        cmd_output_util.handle_ssh_fails(
                            e,
                            first_conn_refused_time,
                            retry_interval=READY_CHECK_INTERVAL)
                    time.sleep(READY_CHECK_INTERVAL)
                except Exception as e:
                    # TODO(maximsmol): we should not be ignoring
                    # exceptions if they get filtered properly
                    # (new style log + non-interactive shells)
                    #
                    # however threading this configuration state
                    # is a pain and I'm leaving it for later
                    retry_str = str(e)
                    if hasattr(e, "cmd"):
                        retry_str = "(Exit Status {}): {}".format(
                            e.returncode, " ".join(e.cmd))

                    cli_logger.print(
                        "SSH still not available {}, "
                        "retrying in {} seconds.", cf.dimmed(retry_str),
                        cf.bold(str(READY_CHECK_INTERVAL)))
                    cli_logger.old_debug(logger, "{}Node not up, retrying: {}",
                                         self.log_prefix, retry_str)

                    time.sleep(READY_CHECK_INTERVAL)

    assert False, "Unable to connect to node"
def stop(force, verbose, log_style, log_color):
    """Stop Ray processes manually on the local machine."""
    cli_logger.configure(log_style, log_color, verbose)

    # Note that raylet needs to exit before object store, otherwise
    # it cannot exit gracefully.
    is_linux = sys.platform.startswith("linux")
    processes_to_kill = [
        # The first element is the substring to filter.
        # The second element, if True, is to filter ps results by command name
        # (only the first 15 characters of the executable name on Linux);
        # if False, is to filter ps results by command with all its arguments.
        # See STANDARD FORMAT SPECIFIERS section of
        # http://man7.org/linux/man-pages/man1/ps.1.html
        # about comm and args. This can help avoid killing non-ray processes.
        # Format:
        # Keyword to filter, filter by command (True)/filter by args (False)
        ["raylet", True],
        ["plasma_store", True],
        ["gcs_server", True],
        ["monitor.py", False],
        ["redis-server", False],
        ["default_worker.py", False],  # Python worker.
        ["ray::", True],  # Python worker. TODO(mehrdadn): Fix for Windows
        ["io.ray.runtime.runner.worker.DefaultWorker", False],  # Java worker.
        ["log_monitor.py", False],
        ["reporter.py", False],
        ["dashboard.py", False],
        ["new_dashboard/agent.py", False],
        ["ray_process_reaper.py", False],
    ]

    process_infos = []
    for proc in psutil.process_iter(["name", "cmdline"]):
        try:
            process_infos.append((proc, proc.name(), proc.cmdline()))
        except psutil.Error:
            pass

    total_found = 0
    total_stopped = 0
    for keyword, filter_by_cmd in processes_to_kill:
        if filter_by_cmd and is_linux and len(keyword) > 15:
            # getting here is an internal bug, so we do not use cli_logger
            msg = ("The filter string should not be more than {} "
                   "characters. Actual length: {}. Filter: {}").format(
                       15, len(keyword), keyword)
            raise ValueError(msg)

        found = []
        for candidate in process_infos:
            proc, proc_cmd, proc_args = candidate
            corpus = (proc_cmd if filter_by_cmd else
                      subprocess.list2cmdline(proc_args))
            if keyword in corpus:
                found.append(candidate)

        for proc, proc_cmd, proc_args in found:
            total_found += 1

            proc_string = str(subprocess.list2cmdline(proc_args))
            try:
                if force:
                    proc.kill()
                else:
                    # TODO(mehrdadn): On Windows, this is forceful termination.
                    # We don't want CTRL_BREAK_EVENT, because that would
                    # terminate the entire process group. What to do?
                    proc.terminate()

                if force:
                    cli_logger.verbose("Killed `{}` {} ", cf.bold(proc_string),
                                       cf.dimmed("(via SIGKILL)"))
                else:
                    cli_logger.verbose("Sent termination request to `{}` {}",
                                       cf.bold(proc_string),
                                       cf.dimmed("(via SIGTERM)"))

                total_stopped += 1
            except psutil.NoSuchProcess:
                cli_logger.verbose(
                    "Attempted to stop `{}`, but process was already dead.",
                    cf.bold(proc_string))
                pass
            except (psutil.Error, OSError) as ex:
                cli_logger.error("Could not terminate `{}` due to {}",
                                 cf.bold(proc_string), str(ex))

    if total_found == 0:
        cli_logger.print("Did not find any active Ray processes.")
    else:
        if total_stopped == total_found:
            cli_logger.success("Stopped all {} Ray processes.", total_stopped)
        else:
            cli_logger.warning(
                "Stopped only {} out of {} Ray processes. "
                "Set `{}` to see more details.", total_stopped, total_found,
                cf.bold("-v"))
            cli_logger.warning("Try running the command again, or use `{}`.",
                               cf.bold("--force"))
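
# A small sketch of the comm-vs-args distinction stop() relies on: on Linux
# a process's command name (comm) is truncated to 15 characters, so substring
# filters against it must themselves fit in 15 characters, while filtering
# against the full argument list has no such limit. This uses psutil and
# subprocess.list2cmdline, both of which stop() already depends on; the
# `matches` helper is hypothetical, not part of Ray.
import subprocess
import psutil

def matches(proc, keyword, filter_by_cmd):
    try:
        if filter_by_cmd:
            return keyword in proc.name()  # comm: truncated on Linux
        return keyword in subprocess.list2cmdline(proc.cmdline())
    except psutil.Error:
        # Process exited or is inaccessible; treat as a non-match.
        return False

# Hypothetical usage: find PIDs whose full command line mentions "monitor.py".
hits = [p.pid for p in psutil.process_iter()
        if matches(p, "monitor.py", False)]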