def terminate_node(self, node_id):
    node = self._get_cached_node(node_id)
    if self.cache_stopped_nodes:
        if node.spot_instance_request_id:
            cli_logger.print(
                "Terminating instance {} " +
                cf.dimmed("(cannot stop spot instances, only terminate)"),
                node_id)  # todo: show node name?
            node.terminate()
        else:
            cli_logger.print(
                "Stopping instance {} " + cf.dimmed(
                    "(to terminate instead, "
                    "set `cache_stopped_nodes: False` "
                    "under `provider` in the cluster configuration)"),
                node_id)  # todo: show node name?
            node.stop()
    else:
        node.terminate()

    # TODO (Alex): We are leaking the tag cache here. Naively, we would
    # want to just remove the cache entry here, but terminating can be
    # asynchronous or error, which would result in a use-after-free error.
    # If this leak becomes bad, we can garbage collect the tag cache when
    # the node cache is updated.
    pass

def terminate_nodes(self, node_ids):
    if not node_ids:
        return
    if self.cache_stopped_nodes:
        spot_ids = []
        on_demand_ids = []

        for node_id in node_ids:
            if self._get_cached_node(node_id).spot_instance_request_id:
                spot_ids += [node_id]
            else:
                on_demand_ids += [node_id]

        if on_demand_ids:
            # todo: show node names?
            cli_logger.print(
                "Stopping instances {} " + cf.dimmed(
                    "(to terminate instead, "
                    "set `cache_stopped_nodes: False` "
                    "under `provider` in the cluster configuration)"),
                cli_logger.render_list(on_demand_ids))
            self.ec2.meta.client.stop_instances(InstanceIds=on_demand_ids)
        if spot_ids:
            cli_logger.print(
                "Terminating instances {} " +
                cf.dimmed("(cannot stop spot instances, only terminate)"),
                cli_logger.render_list(spot_ids))
            self.ec2.meta.client.terminate_instances(InstanceIds=spot_ids)
    else:
        self.ec2.meta.client.terminate_instances(InstanceIds=node_ids)

def teardown_cluster(config_file: str, yes: bool,
                     override_cluster_name: Optional[str]) -> None:
    config = yaml.safe_load(open(config_file).read())
    if override_cluster_name is not None:
        config["cluster_name"] = override_cluster_name
    config = _bootstrap_config(config)

    cli_logger.confirm(yes, "Destroying cluster.", _abort=True)

    provider = _get_node_provider(config["provider"], config["cluster_name"])

    A = provider.non_terminated_nodes({TAG_NODE_KIND: NODE_KIND_WORKER})
    with LogTimer("teardown_cluster: done."):
        while A:
            provider.terminate_nodes(A)

            cli_logger.print(
                "Requested {} nodes to shut down.",
                cf.bold(len(A)),
                _tags=dict(interval="1s"))

            time.sleep(POLL_INTERVAL)  # todo: interval should be a variable
            A = provider.non_terminated_nodes(
                {TAG_NODE_KIND: NODE_KIND_WORKER})
            cli_logger.print("{} nodes remaining after {} second(s).",
                             cf.bold(len(A)), POLL_INTERVAL)
        cli_logger.success("No nodes remaining.")

def wait_ready(self, deadline):
    with cli_logger.group(
            "Waiting for SSH to become available",
            _numbered=("[]", 1, NUM_SETUP_STEPS)):
        with LogTimer(self.log_prefix + "Got remote shell"):
            cli_logger.print("Running `{}` as a test.", cf.bold("uptime"))
            first_conn_refused_time = None

            while time.time() < deadline and \
                    not self.provider.is_terminated(self.node_id):
                try:
                    # Run outside of the container
                    self.cmd_runner.run("uptime", timeout=5, run_env="host")
                    cli_logger.success("Success.")
                    return True
                except ProcessRunnerError as e:
                    first_conn_refused_time = \
                        cmd_output_util.handle_ssh_fails(
                            e,
                            first_conn_refused_time,
                            retry_interval=READY_CHECK_INTERVAL)
                    time.sleep(READY_CHECK_INTERVAL)
                except Exception as e:
                    # TODO(maximsmol): we should not be ignoring
                    # exceptions if they get filtered properly
                    # (new style log + non-interactive shells)
                    #
                    # however threading this configuration state
                    # is a pain and I'm leaving it for later
                    retry_str = "(" + str(e) + ")"
                    if hasattr(e, "cmd"):
                        if isinstance(e.cmd, str):
                            cmd_ = e.cmd
                        elif isinstance(e.cmd, list):
                            cmd_ = " ".join(e.cmd)
                        else:
                            logger.debug(f"e.cmd type ({type(e.cmd)}) not "
                                         "list or str.")
                            cmd_ = str(e.cmd)
                        retry_str = "(Exit Status {}): {}".format(
                            e.returncode, cmd_)

                    cli_logger.print(
                        "SSH still not available {}, "
                        "retrying in {} seconds.", cf.dimmed(retry_str),
                        cf.bold(str(READY_CHECK_INTERVAL)))

                    time.sleep(READY_CHECK_INTERVAL)

    assert False, "Unable to connect to node"

def handle_ssh_fails(e, first_conn_refused_time, retry_interval):
    """Handle SSH system failures coming from a subprocess.

    Args:
        e: The `ProcessRunnerException` to handle.
        first_conn_refused_time: The time (as reported by this function) or
            None, indicating the last time a CONN_REFUSED error was caught.

            After exceeding a patience value, the program will be aborted
            since SSH will likely never recover.
        retry_interval: The interval after which the command will be retried,
            used here just to inform the user.
    """
    if e.msg_type != "ssh_command_failed":
        return

    if e.special_case == "ssh_conn_refused":
        if first_conn_refused_time is not None and \
                time.time() - first_conn_refused_time > \
                CONN_REFUSED_PATIENCE:
            cli_logger.error(
                "SSH connection was being refused "
                "for {} seconds. Head node assumed "
                "unreachable.", cf.bold(str(CONN_REFUSED_PATIENCE)))
            cli_logger.abort("Check the node's firewall settings "
                             "and the cloud network configuration.")

        cli_logger.warning("SSH connection was refused.")
        cli_logger.warning("This might mean that the SSH daemon is "
                           "still setting up, or that "
                           "the host is inaccessible (e.g. due to "
                           "a firewall).")

        return time.time()

    if e.special_case in ["ssh_timeout", "ssh_conn_refused"]:
        cli_logger.print("SSH still not available, "
                         "retrying in {} seconds.",
                         cf.bold(str(retry_interval)))
    else:
        raise e

    return first_conn_refused_time

def _wait_for_ip(self, deadline):
    # if we have IP do not print waiting info
    ip = self._get_node_ip()
    if ip is not None:
        cli_logger.labeled_value("Fetched IP", ip)
        return ip

    interval = 10
    with cli_logger.group("Waiting for IP"):
        while time.time() < deadline and \
                not self.provider.is_terminated(self.node_id):
            ip = self._get_node_ip()
            if ip is not None:
                cli_logger.labeled_value("Received", ip)
                return ip
            cli_logger.print("Not yet available, retrying in {} seconds",
                             cf.bold(str(interval)))
            time.sleep(interval)

    return None

def do_update(self):
    self.provider.set_node_tags(
        self.node_id, {TAG_NODE_STATUS: STATUS_WAITING_FOR_SSH})
    cli_logger.labeled_value("New status", STATUS_WAITING_FOR_SSH)

    deadline = time.time() + AUTOSCALER_NODE_START_WAIT_S
    self.wait_ready(deadline)
    global_event_system.execute_callback(
        CreateClusterEvent.ssh_control_acquired)

    node_tags = self.provider.node_tags(self.node_id)
    logger.debug("Node tags: {}".format(str(node_tags)))

    if node_tags.get(TAG_RUNTIME_CONFIG) == self.runtime_hash:
        # When resuming from a stopped instance the runtime_hash may be the
        # same, but the container will not be started.
        init_required = self.cmd_runner.run_init(
            as_head=self.is_head_node,
            file_mounts=self.file_mounts,
            sync_run_yet=False)
        if init_required:
            node_tags[TAG_RUNTIME_CONFIG] += "-invalidate"
            # This ensures that `setup_commands` are not removed
            self.restart_only = False

    if self.restart_only:
        self.setup_commands = []

    # runtime_hash will only change whenever the user restarts
    # or updates their cluster with `get_or_create_head_node`
    if node_tags.get(TAG_RUNTIME_CONFIG) == self.runtime_hash and (
            not self.file_mounts_contents_hash
            or node_tags.get(TAG_FILE_MOUNTS_CONTENTS) ==
            self.file_mounts_contents_hash):
        # todo: we lie in the confirmation message since
        # full setup might be cancelled here
        cli_logger.print(
            "Configuration already up to date, "
            "skipping file mounts, initialization and setup commands.",
            _numbered=("[]", "2-6", NUM_SETUP_STEPS))
    else:
        cli_logger.print(
            "Updating cluster configuration.",
            _tags=dict(hash=self.runtime_hash))

        self.provider.set_node_tags(
            self.node_id, {TAG_NODE_STATUS: STATUS_SYNCING_FILES})
        cli_logger.labeled_value("New status", STATUS_SYNCING_FILES)
        self.sync_file_mounts(
            self.rsync_up, step_numbers=(1, NUM_SETUP_STEPS))

        # Only run setup commands if runtime_hash has changed because
        # we don't want to run setup_commands every time the head node
        # file_mounts folders have changed.
        if node_tags.get(TAG_RUNTIME_CONFIG) != self.runtime_hash:
            # Run init commands
            self.provider.set_node_tags(
                self.node_id, {TAG_NODE_STATUS: STATUS_SETTING_UP})
            cli_logger.labeled_value("New status", STATUS_SETTING_UP)

            if self.initialization_commands:
                with cli_logger.group(
                        "Running initialization commands",
                        _numbered=("[]", 4, NUM_SETUP_STEPS)):
                    global_event_system.execute_callback(
                        CreateClusterEvent.run_initialization_cmd)
                    with LogTimer(
                            self.log_prefix + "Initialization commands",
                            show_status=True):
                        for cmd in self.initialization_commands:
                            global_event_system.execute_callback(
                                CreateClusterEvent.run_initialization_cmd,
                                {"command": cmd})
                            try:
                                # Overriding the existing SSHOptions class
                                # with a new SSHOptions class that uses
                                # this ssh_private_key as its only __init__
                                # argument.
                                # Run outside docker.
                                self.cmd_runner.run(
                                    cmd,
                                    ssh_options_override_ssh_key=self.
                                    auth_config.get("ssh_private_key"),
                                    run_env="host")
                            except ProcessRunnerError as e:
                                if e.msg_type == "ssh_command_failed":
                                    cli_logger.error("Failed.")
                                    cli_logger.error("See above for stderr.")

                                raise click.ClickException(
                                    "Initialization command failed."
                                ) from None
            else:
                cli_logger.print(
                    "No initialization commands to run.",
                    _numbered=("[]", 4, NUM_SETUP_STEPS))
            with cli_logger.group(
                    "Initializing command runner",
                    # todo: fix command numbering
                    _numbered=("[]", 5, NUM_SETUP_STEPS)):
                self.cmd_runner.run_init(
                    as_head=self.is_head_node,
                    file_mounts=self.file_mounts,
                    sync_run_yet=True)
            if self.setup_commands:
                with cli_logger.group(
                        "Running setup commands",
                        # todo: fix command numbering
                        _numbered=("[]", 6, NUM_SETUP_STEPS)):
                    global_event_system.execute_callback(
                        CreateClusterEvent.run_setup_cmd)
                    with LogTimer(
                            self.log_prefix + "Setup commands",
                            show_status=True):
                        total = len(self.setup_commands)
                        for i, cmd in enumerate(self.setup_commands):
                            global_event_system.execute_callback(
                                CreateClusterEvent.run_setup_cmd,
                                {"command": cmd})
                            if cli_logger.verbosity == 0 and len(cmd) > 30:
                                cmd_to_print = cf.bold(cmd[:30]) + "..."
                            else:
                                cmd_to_print = cf.bold(cmd)

                            cli_logger.print(
                                "{}",
                                cmd_to_print,
                                _numbered=("()", i, total))

                            try:
                                # Runs in the container if docker is in use
                                self.cmd_runner.run(cmd, run_env="auto")
                            except ProcessRunnerError as e:
                                if e.msg_type == "ssh_command_failed":
                                    cli_logger.error("Failed.")
                                    cli_logger.error("See above for stderr.")

                                raise click.ClickException(
                                    "Setup command failed.")
            else:
                cli_logger.print(
                    "No setup commands to run.",
                    _numbered=("[]", 6, NUM_SETUP_STEPS))

def sync_file_mounts(self, sync_cmd, step_numbers=(0, 2)):
    # step_numbers is (# of previous steps, total steps)
    previous_steps, total_steps = step_numbers

    nolog_paths = []
    if cli_logger.verbosity == 0:
        nolog_paths = [
            "~/ray_bootstrap_key.pem", "~/ray_bootstrap_config.yaml"
        ]

    def do_sync(remote_path, local_path, allow_non_existing_paths=False):
        if allow_non_existing_paths and not os.path.exists(local_path):
            cli_logger.print("sync: {} does not exist. Skipping.", local_path)
            # Ignore missing source files. In the future we should support
            # the --delete-missing-args command to delete files that have
            # been removed
            return

        assert os.path.exists(local_path), local_path

        if os.path.isdir(local_path):
            if not local_path.endswith("/"):
                local_path += "/"
            if not remote_path.endswith("/"):
                remote_path += "/"

        with LogTimer(self.log_prefix +
                      "Synced {} to {}".format(local_path, remote_path)):
            is_docker = (self.docker_config
                         and self.docker_config["container_name"] != "")
            if not is_docker:
                # The DockerCommandRunner handles this internally.
                self.cmd_runner.run(
                    "mkdir -p {}".format(os.path.dirname(remote_path)),
                    run_env="host")
            sync_cmd(local_path, remote_path, docker_mount_if_possible=True)

            if remote_path not in nolog_paths:
                # todo: timed here?
                cli_logger.print("{} from {}", cf.bold(remote_path),
                                 cf.bold(local_path))

    # Rsync file mounts
    with cli_logger.group(
            "Processing file mounts",
            _numbered=("[]", previous_steps + 1, total_steps)):
        for remote_path, local_path in self.file_mounts.items():
            do_sync(remote_path, local_path)
        previous_steps += 1

    if self.cluster_synced_files:
        with cli_logger.group(
                "Processing worker file mounts",
                _numbered=("[]", previous_steps + 1, total_steps)):
            cli_logger.print("synced files: {}",
                             str(self.cluster_synced_files))
            for path in self.cluster_synced_files:
                do_sync(path, path, allow_non_existing_paths=True)
            previous_steps += 1
    else:
        cli_logger.print(
            "No worker file mounts to sync",
            _numbered=("[]", previous_steps + 1, total_steps))

def _create_node(self, node_config, tags, count):
    created_nodes_dict = {}

    tags = to_aws_format(tags)
    conf = node_config.copy()

    tag_pairs = [{
        "Key": TAG_CLUSTER_NAME,
        "Value": self.cluster_name,
    }]
    for k, v in tags.items():
        tag_pairs.append({
            "Key": k,
            "Value": v,
        })
    tag_specs = [{
        "ResourceType": "instance",
        "Tags": tag_pairs,
    }]
    user_tag_specs = conf.get("TagSpecifications", [])
    # Allow users to add tags and override values of existing
    # tags with their own. This only applies to the resource type
    # "instance". All other resource types are appended to the list of
    # tag specs.
    for user_tag_spec in user_tag_specs:
        if user_tag_spec["ResourceType"] == "instance":
            for user_tag in user_tag_spec["Tags"]:
                exists = False
                for tag in tag_specs[0]["Tags"]:
                    if user_tag["Key"] == tag["Key"]:
                        exists = True
                        tag["Value"] = user_tag["Value"]
                        break
                if not exists:
                    tag_specs[0]["Tags"] += [user_tag]
        else:
            tag_specs += [user_tag_spec]

    # SubnetIds is not a real config key: we must resolve to a
    # single SubnetId before invoking the AWS API.
    subnet_ids = conf.pop("SubnetIds")

    for attempt in range(1, BOTO_CREATE_MAX_RETRIES + 1):
        try:
            subnet_id = subnet_ids[self.subnet_idx % len(subnet_ids)]
            self.subnet_idx += 1
            conf.update({
                "MinCount": 1,
                "MaxCount": count,
                "SubnetId": subnet_id,
                "TagSpecifications": tag_specs
            })
            created = self.ec2_fail_fast.create_instances(**conf)
            created_nodes_dict = {n.id: n for n in created}

            # todo: timed?
            # todo: handle plurality?
            with cli_logger.group(
                    "Launched {} nodes",
                    count,
                    _tags=dict(subnet_id=subnet_id)):
                for instance in created:
                    # NOTE(maximsmol): This is needed for mocking
                    # boto3 for tests. This is likely a bug in moto
                    # but AWS docs don't seem to say.
                    # You can patch moto/ec2/responses/instances.py
                    # to fix this (add <stateReason> to EC2_RUN_INSTANCES)

                    # The correct value is technically
                    # {"code": "0", "Message": "pending"}
                    state_reason = instance.state_reason or {
                        "Message": "pending"
                    }

                    cli_logger.print(
                        "Launched instance {}",
                        instance.instance_id,
                        _tags=dict(
                            state=instance.state["Name"],
                            info=state_reason["Message"]))
            break
        except botocore.exceptions.ClientError as exc:
            if attempt == BOTO_CREATE_MAX_RETRIES:
                # todo: err msg
                cli_logger.abort(
                    "Failed to launch instances. Max attempts exceeded.")
                raise exc
            else:
                cli_logger.print(
                    "create_instances: Attempt failed with {}, retrying.",
                    exc)
    return created_nodes_dict

def create_node(self, node_config, tags, count) -> Dict[str, Any]:
    """Creates instances.

    Returns dict mapping instance id to ec2.Instance object for the created
    instances.
    """
    tags = copy.deepcopy(tags)

    reused_nodes_dict = {}
    # Try to reuse previously stopped nodes with compatible configs
    if self.cache_stopped_nodes:
        # TODO(ekl) this is breaking the abstraction boundary a little by
        # peeking into the tag set.
        filters = [
            {
                "Name": "instance-state-name",
                "Values": ["stopped", "stopping"],
            },
            {
                "Name": "tag:{}".format(TAG_CLUSTER_NAME),
                "Values": [self.cluster_name],
            },
            {
                "Name": "tag:{}".format(TAG_NODE_KIND),
                "Values": [tags[TAG_NODE_KIND]],
            },
            {
                "Name": "tag:{}".format(TAG_LAUNCH_CONFIG),
                "Values": [tags[TAG_LAUNCH_CONFIG]],
            },
        ]
        # This tag may not always be present.
        if TAG_USER_NODE_TYPE in tags:
            filters.append({
                "Name": "tag:{}".format(TAG_USER_NODE_TYPE),
                "Values": [tags[TAG_USER_NODE_TYPE]],
            })

        reuse_nodes = list(
            self.ec2.instances.filter(Filters=filters))[:count]
        reuse_node_ids = [n.id for n in reuse_nodes]
        reused_nodes_dict = {n.id: n for n in reuse_nodes}
        if reuse_nodes:
            cli_logger.print(
                # todo: handle plural vs singular?
                "Reusing nodes {}. "
                "To disable reuse, set `cache_stopped_nodes: False` "
                "under `provider` in the cluster configuration.",
                cli_logger.render_list(reuse_node_ids))

            # todo: timed?
            with cli_logger.group("Stopping instances to reuse"):
                for node in reuse_nodes:
                    self.tag_cache[node.id] = from_aws_format(
                        {x["Key"]: x["Value"]
                         for x in node.tags})
                    if node.state["Name"] == "stopping":
                        cli_logger.print("Waiting for instance {} to stop",
                                         node.id)
                        node.wait_until_stopped()

            self.ec2.meta.client.start_instances(InstanceIds=reuse_node_ids)
            for node_id in reuse_node_ids:
                self.set_node_tags(node_id, tags)
            count -= len(reuse_node_ids)

    created_nodes_dict = {}
    if count:
        created_nodes_dict = self._create_node(node_config, tags, count)

    all_created_nodes = reused_nodes_dict
    all_created_nodes.update(created_nodes_dict)
    return all_created_nodes

def create_nodes(config: Dict[str, Any],
                 yes: bool,
                 _provider: Optional[NodeProvider] = None,
                 _runner: ModuleType = subprocess) -> None:
    provider = (_provider or _get_node_provider(config["provider"],
                                                config["cluster_name"]))
    worker_filter = {TAG_NODE_KIND: NODE_KIND_WORKER}
    launch_config = copy.deepcopy(config["worker_nodes"])
    launch_hash = hash_launch_conf(launch_config, config["auth"])
    count = int(config["num_workers"])
    cli_logger.print("Launching {} nodes.".format(count))
    node_config = copy.deepcopy(config["worker_nodes"])
    node_tags = {
        TAG_NODE_NAME: "cls-{}-worker".format(config["cluster_name"]),
        TAG_NODE_KIND: NODE_KIND_WORKER,
        TAG_NODE_STATUS: STATUS_UNINITIALIZED,
        TAG_LAUNCH_CONFIG: launch_hash,
    }
    provider.create_node(node_config, node_tags, count)

    start = time.time()
    workers = []
    prev = start
    with cli_logger.group("Fetching the new worker nodes"):
        while True:
            nodes = provider.non_terminated_nodes(worker_filter)
            cur = time.time()
            if cur - prev > 50:
                prev = cur
            if len(nodes) >= count:
                workers = nodes
                break
            time.sleep(POLL_INTERVAL)
    cli_logger.newline()

    updaters = []
    (runtime_hash,
     file_mounts_contents_hash) = hash_runtime_conf(config["file_mounts"],
                                                    None, config)
    for worker in workers:
        updater = NodeUpdaterThread(
            node_id=worker,
            provider_config=config["provider"],
            provider=provider,
            auth_config=config["auth"],
            cluster_name=config["cluster_name"],
            file_mounts=config["file_mounts"],
            initialization_commands=config["initialization_commands"],
            setup_commands=config["worker_setup_commands"],
            process_runner=_runner,
            runtime_hash=runtime_hash,
            is_head_node=False,
            file_mounts_contents_hash=file_mounts_contents_hash,
            rsync_options={
                "rsync_exclude": config.get("rsync_exclude"),
                "rsync_filter": config.get("rsync_filter")
            },
        )
        updater.start()
        updaters.append(updater)

    for up in updaters:
        up.join()
        provider.non_terminated_nodes(worker_filter)
        if up.exitcode != 0:
            cli_logger.abort("Failed to set up worker node.")

def _bootstrap_config(config: Dict[str, Any],
                      no_config_cache: bool = False) -> Dict[str, Any]:
    config = prepare_config(config)

    hasher = hashlib.sha1()
    hasher.update(json.dumps([config], sort_keys=True).encode("utf-8"))
    cache_key = os.path.join(tempfile.gettempdir(),
                             "ray-config-{}".format(hasher.hexdigest()))

    if os.path.exists(cache_key) and not no_config_cache:
        config_cache = json.loads(open(cache_key).read())
        if config_cache.get("_version", -1) == CONFIG_CACHE_VERSION:
            # todo: is it fine to re-resolve? afaik it should be.
            # we can have migrations otherwise or something
            # but this seems overcomplicated given that resolving is
            # relatively cheap
            try_reload_log_state(config_cache["config"]["provider"],
                                 config_cache.get("provider_log_info"))

            if log_once("_printed_cached_config_warning"):
                cli_logger.verbose_warning(
                    "Loaded cached provider configuration "
                    "from " + cf.bold("{}"), cache_key)
                if cli_logger.verbosity == 0:
                    cli_logger.warning("Loaded cached provider configuration")
                cli_logger.warning(
                    "If you experience issues with "
                    "the cloud provider, try re-running "
                    "the command with {}.", cf.bold("--no-config-cache"))

            return config_cache["config"]
        else:
            cli_logger.warning(
                "Found cached cluster config "
                "but the version " + cf.bold("{}") + " "
                "(expected " + cf.bold("{}") + ") does not match.\n"
                "This is normal if the cluster launcher was updated.\n"
                "Config will be re-resolved.",
                config_cache.get("_version", "none"), CONFIG_CACHE_VERSION)

    importer = _NODE_PROVIDERS.get(config["provider"]["type"])
    if not importer:
        raise NotImplementedError("Unsupported provider {}".format(
            config["provider"]))

    provider_cls = importer(config["provider"])

    cli_logger.print("Checking {} environment settings",
                     _PROVIDER_PRETTY_NAMES.get(config["provider"]["type"]))
    try:
        config = provider_cls.fillout_available_node_types_resources(config)
    except Exception as exc:
        if cli_logger.verbosity > 2:
            logger.exception("Failed to autodetect node resources.")
        else:
            cli_logger.warning(
                f"Failed to autodetect node resources: {str(exc)}. "
                "You can see full stack trace with higher verbosity.")

    # NOTE: if `resources` field is missing, validate_config for providers
    # other than AWS and Kubernetes will fail (the schema error will ask the
    # user to manually fill the resources) as we currently support autofilling
    # resources for AWS and Kubernetes only.
    validate_config(config)
    resolved_config = provider_cls.bootstrap_config(config)

    if not no_config_cache:
        with open(cache_key, "w") as f:
            config_cache = {
                "_version": CONFIG_CACHE_VERSION,
                "provider_log_info": try_get_log_state(config["provider"]),
                "config": resolved_config
            }
            f.write(json.dumps(config_cache))
    return resolved_config