Example #1
def _get_head_node(config,
                   config_file,
                   override_cluster_name,
                   create_if_needed=False):
    provider = get_node_provider(config["provider"], config["cluster_name"])
    try:
        head_node_tags = {
            TAG_RAY_NODE_TYPE: "head",
        }
        nodes = provider.non_terminated_nodes(head_node_tags)
    finally:
        provider.cleanup()

    if len(nodes) > 0:
        head_node = nodes[0]
        return head_node
    elif create_if_needed:
        get_or_create_head_node(
            config,
            config_file,
            restart_only=False,
            no_restart=False,
            yes=True,
            override_cluster_name=override_cluster_name)
        return _get_head_node(
            config, config_file, override_cluster_name, create_if_needed=False)
    else:
        raise RuntimeError("Head node of cluster ({}) not found!".format(
            config["cluster_name"]))
Example #2
    def __init__(
            self, node_id, provider_config, auth_config, cluster_name,
            file_mounts, setup_cmds, runtime_hash, redirect_output=True,
            process_runner=subprocess):
        self.daemon = True
        self.process_runner = process_runner
        self.provider = get_node_provider(provider_config, cluster_name)
        self.ssh_private_key = auth_config["ssh_private_key"]
        self.ssh_user = auth_config["ssh_user"]
        self.ssh_ip = self.provider.external_ip(node_id)
        self.node_id = node_id
        self.file_mounts = file_mounts
        self.setup_cmds = setup_cmds
        self.runtime_hash = runtime_hash
        if redirect_output:
            self.logfile = tempfile.NamedTemporaryFile(
                mode="w", prefix="node-updater-", delete=False)
            self.output_name = self.logfile.name
            self.stdout = self.logfile
            self.stderr = self.logfile
        else:
            self.logfile = None
            self.output_name = "(console)"
            self.stdout = sys.stdout
            self.stderr = sys.stderr
Example #3
    def __init__(
            self, config_path, load_metrics,
            max_concurrent_launches=AUTOSCALER_MAX_CONCURRENT_LAUNCHES,
            max_failures=AUTOSCALER_MAX_NUM_FAILURES,
            process_runner=subprocess, verbose_updates=False,
            node_updater_cls=NodeUpdaterProcess,
            update_interval_s=AUTOSCALER_UPDATE_INTERVAL_S):
        self.config_path = config_path
        self.reload_config(errors_fatal=True)
        self.load_metrics = load_metrics
        self.provider = get_node_provider(
            self.config["provider"], self.config["cluster_name"])

        self.max_failures = max_failures
        self.max_concurrent_launches = max_concurrent_launches
        self.verbose_updates = verbose_updates
        self.process_runner = process_runner
        self.node_updater_cls = node_updater_cls

        # Map from node_id to NodeUpdater processes
        self.updaters = {}
        self.num_failed_updates = defaultdict(int)
        self.num_successful_updates = defaultdict(int)
        self.num_failures = 0
        self.last_update_time = 0.0
        self.update_interval_s = update_interval_s

        for local_path in self.config["file_mounts"].values():
            assert os.path.exists(local_path)

        print("StandardAutoscaler: {}".format(self.config))
Example #4
    def __init__(self,
                 config_path,
                 load_metrics,
                 max_launch_batch=AUTOSCALER_MAX_LAUNCH_BATCH,
                 max_concurrent_launches=AUTOSCALER_MAX_CONCURRENT_LAUNCHES,
                 max_failures=AUTOSCALER_MAX_NUM_FAILURES,
                 process_runner=subprocess,
                 update_interval_s=AUTOSCALER_UPDATE_INTERVAL_S):
        self.config_path = config_path
        self.reload_config(errors_fatal=True)
        self.load_metrics = load_metrics
        self.provider = get_node_provider(self.config["provider"],
                                          self.config["cluster_name"])

        self.max_failures = max_failures
        self.max_launch_batch = max_launch_batch
        self.max_concurrent_launches = max_concurrent_launches
        self.process_runner = process_runner

        # Map from node_id to NodeUpdater processes
        self.updaters = {}
        self.num_failed_updates = defaultdict(int)
        self.num_successful_updates = defaultdict(int)
        self.num_failures = 0
        self.last_update_time = 0.0
        self.update_interval_s = update_interval_s
        self.bringup = True

        # Node launchers
        self.launch_queue = queue.Queue()
        self.num_launches_pending = ConcurrentCounter()
        max_batches = math.ceil(
            max_concurrent_launches / float(max_launch_batch))
        for i in range(int(max_batches)):
            node_launcher = NodeLauncher(
                provider=self.provider,
                queue=self.launch_queue,
                pending=self.num_launches_pending)
            node_launcher.daemon = True
            node_launcher.start()

        # Expand local file_mounts to allow ~ in the paths. This can't be done
        # earlier when the config is written since we might be on a
        # different platform and the expansion would result in the wrong path.
        self.config["file_mounts"] = {
            remote: os.path.expanduser(local)
            for remote, local in self.config["file_mounts"].items()
        }

        for local_path in self.config["file_mounts"].values():
            assert os.path.exists(local_path)

        logger.info("StandardAutoscaler: {}".format(self.config))
Example #5
def get_head_node_ip(config_file):
    """Returns head node IP for given configuration file if exists."""

    config = yaml.load(open(config_file).read())
    provider = get_node_provider(config["provider"], config["cluster_name"])
    head_node_tags = {
        TAG_RAY_NODE_TYPE: "Head",
    }
    nodes = provider.nodes(head_node_tags)
    if len(nodes) > 0:
        head_node = nodes[0]
        return provider.external_ip(head_node)
    else:
        print("Head node of cluster ({}) not found!".format(
            config["cluster_name"]))
        sys.exit(1)
Example #6
def get_worker_node_ips(config_file, override_cluster_name):
    """Returns worker node IPs for given configuration file."""

    config = yaml.load(open(config_file).read())
    if override_cluster_name is not None:
        config["cluster_name"] = override_cluster_name

    provider = get_node_provider(config["provider"], config["cluster_name"])
    try:
        nodes = provider.non_terminated_nodes({TAG_RAY_NODE_TYPE: "worker"})

        if config.get("provider", {}).get("use_internal_ips", False) is True:
            return [provider.internal_ip(node) for node in nodes]
        else:
            return [provider.external_ip(node) for node in nodes]
    finally:
        provider.cleanup()
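Several of the examples on this page (e.g. #6-#8) repeat the same branch on the provider's use_internal_ips setting to choose between internal_ip and external_ip. A small helper capturing that pattern could look like the sketch below; node_ip is a hypothetical name, not part of the Ray API:

def node_ip(provider, config, node):
    # Mirrors the branch used throughout these examples: prefer the internal
    # address only when the provider config explicitly asks for it.
    if config.get("provider", {}).get("use_internal_ips", False) is True:
        return provider.internal_ip(node)
    return provider.external_ip(node)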
Example #7
def get_head_node_ip(config_file, override_cluster_name):
    """Returns head node IP for given configuration file if exists."""

    config = yaml.load(open(config_file).read())
    if override_cluster_name is not None:
        config["cluster_name"] = override_cluster_name

    provider = get_node_provider(config["provider"], config["cluster_name"])
    try:
        head_node = _get_head_node(config, config_file, override_cluster_name)
        if config.get("provider", {}).get("use_internal_ips", False) is True:
            head_node_ip = provider.internal_ip(head_node)
        else:
            head_node_ip = provider.external_ip(head_node)
    finally:
        provider.cleanup()

    return head_node_ip
Example #8
def kill_node(config_file, yes, hard, override_cluster_name):
    """Kills a random Raylet worker."""

    config = yaml.safe_load(open(config_file).read())
    if override_cluster_name is not None:
        config["cluster_name"] = override_cluster_name
    config = _bootstrap_config(config)

    confirm("This will kill a node in your cluster", yes)

    provider = get_node_provider(config["provider"], config["cluster_name"])
    try:
        nodes = provider.non_terminated_nodes({
            TAG_RAY_NODE_TYPE: NODE_TYPE_WORKER
        })
        node = random.choice(nodes)
        logger.info("kill_node: Shutdown worker {}".format(node))
        if hard:
            provider.terminate_node(node)
        else:
            updater = NodeUpdaterThread(
                node_id=node,
                provider_config=config["provider"],
                provider=provider,
                auth_config=config["auth"],
                cluster_name=config["cluster_name"],
                file_mounts=config["file_mounts"],
                initialization_commands=[],
                setup_commands=[],
                ray_start_commands=[],
                runtime_hash="")

            _exec(updater, "ray stop", False, False)

        time.sleep(5)

        if config.get("provider", {}).get("use_internal_ips", False) is True:
            node_ip = provider.internal_ip(node)
        else:
            node_ip = provider.external_ip(node)
    finally:
        provider.cleanup()

    return node_ip
Example #9
def teardown_cluster(config_file, yes, workers_only, override_cluster_name):
    """Destroys all nodes of a Ray cluster described by a config json."""

    config = yaml.load(open(config_file).read())
    if override_cluster_name is not None:
        config["cluster_name"] = override_cluster_name
    validate_config(config)
    config = fillout_defaults(config)

    confirm("This will destroy your cluster", yes)

    provider = get_node_provider(config["provider"], config["cluster_name"])

    try:

        def remaining_nodes():
            if workers_only:
                A = []
            else:
                A = [
                    node_id for node_id in provider.nodes({
                        TAG_RAY_NODE_TYPE: "head"
                    })
                ]

            A += [
                node_id for node_id in provider.nodes({
                    TAG_RAY_NODE_TYPE: "worker"
                })
            ]
            return A

        # Loop here to check that both the head and worker nodes are actually
        #   really gone
        A = remaining_nodes()
        with LogTimer("teardown_cluster: Termination done."):
            while A:
                logger.info("teardown_cluster: "
                            "Terminating {} nodes...".format(len(A)))
                provider.terminate_nodes(A)
                time.sleep(1)
                A = remaining_nodes()
    finally:
        provider.cleanup()
Example #10
def teardown_cluster(config_file, yes, workers_only, override_cluster_name):
    """Destroys all nodes of a Ray cluster described by a config json."""

    config = yaml.load(open(config_file).read())
    if override_cluster_name is not None:
        config["cluster_name"] = override_cluster_name
    validate_config(config)
    config = fillout_defaults(config)

    confirm("This will destroy your cluster", yes)

    provider = get_node_provider(config["provider"], config["cluster_name"])

    try:

        def remaining_nodes():
            if workers_only:
                A = []
            else:
                A = [
                    node_id for node_id in provider.non_terminated_nodes({
                        TAG_RAY_NODE_TYPE: "head"
                    })
                ]

            A += [
                node_id for node_id in provider.non_terminated_nodes({
                    TAG_RAY_NODE_TYPE: "worker"
                })
            ]
            return A

        # Loop here to check that both the head and worker nodes are actually
        #   really gone
        A = remaining_nodes()
        with LogTimer("teardown_cluster: Termination done."):
            while A:
                logger.info("teardown_cluster: "
                            "Terminating {} nodes...".format(len(A)))
                provider.terminate_nodes(A)
                time.sleep(1)
                A = remaining_nodes()
    finally:
        provider.cleanup()
Example #11
    def _launch_node(self, config, count):
        if self.provider is None:
            self.provider = get_node_provider(config["provider"],
                                              config["cluster_name"])

        tag_filters = {TAG_RAY_NODE_TYPE: "worker"}
        before = self.provider.nodes(tag_filters=tag_filters)
        launch_hash = hash_launch_conf(config["worker_nodes"], config["auth"])
        self.provider.create_node(
            config["worker_nodes"], {
                TAG_RAY_NODE_NAME: "ray-{}-worker".format(
                    config["cluster_name"]),
                TAG_RAY_NODE_TYPE: "worker",
                TAG_RAY_NODE_STATUS: "uninitialized",
                TAG_RAY_LAUNCH_CONFIG: launch_hash,
            }, count)
        after = self.provider.nodes(tag_filters=tag_filters)
        if set(after).issubset(before):
            logger.error("No new nodes reported after node creation")
Example #12
def get_head_node_ip(config_file: str,
                     override_cluster_name: Optional[str]) -> str:
    """Returns head node IP for given configuration file if exists."""

    config = yaml.safe_load(open(config_file).read())
    if override_cluster_name is not None:
        config["cluster_name"] = override_cluster_name

    provider = get_node_provider(config["provider"], config["cluster_name"])
    try:
        head_node = _get_head_node(config, config_file, override_cluster_name)
        if config.get("provider", {}).get("use_internal_ips", False) is True:
            head_node_ip = provider.internal_ip(head_node)
        else:
            head_node_ip = provider.external_ip(head_node)
    finally:
        provider.cleanup()

    return head_node_ip
Example #13
def get_worker_node_ips(config_file, override_cluster_name):
    """Returns worker node IPs for given configuration file."""

    config = yaml.safe_load(open(config_file).read())
    if override_cluster_name is not None:
        config["cluster_name"] = override_cluster_name

    provider = get_node_provider(config["provider"], config["cluster_name"])
    try:
        nodes = provider.non_terminated_nodes({
            TAG_RAY_NODE_TYPE: NODE_TYPE_WORKER
        })

        if config.get("provider", {}).get("use_internal_ips", False) is True:
            return [provider.internal_ip(node) for node in nodes]
        else:
            return [provider.external_ip(node) for node in nodes]
    finally:
        provider.cleanup()
Example #14
def teardown_cluster(config_file):
    """Destroys all nodes of a Ray cluster described by a config json."""

    config = yaml.load(open(config_file).read())

    validate_config(config)
    provider = get_node_provider(config["provider"], config["cluster_name"])
    head_node_tags = {
        TAG_RAY_NODE_TYPE: "Head",
    }
    for node in provider.nodes(head_node_tags):
        print("Terminating head node {}".format(node))
        provider.terminate_node(node)
    nodes = provider.nodes({})
    while nodes:
        for node in nodes:
            print("Terminating worker {}".format(node))
            provider.terminate_node(node)
        time.sleep(5)
        nodes = provider.nodes({})
Example #15
def _get_head_node(config, config_file, create_if_needed=False):
    provider = get_node_provider(config["provider"], config["cluster_name"])
    head_node_tags = {
        TAG_RAY_NODE_TYPE: "head",
    }
    nodes = provider.nodes(head_node_tags)
    if len(nodes) > 0:
        head_node = nodes[0]
        return head_node
    elif create_if_needed:
        get_or_create_head_node(config,
                                config_file,
                                restart_only=False,
                                no_restart=False,
                                yes=True)
        return _get_head_node(config, config_file, create_if_needed=False)
    else:
        print("Head node of cluster ({}) not found!".format(
            config["cluster_name"]))
        sys.exit(1)
Example #16
    def __init__(self,
                 config_path,
                 load_metrics,
                 max_concurrent_launches=AUTOSCALER_MAX_CONCURRENT_LAUNCHES,
                 max_failures=AUTOSCALER_MAX_NUM_FAILURES,
                 process_runner=subprocess,
                 verbose_updates=True,
                 node_updater_cls=NodeUpdaterProcess,
                 update_interval_s=AUTOSCALER_UPDATE_INTERVAL_S):
        self.config_path = config_path
        self.reload_config(errors_fatal=True)
        self.load_metrics = load_metrics
        self.provider = get_node_provider(self.config["provider"],
                                          self.config["cluster_name"])

        self.max_failures = max_failures
        self.max_concurrent_launches = max_concurrent_launches
        self.verbose_updates = verbose_updates
        self.process_runner = process_runner
        self.node_updater_cls = node_updater_cls

        # Map from node_id to NodeUpdater processes
        self.updaters = {}
        self.num_failed_updates = defaultdict(int)
        self.num_successful_updates = defaultdict(int)
        self.num_failures = 0
        self.last_update_time = 0.0
        self.update_interval_s = update_interval_s

        # Expand local file_mounts to allow ~ in the paths. This can't be done
        # earlier when the config is written since we might be on a
        # different platform and the expansion would result in the wrong path.
        self.config["file_mounts"] = {
            remote: os.path.expanduser(local)
            for remote, local in self.config["file_mounts"].items()
        }

        for local_path in self.config["file_mounts"].values():
            assert os.path.exists(local_path)

        print("StandardAutoscaler: {}".format(self.config))
Example #17
    def __init__(self,
                 node_id,
                 provider_config,
                 auth_config,
                 cluster_name,
                 file_mounts,
                 setup_cmds,
                 runtime_hash,
                 redirect_output=True,
                 process_runner=subprocess,
                 use_internal_ip=False):
        self.daemon = True
        self.process_runner = process_runner
        self.node_id = node_id
        self.use_internal_ip = (use_internal_ip or provider_config.get(
            "use_internal_ips", False))
        self.provider = get_node_provider(provider_config, cluster_name)
        self.ssh_private_key = auth_config["ssh_private_key"]
        self.ssh_user = auth_config["ssh_user"]
        self.ssh_ip = self.get_node_ip()
        self.file_mounts = {
            remote: os.path.expanduser(local)
            for remote, local in file_mounts.items()
        }
        self.setup_cmds = setup_cmds
        self.runtime_hash = runtime_hash
        self.logger = logger.getChild(str(node_id))
        if redirect_output:
            self.logfile = tempfile.NamedTemporaryFile(
                mode="w", prefix="node-updater-", delete=False)
            handler = logging.StreamHandler(stream=self.logfile)
            handler.setLevel(logging.INFO)
            self.logger.addHandler(handler)
            self.output_name = self.logfile.name
            self.stdout = self.logfile
            self.stderr = self.logfile
        else:
            self.logfile = None
            self.output_name = "(console)"
            self.stdout = sys.stdout
            self.stderr = sys.stderr
Example #18
    def reset(self, errors_fatal=False):
        sync_continuously = False
        if hasattr(self, "config"):
            sync_continuously = self.config.get(
                "file_mounts_sync_continuously", False)
        try:
            with open(self.config_path) as f:
                new_config = yaml.safe_load(f.read())
            validate_config(new_config)
            (new_runtime_hash,
             new_file_mounts_contents_hash) = hash_runtime_conf(
                 new_config["file_mounts"],
                 new_config["cluster_synced_files"],
                 [
                     new_config["worker_setup_commands"],
                     new_config["worker_start_ray_commands"],
                 ],
                 generate_file_mounts_contents_hash=sync_continuously,
             )
            self.config = new_config
            self.runtime_hash = new_runtime_hash
            self.file_mounts_contents_hash = new_file_mounts_contents_hash
            if not self.provider:
                self.provider = get_node_provider(self.config["provider"],
                                                  self.config["cluster_name"])
            # Check whether we can enable the resource demand scheduler.
            if "available_node_types" in self.config:
                self.available_node_types = self.config["available_node_types"]
                self.resource_demand_scheduler = ResourceDemandScheduler(
                    self.provider, self.available_node_types,
                    self.config["max_workers"])
            else:
                self.available_node_types = None
                self.resource_demand_scheduler = None

        except Exception as e:
            if errors_fatal:
                raise e
            else:
                logger.exception("StandardAutoscaler: "
                                 "Error parsing config.")
Example #19
def rsync(config_file, source, target, override_cluster_name, down):
    """Rsyncs files.

    Arguments:
        config_file: path to the cluster yaml
        source: source dir
        target: target dir
        override_cluster_name: set the name of the cluster
        down: whether we're syncing remote -> local
    """

    config = yaml.load(open(config_file).read())
    if override_cluster_name is not None:
        config["cluster_name"] = override_cluster_name
    config = _bootstrap_config(config)
    head_node = _get_head_node(config,
                               config_file,
                               override_cluster_name,
                               create_if_needed=False)

    provider = get_node_provider(config["provider"], config["cluster_name"])
    try:
        updater = NodeUpdaterThread(
            node_id=head_node,
            provider_config=config["provider"],
            provider=provider,
            auth_config=config["auth"],
            cluster_name=config["cluster_name"],
            file_mounts=config["file_mounts"],
            initialization_commands=[],
            setup_commands=[],
            runtime_hash="",
        )
        if down:
            rsync = updater.rsync_down
        else:
            rsync = updater.rsync_up
        rsync(source, target, check_error=False)
    finally:
        provider.cleanup()
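For reference, a call into the rsync helper above might look like the following; the config path, source, and target are placeholders:

# Push a local directory to the head node (down=False means local -> remote).
rsync(
    config_file="cluster.yaml",        # placeholder path
    source="./data/",
    target="~/data/",
    override_cluster_name=None,
    down=False,
)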
Example #20
def kill_node(config_file, yes, override_cluster_name):
    """Kills a random Raylet worker."""

    config = yaml.load(open(config_file).read())
    if override_cluster_name is not None:
        config["cluster_name"] = override_cluster_name
    config = _bootstrap_config(config)

    confirm("This will kill a node in your cluster", yes)

    provider = get_node_provider(config["provider"], config["cluster_name"])
    try:
        nodes = provider.non_terminated_nodes({TAG_RAY_NODE_TYPE: "worker"})
        node = random.choice(nodes)
        logger.info("kill_node: Terminating worker {}".format(node))

        updater = NodeUpdaterThread(
            node_id=node,
            provider_config=config["provider"],
            provider=provider,
            auth_config=config["auth"],
            cluster_name=config["cluster_name"],
            file_mounts=config["file_mounts"],
            initialization_commands=[],
            setup_commands=[],
            runtime_hash="")

        _exec(updater, "ray stop", False, False)

        time.sleep(5)

        if config.get("provider", {}).get("use_internal_ips", False) is True:
            node_ip = provider.internal_ip(node)
        else:
            node_ip = provider.external_ip(node)
    finally:
        provider.cleanup()

    return node_ip
Example #21
def rsync(config_file, source, target, override_cluster_name, down):
    """Rsyncs files.

    Arguments:
        config_file: path to the cluster yaml
        source: source dir
        target: target dir
        override_cluster_name: set the name of the cluster
        down: whether we're syncing remote -> local
    """

    config = yaml.load(open(config_file).read())
    if override_cluster_name is not None:
        config["cluster_name"] = override_cluster_name
    config = _bootstrap_config(config)
    head_node = _get_head_node(
        config, config_file, override_cluster_name, create_if_needed=False)

    provider = get_node_provider(config["provider"], config["cluster_name"])
    try:
        updater = NodeUpdaterThread(
            node_id=head_node,
            provider_config=config["provider"],
            provider=provider,
            auth_config=config["auth"],
            cluster_name=config["cluster_name"],
            file_mounts=config["file_mounts"],
            initialization_commands=[],
            setup_commands=[],
            runtime_hash="",
        )
        if down:
            rsync = updater.rsync_down
        else:
            rsync = updater.rsync_up
        rsync(source, target, check_error=False)
    finally:
        provider.cleanup()
Example #22
def teardown_cluster(config_file, yes):
    """Destroys all nodes of a Ray cluster described by a config json."""

    config = yaml.load(open(config_file).read())
    validate_config(config)
    config = fillout_defaults(config)

    confirm("This will destroy your cluster", yes)

    provider = get_node_provider(config["provider"], config["cluster_name"])
    head_node_tags = {
        TAG_RAY_NODE_TYPE: "Head",
    }
    for node in provider.nodes(head_node_tags):
        print("Terminating head node {}".format(node))
        provider.terminate_node(node)
    nodes = provider.nodes({})
    while nodes:
        for node in nodes:
            print("Terminating worker {}".format(node))
            provider.terminate_node(node)
        time.sleep(5)
        nodes = provider.nodes({})
Example #23
def get_or_create_head_node(config,
                            config_file,
                            no_restart,
                            restart_only,
                            yes,
                            override_cluster_name,
                            _provider=None,
                            _runner=subprocess):
    """Create the cluster head node, which in turn creates the workers."""
    provider = (_provider or get_node_provider(config["provider"],
                                               config["cluster_name"]))

    config = copy.deepcopy(config)
    raw_config_file = config_file  # used for printing to the user
    config_file = os.path.abspath(config_file)
    try:
        head_node_tags = {
            TAG_RAY_NODE_KIND: NODE_KIND_HEAD,
        }
        nodes = provider.non_terminated_nodes(head_node_tags)
        if len(nodes) > 0:
            head_node = nodes[0]
        else:
            head_node = None

        if not head_node:
            cli_logger.confirm(yes, "No head node found. "
                               "Launching a new cluster.",
                               _abort=True)
            cli_logger.old_confirm("This will create a new cluster", yes)
        elif not no_restart:
            cli_logger.old_confirm("This will restart cluster services", yes)

        if head_node:
            if restart_only:
                cli_logger.confirm(
                    yes, "Updating cluster configuration and "
                    "restarting the cluster Ray runtime. "
                    "Setup commands will not be run due to `{}`.\n",
                    cf.bold("--restart-only"),
                    _abort=True)
            elif no_restart:
                cli_logger.print(
                    "Cluster Ray runtime will not be restarted due "
                    "to `{}`.", cf.bold("--no-restart"))
                cli_logger.confirm(yes, "Updating cluster configuration and "
                                   "running setup commands.",
                                   _abort=True)
            else:
                cli_logger.print(
                    "Updating cluster configuration and running full setup.")
                cli_logger.confirm(
                    yes,
                    cf.bold("Cluster Ray runtime will be restarted."),
                    _abort=True)
        cli_logger.newline()

        # TODO(ekl) this logic is duplicated in node_launcher.py (keep in sync)
        head_node_config = copy.deepcopy(config["head_node"])
        if "head_node_type" in config:
            head_node_tags[TAG_RAY_USER_NODE_TYPE] = config["head_node_type"]
            head_node_config.update(config["available_node_types"][
                config["head_node_type"]]["node_config"])

        launch_hash = hash_launch_conf(head_node_config, config["auth"])
        if head_node is None or provider.node_tags(head_node).get(
                TAG_RAY_LAUNCH_CONFIG) != launch_hash:
            with cli_logger.group("Acquiring an up-to-date head node"):
                if head_node is not None:
                    cli_logger.print(
                        "Currently running head node is out-of-date with "
                        "cluster configuration")
                    cli_logger.print(
                        "hash is {}, expected {}",
                        cf.bold(
                            provider.node_tags(head_node).get(
                                TAG_RAY_LAUNCH_CONFIG)), cf.bold(launch_hash))
                    cli_logger.confirm(yes, "Relaunching it.", _abort=True)
                    cli_logger.old_confirm(
                        "Head node config out-of-date. It will be terminated",
                        yes)

                    cli_logger.old_info(
                        logger, "get_or_create_head_node: "
                        "Shutting down outdated head node {}", head_node)

                    provider.terminate_node(head_node)
                    cli_logger.print("Terminated head node {}", head_node)

                cli_logger.old_info(
                    logger,
                    "get_or_create_head_node: Launching new head node...")

                head_node_tags[TAG_RAY_LAUNCH_CONFIG] = launch_hash
                head_node_tags[TAG_RAY_NODE_NAME] = "ray-{}-head".format(
                    config["cluster_name"])
                provider.create_node(head_node_config, head_node_tags, 1)
                cli_logger.print("Launched a new head node")

                start = time.time()
                head_node = None
                with cli_logger.timed("Fetching the new head node"):
                    while True:
                        if time.time() - start > 50:
                            cli_logger.abort(
                                "Head node fetch timed out.")  # todo: msg
                            raise RuntimeError("Failed to create head node.")
                        nodes = provider.non_terminated_nodes(head_node_tags)
                        if len(nodes) == 1:
                            head_node = nodes[0]
                            break
                        time.sleep(1)
                cli_logger.newline()

        with cli_logger.group(
                "Setting up head node",
                _numbered=("<>", 1, 1),
                # cf.bold(provider.node_tags(head_node)[TAG_RAY_NODE_NAME]),
                _tags=dict()):  # add id, ARN to tags?

            # TODO(ekl) right now we always update the head node even if the
            # hash matches.
            # We could prompt the user for what they want to do here.
            # No need to pass in cluster_sync_files because we use this
            # hash to set up the head node
            (runtime_hash, file_mounts_contents_hash) = hash_runtime_conf(
                config["file_mounts"], None, config)

            cli_logger.old_info(
                logger,
                "get_or_create_head_node: Updating files on head node...")

            # Rewrite the auth config so that the head
            # node can update the workers
            remote_config = copy.deepcopy(config)

            # drop proxy options if they exist, otherwise
            # head node won't be able to connect to workers
            remote_config["auth"].pop("ssh_proxy_command", None)

            if "ssh_private_key" in config["auth"]:
                remote_key_path = "~/ray_bootstrap_key.pem"
                remote_config["auth"]["ssh_private_key"] = remote_key_path

            # Adjust for new file locations
            new_mounts = {}
            for remote_path in config["file_mounts"]:
                new_mounts[remote_path] = remote_path
            remote_config["file_mounts"] = new_mounts
            remote_config["no_restart"] = no_restart

            # Now inject the rewritten config and SSH key into the head node
            remote_config_file = tempfile.NamedTemporaryFile(
                "w", prefix="ray-bootstrap-")
            remote_config_file.write(json.dumps(remote_config))
            remote_config_file.flush()
            config["file_mounts"].update(
                {"~/ray_bootstrap_config.yaml": remote_config_file.name})

            if "ssh_private_key" in config["auth"]:
                config["file_mounts"].update({
                    remote_key_path:
                    config["auth"]["ssh_private_key"],
                })
            cli_logger.print("Prepared bootstrap config")

            if restart_only:
                setup_commands = []
                ray_start_commands = config["head_start_ray_commands"]
            elif no_restart:
                setup_commands = config["head_setup_commands"]
                ray_start_commands = []
            else:
                setup_commands = config["head_setup_commands"]
                ray_start_commands = config["head_start_ray_commands"]

            if not no_restart:
                warn_about_bad_start_command(ray_start_commands)

            updater = NodeUpdaterThread(
                node_id=head_node,
                provider_config=config["provider"],
                provider=provider,
                auth_config=config["auth"],
                cluster_name=config["cluster_name"],
                file_mounts=config["file_mounts"],
                initialization_commands=config["initialization_commands"],
                setup_commands=setup_commands,
                ray_start_commands=ray_start_commands,
                process_runner=_runner,
                runtime_hash=runtime_hash,
                file_mounts_contents_hash=file_mounts_contents_hash,
                is_head_node=True,
                docker_config=config.get("docker"))
            updater.start()
            updater.join()

            # Refresh the node cache so we see the external ip if available
            provider.non_terminated_nodes(head_node_tags)

            if config.get("provider", {}).get("use_internal_ips",
                                              False) is True:
                head_node_ip = provider.internal_ip(head_node)
            else:
                head_node_ip = provider.external_ip(head_node)

            if updater.exitcode != 0:
                # todo: this does not follow the mockup and is not good enough
                cli_logger.abort("Failed to setup head node.")

                cli_logger.old_error(
                    logger, "get_or_create_head_node: "
                    "Updating {} failed", head_node_ip)
                sys.exit(1)

            cli_logger.old_info(
                logger, "get_or_create_head_node: "
                "Head node up-to-date, IP address is: {}", head_node_ip)

        monitor_str = "tail -n 100 -f /tmp/ray/session_*/logs/monitor*"
        if override_cluster_name:
            modifiers = " --cluster-name={}".format(
                quote(override_cluster_name))
        else:
            modifiers = ""

        if cli_logger.old_style:
            print("To monitor autoscaling activity, you can run:\n\n"
                  "  ray exec {} {}{}\n".format(config_file,
                                                quote(monitor_str), modifiers))
            print("To open a console on the cluster:\n\n"
                  "  ray attach {}{}\n".format(config_file, modifiers))

            print("To get a remote shell to the cluster manually, run:\n\n"
                  "  {}\n".format(
                      updater.cmd_runner.remote_shell_command_str()))

        cli_logger.newline()
        with cli_logger.group("Useful commands"):
            cli_logger.print("Monitor autoscaling with")
            cli_logger.print(cf.bold("  ray exec {}{} {}"), raw_config_file,
                             modifiers, quote(monitor_str))

            cli_logger.print("Connect to a terminal on the cluster head")
            cli_logger.print(cf.bold("  ray attach {}{}"), raw_config_file,
                             modifiers)
    finally:
        provider.cleanup()
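The decision to relaunch the head node in the example above hinges on comparing the node's stored TAG_RAY_LAUNCH_CONFIG tag with a freshly computed hash of the head-node config and auth section. A stripped-down sketch of that check, reusing the names from the example:

launch_hash = hash_launch_conf(head_node_config, config["auth"])

# Reuse the running head node only when its recorded launch config still
# matches the hash of the current head-node config and auth section.
needs_relaunch = (
    head_node is None
    or provider.node_tags(head_node).get(TAG_RAY_LAUNCH_CONFIG) != launch_hash
)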
Example #24
def teardown_cluster(config_file: str, yes: bool, workers_only: bool,
                     override_cluster_name: Optional[str],
                     keep_min_workers: bool):
    """Destroys all nodes of a Ray cluster described by a config json."""
    config = yaml.safe_load(open(config_file).read())
    if override_cluster_name is not None:
        config["cluster_name"] = override_cluster_name
    config = prepare_config(config)
    validate_config(config)

    cli_logger.confirm(yes, "Destroying cluster.", _abort=True)
    cli_logger.old_confirm("This will destroy your cluster", yes)

    if not workers_only:
        try:
            exec_cluster(config_file,
                         cmd="ray stop",
                         run_env="auto",
                         screen=False,
                         tmux=False,
                         stop=False,
                         start=False,
                         override_cluster_name=override_cluster_name,
                         port_forward=None,
                         with_output=False)
        except Exception as e:
            # todo: add better exception info
            cli_logger.verbose_error("{}", str(e))
            cli_logger.warning(
                "Exception occured when stopping the cluster Ray runtime "
                "(use -v to dump teardown exceptions).")
            cli_logger.warning(
                "Ignoring the exception and "
                "attempting to shut down the cluster nodes anyway.")

            cli_logger.old_exception(
                logger, "Ignoring error attempting a clean shutdown.")

    provider = get_node_provider(config["provider"], config["cluster_name"])
    try:

        def remaining_nodes():
            workers = provider.non_terminated_nodes(
                {TAG_RAY_NODE_KIND: NODE_KIND_WORKER})

            if keep_min_workers:
                min_workers = config.get("min_workers", 0)

                cli_logger.print(
                    "{} random worker nodes will not be shut down. " +
                    cf.dimmed("(due to {})"), cf.bold(min_workers),
                    cf.bold("--keep-min-workers"))
                cli_logger.old_info(logger,
                                    "teardown_cluster: Keeping {} nodes...",
                                    min_workers)

                workers = random.sample(workers, len(workers) - min_workers)

            # todo: it's weird to kill the head node but not all workers
            if workers_only:
                cli_logger.print(
                    "The head node will not be shut down. " +
                    cf.dimmed("(due to {})"), cf.bold("--workers-only"))

                return workers

            head = provider.non_terminated_nodes(
                {TAG_RAY_NODE_KIND: NODE_KIND_HEAD})

            return head + workers

        def run_docker_stop(node, container_name):
            try:
                updater = NodeUpdaterThread(
                    node_id=node,
                    provider_config=config["provider"],
                    provider=provider,
                    auth_config=config["auth"],
                    cluster_name=config["cluster_name"],
                    file_mounts=config["file_mounts"],
                    initialization_commands=[],
                    setup_commands=[],
                    ray_start_commands=[],
                    runtime_hash="",
                    file_mounts_contents_hash="",
                    is_head_node=False,
                    docker_config=config.get("docker"))
                _exec(updater,
                      f"docker stop {container_name}",
                      False,
                      False,
                      run_env="host")
            except Exception:
                cli_logger.warning(f"Docker stop failed on {node}")
                cli_logger.old_warning(logger, f"Docker stop failed on {node}")

        # Loop here to check that both the head and worker nodes are actually
        #   really gone
        A = remaining_nodes()

        container_name = config.get("docker", {}).get("container_name")
        if container_name:
            for node in A:
                run_docker_stop(node, container_name)

        with LogTimer("teardown_cluster: done."):
            while A:
                cli_logger.old_info(
                    logger, "teardown_cluster: "
                    "Shutting down {} nodes...", len(A))

                provider.terminate_nodes(A)

                cli_logger.print("Requested {} nodes to shut down.",
                                 cf.bold(len(A)),
                                 _tags=dict(interval="1s"))

                time.sleep(1)  # todo: interval should be a variable
                A = remaining_nodes()
                cli_logger.print("{} nodes remaining after 1 second.",
                                 cf.bold(len(A)))
            cli_logger.success("No nodes remaining.")
    finally:
        provider.cleanup()
Example #25
    def testClusterStateInit(self):
        """Check ClusterState __init__ func generates correct state file.

        Test the general use case and if num_workers increase/decrease.
        """

        cluster_config = {
            "cluster_name": "random_name",
            "min_workers": 0,
            "max_workers": 0,
            "initial_workers": 0,
            "provider": {
                "type": "local",
                "head_ip": "0.0.0.0:2",
                "worker_ips": ["0.0.0.0:1"]
            },
        }
        provider_config = cluster_config["provider"]
        node_provider = get_node_provider(provider_config,
                                          cluster_config["cluster_name"])
        assert isinstance(node_provider, LocalNodeProvider)
        expected_workers = {}
        expected_workers[provider_config["head_ip"]] = {
            "tags": {
                TAG_RAY_NODE_TYPE: NODE_TYPE_HEAD
            },
            "state": "terminated",
        }
        expected_workers[provider_config["worker_ips"][0]] = {
            "tags": {
                TAG_RAY_NODE_TYPE: NODE_TYPE_WORKER
            },
            "state": "terminated",
        }

        state_save_path = "/tmp/cluster-{}.state".format(
            cluster_config["cluster_name"])
        assert os.path.exists(state_save_path)
        workers = json.loads(open(state_save_path).read())
        assert workers == expected_workers

        # Test removing workers updates the cluster state.
        del expected_workers[provider_config["worker_ips"][0]]
        removed_ip = provider_config["worker_ips"].pop()
        node_provider = get_node_provider(provider_config,
                                          cluster_config["cluster_name"])
        workers = json.loads(open(state_save_path).read())
        assert workers == expected_workers

        # Test adding back workers updates the cluster state.
        expected_workers[removed_ip] = {
            "tags": {
                TAG_RAY_NODE_TYPE: NODE_TYPE_WORKER
            },
            "state": "terminated",
        }
        provider_config["worker_ips"].append(removed_ip)
        node_provider = get_node_provider(provider_config,
                                          cluster_config["cluster_name"])
        workers = json.loads(open(state_save_path).read())
        assert workers == expected_workers
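The test above relies on LocalNodeProvider persisting cluster state as JSON under /tmp/cluster-<cluster_name>.state. Inspecting that file directly follows the same pattern; the cluster name below matches the test's "random_name":

import json

state_save_path = "/tmp/cluster-{}.state".format("random_name")

with open(state_save_path) as f:
    workers = json.load(f)

# Each entry maps a node IP to its tags and lifecycle state, e.g. "terminated".
for ip, info in workers.items():
    print(ip, info["state"], info["tags"])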
Example #26
def get_or_create_head_node(config, no_restart, yes):
    """Create the cluster head node, which in turn creates the workers."""

    provider = get_node_provider(config["provider"], config["cluster_name"])
    head_node_tags = {
        TAG_RAY_NODE_TYPE: "Head",
    }
    nodes = provider.nodes(head_node_tags)
    if len(nodes) > 0:
        head_node = nodes[0]
    else:
        head_node = None

    if not head_node:
        confirm("This will create a new cluster", yes)
    elif not no_restart:
        confirm("This will restart cluster services", yes)

    launch_hash = hash_launch_conf(config["head_node"], config["auth"])
    if head_node is None or provider.node_tags(head_node).get(
            TAG_RAY_LAUNCH_CONFIG) != launch_hash:
        if head_node is not None:
            confirm("Head node config out-of-date. It will be terminated", yes)
            print("Terminating outdated head node {}".format(head_node))
            provider.terminate_node(head_node)
        print("Launching new head node...")
        head_node_tags[TAG_RAY_LAUNCH_CONFIG] = launch_hash
        head_node_tags[TAG_NAME] = "ray-{}-head".format(config["cluster_name"])
        provider.create_node(config["head_node"], head_node_tags, 1)

    nodes = provider.nodes(head_node_tags)
    assert len(nodes) == 1, "Failed to create head node."
    head_node = nodes[0]

    # TODO(ekl) right now we always update the head node even if the hash
    # matches. We could prompt the user for what they want to do in this case.
    runtime_hash = hash_runtime_conf(config["file_mounts"], config)
    print("Updating files on head node...")

    # Rewrite the auth config so that the head node can update the workers
    remote_key_path = "~/ray_bootstrap_key.pem"
    remote_config = copy.deepcopy(config)
    remote_config["auth"]["ssh_private_key"] = remote_key_path

    # Adjust for new file locations
    new_mounts = {}
    for remote_path in config["file_mounts"]:
        new_mounts[remote_path] = remote_path
    remote_config["file_mounts"] = new_mounts
    remote_config["no_restart"] = no_restart

    # Now inject the rewritten config and SSH key into the head node
    remote_config_file = tempfile.NamedTemporaryFile(
        "w", prefix="ray-bootstrap-")
    remote_config_file.write(json.dumps(remote_config))
    remote_config_file.flush()
    config["file_mounts"].update({
        remote_key_path: config["auth"]["ssh_private_key"],
        "~/ray_bootstrap_config.yaml": remote_config_file.name
    })

    if no_restart:
        init_commands = (
            config["setup_commands"] + config["head_setup_commands"])
    else:
        init_commands = (
            config["setup_commands"] + config["head_setup_commands"] +
            config["head_start_ray_commands"])

    updater = NodeUpdaterProcess(
        head_node,
        config["provider"],
        config["auth"],
        config["cluster_name"],
        config["file_mounts"],
        init_commands,
        runtime_hash,
        redirect_output=False)
    updater.start()
    updater.join()

    # Refresh the node cache so we see the external ip if available
    provider.nodes(head_node_tags)

    if updater.exitcode != 0:
        print("Error: updating {} failed".format(
            provider.external_ip(head_node)))
        sys.exit(1)
    print(
        "Head node up-to-date, IP address is: {}".format(
            provider.external_ip(head_node)))

    monitor_str = "tail -f /tmp/raylogs/monitor-*"
    for s in init_commands:
        if ("ray start" in s and "docker exec" in s and
                "--autoscaling-config" in s):
            monitor_str = "docker exec {} /bin/sh -c {}".format(
                        config["docker"]["container_name"],
                        quote(monitor_str))
    print(
        "To monitor auto-scaling activity, you can run:\n\n"
        "  ssh -i {} {}@{} {}\n".format(
            config["auth"]["ssh_private_key"],
            config["auth"]["ssh_user"],
            provider.external_ip(head_node),
            quote(monitor_str)))
    print(
        "To login to the cluster, run:\n\n"
        "  ssh -i {} {}@{}\n".format(
            config["auth"]["ssh_private_key"],
            config["auth"]["ssh_user"],
            provider.external_ip(head_node)))
Example #27
def exec_cluster(config_file, cmd, docker, screen, tmux, stop, start,
                 override_cluster_name, port_forward):
    """Runs a command on the specified cluster.

    Arguments:
        config_file: path to the cluster yaml
        cmd: command to run
        docker: whether to run command in docker container of config
        screen: whether to run in a screen
        tmux: whether to run in a tmux session
        stop: whether to stop the cluster after command run
        start: whether to start the cluster if it isn't up
        override_cluster_name: set the name of the cluster
        port_forward: port to forward
    """
    assert not (screen and tmux), "Can specify only one of `screen` or `tmux`."

    config = yaml.load(open(config_file).read())
    if override_cluster_name is not None:
        config["cluster_name"] = override_cluster_name
    config = _bootstrap_config(config)

    head_node = _get_head_node(
        config, config_file, override_cluster_name, create_if_needed=start)

    provider = get_node_provider(config["provider"], config["cluster_name"])
    try:
        updater = NodeUpdaterThread(
            node_id=head_node,
            provider_config=config["provider"],
            provider=provider,
            auth_config=config["auth"],
            cluster_name=config["cluster_name"],
            file_mounts=config["file_mounts"],
            initialization_commands=[],
            setup_commands=[],
            runtime_hash="",
        )

        def wrap_docker(command):
            container_name = config["docker"]["container_name"]
            if not container_name:
                raise ValueError("Docker container not specified in config.")
            return with_docker_exec(
                [command], container_name=container_name)[0]

        cmd = wrap_docker(cmd) if docker else cmd

        if stop:
            shutdown_cmd = (
                "ray stop; ray teardown ~/ray_bootstrap_config.yaml "
                "--yes --workers-only")
            if docker:
                shutdown_cmd = wrap_docker(shutdown_cmd)
            cmd += ("; {}; sudo shutdown -h now".format(shutdown_cmd))

        _exec(
            updater,
            cmd,
            screen,
            tmux,
            expect_error=stop,
            port_forward=port_forward)

        if tmux or screen:
            attach_command_parts = ["ray attach", config_file]
            if override_cluster_name is not None:
                attach_command_parts.append(
                    "--cluster-name={}".format(override_cluster_name))
            if tmux:
                attach_command_parts.append("--tmux")
            elif screen:
                attach_command_parts.append("--screen")

            attach_command = " ".join(attach_command_parts)
            attach_info = "Use `{}` to check on command status.".format(
                attach_command)
            logger.info(attach_info)
    finally:
        provider.cleanup()
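As a usage reference, invoking exec_cluster as defined above could look like this; the config path and command are placeholders:

# Run a one-off command on the head node, outside screen/tmux, without
# stopping the cluster afterwards or starting it if it is down.
exec_cluster(
    config_file="cluster.yaml",
    cmd="uptime",
    docker=False,
    screen=False,
    tmux=False,
    stop=False,
    start=False,
    override_cluster_name=None,
    port_forward=None,
)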
Example #28
def get_or_create_head_node(config, config_file, no_restart, restart_only, yes,
                            override_cluster_name):
    """Create the cluster head node, which in turn creates the workers."""
    provider = get_node_provider(config["provider"], config["cluster_name"])
    try:
        head_node_tags = {
            TAG_RAY_NODE_TYPE: "head",
        }
        nodes = provider.non_terminated_nodes(head_node_tags)
        if len(nodes) > 0:
            head_node = nodes[0]
        else:
            head_node = None

        if not head_node:
            confirm("This will create a new cluster", yes)
        elif not no_restart:
            confirm("This will restart cluster services", yes)

        launch_hash = hash_launch_conf(config["head_node"], config["auth"])
        if head_node is None or provider.node_tags(head_node).get(
                TAG_RAY_LAUNCH_CONFIG) != launch_hash:
            if head_node is not None:
                confirm("Head node config out-of-date. It will be terminated",
                        yes)
                logger.info(
                    "get_or_create_head_node: "
                    "Terminating outdated head node {}".format(head_node))
                provider.terminate_node(head_node)
            logger.info("get_or_create_head_node: Launching new head node...")
            head_node_tags[TAG_RAY_LAUNCH_CONFIG] = launch_hash
            head_node_tags[TAG_RAY_NODE_NAME] = "ray-{}-head".format(
                config["cluster_name"])
            provider.create_node(config["head_node"], head_node_tags, 1)

        nodes = provider.non_terminated_nodes(head_node_tags)
        assert len(nodes) == 1, "Failed to create head node."
        head_node = nodes[0]

        # TODO(ekl) right now we always update the head node even if the hash
        # matches. We could prompt the user for what they want to do here.
        runtime_hash = hash_runtime_conf(config["file_mounts"], config)
        logger.info("get_or_create_head_node: Updating files on head node...")

        # Rewrite the auth config so that the head node can update the workers
        remote_key_path = "~/ray_bootstrap_key.pem"
        remote_config = copy.deepcopy(config)
        remote_config["auth"]["ssh_private_key"] = remote_key_path

        # Adjust for new file locations
        new_mounts = {}
        for remote_path in config["file_mounts"]:
            new_mounts[remote_path] = remote_path
        remote_config["file_mounts"] = new_mounts
        remote_config["no_restart"] = no_restart

        # Now inject the rewritten config and SSH key into the head node
        remote_config_file = tempfile.NamedTemporaryFile(
            "w", prefix="ray-bootstrap-")
        remote_config_file.write(json.dumps(remote_config))
        remote_config_file.flush()
        config["file_mounts"].update({
            remote_key_path: config["auth"]["ssh_private_key"],
            "~/ray_bootstrap_config.yaml": remote_config_file.name
        })

        if restart_only:
            init_commands = config["head_start_ray_commands"]
        elif no_restart:
            init_commands = (
                config["setup_commands"] + config["head_setup_commands"])
        else:
            init_commands = (
                config["setup_commands"] + config["head_setup_commands"] +
                config["head_start_ray_commands"])

        updater = NodeUpdaterThread(
            node_id=head_node,
            provider_config=config["provider"],
            provider=provider,
            auth_config=config["auth"],
            cluster_name=config["cluster_name"],
            file_mounts=config["file_mounts"],
            initialization_commands=config["initialization_commands"],
            setup_commands=init_commands,
            runtime_hash=runtime_hash,
        )
        updater.start()
        updater.join()

        # Refresh the node cache so we see the external ip if available
        provider.non_terminated_nodes(head_node_tags)

        if config.get("provider", {}).get("use_internal_ips", False) is True:
            head_node_ip = provider.internal_ip(head_node)
        else:
            head_node_ip = provider.external_ip(head_node)

        if updater.exitcode != 0:
            logger.error("get_or_create_head_node: "
                         "Updating {} failed".format(head_node_ip))
            sys.exit(1)
        logger.info(
            "get_or_create_head_node: "
            "Head node up-to-date, IP address is: {}".format(head_node_ip))

        monitor_str = "tail -n 100 -f /tmp/ray/session_*/logs/monitor*"
        use_docker = bool(config["docker"]["container_name"])
        if override_cluster_name:
            modifiers = " --cluster-name={}".format(
                quote(override_cluster_name))
        else:
            modifiers = ""
        print("To monitor auto-scaling activity, you can run:\n\n"
              "  ray exec {} {}{}{}\n".format(
                  config_file, "--docker " if use_docker else " ",
                  quote(monitor_str), modifiers))
        print("To open a console on the cluster:\n\n"
              "  ray attach {}{}\n".format(config_file, modifiers))

        print("To ssh manually to the cluster, run:\n\n"
              "  ssh -i {} {}@{}\n".format(config["auth"]["ssh_private_key"],
                                           config["auth"]["ssh_user"],
                                           head_node_ip))
    finally:
        provider.cleanup()
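The config-rewrite step above (pointing the head node's file mounts at themselves, swapping in a remote key path, and shipping the rewritten config as a bootstrap file) can be exercised in isolation with only the standard library. The sketch below is illustrative; the dictionary values are made up and only the key names mirror the snippet above.

import copy
import json
import tempfile

# Hypothetical cluster config, shaped like the one used above.
config = {
    "cluster_name": "demo",
    "auth": {"ssh_user": "ubuntu", "ssh_private_key": "/home/me/.ssh/ray.pem"},
    "file_mounts": {"~/project": "/home/me/project"},
}

remote_key_path = "~/ray_bootstrap_key.pem"
remote_config = copy.deepcopy(config)
remote_config["auth"]["ssh_private_key"] = remote_key_path

# On the head node the mounted files already live at their remote paths,
# so the remote copy of the config maps each remote path onto itself.
remote_config["file_mounts"] = {
    remote: remote for remote in config["file_mounts"]
}

# Serialize the rewritten config; keep the temp file on disk so it can be
# uploaded alongside the SSH key.
remote_config_file = tempfile.NamedTemporaryFile(
    "w", prefix="ray-bootstrap-", delete=False)
remote_config_file.write(json.dumps(remote_config))
remote_config_file.flush()

config["file_mounts"].update({
    remote_key_path: config["auth"]["ssh_private_key"],
    "~/ray_bootstrap_config.yaml": remote_config_file.name,
})
print(json.dumps(config["file_mounts"], indent=2))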
Exemple #29
0
    def __init__(self,
                 config_path,
                 load_metrics,
                 max_launch_batch=AUTOSCALER_MAX_LAUNCH_BATCH,
                 max_concurrent_launches=AUTOSCALER_MAX_CONCURRENT_LAUNCHES,
                 max_failures=AUTOSCALER_MAX_NUM_FAILURES,
                 process_runner=subprocess,
                 update_interval_s=AUTOSCALER_UPDATE_INTERVAL_S):
        self.config_path = config_path
        self.reload_config(errors_fatal=True)
        self.load_metrics = load_metrics
        self.provider = get_node_provider(self.config["provider"],
                                          self.config["cluster_name"])

        # Check whether we can enable the resource demand scheduler.
        if "available_node_types" in self.config:
            self.available_node_types = self.config["available_node_types"]
            self.resource_demand_scheduler = ResourceDemandScheduler(
                self.provider, self.available_node_types,
                self.config["max_workers"])
        else:
            self.available_node_types = None
            self.resource_demand_scheduler = None

        self.max_failures = max_failures
        self.max_launch_batch = max_launch_batch
        self.max_concurrent_launches = max_concurrent_launches
        self.process_runner = process_runner

        # Map from node_id to NodeUpdater processes
        self.updaters = {}
        self.num_failed_updates = defaultdict(int)
        self.num_successful_updates = defaultdict(int)
        self.num_failures = 0
        self.last_update_time = 0.0
        self.update_interval_s = update_interval_s
        self.bringup = True

        # Node launchers
        self.launch_queue = queue.Queue()
        self.pending_launches = ConcurrentCounter()
        max_batches = math.ceil(max_concurrent_launches /
                                float(max_launch_batch))
        for i in range(int(max_batches)):
            node_launcher = NodeLauncher(
                provider=self.provider,
                queue=self.launch_queue,
                index=i,
                pending=self.pending_launches,
                node_types=self.available_node_types,
            )
            node_launcher.daemon = True
            node_launcher.start()

        # Expand local file_mounts to allow ~ in the paths. This can't be done
        # earlier when the config is written, since we might be on a different
        # platform and the expansion would produce the wrong path.
        self.config["file_mounts"] = {
            remote: os.path.expanduser(local)
            for remote, local in self.config["file_mounts"].items()
        }

        for local_path in self.config["file_mounts"].values():
            assert os.path.exists(local_path)

        # Aggregate resources the user is requesting of the cluster.
        self.resource_requests = defaultdict(int)
        # List of resource bundles the user is requesting of the cluster.
        self.resource_demand_vector = None

        logger.info("StandardAutoscaler: {}".format(self.config))
Exemple #30
0
def teardown_cluster(config_file: str, yes: bool, workers_only: bool,
                     override_cluster_name: Optional[str],
                     keep_min_workers: bool, log_old_style: bool,
                     log_color: str, verbose: int):
    """Destroys all nodes of a Ray cluster described by a config json."""
    cli_logger.old_style = log_old_style
    cli_logger.color_mode = log_color
    cli_logger.verbosity = verbose
    cli_logger.dump_command_output = verbose == 3  # todo: add a separate flag?

    config = yaml.safe_load(open(config_file).read())
    if override_cluster_name is not None:
        config["cluster_name"] = override_cluster_name
    config = prepare_config(config)
    validate_config(config)

    cli_logger.confirm(yes, "Destroying cluster.", _abort=True)
    cli_logger.old_confirm("This will destroy your cluster", yes)

    if not workers_only:
        try:
            exec_cluster(config_file,
                         cmd="ray stop",
                         run_env="auto",
                         screen=False,
                         tmux=False,
                         stop=False,
                         start=False,
                         override_cluster_name=override_cluster_name,
                         port_forward=None,
                         with_output=False)
        except Exception as e:
            # todo: add better exception info
            cli_logger.verbose_error("{}", str(e))
            cli_logger.warning(
                "Exception occured when stopping the cluster Ray runtime "
                "(use -v to dump teardown exceptions).")
            cli_logger.warning(
                "Ignoring the exception and "
                "attempting to shut down the cluster nodes anyway.")

            cli_logger.old_exception(
                logger, "Ignoring error attempting a clean shutdown.")

    provider = get_node_provider(config["provider"], config["cluster_name"])
    try:

        def remaining_nodes():

            workers = provider.non_terminated_nodes(
                {TAG_RAY_NODE_TYPE: NODE_TYPE_WORKER})

            if keep_min_workers:
                min_workers = config.get("min_workers", 0)

                cli_logger.print(
                    "{} random worker nodes will not be shut down. " +
                    cf.gray("(due to {})"), cf.bold(min_workers),
                    cf.bold("--keep-min-workers"))
                cli_logger.old_info(logger,
                                    "teardown_cluster: Keeping {} nodes...",
                                    min_workers)

                workers = random.sample(workers, len(workers) - min_workers)

            # todo: it's weird to kill the head node but not all workers
            if workers_only:
                cli_logger.print(
                    "The head node will not be shut down. " +
                    cf.gray("(due to {})"), cf.bold("--workers-only"))

                return workers

            head = provider.non_terminated_nodes(
                {TAG_RAY_NODE_TYPE: NODE_TYPE_HEAD})

            return head + workers

        # Loop here to check that both the head and worker nodes are
        # actually gone.
        A = remaining_nodes()
        with LogTimer("teardown_cluster: done."):
            while A:
                cli_logger.old_info(
                    logger, "teardown_cluster: "
                    "Shutting down {} nodes...", len(A))

                provider.terminate_nodes(A)

                cli_logger.print("Requested {} nodes to shut down.",
                                 cf.bold(len(A)),
                                 _tags=dict(interval="1s"))

                time.sleep(1)  # todo: interval should be a variable
                A = remaining_nodes()
                cli_logger.print("{} nodes remaining after 1 second.",
                                 cf.bold(len(A)))
    finally:
        provider.cleanup()
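The teardown loop above keeps terminating and re-polling until the provider reports nothing left. A toy provider makes the control flow easy to follow; the class below is a stand-in for illustration only, not part of Ray.

import time

class FakeProvider:
    def __init__(self, nodes):
        self._nodes = set(nodes)

    def non_terminated_nodes(self, tag_filters):
        return sorted(self._nodes)

    def terminate_nodes(self, node_ids):
        # Remove the nodes immediately for simplicity; a real provider may
        # take longer, which is why the loop above re-polls after a sleep.
        self._nodes -= set(node_ids)

provider = FakeProvider(["head-1", "worker-1", "worker-2"])

remaining = provider.non_terminated_nodes({})
while remaining:
    print("Shutting down {} nodes...".format(len(remaining)))
    provider.terminate_nodes(remaining)
    time.sleep(0.1)  # the real loop sleeps 1 s between polls
    remaining = provider.non_terminated_nodes({})
print("teardown done")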
    def testCoordinatorSenderNodeProvider(self):
        """Integration test of CoordinatorSenderNodeProvider."""
        cluster_config = {
            "cluster_name": "random_name",
            "min_workers": 0,
            "max_workers": 0,
            "initial_workers": 0,
            "provider": {
                "type": "local",
                "coordinator_address": self.coordinator_address,
            },
            "head_node": {},
            "worker_nodes": {},
        }
        provider_config = cluster_config["provider"]
        node_provider_1 = get_node_provider(provider_config,
                                            cluster_config["cluster_name"])
        assert isinstance(node_provider_1, CoordinatorSenderNodeProvider)

        assert not node_provider_1.non_terminated_nodes({})
        assert not node_provider_1.is_running(self.list_of_node_ips[0])
        assert node_provider_1.is_terminated(self.list_of_node_ips[0])
        assert not node_provider_1.node_tags(self.list_of_node_ips[0])
        head_node_tags = {
            TAG_RAY_NODE_TYPE: NODE_TYPE_HEAD,
        }
        assert not node_provider_1.non_terminated_nodes(head_node_tags)
        head_node_tags[TAG_RAY_NODE_NAME] = "ray-{}-head".format(
            cluster_config["cluster_name"])
        node_provider_1.create_node(cluster_config["head_node"],
                                    head_node_tags, 1)
        assert node_provider_1.non_terminated_nodes(
            {}) == [self.list_of_node_ips[0]]
        head_node_tags[TAG_RAY_CLUSTER_NAME] = cluster_config["cluster_name"]
        assert node_provider_1.node_tags(
            self.list_of_node_ips[0]) == head_node_tags
        assert node_provider_1.is_running(self.list_of_node_ips[0])
        assert not node_provider_1.is_terminated(self.list_of_node_ips[0])

        # Add another cluster.
        cluster_config["cluster_name"] = "random_name_2"
        provider_config = cluster_config["provider"]
        node_provider_2 = get_node_provider(provider_config,
                                            cluster_config["cluster_name"])
        assert not node_provider_2.non_terminated_nodes({})
        assert not node_provider_2.is_running(self.list_of_node_ips[1])
        assert node_provider_2.is_terminated(self.list_of_node_ips[1])
        assert not node_provider_2.node_tags(self.list_of_node_ips[1])
        assert not node_provider_2.non_terminated_nodes(head_node_tags)
        head_node_tags[TAG_RAY_NODE_NAME] = "ray-{}-head".format(
            cluster_config["cluster_name"])
        node_provider_2.create_node(cluster_config["head_node"],
                                    head_node_tags, 1)
        assert node_provider_2.non_terminated_nodes(
            {}) == [self.list_of_node_ips[1]]
        head_node_tags[TAG_RAY_CLUSTER_NAME] = cluster_config["cluster_name"]
        assert node_provider_2.node_tags(
            self.list_of_node_ips[1]) == head_node_tags
        assert node_provider_2.is_running(self.list_of_node_ips[1])
        assert not node_provider_2.is_terminated(self.list_of_node_ips[1])

        # Add another cluster (should fail because we only have two nodes).
        cluster_config["cluster_name"] = "random_name_3"
        provider_config = cluster_config["provider"]
        node_provider_3 = get_node_provider(provider_config,
                                            cluster_config["cluster_name"])
        assert not node_provider_3.non_terminated_nodes(head_node_tags)
        head_node_tags[TAG_RAY_NODE_NAME] = "ray-{}-head".format(
            cluster_config["cluster_name"])
        node_provider_3.create_node(cluster_config["head_node"],
                                    head_node_tags, 1)
        assert not node_provider_3.non_terminated_nodes({})

        # Terminate all nodes.
        node_provider_1.terminate_node(self.list_of_node_ips[0])
        assert not node_provider_1.non_terminated_nodes({})
        node_provider_2.terminate_node(self.list_of_node_ips[1])
        assert not node_provider_2.non_terminated_nodes({})

        # Check if now we can create more clusters/nodes.
        node_provider_3.create_node(cluster_config["head_node"],
                                    head_node_tags, 1)
        worker_node_tags = {
            TAG_RAY_NODE_NAME:
            "ray-{}-worker".format(cluster_config["cluster_name"]),
            TAG_RAY_NODE_TYPE:
            NODE_TYPE_WORKER
        }
        node_provider_3.create_node(cluster_config["worker_nodes"],
                                    worker_node_tags, 1)
        assert node_provider_3.non_terminated_nodes(
            {}) == self.list_of_node_ips
        worker_filter = {TAG_RAY_NODE_TYPE: NODE_TYPE_WORKER}
        assert node_provider_3.non_terminated_nodes(worker_filter) == [
            self.list_of_node_ips[1]
        ]
        head_filter = {TAG_RAY_NODE_TYPE: NODE_TYPE_HEAD}
        assert node_provider_3.non_terminated_nodes(head_filter) == [
            self.list_of_node_ips[0]
        ]
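The tag-filter semantics the test relies on (a node matches when every filter key/value pair appears in its tags) can be expressed in a few lines. The node table and tag key value below are invented for illustration.

TAG_RAY_NODE_TYPE = "ray-node-type"  # assumed tag key, for illustration only

node_tags = {
    "192.0.2.10": {TAG_RAY_NODE_TYPE: "head"},
    "192.0.2.11": {TAG_RAY_NODE_TYPE: "worker"},
}

def non_terminated_nodes(tag_filters):
    # A node matches when all requested tag key/value pairs are present.
    return [
        node_id for node_id, tags in sorted(node_tags.items())
        if all(tags.get(k) == v for k, v in tag_filters.items())
    ]

assert non_terminated_nodes({}) == ["192.0.2.10", "192.0.2.11"]
assert non_terminated_nodes({TAG_RAY_NODE_TYPE: "worker"}) == ["192.0.2.11"]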
def get_or_create_head_node(config):
    """Create the cluster head node, which in turn creates the workers."""

    provider = get_node_provider(config["provider"], config["cluster_name"])
    head_node_tags = {
        TAG_RAY_NODE_TYPE: "Head",
    }
    nodes = provider.nodes(head_node_tags)
    if len(nodes) > 0:
        head_node = nodes[0]
    else:
        head_node = None

    launch_hash = hash_launch_conf(config["head_node"], config["auth"])
    if head_node is None or provider.node_tags(head_node).get(
            TAG_RAY_LAUNCH_CONFIG) != launch_hash:
        if head_node is not None:
            print("Terminating outdated head node {}".format(head_node))
            provider.terminate_node(head_node)
        print("Launching new head node...")
        head_node_tags[TAG_RAY_LAUNCH_CONFIG] = launch_hash
        head_node_tags[TAG_NAME] = "ray-{}-head".format(config["cluster_name"])
        provider.create_node(config["head_node"], head_node_tags, 1)

    nodes = provider.nodes(head_node_tags)
    assert len(nodes) == 1, "Failed to create head node."
    head_node = nodes[0]

    runtime_hash = hash_runtime_conf(config["file_mounts"], config)

    if provider.node_tags(head_node).get(
            TAG_RAY_RUNTIME_CONFIG) != runtime_hash:
        print("Updating files on head node...")

        # Rewrite the auth config so that the head node can update the workers
        remote_key_path = "~/ray_bootstrap_key.pem"
        remote_config = copy.deepcopy(config)
        remote_config["auth"]["ssh_private_key"] = remote_key_path

        # Adjust for new file locations
        new_mounts = {}
        for remote_path in config["file_mounts"]:
            new_mounts[remote_path] = remote_path
        remote_config["file_mounts"] = new_mounts

        # Now inject the rewritten config and SSH key into the head node
        remote_config_file = tempfile.NamedTemporaryFile(
            "w", prefix="ray-bootstrap-")
        remote_config_file.write(json.dumps(remote_config))
        remote_config_file.flush()
        config["file_mounts"].update({
            remote_key_path: config["auth"]["ssh_private_key"],
            "~/ray_bootstrap_config.yaml": remote_config_file.name
        })

        updater = NodeUpdaterProcess(
            head_node,
            config["provider"],
            config["auth"],
            config["cluster_name"],
            config["file_mounts"],
            config["head_init_commands"],
            runtime_hash,
            redirect_output=False)
        updater.start()
        updater.join()
        if updater.exitcode != 0:
            print("Error: updating {} failed".format(
                provider.external_ip(head_node)))
            sys.exit(1)
    print(
        "Head node up-to-date, IP address is: {}".format(
            provider.external_ip(head_node)))
    print(
        "To monitor auto-scaling activity, you can run:\n\n"
        "  ssh -i {} {}@{} 'tail -f /tmp/raylogs/monitor-*'\n".format(
            config["auth"]["ssh_private_key"],
            config["auth"]["ssh_user"],
            provider.external_ip(head_node)))
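hash_runtime_conf itself is not shown in these snippets; conceptually it only needs to produce a stable digest of the file mounts plus the relevant config so the updater can tell whether the head node is already up to date. A rough, illustrative stand-in (not the real implementation, which also hashes the contents of the mounted files) might look like:

import hashlib
import json

def illustrative_runtime_hash(file_mounts, config):
    # Stable digest over the mount mapping and the config dict.
    hasher = hashlib.sha1()
    hasher.update(json.dumps(file_mounts, sort_keys=True).encode())
    hasher.update(json.dumps(config, sort_keys=True, default=str).encode())
    return hasher.hexdigest()

print(illustrative_runtime_hash({"~/project": "."}, {"cluster_name": "demo"}))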
Exemple #33
0
def exec_cluster(config_file,
                 cmd=None,
                 docker=False,
                 screen=False,
                 tmux=False,
                 stop=False,
                 start=False,
                 override_cluster_name=None,
                 port_forward=None,
                 with_output=False):
    """Runs a command on the specified cluster.

    Arguments:
        config_file: path to the cluster yaml
        cmd: command to run
        docker: whether to run command in docker container of config
        screen: whether to run in a screen
        tmux: whether to run in a tmux session
        stop: whether to stop the cluster after command run
        start: whether to start the cluster if it isn't up
        override_cluster_name: set the name of the cluster
        port_forward (int or list[int]): port(s) to forward
    """
    assert not (screen and tmux), "Can specify only one of `screen` or `tmux`."

    config = yaml.safe_load(open(config_file).read())
    if override_cluster_name is not None:
        config["cluster_name"] = override_cluster_name
    config = _bootstrap_config(config)

    head_node = _get_head_node(
        config, config_file, override_cluster_name, create_if_needed=start)

    provider = get_node_provider(config["provider"], config["cluster_name"])
    try:
        updater = NodeUpdaterThread(
            node_id=head_node,
            provider_config=config["provider"],
            provider=provider,
            auth_config=config["auth"],
            cluster_name=config["cluster_name"],
            file_mounts=config["file_mounts"],
            initialization_commands=[],
            setup_commands=[],
            ray_start_commands=[],
            runtime_hash="",
        )

        def wrap_docker(command):
            container_name = config["docker"]["container_name"]
            if not container_name:
                raise ValueError("Docker container not specified in config.")
            return with_docker_exec(
                [command], container_name=container_name)[0]

        if cmd:
            cmd = wrap_docker(cmd) if docker else cmd

            if stop:
                shutdown_cmd = (
                    "ray stop; ray teardown ~/ray_bootstrap_config.yaml "
                    "--yes --workers-only")
                if docker:
                    shutdown_cmd = wrap_docker(shutdown_cmd)
                cmd += ("; {}; sudo shutdown -h now".format(shutdown_cmd))

        result = _exec(
            updater,
            cmd,
            screen,
            tmux,
            port_forward=port_forward,
            with_output=with_output)

        if tmux or screen:
            attach_command_parts = ["ray attach", config_file]
            if override_cluster_name is not None:
                attach_command_parts.append(
                    "--cluster-name={}".format(override_cluster_name))
            if tmux:
                attach_command_parts.append("--tmux")
            elif screen:
                attach_command_parts.append("--screen")

            attach_command = " ".join(attach_command_parts)
            attach_info = "Use `{}` to check on command status.".format(
                attach_command)
            logger.info(attach_info)
        return result
    finally:
        provider.cleanup()
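wrap_docker above delegates to Ray's with_docker_exec helper. The general idea of wrapping a shell command so it runs inside a named container can be sketched independently with shlex.quote; the helper below is hypothetical, not Ray's implementation.

from shlex import quote

def wrap_in_docker_exec(command, container_name):
    # Run the command through a shell inside the container so that
    # pipes, &&, and glob expansion still work.
    return "docker exec {} /bin/sh -c {}".format(
        quote(container_name), quote(command))

print(wrap_in_docker_exec("ray stop && ray status", "ray_container"))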
Exemple #34
0
def exec_cluster(config_file: str,
                 *,
                 cmd: Any = None,
                 run_env: str = "auto",
                 screen: bool = False,
                 tmux: bool = False,
                 stop: bool = False,
                 start: bool = False,
                 override_cluster_name: Optional[str] = None,
                 no_config_cache: bool = False,
                 port_forward: Any = None,
                 with_output: bool = False):
    """Runs a command on the specified cluster.

    Arguments:
        config_file: path to the cluster yaml
        cmd: command to run
        run_env: whether to run the command on the host or in a container.
            Select between "auto", "host" and "docker"
        screen: whether to run in a screen
        tmux: whether to run in a tmux session
        stop: whether to stop the cluster after command run
        start: whether to start the cluster if it isn't up
        override_cluster_name: set the name of the cluster
        no_config_cache: whether to bypass the local cluster config cache
        port_forward (int or list[int]): port(s) to forward
        with_output: whether to capture and return the command's output
    """
    assert not (screen and tmux), "Can specify only one of `screen` or `tmux`."
    assert run_env in RUN_ENV_TYPES, "--run_env must be in {}".format(
        RUN_ENV_TYPES)
    # TODO(rliaw): We default this to True to maintain backwards-compat.
    # In the future we would want to support disabling login-shells
    # and interactivity.
    cmd_output_util.set_allow_interactive(True)

    config = yaml.safe_load(open(config_file).read())
    if override_cluster_name is not None:
        config["cluster_name"] = override_cluster_name
    config = _bootstrap_config(config, no_config_cache=no_config_cache)

    head_node = _get_head_node(config,
                               config_file,
                               override_cluster_name,
                               create_if_needed=start)

    provider = get_node_provider(config["provider"], config["cluster_name"])
    try:
        updater = NodeUpdaterThread(node_id=head_node,
                                    provider_config=config["provider"],
                                    provider=provider,
                                    auth_config=config["auth"],
                                    cluster_name=config["cluster_name"],
                                    file_mounts=config["file_mounts"],
                                    initialization_commands=[],
                                    setup_commands=[],
                                    ray_start_commands=[],
                                    runtime_hash="",
                                    file_mounts_contents_hash="",
                                    is_head_node=True,
                                    docker_config=config.get("docker"))

        is_docker = isinstance(updater.cmd_runner, DockerCommandRunner)

        if cmd and stop:
            cmd += "; ".join([
                "ray stop",
                "ray teardown ~/ray_bootstrap_config.yaml --yes --workers-only"
            ])
            if is_docker and run_env == "docker":
                updater.cmd_runner.shutdown_after_next_cmd()
            else:
                cmd += "; sudo shutdown -h now"

        result = _exec(updater,
                       cmd,
                       screen,
                       tmux,
                       port_forward=port_forward,
                       with_output=with_output,
                       run_env=run_env)
        if tmux or screen:
            attach_command_parts = ["ray attach", config_file]
            if override_cluster_name is not None:
                attach_command_parts.append(
                    "--cluster-name={}".format(override_cluster_name))
            if tmux:
                attach_command_parts.append("--tmux")
            elif screen:
                attach_command_parts.append("--screen")

            attach_command = " ".join(attach_command_parts)
            cli_logger.print("Run `{}` to check command status.",
                             cf.bold(attach_command))

            attach_info = "Use `{}` to check on command status.".format(
                attach_command)
            cli_logger.old_info(logger, attach_info)
        return result
    finally:
        provider.cleanup()
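When --stop is passed, the shutdown sequence gets appended to the user's command. Joining all of the pieces explicitly with "; " avoids gluing the first appended command directly onto the end of the user's command. A minimal sketch (the helper name is made up):

def append_stop_commands(cmd, in_docker=False):
    parts = [
        cmd,
        "ray stop",
        "ray teardown ~/ray_bootstrap_config.yaml --yes --workers-only",
    ]
    if not in_docker:
        # Outside docker the host itself is powered off afterwards.
        parts.append("sudo shutdown -h now")
    return "; ".join(parts)

print(append_stop_commands("python train.py"))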
Exemple #35
0
def rsync(config_file: str,
          source: Optional[str],
          target: Optional[str],
          override_cluster_name: Optional[str],
          down: bool,
          no_config_cache: bool = False,
          all_nodes: bool = False):
    """Rsyncs files.

    Arguments:
        config_file: path to the cluster yaml
        source: source dir
        target: target dir
        override_cluster_name: set the name of the cluster
        down: whether we're syncing remote -> local
        all_nodes: whether to sync worker nodes in addition to the head node
    """
    if bool(source) != bool(target):
        cli_logger.abort(
            "Expected either both a source and a target, or neither.")

    assert bool(source) == bool(target), (
        "Must either provide both or neither source and target.")

    config = yaml.safe_load(open(config_file).read())
    if override_cluster_name is not None:
        config["cluster_name"] = override_cluster_name
    config = _bootstrap_config(config, no_config_cache=no_config_cache)

    is_file_mount = False
    for remote_mount in config.get("file_mounts", {}).keys():
        if remote_mount in (source if down else target):
            is_file_mount = True
            break

    provider = get_node_provider(config["provider"], config["cluster_name"])
    try:
        nodes = []
        if all_nodes:
            # technically we re-open the provider for no reason
            # in get_worker_nodes but it's cleaner this way
            # and _get_head_node does this too
            nodes = _get_worker_nodes(config, override_cluster_name)

        head_node = _get_head_node(config,
                                   config_file,
                                   override_cluster_name,
                                   create_if_needed=False)

        nodes += [head_node]

        for node_id in nodes:
            updater = NodeUpdaterThread(node_id=node_id,
                                        provider_config=config["provider"],
                                        provider=provider,
                                        auth_config=config["auth"],
                                        cluster_name=config["cluster_name"],
                                        file_mounts=config["file_mounts"],
                                        initialization_commands=[],
                                        setup_commands=[],
                                        ray_start_commands=[],
                                        runtime_hash="",
                                        file_mounts_contents_hash="",
                                        is_head_node=(node_id == head_node),
                                        docker_config=config.get("docker"))
            if down:
                rsync = updater.rsync_down
            else:
                rsync = updater.rsync_up

            if source and target:
                # print rsync progress for single file rsync
                cmd_output_util.set_output_redirected(False)
                set_rsync_silent(False)

                rsync(source, target, is_file_mount)
            else:
                updater.sync_file_mounts(rsync)

    finally:
        provider.cleanup()
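The is_file_mount flag above only asks whether the rsync destination falls under one of the configured file mounts, presumably so the updater can special-case paths it already manages as mounts. The same substring test in isolation (paths are made up):

file_mounts = {"~/project": "/home/me/project", "/data": "/mnt/data"}

def is_under_file_mount(path, mounts):
    # True if the path string contains any configured remote mount point.
    return any(remote in path for remote in mounts)

assert is_under_file_mount("~/project/src", file_mounts)
assert not is_under_file_mount("/tmp/scratch", file_mounts)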
Exemple #36
0
def teardown_cluster(config_file, yes, workers_only, override_cluster_name,
                     keep_min_workers):
    """Destroys all nodes of a Ray cluster described by a config json."""

    config = yaml.safe_load(open(config_file).read())
    if override_cluster_name is not None:
        config["cluster_name"] = override_cluster_name
    config = prepare_config(config)
    validate_config(config)

    confirm("This will destroy your cluster", yes)

    if not workers_only:
        try:
            exec_cluster(config_file,
                         cmd="ray stop",
                         run_env="auto",
                         screen=False,
                         tmux=False,
                         stop=False,
                         start=False,
                         override_cluster_name=override_cluster_name,
                         port_forward=None,
                         with_output=False)
        except Exception:
            logger.exception("Ignoring error attempting a clean shutdown.")

    provider = get_node_provider(config["provider"], config["cluster_name"])
    try:

        def remaining_nodes():

            workers = provider.non_terminated_nodes(
                {TAG_RAY_NODE_TYPE: NODE_TYPE_WORKER})

            if keep_min_workers:
                min_workers = config.get("min_workers", 0)
                logger.info("teardown_cluster: "
                            "Keeping {} nodes...".format(min_workers))
                workers = random.sample(workers, len(workers) - min_workers)

            if workers_only:
                return workers

            head = provider.non_terminated_nodes(
                {TAG_RAY_NODE_TYPE: NODE_TYPE_HEAD})

            return head + workers

        # Loop here to check that both the head and worker nodes are
        # actually gone.
        A = remaining_nodes()
        with LogTimer("teardown_cluster: done."):
            while A:
                logger.info("teardown_cluster: "
                            "Shutting down {} nodes...".format(len(A)))
                provider.terminate_nodes(A)
                time.sleep(1)
                A = remaining_nodes()
    finally:
        provider.cleanup()
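With keep_min_workers, the snippet keeps min_workers nodes by sampling the complement for termination. The arithmetic is easy to sanity-check on a toy list (the worker names are invented):

import random

workers = ["w1", "w2", "w3", "w4", "w5"]
min_workers = 2

random.seed(0)  # deterministic for the example
to_terminate = random.sample(workers, len(workers) - min_workers)

assert len(to_terminate) == 3
assert len(set(workers) - set(to_terminate)) == min_workers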
Exemple #37
0
def get_or_create_head_node(config, config_file, no_restart, restart_only, yes,
                            override_cluster_name):
    """Create the cluster head node, which in turn creates the workers."""
    provider = get_node_provider(config["provider"], config["cluster_name"])
    config_file = os.path.abspath(config_file)
    try:
        head_node_tags = {
            TAG_RAY_NODE_TYPE: NODE_TYPE_HEAD,
        }
        nodes = provider.non_terminated_nodes(head_node_tags)
        if len(nodes) > 0:
            head_node = nodes[0]
        else:
            head_node = None

        if not head_node:
            confirm("This will create a new cluster", yes)
        elif not no_restart:
            confirm("This will restart cluster services", yes)

        launch_hash = hash_launch_conf(config["head_node"], config["auth"])
        if head_node is None or provider.node_tags(head_node).get(
                TAG_RAY_LAUNCH_CONFIG) != launch_hash:
            if head_node is not None:
                confirm("Head node config out-of-date. It will be terminated",
                        yes)
                logger.info(
                    "get_or_create_head_node: "
                    "Shutting down outdated head node {}".format(head_node))
                provider.terminate_node(head_node)
            logger.info("get_or_create_head_node: Launching new head node...")
            head_node_tags[TAG_RAY_LAUNCH_CONFIG] = launch_hash
            head_node_tags[TAG_RAY_NODE_NAME] = "ray-{}-head".format(
                config["cluster_name"])
            provider.create_node(config["head_node"], head_node_tags, 1)

        start = time.time()
        head_node = None
        while True:
            if time.time() - start > 5:
                raise RuntimeError("Failed to create head node.")
            nodes = provider.non_terminated_nodes(head_node_tags)
            if len(nodes) == 1:
                head_node = nodes[0]
                break
            time.sleep(1)

        # TODO(ekl) right now we always update the head node even if the hash
        # matches. We could prompt the user for what they want to do here.
        runtime_hash = hash_runtime_conf(config["file_mounts"], config)
        logger.info("get_or_create_head_node: Updating files on head node...")

        # Rewrite the auth config so that the head node can update the workers
        remote_config = copy.deepcopy(config)
        if config["provider"]["type"] != "kubernetes":
            remote_key_path = "~/ray_bootstrap_key.pem"
            remote_config["auth"]["ssh_private_key"] = remote_key_path

        # Adjust for new file locations
        new_mounts = {}
        for remote_path in config["file_mounts"]:
            new_mounts[remote_path] = remote_path
        remote_config["file_mounts"] = new_mounts
        remote_config["no_restart"] = no_restart

        # Now inject the rewritten config and SSH key into the head node
        remote_config_file = tempfile.NamedTemporaryFile(
            "w", prefix="ray-bootstrap-")
        remote_config_file.write(json.dumps(remote_config))
        remote_config_file.flush()
        config["file_mounts"].update({
            "~/ray_bootstrap_config.yaml": remote_config_file.name
        })
        if config["provider"]["type"] != "kubernetes":
            config["file_mounts"].update({
                remote_key_path: config["auth"]["ssh_private_key"],
            })

        if restart_only:
            init_commands = []
            ray_start_commands = config["head_start_ray_commands"]
        elif no_restart:
            init_commands = config["head_setup_commands"]
            ray_start_commands = []
        else:
            init_commands = config["head_setup_commands"]
            ray_start_commands = config["head_start_ray_commands"]

        if not no_restart:
            warn_about_bad_start_command(ray_start_commands)

        updater = NodeUpdaterThread(
            node_id=head_node,
            provider_config=config["provider"],
            provider=provider,
            auth_config=config["auth"],
            cluster_name=config["cluster_name"],
            file_mounts=config["file_mounts"],
            initialization_commands=config["initialization_commands"],
            setup_commands=init_commands,
            ray_start_commands=ray_start_commands,
            runtime_hash=runtime_hash,
        )
        updater.start()
        updater.join()

        # Refresh the node cache so we see the external ip if available
        provider.non_terminated_nodes(head_node_tags)

        if config.get("provider", {}).get("use_internal_ips", False) is True:
            head_node_ip = provider.internal_ip(head_node)
        else:
            head_node_ip = provider.external_ip(head_node)

        if updater.exitcode != 0:
            logger.error("get_or_create_head_node: "
                         "Updating {} failed".format(head_node_ip))
            sys.exit(1)
        logger.info(
            "get_or_create_head_node: "
            "Head node up-to-date, IP address is: {}".format(head_node_ip))

        monitor_str = "tail -n 100 -f /tmp/ray/session_*/logs/monitor*"
        use_docker = "docker" in config and bool(
            config["docker"]["container_name"])
        if override_cluster_name:
            modifiers = " --cluster-name={}".format(
                quote(override_cluster_name))
        else:
            modifiers = ""
        print("To monitor auto-scaling activity, you can run:\n\n"
              "  ray exec {} {}{}{}\n".format(
                  config_file, "--docker " if use_docker else "",
                  quote(monitor_str), modifiers))
        print("To open a console on the cluster:\n\n"
              "  ray attach {}{}\n".format(config_file, modifiers))

        print("To get a remote shell to the cluster manually, run:\n\n"
              "  {}\n".format(updater.cmd_runner.remote_shell_command_str()))
    finally:
        provider.cleanup()
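The restart_only / no_restart branches above just partition the configured commands between the setup phase and the ray-start phase. As a standalone truth table (the helper name and command lists are placeholders):

def select_commands(config, restart_only=False, no_restart=False):
    # Returns (setup_commands, ray_start_commands) for the head updater.
    if restart_only:
        return [], config["head_start_ray_commands"]
    if no_restart:
        return config["head_setup_commands"], []
    return config["head_setup_commands"], config["head_start_ray_commands"]

config = {
    "head_setup_commands": ["pip install -U ray"],
    "head_start_ray_commands": ["ray stop", "ray start --head"],
}

assert select_commands(config, restart_only=True) == \
    ([], ["ray stop", "ray start --head"])
assert select_commands(config, no_restart=True) == \
    (["pip install -U ray"], [])
assert select_commands(config) == \
    (["pip install -U ray"], ["ray stop", "ray start --head"])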
Exemple #38
0
def exec_cluster(config_file,
                 *,
                 cmd=None,
                 run_env="auto",
                 screen=False,
                 tmux=False,
                 stop=False,
                 start=False,
                 override_cluster_name=None,
                 port_forward=None,
                 with_output=False):
    """Runs a command on the specified cluster.

    Arguments:
        config_file: path to the cluster yaml
        cmd: command to run
        run_env: whether to run the command on the host or in a container.
            Select between "auto", "host" and "docker"
        screen: whether to run in a screen
        tmux: whether to run in a tmux session
        stop: whether to stop the cluster after command run
        start: whether to start the cluster if it isn't up
        override_cluster_name: set the name of the cluster
        port_forward (int or list[int]): port(s) to forward
    """
    assert not (screen and tmux), "Can specify only one of `screen` or `tmux`."
    assert run_env in RUN_ENV_TYPES, "--run_env must be in {}".format(
        RUN_ENV_TYPES)
    config = yaml.safe_load(open(config_file).read())
    if override_cluster_name is not None:
        config["cluster_name"] = override_cluster_name
    config = _bootstrap_config(config)

    head_node = _get_head_node(config,
                               config_file,
                               override_cluster_name,
                               create_if_needed=start)

    provider = get_node_provider(config["provider"], config["cluster_name"])
    try:
        updater = NodeUpdaterThread(node_id=head_node,
                                    provider_config=config["provider"],
                                    provider=provider,
                                    auth_config=config["auth"],
                                    cluster_name=config["cluster_name"],
                                    file_mounts=config["file_mounts"],
                                    initialization_commands=[],
                                    setup_commands=[],
                                    ray_start_commands=[],
                                    runtime_hash="",
                                    docker_config=config.get("docker"))

        is_docker = isinstance(updater.cmd_runner, DockerCommandRunner)

        if cmd and stop:
            cmd += "; ".join([
                "ray stop",
                "ray teardown ~/ray_bootstrap_config.yaml --yes --workers-only"
            ])
            if is_docker and run_env == "docker":
                updater.cmd_runner.shutdown_after_next_cmd()
            else:
                cmd += "; sudo shutdown -h now"

        result = _exec(updater,
                       cmd,
                       screen,
                       tmux,
                       port_forward=port_forward,
                       with_output=with_output,
                       run_env=run_env)
        if tmux or screen:
            attach_command_parts = ["ray attach", config_file]
            if override_cluster_name is not None:
                attach_command_parts.append(
                    "--cluster-name={}".format(override_cluster_name))
            if tmux:
                attach_command_parts.append("--tmux")
            elif screen:
                attach_command_parts.append("--screen")

            attach_command = " ".join(attach_command_parts)
            attach_info = "Use `{}` to check on command status.".format(
                attach_command)
            logger.info(attach_info)
        return result
    finally:
        provider.cleanup()
Exemple #39
0
def rsync(config_file,
          source,
          target,
          override_cluster_name,
          down,
          all_nodes=False):
    """Rsyncs files.

    Arguments:
        config_file: path to the cluster yaml
        source: source dir
        target: target dir
        override_cluster_name: set the name of the cluster
        down: whether we're syncing remote -> local
        all_nodes: whether to sync worker nodes in addition to the head node
    """
    assert bool(source) == bool(target), (
        "Must either provide both or neither source and target.")

    config = yaml.safe_load(open(config_file).read())
    if override_cluster_name is not None:
        config["cluster_name"] = override_cluster_name
    config = _bootstrap_config(config)

    provider = get_node_provider(config["provider"], config["cluster_name"])
    try:
        nodes = []
        if all_nodes:
            # technically we re-open the provider for no reason
            # in get_worker_nodes but it's cleaner this way
            # and _get_head_node does this too
            nodes = _get_worker_nodes(config, override_cluster_name)

        nodes += [
            _get_head_node(
                config,
                config_file,
                override_cluster_name,
                create_if_needed=False)
        ]

        for node_id in nodes:
            updater = NodeUpdaterThread(
                node_id=node_id,
                provider_config=config["provider"],
                provider=provider,
                auth_config=config["auth"],
                cluster_name=config["cluster_name"],
                file_mounts=config["file_mounts"],
                initialization_commands=[],
                setup_commands=[],
                ray_start_commands=[],
                runtime_hash="",
            )
            if down:
                rsync = updater.rsync_down
            else:
                rsync = updater.rsync_up

            if source and target:
                rsync(source, target)
            else:
                updater.sync_file_mounts(rsync)

    finally:
        provider.cleanup()
Exemple #40
0
def get_or_create_head_node(config, no_restart):
    """Create the cluster head node, which in turn creates the workers."""

    provider = get_node_provider(config["provider"], config["cluster_name"])
    head_node_tags = {
        TAG_RAY_NODE_TYPE: "Head",
    }
    nodes = provider.nodes(head_node_tags)
    if len(nodes) > 0:
        head_node = nodes[0]
    else:
        head_node = None

    if not head_node:
        confirm("This will create a new cluster")
    elif not no_restart:
        confirm("This will restart your cluster")

    launch_hash = hash_launch_conf(config["head_node"], config["auth"])
    if head_node is None or provider.node_tags(head_node).get(
            TAG_RAY_LAUNCH_CONFIG) != launch_hash:
        if head_node is not None:
            print("Terminating outdated head node {}".format(head_node))
            provider.terminate_node(head_node)
        print("Launching new head node...")
        head_node_tags[TAG_RAY_LAUNCH_CONFIG] = launch_hash
        head_node_tags[TAG_NAME] = "ray-{}-head".format(config["cluster_name"])
        provider.create_node(config["head_node"], head_node_tags, 1)

    nodes = provider.nodes(head_node_tags)
    assert len(nodes) == 1, "Failed to create head node."
    head_node = nodes[0]

    # TODO(ekl) right now we always update the head node even if the hash
    # matches. We could prompt the user for what they want to do in this case.
    runtime_hash = hash_runtime_conf(config["file_mounts"], config)
    print("Updating files on head node...")

    # Rewrite the auth config so that the head node can update the workers
    remote_key_path = "~/ray_bootstrap_key.pem"
    remote_config = copy.deepcopy(config)
    remote_config["auth"]["ssh_private_key"] = remote_key_path

    # Adjust for new file locations
    new_mounts = {}
    for remote_path in config["file_mounts"]:
        new_mounts[remote_path] = remote_path
    remote_config["file_mounts"] = new_mounts
    remote_config["no_restart"] = no_restart

    # Now inject the rewritten config and SSH key into the head node
    remote_config_file = tempfile.NamedTemporaryFile("w",
                                                     prefix="ray-bootstrap-")
    remote_config_file.write(json.dumps(remote_config))
    remote_config_file.flush()
    config["file_mounts"].update({
        remote_key_path:
        config["auth"]["ssh_private_key"],
        "~/ray_bootstrap_config.yaml":
        remote_config_file.name
    })

    if no_restart:
        init_commands = (config["setup_commands"] +
                         config["head_setup_commands"])
    else:
        init_commands = (config["setup_commands"] +
                         config["head_setup_commands"] +
                         config["head_start_ray_commands"])

    updater = NodeUpdaterProcess(head_node,
                                 config["provider"],
                                 config["auth"],
                                 config["cluster_name"],
                                 config["file_mounts"],
                                 init_commands,
                                 runtime_hash,
                                 redirect_output=False)
    updater.start()
    updater.join()

    # Refresh the node cache so we see the external ip if available
    provider.nodes(head_node_tags)

    if updater.exitcode != 0:
        print("Error: updating {} failed".format(
            provider.external_ip(head_node)))
        sys.exit(1)
    print("Head node up-to-date, IP address is: {}".format(
        provider.external_ip(head_node)))
    print("To monitor auto-scaling activity, you can run:\n\n"
          "  ssh -i {} {}@{} 'tail -f /tmp/raylogs/monitor-*'\n".format(
              config["auth"]["ssh_private_key"], config["auth"]["ssh_user"],
              provider.external_ip(head_node)))
    print("To login to the cluster, run:\n\n"
          "  ssh -i {} {}@{}\n".format(config["auth"]["ssh_private_key"],
                                       config["auth"]["ssh_user"],
                                       provider.external_ip(head_node)))
Exemple #41
0
def get_or_create_head_node(config, config_file, no_restart, restart_only, yes,
                            override_cluster_name):
    """Create the cluster head node, which in turn creates the workers."""

    provider = get_node_provider(config["provider"], config["cluster_name"])
    head_node_tags = {
        TAG_RAY_NODE_TYPE: "head",
    }
    nodes = provider.nodes(head_node_tags)
    if len(nodes) > 0:
        head_node = nodes[0]
    else:
        head_node = None

    if not head_node:
        confirm("This will create a new cluster", yes)
    elif not no_restart:
        confirm("This will restart cluster services", yes)

    launch_hash = hash_launch_conf(config["head_node"], config["auth"])
    if head_node is None or provider.node_tags(head_node).get(
            TAG_RAY_LAUNCH_CONFIG) != launch_hash:
        if head_node is not None:
            confirm("Head node config out-of-date. It will be terminated", yes)
            logger.info("Terminating outdated head node {}".format(head_node))
            provider.terminate_node(head_node)
        logger.info("Launching new head node...")
        head_node_tags[TAG_RAY_LAUNCH_CONFIG] = launch_hash
        head_node_tags[TAG_RAY_NODE_NAME] = "ray-{}-head".format(
            config["cluster_name"])
        provider.create_node(config["head_node"], head_node_tags, 1)

    nodes = provider.nodes(head_node_tags)
    assert len(nodes) == 1, "Failed to create head node."
    head_node = nodes[0]

    # TODO(ekl) right now we always update the head node even if the hash
    # matches. We could prompt the user for what they want to do in this case.
    runtime_hash = hash_runtime_conf(config["file_mounts"], config)
    logger.info("Updating files on head node...")

    # Rewrite the auth config so that the head node can update the workers
    remote_key_path = "~/ray_bootstrap_key.pem"
    remote_config = copy.deepcopy(config)
    remote_config["auth"]["ssh_private_key"] = remote_key_path

    # Adjust for new file locations
    new_mounts = {}
    for remote_path in config["file_mounts"]:
        new_mounts[remote_path] = remote_path
    remote_config["file_mounts"] = new_mounts
    remote_config["no_restart"] = no_restart

    # Now inject the rewritten config and SSH key into the head node
    remote_config_file = tempfile.NamedTemporaryFile("w",
                                                     prefix="ray-bootstrap-")
    remote_config_file.write(json.dumps(remote_config))
    remote_config_file.flush()
    config["file_mounts"].update({
        remote_key_path:
        config["auth"]["ssh_private_key"],
        "~/ray_bootstrap_config.yaml":
        remote_config_file.name
    })

    if restart_only:
        init_commands = config["head_start_ray_commands"]
    elif no_restart:
        init_commands = (config["setup_commands"] +
                         config["head_setup_commands"])
    else:
        init_commands = (config["setup_commands"] +
                         config["head_setup_commands"] +
                         config["head_start_ray_commands"])

    updater = NodeUpdaterProcess(head_node,
                                 config["provider"],
                                 config["auth"],
                                 config["cluster_name"],
                                 config["file_mounts"],
                                 init_commands,
                                 runtime_hash,
                                 redirect_output=False)
    updater.start()
    updater.join()

    # Refresh the node cache so we see the external ip if available
    provider.nodes(head_node_tags)

    if updater.exitcode != 0:
        logger.error("Updating {} failed".format(
            provider.external_ip(head_node)))
        sys.exit(1)
    logger.info("Head node up-to-date, IP address is: {}".format(
        provider.external_ip(head_node)))

    monitor_str = "tail -n 100 -f /tmp/ray/session_*/logs/monitor*"
    for s in init_commands:
        if ("ray start" in s and "docker exec" in s
                and "--autoscaling-config" in s):
            monitor_str = "docker exec {} /bin/sh -c {}".format(
                config["docker"]["container_name"], quote(monitor_str))
    if override_cluster_name:
        modifiers = " --cluster-name={}".format(quote(override_cluster_name))
    else:
        modifiers = ""
    print("To monitor auto-scaling activity, you can run:\n\n"
          "  ray exec {} {}{}\n".format(config_file, quote(monitor_str),
                                        modifiers))
    print("To open a console on the cluster:\n\n"
          "  ray attach {}{}\n".format(config_file, modifiers))
    print("To ssh manually to the cluster, run:\n\n"
          "  ssh -i {} {}@{}\n".format(config["auth"]["ssh_private_key"],
                                       config["auth"]["ssh_user"],
                                       provider.external_ip(head_node)))
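The monitor hint printed above is a tail over the autoscaler's log, optionally wrapped in a docker exec when the start command ran inside a container, and quoted so that ray exec receives it as a single argument. A small sketch of that assembly (the container name is arbitrary):

from shlex import quote

monitor_str = "tail -n 100 -f /tmp/ray/session_*/logs/monitor*"
container_name = "ray_container"  # placeholder
use_docker = True

if use_docker:
    # Run the tail inside the container, keeping the glob intact.
    monitor_str = "docker exec {} /bin/sh -c {}".format(
        container_name, quote(monitor_str))

print("ray exec cluster.yaml {}".format(quote(monitor_str)))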
Exemple #42
0
def exec_cluster(config_file, cmd, screen, tmux, stop, start,
                 override_cluster_name, port_forward):
    """Runs a command on the specified cluster.

    Arguments:
        config_file: path to the cluster yaml
        cmd: command to run
        screen: whether to run in a screen
        tmux: whether to run in a tmux session
        stop: whether to stop the cluster after command run
        start: whether to start the cluster if it isn't up
        override_cluster_name: set the name of the cluster
        port_forward: port to forward
    """
    assert not (screen and tmux), "Can specify only one of `screen` or `tmux`."

    config = yaml.safe_load(open(config_file).read())
    if override_cluster_name is not None:
        config["cluster_name"] = override_cluster_name
    config = _bootstrap_config(config)
    head_node = _get_head_node(config,
                               config_file,
                               override_cluster_name,
                               create_if_needed=start)

    provider = get_node_provider(config["provider"], config["cluster_name"])
    try:
        updater = NodeUpdaterThread(
            head_node,
            config["provider"],
            provider,
            config["auth"],
            config["cluster_name"],
            config["file_mounts"],
            [],
            "",
        )
        if stop:
            cmd += (
                "; ray stop; ray teardown ~/ray_bootstrap_config.yaml --yes "
                "--workers-only; sudo shutdown -h now")
        _exec(updater,
              cmd,
              screen,
              tmux,
              expect_error=stop,
              port_forward=port_forward)

        if tmux or screen:
            attach_command_parts = ["ray attach", config_file]
            if override_cluster_name is not None:
                attach_command_parts.append(
                    "--cluster-name={}".format(override_cluster_name))
            if tmux:
                attach_command_parts.append("--tmux")
            elif screen:
                attach_command_parts.append("--screen")

            attach_command = " ".join(attach_command_parts)
            attach_info = "Use `{}` to check on command status.".format(
                attach_command)
            logger.info(attach_info)
    finally:
        provider.cleanup()