Ejemplo n.º 1
0
def test_ray_debugger_public_multi_node(shutdown_only, ray_debugger_external):
    """Check that breakpoints hit on two different nodes can both be resumed.

    A head node (0 CPUs, 1 GPU) and a worker node (1 CPU) are started, one
    task containing a breakpoint is scheduled on each, and the test then
    connects to both advertised debugger addresses and sends ``c``.
    """
    head_node_args = {
        "num_cpus": 0,
        "num_gpus": 1,
        "ray_debugger_external": ray_debugger_external,
    }
    cluster = Cluster(
        initialize_head=True, connect=True, head_node_args=head_node_args)
    cluster.add_node(num_cpus=1, ray_debugger_external=ray_debugger_external)

    @ray.remote
    def f():
        ray.util.pdb.set_trace()
        return 1

    def list_sessions():
        # Keys of all debugger sessions currently registered in the KV store.
        return ray.experimental.internal_kv._internal_kv_list(
            "RAY_PDB_", namespace=ray_constants.KV_NAMESPACE_PDB)

    # num_gpus=1 forces the task onto the head node.
    head_node_result = f.options(num_cpus=0, num_gpus=1).remote()

    # num_cpus=1 forces the task onto the worker node.
    worker_node_result = f.options(num_cpus=1).remote()

    wait_for_condition(lambda: len(list_sessions()) == 2)

    active_sessions = list_sessions()
    assert len(active_sessions) == 2
    sessions = [
        json.loads(
            ray.experimental.internal_kv._internal_kv_get(
                session_key, namespace=ray_constants.KV_NAMESPACE_PDB))
        for session_key in active_sessions
    ]

    # Validate the advertised host of every session before connecting.
    endpoints = []
    for session in sessions:
        host, port = session["pdb_address"].split(":")
        if ray_debugger_external:
            assert host == services.get_node_ip_address(), host
        else:
            assert host == "localhost", host
        endpoints.append((host, port))

    # Check that we can successfully connect to both breakpoints. The
    # connections are kept referenced so they stay open until the test ends.
    connections = []
    for host, port in endpoints:
        connection = Telnet(host, int(port))
        connection.write(b"c\n")
        connections.append(connection)

    # The messages above should cause these to return now.
    ray.get([head_node_result, worker_node_result])
Ejemplo n.º 2
0
def with_head_node_ip(cmds, head_ip=None):
    """Prefix every command with an export of ``RAY_HEAD_IP``.

    Args:
        cmds: Iterable of shell command strings.
        head_ip: IP to export. When ``None``, the value returned by
            ``services.get_node_ip_address()`` is used.

    Returns:
        A new list with one prefixed command per input command.
    """
    ip = services.get_node_ip_address() if head_ip is None else head_ip
    template = "export RAY_HEAD_IP={}; {}"
    return [template.format(ip, cmd) for cmd in cmds]
Ejemplo n.º 3
0
def run_kuberay_autoscaler(cluster_name: str, cluster_namespace: str):
    """Block until the Ray head passes a health check, then run the autoscaler.

    Args:
        cluster_name: Passed to ``AutoscalingConfigProducer``.
        cluster_namespace: Passed to ``AutoscalingConfigProducer``.
    """
    _setup_logging()
    head_ip = get_node_ip_address()
    ray_address = f"{head_ip}:6379"
    health_check_cmd = ["ray", "health-check", "--address", ray_address]

    # Poll the head with `ray health-check` until it exits successfully.
    head_ready = False
    while not head_ready:
        try:
            subprocess.check_call(health_check_cmd)
        except subprocess.CalledProcessError:
            logger.warning("The Ray head is not yet ready.")
            logger.warning(f"Will check again in {BACKOFF_S} seconds.")
            time.sleep(BACKOFF_S)
        else:
            logger.info("The Ray head is ready. Starting the autoscaler.")
            head_ready = True

    # The producer reads the RayCluster CR from K8s and turns it into an
    # autoscaling config.
    autoscaling_config_producer = AutoscalingConfigProducer(
        cluster_name, cluster_namespace)

    # `autoscaling_config` accepts a dict or a `Callable: () -> dict`; here a
    # callable is passed.
    monitor = Monitor(
        address=ray_address,
        autoscaling_config=autoscaling_config_producer,
        monitor_ip=head_ip,
    )
    monitor.run()
Ejemplo n.º 4
0
 def __init__(self, logs_dir, gcs_address):
     """Initialize the log monitor object.

     Args:
         logs_dir: Stored as ``self.logs_dir``.
         gcs_address: Address handed to ``gcs_pubsub.GcsPublisher``.
     """
     # Identity of this node and where its logs live.
     self.ip = services.get_node_ip_address()
     self.logs_dir = logs_dir

     # Publisher bound to the given GCS address.
     publisher = gcs_pubsub.GcsPublisher(address=gcs_address)
     self.publisher = publisher

     # File bookkeeping starts out empty.
     self.log_filenames = set()
     self.open_file_infos = []
     self.closed_file_infos = []
     self.can_open_more_files = True
Ejemplo n.º 5
0
 def __init__(self, local_ip=None):
     """Create empty per-IP bookkeeping tables.

     Args:
         local_ip: IP stored as ``self.local_ip``. When ``None``, the value
             of ``services.get_node_ip_address()`` is used instead.
     """
     # Per-IP tables, all initially empty.
     self.last_used_time_by_ip = {}
     self.last_heartbeat_time_by_ip = {}
     self.static_resources_by_ip = {}
     self.dynamic_resources_by_ip = {}
     self.resource_load_by_ip = {}

     if local_ip is None:
         local_ip = services.get_node_ip_address()
     self.local_ip = local_ip

     self.waiting_bundles = []
     self.infeasible_bundles = []
Ejemplo n.º 6
0
 def _start_ray_node(self, command, tag):
     """Run ``command`` to start a Ray node and return its process info.

     The spawned PIDs are registered with ``JVMGuard`` and the returned
     object is annotated with this node's IP as ``node_ip``.
     """
     env = self._prepare_env()
     print("Starting {} by running: {}".format(tag, command))
     info = session_execute(command=command, env=env, tag=tag)
     JVMGuard.register_pids(info.pids)

     # Imported here, as in the original code, rather than at module level.
     import ray._private.services as rservices
     info.node_ip = rservices.get_node_ip_address()
     return info
Ejemplo n.º 7
0
 def __init__(self, logs_dir, redis_address, redis_password=None):
     """Initialize the log monitor object.

     Args:
         logs_dir: Stored as ``self.logs_dir``.
         redis_address: Address used to create the Redis client.
         redis_password: Optional password for the Redis client.
     """
     # Identity of this node and where its logs live.
     self.ip = services.get_node_ip_address()
     self.logs_dir = logs_dir

     redis_client = ray._private.services.create_redis_client(
         redis_address, password=redis_password)
     self.redis_client = redis_client

     # File bookkeeping starts out empty.
     self.log_filenames = set()
     self.open_file_infos = []
     self.closed_file_infos = []
     self.can_open_more_files = True
Ejemplo n.º 8
0
 def _start_ray_node(self, command, tag):
     """Run ``command`` to start a Ray node and return its process info.

     After launching, a Ray daemon is started that watches the Spark
     executor PID and is given the new process group id to kill. The
     returned object is annotated with this node's IP as ``node_ip``.
     """
     env = self._prepare_env()
     print("Starting {} by running: {}".format(tag, command))
     info = session_execute(command=command, env=env, tag=tag)

     executor_pid = RayServiceFuncGenerator._get_spark_executor_pid()
     RayServiceFuncGenerator.start_ray_daemon(
         self.python_loc,
         pid_to_watch=executor_pid,
         pgid_to_kill=info.pgid)

     # Imported here, as in the original code, rather than at module level.
     import ray._private.services as rservices
     info.node_ip = rservices.get_node_ip_address()
     return info
Ejemplo n.º 9
0
 def __init__(self, local_ip=None):
     """Create empty bookkeeping tables and request queues.

     Args:
         local_ip: IP stored as ``self.local_ip``. When ``None``, the value
             of ``services.get_node_ip_address()`` is used instead.
     """
     # Per-IP tables, all initially empty.
     self.last_used_time_by_ip = {}
     self.last_heartbeat_time_by_ip = {}
     self.static_resources_by_ip = {}
     self.dynamic_resources_by_ip = {}
     self.resource_load_by_ip = {}

     if local_ip is None:
         local_ip = services.get_node_ip_address()
     self.local_ip = local_ip

     # Outstanding demand, initially empty.
     self.waiting_bundles = []
     self.infeasible_bundles = []
     self.pending_placement_groups = []
     self.resource_requests = []
     self.cluster_full_of_actors_detected = False
Ejemplo n.º 10
0
 async def _run_app(self):
     """Start the HTTP app on this node's IP, port 8080, and signal readiness.

     Registers the discover (GET) and hints (PUT) routes, starts a TCP site
     through an ``AppRunner`` stored on ``self._runner``, then sets
     ``self._ready``.
     """
     port = 8080
     app = web.Application()
     routes = [
         web.get('/discover/{namespace}/{name}/{group}',
                 self._handle_discover),
         web.put('/hints/{namespace}/{name}', self._handle_report),
     ]
     app.add_routes(routes)

     self._runner = web.AppRunner(app)
     await self._runner.setup()

     host = services.get_node_ip_address()
     site = web.TCPSite(self._runner, host, port)
     await site.start()

     # Let waiters know the server is accepting requests.
     self._ready.set()
Ejemplo n.º 11
0
 def __init__(self, logs_dir, redis_address, redis_password=None):
     """Initialize the log monitor object.

     Args:
         logs_dir: Stored as ``self.logs_dir``.
         redis_address: Address used to create the Redis client.
         redis_password: Optional password for the Redis client.
     """
     # Identity of this node and where its logs live.
     self.ip = services.get_node_ip_address()
     self.logs_dir = logs_dir

     redis_client = ray._private.services.create_redis_client(
         redis_address, password=redis_password)
     self.redis_client = redis_client

     # A GCS publisher is only created when GCS pubsub is enabled; its
     # address is looked up through the Redis client.
     self.publisher = None
     if gcs_pubsub.gcs_pubsub_enabled():
         gcs_addr = gcs_utils.get_gcs_address_from_redis(self.redis_client)
         self.publisher = gcs_pubsub.GcsPublisher(address=gcs_addr)

     # File bookkeeping starts out empty.
     self.log_filenames = set()
     self.open_file_infos = []
     self.closed_file_infos = []
     self.can_open_more_files = True
Ejemplo n.º 12
0
 def _shutdown_per_node(iter):
     """Send SIGTERM to the process groups that belong to this node.

     ``pgids`` and ``node_ips`` come from the enclosing scope. When
     ``node_ips`` is truthy, only the pgids paired with the current node's IP
     are signalled; otherwise every pgid is. Failures to kill a group are
     reported with a warning and do not stop the loop. The ``iter`` argument
     is not used.
     """
     print("Stopping pgids: {}".format(pgids))
     if not node_ips:
         effect_pgids = pgids
     else:
         current_node_ip = rservices.get_node_ip_address()
         effect_pgids = [
             candidate for candidate, node_ip in zip(pgids, node_ips)
             if node_ip == current_node_ip
         ]

     for pgid in effect_pgids:
         print("Stopping by pgid {}".format(pgid))
         try:
             os.killpg(pgid, signal.SIGTERM)
         except Exception:
             # Best effort: keep going with the remaining process groups.
             print("WARNING: cannot kill pgid: {}".format(pgid))
Ejemplo n.º 13
0
def test_ray_debugger_public(shutdown_only, call_ray_stop_only,
                             ray_debugger_external):
    """Check that a breakpoint in a remote task is reachable and resumable.

    Starts a head node via ``ray start`` (optionally with
    ``--ray-debugger-external``), connects to it, runs one task that hits a
    breakpoint, verifies the advertised debugger host, and resumes the task
    over telnet.
    """
    cmd = ["ray", "start", "--head", "--num-cpus=1"]
    if ray_debugger_external:
        cmd.append("--ray-debugger-external")
    out = ray._private.utils.decode(
        subprocess.check_output(cmd, stderr=subprocess.STDOUT))

    # Get the redis address from the output. Fail right here, with the
    # command output attached, if the marker is missing; otherwise
    # `str.find` returning -1 would silently yield a bogus address.
    redis_substring_prefix = "--address='"
    prefix_location = out.find(redis_substring_prefix)
    assert prefix_location != -1, out
    address_location = prefix_location + len(redis_substring_prefix)
    address = out[address_location:]
    address = address.split("'")[0]

    ray.init(address=address)

    @ray.remote
    def f():
        ray.util.pdb.set_trace()
        return 1

    result = f.remote()

    wait_for_condition(lambda: len(
        ray.experimental.internal_kv._internal_kv_list(
            "RAY_PDB_", namespace=ray_constants.KV_NAMESPACE_PDB)) > 0)

    active_sessions = ray.experimental.internal_kv._internal_kv_list(
        "RAY_PDB_", namespace=ray_constants.KV_NAMESPACE_PDB)
    assert len(active_sessions) == 1
    session = json.loads(
        ray.experimental.internal_kv._internal_kv_get(
            active_sessions[0], namespace=ray_constants.KV_NAMESPACE_PDB))

    host, port = session["pdb_address"].split(":")
    if ray_debugger_external:
        assert host == services.get_node_ip_address(), host
    else:
        assert host == "localhost", host

    # Check that we can successfully connect to the breakpoint.
    tn = Telnet(host, int(port))
    tn.write(b"c\n")

    # The message above should cause this to return now.
    ray.get(result)
Ejemplo n.º 14
0
def _run_autoscaler(cluster_name: str,
                    cluster_namespace: str,
                    redis_password: str = ""):
    """Set up logging and run the autoscaler monitor against the local head.

    Args:
        cluster_name: Passed to ``AutoscalingConfigProducer``.
        cluster_namespace: Passed to ``AutoscalingConfigProducer``.
        redis_password: Forwarded to ``Monitor``.
    """
    _setup_logging()
    head_ip = get_node_ip_address()

    config_producer = AutoscalingConfigProducer(cluster_name,
                                                cluster_namespace)

    # `autoscaling_config` accepts a dict or a `Callable: () -> dict`; here a
    # callable is passed.
    monitor = Monitor(
        address=f"{head_ip}:6379",
        redis_password=redis_password,
        autoscaling_config=config_producer,
        monitor_ip=head_ip,
    )
    monitor.run()
Ejemplo n.º 15
0
    def _process_events(self):
        """Handle one event: a failed trial, or the next available trial.

        A failed trial is processed first if there is one. Otherwise the
        next available trial is fetched (blocking) and dispatched to the
        restore, save, or regular result path depending on its state.
        """
        failed_trial = self.trial_executor.get_next_failed_trial()
        if failed_trial:
            error_msg = (
                "{} (IP: {}) detected as stale. This is likely because the "
                "node was lost").format(failed_trial, failed_trial.node_ip)
            logger.info(error_msg)
            with warn_if_slow("process_failed_trial"):
                self._process_trial_failure(failed_trial, error_msg=error_msg)
            return

        # TODO(ujvl): Consider combining get_next_available_trial and
        #  fetch_result functionality so that we don't timeout on fetch.
        trial = self.trial_executor.get_next_available_trial()  # blocking

        if trial.is_restoring:
            with warn_if_slow("process_trial_restore"):
                self._process_trial_restore(trial)
            with warn_if_slow("callbacks.on_trial_restore"):
                self._callbacks.on_trial_restore(
                    iteration=self._iteration,
                    trials=self._trials,
                    trial=trial)
            return

        if not trial.is_saving:
            with warn_if_slow("process_trial"):
                self._process_trial(trial)
            return

        with warn_if_slow("process_trial_save") as profile:
            self._process_trial_save(trial)
        with warn_if_slow("callbacks.on_trial_save"):
            self._callbacks.on_trial_save(
                iteration=self._iteration, trials=self._trials, trial=trial)

        # TODO(ujvl): Suggest using DurableTrainable once
        #  API has converged.
        # Warn (once) when a slow, forced checkpoint sync happened for a
        # trial whose recorded hostname differs from this node's IP.
        if (profile.too_slow and trial.sync_on_checkpoint
                and trial.location.hostname
                and trial.location.hostname != get_node_ip_address()
                and log_once("tune_head_worker_checkpoint")):
            logger.warning(
                "Consider turning off forced head-worker trial "
                "checkpoint syncs by setting sync_on_checkpoint=False"
                ". Note that this may result in faulty trial "
                "restoration if a failure occurs while the checkpoint "
                "is being synced from the worker to the head node.")
Ejemplo n.º 16
0
 def __init__(
     self,
     logs_dir,
     gcs_publisher: gcs_pubsub.GcsPublisher,
     is_proc_alive_fn: Callable[[int], bool],
     max_files_open: int = ray_constants.LOG_MONITOR_MAX_OPEN_FILES,
 ):
     """Initialize the log monitor object.

     Args:
         logs_dir: Stored as ``self.logs_dir``.
         gcs_publisher: Publisher stored as ``self.publisher``.
         is_proc_alive_fn: Callable taking an int and returning a bool,
             stored as ``self.is_proc_alive_fn``.
         max_files_open: Stored as ``self.max_files_open``.
     """
     # Identity of this node and where its logs live.
     self.ip: str = services.get_node_ip_address()
     self.logs_dir: str = logs_dir

     # Collaborators and limits supplied by the caller.
     self.publisher = gcs_publisher
     self.is_proc_alive_fn: Callable[[int], bool] = is_proc_alive_fn
     self.max_files_open: int = max_files_open

     # File bookkeeping starts out empty.
     self.log_filenames: Set[str] = set()
     self.open_file_infos: List[LogFileInfo] = []
     self.closed_file_infos: List[LogFileInfo] = []
     self.can_open_more_files: bool = True
Ejemplo n.º 17
0
def run_kuberay_autoscaler(cluster_name: str, cluster_namespace: str):
    """Block until the Ray head passes a health check, then run the autoscaler.

    Args:
        cluster_name: Passed to ``AutoscalingConfigProducer``.
        cluster_namespace: Passed to ``AutoscalingConfigProducer``.
    """
    head_ip = get_node_ip_address()
    ray_address = f"{head_ip}:6379"
    # Autoscaler Ray version might not exactly match GCS version, so skip the
    # version check when checking GCS status.
    health_check_cmd = [
        "ray",
        "health-check",
        "--address",
        ray_address,
        "--skip-version-check",
    ]

    head_ready = False
    while not head_ready:
        try:
            subprocess.check_call(health_check_cmd)
        except subprocess.CalledProcessError:
            print("The Ray head is not yet ready.")
            print(f"Will check again in {BACKOFF_S} seconds.")
            time.sleep(BACKOFF_S)
        else:
            # Logging is not ready yet. Print to stdout for now.
            print("The Ray head is ready. Starting the autoscaler.")
            head_ready = True

    # The Ray head container sets up the log directory. Thus, we set up logging
    # only after the Ray head is ready.
    _setup_logging()

    # The producer reads the RayCluster CR from K8s and turns it into an
    # autoscaling config.
    autoscaling_config_producer = AutoscalingConfigProducer(
        cluster_name, cluster_namespace
    )

    monitor = Monitor(
        address=ray_address,
        # The `autoscaling_config` arg can be a dict or a `Callable: () -> dict`.
        # In this case, it's a callable.
        autoscaling_config=autoscaling_config_producer,
        monitor_ip=head_ip,
        # Let the autoscaler process exit after it hits 5 exceptions.
        # (See ray.autoscaler._private.constants.AUTOSCALER_MAX_NUM_FAILURES.)
        # Kubernetes will then restart the autoscaler container.
        retry_on_failure=False,
    )
    monitor.run()
Ejemplo n.º 18
0
def listen_for_spot_termination(timeout=None):
    """Poll the spot instance-action endpoint until a termination is seen.

    When the ``MOCK`` environment variable equals "true" (case-insensitive),
    a mock endpoint on this node's IP, port 8234, is polled instead of the
    AWS metadata endpoint.

    Args:
        timeout: Optional number of seconds after which polling gives up.
            A falsy value (``None`` or ``0``) disables the timeout.

    Returns:
        The IP from ``services.get_node_ip_address()`` once the endpoint
        reports a "terminate" or "stop" action, or ``None`` if the timeout
        elapsed first.

    Raises:
        RuntimeError: If the endpoint answers with a status code that is
            neither 404 nor 2xx.
    """
    use_mock = os.environ.get("MOCK", "False").lower() == "true"
    logging.basicConfig(level=logging.INFO)

    if use_mock:
        logging.debug("Using mocked spot instance")
        endpoint = f"{services.get_node_ip_address()}:8234"
    else:
        # AWS spot instance termination endpoint
        endpoint = "169.254.169.254"

    url = f'http://{endpoint}/latest/meta-data/spot/instance-action'
    start = time.time()

    while True:
        try:
            resp = requests.get(url, timeout=0.1)
            if resp.status_code == 404:
                # AWS endpoint responded, no termination detected
                time.sleep(5)
            elif 200 <= resp.status_code < 300:
                action = resp.json()["action"]
                if action in ("terminate", "stop"):
                    ip = services.get_node_ip_address()
                    logging.info(f"termination detected on node {ip}")
                    return ip
            else:
                raise RuntimeError(
                    "AWS spot instance interrupt warning "
                    "endpoint not responding")
        except requests.RequestException as e:
            logging.error(e)
            time.sleep(5)

        # Checked on every iteration, including after a failed request, so
        # an unreachable endpoint cannot make the function outlive `timeout`.
        if timeout and time.time() - start > timeout:
            return None
Ejemplo n.º 19
0
def run_adaptdl(job_key, job_uid, rank, replicas,
                num_restarts, checkpoint=None, offset=0, path="", argv=None):
    """Prepare the AdaptDL environment for one worker and run the user script.

    The function exports the ``ADAPTDL_*`` environment variables, creates
    the per-worker checkpoint and share directories under ``/tmp``,
    registers the worker with the ``AdaptDLController`` actor, and then
    executes the file at ``path`` as ``__main__``.

    Args:
        job_key: Value exported as ``ADAPTDL_JOB_ID``.
        job_uid: Used, together with ``rank``, to name the temp directories.
        rank: Replica rank of this worker; rank 0 reports success and
            uploads the checkpoint on ``SystemExit``.
        replicas: Value exported as ``ADAPTDL_NUM_REPLICAS``.
        num_restarts: Restart counter; also added to the master port.
        checkpoint: Optional checkpoint object written into the checkpoint
            directory before the script runs.
        offset: Extra offset added to the master port.
        path: Path of the Python file to execute.
        argv: Optional extra arguments appended to ``sys.argv``.

    Raises:
        Exception: Any exception from setup or from the executed script
            (other than ``SystemExit``) is re-raised after the FAILED status
            has been reported.
    """
    logging.basicConfig(level=logging.INFO)
    logging.info(f"Starting worker {rank}")

    def report_status(status):
        # `controller` is resolved from the enclosing scope at call time; it
        # is assigned just below, before this helper is ever called.
        status_obj_ref = ray.put(status.value)
        controller.register_status.remote(status_obj_ref)

    controller = ray.get_actor("AdaptDLController")
    supervisor_url = ray.get(controller.get_url.remote())

    # Export the configuration as environment variables before the user
    # script is executed further down.
    os.environ["ADAPTDL_MASTER_PORT"] = str(47000 + num_restarts + offset)
    os.environ["ADAPTDL_REPLICA_RANK"] = str(rank)
    os.environ["ADAPTDL_NUM_REPLICAS"] = str(replicas)
    os.environ["ADAPTDL_SUPERVISOR_URL"] = supervisor_url
    os.environ["ADAPTDL_JOB_ID"] = job_key
    os.environ["ADAPTDL_NUM_RESTARTS"] = str(num_restarts)
    os.environ["ADAPTDL_SCHED_VERSION"] = str(
        pkg_resources.get_distribution("adaptdl").version)
    suffix = f"{job_uid}-{rank}"
    checkpoint_path = f"/tmp/checkpoint-{suffix}"

    try:
        # Always start from an empty checkpoint directory.
        if os.path.exists(checkpoint_path):
            import shutil
            shutil.rmtree(checkpoint_path)
        os.mkdir(checkpoint_path)
        if checkpoint:
            _checkpoint_obj_to_dir(checkpoint_path, checkpoint)
        # NOTE(review): this conversion happens after `num_restarts` was
        # already used in arithmetic above - confirm whether it is needed.
        num_restarts = int(num_restarts)
        os.environ["ADAPTDL_CHECKPOINT_PATH"] = str(checkpoint_path)
        share_path = f"/tmp/share-{suffix}"
        if not os.path.exists(share_path):
            os.mkdir(share_path)
        os.environ["ADAPTDL_SHARE_PATH"] = str(share_path)

        # Tell the controller which rank runs on which node IP.
        rank_obj_ref = ray.put(rank)
        ip_obj_ref = ray.put(services.get_node_ip_address())
        controller.register_worker.remote(rank_obj_ref, ip_obj_ref)
    except Exception as e:
        # Setup failed: log, report FAILED to the controller, and re-raise.
        logging.info(traceback.format_exc())
        time.sleep(5)
        report_status(Status.FAILED)
        raise e

    # TODO: replace with block
    try:
        filename = Path(path).name
        sys.argv = [filename]
        if argv:
            # Need to augment the argv to mimic that file being called
            sys.argv += argv
        # Load and execute the user script under the module name "__main__".
        spec = importlib.util.spec_from_file_location("__main__", path)
        module = importlib.util.module_from_spec(spec)
        # TODO: fix imports when caller module is not in the root path
        spec.loader.exec_module(module)
        time.sleep(5)

    except SystemExit:
        # Received a cancel from the controller -- the job is being rescheduled
        # Worker 0 needs to send the checkpoint back to the controller so the
        # next generation of workers can resume
        logging.info(f"Worker {rank} received system exit")
        if rank == 0:
            checkpoint_obj = _serialize_checkpoint(checkpoint_path)
            logging.info("checkpoint created")
            checkpoint_obj_ref = ray.put(checkpoint_obj)
            logging.info("checkpoint placed")
            result = ray.get(
                controller.register_checkpoint.remote(checkpoint_obj_ref))
            logging.info(f"checkpoint registered: {result}")
        # This sleep is to keep this remote task alive
        # until its worker object can be killed by the controller
        time.sleep(1800)

    except Exception as e:
        # The script failed: log, report FAILED to the controller, re-raise.
        logging.error(traceback.format_exc())
        logging.error(e)
        time.sleep(5)
        report_status(Status.FAILED)
        raise e
    else:
        # The script finished without raising; only rank 0 reports success.
        if rank == 0:
            logging.info("Job succeeded, exiting")
            time.sleep(5)
            report_status(Status.SUCCEEDED)
            time.sleep(5)
Ejemplo n.º 20
0
 def _invalid_nodes(self):
     """Return the terminating nodes together with this node's own IP."""
     local_node = {services.get_node_ip_address()}
     return self._terminating_nodes.union(local_node)
Ejemplo n.º 21
0
def start(node_ip_address, address, port, redis_password, redis_shard_ports,
          object_manager_port, node_manager_port, gcs_server_port,
          min_worker_port, max_worker_port, worker_port_list, memory,
          object_store_memory, redis_max_memory, num_cpus, num_gpus, resources,
          head, include_dashboard, dashboard_host, dashboard_port, block,
          plasma_directory, autoscaling_config, no_redirect_worker_output,
          no_redirect_output, plasma_store_socket_name, raylet_socket_name,
          temp_dir, java_worker_options, load_code_from_local,
          code_search_path, system_config, lru_evict,
          enable_object_reconstruction, metrics_export_port, log_style,
          log_color, verbose):
    """Start Ray processes manually on the local machine."""
    # NOTE(review): the docstring is kept to one line on purpose; if this
    # function is registered as a CLI command it is presumably shown as help
    # text - confirm before expanding it.
    #
    # Overview:
    #   * `head` selects between starting a head node (new Redis server) and
    #     joining an existing cluster at `address`.
    #   * `resources` is a JSON string; it is parsed into a dict below.
    #   * Most remaining parameters are forwarded unchanged to `RayParams`.
    #   * `block` keeps the command alive and monitors the subprocesses.
    # Raises ValueError / Exception / ConnectionError for invalid flag
    # combinations or when Ray is already running at the default address.
    cli_logger.configure(log_style, log_color, verbose)
    if gcs_server_port and not head:
        raise ValueError(
            "gcs_server_port can be only assigned when you specify --head.")

    # Convert hostnames to numerical IP address.
    if node_ip_address is not None:
        node_ip_address = services.address_to_ip(node_ip_address)

    redis_address = None
    if address is not None:
        (redis_address, redis_address_ip,
         redis_address_port) = services.validate_redis_address(address)
    try:
        resources = json.loads(resources)
    except Exception:
        cli_logger.error("`{}` is not a valid JSON string.",
                         cf.bold("--resources"))
        cli_logger.abort(
            "Valid values look like this: `{}`",
            cf.bold("--resources='{\"CustomResource3\": 1, "
                    "\"CustomResource2\": 2}'"))

        raise Exception("Unable to parse the --resources argument using "
                        "json.loads. Try using a format like\n\n"
                        "    --resources='{\"CustomResource1\": 3, "
                        "\"CustomResource2\": 2}'")

    # NOTE(review): a set `no_redirect_*` flag maps to True here and an
    # unset one to None - confirm this is the intended polarity.
    redirect_worker_output = None if not no_redirect_worker_output else True
    redirect_output = None if not no_redirect_output else True
    ray_params = ray.parameter.RayParams(
        node_ip_address=node_ip_address,
        min_worker_port=min_worker_port,
        max_worker_port=max_worker_port,
        worker_port_list=worker_port_list,
        object_manager_port=object_manager_port,
        node_manager_port=node_manager_port,
        gcs_server_port=gcs_server_port,
        memory=memory,
        object_store_memory=object_store_memory,
        redis_password=redis_password,
        redirect_worker_output=redirect_worker_output,
        redirect_output=redirect_output,
        num_cpus=num_cpus,
        num_gpus=num_gpus,
        resources=resources,
        plasma_directory=plasma_directory,
        huge_pages=False,
        plasma_store_socket_name=plasma_store_socket_name,
        raylet_socket_name=raylet_socket_name,
        temp_dir=temp_dir,
        include_dashboard=include_dashboard,
        dashboard_host=dashboard_host,
        dashboard_port=dashboard_port,
        java_worker_options=java_worker_options,
        load_code_from_local=load_code_from_local,
        code_search_path=code_search_path,
        _system_config=system_config,
        lru_evict=lru_evict,
        enable_object_reconstruction=enable_object_reconstruction,
        metrics_export_port=metrics_export_port)
    if head:
        # Use default if port is none, allocate an available port if port is 0
        if port is None:
            port = ray_constants.DEFAULT_PORT

        if port == 0:
            with socket() as s:
                s.bind(("", 0))
                port = s.getsockname()[1]

        num_redis_shards = None
        # Start Ray on the head node.
        if redis_shard_ports is not None:
            redis_shard_ports = redis_shard_ports.split(",")
            # Infer the number of Redis shards from the ports if the number is
            # not provided.
            num_redis_shards = len(redis_shard_ports)

        if redis_address is not None:
            cli_logger.abort(
                "`{}` starts a new Redis server, `{}` should not be set.",
                cf.bold("--head"), cf.bold("--address"))

            raise Exception("If --head is passed in, a Redis server will be "
                            "started, so a Redis address should not be "
                            "provided.")

        # Get the node IP address if one is not provided.
        ray_params.update_if_absent(
            node_ip_address=services.get_node_ip_address())
        cli_logger.labeled_value("Local node IP", ray_params.node_ip_address)
        ray_params.update_if_absent(
            redis_port=port,
            redis_shard_ports=redis_shard_ports,
            redis_max_memory=redis_max_memory,
            num_redis_shards=num_redis_shards,
            redis_max_clients=None,
            autoscaling_config=autoscaling_config,
        )

        # Fail early when starting a new cluster when one is already running.
        # Use the effective node IP (which honours an explicitly provided
        # node IP address) rather than the auto-detected one.
        if address is None:
            default_address = f"{ray_params.node_ip_address}:{port}"
            redis_addresses = services.find_redis_address(default_address)
            if len(redis_addresses) > 0:
                raise ConnectionError(
                    f"Ray is already running at {default_address}. "
                    f"Please specify a different port using the `--port`"
                    f" command to `ray start`.")

        node = ray.node.Node(
            ray_params, head=True, shutdown_at_exit=block, spawn_reaper=block)
        redis_address = node.redis_address

        # this is a noop if new-style is not set, so the old logger calls
        # are still in place
        cli_logger.newline()
        startup_msg = "Ray runtime started."
        cli_logger.success("-" * len(startup_msg))
        cli_logger.success(startup_msg)
        cli_logger.success("-" * len(startup_msg))
        cli_logger.newline()
        with cli_logger.group("Next steps"):
            cli_logger.print(
                "To connect to this Ray runtime from another node, run")
            cli_logger.print(
                cf.bold("  ray start --address='{}'{}"), redis_address,
                " --redis-password='******'"
                if redis_password else "")
            cli_logger.newline()
            cli_logger.print("Alternatively, use the following Python code:")
            with cli_logger.indented():
                with cf.with_style("monokai") as c:
                    cli_logger.print("{} ray", c.magenta("import"))
                    cli_logger.print(
                        "ray{}init(address{}{}{})", c.magenta("."),
                        c.magenta("="), c.yellow("'auto'"),
                        ", _redis_password{}{}".format(
                            c.magenta("="),
                            c.yellow("'" + redis_password + "'"))
                        if redis_password else "")
            cli_logger.newline()
            cli_logger.print(
                cf.underlined("If connection fails, check your "
                              "firewall settings and "
                              "network configuration."))
            cli_logger.newline()
            cli_logger.print("To terminate the Ray runtime, run")
            cli_logger.print(cf.bold("  ray stop"))
    else:
        # Start Ray on a non-head node.
        if port is not None:
            cli_logger.abort("`{}` should not be specified without `{}`.",
                             cf.bold("--port"), cf.bold("--head"))

            raise Exception("If --head is not passed in, --port is not "
                            "allowed.")
        if redis_shard_ports is not None:
            cli_logger.abort("`{}` should not be specified without `{}`.",
                             cf.bold("--redis-shard-ports"), cf.bold("--head"))

            raise Exception("If --head is not passed in, --redis-shard-ports "
                            "is not allowed.")
        if redis_address is None:
            cli_logger.abort("`{}` is required unless starting with `{}`.",
                             cf.bold("--address"), cf.bold("--head"))

            raise Exception("If --head is not passed in, --address must "
                            "be provided.")
        if include_dashboard:
            cli_logger.abort("`{}` should not be specified without `{}`.",
                             cf.bold("--include-dashboard"), cf.bold("--head"))

            raise ValueError(
                "If --head is not passed in, the --include-dashboard "
                "flag is not relevant.")

        # Wait for the Redis server to be started. And throw an exception if we
        # can't connect to it.
        services.wait_for_redis_to_start(
            redis_address_ip, redis_address_port, password=redis_password)

        # Create a Redis client.
        redis_client = services.create_redis_client(
            redis_address, password=redis_password)

        # Check that the version information on this node matches the version
        # information that the cluster was started with.
        services.check_version_info(redis_client)

        # Get the node IP address if one is not provided.
        ray_params.update_if_absent(
            node_ip_address=services.get_node_ip_address(redis_address))

        cli_logger.labeled_value("Local node IP", ray_params.node_ip_address)

        # Check that there aren't already Redis clients with the same IP
        # address connected with this Redis instance. This raises an exception
        # if the Redis server already has clients on this node.
        check_no_existing_redis_clients(ray_params.node_ip_address,
                                        redis_client)
        ray_params.update(redis_address=redis_address)
        node = ray.node.Node(
            ray_params, head=False, shutdown_at_exit=block, spawn_reaper=block)

        cli_logger.newline()
        startup_msg = "Ray runtime started."
        cli_logger.success("-" * len(startup_msg))
        cli_logger.success(startup_msg)
        cli_logger.success("-" * len(startup_msg))
        cli_logger.newline()
        cli_logger.print("To terminate the Ray runtime, run")
        cli_logger.print(cf.bold("  ray stop"))

    if block:
        cli_logger.newline()
        with cli_logger.group(cf.bold("--block")):
            cli_logger.print(
                "This command will now block until terminated by a signal.")
            cli_logger.print(
                "Running subprocesses are monitored and a message will be "
                "printed if any of them terminate unexpectedly.")

        # Poll once per second; exit with status 1 as soon as any monitored
        # subprocess has died.
        while True:
            time.sleep(1)
            deceased = node.dead_processes()
            if len(deceased) > 0:
                cli_logger.newline()
                cli_logger.error("Some Ray subprocesses exited unexpectedly:")

                with cli_logger.indented():
                    for process_type, process in deceased:
                        cli_logger.error(
                            "{}",
                            cf.bold(str(process_type)),
                            _tags={"exit code": str(process.returncode)})

                # shutdown_at_exit will handle cleanup.
                cli_logger.newline()
                cli_logger.error("Remaining processes will be killed.")
                sys.exit(1)
Ejemplo n.º 22
0
    root_logger.setLevel(logging.INFO)

    root_handler = logging.StreamHandler()
    root_handler.setLevel(logging.INFO)
    root_handler.setFormatter(logging.Formatter(ray_constants.LOGGER_FORMAT))

    root_logger.addHandler(root_handler)


if __name__ == "__main__":
    # Script entry point: configure logging, parse the optional Redis
    # password, and run the autoscaler monitor against the local head node.
    setup_logging()

    parser = argparse.ArgumentParser(description="Kuberay Autoscaler")
    parser.add_argument(
        "--redis-password",
        required=False,
        type=str,
        default=None,
        help="The password to use for Redis")
    args = parser.parse_args()

    # Read the cluster name through a context manager so the config file
    # handle is closed deterministically instead of being leaked.
    with open(AUTOSCALING_CONFIG_PATH) as config_file:
        cluster_name = yaml.safe_load(config_file)["cluster_name"]
    head_ip = get_node_ip_address()
    Monitor(
        address=f"{head_ip}:6379",
        redis_password=args.redis_password,
        autoscaling_config=AUTOSCALING_CONFIG_PATH,
        monitor_ip=head_ip,
    ).run()