Esempio n. 1
0
    def _get_or_start_routers(self, host, port):
        """Collect an HTTP proxy actor handle for every known node id.

        Each id reported by ``ray.state.node_ids()`` is mapped to a named
        actor. An existing actor with that name is reused; otherwise a new
        ``HTTPProxyActor`` is launched. Every handle, reused or new, is
        appended to ``self.routers``.

        Args:
            host: First constructor argument of a newly started proxy.
            port: Second constructor argument of a newly started proxy; also
                included in the startup log line.
        """
        # TODO(simon): We don't handle nodes being added/removed. To do that,
        # we should implement some sort of control loop in master actor.
        sorted_ids = sorted(ray.state.node_ids())
        for _, duplicates in groupby(sorted_ids):
            # Equal ids share a group; the position inside the group is part
            # of the actor name so that repeated ids stay distinguishable.
            for position, node_id in enumerate(duplicates):
                base_name = format_actor_name(SERVE_PROXY_NAME,
                                              self.instance_name)
                actor_name = base_name + "-{}-{}".format(node_id, position)
                try:
                    handle = ray.get_actor(actor_name)
                except ValueError:
                    # The lookup failed, so no actor carries this name yet.
                    logger.info(
                        "Starting HTTP proxy with name '{}' on node '{}' "
                        "listening on port {}".format(actor_name, node_id,
                                                      port))
                    proxy_cls = HTTPProxyActor.options(
                        name=actor_name,
                        max_concurrency=ASYNC_CONCURRENCY,
                        max_restarts=-1,
                        max_task_retries=-1,
                        # Request a sliver of the resource named by node_id.
                        resources={node_id: 0.01},
                    )
                    handle = proxy_cls.remote(
                        host, port, instance_name=self.instance_name)
                self.routers.append(handle)
Esempio n. 2
0
    async def _start_backend_worker(self, backend_tag: str,
                                    replica_tag: str) -> ActorHandle:
        """Launch one backend worker actor and wait until it reports ready.

        The backend must already have an entry in ``self.backends``; a
        missing entry surfaces as the ``KeyError`` of the dictionary lookup.

        Args:
            backend_tag: Key of the backend in ``self.backends``.
            replica_tag: Tag of the new worker; also used to build the actor
                name.

        Returns:
            The handle of the started worker actor.
        """
        logger.debug("Starting worker '{}' for backend '{}'.".format(
            replica_tag, backend_tag))
        info = self.backends[backend_tag]

        actor_name = format_actor_name(replica_tag, self.instance_name)
        actor_cls = ray.remote(info.worker_class).options(
            name=actor_name,
            lifetime="detached",
            max_restarts=-1,
            max_task_retries=-1,
            **info.replica_config.ray_actor_options)
        handle = actor_cls.remote(
            backend_tag,
            replica_tag,
            info.replica_config.actor_init_args,
            info.backend_config,
            instance_name=self.instance_name)

        # TODO(edoakes): we should probably have a timeout here.
        await handle.ready.remote()
        return handle
Esempio n. 3
0
 def get_proxy_names():
     """Return one proxy actor name per entry of ``get_all_node_ids()``."""
     return [
         format_actor_name(SERVE_PROXY_NAME, client._controller_name,
                           node_id)
         for node_id, _ in get_all_node_ids()
     ]
Esempio n. 4
0
    async def _start_backend_replica(self, config_store: ConfigurationStore,
                                     backend_tag: BackendTag,
                                     replica_tag: ReplicaTag) -> ActorHandle:
        """Return the handle of a replica actor, launching it when absent.

        A named-actor lookup is attempted first. Only when that lookup raises
        ``ValueError`` is the backend fetched from ``config_store`` and a new
        actor created.

        Args:
            config_store: Source of the backend description, queried through
                ``get_backend(backend_tag)``.
            backend_tag: Tag of the backend the replica belongs to.
            replica_tag: Tag of the replica; also used to build the actor
                name.

        Returns:
            The handle of the existing or freshly created replica actor.
        """
        actor_name = format_actor_name(replica_tag, self.controller_name)
        try:
            # NOTE(edoakes): the replicas may already be created if we
            # failed after creating them but before writing a
            # checkpoint.
            return ray.get_actor(actor_name)
        except ValueError:
            logger.debug("Starting replica '{}' for backend '{}'.".format(
                replica_tag, backend_tag))
            info = config_store.get_backend(backend_tag)

            actor_cls = ray.remote(info.worker_class).options(
                name=actor_name,
                lifetime="detached" if self.detached else None,
                max_restarts=-1,
                max_task_retries=-1,
                **info.replica_config.ray_actor_options)
            return actor_cls.remote(
                backend_tag, replica_tag,
                info.replica_config.actor_init_args,
                info.backend_config, self.controller_name)
Esempio n. 5
0
    def _start_routers_if_needed(self, http_host: str, http_port: str,
                                 http_middlewares: List[Any]) -> None:
        """Make sure ``self.routers_cache`` holds a router for every node.

        Nodes already present in the cache are skipped. For the others an
        existing named actor is reused, or a new ``HTTPProxyActor`` is
        launched, and the handle is stored under the node id.

        Args:
            http_host: Host passed to a newly started router.
            http_port: Port passed to a newly started router.
            http_middlewares: Passed to a newly started router as its
                ``http_middlewares`` keyword argument.
        """
        for node_id, node_resource in get_all_node_ids():
            if node_id in self.routers_cache:
                continue

            actor_name = format_actor_name(SERVE_PROXY_NAME,
                                           self.controller_name, node_id)
            try:
                handle = ray.get_actor(actor_name)
            except ValueError:
                # The lookup failed, so no actor carries this name yet.
                logger.info("Starting router with name '{}' on node '{}' "
                            "listening on '{}:{}'".format(
                                actor_name, node_id, http_host, http_port))
                router_cls = HTTPProxyActor.options(
                    name=actor_name,
                    lifetime="detached" if self.detached else None,
                    max_concurrency=ASYNC_CONCURRENCY,
                    max_restarts=-1,
                    max_task_retries=-1,
                    # Request a sliver of the node's own resource.
                    resources={node_resource: 0.01},
                )
                handle = router_cls.remote(
                    http_host,
                    http_port,
                    controller_name=self.controller_name,
                    http_middlewares=http_middlewares)

            self.routers_cache[node_id] = handle
Esempio n. 6
0
    async def _stop_pending_backend_replicas(self) -> None:
        """Kill every replica queued in ``self.backend_replicas_to_stop``.

        Replicas whose named actor can no longer be found are skipped. The
        queue is emptied once all entries have been processed.
        """
        for queued_tags in self.backend_replicas_to_stop.values():
            for replica_tag in queued_tags:
                actor_name = format_actor_name(replica_tag,
                                               self.controller_name)
                try:
                    handle = ray.get_actor(actor_name)
                except ValueError:
                    # NOTE(edoakes): the replicas may already be stopped if
                    # we failed after stopping them but before writing a
                    # checkpoint.
                    continue

                # TODO(edoakes): this logic isn't ideal because there may be
                # pending tasks still executing on the replica. However, if we
                # use replica.__ray_terminate__, we may send it while the
                # replica is being restarted and there's no way to tell if it
                # successfully killed the worker or not.
                ray.kill(handle, no_restart=True)

        self.backend_replicas_to_stop.clear()
Esempio n. 7
0
    def _get_or_start_http_proxy(self, node_id, host, port):
        """Bind ``self.http_proxy`` and ``self.router`` to the proxy actor.

        An existing named actor is reused; when the lookup raises
        ``ValueError`` a new ``HTTPProxyActor`` is launched instead.

        Args:
            node_id: Name of the resource the new proxy requests; also
                reported in the startup log line.
            host: First constructor argument of a newly started proxy.
            port: Second constructor argument of a newly started proxy.
        """
        actor_name = format_actor_name(SERVE_PROXY_NAME, self.instance_name)
        try:
            proxy = ray.get_actor(actor_name)
        except ValueError:
            logger.info(
                "Starting HTTP proxy with name '{}' on node '{}'".format(
                    actor_name, node_id))
            proxy_cls = HTTPProxyActor.options(
                name=actor_name,
                max_concurrency=ASYNC_CONCURRENCY,
                max_restarts=-1,
                max_task_retries=-1,
                resources={node_id: 0.01},
            )
            proxy = proxy_cls.remote(
                host, port, instance_name=self.instance_name)

        # The router is merged into the HTTP proxy actor, so both attributes
        # refer to the same handle. Two names are kept because they read
        # better at the call sites (e.g. http_proxy.set_route_table versus
        # router.add_worker).
        self.http_proxy = proxy
        self.router = proxy
Esempio n. 8
0
    async def _enqueue_pending_scale_changes_loop(self,
                                                  current_state: SystemState):
        """Kick off all queued replica starts and stops.

        Every replica in ``self.backend_replicas_to_start`` is started and
        the future of its ``ready`` call is recorded in
        ``self.currently_starting_replicas``. Every replica in
        ``self.backend_replicas_to_stop`` gets a kill task whose future is
        recorded in ``self.currently_stopping_replicas``.

        Args:
            current_state: Passed through to ``_start_backend_replica``.
        """

        async def kill_actor(replica_name_to_use):
            # NOTE: the replicas may already be stopped if we failed
            # after stopping them but before writing a checkpoint.
            try:
                target = ray.get_actor(replica_name_to_use)
            except ValueError:
                return

            # TODO(edoakes): this logic isn't ideal because there may
            # be pending tasks still executing on the replica. However,
            # if we use replica.__ray_terminate__, we may send it while
            # the replica is being restarted and there's no way to tell
            # if it successfully killed the worker or not.
            ray.kill(target, no_restart=True)

        start_queue = self.backend_replicas_to_start
        for backend_tag, tags_to_start in start_queue.items():
            for replica_tag in tags_to_start:
                handle = await self._start_backend_replica(
                    current_state, backend_tag, replica_tag)
                ready_future = handle.ready.remote().as_future()
                self.currently_starting_replicas[ready_future] = (
                    backend_tag, replica_tag, handle)

        stop_queue = self.backend_replicas_to_stop
        for backend_tag, tags_to_stop in stop_queue.items():
            for replica_tag in tags_to_stop:
                actor_name = format_actor_name(replica_tag,
                                               self.controller_name)
                kill_future = asyncio.ensure_future(kill_actor(actor_name))
                self.currently_stopping_replicas[kill_future] = (
                    backend_tag, replica_tag)
Esempio n. 9
0
    def _start_proxies_if_needed(self) -> None:
        """Make sure ``self._proxy_actors`` holds a proxy for every target node.

        Nodes already present in ``self._proxy_actors`` are skipped. For the
        others an existing named actor in ``self._controller_namespace`` is
        reused, or a new ``HTTPProxyActor`` is launched with the settings in
        ``self._config``.
        """
        for node_id, node_resource in self._get_target_nodes():
            if node_id in self._proxy_actors:
                continue

            actor_name = format_actor_name(SERVE_PROXY_NAME,
                                           self._controller_name, node_id)
            try:
                handle = ray.get_actor(
                    actor_name, namespace=self._controller_namespace)
            except ValueError:
                # The lookup failed, so no actor carries this name yet.
                logger.info("Starting HTTP proxy with name '{}' on node '{}' "
                            "listening on '{}:{}'".format(
                                actor_name, node_id, self._config.host,
                                self._config.port))
                proxy_cls = HTTPProxyActor.options(
                    num_cpus=self._config.num_cpus,
                    name=actor_name,
                    lifetime="detached" if self._detached else None,
                    max_concurrency=ASYNC_CONCURRENCY,
                    max_restarts=-1,
                    max_task_retries=-1,
                    # Request a sliver of the node's own resource.
                    resources={node_resource: 0.01},
                )
                handle = proxy_cls.remote(
                    self._config.host,
                    self._config.port,
                    controller_name=self._controller_name,
                    controller_namespace=self._controller_namespace,
                    http_middlewares=self._config.middlewares)

            self._proxy_actors[node_id] = handle
Esempio n. 10
0
    def _stop_pending_replicas(self):
        """Schedule a graceful shutdown for every replica queued to stop.

        For each ``(replica_tag, shutdown_timeout)`` pair stored in
        ``self.backend_replicas_to_stop`` a task is scheduled that waits up
        to ``shutdown_timeout`` seconds for the replica to drain its pending
        queries and then kills it. The task's future is recorded in
        ``self.currently_stopping_replicas`` together with the backend and
        replica tags.
        """

        async def kill_actor(replica_name_to_use, shutdown_timeout):
            # Both values arrive as arguments instead of being read from the
            # enclosing loop: the coroutine body only runs after this
            # synchronous method has finished iterating, so a closure over
            # the loop variable would give every replica the timeout of the
            # last one.

            # NOTE: the replicas may already be stopped if we failed
            # after stopping them but before writing a checkpoint.
            try:
                replica = ray.get_actor(replica_name_to_use)
            except ValueError:
                return

            try:
                await asyncio.wait_for(
                    replica.drain_pending_queries.remote(),
                    timeout=shutdown_timeout)
            except asyncio.TimeoutError:
                # Graceful period passed, kill it forcefully.
                logger.debug(
                    f"{replica_name_to_use} did not shutdown after "
                    f"{shutdown_timeout}s, killing.")
            finally:
                # The replica is killed whether or not draining succeeded.
                ray.kill(replica, no_restart=True)

        for backend_tag, replicas_to_stop in (
                self.backend_replicas_to_stop.items()):
            for replica_tag, shutdown_timeout in replicas_to_stop:
                replica_name = format_actor_name(replica_tag,
                                                 self._controller_name)
                stop_future = asyncio.ensure_future(
                    kill_actor(replica_name, shutdown_timeout))
                self.currently_stopping_replicas[stop_future] = (
                    backend_tag, replica_tag)
Esempio n. 11
0
    def _start_routers_if_needed(self):
        """Make sure ``self.routers`` holds a router for every node.

        Nodes already present in ``self.routers`` are skipped. For the others
        an existing named actor is reused, or a new ``HTTPProxyActor`` is
        launched with ``self.http_host``, ``self.http_port`` and
        ``self._http_middlewares``.
        """
        for node_id, node_resource in get_all_node_ids():
            if node_id in self.routers:
                continue

            actor_name = format_actor_name(SERVE_PROXY_NAME,
                                           self.instance_name, node_id)
            try:
                handle = ray.get_actor(actor_name)
            except ValueError:
                # The lookup failed, so no actor carries this name yet.
                logger.info("Starting router with name '{}' on node '{}' "
                            "listening on '{}:{}'".format(
                                actor_name, node_id, self.http_host,
                                self.http_port))
                router_cls = HTTPProxyActor.options(
                    name=actor_name,
                    max_concurrency=ASYNC_CONCURRENCY,
                    max_restarts=-1,
                    max_task_retries=-1,
                    # Request a sliver of the node's own resource.
                    resources={node_resource: 0.01},
                )
                handle = router_cls.remote(
                    node_id,
                    self.http_host,
                    self.http_port,
                    instance_name=self.instance_name,
                    _http_middlewares=self._http_middlewares)

            self.routers[node_id] = handle
Esempio n. 12
0
def init(name=None,
         http_host=DEFAULT_HTTP_HOST,
         http_port=DEFAULT_HTTP_PORT,
         metric_exporter=InMemoryExporter):
    """Connect to the serve master actor, creating it when it is missing.

    When a master actor with the derived name already exists, the global
    ``master_actor`` is bound to it and the function returns immediately.
    Otherwise the custom serializers are registered, a new ``ServeMaster``
    actor is started, and the call blocks until the HTTP routes endpoint
    answers.

    ``ray.init()`` is called without arguments when Ray is not initialized
    in this process yet. To pass kwargs to ``ray.init``, call it separately
    before calling ``serve.init``.

    Args:
        name (str): Name of this serve instance, used to derive the master
            actor name. Must be a string or None.
        http_host (str): Host for the HTTP server. Defaults to
            ``DEFAULT_HTTP_HOST``.
        http_port (int): Port for the HTTP server. Defaults to
            ``DEFAULT_HTTP_PORT``.
        metric_exporter(ExporterInterface): Exporter class handed to the
            master actor. Defaults to ``InMemoryExporter``.

    Raises:
        TypeError: If ``name`` is neither None nor a string.
    """
    global master_actor

    if not (name is None or isinstance(name, str)):
        raise TypeError("name must be a string.")

    # Initialize ray if needed.
    if not ray.is_initialized():
        ray.init()

    # Reuse the serve master actor when it already exists.
    master_actor_name = format_actor_name(SERVE_MASTER_NAME, name)
    try:
        master_actor = ray.get_actor(master_actor_name)
    except ValueError:
        pass
    else:
        return

    # Register serialization context once
    for serializable in (Query, RequestMetadata):
        ray.register_custom_serializer(serializable,
                                       serializable.ray_serialize,
                                       serializable.ray_deserialize)

    # TODO(edoakes): for now, always start the HTTP proxy on the node that
    # serve.init() was run on. We should consider making this configurable
    # in the future.
    http_node_id = ray.state.current_node_id()
    master_cls = ServeMaster.options(
        name=master_actor_name,
        max_restarts=-1,
        max_task_retries=-1,
    )
    master_actor = master_cls.remote(name, http_node_id, http_host, http_port,
                                     metric_exporter)

    routes_url = "http://{}:{}/-/routes".format(http_host, http_port)
    block_until_http_ready(routes_url, timeout=HTTP_PROXY_TIMEOUT)
Esempio n. 13
0
    def _recover_actor_handles(self) -> None:
        """Re-fetch the actor handles of all cached routers and replicas.

        Each entry of ``self.routers_cache`` and ``self.backend_replicas``
        is overwritten with the handle returned by a named-actor lookup.
        """
        # Refresh the router handles first.
        for node_id in self.routers_cache:
            actor_name = format_actor_name(SERVE_PROXY_NAME,
                                           self.controller_name, node_id)
            self.routers_cache[node_id] = ray.get_actor(actor_name)

        # Then refresh the handles of every backend replica. The replicas are
        # guaranteed to exist already: they are only written to a checkpoint
        # in self.backend_replicas after they have been created.
        for backend_tag, tagged_replicas in self.backend_replicas.items():
            for replica_tag in tagged_replicas:
                actor_name = format_actor_name(replica_tag,
                                               self.controller_name)
                handle = ray.get_actor(actor_name)
                self.backend_replicas[backend_tag][replica_tag] = handle
Esempio n. 14
0
File: api.py Progetto: euhanna/ray
def init(name=None,
         http_host=DEFAULT_HTTP_HOST,
         http_port=DEFAULT_HTTP_PORT,
         metric_exporter=InMemoryExporter,
         _http_middlewares=None):
    """Initialize or connect to a serve cluster.

    If a serve controller with the derived name already exists, the global
    ``controller`` is bound to it and this function just returns.

    If `ray.init` has not been called in this process, it will be called with
    no arguments. To specify kwargs to `ray.init`, it should be called
    separately before calling `serve.init`.

    Args:
        name (str): A unique name for this serve instance. This allows
            multiple serve instances to run on the same ray cluster. Must be
            specified in all subsequent serve.init() calls.
        http_host (str): Host for HTTP servers. Defaults to
            ``DEFAULT_HTTP_HOST``.
        http_port (int, List[int]): Port for HTTP server. Defaults to
            ``DEFAULT_HTTP_PORT``.
        metric_exporter(ExporterInterface): Exporter class handed to the
            controller. Defaults to ``InMemoryExporter``.
        _http_middlewares (list): Middlewares handed to the controller.
            ``None`` (the default) is treated as an empty list.

    Raises:
        TypeError: If ``name`` is neither None nor a string.
    """
    if name is not None and not isinstance(name, str):
        raise TypeError("name must be a string.")

    # A fresh list per call: a list literal as the default value would be
    # shared between all calls of this function.
    if _http_middlewares is None:
        _http_middlewares = []

    # Initialize ray if needed.
    if not ray.is_initialized():
        ray.init()

    # Try to get serve controller if it exists
    global controller
    controller_name = format_actor_name(SERVE_CONTROLLER_NAME, name)
    try:
        controller = ray.get_actor(controller_name)
        return
    except ValueError:
        pass

    controller = ServeController.options(
        name=controller_name,
        max_restarts=-1,
        max_task_retries=-1,
    ).remote(name, http_host, http_port, metric_exporter, _http_middlewares)

    # Run one readiness probe per node id (each task requests a sliver of
    # the resource named by that id) and wait for all of them.
    futures = []
    for node_id in ray.state.node_ids():
        future = block_until_http_ready.options(
            num_cpus=0, resources={
                node_id: 0.01
            }).remote(
                "http://{}:{}/-/routes".format(http_host, http_port),
                timeout=HTTP_PROXY_TIMEOUT)
        futures.append(future)
    ray.get(futures)
Esempio n. 15
0
 def get_replica_actors(self, backend_tag: BackendTag) -> List[ActorHandle]:
     """Return the handles of the backend's replicas that can be found.

     Replica tags are read from ``self.replicas``; an unknown backend yields
     an empty list. Replicas whose named-actor lookup raises ``ValueError``
     are left out of the result.
     """
     handles = []
     known_tags = self.replicas.get(backend_tag, [])
     for tag in known_tags:
         try:
             actor_name = format_actor_name(tag, self.controller_name)
             handle = ray.get_actor(actor_name)
         except ValueError:
             continue
         handles.append(handle)
     return handles
Esempio n. 16
0
 def check_dead():
     """Return True when neither the controller nor the proxy can be found."""
     base_names = (constants.SERVE_CONTROLLER_NAME,
                   constants.SERVE_PROXY_NAME)
     for base_name in base_names:
         try:
             ray.get_actor(format_actor_name(base_name, instance_name))
         except ValueError:
             # This actor is gone; check the next one.
             continue
         # The lookup succeeded, so at least one actor is still around.
         return False
     return True
Esempio n. 17
0
def test_shutdown(ray_shutdown):
    """Check that the serve actors exist after start and vanish on shutdown."""
    ray.init(num_cpus=16)
    serve.start(http_options=dict(port=8003))

    @serve.deployment
    def f():
        pass

    f.deploy()

    serve_controller_name = serve.context._global_client._controller_name
    proxy_name = format_actor_name(
        SERVE_PROXY_NAME,
        serve.context._global_client._controller_name,
        get_all_node_ids()[0][0],
    )
    actor_names = [serve_controller_name, proxy_name]

    def lookup(actor_name):
        # The controller is looked up in the current runtime namespace; any
        # other actor is looked up without an explicit namespace.
        if actor_name == serve_controller_name:
            return ray.get_actor(
                actor_name, namespace=ray.get_runtime_context().namespace)
        return ray.get_actor(actor_name)

    def check_alive():
        # Every actor is probed, even after an earlier lookup has failed.
        missing = 0
        for actor_name in actor_names:
            try:
                lookup(actor_name)
            except ValueError:
                missing += 1
        return missing == 0

    wait_for_condition(check_alive)

    serve.shutdown()
    with pytest.raises(RayServeException):
        serve.list_deployments()

    def check_dead():
        # The first actor that can still be found means "not dead yet".
        for actor_name in actor_names:
            try:
                lookup(actor_name)
            except ValueError:
                continue
            return False
        return True

    wait_for_condition(check_dead)
Esempio n. 18
0
 def __init__(self, controller_name: str, detached: bool,
              replica_tag: ReplicaTag, backend_tag: BackendTag):
     """Store the replica's identity and reset its runtime bookkeeping.

     Args:
         controller_name: Stored as-is and used to build the actor name.
         detached: Stored as-is.
         replica_tag: Stored as-is and used to build the actor name.
         backend_tag: Stored as-is.
     """
     # Values supplied by the caller.
     self._controller_name = controller_name
     self._detached = detached
     self._replica_tag = replica_tag
     self._backend_tag = backend_tag
     # Actor name derived from the replica tag and the controller name.
     self._actor_name = format_actor_name(replica_tag, controller_name)

     # No handle or object refs exist yet.
     self._actor_handle = self._startup_obj_ref = self._drain_obj_ref = None
     self._state = ReplicaState.SHOULD_START
Esempio n. 19
0
 def _recover_actor_handles(self) -> None:
     """Re-fetch the actor handle of every entry in ``self.backend_replicas``.

     The replicas are guaranteed to exist already: they are only written to
     a checkpoint in ``self.backend_replicas`` after they have been created.
     """
     for backend_tag, tagged_replicas in self.backend_replicas.items():
         for replica_tag in tagged_replicas:
             actor_name = format_actor_name(replica_tag,
                                            self.controller_name)
             handle = ray.get_actor(actor_name)
             self.backend_replicas[backend_tag][replica_tag] = handle
Esempio n. 20
0
 def check_dead():
     """Return True when neither the controller nor the proxy can be found."""
     candidates = (
         client._controller_name,
         format_actor_name(SERVE_PROXY_NAME, client._controller_name),
     )
     for candidate in candidates:
         try:
             ray.get_actor(candidate)
         except ValueError:
             # This actor is gone; check the next one.
             continue
         # The lookup succeeded, so at least one actor is still around.
         return False
     return True
Esempio n. 21
0
    def __init__(self, controller_name: str, detached: bool,
                 kv_store: RayInternalKVStore, long_poll_host: LongPollHost,
                 goal_manager: AsyncGoalManager):
        """Set up the backend state, restoring it from a checkpoint if any.

        When ``kv_store`` holds a value under ``CHECKPOINT_KEY``, the
        checkpointed attributes are restored from it, pending goals are
        re-created in ``goal_manager`` and the replica actor handles are
        re-fetched by name. Finally the backend configs and replica handles
        are broadcast through the two ``_notify_*`` methods.

        Args:
            controller_name: Stored as-is; used to build replica actor
                names.
            detached: Stored as-is.
            kv_store: Store queried for the checkpoint.
            long_poll_host: Stored as-is.
            goal_manager: Receives ``create_goal`` for every pending goal id
                found in the checkpoint.
        """
        self._controller_name = controller_name
        self._detached = detached
        self._kv_store = kv_store
        self._long_poll_host = long_poll_host
        self._goal_manager = goal_manager

        # Non-checkpointed state.
        self.currently_starting_replicas: Dict[asyncio.Future,
                                               Tuple[BackendTag, ReplicaTag,
                                                     ActorHandle]] = dict()
        self.currently_stopping_replicas: Dict[asyncio.Future,
                                               Tuple[BackendTag,
                                                     ReplicaTag]] = dict()

        # Checkpointed state.
        self.backends: Dict[BackendTag, BackendInfo] = dict()
        self.backend_replicas: Dict[BackendTag,
                                    Dict[ReplicaTag,
                                         ActorHandle]] = defaultdict(dict)
        self.backend_goals: Dict[BackendTag, GoalId] = dict()
        self.backend_replicas_to_start: Dict[
            BackendTag, List[ReplicaTag]] = defaultdict(list)
        self.backend_replicas_to_stop: Dict[BackendTag, List[Tuple[
            ReplicaTag, Duration]]] = defaultdict(list)
        self.backends_to_remove: List[BackendTag] = list()

        checkpoint = self._kv_store.get(CHECKPOINT_KEY)
        if checkpoint is not None:
            # NOTE(review): pickle.loads is only safe on trusted data; this
            # assumes the KV store is written by the controller itself.
            # The sixth field is restored into backends_to_remove, the
            # attribute initialized above.
            (self.backends, self.backend_replicas, self.backend_goals,
             self.backend_replicas_to_start, self.backend_replicas_to_stop,
             self.backends_to_remove,
             pending_goal_ids) = pickle.loads(checkpoint)

            for goal_id in pending_goal_ids:
                self._goal_manager.create_goal(goal_id)

            # Fetch actor handles for all backend replicas in the system.
            # All of these backend_replicas are guaranteed to already exist
            # because they would not be written to a checkpoint in
            # self.backend_replicas until they were created.
            for backend_tag, replica_dict in self.backend_replicas.items():
                for replica_tag in replica_dict.keys():
                    replica_name = format_actor_name(replica_tag,
                                                     self._controller_name)
                    self.backend_replicas[backend_tag][
                        replica_tag] = ray.get_actor(replica_name)

        self._notify_backend_configs_changed()
        self._notify_replica_handles_changed()
Esempio n. 22
0
 def __init__(self, controller_name: str, detached: bool,
              replica_tag: ReplicaTag, backend_tag: BackendTag,
              version: str):
     """Create the actor wrapper and reset the replica's bookkeeping.

     Args:
         controller_name: Stored as-is; also passed to the actor wrapper
             and used to build the actor name.
         detached: Passed to the actor wrapper.
         replica_tag: Stored as-is; also passed to the actor wrapper and
             used to build the actor name.
         backend_tag: Stored as-is; also passed to the actor wrapper.
         version: Stored as-is.
     """
     actor_name = format_actor_name(replica_tag, controller_name)
     self._actor = ActorReplicaWrapper(actor_name, detached,
                                       controller_name, replica_tag,
                                       backend_tag)

     # Values supplied by the caller.
     self._controller_name = controller_name
     self._replica_tag = replica_tag
     self._backend_tag = backend_tag
     self._version = version

     # No start has been recorded yet.
     self._start_time = self._prev_slow_startup_warning_time = None
     self._state = ReplicaState.SHOULD_START
Esempio n. 23
0
    async def _start_backend_replicas(self, config_store: ConfigurationStore,
                                      backend_tag: BackendTag,
                                      replica_tag: ReplicaTag) -> None:
        """Record a replica's handle, starting the replica when absent.

        The handle comes from a named-actor lookup when that succeeds, and
        from ``_start_single_replica`` otherwise. It is stored in
        ``self.backend_replicas[backend_tag][replica_tag]``.

        Args:
            config_store: Passed through to ``_start_single_replica``.
            backend_tag: Tag of the backend the replica belongs to.
            replica_tag: Tag of the replica; also used to build the actor
                name.
        """
        actor_name = format_actor_name(replica_tag, self.controller_name)
        try:
            # NOTE(edoakes): the replicas may already be created if we
            # failed after creating them but before writing a
            # checkpoint.
            handle = ray.get_actor(actor_name)
        except ValueError:
            handle = await self._start_single_replica(
                config_store, backend_tag, replica_tag, actor_name)

        self.backend_replicas[backend_tag][replica_tag] = handle
Esempio n. 24
0
    def _get_or_start_metric_exporter(self, metric_exporter_class):
        """Bind ``self.metric_exporter`` to the metric exporter actor.

        An existing named actor is reused; when the lookup raises
        ``ValueError`` a new ``MetricExporterActor`` is launched instead.

        Args:
            metric_exporter_class: Constructor argument of a newly started
                exporter actor.
        """
        actor_name = format_actor_name(SERVE_METRIC_SINK_NAME,
                                       self.instance_name)
        try:
            exporter = ray.get_actor(actor_name)
        except ValueError:
            logger.info("Starting metric exporter with name '{}'".format(
                actor_name))
            exporter_cls = MetricExporterActor.options(name=actor_name)
            exporter = exporter_cls.remote(metric_exporter_class)
        self.metric_exporter = exporter
Esempio n. 25
0
 def __init__(self, controller_name: str, detached: bool,
              replica_tag: ReplicaTag, backend_tag: BackendTag):
     """Store the replica's identity and reset its runtime bookkeeping.

     Args:
         controller_name: Stored as-is and used to build the actor name.
         detached: Stored as-is.
         replica_tag: Stored as-is and used to build the actor name.
         backend_tag: Stored as-is.
     """
     # Names derived from the replica tag and the controller name.
     actor_name = format_actor_name(replica_tag, controller_name)
     self._actor_name = actor_name
     self._placement_group_name = actor_name + "_placement_group"

     # Values supplied by the caller.
     self._controller_name = controller_name
     self._detached = detached
     self._replica_tag = replica_tag
     self._backend_tag = backend_tag

     # No actor, placement group, timing data or object refs exist yet.
     self._actor_handle = self._placement_group = None
     self._start_time = self._prev_slow_startup_warning_time = None
     self._startup_obj_ref = self._drain_obj_ref = None
     self._state = ReplicaState.SHOULD_START
Esempio n. 26
0
    def _get_or_start_router(self):
        """Bind ``self.router`` to the router actor of this serve cluster.

        An existing named actor is reused; when the lookup raises
        ``ValueError`` a new ``Router`` actor is launched instead.
        """
        actor_name = format_actor_name(SERVE_ROUTER_NAME, self.cluster_name)
        try:
            router = ray.get_actor(actor_name)
        except ValueError:
            logger.info("Starting router with name '{}'".format(actor_name))
            router_cls = async_retryable(ray.remote(Router)).options(
                name=actor_name,
                max_concurrency=ASYNC_CONCURRENCY,
                max_restarts=-1,
            )
            router = router_cls.remote(cluster_name=self.cluster_name)
        self.router = router
Esempio n. 27
0
    def __init__(self,
                 controller_name: str,
                 detached: bool,
                 checkpoint: bytes = None):
        """Set up the backend state, restoring it from a checkpoint if given.

        Args:
            controller_name: Stored as-is; used to build replica actor
                names.
            detached: Stored as-is.
            checkpoint: Pickled tuple of the checkpointed attributes, or
                None to start with empty state.
        """
        self.controller_name = controller_name
        self.detached = detached

        # Non-checkpointed state.
        self.currently_starting_replicas: Dict[asyncio.Future,
                                               Tuple[BackendTag, ReplicaTag,
                                                     ActorHandle]] = dict()
        self.currently_stopping_replicas: Dict[asyncio.Future,
                                               Tuple[BackendTag,
                                                     ReplicaTag]] = dict()

        # Checkpointed state.
        self.backends: Dict[BackendTag, BackendInfo] = dict()
        self.backend_replicas: Dict[BackendTag,
                                    Dict[ReplicaTag,
                                         ActorHandle]] = defaultdict(dict)
        self.goals: Dict[BackendTag, GoalId] = dict()
        self.backend_replicas_to_start: Dict[
            BackendTag, List[ReplicaTag]] = defaultdict(list)
        self.backend_replicas_to_stop: Dict[BackendTag, List[Tuple[
            ReplicaTag, Duration]]] = defaultdict(list)
        self.backends_to_remove: List[BackendTag] = list()

        if checkpoint is not None:
            # NOTE(review): pickle.loads is only safe on trusted data; this
            # assumes the checkpoint was produced by the controller itself.
            # The sixth field is restored into backends_to_remove, the
            # attribute initialized above.
            (self.backends, self.backend_replicas, self.goals,
             self.backend_replicas_to_start, self.backend_replicas_to_stop,
             self.backends_to_remove) = pickle.loads(checkpoint)

        # Fetch actor handles for all of the backend replicas in the system.
        # All of these backend_replicas are guaranteed to already exist because
        # they would not be written to a checkpoint in self.backend_replicas
        # until they were created.
        for backend_tag, replica_dict in self.backend_replicas.items():
            for replica_tag in replica_dict.keys():
                replica_name = format_actor_name(replica_tag,
                                                 self.controller_name)
                self.backend_replicas[backend_tag][
                    replica_tag] = ray.get_actor(replica_name)
Esempio n. 28
0
def test_shutdown(ray_shutdown):
    """Check that the serve actors exist after start and vanish on shutdown."""
    ray.init(num_cpus=16)
    serve.start(http_port=8003)

    @serve.deployment
    def f():
        pass

    f.deploy()

    controller_name = serve.api._global_client._controller_name
    proxy_name = format_actor_name(SERVE_PROXY_NAME,
                                   serve.api._global_client._controller_name,
                                   get_all_node_ids()[0][0])
    actor_names = [controller_name, proxy_name]

    def check_alive():
        # Every actor is probed, even after an earlier lookup has failed.
        missing = 0
        for actor_name in actor_names:
            try:
                ray.get_actor(actor_name)
            except ValueError:
                missing += 1
        return missing == 0

    wait_for_condition(check_alive)

    serve.shutdown()
    with pytest.raises(RayServeException):
        serve.list_backends()

    def check_dead():
        # The first actor that can still be found means "not dead yet".
        for actor_name in actor_names:
            try:
                ray.get_actor(actor_name)
            except ValueError:
                continue
            return False
        return True

    wait_for_condition(check_dead)
Esempio n. 29
0
    async def _start_replica(self, backend_tag: str, replica_tag: str) -> None:
        """Record a replica, starting it when absent, and tell all routers.

        The worker handle comes from a named-actor lookup when that
        succeeds, and from ``_start_backend_worker`` otherwise. The replica
        is then added to ``self.replicas`` and ``self.workers`` and
        announced to every router in ``self.routers``.

        Args:
            backend_tag: Tag of the backend the replica belongs to.
            replica_tag: Tag of the replica; also used to build the actor
                name.
        """
        actor_name = format_actor_name(replica_tag, self.controller_name)
        try:
            # NOTE(edoakes): the replicas may already be created if we
            # failed after creating them but before writing a
            # checkpoint.
            handle = ray.get_actor(actor_name)
        except ValueError:
            handle = await self._start_backend_worker(
                backend_tag, replica_tag, actor_name)

        self.replicas[backend_tag].append(replica_tag)
        self.workers[backend_tag][replica_tag] = handle

        # Register the worker with every router and wait for all of them.
        registrations = [
            router.add_new_worker.remote(backend_tag, replica_tag, handle)
            for router in self.routers.values()
        ]
        await asyncio.gather(*registrations)
Esempio n. 30
0
    def _get_or_start_http_proxy(self, node_id, host, port):
        """Bind ``self.http_proxy`` to the HTTP proxy actor of this cluster.

        An existing named actor is reused; when the lookup raises
        ``ValueError`` a new ``HTTPProxyActor`` is launched instead.

        Args:
            node_id: Name of the resource the new proxy requests; also
                reported in the startup log line.
            host: First constructor argument of a newly started proxy.
            port: Second constructor argument of a newly started proxy.
        """
        actor_name = format_actor_name(SERVE_PROXY_NAME, self.cluster_name)
        try:
            proxy = ray.get_actor(actor_name)
        except ValueError:
            logger.info(
                "Starting HTTP proxy with name '{}' on node '{}'".format(
                    actor_name, node_id))
            proxy_cls = async_retryable(HTTPProxyActor).options(
                name=actor_name,
                max_concurrency=ASYNC_CONCURRENCY,
                max_restarts=-1,
                resources={node_id: 0.01},
            )
            proxy = proxy_cls.remote(
                host, port, cluster_name=self.cluster_name)
        self.http_proxy = proxy