Beispiel #1
0
    async def _start_backend_replica(self, backend_tag):
        """Launch one new replica actor for ``backend_tag`` and wire it up.

        The replica is first recorded in the backend table, then the actor
        is created, awaited until its ``ready`` call returns, added to the
        router, and finally registered with the metric monitor.

        Args:
            backend_tag: Tag of a backend that appears in
                ``self.backend_table.list_backends()``.

        Raises:
            AssertionError: If ``backend_tag`` is not in the backend table.
        """
        table = self.backend_table
        assert (backend_tag in table.list_backends()
                ), "Backend {} is not registered.".format(backend_tag)

        replica_tag = "{}#{}".format(backend_tag, get_random_letters(length=6))

        # Record the replica in the DB before anything is started.
        # TODO(edoakes): we should guarantee that if calls to the master
        # succeed, the cluster state has changed and if they fail, it hasn't.
        # Once we have master actor fault tolerance, this breaks that guarantee
        # because this method could fail after writing the replica to the DB.
        table.add_replica(backend_tag, replica_tag)

        # Gather everything needed to construct the actor from the table.
        creator = table.get_backend_creator(backend_tag)
        actor_cls = ray.remote(creator)
        config = BackendConfig(**table.get_info(backend_tag))
        user_init_args = table.get_init_args(backend_tag)
        creation_kwargs = config.get_actor_creation_args(
            [backend_tag, replica_tag, user_init_args])

        # Create the actor and remember its handle under the replica tag.
        replica_handle = actor_cls._remote(**creation_kwargs)
        self.tag_to_actor_handles[replica_tag] = replica_handle

        # Block this coroutine until the replica reports ready, then hand it
        # to the router.
        await replica_handle.ready.remote()
        router = self.get_router()[0]
        await router.add_new_worker.remote(backend_tag, replica_handle)

        # Metric monitor registration is issued without awaiting the result.
        monitor = self.get_metric_monitor()[0]
        monitor.add_target.remote(replica_handle)
Beispiel #2
0
    async def _start_backend_replica(self, backend_tag):
        """Create, set up, and register one new replica for ``backend_tag``.

        The replica handle is produced by the backend's creator, set up
        against the router, and its ``_ray_serve_fetch`` call is awaited.
        Only after that is the replica recorded in the backend table and
        handed to the metric monitor.

        Args:
            backend_tag: Tag of a backend listed by
                ``self.backend_table.list_backends()``.

        Raises:
            AssertionError: If ``backend_tag`` is not registered.
        """
        assert (backend_tag in self.backend_table.list_backends()
                ), "Backend {} is not registered.".format(backend_tag)

        replica_tag = "{}#{}".format(backend_tag, get_random_letters(length=6))

        # Fetch the info to start the replica from the backend table.
        creator = self.backend_table.get_backend_creator(backend_tag)
        backend_config_dict = self.backend_table.get_info(backend_tag)
        backend_config = BackendConfig(**backend_config_dict)
        init_args = self.backend_table.get_init_args(backend_tag)
        kwargs = backend_config.get_actor_creation_args(init_args)

        runner_handle = creator(kwargs)
        self.tag_to_actor_handles[replica_tag] = runner_handle

        # Set up the worker.
        await runner_handle._ray_serve_setup.remote(backend_tag,
                                                    self.get_router()[0],
                                                    runner_handle)
        # Await the result instead of calling ray.get(): this method is a
        # coroutine, and a synchronous ray.get() would stall the event loop
        # (and every other coroutine scheduled on it) until the call returns.
        await runner_handle._ray_serve_fetch.remote()

        # Register the worker in config tables and metric monitor.
        self.backend_table.add_replica(backend_tag, replica_tag)
        # The add_target call is issued without waiting for its result.
        self.get_metric_monitor()[0].add_target.remote(runner_handle)
Beispiel #3
0
def _start_replica(backend_tag):
    """Start one replica for ``backend_tag`` through the actor nursery.

    Reads the backend's creator, config, and init args from
    ``global_state.backend_table``, asks the nursery actor to create the
    runner, sets the runner up against the router, kicks off its
    ``_ray_serve_fetch`` call, and then records the replica in the backend
    table and the metric monitor.

    Args:
        backend_tag: Tag of a backend listed by
            ``global_state.backend_table.list_backends()``.

    Raises:
        AssertionError: If ``backend_tag`` is not registered.
    """
    table = global_state.backend_table
    assert (backend_tag in table.list_backends()
            ), "Backend {} is not registered.".format(backend_tag)

    replica_tag = "{}#{}".format(backend_tag, get_random_letters(length=6))

    # Collect what is needed to build the replica actor.
    creator = table.get_backend_creator(backend_tag)
    config = BackendConfig(**table.get_info(backend_tag))
    creator_init_args = table.get_init_args(backend_tag)
    creation_kwargs = config.get_actor_creation_args(creator_init_args)

    # Have the nursery create the runner; exactly one handle is expected back.
    start_ref = (
        global_state.actor_nursery_handle.start_actor_with_creator.remote(
            creator, creation_kwargs, replica_tag))
    (replica_handle,) = ray.get(start_ref)

    # Wait for setup to finish, then issue the fetch call without waiting.
    router = global_state.init_or_get_router()
    setup_ref = replica_handle._ray_serve_setup.remote(
        backend_tag, router, replica_handle)
    ray.get(setup_ref)
    replica_handle._ray_serve_fetch.remote()

    # Record the replica in the config table and the metric monitor.
    table.add_replica(backend_tag, replica_tag)
    monitor = global_state.init_or_get_metric_monitor()
    monitor.add_target.remote(replica_handle)
    async def _start_backend_worker(self, backend_tag, replica_tag):
        """Create the worker actor for a replica and wait until it is ready.

        The backend's creator, init args, and config must already be stored
        under ``backend_tag`` in ``self.backends``.

        Args:
            backend_tag: Key into ``self.backends`` for the backend to start.
            replica_tag: Tag for the new worker; also used as the actor name.

        Returns:
            The handle of the newly created worker actor.
        """
        logger.debug(
            "Starting worker '{}' for backend '{}'.".format(
                replica_tag, backend_tag))

        creator, creator_init_args, raw_config = self.backends[backend_tag]
        # TODO(edoakes): just store the BackendConfig in self.backends.
        creation_kwargs = BackendConfig(**raw_config).get_actor_creation_args()

        # Build the actor class, then apply the detached / named /
        # reconstruction options on top of the backend's creation kwargs.
        actor_cls = async_retryable(ray.remote(creator))
        configured_cls = actor_cls.options(
            detached=True,
            name=replica_tag,
            max_reconstructions=ray.ray_constants.INFINITE_RECONSTRUCTION,
            **creation_kwargs)
        handle = configured_cls.remote(
            backend_tag, replica_tag, creator_init_args)

        # TODO(edoakes): we should probably have a timeout here.
        await handle.ready.remote()
        return handle