Ejemplo n.º 1
0
    async def __init__(
        self,
        controller_name: str,
        http_config: HTTPOptions,
        checkpoint_path: str,
        detached: bool = False,
    ):
        """Build the controller's state and schedule its control loop.

        Args:
            controller_name: Stored on the instance, used as the prefix of
                the KV-store namespace, and handed to ``HTTPState`` and
                ``DeploymentStateManager``.
            http_config: Passed through to ``HTTPState``.
            checkpoint_path: Stored on the instance and given to
                ``make_kv_store`` to create the checkpoint store.
            detached: Passed through to ``HTTPState`` and
                ``DeploymentStateManager``.
        """
        configure_component_logger(
            component_name="controller", component_id=str(os.getpid()))

        # Checkpoint storage: both stores share one namespace derived from
        # the controller name and the namespace of the current Ray runtime
        # context.
        self.ray_worker_namespace = ray.get_runtime_context().namespace
        self.controller_name = controller_name
        self.checkpoint_path = checkpoint_path
        store_namespace = f"{self.controller_name}-{self.ray_worker_namespace}"
        self.kv_store = make_kv_store(
            checkpoint_path, namespace=store_namespace)
        self.snapshot_store = RayInternalKVStore(namespace=store_namespace)

        # deployment_name -> proxy_name -> queue length.
        self.deployment_stats = defaultdict(lambda: defaultdict(dict))

        # Serializes state-changing operations so that only one runs at a
        # time.
        self.write_lock = asyncio.Lock()

        self.long_poll_host = LongPollHost()

        self.http_state = HTTPState(controller_name, detached, http_config)
        self.endpoint_state = EndpointState(self.kv_store, self.long_poll_host)

        # For controller failure recovery, the named actors currently alive
        # in the cluster serve as the source of replica state. Only actors
        # in the Serve namespace are kept.
        serve_actor_names = []
        for actor_info in ray.util.list_named_actors(all_namespaces=True):
            if actor_info["namespace"] == SERVE_NAMESPACE:
                serve_actor_names.append(actor_info["name"])

        self.deployment_state_manager = DeploymentStateManager(
            controller_name,
            detached,
            self.kv_store,
            self.long_poll_host,
            serve_actor_names,
        )

        # Ray task reference for the most recent deployment request.
        self.config_deployment_request_ref: ObjectRef = None

        # Unix timestamp of the latest config deployment request; 0 until
        # one is recorded.
        self.deployment_timestamp = 0

        # The control loop is only scheduled here; the checkpoint recovery
        # call below is issued before this coroutine yields to the loop.
        event_loop = asyncio.get_event_loop()
        event_loop.create_task(self.run_control_loop())

        self._recover_config_from_checkpoint()
Ejemplo n.º 2
0
    async def __init__(
        self,
        controller_name: str,
        http_config: HTTPOptions,
        checkpoint_path: str,
        detached: bool = False,
        _override_controller_namespace: Optional[str] = None,
    ):
        """Build the controller's state and schedule its control loop.

        Args:
            controller_name: Stored on the instance, used as the prefix of
                the KV-store namespace, and handed to ``HTTPState`` and
                ``DeploymentStateManager``.
            http_config: Passed through to ``HTTPState``.
            checkpoint_path: Stored on the instance and given to
                ``make_kv_store`` to create the checkpoint store.
            detached: Passed through to ``HTTPState`` and
                ``DeploymentStateManager``.
            _override_controller_namespace: Forwarded as a keyword argument
                to ``HTTPState`` and ``DeploymentStateManager``. It is not
                used for ``self.controller_namespace``, which is always read
                from the Ray runtime context.
        """
        configure_component_logger(
            component_name="controller", component_id=str(os.getpid()))

        # Checkpoint storage: both stores share one namespace derived from
        # the controller name and the runtime-context namespace.
        self.controller_namespace = ray.get_runtime_context().namespace
        self.controller_name = controller_name
        self.checkpoint_path = checkpoint_path
        store_namespace = f"{self.controller_name}-{self.controller_namespace}"
        self.kv_store = make_kv_store(
            checkpoint_path, namespace=store_namespace)
        self.snapshot_store = RayInternalKVStore(namespace=store_namespace)

        # deployment_name -> proxy_name -> queue length.
        self.deployment_stats = defaultdict(lambda: defaultdict(dict))

        # Serializes state-changing operations so that only one runs at a
        # time.
        self.write_lock = asyncio.Lock()

        self.long_poll_host = LongPollHost()

        self.http_state = HTTPState(
            controller_name,
            detached,
            http_config,
            _override_controller_namespace=_override_controller_namespace,
        )
        self.endpoint_state = EndpointState(self.kv_store, self.long_poll_host)

        # For controller failure recovery, the named actors currently
        # reported by Ray serve as the source of replica state.
        named_actors = ray.util.list_named_actors()
        self.deployment_state_manager = DeploymentStateManager(
            controller_name,
            detached,
            self.kv_store,
            self.long_poll_host,
            named_actors,
            _override_controller_namespace=_override_controller_namespace,
        )

        # Ray task reference for the most recent deployment request.
        self.config_deployment_request_ref: ObjectRef = None

        # Unix timestamp of the latest config deployment request; 0 until
        # one is recorded.
        self.deployment_timestamp = 0

        # TODO(simon): move autoscaling related stuff into a manager.
        self.autoscaling_metrics_store = InMemoryMetricsStore()

        # Schedule (not await) the control loop on the current event loop.
        event_loop = asyncio.get_event_loop()
        event_loop.create_task(self.run_control_loop())
Ejemplo n.º 3
0
    async def __init__(
        self,
        controller_name: str,
        http_config: HTTPOptions,
        checkpoint_path: str,
        detached: bool = False,
    ):
        """Build the controller's state and schedule its control loop.

        Args:
            controller_name: Stored on the instance, used as the prefix of
                the KV-store namespace, and handed to ``HTTPState`` and
                ``DeploymentStateManager``.
            http_config: Passed through to ``HTTPState``.
            checkpoint_path: Stored on the instance and given to
                ``make_kv_store`` to create the checkpoint store.
            detached: Passed through to ``HTTPState`` and
                ``DeploymentStateManager``.
        """
        # Checkpoint storage: both stores share one namespace derived from
        # the controller name and the runtime-context namespace.
        self.controller_namespace = ray.get_runtime_context().namespace
        self.controller_name = controller_name
        self.checkpoint_path = checkpoint_path
        store_namespace = f"{self.controller_name}-{self.controller_namespace}"
        self.kv_store = make_kv_store(
            checkpoint_path, namespace=store_namespace)
        self.snapshot_store = RayInternalKVStore(namespace=store_namespace)

        # deployment_name -> proxy_name -> queue length.
        self.deployment_stats = defaultdict(lambda: defaultdict(dict))

        # Serializes state-changing operations so that only one runs at a
        # time.
        self.write_lock = asyncio.Lock()

        self.long_poll_host = LongPollHost()

        # The goal manager is shared with the deployment state manager
        # created below.
        self.goal_manager = AsyncGoalManager()
        self.http_state = HTTPState(
            controller_name,
            detached,
            http_config,
        )
        self.endpoint_state = EndpointState(self.kv_store, self.long_poll_host)

        # For controller failure recovery, the named actors currently
        # reported by Ray serve as the source of replica state.
        named_actors = ray.util.list_named_actors()
        self.deployment_state_manager = DeploymentStateManager(
            controller_name,
            detached,
            self.kv_store,
            self.long_poll_host,
            self.goal_manager,
            named_actors,
        )

        # TODO(simon): move autoscaling related stuff into a manager.
        self.autoscaling_metrics_store = InMemoryMetricsStore()

        # Schedule (not await) the control loop on the current event loop.
        event_loop = asyncio.get_event_loop()
        event_loop.create_task(self.run_control_loop())