Example #1
0
def test_imbalanced_replicas(ongoing_requests):
    config = AutoscalingConfig(
        min_replicas=1,
        max_replicas=10,
        target_num_ongoing_requests_per_replica=5,
        upscale_delay_s=0.0,
        downscale_delay_s=0.0,
    )

    policy = BasicAutoscalingPolicy(config)

    # Check that as long as the average number of ongoing requests equals
    # the target_num_ongoing_requests_per_replica, the number of replicas
    # stays the same
    if (
        sum(ongoing_requests) / len(ongoing_requests)
        == config.target_num_ongoing_requests_per_replica
    ):
        new_num_replicas = policy.get_decision_num_replicas(
            current_num_ongoing_requests=ongoing_requests,
            curr_target_num_replicas=4,
            current_handle_queued_queries=0,
        )
        assert new_num_replicas == 4

    # Check downscaling behavior when average number of requests
    # is lower than target_num_ongoing_requests_per_replica
    elif (
        sum(ongoing_requests) / len(ongoing_requests)
        < config.target_num_ongoing_requests_per_replica
    ):
        new_num_replicas = policy.get_decision_num_replicas(
            current_num_ongoing_requests=ongoing_requests,
            curr_target_num_replicas=4,
            current_handle_queued_queries=0,
        )

        if (
            config.target_num_ongoing_requests_per_replica
            - sum(ongoing_requests) / len(ongoing_requests)
            <= 1
        ):
            # Autoscaling uses a ceiling operator, which means a slightly low
            # current_num_ongoing_requests value is insufficient to downscale
            assert new_num_replicas == 4
        else:
            assert new_num_replicas == 3

    # Check upscaling behavior when average number of requests
    # is higher than target_num_ongoing_requests_per_replica
    else:
        new_num_replicas = policy.get_decision_num_replicas(
            current_num_ongoing_requests=ongoing_requests,
            curr_target_num_replicas=4,
            current_handle_queued_queries=0,
        )
        assert new_num_replicas == 5
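
The ongoing_requests argument above arrives via pytest parametrization that the
excerpt omits. A minimal sketch of what that decorator presumably looks like;
the specific request lists are illustrative assumptions, not from the source:

import pytest

# Hypothetical cases: averages of 5 (at target), 4 (just below target, so the
# ceiling keeps 4 replicas), and 9 (above target) exercise all three branches.
@pytest.mark.parametrize("ongoing_requests",
                         [[6, 2, 8, 4], [3, 3, 5, 5], [9, 9, 9, 9]])
def test_imbalanced_replicas(ongoing_requests):
    ...  # body as in Example #1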
Example #2
0
    async def create_backend(self, backend_tag, backend_config,
                             replica_config):
        """Register a new backend under the specified tag."""
        async with self.write_lock:
            backend_worker = create_backend_worker(
                replica_config.func_or_class)

            # Save the creator that starts replicas, the arguments to be
            # passed in, and the configuration for the backends.
            self.backends[backend_tag] = BackendInfo(backend_worker,
                                                     backend_config,
                                                     replica_config)
            if backend_config.autoscaling_config is not None:
                self.autoscaling_policies[
                    backend_tag] = BasicAutoscalingPolicy(
                        backend_tag, backend_config.autoscaling_config)

            self._scale_replicas(backend_tag, backend_config.num_replicas)

            # NOTE(edoakes): we must write a checkpoint before starting new
            # replicas or pushing the updated config to avoid inconsistent
            # state if we crash while making the change.
            self._checkpoint()
            await self._start_pending_replicas()

            # Set the backend config inside the router
            # (particularly for max-batch-size).
            await asyncio.gather(*[
                router.set_backend_config.remote(backend_tag, backend_config)
                for router in self.routers.values()
            ])
            await self.broadcast_backend_config(backend_tag)
Example #3
0
    def deploy(
        self,
        name: str,
        deployment_config_proto_bytes: bytes,
        replica_config: ReplicaConfig,
        version: Optional[str],
        prev_version: Optional[str],
        route_prefix: Optional[str],
        deployer_job_id: "ray._raylet.JobID",
    ) -> Tuple[Optional[GoalId], bool]:
        if route_prefix is not None:
            assert route_prefix.startswith("/")

        deployment_config = DeploymentConfig.from_proto_bytes(
            deployment_config_proto_bytes)

        if prev_version is not None:
            existing_deployment_info = self.deployment_state_manager.get_deployment(
                name)
            if existing_deployment_info is None or not existing_deployment_info.version:
                raise ValueError(
                    f"prev_version '{prev_version}' is specified but "
                    "there is no existing deployment.")
            if existing_deployment_info.version != prev_version:
                raise ValueError(
                    f"prev_version '{prev_version}' "
                    "does not match with the existing "
                    f"version '{existing_deployment_info.version}'.")

        autoscaling_config = deployment_config.autoscaling_config
        if autoscaling_config is not None:
            # TODO: is this the desired behaviour? Should this be a setting?
            deployment_config.num_replicas = autoscaling_config.min_replicas

            autoscaling_policy = BasicAutoscalingPolicy(autoscaling_config)
        else:
            autoscaling_policy = None

        deployment_info = DeploymentInfo(
            actor_name=name,
            serialized_deployment_def=replica_config.serialized_deployment_def,
            version=version,
            deployment_config=deployment_config,
            replica_config=replica_config,
            deployer_job_id=deployer_job_id,
            start_time_ms=int(time.time() * 1000),
            autoscaling_policy=autoscaling_policy,
        )
        # TODO(architkulkarni): When a deployment is redeployed, even if
        # the only change was num_replicas, the start_time_ms is refreshed.
        # Is this the desired behaviour?

        goal_id, updating = self.deployment_state_manager.deploy(
            name, deployment_info)

        if route_prefix is not None:
            endpoint_info = EndpointInfo(route=route_prefix)
            self.endpoint_state.update_endpoint(name, endpoint_info)

        return goal_id, updating
Example #4
0
def test_single_replica_receives_all_requests(ongoing_requests):
    target_requests = 5

    config = AutoscalingConfig(
        min_replicas=1,
        max_replicas=50,
        target_num_ongoing_requests_per_replica=target_requests,
        upscale_delay_s=0.0,
        downscale_delay_s=0.0)

    policy = BasicAutoscalingPolicy(config)

    new_num_replicas = policy.get_decision_num_replicas(
        current_num_ongoing_requests=ongoing_requests,
        curr_target_num_replicas=4)
    assert new_num_replicas == sum(ongoing_requests) / target_requests
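
The assertion above implies the core decision formula: total ongoing requests
divided by the per-replica target, rounded up (Example #1's comment about the
"ceiling operator" points the same way). A minimal sketch of that arithmetic;
the helper name is ours, not Ray's:

import math

def expected_replicas(ongoing_requests, target_per_replica):
    # Assumed formula: ceil(total ongoing requests / per-replica target).
    return math.ceil(sum(ongoing_requests) / target_per_replica)

assert expected_replicas([25], 5) == 5          # one hot replica -> 5 replicas
assert expected_replicas([6, 2, 8, 4], 5) == 4  # average at target -> stays 4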
Example #5
0
def test_fluctuating_ongoing_requests(delay_s):
    """
    Simulates a workload that switches between too many and too few
    ongoing requests.
    """

    config = AutoscalingConfig(
        min_replicas=1,
        max_replicas=10,
        target_num_ongoing_requests_per_replica=50,
        upscale_delay_s=delay_s,
        downscale_delay_s=delay_s,
    )

    policy = BasicAutoscalingPolicy(config)

    if delay_s > 0:
        wait_periods = int(delay_s / CONTROL_LOOP_PERIOD_S)
        assert wait_periods > 1

    underload_requests, overload_requests = [20, 20], [100]
    trials = 1000

    new_num_replicas = None
    for trial in range(trials):
        if trial % 2 == 0:
            new_num_replicas = policy.get_decision_num_replicas(
                current_num_ongoing_requests=overload_requests,
                curr_target_num_replicas=1,
                current_handle_queued_queries=0,
            )
            if delay_s > 0:
                assert new_num_replicas == 1, trial
            else:
                assert new_num_replicas == 2, trial
        else:
            new_num_replicas = policy.get_decision_num_replicas(
                current_num_ongoing_requests=underload_requests,
                curr_target_num_replicas=2,
                current_handle_queued_queries=0,
            )
            if delay_s > 0:
                assert new_num_replicas == 2, trial
            else:
                assert new_num_replicas == 1, trial
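
Example #5 only passes if the policy counts consecutive same-direction
decisions and acts once the streak outlasts the configured delay. A minimal
sketch of that mechanism, assuming one decision per CONTROL_LOOP_PERIOD_S;
this is our illustration, not Ray's implementation:

class DelayedScaler:
    """Illustrative consecutive-decision counter."""

    def __init__(self, wait_periods):
        self.wait_periods = wait_periods
        self.streak = 0     # consecutive periods requesting the same change
        self.direction = 0  # +1 for upscale, -1 for downscale

    def decide(self, desired, current):
        direction = (desired > current) - (desired < current)
        if direction == 0:
            self.streak = 0
            return current
        # An opposite-direction decision resets the streak, which is exactly
        # why the alternating workload above never scales when delay_s > 0.
        self.streak = self.streak + 1 if direction == self.direction else 1
        self.direction = direction
        return desired if self.streak > self.wait_periods else current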
Example #6
0
    async def create_backend(self, backend_tag: BackendTag,
                             backend_config: BackendConfig,
                             replica_config: ReplicaConfig) -> Optional[UUID]:
        """Register a new backend under the specified tag."""
        async with self.write_lock:
            # Ensures this method is idempotent.
            backend_info = self.backend_state.get_backend(backend_tag)
            if backend_info is not None:
                if (backend_info.backend_config == backend_config
                        and backend_info.replica_config == replica_config):
                    return None

            backend_replica = create_backend_replica(
                replica_config.func_or_class)

            # Save the creator that starts replicas, the arguments to be
            # passed in, and the configuration for the backends.
            backend_info = BackendInfo(
                worker_class=backend_replica,
                backend_config=backend_config,
                replica_config=replica_config)
            self.backend_state.add_backend(backend_tag, backend_info)
            metadata = backend_config.internal_metadata
            if metadata.autoscaling_config is not None:
                self.autoscaling_policies[
                    backend_tag] = BasicAutoscalingPolicy(
                        backend_tag, metadata.autoscaling_config)

            try:
                # NOTE: this scaling should eventually be driven by the
                # control loop.
                self.actor_reconciler._scale_backend_replicas(
                    self.backend_state.backends, backend_tag,
                    backend_config.num_replicas)
            except RayServeException as e:
                del self.backend_state.backends[backend_tag]
                raise e

            return_uuid = self._create_event_with_result({
                backend_tag: backend_info
            })
            # NOTE(edoakes): we must write a checkpoint before starting new
            # replicas or pushing the updated config to avoid inconsistent
            # state if we crash while making the change.
            self._checkpoint()
            await self.actor_reconciler._enqueue_pending_scale_changes_loop(
                self.backend_state)
            await self.actor_reconciler.backend_control_loop()

            self.notify_replica_handles_changed()

            # Set the backend config inside routers
            # (particularly for max_concurrent_queries).
            self.notify_backend_configs_changed()
            return return_uuid
Example #7
0
def test_replicas_delayed_startup():
    """Unit test simulating replicas taking time to start up."""
    config = AutoscalingConfig(
        min_replicas=1,
        max_replicas=200,
        target_num_ongoing_requests_per_replica=1,
        upscale_delay_s=0,
        downscale_delay_s=100000,
    )

    policy = BasicAutoscalingPolicy(config)

    new_num_replicas = policy.get_decision_num_replicas([100], 1)
    assert new_num_replicas == 100

    # New target is 100, but no new replicas finished spinning up during this
    # timestep.
    new_num_replicas = policy.get_decision_num_replicas([100], 100)
    assert new_num_replicas == 100

    # Two new replicas spun up during this timestep.
    new_num_replicas = policy.get_decision_num_replicas([100, 20, 3], 100)
    assert new_num_replicas == 123

    # A lot of queries got drained and a lot of replicas started up, but
    # new_num_replicas should not decrease, because of the downscale delay.
    new_num_replicas = policy.get_decision_num_replicas([6, 2, 1, 1], 123)
    assert new_num_replicas == 123
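
Replaying Example #7 with the ceiling formula sketched after Example #4 (our
assumption) confirms the arithmetic; the snippet below is illustrative only:

import math

target = 1  # target_num_ongoing_requests_per_replica in Example #7
assert math.ceil(sum([100]) / target) == 100         # one replica, 100 queued
assert math.ceil(sum([100, 20, 3]) / target) == 123  # three replicas up
# The drained state [6, 2, 1, 1] would suggest 10 replicas, but
# downscale_delay_s=100000 blocks the decrease, so the test expects 123.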
Example #8
0
    def deploy(
        self,
        name: str,
        deployment_config_proto_bytes: bytes,
        replica_config_proto_bytes: bytes,
        route_prefix: Optional[str],
        deployer_job_id: Union["ray._raylet.JobID", bytes],
    ) -> bool:
        if route_prefix is not None:
            assert route_prefix.startswith("/")

        deployment_config = DeploymentConfig.from_proto_bytes(
            deployment_config_proto_bytes
        )
        version = deployment_config.version
        replica_config = ReplicaConfig.from_proto_bytes(
            replica_config_proto_bytes, deployment_config.needs_pickle()
        )

        autoscaling_config = deployment_config.autoscaling_config
        if autoscaling_config is not None:
            # TODO: is this the desired behaviour? Should this be a setting?
            deployment_config.num_replicas = autoscaling_config.min_replicas

            autoscaling_policy = BasicAutoscalingPolicy(autoscaling_config)
        else:
            autoscaling_policy = None
        if isinstance(deployer_job_id, bytes):
            deployer_job_id = ray.JobID.from_int(
                int.from_bytes(deployer_job_id, "little")
            )
        deployment_info = DeploymentInfo(
            actor_name=name,
            version=version,
            deployment_config=deployment_config,
            replica_config=replica_config,
            deployer_job_id=deployer_job_id,
            start_time_ms=int(time.time() * 1000),
            autoscaling_policy=autoscaling_policy,
        )
        # TODO(architkulkarni): When a deployment is redeployed, even if
        # the only change was num_replicas, the start_time_ms is refreshed.
        # Is this the desired behaviour?
        updating = self.deployment_state_manager.deploy(name, deployment_info)

        if route_prefix is not None:
            endpoint_info = EndpointInfo(route=route_prefix)
            self.endpoint_state.update_endpoint(name, endpoint_info)
        else:
            self.endpoint_state.delete_endpoint(name)

        return updating
Example #9
0
    async def create_backend(self, backend_tag: BackendTag,
                             backend_config: BackendConfig,
                             replica_config: ReplicaConfig) -> None:
        """Register a new backend under the specified tag."""
        async with self.write_lock:
            # Ensures this method is idempotent.
            backend_info = self.configuration_store.get_backend(backend_tag)
            if backend_info is not None:
                if (backend_info.backend_config == backend_config
                        and backend_info.replica_config == replica_config):
                    return

            backend_worker = create_backend_worker(
                replica_config.func_or_class)

            # Save the creator that starts replicas, the arguments to be
            # passed in, and the configuration for the backends.
            self.configuration_store.add_backend(
                backend_tag,
                BackendInfo(
                    worker_class=backend_worker,
                    backend_config=backend_config,
                    replica_config=replica_config))
            metadata = backend_config.internal_metadata
            if metadata.autoscaling_config is not None:
                self.autoscaling_policies[
                    backend_tag] = BasicAutoscalingPolicy(
                        backend_tag, metadata.autoscaling_config)

            try:
                self.actor_reconciler._scale_replicas(
                    self.configuration_store.backends, backend_tag,
                    backend_config.num_replicas)
            except RayServeException as e:
                del self.configuration_store.backends[backend_tag]
                raise e

            # NOTE(edoakes): we must write a checkpoint before starting new
            # replicas or pushing the updated config to avoid inconsistent
            # state if we crash while making the change.
            self._checkpoint()
            await self.actor_reconciler._start_pending_replicas(
                self.configuration_store)

            # Set the backend config inside the router
            # (particularly for max-batch-size).
            await asyncio.gather(*[
                router.set_backend_config.remote(backend_tag, backend_config)
                for router in self.actor_reconciler.router_handles()
            ])
            await self.broadcast_backend_config(backend_tag)
Example #10
0
    async def _recover_from_checkpoint(
            self, config_store: ConfigurationStore,
            controller: "ServeController"
    ) -> Dict[BackendTag, BasicAutoscalingPolicy]:
        self._recover_actor_handles()
        autoscaling_policies = dict()
        # Push configuration state to the router.
        # TODO(edoakes): should we make this a pull-only model for simplicity?
        for endpoint, traffic_policy in config_store.traffic_policies.items():
            await asyncio.gather(*[
                router.set_traffic.remote(endpoint, traffic_policy)
                for router in self.router_handles()
            ])

        for backend_tag, replica_dict in self.workers.items():
            for replica_tag, worker in replica_dict.items():
                await asyncio.gather(*[
                    router.add_new_worker.remote(backend_tag, replica_tag,
                                                 worker)
                    for router in self.router_handles()
                ])

        for backend, info in config_store.backends.items():
            await asyncio.gather(*[
                router.set_backend_config.remote(backend, info.backend_config)
                for router in self.router_handles()
            ])
            await controller.broadcast_backend_config(backend)
            metadata = info.backend_config.internal_metadata
            if metadata.autoscaling_config is not None:
                autoscaling_policies[backend] = BasicAutoscalingPolicy(
                    backend, metadata.autoscaling_config)

        # Push configuration state to the routers.
        await asyncio.gather(*[
            router.set_route_table.remote(config_store.routes)
            for router in self.router_handles()
        ])

        # Start/stop any pending backend replicas.
        await self._start_pending_replicas(config_store)
        await self._stop_pending_replicas()

        # Remove any pending backends and endpoints.
        await self._remove_pending_backends()
        await self._remove_pending_endpoints()

        return autoscaling_policies
Example #11
0
    async def _recover_from_checkpoint(
        self, backend_state: BackendState, controller: "ServeController"
    ) -> Dict[BackendTag, BasicAutoscalingPolicy]:
        self._recover_actor_handles()
        autoscaling_policies = dict()

        for backend, info in backend_state.backends.items():
            metadata = info.backend_config.internal_metadata
            if metadata.autoscaling_config is not None:
                autoscaling_policies[backend] = BasicAutoscalingPolicy(
                    backend, metadata.autoscaling_config)

        # Start/stop any pending backend replicas.
        await self._enqueue_pending_scale_changes_loop(backend_state)

        return autoscaling_policies
Example #12
0
    async def _recover_from_checkpoint(
        self, config_store: ConfigurationStore, controller: "ServeController"
    ) -> Dict[BackendTag, BasicAutoscalingPolicy]:
        self._recover_actor_handles()
        autoscaling_policies = dict()

        for backend, info in config_store.backends.items():
            metadata = info.backend_config.internal_metadata
            if metadata.autoscaling_config is not None:
                autoscaling_policies[backend] = BasicAutoscalingPolicy(
                    backend, metadata.autoscaling_config)

        # Start/stop any pending backend replicas.
        await self._start_pending_backend_replicas(config_store)
        await self._stop_pending_backend_replicas()

        return autoscaling_policies
Example #13
0
def test_upscale_downscale_delay():
    """Unit test for upscale_delay_s and downscale_delay_s."""

    upscale_delay_s = 30.0
    downscale_delay_s = 600.0

    config = AutoscalingConfig(
        min_replicas=1,
        max_replicas=2,
        target_num_ongoing_requests_per_replica=1,
        upscale_delay_s=upscale_delay_s,
        downscale_delay_s=downscale_delay_s,
    )

    policy = BasicAutoscalingPolicy(config)

    upscale_wait_periods = int(upscale_delay_s / CONTROL_LOOP_PERIOD_S)
    downscale_wait_periods = int(downscale_delay_s / CONTROL_LOOP_PERIOD_S)

    overload_requests = [100]

    # We should scale up only after enough consecutive scale-up decisions.
    for i in range(upscale_wait_periods):
        new_num_replicas = policy.get_decision_num_replicas(
            current_num_ongoing_requests=overload_requests,
            curr_target_num_replicas=1)
        assert new_num_replicas == 1, i

    new_num_replicas = policy.get_decision_num_replicas(
        current_num_ongoing_requests=overload_requests,
        curr_target_num_replicas=1)
    assert new_num_replicas == 2

    no_requests = [0, 0]

    # We should scale down only after enough consecutive scale-down decisions.
    for i in range(downscale_wait_periods):
        new_num_replicas = policy.get_decision_num_replicas(
            current_num_ongoing_requests=no_requests,
            curr_target_num_replicas=2)
        assert new_num_replicas == 2, i

    new_num_replicas = policy.get_decision_num_replicas(
        current_num_ongoing_requests=no_requests, curr_target_num_replicas=2)
    assert new_num_replicas == 1

    # Get some scale-up decisions, but not enough to trigger a scale up.
    for i in range(int(upscale_wait_periods / 2)):
        new_num_replicas = policy.get_decision_num_replicas(
            current_num_ongoing_requests=overload_requests,
            curr_target_num_replicas=1)
        assert new_num_replicas == 1, i

    # Interrupt with a scale-down decision.
    policy.get_decision_num_replicas(current_num_ongoing_requests=[0],
                                     curr_target_num_replicas=1)

    # The counter should be reset, so it should require `upscale_wait_periods`
    # more periods before we actually scale up.
    for i in range(upscale_wait_periods):
        new_num_replicas = policy.get_decision_num_replicas(
            current_num_ongoing_requests=overload_requests,
            curr_target_num_replicas=1)
        assert new_num_replicas == 1, i

    new_num_replicas = policy.get_decision_num_replicas(
        current_num_ongoing_requests=overload_requests,
        curr_target_num_replicas=1)
    assert new_num_replicas == 2

    # Get some scale-down decisions, but not enough to trigger a scale down.
    for i in range(int(downscale_wait_periods / 2)):
        new_num_replicas = policy.get_decision_num_replicas(
            current_num_ongoing_requests=no_requests,
            curr_target_num_replicas=2)
        assert new_num_replicas == 2, i

    # Interrupt with a scale-up decision.
    policy.get_decision_num_replicas(current_num_ongoing_requests=[100, 100],
                                     curr_target_num_replicas=2)

    # The counter should be reset, so it should require
    # `downscale_wait_periods` more periods before we actually scale down.
    for i in range(downscale_wait_periods):
        new_num_replicas = policy.get_decision_num_replicas(
            current_num_ongoing_requests=no_requests,
            curr_target_num_replicas=2)
        assert new_num_replicas == 2, i

    new_num_replicas = policy.get_decision_num_replicas(
        current_num_ongoing_requests=no_requests, curr_target_num_replicas=2)
    assert new_num_replicas == 1
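
The interruption behavior that Example #13 verifies falls out of the
illustrative DelayedScaler sketched after Example #5 (again our sketch, not
Ray's code):

scaler = DelayedScaler(wait_periods=3)
for _ in range(3):
    assert scaler.decide(desired=2, current=1) == 1  # streak too short
assert scaler.decide(desired=1, current=1) == 1      # interruption resets it
for _ in range(3):
    assert scaler.decide(desired=2, current=1) == 1  # streak rebuilt from 1
assert scaler.decide(desired=2, current=1) == 2      # streak 4 > 3 -> scale up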
Example #14
0
    async def _recover_from_checkpoint(self, checkpoint_bytes):
        """Recover the instance state from the provided checkpoint.

        Performs the following operations:
            1) Deserializes the internal state from the checkpoint.
            2) Pushes the latest configuration to the routers
               in case we crashed before updating them.
            3) Starts/stops any worker replicas that are pending creation or
               deletion.

        NOTE: this requires that self.write_lock is already acquired and will
        release it before returning.
        """
        assert self.write_lock.locked()

        start = time.time()
        logger.info("Recovering from checkpoint")

        # Load internal state from the checkpoint data.
        (
            self.routes,
            router_node_ids,
            self.backends,
            self.traffic_policies,
            self.replicas,
            self.replicas_to_start,
            self.replicas_to_stop,
            self.backends_to_remove,
            self.endpoints_to_remove,
        ) = pickle.loads(checkpoint_bytes)

        for node_id in router_node_ids:
            router_name = format_actor_name(SERVE_PROXY_NAME,
                                            self.instance_name, node_id)
            self.routers[node_id] = ray.get_actor(router_name)

        # Fetch actor handles for all of the backend replicas in the system.
        # All of these workers are guaranteed to already exist because they
        # would not be written to a checkpoint in self.workers until they
        # were created.
        for backend_tag, replica_tags in self.replicas.items():
            for replica_tag in replica_tags:
                replica_name = format_actor_name(replica_tag,
                                                 self.instance_name)
                self.workers[backend_tag][replica_tag] = ray.get_actor(
                    replica_name)

        # Push configuration state to the router.
        # TODO(edoakes): should we make this a pull-only model for simplicity?
        for endpoint, traffic_policy in self.traffic_policies.items():
            await asyncio.gather(*[
                router.set_traffic.remote(endpoint, traffic_policy)
                for router in self.routers.values()
            ])

        for backend_tag, replica_dict in self.workers.items():
            for replica_tag, worker in replica_dict.items():
                await asyncio.gather(*[
                    router.add_new_worker.remote(backend_tag, replica_tag,
                                                 worker)
                    for router in self.routers.values()
                ])

        for backend, info in self.backends.items():
            await asyncio.gather(*[
                router.set_backend_config.remote(backend, info.backend_config)
                for router in self.routers.values()
            ])
            await self.broadcast_backend_config(backend)
            if info.backend_config.autoscaling_config is not None:
                self.autoscaling_policies[backend] = BasicAutoscalingPolicy(
                    backend, info.backend_config.autoscaling_config)

        # Push configuration state to the routers.
        await asyncio.gather(*[
            router.set_route_table.remote(self.routes)
            for router in self.routers.values()
        ])

        # Start/stop any pending backend replicas.
        await self._start_pending_replicas()
        await self._stop_pending_replicas()

        # Remove any pending backends and endpoints.
        await self._remove_pending_backends()
        await self._remove_pending_endpoints()

        logger.info(
            "Recovered from checkpoint in {:.3f}s".format(time.time() - start))

        self.write_lock.release()
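
Examples #2, #6, and #9 call self._checkpoint() without showing its body.
Given the pickle.loads() above, a minimal sketch of the matching writer; the
field order comes from Example #14, while the storage call is an assumption:

import pickle

def _checkpoint(self):
    # Hypothetical inverse of the pickle.loads() in Example #14: persist the
    # same nine fields, in the same order, before acting on them.
    checkpoint_bytes = pickle.dumps((
        self.routes,
        list(self.routers.keys()),  # router_node_ids
        self.backends,
        self.traffic_policies,
        self.replicas,
        self.replicas_to_start,
        self.replicas_to_stop,
        self.backends_to_remove,
        self.endpoints_to_remove,
    ))
    self.kv_store.put("checkpoint", checkpoint_bytes)  # assumed storage API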