async def main():
    """Size a cluster for the current test mode, run the serve micro
    benchmark, log its result JSON, and persist it to the default file."""
    # IS_SMOKE_TEST is set by args of releaser's e2e.py; it decides whether
    # we spin up a single local node or a full anyscale cluster.
    if is_smoke_test():
        setup_local_single_node_cluster(1)
    else:
        setup_anyscale_cluster()

    benchmark_result = await benchmark_main()
    logger.info(benchmark_result)

    save_test_results(
        benchmark_result, default_output_file="/tmp/micro_benchmark.json"
    )
def main(num_replicas: Optional[int], trial_length: Optional[str],
         max_batch_size: Optional[int]):
    """Single-deployment noop benchmark.

    Deploys ``num_replicas`` replicas of a noop deployment, warms the
    cluster up, runs wrk against every serve endpoint on all nodes for
    ``trial_length``, then logs and saves the aggregated metrics.

    Args:
        num_replicas: Target replica count. Falls back to the smoke-test or
            full-test default constant when None.
        trial_length: wrk trial duration string (e.g. "1m"). Falls back to
            the matching default constant when None.
        max_batch_size: Forwarded to deploy_replicas (serve batching knob).
    """
    # Give default cluster parameter values based on smoke_test config
    # if user provided values explicitly, use them instead.
    # IS_SMOKE_TEST is set by args of releaser's e2e.py
    smoke_test = os.environ.get("IS_SMOKE_TEST", "1")
    if smoke_test == "1":
        num_replicas = num_replicas or DEFAULT_SMOKE_TEST_NUM_REPLICA
        trial_length = trial_length or DEFAULT_SMOKE_TEST_TRIAL_LENGTH
        logger.info(
            f"Running local / smoke test with {num_replicas} replicas ..\n")

        # Choose cluster setup based on user config. Local test uses Cluster()
        # to mock actors that requires # of nodes to be specified, but ray
        # client doesn't need to
        num_nodes = int(math.ceil(num_replicas / NUM_CPU_PER_NODE))
        logger.info(
            f"Setting up local ray cluster with {num_nodes} nodes ..\n")
        serve_client = setup_local_single_node_cluster(num_nodes)[0]
    else:
        num_replicas = num_replicas or DEFAULT_FULL_TEST_NUM_REPLICA
        trial_length = trial_length or DEFAULT_FULL_TEST_TRIAL_LENGTH
        logger.info(f"Running full test with {num_replicas} replicas ..\n")
        logger.info("Setting up anyscale ray cluster .. \n")
        serve_client = setup_anyscale_cluster()

    http_host = str(serve_client._http_config.host)
    http_port = str(serve_client._http_config.port)
    logger.info(f"Ray serve http_host: {http_host}, http_port: {http_port}")

    logger.info(f"Deploying with {num_replicas} target replicas ....\n")
    deploy_replicas(num_replicas, max_batch_size)

    logger.info("Warming up cluster ....\n")
    # BUG FIX: previously the warm-up task was fired and forgotten, so the
    # wrk trial could start before warm-up completed. Block on the result,
    # matching the ray.get() pattern used by the multi-deployment test.
    ray.get(warm_up_one_cluster.remote(10, http_host, http_port, "echo"))

    logger.info(f"Starting wrk trial on all nodes for {trial_length} ....\n")
    # For detailed discussion, see https://github.com/wg/wrk/issues/205
    # TODO:(jiaodong) What's the best number to use here ?
    all_endpoints = list(serve.list_deployments().keys())
    all_metrics, all_wrk_stdout = run_wrk_on_all_nodes(
        trial_length, NUM_CONNECTIONS, http_host, http_port,
        all_endpoints=all_endpoints)

    aggregated_metrics = aggregate_all_metrics(all_metrics)
    logger.info("Wrk stdout on each node: ")
    for wrk_stdout in all_wrk_stdout:
        logger.info(wrk_stdout)
    logger.info("Final aggregated metrics: ")
    for key, val in aggregated_metrics.items():
        logger.info(f"{key}: {val}")
    save_test_results(
        aggregated_metrics,
        default_output_file="/tmp/single_deployment_1k_noop_replica.json")
def main(
        min_replicas: Optional[int],
        max_replicas: Optional[int],
        num_deployments: Optional[int],
        trial_length: Optional[str],
):
    """Autoscaling multi-deployment benchmark.

    Deploys ``num_deployments`` deployments autoscaling between
    ``min_replicas`` and ``max_replicas``, warms each endpoint up, runs wrk
    on all nodes for ``trial_length``, and saves aggregated metrics.

    Args:
        min_replicas: Autoscaling lower bound; defaults per test mode.
        max_replicas: Autoscaling upper bound; defaults per test mode.
        num_deployments: Number of serve deployments; defaults per test mode.
        trial_length: wrk trial duration string; defaults per test mode.
    """
    # Give default cluster parameter values based on smoke_test config
    # if user provided values explicitly, use them instead.
    # IS_SMOKE_TEST is set by args of releaser's e2e.py
    if is_smoke_test():
        min_replicas = min_replicas or DEFAULT_SMOKE_TEST_MIN_NUM_REPLICA
        max_replicas = max_replicas or DEFAULT_SMOKE_TEST_MAX_NUM_REPLICA
        num_deployments = num_deployments or DEFAULT_SMOKE_TEST_NUM_DEPLOYMENTS
        trial_length = trial_length or DEFAULT_SMOKE_TEST_TRIAL_LENGTH
        logger.info(f"Running smoke test with min {min_replicas} and max "
                    f"{max_replicas} replicas, {num_deployments} deployments "
                    f".. \n")
        # Choose cluster setup based on user config. Local test uses Cluster()
        # to mock actors that requires # of nodes to be specified, but ray
        # client doesn't need to
        num_nodes = int(math.ceil(max_replicas / NUM_CPU_PER_NODE))
        logger.info(
            f"Setting up local ray cluster with {num_nodes} nodes .. \n")
        serve_client = setup_local_single_node_cluster(num_nodes)[0]
    else:
        min_replicas = min_replicas or DEFAULT_FULL_TEST_MIN_NUM_REPLICA
        max_replicas = max_replicas or DEFAULT_FULL_TEST_MAX_NUM_REPLICA
        num_deployments = num_deployments or DEFAULT_FULL_TEST_NUM_DEPLOYMENTS
        trial_length = trial_length or DEFAULT_FULL_TEST_TRIAL_LENGTH
        logger.info(f"Running full test with min {min_replicas} and max "
                    f"{max_replicas} replicas, {num_deployments} deployments "
                    f".. \n")
        logger.info("Setting up anyscale ray cluster .. \n")
        serve_client = setup_anyscale_cluster()

    http_host = str(serve_client._http_config.host)
    http_port = str(serve_client._http_config.port)
    logger.info(f"Ray serve http_host: {http_host}, http_port: {http_port}")

    # BUG FIX: the original implicit f-string concatenation was missing a
    # space, logging e.g. "max 10target replicas".
    logger.info(f"Deploying with min {min_replicas} and max {max_replicas} "
                f"target replicas ....\n")
    setup_multi_deployment_replicas(min_replicas, max_replicas,
                                    num_deployments)

    logger.info("Warming up cluster ....\n")
    # Fan out one warm-up task per endpoint, then block until all finish so
    # the wrk trial measures a warm cluster.
    endpoint_refs = []
    all_endpoints = list(serve.list_deployments().keys())
    for endpoint in all_endpoints:
        endpoint_refs.append(
            warm_up_one_cluster.options(num_cpus=0).remote(
                10, http_host, http_port, endpoint))
    for endpoint in ray.get(endpoint_refs):
        logger.info(f"Finished warming up {endpoint}")

    logger.info(f"Starting wrk trial on all nodes for {trial_length} ....\n")
    # For detailed discussion, see https://github.com/wg/wrk/issues/205
    # TODO:(jiaodong) What's the best number to use here ?
    all_metrics, all_wrk_stdout = run_wrk_on_all_nodes(
        trial_length, NUM_CONNECTIONS, http_host, http_port,
        all_endpoints=all_endpoints)

    aggregated_metrics = aggregate_all_metrics(all_metrics)
    logger.info("Wrk stdout on each node: ")
    for wrk_stdout in all_wrk_stdout:
        logger.info(wrk_stdout)
    logger.info("Final aggregated metrics: ")
    for key, val in aggregated_metrics.items():
        logger.info(f"{key}: {val}")
    save_test_results(
        aggregated_metrics,
        default_output_file="/tmp/autoscaling_multi_deployment.json")
def main():
    """Serve cluster fault-tolerance test.

    Deploys two named deployments backed by a persistent checkpoint, tears
    the whole cluster down, restarts it in the same namespace with NO new
    deploy() calls, and asserts the endpoints recover from the checkpoint
    alone. Writes a success marker to /tmp on completion.
    """
    # Setup local cluster, note this cluster setup is the same for both
    # local and product ray cluster env.
    # Each test uses different ray namespace, thus kv storage key for each
    # checkpoint is different to avoid collision.
    namespace = uuid.uuid4().hex

    # IS_SMOKE_TEST is set by args of releaser's e2e.py
    if is_smoke_test():
        # Smoke test checkpoints to a local file; remove any stale copy so
        # recovery is exercised against this run's state only.
        path = Path("checkpoint.db")
        checkpoint_path = f"file://{path}"
        if path.exists():
            path.unlink()
    else:
        checkpoint_path = (
            "s3://serve-nightly-tests/fault-tolerant-test-checkpoint"  # noqa: E501
        )

    _, cluster = setup_local_single_node_cluster(
        1, checkpoint_path=checkpoint_path, namespace=namespace)

    # Deploy for the first time
    @serve.deployment(num_replicas=DEFAULT_NUM_REPLICAS)
    def hello():
        return serve.get_replica_context().deployment

    for name in ["hello", "world"]:
        hello.options(name=name).deploy()

        # Each endpoint should echo its own deployment name back.
        for _ in range(5):
            response = request_with_retries(f"/{name}/", timeout=3)
            assert response.text == name

    logger.info("Initial deployment successful with working endpoint.")

    # Kill current cluster, recover from remote checkpoint and ensure endpoint
    # is still available with expected results
    ray.kill(serve.context._global_client._controller, no_restart=True)
    ray.shutdown()
    cluster.shutdown()
    serve.context.set_global_client(None)

    # Start another ray cluster with same namespace to resume from previous
    # checkpoints with no new deploy() call.
    setup_local_single_node_cluster(
        1, checkpoint_path=checkpoint_path, namespace=namespace)

    for name in ["hello", "world"]:
        for _ in range(5):
            response = request_with_retries(f"/{name}/", timeout=3)
            assert response.text == name

    logger.info("Deployment recovery from s3 checkpoint is successful "
                "with working endpoint.")

    # Delete dangling checkpoints. If script failed before this step, it's up
    # to the TTL policy on s3 to clean up, but won't lead to collision with
    # subsequent tests since each test run in different uuid namespace.
    serve.shutdown()
    ray.shutdown()
    cluster.shutdown()

    # Checkpoints in S3 bucket are moved after 7 days with explicit lifecycle
    # rules. Each checkpoint is ~260 Bytes in size from this test.

    # Save results
    save_test_results(
        {"result": "success"},
        default_output_file="/tmp/serve_cluster_fault_tolerance.json",
    )
def main(
    fanout_degree: Optional[int],
    init_delay_secs: Optional[int],
    compute_delay_secs: Optional[int],
    num_requests_per_client: Optional[int],
    num_clients: Optional[int],
    throughput_trial_duration_secs: Optional[int],
    local_test: Optional[bool],
):
    """Benchmark a wide-fanout serve deployment graph.

    Builds a graph with ``fanout_degree`` branches, sanity-checks its
    output, measures throughput (tps) and latency (ms) with the given
    client settings, prints both, and saves the metrics.
    """
    if local_test:
        setup_local_single_node_cluster(1, num_cpu_per_node=8)
    else:
        setup_anyscale_cluster()

    graph = test_wide_fanout_deployment_graph(
        fanout_degree,
        init_delay_secs=init_delay_secs,
        compute_delay_secs=compute_delay_secs,
    )
    handle = serve.run(graph)

    # The graph sums branch outputs: 0 + 1 + ... + (fanout_degree - 1).
    expected_output = ((0 + fanout_degree - 1) * fanout_degree) / 2
    assert ray.get(handle.predict.remote(0)) == expected_output

    loop = asyncio.get_event_loop()
    throughput_mean_tps, throughput_std_tps = loop.run_until_complete(
        benchmark_throughput_tps(
            handle,
            expected_output,
            duration_secs=throughput_trial_duration_secs,
            num_clients=num_clients,
        ))
    latency_mean_ms, latency_std_ms = loop.run_until_complete(
        benchmark_latency_ms(
            handle,
            expected_output,
            num_requests=num_requests_per_client,
            num_clients=num_clients,
        ))

    print(f"fanout_degree: {fanout_degree}, num_clients: {num_clients}")
    print(f"latency_mean_ms: {latency_mean_ms}, "
          f"latency_std_ms: {latency_std_ms}")
    print(f"throughput_mean_tps: {throughput_mean_tps}, "
          f"throughput_std_tps: {throughput_std_tps}")

    results = {
        "fanout_degree": fanout_degree,
        "init_delay_secs": init_delay_secs,
        "compute_delay_secs": compute_delay_secs,
        "local_test": local_test,
        "perf_metrics": [
            {
                "perf_metric_name": metric_name,
                "perf_metric_value": metric_value,
                "perf_metric_type": metric_type,
            }
            for metric_name, metric_value, metric_type in (
                ("throughput_mean_tps", throughput_mean_tps, "THROUGHPUT"),
                ("throughput_std_tps", throughput_std_tps, "THROUGHPUT"),
                ("latency_mean_ms", latency_mean_ms, "LATENCY"),
                ("latency_std_ms", latency_std_ms, "LATENCY"),
            )
        ],
    }

    save_test_results(results)
def main():
    """Serve cluster fault-tolerance test (Echo class deployment).

    Deploys an "echo" class deployment backed by a persistent checkpoint,
    tears the cluster down, restarts it in the same namespace with no new
    deploy() call, and asserts the endpoint recovers from the checkpoint.
    Writes a success marker to /tmp on completion.
    """
    # Setup local cluster, note this cluster setup is the same for both
    # local and product ray cluster env.
    # Each test uses different ray namespace, thus kv storage key for each
    # checkpoint is different to avoid collision.
    namespace = uuid.uuid4().hex

    # IS_SMOKE_TEST is set by args of releaser's e2e.py
    smoke_test = os.environ.get("IS_SMOKE_TEST", "1")
    if smoke_test == "1":
        checkpoint_path = "file://checkpoint.db"
    else:
        checkpoint_path = "s3://serve-nightly-tests/fault-tolerant-test-checkpoint"  # noqa: E501

    _, cluster = setup_local_single_node_cluster(
        1, checkpoint_path=checkpoint_path, namespace=namespace)

    # Deploy for the first time
    @serve.deployment(name="echo", num_replicas=DEFAULT_NUM_REPLICAS)
    class Echo:
        def __init__(self):
            # BUG FIX: the original body was `return True`; returning a
            # non-None value from __init__ raises TypeError when the replica
            # constructs the class, so the deployment could never start.
            pass

        def __call__(self, request):
            return "hii"

    Echo.deploy()

    # Ensure endpoint is working
    for _ in range(5):
        response = request_with_retries("/echo/", timeout=3)
        assert response.text == "hii"

    logger.info("Initial deployment successful with working endpoint.")

    # Kill current cluster, recover from remote checkpoint and ensure endpoint
    # is still available with expected results
    ray.kill(serve.api._global_client._controller, no_restart=True)
    ray.shutdown()
    cluster.shutdown()
    serve.api._set_global_client(None)

    # Start another ray cluster with same namespace to resume from previous
    # checkpoints with no new deploy() call.
    setup_local_single_node_cluster(
        1, checkpoint_path=checkpoint_path, namespace=namespace)

    for _ in range(5):
        response = request_with_retries("/echo/", timeout=3)
        assert response.text == "hii"

    logger.info("Deployment recovery from s3 checkpoint is successful "
                "with working endpoint.")

    # Delete dangling checkpoints. If script failed before this step, it's up
    # to the TTL policy on s3 to clean up, but won't lead to collision with
    # subsequent tests since each test run in different uuid namespace.
    serve.shutdown()
    ray.shutdown()
    cluster.shutdown()

    # Checkpoints in S3 bucket are moved after 7 days with explicit lifecycle
    # rules. Each checkpoint is ~260 Bytes in size from this test.

    # Save results
    save_test_results(
        {"result": "success"},
        default_output_file="/tmp/serve_cluster_fault_tolerance.json")
def main(
    chain_length: Optional[int],
    init_delay_secs: Optional[int],
    compute_delay_secs: Optional[int],
    num_requests_per_client: Optional[int],
    num_clients: Optional[int],
    throughput_trial_duration_secs: Optional[int],
    local_test: Optional[bool],
):
    """Benchmark a long-chain serve deployment graph.

    Builds a chain of ``chain_length`` nodes, sanity-checks its output,
    measures throughput (tps) and latency (ms) with the given client
    settings, prints both, and saves the metrics.
    """
    if local_test:
        setup_local_single_node_cluster(1, num_cpu_per_node=8)
    else:
        setup_anyscale_cluster()

    graph = test_long_chain_deployment_graph(
        chain_length,
        init_delay_secs=init_delay_secs,
        compute_delay_secs=compute_delay_secs,
    )
    handle = serve.run(graph)

    # Each chain node increments the input once, so 0 -> chain_length.
    assert ray.get(handle.predict.remote(0)) == chain_length

    loop = asyncio.get_event_loop()
    throughput_mean_tps, throughput_std_tps = loop.run_until_complete(
        benchmark_throughput_tps(
            handle,
            chain_length,
            duration_secs=throughput_trial_duration_secs,
            num_clients=num_clients,
        )
    )
    latency_mean_ms, latency_std_ms = loop.run_until_complete(
        benchmark_latency_ms(
            handle,
            chain_length,
            num_requests=num_requests_per_client,
            num_clients=num_clients,
        )
    )

    print(f"chain_length: {chain_length}, num_clients: {num_clients}")
    print(f"latency_mean_ms: {latency_mean_ms}, "
          f"latency_std_ms: {latency_std_ms}")
    print(
        f"throughput_mean_tps: {throughput_mean_tps}, "
        f"throughput_std_tps: {throughput_std_tps}"
    )

    results = {
        "chain_length": chain_length,
        "init_delay_secs": init_delay_secs,
        "compute_delay_secs": compute_delay_secs,
        "local_test": local_test,
        "perf_metrics": [
            {
                "perf_metric_name": metric_name,
                "perf_metric_value": metric_value,
                "perf_metric_type": metric_type,
            }
            for metric_name, metric_value, metric_type in (
                ("throughput_mean_tps", throughput_mean_tps, "THROUGHPUT"),
                ("throughput_std_tps", throughput_std_tps, "THROUGHPUT"),
                ("latency_mean_ms", latency_mean_ms, "LATENCY"),
                ("latency_std_ms", latency_std_ms, "LATENCY"),
            )
        ],
    }

    save_test_results(results)