def test_gcs_client_reconnect(ray_start_regular, auto_reconnect):
    """Verify GcsClient behavior across a GCS server restart.

    With auto-reconnect enabled the client should transparently retry and
    still return the previously stored value after the restart; with
    retries disabled (``nums_reconnect_retry=0``) the lookup is expected
    to raise instead.
    """
    redis_client = ray.worker.global_worker.redis_client
    channel = gcs_utils.GcsChannel(redis_client=redis_client)
    gcs_client = (
        gcs_utils.GcsClient(channel)
        if auto_reconnect
        else gcs_utils.GcsClient(channel, nums_reconnect_retry=0)
    )
    gcs_client.internal_kv_put(b"a", b"b", True, None)
    # Bug fix: this was a bare `... == b"b"` comparison with no `assert`,
    # so a wrong stored value could never fail the test.
    assert gcs_client.internal_kv_get(b"a", None) == b"b"

    ray.worker._global_node.kill_gcs_server()
    ray.worker._global_node.start_gcs_server()

    if auto_reconnect is False:
        with pytest.raises(Exception):
            gcs_client.internal_kv_get(b"a", None)
    else:
        assert gcs_client.internal_kv_get(b"a", None) == b"b"
def test_gcs_aio_client_reconnect(
    ray_start_regular_with_external_redis, auto_reconnect
):
    """Check GcsAioClient behavior while the GCS server is down.

    The KV lookup is issued from a worker thread while GCS is killed; the
    server is restarted 5 seconds later. With 20 reconnect retries the
    call should eventually succeed; with 0 retries it should raise.
    """
    gcs_address = ray._private.worker.global_worker.gcs_client.address
    gcs_client = gcs_utils.GcsClient(address=gcs_address)
    gcs_client.internal_kv_put(b"a", b"b", True, None)
    assert gcs_client.internal_kv_get(b"a", None) == b"b"

    # Mutable cell so the worker thread can report success back.
    passed = [False]

    async def async_kv_get():
        gcs_aio_client = gcs_utils.GcsAioClient(
            address=gcs_address,
            nums_reconnect_retry=20 if auto_reconnect else 0,
        )
        if auto_reconnect:
            assert await gcs_aio_client.internal_kv_get(b"a", None) == b"b"
        else:
            with pytest.raises(Exception):
                await gcs_aio_client.internal_kv_get(b"a", None)
        return True

    def kv_get():
        import asyncio

        loop = asyncio.new_event_loop()
        asyncio.set_event_loop(loop)
        passed[0] = loop.run_until_complete(async_kv_get())

    ray._private.worker._global_node.kill_gcs_server()
    worker_thread = threading.Thread(target=kv_get)
    worker_thread.start()
    sleep(5)
    ray._private.worker._global_node.start_gcs_server()
    worker_thread.join()
    assert passed[0]
def test_gcs_client_reconnect(
    ray_start_regular_with_external_redis, auto_reconnect
):
    """Check the synchronous GcsClient while the GCS server is down.

    The lookup runs on a worker thread during the outage; GCS is
    restarted 5 seconds later. With retries the call should succeed,
    without retries it should raise.
    """
    gcs_address = ray._private.worker.global_worker.gcs_client.address
    retry_count = 20 if auto_reconnect else 0
    gcs_client = gcs_utils.GcsClient(
        address=gcs_address, nums_reconnect_retry=retry_count
    )
    gcs_client.internal_kv_put(b"a", b"b", True, None)
    assert gcs_client.internal_kv_get(b"a", None) == b"b"

    # Mutable cell so the worker thread can report success back.
    passed = [False]

    def kv_get():
        if auto_reconnect:
            assert gcs_client.internal_kv_get(b"a", None) == b"b"
        else:
            with pytest.raises(Exception):
                gcs_client.internal_kv_get(b"a", None)
        passed[0] = True

    ray._private.worker._global_node.kill_gcs_server()
    worker_thread = threading.Thread(target=kv_get)
    worker_thread.start()
    sleep(5)
    ray._private.worker._global_node.start_gcs_server()
    worker_thread.join()
    assert passed[0]
def test_gcs_client_reconnect(
    ray_start_regular_with_external_redis, auto_reconnect
):
    """Verify GcsClient behavior across a GCS server restart.

    With auto-reconnect the stored value must still be readable after
    the restart. The no-retry error check is deliberately disabled
    (see comment below) because it can flake.
    """
    gcs_address = ray.worker.global_worker.gcs_client.address
    gcs_client = (
        gcs_utils.GcsClient(address=gcs_address)
        if auto_reconnect
        else gcs_utils.GcsClient(address=gcs_address, nums_reconnect_retry=0)
    )
    gcs_client.internal_kv_put(b"a", b"b", True, None)
    # Bug fix: this was a bare `... == b"b"` comparison with no `assert`,
    # so a wrong stored value could never fail the test.
    assert gcs_client.internal_kv_get(b"a", None) == b"b"

    ray.worker._global_node.kill_gcs_server()
    ray.worker._global_node.start_gcs_server()

    if auto_reconnect is False:
        # This may flake: when GCS server restarted quickly, there would be no
        # connection error when calling internal_kv_get().
        # with pytest.raises(Exception):
        #     gcs_client.internal_kv_get(b"a", None)
        pass
    else:
        assert gcs_client.internal_kv_get(b"a", None) == b"b"
def get_ray_status_output(address):
    """Fetch autoscaler debug state from GCS and split the rendered report.

    Params:
        address: GCS address used to construct the client.

    Returns:
        Dict with "usage" (text between "Usage:" and "Demands:") and
        "demand" (text after "Demands:"), both stripped of surrounding
        newlines and spaces.
    """
    gcs_client = gcs_utils.GcsClient(address=address)
    internal_kv._initialize_internal_kv(gcs_client)
    status = internal_kv._internal_kv_get(DEBUG_AUTOSCALING_STATUS)
    error = internal_kv._internal_kv_get(DEBUG_AUTOSCALING_ERROR)
    # Fix: debug_status() was called twice, rendering the full report
    # redundantly for each dict entry; render once and split.
    report = debug_status(status, error)
    sections = report.split("Demands:")
    return {
        "demand": sections[1].strip("\n").strip(" "),
        "usage": sections[0].split("Usage:")[1].strip("\n").strip(" "),
    }
def test_usage_lib_get_total_num_running_jobs_to_report(
    ray_start_cluster, reset_lib_usage
):
    """The reported running-job count must only include live jobs."""
    cluster = ray_start_cluster
    cluster.add_node(num_cpus=1)
    gcs_client = gcs_utils.GcsClient(address=cluster.gcs_address)

    def running_jobs():
        return ray_usage_lib.get_total_num_running_jobs_to_report(gcs_client)

    # No driver connected yet.
    assert running_jobs() == 0

    ray.init(address=cluster.address)
    assert running_jobs() == 1
    ray.shutdown()

    ray.init(address=cluster.address)
    # Make sure the previously finished job is not counted.
    assert running_jobs() == 1
    ray.shutdown()
def get_ray_status_output(address):
    """Fetch autoscaler debug state and split the rendered report.

    Builds a GCS client either directly from the address or via a Redis
    client, depending on the bootstrap mode.

    Params:
        address: GCS (or Redis) address.

    Returns:
        Dict with "usage" (text between "Usage:" and "Demands:") and
        "demand" (text after "Demands:"), both stripped of surrounding
        newlines and spaces.
    """
    if gcs_utils.use_gcs_for_bootstrap():
        gcs_client = gcs_utils.GcsClient(address=address)
    else:
        redis_client = ray._private.services.create_redis_client(address, "")
        gcs_client = gcs_utils.GcsClient.create_from_redis(redis_client)
    internal_kv._initialize_internal_kv(gcs_client)
    status = internal_kv._internal_kv_get(DEBUG_AUTOSCALING_STATUS)
    error = internal_kv._internal_kv_get(DEBUG_AUTOSCALING_ERROR)
    # Fix: debug_status() was called twice, rendering the full report
    # redundantly for each dict entry; render once and split.
    report = debug_status(status, error)
    sections = report.split("Demands:")
    return {
        "demand": sections[1].strip("\n").strip(" "),
        "usage": sections[0].split("Usage:")[1].strip("\n").strip(" "),
    }
def test_kv_timeout(ray_start_regular):
    """Every internal KV RPC must honor its timeout while GCS is down."""
    gcs_address = ray._private.worker.global_worker.gcs_client.address
    gcs_client = gcs_utils.GcsClient(
        address=gcs_address, nums_reconnect_retry=0
    )

    assert gcs_client.internal_kv_put(b"A", b"", False, b"") == 1

    with stop_gcs_server():
        # Each KV operation, issued with a 2s timeout against a stopped
        # server, must surface a gRPC deadline error.
        timed_calls = [
            lambda: gcs_client.internal_kv_put(
                b"A", b"B", False, b"NS", timeout=2
            ),
            lambda: gcs_client.internal_kv_get(b"A", b"NS", timeout=2),
            lambda: gcs_client.internal_kv_keys(b"A", b"NS", timeout=2),
            lambda: gcs_client.internal_kv_del(b"A", True, b"NS", timeout=2),
        ]
        for call in timed_calls:
            with pytest.raises(grpc.RpcError, match="Deadline Exceeded"):
                call()
def test_kv_basic(ray_start_regular):
    """End-to-end exercise of the synchronous internal KV API:
    put/get with and without overwrite, prefix listing, and deletion
    (single key and del_by_prefix), including a non-existent namespace.
    """
    address = ray._private.worker.global_worker.gcs_client.address
    kv = gcs_utils.GcsClient(address=address, nums_reconnect_retry=0)

    # Missing key reads as None; first put reports 1 new key added.
    assert kv.internal_kv_get(b"A", b"NS") is None
    assert kv.internal_kv_put(b"A", b"B", False, b"NS") == 1
    assert kv.internal_kv_get(b"A", b"NS") == b"B"

    # Without overwrite the existing value is kept; with overwrite it is
    # replaced (and 0 is returned since no new key was added).
    assert kv.internal_kv_put(b"A", b"C", False, b"NS") == 0
    assert kv.internal_kv_get(b"A", b"NS") == b"B"
    assert kv.internal_kv_put(b"A", b"C", True, b"NS") == 0
    assert kv.internal_kv_get(b"A", b"NS") == b"C"

    # Prefix listing.
    assert kv.internal_kv_put(b"AA", b"B", False, b"NS") == 1
    assert kv.internal_kv_put(b"AB", b"B", False, b"NS") == 1
    assert set(kv.internal_kv_keys(b"A", b"NS")) == {b"A", b"AA", b"AB"}

    # Single-key delete, then delete-by-prefix; wrong namespace is a no-op.
    assert kv.internal_kv_del(b"A", False, b"NS") == 1
    assert set(kv.internal_kv_keys(b"A", b"NS")) == {b"AA", b"AB"}
    assert kv.internal_kv_keys(b"A", b"NSS") == []
    assert kv.internal_kv_del(b"A", True, b"NS") == 2
    assert kv.internal_kv_keys(b"A", b"NS") == []
    assert kv.internal_kv_del(b"A", False, b"NSS") == 0
def generate_report_data(
    cluster_config_to_report: ClusterConfigToReport,
    total_success: int,
    total_failed: int,
    seq_number: int,
    gcs_address: str,
) -> UsageStatsToReport:
    """Generate the report data.

    Params:
        cluster_config_to_report: The cluster (autoscaler) config
            generated by `get_cluster_config_to_report`.
        total_success: The total number of successful report
            for the lifetime of the cluster.
        total_failed: The total number of failed report
            for the lifetime of the cluster.
        seq_number: The sequence number that's incremented whenever
            a new report is sent.
        gcs_address: the address of gcs to get data to report.

    Returns:
        UsageStats
    """
    gcs_client = gcs_utils.GcsClient(
        address=gcs_address, nums_reconnect_retry=20
    )
    # Short aliases keep the constructor call below readable.
    meta = get_cluster_metadata(gcs_client)
    status = get_cluster_status_to_report(gcs_client)
    config = cluster_config_to_report
    return UsageStatsToReport(
        ray_version=meta["ray_version"],
        python_version=meta["python_version"],
        schema_version=meta["schema_version"],
        source=meta["source"],
        session_id=meta["session_id"],
        git_commit=meta["git_commit"],
        os=meta["os"],
        collect_timestamp_ms=int(time.time() * 1000),
        session_start_timestamp_ms=meta["session_start_timestamp_ms"],
        cloud_provider=config.cloud_provider,
        min_workers=config.min_workers,
        max_workers=config.max_workers,
        head_node_instance_type=config.head_node_instance_type,
        worker_node_instance_types=config.worker_node_instance_types,
        total_num_cpus=status.total_num_cpus,
        total_num_gpus=status.total_num_gpus,
        total_memory_gb=status.total_memory_gb,
        total_object_store_memory_gb=status.total_object_store_memory_gb,
        library_usages=get_library_usages_to_report(gcs_client),
        total_success=total_success,
        total_failed=total_failed,
        seq_number=seq_number,
        extra_usage_tags=_parse_extra_usage_tags(),
        total_num_nodes=get_total_num_nodes_to_report(gcs_client),
        total_num_running_jobs=get_total_num_running_jobs_to_report(
            gcs_client
        ),
    )