def test_case_stats_exist():
    """Return True when the basic node and raylet CPU/memory stats exist.

    Only the metric names returned by ``fetch_prometheus(prom_addresses)``
    are inspected; the other two returned values are ignored.
    """
    _, metric_names, _ = fetch_prometheus(prom_addresses)
    required_stats = (
        "ray_node_cpu",
        "ray_node_mem",
        "ray_raylet_cpu",
        "ray_raylet_mem",
    )
    return all(stat in metric_names for stat in required_stats)
def test_cases():
    """Assert that the expected components and metrics are exported.

    Checks, in order: which components are reported per entry of the
    components dict, that the user-defined metric names are present, that
    every name in ``_METRICS`` is present, and that the recorded counter and
    histogram values match what the test is expected to have recorded.

    Raises:
        AssertionError: if any expectation is not met.
        IndexError: if a sample that is looked up by name is missing.
    """
    components_dict, metric_names, metric_samples = fetch_prometheus(
        prom_addresses)
    per_node_components = list(components_dict.values())

    # Raylet should be on every node.
    assert all("raylet" in components for components in per_node_components)
    # GCS server should be on at least one node.
    assert any(
        "gcs_server" in components for components in per_node_components)
    # Core worker should be on at least one node.
    assert any(
        "core_worker" in components for components in per_node_components)

    # Make sure our user-defined metrics exist (substring match, since the
    # exported names may be longer than the short names used here).
    user_metric_names = (
        "test_counter",
        "test_histogram",
        "test_driver_counter",
    )
    for user_metric in user_metric_names:
        assert any(user_metric in full_name for full_name in metric_names)

    # Make sure metrics are recorded.
    for metric in _METRICS:
        assert metric in metric_names, \
            f"metric {metric} not in {metric_names}"

    # Make sure the numeric values are correct.
    counter_matches = [
        sample for sample in metric_samples if "test_counter" in sample.name
    ]
    assert counter_matches[0].value == 2.0

    driver_counter_matches = [
        sample for sample in metric_samples
        if "test_driver_counter" in sample.name
    ]
    assert driver_counter_matches[0].value == 1.0

    histogram_matches = [
        sample for sample in metric_samples if "test_histogram" in sample.name
    ]
    buckets = {}
    for sample in histogram_matches:
        if "_bucket" in sample.name:
            buckets[sample.labels["le"]] = sample.value

    # We recorded value 1.5 for the histogram. In the Prometheus data model
    # the histogram is cumulative, so we expect the count to appear in the
    # <=1.6 and <=+Inf buckets but not in the <=0.1 bucket.
    assert buckets == {"0.1": 0.0, "1.6": 1.0, "+Inf": 1.0}

    # Look up both aggregate samples before asserting on either of them.
    count_matches = [
        sample for sample in histogram_matches if "_count" in sample.name
    ]
    hist_count = count_matches[0].value
    sum_matches = [
        sample for sample in histogram_matches if "_sum" in sample.name
    ]
    hist_sum = sum_matches[0].value
    assert hist_count == 1
    assert hist_sum == 1.5
def test_case_ip_correct():
    """Return True when the exported raylet pid matches the local raylet.

    Despite the name, this compares process ids rather than IP addresses:
    the ``pid`` label of the first ``ray_raylet_cpu`` sample is compared, as
    a string, with the pid of the first raylet process registered on
    ``ray.worker._global_node``. If no such sample exists the result is
    False.
    """
    _, _, metric_samples = fetch_prometheus(prom_addresses)
    raylet_processes = ray.worker._global_node.all_processes[
        ray_constants.PROCESS_TYPE_RAYLET]
    expected_pid = raylet_processes[0].process.pid

    # Take the pid recorded in the tag of the first matching sample, or None
    # when the raylet CPU metric has not been exported.
    reported_pid = next(
        (sample.labels["pid"]
         for sample in metric_samples
         if sample.name == "ray_raylet_cpu"),
        None,
    )
    return str(expected_pid) == str(reported_pid)
def test_case_stats_exist():
    """Return True when all expected node and raylet stats exist.

    Covers CPU and memory for the node and the raylet, plus the node's disk
    and network metrics. Only the metric names returned by
    ``fetch_prometheus(prom_addresses)`` are inspected.
    """
    _, metric_names, _ = fetch_prometheus(prom_addresses)
    required_stats = (
        "ray_node_cpu",
        "ray_node_mem",
        "ray_raylet_cpu",
        "ray_raylet_mem",
        "ray_node_disk_usage",
        "ray_node_disk_utilization_percentage",
        "ray_node_network_sent",
        "ray_node_network_received",
        "ray_node_network_send_speed",
        "ray_node_network_receive_speed",
    )
    return all(stat in metric_names for stat in required_stats)