def test_process_unscheduled_pods(self):
    """Unscheduled pods from the fixture reduce the cluster GPU counters.

    Every pod in ``data/dlts_unscheduled_pods.json`` is parsed; the test
    expects a single ``"unscheduled"`` bucket holding two pods.  Starting
    from 10 available and 10 preemptable-available GPUs, calling
    ``process_unscheduled_pods`` is expected to leave 9 and 8.
    """
    raw_fixture = self.get_data_test_input("data/dlts_unscheduled_pods.json")
    pod_items = json.loads(raw_fixture)

    pod_gauge = watchdog.gen_k8s_pod_gauge()
    container_gauge = watchdog.gen_k8s_container_gauge()
    pods_by_key = collections.defaultdict(list)
    usage = watchdog.VcUsage()

    for pod_item in pod_items:
        watchdog.parse_pod_item(pod_item, pod_gauge, container_gauge,
                                pods_by_key, [], usage)

    # Only one bucket should have been created, and it holds both pods.
    self.assertEqual(1, len(pods_by_key))
    self.assertEqual(2, len(pods_by_key["unscheduled"]))

    gpu_info = watchdog.ClusterGPUInfo()
    gpu_info.available = 10
    gpu_info.preemptable_available = 10

    watchdog.process_unscheduled_pods(pods_by_key, gpu_info)

    self.assertEqual(9, gpu_info.available)
    self.assertEqual(8, gpu_info.preemptable_available)
def test_process_dlws_nodes_status_with_unscheduable(self):
    """Check the node gauges produced for the unschedulable-node fixture.

    ``process_nodes_status`` is fed ``data/dlws_nodes_list_with_unschedulable.json``
    together with one ``PodInfo("job1", False, 2)`` registered under host
    ``192.168.255.1``.  Five gauges are expected, each with exactly one
    sample: the node count (labelled ``unschedulable="true"``) followed by
    four GPU gauges whose expected names and values are listed below.
    """
    nodes = json.loads(
        self.get_data_test_input(
            "data/dlws_nodes_list_with_unschedulable.json"))

    host_ip = "192.168.255.1"
    pods_by_host = collections.defaultdict(list)
    pods_by_host[host_ip].append(watchdog.PodInfo("job1", False, 2))

    gauges = watchdog.process_nodes_status(nodes, pods_by_host,
                                           watchdog.ClusterGPUInfo())
    self.assertEqual(5, len(gauges))

    node_count = gauges[0]
    self.assertEqual("k8s_node_count", node_count.name)
    self.assertEqual(1, len(node_count.samples))
    self.assertEqual("true", node_count.samples[0].labels["unschedulable"])

    # (gauge name, expected value of its single sample), in gauge order.
    expected_gpu_gauges = (
        ("k8s_node_gpu_available", 0),
        ("k8s_node_preemptable_gpu_available", 0),
        ("k8s_node_gpu_total", 4),
        ("k8s_node_gpu_allocatable", 0),
    )
    for gpu_gauge, (name, value) in zip(gauges[1:], expected_gpu_gauges):
        self.assertEqual(name, gpu_gauge.name)
        self.assertEqual(1, len(gpu_gauge.samples))
        self.assertEqual(value, gpu_gauge.samples[0].value)

    for gauge in gauges:
        self.assertGreater(len(gauge.samples), 0)

    # Every gauge except the node count carries the host_ip label.
    for gpu_gauge in gauges[1:]:
        self.assertEqual(host_ip, gpu_gauge.samples[0].labels["host_ip"])
def test_process_nodes_status(self):
    """``process_nodes_status`` yields five non-empty gauges for the fixture.

    Uses ``data/nodes_list.json`` with no pod information (an empty dict).
    """
    nodes = json.loads(self.get_data_test_input("data/nodes_list.json"))
    gpu_info = watchdog.ClusterGPUInfo()

    gauges = watchdog.process_nodes_status(nodes, {}, gpu_info)

    self.assertEqual(5, len(gauges))
    for gauge in gauges:
        self.assertGreater(len(gauge.samples), 0)
def test_gpu_accounting(self):
    """Check ``gen_vc_metrics`` output for three VCs with a 40 x P40 quota each.

    Usage is A=40, B=31, C=0 against a cluster with capacity 120,
    available 29 and allocatable 100.  The expected per-VC values for the
    total, available, preemptive-available and unschedulable gauges are
    spelled out in the tables below.
    """
    vc_info = {name: {"P40": 40} for name in ("A", "B", "C")}

    vc_usage = watchdog.VcUsage()
    for name, used in (("A", 40), ("B", 31), ("C", 0)):
        vc_usage.add_used(name, "P40", used)

    cluster_gpu_info = watchdog.ClusterGPUInfo()
    cluster_gpu_info.capacity = 120
    cluster_gpu_info.available = 29
    cluster_gpu_info.allocatable = 100

    metrics = watchdog.gen_vc_metrics(vc_info, vc_usage, cluster_gpu_info)
    total_gauge, avail_gauge, preemptive_gauge, unschedulable_gauge = metrics

    def vc_and_gpu(sample):
        # Pull the two labels that key every expectation table.
        labels = sample.labels
        return labels["vc_name"], labels["gpu_type"]

    # Total must echo the configured quota.
    self.assertEqual(3, len(total_gauge.samples))
    for sample in total_gauge.samples:
        vc, gpu = vc_and_gpu(sample)
        self.assertEqual(vc_info[vc][gpu], sample.value)

    expected_avail = {"A": {"P40": 0}, "B": {"P40": 1}, "C": {"P40": 27}}
    self.assertEqual(3, len(avail_gauge.samples))
    for sample in avail_gauge.samples:
        vc, gpu = vc_and_gpu(sample)
        self.assertEqual(expected_avail[vc][gpu], sample.value,
                         "vc " + vc + ", gpu " + gpu)

    expected_preemptive = {
        "A": {"P40": 29},
        "B": {"P40": 29},
        "C": {"P40": 29},
    }
    self.assertEqual(3, len(preemptive_gauge.samples))
    for sample in preemptive_gauge.samples:
        vc, gpu = vc_and_gpu(sample)
        self.assertEqual(expected_preemptive[vc][gpu], sample.value,
                         "vc " + vc)

    expected_unschedulable = {
        "A": {"P40": 0},
        "B": {"P40": 8},
        "C": {"P40": 13},
    }
    self.assertEqual(3, len(unschedulable_gauge.samples))
    for sample in unschedulable_gauge.samples:
        vc, gpu = vc_and_gpu(sample)
        self.assertEqual(expected_unschedulable[vc][gpu], sample.value,
                         "vc " + vc)
def test_process_vc_info(self):
    """Check ``gen_vc_metrics`` with mixed GPU types and preemptable usage.

    Three VCs share P40/P80 quotas; ``default`` has both preemptable and
    regular usage recorded, ``platform`` has regular usage only.  The
    cluster reports capacity 34, available 29 and allocatable 34.  The
    expected per-VC, per-GPU-type values are listed in the tables below.
    """
    vc_info = {
        "default": {"P40": 10, "P80": 10},
        "platform": {"P40": 10},
        "relevance": {"P80": 4},
    }

    vc_usage = watchdog.VcUsage()
    vc_usage.add_preemptable_used("default", "P40", 8)
    vc_usage.add_preemptable_used("default", "P80", 2)
    vc_usage.add_used("default", "P40", 2)
    vc_usage.add_used("platform", "P40", 3)

    cluster_gpu_info = watchdog.ClusterGPUInfo()
    cluster_gpu_info.capacity = 34
    cluster_gpu_info.available = 29
    cluster_gpu_info.allocatable = 34

    metrics = watchdog.gen_vc_metrics(vc_info, vc_usage, cluster_gpu_info)
    total_gauge, avail_gauge, preemptive_gauge, unschedulable_gauge = metrics

    def vc_and_gpu(sample):
        # Pull the two labels that key every expectation table.
        labels = sample.labels
        return labels["vc_name"], labels["gpu_type"]

    # Total must echo the configured quota.
    self.assertEqual(4, len(total_gauge.samples))
    for sample in total_gauge.samples:
        vc, gpu = vc_and_gpu(sample)
        self.assertEqual(vc_info[vc][gpu], sample.value)

    expected_avail = {
        "default": {"P40": 8, "P80": 10},
        "platform": {"P40": 7},
        "relevance": {"P80": 4},
    }
    self.assertEqual(4, len(avail_gauge.samples))
    for sample in avail_gauge.samples:
        vc, gpu = vc_and_gpu(sample)
        self.assertEqual(expected_avail[vc][gpu], sample.value,
                         "vc " + vc + ", gpu " + gpu)

    expected_preemptive = {
        "default": {"P40": 29, "P80": 29},
        "platform": {"P40": 29},
        "relevance": {"P80": 29},
    }
    self.assertEqual(4, len(preemptive_gauge.samples))
    for sample in preemptive_gauge.samples:
        vc, gpu = vc_and_gpu(sample)
        self.assertEqual(expected_preemptive[vc][gpu], sample.value,
                         "vc " + vc)

    expected_unschedulable = {
        "default": {"P40": 0, "P80": 0},
        "platform": {"P40": 0},
        "relevance": {"P80": 0},
    }
    # NOTE(review): the sample count of this gauge is not asserted here,
    # so an empty gauge would pass this loop -- confirm whether a count
    # check belongs here.
    for sample in unschedulable_gauge.samples:
        vc, gpu = vc_and_gpu(sample)
        self.assertEqual(expected_unschedulable[vc][gpu], sample.value,
                         "vc " + vc + ", gpu " + gpu)
def test_process_vc_info_real_case(self):
    """Check ``gen_vc_metrics`` against a larger three-VC P40 configuration.

    Quotas are quantus=150, relevance2=234, relevance2-inf=40.  Usage is
    relevance2: 24 preemptable + 231 regular, quantus: 125 regular.  The
    cluster reports capacity 424, available 68 and allocatable 423.  The
    expected per-VC values are listed in the tables below.
    """
    vc_info = {
        "quantus": {"P40": 150},
        "relevance2": {"P40": 234},
        "relevance2-inf": {"P40": 40},
    }

    vc_usage = watchdog.VcUsage()
    vc_usage.add_preemptable_used("relevance2", "P40", 24)
    vc_usage.add_used("relevance2", "P40", 231)
    vc_usage.add_used("quantus", "P40", 125)

    cluster_gpu_info = watchdog.ClusterGPUInfo()
    cluster_gpu_info.capacity = 424
    cluster_gpu_info.available = 68
    cluster_gpu_info.allocatable = 423

    metrics = watchdog.gen_vc_metrics(vc_info, vc_usage, cluster_gpu_info)
    total_gauge, avail_gauge, preemptive_gauge, unschedulable_gauge = metrics

    def vc_and_gpu(sample):
        # Pull the two labels that key every expectation table.
        labels = sample.labels
        return labels["vc_name"], labels["gpu_type"]

    # Total must echo the configured quota.
    self.assertEqual(3, len(total_gauge.samples))
    for sample in total_gauge.samples:
        vc, gpu = vc_and_gpu(sample)
        self.assertEqual(vc_info[vc][gpu], sample.value)

    expected_avail = {
        "quantus": {"P40": 25},
        "relevance2": {"P40": 2},
        "relevance2-inf": {"P40": 40},
    }
    self.assertEqual(3, len(avail_gauge.samples))
    for sample in avail_gauge.samples:
        vc, gpu = vc_and_gpu(sample)
        self.assertEqual(expected_avail[vc][gpu], sample.value,
                         "vc " + vc + ", gpu " + gpu)

    expected_preemptive = {
        "quantus": {"P40": 68},
        "relevance2": {"P40": 68},
        "relevance2-inf": {"P40": 68},
    }
    self.assertEqual(3, len(preemptive_gauge.samples))
    for sample in preemptive_gauge.samples:
        vc, gpu = vc_and_gpu(sample)
        self.assertEqual(expected_preemptive[vc][gpu], sample.value,
                         "vc " + vc)

    expected_unschedulable = {
        "quantus": {"P40": 0},
        "relevance2": {"P40": 1},
        "relevance2-inf": {"P40": 0},
    }
    self.assertEqual(3, len(unschedulable_gauge.samples))
    for sample in unschedulable_gauge.samples:
        vc, gpu = vc_and_gpu(sample)
        self.assertEqual(expected_unschedulable[vc][gpu], sample.value,
                         "vc " + vc + ", gpu " + gpu)