Esempio n. 1
0
    def test_process_unscheduled_pods(self):
        """Check the GPU counters after ``process_unscheduled_pods`` runs.

        Every item of the fixture is parsed into one mapping, which must
        end up with a single key, ``"unscheduled"``, holding two entries.
        Starting from 10 available and 10 preemptable-available GPUs, the
        counters are expected to read 9 and 8 afterwards.
        """
        raw = self.get_data_test_input("data/dlts_unscheduled_pods.json")
        pod_items = json.loads(raw)

        pod_gauge = watchdog.gen_k8s_pod_gauge()
        container_gauge = watchdog.gen_k8s_container_gauge()
        pods_by_key = collections.defaultdict(list)
        usage = watchdog.VcUsage()

        for item in pod_items:
            # The fifth argument receives a new empty list on each call.
            watchdog.parse_pod_item(item, pod_gauge, container_gauge,
                                    pods_by_key, [], usage)

        self.assertEqual(1, len(pods_by_key))
        self.assertEqual(2, len(pods_by_key["unscheduled"]))

        gpu_info = watchdog.ClusterGPUInfo()
        gpu_info.available = 10
        gpu_info.preemptable_available = 10

        watchdog.process_unscheduled_pods(pods_by_key, gpu_info)

        self.assertEqual(9, gpu_info.available)
        self.assertEqual(8, gpu_info.preemptable_available)
Esempio n. 2
0
    def test_process_dlws_nodes_status_with_unscheduable(self):
        """Check the gauges built from the unschedulable-node fixture.

        A single ``PodInfo`` is registered under 192.168.255.1 before the
        call.  The result must hold five gauges with exactly one sample
        each: the node count, whose sample is labelled
        ``unschedulable="true"``, followed by four GPU gauges whose sample
        carries that host IP.
        """
        host_ip = "192.168.255.1"
        nodes = json.loads(
            self.get_data_test_input(
                "data/dlws_nodes_list_with_unschedulable.json"))

        pod_info = collections.defaultdict(list)
        pod_info[host_ip].append(watchdog.PodInfo("job1", False, 2))
        gauges = watchdog.process_nodes_status(nodes, pod_info,
                                               watchdog.ClusterGPUInfo())

        self.assertEqual(5, len(gauges))

        node_count = gauges[0]
        self.assertEqual("k8s_node_count", node_count.name)
        self.assertEqual(1, len(node_count.samples))
        self.assertEqual("true",
                         node_count.samples[0].labels["unschedulable"])

        # (gauge name, expected sample value), in the order they are returned.
        expected_gpu_gauges = [
            ("k8s_node_gpu_available", 0),
            ("k8s_node_preemptable_gpu_available", 0),
            ("k8s_node_gpu_total", 4),
            ("k8s_node_gpu_allocatable", 0),
        ]
        for gauge, (name, value) in zip(gauges[1:], expected_gpu_gauges):
            self.assertEqual(name, gauge.name)
            # Requiring exactly one sample also guarantees it is not empty.
            self.assertEqual(1, len(gauge.samples), name)
            sample = gauge.samples[0]
            self.assertEqual(value, sample.value, name)
            self.assertEqual(host_ip, sample.labels["host_ip"], name)
Esempio n. 3
0
    def test_process_nodes_status(self):
        """Check that the ``nodes_list.json`` fixture yields non-empty gauges.

        No pod information is supplied (an empty dict is passed); five
        gauges are expected and none of them may be without samples.
        """
        nodes = json.loads(self.get_data_test_input("data/nodes_list.json"))

        gauges = watchdog.process_nodes_status(nodes, {},
                                               watchdog.ClusterGPUInfo())

        self.assertEqual(5, len(gauges))

        for gauge in gauges:
            # assertGreater reports the actual length on failure, and the
            # message tells which gauge came back empty.
            self.assertGreater(len(gauge.samples), 0, gauge.name)
Esempio n. 4
0
    def test_gpu_accounting(self):
        """Check ``gen_vc_metrics`` for three VCs listing 40 P40 GPUs each.

        Recorded usage is 40/31/0 for A/B/C while the cluster info reports
        capacity 120, 29 available and 100 allocatable.  Each of the four
        returned gauges must carry three samples matching its table below.
        """
        quota = {"A": {"P40": 40}, "B": {"P40": 40}, "C": {"P40": 40}}

        usage = watchdog.VcUsage()
        for vc, used in (("A", 40), ("B", 31), ("C", 0)):
            usage.add_used(vc, "P40", used)

        gpu_info = watchdog.ClusterGPUInfo()
        gpu_info.capacity = 120
        gpu_info.available = 29
        gpu_info.allocatable = 100

        metrics = watchdog.gen_vc_metrics(quota, usage, gpu_info)
        total, avail, preemptive_avail, unschedulable = metrics

        def vc_and_gpu(sample):
            # Pull the two labels every table below is keyed by.
            labels = sample.labels
            return labels["vc_name"], labels["gpu_type"]

        # Totals simply echo the configured numbers.
        self.assertEqual(3, len(total.samples))
        for sample in total.samples:
            vc, gpu = vc_and_gpu(sample)
            self.assertEqual(quota[vc][gpu], sample.value)

        want_avail = {"A": {"P40": 0}, "B": {"P40": 1}, "C": {"P40": 27}}
        self.assertEqual(3, len(avail.samples))
        for sample in avail.samples:
            vc, gpu = vc_and_gpu(sample)
            self.assertEqual(want_avail[vc][gpu], sample.value,
                             "vc " + vc + ", gpu " + gpu)

        # Every VC is expected to report the same preemptive figure.
        want_preemptive = {vc: {"P40": 29} for vc in ("A", "B", "C")}
        self.assertEqual(3, len(preemptive_avail.samples))
        for sample in preemptive_avail.samples:
            vc, gpu = vc_and_gpu(sample)
            self.assertEqual(want_preemptive[vc][gpu], sample.value,
                             "vc " + vc)

        want_unschedulable = {
            "A": {"P40": 0},
            "B": {"P40": 8},
            "C": {"P40": 13},
        }
        self.assertEqual(3, len(unschedulable.samples))
        for sample in unschedulable.samples:
            vc, gpu = vc_and_gpu(sample)
            self.assertEqual(want_unschedulable[vc][gpu], sample.value,
                             "vc " + vc)
Esempio n. 5
0
    def test_process_vc_info(self):
        """Check ``gen_vc_metrics`` with preemptable and regular usage mixed.

        Three VCs are configured with four (VC, GPU type) pairs in total.
        ``default`` records both preemptable and regular usage, ``platform``
        only regular usage and ``relevance`` none.  The cluster info reports
        capacity 34, 29 available and 34 allocatable.  Each returned gauge
        must carry four samples matching its table below.
        """
        vc_info = {
            "default": {"P40": 10, "P80": 10},
            "platform": {"P40": 10},
            "relevance": {"P80": 4},
        }

        vc_usage = watchdog.VcUsage()

        vc_usage.add_preemptable_used("default", "P40", 8)
        vc_usage.add_preemptable_used("default", "P80", 2)
        vc_usage.add_used("default", "P40", 2)

        vc_usage.add_used("platform", "P40", 3)

        cluster_gpu_info = watchdog.ClusterGPUInfo()
        cluster_gpu_info.capacity = 34
        cluster_gpu_info.available = 29
        cluster_gpu_info.allocatable = 34
        vc_total, vc_avail, vc_preemptive_avail, vc_unschedulable_gauge = \
                watchdog.gen_vc_metrics(vc_info, vc_usage, cluster_gpu_info)

        # Totals simply echo the configured numbers.
        self.assertEqual(4, len(vc_total.samples))
        for sample in vc_total.samples:
            vc_name = sample.labels["vc_name"]
            gpu_type = sample.labels["gpu_type"]
            self.assertEqual(vc_info[vc_name][gpu_type], sample.value)

        target_vc_avail = {
            "default": {"P40": 8, "P80": 10},
            "platform": {"P40": 7},
            "relevance": {"P80": 4},
        }

        self.assertEqual(4, len(vc_avail.samples))
        for sample in vc_avail.samples:
            vc_name = sample.labels["vc_name"]
            gpu_type = sample.labels["gpu_type"]
            self.assertEqual(target_vc_avail[vc_name][gpu_type], sample.value,
                             "vc " + vc_name + ", gpu " + gpu_type)

        target_vc_preemptive_avail = {
            "default": {"P40": 29, "P80": 29},
            "platform": {"P40": 29},
            "relevance": {"P80": 29},
        }

        self.assertEqual(4, len(vc_preemptive_avail.samples))
        for sample in vc_preemptive_avail.samples:
            vc_name = sample.labels["vc_name"]
            gpu_type = sample.labels["gpu_type"]
            self.assertEqual(target_vc_preemptive_avail[vc_name][gpu_type],
                             sample.value, "vc " + vc_name)

        target_vc_unschedulable = {
            "default": {"P40": 0, "P80": 0},
            "platform": {"P40": 0},
            "relevance": {"P80": 0},
        }

        # Without this count check the loop below would pass vacuously if
        # the gauge came back empty.
        self.assertEqual(4, len(vc_unschedulable_gauge.samples))
        for sample in vc_unschedulable_gauge.samples:
            vc_name = sample.labels["vc_name"]
            gpu_type = sample.labels["gpu_type"]
            self.assertEqual(target_vc_unschedulable[vc_name][gpu_type],
                             sample.value,
                             "vc " + vc_name + ", gpu " + gpu_type)
Esempio n. 6
0
    def test_process_vc_info_real_case(self):
        """Check ``gen_vc_metrics`` with the quantus/relevance2 figures.

        ``relevance2`` records both preemptable and regular usage,
        ``quantus`` only regular usage and ``relevance2-inf`` none.  The
        cluster info reports capacity 424, 68 available and 423
        allocatable.  Each of the four returned gauges must carry three
        samples matching its table below.
        """
        quota = {
            "quantus": {"P40": 150},
            "relevance2": {"P40": 234},
            "relevance2-inf": {"P40": 40},
        }

        usage = watchdog.VcUsage()
        usage.add_preemptable_used("relevance2", "P40", 24)
        for vc, used in (("relevance2", 231), ("quantus", 125)):
            usage.add_used(vc, "P40", used)

        gpu_info = watchdog.ClusterGPUInfo()
        gpu_info.capacity = 424
        gpu_info.available = 68
        gpu_info.allocatable = 423

        metrics = watchdog.gen_vc_metrics(quota, usage, gpu_info)
        total, avail, preemptive_avail, unschedulable = metrics

        def vc_and_gpu(sample):
            # Pull the two labels every table below is keyed by.
            labels = sample.labels
            return labels["vc_name"], labels["gpu_type"]

        # Totals simply echo the configured numbers.
        self.assertEqual(3, len(total.samples))
        for sample in total.samples:
            vc, gpu = vc_and_gpu(sample)
            self.assertEqual(quota[vc][gpu], sample.value)

        want_avail = {
            "quantus": {"P40": 25},
            "relevance2": {"P40": 2},
            "relevance2-inf": {"P40": 40},
        }
        self.assertEqual(3, len(avail.samples))
        for sample in avail.samples:
            vc, gpu = vc_and_gpu(sample)
            self.assertEqual(want_avail[vc][gpu], sample.value,
                             "vc " + vc + ", gpu " + gpu)

        # Every VC is expected to report the same preemptive figure.
        want_preemptive = {vc: {"P40": 68} for vc in quota}
        self.assertEqual(3, len(preemptive_avail.samples))
        for sample in preemptive_avail.samples:
            vc, gpu = vc_and_gpu(sample)
            self.assertEqual(want_preemptive[vc][gpu], sample.value,
                             "vc " + vc)

        want_unschedulable = {
            "quantus": {"P40": 0},
            "relevance2": {"P40": 1},
            "relevance2-inf": {"P40": 0},
        }
        self.assertEqual(3, len(unschedulable.samples))
        for sample in unschedulable.samples:
            vc, gpu = vc_and_gpu(sample)
            self.assertEqual(want_unschedulable[vc][gpu], sample.value,
                             "vc " + vc + ", gpu " + gpu)