Example #1
0
    def test_convert_to_metrics_with_no_zombie_info_BUGFIX(self):
        """External-process metric must be emitted even without zombie info.

        The same single-GPU status (pids 22, 33 and 44) is converted twice:
        once with an empty ``zombie_info`` and once with ``zombie_info`` set to
        ``None``. In both runs no zombie container may be reported, and pid 44
        (the only pid without a container mapping) must show up as the single
        external-process sample on minor "0".
        """
        status = nvidia.NvidiaGpuStatus(
            20, 21, [22, 33, 44], nvidia.EccError(), "0", "GPU-uuid0", 40.0)
        gpu_info = nvidia.construct_gpu_info([status])

        # only 44 is external process
        pid_to_cid_mapping = {33: "def", 22: "ghi"}

        # First the empty collection, then None: both must behave the same.
        for zombie_info in ([], None):
            metrics = GpuCollector.convert_nvidia_gpu_info_to_metrics(
                gpu_info, zombie_info,
                self.make_pid_to_cid_fn(pid_to_cid_mapping), 20 * 1024)

            # Full-arity unpack on purpose: it also pins the number of
            # metrics returned by the conversion.
            (_, _, _, _, external_process, zombie_container,
             _, _, _) = metrics

            self.assertEqual(0, len(zombie_container.samples))

            external_samples = external_process.samples
            self.assertEqual(1, len(external_samples))
            self.assertEqual("0", external_samples[0].labels["minor_number"])
            self.assertEqual("44", external_samples[0].labels["pid"])
Example #2
0
    def test_convert_to_metrics_with_real_id_BUGFIX(self):
        """Zombie detection must work when pids map to full-length container ids.

        ``zombie_info`` holds the 12-character id "ce5de12d6275", while the
        pid-to-container mapping returns the full 64-character id that starts
        with those same 12 characters. The GPU used by pid 22 is expected to be
        reported exactly once in the zombie-container metric, labelled with the
        short id.
        """
        full_container_id = (
            "ce5de12d6275dc05c9ec5b7f58484f075f4775d8f54f6a4be3dc1439344df356")

        status = nvidia.NvidiaGpuStatus(20, 21, [22], nvidia.EccError(), "0",
                                        "GPU-uuid0", 50.0)
        gpu_info = nvidia.construct_gpu_info([status])

        # Short form of full_container_id, as listed in the zombie set.
        zombie_info = {"ce5de12d6275"}
        pid_to_cid_mapping = {22: full_container_id}

        metrics = GpuCollector.convert_to_metrics(
            gpu_info, zombie_info, self.make_pid_to_cid_fn(pid_to_cid_mapping),
            20 * 1024)

        # Full-arity unpack on purpose: it also pins the number of metrics
        # returned by the conversion. Only the zombie metric is inspected.
        (_, _, _, _, _, zombie_container, _, _) = metrics

        zombie_samples = zombie_container.samples
        self.assertEqual(1, len(zombie_samples))

        zombie_labels = zombie_samples[0].labels
        self.assertEqual("0", zombie_labels["minor_number"])
        self.assertEqual("ce5de12d6275", zombie_labels["container_id"])
Example #3
0
    def test_convert_to_metrics_with_no_zombie_info_BUGFIX(self):
        """External-process metric must be emitted even without zombie info.

        A single GPU keyed by minor "0" (pids 22, 33 and 44) is converted
        twice: once with an empty ``zombie_info`` and once with ``None``. Both
        runs must report no zombie container and exactly one external-process
        sample, for pid 44 on minor "0".
        """
        status = nvidia.NvidiaGpuStatus(20, 21, [22, 33, 44],
                                        nvidia.EccError())
        gpu_info = {"0": status}

        # only 44 is external process
        pid_to_cid_mapping = {33: "def", 22: "ghi"}

        # First the empty collection, then None: both must behave the same.
        for zombie_info in ([], None):
            metrics = GpuCollector.convert_to_metrics(
                gpu_info, zombie_info,
                self.make_pid_to_cid_fn(pid_to_cid_mapping), 20 * 1024)

            # Full-arity unpack on purpose: it also pins the number of
            # metrics returned by the conversion.
            (_, _, _, _, external_process, zombie_container) = metrics

            self.assertEqual(0, len(zombie_container.samples))

            external_samples = external_process.samples
            self.assertEqual(1, len(external_samples))
            self.assertEqual("0", external_samples[0].labels["minor_number"])
            # The pid label is compared as an integer in this variant.
            self.assertEqual(44, external_samples[0].labels["pid"])
Example #4
0
    def test_convert_to_metrics(self):
        """Check each metric produced by ``GpuCollector.convert_to_metrics``.

        Samples may come back unordered, so the result is not compared as a
        whole; every returned metric is compared on its own against a
        reference built with the matching ``collector.gen_*`` factory.

        Four single-GPU scenarios are converted in turn (minor 0, 1, 2 and a
        memory-leak case on minor 3), all sharing the same zombie set and
        pid-to-container mapping.
        """
        zombie_info = {"abc", "def"}
        pid_to_cid_mapping = {33: "def", 22: "ghi"}  # only 33 is zombie

        def convert(status, threshold):
            # Wrap one GPU status into gpu_info and run the conversion under
            # test. `threshold` is forwarded as the fourth argument
            # (presumably the memory-leak threshold -- confirm against
            # GpuCollector).
            return GpuCollector.convert_to_metrics(
                nvidia.construct_gpu_info([status]), zombie_info,
                self.make_pid_to_cid_fn(pid_to_cid_mapping), threshold)

        def expected(factory, *samples):
            # Build a reference metric holding the given (labels, value)
            # samples, added in the order supplied.
            metric = factory()
            for labels, value in samples:
                metric.add_metric(labels, value)
            return metric

        def expected_ecc(minor, uuid, counts):
            # Reference ECC counter with the four error kinds in fixed order.
            kinds = ("volatile_single", "volatile_double",
                     "aggregated_single", "aggregated_double")
            return expected(
                collector.gen_gpu_ecc_counter,
                *[([minor, uuid, kind], count)
                  for kind, count in zip(kinds, counts)])

        # test minor 0: pid 33 is in a zombie container, pid 44 is external
        metrics = convert(
            nvidia.NvidiaGpuStatus(20, 21, [22, 33, 44], nvidia.EccError(),
                                   "0", "GPU-uuid0", 37.0),
            20 * 1024)
        (core_utils, mem_utils, ecc_errors, mem_leak, external_process,
         zombie_container, gpu_temp, _) = metrics

        self.assertEqual(
            expected(collector.gen_gpu_util_gauge, (["0", "GPU-uuid0"], 20)),
            core_utils)
        self.assertEqual(
            expected(collector.gen_gpu_mem_util_gauge,
                     (["0", "GPU-uuid0"], 21)),
            mem_utils)
        self.assertEqual(expected_ecc("0", "GPU-uuid0", (0, 0, 0, 0)),
                         ecc_errors)
        self.assertEqual(expected(collector.gen_gpu_memory_leak_counter),
                         mem_leak)
        self.assertEqual(
            expected(collector.gen_gpu_used_by_external_process_counter,
                     (["0", "44"], 1)),
            external_process)
        self.assertEqual(
            expected(collector.gen_gpu_used_by_zombie_container_counter,
                     (["0", "def"], 1)),
            zombie_container)
        self.assertEqual(
            expected(collector.gen_gpu_temperature_gauge,
                     (["0", "GPU-uuid0"], 37.0)),
            gpu_temp)

        # test minor 1
        metrics = convert(
            nvidia.NvidiaGpuStatus(
                30, 31, [55, 123],
                nvidia.EccError(volatile_single=2,
                                volatile_double=3,
                                aggregated_single=4,
                                aggregated_double=5), "1", "GPU-uuid1", 24.0),
            20 * 1024)
        (core_utils, mem_utils, ecc_errors, mem_leak, external_process,
         zombie_container, gpu_temp, _) = metrics

        self.assertEqual(
            expected(collector.gen_gpu_util_gauge, (["1", "GPU-uuid1"], 30)),
            core_utils)
        self.assertEqual(
            expected(collector.gen_gpu_mem_util_gauge,
                     (["1", "GPU-uuid1"], 31)),
            mem_utils)
        self.assertEqual(expected_ecc("1", "GPU-uuid1", (2, 3, 4, 5)),
                         ecc_errors)
        self.assertEqual(expected(collector.gen_gpu_memory_leak_counter),
                         mem_leak)
        self.assertEqual(
            expected(collector.gen_gpu_used_by_external_process_counter,
                     (["1", "55"], 1), (["1", "123"], 1)),
            external_process)
        self.assertEqual(
            expected(collector.gen_gpu_used_by_zombie_container_counter),
            zombie_container)
        self.assertEqual(
            expected(collector.gen_gpu_temperature_gauge,
                     (["1", "GPU-uuid1"], 24.0)),
            gpu_temp)

        # test minor 2: no processes, second status value equal to the
        # fourth conversion argument -- no memory-leak sample expected
        metrics = convert(
            nvidia.NvidiaGpuStatus(40, 20 * 1024 * 1024, [], nvidia.EccError(),
                                   "2", "GPU-uuid2", 30.0),
            20 * 1024 * 1024)
        (core_utils, mem_utils, ecc_errors, mem_leak, external_process,
         zombie_container, gpu_temp, _) = metrics

        self.assertEqual(
            expected(collector.gen_gpu_util_gauge, (["2", "GPU-uuid2"], 40)),
            core_utils)
        self.assertEqual(
            expected(collector.gen_gpu_mem_util_gauge,
                     (["2", "GPU-uuid2"], 20 * 1024 * 1024)),
            mem_utils)
        self.assertEqual(expected_ecc("2", "GPU-uuid2", (0, 0, 0, 0)),
                         ecc_errors)
        self.assertEqual(expected(collector.gen_gpu_memory_leak_counter),
                         mem_leak)
        self.assertEqual(
            expected(collector.gen_gpu_used_by_external_process_counter),
            external_process)
        self.assertEqual(
            expected(collector.gen_gpu_used_by_zombie_container_counter),
            zombie_container)
        self.assertEqual(
            expected(collector.gen_gpu_temperature_gauge,
                     (["2", "GPU-uuid2"], 30.0)),
            gpu_temp)

        # test memory leak: no processes, second status value above the
        # fourth conversion argument -- one leak sample expected
        metrics = convert(
            nvidia.NvidiaGpuStatus(40, 20 * 1024 * 1024 + 1, [],
                                   nvidia.EccError(), "3", "GPU-uuid3", 30.0),
            20 * 1024)
        (_, _, _, mem_leak, _, _, _, _) = metrics

        self.assertEqual(
            expected(collector.gen_gpu_memory_leak_counter,
                     (["3", "GPU-uuid3"], 1)),
            mem_leak)
Example #5
0
    def test_convert_to_metrics(self):
        """Check each metric produced by ``GpuCollector.convert_to_metrics``.

        Samples may come back unordered, so the result is not compared as a
        whole; every returned metric is compared on its own against a
        reference built with the matching ``collector.gen_*`` factory.

        Four single-GPU scenarios are converted in turn (minor 0, 1, 2 and a
        memory-leak case on minor 3), all sharing the same zombie set and
        pid-to-container mapping. ``gpu_info`` is a dict keyed by minor number.
        """
        zombie_info = {"abc", "def"}
        pid_to_cid_mapping = {
            33: "def",
            22: "ghi",
            44: "jkl",
        }  # only 33 is zombie

        def convert(minor, status, threshold):
            # Key one GPU status by its minor number and run the conversion
            # under test. `threshold` is forwarded as the fourth argument
            # (presumably the memory-leak threshold -- confirm against
            # GpuCollector).
            return GpuCollector.convert_to_metrics(
                {minor: status}, zombie_info,
                self.make_pid_to_cid_fn(pid_to_cid_mapping), threshold)

        def expected(factory, *samples):
            # Build a reference metric holding the given (labels, value)
            # samples, added in the order supplied.
            metric = factory()
            for labels, value in samples:
                metric.add_metric(labels, value)
            return metric

        def expected_ecc(minor, single, double):
            # Reference ECC counter: "single" sample first, then "double".
            return expected(collector.gen_gpu_ecc_counter,
                            ([minor, "single"], single),
                            ([minor, "double"], double))

        # test minor 0: pid 33 is zombie, pids 22 and 44 are reported as
        # external processes
        metrics = convert(
            "0",
            nvidia.NvidiaGpuStatus(20, 21, [22, 33, 44], nvidia.EccError()),
            20 * 1024)
        (core_utils, mem_utils, ecc_errors, mem_leak, external_process,
         zombie_container) = metrics

        self.assertEqual(expected(collector.gen_gpu_util_gauge, (["0"], 20)),
                         core_utils)
        self.assertEqual(
            expected(collector.gen_gpu_mem_util_gauge, (["0"], 21)),
            mem_utils)
        self.assertEqual(expected_ecc("0", 0, 0), ecc_errors)
        self.assertEqual(expected(collector.gen_gpu_memory_leak_counter),
                         mem_leak)
        self.assertEqual(
            expected(collector.gen_gpu_used_by_external_process_counter,
                     (["0", 22], 1), (["0", 44], 1)),
            external_process)
        self.assertEqual(
            expected(collector.gen_gpu_used_by_zombie_container_counter,
                     (["0", "def"], 1)),
            zombie_container)

        # test minor 1
        metrics = convert(
            "1",
            nvidia.NvidiaGpuStatus(30, 31, [55, 123],
                                   nvidia.EccError(single=2, double=3)),
            20 * 1024)
        (core_utils, mem_utils, ecc_errors, mem_leak, external_process,
         zombie_container) = metrics

        self.assertEqual(expected(collector.gen_gpu_util_gauge, (["1"], 30)),
                         core_utils)
        self.assertEqual(
            expected(collector.gen_gpu_mem_util_gauge, (["1"], 31)),
            mem_utils)
        self.assertEqual(expected_ecc("1", 2, 3), ecc_errors)
        self.assertEqual(expected(collector.gen_gpu_memory_leak_counter),
                         mem_leak)
        self.assertEqual(
            expected(collector.gen_gpu_used_by_external_process_counter,
                     (["1", 55], 1), (["1", 123], 1)),
            external_process)
        self.assertEqual(
            expected(collector.gen_gpu_used_by_zombie_container_counter),
            zombie_container)

        # test minor 2: no processes, second status value equal to the
        # fourth conversion argument -- no memory-leak sample expected
        metrics = convert(
            "2",
            nvidia.NvidiaGpuStatus(40, 20 * 1024 * 1024, [], nvidia.EccError()),
            20 * 1024 * 1024)
        (core_utils, mem_utils, ecc_errors, mem_leak, external_process,
         zombie_container) = metrics

        self.assertEqual(expected(collector.gen_gpu_util_gauge, (["2"], 40)),
                         core_utils)
        self.assertEqual(
            expected(collector.gen_gpu_mem_util_gauge,
                     (["2"], 20 * 1024 * 1024)),
            mem_utils)
        self.assertEqual(expected_ecc("2", 0, 0), ecc_errors)
        self.assertEqual(expected(collector.gen_gpu_memory_leak_counter),
                         mem_leak)
        self.assertEqual(
            expected(collector.gen_gpu_used_by_external_process_counter),
            external_process)
        self.assertEqual(
            expected(collector.gen_gpu_used_by_zombie_container_counter),
            zombie_container)

        # test memory leak: no processes, second status value above the
        # fourth conversion argument -- one leak sample expected
        metrics = convert(
            "3",
            nvidia.NvidiaGpuStatus(40, 20 * 1024 * 1024 + 1, [],
                                   nvidia.EccError()),
            20 * 1024)
        (_, _, _, mem_leak, _, _) = metrics

        self.assertEqual(
            expected(collector.gen_gpu_memory_leak_counter, (["3"], 1)),
            mem_leak)