def test_convert_to_metrics_with_no_zombie_info_BUGFIX(self):
    # Regression test: the external process metric must still be produced
    # when zombie_info carries no entries, whether it is an empty list or
    # None.
    statuses = [
        nvidia.NvidiaGpuStatus(20, 21, [22, 33, 44], nvidia.EccError(), "0",
                               "GPU-uuid0", 40.0)
    ]
    gpu_info = nvidia.construct_gpu_info(statuses)

    # pids 22 and 33 resolve to container ids, so only 44 is an external
    # process
    pid_to_cid_mapping = {33: "def", 22: "ghi"}

    # first pass: zombie_info is empty; second pass: zombie_info is None
    for zombie_info in ([], None):
        metrics = GpuCollector.convert_nvidia_gpu_info_to_metrics(
            gpu_info,
            zombie_info,
            self.make_pid_to_cid_fn(pid_to_cid_mapping),
            20 * 1024)

        # the full unpack also pins the number of returned metrics (9)
        (_, _, _, _, external_process, zombie_container, _, _,
         _) = metrics

        self.assertEqual(0, len(zombie_container.samples))
        self.assertEqual(1, len(external_process.samples))
        self.assertEqual(
            "0", external_process.samples[0].labels["minor_number"])
        self.assertEqual("44", external_process.samples[0].labels["pid"])
def test_convert_to_metrics_with_real_id_BUGFIX(self):
    # Regression test: zombie_info holds a 12-character container id while
    # the pid -> container id lookup yields the full 64-character id that
    # starts with the same characters. The zombie container metric is
    # expected to be reported with the short id.
    short_cid = "ce5de12d6275"
    full_cid = (
        "ce5de12d6275dc05c9ec5b7f58484f075f4775d8f54f6a4be3dc1439344df356")

    gpu_info = nvidia.construct_gpu_info([
        nvidia.NvidiaGpuStatus(20, 21, [22], nvidia.EccError(), "0",
                               "GPU-uuid0", 50.0)
    ])
    zombie_info = {short_cid}
    pid_to_cid_mapping = {22: full_cid}

    metrics = GpuCollector.convert_to_metrics(
        gpu_info,
        zombie_info,
        self.make_pid_to_cid_fn(pid_to_cid_mapping),
        20 * 1024)

    # only the zombie container metric is inspected; the full unpack also
    # pins the number of returned metrics (8)
    _, _, _, _, _, zombie_container, _, _ = metrics

    self.assertEqual(1, len(zombie_container.samples))
    self.assertEqual("0",
                     zombie_container.samples[0].labels["minor_number"])
    self.assertEqual("ce5de12d6275",
                     zombie_container.samples[0].labels["container_id"])
def test_convert_to_metrics_with_no_zombie_info_BUGFIX(self):
    # Regression test: the external process metric must still be produced
    # when zombie_info carries no entries, whether it is an empty list or
    # None.
    gpu_info = {
        "0": nvidia.NvidiaGpuStatus(20, 21, [22, 33, 44], nvidia.EccError())
    }

    # pids 22 and 33 resolve to container ids, so only 44 is an external
    # process
    pid_to_cid_mapping = {33: "def", 22: "ghi"}

    # first pass: zombie_info is empty; second pass: zombie_info is None
    for zombie_info in ([], None):
        metrics = GpuCollector.convert_to_metrics(
            gpu_info,
            zombie_info,
            self.make_pid_to_cid_fn(pid_to_cid_mapping),
            20 * 1024)

        # the full unpack also pins the number of returned metrics (6)
        _, _, _, _, external_process, zombie_container = metrics

        self.assertEqual(0, len(zombie_container.samples))
        self.assertEqual(1, len(external_process.samples))
        self.assertEqual(
            "0", external_process.samples[0].labels["minor_number"])
        # this variant expects the pid label as an int
        self.assertEqual(44, external_process.samples[0].labels["pid"])
def test_convert_to_metrics(self):
    # Samples may be unordered, so the returned tuple is torn apart and
    # every metric is compared with an expected metric built by hand.
    zombie_info = {"abc", "def"}
    pid_to_cid_mapping = {33: "def", 22: "ghi"}  # only 33 is zombie

    def convert(info, threshold):
        # run the conversion under test with the shared zombie/pid fixtures
        return GpuCollector.convert_to_metrics(
            info, zombie_info, self.make_pid_to_cid_fn(pid_to_cid_mapping),
            threshold)

    def expected(factory, *samples):
        # build an expected metric from (labels, value) pairs
        metric = factory()
        for labels, value in samples:
            metric.add_metric(labels, value)
        return metric

    # test minor 0
    gpu_info = nvidia.construct_gpu_info([
        nvidia.NvidiaGpuStatus(20, 21, [22, 33, 44], nvidia.EccError(), "0",
                               "GPU-uuid0", 37.0)
    ])
    (core_utils, mem_utils, ecc_errors, mem_leak, external_process,
     zombie_container, gpu_temp, _) = convert(gpu_info, 20 * 1024)

    self.assertEqual(
        expected(collector.gen_gpu_util_gauge, (["0", "GPU-uuid0"], 20)),
        core_utils)
    self.assertEqual(
        expected(collector.gen_gpu_mem_util_gauge,
                 (["0", "GPU-uuid0"], 21)),
        mem_utils)
    self.assertEqual(
        expected(collector.gen_gpu_ecc_counter,
                 (["0", "GPU-uuid0", "volatile_single"], 0),
                 (["0", "GPU-uuid0", "volatile_double"], 0),
                 (["0", "GPU-uuid0", "aggregated_single"], 0),
                 (["0", "GPU-uuid0", "aggregated_double"], 0)),
        ecc_errors)
    self.assertEqual(expected(collector.gen_gpu_memory_leak_counter),
                     mem_leak)
    self.assertEqual(
        expected(collector.gen_gpu_used_by_external_process_counter,
                 (["0", "44"], 1)),
        external_process)
    self.assertEqual(
        expected(collector.gen_gpu_used_by_zombie_container_counter,
                 (["0", "def"], 1)),
        zombie_container)
    self.assertEqual(
        expected(collector.gen_gpu_temperature_gauge,
                 (["0", "GPU-uuid0"], 37.0)),
        gpu_temp)

    # test minor 1
    gpu_info = nvidia.construct_gpu_info([
        nvidia.NvidiaGpuStatus(
            30, 31, [55, 123],
            nvidia.EccError(volatile_single=2,
                            volatile_double=3,
                            aggregated_single=4,
                            aggregated_double=5), "1", "GPU-uuid1", 24.0)
    ])
    (core_utils, mem_utils, ecc_errors, mem_leak, external_process,
     zombie_container, gpu_temp, _) = convert(gpu_info, 20 * 1024)

    self.assertEqual(
        expected(collector.gen_gpu_util_gauge, (["1", "GPU-uuid1"], 30)),
        core_utils)
    self.assertEqual(
        expected(collector.gen_gpu_mem_util_gauge,
                 (["1", "GPU-uuid1"], 31)),
        mem_utils)
    self.assertEqual(
        expected(collector.gen_gpu_ecc_counter,
                 (["1", "GPU-uuid1", "volatile_single"], 2),
                 (["1", "GPU-uuid1", "volatile_double"], 3),
                 (["1", "GPU-uuid1", "aggregated_single"], 4),
                 (["1", "GPU-uuid1", "aggregated_double"], 5)),
        ecc_errors)
    self.assertEqual(expected(collector.gen_gpu_memory_leak_counter),
                     mem_leak)
    self.assertEqual(
        expected(collector.gen_gpu_used_by_external_process_counter,
                 (["1", "55"], 1),
                 (["1", "123"], 1)),
        external_process)
    self.assertEqual(
        expected(collector.gen_gpu_used_by_zombie_container_counter),
        zombie_container)
    self.assertEqual(
        expected(collector.gen_gpu_temperature_gauge,
                 (["1", "GPU-uuid1"], 24.0)),
        gpu_temp)

    # test minor 2
    gpu_info = nvidia.construct_gpu_info([
        nvidia.NvidiaGpuStatus(40, 20 * 1024 * 1024, [], nvidia.EccError(),
                               "2", "GPU-uuid2", 30.0)
    ])
    # here the last argument equals the status' second value and no memory
    # leak sample is expected
    (core_utils, mem_utils, ecc_errors, mem_leak, external_process,
     zombie_container, gpu_temp, _) = convert(gpu_info, 20 * 1024 * 1024)

    self.assertEqual(
        expected(collector.gen_gpu_util_gauge, (["2", "GPU-uuid2"], 40)),
        core_utils)
    self.assertEqual(
        expected(collector.gen_gpu_mem_util_gauge,
                 (["2", "GPU-uuid2"], 20 * 1024 * 1024)),
        mem_utils)
    self.assertEqual(
        expected(collector.gen_gpu_ecc_counter,
                 (["2", "GPU-uuid2", "volatile_single"], 0),
                 (["2", "GPU-uuid2", "volatile_double"], 0),
                 (["2", "GPU-uuid2", "aggregated_single"], 0),
                 (["2", "GPU-uuid2", "aggregated_double"], 0)),
        ecc_errors)
    self.assertEqual(expected(collector.gen_gpu_memory_leak_counter),
                     mem_leak)
    self.assertEqual(
        expected(collector.gen_gpu_used_by_external_process_counter),
        external_process)
    self.assertEqual(
        expected(collector.gen_gpu_used_by_zombie_container_counter),
        zombie_container)
    self.assertEqual(
        expected(collector.gen_gpu_temperature_gauge,
                 (["2", "GPU-uuid2"], 30.0)),
        gpu_temp)

    # test memory leak
    gpu_info = nvidia.construct_gpu_info([
        nvidia.NvidiaGpuStatus(40, 20 * 1024 * 1024 + 1, [],
                               nvidia.EccError(), "3", "GPU-uuid3", 30.0)
    ])
    (_, _, _, mem_leak, _, _, _, _) = convert(gpu_info, 20 * 1024)

    self.assertEqual(
        expected(collector.gen_gpu_memory_leak_counter,
                 (["3", "GPU-uuid3"], 1)),
        mem_leak)
def test_convert_to_metrics(self):
    # Samples may be unordered, so the returned tuple is torn apart and
    # every metric is compared with an expected metric built by hand.
    zombie_info = {"abc", "def"}
    # only 33 is zombie; the expected external process metric below lists
    # pids 22 and 44
    pid_to_cid_mapping = {33: "def", 22: "ghi", 44: "jkl"}

    def convert(info, threshold):
        # run the conversion under test with the shared zombie/pid fixtures
        return GpuCollector.convert_to_metrics(
            info, zombie_info, self.make_pid_to_cid_fn(pid_to_cid_mapping),
            threshold)

    def expected(factory, *samples):
        # build an expected metric from (labels, value) pairs
        metric = factory()
        for labels, value in samples:
            metric.add_metric(labels, value)
        return metric

    # test minor 0
    gpu_info = {
        "0": nvidia.NvidiaGpuStatus(20, 21, [22, 33, 44], nvidia.EccError())
    }
    (core_utils, mem_utils, ecc_errors, mem_leak, external_process,
     zombie_container) = convert(gpu_info, 20 * 1024)

    self.assertEqual(expected(collector.gen_gpu_util_gauge, (["0"], 20)),
                     core_utils)
    self.assertEqual(
        expected(collector.gen_gpu_mem_util_gauge, (["0"], 21)), mem_utils)
    self.assertEqual(
        expected(collector.gen_gpu_ecc_counter,
                 (["0", "single"], 0),
                 (["0", "double"], 0)),
        ecc_errors)
    self.assertEqual(expected(collector.gen_gpu_memory_leak_counter),
                     mem_leak)
    self.assertEqual(
        expected(collector.gen_gpu_used_by_external_process_counter,
                 (["0", 22], 1),
                 (["0", 44], 1)),
        external_process)
    self.assertEqual(
        expected(collector.gen_gpu_used_by_zombie_container_counter,
                 (["0", "def"], 1)),
        zombie_container)

    # test minor 1
    gpu_info = {
        "1":
            nvidia.NvidiaGpuStatus(30, 31, [55, 123],
                                   nvidia.EccError(single=2, double=3))
    }
    (core_utils, mem_utils, ecc_errors, mem_leak, external_process,
     zombie_container) = convert(gpu_info, 20 * 1024)

    self.assertEqual(expected(collector.gen_gpu_util_gauge, (["1"], 30)),
                     core_utils)
    self.assertEqual(
        expected(collector.gen_gpu_mem_util_gauge, (["1"], 31)), mem_utils)
    self.assertEqual(
        expected(collector.gen_gpu_ecc_counter,
                 (["1", "single"], 2),
                 (["1", "double"], 3)),
        ecc_errors)
    self.assertEqual(expected(collector.gen_gpu_memory_leak_counter),
                     mem_leak)
    self.assertEqual(
        expected(collector.gen_gpu_used_by_external_process_counter,
                 (["1", 55], 1),
                 (["1", 123], 1)),
        external_process)
    self.assertEqual(
        expected(collector.gen_gpu_used_by_zombie_container_counter),
        zombie_container)

    # test minor 2
    gpu_info = {
        "2":
            nvidia.NvidiaGpuStatus(40, 20 * 1024 * 1024, [],
                                   nvidia.EccError())
    }
    # here the last argument equals the status' second value and no memory
    # leak sample is expected
    (core_utils, mem_utils, ecc_errors, mem_leak, external_process,
     zombie_container) = convert(gpu_info, 20 * 1024 * 1024)

    self.assertEqual(expected(collector.gen_gpu_util_gauge, (["2"], 40)),
                     core_utils)
    self.assertEqual(
        expected(collector.gen_gpu_mem_util_gauge,
                 (["2"], 20 * 1024 * 1024)),
        mem_utils)
    self.assertEqual(
        expected(collector.gen_gpu_ecc_counter,
                 (["2", "single"], 0),
                 (["2", "double"], 0)),
        ecc_errors)
    self.assertEqual(expected(collector.gen_gpu_memory_leak_counter),
                     mem_leak)
    self.assertEqual(
        expected(collector.gen_gpu_used_by_external_process_counter),
        external_process)
    self.assertEqual(
        expected(collector.gen_gpu_used_by_zombie_container_counter),
        zombie_container)

    # test memory leak
    gpu_info = {
        "3":
            nvidia.NvidiaGpuStatus(40, 20 * 1024 * 1024 + 1, [],
                                   nvidia.EccError())
    }
    (_, _, _, mem_leak, _, _) = convert(gpu_info, 20 * 1024)

    self.assertEqual(
        expected(collector.gen_gpu_memory_leak_counter, (["3"], 1)),
        mem_leak)