def helper_check_software_page_retirements_fail_on_pending_retirements(handle, gpuId):
    """
    Check that the software page retirement test fails while a pending page
    retirement is injected, and passes again once the injected value is reset.

    handle -- DCGM handle passed to the diag wrapper and to inject_value
    gpuId  -- id of the GPU the diag is run against (fake GPUs are enabled)

    The test is skipped if the page retirement check does not pass before
    anything is injected.
    """
    swtest = dcgm_structs.DCGM_SWTEST_PAGE_RETIREMENT
    pending_field = dcgm_fields.DCGM_FI_DEV_RETIRED_PENDING

    diag = DcgmDiag.DcgmDiag(gpuIds=[gpuId])
    diag.UseFakeGpus()

    def run_diag():
        # Every phase reuses the same diag object and handle.
        return test_utils.diag_execute_wrapper(diag, handle)

    # Phase 1: baseline. A GPU that already fails cannot be used for this check.
    baseline = run_diag()
    if not check_software_result_pass(baseline, swtest):
        test_utils.skip_test("Skipping because GPU %s does not pass software page retirement test. "
                             "Please verify whether the GPU is healthy." % gpuId)

    # Phase 2: one pending page retirement must make the software test fail.
    inject_value(handle, gpuId, pending_field, 1, -30, True)
    with_pending = run_diag()
    assert check_software_result_fail(with_pending, swtest), \
        "Expected software test to fail due to pending page retirements in the GPU"

    # Phase 3: clear the injected value and confirm the test passes again.
    inject_value(handle, gpuId, pending_field, 0, -30, True)
    after_reset = run_diag()
    assert check_software_result_pass(after_reset, swtest), \
        "Expected software test to pass"
def helper_check_software_page_retirements_fail_total_retirements(handle, gpuId):
    """
    Check that the software page retirement test fails when the injected
    DBE + SBE retired page counts are high, and passes when they are lowered
    or cleared.

    handle -- DCGM handle passed to the diag wrapper and to inject_value
    gpuId  -- id of the GPU the diag is run against (fake GPUs are enabled)

    The test is skipped if the page retirement check does not pass before
    anything is injected.
    """
    swtest = dcgm_structs.DCGM_SWTEST_PAGE_RETIREMENT
    dbe_field = dcgm_fields.DCGM_FI_DEV_RETIRED_DBE
    sbe_field = dcgm_fields.DCGM_FI_DEV_RETIRED_SBE

    diag = DcgmDiag.DcgmDiag(gpuIds=[gpuId])
    diag.UseFakeGpus()

    def run_diag():
        return test_utils.diag_execute_wrapper(diag, handle)

    def set_retired(field, count):
        inject_value(handle, gpuId, field, count, -30, True)

    # Baseline: skip when the GPU does not pass with nothing injected.
    baseline = run_diag()
    if not check_software_result_pass(baseline, swtest):
        test_utils.skip_test("Skipping because GPU %s does not pass software page retirement test. "
                             "Please verify whether the GPU is healthy." % gpuId)

    # 33 DBE + 33 SBE = 66 retired pages: expected to fail.
    set_retired(dbe_field, 33)
    set_retired(sbe_field, 33)
    too_many = run_diag()
    assert check_software_result_fail(too_many, swtest), \
        "Expected software test to fail due to 60 total page retirements in the GPU"

    # Lower only the SBE count: 33 DBE + 25 SBE = 58 retired pages, expected to pass.
    set_retired(sbe_field, 25)
    below_limit = run_diag()
    assert check_software_result_pass(below_limit, swtest), \
        "Expected software test to pass since there are less than 60 total retired pages"

    # Clear both counters; the test must still pass.
    set_retired(dbe_field, 0)
    set_retired(sbe_field, 0)
    cleared = run_diag()
    assert check_software_result_pass(cleared, swtest), \
        "Expected software test to pass since there are no retired pages"
def perform_diag_with_throttle_mask_and_verify(dd, handle, gpuId, inserted_error, throttle_mask, shouldPass, failureMsg):
    """
    Inject a clock throttle reason (plus a GPU temperature of 1000), run the
    given diag, and assert whether a throttling failure was reported.

    dd             -- diag object to run; its throttle mask is set when one is given
    handle         -- DCGM handle used for injection and for running the diag
    gpuId          -- GPU to inject into and to inspect in the response
    inserted_error -- value injected into DCGM_FI_DEV_CLOCK_THROTTLE_REASONS
    throttle_mask  -- mask passed to dd.SetThrottleMask, or None to leave it untouched
    shouldPass     -- truthy when no throttling failure is expected
    failureMsg     -- accepted for the callers' benefit; not used by this helper
    """
    throttle_field = dcgm_fields.DCGM_FI_DEV_CLOCK_THROTTLE_REASONS
    poll_interval = 0.1

    if throttle_mask is not None:
        dd.SetThrottleMask(throttle_mask)

    inject_value(handle, gpuId, throttle_field, inserted_error, injection_offset, True)
    inject_value(handle, gpuId, dcgm_fields.DCGM_FI_DEV_GPU_TEMP, 1000, injection_offset, True)

    # The diag is only meaningful once DCGM reports the injected throttle value.
    assert dcgm_internal_helpers.verify_field_value(gpuId, throttle_field, inserted_error,
                                                    checkInterval=poll_interval, maxWait=5,
                                                    numMatches=1), \
        "Expected inserted values to be visible in DCGM"

    response = test_utils.diag_execute_wrapper(dd, handle)
    throttled, errMsg = find_throttle_failure(response, gpuId, dcgm_structs.DCGM_SM_STRESS_INDEX)

    if not shouldPass:
        assert throttled == True, "Expected to find a throttling error but did not (%s)" % errMsg
        return

    assert throttled == False, "Expected to not have a throttling error but found %s" % errMsg
def helper_test_thermal_violations_in_seconds(handle, gpuIds):
    """
    Inject a thermal violation value and check that the diagnostic test
    reports it as "totaling 2.3 seconds" in its error message.

    handle -- DCGM handle used for injection and for running the diag
    gpuIds -- GPUs to run the diag on; the value is injected into gpuIds[0]
              and only that GPU's result is inspected
    """
    dd = DcgmDiag.DcgmDiag(gpuIds=gpuIds, testNamesStr='diagnostic', paramsStr='diagnostic.test_duration=10')
    dd.UseFakeGpus()

    fieldId = dcgm_fields.DCGM_FI_DEV_THERMAL_VIOLATION
    # The diag is expected to render this as 2.3 seconds
    # (presumably the field is in nanoseconds -- TODO confirm).
    injected_value = 2344122048
    inject_value(handle, gpuIds[0], fieldId, injected_value, 10, True)

    # Verify that the inserted values are visible in DCGM before starting the diag
    assert dcgm_internal_helpers.verify_field_value(gpuIds[0], fieldId, injected_value, maxWait=5, numMatches=1), \
        "Expected inserted values to be visible in DCGM"

    # Start the diag
    response = dd.Execute(handle)

    testIndex = dcgm_structs.DCGM_DIAGNOSTIC_INDEX
    errmsg = response.perGpuResponses[gpuIds[0]].results[testIndex].error.msg

    # Match "hermal" instead of "thermal" because the word is sometimes capitalized
    assert "hermal violations" in errmsg, \
        "Thermal violations were injected but not found in error message: '%s'." % errmsg
    assert "totaling 2.3 seconds" in errmsg, \
        "Expected 2.3 seconds of thermal violations but found %s" % errmsg
def helper_test_inject_instance_fields(handle, gpuIds):
    """
    Inject an ECC DBE value for a GPU and check that it is stored for the GPU
    entity only, while the GPU instance and compute instance report no data.

    handle -- DCGM handle used for group setup, injection and reads
    gpuIds -- list of GPU ids; only gpuIds[0] is used
    """
    instances, cis = ensure_instance_ids(handle, gpuIds[0], 1, 1)
    # keys() is a non-subscriptable view on Python 3, so materialize it before indexing.
    firstInstanceId = list(instances.keys())[0]
    lastCIId = list(cis.keys())[0]

    # Set up the watches on these groups
    groupId = dcgm_agent.dcgmGroupCreate(handle, dcgm_structs.DCGM_GROUP_EMPTY, 'tien')
    fieldGroupId = dcgm_agent.dcgmFieldGroupCreate(handle, [dcgm_fields.DCGM_FI_DEV_ECC_DBE_VOL_TOTAL], 'kal')

    dcgm_agent.dcgmGroupAddEntity(handle, groupId, dcgm_fields.DCGM_FE_GPU, gpuIds[0])
    dcgm_agent.dcgmGroupAddEntity(handle, groupId, dcgm_fields.DCGM_FE_GPU_I, firstInstanceId)
    dcgm_agent.dcgmGroupAddEntity(handle, groupId, dcgm_fields.DCGM_FE_GPU_CI, lastCIId)
    dcgm_agent.dcgmWatchFields(handle, groupId, fieldGroupId, 1, 100, 100)

    dcgm_internal_helpers.inject_value(handle, gpuIds[0], dcgm_fields.DCGM_FI_DEV_ECC_DBE_VOL_TOTAL,
                                       2, 5, isInt=True, verifyInsertion=True,
                                       entityType=dcgm_fields.DCGM_FE_GPU)

    # Read the values back for the instance, the compute instance and the GPU
    entities = [dcgm_structs.c_dcgmGroupEntityPair_t(),
                dcgm_structs.c_dcgmGroupEntityPair_t(),
                dcgm_structs.c_dcgmGroupEntityPair_t()]
    entities[0].entityGroupId = dcgm_fields.DCGM_FE_GPU_I
    entities[0].entityId = firstInstanceId
    entities[1].entityGroupId = dcgm_fields.DCGM_FE_GPU_CI
    entities[1].entityId = lastCIId
    entities[2].entityGroupId = dcgm_fields.DCGM_FE_GPU
    entities[2].entityId = gpuIds[0]

    fieldIds = [dcgm_fields.DCGM_FI_DEV_ECC_DBE_VOL_TOTAL]
    values = dcgm_agent.dcgmEntitiesGetLatestValues(handle, entities, fieldIds, 0)
    for v in values:
        if v.entityGroupId == dcgm_fields.DCGM_FE_GPU:
            # The value was injected for the GPU entity, so it must read back as 2.
            assert v.value.i64 == 2, "Failed to inject value 2 for entity %u from group %u" % (
                v.entityId, v.entityGroupId)
        else:
            # Nothing was injected for the instance entities, so they must report no data.
            assert v.status == dcgm_structs.DCGM_ST_NO_DATA, \
                "Injected meaningless value %u for entity %u from group %u" % (
                    v.value.i64, v.entityId, v.entityGroupId)
def test_memtest_failures_standalone(handle, gpuIds):
    """
    Inject volatile DBE errors on the first GPU and check that the memtest
    diag does not report a passing result for that GPU.

    handle -- DCGM handle used for injection and for running the diag
    gpuIds -- GPUs to run memtest on; errors are injected into gpuIds[0]
    """
    target_gpu = gpuIds[0]
    diag = DcgmDiag.DcgmDiag(gpuIds=gpuIds, testNamesStr="memtest", paramsStr="memtest.test_duration=10")

    inject_value(handle, target_gpu, dcgm_fields.DCGM_FI_DEV_ECC_DBE_VOL_TOTAL, 1000, injection_offset, True)

    response = test_utils.diag_execute_wrapper(diag, handle)
    memtest_result = response.perGpuResponses[target_gpu].results[dcgm_structs.DCGM_MEMTEST_INDEX].result
    assert memtest_result != dcgm_structs.DCGM_DIAG_RESULT_PASS, \
        "Should have a failure due to injected DBEs, but got passing result"
def helper_test_diagnostic_config_usage(handle, gpuIds):
    """
    Run the diagnostic test with a config that sets max_sbe_errors to 1,
    inject volatile SBE errors on the first GPU, and check that the result
    for that GPU is not a pass.

    handle -- DCGM handle used for injection and for running the diag
    gpuIds -- GPUs to run the diag on; errors are injected into gpuIds[0]
    """
    target_gpu = gpuIds[0]
    diag = DcgmDiag.DcgmDiag(gpuIds=gpuIds, testNamesStr="diagnostic", paramsStr="diagnostic.test_duration=10")

    # NOTE(review): the nesting inside this YAML literal uses single-space indents;
    # confirm it matches the layout the config parser expects.
    config_contents = "%YAML 1.2\n\ncustom:\n- custom:\n diagnostic:\n max_sbe_errors: 1"
    diag.SetConfigFileContents(config_contents)

    inject_value(handle, target_gpu, dcgm_fields.DCGM_FI_DEV_ECC_SBE_VOL_TOTAL, 1000, injection_offset, True)

    response = test_utils.diag_execute_wrapper(diag, handle)
    diagnostic_result = response.perGpuResponses[target_gpu].results[dcgm_structs.DCGM_DIAGNOSTIC_INDEX].result
    assert diagnostic_result != dcgm_structs.DCGM_DIAG_RESULT_PASS, \
        "Should have a failure due to injected SBEs, but got passing result"
def helper_test_health_check_instances(handle, gpuIds):
    """
    Watch memory health for a group containing a GPU, a GPU instance and a
    compute instance, inject a volatile DBE on the GPU, and check that exactly
    one incident is reported and that it is attributed to the GPU entity.

    handle -- DCGM handle used to build the group and inject the error
    gpuIds -- list of GPU ids; only gpuIds[0] is used

    The test is skipped when the health check already reports incidents
    before anything is injected.
    """
    instances, cis = ensure_instance_ids(handle, gpuIds[0], 1, 1)
    # keys() is a non-subscriptable view on Python 3, so materialize it before indexing.
    instanceId = list(instances.keys())[0]
    ciId = list(cis.keys())[0]

    handleObj = DcgmHandle.DcgmHandle(handle=handle)
    systemObj = handleObj.GetSystem()
    groupObj = systemObj.GetEmptyGroup("test1")
    groupObj.AddEntity(dcgm_fields.DCGM_FE_GPU, gpuIds[0])
    groupObj.AddEntity(dcgm_fields.DCGM_FE_GPU_I, instanceId)
    groupObj.AddEntity(dcgm_fields.DCGM_FE_GPU_CI, ciId)

    newSystems = dcgm_structs.DCGM_HEALTH_WATCH_MEM
    groupObj.health.Set(newSystems)

    # Verify health prior to testing
    responseV4 = groupObj.health.Check(dcgm_structs.dcgmHealthResponse_version4)
    if responseV4.incidentCount != 0:
        test_utils.skip_test("Cannot test on unhealthy systems.")

    # Inject a single error, on the GPU entity only
    dcgm_internal_helpers.inject_value(handle, gpuIds[0], dcgm_fields.DCGM_FI_DEV_ECC_DBE_VOL_TOTAL,
                                       2, 5, isInt=True, verifyInsertion=True,
                                       entityType=dcgm_fields.DCGM_FE_GPU)

    responseV4 = groupObj.health.Check(dcgm_structs.dcgmHealthResponse_version4)
    assert responseV4.incidentCount == 1, \
        "Should have 1 total incidents but found %d" % responseV4.incidentCount

    # The single incident must describe a volatile DBE memory failure on the GPU itself.
    incident = responseV4.incidents[0]
    assert incident.entityInfo.entityId == gpuIds[0]
    assert incident.entityInfo.entityGroupId == dcgm_fields.DCGM_FE_GPU
    assert incident.error.code == dcgm_errors.DCGM_FR_VOLATILE_DBE_DETECTED
    assert incident.system == dcgm_structs.DCGM_HEALTH_WATCH_MEM
    assert incident.health == dcgm_structs.DCGM_HEALTH_RESULT_FAIL
def helper_per_gpu_responses_api(handle, gpuIds, testDir):
    """
    Verify that pass/fail status for diagnostic tests are reported on a per GPU basis via dcgmActionValidate API call

    handle  -- DCGM handle used for injection and for running the diag
    gpuIds  -- GPUs whose results are inspected; gpuIds[0] is the one made to fail
    testDir -- directory passed to the diag as its stats path
    """
    failGpuId = gpuIds[0]

    # Called for its own checks only; the diag it returns is not reused because
    # a dedicated SM Stress diag is built right below.
    helper_verify_diag_passing(handle, gpuIds, useFakeGpus=True)

    dd = DcgmDiag.DcgmDiag(gpuIds=[failGpuId], testNamesStr="SM Stress", paramsStr="sm stress.test_duration=15",
                           version=dcgm_structs.dcgmRunDiag_version)
    # We explicitly want to fail for throttle reasons since this test inserts throttling errors
    # for verification
    dd.SetThrottleMask(0)
    dd.UseFakeGpus()
    dd.SetStatsPath(testDir)
    dd.SetStatsOnFail(1)

    # Setup injection app
    fieldId = dcgm_fields.DCGM_FI_DEV_CLOCK_THROTTLE_REASONS
    insertedError = dcgm_fields.DCGM_CLOCKS_THROTTLE_REASON_HW_SLOWDOWN
    interval = 0.1

    # Use an offset to make these errors start after the benign values
    inject_value(handle, failGpuId, fieldId, insertedError, injection_offset, True)
    inject_value(handle, failGpuId, dcgm_fields.DCGM_FI_DEV_GPU_TEMP, 1000, injection_offset, True)

    # Verify that the inserted values are visible in DCGM before starting the diag
    assert dcgm_internal_helpers.verify_field_value(failGpuId, fieldId, insertedError, checkInterval=interval,
                                                    maxWait=5, numMatches=1), \
        "Expected inserted values to be visible in DCGM"

    # Log before running so the message reflects when the diag actually starts.
    logger.info("Started diag")
    response = test_utils.diag_execute_wrapper(dd, handle)

    # Verify that responses are reported on a per gpu basis. Ensure the first GPU failed, and all others passed
    for gpuId in gpuIds:
        throttled, errMsg = find_throttle_failure(response, gpuId, dcgm_structs.DCGM_SM_STRESS_INDEX)
        if gpuId == failGpuId:
            assert throttled, "Expected throttling error but found none (%s)" % errMsg
        else:
            assert not throttled, "Expected not to find a throttling error but found '%s'" % errMsg
def helper_throttling_masking_failures(handle, gpuId):
    """
    Check that an injected HW slowdown throttle reason makes the SM Stress
    test report a throttling failure when the throttle mask is 0.

    handle -- DCGM handle used for injection and for running the diags
    gpuId  -- GPU the diags are run against (fake GPUs are enabled)

    The test is skipped when a short SM Stress run does not pass beforehand.
    """
    throttle_field = dcgm_fields.DCGM_FI_DEV_CLOCK_THROTTLE_REASONS
    hw_slowdown = dcgm_fields.DCGM_CLOCKS_THROTTLE_REASON_HW_SLOWDOWN
    poll_interval = 0.1

    def build_sm_stress_diag(params):
        # Mask 0 is set explicitly: this test inserts throttling errors and
        # wants the diag to fail because of them.
        diag = DcgmDiag.DcgmDiag(gpuIds=[gpuId], testNamesStr="SM Stress", paramsStr=params,
                                 version=dcgm_structs.dcgmRunDiag_version)
        diag.SetThrottleMask(0)
        diag.UseFakeGpus()
        return diag

    # Health check: a short run must pass before anything is injected.
    precheck = test_utils.diag_execute_wrapper(build_sm_stress_diag("sm stress.test_duration=2"), handle)
    if not check_diag_result_pass(precheck, gpuId, dcgm_structs.DCGM_SM_STRESS_INDEX):
        test_utils.skip_test(
            "Skipping because GPU %s does not pass SM Perf test. "
            "Please verify whether the GPU is supported and healthy." % gpuId)

    diag = build_sm_stress_diag("sm stress.test_duration=15")

    logger.info("Injecting benign errors")
    inject_value(handle, gpuId, throttle_field, 3, 1, True)
    # The benign value has to be visible in DCGM before the diag starts.
    assert dcgm_internal_helpers.verify_field_value(gpuId, throttle_field, 3, checkInterval=poll_interval,
                                                    maxWait=5, numMatches=1), \
        "Expected inserted values to be visible in DCGM"

    logger.info("Injecting actual errors")
    inject_value(handle, gpuId, throttle_field, hw_slowdown, injection_offset, True)
    inject_value(handle, gpuId, dcgm_fields.DCGM_FI_DEV_GPU_TEMP, 1000, injection_offset, True)

    logger.info("Started diag")
    response = test_utils.diag_execute_wrapper(diag, handle)

    # Confirm the real error became visible. A single match is required here; the
    # 8 second budget is inherited from an earlier note (offset + matching time + buffer).
    assert dcgm_internal_helpers.verify_field_value(gpuId, throttle_field, hw_slowdown,
                                                    checkInterval=poll_interval, numMatches=1, maxWait=8), \
        "Expected inserted errors to be visible in DCGM"

    throttled, errMsg = find_throttle_failure(response, gpuId, dcgm_structs.DCGM_SM_STRESS_INDEX)
    assert throttled, "Expected to find throttling failure, but did not: (%s)" % errMsg
def verify_early_fail_checks_for_test(handle, gpuId, test_name, testIndex):
    """
    Helper method for verifying the fail early checks for the specified test.

    The helper runs the named test four times:
      1. a short plain run, to confirm the GPU passes at all (skips otherwise);
      2. a run with fail early enabled and no errors, which must pass and last
         at least 90% of the requested duration;
      3. a run with fail early enabled while XID errors are injected from a
         separate thread, which must finish within the 20 second join timeout
         and report a failure;
      4. a run after the XID value is reset to 0, which must pass again.

    handle    -- DCGM handle used for injection and for running the diags
    gpuId     -- GPU the test is run against
    test_name -- test name, used for testNamesStr, the duration parameter and the log file name
    testIndex -- index of the test's entry in the per-GPU results

    Targeted Power is skipped unless developer mode is enabled.
    """
    if testIndex == dcgm_structs.DCGM_TARGETED_POWER_INDEX and not option_parser.options.developer_mode:
        # Skip this test since Targeted Power always fails when duration is less than 30 seconds
        test_utils.skip_test("Skipping fail early verification for Targeted Power test. Use developer mode "
                             "to run this test.")
    duration = 2 if testIndex != dcgm_structs.DCGM_TARGETED_POWER_INDEX else 30 # Prevent false failures due to min
                                                                                  # duration requirements for Targeted Power
    paramsStr = "%s.test_duration=%s" % (test_name, duration)

    # One-element list used as an out-parameter: the thread target stores the
    # diag response in data[0] so the main thread can read it after join().
    data = [None]
    def runDiag(dd, data): # Simple helper method to run a diag (used as thread target)
        data[0] = test_utils.diag_execute_wrapper(dd, handle)

    ###
    # First verify that the given test passes for the gpu.
    # If it doesn't pass, skip test and add note to check GPU health
    logger.info("Checking whether %s test passes on GPU %s" % (test_name, gpuId))
    dd = DcgmDiag.DcgmDiag(gpuIds=[gpuId], testNamesStr=test_name, paramsStr=paramsStr)
    test_name_no_spaces = test_name.replace(" ", "_")
    # Template for the per-run debug log; the '%s' slot receives the run number (1-4).
    logname = '/tmp/nv_' + test_name_no_spaces + '%s.log'
    dd.SetDebugLogFile(logname % 1)
    dd.SetDebugLevel(5)
    response = test_utils.diag_execute_wrapper(dd, handle)
    if not check_diag_result_pass(response, gpuId, testIndex):
        test_utils.skip_test("Skipping because GPU %s does not pass %s test. "
                             "Please verify whether the GPU is healthy." % (gpuId, test_name))

    ###
    # Next, verify that the given test passes for the gpu when fail early checks are enabled and no errors are inserted
    logger.info("Checking whether %s test passes on GPU %s with fail early enabled" % (test_name, gpuId))
    duration = 15 if testIndex != dcgm_structs.DCGM_TARGETED_POWER_INDEX else 30 # Prevent false failures due to min
                                                                                   # duration requirements for Targeted Power
    paramsStr = "%s.test_duration=%s" % (test_name, duration)
    dd = DcgmDiag.DcgmDiag(gpuIds=[gpuId], testNamesStr=test_name, paramsStr=paramsStr)
    dd.SetFailEarly(checkInterval=2) # enable fail early checks
    dd.SetDebugLogFile(logname % 2)
    dd.SetDebugLevel(5)

    result_thread = threading.Thread(target=runDiag, args=[dd, data])
    result_thread.start()

    # Ensure nvvs process has started
    running, debug_output = dcgm_internal_helpers.check_nvvs_process(want_running=True)
    assert running, "Nvvs process did not start within 10 seconds. pgrep output: %s" % debug_output

    # Timing starts only after nvvs is seen running, so the measured span
    # excludes the start-up time; hence the 0.9 factor in the check below.
    start = time.time()
    result_thread.join()
    end = time.time()

    assert check_diag_result_pass(data[0], gpuId, testIndex), \
        "Expected %s test to pass with fail early enabled and no inserted errors" % test_name
    assert (end - start) >= duration * 0.9, \
        "Expected %s test to run for at least %ss, but it only ran for %ss." % (test_name, duration, end - start)

    ###
    # Verify fail early behavior by inserting an error.
    # Setup test parameters
    duration = 20 if testIndex != dcgm_structs.DCGM_TARGETED_POWER_INDEX else 30 # Prevent false failures due to min
                                                                                   # duration requirements for Targeted Power
    paramsStr = "%s.test_duration=%s" % (test_name, duration)
    response = None
    dd = DcgmDiag.DcgmDiag(gpuIds=[gpuId], testNamesStr=test_name, paramsStr=paramsStr)
    dd.SetFailEarly(checkInterval=2) # enable fail early checks
    dd.SetDebugLogFile(logname % 3)

    # Setup threads / processes
    xid_inject_val = 2
    result_thread = threading.Thread(target=runDiag, args=[dd, data])
    inject_error = dcgm_internal_helpers.InjectionThread(handle, gpuId, dcgm_fields.DCGM_FI_DEV_XID_ERRORS,
                                                         xid_inject_val, offset=5)

    logger.info("Verifying fail early behavior for %s test by inserting XIDs." % test_name)
    # Start inserting errors
    inject_error.start()
    # Ensure that inserted errors are visible
    assert \
        dcgm_internal_helpers.verify_field_value(gpuId, dcgm_fields.DCGM_FI_DEV_XID_ERRORS, xid_inject_val,
                                                 checkInterval=0.1, numMatches=5), \
        "Expected inserted value for XIDs to be visible in DCGM"

    # Start test thread
    result_thread.start()
    # Ensure nvvs process has started
    running, debug_output = dcgm_internal_helpers.check_nvvs_process(want_running=True)
    assert running, "Nvvs process did not start within 10 seconds. pgrep output: %s" % debug_output
    start = time.time()

    # Give the test time to exit and verify that the test exits early
    # Test should exit within 75% of test duration if it is going to fail early. Ideally, it should exit within
    # 2 failure checks (~ 4 seconds of test start), but we provide bigger buffer to account for delays in starting
    # the test
    # NOTE(review): the timeout below is a fixed 20 seconds and does not scale with `duration`.
    result_thread.join(20)

    test_exited_early = not result_thread.is_alive() # Cache thread isAlive value until we verify it
    end = time.time()

    # Stop the injection app
    inject_error.Stop()
    inject_error.join()
    # Verify injection app stopped correctly
    assert inject_error.retCode == dcgm_structs.DCGM_ST_OK, \
        "There was an error inserting values into dcgm. Return code: %s" % inject_error.retCode

    if not test_exited_early:
        # Wait for the launched diag to end
        result_thread.join()
        end = time.time()

    # The diag thread has finished by now in both branches, so data[0] holds this run's response.
    response = data[0]
    # Check whether test exited early
    assert test_exited_early, \
        "Expected %s test to exit early. Test took %ss to complete.\nGot result: %s (\ninfo: %s,\n warning: %s)" \
        % (test_name, (end - start),
           response.perGpuResponses[gpuId].results[testIndex].result,
           response.perGpuResponses[gpuId].results[testIndex].info,
           response.perGpuResponses[gpuId].results[testIndex].error.msg)

    # Verify the test failed
    # NOTE(review): the messages below say "dbes", but the value injected above is an XID error.
    assert check_diag_result_fail(response, gpuId, testIndex), \
        "Expected %s test to fail due to injected dbes.\nGot result: %s (\ninfo: %s,\n warning: %s)" % \
        (test_name, response.perGpuResponses[gpuId].results[testIndex].result,
         response.perGpuResponses[gpuId].results[testIndex].info,
         response.perGpuResponses[gpuId].results[testIndex].error.msg)

    ###
    # Rerun the test to verify that the test passes now that there are no inserted errors
    duration = 30
    paramsStr = "%s.test_duration=%s" % (test_name, duration)
    logger.info("Verifying that test passes once xid errors are removed.")
    dd = DcgmDiag.DcgmDiag(gpuIds=[gpuId], testNamesStr=test_name, paramsStr=paramsStr)
    dd.SetFailEarly(checkInterval=3) # enable fail early checks
    dd.SetDebugLogFile(logname % 4)
    # Reset the injected XID error value to 0
    inject_value(handle, gpuId, dcgm_fields.DCGM_FI_DEV_XID_ERRORS, 0, 0)
    # Sleep to ensure no pending errors left
    time.sleep(10)

    response = test_utils.diag_execute_wrapper(dd, handle)
    # Verify the test passed
    assert check_diag_result_pass(response, gpuId, testIndex), \
        "Expected %s test to pass because there are no dbes\nGot result: %s (\ninfo: %s,\n warning: %s)" % \
        (test_name, response.perGpuResponses[gpuId].results[testIndex].result,
         response.perGpuResponses[gpuId].results[testIndex].info,
         response.perGpuResponses[gpuId].results[testIndex].error.msg)
def helper_per_gpu_responses_dcgmi(handle, gpuIds, testName, testParams):
    """
    Verify that pass/fail status for diagnostic tests are reported on a per GPU basis via dcgmi (for both normal
    stdout and JSON output).

    handle     -- DCGM handle used to inject the throttling error
    gpuIds     -- GPUs passed to dcgmi; the error is injected into gpuIds[0]
    testName   -- value for dcgmi's -r option; also the test name expected in the JSON output
    testParams -- value for dcgmi's -p option
    """
    throttle_field = dcgm_fields.DCGM_FI_DEV_CLOCK_THROTTLE_REASONS
    hw_slowdown = dcgm_fields.DCGM_CLOCKS_THROTTLE_REASON_HW_SLOWDOWN
    poll_interval = 0.1
    fail_gpu = gpuIds[0]

    def get_stdout(app):
        # Each stdout line is followed by a single space.
        return "".join(line + " " for line in app.stdout_lines)

    def print_output(app):
        logger.info(get_stdout(app))
        for err_line in app.stderr_lines:
            logger.error(err_line)

    def verify_successful_dcgmi_run(app):
        app.start(timeout=40)
        logger.info("Started dcgmi diag with pid %s" % app.getpid())
        retcode = app.wait()

        if test_utils.is_mig_incompatible_failure(get_stdout(app)):
            app.validate()
            test_utils.skip_test(
                "Skipping this test because MIG is configured incompatibly (preventing access to the whole GPU)"
            )

        # A diag failure is expected here; the exit status is compared against
        # DCGM_ST_NVVS_ISOLATE_ERROR truncated to an unsigned 8-bit value.
        expected_retcode = c_uint8(dcgm_structs.DCGM_ST_NVVS_ISOLATE_ERROR).value
        if retcode != expected_retcode and (app.stderr_lines or app.stdout_lines):
            logger.info("dcgmi output:")
            print_output(app)
        assert retcode == expected_retcode, \
            "Expected dcgmi diag to have retcode %s. Got return code %s" % (expected_retcode, retcode)
        app.validate()  # non-zero exit code must be validated

    def inject_throttling_and_confirm():
        # Use an offset so these errors start after the benign values, then
        # wait until DCGM reports the throttle reason before running dcgmi.
        inject_value(handle, fail_gpu, throttle_field, hw_slowdown, injection_offset, True)
        inject_value(handle, fail_gpu, dcgm_fields.DCGM_FI_DEV_GPU_TEMP, 1000, injection_offset, True)
        assert dcgm_internal_helpers.verify_field_value(fail_gpu, throttle_field, hw_slowdown,
                                                        checkInterval=poll_interval, maxWait=5,
                                                        numMatches=1), \
            "Expected inserted values to be visible in DCGM"

    def json_shows_per_gpu_results(parsed):
        # Checks run in a fixed order; a missing key past the first lookup raises.
        categories = parsed.get("DCGM GPU Diagnostic", {}).get("test_categories", [])
        if len(categories) != 2:
            return False
        stress_category = categories[1]
        if stress_category.get("category", None) != "Stress":
            return False
        first_test = stress_category["tests"][0]
        if first_test["name"] != testName:
            return False
        results = first_test["results"]
        if len(results) < 2:
            return False
        if results[0]["gpu_ids"] != str(fail_gpu):
            return False
        if results[0]["status"] != "Fail":
            return False
        return results[1]["status"] == "Pass"

    inject_throttling_and_confirm()

    # ---- plain stdout output ----
    gpuList = ",".join(list(map(str, gpuIds)))
    args = [
        "diag", "-r", testName, "-p", testParams, "-f", gpuList,
        "--throttle-mask", "0"
    ]
    dcgmiApp = DcgmiApp(args=args)

    logger.info("Verifying stdout output")
    verify_successful_dcgmi_run(dcgmiApp)

    # Crude approximation of verifying the console output: find a line with
    # 'Stress', then a later line with the failure text, then look for
    # 'Warning' on the line that directly follows the failure line.
    fail_gpu_text = "Fail - GPU: %s" % fail_gpu
    remaining_lines = iter(dcgmiApp.stdout_lines)
    stress_header_found = any("Stress" in line for line in remaining_lines)
    fail_gpu_found = stress_header_found and any(fail_gpu_text in line for line in remaining_lines)
    warning_found = False
    if fail_gpu_found:
        line_after_failure = next(remaining_lines, None)
        warning_found = line_after_failure is not None and "Warning" in line_after_failure

    if not (stress_header_found and fail_gpu_found and warning_found):
        logger.info("dcgmi output:")
        print_output(dcgmiApp)

    assert stress_header_found, "Expected to see 'Stress' header in output"
    assert fail_gpu_found, "Expected to see %s in output" % fail_gpu_text
    assert warning_found, "Expected to see 'Warning' in output after GPU failure text"

    # ---- JSON output ----
    inject_throttling_and_confirm()

    logger.info("Verifying JSON output")
    args.append("-j")
    dcgmiApp = DcgmiApp(args=args)
    verify_successful_dcgmi_run(dcgmiApp)

    logger.info("Stopped error injection")

    # The first result must be a failure for the injected GPU and the second a pass.
    parsed_output = json.loads("\n".join(dcgmiApp.stdout_lines))
    verified = json_shows_per_gpu_results(parsed_output)

    if not verified:
        print_output(dcgmiApp)

    assert verified, "dcgmi JSON output did not pass verification"