def helper_throttling_masking_failures(handle, gpuId):
    """
    Verify that an injected HW_SLOWDOWN throttle reason makes the SM Stress
    diag report a throttling failure when the throttle mask is set to 0.

    A short (2 second) SM Stress run is executed first; if the GPU does not
    pass it, the test is skipped instead of failed. Then benign throttle
    values are injected, followed by the real error (plus a GPU temperature
    of 1000), and a 15 second SM Stress run is expected to report throttling.

    handle -- DCGM handle passed through to the diag and injection helpers
    gpuId  -- id of the (fake) GPU the diag runs on and errors are injected to
    """

    def make_sm_stress_diag(seconds):
        # Both runs use a throttle mask of 0: we explicitly want throttle
        # reasons to fail the diag because this test injects them on purpose.
        diag = DcgmDiag.DcgmDiag(gpuIds=[gpuId],
                                 testNamesStr="SM Stress",
                                 paramsStr="sm stress.test_duration=%d" % seconds,
                                 version=dcgm_structs.dcgmRunDiag_version)
        diag.SetThrottleMask(0)
        diag.UseFakeGpus()
        return diag

    #####
    # Phase 1: health check -- skip when the GPU cannot pass a clean run
    health_response = test_utils.diag_execute_wrapper(make_sm_stress_diag(2), handle)
    gpu_is_healthy = check_diag_result_pass(health_response, gpuId,
                                            dcgm_structs.DCGM_SM_STRESS_INDEX)
    if not gpu_is_healthy:
        test_utils.skip_test(
            "Skipping because GPU %s does not pass SM Perf test. "
            "Please verify whether the GPU is supported and healthy." % gpuId)

    #####
    # Phase 2: run with injected throttling
    stress_diag = make_sm_stress_diag(15)

    throttle_field = dcgm_fields.DCGM_FI_DEV_CLOCK_THROTTLE_REASONS
    hw_slowdown = dcgm_fields.DCGM_CLOCKS_THROTTLE_REASON_HW_SLOWDOWN
    poll_interval = 0.1

    logger.info("Injecting benign errors")
    inject_value(handle, gpuId, throttle_field, 3, 1, True)
    # The benign value has to be visible in DCGM before the diag starts
    benign_visible = dcgm_internal_helpers.verify_field_value(
        gpuId, throttle_field, 3,
        checkInterval=poll_interval, maxWait=5, numMatches=1)
    assert benign_visible, "Expected inserted values to be visible in DCGM"

    logger.info("Injecting actual errors")
    inject_value(handle, gpuId, throttle_field, hw_slowdown, injection_offset, True)
    inject_value(handle, gpuId, dcgm_fields.DCGM_FI_DEV_GPU_TEMP, 1000,
                 injection_offset, True)

    logger.info("Started diag")
    stress_response = test_utils.diag_execute_wrapper(stress_diag, handle)

    # The real error must be visible as well. The generous maxWait leaves room
    # for the injection offset to elapse (the original note budgeted a
    # 5 second offset plus matching time and a 1 second buffer).
    error_visible = dcgm_internal_helpers.verify_field_value(
        gpuId, throttle_field, hw_slowdown,
        checkInterval=poll_interval, numMatches=1, maxWait=8)
    assert error_visible, "Expected inserted errors to be visible in DCGM"

    throttled, errMsg = find_throttle_failure(stress_response, gpuId,
                                              dcgm_structs.DCGM_SM_STRESS_INDEX)
    assert throttled, "Expected to find throttling failure, but did not: (%s)" % errMsg
def perform_diag_with_throttle_mask_and_verify(dd, handle, gpuId, inserted_error, throttle_mask, shouldPass,
                                               failureMsg):
    """
    Inject a throttle reason, run the given diag and check whether a
    throttling failure is (or is not) reported for the SM Stress test.

    dd             -- diag object to execute; its throttle mask is updated
                      when throttle_mask is not None
    handle         -- DCGM handle passed through to the injection/diag helpers
    gpuId          -- id of the GPU to inject into and to check results for
    inserted_error -- value injected into DCGM_FI_DEV_CLOCK_THROTTLE_REASONS
    throttle_mask  -- mask passed to dd.SetThrottleMask(), or None to leave
                      the diag's mask untouched
    shouldPass     -- True when no throttling failure is expected, False when
                      one is expected
    failureMsg     -- caller-supplied context appended to the assertion
                      message when the expectation is not met (may be empty
                      or None)

    Raises AssertionError when the injected value never becomes visible in
    DCGM or when the diag result does not match shouldPass.
    """
    fieldId = dcgm_fields.DCGM_FI_DEV_CLOCK_THROTTLE_REASONS
    interval = 0.1

    if throttle_mask is not None:
        dd.SetThrottleMask(throttle_mask)

    inject_value(handle, gpuId, fieldId, inserted_error, injection_offset, True)
    inject_value(handle, gpuId, dcgm_fields.DCGM_FI_DEV_GPU_TEMP, 1000, injection_offset, True)
    # Verify that the inserted values are visible in DCGM before starting the diag
    assert dcgm_internal_helpers.verify_field_value(gpuId, fieldId, inserted_error, checkInterval=interval,
                                                    maxWait=5, numMatches=1), \
        "Expected inserted values to be visible in DCGM"

    # Start the diag
    response = test_utils.diag_execute_wrapper(dd, handle)

    # Check for pass or failure as per the shouldPass parameter
    throttled, errMsg = find_throttle_failure(response, gpuId, dcgm_structs.DCGM_SM_STRESS_INDEX)

    # Surface the caller's explanation; previously the parameter was accepted
    # but silently dropped, so failures lost their scenario-specific context.
    context = (" [%s]" % (failureMsg,)) if failureMsg else ""
    if shouldPass:
        assert not throttled, \
            "Expected to not have a throttling error but found %s%s" % (errMsg, context)
    else:
        assert throttled, \
            "Expected to find a throttling error but did not (%s)%s" % (errMsg, context)
def helper_test_thermal_violations_in_seconds(handle, gpuIds):
    """
    Inject a thermal violation value on the first GPU, run the 'diagnostic'
    test for 10 seconds on fake GPUs and check that the resulting error
    message reports the violation as "2.3 seconds".

    handle -- DCGM handle used for injection and for executing the diag
    gpuIds -- GPU ids the diag runs on; only gpuIds[0] receives the injection
    """
    diag = DcgmDiag.DcgmDiag(gpuIds=gpuIds,
                             testNamesStr='diagnostic',
                             paramsStr='diagnostic.test_duration=10')
    diag.UseFakeGpus()

    target_gpu = gpuIds[0]
    violation_field = dcgm_fields.DCGM_FI_DEV_THERMAL_VIOLATION
    # The expected "2.3 seconds" text below suggests this is a nanosecond
    # count -- presumably; confirm against the field's definition.
    violation_value = 2344122048

    inject_value(handle, target_gpu, violation_field, violation_value, 10, True)

    # The diag must only start once DCGM actually serves the injected value
    value_visible = dcgm_internal_helpers.verify_field_value(
        target_gpu, violation_field, violation_value, maxWait=5, numMatches=1)
    assert value_visible, "Expected inserted values to be visible in DCGM"

    # Run the diag and pull the error text reported for the injected GPU
    response = diag.Execute(handle)
    gpu_results = response.perGpuResponses[target_gpu].results
    errmsg = gpu_results[dcgm_structs.DCGM_DIAGNOSTIC_INDEX].error.msg

    # Match on "hermal" rather than "thermal" because the first letter is
    # sometimes capitalized
    assert "hermal violations" in errmsg, \
        "Thermal violations were injected but not found in error message: '%s'." % errmsg
    assert "totaling 2.3 seconds" in errmsg, \
        "Expected 2.3 seconds of thermal violations but found %s" % errmsg
def helper_per_gpu_responses_api(handle, gpuIds, testDir):
    """
    Verify that pass/fail status for diagnostic tests are reported on a per GPU
    basis via dcgmActionValidate API call

    handle  -- DCGM handle passed through to the injection/diag helpers
    gpuIds  -- GPU ids to check; gpuIds[0] is the one expected to fail
    testDir -- directory handed to the diag as its stats path
    """
    failGpuId = gpuIds[0]

    # Called for its checks only; the diag object it returns is not needed
    helper_verify_diag_passing(handle, gpuIds, useFakeGpus=True)

    failing_diag = DcgmDiag.DcgmDiag(gpuIds=[failGpuId],
                                     testNamesStr="SM Stress",
                                     paramsStr="sm stress.test_duration=15",
                                     version=dcgm_structs.dcgmRunDiag_version)
    # We explicitly want to fail for throttle reasons since this test inserts
    # throttling errors for verification
    failing_diag.SetThrottleMask(0)
    failing_diag.UseFakeGpus()
    failing_diag.SetStatsPath(testDir)
    failing_diag.SetStatsOnFail(1)

    # Setup injection: HW_SLOWDOWN throttle reason plus a GPU temperature of 1000
    throttle_field = dcgm_fields.DCGM_FI_DEV_CLOCK_THROTTLE_REASONS
    hw_slowdown = dcgm_fields.DCGM_CLOCKS_THROTTLE_REASON_HW_SLOWDOWN
    poll_interval = 0.1

    # Use an offset to make these errors start after the benign values
    inject_value(handle, failGpuId, throttle_field, hw_slowdown, injection_offset, True)
    inject_value(handle, failGpuId, dcgm_fields.DCGM_FI_DEV_GPU_TEMP, 1000,
                 injection_offset, True)

    # The injected value has to be visible in DCGM before the diag starts
    error_visible = dcgm_internal_helpers.verify_field_value(
        failGpuId, throttle_field, hw_slowdown,
        checkInterval=poll_interval, maxWait=5, numMatches=1)
    assert error_visible, "Expected inserted values to be visible in DCGM"

    response = test_utils.diag_execute_wrapper(failing_diag, handle)
    logger.info("Started diag")

    # Responses must be per GPU: the first GPU failed, all others did not
    for gpuId in gpuIds:
        throttled, errMsg = find_throttle_failure(response, gpuId,
                                                  dcgm_structs.DCGM_SM_STRESS_INDEX)
        if gpuId != failGpuId:
            assert not throttled, \
                "Expected not to find a throttling error but found '%s'" % errMsg
            continue
        assert throttled, "Expected throttling error but found none (%s)" % errMsg
def helper_test_dcgm_diag_dbe_insertion(handle, gpuIds, testDir):
    """
    Run the 'diagnostic' test for 30 seconds while a background injection
    thread keeps inserting a HW_SLOWDOWN throttle reason on the first GPU,
    then assert that the injection thread finished with DCGM_ST_OK.

    Despite the name, the value injected here is a clock throttle reason.
    The diag's own result is not inspected by this helper.

    handle  -- DCGM handle used for injection and for executing the diag
    gpuIds  -- GPU ids the diag runs on; gpuIds[0] receives the injection
    testDir -- directory handed to the diag as its stats path

    Raises AssertionError when the injected value never becomes visible in
    DCGM or when the injection thread reports a non-OK return code.
    """
    dd = DcgmDiag.DcgmDiag(gpuIds=gpuIds, testNamesStr='diagnostic', paramsStr='diagnostic.test_duration=30')
    dd.SetStatsPath(testDir)
    dd.SetStatsOnFail(1)

    def run(dd):
        # Thread target: the return value of Execute is intentionally unused,
        # and an exception raised here does not propagate to the caller.
        dd.Execute(handle)

    # Setup injection app
    fieldId = dcgm_fields.DCGM_FI_DEV_CLOCK_THROTTLE_REASONS
    insertedError = dcgm_fields.DCGM_CLOCKS_THROTTLE_REASON_HW_SLOWDOWN
    failGpuId = gpuIds[0]
    inject_error = dcgm_internal_helpers.InjectionThread(handle, failGpuId, fieldId, insertedError, offset=5)

    logger.info("Injecting HW_SLOWDOWN throttle error for GPU %s" % failGpuId)
    inject_error.start()
    try:
        # Verify that the inserted values are visible in DCGM before starting the diag.
        # Check for the very value being injected instead of a hard-coded literal.
        assert dcgm_internal_helpers.verify_field_value(failGpuId, fieldId, insertedError, maxWait=5), \
            "Expected inserted values to be visible"

        t = threading.Thread(target=run, args=[dd])
        t.start()
        logger.info("Started diag")

        # Wait for diag to finish
        logger.info("Waiting for diag to finish")
        t.join()
    finally:
        # Always stop error insertion, even when a check above failed;
        # otherwise the thread would keep injecting after this helper exits.
        inject_error.Stop()
        inject_error.join()
        logger.info("Stopped error injection")

    assert inject_error.retCode == dcgm_structs.DCGM_ST_OK, \
        "Error injection failed: %s" % inject_error.retCode
def with_error_run(handle, gpuIds, name, testname, parms=None):
    """
    Runs the given test (testname) through dcgmi and inserts throttling /
    REPLAY_COUNTER errors depending on the test.

    handle   -- DCGM handle used by the injection thread
    gpuIds   -- GPU ids to run on; errors are injected on gpuIds[0] only
    name     -- name of the plugin in nvvs (e.g. constant_perf); used for the
                output file names and to select the injected field
    testname -- value passed to dcgmi's -r option
    parms    -- optional value for dcgmi's -p option; omitted when None

    Logs an error (but does not fail the test) if the dcgmi return code is not
    226 (lower 8 bits of -30/DCGM_ST_NVVS_ERROR), which is expected since the
    test should fail due to inserted errors.

    Since busgrind/PCIe does a diff for the REPLAY_COUNTER field we need to
    insert errors after busgrind has read some zero values for the field. As a
    result, the hardcoded delay of 15 seconds must be adjusted on different
    systems (currently a delay of 15 seconds works for the bstolle-dgx machine).

    Raises AssertionError when the injected value never becomes visible, when
    nvvs does not start (delayed injection only), or when the injection thread
    reports a non-OK return code.
    """
    output_file = OUTPUT_DIR + "/dcgmi_%s_with_err_%s.json" % (name, gpuIds[0])
    log_file = OUTPUT_DIR + "/nvvs_%s_with_err_%s.log" % (name, gpuIds[0])
    gpu_list = ",".join(map(str, gpuIds))
    args = ["diag", "-r", "%s" % testname, "-i", gpu_list, "-j", "-v", "-d", "5",
            "--debugLogFile", "/tmp/nvvs.log"]
    if parms is not None:
        args.extend(["-p", "%s" % parms])
    dcgmi = DcgmiApp(args=args)

    # Default: inject an out-of-range GPU temperature right away
    field_id = dcgm_fields.DCGM_FI_DEV_GPU_TEMP
    value = 1000
    delay = 0
    if name == "busgrind":
        # busgrind diffs the replay counter, so it must see zeros first
        field_id = dcgm_fields.DCGM_FI_DEV_PCIE_REPLAY_COUNTER
        delay = 15

    inject_error = InjectionThread(handle, gpuIds[0], field_id, value)
    # Track whether the thread was started: joining a thread that was never
    # started raises RuntimeError, so cleanup must be conditional.
    injection_started = False
    try:
        if delay == 0:
            inject_error.start()
            injection_started = True
            logger.info("Injecting errors now (field %s, value %s)" % (field_id, value))
            assert dcgm_internal_helpers.verify_field_value(gpuIds[0], field_id, value), \
                "Expected injected value %s for field %s to be visible in DCGM" % (value, field_id)

        start = time.time()
        dcgmi.start(timeout=1500)  # 25min timeout
        logger.info("Started diag with args: %s" % args)

        # Some tests do a diff test for the field values so we must let them see 0 values first
        if delay != 0:
            running, _ = dcgm_internal_helpers.check_nvvs_process(want_running=True)
            assert running, "nvvs did not start"
            logger.info("Nvvs started after %.1fs" % (time.time() - start))
            time.sleep(delay)
            logger.info("Injecting errors now (field %s, value %s)" % (field_id, value))
            inject_error.start()
            injection_started = True
            assert dcgm_internal_helpers.verify_field_value(gpuIds[0], field_id, value, maxWait=3), \
                "Expected injected value %s for field %s to be visible in DCGM" % (value, field_id)

        retcode = dcgmi.wait()
    finally:
        # Stop the injection even when a check above failed, so the thread
        # does not keep injecting after this helper has returned.
        if injection_started:
            inject_error.Stop()
            inject_error.join()

    assert inject_error.retCode == dcgm_structs.DCGM_ST_OK, \
        "Error injection failed: %s" % inject_error.retCode

    copy_nvvs_log("/tmp/nvvs.log", log_file)

    # dcgmi's exit status is the low 8 bits of the (negative) DCGM status code
    expected_retcode = ctypes.c_uint8(dcgm_structs.DCGM_ST_NVVS_ERROR).value
    if retcode != expected_retcode:
        logger.error("Expected retcode to be %s, but retcode of dcgmi is %s" % (expected_retcode, retcode))
    dcgmi.validate()  # Validate because dcgmi returns non zero when the diag fails (expected)
    log_app_output_to_file(dcgmi, output_file)
def verify_early_fail_checks_for_test(handle, gpuId, test_name, testIndex):
    """
    Helper method for verifying the fail early checks for the specified test.

    The check runs the same test four times on gpuId:
      1. a short plain run -- if it does not pass, the whole test is skipped;
      2. a run with fail early enabled and no errors, which must pass and
         must last at least 90% of the requested duration;
      3. a run with fail early enabled while XID errors are injected from a
         background thread, which must exit early and must fail;
      4. a final run after the XID field was reset to 0, which must pass.

    handle    -- DCGM handle passed through to the diag/injection helpers
    gpuId     -- id of the GPU the test runs on and errors are injected to
    test_name -- name of the diag test; also used as the prefix of the
                 test_duration parameter and in the debug log file names
    testIndex -- index of the test's entry in the per-GPU results

    For the Targeted Power test the check is skipped unless developer mode is
    enabled, and the duration of the first three runs is forced to 30 seconds.

    NOTE(review): if one of the asserts between inject_error.start() and
    inject_error.Stop() fails, the injection thread is never stopped --
    consider wrapping that section in try/finally.
    """
    if testIndex == dcgm_structs.DCGM_TARGETED_POWER_INDEX and not option_parser.options.developer_mode:
        # Skip this test since Targeted Power always fails when duration is less than 30 seconds
        test_utils.skip_test("Skipping fail early verification for Targeted Power test. Use developer mode "
                             "to run this test.")
    # Prevent false failures due to min duration requirements for Targeted Power
    duration = 2 if testIndex != dcgm_structs.DCGM_TARGETED_POWER_INDEX else 30
    paramsStr = "%s.test_duration=%s" % (test_name, duration)

    # One-element list used as an out-parameter: the thread target stores the
    # diag response in data[0] so the main thread can read it after join()
    data = [None]
    def runDiag(dd, data):
        # Simple helper method to run a diag (used as thread target)
        data[0] = test_utils.diag_execute_wrapper(dd, handle)

    ###
    # First verify that the given test passes for the gpu.
    # If it doesn't pass, skip test and add note to check GPU health
    logger.info("Checking whether %s test passes on GPU %s" % (test_name, gpuId))
    dd = DcgmDiag.DcgmDiag(gpuIds=[gpuId], testNamesStr=test_name, paramsStr=paramsStr)
    test_name_no_spaces = test_name.replace(" ", "_")
    # Template for the per-run debug log; the run number (1-4) is filled in below
    logname = '/tmp/nv_' + test_name_no_spaces + '%s.log'
    dd.SetDebugLogFile(logname % 1)
    dd.SetDebugLevel(5)
    response = test_utils.diag_execute_wrapper(dd, handle)
    if not check_diag_result_pass(response, gpuId, testIndex):
        test_utils.skip_test("Skipping because GPU %s does not pass %s test. "
                             "Please verify whether the GPU is healthy." % (gpuId, test_name))

    ###
    # Next, verify that the given test passes for the gpu when fail early checks are enabled and no errors are inserted
    logger.info("Checking whether %s test passes on GPU %s with fail early enabled" % (test_name, gpuId))
    # Prevent false failures due to min duration requirements for Targeted Power
    duration = 15 if testIndex != dcgm_structs.DCGM_TARGETED_POWER_INDEX else 30
    paramsStr = "%s.test_duration=%s" % (test_name, duration)
    dd = DcgmDiag.DcgmDiag(gpuIds=[gpuId], testNamesStr=test_name, paramsStr=paramsStr)
    dd.SetFailEarly(checkInterval=2) # enable fail early checks
    dd.SetDebugLogFile(logname % 2)
    dd.SetDebugLevel(5)

    result_thread = threading.Thread(target=runDiag, args=[dd, data])
    result_thread.start()

    # Ensure nvvs process has started
    running, debug_output = dcgm_internal_helpers.check_nvvs_process(want_running=True)
    assert running, "Nvvs process did not start within 10 seconds. pgrep output: %s" % debug_output

    # Timing starts only once nvvs is confirmed running, so the measured span
    # excludes the diag's startup time
    start = time.time()
    result_thread.join()
    end = time.time()

    assert check_diag_result_pass(data[0], gpuId, testIndex), \
        "Expected %s test to pass with fail early enabled and no inserted errors" % test_name
    # Without injected errors the test must not have been cut short
    assert (end - start) >= duration * 0.9, \
        "Expected %s test to run for at least %ss, but it only ran for %ss." % (test_name, duration, end - start)

    ###
    # Verify fail early behavior by inserting an error.
    # Setup test parameters
    # Prevent false failures due to min duration requirements for Targeted Power
    duration = 20 if testIndex != dcgm_structs.DCGM_TARGETED_POWER_INDEX else 30
    paramsStr = "%s.test_duration=%s" % (test_name, duration)
    response = None
    dd = DcgmDiag.DcgmDiag(gpuIds=[gpuId], testNamesStr=test_name, paramsStr=paramsStr)
    dd.SetFailEarly(checkInterval=2) # enable fail early checks
    dd.SetDebugLogFile(logname % 3)

    # Setup threads / processes
    xid_inject_val = 2
    result_thread = threading.Thread(target=runDiag, args=[dd, data])
    inject_error = dcgm_internal_helpers.InjectionThread(handle, gpuId, dcgm_fields.DCGM_FI_DEV_XID_ERRORS,
                                                         xid_inject_val, offset=5)

    logger.info("Verifying fail early behavior for %s test by inserting XIDs." % test_name)
    # Start inserting errors
    inject_error.start()
    # Ensure that inserted errors are visible
    assert \
        dcgm_internal_helpers.verify_field_value(gpuId, dcgm_fields.DCGM_FI_DEV_XID_ERRORS, xid_inject_val,
                                                 checkInterval=0.1, numMatches=5), \
        "Expected inserted value for XIDs to be visible in DCGM"

    # Start test thread
    result_thread.start()

    # Ensure nvvs process has started
    running, debug_output = dcgm_internal_helpers.check_nvvs_process(want_running=True)
    assert running, "Nvvs process did not start within 10 seconds. pgrep output: %s" % debug_output

    start = time.time()

    # Give the test time to exit and verify that the test exits early
    # Test should exit within 75% of test duration if it is going to fail early. Ideally, it should exit within
    # 2 failure checks (~ 4 seconds of test start), but we provide bigger buffer to account for delays in starting
    # the test
    # NOTE(review): the timeout below is a fixed 20 seconds rather than 75% of
    # `duration` (20, or 30 for Targeted Power) -- confirm this is intended.
    result_thread.join(20)

    test_exited_early = not result_thread.is_alive() # Cache thread isAlive value until we verify it
    end = time.time()

    # Stop the injection app
    inject_error.Stop()
    inject_error.join()
    # Verify injection app stopped correctly
    assert inject_error.retCode == dcgm_structs.DCGM_ST_OK, \
        "There was an error inserting values into dcgm. Return code: %s" % inject_error.retCode

    if not test_exited_early:
        # Wait for the launched diag to end
        # (so data[0] holds this run's response and `end` reflects the real
        # completion time reported in the failure message below)
        result_thread.join()
        end = time.time()

    response = data[0]
    # Check whether test exited early
    assert test_exited_early, \
        "Expected %s test to exit early. Test took %ss to complete.\nGot result: %s (\ninfo: %s,\n warning: %s)" \
        % (test_name, (end - start),
           response.perGpuResponses[gpuId].results[testIndex].result,
           response.perGpuResponses[gpuId].results[testIndex].info,
           response.perGpuResponses[gpuId].results[testIndex].error.msg)

    # Verify the test failed
    # NOTE(review): the messages below say "dbes" although the value injected
    # above is an XID error.
    assert check_diag_result_fail(response, gpuId, testIndex), \
        "Expected %s test to fail due to injected dbes.\nGot result: %s (\ninfo: %s,\n warning: %s)" % \
        (test_name, response.perGpuResponses[gpuId].results[testIndex].result,
         response.perGpuResponses[gpuId].results[testIndex].info,
         response.perGpuResponses[gpuId].results[testIndex].error.msg)

    ###
    # Rerun the test to verify that the test passes now that there are no inserted errors
    duration = 30
    paramsStr = "%s.test_duration=%s" % (test_name, duration)
    logger.info("Verifying that test passes once xid errors are removed.")

    dd = DcgmDiag.DcgmDiag(gpuIds=[gpuId], testNamesStr=test_name, paramsStr=paramsStr)
    dd.SetFailEarly(checkInterval=3) # enable fail early checks
    dd.SetDebugLogFile(logname % 4)
    # Reset dbes error (the field written here is the XID errors field)
    inject_value(handle, gpuId, dcgm_fields.DCGM_FI_DEV_XID_ERRORS, 0, 0)
    # Sleep to ensure no pending errors left
    time.sleep(10)

    response = test_utils.diag_execute_wrapper(dd, handle)
    # Verify the test passed
    assert check_diag_result_pass(response, gpuId, testIndex), \
        "Expected %s test to pass because there are no dbes\nGot result: %s (\ninfo: %s,\n warning: %s)" % \
        (test_name, response.perGpuResponses[gpuId].results[testIndex].result,
         response.perGpuResponses[gpuId].results[testIndex].info,
         response.perGpuResponses[gpuId].results[testIndex].error.msg)
def helper_per_gpu_responses_dcgmi(handle, gpuIds, testName, testParams):
    """
    Verify that pass/fail status for diagnostic tests are reported on a per GPU
    basis via dcgmi (for both normal stdout and JSON output).

    handle     -- DCGM handle used for error injection
    gpuIds     -- GPU ids passed to dcgmi; gpuIds[0] is the one expected to fail
    testName   -- value for dcgmi's -r option; also the test name expected in
                  the JSON output
    testParams -- value for dcgmi's -p option
    """

    def get_stdout(app):
        # Every captured line followed by a single space, as one string
        return "".join(line + " " for line in app.stdout_lines)

    def print_output(app):
        logger.info(get_stdout(app))
        for err_line in app.stderr_lines:
            logger.error(err_line)

    def verify_successful_dcgmi_run(app):
        app.start(timeout=40)
        logger.info("Started dcgmi diag with pid %s" % app.getpid())
        retcode = app.wait()

        if test_utils.is_mig_incompatible_failure(get_stdout(app)):
            app.validate()
            test_utils.skip_test(
                "Skipping this test because MIG is configured incompatibly (preventing access to the whole GPU)"
            )

        # A failing diag is expected here, so dcgmi should exit with the low
        # 8 bits of DCGM_ST_NVVS_ISOLATE_ERROR.
        expected_retcode = c_uint8(dcgm_structs.DCGM_ST_NVVS_ISOLATE_ERROR).value
        if retcode != expected_retcode and (app.stderr_lines or app.stdout_lines):
            logger.info("dcgmi output:")
            print_output(app)
        assert retcode == expected_retcode, \
            "Expected dcgmi diag to have retcode %s. Got return code %s" % (expected_retcode, retcode)
        app.validate()  # non-zero exit code must be validated

    # Setup injection
    poll_interval = 0.1
    throttle_field = dcgm_fields.DCGM_FI_DEV_CLOCK_THROTTLE_REASONS
    hw_slowdown = dcgm_fields.DCGM_CLOCKS_THROTTLE_REASON_HW_SLOWDOWN

    def inject_errors_and_wait_until_visible():
        # Use an offset to make these errors start after the benign values
        inject_value(handle, gpuIds[0], throttle_field, hw_slowdown, injection_offset, True)
        inject_value(handle, gpuIds[0], dcgm_fields.DCGM_FI_DEV_GPU_TEMP, 1000,
                     injection_offset, True)
        # The injected value has to be visible in DCGM before the diag starts
        visible = dcgm_internal_helpers.verify_field_value(
            gpuIds[0], throttle_field, hw_slowdown,
            checkInterval=poll_interval, maxWait=5, numMatches=1)
        assert visible, "Expected inserted values to be visible in DCGM"

    def json_reports_per_gpu_results(parsed):
        # Conditions are checked in the same order as a chained `and`, so a
        # later lookup is only attempted once every earlier check held.
        categories = parsed.get("DCGM GPU Diagnostic", {}).get("test_categories", [])
        if len(categories) != 2:
            return False
        stress_category = categories[1]
        if stress_category.get("category", None) != "Stress":
            return False
        first_test = stress_category["tests"][0]
        if first_test["name"] != testName:
            return False
        results = first_test["results"]
        if len(results) < 2:
            return False
        if results[0]["gpu_ids"] != str(gpuIds[0]):
            return False
        if results[0]["status"] != "Fail":
            return False
        return results[1]["status"] == "Pass"

    ###
    # Console output
    inject_errors_and_wait_until_visible()

    gpuList = ",".join([str(gpu) for gpu in gpuIds])
    args = ["diag", "-r", testName, "-p", testParams, "-f", gpuList, "--throttle-mask", "0"]
    dcgmiApp = DcgmiApp(args=args)

    logger.info("Verifying stdout output")
    verify_successful_dcgmi_run(dcgmiApp)

    # Verify dcgmi output shows per gpu results (crude approximation of
    # verifying correct console output). One shared iterator gives a single
    # forward pass: find the "Stress" header, then a later line with the
    # failure text, then look at the line right after it for "Warning".
    fail_gpu_text = "Fail - GPU: %s" % gpuIds[0]
    remaining_lines = iter(dcgmiApp.stdout_lines)
    stress_header_found = any("Stress" in line for line in remaining_lines)
    fail_gpu_found = any(fail_gpu_text in line for line in remaining_lines)
    warning_found = False
    if fail_gpu_found:
        following_line = next(remaining_lines, None)
        warning_found = following_line is not None and "Warning" in following_line

    if not (stress_header_found and fail_gpu_found and warning_found):
        logger.info("dcgmi output:")
        print_output(dcgmiApp)

    assert stress_header_found, "Expected to see 'Stress' header in output"
    assert fail_gpu_found, "Expected to see %s in output" % fail_gpu_text
    assert warning_found, "Expected to see 'Warning' in output after GPU failure text"

    ###
    # JSON output
    inject_errors_and_wait_until_visible()

    logger.info("Verifying JSON output")
    args.append("-j")
    dcgmiApp = DcgmiApp(args=args)
    verify_successful_dcgmi_run(dcgmiApp)

    logger.info("Stopped error injection")

    # Verify per GPU results
    output = json.loads("\n".join(dcgmiApp.stdout_lines))
    verified = json_reports_per_gpu_results(output)

    if not verified:
        print_output(dcgmiApp)

    assert verified, "dcgmi JSON output did not pass verification"