def Validate(self, validate):
    '''
    Submit a diagnostic request for this group.

    validate: value stored in the request's `validate` field before the
              request is handed to dcgm_agent.dcgmActionValidate_v2().

    Returns whatever dcgm_agent.dcgmActionValidate_v2() returns.
    '''
    request = dcgm_structs.c_dcgmRunDiag_v7()
    request.version = dcgm_structs.dcgmRunDiag_version7
    request.validate = validate
    request.groupId = self._groupId

    return dcgm_agent.dcgmActionValidate_v2(self._dcgmHandle.handle, request)
def RunSpecificTest(self, testName):
    '''
    Submit a diagnostic request for this group that names a single test.

    testName: sequence whose elements are copied, one by one, into the first
              entry of the request's `testNames` array.
              NOTE(review): no length check is done here - presumably the
              name must fit the fixed-size entry; confirm against the struct.

    Returns whatever dcgm_agent.dcgmActionValidate_v2() returns.
    '''
    request = dcgm_structs.c_dcgmRunDiag_v7()
    request.version = dcgm_structs.dcgmRunDiag_version7

    # Element-wise copy of the name into slot 0 of testNames.
    for pos, element in enumerate(testName):
        request.testNames[0][pos] = element

    request.groupId = self._groupId
    request.validate = dcgm_structs.DCGM_POLICY_VALID_NONE

    return dcgm_agent.dcgmActionValidate_v2(self._dcgmHandle.handle, request)
def helper_check_diag_empty_group(handle, gpuIds):
    '''
    Check that a diagnostic request fails with DCGM_ST_GROUP_IS_EMPTY while the
    target group is empty, and returns a response once gpuIds[0] has been
    added to that group.
    '''
    dcgmHandle = pydcgm.DcgmHandle(handle=handle)
    dcgmSystem = dcgmHandle.GetSystem()
    group = dcgmSystem.GetEmptyGroup("test1")

    diagRequest = dcgm_structs.c_dcgmRunDiag_t()
    diagRequest.version = dcgm_structs.dcgmRunDiag_version
    diagRequest.groupId = group.GetId()
    diagRequest.validate = 1

    # First attempt: the group has no members, so the call must raise.
    emptyGroupError = dcgm_structs.dcgmExceptionClass(
        dcgm_structs.DCGM_ST_GROUP_IS_EMPTY)
    with test_utils.assert_raises(emptyGroupError):
        dcgm_agent.dcgmActionValidate_v2(handle, diagRequest)

    # Second attempt: same request, but the group now contains one GPU.
    group.AddGpu(gpuIds[0])
    response = dcgm_agent.dcgmActionValidate_v2(handle, diagRequest)
    assert response, "Should have received a response now that we have a non-empty group"
def test_dcgm_action_run_diag_bad_validation(handle, gpuIds):
    '''
    Test that the diagnostic call raises DCGM_ST_BADPARAM when the request's
    validate field is set one past DCGM_POLICY_VALID_SV_LONG.

    handle: DCGM handle passed straight to dcgm_agent.dcgmActionValidate_v2().
    gpuIds: iterable of GPU IDs; they are sent as a comma-separated gpuList.
    '''
    # Comma-separated list of the GPU IDs, e.g. "0,1,2" (empty string if none).
    gpuIdStr = ",".join(str(gpuId) for gpuId in gpuIds)

    drd = dcgm_structs.c_dcgmRunDiag_t()
    drd.version = dcgm_structs.dcgmRunDiag_version
    drd.validate = dcgm_structs.DCGM_POLICY_VALID_SV_LONG + 1 #use an invalid value
    drd.groupId = 0 #Initializing to 0 in case the constructor above doesn't
    drd.gpuList = gpuIdStr

    # The return value is irrelevant: the call is expected to raise.
    with test_utils.assert_raises(dcgm_structs.dcgmExceptionClass(dcgm_structs.DCGM_ST_BADPARAM)):
        dcgm_agent.dcgmActionValidate_v2(handle, drd, dcgm_structs.dcgmRunDiag_version)
def helper_dcgm_action_run_diag_gpu_list(handle, gpuIds):
    '''
    Test that running the DCGM diagnostic works if you provide a GPU ID list
    rather than a groupId.

    handle: DCGM handle passed straight to dcgm_agent.dcgmActionValidate_v2().
    gpuIds: iterable of GPU IDs; they are sent as a comma-separated gpuList.
    '''
    # Comma-separated list of the GPU IDs, e.g. "0,1,2" (empty string if none).
    gpuIdStr = ",".join(str(gpuId) for gpuId in gpuIds)

    drd = dcgm_structs.c_dcgmRunDiag_t()
    drd.version = dcgm_structs.dcgmRunDiag_version
    drd.validate = dcgm_structs.DCGM_POLICY_VALID_SV_SHORT
    drd.groupId = 0 #Initializing to 0 in case the constructor above doesn't
    drd.gpuList = gpuIdStr

    #this will throw an exception on error; the response itself is not inspected
    dcgm_agent.dcgmActionValidate_v2(handle, drd, dcgm_structs.dcgmRunDiag_version)
def check_gpu_diagnostic(handleObj, settings):
    '''
    Run the diagnostic described by `settings` and mark GPUs with a failed
    test as unhealthy.

    Does nothing when initialize_run_diag_info() reports no active GPU IDs.
    Raises ValueError (carrying the response's systemError) when the system
    error code is not DCGM_FR_OK.
    '''
    runDiagInfo, activeGpuIds = initialize_run_diag_info(settings)
    if len(activeGpuIds) == 0:
        return

    response = dcgm_agent.dcgmActionValidate_v2(handleObj.handle, runDiagInfo)

    if response.systemError.code != dcgm_errors.DCGM_FR_OK:
        raise ValueError(response.systemError)

    # Explicit comparison kept on purpose: only a result equal to False
    # triggers the per-test scan below.
    passiveResult = check_passive_health_checks(response, activeGpuIds)
    if passiveResult == False:
        for gpuIndex in range(response.gpuCount):
            gpuResponse = response.perGpuResponses[gpuIndex]
            for testIndex in range(dcgm_structs.DCGM_PER_GPU_TEST_COUNT):
                testResult = gpuResponse.results[testIndex]
                if testResult.result != dcgm_structs.DCGM_DIAG_RESULT_FAIL:
                    continue

                mark_entity_unhealthy(g_gpus, gpuResponse.gpuId,
                                      BR_ST_FAILED_ACTIVE_HEALTH,
                                      testResult.warning)

                # NVVS marks all subsequent tests as failed so there's no
                # point in looking at the remaining tests for this GPU.
                break
def Execute(self, handle):
    '''
    Submit this object's stored diagnostic request on the given handle.

    handle: DCGM handle passed straight to dcgm_agent.dcgmActionValidate_v2().

    Returns whatever dcgm_agent.dcgmActionValidate_v2() returns when called
    with self.runDiagInfo and self.version.
    '''
    request = self.runDiagInfo
    requestVersion = self.version
    return dcgm_agent.dcgmActionValidate_v2(handle, request, requestVersion)
## identify the newly created group. runDiagInfo.groupId = dcgm_agent.dcgmGroupCreate( handle, dcgm_structs.DCGM_GROUP_DEFAULT, "all_gpus_group") ## Invoke method to get information on the newly created group groupInfo = dcgm_agent.dcgmGroupGetInfo(handle, runDiagInfo.groupId) ## define the actions and validations for those actions to take place runDiagInfo.validate = dcgm_structs.DCGM_POLICY_VALID_SV_SHORT ## This will go ahead and perform a "prologue" diagnostic ## to make sure everything is ready to run ## currently this calls an outside diagnostic binary but eventually ## that binary will be merged into the DCGM framework ## The "response" is a dcgmDiagResponse structure that can be parsed for errors response = dcgm_agent.dcgmActionValidate_v2(handle, runDiagInfo) ## This will perform an "eiplogue" diagnostic that will stress the system ## Currently commented out because it takes several minutes to execute # runDiagInfo.validate = dcgm_structs.DCGM_POLICY_VALID_SV_LONG #response = dcgm_agent.dcgmActionValidate_v2(handle, dcgmRunDiagInfo) ## prime the policy manager to look for ECC, PCIe events ## if a callback occurs the function above is called. Currently the data returned ## corresponds to the error that occurred (PCI, DBE, etc.) but in the future it will be a ## dcgmPolicyViolation_t or similar ret = dcgm_agent.dcgmPolicyRegister( handle, runDiagInfo.groupId, dcgm_structs.DCGM_POLICY_COND_PCI | dcgm_structs.DCGM_POLICY_COND_DBE, None, c_callback) ## trigger the policy loop
def test_dcgm_run_diag(drd, version):
    '''
    Run a short diagnostic with the given request struct and struct version.

    `gpuId` and `handle` are not parameters; they are resolved from the
    surrounding scope. The call raises on error and its response is not
    inspected.
    '''
    drd.validate = 1  # 1 selects a short test
    drd.gpuList = str(gpuId)

    # An error surfaces as an exception from this call
    dcgm_agent.dcgmActionValidate_v2(handle, drd, version)