def helper_inject_vgpu_configuration(handle, gpuId, eccModeVal, powerLimitVal,
                                     computeModeVal):
    """
    Helper method to inject configuration into the cache manager
    """
    if eccModeVal is not None:
        # inject a value for ECC mode
        eccMode = dcgm_structs_internal.c_dcgmInjectFieldValue_v1()
        eccMode.version = dcgm_structs_internal.dcgmInjectFieldValue_version1
        eccMode.fieldId = dcgm_fields.DCGM_FI_DEV_ECC_CURRENT
        eccMode.status = 0
        eccMode.fieldType = ord(dcgm_fields.DCGM_FT_INT64)
        eccMode.ts = int((time.time() + 1) * 1000000.0)  # set the injected data into the future
        eccMode.value.i64 = eccModeVal

        ret = dcgm_agent_internal.dcgmInjectFieldValue(handle, gpuId, eccMode)
        assert ret == dcgm_structs.DCGM_ST_OK

    if powerLimitVal is not None:
        powerLimit = dcgm_structs_internal.c_dcgmInjectFieldValue_v1()
        powerLimit.version = dcgm_structs_internal.dcgmInjectFieldValue_version1
        powerLimit.fieldId = dcgm_fields.DCGM_FI_DEV_POWER_MGMT_LIMIT
        powerLimit.status = 0
        powerLimit.fieldType = ord(dcgm_fields.DCGM_FT_DOUBLE)
        powerLimit.ts = int((time.time() + 1) * 1000000.0)  # set the injected data into the future
        powerLimit.value.dbl = powerLimitVal

        ret = dcgm_agent_internal.dcgmInjectFieldValue(handle, gpuId, powerLimit)
        assert ret == dcgm_structs.DCGM_ST_OK

    if computeModeVal is not None:
        computeMode = dcgm_structs_internal.c_dcgmInjectFieldValue_v1()
        computeMode.version = dcgm_structs_internal.dcgmInjectFieldValue_version1
        computeMode.fieldId = dcgm_fields.DCGM_FI_DEV_COMPUTE_MODE
        computeMode.status = 0
        computeMode.fieldType = ord(dcgm_fields.DCGM_FT_INT64)
        computeMode.ts = int((time.time() + 1) * 1000000.0)  # set the injected data into the future
        computeMode.value.i64 = computeModeVal

        ret = dcgm_agent_internal.dcgmInjectFieldValue(handle, gpuId, computeMode)
        assert ret == dcgm_structs.DCGM_ST_OK
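# A minimal usage sketch (illustrative, not part of the suite): because each
# setting is optional, a test that only needs to fake a power limit can pass
# None for the others. `handle` and `gpuId` are assumed to come from the test
# framework, and 150.0 is an arbitrary example value.
def example_inject_power_limit_only(handle, gpuId):
    helper_inject_vgpu_configuration(handle, gpuId, eccModeVal=None,
                                     powerLimitVal=150.0, computeModeVal=None)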
def test_collectd_basic_integration(handle, gpuIds):
    """
    Verifies that we can inject specific data and get that same data back
    """
    dcgmHandle = pydcgm.DcgmHandle(handle)
    dcgmSystem = dcgmHandle.GetSystem()
    specificFieldIds = [
        dcgm_fields.DCGM_FI_DEV_RETIRED_DBE,
        dcgm_fields.DCGM_FI_DEV_RETIRED_SBE,
        dcgm_fields.DCGM_FI_DEV_POWER_VIOLATION,
        dcgm_fields.DCGM_FI_DEV_THERMAL_VIOLATION
    ]
    fieldValues = [1, 5, 1000, 9000]

    for gpuId in gpuIds:
        for i in range(0, len(specificFieldIds)):
            field = dcgm_structs_internal.c_dcgmInjectFieldValue_v1()
            field.version = dcgm_structs_internal.dcgmInjectFieldValue_version1
            field.fieldId = specificFieldIds[i]
            field.status = 0
            field.fieldType = ord(dcgm_fields.DCGM_FT_INT64)
            field.ts = int((time.time() + 10) * 1000000.0)  # set the injected data into the future
            field.value.i64 = fieldValues[i]

            ret = dcgm_agent_internal.dcgmInjectFieldValue(handle, gpuId, field)
            assert ret == dcgm_structs.DCGM_ST_OK

    gvars = collectd_tester_globals.gvars

    assert 'config' in gvars
    gvars['config']()

    assert 'init' in gvars
    gvars['init']()

    assert 'read' in gvars
    gvars['read']()

    assert 'out' in gvars
    outDict = gvars['out']

    assert 'shutdown' in gvars
    # gvars['shutdown']()

    # Verify that we can read back the fields we watch.
    for gpuId in gpuIds:
        assert str(gpuId) in outDict
        gpuDict = outDict[str(gpuId)]

        for i in range(0, len(specificFieldIds)):
            fieldTag = dcgmSystem.fields.GetFieldById(specificFieldIds[i]).tag
            assert fieldTag in gpuDict
            assert gpuDict[fieldTag] == fieldValues[i]
def test_dcgmi_nvlink_nvswitches(handle, gpuIds, switchIds):
    """
    Test dcgmi to display dmon values
    """
    gpuGroupId = str(_create_dcgmi_group(dcgm_structs.DCGM_GROUP_DEFAULT))
    switchGroupId = str(
        _create_dcgmi_group(dcgm_structs.DCGM_GROUP_DEFAULT_NVSWITCHES))

    logger.info("Injected switch IDs:" + str(switchIds))

    _test_valid_args([
        ["nvlink", "-s"]  # Link status should work without parameters
    ])

    # Create a comma-separated list of GPUs
    allGpusCsv = ",".join(map(str, gpuIds))

    # Same for switches, but prefix each one with "nvswitch"
    allSwitchesCsv = ",".join(map(helper_make_switch_string, switchIds))

    switchFieldId = dcgm_fields.DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_0_P00

    # Inject a value for a field for each switch so we can retrieve it
    field = dcgm_structs_internal.c_dcgmInjectFieldValue_v1()
    field.version = dcgm_structs_internal.dcgmInjectFieldValue_version1
    field.fieldId = switchFieldId
    field.status = 0
    field.fieldType = ord(dcgm_fields.DCGM_FT_INT64)
    field.ts = int((time.time() - 5) * 1000000.0)  # 5 seconds ago
    field.value.i64 = 0
    for switchId in switchIds:
        ret = dcgm_agent_internal.dcgmInjectEntityFieldValue(
            handle, dcgm_fields.DCGM_FE_SWITCH, switchId, field)
        assert ret == dcgm_structs.DCGM_ST_OK

    _test_valid_args([
        ["dmon", "-e", "150,155", "-c", "1"],  # run dmon for the default gpu group
        ["dmon", "-e", "150,155", "-c", "1", "-g", gpuGroupId],  # run dmon for a specified gpu group
        ["dmon", "-e", "150,155", "-c", "1", "-g", 'all_gpus'],  # run dmon for a specified group
        ["dmon", "-e", str(switchFieldId), "-c", "1", "-g",
         'all_nvswitches'],  # run dmon for a specified group - re-enable after DCGM-413 is fixed
        ["dmon", "-e", str(switchFieldId), "-c", "1", "-g", switchGroupId],  # run dmon for a specified group
        ["dmon", "-e", "150,155", "-c", "1", "-d", "2000"],  # run dmon with the given delay and the default gpu group
        ["dmon", "-e", "150,155", "-c", "1", "-d", "2000", "-i",
         allGpusCsv],  # run dmon for the given devices and delay
        ["dmon", "-e", str(switchFieldId), "-c", "1", "-d", "2000", "-i",
         allSwitchesCsv]  # run dmon for the given devices and delay
    ])
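# helper_make_switch_string is defined elsewhere in this module. Judging from
# how dmon's -i flag consumes its output above, and the explicit
# 'nvswitch:' + str(switchId) form used later in this file, a sketch of it
# would be:
def example_make_switch_string(switchId):
    return "nvswitch:" + str(switchId)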
def inject_field_value_fp64(handle, gpuId, fieldId, value, offset):
    field = dcgm_structs_internal.c_dcgmInjectFieldValue_v1()
    field.version = dcgm_structs_internal.dcgmInjectFieldValue_version1
    field.fieldId = fieldId
    field.status = 0
    field.fieldType = ord(dcgm_fields.DCGM_FT_DOUBLE)
    field.ts = int((time.time() + offset) * 1000000.0)
    field.value.dbl = value

    return dcgm_agent_internal.dcgmInjectFieldValue(handle, gpuId, field)
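# The `offset` parameter above follows the timestamp convention used
# throughout this file: DCGM field timestamps are microseconds since 1970, so
# "now plus offset seconds" is int((time.time() + offset) * 1000000.0). A
# hypothetical reimplementation of the get_usec_since_1970 helper that other
# tests in this file call would therefore be:
def example_get_usec_since_1970():
    return int(time.time() * 1000000.0)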
def inject_field_value_i64(handle, entityId, fieldId, value, offset,
                           entityGroupId=dcgm_fields.DCGM_FE_GPU):
    field = dcgm_structs_internal.c_dcgmInjectFieldValue_v1()
    field.version = dcgm_structs_internal.dcgmInjectFieldValue_version1
    field.fieldId = fieldId
    field.status = 0
    field.fieldType = ord(dcgm_fields.DCGM_FT_INT64)
    field.ts = int((time.time() + offset) * 1000000.0)
    field.value.i64 = value

    return dcgm_agent_internal.dcgmInjectEntityFieldValue(
        handle, entityGroupId, entityId, field)
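# Illustrative only: with the two helpers above, the hand-built injection
# blocks elsewhere in this file collapse to one-liners. The field ids and
# values below are arbitrary examples borrowed from other tests in this file.
def example_inject_with_helpers(handle, gpuId):
    # fake one retired page due to a DBE, timestamped 10 seconds in the future
    ret = inject_field_value_i64(handle, gpuId,
                                 dcgm_fields.DCGM_FI_DEV_RETIRED_DBE, 1, 10)
    assert ret == dcgm_structs.DCGM_ST_OK

    # fake a 200 W power management limit, timestamped "now"
    ret = inject_field_value_fp64(handle, gpuId,
                                  dcgm_fields.DCGM_FI_DEV_POWER_MGMT_LIMIT,
                                  200.0, 0)
    assert ret == dcgm_structs.DCGM_ST_OK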
def test_dcgm_policy_inject_retiredpages_standalone(handle, gpuIds):
    """
    Verifies that we can inject an error into the retired pages counters and
    receive a callback
    """
    newPolicy = dcgm_structs.c_dcgmPolicy_v1()

    newPolicy.version = dcgm_structs.dcgmPolicy_version1
    newPolicy.condition = dcgm_structs.DCGM_POLICY_COND_MAX_PAGES_RETIRED
    newPolicy.parms[2].tag = 1
    newPolicy.parms[2].val.llval = 5

    # find a GPU that supports ECC and retired pages (otherwise the internal
    # test will ignore it)
    dcgmHandle = pydcgm.DcgmHandle(handle)
    dcgmSystem = dcgmHandle.GetSystem()
    group = dcgmSystem.GetGroupWithGpuIds("test1", gpuIds)
    group.policy.Set(newPolicy)

    callbackQueue = Queue.Queue()
    c_callback = create_c_callback(callbackQueue)
    group.policy.Register(dcgm_structs.DCGM_POLICY_COND_MAX_PAGES_RETIRED,
                          finishCallback=c_callback)

    # inject an error into the retired pages count
    numPages = 10
    field = dcgm_structs_internal.c_dcgmInjectFieldValue_v1()
    field.version = dcgm_structs_internal.dcgmInjectFieldValue_version1
    field.fieldId = dcgm_fields.DCGM_FI_DEV_RETIRED_DBE
    field.status = 0
    field.fieldType = ord(dcgm_fields.DCGM_FT_INT64)
    field.ts = int((time.time() + 60) * 1000000.0)  # set the injected data into the future
    field.value.i64 = numPages
    ret = dcgm_agent_internal.dcgmInjectFieldValue(handle, gpuIds[0], field)
    assert ret == dcgm_structs.DCGM_ST_OK

    # inject an SBE too so that the health check code gets past its internal checks
    field.fieldId = dcgm_fields.DCGM_FI_DEV_RETIRED_SBE
    ret = dcgm_agent_internal.dcgmInjectFieldValue(handle, gpuIds[0], field)
    assert ret == dcgm_structs.DCGM_ST_OK

    # wait for the policy manager to call back
    try:
        callbackData = callbackQueue.get(timeout=POLICY_CALLBACK_TIMEOUT_SECS)
    except Queue.Empty:
        assert False, "Callback never happened"

    # check that the callback occurred with the correct arguments
    assert dcgm_structs.DCGM_POLICY_COND_MAX_PAGES_RETIRED == callbackData.condition, \
        ("error callback was not for retired pages, got: %s" % callbackData.condition)
    assert numPages == callbackData.val.mpr.dbepages, \
        'Expected %s errors but got %s' % (numPages, callbackData.val.mpr.dbepages)
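# create_c_callback is defined elsewhere in this test module. A minimal sketch
# of the pattern (assuming the policy manager invokes the callback with a raw
# pointer to a dcgm_structs.c_dcgmPolicyCallbackResponse_v1) might look like:
import ctypes

def example_create_c_callback(callbackQueue):
    @ctypes.CFUNCTYPE(None, ctypes.c_void_p)
    def c_callback(data):
        # copy the response into Python-owned memory before the pointer is
        # invalidated, then hand it to the waiting test thread via the queue
        response = dcgm_structs.c_dcgmPolicyCallbackResponse_v1()
        ctypes.memmove(ctypes.addressof(response), data,
                       ctypes.sizeof(response))
        callbackQueue.put(response)
    return c_callback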
def test_dcgm_injection_multi_fetch_remote(handle, gpuIds):
    """
    Verify that multi-fetches work with the agent
    """
    gpuId = gpuIds[0]
    NinjectValues = 10
    firstTs = get_usec_since_1970()
    lastTs = 0
    injectedValues = []

    # Inject the values we're going to fetch
    for i in range(NinjectValues):
        fvGood = dcgm_structs_internal.c_dcgmInjectFieldValue_v1()
        fvGood.version = dcgm_structs_internal.dcgmInjectFieldValue_version1
        fvGood.fieldId = dcgm_fields.DCGM_FI_DEV_ECC_PENDING
        fvGood.status = 0
        fvGood.fieldType = ord(dcgm_fields.DCGM_FT_INT64)
        fvGood.ts = firstTs + i
        fvGood.value.i64 = 1 + i

        # This will throw an exception if it fails
        dcgm_agent_internal.dcgmInjectFieldValue(handle, gpuId, fvGood)
        injectedValues.append(fvGood)

    lastTs = fvGood.ts

    # Fetch in forward order with no timestamp. Verify
    startTs = 0
    endTs = 0
    maxCount = 2 * NinjectValues  # Pick a bigger number so we can verify only NinjectValues come back
    order = dcgm_structs.DCGM_ORDER_ASCENDING

    fvFetched = dcgm_agent_internal.dcgmGetMultipleValuesForField(
        handle, gpuId, fvGood.fieldId, maxCount, startTs, endTs, order)
    assert len(fvFetched) == NinjectValues, "Expected %d rows. Got %d" % (
        NinjectValues, len(fvFetched))

    helper_verify_multi_values(fvFetched, order, injectedValues)

    # Now do the same fetch with descending values
    startTs = 0
    endTs = 0
    maxCount = 2 * NinjectValues  # Pick a bigger number so we can verify only NinjectValues come back
    order = dcgm_structs.DCGM_ORDER_DESCENDING

    fvFetched = dcgm_agent_internal.dcgmGetMultipleValuesForField(
        handle, gpuId, fvGood.fieldId, maxCount, startTs, endTs, order)
    assert len(fvFetched) == NinjectValues, "Expected %d rows. Got %d" % (
        NinjectValues, len(fvFetched))

    helper_verify_multi_values(fvFetched, order, injectedValues)
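# helper_verify_multi_values is shared test code; the essence of the ascending
# case it checks is sketched below (hypothetical, assuming the fetched samples
# expose .ts and .value.i64 the same way the injected ones do):
def example_check_ascending(fvFetched, injectedValues):
    for fetched, injected in zip(fvFetched, injectedValues):
        assert fetched.ts == injected.ts
        assert fetched.value.i64 == injected.value.i64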
def helper_test_dcgm_injection_summaries(handle, gpuIds):
    gpuId = gpuIds[0]

    # Watch the field we're inserting into
    dcgm_agent_internal.dcgmWatchFieldValue(
        handle, gpuId, dcgm_fields.DCGM_FI_DEV_ECC_SBE_AGG_TOTAL, 1, 3600.0,
        10000)

    handleObj = pydcgm.DcgmHandle(handle=handle)
    systemObj = handleObj.GetSystem()

    # Make a base value that is good for starters
    field = dcgm_structs_internal.c_dcgmInjectFieldValue_v1()
    field.version = dcgm_structs_internal.dcgmInjectFieldValue_version1
    field.fieldId = dcgm_fields.DCGM_FI_DEV_ECC_SBE_AGG_TOTAL
    field.status = 0
    field.fieldType = ord(dcgm_fields.DCGM_FT_INT64)

    baseTime = get_usec_since_1970()

    for i in range(0, 10):
        field.ts = baseTime + i
        field.value.i64 = i
        ret = dcgm_agent_internal.dcgmInjectFieldValue(handle, gpuId, field)
        assert ret == dcgm_structs.DCGM_ST_OK

    time.sleep(1)
    systemObj.UpdateAllFields(1)

    tmpMask = dcgm_structs.DCGM_SUMMARY_MIN | dcgm_structs.DCGM_SUMMARY_MAX
    tmpMask = tmpMask | dcgm_structs.DCGM_SUMMARY_AVG | dcgm_structs.DCGM_SUMMARY_DIFF

    # Request a window that ends before our injected samples so the first
    # query returns no data
    with test_utils.assert_raises(
            dcgm_structs.dcgmExceptionClass(dcgm_structs.DCGM_ST_NO_DATA)):
        request = dcgm_agent.dcgmGetFieldSummary(
            handle, dcgm_fields.DCGM_FI_DEV_ECC_SBE_AGG_TOTAL,
            dcgm_fields.DCGM_FE_GPU, gpuId, tmpMask, baseTime - 60,
            baseTime - 30)

    # Now adjust the time window so we get values
    request = dcgm_agent.dcgmGetFieldSummary(
        handle, dcgm_fields.DCGM_FI_DEV_ECC_SBE_AGG_TOTAL,
        dcgm_fields.DCGM_FE_GPU, gpuId, tmpMask, 0, 0)

    assert request.response.values[0].i64 == 0  # min
    assert request.response.values[1].i64 == 9  # max
    assert request.response.values[2].i64 == 4  # avg
    assert request.response.values[3].i64 == 9  # diff
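# Illustrative cross-check of the summary values asserted above, computed in
# pure Python from the injected samples 0..9. The response packs the requested
# summaries in the order MIN, MAX, AVG, DIFF here, and AVG comes back as 4
# rather than 4.5 because the field is an int64.
def example_expected_summaries():
    samples = list(range(0, 10))
    return {
        'min': min(samples),                  # 0
        'max': max(samples),                  # 9
        'avg': sum(samples) // len(samples),  # 4
        'diff': samples[-1] - samples[0],     # 9
    }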
def test_dcgm_policy_inject_nvlinkerror_standalone(handle, gpuIds):
    """
    Verifies that we can inject an NVLink error and receive a callback
    """
    newPolicy = dcgm_structs.c_dcgmPolicy_v1()
    newPolicy.version = dcgm_structs.dcgmPolicy_version1
    newPolicy.condition = dcgm_structs.DCGM_POLICY_COND_NVLINK
    newPolicy.parms[5].tag = 0
    newPolicy.parms[5].val.boolean = True

    # find a GPU that supports NVLink (otherwise the internal test will ignore it)
    dcgmHandle = pydcgm.DcgmHandle(handle)
    dcgmSystem = dcgmHandle.GetSystem()
    group = dcgmSystem.GetGroupWithGpuIds('test1', gpuIds)
    group.policy.Set(newPolicy)

    callbackQueue = Queue.Queue()
    c_callback = create_c_callback(callbackQueue)
    group.policy.Register(dcgm_structs.DCGM_POLICY_COND_NVLINK,
                          finishCallback=c_callback)

    field = dcgm_structs_internal.c_dcgmInjectFieldValue_v1()
    field.version = dcgm_structs_internal.dcgmInjectFieldValue_version1
    field.fieldId = dcgm_fields.DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_TOTAL
    field.status = 0
    field.fieldType = ord(dcgm_fields.DCGM_FT_INT64)
    field.ts = int((time.time() + 60) * 1000000.0)  # set the injected data into the future
    field.value.i64 = 1
    ret = dcgm_agent_internal.dcgmInjectFieldValue(handle, gpuIds[0], field)
    assert ret == dcgm_structs.DCGM_ST_OK

    # wait for the policy manager to call back
    try:
        callbackData = callbackQueue.get(timeout=POLICY_CALLBACK_TIMEOUT_SECS)
    except Queue.Empty:
        assert False, "Callback never happened"

    # check that the callback occurred with the correct arguments
    assert dcgm_structs.DCGM_POLICY_COND_NVLINK == callbackData.condition, \
        ("NVLink error callback was not for an NVLink error, got: %s" % callbackData.condition)
    assert dcgm_fields.DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_TOTAL == callbackData.val.nvlink.fieldId, \
        ("Expected fieldId %s but got %s" %
         (dcgm_fields.DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_TOTAL,
          callbackData.val.nvlink.fieldId))
    assert 1 == callbackData.val.nvlink.counter, \
        'Expected 1 NVLink error but got %s' % callbackData.val.nvlink.counter
def helper_dcgm_policy_inject_eccerror(handle, gpuIds):
    """
    Verifies that we can inject an error into the ECC counters and receive a callback
    """
    newPolicy = dcgm_structs.c_dcgmPolicy_v1()
    newPolicy.version = dcgm_structs.dcgmPolicy_version1
    newPolicy.condition = dcgm_structs.DCGM_POLICY_COND_DBE
    newPolicy.parms[0].tag = 0
    newPolicy.parms[0].val.boolean = True

    dcgmHandle = pydcgm.DcgmHandle(handle)
    dcgmSystem = dcgmHandle.GetSystem()
    group = dcgmSystem.GetGroupWithGpuIds("test1", gpuIds)
    group.policy.Set(newPolicy)

    # the order of the callbacks will change once implementation is complete
    callbackQueue = Queue.Queue()
    c_callback = create_c_callback(callbackQueue)
    group.policy.Register(dcgm_structs.DCGM_POLICY_COND_DBE, c_callback, None)

    # inject an error into ECC
    field = dcgm_structs_internal.c_dcgmInjectFieldValue_v1()
    field.version = dcgm_structs_internal.dcgmInjectFieldValue_version1
    field.fieldId = dcgm_fields.DCGM_FI_DEV_ECC_DBE_VOL_DEV
    field.status = 0
    field.fieldType = ord(dcgm_fields.DCGM_FT_INT64)
    field.ts = int((time.time() + 60) * 1000000.0)  # set the injected data into the future
    field.value.i64 = 1
    logger.debug("injecting %s for gpuId %d" % (str(field), gpuIds[0]))
    ret = dcgm_agent_internal.dcgmInjectFieldValue(handle, gpuIds[0], field)
    assert ret == dcgm_structs.DCGM_ST_OK

    # wait for the policy manager to call back
    try:
        callbackData = callbackQueue.get(timeout=POLICY_CALLBACK_TIMEOUT_SECS)
    except Queue.Empty:
        assert False, "Callback never happened"

    # check that the callback occurred with the correct arguments
    assert dcgm_structs.DCGM_POLICY_COND_DBE == callbackData.condition, \
        ("error callback was not for a DBE error, got: %s" % callbackData.condition)
    assert 1 == callbackData.val.dbe.numerrors, \
        'Expected 1 DBE error but got %s' % callbackData.val.dbe.numerrors
    assert dcgm_structs.c_dcgmPolicyConditionDbe_t.LOCATIONS['DEVICE'] == callbackData.val.dbe.location, \
        'got: %s' % callbackData.val.dbe.location
def helper_dcgm_policy_inject_pcierror(handle, gpuIds):
    """
    Verifies that we can inject an error into the PCI counters and receive a callback
    """
    newPolicy = dcgm_structs.c_dcgmPolicy_v1()
    newPolicy.version = dcgm_structs.dcgmPolicy_version1
    newPolicy.condition = dcgm_structs.DCGM_POLICY_COND_PCI
    newPolicy.parms[1].tag = 1
    newPolicy.parms[1].val.llval = 0

    gpuId = gpuIds[0]
    group = pydcgm.DcgmGroup(pydcgm.DcgmHandle(handle), groupName="test1",
                             groupType=dcgm_structs.DCGM_GROUP_EMPTY)
    group.AddGpu(gpuId)
    group.policy.Set(newPolicy)

    callbackQueue = Queue.Queue()
    c_callback = create_c_callback(callbackQueue)
    group.policy.Register(dcgm_structs.DCGM_POLICY_COND_PCI,
                          finishCallback=c_callback)

    field = dcgm_structs_internal.c_dcgmInjectFieldValue_v1()
    field.version = dcgm_structs_internal.dcgmInjectFieldValue_version1
    field.fieldId = dcgm_fields.DCGM_FI_DEV_PCIE_REPLAY_COUNTER
    field.status = 0
    field.fieldType = ord(dcgm_fields.DCGM_FT_INT64)
    field.ts = int((time.time() + 60) * 1000000.0)  # set the injected data into the future
    field.value.i64 = 1
    ret = dcgm_agent_internal.dcgmInjectFieldValue(handle, gpuId, field)
    assert ret == dcgm_structs.DCGM_ST_OK

    # wait for the policy manager to call back
    try:
        callbackData = callbackQueue.get(timeout=POLICY_CALLBACK_TIMEOUT_SECS)
    except Queue.Empty:
        assert False, "Callback never happened"

    # check that the callback occurred with the correct arguments
    assert dcgm_structs.DCGM_POLICY_COND_PCI == callbackData.condition, \
        ("PCI error callback was not for a PCI error, got: %s" % callbackData.condition)
    assert 1 == callbackData.val.pci.counter, \
        'Expected 1 PCI error but got %s' % callbackData.val.pci.counter
def test_reading_specific_data(handle, gpuIds):
    """
    Verifies that we can inject specific data and get that same data back
    """
    dcgmHandle = pydcgm.DcgmHandle(handle)
    dcgmSystem = dcgmHandle.GetSystem()
    specificFieldIds = [
        dcgm_fields.DCGM_FI_DEV_RETIRED_DBE,
        dcgm_fields.DCGM_FI_DEV_POWER_VIOLATION,
        dcgm_fields.DCGM_FI_DEV_THERMAL_VIOLATION,
    ]
    fieldValues = [
        1,
        1000,
        9000,
    ]

    for i in range(0, len(specificFieldIds)):
        field = dcgm_structs_internal.c_dcgmInjectFieldValue_v1()
        field.version = dcgm_structs_internal.dcgmInjectFieldValue_version1
        field.fieldId = specificFieldIds[i]
        field.status = 0
        field.fieldType = ord(dcgm_fields.DCGM_FT_INT64)
        field.ts = int((time.time() + 10) * 1000000.0)  # set the injected data into the future
        field.value.i64 = fieldValues[i]

        ret = dcgm_agent_internal.dcgmInjectFieldValue(handle, gpuIds[0], field)
        assert ret == dcgm_structs.DCGM_ST_OK

    # pylint: disable=undefined-variable
    dr = DcgmReader(fieldIds=specificFieldIds)
    dr.SetHandle(handle)
    latest = dr.GetLatestGpuValuesAsFieldIdDict()

    assert len(latest[gpuIds[0]]) == len(specificFieldIds)

    for i in range(0, len(specificFieldIds)):
        assert latest[gpuIds[0]][specificFieldIds[i]] == fieldValues[i]
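# Illustrative shape of the dict returned by GetLatestGpuValuesAsFieldIdDict
# above (gpuId 0 and the values are just examples): the outer key is the
# gpuId and the inner key is the numeric field id, which is why the asserts
# index it as latest[gpuIds[0]][specificFieldIds[i]].
#
#   { 0: { dcgm_fields.DCGM_FI_DEV_RETIRED_DBE: 1,
#          dcgm_fields.DCGM_FI_DEV_POWER_VIOLATION: 1000,
#          dcgm_fields.DCGM_FI_DEV_THERMAL_VIOLATION: 9000 } }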
def helper_test_dcgm_policy_inject_xiderror(handle, gpuIds):
    """
    Verifies that we can inject an XID error and receive a callback
    """
    newPolicy = dcgm_structs.c_dcgmPolicy_v1()
    newPolicy.version = dcgm_structs.dcgmPolicy_version1
    newPolicy.condition = dcgm_structs.DCGM_POLICY_COND_XID
    newPolicy.parms[6].tag = 0
    newPolicy.parms[6].val.boolean = True

    dcgmHandle = pydcgm.DcgmHandle(handle)

    validDeviceId = -1
    devices = gpuIds
    for x in devices:
        fvSupported = dcgm_agent_internal.dcgmGetLatestValuesForFields(
            handle, x, [dcgm_fields.DCGM_FI_DEV_XID_ERRORS, ])
        if fvSupported[0].value.i64 != dcgmvalue.DCGM_INT64_NOT_SUPPORTED:
            validDeviceId = x
            break
    if validDeviceId == -1:
        test_utils.skip_test(
            "Can only run if at least one GPU that supports XID errors is present")

    group = pydcgm.DcgmGroup(dcgmHandle, groupName="test1",
                             groupType=dcgm_structs.DCGM_GROUP_EMPTY)
    group.AddGpu(validDeviceId)
    group.policy.Set(newPolicy)

    callbackQueue = Queue.Queue()
    c_callback = create_c_callback(callbackQueue)
    group.policy.Register(dcgm_structs.DCGM_POLICY_COND_XID,
                          finishCallback=c_callback)

    field = dcgm_structs_internal.c_dcgmInjectFieldValue_v1()
    field.version = dcgm_structs_internal.dcgmInjectFieldValue_version1
    field.fieldId = dcgm_fields.DCGM_FI_DEV_XID_ERRORS
    field.status = 0
    field.fieldType = ord(dcgm_fields.DCGM_FT_INT64)
    field.ts = int((time.time() + 60) * 1000000.0)  # set the injected data into the future
    field.value.i64 = 16
    ret = dcgm_agent_internal.dcgmInjectFieldValue(handle, validDeviceId, field)
    assert ret == dcgm_structs.DCGM_ST_OK

    # wait for the policy manager to call back
    try:
        callbackData = callbackQueue.get(timeout=POLICY_CALLBACK_TIMEOUT_SECS)
    except Queue.Empty:
        assert False, "Callback never happened"

    # check that the callback occurred with the correct arguments
    assert dcgm_structs.DCGM_POLICY_COND_XID == callbackData.condition, \
        ("XID error callback was not for an XID error, got: %s" % callbackData.condition)
    assert 16 == callbackData.val.xid.errnum, \
        'Expected XID error 16 but got %s' % callbackData.val.xid.errnum
), "Failed to set configuration for the group: %s" % ret dcgm_agent.dcgmStatusClear(statusHandle) helper_verify_power_value_standalone(handle, groupId, powerLimit_set) ret = dcgm_agent.dcgmPolicySet(handle, groupId, newPolicy, statusHandle) assert (ret == dcgm_structs.DCGM_ST_OK) time.sleep(5) # give the policy manager a chance to start requestId = dcgm_agent.dcgmPolicyRegister( handle, groupId, dcgm_structs.DCGM_POLICY_COND_MAX_PAGES_RETIRED, c_callback, c_callback) assert (requestId != None) # inject an error into page retirement field = dcgm_structs_internal.c_dcgmInjectFieldValue_v1() field.version = dcgm_structs_internal.dcgmInjectFieldValue_version1 field.fieldId = dcgm_fields.DCGM_FI_DEV_RETIRED_DBE field.status = 0 field.fieldType = ord(dcgm_fields.DCGM_FT_INT64) field.ts = int( (time.time() + 11) * 1000000.0) # set the injected data into the future field.value.i64 = 10 ret = dcgm_agent_internal.dcgmInjectFieldValue(handle, validDevice, field) assert (ret == dcgm_structs.DCGM_ST_OK) now = time.time() while not callbackCalled: if time.time() == now + 60: # wait 60 seconds
def test_dcgm_injection_agent(handle, gpuIds):
    """
    Verifies that injection works with the agent host engine
    """
    gpuId = gpuIds[0]

    # Make a base value that is good for starters
    fvGood = dcgm_structs_internal.c_dcgmInjectFieldValue_v1()
    fvGood.version = dcgm_structs_internal.dcgmInjectFieldValue_version1
    fvGood.fieldId = dcgm_fields.DCGM_FI_DEV_ECC_CURRENT
    fvGood.status = 0
    fvGood.fieldType = ord(dcgm_fields.DCGM_FT_INT64)
    fvGood.ts = get_usec_since_1970()
    fvGood.value.i64 = 1

    fieldInfoBefore = dcgm_agent_internal.dcgmGetCacheManagerFieldInfo(
        handle, gpuId, fvGood.fieldId)
    countBefore = fieldInfoBefore.numSamples

    # This will throw an exception if it fails
    dcgm_agent_internal.dcgmInjectFieldValue(handle, gpuId, fvGood)

    fieldInfoAfter = dcgm_agent_internal.dcgmGetCacheManagerFieldInfo(
        handle, gpuId, fvGood.fieldId)
    countAfter = fieldInfoAfter.numSamples

    assert countAfter > countBefore, "Expected countAfter %d > countBefore %d after injection" % (
        countAfter, countBefore)

    # Fetch the value we just inserted and verify its attributes are the same
    fvFetched = dcgm_agent_internal.dcgmGetLatestValuesForFields(
        handle, gpuId, [fvGood.fieldId, ])[0]
    helper_verify_fv_equal(fvFetched, fvGood)

    # Should be able to insert a null timestamp. The agent will just use "now"
    fvAlsoGood = fvGood
    fvAlsoGood.ts = 0
    # This will throw an exception if it fails
    dcgm_agent_internal.dcgmInjectFieldValue(handle, gpuId, fvAlsoGood)

    # Now make some attributes bad and expect an error
    fvBad = fvGood
    fvBad.fieldType = ord(dcgm_fields.DCGM_FT_DOUBLE)
    with test_utils.assert_raises(
            dcgmExceptionClass(dcgm_structs.DCGM_ST_BADPARAM)):
        dcgm_agent_internal.dcgmInjectFieldValue(handle, gpuId, fvBad)
    fvGood.fieldType = ord(dcgm_fields.DCGM_FT_INT64)

    """ TODO: DCGM-2130 - Restore this test when protobuf is removed
    #Now make some attributes bad and expect an error
    fvBad = fvGood
    fvBad.version = 0
    with test_utils.assert_raises(dcgmExceptionClass(dcgm_structs.DCGM_ST_VER_MISMATCH)):
        dcgm_agent_internal.dcgmInjectFieldValue(handle, gpuId, fvBad)
    fvGood.version = dcgm_structs_internal.dcgmInjectFieldValue_version1
    """

    fvBad = fvGood
    fvBad.fieldId = dcgm_fields.DCGM_FI_MAX_FIELDS + 100
    with test_utils.assert_raises(
            dcgmExceptionClass(dcgm_structs.DCGM_ST_BADPARAM)):
        dcgm_agent_internal.dcgmInjectFieldValue(handle, gpuId, fvBad)
def test_dcgmi_dmon(handle, gpuIds, switchIds, instanceIds, ciIds):
    """
    Test dcgmi to display dmon values
    """
    gpuGroupId = str(_create_dcgmi_group(dcgm_structs.DCGM_GROUP_DEFAULT))
    switchGroupId = str(
        _create_dcgmi_group(dcgm_structs.DCGM_GROUP_DEFAULT_NVSWITCHES))

    logger.info("Injected switch IDs:" + str(switchIds))

    # Create a comma-separated list of GPUs
    allGpusCsv = ",".join(map(str, gpuIds))

    # All GPU instances, prefixed with "instance:"
    allInstancesCsv = ",".join(
        map(lambda x: ("instance:" + str(x)), instanceIds))

    # All compute instances, prefixed with "ci:"
    allCisCsv = ",".join(map(lambda x: ("ci:" + str(x)), ciIds))

    # Same for switches, but prefix each one with "nvswitch"
    allSwitchesCsv = ",".join(map(helper_make_switch_string, switchIds))

    switchFieldId = dcgm_fields.DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_0_P00

    # Inject a value for a field for each switch so we can retrieve it
    field = dcgm_structs_internal.c_dcgmInjectFieldValue_v1()
    field.version = dcgm_structs_internal.dcgmInjectFieldValue_version1
    field.fieldId = switchFieldId
    field.status = 0
    field.fieldType = ord(dcgm_fields.DCGM_FT_INT64)
    field.ts = int((time.time() - 5) * 1000000.0)  # 5 seconds ago
    field.value.i64 = 0
    for switchId in switchIds:
        ret = dcgm_agent_internal.dcgmInjectEntityFieldValue(
            handle, dcgm_fields.DCGM_FE_SWITCH, switchId, field)
        assert ret == dcgm_structs.DCGM_ST_OK

    _test_valid_args([
        ["dmon", "-e", "150,155", "-c", "1"],  # run dmon for the default gpu group
        ["dmon", "-e", "150,155", "-c", "1", "-g", gpuGroupId],  # run dmon for a specified gpu group
        ["dmon", "-e", "150,155", "-c", "1", "-g", 'all_gpus'],  # run dmon for a specified group
        ["dmon", "-e", str(switchFieldId), "-c", "1", "-g",
         'all_nvswitches'],  # run dmon for a specified group - re-enable after DCGM-413 is fixed
        ["dmon", "-e", str(switchFieldId), "-c", "1", "-g", switchGroupId],  # run dmon for a specified group
        ["dmon", "-e", "150,155", "-c", "1", "-d", "2000"],  # run dmon with the given delay and the default gpu group
        ["dmon", "-e", "150,155", "-c", "1", "-d", "2000", "-i",
         allGpusCsv],  # run dmon for the given devices and delay
        ["dmon", "-e", "150,155", "-c", "1", "-d", "2000", "-i",
         allInstancesCsv],
        ["dmon", "-e", "150,155", "-c", "1", "-d", "2000", "-i", allCisCsv],
        ["dmon", "-e", "150,155", "-c", "1", "-d", "2000", "-i",
         allGpusCsv + "," + allInstancesCsv + "," + allCisCsv],
        ["dmon", "-e", str(switchFieldId), "-c", "1", "-d", "2000", "-i",
         allSwitchesCsv]  # run dmon for the given devices and delay
    ])

    # Run tests that take a gpuId as an argument
    for gpu in gpuIds:
        _test_valid_args([
            ["dmon", "-e", "150", "-c", "1", "-i", str(gpu)],  # run dmon for one gpu
            ["dmon", "-e", "150", "-c", "1", "-i", 'gpu:' + str(gpu)],  # run dmon for one gpu, tagged as gpu:
            ["dmon", "-e", "150", "-c", "1", "-i", str(gpu)],  # run dmon for the given device and count value
            ["dmon", "-e", "150,155", "-c", "1", "-i",
             str(gpu)]  # run dmon for the given device, default delay, and the given field ids
        ])

    # Run tests that take an NvSwitch as an argument
    for switchId in switchIds:
        _test_valid_args([
            ["dmon", "-e", str(switchFieldId), "-c", "1", "-i",
             'nvswitch:' + str(switchId)],  # run dmon for one nvswitch, tagged as nvswitch:
        ])

    hugeGpuCsv = ",".join(
        map(str, range(0, dcgm_structs.DCGM_MAX_NUM_DEVICES * 2, 1)))

    _test_invalid_args([
        ["dmon", "-c", "1"],  # run without required fields
        ["dmon", "-e", "-150", "-c", "1", "-i", "1"],  # run with an invalid field id
        ["dmon", "-e", "150", "-c", "1", "-i", "-2"],  # run with an invalid gpu id
        ["dmon", "-e", "150", "-c", "1", "-i", "gpu:999"],  # run with an invalid gpu id
        ["dmon", "-e", "150", "-c", "1", "-g", "999"],  # run with an invalid group id
        ["dmon", "-i", hugeGpuCsv, "-e", "150", "-c", "1"],  # run with an invalid number of devices
        ["dmon", "-i", "instance:2000", "-e", "150", "-c", "1"],  # run with an invalid gpu instance
        ["dmon", "-i", "ci:2000", "-e", "150", "-c", "1"],  # run with an invalid compute instance
        ["dmon", "-e", "150", "f", "0", "-c", "1", "-i", "0,1,765"],  # run with an invalid device id (non-existent id)
        ["dmon", "-e", "150", "-c", "-1", "-i", "1"],  # run with an invalid count value
        ["dmon", "-e", "150", "-c", "1", "-i", "1", "-d", "-1"],  # run with an invalid delay (negative value)
        ["dmon", "-f", "-9", "-c", "1", "-i", "1", "-d", "10000"],  # run with an invalid field id
        ["dmon", "-f", "150", "-c", "1", "-i", "0", "-d", "99"]  # run with an invalid delay value
    ])