def _sumMetadata(handle, getForFieldFn, getForAllFieldsFn, metaAttr):
    '''
    Return a 3-tuple where the first entry is the aggregate of summing the metadata for
    every field, the second entry is the total before aggregating and the third entry is
    the total after aggregating.
    '''
    system = pydcgm.DcgmSystem(pydcgm.DcgmHandle(handle))
    group = pydcgm.DcgmGroup(pydcgm.DcgmHandle(handle),
                             groupName="test-metadata",
                             groupType=dcgm_structs.DCGM_GROUP_DEFAULT)

    # watch every field possible on all GPUs
    watchedFields = test_utils.watch_all_fields(handle, group.GetGpuIds())

    system.introspect.UpdateAll()

    # Get the total before and after to account for any slight changes
    # in total memory usage while the individual field amounts are being summed
    startVal = getattr(getForAllFieldsFn().aggregateInfo, metaAttr)

    aggregateVal = sum(
        getattr(getForFieldFn(handle, fieldId).aggregateInfo, metaAttr)
        for fieldId in watchedFields)

    endVal = getattr(getForAllFieldsFn().aggregateInfo, metaAttr)

    return aggregateVal, startVal, endVal
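# The memory test further below calls _metadata_get_aggregate_fields_equals_total(), whose body is
# not part of this section. The function below is only a minimal sketch of how such a helper could
# be built on top of _sumMetadata; the bracketing check against the start/end totals is an
# assumption suggested by the docstring above, not the actual implementation.
def _metadata_get_aggregate_fields_equals_total_sketch(handle, getForFieldFn,
                                                       getForAllFieldsFn, metaAttr):
    aggregateVal, startVal, endVal = _sumMetadata(handle, getForFieldFn,
                                                  getForAllFieldsFn, metaAttr)

    # The per-field sum should land between the totals taken before and after summing,
    # since the hostengine may allocate or free a small amount of metadata in the meantime.
    lowerBound = min(startVal, endVal)
    upperBound = max(startVal, endVal)
    assert lowerBound <= aggregateVal <= upperBound, \
        'sum of per-field %s (%s) is outside the totals measured before (%s) and after (%s)' \
        % (metaAttr, aggregateVal, startVal, endVal)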
def test_dcgm_embedded_metadata_memory_get_field_sane(handle):
    '''
    Sanity test for API that gets memory usage of a single field
    '''
    if not option_parser.options.developer_mode:
        test_utils.skip_test("Skipping developer test.")

    handleObj = pydcgm.DcgmHandle(handle=handle)

    fieldIds = [dcgm_fields.DCGM_FI_DEV_GPU_TEMP, ]
    fieldGroup = pydcgm.DcgmFieldGroup(handleObj, "my_field_group", fieldIds)

    group = pydcgm.DcgmGroup(pydcgm.DcgmHandle(handle),
                             groupName="test-metadata",
                             groupType=dcgm_structs.DCGM_GROUP_DEFAULT)
    system = pydcgm.DcgmSystem(pydcgm.DcgmHandle(handle))

    _watch_field_group_basic(fieldGroup, handle, group.GetId())
    system.introspect.UpdateAll()

    memoryInfo = dcgm_agent_internal.dcgmIntrospectGetFieldMemoryUsage(handle, fieldIds[0])

    logger.debug("field %s using %.2f KB" % (fieldIds[0], memoryInfo.aggregateInfo.bytesUsed / 1024.))

    # 0+ to 200 KB
    assert (0 < memoryInfo.aggregateInfo.bytesUsed < 1024 * 200), \
        'bytes used to store field was unreasonable for ID %s, bytes: %s' \
        % (fieldIds[0], memoryInfo.aggregateInfo.bytesUsed)
def test_dcgm_prof_initial_valid_record(handle, gpuIds):
    '''
    Test that we can retrieve a valid FV for a profiling field immediately after watching
    '''
    dcgmHandle = pydcgm.DcgmHandle(handle=handle)
    dcgmSystem = dcgmHandle.GetSystem()
    dcgmGroup = dcgmSystem.GetGroupWithGpuIds('mygroup', gpuIds)

    helper_check_profiling_environment(dcgmGroup)

    fieldIds = helper_get_single_pass_field_ids(dcgmGroup)
    assert fieldIds is not None

    # Set watches using a large interval so we don't get a record for 10 seconds in the bug case
    dcgmGroup.profiling.WatchFields(fieldIds, 10000000, 3600.0, 0)

    gpuId = gpuIds[0]

    fieldValues = dcgm_agent.dcgmEntityGetLatestValues(handle, dcgm_fields.DCGM_FE_GPU, gpuId, fieldIds)
    assert len(fieldValues) == len(fieldIds), "%d != %d" % (len(fieldValues), len(fieldIds))

    for i, fieldValue in enumerate(fieldValues):
        logger.info(str(fieldValue))
        assert (fieldValue.version != 0), "idx %d Version was 0" % i
        assert (fieldValue.fieldId == fieldIds[i]), "idx %d fieldValue.fieldId %d != fieldIds[i] %d" % (
            i, fieldValue.fieldId, fieldIds[i])
        assert (fieldValue.status == dcgm_structs.DCGM_ST_OK), "idx %d status was %d" % (i, fieldValue.status)
        # The following line catches the bug in Jira DCGM-1357. Previously, a record would be
        # returned with a 0 timestamp
        assert (fieldValue.ts != 0), "idx %d timestamp was 0" % i
def test_dcgm_prof_with_dcgmreader(handle, gpuIds):
    """
    Verifies that we can access profiling data with DcgmReader, which is the
    base class for dcgm exporters
    """
    dcgmHandle = pydcgm.DcgmHandle(handle)
    dcgmSystem = dcgmHandle.GetSystem()
    dcgmGroup = dcgmSystem.GetGroupWithGpuIds('mygroup', gpuIds)

    helper_check_profiling_environment(dcgmGroup)

    fieldIds = helper_get_single_pass_field_ids(dcgmGroup)

    updateFrequencyUsec = 10000
    sleepTime = 2 * (updateFrequencyUsec / 1000000.0)  # Sleep 2x the update freq so we get new values each time

    dr = DcgmReader.DcgmReader(fieldIds=fieldIds, updateFrequency=updateFrequencyUsec,
                               maxKeepAge=30.0, gpuIds=gpuIds)
    dr.SetHandle(handle)

    for i in range(10):
        time.sleep(sleepTime)

        latest = dr.GetLatestGpuValuesAsFieldIdDict()
        logger.info(str(latest))

        for gpuId in gpuIds:
            assert len(latest[gpuId]) == len(fieldIds), "i=%d, gpuId %d, len %d != %d" % (
                i, gpuId, len(latest[gpuId]), len(fieldIds))
def test_dcgm_vgpu_config_get_validate(handle):
    """
    Validates structure version
    """
    handleObj = pydcgm.DcgmHandle(handle=handle)
    systemObj = handleObj.GetSystem()
    gpuIdList = systemObj.discovery.GetAllGpuIds()
    assert len(gpuIdList) >= 0, "Not able to find devices on the node for embedded case"

    groupId = dcgm_agent.dcgmGroupCreate(handle, dcgm_structs.DCGM_GROUP_DEFAULT, "test1")
    groupInfo = dcgm_agent.dcgmGroupGetInfo(handle, groupId)
    status_handle = dcgm_agent.dcgmStatusCreate()

    with test_utils.assert_raises(dcgmExceptionClass(dcgm_structs.DCGM_ST_VER_MISMATCH)):
        versionTest = 0  # invalid version
        ret = vtDcgmVgpuConfigGet(handle, groupId, dcgm_structs.DCGM_CONFIG_CURRENT_STATE,
                                  groupInfo.count, status_handle, versionTest)

    with test_utils.assert_raises(dcgmExceptionClass(dcgm_structs.DCGM_ST_VER_MISMATCH)):
        versionTest = 50  # random number version
        ret = vtDcgmVgpuConfigGet(handle, groupId, dcgm_structs.DCGM_CONFIG_CURRENT_STATE,
                                  groupInfo.count, status_handle, versionTest)
def StartAppOnGpus(handle):
    dcgmHandle = pydcgm.DcgmHandle(handle=handle)
    dcgmSystem = dcgmHandle.GetSystem()

    allGpuIds = dcgmSystem.discovery.GetAllGpuIds()

    gpuInfoList = []
    addedPids = []

    for gpuId in allGpuIds:
        gpuAttrib = dcgmSystem.discovery.GetGpuAttributes(gpuId)
        gpuInfoList.append((gpuId, gpuAttrib.identifiers.pciBusId))

    for info in gpuInfoList:
        gpuId = info[0]
        busId = info[1]
        appTimeout = int(1000)  # milliseconds

        # Start a cuda app so we have something to be accounted for
        appParams = ["--ctxCreate", busId,
                     "--busyGpu", busId, str(appTimeout),
                     "--ctxDestroy", busId]
        app = apps.CudaCtxCreateAdvancedApp(
            appParams, env=test_utils.get_cuda_visible_devices_env(handle, gpuId))
        app.start(appTimeout * 2)
        pid = app.getpid()
        addedPids.append(pid)
        app.wait()
        app.terminate()
        app.validate()
        logger.info("Started PID %d." % pid)

    return addedPids
def test_dcgm_policy_get_validate(handle):
    """
    Validates structure version
    """
    handleObj = pydcgm.DcgmHandle(handle=handle)
    systemObj = handleObj.GetSystem()
    gpuIdList = systemObj.discovery.GetAllGpuIds()
    assert len(gpuIdList) >= 0, "Not able to find devices on the node for embedded case"

    groupId = dcgm_agent.dcgmGroupCreate(handle, dcgm_structs.DCGM_GROUP_DEFAULT, "test1")
    status_handle = dcgm_agent.dcgmStatusCreate()
    count = 1
    diagLevel = dcgm_structs.DCGM_DIAG_LVL_SHORT

    with test_utils.assert_raises(dcgmExceptionClass(dcgm_structs.DCGM_ST_VER_MISMATCH)):
        versionTest = 0  # invalid version
        ret = vtDcgmPolicyGet(handle, groupId, count, status_handle, versionTest)

    with test_utils.assert_raises(dcgmExceptionClass(dcgm_structs.DCGM_ST_VER_MISMATCH)):
        versionTest = 50  # random number version
        ret = vtDcgmPolicyGet(handle, groupId, count, status_handle, versionTest)
def test_dcgm_modules_get_statuses(handle):
    '''
    Do a basic sanity check of the DCGM module statuses returned
    '''
    dcgmHandle = pydcgm.DcgmHandle(handle=handle)
    dcgmSystem = dcgmHandle.GetSystem()

    ms = dcgmSystem.modules.GetStatuses()

    assert ms.numStatuses == dcgm_structs.DcgmModuleIdCount, "%d != %d" % (
        ms.numStatuses, dcgm_structs.DcgmModuleIdCount)
    assert ms.statuses[0].id == dcgm_structs.DcgmModuleIdCore, "%d != %d" % (
        ms.statuses[0].id, dcgm_structs.DcgmModuleIdCore)
    assert ms.statuses[0].status == dcgm_structs.DcgmModuleStatusLoaded, "%d != %d" % (
        ms.statuses[0].status, dcgm_structs.DcgmModuleStatusLoaded)

    for i in range(1, ms.numStatuses):
        # .id == index
        assert ms.statuses[i].id == i, "%d != %d" % (ms.statuses[i].id, i)
        # Assert that all non-core modules aren't loaded, besides NvSwitch. That one can be loaded
        # because creating default groups causes an RPC to the NvSwitch manager
        if ms.statuses[i].id != dcgm_structs.DcgmModuleIdNvSwitch:
            assert ms.statuses[i].status == dcgm_structs.DcgmModuleStatusNotLoaded, "%d != %d" % (
                ms.statuses[i].status, dcgm_structs.DcgmModuleStatusNotLoaded)
def helper_dcgm_config_get_attributes(handle):
    handleObj = pydcgm.DcgmHandle(handle=handle)
    systemObj = handleObj.GetSystem()
    groupObj = systemObj.GetDefaultGroup()
    gpuIdList = groupObj.GetGpuIds()

    for gpuId in gpuIdList:
        attributes = systemObj.discovery.GetGpuAttributes(gpuId)
        assert (attributes.identifiers.deviceName != dcgmvalue.DCGM_STR_NOT_SUPPORTED
                and attributes.identifiers.deviceName != dcgmvalue.DCGM_STR_NOT_FOUND
                and attributes.identifiers.deviceName != dcgmvalue.DCGM_STR_NOT_PERMISSIONED), \
            "Not able to find attributes"

        # We used to assert that attributes.clockSets.count was > 0. This was because the NVML
        # internal API that provided it bypassed the SKU check. If nvidia-smi -q -d SUPPORTED_CLOCKS
        # returns N/A, we will no longer have clockSets.
        for i in range(attributes.clockSets.count):
            memClock = attributes.clockSets.clockSet[i].memClock
            smClock = attributes.clockSets.clockSet[i].smClock

            assert 0 < memClock < 10000, "gpuId %d got memClock out of range 0 - 10000: %d" % (gpuId, memClock)
            assert 0 < smClock < 10000, "gpuId %d got smClock out of range 0 - 10000: %d" % (gpuId, smClock)
def helper_dcgm_verify_sync_boost_single_gpu(handle, gpuIds):
    handleObj = pydcgm.DcgmHandle(handle=handle)
    systemObj = handleObj.GetSystem()
    groupObj = systemObj.GetEmptyGroup("test1")

    ## Add first GPU to the group
    groupObj.AddGpu(gpuIds[0])
    gpuIds = groupObj.GetGpuIds()  # Only reference GPUs we are testing against

    ## Set the sync boost for the group
    config_values = dcgm_structs.c_dcgmDeviceConfig_v1()
    config_values.mEccMode = dcgmvalue.DCGM_INT32_BLANK
    config_values.mPerfState.syncBoost = 1
    config_values.mPerfState.targetClocks.memClock = dcgmvalue.DCGM_INT32_BLANK
    config_values.mPerfState.targetClocks.smClock = dcgmvalue.DCGM_INT32_BLANK
    config_values.mComputeMode = dcgmvalue.DCGM_INT32_BLANK
    config_values.mPowerLimit.type = dcgmvalue.DCGM_INT32_BLANK
    config_values.mPowerLimit.val = dcgmvalue.DCGM_INT32_BLANK

    # Config Set must return DCGM_ST_BADPARAM since we only have a single GPU
    with test_utils.assert_raises(dcgm_structs.dcgmExceptionClass(dcgm_structs.DCGM_ST_BADPARAM)):
        groupObj.config.Set(config_values)

    groupObj.Delete()
def test_dcgm_embedded_metadata_exectime_get_all_fields_sane(handle):
    """
    Sanity test for API that gets execution time of all fields together
    """
    if not option_parser.options.developer_mode:
        test_utils.skip_test("Skipping developer test.")

    handle = pydcgm.DcgmHandle(handle)
    system = pydcgm.DcgmSystem(handle)
    group = pydcgm.DcgmGroup(handle, groupName="metadata-test", groupType=dcgm_structs.DCGM_GROUP_DEFAULT)

    # watch a ton of fields so that we know that some are being stored
    updateFreqUsec = 1000
    test_utils.watch_all_fields(handle.handle, group.GetGpuIds(), updateFreq=updateFreqUsec)
    system.introspect.UpdateAll()

    execTime = system.introspect.execTime.GetForAllFields().aggregateInfo

    perGpuSane = 300 * 1000  # 300 ms
    activeGpuCount = test_utils.get_live_gpu_count(handle.handle)
    saneLimit = perGpuSane * activeGpuCount

    # test that all struct fields in the API response have reasonable values
    assert (100 < execTime.totalEverUpdateUsec < saneLimit), (
        'execution time seems way too long for a system with %s gpus. Took %s ms. Sane limit: %s ms'
        % (activeGpuCount, execTime.totalEverUpdateUsec / 1000, saneLimit / 1000))

    assert (100 < execTime.recentUpdateUsec < saneLimit), (
        'recent update time seems way too long for a system with %s gpus. Took %s ms. Sane limit: %s ms'
        % (activeGpuCount, execTime.recentUpdateUsec / 1000, saneLimit / 1000))

    assert (updateFreqUsec - 1 <= execTime.meanUpdateFreqUsec <= updateFreqUsec + 1), execTime.meanUpdateFreqUsec
def test_dcgm_prof_multi_pause_resume(handle, gpuIds):
    '''
    Test that we can pause and resume profiling over and over without error
    '''
    dcgmHandle = pydcgm.DcgmHandle(handle=handle)
    dcgmSystem = dcgmHandle.GetSystem()
    dcgmGroup = dcgmSystem.GetGroupWithGpuIds('mygroup', gpuIds)

    helper_check_profiling_environment(dcgmGroup)

    # We should never get an error back from pause or resume. Pause and Resume throw exceptions on error
    numPauses = 0
    numResumes = 0

    for i in range(100):
        # Flip a coin and pause if we get 0. Unpause otherwise (1)
        coin = random.randint(0, 1)
        if coin == 0:
            dcgmSystem.profiling.Pause()
            numPauses += 1
        else:
            dcgmSystem.profiling.Resume()
            numResumes += 1

    logger.info("Got %d pauses and %d resumes" % (numPauses, numResumes))
def helper_test_dpt_field_id(handle, gpuIds, fieldId, extraArgs=None):
    '''
    Helper that runs dcgmproftester in validation mode against a single profiling field ID
    '''
    dcgmHandle = pydcgm.DcgmHandle(handle=handle)
    dcgmSystem = dcgmHandle.GetSystem()
    dcgmGroup = dcgmSystem.GetGroupWithGpuIds('mygroup', gpuIds)

    helper_check_profiling_environment(dcgmGroup)

    cudaDriverVersion = test_utils.get_cuda_driver_version(handle, gpuIds[0])

    supportedFieldIds = helper_get_supported_field_ids(dcgmGroup)

    # Just test the first GPU of our SKU. Other tests will cover multiple SKUs
    useGpuIds = [gpuIds[0], ]

    args = ["--target-max-value", "--no-dcgm-validation", "--dvs", "--reset",
            "--mode", "validate", "-d", "15.0", "-r", "1.0",
            "--sync-count", "5", "-w", "5", "-t", str(fieldId)]
    if extraArgs is not None:
        args.extend(extraArgs)

    app = apps.DcgmProfTesterApp(cudaDriverMajorVersion=cudaDriverVersion[0], gpuIds=useGpuIds, args=args)
    app.start(timeout=120.0 * len(gpuIds))  # Account for slow systems but still add an upper bound
    app.wait()
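# A minimal usage sketch for the helper above, included for illustration only; this wrapper is
# hypothetical and not part of the original test list. It assumes
# dcgm_fields.DCGM_FI_PROF_GR_ENGINE_ACTIVE (field ID 1001, graphics engine activity -- the same
# field the parallel-GPU test below exercises).
def helper_test_dpt_gr_engine_active_example(handle, gpuIds):
    # Run dcgmproftester validation for graphics engine activity on the first GPU of this SKU
    helper_test_dpt_field_id(handle, gpuIds, dcgm_fields.DCGM_FI_PROF_GR_ENGINE_ACTIVE)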
def test_dcgm_embedded_metadata_memory_get_field_group_sane(handle):
    '''
    Sanity test for API that gets memory usage of a single field group
    '''
    if not option_parser.options.developer_mode:
        test_utils.skip_test("Skipping developer test.")

    handle = pydcgm.DcgmHandle(handle)
    group = pydcgm.DcgmGroup(handle, groupName='test-metadata', groupType=dcgm_structs.DCGM_GROUP_DEFAULT)
    system = pydcgm.DcgmSystem(handle)

    fieldIds = [dcgm_fields.DCGM_FI_DEV_GPU_TEMP, dcgm_fields.DCGM_FI_DEV_POWER_USAGE]
    fieldGroup = pydcgm.DcgmFieldGroup(handle, "my_field_group", fieldIds)

    # ensure that the field group is watched
    _watch_field_group_basic(fieldGroup, handle.handle, group.GetId())
    system.introspect.UpdateAll()

    memoryInfo = system.introspect.memory.GetForFieldGroup(fieldGroup)

    logger.debug("field group %s is using %.2f KB" % (fieldGroup.name, memoryInfo.aggregateInfo.bytesUsed / 1024.))

    # 0+ to 20 MB
    assert (0 < memoryInfo.aggregateInfo.bytesUsed < 1024 * 1024 * 20), \
        'bytes used to store field was unreasonable for field group %s, bytes: %s' \
        % (fieldGroup.name, memoryInfo.aggregateInfo.bytesUsed)
def test_dcgm_embedded_metadata_exectime_get_field_group_sane(handle):
    """
    Sanity test for API that gets execution time of a single field group
    """
    if not option_parser.options.developer_mode:
        test_utils.skip_test("Skipping developer test.")

    handle = pydcgm.DcgmHandle(handle)
    system = pydcgm.DcgmSystem(handle)
    group = pydcgm.DcgmGroup(handle, groupName="metadata-test", groupType=dcgm_structs.DCGM_GROUP_DEFAULT)

    fieldIds = [dcgm_fields.DCGM_FI_DEV_POWER_USAGE,
                dcgm_fields.DCGM_FI_DEV_SM_CLOCK,
                dcgm_fields.DCGM_FI_DEV_GPU_TEMP]
    fieldGroup = pydcgm.DcgmFieldGroup(handle, "my_field_group", fieldIds)

    updateFreqUsec = 1000
    _watch_field_group_basic(fieldGroup, handle.handle, group.GetId(), updateFreq=updateFreqUsec)
    system.introspect.UpdateAll()

    execTime = system.introspect.execTime.GetForFieldGroup(fieldGroup).aggregateInfo

    # test that all struct fields in the API response have reasonable values
    assert (100 < execTime.totalEverUpdateUsec < 100 * 1000), execTime.totalEverUpdateUsec
    assert (100 < execTime.recentUpdateUsec < 100 * 1000), execTime.recentUpdateUsec
    assert (updateFreqUsec == execTime.meanUpdateFreqUsec), execTime.meanUpdateFreqUsec
def test_dcgm_embedded_metadata_exectime_get_field_sane(handle):
    """
    Sanity test for API that gets execution time of a single field
    """
    if not option_parser.options.developer_mode:
        test_utils.skip_test("Skipping developer test.")

    handle = pydcgm.DcgmHandle(handle)
    system = pydcgm.DcgmSystem(handle)
    group = pydcgm.DcgmGroup(handle, groupName="metadata-test", groupType=dcgm_structs.DCGM_GROUP_DEFAULT)

    updateFreqUsec = 1000
    dcgm_agent_internal.dcgmWatchFieldValue(handle.handle, group.GetGpuIds()[0],
                                            dcgm_fields.DCGM_FI_DEV_GPU_TEMP,
                                            updateFreqUsec, 100000, 10)
    system.UpdateAllFields(True)
    system.introspect.UpdateAll()

    execTime = dcgm_agent_internal.dcgmIntrospectGetFieldExecTime(
        handle.handle, dcgm_fields.DCGM_FI_DEV_GPU_TEMP).aggregateInfo

    # test that all struct fields in the API response have reasonable values
    assert (100 < execTime.totalEverUpdateUsec < 100 * 1000), execTime.totalEverUpdateUsec
    assert (100 < execTime.recentUpdateUsec < 100 * 1000), execTime.recentUpdateUsec
    assert (updateFreqUsec == execTime.meanUpdateFreqUsec), execTime.meanUpdateFreqUsec
def test_dcgm_embedded_metadata_exectime_aggregate_is_sum_of_gpu_and_global(handle):
    """
    Ensure that when execution time is retrieved for fields, the "global" and "gpu"
    values add up to the "aggregate" value
    """
    handle = pydcgm.DcgmHandle(handle)
    system = pydcgm.DcgmSystem(handle)
    group = pydcgm.DcgmGroup(handle, groupName="metadata-test", groupType=dcgm_structs.DCGM_GROUP_DEFAULT)

    # watch a ton of fields so that we know that some are being stored
    test_utils.watch_all_fields(handle.handle, group.GetGpuIds(), updateFreq=100000)
    system.introspect.UpdateAll()

    execTimeInfo = system.introspect.execTime.GetForAllFields()

    gpuExecTime = sum(info.totalEverUpdateUsec
                      for info in execTimeInfo.gpuInfo[:execTimeInfo.gpuInfoCount])

    if execTimeInfo.hasGlobalInfo:
        globalExecTime = execTimeInfo.globalInfo.totalEverUpdateUsec
    else:
        globalExecTime = 0

    assert (execTimeInfo.aggregateInfo.totalEverUpdateUsec == globalExecTime + gpuExecTime), (
        'aggregate for all fields reports %s usec but GPUs report %s usec and global reports %s usec. '
        % (execTimeInfo.aggregateInfo.totalEverUpdateUsec, gpuExecTime, globalExecTime)
        + ' GPUs + global should sum to aggregate.')
def test_dcgm_embedded_metadata_memory_get_aggregate_fields_equals_total(handle):
    system = pydcgm.DcgmSystem(pydcgm.DcgmHandle(handle))

    _metadata_get_aggregate_fields_equals_total(
        handle,
        dcgm_agent_internal.dcgmIntrospectGetFieldMemoryUsage,
        system.introspect.memory.GetForAllFields,
        'bytesUsed')
def test_dcgm_embedded_metadata_memory_get_all_fields_sane(handle):
    """
    Sanity test for API that gets memory usage of all fields together
    """
    if not option_parser.options.developer_mode:
        test_utils.skip_test("Skipping developer test.")

    handle = pydcgm.DcgmHandle(handle)
    system = pydcgm.DcgmSystem(handle)
    group = pydcgm.DcgmGroup(handle, groupName="metadata-test", groupType=dcgm_structs.DCGM_GROUP_DEFAULT)

    # watch a ton of fields so that we know that some are being stored
    test_utils.watch_all_fields(handle.handle, group.GetGpuIds(), updateFreq=1000)
    system.introspect.UpdateAll()

    memoryInfo = system.introspect.memory.GetForAllFields().aggregateInfo

    logger.debug('All fields in hostengine are using %.2f MB' % (memoryInfo.bytesUsed / 1024. / 1024.))

    # 20 KB to 100 MB
    assert (1024 * 20 < memoryInfo.bytesUsed < 100 * 1024 * 1024), memoryInfo.bytesUsed
def test_dcgm_embedded_metadata_memory_aggregate_is_sum_of_gpu_and_global(handle):
    """
    Ensure that when memory info is retrieved for fields, the "global" and "gpu"
    values add up to the "aggregate" value
    """
    handle = pydcgm.DcgmHandle(handle)
    system = pydcgm.DcgmSystem(handle)
    group = pydcgm.DcgmGroup(handle, groupName="metadata-test", groupType=dcgm_structs.DCGM_GROUP_DEFAULT)

    # watch a ton of fields so that we know that some are being stored
    test_utils.watch_all_fields(handle.handle, group.GetGpuIds(), updateFreq=100000)
    system.introspect.UpdateAll()

    memoryInfo = system.introspect.memory.GetForAllFields()

    gpuMemory = sum(mem.bytesUsed for mem in memoryInfo.gpuInfo[:memoryInfo.gpuInfoCount])
    globalMemory = memoryInfo.globalInfo.bytesUsed if memoryInfo.hasGlobalInfo else 0

    if memoryInfo.hasGlobalInfo:
        logger.debug('global mem info: %s' % (memoryInfo.globalInfo))
    for i in range(memoryInfo.gpuInfoCount):
        logger.debug('gpu mem info %s: %s' % (i, memoryInfo.gpuInfo[i]))

    assert (memoryInfo.aggregateInfo.bytesUsed == gpuMemory + globalMemory), (
        'aggregate for all fields reports %s bytes but a sum of GPU and global reports %s bytes. '
        % (memoryInfo.aggregateInfo.bytesUsed, gpuMemory + globalMemory)
        + ' These values should be equal.')
def test_dcgm_group_get_all_ids_standalone(handle):
    """
    Get all the group IDs configured on the host engine
    """
    handleObj = pydcgm.DcgmHandle(handle=handle)
    systemObj = handleObj.GetSystem()

    # Get the list of groups before we add ours so that we account for them
    groupIdListBefore = dcgm_agent.dcgmGroupGetAllIds(handle)

    expectedCount = len(groupIdListBefore)
    groupObjs = []

    for index in range(0, 10):
        expectedCount += 1
        name = 'Test' + str(index)
        groupObj = systemObj.GetEmptyGroup(name)
        groupObjs.append(groupObj)  # keep a reference so it doesn't go out of scope

    groupIdListAfter = dcgm_agent.dcgmGroupGetAllIds(handle)
    assert len(groupIdListAfter) == expectedCount, \
        "Num of groups less than expected. Expected: %d Returned %d" % (
            expectedCount, len(groupIdListAfter))
def helper_dcgm_group_update_grp(handle, gpuIds):
    handleObj = pydcgm.DcgmHandle(handle=handle)
    systemObj = handleObj.GetSystem()
    groupObj = systemObj.GetEmptyGroup("test1")

    groupId = dcgm_agent.dcgmGroupCreate(handle, dcgm_structs.DCGM_GROUP_EMPTY, "test1")

    gpuIdList = gpuIds
    assert len(gpuIdList) > 0, "Failed to get devices from the node"

    for gpuId in gpuIdList:
        groupObj.AddGpu(gpuId)
        gpuIdListAfterAdd = groupObj.GetGpuIds()
        assert gpuId in gpuIdListAfterAdd, "Expected gpuId %d in %s" % (gpuId, str(gpuIdListAfterAdd))

    for gpuId in gpuIdList:
        groupObj.RemoveGpu(gpuId)
        gpuIdListAfterRemove = groupObj.GetGpuIds()
        assert gpuId not in gpuIdListAfterRemove, "Expected gpuId %d NOT in %s" % (gpuId, str(gpuIdListAfterRemove))

    # Force the group to be deleted
    del groupObj
def helper_dcgm_group_get_grp_info_entities(handle, gpuIds):
    handleObj = pydcgm.DcgmHandle(handle=handle)
    systemObj = handleObj.GetSystem()
    groupObj = systemObj.GetEmptyGroup("test1")

    gpuIdList = gpuIds
    assert len(gpuIdList) > 0, "Failed to get devices from the node"

    for gpuId in gpuIdList:
        groupObj.AddEntity(dcgm_fields.DCGM_FE_GPU, gpuId)

    gpuIdListAfterAdd = groupObj.GetGpuIds()
    assert gpuIdList == gpuIdListAfterAdd, "Expected all GPUs from %s to be added. Got %s" % (
        str(gpuIdList), str(gpuIdListAfterAdd))

    entityListAfterAdd = groupObj.GetEntities()
    gpuList2 = []
    for entity in entityListAfterAdd:
        assert entity.entityGroupId == dcgm_fields.DCGM_FE_GPU, str(entity.entityGroupId)
        gpuList2.append(entity.entityId)

    assert gpuIdList == gpuList2, "Expected all GPUs from %s to be added. Got %s" % (
        str(gpuIdList), str(gpuList2))

    # Remove all GPUs
    for gpuId in gpuIdList:
        groupObj.RemoveEntity(dcgm_fields.DCGM_FE_GPU, gpuId)

    entityListAfterRem = groupObj.GetEntities()
    assert len(entityListAfterRem) == 0, str(entityListAfterRem)
def test_dcgmproftester_parallel_gpus(handle, gpuIds):
    '''
    Test that we can successfully read dcgmproftester metrics on multiple concurrent GPUs

    This tests a few things:
    1. That metrics work for more than GPU 0
    2. That metrics work for multiple GPUs at a time
    '''
    if len(gpuIds) < 2:
        test_utils.skip_test("Skipping multi-GPU test since there's only one of this SKU")

    dcgmHandle = pydcgm.DcgmHandle(handle=handle)
    dcgmSystem = dcgmHandle.GetSystem()
    dcgmGroup = dcgmSystem.GetGroupWithGpuIds('mygroup', gpuIds)

    helper_check_profiling_environment(dcgmGroup)

    cudaDriverVersion = test_utils.get_cuda_driver_version(handle, gpuIds[0])

    # Graphics activity works for every GPU that supports DCP. It also works reliably even under heavy concurrency
    fieldIds = "1001"

    args = ["--mode", "validate", "-d", "15.0", "-r", "1.0", "--sync-count", "5", "-w", "10", "-t", fieldIds]
    app = apps.DcgmProfTesterApp(cudaDriverMajorVersion=cudaDriverVersion[0], gpuIds=gpuIds, args=args)
    app.start(timeout=120.0 * len(gpuIds))  # Account for slow systems but still add an upper bound
    app.wait()
    app.validate()  # Validate here so that errors are printed when they occur instead of at the end of the test
def _InitHandles(self):
    self._dcgmHandle = pydcgm.DcgmHandle(ipAddress=self._hostname)

    groupName = "error_mon_gpus" + self._pidPostfix
    self._allGpusGroup = pydcgm.DcgmGroup(self._dcgmHandle, groupName=groupName,
                                          groupType=dcgm_structs.DCGM_GROUP_DEFAULT)
    print("Found %d GPUs" % (len(self._allGpusGroup.GetEntities())))

    groupName = "error_mon_nvswitches" + self._pidPostfix
    self._allNvSwitchesGroup = pydcgm.DcgmGroup(self._dcgmHandle, groupName=groupName,
                                                groupType=dcgm_structs.DCGM_GROUP_DEFAULT_NVSWITCHES)
    print("Found %d NvSwitches" % len(self._allNvSwitchesGroup.GetEntities()))

    fgName = "error_mon_nvswitches" + self._pidPostfix
    self._nvSwitchErrorFieldGroup = pydcgm.DcgmFieldGroup(self._dcgmHandle, name=fgName,
                                                          fieldIds=self._nvSwitchErrorFieldIds)

    fgName = "error_mon_gpus" + self._pidPostfix
    self._gpuErrorFieldGroup = pydcgm.DcgmFieldGroup(self._dcgmHandle, name=fgName,
                                                     fieldIds=self._gpuErrorFieldIds)

    updateFreq = int(self._updateIntervalSecs / 2.0) * 1000000
    maxKeepAge = 3600.0  # 1 hour
    maxKeepSamples = 0  # Rely on maxKeepAge

    self._nvSwitchWatcher = dcgm_field_helpers.DcgmFieldGroupEntityWatcher(
        self._dcgmHandle.handle, self._allNvSwitchesGroup.GetId(),
        self._nvSwitchErrorFieldGroup, dcgm_structs.DCGM_OPERATION_MODE_AUTO,
        updateFreq, maxKeepAge, maxKeepSamples, 0)
    self._gpuWatcher = dcgm_field_helpers.DcgmFieldGroupEntityWatcher(
        self._dcgmHandle.handle, self._allGpusGroup.GetId(),
        self._gpuErrorFieldGroup, dcgm_structs.DCGM_OPERATION_MODE_AUTO,
        updateFreq, maxKeepAge, maxKeepSamples, 0)
def test_dcgm_prof_pause_resume_values(handle, gpuIds):
    '''
    Test that we get valid values when profiling is resumed and BLANK values when profiling is paused
    '''
    dcgmHandle = pydcgm.DcgmHandle(handle=handle)
    dcgmSystem = dcgmHandle.GetSystem()
    dcgmGroup = dcgmSystem.GetGroupWithGpuIds('mygroup', gpuIds)

    helper_check_profiling_environment(dcgmGroup)

    fieldIds = helper_get_single_pass_field_ids(dcgmGroup)
    assert fieldIds is not None

    # 10 ms watches so we can test quickly
    watchIntervalUsec = 10000
    sleepIntervalSec = 0.1 * len(gpuIds)  # 100 ms per GPU

    # Start paused. All the other tests start unpaused
    dcgmSystem.profiling.Pause()

    dcgmGroup.profiling.WatchFields(fieldIds, watchIntervalUsec, 60.0, 0)
    gpuId = gpuIds[0]

    fieldValues = dcgm_agent.dcgmEntityGetLatestValues(handle, dcgm_fields.DCGM_FE_GPU, gpuId, fieldIds)
    assert len(fieldValues) == len(fieldIds), "%d != %d" % (len(fieldValues), len(fieldIds))

    # All should be blank
    for i, fieldValue in enumerate(fieldValues):
        fv = dcgm_field_helpers.DcgmFieldValue(fieldValue)
        assert fv.isBlank, "Got nonblank fv index %d" % i

    # Resume. All should be valid
    dcgmSystem.profiling.Resume()

    time.sleep(sleepIntervalSec)

    fieldValues = dcgm_agent.dcgmEntityGetLatestValues(handle, dcgm_fields.DCGM_FE_GPU, gpuId, fieldIds)
    assert len(fieldValues) == len(fieldIds), "%d != %d" % (len(fieldValues), len(fieldIds))

    # All should be non-blank
    for i, fieldValue in enumerate(fieldValues):
        fv = dcgm_field_helpers.DcgmFieldValue(fieldValue)
        assert not fv.isBlank, "Got blank fv index %d" % i

    # Pause again. All should be blank
    dcgmSystem.profiling.Pause()

    time.sleep(sleepIntervalSec)

    fieldValues = dcgm_agent.dcgmEntityGetLatestValues(handle, dcgm_fields.DCGM_FE_GPU, gpuId, fieldIds)
    assert len(fieldValues) == len(fieldIds), "%d != %d" % (len(fieldValues), len(fieldIds))

    # All should be blank
    for i, fieldValue in enumerate(fieldValues):
        fv = dcgm_field_helpers.DcgmFieldValue(fieldValue)
        assert fv.isBlank, "Got nonblank fv index %d" % i

    # This shouldn't fail
    dcgmSystem.profiling.Resume()
def test_dcgm_config_standalone_get_devices(handle):
    """
    Verifies that DCGM Engine returns list of devices
    """
    handleObj = pydcgm.DcgmHandle(handle=handle)
    systemObj = handleObj.GetSystem()
    gpuIdList = systemObj.discovery.GetAllGpuIds()
    assert len(gpuIdList) >= 0, "Not able to find devices for standalone case"
def test_dcgm_policy_negative_unregister_standalone(handle):
    """
    Verifies that the unregister function does not allow a bad groupId value
    """
    policy = pydcgm.DcgmGroupPolicy(pydcgm.DcgmHandle(handle), 9999, None)
    with test_utils.assert_raises(dcgmExceptionClass(dcgm_structs.DCGM_ST_NOT_CONFIGURED)):
        policy.Unregister(dcgm_structs.DCGM_POLICY_COND_DBE)
def helper_dcgm_group_create_grp(handle):
    handleObj = pydcgm.DcgmHandle(handle=handle)
    groupObj = pydcgm.DcgmGroup(handleObj, groupName="test1")
    groupId = groupObj.GetId()
    assert (groupId != 0)

    # Force the group to be deleted
    del groupObj
def test_dcgmi_config(handle, gpuIds):
    """
    Test DCGMI config
    """
    assert len(gpuIds) > 0, "Failed to get devices from the node"

    dcgmHandle = pydcgm.DcgmHandle(handle=handle)
    dcgmSystem = dcgmHandle.GetSystem()

    # Getting GPU power limits
    for gpuId in gpuIds:
        gpuAttrib = dcgmSystem.discovery.GetGpuAttributes(gpuId)
        dft_pwr = str(gpuAttrib.powerLimits.defaultPowerLimit)
        max_pwr = str(gpuAttrib.powerLimits.maxPowerLimit)

    groupId = str(_create_dcgmi_group())

    ## keep args in this order. Changing it may break the test
    validArgsTestList = [
        ["group", "-g", groupId, "-a", str(gpuIds[0])],     # add gpu to group
        ["config", "--get", "-g", groupId],                 # get default group configuration
        ["config", "--get", "-g", "0"],                     # get default group configuration by ID. This will work as long as group IDs start at 0
        ["config", "-g", groupId, "--set", "-P", dft_pwr],  # set default power limit
        ["config", "-g", groupId, "--set", "-P", max_pwr],  # set max power limit
        ["config", "--get", "-g", groupId, "--verbose"],    # get verbose default group configuration
        ["config", "--enforce", "-g", groupId],             # enforce default group configuration
        ["config", "--enforce", "-g", "0"]                  # enforce group configuration on default group by ID
    ]

    # Setting the compute mode is only supported when MIG mode is not enabled.
    if not test_utils.is_mig_mode_enabled():
        # set group configuration on default group by ID
        validArgsTestList.append(["config", "--set", "-c", "0", "-g", "0"])

    # Config management only works when the host engine is running as root
    if utils.is_root():
        _test_valid_args(validArgsTestList)
    else:
        _test_invalid_args(validArgsTestList)

    ## keep args in this order. Changing it may break the test
    _test_invalid_args([
        ["config", "--get", "-g", "9999"],               # Can't get config of group that doesn't exist
        ["config", "--get", "-g", "9999", "--verbose"],  # Can't get config of group that doesn't exist
        ["config", "--set", ""],                         # Can't set group configuration to nothing
        ["config", "--set", "-c", "5"],                  # Can't set an invalid compute mode
        ["config", "--enforce", "-g", "9999"]            # Can't enforce a configuration of group that doesn't exist
    ])