def helper_dcgm_config_powerbudget(handle, gpuIds):
    """Set a group power budget on a single-GPU group and verify the result.

    Creates an empty group named "test1", adds ``gpuIds[0]`` to it, and sets a
    ``DCGM_CONFIG_POWER_BUDGET_GROUP`` power limit equal to the midpoint of
    that GPU's reported min/max power limits multiplied by the number of GPUs
    in the group. The current-state configuration is then read back and, for
    every GPU whose power limit is not reported as unsupported, asserted to be
    an individual power cap equal to the midpoint value.

    Calls ``test_utils.skip_test`` when the GPU reports a blank maximum power
    limit.

    Args:
        handle: DCGM handle, passed to ``pydcgm.DcgmHandle`` and to
            ``dcgm_agent.dcgmGetDeviceAttributes``.
        gpuIds: Indexable collection of GPU ids; only the first entry is
            added to the group.
    """
    handleObj = pydcgm.DcgmHandle(handle=handle)
    systemObj = handleObj.GetSystem()
    groupObj = systemObj.GetEmptyGroup("test1")

    ## Add first GPU to the group
    groupObj.AddGpu(gpuIds[0])
    gpuIds = groupObj.GetGpuIds()  #Only reference GPUs we are testing against

    ## Get Min and Max Power limit on the group
    attributes = dcgm_agent.dcgmGetDeviceAttributes(handle, gpuIds[0])

    ## Verify that power is supported on the GPUs in the group
    if dcgmvalue.DCGM_INT32_IS_BLANK(attributes.powerLimits.maxPowerLimit):
        test_utils.skip_test("Needs Power limit to be supported on the GPU")

    ## Use the midpoint of the reported min/max range as the per-GPU limit
    powerLimit = int((attributes.powerLimits.maxPowerLimit +
                      attributes.powerLimits.minPowerLimit) / 2)

    ## Leave every setting except the power limit blank
    config_values = dcgm_structs.c_dcgmDeviceConfig_v1()
    config_values.mEccMode = dcgmvalue.DCGM_INT32_BLANK
    config_values.mPerfState.syncBoost = dcgmvalue.DCGM_INT32_BLANK
    config_values.mPerfState.targetClocks.memClock = dcgmvalue.DCGM_INT32_BLANK
    config_values.mPerfState.targetClocks.smClock = dcgmvalue.DCGM_INT32_BLANK
    config_values.mComputeMode = dcgmvalue.DCGM_INT32_BLANK
    config_values.mPowerLimit.type = dcgm_structs.DCGM_CONFIG_POWER_BUDGET_GROUP
    config_values.mPowerLimit.val = powerLimit * len(gpuIds)  #Assumes homogenous GPUs

    groupObj.config.Set(config_values)

    config_values = groupObj.config.Get(dcgm_structs.DCGM_CONFIG_CURRENT_STATE)
    assert len(config_values) > 0, "Failed to get configuration using groupObj.config.Get"

    ## NOTE(review): assumes config_values[x] corresponds to gpuIds[x] -- confirm
    ## against groupObj.config.Get. The messages report the GPU id itself
    ## rather than the loop position.
    for x, gpuId in enumerate(gpuIds):
        gpuConfig = config_values[x]
        if gpuConfig.mPowerLimit.val == dcgmvalue.DCGM_INT32_NOT_SUPPORTED:
            ## Nothing to verify for a GPU without a supported power limit
            continue

        assert gpuConfig.mPowerLimit.type == dcgm_structs.DCGM_CONFIG_POWER_CAP_INDIVIDUAL, \
            "The power limit type for gpuId %d is incorrect. Returned: %d Expected :%d" % (
                gpuId, gpuConfig.mPowerLimit.type,
                dcgm_structs.DCGM_CONFIG_POWER_CAP_INDIVIDUAL)
        assert gpuConfig.mPowerLimit.val == powerLimit, \
            "The power limit value for gpuID %d is incorrect. Returned: %d Expected: %s" % (
                gpuId, gpuConfig.mPowerLimit.val, powerLimit)
def initialize_devices(handle, flags):
    """Populate the module-level ``g_gpus`` and ``g_switches`` lists.

    Queries DCGM for the GPU and switch entity ids matching ``flags``, wraps
    each id in an ``Entity`` and appends it to the corresponding list. GPU
    entities also carry the UUID and PCI bus id taken from the device
    attributes; switch entities are created from the id and entity type only.

    Args:
        handle: DCGM handle passed to the ``dcgm_agent`` calls.
        flags: Passed through unchanged to
            ``dcgm_agent.dcgmGetEntityGroupEntities``.
    """
    gpuIds = dcgm_agent.dcgmGetEntityGroupEntities(
        handle, dcgm_fields.DCGM_FE_GPU, flags)
    switchIds = dcgm_agent.dcgmGetEntityGroupEntities(
        handle, dcgm_fields.DCGM_FE_SWITCH, flags)

    for gpuId in gpuIds:
        attributes = dcgm_agent.dcgmGetDeviceAttributes(handle, gpuId)
        gpuObj = Entity(gpuId,
                        entityType=dcgm_fields.DCGM_FE_GPU,
                        uuid=attributes.identifiers.uuid,
                        bdf=attributes.identifiers.pciBusId)
        g_gpus.append(gpuObj)

    for switchId in switchIds:
        switchObj = Entity(switchId, entityType=dcgm_fields.DCGM_FE_SWITCH)
        g_switches.append(switchObj)
def GetGpuAttributes(self, gpuId):
    """Return the device attributes DCGM reports for ``gpuId``.

    Forwards to ``dcgm_agent.dcgmGetDeviceAttributes`` using the raw handle
    held by this object's ``_dcgmHandle`` and returns its result unchanged.

    Args:
        gpuId: Id of the GPU to query.
    """
    rawHandle = self._dcgmHandle.handle
    return dcgm_agent.dcgmGetDeviceAttributes(rawHandle, gpuId)
sys.exit(1) print "Number of valid devices: %d" % len(validDevices) groupId = dcgm_agent.dcgmGroupCreate(handle, dcgm_structs.DCGM_GROUP_EMPTY, "test1") statusHandle = dcgm_agent.dcgmStatusCreate() for device in validDevices: ret = dcgm_agent.dcgmGroupAddDevice(handle, groupId, device) assert (ret == dcgm_structs.DCGM_ST_OK) ## Get attributes for all the devices attributesForDevices = list() for device in validDevices: attributes = dcgm_agent.dcgmGetDeviceAttributes(handle, device) attributesForDevices.append(attributes) assert len( attributesForDevices) != 0, "Can't get attributes for all the devices" device0_name = attributesForDevices[0].identifiers.deviceName for attribute in attributesForDevices: if attribute.identifiers.deviceName != device0_name: print "Can only run test if all the GPUs are same" sys.exit(1) powerLimit_set = dcgmvalue.DCGM_INT32_BLANK fvSupported = dcgm_agent_internal.dcgmGetLatestValuesForFields( handle, x, [ dcgm_fields.DCGM_FI_DEV_POWER_MGMT_LIMIT,