Ejemplo n.º 1
0
def helper_dcgm_config_powerbudget(handle, gpuIds):
    """Set a group power budget and verify the resulting per-GPU power caps.

    A group named "test1" is created containing only the first entry of
    ``gpuIds``.  The group is given a power budget equal to the midpoint of
    that GPU's min/max power limits multiplied by the number of GPUs in the
    group.  The current configuration is then read back and, for every GPU
    whose power limit is not reported as unsupported, the function asserts
    that the limit type is ``DCGM_CONFIG_POWER_CAP_INDIVIDUAL`` and that the
    value equals the midpoint.

    Args:
        handle: DCGM handle, passed to ``pydcgm.DcgmHandle`` and to
            ``dcgm_agent.dcgmGetDeviceAttributes``.
        gpuIds: Indexable collection of GPU ids; only ``gpuIds[0]`` is added
            to the group.

    Raises:
        AssertionError: if no configuration is returned, or a GPU reports an
            unexpected power limit type or value.
    """
    handleObj = pydcgm.DcgmHandle(handle=handle)
    systemObj = handleObj.GetSystem()
    groupObj = systemObj.GetEmptyGroup("test1")

    # Add the first GPU to the group, then rebind gpuIds so that only the
    # GPUs actually in the group are referenced from here on.
    groupObj.AddGpu(gpuIds[0])
    gpuIds = groupObj.GetGpuIds()

    # Min and max power limits are read from the first GPU of the group.
    powerLimits = dcgm_agent.dcgmGetDeviceAttributes(handle,
                                                     gpuIds[0]).powerLimits

    # A blank max power limit is treated as "power limit not supported".
    # NOTE(review): skip_test presumably aborts the test here -- confirm.
    if dcgmvalue.DCGM_INT32_IS_BLANK(powerLimits.maxPowerLimit):
        test_utils.skip_test("Needs Power limit to be supported on the GPU")

    powerLimit = int(
        (powerLimits.maxPowerLimit + powerLimits.minPowerLimit) / 2)

    # Leave every setting blank except the power limit.
    config_values = dcgm_structs.c_dcgmDeviceConfig_v1()
    config_values.mEccMode = dcgmvalue.DCGM_INT32_BLANK
    config_values.mPerfState.syncBoost = dcgmvalue.DCGM_INT32_BLANK
    config_values.mPerfState.targetClocks.memClock = dcgmvalue.DCGM_INT32_BLANK
    config_values.mPerfState.targetClocks.smClock = dcgmvalue.DCGM_INT32_BLANK
    config_values.mComputeMode = dcgmvalue.DCGM_INT32_BLANK
    config_values.mPowerLimit.type = dcgm_structs.DCGM_CONFIG_POWER_BUDGET_GROUP
    # The group budget is the per-GPU midpoint times the group size; this
    # assumes homogeneous GPUs.
    config_values.mPowerLimit.val = powerLimit * len(gpuIds)

    groupObj.config.Set(config_values)

    config_values = groupObj.config.Get(dcgm_structs.DCGM_CONFIG_CURRENT_STATE)
    assert len(config_values
               ) > 0, "Failed to get configuration using groupObj.config.Get"

    expectedType = dcgm_structs.DCGM_CONFIG_POWER_CAP_INDIVIDUAL
    # NOTE(review): assumes config_values is ordered like gpuIds -- confirm.
    for x, gpuId in enumerate(gpuIds):
        reported = config_values[x].mPowerLimit
        if reported.val == dcgmvalue.DCGM_INT32_NOT_SUPPORTED:
            continue

        # Report the real GPU id (not the loop index) in failure messages.
        assert reported.type == expectedType, "The power limit type for gpuId %d is incorrect. Returned: %d Expected :%d" % (
            gpuId, reported.type, expectedType)
        assert reported.val == powerLimit, "The power limit value for gpuID %d is incorrect. Returned: %d Expected: %s" % (
            gpuId, reported.val, powerLimit)
Ejemplo n.º 2
0
def initialize_devices(handle, flags):
    """Append an ``Entity`` for every GPU and switch to the module-level lists.

    GPU and switch ids are both queried up front with
    ``dcgm_agent.dcgmGetEntityGroupEntities``.  Each GPU entity is created
    with the uuid and PCI bus id taken from that GPU's device attributes and
    appended to ``g_gpus``; each switch entity is created from its id alone
    and appended to ``g_switches``.

    Args:
        handle: DCGM handle passed through to the ``dcgm_agent`` calls.
        flags: Flags forwarded unchanged to ``dcgmGetEntityGroupEntities``.
    """
    gpuIds = dcgm_agent.dcgmGetEntityGroupEntities(handle,
                                                   dcgm_fields.DCGM_FE_GPU,
                                                   flags)
    switchIds = dcgm_agent.dcgmGetEntityGroupEntities(
        handle, dcgm_fields.DCGM_FE_SWITCH, flags)

    for gpuId in gpuIds:
        identifiers = dcgm_agent.dcgmGetDeviceAttributes(handle,
                                                         gpuId).identifiers
        g_gpus.append(
            Entity(gpuId,
                   entityType=dcgm_fields.DCGM_FE_GPU,
                   uuid=identifiers.uuid,
                   bdf=identifiers.pciBusId))

    for switchId in switchIds:
        g_switches.append(
            Entity(switchId, entityType=dcgm_fields.DCGM_FE_SWITCH))
Ejemplo n.º 3
0
 def GetGpuAttributes(self, gpuId):
     """Return the device attributes DCGM reports for ``gpuId``.

     Delegates to ``dcgm_agent.dcgmGetDeviceAttributes`` using the raw handle
     stored on this object's ``_dcgmHandle``.
     """
     rawHandle = self._dcgmHandle.handle
     return dcgm_agent.dcgmGetDeviceAttributes(rawHandle, gpuId)
Ejemplo n.º 4
0
    sys.exit(1)

# Report how many devices were accepted by the (not shown) validation step.
print "Number of valid devices: %d" % len(validDevices)

# Create an empty group named "test1" and a status handle.
groupId = dcgm_agent.dcgmGroupCreate(handle, dcgm_structs.DCGM_GROUP_EMPTY,
                                     "test1")
statusHandle = dcgm_agent.dcgmStatusCreate()

# Add every valid device to the group; each add must return DCGM_ST_OK.
for device in validDevices:
    ret = dcgm_agent.dcgmGroupAddDevice(handle, groupId, device)
    assert (ret == dcgm_structs.DCGM_ST_OK)

## Get attributes for all the devices
attributesForDevices = list()
for device in validDevices:
    attributes = dcgm_agent.dcgmGetDeviceAttributes(handle, device)
    attributesForDevices.append(attributes)

# The list is indexed at [0] below, so it must not be empty.
assert len(
    attributesForDevices) != 0, "Can't get attributes for all the devices"

# Exit with status 1 unless every device has the same name as the first one.
device0_name = attributesForDevices[0].identifiers.deviceName
for attribute in attributesForDevices:
    if attribute.identifiers.deviceName != device0_name:
        print "Can only run test if all the GPUs are same"
        sys.exit(1)

# Start from the blank sentinel.
# NOTE(review): presumably overwritten later once a limit is chosen -- confirm.
powerLimit_set = dcgmvalue.DCGM_INT32_BLANK
fvSupported = dcgm_agent_internal.dcgmGetLatestValuesForFields(
    handle, x, [
        dcgm_fields.DCGM_FI_DEV_POWER_MGMT_LIMIT,