Ejemplo n.º 1
0
def test_dcgm_vgpu_config_get_validate(handle):
    """
    Verify that vtDcgmVgpuConfigGet rejects bad structure versions.
    """
    handleObj = pydcgm.DcgmHandle(handle=handle)
    systemObj = handleObj.GetSystem()
    allGpuIds = systemObj.discovery.GetAllGpuIds()
    assert len(allGpuIds) >= 0, \
        "Not able to find devices on the node for embedded case"

    groupId = dcgm_agent.dcgmGroupCreate(handle,
                                         dcgm_structs.DCGM_GROUP_DEFAULT,
                                         "test1")
    groupInfo = dcgm_agent.dcgmGroupGetInfo(handle, groupId)
    status_handle = dcgm_agent.dcgmStatusCreate()

    # Both a zero version and an arbitrary bogus version (50) must be
    # rejected with DCGM_ST_VER_MISMATCH.
    for versionTest in (0, 50):
        with test_utils.assert_raises(
                dcgmExceptionClass(dcgm_structs.DCGM_ST_VER_MISMATCH)):
            ret = vtDcgmVgpuConfigGet(handle, groupId,
                                      dcgm_structs.DCGM_CONFIG_CURRENT_STATE,
                                      groupInfo.count, status_handle,
                                      versionTest)
Ejemplo n.º 2
0
def helper_verify_power_value_standalone(handle, groupId, expected_power):
    """
    Check that every GPU in the group that supports a power limit reports
    the expected individual power cap and value.
    """
    groupInfo = dcgm_agent.dcgmGroupGetInfo(
        handle, groupId, dcgm_structs.c_dcgmGroupInfo_version2)
    statusHandle = dcgm_agent.dcgmStatusCreate()

    configValues = dcgm_agent.dcgmConfigGet(
        handle, groupId, dcgm_structs.DCGM_CONFIG_CURRENT_STATE,
        groupInfo.count, statusHandle)
    assert len(configValues) > 0, \
        "Failed to get configuration using dcgmConfigGet"

    for idx in range(groupInfo.count):
        powerLimit = configValues[idx].mPowerLimit
        # GPUs that do not support a power limit are skipped.
        if powerLimit.val == dcgmvalue.DCGM_INT32_NOT_SUPPORTED:
            continue
        assert powerLimit.type == dcgm_structs.DCGM_CONFIG_POWER_CAP_INDIVIDUAL, \
                                "The power limit type for gpuId %d is incorrect. Returned: %d Expected :%d" \
                                % (idx, powerLimit.type, dcgm_structs.DCGM_CONFIG_POWER_CAP_INDIVIDUAL)
        assert powerLimit.val == expected_power, "The power limit value for gpuID %d is incorrect. Returned: %d Expected: %d" \
                                % (idx, powerLimit.val, expected_power)

    ret = dcgm_agent.dcgmStatusDestroy(statusHandle)
    assert ret == dcgm_structs.DCGM_ST_OK, \
        "Failed to remove status handler, error: %s" % ret
Ejemplo n.º 3
0
 def GetGpuIds(self):
     """Return the entity IDs of all GPU-type entities in this group."""
     groupInfo = dcgm_agent.dcgmGroupGetInfo(self._dcgmHandle.handle, self._groupId)
     return [
         groupInfo.entityList[i].entityId
         for i in range(groupInfo.count)
         if groupInfo.entityList[i].entityGroupId == dcgm_fields.DCGM_FE_GPU
     ]
Ejemplo n.º 4
0
def helper_verify_config_values_standalone(handle, groupId, expected_power, expected_ecc, \
                                            expected_proc_clock, expected_mem_clock, expected_compute_mode, \
                                            expected_sync_boost, expected_auto_boost):
    """
    Helper Method to verify all the values for the current configuration are as expected.

    Fetches the current config for every GPU in the group and asserts power
    limit, sync/auto boost, min mem/proc clocks, compute mode and ECC mode
    against the expected values.
    """

    groupInfo = dcgm_agent.dcgmGroupGetInfo(
        handle, groupId, dcgm_structs.c_dcgmGroupInfo_version2)
    status_handle = dcgm_agent.dcgmStatusCreate()

    config_values = dcgm_agent.dcgmConfigGet(
        handle, groupId, dcgm_structs.DCGM_CONFIG_CURRENT_STATE,
        groupInfo.count, status_handle)
    assert len(
        config_values) > 0, "Failed to get configuration using dcgmConfigGet"

    # Bug fix: 'xrange' is Python 2 only; the rest of this file uses
    # Python 3 constructs (print function), so use range().
    for x in range(0, groupInfo.count):
        assert config_values[x].mPowerLimit.type == dcgm_structs.DCGM_CONFIG_POWER_CAP_INDIVIDUAL, \
                                "The power limit type for gpuId %d is incorrect. Returned: %d Expected :%d" \
                                % (x, config_values[x].mPowerLimit.type, dcgm_structs.DCGM_CONFIG_POWER_CAP_INDIVIDUAL)
        assert config_values[x].mPowerLimit.val == expected_power, "The power limit value for gpuID %d is incorrect. Returned: %d Expected: %d" \
                                % (x, config_values[x].mPowerLimit.val, expected_power)

        assert config_values[x].mPerfState.syncBoost == expected_sync_boost, "The syncboost value for gpuID %d is incorrect."\
                                " Returned: %d Expected: %d" \
                                % (x, config_values[x].mPerfState.syncBoost, expected_sync_boost)

        assert config_values[x].mPerfState.autoBoost == expected_auto_boost, "The autoboost value for gpuID %d is incorrect."\
                                " Returned: %d Expected: %d" \
                                % (x, config_values[x].mPerfState.autoBoost, expected_auto_boost)

        # Bug fix: the failure message previously indexed
        # 'config_values.mPerfState...' (missing [x]), which would raise
        # AttributeError instead of printing the value when the assert fails.
        assert config_values[x].mPerfState.minVPState.memClk == expected_mem_clock, "The min mem clock value for gpuID %d is incorrect."\
                                " Returned: %d Expected: %d" \
                                % (x, config_values[x].mPerfState.minVPState.memClk , expected_mem_clock)

        assert config_values[x].mPerfState.minVPState.procClk  == expected_proc_clock, "The min proc clock value for gpuID %d is incorrect."\
                                " Returned: %d Expected: %d" \
                                % (x, config_values[x].mPerfState.minVPState.procClk , expected_proc_clock)

        assert config_values[x].mComputeMode  == expected_compute_mode, "The compute mode value for gpuID %d is incorrect."\
                                " Returned: %d Expected: %d" \
                                % (x, config_values[x].mComputeMode, expected_compute_mode)

        assert config_values[x].mEccMode  == expected_ecc, "The ecc mode value for gpuID %d is incorrect."\
                                " Returned: %d Expected: %d" \
                                % (x, config_values[x].mEccMode, expected_ecc)

    ret = dcgm_agent.dcgmStatusDestroy(status_handle)
    assert (ret == dcgm_structs.DCGM_ST_OK
            ), "Failed to remove status handler, error: %s" % ret
Ejemplo n.º 5
0
def main():
    """Start an embedded DCGM engine, run the policy worker thread on the
    default (all-GPU) group, then tear down the group and status handle."""

    ## Initilaize the DCGM Engine as manual operation mode. This implies that it's execution is
    ## controlled by the monitoring agent. The user has to periodically call APIs such as
    ## dcgmEnginePolicyTrigger and dcgmEngineUpdateAllFields which tells DCGM to wake up and
    ## perform data collection and operations needed for policy management.
    with RunDCGM('127.0.0.1',
                 dcgm_structs.DCGM_OPERATION_MODE_MANUAL) as handle:

        ## Create a default group. (Default group is comprised of all the GPUs on the node)
        ## Let's call the group as "all_gpus_group". The method returns an opaque handle (groupId) to
        ## identify the newly created group.
        groupId = dcgm_agent.dcgmGroupCreate(handle,
                                             dcgm_structs.DCGM_GROUP_DEFAULT,
                                             "all_gpus_group")

        ## Invoke method to get information on the newly created group
        groupInfo = dcgm_agent.dcgmGroupGetInfo(handle, groupId)

        ## Create reference to DCGM status handler which can be used to get the statuses for multiple
        ## operations on one or more devices present in the group
        status_handle = dcgm_agent.dcgmStatusCreate()

        ## The worker function can be executed as a separate thread or as part of the main thread.
        ## Executed as a separate thread here
        thread = Thread(target=agent_worker_function, args=(handle, groupId))
        thread.start()

        ##########################################
        # Any other useful work can be placed here
        ##########################################

        ## Block until the worker finishes; cleanup below must not race the worker.
        thread.join()
        print("Worker thread completed")

        ## Destroy the group
        try:
            dcgm_agent.dcgmGroupDestroy(handle, groupId)
        except dcgm_structs.DCGMError as e:
            print("Failed to remove the test group, error: %s" % e,
                  file=sys.stderr)
            sys.exit(1)

        ## Destroy the status handle
        try:
            dcgm_agent.dcgmStatusDestroy(status_handle)
        except dcgm_structs.DCGMError as e:
            print("Failed to remove status handler, error: %s" % e,
                  file=sys.stderr)
            sys.exit(1)
Ejemplo n.º 6
0
def dcgm_group_test_default_group(handle, gpuIds):
    """
    Test that the default group can not be deleted, or manipulated and is returning all GPUs.

    Note that we're not using groupObj for some tests because it protects against operations on the default group
    """
    handleObj = pydcgm.DcgmHandle(handle=handle)
    systemObj = handleObj.GetSystem()
    groupObj = systemObj.GetDefaultGroup()

    gpuIdList = gpuIds
    assert len(gpuIdList) > 0, "Failed to get devices from the node"

    # Looking up a nonexistent group id must fail with NOT_CONFIGURED.
    with test_utils.assert_raises(
            dcgmExceptionClass(dcgm_structs.DCGM_ST_NOT_CONFIGURED)):
        groupInfo = dcgm_agent.dcgmGroupGetInfo(handle, 9999)

    groupGpuIdList = groupObj.GetGpuIds()
    assert gpuIdList == groupGpuIdList, \
        "Expected gpuId list match %s != %s" % (str(gpuIdList),
                                                str(groupGpuIdList))

    # Every entity in the default group must be a GPU, and the set of
    # entity ids must match the known GPU ids.
    groupEntityList = groupObj.GetEntities()
    entityGpuIds = []
    for entity in groupEntityList:
        assert entity.entityGroupId == dcgm_fields.DCGM_FE_GPU, str(
            entity.entityGroupId)
        entityGpuIds.append(entity.entityId)
    assert gpuIdList == entityGpuIds, "Expected gpuId list to match entity list: %s != %s" % (
        str(gpuIdList), str(entityGpuIds))

    # Removing a device from the default group is forbidden via both the
    # raw API and the protected group object.
    for gpuId in gpuIdList:
        with test_utils.assert_raises(
                dcgmExceptionClass(dcgm_structs.DCGM_ST_NOT_CONFIGURED)):
            ret = dcgm_agent.dcgmGroupRemoveDevice(
                handle, dcgm_structs.DCGM_GROUP_ALL_GPUS, gpuId)
        with test_utils.assert_raises(pydcgm.DcgmException):
            groupObj.RemoveGpu(gpuId)

    # The default group itself must not be destroyable.
    with test_utils.assert_raises(
            dcgmExceptionClass(dcgm_structs.DCGM_ST_NOT_CONFIGURED)):
        ret = dcgm_agent.dcgmGroupDestroy(handle,
                                          dcgm_structs.DCGM_GROUP_ALL_GPUS)
Ejemplo n.º 7
0
    def GetGpus(self):
        """
        Populate self.gpus with one ProcessStatsStressGpu per GPU in our
        group and record each GPU's PCI bus id.
        """
        self.groupId = dcgm_agent.dcgmGroupCreate(
            self.heHandle, dcgm_structs.DCGM_GROUP_DEFAULT, self.groupName)
        groupInfo = dcgm_agent.dcgmGroupGetInfo(
            self.heHandle, self.groupId, dcgm_structs.c_dcgmGroupInfo_version2)

        gpuIds = groupInfo.gpuIdList[0:groupInfo.count]

        self.Log("Running on %d GPUs" % len(gpuIds))

        #Watch parameters for fetching the bus id of each GPU
        fieldId = dcgm_fields.DCGM_FI_DEV_PCI_BUSID
        updateFreq = 100000
        maxKeepAge = 3600.0  #one hour
        maxKeepEntries = 0  #no limit

        for gpuId in gpuIds:
            newGpu = ProcessStatsStressGpu()
            newGpu.gpuId = gpuId
            self.gpus.append(newGpu)

            dcgm_agent_internal.dcgmWatchFieldValue(self.heHandle, gpuId,
                                                    fieldId, updateFreq,
                                                    maxKeepAge, maxKeepEntries)

        #Update all of the new watches
        dcgm_agent.dcgmUpdateAllFields(self.heHandle, 1)

        for gpu in self.gpus:
            #Bug fix: query each GPU's own id. The old code reused the stale
            #loop variable 'gpuId' from the watch loop, so every entry got
            #the last GPU's bus id.
            values = dcgm_agent_internal.dcgmGetLatestValuesForFields(
                self.heHandle, gpu.gpuId, [
                    fieldId,
                ])
            gpu.busId = values[0].value.str

            self.Log("    GPUID %d, busId %s" % (gpu.gpuId, gpu.busId))
Ejemplo n.º 8
0
def test_dcgm_connection_client_cleanup(handle, gpuIds):
    '''
    Make sure that resources that were allocated by a client are cleaned up

    Creates a group and a field group on a second, non-persistent connection,
    disconnects it, and verifies both objects are gone when queried through
    the original handle.
    '''
    fieldGroupFieldIds = [
        dcgm_fields.DCGM_FI_DEV_GPU_TEMP,
    ]

    #Get a 2nd connection which we'll check for cleanup. Use the raw APIs so we can explicitly cleanup
    connectParams = dcgm_structs.c_dcgmConnectV2Params_v1()
    connectParams.version = dcgm_structs.c_dcgmConnectV2Params_version
    #persistAfterDisconnect=0 means the host engine should free this
    #client's objects when the connection drops
    connectParams.persistAfterDisconnect = 0
    cleanupHandle = dcgm_agent.dcgmConnect_v2('localhost', connectParams)

    groupName = 'clientcleanupgroup'
    groupId = dcgm_agent.dcgmGroupCreate(cleanupHandle,
                                         dcgm_structs.DCGM_GROUP_EMPTY,
                                         groupName)

    fieldGroupName = 'clientcleanupfieldgroup'
    fieldGroupId = dcgm_agent.dcgmFieldGroupCreate(cleanupHandle,
                                                   fieldGroupFieldIds,
                                                   fieldGroupName)

    #Disconnect our second handle. This should cause the cleanup to occur
    dcgm_agent.dcgmDisconnect(cleanupHandle)

    time.sleep(1.0)  #Allow connection cleanup to occur since it's asynchronous

    #Try to retrieve the field group info. This should throw an exception
    with test_utils.assert_raises(
            dcgm_structs.dcgmExceptionClass(dcgm_structs.DCGM_ST_NO_DATA)):
        fieldGroupInfo = dcgm_agent.dcgmFieldGroupGetInfo(handle, fieldGroupId)

    #Try to retrieve the group info. This should throw an exception
    with test_utils.assert_raises(
            dcgm_structs.dcgmExceptionClass(
                dcgm_structs.DCGM_ST_NOT_CONFIGURED)):
        groupInfo = dcgm_agent.dcgmGroupGetInfo(handle, groupId)
Ejemplo n.º 9
0
def helper_dcgm_group_get_grp_info(handle, gpuIds):
    """Add every GPU to a fresh empty group and verify the membership,
    plus that an invalid group id is rejected."""
    handleObj = pydcgm.DcgmHandle(handle=handle)
    systemObj = handleObj.GetSystem()
    groupObj = systemObj.GetEmptyGroup("test1")

    requestedIds = gpuIds
    assert len(requestedIds) > 0, "Failed to get devices from the node"

    for gpuId in requestedIds:
        groupObj.AddGpu(gpuId)

    # We used to test fetching negative value throws Bad Param error here.
    # This was only a usecase because we we mixing signed and unsigned values
    # Now we're just testing that passing an invalid group ID results in the
    # expected NOT_CONFIGURED error.
    with test_utils.assert_raises(
            dcgmExceptionClass(dcgm_structs.DCGM_ST_NOT_CONFIGURED)):
        ret = dcgm_agent.dcgmGroupGetInfo(handle, -1)

    actualIds = groupObj.GetGpuIds()
    assert requestedIds == actualIds, "Expected all GPUs from %s to be added. Got %s" % (
        str(requestedIds), str(actualIds))
Ejemplo n.º 10
0
 def GetEntities(self):
     """Return the entity descriptors currently in this group."""
     info = dcgm_agent.dcgmGroupGetInfo(self._dcgmHandle.handle, self._groupId)
     return info.entityList[0:info.count]
Ejemplo n.º 11
0
def test_dcgm_run_diagnostic_validate(handle, gpuIds):
    """
    Verify that the diagnostic/validate entry points reject bad
    structure versions.
    """
    handleObj = pydcgm.DcgmHandle(handle=handle)
    systemObj = handleObj.GetSystem()
    allGpuIds = systemObj.discovery.GetAllGpuIds()
    assert len(allGpuIds) >= 0, \
        "Not able to find devices on the node for embedded case"

    groupId = dcgm_agent.dcgmGroupCreate(handle,
                                         dcgm_structs.DCGM_GROUP_DEFAULT,
                                         "test1")
    groupInfo = dcgm_agent.dcgmGroupGetInfo(handle, groupId)
    status_handle = dcgm_agent.dcgmStatusCreate()

    diagLevel = dcgm_structs.DCGM_DIAG_LVL_SHORT

    # Comma-separated GPU id list, e.g. "0,1,2".
    gpuIdStr = ",".join(str(gpuId) for gpuId in gpuIds)

    drd = dcgm_structs.c_dcgmRunDiag_t()
    drd.version = dcgm_structs.dcgmRunDiag_version
    drd.validate = dcgm_structs.DCGM_POLICY_VALID_SV_SHORT
    drd.groupId = groupId
    drd.gpuList = gpuIdStr

    # Each API must reject both a zero version and an arbitrary bogus one.
    for versionTest in (0, 50):
        with test_utils.assert_raises(
                dcgmExceptionClass(dcgm_structs.DCGM_ST_VER_MISMATCH)):
            ret = vtDcgmActionValidate_v2(handle, drd, versionTest)

    for versionTest in (0, 50):
        with test_utils.assert_raises(
                dcgmExceptionClass(dcgm_structs.DCGM_ST_VER_MISMATCH)):
            ret = vtDcgmActionValidate(handle, drd.groupId, drd.validate,
                                       versionTest)

    for versionTest in (0, 50):
        with test_utils.assert_raises(
                dcgmExceptionClass(dcgm_structs.DCGM_ST_VER_MISMATCH)):
            ret = vtDcgmRunDiagnostic(handle, drd.groupId, diagLevel,
                                      versionTest)
Ejemplo n.º 12
0
## Entry point for this script
if __name__ == "__main__":

    ## Initialize the DCGM Engine as manual operation mode. This implies that it's execution is 
    ## controlled by the monitoring agent. The user has to periodically call APIs such as 
    ## dcgmEnginePolicyTrigger and dcgmEngineUpdateAllFields which tells DCGM to wake up and 
    ## perform data collection and operations needed for policy management.
    with RunDCGM('127.0.0.1', dcgm_structs.DCGM_OPERATION_MODE_MANUAL) as handle:

        ## Create a default group. (Default group is comprised of all the GPUs on the node)
        ## Let's call the group as "all_gpus_group". The method returns an opaque handle (groupId) to
        ## identify the newly created group.
        groupId = dcgm_agent.dcgmGroupCreate(handle, dcgm_structs.DCGM_GROUP_DEFAULT, "all_gpus_group")

        ## Invoke method to get information on the newly created group
        groupInfo = dcgm_agent.dcgmGroupGetInfo(handle, groupId)

        ## Create reference to DCGM status handler which can be used to get the statuses for multiple 
        ## operations on one or more devices present in the group
        status_handle = dcgm_agent.dcgmStatusCreate()

        ## The worker function can be executed as a separate thread or as part of the main thread.
        ## Executed as a separate thread here
        thread = Thread(target = agent_worker_function, args = (handle, groupId, groupInfo, status_handle))
        thread.start()

        ##########################################
        # Any other useful work can be placed here
        ##########################################

        ## Wait for the worker to finish before RunDCGM tears the engine down.
        ## NOTE(review): groupId and status_handle are never destroyed here —
        ## presumably acceptable because the engine exits right after; confirm.
        thread.join()
Ejemplo n.º 13
0
        dcgm_fields.DCGM_FI_DEV_AUTOBOOST,
    ])
## Enable auto boost configuration only if the device supports the field
if (fvSupported[0].value.i64 != dcgmvalue.DCGM_INT64_NOT_SUPPORTED):
    autoBoost_set = 1
    # Bug fix: Python 2 print statement; the rest of the file uses the
    # Python 3 print() function.
    print("configure autobost")

assert attributesForDevices[
    0].vpStates.count > 0, "Can't find clocks for the device"
total_clocks = attributesForDevices[0].vpStates.count
# Pick the middle supported clock pair. Bug fix: '/' yields a float in
# Python 3 and would raise TypeError when used as an index; use '//'.
proc_clk_set = attributesForDevices[0].vpStates.vpState[total_clocks //
                                                        2].procClk
mem_clk_set = attributesForDevices[0].vpStates.vpState[total_clocks // 2].memClk

## Always Switch the ecc mode
ecc_set = 1
groupInfo = dcgm_agent.dcgmGroupGetInfo(handle, groupId,
                                        dcgm_structs.c_dcgmGroupInfo_version2)
config_values = dcgm_agent.dcgmConfigGet(
    handle, groupId, dcgm_structs.DCGM_CONFIG_CURRENT_STATE, groupInfo.count,
    0)
assert len(config_values) > 0, "Failed to work with NULL status handle"
eccmodeOnGroupExisting = config_values[0].mEccMode

# Toggle ECC relative to the current setting
if eccmodeOnGroupExisting == 0:
    ecc_set = 1
else:
    ecc_set = 0

syncboost_set = 1
compute_set = dcgm_structs.DCGM_CONFIG_COMPUTEMODE_DEFAULT

config_values = dcgm_structs.c_dcgmDeviceConfig_v1()
Ejemplo n.º 14
0
    ## perform data collection and operations needed for policy management.
    with RunDCGM('127.0.0.1',
                 dcgm_structs.DCGM_OPERATION_MODE_MANUAL) as handle:

        # The validate information should be packed in the dcgmRunDiag object
        runDiagInfo = dcgm_structs.c_dcgmRunDiag_v7()
        runDiagInfo.version = dcgm_structs.dcgmRunDiag_version7

        ## Create a default group. (Default group is comprised of all the GPUs on the node)
        ## Let's call the group as "all_gpus_group". The method returns an opaque handle (groupId) to
        ## identify the newly created group.
        runDiagInfo.groupId = dcgm_agent.dcgmGroupCreate(
            handle, dcgm_structs.DCGM_GROUP_DEFAULT, "all_gpus_group")

        ## Invoke method to get information on the newly created group
        groupInfo = dcgm_agent.dcgmGroupGetInfo(handle, runDiagInfo.groupId)

        ## define the actions and validations for those actions to take place
        runDiagInfo.validate = dcgm_structs.DCGM_POLICY_VALID_SV_SHORT

        ## This will go ahead and perform a "prologue" diagnostic
        ## to make sure everything is ready to run
        ## currently this calls an outside diagnostic binary but eventually
        ## that binary will be merged into the DCGM framework
        ## The "response" is a dcgmDiagResponse structure that can be parsed for errors
        response = dcgm_agent.dcgmActionValidate_v2(handle, runDiagInfo)

        ## This will perform an "epilogue" diagnostic that will stress the system
        ## Currently commented out because it takes several minutes to execute
        # runDiagInfo.validate = dcgm_structs.DCGM_POLICY_VALID_SV_LONG
        #response = dcgm_agent.dcgmActionValidate_v2(handle, dcgmRunDiagInfo)
Ejemplo n.º 15
0
def test_dcgm_configure_ecc_mode(handle, gpuIds):
    """Toggle the ECC mode on one ECC-capable GPU and verify the new
    setting is reported back (or warn if a reboot is still pending)."""
    test_utils.skip_test("Skipping this test until bug 200377294 is fixed")

    groupId = dcgm_agent.dcgmGroupCreate(handle, dcgm_structs.DCGM_GROUP_EMPTY,
                                         "test1")

    # Find the first GPU that actually supports ECC; skip the test otherwise.
    validDevice = -1
    for x in gpuIds:
        fvSupported = dcgm_agent_internal.dcgmGetLatestValuesForFields(
            handle, x, [
                dcgm_fields.DCGM_FI_DEV_ECC_CURRENT,
            ])
        if (fvSupported[0].value.i64 != dcgmvalue.DCGM_INT64_NOT_SUPPORTED):
            validDevice = x
            break

    if (validDevice == -1):
        test_utils.skip_test(
            "Can only run if at least one GPU with ECC is present")

    ret = dcgm_agent.dcgmGroupAddDevice(handle, groupId, validDevice)
    assert (ret == dcgm_structs.DCGM_ST_OK
            ), "Failed to add a device to the group %d. Return %d" % (
                groupId.value, ret)

    groupInfo = dcgm_agent.dcgmGroupGetInfo(handle, groupId)

    #Create a status handle
    status_handle = dcgm_agent.dcgmStatusCreate()

    ## Get original ECC mode on the device
    config_values = dcgm_agent.dcgmConfigGet(
        handle, groupId, dcgm_structs.DCGM_CONFIG_CURRENT_STATE,
        groupInfo.count, status_handle)
    assert len(
        config_values) > 0, "Failed to get configuration using dcgmConfigGet"

    # Flip whatever the current ECC mode is
    eccmodeOnGroupExisting = config_values[0].mEccMode
    if eccmodeOnGroupExisting == 0:
        eccmodeOnGroupToSet = 1
    else:
        eccmodeOnGroupToSet = 0

    #print eccmodeOnGroupExisting
    #print eccmodeOnGroupToSet

    ## Toggle the ECC mode on the group; all other fields are left blank
    ## so only ECC is changed.
    config_values = dcgm_structs.c_dcgmDeviceConfig_v1()
    config_values.mEccMode = eccmodeOnGroupToSet
    config_values.mPerfState.syncBoost = dcgmvalue.DCGM_INT32_BLANK
    config_values.mPerfState.targetClocks.memClock = dcgmvalue.DCGM_INT32_BLANK
    config_values.mPerfState.targetClocks.smClock = dcgmvalue.DCGM_INT32_BLANK
    config_values.mComputeMode = dcgmvalue.DCGM_INT32_BLANK
    config_values.mPowerLimit.type = dcgmvalue.DCGM_INT32_BLANK
    config_values.mPowerLimit.val = dcgmvalue.DCGM_INT32_BLANK

    #Clear the status handle to log the errors while setting the config
    ret = dcgm_agent.dcgmStatusClear(status_handle)
    assert ret == dcgm_structs.DCGM_ST_OK, "Failed to clear the status handle. Return %d" % ret

    try:
        ret = dcgm_agent.dcgmConfigSet(handle, groupId, config_values,
                                       status_handle)
    except dcgm_structs.DCGMError as e:
        # Deliberate best-effort: per-field failures are read back from the
        # status handle below instead of failing here.
        pass

    errors = helper_get_status_list(status_handle)

    # Any per-field error means we could not toggle ECC on this box; skip
    # rather than fail.
    if len(errors) > 0:
        for error in errors:
            if error.status == dcgm_structs.DCGM_ST_RESET_REQUIRED:
                test_utils.skip_test(
                    "Skipping the test - Unable to reset the Gpu, FieldId - %d, Return - %d"
                    % (error.fieldId, error.status))
            else:
                test_utils.skip_test(
                    "Skipping the test - Unable to set the ECC mode. FieldId - %d, Return %d"
                    % (error.fieldId, error.status))

    #Sleep after reset
    time.sleep(2)

    #Clear the status handle to log the errors while setting the config
    ret = dcgm_agent.dcgmStatusClear(status_handle)
    assert ret == dcgm_structs.DCGM_ST_OK, "Failed to clear the status handle. Return %d" % ret

    #Get the current configuration
    config_values = dcgm_agent.dcgmConfigGet(
        handle, groupId, dcgm_structs.DCGM_CONFIG_CURRENT_STATE,
        groupInfo.count, status_handle)
    assert len(
        config_values) > 0, "Failed to get configuration using dcgmConfigGet"

    # If pending != current ECC, the change needs a reboot to take effect,
    # so only warn; otherwise assert the toggle took.
    fvs = dcgm_agent_internal.dcgmGetLatestValuesForFields(
        handle, validDevice, [
            dcgm_fields.DCGM_FI_DEV_ECC_PENDING,
            dcgm_fields.DCGM_FI_DEV_ECC_CURRENT
        ])
    if fvs[0].value.i64 != fvs[1].value.i64:
        logger.warning(
            "Pending ECC %d != Current ECC %d for gpuId %d. Box probably needs a reboot"
            % (fvs[0].value.i64, fvs[1].value.i64, validDevice))
    else:
        assert config_values[0].mEccMode == (eccmodeOnGroupToSet), "ECC mode %d different from the set value %d" % \
                                                                   (config_values[0].mEccMode, eccmodeOnGroupToSet)
Ejemplo n.º 16
0
def test_dcgm_vgpu_configure_ecc_mode(handle, gpuIds):
    """Toggle the ECC mode via the vGPU config APIs on one ECC-capable GPU
    and verify the new setting is reported back."""
    test_utils.skip_test("Skipping this test until bug 200377294 is fixed")

    groupId = dcgm_agent.dcgmGroupCreate(handle, dcgm_structs.DCGM_GROUP_EMPTY,
                                         "test1")

    # Find the first GPU that supports ECC.
    validDevice = -1
    for x in gpuIds:
        fvSupported = dcgm_agent_internal.dcgmGetLatestValuesForFields(
            handle, x, [dcgm_fields.DCGM_FI_DEV_RETIRED_DBE])
        if (fvSupported[0].value.i64 != dcgmvalue.DCGM_INT64_NOT_SUPPORTED):
            validDevice = x
            # Bug fix: the break was outside the if, so the loop always
            # stopped after the first GPU even when it lacked ECC support
            # (compare test_dcgm_configure_ecc_mode above).
            break

    if (validDevice == -1):
        test_utils.skip_test(
            "Can only run if at least one GPU with ECC is present")

    ret = dcgm_agent.dcgmGroupAddDevice(handle, groupId, validDevice)
    assert (
        ret == dcgm_structs.DCGM_ST_OK
    ), "Failed to add a device to the group %d. Return %d" % (groupId, ret)

    groupInfo = dcgm_agent.dcgmGroupGetInfo(handle, groupId)

    #Create a status handle
    status_handle = dcgm_agent.dcgmStatusCreate()

    ## Get original ECC mode on the device
    vgpu_config_values = dcgm_agent_internal.dcgmVgpuConfigGet(
        handle, groupId, dcgm_structs.DCGM_CONFIG_CURRENT_STATE,
        groupInfo.count, status_handle)
    assert len(
        vgpu_config_values) > 0, "Failed to work with NULL status handle"

    # Flip whatever the current ECC mode is
    eccmodeOnGroupExisting = vgpu_config_values[0].mEccMode
    if eccmodeOnGroupExisting == 0:
        eccmodeOnGroupToSet = 1
    else:
        eccmodeOnGroupToSet = 0

    ## Toggle the ECC mode on the group; other fields are left blank so
    ## only ECC is changed.
    vgpu_config_values = dcgm_structs.c_dcgmDeviceVgpuConfig_v1()
    vgpu_config_values.mEccMode = eccmodeOnGroupToSet
    vgpu_config_values.mComputeMode = dcgmvalue.DCGM_INT32_BLANK
    vgpu_config_values.mPowerLimit.type = dcgmvalue.DCGM_INT32_BLANK
    vgpu_config_values.mPowerLimit.val = dcgmvalue.DCGM_INT32_BLANK

    #Clear the status handle to log the errors while setting the config
    ret = dcgm_agent.dcgmStatusClear(status_handle)
    assert ret == dcgm_structs.DCGM_ST_OK, "Failed to clear the status handle. Return %d" % ret

    try:
        ret = dcgm_agent_internal.dcgmVgpuConfigSet(handle, groupId,
                                                    vgpu_config_values,
                                                    status_handle)
    except dcgm_structs.DCGMError as e:
        # Deliberate best-effort: per-field failures are read back from the
        # status handle below instead of failing here.
        pass

    errors = helper_get_status_list(status_handle)

    # Any per-field error means ECC could not be toggled; skip, don't fail.
    if len(errors) > 0:
        for error in errors:
            if error.status == dcgm_structs.DCGM_ST_RESET_REQUIRED:
                test_utils.skip_test(
                    "Skipping the test - Unable to reset the Gpu, FieldId - %d, Return - %d"
                    % (error.fieldId, error.status))
            else:
                test_utils.skip_test(
                    "Skipping the test - Unable to set the ECC mode. FieldId - %d, Return %d"
                    % (error.fieldId, error.status))

    #Sleep after reset and then apply update for it to occur
    time.sleep(2)

    dcgm_agent.dcgmUpdateAllFields(handle, 1)

    #Clear the status handle to log the errors while setting the config
    ret = dcgm_agent.dcgmStatusClear(status_handle)
    assert ret == dcgm_structs.DCGM_ST_OK, "Failed to clear the status handle. Return %d" % ret

    #Get the current configuration
    config_values = dcgm_agent_internal.dcgmVgpuConfigGet(
        handle, groupId, dcgm_structs.DCGM_CONFIG_CURRENT_STATE,
        groupInfo.count, status_handle)
    assert len(config_values
               ) > 0, "Failed to get configuration using dcgmiVgpuConfigGet"

    assert config_values[0].mEccMode == (
        eccmodeOnGroupToSet), "ECC mode different from the set value"