Exemple #1
0
def test_dcgm_engine_unwatch_field_value(handle):
    """
    Verifies that the cache manager drops a watcher when a field is unwatched.

    A watch is placed on DCGM_FI_DEV_NAME for GPU 0, the watcher count is
    recorded, the watch is removed again and the count is expected to have
    dropped by exactly one. When no watcher is left afterwards, the field
    must no longer carry the DCGM_CMI_F_WATCHED flag.
    """
    gpuId = 0
    fieldId = dcgm_fields.DCGM_FI_DEV_NAME

    def queryFieldInfo():
        # Always ask the cache manager again so each check sees current state
        return dcgm_agent_internal.dcgmGetCacheManagerFieldInfo(
            handle, gpuId, fieldId)

    # Place a watch first so that there is something to remove
    status = dcgm_agent_internal.dcgmWatchFieldValue(
        handle, gpuId, fieldId, 10000000, 86400.0, 0)
    assert status == dcgm_structs.DCGM_ST_OK

    watchersBefore = queryFieldInfo().numWatchers

    # Remove the watch again
    clearCache = 1
    status = dcgm_agent_internal.dcgmUnwatchFieldValue(
        handle, gpuId, fieldId, clearCache)
    assert status == dcgm_structs.DCGM_ST_OK

    infoAfter = queryFieldInfo()
    watchersAfter = infoAfter.numWatchers

    assert watchersAfter == watchersBefore - 1, \
        "Expected 1 fewer watcher. Before %d. After %d" % (
            watchersBefore, watchersAfter)

    # The WATCHED flag may only remain set while other watchers still exist
    watchedBits = infoAfter.flags & dcgm_structs_internal.DCGM_CMI_F_WATCHED
    assert watchersAfter > 0 or watchedBits == 0, \
        "Expected no watch. got flags %08X" % infoAfter.flags
Exemple #2
0
def test_dcgm_engine_watch_field_values(handle):
    """
    Verifies that watching a field registers one more watcher.

    The watcher count of DCGM_FI_DEV_NAME on GPU 0 is sampled before the
    watch (a DCGM_ST_NOT_WATCHED error counts as zero watchers), the field is
    watched, and afterwards the field must be flagged as watched with exactly
    one additional watcher.
    """
    gpuId = 0
    fieldId = dcgm_fields.DCGM_FI_DEV_NAME

    # Baseline: a field that has never been watched reports NOT_WATCHED
    try:
        infoBefore = dcgm_agent_internal.dcgmGetCacheManagerFieldInfo(
            handle, gpuId, fieldId)
    except dcgm_structs.dcgmExceptionClass(dcgm_structs.DCGM_ST_NOT_WATCHED):
        watchersBefore = 0
    else:
        watchersBefore = infoBefore.numWatchers

    status = dcgm_agent_internal.dcgmWatchFieldValue(
        handle, gpuId, fieldId, 10000000, 86400.0, 0)
    assert status == dcgm_structs.DCGM_ST_OK

    infoAfter = dcgm_agent_internal.dcgmGetCacheManagerFieldInfo(
        handle, gpuId, fieldId)
    assert infoAfter.flags & dcgm_structs_internal.DCGM_CMI_F_WATCHED, \
        "Expected watch. got flags %08X" % infoAfter.flags

    watchersAfter = infoAfter.numWatchers
    assert watchersAfter == watchersBefore + 1, \
        "Expected 1 extra watcher. Before %d. After %d" % (
            watchersBefore, watchersAfter)
Exemple #3
0
def helper_unwatch_field_values_public(handle, gpuIds):
    """
    Verifies that dcgm can unwatch a field value through the public API.

    For every GPU in gpuIds the watcher count of DCGM_FI_DEV_NAME is sampled
    three times: before watching, while watched through a field group, and
    after unwatching. Watching must add exactly one watcher per GPU and
    unwatching must restore the original counts.
    """
    fieldId = dcgm_fields.DCGM_FI_DEV_NAME

    handleObj = pydcgm.DcgmHandle(handle=handle)
    systemObj = handleObj.GetSystem()
    groupObj = systemObj.GetGroupWithGpuIds('mygroup', gpuIds)
    fieldGroup = pydcgm.DcgmFieldGroup(handleObj, "myfieldgroup", [fieldId])

    updateFreq = 10000000
    maxKeepAge = 86400
    maxKeepSamples = 0

    def queryFieldInfo(gpuId):
        return dcgm_agent_internal.dcgmGetCacheManagerFieldInfo(
            handleObj.handle, gpuId, fieldId)

    #gpuId -> watcher count before our test begins
    numWatchersBefore = {}
    for gpuId in gpuIds:
        try:
            numWatchersBefore[gpuId] = queryFieldInfo(gpuId).numWatchers
        except dcgm_structs.dcgmExceptionClass(
                dcgm_structs.DCGM_ST_NOT_WATCHED):
            #A field nobody watches yet counts as zero watchers
            numWatchersBefore[gpuId] = 0

    #Now watch the fields
    groupObj.samples.WatchFields(fieldGroup, updateFreq, maxKeepAge,
                                 maxKeepSamples)

    #Each GPU must have gained exactly one watcher
    for gpuId in gpuIds:
        withWatch = queryFieldInfo(gpuId).numWatchers
        expected = numWatchersBefore[gpuId] + 1
        assert withWatch == expected, \
            "Watcher mismatch at gpuId %d, numWatchersWithWatch[gpuId] %d != numWatchersBefore[gpuId] %d + 1" % \
            (gpuId, withWatch, numWatchersBefore[gpuId])

    #Unwatch fields
    groupObj.samples.UnwatchFields(fieldGroup)

    #gpuId -> watcher count after our unwatch. This should match our original watch count
    numWatchersAfter = {
        gpuId: queryFieldInfo(gpuId).numWatchers
        for gpuId in gpuIds
    }

    assert numWatchersBefore == numWatchersAfter, \
        "Expected numWatchersBefore (%s) to match numWatchersAfter %s" % \
        (str(numWatchersBefore), str(numWatchersAfter))
Exemple #4
0
def test_dcgm_prof_all_supported_fields_watchable(handle, gpuId):
    """
    Verify that all fields that are reported as supported are watchable and
    that values can be returned for them.

    Each supported field ID is watched on its own for the single GPU gpuId.
    While the watch is active, the cache manager info must show the field as
    watched by exactly one watcher at the requested frequency with an OK
    status, and the latest value must have an OK status and a non-zero
    timestamp. After unwatching, the watched flag and the watcher count must
    both be cleared.
    """
    dcgmHandle = pydcgm.DcgmHandle(handle=handle)
    dcgmSystem = dcgmHandle.GetSystem()
    dcgmGroup = dcgmSystem.GetGroupWithGpuIds('mygroup', [gpuId])

    helper_check_profiling_environment(dcgmGroup)

    fieldIds = helper_get_supported_field_ids(dcgmGroup)
    assert fieldIds is not None

    watchFreq = 1000 #1 ms
    maxKeepAge = 60.0
    maxKeepSamples = 0

    entityPairList = [dcgm_structs.c_dcgmGroupEntityPair_t(dcgm_fields.DCGM_FE_GPU, gpuId)]

    for fieldId in fieldIds:
        # If the group contains only unsupported SKUs, WatchFields should return an error.
        # If at least one GPU in the group is supported, WatchFields will be successful.
        # This logic is used to skip unsupported or fake SKUs.
        if dcgmGroup.profiling.WatchFields([fieldId, ], watchFreq, maxKeepAge,
                                           maxKeepSamples) == dcgm_structs.DCGM_ST_PROFILING_NOT_SUPPORTED:
            test_utils.skip_test_supported("DCP")

        # Sending a request to the profiling manager guarantees that an update cycle has happened since
        # the last request
        dcgmGroup.profiling.GetSupportedMetricGroups()

        # Validate watch freq, quota, and watched flags
        cmfi = dcgm_agent_internal.dcgmGetCacheManagerFieldInfo(handle, gpuId, fieldId)
        assert (cmfi.flags & dcgm_structs_internal.DCGM_CMI_F_WATCHED) != 0, "gpuId %u, fieldId %u not watched" % (gpuId, fieldId)
        assert cmfi.numSamples > 0
        assert cmfi.numWatchers == 1, "numWatchers %d" % cmfi.numWatchers
        assert cmfi.monitorFrequencyUsec == watchFreq, "monitorFrequencyUsec %u != watchFreq %u" % (cmfi.monitorFrequencyUsec, watchFreq)
        assert cmfi.lastStatus == dcgm_structs.DCGM_ST_OK, "lastStatus %u != DCGM_ST_OK" % (cmfi.lastStatus)

        fieldValues = dcgm_agent.dcgmEntitiesGetLatestValues(handle, entityPairList, [fieldId, ], 0)

        for i, fieldValue in enumerate(fieldValues):
            logger.debug(str(fieldValue))
            assert(fieldValue.status == dcgm_structs.DCGM_ST_OK), "idx %d status was %d" % (i, fieldValue.status)
            assert(fieldValue.ts != 0), "idx %d timestamp was 0" % (i)

        dcgmGroup.profiling.UnwatchFields()

        # Validate watch flags after unwatch
        cmfi = dcgm_agent_internal.dcgmGetCacheManagerFieldInfo(handle, gpuId, fieldId)
        assert (cmfi.flags & dcgm_structs_internal.DCGM_CMI_F_WATCHED) == 0, "gpuId %u, fieldId %u still watched. flags x%X" % (gpuId, fieldId, cmfi.flags)
        assert cmfi.numWatchers == 0, "numWatchers %d" % cmfi.numWatchers
Exemple #5
0
def helper_promote_field_values_watch_public(handle, gpuIds):
    """
    Verifies that dcgm can update a field value watch.

    DCGM_FI_DEV_NAME is watched twice through the same field group, the
    second time with a faster update frequency. After each watch the cache
    manager must report the requested frequency for every GPU, and the
    watcher counts must be identical after both watches.
    """
    fieldId = dcgm_fields.DCGM_FI_DEV_NAME

    handleObj = pydcgm.DcgmHandle(handle=handle)
    systemObj = handleObj.GetSystem()
    groupObj = systemObj.GetGroupWithGpuIds('mygroup', gpuIds)
    fieldGroup = pydcgm.DcgmFieldGroup(handleObj, "myfieldgroup", [fieldId])

    maxKeepAge = 3600
    maxKeepSamples = 0

    def watchAndCollect(updateFreq):
        #Watch the fields at updateFreq, then return gpuId -> watcher count
        #after checking that the cache manager reports that frequency
        groupObj.samples.WatchFields(fieldGroup, updateFreq, maxKeepAge,
                                     maxKeepSamples)
        watcherCounts = {}
        for gpuId in gpuIds:
            fieldInfo = dcgm_agent_internal.dcgmGetCacheManagerFieldInfo(
                handleObj.handle, gpuId, fieldId)
            watcherCounts[gpuId] = fieldInfo.numWatchers
            assert fieldInfo.monitorFrequencyUsec == updateFreq, \
                "after watch: fieldInfo.monitorFrequencyUsec %d != updateFreq %d" % \
                (fieldInfo.monitorFrequencyUsec, updateFreq)
        return watcherCounts

    #Track the number of watchers to make sure our watch promotion doesn't create another sub-watch
    #but rather updates the existing one
    numWatchersWithWatch = watchAndCollect(100000)  #100 msec

    #Update the watch with a faster update frequency
    numWatchersAfter = watchAndCollect(50000)  #50 msec

    assert numWatchersWithWatch == numWatchersAfter, \
        "numWatchersWithWatch (%s) != numWatchersAfter (%s)" % \
        (str(numWatchersWithWatch), str(numWatchersAfter))
Exemple #6
0
totalSampleCount = 0

cycleCount = 0

# Endless polling loop: each cycle walks every (gpuId, fieldId) pair and looks
# at the cache manager info of fields that updated within the last 60 seconds.
while True:
    cycleCount += 1
    print("Cycle %d. Fields that updated in the last 60 seconds" % cycleCount)

    driverTimeByFieldId = {}
    watchIntervalByFieldId = {}

    for gpuId in gpuIds:
        for fieldId in fieldIds:
            try:
                watchInfo = dcgm_agent_internal.dcgmGetCacheManagerFieldInfo(dcgmHandle.handle, gpuId, fieldId)
            except Exception:
                # Best effort: a field whose info cannot be fetched is skipped.
                # Only Exception is caught so that Ctrl-C (KeyboardInterrupt)
                # and SystemExit can still break out of this endless loop.
                continue

            if watchInfo is None:
                continue

            nowTs = int(time.time() * 1000000)
            oneMinuteAgoTs = nowTs - 60000000

            # Ignore fields whose newest sample is older than one minute
            if watchInfo.newestTimestamp < oneMinuteAgoTs:
                continue

            # Average execution time per fetch; stays 0 when nothing was fetched
            perUpdate = 0
            if watchInfo.fetchCount > 0:
                perUpdate = watchInfo.execTimeUsec / watchInfo.fetchCount
Exemple #7
0
def test_dcgm_injection_agent(handle, gpuIds):
    """
    Verifies that injection works with the agent host engine.

    A valid int64 sample for DCGM_FI_DEV_ECC_CURRENT is injected on the first
    GPU of gpuIds; the sample count must grow and the fetched latest value
    must equal what was injected. A zero timestamp must also be accepted,
    while a mismatching field type or an out-of-range field ID must be
    rejected with DCGM_ST_BADPARAM.
    """
    gpuId = gpuIds[0]

    def inject(fieldValue):
        #dcgmInjectFieldValue throws an exception if it fails
        dcgm_agent_internal.dcgmInjectFieldValue(handle, gpuId, fieldValue)

    def expectBadParam(fieldValue):
        with test_utils.assert_raises(
                dcgmExceptionClass(dcgm_structs.DCGM_ST_BADPARAM)):
            inject(fieldValue)

    #Make a base value that is good for starters
    fvGood = dcgm_structs_internal.c_dcgmInjectFieldValue_v1()
    fvGood.version = dcgm_structs_internal.dcgmInjectFieldValue_version1
    fvGood.fieldId = dcgm_fields.DCGM_FI_DEV_ECC_CURRENT
    fvGood.status = 0
    fvGood.fieldType = ord(dcgm_fields.DCGM_FT_INT64)
    fvGood.ts = get_usec_since_1970()
    fvGood.value.i64 = 1

    countBefore = dcgm_agent_internal.dcgmGetCacheManagerFieldInfo(
        handle, gpuId, fvGood.fieldId).numSamples

    inject(fvGood)

    countAfter = dcgm_agent_internal.dcgmGetCacheManagerFieldInfo(
        handle, gpuId, fvGood.fieldId).numSamples

    assert countAfter > countBefore, \
        "Expected countAfter %d > countBefore %d after injection" % (
            countAfter, countBefore)

    #Fetch the value we just inserted and verify its attributes are the same
    latestValues = dcgm_agent_internal.dcgmGetLatestValuesForFields(
        handle, gpuId, [fvGood.fieldId])
    helper_verify_fv_equal(latestValues[0], fvGood)

    #Should be able to insert a null timestamp. The agent will just use "now".
    #Note that fvAlsoGood is an alias of fvGood, not a copy, so fvGood.ts
    #stays 0 for the rest of the test.
    fvAlsoGood = fvGood
    fvAlsoGood.ts = 0
    inject(fvAlsoGood)

    #Now make some attributes bad and expect an error. fvBad aliases fvGood
    #as well, which is why the field type is restored afterwards.
    fvBad = fvGood
    fvBad.fieldType = ord(dcgm_fields.DCGM_FT_DOUBLE)
    expectBadParam(fvBad)
    fvGood.fieldType = ord(dcgm_fields.DCGM_FT_INT64)

    #TODO: DCGM-2130 - Restore this test when protobuf is removed
    #    #Now make some attributes bad and expect an error
    #    fvBad = fvGood
    #    fvBad.version = 0
    #    with test_utils.assert_raises(dcgmExceptionClass(dcgm_structs.DCGM_ST_VER_MISMATCH)):
    #        dcgm_agent_internal.dcgmInjectFieldValue(handle, gpuId, fvBad)
    #
    #    fvGood.version = dcgm_structs_internal.dcgmInjectFieldValue_version1

    #A field ID beyond the known range must be rejected too
    fvBad = fvGood
    fvBad.fieldId = dcgm_fields.DCGM_FI_MAX_FIELDS + 100
    expectBadParam(fvBad)
Exemple #8
0
def test_dcgm_prof_all_supported_fields_watchable(handle, gpuIds):
    """
    Verify that all fields that are reported as supported are watchable and
    that values can be returned for them.

    Each supported field ID is watched on its own for the whole group of
    gpuIds. While the watch is active, every GPU must report the field as
    watched by exactly one watcher at the requested frequency with an OK
    status, and every latest value must have an OK status and a non-zero
    timestamp. After unwatching, the watched flag and the watcher count must
    be cleared on every GPU.
    """
    dcgmHandle = pydcgm.DcgmHandle(handle=handle)
    dcgmSystem = dcgmHandle.GetSystem()
    dcgmGroup = dcgmSystem.GetGroupWithGpuIds('mygroup', gpuIds)

    helper_check_profiling_environment(dcgmGroup)

    fieldIds = helper_get_supported_field_ids(dcgmGroup)
    assert fieldIds is not None

    watchFreq = 1000  #1 ms
    maxKeepAge = 60.0
    maxKeepSamples = 0

    # One GPU entity pair per GPU, used to fetch the latest values below
    entityPairList = [
        dcgm_structs.c_dcgmGroupEntityPair_t(dcgm_fields.DCGM_FE_GPU, gpuId)
        for gpuId in gpuIds
    ]

    for fieldId in fieldIds:
        dcgmGroup.profiling.WatchFields([
            fieldId,
        ], watchFreq, maxKeepAge, maxKeepSamples)

        # Sending a request to the profiling manager guarantees that an update cycle has happened since
        # the last request
        dcgmGroup.profiling.GetSupportedMetricGroups()

        # Validate watch freq, quota, and watched flags
        for gpuId in gpuIds:
            cmfi = dcgm_agent_internal.dcgmGetCacheManagerFieldInfo(
                handle, gpuId, fieldId)
            assert (cmfi.flags & dcgm_structs_internal.DCGM_CMI_F_WATCHED
                    ) != 0, "gpuId %u, fieldId %u not watched" % (gpuId,
                                                                  fieldId)
            assert cmfi.numSamples > 0
            assert cmfi.numWatchers == 1, "numWatchers %d" % cmfi.numWatchers
            assert cmfi.monitorFrequencyUsec == watchFreq, "monitorFrequencyUsec %u != watchFreq %u" % (
                cmfi.monitorFrequencyUsec, watchFreq)
            assert cmfi.lastStatus == dcgm_structs.DCGM_ST_OK, "lastStatus %u != DCGM_ST_OK" % (
                cmfi.lastStatus)

        fieldValues = dcgm_agent.dcgmEntitiesGetLatestValues(
            handle, entityPairList, [
                fieldId,
            ], 0)

        for i, fieldValue in enumerate(fieldValues):
            logger.debug(str(fieldValue))
            assert (fieldValue.status == dcgm_structs.DCGM_ST_OK
                    ), "idx %d status was %d" % (i, fieldValue.status)
            assert (fieldValue.ts != 0), "idx %d timestamp was 0" % (i)

        dcgmGroup.profiling.UnwatchFields()

        # Validate watch flags after unwatch
        for gpuId in gpuIds:
            cmfi = dcgm_agent_internal.dcgmGetCacheManagerFieldInfo(
                handle, gpuId, fieldId)
            assert (cmfi.flags & dcgm_structs_internal.DCGM_CMI_F_WATCHED
                    ) == 0, "gpuId %u, fieldId %u still watched. flags x%X" % (
                        gpuId, fieldId, cmfi.flags)
            assert cmfi.numWatchers == 0, "numWatchers %d" % cmfi.numWatchers