    def _InitHandles(self):
        self._dcgmHandle = pydcgm.DcgmHandle(ipAddress=self._hostname)
        
        groupName = "error_mon_gpus" + self._pidPostfix
        self._allGpusGroup = pydcgm.DcgmGroup(self._dcgmHandle, groupName=groupName, groupType=dcgm_structs.DCGM_GROUP_DEFAULT)
        print("Found %d GPUs" % (len(self._allGpusGroup.GetEntities())))

        groupName = "error_mon_nvswitches" + self._pidPostfix
        self._allNvSwitchesGroup = pydcgm.DcgmGroup(self._dcgmHandle, groupName=groupName, groupType=dcgm_structs.DCGM_GROUP_DEFAULT_NVSWITCHES)
        print("Found %d NvSwitches" % len(self._allNvSwitchesGroup.GetEntities()))

        fgName = "error_mon_nvswitches" + self._pidPostfix
        self._nvSwitchErrorFieldGroup = pydcgm.DcgmFieldGroup(self._dcgmHandle, name=fgName, fieldIds=self._nvSwitchErrorFieldIds)
        
        fgName = "error_mon_gpus" + self._pidPostfix
        self._gpuErrorFieldGroup = pydcgm.DcgmFieldGroup(self._dcgmHandle, name=fgName, fieldIds=self._gpuErrorFieldIds)

        updateFreq = int((self._updateIntervalSecs / 2.0) * 1000000) #convert to usec before truncating so sub-second intervals don't become 0
        maxKeepAge = 3600.0 #1 hour
        maxKeepSamples = 0 #Rely on maxKeepAge

        self._nvSwitchWatcher = dcgm_field_helpers.DcgmFieldGroupEntityWatcher(
            self._dcgmHandle.handle, self._allNvSwitchesGroup.GetId(), 
            self._nvSwitchErrorFieldGroup, dcgm_structs.DCGM_OPERATION_MODE_AUTO,
            updateFreq, maxKeepAge, maxKeepSamples, 0)
        self._gpuWatcher = dcgm_field_helpers.DcgmFieldGroupEntityWatcher(
            self._dcgmHandle.handle, self._allGpusGroup.GetId(), 
            self._gpuErrorFieldGroup, dcgm_structs.DCGM_OPERATION_MODE_AUTO,
            updateFreq, maxKeepAge, maxKeepSamples, 0)
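For symmetry, a teardown helper is worth having. This is a minimal sketch, not part of the original class, assuming only the attributes created above; DcgmFieldGroup.Delete() and DcgmGroup.Delete() remove the server-side objects, and DcgmHandle.Shutdown() closes the connection:

    def _ShutdownHandles(self):
        # Hypothetical cleanup counterpart to _InitHandles: delete the field
        # groups and entity groups created above, then close the handle.
        self._nvSwitchErrorFieldGroup.Delete()
        self._gpuErrorFieldGroup.Delete()
        self._allNvSwitchesGroup.Delete()
        self._allGpusGroup.Delete()
        self._dcgmHandle.Shutdown()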
Example #2
    def GetFieldMetadata(self):
        self.m_fieldIdToInfo = {}

        findByNameId = self.m_dcgmSystem.GetFieldGroupIdByName(
            self.m_fieldGroupName)

        #Remove our field group if it exists already
        if findByNameId is not None:
            delFieldGroup = pydcgm.DcgmFieldGroup(dcgmHandle=self.m_dcgmHandle,
                                                  fieldGroupId=findByNameId)
            delFieldGroup.Delete()
            del delFieldGroup

        self.m_fieldGroup = pydcgm.DcgmFieldGroup(self.m_dcgmHandle,
                                                  self.m_fieldGroupName,
                                                  self.m_publishFieldIds)

        for fieldId in self.m_fieldGroup.fieldIds:
            fieldInfo = self.m_dcgmSystem.fields.GetFieldById(fieldId)
            self.m_fieldIdToInfo[fieldId] = fieldInfo
            if fieldInfo == 0 or fieldInfo is None:
                self.LogError(
                    "Cannot get field tag for field id %d. Please check dcgm_fields to see if it is valid."
                    % fieldId)
                raise dcgm_structs.DCGMError(
                    dcgm_structs.DCGM_ST_UNKNOWN_FIELD)
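The delete-if-exists dance above reappears in several of these snippets; a small helper could factor it out. A sketch, assuming only the pydcgm calls already shown (GetFieldGroupIdByName, DcgmFieldGroup.Delete):

def _delete_field_group_if_exists(dcgmHandle, dcgmSystem, fieldGroupName):
    # Hypothetical helper: remove a previously registered field group so
    # the name can be reused.
    existingId = dcgmSystem.GetFieldGroupIdByName(fieldGroupName)
    if existingId is not None:
        pydcgm.DcgmFieldGroup(dcgmHandle=dcgmHandle,
                              fieldGroupId=existingId).Delete()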
Example #3
    def GetFieldMetadata(self):
        self.m_fieldIdToInfo = {}
        self.m_fieldGroups = {}
        self.m_fieldGroup = None
        allFieldIds = []

        # Initialize groups for all field intervals.
        self.LogInfo("GetFieldMetaData:\n")

        intervalIndex = 0
        for interval, fieldIds in self.m_publishFields.items():
            self.LogInfo("sampling interval = " + str(interval) + ":\n")
            for fieldId in fieldIds:
                self.LogInfo("   fieldId: " + str(fieldId) + "\n")

            intervalIndex += 1
            fieldGroupName = self.m_fieldGroupName + "_" + str(intervalIndex)
            findByNameId = self.m_dcgmSystem.GetFieldGroupIdByName(
                fieldGroupName)
            self.LogInfo("fieldGroupName: " + fieldGroupName + "\n")

            # Remove our field group if it exists already
            if findByNameId is not None:
                self.LogInfo("fieldGroupId: " + findByNameId + "\n")
                delFieldGroup = pydcgm.DcgmFieldGroup(
                    dcgmHandle=self.m_dcgmHandle, fieldGroupId=findByNameId)
                delFieldGroup.Delete()
                del delFieldGroup

            self.m_fieldGroups[interval] = pydcgm.DcgmFieldGroup(
                self.m_dcgmHandle, fieldGroupName, fieldIds)

            for fieldId in fieldIds:
                if fieldId not in allFieldIds:
                    allFieldIds.append(fieldId)

                fieldInfo = self.m_dcgmSystem.fields.GetFieldById(fieldId)
                self.m_fieldIdToInfo[fieldId] = fieldInfo
                if fieldInfo == 0 or fieldInfo is None:
                    self.LogError(
                        "Cannot get field tag for field id %d. Please check dcgm_fields to see if it is valid."
                        % fieldId)
                    raise dcgm_structs.DCGMError(
                        dcgm_structs.DCGM_ST_UNKNOWN_FIELD)

        # Initialize a field group of ALL fields.
        fieldGroupName = self.m_fieldGroupName
        findByNameId = self.m_dcgmSystem.GetFieldGroupIdByName(fieldGroupName)

        # Remove our field group if it exists already
        if findByNameId is not None:
            delFieldGroup = pydcgm.DcgmFieldGroup(dcgmHandle=self.m_dcgmHandle,
                                                  fieldGroupId=findByNameId)
            delFieldGroup.Delete()
            del delFieldGroup

        self.m_fieldGroup = pydcgm.DcgmFieldGroup(self.m_dcgmHandle,
                                                  fieldGroupName, allFieldIds)
Example #4
def test_dcgm_field_group_get_by_name(handle):
    fieldIds = [dcgm_fields.DCGM_FI_DRIVER_VERSION, dcgm_fields.DCGM_FI_DEV_NAME, dcgm_fields.DCGM_FI_DEV_BRAND]
    handle = pydcgm.DcgmHandle(handle)

    fieldGroupName = "mygroup"
    fieldGroupObj = pydcgm.DcgmFieldGroup(handle, fieldGroupName, fieldIds)

    findByNameId = handle.GetSystem().GetFieldGroupIdByName(fieldGroupName)

    assert findByNameId is not None, "Expected field group ID. Got None"
    assert int(findByNameId.value) == int(fieldGroupObj.fieldGroupId.value), "Got field group ID handle mismatch %s != %s" % (findByNameId, fieldGroupObj.fieldGroupId)

    #Make sure we can create an object from our found id and delete it
    fieldGroupObj2 = pydcgm.DcgmFieldGroup(dcgmHandle=handle, fieldGroupId=findByNameId)
    fieldGroupObj2.Delete()
Example #5
def test_dcgm_embedded_metadata_exectime_get_field_group_sane(handle):
    """
    Sanity test for API that gets execution time of all fields together
    """
    if not option_parser.options.developer_mode:
        test_utils.skip_test("Skipping developer test.")
    handle = pydcgm.DcgmHandle(handle)
    system = pydcgm.DcgmSystem(handle)
    group = pydcgm.DcgmGroup(handle,
                             groupName="metadata-test",
                             groupType=dcgm_structs.DCGM_GROUP_DEFAULT)

    fieldIds = [
        dcgm_fields.DCGM_FI_DEV_POWER_USAGE, dcgm_fields.DCGM_FI_DEV_SM_CLOCK,
        dcgm_fields.DCGM_FI_DEV_GPU_TEMP
    ]
    fieldGroup = pydcgm.DcgmFieldGroup(handle, "my_field_group", fieldIds)

    updateFreqUsec = 1000
    _watch_field_group_basic(fieldGroup,
                             handle.handle,
                             group.GetId(),
                             updateFreq=updateFreqUsec)
    system.introspect.UpdateAll()

    execTime = system.introspect.execTime.GetForFieldGroup(
        fieldGroup).aggregateInfo

    # test that all struct fields in the API response have reasonable values
    assert (100 < execTime.totalEverUpdateUsec <
            100 * 1000), execTime.totalEverUpdateUsec
    assert (100 < execTime.recentUpdateUsec <
            100 * 1000), execTime.recentUpdateUsec
    assert (updateFreqUsec == execTime.meanUpdateFreqUsec
            ), execTime.meanUpdateFreqUsec
Example #6
def test_dcgm_embedded_metadata_memory_get_field_sane(handle):
    '''
    Sanity test for API that gets memory usage of a single field
    '''
    if not option_parser.options.developer_mode:
        test_utils.skip_test("Skipping developer test.")
    handleObj = pydcgm.DcgmHandle(handle=handle)
    fieldIds = [
        dcgm_fields.DCGM_FI_DEV_GPU_TEMP,
    ]
    fieldGroup = pydcgm.DcgmFieldGroup(handleObj, "my_field_group", fieldIds)

    group = pydcgm.DcgmGroup(handleObj,
                             groupName="test-metadata",
                             groupType=dcgm_structs.DCGM_GROUP_DEFAULT)
    system = pydcgm.DcgmSystem(handleObj)

    _watch_field_group_basic(fieldGroup, handle, group.GetId())
    system.introspect.UpdateAll()

    memoryInfo = dcgm_agent_internal.dcgmIntrospectGetFieldMemoryUsage(
        handle, fieldIds[0])

    logger.debug("field %s using %.2f KB" %
                 (fieldIds[0], memoryInfo.aggregateInfo.bytesUsed / 1024.))

    # 0+ to 200 KB
    assert(0 < memoryInfo.aggregateInfo.bytesUsed < 1024*200), \
        'bytes used to store field was unreasonable for ID %s, bytes: %s' \
        % (fieldIds[0], memoryInfo.aggregateInfo.bytesUsed)
Example #7
def test_dcgm_embedded_metadata_memory_get_field_group_sane(handle):
    '''
    Sanity test for API that gets memory usage of a single field group
    '''
    if not option_parser.options.developer_mode:
        test_utils.skip_test("Skipping developer test.")
    handle = pydcgm.DcgmHandle(handle)
    group = pydcgm.DcgmGroup(handle,
                             groupName='test-metadata',
                             groupType=dcgm_structs.DCGM_GROUP_DEFAULT)
    system = pydcgm.DcgmSystem(handle)

    fieldIds = [
        dcgm_fields.DCGM_FI_DEV_GPU_TEMP, dcgm_fields.DCGM_FI_DEV_POWER_USAGE
    ]

    fieldGroup = pydcgm.DcgmFieldGroup(handle, "my_field_group", fieldIds)

    # ensure that the field group is watched
    _watch_field_group_basic(fieldGroup, handle.handle, group.GetId())
    system.introspect.UpdateAll()

    memoryInfo = system.introspect.memory.GetForFieldGroup(fieldGroup)

    logger.debug("field group %s is using %.2f KB" %
                 (fieldGroup.name, memoryInfo.aggregateInfo.bytesUsed / 1024.))

    # 0+ to 20 MB
    assert(0 < memoryInfo.aggregateInfo.bytesUsed < 1024*1024*20), \
        'bytes used to store field was unreasonable for field group %s, bytes: %s' \
        % (fieldGroup.name, memoryInfo.aggregateInfo.bytesUsed)
Example #8
def helper_unwatch_field_values_public(handle, gpuIds):
    """
    Verifies that dcgm can unwatch a field value
    """
    fieldId = dcgm_fields.DCGM_FI_DEV_NAME
    fieldIds = [
        fieldId,
    ]

    handleObj = pydcgm.DcgmHandle(handle=handle)
    systemObj = handleObj.GetSystem()
    groupObj = systemObj.GetGroupWithGpuIds('mygroup', gpuIds)
    fieldGroup = pydcgm.DcgmFieldGroup(handleObj, "myfieldgroup", fieldIds)

    updateFreq = 10000000
    maxKeepAge = 86400
    maxKeepSamples = 0

    #These are all gpuId -> watcher count
    numWatchersBefore = {}
    numWatchersWithWatch = {}
    numWatchersAfter = {}

    #Get watch info before our test begins
    for gpuId in gpuIds:
        try:
            fieldInfo = dcgm_agent_internal.dcgmGetCacheManagerFieldInfo(
                handleObj.handle, gpuId, fieldId)
            numWatchersBefore[gpuId] = fieldInfo.numWatchers
        except dcgm_structs.dcgmExceptionClass(
                dcgm_structs.DCGM_ST_NOT_WATCHED):
            numWatchersBefore[gpuId] = 0

    #Now watch the fields
    groupObj.samples.WatchFields(fieldGroup, updateFreq, maxKeepAge,
                                 maxKeepSamples)

    #Get watcher info after our watch and check it against before our watch
    for gpuId in gpuIds:
        fieldInfo = dcgm_agent_internal.dcgmGetCacheManagerFieldInfo(
            handleObj.handle, gpuId, fieldId)
        numWatchersWithWatch[gpuId] = fieldInfo.numWatchers
        assert numWatchersWithWatch[gpuId] == numWatchersBefore[gpuId] + 1,\
               "Watcher mismatch at gpuId %d, numWatchersWithWatch[gpuId] %d != numWatchersBefore[gpuId] %d + 1" %\
                (gpuId, numWatchersWithWatch[gpuId], numWatchersBefore[gpuId])

    #Unwatch fields
    groupObj.samples.UnwatchFields(fieldGroup)

    #Get watcher count after our unwatch. This should match our original watch count
    for gpuId in gpuIds:
        fieldInfo = dcgm_agent_internal.dcgmGetCacheManagerFieldInfo(
            handleObj.handle, gpuId, fieldId)
        numWatchersAfter[gpuId] = fieldInfo.numWatchers

    assert numWatchersBefore == numWatchersAfter, "Expected numWatchersBefore (%s) to match numWatchersAfter %s" %\
           (str(numWatchersBefore), str(numWatchersAfter))
Example #9
def test_dcgm_field_group_info(handle):
    fieldIds = [dcgm_fields.DCGM_FI_DRIVER_VERSION, dcgm_fields.DCGM_FI_DEV_NAME, dcgm_fields.DCGM_FI_DEV_BRAND]
    handle = pydcgm.DcgmHandle(handle)
    fieldGroup = pydcgm.DcgmFieldGroup(handle, "mygroup", fieldIds)

    #Get the field group we just added to verify it was added and the metadata is correct
    fieldGroupInfo = dcgm_agent.dcgmFieldGroupGetInfo(handle.handle, fieldGroup.fieldGroupId)
    assert fieldGroupInfo.version == dcgm_structs.dcgmFieldGroupInfo_version1, fieldGroupInfo.version
    assert fieldGroupInfo.fieldGroupId == int(fieldGroup.fieldGroupId.value), "%s != %s" %(str(fieldGroupInfo.fieldGroupId), str(fieldGroup.fieldGroupId))
    assert fieldGroupInfo.fieldGroupName == fieldGroup.name, str(fieldGroupInfo.fieldGroupName)
    assert fieldGroupInfo.numFieldIds == len(fieldIds), fieldGroupInfo.numFieldIds
    for i, fieldId in enumerate(fieldIds):
        assert fieldGroupInfo.fieldIds[i] == fieldId, "i = %d, %d != %d" % (i, fieldGroupInfo.fieldIds[i], fieldId)
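The info struct can also be resolved back to field metadata. A hedged extension of the test above (the fields subsystem and the .tag attribute follow the usage in Example #2):

    # Hypothetical extension: map each field ID in the group back to its
    # metadata tag via the fields subsystem used in Example #2.
    fieldsSystem = handle.GetSystem().fields
    for i in range(fieldGroupInfo.numFieldIds):
        fieldMeta = fieldsSystem.GetFieldById(fieldGroupInfo.fieldIds[i])
        print("fieldId %d -> %s" % (fieldGroupInfo.fieldIds[i], fieldMeta.tag))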
Example #10
def helper_promote_field_values_watch_public(handle, gpuIds):
    """
    Verifies that dcgm can update a field value watch
    """
    fieldId = dcgm_fields.DCGM_FI_DEV_NAME
    fieldIds = [
        fieldId,
    ]

    handleObj = pydcgm.DcgmHandle(handle=handle)
    systemObj = handleObj.GetSystem()
    groupObj = systemObj.GetGroupWithGpuIds('mygroup', gpuIds)
    fieldGroup = pydcgm.DcgmFieldGroup(handleObj, "myfieldgroup", fieldIds)

    updateFreq = 100000  #100 msec
    maxKeepAge = 3600
    maxKeepSamples = 0

    #Track the number of watchers to make sure our watch promotion doesn't create another sub-watch
    #but rather updates the existing one
    numWatchersWithWatch = {}
    numWatchersAfter = {}

    #Watch the fields
    groupObj.samples.WatchFields(fieldGroup, updateFreq, maxKeepAge,
                                 maxKeepSamples)

    #Get watcher info after our watch and verify that the updateFrequency matches
    for gpuId in gpuIds:
        fieldInfo = dcgm_agent_internal.dcgmGetCacheManagerFieldInfo(
            handleObj.handle, gpuId, fieldId)
        numWatchersWithWatch[gpuId] = fieldInfo.numWatchers
        assert fieldInfo.monitorFrequencyUsec == updateFreq, "after watch: fieldInfo.monitorFrequencyUsec %d != updateFreq %d" % \
               (fieldInfo.monitorFrequencyUsec, updateFreq)

    #Update the watch with a faster update frequency
    updateFreq = 50000  #50 msec
    groupObj.samples.WatchFields(fieldGroup, updateFreq, maxKeepAge,
                                 maxKeepSamples)

    #Get watcher info after our second watch and verify that the updateFrequency matches
    for gpuId in gpuIds:
        fieldInfo = dcgm_agent_internal.dcgmGetCacheManagerFieldInfo(
            handleObj.handle, gpuId, fieldId)
        numWatchersAfter[gpuId] = fieldInfo.numWatchers
        assert fieldInfo.monitorFrequencyUsec == updateFreq, "after watch: fieldInfo.monitorFrequencyUsec %d != updateFreq %d" % \
               (fieldInfo.monitorFrequencyUsec, updateFreq)

    assert numWatchersWithWatch == numWatchersAfter, "numWatchersWithWatch (%s) != numWatchersAfter (%s)" % \
           (str(numWatchersWithWatch), str(numWatchersAfter))
Example #11
import time

import pydcgm
import dcgm_agent
import dcgm_fields
import dcgm_structs
from dcgm_field_helpers import DcgmFieldGroupWatcher, DcgmFieldGroupEntityWatcher


def main():
    operationMode = dcgm_structs.DCGM_OPERATION_MODE_AUTO
    timeStep = 1.0

    dcgm_structs._dcgmInit()
    dcgm_agent.dcgmInit()  #Will throw an exception on error
    handle = dcgm_agent.dcgmStartEmbedded(operationMode)
    handleObj = pydcgm.DcgmHandle(handle=handle)
    groupId = dcgm_structs.DCGM_GROUP_ALL_GPUS
    fieldIds = [
        dcgm_fields.DCGM_FI_DEV_SM_CLOCK, dcgm_fields.DCGM_FI_DEV_MEM_CLOCK
    ]

    fieldGroup = pydcgm.DcgmFieldGroup(handleObj, "my_field_group", fieldIds)

    updateFreq = int(timeStep * 1000000.0)
    maxKeepAge = 3600.0  #1 hour
    maxKeepSamples = 0  #unlimited. maxKeepAge will enforce quota
    startTimestamp = 0  #beginning of time

    dfcw = DcgmFieldGroupWatcher(handle, groupId, fieldGroup, operationMode,
                                 updateFreq, maxKeepAge, maxKeepSamples,
                                 startTimestamp)
    dfcw2 = DcgmFieldGroupEntityWatcher(handle, groupId, fieldGroup,
                                        operationMode, updateFreq, maxKeepAge,
                                        maxKeepSamples, startTimestamp)

    while True:
        newUpdateCount = dfcw.GetAllSinceLastCall()
        newUpdateCount2 = dfcw2.GetAllSinceLastCall()
        print("Got %d and %d new field value updates" %
              (newUpdateCount, newUpdateCount2))
        for gpuId in list(dfcw.values.keys()):
            print("gpuId %d" % gpuId)
            for fieldId in list(dfcw.values[gpuId].keys()):
                print("    fieldId %d: %d values. latest timestamp %d" % \
                      (fieldId, len(dfcw.values[gpuId][fieldId]), dfcw.values[gpuId][fieldId][-1].ts))

        for entityGroupId in list(dfcw2.values.keys()):
            print("entityGroupId %d" % entityGroupId)
            for entityId in list(dfcw2.values[entityGroupId].keys()):
                print("    entityId %d" % entityId)
                for fieldId in list(
                        dfcw2.values[entityGroupId][entityId].keys()):
                    print("        fieldId %d: %d values. latest timestamp %d" % \
                          (fieldId, len(dfcw2.values[entityGroupId][entityId][fieldId]), dfcw2.values[entityGroupId][entityId][fieldId][-1].ts))

        time.sleep(timeStep)
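The loop above never exits; a variant that shuts down cleanly on Ctrl-C could wrap it as below. A sketch: dcgm_agent.dcgmStopEmbedded() and dcgm_agent.dcgmShutdown() mirror the startup calls at the top of main().

    # Hypothetical variant of the polling loop with a clean shutdown path.
    try:
        while True:
            ...  # poll dfcw/dfcw2 and print, as above
            time.sleep(timeStep)
    except KeyboardInterrupt:
        pass
    finally:
        dcgm_agent.dcgmStopEmbedded(handle)  # stop the embedded host engine
        dcgm_agent.dcgmShutdown()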
Example #12
def test_dcgm_field_group_add_remove(handle):
    fieldIds = [dcgm_fields.DCGM_FI_DRIVER_VERSION, dcgm_fields.DCGM_FI_DEV_NAME, dcgm_fields.DCGM_FI_DEV_BRAND]
    handle = pydcgm.DcgmHandle(handle)
    fieldGroup = pydcgm.DcgmFieldGroup(handle, "mygroup", fieldIds)

    #Save this ID before we mess with the object
    fieldGroupId = fieldGroup.fieldGroupId

    #This will assert on error
    fieldGroupInfo = dcgm_agent.dcgmFieldGroupGetInfo(handle.handle, fieldGroupId)

    #Delete the field group and make sure it's gone from the host engine
    del fieldGroup
    fieldGroup = None

    with test_utils.assert_raises(dcgmExceptionClass(dcgm_structs.DCGM_ST_NO_DATA)):
        fieldGroupInfo = dcgm_agent.dcgmFieldGroupGetInfo(handle.handle, fieldGroupId)
Example #13
def test_dcgm_embedded_metadata_mean_update_frequency(handle):
    """
    Ensure that mean update frequency is being calculated properly
    """
    handle = pydcgm.DcgmHandle(handle)
    system = pydcgm.DcgmSystem(handle)
    group = pydcgm.DcgmGroup(handle, groupName="metadata-test", groupType=dcgm_structs.DCGM_GROUP_DEFAULT)
    
    # these frequencies must have a perfect integer mean or the last assertion will fail
    updateFreqs = {
        dcgm_fields.DCGM_FI_DEV_POWER_USAGE: 10000, 
        dcgm_fields.DCGM_FI_DEV_GPU_TEMP: 20000,
    }
    meanUpdateFreq = stats.mean(updateFreqs.values())
    
    gpuId = group.GetGpuIds()[0]
    fieldIds = []
    
    for fieldId, freqUsec in updateFreqs.items():
        fieldIds.append(fieldId)
        dcgm_agent_internal.dcgmWatchFieldValue(handle.handle, gpuId, 
                                                fieldId, 
                                                freqUsec, 
                                                100000, 
                                                10)
        
    system.UpdateAllFields(True)
    system.introspect.UpdateAll()

    fieldGroup = pydcgm.DcgmFieldGroup(handle, "my_field_group", fieldIds)
    execTime = system.introspect.execTime.GetForFieldGroup(fieldGroup)
    
    resultGpuIndex = -1
    for i in range(execTime.gpuInfoCount):
        if execTime.gpuIdsForGpuInfo[i] == gpuId:
            resultGpuIndex = i
            break
        
    assert(resultGpuIndex >= 0), "no results returned for the watched GPU"
    
    actualMeanUpdateFreq = execTime.gpuInfo[resultGpuIndex].meanUpdateFreqUsec
    assert(actualMeanUpdateFreq == meanUpdateFreq), "expected %s, got %s" \
        % (meanUpdateFreq, actualMeanUpdateFreq)
Example #14
    def __init__(self, dcgmHandle, gpuIds, fieldIds, watchIntervalSecs):
        global nameIncrement

        self._dcgmHandle = dcgmHandle
        self._dcgmSystem = dcgmHandle.GetSystem()
        gpuGroupName = "%d_%d" % (os.getpid(), nameIncrement)
        nameIncrement += 1

        if gpuIds is None:
            self._dcgmGroup = self._dcgmSystem.GetDefaultGroup()
        else:
            self._dcgmGroup = self._dcgmSystem.GetGroupWithGpuIds(
                gpuGroupName, gpuIds)
        self._watchIntervalSecs = watchIntervalSecs
        fieldGroupName = "%d_%d" % (os.getpid(), nameIncrement)
        nameIncrement += 1
        self._dcgmFieldGroup = pydcgm.DcgmFieldGroup(dcgmHandle,
                                                     fieldGroupName, fieldIds,
                                                     None)
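The constructor only sets up the groups; a companion method would be needed to actually start the watches. A sketch, not in the original class, assuming the samples.WatchFields() call shown in the other snippets:

    def _StartWatches(self):
        # Hypothetical companion to __init__: watch the configured field group
        # on the configured GPU group at the requested interval.
        updateFreqUsec = int(self._watchIntervalSecs * 1000000)
        maxKeepAge = 3600.0  # seconds of history to retain
        maxKeepSamples = 0   # unlimited; rely on maxKeepAge
        self._dcgmGroup.samples.WatchFields(self._dcgmFieldGroup,
                                            updateFreqUsec, maxKeepAge,
                                            maxKeepSamples)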
Example #15
def test_dcgm_field_group_get_info_validate(handle):
    """
    Validates structure version
    """
    fieldIds = [
        dcgm_fields.DCGM_FI_DRIVER_VERSION, dcgm_fields.DCGM_FI_DEV_NAME,
        dcgm_fields.DCGM_FI_DEV_BRAND
    ]
    handle = pydcgm.DcgmHandle(handle)
    fieldGroup = pydcgm.DcgmFieldGroup(handle, "mygroup", fieldIds)

    with test_utils.assert_raises(
            dcgmExceptionClass(dcgm_structs.DCGM_ST_VER_MISMATCH)):
        versionTest = 0  #invalid version
        ret = vtDcgmFieldGroupGetInfo(handle.handle, fieldGroup.fieldGroupId,
                                      versionTest)

    with test_utils.assert_raises(
            dcgmExceptionClass(dcgm_structs.DCGM_ST_VER_MISMATCH)):
        versionTest = 50  #random number version
        ret = vtDcgmFieldGroupGetInfo(handle.handle, fieldGroup.fieldGroupId,
                                      versionTest)
Example #16
def test_dcgm_field_group_duplicate_name(handle):
    fieldIds = [dcgm_fields.DCGM_FI_DRIVER_VERSION, ]
    handle = pydcgm.DcgmHandle(handle)
    fieldGroup = pydcgm.DcgmFieldGroup(handle, "dupeme", fieldIds)
    with test_utils.assert_raises(dcgmExceptionClass(dcgm_structs.DCGM_ST_DUPLICATE_KEY)):
        fieldGroup2 = pydcgm.DcgmFieldGroup(handle, "dupeme", fieldIds)
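Deleting the original group frees the name for reuse, which is what the delete-then-recreate pattern in Examples #2 and #3 relies on. A hedged follow-on to the test:

    # Hypothetical follow-on: once the first group is deleted, the name can
    # be registered again without DCGM_ST_DUPLICATE_KEY.
    fieldGroup.Delete()
    fieldGroup3 = pydcgm.DcgmFieldGroup(handle, "dupeme", fieldIds)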
Example #17

def test_nvswitch_traffic_p2p(handle, switchIds):
    """
    Verifies that fabric can pass p2p read and write traffic successfully
    """

    test_utils.skip_test("Bandwidth field not being updated yet")

    # TX_0 and RX_0 on port 0
    nvSwitchBandwidth0FieldIds = list(
        range(dcgm_fields.DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_0_P00,
              dcgm_fields.DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_0_P00 + 1))

    # TX_1 and RX_1 on port 0
    nvSwitchBandwidth1FieldIds = list(
        range(dcgm_fields.DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_1_P00,
              dcgm_fields.DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_1_P00 + 1))

    dcgmHandle = pydcgm.DcgmHandle(ipAddress="127.0.0.1")

    groupName = "test_nvswitches"
    allNvSwitchesGroup = pydcgm.DcgmGroup(
        dcgmHandle,
        groupName=groupName,
        groupType=dcgm_structs.DCGM_GROUP_DEFAULT_NVSWITCHES)

    fgName = "test_nvswitches_bandwidth0"
    nvSwitchBandwidth0FieldGroup = pydcgm.DcgmFieldGroup(
        dcgmHandle, name=fgName, fieldIds=nvSwitchBandwidth0FieldIds)

    fgName = "test_nvswitches_bandwidth1"
    nvSwitchBandwidth1FieldGroup = pydcgm.DcgmFieldGroup(
        dcgmHandle, name=fgName, fieldIds=nvSwitchBandwidth1FieldIds)

    updateFreq = int(20 / 2.0) * 1000000  # half the 20s interval, in usec
    maxKeepAge = 600.0
    maxKeepSamples = 0

    nvSwitchBandwidth0Watcher = dcgm_field_helpers.DcgmFieldGroupEntityWatcher(
        dcgmHandle.handle, allNvSwitchesGroup.GetId(),
        nvSwitchBandwidth0FieldGroup, dcgm_structs.DCGM_OPERATION_MODE_AUTO,
        updateFreq, maxKeepAge, maxKeepSamples, 0)
    nvSwitchBandwidth1Watcher = dcgm_field_helpers.DcgmFieldGroupEntityWatcher(
        dcgmHandle.handle, allNvSwitchesGroup.GetId(),
        nvSwitchBandwidth1FieldGroup, dcgm_structs.DCGM_OPERATION_MODE_AUTO,
        updateFreq, maxKeepAge, maxKeepSamples, 0)

    # wait for FM to report and populate stats
    time.sleep(30)

    # read the counters before sending traffic
    nvSwitchBandwidth0Watcher.GetMore()
    nvSwitchBandwidth1Watcher.GetMore()

    for entityGroupId in nvSwitchBandwidth0Watcher.values.keys():
        for entityId in nvSwitchBandwidth0Watcher.values[entityGroupId]:
            bandwidth0FieldId = dcgm_fields.DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_0_P00
            bandwidth1FieldId = dcgm_fields.DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_1_P00

            counter0TxBefore = nvSwitchBandwidth0Watcher.values[entityGroupId][
                entityId][bandwidth0FieldId].values[-1].value
            bandwidth0FieldId += 1
            counter0RxBefore = nvSwitchBandwidth0Watcher.values[entityGroupId][
                entityId][bandwidth0FieldId].values[-1].value
            counter1TxBefore = nvSwitchBandwidth1Watcher.values[entityGroupId][
                entityId][bandwidth1FieldId].values[-1].value
            bandwidth1FieldId += 1
            counter1RxBefore = nvSwitchBandwidth1Watcher.values[entityGroupId][
                entityId][bandwidth1FieldId].values[-1].value

    # Generate write traffic for the nvswitches
    test_utils.run_p2p_bandwidth_app(
        test_nvswitch_utils.MEMCPY_DTOD_WRITE_CE_BANDWIDTH)

    # Generate read traffic for the nvswitches
    test_utils.run_p2p_bandwidth_app(
        test_nvswitch_utils.MEMCPY_DTOD_READ_CE_BANDWIDTH)

    # read the counters again after sending traffic
    nvSwitchBandwidth0Watcher.GetMore()
    nvSwitchBandwidth1Watcher.GetMore()

    for entityGroupId in nvSwitchBandwidth0Watcher.values.keys():
        for entityId in nvSwitchBandwidth0Watcher.values[entityGroupId]:
            bandwidth0FieldId = dcgm_fields.DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_0_P00
            bandwidth1FieldId = dcgm_fields.DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_1_P00

            counter0TxAfter = nvSwitchBandwidth0Watcher.values[entityGroupId][
                entityId][bandwidth0FieldId].values[-1].value
            bandwidth0FieldId += 1
            counter0RxAfter = nvSwitchBandwidth0Watcher.values[entityGroupId][
                entityId][bandwidth0FieldId].values[-1].value
            counter1TxAfter = nvSwitchBandwidth1Watcher.values[entityGroupId][
                entityId][bandwidth1FieldId].values[-1].value
            bandwidth1FieldId += 1
            counter1RxAfter = nvSwitchBandwidth1Watcher.values[entityGroupId][
                entityId][bandwidth1FieldId].values[-1].value

    assert counter0TxAfter > counter0TxBefore, "counter0Tx did not increase"
    assert counter0RxAfter > counter0RxBefore, "counter0Rx did not increase"
    assert counter1TxAfter > counter1TxBefore, "counter1Tx did not increase"
    assert counter1RxAfter > counter1RxBefore, "counter1Rx did not increase"
Example #18
def _gather_perf_timeseries(handle, watchedFieldIds):
    '''
    Gathers metadata over time and returns a tuple of four timeseries:
    three MetadataTimeseries (mem usage, exec time, avg exec time) and a
    CpuTimeseries (cpu utilization).
    '''

    system = pydcgm.DcgmSystem(handle)

    memUsageTS = MetadataTimeseries()
    execTimeTS = MetadataTimeseries()
    execTimeAvgTS = MetadataTimeseries()
    cpuUtilTS = CpuTimeseries()

    numFields = min(len(watchedFieldIds), 50)
    fieldGroups = []
    for i in range(1, 6):
        fieldGroups.append(
            pydcgm.DcgmFieldGroup(handle, "my_field_group_%d" % i,
                                  list(watchedFieldIds)[0:numFields]))

    startTime = datetime.datetime.now()

    while (datetime.datetime.now() -
           startTime).total_seconds() < BOUNDED_TEST_DURATION:

        # poll memory usage
        memUsageTS.timestamps.append(
            (datetime.datetime.now() - startTime).total_seconds())

        memUsageTS.processVals.append(
            system.introspect.memory.GetForHostengine().bytesUsed)
        memUsageTS.allFieldsVals.append(
            system.introspect.memory.GetForAllFields().aggregateInfo.bytesUsed)

        for fieldId in watchedFieldIds:
            memUsageTS.fieldVals[fieldId].append(
                dcgm_agent_internal.dcgmIntrospectGetFieldMemoryUsage(
                    handle.handle, fieldId).aggregateInfo.bytesUsed)

        for fieldGroup in fieldGroups:
            memUsageTS.fieldGroupVals[int(
                fieldGroup.fieldGroupId.value)].append(
                    system.introspect.memory.GetForFieldGroup(
                        fieldGroup).aggregateInfo.bytesUsed)

        # poll execution time
        execTimeTS.timestamps.append(
            (datetime.datetime.now() - startTime).total_seconds())

        execTimeTS.allFieldsVals.append(
            system.introspect.execTime.GetForAllFields(
            ).aggregateInfo.totalEverUpdateUsec)

        for fieldId in watchedFieldIds:
            execTimeTS.fieldVals[fieldId].append(
                dcgm_agent_internal.dcgmIntrospectGetFieldExecTime(
                    handle.handle, fieldId).aggregateInfo.totalEverUpdateUsec)
            #logger.info("fieldId %d: %s" % (fieldId, str(execTimeTS.fieldVals[fieldId][-1])))

        for fieldGroup in fieldGroups:
            execTimeTS.fieldGroupVals[int(
                fieldGroup.fieldGroupId.value)].append(
                    system.introspect.execTime.GetForFieldGroup(
                        fieldGroup).aggregateInfo.totalEverUpdateUsec)

        # poll average execution time
        execTimeAvgTS.timestamps.append(
            (datetime.datetime.now() - startTime).total_seconds())

        execTimeAvgTS.allFieldsVals.append(
            system.introspect.execTime.GetForAllFields(
            ).aggregateInfo.recentUpdateUsec)

        for fieldId in watchedFieldIds:
            execTimeAvgTS.fieldVals[fieldId].append(
                dcgm_agent_internal.dcgmIntrospectGetFieldExecTime(
                    handle.handle, fieldId).aggregateInfo.recentUpdateUsec)

        for fieldGroup in fieldGroups:
            execTimeAvgTS.fieldGroupVals[int(
                fieldGroup.fieldGroupId.value)].append(
                    system.introspect.execTime.GetForFieldGroup(
                        fieldGroup).aggregateInfo.recentUpdateUsec)

        # poll cpu utilization
        cpuUtilTS.timestamps.append(
            (datetime.datetime.now() - startTime).total_seconds())
        cpuUtilTS.cpuInfo.append(system.introspect.cpuUtil.GetForHostengine())

        time.sleep(0.050)

    return memUsageTS, execTimeTS, execTimeAvgTS, cpuUtilTS
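A usage sketch for the gatherer above; the watches are assumed to have been set up elsewhere, and handleObj/watchedFieldIds are illustrative names:

# Hypothetical caller: gather the four timeseries and report the final
# hostengine memory figure.
memUsageTS, execTimeTS, execTimeAvgTS, cpuUtilTS = _gather_perf_timeseries(
    handleObj, watchedFieldIds)
print("final hostengine memory: %d bytes" % memUsageTS.processVals[-1])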