Ejemplo n.º 1
0
def test_dcgm_reader_default(handle):
    '''
    Check the latest values returned by a default-constructed DcgmReader.

    handle: DCGM handle that is passed to DcgmReader.SetHandle()

    Asserts that, for every GPU returned:
      - the name-keyed dictionary has at most len(defaultFieldIds) entries and string keys
      - the id-keyed dictionary has at most len(defaultFieldIds) entries, integer keys,
        and every key resolves to field metadata via dcgm_fields.DcgmFieldGetById()
    '''
    # DcgmReader and defaultFieldIds are defined in DcgmReader; this one
    # suppression covers every use in the function.
    # pylint: disable=undefined-variable
    dr = DcgmReader()
    dr.SetHandle(handle)
    latest = dr.GetLatestGpuValuesAsFieldNameDict()

    for gpuId in latest:
        # latest data might be less than the list, because blank values aren't included
        assert len(latest[gpuId]) <= len(defaultFieldIds)

        # Make sure we get strings
        for key in latest[gpuId]:
            assert isinstance(key, str)

    sample = dr.GetLatestGpuValuesAsFieldIdDict()

    for gpuId in sample:
        assert len(sample[gpuId]) <= len(defaultFieldIds)

        # Make sure we get valid integer field ids
        for fieldId in sample[gpuId]:
            assert isinstance(fieldId, int)
            # Identity comparison: None is a singleton, and "is not" cannot be
            # affected by how the returned object implements comparison.
            assert dcgm_fields.DcgmFieldGetById(fieldId) is not None
Ejemplo n.º 2
0
    def GetFieldById(self, fieldId):
        '''
        Look up the metadata record of a single DCGM field.

        fieldId: the dcgm_fields.DCGM_FI_* identifier of the field to look up

        Returns the result of dcgm_fields.DcgmFieldGetById(fieldId): a
        dcgm_fields.c_dcgm_field_meta_t struct on success or None on error.
        '''
        fieldMeta = dcgm_fields.DcgmFieldGetById(fieldId)
        return fieldMeta
Ejemplo n.º 3
0
def helper_field_has_variable_size(fieldId):
    '''
    Returns True if a field has a variable memory size per record. False if it doesn't.

    fieldId: dcgm_fields.DCGM_FI_* field ID to check

    A field counts as variable-size when it is one of the explicitly listed
    sample/PID fields below, or when its metadata reports the
    dcgm_fields.DCGM_FT_BINARY field type.

    Raises ValueError if no metadata can be found for fieldId.
    '''
    # Fields treated as variable-size regardless of their metadata field type
    variableSizeFieldIds = (
        dcgm_fields.DCGM_FI_DEV_GPU_UTIL_SAMPLES,
        dcgm_fields.DCGM_FI_DEV_MEM_COPY_UTIL_SAMPLES,
        dcgm_fields.DCGM_FI_DEV_GRAPHICS_PIDS,
        dcgm_fields.DCGM_FI_DEV_COMPUTE_PIDS,
    )
    if fieldId in variableSizeFieldIds:
        return True

    fieldMeta = dcgm_fields.DcgmFieldGetById(fieldId)
    if fieldMeta is None:
        # The lookup yields None for an unknown field ID; fail with a clear
        # message instead of an AttributeError on None.
        raise ValueError("No field metadata found for field ID %s" % (fieldId,))

    return fieldMeta.fieldType == dcgm_fields.DCGM_FT_BINARY
Ejemplo n.º 4
0
 def _GetLatestGpuErrorSamples(self):
     '''
     Fetch new values from self._gpuWatcher and print the ones that indicate errors.

     Every non-blank value greater than zero is reported on its own line, prefixed
     with the current local time. The watcher's stored values are emptied afterwards.
     If nothing was reported, a single "No GPU errors." line is printed instead.

     Returns nothing.
     '''
     numErrors = 0
     nowStr = time.strftime("%m/%d/%Y %H:%M:%S")

     self._gpuWatcher.GetMore()
     watchedValues = self._gpuWatcher.values
     for entityGroupId in watchedValues:
         entities = watchedValues[entityGroupId]
         for entityId in entities:
             fields = entities[entityId]
             for fieldId in fields:
                 for value in fields[fieldId].values:
                     if not value.isBlank and value.value > 0:
                         fieldMeta = dcgm_fields.DcgmFieldGetById(fieldId)
                         # The metadata lookup can yield None; fall back to the
                         # numeric field ID so the error is still reported.
                         fieldTag = fieldMeta.tag if fieldMeta is not None else str(fieldId)
                         # Parenthesised single-argument print works as both the
                         # Python 2 statement and the Python 3 function.
                         print("%s: Got error for GPU %d, field Id %s, value %d" % (nowStr, entityId, fieldTag, int(value.value)))
                         numErrors += 1

     self._gpuWatcher.EmptyValues()
     if numErrors == 0:
         print("%s: No GPU errors." % nowStr)
Ejemplo n.º 5
0
# See the License for the specific language governing permissions and
# limitations under the License.
import pydcgm
import dcgm_fields
import dcgm_agent_internal
import dcgm_structs
import time

# Create a DCGM handle for 127.0.0.1, then get its system object and the
# default group from that system object.
# NOTE(review): presumably this connects to a host engine running on the local
# machine - confirm against pydcgm.DcgmHandle.
dcgmHandle = pydcgm.DcgmHandle(ipAddress="127.0.0.1")
dcgmSystem = dcgmHandle.GetSystem()
dcgmGroup = dcgmSystem.GetDefaultGroup()

# Discover which fieldIds are valid: probe every ID from 1 up to (but not
# including) DCGM_FI_MAX_FIELDS and keep the ones that have metadata.
# g_fieldTags maps fieldId -> the tag from that field's metadata.
g_fieldTags = {}
for fieldId in range(1, dcgm_fields.DCGM_FI_MAX_FIELDS):
    fieldMeta = dcgm_fields.DcgmFieldGetById(fieldId)
    # No metadata for this ID, so skip it
    if fieldMeta is None:
        continue
    
    g_fieldTags[fieldId] = fieldMeta.tag

#print("Found field tags: " + str(g_fieldTags))

# All field IDs that have metadata, in ascending order
fieldIds = sorted(g_fieldTags.keys())

# GPU IDs reported by the default group
gpuIds = dcgmGroup.GetGpuIds()

# Counters start at zero; the code that updates them is outside this excerpt
totalSampleCount = 0

cycleCount = 0