def dcgmFieldGroupGetInfo(dcgm_handle, fieldGroupId):
    """
    Invoke the dcgmFieldGroupGetInfo library entry point for one field group.

    dcgm_handle:  handle forwarded as the first argument of the library call
    fieldGroupId: value stored in the request's fieldGroupId member

    Returns the c_dcgmFieldGroupInfo_v1 structure that was passed by reference
    to the library call. The status code is handed to
    dcgm_structs._dcgmCheckReturn before the structure is returned.
    """
    info = dcgm_structs.c_dcgmFieldGroupInfo_v1()
    info.fieldGroupId = fieldGroupId
    info.version = dcgm_structs.dcgmFieldGroupInfo_version1

    status = dcgmFP("dcgmFieldGroupGetInfo")(dcgm_handle, byref(info))
    dcgm_structs._dcgmCheckReturn(status)
    return info
def dcgmGetValuesSince_v2(dcgm_handle, groupId, fieldGroupId, sinceTimestamp, enumCB, userData):
    """
    Invoke the dcgmGetValuesSince_v2 library entry point.

    dcgm_handle:    handle forwarded to the library call
    groupId:        forwarded unchanged
    fieldGroupId:   forwarded unchanged
    sinceTimestamp: wrapped in a c_int64 before being passed
    enumCB:         callback object forwarded unchanged
    userData:       wrapped in a py_object and forwarded alongside enumCB

    Returns the integer value of the "next since timestamp" output argument
    after the status code has been passed to dcgm_structs._dcgmCheckReturn.
    """
    next_since = c_int64()
    call = dcgmFP("dcgmGetValuesSince_v2")
    status = call(
        dcgm_handle,
        groupId,
        fieldGroupId,
        c_int64(sinceTimestamp),
        byref(next_since),
        enumCB,
        py_object(userData),
    )
    dcgm_structs._dcgmCheckReturn(status)
    return next_since.value
def dcgmGetAllDevices(dcgm_handle):
    """
    Invoke the dcgmGetAllDevices library entry point and return the IDs it reports.

    dcgm_handle: handle forwarded to the library call

    Returns a Python list holding the first `count` entries of the ID buffer,
    where `count` is the value the library call wrote to its count output
    argument. The status code is passed to dcgm_structs._dcgmCheckReturn first.
    """
    c_count = c_uint()
    # Output buffer sized for the maximum number of devices.
    c_gpuid_list = (c_uint * dcgm_structs.DCGM_MAX_NUM_DEVICES)()

    fn = dcgmFP("dcgmGetAllDevices")
    ret = fn(dcgm_handle, c_gpuid_list, byref(c_count))
    dcgm_structs._dcgmCheckReturn(ret)

    # range(n) already stops at n; no extra slice of the range is needed.
    return [c_gpuid_list[i] for i in range(c_count.value)]
def dcgmProfUnwatchFields(dcgmHandle, groupId):
    """
    Invoke the dcgmProfUnwatchFields library entry point for a group.

    dcgmHandle: handle forwarded to the library call
    groupId:    value stored in the request's groupId member

    Returns the c_dcgmProfUnwatchFields_v1 request structure after the call.
    The status code is passed to dcgm_structs._dcgmCheckReturn first.
    """
    request = dcgm_structs.c_dcgmProfUnwatchFields_v1()
    request.groupId = groupId
    request.version = dcgm_structs.dcgmProfUnwatchFields_version1

    status = dcgmFP("dcgmProfUnwatchFields")(dcgmHandle, byref(request))
    dcgm_structs._dcgmCheckReturn(status)
    return request
def dcgmGroupGetAllIds(dcgmHandle):
    """
    Invoke the dcgmGroupGetAllIds library entry point and return the IDs it reports.

    dcgmHandle: handle forwarded to the library call

    Returns a Python list holding the first `count` entries of the group-ID
    buffer, where `count` is the value the library call wrote to its count
    output argument. The status code is passed to
    dcgm_structs._dcgmCheckReturn first.
    """
    fn = dcgmFP("dcgmGroupGetAllIds")
    c_count = c_uint()
    # Output buffer sized for the maximum number of groups.
    c_groupIdList = (c_void_p * dcgm_structs.DCGM_MAX_NUM_GROUPS)()

    ret = fn(dcgmHandle, c_groupIdList, byref(c_count))
    dcgm_structs._dcgmCheckReturn(ret)

    # map(None, seq) only works on Python 2; on Python 3 it yields a lazy map
    # object that raises TypeError when iterated. list() gives the same list
    # on both versions.
    return list(c_groupIdList[0:int(c_count.value)])
def dcgmProfGetSupportedMetricGroups(dcgmHandle, groupId):
    """
    Invoke the dcgmProfGetSupportedMetricGroups library entry point for a group.

    dcgmHandle: handle forwarded to the library call
    groupId:    value stored in the request's groupId member

    Returns the c_dcgmProfGetMetricGroups_v2 structure after the call.
    The status code is passed to dcgm_structs._dcgmCheckReturn first.
    """
    request = dcgm_structs.c_dcgmProfGetMetricGroups_v2()
    # NOTE(review): the structure is the _v2 type while the version constant
    # is named ..._version1 -- confirm against dcgm_structs that this pairing
    # is intended.
    request.version = dcgm_structs.dcgmProfGetMetricGroups_version1
    request.groupId = groupId

    status = dcgmFP("dcgmProfGetSupportedMetricGroups")(dcgmHandle, byref(request))
    dcgm_structs._dcgmCheckReturn(status)
    return request
def dcgmDeleteMigEntity(dcgm_handle, entityGroupId, entityId, flags):
    """
    Invoke the dcgmDeleteMigEntity library entry point.

    dcgm_handle:   handle forwarded to the library call
    entityGroupId: stored in the request's entityGroupId member
    entityId:      stored in the request's entityId member
    flags:         stored in the request's flags member

    Returns nothing; the status code is passed to
    dcgm_structs._dcgmCheckReturn.
    """
    request = dcgm_structs.c_dcgmDeleteMigEntity_v1()
    request.version = dcgm_structs.c_dcgmDeleteMigEntity_version1
    request.flags = flags
    request.entityId = entityId
    request.entityGroupId = entityGroupId

    status = dcgmFP("dcgmDeleteMigEntity")(dcgm_handle, byref(request))
    dcgm_structs._dcgmCheckReturn(status)
def dcgmIntrospectGetHostengineCpuUtilization(dcgm_handle, waitIfNoData=True):
    """
    Invoke the dcgmIntrospectGetHostengineCpuUtilization library entry point.

    dcgm_handle:  handle forwarded to the library call
    waitIfNoData: forwarded unchanged as the last argument (default True)

    Returns the c_dcgmIntrospectCpuUtil_v1 structure after the call.
    The status code is passed to dcgm_structs._dcgmCheckReturn first.
    """
    cpu_util = dcgm_structs.c_dcgmIntrospectCpuUtil_v1()
    cpu_util.version = dcgm_structs.dcgmIntrospectCpuUtil_version1

    call = dcgmFP("dcgmIntrospectGetHostengineCpuUtilization")
    status = call(dcgm_handle, byref(cpu_util), waitIfNoData)
    dcgm_structs._dcgmCheckReturn(status)
    return cpu_util
def dcgmIntrospectGetHostengineMemoryUsage(dcgm_handle, waitIfNoData=True):
    """
    Invoke the dcgmIntrospectGetHostengineMemoryUsage library entry point.

    dcgm_handle:  handle forwarded to the library call
    waitIfNoData: forwarded unchanged as the last argument (default True)

    Returns the c_dcgmIntrospectMemory_v1 structure after the call.
    The status code is passed to dcgm_structs._dcgmCheckReturn first.
    """
    memory = dcgm_structs.c_dcgmIntrospectMemory_v1()
    memory.version = dcgm_structs.dcgmIntrospectMemory_version1

    call = dcgmFP("dcgmIntrospectGetHostengineMemoryUsage")
    status = call(dcgm_handle, byref(memory), waitIfNoData)
    dcgm_structs._dcgmCheckReturn(status)
    return memory
def dcgmConnect_v2(ip_address, connectParams, version=dcgm_structs.c_dcgmConnectV2Params_version):
    """
    Invoke the dcgmConnect_v2 library entry point.

    ip_address:    forwarded unchanged as the first argument
    connectParams: parameter structure; its version member is overwritten
                   with `version` and it is then passed by reference
    version:       value written to connectParams.version (defaults to
                   dcgm_structs.c_dcgmConnectV2Params_version)

    Returns the c_void_p handle filled in by the library call.
    The status code is passed to dcgm_structs._dcgmCheckReturn first.
    """
    handle = c_void_p()
    connectParams.version = version

    status = dcgmFP("dcgmConnect_v2")(ip_address, byref(connectParams), byref(handle))
    dcgm_structs._dcgmCheckReturn(status)
    return handle
def dcgmJobGetStats(dcgm_handle, jobid):
    """
    Invoke the dcgmJobGetStats library entry point for a job.

    dcgm_handle: handle forwarded to the library call
    jobid:       forwarded unchanged as the second argument

    Returns the c_dcgmJobInfo_v3 structure after the call.
    The status code is passed to dcgm_structs._dcgmCheckReturn first.
    """
    job_info = dcgm_structs.c_dcgmJobInfo_v3()
    job_info.version = dcgm_structs.dcgmJobInfo_version3

    status = dcgmFP("dcgmJobGetStats")(dcgm_handle, jobid, byref(job_info))
    dcgm_structs._dcgmCheckReturn(status)
    return job_info
def dcgmEntityGetLatestValues(dcgmHandle, entityGroup, entityId, fieldIds):
    """
    Invoke the dcgmEntityGetLatestValues library entry point for one entity.

    dcgmHandle:  handle forwarded to the library call
    entityGroup: wrapped in a c_uint before being passed
    entityId:    wrapped in dcgm_fields.c_dcgm_field_eid_t before being passed
    fieldIds:    sequence of field IDs, copied into a c_uint16 array

    Returns a ctypes array of c_dcgmFieldValue_v1 with one slot per entry in
    fieldIds. The status code is passed to dcgm_structs._dcgmCheckReturn first.
    """
    call = dcgmFP("dcgmEntityGetLatestValues")

    num_fields = len(fieldIds)
    results = (dcgm_structs.c_dcgmFieldValue_v1 * num_fields)()
    c_field_ids = (c_uint16 * num_fields)(*fieldIds)

    status = call(
        dcgmHandle,
        c_uint(entityGroup),
        dcgm_fields.c_dcgm_field_eid_t(entityId),
        c_field_ids,
        c_uint(num_fields),
        results,
    )
    dcgm_structs._dcgmCheckReturn(status)
    return results
def dcgmFieldGroupCreate(dcgm_handle, fieldIds, fieldGroupName):
    """
    Invoke the dcgmFieldGroupCreate library entry point.

    dcgm_handle:    handle forwarded to the library call
    fieldIds:       sequence of field IDs, copied into a c_uint16 array that
                    is passed by reference
    fieldGroupName: forwarded unchanged

    Returns the c_void_p field-group ID filled in by the library call.
    The status code is passed to dcgm_structs._dcgmCheckReturn first.
    """
    num_fields = len(fieldIds)
    c_ids = (c_uint16 * num_fields)(*fieldIds)
    c_num_ids = c_int32(num_fields)
    new_group_id = c_void_p()

    call = dcgmFP("dcgmFieldGroupCreate")
    status = call(dcgm_handle, c_num_ids, byref(c_ids), fieldGroupName, byref(new_group_id))
    dcgm_structs._dcgmCheckReturn(status)
    return new_group_id
def dcgmGetEntityGroupEntities(dcgm_handle, entityGroup, flags):
    """
    Invoke the dcgmGetEntityGroupEntities library entry point.

    dcgm_handle: handle forwarded to the library call
    entityGroup: forwarded unchanged
    flags:       forwarded unchanged as the last argument

    The count argument is initialised to the buffer capacity
    (dcgm_structs.DCGM_GROUP_MAX_ENTITIES) and passed by reference.

    Returns a Python list holding the first `count` entries of the entity-ID
    buffer, using the count value as it stands after the call. The status
    code is passed to dcgm_structs._dcgmCheckReturn first.
    """
    max_entities = dcgm_structs.DCGM_GROUP_MAX_ENTITIES
    id_buffer = (c_uint32 * max_entities)()
    num_entities = c_int32(max_entities)

    call = dcgmFP("dcgmGetEntityGroupEntities")
    status = call(dcgm_handle, entityGroup, id_buffer, byref(num_entities), flags)
    dcgm_structs._dcgmCheckReturn(status)
    return id_buffer[0:int(num_entities.value)]
def dcgmGetPidInfo(dcgm_handle, groupId, pid):
    """
    Invoke the dcgmGetPidInfo library entry point.

    dcgm_handle: handle forwarded to the library call
    groupId:     forwarded unchanged as the second argument
    pid:         stored in the request's pid member

    Returns the c_dcgmPidInfo_v2 structure after the call.
    The status code is passed to dcgm_structs._dcgmCheckReturn first.
    """
    pid_info = dcgm_structs.c_dcgmPidInfo_v2()
    pid_info.pid = pid
    pid_info.version = dcgm_structs.dcgmPidInfo_version2

    status = dcgmFP("dcgmGetPidInfo")(dcgm_handle, groupId, byref(pid_info))
    dcgm_structs._dcgmCheckReturn(status)
    return pid_info
def dcgmCreateMigEntity(dcgm_handle, parentId, profile, createOption, flags):
    """
    Invoke the dcgmCreateMigEntity library entry point.

    dcgm_handle:  handle forwarded to the library call
    parentId:     stored in the request's parentId member
    profile:      stored in the request's profile member
    createOption: stored in the request's createOption member
    flags:        stored in the request's flags member

    Returns nothing; the status code is passed to
    dcgm_structs._dcgmCheckReturn.
    """
    request = dcgm_structs.c_dcgmCreateMigEntity_v1()
    request.version = dcgm_structs.c_dcgmCreateMigEntity_version1
    request.parentId = parentId
    request.profile = profile
    request.createOption = createOption
    request.flags = flags

    status = dcgmFP("dcgmCreateMigEntity")(dcgm_handle, byref(request))
    dcgm_structs._dcgmCheckReturn(status)
def dcgmEntitiesGetLatestValues(dcgmHandle, entities, fieldIds, flags):
    """
    Invoke the dcgmEntitiesGetLatestValues library entry point.

    dcgmHandle: handle forwarded to the library call
    entities:   sequence copied into a c_dcgmGroupEntityPair_t array
    fieldIds:   sequence of field IDs copied into a c_uint16 array
    flags:      forwarded unchanged

    Returns a ctypes array of c_dcgmFieldValue_v2 with
    len(fieldIds) * len(entities) slots. The status code is passed to
    dcgm_structs._dcgmCheckReturn first.
    """
    call = dcgmFP("dcgmEntitiesGetLatestValues")

    num_entities = len(entities)
    num_fields = len(fieldIds)

    # One output slot for every (entity, field) combination.
    results = (dcgm_structs.c_dcgmFieldValue_v2 * (num_fields * num_entities))()
    c_entities = (dcgm_structs.c_dcgmGroupEntityPair_t * num_entities)(*entities)
    c_field_ids = (c_uint16 * num_fields)(*fieldIds)

    status = call(
        dcgmHandle,
        c_entities,
        c_uint(num_entities),
        c_field_ids,
        c_uint(num_fields),
        flags,
        results,
    )
    dcgm_structs._dcgmCheckReturn(status)
    return results
def dcgmIntrospectGetFieldsMemoryUsage(dcgm_handle, introspectContext, waitIfNoData=True):
    """
    Invoke the dcgmIntrospectGetFieldsMemoryUsage library entry point.

    dcgm_handle:       handle forwarded to the library call
    introspectContext: ctypes object passed by reference
    waitIfNoData:      forwarded unchanged as the last argument (default True)

    Returns the c_dcgmIntrospectFullMemory_v1 structure after the call.
    The status code is passed to dcgm_structs._dcgmCheckReturn first.
    """
    full_memory = dcgm_structs.c_dcgmIntrospectFullMemory_v1()
    full_memory.version = dcgm_structs.dcgmIntrospectFullMemory_version1

    call = dcgmFP("dcgmIntrospectGetFieldsMemoryUsage")
    status = call(dcgm_handle, byref(introspectContext), byref(full_memory), waitIfNoData)
    dcgm_structs._dcgmCheckReturn(status)
    return full_memory
def dcgmHealthCheck(dcgm_handle, groupId, version=dcgm_structs.dcgmHealthResponse_version4):
    """
    Invoke the dcgmHealthCheck library entry point for a group.

    dcgm_handle: handle forwarded to the library call
    groupId:     forwarded unchanged as the second argument
    version:     requested response version; any value other than
                 dcgm_structs.dcgmHealthResponse_version4 causes
                 DCGM_ST_VER_MISMATCH to be passed to _dcgmCheckReturn

    Returns the c_dcgmHealthResponse_v4 structure after the call.
    The status code is passed to dcgm_structs._dcgmCheckReturn first.
    """
    supported_version = dcgm_structs.dcgmHealthResponse_version4
    if version != supported_version:
        dcgm_structs._dcgmCheckReturn(dcgm_structs.DCGM_ST_VER_MISMATCH)

    response = dcgm_structs.c_dcgmHealthResponse_v4()
    response.version = supported_version

    status = dcgmFP("dcgmHealthCheck")(dcgm_handle, groupId, byref(response))
    dcgm_structs._dcgmCheckReturn(status)
    return response
def dcgmPolicyGet(dcgm_handle, group_id, count, status_handle):
    """
    Invoke the dcgmPolicyGet library entry point.

    dcgm_handle:   handle forwarded to the library call
    group_id:      forwarded unchanged
    count:         number of c_dcgmPolicy_v1 slots to allocate; also forwarded
    status_handle: forwarded unchanged as the last argument

    Every slot has its version member set to dcgm_structs.dcgmPolicy_version1
    before the call. Returns a Python list of the `count` policy structures.
    The status code is passed to dcgm_structs._dcgmCheckReturn first.
    """
    call = dcgmFP("dcgmPolicyGet")

    policies = (count * dcgm_structs.c_dcgmPolicy_v1)()
    for slot in range(count):
        policies[slot].version = dcgm_structs.dcgmPolicy_version1

    status = call(dcgm_handle, group_id, count, policies, status_handle)
    dcgm_structs._dcgmCheckReturn(status)
    return policies[0:count]
def dcgmIntrospectGetFieldsExecTime(dcgm_handle, introspectContext, waitIfNoData=True):
    """
    Invoke the dcgmIntrospectGetFieldsExecTime library entry point.

    dcgm_handle:       handle forwarded to the library call
    introspectContext: ctypes object passed by reference
    waitIfNoData:      forwarded unchanged as the last argument (default True)

    Returns the c_dcgmIntrospectFullFieldsExecTime_v2 structure after the
    call. The status code is passed to dcgm_structs._dcgmCheckReturn first.
    """
    exec_time = dcgm_structs.c_dcgmIntrospectFullFieldsExecTime_v2()
    exec_time.version = dcgm_structs.dcgmIntrospectFullFieldsExecTime_version2

    call = dcgmFP("dcgmIntrospectGetFieldsExecTime")
    status = call(dcgm_handle, byref(introspectContext), byref(exec_time), waitIfNoData)
    dcgm_structs._dcgmCheckReturn(status)
    return exec_time
def dcgmHealthSet_v2(dcgm_handle, groupId, systems, updateInterval, maxKeepAge):
    """
    Invoke the dcgmHealthSet_v2 library entry point.

    dcgm_handle:    handle forwarded to the library call
    groupId:        stored in the request's groupId member
    systems:        stored in the request's systems member
    updateInterval: stored in the request's updateInterval member
    maxKeepAge:     stored in the request's maxKeepAge member

    Returns the status code from the library call, after it has been passed
    to dcgm_structs._dcgmCheckReturn.
    """
    request = dcgm_structs.c_dcgmHealthSetParams_v2()
    request.version = dcgm_structs.dcgmHealthSetParams_version2
    request.groupId = groupId
    request.systems = systems
    request.maxKeepAge = maxKeepAge
    request.updateInterval = updateInterval

    status = dcgmFP("dcgmHealthSet_v2")(dcgm_handle, byref(request))
    dcgm_structs._dcgmCheckReturn(status)
    return status
def dcgmConfigGet(dcgm_handle, group_id, reqCfgType, count, status_handle):
    """
    Invoke the dcgmConfigGet library entry point.

    dcgm_handle:   handle forwarded to the library call
    group_id:      forwarded unchanged
    reqCfgType:    forwarded unchanged
    count:         number of c_dcgmDeviceConfig_v1 slots to allocate; also
                   forwarded
    status_handle: forwarded unchanged as the last argument

    Every slot has its version member set to
    dcgm_structs.dcgmDeviceConfig_version1 before the call. Returns a Python
    list of the `count` config structures. The status code is passed to
    dcgm_structs._dcgmCheckReturn first.
    """
    fn = dcgmFP("dcgmConfigGet")

    c_config_values = (count * dcgm_structs.c_dcgmDeviceConfig_v1)()
    for index in range(count):
        c_config_values[index].version = dcgm_structs.dcgmDeviceConfig_version1

    ret = fn(dcgm_handle, group_id, reqCfgType, count, c_config_values, status_handle)
    dcgm_structs._dcgmCheckReturn(ret)

    # map(None, seq) only works on Python 2; on Python 3 it yields a lazy map
    # object that raises TypeError when iterated. list() gives the same list
    # on both versions.
    return list(c_config_values[0:count])
def GetLatestValues(self, fieldGroup):
    """
    Fetch the most recent values for fieldGroup and store them in the
    .values member variable.

    Note: fieldGroup is not watched automatically by this class. Set up the
    watch beforehand with dcgmGroup.samples.WatchFields()
    """
    status = dcgm_agent.dcgmGetLatestValues_v2(
        self._handle,
        self._groupId,
        fieldGroup.fieldGroupId,
        helper_dcgm_field_values_since_entity_callback,
        self,
    )
    # Raises an exception if the status indicates an error
    dcgm_structs._dcgmCheckReturn(status)
def dcgmGetFieldSummary(dcgmHandle, fieldId, entityGroupType, entityId, summaryMask, startTime, endTime):
    """
    Invoke the dcgmGetFieldSummary library entry point.

    dcgmHandle:      handle forwarded to the library call
    fieldId:         stored in the request's fieldId member
    entityGroupType: stored in the request's entityGroupType member
    entityId:        stored in the request's entityId member
    summaryMask:     stored in the request's summaryTypeMask member
    startTime:       stored in the request's startTime member
    endTime:         stored in the request's endTime member

    Returns the c_dcgmFieldSummaryRequest_v1 structure after the call.
    The status code is passed to dcgm_structs._dcgmCheckReturn first.
    """
    summary = dcgm_structs.c_dcgmFieldSummaryRequest_v1()
    summary.version = dcgm_structs.dcgmFieldSummaryRequest_version1

    # What to summarise.
    summary.fieldId = fieldId
    summary.summaryTypeMask = summaryMask

    # Which entity.
    summary.entityGroupType = entityGroupType
    summary.entityId = entityId

    # Time window.
    summary.startTime = startTime
    summary.endTime = endTime

    status = dcgmFP("dcgmGetFieldSummary")(dcgmHandle, byref(summary))
    dcgm_structs._dcgmCheckReturn(status)
    return summary
def helperDiagCheckReturn(ret, response):
    """
    Check a diagnostic call's status code, enriching any raised DCGMError.

    ret:      status code handed to dcgm_structs._dcgmCheckReturn
    response: object whose systemError.msg is attached to the exception
              (via SetAdditionalInfo) when it is not the empty string

    Returns response unchanged when _dcgmCheckReturn does not raise.
    Raises dcgm_structs.DCGMError (re-raised from _dcgmCheckReturn).
    """
    try:
        dcgm_structs._dcgmCheckReturn(ret)
    except dcgm_structs.DCGMError as e:
        # NOTE(review): this compares against a str; if systemError.msg is
        # exposed as bytes the check is always true -- confirm the field type.
        if response.systemError.msg != "":
            # Add systemError information to the raised exception.
            e.SetAdditionalInfo("%s" % response.systemError.msg)
        # Re-raise the same exception object in both cases.
        raise
    return response
def dcgmGroupGetInfo(dcgm_handle, group_id, version=dcgm_structs.c_dcgmGroupInfo_version2):
    """
    Invoke the dcgmGroupGetInfo library entry point for a group.

    dcgm_handle: handle forwarded to the library call
    group_id:    forwarded unchanged as the second argument
    version:     requested structure version; any value other than
                 dcgm_structs.c_dcgmGroupInfo_version2 causes
                 DCGM_ST_VER_MISMATCH to be passed to _dcgmCheckReturn

    Returns the c_dcgmGroupInfo_v2 structure after the call.
    The status code is passed to dcgm_structs._dcgmCheckReturn first.
    """
    call = dcgmFP("dcgmGroupGetInfo")

    # Only the version-2 request structure is built here.
    if version != dcgm_structs.c_dcgmGroupInfo_version2:
        dcgm_structs._dcgmCheckReturn(dcgm_structs.DCGM_ST_VER_MISMATCH)
    else:
        group_info = dcgm_structs.c_dcgmGroupInfo_v2()
        group_info.version = dcgm_structs.c_dcgmGroupInfo_version2

    status = call(dcgm_handle, group_id, byref(group_info))
    dcgm_structs._dcgmCheckReturn(status)
    return group_info
def dcgmProfWatchFields(dcgmHandle, fieldIds, groupId, updateFreq, maxKeepAge, maxKeepSamples):
    """
    Invoke the dcgmProfWatchFields library entry point.

    dcgmHandle:     handle forwarded to the library call
    fieldIds:       sequence copied element by element into the request's
                    fieldIds array; its length goes into numFieldIds
    groupId:        stored in the request's groupId member
    updateFreq:     stored in the request's updateFreq member
    maxKeepAge:     stored in the request's maxKeepAge member
    maxKeepSamples: stored in the request's maxKeepSamples member

    Returns the c_dcgmProfWatchFields_v1 request structure after the call.
    The status code is passed to dcgm_structs._dcgmCheckReturn first.
    """
    request = dcgm_structs.c_dcgmProfWatchFields_v1()
    request.version = dcgm_structs.dcgmProfWatchFields_version1
    request.groupId = groupId
    request.updateFreq = updateFreq
    request.maxKeepAge = maxKeepAge
    request.maxKeepSamples = maxKeepSamples

    request.numFieldIds = c_uint32(len(fieldIds))
    for slot, field_id in enumerate(fieldIds):
        request.fieldIds[slot] = field_id

    status = dcgmFP("dcgmProfWatchFields")(dcgmHandle, byref(request))
    dcgm_structs._dcgmCheckReturn(status)
    return request
def _WatchFieldGroup(self):
    """
    Start the host engine watch on this object's field group, then pull the
    initial values
    """
    watch_status = dcgm_agent.dcgmWatchFields(
        self._handle,
        self._groupId,
        self._fieldGroup.fieldGroupId,
        self._updateFreq,
        self._maxKeepAge,
        self._maxKeepSamples,
    )
    # Raises an exception if the status indicates an error
    dcgm_structs._dcgmCheckReturn(watch_status)

    # Trigger a field update so that initial values are available to fetch
    update_status = dcgm_agent.dcgmUpdateAllFields(self._handle, 1)
    # Raises an exception if the status indicates an error
    dcgm_structs._dcgmCheckReturn(update_status)

    # The first fetch starts from startTimestamp
    self.GetMore()
def GetMore(self):
    """
    Retrieve more field values from DCGM.

    Returns the number of field values that were retrieved, computed as the
    change in self._numValuesSeen across the call
    """
    seen_before = self._numValuesSeen

    # In manual operation mode, force an update before fetching.
    # (The attribute really is spelled `_oprationMode`; it is set outside
    # this method, so the spelling has to stay as it is here.)
    if self._oprationMode == dcgm_structs.DCGM_OPERATION_MODE_MANUAL:
        status = dcgm_agent.dcgmUpdateAllFields(self._handle, 1)
        # Raises an exception if the status indicates an error
        dcgm_structs._dcgmCheckReturn(status)

    self._nextSinceTimestamp = dcgm_agent.dcgmGetValuesSince_v2(
        self._handle,
        self._groupId,
        self._fieldGroup.fieldGroupId,
        self._nextSinceTimestamp,
        helper_dcgm_field_values_since_entity_callback,
        self,
    )

    return self._numValuesSeen - seen_before