def __init__(self, handle=None, ipAddress=None,
             opMode=dcgm_structs.DCGM_OPERATION_MODE_AUTO,
             persistAfterDisconnect=False, unixSocketPath=None,
             timeoutMs=0):
    '''
    Constructor

    Adopts an existing DCGM handle, starts an embedded host engine, or
    connects to a running host engine, depending on the arguments.

    handle is an existing handle from dcgmInit(). Pass None if you want
        this object to handle DCGM initialization for you
    ipAddress is the host to connect to. None = start embedded host engine
    opMode is a dcgm_structs.DCGM_OPERATION_MODE_* constant for how the
        host engine should run (embedded mode only)
    persistAfterDisconnect (TCP-IP connections only) is whether the host
        engine should persist all of our watches after we disconnect.
        1=persist our watches. 0=clean up after our connection
    unixSocketPath is the path on the local filesystem of a unix socket
        that the host engine is listening on. This option is mutually
        exclusive with ipAddress
    timeoutMs is how long to wait for TCP/IP or Unix domain connections
        to establish in ms. 0=Default timeout (5000ms)

    Raises the exception produced by
    dcgm_structs.dcgmExceptionClass(DCGM_ST_BADPARAM) when both ipAddress
    and unixSocketPath are given.
    '''
    self._handleCreated = False
    self._persistAfterDisconnect = persistAfterDisconnect

    #Caller already owns a handle: adopt it and do nothing else.
    #NOTE: on this path neither self._ipAddress nor self.isEmbedded is set.
    if handle is not None:
        self.handle = handle
        return

    self._ipAddress = ipAddress

    haveIpAddress = ipAddress is not None
    haveUnixSocket = unixSocketPath is not None

    #Can't provide both unix socket and ip address
    if haveIpAddress and haveUnixSocket:
        raise dcgm_structs.dcgmExceptionClass(dcgm_structs.DCGM_ST_BADPARAM)

    #Initialize the DCGM client library
    dcgm_structs._dcgmInit()
    dcgm_agent.dcgmInit() #Not harmful to call this multiple times in a process

    #With no remote endpoint of either kind, start an embedded host engine
    if not (haveIpAddress or haveUnixSocket):
        self.handle = dcgm_agent.dcgmStartEmbedded(opMode)
        self.isEmbedded = True
        self._handleCreated = True
        return

    #Set up connection parameters. We're connecting to something
    connectParams = dcgm_structs.c_dcgmConnectV2Params_v2()
    connectParams.version = dcgm_structs.c_dcgmConnectV2Params_version
    connectParams.timeoutMs = timeoutMs
    connectParams.persistAfterDisconnect = 1 if self._persistAfterDisconnect else 0

    #Exactly one of the two endpoints is set at this point
    if haveIpAddress:
        connectToAddress, isUnixSocket = ipAddress, 0
    else:
        connectToAddress, isUnixSocket = unixSocketPath, 1
    connectParams.addressIsUnixSocket = isUnixSocket

    self.handle = dcgm_agent.dcgmConnect_v2(connectToAddress, connectParams)
    self.isEmbedded = False
    self._handleCreated = True
def main():
    '''
    Demo driver for the field group watchers.

    Starts an embedded host engine, creates a field group containing the SM
    and memory clock fields, and attaches a DcgmFieldGroupWatcher and a
    DcgmFieldGroupEntityWatcher to DCGM_GROUP_ALL_GPUS. It then loops
    forever: each pass pulls the new samples from both watchers, prints a
    per-field summary, and sleeps for one time step. This function never
    returns normally.
    '''
    opMode = dcgm_structs.DCGM_OPERATION_MODE_AUTO
    stepSeconds = 1.0

    dcgm_structs._dcgmInit()
    dcgm_agent.dcgmInit() #Will throw an exception on error
    handle = dcgm_agent.dcgmStartEmbedded(opMode)
    handleObj = pydcgm.DcgmHandle(handle=handle)

    groupId = dcgm_structs.DCGM_GROUP_ALL_GPUS
    watchedFieldIds = [
        dcgm_fields.DCGM_FI_DEV_SM_CLOCK,
        dcgm_fields.DCGM_FI_DEV_MEM_CLOCK,
    ]
    fieldGroup = pydcgm.DcgmFieldGroup(handleObj, "my_field_group",
                                       watchedFieldIds)

    #Both watchers are built from exactly the same settings
    watcherArgs = (
        handle,
        groupId,
        fieldGroup,
        opMode,
        int(stepSeconds * 1000000.0), #updateFreq
        3600.0,                       #maxKeepAge: 1 hour
        0,                            #maxKeepSamples: unlimited. maxKeepAge will enforce quota
        0,                            #startTimestamp: beginning of time
    )
    gpuWatcher = DcgmFieldGroupWatcher(*watcherArgs)
    entityWatcher = DcgmFieldGroupEntityWatcher(*watcherArgs)

    summaryFormat = " fieldId %d: %d values. latest timestamp %d"

    while True:
        gpuUpdateCount = gpuWatcher.GetAllSinceLastCall()
        entityUpdateCount = entityWatcher.GetAllSinceLastCall()
        print("Got %d and %d new field value updates" %
              (gpuUpdateCount, entityUpdateCount))

        #Per-GPU view: values[gpuId][fieldId] -> list of samples
        for gpuId in list(gpuWatcher.values.keys()):
            print("gpuId %d" % gpuId)
            byField = gpuWatcher.values[gpuId]
            for fieldId in list(byField.keys()):
                samples = byField[fieldId]
                print(summaryFormat % (fieldId, len(samples), samples[-1].ts))

        #Per-entity view: values[entityGroupId][entityId][fieldId] -> list of samples
        for entityGroupId in list(entityWatcher.values.keys()):
            print("entityGroupId %d" % entityGroupId)
            byEntity = entityWatcher.values[entityGroupId]
            for entityId in list(byEntity.keys()):
                print(" entityId %d" % entityId)
                byField = byEntity[entityId]
                for fieldId in list(byField.keys()):
                    samples = byField[fieldId]
                    print(summaryFormat %
                          (fieldId, len(samples), samples[-1].ts))

        time.sleep(stepSeconds)
def main():
    '''
    Entry point for the process stats stress run.

    Bootstraps option parsing with logging disabled and loads the DCGM
    library. Unless g_embeddedMode is set, it starts a standalone host
    engine app. It then initializes DCGM, runs ProcessStatsStress to
    completion, and waits for the host engine app if one was started.

    Cleanup (apps.AppRunner.clean_all() and, if DCGM was initialized,
    dcgm_agent.dcgmShutdown()) always runs. Any exception raised along the
    way propagates to the caller unchanged.
    '''
    #Bound before the try block so the finally clause can always inspect
    #them, even when the very first statement inside the try raises.
    heHandle = None
    heAppRunner = None

    try:
        #Make sure logging stuff is bootstrapped
        option_parser.parse_options()
        option_parser.options.no_logging = True #Don't log anything

        dcgm_structs._LoadDcgmLibrary()

        if not g_embeddedMode:
            #Start host engine
            heAppRunner = apps.NvHostEngineApp()
            heAppRunner.start(timeout=1000000000)
            time.sleep(2.0) #Pause before initializing DCGM, as the original code did

        heHandle = dcgm_agent.dcgmInit()

        pssObj = ProcessStatsStress(g_embeddedMode, heHandle)
        pssObj.Run()
        del pssObj #Force destructor

        #No host engine app exists in embedded mode, so there is nothing to wait on
        if heAppRunner is not None:
            heAppRunner.wait()
    finally:
        apps.AppRunner.clean_all()
        if heHandle is not None:
            dcgm_agent.dcgmShutdown()
def _test_connection_helper(domainSocketName):
    '''
    Exercise a unix domain socket connection to the host engine two ways.

    First connects through the raw dcgm_agent API, queries the supported
    devices and disconnects. Then connects again through pydcgm.DcgmHandle,
    queries the GPU ids and drops the handle object.

    domainSocketName is passed unchanged as the address / unixSocketPath of
    both connections. The queries are made only so that a broken connection
    surfaces as an exception; their results are not inspected.
    '''
    #Make sure the library is initialized
    dcgm_agent.dcgmInit()

    #First, try the raw method of using the dcgm_agent API directly
    rawParams = dcgm_structs.c_dcgmConnectV2Params_v2()
    paramsVersion = dcgm_structs.c_dcgmConnectV2Params_version2
    rawParams.version = paramsVersion
    rawParams.addressIsUnixSocket = 1
    rawHandle = dcgm_agent.dcgmConnect_v2(domainSocketName, rawParams,
                                          paramsVersion)

    #Use the handle, which will throw an exception on error
    dcgm_agent.dcgmGetAllSupportedDevices(rawHandle)
    dcgm_agent.dcgmDisconnect(rawHandle)

    #Now use the DcgmHandle method
    wrappedHandle = pydcgm.DcgmHandle(unixSocketPath=domainSocketName)
    system = wrappedHandle.GetSystem()
    system.discovery.GetAllGpuIds()

    #Try to disconnect cleanly from our domain socket
    del wrappedHandle
    wrappedHandle = None
def __enter__(self):
    '''
    Context manager entry.

    Runs dcgm_structs._dcgmInit(), stores the result of
    dcgm_agent.dcgmInit() on self.handle, and returns that handle as the
    value bound by the with statement.
    '''
    dcgm_structs._dcgmInit()
    initResult = dcgm_agent.dcgmInit()
    self.handle = initResult
    return self.handle
assert config_values[x].mComputeMode == expected_compute_mode, "The compute mode value for gpuID %d is incorrect."\ " Returned: %d Expected: %d" \ % (x, config_values[x].mComputeMode, expected_compute_mode) assert config_values[x].mEccMode == expected_ecc, "The ecc mode value for gpuID %d is incorrect."\ " Returned: %d Expected: %d" \ % (x, config_values[x].mEccMode, expected_ecc) pass ret = dcgm_agent.dcgmStatusDestroy(status_handle) assert (ret == dcgm_structs.DCGM_ST_OK ), "Failed to remove status handler, error: %s" % ret dcgm_structs._LoadDcgmLibrary() handle = dcgm_agent.dcgmInit() devices = dcgm_agent.dcgmGetAllDevices(handle) validDevices = list() for x in devices: fvSupported = dcgm_agent_internal.dcgmGetLatestValuesForFields( handle, x, [ dcgm_fields.DCGM_FI_DEV_RETIRED_DBE, ]) if (fvSupported[0].value.i64 != dcgmvalue.DCGM_INT64_NOT_SUPPORTED): validDevices.append(x) if (len(validDevices) == 0): print "Can only run if at least one GPU with ECC is present" sys.exit(1)