Beispiel #1
0
    def __init__(self, handle=None, ipAddress=None,
                 opMode=dcgm_structs.DCGM_OPERATION_MODE_AUTO, persistAfterDisconnect=False,
                 unixSocketPath=None, timeoutMs=0):
        '''
        Wrap an existing DCGM handle, start an embedded host engine, or connect to a running one.

        handle: an already-created DCGM handle to adopt. When given, it is stored as-is and
                no initialization or connection is attempted. None = create the handle here.
        ipAddress: host to connect to over TCP/IP. If both ipAddress and unixSocketPath are
                   None, an embedded host engine is started instead.
        opMode: a dcgm_structs.DCGM_OPERATION_MODE_* constant; only passed on when starting
                an embedded host engine.
        persistAfterDisconnect: whether the host engine should keep our watches after we
                                disconnect. Truthy = persist, falsy = clean up.
        unixSocketPath: filesystem path of a unix socket the host engine is listening on.
                        Mutually exclusive with ipAddress.
        timeoutMs: how long to wait for a TCP/IP or unix socket connection to establish,
                   in ms. 0 = default timeout (5000ms).

        Raises dcgm_structs.dcgmExceptionClass(DCGM_ST_BADPARAM) if both ipAddress and
        unixSocketPath are provided.
        '''
        self._persistAfterDisconnect = persistAfterDisconnect
        self._handleCreated = False

        #A caller-supplied handle is adopted untouched; we did not create it
        if handle is not None:
            self.handle = handle
            return

        self._ipAddress = ipAddress

        haveIpAddress = ipAddress is not None
        haveUnixSocket = unixSocketPath is not None

        #The two remote addressing modes cannot be combined
        if haveIpAddress and haveUnixSocket:
            raise dcgm_structs.dcgmExceptionClass(dcgm_structs.DCGM_ST_BADPARAM)

        #Bring up the DCGM client library
        dcgm_structs._dcgmInit()
        dcgm_agent.dcgmInit() #Safe to call more than once per process

        #No remote address at all means we host the engine ourselves
        if not (haveIpAddress or haveUnixSocket):
            self.handle = dcgm_agent.dcgmStartEmbedded(opMode)
            self.isEmbedded = True
            self._handleCreated = True
            return

        #Remote connection: fill in the connection parameters
        params = dcgm_structs.c_dcgmConnectV2Params_v2()
        params.version = dcgm_structs.c_dcgmConnectV2Params_version
        params.timeoutMs = timeoutMs
        params.persistAfterDisconnect = 1 if self._persistAfterDisconnect else 0
        params.addressIsUnixSocket = 0 if haveIpAddress else 1

        #Exactly one of the two addresses is set at this point
        targetAddress = ipAddress if haveIpAddress else unixSocketPath

        self.handle = dcgm_agent.dcgmConnect_v2(targetAddress, params)
        self.isEmbedded = False
        self._handleCreated = True
Beispiel #2
0
def main():
    """
    Demo: start an embedded host engine, watch two clock fields with both
    watcher classes, and print a summary of the collected values forever.

    The loop has no exit condition; the function only ends when interrupted
    or when one of the DCGM calls raises.
    """
    opMode = dcgm_structs.DCGM_OPERATION_MODE_AUTO
    pollIntervalSec = 1.0

    dcgm_structs._dcgmInit()
    dcgm_agent.dcgmInit()  #Raises on failure
    handle = dcgm_agent.dcgmStartEmbedded(opMode)
    handleObj = pydcgm.DcgmHandle(handle=handle)

    watchedFieldIds = [
        dcgm_fields.DCGM_FI_DEV_SM_CLOCK,
        dcgm_fields.DCGM_FI_DEV_MEM_CLOCK,
    ]
    fieldGroup = pydcgm.DcgmFieldGroup(handleObj, "my_field_group",
                                       watchedFieldIds)

    #Positional arguments shared by both watcher constructors
    watcherArgs = (
        handle,
        dcgm_structs.DCGM_GROUP_ALL_GPUS,  #group to watch
        fieldGroup,
        opMode,
        int(pollIntervalSec * 1000000.0),  #update frequency (seconds scaled by 1e6)
        3600.0,  #max keep age: 1 hour
        0,  #max keep samples: unlimited, the keep age enforces the quota
        0,  #start timestamp: beginning of time
    )
    gpuWatcher = DcgmFieldGroupWatcher(*watcherArgs)
    entityWatcher = DcgmFieldGroupEntityWatcher(*watcherArgs)

    while True:
        gpuUpdates = gpuWatcher.GetAllSinceLastCall()
        entityUpdates = entityWatcher.GetAllSinceLastCall()
        print("Got %d and %d new field value updates" %
              (gpuUpdates, entityUpdates))

        #Per-GPU view: values[gpuId][fieldId] -> sequence of samples
        for gpuId in list(gpuWatcher.values.keys()):
            print("gpuId %d" % gpuId)
            fieldsForGpu = gpuWatcher.values[gpuId]
            for fieldId in list(fieldsForGpu.keys()):
                samples = fieldsForGpu[fieldId]
                print("    fieldId %d: %d values. latest timestamp %d" %
                      (fieldId, len(samples), samples[-1].ts))

        #Per-entity view: values[entityGroupId][entityId][fieldId] -> sequence of samples
        for entityGroupId in list(entityWatcher.values.keys()):
            print("entityGroupId %d" % entityGroupId)
            entitiesInGroup = entityWatcher.values[entityGroupId]
            for entityId in list(entitiesInGroup.keys()):
                print("    entityId %d" % entityId)
                fieldsForEntity = entitiesInGroup[entityId]
                for fieldId in list(fieldsForEntity.keys()):
                    samples = fieldsForEntity[fieldId]
                    print("        fieldId %d: %d values. latest timestamp %d" %
                          (fieldId, len(samples), samples[-1].ts))

        time.sleep(pollIntervalSec)
Beispiel #3
0
def main():
    """
    Run the ProcessStatsStress test, starting a standalone host engine first
    unless g_embeddedMode is set.

    Cleanup always runs: apps.AppRunner.clean_all() is called unconditionally,
    and dcgm_agent.dcgmShutdown() is called if dcgmInit() returned a non-None
    value. Any exception raised along the way propagates to the caller after
    that cleanup.
    """
    #Bound before the try block so the finally clause can always read them,
    #even when option parsing or library loading fails
    heHandle = None
    heAppRunner = None

    try:
        #Make sure logging stuff is bootstrapped
        option_parser.parse_options()
        option_parser.options.no_logging = True  #Don't log anything

        dcgm_structs._LoadDcgmLibrary()

        if not g_embeddedMode:
            #Start host engine
            heAppRunner = apps.NvHostEngineApp()
            heAppRunner.start(timeout=1000000000)
            #NOTE(review): presumably gives the host engine time to come up -- confirm
            time.sleep(2.0)

        heHandle = dcgm_agent.dcgmInit()

        pssObj = ProcessStatsStress(g_embeddedMode, heHandle)
        pssObj.Run()
        del pssObj  #Force destructor

        #No host engine app was started in embedded mode, so there is nothing to wait on
        if heAppRunner is not None:
            heAppRunner.wait()
    finally:
        apps.AppRunner.clean_all()
        if heHandle is not None:
            dcgm_agent.dcgmShutdown()
Beispiel #4
0
def _test_connection_helper(domainSocketName):
    """
    Connect to the unix domain socket at domainSocketName twice -- once with
    the raw dcgm_agent API and once through pydcgm.DcgmHandle -- and issue a
    GPU query over each connection.

    Nothing is returned; a failure surfaces as an exception from the DCGM calls.
    """
    #Make sure the library is initialized
    dcgm_agent.dcgmInit()

    #Pass 1: the raw dcgm_agent API
    connectParams = dcgm_structs.c_dcgmConnectV2Params_v2()
    paramsVersion = dcgm_structs.c_dcgmConnectV2Params_version2
    connectParams.version = paramsVersion
    connectParams.addressIsUnixSocket = 1
    rawHandle = dcgm_agent.dcgmConnect_v2(domainSocketName, connectParams,
                                          paramsVersion)
    #Exercise the handle; this throws an exception on error
    dcgm_agent.dcgmGetAllSupportedDevices(rawHandle)
    dcgm_agent.dcgmDisconnect(rawHandle)

    #Pass 2: the DcgmHandle wrapper
    dcgmHandle = pydcgm.DcgmHandle(unixSocketPath=domainSocketName)
    #Kept in a local so the system object stays alive until the function returns
    system = dcgmHandle.GetSystem()
    system.discovery.GetAllGpuIds()

    #Try to disconnect cleanly from our domain socket
    del dcgmHandle
    dcgmHandle = None
Beispiel #5
0
 def __enter__(self):
     """
     Initialize DCGM on entering the with-block.

     Calls dcgm_structs._dcgmInit(), stores the result of dcgm_agent.dcgmInit()
     on self.handle, and returns self.handle as the with-statement target.
     """
     dcgm_structs._dcgmInit()
     initResult = dcgm_agent.dcgmInit()
     self.handle = initResult
     return self.handle
Beispiel #6
0
        assert config_values[x].mComputeMode  == expected_compute_mode, "The compute mode value for gpuID %d is incorrect."\
                                " Returned: %d Expected: %d" \
                                % (x, config_values[x].mComputeMode, expected_compute_mode)

        assert config_values[x].mEccMode  == expected_ecc, "The ecc mode value for gpuID %d is incorrect."\
                                " Returned: %d Expected: %d" \
                                % (x, config_values[x].mEccMode, expected_ecc)
        pass

    ret = dcgm_agent.dcgmStatusDestroy(status_handle)
    assert (ret == dcgm_structs.DCGM_ST_OK
            ), "Failed to remove status handler, error: %s" % ret


#Module-level setup: load and initialize DCGM, then pick the GPUs this script can run on
dcgm_structs._LoadDcgmLibrary()
handle = dcgm_agent.dcgmInit()

devices = dcgm_agent.dcgmGetAllDevices(handle)

#Keep only the devices whose latest DCGM_FI_DEV_RETIRED_DBE value is not the
#"not supported" sentinel; the exit message below treats those as the GPUs with ECC
validDevices = []
for x in devices:
    fvSupported = dcgm_agent_internal.dcgmGetLatestValuesForFields(
        handle, x, [
            dcgm_fields.DCGM_FI_DEV_RETIRED_DBE,
        ])
    if fvSupported[0].value.i64 != dcgmvalue.DCGM_INT64_NOT_SUPPORTED:
        validDevices.append(x)

if not validDevices:
    #Function-call form of print: a single argument prints identically on Python 2 and 3
    print("Can only run if at least one GPU with ECC is present")
    sys.exit(1)