Example #1
0
    def __init__(self, handle=None, ipAddress=None,
                 opMode=dcgm_structs.DCGM_OPERATION_MODE_AUTO, persistAfterDisconnect=False,
                 unixSocketPath=None, timeoutMs=0):
        '''
        Wrap an existing DCGM handle, start an embedded host engine, or connect to a host engine

        handle: an existing handle from dcgmInit(). When given, it is stored as-is and no
                initialization or connection is attempted. None = let this object set DCGM up
        ipAddress: the host to connect to. If both this and unixSocketPath are None, an
                   embedded host engine is started
        opMode: a dcgm_structs.DCGM_OPERATION_MODE_* constant passed to the embedded host
                engine (embedded mode only)
        persistAfterDisconnect: used only when connecting to a host engine. Whether the host engine
                                should persist all of our watches after we disconnect.
                                Truthy=persist our watches. Falsy=clean up after our connection
        unixSocketPath: path on the local filesystem of a unix socket that the host engine is
                        listening on. Mutually exclusive with ipAddress
        timeoutMs: how long to wait for TCP/IP or Unix domain connections to establish in ms.
                   0=Default timeout (5000ms)

        Raises the dcgm_structs exception for DCGM_ST_BADPARAM if both ipAddress and
        unixSocketPath are provided
        '''
        self._handleCreated = False
        self._persistAfterDisconnect = persistAfterDisconnect

        #Caller supplied a handle: adopt it and skip all setup
        if handle is not None:
            self.handle = handle
            return

        self._ipAddress = ipAddress

        wantsTcp = ipAddress is not None
        wantsUnixSocket = unixSocketPath is not None

        #A unix socket and an ip address cannot both be requested
        if wantsTcp and wantsUnixSocket:
            raise dcgm_structs.dcgmExceptionClass(dcgm_structs.DCGM_ST_BADPARAM)

        #Initialize the DCGM client library
        dcgm_structs._dcgmInit()
        dcgm_agent.dcgmInit() #Not harmful to call this multiple times in a process

        if not (wantsTcp or wantsUnixSocket):
            #Nothing to connect to, so start an embedded host engine
            self.handle = dcgm_agent.dcgmStartEmbedded(opMode)
            self.isEmbedded = True
            self._handleCreated = True
            return

        #We're connecting to something. Fill in the connection parameters
        connectParams = dcgm_structs.c_dcgmConnectV2Params_v2()
        connectParams.version = dcgm_structs.c_dcgmConnectV2Params_version
        connectParams.timeoutMs = timeoutMs
        connectParams.persistAfterDisconnect = 1 if self._persistAfterDisconnect else 0
        connectParams.addressIsUnixSocket = 0 if wantsTcp else 1
        connectToAddress = ipAddress if wantsTcp else unixSocketPath

        self.handle = dcgm_agent.dcgmConnect_v2(connectToAddress, connectParams)
        self.isEmbedded = False
        self._handleCreated = True
Example #2
0
def main():
    '''
    Start an embedded host engine, watch the SM clock and memory clock fields on
    DCGM_GROUP_ALL_GPUS with both watcher classes, and print a summary of the
    collected values every timeStep seconds. Loops forever.
    '''
    operationMode = dcgm_structs.DCGM_OPERATION_MODE_AUTO
    timeStep = 1.0

    dcgm_structs._dcgmInit()
    dcgm_agent.dcgmInit()  #Will throw an exception on error
    handle = dcgm_agent.dcgmStartEmbedded(operationMode)
    handleObj = pydcgm.DcgmHandle(handle=handle)
    groupId = dcgm_structs.DCGM_GROUP_ALL_GPUS
    fieldIds = [
        dcgm_fields.DCGM_FI_DEV_SM_CLOCK,
        dcgm_fields.DCGM_FI_DEV_MEM_CLOCK,
    ]

    fieldGroup = pydcgm.DcgmFieldGroup(handleObj, "my_field_group", fieldIds)

    updateFreq = int(timeStep * 1000000.0)
    maxKeepAge = 3600.0  #1 hour
    maxKeepSamples = 0  #unlimited. maxKeepAge will enforce quota
    startTimestamp = 0  #beginning of time

    #Both watchers are built from exactly the same positional arguments
    watcherArgs = (handle, groupId, fieldGroup, operationMode, updateFreq,
                   maxKeepAge, maxKeepSamples, startTimestamp)
    dfcw = DcgmFieldGroupWatcher(*watcherArgs)
    dfcw2 = DcgmFieldGroupEntityWatcher(*watcherArgs)

    while True:
        newUpdateCount = dfcw.GetAllSinceLastCall()
        newUpdateCount2 = dfcw2.GetAllSinceLastCall()
        print("Got %d and %d new field value updates" %
              (newUpdateCount, newUpdateCount2))

        #Per-GPU view: values[gpuId][fieldId] -> list of samples
        for gpuId in list(dfcw.values.keys()):
            print("gpuId %d" % gpuId)
            gpuFields = dfcw.values[gpuId]
            for fieldId in list(gpuFields.keys()):
                samples = gpuFields[fieldId]
                print("    fieldId %d: %d values. latest timestamp %d" %
                      (fieldId, len(samples), samples[-1].ts))

        #Per-entity view: values[entityGroupId][entityId][fieldId] -> list of samples
        for entityGroupId in list(dfcw2.values.keys()):
            print("entityGroupId %d" % entityGroupId)
            groupEntities = dfcw2.values[entityGroupId]
            for entityId in list(groupEntities.keys()):
                print("    entityId %d" % entityId)
                entityFields = groupEntities[entityId]
                for fieldId in list(entityFields.keys()):
                    samples = entityFields[fieldId]
                    print("        fieldId %d: %d values. latest timestamp %d" %
                          (fieldId, len(samples), samples[-1].ts))

        time.sleep(timeStep)
Example #3
0
###############################################################################
def dcmg_http_app(environ, start_response):
    '''
    WSGI application. Hands the request to the global g_dcgmServer and returns
    its response wrapped in a single-element list
    '''
    return [g_dcgmServer.WsgiMain(environ, start_response)]


###############################################################################
def application(environ, start_response):
    '''
    uWSGI callback. Forwards both arguments to dcmg_http_app and returns its result
    '''
    wsgiResponse = dcmg_http_app(environ, start_response)
    return wsgiResponse


#Try to load the DCGM library
dcgm_structs._dcgmInit()
#Module-level server instance that dcmg_http_app dispatches requests to
g_dcgmServer = DcgmHttpServer()

###############################################################################
if __name__ == '__main__':
    httpd = make_server('', DCGM_HTTP_PORT, dcmg_http_app)
    #Parenthesized so this line parses under both Python 2 and Python 3
    print("Serving HTTP on port %d..." % DCGM_HTTP_PORT)

    # Respond to requests until process is killed
    httpd.serve_forever()

###############################################################################
Example #4
0
def run_tests():
    '''
    Run every test module from test_utils.get_test_content() under a top-level "Main" subtest

    Steps, in order:
      - log environment info and restore the default environment variables
      - load the DCGM library. If it cannot be found, log hints and re-raise
      - open a standalone or embedded host engine context (chosen by the
        use_running_hostengine option) and check that at least one GPU is reported
      - enable persistence mode
      - run each test function, restoring the default state after every one

    Returns early, after logging an error, when no GPUs are found or when
    persistence mode cannot be enabled
    '''
    with test_utils.SubTest("Main"):

        log_environment_info()

        test_utils.RestoreDefaultEnvironment.restore_env()
        try:
            dcgm_structs._dcgmInit(utils.get_testing_framework_library_path())

        except dcgm_structs.dcgmExceptionClass(
                dcgm_structs.DCGM_ST_LIBRARY_NOT_FOUND):
            logger.warning(
                "DCGM Library hasn't been found in the system, is the driver correctly installed?"
            )

            if utils.is_linux() and utils.is_32bit() and utils.is_system_64bit(
            ):
                # 32bit test on 64bit system
                logger.warning(
                    "Make sure that you've installed driver with both 64bit and 32bit binaries (e.g. not -internal.run or -no-compact32.run)"
                )
            raise

        # Both host engine flavors are used identically; only the context manager differs
        if option_parser.options.use_running_hostengine:
            hostEngineContext = test_utils.RunStandaloneHostEngine
        else:
            hostEngineContext = test_utils.RunEmbeddedHostEngine

        with hostEngineContext() as handle:
            dcgmGpuCount = test_utils.log_gpu_information(handle)
            if dcgmGpuCount < 1:
                logger.error(
                    "No DCGM-whitelisted GPUs found. Skipping tests.")
                return

        # Persistence mode is required
        (_, error) = nvidia_smi_utils.enable_persistence_mode()
        if error:
            logger.error(error)
            return

        with test_utils.SubTest("restore state", quiet=True):
            test_utils.RestoreDefaultEnvironment.restore(
            )  # restore the nvml settings

        test_content = test_utils.get_test_content()
        try:
            # Each entry is indexed as (module object, iterable of test functions)
            for module in test_content:
                # Attempt to clean up stranded processes instead of aborting
                kill_hostengine_if_needed()

                with test_utils.SubTest("module %s" % module[0].__name__):
                    for function in module[1]:
                        test_utils.run_subtest(function)
                        with test_utils.SubTest("%s - restore state" %
                                                (function.__name__),
                                                quiet=True):
                            test_utils.RestoreDefaultEnvironment.restore()
        finally:
            # SubTest might return KeyboardInterrupt exception. We should try to restore
            # state before closing
            with test_utils.SubTest("restore state", quiet=True):
                test_utils.RestoreDefaultEnvironment.restore()
Example #5
0
except:
    print("psutil is missing. Install with 'pip3 install psutil'")
    sys.exit(1)

# Simulation settings
numClients = 4  # Number of clients to simulate
watchInterval = 10.0  # Seconds between watch updates
prodWatchInterval = 30.0  # Seconds between watch updates made by DCGM-exporter
# Ratio of the DCGM-exporter interval to the interval used here
prodDivisor = prodWatchInterval / watchInterval
gpuIds = None  # None = use all gpus. Otherwise a list of gpuIds like [0,1]

# DCGM-exporter default list
fieldIds = [
    1001,
    1004,
    1005,
    1009,
    1010,
    1011,
    1012,
]

print("Watch list: %s" % (fieldIds,))

dcgm_structs._dcgmInit('../apps/amd64')


def getNvHostEngineProcessObject():
    '''
    Return a psutil.Process for the first process named 'nv-hostengine',
    or None if no such process is found
    '''
    hostEngineProcesses = (
        psutil.Process(candidate.info['pid'])
        for candidate in psutil.process_iter(['name', 'pid'])
        if candidate.info['name'] == 'nv-hostengine'
    )
    # The generator is lazy, so only the first match is ever turned into a Process
    return next(hostEngineProcesses, None)


# Locate the running nv-hostengine process. Without one there is nothing to
# work with, so report it and exit with status 1
dcgmProcess = getNvHostEngineProcessObject()
if dcgmProcess is None:
    print("nv-hostengine was not running")
    sys.exit(1)
Example #6
0
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

#Bring classes into this namespace
from DcgmHandle import *
from DcgmGroup import *
from DcgmStatus import *
from DcgmSystem import *
from DcgmFieldGroup import *

import os

# When __DCGM_TESTING_FRAMEWORK_ACTIVE is exactly '1', load the DCGM library
# from the testing framework's library path
if os.environ.get('__DCGM_TESTING_FRAMEWORK_ACTIVE') == '1':
    import utils
    import dcgm_structs
    dcgm_structs._dcgmInit(utils.get_testing_framework_library_path())
class DcgmException(Exception):
    '''
    A unique exception type we will return so that callers can distinguish our exceptions from python standard ones
    '''
Example #7
0
 def __enter__(self):
     '''
     Load the DCGM library, call dcgm_agent.dcgmInit(), store its result in
     self.handle, and return that same value
     '''
     dcgm_structs._dcgmInit()
     initResult = dcgm_agent.dcgmInit()
     self.handle = initResult
     return initResult
Example #8
0
 def InitWrapped(self, path=None):
     '''
     Load the DCGM library, then call self.Reconnect()

     path: forwarded to dcgm_structs._dcgmInit as libDcgmPath. Defaults to None
           (presumably "let _dcgmInit pick the library location" - confirm against _dcgmInit)
     '''
     dcgm_structs._dcgmInit(libDcgmPath=path)
     self.Reconnect()