def __init__(self,
             handle=None,
             ipAddress=None,
             opMode=dcgm_structs.DCGM_OPERATION_MODE_AUTO,
             persistAfterDisconnect=False,
             unixSocketPath=None,
             timeoutMs=0):
    '''
    Constructor

    Depending on the arguments this either adopts an existing handle, starts
    an embedded host engine, or connects to a running host engine.

    handle                 is an existing handle from dcgmInit(). Pass None if
                           you want this object to handle DCGM initialization
                           for you
    ipAddress              is the host to connect to. None = start embedded
                           host engine
    opMode                 is a dcgm_structs.DCGM_OPERATION_MODE_* constant for
                           how the host engine should run (embedded mode only)
    persistAfterDisconnect (TCP-IP connections only) is whether the host engine
                           should persist all of our watches after we
                           disconnect. 1=persist our watches. 0=clean up after
                           our connection
    unixSocketPath         is a path on the local filesystem to a unix socket
                           that the host engine is listening on. This option is
                           mutually exclusive with ipAddress
    timeoutMs              is how long to wait for TCP/IP or Unix domain
                           connections to establish in ms.
                           0=Default timeout (5000ms)

    Raises the DCGM_ST_BADPARAM exception class if both ipAddress and
    unixSocketPath are provided.
    '''
    self._handleCreated = False
    self._persistAfterDisconnect = persistAfterDisconnect

    #An existing handle was passed in. Adopt it; _handleCreated stays False
    if handle is not None:
        self.handle = handle
        return

    self._ipAddress = ipAddress

    haveIpAddress = ipAddress is not None
    haveUnixSocket = unixSocketPath is not None

    #Can't provide both unix socket and ip address
    if haveIpAddress and haveUnixSocket:
        raise dcgm_structs.dcgmExceptionClass(dcgm_structs.DCGM_ST_BADPARAM)

    #Initialize the DCGM client library
    dcgm_structs._dcgmInit()
    dcgm_agent.dcgmInit() #Not harmful to call this multiple times in a process

    #If neither ipAddress nor unixSocketPath are present, start an embedded host engine
    if not (haveIpAddress or haveUnixSocket):
        self.handle = dcgm_agent.dcgmStartEmbedded(opMode)
        self.isEmbedded = True
        self._handleCreated = True
        return

    #Set up connection parameters. We're connecting to something
    connectParams = dcgm_structs.c_dcgmConnectV2Params_v2()
    connectParams.version = dcgm_structs.c_dcgmConnectV2Params_version
    connectParams.timeoutMs = timeoutMs
    connectParams.persistAfterDisconnect = 1 if self._persistAfterDisconnect else 0

    #ipAddress wins when present; otherwise we must have a unix socket path
    connectParams.addressIsUnixSocket = 0 if haveIpAddress else 1
    connectToAddress = ipAddress if haveIpAddress else unixSocketPath

    self.handle = dcgm_agent.dcgmConnect_v2(connectToAddress, connectParams)
    self.isEmbedded = False
    self._handleCreated = True
def main():
    '''
    Demo driver: start an embedded host engine, watch two clock fields on all
    GPUs with both watcher classes, and print what arrived once per timeStep
    seconds. Loops forever.
    '''
    operationMode = dcgm_structs.DCGM_OPERATION_MODE_AUTO
    timeStep = 1.0

    dcgm_structs._dcgmInit()
    dcgm_agent.dcgmInit() #Will throw an exception on error
    handle = dcgm_agent.dcgmStartEmbedded(operationMode)
    handleObj = pydcgm.DcgmHandle(handle=handle)
    groupId = dcgm_structs.DCGM_GROUP_ALL_GPUS
    fieldIds = [
        dcgm_fields.DCGM_FI_DEV_SM_CLOCK,
        dcgm_fields.DCGM_FI_DEV_MEM_CLOCK,
    ]

    fieldGroup = pydcgm.DcgmFieldGroup(handleObj, "my_field_group", fieldIds)

    updateFreq = int(timeStep * 1000000.0)
    maxKeepAge = 3600.0 #1 hour
    maxKeepSamples = 0 #unlimited. maxKeepAge will enforce quota
    startTimestamp = 0 #beginning of time

    #Both watchers take exactly the same constructor arguments
    watcherArgs = (handle, groupId, fieldGroup, operationMode, updateFreq,
                   maxKeepAge, maxKeepSamples, startTimestamp)
    dfcw = DcgmFieldGroupWatcher(*watcherArgs)
    dfcw2 = DcgmFieldGroupEntityWatcher(*watcherArgs)

    while True:
        newUpdateCount = dfcw.GetAllSinceLastCall()
        newUpdateCount2 = dfcw2.GetAllSinceLastCall()
        print("Got %d and %d new field value updates" %
              (newUpdateCount, newUpdateCount2))

        #GPU-keyed view: values[gpuId][fieldId] -> list of samples
        for gpuId in list(dfcw.values.keys()):
            print("gpuId %d" % gpuId)
            perGpu = dfcw.values[gpuId]
            for fieldId in list(perGpu.keys()):
                samples = perGpu[fieldId]
                print(" fieldId %d: %d values. latest timestamp %d" %
                      (fieldId, len(samples), samples[-1].ts))

        #Entity-keyed view: values[entityGroupId][entityId][fieldId] -> list of samples
        for entityGroupId in list(dfcw2.values.keys()):
            print("entityGroupId %d" % entityGroupId)
            perEntityGroup = dfcw2.values[entityGroupId]
            for entityId in list(perEntityGroup.keys()):
                print(" entityId %d" % entityId)
                perEntity = perEntityGroup[entityId]
                for fieldId in list(perEntity.keys()):
                    samples = perEntity[fieldId]
                    print(" fieldId %d: %d values. latest timestamp %d" %
                          (fieldId, len(samples), samples[-1].ts))

        time.sleep(timeStep)
###############################################################################
def dcmg_http_app(environ, start_response):
    '''
    Main entry point

    WSGI application callable. Delegates the request to the module-level
    DcgmHttpServer instance and wraps its return value in a single-element
    list, as WSGI expects an iterable body.

    environ        is the WSGI environment dictionary
    start_response is the WSGI start_response callable

    NOTE(review): under Python 3, PEP 3333 requires the body items to be
    bytes - confirm what WsgiMain() returns.
    '''
    responseStr = g_dcgmServer.WsgiMain(environ, start_response)
    return [responseStr]

###############################################################################
def application(environ, start_response):
    '''
    Callback for uWSGI

    Thin alias so uWSGI finds the conventional "application" name.
    '''
    return dcmg_http_app(environ, start_response)

#Try to load the DCGM library
dcgm_structs._dcgmInit()
g_dcgmServer = DcgmHttpServer()

###############################################################################
if __name__ == '__main__':
    httpd = make_server('', DCGM_HTTP_PORT, dcmg_http_app)
    #Parenthesized single-argument form prints identically on Python 2 and 3;
    #the old print statement was a SyntaxError on Python 3
    print("Serving HTTP on port %d..." % DCGM_HTTP_PORT)

    # Respond to requests until process is killed
    httpd.serve_forever()

###############################################################################
def run_tests():
    '''
    Run every test module returned by test_utils.get_test_content().

    Takes no arguments. The sequence is:
      1. Log environment info and restore the default environment variables.
      2. Load the DCGM library; if it cannot be found, log hints and re-raise.
      3. Start (or attach to) a host engine just long enough to confirm that
         at least one GPU is visible. Returns early if none are.
      4. Enable persistence mode. Returns early on error.
      5. Run each test function, restoring default state after each one and
         once more at the very end, even if the run is interrupted.
    '''
    with test_utils.SubTest("Main"):

        log_environment_info()

        test_utils.RestoreDefaultEnvironment.restore_env()
        try:
            dcgm_structs._dcgmInit(utils.get_testing_framework_library_path())
        except dcgm_structs.dcgmExceptionClass(dcgm_structs.DCGM_ST_LIBRARY_NOT_FOUND):
            logger.warning(
                "DCGM Library hasn't been found in the system, is the driver correctly installed?"
            )

            if utils.is_linux() and utils.is_32bit() and utils.is_system_64bit():
                # 32bit test on 64bit system
                logger.warning(
                    "Make sure that you've installed driver with both 64bit and 32bit binaries (e.g. not -internal.run or -no-compact32.run)"
                )
            raise

        # Both host engine flavours are context managers that yield a handle,
        # so pick the class once instead of duplicating the GPU check
        if option_parser.options.use_running_hostengine:
            hostEngineContext = test_utils.RunStandaloneHostEngine
        else:
            hostEngineContext = test_utils.RunEmbeddedHostEngine

        with hostEngineContext() as handle:
            dcgmGpuCount = test_utils.log_gpu_information(handle)
            if dcgmGpuCount < 1:
                logger.error("No DCGM-whitelisted GPUs found. Skipping tests.")
                return

        # Persistence mode is required
        (_, error) = nvidia_smi_utils.enable_persistence_mode()
        if error:
            logger.error(error)
            return

        with test_utils.SubTest("restore state", quiet=True):
            test_utils.RestoreDefaultEnvironment.restore() # restore the nvml settings

        test_content = test_utils.get_test_content()
        try:
            # Each entry is indexed as (module object, iterable of test functions)
            for module in test_content:
                # Attempt to clean up stranded processes instead of aborting
                kill_hostengine_if_needed()

                with test_utils.SubTest("module %s" % module[0].__name__):
                    for function in module[1]:
                        test_utils.run_subtest(function)
                        with test_utils.SubTest("%s - restore state" % (function.__name__), quiet=True):
                            test_utils.RestoreDefaultEnvironment.restore()
        finally:
            # SubTest might return KeyboardInterrupt exception. We should try to restore
            # state before closing
            with test_utils.SubTest("restore state", quiet=True):
                test_utils.RestoreDefaultEnvironment.restore()
except:
    # NOTE(review): the matching try: is outside this view; presumably it
    # wraps "import psutil" - confirm. A bare except also catches
    # KeyboardInterrupt/SystemExit; consider narrowing it.
    print("psutil is missing. Install with 'pip3 install psutil'")
    sys.exit(1)

# Simulation parameters
numClients = 4 #How many clients to simulate
watchInterval = 10.0 #How often to update watches in seconds
prodWatchInterval = 30.0 #How often DCGM-exporter updates watches in seconds
prodDivisor = prodWatchInterval / watchInterval #Ratio of the production interval to our interval
gpuIds = None #Either set this to None to use all gpus or a list of gpuIds like [0,1]
fieldIds = [1001, 1004, 1005, 1009, 1010, 1011, 1012] #DCGM-exporter default list

print("Watch list: %s" % (str(fieldIds)))

# Load the DCGM library from a path relative to the current working directory
dcgm_structs._dcgmInit('../apps/amd64')

def getNvHostEngineProcessObject():
    # Scan the running processes and return a psutil.Process for the first
    # one whose name is exactly 'nv-hostengine'. Returns None if no process
    # has that name.
    for proc in psutil.process_iter(['name', 'pid']):
        if proc.info['name'] == 'nv-hostengine':
            return psutil.Process(proc.info['pid'])

    return None

# The host engine must already be running; bail out otherwise
dcgmProcess = getNvHostEngineProcessObject()
if dcgmProcess is None:
    print("nv-hostengine was not running")
    sys.exit(1)
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

#Bring classes into this namespace
from DcgmHandle import *
from DcgmGroup import *
from DcgmStatus import *
from DcgmSystem import *
from DcgmFieldGroup import *

import os

#When running under the testing framework, load the DCGM library from the
#framework's own library path as soon as this module is imported
if os.environ.get('__DCGM_TESTING_FRAMEWORK_ACTIVE') == '1':
    import utils
    import dcgm_structs
    dcgm_structs._dcgmInit(utils.get_testing_framework_library_path())


class DcgmException(Exception):
    '''
    Define a unique exception type we will return so that callers can distinguish our exceptions from python standard ones
    '''
def __enter__(self):
    '''
    Context-manager entry: load the DCGM library, call dcgmInit(), remember
    its return value on self.handle and hand the same value to the with-block.
    '''
    dcgm_structs._dcgmInit()
    initResult = dcgm_agent.dcgmInit()
    self.handle = initResult
    return initResult
def InitWrapped(self, path=None):
    '''
    Load the DCGM library and then (re)establish this object's connection.

    path is forwarded to dcgm_structs._dcgmInit() as libDcgmPath. The default
    of None is passed through unchanged.
    '''
    dcgm_structs._dcgmInit(libDcgmPath=path)

    #Connection setup lives in Reconnect(); reuse it
    self.Reconnect()