def test_dcgm_vgpu_config_get_validate(handle):
    """
    Validates that vtDcgmVgpuConfigGet rejects bad structure versions.

    Calls the API with an invalid version (0) and an arbitrary wrong
    version (50); both must raise DCGM_ST_VER_MISMATCH.
    """
    handleObj = pydcgm.DcgmHandle(handle=handle)
    systemObj = handleObj.GetSystem()
    gpuIdList = systemObj.discovery.GetAllGpuIds()
    # The original ">= 0" comparison could never fail (len() is never
    # negative); the message shows the intent is "at least one device".
    assert len(gpuIdList) > 0, "Not able to find devices on the node for embedded case"

    groupId = dcgm_agent.dcgmGroupCreate(handle, dcgm_structs.DCGM_GROUP_DEFAULT, "test1")
    groupInfo = dcgm_agent.dcgmGroupGetInfo(handle, groupId)
    status_handle = dcgm_agent.dcgmStatusCreate()

    with test_utils.assert_raises(dcgmExceptionClass(dcgm_structs.DCGM_ST_VER_MISMATCH)):
        versionTest = 0  #invalid version
        ret = vtDcgmVgpuConfigGet(handle, groupId, dcgm_structs.DCGM_CONFIG_CURRENT_STATE,
                                  groupInfo.count, status_handle, versionTest)

    with test_utils.assert_raises(dcgmExceptionClass(dcgm_structs.DCGM_ST_VER_MISMATCH)):
        versionTest = 50  #random number version
        ret = vtDcgmVgpuConfigGet(handle, groupId, dcgm_structs.DCGM_CONFIG_CURRENT_STATE,
                                  groupInfo.count, status_handle, versionTest)
def test_dcgm_policy_get_validate(handle):
    """
    Validates that vtDcgmPolicyGet rejects bad structure versions.

    Calls the API with an invalid version (0) and an arbitrary wrong
    version (50); both must raise DCGM_ST_VER_MISMATCH.
    """
    handleObj = pydcgm.DcgmHandle(handle=handle)
    systemObj = handleObj.GetSystem()
    gpuIdList = systemObj.discovery.GetAllGpuIds()
    # The original ">= 0" comparison could never fail (len() is never
    # negative); the message shows the intent is "at least one device".
    assert len(gpuIdList) > 0, "Not able to find devices on the node for embedded case"

    groupId = dcgm_agent.dcgmGroupCreate(handle, dcgm_structs.DCGM_GROUP_DEFAULT, "test1")
    status_handle = dcgm_agent.dcgmStatusCreate()
    count = 1
    # Removed unused local "diagLevel" (DCGM_DIAG_LVL_SHORT was never passed anywhere).

    with test_utils.assert_raises(dcgmExceptionClass(dcgm_structs.DCGM_ST_VER_MISMATCH)):
        versionTest = 0  #invalid version
        ret = vtDcgmPolicyGet(handle, groupId, count, status_handle, versionTest)

    with test_utils.assert_raises(dcgmExceptionClass(dcgm_structs.DCGM_ST_VER_MISMATCH)):
        versionTest = 50  #random number version
        ret = vtDcgmPolicyGet(handle, groupId, count, status_handle, versionTest)
def helper_check_profiling_environment(dcgmGroup):
    """
    Skip the current test unless profiling is usable on dcgmGroup.

    Probes GetSupportedMetricGroups() and maps each known failure code
    to the matching skip message.
    """
    try:
        dcgmGroup.profiling.GetSupportedMetricGroups()
    except dcgm_structs.dcgmExceptionClass(dcgm_structs.DCGM_ST_PROFILING_NOT_SUPPORTED):
        test_utils.skip_test(g_profNotSupportedErrorStr)
    except dcgm_structs.dcgmExceptionClass(dcgm_structs.DCGM_ST_MODULE_NOT_LOADED):
        test_utils.skip_test(g_moduleNotLoadedErrorStr)
    except dcgm_structs.dcgmExceptionClass(dcgm_structs.DCGM_ST_NOT_SUPPORTED):
        test_utils.skip_test(g_profNotSupportedErrorStr)
def test_dcgm_get_vgpu_instance_attributes_validate(handle, gpuIds):
    """
    Verifies that vGPU instance attribute queries reject bad structure versions.
    """
    # 0 is an invalid version; 50 is an arbitrary wrong one.
    for badVersion in (0, 50):
        with test_utils.assert_raises(dcgmExceptionClass(dcgm_structs.DCGM_ST_VER_MISMATCH)):
            vtDcgmGetVgpuInstanceAttributes(handle, gpuIds[0], badVersion)
def test_dcgm_engine_watch_field_values(handle):
    """
    Verifies that the cache manager can watch a field value.

    Records the watcher count before adding a watch, adds one, and checks
    that the field is flagged as watched with exactly one extra watcher.
    """
    fieldId = dcgm_fields.DCGM_FI_DEV_NAME
    gpuId = 0

    # An unwatched field raises DCGM_ST_NOT_WATCHED, which means zero watchers.
    try:
        fieldInfo = dcgm_agent_internal.dcgmGetCacheManagerFieldInfo(handle, gpuId, fieldId)
        numWatchersBefore = fieldInfo.numWatchers
    except dcgm_structs.dcgmExceptionClass(dcgm_structs.DCGM_ST_NOT_WATCHED):
        numWatchersBefore = 0

    ret = dcgm_agent_internal.dcgmWatchFieldValue(handle, gpuId, fieldId, 10000000, 86400.0, 0)
    assert ret == dcgm_structs.DCGM_ST_OK

    fieldInfo = dcgm_agent_internal.dcgmGetCacheManagerFieldInfo(handle, gpuId, fieldId)
    assert fieldInfo.flags & dcgm_structs_internal.DCGM_CMI_F_WATCHED, \
        "Expected watch. got flags %08X" % fieldInfo.flags

    numWatchersAfter = fieldInfo.numWatchers
    assert numWatchersAfter == numWatchersBefore + 1, \
        "Expected 1 extra watcher. Before %d. After %d" % (numWatchersBefore, numWatchersAfter)
def test_dcgm_job_get_stats_validate(handle):
    """
    Verifies that job-stat queries reject bad structure versions.
    """
    jobid = "1"
    # 0 is an invalid version; 50 is an arbitrary wrong one.
    for badVersion in (0, 50):
        with test_utils.assert_raises(dcgmExceptionClass(dcgm_structs.DCGM_ST_VER_MISMATCH)):
            vtDcgmJobGetStats(handle, jobid, badVersion)
def helper_dcgm_verify_sync_boost_single_gpu(handle, gpuIds):
    """
    Setting sync boost on a one-GPU group must fail with DCGM_ST_BADPARAM.
    """
    dcgmHandle = pydcgm.DcgmHandle(handle=handle)
    dcgmSystem = dcgmHandle.GetSystem()
    dcgmGroup = dcgmSystem.GetEmptyGroup("test1")

    # Build a group containing only the first GPU
    dcgmGroup.AddGpu(gpuIds[0])
    gpuIds = dcgmGroup.GetGpuIds()  # Only reference GPUs we are testing against

    # Request sync boost while leaving every other setting blank
    cfg = dcgm_structs.c_dcgmDeviceConfig_v1()
    cfg.mEccMode = dcgmvalue.DCGM_INT32_BLANK
    cfg.mPerfState.syncBoost = 1
    cfg.mPerfState.targetClocks.memClock = dcgmvalue.DCGM_INT32_BLANK
    cfg.mPerfState.targetClocks.smClock = dcgmvalue.DCGM_INT32_BLANK
    cfg.mComputeMode = dcgmvalue.DCGM_INT32_BLANK
    cfg.mPowerLimit.type = dcgmvalue.DCGM_INT32_BLANK
    cfg.mPowerLimit.val = dcgmvalue.DCGM_INT32_BLANK

    # Config Set must return DCGM_ST_BADPARAM since we only have a single GPU
    with test_utils.assert_raises(dcgm_structs.dcgmExceptionClass(dcgm_structs.DCGM_ST_BADPARAM)):
        dcgmGroup.config.Set(cfg)

    dcgmGroup.Delete()
def GetLatestGpuValuesAsDict(self, mapById):
    """
    Return the most recent sample of every watched field for every GPU.

    :param mapById: if truthy, key the per-GPU dict by numeric field id;
                    otherwise key it by the field's string tag
                    (looked up in self.m_fieldIdToInfo).
    :return: {gpuId: {fieldIdOrTag: value}}. Blank values are skipped.
             Returns whatever was accumulated (possibly {}) if the
             connection to nv-hostengine is lost mid-query.
    """
    systemDictionary = {}
    with self.m_lock:
        try:
            self.Reconnect()
            fvs = self.m_dcgmGroup.samples.GetLatest(self.m_fieldGroup).values
            for gpuId in fvs.keys():
                systemDictionary[gpuId] = {}  # initialize the gpu's dictionary
                gpuFv = fvs[gpuId]
                for fieldId in gpuFv.keys():
                    val = gpuFv[fieldId][-1]  # most recent sample
                    if val.isBlank:
                        continue
                    # Idiom fix: "mapById == False" -> "not mapById"
                    if not mapById:
                        fieldTag = self.m_fieldIdToInfo[fieldId].tag
                        systemDictionary[gpuId][fieldTag] = val.value
                    else:
                        systemDictionary[gpuId][fieldId] = val.value
        except dcgm_structs.dcgmExceptionClass(dcgm_structs.DCGM_ST_CONNECTION_NOT_VALID):
            # Fixed grammar in user-facing log message:
            # "Can't connection to" -> "Can't connect to"
            self.LogError("Can't connect to nv-hostengine. Please verify that it is running.")
            self.SetDisconnected()
    return systemDictionary
def LoopOneIteration(self):
    """
    One scheduler tick: launch g_processesPerSecond new apps, then reap
    stats for PIDs that should have finished by now.
    """
    for i in range(g_processesPerSecond):
        self.StartAppOnGpus()

    #How many PIDs should we buffer by? Below is 3 seconds worth
    pidBuffer = (3 * g_processesPerSecond * len(self.gpus)) + 1

    #Do we have any pids that have finished yet? Clean them up
    while len(self.addedPids) > pidBuffer:
        #Look up PID info on a random PID that should be done. Assuming 3 seconds is enough
        pidIndex = random.randint(0, len(self.addedPids) - pidBuffer)
        pidObj = self.addedPids[pidIndex]
        try:
            processStats = dcgm_agent.dcgmGetPidInfo(self.heHandle, self.groupId, pidObj.pid)
            self.Log("Found pid stats for pid %d. gpuId %d. returned pid %d" % (pidObj.pid, pidObj.gpuId, processStats.pid))
        except dcgmExceptionClass(dcgm_structs.DCGM_ST_NO_DATA):
            # No stats yet -> the process is presumably still running; back
            # off for a second and retry on the next iteration.
            self.Log("Pid %d hasn't finished yet. Sleeping to allow cuda to catch up" % pidObj.pid)
            time.sleep(1.0)
            break
        #Finalize the resources the app object watches
        pidObj.appObj.wait()
        #Delete the found pid so we don't run out of file handles
        del self.addedPids[pidIndex]
        pidObj = None
def test_dcgmi_profile(handle, gpuIds):
    """
    Test DCGMI "profile" subcommand: list/pause/resume flag handling
    for both valid and invalid argument combinations.
    """
    dcgmHandle = pydcgm.DcgmHandle(handle=handle)
    dcgmSystem = dcgmHandle.GetSystem()
    dcgmGroup = dcgmSystem.GetGroupWithGpuIds('mygroup', gpuIds)

    # Creates a comma separated list of gpus
    allGpusCsv = ",".join(map(str, gpuIds))

    #See if these GPUs even support profiling. This will bail out for non-Tesla or Pascal or older SKUs
    try:
        supportedMetrics = dcgmGroup.profiling.GetSupportedMetricGroups()
    except dcgm_structs.dcgmExceptionClass(dcgm_structs.DCGM_ST_PROFILING_NOT_SUPPORTED) as e:
        test_utils.skip_test("Profiling is not supported for gpuIds %s" % str(gpuIds))
    except dcgm_structs.dcgmExceptionClass(dcgm_structs.DCGM_ST_MODULE_NOT_LOADED) as e:
        test_utils.skip_test("The profiling module could not be loaded")
    except dcgm_structs.dcgmExceptionClass(dcgm_structs.DCGM_ST_NOT_SUPPORTED) as e:
        # Fixed typo in user-facing skip message: "profling" -> "profiling"
        test_utils.skip_test("The profiling module is not supported")

    ## keep args in this order. Changing it may break the test
    _test_valid_args([
        ["profile", "--list", "-i", allGpusCsv],
        ["profile", "--list", "-g", str(dcgmGroup.GetId().value)],
        ["profile", "--pause"],  #Pause followed by resume
        ["profile", "--resume"],
        ["profile", "--pause"],  #Double pause and double resume should be fine
        ["profile", "--pause"],
        ["profile", "--resume"],
        ["profile", "--resume"],
    ])

    ## keep args in this order. Changing it may break the test
    _test_invalid_args([
        ["profile", "--list", "--pause", "--resume"],  #mutually exclusive flags
        ["profile", "--pause", "--resume"],  #mutually exclusive flags
        ["profile", "--list", "-i", "999"],  #Invalid gpuID
        ["profile", "--list", "-i", allGpusCsv + ",taco"],  #Invalid gpu at end
        ["profile", "--list", "-g", "999"],  #Invalid group
    ])
def Init(self, libpath=None):
    """
    Initialize DCGM under the instance lock.

    :param libpath: optional path to the DCGM shared library, forwarded
                    to InitWrapped().

    If the host-engine connection is invalid, log the error and mark this
    object disconnected instead of propagating the exception.
    """
    with self.m_lock:
        try:
            self.InitWrapped(path=libpath)
        except dcgm_structs.dcgmExceptionClass(dcgm_structs.DCGM_ST_CONNECTION_NOT_VALID):
            self.LogError("Can't connect to nv-hostengine. Is it down?")
            self.SetDisconnected()
def test_dcgm_policy_negative_unregister_standalone(handle):
    """
    Verifies that Unregister rejects a bad (nonexistent) groupId value.
    """
    badGroupPolicy = pydcgm.DcgmGroupPolicy(pydcgm.DcgmHandle(handle), 9999, None)
    with test_utils.assert_raises(dcgmExceptionClass(dcgm_structs.DCGM_ST_NOT_CONFIGURED)):
        badGroupPolicy.Unregister(dcgm_structs.DCGM_POLICY_COND_DBE)
def __init__(self,
             handle=None,
             ipAddress=None,
             opMode=dcgm_structs.DCGM_OPERATION_MODE_AUTO,
             persistAfterDisconnect=False,
             unixSocketPath=None,
             timeoutMs=0):
    '''
    Constructor

    handle is an existing handle from dcgmInit(). Pass None if you want this
    object to handle DCGM initialization for you.
    ipAddress is the host to connect to. None = start embedded host engine.
    opMode is a dcgm_structs.DCGM_OPERATION_MODE_* constant for how the host
    engine should run (embedded mode only).
    persistAfterDisconnect (TCP-IP connections only) is whether the host
    engine should persist all of our watches after we disconnect.
    1=persist our watches. 0=clean up after our connection.
    unixSocketPath is a path on the local filesystem that is a unix socket
    the host engine is listening on. Mutually exclusive with ipAddress.
    timeoutMs is how long to wait for TCP/IP or Unix domain connections to
    establish in ms. 0=Default timeout (5000ms).
    '''
    self._handleCreated = False
    self._persistAfterDisconnect = persistAfterDisconnect

    # Caller already has a handle; just wrap it and stop.
    if handle is not None:
        self.handle = handle
        return

    self._ipAddress = ipAddress

    # Can't provide both unix socket and ip address
    if ipAddress is not None and unixSocketPath is not None:
        raise dcgm_structs.dcgmExceptionClass(dcgm_structs.DCGM_ST_BADPARAM)

    # Initialize the DCGM client library
    dcgm_structs._dcgmInit()
    dcgm_agent.dcgmInit()  # Not harmful to call this multiple times in a process

    # If neither ipAddress nor unixSocketPath are present, start an embedded host engine
    if ipAddress is None and unixSocketPath is None:
        self.handle = dcgm_agent.dcgmStartEmbedded(opMode)
        self.isEmbedded = True
        self._handleCreated = True
        return

    # Set up connection parameters. We're connecting to something.
    connectParams = dcgm_structs.c_dcgmConnectV2Params_v2()
    connectParams.version = dcgm_structs.c_dcgmConnectV2Params_version
    connectParams.timeoutMs = timeoutMs
    connectParams.persistAfterDisconnect = 1 if self._persistAfterDisconnect else 0

    if ipAddress is not None:
        targetAddress = ipAddress
        connectParams.addressIsUnixSocket = 0
    else:
        targetAddress = unixSocketPath
        connectParams.addressIsUnixSocket = 1

    self.handle = dcgm_agent.dcgmConnect_v2(targetAddress, connectParams)
    self.isEmbedded = False
    self._handleCreated = True
def test_dcgm_introspect_get_fields_memory_usage_validate(handle):
    """
    Validates that field memory-usage introspection rejects bad structure versions.
    """
    introspectContext = dcgm_structs.c_dcgmIntrospectContext_v1()
    waitIfNoData = True
    # 0 is an invalid version; 50 is an arbitrary wrong one.
    for badVersion in (0, 50):
        with test_utils.assert_raises(dcgmExceptionClass(dcgm_structs.DCGM_ST_VER_MISMATCH)):
            vtDcgmIntrospectGetFieldsMemoryUsage(handle, introspectContext, badVersion, waitIfNoData)
def test_dcgm_introspect_get_hostengine_cpu_utilization_validate(handle):
    """
    Validates that hostengine CPU-utilization introspection rejects bad structure versions.
    """
    waitIfNoData = True
    # 0 is an invalid version; 50 is an arbitrary wrong one.
    for badVersion in (0, 50):
        with test_utils.assert_raises(dcgmExceptionClass(dcgm_structs.DCGM_ST_VER_MISMATCH)):
            vtDcgmIntrospectGetHostengineCpuUtilization(handle, badVersion, waitIfNoData)
def test_dcgm_introspect_get_field_exec_time_validate(handle):
    """
    Validates that field execution-time introspection rejects bad structure versions.
    """
    fieldId = dcgm_fields.DCGM_FI_DEV_GPU_TEMP
    waitIfNoData = True
    # 0 is an invalid version; 50 is an arbitrary wrong one.
    for badVersion in (0, 50):
        with test_utils.assert_raises(dcgmExceptionClass(dcgm_structs.DCGM_ST_VER_MISMATCH)):
            vtDcgmIntrospectGetFieldExecTime(handle, fieldId, badVersion, waitIfNoData)
def test_dcgm_connection_error_when_no_hostengine_exists():
    """
    Connecting to an unreachable address must raise DCGM_ST_CONNECTION_NOT_VALID.
    """
    if not utils.is_bare_metal_system():
        test_utils.skip_test("Virtualization Environment not supported")

    # use a TEST-NET (rfc5737) addr instead of loopback in case a local hostengine is running
    expectedExc = dcgm_structs.dcgmExceptionClass(dcgm_structs.DCGM_ST_CONNECTION_NOT_VALID)
    with test_utils.assert_raises(expectedExc):
        pydcgm.DcgmHandle(ipAddress='192.0.2.0', timeoutMs=100)
def helper_unwatch_field_values_public(handle, gpuIds):
    """
    Verifies that dcgm can unwatch a field value.

    Snapshot watcher counts per GPU, add a watch (count must rise by one on
    each GPU), unwatch, and confirm counts return to the original snapshot.
    """
    fieldId = dcgm_fields.DCGM_FI_DEV_NAME
    fieldIds = [fieldId, ]
    handleObj = pydcgm.DcgmHandle(handle=handle)
    systemObj = handleObj.GetSystem()
    groupObj = systemObj.GetGroupWithGpuIds('mygroup', gpuIds)
    fieldGroup = pydcgm.DcgmFieldGroup(handleObj, "myfieldgroup", fieldIds)
    updateFreq = 10000000
    maxKeepAge = 86400
    maxKeepSamples = 0
    #These are all gpuId -> watcher count
    numWatchersBefore = {}
    numWatchersWithWatch = {}
    numWatchersAfter = {}
    #Get watch info before our test begins
    for gpuId in gpuIds:
        try:
            fieldInfo = dcgm_agent_internal.dcgmGetCacheManagerFieldInfo(handleObj.handle, gpuId, fieldId)
            numWatchersBefore[gpuId] = fieldInfo.numWatchers
        except dcgm_structs.dcgmExceptionClass(dcgm_structs.DCGM_ST_NOT_WATCHED) as e:
            # Never watched on this GPU -> zero watchers
            numWatchersBefore[gpuId] = 0
    #Now watch the fields
    groupObj.samples.WatchFields(fieldGroup, updateFreq, maxKeepAge, maxKeepSamples)
    #Get watcher info after our watch and check it against before our watch
    for gpuId in gpuIds:
        fieldInfo = dcgm_agent_internal.dcgmGetCacheManagerFieldInfo(handleObj.handle, gpuId, fieldId)
        numWatchersWithWatch[gpuId] = fieldInfo.numWatchers
        assert numWatchersWithWatch[gpuId] == numWatchersBefore[gpuId] + 1,\
            "Watcher mismatch at gpuId %d, numWatchersWithWatch[gpuId] %d != numWatchersBefore[gpuId] %d + 1" %\
            (gpuId, numWatchersWithWatch[gpuId], numWatchersBefore[gpuId])
    #Unwatch fields
    groupObj.samples.UnwatchFields(fieldGroup)
    #Get watcher count after our unwatch. This should match our original watch count
    for gpuId in gpuIds:
        fieldInfo = dcgm_agent_internal.dcgmGetCacheManagerFieldInfo(handleObj.handle, gpuId, fieldId)
        numWatchersAfter[gpuId] = fieldInfo.numWatchers
    assert numWatchersBefore == numWatchersAfter, "Expected numWatchersBefore (%s) to match numWatchersAfter %s" %\
        (str(numWatchersBefore), str(numWatchersAfter))
def test_dcgm_connect_validate(handle, gpuIds):
    """
    Validates that vtDcgmConnect_v2 rejects bad structure versions.

    Removed unused local "fieldGroupFieldIds" — it was never referenced.
    """
    connectParams = dcgm_structs.c_dcgmConnectV2Params_v1()
    connectParams.persistAfterDisconnect = 0

    with test_utils.assert_raises(dcgmExceptionClass(dcgm_structs.DCGM_ST_VER_MISMATCH)):
        versionTest = 0  #invalid version
        ret = vtDcgmConnect_v2('localhost', connectParams, versionTest)

    with test_utils.assert_raises(dcgmExceptionClass(dcgm_structs.DCGM_ST_VER_MISMATCH)):
        versionTest = 50  #random number version
        ret = vtDcgmConnect_v2('localhost', connectParams, versionTest)
def test_dcgm_policy_negative_register_standalone(handle):
    """
    Verifies that Register rejects a bad (nonexistent) groupId value.
    """
    badGroupPolicy = pydcgm.DcgmGroupPolicy(pydcgm.DcgmHandle(handle), 9999, None)
    # must hold ref so func is not GC'ed before c api uses it
    empty_c_callback = create_c_callback()
    with test_utils.assert_raises(dcgmExceptionClass(dcgm_structs.DCGM_ST_NOT_CONFIGURED)):
        badGroupPolicy.Register(dcgm_structs.DCGM_POLICY_COND_DBE, empty_c_callback)
def test_dcgm_health_check_validate(handle):
    """
    Validates that vtDcgmHealthCheck rejects bad structure versions.
    """
    handleObj = pydcgm.DcgmHandle(handle=handle)
    systemObj = handleObj.GetSystem()
    groupId = dcgm_agent.dcgmGroupCreate(handle, dcgm_structs.DCGM_GROUP_DEFAULT, "test1")

    # 0 is an invalid version; 50 is an arbitrary wrong one.
    for badVersion in (0, 50):
        with test_utils.assert_raises(dcgmExceptionClass(dcgm_structs.DCGM_ST_VER_MISMATCH)):
            vtDcgmHealthCheck(handle, groupId, badVersion)
def test_dcgm_field_group_get_all_validate(handle):
    """
    Validates that vtDcgmFieldGroupGetAll rejects bad structure versions.
    """
    handleObj = pydcgm.DcgmHandle(handle=handle)
    systemObj = handleObj.GetSystem()
    gpuIdList = systemObj.discovery.GetAllGpuIds()
    # The original ">= 0" comparison could never fail (len() is never
    # negative); the message shows the intent is "at least one device".
    assert len(gpuIdList) > 0, "Not able to find devices on the node for embedded case"

    with test_utils.assert_raises(dcgmExceptionClass(dcgm_structs.DCGM_ST_VER_MISMATCH)):
        versionTest = 0  #invalid version
        vtDcgmFieldGroupGetAll(handle, versionTest)

    with test_utils.assert_raises(dcgmExceptionClass(dcgm_structs.DCGM_ST_VER_MISMATCH)):
        versionTest = 50  #random number version
        vtDcgmFieldGroupGetAll(handle, versionTest)
def verify_exit_code_on_signal(signum):
    """
    Launch "dcgmi diag", deliver *signum* to it, and assert the process
    exits non-zero and the underlying nvvs process terminates.

    NOTE(review): relies on gpuId, handle and dcgmi_path from an enclosing
    scope — presumably a nested helper; confirm against the full file.
    """
    # Ensure that host engine is ready to launch a new diagnostic
    dd = DcgmDiag.DcgmDiag(gpuIds=[gpuId], testNamesStr='1')
    success = False
    start = time.time()
    while not success and (time.time() - start) <= 3:
        try:
            response = test_utils.diag_execute_wrapper(dd, handle)
            success = True
        except dcgm_structs.dcgmExceptionClass(dcgm_structs.DCGM_ST_DIAG_ALREADY_RUNNING):
            # Only acceptable error due to small race condition between the nvvs process exiting and
            # hostengine actually processing the exit. We try for a maximum of 3 seconds since this
            # should be rare and last only for a short amount of time
            time.sleep(1.5)

    diagApp = AppRunner(dcgmi_path, args=["diag", "-r", "SM Stress", "-i", "%s" % gpuId, "-d", "INFO", "--debugLogFile", "/tmp/nvvs.log"])
    # Start the diag
    diagApp.start(timeout=40)
    logger.info("Launched dcgmi process with pid: %s" % diagApp.getpid())

    # Ensure diag is running before sending interrupt signal
    running, debug_output = dcgm_internal_helpers.check_nvvs_process(want_running=True, attempts=50)
    assert running, "The nvvs process did not start within 25 seconds: %s" % (debug_output)

    # There is a small race condition here - it is possible that the hostengine sends a SIGTERM before the
    # nvvs process has setup a signal handler, and so the nvvs process does not stop when SIGTERM is sent.
    # We sleep for 1 second to reduce the possibility of this scenario
    time.sleep(1)
    diagApp.signal(signum)
    retCode = diagApp.wait()

    # Check the return code and stdout/stderr output before asserting for better debugging info
    if retCode == 0:
        logger.error("Got retcode '%s' from launched diag." % retCode)
        if diagApp.stderr_lines or diagApp.stdout_lines:
            logger.info("dcgmi output:")
            for line in diagApp.stdout_lines:
                logger.info(line)
            for line in diagApp.stderr_lines:
                logger.error(line)
    assert retCode != 0, "Expected a non-zero exit code, but got 0"

    # Since the app returns a non zero exit code, we call the validate method to prevent false
    # failures from the test framework
    diagApp.validate()

    # Give the launched nvvs process 15 seconds to terminate.
    not_running, debug_output = dcgm_internal_helpers.check_nvvs_process(want_running=False, attempts=50)
    assert not_running, "The launched nvvs process did not terminate within 25 seconds. pgrep output:\n%s" % debug_output
def dcgm_group_test_default_group(handle, gpuIds):
    """
    Test that the default group can not be deleted, or manipulated and is returning all GPUs.

    Note that we're not using groupObj for some tests because it protects
    against operations on the default group
    """
    handleObj = pydcgm.DcgmHandle(handle=handle)
    systemObj = handleObj.GetSystem()
    groupObj = systemObj.GetDefaultGroup()

    gpuIdList = gpuIds
    assert len(gpuIdList) > 0, "Failed to get devices from the node"

    # Querying a bogus group id must be rejected
    with test_utils.assert_raises(dcgmExceptionClass(dcgm_structs.DCGM_ST_NOT_CONFIGURED)):
        groupInfo = dcgm_agent.dcgmGroupGetInfo(handle, 9999)

    # The default group must contain every GPU on the node
    groupGpuIdList = groupObj.GetGpuIds()
    assert (gpuIdList == groupGpuIdList), "Expected gpuId list match %s != %s" % (str(gpuIdList), str(groupGpuIdList))

    # The entity list must mirror the GPU id list (every entry is a GPU)
    groupEntityList = groupObj.GetEntities()
    gpuIdList2 = []
    for entity in groupEntityList:
        assert entity.entityGroupId == dcgm_fields.DCGM_FE_GPU, str(entity.entityGroupId)
        gpuIdList2.append(entity.entityId)
    assert gpuIdList == gpuIdList2, "Expected gpuId list to match entity list: %s != %s" % (str(gpuIdList), str(gpuIdList2))

    # Removing a GPU from the default group must fail at both API levels
    for gpuId in gpuIdList:
        with test_utils.assert_raises(dcgmExceptionClass(dcgm_structs.DCGM_ST_NOT_CONFIGURED)):
            ret = dcgm_agent.dcgmGroupRemoveDevice(handle, dcgm_structs.DCGM_GROUP_ALL_GPUS, gpuId)
        with test_utils.assert_raises(pydcgm.DcgmException):
            groupObj.RemoveGpu(gpuId)

    # Destroying the default group must be rejected
    with test_utils.assert_raises(dcgmExceptionClass(dcgm_structs.DCGM_ST_NOT_CONFIGURED)):
        ret = dcgm_agent.dcgmGroupDestroy(handle, dcgm_structs.DCGM_GROUP_ALL_GPUS)
def test_dcgm_prof_watch_fields_multi_user(handle, gpuIds):
    """
    Profiling watches are exclusive: while one connection holds them, a
    second connection can neither watch nor unwatch, and ownership can be
    handed over only after an explicit release.
    """
    dcgmHandle = pydcgm.DcgmHandle(ipAddress="127.0.0.1")
    dcgmGroup = dcgmHandle.GetSystem().GetGroupWithGpuIds('mygroup', gpuIds)
    helper_check_profiling_environment(dcgmGroup)

    dcgmHandle2 = pydcgm.DcgmHandle(ipAddress="127.0.0.1")
    dcgmGroup2 = dcgmHandle2.GetSystem().GetGroupWithGpuIds('mygroup2', gpuIds)
    helper_check_profiling_environment(dcgmGroup)

    fieldIds = helper_get_single_pass_field_ids(dcgmGroup)
    assert fieldIds is not None

    inUseExc = dcgm_structs.dcgmExceptionClass(dcgm_structs.DCGM_ST_IN_USE)

    # Take ownership of the profiling watches on connection 1
    dcgmGroup.profiling.WatchFields(fieldIds, 1000000, 3600.0, 0)
    with test_utils.assert_raises(inUseExc):
        dcgmGroup2.profiling.WatchFields(fieldIds, 1000000, 3600.0, 0)
    with test_utils.assert_raises(inUseExc):
        dcgmGroup2.profiling.UnwatchFields()

    # Release the watches; connection 2 may now claim them
    dcgmGroup.profiling.UnwatchFields()
    dcgmGroup2.profiling.WatchFields(fieldIds, 1000000, 3600.0, 0)

    # Connection 1 should now fail to acquire the watches
    with test_utils.assert_raises(inUseExc):
        dcgmGroup.profiling.WatchFields(fieldIds, 1000000, 3600.0, 0)
    with test_utils.assert_raises(inUseExc):
        dcgmGroup.profiling.UnwatchFields()

    dcgmHandle.Shutdown()
    dcgmHandle2.Shutdown()
def _assert_metadata_not_configured_failure(handle):
    """
    Verifies that:
      1. metadata gathering is disabled by default
      2. accessing metadata APIs while it is disabled raises
         DCGM_ST_NOT_CONFIGURED
    """
    dcgmSystem = pydcgm.DcgmSystem(pydcgm.DcgmHandle(handle))
    notConfiguredExc = dcgm_structs.dcgmExceptionClass(dcgm_structs.DCGM_ST_NOT_CONFIGURED)
    with test_utils.assert_raises(notConfiguredExc):
        dcgmSystem.introspect.memory.GetForAllFields()
def test_dcgm_get_pid_info_validate(handle, gpuIds):
    """
    Validates that vtDcgmGetPidInfo rejects bad structure versions.
    """
    pidList = StartAppOnGpus(handle)
    groupId = dcgm_agent.dcgmGroupCreate(handle, dcgm_structs.DCGM_GROUP_DEFAULT, "test1")

    for pid in pidList:
        # 0 is an invalid version; 50 is an arbitrary wrong one.
        for badVersion in (0, 50):
            with test_utils.assert_raises(dcgmExceptionClass(dcgm_structs.DCGM_ST_VER_MISMATCH)):
                vtDcgmGetPidInfo(handle, groupId, pid, badVersion)
def helper_dcgm_verify_sync_boost_multi_gpu(handle, gpuIds):
    """
    Toggling sync boost on a multi-GPU group must raise DCGM_ST_NOT_SUPPORTED
    both when enabling and when disabling it.
    """
    dcgmHandle = pydcgm.DcgmHandle(handle=handle)
    dcgmSystem = dcgmHandle.GetSystem()
    dcgmGroup = dcgmSystem.GetEmptyGroup("test1")

    if len(gpuIds) < 2:
        test_utils.skip_test("This test only works with 2 or more identical GPUs")

    # Group together every identical GPU we were given
    for gpuId in gpuIds:
        dcgmGroup.AddGpu(gpuId)
    gpuIds = dcgmGroup.GetGpuIds()  # Only reference GPUs we are testing against

    # Blank out every setting except sync boost
    cfg = dcgm_structs.c_dcgmDeviceConfig_v1()
    cfg.mEccMode = dcgmvalue.DCGM_INT32_BLANK
    cfg.mPerfState.syncBoost = 1
    cfg.mPerfState.targetClocks.memClock = dcgmvalue.DCGM_INT32_BLANK
    cfg.mPerfState.targetClocks.smClock = dcgmvalue.DCGM_INT32_BLANK
    cfg.mComputeMode = dcgmvalue.DCGM_INT32_BLANK
    cfg.mPowerLimit.type = dcgmvalue.DCGM_INT32_BLANK
    cfg.mPowerLimit.val = dcgmvalue.DCGM_INT32_BLANK

    notSupportedExc = dcgm_structs.dcgmExceptionClass(dcgm_structs.DCGM_ST_NOT_SUPPORTED)

    # Enable sync boost - Will throw an exception on error
    with test_utils.assert_raises(notSupportedExc):
        dcgmGroup.config.Set(cfg)

    # Disable sync boost - Will throw an exception on error
    cfg.mPerfState.syncBoost = 0
    with test_utils.assert_raises(notSupportedExc):
        dcgmGroup.config.Set(cfg)

    dcgmGroup.Delete()
def test_dcgm_connection_client_cleanup(handle, gpuIds):
    '''
    Make sure that resources that were allocated by a client are cleaned up
    '''
    fieldGroupFieldIds = [dcgm_fields.DCGM_FI_DEV_GPU_TEMP, ]

    # Open a second, throwaway connection whose cleanup we will verify.
    # Raw APIs are used so we can disconnect explicitly.
    connectParams = dcgm_structs.c_dcgmConnectV2Params_v1()
    connectParams.version = dcgm_structs.c_dcgmConnectV2Params_version
    connectParams.persistAfterDisconnect = 0
    cleanupHandle = dcgm_agent.dcgmConnect_v2('localhost', connectParams)

    groupName = 'clientcleanupgroup'
    groupId = dcgm_agent.dcgmGroupCreate(cleanupHandle, dcgm_structs.DCGM_GROUP_EMPTY, groupName)

    fieldGroupName = 'clientcleanupfieldgroup'
    fieldGroupId = dcgm_agent.dcgmFieldGroupCreate(cleanupHandle, fieldGroupFieldIds, fieldGroupName)

    # Disconnecting should trigger server-side cleanup of both groups
    dcgm_agent.dcgmDisconnect(cleanupHandle)
    time.sleep(1.0)  # Allow connection cleanup to occur since it's asynchronous

    # The field group must now be gone
    with test_utils.assert_raises(dcgm_structs.dcgmExceptionClass(dcgm_structs.DCGM_ST_NO_DATA)):
        dcgm_agent.dcgmFieldGroupGetInfo(handle, fieldGroupId)

    # The GPU group must be gone as well
    with test_utils.assert_raises(dcgm_structs.dcgmExceptionClass(dcgm_structs.DCGM_ST_NOT_CONFIGURED)):
        dcgm_agent.dcgmGroupGetInfo(handle, groupId)
def test_dcgm_vgpu_config_set_validate(handle):
    """
    Validates that vtDcgmVgpuConfigSet rejects bad structure versions.
    """
    groupId = dcgm_agent.dcgmGroupCreate(handle, dcgm_structs.DCGM_GROUP_DEFAULT, "test1")
    status_handle = dcgm_agent.dcgmStatusCreate()
    config_values = dcgm_structs.c_dcgmDeviceConfig_v1()

    # 0 is an invalid version; 50 is an arbitrary wrong one.
    for badVersion in (0, 50):
        with test_utils.assert_raises(dcgmExceptionClass(dcgm_structs.DCGM_ST_VER_MISMATCH)):
            vtDcgmVgpuConfigSet(handle, groupId, config_values, status_handle, badVersion)