def test_dcgm_vgpu_config_get_validate(handle):
    """Validate that dcgmVgpuConfigGet rejects bad structure versions.

    Both an invalid version (0) and a random non-matching version (50)
    must make the API raise DCGM_ST_VER_MISMATCH.
    """
    handleObj = pydcgm.DcgmHandle(handle=handle)
    systemObj = handleObj.GetSystem()
    gpuIdList = systemObj.discovery.GetAllGpuIds()
    # Bug fix: the original comparison was ">= 0", which is always true and
    # made this assertion vacuous; the failure message shows "> 0" was meant.
    assert len(gpuIdList) > 0, "Not able to find devices on the node for embedded case"

    groupId = dcgm_agent.dcgmGroupCreate(handle, dcgm_structs.DCGM_GROUP_DEFAULT,
                                         "test1")
    groupInfo = dcgm_agent.dcgmGroupGetInfo(handle, groupId)
    status_handle = dcgm_agent.dcgmStatusCreate()

    # 0 is an invalid version; 50 is a random non-matching version.
    for versionTest in (0, 50):
        with test_utils.assert_raises(
                dcgmExceptionClass(dcgm_structs.DCGM_ST_VER_MISMATCH)):
            vtDcgmVgpuConfigGet(handle, groupId,
                                dcgm_structs.DCGM_CONFIG_CURRENT_STATE,
                                groupInfo.count, status_handle, versionTest)
def test_dcgm_policy_get_validate(handle):
    """Validate that dcgmPolicyGet rejects bad structure versions.

    Both an invalid version (0) and a random non-matching version (50)
    must make the API raise DCGM_ST_VER_MISMATCH.
    """
    handleObj = pydcgm.DcgmHandle(handle=handle)
    systemObj = handleObj.GetSystem()
    gpuIdList = systemObj.discovery.GetAllGpuIds()
    # Bug fix: the original comparison was ">= 0", which is always true and
    # made this assertion vacuous; the failure message shows "> 0" was meant.
    assert len(gpuIdList) > 0, "Not able to find devices on the node for embedded case"

    groupId = dcgm_agent.dcgmGroupCreate(handle, dcgm_structs.DCGM_GROUP_DEFAULT,
                                         "test1")
    status_handle = dcgm_agent.dcgmStatusCreate()
    count = 1
    # (Removed unused local "diagLevel"; it was never passed to any call.)

    # 0 is an invalid version; 50 is a random non-matching version.
    for versionTest in (0, 50):
        with test_utils.assert_raises(
                dcgmExceptionClass(dcgm_structs.DCGM_ST_VER_MISMATCH)):
            vtDcgmPolicyGet(handle, groupId, count, status_handle, versionTest)
def test_dcgm_get_vgpu_instance_attributes_validate(handle, gpuIds):
    """Verify that vGPU instance attribute queries reject bad struct versions."""
    # 0 is an invalid version; 50 is a random non-matching version.
    for badVersion in (0, 50):
        with test_utils.assert_raises(
                dcgmExceptionClass(dcgm_structs.DCGM_ST_VER_MISMATCH)):
            vtDcgmGetVgpuInstanceAttributes(handle, gpuIds[0], badVersion)
def test_dcgm_job_get_stats_validate(handle):
    """Verify that dcgmJobGetStats rejects bad structure versions."""
    jobid = "1"
    # 0 is an invalid version; 50 is a random non-matching version.
    for badVersion in (0, 50):
        with test_utils.assert_raises(
                dcgmExceptionClass(dcgm_structs.DCGM_ST_VER_MISMATCH)):
            vtDcgmJobGetStats(handle, jobid, badVersion)
def helper_dcgm_verify_sync_boost_single_gpu(handle, gpuIds):
    """Setting syncBoost on a single-GPU group must fail with DCGM_ST_BADPARAM."""
    dcgmHandle = pydcgm.DcgmHandle(handle=handle)
    dcgmSystem = dcgmHandle.GetSystem()
    dcgmGroup = dcgmSystem.GetEmptyGroup("test1")

    # Put only the first GPU into the group, then use the group's own
    # view of its members so we only reference GPUs we test against.
    dcgmGroup.AddGpu(gpuIds[0])
    gpuIds = dcgmGroup.GetGpuIds()

    # Leave every field blank except syncBoost, which we try to enable.
    cfg = dcgm_structs.c_dcgmDeviceConfig_v1()
    cfg.mEccMode = dcgmvalue.DCGM_INT32_BLANK
    cfg.mPerfState.syncBoost = 1
    cfg.mPerfState.targetClocks.memClock = dcgmvalue.DCGM_INT32_BLANK
    cfg.mPerfState.targetClocks.smClock = dcgmvalue.DCGM_INT32_BLANK
    cfg.mComputeMode = dcgmvalue.DCGM_INT32_BLANK
    cfg.mPowerLimit.type = dcgmvalue.DCGM_INT32_BLANK
    cfg.mPowerLimit.val = dcgmvalue.DCGM_INT32_BLANK

    # Config Set must return DCGM_ST_BADPARAM since we only have a single GPU.
    with test_utils.assert_raises(
            dcgm_structs.dcgmExceptionClass(dcgm_structs.DCGM_ST_BADPARAM)):
        dcgmGroup.config.Set(cfg)

    dcgmGroup.Delete()
def test_dcgm_policy_negative_unregister_standalone(handle):
    """Unregister with a bogus groupId must raise DCGM_ST_NOT_CONFIGURED."""
    badGroupPolicy = pydcgm.DcgmGroupPolicy(pydcgm.DcgmHandle(handle), 9999,
                                            None)
    with test_utils.assert_raises(
            dcgmExceptionClass(dcgm_structs.DCGM_ST_NOT_CONFIGURED)):
        badGroupPolicy.Unregister(dcgm_structs.DCGM_POLICY_COND_DBE)
def test_dcgm_connection_error_when_no_hostengine_exists():
    """Connecting where no hostengine listens must raise CONNECTION_NOT_VALID."""
    if not utils.is_bare_metal_system():
        test_utils.skip_test("Virtualization Environment not supported")

    # Use a TEST-NET (rfc5737) addr instead of loopback in case a local
    # hostengine is running.
    with test_utils.assert_raises(
            dcgm_structs.dcgmExceptionClass(
                dcgm_structs.DCGM_ST_CONNECTION_NOT_VALID)):
        pydcgm.DcgmHandle(ipAddress='192.0.2.0', timeoutMs=100)
def test_dcgm_introspect_get_fields_memory_usage_validate(handle):
    """Verify dcgmIntrospectGetFieldsMemoryUsage rejects bad struct versions."""
    introspectContext = dcgm_structs.c_dcgmIntrospectContext_v1()
    waitIfNoData = True
    # 0 is an invalid version; 50 is a random non-matching version.
    for badVersion in (0, 50):
        with test_utils.assert_raises(
                dcgmExceptionClass(dcgm_structs.DCGM_ST_VER_MISMATCH)):
            vtDcgmIntrospectGetFieldsMemoryUsage(handle, introspectContext,
                                                 badVersion, waitIfNoData)
def test_dcgm_introspect_get_hostengine_cpu_utilization_validate(handle):
    """Verify hostengine CPU utilization query rejects bad struct versions."""
    waitIfNoData = True
    # 0 is an invalid version; 50 is a random non-matching version.
    for badVersion in (0, 50):
        with test_utils.assert_raises(
                dcgmExceptionClass(dcgm_structs.DCGM_ST_VER_MISMATCH)):
            vtDcgmIntrospectGetHostengineCpuUtilization(handle, badVersion,
                                                        waitIfNoData)
def test_dcgm_introspect_get_field_exec_time_validate(handle):
    """Verify dcgmIntrospectGetFieldExecTime rejects bad struct versions."""
    fieldId = dcgm_fields.DCGM_FI_DEV_GPU_TEMP
    waitIfNoData = True
    # 0 is an invalid version; 50 is a random non-matching version.
    for badVersion in (0, 50):
        with test_utils.assert_raises(
                dcgmExceptionClass(dcgm_structs.DCGM_ST_VER_MISMATCH)):
            vtDcgmIntrospectGetFieldExecTime(handle, fieldId, badVersion,
                                             waitIfNoData)
def test_dcgm_policy_get_with_no_gpus_standalone(handle):
    '''Getting the policies of a group with no GPUs must raise an exception.'''
    emptyGroup = pydcgm.DcgmGroup(pydcgm.DcgmHandle(handle),
                                  groupType=dcgm_structs.DCGM_GROUP_EMPTY,
                                  groupName="test")
    with test_utils.assert_raises(pydcgm.DcgmException):
        emptyGroup.policy.Get()
def test_dcgm_policy_negative_register_standalone(handle):
    """Register with a bogus groupId must raise DCGM_ST_NOT_CONFIGURED."""
    badGroupPolicy = pydcgm.DcgmGroupPolicy(pydcgm.DcgmHandle(handle), 9999,
                                            None)
    # Must hold a reference so the callback is not GC'ed before the C API
    # uses it.
    empty_c_callback = create_c_callback()
    with test_utils.assert_raises(
            dcgmExceptionClass(dcgm_structs.DCGM_ST_NOT_CONFIGURED)):
        badGroupPolicy.Register(dcgm_structs.DCGM_POLICY_COND_DBE,
                                empty_c_callback)
def test_dcgm_health_check_validate(handle):
    """Verify that dcgmHealthCheck rejects bad structure versions."""
    handleObj = pydcgm.DcgmHandle(handle=handle)
    systemObj = handleObj.GetSystem()
    groupId = dcgm_agent.dcgmGroupCreate(handle, dcgm_structs.DCGM_GROUP_DEFAULT,
                                         "test1")
    # 0 is an invalid version; 50 is a random non-matching version.
    for badVersion in (0, 50):
        with test_utils.assert_raises(
                dcgmExceptionClass(dcgm_structs.DCGM_ST_VER_MISMATCH)):
            vtDcgmHealthCheck(handle, groupId, badVersion)
def test_dcgm_field_group_get_all_validate(handle):
    """Validate that dcgmFieldGroupGetAll rejects bad structure versions.

    Both an invalid version (0) and a random non-matching version (50)
    must make the API raise DCGM_ST_VER_MISMATCH.
    """
    handleObj = pydcgm.DcgmHandle(handle=handle)
    systemObj = handleObj.GetSystem()
    gpuIdList = systemObj.discovery.GetAllGpuIds()
    # Bug fix: the original comparison was ">= 0", which is always true and
    # made this assertion vacuous; the failure message shows "> 0" was meant.
    assert len(gpuIdList) > 0, "Not able to find devices on the node for embedded case"

    # 0 is an invalid version; 50 is a random non-matching version.
    for versionTest in (0, 50):
        with test_utils.assert_raises(
                dcgmExceptionClass(dcgm_structs.DCGM_ST_VER_MISMATCH)):
            vtDcgmFieldGroupGetAll(handle, versionTest)
def test_dcgm_connect_validate(handle, gpuIds):
    """Verify that dcgmConnect_v2 rejects bad structure versions."""
    fieldGroupFieldIds = [
        dcgm_fields.DCGM_FI_DEV_GPU_TEMP,
    ]
    connectParams = dcgm_structs.c_dcgmConnectV2Params_v1()
    connectParams.persistAfterDisconnect = 0
    # 0 is an invalid version; 50 is a random non-matching version.
    for badVersion in (0, 50):
        with test_utils.assert_raises(
                dcgmExceptionClass(dcgm_structs.DCGM_ST_VER_MISMATCH)):
            vtDcgmConnect_v2('localhost', connectParams, badVersion)
def dcgm_group_test_default_group(handle, gpuIds):
    """
    Test that the default group cannot be deleted or manipulated and that it
    returns all GPUs.

    Some checks deliberately use the raw dcgm_agent API rather than the group
    object, because the object protects against operations on the default
    group.
    """
    handleObj = pydcgm.DcgmHandle(handle=handle)
    systemObj = handleObj.GetSystem()
    defaultGroup = systemObj.GetDefaultGroup()

    expectedGpuIds = gpuIds
    assert len(expectedGpuIds) > 0, "Failed to get devices from the node"

    # A bogus group id must be rejected by the raw API.
    with test_utils.assert_raises(
            dcgmExceptionClass(dcgm_structs.DCGM_ST_NOT_CONFIGURED)):
        dcgm_agent.dcgmGroupGetInfo(handle, 9999)

    # The default group must contain exactly the node's GPUs.
    actualGpuIds = defaultGroup.GetGpuIds()
    assert (expectedGpuIds == actualGpuIds
            ), "Expected gpuId list match %s != %s" % (str(expectedGpuIds),
                                                       str(actualGpuIds))

    # The entity view of the group must agree: all entities are GPUs and
    # their ids match the GPU id list.
    entityGpuIds = []
    for entity in defaultGroup.GetEntities():
        assert entity.entityGroupId == dcgm_fields.DCGM_FE_GPU, str(
            entity.entityGroupId)
        entityGpuIds.append(entity.entityId)
    assert expectedGpuIds == entityGpuIds, "Expected gpuId list to match entity list: %s != %s" % (
        str(expectedGpuIds), str(entityGpuIds))

    for gpuId in expectedGpuIds:
        # Removing a GPU from the default group is rejected by the raw API...
        with test_utils.assert_raises(
                dcgmExceptionClass(dcgm_structs.DCGM_ST_NOT_CONFIGURED)):
            dcgm_agent.dcgmGroupRemoveDevice(
                handle, dcgm_structs.DCGM_GROUP_ALL_GPUS, gpuId)
        # ...and by the group object wrapper.
        with test_utils.assert_raises(pydcgm.DcgmException):
            defaultGroup.RemoveGpu(gpuId)

    # Destroying the default group must also be rejected.
    with test_utils.assert_raises(
            dcgmExceptionClass(dcgm_structs.DCGM_ST_NOT_CONFIGURED)):
        dcgm_agent.dcgmGroupDestroy(handle, dcgm_structs.DCGM_GROUP_ALL_GPUS)
def test_dcgm_prof_watch_fields_multi_user(handle, gpuIds):
    """Profiling watches are exclusive: only one connection may own them."""
    dcgmHandle = pydcgm.DcgmHandle(ipAddress="127.0.0.1")
    dcgmGroup = dcgmHandle.GetSystem().GetGroupWithGpuIds('mygroup', gpuIds)
    helper_check_profiling_environment(dcgmGroup)

    dcgmHandle2 = pydcgm.DcgmHandle(ipAddress="127.0.0.1")
    dcgmGroup2 = dcgmHandle2.GetSystem().GetGroupWithGpuIds('mygroup2', gpuIds)
    # NOTE(review): this second check also passes dcgmGroup (not dcgmGroup2),
    # matching the original code — confirm whether dcgmGroup2 was intended.
    helper_check_profiling_environment(dcgmGroup)

    fieldIds = helper_get_single_pass_field_ids(dcgmGroup)
    assert fieldIds is not None

    # Connection 1 takes ownership of the profiling watches.
    dcgmGroup.profiling.WatchFields(fieldIds, 1000000, 3600.0, 0)

    # While connection 1 owns them, connection 2 can neither watch nor unwatch.
    with test_utils.assert_raises(
            dcgm_structs.dcgmExceptionClass(dcgm_structs.DCGM_ST_IN_USE)):
        dcgmGroup2.profiling.WatchFields(fieldIds, 1000000, 3600.0, 0)
    with test_utils.assert_raises(
            dcgm_structs.dcgmExceptionClass(dcgm_structs.DCGM_ST_IN_USE)):
        dcgmGroup2.profiling.UnwatchFields()

    # Release the watches, then let connection 2 take ownership.
    dcgmGroup.profiling.UnwatchFields()
    dcgmGroup2.profiling.WatchFields(fieldIds, 1000000, 3600.0, 0)

    # Now connection 1 is the one locked out.
    with test_utils.assert_raises(
            dcgm_structs.dcgmExceptionClass(dcgm_structs.DCGM_ST_IN_USE)):
        dcgmGroup.profiling.WatchFields(fieldIds, 1000000, 3600.0, 0)
    with test_utils.assert_raises(
            dcgm_structs.dcgmExceptionClass(dcgm_structs.DCGM_ST_IN_USE)):
        dcgmGroup.profiling.UnwatchFields()

    dcgmHandle.Shutdown()
    dcgmHandle2.Shutdown()
def test_dcgm_get_pid_info_validate(handle, gpuIds):
    """Verify that dcgmGetPidInfo rejects bad structure versions."""
    pidList = StartAppOnGpus(handle)
    groupId = dcgm_agent.dcgmGroupCreate(handle, dcgm_structs.DCGM_GROUP_DEFAULT,
                                         "test1")
    for pid in pidList:
        # 0 is an invalid version; 50 is a random non-matching version.
        for badVersion in (0, 50):
            with test_utils.assert_raises(
                    dcgmExceptionClass(dcgm_structs.DCGM_ST_VER_MISMATCH)):
                vtDcgmGetPidInfo(handle, groupId, pid, badVersion)
def start(self, timeout=app_runner.default_timeout):
    """Start the app and block until the CUDA context really exists.

    Raises if the expected assertion line never appears on stdout.
    """
    super(RunCudaAssert, self).start(timeout)

    with test_utils.assert_raises(EOFError):
        # stdout_readtillmatch raises EOFError when no line matches,
        # which is exactly what we expect here.
        self.stdout_readtillmatch(
            lambda line: line == "Assertion `false` failed")
def _assert_metadata_not_configured_failure(handle):
    """
    Verifies that:
      1. metadata gathering is disabled by default
      2. an appropriate error is raised when metadata APIs are accessed but
         metadata gathering is disabled.
    """
    dcgmSystem = pydcgm.DcgmSystem(pydcgm.DcgmHandle(handle))
    with test_utils.assert_raises(
            dcgm_structs.dcgmExceptionClass(
                dcgm_structs.DCGM_ST_NOT_CONFIGURED)):
        dcgmSystem.introspect.memory.GetForAllFields()
def helper_dcgm_verify_sync_boost_multi_gpu(handle, gpuIds):
    """Enabling then disabling syncBoost on a multi-GPU group raises
    DCGM_ST_NOT_SUPPORTED for both operations."""
    dcgmHandle = pydcgm.DcgmHandle(handle=handle)
    dcgmSystem = dcgmHandle.GetSystem()
    dcgmGroup = dcgmSystem.GetEmptyGroup("test1")

    if len(gpuIds) < 2:
        test_utils.skip_test(
            "This test only works with 2 or more identical GPUs")

    # Group all identical GPUs together and use the group's own member view.
    for gpuId in gpuIds:
        dcgmGroup.AddGpu(gpuId)
    gpuIds = dcgmGroup.GetGpuIds()

    # Leave every field blank; only syncBoost is varied below.
    cfg = dcgm_structs.c_dcgmDeviceConfig_v1()
    cfg.mEccMode = dcgmvalue.DCGM_INT32_BLANK
    cfg.mPerfState.targetClocks.memClock = dcgmvalue.DCGM_INT32_BLANK
    cfg.mPerfState.targetClocks.smClock = dcgmvalue.DCGM_INT32_BLANK
    cfg.mComputeMode = dcgmvalue.DCGM_INT32_BLANK
    cfg.mPowerLimit.type = dcgmvalue.DCGM_INT32_BLANK
    cfg.mPowerLimit.val = dcgmvalue.DCGM_INT32_BLANK

    # Enable (1) then disable (0) sync boost; each Set is expected to raise
    # NOT_SUPPORTED.
    for syncBoost in (1, 0):
        cfg.mPerfState.syncBoost = syncBoost
        with test_utils.assert_raises(
                dcgm_structs.dcgmExceptionClass(
                    dcgm_structs.DCGM_ST_NOT_SUPPORTED)):
            dcgmGroup.config.Set(cfg)

    dcgmGroup.Delete()
def test_dcgm_connection_client_cleanup(handle, gpuIds):
    '''
    Make sure that resources that were allocated by a client are cleaned up
    '''
    fieldGroupFieldIds = [
        dcgm_fields.DCGM_FI_DEV_GPU_TEMP,
    ]

    # Open a second, raw-API connection whose resources we will verify get
    # cleaned up after disconnect.
    connectParams = dcgm_structs.c_dcgmConnectV2Params_v1()
    connectParams.version = dcgm_structs.c_dcgmConnectV2Params_version
    connectParams.persistAfterDisconnect = 0
    cleanupHandle = dcgm_agent.dcgmConnect_v2('localhost', connectParams)

    groupId = dcgm_agent.dcgmGroupCreate(cleanupHandle,
                                         dcgm_structs.DCGM_GROUP_EMPTY,
                                         'clientcleanupgroup')
    fieldGroupId = dcgm_agent.dcgmFieldGroupCreate(cleanupHandle,
                                                   fieldGroupFieldIds,
                                                   'clientcleanupfieldgroup')

    # Disconnecting the second handle should trigger the cleanup; it is
    # asynchronous, so give it a moment.
    dcgm_agent.dcgmDisconnect(cleanupHandle)
    time.sleep(1.0)

    # The field group must be gone...
    with test_utils.assert_raises(
            dcgm_structs.dcgmExceptionClass(dcgm_structs.DCGM_ST_NO_DATA)):
        dcgm_agent.dcgmFieldGroupGetInfo(handle, fieldGroupId)

    # ...and so must the group.
    with test_utils.assert_raises(
            dcgm_structs.dcgmExceptionClass(
                dcgm_structs.DCGM_ST_NOT_CONFIGURED)):
        dcgm_agent.dcgmGroupGetInfo(handle, groupId)
def test_dcgm_vgpu_config_set_validate(handle):
    """Verify that dcgmVgpuConfigSet rejects bad structure versions."""
    groupId = dcgm_agent.dcgmGroupCreate(handle, dcgm_structs.DCGM_GROUP_DEFAULT,
                                         "test1")
    status_handle = dcgm_agent.dcgmStatusCreate()
    config_values = dcgm_structs.c_dcgmDeviceConfig_v1()
    # 0 is an invalid version; 50 is a random non-matching version.
    for badVersion in (0, 50):
        with test_utils.assert_raises(
                dcgmExceptionClass(dcgm_structs.DCGM_ST_VER_MISMATCH)):
            vtDcgmVgpuConfigSet(handle, groupId, config_values, status_handle,
                                badVersion)
def test_dcgm_field_group_get_info_validate(handle):
    """Verify that dcgmFieldGroupGetInfo rejects bad structure versions."""
    fieldIds = [
        dcgm_fields.DCGM_FI_DRIVER_VERSION, dcgm_fields.DCGM_FI_DEV_NAME,
        dcgm_fields.DCGM_FI_DEV_BRAND
    ]
    # Use a distinct name instead of shadowing the "handle" parameter.
    handleObj = pydcgm.DcgmHandle(handle)
    fieldGroup = pydcgm.DcgmFieldGroup(handleObj, "mygroup", fieldIds)
    # 0 is an invalid version; 50 is a random non-matching version.
    for badVersion in (0, 50):
        with test_utils.assert_raises(
                dcgmExceptionClass(dcgm_structs.DCGM_ST_VER_MISMATCH)):
            vtDcgmFieldGroupGetInfo(handleObj.handle, fieldGroup.fieldGroupId,
                                    badVersion)
def test_dcgm_get_device_attributes_validate(handle, gpuIds):
    """Verify that dcgmGetDeviceAttributes rejects bad structure versions."""
    handleObj = pydcgm.DcgmHandle(handle=handle)
    systemObj = handleObj.GetSystem()
    groupObj = systemObj.GetEmptyGroup("test1")

    # Work with a single-GPU group; only reference GPUs we test against.
    groupObj.AddGpu(gpuIds[0])
    gpuIds = groupObj.GetGpuIds()

    # Make sure the device attributes and config fields have updated.
    systemObj.UpdateAllFields(1)

    # 0 is an invalid version; 50 is a random non-matching version.
    for badVersion in (0, 50):
        with test_utils.assert_raises(
                dcgmExceptionClass(dcgm_structs.DCGM_ST_VER_MISMATCH)):
            vtDcgmGetDeviceAttributes(handle, gpuIds[0], badVersion)
def helper_test_dcgm_injection_summaries(handle, gpuIds):
    """Inject SBE aggregate totals and verify dcgmGetFieldSummary results."""
    gpuId = gpuIds[0]

    # Watch the field we're inserting into.
    dcgm_agent_internal.dcgmWatchFieldValue(
        handle, gpuId, dcgm_fields.DCGM_FI_DEV_ECC_SBE_AGG_TOTAL, 1, 3600.0,
        10000)

    handleObj = pydcgm.DcgmHandle(handle=handle)
    systemObj = handleObj.GetSystem()

    # Build a base injected value that is good for starters.
    injected = dcgm_structs_internal.c_dcgmInjectFieldValue_v1()
    injected.version = dcgm_structs_internal.dcgmInjectFieldValue_version1
    injected.fieldId = dcgm_fields.DCGM_FI_DEV_ECC_SBE_AGG_TOTAL
    injected.status = 0
    injected.fieldType = ord(dcgm_fields.DCGM_FT_INT64)

    # Inject values 0..9 with consecutive timestamps starting at baseTime.
    baseTime = get_usec_since_1970()
    for i in range(0, 10):
        injected.ts = baseTime + i
        injected.value.i64 = i
        ret = dcgm_agent_internal.dcgmInjectFieldValue(handle, gpuId, injected)
        assert (ret == dcgm_structs.DCGM_ST_OK)

    time.sleep(1)
    systemObj.UpdateAllFields(1)

    summaryMask = (dcgm_structs.DCGM_SUMMARY_MIN
                   | dcgm_structs.DCGM_SUMMARY_MAX
                   | dcgm_structs.DCGM_SUMMARY_AVG
                   | dcgm_structs.DCGM_SUMMARY_DIFF)

    # A time window entirely before the injected samples must yield no data.
    with test_utils.assert_raises(
            dcgm_structs.dcgmExceptionClass(dcgm_structs.DCGM_ST_NO_DATA)):
        dcgm_agent.dcgmGetFieldSummary(
            handle, dcgm_fields.DCGM_FI_DEV_ECC_SBE_AGG_TOTAL,
            dcgm_fields.DCGM_FE_GPU, gpuId, summaryMask, baseTime - 60,
            baseTime - 30)

    # An unbounded window (0, 0) covers every injected sample.
    request = dcgm_agent.dcgmGetFieldSummary(
        handle, dcgm_fields.DCGM_FI_DEV_ECC_SBE_AGG_TOTAL,
        dcgm_fields.DCGM_FE_GPU, gpuId, summaryMask, 0, 0)

    # Expected over 0..9: min=0, max=9, avg=4 (integer), diff=9.
    assert (request.response.values[0].i64 == 0)
    assert (request.response.values[1].i64 == 9)
    assert (request.response.values[2].i64 == 4)
    assert (request.response.values[3].i64 == 9)
def test_connection_disconnect_error_after_shutdown():
    '''
    Test that DCGM_ST_BADPARAM is returned when the dcgm API is used after a
    call to dcgmShutdown has been made.
    '''
    dcgmHandle = pydcgm.DcgmHandle()
    dcgmGroup = pydcgm.DcgmGroup(dcgmHandle, groupName='test-connection')

    # Sanity call while the connection is still alive.
    dcgmGroup.GetGpuIds()

    dcgmHandle.Shutdown()

    # Any use of the group after shutdown must raise BADPARAM.
    with test_utils.assert_raises(
            dcgm_structs.dcgmExceptionClass(dcgm_structs.DCGM_ST_BADPARAM)):
        dcgmGroup.GetGpuIds()
def test_dcgm_action_run_diag_bad_validation(handle, gpuIds):
    """An out-of-range validate value must make dcgmActionValidate_v2 fail."""
    # Comma-separated GPU id list, e.g. "0,1,2".
    gpuIdStr = ",".join(str(gpuId) for gpuId in gpuIds)

    drd = dcgm_structs.c_dcgmRunDiag_t()
    drd.version = dcgm_structs.dcgmRunDiag_version
    # One past the largest valid value, so the request must be rejected.
    drd.validate = dcgm_structs.DCGM_POLICY_VALID_SV_LONG + 1
    drd.groupId = 0  # initialize explicitly in case the constructor doesn't
    drd.gpuList = gpuIdStr

    with test_utils.assert_raises(
            dcgm_structs.dcgmExceptionClass(dcgm_structs.DCGM_ST_BADPARAM)):
        dcgm_agent.dcgmActionValidate_v2(handle, drd,
                                         dcgm_structs.dcgmRunDiag_version)
def test_dcgm_modules_blacklist_introspection(handle):
    '''
    Make sure that the introspection module can be blacklisted
    '''
    dcgmHandle = pydcgm.DcgmHandle(handle=handle)
    dcgmSystem = dcgmHandle.GetSystem()
    dcgmSystem.modules.Blacklist(dcgm_structs.DcgmModuleIdIntrospect)

    # Lazy-loading the blacklisted introspection module must now fail.
    with test_utils.assert_raises(
            dcgm_structs.dcgmExceptionClass(
                dcgm_structs.DCGM_ST_MODULE_NOT_LOADED)):
        dcgmSystem.introspect.state.toggle(
            dcgm_structs.DCGM_INTROSPECT_STATE.ENABLED)
def test_dcgm_modules_blacklist_health(handle):
    '''
    Make sure that the health module can be blacklisted
    '''
    dcgmHandle = pydcgm.DcgmHandle(handle=handle)
    dcgmSystem = dcgmHandle.GetSystem()
    dcgmGroup = dcgmSystem.GetDefaultGroup()
    dcgmSystem.modules.Blacklist(dcgm_structs.DcgmModuleIdHealth)

    # Lazy-loading the blacklisted health module must now fail.
    with test_utils.assert_raises(
            dcgm_structs.dcgmExceptionClass(
                dcgm_structs.DCGM_ST_MODULE_NOT_LOADED)):
        dcgmGroup.health.Set(dcgm_structs.DCGM_HEALTH_WATCH_ALL)