def test_dcgm_policy_get_validate(handle):
    """
    Validates structure version

    Calls vtDcgmPolicyGet with unsupported structure versions and expects the
    exception class for DCGM_ST_VER_MISMATCH to be raised each time.
    """
    dcgmHandle = pydcgm.DcgmHandle(handle=handle)
    dcgmSystem = dcgmHandle.GetSystem()
    gpuIdList = dcgmSystem.discovery.GetAllGpuIds()
    # NOTE(review): a length is never negative, so this check cannot fail;
    # the message suggests "> 0" may have been intended - confirm.
    assert len(gpuIdList) >= 0, "Not able to find devices on the node for embedded case"

    groupId = dcgm_agent.dcgmGroupCreate(handle, dcgm_structs.DCGM_GROUP_DEFAULT, "test1")
    status_handle = dcgm_agent.dcgmStatusCreate()
    count = 1
    diagLevel = dcgm_structs.DCGM_DIAG_LVL_SHORT  # looked up but not used below

    # 0 is the invalid version, 50 is an arbitrary unsupported version
    for versionTest in (0, 50):
        with test_utils.assert_raises(dcgmExceptionClass(dcgm_structs.DCGM_ST_VER_MISMATCH)):
            vtDcgmPolicyGet(handle, groupId, count, status_handle, versionTest)
def test_dcgm_vgpu_config_get_validate(handle):
    """
    Validates structure version

    Calls vtDcgmVgpuConfigGet with unsupported structure versions and expects
    the exception class for DCGM_ST_VER_MISMATCH to be raised each time.
    """
    dcgmHandle = pydcgm.DcgmHandle(handle=handle)
    dcgmSystem = dcgmHandle.GetSystem()
    gpuIdList = dcgmSystem.discovery.GetAllGpuIds()
    # NOTE(review): a length is never negative, so this check cannot fail;
    # the message suggests "> 0" may have been intended - confirm.
    assert len(gpuIdList) >= 0, "Not able to find devices on the node for embedded case"

    groupId = dcgm_agent.dcgmGroupCreate(handle, dcgm_structs.DCGM_GROUP_DEFAULT, "test1")
    groupInfo = dcgm_agent.dcgmGroupGetInfo(handle, groupId)
    status_handle = dcgm_agent.dcgmStatusCreate()

    # 0 is the invalid version, 50 is an arbitrary unsupported version
    for versionTest in (0, 50):
        with test_utils.assert_raises(dcgmExceptionClass(dcgm_structs.DCGM_ST_VER_MISMATCH)):
            vtDcgmVgpuConfigGet(handle, groupId, dcgm_structs.DCGM_CONFIG_CURRENT_STATE,
                                groupInfo.count, status_handle, versionTest)
def helper_verify_power_value_standalone(handle, groupId, expected_power):
    """
    Helper Method to verify power value

    Reads the current configuration of every entry in the group and asserts
    that each entry reporting a power limit uses the individual power cap type
    and carries expected_power. Entries whose power limit value equals
    DCGM_INT32_NOT_SUPPORTED are not checked. The status handle created here
    is destroyed once all checks have passed.
    """
    groupInfo = dcgm_agent.dcgmGroupGetInfo(handle, groupId, dcgm_structs.c_dcgmGroupInfo_version2)
    status_handle = dcgm_agent.dcgmStatusCreate()

    config_values = dcgm_agent.dcgmConfigGet(handle, groupId, dcgm_structs.DCGM_CONFIG_CURRENT_STATE,
                                             groupInfo.count, status_handle)
    assert len(config_values) > 0, "Failed to get configuration using dcgmConfigGet"

    for idx in range(groupInfo.count):
        # Nothing to verify when the power limit is not supported for this entry
        if config_values[idx].mPowerLimit.val == dcgmvalue.DCGM_INT32_NOT_SUPPORTED:
            continue

        assert config_values[idx].mPowerLimit.type == dcgm_structs.DCGM_CONFIG_POWER_CAP_INDIVIDUAL, \
            "The power limit type for gpuId %d is incorrect. Returned: %d Expected :%d" \
            % (idx, config_values[idx].mPowerLimit.type, dcgm_structs.DCGM_CONFIG_POWER_CAP_INDIVIDUAL)

        assert config_values[idx].mPowerLimit.val == expected_power, \
            "The power limit value for gpuID %d is incorrect. Returned: %d Expected: %d" \
            % (idx, config_values[idx].mPowerLimit.val, expected_power)

    destroy_ret = dcgm_agent.dcgmStatusDestroy(status_handle)
    assert destroy_ret == dcgm_structs.DCGM_ST_OK, \
        "Failed to remove status handler, error: %s" % destroy_ret
def helper_verify_config_values_standalone(handle, groupId, expected_power, expected_ecc,
                                           expected_proc_clock, expected_mem_clock,
                                           expected_compute_mode, expected_sync_boost,
                                           expected_auto_boost):
    """
    Helper Method to verify all the values for the current configuration are as expected

    Reads the current configuration of every entry in the group with
    dcgmConfigGet and asserts, per entry, the power limit type and value, sync
    boost, auto boost, minimum memory and processor clocks, compute mode and
    ECC mode against the expected_* arguments.

    Raises AssertionError on the first mismatch, when dcgmConfigGet returns no
    entries, or when the status handle cannot be destroyed at the end.
    """
    groupInfo = dcgm_agent.dcgmGroupGetInfo(handle, groupId, dcgm_structs.c_dcgmGroupInfo_version2)
    status_handle = dcgm_agent.dcgmStatusCreate()

    config_values = dcgm_agent.dcgmConfigGet(handle, groupId, dcgm_structs.DCGM_CONFIG_CURRENT_STATE,
                                             groupInfo.count, status_handle)
    assert len(config_values) > 0, "Failed to get configuration using dcgmConfigGet"

    # range() rather than xrange(): xrange does not exist on Python 3 and the
    # sibling power-value helper already iterates with range().
    for x in range(0, groupInfo.count):
        assert config_values[x].mPowerLimit.type == dcgm_structs.DCGM_CONFIG_POWER_CAP_INDIVIDUAL, \
            "The power limit type for gpuId %d is incorrect. Returned: %d Expected :%d" \
            % (x, config_values[x].mPowerLimit.type, dcgm_structs.DCGM_CONFIG_POWER_CAP_INDIVIDUAL)

        assert config_values[x].mPowerLimit.val == expected_power, \
            "The power limit value for gpuID %d is incorrect. Returned: %d Expected: %d" \
            % (x, config_values[x].mPowerLimit.val, expected_power)

        assert config_values[x].mPerfState.syncBoost == expected_sync_boost, \
            "The syncboost value for gpuID %d is incorrect. Returned: %d Expected: %d" \
            % (x, config_values[x].mPerfState.syncBoost, expected_sync_boost)

        assert config_values[x].mPerfState.autoBoost == expected_auto_boost, \
            "The autoboost value for gpuID %d is incorrect. Returned: %d Expected: %d" \
            % (x, config_values[x].mPerfState.autoBoost, expected_auto_boost)

        # The failure message must index config_values[x]; reading the field
        # off the whole array would raise while building the message and hide
        # the real mismatch.
        assert config_values[x].mPerfState.minVPState.memClk == expected_mem_clock, \
            "The min mem clock value for gpuID %d is incorrect. Returned: %d Expected: %d" \
            % (x, config_values[x].mPerfState.minVPState.memClk, expected_mem_clock)

        assert config_values[x].mPerfState.minVPState.procClk == expected_proc_clock, \
            "The min proc clock value for gpuID %d is incorrect. Returned: %d Expected: %d" \
            % (x, config_values[x].mPerfState.minVPState.procClk, expected_proc_clock)

        assert config_values[x].mComputeMode == expected_compute_mode, \
            "The compute mode value for gpuID %d is incorrect. Returned: %d Expected: %d" \
            % (x, config_values[x].mComputeMode, expected_compute_mode)

        assert config_values[x].mEccMode == expected_ecc, \
            "The ecc mode value for gpuID %d is incorrect. Returned: %d Expected: %d" \
            % (x, config_values[x].mEccMode, expected_ecc)

    ret = dcgm_agent.dcgmStatusDestroy(status_handle)
    assert (ret == dcgm_structs.DCGM_ST_OK), "Failed to remove status handler, error: %s" % ret
def main():
    """
    Run the sample agent: start DCGM in manual operation mode, create a group
    covering the default set of GPUs, run agent_worker_function on a separate
    thread, then destroy the group and the status handle.

    Exits the process with status 1 if either destroy call raises DCGMError.
    """
    ## Initialize the DCGM Engine in manual operation mode. This implies that its execution is
    ## controlled by the monitoring agent. The user has to periodically call APIs such as
    ## dcgmEnginePolicyTrigger and dcgmEngineUpdateAllFields which tell DCGM to wake up and
    ## perform data collection and operations needed for policy management.
    with RunDCGM('127.0.0.1', dcgm_structs.DCGM_OPERATION_MODE_MANUAL) as handle:

        ## Create a default group (comprised of all the GPUs on the node) named
        ## "all_gpus_group". The returned opaque handle identifies the new group.
        groupId = dcgm_agent.dcgmGroupCreate(handle, dcgm_structs.DCGM_GROUP_DEFAULT, "all_gpus_group")

        ## Query information on the newly created group (the result is not used further here)
        dcgm_agent.dcgmGroupGetInfo(handle, groupId)

        ## Status handler which can be used to get the statuses for multiple
        ## operations on one or more devices present in the group
        status_handle = dcgm_agent.dcgmStatusCreate()

        ## The worker function can run as a separate thread or on the main thread.
        ## Here it runs as a separate thread.
        worker = Thread(target=agent_worker_function, args=(handle, groupId))
        worker.start()

        ##########################################
        # Any other useful work can be placed here
        ##########################################

        worker.join()
        print("Worker thread completed")

        ## Tear down the group first, then the status handle; stop at the first failure
        cleanup_steps = (
            (dcgm_agent.dcgmGroupDestroy, (handle, groupId),
             "Failed to remove the test group, error: %s"),
            (dcgm_agent.dcgmStatusDestroy, (status_handle,),
             "Failed to remove status handler, error: %s"),
        )
        for destroy, destroy_args, failure_msg in cleanup_steps:
            try:
                destroy(*destroy_args)
            except dcgm_structs.DCGMError as e:
                print(failure_msg % e, file=sys.stderr)
                sys.exit(1)
def test_dcgm_vgpu_config_set_validate(handle):
    """
    Validates structure version

    Calls vtDcgmVgpuConfigSet with unsupported structure versions and expects
    the exception class for DCGM_ST_VER_MISMATCH to be raised each time.
    """
    groupId = dcgm_agent.dcgmGroupCreate(handle, dcgm_structs.DCGM_GROUP_DEFAULT, "test1")
    status_handle = dcgm_agent.dcgmStatusCreate()
    config_values = dcgm_structs.c_dcgmDeviceConfig_v1()

    # 0 is the invalid version, 50 is an arbitrary invalid version
    for versionTest in (0, 50):
        with test_utils.assert_raises(dcgmExceptionClass(dcgm_structs.DCGM_ST_VER_MISMATCH)):
            vtDcgmVgpuConfigSet(handle, groupId, config_values, status_handle, versionTest)
def test_dcgm_run_diagnostic_validate(handle, gpuIds):
    """
    Validates structure version

    Calls vtDcgmActionValidate_v2, vtDcgmActionValidate and vtDcgmRunDiagnostic
    with unsupported structure versions and expects the exception class for
    DCGM_ST_VER_MISMATCH to be raised by every call.
    """
    dcgmHandle = pydcgm.DcgmHandle(handle=handle)
    dcgmSystem = dcgmHandle.GetSystem()
    gpuIdList = dcgmSystem.discovery.GetAllGpuIds()
    # NOTE(review): a length is never negative, so this check cannot fail;
    # the message suggests "> 0" may have been intended - confirm.
    assert len(gpuIdList) >= 0, "Not able to find devices on the node for embedded case"

    groupId = dcgm_agent.dcgmGroupCreate(handle, dcgm_structs.DCGM_GROUP_DEFAULT, "test1")
    groupInfo = dcgm_agent.dcgmGroupGetInfo(handle, groupId)
    status_handle = dcgm_agent.dcgmStatusCreate()
    diagLevel = dcgm_structs.DCGM_DIAG_LVL_SHORT

    # Comma-separated list of the GPU ids handed to the test
    gpuIdStr = ",".join(str(gpuId) for gpuId in gpuIds)

    drd = dcgm_structs.c_dcgmRunDiag_t()
    drd.version = dcgm_structs.dcgmRunDiag_version
    drd.validate = dcgm_structs.DCGM_POLICY_VALID_SV_SHORT
    drd.groupId = groupId
    drd.gpuList = gpuIdStr

    # One entry per API under test; each takes the structure version to try
    versionedCalls = (
        lambda version: vtDcgmActionValidate_v2(handle, drd, version),
        lambda version: vtDcgmActionValidate(handle, drd.groupId, drd.validate, version),
        lambda version: vtDcgmRunDiagnostic(handle, drd.groupId, diagLevel, version),
    )

    for invoke in versionedCalls:
        # 0 is the invalid version, 50 is an arbitrary unsupported version
        for versionTest in (0, 50):
            with test_utils.assert_raises(dcgmExceptionClass(dcgm_structs.DCGM_ST_VER_MISMATCH)):
                invoke(versionTest)
## controlled by the monitoring agent. The user has to periodically call APIs such as ## dcgmEnginePolicyTrigger and dcgmEngineUpdateAllFields which tells DCGM to wake up and ## perform data collection and operations needed for policy management. with RunDCGM('127.0.0.1', dcgm_structs.DCGM_OPERATION_MODE_MANUAL) as handle: ## Create a default group. (Default group is comprised of all the GPUs on the node) ## Let's call the group as "all_gpus_group". The method returns an opaque handle (groupId) to ## identify the newly created group. groupId = dcgm_agent.dcgmGroupCreate(handle, dcgm_structs.DCGM_GROUP_DEFAULT, "all_gpus_group") ## Invoke method to get information on the newly created group groupInfo = dcgm_agent.dcgmGroupGetInfo(handle, groupId) ## Create reference to DCGM status handler which can be used to get the statuses for multiple ## operations on one or more devices present in the group status_handle = dcgm_agent.dcgmStatusCreate() ## The worker function can be executed as a separate thread or as part of the main thread. ## Executed as a separate thread here thread = Thread(target = agent_worker_function, args = (handle, groupId, groupInfo, status_handle)) thread.start() ########################################## # Any other useful work can be placed here ########################################## thread.join() print "Worker thread completed" ## Destroy the group ret = dcgm_agent.dcgmGroupDestroy(handle, groupId)
def test_dcgm_configure_ecc_mode(handle, gpuIds):
    """
    Toggle the ECC mode of one GPU through dcgmConfigSet and check that the
    configuration read back afterwards reports the new mode.

    The first statement calls test_utils.skip_test for bug 200377294. The test
    also calls skip_test when no GPU reports a supported current-ECC field or
    when setting the configuration records any error on the status handle.
    """
    test_utils.skip_test("Skipping this test until bug 200377294 is fixed")

    groupId = dcgm_agent.dcgmGroupCreate(handle, dcgm_structs.DCGM_GROUP_EMPTY, "test1")

    # Pick the first GPU whose current-ECC field is supported
    validDevice = -1
    for gpuId in gpuIds:
        fvSupported = dcgm_agent_internal.dcgmGetLatestValuesForFields(
            handle, gpuId, [dcgm_fields.DCGM_FI_DEV_ECC_CURRENT])
        if fvSupported[0].value.i64 != dcgmvalue.DCGM_INT64_NOT_SUPPORTED:
            validDevice = gpuId
            break

    if validDevice == -1:
        test_utils.skip_test("Can only run if at least one GPU with ECC is present")

    ret = dcgm_agent.dcgmGroupAddDevice(handle, groupId, validDevice)
    assert ret == dcgm_structs.DCGM_ST_OK, \
        "Failed to add a device to the group %d. Return %d" % (groupId.value, ret)

    groupInfo = dcgm_agent.dcgmGroupGetInfo(handle, groupId)

    # Create a status handle
    status_handle = dcgm_agent.dcgmStatusCreate()

    ## Get original ECC mode on the device
    config_values = dcgm_agent.dcgmConfigGet(handle, groupId, dcgm_structs.DCGM_CONFIG_CURRENT_STATE,
                                             groupInfo.count, status_handle)
    assert len(config_values) > 0, "Failed to get configuration using dcgmConfigGet"

    # Flip the mode: 0 becomes 1, anything else becomes 0
    eccmodeOnGroupExisting = config_values[0].mEccMode
    eccmodeOnGroupToSet = 1 if eccmodeOnGroupExisting == 0 else 0

    ## Toggle the ECC mode on the group; every other setting is set to the BLANK value
    config_values = dcgm_structs.c_dcgmDeviceConfig_v1()
    config_values.mEccMode = eccmodeOnGroupToSet
    config_values.mPerfState.syncBoost = dcgmvalue.DCGM_INT32_BLANK
    config_values.mPerfState.targetClocks.memClock = dcgmvalue.DCGM_INT32_BLANK
    config_values.mPerfState.targetClocks.smClock = dcgmvalue.DCGM_INT32_BLANK
    config_values.mComputeMode = dcgmvalue.DCGM_INT32_BLANK
    config_values.mPowerLimit.type = dcgmvalue.DCGM_INT32_BLANK
    config_values.mPowerLimit.val = dcgmvalue.DCGM_INT32_BLANK

    # Clear the status handle to log the errors while setting the config
    ret = dcgm_agent.dcgmStatusClear(status_handle)
    assert ret == dcgm_structs.DCGM_ST_OK, "Failed to clear the status handle. Return %d" % ret

    try:
        ret = dcgm_agent.dcgmConfigSet(handle, groupId, config_values, status_handle)
    except dcgm_structs.DCGMError as e:
        # Deliberately ignored: failures are examined through the status handle below
        pass

    # Any error recorded while setting the config leads to a skip
    for error in helper_get_status_list(status_handle):
        if error.status == dcgm_structs.DCGM_ST_RESET_REQUIRED:
            skipReason = "Skipping the test - Unable to reset the Gpu, FieldId - %d, Return - %d"
        else:
            skipReason = "Skipping the test - Unable to set the ECC mode. FieldId - %d, Return %d"
        test_utils.skip_test(skipReason % (error.fieldId, error.status))

    # Sleep after reset
    time.sleep(2)

    # Clear the status handle to log the errors while setting the config
    ret = dcgm_agent.dcgmStatusClear(status_handle)
    assert ret == dcgm_structs.DCGM_ST_OK, "Failed to clear the status handle. Return %d" % ret

    # Get the current configuration
    config_values = dcgm_agent.dcgmConfigGet(handle, groupId, dcgm_structs.DCGM_CONFIG_CURRENT_STATE,
                                             groupInfo.count, status_handle)
    assert len(config_values) > 0, "Failed to get configuration using dcgmConfigGet"

    fvs = dcgm_agent_internal.dcgmGetLatestValuesForFields(
        handle, validDevice,
        [dcgm_fields.DCGM_FI_DEV_ECC_PENDING, dcgm_fields.DCGM_FI_DEV_ECC_CURRENT])

    # The new mode is only checked when the pending and current ECC values agree
    if fvs[0].value.i64 == fvs[1].value.i64:
        assert config_values[0].mEccMode == (eccmodeOnGroupToSet), \
            "ECC mode %d different from the set value %d" % \
            (config_values[0].mEccMode, eccmodeOnGroupToSet)
    else:
        logger.warning("Pending ECC %d != Current ECC %d for gpuId %d. Box probably needs a reboot" %
                       (fvs[0].value.i64, fvs[1].value.i64, validDevice))
def test_dcgm_vgpu_configure_ecc_mode(handle, gpuIds):
    """
    Toggle the ECC mode of one GPU through dcgmVgpuConfigSet and check that the
    vGPU configuration read back afterwards reports the new mode.

    The first statement calls test_utils.skip_test for bug 200377294. The test
    also calls skip_test when no GPU reports a supported DCGM_FI_DEV_RETIRED_DBE
    field or when setting the configuration records any error on the status
    handle.
    """
    test_utils.skip_test("Skipping this test until bug 200377294 is fixed")

    groupId = dcgm_agent.dcgmGroupCreate(handle, dcgm_structs.DCGM_GROUP_EMPTY, "test1")

    # Pick the first GPU for which the retired-DBE field is supported
    validDevice = -1
    for gpuId in gpuIds:
        fvSupported = dcgm_agent_internal.dcgmGetLatestValuesForFields(
            handle, gpuId, [dcgm_fields.DCGM_FI_DEV_RETIRED_DBE])
        if fvSupported[0].value.i64 != dcgmvalue.DCGM_INT64_NOT_SUPPORTED:
            validDevice = gpuId
            break

    if validDevice == -1:
        test_utils.skip_test("Can only run if at least one GPU with ECC is present")

    ret = dcgm_agent.dcgmGroupAddDevice(handle, groupId, validDevice)
    assert ret == dcgm_structs.DCGM_ST_OK, \
        "Failed to add a device to the group %d. Return %d" % (groupId, ret)

    groupInfo = dcgm_agent.dcgmGroupGetInfo(handle, groupId)

    # Create a status handle
    status_handle = dcgm_agent.dcgmStatusCreate()

    ## Get original ECC mode on the device
    vgpu_config_values = dcgm_agent_internal.dcgmVgpuConfigGet(
        handle, groupId, dcgm_structs.DCGM_CONFIG_CURRENT_STATE, groupInfo.count, status_handle)
    assert len(vgpu_config_values) > 0, "Failed to work with NULL status handle"

    # Flip the mode: 0 becomes 1, anything else becomes 0
    eccmodeOnGroupExisting = vgpu_config_values[0].mEccMode
    eccmodeOnGroupToSet = 1 if eccmodeOnGroupExisting == 0 else 0

    ## Toggle the ECC mode on the group; the other settings are set to the BLANK value
    vgpu_config_values = dcgm_structs.c_dcgmDeviceVgpuConfig_v1()
    vgpu_config_values.mEccMode = eccmodeOnGroupToSet
    vgpu_config_values.mComputeMode = dcgmvalue.DCGM_INT32_BLANK
    vgpu_config_values.mPowerLimit.type = dcgmvalue.DCGM_INT32_BLANK
    vgpu_config_values.mPowerLimit.val = dcgmvalue.DCGM_INT32_BLANK

    # Clear the status handle to log the errors while setting the config
    ret = dcgm_agent.dcgmStatusClear(status_handle)
    assert ret == dcgm_structs.DCGM_ST_OK, "Failed to clear the status handle. Return %d" % ret

    try:
        ret = dcgm_agent_internal.dcgmVgpuConfigSet(handle, groupId, vgpu_config_values, status_handle)
    except dcgm_structs.DCGMError as e:
        # Deliberately ignored: failures are examined through the status handle below
        pass

    # Any error recorded while setting the config leads to a skip
    for error in helper_get_status_list(status_handle):
        if error.status == dcgm_structs.DCGM_ST_RESET_REQUIRED:
            skipReason = "Skipping the test - Unable to reset the Gpu, FieldId - %d, Return - %d"
        else:
            skipReason = "Skipping the test - Unable to set the ECC mode. FieldId - %d, Return %d"
        test_utils.skip_test(skipReason % (error.fieldId, error.status))

    # Sleep after reset and then apply update for it to occur
    time.sleep(2)
    dcgm_agent.dcgmUpdateAllFields(handle, 1)

    # Clear the status handle to log the errors while setting the config
    ret = dcgm_agent.dcgmStatusClear(status_handle)
    assert ret == dcgm_structs.DCGM_ST_OK, "Failed to clear the status handle. Return %d" % ret

    # Get the current configuration
    config_values = dcgm_agent_internal.dcgmVgpuConfigGet(
        handle, groupId, dcgm_structs.DCGM_CONFIG_CURRENT_STATE, groupInfo.count, status_handle)
    assert len(config_values) > 0, "Failed to get configuration using dcgmiVgpuConfigGet"

    assert config_values[0].mEccMode == (eccmodeOnGroupToSet), "ECC mode different from the set value"
def __init__(self):
    """Create a DCGM status handle and start with an empty error list."""
    # Handle returned by dcgm_agent.dcgmStatusCreate(), kept on the instance
    status_handle = dcgm_agent.dcgmStatusCreate()
    self.handle = status_handle
    # Empty list; nothing in this method fills it
    self.errors = []