def agent_worker_function(handle, groupId, groupInfo, status_handle):
    """Print the current configuration of every GPU in a group, repeatedly.

    Performs NUM_ITERATIONS passes with a two second pause between
    consecutive passes. Each pass calls dcgmUpdateAllFields, fetches the
    group's DCGM_CONFIG_CURRENT_STATE configuration, passes the status
    handle to helper_investigate_status, clears it, and prints one block
    of settings per GPU.

    Args:
        handle: DCGM handle forwarded to every dcgm_agent call.
        groupId: Identifier of the group whose configuration is read.
        groupInfo: Group info object; only its ``count`` attribute is used,
            as the number of configuration entries to fetch and print.
        status_handle: Status handle given to dcgmConfigGet; it is
            inspected and cleared after every fetch.
    """
    NUM_ITERATIONS = 5

    # print(...) with a single argument and range() behave the same on
    # Python 2 and Python 3, so this function is valid under either.
    for iteration in range(NUM_ITERATIONS):
        # Pause only between passes: nothing before the first one and
        # nothing after the last one.
        if iteration > 0:
            sleep(2)

        dcgm_agent.dcgmUpdateAllFields(handle, 1)

        ## Get the current configuration for the group
        config_values = dcgm_agent.dcgmConfigGet(
            handle, groupId, dcgm_structs.DCGM_CONFIG_CURRENT_STATE,
            groupInfo.count, status_handle)

        ## Since this is a group operation, check the status codes in case
        ## any of the properties failed
        helper_investigate_status(status_handle)
        dcgm_agent.dcgmStatusClear(status_handle)

        ## Display current configuration for the group
        for index in range(groupInfo.count):
            gpu_config = config_values[index]
            print("GPU Id : %d" % (gpu_config.gpuId))
            print("Ecc Mode : %s" % (convert_value_to_string(gpu_config.mEccMode)))
            print("Auto Boost : %s" % (convert_value_to_string(gpu_config.mPerfState.autoBoost)))
            # Report the sync boost field itself; this line used to repeat
            # the auto boost value under the "Sync Boost" label.
            print("Sync Boost : %s" % (convert_value_to_string(gpu_config.mPerfState.syncBoost)))
            # NOTE(review): other code in this file addresses clocks through
            # mPerfState.targetClocks (memClock/smClock) - confirm that
            # minVPState and autoBoost exist on the struct returned here.
            print("Mem Clock : %s" % (convert_value_to_string(gpu_config.mPerfState.minVPState.memClk)))
            print("SM Clock : %s" % (convert_value_to_string(gpu_config.mPerfState.minVPState.procClk)))
            print("Power Limit : %s" % (convert_value_to_string(gpu_config.mPowerLimit.val)))
            print("Compute Mode: %s" % (convert_value_to_string(gpu_config.mComputeMode)))
            print("\n")
def test_dcgm_configure_ecc_mode(handle, gpuIds):
    """Toggle the ECC mode of one GPU through a group and verify the result.

    The test starts with a skip_test call that references bug 200377294.
    The remaining steps are:

    1. Create an empty group and add the first GPU from ``gpuIds`` whose
       DCGM_FI_DEV_ECC_CURRENT value is not DCGM_INT64_NOT_SUPPORTED
       (skip when there is none).
    2. Read the current ECC mode with dcgmConfigGet and compute the
       opposite value.
    3. Apply a config in which only the ECC mode is set; every other field
       is a DCGM_INT32_BLANK.
    4. Skip when the status handle reports any error from the set call.
    5. Re-read the configuration; when the pending and current ECC fields
       agree, assert that the mode now equals the requested value,
       otherwise only log a warning.

    Args:
        handle: DCGM handle forwarded to every DCGM call.
        gpuIds: Iterable of GPU ids to search for a usable device.
    """
    test_utils.skip_test("Skipping this test until bug 200377294 is fixed")

    groupId = dcgm_agent.dcgmGroupCreate(
        handle, dcgm_structs.DCGM_GROUP_EMPTY, "test1")

    ## Look for the first GPU that reports a current ECC value
    validDevice = -1
    for gpuId in gpuIds:
        eccProbe = dcgm_agent_internal.dcgmGetLatestValuesForFields(
            handle, gpuId, [dcgm_fields.DCGM_FI_DEV_ECC_CURRENT])
        if eccProbe[0].value.i64 != dcgmvalue.DCGM_INT64_NOT_SUPPORTED:
            validDevice = gpuId
            break

    if validDevice == -1:
        test_utils.skip_test(
            "Can only run if at least one GPU with ECC is present")

    ret = dcgm_agent.dcgmGroupAddDevice(handle, groupId, validDevice)
    assert ret == dcgm_structs.DCGM_ST_OK, \
        "Failed to add a device to the group %d. Return %d" % (groupId.value, ret)

    groupInfo = dcgm_agent.dcgmGroupGetInfo(handle, groupId)

    ## Status handle shared by all the config calls below
    status_handle = dcgm_agent.dcgmStatusCreate()

    ## Get original ECC mode on the device
    originalConfig = dcgm_agent.dcgmConfigGet(
        handle, groupId, dcgm_structs.DCGM_CONFIG_CURRENT_STATE,
        groupInfo.count, status_handle)
    assert len(originalConfig) > 0, \
        "Failed to get configuration using dcgmConfigGet"

    originalEccMode = originalConfig[0].mEccMode
    requestedEccMode = 1 if originalEccMode == 0 else 0

    ## Build a config that toggles ECC and leaves everything else blank
    blank = dcgmvalue.DCGM_INT32_BLANK
    toggleConfig = dcgm_structs.c_dcgmDeviceConfig_v1()
    toggleConfig.mEccMode = requestedEccMode
    toggleConfig.mPerfState.syncBoost = blank
    toggleConfig.mPerfState.targetClocks.memClock = blank
    toggleConfig.mPerfState.targetClocks.smClock = blank
    toggleConfig.mComputeMode = blank
    toggleConfig.mPowerLimit.type = blank
    toggleConfig.mPowerLimit.val = blank

    ## Clear the status handle so it only logs errors from the set call
    ret = dcgm_agent.dcgmStatusClear(status_handle)
    assert ret == dcgm_structs.DCGM_ST_OK, \
        "Failed to clear the status handle. Return %d" % ret

    try:
        dcgm_agent.dcgmConfigSet(handle, groupId, toggleConfig, status_handle)
    except dcgm_structs.DCGMError:
        # Deliberately ignored: errors are read back from the status
        # handle right below.
        pass

    ## Any reported error results in a skip; only the wording depends on
    ## whether the status is DCGM_ST_RESET_REQUIRED
    for error in helper_get_status_list(status_handle):
        if error.status == dcgm_structs.DCGM_ST_RESET_REQUIRED:
            skipTemplate = "Skipping the test - Unable to reset the Gpu, FieldId - %d, Return - %d"
        else:
            skipTemplate = "Skipping the test - Unable to set the ECC mode. FieldId - %d, Return %d"
        test_utils.skip_test(skipTemplate % (error.fieldId, error.status))

    ## Sleep after reset
    time.sleep(2)

    ## Clear the status handle before reading the configuration back
    ret = dcgm_agent.dcgmStatusClear(status_handle)
    assert ret == dcgm_structs.DCGM_ST_OK, \
        "Failed to clear the status handle. Return %d" % ret

    ## Get the current configuration
    updatedConfig = dcgm_agent.dcgmConfigGet(
        handle, groupId, dcgm_structs.DCGM_CONFIG_CURRENT_STATE,
        groupInfo.count, status_handle)
    assert len(updatedConfig) > 0, \
        "Failed to get configuration using dcgmConfigGet"

    eccFields = dcgm_agent_internal.dcgmGetLatestValuesForFields(
        handle, validDevice,
        [dcgm_fields.DCGM_FI_DEV_ECC_PENDING,
         dcgm_fields.DCGM_FI_DEV_ECC_CURRENT])
    pendingEcc = eccFields[0].value.i64
    currentEcc = eccFields[1].value.i64

    if pendingEcc != currentEcc:
        ## The new mode is not verified in this case; only a warning is logged
        logger.warning(
            "Pending ECC %d != Current ECC %d for gpuId %d. Box probably needs a reboot"
            % (pendingEcc, currentEcc, validDevice))
        return

    assert updatedConfig[0].mEccMode == requestedEccMode, \
        "ECC mode %d different from the set value %d" % (
            updatedConfig[0].mEccMode, requestedEccMode)
config_values.mPowerLimit.val = powerLimit_set ## Set Config and verify the value status_handle = dcgm_agent.dcgmStatusCreate() ret = dcgm_agent.dcgmConfigSet(handle, groupId, config_values, statusHandle) errors = helper_get_status_list(status_handle) ecc_to_verify = ecc_set if len(errors) > 0: ## Possible that reset failed. Check the error codes for error in errors: if error.fieldId == dcgm_fields.DCGM_FI_DEV_ECC_CURRENT: ecc_to_verify = eccmodeOnGroupExisting #assert(ret == dcgm_structs.DCGM_ST_OK), "Failed to set configuration for the group: %s" % ret dcgm_agent.dcgmStatusClear(statusHandle) helper_verify_config_values_standalone(handle, groupId, powerLimit_set, ecc_to_verify, proc_clk_set, mem_clk_set, compute_set, syncboost_set, autoBoost_set) print "Verification Successful" ret = dcgm_agent.dcgmGroupDestroy(handle, groupId) assert (ret == dcgm_structs.DCGM_ST_OK ), "Failed to remove the test group, error: %s" % ret ret = dcgm_agent.dcgmStatusDestroy(statusHandle) assert (ret == dcgm_structs.DCGM_ST_OK ), "Failed to remove status handler, error: %s" % ret
def test_dcgm_vgpu_configure_ecc_mode(handle, gpuIds):
    """Toggle the ECC mode of one GPU through the vGPU config API and verify it.

    The test starts with a skip_test call that references bug 200377294.
    The remaining steps are:

    1. Create an empty group and add the first GPU from ``gpuIds`` whose
       probed field value is not DCGM_INT64_NOT_SUPPORTED (skip when
       there is none).
    2. Read the current ECC mode with dcgmVgpuConfigGet and compute the
       opposite value.
    3. Apply a vGPU config in which only the ECC mode is set; compute mode
       and power limit are DCGM_INT32_BLANK.
    4. Skip when the status handle reports any error from the set call.
    5. Sleep, call dcgmUpdateAllFields, re-read the configuration and
       assert that the ECC mode equals the requested value.

    Args:
        handle: DCGM handle forwarded to every DCGM call.
        gpuIds: Iterable of GPU ids to search for a usable device.
    """
    test_utils.skip_test("Skipping this test until bug 200377294 is fixed")

    groupId = dcgm_agent.dcgmGroupCreate(
        handle, dcgm_structs.DCGM_GROUP_EMPTY, "test1")

    ## Look for the first GPU that reports a value for the probed field.
    # NOTE(review): support is probed through DCGM_FI_DEV_RETIRED_DBE while
    # the skip message below talks about ECC - confirm this is the
    # intended field.
    validDevice = -1
    for gpuId in gpuIds:
        fvSupported = dcgm_agent_internal.dcgmGetLatestValuesForFields(
            handle, gpuId, [dcgm_fields.DCGM_FI_DEV_RETIRED_DBE])
        if fvSupported[0].value.i64 != dcgmvalue.DCGM_INT64_NOT_SUPPORTED:
            validDevice = gpuId
            break

    if validDevice == -1:
        test_utils.skip_test(
            "Can only run if at least one GPU with ECC is present")

    ret = dcgm_agent.dcgmGroupAddDevice(handle, groupId, validDevice)
    # Format groupId.value, as the non-vGPU ECC test in this file does for
    # the same dcgmGroupCreate result, so that a failure reports this
    # message rather than a formatting error.
    assert ret == dcgm_structs.DCGM_ST_OK, \
        "Failed to add a device to the group %d. Return %d" % (groupId.value, ret)

    groupInfo = dcgm_agent.dcgmGroupGetInfo(handle, groupId)

    ## Status handle shared by all the config calls below
    status_handle = dcgm_agent.dcgmStatusCreate()

    ## Get original ECC mode on the device
    vgpu_config_values = dcgm_agent_internal.dcgmVgpuConfigGet(
        handle, groupId, dcgm_structs.DCGM_CONFIG_CURRENT_STATE,
        groupInfo.count, status_handle)
    assert len(vgpu_config_values) > 0, \
        "Failed to get configuration using dcgmVgpuConfigGet"

    eccmodeOnGroupExisting = vgpu_config_values[0].mEccMode
    eccmodeOnGroupToSet = 1 if eccmodeOnGroupExisting == 0 else 0

    ## Toggle the ECC mode on the group; the other settings are left blank
    vgpu_config_values = dcgm_structs.c_dcgmDeviceVgpuConfig_v1()
    vgpu_config_values.mEccMode = eccmodeOnGroupToSet
    vgpu_config_values.mComputeMode = dcgmvalue.DCGM_INT32_BLANK
    vgpu_config_values.mPowerLimit.type = dcgmvalue.DCGM_INT32_BLANK
    vgpu_config_values.mPowerLimit.val = dcgmvalue.DCGM_INT32_BLANK

    ## Clear the status handle so it only logs errors from the set call
    ret = dcgm_agent.dcgmStatusClear(status_handle)
    assert ret == dcgm_structs.DCGM_ST_OK, \
        "Failed to clear the status handle. Return %d" % ret

    try:
        dcgm_agent_internal.dcgmVgpuConfigSet(
            handle, groupId, vgpu_config_values, status_handle)
    except dcgm_structs.DCGMError:
        # Deliberately ignored: errors are read back from the status
        # handle right below.
        pass

    ## Any reported error results in a skip; only the wording depends on
    ## whether the status is DCGM_ST_RESET_REQUIRED
    for error in helper_get_status_list(status_handle):
        if error.status == dcgm_structs.DCGM_ST_RESET_REQUIRED:
            test_utils.skip_test(
                "Skipping the test - Unable to reset the Gpu, FieldId - %d, Return - %d"
                % (error.fieldId, error.status))
        else:
            test_utils.skip_test(
                "Skipping the test - Unable to set the ECC mode. FieldId - %d, Return %d"
                % (error.fieldId, error.status))

    ## Sleep after reset and then apply update for it to occur
    time.sleep(2)
    dcgm_agent.dcgmUpdateAllFields(handle, 1)

    ## Clear the status handle before reading the configuration back
    ret = dcgm_agent.dcgmStatusClear(status_handle)
    assert ret == dcgm_structs.DCGM_ST_OK, \
        "Failed to clear the status handle. Return %d" % ret

    ## Get the current configuration
    config_values = dcgm_agent_internal.dcgmVgpuConfigGet(
        handle, groupId, dcgm_structs.DCGM_CONFIG_CURRENT_STATE,
        groupInfo.count, status_handle)
    assert len(config_values) > 0, \
        "Failed to get configuration using dcgmVgpuConfigGet"

    # Include both values so a failure shows what was actually read back
    assert config_values[0].mEccMode == eccmodeOnGroupToSet, \
        "ECC mode %d different from the set value %d" % (
            config_values[0].mEccMode, eccmodeOnGroupToSet)