def Register(self, condition, beginCallback=None, finishCallback=None):
    """
    Register policy callbacks for this group.

    condition: forwarded unchanged as the condition argument of
        dcgm_agent.dcgmPolicyRegister.
    beginCallback: forwarded unchanged to dcgmPolicyRegister; may be None
        as long as finishCallback is not.
    finishCallback: forwarded unchanged to dcgmPolicyRegister; may be None
        as long as beginCallback is not.

    Raises pydcgm.DcgmException when both callbacks are None.
    The value returned by dcgmPolicyRegister is not passed back to the caller.
    """
    callbacks = (beginCallback, finishCallback)
    if all(callback is None for callback in callbacks):
        # Registering with no callback at all is rejected before DCGM is called.
        raise pydcgm.DcgmException(
            "At least 1 callback must be provided to register that is not None"
        )

    dcgm_agent.dcgmPolicyRegister(
        self._dcgmHandle.handle,
        self._groupId,
        condition,
        beginCallback,
        finishCallback,
    )
config_values.mPowerLimit.val = powerLimit_set

## Set Config and verify the value
ret = dcgm_agent.dcgmConfigSet(handle, groupId, config_values, statusHandle)
assert ret == dcgm_structs.DCGM_ST_OK, \
    "Failed to set configuration for the group: %s" % ret

dcgm_agent.dcgmStatusClear(statusHandle)
helper_verify_power_value_standalone(handle, groupId, powerLimit_set)

## Apply the new policy to the group
ret = dcgm_agent.dcgmPolicySet(handle, groupId, newPolicy, statusHandle)
assert ret == dcgm_structs.DCGM_ST_OK, \
    "Failed to set policy for the group: %s" % ret

time.sleep(5)  # give the policy manager a chance to start

## Register for the max-pages-retired condition; c_callback is passed as
## both callback arguments
requestId = dcgm_agent.dcgmPolicyRegister(
    handle,
    groupId,
    dcgm_structs.DCGM_POLICY_COND_MAX_PAGES_RETIRED,
    c_callback,
    c_callback,
)
assert requestId is not None, "dcgmPolicyRegister returned None"

# inject an error into page retirement
field = dcgm_structs_internal.c_dcgmInjectFieldValue_v1()
field.version = dcgm_structs_internal.dcgmInjectFieldValue_version1
field.fieldId = dcgm_fields.DCGM_FI_DEV_RETIRED_DBE
field.status = 0
field.fieldType = ord(dcgm_fields.DCGM_FT_INT64)
# set the injected data into the future: 11 seconds past "now", scaled by 1e6
# (presumably microseconds -- confirm against the field timestamp units)
field.ts = int((time.time() + 11) * 1000000.0)
field.value.i64 = 10

ret = dcgm_agent_internal.dcgmInjectFieldValue(handle, validDevice, field)
assert ret == dcgm_structs.DCGM_ST_OK, \
    "Failed to inject the field value: %s" % ret
## to make sure everything is ready to run
## At present this invokes an external diagnostic binary; the plan is for
## that binary to eventually be merged into the DCGM framework.
## The "response" is a dcgmDiagResponse structure that can be parsed for errors
response = dcgm_agent.dcgmActionValidate_v2(handle, runDiagInfo)

## This would perform an "epilogue" diagnostic that stresses the system.
## It is left commented out because it takes several minutes to execute.
# runDiagInfo.validate = dcgm_structs.DCGM_POLICY_VALID_SV_LONG
# response = dcgm_agent.dcgmActionValidate_v2(handle, runDiagInfo)

## Prime the policy manager to look for ECC (DBE) and PCIe events.
## If a callback occurs, the callback function defined above is called.
## Currently the data returned corresponds to the error that occurred
## (PCI, DBE, etc.), but in the future it will be a dcgmPolicyViolation_t
## or similar.
ret = dcgm_agent.dcgmPolicyRegister(
    handle,
    runDiagInfo.groupId,
    dcgm_structs.DCGM_POLICY_COND_PCI | dcgm_structs.DCGM_POLICY_COND_DBE,
    None,
    c_callback,
)

## Trigger the policy loop.
## Typically this would be looped in a separate thread or called on demand.
ret = dcgm_agent.dcgmPolicyTrigger(handle)

## Destroy the group; a failure is reported on stderr and exits with status 1
try:
    dcgm_agent.dcgmGroupDestroy(handle, runDiagInfo.groupId)
except dcgm_structs.DCGMError as err:
    print("Failed to remove the test group, error: %s" % err, file=sys.stderr)
    sys.exit(1)