コード例 #1
0
ファイル: DcgmGroup.py プロジェクト: NVIDIA/DCGM
 def Register(self, condition, beginCallback=None, finishCallback=None):
     """Register policy callbacks for this group.

     Forwards ``condition`` and both callbacks to
     ``dcgm_agent.dcgmPolicyRegister`` together with this object's DCGM
     handle and group id.

     Args:
         condition: Policy condition value passed through unchanged.
         beginCallback: Optional callback forwarded as the begin callback.
         finishCallback: Optional callback forwarded as the finish callback.

     Raises:
         pydcgm.DcgmException: If both callbacks are None.
     """
     callbacks = (beginCallback, finishCallback)

     # Registering with no callback at all would be a no-op, so reject it.
     if all(cb is None for cb in callbacks):
         raise pydcgm.DcgmException(
             "At least 1 callback must be provided to register that is not None"
         )

     handle = self._dcgmHandle.handle
     dcgm_agent.dcgmPolicyRegister(handle, self._groupId, condition,
                                   *callbacks)
コード例 #2
0
# NOTE(review): this is an excerpt from a larger test routine; handle, groupId,
# config_values, powerLimit_set, newPolicy, statusHandle, c_callback and
# validDevice are all defined outside the lines shown here.
# Store the requested power limit in the config structure that is pushed below.
config_values.mPowerLimit.val = powerLimit_set

## Set Config and verify the value
ret = dcgm_agent.dcgmConfigSet(handle, groupId, config_values, statusHandle)
assert (ret == dcgm_structs.DCGM_ST_OK
        ), "Failed to set configuration for the group: %s" % ret
# Clear the status handle before it is reused by dcgmPolicySet below.
dcgm_agent.dcgmStatusClear(statusHandle)
helper_verify_power_value_standalone(handle, groupId, powerLimit_set)

# Apply newPolicy to the group; anything other than DCGM_ST_OK fails the test.
ret = dcgm_agent.dcgmPolicySet(handle, groupId, newPolicy, statusHandle)
assert (ret == dcgm_structs.DCGM_ST_OK)

time.sleep(5)  # give the policy manager a chance to start

# Register for the max-pages-retired condition, passing the same c_callback in
# both callback positions (presumably begin and finish, in that order --
# confirm against the dcgmPolicyRegister binding).
requestId = dcgm_agent.dcgmPolicyRegister(
    handle, groupId, dcgm_structs.DCGM_POLICY_COND_MAX_PAGES_RETIRED,
    c_callback, c_callback)
assert (requestId != None)

# inject an error into page retirement
# Build a version-1 inject structure for DCGM_FI_DEV_RETIRED_DBE, typed as
# INT64, carrying the value 10.
field = dcgm_structs_internal.c_dcgmInjectFieldValue_v1()
field.version = dcgm_structs_internal.dcgmInjectFieldValue_version1
field.fieldId = dcgm_fields.DCGM_FI_DEV_RETIRED_DBE
field.status = 0
field.fieldType = ord(dcgm_fields.DCGM_FT_INT64)
# Timestamp is "now + 11 seconds" scaled by 1e6 (presumably microseconds since
# the epoch -- TODO confirm the unit expected by the inject API).
field.ts = int(
    (time.time() + 11) * 1000000.0)  # set the injected data into the future
field.value.i64 = 10

# Inject the fabricated sample for validDevice; the call must report DCGM_ST_OK.
ret = dcgm_agent_internal.dcgmInjectFieldValue(handle, validDevice, field)
assert (ret == dcgm_structs.DCGM_ST_OK)
コード例 #3
0
ファイル: dcgm_diagnostic.py プロジェクト: NVIDIA/DCGM
        ## to make sure everything is ready to run
        ## currently this calls an outside diagnostic binary but eventually
        ## that binary will be merged into the DCGM framework
        ## The "response" is a dcgmDiagResponse structure that can be parsed for errors
        response = dcgm_agent.dcgmActionValidate_v2(handle, runDiagInfo)

        ## This will perform an "epilogue" diagnostic that will stress the system
        ## Currently commented out because it takes several minutes to execute
        ## NOTE(review): the disabled call below passes dcgmRunDiagInfo, while the
        ## live call above passes runDiagInfo -- confirm the name before re-enabling.
        # runDiagInfo.validate = dcgm_structs.DCGM_POLICY_VALID_SV_LONG
        #response = dcgm_agent.dcgmActionValidate_v2(handle, dcgmRunDiagInfo)

        ## prime the policy manager to look for ECC, PCIe events
        ## if a callback occurs the function above is called. Currently the data returned
        ## corresponds to the error that occurred (PCI, DBE, etc.) but in the future it will be a
        ## dcgmPolicyViolation_t or similar
        ## The PCI and DBE condition flags are OR-ed into a single condition value.
        ## Only the last callback argument is supplied (c_callback); the one before
        ## it is None.
        ret = dcgm_agent.dcgmPolicyRegister(
            handle, runDiagInfo.groupId, dcgm_structs.DCGM_POLICY_COND_PCI
            | dcgm_structs.DCGM_POLICY_COND_DBE, None, c_callback)

        ## trigger the policy loop
        ## typically this would be looped in a separate thread or called on demand
        ## NOTE(review): this assignment overwrites the ret from dcgmPolicyRegister,
        ## and neither value is checked within this excerpt.
        ret = dcgm_agent.dcgmPolicyTrigger(handle)

        ## Destroy the group
        ## A DCGMError here is reported on stderr and the process exits with status 1.
        try:
            dcgm_agent.dcgmGroupDestroy(handle, runDiagInfo.groupId)
        except dcgm_structs.DCGMError as e:
            print("Failed to remove the test group, error: %s" % e,
                  file=sys.stderr)
            sys.exit(1)