def main():
    """Create an all-GPU group, run the agent worker thread against it, then clean up.

    NOTE(review): status_handle is created here but NOT passed to the worker
    thread (args are only (handle, groupId)); another variant of this example
    forwards groupInfo and status_handle as well -- confirm the expected
    agent_worker_function signature before relying on status reporting.
    """
    ## Initialize the DCGM Engine in manual operation mode. This implies that its execution is
    ## controlled by the monitoring agent. The user has to periodically call APIs such as
    ## dcgmEnginePolicyTrigger and dcgmEngineUpdateAllFields which tells DCGM to wake up and
    ## perform data collection and operations needed for policy management.
    with RunDCGM('127.0.0.1', dcgm_structs.DCGM_OPERATION_MODE_MANUAL) as handle:

        ## Create a default group. (Default group is comprised of all the GPUs on the node)
        ## Let's call the group as "all_gpus_group". The method returns an opaque handle (groupId) to
        ## identify the newly created group.
        groupId = dcgm_agent.dcgmGroupCreate(handle, dcgm_structs.DCGM_GROUP_DEFAULT, "all_gpus_group")

        ## Invoke method to get information on the newly created group
        groupInfo = dcgm_agent.dcgmGroupGetInfo(handle, groupId)

        ## Create reference to DCGM status handler which can be used to get the statuses for multiple
        ## operations on one or more devices present in the group
        status_handle = dcgm_agent.dcgmStatusCreate()

        ## The worker function can be executed as a separate thread or as part of the main thread.
        ## Executed as a separate thread here
        thread = Thread(target=agent_worker_function, args=(handle, groupId))
        thread.start()

        ##########################################
        # Any other useful work can be placed here
        ##########################################

        thread.join()
        print("Worker thread completed")

        ## Destroy the group; the binding raises DCGMError on failure
        try:
            dcgm_agent.dcgmGroupDestroy(handle, groupId)
        except dcgm_structs.DCGMError as e:
            print("Failed to remove the test group, error: %s" % e, file=sys.stderr)
            sys.exit(1)

        ## Destroy the status handle; the binding raises DCGMError on failure
        try:
            dcgm_agent.dcgmStatusDestroy(status_handle)
        except dcgm_structs.DCGMError as e:
            print("Failed to remove status handler, error: %s" % e, file=sys.stderr)
            sys.exit(1)
def Delete(self):
    """Tear down this group object.

    Drops every cached sub-interface (config, samples, health, policy,
    discovery, stats, action, profiling), resetting each attribute to None,
    and destroys the underlying DCGM group unless it is the special
    static all-GPU group.
    """
    # Release each API sub-interface and leave the attribute as None so
    # later accesses fail loudly rather than using a stale object.
    for attrName in ("config", "samples", "health", "policy",
                     "discovery", "stats", "action", "profiling"):
        delattr(self, attrName)
        setattr(self, attrName, None)

    # Delete the group we created if we're not using the special all-GPU group
    if self._groupId is not None and not self._IsGroupIdStatic():
        ret = dcgm_agent.dcgmGroupDestroy(self._dcgmHandle.handle, self._groupId)
        dcgm_structs._dcgmCheckReturn(ret)
        self._groupId = None
def dcgm_group_test_default_group(handle, gpuIds):
    """
    Test that the default group can not be deleted, or manipulated and is returning all GPUs.

    Note that we're not using groupObj for some tests because it protects against operations on
    the default group
    """
    handleObj = pydcgm.DcgmHandle(handle=handle)
    groupObj = handleObj.GetSystem().GetDefaultGroup()

    gpuIdList = gpuIds
    assert len(gpuIdList) > 0, "Failed to get devices from the node"

    # Looking up a bogus group id must be rejected
    with test_utils.assert_raises(dcgmExceptionClass(dcgm_structs.DCGM_ST_NOT_CONFIGURED)):
        groupInfo = dcgm_agent.dcgmGroupGetInfo(handle, 9999)

    # The default group must report exactly the node's GPUs
    groupGpuIdList = groupObj.GetGpuIds()
    assert (gpuIdList == groupGpuIdList), "Expected gpuId list match %s != %s" % (str(gpuIdList), str(groupGpuIdList))

    # Every entity in the default group must be a GPU, and the ids must line up
    gpuIdList2 = []
    for ent in groupObj.GetEntities():
        assert ent.entityGroupId == dcgm_fields.DCGM_FE_GPU, str(ent.entityGroupId)
        gpuIdList2.append(ent.entityId)
    assert gpuIdList == gpuIdList2, "Expected gpuId list to match entity list: %s != %s" % (str(gpuIdList), str(gpuIdList2))

    # Neither the raw API nor the wrapper may remove GPUs from the default group
    for gpuId in gpuIdList:
        with test_utils.assert_raises(dcgmExceptionClass(dcgm_structs.DCGM_ST_NOT_CONFIGURED)):
            ret = dcgm_agent.dcgmGroupRemoveDevice(handle, dcgm_structs.DCGM_GROUP_ALL_GPUS, gpuId)
        with test_utils.assert_raises(pydcgm.DcgmException):
            groupObj.RemoveGpu(gpuId)

    # Destroying the default group must be rejected as well
    with test_utils.assert_raises(dcgmExceptionClass(dcgm_structs.DCGM_ST_NOT_CONFIGURED)):
        ret = dcgm_agent.dcgmGroupDestroy(handle, dcgm_structs.DCGM_GROUP_ALL_GPUS)
def __del__(self):
    """Destructor: destroy the DCGM group this object created (if any) and drop the handle.

    NOTE(review): dcgmGroupDestroy errors are not caught here; exceptions raised
    from __del__ during interpreter teardown are reported (not propagated) by
    Python itself.
    """
    if self.groupId is not None:
        dcgm_agent.dcgmGroupDestroy(self.heHandle, self.groupId)
        self.groupId = None
    self.heHandle = None
## Create a default group (comprised of all the GPUs on the node) named
## "all_gpus_group". The method returns an opaque handle (groupId) to
## identify the newly created group.
groupId = dcgm_agent.dcgmGroupCreate(handle, dcgm_structs.DCGM_GROUP_DEFAULT, "all_gpus_group")

## Invoke method to get information on the newly created group
groupInfo = dcgm_agent.dcgmGroupGetInfo(handle, groupId)

## Create reference to DCGM status handler which can be used to get the statuses for multiple
## operations on one or more devices present in the group
status_handle = dcgm_agent.dcgmStatusCreate()

## The worker function can be executed as a separate thread or as part of the main thread.
## Executed as a separate thread here
thread = Thread(target=agent_worker_function, args=(handle, groupId, groupInfo, status_handle))
thread.start()

##########################################
# Any other useful work can be placed here
##########################################

thread.join()
# Fixed: Python 2 print statement replaced with Python 3 print function,
# matching the rest of the file.
print("Worker thread completed")

## Destroy the group. The binding raises DCGMError on failure (it does not
## return a status code), so error handling uses try/except like the other
## cleanup paths in this file.
try:
    dcgm_agent.dcgmGroupDestroy(handle, groupId)
except dcgm_structs.DCGMError as e:
    print("Failed to remove the test group, error: %s" % e, file=sys.stderr)
    sys.exit(1)

## Destroy the status handle
try:
    dcgm_agent.dcgmStatusDestroy(status_handle)
except dcgm_structs.DCGMError as e:
    print("Failed to remove status handler, error: %s" % e, file=sys.stderr)
    sys.exit(1)
## to make sure everything is ready to run ## currently this calls an outside diagnostic binary but eventually ## that binary will be merged into the DCGM framework ## The "response" is a dcgmDiagResponse structure that can be parsed for errors response = dcgm_agent.dcgmActionValidate_v2(handle, runDiagInfo) ## This will perform an "eiplogue" diagnostic that will stress the system ## Currently commented out because it takes several minutes to execute # runDiagInfo.validate = dcgm_structs.DCGM_POLICY_VALID_SV_LONG #response = dcgm_agent.dcgmActionValidate_v2(handle, dcgmRunDiagInfo) ## prime the policy manager to look for ECC, PCIe events ## if a callback occurs the function above is called. Currently the data returned ## corresponds to the error that occurred (PCI, DBE, etc.) but in the future it will be a ## dcgmPolicyViolation_t or similar ret = dcgm_agent.dcgmPolicyRegister( handle, runDiagInfo.groupId, dcgm_structs.DCGM_POLICY_COND_PCI | dcgm_structs.DCGM_POLICY_COND_DBE, None, c_callback) ## trigger the policy loop ## typically this would be looped in a separate thread or called on demand ret = dcgm_agent.dcgmPolicyTrigger(handle) ## Destroy the group try: dcgm_agent.dcgmGroupDestroy(handle, runDiagInfo.groupId) except dcgm_structs.DCGMError as e: print("Failed to remove the test group, error: %s" % e, file=sys.stderr) sys.exit(1)