Example #1
def main():

    ## Initialize the DCGM engine in manual operation mode. This implies that its execution is
    ## controlled by the monitoring agent. The user has to periodically call APIs such as
    ## dcgmEnginePolicyTrigger and dcgmEngineUpdateAllFields, which tell DCGM to wake up and
    ## perform the data collection and operations needed for policy management.
    with RunDCGM('127.0.0.1',
                 dcgm_structs.DCGM_OPERATION_MODE_MANUAL) as handle:

        ## Create a default group. (The default group comprises all of the GPUs on the node.)
        ## Let's name the group "all_gpus_group". The method returns an opaque handle (groupId)
        ## used to identify the newly created group.
        groupId = dcgm_agent.dcgmGroupCreate(handle,
                                             dcgm_structs.DCGM_GROUP_DEFAULT,
                                             "all_gpus_group")

        ## Invoke method to get information on the newly created group
        groupInfo = dcgm_agent.dcgmGroupGetInfo(handle, groupId)

        ## Create a reference to a DCGM status handle, which can be used to get the statuses for
        ## multiple operations on one or more devices present in the group
        status_handle = dcgm_agent.dcgmStatusCreate()

        ## The worker function can be executed as a separate thread or as part of the main thread;
        ## here it is executed as a separate thread
        thread = Thread(target=agent_worker_function, args=(handle, groupId))
        thread.start()

        ##########################################
        # Any other useful work can be placed here
        ##########################################

        thread.join()
        print("Worker thread completed")

        ## Destroy the group
        try:
            dcgm_agent.dcgmGroupDestroy(handle, groupId)
        except dcgm_structs.DCGMError as e:
            print("Failed to remove the test group, error: %s" % e,
                  file=sys.stderr)
            sys.exit(1)

        ## Destroy the status handle
        try:
            dcgm_agent.dcgmStatusDestroy(status_handle)
        except dcgm_structs.DCGMError as e:
            print("Failed to remove status handler, error: %s" % e,
                  file=sys.stderr)
            sys.exit(1)
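Example #1 references an agent_worker_function that is not shown. A minimal sketch of such a worker, assuming the dcgm_agent.dcgmUpdateAllFields binding and the dcgmPolicyTrigger call seen in Example #6 (the loop bound and one-second interval are illustrative only):

import time

def agent_worker_function(handle, groupId):
    ## In manual operation mode DCGM only collects data when told to, so the
    ## worker periodically wakes the engine to refresh fields and run policies
    for _ in range(10):                            # illustrative loop bound
        dcgm_agent.dcgmUpdateAllFields(handle, 1)  # 1 = wait for the update cycle to finish
        dcgm_agent.dcgmPolicyTrigger(handle)
        time.sleep(1.0)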
Example #2
    def Delete(self):
        del self.config
        self.config = None
        del self.samples
        self.samples = None
        del self.health
        self.health = None
        del self.policy
        self.policy = None
        del self.discovery
        self.discovery = None
        del self.stats
        self.stats = None
        del self.action
        self.action = None
        del self.profiling
        self.profiling = None

        # Delete the group we created if we're not using the special all-GPU group
        if self._groupId is not None and not self._IsGroupIdStatic():
            ret = dcgm_agent.dcgmGroupDestroy(self._dcgmHandle.handle,
                                              self._groupId)
            dcgm_structs._dcgmCheckReturn(ret)

        self._groupId = None
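This Delete method appears to come from pydcgm's DcgmGroup wrapper, which tears down its sub-objects before destroying the underlying group. A typical lifecycle, assuming the GetEmptyGroup helper used throughout the DCGM test suite and an AddGpu wrapper over dcgmGroupAddDevice:

handleObj = pydcgm.DcgmHandle(handle=handle)
systemObj = handleObj.GetSystem()

## GetEmptyGroup creates a new, initially empty group; AddGpu is assumed
## here to wrap dcgm_agent.dcgmGroupAddDevice
groupObj = systemObj.GetEmptyGroup("my_group")
groupObj.AddGpu(gpuIds[0])

## ... configure, sample, or health-check the group here ...

groupObj.Delete()   # drops the sub-objects and destroys the underlying group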
Example #3
def dcgm_group_test_default_group(handle, gpuIds):
    """
    Test that the default group can not be deleted, or manipulated and is returning all GPUs.

    Note that we're not using groupObj for some tests because it protects against operations on the default group
    """
    handleObj = pydcgm.DcgmHandle(handle=handle)
    systemObj = handleObj.GetSystem()
    groupObj = systemObj.GetDefaultGroup()

    gpuIdList = gpuIds
    assert len(gpuIdList) > 0, "Failed to get devices from the node"

    with test_utils.assert_raises(
            dcgmExceptionClass(dcgm_structs.DCGM_ST_NOT_CONFIGURED)):
        groupInfo = dcgm_agent.dcgmGroupGetInfo(handle, 9999)

    groupGpuIdList = groupObj.GetGpuIds()
    assert gpuIdList == groupGpuIdList, \
        "Expected gpuId lists to match: %s != %s" % (str(gpuIdList), str(groupGpuIdList))
    groupEntityList = groupObj.GetEntities()
    gpuIdList2 = []
    for entity in groupEntityList:
        assert entity.entityGroupId == dcgm_fields.DCGM_FE_GPU, str(entity.entityGroupId)
        gpuIdList2.append(entity.entityId)
    assert gpuIdList == gpuIdList2, \
        "Expected gpuId list to match entity list: %s != %s" % (str(gpuIdList), str(gpuIdList2))

    for gpuId in gpuIdList:
        with test_utils.assert_raises(
                dcgmExceptionClass(dcgm_structs.DCGM_ST_NOT_CONFIGURED)):
            ret = dcgm_agent.dcgmGroupRemoveDevice(
                handle, dcgm_structs.DCGM_GROUP_ALL_GPUS, gpuId)
        with test_utils.assert_raises(pydcgm.DcgmException):
            groupObj.RemoveGpu(gpuId)

    with test_utils.assert_raises(
            dcgmExceptionClass(dcgm_structs.DCGM_ST_NOT_CONFIGURED)):
        ret = dcgm_agent.dcgmGroupDestroy(handle,
                                          dcgm_structs.DCGM_GROUP_ALL_GPUS)
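By contrast, a group created explicitly can be freely modified and destroyed. A minimal sketch using the same dcgm_agent bindings (DCGM_GROUP_EMPTY creates the group with no members):

## Create an empty group, add and remove a device, then destroy it
groupId = dcgm_agent.dcgmGroupCreate(handle, dcgm_structs.DCGM_GROUP_EMPTY, "my_group")
dcgm_agent.dcgmGroupAddDevice(handle, groupId, gpuIds[0])
dcgm_agent.dcgmGroupRemoveDevice(handle, groupId, gpuIds[0])
dcgm_agent.dcgmGroupDestroy(handle, groupId)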
Example #4
    def __del__(self):
        if self.groupId is not None:
            dcgm_agent.dcgmGroupDestroy(self.heHandle, self.groupId)
            self.groupId = None

        self.heHandle = None
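Relying on __del__ for this cleanup is fragile: Python does not guarantee when (or, at interpreter shutdown, whether) it runs. A context manager makes the destroy deterministic; a minimal sketch over the same dcgm_agent calls, using the DCGM_GROUP_EMPTY flag shown above (this helper is illustrative, not part of the bindings):

from contextlib import contextmanager

@contextmanager
def dcgm_group(handle, name):
    ## Illustrative helper: guarantees dcgmGroupDestroy runs even if the body raises
    groupId = dcgm_agent.dcgmGroupCreate(handle, dcgm_structs.DCGM_GROUP_EMPTY, name)
    try:
        yield groupId
    finally:
        dcgm_agent.dcgmGroupDestroy(handle, groupId)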
Example #5
        groupId = dcgm_agent.dcgmGroupCreate(handle, dcgm_structs.DCGM_GROUP_DEFAULT, "all_gpus_group")
        
        ## Invoke method to get information on the newly created group
        groupInfo = dcgm_agent.dcgmGroupGetInfo(handle, groupId)
        
        ## Create a reference to a DCGM status handle, which can be used to get the statuses for
        ## multiple operations on one or more devices present in the group
        status_handle = dcgm_agent.dcgmStatusCreate()
        
        ## The worker function can be executed as a separate thread or as part of the main thread;
        ## here it is executed as a separate thread
        thread = Thread(target=agent_worker_function, args=(handle, groupId, groupInfo, status_handle))
        thread.start()
    
        ##########################################
        # Any other useful work can be placed here
        ##########################################
        
        thread.join()
        print "Worker thread completed"
        
        ## Destroy the group
        ret = dcgm_agent.dcgmGroupDestroy(handle, groupId)
        assert(ret == dcgm_structs.DCGM_ST_OK), "Failed to remove the test group, error: %s" % ret 
    
        ## Destroy the status handle
        ret = dcgm_agent.dcgmStatusDestroy(status_handle)
        assert(ret == dcgm_structs.DCGM_ST_OK), "Failed to remove status handler, error: %s" % ret
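Examples #1 and #5 show the same flow written against two generations of the Python bindings: the older one returns DCGM_ST_* status codes that the caller must assert on, while the newer one raises dcgm_structs.DCGMError directly. When a call does hand back a raw status code, the checker used in Example #2 converts it into an exception:

ret = dcgm_agent.dcgmGroupDestroy(handle, groupId)
dcgm_structs._dcgmCheckReturn(ret)   # raises a DCGMError subclass if ret != DCGM_ST_OK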
Example #6
        ## Run a validation on the group to make sure everything is ready to run.
        ## currently this calls an outside diagnostic binary but eventually
        ## that binary will be merged into the DCGM framework
        ## The "response" is a dcgmDiagResponse structure that can be parsed for errors
        response = dcgm_agent.dcgmActionValidate_v2(handle, runDiagInfo)

        ## This will perform an "epilogue" diagnostic that will stress the system
        ## Currently commented out because it takes several minutes to execute
        # runDiagInfo.validate = dcgm_structs.DCGM_POLICY_VALID_SV_LONG
        # response = dcgm_agent.dcgmActionValidate_v2(handle, runDiagInfo)

        ## Prime the policy manager to look for ECC and PCIe events.
        ## If a violation occurs, the registered callback is called. Currently the data returned
        ## corresponds to the error that occurred (PCI, DBE, etc.), but in the future it will be
        ## a dcgmPolicyViolation_t or similar
        ret = dcgm_agent.dcgmPolicyRegister(
            handle, runDiagInfo.groupId,
            dcgm_structs.DCGM_POLICY_COND_PCI | dcgm_structs.DCGM_POLICY_COND_DBE,
            None, c_callback)

        ## trigger the policy loop
        ## typically this would be looped in a separate thread or called on demand
        ret = dcgm_agent.dcgmPolicyTrigger(handle)

        ## Destroy the group
        try:
            dcgm_agent.dcgmGroupDestroy(handle, runDiagInfo.groupId)
        except dcgm_structs.DCGMError as e:
            print("Failed to remove the test group, error: %s" % e,
                  file=sys.stderr)
            sys.exit(1)
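The c_callback passed to dcgmPolicyRegister is a ctypes function pointer wrapping a Python callable, defined earlier in the file this example was taken from. A minimal sketch of such a callback; the exact prototype is defined by the real bindings, so the c_void_p signature below is an assumption for illustration only:

from ctypes import CFUNCTYPE, c_void_p

@CFUNCTYPE(None, c_void_p)           # assumed prototype; see dcgm_structs for the real one
def c_callback(data):
    ## Hypothetical handler: a real agent would decode `data` into the
    ## violation details (PCI replay count, double-bit ECC error, etc.)
    print("Policy violation reported by DCGM")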