Beispiel #1
0
def main():
    """Run the DCGM agent example in manual operation mode.

    Creates the default GPU group, runs ``agent_worker_function`` on a
    separate thread, waits for it to finish and then releases the group and
    the status handle.

    Raises:
        SystemExit: with exit code 1 if destroying the group or the status
            handle raises ``dcgm_structs.DCGMError``. Both cleanup steps are
            always attempted before exiting.
    """

    ## Initialize the DCGM Engine in manual operation mode. This implies that its execution is
    ## controlled by the monitoring agent. The user has to periodically call APIs such as
    ## dcgmEnginePolicyTrigger and dcgmEngineUpdateAllFields which tell DCGM to wake up and
    ## perform data collection and operations needed for policy management.
    with RunDCGM('127.0.0.1',
                 dcgm_structs.DCGM_OPERATION_MODE_MANUAL) as handle:

        ## Create a default group. (Default group is comprised of all the GPUs on the node)
        ## Let's call the group "all_gpus_group". The method returns an opaque handle (groupId) to
        ## identify the newly created group.
        groupId = dcgm_agent.dcgmGroupCreate(handle,
                                             dcgm_structs.DCGM_GROUP_DEFAULT,
                                             "all_gpus_group")

        ## Invoke method to get information on the newly created group.
        ## The result is not used below; the call is kept to demonstrate the API.
        groupInfo = dcgm_agent.dcgmGroupGetInfo(handle, groupId)

        ## Create reference to DCGM status handler which can be used to get the statuses for multiple
        ## operations on one or more devices present in the group
        status_handle = dcgm_agent.dcgmStatusCreate()

        ## The worker function can be executed as a separate thread or as part of the main thread.
        ## Executed as a separate thread here
        thread = Thread(target=agent_worker_function, args=(handle, groupId))
        thread.start()

        ##########################################
        # Any other useful work can be placed here
        ##########################################

        thread.join()
        print("Worker thread completed")

        ## Attempt both cleanup steps before exiting so that a failure to destroy the group
        ## does not leave the status handle allocated.
        cleanup_failed = False

        ## Destroy the group
        try:
            dcgm_agent.dcgmGroupDestroy(handle, groupId)
        except dcgm_structs.DCGMError as e:
            print("Failed to remove the test group, error: %s" % e,
                  file=sys.stderr)
            cleanup_failed = True

        ## Destroy the status handle
        try:
            dcgm_agent.dcgmStatusDestroy(status_handle)
        except dcgm_structs.DCGMError as e:
            print("Failed to remove status handler, error: %s" % e,
                  file=sys.stderr)
            cleanup_failed = True

        if cleanup_failed:
            sys.exit(1)
Beispiel #2
0
def helper_verify_power_value_standalone(handle, groupId, expected_power):
    """
    Helper Method to verify power value

    Fetches the current configuration of every entry in the group via
    dcgmConfigGet and asserts that each entry whose power limit is not
    DCGM_INT32_NOT_SUPPORTED has type DCGM_CONFIG_POWER_CAP_INDIVIDUAL and
    a value equal to expected_power.

    handle         -- DCGM handle passed through to the dcgm_agent calls
    groupId        -- id of the group whose configuration is checked
    expected_power -- power limit value every supported entry must report

    Raises AssertionError when a check fails. The status handle created
    here is destroyed even when a check fails.
    """
    groupInfo = dcgm_agent.dcgmGroupGetInfo(
        handle, groupId, dcgm_structs.c_dcgmGroupInfo_version2)
    status_handle = dcgm_agent.dcgmStatusCreate()

    # try/finally so a failing assertion does not leak the status handle.
    try:
        config_values = dcgm_agent.dcgmConfigGet(
            handle, groupId, dcgm_structs.DCGM_CONFIG_CURRENT_STATE,
            groupInfo.count, status_handle)
        assert len(
            config_values) > 0, "Failed to get configuration using dcgmConfigGet"

        for x in range(groupInfo.count):
            power_limit = config_values[x].mPowerLimit

            # Entries that do not support a power limit are skipped.
            if power_limit.val == dcgmvalue.DCGM_INT32_NOT_SUPPORTED:
                continue

            assert power_limit.type == dcgm_structs.DCGM_CONFIG_POWER_CAP_INDIVIDUAL, \
                "The power limit type for gpuId %d is incorrect. Returned: %d Expected :%d" \
                % (x, power_limit.type, dcgm_structs.DCGM_CONFIG_POWER_CAP_INDIVIDUAL)
            assert power_limit.val == expected_power, \
                "The power limit value for gpuID %d is incorrect. Returned: %d Expected: %d" \
                % (x, power_limit.val, expected_power)
    finally:
        ret = dcgm_agent.dcgmStatusDestroy(status_handle)

    # Checked outside the finally block so it cannot mask an earlier failure.
    assert (ret == dcgm_structs.DCGM_ST_OK
            ), "Failed to remove status handler, error: %s" % ret
Beispiel #3
0
def helper_verify_config_values_standalone(handle, groupId, expected_power, expected_ecc,
                                            expected_proc_clock, expected_mem_clock, expected_compute_mode,
                                            expected_sync_boost, expected_auto_boost):
    """
    Helper Method to verify all the values for the current configuration are as expected

    Fetches the current configuration of every entry in the group via
    dcgmConfigGet and asserts, per entry, the power limit type and value,
    sync boost, auto boost, minimum memory clock, minimum processor clock,
    compute mode and ECC mode against the expected_* arguments.

    handle  -- DCGM handle passed through to the dcgm_agent calls
    groupId -- id of the group whose configuration is checked

    Raises AssertionError when a check fails. The status handle created
    here is destroyed even when a check fails.
    """

    groupInfo = dcgm_agent.dcgmGroupGetInfo(
        handle, groupId, dcgm_structs.c_dcgmGroupInfo_version2)
    status_handle = dcgm_agent.dcgmStatusCreate()

    # try/finally so a failing assertion does not leak the status handle.
    try:
        config_values = dcgm_agent.dcgmConfigGet(
            handle, groupId, dcgm_structs.DCGM_CONFIG_CURRENT_STATE,
            groupInfo.count, status_handle)
        assert len(
            config_values) > 0, "Failed to get configuration using dcgmConfigGet"

        # range() instead of xrange() so the helper runs on Python 2 and 3.
        for x in range(groupInfo.count):
            config = config_values[x]
            power_limit = config.mPowerLimit
            perf_state = config.mPerfState

            assert power_limit.type == dcgm_structs.DCGM_CONFIG_POWER_CAP_INDIVIDUAL, \
                "The power limit type for gpuId %d is incorrect. Returned: %d Expected :%d" \
                % (x, power_limit.type, dcgm_structs.DCGM_CONFIG_POWER_CAP_INDIVIDUAL)
            assert power_limit.val == expected_power, \
                "The power limit value for gpuID %d is incorrect. Returned: %d Expected: %d" \
                % (x, power_limit.val, expected_power)

            assert perf_state.syncBoost == expected_sync_boost, \
                "The syncboost value for gpuID %d is incorrect." \
                " Returned: %d Expected: %d" \
                % (x, perf_state.syncBoost, expected_sync_boost)

            assert perf_state.autoBoost == expected_auto_boost, \
                "The autoboost value for gpuID %d is incorrect." \
                " Returned: %d Expected: %d" \
                % (x, perf_state.autoBoost, expected_auto_boost)

            # The message must read the per-entry value (config_values[x]);
            # formatting from the array itself raised AttributeError instead
            # of reporting the mismatch.
            assert perf_state.minVPState.memClk == expected_mem_clock, \
                "The min mem clock value for gpuID %d is incorrect." \
                " Returned: %d Expected: %d" \
                % (x, perf_state.minVPState.memClk, expected_mem_clock)

            assert perf_state.minVPState.procClk == expected_proc_clock, \
                "The min proc clock value for gpuID %d is incorrect." \
                " Returned: %d Expected: %d" \
                % (x, perf_state.minVPState.procClk, expected_proc_clock)

            assert config.mComputeMode == expected_compute_mode, \
                "The compute mode value for gpuID %d is incorrect." \
                " Returned: %d Expected: %d" \
                % (x, config.mComputeMode, expected_compute_mode)

            assert config.mEccMode == expected_ecc, \
                "The ecc mode value for gpuID %d is incorrect." \
                " Returned: %d Expected: %d" \
                % (x, config.mEccMode, expected_ecc)
    finally:
        ret = dcgm_agent.dcgmStatusDestroy(status_handle)

    # Checked outside the finally block so it cannot mask an earlier failure.
    assert (ret == dcgm_structs.DCGM_ST_OK
            ), "Failed to remove status handler, error: %s" % ret
Beispiel #4
0
        groupId = dcgm_agent.dcgmGroupCreate(handle, dcgm_structs.DCGM_GROUP_DEFAULT, "all_gpus_group")
        
        ## Invoke method to get information on the newly created group
        groupInfo = dcgm_agent.dcgmGroupGetInfo(handle, groupId)
        
        ## Create reference to DCGM status handler which can be used to get the statuses for multiple 
        ## operations on one or more devices present in the group
        status_handle = dcgm_agent.dcgmStatusCreate()
        
        ## The worker function can be executed as a separate thread or as part of the main thread.
        ## Executed as a separate thread here
        thread = Thread(target = agent_worker_function, args = (handle, groupId, groupInfo, status_handle))
        thread.start()
    
        ##########################################
        # Any other useful work can be placed here
        ##########################################
        
        thread.join()
        print "Worker thread completed"
        
        ## Destroy the group
        ret = dcgm_agent.dcgmGroupDestroy(handle, groupId)
        assert(ret == dcgm_structs.DCGM_ST_OK), "Failed to remove the test group, error: %s" % ret 
    
        ## Destroy the status handle
        ret = dcgm_agent.dcgmStatusDestroy(status_handle)
        assert(ret == dcgm_structs.DCGM_ST_OK), "Failed to remove status handler, error: %s" % ret
        
    
Beispiel #5
0
 def __del__(self):
     """Destroy the DCGM status handle stored in ``self.handle``."""
     destroy_status = dcgm_agent.dcgmStatusDestroy
     destroy_status(self.handle)