Code example #1
def test_dcgm_vgpu_config_set_validate(handle):
    """
    Validates structure version
    """

    groupId = dcgm_agent.dcgmGroupCreate(handle,
                                         dcgm_structs.DCGM_GROUP_DEFAULT,
                                         "test1")
    status_handle = dcgm_agent.dcgmStatusCreate()
    config_values = dcgm_structs.c_dcgmDeviceConfig_v1()

    with test_utils.assert_raises(
            dcgmExceptionClass(dcgm_structs.DCGM_ST_VER_MISMATCH)):
        versionTest = 0  #invalid version
        ret = vtDcgmVgpuConfigSet(handle, groupId, config_values,
                                  status_handle, versionTest)

    with test_utils.assert_raises(
            dcgmExceptionClass(dcgm_structs.DCGM_ST_VER_MISMATCH)):
        versionTest = 50  #random invalid version
        ret = vtDcgmVgpuConfigSet(handle, groupId, config_values,
                                  status_handle, versionTest)
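
Both negative cases above rely on test_utils.assert_raises to verify that the call fails with the expected DCGM exception class. A minimal sketch of what such a context manager presumably looks like (hypothetical re-implementation for illustration; the real helper lives in the project's test_utils module):

import contextlib

@contextlib.contextmanager
def assert_raises(expected_exc_type):
    # Hypothetical stand-in for test_utils.assert_raises: the managed block
    # must raise exactly the expected exception type, otherwise fail.
    try:
        yield
    except expected_exc_type:
        pass  # the expected exception was raised
    else:
        raise AssertionError("Expected %s to be raised"
                             % expected_exc_type.__name__)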
Code example #2
File: test_connection.py Project: omertuc/DCGM
def test_dcgm_connection_client_cleanup(handle, gpuIds):
    '''
    Make sure that resources that were allocated by a client are cleaned up
    '''
    fieldGroupFieldIds = [
        dcgm_fields.DCGM_FI_DEV_GPU_TEMP,
    ]

    #Get a 2nd connection which we'll check for cleanup. Use the raw APIs so we can explicitly cleanup
    connectParams = dcgm_structs.c_dcgmConnectV2Params_v1()
    connectParams.version = dcgm_structs.c_dcgmConnectV2Params_version
    connectParams.persistAfterDisconnect = 0
    cleanupHandle = dcgm_agent.dcgmConnect_v2('localhost', connectParams)

    groupName = 'clientcleanupgroup'
    groupId = dcgm_agent.dcgmGroupCreate(cleanupHandle,
                                         dcgm_structs.DCGM_GROUP_EMPTY,
                                         groupName)

    fieldGroupName = 'clientcleanupfieldgroup'
    fieldGroupId = dcgm_agent.dcgmFieldGroupCreate(cleanupHandle,
                                                   fieldGroupFieldIds,
                                                   fieldGroupName)

    #Disconnect our second handle. This should cause the cleanup to occur
    dcgm_agent.dcgmDisconnect(cleanupHandle)

    time.sleep(1.0)  #Allow connection cleanup to occur since it's asynchronous

    #Try to retrieve the field group info. This should throw an exception
    with test_utils.assert_raises(
            dcgm_structs.dcgmExceptionClass(dcgm_structs.DCGM_ST_NO_DATA)):
        fieldGroupInfo = dcgm_agent.dcgmFieldGroupGetInfo(handle, fieldGroupId)

    #Try to retrieve the group info. This should throw an exception
    with test_utils.assert_raises(
            dcgm_structs.dcgmExceptionClass(
                dcgm_structs.DCGM_ST_NOT_CONFIGURED)):
        groupInfo = dcgm_agent.dcgmGroupGetInfo(handle, groupId)
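
The cleanup behavior hinges on persistAfterDisconnect being 0. For contrast, a sketch of the opposite setup, using only the calls already shown in this test; with persistence enabled, allocations made on the connection should survive the disconnect (behavior implied by this test, not verified here):

connectParams = dcgm_structs.c_dcgmConnectV2Params_v1()
connectParams.version = dcgm_structs.c_dcgmConnectV2Params_version
connectParams.persistAfterDisconnect = 1  # keep allocations after disconnect
persistHandle = dcgm_agent.dcgmConnect_v2('localhost', connectParams)
groupId = dcgm_agent.dcgmGroupCreate(persistHandle,
                                     dcgm_structs.DCGM_GROUP_EMPTY,
                                     'persistentgroup')
dcgm_agent.dcgmDisconnect(persistHandle)
# With persistence on, dcgmGroupGetInfo(handle, groupId) from another
# connection should still succeed rather than raise DCGM_ST_NOT_CONFIGURED.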
Code example #3
def test_dcgm_run_diagnostic_validate(handle, gpuIds):
    """
    Validates structure version
    """
    handleObj = pydcgm.DcgmHandle(handle=handle)
    systemObj = handleObj.GetSystem()
    gpuIdList = systemObj.discovery.GetAllGpuIds()
    assert len(gpuIdList) > 0, \
        "Not able to find devices on the node for embedded case"

    groupId = dcgm_agent.dcgmGroupCreate(handle,
                                         dcgm_structs.DCGM_GROUP_DEFAULT,
                                         "test1")
    groupInfo = dcgm_agent.dcgmGroupGetInfo(handle, groupId)
    status_handle = dcgm_agent.dcgmStatusCreate()

    diagLevel = dcgm_structs.DCGM_DIAG_LVL_SHORT

    gpuIdStr = ""
    for i, gpuId in enumerate(gpuIds):
        if i > 0:
            gpuIdStr += ","
        gpuIdStr += str(gpuId)

    drd = dcgm_structs.c_dcgmRunDiag_t()
    drd.version = dcgm_structs.dcgmRunDiag_version
    drd.validate = dcgm_structs.DCGM_POLICY_VALID_SV_SHORT
    drd.groupId = groupId
    drd.gpuList = gpuIdStr

    with test_utils.assert_raises(
            dcgmExceptionClass(dcgm_structs.DCGM_ST_VER_MISMATCH)):
        versionTest = 0  #invalid version
        ret = vtDcgmActionValidate_v2(handle, drd, versionTest)

    with test_utils.assert_raises(
            dcgmExceptionClass(dcgm_structs.DCGM_ST_VER_MISMATCH)):
        versionTest = 50  #random number version
        ret = vtDcgmActionValidate_v2(handle, drd, versionTest)

    with test_utils.assert_raises(
            dcgmExceptionClass(dcgm_structs.DCGM_ST_VER_MISMATCH)):
        versionTest = 0  #invalid version
        ret = vtDcgmActionValidate(handle, drd.groupId, drd.validate,
                                   versionTest)

    with test_utils.assert_raises(
            dcgmExceptionClass(dcgm_structs.DCGM_ST_VER_MISMATCH)):
        versionTest = 50  #random number version
        ret = vtDcgmActionValidate(handle, drd.groupId, drd.validate,
                                   versionTest)

    with test_utils.assert_raises(
            dcgmExceptionClass(dcgm_structs.DCGM_ST_VER_MISMATCH)):
        versionTest = 0  #invalid version
        ret = vtDcgmRunDiagnostic(handle, drd.groupId, diagLevel, versionTest)

    with test_utils.assert_raises(
            dcgmExceptionClass(dcgm_structs.DCGM_ST_VER_MISMATCH)):
        versionTest = 50  #random number version
        ret = vtDcgmRunDiagnostic(handle, drd.groupId, diagLevel, versionTest)
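
The manual loop that builds gpuIdStr above can be collapsed into a single join; a one-line equivalent using only what the test already has in scope:

drd.gpuList = ",".join(str(gpuId) for gpuId in gpuIds)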
Code example #4
File: dcgm_config_settings.py Project: omertuc/DCGM
## Entry point for this script
if __name__ == "__main__":
    
    ## Initialize the DCGM Engine in manual operation mode. This implies that its execution is
    ## controlled by the monitoring agent. The user has to periodically call APIs such as
    ## dcgmEnginePolicyTrigger and dcgmEngineUpdateAllFields, which tell DCGM to wake up and
    ## perform the data collection and operations needed for policy management.
    with RunDCGM('127.0.0.1', dcgm_structs.DCGM_OPERATION_MODE_MANUAL) as handle:
    
        ## Create a default group (the default group comprises all of the GPUs on the node).
        ## Let's call the group "all_gpus_group". The method returns an opaque handle (groupId)
        ## that identifies the newly created group.
        groupId = dcgm_agent.dcgmGroupCreate(handle, dcgm_structs.DCGM_GROUP_DEFAULT, "all_gpus_group")
        
        ## Invoke method to get information on the newly created group
        groupInfo = dcgm_agent.dcgmGroupGetInfo(handle, groupId)
        
        ## Create a reference to a DCGM status handle, which can be used to get the statuses of
        ## multiple operations on one or more devices present in the group
        status_handle = dcgm_agent.dcgmStatusCreate()
        
        ## The worker function can be executed as a separate thread or as part of the main thread.
        ## Executed as a separate thread here
        thread = Thread(target = agent_worker_function, args = (handle, groupId, groupInfo, status_handle))
        thread.start()
    
        ##########################################
        # Any other useful work can be placed here
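
The script starts agent_worker_function in a thread, but its body falls outside this excerpt. In manual operation mode the worker has to drive DCGM itself; a hypothetical sketch of such a loop, using dcgmUpdateAllFields as seen in example #7 (the real function in dcgm_config_settings.py may differ):

def agent_worker_function(handle, groupId, groupInfo, status_handle):
    # Hypothetical body: in DCGM_OPERATION_MODE_MANUAL nothing is collected
    # unless the agent explicitly wakes the engine up, so poll in a loop.
    for _ in range(10):
        dcgm_agent.dcgmUpdateAllFields(handle, 1)  # 1 = wait for the update to complete
        sleep(2)  # assumes "from time import sleep" at module scope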
Code example #5
validDevices = list()
for x in devices:
    fvSupported = dcgm_agent_internal.dcgmGetLatestValuesForFields(
        handle, x, [
            dcgm_fields.DCGM_FI_DEV_RETIRED_DBE,
        ])
    if (fvSupported[0].value.i64 != dcgmvalue.DCGM_INT64_NOT_SUPPORTED):
        validDevices.append(x)

if len(validDevices) == 0:
    print("Can only run if at least one GPU with ECC is present")
    sys.exit(1)

print("Number of valid devices: %d" % len(validDevices))

groupId = dcgm_agent.dcgmGroupCreate(handle, dcgm_structs.DCGM_GROUP_EMPTY,
                                     "test1")
statusHandle = dcgm_agent.dcgmStatusCreate()

for device in validDevices:
    ret = dcgm_agent.dcgmGroupAddDevice(handle, groupId, device)
    assert (ret == dcgm_structs.DCGM_ST_OK)

## Get attributes for all the devices
attributesForDevices = list()
for device in validDevices:
    attributes = dcgm_agent.dcgmGetDeviceAttributes(handle, device)
    attributesForDevices.append(attributes)

assert len(
    attributesForDevices) != 0, "Can't get attributes for all the devices"
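
The per-device support probe at the top of this example can also be written as a comprehension; an equivalent of the filtering loop, using the same calls:

validDevices = [
    x for x in devices
    if dcgm_agent_internal.dcgmGetLatestValuesForFields(
        handle, x, [dcgm_fields.DCGM_FI_DEV_RETIRED_DBE])[0].value.i64
    != dcgmvalue.DCGM_INT64_NOT_SUPPORTED
]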
Code example #6
def test_dcgm_configure_ecc_mode(handle, gpuIds):
    test_utils.skip_test("Skipping this test until bug 200377294 is fixed")

    groupId = dcgm_agent.dcgmGroupCreate(handle, dcgm_structs.DCGM_GROUP_EMPTY,
                                         "test1")

    validDevice = -1
    for x in gpuIds:
        fvSupported = dcgm_agent_internal.dcgmGetLatestValuesForFields(
            handle, x, [
                dcgm_fields.DCGM_FI_DEV_ECC_CURRENT,
            ])
        if (fvSupported[0].value.i64 != dcgmvalue.DCGM_INT64_NOT_SUPPORTED):
            validDevice = x
            break

    if (validDevice == -1):
        test_utils.skip_test(
            "Can only run if at least one GPU with ECC is present")

    ret = dcgm_agent.dcgmGroupAddDevice(handle, groupId, validDevice)
    assert (ret == dcgm_structs.DCGM_ST_OK
            ), "Failed to add a device to the group %d. Return %d" % (
                groupId.value, ret)

    groupInfo = dcgm_agent.dcgmGroupGetInfo(handle, groupId)

    #Create a status handle
    status_handle = dcgm_agent.dcgmStatusCreate()

    ## Get original ECC mode on the device
    config_values = dcgm_agent.dcgmConfigGet(
        handle, groupId, dcgm_structs.DCGM_CONFIG_CURRENT_STATE,
        groupInfo.count, status_handle)
    assert len(
        config_values) > 0, "Failed to get configuration using dcgmConfigGet"

    eccmodeOnGroupExisting = config_values[0].mEccMode
    if eccmodeOnGroupExisting == 0:
        eccmodeOnGroupToSet = 1
    else:
        eccmodeOnGroupToSet = 0

    #print eccmodeOnGroupExisting
    #print eccmodeOnGroupToSet

    ## Toggle the ECC mode on the group
    config_values = dcgm_structs.c_dcgmDeviceConfig_v1()
    config_values.mEccMode = eccmodeOnGroupToSet
    config_values.mPerfState.syncBoost = dcgmvalue.DCGM_INT32_BLANK
    config_values.mPerfState.targetClocks.memClock = dcgmvalue.DCGM_INT32_BLANK
    config_values.mPerfState.targetClocks.smClock = dcgmvalue.DCGM_INT32_BLANK
    config_values.mComputeMode = dcgmvalue.DCGM_INT32_BLANK
    config_values.mPowerLimit.type = dcgmvalue.DCGM_INT32_BLANK
    config_values.mPowerLimit.val = dcgmvalue.DCGM_INT32_BLANK

    #Clear the status handle to log the errors while setting the config
    ret = dcgm_agent.dcgmStatusClear(status_handle)
    assert ret == dcgm_structs.DCGM_ST_OK, "Failed to clear the status handle. Return %d" % ret

    #Any error from the set call is collected via the status handle below
    try:
        ret = dcgm_agent.dcgmConfigSet(handle, groupId, config_values,
                                       status_handle)
    except dcgm_structs.DCGMError:
        pass

    errors = helper_get_status_list(status_handle)

    if len(errors) > 0:
        for error in errors:
            if error.status == dcgm_structs.DCGM_ST_RESET_REQUIRED:
                test_utils.skip_test(
                    "Skipping the test - Unable to reset the Gpu, FieldId - %d, Return - %d"
                    % (error.fieldId, error.status))
            else:
                test_utils.skip_test(
                    "Skipping the test - Unable to set the ECC mode. FieldId - %d, Return %d"
                    % (error.fieldId, error.status))

    #Sleep after reset
    time.sleep(2)

    #Clear the status handle to log the errors while setting the config
    ret = dcgm_agent.dcgmStatusClear(status_handle)
    assert ret == dcgm_structs.DCGM_ST_OK, "Failed to clear the status handle. Return %d" % ret

    #Get the current configuration
    config_values = dcgm_agent.dcgmConfigGet(
        handle, groupId, dcgm_structs.DCGM_CONFIG_CURRENT_STATE,
        groupInfo.count, status_handle)
    assert len(
        config_values) > 0, "Failed to get configuration using dcgmConfigGet"

    fvs = dcgm_agent_internal.dcgmGetLatestValuesForFields(
        handle, validDevice, [
            dcgm_fields.DCGM_FI_DEV_ECC_PENDING,
            dcgm_fields.DCGM_FI_DEV_ECC_CURRENT
        ])
    if fvs[0].value.i64 != fvs[1].value.i64:
        logger.warning(
            "Pending ECC %d != Current ECC %d for gpuId %d. Box probably needs a reboot"
            % (fvs[0].value.i64, fvs[1].value.i64, validDevice))
    else:
        assert config_values[0].mEccMode == (eccmodeOnGroupToSet), "ECC mode %d different from the set value %d" % \
                                                                   (config_values[0].mEccMode, eccmodeOnGroupToSet)
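
helper_get_status_list drains the status handle that dcgmConfigSet populated. A sketch of what this helper presumably does, popping errors until the handle is empty (assumes a dcgmStatusPopError binding mirroring the C API; the helper's real body lives elsewhere in the project):

def helper_get_status_list(status_handle):
    # Hypothetical sketch: pop queued errors off the status handle until
    # it is empty, returning them as a list.
    errors = []
    error = dcgm_agent.dcgmStatusPopError(status_handle)
    while error is not None:
        errors.append(error)
        error = dcgm_agent.dcgmStatusPopError(status_handle)
    return errors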
Code example #7
def test_dcgm_vgpu_configure_ecc_mode(handle, gpuIds):
    test_utils.skip_test("Skipping this test until bug 200377294 is fixed")

    groupId = dcgm_agent.dcgmGroupCreate(handle, dcgm_structs.DCGM_GROUP_EMPTY,
                                         "test1")

    validDevice = -1
    for x in gpuIds:
        fvSupported = dcgm_agent_internal.dcgmGetLatestValuesForFields(
            handle, x, [dcgm_fields.DCGM_FI_DEV_RETIRED_DBE])
        if (fvSupported[0].value.i64 != dcgmvalue.DCGM_INT64_NOT_SUPPORTED):
            validDevice = x
            break

    if (validDevice == -1):
        test_utils.skip_test(
            "Can only run if at least one GPU with ECC is present")

    ret = dcgm_agent.dcgmGroupAddDevice(handle, groupId, validDevice)
    assert (
        ret == dcgm_structs.DCGM_ST_OK
    ), "Failed to add a device to the group %d. Return %d" % (groupId, ret)

    groupInfo = dcgm_agent.dcgmGroupGetInfo(handle, groupId)

    #Create a status handle
    status_handle = dcgm_agent.dcgmStatusCreate()

    ## Get original ECC mode on the device
    vgpu_config_values = dcgm_agent_internal.dcgmVgpuConfigGet(
        handle, groupId, dcgm_structs.DCGM_CONFIG_CURRENT_STATE,
        groupInfo.count, status_handle)
    assert len(vgpu_config_values) > 0, \
        "Failed to get configuration using dcgmVgpuConfigGet"

    eccmodeOnGroupExisting = vgpu_config_values[0].mEccMode
    if eccmodeOnGroupExisting == 0:
        eccmodeOnGroupToSet = 1
    else:
        eccmodeOnGroupToSet = 0

    #print eccmodeOnGroupExisting
    #print eccmodeOnGroupToSet

    ## Toggle the ECC mode on the group
    vgpu_config_values = dcgm_structs.c_dcgmDeviceVgpuConfig_v1()
    vgpu_config_values.mEccMode = eccmodeOnGroupToSet
    vgpu_config_values.mComputeMode = dcgmvalue.DCGM_INT32_BLANK
    vgpu_config_values.mPowerLimit.type = dcgmvalue.DCGM_INT32_BLANK
    vgpu_config_values.mPowerLimit.val = dcgmvalue.DCGM_INT32_BLANK

    #Clear the status handle to log the errors while setting the config
    ret = dcgm_agent.dcgmStatusClear(status_handle)
    assert ret == dcgm_structs.DCGM_ST_OK, "Failed to clear the status handle. Return %d" % ret

    #Any error from the set call is collected via the status handle below
    try:
        ret = dcgm_agent_internal.dcgmVgpuConfigSet(handle, groupId,
                                                    vgpu_config_values,
                                                    status_handle)
    except dcgm_structs.DCGMError:
        pass

    errors = helper_get_status_list(status_handle)

    if len(errors) > 0:
        for error in errors:
            if error.status == dcgm_structs.DCGM_ST_RESET_REQUIRED:
                test_utils.skip_test(
                    "Skipping the test - Unable to reset the Gpu, FieldId - %d, Return - %d"
                    % (error.fieldId, error.status))
            else:
                test_utils.skip_test(
                    "Skipping the test - Unable to set the ECC mode. FieldId - %d, Return %d"
                    % (error.fieldId, error.status))

    #Sleep after reset and then apply update for it to occur
    time.sleep(2)

    dcgm_agent.dcgmUpdateAllFields(handle, 1)

    #Clear the status handle to log the errors while setting the config
    ret = dcgm_agent.dcgmStatusClear(status_handle)
    assert ret == dcgm_structs.DCGM_ST_OK, "Failed to clear the status handle. Return %d" % ret

    #Get the current configuration
    config_values = dcgm_agent_internal.dcgmVgpuConfigGet(
        handle, groupId, dcgm_structs.DCGM_CONFIG_CURRENT_STATE,
        groupInfo.count, status_handle)
    assert len(config_values) > 0, \
        "Failed to get configuration using dcgmVgpuConfigGet"

    assert config_values[0].mEccMode == (
        eccmodeOnGroupToSet), "ECC mode different from the set value"