Beispiel #1
0
def test_dcgm_action_run_diag_bad_validation(handle, gpuIds):
    """
    Check that dcgmActionValidate_v2 rejects a run-diag request whose
    validate field is out of range.

    handle - DCGM handle passed straight through to dcgmActionValidate_v2
    gpuIds - iterable of GPU ids; stored in the request as a comma-separated string

    The call is expected to raise the exception class mapped to
    DCGM_ST_BADPARAM.
    """
    gpu_list = ",".join(str(gpu_id) for gpu_id in gpuIds)

    request = dcgm_structs.c_dcgmRunDiag_t()
    request.version = dcgm_structs.dcgmRunDiag_version
    # One past DCGM_POLICY_VALID_SV_LONG is deliberately used as an invalid value.
    request.validate = dcgm_structs.DCGM_POLICY_VALID_SV_LONG + 1
    # Set explicitly in case the constructor does not zero it.
    request.groupId = 0
    request.gpuList = gpu_list

    bad_param_error = dcgm_structs.dcgmExceptionClass(dcgm_structs.DCGM_ST_BADPARAM)
    with test_utils.assert_raises(bad_param_error):
        dcgm_agent.dcgmActionValidate_v2(handle, request, dcgm_structs.dcgmRunDiag_version)
Beispiel #2
0
def helper_dcgm_action_run_diag_gpu_list(handle, gpuIds):
    '''
    Run the DCGM diagnostic against an explicit GPU ID list instead of a
    groupId and let any failure surface as an exception.

    handle - DCGM handle passed straight through to dcgmActionValidate_v2
    gpuIds - iterable of GPU ids; stored in the request as a comma-separated string
    '''
    gpu_list = ",".join(str(gpu_id) for gpu_id in gpuIds)

    request = dcgm_structs.c_dcgmRunDiag_t()
    request.version = dcgm_structs.dcgmRunDiag_version
    request.validate = dcgm_structs.DCGM_POLICY_VALID_SV_SHORT
    # Set explicitly in case the constructor does not zero it.
    request.groupId = 0
    request.gpuList = gpu_list

    # No result check is needed: this call throws an exception on error.
    dcgm_agent.dcgmActionValidate_v2(handle, request, dcgm_structs.dcgmRunDiag_version)
Beispiel #3
0
def helper_check_diag_empty_group(handle, gpuIds):
    """
    Verify that validating against an empty group raises the exception mapped
    to DCGM_ST_GROUP_IS_EMPTY, and that the same request yields a response
    once a GPU has been added to the group.

    handle - DCGM handle used both for the pydcgm wrapper and the validate call
    gpuIds - sequence of GPU ids; only the first one is added to the group
    """
    # Keep the handle wrapper bound to a local for the duration of the test.
    dcgm_handle = pydcgm.DcgmHandle(handle=handle)
    dcgm_system = dcgm_handle.GetSystem()
    group = dcgm_system.GetEmptyGroup("test1")

    request = dcgm_structs.c_dcgmRunDiag_t()
    request.version = dcgm_structs.dcgmRunDiag_version
    request.groupId = group.GetId()
    request.validate = 1

    empty_group_error = dcgm_structs.dcgmExceptionClass(
        dcgm_structs.DCGM_ST_GROUP_IS_EMPTY)
    with test_utils.assert_raises(empty_group_error):
        test_utils.action_validate_wrapper(request, handle)

    # With one GPU in the group the same request should now go through.
    group.AddGpu(gpuIds[0])
    response = test_utils.action_validate_wrapper(request, handle)
    assert response, "Should have received a response now that we have a non-empty group"
Beispiel #4
0
    def __init__(self,
                 gpuIds=None,
                 testNamesStr='',
                 paramsStr='',
                 verbose=True,
                 train=False,
                 forceTrain=False,
                 version=dcgm_structs.dcgmRunDiag_version):
        """
        Build the run-diagnostic request structure held in self.runDiagInfo.

        gpuIds       - optional iterable of GPU ids; when non-empty they are
                       stored in runDiagInfo.gpuList as a comma-separated string
        testNamesStr - '' (default), '1', '2' or '3' selects a numeric validate
                       level ('' means level 1); any other non-numeric value is
                       split on ',' and each piece is passed to AddTest()
        paramsStr    - ';'-separated parameters, each passed to AddParameter()
        verbose      - forwarded to SetVerbose()
        train        - when equal to True, flags is set to DCGM_RUN_FLAGS_TRAIN
        forceTrain   - only consulted when train is True; additionally sets
                       DCGM_RUN_FLAGS_FORCE_TRAIN
        version      - must be contained in DcgmDiag._versionMap

        Raises ValueError for an unknown version, a numeric test name other
        than 1-3, too many test names, or too many parameters.
        """
        # Reject unknown structure versions before building anything.
        if version not in DcgmDiag._versionMap:
            raise ValueError("'%s' is not a valid version for dcgmRunDiag." %
                             version)
        self.version = version

        # Version 7 has its own structure type; everything else uses the default.
        if self.version == dcgm_structs.dcgmRunDiag_version7:
            request_type = dcgm_structs.c_dcgmRunDiag_v7
        else:
            request_type = dcgm_structs.c_dcgmRunDiag_t
        self.runDiagInfo = request_type()

        self.numTests = 0
        self.numParams = 0
        self.SetVerbose(verbose)

        # An empty string defaults to a level 1 test.
        numeric_levels = (('', 1), ('1', 1), ('2', 2), ('3', 3))
        for level_name, level in numeric_levels:
            if testNamesStr == level_name:
                self.runDiagInfo.validate = level
                break
        else:
            # Any number other than 1-3 is not accepted as a test name.
            if testNamesStr.isdigit():
                raise ValueError("'%s' is not a valid test name." %
                                 testNamesStr)

            # Register each comma-separated name as an individual test.
            test_names = testNamesStr.split(',')
            name_count = len(test_names)
            if name_count > dcgm_structs.DCGM_MAX_TEST_NAMES:
                raise ValueError(
                    'DcgmDiag cannot initialize: %d test names were specified exceeding the limit of %d.' %
                    (name_count, dcgm_structs.DCGM_MAX_TEST_NAMES))

            for test_name in test_names:
                self.AddTest(test_name)

        if paramsStr != '':
            parameters = paramsStr.split(';')
            param_count = len(parameters)
            if param_count >= dcgm_structs.DCGM_MAX_TEST_PARMS:
                raise ValueError(
                    'DcgmDiag cannot initialize: %d parameters were specified, exceeding the limit of %d.' %
                    (param_count, dcgm_structs.DCGM_MAX_TEST_PARMS))

            for parameter in parameters:
                self.AddParameter(parameter)

        if train == True:
            # NOTE(review): flags is assigned here, not OR-ed, so any bits
            # already present in flags are replaced -- confirm that is intended.
            train_flags = dcgm_structs.DCGM_RUN_FLAGS_TRAIN
            if forceTrain == True:
                train_flags |= dcgm_structs.DCGM_RUN_FLAGS_FORCE_TRAIN
            self.runDiagInfo.flags = train_flags

        if gpuIds:
            self.runDiagInfo.gpuList = ",".join(str(gpu) for gpu in gpuIds)
Beispiel #5
0
def test_dcgm_run_diagnostic_validate(handle, gpuIds):
    """
    Validate structure-version checking of the diagnostic entry points.

    Each of vtDcgmActionValidate_v2, vtDcgmActionValidate and
    vtDcgmRunDiagnostic is called with two bad version numbers (0 and 50) and
    must raise the exception class mapped to DCGM_ST_VER_MISMATCH.

    handle - DCGM handle used for every call
    gpuIds - iterable of GPU ids; stored in the request as a comma-separated string
    """
    handleObj = pydcgm.DcgmHandle(handle=handle)
    systemObj = handleObj.GetSystem()
    gpuIdList = systemObj.discovery.GetAllGpuIds()
    # The previous check was `>= 0`, which is always true for a length and so
    # could never report a node without devices.
    assert len(gpuIdList) > 0, "Not able to find devices on the node for embedded case"

    groupId = dcgm_agent.dcgmGroupCreate(handle,
                                         dcgm_structs.DCGM_GROUP_DEFAULT,
                                         "test1")
    # NOTE(review): neither result below is used later in this test; the calls
    # are kept because their side effects are not visible from here.
    groupInfo = dcgm_agent.dcgmGroupGetInfo(handle, groupId)
    status_handle = dcgm_agent.dcgmStatusCreate()

    diagLevel = dcgm_structs.DCGM_DIAG_LVL_SHORT

    drd = dcgm_structs.c_dcgmRunDiag_t()
    drd.version = dcgm_structs.dcgmRunDiag_version
    drd.validate = dcgm_structs.DCGM_POLICY_VALID_SV_SHORT
    drd.groupId = groupId
    drd.gpuList = ",".join(str(gpuId) for gpuId in gpuIds)

    # 0 is an invalid version; 50 is an arbitrary number that is not a real one.
    badVersions = (0, 50)

    # One callable per entry point, each taking only the version under test.
    versionedCalls = (
        lambda versionTest: vtDcgmActionValidate_v2(handle, drd, versionTest),
        lambda versionTest: vtDcgmActionValidate(handle, drd.groupId,
                                                 drd.validate, versionTest),
        lambda versionTest: vtDcgmRunDiagnostic(handle, drd.groupId,
                                                diagLevel, versionTest),
    )

    # Same order as before: for each entry point try version 0, then version 50.
    for versionedCall in versionedCalls:
        for versionTest in badVersions:
            with test_utils.assert_raises(
                    dcgmExceptionClass(dcgm_structs.DCGM_ST_VER_MISMATCH)):
                versionedCall(versionTest)