Exemple #1
0
def helper_inject_vgpu_configuration(handle, gpuId, eccModeVal, powerLimitVal,
                                     computeModeVal):
    """
    Helper method to inject configuration to Cachemanager.

    Any of eccModeVal, powerLimitVal or computeModeVal may be None, in which
    case the corresponding field is not injected. Each injected sample is
    timestamped 1 second in the future so it supersedes any cached value.
    Asserts that every injection returns DCGM_ST_OK.
    """

    def _inject(fieldId, fieldType, value):
        # Build and inject a single field value of the given type.
        fv = dcgm_structs_internal.c_dcgmInjectFieldValue_v1()
        fv.version = dcgm_structs_internal.dcgmInjectFieldValue_version1
        fv.fieldId = fieldId
        fv.status = 0
        fv.fieldType = ord(fieldType)
        fv.ts = int((time.time() + 1) *
                    1000000.0)  # set the injected data into the future
        if fieldType == dcgm_fields.DCGM_FT_DOUBLE:
            fv.value.dbl = value
        else:
            fv.value.i64 = value

        ret = dcgm_agent_internal.dcgmInjectFieldValue(handle, gpuId, fv)
        assert (ret == dcgm_structs.DCGM_ST_OK)

    if eccModeVal is not None:
        # inject a value into Ecc Mode
        _inject(dcgm_fields.DCGM_FI_DEV_ECC_CURRENT,
                dcgm_fields.DCGM_FT_INT64, eccModeVal)

    if powerLimitVal is not None:
        # inject a value into the power management limit
        _inject(dcgm_fields.DCGM_FI_DEV_POWER_MGMT_LIMIT,
                dcgm_fields.DCGM_FT_DOUBLE, powerLimitVal)

    if computeModeVal is not None:
        # inject a value into the compute mode
        _inject(dcgm_fields.DCGM_FI_DEV_COMPUTE_MODE,
                dcgm_fields.DCGM_FT_INT64, computeModeVal)
Exemple #2
0
def test_collectd_basic_integration(handle, gpuIds):
    """
    Verifies that we can inject specific data and get that same data back
    through the collectd plugin's read path.
    """
    dcgmHandle = pydcgm.DcgmHandle(handle)
    dcgmSystem = dcgmHandle.GetSystem()

    specificFieldIds = [
        dcgm_fields.DCGM_FI_DEV_RETIRED_DBE,
        dcgm_fields.DCGM_FI_DEV_RETIRED_SBE,
        dcgm_fields.DCGM_FI_DEV_POWER_VIOLATION,
        dcgm_fields.DCGM_FI_DEV_THERMAL_VIOLATION
    ]

    fieldValues = [1, 5, 1000, 9000]

    # Inject one future-dated sample per (gpu, field) pair so the collectd
    # read below is guaranteed to see our data.
    for gpuId in gpuIds:
        for fieldId, value in zip(specificFieldIds, fieldValues):
            injected = dcgm_structs_internal.c_dcgmInjectFieldValue_v1()
            injected.version = dcgm_structs_internal.dcgmInjectFieldValue_version1
            injected.fieldId = fieldId
            injected.status = 0
            injected.fieldType = ord(dcgm_fields.DCGM_FT_INT64)
            # set the injected data into the future
            injected.ts = int((time.time() + 10) * 1000000.0)
            injected.value.i64 = value
            ret = dcgm_agent_internal.dcgmInjectFieldValue(
                handle, gpuId, injected)
            assert (ret == dcgm_structs.DCGM_ST_OK)

    gvars = collectd_tester_globals.gvars

    # Drive the plugin through its lifecycle hooks in order.
    for hook in ('config', 'init', 'read'):
        assert hook in gvars
        gvars[hook]()

    assert 'out' in gvars
    outDict = gvars['out']

    assert 'shutdown' in gvars
    #    gvars['shutdown']()

    # Verify that we can read back the fields we watch.
    for gpuId in gpuIds:
        assert str(gpuId) in outDict

        gpuDict = outDict[str(gpuId)]

        for fieldId, value in zip(specificFieldIds, fieldValues):
            fieldTag = dcgmSystem.fields.GetFieldById(fieldId).tag
            assert fieldTag in gpuDict
            assert gpuDict[fieldTag] == value
Exemple #3
0
def test_dcgmi_nvlink_nvswitches(handle, gpuIds, switchIds):
    """
    Test dcgmi to display dmon values.

    Exercises "dcgmi nvlink -s" and several "dcgmi dmon" invocations over
    both GPU and NvSwitch groups, after injecting one NvSwitch bandwidth
    sample per switch so the switch queries have data to report.
    """
    gpuGroupId = str(_create_dcgmi_group(dcgm_structs.DCGM_GROUP_DEFAULT))
    switchGroupId = str(
        _create_dcgmi_group(dcgm_structs.DCGM_GROUP_DEFAULT_NVSWITCHES))

    logger.info("Injected switch IDs:" + str(switchIds))

    _test_valid_args([["nvlink",
                       "-s"]  # Link status should work without parameters
                      ])

    # Creates a comma separated list of gpus
    allGpusCsv = ",".join(map(str, gpuIds))
    #Same for switches but predicate each one with nvswitch
    allSwitchesCsv = ",".join(map(helper_make_switch_string, switchIds))

    switchFieldId = dcgm_fields.DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_0_P00

    #Inject a value for a field for each switch so we can retrieve it
    field = dcgm_structs_internal.c_dcgmInjectFieldValue_v1()
    field.version = dcgm_structs_internal.dcgmInjectFieldValue_version1
    field.fieldId = switchFieldId
    field.status = 0
    field.fieldType = ord(dcgm_fields.DCGM_FT_INT64)
    field.ts = int((time.time() - 5) * 1000000.0)  #5 seconds ago
    field.value.i64 = 0
    for switchId in switchIds:
        ret = dcgm_agent_internal.dcgmInjectEntityFieldValue(
            handle, dcgm_fields.DCGM_FE_SWITCH, switchId, field)
        # Bug fix: the return code was previously ignored; fail fast if the
        # injection did not succeed so the dmon checks below are meaningful.
        assert (ret == dcgm_structs.DCGM_ST_OK)

    _test_valid_args([
        ["dmon", "-e", "150,155", "-c",
         "1"],  # run the dmon for default gpu group.
        ["dmon", "-e", "150,155", "-c", "1", "-g",
         gpuGroupId],  # run the dmon for a specified gpu group
        ["dmon", "-e", "150,155", "-c", "1", "-g",
         'all_gpus'],  # run the dmon for a specified group
        [
            "dmon", "-e",
            str(switchFieldId), "-c", "1", "-g", 'all_nvswitches'
        ],  # run the dmon for a specified group - Reenable after DCGM-413 is fixed
        ["dmon", "-e",
         str(switchFieldId), "-c", "1", "-g",
         switchGroupId],  # run the dmon for a specified group
        ["dmon", "-e", "150,155", "-c", "1", "-d",
         "2000"],  # run the dmon for delay mentioned and default gpu group. 
        ["dmon", "-e", "150,155", "-c", "1", "-d", "2000", "-i", allGpusCsv
         ],  # run the dmon for devices mentioned and mentioned delay.
        [
            "dmon", "-e",
            str(switchFieldId), "-c", "1", "-d", "2000", "-i", allSwitchesCsv
        ]  # run the dmon for devices mentioned and mentioned delay.
    ])
Exemple #4
0
def inject_field_value_fp64(handle, gpuId, fieldId, value, offset):
    """
    Inject a double-typed sample for fieldId on gpuId, timestamped offset
    seconds from now. Returns the status from dcgmInjectFieldValue.
    """
    injected = dcgm_structs_internal.c_dcgmInjectFieldValue_v1()
    injected.version = dcgm_structs_internal.dcgmInjectFieldValue_version1
    injected.fieldId = fieldId
    injected.status = 0
    injected.fieldType = ord(dcgm_fields.DCGM_FT_DOUBLE)
    injected.value.dbl = value
    # Timestamps are expressed in microseconds since 1970
    injected.ts = int((time.time() + offset) * 1000000.0)

    return dcgm_agent_internal.dcgmInjectFieldValue(handle, gpuId, injected)
Exemple #5
0
def inject_field_value_i64(handle, entityId, fieldId, value, offset, entityGroupId=dcgm_fields.DCGM_FE_GPU):
    """
    Inject an int64-typed sample for fieldId on the given entity (a GPU by
    default), timestamped offset seconds from now. Returns the status from
    dcgmInjectEntityFieldValue.
    """
    injected = dcgm_structs_internal.c_dcgmInjectFieldValue_v1()
    injected.version = dcgm_structs_internal.dcgmInjectFieldValue_version1
    injected.fieldId = fieldId
    injected.status = 0
    injected.fieldType = ord(dcgm_fields.DCGM_FT_INT64)
    injected.value.i64 = value
    # Timestamps are expressed in microseconds since 1970
    injected.ts = int((time.time() + offset) * 1000000.0)

    return dcgm_agent_internal.dcgmInjectEntityFieldValue(handle, entityGroupId, entityId, injected)
Exemple #6
0
def test_dcgm_policy_inject_retiredpages_standalone(handle, gpuIds):
    """ 
    Verifies that we can inject an error into the retired pages counters and receive a callback
    """
    newPolicy = dcgm_structs.c_dcgmPolicy_v1()

    newPolicy.version = dcgm_structs.dcgmPolicy_version1
    newPolicy.condition = dcgm_structs.DCGM_POLICY_COND_MAX_PAGES_RETIRED
    # parms[2] carries the retired-pages threshold; tag 1 selects the llval member
    newPolicy.parms[2].tag = 1
    newPolicy.parms[2].val.llval = 5

    # find a GPU that supports ECC and retired pages (otherwise internal test will ignore it)
    dcgmHandle = pydcgm.DcgmHandle(handle)
    dcgmSystem = dcgmHandle.GetSystem()
    group = dcgmSystem.GetGroupWithGpuIds("test1", gpuIds)

    group.policy.Set(newPolicy)

    # Register a callback that pushes its data onto a queue so we can block on it below
    callbackQueue = Queue.Queue()
    c_callback = create_c_callback(callbackQueue)
    group.policy.Register(dcgm_structs.DCGM_POLICY_COND_MAX_PAGES_RETIRED,
                          finishCallback=c_callback)

    # inject an error into ECC
    numPages = 10  # above the policy threshold of 5 set above, so a callback should fire
    field = dcgm_structs_internal.c_dcgmInjectFieldValue_v1()
    field.version = dcgm_structs_internal.dcgmInjectFieldValue_version1
    field.fieldId = dcgm_fields.DCGM_FI_DEV_RETIRED_DBE
    field.status = 0
    field.fieldType = ord(dcgm_fields.DCGM_FT_INT64)
    field.ts = int((time.time() + 60) *
                   1000000.0)  # set the injected data into the future
    field.value.i64 = numPages

    ret = dcgm_agent_internal.dcgmInjectFieldValue(handle, gpuIds[0], field)
    assert (ret == dcgm_structs.DCGM_ST_OK)

    #inject a SBE too so that the health check code gets past its internal checks
    field.fieldId = dcgm_fields.DCGM_FI_DEV_RETIRED_SBE

    ret = dcgm_agent_internal.dcgmInjectFieldValue(handle, gpuIds[0], field)
    assert (ret == dcgm_structs.DCGM_ST_OK)

    # wait for the the policy manager to call back
    try:
        callbackData = callbackQueue.get(timeout=POLICY_CALLBACK_TIMEOUT_SECS)
    except Queue.Empty:
        assert False, "Callback never happened"

    # check that the callback occurred with the correct arguments
    assert(dcgm_structs.DCGM_POLICY_COND_MAX_PAGES_RETIRED == callbackData.condition), \
            ("error callback was not for a retired pages, got: %s" % callbackData.condition)
    assert(numPages == callbackData.val.mpr.dbepages), \
            'Expected %s errors but got %s' % (numPages, callbackData.val.mpr.dbepages)
Exemple #7
0
def test_dcgm_injection_multi_fetch_remote(handle, gpuIds):
    """
    Verify that multi-fetches work with the agent
    """

    gpuId = gpuIds[0]
    NinjectValues = 10

    firstTs = get_usec_since_1970()
    lastTs = 0
    injectedValues = []

    #Inject the values we're going to fetch
    for i in range(NinjectValues):
        fvGood = dcgm_structs_internal.c_dcgmInjectFieldValue_v1()
        fvGood.version = dcgm_structs_internal.dcgmInjectFieldValue_version1
        fvGood.fieldId = dcgm_fields.DCGM_FI_DEV_ECC_PENDING
        fvGood.status = 0
        fvGood.fieldType = ord(dcgm_fields.DCGM_FT_INT64)
        # Unique, strictly increasing timestamps make the fetch ordering deterministic
        fvGood.ts = firstTs + i
        fvGood.value.i64 = 1 + i

        #This will throw an exception if it fails
        dcgm_agent_internal.dcgmInjectFieldValue(handle, gpuId, fvGood)

        injectedValues.append(fvGood)

    # NOTE: fvGood intentionally outlives the loop; it refers to the last injected value here
    lastTs = fvGood.ts

    #Fetch in forward order with no timestamp. Verify
    startTs = 0
    endTs = 0
    maxCount = 2 * NinjectValues  #Pick a bigger number so we can verify only NinjectValues come back
    order = dcgm_structs.DCGM_ORDER_ASCENDING
    fvFetched = dcgm_agent_internal.dcgmGetMultipleValuesForField(
        handle, gpuId, fvGood.fieldId, maxCount, startTs, endTs, order)

    assert len(fvFetched) == NinjectValues, "Expected %d rows. Got %d" % (
        NinjectValues, len(fvFetched))
    helper_verify_multi_values(fvFetched, order, injectedValues)

    #Now do the same fetch with descending values
    startTs = 0
    endTs = 0
    maxCount = 2 * NinjectValues  #Pick a bigger number so we can verify only NinjectValues come back
    order = dcgm_structs.DCGM_ORDER_DESCENDING
    fvFetched = dcgm_agent_internal.dcgmGetMultipleValuesForField(
        handle, gpuId, fvGood.fieldId, maxCount, startTs, endTs, order)

    assert len(fvFetched) == NinjectValues, "Expected %d rows. Got %d" % (
        NinjectValues, len(fvFetched))
    helper_verify_multi_values(fvFetched, order, injectedValues)
Exemple #8
0
def helper_test_dcgm_injection_summaries(handle, gpuIds):
    """
    Injects 10 increasing SBE-aggregate samples and verifies the summary
    values (min/max/avg/diff) that dcgmGetFieldSummary computes over them.
    """

    gpuId = gpuIds[0]

    # Watch the field we're inserting into
    dcgm_agent_internal.dcgmWatchFieldValue(
        handle, gpuId, dcgm_fields.DCGM_FI_DEV_ECC_SBE_AGG_TOTAL, 1, 3600.0,
        10000)

    handleObj = pydcgm.DcgmHandle(handle=handle)
    systemObj = handleObj.GetSystem()

    #Make a base value that is good for starters
    field = dcgm_structs_internal.c_dcgmInjectFieldValue_v1()
    field.version = dcgm_structs_internal.dcgmInjectFieldValue_version1
    field.fieldId = dcgm_fields.DCGM_FI_DEV_ECC_SBE_AGG_TOTAL
    field.status = 0
    field.fieldType = ord(dcgm_fields.DCGM_FT_INT64)

    baseTime = get_usec_since_1970()

    # Inject values 0..9 at consecutive microsecond timestamps starting at baseTime
    for i in range(0, 10):
        field.ts = baseTime + i
        field.value.i64 = i
        ret = dcgm_agent_internal.dcgmInjectFieldValue(handle, gpuId, field)
        assert (ret == dcgm_structs.DCGM_ST_OK)

    time.sleep(1)

    systemObj.UpdateAllFields(1)

    tmpMask = dcgm_structs.DCGM_SUMMARY_MIN | dcgm_structs.DCGM_SUMMARY_MAX
    tmpMask = tmpMask | dcgm_structs.DCGM_SUMMARY_AVG | dcgm_structs.DCGM_SUMMARY_DIFF
    # Pass baseTime for the start to get nothing from the first query
    with test_utils.assert_raises(
            dcgm_structs.dcgmExceptionClass(dcgm_structs.DCGM_ST_NO_DATA)):
        request = dcgm_agent.dcgmGetFieldSummary(
            handle, dcgm_fields.DCGM_FI_DEV_ECC_SBE_AGG_TOTAL,
            dcgm_fields.DCGM_FE_GPU, gpuId, tmpMask, baseTime - 60,
            baseTime - 30)

    # Now adjust the time so we get values
    request = dcgm_agent.dcgmGetFieldSummary(
        handle, dcgm_fields.DCGM_FI_DEV_ECC_SBE_AGG_TOTAL,
        dcgm_fields.DCGM_FE_GPU, gpuId, tmpMask, 0, 0)
    assert (request.response.values[0].i64 == 0)  # min of 0..9
    assert (request.response.values[1].i64 == 9)  # max of 0..9
    assert (request.response.values[2].i64 == 4)  # integer average of 0..9
    assert (request.response.values[3].i64 == 9)  # diff: last - first
Exemple #9
0
def test_dcgm_policy_inject_nvlinkerror_standalone(handle, gpuIds):
    """ 
    Verifies that we can inject an error into the NVLINK error and receive a callback
    """
    newPolicy = dcgm_structs.c_dcgmPolicy_v1()

    newPolicy.version = dcgm_structs.dcgmPolicy_version1
    newPolicy.condition = dcgm_structs.DCGM_POLICY_COND_NVLINK
    # parms[5] enables the NVLINK condition; tag 0 selects the boolean member
    newPolicy.parms[5].tag = 0
    newPolicy.parms[5].val.boolean = True

    # find a GPU that supports nvlink (otherwise internal test will ignore it)
    dcgmHandle = pydcgm.DcgmHandle(handle)
    dcgmSystem = dcgmHandle.GetSystem()
    group = dcgmSystem.GetGroupWithGpuIds('test1', gpuIds)
    group.policy.Set(newPolicy)

    # Register a callback that pushes its data onto a queue so we can block on it below
    callbackQueue = Queue.Queue()
    c_callback = create_c_callback(callbackQueue)
    group.policy.Register(dcgm_structs.DCGM_POLICY_COND_NVLINK,
                          finishCallback=c_callback)

    # Build a CRC FLIT error sample that should trip the policy
    field = dcgm_structs_internal.c_dcgmInjectFieldValue_v1()
    field.version = dcgm_structs_internal.dcgmInjectFieldValue_version1
    field.fieldId = dcgm_fields.DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_TOTAL
    field.status = 0
    field.fieldType = ord(dcgm_fields.DCGM_FT_INT64)
    field.ts = int((time.time() + 60) *
                   1000000.0)  # set the injected data into the future
    field.value.i64 = 1

    ret = dcgm_agent_internal.dcgmInjectFieldValue(handle, gpuIds[0], field)
    assert (ret == dcgm_structs.DCGM_ST_OK)

    # wait for the the policy manager to call back
    try:
        callbackData = callbackQueue.get(timeout=POLICY_CALLBACK_TIMEOUT_SECS)
    except Queue.Empty:
        assert False, "Callback never happened"

    # check that the callback occurred with the correct arguments
    assert(dcgm_structs.DCGM_POLICY_COND_NVLINK == callbackData.condition), \
            ("NVLINK error callback was not for a NVLINK error, got: %s" % callbackData.condition)
    assert(dcgm_fields.DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_TOTAL == callbackData.val.nvlink.fieldId), \
            ("Expected 130 fieldId but got %s" % callbackData.val.nvlink.fieldId)
    assert (
        1 == callbackData.val.nvlink.counter
    ), 'Expected 1 PCI error but got %s' % callbackData.val.nvlink.counter
Exemple #10
0
def helper_dcgm_policy_inject_eccerror(handle, gpuIds):
    """ 
    Verifies that we can inject an error into the ECC counters and receive a callback
    """
    newPolicy = dcgm_structs.c_dcgmPolicy_v1()
    newPolicy.version = dcgm_structs.dcgmPolicy_version1
    newPolicy.condition = dcgm_structs.DCGM_POLICY_COND_DBE
    # parms[0] enables the DBE condition; tag 0 selects the boolean member
    newPolicy.parms[0].tag = 0
    newPolicy.parms[0].val.boolean = True

    dcgmHandle = pydcgm.DcgmHandle(handle)
    dcgmSystem = dcgmHandle.GetSystem()
    group = dcgmSystem.GetGroupWithGpuIds("test1", gpuIds)
    group.policy.Set(newPolicy)

    # the order of the callbacks will change once implementation is complete
    callbackQueue = Queue.Queue()
    c_callback = create_c_callback(callbackQueue)
    group.policy.Register(dcgm_structs.DCGM_POLICY_COND_DBE, c_callback, None)

    # inject an error into ECC
    field = dcgm_structs_internal.c_dcgmInjectFieldValue_v1()
    field.version = dcgm_structs_internal.dcgmInjectFieldValue_version1
    field.fieldId = dcgm_fields.DCGM_FI_DEV_ECC_DBE_VOL_DEV
    field.status = 0
    field.fieldType = ord(dcgm_fields.DCGM_FT_INT64)
    field.ts = int((time.time() + 60) *
                   1000000.0)  # set the injected data into the future
    field.value.i64 = 1
    logger.debug("injecting %s for gpuId %d" % (str(field), gpuIds[0]))

    ret = dcgm_agent_internal.dcgmInjectFieldValue(handle, gpuIds[0], field)
    assert (ret == dcgm_structs.DCGM_ST_OK)

    # wait for the the policy manager to call back
    try:
        callbackData = callbackQueue.get(timeout=POLICY_CALLBACK_TIMEOUT_SECS)
    except Queue.Empty:
        assert False, "Callback never happened"

    # check that the callback occurred with the correct arguments
    assert(dcgm_structs.DCGM_POLICY_COND_DBE == callbackData.condition), \
            ("error callback was not for a DBE error, got: %s" % callbackData.condition)
    assert (
        1 == callbackData.val.dbe.numerrors
    ), 'Expected 1 DBE error but got %s' % callbackData.val.dbe.numerrors
    # The injected field was the device-wide DBE counter, so the reported location must be DEVICE
    assert(dcgm_structs.c_dcgmPolicyConditionDbe_t.LOCATIONS['DEVICE'] == callbackData.val.dbe.location), \
        'got: %s' % callbackData.val.dbe.location
Exemple #11
0
def helper_dcgm_policy_inject_pcierror(handle, gpuIds):
    """ 
    Verifies that we can inject an error into the PCI counters and receive a callback
    """
    newPolicy = dcgm_structs.c_dcgmPolicy_v1()

    newPolicy.version = dcgm_structs.dcgmPolicy_version1
    newPolicy.condition = dcgm_structs.DCGM_POLICY_COND_PCI
    # parms[1] carries the PCI replay threshold; tag 1 selects the llval member.
    # Threshold of 0 means any replay triggers the callback.
    newPolicy.parms[1].tag = 1
    newPolicy.parms[1].val.llval = 0

    gpuId = gpuIds[0]

    group = pydcgm.DcgmGroup(pydcgm.DcgmHandle(handle),
                             groupName="test1",
                             groupType=dcgm_structs.DCGM_GROUP_EMPTY)
    group.AddGpu(gpuId)
    group.policy.Set(newPolicy)

    # Register a callback that pushes its data onto a queue so we can block on it below
    callbackQueue = Queue.Queue()
    c_callback = create_c_callback(callbackQueue)
    group.policy.Register(dcgm_structs.DCGM_POLICY_COND_PCI,
                          finishCallback=c_callback)

    # Inject a single PCIe replay-counter sample to trip the policy
    field = dcgm_structs_internal.c_dcgmInjectFieldValue_v1()
    field.version = dcgm_structs_internal.dcgmInjectFieldValue_version1
    field.fieldId = dcgm_fields.DCGM_FI_DEV_PCIE_REPLAY_COUNTER
    field.status = 0
    field.fieldType = ord(dcgm_fields.DCGM_FT_INT64)
    field.ts = int((time.time() + 60) *
                   1000000.0)  # set the injected data into the future
    field.value.i64 = 1

    ret = dcgm_agent_internal.dcgmInjectFieldValue(handle, gpuId, field)
    assert (ret == dcgm_structs.DCGM_ST_OK)

    # wait for the the policy manager to call back
    try:
        callbackData = callbackQueue.get(timeout=POLICY_CALLBACK_TIMEOUT_SECS)
    except Queue.Empty:
        assert False, "Callback never happened"

    # check that the callback occurred with the correct arguments
    assert(dcgm_structs.DCGM_POLICY_COND_PCI == callbackData.condition), \
            ("PCI error callback was not for a PCI error, got: %s" % callbackData.condition)
    assert (1 == callbackData.val.pci.counter
            ), 'Expected 1 PCI error but got %s' % callbackData.val.pci.counter
Exemple #12
0
def test_reading_specific_data(handle, gpuIds):
    """
    Verifies that we can inject specific data and get that same data back
    via DcgmReader.
    """

    dcgmHandle = pydcgm.DcgmHandle(handle)
    dcgmSystem = dcgmHandle.GetSystem()

    specificFieldIds = [
        dcgm_fields.DCGM_FI_DEV_RETIRED_DBE,
        dcgm_fields.DCGM_FI_DEV_POWER_VIOLATION,
        dcgm_fields.DCGM_FI_DEV_THERMAL_VIOLATION,
    ]
    fieldValues = [
        1,
        1000,
        9000,
    ]

    targetGpuId = gpuIds[0]

    # Inject one future-dated sample per field on the first GPU.
    for fieldId, value in zip(specificFieldIds, fieldValues):
        injected = dcgm_structs_internal.c_dcgmInjectFieldValue_v1()
        injected.version = dcgm_structs_internal.dcgmInjectFieldValue_version1
        injected.fieldId = fieldId
        injected.status = 0
        injected.fieldType = ord(dcgm_fields.DCGM_FT_INT64)
        # set the injected data into the future
        injected.ts = int((time.time() + 10) * 1000000.0)
        injected.value.i64 = value
        ret = dcgm_agent_internal.dcgmInjectFieldValue(handle, targetGpuId,
                                                       injected)
        assert (ret == dcgm_structs.DCGM_ST_OK)

    # pylint: disable=undefined-variable
    dr = DcgmReader(fieldIds=specificFieldIds)
    dr.SetHandle(handle)
    latest = dr.GetLatestGpuValuesAsFieldIdDict()

    # Every watched field should be present, with the exact value we injected.
    assert len(latest[targetGpuId]) == len(specificFieldIds)

    for fieldId, value in zip(specificFieldIds, fieldValues):
        assert latest[targetGpuId][fieldId] == value
Exemple #13
0
def helper_test_dcgm_policy_inject_xiderror(handle, gpuIds):
    """ 
    Verifies that we can inject an XID error and receive a callback
    """
    newPolicy = dcgm_structs.c_dcgmPolicy_v1()

    newPolicy.version = dcgm_structs.dcgmPolicy_version1
    newPolicy.condition = dcgm_structs.DCGM_POLICY_COND_XID
    # parms[6] enables the XID condition; tag 0 selects the boolean member
    newPolicy.parms[6].tag = 0
    newPolicy.parms[6].val.boolean = True

    # Find the first GPU whose XID field is supported; skip the test if none is
    dcgmHandle = pydcgm.DcgmHandle(handle)
    validDeviceId = -1
    devices = gpuIds
    for x in devices:
        fvSupported = dcgm_agent_internal.dcgmGetLatestValuesForFields(
            handle, x, [
                dcgm_fields.DCGM_FI_DEV_XID_ERRORS,
            ])
        if (fvSupported[0].value.i64 != dcgmvalue.DCGM_INT64_NOT_SUPPORTED):
            validDeviceId = x
            break
    if (validDeviceId == -1):
        test_utils.skip_test(
            "Can only run if at least one GPU that supports XID errors is present"
        )

    group = pydcgm.DcgmGroup(dcgmHandle,
                             groupName="test1",
                             groupType=dcgm_structs.DCGM_GROUP_EMPTY)
    group.AddGpu(validDeviceId)
    group.policy.Set(newPolicy)

    # Register a callback that pushes its data onto a queue so we can block on it below
    callbackQueue = Queue.Queue()
    c_callback = create_c_callback(callbackQueue)
    group.policy.Register(dcgm_structs.DCGM_POLICY_COND_XID,
                          finishCallback=c_callback)

    # Inject XID error 16 to trip the policy
    field = dcgm_structs_internal.c_dcgmInjectFieldValue_v1()
    field.version = dcgm_structs_internal.dcgmInjectFieldValue_version1
    field.fieldId = dcgm_fields.DCGM_FI_DEV_XID_ERRORS
    field.status = 0
    field.fieldType = ord(dcgm_fields.DCGM_FT_INT64)
    field.ts = int((time.time() + 60) *
                   1000000.0)  # set the injected data into the future
    field.value.i64 = 16

    ret = dcgm_agent_internal.dcgmInjectFieldValue(handle, validDeviceId,
                                                   field)
    assert (ret == dcgm_structs.DCGM_ST_OK)

    # wait for the the policy manager to call back
    try:
        callbackData = callbackQueue.get(timeout=POLICY_CALLBACK_TIMEOUT_SECS)
    except Queue.Empty:
        assert False, "Callback never happened"

    # check that the callback occurred with the correct arguments
    assert(dcgm_structs.DCGM_POLICY_COND_XID == callbackData.condition), \
            ("XID error callback was not for a XID error, got: %s" % callbackData.condition)
    assert (16 == callbackData.val.xid.errnum), (
        'Expected XID error 16 but got %s' % callbackData.val.xid.errnum)
Exemple #14
0
        ), "Failed to set configuration for the group: %s" % ret
dcgm_agent.dcgmStatusClear(statusHandle)
helper_verify_power_value_standalone(handle, groupId, powerLimit_set)

ret = dcgm_agent.dcgmPolicySet(handle, groupId, newPolicy, statusHandle)
assert (ret == dcgm_structs.DCGM_ST_OK)

time.sleep(5)  # give the policy manager a chance to start

requestId = dcgm_agent.dcgmPolicyRegister(
    handle, groupId, dcgm_structs.DCGM_POLICY_COND_MAX_PAGES_RETIRED,
    c_callback, c_callback)
assert (requestId != None)

# inject an error into page retirement
field = dcgm_structs_internal.c_dcgmInjectFieldValue_v1()
field.version = dcgm_structs_internal.dcgmInjectFieldValue_version1
field.fieldId = dcgm_fields.DCGM_FI_DEV_RETIRED_DBE
field.status = 0
field.fieldType = ord(dcgm_fields.DCGM_FT_INT64)
field.ts = int(
    (time.time() + 11) * 1000000.0)  # set the injected data into the future
field.value.i64 = 10

ret = dcgm_agent_internal.dcgmInjectFieldValue(handle, validDevice, field)
assert (ret == dcgm_structs.DCGM_ST_OK)

now = time.time()

while not callbackCalled:
    if time.time() == now + 60:  # wait 60 seconds
Exemple #15
0
def test_dcgm_injection_agent(handle, gpuIds):
    """
    Verifies that injection works with the agent host engine
    """
    gpuId = gpuIds[0]

    #Make a base value that is good for starters
    fvGood = dcgm_structs_internal.c_dcgmInjectFieldValue_v1()
    fvGood.version = dcgm_structs_internal.dcgmInjectFieldValue_version1
    fvGood.fieldId = dcgm_fields.DCGM_FI_DEV_ECC_CURRENT
    fvGood.status = 0
    fvGood.fieldType = ord(dcgm_fields.DCGM_FT_INT64)
    fvGood.ts = get_usec_since_1970()
    fvGood.value.i64 = 1

    # Snapshot the cached sample count so we can verify the injection added one
    fieldInfoBefore = dcgm_agent_internal.dcgmGetCacheManagerFieldInfo(
        handle, gpuId, fvGood.fieldId)
    countBefore = fieldInfoBefore.numSamples

    #This will throw an exception if it fails
    dcgm_agent_internal.dcgmInjectFieldValue(handle, gpuId, fvGood)

    fieldInfoAfter = dcgm_agent_internal.dcgmGetCacheManagerFieldInfo(
        handle, gpuId, fvGood.fieldId)
    countAfter = fieldInfoAfter.numSamples

    assert countAfter > countBefore, "Expected countAfter %d > countBefore %d after injection" % (
        countAfter, countBefore)

    #Fetch the value we just inserted and verify its attributes are the same
    fvFetched = dcgm_agent_internal.dcgmGetLatestValuesForFields(
        handle, gpuId, [
            fvGood.fieldId,
        ])[0]
    helper_verify_fv_equal(fvFetched, fvGood)

    #Should be able to insert a null timestamp. The agent will just use "now"
    # NOTE: fvAlsoGood is an alias of fvGood (same object), so this also sets fvGood.ts = 0
    fvAlsoGood = fvGood
    fvAlsoGood.ts = 0
    #This will thrown an exception if it fails
    dcgm_agent_internal.dcgmInjectFieldValue(handle, gpuId, fvAlsoGood)

    #Now make some attributes bad and expect an error
    # NOTE: fvBad aliases fvGood too; the mutation below is undone explicitly afterwards
    fvBad = fvGood
    fvBad.fieldType = ord(dcgm_fields.DCGM_FT_DOUBLE)
    with test_utils.assert_raises(
            dcgmExceptionClass(dcgm_structs.DCGM_ST_BADPARAM)):
        dcgm_agent_internal.dcgmInjectFieldValue(handle, gpuId, fvBad)

    fvGood.fieldType = ord(dcgm_fields.DCGM_FT_INT64)
    """ TODO: DCGM-2130 - Restore this test when protobuf is removed
    #Now make some attributes bad and expect an error
    fvBad = fvGood
    fvBad.version = 0
    with test_utils.assert_raises(dcgmExceptionClass(dcgm_structs.DCGM_ST_VER_MISMATCH)):
        dcgm_agent_internal.dcgmInjectFieldValue(handle, gpuId, fvBad)

    fvGood.version = dcgm_structs_internal.dcgmInjectFieldValue_version1
    """
    # An out-of-range fieldId must be rejected with DCGM_ST_BADPARAM
    fvBad = fvGood
    fvBad.fieldId = dcgm_fields.DCGM_FI_MAX_FIELDS + 100
    with test_utils.assert_raises(
            dcgmExceptionClass(dcgm_structs.DCGM_ST_BADPARAM)):
        dcgm_agent_internal.dcgmInjectFieldValue(handle, gpuId, fvBad)
Exemple #16
0
def test_dcgmi_dmon(handle, gpuIds, switchIds, instanceIds, ciIds):
    """
    Test dcgmi to display dmon values.

    Runs "dcgmi dmon" across GPUs, GPU instances, compute instances and
    NvSwitches (valid-argument cases), then a set of malformed invocations
    that must be rejected (invalid-argument cases). One NvSwitch bandwidth
    sample is injected per switch beforehand so switch queries have data.
    """
    gpuGroupId = str(_create_dcgmi_group(dcgm_structs.DCGM_GROUP_DEFAULT))
    switchGroupId = str(
        _create_dcgmi_group(dcgm_structs.DCGM_GROUP_DEFAULT_NVSWITCHES))

    logger.info("Injected switch IDs:" + str(switchIds))

    # Creates a comma separated list of gpus
    allGpusCsv = ",".join(map(str, gpuIds))
    allInstancesCsv = ",".join(
        map(lambda x: ("instance:" + str(x)), instanceIds))
    # All compute instances
    allCisCsv = ",".join(map(lambda x: ("ci:" + str(x)), ciIds))
    #Same for switches but predicate each one with nvswitch
    allSwitchesCsv = ",".join(map(helper_make_switch_string, switchIds))

    switchFieldId = dcgm_fields.DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_0_P00

    #Inject a value for a field for each switch so we can retrieve it
    field = dcgm_structs_internal.c_dcgmInjectFieldValue_v1()
    field.version = dcgm_structs_internal.dcgmInjectFieldValue_version1
    field.fieldId = switchFieldId
    field.status = 0
    field.fieldType = ord(dcgm_fields.DCGM_FT_INT64)
    field.ts = int((time.time() - 5) * 1000000.0)  #5 seconds ago
    field.value.i64 = 0
    for switchId in switchIds:
        ret = dcgm_agent_internal.dcgmInjectEntityFieldValue(
            handle, dcgm_fields.DCGM_FE_SWITCH, switchId, field)
        # Bug fix: the return code was previously ignored; fail fast if the
        # injection did not succeed so the dmon checks below are meaningful.
        assert (ret == dcgm_structs.DCGM_ST_OK)

    _test_valid_args([
        ["dmon", "-e", "150,155", "-c",
         "1"],  # run the dmon for default gpu group.
        ["dmon", "-e", "150,155", "-c", "1", "-g",
         gpuGroupId],  # run the dmon for a specified gpu group
        ["dmon", "-e", "150,155", "-c", "1", "-g",
         'all_gpus'],  # run the dmon for a specified group
        [
            "dmon", "-e",
            str(switchFieldId), "-c", "1", "-g", 'all_nvswitches'
        ],  # run the dmon for a specified group - Reenable after DCGM-413 is fixed
        ["dmon", "-e",
         str(switchFieldId), "-c", "1", "-g",
         switchGroupId],  # run the dmon for a specified group
        ["dmon", "-e", "150,155", "-c", "1", "-d",
         "2000"],  # run the dmon for delay mentioned and default gpu group. 
        ["dmon", "-e", "150,155", "-c", "1", "-d", "2000", "-i", allGpusCsv
         ],  # run the dmon for devices mentioned and mentioned delay.
        [
            "dmon", "-e", "150,155", "-c", "1", "-d", "2000", "-i",
            allInstancesCsv
        ],
        ["dmon", "-e", "150,155", "-c", "1", "-d", "2000", "-i", allCisCsv],
        [
            "dmon", "-e", "150,155", "-c", "1", "-d", "2000", "-i",
            allGpusCsv + "," + allInstancesCsv + "," + allCisCsv
        ],
        [
            "dmon", "-e",
            str(switchFieldId), "-c", "1", "-d", "2000", "-i", allSwitchesCsv
        ]  # run the dmon for devices mentioned and mentioned delay.
    ])

    #Run tests that take a gpuId as an argument
    for gpu in gpuIds:
        _test_valid_args([
            ["dmon", "-e", "150", "-c", "1", "-i",
             str(gpu)],  # run the dmon for one gpu.
            ["dmon", "-e", "150", "-c", "1", "-i",
             'gpu:' + str(gpu)],  # run the dmon for one gpu, tagged as gpu:.
            ["dmon", "-e", "150", "-c", "1", "-i",
             str(gpu)],  # run the dmon for mentioned devices and count value.
            [
                "dmon", "-e", "150,155", "-c", "1", "-i",
                str(gpu)
            ]  # run the dmon for devices mentioned, default delay and field values that are provided.
        ])

    #Run tests that take a nvSwitch as an argument
    for switchId in switchIds:
        _test_valid_args([
            [
                "dmon", "-e",
                str(switchFieldId), "-c", "1", "-i",
                'nvswitch:' + str(switchId)
            ],  # run the dmon for one nvswitch, tagged as nvswitch:.
        ])

    # A device list longer than DCGM_MAX_NUM_DEVICES must be rejected
    hugeGpuCsv = ",".join(
        map(str, range(0, dcgm_structs.DCGM_MAX_NUM_DEVICES * 2, 1)))

    _test_invalid_args([
        ["dmon", "-c", "1"],  # run without required fields.
        ["dmon", "-e", "-150", "-c", "1", "-i",
         "1"],  # run with invalid field id.
        ["dmon", "-e", "150", "-c", "1", "-i",
         "-2"],  # run with invalid gpu id.
        ["dmon", "-e", "150", "-c", "1", "-i",
         "gpu:999"],  # run with invalid gpu id.
        ["dmon", "-e", "150", "-c", "1", "-g",
         "999"],  # run with invalid group id.
        ["dmon", "-i", hugeGpuCsv, "-e", "150", "-c",
         "1"],  # run with invalid number of devices.
        ["dmon", "-i", "instance:2000", "-e", "150", "-c",
         "1"],  # run with invalid gpu_i
        ["dmon", "-i", "ci:2000", "-e", "150", "-c",
         "1"],  # run with invalid gpu_ci
        ["dmon", "-e", "150", "f", "0", "-c", "1", "-i",
         "0,1,765"],  # run with invalid device id (non existing id).
        ["dmon", "-e", "150", "-c", "-1", "-i",
         "1"],  # run with invalid count value.
        ["dmon", "-e", "150", "-c", "1", "-i", "1", "-d",
         "-1"],  # run with invalid delay (negative value).
        ["dmon", "-f", "-9", "-c", "1", "-i", "1", "-d",
         "10000"],  # run with invalid field Id.
        ["dmon", "-f", "150", "-c", "1", "-i", "0", "-d",
         "99"]  # run with invalid delay value.
    ])