Esempio n. 1
0
def helper_test_blacklist_checks(handle, gpuIds):
    """Verify blacklist_recommendations.check_health reacts to an injected DBE.

    The helper runs three phases against the GPUs in
    ``blacklist_recommendations.g_gpus``:

    1. Inject a zero DBE count and run a health check; the test is skipped
       if any GPU is already unhealthy.
    2. Inject a large DBE count on ``gpuIds[0]`` with the memory and PCIe
       watches enabled; that GPU must become unhealthy while every other
       GPU stays healthy.
    3. Re-run with only the PCIe watch enabled; every GPU, including the
       one with the injected error, must be healthy.

    Args:
        handle: DCGM handle passed to the injection helper and wrapped in a
            ``DcgmHandle`` for ``check_health``.
        gpuIds: GPU ids under test; only ``gpuIds[0]`` receives injections.

    Raises:
        AssertionError: if an injection fails or a GPU's health status does
            not match the expectation for the current phase.
    """
    handleObj = DcgmHandle.DcgmHandle(handle=handle)
    settings = {
        'instant': True,
        'entity_get_flags': 0,
        'testNames': '3',
        'hostname': 'localhost',
        'watches': (dcgm_structs.DCGM_HEALTH_WATCH_MEM
                    | dcgm_structs.DCGM_HEALTH_WATCH_PCIE),
    }
    error_list = []
    injected_gpu = gpuIds[0]

    # Baseline: inject a zero DBE count (last argument is presumably a time
    # offset -- confirm against dcgm_internal_helpers) and check the status,
    # exactly as the later injection does, so a failed injection is not
    # silently ignored.
    ret = dcgm_internal_helpers.inject_field_value_i64(
        handle, injected_gpu, dcgm_fields.DCGM_FI_DEV_ECC_DBE_VOL_TOTAL, 0,
        -50)
    assert ret == dcgm_structs.DCGM_ST_OK, \
        "Could not inject the baseline DBE value on GPU %d" % injected_gpu
    blacklist_recommendations.check_health(handleObj, settings, error_list)

    # Make sure the GPUs pass a basic health test before running this test
    for gpuObj in blacklist_recommendations.g_gpus:
        if not gpuObj.IsHealthy():
            test_utils.skip_test("Skipping because GPU %d is not healthy. " %
                                 gpuObj.GetEntityId())

    # Inject a memory error and verify that we fail
    blacklist_recommendations.g_gpus = []  # Reset g_gpus

    ret = dcgm_internal_helpers.inject_field_value_i64(
        handle, injected_gpu, dcgm_fields.DCGM_FI_DEV_ECC_DBE_VOL_TOTAL, 1000,
        10)
    assert ret == dcgm_structs.DCGM_ST_OK

    blacklist_recommendations.check_health(handleObj, settings, error_list)
    for gpuObj in blacklist_recommendations.g_gpus:
        entity_id = gpuObj.GetEntityId()
        if entity_id == injected_gpu:
            assert not gpuObj.IsHealthy(), \
                "Injected error didn't trigger a failure on GPU %d" % injected_gpu
        else:
            # Report the GPU that is actually unhealthy, not the injected one.
            assert gpuObj.IsHealthy(), \
                "GPU %d reported unhealthy despite not having an inserted error: '%s'" % (
                    entity_id, gpuObj.WhyUnhealthy())

    # Remove the memory monitor and make sure we pass our checks
    blacklist_recommendations.g_gpus = []  # Reset g_gpus
    settings['watches'] = dcgm_structs.DCGM_HEALTH_WATCH_PCIE
    blacklist_recommendations.check_health(handleObj, settings, error_list)
    for gpuObj in blacklist_recommendations.g_gpus:
        entity_id = gpuObj.GetEntityId()
        if entity_id == injected_gpu:
            assert gpuObj.IsHealthy(), \
                "Injected error wasn't ignored for GPU %d: %s" % (
                    injected_gpu, gpuObj.WhyUnhealthy())
        else:
            # Report the GPU that is actually unhealthy, not the injected one.
            assert gpuObj.IsHealthy(), \
                "GPU %d reported unhealthy despite not having an inserted error: '%s'" % (
                    entity_id, gpuObj.WhyUnhealthy())
Esempio n. 2
0
def helper_test_dcgm_diag_dbe_insertion(handle, gpuIds):
    """Check that the diagnostic test fails once a DBE has been injected.

    Builds a ``DcgmDiag`` run of the 'diagnostic' test on fake GPUs, injects
    a DBE count of 1 on ``gpuIds[0]`` twice (with final arguments 5 and 15),
    executes the diagnostic, and asserts that the response covers every GPU
    and that the diagnostic failed on ``gpuIds[0]`` with a field violation.

    Args:
        handle: DCGM handle used for the injections and the diag run.
        gpuIds: GPU ids to run the diagnostic on; the first one receives the
            injected errors.

    Raises:
        AssertionError: if an injection fails or the GPU count in the
            response differs from ``len(gpuIds)``.
    """
    diag = DcgmDiag.DcgmDiag(gpuIds=gpuIds,
                             testNamesStr='diagnostic',
                             paramsStr='diagnostic.test_duration=8')
    diag.UseFakeGpus()

    # Same value injected twice; only the final argument differs per call.
    for offset in (5, 15):
        status = dcgm_internal_helpers.inject_field_value_i64(
            handle, gpuIds[0], dcgm_fields.DCGM_FI_DEV_ECC_DBE_VOL_TOTAL, 1,
            offset)
        assert status == dcgm_structs.DCGM_ST_OK, "Could not insert an error to test forced failure"

    response = diag.Execute(handle)

    expected_gpu_count = len(gpuIds)
    count_mismatch_msg = "Expected results for %d GPUs, but found %d" % (
        expected_gpu_count, response.gpuCount)
    assert response.gpuCount == expected_gpu_count, count_mismatch_msg

    diag_result_assert_fail(
        response,
        gpuIds[0],
        dcgm_structs.DCGM_DIAGNOSTIC_INDEX,
        "Expected the diagnostic test to fail because we injected a DBE",
        dcgm_errors.DCGM_FR_FIELD_VIOLATION,
    )
Esempio n. 3
0
def injection_wrapper(handle, gpuId, fieldId, value, isInt):
    """Inject ``value`` for ``fieldId`` on ``gpuId`` three times after a delay.

    After sleeping one second, the value is injected with final arguments
    0, 5 and 10 in turn, using the int64 injector when ``isInt`` is truthy
    and the fp64 injector otherwise.

    Args:
        handle: DCGM handle passed through to the injection helper.
        gpuId: id of the GPU that receives the injected value.
        fieldId: DCGM field id to inject.
        value: value to inject.
        isInt: selects ``inject_field_value_i64`` (truthy) or
            ``inject_field_value_fp64`` (falsy).

    Raises:
        AssertionError: if any injection does not return ``DCGM_ST_OK``.
    """
    # Sleep 1 second so that the insertion happens after the test run begins while not prolonging things
    time.sleep(1)

    inject = (dcgm_internal_helpers.inject_field_value_i64
              if isInt else dcgm_internal_helpers.inject_field_value_fp64)

    for offset in (0, 5, 10):
        status = inject(handle, gpuId, fieldId, value, offset)
        assert status == dcgm_structs.DCGM_ST_OK