Beispiel #1
0
def helper_check_software_page_retirements_fail_on_pending_retirements(handle, gpuId):
    """
    Verify that the software page retirement check fails while a page retirement is pending.

    The diag is executed three times against gpuId (configured via UseFakeGpus()):
      1. untouched, as a baseline - the test is skipped if this does not pass;
      2. after injecting DCGM_FI_DEV_RETIRED_PENDING = 1 - the check must fail;
      3. after resetting the injected field to 0 - the check must pass again.
    """
    diag = DcgmDiag.DcgmDiag(gpuIds=[gpuId])
    diag.UseFakeGpus()

    # Baseline: a GPU that already fails this check cannot be used for the test.
    baseline = test_utils.diag_execute_wrapper(diag, handle)
    baseline_ok = check_software_result_pass(baseline, dcgm_structs.DCGM_SWTEST_PAGE_RETIREMENT)
    if not baseline_ok:
        test_utils.skip_test("Skipping because GPU %s does not pass software page retirement test. "
                             "Please verify whether the GPU is healthy." % gpuId)

    # Simulate one pending page retirement and expect the software test to fail.
    inject_value(handle, gpuId, dcgm_fields.DCGM_FI_DEV_RETIRED_PENDING, 1, -30, True)
    with_pending = test_utils.diag_execute_wrapper(diag, handle)
    assert check_software_result_fail(with_pending, dcgm_structs.DCGM_SWTEST_PAGE_RETIREMENT), \
        "Expected software test to fail due to pending page retirements in the GPU"

    # Clear the injected value; the software test must recover.
    inject_value(handle, gpuId, dcgm_fields.DCGM_FI_DEV_RETIRED_PENDING, 0, -30, True)
    after_reset = test_utils.diag_execute_wrapper(diag, handle)
    assert check_software_result_pass(after_reset, dcgm_structs.DCGM_SWTEST_PAGE_RETIREMENT), \
        "Expected software test to pass"
Beispiel #2
0
def helper_check_software_page_retirements_fail_total_retirements(handle, gpuId):
    """
    Verify that the software page retirement check fails when too many pages are retired.

    Per the assertion messages below, the failure limit is 60 total retired pages. The diag is
    executed four times against gpuId (configured via UseFakeGpus()):
      1. untouched, as a baseline - the test is skipped if this does not pass;
      2. with 33 DBE + 33 SBE retirements injected - the check must fail;
      3. with the SBE count lowered to 25 (33 DBE still injected) - the check must pass;
      4. with both counts reset to 0 - the check must pass.
    """
    diag = DcgmDiag.DcgmDiag(gpuIds=[gpuId])
    diag.UseFakeGpus()

    # Baseline: a GPU that already fails this check cannot be used for the test.
    baseline = test_utils.diag_execute_wrapper(diag, handle)
    baseline_ok = check_software_result_pass(baseline, dcgm_structs.DCGM_SWTEST_PAGE_RETIREMENT)
    if not baseline_ok:
        test_utils.skip_test("Skipping because GPU %s does not pass software page retirement test. "
                             "Please verify whether the GPU is healthy." % gpuId)

    # 33 + 33 retired pages: above the limit, so the check must fail.
    inject_value(handle, gpuId, dcgm_fields.DCGM_FI_DEV_RETIRED_DBE, 33, -30, True)
    inject_value(handle, gpuId, dcgm_fields.DCGM_FI_DEV_RETIRED_SBE, 33, -30, True)
    over_limit = test_utils.diag_execute_wrapper(diag, handle)
    assert check_software_result_fail(over_limit, dcgm_structs.DCGM_SWTEST_PAGE_RETIREMENT), \
           "Expected software test to fail due to 60 total page retirements in the GPU"

    # 33 + 25 retired pages: below the limit, so the check must pass.
    inject_value(handle, gpuId, dcgm_fields.DCGM_FI_DEV_RETIRED_SBE, 25, -30, True)
    under_limit = test_utils.diag_execute_wrapper(diag, handle)
    assert check_software_result_pass(under_limit, dcgm_structs.DCGM_SWTEST_PAGE_RETIREMENT), \
           "Expected software test to pass since there are less than 60 total retired pages"

    # No retired pages at all: the check must still pass.
    inject_value(handle, gpuId, dcgm_fields.DCGM_FI_DEV_RETIRED_DBE, 0, -30, True)
    inject_value(handle, gpuId, dcgm_fields.DCGM_FI_DEV_RETIRED_SBE, 0, -30, True)
    after_reset = test_utils.diag_execute_wrapper(diag, handle)
    assert check_software_result_pass(after_reset, dcgm_structs.DCGM_SWTEST_PAGE_RETIREMENT), \
           "Expected software test to pass since there are no retired pages"
Beispiel #3
0
def helper_throttling_masking_failures(handle, gpuId):
    """
    Verify that an injected HW slowdown throttle reason is reported by the SM Stress diag.

    A short (2 second) SM Stress run first confirms the GPU passes; otherwise the test is
    skipped. A 15 second run is then prepared with the throttle mask set to 0, a benign value
    and then the real error are injected, and the diag response is searched for a throttling
    failure.
    """
    # Health check. The throttle mask is 0 because this test wants throttle reasons to be
    # reported as failures.
    baseline_diag = DcgmDiag.DcgmDiag(
        gpuIds=[gpuId],
        testNamesStr="SM Stress",
        paramsStr="sm stress.test_duration=2",
        version=dcgm_structs.dcgmRunDiag_version,
    )
    baseline_diag.SetThrottleMask(0)
    baseline_diag.UseFakeGpus()
    baseline = test_utils.diag_execute_wrapper(baseline_diag, handle)
    gpu_healthy = check_diag_result_pass(baseline, gpuId, dcgm_structs.DCGM_SM_STRESS_INDEX)
    if not gpu_healthy:
        test_utils.skip_test(
            "Skipping because GPU %s does not pass SM Perf test. "
            "Please verify whether the GPU is supported and healthy." % gpuId)

    # The diag that is expected to observe the injected throttling.
    diag = DcgmDiag.DcgmDiag(
        gpuIds=[gpuId],
        testNamesStr="SM Stress",
        paramsStr="sm stress.test_duration=15",
        version=dcgm_structs.dcgmRunDiag_version,
    )
    diag.SetThrottleMask(0)
    diag.UseFakeGpus()

    throttle_field = dcgm_fields.DCGM_FI_DEV_CLOCK_THROTTLE_REASONS
    hw_slowdown = dcgm_fields.DCGM_CLOCKS_THROTTLE_REASON_HW_SLOWDOWN
    poll_interval = 0.1

    logger.info("Injecting benign errors")
    inject_value(handle, gpuId, throttle_field, 3, 1, True)
    # The benign value must be readable from DCGM before anything else is injected.
    benign_visible = dcgm_internal_helpers.verify_field_value(
        gpuId, throttle_field, 3, checkInterval=poll_interval, maxWait=5, numMatches=1)
    assert benign_visible, \
        "Expected inserted values to be visible in DCGM"

    logger.info("Injecting actual errors")
    inject_value(handle, gpuId, throttle_field, hw_slowdown, injection_offset, True)
    inject_value(handle, gpuId, dcgm_fields.DCGM_FI_DEV_GPU_TEMP, 1000, injection_offset, True)

    logger.info("Started diag")
    response = test_utils.diag_execute_wrapper(diag, handle)
    # The injected error must be readable from DCGM. maxWait is larger than above to leave
    # room for the injection offset (presumably about 5 seconds - TODO confirm) plus a buffer.
    error_visible = dcgm_internal_helpers.verify_field_value(
        gpuId, throttle_field, hw_slowdown, checkInterval=poll_interval, maxWait=8, numMatches=1)
    assert error_visible, \
            "Expected inserted errors to be visible in DCGM"

    throttled, errMsg = find_throttle_failure(response, gpuId, dcgm_structs.DCGM_SM_STRESS_INDEX)
    assert throttled, "Expected to find throttling failure, but did not: (%s)" % errMsg
Beispiel #4
0
def perform_diag_with_throttle_mask_and_verify(dd, handle, gpuId,
                                               inserted_error, throttle_mask,
                                               shouldPass, failureMsg):
    """
    Run a diag with an injected throttle reason and check whether throttling is reported.

    Parameters:
        dd             - the diag object to execute; its throttle mask is updated in place
        handle         - DCGM handle passed to the injection and diag helpers
        gpuId          - GPU the values are injected into and whose result is inspected
        inserted_error - value injected into DCGM_FI_DEV_CLOCK_THROTTLE_REASONS
        throttle_mask  - value passed to dd.SetThrottleMask(); None leaves the mask untouched
        shouldPass     - True if no throttling failure is expected, False if one is expected
        failureMsg     - caller-supplied context that is prepended to the assertion message
                         when the expectation is not met (ignored if empty or None)

    Raises AssertionError if the injected value never becomes visible in DCGM or if the
    throttling outcome does not match shouldPass.
    """
    fieldId = dcgm_fields.DCGM_FI_DEV_CLOCK_THROTTLE_REASONS
    interval = 0.1
    if throttle_mask is not None:
        dd.SetThrottleMask(throttle_mask)

    inject_value(handle, gpuId, fieldId, inserted_error, injection_offset,
                 True)
    inject_value(handle, gpuId, dcgm_fields.DCGM_FI_DEV_GPU_TEMP, 1000,
                 injection_offset, True)
    # Verify that the inserted values are visible in DCGM before starting the diag
    assert dcgm_internal_helpers.verify_field_value(gpuId, fieldId, inserted_error, checkInterval=interval, maxWait=5, numMatches=1), \
        "Expected inserted values to be visible in DCGM"

    # Start the diag
    response = test_utils.diag_execute_wrapper(dd, handle)

    throttled, errMsg = find_throttle_failure(
        response, gpuId, dcgm_structs.DCGM_SM_STRESS_INDEX)

    # failureMsg used to be silently dropped; surface it so a failing assertion says which
    # caller scenario was being checked. The 1-tuple guards against tuple-valued messages.
    context = "%s: " % (failureMsg,) if failureMsg else ""

    # Check for pass or failure as per the shouldPass parameter
    if shouldPass:
        assert throttled == False, \
            "%sExpected to not have a throttling error but found %s" % (context, errMsg)
    else:
        assert throttled == True, \
            "%sExpected to find a throttling error but did not (%s)" % (context, errMsg)
Beispiel #5
0
    def verify_exit_code_on_signal(signum):
        """
        Launch a dcgmi diag, send it signum, and verify it exits with a non-zero code.

        Uses gpuId, handle and dcgmi_path from the enclosing scope. After signalling, also
        verifies that no nvvs process is left running.

        Raises AssertionError if nvvs never starts, if dcgmi exits with code 0, or if the
        nvvs process does not terminate.
        """
        # Ensure that host engine is ready to launch a new diagnostic
        dd = DcgmDiag.DcgmDiag(gpuIds=[gpuId], testNamesStr='1')
        success = False
        start = time.time()
        # Retry for at most ~3 seconds. Note that if every attempt hits DIAG_ALREADY_RUNNING the
        # loop simply ends and the function proceeds anyway; 'response' itself is never inspected.
        while not success and (time.time() - start) <= 3:
            try:
                response = test_utils.diag_execute_wrapper(dd, handle)
                success = True
            except dcgm_structs.dcgmExceptionClass(
                    dcgm_structs.DCGM_ST_DIAG_ALREADY_RUNNING):
                # Only acceptable error due to small race condition between the nvvs process exiting and
                # hostengine actually processing the exit. We try for a maximum of 3 seconds since this
                # should be rare and last only for a short amount of time
                time.sleep(1.5)

        diagApp = AppRunner(dcgmi_path,
                            args=[
                                "diag", "-r", "SM Stress", "-i",
                                "%s" % gpuId, "-d", "INFO", "--debugLogFile",
                                "/tmp/nvvs.log"
                            ])
        # Start the diag
        diagApp.start(timeout=40)
        logger.info("Launched dcgmi process with pid: %s" % diagApp.getpid())

        # Ensure diag is running before sending interrupt signal
        running, debug_output = dcgm_internal_helpers.check_nvvs_process(
            want_running=True, attempts=50)
        assert running, "The nvvs process did not start within 25 seconds: %s" % (
            debug_output)
        # There is a small race condition here - it is possible that the hostengine sends a SIGTERM before the
        # nvvs process has setup a signal handler, and so the nvvs process does not stop when SIGTERM is sent.
        # We sleep for 1 second to reduce the possibility of this scenario
        time.sleep(1)
        diagApp.signal(signum)
        retCode = diagApp.wait()
        # Check the return code and stdout/stderr output before asserting for better debugging info
        if retCode == 0:
            logger.error("Got retcode '%s' from launched diag." % retCode)
            if diagApp.stderr_lines or diagApp.stdout_lines:
                logger.info("dcgmi output:")
                for line in diagApp.stdout_lines:
                    logger.info(line)
                for line in diagApp.stderr_lines:
                    logger.error(line)
        assert retCode != 0, "Expected a non-zero exit code, but got 0"
        # Since the app returns a non zero exit code, we call the validate method to prevent false
        # failures from the test framework
        diagApp.validate()
        # Give the launched nvvs process time to terminate: 50 attempts, which the assertion
        # message below reports as 25 seconds (presumably 0.5s per attempt - TODO confirm).
        not_running, debug_output = dcgm_internal_helpers.check_nvvs_process(
            want_running=False, attempts=50)
        assert not_running, "The launched nvvs process did not terminate within 25 seconds. pgrep output:\n%s" \
                % debug_output
Beispiel #6
0
def test_memtest_failures_standalone(handle, gpuIds):
    """
    Verify that the memtest diag does not pass on a GPU with injected volatile DBEs.

    DCGM_FI_DEV_ECC_DBE_VOL_TOTAL is set to 1000 on the first GPU of gpuIds before a 10 second
    memtest run; that GPU's memtest result must be anything other than PASS.
    """
    diag = DcgmDiag.DcgmDiag(
        gpuIds=gpuIds,
        testNamesStr="memtest",
        paramsStr="memtest.test_duration=10",
    )

    target_gpu = gpuIds[0]
    inject_value(handle, target_gpu, dcgm_fields.DCGM_FI_DEV_ECC_DBE_VOL_TOTAL,
                 1000, injection_offset, True)

    response = test_utils.diag_execute_wrapper(diag, handle)

    memtest_result = response.perGpuResponses[target_gpu].results[dcgm_structs.DCGM_MEMTEST_INDEX].result
    assert memtest_result != dcgm_structs.DCGM_DIAG_RESULT_PASS, \
                "Should have a failure due to injected DBEs, but got passing result"
Beispiel #7
0
def helper_verify_log_file_creation(handle, gpuIds):
    """
    Verify that running a diag with a debug log file configured actually creates that file.

    A targeted stress diag object is obtained from helper_verify_diag_passing(), pointed at
    '/tmp/tmp_test_debug_log' with debug level 5, and executed. The log file is only asserted
    to exist when the diag reported no system error and at least one GPU was neither skipped
    nor left not-run; otherwise a note is logged and nothing is asserted.
    """
    diag = helper_verify_diag_passing(
        handle,
        gpuIds,
        testNames="targeted stress",
        testIndex=dcgm_structs.DCGM_TARGETED_STRESS_INDEX,
        params="targeted stress.test_duration=10",
        useFakeGpus=True)
    logname = '/tmp/tmp_test_debug_log'
    diag.SetDebugLogFile(logname)
    diag.SetDebugLevel(5)
    response = test_utils.diag_execute_wrapper(diag, handle)

    # A system-level error means the diag did not execute properly; nothing to verify.
    if len(response.systemError.msg) != 0:
        logger.info(
            "The diagnostic had a problem when executing, so we cannot run this test."
        )
        return

    ran_somewhere = False
    passedCount = 0
    errors = ""
    not_executed = (dcgm_structs.DCGM_DIAG_RESULT_SKIP,
                    dcgm_structs.DCGM_DIAG_RESULT_NOT_RUN)
    for gpuId in gpuIds:
        outcome = response.perGpuResponses[gpuId].results[
            dcgm_structs.DCGM_TARGETED_STRESS_INDEX]
        resultType = outcome.result
        if resultType in not_executed:
            continue

        ran_somewhere = True
        if resultType == dcgm_structs.DCGM_DIAG_RESULT_PASS:
            passedCount += 1
            continue

        # The test ran but did not pass: collect the reported message, if any.
        warning = outcome.error.msg
        if len(warning):
            errors = "%s, GPU %d failed: %s" % (errors, gpuId, warning)

    if not ran_somewhere:
        logger.info(
            "The diagnostic was skipped, so we cannot run this test.")
        return

    detailedMsg = "passed on %d of %d GPUs" % (passedCount, response.gpuCount)
    if len(errors):
        detailedMsg = "%s and had these errors: %s" % (detailedMsg, errors)
        logger.info(detailedMsg)
    assert os.path.isfile(logname), \
        "Logfile '%s' was not created and %s" % (logname, detailedMsg)
Beispiel #8
0
def helper_test_bad_statspath(handle, gpuIds):
    """
    Verify that the diag rejects an unusable stats path.

    Two cases are checked with a 20 second 'diagnostic' run:
      1. the stats path is a directory that does not exist - the diag must raise
         DCGM_ST_NVVS_ERROR mentioning 'cannot access statspath';
      2. the stats path is a regular file - the diag must raise DCGM_ST_NVVS_ERROR mentioning
         'is not a directory'.

    Case 2 creates '/tmp/not_a_file' and is only executed when that path is not already a
    file. The file is removed afterwards even if the check fails, so a failed run cannot
    cause later runs to silently skip case 2.

    Raises AssertionError if either run succeeds or reports a different NVVS error message.
    """
    dd = DcgmDiag.DcgmDiag(gpuIds=gpuIds,
                           testNamesStr='diagnostic',
                           paramsStr='diagnostic.test_duration=20')
    dd.SetStatsPath('/fake/superfake/notreal/')
    failed = False
    try:
        test_utils.diag_execute_wrapper(dd, handle)
    except dcgm_structs.dcgmExceptionClass(
            dcgm_structs.DCGM_ST_NVVS_ERROR) as e:
        failed = True
        assert 'cannot access statspath' in str(e), \
            "Should have received a statspath error but got %s" % str(e)

    assert failed, "We must fail when attempting to access a fake dir"

    filename = '/tmp/not_a_file'
    if not os.path.isfile(filename):
        # create the file
        with open(filename, 'w') as f:
            f.write('lorem ipsum')

        try:
            failed = False
            dd.SetStatsPath(filename)
            try:
                test_utils.diag_execute_wrapper(dd, handle)
            except dcgm_structs.dcgmExceptionClass(
                    dcgm_structs.DCGM_ST_NVVS_ERROR) as e:
                failed = True
                assert 'is not a directory' in str(e), \
                    "Should have received a statspath error but got %s" % str(e)
            assert failed, "We must fail when attempting to set statspath to a file"
        finally:
            # Always clean up after ourselves, including when an assertion above fails or the
            # diag raises an unexpected error.
            os.remove(filename)
Beispiel #9
0
def helper_test_diagnostic_config_usage(handle, gpuIds):
    """
    Verify that settings supplied through the diag config file contents are honoured.

    The config sets diagnostic.max_sbe_errors to 1, and DCGM_FI_DEV_ECC_SBE_VOL_TOTAL is then
    set to 1000 on the first GPU of gpuIds. That GPU's 'diagnostic' result must be anything
    other than PASS.
    """
    config_yaml = "%YAML 1.2\n\ncustom:\n- custom:\n    diagnostic:\n      max_sbe_errors: 1"

    diag = DcgmDiag.DcgmDiag(
        gpuIds=gpuIds,
        testNamesStr="diagnostic",
        paramsStr="diagnostic.test_duration=10",
    )
    diag.SetConfigFileContents(config_yaml)

    target_gpu = gpuIds[0]
    inject_value(handle, target_gpu, dcgm_fields.DCGM_FI_DEV_ECC_SBE_VOL_TOTAL,
                 1000, injection_offset, True)

    response = test_utils.diag_execute_wrapper(diag, handle)

    diagnostic_result = response.perGpuResponses[target_gpu].results[dcgm_structs.DCGM_DIAGNOSTIC_INDEX].result
    assert diagnostic_result != dcgm_structs.DCGM_DIAG_RESULT_PASS, \
                "Should have a failure due to injected SBEs, but got passing result"
Beispiel #10
0
def helper_test_dcgm_short_diagnostic_run(handle, gpuIds):
    """
    Verify that a 15 second 'diagnostic' run passes on every GPU that is not skipped.

    A GPU whose result is DCGM_DIAG_RESULT_SKIP is logged and ignored; every other GPU in
    gpuIds must report DCGM_DIAG_RESULT_PASS.
    """
    diag = DcgmDiag.DcgmDiag(
        gpuIds=gpuIds,
        testNamesStr="diagnostic",
        paramsStr="diagnostic.test_duration=15",
    )
    response = test_utils.diag_execute_wrapper(diag, handle)

    for gpuId in gpuIds:
        per_gpu_results = response.perGpuResponses[gpuId].results
        if per_gpu_results[dcgm_structs.DCGM_DIAGNOSTIC_INDEX].result == dcgm_structs.DCGM_DIAG_RESULT_SKIP:
            logger.info(
                "Got status DCGM_DIAG_RESULT_SKIP for gpuId %d. This is expected if this GPU does not support the Diagnostic test."
                % gpuId)
        else:
            assert per_gpu_results[dcgm_structs.DCGM_DIAGNOSTIC_INDEX].result == dcgm_structs.DCGM_DIAG_RESULT_PASS, \
                        "Should have passed the 15 second diagnostic for all GPUs"
Beispiel #11
0
def helper_verify_diag_passing(handle,
                               gpuIds,
                               testNames="SM Stress",
                               testIndex=dcgm_structs.DCGM_SM_STRESS_INDEX,
                               params="sm stress.test_duration=15",
                               version=dcgm_structs.dcgmRunDiag_version,
                               useFakeGpus=False):
    """
    Build a diag object and make sure the requested test passes on all of gpuIds.

    Per-GPU outcomes are remembered in the module-level checked_gpus dict (gpuId -> bool) so
    the diag is only executed when at least one GPU in gpuIds has not been checked before.
    Note that the cache is keyed by GPU id only, not by test name.

    Parameters:
        handle      - DCGM handle used to execute the diag
        gpuIds      - GPUs to run the diag on
        testNames   - value passed as testNamesStr to DcgmDiag
        testIndex   - index of the per-GPU result to inspect
        params      - value passed as paramsStr to DcgmDiag
        version     - value passed as version to DcgmDiag
        useFakeGpus - if True, UseFakeGpus() is called on the diag object

    Returns the configured DcgmDiag object (throttle mask set to 0) so callers can reuse it.
    Calls test_utils.skip_test() if any GPU is known, or found, not to pass.
    """
    dd = DcgmDiag.DcgmDiag(gpuIds=gpuIds,
                           testNamesStr=testNames,
                           paramsStr=params,
                           version=version)
    # We explicitly want to fail for throttle reasons since the tests using this helper insert
    # throttling errors for verification
    dd.SetThrottleMask(0)
    if useFakeGpus:
        dd.UseFakeGpus()

    # If we've already checked this GPU, then use the previous result
    runDiag = False
    for gpuId in gpuIds:
        if gpuId in checked_gpus:
            if checked_gpus[gpuId] == False:
                test_utils.skip_test(
                    "Skipping because GPU %s does not pass SM Perf test. "
                    "Please verify whether the GPU is supported and healthy." %
                    gpuId)
        else:
            runDiag = True

    if not runDiag:
        return dd

    response = test_utils.diag_execute_wrapper(dd, handle)

    # Cache the outcome for EVERY GPU before skipping. Previously the skip was issued inside
    # this loop, so GPUs listed after the first failing one never had their result recorded.
    for gpuId in gpuIds:
        checked_gpus[gpuId] = bool(check_diag_result_pass(response, gpuId, testIndex))

    for gpuId in gpuIds:
        if not checked_gpus[gpuId]:
            test_utils.skip_test(
                "Skipping because GPU %s does not pass SM Perf test. "
                "Please verify whether the GPU is supported and healthy." %
                gpuId)

    return dd
Beispiel #12
0
def helper_per_gpu_responses_api(handle, gpuIds, testDir):
    """
    Verify that pass/fail status for diagnostic tests is reported on a per GPU basis.

    After confirming via helper_verify_diag_passing() that the GPUs are usable, a HW slowdown
    throttle reason is injected into the first GPU of gpuIds and an SM Stress diag is executed
    on that GPU with stats written to testDir on failure. The first GPU must report a
    throttling failure; no other GPU in gpuIds may report one.
    """
    failGpuId = gpuIds[0]
    # Only the health check (and possible skip) matters here; the returned diag is not reused.
    helper_verify_diag_passing(handle, gpuIds, useFakeGpus=True)

    diag = DcgmDiag.DcgmDiag(
        gpuIds=[failGpuId],
        testNamesStr="SM Stress",
        paramsStr="sm stress.test_duration=15",
        version=dcgm_structs.dcgmRunDiag_version,
    )
    # Mask of 0: throttle reasons must be reported as failures for this test to work.
    diag.SetThrottleMask(0)
    diag.UseFakeGpus()
    diag.SetStatsPath(testDir)
    diag.SetStatsOnFail(1)

    # Inject the throttle reason (plus a very high temperature) into the GPU expected to fail.
    throttle_field = dcgm_fields.DCGM_FI_DEV_CLOCK_THROTTLE_REASONS
    hw_slowdown = dcgm_fields.DCGM_CLOCKS_THROTTLE_REASON_HW_SLOWDOWN
    poll_interval = 0.1
    inject_value(handle, failGpuId, throttle_field, hw_slowdown, injection_offset, True)
    inject_value(handle, failGpuId, dcgm_fields.DCGM_FI_DEV_GPU_TEMP, 1000, injection_offset, True)

    # The injected value must be readable from DCGM before the diag starts.
    error_visible = dcgm_internal_helpers.verify_field_value(
        failGpuId, throttle_field, hw_slowdown,
        checkInterval=poll_interval, maxWait=5, numMatches=1)
    assert error_visible, \
        "Expected inserted values to be visible in DCGM"

    response = test_utils.diag_execute_wrapper(diag, handle)
    logger.info("Started diag")

    # Responses are per GPU: only the GPU that received the injection may be throttled.
    for gpuId in gpuIds:
        throttled, errMsg = find_throttle_failure(response, gpuId, dcgm_structs.DCGM_SM_STRESS_INDEX)
        if gpuId != failGpuId:
            assert not throttled, "Expected not to find a throttling error but found '%s'" % errMsg
        else:
            assert throttled, "Expected throttling error but found none (%s)" % errMsg
Beispiel #13
0
def helper_check_diag_high_temp_fail(handle, gpuIds):
    """
    Verify that the 'diagnostic' test fails with a temperature violation on an overheated GPU.

    While a 10 second diag runs, a background thread calls injection_wrapper() to set
    DCGM_FI_DEV_GPU_TEMP to 120 on the first GPU of gpuIds. The response must cover all of
    gpuIds and report DCGM_FR_TEMP_VIOLATION for that first GPU.
    """
    diag = DcgmDiag.DcgmDiag(
        gpuIds=gpuIds,
        testNamesStr='diagnostic',
        paramsStr='diagnostic.test_duration=10',
    )

    # Inject the failing value concurrently with the diag run.
    injector_args = [handle, gpuIds[0], dcgm_fields.DCGM_FI_DEV_GPU_TEMP, 120, True]
    injector = threading.Thread(target=injection_wrapper, args=injector_args)
    injector.start()
    response = test_utils.diag_execute_wrapper(diag, handle)
    injector.join()

    assert response.gpuCount == len(gpuIds), \
        "Expected %d gpus, but found %d reported" % (len(gpuIds), response.gpuCount)
    diag_result_assert_fail(
        response,
        gpuIds[0],
        dcgm_structs.DCGM_DIAGNOSTIC_INDEX,
        "Expected a failure due to 120 degree inserted temp.",
        dcgm_errors.DCGM_FR_TEMP_VIOLATION,
    )
Beispiel #14
0
def helper_check_diag_thermal_violation(handle, gpuIds):
    """
    Verify that a huge injected thermal violation value is not reported as a diag error.

    While a 10 second 'diagnostic' run executes, a background thread calls
    injection_wrapper() to set DCGM_FI_DEV_THERMAL_VIOLATION to 9223372036854775792 on the
    first GPU of gpuIds. The response must cover all of gpuIds, and no GPU's result may
    contain a "Thermal violations" error.
    """
    diag = DcgmDiag.DcgmDiag(
        gpuIds=gpuIds,
        testNamesStr='diagnostic',
        paramsStr='diagnostic.test_duration=10',
    )

    # Inject the value concurrently with the diag run.
    injector_args = [
        handle,
        gpuIds[0],
        dcgm_fields.DCGM_FI_DEV_THERMAL_VIOLATION,
        9223372036854775792,
        True,
    ]
    injector = threading.Thread(target=injection_wrapper, args=injector_args)
    injector.start()
    response = test_utils.diag_execute_wrapper(diag, handle)
    injector.join()

    assert response.gpuCount == len(gpuIds), \
        "Expected %d gpus, but found %d reported" % (len(gpuIds), response.gpuCount)
    for gpuIndex in range(response.gpuCount):
        diag_assert_error_not_found(
            response,
            gpuIndex,
            dcgm_structs.DCGM_DIAGNOSTIC_INDEX,
            "Thermal violations",
        )
Beispiel #15
0
 def runDiag(dd, data):
     # Thread target: execute the diag with the enclosing scope's handle and hand the response
     # back to the caller through the first slot of the shared 'data' list.
     diag_response = test_utils.diag_execute_wrapper(dd, handle)
     data[0] = diag_response
Beispiel #16
0
def verify_early_fail_checks_for_test(handle, gpuId, test_name, testIndex):
    """
    Helper method for verifying the fail early checks for the specified test.

    The test named test_name is run on gpuId four times:
      1. a short run without fail early, to confirm the GPU passes (otherwise the test is skipped);
      2. with fail early enabled and nothing injected - must pass and run for at least 90% of
         the requested duration;
      3. with fail early enabled while XID errors are injected - must exit early and fail;
      4. with fail early enabled after the injected XID value is reset to 0 - must pass.

    Parameters:
        handle    - DCGM handle used for running diags and injecting values
        gpuId     - GPU to run the test on
        test_name - test name passed to DcgmDiag and used to build the params string and log names
        testIndex - index of the per-GPU result to inspect in the diag response

    Each run writes its debug log to /tmp/nv_<test name with underscores><run number>.log.
    """
    if testIndex == dcgm_structs.DCGM_TARGETED_POWER_INDEX and not option_parser.options.developer_mode:
        # Skip this test since Targeted Power always fails when duration is less than 30 seconds
        test_utils.skip_test("Skipping fail early verification for Targeted Power test. Use developer mode "
                             "to run this test.")
    duration = 2 if testIndex != dcgm_structs.DCGM_TARGETED_POWER_INDEX else 30 # Prevent false failures due to min
                                                                                # duration requirements for Targeted Power
    paramsStr = "%s.test_duration=%s" % (test_name, duration)

    # Single-slot list used to pass the diag response back from the worker thread
    data = [None]
    def runDiag(dd, data): # Simple helper method to run a diag (used as thread target)
        data[0] = test_utils.diag_execute_wrapper(dd, handle)

    ###
    # First verify that the given test passes for the gpu.
    # If it doesn't pass, skip test and add note to check GPU health
    logger.info("Checking whether %s test passes on GPU %s" % (test_name, gpuId))
    dd = DcgmDiag.DcgmDiag(gpuIds=[gpuId], testNamesStr=test_name, paramsStr=paramsStr)
    test_name_no_spaces = test_name.replace(" ", "_")
    # Log name template; the remaining '%s' is filled with the run number (1-4) below
    logname = '/tmp/nv_' + test_name_no_spaces + '%s.log'
    dd.SetDebugLogFile(logname % 1)
    dd.SetDebugLevel(5)
    response = test_utils.diag_execute_wrapper(dd, handle)
    if not check_diag_result_pass(response, gpuId, testIndex):
        test_utils.skip_test("Skipping because GPU %s does not pass %s test. "
                             "Please verify whether the GPU is healthy." % (gpuId, test_name))

    ###
    # Next, verify that the given test passes for the gpu when fail early checks are enabled and no errors are inserted
    logger.info("Checking whether %s test passes on GPU %s with fail early enabled" % (test_name, gpuId))
    duration = 15 if testIndex != dcgm_structs.DCGM_TARGETED_POWER_INDEX else 30 # Prevent false failures due to min
                                                                                 # duration requirements for Targeted Power
    paramsStr = "%s.test_duration=%s" % (test_name, duration)
    dd = DcgmDiag.DcgmDiag(gpuIds=[gpuId], testNamesStr=test_name, paramsStr=paramsStr)
    dd.SetFailEarly(checkInterval=2) # enable fail early checks
    dd.SetDebugLogFile(logname % 2)
    dd.SetDebugLevel(5)

    result_thread = threading.Thread(target=runDiag, args=[dd, data])
    result_thread.start()

    # Ensure nvvs process has started
    running, debug_output = dcgm_internal_helpers.check_nvvs_process(want_running=True)
    assert running, "Nvvs process did not start within 10 seconds. pgrep output: %s" % debug_output

    # Timing starts only once nvvs is confirmed running, so startup time is not counted
    start = time.time()
    result_thread.join()
    end = time.time()

    assert check_diag_result_pass(data[0], gpuId, testIndex), \
        "Expected %s test to pass with fail early enabled and no inserted errors" % test_name
    assert (end - start) >= duration * 0.9, \
        "Expected %s test to run for at least %ss, but it only ran for %ss." % (test_name, duration, end - start)

    ###
    # Verify fail early behavior by inserting an error.
    # Setup test parameters
    duration = 20 if testIndex != dcgm_structs.DCGM_TARGETED_POWER_INDEX else 30 # Prevent false failures due to min
                                                                                 # duration requirements for Targeted Power
    paramsStr = "%s.test_duration=%s" % (test_name, duration)
    response = None
    dd = DcgmDiag.DcgmDiag(gpuIds=[gpuId], testNamesStr=test_name, paramsStr=paramsStr)
    dd.SetFailEarly(checkInterval=2) # enable fail early checks
    dd.SetDebugLogFile(logname % 3)

    # Setup threads / processes
    xid_inject_val = 2
    result_thread = threading.Thread(target=runDiag, args=[dd, data])
    inject_error = dcgm_internal_helpers.InjectionThread(handle, gpuId,
        dcgm_fields.DCGM_FI_DEV_XID_ERRORS, xid_inject_val, offset=5)

    logger.info("Verifying fail early behavior for %s test by inserting XIDs." % test_name)
    # Start inserting errors
    inject_error.start()
    # Ensure that inserted errors are visible
    assert \
        dcgm_internal_helpers.verify_field_value(gpuId, dcgm_fields.DCGM_FI_DEV_XID_ERRORS,
                                                 xid_inject_val, checkInterval=0.1, numMatches=5), \
        "Expected inserted value for XIDs to be visible in DCGM"

    # Start test thread
    result_thread.start()
    # Ensure nvvs process has started
    running, debug_output = dcgm_internal_helpers.check_nvvs_process(want_running=True)
    assert running, "Nvvs process did not start within 10 seconds. pgrep output: %s" % debug_output
    start = time.time()
    
    # Give the test time to exit and verify that the test exits early.
    # Ideally, it should exit within 2 failure checks (~ 4 seconds of test start), but we wait up to
    # 20 seconds to account for delays in starting the test.
    # NOTE(review): 20 seconds equals the full non-Targeted-Power test duration set above, not 75% of it.
    result_thread.join(20)
    test_exited_early = not result_thread.is_alive() # Cache thread isAlive value until we verify it
    end = time.time()

    # Stop the injection app
    inject_error.Stop()
    inject_error.join()
    # Verify injection app stopped correctly
    assert inject_error.retCode == dcgm_structs.DCGM_ST_OK, \
        "There was an error inserting values into dcgm. Return code: %s" % inject_error.retCode

    if not test_exited_early:
        # Wait for the launched diag to end so that data[0] is populated for the failure message below
        result_thread.join()
        end = time.time()
    
    response = data[0]
    # Check whether test exited early
    assert test_exited_early, \
        "Expected %s test to exit early. Test took %ss to complete.\nGot result: %s (\ninfo: %s,\n warning: %s)" \
            % (test_name, (end - start),
               response.perGpuResponses[gpuId].results[testIndex].result,
               response.perGpuResponses[gpuId].results[testIndex].info,
               response.perGpuResponses[gpuId].results[testIndex].error.msg)

    # Verify the test failed (the injected errors are XIDs, although the message text says dbes)
    assert check_diag_result_fail(response, gpuId, testIndex), \
        "Expected %s test to fail due to injected dbes.\nGot result: %s (\ninfo: %s,\n warning: %s)" % \
            (test_name, response.perGpuResponses[gpuId].results[testIndex].result,
             response.perGpuResponses[gpuId].results[testIndex].info,
             response.perGpuResponses[gpuId].results[testIndex].error.msg)

    ###
    # Rerun the test to verify that the test passes now that there are no inserted errors
    duration = 30
    paramsStr = "%s.test_duration=%s" % (test_name, duration)

    logger.info("Verifying that test passes once xid errors are removed.")
    dd = DcgmDiag.DcgmDiag(gpuIds=[gpuId], testNamesStr=test_name, paramsStr=paramsStr)
    dd.SetFailEarly(checkInterval=3) # enable fail early checks
    dd.SetDebugLogFile(logname % 4)
    # Reset the injected XID error value
    inject_value(handle, gpuId, dcgm_fields.DCGM_FI_DEV_XID_ERRORS, 0, 0)
    # Sleep to ensure no pending errors left
    time.sleep(10)

    response = test_utils.diag_execute_wrapper(dd, handle)
    # Verify the test passed
    assert check_diag_result_pass(response, gpuId, testIndex), \
        "Expected %s test to pass because there are no dbes\nGot result: %s (\ninfo: %s,\n warning: %s)" % \
            (test_name, response.perGpuResponses[gpuId].results[testIndex].result,
             response.perGpuResponses[gpuId].results[testIndex].info,
             response.perGpuResponses[gpuId].results[testIndex].error.msg)
Beispiel #17
0
def test_nvvs_plugin_software_inforom_embedded(handle, gpuIds):
    """
    Verify that the inforom software check of a "short" diag run either passes or is skipped.

    The level-one DCGM_SWTEST_INFOROM result is read once per entry in gpuIds; it is not
    indexed by GPU, so each iteration inspects the same result.
    """
    acceptable = (dcgm_structs.DCGM_DIAG_RESULT_PASS, dcgm_structs.DCGM_DIAG_RESULT_SKIP)

    diag = DcgmDiag.DcgmDiag(gpuIds=gpuIds, testNamesStr="short")
    response = test_utils.diag_execute_wrapper(diag, handle)
    for _ in gpuIds:
        inforom_result = response.levelOneResults[dcgm_structs.DCGM_SWTEST_INFOROM].result
        assert inforom_result in acceptable
Beispiel #18
0
def helper_test_stats_file_basics(handle,
                                  gpuIds,
                                  statsAsString,
                                  pluginName,
                                  pluginIndex,
                                  statName=None):
    """
    Run one plugin with a stats path configured and verify that a stats file is written.

    Args:
        handle: handle passed through to test_utils.diag_execute_wrapper.
        gpuIds: GPU ids to run the plugin on.
        statsAsString: when true, the diag is configured with ``logfile_type: text`` and the
            content check via helper_basic_stats_file_check is not performed.
        pluginName: name of the plugin to run; also used to derive the stats file name.
        pluginIndex: index of the plugin in each GPU's ``results`` array of the response.
        statName: forwarded to helper_basic_stats_file_check.

    The test is skipped (via test_utils.skip_test) when the diag reports a system error, when
    the plugin was skipped / not run on every GPU, or when it did not pass on any GPU.
    The stats file, if present, is removed before returning.
    """
    dd = DcgmDiag.DcgmDiag(gpuIds=gpuIds,
                           testNamesStr=pluginName,
                           paramsStr='%s.test_duration=20' %
                           pluginName)

    dd.SetStatsPath('/tmp/')

    # Location where the stats file is expected once the diag has run
    statsfile = '/tmp/stats_%s.json' % (pluginName.replace(' ', '_'))

    if statsAsString:
        dd.SetConfigFileContents(
            "%YAML 1.2\n\nglobals:\n  logfile_type: text\n")

    response = test_utils.diag_execute_wrapper(dd, handle)

    try:
        if len(response.systemError.msg) == 0:
            skippedAll = True
            passedCount = 0
            errors = ""

            for gpuIndex in range(response.gpuCount):
                pluginResult = response.perGpuResponses[gpuIndex].results[pluginIndex]
                resultType = pluginResult.result
                if resultType in (dcgm_structs.DCGM_DIAG_RESULT_SKIP,
                                  dcgm_structs.DCGM_DIAG_RESULT_NOT_RUN):
                    continue

                skippedAll = False
                if resultType == dcgm_structs.DCGM_DIAG_RESULT_PASS:
                    passedCount = passedCount + 1
                else:
                    warning = pluginResult.error.msg
                    if len(warning):
                        errors = "%s GPU %d failed: %s" % (
                            errors, gpuIndex, warning)

            # Check the "skipped everywhere" case first: passedCount is also 0 in that case,
            # so testing passedCount first would hide the more specific skip reason.
            if skippedAll:
                test_utils.skip_test(
                    "The %s plugin was skipped, so we cannot run this test." %
                    pluginName)
            elif passedCount == 0:
                test_utils.skip_test(
                    "Unable to pass any of these short runs for plugin %s." %
                    pluginName)
            else:
                detailedMsg = "passed on %d of %d GPUs" % (passedCount,
                                                           response.gpuCount)
                if len(errors):
                    detailedMsg = "%s and had these errors: %s" % (detailedMsg,
                                                                   errors)
                    logger.info("%s when running the %s plugin" %
                                (detailedMsg, pluginName))

                # Always make sure a stats file was created, not only when some GPU
                # reported an error.
                assert os.path.isfile(
                    statsfile
                ), "Statsfile '%s' was not created as expected and %s" % (
                    statsfile, detailedMsg)

                if not statsAsString:
                    helper_basic_stats_file_check(statsfile, gpuIds, statName)
        else:
            test_utils.skip_test(
                "The %s plugin had a problem when executing, so we cannot run this test."
                % pluginName)
    finally:
        # Clean up the stats file regardless of how the checks above ended
        if os.path.exists(statsfile):
            os.remove(statsfile)
Beispiel #19
0
 def run(dd, response):
     """
     Execute the diagnostic `dd` through test_utils.diag_execute_wrapper.

     `handle` is not a parameter; it is resolved from an outer scope that is not
     visible in this snippet. The wrapper's return value is only bound to the
     local name `response`, so the object passed in by the caller is not updated
     and this function returns None.
     NOTE(review): presumably used as a thread/worker target where the result is
     not needed - confirm against the caller.
     """
     response = test_utils.diag_execute_wrapper(dd, handle)
Beispiel #20
0
def helper_check_diag_stop_on_interrupt_signals(handle, gpuId):
    """
    Check that a diag launched through the dcgmi executable is stopped when dcgmi
    receives a SIGINT, SIGHUP, SIGQUIT, or SIGTERM signal.

    For each signal: dcgmi is started with an SM Stress diag, the signal is delivered
    once the nvvs process is seen running, and the helper asserts that dcgmi exits with
    a non-zero code and that the nvvs process goes away afterwards.
    The test is skipped when the GPU cannot pass a short SM Stress run or when no dcgmi
    binary is known for the current platform.
    """
    # Skip early unless the GPU passes a short SM Stress run on its own
    precheck = DcgmDiag.DcgmDiag(gpuIds=[gpuId],
                                 testNamesStr="SM Stress",
                                 paramsStr="sm stress.test_duration=2",
                                 version=dcgm_structs.dcgmRunDiag_version7)
    precheck_response = test_utils.diag_execute_wrapper(precheck, handle)
    gpu_passes = check_diag_result_pass(precheck_response, gpuId,
                                        dcgm_structs.DCGM_SM_STRESS_INDEX)
    if not gpu_passes:
        test_utils.skip_test(
            "Skipping because GPU %s does not pass SM Stress test. "
            "Please verify whether the GPU is supported and healthy." % gpuId)

    # Location of the dcgmi binary for each supported platform identifier
    dcgmi_by_platform = {
        "Linux_32bit": "./apps/x86/dcgmi",
        "Linux_64bit": "./apps/amd64/dcgmi",
        "Linux_ppc64le": "./apps/ppc64le/dcgmi",
        "Linux_aarch64": "./apps/aarch64/dcgmi"
    }
    if utils.platform_identifier not in dcgmi_by_platform:
        test_utils.skip_test("Dcgmi is not supported on the current platform.")
    dcgmi_path = dcgmi_by_platform[utils.platform_identifier]

    def send_signal_and_expect_failure(signum):
        # Make sure the host engine will accept a new diagnostic. A previous nvvs process
        # may have exited without the host engine having processed that exit yet, which
        # shows up as DCGM_ST_DIAG_ALREADY_RUNNING; retry for roughly 3 seconds at most.
        probe = DcgmDiag.DcgmDiag(gpuIds=[gpuId], testNamesStr='1')
        began = time.time()
        while (time.time() - began) <= 3:
            try:
                test_utils.diag_execute_wrapper(probe, handle)
            except dcgm_structs.dcgmExceptionClass(
                    dcgm_structs.DCGM_ST_DIAG_ALREADY_RUNNING):
                time.sleep(1.5)
            else:
                break

        dcgmi_args = [
            "diag", "-r", "SM Stress", "-i",
            "%s" % gpuId, "-d", "INFO", "--debugLogFile",
            "/tmp/nvvs.log"
        ]
        dcgmi_proc = AppRunner(dcgmi_path, args=dcgmi_args)
        dcgmi_proc.start(timeout=40)
        logger.info("Launched dcgmi process with pid: %s" % dcgmi_proc.getpid())

        # The signal is only meaningful once the diag (nvvs) is actually running
        nvvs_up, nvvs_debug = dcgm_internal_helpers.check_nvvs_process(
            want_running=True, attempts=50)
        assert nvvs_up, "The nvvs process did not start within 25 seconds: %s" % (
            nvvs_debug)

        # nvvs may not have installed its signal handler yet when the host engine forwards
        # a SIGTERM; waiting one second makes that race less likely.
        time.sleep(1)
        dcgmi_proc.signal(signum)
        exit_code = dcgmi_proc.wait()

        # Dump dcgmi's output before asserting so a failure is easier to debug
        if exit_code == 0:
            logger.error("Got retcode '%s' from launched diag." % exit_code)
            if dcgmi_proc.stderr_lines or dcgmi_proc.stdout_lines:
                logger.info("dcgmi output:")
                for out_line in dcgmi_proc.stdout_lines:
                    logger.info(out_line)
                for err_line in dcgmi_proc.stderr_lines:
                    logger.error(err_line)
        assert exit_code != 0, "Expected a non-zero exit code, but got 0"

        # The non-zero exit code is expected here; calling validate() keeps the test
        # framework from reporting it as a failure.
        dcgmi_proc.validate()

        # Poll until the launched nvvs process is gone
        nvvs_gone, nvvs_debug = dcgm_internal_helpers.check_nvvs_process(
            want_running=False, attempts=50)
        assert nvvs_gone, "The launched nvvs process did not terminate within 25 seconds. pgrep output:\n%s" \
                % nvvs_debug

    # Exercise every supported interrupt signal in turn
    signal_cases = (
        ("Testing stop on SIGINT", signal.SIGINT),
        ("Testing stop on SIGHUP", signal.SIGHUP),
        ("Testing stop on SIGQUIT", signal.SIGQUIT),
        ("Testing stop on SIGTERM", signal.SIGTERM),
    )
    for banner, signum in signal_cases:
        logger.info(banner)
        send_signal_and_expect_failure(signum)