Exemple #1
0
def helper_throttling_masking_failures(handle, gpuId):
    """
    Verify that an unmasked throttling error makes the SM Stress diag fail.

    A short SM Stress run first confirms that the GPU passes on its own; the test is skipped if
    it does not. A 15 second SM Stress run is then executed after injecting a HW_SLOWDOWN clock
    throttle reason and a GPU temperature value of 1000, and the diag response must contain a
    throttling failure.

    handle -- DCGM handle forwarded to the injection helper and the diag wrapper
    gpuId  -- id of the GPU the diag runs on and the values are injected for
    """
    #####
    # First check whether the GPU is healthy
    dd = DcgmDiag.DcgmDiag(gpuIds=[gpuId],
                           testNamesStr="SM Stress",
                           paramsStr="sm stress.test_duration=2",
                           version=dcgm_structs.dcgmRunDiag_version)
    # We explicitly want to fail for throttle reasons since this test inserts throttling errors
    # for verification, so the throttle mask is cleared.
    dd.SetThrottleMask(0)
    dd.UseFakeGpus()
    response = test_utils.diag_execute_wrapper(dd, handle)
    if not check_diag_result_pass(response, gpuId,
                                  dcgm_structs.DCGM_SM_STRESS_INDEX):
        # The message names the test that was actually run (SM Stress).
        test_utils.skip_test(
            "Skipping because GPU %s does not pass SM Stress test. "
            "Please verify whether the GPU is supported and healthy." % gpuId)

    #####
    # Now run the longer diag that is expected to fail because of the injected errors
    dd = DcgmDiag.DcgmDiag(gpuIds=[gpuId],
                           testNamesStr="SM Stress",
                           paramsStr="sm stress.test_duration=15",
                           version=dcgm_structs.dcgmRunDiag_version)
    dd.SetThrottleMask(0)
    dd.UseFakeGpus()

    fieldId = dcgm_fields.DCGM_FI_DEV_CLOCK_THROTTLE_REASONS
    insertedError = dcgm_fields.DCGM_CLOCKS_THROTTLE_REASON_HW_SLOWDOWN
    interval = 0.1  # polling interval handed to verify_field_value for both checks below

    logger.info("Injecting benign errors")
    # NOTE(review): 3 is the injected throttle-reason value and 1 presumably the offset
    # (same position as injection_offset below) - confirm against inject_value.
    inject_value(handle, gpuId, fieldId, 3, 1, True)
    # Verify that the inserted values are visible in DCGM before starting the diag
    assert dcgm_internal_helpers.verify_field_value(gpuId, fieldId, 3, checkInterval=interval, maxWait=5, numMatches=1), \
        "Expected inserted values to be visible in DCGM"

    logger.info("Injecting actual errors")
    inject_value(handle, gpuId, fieldId, insertedError, injection_offset, True)
    inject_value(handle, gpuId, dcgm_fields.DCGM_FI_DEV_GPU_TEMP, 1000,
                 injection_offset, True)

    logger.info("Started diag")
    response = test_utils.diag_execute_wrapper(dd, handle)
    # Verify that the inserted values are visible in DCGM.
    # A single match is required; maxWait of 8 seconds is kept from the original test.
    # NOTE(review): the original rationale assumed a 5 second injection offset - confirm
    # against injection_offset.
    assert dcgm_internal_helpers.verify_field_value(gpuId, fieldId, insertedError, checkInterval=interval, numMatches=1, maxWait=8), \
        "Expected inserted errors to be visible in DCGM"

    throttled, errMsg = find_throttle_failure(
        response, gpuId, dcgm_structs.DCGM_SM_STRESS_INDEX)
    assert throttled, "Expected to find throttling failure, but did not: (%s)" % errMsg
Exemple #2
0
def perform_diag_with_throttle_mask_and_verify(dd, handle, gpuId,
                                               inserted_error, throttle_mask,
                                               shouldPass, failureMsg):
    """
    Inject a throttling error, run the given diag and check for a throttling failure.

    dd             -- diag object to execute; its throttle mask is only changed when
                      throttle_mask is not None
    handle         -- DCGM handle forwarded to the injection helper and the diag wrapper
    gpuId          -- id of the GPU the values are injected for and the result is checked on
    inserted_error -- value injected into DCGM_FI_DEV_CLOCK_THROTTLE_REASONS
    throttle_mask  -- mask passed to dd.SetThrottleMask(), or None to leave dd untouched
    shouldPass     -- True when no throttling failure is expected, False when one is expected
    failureMsg     -- caller-supplied context; prepended to the assertion message when non-empty

    Raises AssertionError when the injected value never becomes visible or when the diag
    outcome does not match shouldPass.
    """
    fieldId = dcgm_fields.DCGM_FI_DEV_CLOCK_THROTTLE_REASONS
    interval = 0.1
    if throttle_mask is not None:
        dd.SetThrottleMask(throttle_mask)

    inject_value(handle, gpuId, fieldId, inserted_error, injection_offset,
                 True)
    inject_value(handle, gpuId, dcgm_fields.DCGM_FI_DEV_GPU_TEMP, 1000,
                 injection_offset, True)
    # Verify that the inserted values are visible in DCGM before starting the diag
    assert dcgm_internal_helpers.verify_field_value(gpuId, fieldId, inserted_error, checkInterval=interval, maxWait=5, numMatches=1), \
        "Expected inserted values to be visible in DCGM"

    # Start the diag
    response = test_utils.diag_execute_wrapper(dd, handle)

    # Check for pass or failure as per the shouldPass parameter
    throttled, errMsg = find_throttle_failure(
        response, gpuId, dcgm_structs.DCGM_SM_STRESS_INDEX)

    # failureMsg used to be accepted and silently dropped; surface it so a failing call site
    # can be identified from the assertion text alone.
    context = "%s: " % failureMsg if failureMsg else ""
    if shouldPass:
        assert not throttled, \
            context + "Expected to not have a throttling error but found %s" % errMsg
    else:
        assert throttled, \
            context + "Expected to find a throttling error but did not (%s)" % errMsg
Exemple #3
0
def helper_test_thermal_violations_in_seconds(handle, gpuIds):
    """
    Check that an injected thermal violation is reported by the diagnostic in seconds.

    Injects 2344122048 into DCGM_FI_DEV_THERMAL_VIOLATION for the first GPU, runs a 10 second
    'diagnostic' diag on fake GPUs and expects the error message for that GPU to mention
    thermal violations "totaling 2.3 seconds" (the injected value is presumably in
    nanoseconds - confirm against the field definition).

    handle -- DCGM handle used for the injection and for executing the diag
    gpuIds -- GPU ids to run the diag on; only gpuIds[0] receives the injected value
    """
    diag = DcgmDiag.DcgmDiag(gpuIds=gpuIds,
                             testNamesStr='diagnostic',
                             paramsStr='diagnostic.test_duration=10')
    diag.UseFakeGpus()

    targetGpu = gpuIds[0]
    violationField = dcgm_fields.DCGM_FI_DEV_THERMAL_VIOLATION
    violationValue = 2344122048
    inject_value(handle, targetGpu, violationField, violationValue, 10, True)

    # The injected value has to be readable from DCGM before the diag is launched
    valueVisible = dcgm_internal_helpers.verify_field_value(
        targetGpu, violationField, violationValue, maxWait=5, numMatches=1)
    assert valueVisible, "Expected inserted values to be visible in DCGM"

    # Run the diag and pull out the error text reported for the injected GPU
    response = diag.Execute(handle)
    diagResult = response.perGpuResponses[targetGpu].results[dcgm_structs.DCGM_DIAGNOSTIC_INDEX]
    errmsg = diagResult.error.msg

    # Match on "hermal" rather than "thermal" because the first letter is sometimes capitalized
    assert "hermal violations" in errmsg, \
        "Thermal violations were injected but not found in error message: '%s'." % errmsg
    assert "totaling 2.3 seconds" in errmsg, \
        "Expected 2.3 seconds of thermal violations but found %s" % errmsg
Exemple #4
0
def helper_per_gpu_responses_api(handle, gpuIds, testDir):
    """
    Check through the diag API that throttling failures are attributed to individual GPUs.

    After a baseline run via helper_verify_diag_passing(), a HW_SLOWDOWN throttle reason and a
    GPU temperature of 1000 are injected for gpuIds[0] and an SM Stress diag is executed for
    that GPU. The response must show a throttling failure for gpuIds[0] and none for every
    other id in gpuIds.

    handle  -- DCGM handle forwarded to the injection helper and the diag wrapper
    gpuIds  -- GPU ids to inspect in the response; the first one is the GPU made to fail
    testDir -- directory handed to SetStatsPath() on the diag object
    """
    failGpuId = gpuIds[0]
    stressIndex = dcgm_structs.DCGM_SM_STRESS_INDEX
    throttleField = dcgm_fields.DCGM_FI_DEV_CLOCK_THROTTLE_REASONS
    slowdownReason = dcgm_fields.DCGM_CLOCKS_THROTTLE_REASON_HW_SLOWDOWN

    # Baseline run; its return value is not needed because a new diag object is built below
    helper_verify_diag_passing(handle, gpuIds, useFakeGpus=True)

    stressDiag = DcgmDiag.DcgmDiag(gpuIds=[failGpuId],
                                   testNamesStr="SM Stress",
                                   paramsStr="sm stress.test_duration=15",
                                   version=dcgm_structs.dcgmRunDiag_version)
    # Throttle mask 0: this test inserts throttling errors and wants the diag to fail on them
    stressDiag.SetThrottleMask(0)
    stressDiag.UseFakeGpus()
    stressDiag.SetStatsPath(testDir)
    stressDiag.SetStatsOnFail(1)

    # Inject the errors, using an offset so that they start after the benign values
    inject_value(handle, failGpuId, throttleField, slowdownReason,
                 injection_offset, True)
    inject_value(handle, failGpuId, dcgm_fields.DCGM_FI_DEV_GPU_TEMP, 1000,
                 injection_offset, True)

    # The injected throttle reason has to be readable from DCGM before the diag starts
    errorVisible = dcgm_internal_helpers.verify_field_value(
        failGpuId, throttleField, slowdownReason,
        checkInterval=0.1, maxWait=5, numMatches=1)
    assert errorVisible, "Expected inserted values to be visible in DCGM"

    response = test_utils.diag_execute_wrapper(stressDiag, handle)
    logger.info("Started diag")

    # Only the first GPU may report a throttling failure; all other ids must be clean
    for currentGpu in gpuIds:
        sawThrottle, detail = find_throttle_failure(response, currentGpu, stressIndex)
        if currentGpu != failGpuId:
            assert not sawThrottle, "Expected not to find a throttling error but found '%s'" % detail
            continue
        assert sawThrottle, "Expected throttling error but found none (%s)" % detail
Exemple #5
0
def helper_test_dcgm_diag_dbe_insertion(handle, gpuIds, testDir):
    """
    Run the 'diagnostic' diag while an injection thread keeps inserting an error value.

    Despite the name, the value injected here is a HW_SLOWDOWN clock throttle reason for
    gpuIds[0]. The helper only asserts that the injected value becomes visible and that the
    injection thread finished with DCGM_ST_OK; the diag response itself is not inspected.

    handle  -- DCGM handle used by the injection thread and for executing the diag
    gpuIds  -- GPU ids to run the diag on; the first one receives the injected error
    testDir -- directory handed to SetStatsPath() on the diag object

    Raises AssertionError when the injected value is not visible or the injection failed.
    """
    dd = DcgmDiag.DcgmDiag(gpuIds=gpuIds,
                           testNamesStr='diagnostic',
                           paramsStr='diagnostic.test_duration=30')
    dd.SetStatsPath(testDir)
    dd.SetStatsOnFail(1)

    def run(dd):
        # Thread target: execute the diag and discard the response
        dd.Execute(handle)

    # Setup injection app
    fieldId = dcgm_fields.DCGM_FI_DEV_CLOCK_THROTTLE_REASONS
    insertedError = dcgm_fields.DCGM_CLOCKS_THROTTLE_REASON_HW_SLOWDOWN
    failGpuId = gpuIds[0]
    inject_error = dcgm_internal_helpers.InjectionThread(
        handle,
        failGpuId,
        fieldId,
        insertedError,
        offset=5)
    logger.info("Injecting HW_SLOWDOWN throttle error for GPU %s" % failGpuId)
    inject_error.start()

    # From here on the injection thread is running, so it must be stopped on every exit path;
    # otherwise a failed assertion below would leave it injecting values into later tests.
    try:
        # Verify that the inserted values are visible in DCGM before starting the diag.
        # The check uses the same constant that was injected instead of a bare literal.
        assert dcgm_internal_helpers.verify_field_value(failGpuId, fieldId, insertedError, maxWait=5), \
            "Expected inserted values to be visible"

        t = threading.Thread(target=run, args=[dd])
        logger.info("Started diag")
        t.start()

        # Wait for diag to finish
        logger.info("Waiting for diag to finish")
        t.join()
    finally:
        # Stop error insertion
        inject_error.Stop()
        inject_error.join()
        logger.info("Stopped error injection")

    assert inject_error.retCode == dcgm_structs.DCGM_ST_OK, "Error injection failed: %s" % inject_error.retCode
Exemple #6
0
def with_error_run(handle, gpuIds, name, testname, parms=None):
    """
    Runs the given test (testname) through dcgmi while an injection thread inserts an error value
    for the first GPU. name is the name of the plugin in nvvs (e.g. constant_perf).

    By default a GPU temperature value of 1000 is injected before dcgmi is started. For the
    busgrind plugin a PCIe REPLAY_COUNTER value of 1000 is injected instead, after a delay.

    Logs an error (but does not fail the test) if the dcgmi return code is not 226 (lower 8 bits of
    -30/DCGM_ST_NVVS_ERROR) which is expected since the test should fail due to inserted errors.

    Since busgrind/PCIe does a diff for the REPLAY_COUNTER field we need to insert errors after busgrind
    has read some zero values for the field. As a result, the hardcoded delay of 15 seconds must be
    adjusted on different systems (currently a delay of 15 seconds works for the bstolle-dgx machine).

    handle   -- DCGM handle used by the injection thread
    gpuIds   -- GPU ids passed to dcgmi via -i; gpuIds[0] receives the injected value
    name     -- nvvs plugin name; used in the output file names and to select the busgrind path
    testname -- value passed to dcgmi via -r
    parms    -- optional test parameters passed to dcgmi via -p; omitted when None

    Raises AssertionError when the injected value is not visible, nvvs is not seen running
    (busgrind path) or the injection thread did not finish with DCGM_ST_OK.
    """
    nvvs_debug_log = "/tmp/nvvs.log"  # written by dcgmi/nvvs, copied to log_file afterwards
    output_file = OUTPUT_DIR + "/dcgmi_%s_with_err_%s.json" % (name, gpuIds[0])
    log_file = OUTPUT_DIR + "/nvvs_%s_with_err_%s.log" % (name, gpuIds[0])
    gpu_list = ",".join(map(str, gpuIds))

    args = [
        "diag", "-r",
        "%s" % testname, "-i", gpu_list, "-j", "-v", "-d", "5",
        "--debugLogFile", nvvs_debug_log
    ]
    if parms is not None:
        args.extend(["-p", "%s" % parms])
    dcgmi = DcgmiApp(args=args)

    # Pick the field to inject and how long to wait after nvvs starts before injecting
    value = 1000
    if name == "busgrind":
        field_id = dcgm_fields.DCGM_FI_DEV_PCIE_REPLAY_COUNTER
        delay = 15
    else:
        field_id = dcgm_fields.DCGM_FI_DEV_GPU_TEMP
        delay = 0

    inject_error = InjectionThread(handle, gpuIds[0], field_id, value)
    if delay == 0:
        # No diff-based check in this plugin: errors can be in place before the diag starts
        inject_error.start()
        logger.info("Injecting errors now (field %s, value %s)" %
                    (field_id, value))
        assert dcgm_internal_helpers.verify_field_value(
            gpuIds[0], field_id, value)

    start = time.time()
    dcgmi.start(timeout=1500)  # 25min timeout
    logger.info("Started diag with args: %s" % args)

    # Some tests do a diff test for the field values so we must let them see 0 values first
    if delay != 0:
        running, _ = dcgm_internal_helpers.check_nvvs_process(
            want_running=True)
        assert running, "nvvs did not start"
        logger.info("Nvvs started after %.1fs" % (time.time() - start))
        time.sleep(delay)
        logger.info("Injecting errors now (field %s, value %s)" %
                    (field_id, value))
        inject_error.start()
        assert dcgm_internal_helpers.verify_field_value(gpuIds[0],
                                                        field_id,
                                                        value,
                                                        maxWait=3)

    retcode = dcgmi.wait()

    inject_error.Stop()
    inject_error.join()
    assert inject_error.retCode == dcgm_structs.DCGM_ST_OK

    copy_nvvs_log(nvvs_debug_log, log_file)
    # The process exit status only carries the low 8 bits of the DCGM return code
    expected_retcode = ctypes.c_uint8(dcgm_structs.DCGM_ST_NVVS_ERROR).value
    if retcode != expected_retcode:
        logger.error("Expected retcode to be %s, but retcode of dcgmi is %s" %
                     (expected_retcode, retcode))
    # Validate because dcgmi returns non zero when the diag fails (expected)
    dcgmi.validate()
    log_app_output_to_file(dcgmi, output_file)
Exemple #7
0
def verify_early_fail_checks_for_test(handle, gpuId, test_name, testIndex):
    """
    Helper method for verifying the fail early checks for the specified test.

    The helper runs the test four times on the given GPU:
      1. A short baseline run; the calling test is skipped if it does not pass.
      2. A run with fail early enabled and nothing injected; it must pass and must take at
         least 90% of the configured duration.
      3. A run with fail early enabled while XID errors are injected; it must finish within
         the join timeout below and must be reported as failed.
      4. A final run after injecting an XID value of 0; it must pass again.

    handle    -- DCGM handle forwarded to the diag wrapper and the injection helpers
    gpuId     -- id of the GPU to run the diag on and to inject values for
    test_name -- test name used for testNamesStr, as the parameter prefix and (with spaces
                 replaced by underscores) in the debug log file names
    testIndex -- index of the test in response.perGpuResponses[gpuId].results

    Raises AssertionError when any of the expectations above is not met.
    """
    if testIndex == dcgm_structs.DCGM_TARGETED_POWER_INDEX and not option_parser.options.developer_mode:
        # Skip this test since Targeted Power always fails when duration is less than 30 seconds
        test_utils.skip_test("Skipping fail early verification for Targeted Power test. Use developer mode "
                             "to run this test.")
    duration = 2 if testIndex != dcgm_structs.DCGM_TARGETED_POWER_INDEX else 30 # Prevent false failures due to min
                                                                                # duration requirements for Targeted Power
    paramsStr = "%s.test_duration=%s" % (test_name, duration)

    # One-element list used as an out-parameter: the diag thread stores its response in data[0]
    data = [None]
    def runDiag(dd, data): # Simple helper method to run a diag (used as thread target)
        data[0] = test_utils.diag_execute_wrapper(dd, handle)

    ###
    # First verify that the given test passes for the gpu.
    # If it doesn't pass, skip test and add note to check GPU health
    logger.info("Checking whether %s test passes on GPU %s" % (test_name, gpuId))
    dd = DcgmDiag.DcgmDiag(gpuIds=[gpuId], testNamesStr=test_name, paramsStr=paramsStr)
    test_name_no_spaces = test_name.replace(" ", "_")
    # Template for the per-run debug log; the trailing %s is filled with the run number (1-4)
    logname = '/tmp/nv_' + test_name_no_spaces + '%s.log'
    dd.SetDebugLogFile(logname % 1)
    dd.SetDebugLevel(5)
    response = test_utils.diag_execute_wrapper(dd, handle)
    if not check_diag_result_pass(response, gpuId, testIndex):
        test_utils.skip_test("Skipping because GPU %s does not pass %s test. "
                             "Please verify whether the GPU is healthy." % (gpuId, test_name))

    ###
    # Next, verify that the given test passes for the gpu when fail early checks are enabled and no errors are inserted
    logger.info("Checking whether %s test passes on GPU %s with fail early enabled" % (test_name, gpuId))
    duration = 15 if testIndex != dcgm_structs.DCGM_TARGETED_POWER_INDEX else 30 # Prevent false failures due to min
                                                                                 # duration requirements for Targeted Power
    paramsStr = "%s.test_duration=%s" % (test_name, duration)
    dd = DcgmDiag.DcgmDiag(gpuIds=[gpuId], testNamesStr=test_name, paramsStr=paramsStr)
    dd.SetFailEarly(checkInterval=2) # enable fail early checks
    dd.SetDebugLogFile(logname % 2)
    dd.SetDebugLevel(5)

    # Run the diag on a separate thread so that its wall-clock duration can be measured here
    result_thread = threading.Thread(target=runDiag, args=[dd, data])
    result_thread.start()

    # Ensure nvvs process has started
    running, debug_output = dcgm_internal_helpers.check_nvvs_process(want_running=True)
    assert running, "Nvvs process did not start within 10 seconds. pgrep output: %s" % debug_output

    # Timing starts only once nvvs has been seen running, so startup time is not counted
    start = time.time()
    result_thread.join()
    end = time.time()

    assert check_diag_result_pass(data[0], gpuId, testIndex), \
        "Expected %s test to pass with fail early enabled and no inserted errors" % test_name
    # Without injected errors the test must not stop early: allow 10% slack on the duration
    assert (end - start) >= duration * 0.9, \
        "Expected %s test to run for at least %ss, but it only ran for %ss." % (test_name, duration, end - start)

    ###
    # Verify fail early behavior by inserting an error.
    # Setup test parameters
    duration = 20 if testIndex != dcgm_structs.DCGM_TARGETED_POWER_INDEX else 30 # Prevent false failures due to min
                                                                                 # duration requirements for Targeted Power
    paramsStr = "%s.test_duration=%s" % (test_name, duration)
    response = None
    dd = DcgmDiag.DcgmDiag(gpuIds=[gpuId], testNamesStr=test_name, paramsStr=paramsStr)
    dd.SetFailEarly(checkInterval=2) # enable fail early checks
    dd.SetDebugLogFile(logname % 3)

    # Setup threads / processes
    xid_inject_val = 2
    result_thread = threading.Thread(target=runDiag, args=[dd, data])
    inject_error = dcgm_internal_helpers.InjectionThread(handle, gpuId,
        dcgm_fields.DCGM_FI_DEV_XID_ERRORS, xid_inject_val, offset=5)

    logger.info("Verifying fail early behavior for %s test by inserting XIDs." % test_name)
    # Start inserting errors
    inject_error.start()
    # Ensure that inserted errors are visible
    assert \
        dcgm_internal_helpers.verify_field_value(gpuId, dcgm_fields.DCGM_FI_DEV_XID_ERRORS,
                                                 xid_inject_val, checkInterval=0.1, numMatches=5), \
        "Expected inserted value for XIDs to be visible in DCGM"

    # Start test thread
    result_thread.start()
    # Ensure nvvs process has started
    running, debug_output = dcgm_internal_helpers.check_nvvs_process(want_running=True)
    assert running, "Nvvs process did not start within 10 seconds. pgrep output: %s" % debug_output
    start = time.time()
    
    # Give the test time to exit and verify that the test exits early
    # Test should exit within 75% of test duration if it is going to fail early. Ideally, it should exit within
    # 2 failure checks (~ 4 seconds of test start), but we provide bigger buffer to account for delays in starting
    # the test
    # NOTE(review): the join timeout below is a fixed 20 seconds, which equals the full duration
    # configured above for every test except Targeted Power (30s) - it is not 75% of the duration.
    result_thread.join(20)
    test_exited_early = not result_thread.is_alive() # Cache thread isAlive value until we verify it
    end = time.time()

    # Stop the injection app
    inject_error.Stop()
    inject_error.join()
    # Verify injection app stopped correctly
    assert inject_error.retCode == dcgm_structs.DCGM_ST_OK, \
        "There was an error inserting values into dcgm. Return code: %s" % inject_error.retCode

    if not test_exited_early:
        # Wait for the launched diag to end
        # (needed so that data[0] holds a response for the failure message below)
        result_thread.join()
        end = time.time()
    
    response = data[0]
    # Check whether test exited early
    assert test_exited_early, \
        "Expected %s test to exit early. Test took %ss to complete.\nGot result: %s (\ninfo: %s,\n warning: %s)" \
            % (test_name, (end - start),
               response.perGpuResponses[gpuId].results[testIndex].result,
               response.perGpuResponses[gpuId].results[testIndex].info,
               response.perGpuResponses[gpuId].results[testIndex].error.msg)

    # Verify the test failed
    # (the message below says "dbes", but the errors injected by this helper are XIDs)
    assert check_diag_result_fail(response, gpuId, testIndex), \
        "Expected %s test to fail due to injected dbes.\nGot result: %s (\ninfo: %s,\n warning: %s)" % \
            (test_name, response.perGpuResponses[gpuId].results[testIndex].result,
             response.perGpuResponses[gpuId].results[testIndex].info,
             response.perGpuResponses[gpuId].results[testIndex].error.msg)

    ###
    # Rerun the test to verify that the test passes now that there are no inserted errors
    duration = 30
    paramsStr = "%s.test_duration=%s" % (test_name, duration)

    logger.info("Verifying that test passes once xid errors are removed.")
    dd = DcgmDiag.DcgmDiag(gpuIds=[gpuId], testNamesStr=test_name, paramsStr=paramsStr)
    dd.SetFailEarly(checkInterval=3) # enable fail early checks
    dd.SetDebugLogFile(logname % 4)
    # Reset the XID error field by injecting a value of 0
    inject_value(handle, gpuId, dcgm_fields.DCGM_FI_DEV_XID_ERRORS, 0, 0)
    # Sleep to ensure no pending errors left
    time.sleep(10)

    response = test_utils.diag_execute_wrapper(dd, handle)
    # Verify the test passed
    assert check_diag_result_pass(response, gpuId, testIndex), \
        "Expected %s test to pass because there are no dbes\nGot result: %s (\ninfo: %s,\n warning: %s)" % \
            (test_name, response.perGpuResponses[gpuId].results[testIndex].result,
             response.perGpuResponses[gpuId].results[testIndex].info,
             response.perGpuResponses[gpuId].results[testIndex].error.msg)
Exemple #8
0
def helper_per_gpu_responses_dcgmi(handle, gpuIds, testName, testParams):
    """
    Check that dcgmi reports diag results per GPU, both in its plain stdout and in its JSON output.

    Before each of the two dcgmi runs a HW_SLOWDOWN throttle reason and a GPU temperature of 1000
    are injected for gpuIds[0]. The plain-text run must show a 'Stress' header, a failure line for
    gpuIds[0] and a 'Warning' on the line right after it. The JSON run must show the first result
    entry of the stress test as "Fail" for gpuIds[0] and the second result entry as "Pass".

    handle     -- DCGM handle forwarded to the injection helper
    gpuIds     -- GPU ids passed to dcgmi via -f; the first one is the GPU made to fail
    testName   -- value passed to dcgmi via -r and expected as the test name in the JSON output
    testParams -- value passed to dcgmi via -p
    """
    throttleField = dcgm_fields.DCGM_FI_DEV_CLOCK_THROTTLE_REASONS
    slowdownReason = dcgm_fields.DCGM_CLOCKS_THROTTLE_REASON_HW_SLOWDOWN
    pollInterval = 0.1

    def joined_stdout(app):
        # Every stdout line followed by a single space, concatenated into one string
        return "".join(line + " " for line in app.stdout_lines)

    def dump_output(app):
        # stdout goes to the info log as one string, stderr to the error log line by line
        logger.info(joined_stdout(app))
        for errLine in app.stderr_lines:
            logger.error(errLine)

    def run_and_expect_isolate_error(app):
        # Run dcgmi to completion and require the exit code of a failed diag
        app.start(timeout=40)
        logger.info("Started dcgmi diag with pid %s" % app.getpid())
        exitCode = app.wait()

        if test_utils.is_mig_incompatible_failure(joined_stdout(app)):
            app.validate()
            test_utils.skip_test(
                "Skipping this test because MIG is configured incompatibly (preventing access to the whole GPU)"
            )

        # The diag is expected to fail here; the code compared against is
        # DCGM_ST_NVVS_ISOLATE_ERROR reduced to an unsigned 8 bit value.
        wantedCode = c_uint8(dcgm_structs.DCGM_ST_NVVS_ISOLATE_ERROR).value
        if exitCode != wantedCode and (app.stderr_lines or app.stdout_lines):
            logger.info("dcgmi output:")
            dump_output(app)
        assert exitCode == wantedCode, \
            "Expected dcgmi diag to have retcode %s. Got return code %s" % (wantedCode, exitCode)
        app.validate()  # non-zero exit code must be validated

    def inject_errors_and_confirm():
        # Inject the throttle reason and temperature, using an offset so that these errors
        # start after the benign values, then wait for the throttle reason to show up in DCGM
        inject_value(handle, gpuIds[0], throttleField, slowdownReason,
                     injection_offset, True)
        inject_value(handle, gpuIds[0], dcgm_fields.DCGM_FI_DEV_GPU_TEMP, 1000,
                     injection_offset, True)
        assert dcgm_internal_helpers.verify_field_value(gpuIds[0], throttleField, slowdownReason, checkInterval=pollInterval, maxWait=5, numMatches=1), \
            "Expected inserted values to be visible in DCGM"

    def json_shows_per_gpu_results(report):
        # Same checks, in the same order, as a single short-circuiting condition would perform
        categories = report.get("DCGM GPU Diagnostic", {}).get("test_categories", [])
        if len(categories) != 2:
            return False
        stressCategory = report["DCGM GPU Diagnostic"]["test_categories"][1]
        if stressCategory.get("category", None) != "Stress":
            return False
        firstTest = stressCategory["tests"][0]
        if firstTest["name"] != testName:
            return False
        results = firstTest["results"]
        if len(results) < 2:
            return False
        if results[0]["gpu_ids"] != str(gpuIds[0]):
            return False
        if results[0]["status"] != "Fail":
            return False
        return results[1]["status"] == "Pass"

    # NOTE: a baseline helper_verify_diag_passing(handle, gpuIds, useFakeGpus=True) call is
    # commented out in the original helper and therefore not performed here either.

    ###
    # Plain stdout output
    inject_errors_and_confirm()

    gpuList = ",".join(str(gpuId) for gpuId in gpuIds)
    args = [
        "diag", "-r", testName, "-p", testParams, "-f", gpuList,
        "--throttle-mask", "0"
    ]
    dcgmiApp = DcgmiApp(args=args)

    logger.info("Verifying stdout output")
    run_and_expect_isolate_error(dcgmiApp)

    # Crude approximation of verifying the console output: walk the lines once, looking for the
    # 'Stress' header, then the failure line for the first GPU, then 'Warning' on the very next line
    fail_gpu_text = "Fail - GPU: %s" % gpuIds[0]
    remaining = iter(dcgmiApp.stdout_lines)
    stress_header_found = any("Stress" in line for line in remaining)
    fail_gpu_found = any(fail_gpu_text in line for line in remaining)
    warning_found = "Warning" in next(remaining, "")

    if not (stress_header_found and fail_gpu_found and warning_found):
        logger.info("dcgmi output:")
        dump_output(dcgmiApp)

    assert stress_header_found, "Expected to see 'Stress' header in output"
    assert fail_gpu_found, "Expected to see %s in output" % fail_gpu_text
    assert warning_found, "Expected to see 'Warning' in output after GPU failure text"

    ###
    # JSON output
    inject_errors_and_confirm()

    logger.info("Verifying JSON output")
    args.append("-j")
    dcgmiApp = DcgmiApp(args=args)
    run_and_expect_isolate_error(dcgmiApp)

    # Stop error insertion
    logger.info("Stopped error injection")

    # Verify per GPU results
    report = json.loads("\n".join(dcgmiApp.stdout_lines))
    verified = json_shows_per_gpu_results(report)

    if not verified:
        dump_output(dcgmiApp)

    assert verified, "dcgmi JSON output did not pass verification"