Esempio n. 1
0
def test_sdk_example_script_smoke_embedded_manual():
    """
    Smoke test for the dcgm SDK example script using an embedded hostengine
    in manual operation mode.

    The script is launched through AppRunner with the interpreter running
    this test, and is given SAMPLE_SCRIPT_TIMEOUT to complete.
    """
    example_script = os.path.join(sdk_sample_scripts_path, 'dcgm_example.py')
    script_args = [example_script, '--opmode=manual', '--type=embedded']

    # Hand the child interpreter the same module search path as this process
    # so it can import the same modules the test itself can.
    child_env = {'PYTHONPATH': ':'.join(sys.path)}

    runner = AppRunner(sys.executable, script_args, env=child_env)
    runner.run(timeout=SAMPLE_SCRIPT_TIMEOUT)
Esempio n. 2
0
    def verify_exit_code_on_signal(signum):
        """
        Launch a dcgmi diagnostic, interrupt it with a signal, and verify that
        dcgmi exits with a non-zero return code and that nvvs stops running.

        Sequence:
          1. Run a short diagnostic through the hostengine, retrying while it
             reports DCGM_ST_DIAG_ALREADY_RUNNING.
          2. Start "dcgmi diag -r 'SM Stress'" as a separate process.
          3. Wait until the nvvs process is reported as running.
          4. Send ``signum`` to the dcgmi process and wait for it to exit.
          5. Assert the return code is non-zero and that nvvs is no longer
             running.

        signum -- the signal handed to diagApp.signal().

        Uses ``gpuId`` and ``handle`` from the enclosing scope (defined outside
        this excerpt).
        """
        # Ensure that host engine is ready to launch a new diagnostic
        dd = DcgmDiag.DcgmDiag(gpuIds=[gpuId], testNamesStr='1')
        success = False
        start = time.time()
        # NOTE(review): if the retry window elapses without a successful run,
        # the loop simply ends and the test carries on - confirm this is intended.
        while not success and (time.time() - start) <= 3:
            try:
                # NOTE(review): 'response' is never read afterwards; only the fact
                # that the call did not raise matters here.
                response = test_utils.diag_execute_wrapper(dd, handle)
                success = True
            except dcgm_structs.dcgmExceptionClass(
                    dcgm_structs.DCGM_ST_DIAG_ALREADY_RUNNING):
                # Only acceptable error due to small race condition between the nvvs process exiting and
                # hostengine actually processing the exit. We try for a maximum of 3 seconds since this
                # should be rare and last only for a short amount of time
                time.sleep(1.5)

        diagApp = AppRunner(dcgmi_path,
                            args=[
                                "diag", "-r", "SM Stress", "-i",
                                "%s" % gpuId, "-d", "INFO", "--debugLogFile",
                                "/tmp/nvvs.log"
                            ])
        # Start the diag
        diagApp.start(timeout=40)
        logger.info("Launched dcgmi process with pid: %s" % diagApp.getpid())

        # Ensure diag is running before sending interrupt signal
        running, debug_output = dcgm_internal_helpers.check_nvvs_process(
            want_running=True, attempts=50)
        assert running, "The nvvs process did not start within 25 seconds: %s" % (
            debug_output)
        # There is a small race condition here - it is possible that the hostengine sends a SIGTERM before the
        # nvvs process has setup a signal handler, and so the nvvs process does not stop when SIGTERM is sent.
        # We sleep for 1 second to reduce the possibility of this scenario
        time.sleep(1)
        diagApp.signal(signum)
        retCode = diagApp.wait()
        # Check the return code and stdout/stderr output before asserting for better debugging info
        if retCode == 0:
            logger.error("Got retcode '%s' from launched diag." % retCode)
            if diagApp.stderr_lines or diagApp.stdout_lines:
                logger.info("dcgmi output:")
                for line in diagApp.stdout_lines:
                    logger.info(line)
                for line in diagApp.stderr_lines:
                    logger.error(line)
        assert retCode != 0, "Expected a non-zero exit code, but got 0"
        # Since the app returns a non zero exit code, we call the validate method to prevent false
        # failures from the test framework
        diagApp.validate()
        # Give the launched nvvs process time to terminate. The same attempts=50 budget is used as for
        # the start-up check above, which the assertion messages describe as 25 seconds.
        not_running, debug_output = dcgm_internal_helpers.check_nvvs_process(
            want_running=False, attempts=50)
        assert not_running, "The launched nvvs process did not terminate within 25 seconds. pgrep output:\n%s" \
                % debug_output
Esempio n. 3
0
def createBlacklistApp(numGpus=None, numSwitches=None, testNames=None, instantaneous=False):
    """
    Build an AppRunner that launches the standalone blacklist script with the
    current Python interpreter. The app is only constructed here, not started.

    numGpus, numSwitches -- when both are provided they are passed to the
        script as "-g <numGpus> -s <numSwitches>"; if either one is None the
        script is given "-d" instead.
    testNames -- value passed after "-r"; when empty or None,
        "memory bandwidth" is used.
    instantaneous -- when true, "-i" is passed and testNames is ignored.

    Returns the configured AppRunner instance.
    """
    args = ["./%s" % STANDALONE_BLACKLIST_SCRIPT_NAME]

    # Use identity checks for None (PEP 8); "== None" can be fooled by objects
    # that override __eq__.
    if numGpus is None or numSwitches is None:
        args.append("-d")
    else:
        args.extend(["-g", str(numGpus), "-s", str(numSwitches)])

    if instantaneous:
        args.append("-i")
    else:
        # Fall back to the default test when no test names were supplied
        args.extend(["-r", testNames or "memory bandwidth"])

    return AppRunner(sys.executable, args)