コード例 #1
0
ファイル: process_stats_stress.py プロジェクト: omertuc/DCGM
def main():
    """Run the ProcessStatsStress test and always clean up afterwards.

    Parses the command-line options with logging disabled, loads the DCGM
    library, and, unless g_embeddedMode is set, launches a standalone host
    engine application first. It then initializes DCGM, runs
    ProcessStatsStress, and finally waits on the host engine application
    if one was started.

    Any exception raised along the way propagates to the caller after the
    cleanup in the ``finally`` clause has run.
    """
    # Bound before the try block so the finally clause can always inspect
    # heHandle, even when option parsing itself raises.
    heHandle = None
    heAppRunner = None

    try:
        #Make sure logging stuff is bootstrapped
        option_parser.parse_options()
        option_parser.options.no_logging = True  #Don't log anything

        dcgm_structs._LoadDcgmLibrary()

        if not g_embeddedMode:
            #Start host engine
            heAppRunner = apps.NvHostEngineApp()
            heAppRunner.start(timeout=1000000000)
            # NOTE(review): presumably gives the host engine time to come
            # up before dcgmInit() is called - confirm.
            time.sleep(2.0)

        heHandle = dcgm_agent.dcgmInit()

        pssObj = ProcessStatsStress(g_embeddedMode, heHandle)
        pssObj.Run()
        del (pssObj)  #Force destructor

        # No host engine application exists in embedded mode, so there is
        # nothing to wait on in that case.
        if heAppRunner is not None:
            heAppRunner.wait()
    finally:
        apps.AppRunner.clean_all()
        # Only shut DCGM down if dcgmInit() actually succeeded.
        if heHandle is not None:
            dcgm_agent.dcgmShutdown()
コード例 #2
0
        assert config_values[x].mComputeMode  == expected_compute_mode, "The compute mode value for gpuID %d is incorrect."\
                                " Returned: %d Expected: %d" \
                                % (x, config_values[x].mComputeMode, expected_compute_mode)

        assert config_values[x].mEccMode  == expected_ecc, "The ecc mode value for gpuID %d is incorrect."\
                                " Returned: %d Expected: %d" \
                                % (x, config_values[x].mEccMode, expected_ecc)
        pass

    ret = dcgm_agent.dcgmStatusDestroy(status_handle)
    assert (ret == dcgm_structs.DCGM_ST_OK
            ), "Failed to remove status handler, error: %s" % ret


# Load the DCGM library and obtain a handle for the calls below.
dcgm_structs._LoadDcgmLibrary()
handle = dcgm_agent.dcgmInit()

# Keep only the devices whose latest DCGM_FI_DEV_RETIRED_DBE value is not
# the "not supported" sentinel. The exit message below treats support for
# this field as meaning the GPU has ECC.
devices = dcgm_agent.dcgmGetAllDevices(handle)
validDevices = list()
for x in devices:
    fvSupported = dcgm_agent_internal.dcgmGetLatestValuesForFields(
        handle, x, [
            dcgm_fields.DCGM_FI_DEV_RETIRED_DBE,
        ])
    if fvSupported[0].value.i64 != dcgmvalue.DCGM_INT64_NOT_SUPPORTED:
        validDevices.append(x)

# Nothing to test without at least one qualifying GPU.
if not validDevices:
    # Parenthesised single-argument form prints the same text under both
    # Python 2 and Python 3.
    print("Can only run if at least one GPU with ECC is present")
    sys.exit(1)