Code example #1
def test_dcgm_vgpu_config_get_validate(handle):
    """
    Validates structure version
    """
    handleObj = pydcgm.DcgmHandle(handle=handle)
    systemObj = handleObj.GetSystem()
    gpuIdList = systemObj.discovery.GetAllGpuIds()
    assert len(gpuIdList) >= 0, "Not able to find devices on the node for embedded case"

    groupId = dcgm_agent.dcgmGroupCreate(handle,
                                         dcgm_structs.DCGM_GROUP_DEFAULT,
                                         "test1")
    groupInfo = dcgm_agent.dcgmGroupGetInfo(handle, groupId)
    status_handle = dcgm_agent.dcgmStatusCreate()

    with test_utils.assert_raises(
            dcgmExceptionClass(dcgm_structs.DCGM_ST_VER_MISMATCH)):
        versionTest = 0  #invalid version
        ret = vtDcgmVgpuConfigGet(handle, groupId,
                                  dcgm_structs.DCGM_CONFIG_CURRENT_STATE,
                                  groupInfo.count, status_handle, versionTest)

    with test_utils.assert_raises(
            dcgmExceptionClass(dcgm_structs.DCGM_ST_VER_MISMATCH)):
        versionTest = 50  #random number version
        ret = vtDcgmVgpuConfigGet(handle, groupId,
                                  dcgm_structs.DCGM_CONFIG_CURRENT_STATE,
                                  groupInfo.count, status_handle, versionTest)
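
Most of the *_validate tests in this listing (e.g. code examples #2, #4, #6, #14-#16, #19, #21, #22, #27 and #30) repeat the pattern shown above: call the API once with version 0 (invalid) and once with an arbitrary bogus number such as 50, and expect DCGM_ST_VER_MISMATCH both times. Below is a minimal sketch of how that repetition could be factored into a shared helper; check_version_mismatch is a hypothetical name and is not part of the DCGM test tree.

def check_version_mismatch(apiCall, *args):
    """Hypothetical helper: call apiCall(*args, version) with invalid version
    values and assert that each call raises DCGM_ST_VER_MISMATCH."""
    for versionTest in (0, 50):  # 0 = invalid version, 50 = arbitrary bogus version
        with test_utils.assert_raises(
                dcgmExceptionClass(dcgm_structs.DCGM_ST_VER_MISMATCH)):
            apiCall(*args, versionTest)

With such a helper, the two blocks in code example #1 would reduce to a single call: check_version_mismatch(vtDcgmVgpuConfigGet, handle, groupId, dcgm_structs.DCGM_CONFIG_CURRENT_STATE, groupInfo.count, status_handle).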
Code example #2
def test_dcgm_policy_get_validate(handle):
    """
    Validates structure version
    """
    handleObj = pydcgm.DcgmHandle(handle=handle)
    systemObj = handleObj.GetSystem()
    gpuIdList = systemObj.discovery.GetAllGpuIds()
    assert len(gpuIdList) >= 0, "Not able to find devices on the node for embedded case"

    groupId = dcgm_agent.dcgmGroupCreate(handle,
                                         dcgm_structs.DCGM_GROUP_DEFAULT,
                                         "test1")
    status_handle = dcgm_agent.dcgmStatusCreate()
    count = 1

    diagLevel = dcgm_structs.DCGM_DIAG_LVL_SHORT

    with test_utils.assert_raises(
            dcgmExceptionClass(dcgm_structs.DCGM_ST_VER_MISMATCH)):
        versionTest = 0  #invalid version
        ret = vtDcgmPolicyGet(handle, groupId, count, status_handle,
                              versionTest)

    with test_utils.assert_raises(
            dcgmExceptionClass(dcgm_structs.DCGM_ST_VER_MISMATCH)):
        versionTest = 50  #random number version
        ret = vtDcgmPolicyGet(handle, groupId, count, status_handle,
                              versionTest)
Code example #3
def helper_check_profiling_environment(dcgmGroup):
    try:
        dcgmGroup.profiling.GetSupportedMetricGroups()
    except dcgm_structs.dcgmExceptionClass(dcgm_structs.DCGM_ST_PROFILING_NOT_SUPPORTED) as e:
        test_utils.skip_test(g_profNotSupportedErrorStr)
    except dcgm_structs.dcgmExceptionClass(dcgm_structs.DCGM_ST_MODULE_NOT_LOADED) as e:
        test_utils.skip_test(g_moduleNotLoadedErrorStr)
    except dcgm_structs.dcgmExceptionClass(dcgm_structs.DCGM_ST_NOT_SUPPORTED) as e:
        test_utils.skip_test(g_profNotSupportedErrorStr)
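
helper_check_profiling_environment acts as a guard at the top of profiling tests (see code example #25): if GetSupportedMetricGroups raises DCGM_ST_PROFILING_NOT_SUPPORTED, DCGM_ST_MODULE_NOT_LOADED or DCGM_ST_NOT_SUPPORTED, the whole test is skipped rather than failed. The sketch below shows where such a guard would typically sit; the test name and the final assertion are illustrative only.

def test_dcgm_prof_environment_guard(handle, gpuIds):
    # Hypothetical test skeleton showing where the environment guard sits
    dcgmHandle = pydcgm.DcgmHandle(handle=handle)
    dcgmSystem = dcgmHandle.GetSystem()
    dcgmGroup = dcgmSystem.GetGroupWithGpuIds('profgroup', gpuIds)

    # Skips the test early on SKUs/drivers where profiling is unavailable
    helper_check_profiling_environment(dcgmGroup)

    # Only reached on systems where profiling is supported
    metricGroups = dcgmGroup.profiling.GetSupportedMetricGroups()
    assert metricGroups is not None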
Code example #4
def test_dcgm_get_vgpu_instance_attributes_validate(handle, gpuIds):
    """
    Verifies that vGPU attributes are properly queried
    """

    with test_utils.assert_raises(
            dcgmExceptionClass(dcgm_structs.DCGM_ST_VER_MISMATCH)):
        versionTest = 0  #invalid version
        ret = vtDcgmGetVgpuInstanceAttributes(handle, gpuIds[0], versionTest)

    with test_utils.assert_raises(
            dcgmExceptionClass(dcgm_structs.DCGM_ST_VER_MISMATCH)):
        versionTest = 50  #random number version
        ret = vtDcgmGetVgpuInstanceAttributes(handle, gpuIds[0], versionTest)
Code example #5
File: test_starter.py Project: omertuc/DCGM
def test_dcgm_engine_watch_field_values(handle):
    """
    Verifies that cache manager can watch a field value
    """

    # Watch field so we can fetch it
    fieldId = dcgm_fields.DCGM_FI_DEV_NAME
    gpuId = 0

    try:
        fieldInfo = dcgm_agent_internal.dcgmGetCacheManagerFieldInfo(
            handle, gpuId, fieldId)
        numWatchersBefore = fieldInfo.numWatchers
    except dcgm_structs.dcgmExceptionClass(
            dcgm_structs.DCGM_ST_NOT_WATCHED) as e:
        numWatchersBefore = 0

    ret = dcgm_agent_internal.dcgmWatchFieldValue(handle, gpuId, fieldId,
                                                  10000000, 86400.0, 0)
    assert (ret == dcgm_structs.DCGM_ST_OK)

    fieldInfo = dcgm_agent_internal.dcgmGetCacheManagerFieldInfo(
        handle, gpuId, fieldId)
    assert fieldInfo.flags & dcgm_structs_internal.DCGM_CMI_F_WATCHED, "Expected watch. got flags %08X" % fieldInfo.flags

    numWatchersAfter = fieldInfo.numWatchers
    assert numWatchersAfter == numWatchersBefore + 1, "Expected 1 extra watcher. Before %d. After %d" % (
        numWatchersBefore, numWatchersAfter)
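
The watch above uses an update interval of 10,000,000 microseconds (10 s) and a max-keep age of 86,400 seconds (one day). Once the field is watched, the cached value can be read back; the continuation below is a hedged sketch that assumes the dcgm_agent_internal.dcgmGetLatestValuesForFields binding used elsewhere in this test tree and a string-typed value for DCGM_FI_DEV_NAME.

    # Hedged sketch: read the watched field back from the cache manager
    values = dcgm_agent_internal.dcgmGetLatestValuesForFields(handle, gpuId, [fieldId])
    assert values[0].status == dcgm_structs.DCGM_ST_OK
    logger.info("Device name: %s" % values[0].value.str)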
Code example #6
def test_dcgm_job_get_stats_validate(handle):
    """
    Validates structure version
    """

    jobid = "1"

    with test_utils.assert_raises(
            dcgmExceptionClass(dcgm_structs.DCGM_ST_VER_MISMATCH)):
        versionTest = 0  #invalid version
        ret = vtDcgmJobGetStats(handle, jobid, versionTest)

    with test_utils.assert_raises(
            dcgmExceptionClass(dcgm_structs.DCGM_ST_VER_MISMATCH)):
        versionTest = 50  #random number version
        ret = vtDcgmJobGetStats(handle, jobid, versionTest)
Code example #7
def helper_dcgm_verify_sync_boost_single_gpu(handle, gpuIds):
    handleObj = pydcgm.DcgmHandle(handle=handle)
    systemObj = handleObj.GetSystem()
    groupObj = systemObj.GetEmptyGroup("test1")

    ## Add first GPU to the group
    groupObj.AddGpu(gpuIds[0])
    gpuIds = groupObj.GetGpuIds()  #Only reference GPUs we are testing against

    ## Set the sync boost for the group
    config_values = dcgm_structs.c_dcgmDeviceConfig_v1()
    config_values.mEccMode = dcgmvalue.DCGM_INT32_BLANK
    config_values.mPerfState.syncBoost = 1
    config_values.mPerfState.targetClocks.memClock = dcgmvalue.DCGM_INT32_BLANK
    config_values.mPerfState.targetClocks.smClock = dcgmvalue.DCGM_INT32_BLANK
    config_values.mComputeMode = dcgmvalue.DCGM_INT32_BLANK
    config_values.mPowerLimit.type = dcgmvalue.DCGM_INT32_BLANK
    config_values.mPowerLimit.val = dcgmvalue.DCGM_INT32_BLANK

    #Config Set must return DCGM_ST_BADPARAM since we only have a single GPU
    with test_utils.assert_raises(
            dcgm_structs.dcgmExceptionClass(dcgm_structs.DCGM_ST_BADPARAM)):
        groupObj.config.Set(config_values)

    groupObj.Delete()
Code example #8
File: DcgmReader.py Project: NVIDIA/DCGM
    def GetLatestGpuValuesAsDict(self, mapById):
        systemDictionary = {}

        with self.m_lock:
            try:
                self.Reconnect()
                fvs = self.m_dcgmGroup.samples.GetLatest(
                    self.m_fieldGroup).values
                for gpuId in fvs.keys():
                    systemDictionary[gpuId] = {}  # initialize the gpu's dictionary
                    gpuFv = fvs[gpuId]

                    for fieldId in gpuFv.keys():
                        val = gpuFv[fieldId][-1]

                        if val.isBlank:
                            continue

                        if mapById == False:
                            fieldTag = self.m_fieldIdToInfo[fieldId].tag
                            systemDictionary[gpuId][fieldTag] = val.value
                        else:
                            systemDictionary[gpuId][fieldId] = val.value
            except dcgm_structs.dcgmExceptionClass(
                    dcgm_structs.DCGM_ST_CONNECTION_NOT_VALID):
                self.LogError(
                    "Can't connect to nv-hostengine. Please verify that it is running."
                )
                self.SetDisconnected()

        return systemDictionary
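
GetLatestGpuValuesAsDict returns a nested dictionary keyed by GPU id; the inner keys are human-readable field tags when mapById is False and numeric field ids when it is True. A minimal consumer sketch follows; the DcgmReader constructor arguments and the Shutdown call are assumptions based on the same module.

# Hedged sketch: build a reader, then walk the dictionary it returns
reader = DcgmReader(fieldIds=[dcgm_fields.DCGM_FI_DEV_GPU_TEMP,
                              dcgm_fields.DCGM_FI_DEV_POWER_USAGE])
reader.Init()
latest = reader.GetLatestGpuValuesAsDict(mapById=False)
for gpuId, fields in latest.items():
    for fieldTag, value in fields.items():
        print("GPU %d %s = %s" % (gpuId, fieldTag, value))
reader.Shutdown()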
Code example #9
File: process_stats_stress.py Project: omertuc/DCGM
    def LoopOneIteration(self):
        for i in range(g_processesPerSecond):
            self.StartAppOnGpus()

        #How many PIDs should we buffer by? Below is 3 seconds worth
        pidBuffer = (3 * g_processesPerSecond * len(self.gpus)) + 1

        #Do we have any pids that have finished yet? Clean them up
        while len(self.addedPids) > pidBuffer:

            #Look up PID info on a random PID that should be done. Assuming 3 seconds is enough
            pidIndex = random.randint(0, len(self.addedPids) - pidBuffer)

            pidObj = self.addedPids[pidIndex]

            try:
                processStats = dcgm_agent.dcgmGetPidInfo(
                    self.heHandle, self.groupId, pidObj.pid)
                self.Log(
                    "Found pid stats for pid %d. gpuId %d. returned pid %d" %
                    (pidObj.pid, pidObj.gpuId, processStats.pid))
            except dcgmExceptionClass(dcgm_structs.DCGM_ST_NO_DATA):
                self.Log(
                    "Pid %d hasn't finished yet. Sleeping to allow cuda to catch up"
                    % pidObj.pid)
                time.sleep(1.0)
                break

            #Finalize the resources the app object watches
            pidObj.appObj.wait()
            #Delete the found pid so we don't run out of file handles
            del self.addedPids[pidIndex]
            pidObj = None
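
LoopOneIteration is designed to be driven at roughly one iteration per second: each call starts g_processesPerSecond apps per GPU and reaps at most one PID whose stats should already be available. A hypothetical driver loop is sketched below; everything except LoopOneIteration itself is an assumption.

    def Run(self, durationSecs=60.0):
        # Hypothetical driver: call LoopOneIteration at about 1 Hz for durationSecs
        endTime = time.time() + durationSecs
        while time.time() < endTime:
            loopStart = time.time()
            self.LoopOneIteration()
            # Keep the cadence regardless of how long the iteration took
            elapsed = time.time() - loopStart
            if elapsed < 1.0:
                time.sleep(1.0 - elapsed)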
Code example #10
File: test_dcgmi.py Project: omertuc/DCGM
def test_dcgmi_profile(handle, gpuIds):
    """
    Test DCGMI "profile" subcommand
    """
    dcgmHandle = pydcgm.DcgmHandle(handle=handle)
    dcgmSystem = dcgmHandle.GetSystem()
    dcgmGroup = dcgmSystem.GetGroupWithGpuIds('mygroup', gpuIds)

    # Creates a comma separated list of gpus
    allGpusCsv = ",".join(map(str, gpuIds))

    #See if these GPUs even support profiling. This will bail out for non-Tesla SKUs and for Pascal or older SKUs
    try:
        supportedMetrics = dcgmGroup.profiling.GetSupportedMetricGroups()
    except dcgm_structs.dcgmExceptionClass(
            dcgm_structs.DCGM_ST_PROFILING_NOT_SUPPORTED) as e:
        test_utils.skip_test("Profiling is not supported for gpuIds %s" %
                             str(gpuIds))
    except dcgm_structs.dcgmExceptionClass(
            dcgm_structs.DCGM_ST_MODULE_NOT_LOADED) as e:
        test_utils.skip_test("The profiling module could not be loaded")
    except dcgm_structs.dcgmExceptionClass(
            dcgm_structs.DCGM_ST_NOT_SUPPORTED) as e:
        test_utils.skip_test("The profiling module is not supported")

    ## keep args in this order. Changing it may break the test
    _test_valid_args([
        ["profile", "--list", "-i", allGpusCsv],
        ["profile", "--list", "-g",
         str(dcgmGroup.GetId().value)],
        ["profile", "--pause"],  #Pause followed by resume
        ["profile", "--resume"],
        ["profile", "--pause"],  #Double pause and double resume should be fine
        ["profile", "--pause"],
        ["profile", "--resume"],
        ["profile", "--resume"],
    ])

    ## keep args in this order. Changing it may break the test
    _test_invalid_args([
        ["profile", "--list", "--pause",
         "--resume"],  #mutually exclusive flags
        ["profile", "--pause", "--resume"],  #mutually exclusive flags
        ["profile", "--list", "-i", "999"],  #Invalid gpuID
        ["profile", "--list", "-i", allGpusCsv + ",taco"],  #Invalid gpu at end
        ["profile", "--list", "-g", "999"],  #Invalid group
    ])
Code example #11
File: DcgmReader.py Project: NVIDIA/DCGM
    def Init(self, libpath=None):
        with self.m_lock:
            try:
                self.InitWrapped(path=libpath)
            except dcgm_structs.dcgmExceptionClass(
                    dcgm_structs.DCGM_ST_CONNECTION_NOT_VALID):
                self.LogError("Can't connect to nv-hostengine. Is it down?")
                self.SetDisconnected()
Code example #12
def test_dcgm_policy_negative_unregister_standalone(handle):
    """
    Verifies that the unregister function does not allow a bad groupId value
    """
    policy = pydcgm.DcgmGroupPolicy(pydcgm.DcgmHandle(handle), 9999, None)
    with test_utils.assert_raises(
            dcgmExceptionClass(dcgm_structs.DCGM_ST_NOT_CONFIGURED)):
        policy.Unregister(dcgm_structs.DCGM_POLICY_COND_DBE)
Code example #13
File: DcgmHandle.py Project: omertuc/DCGM
    def __init__(self, handle=None, ipAddress=None,
                 opMode=dcgm_structs.DCGM_OPERATION_MODE_AUTO, persistAfterDisconnect=False,
                 unixSocketPath=None, timeoutMs=0):
        '''
        Constructor

        handle is an existing handle from dcgmInit(). Pass None if you want this object to handle DCGM initialization for you
        ipAddress is the host to connect to. None = start embedded host engine
        opMode is a dcgm_structs.DCGM_OPERATION_MODE_* constant for how the host engine should run (embedded mode only)
        persistAfterDisconnect (TCP-IP connections only) is whether the host engine should persist all of our watches
                               after we disconnect. 1=persist our watches. 0=clean up after our connection
        unixSocketPath is the path to a unix socket on the local filesystem that the host engine is listening on.
                       This option is mutually exclusive with ipAddress
        timeoutMs is how long to wait for TCP/IP or Unix domain connections to establish in ms. 0=Default timeout (5000ms)
        '''
        self._handleCreated = False
        self._persistAfterDisconnect = persistAfterDisconnect
        
        if handle is not None:
            self.handle = handle
            return

        self._ipAddress = ipAddress
        
        #Can't provide both unix socket and ip address
        if ipAddress is not None and unixSocketPath is not None:
            raise dcgm_structs.dcgmExceptionClass(dcgm_structs.DCGM_ST_BADPARAM)

        #Initialize the DCGM client library
        dcgm_structs._dcgmInit()
        dcgm_agent.dcgmInit() #Not harmful to call this multiple times in a process

        #If neither ipAddress nor unixSocketPath are present, start an embedded host engine
        if ipAddress is None and unixSocketPath is None:
            self.handle = dcgm_agent.dcgmStartEmbedded(opMode)
            self.isEmbedded = True
            self._handleCreated = True
            return        
        
        #Set up connection parameters. We're connecting to something
        connectParams = dcgm_structs.c_dcgmConnectV2Params_v2()
        connectParams.version = dcgm_structs.c_dcgmConnectV2Params_version
        connectParams.timeoutMs = timeoutMs
        if self._persistAfterDisconnect:
            connectParams.persistAfterDisconnect = 1
        else:
            connectParams.persistAfterDisconnect = 0
        
        if ipAddress is not None:
            connectToAddress = ipAddress
            connectParams.addressIsUnixSocket = 0
        else:
            connectToAddress = unixSocketPath
            connectParams.addressIsUnixSocket = 1
        
        self.handle = dcgm_agent.dcgmConnect_v2(connectToAddress, connectParams)
        self.isEmbedded = False
        self._handleCreated = True
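
The constructor supports three mutually exclusive ways of obtaining a handle: wrapping an existing handle, starting an embedded host engine (no address given), or connecting to a remote host engine over TCP/IP or a Unix domain socket. A short usage sketch follows; the address, timeout and socket path are placeholders.

# Embedded host engine (no remote connection)
embeddedHandle = pydcgm.DcgmHandle()

# Remote host engine over TCP/IP, with a 5-second connection timeout
remoteHandle = pydcgm.DcgmHandle(ipAddress='127.0.0.1', timeoutMs=5000)

# Remote host engine over a local Unix domain socket (path is a placeholder)
socketHandle = pydcgm.DcgmHandle(unixSocketPath='/tmp/dcgm.socket')

# Supplying both ipAddress and unixSocketPath raises DCGM_ST_BADPARAM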
Code example #14
def test_dcgm_introspect_get_fields_memory_usage_validate(handle):
    """
    Validates structure version
    """
    introspectContext = dcgm_structs.c_dcgmIntrospectContext_v1()
    waitIfNoData = True

    with test_utils.assert_raises(
            dcgmExceptionClass(dcgm_structs.DCGM_ST_VER_MISMATCH)):
        versionTest = 0  #invalid version
        ret = vtDcgmIntrospectGetFieldsMemoryUsage(handle, introspectContext,
                                                   versionTest, waitIfNoData)

    with test_utils.assert_raises(
            dcgmExceptionClass(dcgm_structs.DCGM_ST_VER_MISMATCH)):
        versionTest = 50  #random number version
        ret = vtDcgmIntrospectGetFieldsMemoryUsage(handle, introspectContext,
                                                   versionTest, waitIfNoData)
Code example #15
def test_dcgm_introspect_get_hostengine_cpu_utilization_validate(handle):
    """
    Validates structure version
    """

    waitIfNoData = True

    with test_utils.assert_raises(
            dcgmExceptionClass(dcgm_structs.DCGM_ST_VER_MISMATCH)):
        versionTest = 0  #invalid version
        ret = vtDcgmIntrospectGetHostengineCpuUtilization(
            handle, versionTest, waitIfNoData)

    with test_utils.assert_raises(
            dcgmExceptionClass(dcgm_structs.DCGM_ST_VER_MISMATCH)):
        versionTest = 50  #random number version
        ret = vtDcgmIntrospectGetHostengineCpuUtilization(
            handle, versionTest, waitIfNoData)
Code example #16
def test_dcgm_introspect_get_field_exec_time_validate(handle):
    """
    Validates structure version
    """
    fieldId = dcgm_fields.DCGM_FI_DEV_GPU_TEMP
    waitIfNoData = True

    with test_utils.assert_raises(
            dcgmExceptionClass(dcgm_structs.DCGM_ST_VER_MISMATCH)):
        versionTest = 0  #invalid version
        ret = vtDcgmIntrospectGetFieldExecTime(handle, fieldId, versionTest,
                                               waitIfNoData)

    with test_utils.assert_raises(
            dcgmExceptionClass(dcgm_structs.DCGM_ST_VER_MISMATCH)):
        versionTest = 50  #random number version
        ret = vtDcgmIntrospectGetFieldExecTime(handle, fieldId, versionTest,
                                               waitIfNoData)
Code example #17
File: test_connection.py Project: omertuc/DCGM
def test_dcgm_connection_error_when_no_hostengine_exists():
    if not utils.is_bare_metal_system():
        test_utils.skip_test("Virtualization Environment not supported")

    with test_utils.assert_raises(
            dcgm_structs.dcgmExceptionClass(
                dcgm_structs.DCGM_ST_CONNECTION_NOT_VALID)):
        # use a TEST-NET (rfc5737) addr instead of loopback in case a local hostengine is running
        handle = pydcgm.DcgmHandle(ipAddress='192.0.2.0', timeoutMs=100)
Code example #18
File: test_starter.py Project: omertuc/DCGM
def helper_unwatch_field_values_public(handle, gpuIds):
    """
    Verifies that dcgm can unwatch a field value
    """
    fieldId = dcgm_fields.DCGM_FI_DEV_NAME
    fieldIds = [
        fieldId,
    ]

    handleObj = pydcgm.DcgmHandle(handle=handle)
    systemObj = handleObj.GetSystem()
    groupObj = systemObj.GetGroupWithGpuIds('mygroup', gpuIds)
    fieldGroup = pydcgm.DcgmFieldGroup(handleObj, "myfieldgroup", fieldIds)

    updateFreq = 10000000
    maxKeepAge = 86400
    maxKeepSamples = 0

    #These are all gpuId -> watcher count
    numWatchersBefore = {}
    numWatchersWithWatch = {}
    numWatchersAfter = {}

    #Get watch info before our test begins
    for gpuId in gpuIds:
        try:
            fieldInfo = dcgm_agent_internal.dcgmGetCacheManagerFieldInfo(
                handleObj.handle, gpuId, fieldId)
            numWatchersBefore[gpuId] = fieldInfo.numWatchers
        except dcgm_structs.dcgmExceptionClass(
                dcgm_structs.DCGM_ST_NOT_WATCHED) as e:
            numWatchersBefore[gpuId] = 0

    #Now watch the fields
    groupObj.samples.WatchFields(fieldGroup, updateFreq, maxKeepAge,
                                 maxKeepSamples)

    #Get watcher info after our watch and check it against before our watch
    for gpuId in gpuIds:
        fieldInfo = dcgm_agent_internal.dcgmGetCacheManagerFieldInfo(
            handleObj.handle, gpuId, fieldId)
        numWatchersWithWatch[gpuId] = fieldInfo.numWatchers
        assert numWatchersWithWatch[gpuId] == numWatchersBefore[gpuId] + 1,\
               "Watcher mismatch at gpuId %d, numWatchersWithWatch[gpuId] %d != numWatchersBefore[gpuId] %d + 1" %\
                (gpuId, numWatchersWithWatch[gpuId], numWatchersBefore[gpuId])

    #Unwatch fields
    groupObj.samples.UnwatchFields(fieldGroup)

    #Get watcher count after our unwatch. This should match our original watch count
    for gpuId in gpuIds:
        fieldInfo = dcgm_agent_internal.dcgmGetCacheManagerFieldInfo(
            handleObj.handle, gpuId, fieldId)
        numWatchersAfter[gpuId] = fieldInfo.numWatchers

    assert numWatchersBefore == numWatchersAfter, "Expected numWatchersBefore (%s) to match numWatchersAfter %s" %\
           (str(numWatchersBefore), str(numWatchersAfter))
Code example #19
def test_dcgm_connect_validate(handle, gpuIds):
    """
    Validates structure version
    """
    fieldGroupFieldIds = [
        dcgm_fields.DCGM_FI_DEV_GPU_TEMP,
    ]
    connectParams = dcgm_structs.c_dcgmConnectV2Params_v1()
    connectParams.persistAfterDisconnect = 0

    with test_utils.assert_raises(
            dcgmExceptionClass(dcgm_structs.DCGM_ST_VER_MISMATCH)):
        versionTest = 0  #invalid version
        ret = vtDcgmConnect_v2('localhost', connectParams, versionTest)

    with test_utils.assert_raises(
            dcgmExceptionClass(dcgm_structs.DCGM_ST_VER_MISMATCH)):
        versionTest = 50  #random number version
        ret = vtDcgmConnect_v2('localhost', connectParams, versionTest)
Code example #20
def test_dcgm_policy_negative_register_standalone(handle):
    """
    Verifies that the register function does not allow a bad groupId value
    """
    policy = pydcgm.DcgmGroupPolicy(pydcgm.DcgmHandle(handle), 9999, None)
    empty_c_callback = create_c_callback()  # must hold ref so func is not GC'ed before c api uses it
    with test_utils.assert_raises(
            dcgmExceptionClass(dcgm_structs.DCGM_ST_NOT_CONFIGURED)):
        policy.Register(dcgm_structs.DCGM_POLICY_COND_DBE, empty_c_callback)
Code example #21
def test_dcgm_health_check_validate(handle):
    """
    Validates structure version
    """
    handleObj = pydcgm.DcgmHandle(handle=handle)
    systemObj = handleObj.GetSystem()

    groupId = dcgm_agent.dcgmGroupCreate(handle,
                                         dcgm_structs.DCGM_GROUP_DEFAULT,
                                         "test1")

    with test_utils.assert_raises(
            dcgmExceptionClass(dcgm_structs.DCGM_ST_VER_MISMATCH)):
        versionTest = 0  #invalid version
        ret = vtDcgmHealthCheck(handle, groupId, versionTest)
    with test_utils.assert_raises(
            dcgmExceptionClass(dcgm_structs.DCGM_ST_VER_MISMATCH)):
        versionTest = 50  #random number version
        ret = vtDcgmHealthCheck(handle, groupId, versionTest)
Code example #22
def test_dcgm_field_group_get_all_validate(handle):
    """
    Validates structure version
    """
    handleObj = pydcgm.DcgmHandle(handle=handle)
    systemObj = handleObj.GetSystem()
    gpuIdList = systemObj.discovery.GetAllGpuIds()
    assert len(gpuIdList) >= 0, "Not able to find devices on the node for embedded case"

    with test_utils.assert_raises(
            dcgmExceptionClass(dcgm_structs.DCGM_ST_VER_MISMATCH)):
        versionTest = 0  #invalid version
        vtDcgmFieldGroupGetAll(handle, versionTest)

    with test_utils.assert_raises(
            dcgmExceptionClass(dcgm_structs.DCGM_ST_VER_MISMATCH)):
        versionTest = 50  #random number version
        vtDcgmFieldGroupGetAll(handle, versionTest)
Code example #23
    def verify_exit_code_on_signal(signum):
        # Ensure that host engine is ready to launch a new diagnostic
        dd = DcgmDiag.DcgmDiag(gpuIds=[gpuId], testNamesStr='1')
        success = False
        start = time.time()
        while not success and (time.time() - start) <= 3:
            try:
                response = test_utils.diag_execute_wrapper(dd, handle)
                success = True
            except dcgm_structs.dcgmExceptionClass(
                    dcgm_structs.DCGM_ST_DIAG_ALREADY_RUNNING):
                # Only acceptable error due to small race condition between the nvvs process exiting and
                # hostengine actually processing the exit. We try for a maximum of 3 seconds since this
                # should be rare and last only for a short amount of time
                time.sleep(1.5)

        diagApp = AppRunner(dcgmi_path,
                            args=[
                                "diag", "-r", "SM Stress", "-i",
                                "%s" % gpuId, "-d", "INFO", "--debugLogFile",
                                "/tmp/nvvs.log"
                            ])
        # Start the diag
        diagApp.start(timeout=40)
        logger.info("Launched dcgmi process with pid: %s" % diagApp.getpid())

        # Ensure diag is running before sending interrupt signal
        running, debug_output = dcgm_internal_helpers.check_nvvs_process(
            want_running=True, attempts=50)
        assert running, "The nvvs process did not start within 25 seconds: %s" % (
            debug_output)
        # There is a small race condition here - it is possible that the hostengine sends a SIGTERM before the
        # nvvs process has setup a signal handler, and so the nvvs process does not stop when SIGTERM is sent.
        # We sleep for 1 second to reduce the possibility of this scenario
        time.sleep(1)
        diagApp.signal(signum)
        retCode = diagApp.wait()
        # Check the return code and stdout/stderr output before asserting for better debugging info
        if retCode == 0:
            logger.error("Got retcode '%s' from launched diag." % retCode)
            if diagApp.stderr_lines or diagApp.stdout_lines:
                logger.info("dcgmi output:")
                for line in diagApp.stdout_lines:
                    logger.info(line)
                for line in diagApp.stderr_lines:
                    logger.error(line)
        assert retCode != 0, "Expected a non-zero exit code, but got 0"
        # Since the app returns a non zero exit code, we call the validate method to prevent false
        # failures from the test framework
        diagApp.validate()
        # Give the launched nvvs process 15 seconds to terminate.
        not_running, debug_output = dcgm_internal_helpers.check_nvvs_process(
            want_running=False, attempts=50)
        assert not_running, "The launched nvvs process did not terminate within 25 seconds. pgrep output:\n%s" \
                % debug_output
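
verify_exit_code_on_signal is a nested helper: it closes over handle, gpuId and dcgmi_path from the enclosing test. A hedged sketch of how the enclosing test might exercise it for several signals is shown below; the exact set of signals is an assumption.

    # Hedged sketch: run the helper for the signals the diag is expected to handle cleanly
    import signal
    for signum in (signal.SIGINT, signal.SIGHUP, signal.SIGQUIT, signal.SIGTERM):
        verify_exit_code_on_signal(signum)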
Code example #24
File: test_groupmgmt.py Project: omertuc/DCGM
def dcgm_group_test_default_group(handle, gpuIds):
    """
    Test that the default group cannot be deleted or manipulated, and that it returns all GPUs.

    Note that we're not using groupObj for some tests because it protects against operations on the default group
    """
    handleObj = pydcgm.DcgmHandle(handle=handle)
    systemObj = handleObj.GetSystem()
    groupObj = systemObj.GetDefaultGroup()

    gpuIdList = gpuIds
    assert len(gpuIdList) > 0, "Failed to get devices from the node"

    with test_utils.assert_raises(
            dcgmExceptionClass(dcgm_structs.DCGM_ST_NOT_CONFIGURED)):
        groupInfo = dcgm_agent.dcgmGroupGetInfo(handle, 9999)

    groupGpuIdList = groupObj.GetGpuIds()
    assert gpuIdList == groupGpuIdList, "Expected gpuId lists to match: %s != %s" % (
        str(gpuIdList), str(groupGpuIdList))
    groupEntityList = groupObj.GetEntities()
    gpuIdList2 = []
    for entity in groupEntityList:
        assert entity.entityGroupId == dcgm_fields.DCGM_FE_GPU, str(
            entity.entityGroupId)
        gpuIdList2.append(entity.entityId)
    assert gpuIdList == gpuIdList2, "Expected gpuId list to match entity list: %s != %s" % (
        str(gpuIdList), str(gpuIdList2))

    for gpuId in gpuIdList:
        with test_utils.assert_raises(
                dcgmExceptionClass(dcgm_structs.DCGM_ST_NOT_CONFIGURED)):
            ret = dcgm_agent.dcgmGroupRemoveDevice(
                handle, dcgm_structs.DCGM_GROUP_ALL_GPUS, gpuId)
        with test_utils.assert_raises(pydcgm.DcgmException):
            groupObj.RemoveGpu(gpuId)

    with test_utils.assert_raises(
            dcgmExceptionClass(dcgm_structs.DCGM_ST_NOT_CONFIGURED)):
        ret = dcgm_agent.dcgmGroupDestroy(handle,
                                          dcgm_structs.DCGM_GROUP_ALL_GPUS)
Code example #25
File: test_prof.py Project: omertuc/DCGM
def test_dcgm_prof_watch_fields_multi_user(handle, gpuIds):
    dcgmHandle = pydcgm.DcgmHandle(ipAddress="127.0.0.1")
    dcgmSystem = dcgmHandle.GetSystem()
    dcgmGroup = dcgmSystem.GetGroupWithGpuIds('mygroup', gpuIds)

    helper_check_profiling_environment(dcgmGroup)

    dcgmHandle2 = pydcgm.DcgmHandle(ipAddress="127.0.0.1")
    dcgmSystem2 = dcgmHandle2.GetSystem()
    dcgmGroup2 = dcgmSystem2.GetGroupWithGpuIds('mygroup2', gpuIds)

    helper_check_profiling_environment(dcgmGroup)

    fieldIds = helper_get_single_pass_field_ids(dcgmGroup)
    assert fieldIds is not None

    #Take ownership of the profiling watches
    dcgmGroup.profiling.WatchFields(fieldIds, 1000000, 3600.0, 0)

    with test_utils.assert_raises(
            dcgm_structs.dcgmExceptionClass(dcgm_structs.DCGM_ST_IN_USE)):
        dcgmGroup2.profiling.WatchFields(fieldIds, 1000000, 3600.0, 0)
    with test_utils.assert_raises(
            dcgm_structs.dcgmExceptionClass(dcgm_structs.DCGM_ST_IN_USE)):
        dcgmGroup2.profiling.UnwatchFields()

    #Release the watches
    dcgmGroup.profiling.UnwatchFields()

    #Now dcgmHandle2 owns the watches
    dcgmGroup2.profiling.WatchFields(fieldIds, 1000000, 3600.0, 0)

    #connection 1 should fail to acquire the watches
    with test_utils.assert_raises(
            dcgm_structs.dcgmExceptionClass(dcgm_structs.DCGM_ST_IN_USE)):
        dcgmGroup.profiling.WatchFields(fieldIds, 1000000, 3600.0, 0)
    with test_utils.assert_raises(
            dcgm_structs.dcgmExceptionClass(dcgm_structs.DCGM_ST_IN_USE)):
        dcgmGroup.profiling.UnwatchFields()

    dcgmHandle.Shutdown()
    dcgmHandle2.Shutdown()
Code example #26
File: test_metadata.py Project: omertuc/DCGM
def _assert_metadata_not_configured_failure(handle):
    """
    Verifies that:
    1. metadata gathering is disabled by default 
    2. an appropriate error is raised when metadata APIs are accessed but 
       metadata gathering is disabled.
    """
    system = pydcgm.DcgmSystem(pydcgm.DcgmHandle(handle))

    with test_utils.assert_raises(dcgm_structs.dcgmExceptionClass(dcgm_structs.DCGM_ST_NOT_CONFIGURED)):
        memoryInfo = system.introspect.memory.GetForAllFields()
Code example #27
def test_dcgm_get_pid_info_validate(handle, gpuIds):
    """
    Validates structure version
    """

    pidList = StartAppOnGpus(handle)
    groupId = dcgm_agent.dcgmGroupCreate(handle,
                                         dcgm_structs.DCGM_GROUP_DEFAULT,
                                         "test1")

    for pid in pidList:
        with test_utils.assert_raises(
                dcgmExceptionClass(dcgm_structs.DCGM_ST_VER_MISMATCH)):
            versionTest = 0  #invalid version
            ret = vtDcgmGetPidInfo(handle, groupId, pid, versionTest)

        with test_utils.assert_raises(
                dcgmExceptionClass(dcgm_structs.DCGM_ST_VER_MISMATCH)):
            versionTest = 50  #random number version
            ret = vtDcgmGetPidInfo(handle, groupId, pid, versionTest)
Code example #28
def helper_dcgm_verify_sync_boost_multi_gpu(handle, gpuIds):
    handleObj = pydcgm.DcgmHandle(handle=handle)
    systemObj = handleObj.GetSystem()
    groupObj = systemObj.GetEmptyGroup("test1")

    if len(gpuIds) < 2:
        test_utils.skip_test(
            "This test only works with 2 or more identical GPUs")

    ## Add all identical GPUs to the group
    for gpuId in gpuIds:
        groupObj.AddGpu(gpuId)

    gpuIds = groupObj.GetGpuIds()  #Only reference GPUs we are testing against

    ## Set the sync boost for the group
    config_values = dcgm_structs.c_dcgmDeviceConfig_v1()
    config_values.mEccMode = dcgmvalue.DCGM_INT32_BLANK
    config_values.mPerfState.syncBoost = 1
    config_values.mPerfState.targetClocks.memClock = dcgmvalue.DCGM_INT32_BLANK
    config_values.mPerfState.targetClocks.smClock = dcgmvalue.DCGM_INT32_BLANK
    config_values.mComputeMode = dcgmvalue.DCGM_INT32_BLANK
    config_values.mPowerLimit.type = dcgmvalue.DCGM_INT32_BLANK
    config_values.mPowerLimit.val = dcgmvalue.DCGM_INT32_BLANK

    #Attempt to enable sync boost - expected to fail with DCGM_ST_NOT_SUPPORTED
    with test_utils.assert_raises(
            dcgm_structs.dcgmExceptionClass(
                dcgm_structs.DCGM_ST_NOT_SUPPORTED)):
        groupObj.config.Set(config_values)

    config_values.mPerfState.syncBoost = 0

    #Attempt to disable sync boost - expected to fail with DCGM_ST_NOT_SUPPORTED
    with test_utils.assert_raises(
            dcgm_structs.dcgmExceptionClass(
                dcgm_structs.DCGM_ST_NOT_SUPPORTED)):
        groupObj.config.Set(config_values)

    groupObj.Delete()
Code example #29
File: test_connection.py Project: omertuc/DCGM
def test_dcgm_connection_client_cleanup(handle, gpuIds):
    '''
    Make sure that resources that were allocated by a client are cleaned up
    '''
    fieldGroupFieldIds = [
        dcgm_fields.DCGM_FI_DEV_GPU_TEMP,
    ]

    #Get a 2nd connection which we'll check for cleanup. Use the raw APIs so we can explicitly cleanup
    connectParams = dcgm_structs.c_dcgmConnectV2Params_v1()
    connectParams.version = dcgm_structs.c_dcgmConnectV2Params_version
    connectParams.persistAfterDisconnect = 0
    cleanupHandle = dcgm_agent.dcgmConnect_v2('localhost', connectParams)

    groupName = 'clientcleanupgroup'
    groupId = dcgm_agent.dcgmGroupCreate(cleanupHandle,
                                         dcgm_structs.DCGM_GROUP_EMPTY,
                                         groupName)

    fieldGroupName = 'clientcleanupfieldgroup'
    fieldGroupId = dcgm_agent.dcgmFieldGroupCreate(cleanupHandle,
                                                   fieldGroupFieldIds,
                                                   fieldGroupName)

    #Disconnect our second handle. This should cause the cleanup to occur
    dcgm_agent.dcgmDisconnect(cleanupHandle)

    time.sleep(1.0)  #Allow connection cleanup to occur since it's asynchronous

    #Try to retrieve the field group info. This should throw an exception
    with test_utils.assert_raises(
            dcgm_structs.dcgmExceptionClass(dcgm_structs.DCGM_ST_NO_DATA)):
        fieldGroupInfo = dcgm_agent.dcgmFieldGroupGetInfo(handle, fieldGroupId)

    #Try to retrieve the group info. This should throw an exception
    with test_utils.assert_raises(
            dcgm_structs.dcgmExceptionClass(
                dcgm_structs.DCGM_ST_NOT_CONFIGURED)):
        groupInfo = dcgm_agent.dcgmGroupGetInfo(handle, groupId)
Code example #30
def test_dcgm_vgpu_config_set_validate(handle):
    """
    Validates structure version
    """

    groupId = dcgm_agent.dcgmGroupCreate(handle,
                                         dcgm_structs.DCGM_GROUP_DEFAULT,
                                         "test1")
    status_handle = dcgm_agent.dcgmStatusCreate()
    config_values = dcgm_structs.c_dcgmDeviceConfig_v1()

    with test_utils.assert_raises(
            dcgmExceptionClass(dcgm_structs.DCGM_ST_VER_MISMATCH)):
        versionTest = 0  #invalid version
        ret = vtDcgmVgpuConfigSet(handle, groupId, config_values,
                                  status_handle, versionTest)

    with test_utils.assert_raises(
            dcgmExceptionClass(dcgm_structs.DCGM_ST_VER_MISMATCH)):
        versionTest = 50  #random invalid version
        ret = vtDcgmVgpuConfigSet(handle, groupId, config_values,
                                  status_handle, versionTest)