Beispiel #1
0
def test_dcgm_vgpu_config_get_validate(handle):
    """
    Check that vtDcgmVgpuConfigGet rejects unsupported structure versions.

    A group is created with DCGM_GROUP_DEFAULT and the current-state vGPU
    config is requested with version 0 and then version 50; each call must
    raise the DCGM_ST_VER_MISMATCH exception.
    """
    dcgmHandle = pydcgm.DcgmHandle(handle=handle)
    dcgmSystem = dcgmHandle.GetSystem()
    allGpuIds = dcgmSystem.discovery.GetAllGpuIds()
    # NOTE(review): a length is never negative, so this check cannot fail.
    # Kept unchanged to preserve behavior -- confirm whether "> 0" was meant.
    assert len(allGpuIds) >= 0, \
        "Not able to find devices on the node for embedded case"

    groupId = dcgm_agent.dcgmGroupCreate(
        handle, dcgm_structs.DCGM_GROUP_DEFAULT, "test1")
    groupInfo = dcgm_agent.dcgmGroupGetInfo(handle, groupId)
    statusHandle = dcgm_agent.dcgmStatusCreate()

    # 0 is an invalid version; 50 is an arbitrary version number
    for versionTest in (0, 50):
        with test_utils.assert_raises(
                dcgmExceptionClass(dcgm_structs.DCGM_ST_VER_MISMATCH)):
            vtDcgmVgpuConfigGet(
                handle, groupId, dcgm_structs.DCGM_CONFIG_CURRENT_STATE,
                groupInfo.count, statusHandle, versionTest)
Beispiel #2
0
def test_dcgm_policy_get_validate(handle):
    """
    Check that vtDcgmPolicyGet rejects unsupported structure versions.

    The policy of a group created with DCGM_GROUP_DEFAULT is requested with
    version 0 and then version 50; each call must raise the
    DCGM_ST_VER_MISMATCH exception.
    """
    dcgmHandle = pydcgm.DcgmHandle(handle=handle)
    dcgmSystem = dcgmHandle.GetSystem()
    allGpuIds = dcgmSystem.discovery.GetAllGpuIds()
    # NOTE(review): a length is never negative, so this check cannot fail.
    # Kept unchanged to preserve behavior -- confirm whether "> 0" was meant.
    assert len(allGpuIds) >= 0, \
        "Not able to find devices on the node for embedded case"

    groupId = dcgm_agent.dcgmGroupCreate(
        handle, dcgm_structs.DCGM_GROUP_DEFAULT, "test1")
    statusHandle = dcgm_agent.dcgmStatusCreate()
    policyCount = 1

    # Not used by the calls below; the lookup is retained from the original.
    diagLevel = dcgm_structs.DCGM_DIAG_LVL_SHORT

    # 0 is an invalid version; 50 is an arbitrary version number
    for versionTest in (0, 50):
        with test_utils.assert_raises(
                dcgmExceptionClass(dcgm_structs.DCGM_ST_VER_MISMATCH)):
            vtDcgmPolicyGet(handle, groupId, policyCount, statusHandle,
                            versionTest)
Beispiel #3
0
def test_dcgm_get_vgpu_instance_attributes_validate(handle, gpuIds):
    """
    Check that vtDcgmGetVgpuInstanceAttributes rejects unsupported versions.

    The call is made for gpuIds[0] with version 0 and then version 50; each
    call must raise the DCGM_ST_VER_MISMATCH exception.
    """
    # 0 is an invalid version; 50 is an arbitrary version number
    for versionTest in (0, 50):
        with test_utils.assert_raises(
                dcgmExceptionClass(dcgm_structs.DCGM_ST_VER_MISMATCH)):
            vtDcgmGetVgpuInstanceAttributes(handle, gpuIds[0], versionTest)
Beispiel #4
0
def test_dcgm_job_get_stats_validate(handle):
    """
    Check that vtDcgmJobGetStats rejects unsupported structure versions.

    Stats for job id "1" are requested with version 0 and then version 50;
    each call must raise the DCGM_ST_VER_MISMATCH exception.
    """
    jobId = "1"

    # 0 is an invalid version; 50 is an arbitrary version number
    for versionTest in (0, 50):
        with test_utils.assert_raises(
                dcgmExceptionClass(dcgm_structs.DCGM_ST_VER_MISMATCH)):
            vtDcgmJobGetStats(handle, jobId, versionTest)
Beispiel #5
0
def helper_dcgm_verify_sync_boost_single_gpu(handle, gpuIds):
    """
    Verify that enabling sync boost on a one-GPU group raises DCGM_ST_BADPARAM.

    An empty group named "test1" receives gpuIds[0], a config with
    syncBoost = 1 (all other assigned settings blank) is applied to it, and
    the group is deleted once the expected exception has been observed.
    """
    dcgmHandle = pydcgm.DcgmHandle(handle=handle)
    dcgmSystem = dcgmHandle.GetSystem()
    singleGpuGroup = dcgmSystem.GetEmptyGroup("test1")

    # Only the first GPU goes into the group
    singleGpuGroup.AddGpu(gpuIds[0])
    gpuIds = singleGpuGroup.GetGpuIds()  # Only reference GPUs under test

    # Request sync boost and leave every other setting blank
    blank = dcgmvalue.DCGM_INT32_BLANK
    syncBoostConfig = dcgm_structs.c_dcgmDeviceConfig_v1()
    syncBoostConfig.mEccMode = blank
    syncBoostConfig.mComputeMode = blank
    syncBoostConfig.mPowerLimit.type = blank
    syncBoostConfig.mPowerLimit.val = blank
    syncBoostConfig.mPerfState.targetClocks.memClock = blank
    syncBoostConfig.mPerfState.targetClocks.smClock = blank
    syncBoostConfig.mPerfState.syncBoost = 1

    # With a single GPU in the group, Set must fail with DCGM_ST_BADPARAM
    badParamError = dcgm_structs.dcgmExceptionClass(
        dcgm_structs.DCGM_ST_BADPARAM)
    with test_utils.assert_raises(badParamError):
        singleGpuGroup.config.Set(syncBoostConfig)

    singleGpuGroup.Delete()
Beispiel #6
0
def test_dcgm_policy_negative_unregister_standalone(handle):
    """
    Check that Unregister on a policy object built with group id 9999 raises
    the DCGM_ST_NOT_CONFIGURED exception.
    """
    dcgmHandle = pydcgm.DcgmHandle(handle)
    badGroupPolicy = pydcgm.DcgmGroupPolicy(dcgmHandle, 9999, None)

    notConfiguredError = dcgmExceptionClass(
        dcgm_structs.DCGM_ST_NOT_CONFIGURED)
    with test_utils.assert_raises(notConfiguredError):
        badGroupPolicy.Unregister(dcgm_structs.DCGM_POLICY_COND_DBE)
Beispiel #7
0
def test_dcgm_connection_error_when_no_hostengine_exists():
    """
    Check that connecting to an address with no host engine raises the
    DCGM_ST_CONNECTION_NOT_VALID exception.

    The test is skipped when utils.is_bare_metal_system() is false.
    """
    if not utils.is_bare_metal_system():
        test_utils.skip_test("Virtualization Environment not supported")

    connectionError = dcgm_structs.dcgmExceptionClass(
        dcgm_structs.DCGM_ST_CONNECTION_NOT_VALID)
    with test_utils.assert_raises(connectionError):
        # A TEST-NET (rfc5737) address is used rather than loopback, since a
        # local hostengine might be running.
        unexpectedHandle = pydcgm.DcgmHandle(ipAddress='192.0.2.0',
                                             timeoutMs=100)
Beispiel #8
0
def test_dcgm_introspect_get_fields_memory_usage_validate(handle):
    """
    Check that vtDcgmIntrospectGetFieldsMemoryUsage rejects unsupported
    structure versions.

    The call is made with a default-constructed introspect context, with
    version 0 and then version 50; each call must raise the
    DCGM_ST_VER_MISMATCH exception.
    """
    context = dcgm_structs.c_dcgmIntrospectContext_v1()
    waitIfNoData = True

    # 0 is an invalid version; 50 is an arbitrary version number
    for versionTest in (0, 50):
        with test_utils.assert_raises(
                dcgmExceptionClass(dcgm_structs.DCGM_ST_VER_MISMATCH)):
            vtDcgmIntrospectGetFieldsMemoryUsage(
                handle, context, versionTest, waitIfNoData)
Beispiel #9
0
def test_dcgm_introspect_get_hostengine_cpu_utilization_validate(handle):
    """
    Check that vtDcgmIntrospectGetHostengineCpuUtilization rejects
    unsupported structure versions.

    The call is made with version 0 and then version 50; each call must
    raise the DCGM_ST_VER_MISMATCH exception.
    """
    waitIfNoData = True

    # 0 is an invalid version; 50 is an arbitrary version number
    for versionTest in (0, 50):
        with test_utils.assert_raises(
                dcgmExceptionClass(dcgm_structs.DCGM_ST_VER_MISMATCH)):
            vtDcgmIntrospectGetHostengineCpuUtilization(
                handle, versionTest, waitIfNoData)
Beispiel #10
0
def test_dcgm_introspect_get_field_exec_time_validate(handle):
    """
    Check that vtDcgmIntrospectGetFieldExecTime rejects unsupported
    structure versions.

    The call is made for DCGM_FI_DEV_GPU_TEMP with version 0 and then
    version 50; each call must raise the DCGM_ST_VER_MISMATCH exception.
    """
    tempFieldId = dcgm_fields.DCGM_FI_DEV_GPU_TEMP
    waitIfNoData = True

    # 0 is an invalid version; 50 is an arbitrary version number
    for versionTest in (0, 50):
        with test_utils.assert_raises(
                dcgmExceptionClass(dcgm_structs.DCGM_ST_VER_MISMATCH)):
            vtDcgmIntrospectGetFieldExecTime(
                handle, tempFieldId, versionTest, waitIfNoData)
Beispiel #11
0
def test_dcgm_policy_get_with_no_gpus_standalone(handle):
    """
    Check that fetching policies from a group created with DCGM_GROUP_EMPTY
    raises pydcgm.DcgmException.
    """
    dcgmHandle = pydcgm.DcgmHandle(handle)
    emptyGroup = pydcgm.DcgmGroup(
        dcgmHandle,
        groupType=dcgm_structs.DCGM_GROUP_EMPTY,
        groupName="test")

    with test_utils.assert_raises(pydcgm.DcgmException):
        emptyGroup.policy.Get()
Beispiel #12
0
def test_dcgm_policy_negative_register_standalone(handle):
    """
    Check that Register on a policy object built with group id 9999 raises
    the DCGM_ST_NOT_CONFIGURED exception.
    """
    dcgmHandle = pydcgm.DcgmHandle(handle)
    badGroupPolicy = pydcgm.DcgmGroupPolicy(dcgmHandle, 9999, None)

    # Keep a named reference so the callback is not garbage collected before
    # the C API uses it.
    callbackRef = create_c_callback()

    notConfiguredError = dcgmExceptionClass(
        dcgm_structs.DCGM_ST_NOT_CONFIGURED)
    with test_utils.assert_raises(notConfiguredError):
        badGroupPolicy.Register(dcgm_structs.DCGM_POLICY_COND_DBE,
                                callbackRef)
Beispiel #13
0
def test_dcgm_health_check_validate(handle):
    """
    Check that vtDcgmHealthCheck rejects unsupported structure versions.

    A group is created with DCGM_GROUP_DEFAULT and health-checked with
    version 0 and then version 50; each call must raise the
    DCGM_ST_VER_MISMATCH exception.
    """
    dcgmHandle = pydcgm.DcgmHandle(handle=handle)
    dcgmSystem = dcgmHandle.GetSystem()

    groupId = dcgm_agent.dcgmGroupCreate(
        handle, dcgm_structs.DCGM_GROUP_DEFAULT, "test1")

    # 0 is an invalid version; 50 is an arbitrary version number
    for versionTest in (0, 50):
        with test_utils.assert_raises(
                dcgmExceptionClass(dcgm_structs.DCGM_ST_VER_MISMATCH)):
            vtDcgmHealthCheck(handle, groupId, versionTest)
Beispiel #14
0
def test_dcgm_field_group_get_all_validate(handle):
    """
    Check that vtDcgmFieldGroupGetAll rejects unsupported structure versions.

    The call is made with version 0 and then version 50; each call must
    raise the DCGM_ST_VER_MISMATCH exception.
    """
    dcgmHandle = pydcgm.DcgmHandle(handle=handle)
    dcgmSystem = dcgmHandle.GetSystem()
    allGpuIds = dcgmSystem.discovery.GetAllGpuIds()
    # NOTE(review): a length is never negative, so this check cannot fail.
    # Kept unchanged to preserve behavior -- confirm whether "> 0" was meant.
    assert len(allGpuIds) >= 0, \
        "Not able to find devices on the node for embedded case"

    # 0 is an invalid version; 50 is an arbitrary version number
    for versionTest in (0, 50):
        with test_utils.assert_raises(
                dcgmExceptionClass(dcgm_structs.DCGM_ST_VER_MISMATCH)):
            vtDcgmFieldGroupGetAll(handle, versionTest)
Beispiel #15
0
def test_dcgm_connect_validate(handle, gpuIds):
    """
    Check that vtDcgmConnect_v2 rejects unsupported structure versions.

    A connection to 'localhost' is attempted with version 0 and then
    version 50; each attempt must raise the DCGM_ST_VER_MISMATCH exception.
    """
    # Not used by the calls below; retained from the original.
    fieldGroupFieldIds = [dcgm_fields.DCGM_FI_DEV_GPU_TEMP]

    params = dcgm_structs.c_dcgmConnectV2Params_v1()
    params.persistAfterDisconnect = 0

    # 0 is an invalid version; 50 is an arbitrary version number
    for versionTest in (0, 50):
        with test_utils.assert_raises(
                dcgmExceptionClass(dcgm_structs.DCGM_ST_VER_MISMATCH)):
            vtDcgmConnect_v2('localhost', params, versionTest)
Beispiel #16
0
def dcgm_group_test_default_group(handle, gpuIds):
    """
    Exercise the default group: it must report the GPUs in gpuIds and must
    refuse removal of its GPUs and its own destruction.

    Some checks go through raw dcgm_agent calls rather than the group object
    because, per the original note, the group object protects against
    operations on the default group.
    """
    dcgmHandle = pydcgm.DcgmHandle(handle=handle)
    dcgmSystem = dcgmHandle.GetSystem()
    defaultGroup = dcgmSystem.GetDefaultGroup()

    expectedGpuIds = gpuIds
    assert len(expectedGpuIds) > 0, "Failed to get devices from the node"

    # Looking up group id 9999 must fail
    with test_utils.assert_raises(
            dcgmExceptionClass(dcgm_structs.DCGM_ST_NOT_CONFIGURED)):
        dcgm_agent.dcgmGroupGetInfo(handle, 9999)

    # The group's GPU id list must equal the expected list
    reportedGpuIds = defaultGroup.GetGpuIds()
    assert expectedGpuIds == reportedGpuIds, \
        "Expected gpuId list match %s != %s" % (expectedGpuIds,
                                                reportedGpuIds)

    # Every entity must be a GPU, and the entity ids must equal the same list
    entityGpuIds = []
    for groupEntity in defaultGroup.GetEntities():
        assert groupEntity.entityGroupId == dcgm_fields.DCGM_FE_GPU, \
            str(groupEntity.entityGroupId)
        entityGpuIds.append(groupEntity.entityId)
    assert expectedGpuIds == entityGpuIds, \
        "Expected gpuId list to match entity list: %s != %s" % (
            expectedGpuIds, entityGpuIds)

    # Removing a GPU must fail through both the raw API and the group object
    for gpuId in expectedGpuIds:
        with test_utils.assert_raises(
                dcgmExceptionClass(dcgm_structs.DCGM_ST_NOT_CONFIGURED)):
            dcgm_agent.dcgmGroupRemoveDevice(
                handle, dcgm_structs.DCGM_GROUP_ALL_GPUS, gpuId)
        with test_utils.assert_raises(pydcgm.DcgmException):
            defaultGroup.RemoveGpu(gpuId)

    # Destroying the all-GPUs group must fail as well
    with test_utils.assert_raises(
            dcgmExceptionClass(dcgm_structs.DCGM_ST_NOT_CONFIGURED)):
        dcgm_agent.dcgmGroupDestroy(handle, dcgm_structs.DCGM_GROUP_ALL_GPUS)
Beispiel #17
0
def test_dcgm_prof_watch_fields_multi_user(handle, gpuIds):
    """
    Verify that profiling watches are held by one connection at a time.

    Two separate connections to 127.0.0.1 each create a group over gpuIds.
    While the first connection holds the watches, WatchFields and
    UnwatchFields on the second must raise DCGM_ST_IN_USE; after the first
    releases them the roles are swapped and the same is checked in reverse.

    Both connections are shut down even when a check fails or the test is
    skipped part-way through.
    """
    dcgmHandle = pydcgm.DcgmHandle(ipAddress="127.0.0.1")
    dcgmHandle2 = None
    try:
        dcgmSystem = dcgmHandle.GetSystem()
        dcgmGroup = dcgmSystem.GetGroupWithGpuIds('mygroup', gpuIds)

        helper_check_profiling_environment(dcgmGroup)

        dcgmHandle2 = pydcgm.DcgmHandle(ipAddress="127.0.0.1")
        dcgmSystem2 = dcgmHandle2.GetSystem()
        dcgmGroup2 = dcgmSystem2.GetGroupWithGpuIds('mygroup2', gpuIds)

        # Check the environment through the second connection's group; the
        # original re-checked the first group here, which looks like a
        # copy-paste slip.
        helper_check_profiling_environment(dcgmGroup2)

        fieldIds = helper_get_single_pass_field_ids(dcgmGroup)
        assert fieldIds is not None

        inUseError = dcgm_structs.dcgmExceptionClass(
            dcgm_structs.DCGM_ST_IN_USE)

        # Connection 1 takes ownership of the profiling watches
        dcgmGroup.profiling.WatchFields(fieldIds, 1000000, 3600.0, 0)

        # Connection 2 can neither watch nor unwatch while connection 1 owns them
        with test_utils.assert_raises(inUseError):
            dcgmGroup2.profiling.WatchFields(fieldIds, 1000000, 3600.0, 0)
        with test_utils.assert_raises(inUseError):
            dcgmGroup2.profiling.UnwatchFields()

        # Release the watches
        dcgmGroup.profiling.UnwatchFields()

        # Now connection 2 owns the watches
        dcgmGroup2.profiling.WatchFields(fieldIds, 1000000, 3600.0, 0)

        # Connection 1 must fail to acquire or release the watches
        with test_utils.assert_raises(inUseError):
            dcgmGroup.profiling.WatchFields(fieldIds, 1000000, 3600.0, 0)
        with test_utils.assert_raises(inUseError):
            dcgmGroup.profiling.UnwatchFields()
    finally:
        # Same shutdown order as the original success path
        dcgmHandle.Shutdown()
        if dcgmHandle2 is not None:
            dcgmHandle2.Shutdown()
Beispiel #18
0
def test_dcgm_get_pid_info_validate(handle, gpuIds):
    """
    Check that vtDcgmGetPidInfo rejects unsupported structure versions.

    For every pid returned by StartAppOnGpus, pid info is requested on a
    group created with DCGM_GROUP_DEFAULT, with version 0 and then
    version 50; each call must raise the DCGM_ST_VER_MISMATCH exception.
    """
    startedPids = StartAppOnGpus(handle)
    groupId = dcgm_agent.dcgmGroupCreate(
        handle, dcgm_structs.DCGM_GROUP_DEFAULT, "test1")

    for pid in startedPids:
        # 0 is an invalid version; 50 is an arbitrary version number
        for versionTest in (0, 50):
            with test_utils.assert_raises(
                    dcgmExceptionClass(dcgm_structs.DCGM_ST_VER_MISMATCH)):
                vtDcgmGetPidInfo(handle, groupId, pid, versionTest)
Beispiel #19
0
    def start(self, timeout=app_runner.default_timeout):
        """
        Run the parent class's start() and then scan stdout for the line
        "Assertion `false` failed".

        The scan is wrapped in test_utils.assert_raises(EOFError); per the
        original comment, stdout_readtillmatch raises EOFError when no
        matching line is found.
        """
        super(RunCudaAssert, self).start(timeout)

        def isAssertLine(line):
            # Exact match against the expected assertion message
            return line == "Assertion `false` failed"

        with test_utils.assert_raises(EOFError):
            self.stdout_readtillmatch(isAssertLine)
Beispiel #20
0
def _assert_metadata_not_configured_failure(handle):
    """
    Check that querying introspection memory info for all fields raises the
    DCGM_ST_NOT_CONFIGURED exception.

    Per the original description this covers two things: metadata gathering
    is disabled by default, and using the metadata APIs while it is disabled
    produces an appropriate error.
    """
    dcgmHandle = pydcgm.DcgmHandle(handle)
    dcgmSystem = pydcgm.DcgmSystem(dcgmHandle)

    notConfiguredError = dcgm_structs.dcgmExceptionClass(
        dcgm_structs.DCGM_ST_NOT_CONFIGURED)
    with test_utils.assert_raises(notConfiguredError):
        dcgmSystem.introspect.memory.GetForAllFields()
Beispiel #21
0
def helper_dcgm_verify_sync_boost_multi_gpu(handle, gpuIds):
    """
    Verify that toggling sync boost on a multi-GPU group is rejected with
    DCGM_ST_NOT_SUPPORTED.

    All of gpuIds are added to an empty group named "test1". A config with
    syncBoost = 1, then one with syncBoost = 0, is applied; both Set calls
    must raise the DCGM_ST_NOT_SUPPORTED exception. The test is skipped when
    fewer than two GPUs are supplied.

    The skip check runs before the group is created, and the group is
    deleted even when a check fails, so no "test1" group is left behind.
    """
    if len(gpuIds) < 2:
        test_utils.skip_test(
            "This test only works with 2 or more identical GPUs")

    handleObj = pydcgm.DcgmHandle(handle=handle)
    systemObj = handleObj.GetSystem()
    groupObj = systemObj.GetEmptyGroup("test1")

    try:
        ## Add all identical GPUs to the group
        for gpuId in gpuIds:
            groupObj.AddGpu(gpuId)

        gpuIds = groupObj.GetGpuIds()  #Only reference GPUs we are testing against

        ## Build a config that requests sync boost and leaves the rest blank
        config_values = dcgm_structs.c_dcgmDeviceConfig_v1()
        config_values.mEccMode = dcgmvalue.DCGM_INT32_BLANK
        config_values.mPerfState.syncBoost = 1
        config_values.mPerfState.targetClocks.memClock = dcgmvalue.DCGM_INT32_BLANK
        config_values.mPerfState.targetClocks.smClock = dcgmvalue.DCGM_INT32_BLANK
        config_values.mComputeMode = dcgmvalue.DCGM_INT32_BLANK
        config_values.mPowerLimit.type = dcgmvalue.DCGM_INT32_BLANK
        config_values.mPowerLimit.val = dcgmvalue.DCGM_INT32_BLANK

        #Enabling sync boost is expected to fail with DCGM_ST_NOT_SUPPORTED
        with test_utils.assert_raises(
                dcgm_structs.dcgmExceptionClass(
                    dcgm_structs.DCGM_ST_NOT_SUPPORTED)):
            groupObj.config.Set(config_values)

        config_values.mPerfState.syncBoost = 0

        #Disabling sync boost is expected to fail the same way
        with test_utils.assert_raises(
                dcgm_structs.dcgmExceptionClass(
                    dcgm_structs.DCGM_ST_NOT_SUPPORTED)):
            groupObj.config.Set(config_values)
    finally:
        groupObj.Delete()
Beispiel #22
0
def test_dcgm_connection_client_cleanup(handle, gpuIds):
    """
    Check that a group and a field group created over a second connection
    can no longer be retrieved after that connection is closed.

    After the disconnect and a one-second wait, field-group lookup through
    the original handle must raise DCGM_ST_NO_DATA and group lookup must
    raise DCGM_ST_NOT_CONFIGURED.
    """
    watchedFieldIds = [dcgm_fields.DCGM_FI_DEV_GPU_TEMP]

    # Open the second connection with the raw API so it can be closed
    # explicitly.
    params = dcgm_structs.c_dcgmConnectV2Params_v1()
    params.version = dcgm_structs.c_dcgmConnectV2Params_version
    params.persistAfterDisconnect = 0
    secondHandle = dcgm_agent.dcgmConnect_v2('localhost', params)

    # Create one group and one field group owned by the second connection
    groupId = dcgm_agent.dcgmGroupCreate(
        secondHandle, dcgm_structs.DCGM_GROUP_EMPTY, 'clientcleanupgroup')
    fieldGroupId = dcgm_agent.dcgmFieldGroupCreate(
        secondHandle, watchedFieldIds, 'clientcleanupfieldgroup')

    # Closing the second connection should trigger the cleanup
    dcgm_agent.dcgmDisconnect(secondHandle)

    # Connection cleanup is asynchronous, so give it time to run
    time.sleep(1.0)

    # The field group must be gone
    noDataError = dcgm_structs.dcgmExceptionClass(
        dcgm_structs.DCGM_ST_NO_DATA)
    with test_utils.assert_raises(noDataError):
        dcgm_agent.dcgmFieldGroupGetInfo(handle, fieldGroupId)

    # The group must be gone as well
    notConfiguredError = dcgm_structs.dcgmExceptionClass(
        dcgm_structs.DCGM_ST_NOT_CONFIGURED)
    with test_utils.assert_raises(notConfiguredError):
        dcgm_agent.dcgmGroupGetInfo(handle, groupId)
Beispiel #23
0
def test_dcgm_vgpu_config_set_validate(handle):
    """
    Check that vtDcgmVgpuConfigSet rejects unsupported structure versions.

    A default-constructed config is applied to a group created with
    DCGM_GROUP_DEFAULT, with version 0 and then version 50; each call must
    raise the DCGM_ST_VER_MISMATCH exception.
    """
    groupId = dcgm_agent.dcgmGroupCreate(
        handle, dcgm_structs.DCGM_GROUP_DEFAULT, "test1")
    statusHandle = dcgm_agent.dcgmStatusCreate()
    configValues = dcgm_structs.c_dcgmDeviceConfig_v1()

    # 0 is an invalid version; 50 is an arbitrary invalid version
    for versionTest in (0, 50):
        with test_utils.assert_raises(
                dcgmExceptionClass(dcgm_structs.DCGM_ST_VER_MISMATCH)):
            vtDcgmVgpuConfigSet(handle, groupId, configValues, statusHandle,
                                versionTest)
Beispiel #24
0
def test_dcgm_field_group_get_info_validate(handle):
    """
    Check that vtDcgmFieldGroupGetInfo rejects unsupported structure
    versions.

    A field group named "mygroup" is created over three fields, and its info
    is requested with version 0 and then version 50; each call must raise
    the DCGM_ST_VER_MISMATCH exception.
    """
    groupFieldIds = [
        dcgm_fields.DCGM_FI_DRIVER_VERSION,
        dcgm_fields.DCGM_FI_DEV_NAME,
        dcgm_fields.DCGM_FI_DEV_BRAND,
    ]
    dcgmHandle = pydcgm.DcgmHandle(handle)
    fieldGroup = pydcgm.DcgmFieldGroup(dcgmHandle, "mygroup", groupFieldIds)

    # 0 is an invalid version; 50 is an arbitrary version number
    for versionTest in (0, 50):
        with test_utils.assert_raises(
                dcgmExceptionClass(dcgm_structs.DCGM_ST_VER_MISMATCH)):
            vtDcgmFieldGroupGetInfo(dcgmHandle.handle,
                                    fieldGroup.fieldGroupId, versionTest)
Beispiel #25
0
def test_dcgm_get_device_attributes_validate(handle, gpuIds):
    """
    Check that vtDcgmGetDeviceAttributes rejects unsupported structure
    versions.

    The first GPU is placed in an empty group named "test1", all fields are
    updated, and attributes for the group's first GPU are requested with
    version 0 and then version 50; each call must raise the
    DCGM_ST_VER_MISMATCH exception.
    """
    dcgmHandle = pydcgm.DcgmHandle(handle=handle)
    dcgmSystem = dcgmHandle.GetSystem()
    testGroup = dcgmSystem.GetEmptyGroup("test1")

    # Only the first GPU goes into the group; from here on use the group's
    # own id list.
    testGroup.AddGpu(gpuIds[0])
    groupGpuIds = testGroup.GetGpuIds()

    # Make sure the device attributes and config fields have updated
    dcgmSystem.UpdateAllFields(1)

    # 0 is an invalid version; 50 is an arbitrary invalid version
    for versionTest in (0, 50):
        with test_utils.assert_raises(
                dcgmExceptionClass(dcgm_structs.DCGM_ST_VER_MISMATCH)):
            vtDcgmGetDeviceAttributes(handle, groupGpuIds[0], versionTest)
Beispiel #26
0
def helper_test_dcgm_injection_summaries(handle, gpuIds):
    """
    Inject ten values (0..9) for DCGM_FI_DEV_ECC_SBE_AGG_TOTAL on the first
    GPU and check the field summary computed from them.

    A summary over a window that ends before the first injected timestamp
    must raise DCGM_ST_NO_DATA. A summary requested with start and end of 0
    must return 0, 9, 4 and 9 in response values 0..3.
    """
    gpuId = gpuIds[0]
    sbeFieldId = dcgm_fields.DCGM_FI_DEV_ECC_SBE_AGG_TOTAL

    # The field has to be watched before values are injected into it
    dcgm_agent_internal.dcgmWatchFieldValue(handle, gpuId, sbeFieldId, 1,
                                            3600.0, 10000)

    dcgmHandle = pydcgm.DcgmHandle(handle=handle)
    dcgmSystem = dcgmHandle.GetSystem()

    # Template for the injected samples; only ts and value change per sample
    injected = dcgm_structs_internal.c_dcgmInjectFieldValue_v1()
    injected.version = dcgm_structs_internal.dcgmInjectFieldValue_version1
    injected.fieldId = sbeFieldId
    injected.status = 0
    injected.fieldType = ord(dcgm_fields.DCGM_FT_INT64)

    baseTime = get_usec_since_1970()

    for offset in range(10):
        injected.ts = baseTime + offset
        injected.value.i64 = offset
        status = dcgm_agent_internal.dcgmInjectFieldValue(handle, gpuId,
                                                          injected)
        assert status == dcgm_structs.DCGM_ST_OK

    time.sleep(1)

    dcgmSystem.UpdateAllFields(1)

    summaryMask = (dcgm_structs.DCGM_SUMMARY_MIN
                   | dcgm_structs.DCGM_SUMMARY_MAX
                   | dcgm_structs.DCGM_SUMMARY_AVG
                   | dcgm_structs.DCGM_SUMMARY_DIFF)

    # This window lies entirely before baseTime, so no sample falls in it
    noDataError = dcgm_structs.dcgmExceptionClass(
        dcgm_structs.DCGM_ST_NO_DATA)
    with test_utils.assert_raises(noDataError):
        dcgm_agent.dcgmGetFieldSummary(
            handle, sbeFieldId, dcgm_fields.DCGM_FE_GPU, gpuId, summaryMask,
            baseTime - 60, baseTime - 30)

    # Now adjust the time range so values are returned
    summary = dcgm_agent.dcgmGetFieldSummary(
        handle, sbeFieldId, dcgm_fields.DCGM_FE_GPU, gpuId, summaryMask, 0, 0)

    # presumably min, max, avg and diff in that order -- confirm against the
    # summary API
    for index, expected in enumerate((0, 9, 4, 9)):
        assert summary.response.values[index].i64 == expected
Beispiel #27
0
def test_connection_disconnect_error_after_shutdown():
    """
    Check that using a group after its handle has been shut down raises the
    DCGM_ST_BADPARAM exception.

    GetGpuIds is called once before Shutdown and once after; only the
    second call is expected to raise.
    """
    dcgmHandle = pydcgm.DcgmHandle()
    dcgmGroup = pydcgm.DcgmGroup(dcgmHandle, groupName='test-connection')

    # The call is made while the handle is still live; its result is unused
    idsBeforeShutdown = dcgmGroup.GetGpuIds()

    dcgmHandle.Shutdown()

    badParamError = dcgm_structs.dcgmExceptionClass(
        dcgm_structs.DCGM_ST_BADPARAM)
    with test_utils.assert_raises(badParamError):
        dcgmGroup.GetGpuIds()
Beispiel #28
0
def test_dcgm_action_run_diag_bad_validation(handle, gpuIds):
    """
    Check that dcgmActionValidate_v2 raises the DCGM_ST_BADPARAM exception
    when the run-diag struct carries an out-of-range validate value.

    The validate value used is DCGM_POLICY_VALID_SV_LONG + 1 and the GPU
    list is the comma-separated gpuIds.
    """
    gpuListStr = ",".join(str(gpuId) for gpuId in gpuIds)

    runDiag = dcgm_structs.c_dcgmRunDiag_t()
    runDiag.version = dcgm_structs.dcgmRunDiag_version
    # One past DCGM_POLICY_VALID_SV_LONG is used as the invalid value
    runDiag.validate = dcgm_structs.DCGM_POLICY_VALID_SV_LONG + 1
    # Set explicitly to 0 in case the constructor does not initialize it
    runDiag.groupId = 0
    runDiag.gpuList = gpuListStr

    badParamError = dcgm_structs.dcgmExceptionClass(
        dcgm_structs.DCGM_ST_BADPARAM)
    with test_utils.assert_raises(badParamError):
        dcgm_agent.dcgmActionValidate_v2(handle, runDiag,
                                         dcgm_structs.dcgmRunDiag_version)
Beispiel #29
0
def test_dcgm_modules_blacklist_introspection(handle):
    """
    Check that, once the introspection module is blacklisted, enabling
    introspection raises the DCGM_ST_MODULE_NOT_LOADED exception.
    """
    system = pydcgm.DcgmHandle(handle=handle).GetSystem() if False else None
    handleObj = pydcgm.DcgmHandle(handle=handle)
    system = handleObj.GetSystem()

    system.modules.Blacklist(dcgm_structs.DcgmModuleIdIntrospect)

    # Using the blacklisted introspection module must fail
    notLoadedError = dcgm_structs.dcgmExceptionClass(
        dcgm_structs.DCGM_ST_MODULE_NOT_LOADED)
    with test_utils.assert_raises(notLoadedError):
        system.introspect.state.toggle(
            dcgm_structs.DCGM_INTROSPECT_STATE.ENABLED)
Beispiel #30
0
def test_dcgm_modules_blacklist_health(handle):
    """
    Check that, once the health module is blacklisted, setting health
    watches on the default group raises the DCGM_ST_MODULE_NOT_LOADED
    exception.
    """
    handleObj = pydcgm.DcgmHandle(handle=handle)
    system = handleObj.GetSystem()
    defaultGroup = system.GetDefaultGroup()

    system.modules.Blacklist(dcgm_structs.DcgmModuleIdHealth)

    # Using the blacklisted health module must fail
    notLoadedError = dcgm_structs.dcgmExceptionClass(
        dcgm_structs.DCGM_ST_MODULE_NOT_LOADED)
    with test_utils.assert_raises(notLoadedError):
        defaultGroup.health.Set(dcgm_structs.DCGM_HEALTH_WATCH_ALL)