Example #1
def test_dcgm_embedded_metadata_exectime_aggregate_is_sum_of_gpu_and_global(
        handle):
    """
    Ensure that when execution time is retrieved relating to fields that the "global" and "gpu" 
    values add up to the "aggregate" value
    """
    handle = pydcgm.DcgmHandle(handle)
    system = pydcgm.DcgmSystem(handle)
    group = pydcgm.DcgmGroup(handle,
                             groupName="metadata-test",
                             groupType=dcgm_structs.DCGM_GROUP_DEFAULT)

    # watch a ton of fields so that we know that some are being stored
    test_utils.watch_all_fields(handle.handle,
                                group.GetGpuIds(),
                                updateFreq=100000)
    system.introspect.UpdateAll()

    execTimeInfo = system.introspect.execTime.GetForAllFields()

    gpuExecTime = sum(
        info.totalEverUpdateUsec
        for info in execTimeInfo.gpuInfo[:execTimeInfo.gpuInfoCount])

    if execTimeInfo.hasGlobalInfo:
        globalExecTime = execTimeInfo.globalInfo.totalEverUpdateUsec
    else:
        globalExecTime = 0

    assert (
        execTimeInfo.aggregateInfo.totalEverUpdateUsec == globalExecTime +
        gpuExecTime
    ), ('aggregate for all fields reports %s usec but GPUs report %s usec and '
        'global reports %s usec. GPUs + global should sum to aggregate.'
        % (execTimeInfo.aggregateInfo.totalEverUpdateUsec, gpuExecTime,
           globalExecTime))
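When this invariant fails, the per-instance breakdown can be dumped the same way Example #2 below does for memory. A minimal debugging sketch that could be dropped into the test above, using only the struct fields it already reads:

# Debugging sketch (mirrors the logging in Example #2); assumes the same
# execTimeInfo layout used in the test above.
if execTimeInfo.hasGlobalInfo:
    logger.debug('global exec time info: %s' % (execTimeInfo.globalInfo))
for i in range(execTimeInfo.gpuInfoCount):
    logger.debug('gpu exec time info %s: %s' % (i, execTimeInfo.gpuInfo[i]))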
Example #2
def test_dcgm_embedded_metadata_memory_aggregate_is_sum_of_gpu_and_global(
        handle):
    """
    Ensure that when memory info is retrieved relating to fields that the "global" and "gpu" 
    values add up to the "aggregate" value
    """
    handle = pydcgm.DcgmHandle(handle)
    system = pydcgm.DcgmSystem(handle)
    group = pydcgm.DcgmGroup(handle,
                             groupName="metadata-test",
                             groupType=dcgm_structs.DCGM_GROUP_DEFAULT)

    # watch a ton of fields so that we know that some are being stored
    test_utils.watch_all_fields(handle.handle,
                                group.GetGpuIds(),
                                updateFreq=100000)
    system.introspect.UpdateAll()

    memoryInfo = system.introspect.memory.GetForAllFields()

    gpuMemory = sum(mem.bytesUsed
                    for mem in memoryInfo.gpuInfo[:memoryInfo.gpuInfoCount])

    globalMemory = memoryInfo.globalInfo.bytesUsed if memoryInfo.hasGlobalInfo else 0

    if memoryInfo.hasGlobalInfo:
        logger.debug('global mem info: %s' % (memoryInfo.globalInfo))

    for i in range(memoryInfo.gpuInfoCount):
        logger.debug('gpu mem info %s: %s' % (i, memoryInfo.gpuInfo[i]))

    assert (memoryInfo.aggregateInfo.bytesUsed == gpuMemory + globalMemory), (
        'aggregate for all fields reports %s bytes but a sum of GPU and global '
        'reports %s bytes. These values should be equal.'
        % (memoryInfo.aggregateInfo.bytesUsed, gpuMemory + globalMemory))
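Examples #1 and #2 assert the same invariant (aggregate == global + sum over GPUs) on different attributes. A hypothetical helper that factors the check out, sketched under the assumption that only the struct fields shown above exist (aggregateInfo, gpuInfo, gpuInfoCount, hasGlobalInfo, globalInfo):

def _assert_aggregate_is_global_plus_gpu(info, attr):
    # Hypothetical helper, not part of DCGM: verify that the aggregate value
    # of 'attr' equals the global value plus the sum of the per-GPU values.
    gpuTotal = sum(
        getattr(gpu, attr) for gpu in info.gpuInfo[:info.gpuInfoCount])
    globalVal = getattr(info.globalInfo, attr) if info.hasGlobalInfo else 0
    aggregateVal = getattr(info.aggregateInfo, attr)
    assert aggregateVal == globalVal + gpuTotal, (
        'aggregate %s is %s but global + GPUs sum to %s'
        % (attr, aggregateVal, globalVal + gpuTotal))

# Usage with the values from the two tests above:
#   _assert_aggregate_is_global_plus_gpu(execTimeInfo, 'totalEverUpdateUsec')
#   _assert_aggregate_is_global_plus_gpu(memoryInfo, 'bytesUsed')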
Example #3
def test_dcgm_embedded_metadata_memory_get_all_fields_sane(handle):
    """
    Sanity test for API that gets memory usage of all fields together
    """
    if not option_parser.options.developer_mode:
        test_utils.skip_test("Skipping developer test.")
    handle = pydcgm.DcgmHandle(handle)
    system = pydcgm.DcgmSystem(handle)
    group = pydcgm.DcgmGroup(handle,
                             groupName="metadata-test",
                             groupType=dcgm_structs.DCGM_GROUP_DEFAULT)

    # watch a ton of fields so that we know that some are being stored
    test_utils.watch_all_fields(handle.handle,
                                group.GetGpuIds(),
                                updateFreq=1000)
    system.introspect.UpdateAll()

    memoryInfo = system.introspect.memory.GetForAllFields().aggregateInfo

    logger.debug('All fields in hostengine are using %.2f MB' %
                 (memoryInfo.bytesUsed / 1024. / 1024.))

    assert (1024 * 20 < memoryInfo.bytesUsed <
            100 * 1024 * 1024), memoryInfo.bytesUsed  # 20 KB to 100 MB
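The sanity window reads more clearly with named bounds. A sketch of the same assertion with the identical values; the constant names are assumptions, not anything defined in the test suite:

# Same bounds as the assert above, with hypothetical names for readability.
SANE_MEMORY_LOWER_BYTES = 20 * 1024          # 20 KB
SANE_MEMORY_UPPER_BYTES = 100 * 1024 * 1024  # 100 MB
assert (SANE_MEMORY_LOWER_BYTES < memoryInfo.bytesUsed <
        SANE_MEMORY_UPPER_BYTES), memoryInfo.bytesUsed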
Example #4
def test_dcgm_embedded_metadata_exectime_get_all_fields_sane(handle):
    """
    Sanity test for API that gets execution time of all fields together
    """
    if not option_parser.options.developer_mode:
        test_utils.skip_test("Skipping developer test.")
    handle = pydcgm.DcgmHandle(handle)
    system = pydcgm.DcgmSystem(handle)
    group = pydcgm.DcgmGroup(handle,
                             groupName="metadata-test",
                             groupType=dcgm_structs.DCGM_GROUP_DEFAULT)

    # watch a ton of fields so that we know that some are being stored
    updateFreqUsec = 1000
    test_utils.watch_all_fields(handle.handle,
                                group.GetGpuIds(),
                                updateFreq=updateFreqUsec)
    system.introspect.UpdateAll()
    
    execTime = system.introspect.execTime.GetForAllFields().aggregateInfo
    
    perGpuSane = 300 * 1000  # 300 ms
    activeGpuCount = test_utils.get_live_gpu_count(handle.handle)
    saneLimit = perGpuSane * activeGpuCount

    # test that all struct fields in the API response have reasonable values
    assert (100 < execTime.totalEverUpdateUsec < saneLimit), (
        'execution time seems way too long for a system with %s gpus. Took %s ms. Sane limit: %s ms'
        % (activeGpuCount, execTime.totalEverUpdateUsec / 1000, saneLimit / 1000))

    assert (100 < execTime.recentUpdateUsec < saneLimit), (
        'recent update time seems way too long for a system with %s gpus. Took %s ms. Sane limit: %s ms'
        % (activeGpuCount, execTime.recentUpdateUsec / 1000, saneLimit / 1000))

    assert (updateFreqUsec - 1 <= execTime.meanUpdateFreqUsec <=
            updateFreqUsec + 1), execTime.meanUpdateFreqUsec
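As a concrete check of the sane-limit arithmetic: the budget scales linearly with GPU count, so a hypothetical 4-GPU system gets 1.2 s. A worked sketch (the GPU count here is an assumption, not queried from a real system):

# Worked example of the limit arithmetic above.
perGpuSane = 300 * 1000            # 300 ms per GPU, in usec
activeGpuCount = 4                 # hypothetical GPU count
saneLimit = perGpuSane * activeGpuCount
print(saneLimit)                   # 1200000 usec == 1.2 s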
Example #5
def _sumMetadata(handle, getForFieldFn, getForAllFieldsFn, metaAttr):
    '''
    Return a 3-tuple where the first entry is the sum of the metadata values over
    every watched field, the second entry is the all-fields total taken before
    that summation, and the third entry is the all-fields total taken after it.
    '''
    system = pydcgm.DcgmSystem(pydcgm.DcgmHandle(handle))
    group = pydcgm.DcgmGroup(pydcgm.DcgmHandle(handle),
                             groupName="test-metadata",
                             groupType=dcgm_structs.DCGM_GROUP_DEFAULT)

    # watch every field possible on all GPUs
    watchedFields = test_utils.watch_all_fields(handle, group.GetGpuIds())

    system.introspect.UpdateAll()

    # Get the total before and after to accommodate any slight changes
    # in total memory usage while the individual field amounts are being summed
    startVal = getattr(getForAllFieldsFn().aggregateInfo, metaAttr)

    aggregateVal = sum(
        getattr(getForFieldFn(handle, fieldId).aggregateInfo, metaAttr)
        for fieldId in watchedFields)

    endVal = getattr(getForAllFieldsFn().aggregateInfo, metaAttr)

    return aggregateVal, startVal, endVal
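_sumMetadata is generic over its getter callbacks. A hedged usage sketch for memory: only GetForAllFields appears in these examples, so the per-field getter GetForField is an assumption about the introspection API, not a confirmed method:

# Hedged usage sketch. 'GetForField' is an assumed per-field getter; only
# GetForAllFields is demonstrated in the examples above.
system = pydcgm.DcgmSystem(pydcgm.DcgmHandle(handle))
aggregateVal, startVal, endVal = _sumMetadata(
    handle,
    getForFieldFn=lambda h, fieldId: system.introspect.memory.GetForField(fieldId),  # assumption
    getForAllFieldsFn=system.introspect.memory.GetForAllFields,
    metaAttr='bytesUsed')
# A caller would then check that the per-field sum lands near the
# before/after snapshots, e.g. within the [startVal, endVal] window.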
Example #6
def test_dcgm_standalone_perf_bounded(handle):
    '''
    Test that runs several subtests. When we bound the number of samples to keep for each field:
      - DCGM memory usage eventually flatlines at the field, field-group, all-fields, and process level.
      - DCGM memory usage is at a value that we expect (golden value). If what we
        expect changes over time then we must update these values (the tests will fail if we don't).

    Plots of the memory usage and execution time generated during this test are saved and the
    filename of each figure is output on the terminal.

    Multiple tests are bundled into this one test to save time by only gathering data once.
    '''
    if not option_parser.options.developer_mode:
        test_utils.skip_test("Skipping developer test.")

    handle = pydcgm.DcgmHandle(handle)
    group = pydcgm.DcgmGroup(handle,
                             groupName="metadata-test",
                             groupType=dcgm_structs.DCGM_GROUP_DEFAULT)
    # 1 second. Needs to be long enough for all fields on all GPUs to update,
    # or the record density will vary based on CPU consumption.
    updateFreq = 1000000

    watchedFieldIds = test_utils.watch_all_fields(
        handle.handle,
        group.GetGpuIds(),
        updateFreq,
        maxKeepAge=0.0,  # use maxKeepEntries only to enforce the quota
        maxKeepEntries=10)

    memUsageTS, execTimeTS, execTimeAvgTS, cpuUtilTS = _gather_perf_timeseries(
        handle, watchedFieldIds)
    activeGpuCount = test_utils.get_live_gpu_count(handle.handle)

    # run the actual tests on the gathered data

    # test that memory usage flatlines
    test_utils.run_subtest(_test_mem_bounded_flatlines_fields, memUsageTS)
    test_utils.run_subtest(_test_mem_bounded_flatlines_fieldgroups, memUsageTS)
    test_utils.run_subtest(_test_mem_bounded_flatlines_allfields, memUsageTS)
    test_utils.run_subtest(_test_mem_bounded_flatlines_process, memUsageTS)

    # test that memory usage is at an expected level (golden value)
    # the tail end of the series should be VERY close to the end since we compare the mean
    # of the tail to the golden value
    tailStart = int(0.8 * len(memUsageTS.timestamps))
    test_utils.run_subtest(_test_mem_bounded_golden_values_fields,
                           activeGpuCount, memUsageTS, tailStart)
    test_utils.run_subtest(_test_mem_bounded_golden_values_allfields,
                           activeGpuCount, memUsageTS, tailStart,
                           len(watchedFieldIds))
    test_utils.run_subtest(_test_mem_bounded_golden_values_process, memUsageTS,
                           tailStart, len(watchedFieldIds))

    # tests for CPU utilization (see functions for descriptions)
    test_utils.run_subtest(_test_cpuutil_bounded_flatlines_hostengine,
                           cpuUtilTS)

    # test that execution time grows at a linear rate
    #test_utils.run_subtest(_test_exectime_bounded_linear_growth, execTimeTS)

    # make some pretty graphs to look at for insight or to help debug failures
    _generate_metadata_line_charts(memUsageTS,
                                   ylabel='bytes',
                                   title='Bytes Used')
    _generate_metadata_line_charts(execTimeTS,
                                   ylabel='usec',
                                   title='Execution Time')
    _generate_metadata_line_charts(execTimeAvgTS,
                                   ylabel='usec',
                                   title='Recent Exec Time')
    _generate_cpu_line_charts(cpuUtilTS)

    barPlotPoints = [(fieldId, execTimeAvgTS.fieldVals[fieldId][-1])
                     for fieldId in execTimeAvgTS.fieldVals]
    _plotFinalValueOrderedBarChart(barPlotPoints,
                                   title='Top 20 Field Recent Exec Time',
                                   ylabel='usec',
                                   filenameBase='test-perf')
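The flatline subtests (_test_mem_bounded_flatlines_*) are referenced above but their bodies are not shown. A minimal sketch of one way such a check could work, treating a series as flatlined when its tail stays within a small relative band of the tail mean; the function name, tail fraction, and tolerance are all assumptions, not the actual subtest:

def _flatlines(values, tailFraction=0.2, relTolerance=0.05):
    # Hypothetical sketch, not the actual subtest: a series 'flatlines' if
    # every value in its tail is within relTolerance of the tail's mean.
    tailStart = int((1 - tailFraction) * len(values))
    tail = values[tailStart:]
    mean = float(sum(tail)) / len(tail)
    return all(abs(v - mean) <= relTolerance * mean for v in tail)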