def test_dcgm_embedded_metadata_exectime_aggregate_is_sum_of_gpu_and_global(handle):
    """
    Ensure that when execution time is retrieved for fields, the "global" and
    "gpu" values add up to the "aggregate" value
    """
    handle = pydcgm.DcgmHandle(handle)
    system = pydcgm.DcgmSystem(handle)
    group = pydcgm.DcgmGroup(handle, groupName="metadata-test",
                             groupType=dcgm_structs.DCGM_GROUP_DEFAULT)

    # watch a ton of fields so that we know that some are being stored
    test_utils.watch_all_fields(handle.handle, group.GetGpuIds(), updateFreq=100000)
    system.introspect.UpdateAll()

    execTimeInfo = system.introspect.execTime.GetForAllFields()

    gpuExecTime = sum(info.totalEverUpdateUsec
                      for info in execTimeInfo.gpuInfo[:execTimeInfo.gpuInfoCount])

    if execTimeInfo.hasGlobalInfo:
        globalExecTime = execTimeInfo.globalInfo.totalEverUpdateUsec
    else:
        globalExecTime = 0

    assert execTimeInfo.aggregateInfo.totalEverUpdateUsec == globalExecTime + gpuExecTime, (
        'aggregate for all fields reports %s usec but GPUs report %s usec and global reports %s usec. '
        % (execTimeInfo.aggregateInfo.totalEverUpdateUsec, gpuExecTime, globalExecTime)
        + 'GPUs + global should sum to aggregate.')
def test_dcgm_embedded_metadata_memory_aggregate_is_sum_of_gpu_and_global(handle):
    """
    Ensure that when memory info is retrieved for fields, the "global" and "gpu"
    values add up to the "aggregate" value
    """
    handle = pydcgm.DcgmHandle(handle)
    system = pydcgm.DcgmSystem(handle)
    group = pydcgm.DcgmGroup(handle, groupName="metadata-test",
                             groupType=dcgm_structs.DCGM_GROUP_DEFAULT)

    # watch a ton of fields so that we know that some are being stored
    test_utils.watch_all_fields(handle.handle, group.GetGpuIds(), updateFreq=100000)
    system.introspect.UpdateAll()

    memoryInfo = system.introspect.memory.GetForAllFields()

    gpuMemory = sum(mem.bytesUsed for mem in memoryInfo.gpuInfo[:memoryInfo.gpuInfoCount])
    globalMemory = memoryInfo.globalInfo.bytesUsed if memoryInfo.hasGlobalInfo else 0

    if memoryInfo.hasGlobalInfo:
        logger.debug('global mem info: %s' % memoryInfo.globalInfo)
    for i in range(memoryInfo.gpuInfoCount):
        logger.debug('gpu mem info %s: %s' % (i, memoryInfo.gpuInfo[i]))

    assert memoryInfo.aggregateInfo.bytesUsed == gpuMemory + globalMemory, (
        'aggregate for all fields reports %s bytes but a sum of GPU and global reports %s bytes. '
        % (memoryInfo.aggregateInfo.bytesUsed, gpuMemory + globalMemory)
        + 'These values should be equal.')
def test_dcgm_embedded_metadata_memory_get_all_fields_sane(handle):
    """
    Sanity test for API that gets memory usage of all fields together
    """
    if not option_parser.options.developer_mode:
        test_utils.skip_test("Skipping developer test.")

    handle = pydcgm.DcgmHandle(handle)
    system = pydcgm.DcgmSystem(handle)
    group = pydcgm.DcgmGroup(handle, groupName="metadata-test",
                             groupType=dcgm_structs.DCGM_GROUP_DEFAULT)

    # watch a ton of fields so that we know that some are being stored
    test_utils.watch_all_fields(handle.handle, group.GetGpuIds(), updateFreq=1000)
    system.introspect.UpdateAll()

    memoryInfo = system.introspect.memory.GetForAllFields().aggregateInfo

    logger.debug('All fields in hostengine are using %.2f MB'
                 % (memoryInfo.bytesUsed / 1024. / 1024.))

    assert 1024 * 20 < memoryInfo.bytesUsed < 100 * 1024 * 1024, memoryInfo.bytesUsed  # 20 KB to 100 MB
def test_dcgm_embedded_metadata_exectime_get_all_fields_sane(handle):
    """
    Sanity test for API that gets execution time of all fields together
    """
    if not option_parser.options.developer_mode:
        test_utils.skip_test("Skipping developer test.")

    handle = pydcgm.DcgmHandle(handle)
    system = pydcgm.DcgmSystem(handle)
    group = pydcgm.DcgmGroup(handle, groupName="metadata-test",
                             groupType=dcgm_structs.DCGM_GROUP_DEFAULT)

    # watch a ton of fields so that we know that some are being stored
    updateFreqUsec = 1000
    test_utils.watch_all_fields(handle.handle, group.GetGpuIds(), updateFreq=updateFreqUsec)
    system.introspect.UpdateAll()

    execTime = system.introspect.execTime.GetForAllFields().aggregateInfo

    perGpuSane = 300*1000  # 300 ms
    activeGpuCount = test_utils.get_live_gpu_count(handle.handle)
    saneLimit = perGpuSane*activeGpuCount

    # test that all struct fields in the API response have reasonable values
    assert 100 < execTime.totalEverUpdateUsec < saneLimit, (
        'execution time seems way too long for a system with %s gpus. Took %s ms. Sane limit: %s ms'
        % (activeGpuCount, execTime.totalEverUpdateUsec/1000, saneLimit/1000))

    assert 100 < execTime.recentUpdateUsec < saneLimit, (
        'recent update time seems way too long for a system with %s gpus. Took %s ms. Sane limit: %s ms'
        % (activeGpuCount, execTime.recentUpdateUsec/1000, saneLimit/1000))

    assert updateFreqUsec-1 <= execTime.meanUpdateFreqUsec <= updateFreqUsec+1, execTime.meanUpdateFreqUsec
def _sumMetadata(handle, getForFieldFn, getForAllFieldsFn, metaAttr):
    '''
    Return a 3-tuple where the first entry is the sum of the metadata over every
    watched field, the second entry is the all-fields total before summing, and
    the third entry is the all-fields total after summing.
    '''
    system = pydcgm.DcgmSystem(pydcgm.DcgmHandle(handle))
    group = pydcgm.DcgmGroup(pydcgm.DcgmHandle(handle), groupName="test-metadata",
                             groupType=dcgm_structs.DCGM_GROUP_DEFAULT)

    # watch every field possible on all GPUs
    watchedFields = test_utils.watch_all_fields(handle, group.GetGpuIds())
    system.introspect.UpdateAll()

    # Get the total before and after to accommodate any slight changes
    # in total memory usage while the individual field amounts are being summed
    startVal = getattr(getForAllFieldsFn().aggregateInfo, metaAttr)

    aggregateVal = sum(getattr(getForFieldFn(handle, fieldId).aggregateInfo, metaAttr)
                       for fieldId in watchedFields)

    endVal = getattr(getForAllFieldsFn().aggregateInfo, metaAttr)

    return aggregateVal, startVal, endVal
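# Hedged usage sketch (not part of the original test file) showing how _sumMetadata
# above could be driven from a test. The accessor arguments are hypothetical
# placeholders: any per-field getter and all-fields getter whose results expose an
# aggregateInfo with the requested attribute would fit. Because total usage can drift
# while the per-field values are summed, the sum is only checked loosely against the
# totals captured before and after the summation; the exact tolerances here are an
# assumption of this sketch, not the original test's criteria.
def _example_assert_field_sum_is_sane(handle, getForFieldFn, getForAllFieldsFn, metaAttr='bytesUsed'):
    fieldSum, startTotal, endTotal = _sumMetadata(handle, getForFieldFn,
                                                  getForAllFieldsFn, metaAttr)

    logger.debug('per-field sum: %s, total before: %s, total after: %s'
                 % (fieldSum, startTotal, endTotal))

    # loose sanity checks only
    assert fieldSum > 0, 'expected the per-field sum of "%s" to be positive' % metaAttr
    assert fieldSum <= max(startTotal, endTotal), (
        'per-field sum (%s) should not exceed the larger all-fields total (%s)'
        % (fieldSum, max(startTotal, endTotal)))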
def test_dcgm_standalone_perf_bounded(handle):
    '''
    Test that runs some subtests. When we bound the number of samples to keep for each field:

      - DCGM memory usage eventually flatlines on a field, field group, all fields, and process level.
      - DCGM memory usage is at a value that we expect (golden value). If what we expect changes
        over time then we must update what these values are (the tests will fail if we don't).

    Plots of the memory usage and execution time generated during this test are saved and
    the filename of the figure is output on the terminal.

    Multiple tests are included in this test in order to save time by only gathering data once.
    '''
    if not option_parser.options.developer_mode:
        test_utils.skip_test("Skipping developer test.")

    handle = pydcgm.DcgmHandle(handle)
    group = pydcgm.DcgmGroup(handle, groupName="metadata-test",
                             groupType=dcgm_structs.DCGM_GROUP_DEFAULT)

    # 1 second. Needs to be long enough for all fields on all GPUs to update,
    # or the record density will vary based on CPU consumption
    updateFreq = 1000000

    watchedFieldIds = test_utils.watch_all_fields(handle.handle,
                                                  group.GetGpuIds(),
                                                  updateFreq,
                                                  maxKeepAge=0.0,  # use maxKeepEntries only to enforce the quota
                                                  maxKeepEntries=10)

    memUsageTS, execTimeTS, execTimeAvgTS, cpuUtilTS = _gather_perf_timeseries(handle, watchedFieldIds)
    activeGpuCount = test_utils.get_live_gpu_count(handle.handle)

    # run the actual tests on the gathered data

    # test that memory usage flatlines
    test_utils.run_subtest(_test_mem_bounded_flatlines_fields, memUsageTS)
    test_utils.run_subtest(_test_mem_bounded_flatlines_fieldgroups, memUsageTS)
    test_utils.run_subtest(_test_mem_bounded_flatlines_allfields, memUsageTS)
    test_utils.run_subtest(_test_mem_bounded_flatlines_process, memUsageTS)

    # test that memory usage is at an expected level (golden value)
    # the tail end of the series should be VERY close to the end since we compare the mean
    # of the tail to the golden value
    tailStart = int(0.8 * len(memUsageTS.timestamps))
    test_utils.run_subtest(_test_mem_bounded_golden_values_fields, activeGpuCount, memUsageTS, tailStart)
    test_utils.run_subtest(_test_mem_bounded_golden_values_allfields, activeGpuCount, memUsageTS,
                           tailStart, len(watchedFieldIds))
    test_utils.run_subtest(_test_mem_bounded_golden_values_process, memUsageTS,
                           tailStart, len(watchedFieldIds))

    # tests for CPU utilization (see functions for descriptions)
    test_utils.run_subtest(_test_cpuutil_bounded_flatlines_hostengine, cpuUtilTS)

    # test that execution time grows at a linear rate
    #test_utils.run_subtest(_test_exectime_bounded_linear_growth, execTimeTS)

    # make some pretty graphs to look at for insight or to help debug failures
    _generate_metadata_line_charts(memUsageTS, ylabel='bytes', title='Bytes Used')
    _generate_metadata_line_charts(execTimeTS, ylabel='usec', title='Execution Time')
    _generate_metadata_line_charts(execTimeAvgTS, ylabel='usec', title='Recent Exec Time')
    _generate_cpu_line_charts(cpuUtilTS)

    barPlotPoints = [(id, execTimeAvgTS.fieldVals[id][-1]) for id in execTimeAvgTS.fieldVals]
    _plotFinalValueOrderedBarChart(barPlotPoints,
                                   title='Top 20 Field Recent Exec Time',
                                   ylabel='usec',
                                   filenameBase='test-perf')