def test_dcgm_embedded_metadata_memory_aggregate_is_sum_of_gpu_and_global(
        handle):
    """
    Verify that the per-GPU and global memory-usage values reported by the
    introspection API sum exactly to the reported "aggregate" value.
    """
    dcgmHandle = pydcgm.DcgmHandle(handle)
    dcgmSystem = pydcgm.DcgmSystem(dcgmHandle)
    dcgmGroup = pydcgm.DcgmGroup(dcgmHandle,
                                 groupName="metadata-test",
                                 groupType=dcgm_structs.DCGM_GROUP_DEFAULT)

    # watch a large number of fields so that some are guaranteed to be stored
    test_utils.watch_all_fields(dcgmHandle.handle,
                                dcgmGroup.GetGpuIds(),
                                updateFreq=100000)
    dcgmSystem.introspect.UpdateAll()

    memoryInfo = dcgmSystem.introspect.memory.GetForAllFields()

    # accumulate the per-GPU byte counts
    gpuMemory = 0
    for gpuMemInfo in memoryInfo.gpuInfo[:memoryInfo.gpuInfoCount]:
        gpuMemory += gpuMemInfo.bytesUsed

    if memoryInfo.hasGlobalInfo:
        globalMemory = memoryInfo.globalInfo.bytesUsed
        logger.debug('global mem info: %s' % (memoryInfo.globalInfo))
    else:
        globalMemory = 0

    for i in range(memoryInfo.gpuInfoCount):
        logger.debug('gpu mem info %s: %s' % (i, memoryInfo.gpuInfo[i]))

    summed = gpuMemory + globalMemory
    assert (memoryInfo.aggregateInfo.bytesUsed == summed), (
        'aggregate for all fields reports %s bytes but a sum of GPU and global reports %s bytes. '
        % (memoryInfo.aggregateInfo.bytesUsed, summed)
        + ' These values should be equal.')
def test_dcgm_embedded_metadata_memory_get_field_group_sane(handle):
    '''
    Sanity-check the API that reports memory usage for a single field group.
    '''
    if not option_parser.options.developer_mode:
        test_utils.skip_test("Skipping developer test.")

    dcgmHandle = pydcgm.DcgmHandle(handle)
    dcgmSystem = pydcgm.DcgmSystem(dcgmHandle)
    dcgmGroup = pydcgm.DcgmGroup(dcgmHandle,
                                 groupName='test-metadata',
                                 groupType=dcgm_structs.DCGM_GROUP_DEFAULT)

    fieldIds = [
        dcgm_fields.DCGM_FI_DEV_GPU_TEMP,
        dcgm_fields.DCGM_FI_DEV_POWER_USAGE,
    ]
    fieldGroup = pydcgm.DcgmFieldGroup(dcgmHandle, "my_field_group", fieldIds)

    # make sure the field group is actually being watched
    _watch_field_group_basic(fieldGroup, dcgmHandle.handle, dcgmGroup.GetId())
    dcgmSystem.introspect.UpdateAll()

    memoryInfo = dcgmSystem.introspect.memory.GetForFieldGroup(fieldGroup)
    bytesUsed = memoryInfo.aggregateInfo.bytesUsed
    logger.debug("field group %s is using %.2f KB"
                 % (fieldGroup.name, bytesUsed / 1024.))

    # anything above 0 and below 20 MB is considered sane
    assert(0 < bytesUsed < 1024*1024*20), \
        'bytes used to store field was unreasonable for field group %s, bytes: %s' \
        % (fieldGroup.name, bytesUsed)
def test_dcgm_embedded_metadata_exectime_get_field_sane(handle):
    """
    Sanity-check the internal API that reports execution time for one field.
    """
    if not option_parser.options.developer_mode:
        test_utils.skip_test("Skipping developer test.")

    dcgmHandle = pydcgm.DcgmHandle(handle)
    dcgmSystem = pydcgm.DcgmSystem(dcgmHandle)
    dcgmGroup = pydcgm.DcgmGroup(dcgmHandle,
                                 groupName="metadata-test",
                                 groupType=dcgm_structs.DCGM_GROUP_DEFAULT)

    updateFreqUsec = 1000
    firstGpuId = dcgmGroup.GetGpuIds()[0]
    dcgm_agent_internal.dcgmWatchFieldValue(dcgmHandle.handle,
                                            firstGpuId,
                                            dcgm_fields.DCGM_FI_DEV_GPU_TEMP,
                                            updateFreqUsec, 100000, 10)
    dcgmSystem.UpdateAllFields(True)
    dcgmSystem.introspect.UpdateAll()

    execTime = dcgm_agent_internal.dcgmIntrospectGetFieldExecTime(
        dcgmHandle.handle, dcgm_fields.DCGM_FI_DEV_GPU_TEMP).aggregateInfo

    # every struct field in the API response should hold a reasonable value
    assert (100 < execTime.totalEverUpdateUsec < 100 * 1000), execTime.totalEverUpdateUsec
    assert (100 < execTime.recentUpdateUsec < 100 * 1000), execTime.recentUpdateUsec
    assert (updateFreqUsec == execTime.meanUpdateFreqUsec), execTime.meanUpdateFreqUsec
def test_dcgm_embedded_metadata_memory_get_field_sane(handle):
    '''
    Sanity test for API that gets memory usage of a single field.

    Watches a single field (GPU temperature) via a field group, then asserts
    the internal per-field memory-usage query returns a non-zero but bounded
    byte count.
    '''
    if not option_parser.options.developer_mode:
        test_utils.skip_test("Skipping developer test.")

    handleObj = pydcgm.DcgmHandle(handle=handle)

    fieldIds = [
        dcgm_fields.DCGM_FI_DEV_GPU_TEMP,
    ]
    fieldGroup = pydcgm.DcgmFieldGroup(handleObj, "my_field_group", fieldIds)

    # CONSISTENCY FIX: reuse the single DcgmHandle wrapper instead of
    # constructing a fresh pydcgm.DcgmHandle(handle) for the group and again
    # for the system. All three wrapped the same raw handle; the sibling
    # tests in this file share one wrapper.
    group = pydcgm.DcgmGroup(handleObj,
                             groupName="test-metadata",
                             groupType=dcgm_structs.DCGM_GROUP_DEFAULT)
    system = pydcgm.DcgmSystem(handleObj)

    # ensure that the field group is watched
    _watch_field_group_basic(fieldGroup, handle, group.GetId())
    system.introspect.UpdateAll()

    memoryInfo = dcgm_agent_internal.dcgmIntrospectGetFieldMemoryUsage(
        handle, fieldIds[0])
    logger.debug("field %s using %.2f KB"
                 % (fieldIds[0], memoryInfo.aggregateInfo.bytesUsed / 1024.))

    # 0+ to 200 KB
    assert(0 < memoryInfo.aggregateInfo.bytesUsed < 1024*200), \
        'bytes used to store field was unreasonable for ID %s, bytes: %s' \
        % (fieldIds[0], memoryInfo.aggregateInfo.bytesUsed)
def test_dcgm_embedded_metadata_memory_get_aggregate_fields_equals_total(
        handle):
    '''
    Check that summing per-field memory usage matches the all-fields total.
    '''
    dcgmSystem = pydcgm.DcgmSystem(pydcgm.DcgmHandle(handle))
    perFieldFn = dcgm_agent_internal.dcgmIntrospectGetFieldMemoryUsage
    allFieldsFn = dcgmSystem.introspect.memory.GetForAllFields
    _metadata_get_aggregate_fields_equals_total(handle, perFieldFn,
                                                allFieldsFn, 'bytesUsed')
def test_dcgm_embedded_metadata_exectime_get_field_group_sane(handle):
    """
    Sanity test for API that gets execution time of a single field group.
    """
    # DOC FIX: the docstring previously said "all fields together" (copied
    # from the all-fields test), but this test exercises GetForFieldGroup.
    if not option_parser.options.developer_mode:
        test_utils.skip_test("Skipping developer test.")

    handle = pydcgm.DcgmHandle(handle)
    system = pydcgm.DcgmSystem(handle)
    group = pydcgm.DcgmGroup(handle,
                             groupName="metadata-test",
                             groupType=dcgm_structs.DCGM_GROUP_DEFAULT)

    fieldIds = [
        dcgm_fields.DCGM_FI_DEV_POWER_USAGE,
        dcgm_fields.DCGM_FI_DEV_SM_CLOCK,
        dcgm_fields.DCGM_FI_DEV_GPU_TEMP,
    ]
    fieldGroup = pydcgm.DcgmFieldGroup(handle, "my_field_group", fieldIds)

    updateFreqUsec = 1000
    # ensure the field group is being watched at the chosen frequency
    _watch_field_group_basic(fieldGroup,
                             handle.handle,
                             group.GetId(),
                             updateFreq=updateFreqUsec)
    system.introspect.UpdateAll()

    execTime = system.introspect.execTime.GetForFieldGroup(
        fieldGroup).aggregateInfo

    # test that all struct fields in the API response have reasonable values
    assert (100 < execTime.totalEverUpdateUsec < 100 * 1000), execTime.totalEverUpdateUsec
    assert (100 < execTime.recentUpdateUsec < 100 * 1000), execTime.recentUpdateUsec
    assert (updateFreqUsec == execTime.meanUpdateFreqUsec), execTime.meanUpdateFreqUsec
def _sumMetadata(handle, getForFieldFn, getForAllFieldsFn, metaAttr):
    '''
    Return a 3-tuple where the first entry is the aggregate of summing the
    metadata for every field, the second entry is the total before
    aggregating and the third entry is the total after aggregating.

    handle            -- raw DCGM handle
    getForFieldFn     -- callable (handle, fieldId) -> per-field metadata
    getForAllFieldsFn -- callable () -> all-fields metadata
    metaAttr          -- attribute name on aggregateInfo to sum (e.g. 'bytesUsed')
    '''
    system = pydcgm.DcgmSystem(pydcgm.DcgmHandle(handle))
    group = pydcgm.DcgmGroup(pydcgm.DcgmHandle(handle),
                             groupName="test-metadata",
                             groupType=dcgm_structs.DCGM_GROUP_DEFAULT)

    # watch every field possible on all GPUs
    watchedFields = test_utils.watch_all_fields(handle, group.GetGpuIds())
    system.introspect.UpdateAll()

    # Get the total before and after to accommodate for any slight changes
    # in total memory usage while the individual field amounts are being summed.
    # IDIOM FIX: use getattr() for dynamic attribute access rather than
    # calling __getattribute__ directly.
    startVal = getattr(getForAllFieldsFn().aggregateInfo, metaAttr)

    aggregateVal = sum(
        getattr(getForFieldFn(handle, fieldId).aggregateInfo, metaAttr)
        for fieldId in watchedFields)

    endVal = getattr(getForAllFieldsFn().aggregateInfo, metaAttr)

    return aggregateVal, startVal, endVal
def test_dcgm_embedded_metadata_memory_get_all_fields_sane(handle):
    """
    Sanity-check the API that reports combined memory usage of all fields.
    """
    if not option_parser.options.developer_mode:
        test_utils.skip_test("Skipping developer test.")

    dcgmHandle = pydcgm.DcgmHandle(handle)
    dcgmSystem = pydcgm.DcgmSystem(dcgmHandle)
    dcgmGroup = pydcgm.DcgmGroup(dcgmHandle,
                                 groupName="metadata-test",
                                 groupType=dcgm_structs.DCGM_GROUP_DEFAULT)

    # watch a large number of fields so that some are guaranteed to be stored
    test_utils.watch_all_fields(dcgmHandle.handle,
                                dcgmGroup.GetGpuIds(),
                                updateFreq=1000)
    dcgmSystem.introspect.UpdateAll()

    aggregate = dcgmSystem.introspect.memory.GetForAllFields().aggregateInfo
    logger.debug('All fields in hostengine are using %.2f MB'
                 % (aggregate.bytesUsed / 1024. / 1024.))

    lowerBound = 1024 * 20          # 20 KB
    upperBound = 100 * 1024 * 1024  # 100 MB
    assert (lowerBound < aggregate.bytesUsed < upperBound), aggregate.bytesUsed
def test_dcgm_embedded_metadata_exectime_get_all_fields_sane(handle):
    """
    Sanity-check the API that reports combined execution time of all fields.
    """
    if not option_parser.options.developer_mode:
        test_utils.skip_test("Skipping developer test.")

    dcgmHandle = pydcgm.DcgmHandle(handle)
    dcgmSystem = pydcgm.DcgmSystem(dcgmHandle)
    dcgmGroup = pydcgm.DcgmGroup(dcgmHandle,
                                 groupName="metadata-test",
                                 groupType=dcgm_structs.DCGM_GROUP_DEFAULT)

    # watch a large number of fields so that some are guaranteed to be stored
    updateFreqUsec = 1000
    test_utils.watch_all_fields(dcgmHandle.handle,
                                dcgmGroup.GetGpuIds(),
                                updateFreq=updateFreqUsec)
    dcgmSystem.introspect.UpdateAll()

    execTime = dcgmSystem.introspect.execTime.GetForAllFields().aggregateInfo

    saneUsecPerGpu = 300*1000 # 300 ms
    activeGpuCount = test_utils.get_live_gpu_count(dcgmHandle.handle)
    saneLimit = saneUsecPerGpu*activeGpuCount

    # every struct field in the API response should hold a reasonable value
    assert(100 < execTime.totalEverUpdateUsec < saneLimit), (
        'execution time seems way too long for a system with %s gpus. Took %s ms. Sane limit: %s ms'
        % (activeGpuCount, execTime.totalEverUpdateUsec/1000, saneLimit/1000))

    assert(100 < execTime.recentUpdateUsec < saneLimit), (
        'recent update time seems way too long for a system with %s gpus. Took %s ms. Sane limit: %s ms'
        % (activeGpuCount, execTime.recentUpdateUsec/1000, saneLimit/1000))

    assert(updateFreqUsec-1 <= execTime.meanUpdateFreqUsec <= updateFreqUsec+1), \
        execTime.meanUpdateFreqUsec
def test_dcgm_embedded_metadata_exectime_aggregate_is_sum_of_gpu_and_global(
        handle):
    """
    Verify that per-GPU and global execution times reported by introspection
    sum exactly to the reported "aggregate" execution time.
    """
    dcgmHandle = pydcgm.DcgmHandle(handle)
    dcgmSystem = pydcgm.DcgmSystem(dcgmHandle)
    dcgmGroup = pydcgm.DcgmGroup(dcgmHandle,
                                 groupName="metadata-test",
                                 groupType=dcgm_structs.DCGM_GROUP_DEFAULT)

    # watch a large number of fields so that some are guaranteed to be stored
    test_utils.watch_all_fields(dcgmHandle.handle,
                                dcgmGroup.GetGpuIds(),
                                updateFreq=100000)
    dcgmSystem.introspect.UpdateAll()

    execTimeInfo = dcgmSystem.introspect.execTime.GetForAllFields()

    # accumulate the per-GPU execution times
    gpuExecTime = 0
    for gpuInfo in execTimeInfo.gpuInfo[:execTimeInfo.gpuInfoCount]:
        gpuExecTime += gpuInfo.totalEverUpdateUsec

    globalExecTime = (execTimeInfo.globalInfo.totalEverUpdateUsec
                      if execTimeInfo.hasGlobalInfo else 0)

    summed = globalExecTime + gpuExecTime
    assert (execTimeInfo.aggregateInfo.totalEverUpdateUsec == summed), (
        'aggregate for all fields reports %s usec but GPUs report %s usec and global reports %s usec. '
        % (execTimeInfo.aggregateInfo.totalEverUpdateUsec, gpuExecTime,
           globalExecTime)
        + ' GPUs + global should sum to aggregate.')
def test_dcgm_embedded_metadata_exectime_get_aggregate_fields_equals_total(
        handle):
    '''
    Test that the aggregate of execution time across all fields is within
    5% of the total value.
    '''
    dcgmSystem = pydcgm.DcgmSystem(pydcgm.DcgmHandle(handle))
    perFieldFn = dcgm_agent_internal.dcgmIntrospectGetFieldExecTime
    allFieldsFn = dcgmSystem.introspect.execTime.GetForAllFields
    _metadata_get_aggregate_fields_equals_total(handle, perFieldFn,
                                                allFieldsFn,
                                                'totalEverUpdateUsec')
def test_dcgm_embedded_metadata_cpuutil_get_hostengine_sane(handle):
    """
    Sanity test for API that gets CPU Utilization of the hostengine process.

    Generates ~1s of CPU load, measures this process's CPU time via
    /proc/self/stat before and after, and compares that ratio against the
    utilization reported by the DCGM introspection API.

    NOTE(review): another test function with this exact name appears later in
    this file; at import time the later definition shadows this one — confirm
    which is intended.
    """
    from multiprocessing import cpu_count
    handle = pydcgm.DcgmHandle(handle)
    system = pydcgm.DcgmSystem(handle)

    def generate_cpu_load(duration_sec):
        """
        Generate a CPU load for a given duration.
        """
        from multiprocessing import Pool
        from itertools import repeat
        start_time = time.time()
        # one worker per CPU so the whole machine is loaded
        processes = cpu_count()
        with Pool(processes) as pool:
            pool.starmap(
                _cpu_load_star,
                zip(repeat((start_time, duration_sec)), range(processes)))

    def get_current_process_cpu_util():
        """Return a tuple representing CPU user-time and system-time and total
        for the current process
        """
        import os
        with open('/proc/self/stat', "rb", buffering=0) as f:
            data = f.readline()
        # split after the ')' that terminates the comm field, so indices are
        # stable even if the process name contains spaces; values[11] and
        # values[12] are then the utime/stime fields (14/15 per proc(5) —
        # TODO confirm against the proc man page for the target kernel)
        values = data[data.rfind(b')') + 2:].split()
        # convert clock ticks to seconds
        utime = float(values[11]) / os.sysconf("SC_CLK_TCK")
        stime = float(values[12]) / os.sysconf("SC_CLK_TCK")
        return utime, stime, utime + stime

    start = time.time()
    start_cpu_util = get_current_process_cpu_util()
    generate_cpu_load(1)
    stop = time.time()
    stop_cpu_util = get_current_process_cpu_util()

    cpuUtil = system.introspect.cpuUtil.GetForHostengine()

    #diff_utime = stop_cpu_util[0] - start_cpu_util[0]
    #diff_stime = stop_cpu_util[1] - start_cpu_util[1]
    diff_total = stop_cpu_util[2] - start_cpu_util[2]
    diff_time = stop - start

    # fraction of wall-clock time this process spent on-CPU during the load
    overall_cpu_util = diff_total / diff_time

    # DCGM reports per-core utilization; scale by core count to compare
    logger.debug("DCGM CPU Util: %f" % (cpuUtil.total * cpu_count()))
    logger.debug('Stats CPU Util: %f' % overall_cpu_util)

    assert abs(overall_cpu_util - (cpu_count() * cpuUtil.total)
               ) < 0.05, "CPU Utilization was not within 5% of expected value"

    # test that user and kernel add to total (with rough float accuracy)
    assert abs(cpuUtil.total - (cpuUtil.user + cpuUtil.kernel)) <= 4*float_info.epsilon, \
        'CPU kernel and user utilization did not add up to total. Kernel: %f, User: %f, Total: %f' \
        % (cpuUtil.kernel, cpuUtil.user, cpuUtil.total)
def _assert_metadata_not_configured_failure(handle):
    """
    Verify that metadata gathering is disabled by default: accessing a
    metadata API without enabling introspection must raise
    DCGM_ST_NOT_CONFIGURED.
    """
    dcgmSystem = pydcgm.DcgmSystem(pydcgm.DcgmHandle(handle))
    notConfigured = dcgm_structs.dcgmExceptionClass(
        dcgm_structs.DCGM_ST_NOT_CONFIGURED)
    with test_utils.assert_raises(notConfigured):
        memoryInfo = dcgmSystem.introspect.memory.GetForAllFields()
def test_dcgm_standalone_metadata_memory_get_hostengine_sane(handle):
    """
    Sanity-check the API that reports the hostengine process's own memory
    usage.
    """
    if not option_parser.options.developer_mode:
        test_utils.skip_test("Skipping developer test.")

    dcgmHandle = pydcgm.DcgmHandle(handle)
    dcgmSystem = pydcgm.DcgmSystem(dcgmHandle)

    dcgmSystem.introspect.UpdateAll()

    bytesUsed = dcgmSystem.introspect.memory.GetForHostengine().bytesUsed
    logger.debug('the hostengine process is using %.2f MB'
                 % (bytesUsed / 1024. / 1024.))

    oneMB = 1*1024*1024
    hundredMB = 100*1024*1024
    # 1 MB to 100 MB is considered sane
    assert(oneMB < bytesUsed < hundredMB), bytesUsed
def test_dcgmi_introspect_enable_disable(handle):
    """
    Verify that the dcgmi commands for enabling and disabling introspection
    actually toggle metadata gathering.
    """
    _run_dcgmi_command(["introspect", "--enable"])

    dcgmSystem = pydcgm.DcgmSystem(pydcgm.DcgmHandle(handle))

    # raises an exception if introspection is not enabled
    mem = dcgmSystem.introspect.memory.GetForHostengine()

    _run_dcgmi_command(["introspect", "--disable"])

    # once disabled, the same call must fail with NOT_CONFIGURED
    with test_utils.assert_raises(dcgmExceptionClass(DCGM_ST_NOT_CONFIGURED)):
        mem = dcgmSystem.introspect.memory.GetForHostengine()
def test_dcgm_embedded_metadata_mean_update_frequency(handle):
    """
    Ensure that the mean update frequency reported for a field group is the
    mean of the individual fields' watch frequencies.
    """
    dcgmHandle = pydcgm.DcgmHandle(handle)
    dcgmSystem = pydcgm.DcgmSystem(dcgmHandle)
    dcgmGroup = pydcgm.DcgmGroup(dcgmHandle,
                                 groupName="metadata-test",
                                 groupType=dcgm_structs.DCGM_GROUP_DEFAULT)

    # these frequencies must have a perfect integer mean or the last assertion will fail
    updateFreqs = {
        dcgm_fields.DCGM_FI_DEV_POWER_USAGE: 10000,
        dcgm_fields.DCGM_FI_DEV_GPU_TEMP: 20000,
    }
    meanUpdateFreq = stats.mean(updateFreqs.values())

    gpuId = dcgmGroup.GetGpuIds()[0]
    fieldIds = list(updateFreqs)
    for fieldId in fieldIds:
        dcgm_agent_internal.dcgmWatchFieldValue(dcgmHandle.handle, gpuId,
                                                fieldId, updateFreqs[fieldId],
                                                100000, 10)

    dcgmSystem.UpdateAllFields(True)
    dcgmSystem.introspect.UpdateAll()

    fieldGroup = pydcgm.DcgmFieldGroup(dcgmHandle, "my_field_group", fieldIds)
    execTime = dcgmSystem.introspect.execTime.GetForFieldGroup(fieldGroup)

    # locate the result slot belonging to the GPU we watched
    resultGpuIndex = next(
        (i for i in range(execTime.gpuInfoCount)
         if execTime.gpuIdsForGpuInfo[i] == gpuId),
        -1)
    assert(resultGpuIndex >= 0), "no results returned for the watched GPU"

    actualMeanUpdateFreq = execTime.gpuInfo[resultGpuIndex].meanUpdateFreqUsec
    assert(actualMeanUpdateFreq == meanUpdateFreq), "expected %s, got %s" \
        % (meanUpdateFreq, actualMeanUpdateFreq)
def test_dcgm_embedded_metadata_cpuutil_get_hostengine_sane(handle):
    """
    Sanity test for API that gets CPU Utilization of the hostengine process.

    NOTE(review): a test with this exact name is defined earlier in this
    file; this later definition shadows the earlier one at import time —
    confirm which is intended.
    """
    handle = pydcgm.DcgmHandle(handle)
    system = pydcgm.DcgmSystem(handle)

    # wait up to 1 second for CPU utilization to be averaged properly (not be 0)
    # BUG FIX: `xrange` does not exist in Python 3 (this file already relies
    # on Python-3-only behavior, e.g. multiprocessing.Pool as a context
    # manager), so it would raise NameError here; use range() instead.
    for attempt in range(100):
        cpuUtil = system.introspect.cpuUtil.GetForHostengine()
        assert(0 <= cpuUtil.total <= 1), cpuUtil.total
        if (0 < cpuUtil.total < 1):
            break
        time.sleep(0.010)

    # 0+% to 50% CPU utilization
    assert(0.00001 < cpuUtil.total < 0.50), cpuUtil.total

    # test that user and kernel add to total (with rough float accuracy)
    assert(abs(cpuUtil.total - (cpuUtil.user + cpuUtil.kernel)) <= 4*float_info.epsilon), \
        'CPU kernel and user utilization did not add up to total. Kernel: %f, User: %f, Total: %f' \
        % (cpuUtil.kernel, cpuUtil.user, cpuUtil.total)
def GetSystem(self):
    '''
    Get a DcgmSystem instance for this handle

    Returns a new pydcgm.DcgmSystem constructed around this handle object.
    '''
    return pydcgm.DcgmSystem(self)
def _gather_perf_timeseries(handle, watchedFieldIds):
    '''
    Gathers metadata over time and returns a tuple of 4 MetadataTimeseries
    (mem usage, exec time, avg exec time, cpu utilization)

    handle          -- DcgmHandle-like object (its .handle raw handle is used
                       for the dcgm_agent_internal per-field queries)
    watchedFieldIds -- iterable of field IDs already being watched

    Polls every ~50 ms for BOUNDED_TEST_DURATION seconds, recording process,
    all-fields, per-field, and per-field-group values at each tick.
    '''
    system = pydcgm.DcgmSystem(handle)

    memUsageTS = MetadataTimeseries()
    execTimeTS = MetadataTimeseries()
    execTimeAvgTS = MetadataTimeseries()
    cpuUtilTS = CpuTimeseries()

    # cap the per-tick per-field query load at 50 fields
    numFields = min(len(watchedFieldIds), 50)
    fieldGroups = []

    # five field groups over the same (up to 50) fields
    for i in range(1, 6):
        fieldGroups.append(
            pydcgm.DcgmFieldGroup(handle, "my_field_group_%d" % i,
                                  list(watchedFieldIds)[0:numFields]))

    startTime = datetime.datetime.now()

    while (datetime.datetime.now() -
           startTime).total_seconds() < BOUNDED_TEST_DURATION:
        # poll memory usage
        memUsageTS.timestamps.append(
            (datetime.datetime.now() - startTime).total_seconds())
        memUsageTS.processVals.append(
            system.introspect.memory.GetForHostengine().bytesUsed)
        memUsageTS.allFieldsVals.append(
            system.introspect.memory.GetForAllFields().aggregateInfo.bytesUsed)

        # NOTE: `id` shadows the builtin here (kept as-is; comments only)
        for id in watchedFieldIds:
            memUsageTS.fieldVals[id].append(
                dcgm_agent_internal.dcgmIntrospectGetFieldMemoryUsage(
                    handle.handle, id).aggregateInfo.bytesUsed)

        for fieldGroup in fieldGroups:
            memUsageTS.fieldGroupVals[int(
                fieldGroup.fieldGroupId.value)].append(
                    system.introspect.memory.GetForFieldGroup(
                        fieldGroup).aggregateInfo.bytesUsed)

        # poll execution time
        execTimeTS.timestamps.append(
            (datetime.datetime.now() - startTime).total_seconds())
        execTimeTS.allFieldsVals.append(
            system.introspect.execTime.GetForAllFields(
            ).aggregateInfo.totalEverUpdateUsec)

        for id in watchedFieldIds:
            execTimeTS.fieldVals[id].append(
                dcgm_agent_internal.dcgmIntrospectGetFieldExecTime(
                    handle.handle, id).aggregateInfo.totalEverUpdateUsec)
            #logger.info("fieldId %d: %s" % (id, str(execTimeTS.fieldVals[id][-1])))

        for fieldGroup in fieldGroups:
            execTimeTS.fieldGroupVals[int(
                fieldGroup.fieldGroupId.value)].append(
                    system.introspect.execTime.GetForFieldGroup(
                        fieldGroup).aggregateInfo.totalEverUpdateUsec)

        # poll average execution time
        execTimeAvgTS.timestamps.append(
            (datetime.datetime.now() - startTime).total_seconds())
        execTimeAvgTS.allFieldsVals.append(
            system.introspect.execTime.GetForAllFields(
            ).aggregateInfo.recentUpdateUsec)

        for id in watchedFieldIds:
            execTimeAvgTS.fieldVals[id].append(
                dcgm_agent_internal.dcgmIntrospectGetFieldExecTime(
                    handle.handle, id).aggregateInfo.recentUpdateUsec)

        for fieldGroup in fieldGroups:
            execTimeAvgTS.fieldGroupVals[int(
                fieldGroup.fieldGroupId.value)].append(
                    system.introspect.execTime.GetForFieldGroup(
                        fieldGroup).aggregateInfo.recentUpdateUsec)

        # poll cpu utilization
        cpuUtilTS.timestamps.append(
            (datetime.datetime.now() - startTime).total_seconds())
        cpuUtilTS.cpuInfo.append(system.introspect.cpuUtil.GetForHostengine())

        time.sleep(0.050)

    return memUsageTS, execTimeTS, execTimeAvgTS, cpuUtilTS