def _assert_valid_dcgmi_results(args, retValue, stdout_lines, stderr_lines):
    assert (len(stdout_lines) > 0), 'No output detected for args "%s"' % ' '.join(args[1:])

    if _is_eris_diag_inforom_failure(args, stdout_lines):
        # If we see inforom corruption, the test should not fail
        test_utils.skip_test('Detected corrupt inforom for diag test')
        return

    if retValue != c_ubyte(dcgm_structs.DCGM_ST_OK).value:
        if retValue == c_ubyte(dcgm_structs.DCGM_ST_NVVS_ERROR).value:
            # DCGM_ST_NVVS_ERROR means NVVS ran but returned a bad result. In other words,
            # the arguments were valid and the failure came from NVVS itself, not dcgmi.
            return

        logger.error('Valid test - Function returned error code: %s . Args used: "%s"' % (retValue, ' '.join(args[1:])))
        logger.error('Stdout:')
        for line in stdout_lines:
            logger.error('\t' + line)
        logger.error('Stderr:')
        for line in stderr_lines:
            logger.error('\t' + line)
        assert False, "See errors above."

    errLines = _lines_with_errors(stdout_lines)
    assert len(errLines) == 0, "Found errors in output. Offending lines: \n%s" % '\n'.join(errLines)

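# The assertion above relies on _lines_with_errors(), which is defined elsewhere in this
# test module. The sketch below only illustrates the assumed behavior (flag any stdout line
# mentioning "error", case-insensitively, unless it matches a known-benign phrase); the name,
# the benign phrase list, and the exact filtering of the real helper are assumptions.
def _lines_with_errors_sketch(stdout_lines):
    benignPhrases = ['error code: 0']  # hypothetical allow-list for illustration only
    errLines = []
    for line in stdout_lines:
        lowered = line.lower()
        if 'error' not in lowered:
            continue
        if any(phrase in lowered for phrase in benignPhrases):
            continue
        errLines.append(line)
    return errLines
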
def test_dcgm_stub_library(handle):
    """
    Verifies that DCGM fails gracefully using the stub library if a proper DCGM installation
    is not present or shared libraries are not included in the library search path
    """
    if utils.is_esx_hypervisor_system():
        test_utils.skip_test("Compute Mode tests are not supported in VMware ESX Environments")

    if is_dcgm_package_installed():
        test_utils.skip_test("A DCGM package is already installed on this machine")

    # Checks if libdcgm.so.2 is set within LD_LIBRARY_PATH
    libdcgm_path = get_libdcgm_path()
    assert libdcgm_path is not None

    if libdcgm_path is not None:
        # Verify that the stub library is present
        if not os.path.isfile(libdcgm_path + "/libdcgm_stub.a"):
            test_utils.skip_test("Unable to find \"libdcgm_stub.a\" in %s" % libdcgm_path)
        else:
            dcgm_lib_original = libdcgm_path + "/libdcgm.so.2"
            dcgm_lib_modified = dcgm_lib_original + "_modified"
    else:
        # Tear down the environment by finding and renaming "libdcgm.so.2" to "libdcgm.so.2_modified".
        # Gets the path to libdcgm.so.2, e.g. /usr/lib/libdcgm.so.2
        try:
            ldconfig_out = check_output(["ldconfig", "-p"]).decode("utf-8")
            dcgm_lib = [line for line in ldconfig_out.split("\n") if "libdcgm.so.2" in line]
            dcgm_lib_original = dcgm_lib[0].split("=>")[-1].strip()
            dcgm_lib_modified = dcgm_lib_original + "_modified"
        except Exception:
            test_utils.skip_test("Unable to find libdcgm.so.2 library")

    # Renaming the file
    try:
        os.rename(dcgm_lib_original, dcgm_lib_modified)
    except OSError:
        test_utils.skip_test("Unable to rename libdcgm.so.2 library")

    try:
        stub_app = apps.DcgmStubRunnerApp()
        stub_app.start()
        pid = stub_app.getpid()
        stub_app.wait()
    finally:
        # Restore environment
        os.rename(dcgm_lib_modified, dcgm_lib_original)

    logger.info("stub_library_test PID was %d" % pid)
    assert "!!!!!!!!" in stub_app.stdout_lines[1], "Failed to collect stub library output"
    assert "WARNING:" in stub_app.stdout_lines[2], "Failed to collect stub library output"

def test_linting_create_python_path_env_var():
    '''
    A unit test for linting.py's _create_python_path_env_var function.
    This tests that, given a number of python filepaths, it returns a python path
    that only includes the topmost directories containing python files.
    '''
    if not option_parser.options.lint:
        test_utils.skip_test("not supported when the \"--no-lint\" option is used")

    filepaths = [
        '/a/file1.py',
        '/a/file2.py',
        '/a2/b/file3.py',
        '/a2/b/c/file4.py',
        '/a2/b/c/d/file5.py'
    ]

    expectedPyPath = ':'.join([
        '/a',
        '/a2/b',
    ])

    actualPyPath = linting._create_python_path_env_var(filepaths)

    assert(actualPyPath == expectedPyPath), (
        'actual python path:\n%s\n' % actualPyPath +
        'not the same as expected:\n%s' % expectedPyPath)

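# For context, the behavior the test above expects from linting._create_python_path_env_var
# could look roughly like the sketch below: keep only the topmost directories that contain
# python files and join them with ':'. This is an assumption for illustration; the real
# implementation in linting.py may differ.
def _create_python_path_env_var_sketch(filepaths):
    # unique directories that directly contain a python file, in first-seen order
    dirs = []
    for fp in filepaths:
        d = os.path.dirname(fp)
        if d not in dirs:
            dirs.append(d)
    # keep a directory only if no other collected directory is an ancestor of it
    topmost = [d for d in dirs
               if not any(d != other and d.startswith(other + os.sep) for other in dirs)]
    return ':'.join(topmost)
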
def test_dcgm_embedded_metadata_exectime_get_all_fields_sane(handle):
    """
    Sanity test for API that gets execution time of all fields together
    """
    if not option_parser.options.developer_mode:
        test_utils.skip_test("Skipping developer test.")

    handle = pydcgm.DcgmHandle(handle)
    system = pydcgm.DcgmSystem(handle)
    group = pydcgm.DcgmGroup(handle, groupName="metadata-test", groupType=dcgm_structs.DCGM_GROUP_DEFAULT)

    # watch a ton of fields so that we know that some are being stored
    updateFreqUsec = 1000
    test_utils.watch_all_fields(handle.handle, group.GetGpuIds(), updateFreq=updateFreqUsec)
    system.introspect.UpdateAll()

    execTime = system.introspect.execTime.GetForAllFields().aggregateInfo

    perGpuSane = 300*1000  # 300 ms
    activeGpuCount = test_utils.get_live_gpu_count(handle.handle)
    saneLimit = perGpuSane*activeGpuCount

    # test that all struct fields in the API response have reasonable values
    assert(100 < execTime.totalEverUpdateUsec < saneLimit), (
        'execution time seems way too long for a system with %s gpus. Took %s ms. Sane limit: %s ms'
        % (activeGpuCount, execTime.totalEverUpdateUsec/1000, saneLimit/1000))

    assert(100 < execTime.recentUpdateUsec < saneLimit), (
        'recent update time seems way too long for a system with %s gpus. Took %s ms. Sane limit: %s ms'
        % (activeGpuCount, execTime.recentUpdateUsec/1000, saneLimit/1000))

    assert(updateFreqUsec-1 <= execTime.meanUpdateFreqUsec <= updateFreqUsec+1), execTime.meanUpdateFreqUsec

def test_dcgm_diag_handle_concurrency_standalone(handle, gpuIds):
    '''
    Test that we can use a DCGM handle concurrently with a diagnostic running
    '''
    diagDuration = 10

    gpuId = gpuIds[0]

    dd = DcgmDiag.DcgmDiag(gpuIds=[gpuId], testNamesStr="SM Stress",
                           paramsStr="sm stress.test_duration=%d" % diagDuration,
                           version=dcgm_structs.dcgmRunDiag_version)
    dd.UseFakeGpus()

    response = [None]

    def run(dd, response):
        # Store the result in the shared list so the caller can inspect it after the thread joins
        response[0] = test_utils.diag_execute_wrapper(dd, handle)

    diagStartTime = time.time()
    threadObj = threading.Thread(target=run, args=[dd, response])
    threadObj.start()

    # Give threadObj a head start on its 10 second run
    time.sleep(1.0)

    firstReturnedRequestLatency = None
    numConcurrentCompleted = 0
    sleepDuration = 1.0

    while threadObj.is_alive():
        # Make another request on the handle concurrently
        moduleStatuses = dcgm_agent.dcgmModuleGetStatuses(handle)
        secondRequestLatency = time.time() - diagStartTime
        numConcurrentCompleted += 1

        if firstReturnedRequestLatency is None:
            firstReturnedRequestLatency = secondRequestLatency

        time.sleep(sleepDuration)

    diagThreadEndTime = time.time()
    diagDuration = diagThreadEndTime - diagStartTime

    if firstReturnedRequestLatency is None:
        test_utils.skip_test("Diag returned instantly. It is probably not supported for gpuId %u" % gpuId)

    logger.info("Completed %d concurrent requests. Diag ran for %.1f seconds" % (numConcurrentCompleted, diagDuration))

    # We should have been able to complete a request every 2 seconds if we slept for 1 (conservatively)
    numShouldHaveCompleted = int((diagDuration / sleepDuration) / 2.0)
    assert numConcurrentCompleted >= numShouldHaveCompleted, \
        "Expected at least %d concurrent tests completed. Got %d" % (numShouldHaveCompleted, numConcurrentCompleted)

def test_logging_modules():
    """
    Verifies that module logging is functional
    """
    PASSED = "passed"
    FAILED = "FAILED"
    SKIPPED = "SKIPPED"
    result = FAILED

    nvhost_engine = apps.NvHostEngineApp()
    nvhost_engine.start(timeout=10)

    contents = None
    # Try for 5 seconds
    for i in range(25):
        time.sleep(0.2)
        with closing(open(nvhost_engine.dcgm_trace_fname)) as f:  # pylint: disable=no-member
            contents = f.read()
        logger.debug("Read %d bytes from %s" % (len(contents), nvhost_engine.dcgm_trace_fname))

        # NVSwitch module is loaded on startup, so we check for records from that module
        test_string = "Initialized logging for module 1"

        # Note that if --eris is passed, we only log at WARNING level.
        # If logging is not at DEBUG level, then skip the test.
        if test_utils.loggingLevel != 'DEBUG':
            # Skipping in a roundabout way to ensure we terminate the processes we launch
            result = SKIPPED
            break

        if test_string in contents:
            result = PASSED
            break

    # Cleaning up
    nvhost_engine.terminate()
    nvhost_engine.validate()

    if result == SKIPPED:
        test_utils.skip_test("Detected logLevel = WARN. This test requires DEBUG. Likely cause: --eris option")

    errorString = ""
    if result != PASSED:
        if contents is not None:
            errorString = "Unable to find '%s' in log file" % test_string
        else:
            errorString = "log file %s was never read" % nvhost_engine.dcgm_trace_fname

    assert result == PASSED, errorString

def helper_test_blacklist_checks(handle, gpuIds):
    handleObj = DcgmHandle.DcgmHandle(handle=handle)
    settings = {}
    settings['instant'] = True
    settings['entity_get_flags'] = 0
    settings['testNames'] = '3'
    settings['hostname'] = 'localhost'
    settings['watches'] = dcgm_structs.DCGM_HEALTH_WATCH_MEM | dcgm_structs.DCGM_HEALTH_WATCH_PCIE
    error_list = []

    ret = dcgm_internal_helpers.inject_field_value_i64(handle, gpuIds[0],
                                                       dcgm_fields.DCGM_FI_DEV_ECC_DBE_VOL_TOTAL, 0, -50)

    blacklist_recommendations.check_health(handleObj, settings, error_list)

    # Make sure the GPUs pass a basic health test before running this test
    for gpuObj in blacklist_recommendations.g_gpus:
        if gpuObj.IsHealthy() == False:
            test_utils.skip_test("Skipping because GPU %d is not healthy. " % gpuObj.GetEntityId())

    # Inject a memory error and verify that we fail
    blacklist_recommendations.g_gpus = []  # Reset g_gpus
    ret = dcgm_internal_helpers.inject_field_value_i64(handle, gpuIds[0],
                                                       dcgm_fields.DCGM_FI_DEV_ECC_DBE_VOL_TOTAL, 1000, 10)
    assert (ret == dcgm_structs.DCGM_ST_OK)

    blacklist_recommendations.check_health(handleObj, settings, error_list)
    for gpuObj in blacklist_recommendations.g_gpus:
        if gpuObj.GetEntityId() == gpuIds[0]:
            assert gpuObj.IsHealthy() == False, "Injected error didn't trigger a failure on GPU %d" % gpuIds[0]
        else:
            assert gpuObj.IsHealthy(), "GPU %d reported unhealthy despite not having an inserted error: '%s'" % \
                (gpuObj.GetEntityId(), gpuObj.WhyUnhealthy())

    # Remove the memory watch and make sure we pass our checks
    blacklist_recommendations.g_gpus = []  # Reset g_gpus
    settings['watches'] = dcgm_structs.DCGM_HEALTH_WATCH_PCIE
    blacklist_recommendations.check_health(handleObj, settings, error_list)
    for gpuObj in blacklist_recommendations.g_gpus:
        if gpuObj.GetEntityId() == gpuIds[0]:
            assert gpuObj.IsHealthy(), "Injected error wasn't ignored for GPU %d: %s" % \
                (gpuIds[0], gpuObj.WhyUnhealthy())
        else:
            assert gpuObj.IsHealthy(), "GPU %d reported unhealthy despite not having an inserted error: '%s'" % \
                (gpuObj.GetEntityId(), gpuObj.WhyUnhealthy())

def helper_throttling_masking_failures(handle, gpuId):
    #####
    # First check whether the GPU is healthy
    dd = DcgmDiag.DcgmDiag(gpuIds=[gpuId], testNamesStr="SM Stress",
                           paramsStr="sm stress.test_duration=2",
                           version=dcgm_structs.dcgmRunDiag_version)
    # We explicitly want to fail for throttle reasons since this test inserts throttling
    # errors for verification
    dd.SetThrottleMask(0)
    dd.UseFakeGpus()
    response = test_utils.diag_execute_wrapper(dd, handle)
    if not check_diag_result_pass(response, gpuId, dcgm_structs.DCGM_SM_STRESS_INDEX):
        test_utils.skip_test("Skipping because GPU %s does not pass SM Perf test. "
                             "Please verify whether the GPU is supported and healthy." % gpuId)

    #####
    dd = DcgmDiag.DcgmDiag(gpuIds=[gpuId], testNamesStr="SM Stress",
                           paramsStr="sm stress.test_duration=15",
                           version=dcgm_structs.dcgmRunDiag_version)
    dd.SetThrottleMask(0)
    dd.UseFakeGpus()

    fieldId = dcgm_fields.DCGM_FI_DEV_CLOCK_THROTTLE_REASONS
    insertedError = dcgm_fields.DCGM_CLOCKS_THROTTLE_REASON_HW_SLOWDOWN
    interval = 0.1

    logger.info("Injecting benign errors")
    inject_value(handle, gpuId, fieldId, 3, 1, True)
    # Verify that the inserted values are visible in DCGM before starting the diag
    assert dcgm_internal_helpers.verify_field_value(gpuId, fieldId, 3, checkInterval=interval, maxWait=5, numMatches=1), \
        "Expected inserted values to be visible in DCGM"

    logger.info("Injecting actual errors")
    inject_value(handle, gpuId, fieldId, insertedError, injection_offset, True)
    inject_value(handle, gpuId, dcgm_fields.DCGM_FI_DEV_GPU_TEMP, 1000, injection_offset, True)

    logger.info("Started diag")
    response = test_utils.diag_execute_wrapper(dd, handle)

    # Verify that the inserted values are visible in DCGM
    # Max wait of 8 is because of 5 second offset + 2 seconds required for 20 matches + 1 second buffer.
    assert dcgm_internal_helpers.verify_field_value(gpuId, fieldId, insertedError, checkInterval=0.1, numMatches=1, maxWait=8), \
        "Expected inserted errors to be visible in DCGM"

    throttled, errMsg = find_throttle_failure(response, gpuId, dcgm_structs.DCGM_SM_STRESS_INDEX)
    assert throttled, "Expected to find throttling failure, but did not: (%s)" % errMsg

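# inject_value() and injection_offset above are defined elsewhere in this module. Based on the
# direct dcgm_internal_helpers.inject_value call in helper_test_health_check_instances below,
# the wrapper presumably looks roughly like this sketch; its exact name, signature, and defaults
# are assumptions for illustration only.
def inject_value_sketch(handle, entityId, fieldId, value, offset, verifyInsertion=False):
    # offset is seconds into the future for the injected sample
    return dcgm_internal_helpers.inject_value(handle, entityId, fieldId, value, offset,
                                              isInt=True, verifyInsertion=verifyInsertion,
                                              entityType=dcgm_fields.DCGM_FE_GPU)
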
def test_dcgm_diag_per_gpu_responses_standalone_dcgmi(handle, gpuIds):
    if len(gpuIds) < 2:
        test_utils.skip_test("Skipping because this test requires 2 or more GPUs with same SKU")

    if test_utils.is_throttling_masked_by_nvvs(handle, gpuIds[0], dcgm_fields.DCGM_CLOCKS_THROTTLE_REASON_HW_SLOWDOWN):
        test_utils.skip_test("Skipping because this SKU ignores the throttling we inject for this test")

    logger.info("Starting test for per gpu responses (dcgmi output)")
    helper_per_gpu_responses_dcgmi(handle, gpuIds)

def test_dcgm_standalone_metadata_memory_get_hostengine_sane(handle):
    """
    Sanity test for API that gets memory usage of the hostengine process
    """
    if not option_parser.options.developer_mode:
        test_utils.skip_test("Skipping developer test.")

    handle = pydcgm.DcgmHandle(handle)
    system = pydcgm.DcgmSystem(handle)

    system.introspect.UpdateAll()

    bytesUsed = system.introspect.memory.GetForHostengine().bytesUsed

    logger.debug('the hostengine process is using %.2f MB' % (bytesUsed / 1024. / 1024.))

    assert(1*1024*1024 < bytesUsed < 100*1024*1024), bytesUsed  # 1MB to 100MB

def helper_test_config_config_power_enforce(handle, gpuIds):
    """
    Checks that DCGM can enforce the power settings if they are changed behind the scenes
    """
    handleObj = pydcgm.DcgmHandle(handle=handle)
    systemObj = handleObj.GetSystem()
    groupObj = systemObj.GetEmptyGroup("test1")

    ## Add first GPU to the group
    groupObj.AddGpu(gpuIds[0])
    gpuIds = groupObj.GetGpuIds()  # Only reference GPUs we are testing against
    gpuId = gpuIds[0]

    ## Get Min and Max Power limit on the group
    attributes = systemObj.discovery.GetGpuAttributes(gpuId)

    ## Verify that power is supported on the GPUs in the group
    if dcgmvalue.DCGM_INT32_IS_BLANK(attributes.powerLimits.maxPowerLimit):
        test_utils.skip_test("Needs Power limit to be supported on the GPU")

    powerLimit_set_dcgmi = int((attributes.powerLimits.maxPowerLimit + attributes.powerLimits.minPowerLimit) / 2)
    powerLimit_set_nvsmi = attributes.powerLimits.maxPowerLimit

    config_values = dcgm_structs.c_dcgmDeviceConfig_v1()
    config_values.mEccMode = dcgmvalue.DCGM_INT32_BLANK
    config_values.mPerfState.syncBoost = dcgmvalue.DCGM_INT32_BLANK
    config_values.mPerfState.targetClocks.memClock = dcgmvalue.DCGM_INT32_BLANK
    config_values.mPerfState.targetClocks.smClock = dcgmvalue.DCGM_INT32_BLANK
    config_values.mComputeMode = dcgmvalue.DCGM_INT32_BLANK
    config_values.mPowerLimit.type = dcgm_structs.DCGM_CONFIG_POWER_CAP_INDIVIDUAL
    config_values.mPowerLimit.val = powerLimit_set_dcgmi

    groupObj.config.Set(config_values)

    logger.info("Verify if dcgmi configured value has taken effect")
    helper_verify_power_value(groupObj, powerLimit_set_dcgmi)

    ## Change Power limit to max from an external entity like nvidia-smi
    assert 0 == apps.NvidiaSmiApp(["-pl", str(powerLimit_set_nvsmi), "-i", str(gpuIds[0])]).run(), \
        "nvidia-smi couldn't set the power limit"

    logger.info("Verify if nvsmi configured value has taken effect")
    helper_verify_power_value(groupObj, powerLimit_set_nvsmi)

    groupObj.config.Enforce()

    logger.info("Verify if dcgmi enforced value has taken effect")
    helper_verify_power_value(groupObj, powerLimit_set_dcgmi)

def test_dcgm_topology_device_standalone(handle, gpuIds):
    """
    Verifies that the topology get for the default group works
    """
    handleObj = pydcgm.DcgmHandle(handle=handle)
    systemObj = handleObj.GetSystem()
    groupObj = systemObj.GetDefaultGroup()
    gpuIds = groupObj.GetGpuIds()  # Use just the GPUs in our group

    if len(gpuIds) < 2:
        test_utils.skip_test("Needs >= 2 GPUs")

    topologyInfo = systemObj.discovery.GetGpuTopology(gpuIds[0])

    assert (topologyInfo.numGpus == len(gpuIds) - 1), \
        "Expected %d, received numGpus = %d" % (len(gpuIds) - 1, topologyInfo.numGpus)
    assert (topologyInfo.cpuAffinityMask[0] != 0), "GPU 0 should have *some* affinity"

def helper_test_health_check_instances(handle, gpuIds):
    instances, cis = ensure_instance_ids(handle, gpuIds[0], 1, 1)
    # dict views aren't indexable in Python 3, so materialize them before indexing
    instanceId = list(instances.keys())[0]
    ciId = list(cis.keys())[0]

    handleObj = DcgmHandle.DcgmHandle(handle=handle)
    systemObj = handleObj.GetSystem()
    groupObj = systemObj.GetEmptyGroup("test1")
    groupObj.AddEntity(dcgm_fields.DCGM_FE_GPU, gpuIds[0])
    groupObj.AddEntity(dcgm_fields.DCGM_FE_GPU_I, instanceId)
    groupObj.AddEntity(dcgm_fields.DCGM_FE_GPU_CI, ciId)

    newSystems = dcgm_structs.DCGM_HEALTH_WATCH_MEM
    groupObj.health.Set(newSystems)

    # Verify health prior to testing
    responseV4 = groupObj.health.Check(dcgm_structs.dcgmHealthResponse_version4)
    if responseV4.incidentCount != 0:
        test_utils.skip_test("Cannot test on unhealthy systems.")

    # Inject one error per system
    dcgm_internal_helpers.inject_value(handle, gpuIds[0], dcgm_fields.DCGM_FI_DEV_ECC_DBE_VOL_TOTAL, 2, 5,
                                       isInt=True, verifyInsertion=True, entityType=dcgm_fields.DCGM_FE_GPU)

    responseV4 = groupObj.health.Check(dcgm_structs.dcgmHealthResponse_version4)
    assert (responseV4.incidentCount == 1), "Should have 1 total incident but found %d" % responseV4.incidentCount

    assert (responseV4.incidents[0].entityInfo.entityId == gpuIds[0])
    assert (responseV4.incidents[0].entityInfo.entityGroupId == dcgm_fields.DCGM_FE_GPU)
    assert (responseV4.incidents[0].error.code == dcgm_errors.DCGM_FR_VOLATILE_DBE_DETECTED)
    assert (responseV4.incidents[0].system == dcgm_structs.DCGM_HEALTH_WATCH_MEM)
    assert (responseV4.incidents[0].health == dcgm_structs.DCGM_HEALTH_RESULT_FAIL)

def test_logging_env_var():
    """
    Verifies that we log to the supplied env var
    """
    if test_utils.loggingLevel != 'DEBUG':
        test_utils.skip_test("Detected logLevel != DEBUG. This test requires DEBUG. Likely cause: --eris option")

    passed = False

    # Env var is automatically set in NvHostEngineApp
    nvhost_engine = apps.NvHostEngineApp()
    nvhost_engine.start(timeout=10)

    contents = None
    # Try for 5 seconds
    for i in range(25):
        time.sleep(0.2)
        with closing(open(nvhost_engine.dcgm_trace_fname, encoding='utf-8')) as f:  # pylint: disable=no-member
            contents = f.read()
        logger.debug("Read %d bytes from %s" % (len(contents), nvhost_engine.dcgm_trace_fname))

        # This is checking two things:
        #   - that we are logging to the file specified in ENV
        #   - that we are setting severity according to ENV (DEBUG)
        if 'DEBUG' in contents:
            passed = True
            break

    # Cleaning up
    nvhost_engine.terminate()
    nvhost_engine.validate()

    errorString = ""
    if not passed:
        if contents is not None:
            errorString = "Unable to find 'DEBUG' in log file"
        else:
            errorString = "log file %s was never read" % nvhost_engine.dcgm_trace_fname

    assert passed, errorString

def helper_dcgm_config_powerbudget(handle, gpuIds):
    handleObj = pydcgm.DcgmHandle(handle=handle)
    systemObj = handleObj.GetSystem()
    groupObj = systemObj.GetEmptyGroup("test1")

    ## Add first GPU to the group
    groupObj.AddGpu(gpuIds[0])
    gpuIds = groupObj.GetGpuIds()  # Only reference GPUs we are testing against

    ## Get Min and Max Power limit on the group
    attributes = dcgm_agent.dcgmGetDeviceAttributes(handle, gpuIds[0])

    ## Verify that power is supported on the GPUs in the group
    if dcgmvalue.DCGM_INT32_IS_BLANK(attributes.powerLimits.maxPowerLimit):
        test_utils.skip_test("Needs Power limit to be supported on the GPU")

    powerLimit = int((attributes.powerLimits.maxPowerLimit + attributes.powerLimits.minPowerLimit) / 2)

    config_values = dcgm_structs.c_dcgmDeviceConfig_v1()
    config_values.mEccMode = dcgmvalue.DCGM_INT32_BLANK
    config_values.mPerfState.syncBoost = dcgmvalue.DCGM_INT32_BLANK
    config_values.mPerfState.targetClocks.memClock = dcgmvalue.DCGM_INT32_BLANK
    config_values.mPerfState.targetClocks.smClock = dcgmvalue.DCGM_INT32_BLANK
    config_values.mComputeMode = dcgmvalue.DCGM_INT32_BLANK
    config_values.mPowerLimit.type = dcgm_structs.DCGM_CONFIG_POWER_BUDGET_GROUP
    config_values.mPowerLimit.val = powerLimit * len(gpuIds)  # Assumes homogeneous GPUs

    groupObj.config.Set(config_values)

    config_values = groupObj.config.Get(dcgm_structs.DCGM_CONFIG_CURRENT_STATE)
    assert len(config_values) > 0, "Failed to get configuration using groupObj.config.Get"

    for x in range(0, len(gpuIds)):
        if config_values[x].mPowerLimit.val != dcgmvalue.DCGM_INT32_NOT_SUPPORTED:
            assert config_values[x].mPowerLimit.type == dcgm_structs.DCGM_CONFIG_POWER_CAP_INDIVIDUAL, \
                "The power limit type for gpuId %d is incorrect. Returned: %d Expected: %d" % \
                (x, config_values[x].mPowerLimit.type, dcgm_structs.DCGM_CONFIG_POWER_CAP_INDIVIDUAL)
            assert config_values[x].mPowerLimit.val == powerLimit, \
                "The power limit value for gpuId %d is incorrect. Returned: %d Expected: %s" % \
                (x, config_values[x].mPowerLimit.val, powerLimit)

def helper_verify_diag_passing(handle, gpuIds, testNames="SM Stress", testIndex=dcgm_structs.DCGM_SM_STRESS_INDEX,
                               params="sm stress.test_duration=15", version=dcgm_structs.dcgmRunDiag_version,
                               useFakeGpus=False):
    dd = DcgmDiag.DcgmDiag(gpuIds=gpuIds, testNamesStr=testNames, paramsStr=params, version=version)
    # We explicitly want to fail for throttle reasons since this test inserts throttling
    # errors for verification
    dd.SetThrottleMask(0)
    if useFakeGpus:
        dd.UseFakeGpus()

    # If we've already checked this GPU, then use the previous result
    runDiag = False
    for gpuId in gpuIds:
        if gpuId in checked_gpus:
            if checked_gpus[gpuId] == False:
                test_utils.skip_test("Skipping because GPU %s does not pass SM Perf test. "
                                     "Please verify whether the GPU is supported and healthy." % gpuId)
        else:
            runDiag = True

    if runDiag == False:
        return dd

    response = test_utils.diag_execute_wrapper(dd, handle)
    for gpuId in gpuIds:
        if not check_diag_result_pass(response, gpuId, testIndex):
            checked_gpus[gpuId] = False
            test_utils.skip_test("Skipping because GPU %s does not pass SM Perf test. "
                                 "Please verify whether the GPU is supported and healthy." % gpuId)
        else:
            checked_gpus[gpuId] = True

    return dd

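# check_diag_result_pass(), used here and in helper_throttling_masking_failures, is defined
# elsewhere in this module. A minimal sketch of the assumed behavior: look up the per-GPU
# result for the given test index in the diag response and treat DCGM_DIAG_RESULT_PASS as
# passing. The field layout below follows the dcgmDiagResponse structs exposed by
# dcgm_structs, but treat this as an assumption rather than the canonical helper.
def check_diag_result_pass_sketch(response, gpuId, testIndex):
    result = response.perGpuResponses[gpuId].results[testIndex].result
    return result == dcgm_structs.DCGM_DIAG_RESULT_PASS
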
def test_dcgm_topology_device_nvlink_standalone(handle, gpuIds):
    """
    Verifies that the topology get for the default group returns valid NVLINK info
    """
    handleObj = pydcgm.DcgmHandle(handle=handle)
    systemObj = handleObj.GetSystem()
    groupObj = systemObj.GetDefaultGroup()
    gpuIds = groupObj.GetGpuIds()  # Use just the GPUs in our group

    if len(gpuIds) < 2:
        test_utils.skip_test("Needs >= 2 GPUs")

    topologyInfo = systemObj.discovery.GetGpuTopology(gpuIds[0])

    if topologyInfo.gpuPaths[0].localNvLinkIds == 0:
        test_utils.skip_test("Needs NVLINK support")

    assert ((topologyInfo.gpuPaths[0].path & 0xFFFFFF00) > 0), "No NVLINK state set when localNvLinkIds is > 0"

def test_dcgm_topology_group_single_gpu_standalone(handle, gpuIds):
    """
    Verifies that the topology get for a group works for a single GPU
    """
    # Topology will work for a one-GPU group if there are > 1 GPUs on the system
    if len(gpuIds) < 2:
        test_utils.skip_test("Needs >= 2 GPUs")

    handleObj = pydcgm.DcgmHandle(handle=handle)
    systemObj = handleObj.GetSystem()
    groupObj = systemObj.GetEmptyGroup("test1")
    groupObj.AddGpu(gpuIds[0])
    gpuIds = groupObj.GetGpuIds()  # Use just the GPUs in our group

    topologyInfo = groupObj.discovery.GetTopology()

    assert (topologyInfo.numaOptimalFlag > 0), "with a single GPU, numa is by default optimal"
    assert (topologyInfo.slowestPath == 0), "with a single GPU, slowest path shouldn't be set"

def test_linting_get_py_files_to_lint():
    '''
    A unit test for linting.py's _get_py_files_to_lint function. It tests that:
      - if a python file has no .pylint-passed file then it is linted
      - if a .pylint-passed file modify time is older than its python file then it is linted
      - otherwise the file is not selected for linting
    '''
    if not option_parser.options.lint:
        test_utils.skip_test("not supported when the \"--no-lint\" option is used")

    notYetLinted = 'notYetLinted.py'
    oldLinted = 'oldLinted.py'
    newLinted = 'newLinted.py'
    tmpDir = tempfile.mkdtemp()

    def pylint_file(pyFile):
        return '.%s.pylint-passed' % pyFile

    # set up the py/pylint-passed files with the proper modification times
    for file in [notYetLinted,
                 pylint_file(oldLinted), oldLinted,
                 newLinted, pylint_file(newLinted)]:
        fp = os.path.join(tmpDir, file)
        with open(fp, 'a'):
            os.utime(fp, None)
        time.sleep(0.005)  # must wait so that modification time changes

    filesToLint = linting._get_py_files_to_lint(tmpDir)
    expectedFilesToLint = [os.path.join(tmpDir, file) for file in [notYetLinted, oldLinted]]

    unexpectedFilesToLint = set(filesToLint) - set(expectedFilesToLint)
    assert len(unexpectedFilesToLint) == 0, \
        "These python files should not have been linted: %s" % unexpectedFilesToLint

    expectedFilesNotLinted = set(expectedFilesToLint) - set(filesToLint)
    assert len(expectedFilesNotLinted) == 0, \
        "These python files should have been linted: %s" % expectedFilesNotLinted

def test_linting_clear_lint_artifacts():
    '''
    A unit test for linting.py's _clear_lint_artifacts function.
    It tests that it actually clears any artifact files that were generated by pylint.
    '''
    if not option_parser.options.lint:
        test_utils.skip_test("not supported when the \"--no-lint\" option is used")

    tmpDir = tempfile.mkdtemp()
    artifactFile = os.path.join(tmpDir, '.file.py.pylint-passed')

    with open(artifactFile, 'w'):
        os.utime(artifactFile, None)
    assert os.path.isfile(artifactFile), 'Failed to create file for testing'

    linting._clear_lint_artifacts(tmpDir)
    assert not os.path.isfile(artifactFile), 'pylint artifact was not removed'

def test_dcgm_prof_watch_multipass(handle, gpuIds):
    dcgmHandle = pydcgm.DcgmHandle(handle=handle)
    dcgmSystem = dcgmHandle.GetSystem()
    dcgmGroup = dcgmSystem.GetGroupWithGpuIds('mygroup', gpuIds)

    helper_check_profiling_environment(dcgmGroup)

    mpFieldIds = helper_get_multipass_field_ids(dcgmGroup)
    if mpFieldIds is None:
        test_utils.skip_test("No multipass profiling fields exist for the gpu group")

    logger.info("Multipass fieldIds: " + str(mpFieldIds))

    # Make sure that multipass watching up to DLG_MAX_METRIC_GROUPS groups works
    for i in range(min(len(mpFieldIds), DLG_MAX_METRIC_GROUPS)):
        fieldIds = []
        for j in range(i + 1):
            fieldIds.extend(mpFieldIds[j])

        logger.info("Positive testing multipass fieldIds %s" % str(fieldIds))

        dcgmGroup.profiling.WatchFields(fieldIds, 1000000, 3600.0, 0)
        dcgmGroup.profiling.UnwatchFields()

    if len(mpFieldIds) <= DLG_MAX_METRIC_GROUPS:
        test_utils.skip_test("Skipping multipass failure test since there are %d <= %d multipass groups." %
                             (len(mpFieldIds), DLG_MAX_METRIC_GROUPS))

    for i in range(DLG_MAX_METRIC_GROUPS + 1, len(mpFieldIds) + 1):
        fieldIds = []
        for j in range(i):
            fieldIds.extend(mpFieldIds[j])

        logger.info("Negative testing multipass fieldIds %s" % str(fieldIds))

        with test_utils.assert_raises(dcgm_structs.dcgmExceptionClass(dcgm_structs.DCGM_ST_PROFILING_MULTI_PASS)):
            dcgmGroup.profiling.WatchFields(fieldIds, 1000000, 3600.0, 0)
            dcgmGroup.profiling.UnwatchFields()