def test_dcgm_port_standalone(handle, gpuIds):
    """
    Verifies that the DCGM engine behind `handle` answers a device query.

    The test is meant for a host engine listening on a non-default port; the
    port setup itself happens outside this function (presumably in the
    fixture/decorator that supplies `handle` -- confirm against the caller).

    handle -- DCGM handle passed to dcgm_agent.dcgmGetAllDevices
    gpuIds -- supplied by the test harness; not used by this test
    """
    device_ids = dcgm_agent.dcgmGetAllDevices(handle)
    device_count = len(device_ids)

    # NOTE(review): a length is never negative, so this assertion cannot fire.
    # The effective check is that the dcgmGetAllDevices call above completes.
    assert device_count >= 0, "Standalone host engine using different port number failed."
def GetAllGpuIds(self):
    """
    Return the GPU IDs reported by dcgm_agent.dcgmGetAllDevices for the
    DCGM handle held in self._dcgmHandle.
    """
    return dcgm_agent.dcgmGetAllDevices(self._dcgmHandle.handle)
% (x, config_values[x].mComputeMode, expected_compute_mode) assert config_values[x].mEccMode == expected_ecc, "The ecc mode value for gpuID %d is incorrect."\ " Returned: %d Expected: %d" \ % (x, config_values[x].mEccMode, expected_ecc) pass ret = dcgm_agent.dcgmStatusDestroy(status_handle) assert (ret == dcgm_structs.DCGM_ST_OK ), "Failed to remove status handler, error: %s" % ret dcgm_structs._LoadDcgmLibrary() handle = dcgm_agent.dcgmInit() devices = dcgm_agent.dcgmGetAllDevices(handle) validDevices = list() for x in devices: fvSupported = dcgm_agent_internal.dcgmGetLatestValuesForFields( handle, x, [ dcgm_fields.DCGM_FI_DEV_RETIRED_DBE, ]) if (fvSupported[0].value.i64 != dcgmvalue.DCGM_INT64_NOT_SUPPORTED): validDevices.append(x) if (len(validDevices) == 0): print "Can only run if at least one GPU with ECC is present" sys.exit(1) print "Number of valid devices: %d" % len(validDevices)