def get_devices(self):
    """ Return name of devices """
    self.initialize()
    if self.device_count == 0:
        names = list()
    elif IS_MACOS:
        names = [pynvx.cudaGetName(handle, ignore=True)
                 for handle in self.handles]
    else:
        names = [pynvml.nvmlDeviceGetName(handle).decode("utf-8")
                 for handle in self.handles]
    if self.logger:
        self.logger.debug("GPU Devices: %s", names)
    return names
def get_gpu_info(handle):
    """Get one GPU information specified by nvml handle"""

    def get_process_info(nv_process):
        """Get the process information of specific pid"""
        process = {}
        if nv_process.pid not in GPUStatCollection.global_processes:
            GPUStatCollection.global_processes[nv_process.pid] = \
                psutil.Process(pid=nv_process.pid)
        ps_process = GPUStatCollection.global_processes[nv_process.pid]

        # TODO: ps_process is being cached, but the dict below is not.
        process['username'] = ps_process.username()
        # cmdline returns full path;
        # as in `ps -o comm`, get short cmdnames.
        _cmdline = ps_process.cmdline()
        if not _cmdline:
            # sometimes, zombie or unknown (e.g. [kworker/8:2H])
            process['command'] = '?'
            process['full_command'] = ['?']
        else:
            process['command'] = os.path.basename(_cmdline[0])
            process['full_command'] = _cmdline
        # Bytes to MBytes
        # if drivers are not TTC this will be None.
        usedmem = (nv_process.usedGpuMemory // MB
                   if nv_process.usedGpuMemory else None)
        process['gpu_memory_usage'] = usedmem
        # process['gpu_memory_usage'] = ("%d MiB" % usedmem
        #                                if usedmem is not None else usedmem)
        process['cpu_percent'] = ps_process.cpu_percent()
        # process['cpu_memory_usage'] = "%d MiB" % (
        #     round((ps_process.memory_percent() / 100.0) *
        #           psutil.virtual_memory().total) // MB)
        process['cpu_memory_usage'] = (
            round((ps_process.memory_percent() / 100.0) *
                  psutil.virtual_memory().total) // MB)
        process['pid'] = nv_process.pid
        return process

    name = _decode(N.nvmlDeviceGetName(handle))
    uuid = _decode(N.nvmlDeviceGetUUID(handle))

    try:
        temperature = N.nvmlDeviceGetTemperature(
            handle, N.NVML_TEMPERATURE_GPU
        )
    except N.NVMLError:
        temperature = None  # Not supported

    try:
        fan_speed = N.nvmlDeviceGetFanSpeed(handle)
    except N.NVMLError:
        fan_speed = None  # Not supported

    try:
        memory = N.nvmlDeviceGetMemoryInfo(handle)  # in Bytes
    except N.NVMLError:
        memory = None  # Not supported

    try:
        utilization = N.nvmlDeviceGetUtilizationRates(handle)
    except N.NVMLError:
        utilization = None  # Not supported

    try:
        utilization_enc = N.nvmlDeviceGetEncoderUtilization(handle)
    except N.NVMLError:
        utilization_enc = None  # Not supported

    try:
        utilization_dec = N.nvmlDeviceGetDecoderUtilization(handle)
    except N.NVMLError:
        utilization_dec = None  # Not supported

    try:
        power = N.nvmlDeviceGetPowerUsage(handle)
    except N.NVMLError:
        power = None

    try:
        power_limit = N.nvmlDeviceGetEnforcedPowerLimit(handle)
    except N.NVMLError:
        power_limit = None

    try:
        nv_comp_processes = \
            N.nvmlDeviceGetComputeRunningProcesses(handle)
    except N.NVMLError:
        nv_comp_processes = None  # Not supported
    try:
        nv_graphics_processes = \
            N.nvmlDeviceGetGraphicsRunningProcesses(handle)
    except N.NVMLError:
        nv_graphics_processes = None  # Not supported

    if nv_comp_processes is None and nv_graphics_processes is None:
        processes = None
    else:
        processes = []
        nv_comp_processes = nv_comp_processes or []
        nv_graphics_processes = nv_graphics_processes or []
        # A single process might run in both graphics and compute mode;
        # however, we will display the process only once.
        seen_pids = set()
        for nv_process in nv_comp_processes + nv_graphics_processes:
            if nv_process.pid in seen_pids:
                continue
            seen_pids.add(nv_process.pid)
            try:
                process = get_process_info(nv_process)
                processes.append(process)
            except psutil.NoSuchProcess:
                # TODO: add some reminder for NVML broken context
                # e.g. nvidia-smi reset or reboot the system
                pass
            except FileNotFoundError:
                # Ignore the exception which has probably occurred
                # from psutil, due to a non-existent PID (see #95).
                # The exception should have been translated, but
                # there appears to be a bug in psutil. It is unlikely
                # FileNotFoundError is thrown in different situations.
                pass

        # TODO: Do not block if full process info is not requested
        time.sleep(0.1)
        for process in processes:
            pid = process['pid']
            cache_process = GPUStatCollection.global_processes[pid]
            try:
                process['cpu_percent'] = cache_process.cpu_percent()
            except psutil.NoSuchProcess:
                process['cpu_percent'] = 0.0
            except FileNotFoundError:
                # Ignore the exception which has probably occurred
                # from psutil, due to a non-existent PID (see #95).
                # The exception should have been translated, but
                # there appears to be a bug in psutil. It is unlikely
                # FileNotFoundError is thrown in different situations.
                process['cpu_percent'] = 0.0

    index = N.nvmlDeviceGetIndex(handle)
    gpu_info = {
        'index': index,
        'uuid': uuid,
        'name': name,
        'temperature.gpu': temperature,
        'fan.speed': fan_speed,
        'utilization.gpu': utilization.gpu if utilization else 0,
        'utilization.enc':
            utilization_enc[0] if utilization_enc else None,
        'utilization.dec':
            utilization_dec[0] if utilization_dec else None,
        'power.draw': power // 1000 if power is not None else 0,
        'enforced.power.limit':
            power_limit // 1000 if power_limit is not None else 0,
        # Convert bytes into MBytes
        'memory.used': memory.used // MB if memory else 0,
        'memory.total': memory.total // MB if memory else 0,
        'processes': processes,
    }
    GPUStatCollection.clean_processes()
    return gpu_info
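The gpustat-style snippets above and below rely on a module-level MB constant, a _decode helper, and an N alias for the NVML bindings, all defined elsewhere in their source. A minimal sketch of plausible definitions follows; the exact originals are assumptions, not part of these snippets:

import pynvml as N  # the snippets alias their NVML bindings as N

MB = 1024 * 1024  # bytes per MiB, used by the memory conversions above

def _decode(b):
    # Device name/UUID queries return bytes on older nvidia-ml-py
    # releases and str on newer ones; normalize to str either way.
    if isinstance(b, bytes):
        return b.decode('utf-8')
    return b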
def get_infos():
    """Get all information about all your graphics cards.

    Returns:
        dict: The returned result is a dict with 3 keys:
            count, driver_version and devices:
            count: Number of GPUs found
            driver_version: The version of the system's graphics driver
            devices: A list in which every item is a namedtuple Device
                with 10 fields, for example id, name and fan_speed.
                Note that the process field is itself a list of
                namedtuples with 11 fields each.
    """
    infos = {}
    Device = namedtuple(
        "Device",
        [
            "id",
            "name",
            "free",
            "used",
            "total",
            "temperature",
            "fan_speed",
            "power_usage",
            "power_state",
            "process",
        ],
    )
    Process = namedtuple(
        "Process",
        [
            "pid",
            "memory_percent",
            "status",
            "username",
            "num_threads",
            "cpu_num",
            "cpu_percent",
            "name",
            "cmdline",
            "used_gpu_mem",
            "create_time",
        ],
    )
    driver_version = pynvml.nvmlSystemGetDriverVersion().decode()
    device_count = pynvml.nvmlDeviceGetCount()
    devices = []
    for i in range(device_count):
        handle = pynvml.nvmlDeviceGetHandleByIndex(i)
        name = pynvml.nvmlDeviceGetName(handle).decode()
        mem_info = pynvml.nvmlDeviceGetMemoryInfo(handle)
        power_usage = pynvml.nvmlDeviceGetPowerUsage(
            handle)  # Power usage in milliwatts (mW)
        processes = pynvml.nvmlDeviceGetComputeRunningProcesses(
            handle)  # Which processes are using the GPU
        # process_info = [(item.pid, item.usedGpuMemory) for item in process_info]
        process_info = []
        for p in processes:
            # append Process object to process_info
            pid = p.pid
            used_gpu_mem = p.usedGpuMemory
            p = psutil.Process(pid=pid)
            _ = p.cpu_percent()
            time.sleep(0.05)
            process_info.append(
                Process(
                    pid=pid,
                    memory_percent=p.memory_percent(),
                    status=p.status(),
                    username=p.username(),
                    num_threads=p.num_threads(),
                    cpu_num=p.cpu_num(),
                    cpu_percent=p.cpu_percent(),
                    name=p.name(),
                    cmdline=" ".join(p.cmdline()),
                    used_gpu_mem=used_gpu_mem,
                    create_time=p.create_time(),
                ))
        try:
            fan_speed = pynvml.nvmlDeviceGetFanSpeed(handle)
        except pynvml.NVMLError_NotSupported:
            fan_speed = None
        power_usage = pynvml.nvmlDeviceGetPowerUsage(handle)
        power_state = pynvml.nvmlDeviceGetPowerState(handle)
        temperature = pynvml.nvmlDeviceGetTemperature(
            handle, pynvml.NVML_TEMPERATURE_GPU)
        devices.append(
            Device(
                id=i,
                name=name,
                free=mem_info.free,
                used=mem_info.used,
                total=mem_info.total,
                temperature=temperature,
                fan_speed=fan_speed,
                power_usage=power_usage,
                power_state=power_state,
                process=process_info,
            ))
    infos["count"] = device_count
    infos["driver_version"] = driver_version
    infos["devices"] = devices
    return infos
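A hedged usage sketch for get_infos(): the function itself does not initialize NVML, so this assumes the caller wraps it in nvmlInit()/nvmlShutdown(). Field names come from the namedtuples above; the printed layout is illustrative only:

import pynvml

pynvml.nvmlInit()
try:
    infos = get_infos()
    print("driver:", infos["driver_version"])
    for dev in infos["devices"]:
        print(dev.id, dev.name, dev.used, "/", dev.total, "bytes")
        for proc in dev.process:
            # used_gpu_mem can be None when the driver does not report it
            print("  pid", proc.pid, "uses", proc.used_gpu_mem, "bytes")
finally:
    pynvml.nvmlShutdown()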
def _get_data(self):
    data = {}

    if self.deviceCount:
        for i in range(self.deviceCount):
            gpuIdx = str(i)
            handle = pynvml.nvmlDeviceGetHandleByIndex(i)
            name = pynvml.nvmlDeviceGetName(handle)
            brand = pynvml.nvmlDeviceGetBrand(handle)
            brands = ['Unknown', 'Quadro', 'Tesla', 'NVS', 'Grid', 'GeForce', 'Titan']

            ### Get data ###
            ## Memory usage
            try:
                mem = pynvml.nvmlDeviceGetMemoryInfo(handle)
            except Exception as e:
                self.debug(str(e))
                mem = None
            ## ECC errors
            try:
                _memError = {}
                _eccCounter = {}
                eccErrors = {}
                eccCounterType = ['VOLATILE_ECC', 'AGGREGATE_ECC']
                memErrorType = ['ERROR_TYPE_CORRECTED', 'ERROR_TYPE_UNCORRECTED']
                memoryLocationType = ['L1_CACHE', 'L2_CACHE', 'DEVICE_MEMORY', 'REGISTER_FILE', 'TEXTURE_MEMORY']
                for memoryLocation in range(5):
                    for eccCounter in range(2):
                        for memError in range(2):
                            _memError[memErrorType[memError]] = pynvml.nvmlDeviceGetMemoryErrorCounter(handle, memError, eccCounter, memoryLocation)
                        _eccCounter[eccCounterType[eccCounter]] = _memError
                    eccErrors[memoryLocationType[memoryLocation]] = _eccCounter
            except Exception as e:
                self.debug(str(e))
                eccErrors = None
            ## Temperature
            try:
                temp = pynvml.nvmlDeviceGetTemperature(handle, pynvml.NVML_TEMPERATURE_GPU)
            except Exception as e:
                self.debug(str(e))
                temp = None
            ## Fan
            try:
                fanspeed = pynvml.nvmlDeviceGetFanSpeed(handle)
            except Exception as e:
                self.debug(str(e))
                fanspeed = None
            ## GPU and Memory Utilization
            try:
                util = pynvml.nvmlDeviceGetUtilizationRates(handle)
                gpu_util = util.gpu
                mem_util = util.memory
            except Exception as e:
                self.debug(str(e))
                gpu_util = None
                mem_util = None
            ## Encoder Utilization
            try:
                encoder = pynvml.nvmlDeviceGetEncoderUtilization(handle)
                enc_util = encoder[0]
            except Exception as e:
                self.debug(str(e))
                enc_util = None
            ## Decoder Utilization
            try:
                decoder = pynvml.nvmlDeviceGetDecoderUtilization(handle)
                dec_util = decoder[0]
            except Exception as e:
                self.debug(str(e))
                dec_util = None
            ## Clock frequencies
            try:
                clock_core = pynvml.nvmlDeviceGetClockInfo(handle, pynvml.NVML_CLOCK_GRAPHICS)
                clock_sm = pynvml.nvmlDeviceGetClockInfo(handle, pynvml.NVML_CLOCK_SM)
                clock_mem = pynvml.nvmlDeviceGetClockInfo(handle, pynvml.NVML_CLOCK_MEM) * self.nvMemFactor
            except Exception as e:
                self.debug(str(e))
                clock_core = None
                clock_sm = None
                clock_mem = None

            ### Packing data ###
            self.debug("Device", gpuIdx, ":", str(name))
            data["device_name_" + gpuIdx] = name
            self.debug("Brand:", str(brands[brand]))
            self.debug(str(name), "Temp :", str(temp))
            data["device_temp_" + gpuIdx] = temp
            self.debug(str(name), "Mem total :", str(mem.total), 'bytes')
            data["device_mem_total_" + gpuIdx] = mem.total
            self.debug(str(name), "Mem used :", str(mem.used), 'bytes')
            data["device_mem_used_" + gpuIdx] = mem.used
            self.debug(str(name), "Mem free :", str(mem.free), 'bytes')
            data["device_mem_free_" + gpuIdx] = mem.free
            self.debug(str(name), "Load GPU :", str(gpu_util), '%')
            data["device_load_gpu_" + gpuIdx] = gpu_util
            self.debug(str(name), "Load MEM :", str(mem_util), '%')
            data["device_load_mem_" + gpuIdx] = mem_util
            self.debug(str(name), "Load ENC :", str(enc_util), '%')
            data["device_load_enc_" + gpuIdx] = enc_util
            self.debug(str(name), "Load DEC :", str(dec_util), '%')
            data["device_load_dec_" + gpuIdx] = dec_util
            self.debug(str(name), "Core clock:", str(clock_core), 'MHz')
            data["device_core_clock_" + gpuIdx] = clock_core
            self.debug(str(name), "SM clock :", str(clock_sm), 'MHz')
            data["device_sm_clock_" + gpuIdx] = clock_sm
            self.debug(str(name), "Mem clock :", str(clock_mem), 'MHz')
            data["device_mem_clock_" + gpuIdx] = clock_mem
            self.debug(str(name), "Fan speed :", str(fanspeed), '%')
            data["device_fanspeed_" + gpuIdx] = fanspeed
            self.debug(str(name), "ECC errors:", str(eccErrors))
            if eccErrors is not None:
                data["device_ecc_errors_L1_CACHE_VOLATILE_CORRECTED_" + gpuIdx] = eccErrors["L1_CACHE"]["VOLATILE_ECC"]["ERROR_TYPE_CORRECTED"]
                data["device_ecc_errors_L1_CACHE_VOLATILE_UNCORRECTED_" + gpuIdx] = eccErrors["L1_CACHE"]["VOLATILE_ECC"]["ERROR_TYPE_UNCORRECTED"]
                data["device_ecc_errors_L1_CACHE_AGGREGATE_CORRECTED_" + gpuIdx] = eccErrors["L1_CACHE"]["AGGREGATE_ECC"]["ERROR_TYPE_CORRECTED"]
                data["device_ecc_errors_L1_CACHE_AGGREGATE_UNCORRECTED_" + gpuIdx] = eccErrors["L1_CACHE"]["AGGREGATE_ECC"]["ERROR_TYPE_UNCORRECTED"]
                data["device_ecc_errors_L2_CACHE_VOLATILE_CORRECTED_" + gpuIdx] = eccErrors["L2_CACHE"]["VOLATILE_ECC"]["ERROR_TYPE_CORRECTED"]
                data["device_ecc_errors_L2_CACHE_VOLATILE_UNCORRECTED_" + gpuIdx] = eccErrors["L2_CACHE"]["VOLATILE_ECC"]["ERROR_TYPE_UNCORRECTED"]
                data["device_ecc_errors_L2_CACHE_AGGREGATE_CORRECTED_" + gpuIdx] = eccErrors["L2_CACHE"]["AGGREGATE_ECC"]["ERROR_TYPE_CORRECTED"]
                data["device_ecc_errors_L2_CACHE_AGGREGATE_UNCORRECTED_" + gpuIdx] = eccErrors["L2_CACHE"]["AGGREGATE_ECC"]["ERROR_TYPE_UNCORRECTED"]
                data["device_ecc_errors_DEVICE_MEMORY_VOLATILE_CORRECTED_" + gpuIdx] = eccErrors["DEVICE_MEMORY"]["VOLATILE_ECC"]["ERROR_TYPE_CORRECTED"]
                data["device_ecc_errors_DEVICE_MEMORY_VOLATILE_UNCORRECTED_" + gpuIdx] = eccErrors["DEVICE_MEMORY"]["VOLATILE_ECC"]["ERROR_TYPE_UNCORRECTED"]
                data["device_ecc_errors_DEVICE_MEMORY_AGGREGATE_CORRECTED_" + gpuIdx] = eccErrors["DEVICE_MEMORY"]["AGGREGATE_ECC"]["ERROR_TYPE_CORRECTED"]
                data["device_ecc_errors_DEVICE_MEMORY_AGGREGATE_UNCORRECTED_" + gpuIdx] = eccErrors["DEVICE_MEMORY"]["AGGREGATE_ECC"]["ERROR_TYPE_UNCORRECTED"]
                data["device_ecc_errors_REGISTER_FILE_VOLATILE_CORRECTED_" + gpuIdx] = eccErrors["REGISTER_FILE"]["VOLATILE_ECC"]["ERROR_TYPE_CORRECTED"]
                data["device_ecc_errors_REGISTER_FILE_VOLATILE_UNCORRECTED_" + gpuIdx] = eccErrors["REGISTER_FILE"]["VOLATILE_ECC"]["ERROR_TYPE_UNCORRECTED"]
                data["device_ecc_errors_REGISTER_FILE_AGGREGATE_CORRECTED_" + gpuIdx] = eccErrors["REGISTER_FILE"]["AGGREGATE_ECC"]["ERROR_TYPE_CORRECTED"]
                data["device_ecc_errors_REGISTER_FILE_AGGREGATE_UNCORRECTED_" + gpuIdx] = eccErrors["REGISTER_FILE"]["AGGREGATE_ECC"]["ERROR_TYPE_UNCORRECTED"]
                data["device_ecc_errors_TEXTURE_MEMORY_VOLATILE_CORRECTED_" + gpuIdx] = eccErrors["TEXTURE_MEMORY"]["VOLATILE_ECC"]["ERROR_TYPE_CORRECTED"]
                data["device_ecc_errors_TEXTURE_MEMORY_VOLATILE_UNCORRECTED_" + gpuIdx] = eccErrors["TEXTURE_MEMORY"]["VOLATILE_ECC"]["ERROR_TYPE_UNCORRECTED"]
                data["device_ecc_errors_TEXTURE_MEMORY_AGGREGATE_CORRECTED_" + gpuIdx] = eccErrors["TEXTURE_MEMORY"]["AGGREGATE_ECC"]["ERROR_TYPE_CORRECTED"]
                data["device_ecc_errors_TEXTURE_MEMORY_AGGREGATE_UNCORRECTED_" + gpuIdx] = eccErrors["TEXTURE_MEMORY"]["AGGREGATE_ECC"]["ERROR_TYPE_UNCORRECTED"]
            else:
                data["device_ecc_errors_L1_CACHE_VOLATILE_CORRECTED_" + gpuIdx] = None

    ## Get unit (S-class Nvidia cards) data
    if self.unitCount:
        for i in range(self.unitCount):
            gpuIdx = str(i)
            handle = pynvml.nvmlUnitGetHandleByIndex(i)

            try:
                fan = pynvml.nvmlUnitGetFanSpeedInfo(handle)
                fan_speed = fan.speed  # Fan speed (RPM)
                fan_state = fan.state  # Flag that indicates whether fan is working properly
            except Exception as e:
                self.debug(str(e))
                fan_speed = None
                fan_state = None
            try:
                psu = pynvml.nvmlUnitGetPsuInfo(handle)
                psu_current = psu.current  # PSU current (A)
                psu_power = psu.power  # PSU power draw (W)
                psu_state = psu.state  # The power supply state
                psu_voltage = psu.voltage  # PSU voltage (V)
            except Exception as e:
                self.debug(str(e))
                psu_current = None
                psu_power = None
                psu_state = None
                psu_voltage = None
            try:
                temp_intake = pynvml.nvmlUnitGetTemperature(handle, 0)  # Temperature at intake in C
                temp_exhaust = pynvml.nvmlUnitGetTemperature(handle, 1)  # Temperature at exhaust in C
                temp_board = pynvml.nvmlUnitGetTemperature(handle, 2)  # Temperature on board in C
            except Exception as e:
                self.debug(str(e))
                temp_intake = None
                temp_exhaust = None
                temp_board = None

            self.debug('Unit fan speed:', str(fan_speed))
            data["unit_fan_speed_" + gpuIdx] = fan_speed
            self.debug('Unit fan state:', str(fan_state))
            data["unit_fan_state_" + gpuIdx] = fan_state
            self.debug('Unit PSU current:', str(psu_current))
            data["unit_psu_current_" + gpuIdx] = psu_current
            self.debug('Unit PSU power:', str(psu_power))
            data["unit_psu_power_" + gpuIdx] = psu_power
            self.debug('Unit PSU state:', str(psu_state))
            data["unit_psu_state_" + gpuIdx] = psu_state
            self.debug('Unit PSU voltage:', str(psu_voltage))
            data["unit_psu_voltage_" + gpuIdx] = psu_voltage
            self.debug('Unit temp intake:', str(temp_intake))
            data["unit_temp_intake_" + gpuIdx] = temp_intake
            self.debug('Unit temp exhaust:', str(temp_exhaust))
            data["unit_temp_exhaust_" + gpuIdx] = temp_exhaust
            self.debug('Unit temp board:', str(temp_board))
            data["unit_temp_board_" + gpuIdx] = temp_board

    ## Get data via legacy mode
    if self.legacy:
        try:
            output, error = Popen(
                [
                    "nvidia-settings",
                    "-c", ":0",
                    "-q", "GPUUtilization",
                    "-q", "GPUCurrentClockFreqs",
                    "-q", "GPUCoreTemp",
                    "-q", "TotalDedicatedGPUMemory",
                    "-q", "UsedDedicatedGPUMemory"
                ],
                shell=False, stdout=PIPE, stderr=PIPE).communicate()
            output = repr(str(output))
            if len(output) < 800:
                raise Exception('Error in fetching data from nvidia-settings ' + output)
            self.debug(str(error), output)
        except Exception as e:
            self.error(str(e))
            self.error('Setting legacy mode to False')
            self.legacy = False
            return data
        for i in range(self.deviceCount):
            gpuIdx = str(i)
            if data["device_temp_" + gpuIdx] is None:
                coreTemp = findall(r'GPUCoreTemp.*?(gpu:\d*).*?\s(\d*)', output)[i][1]
                try:
                    data["device_temp_" + gpuIdx] = int(coreTemp)
                    self.debug('Using legacy temp for GPU {0}: {1}'.format(gpuIdx, coreTemp))
                except Exception as e:
                    self.debug(str(e), "skipping device_temp_" + gpuIdx)
            if data["device_mem_used_" + gpuIdx] is None:
                memUsed = findall(r'UsedDedicatedGPUMemory.*?(gpu:\d*).*?\s(\d*)', output)[i][1]
                try:
                    data["device_mem_used_" + gpuIdx] = int(memUsed)
                    self.debug('Using legacy mem_used for GPU {0}: {1}'.format(gpuIdx, memUsed))
                except Exception as e:
                    self.debug(str(e), "skipping device_mem_used_" + gpuIdx)
            if data["device_load_gpu_" + gpuIdx] is None:
                gpu_util = findall(r'(gpu:\d*).*?graphics=(\d*),.*?memory=(\d*)', output)[i][1]
                try:
                    data["device_load_gpu_" + gpuIdx] = int(gpu_util)
                    self.debug('Using legacy load_gpu for GPU {0}: {1}'.format(gpuIdx, gpu_util))
                except Exception as e:
                    self.debug(str(e), "skipping device_load_gpu_" + gpuIdx)
            if data["device_load_mem_" + gpuIdx] is None:
                mem_util = findall(r'(gpu:\d*).*?graphics=(\d*),.*?memory=(\d*)', output)[i][2]
                try:
                    data["device_load_mem_" + gpuIdx] = int(mem_util)
                    self.debug('Using legacy load_mem for GPU {0}: {1}'.format(gpuIdx, mem_util))
                except Exception as e:
                    self.debug(str(e), "skipping device_load_mem_" + gpuIdx)
            if data["device_core_clock_" + gpuIdx] is None:
                clock_core = findall(r'GPUCurrentClockFreqs.*?(gpu:\d*).*?(\d*),(\d*)', output)[i][1]
                try:
                    data["device_core_clock_" + gpuIdx] = int(clock_core)
                    self.debug('Using legacy core_clock for GPU {0}: {1}'.format(gpuIdx, clock_core))
                except Exception as e:
                    self.debug(str(e), "skipping device_core_clock_" + gpuIdx)
            if data["device_mem_clock_" + gpuIdx] is None:
                clock_mem = findall(r'GPUCurrentClockFreqs.*?(gpu:\d*).*?(\d*),(\d*)', output)[i][2]
                try:
                    data["device_mem_clock_" + gpuIdx] = int(clock_mem)
                    self.debug('Using legacy mem_clock for GPU {0}: {1}'.format(gpuIdx, clock_mem))
                except Exception as e:
                    self.debug(str(e), "skipping device_mem_clock_" + gpuIdx)

    return data
def get_gpu_info(handle):
    """Get one GPU information specified by nvml handle"""

    def get_last_used(index):
        last_useds = []
        if not os.path.exists('gpu_history.pkl'):
            pickle.dump({}, open('gpu_history.pkl', 'wb'))
        with open('gpu_history.pkl', 'rb') as f:
            history = pickle.load(f)
            if platform.node() in history:
                for user, last_used in history[
                        platform.node()][index].items():
                    # 1 day = 24 hours, 1 hour = 3600 seconds
                    used_before = (datetime.now() - last_used['last_used']).days * 24 + \
                        (datetime.now() - last_used['last_used']).seconds / 3600
                    last_useds.append((user, used_before))
                return last_useds
            else:
                return []

    def get_process_info(nv_process):
        """Get the process information of specific pid"""
        process = {}
        if nv_process.pid not in GPUStatCollection.global_processes:
            GPUStatCollection.global_processes[nv_process.pid] = \
                psutil.Process(pid=nv_process.pid)
        ps_process = GPUStatCollection.global_processes[nv_process.pid]
        process['username'] = ps_process.username()
        # cmdline returns full path;
        # as in `ps -o comm`, get short cmdnames.
        _cmdline = ps_process.cmdline()
        if not _cmdline:
            # sometimes, zombie or unknown (e.g. [kworker/8:2H])
            process['command'] = '?'
            process['full_command'] = ['?']
        else:
            process['command'] = os.path.basename(_cmdline[0])
            process['full_command'] = _cmdline
        # Bytes to MBytes
        process['gpu_memory_usage'] = nv_process.usedGpuMemory // MB
        process['cpu_percent'] = ps_process.cpu_percent()
        process['cpu_memory_usage'] = \
            round((ps_process.memory_percent() / 100.0) *
                  psutil.virtual_memory().total)
        process['pid'] = nv_process.pid
        return process

    name = _decode(N.nvmlDeviceGetName(handle))
    uuid = _decode(N.nvmlDeviceGetUUID(handle))

    try:
        temperature = N.nvmlDeviceGetTemperature(
            handle, N.NVML_TEMPERATURE_GPU)
    except N.NVMLError:
        temperature = None  # Not supported

    try:
        fan_speed = N.nvmlDeviceGetFanSpeed(handle)
    except N.NVMLError:
        fan_speed = None  # Not supported

    try:
        memory = N.nvmlDeviceGetMemoryInfo(handle)  # in Bytes
    except N.NVMLError:
        memory = None  # Not supported

    try:
        utilization = N.nvmlDeviceGetUtilizationRates(handle)
    except N.NVMLError:
        utilization = None  # Not supported

    try:
        power = N.nvmlDeviceGetPowerUsage(handle)
    except N.NVMLError:
        power = None

    try:
        power_limit = N.nvmlDeviceGetEnforcedPowerLimit(handle)
    except N.NVMLError:
        power_limit = None

    try:
        nv_comp_processes = \
            N.nvmlDeviceGetComputeRunningProcesses(handle)
    except N.NVMLError:
        nv_comp_processes = None  # Not supported
    try:
        nv_graphics_processes = \
            N.nvmlDeviceGetGraphicsRunningProcesses(handle)
    except N.NVMLError:
        nv_graphics_processes = None  # Not supported

    if nv_comp_processes is None and nv_graphics_processes is None:
        processes = None
    else:
        processes = []
        nv_comp_processes = nv_comp_processes or []
        nv_graphics_processes = nv_graphics_processes or []
        for nv_process in nv_comp_processes + nv_graphics_processes:
            try:
                process = get_process_info(nv_process)
                processes.append(process)
            except psutil.NoSuchProcess:
                # TODO: add some reminder for NVML broken context
                # e.g. nvidia-smi reset or reboot the system
                pass

        # TODO: Do not block if full process info is not requested
        time.sleep(0.1)
        for process in processes:
            pid = process['pid']
            cache_process = GPUStatCollection.global_processes[pid]
            process['cpu_percent'] = cache_process.cpu_percent()

    index = N.nvmlDeviceGetIndex(handle)
    last_used = get_last_used(index)

    gpu_info = {
        'index': index,
        'uuid': uuid,
        'name': name,
        'temperature.gpu': temperature,
        'fan.speed': fan_speed,
        'utilization.gpu': utilization.gpu if utilization else None,
        'power.draw': power // 1000 if power is not None else None,
        'enforced.power.limit':
            power_limit // 1000 if power_limit is not None else None,
        # Convert bytes into MBytes
        'memory.used': memory.used // MB if memory else None,
        'memory.total': memory.total // MB if memory else None,
        'processes': processes,
        'last_used': last_used,
    }
    GPUStatCollection.clean_processes()
    return gpu_info
def getName(self):
    r"""Get object name"""
    return pynvml.nvmlDeviceGetName(self.handle)
def setup(self):
    class TimeOutException(Exception):
        pass

    def alarm_handler(signum, frame):
        raise TimeOutException()

    self.data["root"] = os.getcwd()
    program = os.getenv(env.PROGRAM) or util.get_program()
    if program:
        self.data["program"] = program
    else:
        self.data["program"] = '<python with no main file>'
    if wandb._get_python_type() != "python":
        if os.getenv(env.NOTEBOOK_NAME):
            self.data["program"] = os.getenv(env.NOTEBOOK_NAME)
        else:
            meta = wandb.jupyter.notebook_metadata()
            if meta.get("path"):
                if "fileId=" in meta["path"]:
                    self.data[
                        "colab"] = "https://colab.research.google.com/drive/" + meta[
                            "path"].split("fileId=")[1]
                    self.data["program"] = meta["name"]
                else:
                    self.data["program"] = meta["path"]
                    self.data["root"] = meta["root"]

    if not os.getenv(env.DISABLE_CODE):
        logger.debug("code probe starting")
        in_jupyter = wandb._get_python_type() != "python"
        # windows doesn't support alarm() and jupyter could call this in a thread context
        if platform.system() == "Windows" or not hasattr(
                signal, 'SIGALRM') or in_jupyter:
            logger.debug("non time limited probe of code")
            self._setup_code_git()
            self._setup_code_program()
        else:
            old_alarm = None
            try:
                try:
                    old_alarm = signal.signal(signal.SIGALRM, alarm_handler)
                    signal.alarm(25)
                    self._setup_code_git()
                    self._setup_code_program()
                finally:
                    signal.alarm(0)
            except TimeOutException:
                logger.debug("timeout waiting for setup_code")
            finally:
                if old_alarm:
                    signal.signal(signal.SIGALRM, old_alarm)
        logger.debug("code probe done")

    self.data["startedAt"] = datetime.utcfromtimestamp(
        wandb.START_TIME).isoformat()
    try:
        username = getpass.getuser()
    except KeyError:
        # getuser() could raise KeyError in restricted environments like
        # chroot jails or docker containers. Return user id in these cases.
        username = str(os.getuid())

    # Host names, usernames, emails, the root directory, and executable
    # paths are sensitive for anonymous users.
    if self._api.settings().get('anonymous') != 'true':
        self.data["host"] = os.environ.get(env.HOST, socket.gethostname())
        self.data["username"] = os.getenv(env.USERNAME, username)
        self.data["executable"] = sys.executable
    else:
        self.data.pop("email", None)
        self.data.pop("root", None)

    self.data["os"] = platform.platform(aliased=True)
    self.data["python"] = platform.python_version()

    if env.get_docker():
        self.data["docker"] = env.get_docker()
    try:
        pynvml.nvmlInit()
        self.data["gpu"] = pynvml.nvmlDeviceGetName(
            pynvml.nvmlDeviceGetHandleByIndex(0)).decode("utf8")
        self.data["gpu_count"] = pynvml.nvmlDeviceGetCount()
    except pynvml.NVMLError:
        pass
    try:
        self.data["cpu_count"] = multiprocessing.cpu_count()
    except NotImplementedError:
        pass
    # TODO: we should use the cuda library to collect this
    if os.path.exists("/usr/local/cuda/version.txt"):
        with open("/usr/local/cuda/version.txt") as f:
            self.data["cuda"] = f.read().split(" ")[-1].strip()
    self.data["args"] = sys.argv[1:]
    self.data["state"] = "running"
def check(self, instance):
    pynvml.nvmlInit()

    msg_list = []
    gpus_in_use = 0
    try:
        deviceCount = pynvml.nvmlDeviceGetCount()
    except:
        deviceCount = 0
    for device_id in xrange(deviceCount):
        handle = pynvml.nvmlDeviceGetHandleByIndex(device_id)
        name = pynvml.nvmlDeviceGetName(handle)
        tags = dict(name="{}-{}".format(name, device_id))
        d_tags = self._dict2list(tags)
        # temperature info
        try:
            temp = pynvml.nvmlDeviceGetTemperature(handle, pynvml.NVML_TEMPERATURE_GPU)
            self.gauge('nvml.temp', temp, tags=d_tags)
        except pynvml.NVMLError as err:
            msg_list.append(u'nvmlDeviceGetTemperature:{}'.format(err))
        # memory info
        try:
            mem = pynvml.nvmlDeviceGetMemoryInfo(handle)
            self.gauge('nvml.mem.total', mem.total, tags=d_tags)
            self.gauge('nvml.mem.used', mem.used, tags=d_tags)
            self.gauge('nvml.mem.free', mem.free, tags=d_tags)
        except pynvml.NVMLError as err:
            msg_list.append(u'nvmlDeviceGetMemoryInfo:{}'.format(err))
        # utilization GPU/Memory info
        try:
            util_rate = pynvml.nvmlDeviceGetUtilizationRates(handle)
            self.gauge('nvml.util.gpu', util_rate.gpu, tags=d_tags)
            self.gauge('nvml.util.memory', util_rate.memory, tags=d_tags)
            gpus_in_use += 1 if util_rate.memory > 50.0 else 0
        except pynvml.NVMLError as err:
            msg_list.append(u'nvmlDeviceGetUtilizationRates:{}'.format(err))
        # utilization Encoder info
        try:
            util_encoder = pynvml.nvmlDeviceGetEncoderUtilization(handle)
            self.log.debug('nvml.util.encoder %s' % long(util_encoder[0]))
            self.gauge('nvml.util.encoder', long(util_encoder[0]), tags=d_tags)
        except pynvml.NVMLError as err:
            msg_list.append(u'nvmlDeviceGetEncoderUtilization:{}'.format(err))
        # utilization Decoder info
        try:
            util_decoder = pynvml.nvmlDeviceGetDecoderUtilization(handle)
            self.log.debug('nvml.util.decoder %s' % long(util_decoder[0]))
            self.gauge('nvml.util.decoder', long(util_decoder[0]), tags=d_tags)
        except pynvml.NVMLError as err:
            msg_list.append(u'nvmlDeviceGetDecoderUtilization:{}'.format(err))
        # Compute running processes
        try:
            cps = pynvml.nvmlDeviceGetComputeRunningProcesses(handle)
            self.gauge('nvml.process.count', len(cps), d_tags)
            for ps in cps:
                p_tags = tags.copy()
                p_tags['pid'] = ps.pid
                p_tags['pname'] = pynvml.nvmlSystemGetProcessName(ps.pid)
                p_tags['puser'] = self.get_process_owner(ps.pid)
                docker_name, docker_image = self.get_container_name(ps.pid)
                p_tags['docker_image'] = docker_image
                p_tags['docker_name'] = docker_name
                p_tags = self._dict2list(p_tags)
                print p_tags
                self.gauge('nvml.process.used_gpu_memory', ps.usedGpuMemory, tags=p_tags)
        except pynvml.NVMLError as err:
            msg_list.append(u'nvmlDeviceGetComputeRunningProcesses:{}'.format(err))

    self.gauge('nvml.gpus_in_use_count', gpus_in_use)

    if msg_list:
        status = AgentCheck.CRITICAL
        msg = u','.join(msg_list)
    else:
        status = AgentCheck.OK
        msg = u'Ok'
    pynvml.nvmlShutdown()

    self.service_check('nvml.check', status, message=msg)
def init(self):
    self.util_history = []
    self.temp_history = []
    pynvml.nvmlInit()
    self.gpu_handles = []
    self.deviceCount = pynvml.nvmlDeviceGetCount()
    for i in range(self.deviceCount):
        self.gpu_handles.append(pynvml.nvmlDeviceGetHandleByIndex(i))

    self.cpu_box = Gtk.Box(orientation=Gtk.Orientation.VERTICAL, spacing=6)
    self.cpu_prog_bars = []
    self.gpu_boxes = []
    self.gpu_prog_bars = []
    self.prev_idle = []
    self.prev_total = []
    self.idle = []
    self.total = []

    #---cpu_box---
    try:
        stat = open("/proc/stat")
        statlines = stat.read().splitlines()
        stat.close()
        self.corecount = -1
        for line in statlines:
            if (line[0:2] == "cp"):
                self.corecount += 1
            else:
                break
    except IOError:
        print("Problem opening /proc/stat, exiting..")
        pynvml.nvmlShutdown()
        quit()

    for i in range(self.corecount):
        self.cpu_prog_bars.append(Gtk.ProgressBar(text="CPU %d" % i, show_text=True))
        self.cpu_box.pack_start(self.cpu_prog_bars[i], True, True, 0)
        self.prev_idle.append(0)
        self.prev_total.append(0)
        self.idle.append(0)
        self.total.append(0)

    #---gpu_boxes---
    for i in range(self.deviceCount):
        product_name = pynvml.nvmlDeviceGetName(self.gpu_handles[i])
        product_name = product_name.decode('utf-8')
        gpu_box = Gtk.Box(orientation=Gtk.Orientation.VERTICAL, spacing=8)
        label = Gtk.Label(product_name)
        self.gpu_prog_bars.append(Gtk.ProgressBar(text="GPU", show_text=True))
        self.gpu_prog_bars.append(Gtk.ProgressBar(text="Memory Utilization", show_text=True))
        self.gpu_prog_bars.append(Gtk.ProgressBar(text="Memory Usage", show_text=True))
        self.gpu_prog_bars.append(Gtk.ProgressBar(text="Temperature", show_text=True))
        self.gpu_prog_bars.append(Gtk.ProgressBar(text="Encoder", show_text=True))
        self.gpu_prog_bars.append(Gtk.ProgressBar(text="Decoder", show_text=True))
        gpu_box.pack_start(label, True, True, 0)
        gpu_box.pack_start(self.gpu_prog_bars[i*6], True, True, 0)
        gpu_box.pack_start(self.gpu_prog_bars[i*6 + 1], True, True, 0)
        gpu_box.pack_start(self.gpu_prog_bars[i*6 + 2], True, True, 0)
        gpu_box.pack_start(self.gpu_prog_bars[i*6 + 3], True, True, 0)
        gpu_box.pack_start(self.gpu_prog_bars[i*6 + 4], True, True, 0)
        gpu_box.pack_start(self.gpu_prog_bars[i*6 + 5], True, True, 0)
        self.gpu_boxes.append(gpu_box)

    #---proc---
    proc_liststore = Gtk.ListStore(int, str, int)
    self.tree = Gtk.TreeView(model=proc_liststore)
    renderer_pid = Gtk.CellRendererText()
    column_pid = Gtk.TreeViewColumn("Process ID", renderer_pid, text=0)
    column_pid.set_resizable(True)
    self.tree.append_column(column_pid)
    renderer_path = Gtk.CellRendererText()
    column_path = Gtk.TreeViewColumn("Command Line", renderer_path, text=1)
    column_path.set_resizable(True)
    column_path.set_fixed_width(250)
    self.tree.append_column(column_path)
    renderer_mem = Gtk.CellRendererText()
    column_mem = Gtk.TreeViewColumn("Memory (MiB)", renderer_mem, text=2)
    column_mem.set_resizable(True)
    self.tree.append_column(column_mem)
def step(self):
    valuesDict = {}
    valuesDict['table'] = self._tableName
    cpu = valuesDict['cpu'] = psutil.cpu_percent(interval=0)
    mem = valuesDict['mem'] = psutil.virtual_memory().percent
    swap = valuesDict['swap'] = psutil.swap_memory().percent
    # some code examples:
    # https://github.com/ngi644/datadog_nvml/blob/master/nvml.py
    if self.doGpu:
        for i in self.gpusToUse:
            try:
                handle = nvmlDeviceGetHandleByIndex(i)
                memInfo = nvmlDeviceGetMemoryInfo(handle)
                valuesDict["gpuMem_%d" % i] = \
                    float(memInfo.used) * 100. / float(memInfo.total)
                util = nvmlDeviceGetUtilizationRates(handle)
                valuesDict["gpuUse_%d" % i] = util.gpu
                temp = nvmlDeviceGetTemperature(handle, NVML_TEMPERATURE_GPU)
                valuesDict["gpuTem_%d" % i] = temp
            except NVMLError as err:
                handle = nvmlDeviceGetHandleByIndex(i)
                msg = "Device %d -> %s not supported\n" \
                      "Remove device %d from FORM" % \
                      (i, nvmlDeviceGetName(handle), i)
                errorWindow(None, msg)

    if self.doNetwork:
        try:
            # measure a short interval
            pnic_before = psutil.net_io_counters(pernic=True)[self.nif]
            time.sleep(self.samplingTime)  # sec
            pnic_after = psutil.net_io_counters(pernic=True)[self.nif]
            bytes_sent = pnic_after.bytes_sent - pnic_before.bytes_sent
            bytes_recv = pnic_after.bytes_recv - pnic_before.bytes_recv
            valuesDict["%s_send" % self.nif] = \
                bytes_sent * self.samplingTime / 1048576
            valuesDict["%s_recv" % self.nif] = \
                bytes_recv * self.samplingTime / 1048576
        except:
            msg = "cannot get information of network interface %s" % \
                  self.nif

    if self.doDiskIO:
        try:
            # measure a short interval
            disk_before = psutil.disk_io_counters(perdisk=False)
            time.sleep(self.samplingTime)  # sec
            disk_after = psutil.disk_io_counters(perdisk=False)
            bytes_read = disk_after.read_bytes - disk_before.read_bytes
            bytes_write = disk_after.write_bytes - disk_before.write_bytes
            valuesDict["disk_read"] = \
                self.samplingTime * bytes_read / self.mega
            valuesDict["disk_write"] = \
                self.samplingTime * bytes_write / self.mega
        except:
            msg = "cannot get information of disk usage "

    if self.cpuAlert < 100 and cpu > self.cpuAlert:
        self.warning("CPU allocation =%f." % cpu)
        self.cpuAlert = cpu

    # virtual_memory().percent and swap_memory().percent are already
    # plain floats, so compare and report them directly
    if self.memAlert < 100 and mem > self.memAlert:
        self.warning("Memory allocation =%f." % mem)
        self.memAlert = mem

    if self.swapAlert < 100 and swap > self.swapAlert:
        self.warning("SWAP allocation =%f." % swap)
        self.swapAlert = swap

    sqlCommand = "INSERT INTO %(table)s ("
    for label in self.labelList:
        sqlCommand += "%s, " % label
    # remove last comma
    sqlCommand = sqlCommand[:-2]
    sqlCommand += ") VALUES("
    for label in self.labelList:
        sqlCommand += "%" + "(%s)f, " % label
    # remove last comma
    sqlCommand = sqlCommand[:-2]
    sqlCommand += ");"

    sql = sqlCommand % valuesDict

    try:
        self.cur.execute(sql)
    except Exception as e:
        print("ERROR: saving one data point (monitor). I continue")

    # Return finished = True if all protocols have finished
    finished = []
    for prot in self.protocols:
        updatedProt = getUpdatedProtocol(prot)
        finished.append(updatedProt.getStatus() != STATUS_RUNNING)

    return all(finished)
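For reference, with a hypothetical labelList = ['cpu', 'mem', 'swap'] and table name 'monitor', the string assembly above yields a template and final statement like the following (illustrative values only):

# template after the two loops:
#   INSERT INTO %(table)s (cpu, mem, swap) VALUES(%(cpu)f, %(mem)f, %(swap)f);
# after "sqlCommand % valuesDict":
#   INSERT INTO monitor (cpu, mem, swap) VALUES(12.300000, 45.600000, 0.000000);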
def get_gpu_info(handle):
    """Get one GPU information specified by nvml handle"""

    def get_process_info(nv_process):
        """Get the process information of specific pid"""
        process = {}
        ps_process = psutil.Process(pid=nv_process.pid)
        process['username'] = ps_process.username()
        # cmdline returns full path;
        # as in `ps -o comm`, get short cmdnames.
        _cmdline = ps_process.cmdline()
        if not _cmdline:
            # sometimes, zombie or unknown (e.g. [kworker/8:2H])
            process['command'] = '?'
        else:
            process['command'] = os.path.basename(_cmdline[0])
        # Bytes to MBytes
        process['gpu_memory_usage'] = nv_process.usedGpuMemory // MB
        process['pid'] = nv_process.pid
        return process

    name = _decode(N.nvmlDeviceGetName(handle))
    uuid = _decode(N.nvmlDeviceGetUUID(handle))

    try:
        temperature = N.nvmlDeviceGetTemperature(
            handle, N.NVML_TEMPERATURE_GPU
        )
    except N.NVMLError:
        temperature = None  # Not supported

    try:
        memory = N.nvmlDeviceGetMemoryInfo(handle)  # in Bytes
    except N.NVMLError:
        memory = None  # Not supported

    try:
        utilization = N.nvmlDeviceGetUtilizationRates(handle)
    except N.NVMLError:
        utilization = None  # Not supported

    try:
        power = N.nvmlDeviceGetPowerUsage(handle)
    except N.NVMLError:
        power = None

    try:
        power_limit = N.nvmlDeviceGetEnforcedPowerLimit(handle)
    except N.NVMLError:
        power_limit = None

    try:
        nv_comp_processes = \
            N.nvmlDeviceGetComputeRunningProcesses(handle)
    except N.NVMLError:
        nv_comp_processes = None  # Not supported
    try:
        nv_graphics_processes = \
            N.nvmlDeviceGetGraphicsRunningProcesses(handle)
    except N.NVMLError:
        nv_graphics_processes = None  # Not supported

    if nv_comp_processes is None and nv_graphics_processes is None:
        processes = None
    else:
        processes = []
        nv_comp_processes = nv_comp_processes or []
        nv_graphics_processes = nv_graphics_processes or []
        for nv_process in nv_comp_processes + nv_graphics_processes:
            # TODO: could be more information such as system memory
            # usage, CPU percentage, create time etc.
            try:
                process = get_process_info(nv_process)
                processes.append(process)
            except psutil.NoSuchProcess:
                # TODO: add some reminder for NVML broken context
                # e.g. nvidia-smi reset or reboot the system
                pass

    index = N.nvmlDeviceGetIndex(handle)
    gpu_info = {
        'index': index,
        'uuid': uuid,
        'name': name,
        'temperature.gpu': temperature,
        'utilization.gpu': utilization.gpu if utilization else None,
        'power.draw': power // 1000 if power is not None else None,
        'enforced.power.limit':
            power_limit // 1000 if power_limit is not None else None,
        # Convert bytes into MBytes
        'memory.used': memory.used // MB if memory else None,
        'memory.total': memory.total // MB if memory else None,
        'processes': processes,
    }
    return gpu_info
def __init__(self,
             model: TransformerMT,
             corpus: Corpus,
             optimizer: torch.optim.Optimizer,
             stats: Stats,
             bleu: BLEU,
             tgt_character_level: bool,
             buffer_every_steps: int,
             report_every_steps: int,
             eval_every_steps: int,
             num_of_steps: int,
             eval_type: str,
             processed_steps: int,
             learning_rate_schedule: str,
             update_decay: int,
             batch_capacity: int,
             max_save_models: int,
             grad_norm_clip: float,
             grad_norm_clip_type: float,
             annotate: str,
             device_idxs: [int],
             gpu_memory_limit: float,
             ):
    self.model = model
    self.corpus = corpus
    self.optimizer = optimizer
    self.stats = stats
    self.bleu = bleu
    self.tgt_character_level = tgt_character_level
    self.buffer_every_steps = buffer_every_steps
    self.report_every_steps = report_every_steps
    self.eval_every_steps = eval_every_steps
    self.num_of_steps = num_of_steps
    self.eval_type = eval_type
    self.processed_steps = processed_steps
    self.update_decay = update_decay
    self.batch_capacity = batch_capacity
    self.src_pad_idx = self.model.src_pad_idx
    self.tgt_eos_idx = self.model.tgt_eos_idx
    self.tgt_pad_idx = self.model.tgt_pad_idx
    self.max_save_models = max_save_models
    self.grad_norm_clip = grad_norm_clip if grad_norm_clip > 0.0 else None
    self.grad_norm_clip_type = grad_norm_clip_type
    self.annotate = annotate
    self.device_idxs = device_idxs
    self.num_of_devices = len(self.device_idxs)
    self.gpu_memory_limit = gpu_memory_limit

    self.best_acc = 0.0
    self.best_loss = float('inf')
    self.best_bleu = 0.0
    self.best_step = 0

    self.lr_schedule = eval(learning_rate_schedule)
    self.lr = 0.005

    self.backward_factor = list()

    self.loss_report = numpy.zeros(self.report_every_steps, dtype=float)
    self.acc_report = numpy.zeros(self.report_every_steps, dtype=float)
    self.update_decay_steps = numpy.zeros(self.report_every_steps, dtype=int)
    self.src_tokens = numpy.zeros(self.report_every_steps, dtype=int)
    self.tgt_tokens = numpy.zeros(self.report_every_steps, dtype=int)
    self.src_num_pad_tokens = numpy.zeros(self.report_every_steps, dtype=int)
    self.tgt_num_pad_tokens = numpy.zeros(self.report_every_steps, dtype=int)
    self.num_examples = numpy.zeros(self.report_every_steps, dtype=int)
    self.time_sum = 0.0
    self.memory_unit = float(2 ** 30)

    # for uncertainty estimation
    self.esti_variance_every_steps = 1000
    self.tolerance = 4

    nvmlInit()
    print('Driver version: %s' % nvmlSystemGetDriverVersion().decode('utf-8'))
    device_true_idxs = list(int(x) for x in os.environ['CUDA_VISIBLE_DEVICES'].split(','))
    self.gpu_info_handler = list(nvmlDeviceGetHandleByIndex(x) for x in device_true_idxs)
    for idx, handler in enumerate(self.gpu_info_handler):
        print('Device no.%d, true idx: %d' % (idx, device_true_idxs[idx]))
        print('\tGPU Name: %s' % nvmlDeviceGetName(handler).decode('utf-8'))

    self.queue = Queue(maxsize=self.num_of_devices)
    self.replicas = list()

    self.async_update_rules = list()
    device_idxs_rules = self.device_idxs.copy()
    while len(device_idxs_rules) > 1:
        rules = dict()
        for i in range(1, len(device_idxs_rules), 2):
            rules[device_idxs_rules[i]] = device_idxs_rules[i - 1]
        device_idxs_rules = device_idxs_rules[::2]
        self.async_update_rules.append(rules)
    return
def detect_devices(self) -> None:
    self.devices_count = nvmlDeviceGetCount()
    for i in range(self.devices_count):
        handle = nvmlDeviceGetHandleByIndex(i)
        device_name = nvmlDeviceGetName(handle).decode("UTF-8")
        self.logger.info(f"Device nr. {i}: '{device_name}'")
""" show_str_tot_lst = [] drv_ver = pml.nvmlSystemGetDriverVersion() show_str_tot_lst.append('Driver Version: ' + bytes.decode(drv_ver)) show_str_tot_lst.append('{:<4}{:12}{:<13}{:6}{:6}{:8}{:12}{:8}{:<10}'.format( 'id', 'type', 'video memory', 'temp.', 'util.', 'pid', 'process', 'users', 'MemUsed')) for i in range(deviceCount): handle = pml.nvmlDeviceGetHandleByIndex(i) show_str_lst = [] show_str_lst.append(str(i) + ' ') # 获取显卡全名 card_name = pml.nvmlDeviceGetName(handle) card_name = bytes.decode(card_name) card_name = ''.join(card_name.split(' ')[1:]) show_str_lst.append(card_name) # 显存使用情况 mem_info = pml.nvmlDeviceGetMemoryInfo(handle) mem_total = '{:6}'.format(mem_info.total // mega) + 'M' mem_free = '{:6}'.format(mem_info.free // mega) + 'M' mem_used = '{:<6}'.format(str(mem_info.used // mega) + 'M') show_str_lst.append(' ' + mem_used + '/' + mem_total) # 温度 card_temp = ' ' + str( pml.nvmlDeviceGetTemperature(handle, pml.NVML_TEMPERATURE_GPU)) + 'C' show_str_lst.append(card_temp)
pip install nvidia-ml-py2
# python3
pip install nvidia-ml-py3

import pynvml

pynvml.nvmlInit()
print('Display driver info: ')
print("Driver: ", pynvml.nvmlSystemGetDriverVersion())
print('--------------')
print('Device info: ')
deviceCount = pynvml.nvmlDeviceGetCount()
print('  Found %s GPU(s), named:' % deviceCount)
for i in range(deviceCount):
    handle = pynvml.nvmlDeviceGetHandleByIndex(i)
    print("  GPU", i, ":", pynvml.nvmlDeviceGetName(handle))
print('--------------')
for i in range(deviceCount):
    print('Memory, temperature, fan and power of GPU %s: ' % i)
    handle = pynvml.nvmlDeviceGetHandleByIndex(i)
    info = pynvml.nvmlDeviceGetMemoryInfo(handle)
    print("Memory Total: %0.2f G" % (info.total / 1024 / 1024 / 1024))  # total memory size
    print("Memory Free: %0.2f G " % (info.free / 1024 / 1024 / 1024))  # free memory size
    print("Memory Used: %0.2f G " % (info.used / 1024 / 1024 / 1024))
    print("Memory Used percent: %0.2f %% " % (info.used / info.total * 100))
    print("Temperature is %d C" % (pynvml.nvmlDeviceGetTemperature(handle, 0)))
    print("Fan speed is ", pynvml.nvmlDeviceGetFanSpeed(handle))
    print("Power status", pynvml.nvmlDeviceGetPowerState(handle))
    print('--------------')

# Finally, shut down the management library
pynvml.nvmlShutdown()
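Not every query above is supported on every board; nvmlDeviceGetFanSpeed, for instance, raises NVMLError on fanless or passively cooled GPUs. A self-contained defensive sketch of the per-device loop, mirroring the try/except pattern used in other snippets in this collection:

import pynvml

pynvml.nvmlInit()
for i in range(pynvml.nvmlDeviceGetCount()):
    handle = pynvml.nvmlDeviceGetHandleByIndex(i)
    try:
        fan = pynvml.nvmlDeviceGetFanSpeed(handle)
    except pynvml.NVMLError:
        fan = None  # no fan, or query not supported on this board
    print("GPU %d fan speed: %s" % (i, fan))
pynvml.nvmlShutdown()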
def get_gpu_info(handle):
    """Get one GPU information specified by nvml handle"""

    def get_process_info(nv_process):
        """Get the process information of specific pid"""
        process = {}
        if nv_process.pid not in GPUStatCollection.global_processes:
            GPUStatCollection.global_processes[nv_process.pid] = \
                psutil.Process(pid=nv_process.pid)
        ps_process = GPUStatCollection.global_processes[nv_process.pid]
        process['username'] = ps_process.username()
        # cmdline returns full path;
        # as in `ps -o comm`, get short cmdnames.
        _cmdline = ps_process.cmdline()
        if not _cmdline:
            # sometimes, zombie or unknown (e.g. [kworker/8:2H])
            process['command'] = '?'
            process['full_command'] = ['?']
        else:
            process['command'] = os.path.basename(_cmdline[0])
            process['full_command'] = _cmdline
        # Bytes to MBytes
        # if drivers are not TTC this will be None.
        usedmem = nv_process.usedGpuMemory // MB if \
            nv_process.usedGpuMemory else None
        process['gpu_memory_usage'] = usedmem
        process['cpu_percent'] = ps_process.cpu_percent()
        process['cpu_memory_usage'] = \
            round((ps_process.memory_percent() / 100.0) *
                  psutil.virtual_memory().total)
        process['pid'] = nv_process.pid
        return process

    name = _decode(N.nvmlDeviceGetName(handle))
    uuid = _decode(N.nvmlDeviceGetUUID(handle))

    try:
        temperature = N.nvmlDeviceGetTemperature(
            handle, N.NVML_TEMPERATURE_GPU)
    except N.NVMLError:
        temperature = None  # Not supported

    try:
        fan_speed = N.nvmlDeviceGetFanSpeed(handle)
    except N.NVMLError:
        fan_speed = None  # Not supported

    try:
        memory = N.nvmlDeviceGetMemoryInfo(handle)  # in Bytes
    except N.NVMLError:
        memory = None  # Not supported

    try:
        utilization = N.nvmlDeviceGetUtilizationRates(handle)
    except N.NVMLError:
        utilization = None  # Not supported

    try:
        power = N.nvmlDeviceGetPowerUsage(handle)
    except N.NVMLError:
        power = None

    try:
        power_limit = N.nvmlDeviceGetEnforcedPowerLimit(handle)
    except N.NVMLError:
        power_limit = None

    try:
        nv_comp_processes = \
            N.nvmlDeviceGetComputeRunningProcesses(handle)
    except N.NVMLError:
        nv_comp_processes = None  # Not supported
    try:
        nv_graphics_processes = \
            N.nvmlDeviceGetGraphicsRunningProcesses(handle)
    except N.NVMLError:
        nv_graphics_processes = None  # Not supported

    if nv_comp_processes is None and nv_graphics_processes is None:
        processes = None
    else:
        processes = []
        nv_comp_processes = nv_comp_processes or []
        nv_graphics_processes = nv_graphics_processes or []
        for nv_process in nv_comp_processes + nv_graphics_processes:
            try:
                process = get_process_info(nv_process)
                processes.append(process)
            except psutil.NoSuchProcess:
                # TODO: add some reminder for NVML broken context
                # e.g. nvidia-smi reset or reboot the system
                pass

        # TODO: Do not block if full process info is not requested
        time.sleep(0.1)
        for process in processes:
            pid = process['pid']
            cache_process = GPUStatCollection.global_processes[pid]
            process['cpu_percent'] = cache_process.cpu_percent()

    index = N.nvmlDeviceGetIndex(handle)
    gpu_info = {
        'index': index,
        'uuid': uuid,
        'name': name,
        'temperature.gpu': temperature,
        'fan.speed': fan_speed,
        'utilization.gpu': utilization.gpu if utilization else None,
        'power.draw': power // 1000 if power is not None else None,
        'enforced.power.limit':
            power_limit // 1000 if power_limit is not None else None,
        # Convert bytes into MBytes
        'memory.used': memory.used // MB if memory else None,
        'memory.total': memory.total // MB if memory else None,
        'processes': processes,
    }
    GPUStatCollection.clean_processes()
    return gpu_info
from pynvml import (
    nvmlInit,
    nvmlShutdown,
    nvmlSystemGetDriverVersion,
    nvmlDeviceGetCount,
    nvmlDeviceGetHandleByIndex,
    nvmlDeviceGetName,
    nvmlDeviceGetMemoryInfo,
    nvmlDeviceGetUtilizationRates,
)

nvmlInit()
print("Driver Version: %s" % nvmlSystemGetDriverVersion())
deviceCount = nvmlDeviceGetCount()
for i in range(deviceCount):
    handle = nvmlDeviceGetHandleByIndex(i)
    print("Device %s: %s" % (i, nvmlDeviceGetName(handle)))
    memory_info = nvmlDeviceGetMemoryInfo(handle)
    print("Device %s: Total memory: %s" % (i, memory_info.total / 1024 / 1024))
    print("Device %s: Free memory: %s" % (i, memory_info.free / 1024 / 1024))
    print("Device %s: Used memory: %s" % (i, memory_info.used / 1024 / 1024))
    util = nvmlDeviceGetUtilizationRates(handle)
    print("Device %s: GPU Utilization: %s%%" % (i, util.gpu))
    print("Device %s: Memory Utilization: %s%%" % (i, util.memory))
nvmlShutdown()
def _get_gpu_type(gpu_device):
    return nvmlDeviceGetName(gpu_device)
def do_GET(self):
    # checks if the server is alive
    if self.path == '/test':
        send_header(self)
        self.wfile.write(bytes('passed<br>', 'utf-8'))
        self.wfile.write(bytes('server is responding', 'utf-8'))
    # returns the running processes
    if self.path == '/runningProcesses':
        send_header(self)
        # send response:
        if modules['psutil']:
            for proc in psutil.process_iter():
                try:
                    pinfo = proc.as_dict(attrs=['pid', 'name'])
                except psutil.NoSuchProcess:
                    pass
                print(pinfo)
                self.wfile.write(bytes(str(pinfo), 'utf-8'))
        else:
            self.wfile.write(bytes('I am sorry but the Python module psutil is not installed. Therefore the running processes cannot be shown.', 'utf-8'))
    # returns the CPU utilization and number of cores
    elif self.path == '/cpuInfo':
        send_header(self)
        # get CPU info
        cpuInfo = {}
        if modules['psutil']:
            cpuInfo['CPU Utilization'] = int(psutil.cpu_percent())
            cpuInfo['CPU Cores'] = int(psutil.cpu_count())
        else:
            cpuInfo['Missing Python module'] = 'I am sorry but the Python module psutil is not installed. Therefore the number of CPU cores cannot be shown.'
        json_dump = json.dumps(cpuInfo)
        self.wfile.write(bytes(json_dump, 'utf-8'))
        # get GPU info
        if modules['pynvml']:
            try:
                pynvml.nvmlInit()
                gpus = pynvml.nvmlDeviceGetCount()
            except:
                gpus = 0
                self.wfile.write(bytes('No NVIDIA GPU detected', 'utf-8'))
        else:
            gpus = 0
            self.wfile.write(bytes('I am sorry but the Python module pynvml is not installed. Therefore info about NVIDIA GPUs cannot be shown.', 'utf-8'))
        for i in range(gpus):
            handle = pynvml.nvmlDeviceGetHandleByIndex(i)
            self.wfile.write(bytes("<br>GPU " + str(i + 1) + ": " + pynvml.nvmlDeviceGetName(handle).decode('utf-8'), 'utf-8'))
            try:
                self.wfile.write(bytes('<br>Temperature: ' + str(pynvml.nvmlDeviceGetTemperature(handle, 0)) + '°C', 'utf-8'))
            except:
                self.wfile.write(bytes('<br>Could not retrieve temperature', 'utf-8'))
            try:
                gpu_mem = pynvml.nvmlDeviceGetMemoryInfo(handle)
                self.wfile.write(bytes('<br>Total memory: %i Megabytes' % (gpu_mem.total / 10**6), 'utf-8'))
                self.wfile.write(bytes(str('<br>Free memory: %i' % (gpu_mem.free / gpu_mem.total * 100)) + '%', 'utf-8'))
            except:
                self.wfile.write(bytes('<br>Could not retrieve memory information', 'utf-8'))
        if gpus > 0:
            try:
                pynvml.nvmlShutdown()
            except:
                pass
    elif self.path == '/availableComputers':
        send_header(self)
        s = socket.socket(socket.AF_INET, socket.SOCK_DGRAM)
        s.connect(('google.com', 0))
        global myownsocket
        myownsocket = s.getsockname()[0]
        port = 8003
        available_computers = []
        for i in range(1, 256):
            host = '192.168.178.' + str(i)
            sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
            sock.settimeout(0.2)
            try:
                alive = sock.connect_ex((host, port))
            except:
                alive = -1
            if alive == 0:
                print('available')
                available_computers.append(host)
            else:
                print('not available')
            print(host)
        self.wfile.write(bytes('<form action="submit_job">\n', 'utf-8'))
        cmd_txt = """@echo off
call "C:\Program Files\Autodesk\Softimage 2015\Application\bin\setenv.bat"
echo ##### start_rendering
xsibatch -render "Z:\TAZ_RoterFaden\PROCESS\XSI\Scenes\SC_060\088_160523_SC_060_V007.scn" -frames #1#-#2# -pass "BEAUTY" -skip on -verbose on
echo ##### rendering_done
"""
        self.wfile.write(bytes('Command: <textarea name="command">' + cmd_txt + '</textarea><br>\n', 'utf-8'))
        self.wfile.write(bytes('<table border="1">\n', 'utf-8'))
        self.wfile.write(bytes('<tr>\n', 'utf-8'))
        self.wfile.write(bytes('<th>Computer</th>\n', 'utf-8'))
        self.wfile.write(bytes('<th>CPU cores</th>\n', 'utf-8'))
        self.wfile.write(bytes('<th>Start Frame [%]</th>\n', 'utf-8'))
        self.wfile.write(bytes('<th>End Frame [%]</th>\n</tr>\n', 'utf-8'))
        available_cpus = {}
        for host in available_computers:
            available_cpus[host] = abs(get_cpu_cores(host))
        total_cpus = sum(available_cpus.values())
        frame_list = {}
        start_frame = 0
        for host in available_computers:
            start_frame += 1
            frame_list[host] = [start_frame]
            start_frame = start_frame + int(100 * (available_cpus[host] / total_cpus))
            if start_frame > 100:
                start_frame = 100
            frame_list[host].append(start_frame)
        index = 0
        for host in available_computers:
            index += 1
            self.wfile.write(bytes('<tr>\n<td>\n<input type="checkbox" name="host' + str(index) + '" value="', 'utf-8'))
            self.wfile.write(bytes(host, 'utf-8'))
            self.wfile.write(bytes('">' + host + '</td>\n', 'utf-8'))
            self.wfile.write(bytes('<td>' + str(available_cpus[host]) + '</td>\n', 'utf-8'))
            self.wfile.write(bytes('<td><input type="text" name="start' + str(index) + '" value=" ' + str(frame_list[host][0]) + '"></td>\n', 'utf-8'))
            self.wfile.write(bytes('<td><input type="text" name="end' + str(index) + '" value=" ' + str(frame_list[host][1]) + '"></td>\n', 'utf-8'))
            self.wfile.write(bytes('</tr>', 'utf-8'))
        index = 2
        self.wfile.write(bytes('<tr>\n<td>\n<input type="checkbox" name="host' + str(index) + '" value="', 'utf-8'))
        self.wfile.write(bytes(host, 'utf-8'))
        self.wfile.write(bytes('">' + host + '</td>\n', 'utf-8'))
        self.wfile.write(bytes('<td>' + str(available_cpus[host]) + '</td>\n', 'utf-8'))
        self.wfile.write(bytes('<td><input type="text" name="start' + str(index) + '" value=" ' + str(frame_list[host][0]) + '"></td>\n', 'utf-8'))
        self.wfile.write(bytes('<td><input type="text" name="end' + str(index) + '" value=" ' + str(frame_list[host][1]) + '"></td>\n', 'utf-8'))
        self.wfile.write(bytes('</tr>', 'utf-8'))
        self.wfile.write(bytes('</table>\n', 'utf-8'))
        self.wfile.write(bytes('<input type="submit" value="Submit Job">\n', 'utf-8'))
        self.wfile.write(bytes('</form>\n', 'utf-8'))
        self.wfile.write(bytes('</body>\n', 'utf-8'))
        self.wfile.write(bytes('</html>\n', 'utf-8'))
    elif self.path == '/execute_job':
        send_header(self)
        parsed = urlparse(self.path)
        parameters = parse_qs(parsed.query)
    elif '/submit_job' in self.path:
        send_header(self)
        self.wfile.write(bytes(str(self.client_address), 'utf-8'))
        parsed = urlparse(self.path)
        parameters = parse_qs(parsed.query)
        # print(parsed)
        print(parameters)
        self.wfile.write(bytes('<body>', 'utf-8'))
        for index in range(1, 100):
            # parse_qs maps each field to a list of values; guard
            # against missing fields before stripping
            if not parameters.get('host' + str(index), [''])[0].strip():
                pass
            elif not parameters.get('start' + str(index), [''])[0].strip():
                pass
            elif not parameters.get('end' + str(index), [''])[0].strip():
                pass
            elif parameters.get('command'):
                cmd_txt = parameters['command'][0].replace('#1#', parameters['start' + str(index)][0].strip())
                cmd_txt = cmd_txt.replace('#2#', parameters['end' + str(index)][0].strip())
                self.wfile.write(bytes(escape(cmd_txt), 'utf-8'))
                self.wfile.write(bytes('<br>', 'utf-8'))
                print(cmd_txt)
        self.wfile.write(bytes('</body></html>', 'utf-8'))
    elif '/shutdown' in self.path:
        send_header(self)
        self.wfile.write(bytes(str(self.client_address), 'utf-8'))
        self.wfile.write(bytes("Server will be shut down now......", 'utf-8'))
        server.shutdown()
        sys.exit()
    else:
        send_header(self)
        self.wfile.write(bytes(str(self.client_address), 'utf-8'))
        self.wfile.write(bytes("<br>", 'utf-8'))
        self.wfile.write(bytes(self.path, 'utf-8'))
        print(self.path)
def check(self, instance):
    pynvml.nvmlInit()

    msg_list = []
    try:
        deviceCount = pynvml.nvmlDeviceGetCount()
    except:
        deviceCount = 0
    for device_id in xrange(deviceCount):
        handle = pynvml.nvmlDeviceGetHandleByIndex(device_id)
        name = pynvml.nvmlDeviceGetName(handle)
        tags = dict(name="{}-{}".format(name, device_id))
        d_tags = self._dict2list(tags)
        # temperature info
        try:
            temp = pynvml.nvmlDeviceGetTemperature(handle, pynvml.NVML_TEMPERATURE_GPU)
            self.gauge('nvml.temp', temp, tags=d_tags)
        except pynvml.NVMLError as err:
            msg_list.append(u'nvmlDeviceGetTemperature:{}'.format(err))
        # memory info
        try:
            mem = pynvml.nvmlDeviceGetMemoryInfo(handle)
            self.gauge('nvml.mem.total', mem.total, tags=d_tags)
            self.gauge('nvml.mem.used', mem.used, tags=d_tags)
            self.gauge('nvml.mem.free', mem.free, tags=d_tags)
        except pynvml.NVMLError as err:
            msg_list.append(u'nvmlDeviceGetMemoryInfo:{}'.format(err))
        # utilization GPU/Memory info
        try:
            util_rate = pynvml.nvmlDeviceGetUtilizationRates(handle)
            self.gauge('nvml.util.gpu', util_rate.gpu, tags=d_tags)
            self.gauge('nvml.util.memory', util_rate.memory, tags=d_tags)
        except pynvml.NVMLError as err:
            msg_list.append(u'nvmlDeviceGetUtilizationRates:{}'.format(err))
        # utilization Encoder info
        try:
            util_encoder = pynvml.nvmlDeviceGetEncoderUtilization(handle)
            self.log.info('nvml.util.encoder %s' % long(util_encoder[0]))
            self.gauge('nvml.util.encoder', long(util_encoder[0]), tags=d_tags)
        except pynvml.NVMLError as err:
            msg_list.append(u'nvmlDeviceGetEncoderUtilization:{}'.format(err))
        # utilization Decoder info
        try:
            util_decoder = pynvml.nvmlDeviceGetDecoderUtilization(handle)
            self.log.info('nvml.util.decoder %s' % long(util_decoder[0]))
            self.gauge('nvml.util.decoder', long(util_decoder[0]), tags=d_tags)
        except pynvml.NVMLError as err:
            msg_list.append(u'nvmlDeviceGetDecoderUtilization:{}'.format(err))
        # Compute running processes
        try:
            cps = pynvml.nvmlDeviceGetComputeRunningProcesses(handle)
            for ps in cps:
                p_tags = tags.copy()
                p_tags['pid'] = ps.pid
                p_tags['name'] = psutil.Process(ps.pid).name()
                p_tags = self._dict2list(p_tags)
                self.gauge('nvml.process.used_gpu_memory', ps.usedGpuMemory, tags=p_tags)
        except pynvml.NVMLError as err:
            msg_list.append(u'nvmlDeviceGetComputeRunningProcesses:{}'.format(err))

    if msg_list:
        status = AgentCheck.CRITICAL
        msg = u','.join(msg_list)
    else:
        status = AgentCheck.OK
        msg = u'Ok'
    pynvml.nvmlShutdown()

    self.service_check('nvml.check', status, message=msg)
def get_gpu_name(handle):
    """Returns the name of the GPU device

    https://docs.nvidia.com/deploy/nvml-api/group__nvmlDeviceQueries.html#group__nvmlDeviceQueries_1ga5361803e044c6fdf3b08523fb6d1481
    """
    name = pynvml.nvmlDeviceGetName(handle)
    return to_utf8(name)
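to_utf8 is defined elsewhere in this snippet's source; a plausible minimal implementation follows (an assumption, written to match the bytes-or-str behavior of different nvidia-ml-py releases):

def to_utf8(value):
    # Older nvidia-ml-py returns bytes, newer releases return str;
    # normalize either form to str for callers.
    if isinstance(value, bytes):
        return value.decode('utf-8')
    return value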
def one_time():
    h = _pynvml_handles()
    return {
        "memory-total": pynvml.nvmlDeviceGetMemoryInfo(h).total,
        "name": pynvml.nvmlDeviceGetName(h).decode(),
    }
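_pynvml_handles() comes from the surrounding module, not from pynvml itself. A hypothetical single-GPU stand-in, under the assumption that it returns the handle for the first device visible to this process:

import os
import pynvml

def _pynvml_handles():
    # Hypothetical stand-in: initialize NVML and return a handle for the
    # first GPU visible to this process (CUDA_VISIBLE_DEVICES aware).
    pynvml.nvmlInit()
    visible = os.environ.get("CUDA_VISIBLE_DEVICES", "").split(",")[0]
    index = int(visible) if visible else 0
    return pynvml.nvmlDeviceGetHandleByIndex(index)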
def get_devices(self):
    """ Return the name of each device """
    names = [pynvml.nvmlDeviceGetName(handle).decode("utf-8")
             for handle in self.handles]
    return names
if no_gather_nvml_gpu_info:
    logging.debug(
        "--no-gather-nvml-gpu-info passed, " +
        "using blank values for source database GPU info fields " +
        "[gpu_driver_ver, run_gpu_name] ")
elif conn_machine_name == "localhost" or gather_nvml_gpu_info:
    logging.debug(
        "Gathering source database GPU info fields " +
        "[gpu_driver_ver, run_gpu_name] " +
        "from local GPU using pynvml. ")
    import pynvml
    pynvml.nvmlInit()
    source_db_gpu_driver_ver = pynvml.nvmlSystemGetDriverVersion().decode()
    for i in range(source_db_gpu_count):
        handle = pynvml.nvmlDeviceGetHandleByIndex(i)
        # Assume all cards are the same, overwrite name value
        source_db_gpu_name = pynvml.nvmlDeviceGetName(handle).decode()
    pynvml.nvmlShutdown()
# If gpu_count argument passed in, override gathered value
if gpu_count:
    source_db_gpu_count = gpu_count
# Set machine names, using local info if connected to localhost
if conn_machine_name == "localhost":
    local_uname = os.uname()
if machine_name:
    run_machine_name = machine_name
else:
    if conn_machine_name == "localhost":
        run_machine_name = local_uname.nodename.split(".")[0]
    else:
        run_machine_name = conn_machine_name
if machine_uname:
import pynvml

pynvml.nvmlInit()
handle = pynvml.nvmlDeviceGetHandleByIndex(0)
device_name = pynvml.nvmlDeviceGetName(handle)
if device_name != b'Tesla T4':
    raise Exception("""
    Unfortunately Colab didn't give you a T4 GPU.

    Make sure you've configured Colab to request a GPU instance type.

    If you get a K80 GPU, try Runtime -> Reset all runtimes...
    """)
else:
    print('Woo! You got the right kind of GPU!')
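One caveat with the check above: newer nvidia-ml-py releases return str rather than bytes from nvmlDeviceGetName, so the `b'Tesla T4'` comparison can fail even on a T4. A defensive variant of the same check:

name = pynvml.nvmlDeviceGetName(handle)
if isinstance(name, bytes):  # older pynvml returns bytes
    name = name.decode('utf-8')
if name != 'Tesla T4':
    raise Exception("Unfortunately Colab didn't give you a T4 GPU.")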
def get_device_name(device_handle):
    """Get GPU device name"""
    try:
        return pynvml.nvmlDeviceGetName(device_handle)
    except pynvml.NVMLError:
        return "NVIDIA"
def _get_name(h):
    try:
        return pynvml.nvmlDeviceGetName(h).decode()
    except pynvml.NVMLError_NotSupported:
        return None
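A short usage sketch for a helper like `_get_name`: enumerate every device index, collect the names, and let unsupported devices come back as None. The init/shutdown framing is an assumption about the surrounding code:

import pynvml

pynvml.nvmlInit()
try:
    names = [_get_name(pynvml.nvmlDeviceGetHandleByIndex(i))
             for i in range(pynvml.nvmlDeviceGetCount())]
    print(names)
finally:
    pynvml.nvmlShutdown()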
sysdata['ssd0_exist'] = False
if os.path.isdir('/ssd1'):
    ssd1_usage = psutil.disk_usage('/ssd1')
    sysdata['ssd1_exist'] = True
    sysdata['ssd1_used'] = toGB(ssd1_usage.used)
    sysdata['ssd1_total'] = toGB(ssd1_usage.total)
else:
    sysdata['ssd1_exist'] = False

procs = deviceCount * [None]
gpu_error = deviceCount * [False]
for i in range(deviceCount):
    try:
        handle = nvmlDeviceGetHandleByIndex(i)
        name = nvmlDeviceGetName(handle)
        gpudata[i]['name'] = name.decode('utf-8')
        memInfo = nvmlDeviceGetMemoryInfo(handle)
        gpudata[i]['mem_free'] = toMB(memInfo.total - memInfo.used)
        gpudata[i]['mem_total'] = toMB(memInfo.total)
        gpudata[i]['mem_usage'] = memInfo.used / memInfo.total * 100
        procs_prefilter = nvmlDeviceGetComputeRunningProcesses(handle)
        # for unknown reasons, nvmlDeviceGetComputeRunningProcesses
        # sometimes returns nonexistent processes on 3090 GPUs
        procs[i] = []
        gpudata[i]['procs'] = []
        for p in procs_prefilter:
            try:
                P = psutil.Process(p.pid)
def setup(self):
    self.data["root"] = os.getcwd()
    try:
        import __main__
        self.data["program"] = __main__.__file__
    except (ImportError, AttributeError):
        self.data["program"] = '<python with no main file>'
        if wandb._get_python_type() != "python":
            if os.getenv(env.NOTEBOOK_NAME):
                self.data["program"] = os.getenv(env.NOTEBOOK_NAME)
            else:
                meta = wandb.jupyter.notebook_metadata()
                if meta.get("path"):
                    if "fileId=" in meta["path"]:
                        self.data["colab"] = (
                            "https://colab.research.google.com/drive/"
                            + meta["path"].split("fileId=")[1])
                        self.data["program"] = meta["name"]
                    else:
                        self.data["program"] = meta["path"]
                        self.data["root"] = meta["root"]

    program = os.path.join(self.data["root"], self.data["program"])
    if not os.getenv(env.DISABLE_CODE):
        if self._api.git.enabled:
            self.data["git"] = {
                "remote": self._api.git.remote_url,
                "commit": self._api.git.last_commit
            }
            self.data["email"] = self._api.git.email
            self.data["root"] = self._api.git.root or self.data["root"]
            if os.path.exists(program) and self._api.git.is_untracked(
                    self.data["program"]):
                util.mkdir_exists_ok(
                    os.path.join(self.out_dir, "code",
                                 os.path.dirname(self.data["program"])))
                saved_program = os.path.join(
                    self.out_dir, "code", self.data["program"])
                if not os.path.exists(saved_program):
                    self.data["codeSaved"] = True
                    copyfile(program, saved_program)

    self.data["startedAt"] = datetime.utcfromtimestamp(
        wandb.START_TIME).isoformat()
    self.data["host"] = os.environ.get(env.HOST, socket.gethostname())
    try:
        username = getpass.getuser()
    except KeyError:
        # getuser() could raise KeyError in restricted environments like
        # chroot jails or docker containers. Return user id in these cases.
        username = str(os.getuid())
    self.data["username"] = os.getenv(env.USERNAME, username)
    self.data["os"] = platform.platform(aliased=True)
    self.data["python"] = platform.python_version()
    self.data["executable"] = sys.executable
    if env.get_docker():
        self.data["docker"] = env.get_docker()
    try:
        pynvml.nvmlInit()
        self.data["gpu"] = pynvml.nvmlDeviceGetName(
            pynvml.nvmlDeviceGetHandleByIndex(0)).decode("utf8")
        self.data["gpu_count"] = pynvml.nvmlDeviceGetCount()
    except pynvml.NVMLError:
        pass
    try:
        self.data["cpu_count"] = multiprocessing.cpu_count()
    except NotImplementedError:
        pass
    # TODO: we should use the cuda library to collect this
    if os.path.exists("/usr/local/cuda/version.txt"):
        self.data["cuda"] = open(
            "/usr/local/cuda/version.txt").read().split(" ")[-1].strip()
    self.data["args"] = sys.argv[1:]
    self.data["state"] = "running"
import pynvml as nv
import time
import os
import sys
# import psutil

gpu_id = 0
query_interval = 0.5

nv.nvmlInit()
handle = nv.nvmlDeviceGetHandleByIndex(gpu_id)
print("Driver Version:", nv.nvmlSystemGetDriverVersion())
print("GPU", gpu_id, "Device Name:", nv.nvmlDeviceGetName(handle))

while True:
    try:
        memory = nv.nvmlDeviceGetMemoryInfo(handle)
        device_util = nv.nvmlDeviceGetUtilizationRates(handle)
        print("Memory total:", memory.total / 1024 / 1024, "M. ",
              "Memory used:", memory.used / 1024 / 1024, "M.")
        print("Memory-util: %.2f" % (memory.used * 100.0 / memory.total), "%. ",
              "GPU-util:", device_util.gpu, "%.")
        time.sleep(query_interval)
    except IndexError:
        nv.nvmlShutdown()
        print("process terminated!")
        sys.exit()

'''
while(1):
    try:
        pid_obj = os.popen('pgrep -f matrix_apps_config_8801')
        pid = int(pid_obj.read().split()[0])
        print(pid)
def get_gpu_info(handle):
    """Get one GPU information specified by nvml handle"""

    def get_process_info(nv_process):
        """Get the process information of specific pid"""
        process = {}
        ps_process = psutil.Process(pid=nv_process.pid)
        process['username'] = ps_process.username()
        # cmdline returns full path; as in `ps -o comm`, get short cmdnames.
        _cmdline = ps_process.cmdline()
        if not _cmdline:
            # sometimes, zombie or unknown (e.g. [kworker/8:2H])
            process['command'] = '?'
        else:
            process['command'] = os.path.basename(_cmdline[0])
        # Bytes to MBytes (usedGpuMemory can be None when not supported)
        process['gpu_memory_usage'] = (
            int(nv_process.usedGpuMemory / 1024 / 1024)
            if nv_process.usedGpuMemory else None)
        process['pid'] = nv_process.pid
        return process

    def _decode(b):
        if isinstance(b, bytes):
            return b.decode()  # for python3, to unicode
        return b

    name = _decode(N.nvmlDeviceGetName(handle))
    uuid = _decode(N.nvmlDeviceGetUUID(handle))

    try:
        temperature = N.nvmlDeviceGetTemperature(
            handle, N.NVML_TEMPERATURE_GPU)
    except N.NVMLError:
        temperature = None  # Not supported
    try:
        memory = N.nvmlDeviceGetMemoryInfo(handle)  # in Bytes
    except N.NVMLError:
        memory = None  # Not supported
    try:
        utilization = N.nvmlDeviceGetUtilizationRates(handle)
    except N.NVMLError:
        utilization = None  # Not supported
    try:
        power = N.nvmlDeviceGetPowerUsage(handle)
    except N.NVMLError:
        power = None
    try:
        power_limit = N.nvmlDeviceGetEnforcedPowerLimit(handle)
    except N.NVMLError:
        power_limit = None

    try:
        nv_comp_processes = N.nvmlDeviceGetComputeRunningProcesses(handle)
    except N.NVMLError:
        nv_comp_processes = None  # Not supported
    try:
        nv_graphics_processes = N.nvmlDeviceGetGraphicsRunningProcesses(handle)
    except N.NVMLError:
        nv_graphics_processes = None  # Not supported

    if nv_comp_processes is None and nv_graphics_processes is None:
        processes = None  # Not supported (in both cases)
    else:
        processes = []
        nv_comp_processes = nv_comp_processes or []
        nv_graphics_processes = nv_graphics_processes or []
        for nv_process in (nv_comp_processes + nv_graphics_processes):
            # TODO: could be more information such as system memory usage,
            # CPU percentage, create time etc.
            try:
                process = get_process_info(nv_process)
                processes.append(process)
            except psutil.NoSuchProcess:
                # TODO: add some reminder for NVML broken context
                # e.g. nvidia-smi reset or reboot the system
                pass

    index = N.nvmlDeviceGetIndex(handle)
    gpu_info = {
        'index': index,
        'uuid': uuid,
        'name': name,
        'temperature.gpu': temperature,
        'utilization.gpu': utilization.gpu if utilization else None,
        'power.draw': int(power / 1000) if power is not None else None,
        'enforced.power.limit': (int(power_limit / 1000)
                                 if power_limit is not None else None),
        # Convert bytes into MBytes
        'memory.used': int(memory.used / 1024 / 1024) if memory else None,
        'memory.total': int(memory.total / 1024 / 1024) if memory else None,
        'processes': processes,
    }
    return gpu_info
def step(self):
    valuesDict = {}
    valuesDict['table'] = self._tableName
    cpu = valuesDict['cpu'] = psutil.cpu_percent(interval=0)
    mem = valuesDict['mem'] = psutil.virtual_memory().percent
    swap = valuesDict['swap'] = psutil.swap_memory().percent
    # some code examples:
    # https://github.com/ngi644/datadog_nvml/blob/master/nvml.py
    if self.doGpu:
        for i in self.gpusToUse:
            try:
                handle = nvmlDeviceGetHandleByIndex(i)
                memInfo = nvmlDeviceGetMemoryInfo(handle)
                valuesDict["gpuMem_%d" % i] = \
                    float(memInfo.used) * 100. / float(memInfo.total)
                util = nvmlDeviceGetUtilizationRates(handle)
                valuesDict["gpuUse_%d" % i] = util.gpu
                temp = nvmlDeviceGetTemperature(handle, NVML_TEMPERATURE_GPU)
                valuesDict["gpuTem_%d" % i] = temp
            except NVMLError as err:
                handle = nvmlDeviceGetHandleByIndex(i)
                msg = "Device %d -> %s not supported\n" \
                      "Remove device %d from FORM" % \
                      (i, nvmlDeviceGetName(handle), i)
                errorWindow(None, msg)
    if self.doNetwork:
        try:
            # measure over a short interval
            pnic_before = psutil.net_io_counters(pernic=True)[self.nif]
            time.sleep(self.samplingTime)  # sec
            pnic_after = psutil.net_io_counters(pernic=True)[self.nif]
            bytes_sent = pnic_after.bytes_sent - pnic_before.bytes_sent
            bytes_recv = pnic_after.bytes_recv - pnic_before.bytes_recv
            # MB transferred per second over the sampling interval
            valuesDict["%s_send" % self.nif] = \
                bytes_sent / self.samplingTime / 1048576
            valuesDict["%s_recv" % self.nif] = \
                bytes_recv / self.samplingTime / 1048576
        except Exception:
            msg = "cannot get information of network interface %s" % \
                  self.nif
    if self.doDiskIO:
        try:
            # measure over a short interval
            disk_before = psutil.disk_io_counters(perdisk=False)
            time.sleep(self.samplingTime)  # sec
            disk_after = psutil.disk_io_counters(perdisk=False)
            bytes_read = disk_after.read_bytes - disk_before.read_bytes
            bytes_write = disk_after.write_bytes - disk_before.write_bytes
            valuesDict["disk_read"] = \
                bytes_read / self.samplingTime / self.mega
            valuesDict["disk_write"] = \
                bytes_write / self.samplingTime / self.mega
        except Exception:
            msg = "cannot get information of disk usage "

    if self.cpuAlert < 100 and cpu > self.cpuAlert:
        self.warning("CPU allocation =%f." % cpu)
        self.cpuAlert = cpu
    if self.memAlert < 100 and mem > self.memAlert:
        self.warning("Memory allocation =%f." % mem)
        self.memAlert = mem
    if self.swapAlert < 100 and swap > self.swapAlert:
        self.warning("SWAP allocation =%f." % swap)
        self.swapAlert = swap

    sqlCommand = "INSERT INTO %(table)s ("
    for label in self.labelList:
        sqlCommand += "%s, " % label
    # remove last comma
    sqlCommand = sqlCommand[:-2]
    sqlCommand += ") VALUES("
    for label in self.labelList:
        sqlCommand += "%" + "(%s)f, " % label
    # remove last comma
    sqlCommand = sqlCommand[:-2]
    sqlCommand += ");"
    sql = sqlCommand % valuesDict
    try:
        self.cur.execute(sql)
    except Exception as e:
        print("ERROR: saving one data point (monitor). I continue")

    # Return finished = True if all protocols have finished
    finished = []
    for prot in self.protocols:
        updatedProt = getUpdatedProtocol(prot)
        finished.append(updatedProt.getStatus() != STATUS_RUNNING)
    return all(finished)
def get_device_name(device_handle):
    """Get GPU device name."""
    try:
        return nativestr(pynvml.nvmlDeviceGetName(device_handle))
    except pynvml.NVMLError:
        return "NVIDIA"
def get(self):
    """Write the web page content."""
    global cpu_load
    global gpu_load_compute
    global gpu_load_memory
    memory = psutil.virtual_memory()
    swap = psutil.swap_memory()
    if nvidia:
        nvmlHandle = nvmlDeviceGetHandleByIndex(0)
        gpu = nvmlDeviceGetName(nvmlHandle).decode('utf-8')
        gpu_memory = nvmlDeviceGetMemoryInfo(nvmlHandle)
        gpu_ram = round(gpu_memory.total / (1024 * 1048576), 2)
        gpu += " - " + str(gpu_ram) + "GB"
    else:
        gpu = "Not recognized"
    ram = str(int(round(float(memory.total) / (1024 * 1048576)))) + "GB"
    ram += " (swap: " + str(int(round(float(swap.total) / (1024 * 1048576)))) + "GB)"
    real_cores = psutil.cpu_count(False)
    cores_ratio = int(psutil.cpu_count(True) / real_cores)
    cores = " (" + str(cores_ratio) + "x " + str(real_cores) + " cores)"
    if sys.platform.startswith('linux'):
        distribution = distro.linux_distribution()
        os_name = 'Linux ' + distribution[0] + " " + distribution[1] + " " + distribution[2]
        command = "cat /proc/cpuinfo"
        all_info = subprocess.check_output(command, shell=True).decode('utf-8').strip()
        for line in all_info.split("\n"):
            if "model name" in line:
                cpu = re.sub(".*model name.*:", "", line, 1)
                break
    elif sys.platform == 'win32':
        computer = wmi.WMI()
        os_info = computer.Win32_OperatingSystem()[0]
        cpu = computer.Win32_Processor()[0].Name
        os_name = os_info.Name.split('|')[0] + ", version " + os_info.Version
    elif sys.platform == 'darwin':
        os_name = 'macOS ' + platform.mac_ver()[0]
        os.environ['PATH'] = os.environ['PATH'] + os.pathsep + '/usr/sbin'
        command = 'sysctl -n machdep.cpu.brand_string'
        # decode so the bytes output can be concatenated with str below
        cpu = subprocess.check_output(command, shell=True).decode('utf-8').strip()
    else:  # unknown platform
        os_name = 'Unknown'
        cpu = 'Unknown'
    self.write("<!DOCTYPE html>\n")
    self.write("<html><head><meta charset='utf-8'/><title>Webots simulation server</title>")
    self.write("<link rel='stylesheet' type='text/css' href='css/monitor.css'></head>\n")
    self.write("<body><h1>Webots simulation server: " + socket.getfqdn() + "</h1>")
    self.write("<h2>Host: " + os_name + "</h2>\n")
    self.write("<p><b>CPU load: %g%%</b><br>\n" % cpu_load)
    self.write(cpu + cores + "</p>\n")
    self.write("<p><b>GPU load compute: %g%% — load memory: %g%%</b><br>\n"
               % (gpu_load_compute, gpu_load_memory))
    self.write(gpu + "</p>\n")
    self.write("<p><b>RAM:</b><br>" + ram + "</p>\n")
    self.write("<canvas id='graph' height='400' width='1024'></canvas>\n")
    self.write("<script src='https://www.cyberbotics.com/harry-plotter/0.9f/harry.min.js'></script>\n")
    self.write("<script>\n")
    self.write("window.onload = function() {\n")

    def appendData(label):
        global snapshots
        d = "{title:'" + label + "',values:["
        for s in snapshots:
            d += str(s.data[label]) + ','
        return d[:-1] + "]},"

    datas = ''
    datas += appendData('Webots running')
    datas += appendData('Webots idle')
    datas += appendData('CPU load')
    datas += appendData('CPU memory')
    datas += appendData('GPU load compute')
    datas += appendData('GPU load memory')
    datas += appendData('GPU memory')
    datas += appendData('Swap')
    datas += appendData('Disk')
    datas += appendData('Network sent')
    datas += appendData('Network received')
    datas = datas[:-1]  # remove the last comma
    self.write("  plotter({\n")
    self.write("    canvas: 'graph',\n")
    self.write("    datas:[ " + datas + "],\n")
    self.write("""
    labels:{ ypos:"left", x:100, y:[50,100], marks:2 },
    fill:"none",
    opacity:0.5,
    linewidth:3,
    background:"#fff",
    autoscale:"top",
    grid:{ x:[0,100] },
    mouseover:{ radius:4, linewidth:2, bullet:"#444",
                shadowbox:"1,1,0,#000", axis:"x" }
  });""")
    self.write("}\n")
    self.write("</script>\n")
    self.write("</body></html>")
def getName(self):
    return pynvml.nvmlDeviceGetName(self.handle)
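A method this thin presupposes that NVML was initialized and the handle resolved elsewhere. A minimal sketch of the surrounding lifecycle, with a hypothetical class and attribute names that are assumptions, not the original code:

import pynvml

class Device:
    """Hypothetical wrapper illustrating the NVML lifecycle around getName()."""

    def __init__(self, index=0):
        pynvml.nvmlInit()  # must precede any NVML query
        self.handle = pynvml.nvmlDeviceGetHandleByIndex(index)

    def getName(self):
        return pynvml.nvmlDeviceGetName(self.handle)

    def close(self):
        pynvml.nvmlShutdown()  # release NVML when done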
def device_name_for(device_handle):
    """Get GPU device name"""
    try:
        return nativestr(pynvml.nvmlDeviceGetName(device_handle))
    except pynvml.NVMLError:
        return "NVIDIA"
def get_gpu_info(handle):
    """Get one GPU information specified by nvml handle"""

    def get_process_info(nv_process):
        """Get the process information of specific pid"""
        process = {}
        ps_process = psutil.Process(pid=nv_process.pid)
        process['username'] = ps_process.username()
        # cmdline returns full path;
        # as in `ps -o comm`, get short cmdnames.
        _cmdline = ps_process.cmdline()
        if not _cmdline:
            # sometimes, zombie or unknown (e.g. [kworker/8:2H])
            process['command'] = '?'
        else:
            process['command'] = os.path.basename(_cmdline[0])
        # Bytes to MBytes
        process['gpu_memory_usage'] = nv_process.usedGpuMemory // MB
        process['pid'] = nv_process.pid
        if process['username'] == 'root':
            # Map root-owned processes back to their docker container name.
            out = subprocess.check_output(
                'docker inspect --format "{{.Name}}" '
                '"$(cat /proc/' + str(process['pid']) +
                '/cgroup | head -n 1 | cut -d / -f 3)" | sed "s/^\\///"',
                shell=True).decode()
            if "Error" not in out:
                process['username'] = out.strip()
        return process

    def _decode(b):
        if isinstance(b, bytes):
            return b.decode()  # for python3, to unicode
        return b

    name = _decode(N.nvmlDeviceGetName(handle))
    uuid = _decode(N.nvmlDeviceGetUUID(handle))

    try:
        temperature = N.nvmlDeviceGetTemperature(
            handle, N.NVML_TEMPERATURE_GPU)
    except N.NVMLError:
        temperature = None  # Not supported
    try:
        memory = N.nvmlDeviceGetMemoryInfo(handle)  # in Bytes
    except N.NVMLError:
        memory = None  # Not supported
    try:
        utilization = N.nvmlDeviceGetUtilizationRates(handle)
    except N.NVMLError:
        utilization = None  # Not supported
    try:
        power = N.nvmlDeviceGetPowerUsage(handle)
    except N.NVMLError:
        power = None
    try:
        power_limit = N.nvmlDeviceGetEnforcedPowerLimit(handle)
    except N.NVMLError:
        power_limit = None

    try:
        nv_comp_processes = \
            N.nvmlDeviceGetComputeRunningProcesses(handle)
    except N.NVMLError:
        nv_comp_processes = None  # Not supported
    try:
        nv_graphics_processes = \
            N.nvmlDeviceGetGraphicsRunningProcesses(handle)
    except N.NVMLError:
        nv_graphics_processes = None  # Not supported

    if nv_comp_processes is None and nv_graphics_processes is None:
        processes = None
    else:
        processes = []
        nv_comp_processes = nv_comp_processes or []
        nv_graphics_processes = nv_graphics_processes or []
        for nv_process in nv_comp_processes + nv_graphics_processes:
            # TODO: could be more information such as system memory
            # usage, CPU percentage, create time etc.
            try:
                process = get_process_info(nv_process)
                processes.append(process)
            except psutil.NoSuchProcess:
                # TODO: add some reminder for NVML broken context
                # e.g. nvidia-smi reset or reboot the system
                pass

    index = N.nvmlDeviceGetIndex(handle)
    gpu_info = {
        'index': index,
        'uuid': uuid,
        'name': name,
        'temperature.gpu': temperature,
        'utilization.gpu': utilization.gpu if utilization else None,
        'power.draw': power // 1000 if power is not None else None,
        'enforced.power.limit': (power_limit // 1000
                                 if power_limit is not None else None),
        # Convert bytes into MBytes
        'memory.used': memory.used // MB if memory else None,
        'memory.total': memory.total // MB if memory else None,
        'processes': processes,
    }
    return gpu_info
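The shell pipeline in get_process_info above is fragile: it assumes a cgroup-v1 path layout and a docker binary on PATH. A hedged alternative that reads /proc/<pid>/cgroup directly in Python and returns the container ID (not the name), or None when the process does not appear to run in a container:

def container_id_for(pid):
    # Sketch: parse the first cgroup line, e.g.
    # "12:devices:/docker/<64-hex-id>", and return the trailing id.
    try:
        with open('/proc/%d/cgroup' % pid) as f:
            first = f.readline().strip()
    except OSError:
        return None
    parts = first.split('/')
    return parts[-1] if len(parts) > 2 and 'docker' in first else None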
def get_device_name(self, device_handle):
    """Get GPU device name"""
    try:
        return pynvml.nvmlDeviceGetName(device_handle)
    except pynvml.NVMLError:
        return "NVIDIA"
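The several get_device_name variants above differ only in how they normalize pynvml's bytes/str return value. A consolidated sketch that handles both cases and falls back to a generic label on any NVML error:

import pynvml

def get_device_name(device_handle):
    """Return the device name as str, or "NVIDIA" if the query fails."""
    try:
        name = pynvml.nvmlDeviceGetName(device_handle)
    except pynvml.NVMLError:
        return "NVIDIA"
    # older pynvml returns bytes, newer returns str
    return name.decode('utf-8') if isinstance(name, bytes) else name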