def stats(self):
    stats = {}
    for i in range(self.gpu_count):
        handle = pynvml.nvmlDeviceGetHandleByIndex(i)
        try:
            util = pynvml.nvmlDeviceGetUtilizationRates(handle)
            memory = pynvml.nvmlDeviceGetMemoryInfo(handle)
            temp = pynvml.nvmlDeviceGetTemperature(
                handle, pynvml.NVML_TEMPERATURE_GPU)
            in_use_by_us = gpu_in_use_by_this_process(handle)

            stats["gpu.{}.{}".format(i, "gpu")] = util.gpu
            stats["gpu.{}.{}".format(i, "memory")] = util.memory
            stats["gpu.{}.{}".format(i, "memoryAllocated")] = (
                memory.used / float(memory.total)) * 100
            stats["gpu.{}.{}".format(i, "temp")] = temp

            if in_use_by_us:
                stats["gpu.process.{}.{}".format(i, "gpu")] = util.gpu
                stats["gpu.process.{}.{}".format(i, "memory")] = util.memory
                stats["gpu.process.{}.{}".format(i, "memoryAllocated")] = (
                    memory.used / float(memory.total)) * 100
                stats["gpu.process.{}.{}".format(i, "temp")] = temp

            # Some GPUs don't provide information about power usage
            try:
                power_watts = pynvml.nvmlDeviceGetPowerUsage(handle) / 1000.0
                power_capacity_watts = (
                    pynvml.nvmlDeviceGetEnforcedPowerLimit(handle) / 1000.0)
                power_usage = (power_watts / power_capacity_watts) * 100

                stats["gpu.{}.{}".format(i, "powerWatts")] = power_watts
                stats["gpu.{}.{}".format(i, "powerPercent")] = power_usage

                if in_use_by_us:
                    stats["gpu.process.{}.{}".format(i, "powerWatts")] = power_watts
                    stats["gpu.process.{}.{}".format(i, "powerPercent")] = power_usage
            except pynvml.NVMLError:
                pass
        except pynvml.NVMLError:
            pass

    if psutil:
        net = psutil.net_io_counters()
        sysmem = psutil.virtual_memory()
        stats["cpu"] = psutil.cpu_percent()
        stats["memory"] = sysmem.percent
        stats["network"] = {
            "sent": net.bytes_sent - self.network_init["sent"],
            "recv": net.bytes_recv - self.network_init["recv"],
        }
        # TODO: maybe show other partitions, will likely need user to configure
        stats["disk"] = psutil.disk_usage("/").percent
        stats["proc.memory.availableMB"] = sysmem.available / 1048576.0
        try:
            stats["proc.memory.rssMB"] = self.proc.memory_info().rss / 1048576.0
            stats["proc.memory.percent"] = self.proc.memory_percent()
            stats["proc.cpu.threads"] = self.proc.num_threads()
        except psutil.NoSuchProcess:
            pass
    return stats
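# The loop above assumes NVML has already been initialized and that
# self.gpu_count was populated from it. A minimal sketch of that setup,
# assuming a standalone helper rather than the class's actual __init__
# (the helper name is illustrative only):
import pynvml

def detect_gpu_count():
    try:
        pynvml.nvmlInit()  # must succeed before any other NVML query
        return pynvml.nvmlDeviceGetCount()
    except pynvml.NVMLError:
        return 0  # no NVIDIA driver or device available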
def stats(self):
    stats = {}
    for i in range(0, self.gpu_count):
        handle = pynvml.nvmlDeviceGetHandleByIndex(i)
        try:
            utilz = pynvml.nvmlDeviceGetUtilizationRates(handle)
            memory = pynvml.nvmlDeviceGetMemoryInfo(handle)
            temp = pynvml.nvmlDeviceGetTemperature(
                handle, pynvml.NVML_TEMPERATURE_GPU)
            in_use_by_us = gpu_in_use_by_this_process(handle)

            stats["gpu.{}.{}".format(i, "gpu")] = utilz.gpu
            stats["gpu.{}.{}".format(i, "memory")] = utilz.memory
            stats["gpu.{}.{}".format(i, "memoryAllocated")] = (
                memory.used / float(memory.total)) * 100
            stats["gpu.{}.{}".format(i, "temp")] = temp

            if in_use_by_us:
                stats["gpu.process.{}.{}".format(i, "gpu")] = utilz.gpu
                stats["gpu.process.{}.{}".format(i, "memory")] = utilz.memory
                stats["gpu.process.{}.{}".format(i, "memoryAllocated")] = (
                    memory.used / float(memory.total)) * 100
                stats["gpu.process.{}.{}".format(i, "temp")] = temp

            # Some GPUs don't provide information about power usage
            try:
                power_watts = pynvml.nvmlDeviceGetPowerUsage(handle) / 1000.0
                power_capacity_watts = (
                    pynvml.nvmlDeviceGetEnforcedPowerLimit(handle) / 1000.0)
                power_usage = (power_watts / power_capacity_watts) * 100

                stats["gpu.{}.{}".format(i, "powerWatts")] = power_watts
                stats["gpu.{}.{}".format(i, "powerPercent")] = power_usage

                if in_use_by_us:
                    stats["gpu.process.{}.{}".format(i, "powerWatts")] = power_watts
                    stats["gpu.process.{}.{}".format(i, "powerPercent")] = power_usage
            except pynvml.NVMLError:
                pass
        except pynvml.NVMLError:
            pass

    # On Apple M1 systems let's look for the gpu
    if (
        platform.system() == "Darwin"
        and platform.processor() == "arm"
        and self.gpu_count == 0
    ):
        try:
            out = subprocess.check_output(
                [util.apple_gpu_stats_binary(), "--json"])
            m1_stats = json.loads(out.split(b"\n")[0])
            stats["gpu.0.gpu"] = m1_stats["utilization"]
            stats["gpu.0.memoryAllocated"] = m1_stats["mem_used"]
            stats["gpu.0.temp"] = m1_stats["temperature"]
            stats["gpu.0.powerWatts"] = m1_stats["power"]
            stats["gpu.0.powerPercent"] = (
                m1_stats["power"] / M1_MAX_POWER_WATTS) * 100
            # TODO: this stat could be useful eventually, it was consistently
            # 0 in my experimentation and requires a frontend change
            # so leaving it out for now.
            # stats["gpu.0.cpuWaitMs"] = m1_stats["cpu_wait_ms"]

            if self._interface and not self._telem.env.m1_gpu:
                self._telem.env.m1_gpu = True
                self._interface.publish_telemetry(self._telem)
        except (OSError, ValueError, TypeError, subprocess.CalledProcessError) as e:
            wandb.termwarn("GPU stats error {}".format(e))

    if psutil:
        net = psutil.net_io_counters()
        sysmem = psutil.virtual_memory()
        stats["cpu"] = psutil.cpu_percent()
        stats["memory"] = sysmem.percent
        stats["network"] = {
            "sent": net.bytes_sent - self.network_init["sent"],
            "recv": net.bytes_recv - self.network_init["recv"],
        }
        # TODO: maybe show other partitions, will likely need user to configure
        stats["disk"] = psutil.disk_usage("/").percent
        stats["proc.memory.availableMB"] = sysmem.available / 1048576.0
        try:
            stats["proc.memory.rssMB"] = self.proc.memory_info().rss / 1048576.0
            stats["proc.memory.percent"] = self.proc.memory_percent()
            stats["proc.cpu.threads"] = self.proc.num_threads()
        except psutil.NoSuchProcess:
            pass

    if self._tpu_profiler:
        stats["tpu"] = self._tpu_profiler.get_tpu_utilization()
    return stats
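# A background collector would typically call stats() on an interval and
# average the samples before reporting them. A minimal sketch of that idea,
# assuming a generic stats_fn callable and made-up interval defaults; the
# real collector's thread and queue plumbing is omitted:
import time
from collections import defaultdict

def sample_and_average(stats_fn, samples_to_average=15, sample_rate_seconds=2.0):
    sums = defaultdict(float)
    counts = defaultdict(int)
    for _ in range(samples_to_average):
        for key, value in stats_fn().items():
            # Skip non-numeric entries such as the nested "network" dict.
            if isinstance(value, (int, float)):
                sums[key] += value
                counts[key] += 1
        time.sleep(sample_rate_seconds)
    return {key: sums[key] / counts[key] for key in sums}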