Example #1
    def _setup_sys(self):
        self.data["os"] = self._settings._os
        self.data["python"] = self._settings._python
        self.data["heartbeatAt"] = datetime.utcnow().isoformat()
        self.data["startedAt"] = datetime.utcfromtimestamp(
            self._settings._start_time
        ).isoformat()

        self.data["docker"] = self._settings.docker

        try:
            pynvml.nvmlInit()
            self.data["gpu"] = pynvml.nvmlDeviceGetName(
                pynvml.nvmlDeviceGetHandleByIndex(0)
            ).decode("utf8")
            self.data["gpu_count"] = pynvml.nvmlDeviceGetCount()
        except pynvml.NVMLError:
            pass
        try:
            self.data["cpu_count"] = multiprocessing.cpu_count()
        except NotImplementedError:
            pass

        self.data["cuda"] = self._settings._cuda
        self.data["args"] = self._settings._args
        self.data["state"] = "running"
Example #2
    def stats(self):
        stats = {}
        for i in range(self.gpu_count):
            handle = pynvml.nvmlDeviceGetHandleByIndex(i)
            try:
                util = pynvml.nvmlDeviceGetUtilizationRates(handle)
                memory = pynvml.nvmlDeviceGetMemoryInfo(handle)
                temp = pynvml.nvmlDeviceGetTemperature(
                    handle, pynvml.NVML_TEMPERATURE_GPU)
                in_use_by_us = gpu_in_use_by_this_process(handle)

                stats["gpu.{}.{}".format(i, "gpu")] = util.gpu
                stats["gpu.{}.{}".format(i, "memory")] = util.memory
                stats["gpu.{}.{}".format(
                    i, "memoryAllocated")] = (memory.used /
                                              float(memory.total)) * 100
                stats["gpu.{}.{}".format(i, "temp")] = temp

                if in_use_by_us:
                    stats["gpu.process.{}.{}".format(i, "gpu")] = util.gpu
                    stats["gpu.process.{}.{}".format(i,
                                                     "memory")] = util.memory
                    stats["gpu.process.{}.{}".format(
                        i, "memoryAllocated")] = (memory.used /
                                                  float(memory.total)) * 100
                    stats["gpu.process.{}.{}".format(i, "temp")] = temp

                # Some GPUs don't provide information about power usage
                try:
                    power_watts = pynvml.nvmlDeviceGetPowerUsage(
                        handle) / 1000.0
                    power_capacity_watts = (
                        pynvml.nvmlDeviceGetEnforcedPowerLimit(handle) /
                        1000.0)
                    power_usage = (power_watts / power_capacity_watts) * 100

                    stats["gpu.{}.{}".format(i, "powerWatts")] = power_watts
                    stats["gpu.{}.{}".format(i, "powerPercent")] = power_usage

                    if in_use_by_us:
                        stats["gpu.process.{}.{}".format(
                            i, "powerWatts")] = power_watts
                        stats["gpu.process.{}.{}".format(
                            i, "powerPercent")] = power_usage

                except pynvml.NVMLError:
                    pass

            except pynvml.NVMLError:
                pass
        if psutil:
            net = psutil.net_io_counters()
            sysmem = psutil.virtual_memory()
            stats["cpu"] = psutil.cpu_percent()
            stats["memory"] = sysmem.percent
            stats["network"] = {
                "sent": net.bytes_sent - self.network_init["sent"],
                "recv": net.bytes_recv - self.network_init["recv"],
            }
            # TODO: maybe show other partitions, will likely need user to configure
            stats["disk"] = psutil.disk_usage("/").percent
            stats["proc.memory.availableMB"] = sysmem.available / 1048576.0
            try:
                stats["proc.memory.rssMB"] = self.proc.memory_info(
                ).rss / 1048576.0
                stats["proc.memory.percent"] = self.proc.memory_percent()
                stats["proc.cpu.threads"] = self.proc.num_threads()
            except psutil.NoSuchProcess:
                pass
        return stats
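
The method above takes a single sample each time it is called; a monitor would normally call it on a timer. The sketch below shows one hypothetical way to poll the psutil-backed subset of these stats on an interval; the function names and the 30-second default are assumptions for illustration, not part of the source above.

    import threading
    import psutil

    def sample_cpu_memory():
        # Collect the psutil-backed subset of the stats gathered above.
        sysmem = psutil.virtual_memory()
        return {
            "cpu": psutil.cpu_percent(),
            "memory": sysmem.percent,
            "disk": psutil.disk_usage("/").percent,
            "proc.memory.availableMB": sysmem.available / 1048576.0,
        }

    def poll(stop_event, interval_seconds=30.0):
        # Hypothetical sampling loop: wake up every interval until stop_event is set.
        while not stop_event.wait(interval_seconds):
            print(sample_cpu_memory())

Here stop_event is a threading.Event(); setting it from another thread ends the loop cleanly.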
Example #3
    def stats(self):
        stats = {}
        for i in range(self.gpu_count):
            handle = pynvml.nvmlDeviceGetHandleByIndex(i)
            try:
                utilz = pynvml.nvmlDeviceGetUtilizationRates(handle)
                memory = pynvml.nvmlDeviceGetMemoryInfo(handle)
                temp = pynvml.nvmlDeviceGetTemperature(
                    handle, pynvml.NVML_TEMPERATURE_GPU)
                in_use_by_us = gpu_in_use_by_this_process(handle)

                stats["gpu.{}.{}".format(i, "gpu")] = utilz.gpu
                stats["gpu.{}.{}".format(i, "memory")] = utilz.memory
                stats["gpu.{}.{}".format(
                    i, "memoryAllocated")] = (memory.used /
                                              float(memory.total)) * 100
                stats["gpu.{}.{}".format(i, "temp")] = temp

                if in_use_by_us:
                    stats["gpu.process.{}.{}".format(i, "gpu")] = utilz.gpu
                    stats["gpu.process.{}.{}".format(i,
                                                     "memory")] = utilz.memory
                    stats["gpu.process.{}.{}".format(
                        i, "memoryAllocated")] = (memory.used /
                                                  float(memory.total)) * 100
                    stats["gpu.process.{}.{}".format(i, "temp")] = temp

                # Some GPUs don't provide information about power usage
                try:
                    power_watts = pynvml.nvmlDeviceGetPowerUsage(
                        handle) / 1000.0
                    power_capacity_watts = (
                        pynvml.nvmlDeviceGetEnforcedPowerLimit(handle) /
                        1000.0)
                    power_usage = (power_watts / power_capacity_watts) * 100

                    stats["gpu.{}.{}".format(i, "powerWatts")] = power_watts
                    stats["gpu.{}.{}".format(i, "powerPercent")] = power_usage

                    if in_use_by_us:
                        stats["gpu.process.{}.{}".format(
                            i, "powerWatts")] = power_watts
                        stats["gpu.process.{}.{}".format(
                            i, "powerPercent")] = power_usage

                except pynvml.NVMLError:
                    pass

            except pynvml.NVMLError:
                pass

        # On Apple M1 systems let's look for the gpu
        if (platform.system() == "Darwin" and platform.processor() == "arm"
                and self.gpu_count == 0):
            try:
                out = subprocess.check_output(
                    [util.apple_gpu_stats_binary(), "--json"])
                m1_stats = json.loads(out.split(b"\n")[0])
                stats["gpu.0.gpu"] = m1_stats["utilization"]
                stats["gpu.0.memoryAllocated"] = m1_stats["mem_used"]
                stats["gpu.0.temp"] = m1_stats["temperature"]
                stats["gpu.0.powerWatts"] = m1_stats["power"]
                stats["gpu.0.powerPercent"] = (m1_stats["power"] /
                                               M1_MAX_POWER_WATTS) * 100
                # TODO: this stat could be useful eventually, it was consistently
                # 0 in my experimentation and requires a frontend change
                # so leaving it out for now.
                # stats["gpu.0.cpuWaitMs"] = m1_stats["cpu_wait_ms"]

                if self._interface and not self._telem.env.m1_gpu:
                    self._telem.env.m1_gpu = True
                    self._interface.publish_telemetry(self._telem)

            except (OSError, ValueError, TypeError,
                    subprocess.CalledProcessError) as e:
                wandb.termwarn("GPU stats error {}".format(e))

        if psutil:
            net = psutil.net_io_counters()
            sysmem = psutil.virtual_memory()
            stats["cpu"] = psutil.cpu_percent()
            stats["memory"] = sysmem.percent
            stats["network"] = {
                "sent": net.bytes_sent - self.network_init["sent"],
                "recv": net.bytes_recv - self.network_init["recv"],
            }
            # TODO: maybe show other partitions, will likely need user to configure
            stats["disk"] = psutil.disk_usage("/").percent
            stats["proc.memory.availableMB"] = sysmem.available / 1048576.0
            try:
                stats["proc.memory.rssMB"] = self.proc.memory_info(
                ).rss / 1048576.0
                stats["proc.memory.percent"] = self.proc.memory_percent()
                stats["proc.cpu.threads"] = self.proc.num_threads()
            except psutil.NoSuchProcess:
                pass
        if self._tpu_profiler:
            stats["tpu"] = self._tpu_profiler.get_tpu_utilization()
        return stats
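
Both versions repeat the same power calculation: watts drawn divided by the enforced power limit, expressed as a percent. For reference, that calculation can be isolated into a small helper; the sketch below is an assumption-level illustration, not a function taken from the source above.

    import pynvml

    def gpu_power_stats(device_index=0):
        # NVML reports both values in milliwatts; convert to watts before dividing.
        pynvml.nvmlInit()
        handle = pynvml.nvmlDeviceGetHandleByIndex(device_index)
        try:
            power_watts = pynvml.nvmlDeviceGetPowerUsage(handle) / 1000.0
            limit_watts = pynvml.nvmlDeviceGetEnforcedPowerLimit(handle) / 1000.0
            return {"powerWatts": power_watts,
                    "powerPercent": (power_watts / limit_watts) * 100}
        except pynvml.NVMLError:
            # Some GPUs don't expose power usage; return nothing rather than fail.
            return {}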