Beispiel #1
0
    def __get_gpu_info(self):
        """Collect per-GPU summary stats and per-process GPU usage via NVML.

        Calls ``nv.nvmlInit()`` and then walks every device index reported by
        ``nv.nvmlDeviceGetCount()``.

        Returns:
            tuple: ``(sum_info, process_ls)`` where

            * ``sum_info`` is a list with one dict per GPU holding the string
              values ``gpu_idx``, ``dev_name``, ``total_mem``, ``used_mem``,
              ``gpu_util`` and ``gpu_mem_util``;
            * ``process_ls`` is a list with one dict per compute (``'C'``) or
              graphics (``'G'``) process holding ``gpu_idx``, ``dev_name``,
              ``process_pid``, ``process_type``, ``process_name``,
              ``process_gpu_mem`` and ``ctan_name`` (the result of
              ``self.get_ctan_name_by_pid``).
        """
        def parse_unit(val, scale=1000):
            """Format a byte count as '<value> <unit>', capped at GB."""
            unit_ls = ['B', 'KB', 'MB', 'GB']
            unit_lv = 0
            while val >= scale and unit_lv < len(unit_ls) - 1:
                val /= scale
                unit_lv += 1
            return '{:.2f} {}'.format(val, unit_ls[unit_lv])

        def to_text(val):
            # NVML bindings differ between releases in whether they return
            # bytes or str for names; accept both instead of assuming bytes.
            return val.decode() if isinstance(val, bytes) else val

        sum_info = []
        process_ls = []

        nv.nvmlInit()

        # Both process kinds are reported identically apart from the type tag,
        # so they are handled by one loop driven by this table.
        process_getters = (
            ('C', nv.nvmlDeviceGetComputeRunningProcesses),
            ('G', nv.nvmlDeviceGetGraphicsRunningProcesses),
        )

        # Walk every GPU.
        for gpu_idx in range(nv.nvmlDeviceGetCount()):
            h = nv.nvmlDeviceGetHandleByIndex(gpu_idx)
            dev_name = to_text(nv.nvmlDeviceGetName(h))

            # Query memory once so that total and used belong to the same
            # snapshot (and one NVML round-trip is saved).
            mem_info = nv.nvmlDeviceGetMemoryInfo(h)
            raw_total_mem = mem_info.total
            raw_used_mem = mem_info.used

            gpu_util = '{:.2f}'.format(nv.nvmlDeviceGetUtilizationRates(h).gpu)
            # Guard against a zero total so the percentage never divides by 0.
            gpu_mem_util = '{:.2f}'.format(
                raw_used_mem * 100 / raw_total_mem if raw_total_mem else 0.0)

            sum_info.append({
                'gpu_idx': str(gpu_idx),
                'dev_name': dev_name,
                'total_mem': parse_unit(raw_total_mem, 1024),
                'used_mem': parse_unit(raw_used_mem, 1024),
                'gpu_util': gpu_util,
                'gpu_mem_util': gpu_mem_util,
            })

            for process_type, get_processes in process_getters:
                for obj in get_processes(h):
                    process_pid = obj.pid
                    # usedGpuMemory may be None when NVML cannot report it;
                    # show it as 0 rather than crashing in parse_unit.
                    process_raw_gpu_mem = obj.usedGpuMemory or 0
                    process_name = to_text(
                        nv.nvmlSystemGetProcessName(process_pid))
                    ctan_name = self.get_ctan_name_by_pid(process_pid)

                    process_ls.append({
                        'gpu_idx': str(gpu_idx),
                        'dev_name': dev_name,
                        'process_pid': str(process_pid),
                        'process_type': process_type,
                        'process_name': process_name,
                        'process_gpu_mem': parse_unit(
                            process_raw_gpu_mem, 1024),
                        'ctan_name': ctan_name,
                    })
        return sum_info, process_ls
Beispiel #2
0
    def __get_ctan_verbose_stats(self, name):
        """Return one formatted resource-usage snapshot for container ``name``.

        A stats stream is created (and cached in ``self.user_stats_stream``)
        the first time a container is seen; each call pulls the next sample
        from that stream. CPU usage is computed against the totals stored in
        ``self.pre_cpu_stats[name]``, which are updated on every call. GPU
        memory is summed over the compute processes that
        ``self.get_ctan_name_by_pid`` attributes to this container.

        NOTE(review): this method does not call ``nv.nvmlInit()`` itself, so
        NVML is presumably initialised elsewhere before it runs - confirm.

        Args:
            name: Container name, used as the lookup key for
                ``self.containers``, ``self.user_stats_stream`` and
                ``self.pre_cpu_stats``.

        Returns:
            dict | None: ``None`` when the container status is not
            ``'running'``; otherwise a dict of strings with the keys ``id``,
            ``pid``, ``cpu_percent``, ``read_blk``, ``write_blk``,
            ``read_net``, ``write_net``, ``mem_usage``, ``mem_limit``,
            ``mem_usage_pcnt``, ``gpu_mem_usage``, ``gpu_mem_limit`` and
            ``gpu_mem_usage_pcnt`` (``id`` is passed through unformatted).
        """
        # Chained lookup that returns `default` as soon as one step fails.
        def graceful_chain_get(d, *args, default=None):
            t = d
            for a in args:
                try:
                    t = t[a]
                except (KeyError, ValueError, TypeError, AttributeError):
                    return default
            return t

        # CPU usage percentage relative to the previous sample.
        def calculate_cpu_percent2(d,
                                   previous_cpu_total=None,
                                   previous_cpu_system=None):
            cpu_percent = 0.0
            cpu_stats = d["cpu_stats"]
            cpu_total = float(cpu_stats["cpu_usage"]["total_usage"])
            if previous_cpu_total is None:
                previous_cpu_total = cpu_total
            cpu_delta = cpu_total - previous_cpu_total
            cpu_system = float(cpu_stats["system_cpu_usage"])
            if previous_cpu_system is None:
                previous_cpu_system = cpu_system
            system_delta = cpu_system - previous_cpu_system
            # Only fall back to counting percpu_usage when online_cpus is
            # missing or zero. Passing the len(...) as the dict.get default
            # would evaluate it eagerly and raise KeyError whenever
            # percpu_usage is absent, even if online_cpus is present.
            online_cpus = cpu_stats.get("online_cpus")
            if not online_cpus:
                online_cpus = len(
                    cpu_stats["cpu_usage"].get("percpu_usage") or ())
            if system_delta > 0.0:
                cpu_percent = (cpu_delta / system_delta) * online_cpus * 100.0
            return cpu_percent, cpu_total, cpu_system

        # Block I/O totals.
        def calculate_blkio_bytes(d):
            """
            :param d: one decoded stats sample
            :return: (read_bytes, wrote_bytes), ints
            """
            bytes_stats = graceful_chain_get(d, "blkio_stats",
                                             "io_service_bytes_recursive")
            if not bytes_stats:
                return 0, 0
            r = 0
            w = 0
            for s in bytes_stats:
                # Compare case-insensitively so "read"/"write" entries are
                # counted as well as "Read"/"Write".
                op = str(s["op"]).lower()
                if op == "read":
                    r += s["value"]
                elif op == "write":
                    w += s["value"]
            return r, w

        # Network totals across all interfaces.
        def calculate_network_bytes(d):
            """
            :param d: one decoded stats sample
            :return: (received_bytes, transmitted_bytes), ints
            """
            networks = graceful_chain_get(d, "networks")
            if not networks:
                return 0, 0
            r = 0
            t = 0
            for data in networks.values():
                r += data["rx_bytes"]
                t += data["tx_bytes"]
            return r, t

        def calculate_mem_bytes(d):
            """Return (usage, limit) taken from the sample's memory_stats."""
            mem_limit = d['memory_stats']['limit']
            mem_usage = d['memory_stats']['usage']
            return mem_usage, mem_limit

        def parse_unit(val, scale=1000):
            """Format a byte count as '<value> <unit>', capped at GB."""
            unit_ls = ['B', 'KB', 'MB', 'GB']
            unit_lv = 0
            while val >= scale and unit_lv < len(unit_ls) - 1:
                val /= scale
                unit_lv += 1
            return '{:.2f} {}'.format(val, unit_ls[unit_lv])

        def percent(part, whole):
            """Format part/whole as a percentage; 0.00 when whole is zero."""
            return '{:.2f}'.format(part / whole * 100 if whole else 0.0)

        # Look the container up once and reuse it for both the stream setup
        # and the status check.
        ctan = self.containers.get(name)
        if name not in self.user_stats_stream:
            self.user_stats_stream[name] = ctan.stats(decode=True)

        # Only running containers are sampled.
        if ctan.status != 'running':
            return None

        # Pull the next sample from the stats stream.
        raw_stats = next(self.user_stats_stream[name])
        pre_cpu_stats = self.pre_cpu_stats[name]

        # cpu
        cpu_percent, cpu_total, cpu_system = calculate_cpu_percent2(
            raw_stats, pre_cpu_stats[0], pre_cpu_stats[1])
        # Remember the totals so the next call can compute a delta.
        self.pre_cpu_stats[name] = [cpu_total, cpu_system]
        # blk
        read_blk, write_blk = calculate_blkio_bytes(raw_stats)
        # net
        read_net, write_net = calculate_network_bytes(raw_stats)
        # mem
        mem_usage, mem_limit = calculate_mem_bytes(raw_stats)

        # GPU memory used by this container's compute processes, and the
        # total memory across all GPUs.
        gpu_all_mem, gpu_used_mem = 0, 0
        for gpu_idx in range(nv.nvmlDeviceGetCount()):
            h = nv.nvmlDeviceGetHandleByIndex(gpu_idx)
            for obj in nv.nvmlDeviceGetComputeRunningProcesses(h):
                if self.get_ctan_name_by_pid(obj.pid) == name:
                    # usedGpuMemory may be None when NVML cannot report it.
                    gpu_used_mem += obj.usedGpuMemory or 0

            gpu_all_mem += nv.nvmlDeviceGetMemoryInfo(h).total

        ret_dt = {
            'id': raw_stats['id'],
            # Despite the key name, this is the value of pids_stats.current.
            'pid': str(raw_stats['pids_stats']['current']),
            'cpu_percent': '{:.2f}'.format(cpu_percent),
            'read_blk': parse_unit(read_blk),
            'write_blk': parse_unit(write_blk),
            'read_net': parse_unit(read_net),
            'write_net': parse_unit(write_net),
            'mem_usage': parse_unit(mem_usage, scale=1024),
            'mem_limit': parse_unit(mem_limit, scale=1024),
            'mem_usage_pcnt': percent(mem_usage, mem_limit),
            'gpu_mem_usage': parse_unit(gpu_used_mem, 1024),
            'gpu_mem_limit': parse_unit(gpu_all_mem, 1024),
            'gpu_mem_usage_pcnt': percent(gpu_used_mem, gpu_all_mem),
        }

        return ret_dt