Example #1
0
    def __init__(self):
        """
        Initialise the client registry, apply the InfluxDB retention policy
        and start the background status check.

        The retention duration (``expiryTime``, in days) and the shard duration
        (``shardDuration``) are read from the configuration and applied to the
        "autogen" retention policy of the configured database.
        """
        self.request = Request()
        # One list per attribute of the registered clients.
        # NOTE(review): presumably index-aligned across keys (entry i of every
        # list describes the same client) - confirm against the code that fills them.
        self._slaves = {
            'ip': [],
            'port': [],
            'system': [],
            'cpu': [],
            'mem': [],
            'time': [],
            'disk': [],
            'nic': [],
            'network_speed': [],
            'disk_size': [],
            'mem_usage': [],
            'cpu_usage': [],
            'disk_usage': []
        }

        # Set the data expiry time (retention policy) of the database
        database = cfg.getInflux('database')
        expiry_days = cfg.getInflux('expiryTime')
        conn = influxdb.InfluxDBClient(cfg.getInflux('host'),
                                       cfg.getInflux('port'),
                                       cfg.getInflux('username'),
                                       cfg.getInflux('password'),
                                       database)
        try:
            conn.query(
                f'alter retention policy "autogen" on "{database}" duration '
                f'{expiry_days}d REPLICATION 1 SHARD DURATION {cfg.getInflux("shardDuration")} default;'
            )
        finally:
            # This connection is only used for the statement above, so release
            # it instead of leaving it open for the lifetime of the process.
            conn.close()
        logger.info(f'设置数据过期时间为{expiry_days}天。')

        # Start a thread that checks whether the registered clients are online
        t = threading.Thread(target=self.check_status, args=())
        t.start()
Example #2
0
    def __init__(self):
        """
        Read the monitoring configuration, initialise the host-description
        attributes and start monitoring.

        The steps run in this order: sysstat version check, configuration
        reads, the ``get_*`` helpers, creation of the task queue / thread pool /
        InfluxDB client, and finally ``self.monitor()``.
        """
        self.check_sysstat_version()
        self.IP = get_ip()

        # A negative thread pool size in the configuration is treated as 0
        if cfg.getServer('threadPool') >= 0:
            self.thread_pool = cfg.getServer('threadPool')
        else:
            self.thread_pool = 0

        # port, pid, status, startTime
        self._msg = {key: [] for key in ('port', 'pid', 'isRun', 'startTime')}

        monitor_cfg = cfg.getMonitor
        self.is_system = monitor_cfg('isMonSystem')  # Whether to monitor the server system
        self.error_times = monitor_cfg('errorTimes')
        self.sleepTime = monitor_cfg('sleepTime')
        self.maxCPU = monitor_cfg('maxCPU')
        self.CPUDuration = monitor_cfg('CPUDuration')
        self.isCPUAlert = monitor_cfg('isCPUAlert')
        self.minMem = monitor_cfg('minMem')
        self.isMemAlert = monitor_cfg('isMemAlert')
        self.isPidAlert = monitor_cfg('isPidAlert')
        self.errorTimesOfPid = monitor_cfg('errorTimesOfPid')
        self.frequencyFGC = monitor_cfg('frequencyFGC')
        self.isJvmAlert = monitor_cfg('isJvmAlert')
        self.echo = monitor_cfg('echo')
        self.isDiskAlert = monitor_cfg('isDiskAlert')
        # Configured value is divided by 100 before it is stored
        self.maxDiskUsage = monitor_cfg('maxDiskUsage') / 100
        self.isTCP = monitor_cfg('isTCP')
        self.timeSetting = monitor_cfg('timeSetting')

        configured_system_interval = monitor_cfg('system_interval')
        configured_port_interval = monitor_cfg('port_interval')
        # Intervals below 1 are raised to 1; the program running time
        # (1.1 for system, 1.03 for port) is then subtracted and the result
        # is never allowed to drop below 0.
        self.system_interval = max(max(configured_system_interval, 1) - 1.1, 0)
        self.port_interval = max(max(configured_port_interval, 1) - 1.03, 0)

        # Placeholders for the host description; the get_* calls below run
        # after these are set (presumably they fill them in - confirm).
        self.system_version = ''  # system version
        self.cpu_info = ''
        self.cpu_usage = 0.0  # CPU usage
        self.cpu_cores = 0  # number of CPU cores
        self.mem_usage = 0.0  # memory usage
        self.total_mem = 0  # total memory, unit: G
        self.total_mem_100 = 0  # total memory, unit: 100*G
        self.nic = ''  # network card
        self.all_disk = []  # disk number
        self.total_disk = 1  # total disk size, unit: M
        self.total_disk_h = 0  # total disk size, unit: T or G
        self.network_speed = cfg.getServer('nicSpeed')  # bandwidth

        self.get_system_version()
        self.get_cpu_cores()
        self.get_total_mem()
        self.get_system_nic()
        self.get_disks()
        self.get_system_net_speed()
        self.get_total_disk_size()

        # FIFO queue of monitoring tasks
        self.monitor_task = queue.Queue()
        # Thread pool; the extra 2 workers are needed for monitoring the
        # system and for the registration service.
        worker_count = self.thread_pool + 2
        self.executor = ThreadPoolExecutor(worker_count)
        # InfluxDB connection
        self.client = influxdb.InfluxDBClient(cfg.getInflux('host'),
                                              cfg.getInflux('port'),
                                              cfg.getInflux('username'),
                                              cfg.getInflux('password'),
                                              cfg.getInflux('database'))

        self.FGC = {}  # full gc times
        self.FGC_time = {}  # full gc time
        self.last_cpu_io = []  # recent cpu usage
        self.is_java = {}  # whether it is java, 0 or 1

        self.monitor()
def draw_data_from_db(host,
                      port=None,
                      pid=None,
                      startTime=None,
                      endTime=None,
                      system=None,
                      disk=None):
    """
    Get monitoring data from InfluxDB and package it for visualization.

    :param host: client IP, required; used as the measurement name in the queries
    :param port: port, visualize port data; optional, choose one from port, pid and system
    :param pid: pid, visualize pid data; optional, choose one from port, pid and system
                (currently not implemented - the branch is a no-op)
    :param startTime: start time; optional, defaults to '2020-05-20 20:20:20'
    :param endTime: end time; optional, defaults to the current local time
    :param system: visualize system data; optional, choose one from port, pid and system
    :param disk: disk number; optional, but required together with ``system``
    :return: dict with 'code' (1 on success, 0 when no data was found or an
             error occurred), 'flag', 'message' and - unless an exception was
             raised - 'post_data' holding the queried series
    """
    post_data = {
        'types': 'system',
        'cpu_time': [],
        'cpu': [],
        'iowait': [],
        'mem': [],
        'mem_available': [],
        'jvm': [],
        'io_time': [],
        'io': [],
        'disk_r': [],
        'disk_w': [],
        'disk_d': [],
        'rec': [],
        'trans': [],
        'nic': [],
        'tcp': [],
        'close_wait': [],
        'time_wait': [],
        'retrans': [],
        'disk': disk
    }

    res = {'code': 1, 'flag': 1, 'message': 'Successful!'}

    connection = influxdb.InfluxDBClient(cfg.getInflux('host'),
                                         cfg.getInflux('port'),
                                         cfg.getInflux('username'),
                                         cfg.getInflux('password'),
                                         cfg.getInflux('database'))

    try:
        # Fill in each missing bound independently. Previously a call with only
        # endTime overwrote endTime and left startTime as None, which produced
        # the invalid condition time>'None'.
        if not startTime:
            startTime = '2020-05-20 20:20:20'
        if not endTime:
            endTime = time.strftime('%Y-%m-%d %H:%M:%S')

        # NOTE(review): host, port, disk and the time bounds are interpolated
        # directly into the query text below; callers must pass trusted values.
        s_time = time.time()
        if port:
            sql = f"select c_time, cpu, wait_cpu, mem, tcp, jvm, rKbs, wKbs, iodelay, close_wait, time_wait from \"{host}\" " \
                  f"where time>'{startTime}' and time<'{endTime}' and type='{port}' tz('Asia/Shanghai')"
            logger.info(f'Execute sql: {sql}')
            datas = connection.query(sql)
            if datas:
                post_data['types'] = 'port'
                for data in datas.get_points():
                    post_data['cpu_time'].append(data['c_time'])
                    post_data['cpu'].append(data['cpu'])
                    post_data['iowait'].append(data['wait_cpu'])
                    post_data['mem'].append(data['mem'])
                    post_data['tcp'].append(data['tcp'])
                    post_data['jvm'].append(data['jvm'])
                    post_data['io'].append(data['iodelay'])
                    post_data['disk_r'].append(data['rKbs'])
                    post_data['disk_w'].append(data['wKbs'])
                    post_data['close_wait'].append(data['close_wait'])
                    post_data['time_wait'].append(data['time_wait'])
            else:
                res['message'] = f'The monitoring data of the port {port} is not queried, ' \
                                 f'please check the port or time setting.'
                res['code'] = 0

            if disk:
                # Network figures come from the 'system' rows of the same measurement
                sql = f"select rec, trans, net from \"{host}\" where time>'{startTime}' and time<'{endTime}' and " \
                      f"type='system' tz('Asia/Shanghai')"
                logger.info(f'Execute sql: {sql}')
                datas = connection.query(sql)
                if datas:
                    for data in datas.get_points():
                        post_data['nic'].append(data['net'])
                        post_data['rec'].append(data['rec'])
                        post_data['trans'].append(data['trans'])
                else:
                    res['message'] = 'No monitoring data is found, please check the disk number or time setting.'
                    res['code'] = 0

        if pid:
            # Querying by pid is not implemented
            pass

        if system and disk:
            # Field names of the per-disk series are derived from the disk
            # number with '-' removed, plus the suffixes _r, _w and _d.
            disk_n = disk.replace('-', '')
            disk_r = disk_n + '_r'
            disk_w = disk_n + '_w'
            disk_d = disk_n + '_d'
            sql = f"select c_time, cpu, iowait, mem, mem_available, {disk_n}, {disk_r}, {disk_w}, {disk_d}, rec, trans, " \
                  f"net, tcp, retrans from \"{host}\" where time>'{startTime}' and time<'{endTime}' and " \
                  f"type='system' tz('Asia/Shanghai')"
            logger.info(f'Execute sql: {sql}')
            datas = connection.query(sql)
            if datas:
                post_data['types'] = 'system'
                for data in datas.get_points():
                    post_data['cpu_time'].append(data['c_time'])
                    post_data['cpu'].append(data['cpu'])
                    post_data['iowait'].append(data['iowait'])
                    post_data['mem'].append(data['mem'])
                    post_data['mem_available'].append(data['mem_available'])
                    post_data['rec'].append(data['rec'])
                    post_data['trans'].append(data['trans'])
                    post_data['nic'].append(data['net'])
                    post_data['io'].append(data[disk_n])
                    post_data['disk_r'].append(data[disk_r])
                    post_data['disk_w'].append(data[disk_w])
                    post_data['disk_d'].append(data[disk_d])
                    post_data['tcp'].append(data['tcp'])
                    post_data['retrans'].append(data['retrans'])

            else:
                res['message'] = 'No monitoring data is found, please check the disk number or time setting.'
                res['code'] = 0

        res.update({'post_data': post_data})
        logger.info(f'Time consuming to query is {time.time() - s_time}')

    except Exception as err:
        # Report the failure to the caller through the result dict instead of raising
        logger.error(traceback.format_exc())
        res['message'] = str(err)
        res['code'] = 0

    finally:
        # Release the connection on every path; `del` alone only drops the name
        connection.close()

    return res
Example #4
0
def draw_data_from_db(host,
                      port=None,
                      pid=None,
                      start_time=None,
                      end_time=None,
                      system=None,
                      disk=None):
    """
    Read monitoring data from InfluxDB and prepare it for plotting.

    :param host: IP of the client server, required; used as the measurement name
    :param port: port number, i.e. plot the data of this port; optional,
                 mutually exclusive with pid and system (choose one of the three)
    :param pid: process id, i.e. plot the data of this process; optional,
                mutually exclusive with port and system (currently not implemented)
    :param start_time: start of the plotted time range; optional,
                       defaults to '2020-05-20 20:20:20'
    :param end_time: end of the plotted time range; optional, defaults to the
                     current local time
    :param system: plot CPU and memory of the whole system; optional,
                   mutually exclusive with port and pid
    :param disk: disk number whose IO is shown; optional
    :return: dict with 'code' (1 on success, 0 when no data was found or an
             error occurred), 'flag', 'message', merged with the result of
             ``get_lines`` on success
    """
    post_data = {
        'types': 'system',
        'cpu_time': [],
        'cpu': [],
        'mem': [],
        'jvm': [],
        'io_time': [],
        'io': [],
        'disk_r': [],
        'disk_w': [],
        'rec': [],
        'trans': [],
        'nic': [],
        'tcp': [],
        'close_wait': [],
        'time_wait': [],
        'retrans': [],
        'disk': disk
    }

    res = {'code': 1, 'flag': 1, 'message': '查询成功'}

    # Bound before the try block so the cleanup below never touches an
    # undefined name, even if creating the client fails.
    connection = None
    try:
        # Create the database connection
        connection = influxdb.InfluxDBClient(
            cfg.getInflux('host'), cfg.getInflux('port'),
            cfg.getInflux('username'), cfg.getInflux('password'),
            cfg.getInflux('database'))

        # Fill in each missing bound independently. Previously a call with only
        # end_time overwrote end_time and left start_time as None.
        if not start_time:
            start_time = '2020-05-20 20:20:20'
        if not end_time:
            end_time = time.strftime('%Y-%m-%d %H:%M:%S')

        # NOTE(review): host, port, disk and the time bounds are interpolated
        # directly into the query text below; callers must pass trusted values.
        s_time = time.time()
        if port:  # CPU usage, memory, tcp and jvm data of the given port
            sql = f"select cpu, mem, tcp, jvm, close_wait, time_wait from \"{host}\" where time>'{start_time}' and time<'{end_time}' and type='{port}' tz('Asia/Shanghai')"
            datas = connection.query(sql)
            if datas:
                post_data['types'] = 'port'
                for data in datas.get_points():
                    # Drop the fractional part and the 'T' separator of the timestamp
                    post_data['cpu_time'].append(
                        data['time'].split('.')[0].replace('T', ' '))
                    post_data['cpu'].append(data['cpu'])
                    post_data['mem'].append(data['mem'])
                    post_data['tcp'].append(data['tcp'])
                    post_data['jvm'].append(data['jvm'])
                    post_data['close_wait'].append(data['close_wait'])
                    post_data['time_wait'].append(data['time_wait'])
            else:
                res['message'] = f'未查询到{port}端口的监控数据,请检查端口是否已监控,或者时间设置是否正确!'
                res['code'] = 0

            if disk:  # disk IO and network data from the 'system' rows
                # NOTE(review): 'io_time' is not filled in this branch, only in
                # the system branch below - confirm whether that is intended.
                disk_n = disk.replace('-', '')
                disk_r = disk_n + '_r'
                disk_w = disk_n + '_w'
                sql = f"select {disk_n}, {disk_r}, {disk_w}, rec, trans, net from \"{host}\" where time>'{start_time}' and time<'{end_time}' and type='system' tz('Asia/Shanghai')"
                datas = connection.query(sql)
                if datas:
                    for data in datas.get_points():
                        post_data['nic'].append(data['net'])
                        post_data['rec'].append(data['rec'])
                        post_data['trans'].append(data['trans'])
                        post_data['io'].append(data[disk_n])
                        post_data['disk_r'].append(data[disk_r])
                        post_data['disk_w'].append(data[disk_w])
                else:
                    res['message'] = '未查询到监控数据,请检查磁盘号,或者时间设置!'
                    res['code'] = 0

        if pid:  # querying by process id is not implemented
            pass

        if system and disk:  # CPU, memory, disk and network data of the whole system
            disk_n = disk.replace('-', '')
            disk_r = disk_n + '_r'
            disk_w = disk_n + '_w'
            sql = f"select cpu, mem, {disk_n}, {disk_r}, {disk_w}, rec, trans, net, tcp, retrans from \"{host}\" where time>'{start_time}' and time<'{end_time}' and type='system' tz('Asia/Shanghai')"
            datas = connection.query(sql)
            if datas:
                post_data['types'] = 'system'
                for data in datas.get_points():
                    post_data['cpu_time'].append(
                        data['time'].split('.')[0].replace('T', ' '))
                    post_data['cpu'].append(data['cpu'])
                    post_data['mem'].append(data['mem'])
                    post_data['rec'].append(data['rec'])
                    post_data['trans'].append(data['trans'])
                    post_data['nic'].append(data['net'])
                    post_data['io'].append(data[disk_n])
                    post_data['disk_r'].append(data[disk_r])
                    post_data['disk_w'].append(data[disk_w])
                    post_data['tcp'].append(data['tcp'])
                    post_data['retrans'].append(data['retrans'])

                # All series of one row share the same timestamp
                post_data['io_time'] = post_data['cpu_time']
            else:
                res['message'] = '未查询到系统监控数据,请检查磁盘号,或者时间设置!'
                res['code'] = 0

        logger.info(f'查询数据库耗时:{time.time() - s_time}')

        s_time = time.time()
        lines = get_lines(post_data)  # compute percentiles: 75%, 90%, 95%, 99%
        res.update(lines)
        logger.info(f'计算百分位数耗时:{time.time() - s_time}')

    except Exception as err:
        logger.error(err)
        logger.error(traceback.format_exc())
        # Store text rather than the exception object so the result stays serialisable
        res['message'] = str(err)
        res['code'] = 0

    finally:
        # Close the connection on the success and the error path alike
        if connection is not None:
            connection.close()

    return res
Example #5
0
    def __init__(self):
        """
        Read the monitoring configuration, initialise the host-description
        attributes and start monitoring.

        The steps run in this order: configuration reads, the ``get_*``
        helpers, creation of the task queue / thread pool / InfluxDB client,
        and finally ``self.monitor()``.
        """
        self.IP = get_ip()

        # A negative thread pool size in the configuration is treated as 0
        if cfg.getServer('threadPool') >= 0:
            self.thread_pool = cfg.getServer('threadPool')
        else:
            self.thread_pool = 0

        # port number, process id, monitoring status, monitoring start time
        self._msg = {key: [] for key in ('port', 'pid', 'isRun', 'startTime')}

        monitor_cfg = cfg.getMonitor
        self.is_system = monitor_cfg('isMonSystem')  # whether to monitor the server's resources
        self.error_duration = monitor_cfg('errorDuration')  # number of failed command executions
        self.sleepTime = monitor_cfg('sleepTime')
        self.maxCPU = monitor_cfg('maxCPU')
        self.CPUDuration = monitor_cfg('CPUDuration')
        self.isCPUAlert = monitor_cfg('isCPUAlert')
        self.minMem = monitor_cfg('minMem')
        self.isMemAlert = monitor_cfg('isMemAlert')
        self.frequencyFGC = monitor_cfg('frequencyFGC')
        self.isJvmAlert = monitor_cfg('isJvmAlert')
        self.echo = monitor_cfg('echo')
        self.isDiskAlert = monitor_cfg('isDiskAlert')
        # Configured value is divided by 100 before it is stored
        self.maxDiskUsage = monitor_cfg('maxDiskUsage') / 100
        self.isTCP = monitor_cfg('isTCP')
        self.timeSetting = monitor_cfg('timeSetting')

        # Interval between two executions of the monitoring commands
        configured_system_interval = monitor_cfg('system_interval')
        configured_port_interval = monitor_cfg('port_interval')
        # Values below 1 are raised to 1. For the system interval 1.1 (program
        # running and database write time) is subtracted and the result is kept
        # at 0 or above; for the port interval 0.02 is subtracted.
        self.system_interval = max(max(configured_system_interval, 1) - 1.1, 0)
        self.port_interval = max(configured_port_interval, 1) - 0.02

        # Placeholders for the host description; the get_* calls below run
        # after these are set (presumably they fill them in - confirm).
        self.system_version = ''  # system version
        self.cpu_info = ''
        self.cpu_cores = 0  # number of CPU cores
        self.total_mem = 0  # total memory, unit: G
        self.total_mem_100 = 0  # total memory, unit: 100*G; used for the memory ratio to reduce computation
        self.nic = ''  # network card currently used by the system
        self.all_disk = []  # disk numbers
        self.total_disk = 1  # total disk size, unit: M
        self.total_disk_h = 0  # total disk size in human-readable form, unit: T or G
        self.network_speed = 1  # bandwidth of the server's network card

        self.get_system_version()
        self.get_cpu_cores()
        self.get_total_mem()
        self.get_system_nic()
        self.get_disks()
        self.get_system_net_speed()
        self.get_total_disk_size()

        # FIFO queue of monitoring tasks
        self.monitor_task = queue.Queue()
        # Thread pool; the extra worker is needed for monitoring the system
        worker_count = self.thread_pool + 1
        self.executor = ThreadPoolExecutor(worker_count)
        # Database connection
        self.client = influxdb.InfluxDBClient(cfg.getInflux('host'),
                                              cfg.getInflux('port'),
                                              cfg.getInflux('username'),
                                              cfg.getInflux('password'),
                                              cfg.getInflux('database'))

        self.FGC = {}  # number of full gc per port
        self.FGC_time = {}  # time of every full gc per port
        self.last_cpu_io = []  # cpu values of the recent period, about 100s
        self.is_java = {}  # whether the monitored port is a java service, 0 or 1

        self.monitor()
def draw_data_from_db(host,
                      port=None,
                      pid=None,
                      start_time=None,
                      end_time=None,
                      system=None,
                      disk=None):
    """
    Read monitoring data from InfluxDB and draw it.

    :param host: IP of the client server, required; used as the measurement name
    :param port: port number, i.e. plot the data of this port; optional,
                 mutually exclusive with pid and system (choose one of the three)
    :param pid: process id, i.e. plot the data of this process; optional,
                mutually exclusive with port and system (currently not implemented)
    :param start_time: start of the plotted time range; optional,
                       defaults to '2020-02-02 02:02:02'
    :param end_time: end of the plotted time range; optional, defaults to the
                     current local time
    :param system: plot CPU and memory of the whole system; optional,
                   mutually exclusive with port and pid
    :param disk: disk number whose IO is shown; optional
    :return: dict with 'code' (1 on success, 0 when no data was found or an
             error occurred) and 'message', merged with the results of
             ``draw`` and ``get_lines`` on success
    """
    post_data = {
        'types': 'system',
        'cpu_time': [],
        'cpu': [],
        'mem': [],
        'jvm': [],
        'io_time': [],
        'io': [],
        'rec': [],
        'trans': [],
        'nic': [],
        'disk': disk
    }

    res = {'code': 1, 'message': None}

    # Bound before the try block so the cleanup below never touches an
    # undefined name, even if creating the client fails.
    connection = None
    try:
        # Create the database connection
        connection = influxdb.InfluxDBClient(
            cfg.getInflux('host'), cfg.getInflux('port'),
            cfg.getInflux('username'), cfg.getInflux('password'),
            cfg.getInflux('database'))

        # Fill in each missing bound independently before converting with
        # local2utc. Previously a call with only end_time passed None to
        # local2utc and ignored the given end_time.
        startTime = local2utc(start_time or '2020-02-02 02:02:02')
        endTime = local2utc(end_time or time.strftime('%Y-%m-%d %H:%M:%S'))

        # NOTE(review): host, port, disk and the time bounds are interpolated
        # directly into the query text below; callers must pass trusted values.
        if port:  # CPU usage, memory and jvm data of the given port
            sql = f"select cpu, mem, jvm from \"{host}\" where time>'{startTime}' and time<'{endTime}' and type='{port}'"
            datas = connection.query(sql)
            if datas:
                post_data['types'] = 'port'
                for data in datas.get_points():
                    post_data['cpu_time'].append(data['time'])
                    post_data['cpu'].append(data['cpu'])
                    post_data['mem'].append(data['mem'])
                    post_data['jvm'].append(data['jvm'])
            else:
                res['message'] = f'未查询到端口{port}的监控数据,请检查端口是否已监控,或者时间设置是否正确!'
                res['code'] = 0

            if disk:  # disk IO and network data from the 'system' rows
                disk_n = disk.replace('-', '')
                sql = f"select {disk_n}, net from \"{host}\" where time>'{startTime}' and time<'{endTime}' and type='system'"
                datas = connection.query(sql)
                if datas:
                    for data in datas.get_points():
                        post_data['io_time'].append(data['time'])
                        post_data['nic'].append(data['net'])
                        post_data['io'].append(float(data[disk_n]))
                else:
                    res['message'] = '未查询到监控数据,请检查磁盘号,或者时间设置!'
                    res['code'] = 0

        if pid:  # querying by process id is not implemented
            pass

        if system and disk:  # CPU, memory, disk and network data of the whole system
            disk_n = disk.replace('-', '')
            sql = f"select cpu, mem, {disk_n}, rec, trans, net from \"{host}\" where time>'{startTime}' and time<'{endTime}' and type='system'"
            datas = connection.query(sql)
            if datas:
                post_data['types'] = 'system'
                for data in datas.get_points():
                    post_data['cpu_time'].append(data['time'])
                    post_data['cpu'].append(data['cpu'])
                    post_data['mem'].append(data['mem'])
                    post_data['rec'].append(data['rec'])
                    post_data['trans'].append(data['trans'])
                    post_data['nic'].append(data['net'])
                    post_data['io'].append(float(data[disk_n]))

                # All series of one row share the same timestamp
                post_data['io_time'] = post_data['cpu_time']
            else:
                res['message'] = '未查询到系统监控数据,请检查磁盘号,或者时间设置!'
                res['code'] = 0

        img = draw(post_data)  # draw the chart
        res.update(img)

        lines = get_lines(post_data['cpu'], post_data['io'],
                          post_data['nic'])  # compute percentiles: 75%, 90%, 95%, 99%
        res.update(lines)

    except Exception as err:
        logger.error(err)
        # Previously the result still carried code 1 and message None here,
        # so callers could not tell a failure from a success.
        res['message'] = str(err)
        res['code'] = 0

    finally:
        # Close the connection on the success and the error path alike
        if connection is not None:
            connection.close()

    return res