def __init__(self):
    """Create the client registry, apply the data retention policy and start the status checker.

    Steps, in order:

    1. Build the ``Request`` helper and the empty ``_slaves`` registry.
    2. Alter the ``autogen`` retention policy of the configured InfluxDB
       database so that stored data expires after ``expiryTime`` days.
    3. Start a thread running ``self.check_status``.

    Any exception raised while talking to InfluxDB propagates to the caller;
    the temporary client is closed either way.
    """
    self.request = Request()
    # One list per attribute; presumably index i across all lists describes
    # the same registered client -- verify against check_status / the
    # registration code.
    self._slaves = {
        'ip': [],
        'port': [],
        'system': [],
        'cpu': [],
        'mem': [],
        'time': [],
        'disk': [],
        'nic': [],
        'network_speed': [],
        'disk_size': [],
        'mem_usage': [],
        'cpu_usage': [],
        'disk_usage': []
    }

    # Set the expiry time of the data stored in the database.
    database = cfg.getInflux('database')
    expiry_days = cfg.getInflux('expiryTime')
    shard_duration = cfg.getInflux('shardDuration')
    conn = influxdb.InfluxDBClient(cfg.getInflux('host'), cfg.getInflux('port'), cfg.getInflux('username'),
                                   cfg.getInflux('password'), database)
    try:
        conn.query(
            f'alter retention policy "autogen" on "{database}" duration '
            f'{expiry_days}d REPLICATION 1 SHARD DURATION {shard_duration} default;'
        )
    finally:
        # The client is only needed for this one statement; release its
        # HTTP session instead of leaving it open for the object's lifetime.
        conn.close()
    logger.info(f'设置数据过期时间为{expiry_days}天。')

    # Start a thread that checks whether the registered clients are online.
    t = threading.Thread(target=self.check_status, args=())
    t.start()
def __init__(self):
    """Read the monitoring configuration, probe the host and start monitoring.

    The constructor checks the sysstat version, copies the monitoring options
    from ``cfg`` onto the instance, initialises the hardware description
    attributes and fills them through the ``get_*`` probes, creates the task
    queue, the thread pool and the InfluxDB client, and finally calls
    ``self.monitor()``.
    """
    self.check_sysstat_version()
    self.IP = get_ip()

    # A negative pool size in the configuration is treated as 0.
    pool_size = cfg.getServer('threadPool')
    self.thread_pool = pool_size if pool_size >= 0 else 0

    # Parallel lists: port, pid, running status, monitoring start time.
    self._msg = {'port': [], 'pid': [], 'isRun': [], 'startTime': []}

    self.is_system = cfg.getMonitor('isMonSystem')  # whether to monitor the server itself
    self.error_times = cfg.getMonitor('errorTimes')
    # Options stored under the same name they have in the configuration.
    for option in ('sleepTime', 'maxCPU', 'CPUDuration', 'isCPUAlert', 'minMem', 'isMemAlert',
                   'isPidAlert', 'errorTimesOfPid', 'frequencyFGC', 'isJvmAlert', 'echo', 'isDiskAlert'):
        setattr(self, option, cfg.getMonitor(option))
    self.maxDiskUsage = cfg.getMonitor('maxDiskUsage') / 100  # configured value divided by 100
    for option in ('isTCP', 'timeSetting'):
        setattr(self, option, cfg.getMonitor(option))

    # Intervals: values below 1 are raised to 1, then the approximate running
    # time of one monitoring pass is subtracted, never going below 0.
    raw_system_interval = cfg.getMonitor('system_interval')
    raw_port_interval = cfg.getMonitor('port_interval')
    self.system_interval = max(max(raw_system_interval, 1) - 1.1, 0)
    self.port_interval = max(max(raw_port_interval, 1) - 1.03, 0)

    # Host description; the placeholders are filled in by the probes below.
    self.system_version = ''    # system version
    self.cpu_info = ''
    self.cpu_usage = 0.0        # CPU usage
    self.cpu_cores = 0          # number of CPU cores
    self.mem_usage = 0.0        # memory usage
    self.total_mem = 0          # total memory, unit: G
    self.total_mem_100 = 0      # total memory, unit: 100*G
    self.nic = ''               # network card
    self.all_disk = []          # disk names
    self.total_disk = 1         # total disk size, unit: M
    self.total_disk_h = 0       # total disk size, unit: T or G
    self.network_speed = cfg.getServer('nicSpeed')  # bandwidth

    self.get_system_version()
    self.get_cpu_cores()
    self.get_total_mem()
    self.get_system_nic()
    self.get_disks()
    self.get_system_net_speed()
    self.get_total_disk_size()

    self.monitor_task = queue.Queue()  # FIFO queue
    # Two extra workers: one for system monitoring, one for the registration service.
    self.executor = ThreadPoolExecutor(self.thread_pool + 2)
    # InfluxDB connection.
    self.client = influxdb.InfluxDBClient(cfg.getInflux('host'), cfg.getInflux('port'),
                                          cfg.getInflux('username'), cfg.getInflux('password'),
                                          cfg.getInflux('database'))

    self.FGC = {}           # full GC count
    self.FGC_time = {}      # full GC time
    self.last_cpu_io = []   # recent CPU usage values
    self.is_java = {}       # whether the monitored service is Java, 0 or 1

    self.monitor()
def draw_data_from_db(host, port=None, pid=None, startTime=None, endTime=None, system=None, disk=None):
    """
    Query monitoring data from InfluxDB and collect it into lists for visualization.

    :param host: client IP, required; used as the measurement name in every query
    :param port: port whose data is queried; optional, choose one from port, pid and system
    :param pid: accepted for interface compatibility; no query is implemented for it
    :param startTime: start time; optional, defaults to '2020-05-20 20:20:20'
    :param endTime: end time; optional, defaults to the current local time
    :param system: query whole-system data; optional, only takes effect together with ``disk``
    :param disk: disk name; optional. '-' characters are removed to build the column names
    :return: dict with 'code' (1 on success, 0 when nothing was found or an error occurred),
             'flag', 'message' and, unless an exception was raised, 'post_data'
    """
    post_data = {
        'types': 'system',
        'cpu_time': [],
        'cpu': [],
        'iowait': [],
        'mem': [],
        'mem_available': [],
        'jvm': [],
        'io_time': [],
        'io': [],
        'disk_r': [],
        'disk_w': [],
        'disk_d': [],
        'rec': [],
        'trans': [],
        'nic': [],
        'tcp': [],
        'close_wait': [],
        'time_wait': [],
        'retrans': [],
        'disk': disk
    }
    res = {'code': 1, 'flag': 1, 'message': 'Successful!'}

    connection = influxdb.InfluxDBClient(cfg.getInflux('host'), cfg.getInflux('port'), cfg.getInflux('username'),
                                         cfg.getInflux('password'), cfg.getInflux('database'))
    try:
        # Default each bound on its own. Previously a call that supplied only
        # endTime had it overwritten while startTime stayed None, which put
        # the literal text 'None' into the query.
        if not startTime:
            startTime = '2020-05-20 20:20:20'
        if not endTime:
            endTime = time.strftime('%Y-%m-%d %H:%M:%S')

        # NOTE(review): host, port, disk and the time bounds are interpolated
        # directly into InfluxQL. If they can come from an untrusted request
        # they must be validated by the caller.
        s_time = time.time()
        if port:
            sql = f"select c_time, cpu, wait_cpu, mem, tcp, jvm, rKbs, wKbs, iodelay, close_wait, time_wait from \"{host}\" " \
                  f"where time>'{startTime}' and time<'{endTime}' and type='{port}' tz('Asia/Shanghai')"
            logger.info(f'Execute sql: {sql}')
            datas = connection.query(sql)
            if datas:
                post_data['types'] = 'port'
                # (post_data key, result column) in the order they are filled.
                port_columns = (
                    ('cpu_time', 'c_time'), ('cpu', 'cpu'), ('iowait', 'wait_cpu'), ('mem', 'mem'),
                    ('tcp', 'tcp'), ('jvm', 'jvm'), ('io', 'iodelay'), ('disk_r', 'rKbs'),
                    ('disk_w', 'wKbs'), ('close_wait', 'close_wait'), ('time_wait', 'time_wait'),
                )
                for data in datas.get_points():
                    for key, column in port_columns:
                        post_data[key].append(data[column])
            else:
                res['message'] = f'The monitoring data of the port {port} is not queried, ' \
                                 f'please check the port or time setting.'
                res['code'] = 0

        # Network traffic of the host. Skipped when ``system`` is set, because
        # the system query below selects the same rec/trans/net columns and
        # running both would append every point twice.
        if disk and not system:
            sql = f"select rec, trans, net from \"{host}\" where time>'{startTime}' and time<'{endTime}' and " \
                  f"type='system' tz('Asia/Shanghai')"
            logger.info(f'Execute sql: {sql}')
            datas = connection.query(sql)
            if datas:
                for data in datas.get_points():
                    post_data['nic'].append(data['net'])
                    post_data['rec'].append(data['rec'])
                    post_data['trans'].append(data['trans'])
            else:
                res['message'] = 'No monitoring data is found, please check the disk number or time setting.'
                res['code'] = 0

        if pid:
            # Querying by pid is not implemented.
            pass

        if system and disk:
            disk_n = disk.replace('-', '')
            disk_r = disk_n + '_r'
            disk_w = disk_n + '_w'
            disk_d = disk_n + '_d'
            sql = f"select c_time, cpu, iowait, mem, mem_available, {disk_n}, {disk_r}, {disk_w}, {disk_d}, rec, trans, " \
                  f"net, tcp, retrans from \"{host}\" where time>'{startTime}' and time<'{endTime}' and " \
                  f"type='system' tz('Asia/Shanghai')"
            logger.info(f'Execute sql: {sql}')
            datas = connection.query(sql)
            if datas:
                post_data['types'] = 'system'
                system_columns = (
                    ('cpu_time', 'c_time'), ('cpu', 'cpu'), ('iowait', 'iowait'), ('mem', 'mem'),
                    ('mem_available', 'mem_available'), ('rec', 'rec'), ('trans', 'trans'), ('nic', 'net'),
                    ('io', disk_n), ('disk_r', disk_r), ('disk_w', disk_w), ('disk_d', disk_d),
                    ('tcp', 'tcp'), ('retrans', 'retrans'),
                )
                for data in datas.get_points():
                    for key, column in system_columns:
                        post_data[key].append(data[column])
            else:
                res['message'] = 'No monitoring data is found, please check the disk number or time setting.'
                res['code'] = 0

        res.update({'post_data': post_data})
        logger.info(f'Time consuming to query is {time.time() - s_time}')
    except Exception as err:
        # Report the failure to the caller through the result dict instead of raising.
        logger.error(traceback.format_exc())
        res['message'] = str(err)
        res['code'] = 0
    finally:
        # Release the client's HTTP session on success and on failure alike.
        connection.close()

    return res
def draw_data_from_db(host, port=None, pid=None, start_time=None, end_time=None, system=None, disk=None):
    """
    Read monitoring data from InfluxDB and prepare it for plotting.

    :param host: client server IP, required; used as the measurement name in every query
    :param port: port whose data is plotted; optional, mutually exclusive with pid and system
    :param pid: process id; optional, accepted but no query is implemented for it
    :param start_time: start of the plotted range; optional, defaults to '2020-05-20 20:20:20'
    :param end_time: end of the plotted range; optional, defaults to the current local time
    :param system: plot whole-system data; optional, only takes effect together with ``disk``
    :param disk: disk name whose IO is queried; optional. '-' characters are removed to
                 build the column names
    :return: dict with 'code' (1 on success, 0 when nothing was found or an error occurred),
             'flag', 'message', plus whatever ``get_lines`` returns on the non-error path
    """
    post_data = {
        'types': 'system',
        'cpu_time': [],
        'cpu': [],
        'mem': [],
        'jvm': [],
        'io_time': [],
        'io': [],
        'disk_r': [],
        'disk_w': [],
        'rec': [],
        'trans': [],
        'nic': [],
        'tcp': [],
        'close_wait': [],
        'time_wait': [],
        'retrans': [],
        'disk': disk
    }
    res = {'code': 1, 'flag': 1, 'message': '查询成功'}

    # Bound before the try so the cleanup below never touches an undefined
    # name when the client constructor itself fails.
    connection = None
    try:
        # Create the database connection.
        connection = influxdb.InfluxDBClient(cfg.getInflux('host'), cfg.getInflux('port'),
                                             cfg.getInflux('username'), cfg.getInflux('password'),
                                             cfg.getInflux('database'))

        # Default each bound on its own. Previously a call that supplied only
        # end_time had it overwritten while start_time stayed None.
        if not start_time:
            start_time = '2020-05-20 20:20:20'
        if not end_time:
            end_time = time.strftime('%Y-%m-%d %H:%M:%S')

        # NOTE(review): host, port, disk and the time bounds are interpolated
        # directly into InfluxQL. Validate them in the caller if they can come
        # from an untrusted request.
        s_time = time.time()
        if port:
            # CPU, memory, TCP and JVM data recorded for the port.
            sql = f"select cpu, mem, tcp, jvm, close_wait, time_wait from \"{host}\" where time>'{start_time}' and time<'{end_time}' and type='{port}' tz('Asia/Shanghai')"
            datas = connection.query(sql)
            if datas:
                post_data['types'] = 'port'
                for data in datas.get_points():
                    # Drop the fractional seconds and the 'T' separator from the timestamp.
                    post_data['cpu_time'].append(data['time'].split('.')[0].replace('T', ' '))
                    for key in ('cpu', 'mem', 'tcp', 'jvm', 'close_wait', 'time_wait'):
                        post_data[key].append(data[key])
            else:
                res['message'] = f'未查询到{port}端口的监控数据,请检查端口是否已监控,或者时间设置是否正确!'
                res['code'] = 0

        # Disk IO and network data of the host. Skipped when ``system`` is
        # set, because the system query below selects the same columns and
        # running both would append every point twice.
        if disk and not system:
            disk_n = disk.replace('-', '')
            disk_r = disk_n + '_r'
            disk_w = disk_n + '_w'
            sql = f"select {disk_n}, {disk_r}, {disk_w}, rec, trans, net from \"{host}\" where time>'{start_time}' and time<'{end_time}' and type='system' tz('Asia/Shanghai')"
            datas = connection.query(sql)
            if datas:
                for data in datas.get_points():
                    post_data['nic'].append(data['net'])
                    post_data['rec'].append(data['rec'])
                    post_data['trans'].append(data['trans'])
                    post_data['io'].append(data[disk_n])
                    post_data['disk_r'].append(data[disk_r])
                    post_data['disk_w'].append(data[disk_w])
            else:
                res['message'] = '未查询到监控数据,请检查磁盘号,或者时间设置!'
                res['code'] = 0

        if pid:
            # Querying by pid is not implemented.
            pass

        if system and disk:
            # Whole-system CPU, memory, disk, network and TCP data.
            disk_n = disk.replace('-', '')
            disk_r = disk_n + '_r'
            disk_w = disk_n + '_w'
            sql = f"select cpu, mem, {disk_n}, {disk_r}, {disk_w}, rec, trans, net, tcp, retrans from \"{host}\" where time>'{start_time}' and time<'{end_time}' and type='system' tz('Asia/Shanghai')"
            datas = connection.query(sql)
            if datas:
                post_data['types'] = 'system'
                system_columns = (
                    ('cpu', 'cpu'), ('mem', 'mem'), ('rec', 'rec'), ('trans', 'trans'), ('nic', 'net'),
                    ('io', disk_n), ('disk_r', disk_r), ('disk_w', disk_w), ('tcp', 'tcp'),
                    ('retrans', 'retrans'),
                )
                for data in datas.get_points():
                    post_data['cpu_time'].append(data['time'].split('.')[0].replace('T', ' '))
                    for key, column in system_columns:
                        post_data[key].append(data[column])
                # IO samples come from the same rows, so they share the CPU time axis.
                post_data['io_time'] = post_data['cpu_time']
            else:
                res['message'] = '未查询到系统监控数据,请检查磁盘号,或者时间设置!'
                res['code'] = 0

        logger.info(f'查询数据库耗时:{time.time() - s_time}')
        s_time = time.time()
        lines = get_lines(post_data)  # percentiles: 75%, 90%, 95%, 99%
        res.update(lines)
        logger.info(f'计算百分位数耗时:{time.time() - s_time}')
    except Exception as err:
        logger.error(err)
        logger.error(traceback.format_exc())
        # Store text, not the exception object, so the result stays serializable.
        res['message'] = str(err)
        res['code'] = 0
    finally:
        # Close the connection on success and on failure alike.
        if connection is not None:
            connection.close()

    return res
def __init__(self):
    """Read the monitoring configuration, probe the host and start monitoring.

    Copies the monitoring options from ``cfg`` onto the instance, initialises
    the hardware description attributes and fills them through the ``get_*``
    probes, creates the task queue, the thread pool and the InfluxDB client,
    and finally calls ``self.monitor()``.
    """
    self.IP = get_ip()

    # A negative pool size in the configuration is treated as 0.
    pool_size = cfg.getServer('threadPool')
    self.thread_pool = pool_size if pool_size >= 0 else 0

    # Parallel lists: port, pid, monitoring status, monitoring start time.
    self._msg = {'port': [], 'pid': [], 'isRun': [], 'startTime': []}

    self.is_system = cfg.getMonitor('isMonSystem')  # whether to monitor the server's resources
    self.error_duration = cfg.getMonitor('errorDuration')  # number of failed command executions
    # Options stored under the same name they have in the configuration.
    for option in ('sleepTime', 'maxCPU', 'CPUDuration', 'isCPUAlert', 'minMem', 'isMemAlert',
                   'frequencyFGC', 'isJvmAlert', 'echo', 'isDiskAlert'):
        setattr(self, option, cfg.getMonitor(option))
    self.maxDiskUsage = cfg.getMonitor('maxDiskUsage') / 100  # configured value divided by 100
    for option in ('isTCP', 'timeSetting'):
        setattr(self, option, cfg.getMonitor(option))

    # Interval between two runs of the monitoring commands. Values below 1
    # are raised to 1, then the time spent running and writing to the
    # database is subtracted.
    raw_system_interval = cfg.getMonitor('system_interval')
    raw_port_interval = cfg.getMonitor('port_interval')
    self.system_interval = max(max(raw_system_interval, 1) - 1.1, 0)
    self.port_interval = max(raw_port_interval, 1) - 0.02  # 0.02: running and DB writing time

    # Host description; the placeholders are filled in by the probes below.
    self.system_version = ''    # system version
    self.cpu_info = ''
    self.cpu_cores = 0          # number of CPU cores
    self.total_mem = 0          # total memory, unit: G
    self.total_mem_100 = 0      # total memory, unit: 100*G; used for memory ratios to save arithmetic
    self.nic = ''               # network card currently used by the system
    self.all_disk = []          # disk names
    self.total_disk = 1         # total disk size, unit: M
    self.total_disk_h = 0       # total disk size in human-readable form, unit: T or G
    self.network_speed = 1      # bandwidth of the server's network card

    self.get_system_version()
    self.get_cpu_cores()
    self.get_total_mem()
    self.get_system_nic()
    self.get_disks()
    self.get_system_net_speed()
    self.get_total_disk_size()

    self.monitor_task = queue.Queue()  # FIFO queue
    # One extra worker for monitoring the system itself.
    self.executor = ThreadPoolExecutor(self.thread_pool + 1)
    # Database connection.
    self.client = influxdb.InfluxDBClient(cfg.getInflux('host'), cfg.getInflux('port'),
                                          cfg.getInflux('username'), cfg.getInflux('password'),
                                          cfg.getInflux('database'))

    self.FGC = {}           # full GC count of each port
    self.FGC_time = {}      # time of every full GC of each port
    self.last_cpu_io = []   # CPU values of the recent period, about 100s
    self.is_java = {}       # whether the monitored port is a Java service, 0 or 1

    self.monitor()
def draw_data_from_db(host, port=None, pid=None, start_time=None, end_time=None, system=None, disk=None):
    """
    Read monitoring data from InfluxDB, draw it and compute percentile lines.

    :param host: client server IP, required; used as the measurement name in every query
    :param port: port whose data is plotted; optional, mutually exclusive with pid and system
    :param pid: process id; optional, accepted but no query is implemented for it
    :param start_time: start of the plotted range in local time; optional,
                       defaults to '2020-02-02 02:02:02'
    :param end_time: end of the plotted range in local time; optional, defaults to now
    :param system: plot whole-system CPU and memory; optional, only takes effect together
                   with ``disk``
    :param disk: disk name whose IO is queried; optional. '-' characters are removed to
                 build the column name
    :return: dict with 'code' (1 on success, 0 when nothing was found or an error occurred)
             and 'message', plus whatever ``draw`` and ``get_lines`` return on the
             non-error path
    """
    post_data = {
        'types': 'system',
        'cpu_time': [],
        'cpu': [],
        'mem': [],
        'jvm': [],
        'io_time': [],
        'io': [],
        'rec': [],
        'trans': [],
        'nic': [],
        'disk': disk
    }
    res = {'code': 1, 'message': None}

    # Bound before the try so the cleanup below never touches an undefined
    # name when the client constructor itself fails.
    connection = None
    try:
        # Create the database connection.
        connection = influxdb.InfluxDBClient(cfg.getInflux('host'), cfg.getInflux('port'),
                                             cfg.getInflux('username'), cfg.getInflux('password'),
                                             cfg.getInflux('database'))

        # Default each bound on its own before converting to UTC. Previously
        # a call that supplied only end_time ignored it and passed None to
        # local2utc for the start.
        startTime = local2utc(start_time or '2020-02-02 02:02:02')
        endTime = local2utc(end_time or time.strftime('%Y-%m-%d %H:%M:%S'))

        # NOTE(review): host, port, disk and the time bounds are interpolated
        # directly into InfluxQL. Validate them in the caller if they can come
        # from an untrusted request.
        if port:
            # CPU, memory and JVM data recorded for the port.
            sql = f"select cpu, mem, jvm from \"{host}\" where time>'{startTime}' and time<'{endTime}' and type='{port}'"
            datas = connection.query(sql)
            if datas:
                post_data['types'] = 'port'
                for data in datas.get_points():
                    post_data['cpu_time'].append(data['time'])
                    post_data['cpu'].append(data['cpu'])
                    post_data['mem'].append(data['mem'])
                    post_data['jvm'].append(data['jvm'])
            else:
                res['message'] = f'未查询到端口{port}的监控数据,请检查端口是否已监控,或者时间设置是否正确!'
                res['code'] = 0

        # Disk IO and network data of the host. Skipped when ``system`` is
        # set, because the system query below selects the same columns and
        # running both would append every point twice.
        if disk and not system:
            disk_n = disk.replace('-', '')
            sql = f"select {disk_n}, net from \"{host}\" where time>'{startTime}' and time<'{endTime}' and type='system'"
            datas = connection.query(sql)
            if datas:
                for data in datas.get_points():
                    post_data['io_time'].append(data['time'])
                    post_data['nic'].append(data['net'])
                    post_data['io'].append(float(data[disk_n]))
            else:
                res['message'] = '未查询到监控数据,请检查磁盘号,或者时间设置!'
                res['code'] = 0

        if pid:
            # Querying by pid is not implemented.
            pass

        if system and disk:
            # Whole-system CPU, memory, disk IO and network data.
            disk_n = disk.replace('-', '')
            sql = f"select cpu, mem, {disk_n}, rec, trans, net from \"{host}\" where time>'{startTime}' and time<'{endTime}' and type='system'"
            datas = connection.query(sql)
            if datas:
                post_data['types'] = 'system'
                for data in datas.get_points():
                    post_data['cpu_time'].append(data['time'])
                    post_data['cpu'].append(data['cpu'])
                    post_data['mem'].append(data['mem'])
                    post_data['rec'].append(data['rec'])
                    post_data['trans'].append(data['trans'])
                    post_data['nic'].append(data['net'])
                    post_data['io'].append(float(data[disk_n]))
                # IO samples come from the same rows, so they share the CPU time axis.
                post_data['io_time'] = post_data['cpu_time']
            else:
                res['message'] = '未查询到系统监控数据,请检查磁盘号,或者时间设置!'
                res['code'] = 0

        img = draw(post_data)  # draw the chart
        res.update(img)
        lines = get_lines(post_data['cpu'], post_data['io'], post_data['nic'])  # percentiles: 75%, 90%, 95%, 99%
        res.update(lines)
    except Exception as err:
        logger.error(err)
        # Previously the result still carried code 1 and an empty message
        # here, so callers could not tell that the query had failed.
        res['message'] = str(err)
        res['code'] = 0
    finally:
        # Close the connection on success and on failure alike.
        if connection is not None:
            connection.close()

    return res