def core_file_info(self, info):
    """Check a server for core dump files under /home/trade.

    Runs ``find`` through ``self.sshExecCmd`` and stores the outcome in
    ``self.single_info_verify``: True when no file is reported, False
    otherwise. Every reported path is written to the error log, and a
    single SMS alert is sent for the host.

    Args:
        info: Server record; only ``info[0]`` (the host IP, used as a
            string in the messages) is read here.
    """
    # The pattern is quoted so the remote shell passes it to find verbatim;
    # unquoted, "core.*" would be glob-expanded in the login directory first.
    command = 'find /home/trade -name "core.*"'
    hostip = info[0]
    logger.info("command: " + command)
    sshRes = self.sshExecCmd(command)

    # One entry per reported path; blank lines are not findings.
    core_files = [
        line.strip() for line in ''.join(sshRes).split('\n') if line.strip()
    ]
    if not core_files:
        self.single_info_verify = True
        msg = "OK: Server %s The count of core file is 0 " % str(hostip)
        logger.info(msg)
    else:
        self.single_info_verify = False
        logger.debug("sshResList: %s", core_files)
        for core_file in core_files:
            msg = "error: " + hostip + " Have core file:" + core_file
            ct.write_log(error_log_file, msg)
            logger.warning(msg)
        # One alert per host, regardless of how many files were found.
        sms_msg = "error: " + hostip + " 有core文件,请检查服务器文件"
        logger.error(sms_msg)
        ct.send_sms_control("core", sms_msg)
    msg = "core file Check Result: " + str(self.single_info_verify)
    logger.info(msg)
def get_db_records(info):
    """Run the configured query and export its rows to a dated CSV file.

    The target file is ``log_dir + <table> + "_" + ndates + ".csv"``. Any
    existing file of that name is emptied before the query runs. On
    success the column names taken from the cursor description are written
    as the header row, followed by every record.

    Args:
        info: Connection/config record; ``info[4]`` is the table name used
            in the file name and ``info[5]`` is the SQL text. The whole
            record is also passed to ``mt.fetchall_sql``.
    """
    tablename = info[4]
    sql = info[5]
    records_file = log_dir + tablename + "_" + ndates + '.csv'
    # Empty a previous export up front so a failed query does not leave
    # stale rows behind.
    if os.path.exists(records_file):
        with open(records_file, "w"):
            pass

    (res, des) = mt.fetchall_sql(info, sql)
    if res is None or res == []:
        msg = "Failed to get records"
        logger.error(msg)
        ct.write_log(log_file, msg)
        return

    logger.debug(res)
    # First element of each description entry is the column name.
    db_columns = [column[0] for column in des]
    logger.debug(db_columns)
    # newline='' lets the csv module control line endings itself; without
    # it, Windows output gets an empty line after every row.
    with open(records_file, mode='w', encoding='utf-8', newline='') as f:
        writer = csv.writer(f, dialect='excel')
        writer.writerow(db_columns)
        for item in res:
            logger.debug(item)
            writer.writerow(item)
def non_trade_ps_info(self, info):
    """Verify that none of the configured processes is running.

    Greps the ``ps -elf`` listing of the configured user for the process
    pattern. ``self.ps_info_verify`` becomes True when nothing matches;
    otherwise it becomes False, every match is written to the error log
    and one SMS listing all matching command lines is sent.

    Args:
        info: Server record; ``info[0]`` is the host IP, ``info[2]`` the
            user name passed to ``ps -u`` and ``info[5]`` the ``grep -E``
            pattern of process names.
    """
    hostip = info[0]
    username = info[2]
    processes = info[5]
    # NOTE(review): username and processes are spliced into a shell command
    # unescaped; this assumes both come from trusted configuration.
    command = 'ps -u ' + username + ' -elf | grep -E "' + processes + '" | grep -v grep'
    logger.info("command: " + command)
    sshRes = self.sshExecCmd(command)

    # One list of whitespace-separated fields per non-blank output line.
    rows = [
        line.split() for line in ''.join(sshRes).split('\n') if line.strip()
    ]
    if not rows:
        self.ps_info_verify = True
        msg = "OK: Server %s The count of the processes is 0 " % str(hostip)
        logger.info(msg)
    else:
        self.ps_info_verify = False
        ps_list = []
        for fields in rows:
            # Fields from index 14 on are reported as the command line and
            # field 13 as the time. Short rows fall back to empty values
            # instead of raising IndexError.
            psstr = ' '.join(fields[14:])
            ps_time = fields[13] if len(fields) > 13 else ""
            logger.info("ps:" + psstr)
            ct.write_log(error_log_file, psstr)
            msg = "error:" + hostip + " ::The process is " + psstr + ":: Time: " + str(
                ps_time) + " is still working!"
            ct.write_log(error_log_file, msg)
            logger.warning(msg)
            ps_list.append(psstr)
        ps_cmd = ';'.join(ps_list)
        sms_msg = "error:" + hostip + " ::Processes : " + ps_cmd + " is still working!"
        ct.send_sms_control("ps_port", sms_msg)
    msg = "ps Processes Check Result: " + str(self.ps_info_verify)
    logger.info(msg)
def get_query_data(linuxInfo):
    """Run one configured command per server and append its output to the
    query result file.

    For every record a banner line (timestamp, host, command) is written,
    the command is executed over SSH, and each output line is appended
    stripped of surrounding whitespace. Write failures are logged and do
    not stop the remaining servers. All handlers are detached from the
    root logger before returning.

    Args:
        linuxInfo: Iterable of server records; indices 0-3 are host IP,
            port, user name and password, index 5 is the command to run.
    """
    logger = logging.getLogger()
    yaml_path = './config/non_trade_monitor_logger.yaml'
    ct.setup_logging(yaml_path)

    for info in linuxInfo:
        hostip, port, username, password = info[0], info[1], info[2], info[3]
        command = info[5]
        cur_time = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
        temstr = "**********" + cur_time + "::" + hostip + "::" + command + "::**********"
        ct.write_file(query_result_file, temstr)

        sshClient = ct.sshConnect(hostip, port, username, password)
        try:
            sshRes = ct.sshExecCmd(sshClient, command)
            logger.info(hostip + "::" + command)
            try:
                for item in sshRes:
                    ct.write_file(query_result_file, item.strip())
            except Exception as e:
                msg = "write failed: [hostip:%s];[username:%s];[error:%s]" % (
                    hostip, username, str(e))
                logger.error(msg)
                ct.write_log(log_file, msg)
        finally:
            # Close the connection even when the command itself raises.
            ct.sshClose(sshClient)

    logger.info("get_query_data finished")
    # Iterate over a copy: removing from logger.handlers while looping over
    # it skips every other handler.
    for handler in logger.handlers[:]:
        logger.removeHandler(handler)
def non_trade_mem_info(self, info):
    """Check memory usage of a server from /proc/meminfo.

    Two figures are derived from each sample:

    * BuffersCachedRate = 100 * (Buffers + Cached) / MemTotal
    * Rate_Mem = 100 * (MemTotal - MemFree - Buffers - Cached) / MemTotal

    When ``ct.time_check('08:45', '15:30')`` is true the BuffersCachedRate
    is not judged. Otherwise a rate of 50 or more triggers a remote cache
    drop followed by a second sample, and an alert is raised only if the
    rate is still 50 or more. Rate_Mem of 80 or more always raises an
    alert. The result is stored in ``self.mem_info_verify``.

    Args:
        info: Server record; only ``info[0]`` (host IP) is read here.
    """
    command = 'cat /proc/meminfo'
    hostip = info[0]
    titlename = "MemTotal,MemFree,MemAvailable,Buffers,Cached,SwapCached,SwapTotal,SwapFree,BuffersCachedRate,Rate_Mem"
    reported = ("MemTotal", "MemFree", "MemAvailable", "Buffers", "Cached",
                "SwapCached", "SwapTotal", "SwapFree")
    required = ("MemTotal", "MemFree", "Buffers", "Cached")

    def take_sample(banner):
        """Read meminfo once, record it, return (BuffersCachedRate, Rate_Mem) or None."""
        sshRes = self.sshExecCmd(command)
        # Fields are looked up by name, not by position, because the number
        # and order of meminfo lines is not the same on every kernel.
        fields = {
            name: int(value)
            for name, value in re.findall(r"([\w()]+):[ \t]*(\d+)[ \t]+kB",
                                          "\n".join(sshRes))
        }
        if any(name not in fields for name in required) or not fields["MemTotal"]:
            return None

        logger.info(banner % hostip)
        cur_time = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
        ct.write_file(result_file, cur_time + "::" + hostip + "_mem_info_result:")
        ct.write_file(result_file, titlename)

        total = float(fields["MemTotal"])
        reclaimable = fields["Buffers"] + fields["Cached"]
        buffers_cached_rate = round(100 * reclaimable / total, 2)
        logger.info("BuffersCachedRate:" + str("%.2f" % buffers_cached_rate) + "%")
        used = fields["MemTotal"] - (fields["MemFree"] + reclaimable)
        rate_mem = round(100 * used / total, 2)
        logger.info("Rate_Mem:" + str("%.2f" % rate_mem) + "%")

        row = [fields.get(name, "") for name in reported]
        row.extend([buffers_cached_rate, rate_mem])
        memstr = ','.join(map(str, row))
        logger.debug(memstr)
        ct.write_file(result_file, memstr)
        return buffers_cached_rate, rate_mem

    sample = take_sample(
        '******************************Mem Monitor: [server:%s]*********************************')
    if sample is None:
        # Nothing usable came back; report it instead of failing on a lookup.
        self.mem_info_verify = False
        msg = "error:" + hostip + " ::Failed to parse /proc/meminfo"
        ct.write_log(error_log_file, msg)
        logger.error(msg)
        return
    BuffersCachedRate, Rate_Mem = sample

    # Alert on BuffersCachedRate >= 50, but do not judge it inside the
    # 08:45-15:30 window.
    start_time = '08:45'
    end_time = '15:30'
    if ct.time_check(start_time, end_time):
        self.mem_info_verify = True
        logger.info("Not to check BuffersCachedRate")
    elif BuffersCachedRate < 50:
        self.mem_info_verify = True
        msg = "ok:" + hostip + " ::The BuffersCachedRate is " + str(
            BuffersCachedRate) + " % is ok"
        logger.info(msg)
    else:
        # Drop the caches, then sample once more.
        command_clear = 'sync;echo 3 > /proc/sys/vm/drop_caches'
        sshRes_clear = self.sshExecCmd(command_clear)
        logger.debug(sshRes_clear)
        resample = take_sample(
            '******************************Mem Monitor2: [server:%s]*********************************')
        if resample is not None:
            BuffersCachedRate, Rate_Mem = resample
        # Alert only if the rate is still 50 or more after the drop.
        if BuffersCachedRate >= 50:
            self.mem_info_verify = False
            msg = "error:" + hostip + " ::The BuffersCachedRate is " + str(
                BuffersCachedRate) + " % is overload"
            ct.write_log(error_log_file, msg)
            logger.error(msg)
            ct.send_sms_control("mem", msg)
        else:
            # The recheck passed: set the flag explicitly so it never keeps
            # a value left over from an earlier call.
            self.mem_info_verify = True
            msg = "ok:" + hostip + " ::The BuffersCachedRate is " + str(
                BuffersCachedRate) + " % is ok"
            logger.info(msg)

    # Alert on Rate_Mem >= 80; a pass must not overwrite an earlier failure.
    if Rate_Mem < 80:
        msg = "ok:" + hostip + " ::The Rate_Mem is " + str(
            Rate_Mem) + " % is ok"
        logger.info(msg)
    else:
        self.mem_info_verify = False
        msg = "error:" + hostip + " ::The Rate_Mem is " + str(
            Rate_Mem) + " % is overload"
        ct.write_log(error_log_file, msg)
        logger.error(msg)
        ct.send_sms_control("mem", msg)
    msg = "Mem Check Result: " + str(self.mem_info_verify)
    logger.info(msg)
def fpga_file_info(self, info):
    """Check that today's three FPGA session files exist and are non-empty.

    Lists the configured directory with ``ls -l`` over SSH, records every
    entry in the result file, and picks the size of

    * ``FPGA0_CSESSION00_<YYYYMMDD>.sent``      -> ``sent0``
    * ``FPGA0_VSESSION00_<YYYYMMDD>.received``  -> ``received0``
    * ``FPGA0_VSESSION01_<YYYYMMDD>.received``  -> ``received1``

    ``self.fpga_file_info_verify`` is True only when all three sizes are
    present and non-zero. When a listing was obtained, 1 (ok) or 0 (error)
    is appended to ``self.fpga_Check_flag_list``; an error also goes to
    the error log and SMS.

    Args:
        info: Server record; ``info[0]`` is the host IP and ``info[5]`` the
            directory to list.
    """
    hostip = info[0]
    filepath = info[5]
    command = 'ls -l ' + filepath
    sshRes = self.sshExecCmd(command)
    if not sshRes:
        self.fpga_file_info_verify = False
        msg = "error: The sshResturn is None, please check it"
        ct.write_log(error_log_file, msg)
        logger.warning(msg)
        return

    rows = [line.strip().split() for line in ''.join(sshRes).strip().split('\n')]
    logger.info(
        "******************************FPGA Monitor: [server:%s]*********************************" % hostip)
    cur_time = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
    # NOTE(review): the "_ps_info_result" label looks copied from the process
    # check; left as is in case something parses the result file.
    ct.write_file(result_file, cur_time + "::" + hostip + "_ps_info_result:")
    titlename = "TypePermission,ConnectedCount,Owner,Group,Size,ModifyMonth,ModifyDate,ModifyTime,FName"
    logger.debug(titlename)
    ct.write_file(result_file, titlename)

    # Expected file names are the same for every row, so build them once.
    today = time.strftime("%Y%m%d", time.localtime())
    expected = {
        'FPGA0_CSESSION00_' + today + '.sent': 'sent0',
        'FPGA0_VSESSION00_' + today + '.received': 'received0',
        'FPGA0_VSESSION01_' + today + '.received': 'received1',
    }
    fileSize_dict = {'sent0': None, 'received0': None, 'received1': None}
    # Start from the second line of the listing.
    for datalist in rows[1:]:
        lsstr = ','.join(datalist)
        logger.debug(lsstr)
        ct.write_file(result_file, lsstr)
        # Only rows with exactly nine fields carry size (index 4) and
        # name (index 8) where this check expects them.
        if len(datalist) == 9 and datalist[8] in expected:
            fileSize_dict[expected[datalist[8]]] = int(datalist[4])

    ntime = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
    dictstr = ntime + "::FPGA file size: " + str(fileSize_dict)
    ct.write_file(result_file, dictstr)
    # A missing file (None) and an empty file (0) both fail the check;
    # store a real bool, not the last operand of an and-chain.
    self.fpga_file_info_verify = all(fileSize_dict.values())
    if self.fpga_file_info_verify:
        msg = "ok: The server %s FPGA Monitor is ok, %s " % (hostip, dictstr)
        logger.info(msg)
        self.fpga_Check_flag_list.append(1)
    else:
        msg = "error: The server %s FPGA Monitor is not correct, %s " % (
            hostip, dictstr)
        ct.write_log(error_log_file, msg)
        logger.error(msg)
        ct.send_sms_control("fpga", msg)
        self.fpga_Check_flag_list.append(0)
def ps_info(self, info):
    """Verify that every configured process is running in a healthy state.

    Greps the ``ps -elf`` listing of the configured user for the process
    pattern. The check passes (``self.ps_info_verify`` is True) only when
    the number of matching lines equals the number of ``|``-separated
    names in the pattern and every line's state field (index 1) is one of
    R, S or D. Each failure is written to the error log and sent by SMS.

    Args:
        info: Server record; ``info[0]`` is the host IP, ``info[2]`` the
            user name passed to ``ps -u`` and ``info[5]`` the ``grep -E``
            pattern of process names.
    """
    hostip = info[0]
    username = info[2]
    processes = info[5]
    process_count = len(str(processes).split('|'))
    # NOTE(review): username and processes are spliced into a shell command
    # unescaped; this assumes both come from trusted configuration.
    command = 'ps -u ' + username + ' -elf | grep -E "' + processes + '" | grep -v grep'
    logger.debug("command: " + command)
    sshRes = self.sshExecCmd(command)
    if not sshRes:
        self.ps_info_verify = False
        msg = "error: Server %s The count of the processes is 0, please check it" % str(
            hostip)
        ct.write_log(error_log_file, msg)
        logger.error(msg)
        ct.send_sms_control("ps_port", msg)
    else:
        # One list of whitespace-separated fields per non-blank output line.
        rows = [
            line.split() for line in ''.join(sshRes).split('\n') if line.strip()
        ]
        logger.info(
            "******************************Processes Monitor: [server:%s]*********************************" % hostip)
        cur_time = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
        ct.write_file(result_file, cur_time + "::" + hostip + "_ps_info_result:")
        titlename = "F,S,UID,PID,PPID,C,PRI,NI,ADDR,SZ,WCHAN,TTY,TIME,CMD"
        logger.debug(titlename)
        ct.write_file(result_file, titlename)

        # First check that the number of processes is right.
        found_count = len(rows)
        self.ps_info_verify = (process_count == found_count)
        if self.ps_info_verify:
            msg = "ok: The query count of the processes is " + str(found_count)
            logger.info(msg)
            # Then check each process state. The flag is only ever lowered
            # here, so one bad process cannot be hidden by a later good one.
            for datalist in rows:
                psstr = ','.join(datalist)
                logger.debug(psstr)
                ct.write_file(result_file, psstr)
                # Short rows fall back to empty values instead of raising.
                state = datalist[1] if len(datalist) > 1 else ""
                ps_time = datalist[13] if len(datalist) > 13 else ""
                if state in ('R', 'S', 'D'):
                    msg = "ok:" + hostip + ":: Time:" + str(
                        ps_time) + " ::The state is " + str(state) + " is ok"
                    logger.info(msg)
                else:
                    self.ps_info_verify = False
                    msg = "error:" + hostip + ":: Time: " + str(
                        ps_time) + " ::The state is " + str(
                            state) + " is not correct"
                    ct.write_log(error_log_file, msg)
                    logger.error(msg)
                    ct.send_sms_control("ps_port", msg)
        else:
            msg = "error: Server %s The query count %s of the processes is not equal: %s" % (
                hostip, str(found_count), str(process_count))
            ct.write_log(error_log_file, msg)
            logger.error(msg)
            ct.send_sms_control("ps_port", msg)
    msg = "ps Processes Check Result: " + str(self.ps_info_verify)
    logger.info(msg)
def disk_info(self, info):
    """Check disk usage of a server from ``df -h``.

    Every parsed filesystem row is recorded in the result file. A row
    fails when its Use% is 80 or more, unless the filesystem is
    ``/dev/sr0`` / ``/dev/sr1`` or its name contains a ``//host/`` style
    network path. ``self.disk_info_verify`` is True only when rows were
    parsed and none failed; each failure goes to the error log and SMS.

    Args:
        info: Server record; only ``info[0]`` (host IP) is read here.
    """
    command = 'df -h'
    hostip = info[0]
    sshRes = self.sshExecCmd(command)
    df_lines = ''.join(sshRes).strip().split('\n')
    # Skip the header line of the df output.
    df_info_list = [line.strip().split() for line in df_lines[1:]]

    sshResLists = []
    for i, fields in enumerate(df_info_list):
        # A long filesystem name is printed alone, with its five figures on
        # the next line. The last row has no successor, so guard the look-ahead.
        following = df_info_list[i + 1] if i + 1 < len(df_info_list) else []
        if len(fields) == 1 and len(following) == 5:
            sshResLists.append(fields + following)
        elif len(fields) == 6:
            sshResLists.append(fields)
        elif len(fields) not in (1, 5):
            msg = "The df_info's format is not correct!"
            # Written with write_log, as every other error-log entry is.
            ct.write_log(error_log_file, msg)
            logger.error(msg)

    logger.info(
        "************************Disk Monitor: [server:%s]****************************" % hostip)
    cur_time = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
    ct.write_file(result_file, cur_time + "::" + hostip + "_disk_info_result:")
    titlename = "Filesystem,Size,Used,Avail,Use%,Mounted on"
    logger.debug(titlename)
    ct.write_file(result_file, titlename)

    if sshResLists:
        self.disk_info_verify = True
        for disklist in sshResLists:
            diskstr = ','.join(disklist)
            logger.debug(diskstr)
            ct.write_file(result_file, diskstr)
            # Match network paths of the form //host/share.
            matchObj = re.search(r'//.*?/', disklist[0], re.M | re.I)
            # Optical drives and network shares are not judged.
            if disklist[0] in ("/dev/sr0", "/dev/sr1") or matchObj:
                continue
            try:
                Use_Rate = int(disklist[4].split('%')[0])
            except ValueError:
                # Use% is not a number for this row, so it cannot be judged.
                logger.warning("skip unparsable Use%% value: %s", diskstr)
                continue
            # Alert when 80% or more of the space is used.
            if Use_Rate < 80:
                msg = "ok:" + hostip + "::" + disklist[
                    0] + " ::The Use% is " + str(Use_Rate) + " % is ok"
                logger.info(msg)
            else:
                self.disk_info_verify = False
                msg = "error:" + hostip + "::" + disklist[
                    0] + " ::The Use% is " + str(
                        Use_Rate) + " % is overload"
                ct.write_log(error_log_file, msg)
                logger.error(msg)
                ct.send_sms_control("disk", msg)
    else:
        # Nothing could be parsed: fail the check explicitly so the flag
        # never keeps a value left over from an earlier call.
        self.disk_info_verify = False
        msg = "error:" + hostip + " ::No disk information was parsed from df output"
        ct.write_log(error_log_file, msg)
        logger.error(msg)
    msg = "Disk Check Result: " + str(self.disk_info_verify)
    logger.info(msg)
def ping_server_info(self, info):
    """Ping a server and alert on packet loss or a slow round trip.

    Runs the platform's ``ping`` (default options on Windows,
    ``-i 1 -c 4 -q`` elsewhere) and reads the loss percentage and average
    round-trip time from fixed positions in its output. Output that cannot
    be read is treated as 100% loss / 9999 ms. ``self.ping_info_verify``
    is False, with an error-log entry and an SMS, when loss is above 0 or
    the round trip is above 800 ms; otherwise it is True.

    Args:
        info: Server record; only ``info[0]`` (host IP string) is read.
    """
    hostip = info[0]
    is_windows = platform.system() == "Windows"
    # An argument list (no shell) keeps the host value from ever being
    # interpreted as shell syntax.
    if is_windows:
        ping_args = ['ping', hostip]
    else:
        ping_args = ['ping', '-i', '1', '-c', '4', '-q', hostip]
    logger.debug(' '.join(ping_args))

    try:
        ping = subprocess.Popen(ping_args,
                                stderr=subprocess.PIPE,
                                stdout=subprocess.PIPE)
        res, err = ping.communicate()
    except OSError as e:
        # ping could not be started at all; handle it like a failed ping.
        res, err = b"", str(e)
    if err:
        logger.warning("ping error: %s" % str(err))
        pres = []
    else:
        pres = res.decode('gbk', errors='replace').split('\n')
        # pres is a list, so it is formatted, not concatenated.
        logger.debug("pres: %s", pres)

    if is_windows:
        try:
            loss = pres[8].split('(')[1].split('%')[0] + "%"  # packet loss
        except IndexError:
            loss = "100%"
        try:
            rtt = pres[10].split('=')[3].split('ms')[0]  # average rtt
        except IndexError:
            rtt = "9999"
    else:
        try:
            # e.g. "4 packets transmitted, 0 received, 100% packet loss, time 611ms"
            loss = pres[3].split()[5]  # packet loss
        except IndexError:
            loss = "100%"
        try:
            rtt = pres[4].split('/')[4]  # average rtt
        except IndexError:
            rtt = "9999"

    try:
        loss_value = float(loss.strip('%'))
        rtt_value = float(rtt)
    except ValueError:
        # Non-numeric fields mean the output was not understood; fail safe.
        loss, rtt = "100%", "9999"
        loss_value, rtt_value = 100.0, 9999.0

    # Alert when loss > 0 or rtt > 800.
    if loss_value > 0 or rtt_value > 800:
        self.ping_info_verify = False
        msg = "error:" + hostip + " ::The ping lost is " + loss + " rtt is " + rtt + "ms"
        ct.write_log(error_log_file, msg)
        logger.error(msg)
        ct.send_sms_control("ping", msg)
    else:
        self.ping_info_verify = True
        msg = "ok:" + hostip + " ::The ping lost is " + loss + " rtt is " + rtt + "ms"
        logger.info(msg)
    msg = "Ping Check Result: " + str(self.ping_info_verify)
    logger.info(msg)