def DataBaseCheck(entry,serverip,x):
    '''
    Entry point for a database health check.

    Probes the database with IsDataBaseNormal, looks up the stored state row
    for "<serverip>:<port>" via ExistInserverinfo, and then sends mail,
    records alarm messages and updates the stored state depending on the
    transition (normal -> abnormal, abnormal -> normal, still abnormal,
    or first time seen).

    Parameters:
        entry    -- sequence describing the monitor item. Indices read here:
                    [0], [1] are forwarded to the serverinfo / alarm helpers,
                    [4] is the item config as a dict literal string,
                    [7] is the user-configured alarm count limit,
                    [8] is the recover-notify flag ('1' means send mail),
                    [9] is forwarded to SendMail (presumably recipients;
                    verify against SendMail).
        serverip -- address of the host running the database.
        x        -- value used only to build the thread log tag.

    This function never returns normally: it always finishes by calling
    sys.exit(), which raises SystemExit.

    NOTE(review): the block nesting was reconstructed from a source whose
    indentation had been lost. Confirm that the recovery alarm record and
    the state update are meant to happen regardless of the recover-notify
    flag (only the mail is gated on it).
    '''
    entry_list = list(entry)
    # SECURITY NOTE(review): eval() executes whatever expression is stored in
    # the config field. If that field can be written by anyone other than a
    # trusted admin, switch to a literal-only parser (e.g. ast.literal_eval).
    oItemConfig = eval(entry_list[4])
    database_name = serverip+":"+str(oItemConfig['port'])
    status, detail = IsDataBaseNormal(oItemConfig,serverip)
    result = ExistInserverinfo(entry_list[0],database_name)
    # str(status) keeps the log line from raising if the probe ever returns a
    # non-string status.
    save_Thread_log('MONTHREAD',serverip+'_'+str(x)+'_'+oItemConfig['alarmcmd'],
                    'subThread databasecheck func now!status is :'+str(status)+',result is:'+str(result))

    # In the stored row, result[3] is the last status ('1' normal,
    # '0' abnormal) and result[4] is the number of alarms already sent.
    if status == '1':
        # This check came back normal.
        if result:
            if result[3] == '0':
                # Recovered: mail only if recover notification is enabled.
                if entry_list[8] == '1':
                    SendMail('RECOVERNOTIFY',entry_list[9],database_name,'databasecheck')
                CreateAlarmMsg(entry_list[1],entry_list[0],serverip,"数据库"+database_name+"已经恢复正常:",0)
                result[3] = '1'
                result[4] = 0
                Update_osa_serverinfo(result)
        elif result is None:
            # No stored row yet: create one in the normal state.
            InsertInto_osa_serverinfo(entry_list[0],database_name,'1',0)
    elif status == '0':
        # This check came back abnormal.
        if result:
            if result[3] == '1':
                # Normal -> abnormal: record the alarm and send the first mail.
                # NOTE(review): this call passes database_name as the third
                # argument while the recovery call above passes serverip.
                CreateAlarmMsg(entry_list[1],entry_list[0],database_name,"数据库:"+database_name+"出现异常:"+str(detail),3)
                SendMail('STATUS_EXCEPTION',entry_list[9],database_name,'databasecheck')
                result[4] = result[4]+1
                result[3] = '0'
                Update_osa_serverinfo(result)
            elif result[3] == '0':
                # Still abnormal: repeat the mail only while the number of
                # alarms sent is below the configured limit. This used to be
                # `<=`, which sent one more mail than configured and disagreed
                # with the port and server checks.
                # NOTE(review): assumes both operands are numeric; confirm.
                if result[4] < entry_list[7]:
                    SendMail('STATUS_EXCEPTION',entry_list[9],database_name,'databasecheck')
                    result[4] = result[4]+1
                    Update_osa_serverinfo(result)
        elif result is None:
            # First time seen and already abnormal: record the alarm (as the
            # port and server checks do), mail, and create the row in the
            # abnormal state with an alarm count of 1.
            CreateAlarmMsg(entry_list[1],entry_list[0],database_name,"数据库:"+database_name+"出现异常:"+str(detail),3)
            SendMail('STATUS_EXCEPTION',entry_list[9],database_name,'databasecheck')
            InsertInto_osa_serverinfo(entry_list[0],database_name,'0',1)
    # Every path ends the calling thread here.
    sys.exit()
def _alarm_unreachable_and_exit(serverip, log_tag, attempt):
    '''
    Handle a server that could not be reached, then end the thread.

    Looks up the server's osa_ipinfo row, hands it to ThreadForEachServer
    together with the address list from GetUserEmailAddress('ALL'), and
    always finishes with sys.exit(). If no row exists for serverip the miss
    is logged and the thread exits without calling ThreadForEachServer.

    attempt is the word inserted into the error log message ('first' or
    'again') so the two call sites stay distinguishable in the log.
    '''
    # SECURITY NOTE(review): the SQL text is built by string concatenation.
    # If select() supports bound parameters, pass serverip as one instead.
    each_ipinfo_list = select("SELECT * FROM osa_ipinfo where oIp = '"+serverip+"'")
    if not each_ipinfo_list:
        save_log('INFO','ip not found:'+str(serverip))
        sys.exit()
    each_ipinfo = each_ipinfo_list[0]
    AllUserEmailAddress = GetUserEmailAddress('ALL')
    save_Thread_log('MONTHREAD',log_tag,'subThread server PortIsAlive == false,exit now!,IP is :'+serverip+',each_info is:'+str(each_ipinfo)+',AllUserEmailAddress is :'+str(AllUserEmailAddress))
    # Check this single server and decide whether an alarm is needed.
    try:
        ThreadForEachServer(each_ipinfo,AllUserEmailAddress)
    except Exception as e:
        save_log('ERROR','ThreadForEachServer run fail '+attempt+':'+str(e))
    sys.exit()


def serverMonitor(serverip,entry,x):
    '''
    Thread entry point for monitoring one item on one server.

    Reads the item config from entry[4], fetches the monitoring data from the
    server (skipped for port and database checks, which probe directly), and
    dispatches on the config's 'alarmcmd':
        diskspacecheck / topstatcheck / loginusercheck / networkcheck
            -> serverChooseAlarm
        databasecheck -> DataBaseCheck
        portstatcheck -> handled inline with IsPortAlive

    If the server's port is closed, or it returns no data, the server is
    handed to ThreadForEachServer and the thread exits.

    Parameters:
        serverip -- address of the monitored server.
        entry    -- sequence describing the monitor item. Indices read here:
                    [0], [1] forwarded to the serverinfo / alarm helpers,
                    [4] item config as a dict literal string,
                    [7] user-configured alarm count limit,
                    [8] recover-notify flag ('1' means send mail),
                    [9] forwarded to SendMail (presumably recipients; verify).
        x        -- value used only to build the thread log tag.

    Every path visible here ends with sys.exit() (SystemExit), either in this
    function or in the function it dispatches to.

    NOTE(review): the block nesting was reconstructed from a source whose
    indentation had been lost. Confirm that the recovery alarm record and
    the state update are meant to happen regardless of the recover-notify
    flag (only the mail is gated on it).
    '''
    entry_list = list(entry)
    # Item configuration.
    # SECURITY NOTE(review): eval() executes whatever expression is stored in
    # the config field; prefer a literal-only parser if the field is not
    # fully trusted.
    oItemConfig = eval(entry_list[4])
    alarmcmd = oItemConfig['alarmcmd']
    log_tag = serverip+'_'+str(x)+'_'+alarmcmd
    save_Thread_log('MONTHREAD',log_tag,'subThread start now!------------------------------')
    save_Thread_log('MONTHREAD',log_tag,'subThread name:' + alarmcmd)
    # Command that asks the server for all of its statistics.
    cmd = 'SYSTEM_RUN_COMMAND!{"mon_all_stat":""}'
    save_Thread_log('MONTHREAD',log_tag,'subThread oMonText begin,Cmd is :'+cmd)

    if alarmcmd == 'portstatcheck' or alarmcmd == 'databasecheck':
        # Port and database checks do not need data from the client.
        oMonText = '1!1'
        save_Thread_log('MONTHREAD',log_tag,'subThread subtype is portstatcheck or databasecheck ,subtype is :'+alarmcmd)
    elif PortIsAlive(serverip,SOCKET['REMOTE_PORT']) == False:
        # The port is unreachable, so the server itself is abnormal.
        _alarm_unreachable_and_exit(serverip, log_tag, 'first')
    else:
        # Sleep a random 0.0-9.9 s before querying the server.
        rtime = round(float(random.randrange(0, 100, 1))/10,2)
        time.sleep(rtime)
        oMonText = proSocket(serverip, SOCKET['REMOTE_PORT'], cmd, type=None)
        if not oMonText:
            # No data came back, so the server is treated as abnormal. A
            # missing osa_ipinfo row is now logged and skipped, as in the
            # port-down path, instead of passing None to ThreadForEachServer.
            save_Thread_log('MONTHREAD',log_tag,'subThread get oMonText faild ,exit now!')
            _alarm_unreachable_and_exit(serverip, log_tag, 'again')
        oMonText = oMonText.split('!')
        save_Thread_log('MONTHREAD',log_tag,'subThread get oMonText over!,oMonText is :'+str(oMonText))

    save_log('INFO','serverMonitor oMonText : '+str(oMonText))

    # Per-check settings: which key of the client data to read ('oMonText'),
    # which key of the item config holds the threshold ('oItemConfig'), and
    # the message fragments used for recovery and error notifications.
    serverChooseDict = {
        'diskspacecheck' : {'oMonText' : 'diskstat','oItemConfig' : 'percentage', 'strrecov' : '的磁盘空间率已经恢复!' , 'errorstr1' : '的磁盘使用率过高,分区 ' , 'errorstr2' : '当前使用率为:'},
        'topstatcheck' : {'oMonText' : 'loadstat','oItemConfig' : 'topvalue', 'strrecov' : '的负载已经恢复正常!' , 'errorstr1':'的负载状态过高。当前负载值:','errorstr2' : ''},
        'loginusercheck' : {'oMonText' : 'login','oItemConfig' : 'usernum', 'strrecov' : '的登录用户数量已经恢复正常!' , 'errorstr1':'登录用户过多。当前用户数:','errorstr2':''},
        'networkcheck' : {'oMonText' : 'network','oItemConfig' : 'topvalue', 'strrecov' : '的网络流量已经恢复正常!' , 'errorstr1':'流量过载。网卡 ','errorstr2':',当前进出流量峰值(MB): '}
    }

    # Dispatch to the function that implements this kind of check.
    if alarmcmd in serverChooseDict:
        save_Thread_log('MONTHREAD',log_tag,'subThread serverChoose now!')
        return serverChooseAlarm(alarmcmd,serverChooseDict,serverip,entry,oMonText,x)

    # Database alarm.
    if alarmcmd == 'databasecheck':
        save_Thread_log('MONTHREAD',log_tag,'subThread databasecheck now!')
        return DataBaseCheck(entry,serverip,x)

    # Port alarm.
    if alarmcmd == 'portstatcheck':
        a = IsPortAlive(serverip,oItemConfig['portlist'])
        save_Thread_log('MONTHREAD',log_tag,'subThread portstatcheck func now!status is :'+str(a[0]))
        # In the stored row, result[3] is the last status ('1' normal,
        # '0' abnormal) and result[4] is the number of alarms already sent.
        if a[0] == True:
            # This check came back normal.
            result = ExistInserverinfo(entry_list[0],serverip)
            save_Thread_log('MONTHREAD',log_tag,'subThread portstatcheck exit now!result is :'+str(result))
            if result:
                if result[3] == '0':
                    # Recovered: mail only if recover notification is enabled.
                    if entry_list[8] == '1':
                        SendMail('RECOVERNOTIFY',entry_list[9],serverip,'portstatcheck')
                    CreateAlarmMsg(entry_list[1],entry_list[0],serverip,"服务器"+serverip+"的端口访问已经恢复正常:"+oItemConfig['portlist'],0)
                    result[3] = '1'
                    result[4] = 0
                    Update_osa_serverinfo(result)
            elif result is None:
                # No stored row yet: create one in the normal state.
                InsertInto_osa_serverinfo(entry_list[0],serverip,'1',0)
        elif a[0] == False:
            # This check came back abnormal; a[1] names the failing port(s).
            result = ExistInserverinfo(entry_list[0],serverip)
            save_Thread_log('MONTHREAD',log_tag,'subThread portstatcheck exit now!result is :'+str(result))
            if result:
                if result[3] == '1':
                    # Normal -> abnormal: record the alarm, send the first mail.
                    CreateAlarmMsg(entry_list[1],entry_list[0],serverip,"服务器"+serverip+"端口:"+a[1]+"异常",3)
                    SendMail('STATUS_EXCEPTION',entry_list[9],serverip,'portstatcheck')
                    result[4] = result[4]+1
                    result[3] = '0'
                    Update_osa_serverinfo(result)
                elif result[3] == '0':
                    # Still abnormal: repeat the mail only while the number of
                    # alarms sent is below the configured limit.
                    if result[4] < entry_list[7]:
                        SendMail('STATUS_EXCEPTION',entry_list[9],serverip,'portstatcheck')
                        result[4] = result[4]+1
                        Update_osa_serverinfo(result)
            elif result is None:
                # First time seen and already abnormal: record the alarm,
                # mail, and create the row in the abnormal state with an
                # alarm count of 1.
                CreateAlarmMsg(entry_list[1],entry_list[0],serverip,"服务器"+serverip+'端口:'+a[1]+' 未存活!',3)
                SendMail('STATUS_EXCEPTION',entry_list[9],serverip,alarmcmd)
                InsertInto_osa_serverinfo(entry_list[0],serverip,'0',1)
    # Every path ends the calling thread here.
    sys.exit()
def serverChooseAlarm(alarmcmd,serverChooseDict,serverip,entry,oMonText,x):
    '''
    Run one threshold-based check and raise or clear its alarm.

    Picks the relevant section of the client's monitoring data, evaluates it
    with the checker for alarmcmd (IsDiskSpaceNormal, IsLoadStatNormal,
    IsLoginUserOver or IsNetworkTrafficNormal), then sends mail, records
    alarm messages and updates the stored state depending on the transition
    (normal -> abnormal, abnormal -> normal, still abnormal, first seen).

    Parameters:
        alarmcmd         -- check name; must be a key of serverChooseDict.
        serverChooseDict -- per-check settings; each value provides
                            'oMonText' (key into the client data),
                            'oItemConfig' (key of the threshold in the item
                            config), 'strrecov', 'errorstr1', 'errorstr2'
                            (message fragments).
        serverip         -- address of the monitored server.
        entry            -- sequence describing the monitor item. Indices
                            read here: [0], [1] forwarded to the serverinfo /
                            alarm helpers, [4] item config as a dict literal
                            string, [7] user-configured alarm count limit,
                            [8] recover-notify flag ('1' means send mail),
                            [9] forwarded to SendMail (presumably recipients;
                            verify).
        oMonText         -- client reply already split on '!'; element [1]
                            is the data payload.
        x                -- value used only to build the thread log tag.

    This function never returns normally: it always finishes by calling
    sys.exit(), which raises SystemExit.

    NOTE(review): the block nesting was reconstructed from a source whose
    indentation had been lost. Confirm that the recovery alarm record and
    the state update are meant to happen regardless of the recover-notify
    flag (only the mail is gated on it).
    '''
    # The log tag is built from the alarmcmd argument throughout, so it no
    # longer depends on the item config also carrying an 'alarmcmd' key.
    log_tag = serverip+'_'+str(x)+'_'+alarmcmd
    save_Thread_log('MONTHREAD',log_tag,'subThread in serverChooseAlarm,clientdata now!! oMonText is: '+str(oMonText))

    settings = serverChooseDict[alarmcmd]
    # Key of the client-data section this check reads, e.g. 'diskstat'.
    ckey = settings['oMonText']
    try:
        # SECURITY NOTE(review): this eval()s text received from the remote
        # agent. Prefer a literal-only parser (e.g. ast.literal_eval) if the
        # agent's payload format allows it.
        oMonTextDic = eval(oMonText[1])
    except Exception as e:
        save_log('ERROR','oMonTextDic is error:'+str(e)+',ip is:'+str(serverip)+', str : '+str(oMonText))
        sys.exit()

    if ckey not in oMonTextDic:
        save_Thread_log('MONTHREAD',log_tag,'subThread in serverChooseAlarm,clientdata error exit now!! oMonText is '+str(oMonText))
        sys.exit(0)
    clientdata = oMonTextDic[ckey]

    entry_list = list(entry)
    # Key of the threshold inside the item config, e.g. 'percentage'.
    okey = settings['oItemConfig']
    # SECURITY NOTE(review): eval() on the stored config field; see above.
    oItemConfig = eval(entry_list[4])

    # Each checker returns a sequence: [0] is the normal/abnormal flag, [1]
    # and optionally [2] are values inserted into the alarm text.
    if alarmcmd == 'diskspacecheck':
        a = IsDiskSpaceNormal(clientdata,int(oItemConfig[okey]))
    elif alarmcmd == 'topstatcheck':
        a = IsLoadStatNormal(clientdata,int(oItemConfig[okey]))
    elif alarmcmd == 'loginusercheck':
        a = IsLoginUserOver(clientdata,int(oItemConfig[okey]))
    elif alarmcmd == 'networkcheck':
        a = IsNetworkTrafficNormal(clientdata,int(oItemConfig[okey]))
    else:
        sys.exit()

    # Text used in the recovery notification.
    rstr = settings['strrecov']
    # Text used in the abnormal notification.
    estr = settings['errorstr1'] + str(a[1])
    if len(a) > 2:
        estr = estr + settings['errorstr2'] + str(a[2])
    save_Thread_log('MONTHREAD',log_tag,'subThread in serverChoose func now! status is '+str(a[0]))

    # In the stored row, result[3] is the last status ('1' normal,
    # '0' abnormal) and result[4] is the number of alarms already sent.
    if a[0] == True:
        # This check came back normal.
        result = ExistInserverinfo(entry_list[0],serverip)
        save_Thread_log('MONTHREAD',log_tag,'subThread in serverChoose func exit now! result is '+str(result))
        if result:
            # A row exists; act only if it was previously abnormal.
            if result[3] == '0':
                # Send the recovery mail only when recover-notify is '1'.
                if entry_list[8] == '1':
                    SendMail('RECOVERNOTIFY',entry_list[9],serverip,alarmcmd)
                # Add a new entry to the alarm message table.
                CreateAlarmMsg(entry_list[1],entry_list[0],serverip,"服务器"+serverip+rstr,0)
                # Update the stored state (and next check time) on recovery.
                DatabaseUpdateWhenRecovey(result,entry_list)
        elif result is None:
            # No stored row yet: create one in the normal state.
            InsertInto_osa_serverinfo(entry_list[0],serverip,'1',0)
    elif a[0] == False:
        # This check came back abnormal.
        result = ExistInserverinfo(entry_list[0],serverip)
        save_Thread_log('MONTHREAD',log_tag,'subThread in serverChoose func exit now! result is '+str(result))
        if result:
            if result[3] == '1':
                # Normal -> abnormal: record the alarm and send the first mail.
                CreateAlarmMsg(entry_list[1],entry_list[0],serverip,"服务器"+serverip+estr,3)
                SendMail('STATUS_EXCEPTION',entry_list[9],serverip,alarmcmd)
                # One more alarm sent; status '0' means abnormal.
                result[4] = result[4]+1
                result[3] = '0'
                Update_osa_serverinfo(result)
            elif result[3] == '0':
                # Still abnormal: repeat the mail only while the number of
                # alarms sent is below the user-configured limit.
                if result[4] < entry_list[7]:
                    SendMail('STATUS_EXCEPTION',entry_list[9],serverip,alarmcmd)
                    result[4] = result[4]+1
                    Update_osa_serverinfo(result)
        elif result is None:
            # First time seen and already abnormal: record the alarm, mail,
            # and create the row in the abnormal state with an alarm count
            # of 1.
            CreateAlarmMsg(entry_list[1],entry_list[0],serverip,"服务器"+serverip+estr,3)
            SendMail('STATUS_EXCEPTION',entry_list[9],serverip,alarmcmd)
            InsertInto_osa_serverinfo(entry_list[0],serverip,'0',1)
    # Every path ends the calling thread here.
    sys.exit()