import commands
import datetime
import os
import re
import subprocess
import time

# Note: DBOperator (the database layer), the statistic tool and the alarmers
# are provided by other modules of this project; their imports are not shown
# in this file.


class LogFile:
    def __init__(self, ldap, product, type, file, timeInterval, numberKeys,
                 stringKeys, statisticTool, wholeFile, keyAlarmers, debug,
                 logger, cubeName):
        self.filePattern = file
        self.curFile = file
        self.timePattern = self.parseTimePattern(file)
        self.timeInterval = timeInterval
        self.numberKeys = numberKeys
        self.stringKeys = stringKeys
        self.statisticTool = statisticTool
        self.wholeFile = wholeFile
        self.keyAlarmers = keyAlarmers
        self.dbop = DBOperator(ldap, product, type, cubeName, numberKeys,
                               stringKeys, [])
        self.debug = debug
        self.logger = logger
        self.cubeName = cubeName
        # initial read offset; also used when the whole file is analysed
        self.position = 0
        self.preFileSize = 0
        self.defaultReadSize = 16 * 1024 * 1024

    def validate(self):
        if self.filePattern == '':
            self.logger.warning('%s: no file name configured, log collection disabled'
                                % self.cubeName)
            self.dbop.close()
            return False
        if len(self.numberKeys) + len(self.stringKeys) == 0:
            self.logger.warning(
                '%s: fileName=%s, no monitoring keys configured, log collection disabled'
                % (self.cubeName, self.filePattern))
            self.dbop.close()
            return False
        return True

    def toString(self):
        s = ''
        s += 'fileName=%s\n' % self.filePattern
        s += 'timeInterval=%s\n' % self.timeInterval
        s += 'numberKeys=%s\n' % ','.join(self.numberKeys)
        s += 'stringKeys=%s\n' % ','.join(self.stringKeys)
        s += 'wholeFile=%s\n' % self.wholeFile
        s += 'keyAlarmers='
        for key, alarmers in self.keyAlarmers.items():
            s += '%s:[' % key
            s += ','.join([alarmer.toString() for alarmer in alarmers])
            s += '],'
        s = s[0:-1] + '\n'
        return s

    def parseTimePattern(self, file):
        timePattern = re.match(".*@@(.*)@@", self.filePattern)
        if timePattern:
            return timePattern.group(1)
        return None

    def updateLogFile(self):
        '''Refresh the current file name and the read position.'''
        if self.timePattern:
            curTimePattern = datetime.datetime.strftime(
                datetime.datetime.now(), self.timePattern)
            logFile = re.sub("@@.*@@", curTimePattern, self.filePattern)
            if self.curFile != logFile:
                # 1. file name changed
                self.logger.info('@@logHandler@@ file changed: %s -> %s'
                                 % (self.curFile, logFile))
                self.curFile = logFile
                self.position = 0
                self.preFileSize = 0
                return
        if os.path.exists(self.curFile):
            curFileSize = os.stat(self.curFile).st_size
            if self.preFileSize <= curFileSize:
                # 2. file name unchanged & file size grew as expected
                return
        # 3. file name unchanged & (file missing | file size shrank unexpectedly)
        self.position = 0
        self.preFileSize = 0

    def monitor(self):
        self.updateLogFile()
        # If the file exists and the whole log does not need to be analysed,
        # start from the end; otherwise start from the beginning and wait for
        # the file to appear.
        if os.path.exists(self.curFile) and not self.wholeFile:
            lf = open(self.curFile)
            lf.seek(-1, os.SEEK_END)
            self.position = lf.tell()
            lf.close()
        # start monitoring the log file
        while True:
            # update file name
            self.updateLogFile()
            if not os.path.exists(self.curFile):
                time.sleep(5)
                self.logger.warning(
                    '%s: @@logHandler@@ file=%s, monitored file does not exist, '
                    'skipping this round and waiting for it to appear...'
                    % (self.cubeName, self.curFile))
                continue
            self.log2db()
            self.logger.info(
                '%s: @@logHandler@@ file=%s, sleeping, will wake up after %s seconds'
                % (self.cubeName, self.curFile, self.timeInterval))
            time.sleep(self.timeInterval)

    # If a single log line is longer than defaultReadSize, reading gets stuck
    # (it can never advance past that line), so defaultReadSize must not be too
    # small; from a resource point of view it should not be too large either.
    # Each call reads up to the end of the file.
    def log2db(self):
        lf = open(self.curFile)
        # Remember the current end position first, so that new lines written
        # while we process this batch do not keep us in the loop forever.
        lf.seek(-1, os.SEEK_END)
        currentEndPosition = lf.tell()
        lf.seek(self.position)
        while True:
            if currentEndPosition <= lf.tell():
                break
            rawLines = lf.read(self.defaultReadSize)
            index = rawLines.rfind('\n')
            if index < 0:
                break
            lf.seek(index + 1 - len(rawLines), os.SEEK_CUR)
            logLines = rawLines[0:index].splitlines()
            result = self.statisticTool.getStatisticData(logLines)
            self.alarm(result)
            if len(result) > 0:
                if not self.debug:
                    self.dbop.insert(result)
                if not self.debug and len(result) > 20:
                    self.logger.info(
                        '@@logHandler@@ file=%s, insert.status=success, data.length=%s, data=%s, ..., %s'
                        % (self.curFile, len(result), result[0:10], result[-10:]))
                else:
                    self.logger.info(
                        '@@logHandler@@ file=%s, insert.status=success, data.length=%s, data=%s'
                        % (self.curFile, len(result), result))
            else:
                self.logger.info('@@logHandler@@ file=%s, insert.status=empty'
                                 % self.curFile)
        self.position = lf.tell()
        self.preFileSize = os.stat(self.curFile).st_size
        lf.close()

    def alarm(self, result):
        for res in result:
            if res[1] in self.keyAlarmers:
                for alarmer in self.keyAlarmers[res[1]]:
                    alarmer.alarm(res[2], self.cubeName, res[1])
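# ---------------------------------------------------------------------------
# Illustration only (not part of the original module): LogFile only assumes a
# duck-typed statistic tool with a getStatisticData(logLines) method returning
# rows shaped like [timestamp, key, value], which is what dbop.insert() and
# alarm() consume. The class below is a minimal hypothetical sketch of such a
# tool that counts how many of the given lines contain each configured key.
# ---------------------------------------------------------------------------
class CountingStatisticTool:
    def __init__(self, keys):
        self.keys = keys

    def getStatisticData(self, logLines):
        now = int(time.time())
        rows = []
        for key in self.keys:
            # count the lines of this batch that mention the key
            count = len([line for line in logLines if key in line])
            rows.append([now, key, count])
        return rows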
class CommandKey:
    def __init__(self, ldap, product, type, key, command, alarmers, logger,
                 cubeName, debug, timeInterval):
        self.key = key
        self.timeInterval = timeInterval
        self.command = command
        self.alarmers = alarmers
        self.dbop = DBOperator(ldap, product, type, cubeName, [key], [], [])
        self.logger = logger
        self.cubeName = cubeName
        self.debug = debug

    def validate(self):
        if self.key == '':
            self.logger.warning('%s: commandKey has no key configured, '
                                'command collection disabled' % self.cubeName)
            self.dbop.close()
            return False
        if self.command == '':
            self.logger.warning('%s: commandKey has no command configured, '
                                'command collection disabled' % self.cubeName)
            self.dbop.close()
            return False
        return True

    def toString(self):
        s = ''
        s += 'key=%s\n' % self.key
        s += 'timeInterval=%s\n' % self.timeInterval
        s += 'command=%s\n' % self.command
        s += 'alarmers=[%s]\n' % ','.join(
            [alarmer.toString() for alarmer in self.alarmers])
        return s

    def monitor(self):
        while True:
            currentTime = int(time.time())
            currentTime = currentTime - currentTime % self.timeInterval
            value = self.getDataFromCommand()
            if value is not None:
                # add timestamp
                insertData = [[currentTime, self.key, value]]
                self.logger.info('%s: @@commandKeyHandler@@ result=%s'
                                 % (self.cubeName, insertData))
                self.alarm(insertData)
                if not self.debug:
                    self.dbop.insert(insertData)
                    self.logger.info(
                        '%s: @@commandKeyHandler@@ result inserted to database'
                        % self.cubeName)
            else:
                self.logger.warning('%s: @@commandKeyHandler@@ no data collected'
                                    % self.cubeName)
            endTime = int(time.time())
            sleepTime = self.timeInterval - (endTime - currentTime)
            if sleepTime <= 0:
                continue
            self.logger.info(
                '%s: @@commandKeyHandler@@ sleep, will wake up in %ss'
                % (self.cubeName, sleepTime))
            time.sleep(sleepTime)

    def alarm(self, insertData):
        for data in insertData:
            for alarmer in self.alarmers:
                alarmer.alarm(data[2], self.cubeName, self.key)

    def getDataFromCommand(self):
        # Take the first whitespace-separated token of the first output line
        # and try to parse it as a number; return None when that is not possible.
        try:
            result = commands.getoutput(self.command).splitlines()
            return float(result[0].split()[0])
        except (IndexError, ValueError):
            return None
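# ---------------------------------------------------------------------------
# Illustration only (not part of the original module): the collectors in this
# file only require their alarmers to expose alarm(value, cubeName, key) and
# toString(). The class below is a minimal hypothetical threshold alarmer that
# shows this duck-typed interface; the real project supplies its own alarmers.
# ---------------------------------------------------------------------------
class ThresholdAlarmer:
    def __init__(self, threshold, logger):
        self.threshold = threshold
        self.logger = logger

    def toString(self):
        return 'threshold=%s' % self.threshold

    def alarm(self, value, cubeName, key):
        # warn when the collected value exceeds the configured threshold
        if value > self.threshold:
            self.logger.warning('%s: key=%s, value=%s exceeds threshold %s'
                                % (cubeName, key, value, self.threshold))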
class Process:
    def __init__(self, ldap, product, type, alias, regPattern, processPath,
                 pids, keys, keyAlarmers, logger, cubeName, debug,
                 timeInterval):
        self.alias = alias
        self.regPattern = regPattern
        self.processPath = processPath
        self.pids = pids
        self.oldPids = []
        self.keys = keys
        self.keyAlarmers = keyAlarmers
        self.dbop = DBOperator(ldap, product, type, cubeName,
                               self.getDBKeys(alias, keys), [], [])
        self.logger = logger
        self.cubeName = cubeName
        self.debug = debug
        self.timeInterval = timeInterval
        # total memory of the machine, unit: MB
        self.totalMem = 0
        self.curUser = None
        self.topN = 12
        # points at the function used to fetch the process pids
        self.getAllPid = None
        # If the pids are not configured directly, a system command is run on
        # every round to check that the processes are alive. Optimisation:
        # reuse the previous pids first and only fall back to regPattern /
        # processPath when they can no longer be found.
        self.currentPidCmdPath = []

    def getDBKeys(self, alias, keys):
        dbKeys = []
        for als in alias:
            for key in keys:
                dbKeys.append(als + '.p' + key)
        return dbKeys

    def validate(self):
        if len(self.alias) == 0:
            self.logger.warning('@@process@@ %s: no process alias configured, '
                                'process collection disabled' % self.cubeName)
            self.dbop.close()
            return False
        if self.regPattern == '' and self.processPath == '' and len(
                self.pids) == 0:
            self.logger.warning(
                '@@process@@ %s: alias=%s: invalid process configuration, collection disabled '
                '-- each process must be specified by at least one of pid, processPath, regPattern'
                % (self.cubeName, self.alias))
            return False
        if len(self.pids) != 0 and len(self.pids) != len(self.alias):
            self.logger.warning(
                '@@process@@ %s: alias=%s, pid=%s, processes are specified by pid, '
                'but the number of pids differs from the number of aliases'
                % (self.cubeName, self.alias, self.pids))
            self.dbop.close()
            return False
        # When the processes are specified by pid, regPattern and processPath are ignored.
        if len(self.pids) != 0:
            if len(self.regPattern) != 0:
                self.logger.warning(
                    '@@process@@ %s: alias=%s, pid=%s, regPattern=%s: processes are '
                    'specified by pid, the regPattern setting is ignored'
                    % (self.cubeName, self.alias, self.pids, self.regPattern))
                self.regPattern = ''
            if len(self.processPath) != 0:
                self.logger.warning(
                    '@@process@@ %s: alias=%s, pid=%s, processPath=%s: processes are '
                    'specified by pid, the processPath setting is ignored'
                    % (self.cubeName, self.alias, self.pids, self.processPath))
                self.processPath = ''
        self.postInit()
        return True

    def toString(self):
        s = ''
        s += 'keys=%s\n' % ','.join(self.keys)
        s += 'alias=%s\n' % ','.join(self.alias)
        s += 'pids=%s\n' % ','.join(self.pids)
        s += 'regPattern=%s\n' % self.regPattern
        s += 'processPath=%s\n' % self.processPath
        s += 'timeInterval=%s\n' % self.timeInterval
        s += 'keyAlarmers='
        for key, alarmers in self.keyAlarmers.items():
            s += '%s:[' % key
            s += ','.join([alarmer.toString() for alarmer in alarmers])
            s += '],'
        s = s[0:-1] + '\n'
        return s

    # initialisation tasks run after the configuration check has passed
    def postInit(self):
        # Compute total_mem. Every Process instance computes it once; the cost
        # is low, so it is neither made global nor computed by Collect for now.
        meminfo = os.popen(
            "cat /proc/meminfo |grep 'MemTotal'|awk '{print $2,$3}'",
            "r").read().split()
        self.totalMem = float(meminfo[0])
        unit = meminfo[1].lower()
        if unit == 'kb':
            self.totalMem = self.totalMem / 1024
        # the next two cases should not occur in practice, but better safe than sorry :)
        elif unit == 'mb':
            pass
        elif unit == 'gb':
            self.totalMem = self.totalMem * 1024
        else:
            self.logger.info('@@process@@ %s: %s cannot get total mem'
                             % (self.cubeName, self.alias))
            exit(-1)
        # current user name
        self.curUser = commands.getoutput('whoami').strip()
        # choose how the pids are obtained
        if len(self.pids) != 0:
            self.getAllPid = self.getPid
        elif self.regPattern != '':
            if self.processPath != '':
                self.getAllPid = self.getPidByRegAndPath
            else:
                self.getAllPid = self.getPidByReg
        else:
            self.getAllPid = self.getPidByPath

    def monitor(self):
        while True:
            currentTime = int(time.time())
            currentTime = currentTime - currentTime % self.timeInterval
            processDatas = self.getAllProcessData()
            if len(processDatas) > 0:
                # add timestamp
                insertData = []
                for data in processDatas:
                    insertData.append([currentTime, data[0], data[1]])
                self.logger.info('@@processHandler@@ %s: result=%s'
                                 % (self.cubeName, insertData))
                self.alarm(insertData)
                # insert, the dbop is shared with logHandler
                if not self.debug:
                    self.dbop.insert(insertData)
                    self.logger.info(
                        '@@processHandler@@ %s: result inserted to database'
                        % self.cubeName)
            else:
                self.logger.info('%s: @@processHandler@@ no process data collected'
                                 % self.cubeName)
            endTime = int(time.time())
            sleepTime = self.timeInterval - (endTime - currentTime)
            if sleepTime <= 0:
                continue
            self.logger.info(
                '@@processHandler@@ %s: sleep, will wake up in %ss'
                % (self.cubeName, sleepTime))
            time.sleep(sleepTime)

    def alarm(self, result):
        # the process data set is small, so a simple nested scan is fine
        for key, alarmers in self.keyAlarmers.items():
            # key applies to all aliases
            if key in self.keys:
                for res in result:
                    if key in res[1]:
                        for alarmer in alarmers:
                            alarmer.alarm(res[2], self.cubeName, key)
            # key applies to a single alias only
            else:
                # legacy convention from the zhangb frontend: pid related keys
                # have the form alias + '.p' + key
                for res in result:
                    if key.replace(key[key.rfind('.')], '.p') == res[1]:
                        for alarmer in alarmers:
                            alarmer.alarm(res[2], self.cubeName, key)

    def getAllProcessData(self):
        if len(self.pids) != 0:
            pids = self.getAllPid()
        elif self.validOldPids():
            pids = self.oldPids
        else:
            self.refreshPidInfo()
            pids = self.getAllPid()
        if len(pids) > len(self.alias):
            self.logger.info(
                '@@process@@ %s: alias=%s, configured process count (%s) < detected count (%s), '
                'skipping process collection for this round'
                % (self.cubeName, self.alias, len(self.alias), len(pids)))
            return []
        if len(pids) < len(self.alias):
            self.logger.warning(
                '@@process@@ %s: alias=%s detected process count (%s) < configured count (%s)'
                % (self.cubeName, self.alias, len(pids), len(self.alias)))
        if len(pids) == 0:
            self.logger.warning(
                '@@process@@ %s: alias=%s no process detected, skipping process collection for this round'
                % (self.cubeName, self.alias))
            return []
        if len(self.oldPids) == len(pids):
            for i in range(len(pids)):
                if pids[i] != self.oldPids[i]:
                    self.logger.info(
                        '@@process@@ %s: pid changed for alias=%s: %s -> %s'
                        % (self.cubeName, self.alias[i], self.oldPids[i], pids[i]))
        self.oldPids = pids
        # collect the data of the configured processes
        memAndCpuData = self.getMemAndCpuData(pids)
        fdsData = self.getFdsData(pids)
        netLinksData = self.getNetLinksData(pids)
        threadsData = self.getThreadsData(pids)
        return memAndCpuData + fdsData + netLinksData + threadsData

    def validOldPids(self):
        '''Are all previously seen processes still alive?'''
        if len(self.oldPids) == 0:
            self.logger.debug('@@process@@ %s: alias=%s, valid=False'
                              % (self.cubeName, self.alias))
            return False
        for pid in self.oldPids:
            # check whether the process still exists
            try:
                os.kill(int(pid), 0)
            except OSError:
                self.logger.debug(
                    '@@process@@ %s: alias=%s, pid=%s, valid=False'
                    % (self.cubeName, self.alias, pid))
                return False
        self.logger.debug('@@process@@ %s: alias=%s, valid=True'
                          % (self.cubeName, self.alias))
        return True

    def refreshPidInfo(self):
        self.currentPidCmdPath = []
        # Shell pipeline that lists pid, cmd and cwd for the current user's
        # processes (run it to see the output format):
        # ps x --columns=100000 | sed -r 's/^\s*(.*)/\1/' | grep -P "^\s*\d" | grep -v -P "\d+:\d+\s+(ps|grep|sed|ssh|-bash|ls|sh)" | sed -r 's/^([0-9]*).*[0-9]+:[0-9]+\s+(.*)/echo -ne "\1\\\\t\2\\\\t";ls -l \/proc\/\1\/cwd/' | grep -v columns=100000 | sh | grep -P "^\d" | sed -r "s/([0-9]*)\t(.*)\t.*-> (.*)/\1\t\2\t\3/"
        # 1. ps x (could be switched to ps -xo). Note: ps truncates its output; a width of 100000
        #    covers common cases, while the OS default (page length, usually 4096) may cut off long commands.
        # 2. drop lines that do not start with a process entry
        # 3. drop the shell's own processes. Note: sh, top and similar are filtered out, since they are
        #    usually not what this monitor is targeting
        # 4. extract the pid and the command
        # 5. ls -l /proc/<pid>/cwd gives the process cwd. Note: it may fail with "Permission denied",
        #    so the output has to be filtered below
        # 6. the escaping is painful: regex + sed (-r helps) + echo -ne + python + ...
        cmd = "ps x --columns=100000 | sed -r \"s/^\s*(.*)/\\1/\" |grep -P \"^\s*\d\" | grep -v -P \"\d+:\d+\s+(ps|grep|sed|ssh|top|tail|less|tail|more|-bash|ls|sh|vim?\s)\" | sed -r 's/^([0-9]*).*[0-9]+:[0-9]+\s+(.*)/echo -ne \"\\1\\\\t\\2\\\\t\";ls -l \/proc\/\\1\/cwd/' | grep -v columns=100000 | sh | grep -P \"^\d\" | sed -r \"s/([0-9]*)\\\\t(.*)\\t.*-> (.*)/\\1\\\\t\\2\\\\t\\3/\""
        lines = subprocess.Popen(
            args=cmd,
            shell=True,
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE).communicate()[0].splitlines()
        for line in lines:
            if 'Permission denied' in line:
                continue
            datas = line.split()
            if len(datas) < 3:
                continue
            if len(datas) == 3:
                self.currentPidCmdPath.append([datas[0], datas[1], datas[2]])
            else:
                self.currentPidCmdPath.append(
                    [datas[0], ' '.join(datas[1:-1]), datas[-1]])
        self.logger.info(
            '@@process@@ %s: alias=%s, update user\'s process info successfully, pidCmdPath.info=%s'
            % (self.cubeName, self.alias, self.currentPidCmdPath))

    def getPid(self):
        return self.pids

    def getPidByReg(self):
        pids = []
        for pid, cmd, path in self.currentPidCmdPath:
            if len(re.findall(self.regPattern, cmd)) != 0:
                pids.append(pid)
        self.logger.info('@@process@@ %s: alias=%s, getPidByRegPattern=%s'
                         % (self.cubeName, self.alias, pids))
        return pids

    def getPidByPath(self):
        pids = []
        for pid, cmd, path in self.currentPidCmdPath:
            if path == '/' and self.processPath != '/':
                continue
            if self.processPath.strip('/') in path.strip('/'):
                pids.append(pid)
        self.logger.info('@@process@@ %s: alias=%s, getPidByPath=%s'
                         % (self.cubeName, self.alias, pids))
        return pids

    def getPidByRegAndPath(self):
        regPid = self.getPidByReg()
        pathPid = self.getPidByPath()
        pids = []
        for pid in regPid:
            if pid in pathPid:
                pids.append(pid)
        return pids

    def getMemAndCpuData(self, currentPids):
        memAndCpuData = []
        cmd = "top -n1 -b -p %s | tail -n %s" % (
            ','.join(currentPids), len(currentPids) + 1
        ) + " | awk '{printf(\"%s %s %s\\n\",$1,$9,$10);}' "
        self.logger.debug('@@process@@ %s: cmd=%s' % (self.cubeName, cmd))
        results = subprocess.Popen(
            args=cmd, shell=True,
            stdout=subprocess.PIPE).communicate()[0].strip().splitlines()
        for result in results:
            lineinfos = result.split()
            # Each line should normally contain only digits, dots and whitespace.
            # If a process has been killed, len(lineinfos) may not be 3.
            if result.upper() != result.lower() or len(lineinfos) != 3:
                self.logger.warning(
                    '@@process@@ %s: alias=%s: unexpected process info: result=%s'
                    % (self.cubeName, self.alias, result))
                break
            memAndCpuData.append([
                self.alias[currentPids.index(lineinfos[0])] + '.pcpu',
                float(lineinfos[1])
            ])
            memAndCpuData.append([
                self.alias[currentPids.index(lineinfos[0])] + '.pmem',
                float(lineinfos[2]) * self.totalMem / 100
            ])
        self.logger.info('@@process@@ %s: alias=%s, memAndCpuData=%s'
                         % (self.cubeName, self.alias, memAndCpuData))
        return memAndCpuData

    def getFdsData(self, pids):
        fdsData = []
        if 'fds' not in self.keys:
            return fdsData
        for pid in pids:
            cmd = 'ls -l /proc/%s/fd | wc -l' % pid
            self.logger.debug('@@process@@ %s: cmd=%s' % (self.cubeName, cmd))
            res = subprocess.Popen(
                args=cmd,
                shell=True,
                stdout=subprocess.PIPE,
                stderr=subprocess.PIPE).communicate()[0].splitlines()[0].split()
            if len(res) == 0 or not res[0].isdigit():
                continue
            fdsData.append(
                [self.alias[pids.index(pid)] + '.pfds', int(res[0])])
        return fdsData

    def getNetLinksData(self, pids):
        netLinksData = []
        if 'netLinks' not in self.keys:
            return netLinksData
        for pid in pids:
            cmd = 'netstat -tnp | grep %s | grep ESTABLISHED | wc -l' % pid
            self.logger.debug('@@process@@ %s: cmd=%s' % (self.cubeName, cmd))
            res = subprocess.Popen(
                args=cmd,
                shell=True,
                stdout=subprocess.PIPE,
                stderr=subprocess.PIPE).communicate()[0].splitlines()[0].split()
            if len(res) == 0 or not res[0].isdigit():
                continue
            netLinksData.append(
                [self.alias[pids.index(pid)] + '.pnetLinks', int(res[0])])
        return netLinksData

    def getThreadsData(self, pids):
        threadsData = []
        if 'threads' not in self.keys:
            return threadsData
        for pid in pids:
            cmd = 'pstree -p %s | wc -l' % pid
            self.logger.debug('@@process@@ %s: cmd=%s' % (self.cubeName, cmd))
            res = subprocess.Popen(
                args=cmd,
                shell=True,
                stdout=subprocess.PIPE,
                stderr=subprocess.PIPE).communicate()[0].splitlines()[0].split()
            if len(res) == 0 or not res[0].isdigit():
                continue
            threadsData.append(
                [self.alias[pids.index(pid)] + '.pthreads', int(res[0])])
        return threadsData

    def getOtherUserMemCpu(self):
        '''Collect cpu and memory usage of other users' processes.'''
        otheruserinfos = []
        if not self.curUser:
            return otheruserinfos
        cmd = "top -n1 b | grep -e \"^ *[0-9]\+\" | grep -v %s" % (
            self.curUser) + " | awk '{printf(\"%s %s %s\\n\",$2,$9,$10);}' "
        self.logger.debug('@@process@@ %s: cmd=%s' % (self.cubeName, cmd))
        results = subprocess.Popen(
            args=cmd, shell=True,
            stdout=subprocess.PIPE).communicate()[0].strip().splitlines()
        otheruserinfo = {}
        for result in results:
            lineinfos = result.split()
            # If a process has been killed, len(lineinfos) may not be 3.
            if len(lineinfos) != 3:
                continue
            try:
                username = lineinfos[0]
                cpurate = float(lineinfos[1])
                memrate = float(lineinfos[2])
                if username not in otheruserinfo:
                    otheruserinfo[username] = {
                        'user_%s.pall' % (username): cpurate + memrate,
                        'user_%s.pcpu' % (username): cpurate,
                        'user_%s.pmem' % (username): memrate * self.totalMem / 100
                    }
                else:
                    otheruserinfo[username]['user_%s.pall' % (username)] += (cpurate + memrate)
                    otheruserinfo[username]['user_%s.pcpu' % (username)] += cpurate
                    otheruserinfo[username]['user_%s.pmem' % (username)] += (memrate * self.totalMem / 100)
            except ValueError:
                pass
        # sort the users by their combined cpu + memory share, descending
        otheruserinfolist = sorted(
            otheruserinfo.iteritems(),
            key=lambda (username, uservalue): uservalue['user_%s.pall' % (username)],
            reverse=True)
        otheruserinfolist = otheruserinfolist[:self.topN]
        for username, uservalue in otheruserinfolist:
            otheruserinfos.append([
                'user_%s.pcpu' % (username),
                uservalue['user_%s.pcpu' % (username)]
            ])
            otheruserinfos.append([
                'user_%s.pmem' % (username),
                uservalue['user_%s.pmem' % (username)]
            ])
        self.logger.info(
            '@@process@@ %s: other users\' process info, memAndCpuData=%s'
            % (self.cubeName, otheruserinfos))
        return otheruserinfos
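# ---------------------------------------------------------------------------
# Illustration only (not part of the original module): every collector above
# blocks forever in monitor(), so the caller (a Collect-style driver that is
# not shown in this file) presumably runs each validated collector in its own
# thread, roughly as in the hypothetical sketch below.
# ---------------------------------------------------------------------------
def startCollectors(collectors):
    import threading
    threads = []
    for collector in collectors:
        # skip collectors whose configuration does not validate
        if not collector.validate():
            continue
        t = threading.Thread(target=collector.monitor)
        t.setDaemon(True)
        t.start()
        threads.append(t)
    return threads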