def isAlive(self):
    """Report whether ``self.getStatus()`` can be called successfully.

    Returns:
        The string 'true' when ``getStatus()`` does not raise, otherwise
        the string "false" (the error is logged first). Note that the
        result is a string, not a bool.
    """
    try:
        self.getStatus()
    except Exception as e:
        AgentLog.error('Mysql.isAlive failed, %s' % e)
        return "false"
    return 'true'
def check(self):
    """Make sure no other agent process is already running.

    Detection method:
      1. If the pid file does not exist, no agent is running and the new
         process may start.
      2. If the pid file exists, the pid stored in it is read and
         ``self.isAlive(pid)`` is asked whether that process exists.
         If it does, the new process must not run and exits. If it does
         not, the pid file is presumably left over from an abnormal exit
         (e.g. CTRL + C); the stale file is removed and the new process
         may start.

    Returns:
        None when the new process is allowed to start.

    Raises:
        SystemExit: (exit code 1) when another agent is considered to be
            running. This is also the outcome when the pid file cannot be
            read or parsed, or when a stale pid file cannot be removed.
    """
    if not os.path.isfile(self.__pidFile__):
        return
    try:
        # The context manager closes the handle even if read() fails.
        with open(self.__pidFile__, 'rt') as pidHandle:
            data = pidHandle.read()
        # pid number stored in the file
        pid = int(data)
        if not self.isAlive(pid):
            try:
                os.remove(self.__pidFile__)
                return
            except Exception:
                AgentLog.warning("remove pid file {0} failed.".format(
                    self.__pidFile__))
    except Exception:
        # Best effort: an unreadable or malformed pid file is treated the
        # same as "agent already running" below. ``Exception`` (rather
        # than a bare except) lets KeyboardInterrupt/SystemExit propagate.
        pass
    AgentLog.info("agent exist, only one process allow running")
    sys.exit(1)
def doWork(self):
    """Collect data from every monitor, merge it and send it as one request.

    For each entry of ``self.montorInstances`` the monitor's ``getData()``
    is called:
      * ``None`` aborts the whole round (a warning is logged, nothing is
        sent);
      * an empty result is skipped with a warning;
      * anything else is merged into the outgoing content (later monitors
        overwrite equal keys of earlier ones).

    The merged content is wrapped in an ``AgentRequest``, passed through
    every filter in ``self.filters`` and sent through ``self.sender``.
    The very first round (``self.isFirst``) is built but not sent; it only
    clears the flag.
    """
    content = {}
    for name, monitor in self.montorInstances.iteritems():
        data = monitor.getData()
        if data is None:
            AgentLog.warning(
                "MonitorSenderThread.doWork can not get the %s Info,size is 0, now skip this time"
                % name)
            return
        if len(data) == 0:
            AgentLog.warning(
                "MonitorSenderThread.doWork can not get the %s Info,size is 0"
                % name)
            continue
        # update() merges without requiring string keys, unlike
        # dict(content, **data).
        content.update(data)

    request = AgentRequest(content=content)
    request.setId(self.instanceId, self.groupId, self.clusterId)
    # Each filter receives the content produced by the previous one.
    for contentFilter in self.filters:
        request.setContent(contentFilter.filter(request.getContent()))

    if self.isFirst:
        self.isFirst = False
    else:
        self.sender.send(request.getRequest())
def getSocket(self):
    """Return the "socket" value read from ``self.configFile``.

    When ``self.getValue`` yields an empty string, a warning is logged and
    the default "/tmp/mysql.sock" is returned instead.
    """
    configured = self.getValue(self.configFile, "socket")
    if configured != "":
        return configured
    AgentLog.warning(
        'Mysql can not get socket value from config, use /tmp/mysql.sock'
    )
    return "/tmp/mysql.sock"
def getDataDir(self):
    """Return the data directory read from ``self.configFile``.

    The "datadir" key is tried first. When it yields an empty string, a
    warning is logged and the value of "innodb_data_home_dir" is returned
    instead (whatever ``self.getValue`` yields for it, possibly empty).
    """
    configured = self.getValue(self.configFile, "datadir")
    if configured != "":
        return configured
    AgentLog.warning(
        'Mysql can not get datadir value from config, start get innodb_data_home_dir'
    )
    return self.getValue(self.configFile, "innodb_data_home_dir")
def reconnect(self):
    """Re-establish the connection and re-declare/re-bind the queue.

    After ``RabbitMQWrapper.reconnect`` has run, the queue is declared
    again: for an exclusive receiver a new exclusive queue is declared and
    its name stored in ``self.queue``; otherwise the queue named by
    ``self.queue`` is declared. The queue is then bound to
    ``self.exchange`` with ``self.routing_key``.
    """
    AgentLog.warning("receiver %s disconnectted, re-connect..." % self.name)
    RabbitMQWrapper.reconnect(self)

    if self.exclusive:
        declared = self.channel.queue_declare(exclusive=True)
        boundQueue = declared.method.queue
        self.queue = boundQueue
    else:
        boundQueue = self.queue
        self.channel.queue_declare(queue=boundQueue)

    self.channel.queue_bind(exchange=self.exchange,
                            queue=boundQueue,
                            routing_key=self.routing_key)
def doWork(self):
    """Validate the command, then run ``self.handle()``.

    Returns:
        ``self.responseFailed(errMsg)`` when ``self.check()`` reports a
        failure, or ``self.responseFailed(...)`` built from
        ``self.errorHandler`` when ``handle()`` raises.
    """
    # NOTE(review): as visible here the success path does not forward the
    # result of handle(); confirm against the complete source.
    ok, errMsg = self.check()
    if not ok:
        return self.responseFailed(errMsg)

    try:
        ret = self.handle()
    except Exception as e:
        AgentLog.error('MysqlShell.doWork error :%s' % e)
        ret = self.errorHandler("%s" % e)
        return self.responseFailed(ret)
def __init__(self,
             name_="RabbitMQSender",
             host_="localhost",
             port_=5672,
             exchange_='exchange',
             routing_key_='key'):
    """Initialise the sender through ``RabbitMQWrapper`` and log its settings.

    Args:
        name_: name used in log messages.
        host_: broker host.
        port_: broker port (formatted with ``%d`` in the log line).
        exchange_: exchange name.
        routing_key_: routing key.
    """
    # The fifth wrapper argument is passed as an empty string here
    # (presumably the queue name, which a sender does not need -- confirm
    # against RabbitMQWrapper).
    RabbitMQWrapper.__init__(self, name_, host_, port_, exchange_, '',
                             routing_key_)
    settings = (name_, host_, port_, exchange_, routing_key_)
    AgentLog.info(
        "create sender %s {host:%s, port:%d, exchange:%s, routing_key:%s}"
        % settings)
def stop(self):
    """Remove the pid file; called when the process exits.

    Raises:
        AgentException: when the pid file cannot be removed (a warning is
            logged first).
    """
    AgentLog.info("RDS Agent start to exit")
    try:
        os.remove(self.__pidFile__)
    except Exception:
        # ``Exception`` instead of a bare except, so KeyboardInterrupt and
        # SystemExit are not converted into AgentException.
        message = "remove pid file {0} error".format(self.__pidFile__)
        AgentLog.warning(message)
        raise AgentException(message)
def __init__(self, attr):
    """Initialise the Mysql object from its configuration mapping.

    Args:
        attr: mapping that must contain the keys 'home', 'config' and
            'backupdir'; it is also handed to ``Database.__init__``.

    Raises:
        AgentAtrrException: when a required key is missing or when one of
            the derived lookups (socket, data dir, binlog name) raises
            AgentAtrrException.
    """
    AgentLog.info('Start to init Mysql from config')
    Database.__init__(self, attr)
    # (attribute name, key in attr), assigned in this order.
    requiredKeys = (
        ('installPath', 'home'),
        ('configFile', 'config'),
        ('backupDir', 'backupdir'),
    )
    try:
        for attribute, key in requiredKeys:
            setattr(self, attribute, attr[key])
        # These lookups use self.configFile, so they come after it is set.
        self.socket = self.getSocket()
        self.dataDir = self.getDataDir()
        self.binlogName = self.getBinlogName()
    except (KeyError, AgentAtrrException) as e:
        raise AgentAtrrException('Mysql init KeyError:%s' % e)
def doWork(self):
    """Validate the command and run a database backup.

    The timestamp of ``self.response`` (format '%Y-%m-%d %H:%M:%S') is
    converted to epoch seconds with ``time.mktime`` and passed to
    ``self.db.backup`` together with the backup type, binlog file and
    backup tool. A failure of the backup call is logged and recorded in
    ``isSuccess`` / ``errMsg``.

    Returns:
        ``self.responseFailed(errMsg)`` when ``self.check()`` fails.
    """
    isSuccess, errMsg = self.check()
    if not isSuccess:
        return self.responseFailed(errMsg)

    parsed = time.strptime(self.response.getTimeStamp(), '%Y-%m-%d %H:%M:%S')
    timestamp = time.mktime(parsed)
    try:
        outcome = self.db.backup(timestamp, self.backupType, self.binlogFile,
                                 self.backupTool)
        isSuccess, masterFile, masterPos, errMsg, fileSize = outcome
    except Exception as e:
        isSuccess = False
        errMsg = 'Error %s' % e
        AgentLog.error(errMsg)
def getInstanceInfo(self, instanceId):
    """Collect basic facts about the database instance for the AWR report.

    Args:
        instanceId: not used by this method.

    Returns:
        A dict with the keys 'DBName', 'IP', 'Port' (all None), 'version',
        'schema' (number of rows returned by ``show databases``), 'Uptime',
        'Engine' ('Innodb') and 'NbCluster' ('NO'); or None when any query
        fails (the error is logged).
    """
    try:
        version = self.db.execSql('select version()')[0][0]
        schemaCount = len(self.db.execSql('show databases'))
        # SHOW STATUS rows are (Variable_name, Value): the value is in
        # column 1. Column 0 would just be the literal name "Uptime".
        uptime = self.db.execSql('show status like "Uptime"')[0][1]
    except Exception as e:
        AgentLog.error("AWR getInstanceInfo error:%s" % e)
        return None

    return {
        'DBName': None,
        'IP': None,
        'Port': None,
        'version': version,
        'schema': schemaCount,
        'Uptime': uptime,
        'Engine': 'Innodb',
        'NbCluster': 'NO',
    }
def start(self):
    """Write the pid of the current process into the pid file.

    Raises:
        AgentException: when the pid file cannot be opened or written
            (an error is logged first).
    """
    AgentLog.info("RDS Agent start to run")
    try:
        # The context manager closes the handle even if write() fails.
        with open(self.__pidFile__, 'wt') as pidHandle:
            pidHandle.write(str(os.getpid()))
    except Exception:
        # ``Exception`` instead of a bare except, so KeyboardInterrupt and
        # SystemExit are not converted into AgentException.
        AgentLog.error("open pid file {0} error, start failed".format(
            self.__pidFile__))
        raise AgentException("open pid file {0} failed".format(
            self.__pidFile__))
def __init__(self,
             name_='RabbitMQReceiver',
             host_='localhost',
             port_=5672,
             exchange_='exchange',
             queue_='',
             routing_key_='key'):
    """Initialise the receiver, declare its queue and bind it.

    Args:
        name_: name used in log messages.
        host_: broker host.
        port_: broker port (formatted with ``%d`` in the log line).
        exchange_: exchange the queue is bound to.
        queue_: queue name; an empty string requests a new exclusive
            queue whose generated name is stored in ``self.queue``.
        routing_key_: routing key used for the binding.
    """
    RabbitMQWrapper.__init__(self, name_, host_, port_, exchange_, queue_,
                             routing_key_)
    self.exclusive = False
    if queue_ != "":
        self.channel.queue_declare(queue=queue_)
    else:
        # No name given: declare an exclusive queue and remember the name
        # reported back by the declaration.
        declared = self.channel.queue_declare(exclusive=True)
        queue_ = declared.method.queue
        self.queue = queue_
        self.exclusive = True
    self.channel.queue_bind(exchange=exchange_,
                            queue=queue_,
                            routing_key=routing_key_)
    # Set later by whoever starts receiving.
    self.doWork = None
    settings = (name_, host_, port_, exchange_, routing_key_)
    AgentLog.info(
        "create receiver %s {host:%s, port:%d, exchange:%s, routing_key:%s}"
        % settings)
def __init__(self, threadName, activeReportInterval=5.0, loopInterval=60):
    """Set up the thread's name, intervals and wake-up event.

    Args:
        threadName: name given to the underlying ``threading.Thread``.
        activeReportInterval: interval at which the thread reports its
            current state.
        loopInterval: interval between two loop iterations.
    """
    threading.Thread.__init__(self, name=threadName)
    self.isRunning = True
    # Interval between two loop iterations of the thread.
    self.loopInterval = loopInterval
    # The thread has to report its current state every so often;
    # activeReportInterval is that reporting interval.
    self.activeReportInterval = activeReportInterval
    self.lastReportTime = 0
    # Most threads are looping threads that run once per interval and
    # wait in between with event.wait(). The advantage of event.wait()
    # over sleep is that the wait can be cancelled directly with set()
    # when the thread exits, without waiting for a sleep to finish.
    self.event = threading.Event()
    self.event.clear()
    createdMessage = "thread [{0}] created, report interval: {1}, loop interval: {2}".format(
        self.getName(), activeReportInterval, loopInterval)
    AgentLog.info(createdMessage)
class RabbitMQSender(RabbitMQWrapper):
    """Publishes messages to an exchange with a fixed routing key."""

    def __init__(self,
                 name_="RabbitMQSender",
                 host_="localhost",
                 port_=5672,
                 exchange_='exchange',
                 routing_key_='key'):
        """Initialise through ``RabbitMQWrapper`` and log the settings."""
        RabbitMQWrapper.__init__(self, name_, host_, port_, exchange_, '',
                                 routing_key_)
        settings = (name_, host_, port_, exchange_, routing_key_)
        AgentLog.info(
            "create sender %s {host:%s, port:%d, exchange:%s, routing_key:%s}"
            % settings)

    def send(self, msg):
        """Publish ``msg``; reconnect and retry once if the connection closed.

        Any other exception raised by the first publish attempt is logged
        and swallowed. An exception raised by the retry propagates.
        """

        def publish():
            # Attributes are read at call time, so the retry uses the
            # channel that exists after reconnect().
            self.channel.basic_publish(exchange=self.exchange,
                                       routing_key=self.routing_key,
                                       body=msg)

        try:
            publish()
        except ConnectionClosed:
            self.reconnect()
            publish()
        except Exception:
            AgentLog.error("{0} send msg:{1} error".format(self.name, msg))
def upload_ftp(self, params, srcDir):
    """Start uploading backup data to an FTP server.

    Args:
        params: mapping expected to provide 'remoteHost', 'remotePort',
            'remoteUser', 'remotePassword', 'backupPath' and 'uploadLimit'.
        srcDir: source directory (not used in the part visible here).

    Returns:
        ``(False, message)`` when a required parameter is missing or the
        connection to the remote host fails.
    """
    remoteHost = params.get('remoteHost')
    remotePort = params.get('remotePort')
    remoteUser = params.get('remoteUser')
    remotePasswd = params.get('remotePassword')
    backupPath = params.get('backupPath')
    uploadLimit = params.get('uploadLimit')

    required = (remoteHost, remotePort, remoteUser, remotePasswd, backupPath,
                uploadLimit)
    if any(value is None for value in required):
        return False, 'remote host information errors'

    AgentLog.info("start upload backup data to ftp server")
    try:
        ftp = FTP()
        # Third argument is the connect timeout passed to FTP.connect.
        ftp.connect(remoteHost, int(remotePort), 60)
    except Exception as e:
        failure = 'can not connect to remote host: %s with port: %s, error: %s' % (
            remoteHost, remotePort, e)
        return False, failure
def setUp(self):
    """Initialise logging and install a valid Mysql database configuration.

    The project root is taken from the current working directory, which
    must contain the Windows-style path segment ``IMAgent\\test``;
    otherwise the regex yields no match and indexing raises IndexError.
    """
    print('init logging')
    projectRoot = re.findall('(.*?)IMAgent\\\\test', os.getcwd())[0]
    self.configPath = projectRoot + 'IMAgent\\test\\conf\\'
    AgentLog.init(self.configPath + 'logging.cnf')
    # Valid configuration
    mysqlAttr = {
        'user': '',
        'passwd': '',
        'home': '',
        'configFile': 'test.cnf',
        'backupDir': ''
    }
    AgentConfig.database = {'name': 'Mysql', 'attr': mysqlAttr}
def parseMsg(self, response):
    """Turn a received message into a command and queue it for execution.

    The command class is looked up through ``CommandFactory`` by DB type
    and action and instantiated with ``self.context`` and the response.
    If that raises AttributeError, an error request ('command is not
    exist') is sent back through ``self.sender`` instead. Otherwise the
    command is wrapped in thread-pool work requests for ``doWork`` and put
    on ``SyncCMDReceiver.pool``. Any other exception is logged.
    """
    AgentLog.debug("receive msg: {0}".format(response.body))
    try:
        try:
            commandClass = CommandFactory.getCommand(response.getDBType(),
                                                     response.getAction())
            command = commandClass(self.context, response)
        except AttributeError:
            request = self.createErrorRequest(response,
                                              'command is not exist')
            self.sender.send(request.getRequest())
            return

        # One work item: no positional args, command and sender as kwargs.
        workItems = [((), {'command': command, 'sender': self.sender})]
        for req in threadpool.makeRequests(doWork, workItems):
            SyncCMDReceiver.pool.putRequest(req)
    except Exception as e:
        AgentLog.error('syncCMDReceiver.parseMsg error %s' % e)
def createMonitors(self, context, dbName, monitorList):
    """Import and instantiate the monitor classes named in ``monitorList``.

    For every name ``X`` the module ``imagent.<dbName>.monitor.X`` is
    imported and its attribute ``X`` is called with ``context``.

    Args:
        context: passed to every monitor constructor.
        dbName: package name below ``imagent``.
        monitorList: list of monitor names (strings).

    Returns:
        A dict mapping each monitor name to its instance.

    Raises:
        AgentAtrrException: when ``monitorList`` is not a list, or when an
            AttributeError occurs while resolving or constructing a monitor
            (including a module that lacks the expected class).
        ImportError: when a monitor module cannot be imported.
    """
    if not isinstance(monitorList, list):
        raise AgentAtrrException(
            'MonitorManager.createMonitors invalid monitorList')
    AgentLog.info('start to create monitors :%s' % (','.join(monitorList)))

    # importlib resolves the module by name without executing a string
    # built from configuration values (previously exec + eval).
    import importlib

    try:
        monitorInstances = {}
        for monitor in monitorList:
            modulePath = 'imagent.' + dbName + '.monitor.' + monitor
            module = importlib.import_module(modulePath)
            monitorInstances[monitor] = getattr(module, monitor)(context)
        return monitorInstances
    except AttributeError as e:
        raise AgentAtrrException(
            'MonitorManager.createMonitors can not get monitor :%s' % e)
def doWork(self):
    """Refresh ``self.dataDict`` with the top-N statement digests.

    For each ordering column in ``items`` the top ``self.topNum`` rows of
    ``performance_schema.events_statements_summary_by_digest`` are
    queried, selecting SCHEMA_NAME, DIGEST and ``self.fields``. Rows are
    de-duplicated on schema + digest (the first occurrence wins). Each
    field value is then stored under the key
    ``"<schema>|<digest>|<field>"``, with 'null' standing in for a None
    schema. The finished dict replaces ``self.dataDict`` while
    ``self.lock`` is held.

    When no rows are found a warning is logged and ``self.dataDict`` is
    left untouched. Exceptions from the database layer propagate to the
    caller with their original traceback.
    """
    items = [
        'COUNT_STAR', 'SUM_TIMER_WAIT', 'MIN_TIMER_WAIT', 'AVG_TIMER_WAIT',
        'MAX_TIMER_WAIT', 'SUM_LOCK_TIME', 'SUM_ERRORS', 'SUM_WARNINGS',
        'SUM_ROWS_AFFECTED', 'SUM_ROWS_SENT', 'SUM_ROWS_EXAMINED',
        'SUM_CREATED_TMP_DISK_TABLES', 'SUM_CREATED_TMP_TABLES',
        'SUM_SELECT_FULL_JOIN', 'SUM_SELECT_FULL_RANGE_JOIN',
        'SUM_SELECT_RANGE', 'SUM_SELECT_RANGE_CHECK', 'SUM_SELECT_SCAN',
        'SUM_SORT_MERGE_PASSES', 'SUM_SORT_RANGE', 'SUM_SORT_ROWS',
        'SUM_SORT_SCAN', 'SUM_NO_INDEX_USED', 'SUM_NO_GOOD_INDEX_USED'
    ]
    # The query only differs in the ORDER BY column, so the rest is built
    # once outside the loop.
    queryHead = 'select SCHEMA_NAME,DIGEST,' + ','.join(self.fields) + \
        ' from performance_schema.events_statements_summary_by_digest order by '
    queryTail = ' desc limit ' + str(self.topNum)

    # Query the top-N rows for every ordering column, then merge them.
    result = {}
    for item in items:
        rows = self.dataBaseInstance.execSql(queryHead + item + queryTail)
        for row in rows:
            rowKey = str(row[0]) + row[1]
            if rowKey not in result:
                result[rowKey] = row

    if not result:
        AgentLog.warning('TopSQLMonitor.doWork can not get topsql info')
        return

    temp = {}
    for line in result.values():
        # Some rows have a None schema; substitute 'null' so the string
        # concatenation below works.
        schemaName = 'null' if line[0] is None else line[0]
        statusBase = schemaName + "|" + line[1]
        for index, val in enumerate(line[2:]):
            temp[statusBase + '|' + self.fields[index]] = val

    self.lock.acquire()
    try:
        self.dataDict = temp
    finally:
        self.lock.release()
def getCacheProfile(self, instanceId):
    """Collect InnoDB cache figures for the AWR report.

    Args:
        instanceId: not used by this method.

    Returns:
        A dict with
          * 'PoolSize': ``innodb_buffer_pool_size`` divided by 1024 twice
            (integer division),
          * 'RedologSize': ``innodb_log_file_size`` divided the same way,
          * 'Hint': ``Innodb_buffer_pool_reads / Innodb_data_reads`` as a
            float, or 0.0 when ``Innodb_data_reads`` is zero,
        or None when a query or conversion fails (the error is logged).
    """
    try:
        ret = {}
        poolBytes = int(self.db.execSql(
            "show variables like 'innodb_buffer_pool_size'")[0][1])
        ret['PoolSize'] = poolBytes // 1024 // 1024
        redoBytes = int(self.db.execSql(
            "show variables like 'innodb_log_file_size'")[0][1])
        ret['RedologSize'] = redoBytes // 1024 // 1024

        # Collect the two read counters used for the hint ratio; both
        # default to 0 when the server does not report them.
        hint = {'Innodb_buffer_pool_reads': 0, 'Innodb_data_reads': 0}
        for line in self.db.execSql("show status like 'innodb%_reads'"):
            if line[0] in hint:
                hint[line[0]] = line[1]

        poolReads = float(hint['Innodb_buffer_pool_reads'])
        dataReads = float(hint['Innodb_data_reads'])
        # With no data reads at all the ratio is undefined; report 0.0
        # rather than failing with ZeroDivisionError and losing the
        # whole profile.
        ret['Hint'] = poolReads / dataReads if dataReads else 0.0
        return ret
    except Exception as e:
        AgentLog.error("AWR getCacheProfile error:%s" % e)
def doWork(self):
    """Build the AWR report for the configured time window.

    Returns:
        * ``self.responseFailed(errMsg)`` when ``self.check()`` fails;
        * ``self.responseFailed('TimeError')`` when the end time lies
          before the start time;
        * ``self.responseOK({'awr': ...})`` on success;
        * None when any step raises (the error is logged).

    ``self.startTime`` and ``self.endTime`` are parsed with the format
    "%Y-%m-%d %H:%M".
    """
    isSuccess, errMsg = self.check()
    if not isSuccess:
        return self.responseFailed(errMsg)

    endTime = self.endTime
    startTime = self.startTime
    instanceID = self.instanceID
    timeFormat = "%Y-%m-%d %H:%M"
    try:
        # Instance parameters
        instanceData = self.getInstanceInfo(self.instanceID)

        # Time parameters
        endStamp = time.mktime(time.strptime(endTime, timeFormat))
        startStamp = time.mktime(time.strptime(startTime, timeFormat))
        elapsed = endStamp - startStamp
        if elapsed < 0:
            return self.responseFailed('TimeError')
        snapTime = {
            'BeginSnap': startTime,
            'EndSnap': endTime,
            'Elapsed': elapsed,
        }

        # Global variables
        showGlobal = self.db.execSql("show global variables")
        # SQL parameters
        sqlData = self.getSQL(instanceID)
        # Status parameters (the load profile is not collected;
        # 'loadProfile' is reported as None)
        cacheProfile = self.getCacheProfile(instanceID)
        waitEvent = self.getWaitEvent(instanceID)

        report = {
            'Instance': instanceData,
            'loadProfile': None,
            'SnapTime': snapTime,
            'CacheProfile': cacheProfile,
            'WaitEvent': waitEvent,
            'showGlobal': showGlobal,
            'sql': sqlData,
        }
        return self.responseOK({'awr': report})
    except Exception as e:
        AgentLog.error('AWR get, error:%s' % (e))
def upload_scp(self, params, srcDir):
    """Copy ``srcDir`` to a remote host with ``scp -r``.

    Args:
        params: mapping expected to provide 'remoteHost', 'remotePort',
            'remoteUser', 'remotePassword', 'backupPath' and 'uploadLimit'.
        srcDir: local directory to copy.

    Returns:
        ``(True, '')`` when the command produced no output, otherwise
        ``(False, output)``; ``(False, 'remote host information errors')``
        when a required parameter is missing.
    """
    remoteHost = params.get('remoteHost')
    remotePort = params.get('remotePort')
    remoteUser = params.get('remoteUser')
    remotePasswd = params.get('remotePassword')
    backupPath = params.get('backupPath')
    uploadLimit = params.get('uploadLimit')

    required = (remoteHost, remotePort, remoteUser, remotePasswd, backupPath,
                uploadLimit)
    if any(value is None for value in required):
        return False, 'remote host information errors'

    uploadLimit = long(uploadLimit)
    AgentLog.info("start upload backup data to remote server")
    # NOTE(review): the command is assembled by plain string formatting;
    # the parameter values are not quoted for the shell.
    if uploadLimit > 0:
        # The limit is multiplied by 8 before being passed to scp's -l.
        cmd = "scp -r -P %s -l %d %s %s@%s:%s" % (
            remotePort, uploadLimit * 8, srcDir, remoteUser, remoteHost,
            backupPath)
    else:
        cmd = "scp -r -P %s %s %s@%s:%s" % (
            remotePort, srcDir, remoteUser, remoteHost, backupPath)

    # Any output from the command is treated as a failure message.
    output = Util.popen(cmd).readlines()
    if len(output) != 0:
        return False, ' '.join(output)
    return True, ''
def main():
    """Entry point: load the configuration, start the agent and block.

    The agent manager is stopped exactly once on the way out, whether the
    loop is left through Ctrl-C or through any other exception (which
    still propagates after the cleanup).
    """
    agentManager = None
    try:
        # Parse the command line: config file location and whether to run
        # in the console.
        optParser = AgentOptionParser()
        cfgFile = Util.getRealPath(optParser.configFile)
        console = optParser.console

        # Load the configuration from the config file.
        userConfig, sysConfig = AgentConfigManager.initAllConfig(cfgFile)

        # In console mode run directly; otherwise run in the background
        # as a daemon.
        if not console:
            Daemon()
        else:
            userConfig.log['logSection'] = "debug"

        # Initialise the log handle; every log operation has to come
        # after this call.
        AgentLog.init(userConfig.log['logConfig'], userConfig.log['logSection'])

        agentManager = AgentManager(userConfig, sysConfig)
        agentManager.start()

        while True:
            Util.sleep(5)
    except KeyboardInterrupt:
        # Ctrl-C ends the loop; the manager is stopped in ``finally`` so
        # that stop() is not called twice.
        pass
    finally:
        if agentManager is not None:
            agentManager.stop()
class BaseThread(threading.Thread):
    """Looping worker thread that calls ``self.doWork()`` once per interval."""

    def __init__(self, threadName, activeReportInterval=5.0, loopInterval=60):
        """Set up the thread's name, intervals and wake-up event.

        Args:
            threadName: name given to the underlying ``threading.Thread``.
            activeReportInterval: interval at which the thread reports its
                current state.
            loopInterval: interval between two loop iterations.
        """
        threading.Thread.__init__(self, name=threadName)
        self.isRunning = True
        # Interval between two loop iterations of the thread.
        self.loopInterval = loopInterval
        # The thread has to report its current state every so often;
        # activeReportInterval is that reporting interval.
        self.activeReportInterval = activeReportInterval
        self.lastReportTime = 0
        # Most threads are looping threads that run once per interval and
        # wait in between with event.wait(). The advantage of event.wait()
        # over sleep is that the wait can be cancelled directly with set()
        # when the thread exits, without waiting for a sleep to finish.
        self.event = threading.Event()
        self.event.clear()
        createdMessage = "thread [{0}] created, report interval: {1}, loop interval: {2}".format(
            self.getName(), activeReportInterval, loopInterval)
        AgentLog.info(createdMessage)

    def run(self):
        """Call ``doWork()`` repeatedly until ``self.isRunning`` is false.

        Exceptions raised inside an iteration are logged with their
        traceback and do not end the loop. After each iteration the thread
        waits on ``self.event`` for at most ``self.loopInterval``.
        """
        AgentLog.info('thread {0} start running'.format(self.getName()))
        while self.isRunning:
            try:
                self.doWork()
                if self.lastReportTime == 0:
                    self.lastReportTime = int(time.time())
                # Current time
                currentTime = int(time.time())
                # When the report interval has elapsed, write an "alive"
                # line for this thread to the log.
                reportDue = currentTime >= self.lastReportTime + self.activeReportInterval
                if reportDue:
                    banner = ' thread:[{0}] is alive '.format(self.getName())
                    AgentLog.info(banner.ljust(45, ' ').center(75, '#'))
                    self.lastReportTime = currentTime
            except Exception:
                AgentLog.error(
                    "thread [{0}] raise Exception: {1}".format(
                        self.getName(), traceback.format_exc()))
            # Wait for the next run.
            self.event.wait(self.loopInterval)
        AgentLog.info('thread:{0} exit'.format(self.getName()))
def receive(self, doWork, ack=False):
    """Consume messages until interrupted or until a normal exit.

    Args:
        doWork: stored in ``self.doWork``.
        ack: passed to ``basic_consume`` as its ``no_ack`` argument.

    The consume loop ends on KeyboardInterrupt, or on ConnectionClosed
    when ``self.normalExit`` is set. A ConnectionClosed without normal
    exit, and any other exception, triggers ``self.reconnect()`` and
    another round.
    """
    self.doWork = doWork
    while True:
        try:
            self.channel.basic_consume(self.callback,
                                       queue=self.queue,
                                       no_ack=ack)
            self.channel.start_consuming()
        except KeyboardInterrupt:
            AgentLog.warning("receive KeyboardInterrupt error")
            break
        except ConnectionClosed as e:
            AgentLog.warning("receive ConnectionClosed error:%s" % e)
            if self.normalExit:
                break
            self.reconnect()
        except Exception as e:
            # Reconnect first, then log, as before.
            self.reconnect()
            AgentLog.warning("receive error: %s" % e)
def run(self):
    """Call ``doWork()`` repeatedly until ``self.isRunning`` is false.

    Exceptions raised inside an iteration are logged with their traceback
    and do not end the loop. After each iteration the thread waits on
    ``self.event`` for at most ``self.loopInterval``.
    """
    AgentLog.info('thread {0} start running'.format(self.getName()))
    while self.isRunning:
        try:
            self.doWork()
            if self.lastReportTime == 0:
                self.lastReportTime = int(time.time())
            # Current time
            currentTime = int(time.time())
            # When the report interval has elapsed, write an "alive" line
            # for this thread to the log.
            reportDue = currentTime >= self.lastReportTime + self.activeReportInterval
            if reportDue:
                banner = ' thread:[{0}] is alive '.format(self.getName())
                AgentLog.info(banner.ljust(45, ' ').center(75, '#'))
                self.lastReportTime = currentTime
        except Exception:
            AgentLog.error(
                "thread [{0}] raise Exception: {1}".format(
                    self.getName(), traceback.format_exc()))
        # Wait for the next run.
        self.event.wait(self.loopInterval)
class RabbitMQSender(RabbitMQWrapper):
    """Publishes messages to an exchange with a fixed routing key."""

    def __init__(self,
                 name_="RabbitMQSender",
                 host_="localhost",
                 port_=5672,
                 exchange_='exchange',
                 routing_key_='key'):
        """Initialise through ``RabbitMQWrapper`` and log the settings."""
        RabbitMQWrapper.__init__(self, name_, host_, port_, exchange_, '',
                                 routing_key_)
        settings = (name_, host_, port_, exchange_, routing_key_)
        AgentLog.info(
            "create sender %s {host:%s, port:%d, exchange:%s, routing_key:%s}"
            % settings)

    def send(self, msg):
        """Publish ``msg``; reconnect and retry once if the connection closed.

        Any other exception raised by the first publish attempt is logged
        and swallowed. An exception raised by the retry propagates. Unless
        an exception propagates, a debug line with at most the first 128
        characters of the message is written.
        """

        def publish():
            # Attributes are read at call time, so the retry uses the
            # channel that exists after reconnect().
            self.channel.basic_publish(exchange=self.exchange,
                                       routing_key=self.routing_key,
                                       body=msg)

        try:
            publish()
        except ConnectionClosed:
            self.reconnect()
            publish()
        except Exception:
            AgentLog.error("{0} send msg:{1} error".format(self.name, msg))

        # Slicing leaves messages of up to 128 characters unchanged, so a
        # single call covers both the long and the short case.
        AgentLog.debug("{0} send msg: {1}".format(self.name, msg[:128]))
if not Util.isExists(configFile): raise AgentFileException( 'Mysql can not find config File from path :%s' % configFile) try: with open(configFile, 'r') as f: for line in f: if line.find(key) != -1: key = (line.split('=')[0]).strip() if key[0] != '#': value = (line.split('=')[1]).strip() print value return value except IOError, e: raise AgentFileException( 'Mysql can not find config File from path :%s' % configFile) except Exception, e: AgentLog.warning( 'Mysql can not get Value from config, key: %s,configFile:%s' % (key, configFile)) return "" class SlowLogFile(DBFile): def getData(self): pass class ErrorLogFile(DBFile): def getData(self): pass