def aboutCross(self): ## 检查跨服连接状态, 1, 异常; 0, 正常; ## 750 是组队竞技场,战场,帮派战,751 是组队副本,襄阳城和跨服运镖 try: dbName = self.srvFlag + "_dbtx" conn = MySQLdb.connect(host=dbOneIp,port=db1Port,user=dbUser,passwd=dbPass,db=dbName,charset=dbChar) cursor = conn.cursor(cursorclass = MySQLdb.cursors.DictCursor) dSql = "select public_url from t_cross_copy_channel where channel=750 limit 1;" ySql = "select public_url from t_cross_copy_channel where channel=751 limit 1;" cursor.execute(dSql) pkRes = cursor.fetchone() ## pkUrl = cursor.fetchone()['public_url'] ## pkDomain, pkPort = pkUrl.split(":") cursor.execute(ySql) fbRes = cursor.fetchone() ## fbUrl = cursor.fetchone()['public_url'] ## fbDomain, fbPort = fbUrl.split(":") crossSwStat, fixSwStat = self.sDb.read(self.srvFlag, "crossAlarm"), self.sDb.read(self.srvFlag, "isFixing") try: pkUrl, fbUrl = pkRes['public_url'], fbRes['public_url'] except Exception, err: saveLog.warning("get Cross Domain error occured. %s" % str(err)) crossStatus, pkDomain, fbDomain = 1, 'null', 'null' ## if pkUrl and fbUrl: else:
def getSrvVer(self, srvPidStatus):
    """Resolve the DbCacheApp PID used for the server-version lookup.

    srvPidStatus is indexed as
    srvPidStatus["DbCacheApp_<srvFlag>"]["DbCacheApp_<srvFlag>_pid"] and the
    value is converted with int(). If any step fails, srvVersion is set to
    "null" and a warning is logged.
    """
    try:
        cacheAppKey = 'DbCacheApp_' + self.srvFlag
        pidText = srvPidStatus[cacheAppKey][cacheAppKey + "_pid"]
        DbAppPid = int(pidText)
    except Exception as err:
        # No usable PID: report the version as unknown.
        srvVersion = "null"
        saveLog.warning("Failed to get DbCacheApp's PID, %s" % str(err))
def getDisk(self):
    """Record disk capacity figures and raise/clear the root-partition alarm.

    Sums the total size of every partition returned by
    psutil.disk_partitions() and writes three values through writeToFile():
    "totalDisk" (summed capacity), "rootSize" (free space on /) and
    "rootRate" (usage percent of /).

    When the usage of / is above 85% and the "rootAlarm" switch stored under
    self.telIP is "ON", a warning mail is sent and the switch is set to
    "OFF" so the alarm is not repeated. When usage is back at or below 85%
    and the switch is "OFF", a recovery mail is sent and the switch is set
    back to "ON".
    """
    # Take the mount point from the partition tuple itself; parsing
    # str(partition) breaks on mount points that contain "," or "=".
    diskTotal = 0
    for part in psutil.disk_partitions():
        diskTotal += psutil.disk_usage(part.mountpoint).total
    diskTotal = changeUnit(int(diskTotal))

    # One sample of / so the free space and the percentage belong together.
    rootUsage = psutil.disk_usage('/')
    rootSize, rootRate = rootUsage.free, rootUsage.percent

    writeToFile("totalDisk", diskTotal)
    writeToFile("rootSize", changeUnit(int(rootSize)))
    writeToFile("rootRate", rootRate)

    if float(rootRate) > float(85):
        # Switch "ON" means the alarm has not been sent yet.
        if self.sDb.read(self.telIP, "rootAlarm") == "ON":
            sub = "DBTX ROOT partition Alarm:%s" % self.ps1
            msg = "DateTime: %s\n根分区已使用%s%%, 超过85%%, 请警惕!!!" % (getTimeNow(), rootRate)
            saveLog.warning(msg)
            sendMail(sub, msg)
            # Alarm sent: turn the switch off to avoid repeated mails.
            self.sDb.update(self.telIP, "rootAlarm", "OFF")
    else:
        # Switch "OFF" means an alarm is outstanding, so report recovery.
        if self.sDb.read(self.telIP, "rootAlarm") == "OFF":
            sub = "DBTX ROOT partition is OK:%s" % self.ps1
            msg = "DateTime: %s\n根分区可用空间充足%s." % (getTimeNow(), changeUnit(int(rootSize)))
            saveLog.info(msg)
            sendMail(sub, msg)
            # Usage is back under 85%: re-arm the alarm.
            self.sDb.update(self.telIP, "rootAlarm", "ON")
def chkPort(tAddr, tPort):
    """Check whether a TCP port accepts connections.

    Args:
        tAddr: host name or IP address to connect to.
        tPort: port number; converted with int().

    Returns:
        True when the connection succeeds within 5 seconds. False on any
        error, including an invalid port value; the error is logged as a
        warning.
    """
    import saveLog
    s = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
    try:
        s.settimeout(5)
        s.connect((tAddr, int(tPort)))
        return True
    except Exception as err:
        saveLog.warning("chkPort() error occured. %s" % str(err))
        return False
    finally:
        # Always release the descriptor; the probe is called repeatedly.
        s.close()
def sigRmAct(self, fPath):
    """Remove one file, directory or link, guarding protected locations.

    The decision is based on the first component of the absolute path (or
    of the real path, in the link branch):

    * mount points are never removed (exit code 1);
    * "/" and anything under self.sysDirList is refused (exit code 2);
    * entries under self.riskDirList are moved into self.tmpDir instead of
      being deleted, except files/links named in self.riskFileList, which
      are refused (exit code 3);
    * everything else is deleted.

    A path that is none of mount point, directory, file or link is ignored.
    """
    def refuse(reason, exitCode):
        # Log why the removal is rejected and terminate the process.
        saveLog.error(reason)
        sys.exit(exitCode)

    def moveAside(noticeFmt):
        # Keep the full original path in the new name by replacing "/" with
        # "-", and append a timestamp.
        saveLog.warning(noticeFmt % self.tmpDir)
        flatName = ap.replace("/", "-")
        stamp = str(datetime.datetime.now()).replace(" ", "_")
        shutil.move(ap, self.tmpDir + "/" + flatName + "_" + stamp)

    def removeLeaf(pathParts, refuseMsg, moveFmt):
        # Shared handling for files and links.
        if pathParts[1] in self.riskDirList:
            if os.path.basename(ap) in self.riskFileList:
                refuse(refuseMsg, 3)
            moveAside(moveFmt)
        elif pathParts[1] in self.sysDirList:
            refuse("This path is in system-dir-lists, can't rm !!!", 2)
        else:
            saveLog.info("Deleting %s..." % ap)
            os.remove(ap)

    ap = os.path.abspath(fPath)
    splPath = ap.split("/")

    if os.path.ismount(fPath):
        # Mount points must never be deleted.
        refuse("This path is a mount point,can't rm !!!", 1)
    elif os.path.isdir(fPath):
        if splPath[1] in ("", "n*"):
            refuse("You may deleting root-dir,can't rm !!!", 2)
        elif splPath[1] in self.sysDirList:
            refuse("This path is in system-dir-lists, can't rm !!!", 2)
        elif splPath[1] in self.riskDirList:
            moveAside("This path is in risk_dir_lists, move to %s... ")
        else:
            saveLog.info("Deleting %s... " % ap)
            shutil.rmtree(ap)
    elif os.path.isfile(fPath):
        removeLeaf(
            splPath,
            "Important document(file),can't rm !!!",
            "This path(file) is in risk_dir_lists, move to %s...",
        )
    elif os.path.islink(fPath):
        # For a link, classify by the location of its real path.
        # NOTE(review): os.path.isdir/isfile follow symbolic links, so a
        # link whose target exists is handled by the branches above; this
        # branch is reached only when neither of them matched.
        removeLeaf(
            os.path.realpath(fPath).split("/"),
            "Important document(link),can't rm !!!",
            "This path(link) is in risk_dir_lists, move to %s...",
        )
def getAppStatus(self):
    """Collect the running state of every game application of this server.

    Overall status codes used by this check: 0 = stopped, 1 = normal,
    2 = abnormal. For each name in gameApps the process name
    "<App>_<srvFlag>" is looked up with getProcessID(); the result is
    recorded in srvStatus (names grouped under 'Success' / 'Error') and in
    srvPidStatus (per-app "<name>_status" of 1 or 0 and "<name>_pid").
    The number of GateApp PIDs is kept in GateAppCount.
    """
    srvFlag = str(self.srvFlag)
    srvStatus = {'Success': [], 'Error': []}
    srvPidStatus = {}
    mUser = "*****@*****.**"
    for srv in gameApps:
        try:
            srv += "_" + srvFlag            # "<App name>_<server flag>"
            srvPid = getProcessID(srv)      # PIDs as returned by the pidof-based helper
            isRunning = bool(srvPid)
            if "GateApp" in srv:
                # Several GateApp processes may run; count their PIDs.
                GateAppCount = len(srvPid.split()) if isRunning else 0
            srvStatus['Success' if isRunning else 'Error'].append(srv)
            srvPidStatus[srv] = {
                srv + "_status": 1 if isRunning else 0,
                srv + "_pid": srvPid if isRunning else None,
            }
        except Exception as err:
            saveLog.warning("getAppStatus() error occured while get proce pid, %s" % str(err))
def gateAnalyze(srvFlag, nowNu, sDb):
    """Compare the running GateApp count with the configured one.

    The expected count is read from the <GateNum> element of
    /opt/dbtx/<srvFlag>/config/public_config.xml. Nothing is reported while
    the "isFixing" (maintenance) flag of the server is not "OFF".

    * Fewer GateApps than configured and "appNumAlarm" is "ON": send an
      alarm mail and set the switch to "OFF" so it is not repeated.
    * Count is sufficient and "appNumAlarm" is "OFF": send a recovery mail
      and set the switch back to "ON".

    Args:
        srvFlag: server identifier; used in the config path and as store key.
        nowNu: number of GateApp processes currently running.
        sDb: store object providing read(key, field) and
            update(key, field, value).

    Raises:
        IOError: if the config file cannot be opened.
        ValueError: if no numeric <GateNum> value is found.
    """
    from dbtx.Merge.etc import ps1
    pubFile = "/opt/dbtx/" + str(srvFlag) + "/config/public_config.xml"
    # "with" closes the file even when parsing a line raises.
    with open(pubFile) as pubObj:
        fixGateNum = ''.join([l.strip().split(">")[1].split("<")[0]
                              for l in pubObj if "<GateNum>" in l])
    corMuser = "******"

    gateMissing = int(nowNu) < int(fixGateNum)
    if sDb.read(srvFlag, "isFixing") != "OFF":
        # Server is under maintenance: neither alarm nor recovery is sent.
        return

    alarmSwitch = sDb.read(srvFlag, "appNumAlarm")
    if gateMissing and alarmSwitch == "ON":
        sub = "DBTX GateApps Alarm: %s" % ps1
        msg = "DateTime: %s\n%s 预设开启 %s 个GateApp,当前只有 %s 个,挂了 %s 个 !!!" % (getTimeNow(), srvFlag, fixGateNum, nowNu, (int(fixGateNum)-int(nowNu)))
        saveLog.warning(msg)
        sendMail(sub, msg, corMuser)
        # Alarm sent: turn the switch off.
        sDb.update(srvFlag, "appNumAlarm", "OFF")
    elif not gateMissing and alarmSwitch == "OFF":
        sub = "DBTX GateApps is OK: %s" % ps1
        msg = "DateTime: %s\n%s GateApp Numbers正常." % (getTimeNow(), srvFlag)
        saveLog.warning(msg)
        sendMail(sub, msg, corMuser)
        # Recovered: re-arm the alarm.
        sDb.update(srvFlag, "appNumAlarm", "ON")
def appCpuAnalyze(srvPidStatus, srvFlag, sDb):
    # Check CPU usage of every running game process and the shared memory of
    # DbCacheApp, sending alarm/recovery mails guarded by switches in sDb.
    #
    # srvPidStatus: dict keyed by "<App>_<srvFlag>", each value holding
    #               "<key>_status" and "<key>_pid" (space-separated PIDs).
    # srvFlag:      server identifier, used as key/field in sDb.
    # sDb:          store object providing exists/read/update/intAppCpuCt.
    from dbtx.Merge.etc import ps1
    corMuser = "******"
    tNowCpu = topNow()  # indexed by PID string below; presumably current CPU% per PID - TODO confirm
    if not sDb.exists(srvFlag, "ct"):  ## No entry for this server flag in the Redis store: initialise the counters of how often each app's CPU reached 95%
        sDb.intAppCpuCt(srvFlag)
    for app in srvPidStatus:
        appName = app.split("_")[0]
        if srvPidStatus[app][app+"_status"] == 0 or "Snapshot" in app:  ## Skip CPU/MEM checks for apps that are not running, and ignore Snapshot
            continue
        appPid = srvPidStatus[app][app+"_pid"].split()
        for pid in appPid:  ## CellApp, GateApp and CopyApp have several process IDs
            nCpu, appCount = tNowCpu[pid], int(sDb.read(appName, srvFlag))
            if float(nCpu) > 95.5:  ## CPU usage above the threshold: count+1
                appCount += 1
                sDb.update(appName, srvFlag, appCount)  ## Store the new count in the Redis store
            else:
                sDb.update(appName, srvFlag, 0)  ## Not above the threshold: reset the stored count to 0
            # NOTE(review): the alarm fires at a count of 8, while the mail
            # text below says five consecutive times - confirm which is intended.
            if int(appCount) >= 8:
                if sDb.read(srvFlag, "isFixing") == "OFF":  ## Only when the maintenance flag is "OFF"
                    if sDb.read(srvFlag, "appCpuAlarm") == "ON":  ## Abnormal state and switch "ON": report the fault
                        sub = "DBTX App-CPU Alarm: %s" % ps1
                        msg = "DateTime: %s\n%s CPU使用率连续五次达到95%%, 进程ID为: %s !!!" % (getTimeNow(), app, pid)
                        saveLog.warning(msg)
                        sendMail(sub, msg, corMuser)
                        ## sDb.updateDb(mulDb, srvFlag, "appCpuAlarm", "OFF") ## After the alarm, set the switch to "OFF"
                        sDb.update(appName, srvFlag, 0)  ## After the alarm, reset the count
            if "DbCacheApp_"+srvFlag == app:
                proc = psutil.Process(int(pid))
                shr = proc.get_ext_memory_info()[2]  # third field; presumably shared memory in bytes - TODO confirm
                if float(shr) > 1932735283.2:  ## Alarm when DbCacheApp shared memory exceeds 1.8G (1.8 * 1024**3 bytes)
                    if sDb.read(srvFlag, "isFixing") == "OFF":  ## Only when the maintenance flag is "OFF"
                        if sDb.read(srvFlag, "appShrAlarm") == "ON":  ## Abnormal state and switch "ON": report the fault
                            sub = "DBTX DbCacheApp-SHR Alarm: %s" % ps1
                            msg = "DateTime: %s\n%s DbCacheApp 共享内存达到红色警戒,当前占用共享内存: %s !!!" % (getTimeNow(), srvFlag, changeUnit(shr))
                            saveLog.warning(msg)
                            sendMail(sub, msg, corMuser)
                            sDb.update(srvFlag, "appShrAlarm", "OFF")  ## After the alarm, set the switch to "OFF"
                else:
                    if sDb.read(srvFlag, "isFixing") == "OFF":  ## Only when the maintenance flag is "OFF"
                        if sDb.read(srvFlag, "appShrAlarm") == "OFF":  ## State OK and switch "OFF": report recovery
                            # NOTE(review): the recovery mail reuses the "Alarm" subject.
                            sub = "DBTX DbCacheApp-SHR Alarm: %s" % ps1
                            msg = "DateTime: %s\n%s DbCacheApp 占用共享内存恢复到正常水平: %s ." % (getTimeNow(), srvFlag, changeUnit(shr))
                            saveLog.warning(msg)
                            sendMail(sub, msg, corMuser)
                            sDb.update(srvFlag, "appShrAlarm", "ON")
def getNginxStatus(self):
    """Check that nginx is running with enough processes and mail on changes.

    The master PID is read from /usr/local/nginx/logs/nginx.pid (0 when the
    file is missing) and the running nginx PIDs come from
    getProcessID("nginx"). The "ngxAlarm" switch stored under telIP
    suppresses repeated mails:

    * master PID known but fewer than 8 nginx processes, switch "ON":
      alarm mail listing the processes, switch set to "OFF";
    * master PID known and at least 8 processes, switch "OFF":
      recovery mail, switch set to "ON";
    * no master PID, switch "ON": "master process down" mail, switch "OFF".

    Any exception is logged as a warning and not propagated.
    """
    try:
        pidFile = "/usr/local/nginx/logs/nginx.pid"
        if os.path.exists(pidFile):
            with open(pidFile) as fObj:
                ngxMainPid = fObj.read().strip()
        else:
            ngxMainPid = 0

        pids = getProcessID("nginx")
        # getProcessID() output is a whitespace-separated PID string
        # (getAppStatus counts it with .split()), so count the PIDs rather
        # than the characters of that string.
        if hasattr(pids, "split"):
            ngxProcCount = len(pids.split())
        else:
            ngxProcCount = len(pids or ())

        if ngxMainPid:
            if ngxProcCount < 8:
                res = os.popen("ps -C nginx -o pid,cmd").readlines()
                wp = "".join(["\n" + l.strip() for l in res])
                if self.sDb.read(telIP, "ngxAlarm") == "ON":  # fault not yet reported
                    sub = "DBTX Nginx Alarm: %s" % ps1
                    # NOTE(review): the text says "fewer than 10" while the
                    # check above uses 8 - confirm the intended threshold.
                    msg = "Datetime: %s\nNginx主进程PID:%s\nWorker_processes:%s\n少于10个!!" % (getTimeNow(), ngxMainPid, wp)
                    saveLog.warning(msg)
                    sendMail(sub, msg)
                    self.sDb.update(telIP, "ngxAlarm", "OFF")
            else:
                if self.sDb.read(telIP, "ngxAlarm") == "OFF":  # fault outstanding: report recovery
                    sub = "DBTX Nginx is OK: %s" % ps1
                    msg = "Datetime: %s\nNginx主进程恢复开启状态." % (getTimeNow())
                    saveLog.info(msg)
                    sendMail(sub, msg)
                    self.sDb.update(telIP, "ngxAlarm", "ON")
        else:
            if self.sDb.read(telIP, "ngxAlarm") == "ON":  # master process missing, not yet reported
                sub = "DBTX Nginx Alarm: %s" % ps1
                msg = "Datetime: %s\nNginx主进程挂了!!" % (getTimeNow())
                saveLog.warning(msg)
                sendMail(sub, msg)
                self.sDb.update(telIP, "ngxAlarm", "OFF")
    except Exception as err:
        saveLog.warning("getNginxStatus, %s" % str(err))
redisData["command_rate"] = (int(redisData["total_commands"]) - int(last_commands)) / float(60) fZbx = str(redisData).strip("{}").replace(",","\n").replace("'","") ## 转换成zabbix需要的格式 fObj = open(rZbx,'w') fObj.write(fZbx) fObj.close() except Exception,err: saveLog.error("getRedisBaseInfo(): %s" % str(err)) ## 分析数据,必要时报警: 1, 使用内存大于1G; 2, 命中率低于20% if int(redisData["used_memory"]) > 1073741824: sDb = opRedis() ## 实例化 Redis库操作类 if sDb.read(telIP, "rRateAlarm") == "ON": sub = "DBTX Redis Memory Alarm: %s" % ps1 msg = "DateTime: %s\nRedis使用内存为: %s, 超过1G,请警惕 !!!" % (getTimeNow(), changeUnit(int(redisData["used_memory"]))) saveLog.warning(msg) sendMail(sub, msg) sDb.update(telIP, "rRateAlarm", "OFF") ## 报警后,将开关置为"OFF" else: sDb = opRedis() ## 实例化 Redis库操作类 if sDb.read(telIP, "rRateAlarm") == "OFF": sub = "DBTX Redis Memory is OK: %s" % ps1 msg = "DateTime: %s\nRedis使用内存情况正常: %s." % (getTimeNow(), changeUnit(int(redisData["used_memory"]))) saveLog.info(msg) sendMail(sub, msg) sDb.update(telIP, "rRateAlarm", "ON") ## if totalHits != 0 and redisData["hit_rate"] < 20: ## sDb = opShelveDb() ## 实例化 shelve库操作类 ## if sDb.readDb(pubDb, telIP, "rRateAlarm") == "ON": ## sub = "DBTX Redis HitRate Alarm: %s" % ps1
def runCollect(self):
    """Load the collected host facts from dbtx.Merge.etc.

    If the import fails, a warning is logged and doCollectFixInfo.py is
    executed through os.system().
    """
    try:
        from dbtx.Merge.etc import (
            telIP, cncIP, lanIP, ps1,
            dbOneIp, dbTwoIp,
            cpuCores, totalDisk, rootSize, rootRate,
            net_adapter_type, net_adapter_driver_version, net_bandwidth,
            totalMem, role_type, ser_vendor, ser_product,
        )
    except Exception as err:
        saveLog.warning("Import Error. %s" % str(err))
        os.system("/usr/bin/python /data/sh/monitor/Alive/doCollectFixInfo.py")
def getCoreFile(self, srvRootDir):
    """Core-dump monitoring: derive the base directory of the server root.

    corBaseDir is the parent directory of srvRootDir. A failure while
    computing it is logged as a warning and not propagated.
    """
    try:
        # The head returned by os.path.split() is what os.path.dirname() returns.
        corBaseDir = os.path.split(srvRootDir)[0]
    except Exception as err:
        saveLog.warning("getCoreDir, %s" % str(err))
if appSwStat == "ON": ## 状态为"NO"且开关为"ON"时,报故障 sub = "DBTX AppService Alarm: %s" % ps1 msg = "DateTime: %s\n%s 挂啦 !!!" % (getTimeNow(), srvStatus['Error']) saveLog.warning(msg) sendMail(sub, msg, mUser) self.sDb.update(srvFlag, "appAlarm", "OFF") except Exception, err: saveLog.warning("app_status is 2,get switch status error occured.") elif srvStatus['Error'] and not srvStatus['Success']: ## 全部处于关闭状态 app_status = 0 try: if fixSwStat == "OFF": ## 如果维护标识为"OFF" if appSwStat == "ON": ## 状态为"NO"且开关为"ON"时,报故障 sub = "DBTX AppService is Down: %s" % ps1 msg = "DateTime: %s\n%s AppService 关闭 !!!" % (getTimeNow(), srvFlag) saveLog.warning(msg) sendMail(sub, msg, mUser) self.sDb.update(srvFlag, "appAlarm", "OFF") except Exception, err: saveLog.warning("app_status is 0,get switch status error occured.") elif not srvStatus['Error'] and srvStatus['Success']: ## 全部处于开启状态 app_status = 1 appCpuAnalyze(srvPidStatus,srvFlag,self.sDb) self.getSrvVer(srvPidStatus) gateAnalyze(srvFlag,GateAppCount,self.sDb) try: if fixSwStat == "OFF": ## 如果维护标识为"OFF" if appSwStat == "OFF": ## 状态OK且开关为"OFF"时,报恢复 sub = "DBTX AppService is OK: %s" % ps1 msg = "DateTime: %s\n%s AppService 恢复开启 !!!" % (getTimeNow(), srvFlag) saveLog.info(msg)