Exemple #1
0
    def aboutCross(self):
        ## 检查跨服连接状态, 1, 异常; 0, 正常;
        ## 750 是组队竞技场,战场,帮派战,751 是组队副本,襄阳城和跨服运镖
        try:
            dbName = self.srvFlag + "_dbtx"
            conn = MySQLdb.connect(host=dbOneIp,port=db1Port,user=dbUser,passwd=dbPass,db=dbName,charset=dbChar)
            cursor = conn.cursor(cursorclass = MySQLdb.cursors.DictCursor)
            dSql = "select public_url from t_cross_copy_channel where channel=750 limit 1;"
            ySql = "select public_url from t_cross_copy_channel where channel=751 limit 1;"
            cursor.execute(dSql)
            pkRes = cursor.fetchone()
##            pkUrl = cursor.fetchone()['public_url']
##            pkDomain, pkPort = pkUrl.split(":")
            cursor.execute(ySql)
            fbRes = cursor.fetchone()
##            fbUrl = cursor.fetchone()['public_url']
##            fbDomain, fbPort = fbUrl.split(":")

            crossSwStat, fixSwStat = self.sDb.read(self.srvFlag, "crossAlarm"), self.sDb.read(self.srvFlag, "isFixing")
            try:
                pkUrl, fbUrl = pkRes['public_url'], fbRes['public_url']
            except Exception, err:
                saveLog.warning("get Cross Domain error occured. %s" % str(err))
                crossStatus, pkDomain, fbDomain = 1, 'null', 'null'
##            if pkUrl and fbUrl:
            else:
Exemple #2
0
 def getSrvVer(self, srvPidStatus):
     """Look up the DbCacheApp process id used to determine the server version.

     srvPidStatus is indexed as
     ``srvPidStatus['DbCacheApp_<srvFlag>']['DbCacheApp_<srvFlag>_pid']`` and
     the value is converted with ``int``.  If that lookup or conversion
     fails, the version is set to the string "null" and a warning is logged.
     """
     try:
         ## The same "DbCacheApp_<flag>" prefix names both the outer entry and its pid field
         appKey = 'DbCacheApp_' + self.srvFlag
         DbAppPid = int(srvPidStatus[appKey][appKey + "_pid"])
     except Exception as err:
         srvVersion = "null"
         saveLog.warning("Failed to get DbCacheApp's PID, %s" % str(err))
Exemple #3
0
 def getDisk(self):
     """Record disk capacity figures and raise or clear the root-partition alarm.

     Adds up the total size of every partition returned by
     ``psutil.disk_partitions()`` and writes three values through
     ``writeToFile``: "totalDisk" (summed capacity), "rootSize" (free space
     on "/") and "rootRate" (usage percentage of "/").

     The "rootAlarm" switch stored under ``self.telIP`` prevents repeated
     mails: a fault mail is sent only while the switch is "ON" (and then it
     is set to "OFF"); a recovery mail is sent only while it is "OFF" (and
     then it is set back to "ON").
     """
     ## Use the namedtuple's mountpoint field directly instead of parsing the
     ## partition's string representation, which breaks on "," or "=" in paths.
     diskTotal = 0
     for part in psutil.disk_partitions():
         diskTotal += psutil.disk_usage(part.mountpoint).total
     diskTotal = changeUnit(int(diskTotal))

     ## One sample of "/" so that free space and usage rate describe the same instant
     rootUsage = psutil.disk_usage('/')
     rootSize, rootRate = rootUsage.free, rootUsage.percent

     writeToFile("totalDisk", diskTotal)
     writeToFile("rootSize", changeUnit(int(rootSize)))
     writeToFile("rootRate", rootRate)

     if float(rootRate) > float(85):
         ## Over the threshold: report the fault once, then turn the switch off
         if self.sDb.read(self.telIP, "rootAlarm") == "ON":
             sub = "DBTX ROOT partition Alarm:%s" % self.ps1
             msg = "DateTime: %s\n根分区已使用%s%%, 超过85%%, 请警惕!!!" % (getTimeNow(), rootRate)
             saveLog.warning(msg)
             sendMail(sub, msg)
             self.sDb.update(self.telIP, "rootAlarm", "OFF")
     else:
         ## Back under the threshold with the switch off: report recovery and re-arm
         if self.sDb.read(self.telIP, "rootAlarm") == "OFF":
             sub = "DBTX ROOT partition is OK:%s" % self.ps1
             msg = "DateTime: %s\n根分区可用空间充足%s." % (getTimeNow(), changeUnit(int(rootSize)))
             saveLog.info(msg)
             sendMail(sub, msg)
             self.sDb.update(self.telIP, "rootAlarm", "ON")
Exemple #4
0
def chkPort(tAddr, tPort):
    """Check whether a TCP port accepts connections.

    Opens an IPv4 TCP socket with a 5 second timeout and tries to connect
    to (tAddr, int(tPort)).

    Returns True when the connection succeeds.  Any exception (including a
    non-numeric tPort) is logged as a warning and False is returned.  The
    socket is always closed before returning so probes do not leak file
    descriptors.
    """
    import saveLog
    s = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
    s.settimeout(5)
    try:
        s.connect((tAddr, int(tPort)))
        return True
    except Exception as err:
        saveLog.warning("chkPort() error occured. %s" % str(err))
        return False
    finally:
        ## Release the descriptor on both the success and the failure path
        s.close()
Exemple #5
0
    def sigRmAct(self, fPath):
        """Guarded removal of a single file, directory or symbolic link.

        The first component of the absolute path decides what happens:
          * mount points are never removed (exit code 1);
          * the root directory and anything under ``self.sysDirList`` are
            refused (exit code 2);
          * anything under ``self.riskDirList`` is moved into ``self.tmpDir``
            instead of being deleted, except files/links whose base name is
            in ``self.riskFileList``, which are refused (exit code 3);
          * everything else is deleted.

        Symbolic links are tested first, because ``os.path.isdir`` and
        ``os.path.isfile`` follow links: otherwise a link to a directory
        would reach ``shutil.rmtree`` (which refuses symlinks) and the link
        branch would only ever see dangling links.  For a link, the
        first component of its resolved target decides the policy, while the
        operation itself is applied to the link.

        Paths that are none of the above are ignored.
        """
        ap = os.path.abspath(fPath)
        topDir = ap.split("/")[1]

        if os.path.ismount(fPath):     ## Mount points must never be deleted
            saveLog.error("This path is a mount point,can't rm !!!")
            sys.exit(1)

        elif os.path.islink(fPath):    ## Links: classify by the real target path
            realTop = os.path.realpath(fPath).split("/")[1]
            self._rmFileOrLink(ap, realTop, "link")

        elif os.path.isdir(fPath):
            ## NOTE(review): the "n*" comparison is kept from the original; its purpose is unclear
            if topDir == "" or topDir == "n*":
                saveLog.error("You may deleting root-dir,can't rm !!!")
                sys.exit(2)
            elif topDir in self.sysDirList:
                saveLog.error("This path is in system-dir-lists, can't rm !!!")
                sys.exit(2)
            elif topDir in self.riskDirList:
                saveLog.warning("This path is in risk_dir_lists, move to %s... " % self.tmpDir)
                self._moveToTmp(ap)
            else:
                saveLog.info("Deleting %s... " % ap)
                shutil.rmtree(ap)

        elif os.path.isfile(fPath):
            self._rmFileOrLink(ap, topDir, "file")

    def _rmFileOrLink(self, ap, topDir, kind):
        """Apply the risk/system directory policy to a file or link and remove it.

        ap is the absolute path to act on, topDir the first path component
        used for classification, kind is "file" or "link" (used in log text).
        """
        if topDir in self.riskDirList:
            if os.path.basename(ap) in self.riskFileList:
                saveLog.error("Important document(%s),can't rm !!!" % kind)
                sys.exit(3)
            saveLog.warning("This path(%s) is in risk_dir_lists, move to %s..." % (kind, self.tmpDir))
            self._moveToTmp(ap)
        elif topDir in self.sysDirList:
            saveLog.error("This path is in system-dir-lists, can't rm !!!")
            sys.exit(2)
        else:
            saveLog.info("Deleting %s..." % ap)
            os.remove(ap)

    def _moveToTmp(self, ap):
        """Move ap into self.tmpDir under a name that preserves its full path.

        Every "/" of the absolute path becomes "-" and a timestamp (with the
        space replaced by "_") is appended.
        """
        refPath = ap.replace("/","-")
        oTime = str(datetime.datetime.now()).replace(" ","_")
        shutil.move(ap, self.tmpDir+"/"+refPath+"_"+oTime)
Exemple #6
0
    def getAppStatus(self):
        """Collect the running state of every game application of this server.

        Status codes per the original note: 0 = stopped, 1 = running,
        2 = abnormal.  For each name in ``gameApps`` the process is looked up
        as "<App>_<srvFlag>" through ``getProcessID``; the name is appended to
        ``srvStatus['Success']`` or ``srvStatus['Error']`` and a
        "<name>_status" / "<name>_pid" record is stored in ``srvPidStatus``.
        For names containing "GateApp", the number of whitespace-separated
        pids is kept in ``GateAppCount``.
        """
        srvFlag = str(self.srvFlag)
        srvStatus = {'Success': [], 'Error': []}
        srvPidStatus = {}
        mUser = "*****@*****.**"
        for srv in gameApps:
            try:
                srv = srv + "_" + srvFlag       ## "<App name>_<server flag>"
                srvPid = getProcessID(srv)      ## pid string obtained via shell pidof
                if "GateApp" in srv:
                    ## Several GateApp processes may exist; count them
                    GateAppCount = len(srvPid.split()) if srvPid else 0
                if srvPid:
                    bucket, state, pidValue = 'Success', 1, srvPid
                else:
                    bucket, state, pidValue = 'Error', 0, None
                srvStatus[bucket].append(srv)
                srvPidStatus[srv] = {srv+"_status": state, srv+"_pid": pidValue}
            except Exception as err:
                saveLog.warning("getAppStatus() error occured while get proce pid, %s" % str(err))
Exemple #7
0
def gateAnalyze(srvFlag, nowNu, sDb):
    """Compare the running GateApp count with the configured one and alert.

    srvFlag -- server flag; selects /opt/dbtx/<srvFlag>/config/public_config.xml
    nowNu   -- number of GateApp processes currently running
    sDb     -- switch store offering read(key, field) and update(key, field, value)

    The expected count is the text of the <GateNum> element in the public
    config file.  Nothing is sent while the "isFixing" flag is not "OFF".
    The "appNumAlarm" switch suppresses repeats: a fault mail is sent only
    while it is "ON" (then it becomes "OFF"), a recovery mail only while it
    is "OFF" (then it becomes "ON").

    Raises ValueError if nowNu or the configured value is not an integer
    (for instance when the file has no <GateNum> line).
    """
    from dbtx.Merge.etc import ps1
    pubFile = "/opt/dbtx/" + str(srvFlag) + "/config/public_config.xml"
    ## The with-block closes the file even if parsing a line raises
    with open(pubFile) as pubObj:
        fixGateNum = ''.join([line.strip().split(">")[1].split("<")[0] for line in pubObj if "<GateNum>" in line])
    corMuser = "******"

    if int(nowNu) < int(fixGateNum):
        ## Fewer GateApps than configured: report once while not under maintenance
        if sDb.read(srvFlag, "isFixing") == "OFF" and sDb.read(srvFlag, "appNumAlarm") == "ON":
            sub = "DBTX GateApps Alarm: %s" % ps1
            msg = "DateTime: %s\n%s 预设开启 %s 个GateApp,当前只有 %s 个,挂了 %s 个 !!!" % (getTimeNow(), srvFlag, fixGateNum, nowNu, (int(fixGateNum)-int(nowNu)))
            saveLog.warning(msg)
            sendMail(sub, msg, corMuser)
            sDb.update(srvFlag, "appNumAlarm", "OFF")    ## Disarm after alerting
    else:
        ## Count is fine and a fault was reported earlier: report recovery and re-arm
        if sDb.read(srvFlag, "isFixing") == "OFF" and sDb.read(srvFlag, "appNumAlarm") == "OFF":
            sub = "DBTX GateApps is OK: %s" % ps1
            msg = "DateTime: %s\n%s GateApp Numbers正常." % (getTimeNow(), srvFlag)
            saveLog.warning(msg)
            sendMail(sub, msg, corMuser)
            sDb.update(srvFlag, "appNumAlarm", "ON")
Exemple #8
0
def appCpuAnalyze(srvPidStatus, srvFlag, sDb):
    """Watch per-process CPU usage and DbCacheApp shared memory, alerting by mail.

    srvPidStatus -- mapping "<App>_<flag>" -> {"<App>_<flag>_status": ...,
                    "<App>_<flag>_pid": "<pid> [<pid> ...]"}
    srvFlag      -- server flag
    sDb          -- store offering exists/intAppCpuCt/read/update

    For every pid of every running app (Snapshot apps are skipped) the CPU
    value from ``topNow()`` is compared with 95.5.  A counter stored under
    (appName, srvFlag) is incremented on each hit and reset to 0 otherwise;
    once it reaches 8 a mail is sent (unless under maintenance or the
    "appCpuAlarm" switch is not "ON") and the counter is reset.

    For the DbCacheApp of this server, shared memory above 1932735283.2
    bytes (1.8G) triggers an alarm guarded by the "appShrAlarm" switch, and
    a recovery mail is sent when it drops back.
    """
    from dbtx.Merge.etc import ps1
    corMuser = "******"
    tNowCpu = topNow()
    if not sDb.exists(srvFlag, "ct"):  ## No entry for this server flag yet: initialise the CPU hit counters
        sDb.intAppCpuCt(srvFlag)

    for app in srvPidStatus:
        appName = app.split("_")[0]
        ## Skip apps that are not running, and always ignore Snapshot
        if srvPidStatus[app][app+"_status"] == 0 or "Snapshot" in app:
            continue
        for pid in srvPidStatus[app][app+"_pid"].split():   ## CellApp, GateApp, CopyApp have several pids
            nCpu, appCount = tNowCpu[pid], int(sDb.read(appName, srvFlag))
            if float(nCpu) > 95.5:
                appCount += 1
            else:
                ## Reset the local value too, otherwise a stale count >= 8 would
                ## still raise the alarm below although the CPU is back to normal
                appCount = 0
            sDb.update(appName, srvFlag, appCount)

            if appCount >= 8:
                ## NOTE(review): the mail text says "five consecutive times" while the threshold is 8
                if sDb.read(srvFlag, "isFixing") == "OFF" and sDb.read(srvFlag, "appCpuAlarm") == "ON":
                    sub = "DBTX App-CPU Alarm: %s" % ps1
                    msg = "DateTime: %s\n%s CPU使用率连续五次达到95%%, 进程ID为: %s !!!" % (getTimeNow(), app, pid)
                    saveLog.warning(msg)
                    sendMail(sub, msg, corMuser)
                    sDb.update(appName, srvFlag, 0)      ## Restart counting after the alert

            if "DbCacheApp_"+srvFlag == app:
                proc = psutil.Process(int(pid))
                shr = proc.get_ext_memory_info()[2]
                if float(shr) > 1932735283.2:    ## DbCacheApp shared memory above 1.8G
                    if sDb.read(srvFlag, "isFixing") == "OFF" and sDb.read(srvFlag, "appShrAlarm") == "ON":
                        sub = "DBTX DbCacheApp-SHR Alarm: %s" % ps1
                        msg = "DateTime: %s\n%s DbCacheApp 共享内存达到红色警戒,当前占用共享内存: %s !!!" % (getTimeNow(), srvFlag, changeUnit(shr))
                        saveLog.warning(msg)
                        sendMail(sub, msg, corMuser)
                        sDb.update(srvFlag, "appShrAlarm", "OFF")    ## Disarm after alerting
                else:
                    if sDb.read(srvFlag, "isFixing") == "OFF" and sDb.read(srvFlag, "appShrAlarm") == "OFF":
                        sub = "DBTX DbCacheApp-SHR Alarm: %s" % ps1
                        msg = "DateTime: %s\n%s DbCacheApp 占用共享内存恢复到正常水平: %s ." % (getTimeNow(), srvFlag, changeUnit(shr))
                        saveLog.warning(msg)
                        sendMail(sub, msg, corMuser)
                        sDb.update(srvFlag, "appShrAlarm", "ON")
Exemple #9
0
 def getNginxStatus(self):
     """Check that nginx is running with enough processes and alert by mail.

     The master pid is read from /usr/local/nginx/logs/nginx.pid (a missing
     or empty file counts as "master down").  With a master pid present,
     fewer than 8 nginx processes reported by ``getProcessID("nginx")``
     triggers an alarm that lists the output of ``ps -C nginx``; otherwise a
     recovery mail is sent if an alarm was outstanding.  The "ngxAlarm"
     switch stored under ``telIP`` suppresses repeated mails.

     Every exception is caught and logged as a warning.
     """
     try:
         if os.path.exists("/usr/local/nginx/logs/nginx.pid"):
             with open("/usr/local/nginx/logs/nginx.pid") as fObj:
                 ngxMainPid = fObj.read().strip()
         else:
             ngxMainPid = 0
         pids = getProcessID("nginx")
         if ngxMainPid:
             ## pids is a whitespace-separated pid string, so count its fields;
             ## len(pids) alone would be the number of characters
             if not pids or len(pids.split()) < 8:
                 wp = "".join(["\n" + l.strip() for l in os.popen("ps -C nginx -o pid,cmd").readlines()])
                 if self.sDb.read(telIP, "ngxAlarm") == "ON":     ## Armed: report the fault once
                     sub = "DBTX Nginx Alarm: %s" % ps1
                     msg = "Datetime: %s\nNginx主进程PID:%s\nWorker_processes:%s\n少于10个!!" % (getTimeNow(), ngxMainPid, wp)
                     saveLog.warning(msg)
                     sendMail(sub, msg)
                     self.sDb.update(telIP, "ngxAlarm", "OFF")
             else:
                 if self.sDb.read(telIP, "ngxAlarm") == "OFF":    ## Healthy again after an alarm: report recovery
                     sub = "DBTX Nginx is OK: %s" % ps1
                     msg = "Datetime: %s\nNginx主进程恢复开启状态." % (getTimeNow())
                     saveLog.info(msg)
                     sendMail(sub, msg)
                     self.sDb.update(telIP, "ngxAlarm", "ON")
         else:
             if self.sDb.read(telIP, "ngxAlarm") == "ON":     ## No master pid: nginx is down
                 sub = "DBTX Nginx Alarm: %s" % ps1
                 msg = "Datetime: %s\nNginx主进程挂了!!" % (getTimeNow())
                 saveLog.warning(msg)
                 sendMail(sub, msg)
                 self.sDb.update(telIP, "ngxAlarm", "OFF")
     except Exception as err:
         saveLog.warning("getNginxStatus, %s" % str(err))
Exemple #10
0
        redisData["command_rate"] = (int(redisData["total_commands"]) - int(last_commands)) / float(60)

        fZbx = str(redisData).strip("{}").replace(",","\n").replace("'","")   ## 转换成zabbix需要的格式
        fObj = open(rZbx,'w')
        fObj.write(fZbx)
        fObj.close()
    except Exception,err:
        saveLog.error("getRedisBaseInfo(): %s" % str(err))

    ## 分析数据,必要时报警: 1, 使用内存大于1G; 2, 命中率低于20%
    if int(redisData["used_memory"]) > 1073741824:
        sDb = opRedis()    ## 实例化 Redis库操作类
        if sDb.read(telIP, "rRateAlarm") == "ON":
            sub = "DBTX Redis Memory Alarm: %s" % ps1
            msg = "DateTime: %s\nRedis使用内存为: %s, 超过1G,请警惕 !!!" % (getTimeNow(), changeUnit(int(redisData["used_memory"])))
            saveLog.warning(msg)
            sendMail(sub, msg)
            sDb.update(telIP, "rRateAlarm", "OFF")    ## 报警后,将开关置为"OFF"
    else:
        sDb = opRedis()    ## 实例化 Redis库操作类
        if sDb.read(telIP, "rRateAlarm") == "OFF":
            sub = "DBTX Redis Memory is OK: %s" % ps1
            msg = "DateTime: %s\nRedis使用内存情况正常: %s." % (getTimeNow(), changeUnit(int(redisData["used_memory"])))
            saveLog.info(msg)
            sendMail(sub, msg)
            sDb.update(telIP, "rRateAlarm", "ON")

##    if totalHits != 0 and redisData["hit_rate"] < 20:
##        sDb = opShelveDb()    ## 实例化 shelve库操作类
##        if sDb.readDb(pubDb, telIP, "rRateAlarm") == "ON":
##            sub = "DBTX Redis HitRate Alarm: %s" % ps1
Exemple #11
0
 def runCollect(self):
     """Import the collected host facts, regenerating them when the import fails.

     If importing the listed names from ``dbtx.Merge.etc`` raises, the error
     is logged as a warning and the doCollectFixInfo.py script is run.
     """
     try:
         from dbtx.Merge.etc import (
             telIP, cncIP, lanIP, ps1, dbOneIp, dbTwoIp, cpuCores,
             totalDisk, rootSize, rootRate, net_adapter_type,
             net_adapter_driver_version, net_bandwidth, totalMem,
             role_type, ser_vendor, ser_product,
         )
     except Exception as err:
         saveLog.warning("Import Error. %s" % str(err))
         os.system("/usr/bin/python /data/sh/monitor/Alive/doCollectFixInfo.py")
Exemple #12
0
 def getCoreFile(self, srvRootDir):
     """Core-dump monitoring: derive the parent directory of the server root.

     ``corBaseDir`` becomes ``os.path.dirname(srvRootDir)``; if that call
     raises, the error is logged as a warning.
     """
     try:
         corBaseDir = os.path.dirname(srvRootDir)
     except Exception as err:
         saveLog.warning("getCoreDir, %s" % str(err))
Exemple #13
0
             if appSwStat == "ON":     ## 状态为"NO"且开关为"ON"时,报故障
                 sub = "DBTX AppService Alarm: %s" % ps1
                 msg = "DateTime: %s\n%s 挂啦 !!!" % (getTimeNow(), srvStatus['Error'])
                 saveLog.warning(msg)
                 sendMail(sub, msg, mUser)
                 self.sDb.update(srvFlag, "appAlarm", "OFF")
     except Exception, err:
         saveLog.warning("app_status is 2,get switch status error occured.")
 elif srvStatus['Error'] and not srvStatus['Success']:  ## 全部处于关闭状态
     app_status = 0
     try:
         if fixSwStat == "OFF":        ## 如果维护标识为"OFF"
             if appSwStat == "ON":     ## 状态为"NO"且开关为"ON"时,报故障
                 sub = "DBTX AppService is Down: %s" % ps1
                 msg = "DateTime: %s\n%s AppService  关闭 !!!" % (getTimeNow(), srvFlag)
                 saveLog.warning(msg)
                 sendMail(sub, msg, mUser)
                 self.sDb.update(srvFlag, "appAlarm", "OFF")
     except Exception, err:
         saveLog.warning("app_status is 0,get switch status error occured.")
 elif not srvStatus['Error'] and srvStatus['Success']:  ## 全部处于开启状态
     app_status = 1
     appCpuAnalyze(srvPidStatus,srvFlag,self.sDb)
     self.getSrvVer(srvPidStatus)
     gateAnalyze(srvFlag,GateAppCount,self.sDb)
     try:
         if fixSwStat == "OFF":        ## 如果维护标识为"OFF"
             if appSwStat == "OFF":    ## 状态OK且开关为"OFF"时,报恢复
                 sub = "DBTX AppService is OK: %s" % ps1
                 msg = "DateTime: %s\n%s AppService  恢复开启 !!!" % (getTimeNow(), srvFlag)
                 saveLog.info(msg)