def recoveryUnknownNode(self, nodeId, targetState):
    '''Check a node in unknown state. If it is a process problem, recover it to
    the target state; otherwise it is an SSH network card problem and needs
    manual repair.'''
    (rst, sshClient) = self.getSSHClient(nodeId)
    if (not rst):
        return False
    (rst, msg) = sshClient.execute(
        self.sshClients, nodeId, "RECOVERY_UNKNOWN_NODE_TARGETSTATE",
        [config.dbDatanodePaths, config.dbNodePort, targetState])
    if (not rst):
        return False
    elif (msg.startswith("0#")):
        system_log.error(
            "Recover standby nodeId %d failed for reason:\n%s" %
            ((nodeId + 1), msg[2:]))
        return False
    elif (msg.startswith("1#")):
        system_log.info("Recover standby nodeId %d succeeded." % (nodeId + 1))
        return True
    elif (msg.startswith("2#")):
        system_log.fatal(
            "There may be an SSH network card failure on nodeId %s, it needs manual support"
            % (nodeId + 1))
        return False
    # Unrecognized reply: treat it as a failure.
    return False
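# Added commentary (inferred from the reply handlers in this module, not from an
# external spec): the remote recovery scripts answer with "<code>#<payload>",
# where "1#" marks success, "0#" carries an error message in the payload, and
# "2#" flags a suspected SSH network card fault that needs manual intervention.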
def saveToFile(self):
    impl = minidom.getDOMImplementation()
    doc = impl.createDocument(None, None, None)
    root = doc.createElement('Config')
    # For each piece of state, create its element and append it under the root
    # <Config> node.
    unfinished_operations_comment = doc.createComment(
        " 1: unfinished float ip operation, 2: unfinished cluster config refresh"
    )
    root.appendChild(unfinished_operations_comment)
    unfinished_operations = doc.createElement('unfinished_operations')
    unfinished_operations_text = doc.createTextNode(
        str(self.__unfinishedOperations)[1:-1])
    unfinished_operations.appendChild(unfinished_operations_text)
    root.appendChild(unfinished_operations)
    float_ip_state_comment = doc.createComment(
        "1: normal primary, 0: normal standby, -1: unclear primary floatip"
    )
    root.appendChild(float_ip_state_comment)
    float_ip_state = doc.createElement('float_ip_state')
    float_ip_state_text = doc.createTextNode(
        str(self.__floatIpStates)[1:-1])
    float_ip_state.appendChild(float_ip_state_text)
    root.appendChild(float_ip_state)
    doc.appendChild(root)
    # Write the DOM object to the local XML file: write a .tmp file first, then
    # rotate the current file to .bak and rename the .tmp file into place.
    try:
        tmpFileFullName = self.filePath + os.sep + self.fileName + ".tmp"
        with open(tmpFileFullName, 'w') as tmpFile:
            doc.writexml(tmpFile, addindent=" ", newl='\n', encoding='utf-8')
        if os.path.exists(self.fileFullName + ".bak"):
            os.remove(self.fileFullName + ".bak")
        if os.path.exists(self.fileFullName):
            os.renames(self.fileFullName, self.fileFullName + ".bak")
        os.renames(tmpFileFullName, self.fileFullName)
    except BaseException:
        system_log.error("Save status file failed\n%s" %
                         traceback.format_exc())
        return False
    return True
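# Illustrative sketch of the status file written above (assumed example values:
# __unfinishedOperations = [1], __floatIpStates = [1, 0]); the exact whitespace
# depends on minidom's writexml, but the layout is roughly:
#
#   <?xml version="1.0" encoding="utf-8"?>
#   <Config>
#    <!-- 1: unfinished float ip operation, 2: unfinished cluster config refresh-->
#    <unfinished_operations>1</unfinished_operations>
#    <!--1: normal primary, 0: normal standby, -1: unclear primary floatip-->
#    <float_ip_state>1, 0</float_ip_state>
#   </Config>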
def getClusterListenipConfig(self):
    (rst, sshClient) = self.getSSHClient(0)
    if (not rst):
        system_log.error(
            "Get SSH Client failed at querying cluster listen ip")
        os._exit(-1)
    exeOut = ""
    for i in range(3):  # retry up to 3 times on failure
        (rst, exeOut) = sshClient.execute(self.sshClients, 0,
                                          "GET_CLUSTER_LISTENIP_CONFIG",
                                          [config.dbDatanodePaths])
        if (rst):
            break
        system_log.error(
            "Failed to query the cluster listen ip config, attempt %d of 3" %
            (i + 1))
        if (i == 2):
            os._exit(-1)
    self.setDbNodeListenIp(exeOut)
    if (len(config.dbNodeIps) != len(self.dbNodeListenIps)):
        system_log.error(
            "failed to get the db cluster listen addresses to check, maybe a network error, system exits!"
        )
        os._exit(-1)
    for nodeId in range(len(config.dbNodeIps)):
        if (config.dbNodeIps[nodeId] != self.dbNodeListenIps[nodeId]):
            system_log.error(
                "by checking, the configured listen ips %s are not the same as the queried ones %s, system exits!"
                % (config.dbNodeIps, self.dbNodeListenIps))
            os._exit(-1)
    system_log.info("finished getting the db cluster listen addresses!")
def parseListenAddressMsg(cls, msg):
    summaryFlag = False
    nodeListenIps = []
    nodeNames = []
    if (msg.startswith('1#')):
        msg = msg[2:]
    for info in msg.split("\n"):
        if (not summaryFlag):
            matches = re.findall(
                r"Total GUC values:\s+(\d+).+Failed GUC values:\s+(\d+)", info)
            if (len(matches) == 0):
                continue
            elif (int(matches[0][0]) != len(config.dbNodeIps)
                  and int(matches[0][1]) != 0):
                system_log.error("Get db_listen_address failure: \n%s " % msg)
                # Keep the same tuple shape as the success path.
                return (False, [], [])
            summaryFlag = True
        else:
            matches = re.match(r".*\[(.*)\]\s*.*\'((?:[0-9,\.\s]*))\'", info)
            if (matches is None):
                continue
            nodeNames.append(matches.groups()[0])
            str1 = matches.groups()[1]
            # Remove any whitespace that may appear between the IP addresses.
            str2 = "".join(str1.split())
            ips = str2.split(",")
            if (config.floatIp in ips):
                ips.remove(config.floatIp)
            if (len(ips) != 1):
                system_log.error("Get db_listen_address failure: \n%s " % msg)
                return (False, [], [])
            nodeListenIps.append(ips[0])
    system_log.info("dbNodenames: %s" % nodeNames)
    system_log.info("nodeListenIps: %s" % nodeListenIps)
    return (True, nodeNames, nodeListenIps)
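# Illustrative sketch of the kind of message this parser expects (an assumed
# example, not taken from real tool output): a "1#" prefix, a summary line with
# the GUC totals, then one line per node with the node name in brackets and its
# listen_addresses value in single quotes. With config.floatIp == "10.0.0.100"
# and two configured nodes, a message such as
#
#   1#Total GUC values: 2. Failed GUC values: 0.
#   The value of listen_addresses on [node1] is '192.168.0.1, 10.0.0.100'.
#   The value of listen_addresses on [node2] is '192.168.0.2'.
#
# would be parsed into (True, ['node1', 'node2'], ['192.168.0.1', '192.168.0.2']).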
def readConfig(self, fileFullName):
    self.fileName = os.path.basename(fileFullName)
    self.filePath = os.path.dirname(fileFullName)
    self.fileFullName = fileFullName
    if (not os.path.exists(fileFullName)):
        system_log.info("status file %s does not exist." % fileFullName)
        return True
    self.hasStatusFile = True
    try:
        domTree = parse(fileFullName)
        rootNode = domTree.documentElement
        unfinished_operation_nodes = rootNode.getElementsByTagName(
            "unfinished_operations")[0].childNodes
        if (len(unfinished_operation_nodes) == 0):
            self.__unfinishedOperations = []
        else:
            unfinished_operations = unfinished_operation_nodes[0].data.split(",")
            self.__unfinishedOperations = list(map(int, unfinished_operations))
        float_ip_state_nodes = rootNode.getElementsByTagName(
            "float_ip_state")[0].childNodes
        if (len(float_ip_state_nodes) == 0):
            self.__floatIpStates = []
        else:
            float_ip_state = float_ip_state_nodes[0].data.split(",")
            self.__floatIpStates = list(map(int, float_ip_state))
        system_log.info("loaded status file %s successfully." % fileFullName)
        system_log.info(str(self))
        return True
    except BaseException:
        system_log.error(traceback.format_exc())
        system_log.fatal("load status file %s failed, system exits." %
                         fileFullName)
        return False
def recoveryPrimaryNodeBeforeUnaviable(self, oldPrimaryNodeId, currDbClusterState):
    '''Check whether the pre-failure primary node made the cluster Unavailable
    because of a network fault; if it is only a DB process fault, force the
    process back up.'''
    (rst, sshClient) = self.getSSHClient(oldPrimaryNodeId)
    if (not rst):
        return False
    if (currDbClusterState.getNodeState(oldPrimaryNodeId).lower() == "unknown"):
        (rst, msg) = sshClient.execute(
            self.sshClients, oldPrimaryNodeId, "CHECK_AND_RECOVERY_PRIMARY_NODE",
            [config.dbDatanodePaths, config.dbNodePort])
    else:
        (rst, msg) = sshClient.execute(self.sshClients, oldPrimaryNodeId,
                                       "FORCE_RECOVERY_PRIMARY_NODE",
                                       [config.dbDatanodePaths])
    if (not rst):
        return False
    elif (msg.startswith("0#")):
        system_log.error(
            "Recover the primary nodeId %d before unavailable failed for the reason:\n%s"
            % ((oldPrimaryNodeId + 1), msg[2:]))
        return False
    elif (msg.startswith("1#")):
        system_log.info("Recover the nodeId %d to primary succeeded." %
                        (oldPrimaryNodeId + 1))
        return True
    elif (msg.startswith("2#")):
        system_log.fatal(
            "The database process still exists on nodeId %s, please check its SSH network card."
            % (oldPrimaryNodeId + 1))
        return True
    else:
        system_log.fatal(
            "The nodeId %s state is unknown and it cannot be connected to, it needs manual support"
            % (oldPrimaryNodeId + 1))
        return True
def getClusterDbState(self):
    '''Pick cluster nodes in round-robin order to run the check.'''
    nodeSize = len(config.dbNodeIps)
    sshClient = None
    for _ in range(nodeSize):
        self.lastCheckNodeid = (self.lastCheckNodeid + 1) % nodeSize
        (rst, sshClient) = self.getSSHClient(self.lastCheckNodeid)
        if (rst):
            break
    if (sshClient is None):
        system_log.error("Cannot get SSH Connect, Cluster checking failed")
        return None
    self.doUnfinishedOperations(sshClient, self.lastCheckNodeid)
    system_log.debug("to get db cluster state on nodeId %d" %
                     (self.lastCheckNodeid + 1))
    (rst, cmdOut) = sshClient.execute(self.sshClients, self.lastCheckNodeid,
                                      "GET_CLUSTER_STATUS")
    if (not rst):
        system_log.error(
            "For ssh client reason, failed to get the cluster state and will try it in the next round"
        )
        return None
    tmpDbCluster = DbCluster()
    rst = tmpDbCluster.buildByQuery(cmdOut[2:])
    if (rst):
        system_log.debug("current db cluster state is: %s" % (str(tmpDbCluster)))
        return tmpDbCluster
    else:
        system_log.error("current db cluster state is: %s" % (str(tmpDbCluster)))
        return None
def check(self):
    try:
        self.initDbSSHClients()
        self.getClusterListenipConfig()
        system_log.info("System successfully started.")
        print("System successfully started.")
        firstCheckFlag = True  # first check after startup
        while True:
            start = time.time()
            currDbClusterState = self.getClusterDbState()
            # if(system_log.level == logging.DEBUG):
            #     print("db current state is %s" % str(currDbClusterState))
            if (currDbClusterState != None):
                if (currDbClusterState.state != "Unavailable"):
                    self.dbClusterBeforeUnaviable = None
                # When the cluster first enters the Unavailable state, keep the
                # previous cluster state so the pre-failure primary can be
                # recovered if necessary.
                elif (self.dbClusterBeforeUnaviable == None):
                    self.dbClusterBeforeUnaviable = copy.deepcopy(
                        self.lastDbCluster)
                try:
                    if (currDbClusterState.state == "Normal"):
                        self.dbClusterBeforeUnaviable = None
                        system_log.debug(
                            "db current state is normal, system does nothing")
                    elif (currDbClusterState.state == "Degraded"):
                        self.dbClusterBeforeUnaviable = None
                        system_log.debug(
                            "db current state is degraded, system will process it")
                        self.processStatusDegrade(
                            currDbClusterState,
                            [currDbClusterState.getPrimaryNodeIds()])
                    else:  # Unavailable
                        if (firstCheckFlag):
                            system_log.error(
                                "when system starts, DB cluster state is '%s'. "
                                "It should be Normal or Degraded and able to successfully acquire db nodes' listen_addresses, "
                                "so system exits." % str(currDbClusterState))
                            os._exit(1)
                        system_log.debug(
                            "db current state is unavailable, system will process it")
                        self.processStatusUnavailable(currDbClusterState)
                except BaseException:
                    system_log.error("%s", traceback.format_exc())
                # Handle state changes, write the state log, and copy the changes
                # in currDbClusterState into lastDbCluster so it stays current.
                self.prcessClusterState(currDbClusterState)
                if (firstCheckFlag):
                    # Make sure the primary node has the floating IP configured;
                    # configure it if it does not.
                    primaryNodeIds = currDbClusterState.getPrimaryNodeIds()
                    self.confirmPrimaryFloatIp(primaryNodeIds[0])
                    firstCheckFlag = False
            stop = time.time()
            wait = int(config.stateCheckPeriod) - (stop - start)
            if (wait > 0):
                time.sleep(wait)
    finally:
        system_log.info("System stopped.")
        print("System stopped.")
        self.closeDbSSHClients()