def recoveryFaultStandby(self, currDbClusterState, exceptNodeIds=None):
    """Inspect every node of the cluster and repair the faulty non-primary ones.

    Args:
        currDbClusterState: object exposing a ``nodes`` sequence; each node
            provides ``state``, ``subState`` and (for "Need repair" standbys)
            ``supplementInfo``.
        exceptNodeIds: container of node indexes to leave untouched.  ``None``
            (the default) means "skip nothing".

    Per node, based on ``state`` / ``subState``:
        * "Pending", "Standby"/"Normal", "Primary"/"Normal": nothing to do.
        * "Standby"/"Need repair": rebuilt via ``buildStanbyForNode`` only when
          ``supplementInfo`` starts with "WAL"; otherwise left alone.
        * "Standby"/"Coredump" or "Standby"/"Unknown": rebuilt.
        * "Normal", "Down", "Manually stopped", "Abnormal": rebuilt.
        * "Unknown": handed to ``recoveryUnknownNode(nodeId, 'standby')``.
        * anything else: only logged.
    """
    # A None sentinel replaces the former mutable default ``[]``.  The
    # container is used as-is (no set conversion) because callers may pass
    # elements that are not hashable.
    skippedNodeIds = () if exceptNodeIds is None else exceptNodeIds

    for tmpNodeid, node in enumerate(currDbClusterState.nodes):
        if tmpNodeid in skippedNodeIds:
            continue

        state = node.state
        if state == "Pending":
            continue

        if state == "Standby":
            subState = node.subState
            if subState == "Normal":
                continue
            if subState == "Need repair":
                if node.supplementInfo.startswith("WAL"):
                    system_log.info(
                        "the node is in state '%s', build it to standby" %
                        str(node))
                    self.buildStanbyForNode(tmpNodeid)
                else:
                    system_log.debug(
                        "the node is in state '%s', wait for it to recover automatically"
                        % str(node))
                continue
            if subState in ("Coredump", "Unknown"):
                system_log.info(
                    "the node is in state '%s', build it to standby" %
                    str(node))
                self.buildStanbyForNode(tmpNodeid)
            else:
                system_log.info(
                    "the node is in state '%s', wait for it to recover automatically"
                    % str(node))
        elif state in ("Normal", "Down", "Manually stopped", "Abnormal"):
            system_log.info(
                "the node is in state '%s', build it to standby" % str(node))
            self.buildStanbyForNode(tmpNodeid)
        elif state == "Unknown":
            # Same log text as a rebuild, but a dedicated recovery routine.
            system_log.info(
                "the node is in state '%s', build it to standby" % str(node))
            self.recoveryUnknownNode(tmpNodeid, 'standby')
        elif state == "Primary" and node.subState == "Normal":
            continue
        else:
            system_log.info(
                "the node is in state '%s', wait for it to recover automatically"
                % str(node))
def execute(self, sshClients, nodeId, execmd, params=None):
    """Run an agent command on a node over this SSH connection.

    The command line is ``config.gghsAgentPath``, ``execmd`` and every entry
    of ``params`` (each rendered with ``%s``) joined by single spaces.

    Args:
        sshClients: indexable collection of clients; the slot ``nodeId`` is
            reset to ``None`` when the command fails.
        nodeId: zero-based node index (log messages print ``nodeId + 1``).
        execmd: agent sub-command to run.
        params: optional iterable of extra arguments; ``None`` means none.

    Returns:
        ``(True, output)`` with the concatenated stdout lines on success,
        ``(False, None)`` if anything was raised while executing or reading.
    """
    # A None sentinel replaces the former mutable default ``[]``.
    cmdParts = ["%s %s" % (config.gghsAgentPath, execmd)]
    if params is not None:
        cmdParts.extend("%s" % (p,) for p in params)
    cmdLine = " ".join(cmdParts)

    system_log.debug("Send request to nodeId %d: \n%s" %
                     ((nodeId + 1), cmdLine))
    try:
        _, stdout, _ = self.ssh.exec_command(cmdLine,
                                             timeout=int(config.sshTimeout),
                                             get_pty=True)
        # Single join instead of repeated string concatenation.
        revc_str = "".join(stdout.readlines())
    except BaseException:
        # NOTE(review): deliberately left as broad as the original; this also
        # intercepts KeyboardInterrupt/SystemExit -- confirm that is intended.
        self.close()
        sshClients[nodeId] = None
        system_log.fatal(
            "The network card may be failure in the connection of nodeId %d at sending '%s'.\n%s"
            % ((nodeId + 1), execmd, traceback.format_exc()))
        return (False, None)

    system_log.debug("Receive Response from nodeId %d: \n%s" %
                     ((nodeId + 1), revc_str))
    return (True, revc_str)
def getClusterDbState(self):
    """Ask one reachable node for the current state of the DB cluster.

    Nodes are tried in rotation, starting with the one after
    ``self.lastCheckNodeid``, until ``getSSHClient`` reports success; at most
    ``len(config.dbNodeIps)`` attempts are made.  Pending unfinished
    operations are run on the chosen node before the status query.

    Returns:
        A ``DbCluster`` built from the "GET_CLUSTER_STATUS" response, or
        ``None`` when no connection could be obtained, the command failed, or
        the response could not be parsed.
    """
    nodeSize = len(config.dbNodeIps)
    sshClient = None
    connected = False
    for _ in range(nodeSize):
        self.lastCheckNodeid = (self.lastCheckNodeid + 1) % nodeSize
        (connected, sshClient) = self.getSSHClient(self.lastCheckNodeid)
        if connected:
            break

    # Trust the success flag as well as the returned object: a failed last
    # attempt must not leave us working with an unusable client.
    if not connected or sshClient is None:
        system_log.error("Cannot get SSH Connect, Cluster checking failed")
        return None

    self.doUnfinishedOperations(sshClient, self.lastCheckNodeid)

    system_log.debug("to get db cluster state on nodeId %d" %
                     (self.lastCheckNodeid + 1))
    (rst, cmdOut) = sshClient.execute(self.sshClients, self.lastCheckNodeid,
                                      "GET_CLUSTER_STATUS")
    if not rst:
        system_log.error(
            "For ssh client reason, failed to get the cluster state and will try it in the next round"
        )
        return None

    tmpDbCluster = DbCluster()
    # The first two characters of the response are dropped before parsing
    # (presumably a status prefix emitted by the agent -- TODO confirm).
    if not tmpDbCluster.buildByQuery(cmdOut[2:]):
        system_log.error("current db cluster state is: %s" %
                         (str(tmpDbCluster)))
        return None

    system_log.debug("current db cluster state is: %s" % (str(tmpDbCluster)))
    return tmpDbCluster
def check(self):
    """Run the monitoring loop: poll the cluster state and react to it.

    After initialising the SSH clients and the listen-IP configuration the
    method loops forever.  Each round it fetches the cluster state, dispatches
    on ``state`` ("Normal", "Degraded", anything else is treated as
    unavailable), records the new state via ``prcessClusterState`` and then
    sleeps for whatever remains of ``config.stateCheckPeriod`` seconds.

    If the cluster is not Normal/Degraded on the very first successful round,
    the process terminates immediately through ``os._exit(1)``.  On any other
    exit from the loop the SSH clients are closed in the ``finally`` clause.
    """
    try:
        self.initDbSSHClients()
        self.getClusterListenipConfig()
        system_log.info("System successfully started.")
        print("System successfully started.")

        firstCheckFlag = True  # still waiting for the first successful round
        while True:
            start = time.time()
            currDbClusterState = self.getClusterDbState()

            if currDbClusterState is not None:
                if currDbClusterState.state != "Unavailable":
                    self.dbClusterBeforeUnaviable = None
                elif self.dbClusterBeforeUnaviable is None:
                    # On first entering the unavailable state, keep the
                    # previous cluster state so the pre-failure primary can
                    # be restored if necessary.
                    self.dbClusterBeforeUnaviable = copy.deepcopy(
                        self.lastDbCluster)

                try:
                    # dbClusterBeforeUnaviable was already reset above for
                    # every state other than "Unavailable".
                    if currDbClusterState.state == "Normal":
                        system_log.debug(
                            "db current state is normal, system does nothing"
                        )
                    elif currDbClusterState.state == "Degraded":
                        system_log.debug(
                            "db current state is degraded, system will process it"
                        )
                        # NOTE(review): getPrimaryNodeIds() is indexed like a
                        # sequence below, so this passes a nested list --
                        # confirm that processStatusDegrade expects that.
                        self.processStatusDegrade(
                            currDbClusterState,
                            [currDbClusterState.getPrimaryNodeIds()])
                    else:  # Unavailable
                        if firstCheckFlag:
                            system_log.error(
                                "when system start, DB cluster state is '%s'. "
                                "It should be Normal or Degraded and can successfully acquire db nodes' listen_addresses, "
                                "so system exits." % str(currDbClusterState))
                            # Hard exit: bypasses the handlers below.
                            os._exit(1)
                        system_log.debug(
                            "db current state is unavailable, system will process it"
                        )
                        self.processStatusUnavailable(currDbClusterState)
                except BaseException:
                    # NOTE(review): broad on purpose in the original,
                    # presumably so a failed round never stops the monitor --
                    # confirm.  The traceback is logged exactly once.
                    system_log.error("%s", traceback.format_exc())

                # Handle the state change, log it, and copy the changes of
                # currDbClusterState into lastDbCluster to keep it current.
                self.prcessClusterState(currDbClusterState)

                if firstCheckFlag:
                    # Make sure the primary node has the float IP configured;
                    # configure it if it does not.
                    # NOTE(review): assumes at least one primary id is
                    # returned on the first round -- confirm.
                    primaryNodeIds = currDbClusterState.getPrimaryNodeIds()
                    self.confirmPrimaryFloatIp(primaryNodeIds[0])
                    firstCheckFlag = False

            stop = time.time()
            wait = int(config.stateCheckPeriod) - (stop - start)
            if wait > 0:
                time.sleep(wait)
    finally:
        system_log.info("System stopped.")
        print("System stopped.")
        self.closeDbSSHClients()
def doUnfinishedOperations(self, sshClient, nodeId):
    """Retry operations that an earlier round left unfinished.

    Translated from the original note:
      context.unfinishedOperations
        1: configuration was not refreshed after a primary/standby switch
        2: float IP handling was not completed
      context.float_ip_state
        1: primary started its float IP normally
        0: standby cleared its float IP normally
        -1: a failed primary node could not clear its float IP

    At most one kind of operation is handled per call: the cluster config
    refresh takes precedence over float IP clearing.  The context is saved to
    file only when something was actually completed.
    """
    if not context.hasUnfinishOpers():
        system_log.debug("context has no unfinished operations.")
        return

    system_log.debug("to do unfinished operations")
    contextChanged = False  # whether the context was modified

    if context.needDBRefreshConf():
        # The config refresh after a primary/standby switch is still pending.
        system_log.debug("need to refresh db cluster config")
        (ok, reply) = sshClient.execute(self.sshClients, nodeId,
                                        "CLUSTER_REFRESH_CONFIG")
        if ok and Util.parseRefreshClusterConfMsg(reply):
            system_log.info("Cluster config refreshing success")
            context.removeUnfinishOper(const.CLUSTER_REFRESH_CONFIG)
            contextChanged = True
        else:
            system_log.info(
                "Cluster config refreshing failed, and will try again in the next round"
            )
    elif context.needClearFloatIp():
        # Float IP handling.
        system_log.debug("need to clear standby node float ip")
        everyNodeCleared = True  # turns False as soon as one node fails
        nodeCount = len(self.dbNodeListenIps)
        for nodeIdx in range(nodeCount):
            if context.getFloatIpState(
                    nodeIdx) != const.STANDBY_UNCLEAR_FLOATIP:
                continue

            # This node's float IP was not removed yet; it needs its own
            # SSH client for the retry.
            (ok, nodeClient) = self.getSSHClient(nodeIdx)
            if not ok:
                system_log.debug(
                    "Cannot get ssh connection for nodeId %d, will try delete the float ip in the next round"
                    % (nodeIdx + 1))
                everyNodeCleared = False
                continue

            clearArgs = [
                self.dbNodeListenIps[nodeIdx], self.dbNodeNames[nodeIdx],
                config.dbDatanodePaths
            ]
            (ok, reply) = nodeClient.execute(self.sshClients, nodeIdx,
                                             "CLEAR_NODE_FLOATIP_BUILD",
                                             clearArgs)
            if ok and Util.parseRefreshClusterConfMsg(reply):
                system_log.info("float ip clear succeed on nodeId %d" %
                                (nodeIdx + 1))
                context.setFloatIpState(nodeIdx, const.FLOATIP_NORMAL)
                contextChanged = True
            else:
                system_log.info(
                    "float ip clear failed on nodeId %d, will try it in the next round"
                    % (nodeIdx + 1))
                everyNodeCleared = False

        if everyNodeCleared:
            # NOTE(review): the operation is dropped here without marking the
            # context as changed -- confirm that is intended.
            context.removeUnfinishOper(const.CLEAR_FLOAT_IP)
            system_log.info(
                "DB cluster's float ip on non primary nodes written in status file are all cleared."
            )

    if contextChanged:
        context.saveToFile()
        system_log.debug("context save to file.")