Exemple #1
0
    def recoveryFaultStandby(self, currDbClusterState, exceptNodeIds=None):
        """Inspect every node of the cluster and rebuild faulty ones as standby.

        Args:
            currDbClusterState: cluster snapshot. Only its ``nodes`` sequence
                is read here, and for each node the attributes ``state``,
                ``subState`` and ``supplementInfo``.
            exceptNodeIds: container of node ids (positions in ``nodes``) that
                must be skipped. ``None`` (the default) skips nothing.

        Per-node handling:
            * state "Pending", Standby/"Normal" and Primary/"Normal":
              nothing is done.
            * Standby/"Need repair": rebuilt through ``buildStanbyForNode``
              only when ``supplementInfo`` starts with "WAL"; otherwise the
              node is left alone.
            * Standby/"Coredump" or Standby/"Unknown", and the states
              "Normal", "Down", "Manually stopped", "Abnormal": rebuilt
              through ``buildStanbyForNode``.
            * state "Unknown": handed to ``recoveryUnknownNode(id, 'standby')``.
            * anything else: only logged.
        """
        # ``None`` replaces the former shared mutable ``[]`` default and is
        # also accepted when a caller passes it explicitly.
        skippedIds = () if exceptNodeIds is None else exceptNodeIds

        buildMsg = "the node is in state '%s', build it to standby"
        waitMsg = "the node is in state '%s', wait for it to recover automatically"

        for nodeId, node in enumerate(currDbClusterState.nodes):
            if nodeId in skippedIds:
                continue

            state = node.state
            if state == "Pending":
                continue

            if state == "Standby":
                subState = node.subState
                if subState == "Normal":
                    continue
                if subState == "Need repair":
                    if node.supplementInfo.startswith("WAL"):
                        system_log.info(buildMsg % str(node))
                        self.buildStanbyForNode(nodeId)
                    else:
                        # Deliberately logged at debug level, unlike the
                        # other "wait" cases below.
                        system_log.debug(waitMsg % str(node))
                elif subState in ("Coredump", "Unknown"):
                    system_log.info(buildMsg % str(node))
                    self.buildStanbyForNode(nodeId)
                else:
                    system_log.info(waitMsg % str(node))
            elif state in ("Normal", "Down", "Manually stopped", "Abnormal"):
                system_log.info(buildMsg % str(node))
                self.buildStanbyForNode(nodeId)
            elif state == "Unknown":
                system_log.info(buildMsg % str(node))
                self.recoveryUnknownNode(nodeId, 'standby')
            elif state == "Primary" and node.subState == "Normal":
                continue
            else:
                system_log.info(waitMsg % str(node))
    def execute(self, sshClients, nodeId, execmd, params=None):
        """Run an agent command on a node through this object's SSH connection.

        The command line is ``config.gghsAgentPath``, ``execmd`` and every
        item of ``params`` (string-formatted), joined by single spaces.

        Args:
            sshClients: indexable collection of SSH clients; on failure the
                entry at ``nodeId`` is set to ``None`` after this connection
                has been closed.
            nodeId: zero-based node index (log messages show ``nodeId + 1``).
            execmd: agent sub-command to run.
            params: optional iterable of extra command-line arguments.
                ``None`` (the default) means no extra arguments.

        Returns:
            ``(True, output)`` with everything read from the command's stdout,
            or ``(False, None)`` when running the command or reading its
            output raised an exception.
        """
        extraArgs = () if params is None else params
        cmdLine = " ".join(
            [str(config.gghsAgentPath), str(execmd)] + [str(p) for p in extraArgs])

        system_log.debug("Send request to nodeId %d: \n%s" % ((nodeId + 1), cmdLine))
        try:
            _, stdout, _ = self.ssh.exec_command(cmdLine, timeout=int(config.sshTimeout), get_pty=True)
            revc_str = "".join(stdout.readlines())
        except Exception:
            # Only ordinary errors are treated as a broken connection;
            # KeyboardInterrupt / SystemExit propagate to the caller instead
            # of being reported as a network failure.
            self.close()
            sshClients[nodeId] = None
            system_log.fatal("The network card may be failure in the connection of nodeId %d at sending '%s'.\n%s" 
                             % ((nodeId + 1), execmd, traceback.format_exc()))
            return (False, None)

        system_log.debug("Receive Response from nodeId %d: \n%s" % ((nodeId + 1), revc_str))
        return (True, revc_str)
Exemple #3
0
    def getClusterDbState(self):
        """Query the cluster state through the next reachable node.

        Nodes are tried round-robin, starting with the one after
        ``self.lastCheckNodeid`` and for at most one lap over
        ``config.dbNodeIps``, until ``getSSHClient`` reports success.
        ``self.lastCheckNodeid`` is left on the last node that was tried.
        Pending unfinished operations are processed on that node before the
        "GET_CLUSTER_STATUS" command is sent.

        Returns:
            A ``DbCluster`` built from the command output, or ``None`` when no
            SSH client is available, the command fails, or the output cannot
            be parsed by ``DbCluster.buildByQuery``.
        """
        nodeSize = len(config.dbNodeIps)
        sshClient = None

        for _ in range(nodeSize):
            self.lastCheckNodeid = (self.lastCheckNodeid + 1) % nodeSize
            (rst, sshClient) = self.getSSHClient(self.lastCheckNodeid)
            if rst:
                break

        if sshClient is None:
            system_log.error("Cannot get SSH Connect, Cluster checking failed")
            return None

        self.doUnfinishedOperations(sshClient, self.lastCheckNodeid)

        system_log.debug("to get db cluster state on nodeId %d" %
                         (self.lastCheckNodeid + 1))
        (rst, cmdOut) = sshClient.execute(self.sshClients,
                                          self.lastCheckNodeid,
                                          "GET_CLUSTER_STATUS")
        if not rst:
            system_log.error(
                "For ssh client reason, failed to get the cluster state and will try it in the next round"
            )
            return None

        tmpDbCluster = DbCluster()
        # The first two characters of the output are dropped before parsing.
        # NOTE(review): presumably a leading line break emitted by the remote
        # pty - confirm against the agent's actual output.
        rst = tmpDbCluster.buildByQuery(cmdOut[2:])
        if rst:
            system_log.debug("current db cluster state is: %s" %
                             (str(tmpDbCluster)))
            return tmpDbCluster

        system_log.error("current db cluster state is: %s" %
                         (str(tmpDbCluster)))
        return None
Exemple #4
0
    def check(self):
        """Run the cluster monitoring loop.

        After opening the SSH clients and reading the listen-ip
        configuration, the loop repeats every ``config.stateCheckPeriod``
        seconds: fetch the cluster state, dispatch on it ("Normal",
        "Degraded", anything else is handled as unavailable), then record the
        state change through ``prcessClusterState``.

        The loop has no normal exit. The method ends only when an exception
        propagates (the ``finally`` block then logs the stop and closes the
        SSH clients) or through ``os._exit(1)`` when the very first state
        obtained is neither "Normal" nor "Degraded".
        """
        try:
            self.initDbSSHClients()
            self.getClusterListenipConfig()
            system_log.info("System successfully started.")
            print("System successfully started.")
            # True until the first round in which a cluster state was obtained.
            firstCheckFlag = True
            while True:
                # Monotonic clock: the period must not be distorted by
                # wall-clock adjustments.
                start = time.monotonic()

                currDbClusterState = self.getClusterDbState()

                if currDbClusterState is not None:

                    if currDbClusterState.state != "Unavailable":
                        # Covers "Normal" and "Degraded" as well, so the
                        # branches below do not need to reset it again.
                        self.dbClusterBeforeUnaviable = None

                    # When first entering the unavailable state, keep the
                    # previous cluster state so the pre-failure primary can be
                    # restored if necessary.
                    elif self.dbClusterBeforeUnaviable is None:
                        self.dbClusterBeforeUnaviable = copy.deepcopy(
                            self.lastDbCluster)

                    try:
                        if currDbClusterState.state == "Normal":
                            system_log.debug(
                                "db current state is normal, system does nothing"
                            )
                        elif currDbClusterState.state == "Degraded":
                            system_log.debug(
                                "db current state is degraded, system will process it"
                            )
                            # NOTE(review): getPrimaryNodeIds() is indexed like
                            # a sequence further down, so this passes a
                            # one-element list wrapping that sequence - confirm
                            # processStatusDegrade expects the nested form.
                            self.processStatusDegrade(
                                currDbClusterState,
                                [currDbClusterState.getPrimaryNodeIds()])
                        else:  # every other state is handled as unavailable
                            if firstCheckFlag:
                                system_log.error(
                                    "when system start, DB cluster state is '%s'. "
                                    "It should be Normal or Degraded and can successfully acquire db nodes' listen_addresses, "
                                    "so system exits." %
                                    str(currDbClusterState))
                                os._exit(1)

                            system_log.debug(
                                "db current state is unavailable, system will process it"
                            )
                            self.processStatusUnavailable(currDbClusterState)
                    except Exception:
                        # A failed recovery attempt must not stop monitoring;
                        # KeyboardInterrupt / SystemExit still propagate so
                        # the process can be stopped.
                        system_log.error("%s", traceback.format_exc())

                    # Handle the state change, write the state log and copy
                    # the changes of currDbClusterState into lastDbCluster so
                    # it stays up to date.
                    self.prcessClusterState(currDbClusterState)
                    if firstCheckFlag:
                        # Make sure the primary node has the float ip
                        # configured; configure it if it is missing.
                        primaryNodeIds = currDbClusterState.getPrimaryNodeIds()
                        self.confirmPrimaryFloatIp(primaryNodeIds[0])

                    firstCheckFlag = False

                # Sleep for whatever is left of the check period.
                wait = int(config.stateCheckPeriod) - (time.monotonic() - start)
                if wait > 0:
                    time.sleep(wait)

        finally:
            system_log.info("System stopped.")
            print("System stopped.")
            self.closeDbSSHClients()
Exemple #5
0
    def doUnfinishedOperations(self, sshClient, nodeId):
        """Retry operations that an earlier round left unfinished.

        Unfinished operations tracked by ``context``:
            1: the cluster config could not be refreshed after a
               primary/standby switchover;
            2: float-IP handling was not completed.
        ``context`` float-ip states:
            1: the primary brought its float ip up normally;
            0: the standby cleared its float ip normally;
            -1: a failed primary could not clear its float ip.

        Only one of the two operations is attempted per call: the config
        refresh takes precedence, float-ip clearing is attempted only when no
        refresh is pending. The context is saved to file whenever this call
        changed it.

        Args:
            sshClient: client used to send "CLUSTER_REFRESH_CONFIG".
            nodeId: zero-based id of the node ``sshClient`` is connected to.
        """
        if not context.hasUnfinishOpers():
            system_log.debug("context has no unfinished operations.")
            return

        system_log.debug("to do unfinished operations")

        stateModFlag = False  # set once the context has been modified
        floatIpAllModifyFlag = True  # stays True while no float-ip clear failed

        # The config was not refreshed after a primary/standby switchover.
        if context.needDBRefreshConf():
            system_log.debug("need to refresh db cluster config")
            (rst, rsp) = sshClient.execute(self.sshClients, nodeId,
                                           "CLUSTER_REFRESH_CONFIG")
            if not rst or not Util.parseRefreshClusterConfMsg(rsp):
                system_log.info(
                    "Cluster config refreshing failed, and will try again in the next round"
                )
            else:
                system_log.info("Cluster config refreshing success")
                context.removeUnfinishOper(const.CLUSTER_REFRESH_CONFIG)
                stateModFlag = True
        # Float-ip handling.
        elif context.needClearFloatIp():
            system_log.debug("need to clear standby node float ip")
            for idx in range(len(self.dbNodeListenIps)):
                if context.getFloatIpState(idx) != const.STANDBY_UNCLEAR_FLOATIP:
                    continue

                # This node's float ip was not removed successfully; it needs
                # its own ssh client to retry.
                (rst, sshClientTmp) = self.getSSHClient(idx)
                if not rst:
                    system_log.debug(
                        "Cannot get ssh connection for nodeId %d, will try delete the float ip in the next round"
                        % (idx + 1))
                    floatIpAllModifyFlag = False
                    continue

                (rst, rsp) = sshClientTmp.execute(
                    self.sshClients, idx, "CLEAR_NODE_FLOATIP_BUILD", [
                        self.dbNodeListenIps[idx], self.dbNodeNames[idx],
                        config.dbDatanodePaths
                    ])
                if not rst or not Util.parseRefreshClusterConfMsg(rsp):
                    system_log.info(
                        "float ip clear failed on nodeId %d, will try it in the next round"
                        % (idx + 1))
                    floatIpAllModifyFlag = False
                else:
                    system_log.info("float ip clear succeed on nodeId %d" %
                                    (idx + 1))
                    context.setFloatIpState(idx, const.FLOATIP_NORMAL)
                    stateModFlag = True

            if floatIpAllModifyFlag:
                context.removeUnfinishOper(const.CLEAR_FLOAT_IP)
                # Removing the pending operation modifies the context too, so
                # it has to be persisted even when no node was cleared in this
                # round (same rule as in the refresh branch above).
                stateModFlag = True
                system_log.info(
                    "DB cluster's float ip on non primary nodes written in status file are all cleared."
                )

        if stateModFlag:
            context.saveToFile()
            system_log.debug("context save to file.")