Beispiel #1
0
    def recoveryUnknownNode(self, nodeId, targetState):
        """Try to bring a node whose state is "unknown" back to ``targetState``.

        Sends ``RECOVERY_UNKNOWN_NODE_TARGETSTATE`` to the node over SSH and
        interprets the status prefix of the reply:

        * ``1#`` - the node was recovered.
        * ``0#`` - recovery failed; the rest of the reply is logged as reason.
        * ``2#`` - logged as a possible SSH network-card failure that needs
          manual repair.

        Args:
            nodeId: Zero-based node index (log messages show it one-based).
            targetState: State handed through to the agent command.

        Returns:
            True only when the agent answers with ``1#``. False when the SSH
            connection or the command fails, and for every other reply,
            including one with an unrecognised prefix.
        """
        (connected, sshClient) = self.getSSHClient(nodeId)
        if not connected:
            return False

        (executed, msg) = sshClient.execute(
            self.sshClients, nodeId, "RECOVERY_UNKNOWN_NODE_TARGETSTATE",
            [config.dbDatanodePaths, config.dbNodePort, targetState])
        if not executed:
            return False

        displayId = nodeId + 1  # log messages use one-based node numbers
        if msg.startswith("1#"):
            system_log.info("Recover standby nodeId %d succeed:\n" % displayId)
            return True

        if msg.startswith("0#"):
            system_log.error(
                "Recover standby nodeId %d failed for reason:\n%s" %
                (displayId, msg[2:]))
        elif msg.startswith("2#"):
            system_log.fatal(
                "There may be have ssh network card failure in the nodeId %s, it needs manual support"
                % displayId)
        else:
            # Previously an unrecognised reply fell off the end of the method:
            # nothing was logged and None was returned instead of a bool.
            system_log.error(
                "Recover standby nodeId %d returned an unexpected response:\n%s"
                % (displayId, msg))
        return False
Beispiel #2
0
 def nodeFailover(self, nodeId):
     """Promote ``nodeId`` to primary and move the floating IP to it.

     Runs ``SET_FLOATIP_FAILOVER`` on the node. A ``1#`` reply means the
     failover and the cluster-config refresh both worked. A ``4#`` reply
     means the failover worked but the refresh did not; the refresh is then
     recorded as an unfinished operation and persisted via ``context``.

     Returns:
         True for ``1#`` and ``4#`` replies; False when the SSH connection
         or the command fails, or for any other reply.
     """
     (connected, sshClient) = self.getSSHClient(nodeId)
     if not connected:
         # With a single network card a failed connection means the host or
         # its card is down; with two cards the listening card is down.
         system_log.fatal(
             "Cannot ssh connect candidate primary nodeId %d, need manual support!"
             % (nodeId + 1))
         return False

     listenIps = self.dbNodeListenIps[nodeId] + ",%s" % config.floatIp
     nodeName = self.lastDbCluster.nodes[nodeId].nodeName
     (executed, reply) = sshClient.execute(
         self.sshClients, nodeId, "SET_FLOATIP_FAILOVER",
         [listenIps, nodeName, config.dbDatanodePaths])
     if not executed:
         return False

     displayId = nodeId + 1  # log messages use one-based node numbers
     if reply.startswith('1#'):
         system_log.info(
             "the primary node has failed over to nodeId %d and has refreshed the cluster config"
             % displayId)
         return True

     if reply.startswith('4#'):
         system_log.info(
             "the primary node has failed over to nodeId %d but refresh the cluster config failed "
             % displayId)
         # Remember the pending refresh so it can be retried later.
         context.setUnfinishOper(const.CLUSTER_REFRESH_CONFIG)
         context.saveToFile()
         return True

     system_log.info(
         "Primary failover to nodeId %d failed, need manual support!"
         % displayId)
     return False
Beispiel #3
0
 def closeSSHClient(self, nodeId):
     """Close the cached SSH connection of ``nodeId`` and forget it.

     Does nothing when no connection is cached for the node. A failure
     while closing is logged, not raised.

     Args:
         nodeId: Zero-based node index (log messages show it one-based).
     """
     sshClient = self.sshClients[nodeId]
     if sshClient is None:
         return

     try:
         sshClient.close()
         system_log.info("Close ssh connect for node %d success!" %
                         (nodeId + 1))
     except Exception:
         system_log.fatal(traceback.format_exc())
     finally:
         # getSSHClient() hands back any non-None cached entry without
         # reconnecting, so a closed client must not stay in the cache.
         self.sshClients[nodeId] = None
Beispiel #4
0
 def buildStanbyForNode(self, nodeId):
     """Rebuild ``nodeId`` as a standby node via ``BUILD_AS_STANDBY_NODE``.

     Returns silently when no SSH connection can be obtained. When the
     command fails or the reply does not start with ``1#``, a fatal message
     asking for manual recovery is logged. Nothing is returned.
     """
     (connected, sshClient) = self.getSSHClient(nodeId)
     if not connected:
         return

     displayId = nodeId + 1  # log messages use one-based node numbers
     system_log.info("build the nodeId %d to standby" % displayId)

     agentArgs = [config.dbDatanodePaths, config.dbNodePort]
     (executed, reply) = sshClient.execute(
         self.sshClients, nodeId, 'BUILD_AS_STANDBY_NODE', agentArgs)
     if executed and reply.startswith("1#"):
         return

     system_log.fatal(
         "DB nodeId %d needs manual to recover, the reason is:\n%s" %
         (displayId, reply))
 def connect(self):
     """Open an SSH connection to ``self.host`` using an RSA private key.

     The new paramiko client is stored in ``self.ssh``. The connection
     attempt uses a 10 second timeout.

     Returns:
         True when the connection is established. False on any error; the
         traceback is logged and the partially set-up client is closed.
     """
     client = None
     try:
         client = paramiko.SSHClient()
         self.ssh = client
         # NOTE(security): AutoAddPolicy accepts unknown host keys without
         # verification; kept as in the original.
         client.set_missing_host_key_policy(paramiko.AutoAddPolicy())
         key = paramiko.RSAKey.from_private_key_file(self.private_key_file)
         client.connect(
             hostname=self.host,
             port=self.port,
             username=self.username,
             pkey=key,
             timeout=10)
         return True
     except Exception:
         system_log.fatal(traceback.format_exc())
         # A failed attempt can leave the client's transport open; release
         # it so repeated reconnect attempts do not pile up resources.
         if client is not None:
             try:
                 client.close()
             except Exception:
                 system_log.debug(traceback.format_exc())
         return False
Beispiel #6
0
 def buildStandbyForNotLastPrimary(self, primaryNodeIds):
     """Rebuild as standby every current primary that was not the primary
     before the cluster became unavailable.

     Gives up with a fatal log message unless exactly one primary is
     recorded in ``self.dbClusterBeforeUnaviable``.

     Args:
         primaryNodeIds: Zero-based ids of the nodes currently reporting
             themselves as primary.
     """
     formerPrimaries = self.dbClusterBeforeUnaviable.getPrimaryNodeIds()
     if len(formerPrimaries) != 1:
         system_log.fatal(
             "Can not getting Primary node before Cluster unavailable, DB Cluster needs manual to recover"
         )
         return

     # Every primary other than the recorded former one is rebuilt as standby.
     demoteIds = list(set(primaryNodeIds) - set(formerPrimaries))
     system_log.info(
         "the not last primary nodeIds needed to recover to standby are: %s"
         % [nodeId + 1 for nodeId in demoteIds])
     for nodeId in demoteIds:
         self.buildStanbyForNode(nodeId)
Beispiel #7
0
    def processStatusUnavailable(self, currDbClusterState):
        """Handle a cluster that is reported as unavailable.

        Steps, in order; the first one that applies ends the call:

        1. More than one primary: rebuild the surplus primaries as standby.
        2. Pending nodes exist: wait for them to recover on their own.
        3. The primary before the outage is not uniquely known: log and stop.
        4. Try to recover that former primary; on success recover the
           faulty standbys as well.
        5. Otherwise pick a candidate primary and fail over to it.

        Args:
            currDbClusterState: Current cluster state object.
        """
        currentPrimaries = currDbClusterState.getPrimaryNodeIds()
        if len(currentPrimaries) > 1:
            system_log.info(
                "There are more than one primary nodeIds, current are %s" %
                [nodeId + 1 for nodeId in currentPrimaries])
            self.buildStandbyForNotLastPrimary(currentPrimaries)
            return

        if currDbClusterState.existsPendingNode():
            system_log.info(
                "Current DB Cluster exists pending nodes, Wait for it to recover automatically!"
            )
            return

        formerPrimaries = self.dbClusterBeforeUnaviable.getPrimaryNodeIds()
        if len(formerPrimaries) != 1:
            system_log.fatal(
                "Can not getting Primary node before Cluster became unavailable, DB cluster needs manual to recover"
            )
            return

        # First choice: bring the pre-outage primary back.
        formerPrimaryId = formerPrimaries[0]
        if self.recoveryPrimaryNodeBeforeUnaviable(formerPrimaryId,
                                                   currDbClusterState):
            self.recoveryFaultStandby(currDbClusterState, formerPrimaries)
            return

        system_log.info(
            "recover the nodeId %s to primary failed, so system will find the candidate primary node and make it to primary."
            % (formerPrimaryId + 1))

        # Fallback: find a candidate primary and switch over to it.
        (found, candidatePrimaryNodeId) = self.getCandidatePrimary()
        if not found:
            return

        system_log.info(
            "the candidate primary nodeId is %d and will fail over to it" %
            (candidatePrimaryNodeId + 1))
        self.clusterFailover(formerPrimaryId, candidatePrimaryNodeId,
                             currDbClusterState)
Beispiel #8
0
    def readConfig(self, fileFullName):
        """Load the persisted status from the XML file ``fileFullName``.

        The file name, directory and full path are always recorded on the
        instance. When the file does not exist nothing else is loaded.
        Otherwise the comma-separated integer lists stored in the first
        ``<unfinished_operations>`` and ``<float_ip_state>`` elements are
        read into the instance; an element without children yields an
        empty list.

        Args:
            fileFullName: Path of the status file.

        Returns:
            True when the file is absent or was loaded successfully, False
            when reading or parsing it failed (the traceback is logged).
        """
        self.fileName = os.path.basename(fileFullName)
        self.filePath = os.path.dirname(fileFullName)
        self.fileFullName = fileFullName

        if not os.path.exists(fileFullName):
            system_log.info("no status file: %s is existed." % fileFullName)
            return True

        self.hasStatusFile = True

        def readIntList(rootNode, tagName):
            """Return the integers stored in the first <tagName> element."""
            children = rootNode.getElementsByTagName(tagName)[0].childNodes
            if len(children) == 0:
                return []
            # Skip blank tokens so whitespace-only content (for example a
            # pretty-printed empty element) does not abort the whole load.
            return [
                int(token) for token in children[0].data.split(",")
                if token.strip()
            ]

        try:
            rootNode = parse(fileFullName).documentElement
            self.__unfinishedOperations = readIntList(
                rootNode, "unfinished_operations")
            self.__floatIpStates = readIntList(rootNode, "float_ip_state")

            system_log.info("load status file: %s succeed." % fileFullName)
            system_log.info(str(self))
            return True
        except Exception:
            system_log.error(traceback.format_exc())
            system_log.fatal("load status file: %s failed, system exits." %
                             fileFullName)
            return False
Beispiel #9
0
    def getSSHClient(self, nodeId):
        """Return the SSH client for ``nodeId``, connecting on first use.

        A client already cached in ``self.sshClients`` is returned as is.
        Otherwise a new ``SSH_Client`` is created from the configured node
        IP, database user and private key file, and cached when its
        ``connect()`` succeeds.

        Args:
            nodeId: Zero-based node index (log messages show it one-based).

        Returns:
            ``(True, client)`` on success, ``(False, None)`` when the
            connection could not be established.
        """
        cachedClient = self.sshClients[nodeId]
        if cachedClient is not None:
            return (True, cachedClient)

        sshClient = SSH_Client(config.dbNodeIps[nodeId], config.dbUser,
                               config.gghcPrivateKeyFile)
        if sshClient.connect():
            self.sshClients[nodeId] = sshClient
            system_log.info("get ssh connection for node %s succeed!" %
                            (nodeId + 1))
            return (True, sshClient)

        self.sshClients[nodeId] = None
        system_log.fatal(
            "gets ssh connection for node %s failed, it needs manual support"
            % (nodeId + 1))
        return (False, None)
    def execute(self, sshClients, nodeId, execmd, params=None):
        """Run an agent command on the remote node and return its output.

        The command line is the configured agent path followed by ``execmd``
        and the string form of every entry of ``params``, separated by
        single spaces.

        Args:
            sshClients: Per-node client cache; the entry for ``nodeId`` is
                reset to None when the command cannot be executed.
            nodeId: Zero-based node index (log messages show it one-based).
            execmd: Name of the agent command.
            params: Optional iterable of command arguments.

        Returns:
            ``(True, output)`` with everything read from the command's
            stdout, or ``(False, None)`` when sending the command or reading
            its output raised; in that case this connection is closed.
        """
        cmdParts = ["%s %s" % (config.gghsAgentPath, execmd)]
        cmdParts.extend(str(p) for p in (params or ()))
        cmdLine = " ".join(cmdParts)

        system_log.debug("Send request to nodeId %d: \n%s" % ((nodeId + 1), cmdLine))
        try:
            _, stdout, _ = self.ssh.exec_command(cmdLine, timeout=int(config.sshTimeout), get_pty=True)
            revc_str = "".join(stdout.readlines())
        except Exception:
            # Drop the broken connection so the next getSSHClient() call
            # reconnects instead of reusing it.
            self.close()
            sshClients[nodeId] = None
            system_log.fatal("The network card may be failure in the connection of nodeId %d at sending '%s'.\n%s" 
                             % ((nodeId + 1), execmd, traceback.format_exc()))
            return (False, None)

        system_log.debug("Receive Response from nodeId %d: \n%s" % ((nodeId + 1), revc_str))
        return (True, revc_str)
Beispiel #11
0
    def recoveryPrimaryNodeBeforeUnaviable(self, oldPrimaryNodeId,
                                           currDbClusterState):
        """Try to bring the pre-outage primary node back as primary.

        When the node's current state is "unknown" the agent is asked to
        check and recover it (``CHECK_AND_RECOVERY_PRIMARY_NODE``);
        otherwise a forced recovery (``FORCE_RECOVERY_PRIMARY_NODE``) is
        requested.

        Returns:
            False when the SSH connection or the command fails, or when the
            reply starts with ``0#``. True for ``1#`` (recovered) and also
            for ``2#`` and any other reply, which are only logged as fatal.
        """
        (connected, sshClient) = self.getSSHClient(oldPrimaryNodeId)
        if not connected:
            return False

        nodeState = currDbClusterState.getNodeState(oldPrimaryNodeId)
        if nodeState.lower() == "unknown":
            command = "CHECK_AND_RECOVERY_PRIMARY_NODE"
            agentArgs = [config.dbDatanodePaths, config.dbNodePort]
        else:
            command = "FORCE_RECOVERY_PRIMARY_NODE"
            agentArgs = [config.dbDatanodePaths]

        (executed, reply) = sshClient.execute(self.sshClients,
                                              oldPrimaryNodeId, command,
                                              agentArgs)
        if not executed:
            return False

        displayId = oldPrimaryNodeId + 1  # logs use one-based node numbers
        if reply.startswith("0#"):
            system_log.error(
                "Recover the primary nodeId %d before unavailable failed for the reason:\n%s"
                % (displayId, reply[2:]))
            return False

        if reply.startswith("1#"):
            system_log.info("Recover the nodeId %d to primary succeed." %
                            displayId)
            return True

        # NOTE(review): both branches below report a problem yet return True.
        # In processStatusUnavailable a True result skips the failover, so
        # this looks deliberate - confirm.
        if reply.startswith("2#"):
            system_log.fatal(
                "The database process is existed on nodeId %s, please check its ssh newwork card."
                % displayId)
        else:
            system_log.fatal(
                "The nodeId %s state is unknown and can not connect to it, it needs manual support"
                % displayId)
        return True
Beispiel #12
0
    def confirmPrimaryFloatIp(self, primaryNodeId):
        """Check that the primary node has the floating IP; configure it if not.

        Runs ``CONFIRM_FLOATIP_NETWORK`` on the primary. A ``1#`` reply means
        the floating IP is in place. Any other reply triggers
        ``PRIMARY_ADD_FLOATIP`` to configure it.

        The process is terminated with ``os._exit(-1)`` when the SSH
        connection cannot be obtained or either command cannot be executed.
        """
        displayId = primaryNodeId + 1  # logs use one-based node numbers

        (connected, sshClient) = self.getSSHClient(primaryNodeId)
        if not connected:
            system_log.fatal(
                "Can not get ssh connection to primary nodeId %d to confirm float ip, system exit"
                % displayId)
            os._exit(-1)

        (executed, reply) = sshClient.execute(
            self.sshClients, primaryNodeId, 'CONFIRM_FLOATIP_NETWORK',
            [config.floatipEth, config.floatIp, config.dbNodePort])
        if not executed:
            system_log.fatal(
                "ssh connect to primary nodeId %d to confirm float ip failed, system exit"
                % displayId)
            os._exit(-1)

        if reply.startswith('1#'):
            system_log.info(
                "successfully confirmed primary node has float ip %s on network card %s"
                % (config.floatIp, config.floatipEth))
            return

        system_log.info(
            "Find primary node has no float ip %s on network card %s or not become effective in postgresql.conf"
            % (config.floatIp, config.floatipEth))

        listenIps = self.dbNodeListenIps[primaryNodeId] + ",%s" % config.floatIp
        nodeName = self.lastDbCluster.nodes[primaryNodeId].nodeName
        # NOTE(review): only the transport result is checked here; the text
        # of the agent's reply is not inspected - confirm that is intended.
        (executed, reply) = sshClient.execute(
            self.sshClients, primaryNodeId, 'PRIMARY_ADD_FLOATIP',
            [listenIps, nodeName, config.dbDatanodePaths])
        if executed:
            system_log.info("set float ip on primary node %d succeed" %
                            displayId)
        else:
            system_log.fatal(
                "set float ip on primary node %d failed, system exited" %
                displayId)
            os._exit(-1)
Beispiel #13
0
    def getCandidatePrimary(self):
        """Choose the standby node that should become the new primary.

        Looks at the standby nodes recorded before the cluster became
        unavailable:

        * none - nothing can be promoted; a fatal message is logged.
        * one  - that node is the candidate.
        * many - every standby is asked for its term and lsn
          (``QUERY_NODE_TERM_LSN``) and the one with the highest term wins,
          ties being broken by the highest lsn. A standby whose query fails
          is logged and skipped. Selection is aborted when a standby cannot
          be reached over SSH or its reply cannot be parsed.

        Returns:
            ``(True, nodeId)`` with the zero-based candidate id, or
            ``(False, -1)`` when no candidate could be determined.
        """
        standbyNodeIdsBeforeUnaviable = self.dbClusterBeforeUnaviable.getStandbyNodeIds(
        )
        standbyNodesCount = len(standbyNodeIdsBeforeUnaviable)

        if standbyNodesCount == 0:
            system_log.fatal(
                "DB Cluster '%s' has not standby node before become unavailable,"
                " so can not get candidate primary node. it needs manual support!"
                % str(self.dbClusterBeforeUnaviable))
            return (False, -1)

        if standbyNodesCount == 1:
            system_log.info(
                "DB Cluster '%s' has only one standby node before become unavailable, so "
                "it is the candidate primary node" %
                str(self.dbClusterBeforeUnaviable))
            return (True, standbyNodeIdsBeforeUnaviable[0])

        candidateNodeId = -1
        candidateNodeTermlsn = ()
        for nodeId in standbyNodeIdsBeforeUnaviable:
            (rst, sshClient) = self.getSSHClient(nodeId)
            if not rst:
                system_log.fatal(
                    "Cannot query term and lsn from nodeId %d because can not get ssh connect,"
                    "\nit needs manual support!" % (nodeId + 1))
                return (False, -1)

            (rst, msg) = sshClient.execute(self.sshClients, nodeId,
                                           "QUERY_NODE_TERM_LSN",
                                           [config.dbNodePort])
            if not rst:
                system_log.fatal(
                    "Querying term and lsn from nodeId %d failed for the reason:\n%s"
                    "\nit needs manual support!" % ((nodeId + 1), msg))
                continue
            if not msg.startswith('1#'):
                # '0#' or any other reply: this standby cannot take part.
                system_log.fatal(
                    "Querying term lsn failed in nodeId %d, the reason:\n%s,\nit needs manual support!"
                    % ((nodeId + 1), msg[2:]))
                continue

            (rst, termlsn) = Util.parseTermLsn(msg[2:])
            if not rst:
                system_log.fatal(
                    "Parase term lsn from '%s' in nodeId %d, it needs manual support!"
                    % (msg[2:], (nodeId + 1)))
                return (False, -1)

            # Prefer the most advanced standby: highest term first, then
            # highest lsn. The previous comparison was inverted and picked
            # the standby that was furthest behind.
            # Assumes parseTermLsn yields a (term, lsn) pair in which larger
            # values mean more recent data - TODO confirm.
            if (candidateNodeId == -1
                    or (termlsn[0], termlsn[1]) >
                    (candidateNodeTermlsn[0], candidateNodeTermlsn[1])):
                candidateNodeId = nodeId
                candidateNodeTermlsn = termlsn

        if candidateNodeId == -1:
            # No standby answered the query; reporting success with id -1
            # would let the caller fail over to a non-existent node.
            system_log.fatal(
                "Can not query term and lsn from any standby node of DB Cluster '%s',"
                " so can not get candidate primary node. it needs manual support!"
                % str(self.dbClusterBeforeUnaviable))
            return (False, -1)

        system_log.info("get the candidate nodeId %d" %
                        (candidateNodeId + 1))
        return (True, candidateNodeId)