Example #1
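prepare_fencing_watcher(): if the cluster has no quorum yet, arm a LogWatcher on the fencing start/ok patterns for every node believed to be down, so that fencing triggered by gaining quorum can be tracked.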
    def prepare_fencing_watcher(self, name):
        # If we don't have quorum now but get it as a result of starting this node,
        # then a bunch of nodes might get fenced
        if self.HasQuorum(None):
            self.debug("Have quorum")
            return None

        if not self.templates["Pat:Fencing_start"]:
            print("No start pattern")
            return None

        if not self.templates["Pat:Fencing_ok"]:
            print("No ok pattern")
            return None

        stonithPats = []
        for peer in self.Env["nodes"]:
            if self.ShouldBeStatus[peer] != "up":
                stonithPats.append(self.templates["Pat:Fencing_ok"] % peer)
                stonithPats.append(self.templates["Pat:Fencing_start"] % peer)

        stonith = LogWatcher(self.Env["LogFileName"],
                             stonithPats,
                             "StartupFencing",
                             0,
                             hosts=self.Env["nodes"],
                             kind=self.Env["LogWatcher"])
        stonith.setwatch()
        return stonith
Example #2
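A variant of prepare_fencing_watcher() from Example #1 with an extra branch for the "corosync (cman)" stack: CMAN may fence even fully-up nodes in the window between gaining quorum and fencing startup, so the watcher covers every node.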
    def prepare_fencing_watcher(self, name):
        # If we don't have quorum now but get it as a result of starting this node,
        # then a bunch of nodes might get fenced
        if self.HasQuorum(None):
            self.debug("Have quorum")
            return None

        if not self.templates["Pat:Fencing_start"]:
            print("No start pattern")
            return None

        if not self.templates["Pat:Fencing_ok"]:
            print("No ok pattern")
            return None

        stonithPats = []
        for peer in self.Env["nodes"]:
            if self.ShouldBeStatus[peer] != "up":
                stonithPats.append(self.templates["Pat:Fencing_ok"] % peer)
                stonithPats.append(self.templates["Pat:Fencing_start"] % peer)
            elif self.Env["Stack"] == "corosync (cman)":
                # There is a delay between gaining quorum and CMAN starting fencing
                # This can mean that even nodes that are fully up get fenced
                # There is no use fighting it, just look for everyone so that CTS doesn't get confused
                stonithPats.append(self.templates["Pat:Fencing_ok"] % peer)
                stonithPats.append(self.templates["Pat:Fencing_start"] % peer)

        stonith = LogWatcher(self.Env["LogFileName"],
                             stonithPats,
                             "StartupFencing",
                             0,
                             hosts=self.Env["nodes"],
                             kind=self.Env["LogWatcher"])
        stonith.setwatch()
        return stonith
Example #3
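partition_stable(): have each node in a space-separated node string dump its state, then drain LogWatcher matches with look() until an idle message from one of those nodes confirms the partition has stabilized.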
    def partition_stable(self, nodes, timeout=None):
        watchpats = []
        watchpats.append("Current ping state: S_IDLE")
        watchpats.append(self.templates["Pat:DC_IDLE"])
        self.debug("Waiting for cluster stability...") 

        if timeout is None:
            timeout = self.Env["DeadTime"]

        if len(nodes) < 3:
            # "nodes" is a space-separated string of node names; a string
            # this short cannot describe a real partition
            self.debug("Cluster is inactive")
            return 1

        idle_watch = LogWatcher(self.Env["LogFileName"],
                                watchpats,
                                "ClusterStable",
                                timeout,
                                hosts=nodes.split(),
                                kind=self.Env["LogWatcher"])
        idle_watch.setwatch()

        for node in nodes.split():
            # have each node dump its current state
            self.rsh(node, self.templates["StatusCmd"] % node, 1)

        ret = idle_watch.look()
        while ret:
            self.debug(ret) 
            for node in nodes.split():
                if re.search(node, ret):
                    return 1
            ret = idle_watch.look()

        self.debug("Warn: Partition %s not IDLE after %ds" % (repr(nodes), timeout)) 
        return None
Example #4
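Scenario.SetUp(): prepare the cluster, arm a long-lived "BadNews" watcher on the error patterns for this cluster manager, then set up each component in order, tearing down the partial setup on the first failure.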
    def SetUp(self):
        '''Set up the Scenario. Return TRUE on success.'''

        self.ClusterManager.prepare()
        self.audit()  # Also detects remote/local log config
        self.ClusterManager.StatsMark(0)
        self.ClusterManager.ns.WaitForAllNodesToComeUp(
            self.ClusterManager.Env["nodes"])

        self.audit()
        self.ClusterManager.install_support()

        self.BadNews = LogWatcher(self.ClusterManager.Env["LogFileName"],
                                  self.ClusterManager.templates.get_patterns(
                                      self.ClusterManager.Env["Name"],
                                      "BadNews"),
                                  "BadNews",
                                  0,
                                  kind=self.ClusterManager.Env["LogWatcher"],
                                  hosts=self.ClusterManager.Env["nodes"])
        # Call after we've figured out what type of log watching to do in LogAudit
        self.BadNews.setwatch()

        for j, component in enumerate(self.Components):
            if not component.SetUp(self.ClusterManager):
                # OOPS!  We failed.  Tear partial setups down.
                self.audit()
                self.ClusterManager.log("Tearing down partial setup")
                self.TearDown(j)
                return None

        self.audit()
        return 1
Example #5
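startall(): start the cluster manager on every node (or a given subset). The quick path arms a watcher on per-node startup patterns, fires all the starts at roughly the same time, then reports any patterns that never matched.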
    def startall(self, nodelist=None, verbose=False, quick=False):
        '''Start the cluster manager on every node in the cluster.
        We can do it on a subset of the cluster if nodelist is not None.
        '''
        if not nodelist:
            nodelist = self.Env["nodes"]

        for node in nodelist:
            if self.ShouldBeStatus[node] == "down":
                self.ns.WaitForAllNodesToComeUp(nodelist, 300)

        if not quick:
            # This is used for "basic sanity checks", so only start one node ...
            if not self.StartaCM(nodelist[0], verbose=verbose):
                return 0
            return 1

        # Approximation of SimulStartList for --boot
        watchpats = []
        watchpats.append(self.templates["Pat:DC_IDLE"])
        for node in nodelist:
            watchpats.append(self.templates["Pat:InfraUp"] % node)
            watchpats.append(self.templates["Pat:PacemakerUp"] % node)
            watchpats.append(self.templates["Pat:Local_started"] % node)
            watchpats.append(self.templates["Pat:They_up"] %
                             (nodelist[0], node))

        #   Start all the nodes - at about the same time...
        watch = LogWatcher(self.Env["LogFileName"],
                           watchpats,
                           "fast-start",
                           self.Env["DeadTime"] + 10,
                           hosts=self.Env["nodes"],
                           kind=self.Env["LogWatcher"])
        watch.setwatch()

        if not self.StartaCM(nodelist[0], verbose=verbose):
            return 0
        for node in nodelist:
            self.StartaCMnoBlock(node, verbose=verbose)

        watch.lookforall()
        if watch.unmatched:
            for regex in watch.unmatched:
                self.logger.log("Warn: Startup pattern not found: %s" %
                                (regex))

        if not self.cluster_stable():
            self.logger.log("Cluster did not stabilize")
            return 0

        return 1
Example #6
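test_node_CM(): report a node's cluster-manager state as 0 (down), 1 (up but unstable) or 2 (up and stable), trusting the status command's output first and falling back to the log watcher because syslog-ng can lose messages.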
    def test_node_CM(self, node):
        '''Report the status of the cluster manager on a given node'''

        watchpats = []
        watchpats.append("Current ping state: (S_IDLE|S_NOT_DC)")
        watchpats.append(self.templates["Pat:NonDC_started"] % node)
        watchpats.append(self.templates["Pat:DC_started"] % node)
        idle_watch = LogWatcher(self.Env["LogFileName"],
                                watchpats,
                                "ClusterIdle",
                                hosts=[node],
                                kind=self.Env["LogWatcher"])
        idle_watch.setwatch()

        out = self.rsh(node, self.templates["StatusCmd"] % node, 1)
        self.debug("Node %s status: '%s'" % (node, out))

        if not out or (out.find('ok') < 0):
            if self.ShouldBeStatus[node] == "up":
                self.log(
                    "Node status for %s is %s but we think it should be %s" %
                    (node, "down", self.ShouldBeStatus[node]))
            self.ShouldBeStatus[node] = "down"
            return 0

        if self.ShouldBeStatus[node] == "down":
            self.log(
                "Node status for %s is %s but we think it should be %s: %s" %
                (node, "up", self.ShouldBeStatus[node], out))

        self.ShouldBeStatus[node] = "up"

        # check the output first - because syslog-ng loses messages
        if 'S_NOT_DC' in out or 'S_IDLE' in out:
            # Up and stable
            return 2

        # fall back to syslog-ng and wait
        if not idle_watch.look():
            # just up
            self.debug("Warn: Node %s is unstable: %s" % (node, out))
            return 1

        # Up and stable
        return 2
Example #7
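LogAudit.TestLogging(): write a uniquely tagged message on every node with logger(1), then try each log-reader kind through its own LogWatcher to discover which mechanism actually sees the messages.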
    def TestLogging(self):
        patterns = []
        prefix = "Test message from"
        suffix = str(uuid.uuid4())
        watch = {}

        for node in self.CM.Env["nodes"]:
            # Look for the node name in two places to make sure
            # that syslog is logging with the correct hostname
            m = re.search("^([^.]+).*", node)
            if m:
                simple = m.group(1)
            else:
                simple = node
            patterns.append("%s.*%s %s %s" % (simple, prefix, node, suffix))

        watch_pref = self.CM.Env["LogWatcher"]
        kinds = self.kinds if watch_pref == "any" else [watch_pref]
        for k in kinds:
            watch[k] = LogWatcher(self.CM.Env["LogFileName"],
                                  patterns,
                                  "LogAudit",
                                  5,
                                  silent=True,
                                  hosts=self.CM.Env["nodes"],
                                  kind=k)
            watch[k].setwatch()

        if watch_pref == "any":
            self.CM.log("Writing log with key: %s" % (suffix))
        for node in self.CM.Env["nodes"]:
            cmd = "logger -p %s.info %s %s %s" % (
                self.CM.Env["SyslogFacility"], prefix, node, suffix)
            if self.CM.rsh(node, cmd, synchronous=0, silent=True) != 0:
                self.CM.log("ERROR: Cannot execute remote command [%s] on %s" %
                            (cmd, node))

        for k in self.kinds:
            if k in watch:
                w = watch[k]
                if watch_pref == "any":
                    self.CM.log("Testing for %s logs" % (k))
                w.lookforall(silent=True)
                if not w.unmatched:
                    if watch_pref == "any":
                        self.CM.log("Continuing with %s-based log reader" %
                                    (w.kind))
                        self.CM.Env["LogWatcher"] = w.kind
                    return 1

        for w in watch.values():
            if w.unmatched:
                for regex in w.unmatched:
                    self.CM.log("Test message [%s] not found in %s logs." %
                                (regex, w.kind))

        return 0
Example #8
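StartaCM(): start the cluster manager on a single node, watching both for its startup patterns and, via the fencing watcher from Example #1, for any fencing the start provokes.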
    def StartaCM(self, node, verbose=False):
        '''Start up the cluster manager on a given node'''
        if verbose:
            self.logger.log("Starting %s on node %s" %
                            (self.templates["Name"], node))
        else:
            self.debug("Starting %s on node %s" %
                       (self.templates["Name"], node))

        if node not in self.ShouldBeStatus:
            self.ShouldBeStatus[node] = "down"

        if self.ShouldBeStatus[node] != "down":
            return 1

        patterns = []
        # Technically we should always be able to notice ourselves starting
        patterns.append(self.templates["Pat:Local_started"] % node)
        if self.upcount() == 0:
            patterns.append(self.templates["Pat:DC_started"] % node)
        else:
            patterns.append(self.templates["Pat:NonDC_started"] % node)

        watch = LogWatcher(self.Env["LogFileName"],
                           patterns,
                           "StartaCM",
                           self.Env["StartTime"] + 10,
                           hosts=self.Env["nodes"],
                           kind=self.Env["LogWatcher"])

        self.install_config(node)

        self.ShouldBeStatus[node] = "any"
        if self.StataCM(node) and self.cluster_stable(self.Env["DeadTime"]):
            self.logger.log("%s was already started" % (node))
            return 1

        stonith = self.prepare_fencing_watcher(node)
        watch.setwatch()

        if self.rsh(node, self.templates["StartCmd"]) != 0:
            self.logger.log("Warn: Start command failed on node %s" % (node))
            self.fencing_cleanup(node, stonith)
            return None

        self.ShouldBeStatus[node] = "up"
        watch_result = watch.lookforall()

        if watch.unmatched:
            for regex in watch.unmatched:
                self.logger.log("Warn: Startup pattern not found: %s" %
                                (regex))

        if watch_result and self.cluster_stable(self.Env["DeadTime"]):
            #self.debug("Found match: "+ repr(watch_result))
            self.fencing_cleanup(node, stonith)
            return 1

        elif self.StataCM(node) and self.cluster_stable(self.Env["DeadTime"]):
            self.fencing_cleanup(node, stonith)
            return 1

        self.logger.log("Warn: Start failed for node %s" % (node))
        return None
Example #9
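An older variant of StartaCM() from Example #8 that additionally clears the host cache to exercise autojoin and can wrap the start command in valgrind.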
    def StartaCM(self, node, verbose=False):
        '''Start up the cluster manager on a given node'''
        if verbose:
            self.logger.log("Starting %s on node %s" %
                            (self.templates["Name"], node))
        else:
            self.debug("Starting %s on node %s" %
                       (self.templates["Name"], node))

        if node not in self.ShouldBeStatus:
            self.ShouldBeStatus[node] = "down"

        if self.ShouldBeStatus[node] != "down":
            return 1

        patterns = []
        # Technically we should always be able to notice ourselves starting
        patterns.append(self.templates["Pat:Local_started"] % node)
        if self.upcount() == 0:
            patterns.append(self.templates["Pat:Master_started"] % node)
        else:
            patterns.append(self.templates["Pat:Slave_started"] % node)

        watch = LogWatcher(self.Env["LogFileName"],
                           patterns,
                           "StartaCM",
                           self.Env["StartTime"] + 10,
                           hosts=self.Env["nodes"],
                           kind=self.Env["LogWatcher"])

        self.install_config(node)

        self.ShouldBeStatus[node] = "any"
        if self.StataCM(node) and self.cluster_stable(self.Env["DeadTime"]):
            self.logger.log("%s was already started" % (node))
            return 1

        # Clear out the host cache so autojoin can be exercised
        if self.clear_cache:
            self.debug("Removing cache file on: "+node)
            self.rsh(node, "rm -f "+CTSvars.HA_VARLIBHBDIR+"/hostcache")

        if not self.Env["valgrind-tests"]:
            startCmd = self.templates["StartCmd"]
        else:
            if self.Env["valgrind-prefix"]:
                prefix = self.Env["valgrind-prefix"]
            else:
                prefix = "cts"

            startCmd = """G_SLICE=always-malloc HA_VALGRIND_ENABLED='%s' VALGRIND_OPTS='%s --log-file=/tmp/%s-%s.valgrind' %s""" % (
                self.Env["valgrind-procs"], self.Env["valgrind-opts"], prefix, """%p""", self.templates["StartCmd"])

        stonith = self.prepare_fencing_watcher(node)

        watch.setwatch()

        if self.rsh(node, startCmd) != 0:
            self.logger.log("Warn: Start command failed on node %s" % (node))
            self.fencing_cleanup(node, stonith)
            return None

        self.ShouldBeStatus[node] = "up"
        watch_result = watch.lookforall()

        if watch.unmatched:
            for regex in watch.unmatched:
                self.logger.log("Warn: Startup pattern not found: %s" % (regex))

        if watch_result and self.cluster_stable(self.Env["DeadTime"]):
            #self.debug("Found match: "+ repr(watch_result))
            self.fencing_cleanup(node, stonith)
            return 1

        elif self.StataCM(node) and self.cluster_stable(self.Env["DeadTime"]):
            self.fencing_cleanup(node, stonith)
            return 1

        self.logger.log("Warn: Start failed for node %s" % (node))
        return None
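
Taken together, the examples show one life cycle: build a LogWatcher over a set of regexes, arm it with setwatch() before triggering the event being observed, then consume matches with look() (one at a time, as in Example #3) or lookforall() (the whole set, as in Examples #5 and #7-#9) and inspect unmatched for anything that never appeared. Below is a minimal sketch of that cycle, assuming a CTS-style environment; the import path and the env/templates parameters are placeholders, since the real module layout varies across Pacemaker releases.

    # Minimal LogWatcher life cycle, distilled from the examples above.
    # ASSUMPTION: the import path is illustrative; LogWatcher lives in
    # different modules in different Pacemaker releases.
    from cts.watcher import LogWatcher

    def wait_for_started(env, templates, nodes):
        # One startup pattern per node we expect to come up
        patterns = [templates["Pat:Local_started"] % node for node in nodes]

        watch = LogWatcher(env["LogFileName"],    # log file (or journal) to follow
                           patterns,              # regexes to wait for
                           "ExampleWatch",        # name shown in debug output
                           env["DeadTime"] + 10,  # timeout in seconds
                           hosts=nodes,
                           kind=env["LogWatcher"])

        # Arm the watcher BEFORE starting anything, or early messages
        # can be missed (every example above follows this order).
        watch.setwatch()

        # ... trigger the node starts here ...

        # Block until every pattern matches or the timeout expires;
        # patterns that never matched are left in watch.unmatched.
        watch.lookforall()
        return not watch.unmatched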