Example #1
    def prepare_fencing_watcher(self, name):
        # If we don't have quorum now but get it as a result of starting this node,
        # then a bunch of nodes might get fenced
        upnode = None
        if self.HasQuorum(None):
            self.debug("Have quorum")
            return None

        if not self.templates["Pat:Fencing_start"]:
            print("No start pattern")
            return None

        if not self.templates["Pat:Fencing_ok"]:
            print("No ok pattern")
            return None

        stonith = None
        stonithPats = []
        for peer in self.Env["nodes"]:
            if self.ShouldBeStatus[peer] != "up":
                stonithPats.append(self.templates["Pat:Fencing_ok"] % peer)
                stonithPats.append(self.templates["Pat:Fencing_start"] % peer)
            elif self.Env["Stack"] == "corosync (cman)":
                # There is a delay between gaining quorum and CMAN starting fencing
                # This can mean that even nodes that are fully up get fenced
                # There is no use fighting it, just look for everyone so that CTS doesn't get confused
                stonithPats.append(self.templates["Pat:Fencing_ok"] % peer)
                stonithPats.append(self.templates["Pat:Fencing_start"] % peer)

        stonith = LogWatcher(self.Env["LogFileName"], stonithPats, "StartupFencing", 0, hosts=self.Env["nodes"], kind=self.Env["LogWatcher"])
        stonith.setwatch()
        return stonith
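
All the examples on this page build a LogWatcher the same way: the log file name, a list of regex patterns, a label used in diagnostics, a timeout, plus the hosts whose logs are followed and the kind of log reader to use. The annotated sketch below distils just that construction step from Example #1; the import path and the env dictionary (standing in for self.Env) are assumptions, not part of the example.

# Annotated sketch of the LogWatcher construction used throughout these examples.
# The import path differs between Pacemaker versions and `env` stands in for
# self.Env, so treat both as assumptions.
from cts.watcher import LogWatcher  # assumed location of the class

def make_startup_fencing_watcher(env, stonith_pats):
    watch = LogWatcher(env["LogFileName"],      # log file to follow on each host
                       stonith_pats,            # regexes to wait for
                       "StartupFencing",        # label used in debug output
                       0,                       # timeout (this example passes 0)
                       hosts=env["nodes"],      # nodes whose logs are monitored
                       kind=env["LogWatcher"])  # which log-reading mechanism to use
    watch.setwatch()  # arm the watcher before the watched event can occur
    return watch
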
Example #2
    def setup_scenario(self, cluster_manager):
        cluster_manager.log("Reusing cluster")

        # Disable STONITH by default. A dedicated ScenarioComponent
        # is in charge of enabling it if requested
        self.rsh_check(self.Env["nodes"][0], "pcs property set stonith-enabled=false")

        # Stop and remove galera if it exists
        # Note1: in order to avoid error when stopping the resource while
        # in unknown state, we first reprobe the resource state.
        # Note2: if you clean and delete before pacemaker had a
        # chance to re-probe state, it will consider resource is stopped
        # and will happily delete the resource from the cib even if
        # galera is still running!
        # Note3: after a cleanup, pacemaker may log a warning log
        # if it finds the resource is still running. This does not
        # count as an error for the CTS test
        target=self.Env["nodes"][0]
        rc = self.rsh(target, "pcs resource unmanage galera")
        if rc == 0:
            patterns = [r"crmd.*:\s*Initiating action.*: probe_complete probe_complete-%s on %s"%(n,n) \
                    for n in self.Env["nodes"]]
            watch=LogWatcher(self.Env["LogFileName"], patterns, None, self.Env["DeadTime"], kind=self.Env["LogWatcher"], hosts=self.Env["nodes"])
            watch.setwatch()
            self.rsh(target, "pcs resource cleanup galera")
            watch.lookforall()
            assert not watch.unmatched, watch.unmatched
            self.rsh(target, "pcs resource disable galera")
            self.rsh(target, "pcs resource manage galera")
            self.rsh(target, "pcs resource delete galera --wait")
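
Example #2 depends on ordering: the watcher is armed before "pcs resource cleanup galera" runs, so the probe messages that command triggers cannot be missed, and lookforall() plus the assertion on unmatched confirm that every node was re-probed. A hedged sketch of that arm-first, act-second shape follows; run_cmd is a hypothetical callable standing in for the rsh call and is not part of the example.

# Hedged sketch of the arm-first, act-second ordering used in Example #2.
# `run_cmd` is a hypothetical callable (e.g. a bound rsh call).
from cts.watcher import LogWatcher  # assumed import path

def act_and_wait(env, patterns, run_cmd):
    watch = LogWatcher(env["LogFileName"], patterns, None, env["DeadTime"],
                       kind=env["LogWatcher"], hosts=env["nodes"])
    watch.setwatch()    # arm before acting so early log lines are captured
    run_cmd()           # e.g. lambda: rsh(target, "pcs resource cleanup galera")
    watch.lookforall()  # block until every expected pattern is seen or DeadTime expires
    assert not watch.unmatched, watch.unmatched
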
Example #3
    def SetUp(self):
        '''Set up the Scenario. Return TRUE on success.'''

        self.ClusterManager.prepare()
        self.audit()  # Also detects remote/local log config
        self.ClusterManager.StatsMark(0)
        self.ClusterManager.ns.WaitForAllNodesToComeUp(
            self.ClusterManager.Env["nodes"])

        self.audit()
        self.ClusterManager.install_support()

        self.BadNews = LogWatcher(self.ClusterManager.Env["LogFileName"],
                                  self.ClusterManager.templates.get_patterns(
                                      self.ClusterManager.Env["Name"],
                                      "BadNews"),
                                  "BadNews",
                                  0,
                                  kind=self.ClusterManager.Env["LogWatcher"],
                                  hosts=self.ClusterManager.Env["nodes"])
        self.BadNews.setwatch(
        )  # Call after we've figured out what type of log watching to do in LogAudit

        j = 0
        while j < len(self.Components):
            if not self.Components[j].SetUp(self.ClusterManager):
                # OOPS!  We failed.  Tear partial setups down.
                self.audit()
                self.ClusterManager.log("Tearing down partial setup")
                self.TearDown(j)
                return None
            j = j + 1

        self.audit()
        return 1
Example #4
    def partition_stable(self, nodes, timeout=None):
        watchpats = [ ]
        watchpats.append("Current ping state: S_IDLE")
        watchpats.append(self.templates["Pat:DC_IDLE"])
        self.debug("Waiting for cluster stability...") 

        if timeout == None:
            timeout = self.Env["DeadTime"]

        if len(nodes) < 3:
            self.debug("Cluster is inactive") 
            return 1

        idle_watch = LogWatcher(self.Env["LogFileName"], watchpats, "ClusterStable", timeout, hosts=nodes.split(), kind=self.Env["LogWatcher"])
        idle_watch.setwatch()

        for node in nodes.split():
            # have each node dump its current state
            self.rsh(node, self.templates["StatusCmd"] % node, 1)

        ret = idle_watch.look()
        while ret:
            self.debug(ret) 
            for node in nodes.split():
                if re.search(node, ret):
                    return 1
            ret = idle_watch.look()

        self.debug("Warn: Partition %s not IDLE after %ds" % (repr(nodes), timeout)) 
        return None
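
partition_stable() above drains matches with repeated look() calls instead of blocking on lookforall(); each call returns the next matching log line, or a falsy value once the watch timeout expires. A hedged sketch of that polling style is below; predicate is an illustrative stand-in for the per-node regex check.

# Hedged sketch of the look()-polling style used by partition_stable() above.
# `predicate` is a hypothetical callable standing in for the per-node regex check.
import re

def poll_until(watch, predicate):
    """Return the first matched line that satisfies predicate, or None on timeout."""
    line = watch.look()  # next matching line, or falsy once the timeout expires
    while line:
        if predicate(line):
            return line
        line = watch.look()
    return None

# e.g. poll_until(idle_watch, lambda line: any(re.search(n, line) for n in nodes.split()))
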
Example #5
    def prepare_fencing_watcher(self, name):
        # If we don't have quorum now but get it as a result of starting this node,
        # then a bunch of nodes might get fenced
        upnode = None
        if self.HasQuorum(None):
            self.debug("Have quorum")
            return None

        if not self.templates["Pat:Fencing_start"]:
            print("No start pattern")
            return None

        if not self.templates["Pat:Fencing_ok"]:
            print("No ok pattern")
            return None

        stonith = None
        stonithPats = []
        for peer in self.Env["nodes"]:
            if self.ShouldBeStatus[peer] != "up":
                stonithPats.append(self.templates["Pat:Fencing_ok"] % peer)
                stonithPats.append(self.templates["Pat:Fencing_start"] % peer)

        stonith = LogWatcher(self.Env["LogFileName"],
                             stonithPats,
                             "StartupFencing",
                             0,
                             hosts=self.Env["nodes"],
                             kind=self.Env["LogWatcher"])
        stonith.setwatch()
        return stonith
Example #6
    def prepare_fencing_watcher(self, name):
        # If we don't have quorum now but get it as a result of starting this node,
        # then a bunch of nodes might get fenced
        upnode = None
        if self.HasQuorum(None):
            self.debug("Have quorum")
            return None

        if not self.templates["Pat:Fencing_start"]:
            print("No start pattern")
            return None

        if not self.templates["Pat:Fencing_ok"]:
            print("No ok pattern")
            return None

        stonith = None
        stonithPats = []
        for peer in self.Env["nodes"]:
            if self.ShouldBeStatus[peer] != "up":
                stonithPats.append(self.templates["Pat:Fencing_ok"] % peer)
                stonithPats.append(self.templates["Pat:Fencing_start"] % peer)

        stonith = LogWatcher(self.Env["LogFileName"], stonithPats, "StartupFencing", 0, hosts=self.Env["nodes"], kind=self.Env["LogWatcher"])
        stonith.setwatch()
        return stonith
Example #7
    def prepare_fencing_watcher(self, name):
        # If we don't have quorum now but get it as a result of starting this node,
        # then a bunch of nodes might get fenced
        upnode = None
        if self.HasQuorum(None):
            self.debug("Have quorum")
            return None

        if not self.templates["Pat:Fencing_start"]:
            print("No start pattern")
            return None

        if not self.templates["Pat:Fencing_ok"]:
            print("No ok pattern")
            return None

        stonith = None
        stonithPats = []
        for peer in self.Env["nodes"]:
            if self.ShouldBeStatus[peer] != "up":
                stonithPats.append(self.templates["Pat:Fencing_ok"] % peer)
                stonithPats.append(self.templates["Pat:Fencing_start"] % peer)
            elif self.Env["Stack"] == "corosync (cman)":
                # There is a delay between gaining quorum and CMAN starting fencing
                # This can mean that even nodes that are fully up get fenced
                # There is no use fighting it, just look for everyone so that CTS doesn't get confused
                stonithPats.append(self.templates["Pat:Fencing_ok"] % peer)
                stonithPats.append(self.templates["Pat:Fencing_start"] % peer)

        stonith = LogWatcher(self.Env["LogFileName"], stonithPats, "StartupFencing", 0, hosts=self.Env["nodes"], kind=self.Env["LogWatcher"])
        stonith.setwatch()
        return stonith
Example #8
    def setup_scenario(self, cluster_manager):
        # mysql setup
        self.init_and_setup_mysql_defaults()
        self.setup_galera_config()

        remote_authkey = "/etc/pacemaker/authkey"
        if not self.rsh.exists_on_all(remote_authkey, self.Env["nodes"]):
            self.log("Creating auth key for communication with pacemaker remote")
            with tempfile.NamedTemporaryFile() as tmp:
                tmp.write(os.urandom(4096))
                tmp.flush()
                self.copy_to_nodes([(tmp.name, remote_authkey)], True, "root:haclient", "440")

        # cluster_manager.prepare()

        # stop cluster if previously running, failure is not fatal
        for node in self.Env["nodes"]:
            self.rsh(node, "pcs cluster destroy")
            self.rsh(node, "systemctl stop pacemaker_remote")
            self.rsh(node, "systemctl enable pacemaker")

        # reconfigure cluster for 2-nodes + one remote arbitrator
        self.Env["arb"]=self.Env["nodes"][-1]
        self.Env["nodes"]=self.Env["nodes"][:-1]
        self.rsh_check(self.Env["nodes"][0], "pcs cluster setup --force --name ratester %s %s" % \
                       (self.Env["nodes"][0],self.Env["nodes"][1]))
        # note: setting up the cluster disables the pacemaker service; re-enable it
        self.rsh_check(self.Env["nodes"][0], "systemctl enable pacemaker")
        self.rsh_check(self.Env["nodes"][0], "pcs cluster start --all")

        # TODO: better way to wait until cluster is started
        time.sleep(8)

        # Disable STONITH by default. A dedicated ScenarioComponent
        # is in charge of enabling it if requested
        self.rsh_check(self.Env["nodes"][0], "pcs property set stonith-enabled=false")

        for node in self.Env["nodes"]:
            self.rsh_check(node, "pcs property set --node %s osprole=controller"%node)

        # cluster_manager.prepare()

        # pacemaker remote to host garbd
        res=self.rsh_check(self.Env["arb"], "systemctl disable pacemaker")
        res=self.rsh_check(self.Env["arb"], "systemctl enable pacemaker_remote")
        res=self.rsh_check(self.Env["arb"], "systemctl start pacemaker_remote")

        remote_ok_pat = self.ratemplates.build("Pat:RscRemoteOp", "start", "arb", "\S+", "ok")
        watch=LogWatcher(self.Env["LogFileName"], [remote_ok_pat], None, self.Env["DeadTime"], kind=self.Env["LogWatcher"], hosts=self.Env["nodes"])
        # watch = self.create_watch([remote_ok_pat], self.Env["DeadTime"])
        watch.setwatch()
        res=self.rsh_check(self.Env["nodes"][0], "pcs resource create arb ocf:pacemaker:remote server=%s reconnect_interval=60 op monitor interval=20"%self.Env["arb"])
        watch.lookforall()
        assert not watch.unmatched, watch.unmatched

        self.rsh_check(self.Env["nodes"][0], "pcs property set --node arb osprole=arbitrator")

        # there's no selinux context for garbd currently
        res=self.rsh_check(self.Env["arb"], "test -x /usr/sbin/setenforce && setenforce 0 || true")
Example #9
    def setup_scenario(self, cluster_manager):
        # pre-requisites
        prerequisite = ["/usr/bin/gdb", "/usr/bin/screen", "/usr/bin/dig"]
        missing_reqs = False
        for req in prerequisite:
            if not self.rsh.exists_on_all(req, self.Env["nodes"]):
                self.log("error: %s could not be found on remote nodes. "
                         "Please install the necessary package to run the tests"%  req)
                missing_reqs = True
        assert not missing_reqs

        # galera-specific data
        test_scripts = ["kill-during-txn.gdb", "slow_down_sst.sh"]
        for node in self.Env["nodes"]:
            for script in test_scripts:
                src = os.path.join(os.path.dirname(os.path.abspath(__file__)), script)
                rc = self.rsh.cp(src, "root@%s:/tmp/%s" % (node, script))
                assert rc == 0, \
                    "failed to copy data \"%s\" on remote node \"%s\"" % \
                    (src, node)

        # mysql setup
        self.init_and_setup_mysql_defaults()
        self.setup_galera_config()

        # clean up any traffic control on target network interface
        for node in self.Env["nodes"]:
            self.rsh(node, "/tmp/slow_down_sst.sh -n %s off"%node)

        # stop cluster if previously running, failure is not fatal
        for node in self.Env["nodes"]:
            self.rsh(node, "pcs cluster destroy")
            self.rsh(node, "systemctl enable pacemaker")
            self.rsh(node, "systemctl stop pacemaker_remote")
            self.rsh(node, "systemctl disable pacemaker_remote")

        # create a new cluster
        # note: setting up the cluster disables the pacemaker service; re-enable it
        patterns = [r"crmd.*:\s*notice:\sState\stransition\sS_STARTING(\s->.*origin=do_started)?",
                    r"crmd.*:\s*notice:\sState\stransition\s.*->\sS_IDLE(\s.*origin=notify_crmd)?"]
        watch = LogWatcher(self.Env["LogFileName"], patterns, None, self.Env["DeadTime"], kind=self.Env["LogWatcher"], hosts=self.Env["nodes"])
        watch.setwatch()
        self.rsh_check(self.Env["nodes"][0], "pcs cluster setup --force --name ratester %s" % \
                       " ".join(self.Env["nodes"]))
        self.rsh_check(self.Env["nodes"][0], "systemctl enable pacemaker")
        self.rsh_check(self.Env["nodes"][0], "pcs cluster start --all")
        # Disable STONITH by default. A dedicated ScenarioComponent
        # is in charge of enabling it if requested
        self.rsh_check(self.Env["nodes"][0], "pcs property set stonith-enabled=false")
        watch.lookforall()
        assert not watch.unmatched, watch.unmatched
Example #10
    def test_node_CM(self, node):
        '''Report the status of the cluster manager on a given node'''

        watchpats = []
        watchpats.append("Current ping state: (S_IDLE|S_NOT_DC)")
        watchpats.append(self.templates["Pat:NonDC_started"] % node)
        watchpats.append(self.templates["Pat:DC_started"] % node)
        idle_watch = LogWatcher(self.Env["LogFileName"],
                                watchpats,
                                "ClusterIdle",
                                hosts=[node],
                                kind=self.Env["LogWatcher"])
        idle_watch.setwatch()

        out = self.rsh(node, self.templates["StatusCmd"] % node, 1)
        self.debug("Node %s status: '%s'" % (node, out))

        if not out or (out.find('ok') < 0):
            if self.ShouldBeStatus[node] == "up":
                self.log(
                    "Node status for %s is %s but we think it should be %s" %
                    (node, "down", self.ShouldBeStatus[node]))
            self.ShouldBeStatus[node] = "down"
            return 0

        if self.ShouldBeStatus[node] == "down":
            self.log(
                "Node status for %s is %s but we think it should be %s: %s" %
                (node, "up", self.ShouldBeStatus[node], out))

        self.ShouldBeStatus[node] = "up"

        # check the output first - because syslog-ng loses messages
        if out.find('S_NOT_DC') != -1:
            # Up and stable
            return 2
        if out.find('S_IDLE') != -1:
            # Up and stable
            return 2

        # fall back to syslog-ng and wait
        if not idle_watch.look():
            # just up
            self.debug("Warn: Node %s is unstable: %s" % (node, out))
            return 1

        # Up and stable
        return 2
Example #11
    def SetUp(self):
        '''Set up the Scenario. Return TRUE on success.'''

        self.ClusterManager.prepare()
        self.audit() # Also detects remote/local log config
        self.ClusterManager.StatsMark(0)
        self.ClusterManager.ns.WaitForAllNodesToComeUp(self.ClusterManager.Env["nodes"])

        self.audit()
        if self.ClusterManager.Env["valgrind-tests"]:
            self.ClusterManager.install_helper("cts.supp")

        self.BadNews = LogWatcher(self.ClusterManager.Env["LogFileName"],
                                  self.ClusterManager.templates.get_patterns(
                                      self.ClusterManager.Env["Name"], "BadNews"), "BadNews", 0,
                                  kind=self.ClusterManager.Env["LogWatcher"],
                                  hosts=self.ClusterManager.Env["nodes"])
        self.BadNews.setwatch() # Call after we've figured out what type of log watching to do in LogAudit

        j = 0
        while j < len(self.Components):
            if not self.Components[j].SetUp(self.ClusterManager):
                # OOPS!  We failed.  Tear partial setups down.
                self.audit()
                self.ClusterManager.log("Tearing down partial setup")
                self.TearDown(j)
                return None
            j = j + 1

        self.audit()
        return 1
Example #12
    def startall(self, nodelist=None, verbose=False, quick=False):

        '''Start the cluster manager on every node in the cluster.
        We can do it on a subset of the cluster if nodelist is not None.
        '''
        map = {}
        if not nodelist:
            nodelist = self.Env["nodes"]

        for node in nodelist:
            if self.ShouldBeStatus[node] == "down":
                self.ns.WaitForAllNodesToComeUp(nodelist, 300)

        if not quick:
            # This is used for "basic sanity checks", so only start one node ...
            if not self.StartaCM(node, verbose=verbose):
                return 0
            return 1

        # Approximation of SimulStartList for --boot 
        watchpats = [ ]
        watchpats.append(self.templates["Pat:DC_IDLE"])
        for node in nodelist:
            watchpats.append(self.templates["Pat:Local_started"] % node)
            watchpats.append(self.templates["Pat:InfraUp"] % node)
            watchpats.append(self.templates["Pat:PacemakerUp"] % node)

        #   Start all the nodes - at about the same time...
        watch = LogWatcher(self.Env["LogFileName"], watchpats, "fast-start", self.Env["DeadTime"]+10, hosts=self.Env["nodes"], kind=self.Env["LogWatcher"])
        watch.setwatch()

        if not self.StartaCM(nodelist[0], verbose=verbose):
            return 0
        for node in nodelist:
            self.StartaCMnoBlock(node, verbose=verbose)

        watch.lookforall()
        if watch.unmatched:
            for regex in watch.unmatched:
                self.logger.log ("Warn: Startup pattern not found: %s" % (regex))

        if not self.cluster_stable():
            self.logger.log("Cluster did not stabilize")
            return 0

        return 1
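
startall() above treats unmatched startup patterns as warnings rather than failures and lets cluster_stable() make the final call. A small hedged sketch of that tolerant handling follows; the logging module is a stand-in for self.logger and is not part of the example.

# Hedged sketch of the tolerant unmatched handling used by startall() above.
# The logging module stands in for self.logger.
import logging

logger = logging.getLogger("cts.example")

def report_unmatched(watch):
    """Log every startup pattern the watcher never saw; return True if all matched."""
    if watch.unmatched:
        for regex in watch.unmatched:
            logger.warning("Startup pattern not found: %s", regex)
        return False
    return True
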
Example #13
    def test_node_CM(self, node):
        '''Report the status of the cluster manager on a given node'''

        watchpats = [ ]
        watchpats.append("Current ping state: (S_IDLE|S_NOT_DC)")
        watchpats.append(self.templates["Pat:Slave_started"]%node)
        watchpats.append(self.templates["Pat:Master_started"]%node)
        idle_watch = LogWatcher(self.Env["LogFileName"], watchpats, "ClusterIdle", hosts=[node], kind=self.Env["LogWatcher"])
        idle_watch.setwatch()

        out = self.rsh(node, self.templates["StatusCmd"]%node, 1)
        self.debug("Node %s status: '%s'" %(node, out))            

        if not out or string.find(out, 'ok') < 0:
            if self.ShouldBeStatus[node] == "up":
                self.log(
                    "Node status for %s is %s but we think it should be %s"
                    % (node, "down", self.ShouldBeStatus[node]))
            self.ShouldBeStatus[node] = "down"
            return 0

        if self.ShouldBeStatus[node] == "down":
            self.log(
                "Node status for %s is %s but we think it should be %s: %s"
                % (node, "up", self.ShouldBeStatus[node], out))

        self.ShouldBeStatus[node] = "up"

        # check the output first - because syslog-ng loses messages
        if string.find(out, 'S_NOT_DC') != -1:
            # Up and stable
            return 2
        if string.find(out, 'S_IDLE') != -1:
            # Up and stable
            return 2

        # fall back to syslog-ng and wait
        if not idle_watch.look():
            # just up
            self.debug("Warn: Node %s is unstable: %s" % (node, out))
            return 1

        # Up and stable
        return 2
Example #14
    def setup_scenario(self, cluster_manager):
        # consider cluster has 2-nodes + one remote arbitrator
        cluster_manager.log("Reusing cluster")
        target=self.Env["nodes"][0]

        self.Env["arb"]=self.Env["nodes"][-1]
        self.rsh_check(target, "pcs property set --node arb osprole=arbitrator")

        # attempt at cleaning up and remove garbd if it exists
        rc = self.rsh(target, "pcs resource unmanage garbd")
        if rc == 0:
            self.rsh(target, "pcs resource cleanup garbd")
            self.rsh(target, "pcs resource disable garbd")
            self.rsh(target, "pcs resource manage garbd")
            self.rsh(target, "pcs resource delete garbd --wait")

        self.Env["nodes"]=self.Env["nodes"][:-1]
        for node in self.Env["nodes"]:
            self.rsh_check(node, "pcs property set --node %s osprole=controller"%node)

        # Stop and remove galera if it exists
        # Note1: in order to avoid error when stopping the resource while
        # in unknown state, we first reprobe the resource state.
        # Note2: if you clean and delete before pacemaker had a
        # chance to re-probe state, it will consider resource is stopped
        # and will happily delete the resource from the cib even if
        # galera is still running!
        # Note3: after a cleanup, pacemaker may log a warning log
        # if it finds the resource is still running. This does not
        # count as an error for the CTS test
        rc = self.rsh(target, "pcs resource unmanage galera")
        if rc == 0:
            patterns = [r"crmd.*:\s*Initiating action.*: probe_complete probe_complete-%s on %s"%(n,n) \
                    for n in self.Env["nodes"]]
            watch=LogWatcher(self.Env["LogFileName"], patterns, None, self.Env["DeadTime"], kind=self.Env["LogWatcher"], hosts=self.Env["nodes"])
            watch.setwatch()
            self.rsh(target, "pcs resource cleanup galera")
            watch.lookforall()
            assert not watch.unmatched, watch.unmatched
            self.rsh(target, "pcs resource disable galera")
            self.rsh(target, "pcs resource manage galera")
            self.rsh(target, "pcs resource delete galera --wait")
Example #15
    def partition_stable(self, nodes, timeout=None):
        watchpats = [ ]
        watchpats.append("Current ping state: S_IDLE")
        watchpats.append(self.templates["Pat:DC_IDLE"])
        self.debug("Waiting for cluster stability...") 

        if timeout == None:
            timeout = self.Env["DeadTime"]

        if len(nodes) < 3:
            self.debug("Cluster is inactive") 
            return 1

        idle_watch = LogWatcher(self.Env["LogFileName"], watchpats, "ClusterStable", timeout, hosts=nodes.split(), kind=self.Env["LogWatcher"])
        idle_watch.setwatch()

        for node in nodes.split():
            # have each node dump its current state
            self.rsh(node, self.templates["StatusCmd"] % node, 1)

        ret = idle_watch.look()
        while ret:
            self.debug(ret) 
            for node in nodes.split():
                if re.search(node, ret):
                    return 1
            ret = idle_watch.look()

        self.debug("Warn: Partition %s not IDLE after %ds" % (repr(nodes), timeout)) 
        return None
Example #16
    def prepare_fencing_watcher(self, node):
        # If we don't have quorum now but get it as a result of starting this node,
        # then a bunch of nodes might get fenced
        upnode = None
        if self.HasQuorum(None):
            return None

        if not self.has_key("Pat:Fencing_start"):
            return None

        if not self.has_key("Pat:Fencing_ok"):
            return None

        stonith = None
        stonithPats = []
        for peer in self.Env["nodes"]:
            if peer != node and self.ShouldBeStatus[peer] != "up":
                stonithPats.append(self.templates["Pat:Fencing_ok"] % peer)
                stonithPats.append(self.templates["Pat:Fencing_start"] % peer)
            elif self.Env["Stack"] == "corosync (cman)":
                # There is a delay between gaining quorum and CMAN starting fencing
                # This can mean that even nodes that are fully up get fenced
                # There is no use fighting it, just look for everyone so that CTS doesn't get confused
                stonithPats.append(self.templates["Pat:Fencing_ok"] % peer)
                stonithPats.append(self.templates["Pat:Fencing_start"] % peer)

            if peer != node and not upnode and self.ShouldBeStatus[peer] == "up":
                upnode = peer

        # Look for STONITH ops, depending on Env["at-boot"] we might need to change the nodes status
        if not upnode:
            return None

        stonith = LogWatcher(self.Env["LogFileName"], stonithPats, "StartupFencing", 0, hosts=[upnode], kind=self.Env["LogWatcher"])
        stonith.setwatch()
        return stonith
Example #17
    def startall(self, nodelist=None, verbose=False, quick=False):
        '''Start the cluster manager on every node in the cluster.
        We can do it on a subset of the cluster if nodelist is not None.
        '''
        map = {}
        if not nodelist:
            nodelist = self.Env["nodes"]

        for node in nodelist:
            if self.ShouldBeStatus[node] == "down":
                self.ns.WaitForAllNodesToComeUp(nodelist, 300)

        if not quick:
            # This is used for "basic sanity checks", so only start one node ...
            if not self.StartaCM(node, verbose=verbose):
                return 0
            return 1

        # Approximation of SimulStartList for --boot
        watchpats = []
        watchpats.append(self.templates["Pat:DC_IDLE"])
        for node in nodelist:
            watchpats.append(self.templates["Pat:InfraUp"] % node)
            watchpats.append(self.templates["Pat:PacemakerUp"] % node)
            watchpats.append(self.templates["Pat:Local_started"] % node)
            watchpats.append(self.templates["Pat:They_up"] %
                             (nodelist[0], node))

        #   Start all the nodes - at about the same time...
        watch = LogWatcher(self.Env["LogFileName"],
                           watchpats,
                           "fast-start",
                           self.Env["DeadTime"] + 10,
                           hosts=self.Env["nodes"],
                           kind=self.Env["LogWatcher"])
        watch.setwatch()

        if not self.StartaCM(nodelist[0], verbose=verbose):
            return 0
        for node in nodelist:
            self.StartaCMnoBlock(node, verbose=verbose)

        watch.lookforall()
        if watch.unmatched:
            for regex in watch.unmatched:
                self.logger.log("Warn: Startup pattern not found: %s" %
                                (regex))

        if not self.cluster_stable():
            self.logger.log("Cluster did not stabilize")
            return 0

        return 1
Example #18
    def StartaCM(self, node, verbose=False):

        '''Start up the cluster manager on a given node'''
        if verbose: self.logger.log("Starting %s on node %s" % (self.templates["Name"], node))
        else: self.debug("Starting %s on node %s" % (self.templates["Name"], node))
        ret = 1

        if not node in self.ShouldBeStatus:
            self.ShouldBeStatus[node] = "down"

        if self.ShouldBeStatus[node] != "down":
            return 1

        patterns = []
        # Technically we should always be able to notice ourselves starting
        patterns.append(self.templates["Pat:Local_started"] % node)
        if self.upcount() == 0:
            patterns.append(self.templates["Pat:Master_started"] % node)
        else:
            patterns.append(self.templates["Pat:Slave_started"] % node)

        watch = LogWatcher(
            self.Env["LogFileName"], patterns, "StartaCM", self.Env["StartTime"]+10, hosts=self.Env["nodes"], kind=self.Env["LogWatcher"])

        self.install_config(node)

        self.ShouldBeStatus[node] = "any"
        if self.StataCM(node) and self.cluster_stable(self.Env["DeadTime"]):
            self.logger.log ("%s was already started" % (node))
            return 1

        # Clear out the host cache so autojoin can be exercised
        if self.clear_cache:
            self.debug("Removing cache file on: "+node)
            self.rsh(node, "rm -f "+CTSvars.HA_VARLIBHBDIR+"/hostcache")

        if not(self.Env["valgrind-tests"]):
            startCmd = self.templates["StartCmd"]
        else:
            if self.Env["valgrind-prefix"]:
                prefix = self.Env["valgrind-prefix"]
            else:
                prefix = "cts"

            startCmd = """G_SLICE=always-malloc HA_VALGRIND_ENABLED='%s' VALGRIND_OPTS='%s --log-file=/tmp/%s-%s.valgrind' %s""" % (
                self.Env["valgrind-procs"], self.Env["valgrind-opts"], prefix, """%p""", self.templates["StartCmd"])

        stonith = self.prepare_fencing_watcher(node)

        watch.setwatch()

        if self.rsh(node, startCmd) != 0:
            self.logger.log ("Warn: Start command failed on node %s" % (node))
            self.fencing_cleanup(node, stonith)
            return None

        self.ShouldBeStatus[node] = "up"
        watch_result = watch.lookforall()

        if watch.unmatched:
            for regex in watch.unmatched:
                self.logger.log ("Warn: Startup pattern not found: %s" % (regex))

        if watch_result and self.cluster_stable(self.Env["DeadTime"]):
            #self.debug("Found match: "+ repr(watch_result))
            self.fencing_cleanup(node, stonith)
            return 1

        elif self.StataCM(node) and self.cluster_stable(self.Env["DeadTime"]):
            self.fencing_cleanup(node, stonith)
            return 1

        self.logger.log ("Warn: Start failed for node %s" % (node))
        return None
Example #19
    def StartaCM(self, node, verbose=False):
        '''Start up the cluster manager on a given node'''
        if verbose:
            self.logger.log("Starting %s on node %s" %
                            (self.templates["Name"], node))
        else:
            self.debug("Starting %s on node %s" %
                       (self.templates["Name"], node))
        ret = 1

        if not node in self.ShouldBeStatus:
            self.ShouldBeStatus[node] = "down"

        if self.ShouldBeStatus[node] != "down":
            return 1

        patterns = []
        # Technically we should always be able to notice ourselves starting
        patterns.append(self.templates["Pat:Local_started"] % node)
        if self.upcount() == 0:
            patterns.append(self.templates["Pat:DC_started"] % node)
        else:
            patterns.append(self.templates["Pat:NonDC_started"] % node)

        watch = LogWatcher(self.Env["LogFileName"],
                           patterns,
                           "StartaCM",
                           self.Env["StartTime"] + 10,
                           hosts=self.Env["nodes"],
                           kind=self.Env["LogWatcher"])

        self.install_config(node)

        self.ShouldBeStatus[node] = "any"
        if self.StataCM(node) and self.cluster_stable(self.Env["DeadTime"]):
            self.logger.log("%s was already started" % (node))
            return 1

        stonith = self.prepare_fencing_watcher(node)
        watch.setwatch()

        if self.rsh(node, self.templates["StartCmd"]) != 0:
            self.logger.log("Warn: Start command failed on node %s" % (node))
            self.fencing_cleanup(node, stonith)
            return None

        self.ShouldBeStatus[node] = "up"
        watch_result = watch.lookforall()

        if watch.unmatched:
            for regex in watch.unmatched:
                self.logger.log("Warn: Startup pattern not found: %s" %
                                (regex))

        if watch_result and self.cluster_stable(self.Env["DeadTime"]):
            #self.debug("Found match: "+ repr(watch_result))
            self.fencing_cleanup(node, stonith)
            return 1

        elif self.StataCM(node) and self.cluster_stable(self.Env["DeadTime"]):
            self.fencing_cleanup(node, stonith)
            return 1

        self.logger.log("Warn: Start failed for node %s" % (node))
        return None
Example #20
class Scenario:
    (
'''The basic idea of a scenario is that of an ordered list of
ScenarioComponent objects.  Each ScenarioComponent is SetUp() in turn,
and then after the tests have been run, they are torn down using TearDown()
(in reverse order).

A Scenario is applicable to a particular cluster manager iff each
ScenarioComponent is applicable.

A partially set up scenario is torn down if it fails during setup.
''')

    def __init__(self, ClusterManager, Components, Audits, Tests):

        "Initialize the Scenario from the list of ScenarioComponents"

        self.ClusterManager = ClusterManager
        self.Components = Components
        self.Audits  = Audits
        self.Tests = Tests

        self.BadNews = None
        self.TestSets = []
        self.Stats = {"success":0, "failure":0, "BadNews":0, "skipped":0}
        self.Sets = []

        #self.ns=CTS.NodeStatus(self.Env)

        for comp in Components:
            if not issubclass(comp.__class__, ScenarioComponent):
                raise ValueError("Init value must be subclass of ScenarioComponent")

        for audit in Audits:
            if not issubclass(audit.__class__, ClusterAudit):
                raise ValueError("Init value must be subclass of ClusterAudit")

        for test in Tests:
            if not issubclass(test.__class__, CTSTest):
                raise ValueError("Init value must be a subclass of CTSTest")

    def IsApplicable(self):
        (
'''A Scenario IsApplicable() iff each of its ScenarioComponents IsApplicable()
'''
        )

        for comp in self.Components:
            if not comp.IsApplicable():
                return None
        return 1

    def SetUp(self):
        '''Set up the Scenario. Return TRUE on success.'''

        self.ClusterManager.prepare()
        self.audit() # Also detects remote/local log config
        self.ClusterManager.StatsMark(0)
        self.ClusterManager.ns.WaitForAllNodesToComeUp(self.ClusterManager.Env["nodes"])

        self.audit()
        if self.ClusterManager.Env["valgrind-tests"]:
            self.ClusterManager.install_helper("cts.supp")

        self.BadNews = LogWatcher(self.ClusterManager.Env["LogFileName"],
                                  self.ClusterManager.templates.get_patterns(
                                      self.ClusterManager.Env["Name"], "BadNews"), "BadNews", 0,
                                  kind=self.ClusterManager.Env["LogWatcher"],
                                  hosts=self.ClusterManager.Env["nodes"])
        self.BadNews.setwatch() # Call after we've figured out what type of log watching to do in LogAudit

        j = 0
        while j < len(self.Components):
            if not self.Components[j].SetUp(self.ClusterManager):
                # OOPS!  We failed.  Tear partial setups down.
                self.audit()
                self.ClusterManager.log("Tearing down partial setup")
                self.TearDown(j)
                return None
            j = j + 1

        self.audit()
        return 1

    def TearDown(self, max=None):

        '''Tear Down the Scenario - in reverse order.'''

        if max == None:
            max = len(self.Components)-1
        j = max
        while j >= 0:
            self.Components[j].TearDown(self.ClusterManager)
            j = j - 1

        self.audit()
        self.ClusterManager.StatsExtract()

    def incr(self, name):
        '''Increment (or initialize) the value associated with the given name'''
        if not name in self.Stats:
            self.Stats[name] = 0
        self.Stats[name] = self.Stats[name]+1

    def run(self, Iterations):
        self.ClusterManager.oprofileStart()
        try:
            self.run_loop(Iterations)
            self.ClusterManager.oprofileStop()
        except:
            self.ClusterManager.oprofileStop()
            raise

    def run_loop(self, Iterations):
        raise ValueError("Abstract Class member (run_loop)")

    def run_test(self, test, testcount):
        nodechoice = self.ClusterManager.Env.RandomNode()

        ret = 1
        where = ""
        did_run = 0

        self.ClusterManager.StatsMark(testcount)
        self.ClusterManager.instance_errorstoignore_clear()
        self.ClusterManager.log(("Running test %s" % test.name).ljust(35) + (" (%s) " % nodechoice).ljust(15) + "[" + ("%d" % testcount).rjust(3) + "]")

        starttime = test.set_timer()
        if not test.setup(nodechoice):
            self.ClusterManager.log("Setup failed")
            ret = 0

        elif not test.canrunnow(nodechoice):
            self.ClusterManager.log("Skipped")
            test.skipped()

        else:
            did_run = 1
            ret = test(nodechoice)

        if not test.teardown(nodechoice):
            self.ClusterManager.log("Teardown failed")
            if self.ClusterManager.Env["continue"] == 1:
                answer = "Y"
            else:
                try:
                    answer = raw_input('Continue? [nY]')
                except EOFError as e:
                    answer = "n"
            if answer and answer == "n":
                raise ValueError("Teardown of %s on %s failed" % (test.name, nodechoice))
            ret = 0

        stoptime = time.time()
        self.ClusterManager.oprofileSave(testcount)

        elapsed_time = stoptime - starttime
        test_time = stoptime - test.get_timer()
        if not test["min_time"]:
            test["elapsed_time"] = elapsed_time
            test["min_time"] = test_time
            test["max_time"] = test_time
        else:
            test["elapsed_time"] = test["elapsed_time"] + elapsed_time
            if test_time < test["min_time"]:
                test["min_time"] = test_time
            if test_time > test["max_time"]:
                test["max_time"] = test_time

        if ret:
            self.incr("success")
            test.log_timer()
        else:
            self.incr("failure")
            self.ClusterManager.statall()
            did_run = 1  # Force the test count to be incremented anyway so test extraction works

        self.audit(test.errorstoignore())
        return did_run

    def summarize(self):
        self.ClusterManager.log("****************")
        self.ClusterManager.log("Overall Results:" + repr(self.Stats))
        self.ClusterManager.log("****************")

        stat_filter = {
            "calls":0,
            "failure":0,
            "skipped":0,
            "auditfail":0,
            }
        self.ClusterManager.log("Test Summary")
        for test in self.Tests:
            for key in list(stat_filter.keys()):
                stat_filter[key] = test.Stats[key]
            self.ClusterManager.log(("Test %s: "%test.name).ljust(25) + " %s"%repr(stat_filter))

        self.ClusterManager.debug("Detailed Results")
        for test in self.Tests:
            self.ClusterManager.debug(("Test %s: "%test.name).ljust(25) + " %s"%repr(test.Stats))

        self.ClusterManager.log("<<<<<<<<<<<<<<<< TESTS COMPLETED")

    def audit(self, LocalIgnore=[]):
        errcount = 0
        ignorelist = []
        ignorelist.append("CTS:")
        ignorelist.extend(LocalIgnore)
        ignorelist.extend(self.ClusterManager.errorstoignore())
        ignorelist.extend(self.ClusterManager.instance_errorstoignore())

        # This makes sure everything is stabilized before starting...
        failed = 0
        for audit in self.Audits:
            if not audit():
                self.ClusterManager.log("Audit " + audit.name() + " FAILED.")
                failed += 1
            else:
                self.ClusterManager.debug("Audit " + audit.name() + " passed.")

        while errcount < 1000:
            match = None
            if self.BadNews:
                match = self.BadNews.look(0)

            if match:
                add_err = 1
                for ignore in ignorelist:
                    if add_err == 1 and re.search(ignore, match):
                        add_err = 0
                if add_err == 1:
                    self.ClusterManager.log("BadNews: " + match)
                    self.incr("BadNews")
                    errcount = errcount + 1
            else:
                break
        else:
            if self.ClusterManager.Env["continue"] == 1:
                answer = "Y"
            else:
                try:
                    answer = raw_input('Big problems. Continue? [nY]')
                except EOFError as e:
                    answer = "n"
            if answer and answer == "n":
                self.ClusterManager.log("Shutting down.")
                self.summarize()
                self.TearDown()
                raise ValueError("Looks like we hit a BadNews jackpot!")

        if self.BadNews:
            self.BadNews.end()
        return failed
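
Scenario.audit() above shows the other consumption style: a long-lived watcher (BadNews) armed once in SetUp() and drained non-blockingly with look(0), with each hit filtered against an ignore list. A condensed, hedged sketch of that drain loop follows; the ignore list and the log callable are placeholders for the scenario's own.

# Hedged sketch of the BadNews drain loop from Scenario.audit() above.
# `ignorelist` and `log` are placeholders for the scenario's own ignore list and logger.
import re

def drain_bad_news(watch, ignorelist, log, limit=1000):
    """Report every unignored match currently buffered by the watcher."""
    reported = 0
    while reported < limit:
        match = watch.look(0)  # non-blocking: returns nothing when no line is queued
        if not match:
            break
        if not any(re.search(ignore, match) for ignore in ignorelist):
            log("BadNews: " + match)
            reported += 1
    return reported
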
Example #21
class Scenario:
    ('''The basic idea of a scenario is that of an ordered list of
ScenarioComponent objects.  Each ScenarioComponent is SetUp() in turn,
and then after the tests have been run, they are torn down using TearDown()
(in reverse order).

A Scenario is applicable to a particular cluster manager iff each
ScenarioComponent is applicable.

A partially set up scenario is torn down if it fails during setup.
''')

    def __init__(self, ClusterManager, Components, Audits, Tests):

        "Initialize the Scenario from the list of ScenarioComponents"

        self.ClusterManager = ClusterManager
        self.Components = Components
        self.Audits = Audits
        self.Tests = Tests

        self.BadNews = None
        self.TestSets = []
        self.Stats = {"success": 0, "failure": 0, "BadNews": 0, "skipped": 0}
        self.Sets = []

        #self.ns=CTS.NodeStatus(self.Env)

        for comp in Components:
            if not issubclass(comp.__class__, ScenarioComponent):
                raise ValueError(
                    "Init value must be subclass of ScenarioComponent")

        for audit in Audits:
            if not issubclass(audit.__class__, ClusterAudit):
                raise ValueError("Init value must be subclass of ClusterAudit")

        for test in Tests:
            if not issubclass(test.__class__, CTSTest):
                raise ValueError("Init value must be a subclass of CTSTest")

    def IsApplicable(self):
        ('''A Scenario IsApplicable() iff each of its ScenarioComponents IsApplicable()
''')

        for comp in self.Components:
            if not comp.IsApplicable():
                return None
        return 1

    def SetUp(self):
        '''Set up the Scenario. Return TRUE on success.'''

        self.ClusterManager.prepare()
        self.audit()  # Also detects remote/local log config
        self.ClusterManager.StatsMark(0)
        self.ClusterManager.ns.WaitForAllNodesToComeUp(
            self.ClusterManager.Env["nodes"])

        self.audit()
        if self.ClusterManager.Env["valgrind-tests"]:
            self.ClusterManager.install_helper("cts.supp")

        self.BadNews = LogWatcher(self.ClusterManager.Env["LogFileName"],
                                  self.ClusterManager.templates.get_patterns(
                                      self.ClusterManager.Env["Name"],
                                      "BadNews"),
                                  "BadNews",
                                  0,
                                  kind=self.ClusterManager.Env["LogWatcher"],
                                  hosts=self.ClusterManager.Env["nodes"])
        self.BadNews.setwatch(
        )  # Call after we've figured out what type of log watching to do in LogAudit

        j = 0
        while j < len(self.Components):
            if not self.Components[j].SetUp(self.ClusterManager):
                # OOPS!  We failed.  Tear partial setups down.
                self.audit()
                self.ClusterManager.log("Tearing down partial setup")
                self.TearDown(j)
                return None
            j = j + 1

        self.audit()
        return 1

    def TearDown(self, max=None):
        '''Tear Down the Scenario - in reverse order.'''

        if max == None:
            max = len(self.Components) - 1
        j = max
        while j >= 0:
            self.Components[j].TearDown(self.ClusterManager)
            j = j - 1

        self.audit()
        self.ClusterManager.StatsExtract()

    def incr(self, name):
        '''Increment (or initialize) the value associated with the given name'''
        if not name in self.Stats:
            self.Stats[name] = 0
        self.Stats[name] = self.Stats[name] + 1

    def run(self, Iterations):
        self.ClusterManager.oprofileStart()
        try:
            self.run_loop(Iterations)
            self.ClusterManager.oprofileStop()
        except:
            self.ClusterManager.oprofileStop()
            raise

    def run_loop(self, Iterations):
        raise ValueError("Abstract Class member (run_loop)")

    def run_test(self, test, testcount):
        nodechoice = self.ClusterManager.Env.RandomNode()

        ret = 1
        where = ""
        did_run = 0

        self.ClusterManager.StatsMark(testcount)
        self.ClusterManager.instance_errorstoignore_clear()
        self.ClusterManager.log(("Running test %s" % test.name).ljust(35) +
                                (" (%s) " % nodechoice).ljust(15) + "[" +
                                ("%d" % testcount).rjust(3) + "]")

        starttime = test.set_timer()
        if not test.setup(nodechoice):
            self.ClusterManager.log("Setup failed")
            ret = 0

        elif not test.canrunnow(nodechoice):
            self.ClusterManager.log("Skipped")
            test.skipped()

        else:
            did_run = 1
            ret = test(nodechoice)

        if not test.teardown(nodechoice):
            self.ClusterManager.log("Teardown failed")
            answer = raw_input('Continue? [nY] ')
            if answer and answer == "n":
                raise ValueError("Teardown of %s on %s failed" %
                                 (test.name, nodechoice))
            ret = 0

        stoptime = time.time()
        self.ClusterManager.oprofileSave(testcount)

        elapsed_time = stoptime - starttime
        test_time = stoptime - test.get_timer()
        if not test["min_time"]:
            test["elapsed_time"] = elapsed_time
            test["min_time"] = test_time
            test["max_time"] = test_time
        else:
            test["elapsed_time"] = test["elapsed_time"] + elapsed_time
            if test_time < test["min_time"]:
                test["min_time"] = test_time
            if test_time > test["max_time"]:
                test["max_time"] = test_time

        if ret:
            self.incr("success")
            test.log_timer()
        else:
            self.incr("failure")
            self.ClusterManager.statall()
            did_run = 1  # Force the test count to be incremented anyway so test extraction works

        self.audit(test.errorstoignore())
        return did_run

    def summarize(self):
        self.ClusterManager.log("****************")
        self.ClusterManager.log("Overall Results:" + repr(self.Stats))
        self.ClusterManager.log("****************")

        stat_filter = {
            "calls": 0,
            "failure": 0,
            "skipped": 0,
            "auditfail": 0,
        }
        self.ClusterManager.log("Test Summary")
        for test in self.Tests:
            for key in list(stat_filter.keys()):
                stat_filter[key] = test.Stats[key]
            self.ClusterManager.log(("Test %s: " % test.name).ljust(25) +
                                    " %s" % repr(stat_filter))

        self.ClusterManager.debug("Detailed Results")
        for test in self.Tests:
            self.ClusterManager.debug(("Test %s: " % test.name).ljust(25) +
                                      " %s" % repr(test.Stats))

        self.ClusterManager.log("<<<<<<<<<<<<<<<< TESTS COMPLETED")

    def audit(self, LocalIgnore=[]):
        errcount = 0
        ignorelist = []
        ignorelist.append("CTS:")
        ignorelist.extend(LocalIgnore)
        ignorelist.extend(self.ClusterManager.errorstoignore())
        ignorelist.extend(self.ClusterManager.instance_errorstoignore())

        # This makes sure everything is stabilized before starting...
        failed = 0
        for audit in self.Audits:
            if not audit():
                self.ClusterManager.log("Audit " + audit.name() + " FAILED.")
                failed += 1
            else:
                self.ClusterManager.debug("Audit " + audit.name() + " passed.")

        while errcount < 1000:
            match = None
            if self.BadNews:
                match = self.BadNews.look(0)

            if match:
                add_err = 1
                for ignore in ignorelist:
                    if add_err == 1 and re.search(ignore, match):
                        add_err = 0
                if add_err == 1:
                    self.ClusterManager.log("BadNews: " + match)
                    self.incr("BadNews")
                    errcount = errcount + 1
            else:
                break
        else:
            answer = raw_input('Big problems.  Continue? [nY]')
            if answer and answer == "n":
                self.ClusterManager.log("Shutting down.")
                self.summarize()
                self.TearDown()
                raise ValueError("Looks like we hit a BadNews jackpot!")

        if self.BadNews:
            self.BadNews.end()
        return failed
Example #22
    def StartaCM(self, node, verbose=False):

        '''Start up the cluster manager on a given node'''
        if verbose: self.logger.log("Starting %s on node %s" % (self.templates["Name"], node))
        else: self.debug("Starting %s on node %s" % (self.templates["Name"], node))
        ret = 1

        if not node in self.ShouldBeStatus:
            self.ShouldBeStatus[node] = "down"

        if self.ShouldBeStatus[node] != "down":
            return 1

        patterns = []
        # Technically we should always be able to notice ourselves starting
        patterns.append(self.templates["Pat:Local_started"] % node)
        if self.upcount() == 0:
            patterns.append(self.templates["Pat:Master_started"] % node)
        else:
            patterns.append(self.templates["Pat:Slave_started"] % node)

        watch = LogWatcher(
            self.Env["LogFileName"], patterns, "StartaCM", self.Env["StartTime"]+10, hosts=self.Env["nodes"], kind=self.Env["LogWatcher"])

        self.install_config(node)

        self.ShouldBeStatus[node] = "any"
        if self.StataCM(node) and self.cluster_stable(self.Env["DeadTime"]):
            self.logger.log ("%s was already started" % (node))
            return 1

        # Clear out the host cache so autojoin can be exercised
        if self.clear_cache:
            self.debug("Removing cache file on: "+node)
            self.rsh(node, "rm -f "+CTSvars.HA_VARLIBHBDIR+"/hostcache")

        if not(self.Env["valgrind-tests"]):
            startCmd = self.templates["StartCmd"]
        else:
            if self.Env["valgrind-prefix"]:
                prefix = self.Env["valgrind-prefix"]
            else:
                prefix = "cts"

            startCmd = """G_SLICE=always-malloc HA_VALGRIND_ENABLED='%s' VALGRIND_OPTS='%s --log-file=/tmp/%s-%s.valgrind' %s""" % (
                self.Env["valgrind-procs"], self.Env["valgrind-opts"], prefix, """%p""", self.templates["StartCmd"])

        stonith = self.prepare_fencing_watcher(node)

        watch.setwatch()

        if self.rsh(node, startCmd) != 0:
            self.logger.log ("Warn: Start command failed on node %s" % (node))
            self.fencing_cleanup(node, stonith)
            return None

        self.ShouldBeStatus[node] = "up"
        watch_result = watch.lookforall()

        if watch.unmatched:
            for regex in watch.unmatched:
                self.logger.log ("Warn: Startup pattern not found: %s" % (regex))

        if watch_result and self.cluster_stable(self.Env["DeadTime"]):
            #self.debug("Found match: "+ repr(watch_result))
            self.fencing_cleanup(node, stonith)
            return 1

        elif self.StataCM(node) and self.cluster_stable(self.Env["DeadTime"]):
            self.fencing_cleanup(node, stonith)
            return 1

        self.logger.log ("Warn: Start failed for node %s" % (node))
        return None
Example #23
    def TestLogging(self):
        patterns = []
        prefix = "Test message from"
        suffix = str(uuid.uuid4())
        watch = {}

        for node in self.CM.Env["nodes"]:
            # Look for the node name in two places to make sure
            # that syslog is logging with the correct hostname
            m = re.search("^([^.]+).*", node)
            if m:
                simple = m.group(1)
            else:
                simple = node
            patterns.append("%s.*%s %s %s" % (simple, prefix, node, suffix))

        watch_pref = self.CM.Env["LogWatcher"]
        if watch_pref == "any":
            for k in self.kinds:
                watch[k] = LogWatcher(self.CM.Env["LogFileName"],
                                      patterns,
                                      "LogAudit",
                                      5,
                                      silent=True,
                                      hosts=self.CM.Env["nodes"],
                                      kind=k)
                watch[k].setwatch()
        else:
            k = watch_pref
            watch[k] = LogWatcher(self.CM.Env["LogFileName"],
                                  patterns,
                                  "LogAudit",
                                  5,
                                  silent=True,
                                  hosts=self.CM.Env["nodes"],
                                  kind=k)
            watch[k].setwatch()

        if watch_pref == "any":
            self.CM.log("Writing log with key: %s" % (suffix))
        for node in self.CM.Env["nodes"]:
            cmd = "logger -p %s.info %s %s %s" % (
                self.CM.Env["SyslogFacility"], prefix, node, suffix)
            if self.CM.rsh(node, cmd, synchronous=0, silent=True) != 0:
                self.CM.log("ERROR: Cannot execute remote command [%s] on %s" %
                            (cmd, node))

        for k in self.kinds:
            if k in watch:
                w = watch[k]
                if watch_pref == "any":
                    self.CM.log("Testing for %s logs" % (k))
                w.lookforall(silent=True)
                if not w.unmatched:
                    if watch_pref == "any":
                        self.CM.log("Continuing with %s-based log reader" %
                                    (w.kind))
                        self.CM.Env["LogWatcher"] = w.kind
                    return 1

        for k in list(watch.keys()):
            w = watch[k]
            if w.unmatched:
                for regex in w.unmatched:
                    self.CM.log("Test message [%s] not found in %s logs." %
                                (regex, w.kind))

        return 0
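
TestLogging() above arms one watcher per candidate log-reader kind and keeps the first kind whose watcher sees the test messages on every node. A hedged sketch of that probing idea is below; kinds and env are illustrative stand-ins for self.kinds and self.CM.Env, and the step that actually emits the test messages is elided.

# Hedged sketch of probing several log-watcher kinds, as TestLogging() does above.
# `kinds` and `env` stand in for self.kinds and self.CM.Env.
from cts.watcher import LogWatcher  # assumed import path

def pick_working_log_kind(env, patterns, kinds):
    """Return the first log-reader kind that matches every test pattern, else None."""
    watchers = {}
    for k in kinds:
        watchers[k] = LogWatcher(env["LogFileName"], patterns, "LogAudit", 5,
                                 silent=True, hosts=env["nodes"], kind=k)
        watchers[k].setwatch()
    # ... emit the test log messages on every node here (e.g. via logger over rsh) ...
    for k, w in watchers.items():
        w.lookforall(silent=True)
        if not w.unmatched:
            return k
    return None
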
Example #24
class Scenario:
    ('''The basic idea of a scenario is that of an ordered list of
ScenarioComponent objects.  Each ScenarioComponent is SetUp() in turn,
and then after the tests have been run, they are torn down using TearDown()
(in reverse order).

A Scenario is applicable to a particular cluster manager iff each
ScenarioComponent is applicable.

A partially set up scenario is torn down if it fails during setup.
''')

    def __init__(self, ClusterManager, Components, Audits, Tests):

        "Initialize the Scenario from the list of ScenarioComponents"

        self.ClusterManager = ClusterManager
        self.Components = Components
        self.Audits = Audits
        self.Tests = Tests

        self.BadNews = None
        self.TestSets = []
        self.Stats = {"success": 0, "failure": 0, "BadNews": 0, "skipped": 0}
        self.Sets = []

        #self.ns=CTS.NodeStatus(self.Env)

        for comp in Components:
            if not issubclass(comp.__class__, ScenarioComponent):
                raise ValueError(
                    "Init value must be subclass of ScenarioComponent")

        for audit in Audits:
            if not issubclass(audit.__class__, ClusterAudit):
                raise ValueError("Init value must be subclass of ClusterAudit")

        for test in Tests:
            if not issubclass(test.__class__, CTSTest):
                raise ValueError("Init value must be a subclass of CTSTest")

    def IsApplicable(self):
        ('''A Scenario IsApplicable() iff each of its ScenarioComponents IsApplicable()
''')

        for comp in self.Components:
            if not comp.IsApplicable():
                return None
        return 1

    def SetUp(self):
        '''Set up the Scenario. Return TRUE on success.'''

        self.ClusterManager.prepare()
        self.audit()  # Also detects remote/local log config
        self.ClusterManager.StatsMark(0)
        self.ClusterManager.ns.WaitForAllNodesToComeUp(
            self.ClusterManager.Env["nodes"])

        self.audit()
        if self.ClusterManager.Env["valgrind-tests"]:
            self.ClusterManager.install_helper("cts.supp")

        self.BadNews = LogWatcher(self.ClusterManager.Env["LogFileName"],
                                  self.ClusterManager.templates.get_patterns(
                                      self.ClusterManager.Env["Name"],
                                      "BadNews"),
                                  "BadNews",
                                  0,
                                  kind=self.ClusterManager.Env["LogWatcher"],
                                  hosts=self.ClusterManager.Env["nodes"])
        self.BadNews.setwatch(
        )  # Call after we've figured out what type of log watching to do in LogAudit

        j = 0
        while j < len(self.Components):
            if not self.Components[j].SetUp(self.ClusterManager):
                # OOPS!  We failed.  Tear partial setups down.
                self.audit()
                self.ClusterManager.log("Tearing down partial setup")
                self.TearDown(j)
                return None
            j = j + 1

        self.audit()
        return 1

    def TearDown(self, max=None):
        '''Tear Down the Scenario - in reverse order.'''

        if max == None:
            max = len(self.Components) - 1
        j = max
        while j >= 0:
            self.Components[j].TearDown(self.ClusterManager)
            j = j - 1

        self.audit()
        self.ClusterManager.StatsExtract()

    def incr(self, name):
        '''Increment (or initialize) the value associated with the given name'''
        if not name in self.Stats:
            self.Stats[name] = 0
        self.Stats[name] = self.Stats[name] + 1

    def run(self, Iterations):
        self.ClusterManager.oprofileStart()
        try:
            self.run_loop(Iterations)
            self.ClusterManager.oprofileStop()
        except:
            self.ClusterManager.oprofileStop()
            raise

    def run_loop(self, Iterations):
        raise ValueError("Abstract Class member (run_loop)")

    def run_test(self, test, testcount):
        nodechoice = self.ClusterManager.Env.RandomNode()

        ret = 1
        where = ""
        did_run = 0

        self.ClusterManager.StatsMark(testcount)
        self.ClusterManager.instance_errorstoignore_clear()
        self.ClusterManager.log(("Running test %s" % test.name).ljust(35) +
                                (" (%s) " % nodechoice).ljust(15) + "[" +
                                ("%d" % testcount).rjust(3) + "]")

        starttime = test.set_timer()
        if not test.setup(nodechoice):
            self.ClusterManager.log("Setup failed")
            ret = 0

        elif not test.canrunnow(nodechoice):
            self.ClusterManager.log("Skipped")
            test.skipped()

        else:
            did_run = 1
            ret = test(nodechoice)

        if not test.teardown(nodechoice):
            self.ClusterManager.log("Teardown failed")
            if self.ClusterManager.Env["continue"] == 1:
                answer = "Y"
            else:
                try:
                    answer = raw_input('Continue? [nY]')
                except EOFError as e:
                    answer = "n"
            if answer and answer == "n":
                raise ValueError("Teardown of %s on %s failed" %
                                 (test.name, nodechoice))
            ret = 0

        stoptime = time.time()
        self.ClusterManager.oprofileSave(testcount)

        elapsed_time = stoptime - starttime
        test_time = stoptime - test.get_timer()
        if not test["min_time"]:
            test["elapsed_time"] = elapsed_time
            test["min_time"] = test_time
            test["max_time"] = test_time
        else:
            test["elapsed_time"] = test["elapsed_time"] + elapsed_time
            if test_time < test["min_time"]:
                test["min_time"] = test_time
            if test_time > test["max_time"]:
                test["max_time"] = test_time

        if ret:
            self.incr("success")
            test.log_timer()
        else:
            self.incr("failure")
            self.ClusterManager.statall()
            did_run = 1  # Force the test count to be incremented anyway so test extraction works

        self.audit(test.errorstoignore())
        return did_run