Ejemplo n.º 1
0
    def setup_scenario(self, cluster_manager):
        cluster_manager.log("Reusing cluster")

        # Disable STONITH by default. A dedicated ScenarioComponent
        # is in charge of enabling it if requested
        self.rsh_check(self.Env["nodes"][0], "pcs property set stonith-enabled=false")

        # Stop and remove galera if it exists
        # Note1: in order to avoid error when stopping the resource while
        # in unknown state, we first reprobe the resource state.
        # Note2: if you clean and delete before pacemaker had a
        # chance to re-probe state, it will consider resource is stopped
        # and will happily delete the resource from the cib even if
        # galera is still running!
        # Note3: after a cleanup, pacemaker may log a warning log
        # if it finds the resource is still running. This does not
        # count as an error for the CTS test
        target=self.Env["nodes"][0]
        rc = self.rsh(target, "pcs resource unmanage galera")
        if rc == 0:
            patterns = [r"crmd.*:\s*Initiating action.*: probe_complete probe_complete-%s on %s"%(n,n) \
                    for n in self.Env["nodes"]]
            watch=LogWatcher(self.Env["LogFileName"], patterns, None, self.Env["DeadTime"], kind=self.Env["LogWatcher"], hosts=self.Env["nodes"])
            watch.setwatch()
            self.rsh(target, "pcs resource cleanup galera")
            watch.lookforall()
            assert not watch.unmatched, watch.unmatched
            self.rsh(target, "pcs resource disable galera")
            self.rsh(target, "pcs resource manage galera")
            self.rsh(target, "pcs resource delete galera --wait")
Ejemplo n.º 2
0
    def setup_scenario(self, cluster_manager):
        # mysql setup
        self.init_and_setup_mysql_defaults()
        self.setup_galera_config()

        remote_authkey = "/etc/pacemaker/authkey"
        if not self.rsh.exists_on_all(remote_authkey, self.Env["nodes"]):
            self.log("Creating auth key for communication with pacemaker remote")
            with tempfile.NamedTemporaryFile() as tmp:
                tmp.write(os.urandom(4096))
                tmp.flush()
                self.copy_to_nodes([(tmp.name, remote_authkey)], True, "root:haclient", "440")

        # cluster_manager.prepare()

        # stop cluster if previously running, failure is not fatal
        for node in self.Env["nodes"]:
            self.rsh(node, "pcs cluster destroy")
            self.rsh(node, "systemctl stop pacemaker_remote")
            self.rsh(node, "systemctl enable pacemaker")

        # reconfigure cluster for 2-nodes + one remote arbitrator
        self.Env["arb"]=self.Env["nodes"][-1]
        self.Env["nodes"]=self.Env["nodes"][:-1]
        self.rsh_check(self.Env["nodes"][0], "pcs cluster setup --force --name ratester %s %s" % \
                       (self.Env["nodes"][0],self.Env["nodes"][1]))
        # note: setting up cluster disable pacemaker service. re-enable it
        self.rsh_check(self.Env["nodes"][0], "systemctl enable pacemaker")
        self.rsh_check(self.Env["nodes"][0], "pcs cluster start --all")

        # TODO: better way to wait until cluster is started
        time.sleep(8)

        # Disable STONITH by default. A dedicated ScenarioComponent
        # is in charge of enabling it if requested
        self.rsh_check(self.Env["nodes"][0], "pcs property set stonith-enabled=false")

        for node in self.Env["nodes"]:
            self.rsh_check(node, "pcs property set --node %s osprole=controller"%node)

        # cluster_manager.prepare()

        # pacemaker remote to host garbd
        res=self.rsh_check(self.Env["arb"], "systemctl disable pacemaker")
        res=self.rsh_check(self.Env["arb"], "systemctl enable pacemaker_remote")
        res=self.rsh_check(self.Env["arb"], "systemctl start pacemaker_remote")

        remote_ok_pat = self.ratemplates.build("Pat:RscRemoteOp", "start", "arb", "\S+", "ok")
        watch=LogWatcher(self.Env["LogFileName"], [remote_ok_pat], None, self.Env["DeadTime"], kind=self.Env["LogWatcher"], hosts=self.Env["nodes"])
        # watch = self.create_watch([remote_ok_pat], self.Env["DeadTime"])
        watch.setwatch()
        res=self.rsh_check(self.Env["nodes"][0], "pcs resource create arb ocf:pacemaker:remote server=%s reconnect_interval=60 op monitor interval=20"%self.Env["arb"])
        watch.lookforall()
        assert not watch.unmatched, watch.unmatched

        self.rsh_check(self.Env["nodes"][0], "pcs property set --node arb osprole=arbitrator")

        # there's no selinux context for garbd currently
        res=self.rsh_check(self.Env["arb"], "test -x /usr/sbin/setenforce && setenforce 0 || true")
Ejemplo n.º 3
0
    def startall(self, nodelist=None, verbose=False, quick=False):
        '''Start the cluster manager on every node in the cluster.
        We can do it on a subset of the cluster if nodelist is not None.
        '''
        map = {}
        if not nodelist:
            nodelist = self.Env["nodes"]

        for node in nodelist:
            if self.ShouldBeStatus[node] == "down":
                self.ns.WaitForAllNodesToComeUp(nodelist, 300)

        if not quick:
            # This is used for "basic sanity checks", so only start one node ...
            if not self.StartaCM(node, verbose=verbose):
                return 0
            return 1

        # Approximation of SimulStartList for --boot
        watchpats = []
        watchpats.append(self.templates["Pat:DC_IDLE"])
        for node in nodelist:
            watchpats.append(self.templates["Pat:InfraUp"] % node)
            watchpats.append(self.templates["Pat:PacemakerUp"] % node)
            watchpats.append(self.templates["Pat:Local_started"] % node)
            watchpats.append(self.templates["Pat:They_up"] %
                             (nodelist[0], node))

        #   Start all the nodes - at about the same time...
        watch = LogWatcher(self.Env["LogFileName"],
                           watchpats,
                           "fast-start",
                           self.Env["DeadTime"] + 10,
                           hosts=self.Env["nodes"],
                           kind=self.Env["LogWatcher"])
        watch.setwatch()

        if not self.StartaCM(nodelist[0], verbose=verbose):
            return 0
        for node in nodelist:
            self.StartaCMnoBlock(node, verbose=verbose)

        watch.lookforall()
        if watch.unmatched:
            for regex in watch.unmatched:
                self.logger.log("Warn: Startup pattern not found: %s" %
                                (regex))

        if not self.cluster_stable():
            self.logger.log("Cluster did not stabilize")
            return 0

        return 1
Ejemplo n.º 4
0
    def setup_scenario(self, cluster_manager):
        # pre-requisites
        prerequisite = ["/usr/bin/gdb", "/usr/bin/screen", "/usr/bin/dig"]
        missing_reqs = False
        for req in prerequisite:
            if not self.rsh.exists_on_all(req, self.Env["nodes"]):
                self.log("error: %s could not be found on remote nodes. "
                         "Please install the necessary package to run the tests"%  req)
                missing_reqs = True
        assert not missing_reqs

        # galera-specific data
        test_scripts = ["kill-during-txn.gdb", "slow_down_sst.sh"]
        for node in self.Env["nodes"]:
            for script in test_scripts:
                src = os.path.join(os.path.dirname(os.path.abspath(__file__)), script)
                rc = self.rsh.cp(src, "root@%s:/tmp/%s" % (node, script))
                assert rc == 0, \
                    "failed to copy data \"%s\" on remote node \"%s\"" % \
                    (src, node)

        # mysql setup
        self.init_and_setup_mysql_defaults()
        self.setup_galera_config()

        # clean up any traffic control on target network interface
        for node in self.Env["nodes"]:
            self.rsh(node, "/tmp/slow_down_sst.sh -n %s off"%node)

        # stop cluster if previously running, failure is not fatal
        for node in self.Env["nodes"]:
            self.rsh(node, "pcs cluster destroy")
            self.rsh(node, "systemctl enable pacemaker")
            self.rsh(node, "systemctl stop pacemaker_remote")
            self.rsh(node, "systemctl disable pacemaker_remote")

        # create a new cluster
        # note: setting up cluster disable pacemaker service. re-enable it
        patterns = [r"crmd.*:\s*notice:\sState\stransition\sS_STARTING(\s->.*origin=do_started)?",
                    r"crmd.*:\s*notice:\sState\stransition\s.*->\sS_IDLE(\s.*origin=notify_crmd)?"]
        watch = LogWatcher(self.Env["LogFileName"], patterns, None, self.Env["DeadTime"], kind=self.Env["LogWatcher"], hosts=self.Env["nodes"])
        watch.setwatch()
        self.rsh_check(self.Env["nodes"][0], "pcs cluster setup --force --name ratester %s" % \
                       " ".join(self.Env["nodes"]))
        self.rsh_check(self.Env["nodes"][0], "systemctl enable pacemaker")
        self.rsh_check(self.Env["nodes"][0], "pcs cluster start --all")
        # Disable STONITH by default. A dedicated ScenarioComponent
        # is in charge of enabling it if requested
        self.rsh_check(self.Env["nodes"][0], "pcs property set stonith-enabled=false")
        watch.lookforall()
        assert not watch.unmatched, watch.unmatched
Ejemplo n.º 5
0
    def startall(self, nodelist=None, verbose=False, quick=False):

        '''Start the cluster manager on every node in the cluster.
        We can do it on a subset of the cluster if nodelist is not None.
        '''
        map = {}
        if not nodelist:
            nodelist = self.Env["nodes"]

        for node in nodelist:
            if self.ShouldBeStatus[node] == "down":
                self.ns.WaitForAllNodesToComeUp(nodelist, 300)

        if not quick:
            # This is used for "basic sanity checks", so only start one node ...
            if not self.StartaCM(node, verbose=verbose):
                return 0
            return 1

        # Approximation of SimulStartList for --boot 
        watchpats = [ ]
        watchpats.append(self.templates["Pat:DC_IDLE"])
        for node in nodelist:
            watchpats.append(self.templates["Pat:Local_started"] % node)
            watchpats.append(self.templates["Pat:InfraUp"] % node)
            watchpats.append(self.templates["Pat:PacemakerUp"] % node)

        #   Start all the nodes - at about the same time...
        watch = LogWatcher(self.Env["LogFileName"], watchpats, "fast-start", self.Env["DeadTime"]+10, hosts=self.Env["nodes"], kind=self.Env["LogWatcher"])
        watch.setwatch()

        if not self.StartaCM(nodelist[0], verbose=verbose):
            return 0
        for node in nodelist:
            self.StartaCMnoBlock(node, verbose=verbose)

        watch.lookforall()
        if watch.unmatched:
            for regex in watch.unmatched:
                self.logger.log ("Warn: Startup pattern not found: %s" % (regex))

        if not self.cluster_stable():
            self.logger.log("Cluster did not stabilize")
            return 0

        return 1
Ejemplo n.º 6
0
    def setup_scenario(self, cluster_manager):
        # consider cluster has 2-nodes + one remote arbitrator
        cluster_manager.log("Reusing cluster")
        target=self.Env["nodes"][0]

        self.Env["arb"]=self.Env["nodes"][-1]
        self.rsh_check(target, "pcs property set --node arb osprole=arbitrator")

        # attempt at cleaning up and remove garbd if it exists
        rc = self.rsh(target, "pcs resource unmanage garbd")
        if rc == 0:
            self.rsh(target, "pcs resource cleanup garbd")
            self.rsh(target, "pcs resource disable garbd")
            self.rsh(target, "pcs resource manage garbd")
            self.rsh(target, "pcs resource delete garbd --wait")

        self.Env["nodes"]=self.Env["nodes"][:-1]
        for node in self.Env["nodes"]:
            self.rsh_check(node, "pcs property set --node %s osprole=controller"%node)

        # Stop and remove galera if it exists
        # Note1: in order to avoid error when stopping the resource while
        # in unknown state, we first reprobe the resource state.
        # Note2: if you clean and delete before pacemaker had a
        # chance to re-probe state, it will consider resource is stopped
        # and will happily delete the resource from the cib even if
        # galera is still running!
        # Note3: after a cleanup, pacemaker may log a warning log
        # if it finds the resource is still running. This does not
        # count as an error for the CTS test
        rc = self.rsh(target, "pcs resource unmanage galera")
        if rc == 0:
            patterns = [r"crmd.*:\s*Initiating action.*: probe_complete probe_complete-%s on %s"%(n,n) \
                    for n in self.Env["nodes"]]
            watch=LogWatcher(self.Env["LogFileName"], patterns, None, self.Env["DeadTime"], kind=self.Env["LogWatcher"], hosts=self.Env["nodes"])
            watch.setwatch()
            self.rsh(target, "pcs resource cleanup galera")
            watch.lookforall()
            assert not watch.unmatched, watch.unmatched
            self.rsh(target, "pcs resource disable galera")
            self.rsh(target, "pcs resource manage galera")
            self.rsh(target, "pcs resource delete galera --wait")
Ejemplo n.º 7
0
    def StartaCM(self, node, verbose=False):
        '''Start up the cluster manager on a given node'''
        if verbose:
            self.logger.log("Starting %s on node %s" %
                            (self.templates["Name"], node))
        else:
            self.debug("Starting %s on node %s" %
                       (self.templates["Name"], node))
        ret = 1

        if not node in self.ShouldBeStatus:
            self.ShouldBeStatus[node] = "down"

        if self.ShouldBeStatus[node] != "down":
            return 1

        patterns = []
        # Technically we should always be able to notice ourselves starting
        patterns.append(self.templates["Pat:Local_started"] % node)
        if self.upcount() == 0:
            patterns.append(self.templates["Pat:DC_started"] % node)
        else:
            patterns.append(self.templates["Pat:NonDC_started"] % node)

        watch = LogWatcher(self.Env["LogFileName"],
                           patterns,
                           "StartaCM",
                           self.Env["StartTime"] + 10,
                           hosts=self.Env["nodes"],
                           kind=self.Env["LogWatcher"])

        self.install_config(node)

        self.ShouldBeStatus[node] = "any"
        if self.StataCM(node) and self.cluster_stable(self.Env["DeadTime"]):
            self.logger.log("%s was already started" % (node))
            return 1

        stonith = self.prepare_fencing_watcher(node)
        watch.setwatch()

        if self.rsh(node, self.templates["StartCmd"]) != 0:
            self.logger.log("Warn: Start command failed on node %s" % (node))
            self.fencing_cleanup(node, stonith)
            return None

        self.ShouldBeStatus[node] = "up"
        watch_result = watch.lookforall()

        if watch.unmatched:
            for regex in watch.unmatched:
                self.logger.log("Warn: Startup pattern not found: %s" %
                                (regex))

        if watch_result and self.cluster_stable(self.Env["DeadTime"]):
            #self.debug("Found match: "+ repr(watch_result))
            self.fencing_cleanup(node, stonith)
            return 1

        elif self.StataCM(node) and self.cluster_stable(self.Env["DeadTime"]):
            self.fencing_cleanup(node, stonith)
            return 1

        self.logger.log("Warn: Start failed for node %s" % (node))
        return None
Ejemplo n.º 8
0
    def StartaCM(self, node, verbose=False):

        '''Start up the cluster manager on a given node'''
        if verbose: self.logger.log("Starting %s on node %s" % (self.templates["Name"], node))
        else: self.debug("Starting %s on node %s" % (self.templates["Name"], node))
        ret = 1

        if not node in self.ShouldBeStatus:
            self.ShouldBeStatus[node] = "down"

        if self.ShouldBeStatus[node] != "down":
            return 1

        patterns = []
        # Technically we should always be able to notice ourselves starting
        patterns.append(self.templates["Pat:Local_started"] % node)
        if self.upcount() == 0:
            patterns.append(self.templates["Pat:Master_started"] % node)
        else:
            patterns.append(self.templates["Pat:Slave_started"] % node)

        watch = LogWatcher(
            self.Env["LogFileName"], patterns, "StartaCM", self.Env["StartTime"]+10, hosts=self.Env["nodes"], kind=self.Env["LogWatcher"])

        self.install_config(node)

        self.ShouldBeStatus[node] = "any"
        if self.StataCM(node) and self.cluster_stable(self.Env["DeadTime"]):
            self.logger.log ("%s was already started" % (node))
            return 1

        # Clear out the host cache so autojoin can be exercised
        if self.clear_cache:
            self.debug("Removing cache file on: "+node)
            self.rsh(node, "rm -f "+CTSvars.HA_VARLIBHBDIR+"/hostcache")

        if not(self.Env["valgrind-tests"]):
            startCmd = self.templates["StartCmd"]
        else:
            if self.Env["valgrind-prefix"]:
                prefix = self.Env["valgrind-prefix"]
            else:
                prefix = "cts"

            startCmd = """G_SLICE=always-malloc HA_VALGRIND_ENABLED='%s' VALGRIND_OPTS='%s --log-file=/tmp/%s-%s.valgrind' %s""" % (
                self.Env["valgrind-procs"], self.Env["valgrind-opts"], prefix, """%p""", self.templates["StartCmd"])

        stonith = self.prepare_fencing_watcher(node)

        watch.setwatch()

        if self.rsh(node, startCmd) != 0:
            self.logger.log ("Warn: Start command failed on node %s" % (node))
            self.fencing_cleanup(node, stonith)
            return None

        self.ShouldBeStatus[node] = "up"
        watch_result = watch.lookforall()

        if watch.unmatched:
            for regex in watch.unmatched:
                self.logger.log ("Warn: Startup pattern not found: %s" % (regex))

        if watch_result and self.cluster_stable(self.Env["DeadTime"]):
            #self.debug("Found match: "+ repr(watch_result))
            self.fencing_cleanup(node, stonith)
            return 1

        elif self.StataCM(node) and self.cluster_stable(self.Env["DeadTime"]):
            self.fencing_cleanup(node, stonith)
            return 1

        self.logger.log ("Warn: Start failed for node %s" % (node))
        return None
Ejemplo n.º 9
0
    def StartaCM(self, node, verbose=False):

        '''Start up the cluster manager on a given node'''
        if verbose: self.logger.log("Starting %s on node %s" % (self.templates["Name"], node))
        else: self.debug("Starting %s on node %s" % (self.templates["Name"], node))
        ret = 1

        if not node in self.ShouldBeStatus:
            self.ShouldBeStatus[node] = "down"

        if self.ShouldBeStatus[node] != "down":
            return 1

        patterns = []
        # Technically we should always be able to notice ourselves starting
        patterns.append(self.templates["Pat:Local_started"] % node)
        if self.upcount() == 0:
            patterns.append(self.templates["Pat:Master_started"] % node)
        else:
            patterns.append(self.templates["Pat:Slave_started"] % node)

        watch = LogWatcher(
            self.Env["LogFileName"], patterns, "StartaCM", self.Env["StartTime"]+10, hosts=self.Env["nodes"], kind=self.Env["LogWatcher"])

        self.install_config(node)

        self.ShouldBeStatus[node] = "any"
        if self.StataCM(node) and self.cluster_stable(self.Env["DeadTime"]):
            self.logger.log ("%s was already started" % (node))
            return 1

        # Clear out the host cache so autojoin can be exercised
        if self.clear_cache:
            self.debug("Removing cache file on: "+node)
            self.rsh(node, "rm -f "+CTSvars.HA_VARLIBHBDIR+"/hostcache")

        if not(self.Env["valgrind-tests"]):
            startCmd = self.templates["StartCmd"]
        else:
            if self.Env["valgrind-prefix"]:
                prefix = self.Env["valgrind-prefix"]
            else:
                prefix = "cts"

            startCmd = """G_SLICE=always-malloc HA_VALGRIND_ENABLED='%s' VALGRIND_OPTS='%s --log-file=/tmp/%s-%s.valgrind' %s""" % (
                self.Env["valgrind-procs"], self.Env["valgrind-opts"], prefix, """%p""", self.templates["StartCmd"])

        stonith = self.prepare_fencing_watcher(node)

        watch.setwatch()

        if self.rsh(node, startCmd) != 0:
            self.logger.log ("Warn: Start command failed on node %s" % (node))
            self.fencing_cleanup(node, stonith)
            return None

        self.ShouldBeStatus[node] = "up"
        watch_result = watch.lookforall()

        if watch.unmatched:
            for regex in watch.unmatched:
                self.logger.log ("Warn: Startup pattern not found: %s" % (regex))

        if watch_result and self.cluster_stable(self.Env["DeadTime"]):
            #self.debug("Found match: "+ repr(watch_result))
            self.fencing_cleanup(node, stonith)
            return 1

        elif self.StataCM(node) and self.cluster_stable(self.Env["DeadTime"]):
            self.fencing_cleanup(node, stonith)
            return 1

        self.logger.log ("Warn: Start failed for node %s" % (node))
        return None