def setup_scenario(self, cluster_manager): cluster_manager.log("Reusing cluster") # Disable STONITH by default. A dedicated ScenarioComponent # is in charge of enabling it if requested self.rsh_check(self.Env["nodes"][0], "pcs property set stonith-enabled=false") # Stop and remove galera if it exists # Note1: in order to avoid error when stopping the resource while # in unknown state, we first reprobe the resource state. # Note2: if you clean and delete before pacemaker had a # chance to re-probe state, it will consider resource is stopped # and will happily delete the resource from the cib even if # galera is still running! # Note3: after a cleanup, pacemaker may log a warning log # if it finds the resource is still running. This does not # count as an error for the CTS test target=self.Env["nodes"][0] rc = self.rsh(target, "pcs resource unmanage galera") if rc == 0: patterns = [r"crmd.*:\s*Initiating action.*: probe_complete probe_complete-%s on %s"%(n,n) \ for n in self.Env["nodes"]] watch=LogWatcher(self.Env["LogFileName"], patterns, None, self.Env["DeadTime"], kind=self.Env["LogWatcher"], hosts=self.Env["nodes"]) watch.setwatch() self.rsh(target, "pcs resource cleanup galera") watch.lookforall() assert not watch.unmatched, watch.unmatched self.rsh(target, "pcs resource disable galera") self.rsh(target, "pcs resource manage galera") self.rsh(target, "pcs resource delete galera --wait")
def setup_scenario(self, cluster_manager):
    # mysql setup
    self.init_and_setup_mysql_defaults()
    self.setup_galera_config()

    remote_authkey = "/etc/pacemaker/authkey"
    if not self.rsh.exists_on_all(remote_authkey, self.Env["nodes"]):
        self.log("Creating auth key for communication with pacemaker remote")
        with tempfile.NamedTemporaryFile() as tmp:
            tmp.write(os.urandom(4096))
            tmp.flush()
            self.copy_to_nodes([(tmp.name, remote_authkey)], True, "root:haclient", "440")

    # cluster_manager.prepare()

    # stop cluster if previously running, failure is not fatal
    for node in self.Env["nodes"]:
        self.rsh(node, "pcs cluster destroy")
        self.rsh(node, "systemctl stop pacemaker_remote")
        self.rsh(node, "systemctl enable pacemaker")

    # reconfigure cluster for 2 nodes + one remote arbitrator
    self.Env["arb"] = self.Env["nodes"][-1]
    self.Env["nodes"] = self.Env["nodes"][:-1]
    self.rsh_check(self.Env["nodes"][0],
                   "pcs cluster setup --force --name ratester %s %s" %
                   (self.Env["nodes"][0], self.Env["nodes"][1]))

    # note: setting up the cluster disables the pacemaker service; re-enable it
    self.rsh_check(self.Env["nodes"][0], "systemctl enable pacemaker")

    self.rsh_check(self.Env["nodes"][0], "pcs cluster start --all")

    # TODO: better way to wait until the cluster is started
    time.sleep(8)

    # Disable STONITH by default. A dedicated ScenarioComponent
    # is in charge of enabling it if requested
    self.rsh_check(self.Env["nodes"][0], "pcs property set stonith-enabled=false")

    for node in self.Env["nodes"]:
        self.rsh_check(node, "pcs property set --node %s osprole=controller" % node)

    # cluster_manager.prepare()

    # pacemaker remote to host garbd
    res = self.rsh_check(self.Env["arb"], "systemctl disable pacemaker")
    res = self.rsh_check(self.Env["arb"], "systemctl enable pacemaker_remote")
    res = self.rsh_check(self.Env["arb"], "systemctl start pacemaker_remote")

    remote_ok_pat = self.ratemplates.build("Pat:RscRemoteOp", "start", "arb", r"\S+", "ok")
    watch = LogWatcher(self.Env["LogFileName"], [remote_ok_pat], None,
                       self.Env["DeadTime"],
                       kind=self.Env["LogWatcher"], hosts=self.Env["nodes"])
    # watch = self.create_watch([remote_ok_pat], self.Env["DeadTime"])
    watch.setwatch()
    res = self.rsh_check(self.Env["nodes"][0],
                         "pcs resource create arb ocf:pacemaker:remote server=%s reconnect_interval=60 op monitor interval=20" % self.Env["arb"])
    watch.lookforall()
    assert not watch.unmatched, watch.unmatched

    self.rsh_check(self.Env["nodes"][0], "pcs property set --node arb osprole=arbitrator")

    # there's no selinux context for garbd currently
    res = self.rsh_check(self.Env["arb"], "test -x /usr/sbin/setenforce && setenforce 0 || true")
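# The TODO above asks for a better way to wait for cluster startup than a
# fixed time.sleep(8). A minimal sketch of one option: poll crm_mon on the
# first node until it answers, with a deadline. This helper is hypothetical
# (not part of the original scenario) and assumes self.rsh returns the
# remote command's exit code; crm_mon -1 exits non-zero until the cluster
# is up and reachable.
def wait_for_cluster_started(self, node, timeout=60):
    deadline = time.time() + timeout
    while time.time() < deadline:
        # one-shot cluster status query; rc 0 means the cluster answered
        if self.rsh(node, "crm_mon -1 >/dev/null 2>&1") == 0:
            return True
        time.sleep(2)
    return False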
def startall(self, nodelist=None, verbose=False, quick=False):
    '''Start the cluster manager on every node in the cluster.
    We can do it on a subset of the cluster if nodelist is not None.
    '''
    map = {}
    if not nodelist:
        nodelist = self.Env["nodes"]

    for node in nodelist:
        if self.ShouldBeStatus[node] == "down":
            self.ns.WaitForAllNodesToComeUp(nodelist, 300)

    if not quick:
        # This is used for "basic sanity checks", so only start one node ...
        if not self.StartaCM(node, verbose=verbose):
            return 0
        return 1

    # Approximation of SimulStartList for --boot
    watchpats = []
    watchpats.append(self.templates["Pat:DC_IDLE"])
    for node in nodelist:
        watchpats.append(self.templates["Pat:InfraUp"] % node)
        watchpats.append(self.templates["Pat:PacemakerUp"] % node)
        watchpats.append(self.templates["Pat:Local_started"] % node)
        watchpats.append(self.templates["Pat:They_up"] % (nodelist[0], node))

    # Start all the nodes - at about the same time...
    watch = LogWatcher(self.Env["LogFileName"], watchpats, "fast-start",
                       self.Env["DeadTime"] + 10,
                       hosts=self.Env["nodes"], kind=self.Env["LogWatcher"])
    watch.setwatch()

    if not self.StartaCM(nodelist[0], verbose=verbose):
        return 0

    for node in nodelist:
        self.StartaCMnoBlock(node, verbose=verbose)

    watch.lookforall()
    if watch.unmatched:
        for regex in watch.unmatched:
            self.logger.log("Warn: Startup pattern not found: %s" % (regex))

    if not self.cluster_stable():
        self.logger.log("Cluster did not stabilize")
        return 0

    return 1
def setup_scenario(self, cluster_manager):
    # pre-requisites
    prerequisite = ["/usr/bin/gdb", "/usr/bin/screen", "/usr/bin/dig"]
    missing_reqs = False
    for req in prerequisite:
        if not self.rsh.exists_on_all(req, self.Env["nodes"]):
            self.log("error: %s could not be found on remote nodes. "
                     "Please install the necessary package to run the tests" % req)
            missing_reqs = True
    assert not missing_reqs

    # galera-specific data
    test_scripts = ["kill-during-txn.gdb", "slow_down_sst.sh"]
    for node in self.Env["nodes"]:
        for script in test_scripts:
            src = os.path.join(os.path.dirname(os.path.abspath(__file__)), script)
            rc = self.rsh.cp(src, "root@%s:/tmp/%s" % (node, script))
            assert rc == 0, \
                "failed to copy data \"%s\" on remote node \"%s\"" % (src, node)

    # mysql setup
    self.init_and_setup_mysql_defaults()
    self.setup_galera_config()

    # clean up any traffic control on the target network interface
    for node in self.Env["nodes"]:
        self.rsh(node, "/tmp/slow_down_sst.sh -n %s off" % node)

    # stop cluster if previously running, failure is not fatal
    for node in self.Env["nodes"]:
        self.rsh(node, "pcs cluster destroy")
        self.rsh(node, "systemctl enable pacemaker")
        self.rsh(node, "systemctl stop pacemaker_remote")
        self.rsh(node, "systemctl disable pacemaker_remote")

    # create a new cluster
    # note: setting up the cluster disables the pacemaker service; re-enable it
    patterns = [r"crmd.*:\s*notice:\sState\stransition\sS_STARTING(\s->.*origin=do_started)?",
                r"crmd.*:\s*notice:\sState\stransition\s.*->\sS_IDLE(\s.*origin=notify_crmd)?"]
    watch = LogWatcher(self.Env["LogFileName"], patterns, None,
                       self.Env["DeadTime"],
                       kind=self.Env["LogWatcher"], hosts=self.Env["nodes"])
    watch.setwatch()
    self.rsh_check(self.Env["nodes"][0],
                   "pcs cluster setup --force --name ratester %s" %
                   " ".join(self.Env["nodes"]))
    self.rsh_check(self.Env["nodes"][0], "systemctl enable pacemaker")
    self.rsh_check(self.Env["nodes"][0], "pcs cluster start --all")

    # Disable STONITH by default. A dedicated ScenarioComponent
    # is in charge of enabling it if requested
    self.rsh_check(self.Env["nodes"][0], "pcs property set stonith-enabled=false")
    watch.lookforall()
    assert not watch.unmatched, watch.unmatched
def startall(self, nodelist=None, verbose=False, quick=False):
    '''Start the cluster manager on every node in the cluster.
    We can do it on a subset of the cluster if nodelist is not None.
    '''
    map = {}
    if not nodelist:
        nodelist = self.Env["nodes"]

    for node in nodelist:
        if self.ShouldBeStatus[node] == "down":
            self.ns.WaitForAllNodesToComeUp(nodelist, 300)

    if not quick:
        # This is used for "basic sanity checks", so only start one node ...
        if not self.StartaCM(node, verbose=verbose):
            return 0
        return 1

    # Approximation of SimulStartList for --boot
    watchpats = []
    watchpats.append(self.templates["Pat:DC_IDLE"])
    for node in nodelist:
        watchpats.append(self.templates["Pat:Local_started"] % node)
        watchpats.append(self.templates["Pat:InfraUp"] % node)
        watchpats.append(self.templates["Pat:PacemakerUp"] % node)

    # Start all the nodes - at about the same time...
    watch = LogWatcher(self.Env["LogFileName"], watchpats, "fast-start",
                       self.Env["DeadTime"] + 10,
                       hosts=self.Env["nodes"], kind=self.Env["LogWatcher"])
    watch.setwatch()

    if not self.StartaCM(nodelist[0], verbose=verbose):
        return 0

    for node in nodelist:
        self.StartaCMnoBlock(node, verbose=verbose)

    watch.lookforall()
    if watch.unmatched:
        for regex in watch.unmatched:
            self.logger.log("Warn: Startup pattern not found: %s" % (regex))

    if not self.cluster_stable():
        self.logger.log("Cluster did not stabilize")
        return 0

    return 1
def setup_scenario(self, cluster_manager):
    # the cluster is assumed to have 2 nodes + one remote arbitrator
    cluster_manager.log("Reusing cluster")
    target = self.Env["nodes"][0]
    self.Env["arb"] = self.Env["nodes"][-1]
    self.rsh_check(target, "pcs property set --node arb osprole=arbitrator")

    # attempt at cleaning up and removing garbd if it exists
    rc = self.rsh(target, "pcs resource unmanage garbd")
    if rc == 0:
        self.rsh(target, "pcs resource cleanup garbd")
        self.rsh(target, "pcs resource disable garbd")
        self.rsh(target, "pcs resource manage garbd")
        self.rsh(target, "pcs resource delete garbd --wait")

    self.Env["nodes"] = self.Env["nodes"][:-1]
    for node in self.Env["nodes"]:
        self.rsh_check(node, "pcs property set --node %s osprole=controller" % node)

    # Stop and remove galera if it exists
    # Note1: in order to avoid errors when stopping the resource while
    # it is in an unknown state, we first reprobe the resource state.
    # Note2: if you clean up and delete before pacemaker has had a
    # chance to re-probe the state, it will consider the resource stopped
    # and will happily delete it from the CIB even if
    # galera is still running!
    # Note3: after a cleanup, pacemaker may log a warning
    # if it finds the resource still running. This does not
    # count as an error for the CTS test
    rc = self.rsh(target, "pcs resource unmanage galera")
    if rc == 0:
        patterns = [r"crmd.*:\s*Initiating action.*: probe_complete probe_complete-%s on %s" % (n, n)
                    for n in self.Env["nodes"]]
        watch = LogWatcher(self.Env["LogFileName"], patterns, None,
                           self.Env["DeadTime"],
                           kind=self.Env["LogWatcher"],
                           hosts=self.Env["nodes"])
        watch.setwatch()
        self.rsh(target, "pcs resource cleanup galera")
        watch.lookforall()
        assert not watch.unmatched, watch.unmatched
        self.rsh(target, "pcs resource disable galera")
        self.rsh(target, "pcs resource manage galera")
        self.rsh(target, "pcs resource delete galera --wait")
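# The unmanage/cleanup/disable/manage/delete dance above is repeated for
# both garbd and galera. A hypothetical helper factoring it out, as a
# minimal sketch (not part of the original code): wait_patterns carries the
# per-resource probe patterns to watch for after cleanup, or None to skip
# the log watch entirely.
def teardown_resource(self, target, name, wait_patterns=None):
    # unmanage succeeds only if the resource exists in the CIB
    if self.rsh(target, "pcs resource unmanage %s" % name) != 0:
        return
    if wait_patterns:
        watch = LogWatcher(self.Env["LogFileName"], wait_patterns, None,
                           self.Env["DeadTime"],
                           kind=self.Env["LogWatcher"],
                           hosts=self.Env["nodes"])
        watch.setwatch()
        self.rsh(target, "pcs resource cleanup %s" % name)
        watch.lookforall()
        assert not watch.unmatched, watch.unmatched
    else:
        self.rsh(target, "pcs resource cleanup %s" % name)
    self.rsh(target, "pcs resource disable %s" % name)
    self.rsh(target, "pcs resource manage %s" % name)
    self.rsh(target, "pcs resource delete %s --wait" % name)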
def StartaCM(self, node, verbose=False):
    '''Start up the cluster manager on a given node'''
    if verbose:
        self.logger.log("Starting %s on node %s" % (self.templates["Name"], node))
    else:
        self.debug("Starting %s on node %s" % (self.templates["Name"], node))
    ret = 1

    if not node in self.ShouldBeStatus:
        self.ShouldBeStatus[node] = "down"

    if self.ShouldBeStatus[node] != "down":
        return 1

    patterns = []
    # Technically we should always be able to notice ourselves starting
    patterns.append(self.templates["Pat:Local_started"] % node)
    if self.upcount() == 0:
        patterns.append(self.templates["Pat:DC_started"] % node)
    else:
        patterns.append(self.templates["Pat:NonDC_started"] % node)

    watch = LogWatcher(self.Env["LogFileName"], patterns, "StartaCM",
                       self.Env["StartTime"] + 10,
                       hosts=self.Env["nodes"], kind=self.Env["LogWatcher"])
    self.install_config(node)

    self.ShouldBeStatus[node] = "any"
    if self.StataCM(node) and self.cluster_stable(self.Env["DeadTime"]):
        self.logger.log("%s was already started" % (node))
        return 1

    stonith = self.prepare_fencing_watcher(node)
    watch.setwatch()

    if self.rsh(node, self.templates["StartCmd"]) != 0:
        self.logger.log("Warn: Start command failed on node %s" % (node))
        self.fencing_cleanup(node, stonith)
        return None

    self.ShouldBeStatus[node] = "up"
    watch_result = watch.lookforall()

    if watch.unmatched:
        for regex in watch.unmatched:
            self.logger.log("Warn: Startup pattern not found: %s" % (regex))

    if watch_result and self.cluster_stable(self.Env["DeadTime"]):
        #self.debug("Found match: " + repr(watch_result))
        self.fencing_cleanup(node, stonith)
        return 1

    elif self.StataCM(node) and self.cluster_stable(self.Env["DeadTime"]):
        self.fencing_cleanup(node, stonith)
        return 1

    self.logger.log("Warn: Start failed for node %s" % (node))
    return None
def StartaCM(self, node, verbose=False):
    '''Start up the cluster manager on a given node'''
    if verbose:
        self.logger.log("Starting %s on node %s" % (self.templates["Name"], node))
    else:
        self.debug("Starting %s on node %s" % (self.templates["Name"], node))
    ret = 1

    if not node in self.ShouldBeStatus:
        self.ShouldBeStatus[node] = "down"

    if self.ShouldBeStatus[node] != "down":
        return 1

    patterns = []
    # Technically we should always be able to notice ourselves starting
    patterns.append(self.templates["Pat:Local_started"] % node)
    if self.upcount() == 0:
        patterns.append(self.templates["Pat:Master_started"] % node)
    else:
        patterns.append(self.templates["Pat:Slave_started"] % node)

    watch = LogWatcher(self.Env["LogFileName"], patterns, "StartaCM",
                       self.Env["StartTime"] + 10,
                       hosts=self.Env["nodes"], kind=self.Env["LogWatcher"])
    self.install_config(node)

    self.ShouldBeStatus[node] = "any"
    if self.StataCM(node) and self.cluster_stable(self.Env["DeadTime"]):
        self.logger.log("%s was already started" % (node))
        return 1

    # Clear out the host cache so autojoin can be exercised
    if self.clear_cache:
        self.debug("Removing cache file on: " + node)
        self.rsh(node, "rm -f " + CTSvars.HA_VARLIBHBDIR + "/hostcache")

    if not self.Env["valgrind-tests"]:
        startCmd = self.templates["StartCmd"]
    else:
        if self.Env["valgrind-prefix"]:
            prefix = self.Env["valgrind-prefix"]
        else:
            prefix = "cts"

        startCmd = """G_SLICE=always-malloc HA_VALGRIND_ENABLED='%s' VALGRIND_OPTS='%s --log-file=/tmp/%s-%s.valgrind' %s""" % (
            self.Env["valgrind-procs"],
            self.Env["valgrind-opts"],
            prefix,
            """%p""",
            self.templates["StartCmd"])

    stonith = self.prepare_fencing_watcher(node)
    watch.setwatch()

    if self.rsh(node, startCmd) != 0:
        self.logger.log("Warn: Start command failed on node %s" % (node))
        self.fencing_cleanup(node, stonith)
        return None

    self.ShouldBeStatus[node] = "up"
    watch_result = watch.lookforall()

    if watch.unmatched:
        for regex in watch.unmatched:
            self.logger.log("Warn: Startup pattern not found: %s" % (regex))

    if watch_result and self.cluster_stable(self.Env["DeadTime"]):
        #self.debug("Found match: " + repr(watch_result))
        self.fencing_cleanup(node, stonith)
        return 1

    elif self.StataCM(node) and self.cluster_stable(self.Env["DeadTime"]):
        self.fencing_cleanup(node, stonith)
        return 1

    self.logger.log("Warn: Start failed for node %s" % (node))
    return None
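# For illustration only, with hypothetical values: if valgrind-procs were
# "crmd lrmd" and no valgrind-prefix were set, the wrapped start command
# built above would expand to something like
#
#   G_SLICE=always-malloc HA_VALGRIND_ENABLED='crmd lrmd' \
#   VALGRIND_OPTS='<valgrind-opts> --log-file=/tmp/cts-%p.valgrind' <StartCmd>
#
# where valgrind itself substitutes %p in --log-file with the pid of each
# traced process, giving one log file per daemon.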