def start_slave_collector(node, arg_str):
    """
    Run the "hb_report __slave" collector for a node and unpack its output.

    node: node to collect from; when it equals constants.WE the collector
          is run locally, otherwise it is run through ssh
    arg_str: whitespace separated arguments appended to the collector command

    The collector's stdout is piped into "tar xf -" inside constants.WORKDIR.
    Returns None.
    """
    extra_args = "".join(" {}".format(str(item)) for item in arg_str.split())
    extract_cmd = r"(cd {} && tar xf -)".format(constants.WORKDIR)

    if node == constants.WE:
        _, out = crmutils.get_stdout(r"hb_report __slave" + extra_args)
        crmutils.get_stdout(extract_cmd, input_s=out)
        return

    # The arguments are appended after the quoted remote command, exactly
    # as before; only the unused os.getcwd() format argument was dropped.
    ssh_cmd = r'ssh {} {} "{} hb_report __slave"'.format(
        constants.SSH_OPTS, node, constants.SUDO) + extra_args
    code, out, err = crmutils.get_stdout_stderr(ssh_cmd)
    if code != 0:
        log_warning(err)
        # Fall back to the peer IP addresses.  Each attempt is derived from
        # the original command so that every address is really substituted,
        # and the loop stops at the first address that works.
        for ip in get_peer_ip():
            log_info("Trying connect by %s" % ip)
            code, out, err = crmutils.get_stdout_stderr(
                ssh_cmd.replace(node, ip, 1))
            if code == 0:
                break
            log_warning(err)

    # NOTE(review): extraction is still attempted when every attempt failed,
    # as in the previous implementation - confirm whether that is intended.
    crmutils.get_stdout(extract_cmd, input_s=out)
def un_block_firewalld(self):
    """
    Unblock corosync ports

    Runs config.ADD_PORT once for every entry of self.ports.
    """
    port_text = ','.join(self.ports)
    self.info("Trying to add port {}".format(port_text))
    for port in self.ports:
        add_cmd = config.ADD_PORT.format(port=port)
        crmshutils.get_stdout_stderr(add_cmd)
def start_slave_collector(node, arg_str):
    """
    Run the "hb_report __slave" collector for a node and unpack its output.

    node: node to collect from; when it equals constants.WE the collector
          is run locally, otherwise it is run through ssh
    arg_str: whitespace separated arguments appended to the collector command

    The collector's stdout is piped into "tar xf -" inside constants.WORKDIR.
    Returns None.
    """
    import ast

    extra_args = "".join(" {}".format(str(item)) for item in arg_str.split())

    if node == constants.WE:
        _, out = crmutils.get_stdout(r"hb_report __slave" + extra_args)
    else:
        # The unused os.getcwd() format argument was dropped; the command
        # string itself is unchanged.
        ssh_cmd = r'ssh {} {} "{} hb_report __slave"'.format(
            constants.SSH_OPTS, node, constants.SUDO) + extra_args
        code, out, err = crmutils.get_stdout_stderr(ssh_cmd)
        if code != 0:
            log_warning(err)
            # Fall back to the peer IP addresses.  Each attempt is derived
            # from the original command so that every address is really
            # substituted, and the loop stops at the first one that works.
            for ip in get_peer_ip():
                log_info("Trying connect by %s" % ip)
                code, out, err = crmutils.get_stdout_stderr(
                    ssh_cmd.replace(node, ip, 1))
                if code == 0:
                    break
                log_warning(err)

    # "out" is always a str here while input_s needs bytes.  Depending on
    # what the collector found (per the original note: whether a pe file
    # was found) the text is either the repr of a bytes object ("b'...'")
    # or plain text.
    payload = None
    if out.startswith("b'"):
        # literal_eval only accepts Python literals, so data coming from
        # another node can no longer execute arbitrary code as eval() could.
        try:
            payload = ast.literal_eval(out)
        except (ValueError, SyntaxError):
            log_warning("Malformed report data from %s" % node)
    if not isinstance(payload, bytes):
        payload = out.encode('utf-8')

    extract_cmd = r"(cd {} && tar xf -)".format(constants.WORKDIR)
    crmutils.get_stdout(extract_cmd, input_s=payload)
def start_slave_collector(node, arg_str):
    """
    Run the "crm report __slave" collector for a node and unpack its output.

    node: node to collect from; when it equals constants.WE the collector
          is run locally, otherwise it is run through ssh
    arg_str: whitespace separated arguments appended to the collector command

    Collector output lines starting with constants.COMPRESS_DATA_FLAG carry
    the report data (the text form of a bytes literal); every other line is
    printed as log output.  The data is piped into "tar xf -" inside
    constants.WORKDIR.  Returns None.
    """
    import ast

    extra_args = "".join(" {}".format(str(item)) for item in arg_str.split())

    if node == constants.WE:
        _, out = crmutils.get_stdout(r"crm report __slave" + extra_args)
    else:
        # The unused os.getcwd() format argument was dropped; the command
        # string itself is unchanged.
        ssh_cmd = r'ssh {} {} "crm report __slave"'.format(
            constants.SSH_OPTS, node) + extra_args
        code, out, err = crmutils.get_stdout_stderr(ssh_cmd)
        if code != 0:
            logger.warning(err)
            # Fall back to the peer IP addresses.  Each attempt is derived
            # from the original command so that every address is really
            # substituted, and the loop stops at the first one that works.
            for ip in get_peer_ip():
                logger.info("Trying connect by %s", ip)
                code, out, err = crmutils.get_stdout_stderr(
                    ssh_cmd.replace(node, ip, 1))
                if code == 0:
                    break
                logger.warning(err)

    flag = constants.COMPRESS_DATA_FLAG
    compress_data = ""
    for data in out.split('\n'):
        if data.startswith(flag):
            # crm report data from collector.  Cut the prefix by length:
            # str.lstrip() would strip any leading characters contained in
            # the flag, which can eat into the payload itself.
            compress_data = data[len(flag):]
        else:
            # log data from collector
            print(data)

    if not compress_data:
        logger.warning("No report data received from %s", node)
        return

    # literal_eval only accepts Python literals, so data coming from another
    # node can no longer execute arbitrary code as eval() could.
    try:
        payload = ast.literal_eval(compress_data)
    except (ValueError, SyntaxError):
        logger.warning("Malformed report data received from %s", node)
        return

    extract_cmd = r"(cd {} && tar xf -)".format(constants.WORKDIR)
    crmutils.get_stdout(extract_cmd, input_s=payload)
def run(self):
    """
    Fence the target node, then watch the outcome from a new thread

    The thread runs self.fence_action_monitor.
    """
    self.info("Trying to fence node \"{}\"".format(self.target_node))
    fence_cmd = config.FENCE_NODE.format(self.target_node)
    crmshutils.get_stdout_stderr(fence_cmd)
    monitor = threading.Thread(target=self.fence_action_monitor)
    monitor.start()
def un_block_iptables(self):
    """
    Unblock corosync communication ip

    Runs config.BLOCK_IP with action 'D' for every address of every node
    in self.peer_nodelist.
    """
    for peer in self.peer_nodelist:
        self.info("Trying to recover {} communication ip".format(peer))
        for addr in crmshutils.get_iplist_from_name(peer):
            rule_cmd = config.BLOCK_IP.format(action='D', peer_ip=addr)
            crmshutils.get_stdout_stderr(rule_cmd)
def do_block_iptables(self):
    """
    Block corosync communication ip

    Stores the peer node list in self.peer_nodelist, then runs
    config.BLOCK_IP with action 'I' for every address of every peer.
    """
    self.peer_nodelist = utils.peer_node_list()
    for peer in self.peer_nodelist:
        self.info("Trying to temporarily block {} communication ip".format(peer))
        for addr in crmshutils.get_iplist_from_name(peer):
            rule_cmd = config.BLOCK_IP.format(action='I', peer_ip=addr)
            crmshutils.get_stdout_stderr(rule_cmd)
def do_block_firewalld(self):
    """
    Block corosync ports

    Stores the corosync port list in self.ports and runs config.REMOVE_PORT
    for each of them.  Raises TaskError when no port could be found.
    """
    self.ports = utils.corosync_port_list()
    if not self.ports:
        raise TaskError("Can not get corosync's port")

    port_text = ','.join(self.ports)
    self.info("Trying to temporarily block port {}".format(port_text))
    for port in self.ports:
        remove_cmd = config.REMOVE_PORT.format(port=port)
        crmshutils.get_stdout_stderr(remove_cmd)
def lsof_ocfs2_device():
    """
    List open files for OCFS2 device

    Parses the output of "mount" for entries of type ocfs2 and runs "lsof"
    on each device found.  Returns one string holding, per device, a command
    header followed by the lsof output (empty string when no device found).
    """
    _, mount_out, _ = crmutils.get_stdout_stderr("mount")
    # Anchor with ^ in MULTILINE mode instead of a literal "\n" so that an
    # ocfs2 entry on the very first line of the output is found as well.
    dev_list = re.findall(r"^(.*) on .* type ocfs2 ", mount_out, re.MULTILINE)

    parts = []
    for dev in dev_list:
        cmd = "lsof {}".format(dev)
        parts.append("\n\n#=====[ Command ] ==========================#\n")
        parts.append("# {}\n".format(cmd))
        _, cmd_out, _ = crmutils.get_stdout_stderr(cmd)
        if cmd_out:
            parts.append(cmd_out)
    return "".join(parts)
def print_logseg(logf, from_time, to_time):
    """
    Dump the part of a log file that lies between two times.

    logf: log file; it is decompressed into a temporary file first when
          find_decompressor() returns something other than "cat"
    from_time: start time, 0 means "from the first line"
    to_time: end time, 0 means "no end line"

    Returns the result of dump_log(), or "" when a line for one of the
    times can not be found.
    """
    decompressor = find_decompressor(logf)
    if decompressor == "cat":
        sourcef = logf
    else:
        sourcef = create_tempfile()
        add_tmpfiles(sourcef)
        unpack_cmd = "%s %s > %s" % (decompressor, logf, sourcef)
        code, _, err = crmutils.get_stdout_stderr(unpack_cmd)
        if code != 0:
            log_fatal("maybe disk full: %s" % err)

    first_line = 1 if from_time == 0 else findln_by_time(sourcef, from_time)
    if not first_line:
        log_warning("couldn't find line for time %d; corrupt log file?" % from_time)
        return ""

    last_line = ""
    if to_time != 0:
        last_line = findln_by_time(sourcef, to_time)
        if not last_line:
            log_warning("couldn't find line for time %d; corrupt log file?" % to_time)
            return ""

    log_debug("including segment [%s-%s] from %s" % (first_line, last_line, sourcef))
    return dump_log(sourcef, first_line, last_line)
def check_fencing():
    """
    Check STONITH/Fence:
      Whether stonith is enabled
      Whether stonith resource is configured and running
      Whether the sbd service is running, when the agent name ends in "sbd"

    Results are reported through a task.TaskCheck instance; returns None.
    """
    task_inst = task.TaskCheck("Checking STONITH/Fence")
    with task_inst.run():
        if not utils.FenceInfo().fence_enabled:
            task_inst.warn("stonith is disabled")
            return
        task_inst.info("stonith is enabled")

        rc, outp, _ = crmshutils.get_stdout_stderr(
            "crm_mon -r1 | grep '(stonith:.*):'")
        if rc != 0:
            task_inst.warn("No stonith resource configured!")
            return

        res = re.search(r'([^\s]+)\s+\(stonith:(.*)\):\s+(\w+)', outp)
        if res is None:
            # grep matched but the line does not have the expected
            # "<name> (stonith:<agent>): <state>" shape; report it instead
            # of failing with AttributeError on res.groups().
            task_inst.warn("Failed to parse stonith resource state from crm_mon output")
            return
        res_name, res_agent, res_state = res.groups()

        common_msg = "stonith resource {}({})".format(res_name, res_agent)
        state_msg = "{} is {}".format(common_msg, res_state)

        task_inst.info("{} is configured".format(common_msg))
        if res_state == "Started":
            task_inst.info(state_msg)
        else:
            task_inst.warn(state_msg)

        if re.search(r'sbd$', res_agent):
            if crmshutils.service_is_active("sbd"):
                task_inst.info("sbd service is running")
            else:
                task_inst.warn("sbd service is not running!")
def dump_D_process():
    '''
    Dump the kernel stack of every process in D state

    Returns a string: a header line with the number of such processes,
    followed by "pid/comm" and the content of /proc/<pid>/stack for each.
    '''
    _, ps_out, _ = crmutils.get_stdout_stderr("ps aux|awk '$8 ~ /^D/{print $2}'")
    pids = ps_out.split('\n') if ps_out else []

    report = ["Dump D-state process stack: {}\n".format(len(pids))]
    for pid in pids:
        _, comm, _ = crmutils.get_stdout_stderr("cat /proc/{}/comm".format(pid))
        report.append("pid: {} comm: {}\n".format(pid, comm))
        _, stack, _ = crmutils.get_stdout_stderr("cat /proc/{}/stack".format(pid))
        report.append(stack + "\n\n")
    return "".join(report)
def collect_ocfs2_info():
    """
    Write OCFS2 related information into constants.OCFS2_F under
    constants.WORKDIR.

    When "mounted.ocfs2 -d" fails or lists no device, only a short notice
    is written.  Otherwise the D-state process dump, the lsof output and
    the output of a fixed list of commands are written.
    """
    report_path = os.path.join(constants.WORKDIR, constants.OCFS2_F)
    with open(report_path, "w") as report:
        rc, out, err = crmutils.get_stdout_stderr("mounted.ocfs2 -d")
        if rc != 0:
            report.write("Failed to run \"mounted.ocfs2 -d\": {}".format(err))
            return
        # No ocfs2 device, just header line printed
        if len(out.split('\n')) == 1:
            report.write("No ocfs2 partitions found")
            return

        report.write(utillib.dump_D_process())
        report.write(utillib.lsof_ocfs2_device())

        commands = (
            "dmesg",
            "ps -efL",
            "lsblk -o 'NAME,KNAME,MAJ:MIN,FSTYPE,LABEL,RO,RM,MODEL,SIZE,OWNER,GROUP,MODE,ALIGNMENT,MIN-IO,OPT-IO,PHY-SEC,LOG-SEC,ROTA,SCHED,MOUNTPOINT'",
            "mounted.ocfs2 -f",
            "findmnt",
            "mount",
            "cat /sys/fs/ocfs2/cluster_stack",
        )
        for cmd in commands:
            tool = cmd.split()[0]
            # skip commands whose executable is not available
            if not utillib.which(tool):
                continue
            # skip "cat <file>" when the file does not exist
            if tool == "cat" and not os.path.exists(cmd.split()[1]):
                continue
            _, cmd_out = crmutils.get_stdout(cmd)
            report.write("\n\n#=====[ Command ] ==========================#\n")
            report.write("# %s\n" % (cmd))
            report.write(cmd_out)
def fence_action_monitor(self):
    """
    Monitor fencing process, running in thread, exit on two cases:
    1. There is one latest fence action successfully done
    2. No fence action during fence timeout, thread_stop_event triggered by main thread

    Works in two polling phases, both checked once per second and both
    aborted as soon as self.thread_stop_event is set:
    - wait for a pending fence action in the "crm_mon -1" output, then set
      self.fence_start_event
    - wait for a fence history entry of the target node that is newer than
      self.timestamp, then set self.fence_finish_event
    """
    target_node = None
    from_node = None
    fence_timestamp = None

    # Try to find out which node fire the fence action
    while not self.thread_stop_event.is_set():
        rc, out, _ = crmshutils.get_stdout_stderr(
            "crm_mon -1|grep -A1 \"Fencing Actions:\"")
        if rc == 0 and out:
            match = re.search(r"of (.*) pending: .*origin=(.*)$", out)
            if match:
                target_node, from_node = match.groups()
                self.info("Node \"{}\" will be fenced by \"{}\"!".format(
                    target_node, from_node))
                # Signal that a fence action has been seen
                self.fence_start_event.set()
                break
        time.sleep(1)

    # Try to find out proof that fence happened
    # NOTE(review): when the first loop ends because of thread_stop_event,
    # target_node is still None; this loop is then skipped by its own
    # thread_stop_event check.
    while not self.thread_stop_event.is_set():
        rc, out, _ = crmshutils.get_stdout_stderr(
            config.FENCE_HISTORY.format(node=target_node))
        if rc == 0 and out:
            match = re.search(
                r"Node {} last fenced at: (.*)".format(target_node), out)
            if match:
                fence_timestamp = match.group(1)
                task_timestamp_dt = utils.str_to_datetime(
                    self.timestamp, '%Y/%m/%d %H:%M:%S')
                fence_timestamp_dt = utils.str_to_datetime(
                    fence_timestamp, '%a %b %d %H:%M:%S %Y')
                # If the fence action timestamp larger than this task's timestamp
                # That is the proof
                if task_timestamp_dt < fence_timestamp_dt:
                    self.info(
                        "Node \"{}\" was successfully fenced by \"{}\"".
                        format(target_node, from_node))
                    # Tell main thread fence happened
                    self.fence_finish_event.set()
                    break
        time.sleep(1)
def corosync_port_list():
    """
    Get corosync ports using corosync-cmapctl

    Returns the list of mcastport values found (as strings), or an empty
    list when the command fails or prints nothing.
    """
    rc, out, _ = crmshutils.get_stdout_stderr("corosync-cmapctl totem.interface")
    if rc != 0 or not out:
        return []
    return re.findall(r'(?:mcastport.*) ([0-9]+)', out)
def online_nodes():
    """
    Get online node list

    Parsed from the "Online: [ ... ]" line of "crm_mon -1"; an empty list
    is returned when the command fails or no such line is found.
    """
    rc, stdout, _ = crmshutils.get_stdout_stderr('crm_mon -1')
    if rc != 0 or not stdout:
        return []
    found = re.search(r'Online:\s+\[\s(.*)\s\]', stdout)
    return found.group(1).split() if found else []
def this_node():
    """
    Return the node name reported by "crm_node --name"

    When the command fails, the error is reported through msg_error and
    the result of crmshutils.this_node() is returned instead.
    """
    rc, stdout, stderr = crmshutils.get_stdout_stderr("crm_node --name")
    if rc == 0:
        return stdout
    msg_error(stderr)
    return crmshutils.this_node()
def get_property(name):
    """
    Get cluster properties

    Returns the output of "crm configure get_property <name>", or None
    when the command exits with a non-zero code.
    """
    rc, stdout, _ = crmshutils.get_stdout_stderr("crm configure get_property " + name)
    return stdout if rc == 0 else None
def run(self):
    """
    Execute specific kill command and monitor the results

    The kill command is only run while the target process is reported as
    running; it is repeated for as long as self.looping is true.  After
    the loop, two threads are started: self.fence_action_monitor and
    self.process_monitor.
    """
    while True:
        running, pid = utils.get_process_status(self.target_kill)
        if not running:
            # target process is not there; check again
            continue
        self.info("Process {}({}) is running...".format(self.target_kill, pid))
        self.info("Trying to run \"{}\"".format(self.cmd))
        crmshutils.get_stdout_stderr(self.cmd)
        # endless loop will lead to fence
        if not self.looping:
            break

    fence_watcher = threading.Thread(target=self.fence_action_monitor)
    fence_watcher.start()
    restart_watcher = threading.Thread(target=self.process_monitor)
    restart_watcher.start()
def run_command(context, cmd, err_record=False):
    """
    Run a command for a test step and return its stdout.

    context: test context; its logger, failed and command_error_output
             attributes are used
    cmd: command line to run
    err_record: when true, a failure is only recorded in
                context.command_error_output and None is returned

    A failure is a non-zero exit code together with non-empty stderr.
    Without err_record a failure is logged and context.failed is set.
    """
    rc, out, err = utils.get_stdout_stderr(cmd)
    has_failed = rc != 0 and err

    if has_failed and err_record:
        context.command_error_output = err
        return None

    if has_failed:
        if out:
            context.logger.info("\n{}\n".format(out))
        context.logger.error("\n{}\n".format(err))
        context.failed = True
    return out
def run_command(context, cmd, err_record=False):
    """
    Run a command for a test step and return (exit code, stdout).

    context: test context; its logger, failed and command_error_output
             attributes are used
    cmd: command line to run
    err_record: when true, a failure is only recorded in
                context.command_error_output

    A failure is a non-zero exit code together with non-empty stderr.
    Without err_record a failure is logged and context.failed is set.
    Escape sequences of the form ESC[<digits>m are removed from the
    returned stdout and from the recorded stderr.
    """
    def strip_color(text):
        return re.sub(r'\x1b\[[0-9]+m', '', text)

    rc, out, err = utils.get_stdout_stderr(cmd)
    if rc != 0 and err:
        if err_record:
            context.command_error_output = strip_color(err)
        else:
            if out:
                context.logger.info("\n{}\n".format(out))
            context.logger.error("\n{}\n".format(err))
            context.failed = True
    return rc, strip_color(out)
def check_node_status(node, state):
    """
    Check whether the node has expected state

    Looks for a "<node> <state>" entry in the output of "crm_node -l".
    Returns False when the command fails (the error is reported through
    msg_error) or when no such entry exists, True otherwise.
    """
    rc, stdout, stderr = crmshutils.get_stdout_stderr('crm_node -l')
    if rc != 0:
        msg_error(stderr)
        return False

    expected = r'^.* {} {}'.format(node, state)
    return re.search(expected, stdout, re.MULTILINE) is not None
def pre_check(self):
    """
    Check the prerequisite for fence node

    Raises TaskError when one of the required commands can not be found
    by "which" or when the target node is not a cluster member.
    """
    self.task_pre_check()

    for tool in ('crm_node', 'stonith_admin', 'crm_attribute'):
        rc, _, err = crmshutils.get_stdout_stderr("which {}".format(tool))
        if rc != 0 and err:
            raise TaskError(err)

    is_member = utils.check_node_status(self.target_node, 'member')
    if not is_member:
        raise TaskError("Node \"{}\" not in cluster!".format(self.target_node))
def pre_check(self):
    """
    Check the prerequisite

    Raises TaskError when "iptables" can not be found by "which" or when
    fewer than two nodes are online.
    """
    self.task_pre_check()

    for tool in ("iptables",):
        rc, _, err = crmshutils.get_stdout_stderr("which {}".format(tool))
        if rc != 0 and err:
            raise TaskError(err)

    online_count = len(utils.online_nodes())
    if online_count < 2:
        raise TaskError("At least two nodes online!")
def is_valid_sbd(dev):
    """
    Check whether the device is a initialized SBD device

    dev: dev path
    return 'True' if 'dev' is a initialized SBD device

    'False' is returned when the path does not exist, or when
    config.SBD_CHECK_CMD exits non-zero with error output (which is
    reported through msg_error).
    """
    if not os.path.exists(dev):
        return False

    rc, _, err = crmshutils.get_stdout_stderr(config.SBD_CHECK_CMD.format(dev=dev))
    check_failed = rc != 0 and err
    if check_failed:
        msg_error(err)
    return not check_failed
def check_port_open(task, firewall_type):
    """
    Check whether corosync port is blocked by the firewall

    task: task object used for reporting (info/error)
    firewall_type: "firewalld" is checked through "firewall-cmd --list-port";
                   "SuSEfirewall2" is not implemented yet

    Returns None; results are reported through task.
    """
    ports = utils.corosync_port_list()
    if not ports:
        task.error("Can not get corosync's port")
        return

    if firewall_type == "firewalld":
        rc, out, err = crmshutils.get_stdout_stderr('firewall-cmd --list-port')
        if rc != 0:
            task.error(err)
            return
        for p in ports:
            # The entry must start at the beginning of the output or right
            # after whitespace.  Requiring a literal leading space, as
            # before, missed the first entry of the list.
            port_pattern = r'(?<!\S){}/udp'.format(re.escape(str(p)))
            if re.search(port_pattern, out):
                task.info("UDP port {} is opened in firewalld".format(p))
            else:
                task.error("UDP port {} should open in firewalld".format(p))
    elif firewall_type == "SuSEfirewall2":
        #TODO
        pass
def check_nodes():
    """
    Check nodes info:
      Current DC
      Quorum status
      Online/OFFLINE/UNCLEAN nodes

    Everything is parsed from the output of "crm_mon -1" and reported
    through a task.TaskCheck instance; returns None.
    """
    task_inst = task.TaskCheck("Checking nodes")
    with task_inst.run():
        rc, mon_out, mon_err = crmshutils.get_stdout_stderr("crm_mon -1")
        if rc != 0:
            task_inst.error("run \"crm_mon -1\" error: {}".format(mon_err))
            return

        # check DC
        dc_match = re.search(r'Current DC: (.*) \(', mon_out)
        if dc_match:
            task_inst.info("DC node: {}".format(dc_match.group(1)))

        # check quorum
        has_quorum = re.search(r'partition with quorum', mon_out)
        if not has_quorum:
            task_inst.warn("Cluster lost quorum!")
        else:
            task_inst.info("Cluster have quorum")

        # check Online nodes
        online_match = re.search(r'Online:\s+(\[.*\])', mon_out)
        if online_match:
            task_inst.info("Online nodes: {}".format(online_match.group(1)))

        # check OFFLINE nodes
        offline_match = re.search(r'OFFLINE:\s+(\[.*\])', mon_out)
        if offline_match:
            task_inst.warn("OFFLINE nodes: {}".format(offline_match.group(1)))

        # check UNCLEAN nodes
        for unclean in re.findall(r'Node (.*): UNCLEAN', mon_out):
            task_inst.warn('Node {} is UNCLEAN!'.format(unclean))
def get_online_nodes():
    """
    Return the names listed as "member" by "crm_node -l".

    Returns a list of names, or None when the command prints nothing.
    """
    _, out, _ = utils.get_stdout_stderr('crm_node -l')
    if not out:
        return None
    return re.findall(r'[0-9]+ (.*) member', out)
def get_file_type(file_path):
    """
    Classify a path using the output of the "file" command.

    Returns "bzip2" or "directory" when "file" reports the path as such,
    otherwise None.
    """
    _, out, _ = utils.get_stdout_stderr("file {}".format(file_path))
    # Escape the path: it is matched literally, and characters such as
    # "+", "(" or "[" in a file name must not be read as regex syntax.
    path_pattern = re.escape(str(file_path))
    if re.search(r'{}: bzip2'.format(path_pattern), out):
        return "bzip2"
    if re.search(r'{}: directory'.format(path_pattern), out):
        return "directory"
    return None
def get_command_info(cmd):
    """
    Run a command and return (exit code, stdout text).

    A newline is appended to non-empty stdout; empty stdout gives "".
    """
    code, out, _ = crmutils.get_stdout_stderr(cmd)
    text = out + '\n' if out else ""
    return (code, text)