def if_ha(self):
    """Collect Pacemaker node and resource status from ``crm_mon``.

    Runs ``crm_mon -r -1 -X`` and parses the XML it prints.  Every node
    name found is appended to ``self.hosts`` if it is not there yet.

    Returns:
        ``False`` when the command exits with 127 or 107, when the output
        is not XML rooted at ``<crm_mon>``, or when there is no ``<nodes>``
        element.  Otherwise a dict with two keys:

        * ``'nodes'`` -- the attribute mapping of every child of ``<nodes>``;
        * ``'resources'`` -- every non-``None`` value ``self._get_res``
          returned for a ``<resource>`` element.
    """
    rc, stdout, _, _ = run_cmd("crm_mon -r -1 -X")
    # 127: command not found; 107: presumably the cluster is stopped.
    if rc in (127, 107):
        return False
    try:
        xml_root = ET.fromstring(stdout)
    except ET.ParseError:
        return False
    if xml_root.tag != 'crm_mon':
        return False

    ha_status = {'nodes': [], 'resources': []}
    xml_nodes = xml_root.find('nodes')

    # find() returns None when the element is absent; treat a missing
    # <resources> section as "no resources" instead of iterating None.
    xml_resources = xml_root.find('resources')
    if xml_resources is None:
        xml_resources = []

    for container in xml_resources:
        # A direct <resource> is handled as is; any other element is a
        # container whose <resource> children are handled one by one.
        if container.tag == 'resource':
            candidates = [container]
        else:
            candidates = container.findall('resource')
        for resource in candidates:
            res = self._get_res(resource)
            if res is not None:
                ha_status['resources'].append(res)

    if xml_nodes is None:
        return False

    for elem in xml_nodes:
        attr = elem.attrib
        ha_status['nodes'].append(attr)
        hostname = attr['name']
        if hostname not in self.hosts:
            self.hosts.append(hostname)
    return ha_status
def get_downed_hosts(self, hosts):
    """Probe every host over ssh and report which ones did not answer.

    For each entry of ``hosts`` the command ``ssh ... <host> uname`` is
    run.  One row per host is handed to ``self.out.line('ssh', rows)``.

    Args:
        hosts: iterable of host names to probe.

    Returns:
        ``None`` when ``self.out`` is ``None`` (nothing is probed then),
        otherwise the set of hosts whose ssh command gave a truthy exit
        code.
    """
    if self.out is None:
        return

    ssh_template = (
        "ssh -o ConnectTimeout={} -o StrictHostKeyChecking=no {} uname"
    )
    unreachable = []
    rows = []
    for host in hosts:
        rc, stdout, stderr, exc = run_cmd(
            ssh_template.format(self.args.timeout, host)
        )
        # Anything that is neither truthy nor equal to 0 stays 'UNKN'.
        state, cat = 'UNKN', category.UNKN
        if rc:
            state, cat = 'DOWN', category.BAD
            unreachable.append(host)
        elif rc == 0:
            state, cat = 'OK', category.GOOD
        rows.append({
            'column': host,
            'status': state,
            'category': cat,
            'history': [],
            'info': '',
            'details': '',
        })

    self.log.debug("Hosts: '{}' are down".format(str(unreachable)))
    self.out.line('ssh', rows)
    return set(unreachable)
def status(self):
    """Check the output of ``scontrol ping``.

    The command (prefixed with ``self.cmd_prefix``) must exit with 0 and
    its stripped stdout must start with ``'Slurmctld(primary/backup) at '``
    and end with ``' are UP/DOWN'``.

    Returns:
        ``(True, '')`` on success, otherwise ``(False, comment)`` where
        ``comment`` describes what went wrong.
    """
    expected_head = 'Slurmctld(primary/backup) at '
    expected_tail = ' are UP/DOWN'

    cmd = self.cmd_prefix + 'scontrol ping'
    rc, stdout, _, _ = run_cmd(cmd, timeout=self.timeout)
    if rc != 0:
        # Was ``.cormat``, which raised AttributeError on this path.
        return False, "'{}' exit code is not 0".format(cmd)

    output = stdout.strip()
    # The length check keeps head and tail from overlapping in a short
    # string.
    matches = (
        len(output) >= len(expected_head) + len(expected_tail)
        and output.startswith(expected_head)
        and output.endswith(expected_tail)
    )
    if not matches:
        comment = "Stdout of '{}' is not matching '{}...{}'".format(
            cmd, expected_head, expected_tail)
        return False, comment
    return True, ''
def status(self):
    """Check that the local DNS server resolves ``localhost``.

    Runs ``dig`` against ``@localhost`` (prefixed with ``self.cmd_prefix``,
    using ``self.timeout`` as the ``+time`` value) and expects the stripped
    output to be ``127.0.0.1``.

    Returns:
        ``(True, "")`` on success, otherwise ``(False, comment)``.
    """
    expected = "127.0.0.1"
    dig_call = "dig +tries=1 +time={} +short @localhost localhost".format(
        self.timeout)
    cmd = self.cmd_prefix + dig_call

    rc, stdout, stderr, exc = run_cmd(cmd)
    if not rc and stdout.strip() == expected:
        return True, ""
    return False, "'{}' did not return '{}'".format(cmd, expected)
def check_zfs(self, answer, res, host):
    """Check ZFS pool health on ``host`` and update ``answer`` in place.

    Args:
        answer: result dict; its ``'status'``, ``'category'`` and
            ``'details'`` entries are updated.
        res: resource description; ``res['running_on']`` is a list of
            dicts that each carry a ``'name'`` key.
        host: host on which ``zpool list -H -o name,health`` is run via ssh.

    Returns:
        The same ``answer`` dict.  It is marked ``'ERR'`` when the resource
        runs on zero or several nodes, when the command fails, when a
        node other than the one running the resource prints anything, or
        when the node running the resource reports no pools, an
        unparsable line, or a pool whose health is not ``ONLINE``.
        ``'status'`` becomes ``'ONLINE'`` only on the node running the
        resource and only if ``answer['category']`` is ``category.GOOD``
        at that point.
    """
    def fail(details):
        answer['status'] = 'ERR'
        answer['category'] = category.ERROR
        answer['details'] = details
        return answer

    active_nodes = [e['name'] for e in res['running_on']]
    if not active_nodes:
        return fail("ZFS is not running anywhere")
    if len(active_nodes) > 1:
        return fail("pcs reported ZFS is mounted on 2 nodes")
    active_node = active_nodes[0]

    cmd = 'ssh -o ConnectTimeout={} -o StrictHostKeyChecking=no {} '
    cmd = cmd.format(self.args.timeout, host)
    cmd += 'zpool list -H -o name,health'
    rc, stdout, _, _ = run_cmd(cmd)
    if rc:
        return fail("'{}' returned non-zero exit code".format(cmd))

    if host != active_node:
        # A node that does not run the resource must not list any pool.
        if stdout:
            return fail(
                "'{}' returned some output on passive node".format(cmd))
        return answer

    problems = []
    pools_seen = 0
    for line in stdout.strip().split('\n'):
        fields = line.split()
        if not fields:
            # Blank line (e.g. empty output): nothing to unpack.
            continue
        if len(fields) != 2:
            problems.append(
                "Unexpected line '{}' in zpool output".format(line))
            continue
        pools_seen += 1
        name, status = fields
        if status != "ONLINE":
            problems.append(
                "Status of '{}' is '{}'".format(name, status))

    if not pools_seen and not problems:
        # The resource runs here, yet zpool listed nothing.
        return fail("'{}' returned no pools on active node".format(cmd))

    if problems:
        answer['status'] = 'ERR'
        answer['category'] = category.ERROR
        # Appended to the existing details, as before; several messages
        # are now separated so they stay readable.
        answer['details'] += '; '.join(problems)

    if answer['category'] != category.GOOD:
        return answer
    answer['status'] = 'ONLINE'
    return answer
def status(self):
    """Check that ``ssh localhost uptime`` works and prints something.

    The command is prefixed with ``self.cmd_prefix``.

    Returns:
        ``(True, "")`` when the command exits with a falsy code and
        produces non-blank output, otherwise ``(False, comment)``.
    """
    cmd = self.cmd_prefix + 'ssh localhost uptime'
    rc, stdout, _, _ = run_cmd(cmd)
    # The previous test, len(stdout.strip().split('\n')) < 1, could never
    # be true because str.split always returns at least one element, so
    # empty output went unnoticed.
    if rc or not stdout.strip():
        return False, "'{}' returned unexpected result".format(cmd)
    return True, ""
def status(self):
    """Check that ``sacctmgr -n list cluster`` lists a cluster.

    The command is prefixed with ``self.cmd_prefix``.  The first line of
    its stripped output must contain at least two whitespace-separated
    fields.

    Returns:
        ``(True, "")`` on success, otherwise ``(False, comment)``.
    """
    cmd = self.cmd_prefix + 'sacctmgr -n list cluster'
    rc, stdout, stderr, exc = run_cmd(cmd)

    # str.split always yields at least one element, so index 0 is safe.
    first_line = stdout.strip().split('\n')[0]
    if rc or len(first_line.split()) < 2:
        return False, "'{}' returned no clusters configured".format(cmd)
    return True, ""
def status(self):
    """Check that MySQL answers a trivial ``select``.

    Runs ``mysql -e 'select 123;' -s -r``.  When ``self.cmd_prefix`` is
    non-empty the mysql call is wrapped in double quotes and appended to
    the prefix.  The stripped output must equal ``123``.

    Returns:
        ``(True, "")`` on success, otherwise ``(False, comment)``.
    """
    num = "123"
    query_cmd = "mysql -e 'select " + num + ";' -s -r"
    if self.cmd_prefix:
        cmd = self.cmd_prefix + '"' + query_cmd + '"'
    else:
        cmd = self.cmd_prefix + query_cmd

    rc, stdout, stderr, exc = run_cmd(cmd)
    if not rc and stdout.strip() == num:
        return True, ""
    return False, '{} returned unexpected result'.format(cmd)
def check_fencing(self):
    """Report, per node, whether a STONITH resource is set up for it.

    Reads ``stonith-enabled`` from the output of ``pcs property`` (it
    defaults to ``'false'`` when the command fails or the line is absent)
    and picks the entries of ``self.ha_status['resources']`` whose
    ``'resource_agent'`` begins with ``stonith:``.

    For every host in ``self.node_ids`` the status is:

    * ``'UNCONFIG'`` -- no stonith resource lists the host in
      ``'running_on'``;
    * ``'CONFIGURED'`` (category GOOD) -- one does and stonith is enabled;
    * ``'DISABLED'`` -- one does but stonith is not enabled.

    The rows are passed to ``self.out.line('STONITH', rows)``.

    Returns:
        The list of row dicts.
    """
    property_marker = ' stonith-enabled:'
    agent_marker = 'stonith:'

    rc, stdout, stderr, exc = run_cmd("pcs property")
    stonith_enabled = 'false'
    if rc == 0:
        for line in stdout.split('\n'):
            # The line must carry something after the marker.
            if (line.startswith(property_marker)
                    and len(line) > len(property_marker)):
                stonith_enabled = line.split(':')[1].strip()

    stonith_conf = [
        res for res in self.ha_status['resources']
        if len(res['resource_agent']) > len(agent_marker)
        and res['resource_agent'].startswith(agent_marker)
    ]

    rows = []
    for host in self.node_ids.values():
        row = {
            'column': host,
            'status': 'UNCONFIG',
            'category': category.UNKN,
            'history': [],
            'info': '',
            'details': ''
        }
        has_stonith = any(
            host in [e['name'] for e in res['running_on']]
            for res in stonith_conf
        )
        if has_stonith:
            if stonith_enabled == 'true':
                row['status'] = 'CONFIGURED'
                row['category'] = category.GOOD
            else:
                row['status'] = 'DISABLED'
        rows.append(row)

    self.out.line('STONITH', rows)
    return rows
def status(self):
    """Check that the ``mongo`` shell evaluates an expression.

    Runs ``mongo --eval '{ping: 111222333}'``.  When ``self.cmd_prefix``
    is non-empty the mongo call is wrapped in double quotes and appended
    to the prefix.  The last line of the stripped output must equal the
    ping value.

    Returns:
        ``(True, "")`` on success, otherwise ``(False, comment)``.
    """
    ping = "111222333"
    eval_cmd = "mongo --eval '{ping: " + ping + "}'"
    if self.cmd_prefix:
        cmd = self.cmd_prefix + '"' + eval_cmd + '"'
    else:
        cmd = self.cmd_prefix + eval_cmd

    rc, stdout, stderr, exc = run_cmd(cmd)
    # str.split always yields at least one element, so [-1] is safe.
    last_line = stdout.strip().split('\n')[-1]
    if rc or last_line != ping:
        return False, "'{}' returned no ping".format(cmd)
    return True, ""
def status(self):
    """Check that a munge credential can be created and decoded.

    Runs ``munge -n | unmunge`` (prefixed with ``self.cmd_prefix``).  The
    second whitespace-separated field of the first output line must be
    ``Success`` and the exit code must be falsy.

    Returns:
        ``(True, "")`` on success, otherwise ``(False, comment)``.
    """
    cmd = self.cmd_prefix + 'munge -n | unmunge'
    rc, stdout, stderr, exc = run_cmd(cmd)

    first_line_fields = stdout.strip().split('\n')[0].split()
    if len(first_line_fields) < 2:
        return False, "'{}' returned no status".format(cmd)

    if rc or first_line_fields[1] != 'Success':
        return False, "'{}' returned error".format(cmd)
    return True, ""
def check_drbd(self, answer, res, host):
    """Check the DRBD disk state on ``host`` and update ``answer`` in place.

    Runs ``drbd-overview | grep trinity`` on ``host`` via ssh and looks at
    the fourth whitespace-separated field of the output, which must be at
    least 7 characters long and start with ``UpToDa``.

    Args:
        answer: result dict; ``'status'``, ``'category'`` and ``'details'``
            are set when the check fails.
        res: unused here; kept so the signature matches its callers.
        host: host on which the command is run.

    Returns:
        The same ``answer`` dict, unchanged when the check passes.
    """
    cmd = 'ssh -o ConnectTimeout={} -o StrictHostKeyChecking=no {} '
    cmd = cmd.format(self.args.timeout, host)
    cmd += 'drbd-overview | grep trinity'
    rc, stdout, _, _ = run_cmd(cmd)
    if rc:
        answer['status'] = 'ERR'
        answer['category'] = category.ERROR
        answer['details'] = "{} returned non-zero exit code".format(cmd)
        return answer

    fields = stdout.strip().split()
    # The original compared the string itself with 7 (``stdout[3] < 7``),
    # which raises TypeError on Python 3; the length is what is meant.
    up_to_date = (
        len(fields) >= 4
        and len(fields[3]) >= 7
        and fields[3].startswith('UpToDa')
    )
    if not up_to_date:
        answer['status'] = 'ERR'
        answer['category'] = category.ERROR
        answer['details'] = 'DRBD status is not UpToDate'
    return answer
def cmd(self, cmd):
    """Run a command through ``utils.run_cmd`` and log what happened.

    Args:
        cmd: the command to pass to ``utils.run_cmd``.

    Returns:
        A tuple ``(rc, stdout, stdout_lines, stderr)`` where
        ``stdout_lines`` is the list of non-empty lines of ``stdout``.
    """
    self.tagged_log_debug("Command to run: '{}'".format(cmd))
    rc, stdout, stderr, e = utils.run_cmd(cmd)
    if e:
        self.tagged_log_debug(
            "Exception on running '{}': '{}'".format(cmd, e)
        )

    # A real list rather than filter(): on Python 3 filter() is a one-shot
    # iterator that supports neither len() nor indexing.
    stdout_lines = [line for line in stdout.split('\n') if line]

    def oneline(text):
        # Keep multi-line output on a single log line.
        return text.replace("\n", "\\n")

    self.tagged_log_debug(
        (
            "cmd = '{}', rc = {}, stdout = '{}', stderr = '{}'"
        ).format(cmd, rc, oneline(stdout), oneline(stderr))
    )
    return rc, stdout, stdout_lines, stderr
def get_sinfo(self):
    """
    Collect node states reported by sinfo -N -o "%N %6T".

    ``self.statuses`` is reset and then filled with a mapping from the
    first field of each output line to the set of second fields seen for
    it.  Every line with at least two fields is recorded; no line is
    treated specially.  The (possibly empty) mapping is returned; it
    stays empty when the command exits with a truthy code.
    """
    self.statuses = {}
    rc, stdout, _, _ = utils.run_cmd('sinfo -N -o "%N %6T"')
    if rc:
        return self.statuses

    for raw_line in stdout.split("\n"):
        fields = raw_line.split()
        if len(fields) >= 2:
            self.statuses.setdefault(fields[0], set()).add(fields[1])
    return self.statuses
def status(self):
    """Check that chrony is synchronised to a time source.

    Two commands are run, both prefixed with ``self.cmd_prefix``:

    1. ``chronyc tracking`` -- the fourth field of its first line must not
       be ``7F7F0101``.
    2. ``chronyc sources`` -- the last field of its first line is read as
       the number of sources ``n``; at least one of the last ``n`` lines
       must have ``*`` as its second character.

    Returns:
        ``(True, '')`` on success, otherwise ``(False, comment)``.
    """
    cmd = self.cmd_prefix + 'chronyc tracking'
    rc, stdout, _, _ = run_cmd(cmd)
    if rc != 0:
        # Was ``.cormat``, which raised AttributeError on this path.
        return False, "'{}' exit code is not 0".format(cmd)

    # Empty or truncated output used to raise IndexError on field 3.
    first_fields = stdout.strip().split('\n')[0].split()
    if len(first_fields) < 4:
        return False, "'{}' returned no output".format(cmd)
    # Magic number is from man chronyc
    if first_fields[3] == '7F7F0101':
        return False, 'Computer is not synchronised to any external source.'

    cmd = self.cmd_prefix + 'chronyc sources'
    rc, stdout, _, _ = run_cmd(cmd)
    if rc != 0:
        return False, "'{}' exit code is not 0".format(cmd)

    if not stdout.strip():
        return False, "'{}' returned no output".format(cmd)
    lines = stdout.strip().split('\n')

    try:
        n_sources = int(lines[0].split()[-1])
    except (IndexError, ValueError):
        n_sources = 0
    if n_sources < 1:
        return False, "'{}' did not return numnber of sources".format(cmd)

    # Slicing makes a too-short line simply "not selected" instead of
    # raising IndexError and hiding a selected source on another line.
    is_current_synced = any(
        line[1:2] == '*' for line in lines[-n_sources:]
    )
    if not is_current_synced:
        return False, "'{}' returned no currenly synced servers".format(cmd)
    return True, ''
def check_systemd_unit(self, answer, service, host=None,
                       need_started=True, need_enabled=True):
    """Check a systemd unit's autostart and run state; update ``answer``.

    Args:
        answer: result dict, updated in place.
        service: unit name handed to ``systemctl``.
        host: when not ``None`` the commands are run on that host via ssh,
            otherwise locally.
        need_started: whether the unit is expected to be running.
        need_enabled: whether ``systemctl is-enabled`` is expected to print
            ``enabled`` (when true) or ``disabled`` (when false).

    Returns:
        ``answer``.  If the unit runs and is expected to, the value
        returned by ``self.service_checker(answer, service, host)``.
    """
    def flag_error(details):
        answer['status'] = 'ERR'
        answer['category'] = category.ERROR
        answer['info'] = 'systemd'
        answer['details'] = details

    cmd_prefix = ""
    if host is not None:
        cmd_prefix = (
            "ssh -o ConnectTimeout={} -o StrictHostKeyChecking=no {} "
        ).format(self.args.timeout, host)

    # Autostart state: only the printed word is used, not the exit code.
    rc, stdout, stderr, exc = run_cmd(
        cmd_prefix + "systemctl is-enabled " + service)
    enable_state = stdout.strip()
    if need_enabled:
        if enable_state != "enabled":
            flag_error('Autostart is disabled for the unit.')
    elif enable_state != "disabled":
        flag_error('Autostart is enabled for the unit.')

    # Run state: a falsy exit code of `systemctl status` means "started".
    rc, stdout, stderr, exc = run_cmd(
        cmd_prefix + "systemctl status {}".format(service))
    is_started = not rc

    if not is_started:
        if need_started:
            flag_error('Unit should run on this host.')
        return answer
    if not need_started:
        flag_error('Unit should not run on this host.')
        return answer

    # NOTE(review): status/category/info set by a failed autostart check
    # above are overwritten here (only 'details' survives) -- confirm
    # this is intended.
    answer['status'] = 'UP'
    answer['category'] = category.GOOD
    answer['info'] = ''
    answer = self.service_checker(answer, service, host)
    return answer