def _lnet_state(self):
    lnet_up = False
    lnet_loaded = not bool(
        AgentShell.run(["udevadm", "info", "--path", "/sys/module/lnet"]).rc)

    if lnet_loaded:
        lnet_up = not bool(AgentShell.run(["lnetctl", "net", "show"]).rc)

    return {
        (False, False): "lnet_unloaded",
        (False, True): "lnet_unloaded",
        (True, False): "lnet_down",
        (True, True): "lnet_up",
    }[(lnet_loaded, lnet_up)]
def _resource_exists(ha_label):
    '''
    Check if a resource exists in current configuration.
    :return: True if exists
    '''
    result = AgentShell.run(["crm_resource", "-W", "-r", ha_label])
    return result.rc == 0
def _cibadmin(command_args, timeout=120, raise_on_timeout=False):
    assert timeout > 0, "timeout must be greater than zero"

    # I think these are "errno" values, but I'm not positive
    # but going forward, any additions to this should try to be informative
    # about the type of exit code and why it's OK to retry
    RETRY_CODES = {
        10: "something unknown",
        41: "something unknown",
        62: "Timer expired",
        107: "Transport endpoint is not connected",
    }

    command_args.insert(0, "cibadmin")
    # NB: This isn't a "true" timeout, in that it won't forcibly stop the
    # subprocess after a timeout. We'd need more invasive changes to
    # shell._run() for that.
    for _ in util.wait(timeout):
        result = AgentShell.run(command_args)

        if result.rc == 0:
            return result
        elif result.rc not in RETRY_CODES:
            break

    if raise_on_timeout and result.rc in RETRY_CODES:
        raise PacemakerError(
            "%s timed out after %d seconds: rc: %s, stderr: %s"
            % (" ".join(command_args), timeout, result.rc, result.stderr))

    return result
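# Hedged usage sketch (not part of the original source): the CIB scope and XML
# fragment below are illustrative assumptions; only the retry/timeout behaviour
# comes from _cibadmin() above. With raise_on_timeout=True a transient error that
# never clears (e.g. rc 107 while pacemaker is still starting) becomes a
# PacemakerError instead of being returned silently.
def _example_cib_update():
    result = _cibadmin(
        ["--modify", "--allow-create", "-o", "crm_config",
         "-X", '<cluster_property_set id="cib-bootstrap-options"/>'],
        timeout=60,
        raise_on_timeout=True)
    return result.rc == 0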
def unconfigure_corosync2(host_fqdn, mcast_port):
    """
    Unconfigure the corosync application.

    For corosync2 don't disable pcsd, just remove host node from cluster and
    disable corosync from auto starting (service should already be stopped in
    state transition)

    Note that pcs cluster commands handle editing and removal of the
    corosync.conf file

    Return: Value using simple return protocol
    """
    error = corosync_service.disable()
    if error:
        return agent_error(error)

    # Detect if we are the only node in the cluster, we want to do this before
    # the next command removes the conf file
    cluster_nodes = _nodes_in_cluster()

    result = AgentShell.run(["pcs", "--force", "cluster", "node", "remove", host_fqdn])

    if result.rc != 0:
        if "No such file or directory" in result.stderr:
            # we want to return successful if the configuration file does not exist
            console_log.warning(result.stderr)
        elif "Error: Unable to update any nodes" in result.stderr:
            # this error is expected when this is the last node in the cluster
            if len(cluster_nodes) != 1:
                return agent_error(result.stderr)
        else:
            return agent_error(result.stderr)

    return agent_ok_or_error(
        firewall_control.remove_rule(PCS_TCP_PORT, "tcp", "pcs", persist=True)
        or firewall_control.remove_rule(mcast_port, "udp", "corosync", persist=True)
    )
def kernel_status():
    """
    :return: {'running': 'kernel-X.Y.Z',
              'required': <'kernel-A.B.C' or None>,
              'available': ['kernel-...', ...]}
    """
    running_kernel = "kernel-%s" % AgentShell.try_run(["uname", "-r"]).strip()

    available_kernels = [
        k for k in AgentShell.try_run(["rpm", "-q", "kernel"]).split("\n") if k
    ]

    if AgentShell.run(["rpm", "-q", "--whatprovides", "kmod-lustre"]).rc == 0:
        try:
            modlist = [
                os.path.splitext(os.path.basename(k))[0]
                for k in AgentShell.try_run([
                    "rpm", "-ql", "--whatprovides", "lustre-osd", "kmod-lustre"
                ]).split("\n")
                if k.endswith(".ko")
            ]
            required_kernel = latest_kernel(available_kernels, modlist)
        except (AgentShell.CommandExecutionError, StopIteration):
            required_kernel = None
    elif AgentShell.run(["rpm", "-q", "kmod-lustre-client"]).rc == 0:
        # but on a worker, we can ask kmod-lustre-client what the required
        # kernel is
        try:
            modlist = [
                os.path.splitext(os.path.basename(k))[0]
                for k in AgentShell.try_run([
                    "rpm", "-ql", "--whatprovides", "kmod-lustre-client"
                ]).split("\n")
                if k.endswith(".ko")
            ]
            required_kernel = latest_kernel(available_kernels, modlist)
        except (AgentShell.CommandExecutionError, StopIteration):
            required_kernel = None
    else:
        required_kernel = None

    return {
        "running": running_kernel,
        "required": required_kernel,
        "available": available_kernels,
    }
def _full_scan(self):
    # If we are a worker node then return nothing because our devices are not of
    # interest. This is a short term solution for HYD-3140. This plugin should
    # really not be loaded if it is not needed, but for now this sorts out an
    # issue with PluginAgentResources being in the linux plugin.
    if config.get('settings', 'profile')['worker']:
        return {}

    # Before we do anything do a partprobe, this will ensure that everything gets
    # an up to date view of the device partitions. partprobe might throw errors
    # so ignore the return value
    AgentShell.run(["partprobe"])

    # Map of block devices major:minors to /dev/ path.
    block_devices = BlockDevices()

    # Devicemapper: LVM and Multipath
    dmsetup = DmsetupTable(block_devices)

    # Software RAID
    mds = MdRaid(block_devices).all()

    # _zpools
    zfs_devices = ZfsDevices()
    zfs_devices.full_scan(block_devices)

    # EMCPower Devices
    emcpowers = EMCPower(block_devices).all()

    # Local filesystems (not lustre) in /etc/fstab or /proc/mounts
    local_fs = LocalFilesystems(block_devices).all()

    # We have scanned devices, so set the devices scanned flag.
    LinuxDevicePlugin.devices_scanned = True

    return {
        "vgs": dmsetup.vgs,
        "lvs": dmsetup.lvs,
        "zfspools": zfs_devices.zpools,
        "zfsdatasets": zfs_devices.datasets,
        "zfsvols": zfs_devices.zvols,
        "mpath": dmsetup.mpaths,
        "devs": block_devices.block_device_nodes,
        "local_fs": local_fs,
        'emcpower': emcpowers,
        'mds': mds,
    }
def convert_targets(force=False):
    '''
    Convert existing ocf:chroma:Target to ZFS + Lustre
    '''
    try:
        result = AgentShell.run(['cibadmin', '--query'])
    except OSError, err:
        if err.errno != errno.ENOENT:
            raise
def kernel_status():
    """
    :return: {'running': 'kernel-X.Y.Z',
              'required': <'kernel-A.B.C' or None>,
              'available': ['kernel-...', ...]}
    """
    running_kernel = "kernel-%s" % AgentShell.try_run(["uname", "-r"]).strip()

    if AgentShell.run(["rpm", "-q", "--whatprovides", "kmod-lustre"]).rc == 0:
        # on a server, a required kernel is a lustre patched kernel since we
        # are building storage servers that can support both ldiskfs and zfs
        try:
            required_kernel = \
                next(k for k in sorted(AgentShell.try_run(["rpm", "-q", "kernel"]).split('\n'),
                                       reverse=True)
                     if "_lustre" in k)
        except (AgentShell.CommandExecutionError, StopIteration):
            required_kernel = None
    elif AgentShell.run(["rpm", "-q", "kmod-lustre-client"]).rc == 0:
        # but on a worker, we can ask kmod-lustre-client what the required
        # kernel is
        try:
            required_kernel_prefix = \
                next(k for k in AgentShell.try_run(["rpm", "-q", "--requires",
                                                    "kmod-lustre-client"]).split('\n')
                     if "kernel >=" in k).split(" >= ")[1]
            required_kernel = AgentShell.try_run(
                ["rpm", "-q", "kernel-%s*" % required_kernel_prefix]).split('\n')[0]
        except (AgentShell.CommandExecutionError, StopIteration):
            required_kernel = None
    else:
        required_kernel = None

    available_kernels = []
    for installed_kernel in AgentShell.try_run(["rpm", "-q", "kernel"]).split("\n"):
        if installed_kernel:
            available_kernels.append(installed_kernel)

    return {
        'running': running_kernel,
        'required': required_kernel,
        'available': available_kernels
    }
def unmount_target(uuid):
    # This is called by the Target RA from corosync
    # only unmount targets that are controlled by chroma:Target
    try:
        result = AgentShell.run(
            ['cibadmin', '--query', '--xpath', '//primitive'])
    except OSError, err:
        if err.errno != errno.ENOENT:
            raise
def get_resource_locations():
    """Parse `crm_mon -1` to identify where (if anywhere)
    resources (i.e. targets) are running

    returns [ resource_id: location|None, ... ]
    """
    try:
        result = AgentShell.run(["crm_mon", "-1", "-r", "-X"])
    except OSError, err:
        # ENOENT is fine here. Pacemaker might not be installed yet.
        if err.errno != errno.ENOENT:
            raise
def selinux_status():
    """
    Get selinux status on node

    :return: {'status': 'Disabled'}
    """
    status = "Disabled"
    rc = AgentShell.run(["getenforce"])
    if rc.rc == 0:
        status = rc.stdout.strip()

    return {"status": status}
def latest_kernel(kernel_list, modlist):
    required_kernel = None
    arch = AgentShell.try_run(["uname", "-m"]).strip()
    for kernel in kernel_list:
        if not kver_gt(kernel, required_kernel, arch):
            continue
        kver = kernel.split("-", 1)[1]
        if AgentShell.run(["modinfo", "-n", "-k", kver] + modlist).rc == 0:
            required_kernel = kernel

    return required_kernel
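# Hedged usage sketch (not part of the original source): picks the newest
# installed kernel that provides a given set of modules. The module name
# "lustre" is an illustrative assumption; kernel_status() above builds the real
# modlist from the .ko files shipped by the kmod-lustre / lustre-osd packages.
def _example_required_kernel():
    installed = [k for k in AgentShell.try_run(["rpm", "-q", "kernel"]).split("\n") if k]
    # modinfo -n -k <ver> <module> succeeds only if the module exists for that kernel
    return latest_kernel(installed, ["lustre"])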
def scanner_cmd(cmd):
    # Because we are pulling from device-scanner,
    # It is very important that we wait for
    # the udev queue to settle before requesting new data
    AgentShell.run(["udevadm", "settle"])

    client = socket.socket(socket.AF_UNIX)
    client.settimeout(10)
    client.connect_ex("/var/run/device-scanner.sock")
    client.sendall(json.dumps(cmd) + "\n")

    out = ""

    while True:
        out += client.recv(1024)

        if out.endswith("\n"):
            try:
                return json.loads(out)
            except ValueError:
                pass
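# Hedged usage sketch (not part of the original source): "info" is the only
# command these snippets send to device-scanner (see fetch_device_list() below);
# any other command name would be an assumption about that daemon's protocol.
# scanner_cmd() waits for udev to settle, then returns the parsed JSON reply.
def _example_device_info():
    info = scanner_cmd("info")
    return list(info.itervalues())  # py2 codebase: the values are device records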
def _configure_target_priority(primary, ha_label, node):
    if primary:
        score = "20"
    else:
        score = "10"

    name = _constraint(ha_label, primary)
    result = AgentShell.run(
        ['pcs', 'constraint', 'location', 'add', name, ha_label, node, score])

    if result.rc == 76:
        console_log.warn("A constraint with the name %s already exists", name)
        result.rc = 0

    return result
def get_resource_locations():
    """Parse `crm_mon -1` to identify where (if anywhere)
    resources (i.e. targets) are running

    returns [ resource_id: location|None, ... ]
    """
    try:
        result = AgentShell.run(["crm_mon", "-1", "-r", "-X"])
    except OSError as err:
        # ENOENT is fine here. Pacemaker might not be installed yet.
        if err.errno != errno.ENOENT:
            raise err
        return {}

    if result.rc != 0:
        console_log.info("crm_mon failed (%d): '%s' '%s'", result.rc,
                         result.stdout, result.stderr)
        return {}

    return _get_resource_locations(result.stdout)
def _nodes_in_cluster():
    """
    Returns the nodes in the corosync cluster

    example output from command 'pcs status corosync':
    > Corosync Nodes:
    >  Online:
    >  Offline: bill.bailey.com bob.marley.com

    :return: a list of all nodes in cluster
    """
    nodes = []
    result = AgentShell.run(["pcs", "status", "nodes", "corosync"])

    if result.rc != 0:
        # log all command errors but always continue to remove node from cluster
        console_log.warning(result.stderr)
    else:
        # nodes are on the right side of lines separated with ':'
        for line in result.stdout.split("\n"):
            if line.find(":") > 0:
                nodes.extend(line.split(":")[1].strip().split())

    return nodes
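# Hedged illustration (not part of the original source): with the sample output
# quoted in the docstring above, _nodes_in_cluster() would return
# ["bill.bailey.com", "bob.marley.com"] -- every whitespace-separated token to
# the right of the first ':' on each line; header lines such as "Corosync Nodes:"
# and the empty "Online:" line contribute nothing.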
def fetch_device_list():
    AgentShell.run(["udevadm", "settle"])

    info = scanner_cmd("info")

    return pipe(info.itervalues(), cmap(as_device), cfilter(filter_device), list)
def yum_util(action, packages=[], fromrepo=None, enablerepo=None, narrow_updates=False):
    """
    A wrapper to perform yum actions in an encapsulated way.

    :param action: clean, install, remove, update, requires etc
    :param packages: Packages to install or remove
    :param fromrepo: The repo the action should be carried out from, others are disabled.
    :param enablerepo: The repo to enable for the action, others are not disabled or enabled
    :param narrow_updates: ?
    :return: stdout of the command on success; raises CommandExecutionError on error.
    """
    if fromrepo and enablerepo:
        raise ValueError(
            "Cannot provide fromrepo and enablerepo simultaneously")

    repo_arg = []
    valid_rc_values = [0]  # Some error values other than 0 are valid.
    tries = 2
    if fromrepo:
        repo_arg = ["--disablerepo=*"] + ["--enablerepo=%s" % r for r in fromrepo]
    elif enablerepo:
        repo_arg = ["--enablerepo=%s" % r for r in enablerepo]
    if narrow_updates and action == "query":
        repo_arg.extend(["--upgrades"])

    if action == "clean":
        cmd = ["yum", "clean", "all"] + (repo_arg if repo_arg else ["--enablerepo=*"])
    elif action == "install":
        cmd = (["yum", "install", "-y", "--exclude", "kernel-debug"] + repo_arg +
               list(packages))
    elif action == "remove":
        cmd = ["yum", "remove", "-y"] + repo_arg + list(packages)
    elif action == "update":
        cmd = (["yum", "update", "-y", "--exclude", "kernel-debug"] + repo_arg +
               list(packages))
    elif action == "requires":
        cmd = ["repoquery", "--requires"] + repo_arg + list(packages)
    elif action == "query":
        cmd = ["repoquery"] + repo_arg + list(packages)
    elif action == "repoquery":
        cmd = (["repoquery", "--show-duplicates"] + repo_arg +
               ["--queryformat=%{EPOCH} %{NAME} %{VERSION} %{RELEASE} %{ARCH}"])
    else:
        raise RuntimeError("Unknown yum util action %s" % action)

    # This is a poor solution for HYD-3855 but not one that carries any known cost.
    # We sometimes see intermittent failures in test, and possibly out of test, that occur
    # 1 in 50 (estimate) times. yum commands are idempotent and so trying the command three
    # times has no downside and changes the estimated chance of fail to 1 in 12500.
    for hyd_3885 in range(tries, -1, -1):
        result = AgentShell.run(cmd)

        if result.rc in valid_rc_values:
            return result.stdout
        else:
            # if we were trying to install, clean the metadata before
            # trying again
            if action == "install":
                AgentShell.run(["yum", "clean", "metadata"])

            daemon_log.info("HYD-3885 Retrying yum command '%s'" % " ".join(cmd))

            if hyd_3885 == 0:
                daemon_log.info("HYD-3885 Retry yum command failed '%s'" % " ".join(cmd))
                raise AgentShell.CommandExecutionError(
                    result, cmd)  # Out of retries so raise for the caller..
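# Hedged usage sketch (not part of the original source): the repo and package
# names are illustrative assumptions. It shows the intended calling convention:
# fromrepo disables every other repository for the transaction, and the wrapper
# retries the yum command before finally raising CommandExecutionError.
def _example_install_lustre():
    return yum_util("install", packages=["lustre"], fromrepo=["lustre-server"])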
for con in dom.getElementsByTagName('rsc_location'):
    ha_label = con.getAttribute("rsc")

    if not locations.get(ha_label):
        locations[ha_label] = {}

    if con.getAttribute("id") == _constraint(ha_label, True):
        ind = 0
    elif con.getAttribute("id") == _constraint(ha_label, False):
        ind = 1
    else:
        console_log.info("Unknown constraint: %s", con.getAttribute("id"))
        continue

    locations[ha_label][ind] = con.getAttribute("node")

active = get_resource_locations()

AgentShell.run(['pcs', 'property', 'set', 'maintenance-mode=true'])

wait_list = []
for res in dom.getElementsByTagName('primitive'):
    if not (res.getAttribute("provider") == "chroma" and
            res.getAttribute("type") == "Target"):
        continue

    ha_label = res.getAttribute("id")

    # _get_target_config() will raise KeyError if uuid doesn't exist locally
    # next() will raise StopIteration if it doesn't find attribute target
    try:
        info = next(
            _get_target_config(ops.getAttribute("value"))
            for ops in res.getElementsByTagName('nvpair')
def _unconfigure_target_priority(primary, ha_label):
    return AgentShell.run([
        'pcs', 'constraint', 'location', 'remove',
        _constraint(ha_label, primary)
    ])
def yum_util(action, packages=[], fromrepo=None, enablerepo=None, narrow_updates=False):
    '''
    A wrapper to perform yum actions in an encapsulated way.
    :param action: clean, install, remove, update, requires etc
    :param packages: Packages to install or remove
    :param fromrepo: The repo the action should be carried out from, others are disabled.
    :param enablerepo: The repo to enable for the action, others are not disabled or enabled
    :param narrow_updates: ?
    :return: stdout of the command on success; raises CommandExecutionError on error.
    '''
    if fromrepo and enablerepo:
        raise ValueError("Cannot provide fromrepo and enablerepo simultaneously")

    repo_arg = []
    valid_rc_values = [0]  # Some error values other than 0 are valid.
    tries = 2
    if fromrepo:
        repo_arg = ['--disablerepo=*'] + ['--enablerepo=%s' % r for r in fromrepo]
    elif enablerepo:
        repo_arg = ['--enablerepo=%s' % r for r in enablerepo]
    if narrow_updates and action == 'query':
        repo_arg.extend(['--upgrades'])

    if action == 'clean':
        cmd = ['dnf', 'clean', 'all'] + (repo_arg if repo_arg else ["--enablerepo=*"])
    elif action == 'install':
        cmd = ['dnf', 'install', '--allowerasing', '-y', '--exclude', 'kernel-debug'] + \
              repo_arg + list(packages)
    elif action == 'remove':
        cmd = ['dnf', 'remove', '-y'] + repo_arg + list(packages)
    elif action == 'update':
        cmd = ['dnf', 'update', '--allowerasing', '-y', '--exclude', 'kernel-debug'] + \
              repo_arg + list(packages)
    elif action == 'requires':
        cmd = ['dnf', 'repoquery', '--requires'] + repo_arg + list(packages)
    elif action == 'query':
        cmd = ['dnf', 'repoquery', '--available'] + repo_arg + list(packages)
    elif action == 'repoquery':
        cmd = ['dnf', 'repoquery', '--available'] + repo_arg + \
              ['--queryformat=%{EPOCH} %{NAME} %{VERSION} %{RELEASE} %{ARCH}']
    elif action == 'check-update':
        cmd = ['dnf', 'repoquery',
               '--queryformat=%{name} %{version}-%{release}.%{arch} %{repoid}',
               '--upgrades'] + repo_arg + list(packages)
    else:
        raise RuntimeError('Unknown yum util action %s' % action)

    # This is a poor solution for HYD-3855 but not one that carries any known cost.
    # We sometimes see intermittent failures in test, and possibly out of test, that occur
    # 1 in 50 (estimate) times. yum commands are idempotent and so trying the command three
    # times has no downside and changes the estimated chance of fail to 1 in 12500.
    for hyd_3885 in range(tries, -1, -1):
        result = AgentShell.run(cmd)

        if result.rc in valid_rc_values:
            return result.stdout
        else:
            # if we were trying to install, clean the metadata before
            # trying again
            if action == 'install':
                AgentShell.run(['dnf', 'clean', 'metadata'])

            daemon_log.info("HYD-3885 Retrying yum command '%s'" % " ".join(cmd))

            if hyd_3885 == 0:
                daemon_log.info("HYD-3885 Retry yum command failed '%s'" % " ".join(cmd))
                raise AgentShell.CommandExecutionError(result, cmd)  # Out of retries so raise for the caller..
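# Hedged usage sketch (not part of the original source): shows the 'check-update'
# action added in this dnf-based variant. Each output line follows the
# queryformat above ("name version-release.arch repoid"), so the first token of
# each non-empty line is the package name; the loop below is an assumption about
# how a caller might consume it.
def _example_pending_updates():
    out = yum_util('check-update')
    return [line.split()[0] for line in out.split('\n') if line]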
def convert_targets(force=False):
    """
    Convert existing ocf:chroma:Target to ZFS + Lustre
    """
    try:
        result = AgentShell.run(["cibadmin", "--query"])
    except OSError as err:
        if err.errno != errno.ENOENT:
            raise err
        return {
            "crm_mon_error": {
                "rc": err.errno,
                "stdout": err.message,
                "stderr": err.strerror,
            }
        }

    if result.rc != 0:
        # Pacemaker not running, or no resources configured yet
        return {
            "crm_mon_error": {
                "rc": result.rc,
                "stdout": result.stdout,
                "stderr": result.stderr,
            }
        }

    dom = ET.fromstring(result.stdout)

    this_node = _this_node()

    # node elements are numbered from 1
    # dc-uuid is the node id of the domain controller
    dcuuid = next(
        (node.get("uname") for node in dom.findall(".//node")
         if node.get("id") == dom.get("dc-uuid")),
        "",
    )

    if dcuuid != this_node and not force:
        console_log.info("This is not Pacemaker DC %s this is %s", dcuuid, this_node)
        return

    # Build map of resource -> [ primary node, secondary node ]
    locations = {}
    for con in dom.findall(".//rsc_location"):
        ha_label = con.get("rsc")

        if not locations.get(ha_label):
            locations[ha_label] = {}

        if con.get("id") == _constraint(ha_label, True):
            ind = 0
        elif con.get("id") == _constraint(ha_label, False):
            ind = 1
        else:
            console_log.info("Unknown constraint: %s", con.get("id"))
            continue

        locations[ha_label][ind] = con.get("node")

    active = get_resource_locations()

    AgentShell.try_run([
        "crm_attribute", "--type", "crm_config", "--name", "maintenance-mode",
        "--update", "true",
    ])

    wait_list = []
    for res in dom.findall(".//primitive"):
        if not (res.get("provider") == "chroma" and res.get("type") == "Target"):
            continue

        ha_label = res.get("id")

        # _get_target_config() will raise KeyError if uuid doesn't exist locally
        # next() will raise StopIteration if it doesn't find attribute target
        try:
            info = next(
                _get_target_config(ops.get("value"))
                for ops in res.findall('.//nvpair[@name="target"]'))
        except Exception as err:
            console_log.error("No local info for resource: %s", ha_label)
            continue

        _unconfigure_target_priority(False, ha_label)
        _unconfigure_target_priority(True, ha_label)
        _unconfigure_target_ha(ha_label, True)

        _configure_target_ha(ha_label, info, (active.get(ha_label) is not None))
        _configure_target_priority(True, ha_label, locations[ha_label][0])
        _configure_target_priority(False, ha_label, locations[ha_label][1])
        wait_list.append([ha_label, (active.get(ha_label) is not None)])

    # wait for last item
    for wait in wait_list:
        console_log.info("Waiting on %s", wait[0])
        _wait_target(*wait)

    AgentShell.try_run([
        "crm_attribute", "--type", "crm_config", "--name", "maintenance-mode",
        "--delete",
    ])
def pacemaker_running():
    result = AgentShell.run(['service', 'pacemaker', 'status'])
    return result.rc == 0
def monitor(self):
    result = AgentShell.run(self.base_cmd + ["-n", self.plug, "-o", "monitor"])
    return result.rc