def yum_check_update(repos):
    """
    Return the packages from ``repos`` that yum reports as needing an update.

    Calls ``yum_util('check-update', fromrepo=repos)`` and parses its output
    one line at a time. Any exception raised by ``yum_util`` propagates to the
    caller (the original documentation names CommandExecutionError).

    :param repos: The repos to check for update
    :return: List of package names (first column of each recognised line).
    """
    response = yum_util('check-update', fromrepo=repos)
    needs_update = []

    for entry in response.split('\n'):
        if not entry:
            # Blank lines carry no information.
            continue

        fields = entry.split()

        # A valid line has exactly 3 fields and its third field is one of the
        # requested repos; anything else is ignored but logged.
        if len(fields) != 3 or fields[2] not in repos:
            daemon_log.warning(
                "yum check_update found unknown response of: %s\n"
                "In: %s\n"
                "Looking at: repos %s" % (entry, response, repos))
            continue

        needs_update.append(fields[0])

    return needs_update
def _add_zfs_pool(self, line, block_devices):
    """
    Record one zpool, its datasets and its zvols.

    :param line: whitespace separated "<name> <size> <uuid>" description of
        the pool; it must split into exactly three fields.
    :param block_devices: object providing ``paths_to_major_minors``; it is
        also handed on to the zvol lookup and the final update call.
    :return: None. Returns early (after logging a warning) when no
        major:minor numbers can be resolved for the pool's devices.
    """
    pool_name, human_size, pool_uuid = line.split()
    size_bytes = util.human_to_bytes(human_size)

    major_minors = block_devices.paths_to_major_minors(
        _get_all_zpool_devices(pool_name))

    if major_minors is None:
        daemon_log.warning(
            "Could not find major minors for zpool '%s'" % pool_name)
        return

    pool_datasets = _get_zpool_datasets(pool_name, major_minors)
    pool_zvols = _get_zpool_zvols(pool_name, major_minors, block_devices)

    pool_md = {
        "name": pool_name,
        "path": pool_name,
        # fabricate a major:minor. Do we ever use them as numbers?
        "block_device": "zfspool:%s" % pool_name,
        "uuid": pool_uuid,
        "size": size_bytes,
        "drives": major_minors,
    }

    # Persist the new data (pool / datasets / zvols) keyed by the pool uuid
    # before updating the in-memory view.
    store_entry = {
        'pool': pool_md,
        'datasets': pool_datasets,
        'zvols': pool_zvols,
    }
    write_to_store(pool_uuid, store_entry)

    self._update_pool_or_datasets(block_devices, pool_md, pool_datasets,
                                  pool_zvols)
def _load(cls):
    """
    Discover and import the action plugin modules that live in ``cls.path``.

    Every ``*.py*`` file in ``cls.path`` whose module name is not listed in
    EXCLUDED_PLUGINS is turned into a dotted module name. Modules whose last
    name component starts with an underscore are not imported. For each
    imported module:

    * every callable in its ``ACTIONS`` list is registered in
      ``cls.commands`` under the callable's name;
    * every entry of a non-empty ``CAPABILITIES`` is collected.

    A failure while loading one module is logged and does not stop the
    remaining modules from loading.

    Side effects: sets ``cls.commands`` (dict name -> callable) and
    ``cls.capabilities`` (list of unique capabilities).
    """
    def _walk_parents(directory):
        """Walk backwards up the tree to first non-module directory."""
        components = []

        if (os.path.isfile("%s/__init__.pyc" % directory) or
                os.path.isfile("%s/__init__.py" % directory)):
            parent, child = os.path.split(directory)
            components.append(child)
            components.extend(_walk_parents(parent))

        return components

    def _build_namespace(directory):
        """Builds a namespace by finding all parent modules."""
        return ".".join(reversed(_walk_parents(directory)))

    names = set()

    assert os.path.isdir(cls.path)
    for modfile in sorted(glob.glob("%s/*.py*" % cls.path)):
        directory, filename = os.path.split(modfile)
        module_name = filename.split(".py")[0]
        if module_name not in EXCLUDED_PLUGINS:
            namespace = _build_namespace(directory)
            names.add("%s.%s" % (namespace, module_name))

    daemon_log.info("Found action plugin modules: %s" % names)

    cls.commands = {}
    capabilities = set()

    # Modules whose final component is private (leading underscore) are
    # discovered above but deliberately not imported.
    for name in [n for n in names if not n.split(".")[-1].startswith('_')]:
        try:
            module = __import__(name, None, None, ['ACTIONS', 'CAPABILITIES'])

            if hasattr(module, 'ACTIONS'):
                # __name__ is the portable spelling of the Python 2 only
                # func_name attribute.
                for fn in module.ACTIONS:
                    cls.commands[fn.__name__] = fn

                daemon_log.info(
                    "Loaded actions from %s: %s"
                    % (name, [fn.__name__ for fn in module.ACTIONS]))
            else:
                daemon_log.warning(
                    "No 'ACTIONS' defined in action module %s" % name)

            if hasattr(module, 'CAPABILITIES') and module.CAPABILITIES:
                # set.add() accepts exactly one element, so unpacking a
                # multi-entry CAPABILITIES into it raised TypeError; update()
                # merges any number of entries.
                capabilities.update(module.CAPABILITIES)
        except Exception:
            # Best effort: one broken plugin must not prevent the others
            # from loading.
            daemon_log.warning("** error loading plugin %s" % name)
            daemon_log.warning(traceback.format_exc())

    cls.capabilities = list(capabilities)
def _search_for_inactive(self):
    """
    Return list of importable zpool names by parsing the 'zpool import'
    command output, e.g.:

        [root@lotus-33vm17 ~]# zpool import
           pool: lustre
             id: 5856902799170956568
          state: ONLINE
         action: The pool can be imported using its name or numeric identifier.
         config:

            lustre                             ONLINE
              scsi-0QEMU_QEMU_HARDDISK_disk15  ONLINE
              scsi-0QEMU_QEMU_HARDDISK_disk14  ONLINE

        ... (repeats for all discovered zpools)

    Only pools whose state is listed in ``self.acceptable_health`` are
    returned; the others are logged and skipped.

    :return: list of zpool names.
    :raises AgentShell.CommandExecutionError: when 'zpool import' fails with
        a return code other than 1.
    """
    try:
        out = AgentShell.try_run(["zpool", "import"])
    except AgentShell.CommandExecutionError as e:
        # zpool import errors with error code 1 if nothing available to import
        if e.result.rc == 1:
            out = ""
        else:
            # Bare raise keeps the traceback of the original failure.
            raise

    # Raw strings so that \s and \S reach the regex engine untouched.
    pool_re = re.compile(r"(\s*)pool: (\S*)")
    state_re = re.compile(r"(\s*)state: (\S*)")

    zpool_names = []
    zpool_name = None

    for line in filter(None, out.split("\n")):
        match = pool_re.match(line)
        if match is not None:
            zpool_name = match.group(2)

        match = state_re.match(line)
        if match is not None:
            if zpool_name:
                if match.group(2) in self.acceptable_health:
                    zpool_names.append(zpool_name)
                else:
                    daemon_log.warning(
                        "Not scanning zpool %s because it is %s."
                        % (zpool_name, match.group(2)))
            else:
                daemon_log.warning(
                    "Found a zpool import state but had no zpool name")

            # After each 'state' line is encountered, move onto the next
            # zpool name
            zpool_name = None

    return zpool_names
def _read_crm_mon_as_xml(self):
    """Run crm_mon --one-shot --as-xml, return raw output or None

    The command's stdout is returned when its return code is one of the
    expected values (0, 10). Any other return code is logged as a warning
    and None is returned instead.
    """
    command = ['crm_mon', '--one-shot', '--as-xml']
    rc, stdout, stderr = AgentShell.run_old(command)

    # 10 Corosync is not running on this node
    if rc in (0, 10):
        return stdout

    daemon_log.warning("rc=%s running '%s': '%s' '%s'"
                       % (rc, command, stdout, stderr))
    return None
def _parse_crm_as_xml(self, raw):
    """Parse the crm_mon response

    :param raw: text produced by crm_mon.
    :return: None when ``raw`` is not parseable XML (a warning is logged
        unless it contains the known "connection failed" message);
        otherwise a dict with the keys:

        * "datetime" - summary/last_update converted to UTC and formatted
          as "%Y-%m-%dT%H:%M:%S+00:00"
        * "nodes"    - {node name: attributes of that <node> element}
        * "options"  - {"stonith_enabled": bool}
    """
    try:
        root = xml.fromstring(raw)
    except ParseError:
        # not xml, might be a known error message
        if CorosyncPlugin.COROSYNC_CONNECTION_FAILURE not in raw:
            daemon_log.warning("Bad xml from corosync crm_mon: %s" % raw)
        return None

    # Got node info, pack it up and return
    last_update = root.find("summary/last_update").get("time")
    local_time = IMLDateTime.strptime(last_update, "%a %b %d %H:%M:%S %Y")
    utc_stamp = IMLDateTime.convert_datetime_to_utc(local_time).strftime(
        "%Y-%m-%dT%H:%M:%S+00:00")

    nodes = {}
    for node in root.findall("nodes/node"):
        nodes[node.get("name")] = node.attrib

    # stonith is reported as disabled unless cluster_options says "true".
    cluster_options = root.find("summary/cluster_options")
    stonith_enabled = (
        cluster_options is not None
        and cluster_options.get("stonith-enabled") == "true"
    )

    return {
        "datetime": utc_stamp,
        "nodes": nodes,
        "options": {"stonith_enabled": stonith_enabled},
    }
def terminate(self, plugin_name):
    """
    Tear down the session registered for ``plugin_name`` and forget it.

    A plugin with no session is logged as a warning and otherwise ignored.
    If the session has already disappeared from ``self._sessions`` by the
    time teardown has finished, that is logged as a warning too.

    :param plugin_name: key of the session to terminate.
    """
    try:
        session = self.get(plugin_name)
    except KeyError:
        daemon_log.warning(
            "SessionTable.terminate not found %s" % plugin_name)
        return

    daemon_log.info(
        "SessionTable.terminate %s/%s" % (plugin_name, session.id))
    session.teardown()

    try:
        del self._sessions[plugin_name]
    except KeyError:
        daemon_log.warning(
            "SessionTable.terminate session object already gone")
def _handle_messages(self, messages):
    """
    Dispatch each message received from the manager.

    Every entry of ``messages`` is parsed into a ``Message`` and handled
    according to its type:

    * SESSION_CREATE_RESPONSE - create the session for the plugin
    * SESSION_TERMINATE_ALL   - terminate every session
    * SESSION_TERMINATE       - terminate the plugin's session
    * DATA                    - deliver the body to the matching session;
      if the session raises while receiving it, the plugin's session is
      terminated
    * anything else           - NotImplementedError, which is logged

    An exception raised while handling one message is logged and does not
    prevent the remaining messages from being handled.

    :param messages: sequence of raw messages accepted by ``Message.parse``.
    """
    daemon_log.info("HttpReader: got %s messages" % (len(messages)))

    for message in messages:
        m = Message()
        m.parse(message)
        daemon_log.info("HttpReader: %s(%s, %s)"
                        % (m.type, m.plugin_name, m.session_id))

        try:
            if m.type == "SESSION_CREATE_RESPONSE":
                self._client.sessions.create(m.plugin_name, m.session_id)
            elif m.type == "SESSION_TERMINATE_ALL":
                self._client.sessions.terminate_all()
            elif m.type == "SESSION_TERMINATE":
                self._client.sessions.terminate(m.plugin_name)
            elif m.type == "DATA":
                try:
                    session = self._client.sessions.get(
                        m.plugin_name, m.session_id)
                except KeyError:
                    daemon_log.warning(
                        "Received a message for unknown session %s/%s"
                        % (m.plugin_name, m.session_id))
                else:
                    # We have successfully routed the message to the plugin
                    # instance for this session
                    try:
                        session.receive_message(m.body)
                    except Exception:
                        # Exception rather than a bare except, so that
                        # SystemExit/KeyboardInterrupt are not swallowed.
                        daemon_log.error(
                            "%s/%s raised an exception: %s"
                            % (m.plugin_name, m.session_id,
                               traceback.format_exc()))
                        self._client.sessions.terminate(m.plugin_name)
            else:
                raise NotImplementedError(m.type)
        except Exception:
            # format_exc() yields the traceback with its own line breaks;
            # joining the formatted lines with "\n" double-spaced the log.
            daemon_log.error(
                "Plugin exception handling data message: %s"
                % traceback.format_exc())
def _run(self):
    """
    Poll the manager for messages until ``self._stopping`` is set.

    Each iteration issues a GET carrying this client's boot and start times
    and passes the "messages" entry of the response to
    ``self._handle_messages``. When the request fails, every session is
    terminated and the loop waits up to HTTP_RETRY_PERIOD (or until asked
    to stop) before trying again.
    """
    get_args = dict(
        server_boot_time=self._client.boot_time.isoformat() + "Z",
        client_start_time=self._client.start_time.isoformat() + "Z",
    )

    while not self._stopping.is_set():
        daemon_log.info("HttpReader: get")

        try:
            body = self._client.get(params=get_args)
        except HttpError:
            daemon_log.warning("HttpReader: request failed")
            # We potentially dropped TX messages if this happened, which
            # could include session control messages, so have to completely
            # reset.
            # NB could change this to only terminate_all if an HTTP request
            # was started: there is no need to do the teardown if we didn't
            # even get a TCP connection to the manager.
            self._client.sessions.terminate_all()
            self._stopping.wait(timeout=self.HTTP_RETRY_PERIOD)
            continue

        self._handle_messages(body["messages"])

    daemon_log.info("HttpReader: stopping")
def _lnet_devices(self, interfaces):
    """
    Describe the LNet nids configured on the current node.

    The nids are read from the output of ``lctl get_param -n nis``. If that
    command cannot be run the previously cached result is returned instead.
    Nids whose interface cannot be found are logged and skipped, as are
    nids whose lnd type is listed in EXCLUDE_INTERFACES. The fresh result
    is cached before it is returned.

    :param interfaces: A list of the interfaces on the current node
    :return: Returns a dict of dicts describing the nids on the current
        node, keyed by nid name, each holding "nid_address", "lnd_type"
        and "lnd_network".
    """
    try:
        lines = AgentShell.try_run(
            ["lctl", "get_param", "-n", "nis"]).split("\n")
    except Exception as err:
        # Format the exception itself: the ``message`` attribute is
        # deprecated, is empty for most exceptions and does not exist on
        # Python 3, so the reason for the failure was lost (or the handler
        # itself failed).
        daemon_log.warning("get_nids: failed to open: {}".format(err))
        return LinuxNetworkDevicePlugin.cached_results

    result = {}

    # Skip header line, then parse each NID string.
    for line in lines[1:]:
        if not line:
            continue

        try:
            lnet_nid = LNetNid(line, interfaces)
        except NetworkInterfaces.InterfaceNotFound as e:
            daemon_log.warning(e)
            continue

        if lnet_nid.lnd_type not in EXCLUDE_INTERFACES:
            result[lnet_nid.name] = {
                "nid_address": lnet_nid.nid_address,
                "lnd_type": lnet_nid.lnd_type,
                "lnd_network": lnet_nid.lnd_network,
            }

    LinuxNetworkDevicePlugin.cache_results(raw_result=result)

    return result
def _read_crm_mon_as_xml(self):
    """Run crm_mon --one-shot --as-xml, return raw output or None

    Returns the command's stdout when its return code is one of the
    expected values (0, 10). Returns None when the crm_mon executable does
    not exist, or - after logging a warning - when the return code is
    unexpected. Any other OSError is re-raised.
    """
    command = ["crm_mon", "--one-shot", "--as-xml"]

    try:
        rc, stdout, stderr = AgentShell.run_old(command)
    except OSError as e:
        if e.errno == errno.ENOENT:
            # ENOENT is fine here. Pacemaker might not be installed yet.
            return None
        raise e

    # 10 Corosync is not running on this node
    if rc in (0, 10):
        return stdout

    daemon_log.warning("rc=%s running '%s': '%s' '%s'"
                       % (rc, command, stdout, stderr))
    return None
def _lnet_devices(self, interfaces):
    '''
    Describe the LNet nids configured on the current node.

    The nids are read from /proc/sys/lnet/nis; when that file cannot be
    opened the previously cached result is returned. The fresh result is
    cached before it is returned.

    :param interfaces: A list of the interfaces on the current node
    :return: Returns a dict of dicts describing the nids on the current node.
    '''
    # Read active NIDs from /proc
    try:
        with open("/proc/sys/lnet/nis") as nis_file:
            nis_lines = nis_file.readlines()
    except IOError:
        daemon_log.warning("get_nids: failed to open")
        return LinuxNetworkDevicePlugin.cached_results

    # Parse each NID string (everything after the header line); a nid whose
    # interface is not found is logged and left out.
    parsed_nids = []
    for nid_line in nis_lines[1:]:
        try:
            parsed_nids.append(LNetNid(nid_line, interfaces))
        except NetworkInterfaces.InterfaceNotFound as e:
            daemon_log.warning(e)

    result = {}
    for nid in parsed_nids:
        if nid.lnd_type in EXCLUDE_INTERFACES:
            continue

        result[nid.name] = {
            'nid_address': nid.nid_address,
            'lnd_type': nid.lnd_type,
            'lnd_network': nid.lnd_network,
        }

    LinuxNetworkDevicePlugin.cache_results(raw_result=result)

    return result
def send(self):
    """Return True if the POST succeeds, else False

    Messages are drained from the retry queue first and then from the
    primary queue until both are empty or the next message would push the
    POST past MAX_BYTES_PER_POST; a message that does not fit is put on the
    retry queue. A single message larger than the limit is logged and still
    sent when it is the first of the batch.

    If the POST fails, the session of every plugin that had a DATA message
    in the batch is terminated. The completion callbacks of all dequeued
    messages are invoked whether or not the POST succeeded.
    """
    envelope = {
        "messages": [],
        "server_boot_time": self._client.boot_time.isoformat() + "Z",
        "client_start_time": self._client.start_time.isoformat() + "Z",
    }

    batch = []
    callbacks = []
    # Running size of the POST body, starting with the empty envelope.
    batch_bytes = len(json.dumps(envelope))

    while True:
        try:
            message = self._retry_messages.get_nowait()
            daemon_log.debug("HttpWriter got message from retry queue")
        except Queue.Empty:
            try:
                message = self._messages.get_nowait()
                daemon_log.debug(
                    "HttpWriter got message from primary queue")
            except Queue.Empty:
                break

        # NOTE(review): the callback is collected before the overflow check
        # below, so a message that gets re-queued still has its callback
        # run by this call - confirm that this is intended.
        if message.callback:
            callbacks.append(message.callback)

        message_length = len(json.dumps(message.dump(self._client._fqdn)))

        if message_length > MAX_BYTES_PER_POST:
            daemon_log.warning("Oversized message %s/%s: %s" % (
                message_length,
                MAX_BYTES_PER_POST,
                message.dump(self._client._fqdn),
            ))

        remaining_bytes = MAX_BYTES_PER_POST - batch_bytes
        if batch and message_length > remaining_bytes:
            # This message will not fit into this POST: pop it back into
            # the queue
            daemon_log.info(
                "HttpWriter message %s overflowed POST %s/%s (%d "
                "messages), enqueuing" % (
                    message.dump(self._client._fqdn),
                    message_length,
                    MAX_BYTES_PER_POST,
                    len(batch),
                ))
            self._retry_messages.put(message)
            break

        batch.append(message)
        batch_bytes += message_length

    daemon_log.debug("HttpWriter sending %s messages" % len(batch))

    try:
        envelope["messages"] = [
            queued.dump(self._client._fqdn) for queued in batch
        ]
        self._client.post(envelope)
    except HttpError:
        daemon_log.warning("HttpWriter: request failed")

        # Any message we drop will need its session killed: terminate any
        # sessions which we've just dropped messages for
        kill_sessions = set()
        for dropped in batch:
            if dropped.type == "DATA":
                kill_sessions.add(dropped.plugin_name)
        for plugin_name in kill_sessions:
            self._client.sessions.terminate(plugin_name)

        succeeded = False
    else:
        succeeded = True
    finally:
        for callback in callbacks:
            callback()

    return succeeded
def __init__(self):
    """
    Populate this dict with a description of the node's network interfaces.

    The interfaces are discovered from the output of ``ip addr`` and their
    byte counters are read from /proc/net/dev. Interfaces named in
    EXCLUDE_INTERFACES and interfaces that are slaves are left out (the
    original documentation says this is how the lo interface is excluded).

    Each entry is keyed by interface name and holds "mac_address",
    "inet4_address", "inet4_prefix", "inet6_address", "type", "rx_bytes",
    "tx_bytes", "up" and "slave".

    If an IOError occurs while gathering the data, a warning is logged and
    the dict is left empty.
    """

    def interface_to_lnet_type(if_type):
        """
        To keep everything consistent we report network types as the lnd
        name, not the linux name. We have to translate somewhere so do it
        at source; if the user ever needs to see it as Linux types we can
        translate back.

        There is a train of thought that says if it is unknown we should
        cause an exception which means the app will not work. I prefer to
        try an approach that says returning just the unknown might well
        work, and if not it causes an exception somewhere else.
        """
        return self.network_translation.get(if_type.lower(), if_type.lower())

    try:
        ip_out = AgentShell.try_run(["ip", "addr"])

        with open("/proc/net/dev") as dev_file:
            dev_stats = dev_file.readlines()
    except IOError:
        daemon_log.warning("ip: failed to run")
        return

    # Parse the ip command output and create a list of lists, where each
    # entry is the output from one device.
    device_lines = []
    devices = [device_lines]
    for line in ip_out.split("\n"):
        if line and line[0] != " ":
            # First line of a new device.
            if device_lines:
                device_lines = []
                devices.append(device_lines)

        device_lines.append(line)

    # Parse the /proc/net/dev output and create a dictionary of stats (just
    # rx_bytes, tx_bytes today) with an entry for each network port. The
    # input will look something like below.
    # Inter-|   Receive                                                |  Transmit
    #  face |bytes    packets errs drop fifo frame compressed multicast|bytes    packets errs drop fifo colls carrier compressed
    #     lo: 8305402   85521    0    0    0     0          0         0  8305402   85521    0    0    0     0       0          0
    #   eth0: 318398818 2069564    0    0    0     0          0         0  6622219   50337    0    0    0     0       0          0
    #   eth1:  408736    7857    0    0    0     0          0         0  4347300   35206    0    0    0     0       0          0
    if NetworkInterfaces.proc_net_dev_keys == {}:
        keys = dev_stats[1].replace("|", " ").split()
        keys = keys[1:]  # drop the "face" column heading

        # The first half of the headings are receive counters, the second
        # half transmit counters. Position 0 of a parsed row is the
        # interface name, hence the +1.
        half = len(keys) // 2
        for index, key in enumerate(keys):
            prefix = "rx_" if index < half else "tx_"
            NetworkInterfaces.proc_net_dev_keys[prefix + key] = index + 1

    rx_index = NetworkInterfaces.proc_net_dev_keys["rx_bytes"]
    tx_index = NetworkInterfaces.proc_net_dev_keys["tx_bytes"]

    proc_net_dev_values = {}

    # Data is on the 3rd line to the end, so get the values for each entry.
    for line in dev_stats[2:]:
        # Split on the colon rather than on whitespace: the first counter
        # can directly follow the colon (e.g. "eth0:318398818") when no
        # separating space is printed, which would otherwise corrupt the
        # name and shift every counter by one position.
        name, separator, counters = line.rpartition(":")
        if not separator:
            # Not an "<interface>: <counters>" row (e.g. a blank line).
            continue

        values = [name.strip()] + counters.split()
        proc_net_dev_values[values[0]] = NetworkInterface.RXTXStats(
            values[rx_index],
            values[tx_index],
        )

    # Now create a network interface for each of the entries.
    for device_lines in devices:
        interface = NetworkInterface(device_lines, proc_net_dev_values)

        if (interface.interface not in EXCLUDE_INTERFACES) and (interface.slave is False):
            self[interface.interface] = {
                "mac_address": interface.mac_address,
                "inet4_address": interface.inet4_addr,
                "inet4_prefix": interface.inet4_prefix,
                "inet6_address": interface.inet6_addr,
                "type": interface_to_lnet_type(interface.type),
                "rx_bytes": interface.rx_tx_stats.rx_bytes,
                "tx_bytes": interface.rx_tx_stats.tx_bytes,
                "up": interface.up,
                "slave": interface.slave,
            }
class CorosyncPlugin(DevicePlugin):
    """ Agent Plugin to read corosync node health status information

    This plugin will run on all nodes and report about the health of
    all nodes in its peer group.

    See also the chroma_core/services/corosync

    Node status is reported as a dictionary of host names containing
    all of the possible crm_mon data as attributes:
    {
        'node1': {name: attr, name: attr...}
        'node2': {name: attr, name: attr...}
    }

    datetime is passed in localtime converted to UTC.

    Based on xml output from this version of corosync/pacemaker
    crm --version
    1.1.7-6.el6 (Build 148fccfd5985c5590cc601123c6c16e966b85d14)
    """

    # This is the message that crm_mon will report
    # when corosync is not running
    COROSYNC_CONNECTION_FAILURE = ("Connection to cluster failed: "
                                   "connection failed")

    def _parse_crm_as_xml(self, raw):
        """ Parse the crm_mon response

        :param raw: text produced by crm_mon.
        :return: dict with the keys 'datetime', 'nodes' and 'options', or
            None when ``raw`` is not parseable XML. In the latter case a
            warning is logged unless ``raw`` contains the known
            COROSYNC_CONNECTION_FAILURE message.
        """

        return_dict = None

        try:
            root = xml.fromstring(raw)
        except ParseError:
            # not xml, might be a known error message
            if CorosyncPlugin.COROSYNC_CONNECTION_FAILURE not in raw:
                daemon_log.warning("Bad xml from corosync crm_mon:  %s" % raw)
        else:
            return_dict = {}

            # Got node info, pack it up and return
            tm_str = root.find('summary/last_update').get('time')
            tm_datetime = IMLDateTime.strptime(tm_str, '%a %b %d %H:%M:%S %Y')
            return_dict.update({
                'datetime': IMLDateTime.convert_datetime_to_utc(
                    tm_datetime).strftime("%Y-%m-%dT%H:%M:%S+00:00")
            })

            nodes = {}
            for node in root.findall("nodes/node"):
                host = node.get("name")
                nodes.update({host: node.attrib})

            return_dict['nodes'] = nodes

            # stonith is reported as disabled unless cluster_options says
            # otherwise.
            return_dict['options'] = {'stonith_enabled': False}

            cluster_options = root.find('summary/cluster_options')

            if cluster_options is not None:
                return_dict['options'].update({
                    'stonith_enabled':
                        cluster_options.get('stonith-enabled') == 'true'
                })

        return return_dict

    def _read_crm_mon_as_xml(self):
        """Run crm_mon --one-shot --as-xml, return raw output or None

        For expected return values (0, 10), return the stdout from output.
        If the return value is unexpected, log a warning, and return None.
        None is also returned when the crm_mon executable does not exist;
        any other OSError is re-raised.
        """

        crm_command = ['crm_mon', '--one-shot', '--as-xml']

        try:
            rc, stdout, stderr = AgentShell.run_old(crm_command)
        except OSError as e:
            # ENOENT is fine here.  Pacemaker might not be installed yet.
            if e.errno != errno.ENOENT:
                raise
            # Nothing was run, so there is no rc/stdout to inspect; falling
            # through would fail on the unbound ``rc``.
            return None

        if rc not in [0, 10]:  # 10 Corosync is not running on this node
            daemon_log.warning("rc=%s running '%s': '%s' '%s'" %
                               (rc, crm_command, stdout, stderr))
            stdout = None

        return stdout