class Service(ChromaService):
    PLUGIN_NAME = 'lustre'

    def __init__(self):
        super(Service, self).__init__()

        self._queue = AgentRxQueue(Service.PLUGIN_NAME)
        self._queue.purge()

    def run(self):
        super(Service, self).run()

        self._queue.serve(data_callback=self.on_data)

    def on_data(self, fqdn, data):
        with transaction.commit_manually():
            # Commit any outstanding transaction so that the queries
            # below see a fresh view of the database.
            transaction.commit()

        try:
            host = ManagedHost.objects.get(fqdn=fqdn)
            UpdateScan().run(host.id, data)
        except Exception:
            log.error("Error handling lustre message: %s",
                      '\n'.join(traceback.format_exception(*sys.exc_info())))

    def stop(self):
        super(Service, self).stop()

        self._queue.stop()
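# A minimal standalone sketch (hypothetical names, not part of this codebase)
# of the serve()/data_callback contract the services here rely on: serve()
# blocks, handing each queued message body to the callback along with the
# sender's FQDN, until stop() is called.  AgentRxQueue's real semantics
# (AMQP-backed, session-aware) are assumed, not shown.
try:
    import queue  # Python 3
except ImportError:
    import Queue as queue  # Python 2


class ToyRxQueue(object):
    """Stand-in for AgentRxQueue: a thread-safe FIFO with a blocking serve loop."""

    _STOP = object()  # sentinel telling serve() to return

    def __init__(self):
        self._q = queue.Queue()

    def put(self, fqdn, data):
        self._q.put((fqdn, data))

    def serve(self, data_callback):
        # Block, delivering each message body to the callback, until stop().
        while True:
            item = self._q.get()
            if item is self._STOP:
                return
            fqdn, data = item
            data_callback(fqdn, data)

    def stop(self):
        self._q.put(self._STOP)


if __name__ == "__main__":
    q = ToyRxQueue()
    q.put("oss1.example.com", {"mounts": []})
    q.stop()
    # Drains the one queued message, then returns on the stop sentinel.
    q.serve(data_callback=lambda fqdn, data: None)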
class AgentPluginHandler(object):
    """Handle messages sent from the agent.

    Similar to ScanDaemon, the main difference being that ScanDaemon polls
    plugin callbacks, whereas AgentDaemon is driven by a message queue.

    Plugins might be loaded by both AgentDaemon and ScanDaemon: the
    ScanDaemon side connects out to storage controllers, while the
    AgentDaemon side receives notifications of devices appearing on
    Lustre servers.

    Creates one plugin instance per plugin per host which sends messages
    for that plugin.
    """

    def __init__(self, resource_manager, plugin_name):
        from chroma_core.lib.storage_plugin.manager import storage_plugin_manager

        self._resource_manager = resource_manager
        # Map of host ID to Session
        self._sessions = {}
        self._stopping = False
        self._processing_lock = threading.Lock()
        self._plugin_name = plugin_name
        self._plugin_klass = storage_plugin_manager.get_plugin_class(plugin_name)
        self._queue = AgentRxQueue(self._plugin_name)
        # Disregard any old messages
        self._queue.purge()

    def stop(self):
        self._queue.stop()

    def run(self):
        self._queue.serve(session_callback=self.on_message)

    def remove_host_resources(self, host_id):
        log.info("Removing resources for host %s, plugin %s" % (host_id, self._plugin_name))

        # Stop the session, and block it from starting again
        with self._processing_lock:
            try:
                del self._sessions[host_id]
            except KeyError:
                log.warning("remove_host_resources: No session for host %s" % host_id)

        try:
            record = ResourceQuery().get_record_by_attributes(
                "linux", "PluginAgentResources",
                host_id=host_id, plugin_name=self._plugin_name)
        except StorageResourceRecord.DoesNotExist:
            pass
        else:
            self._resource_manager.global_remove_resource(record.id)

        log.info("AgentDaemon: finished removing resources for host %s" % host_id)

    def setup_host(self, host_id, data):
        with self._processing_lock:
            session = self._sessions.get(host_id, None)

            assert session is not None

            session.plugin.do_agent_session_continue(data)

    @transaction.atomic
    def update_host_resources(self, host_id, data):
        with self._processing_lock:
            session = self._sessions.get(host_id, None)
            if session:
                session.plugin.do_agent_session_continue(data)

    def _create_plugin_instance(self, host):
        from chroma_core.lib.storage_plugin.manager import storage_plugin_manager

        resource_class, resource_class_id = storage_plugin_manager.get_plugin_resource_class(
            "linux", "PluginAgentResources")

        # FIXME: it is weird that the PluginAgentResources class lives in the
        # linux plugin but is used by all of them
        record, created = StorageResourceRecord.get_or_create_root(
            resource_class, resource_class_id,
            {"plugin_name": self._plugin_name, "host_id": host.id})

        return self._plugin_klass(self._resource_manager, record.id)

    def on_message(self, message):
        with self._processing_lock:
            fqdn = message["fqdn"]
            assert message["plugin"] == self._plugin_name

            if message["type"] != "DATA":
                # We are session-aware in that we check sequence numbers etc.,
                # but we don't actually require any actions on SESSION_CREATE
                # or SESSION_TERMINATE.
                assert message["type"] in ("SESSION_CREATE", "SESSION_TERMINATE")
                return

            try:
                host = ManagedHost.objects.get(fqdn=fqdn)
            except ManagedHost.DoesNotExist:
                log.error("Received agent message for non-existent host %s" % fqdn)
                return

            log.debug("Received agent message for %s/%s/%s" % (
                fqdn, message["plugin"], message["session_id"]))

            existing_session = self._sessions.get(host.id, None)
            if existing_session is None:
                if message["session_seq"] == 0:
                    # No existing session, start a new one
                    log.info("New session")
                    self._sessions[host.id] = Session(
                        message["session_id"], self._create_plugin_instance(host))
                else:
                    # Partway through a session, reset it
                    log.info("Nonzero counter for new (to me) session, resetting")
                    HttpAgentRpc().reset_session(fqdn, self._plugin_name, message["session_id"])
                    return
            elif existing_session.id != message["session_id"]:
                if message["session_seq"] == 0:
                    # Existing session to be replaced with this one
                    log.info("Replacing session %s/%s with %s/%s" % (
                        self._plugin_name, existing_session.id,
                        self._plugin_name, message["session_id"]))
                    self._sessions[host.id] = Session(
                        message["session_id"], self._create_plugin_instance(host))
                else:
                    # Existing session is dead, new session is not at zero, must send a reset
                    log.info("Nonzero counter for new (to me) replacement session, resetting")
                    del self._sessions[host.id]
                    HttpAgentRpc().reset_session(fqdn, self._plugin_name, message["session_id"])
                    return
            else:
                if message["session_seq"] == existing_session.seq + 1:
                    # Continuation of session
                    pass
                else:
                    # Got out of sequence, reset it
                    log.info("Out of sequence message (seq %s, expected %s), resetting" % (
                        message["session_seq"], existing_session.seq + 1))
                    del self._sessions[host.id]
                    HttpAgentRpc().reset_session(fqdn, self._plugin_name, message["session_id"])
                    return

            session = self._sessions.get(host.id, None)
            try:
                if message["session_seq"] == 0:
                    session.plugin.do_agent_session_start(message["body"])
                else:
                    session.seq += 1
                    session.plugin.do_agent_session_continue(message["body"])
            except Exception:
                backtrace = "\n".join(traceback.format_exception(*sys.exc_info()))
                log.error("Exception in agent session for %s from %s: %s" % (
                    self._plugin_name, host, backtrace))
                log.error("Data: %s" % message["body"])
                HttpAgentRpc().reset_session(fqdn, self._plugin_name, message["session_id"])
class Service(ChromaService):
    """Corosync host offline detection service

    The corosync agent reports host status for all nodes in its peer
    group.  Any node that is down according to corosync is recorded
    here and in the DB, and an alert is saved.

    All nodes must keep closely synchronized clocks (e.g. via NTP):
    this service drops older reports that arrive late, so correct
    timing is critical.
    """

    PLUGIN_NAME = 'corosync'

    # Class to store the in-memory online/offline status and sample times;
    # a HostStatus object is created for each host that is reported
    HostStatus = namedtuple('HostStatus', ['status', 'datetime'])

    def __init__(self):
        super(Service, self).__init__()

        # Holds each host seen as a key, with the last HostStatus value set
        self._host_status = defaultdict(self.HostStatus)

        self._queue = AgentRxQueue(Service.PLUGIN_NAME)

    # Using the transaction decorator to ensure that subsequent calls
    # see fresh data when polling the ManagedHost model.
    @transaction.commit_on_success()
    def on_data(self, fqdn, body):
        """Process all incoming messages from the corosync agent plugin.

        Updates the recorded status for each reported peer; if the current
        state shows a host as offline, the corresponding alert is raised.
        Stale (older) messages are dropped.

        `datetime` is the node's local time in UTC, in the standard ISO
        string format.
        """
        try:
            host = ManagedHost.objects.get(fqdn=fqdn)
        except ManagedHost.DoesNotExist:
            # This might happen when we are deleting a host and the queues
            # mean a message is still sat waiting to be processed.  Something
            # has spoken to us and we don't know anything about it, so really
            # we can't do anything other than drop it.
            log.warning("Corosync message from unknown host %s, the message was dropped." % fqdn)
            return

        # If corosync is not configured yet, or we don't actually have
        # corosync, then ignore the input
        if (not host.corosync_configuration) or host.corosync_configuration.state == 'unconfigured':
            return

        if body.get('state'):
            job_scheduler_notify.notify(host.corosync_configuration,
                                        timezone.now(),
                                        {'state': body['state']['corosync']})

            job_scheduler_notify.notify(host.pacemaker_configuration,
                                        timezone.now(),
                                        {'state': body['state']['pacemaker']})

            if body['state']['corosync'] == 'stopped':
                return
        else:
            if host.corosync_configuration.state != 'started':
                return

        if body.get('crm_info'):
            nodes = body['crm_info']['nodes']
            dt = body['crm_info']['datetime']

            options = body['crm_info'].get('options', {'stonith_enabled': None})
            stonith_enabled = options['stonith_enabled']

            try:
                dt = IMLDateTime.parse(dt)
            except ValueError:
                if dt != '':
                    log.warning("Invalid date or tz string from corosync plugin: %s" % dt)
                    raise

            def is_new(peer_node_identifier):
                return (peer_node_identifier not in self._host_status or
                        self._host_status[peer_node_identifier].datetime < dt)

            peers_str = "; ".join(["%s: online=%s, new=%s" % (
                peer_node_identifier, data['online'], is_new(peer_node_identifier))
                for peer_node_identifier, data in nodes.items()])
            log.debug("Incoming peer report from %s: %s" % (fqdn, peers_str))

            # NB: this will ignore any unknown peers in the report.
            cluster_nodes = ManagedHost.objects.select_related('ha_cluster_peers').filter(
                Q(nodename__in=nodes.keys()) | Q(fqdn__in=nodes.keys()))

            unknown_nodes = (set(nodes.keys()) -
                             set([h.nodename for h in cluster_nodes]) -
                             set([h.fqdn for h in cluster_nodes]))

            # Leaving this out for now: these alerts raise issues caused by
            # limitations in the simulator and the test system as a whole.
            # Whether they fire depends on past state, which is difficult
            # to predict.
            # CorosyncUnknownPeersAlert.notify(host.corosync_configuration, unknown_nodes != set())
            if unknown_nodes:
                log.warning("Unknown nodes in report from %s: %s" % (fqdn, unknown_nodes))

            if stonith_enabled is not None:
                StonithNotEnabledAlert.notify(host.corosync_configuration, stonith_enabled is False)

            CorosyncNoPeersAlert.notify(host.corosync_configuration, len(cluster_nodes) == 1)
            # CorosyncToManyPeersAlert.notify(host.corosync_configuration, len(cluster_nodes) > 2)

            # Consider all nodes in the peer group for this reporting agent
            for host in cluster_nodes:
                try:
                    data = nodes[host.nodename]
                    node_identifier = host.nodename
                except KeyError:
                    data = nodes[host.fqdn]
                    node_identifier = host.fqdn

                cluster_peer_keys = sorted([node.pk for node in cluster_nodes if node is not host])

                if is_new(node_identifier) and host.corosync_configuration:
                    host_reported_online = data['online'] == 'true'

                    log.debug("Corosync processing peer %s of %s" % (host.fqdn, fqdn))

                    # Raise an alert; the system suppresses duplicates
                    log.debug("Alert notify on %s: active=%s" % (host, not host_reported_online))
                    HostOfflineAlert.notify(host, not host_reported_online)
                    if not host_reported_online:
                        log.debug("Host %s offline" % host.fqdn)
                    else:
                        log.debug("Host %s online" % host.fqdn)

                    # Attempt to save the state.
                    if host.corosync_configuration.corosync_reported_up != host_reported_online:
                        job_scheduler_notify.notify(host.corosync_configuration,
                                                    timezone.now(),
                                                    {'corosync_reported_up': host_reported_online})

                    peer_host_peer_keys = sorted([h.pk for h in host.ha_cluster_peers.all()])
                    if peer_host_peer_keys != cluster_peer_keys:
                        job_scheduler_notify.notify(host,
                                                    timezone.now(),
                                                    {'ha_cluster_peers': cluster_peer_keys})

                    # Keep internal track of the host's state.
                    self._host_status[node_identifier] = self.HostStatus(
                        status=host_reported_online, datetime=dt)

    def run(self):
        super(Service, self).run()

        self._queue.serve(data_callback=self.on_data)

    def stop(self):
        super(Service, self).stop()

        self._queue.stop()
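# The "drop older reports" rule above depends only on the per-peer HostStatus
# timestamps.  A standalone, illustrative sketch of that check (names mirror
# the service but this is not the service's code):
from collections import namedtuple
from datetime import datetime, timedelta

HostStatus = namedtuple("HostStatus", ["status", "datetime"])


def is_new_report(host_status, peer, report_dt):
    """True if we have never seen this peer, or this report is newer than
    the last one recorded for it."""
    return peer not in host_status or host_status[peer].datetime < report_dt


host_status = {}
t0 = datetime(2020, 1, 1, 12, 0, 0)

assert is_new_report(host_status, "oss1", t0)  # never seen: process it
host_status["oss1"] = HostStatus(status=True, datetime=t0)
assert not is_new_report(host_status, "oss1", t0 - timedelta(seconds=5))  # late arrival: dropped
assert is_new_report(host_status, "oss1", t0 + timedelta(seconds=5))      # fresh: processed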
class AgentRpcMessenger(object):
    """
    This class consumes AgentRunnerPluginRxQueue, sends messages to
    AgentTxQueue, and maintains state for actions in progress.
    """

    # The name of the device plugin on the agent with which
    # this module will communicate
    PLUGIN_NAME = "action_runner"

    # A bit rubbish, but a tag so callers can know the failure was because
    # the server could not be contacted.  Improve with a different
    # Exception one day.
    COULD_NOT_CONTACT_TAG = "Could not contact server"

    # If no action_runner session is present when trying to run
    # an action, wait this long for one to show up
    SESSION_WAIT_TIMEOUT = 30

    def __init__(self):
        super(AgentRpcMessenger, self).__init__()

        # Session ID to a map of RPC ID to RPC
        self._session_rpcs = defaultdict(dict)

        # FQDN to session ID
        self._sessions = {}

        self._cancelled_rpcs = ExpiringList(10 * 60)

        self._action_runner_rx_queue = AgentRxQueue(AgentRpcMessenger.PLUGIN_NAME)
        self._action_runner_rx_queue.purge()

        self._lock = threading.Lock()

    def run(self):
        try:
            HttpAgentRpc().reset_plugin_sessions(AgentRpcMessenger.PLUGIN_NAME)
        except RpcTimeout:
            # Assume this means that the http_agent service isn't running: this
            # is acceptable, as our goal of there not being any sessions is
            # already the case.
            log.warning("Unable to reset %s sessions" % AgentRpcMessenger.PLUGIN_NAME)

        self._action_runner_rx_queue.serve(session_callback=self.on_rx)
        log.info("AgentRpcMessenger.complete")

    def stop(self):
        log.info("AgentRpcMessenger.stop")
        self._action_runner_rx_queue.stop()

    def complete_all(self):
        log.info("AgentRpcMessenger.complete_all")
        for session_id, rpc_id_to_rpc in self._session_rpcs.items():
            for rpc_id, rpc_state in rpc_id_to_rpc.items():
                log.info("AgentRpcMessenger.complete_all: erroring %s" % rpc_state.id)
                if not rpc_state.complete.is_set():
                    rpc_state.exception = "Cancelled due to service shutdown"
                    rpc_state.complete.set()

    def remove(self, fqdn):
        with self._lock:
            try:
                del self._sessions[fqdn]
            except KeyError:
                pass

    def _abort_session(self, fqdn, message, old_session_id, new_session_id=None):
        log.warning("AgentRpcMessenger.on_rx: aborting session %s because %s" % (
            old_session_id, message))

        old_rpcs = self._session_rpcs[old_session_id]

        if new_session_id is not None:
            self._sessions[fqdn] = new_session_id
        else:
            try:
                del self._sessions[fqdn]
            except KeyError:
                pass

        for rpc in old_rpcs.values():
            if new_session_id:
                log.warning("AgentRpcMessenger.on_rx: re-issuing RPC %s for session %s (was %s) because %s" % (
                    rpc.id, new_session_id, old_session_id, message))
                rpc.session_id = new_session_id
                self._resend(rpc)
            else:
                rpc.exception = "Communications error with %s because %s" % (fqdn, message)
                rpc.complete.set()

        del self._session_rpcs[old_session_id]

    def get_session_id(self, fqdn):
        with self._lock:
            try:
                return self._sessions[fqdn]
            except KeyError:
                return None

    def await_restart(self, fqdn, timeout, old_session_id=None):
        """If there is currently an action_runner session, wait for a
        different one.  Otherwise wait for any action_runner session to
        start."""

        if old_session_id is None:
            old_session_id = self.get_session_id(fqdn)

        log.info("AgentRpcMessenger.await_restart: awaiting %s (old %s)" % (fqdn, old_session_id))

        # Note: using polling here for simplicity; if efficiency became an
        # issue, we could set up events to be triggered by the new session
        # logic in on_rx, and sleep on them instead of polling.
        duration = 0
        poll_period = 1.0
        while True:
            current_session_id = self.get_session_id(fqdn)
            if current_session_id is not None and current_session_id != old_session_id:
                log.info("AgentRpcMessenger.await_restart: %s new %s" % (fqdn, current_session_id))
                break

            if duration >= timeout:
                log.info("AgentRpcMessenger.await_restart: %s timeout after %ss" % (fqdn, duration))
                # Give up: without this break the loop would spin forever
                break

            duration += poll_period
            time.sleep(poll_period)

    def on_rx(self, message):
        with self._lock:
            log.debug("on_rx: %s" % message)
            session_id = message["session_id"]
            fqdn = message["fqdn"]
            log.info("AgentRpcMessenger.on_rx: %s/%s" % (fqdn, session_id))

            if message["type"] == "SESSION_CREATE":
                if fqdn in self._sessions:
                    old_session_id = self._sessions[fqdn]
                    self._abort_session(fqdn, "new session created", old_session_id, session_id)
                else:
                    self._sessions[fqdn] = session_id
            elif message["type"] == "SESSION_TERMINATE":
                # An agent has timed out or restarted, we're being told its
                # session is now dead
                if message["fqdn"] in self._sessions:
                    self._abort_session(fqdn, "session terminated", message["session_id"])
            elif message["type"] == "SESSION_TERMINATE_ALL":
                # The http_agent service has restarted, all sessions are now
                # over (iterate over a copy: _abort_session mutates _sessions)
                for fqdn, session in list(self._sessions.items()):
                    self._abort_session(fqdn, "all sessions terminated", session)
            else:
                rpc_response = message["body"]
                if rpc_response["type"] != "ACTION_COMPLETE":
                    log.error("Unexpected type '%s'" % rpc_response["type"])
                    return

                if fqdn in self._sessions and self._sessions[fqdn] != session_id:
                    log.info("AgentRpcMessenger.on_rx: cancelling session %s/%s (replaced by %s)" % (
                        fqdn, self._sessions[fqdn], session_id))
                    self._abort_session(fqdn, "session cancelled", self._sessions[fqdn])
                    HttpAgentRpc().reset_session(fqdn, AgentRpcMessenger.PLUGIN_NAME, session_id)
                elif fqdn in self._sessions:
                    log.info("AgentRpcMessenger.on_rx: good session %s/%s" % (fqdn, session_id))
                    # Find this RPC and complete it
                    try:
                        rpc = self._session_rpcs[session_id][rpc_response["id"]]
                    except KeyError:
                        if rpc_response["id"] in self._cancelled_rpcs:
                            log.debug("Response received from a cancelled RPC (id: %s)",
                                      rpc_response["id"])
                        else:
                            log.error("Response received from UNKNOWN RPC (id: %s)",
                                      rpc_response["id"])
                    else:
                        del self._session_rpcs[session_id][rpc_response["id"]]
                        rpc.exception = rpc_response["exception"]
                        rpc.result = rpc_response["result"]
                        rpc.subprocesses = rpc_response["subprocesses"]
                        log.info("AgentRpcMessenger.on_rx: completing rpc %s" % rpc.id)
                        rpc.complete.set()
                else:
                    # A session we never heard of?
                    log.info("AgentRpcMessenger.on_rx: unknown session %s/%s" % (fqdn, session_id))
                    HttpAgentRpc().reset_session(fqdn, AgentRpcMessenger.PLUGIN_NAME, session_id)

    def _resend(self, rpc):
        log.debug("AgentRpcMessenger._resend: rpc %s in session %s" % (rpc.id, rpc.session_id))
        self._session_rpcs[rpc.session_id][rpc.id] = rpc
        AgentTxQueue().put(rpc.get_request())

    def _send_request(self, fqdn, action, args):
        if not self.await_session(fqdn, AgentRpcMessenger.SESSION_WAIT_TIMEOUT):
            log.error("No %s session for %s after %s seconds" % (
                AgentRpcMessenger.PLUGIN_NAME, fqdn, AgentRpcMessenger.SESSION_WAIT_TIMEOUT))
            raise AgentException(
                fqdn, action, args,
                "%s %s no session after %s seconds" % (
                    self.COULD_NOT_CONTACT_TAG, fqdn, AgentRpcMessenger.SESSION_WAIT_TIMEOUT),
            )

        with self._lock:
            try:
                session_id = self._sessions[fqdn]
            except KeyError:
                # This could happen in spite of the earlier check, as that
                # was outside the lock.
                log.warning("AgentRpcMessenger._send: no session for %s" % fqdn)
                raise AgentException(fqdn, action, args,
                                     "%s %s" % (self.COULD_NOT_CONTACT_TAG, fqdn))

            log.debug("AgentRpcMessenger._send: using session %s" % session_id)

            rpc = ActionInFlight(session_id, fqdn, action, args)

            self._session_rpcs[session_id][rpc.id] = rpc
            AgentTxQueue().put(rpc.get_request())
            return rpc

    def _send_cancellation(self, rpc):
        with self._lock:
            try:
                self._session_rpcs[rpc.session_id][rpc.id]
            except KeyError:
                log.warning("Dropping cancellation of RPC %s, it is already complete or aborted" % rpc.id)
            else:
                log.warning("Cancelling RPC %s" % rpc.id)
                AgentTxQueue().put(rpc.get_cancellation())
                del self._session_rpcs[rpc.session_id][rpc.id]

    def _complete(self, rpc, cancel_event):
        log.info("AgentRpcMessenger._complete: starting wait for rpc %s" % rpc.id)

        # Wait for rpc.complete, waking up every second to check cancel_event
        while True:
            if cancel_event.is_set():
                self._send_cancellation(rpc)
                self._cancelled_rpcs.append(rpc.id)
                raise AgentCancellation()
            else:
                rpc.complete.wait(timeout=1.0)
                if rpc.complete.is_set():
                    break

        log.info("AgentRpcMessenger._complete: completed wait for rpc %s" % rpc.id)

        if rpc.exception:
            raise AgentException(rpc.fqdn, rpc.action, rpc.args, rpc.exception,
                                 subprocesses=rpc.subprocesses)
        else:
            return rpc.result

    def call(self, fqdn, action, args, cancel_event):
        log.debug("AgentRpcMessenger.call: %s %s" % (fqdn, action))
        rpc = self._send_request(fqdn, action, args)
        return self._complete(rpc, cancel_event), rpc

    def await_session(self, fqdn, timeout):
        """
        Wait for the agent to connect back to the manager and hence be
        ready to accept commands.

        :param fqdn: FQDN of the agent we are waiting for
        :param timeout: how long to wait before giving up
        :return: timeout remaining; 0 means failure, nonzero means success
            (and is useful for debugging)
        """
        while self.get_session_id(fqdn) is None and timeout > 0:
            # Allow a short wait for a session to show up; for example,
            # when running setup actions on a host we've just added, its
            # session may not yet have been fully established.
            log.info("AgentRpcMessenger._send: no session yet for %s, %s seconds remain" % (fqdn, timeout))
            timeout -= 1
            time.sleep(1)

        return timeout
class Service(ChromaService):
    PLUGIN_NAME = "systemd_journal"

    def __init__(self):
        super(Service, self).__init__()

        self._queue = AgentRxQueue(Service.PLUGIN_NAME)
        self._queue.purge()
        self._table_size = LogMessage.objects.count()
        self._parser = LogMessageParser()

    def _check_size(self):
        """Apply a size limit to the table of log messages."""
        MAX_ROWS_PER_TRANSACTION = 10000

        removed_num_entries = 0
        overflow_filename = os.path.join(settings.LOG_PATH, "db_log")

        if self._table_size > settings.DBLOG_HW:
            # Trim back down to the low watermark, spilling the removed
            # rows to an overflow file in batches.
            remove_num_entries = self._table_size - settings.DBLOG_LW

            trans_size = min(MAX_ROWS_PER_TRANSACTION, remove_num_entries)
            with transaction.atomic():
                while remove_num_entries > 0:
                    removed_entries = LogMessage.objects.all().order_by("id")[0:trans_size]
                    self.log.debug("writing %s batch of entries" % trans_size)

                    try:
                        # Using a context manager so the file is closed even
                        # on error, and `f` is never referenced unbound if
                        # open() itself fails.
                        with open(overflow_filename, "a") as f:
                            for line in removed_entries:
                                f.write("%s\n" % line)
                        LogMessage.objects.filter(id__lte=removed_entries[-1].id).delete()
                    except Exception as e:
                        self.log.error("error opening/writing/closing the db_log: %s" % e)

                    remove_num_entries -= trans_size
                    removed_num_entries += trans_size
                    if remove_num_entries < trans_size:
                        trans_size = remove_num_entries

            self._table_size -= removed_num_entries
            self.log.info("Wrote %s DB log entries to %s" % (removed_num_entries, overflow_filename))

        return removed_num_entries

    def on_data(self, fqdn, body):
        with transaction.atomic():
            with DelayedContextFrom(LogMessage) as log_messages:
                for msg in body["log_lines"]:
                    try:
                        log_messages.insert(dict(
                            fqdn=fqdn,
                            message=msg["message"],
                            severity=msg["severity"],
                            facility=msg["facility"],
                            tag=msg["source"],
                            datetime=IMLDateTime.parse(msg["datetime"]).as_datetime,
                            message_class=LogMessage.get_message_class(msg["message"]),
                        ))
                        self._table_size += 1
                        self._parser.parse(fqdn, msg)
                    except Exception as e:
                        self.log.error("Error %s ingesting systemd-journal entry: %s" % (e, msg))

    def run(self):
        super(Service, self).run()

        self._queue.serve(data_callback=self.on_data)

    def stop(self):
        super(Service, self).stop()

        self._queue.stop()
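# The watermark arithmetic in _check_size() above, isolated: once the table
# exceeds the high watermark, rows are trimmed back down to the low watermark
# in batches of at most MAX_ROWS_PER_TRANSACTION.  A sketch with made-up
# watermark values (DBLOG_HW/DBLOG_LW are deployment settings):

def batch_sizes(table_size, high_water, low_water, max_batch=10000):
    """Yield the successive delete-batch sizes _check_size() would use."""
    if table_size <= high_water:
        return
    remaining = table_size - low_water
    batch = min(max_batch, remaining)
    while remaining > 0:
        yield batch
        remaining -= batch
        if remaining < batch:
            # Final partial batch, mirroring the trans_size adjustment above
            batch = remaining


# e.g. 25,000 rows with HW=20,000 and LW=4,000 -> remove 21,000 rows
assert list(batch_sizes(25000, 20000, 4000)) == [10000, 10000, 1000]
assert sum(batch_sizes(25000, 20000, 4000)) == 21000
assert list(batch_sizes(15000, 20000, 4000)) == []  # below HW: nothing removed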