class Service(ChromaService):
    PLUGIN_NAME = 'lustre'

    def __init__(self):
        self._queue = AgentRxQueue(Service.PLUGIN_NAME)
        self._queue.purge()

    def run(self):
        super(Service, self).run()

        self._queue.serve(data_callback=self.on_data)

    def on_data(self, fqdn, data):
        with transaction.commit_manually():
            transaction.commit()

        try:
            host = ManagedHost.objects.get(fqdn=fqdn)
            UpdateScan().run(host.id, data)
        except Exception:
            log.error("Error handling lustre message: %s",
                      '\n'.join(traceback.format_exception(*(sys.exc_info()))))

    def stop(self):
        super(Service, self).stop()

        self._queue.stop()
class Service(ChromaService):
    PLUGIN_NAME = 'syslog'

    def __init__(self):
        super(Service, self).__init__()
        self._queue = AgentRxQueue(Service.PLUGIN_NAME)
        self._queue.purge()
        self._table_size = LogMessage.objects.count()
        self._parser = LogMessageParser()

        dse.patch_models()

    def _check_size(self):
        """Apply a size limit to the table of log messages"""
        MAX_ROWS_PER_TRANSACTION = 10000
        removed_num_entries = 0
        overflow_filename = os.path.join(settings.LOG_PATH, "db_log")

        if self._table_size > settings.DBLOG_HW:
            remove_num_entries = self._table_size - settings.DBLOG_LW

            trans_size = min(MAX_ROWS_PER_TRANSACTION, remove_num_entries)
            with transaction.commit_on_success():
                while remove_num_entries > 0:
                    removed_entries = LogMessage.objects.all().order_by(
                        'id')[0:trans_size]
                    self.log.debug("writing %s batch of entries" % trans_size)
                    try:
                        f = open(overflow_filename, "a")
                        for line in removed_entries:
                            f.write("%s\n" % line.__str__())
                        LogMessage.objects.filter(
                            id__lte=removed_entries[-1].id).delete()
                    except Exception, e:
                        self.log.error(
                            "error opening/writing/closing the db_log: %s" % e)
                    finally:
                        f.close()

                    remove_num_entries -= trans_size
                    removed_num_entries += trans_size
                    if remove_num_entries < trans_size:
                        trans_size = remove_num_entries
Esempio n. 3
0
class AgentPluginHandler(object):
    """Handle messages sent from the agent.

    Similar to ScanDaemon, the main difference is that ScanDaemon polls plugin callbacks,
    whereas AgentDaemon is driven by a message queue.

    Plugins might be loaded by both AgentDaemon and ScanDaemon, where the ScanDaemon side
    is connecting out to storage controllers, while the AgentDaemon side is getting notifications
    of devices popping up on Lustre servers.

    Creates one plugin instance per plugin per host which sends messages for that plugin.

    """
    def __init__(self, resource_manager, plugin_name):
        from chroma_core.lib.storage_plugin.manager import storage_plugin_manager

        self._resource_manager = resource_manager

        # Map of host ID to Session
        self._sessions = {}

        self._stopping = False
        self._processing_lock = threading.Lock()
        self._plugin_name = plugin_name
        self._plugin_klass = storage_plugin_manager.get_plugin_class(
            plugin_name)

        self._queue = AgentRxQueue(self._plugin_name)
        # Disregard any old messages
        self._queue.purge()

    def stop(self):
        self._queue.stop()

    def run(self):
        self._queue.serve(session_callback=self.on_message)

    def remove_host_resources(self, host_id):
        log.info("Removing resources for host %s, plugin %s" %
                 (host_id, self._plugin_name))

        # Stop the session, and block it from starting again
        with self._processing_lock:
            try:
                del self._sessions[host_id]
            except KeyError:
                log.warning("remove_host_resources: No session for host %s" %
                            host_id)

        try:
            record = ResourceQuery().get_record_by_attributes(
                "linux",
                "PluginAgentResources",
                host_id=host_id,
                plugin_name=self._plugin_name)
        except StorageResourceRecord.DoesNotExist:
            pass
        else:
            self._resource_manager.global_remove_resource(record.id)

        log.info("AgentDaemon: finished removing resources for host %s" %
                 host_id)

    def setup_host(self, host_id, data):
        with self._processing_lock:
            session = self._sessions.get(host_id, None)

            assert session is not None
            session.plugin.do_agent_session_continue(data)

    @transaction.atomic
    def update_host_resources(self, host_id, data):
        with self._processing_lock:
            session = self._sessions.get(host_id, None)

            if session:
                session.plugin.do_agent_session_continue(data)

    def _create_plugin_instance(self, host):
        from chroma_core.lib.storage_plugin.manager import storage_plugin_manager

        resource_class, resource_class_id = storage_plugin_manager.get_plugin_resource_class(
            "linux", "PluginAgentResources")
        # FIXME: it is weird that the PluginAgentResources class lives in the linux plugin but is used by all of them
        record, created = StorageResourceRecord.get_or_create_root(
            resource_class, resource_class_id, {
                "plugin_name": self._plugin_name,
                "host_id": host.id
            })

        return self._plugin_klass(self._resource_manager, record.id)

    def on_message(self, message):
        with self._processing_lock:
            fqdn = message["fqdn"]
            assert message["plugin"] == self._plugin_name

            if message["type"] != "DATA":
                # We are session aware in that we check sequence numbers etc, but
                # we don't actually require any actions on SESSION_CREATE or
                # SESSION_TERMINATE.
                assert message["type"] in ("SESSION_CREATE",
                                           "SESSION_TERMINATE")
                return

            try:
                host = ManagedHost.objects.get(fqdn=fqdn)
            except ManagedHost.DoesNotExist:
                log.error("Received agent message for non-existent host %s" %
                          fqdn)
                return

            log.debug("Received agent message for %s/%s/%s" %
                      (fqdn, message["plugin"], message["session_id"]))

            existing_session = self._sessions.get(host.id, None)
            if existing_session is None:
                if message["session_seq"] == 0:
                    # No existing session, start a new one
                    log.info("New session")
                    self._sessions[host.id] = Session(
                        message["session_id"],
                        self._create_plugin_instance(host))
                else:
                    # Partway through a session, reset it
                    log.info(
                        "Nonzero counter for new (to me) session, resetting")
                    HttpAgentRpc().reset_session(fqdn, self._plugin_name,
                                                 message["session_id"])
                    return

            elif existing_session.id != message["session_id"]:
                if message["session_seq"] == 0:
                    # Existing session to be replaced with this one
                    log.info("Replacing session %s/%s with %s/%s" %
                             (self._plugin_name, existing_session.id,
                              self._plugin_name, message["session_id"]))
                    self._sessions[host.id] = Session(
                        message["session_id"],
                        self._create_plugin_instance(host))
                else:
                    # Existing session is dead, new session is not at zero, must send a reset
                    log.info(
                        "Nonzero counter for new (to me) replacement session, resetting"
                    )
                    del self._sessions[host.id]
                    HttpAgentRpc().reset_session(fqdn, self._plugin_name,
                                                 message["session_id"])
                    return
            else:
                if message["session_seq"] == existing_session.seq + 1:
                    # Continuation of session
                    pass
                else:
                    # Got out of sequence, reset it
                    log.info(
                        "Out of sequence message (seq %s, expected %s), resetting"
                        % (message["session_seq"], existing_session.seq + 1))
                    del self._sessions[host.id]
                    HttpAgentRpc().reset_session(fqdn, self._plugin_name,
                                                 message["session_id"])
                    return

            session = self._sessions.get(host.id, None)
            try:
                if message["session_seq"] == 0:
                    session.plugin.do_agent_session_start(message["body"])
                else:
                    session.seq += 1
                    session.plugin.do_agent_session_continue(message["body"])
            except Exception:
                exc_info = sys.exc_info()
                backtrace = "\n".join(
                    traceback.format_exception(*(exc_info or sys.exc_info())))
                log.error("Exception in agent session for %s from %s: %s" %
                          (self._plugin_name, host, backtrace))
                log.error("Data: %s" % message["body"])

                HttpAgentRpc().reset_session(fqdn, self._plugin_name,
                                             message["session_id"])
class AgentRpcMessenger(object):
    """
    This class consumes AgentRunnerPluginRxQueue, sends
    messages to AgentTxQueue, and maintains state for
    actions in progress.
    """

    # The name of the device plugin on the agent with which
    # this module will communicate
    PLUGIN_NAME = "action_runner"

    # A bit rubbish, but a tag so callers can know the failure was because the server could not be contacted.
    # Improve with a different Exception one day.
    COULD_NOT_CONTACT_TAG = "Could not contact server"

    # If no action_runner session is present when trying to run
    # an action, wait this long for one to show up
    SESSION_WAIT_TIMEOUT = 30

    def __init__(self):
        super(AgentRpcMessenger, self).__init__()

        # session to list of RPC IDs
        self._session_rpcs = defaultdict(dict)

        # FQDN to session
        self._sessions = {}
        self._cancelled_rpcs = ExpiringList(10 * 60)

        self._action_runner_rx_queue = AgentRxQueue(
            AgentRpcMessenger.PLUGIN_NAME)
        self._action_runner_rx_queue.purge()

        self._lock = threading.Lock()

    def run(self):
        try:
            HttpAgentRpc().reset_plugin_sessions(AgentRpcMessenger.PLUGIN_NAME)
        except RpcTimeout:
            # Assume this means that the http_agent service isn't running: this
            # is acceptable, as our goal of there not being any sessions is
            # already the case.
            log.warning("Unable to reset %s sessions" %
                        AgentRpcMessenger.PLUGIN_NAME)

        self._action_runner_rx_queue.serve(session_callback=self.on_rx)
        log.info("AgentRpcMessenger.complete")

    def stop(self):
        log.info("AgentRpcMessenger.stop")
        self._action_runner_rx_queue.stop()

    def complete_all(self):
        log.info("AgentRpcMessenger.complete_all")
        for session_id, rpc_id_to_rpc in self._session_rpcs.items():
            for rpc_id, rpc_state in rpc_id_to_rpc.items():
                log.info("AgentRpcMessenger.complete_all: erroring %s" %
                         rpc_state.id)
                if not rpc_state.complete.is_set():
                    rpc_state.exception = "Cancelled due to service shutdown"
                    rpc_state.complete.set()

    def remove(self, fqdn):
        with self._lock:
            try:
                del self._sessions[fqdn]
            except KeyError:
                pass

    def _abort_session(self,
                       fqdn,
                       message,
                       old_session_id,
                       new_session_id=None):
        log.warning("AgentRpcMessenger.on_rx: aborting session %s because %s" %
                    (old_session_id, message))
        old_rpcs = self._session_rpcs[old_session_id]

        if new_session_id is not None:
            self._sessions[fqdn] = new_session_id
        else:
            try:
                del self._sessions[fqdn]
            except KeyError:
                pass

        for rpc in old_rpcs.values():
            if new_session_id:
                log.warning(
                    "AgentRpcMessenger.on_rx: re-issuing RPC %s for session %s (was %s) because %s"
                    % (rpc.id, new_session_id, old_session_id, message))
                rpc.session_id = new_session_id
                self._resend(rpc)
            else:
                rpc.exception = "Communications error with %s because %s" % (
                    fqdn, message)
                rpc.complete.set()
        del self._session_rpcs[old_session_id]

    def get_session_id(self, fqdn):
        with self._lock:
            try:
                return self._sessions[fqdn]
            except KeyError:
                return None

    def await_restart(self, fqdn, timeout, old_session_id=None):
        """
        If there is currently an action_runner session, wait for a different one.  Else
        wait for any action_runner session to start."""

        if old_session_id is None:
            old_session_id = self.get_session_id(fqdn)

        log.info("AgentRpcMessenger.await_restart: awaiting %s (old %s)" %
                 (fqdn, old_session_id))

        # Note: using polling here for simplicity, if efficiency became an issue here
        # we could set up events to be triggered by the new session logic in on_rx, and
        # sleep on them instead of polling.

        duration = 0
        poll_period = 1.0
        while True:
            current_session_id = self.get_session_id(fqdn)

            if current_session_id is not None and current_session_id != old_session_id:
                log.info("AgentRpcMessenger.await_restart: %s new %s" %
                         (fqdn, current_session_id))
                break

            if duration >= timeout:
                log.info(
                    "AgentRpcMessenger.await_restart: %s timeout after %ss" %
                    (fqdn, duration))

            duration += poll_period
            time.sleep(poll_period)

    def on_rx(self, message):
        with self._lock:
            log.debug("on_rx: %s" % message)
            session_id = message["session_id"]
            fqdn = message["fqdn"]
            log.info("AgentRpcMessenger.on_rx: %s/%s" % (fqdn, session_id))

            if message["type"] == "SESSION_CREATE":
                if fqdn in self._sessions:
                    old_session_id = self._sessions[fqdn]
                    self._abort_session(fqdn, "new session created",
                                        old_session_id, session_id)
                else:
                    self._sessions[fqdn] = session_id
            elif message["type"] == "SESSION_TERMINATE":
                # An agent has timed out or restarted, we're being told its session is now dead
                if message["fqdn"] in self._sessions:
                    self._abort_session(fqdn, "session terminated",
                                        message["session_id"])
            elif message["type"] == "SESSION_TERMINATE_ALL":
                # The http_agent service has restarted, all sessions are now over
                for fqdn, session in self._sessions.items():
                    self._abort_session(fqdn, "all sessions terminated",
                                        session)
            else:
                rpc_response = message["body"]
                if rpc_response["type"] != "ACTION_COMPLETE":
                    log.error("Unexpected type '%s'" % rpc_response["type"])
                    return

                if fqdn in self._sessions and self._sessions[
                        fqdn] != session_id:
                    log.info(
                        "AgentRpcMessenger.on_rx: cancelling session %s/%s (replaced by %s)"
                        % (fqdn, self._sessions[fqdn], session_id))
                    self._abort_session(fqdn, "session cancelled",
                                        self._sessions[fqdn])
                    HttpAgentRpc().reset_session(fqdn,
                                                 AgentRpcMessenger.PLUGIN_NAME,
                                                 session_id)
                elif fqdn in self._sessions:
                    log.info("AgentRpcMessenger.on_rx: good session %s/%s" %
                             (fqdn, session_id))
                    # Find this RPC and complete it
                    try:
                        rpc = self._session_rpcs[session_id][
                            rpc_response["id"]]
                    except KeyError:
                        if rpc_response["id"] in self._cancelled_rpcs:
                            log.debug(
                                "Response received from a cancelled RPC (id: %s)",
                                rpc_response["id"])
                        else:
                            log.error(
                                "Response received from UNKNOWN RPC of (id: %s)",
                                rpc_response["id"])
                    else:
                        del self._session_rpcs[session_id][rpc_response["id"]]
                        rpc.exception = rpc_response["exception"]
                        rpc.result = rpc_response["result"]
                        rpc.subprocesses = rpc_response["subprocesses"]
                        log.info("AgentRpcMessenger.on_rx: completing rpc %s" %
                                 rpc.id)
                        rpc.complete.set()
                else:
                    log.info("AgentRpcMessenger.on_rx: unknown session %s/%s" %
                             (fqdn, session_id))
                    # A session I never heard of?
                    HttpAgentRpc().reset_session(fqdn,
                                                 AgentRpcMessenger.PLUGIN_NAME,
                                                 session_id)

    def _resend(self, rpc):
        log.debug("AgentRpcMessenger._resend: rpc %s in session %s" %
                  (rpc.id, rpc.session_id))
        self._session_rpcs[rpc.session_id][rpc.id] = rpc
        AgentTxQueue().put(rpc.get_request())

    def _send_request(self, fqdn, action, args):
        wait_count = 0

        if not self.await_session(fqdn,
                                  AgentRpcMessenger.SESSION_WAIT_TIMEOUT):
            log.error("No %s session for %s after %s seconds" %
                      (AgentRpcMessenger.PLUGIN_NAME, fqdn, wait_count))
            raise AgentException(
                fqdn,
                action,
                args,
                "%s %s no session after %s seconds" %
                (self.COULD_NOT_CONTACT_TAG, fqdn,
                 AgentRpcMessenger.SESSION_WAIT_TIMEOUT),
            )

        with self._lock:
            try:
                session_id = self._sessions[fqdn]
            except KeyError:
                # This could happen in spite of the earlier check, as that was outside the lock.
                log.warning("AgentRpcMessenger._send: no session for %s" %
                            fqdn)
                raise AgentException(
                    fqdn, action, args,
                    "%s %s" % (self.COULD_NOT_CONTACT_TAG, fqdn))

            log.debug("AgentRpcMessenger._send: using session %s" % session_id)

            rpc = ActionInFlight(session_id, fqdn, action, args)

            self._session_rpcs[session_id][rpc.id] = rpc
            AgentTxQueue().put(rpc.get_request())
            return rpc

    def _send_cancellation(self, rpc):
        with self._lock:
            try:
                self._session_rpcs[rpc.session_id][rpc.id]
            except KeyError:
                log.warning(
                    "Dropping cancellation of RPC %s, it is already complete or aborted"
                    % rpc.id)
            else:
                log.warning("Cancelling RPC %s" % rpc.id)
                AgentTxQueue().put(rpc.get_cancellation())
                del self._session_rpcs[rpc.session_id][rpc.id]

    def _complete(self, rpc, cancel_event):
        log.info("AgentRpcMessenger._complete: starting wait for rpc %s" %
                 rpc.id)

        # Wait for rpc.complete, waking up every second to
        # check cancel_event
        while True:
            if cancel_event.is_set():
                self._send_cancellation(rpc)
                self._cancelled_rpcs.append(rpc.id)
                raise AgentCancellation()
            else:
                rpc.complete.wait(timeout=1.0)
                if rpc.complete.is_set():
                    break

        log.info("AgentRpcMessenger._complete: completed wait for rpc %s" %
                 rpc.id)
        if rpc.exception:
            raise AgentException(rpc.fqdn,
                                 rpc.action,
                                 rpc.args,
                                 rpc.exception,
                                 subprocesses=rpc.subprocesses)
        else:
            return rpc.result

    def call(self, fqdn, action, args, cancel_event):
        log.debug("AgentRpcMessenger.call: %s %s" % (fqdn, action))
        rpc = self._send_request(fqdn, action, args)
        return self._complete(rpc, cancel_event), rpc

    def await_session(self, fqdn, timeout):
        """
        Wait for the agent to connect back to the manager and hence be ready to accept commands
        :param fqdn: fqdn of the agent we are waiting for
        :param timeout: how long to wait before quiting.
        :return: timeout remaining 0=failed, !0 is pass and useful for debug.
        """
        while self.get_session_id(fqdn) == None and timeout > 0:
            # Allow a short wait for a session to show up, for example
            # when running setup actions on a host we've just added its
            # session may not yet have been fully established
            log.info(
                "AgentRpcMessenger._send: no session yet for %s, %s seconds remain"
                % (fqdn, timeout))
            timeout -= 1
            time.sleep(1)

        return timeout
Esempio n. 5
0
class Service(ChromaService):
    PLUGIN_NAME = "systemd_journal"

    def __init__(self):
        super(Service, self).__init__()
        self._queue = AgentRxQueue(Service.PLUGIN_NAME)
        self._queue.purge()
        self._table_size = LogMessage.objects.count()
        self._parser = LogMessageParser()

    def _check_size(self):
        """Apply a size limit to the table of log messages"""
        MAX_ROWS_PER_TRANSACTION = 10000
        removed_num_entries = 0
        overflow_filename = os.path.join(settings.LOG_PATH, "db_log")

        if self._table_size > settings.DBLOG_HW:
            remove_num_entries = self._table_size - settings.DBLOG_LW

            trans_size = min(MAX_ROWS_PER_TRANSACTION, remove_num_entries)
            with transaction.atomic():
                while remove_num_entries > 0:
                    removed_entries = LogMessage.objects.all().order_by(
                        "id")[0:trans_size]
                    self.log.debug("writing %s batch of entries" % trans_size)
                    try:
                        f = open(overflow_filename, "a")
                        for line in removed_entries:
                            f.write("%s\n" % line.__str__())
                        LogMessage.objects.filter(
                            id__lte=removed_entries[-1].id).delete()
                    except Exception as e:
                        self.log.error(
                            "error opening/writing/closing the db_log: %s" % e)
                    finally:
                        f.close()

                    remove_num_entries -= trans_size
                    removed_num_entries += trans_size
                    if remove_num_entries < trans_size:
                        trans_size = remove_num_entries

        self._table_size -= removed_num_entries
        self.log.info("Wrote %s DB log entries to %s" %
                      (removed_num_entries, overflow_filename))

        return removed_num_entries

    def on_data(self, fqdn, body):
        with transaction.atomic():
            with DelayedContextFrom(LogMessage) as log_messages:
                for msg in body["log_lines"]:
                    try:
                        log_messages.insert(
                            dict(
                                fqdn=fqdn,
                                message=msg["message"],
                                severity=msg["severity"],
                                facility=msg["facility"],
                                tag=msg["source"],
                                datetime=IMLDateTime.parse(
                                    msg["datetime"]).as_datetime,
                                message_class=LogMessage.get_message_class(
                                    msg["message"]),
                            ))
                        self._table_size += 1

                        self._parser.parse(fqdn, msg)
                    except Exception as e:
                        self.log.error(
                            "Error %s ingesting systemd-journal entry: %s" %
                            (e, msg))

    def run(self):
        super(Service, self).run()

        self._queue.serve(data_callback=self.on_data)

    def stop(self):
        super(Service, self).stop()

        self._queue.stop()