def __init__(self):
        """Initialise the base service, the host status map and the receive queue."""
        super(Service, self).__init__()

        # One entry per host seen so far; the value is the HostStatus that was
        # most recently stored for that host.
        status_by_host = defaultdict(self.HostStatus)
        self._host_status = status_by_host

        # Queue constructed for this service's plugin name.
        plugin_queue = AgentRxQueue(Service.PLUGIN_NAME)
        self._queue = plugin_queue
예제 #2
0
    def __init__(self):
        """Set up the base service, the plugin receive queue and log bookkeeping."""
        super(Service, self).__init__()

        # Build the queue for this plugin and purge it before it is used.
        rx_queue = AgentRxQueue(Service.PLUGIN_NAME)
        self._queue = rx_queue
        rx_queue.purge()

        # Remember how many LogMessage rows exist right now.
        self._table_size = LogMessage.objects.count()
        self._parser = LogMessageParser()

        dse.patch_models()
예제 #3
0
    def __init__(self):
        """Create empty session/RPC bookkeeping, a purged receive queue and the lock."""
        super(AgentRpcMessenger, self).__init__()

        # Per-session RPC bookkeeping; an unseen session starts with an empty dict
        self._session_rpcs = defaultdict(dict)

        # Keyed by FQDN, value is that host's session
        self._sessions = {}

        # 10 * 60: presumably an expiry in seconds (ten minutes) -- confirm
        # against ExpiringList
        self._cancelled_rpcs = ExpiringList(10 * 60)

        rx_queue = AgentRxQueue(AgentRpcMessenger.PLUGIN_NAME)
        self._action_runner_rx_queue = rx_queue
        rx_queue.purge()

        self._lock = threading.Lock()
예제 #4
0
    def __init__(self, resource_manager, plugin_name):
        """Bind the handler to one plugin and start from an empty, purged queue.

        :param resource_manager: stored and later handed to plugin instances
        :param plugin_name: name used to look up the plugin class and to name
                            the receive queue
        """
        from chroma_core.lib.storage_plugin.manager import storage_plugin_manager
        self._resource_manager = resource_manager

        # host ID -> Session
        self._sessions = {}

        self._stopping = False
        self._processing_lock = threading.Lock()
        self._plugin_name = plugin_name

        plugin_klass = storage_plugin_manager.get_plugin_class(plugin_name)
        self._plugin_klass = plugin_klass

        # Old messages are of no interest: purge the queue straight away
        rx_queue = AgentRxQueue(self._plugin_name)
        self._queue = rx_queue
        rx_queue.purge()
예제 #5
0
class Service(ChromaService):
    """Service attached to the 'syslog' agent plugin queue.

    Keeps a cached count of LogMessage rows so that the table can be trimmed
    once it grows past the configured high-water mark.
    """

    PLUGIN_NAME = 'syslog'

    def __init__(self):
        """Initialise the base service, purge the receive queue and cache the row count."""
        super(Service, self).__init__()
        self._queue = AgentRxQueue(Service.PLUGIN_NAME)
        self._queue.purge()
        # Cached number of LogMessage rows; _check_size compares it with the
        # DBLOG_HW / DBLOG_LW settings.
        self._table_size = LogMessage.objects.count()
        self._parser = LogMessageParser()

        dse.patch_models()

    def _check_size(self):
        """Apply a size limit to the table of log messages.

        When the cached row count exceeds ``settings.DBLOG_HW``, the oldest rows
        (lowest ``id``) are appended to the ``db_log`` file under
        ``settings.LOG_PATH`` and then deleted, in batches of at most
        ``MAX_ROWS_PER_TRANSACTION``, until the count is down to
        ``settings.DBLOG_LW``.  A failure while writing or deleting is logged
        and stops the trimming; rows of the failed batch are not counted.

        :return: the number of rows that were written out and deleted
        """
        MAX_ROWS_PER_TRANSACTION = 10000
        removed_num_entries = 0
        overflow_filename = os.path.join(settings.LOG_PATH, "db_log")

        if self._table_size > settings.DBLOG_HW:
            remove_num_entries = self._table_size - settings.DBLOG_LW

            with transaction.commit_on_success():
                while remove_num_entries > 0:
                    trans_size = min(MAX_ROWS_PER_TRANSACTION,
                                     remove_num_entries)
                    # Materialise the batch as a list: a QuerySet does not
                    # support the negative index needed to find its last row.
                    batch = list(LogMessage.objects.all().order_by(
                        'id')[0:trans_size])
                    if not batch:
                        # The table holds fewer rows than the cached count
                        # suggested; nothing is left to trim.
                        break

                    self.log.debug("writing %s batch of entries" % len(batch))
                    try:
                        # The context manager closes the file even when a write
                        # fails, and never touches an unopened file.
                        with open(overflow_filename, "a") as f:
                            for line in batch:
                                f.write("%s\n" % str(line))
                        LogMessage.objects.filter(
                            id__lte=batch[-1].id).delete()
                    except Exception as e:
                        self.log.error(
                            "error opening/writing/closing the db_log: %s" % e)
                        # Retrying would re-read and re-write the same rows.
                        break

                    remove_num_entries -= len(batch)
                    removed_num_entries += len(batch)

        # Keep the cached count in step with what was really deleted, otherwise
        # every later call would try to trim the same amount again.
        self._table_size -= removed_num_entries
        self.log.info("Wrote %s DB log entries to %s" %
                      (removed_num_entries, overflow_filename))

        return removed_num_entries
class Service(ChromaService):
    """Service that passes messages from the 'lustre' agent plugin queue to UpdateScan."""

    PLUGIN_NAME = 'lustre'

    def __init__(self):
        """Initialise the base service and start from a purged receive queue."""
        # Initialise the ChromaService base before setting up our own state.
        super(Service, self).__init__()
        self._queue = AgentRxQueue(Service.PLUGIN_NAME)
        self._queue.purge()

    def run(self):
        """Run the base service, then serve the queue with on_data as data callback."""
        super(Service, self).run()

        self._queue.serve(data_callback=self.on_data)

    def on_data(self, fqdn, data):
        """Run an UpdateScan for the host named *fqdn* with the received *data*.

        Any exception (an unknown host included) is logged with its traceback
        and swallowed.

        :param fqdn: used to look up the ManagedHost
        :param data: passed through to UpdateScan().run
        """
        # NOTE(review): committing inside commit_manually presumably ends any
        # open transaction so the lookup below sees current rows -- confirm.
        with transaction.commit_manually():
            transaction.commit()

        try:
            host = ManagedHost.objects.get(fqdn=fqdn)
            UpdateScan().run(host.id, data)
        except Exception:
            log.error("Error handling lustre message: %s",
                      '\n'.join(traceback.format_exception(*(sys.exc_info()))))

    def stop(self):
        """Stop the base service, then stop the receive queue."""
        super(Service, self).stop()

        self._queue.stop()
예제 #7
0
class AgentPluginHandler(object):
    """Dispatch messages sent by the agent for one storage plugin.

    Comparable to ScanDaemon, except that ScanDaemon polls plugin callbacks
    while this side (AgentDaemon) is driven by a message queue.  A plugin may be
    loaded by both: ScanDaemon connects out to storage controllers, AgentDaemon
    is notified of devices appearing on Lustre servers.

    One plugin instance is created per plugin per host that sends messages for
    that plugin.  Session IDs and sequence numbers of incoming messages are
    checked, and the agent is asked to reset its session when they do not match.
    """

    def __init__(self, resource_manager, plugin_name):
        """Bind the handler to *plugin_name* and start from a purged queue.

        :param resource_manager: stored and handed to every plugin instance
        :param plugin_name: used to look up the plugin class and name the queue
        """
        from chroma_core.lib.storage_plugin.manager import storage_plugin_manager

        self._resource_manager = resource_manager

        # host ID -> Session
        self._sessions = {}

        self._stopping = False
        self._processing_lock = threading.Lock()
        self._plugin_name = plugin_name

        plugin_klass = storage_plugin_manager.get_plugin_class(plugin_name)
        self._plugin_klass = plugin_klass

        # Old messages are of no interest: purge the queue straight away
        rx_queue = AgentRxQueue(self._plugin_name)
        self._queue = rx_queue
        rx_queue.purge()

    def stop(self):
        """Stop the receive queue."""
        self._queue.stop()

    def run(self):
        """Serve the receive queue, handing every message to on_message."""
        self._queue.serve(session_callback=self.on_message)

    def remove_host_resources(self, host_id):
        """Forget the session of *host_id* and remove its PluginAgentResources record.

        A missing session is logged as a warning; a missing record is ignored.
        """
        log.info("Removing resources for host %s, plugin %s" %
                 (host_id, self._plugin_name))

        # Dropping the session under the lock stops it and keeps it from
        # being picked up again by a concurrent message
        with self._processing_lock:
            if host_id in self._sessions:
                del self._sessions[host_id]
            else:
                log.warning("remove_host_resources: No session for host %s" %
                            host_id)

        query = ResourceQuery()
        try:
            agent_record = query.get_record_by_attributes(
                "linux",
                "PluginAgentResources",
                host_id=host_id,
                plugin_name=self._plugin_name)
        except StorageResourceRecord.DoesNotExist:
            pass
        else:
            self._resource_manager.global_remove_resource(agent_record.id)

        log.info("AgentDaemon: finished removing resources for host %s" %
                 host_id)

    def setup_host(self, host_id, data):
        """Feed *data* to the plugin of the existing session for *host_id*.

        A session for the host must already exist (asserted).
        """
        with self._processing_lock:
            host_session = self._sessions.get(host_id)

            assert host_session is not None
            host_session.plugin.do_agent_session_continue(data)

    @transaction.atomic
    def update_host_resources(self, host_id, data):
        """Feed *data* to the plugin of the session for *host_id*, if there is one."""
        with self._processing_lock:
            host_session = self._sessions.get(host_id)

            if not host_session:
                return

            host_session.plugin.do_agent_session_continue(data)

    def _create_plugin_instance(self, host):
        """Return a new plugin instance rooted at the PluginAgentResources record of *host*."""
        from chroma_core.lib.storage_plugin.manager import storage_plugin_manager

        klass_and_id = storage_plugin_manager.get_plugin_resource_class(
            "linux", "PluginAgentResources")
        resource_class, resource_class_id = klass_and_id

        # FIXME: it is weird that the PluginAgentResources class lives in the linux plugin but is used by all of them
        root_attrs = {
            "plugin_name": self._plugin_name,
            "host_id": host.id
        }
        record, created = StorageResourceRecord.get_or_create_root(
            resource_class, resource_class_id, root_attrs)

        return self._plugin_klass(self._resource_manager, record.id)

    def on_message(self, message):
        """Validate one queue message against the host's session and run the plugin.

        Non-DATA messages are ignored.  A DATA message either starts a session
        (sequence number 0), continues the current one (next sequence number),
        or causes a session reset request to be sent to the agent.  An exception
        raised by the plugin is logged and also answered with a reset request.
        """
        with self._processing_lock:
            fqdn = message["fqdn"]
            assert message["plugin"] == self._plugin_name

            if message["type"] != "DATA":
                # Sequence numbers etc. are checked on DATA only; nothing has
                # to be done on SESSION_CREATE or SESSION_TERMINATE.
                assert message["type"] in ("SESSION_CREATE",
                                           "SESSION_TERMINATE")
                return

            try:
                host = ManagedHost.objects.get(fqdn=fqdn)
            except ManagedHost.DoesNotExist:
                log.error("Received agent message for non-existent host %s" %
                          fqdn)
                return

            log.debug("Received agent message for %s/%s/%s" %
                      (fqdn, message["plugin"], message["session_id"]))

            def request_reset():
                # Ask the agent to throw away the session this message is from
                HttpAgentRpc().reset_session(fqdn, self._plugin_name,
                                             message["session_id"])

            current = self._sessions.get(host.id)
            starts_session = message["session_seq"] == 0

            if current is None:
                if not starts_session:
                    # Joined partway through a session we know nothing about
                    log.info(
                        "Nonzero counter for new (to me) session, resetting")
                    request_reset()
                    return

                # Nothing known for this host yet: begin a session
                log.info("New session")
                self._sessions[host.id] = Session(
                    message["session_id"],
                    self._create_plugin_instance(host))

            elif current.id != message["session_id"]:
                if not starts_session:
                    # The known session is dead and the new one is already
                    # underway, so a reset is required
                    log.info(
                        "Nonzero counter for new (to me) replacement session, resetting"
                    )
                    del self._sessions[host.id]
                    request_reset()
                    return

                # A fresh session takes the place of the known one
                log.info("Replacing session %s/%s with %s/%s" %
                         (self._plugin_name, current.id,
                          self._plugin_name, message["session_id"]))
                self._sessions[host.id] = Session(
                    message["session_id"],
                    self._create_plugin_instance(host))

            elif message["session_seq"] != current.seq + 1:
                # Same session, but not the message that was due next
                log.info(
                    "Out of sequence message (seq %s, expected %s), resetting"
                    % (message["session_seq"], current.seq + 1))
                del self._sessions[host.id]
                request_reset()
                return

            active = self._sessions.get(host.id)
            try:
                if starts_session:
                    active.plugin.do_agent_session_start(message["body"])
                else:
                    active.seq += 1
                    active.plugin.do_agent_session_continue(message["body"])
            except Exception:
                backtrace = "\n".join(
                    traceback.format_exception(*sys.exc_info()))
                log.error("Exception in agent session for %s from %s: %s" %
                          (self._plugin_name, host, backtrace))
                log.error("Data: %s" % message["body"])

                request_reset()
예제 #8
0
 def __init__(self):
     """Create the receive queue for this service's plugin name and purge it."""
     rx_queue = AgentRxQueue(Service.PLUGIN_NAME)
     self._queue = rx_queue
     rx_queue.purge()
class Service(ChromaService):
    """Corosync host offline detection service

    The corosync agent will report host status for all nodes in its
    peer group.  Any node that is down according to corosync
    will be recorded here, and in the DB, and an alert will be saved.

    Be sure to have all nodes on the exact same time - ntp.  This service will
    drop older reports that come in late, so correct timing is critical.
    """

    PLUGIN_NAME = 'corosync'

    #  Class to store the in-memory online/offline status and sample times
    #  a HostStatus object is created for each host that is reported
    HostStatus = namedtuple('HostStatus', ['status', 'datetime'])

    def __init__(self):
        """Initialise the base service, the empty status map and the receive queue."""
        super(Service, self).__init__()

        #  Holds each host seen as a key with a HostStatus value last set.
        #  A plain dict: HostStatus needs two arguments, so it cannot serve as
        #  a default factory, and membership is always tested before a lookup.
        self._host_status = {}

        self._queue = AgentRxQueue(Service.PLUGIN_NAME)

    # Using transaction decorator to ensure that subsequent calls
    # see fresh data when polling the ManagedHost model.
    @transaction.commit_on_success()
    def on_data(self, fqdn, body):
        """Process all incoming messages from the Corosync agent plugin

        Request to have the status changed for an instance.  If the current
        state determines that a host is offline, then raise that alert.

        Old messages should not be processed: a peer is only updated when the
        report's datetime is newer than the one last stored for that peer.

        datetime is in UTC of the node's localtime in the standard
        ISO string format

        :param fqdn: FQDN of the reporting host; unknown hosts are dropped
        :param body: dict that may carry a 'state' entry (with 'corosync' and
                     'pacemaker' keys) and a 'crm_info' entry (with 'nodes',
                     'datetime' and optional 'options')
        :raises ValueError: when 'crm_info' carries a non-empty datetime string
                            that cannot be parsed
        """

        try:
            host = ManagedHost.objects.get(fqdn=fqdn)
        except ManagedHost.DoesNotExist:
            # This might happen when we are deleting a host and the queues mean a message is still sat waiting to be
            # processed. Something has spoken to us and we don't know anything about it so really we can't do anything
            # other than drop it.
            log.warning(
                "Corosync message from unknown host %s, the message was dropped."
                % fqdn)
            return

        # If corosync is not configured yet, or we don't actually have corosync - then ignore the input
        if (not host.corosync_configuration
            ) or host.corosync_configuration.state == 'unconfigured':
            return

        if body.get('state'):
            job_scheduler_notify.notify(host.corosync_configuration,
                                        timezone.now(),
                                        {'state': body['state']['corosync']})

            job_scheduler_notify.notify(host.pacemaker_configuration,
                                        timezone.now(),
                                        {'state': body['state']['pacemaker']})

            if body['state']['corosync'] == 'stopped':
                return
        else:
            if host.corosync_configuration.state != 'started':
                return

        if body.get('crm_info'):
            nodes = body['crm_info']['nodes']
            dt = body['crm_info']['datetime']

            options = body['crm_info'].get('options',
                                           {'stonith_enabled': None})
            stonith_enabled = options['stonith_enabled']

            try:
                dt = IMLDateTime.parse(dt)
            except ValueError:
                # NOTE(review): an empty datetime string is tolerated silently
                # and dt then stays '' for the comparisons below -- confirm
                # that this is intended.
                if dt != '':
                    log.warning(
                        "Invalid date or tz string from corosync plugin: %s" %
                        dt)
                    raise

            def is_new(peer_node_identifier):
                # New means never seen, or last stored with an older datetime
                return (peer_node_identifier not in self._host_status or
                        self._host_status[peer_node_identifier].datetime < dt)

            peers_str = "; ".join([
                "%s: online=%s, new=%s" %
                (peer_node_identifier, data['online'],
                 is_new(peer_node_identifier))
                for peer_node_identifier, data in nodes.items()
            ])
            log.debug("Incoming peer report from %s:  %s" % (fqdn, peers_str))

            # NB: This will ignore any unknown peers in the report.
            cluster_nodes = ManagedHost.objects.select_related(
                'ha_cluster_peers').filter(
                    Q(nodename__in=nodes.keys()) | Q(fqdn__in=nodes.keys()))

            unknown_nodes = set(nodes.keys()) - set([
                h.nodename for h in cluster_nodes
            ]) - set([h.fqdn for h in cluster_nodes])

            # Leaving this out for now - because they raise issue caused by limitations in the simulator and
            # test system as a whole. Difficult to know if they will or won't be raised it all depends on the past.
            # CorosyncUnknownPeersAlert.notify(host.corosync_configuration, unknown_nodes != set())
            if unknown_nodes:
                log.warning("Unknown nodes in report from %s: %s" %
                            (fqdn, unknown_nodes))

            if stonith_enabled is not None:
                StonithNotEnabledAlert.notify(host.corosync_configuration,
                                              stonith_enabled is False)

            CorosyncNoPeersAlert.notify(host.corosync_configuration,
                                        len(cluster_nodes) == 1)
            # CorosyncToManyPeersAlert.notify(host.corosync_configuration, len(cluster_nodes) > 2)

            #  Consider all nodes in the peer group for this reporting agent.
            #  The loop variable is named `peer` so that it does not rebind
            #  `host`, the reporting host.
            for peer in cluster_nodes:
                # A node may be reported under its nodename or under its fqdn
                try:
                    data = nodes[peer.nodename]
                    node_identifier = peer.nodename
                except KeyError:
                    data = nodes[peer.fqdn]
                    node_identifier = peer.fqdn

                cluster_peer_keys = sorted(
                    [node.pk for node in cluster_nodes if node is not peer])

                if is_new(node_identifier) and peer.corosync_configuration:
                    host_reported_online = data['online'] == 'true'

                    log.debug("Corosync processing "
                              "peer %s of %s " % (peer.fqdn, fqdn))

                    #  Raise an Alert - system suppresses duplicates
                    log.debug("Alert notify on %s: active=%s" %
                              (peer, not host_reported_online))
                    HostOfflineAlert.notify(peer, not host_reported_online)
                    if not host_reported_online:
                        log.debug("Host %s offline" % peer.fqdn)
                    else:
                        log.debug("Host %s online" % peer.fqdn)

                    #  Attempt to save the state.
                    if peer.corosync_configuration.corosync_reported_up != host_reported_online:
                        job_scheduler_notify.notify(
                            peer.corosync_configuration, timezone.now(),
                            {'corosync_reported_up': host_reported_online})

                    peer_host_peer_keys = sorted(
                        [h.pk for h in peer.ha_cluster_peers.all()])
                    if peer_host_peer_keys != cluster_peer_keys:
                        job_scheduler_notify.notify(
                            peer, timezone.now(),
                            {'ha_cluster_peers': cluster_peer_keys})

                    #  Keep internal track of the hosts state.
                    self._host_status[node_identifier] = self.HostStatus(
                        status=host_reported_online, datetime=dt)

    def run(self):
        """Run the base service, then serve the queue with on_data as data callback."""
        super(Service, self).run()

        self._queue.serve(data_callback=self.on_data)

    def stop(self):
        """Stop the base service, then stop the receive queue."""
        super(Service, self).stop()

        self._queue.stop()
class AgentRpcMessenger(object):
    """
    This class consumes AgentRunnerPluginRxQueue, sends
    messages to AgentTxQueue, and maintains state for
    actions in progress.
    """

    # The name of the device plugin on the agent with which
    # this module will communicate
    PLUGIN_NAME = "action_runner"

    # A bit rubbish, but a tag so callers can know the failure was because the server could not be contacted.
    # Improve with a different Exception one day.
    COULD_NOT_CONTACT_TAG = "Could not contact server"

    # If no action_runner session is present when trying to run
    # an action, wait this long for one to show up
    SESSION_WAIT_TIMEOUT = 30

    def __init__(self):
        """Create empty session/RPC bookkeeping, a purged receive queue and the lock."""
        super(AgentRpcMessenger, self).__init__()

        # session ID to {RPC ID: RPC} for the RPCs in flight on that session
        self._session_rpcs = defaultdict(dict)

        # FQDN to session ID
        self._sessions = {}
        self._cancelled_rpcs = ExpiringList(10 * 60)

        self._action_runner_rx_queue = AgentRxQueue(
            AgentRpcMessenger.PLUGIN_NAME)
        self._action_runner_rx_queue.purge()

        self._lock = threading.Lock()

    def run(self):
        """Reset the agents' sessions for this plugin, then serve the receive queue."""
        try:
            HttpAgentRpc().reset_plugin_sessions(AgentRpcMessenger.PLUGIN_NAME)
        except RpcTimeout:
            # Assume this means that the http_agent service isn't running: this
            # is acceptable, as our goal of there not being any sessions is
            # already the case.
            log.warning("Unable to reset %s sessions" %
                        AgentRpcMessenger.PLUGIN_NAME)

        self._action_runner_rx_queue.serve(session_callback=self.on_rx)
        log.info("AgentRpcMessenger.complete")

    def stop(self):
        """Stop the receive queue."""
        log.info("AgentRpcMessenger.stop")
        self._action_runner_rx_queue.stop()

    def complete_all(self):
        """Mark every RPC that is still incomplete as failed due to shutdown."""
        log.info("AgentRpcMessenger.complete_all")
        for rpc_id_to_rpc in self._session_rpcs.values():
            for rpc_state in rpc_id_to_rpc.values():
                log.info("AgentRpcMessenger.complete_all: erroring %s" %
                         rpc_state.id)
                if not rpc_state.complete.is_set():
                    rpc_state.exception = "Cancelled due to service shutdown"
                    rpc_state.complete.set()

    def remove(self, fqdn):
        """Forget the session recorded for *fqdn*; a missing entry is ignored."""
        with self._lock:
            try:
                del self._sessions[fqdn]
            except KeyError:
                pass

    def _abort_session(self,
                       fqdn,
                       message,
                       old_session_id,
                       new_session_id=None):
        """End *old_session_id* and deal with the RPCs that were in flight on it.

        With a *new_session_id* the host is re-pointed at it and the RPCs are
        re-issued there; without one the host's session entry is dropped and
        the RPCs are completed with a communications error.  The caller is
        expected to hold self._lock (on_rx does).
        """
        log.warning("AgentRpcMessenger.on_rx: aborting session %s because %s" %
                    (old_session_id, message))
        old_rpcs = self._session_rpcs[old_session_id]

        if new_session_id is not None:
            self._sessions[fqdn] = new_session_id
        else:
            try:
                del self._sessions[fqdn]
            except KeyError:
                pass

        for rpc in old_rpcs.values():
            if new_session_id:
                log.warning(
                    "AgentRpcMessenger.on_rx: re-issuing RPC %s for session %s (was %s) because %s"
                    % (rpc.id, new_session_id, old_session_id, message))
                rpc.session_id = new_session_id
                self._resend(rpc)
            else:
                rpc.exception = "Communications error with %s because %s" % (
                    fqdn, message)
                rpc.complete.set()
        del self._session_rpcs[old_session_id]

    def get_session_id(self, fqdn):
        """Return the session ID recorded for *fqdn*, or None when there is none."""
        with self._lock:
            try:
                return self._sessions[fqdn]
            except KeyError:
                return None

    def await_restart(self, fqdn, timeout, old_session_id=None):
        """
        If there is currently an action_runner session, wait for a different one.  Else
        wait for any action_runner session to start.

        Returns (None) either when a session other than *old_session_id* is seen
        or once *timeout* seconds of polling have passed without one.

        :param fqdn: host whose session is being watched
        :param timeout: maximum time to wait, in seconds
        :param old_session_id: session to wait to be replaced; defaults to the
                               session recorded for *fqdn* at call time
        """

        if old_session_id is None:
            old_session_id = self.get_session_id(fqdn)

        log.info("AgentRpcMessenger.await_restart: awaiting %s (old %s)" %
                 (fqdn, old_session_id))

        # Note: using polling here for simplicity, if efficiency became an issue here
        # we could set up events to be triggered by the new session logic in on_rx, and
        # sleep on them instead of polling.

        duration = 0
        poll_period = 1.0
        while True:
            current_session_id = self.get_session_id(fqdn)

            if current_session_id is not None and current_session_id != old_session_id:
                log.info("AgentRpcMessenger.await_restart: %s new %s" %
                         (fqdn, current_session_id))
                break

            if duration >= timeout:
                log.info(
                    "AgentRpcMessenger.await_restart: %s timeout after %ss" %
                    (fqdn, duration))
                # Give up: without leaving the loop here the timeout would be
                # logged on every poll but never honoured.
                break

            duration += poll_period
            time.sleep(poll_period)

    def on_rx(self, message):
        """Handle one message from the receive queue.

        Session messages (SESSION_CREATE, SESSION_TERMINATE,
        SESSION_TERMINATE_ALL) update the FQDN-to-session map and abort the
        sessions they supersede.  Any other message is expected to carry an
        ACTION_COMPLETE body, which completes the matching in-flight RPC; a
        response from an unexpected session triggers a session reset.
        """
        with self._lock:
            log.debug("on_rx: %s" % message)
            session_id = message["session_id"]
            fqdn = message["fqdn"]
            log.info("AgentRpcMessenger.on_rx: %s/%s" % (fqdn, session_id))

            if message["type"] == "SESSION_CREATE":
                if fqdn in self._sessions:
                    old_session_id = self._sessions[fqdn]
                    self._abort_session(fqdn, "new session created",
                                        old_session_id, session_id)
                else:
                    self._sessions[fqdn] = session_id
            elif message["type"] == "SESSION_TERMINATE":
                # An agent has timed out or restarted, we're being told its session is now dead
                if message["fqdn"] in self._sessions:
                    self._abort_session(fqdn, "session terminated",
                                        message["session_id"])
            elif message["type"] == "SESSION_TERMINATE_ALL":
                # The http_agent service has restarted, all sessions are now over.
                # Iterate over a snapshot: _abort_session deletes entries from
                # self._sessions while we walk it.
                for session_fqdn, session in list(self._sessions.items()):
                    self._abort_session(session_fqdn,
                                        "all sessions terminated", session)
            else:
                rpc_response = message["body"]
                if rpc_response["type"] != "ACTION_COMPLETE":
                    log.error("Unexpected type '%s'" % rpc_response["type"])
                    return

                if fqdn in self._sessions and self._sessions[
                        fqdn] != session_id:
                    log.info(
                        "AgentRpcMessenger.on_rx: cancelling session %s/%s (replaced by %s)"
                        % (fqdn, self._sessions[fqdn], session_id))
                    self._abort_session(fqdn, "session cancelled",
                                        self._sessions[fqdn])
                    HttpAgentRpc().reset_session(fqdn,
                                                 AgentRpcMessenger.PLUGIN_NAME,
                                                 session_id)
                elif fqdn in self._sessions:
                    log.info("AgentRpcMessenger.on_rx: good session %s/%s" %
                             (fqdn, session_id))
                    # Find this RPC and complete it
                    try:
                        rpc = self._session_rpcs[session_id][
                            rpc_response["id"]]
                    except KeyError:
                        if rpc_response["id"] in self._cancelled_rpcs:
                            log.debug(
                                "Response received from a cancelled RPC (id: %s)",
                                rpc_response["id"])
                        else:
                            log.error(
                                "Response received from UNKNOWN RPC of (id: %s)",
                                rpc_response["id"])
                    else:
                        del self._session_rpcs[session_id][rpc_response["id"]]
                        rpc.exception = rpc_response["exception"]
                        rpc.result = rpc_response["result"]
                        rpc.subprocesses = rpc_response["subprocesses"]
                        log.info("AgentRpcMessenger.on_rx: completing rpc %s" %
                                 rpc.id)
                        rpc.complete.set()
                else:
                    log.info("AgentRpcMessenger.on_rx: unknown session %s/%s" %
                             (fqdn, session_id))
                    # A session I never heard of?
                    HttpAgentRpc().reset_session(fqdn,
                                                 AgentRpcMessenger.PLUGIN_NAME,
                                                 session_id)

    def _resend(self, rpc):
        """Register *rpc* under its (new) session and put its request on the TX queue."""
        log.debug("AgentRpcMessenger._resend: rpc %s in session %s" %
                  (rpc.id, rpc.session_id))
        self._session_rpcs[rpc.session_id][rpc.id] = rpc
        AgentTxQueue().put(rpc.get_request())

    def _send_request(self, fqdn, action, args):
        """Send *action* to *fqdn* on its current session and return the in-flight RPC.

        Waits up to SESSION_WAIT_TIMEOUT seconds for a session to exist.

        :raises AgentException: when no session shows up in time, or when the
                                session is gone by the time the lock is taken
        """
        if not self.await_session(fqdn,
                                  AgentRpcMessenger.SESSION_WAIT_TIMEOUT):
            # Report the time really waited, which is the full wait timeout
            log.error("No %s session for %s after %s seconds" %
                      (AgentRpcMessenger.PLUGIN_NAME, fqdn,
                       AgentRpcMessenger.SESSION_WAIT_TIMEOUT))
            raise AgentException(
                fqdn,
                action,
                args,
                "%s %s no session after %s seconds" %
                (self.COULD_NOT_CONTACT_TAG, fqdn,
                 AgentRpcMessenger.SESSION_WAIT_TIMEOUT),
            )

        with self._lock:
            try:
                session_id = self._sessions[fqdn]
            except KeyError:
                # This could happen in spite of the earlier check, as that was outside the lock.
                log.warning("AgentRpcMessenger._send: no session for %s" %
                            fqdn)
                raise AgentException(
                    fqdn, action, args,
                    "%s %s" % (self.COULD_NOT_CONTACT_TAG, fqdn))

            log.debug("AgentRpcMessenger._send: using session %s" % session_id)

            rpc = ActionInFlight(session_id, fqdn, action, args)

            self._session_rpcs[session_id][rpc.id] = rpc
            AgentTxQueue().put(rpc.get_request())
            return rpc

    def _send_cancellation(self, rpc):
        """Send a cancellation for *rpc* if it is still in flight, and forget it."""
        with self._lock:
            try:
                self._session_rpcs[rpc.session_id][rpc.id]
            except KeyError:
                log.warning(
                    "Dropping cancellation of RPC %s, it is already complete or aborted"
                    % rpc.id)
            else:
                log.warning("Cancelling RPC %s" % rpc.id)
                AgentTxQueue().put(rpc.get_cancellation())
                del self._session_rpcs[rpc.session_id][rpc.id]

    def _complete(self, rpc, cancel_event):
        """Block until *rpc* completes or *cancel_event* is set.

        :return: rpc.result
        :raises AgentCancellation: when cancel_event is set before completion
        :raises AgentException: when the RPC completed with an exception
        """
        log.info("AgentRpcMessenger._complete: starting wait for rpc %s" %
                 rpc.id)

        # Wait for rpc.complete, waking up every second to
        # check cancel_event
        while True:
            if cancel_event.is_set():
                self._send_cancellation(rpc)
                self._cancelled_rpcs.append(rpc.id)
                raise AgentCancellation()
            else:
                rpc.complete.wait(timeout=1.0)
                if rpc.complete.is_set():
                    break

        log.info("AgentRpcMessenger._complete: completed wait for rpc %s" %
                 rpc.id)
        if rpc.exception:
            raise AgentException(rpc.fqdn,
                                 rpc.action,
                                 rpc.args,
                                 rpc.exception,
                                 subprocesses=rpc.subprocesses)
        else:
            return rpc.result

    def call(self, fqdn, action, args, cancel_event):
        """Run *action* on *fqdn* and wait for it; return ``(result, rpc)``."""
        log.debug("AgentRpcMessenger.call: %s %s" % (fqdn, action))
        rpc = self._send_request(fqdn, action, args)
        return self._complete(rpc, cancel_event), rpc

    def await_session(self, fqdn, timeout):
        """
        Wait for the agent to connect back to the manager and hence be ready to accept commands
        :param fqdn: fqdn of the agent we are waiting for
        :param timeout: how long to wait before quiting.
        :return: timeout remaining 0=failed, !0 is pass and useful for debug.
        """
        while self.get_session_id(fqdn) is None and timeout > 0:
            # Allow a short wait for a session to show up, for example
            # when running setup actions on a host we've just added its
            # session may not yet have been fully established
            log.info(
                "AgentRpcMessenger._send: no session yet for %s, %s seconds remain"
                % (fqdn, timeout))
            timeout -= 1
            time.sleep(1)

        return timeout
예제 #11
0
class Service(ChromaService):
    """Service that stores 'systemd_journal' agent plugin messages as LogMessage rows.

    Keeps a cached count of LogMessage rows so that the table can be trimmed
    once it grows past the configured high-water mark.
    """

    PLUGIN_NAME = "systemd_journal"

    def __init__(self):
        """Initialise the base service, purge the receive queue and cache the row count."""
        super(Service, self).__init__()
        self._queue = AgentRxQueue(Service.PLUGIN_NAME)
        self._queue.purge()
        # Cached number of LogMessage rows, kept up to date by on_data and
        # _check_size.
        self._table_size = LogMessage.objects.count()
        self._parser = LogMessageParser()

    def _check_size(self):
        """Apply a size limit to the table of log messages.

        When the cached row count exceeds ``settings.DBLOG_HW``, the oldest rows
        (lowest ``id``) are appended to the ``db_log`` file under
        ``settings.LOG_PATH`` and then deleted, in batches of at most
        ``MAX_ROWS_PER_TRANSACTION``, until the count is down to
        ``settings.DBLOG_LW``.  A failure while writing or deleting is logged
        and stops the trimming; rows of the failed batch are not counted.

        :return: the number of rows that were written out and deleted
        """
        MAX_ROWS_PER_TRANSACTION = 10000
        removed_num_entries = 0
        overflow_filename = os.path.join(settings.LOG_PATH, "db_log")

        if self._table_size > settings.DBLOG_HW:
            remove_num_entries = self._table_size - settings.DBLOG_LW

            with transaction.atomic():
                while remove_num_entries > 0:
                    trans_size = min(MAX_ROWS_PER_TRANSACTION,
                                     remove_num_entries)
                    # Materialise the batch as a list: a QuerySet does not
                    # support the negative index needed to find its last row.
                    batch = list(LogMessage.objects.all().order_by(
                        "id")[0:trans_size])
                    if not batch:
                        # The table holds fewer rows than the cached count
                        # suggested; nothing is left to trim.
                        break

                    self.log.debug("writing %s batch of entries" % len(batch))
                    try:
                        # The context manager closes the file even when a write
                        # fails, and never touches an unopened file.
                        with open(overflow_filename, "a") as f:
                            for line in batch:
                                f.write("%s\n" % str(line))
                        LogMessage.objects.filter(
                            id__lte=batch[-1].id).delete()
                    except Exception as e:
                        self.log.error(
                            "error opening/writing/closing the db_log: %s" % e)
                        # Retrying would re-read and re-write the same rows.
                        break

                    remove_num_entries -= len(batch)
                    removed_num_entries += len(batch)

        self._table_size -= removed_num_entries
        self.log.info("Wrote %s DB log entries to %s" %
                      (removed_num_entries, overflow_filename))

        return removed_num_entries

    def on_data(self, fqdn, body):
        """Insert every entry of ``body["log_lines"]`` as a LogMessage for *fqdn*.

        Each entry is also handed to the LogMessageParser.  An entry that
        fails is logged and skipped; the remaining entries are still handled.
        """
        with transaction.atomic():
            with DelayedContextFrom(LogMessage) as log_messages:
                for msg in body["log_lines"]:
                    try:
                        log_messages.insert(
                            dict(
                                fqdn=fqdn,
                                message=msg["message"],
                                severity=msg["severity"],
                                facility=msg["facility"],
                                tag=msg["source"],
                                datetime=IMLDateTime.parse(
                                    msg["datetime"]).as_datetime,
                                message_class=LogMessage.get_message_class(
                                    msg["message"]),
                            ))
                        self._table_size += 1

                        self._parser.parse(fqdn, msg)
                    except Exception as e:
                        self.log.error(
                            "Error %s ingesting systemd-journal entry: %s" %
                            (e, msg))

    def run(self):
        """Run the base service, then serve the queue with on_data as data callback."""
        super(Service, self).run()

        self._queue.serve(data_callback=self.on_data)

    def stop(self):
        """Stop the base service, then stop the receive queue."""
        super(Service, self).stop()

        self._queue.stop()