Example #1
    def fetch(self, minion_id, sync_type):
        log.debug("SyncObjects.fetch: %s/%s" % (minion_id, sync_type))
        if minion_id is None:
            # We're probably being replayed to from the database
            log.warn("SyncObjects.fetch called with minion_id=None")
            return

        self._fetching_at[sync_type] = now()
        client = LocalClient(config.get('cthulhu', 'salt_config_path'))
        # TODO clean up unused 'since' argument
        pub_data = client.run_job(
            minion_id, 'ceph.get_cluster_object',
            condition_kwarg(
                [], {
                    'cluster_name': self._cluster_name,
                    'sync_type': sync_type.str,
                    'since': None
                }))
        if not pub_data:
            log.error("Failed to start fetch job %s/%s" %
                      (minion_id, sync_type))
            # Don't throw an exception because if a fetch fails we should always
            # end up issuing another on next heartbeat
        else:
            log.debug("SyncObjects.fetch: jid=%s minions=%s" %
                      (pub_data['jid'], pub_data['minions']))
Example #2
    def __init__(self, fsid, cluster_name, commands):
        """
        Requiring cluster_name and fsid is redundant (ideally everything would
        speak in terms of fsid) but convenient, because the librados interface
        wants a cluster name when you create a client, and otherwise we would
        have to look up via ceph.conf.
        """
        self.log = log.getChild(self.__class__.__name__)

        self.requested_at = now()
        self.completed_at = None

        # This is actually kind of overkill compared with having a counter
        # somewhere, but it's easy.
        self.id = uuid.uuid4().__str__()

        self._minion_id = None
        self._fsid = fsid
        self._cluster_name = cluster_name
        self._commands = commands

        self.jid = None

        self.state = self.NEW
        self.result = None
        self.error = False
        self.error_message = ""

        # Time at which we last believed the current JID to be really running
        self.alive_at = None
Example #3
    def _submit(self):
        self.jid = remote.run_job(self._minion_id, self._cmd, self._args)
        self.alive_at = now()

        self.log.info("Request %s started job %s" % (self.id, self.jid))

        return self.jid
Example #4
    def _is_favorite(self, minion_id):
        """
        Check if this minion is the one which we are currently treating
        as the primary source of updates, and promote it to be the
        favourite if the favourite has not sent a heartbeat within
        cthulhu->favorite_timeout_s.

        :return True if this minion was the favorite or has just been
                promoted.
        """
        t_now = now()
        self._last_heartbeat[minion_id] = t_now

        if self._favorite_mon is None:
            log.debug("%s is my new favourite" % minion_id)
            self._set_favorite(minion_id)
            return True
        elif minion_id != self._favorite_mon:
            # Consider whether this minion should become my new favourite: has it been
            # too long since my current favourite reported in?
            time_since = t_now - self._last_heartbeat[self._favorite_mon]
            favorite_timeout_s = self._servers.get_contact_period(self._favorite_mon) * FAVORITE_TIMEOUT_FACTOR
            if time_since > datetime.timedelta(seconds=favorite_timeout_s):
                log.debug("My old favourite, %s, has not sent a heartbeat for %s: %s is my new favourite" % (
                    self._favorite_mon, time_since, minion_id
                ))
                self._set_favorite(minion_id)

        return minion_id == self._favorite_mon
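
Condensed into a standalone form, the promotion rule above is: record every heartbeat, and only displace the current favourite once it has been silent longer than the timeout. A minimal sketch, assuming now() returns a timezone-aware UTC timestamp (stood in for by the standard library here) and using a fixed timeout instead of the per-server contact period; all names are illustrative:

import datetime

FAVORITE_TIMEOUT = datetime.timedelta(seconds=60)  # assumed fixed timeout for this sketch


def now():
    # Stand-in for the project's now() helper: a timezone-aware UTC timestamp
    return datetime.datetime.now(datetime.timezone.utc)


class FavoriteTracker(object):
    def __init__(self):
        self._favorite = None
        self._last_heartbeat = {}

    def is_favorite(self, minion_id):
        # Record this heartbeat, then decide whether minion_id is (or becomes) the favourite
        t_now = now()
        self._last_heartbeat[minion_id] = t_now
        if self._favorite is None:
            self._favorite = minion_id
        elif minion_id != self._favorite:
            silence = t_now - self._last_heartbeat[self._favorite]
            if silence > FAVORITE_TIMEOUT:
                # The current favourite has gone quiet for too long: promote the reporter
                self._favorite = minion_id
        return minion_id == self._favorite

The first minion to report becomes the favourite; any other minion only takes over once the current favourite has been silent for longer than the timeout.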
Example #5
    def on_version(self, reported_by, sync_type, new_version):
        """
        Notify me that a particular version of a particular map exists.

        I may choose to initiate RPC to retrieve the map
        """
        log.debug("SyncObjects.on_version %s/%s/%s" % (reported_by, sync_type.str, new_version))
        old_version = self.get_version(sync_type)
        if sync_type.cmp(new_version, old_version) > 0:
            known_version = self._known_versions[sync_type]
            if sync_type.cmp(new_version, known_version) > 0:
                # We are out of date: request an up to date copy
                log.info("Advanced known version %s/%s %s->%s" % (
                    self._cluster_name, sync_type.str, known_version, new_version))
                self._known_versions[sync_type] = new_version
            else:
                log.info("on_version: %s is newer than %s" % (new_version, old_version))

            # If we already have a request out for this type of map, then consider
            # cancelling it if we've already waited for a while.
            if self._fetching_at[sync_type] is not None:
                if now() - self._fetching_at[sync_type] < self.FETCH_TIMEOUT:
                    log.info("Fetch already underway for %s" % sync_type.str)
                    return
                else:
                    log.warn("Abandoning fetch for %s started at %s" % (
                        sync_type.str, self._fetching_at[sync_type]))

            log.info("on_version: fetching %s/%s from %s, currently got %s, know %s" % (
                sync_type, new_version, reported_by, old_version, known_version
            ))
            self.fetch(reported_by, sync_type)
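
Condensed, on_version decides: fetch only when the reported version is ahead of what we hold, record it if it is also ahead of the newest version we have heard about, and skip if a sufficiently fresh fetch is already in flight. A hedged sketch of that decision for plain integer versions (the real sync_type.cmp also copes with non-integer versions); FETCH_TIMEOUT and the other names are illustrative:

import datetime

FETCH_TIMEOUT = datetime.timedelta(seconds=10)  # assumed value for this sketch


def now():
    return datetime.datetime.now(datetime.timezone.utc)


class VersionTracker(object):
    def __init__(self):
        self._current = {}      # sync_type -> version we hold
        self._known = {}        # sync_type -> newest version reported to us
        self._fetching_at = {}  # sync_type -> when we last started a fetch

    def should_fetch(self, sync_type, new_version):
        if new_version <= self._current.get(sync_type, 0):
            return False  # nothing newer than what we already have
        if new_version > self._known.get(sync_type, 0):
            self._known[sync_type] = new_version

        started = self._fetching_at.get(sync_type)
        if started is not None and now() - started < FETCH_TIMEOUT:
            return False  # a fetch is already underway and not yet stale
        # Either nothing in flight, or the previous fetch is abandoned as stale
        self._fetching_at[sync_type] = now()
        return True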
Example #6
    def __init__(self, fsid, cluster_name):
        """
        Requiring cluster_name and fsid is redundant (ideally everything would
        speak in terms of fsid) but convenient, because the librados interface
        wants a cluster name when you create a client, and otherwise we would
        have to look up via ceph.conf.
        """
        # getChild isn't in 2.6
        logname = '.'.join((log.name, self.__class__.__name__))
        self.log = logging.getLogger(logname)
        self.requested_at = now()
        self.completed_at = None

        # This is actually kind of overkill compared with having a counter
        # somewhere, but it's easy.
        self.id = uuid.uuid4().__str__()

        self._minion_id = None
        self.fsid = fsid
        self._cluster_name = cluster_name

        self.jid = None

        self.state = self.NEW
        self.result = None
        self.error = False
        self.error_message = ""

        # Time at which we last believed the current JID to be really running
        self.alive_at = None
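
The '.'.join construction above is only a workaround for Python 2.6, where Logger.getChild does not exist; from Python 2.7/3.2 onwards the two spellings resolve to the same logger object. A small illustrative check (the logger names here are made up, not from the original project):

import logging

log = logging.getLogger("cthulhu")

# Python 2.6-compatible spelling, as used above
child_a = logging.getLogger('.'.join((log.name, "UserRequest")))

# logging.Logger.getChild, available from Python 2.7/3.2 onwards
child_b = log.getChild("UserRequest")

assert child_a is child_b  # both resolve to the "cthulhu.UserRequest" logger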
Example #7
    def complete(self):
        """
        Call this when you're all done
        """
        assert self.state != self.COMPLETE
        assert self.jid is None

        self.log.info("Request %s completed with error=%s (%s)" % (self.id, self.error, self.error_message))
        self.state = self.COMPLETE
        self.completed_at = now()
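
Since requested_at and completed_at both come from now(), the wall-clock duration of a finished request is a plain subtraction. A stripped-down sketch of that bookkeeping, assuming the same timezone-aware now() helper; the class below is illustrative, not the project's request class:

import datetime
import uuid


def now():
    # Stand-in for the project's now() helper: a timezone-aware UTC timestamp
    return datetime.datetime.now(datetime.timezone.utc)


class TinyRequest(object):
    NEW = 'new'
    COMPLETE = 'complete'

    def __init__(self):
        self.id = str(uuid.uuid4())
        self.state = self.NEW
        self.requested_at = now()
        self.completed_at = None

    def complete(self):
        assert self.state != self.COMPLETE
        self.state = self.COMPLETE
        self.completed_at = now()

    @property
    def duration(self):
        # Wall-clock time from submission to completion (None while still running)
        if self.completed_at is None:
            return None
        return self.completed_at - self.requested_at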
Example #8
    def _update_sync_object(self, fsid, name, sync_type, version, when, data):
        self._session.add(
            SyncObject(
                fsid=fsid, cluster_name=name, sync_type=sync_type, version=version, when=when, data=msgpack.packb(data)
            )
        )

        # Time-limited FIFO
        threshold = now() - CLUSTER_MAP_RETENTION
        self._session.query(SyncObject).filter(
            SyncObject.when < threshold, SyncObject.fsid == fsid, SyncObject.sync_type == sync_type
        ).delete()
Example #9
    def _update_sync_object(self, fsid, name, sync_type, version, when, data):
        self._session.add(
            SyncObject(fsid=fsid,
                       cluster_name=name,
                       sync_type=sync_type,
                       version=version,
                       when=when,
                       data=msgpack.packb(data)))

        # Time-limited FIFO
        threshold = now() - CLUSTER_MAP_RETENTION
        self._session.query(SyncObject).filter(
            SyncObject.when < threshold, SyncObject.fsid == fsid,
            SyncObject.sync_type == sync_type).delete()
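
The "Time-limited FIFO" comment is the whole retention rule: each insert is followed by deleting rows older than now() - CLUSTER_MAP_RETENTION for the same fsid and sync_type. The same idea without SQLAlchemy, as an in-memory sketch with an assumed retention period:

import datetime
from collections import deque

CLUSTER_MAP_RETENTION = datetime.timedelta(hours=1)  # assumed value for this sketch


def now():
    return datetime.datetime.now(datetime.timezone.utc)


class MapHistory(object):
    """Keeps (when, data) entries per (fsid, sync_type), pruned by age on every add."""

    def __init__(self):
        self._history = {}  # (fsid, sync_type) -> deque of (when, data)

    def add(self, fsid, sync_type, data, when=None):
        when = when if when is not None else now()
        entries = self._history.setdefault((fsid, sync_type), deque())
        entries.append((when, data))  # entries are appended in chronological order

        # Time-limited FIFO: drop anything older than the retention window
        threshold = now() - CLUSTER_MAP_RETENTION
        while entries and entries[0][0] < threshold:
            entries.popleft()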
Example #10
    def on_tick(self):
        """
        Periodically call this to drive non-event-driven events (i.e. things
        which are based on walltime checks)
        """
        log.debug("Eventer.on_tick")

        now_utc = now()

        for fqdn, server_state in self._manager.servers.servers.items():
            if not server_state.managed:
                # We don't expect messages from unmanaged servers so don't
                # worry about whether they sent us one recently.
                continue

            if len(server_state.clusters) == 1:
                # Because Events can only be associated with one FSID, we only make this
                # association for servers with exactly one cluster.  This is a bit cheeky and
                # kind of an unnecessary limitation in the Event DB schema.
                fsid = server_state.clusters[0]
            else:
                fsid = None

            contact_threshold = CONTACT_THRESHOLD_FACTOR * self._manager.servers.get_contact_period(fqdn)
            if now_utc - server_state.last_contact > datetime.timedelta(seconds=contact_threshold):
                if fqdn not in self._servers_complained:
                    self._emit(WARNING, "Server {fqdn} is late reporting in, last report at {last}".format(
                        fqdn=fqdn, last=server_state.last_contact
                    ), fqdn=fqdn, fsid=fsid)
                    self._servers_complained.add(fqdn)
            else:
                if fqdn in self._servers_complained:
                    self._emit(RECOVERY, "Server {fqdn} regained contact".format(fqdn=fqdn),
                               fqdn=fqdn, fsid=fsid)
                    self._servers_complained.discard(fqdn)

        for fsid, cluster_monitor in self._manager.clusters.items():
            if cluster_monitor.update_time is None or now_utc - cluster_monitor.update_time > datetime.timedelta(
                    seconds=CLUSTER_CONTACT_THRESHOLD):
                if fsid not in self._clusters_complained:
                    self._clusters_complained.add(fsid)
                    self._emit(WARNING, "Cluster '{name}' is late reporting in".format(name=cluster_monitor.name),
                               fsid=fsid)
            else:
                if fsid in self._clusters_complained:
                    self._emit(RECOVERY, "Cluster '{name}' regained contact".format(name=cluster_monitor.name),
                               fsid=fsid)
                    self._clusters_complained.discard(fsid)

        self._flush()
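
The _servers_complained set is what gives the check hysteresis: a WARNING fires only on the transition into lateness and a RECOVERY only on the transition back, so a persistently late server does not spam an event every tick. A reduced sketch of that pattern, with a fixed threshold in place of the per-server contact period; the names are illustrative:

import datetime

LATE_THRESHOLD = datetime.timedelta(seconds=120)  # assumed threshold for this sketch


def now():
    return datetime.datetime.now(datetime.timezone.utc)


class LatenessWatcher(object):
    def __init__(self, emit):
        self._emit = emit          # callable taking (severity, message)
        self._complained = set()   # fqdns we have already warned about

    def check(self, fqdn, last_contact):
        if now() - last_contact > LATE_THRESHOLD:
            if fqdn not in self._complained:
                # Transition into lateness: warn exactly once
                self._emit("WARNING", "Server %s is late reporting in" % fqdn)
                self._complained.add(fqdn)
        else:
            if fqdn in self._complained:
                # Transition back to healthy: emit a recovery exactly once
                self._emit("RECOVERY", "Server %s regained contact" % fqdn)
                self._complained.discard(fqdn)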
Example #11
    def _submit(self):
        client = LocalClient(config.get('cthulhu', 'salt_config_path'))
        pub_data = client.run_job(self._minion_id, self._cmd, self._args)
        if not pub_data:
            # FIXME: LocalClient uses 'print' to record the
            # details of what went wrong :-(
            raise PublishError("Failed to publish job")

        self.log.info("Request %s started job %s" % (self.id, pub_data['jid']))

        self.alive_at = now()
        self.jid = pub_data['jid']

        return self.jid
Example #12
    def tick(self):
        """
        For walltime-based monitoring of running requests.  Long-running requests
        get a periodic call to saltutil.running to verify that things really
        are still happening.
        """

        if not self._by_jid:
            return
        else:
            log.debug("RequestCollection.tick: %s JIDs underway" %
                      len(self._by_jid))

        # Identify JIDs which haven't had a saltutil.running response for too long.
        # Kill requests in a separate phase because request:JID is not 1:1
        stale_jobs = set()
        _now = now()
        for request in self._by_jid.values():
            if _now - request.alive_at > datetime.timedelta(
                    seconds=TICK_PERIOD * 3):
                log.error("Request %s JID %s stale: now=%s, alive_at=%s" %
                          (request.id, request.jid, _now, request.alive_at))
                stale_jobs.add(request)

        # Any identified stale jobs are errored out.
        for request in stale_jobs:
            with self._update_index(request):
                request.set_error("Lost contact")
                request.jid = None
                request.complete()

        # Identify minions associated with JIDs in flight
        query_minions = set()
        for jid, request in self._by_jid.items():
            query_minions.add(request.minion_id)

        # Attempt to emit a saltutil.running to ping jobs; next tick we
        # will see if we got updates to the alive_at attribute to indicate non-staleness
        if query_minions:
            log.info("RequestCollection.tick: sending saltutil.running to {0}".
                     format(query_minions))
            client = LocalClient(config.get('cthulhu', 'salt_config_path'))
            pub_data = client.run_job(list(query_minions),
                                      'saltutil.running', [],
                                      expr_form="list")
            if not pub_data:
                log.warning("Failed to publish saltutil.running to {0}".format(
                    query_minions))
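
The staleness rule in tick() boils down to: any request whose alive_at has not been refreshed within TICK_PERIOD * 3 is treated as lost. A self-contained sketch of just that selection step, assuming alive_at values come from the same now() helper; TICK_PERIOD here is an assumed value:

import datetime

TICK_PERIOD = 20  # seconds; assumed value for this sketch
STALE_AFTER = datetime.timedelta(seconds=TICK_PERIOD * 3)


def now():
    return datetime.datetime.now(datetime.timezone.utc)


def find_stale(by_jid):
    """Return the requests whose alive_at timestamp is too old.

    by_jid maps JID -> request, where each request has an alive_at datetime
    (refreshed whenever a saltutil.running response mentions its JID).
    """
    t_now = now()
    return set(request for request in by_jid.values()
               if t_now - request.alive_at > STALE_AFTER)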
Example #13
    def _emit(self, severity, message, **associations):
        """
        :param severity: One of the defined severity values
        :param message: One line human readable string
        :param associations: Optional extra attributes to associate
                             the event with a particular cluster/server/service
        """
        now_utc = now()
        log.info("Eventer._emit: %s/%s/%s" %
                 (now_utc, severity_str(severity), message))

        self._events.append(
            Event(when=now_utc,
                  message=message,
                  severity=severity,
                  **associations))
Example #14
    def on_tick_response(self, minion_id, jobs):
        """
        Update the alive_at parameter of requests to record that they
        are still running remotely.

        :param jobs: The response from a saltutil.running
        """
        log.debug("RequestCollection.on_tick_response: %s from %s" % (len(jobs), minion_id))
        for job in jobs:
            try:
                request = self._by_jid[job['jid']]
            except KeyError:
                # Not one of mine, ignore it
                pass
            else:
                request.alive_at = now()
Example #15
    def _emit(self, severity, message, **associations):
        """
        :param severity: One of the defined severity values
        :param message: One line human readable string
        :param associations: Optional extra attributes to associate
                             the event with a particular cluster/server/service
        """
        now_utc = now()
        log.info("Eventer._emit: %s/%s/%s" % (now_utc, severity_str(severity), message))

        self._events.append(Event(
            when=now_utc,
            message=message,
            severity=severity,
            **associations
        ))
Example #16
    def _submit(self, commands):
        self.log.debug("Request._submit: %s/%s/%s" % (self._minion_id, self._cluster_name, commands))

        client = LocalClient(config.get('cthulhu', 'salt_config_path'))
        pub_data = client.run_job(self._minion_id, 'ceph.rados_commands',
                                  [self._fsid, self._cluster_name, commands])
        if not pub_data:
            # FIXME: LocalClient uses 'print' to record the
            # details of what went wrong :-(
            raise PublishError("Failed to publish job")

        self.log.info("Request %s started job %s" % (self.id, pub_data['jid']))

        self.alive_at = now()
        self.jid = pub_data['jid']

        return self.jid
Example #17
    def _submit(self, commands=None):
        if commands is None:
            commands = self._commands

        self.log.debug("%s._submit: %s/%s/%s" % (self.__class__.__name__,
                                                 self._minion_id,
                                                 self._cluster_name,
                                                 commands))

        self.jid = remote.run_job(self._minion_id, 'ceph.rados_commands',
                                  {'fsid': self.fsid,
                                   'cluster_name': self._cluster_name,
                                   'commands': commands})
        self.log.info("Request %s started job %s" % (self.id, self.jid))
        self.alive_at = now()

        return self.jid
Example #18
    def _submit(self, commands=None):
        if commands is None:
            commands = self._commands

        self.log.debug("%s._submit: %s/%s/%s" %
                       (self.__class__.__name__, self._minion_id,
                        self._cluster_name, commands))

        self.jid = remote.run_job(
            self._minion_id, 'ceph.rados_commands', {
                'fsid': self.fsid,
                'cluster_name': self._cluster_name,
                'commands': commands
            })
        self.log.info("Request %s started job %s" % (self.id, self.jid))
        self.alive_at = now()

        return self.jid
Example #19
    def fetch(self, minion_id, sync_type):
        log.debug("SyncObjects.fetch: %s/%s" % (minion_id, sync_type))
        if minion_id is None:
            # We're probably being replayed to from the database
            log.warn("SyncObjects.fetch called with minion_id=None")
            return

        self._fetching_at[sync_type] = now()
        client = LocalClient(config.get('cthulhu', 'salt_config_path'))
        # TODO clean up unused 'since' argument
        pub_data = client.run_job(minion_id, 'ceph.get_cluster_object',
                                  condition_kwarg([], {'cluster_name': self._cluster_name,
                                                       'sync_type': sync_type.str,
                                                       'since': None}))
        if not pub_data:
            log.error("Failed to start fetch job %s/%s" % (minion_id, sync_type))
            # Don't throw an exception because if a fetch fails we should always
            # end up issuing another on next heartbeat
        else:
            log.debug("SyncObjects.fetch: jid=%s minions=%s" % (pub_data['jid'], pub_data['minions']))
Example #20
    def on_tick(self):
        # This procedure is to catch the annoying case of AES key changes (#7836), which are otherwise
        # ignored by minions which are doing only minion->master messaging.  To ensure they
        # pick up on key changes, we actively send them something (doesn't matter what).  To
        # avoid doing this constantly, we only send things to minions which seem to be a little
        # late

        # After this length of time, doubt a minion enough to send it a message in case
        # it needs a kick to update its key
        def _ping_period(fqdn):
            return datetime.timedelta(seconds=self.get_contact_period(fqdn) * 2)

        t = now()
        late_servers = [s.fqdn for s in self.servers.values() if s.last_contact and (t - s.last_contact) > _ping_period(s.fqdn)]
        log.debug("late servers: %s" % late_servers)
        if late_servers:
            client = LocalClient(config.get('cthulhu', 'salt_config_path'))
            pub = client.pub(late_servers, "test.ping", expr_form='list')
            log.debug(pub)
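
The doubt window here is derived per server: twice its expected contact period. A short sketch of just that selection, assuming get_contact_period returns seconds and each server object carries .fqdn and .last_contact (None if never heard from):

import datetime


def now():
    return datetime.datetime.now(datetime.timezone.utc)


def find_late_servers(servers, get_contact_period, factor=2):
    """Return the fqdns of servers that are overdue by more than factor * contact period."""
    t = now()
    late = []
    for s in servers:
        if s.last_contact is None:
            continue
        ping_period = datetime.timedelta(seconds=get_contact_period(s.fqdn) * factor)
        if t - s.last_contact > ping_period:
            late.append(s.fqdn)
    return late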
Example #21
    def tick(self):
        """
        For walltime-based monitoring of running requests.  Long-running requests
        get a periodic call to saltutil.running to verify that things really
        are still happening.
        """

        if not self._by_jid:
            return
        else:
            log.debug("RequestCollection.tick: %s JIDs underway" % len(self._by_jid))

        # Identify JIDs which haven't had a saltutil.running response for too long.
        # Kill requests in a separate phase because request:JID is not 1:1
        stale_jobs = set()
        _now = now()
        for request in self._by_jid.values():
            if _now - request.alive_at > datetime.timedelta(seconds=TICK_PERIOD * 3):
                log.error("Request %s JID %s stale: now=%s, alive_at=%s" % (
                    request.id, request.jid, _now, request.alive_at
                ))
                stale_jobs.add(request)

        # Any identified stale jobs are errored out.
        for request in stale_jobs:
            with self._update_index(request):
                request.set_error("Lost contact")
                request.jid = None
                request.complete()

        # Identify minions associated with JIDs in flight
        query_minions = set()
        for jid, request in self._by_jid.items():
            query_minions.add(request.minion_id)

        # Attempt to emit a saltutil.running to ping jobs; next tick we
        # will see if we got updates to the alive_at attribute to indicate non-staleness
        if query_minions:
            log.info("RequestCollection.tick: sending saltutil.running to {0}".format(query_minions))
            client = LocalClient(config.get('cthulhu', 'salt_config_path'))
            pub_data = client.run_job(list(query_minions), 'saltutil.running', [], expr_form="list")
            if not pub_data:
                log.warning("Failed to publish saltutil.running to {0}".format(query_minions))
Example #22
    def fetch(self, minion_id, sync_type):
        log.debug("SyncObjects.fetch: %s/%s" % (minion_id, sync_type))
        if minion_id is None:
            # We're probably being replayed to from the database
            log.warn("SyncObjects.fetch called with minion_id=None")
            return

        self._fetching_at[sync_type] = now()
        try:
            # TODO clean up unused 'since' argument
            jid = remote.run_job(minion_id, 'ceph.get_cluster_object',
                                 {'cluster_name': self._cluster_name,
                                  'sync_type': sync_type.str,
                                  'since': None})
        except Unavailable:
            # Don't throw an exception because if a fetch fails we should end up
            # issuing another on next heartbeat
            log.error("Failed to start fetch job %s/%s" % (minion_id, sync_type))
        else:
            log.debug("SyncObjects.fetch: jid=%s" % jid)
Example #23
    def on_sync_object(self, minion_id, data):
        if minion_id != self._favorite_mon:
            log.debug("Ignoring map from %s, it is not my favourite (%s)" % (minion_id, self._favorite_mon))

        assert data['fsid'] == self.fsid

        sync_object = data['data']

        sync_type = SYNC_OBJECT_STR_TYPE[data['type']]
        new_object = self.inject_sync_object(minion_id, data['type'], data['version'], sync_object)
        if new_object:
            self._requests.on_map(self.fsid, sync_type, new_object)
            self._persister.update_sync_object(
                self.fsid,
                self.name,
                sync_type.str,
                new_object.version if isinstance(new_object.version, int) else None,
                now(), sync_object)
        else:
            log.warn("ClusterMonitor.on_sync_object: stale object received from %s" % minion_id)
Example #24
    def _submit(self, commands=None):
        if commands is None:
            commands = self._commands

        self.log.debug("%s._submit: %s/%s/%s" % (self.__class__.__name__,
                                                 self._minion_id, self._cluster_name, commands))

        client = LocalClient(config.get('cthulhu', 'salt_config_path'))
        pub_data = client.run_job(self._minion_id, 'ceph.rados_commands',
                                  [self.fsid, self._cluster_name, commands])
        if not pub_data:
            # FIXME: LocalClient uses 'print' to record the
            # details of what went wrong :-(
            raise PublishError("Failed to publish job")

        self.log.info("Request %s started job %s" % (self.id, pub_data['jid']))

        self.alive_at = now()
        self.jid = pub_data['jid']

        return self.jid
Example #25
    def fetch(self, minion_id, sync_type):
        log.debug("SyncObjects.fetch: %s/%s" % (minion_id, sync_type))
        if minion_id is None:
            # We're probably being replayed to from the database
            log.warn("SyncObjects.fetch called with minion_id=None")
            return

        self._fetching_at[sync_type] = now()
        try:
            # TODO clean up unused 'since' argument
            jid = remote.run_job(
                minion_id, 'ceph.get_cluster_object', {
                    'cluster_name': self._cluster_name,
                    'sync_type': sync_type.str,
                    'since': None
                })
        except Unavailable:
            # Don't throw an exception because if a fetch fails we should end up
            # issuing another on next heartbeat
            log.error("Failed to start fetch job %s/%s" %
                      (minion_id, sync_type))
        else:
            log.debug("SyncObjects.fetch: jid=%s" % jid)
Example #26
    def on_tick(self):
        # This procedure is to catch the annoying case of AES key changes (#7836), which are otherwise
        # ignored by minions which are doing only minion->master messaging.  To ensure they
        # pick up on key changes, we actively send them something (doesn't matter what).  To
        # avoid doing this constantly, we only send things to minions which seem to be a little
        # late

        # After this length of time, doubt a minion enough to send it a message in case
        # it needs a kick to update its key
        def _ping_period(fqdn):
            return datetime.timedelta(seconds=self.get_contact_period(fqdn) *
                                      2)

        t = now()
        late_servers = [
            s.fqdn for s in self.servers.values()
            if s.last_contact and (t - s.last_contact) > _ping_period(s.fqdn)
        ]
        log.debug("late servers: %s" % late_servers)
        if late_servers:
            client = LocalClient(config.get('cthulhu', 'salt_config_path'))
            pub = client.pub(late_servers, "test.ping", expr_form='list')
            log.debug(pub)
Example #27
    def on_server_heartbeat(self, fqdn, server_heartbeat):
        """
        Call back for when a ceph.service message is received from a salt minion.

        This is actually a fairly simple operation of updating the in memory ServerState
        to reflect what is in the message, but it's convoluted because we may be seeing
        a new server, a known server, or a server which was known but unmanaged.
        """
        log.debug("ServerMonitor.on_server_heartbeat: %s" % fqdn)
        new_server = True
        newly_managed_server = False
        try:
            server_state = self.servers[fqdn]
            new_server = False
        except KeyError:
            # Look up the grains for this server, we need to know its hostname in order
            # to resolve this vs. the OSD map.
            hostname = self._get_grains(fqdn)['host']

            if hostname in self.hostname_to_server:
                server_state = self.hostname_to_server[hostname]
                if not server_state.managed:
                    # Take over a ServerState that was created from OSD map
                    server_state.managed = True
                    old_fqdn = server_state.fqdn
                    # OSD map servers would have faked up FQDN as hostname, so clear that out
                    del self.servers[old_fqdn]
                    server_state.fqdn = fqdn
                    self.servers[server_state.fqdn] = server_state
                    self._persister.update_server(old_fqdn,
                                                  fqdn=fqdn,
                                                  managed=True)
                    new_server = False
                    log.info("Server %s went from unmanaged to managed" % fqdn)
                    newly_managed_server = True

                else:
                    # We will go on to treat these as distinct servers even though
                    # they have the same hostname
                    log.warn(
                        "Hostname clash: FQDNs '%s' and '%s' both have hostname %s"
                        % (fqdn, server_state.fqdn, hostname))
        else:
            # In the case where hostname == FQDN, we may already have this FQDN in our
            # map from an unmanaged server being reported by hostname.
            if not server_state.managed:
                newly_managed_server = True
                server_state.managed = True
                self._persister.update_server(server_state.fqdn, managed=True)
                log.info("Server %s went from unmanaged to managed" % fqdn)

        boot_time = datetime.datetime.fromtimestamp(
            server_heartbeat['boot_time'], tz=tz.tzutc())
        if new_server:
            hostname = self._get_grains(fqdn)['host']
            server_state = ServerState(
                fqdn,
                hostname,
                managed=True,
                last_contact=now(),
                boot_time=boot_time,
                ceph_version=server_heartbeat['ceph_version'])
            self.inject_server(server_state)
            self._persister.create_server(
                Server(fqdn=server_state.fqdn,
                       hostname=server_state.hostname,
                       managed=server_state.managed,
                       last_contact=server_state.last_contact))
            log.info("Saw server %s for the first time" % server_state)

        server_state.last_contact = now()
        self._persister.update_server(server_state.fqdn,
                                      last_contact=server_state.last_contact)

        if server_state.boot_time != boot_time:
            log.warn("{0} boot time changed, old {1} new {2}".format(
                server_state.fqdn, server_state.boot_time, boot_time))
            old_boot_time = server_state.boot_time
            server_state.boot_time = boot_time
            self._persister.update_server(server_state.fqdn,
                                          boot_time=server_state.boot_time)
            if old_boot_time is not None:  # i.e. a reboot, not an unmanaged->managed transition
                if server_state.boot_time < old_boot_time:
                    log.warn("Server boot time went backwards")
                elif server_state.boot_time - old_boot_time < REBOOT_THRESHOLD:
                    log.warn("Server boot time changed, but only a little")
                else:
                    # A substantial forward change in boot time, that's a reboot: emit
                    # a user visible event
                    log.warn("{0} rebooted!".format(fqdn))
                    self._eventer.on_reboot(server_state, False)

        if server_state.ceph_version != server_heartbeat['ceph_version']:
            # Interpret "no package installed but some services running" as meaning we're
            # in the process of upgrading.
            upgrading = server_heartbeat[
                'ceph_version'] is None and server_heartbeat['services']
            if server_heartbeat['ceph_version'] is None and upgrading:
                # Ignore version=None while upgrading to avoid generating spurious
                # "ceph uninstalled" events
                pass
            else:
                server_state.ceph_version = server_heartbeat['ceph_version']
                self._persister.update_server(
                    server_state.fqdn, ceph_version=server_state.ceph_version)
                if not (new_server or newly_managed_server):
                    self._eventer.on_new_version(server_state)

        seen_id_tuples = set()
        for service_name, service in server_heartbeat['services'].items():
            id_tuple = ServiceId(service['fsid'], service['type'],
                                 service['id'])
            seen_id_tuples.add(id_tuple)
            self._register_service(server_state,
                                   id_tuple,
                                   running=True,
                                   status=service['status'])

        # For any service which was last reported on this server but
        # is now gone, mark it as not running
        for unseen_id_tuple in set(
                server_state.services.keys()) ^ seen_id_tuples:
            service_state = self.services[unseen_id_tuple]
            if service_state.running:
                log.info("Service %s stopped on server %s" %
                         (service_state, server_state))
                service_state.running = False

        if new_server or newly_managed_server:
            # We do this at the end so that by the time we emit the event
            # the ServiceState objects have been created
            self._eventer.on_server(server_state)
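
The boot_time handling above distinguishes a backwards jump, a small forward drift below REBOOT_THRESHOLD, and a substantial forward jump that is reported as a reboot. A compact sketch of just that classification, with an assumed threshold value:

import datetime

REBOOT_THRESHOLD = datetime.timedelta(seconds=600)  # assumed value for this sketch


def classify_boot_time_change(old_boot_time, new_boot_time):
    """Return one of 'first_report', 'backwards', 'drift' or 'reboot'."""
    if old_boot_time is None:
        # e.g. an unmanaged->managed transition rather than a reboot
        return 'first_report'
    if new_boot_time < old_boot_time:
        return 'backwards'
    if new_boot_time - old_boot_time < REBOOT_THRESHOLD:
        # The reported boot time moved, but only a little: probably clock noise
        return 'drift'
    return 'reboot'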
Example #28
    def __init__(self, severity, message, **associations):
        self.severity = severity
        self.message = message
        self.associations = associations
        self.when = now()
Example #29
    def on_server_heartbeat(self, fqdn, server_heartbeat):
        """
        Call back for when a ceph.service message is received from a salt minion.

        This is actually a fairly simple operation of updating the in memory ServerState
        to reflect what is in the message, but it's convoluted because we may be seeing
        a new server, a known server, or a server which was known but unmanaged.
        """
        log.debug("ServerMonitor.on_server_heartbeat: %s" % fqdn)
        new_server = True
        newly_managed_server = False
        try:
            server_state = self.servers[fqdn]
            new_server = False
        except KeyError:
            # Look up the grains for this server, we need to know its hostname in order
            # to resolve this vs. the OSD map.
            hostname = self._get_grains(fqdn)['host']

            if hostname in self.hostname_to_server:
                server_state = self.hostname_to_server[hostname]
                if not server_state.managed:
                    # Take over a ServerState that was created from OSD map
                    server_state.managed = True
                    old_fqdn = server_state.fqdn
                    # OSD map servers would have faked up FQDN as hostname, so clear that out
                    del self.servers[old_fqdn]
                    server_state.fqdn = fqdn
                    self.servers[server_state.fqdn] = server_state
                    self._persister.update_server(old_fqdn, fqdn=fqdn, managed=True)
                    new_server = False
                    log.info("Server %s went from unmanaged to managed" % fqdn)
                    newly_managed_server = True
                else:
                    # We will go on to treat these as distinct servers even though
                    # they have the same hostname
                    log.warn("Hostname clash: FQDNs '%s' and '%s' both have hostname %s" % (
                        fqdn, server_state.fqdn, hostname
                    ))
        else:
            # In the case where hostname == FQDN, we may already have this FQDN in our
            # map from an unmanaged server being reported by hostname.
            if not server_state.managed:
                newly_managed_server = True
                server_state.managed = True
                self._persister.update_server(server_state.fqdn, managed=True)
                log.info("Server %s went from unmanaged to managed" % fqdn)

        boot_time = datetime.datetime.fromtimestamp(server_heartbeat['boot_time'], tz=tz.tzutc())
        if new_server:
            hostname = self._get_grains(fqdn)['host']
            server_state = ServerState(fqdn, hostname, managed=True,
                                       last_contact=now(), boot_time=boot_time,
                                       ceph_version=server_heartbeat['ceph_version'])
            self.inject_server(server_state)
            self._persister.create_server(Server(
                fqdn=server_state.fqdn,
                hostname=server_state.hostname,
                managed=server_state.managed,
                last_contact=server_state.last_contact
            ))
            log.info("Saw server %s for the first time" % server_state)

        server_state.last_contact = now()
        self._persister.update_server(server_state.fqdn, last_contact=server_state.last_contact)

        if server_state.boot_time != boot_time:
            log.warn("{0} boot time changed, old {1} new {2}".format(
                server_state.fqdn, server_state.boot_time, boot_time
            ))
            old_boot_time = server_state.boot_time
            server_state.boot_time = boot_time
            self._persister.update_server(server_state.fqdn, boot_time=server_state.boot_time)
            if old_boot_time is not None:  # i.e. a reboot, not an unmanaged->managed transition
                if server_state.boot_time < old_boot_time:
                    log.warn("Server boot time went backwards")
                elif server_state.boot_time - old_boot_time < REBOOT_THRESHOLD:
                    log.warn("Server boot time changed, but only a little")
                else:
                    # A substantial forward change in boot time, that's a reboot: emit
                    # a user visible event
                    log.warn("{0} rebooted!".format(fqdn))
                    self._eventer.on_reboot(server_state, False)

        if server_state.ceph_version != server_heartbeat['ceph_version']:
            # Interpret "no package installed but some services running" as meaning we're
            # in the process of upgrading.
            upgrading = server_heartbeat['ceph_version'] is None and server_heartbeat['services']
            if server_heartbeat['ceph_version'] is None and upgrading:
                # Ignore version=None while upgrading to avoid generating spurious
                # "ceph uninstalled" events
                pass
            else:
                server_state.ceph_version = server_heartbeat['ceph_version']
                self._persister.update_server(server_state.fqdn, ceph_version=server_state.ceph_version)
                if not (new_server or newly_managed_server):
                    self._eventer.on_new_version(server_state)

        seen_id_tuples = set()
        for service_name, service in server_heartbeat['services'].items():
            id_tuple = ServiceId(service['fsid'], service['type'], service['id'])
            seen_id_tuples.add(id_tuple)
            self._register_service(server_state, id_tuple, running=True, status=service['status'])

        # For any service which was last reported on this server but
        # is now gone, mark it as not running
        for unseen_id_tuple in set(server_state.services.keys()) ^ seen_id_tuples:
            service_state = self.services[unseen_id_tuple]
            if service_state.running:
                log.info("Service %s stopped on server %s" % (service_state, server_state))
                service_state.running = False

        if new_server or newly_managed_server:
            # We do this at the end so that by the time we emit the event
            # the ServiceState objects have been created
            self._eventer.on_server(server_state)