Exemple #1
0
    def on_osd_map(self, osd_map):
        """
        Reconcile our server and OSD service records with a new OSD map.

        Hosts named in the CRUSH map that we have no record of are created
        as unmanaged servers (unless the name turns out to be a CRUSH alias
        for a managed host), the OSDs listed under unmanaged hosts are
        registered as services, and OSD services for this FSID that the map
        no longer mentions are forgotten.

        :param osd_map: The data from an OsdMap sync object
        """
        log.debug("ServerMonitor.on_osd_map: epoch %s" % osd_map['epoch'])

        osds_by_host = self.get_hostname_to_osds(osd_map)
        log.debug("ServerMonitor.on_osd_map: got service data for %s servers" %
                  len(osds_by_host))

        mapped_osd_ids = set()
        for hostname, host_osds in osds_by_host.items():
            # Index this host's OSD records by their ServiceId
            osd_by_id = {}
            for osd_record in host_osds:
                osd_key = ServiceId(osd_map['fsid'], 'osd',
                                    str(osd_record['osd']))
                osd_by_id[osd_key] = osd_record
            mapped_osd_ids.update(osd_by_id.keys())

            # An unknown hostname whose OSDs already belong to a managed
            # server is a CRUSH alias rather than a real host.
            alias_target = None
            if hostname not in self.hostname_to_server:
                for osd_key in osd_by_id:
                    try:
                        owner = self.services[osd_key].server_state
                        if owner.managed:
                            alias_target = owner
                    except KeyError:
                        # No service recorded under this ID yet
                        continue

            if alias_target:
                log.info("'{0}' is a CRUSH alias to {1}".format(
                    hostname, alias_target))
                continue

            # Fetch the ServerState for this CRUSH hostname, creating and
            # persisting an unmanaged one if we have not seen it before.
            try:
                server_state = self.hostname_to_server[hostname]
            except KeyError:
                # No FQDN is known at this point, so the hostname stands in
                server_state = ServerState(hostname,
                                           hostname,
                                           managed=False,
                                           last_contact=None,
                                           boot_time=None,
                                           ceph_version=None)
                self.inject_server(server_state)
                server_record = Server(fqdn=server_state.fqdn,
                                       hostname=server_state.hostname,
                                       managed=server_state.managed)
                self._persister.create_server(server_record)

            # Managed servers report their own services via ceph/server salt
            # messages, so only unmanaged servers take service info from here.
            if not server_state.managed:
                for osd_key, osd_record in osd_by_id.items():
                    self._register_service(server_state, osd_key,
                                           bool(osd_record['up']), None)

        # Forget ServiceState for OSDs of this FSID that the map did not
        # mention under any host.
        tracked_osd_ids = set()
        for service in self.fsid_services[osd_map['fsid']]:
            if service.service_type == 'osd':
                tracked_osd_ids.add(service.id)
        for stale_service_id in tracked_osd_ids - mapped_osd_ids:
            self.forget_service(self.services[stale_service_id])
Exemple #2
0
    def on_server_heartbeat(self, fqdn, server_heartbeat):
        """
        Call back for when a ceph.service message is received from a salt minion.

        This is actually a fairly simple operation of updating the in memory ServerState
        to reflect what is in the message, but it's convoluted because we may be seeing
        a new server, a known server, or a server which was known but unmanaged.

        :param fqdn: FQDN of the server the heartbeat came from; used as the
                     key into ``self.servers``
        :param server_heartbeat: Mapping from which this method reads
                                 ``'boot_time'`` (passed to
                                 ``datetime.fromtimestamp``), ``'ceph_version'``
                                 and ``'services'`` (a mapping whose values carry
                                 ``'fsid'``, ``'type'``, ``'id'`` and ``'status'``)
        """
        log.debug("ServerMonitor.on_server_heartbeat: %s" % fqdn)
        new_server = True
        newly_managed_server = False
        hostname = None
        try:
            server_state = self.servers[fqdn]
        except KeyError:
            # Look up the grains for this server, we need to know its hostname in order
            # to resolve this vs. the OSD map.
            hostname = self._get_grains(fqdn)['host']

            if hostname in self.hostname_to_server:
                server_state = self.hostname_to_server[hostname]
                if not server_state.managed:
                    # Take over a ServerState that was created from OSD map
                    server_state.managed = True
                    old_fqdn = server_state.fqdn
                    # OSD map servers would have faked up FQDN as hostname, so clear that out
                    del self.servers[old_fqdn]
                    server_state.fqdn = fqdn
                    self.servers[server_state.fqdn] = server_state
                    self._persister.update_server(old_fqdn,
                                                  fqdn=fqdn,
                                                  managed=True)
                    new_server = False
                    log.info("Server %s went from unmanaged to managed" % fqdn)
                    newly_managed_server = True
                else:
                    # We will go on to treat these as distinct servers even though
                    # they have the same hostname
                    log.warning(
                        "Hostname clash: FQDNs '%s' and '%s' both have hostname %s"
                        % (fqdn, server_state.fqdn, hostname))
        else:
            new_server = False
            # The case where hostname == FQDN, we may already have this FQDN in our
            # map from an unmanaged server being reported by hostname.
            if not server_state.managed:
                newly_managed_server = True
                server_state.managed = True
                self._persister.update_server(server_state.fqdn, managed=True)
                log.info("Server %s went from unmanaged to managed" % fqdn)

        boot_time = datetime.datetime.fromtimestamp(
            server_heartbeat['boot_time'], tz=tz.tzutc())
        if new_server:
            # new_server can only still be True if the FQDN lookup above
            # failed, in which case hostname was already read from the grains:
            # reuse it rather than looking the grains up a second time.
            server_state = ServerState(
                fqdn,
                hostname,
                managed=True,
                last_contact=now(),
                boot_time=boot_time,
                ceph_version=server_heartbeat['ceph_version'])
            self.inject_server(server_state)
            self._persister.create_server(
                Server(fqdn=server_state.fqdn,
                       hostname=server_state.hostname,
                       managed=server_state.managed,
                       last_contact=server_state.last_contact))
            log.info("Saw server %s for the first time" % server_state)

        server_state.last_contact = now()
        self._persister.update_server(server_state.fqdn,
                                      last_contact=server_state.last_contact)

        if server_state.boot_time != boot_time:
            log.warning("{0} boot time changed, old {1} new {2}".format(
                server_state.fqdn, server_state.boot_time, boot_time))
            old_boot_time = server_state.boot_time
            server_state.boot_time = boot_time
            self._persister.update_server(server_state.fqdn,
                                          boot_time=server_state.boot_time)
            if old_boot_time is not None:  # i.e. a reboot, not an unmanaged->managed transition
                if server_state.boot_time < old_boot_time:
                    log.warning("Server boot time went backwards")
                elif server_state.boot_time - old_boot_time < REBOOT_THRESHOLD:
                    log.warning("Server boot time changed, but only a little")
                else:
                    # A substantial forward change in boot time, that's a reboot: emit
                    # a user visible event
                    log.warning("{0} rebooted!".format(fqdn))
                    self._eventer.on_reboot(server_state, False)

        reported_version = server_heartbeat['ceph_version']
        if server_state.ceph_version != reported_version:
            # Interpret "no package installed but some services running" as meaning we're
            # in the process of upgrading.  While upgrading, ignore version=None
            # to avoid generating spurious "ceph uninstalled" events.
            upgrading = (reported_version is None and
                         bool(server_heartbeat['services']))
            if not upgrading:
                server_state.ceph_version = reported_version
                self._persister.update_server(
                    server_state.fqdn, ceph_version=server_state.ceph_version)
                if not (new_server or newly_managed_server):
                    self._eventer.on_new_version(server_state)

        seen_id_tuples = set()
        for service in server_heartbeat['services'].values():
            id_tuple = ServiceId(service['fsid'], service['type'],
                                 service['id'])
            seen_id_tuples.add(id_tuple)
            self._register_service(server_state,
                                   id_tuple,
                                   running=True,
                                   status=service['status'])

        # For any service which was last reported on this server but
        # is now gone, mark it as not running.  This must be a plain set
        # difference: a symmetric difference would also select services that
        # were reported in this very heartbeat but are not in
        # server_state.services, and wrongly flag them as stopped.
        for unseen_id_tuple in set(
                server_state.services.keys()) - seen_id_tuples:
            service_state = self.services[unseen_id_tuple]
            if service_state.running:
                log.info("Service %s stopped on server %s" %
                         (service_state, server_state))
                service_state.running = False

        if new_server or newly_managed_server:
            # We do this at the end so that by the time we emit the event
            # the ServiceState objects have been created
            self._eventer.on_server(server_state)
Exemple #3
0
 def _create_server(self, *args, **kwargs):
     """Construct a Server from the given arguments and add it to the session."""
     server_record = Server(*args, **kwargs)
     self._session.add(server_record)