Beispiel #1
0
 def status_by_service(self, services):
     """Report the recorded state of each requested service.

     :param services: Iterable of argument sequences; each one is unpacked
                      into a ``ServiceId``.
     :return: One entry per item returned by the lookup: a dict with the
              keys ``running``, ``server`` (the server's FQDN) and
              ``status``, or ``None`` where the lookup yielded a falsy
              entry.
     """
     lookup = self._manager.servers.get_services
     states = lookup([ServiceId(*spec) for spec in services])

     summaries = []
     for state in states:
         if not state:
             # Keep the slot so results stay aligned with the lookup
             summaries.append(None)
             continue
         summaries.append({
             'running': state.running,
             'server': state.server_state.fqdn,
             'status': state.status,
         })
     return summaries
Beispiel #2
0
    def _get_fqdn(self, fsid, service_type, service_id):
        """Resolve a service to the FQDN of its server, if possible.

        :param fsid: First component of the ``ServiceId`` to look up.
        :param service_type: Second component of the ``ServiceId``.
        :param service_id: Third component; converted with ``str`` before
                           the lookup.
        :return: The ``fqdn`` of the server found for the service, or
                 ``None`` when no server is found (a warning is logged
                 in that case).
        """
        server = self._manager.servers.get_by_service(
            ServiceId(fsid, service_type, str(service_id)))
        if server is None:
            # Logger.warn is a deprecated alias; pass the arguments lazily
            # so the message is only built when the record is emitted.
            LOG.warning("No server found for service %s %s",
                        service_type, service_id)
            return None
        return server.fqdn if server else None
Beispiel #3
0
    def server_by_service(self, services):
        """Return a list of 2-tuples mapping service ID to server FQDN.

        A dict would be the more natural return type, but tuple dict keys
        are awkward to serialize.

        :param services: Iterable of argument sequences; each one is
                         unpacked into a ``ServiceId``.
        :return: Whatever ``list_by_service`` returns for those IDs.
        """
        return self._manager.servers.list_by_service(
            [ServiceId(*spec) for spec in services])
Beispiel #4
0
    def on_mon_map(self, mon_map):
        """Forget mon services that a newly received mon map no longer lists.

        :param mon_map: Mapping read for ``fsid`` and ``mons``; each entry
                        of ``mons`` is read for ``name``.
        """
        # IDs of every mon named in the incoming map
        reported = set(
            ServiceId(mon_map['fsid'], 'mon', entry['name'])
            for entry in mon_map['mons'])

        # IDs of every mon currently recorded for this FSID
        recorded = set()
        for service in self.fsid_services[mon_map['fsid']]:
            if service.service_type == 'mon':
                recorded.add(service.id)

        # Anything recorded but not reported is stale
        for stale_id in recorded.difference(reported):
            self.forget_service(self.services[stale_id])
Beispiel #5
0
    def on_mds_map(self, fsid, mds_map):
        """Forget MDS services that a newly received MDS map no longer lists.

        :param fsid: FSID string, passed in because the MDS map does not
                     include it
        :param mds_map: The MDS map sync object; the values of its ``info``
                        mapping are read for ``name``.
        """
        # IDs of every MDS named in the incoming map
        reported = set(
            ServiceId(fsid, 'mds', entry['name'])
            for entry in mds_map['info'].values())

        # IDs of every MDS currently recorded for this FSID
        recorded = set()
        for service in self.fsid_services[fsid]:
            if service.service_type == 'mds':
                recorded.add(service.id)

        # Anything recorded but not reported is stale
        for stale_id in recorded.difference(reported):
            self.forget_service(self.services[stale_id])
Beispiel #6
0
    def on_server_heartbeat(self, fqdn, server_heartbeat):
        """Update in-memory server and service state from a heartbeat.

        The heartbeat may come from a server seen for the first time, from
        an already known server, or from a server that was known but
        unmanaged. Each of those cases is resolved first, then the common
        bookkeeping (last contact, boot time, ceph version, services) is
        applied to the resulting ServerState.

        :param fqdn: FQDN of the reporting server; it is also used as the
                     server's hostname.
        :param server_heartbeat: Mapping read for the keys ``boot_time``,
                                 ``ceph_version`` and ``services``; each
                                 value of ``services`` is read for
                                 ``fsid``, ``type``, ``id`` and ``status``.
        """
        LOG.debug("ServerMonitor.on_server_heartbeat: %s" % fqdn)
        new_server = True
        newly_managed_server = False
        try:
            server_state = self.servers[fqdn]
            new_server = False
        except KeyError:
            # The FQDN is unknown; fall back to looking the server up by
            # hostname so it can be resolved vs. the OSD map.
            hostname = fqdn

            if hostname in self.hostname_to_server:
                server_state = self.hostname_to_server[hostname]
                if not server_state.managed:
                    # Take over a ServerState that was created from OSD map
                    server_state.managed = True
                    old_fqdn = server_state.fqdn
                    # OSD map servers would have faked up FQDN as hostname,
                    # so clear that out
                    del self.servers[old_fqdn]
                    server_state.fqdn = fqdn
                    self.servers[server_state.fqdn] = server_state
                    # Persist once, using the fsid of the first reported
                    # service only (hence the break).
                    for service_name, service in server_heartbeat[
                            'services'].items():
                        self._persister.create_server(
                            Server(
                                fsid=service['fsid'],
                                fqdn=server_state.fqdn,
                                managed=True,
                            ))
                        break
                    new_server = False
                    LOG.info("Server %s went from unmanaged to managed" % fqdn)
                    newly_managed_server = True
                else:
                    # We will go on to treat these as distinct servers even
                    # though they have the same hostname
                    LOG.warning("Hostname clash: FQDNs '%s' and"
                                " '%s' both have hostname %s" %
                                (fqdn, server_state.fqdn, hostname))
        else:
            # The case where hostname == FQDN, we may already have this
            # FQDN in our map from an unmanaged server being reported by
            # hostname.
            if not server_state.managed:
                newly_managed_server = True
                server_state.managed = True
                for service_name, service in server_heartbeat[
                        'services'].items():
                    self._persister.create_server(
                        Server(
                            fsid=service['fsid'],
                            fqdn=server_state.fqdn,
                            managed=True,
                        ))
                    # NOTE(review): this message is only logged when the
                    # heartbeat reports at least one service -- confirm
                    # that is intended.
                    LOG.info("Server %s went from unmanaged to managed" % fqdn)
                    break

        boot_time = datetime.datetime.fromtimestamp(
            server_heartbeat['boot_time'], tz=tz.tzutc())
        if new_server:
            hostname = fqdn
            server_state = ServerState(
                fqdn,
                hostname,
                managed=True,
                last_contact=now(),
                boot_time=boot_time,
                ceph_version=server_heartbeat['ceph_version'])
            self.inject_server(server_state)
            for service_name, service in server_heartbeat['services'].items():
                self._persister.create_server(
                    Server(fsid=service['fsid'],
                           fqdn=server_state.fqdn,
                           hostname=server_state.hostname,
                           managed=server_state.managed,
                           last_contact=server_state.last_contact,
                           boot_time=boot_time,
                           ceph_version=server_heartbeat['ceph_version']))
                # NOTE(review): as above, only logged when at least one
                # service is reported -- confirm that is intended.
                LOG.info("Saw server %s for the first time" % server_state)
                break

        server_state.last_contact = now()
        for service_name, service in server_heartbeat['services'].items():
            self._persister.create_server(
                Server(
                    fsid=service['fsid'],
                    fqdn=server_state.fqdn,
                    last_contact=server_state.last_contact,
                ))
            break

        if server_state.boot_time != boot_time:
            LOG.warning("{0} boot time changed, old {1} new {2}".format(
                server_state.fqdn, server_state.boot_time, boot_time))
            old_boot_time = server_state.boot_time
            server_state.boot_time = boot_time
            for service_name, service in server_heartbeat['services'].items():
                self._persister.create_server(
                    Server(
                        fsid=service['fsid'],
                        fqdn=server_state.fqdn,
                        boot_time=server_state.boot_time,
                    ))
                break

            if old_boot_time is not None:
                # i.e. a reboot, not an unmanaged->managed transition
                if server_state.boot_time < old_boot_time:
                    LOG.warning("Server boot time went backwards")
                elif server_state.boot_time - old_boot_time < REBOOT_THRESHOLD:
                    LOG.warning("Server boot time changed, but only a little")
                else:
                    # A substantial forward change in boot time, that's a
                    # reboot: emit a user visible event
                    LOG.warning("{0} rebooted!".format(fqdn))
                    self._eventer.on_reboot(server_state, False)

        if server_state.ceph_version != server_heartbeat['ceph_version']:
            # Interpret "no package installed but some services running" as
            # meaning we're in the process of upgrading.
            upgrading = (server_heartbeat['ceph_version'] is None
                         and server_heartbeat['services'])
            # Ignore version=None while upgrading to avoid generating
            # spurious "ceph uninstalled" events
            if not upgrading:
                server_state.ceph_version = server_heartbeat['ceph_version']
                for service_name, service in server_heartbeat[
                        'services'].items():
                    self._persister.create_server(
                        Server(
                            fsid=service['fsid'],
                            fqdn=server_state.fqdn,
                            ceph_version=server_state.ceph_version,
                        ))
                    break

                if not (new_server or newly_managed_server):
                    self._eventer.on_new_version(server_state)

        seen_id_tuples = set()
        for service_name, service in server_heartbeat['services'].items():
            id_tuple = ServiceId(service['fsid'], service['type'],
                                 service['id'])
            seen_id_tuples.add(id_tuple)
            self._register_service(server_state,
                                   id_tuple,
                                   running=True,
                                   status=service['status'],
                                   fsid=service['fsid'],
                                   fqdn=fqdn)

        # For any service which was last reported on this server but
        # is now gone, mark it as not running. This is a plain set
        # difference: a symmetric difference would also pick up IDs that
        # were just reported but are missing from server_state.services,
        # and wrongly flag them as stopped.
        for unseen_id_tuple in set(
                server_state.services.keys()) - seen_id_tuples:
            service_state = self.services[unseen_id_tuple]
            if service_state.running:
                LOG.info("Service %s stopped on server %s" %
                         (service_state, server_state))
                service_state.running = False

        if new_server or newly_managed_server:
            # We do this at the end so that by the time we emit the event
            # the ServiceState objects have been created
            self._eventer.on_server(server_state)
Beispiel #7
0
    def on_osd_map(self, osd_map):
        """Reconcile server and OSD records against a new OSD map.

        Hosts may be inferred from the CRUSH map when they are not all
        sending us data with salt: a hostname that is not yet known is
        recorded as an unmanaged server, OSDs listed under unmanaged
        servers are registered, and OSD records for this FSID that the
        map no longer mentions are forgotten.

        :param osd_map: The data from an OsdMap sync object
        """
        LOG.debug("ServerMonitor.on_osd_map: epoch %s" % osd_map['epoch'])

        osds_by_host = self.get_hostname_to_osds(osd_map)
        LOG.debug("ServerMonitor.on_osd_map: got service data for %s servers"
                  % len(osds_by_host))

        mapped_ids = set()
        for hostname, host_osds in osds_by_host.items():
            osd_by_id = dict(
                (ServiceId(osd_map['fsid'], 'osd', str(entry['osd'])), entry)
                for entry in host_osds)
            mapped_ids.update(osd_by_id)

            # A name we do not know as a hostname is a CRUSH alias rather
            # than a real host if any of its OSDs is already recorded as a
            # child of a managed server. The scan does not stop at the
            # first hit, so the last matching server is the one reported.
            alias_target = None
            if hostname not in self.hostname_to_server:
                for service_id in osd_by_id:
                    try:
                        known_service = self.services[service_id]
                        if known_service.server_state.managed:
                            alias_target = known_service.server_state
                    except KeyError:
                        continue

            if alias_target:
                LOG.info("'{0}' is a CRUSH alias to {1}".format(
                    hostname, alias_target))
                continue

            # Find the ServerState for the host named in the CRUSH map,
            # creating (and persisting) an unmanaged one if it is new.
            try:
                server_state = self.hostname_to_server[hostname]
            except KeyError:
                # Fake FQDN to equal hostname
                server_state = ServerState(
                    hostname,
                    hostname,
                    managed=False,
                    last_contact=None,
                    boot_time=None,
                    ceph_version=None,
                )
                self.inject_server(server_state)
                self._persister.create_server(
                    Server(
                        fsid=osd_map['fsid'],
                        fqdn=server_state.fqdn,
                        hostname=server_state.hostname,
                        managed=server_state.managed,
                    ))

            # Attach the OSDs reported under this hostname to the server
            for service_id, osd in osd_by_id.items():
                if server_state.managed:
                    # Managed servers rely on ceph/server salt messages,
                    # so the map is only used for unmanaged ones.
                    continue
                self._register_service(
                    server_state,
                    service_id,
                    bool(osd['up']),
                    None,
                    fsid=osd_map['fsid'],
                    fqdn=server_state.fqdn,
                )

        # Drop the ServiceState of every OSD recorded for this FSID that
        # the map did not mention under any hostname
        recorded_ids = set()
        for service in self.fsid_services[osd_map['fsid']]:
            if service.service_type == 'osd':
                recorded_ids.add(service.id)
        for stale_service_id in recorded_ids.difference(mapped_ids):
            self.forget_service(self.services[stale_service_id])
Beispiel #8
0
 def id(self):
     """Build the ``ServiceId`` for this object.

     :return: ``ServiceId`` made from ``fsid``, ``service_type`` and
              ``service_id``, in that order.
     """
     parts = (self.fsid, self.service_type, self.service_id)
     return ServiceId(*parts)