def status_by_service(self, services):
    """Report the current state of each requested service.

    :param services: Iterable of argument sequences; each one is unpacked
        into a ``ServiceId``.
    :return: A list with one entry per item yielded by
        ``servers.get_services``: a dict with ``running``, ``server`` (the
        FQDN of the service's server) and ``status`` keys, or ``None``
        where the looked-up entry is falsy.
    """
    get_services = self._manager.servers.get_services
    states = get_services([ServiceId(*spec) for spec in services])

    summaries = []
    for state in states:
        if not state:
            # Nothing known for this service: keep a placeholder so the
            # output stays aligned with the lookup result.
            summaries.append(None)
            continue
        summaries.append({
            'running': state.running,
            'server': state.server_state.fqdn,
            'status': state.status,
        })
    return summaries
def _get_fqdn(self, fsid, service_type, service_id):
    """Resolve a service to a FQDN if possible, else return None.

    :param fsid: First component of the ``ServiceId`` to look up.
    :param service_type: Second component of the ``ServiceId``.
    :param service_id: Third component; converted with ``str()`` before
        the lookup.
    :return: The ``fqdn`` of the server found by
        ``servers.get_by_service``, or ``None`` when no usable server is
        returned. A warning is logged when the lookup returns ``None``.
    """
    server = self._manager.servers.get_by_service(
        ServiceId(fsid, service_type, str(service_id)))
    if server is None:
        # Logger.warn is a deprecated alias; use warning() and let the
        # logging framework do the interpolation.
        LOG.warning("No server found for service %s %s",
                    service_type, service_id)
    return server.fqdn if server else None
def server_by_service(self, services):
    """Map service IDs to the FQDN of the server each one lives on.

    The value handed back is whatever ``servers.list_by_service`` returns,
    documented as a list of ``(service ID, server FQDN)`` 2-tuples. A dict
    would be the more natural shape, but tuple dict keys are awkward to
    serialize.

    :param services: Iterable of argument sequences; each one is unpacked
        into a ``ServiceId``.
    """
    servers = self._manager.servers
    return servers.list_by_service([ServiceId(*spec) for spec in services])
def on_mon_map(self, mon_map):
    """Forget mon ServiceState records that a new mon map no longer lists.

    :param mon_map: Mon map data; its ``fsid`` entry and the ``name`` of
        each item under ``mons`` are read here.
    """
    # Every mon the map says exists right now.
    mons_in_map = set(
        ServiceId(mon_map['fsid'], 'mon', mon['name'])
        for mon in mon_map['mons'])

    # Every mon we currently hold a record for in this cluster.
    mons_on_record = set(
        svc.id
        for svc in self.fsid_services[mon_map['fsid']]
        if svc.service_type == 'mon')

    for stale_id in mons_on_record.difference(mons_in_map):
        self.forget_service(self.services[stale_id])
def on_mds_map(self, fsid, mds_map):
    """Forget MDS ServiceState records that a new MDS map no longer lists.

    :param fsid: Pass in fsid string because mds map doesn't include it
    :param mds_map: The MDS map sync object; the ``name`` of each value
        under ``info`` is read here.
    """
    # Every MDS the map says exists right now.
    mds_in_map = set(
        ServiceId(fsid, 'mds', daemon['name'])
        for daemon in mds_map['info'].values())

    # Every MDS we currently hold a record for in this cluster.
    mds_on_record = set(
        svc.id
        for svc in self.fsid_services[fsid]
        if svc.service_type == 'mds')

    for stale_id in mds_on_record.difference(mds_in_map):
        self.forget_service(self.services[stale_id])
def on_server_heartbeat(self, fqdn, server_heartbeat):
    """Call back for when a ceph.service message is received from a salt
    minion.

    This is actually a fairly simple operation of updating the in memory
    ServerState to reflect what is in the message, but it's convoluted
    because we may be seeing a new server, a known server, or a server
    which was known but unmanaged.

    :param fqdn: FQDN of the server the heartbeat describes.
    :param server_heartbeat: Mapping read for ``boot_time`` (a timestamp),
        ``ceph_version`` and ``services``; each value under ``services``
        is read for ``fsid``, ``type``, ``id`` and ``status``.

    Persisted updates are written via ``self._persister.create_server``
    using the ``fsid`` of the first service in the heartbeat only (each
    persisting loop breaks after one iteration), so a heartbeat with no
    services persists nothing.
    """
    LOG.debug("ServerMonitor.on_server_heartbeat: %s" % fqdn)
    new_server = True
    newly_managed_server = False
    try:
        server_state = self.servers[fqdn]
        new_server = False
    except KeyError:
        # The FQDN is used as the hostname when resolving this server
        # against the ones inferred from the OSD map.
        hostname = fqdn

        if hostname in self.hostname_to_server:
            server_state = self.hostname_to_server[hostname]
            if not server_state.managed:
                # Take over a ServerState that was created from OSD map
                server_state.managed = True
                old_fqdn = server_state.fqdn
                # OSD map servers would have faked up FQDN as hostname,
                # so clear that out
                del self.servers[old_fqdn]
                server_state.fqdn = fqdn
                self.servers[server_state.fqdn] = server_state
                for service in server_heartbeat['services'].values():
                    self._persister.create_server(
                        Server(
                            fsid=service['fsid'],
                            fqdn=server_state.fqdn,
                            managed=True,
                        ))
                    break
                new_server = False
                LOG.info("Server %s went from unmanaged to managed" % fqdn)
                newly_managed_server = True
            else:
                # We will go on to treat these as distinct servers even
                # though they have the same hostname
                LOG.warning("Hostname clash: FQDNs '%s' and"
                            " '%s' both have hostname %s" %
                            (fqdn, server_state.fqdn, hostname))
    else:
        # The case where hostname == FQDN, we may already have this
        # FQDN in our map from an unmanaged server being reported by
        # hostname.
        if not server_state.managed:
            newly_managed_server = True
            server_state.managed = True
            for service in server_heartbeat['services'].values():
                self._persister.create_server(
                    Server(
                        fsid=service['fsid'],
                        fqdn=server_state.fqdn,
                        managed=True,
                    ))
                LOG.info("Server %s went from unmanaged to managed" % fqdn)
                break

    boot_time = datetime.datetime.fromtimestamp(
        server_heartbeat['boot_time'], tz=tz.tzutc())
    if new_server:
        hostname = fqdn
        server_state = ServerState(
            fqdn,
            hostname,
            managed=True,
            last_contact=now(),
            boot_time=boot_time,
            ceph_version=server_heartbeat['ceph_version'])
        self.inject_server(server_state)
        for service in server_heartbeat['services'].values():
            self._persister.create_server(
                Server(fsid=service['fsid'],
                       fqdn=server_state.fqdn,
                       hostname=server_state.hostname,
                       managed=server_state.managed,
                       last_contact=server_state.last_contact,
                       boot_time=boot_time,
                       ceph_version=server_heartbeat['ceph_version']))
            LOG.info("Saw server %s for the first time" % server_state)
            break

    server_state.last_contact = now()
    for service in server_heartbeat['services'].values():
        self._persister.create_server(
            Server(
                fsid=service['fsid'],
                fqdn=server_state.fqdn,
                last_contact=server_state.last_contact,
            ))
        break

    if server_state.boot_time != boot_time:
        LOG.warning("{0} boot time changed, old {1} new {2}".format(
            server_state.fqdn, server_state.boot_time, boot_time))
        old_boot_time = server_state.boot_time
        server_state.boot_time = boot_time
        for service in server_heartbeat['services'].values():
            self._persister.create_server(
                Server(
                    fsid=service['fsid'],
                    fqdn=server_state.fqdn,
                    boot_time=server_state.boot_time,
                ))
            break
        if old_boot_time is not None:
            # i.e. a reboot, not an unmanaged->managed transition
            if server_state.boot_time < old_boot_time:
                LOG.warning("Server boot time went backwards")
            elif server_state.boot_time - old_boot_time < REBOOT_THRESHOLD:
                LOG.warning("Server boot time changed, but only a little")
            else:
                # A substantial forward change in boot time, that's a
                # reboot: emit a user visible event
                LOG.warning("{0} rebooted!".format(fqdn))
                self._eventer.on_reboot(server_state, False)

    if server_state.ceph_version != server_heartbeat['ceph_version']:
        # Interpret "no package installed but some services running" as
        # meaning we're in the process of upgrading.
        upgrading = (server_heartbeat['ceph_version'] is None
                     and server_heartbeat['services'])
        # Ignore version=None while upgrading to avoid generating
        # spurious "ceph uninstalled" events
        if not upgrading:
            server_state.ceph_version = server_heartbeat['ceph_version']
            for service in server_heartbeat['services'].values():
                self._persister.create_server(
                    Server(
                        fsid=service['fsid'],
                        fqdn=server_state.fqdn,
                        ceph_version=server_state.ceph_version,
                    ))
                break
            if not (new_server or newly_managed_server):
                self._eventer.on_new_version(server_state)

    seen_id_tuples = set()
    for service in server_heartbeat['services'].values():
        id_tuple = ServiceId(service['fsid'], service['type'],
                             service['id'])
        seen_id_tuples.add(id_tuple)
        self._register_service(server_state,
                               id_tuple,
                               running=True,
                               status=service['status'],
                               fsid=service['fsid'],
                               fqdn=fqdn)

    # For any service which was last reported on this server but
    # is now gone, mark it as not running. This must be a plain set
    # difference: a symmetric difference would also pick up services that
    # were just reported but are absent from server_state.services, and
    # wrongly flag them as stopped.
    for unseen_id_tuple in set(server_state.services.keys()) - seen_id_tuples:
        service_state = self.services[unseen_id_tuple]
        if service_state.running:
            LOG.info("Service %s stopped on server %s" %
                     (service_state, server_state))
            service_state.running = False

    if new_server or newly_managed_server:
        # We do this at the end so that by the time we emit the event
        # the ServiceState objects have been created
        self._eventer.on_server(server_state)
def on_osd_map(self, osd_map):
    """Reconcile server and OSD service records against a new OSD map.

    Hosts that are not all reporting via salt can still be inferred from
    the CRUSH map, so unknown hostnames get an unmanaged ServerState and
    their OSDs are registered against it. OSD records for this FSID that
    the map no longer mentions are forgotten.

    :param osd_map: The data from an OsdMap sync object; read here for
        ``epoch`` and ``fsid``, and passed to ``get_hostname_to_osds``.
    """
    LOG.debug("ServerMonitor.on_osd_map: epoch %s" % osd_map['epoch'])

    hostname_to_osds = self.get_hostname_to_osds(osd_map)
    LOG.debug("ServerMonitor.on_osd_map: got service data for %s servers"
              % len(hostname_to_osds))

    seen_osd_ids = set()
    for hostname, host_osds in hostname_to_osds.items():
        osd_by_id = dict(
            (ServiceId(osd_map['fsid'], 'osd', str(entry['osd'])), entry)
            for entry in host_osds)
        seen_osd_ids.update(osd_by_id.keys())

        # A hostname we do not know, whose OSDs already belong to a
        # managed host, is a CRUSH alias rather than a real hostname.
        alias_target = None
        if hostname not in self.hostname_to_server:
            for osd_id in osd_by_id:
                try:
                    known_service = self.services[osd_id]
                    if known_service.server_state.managed:
                        alias_target = known_service.server_state
                except KeyError:
                    pass

        if alias_target:
            LOG.info("'{0}' is a CRUSH alias to {1}".format(
                hostname, alias_target))
            continue

        # Fetch the ServerState for the host named in the CRUSH map,
        # creating an unmanaged one on first sight.
        try:
            host_state = self.hostname_to_server[hostname]
        except KeyError:
            # No FQDN is available here, so the hostname doubles as one.
            host_state = ServerState(hostname,
                                     hostname,
                                     managed=False,
                                     last_contact=None,
                                     boot_time=None,
                                     ceph_version=None)
            self.inject_server(host_state)
            self._persister.create_server(
                Server(fsid=osd_map['fsid'],
                       fqdn=host_state.fqdn,
                       hostname=host_state.hostname,
                       managed=host_state.managed))

        # Attach the OSDs listed under this hostname to its ServerState.
        for osd_id, entry in osd_by_id.items():
            if host_state.managed:
                # Managed servers are tracked through ceph/server salt
                # messages instead; only unmanaged ones are fed from the
                # OSD map.
                continue
            self._register_service(host_state,
                                   osd_id,
                                   bool(entry['up']),
                                   None,
                                   fsid=osd_map['fsid'],
                                   fqdn=host_state.fqdn)

    # Drop the ServiceState of every OSD in this FSID that
    # hostname_to_osds did not mention.
    recorded_osd_ids = set(
        svc.id
        for svc in self.fsid_services[osd_map['fsid']]
        if svc.service_type == 'osd')
    for stale_id in recorded_osd_ids - seen_osd_ids:
        self.forget_service(self.services[stale_id])
def id(self):
    """Build the ``ServiceId`` identifying this object.

    :return: ``ServiceId`` constructed from this object's ``fsid``,
        ``service_type`` and ``service_id`` attributes, in that order.
    """
    key_parts = (self.fsid, self.service_type, self.service_id)
    return ServiceId(*key_parts)