def fetch(self, minion_id, sync_type):
    log.debug("SyncObjects.fetch: %s/%s" % (minion_id, sync_type))
    if minion_id is None:
        # We're probably being replayed to from the database
        log.warn("SyncObjects.fetch called with minion_id=None")
        return

    self._fetching_at[sync_type] = now()
    client = LocalClient(config.get('cthulhu', 'salt_config_path'))
    # TODO clean up unused 'since' argument
    pub_data = client.run_job(minion_id, 'ceph.get_cluster_object',
                              condition_kwarg([], {'cluster_name': self._cluster_name,
                                                   'sync_type': sync_type.str,
                                                   'since': None}))
    if not pub_data:
        log.error("Failed to start fetch job %s/%s" % (minion_id, sync_type))
        # Don't throw an exception, because if a fetch fails we should end up
        # issuing another on the next heartbeat
    else:
        log.debug("SyncObjects.fetch: jid=%s minions=%s" % (pub_data['jid'], pub_data['minions']))
def __init__(self, fsid, cluster_name, commands):
    """
    Requiring cluster_name and fsid is redundant (ideally everything would
    speak in terms of fsid), but convenient: the librados interface wants a
    cluster name when you create a client, and otherwise we would have to
    look it up via ceph.conf.
    """
    self.log = log.getChild(self.__class__.__name__)

    self.requested_at = now()
    self.completed_at = None

    # This is actually kind of overkill compared with having a counter
    # somewhere, but it's easy.
    self.id = uuid.uuid4().__str__()

    self._minion_id = None
    self._fsid = fsid
    self._cluster_name = cluster_name
    self._commands = commands

    self.jid = None

    self.state = self.NEW
    self.result = None
    self.error = False
    self.error_message = ""

    # Time at which we last believed the current JID to be really running
    self.alive_at = None
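# These request/monitor classes all timestamp with a shared now() helper that
# is not shown in this section. A minimal sketch, assuming it returns a
# timezone-aware UTC datetime -- the arithmetic elsewhere against tz-aware
# values (e.g. boot_time built with tz=tz.tzutc()) requires aware datetimes
# on both sides:
import datetime

from dateutil import tz


def now():
    # Timezone-aware "now"; a naive datetime would raise TypeError when
    # subtracted from the tz-aware timestamps used throughout this section.
    return datetime.datetime.now(tz.tzutc())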
def _submit(self):
    self.jid = remote.run_job(self._minion_id, self._cmd, self._args)
    self.alive_at = now()
    self.log.info("Request %s started job %s" % (self.id, self.jid))

    return self.jid
def _is_favorite(self, minion_id):
    """
    Check if this minion is the one which we are currently treating as the
    primary source of updates, and promote it to be the favourite if the
    current favourite has not sent a heartbeat recently enough (its contact
    period times FAVORITE_TIMEOUT_FACTOR).

    :return True if this minion was the favorite or has just been promoted.
    """
    t_now = now()
    self._last_heartbeat[minion_id] = t_now

    if self._favorite_mon is None:
        log.debug("%s is my new favourite" % minion_id)
        self._set_favorite(minion_id)
        return True
    elif minion_id != self._favorite_mon:
        # Consider whether this minion should become my new favourite: has it
        # been too long since my current favourite reported in?
        time_since = t_now - self._last_heartbeat[self._favorite_mon]
        favorite_timeout_s = self._servers.get_contact_period(self._favorite_mon) * FAVORITE_TIMEOUT_FACTOR
        if time_since > datetime.timedelta(seconds=favorite_timeout_s):
            log.debug("My old favourite, %s, has not sent a heartbeat for %s: %s is my new favourite" % (
                self._favorite_mon, time_since, minion_id
            ))
            self._set_favorite(minion_id)

    return minion_id == self._favorite_mon
def on_version(self, reported_by, sync_type, new_version):
    """
    Notify me that a particular version of a particular map exists.

    I may choose to initiate RPC to retrieve the map.
    """
    log.debug("SyncObjects.on_version %s/%s/%s" % (reported_by, sync_type.str, new_version))
    old_version = self.get_version(sync_type)
    if sync_type.cmp(new_version, old_version) > 0:
        known_version = self._known_versions[sync_type]
        if sync_type.cmp(new_version, known_version) > 0:
            # We are out of date: request an up to date copy
            log.info("Advanced known version %s/%s %s->%s" % (
                self._cluster_name, sync_type.str, known_version, new_version))
            self._known_versions[sync_type] = new_version
        else:
            log.info("on_version: %s is newer than %s" % (new_version, old_version))

        # If we already have a request out for this type of map, then consider
        # cancelling it if we've already waited for a while.
        if self._fetching_at[sync_type] is not None:
            if now() - self._fetching_at[sync_type] < self.FETCH_TIMEOUT:
                log.info("Fetch already underway for %s" % sync_type.str)
                return
            else:
                log.warn("Abandoning fetch for %s started at %s" % (
                    sync_type.str, self._fetching_at[sync_type]))

        log.info("on_version: fetching %s/%s from %s, currently got %s, know %s" % (
            sync_type, new_version, reported_by, old_version, known_version
        ))
        self.fetch(reported_by, sync_type)
def __init__(self, fsid, cluster_name):
    """
    Requiring cluster_name and fsid is redundant (ideally everything would
    speak in terms of fsid), but convenient: the librados interface wants a
    cluster name when you create a client, and otherwise we would have to
    look it up via ceph.conf.
    """
    # getChild isn't in 2.6
    logname = '.'.join((log.name, self.__class__.__name__))
    self.log = logging.getLogger(logname)

    self.requested_at = now()
    self.completed_at = None

    # This is actually kind of overkill compared with having a counter
    # somewhere, but it's easy.
    self.id = uuid.uuid4().__str__()

    self._minion_id = None
    self.fsid = fsid
    self._cluster_name = cluster_name

    self.jid = None

    self.state = self.NEW
    self.result = None
    self.error = False
    self.error_message = ""

    # Time at which we last believed the current JID to be really running
    self.alive_at = None
def complete(self):
    """
    Call this when you're all done.
    """
    assert self.state != self.COMPLETE
    assert self.jid is None

    self.log.info("Request %s completed with error=%s (%s)" % (self.id, self.error, self.error_message))
    self.state = self.COMPLETE
    self.completed_at = now()
def _update_sync_object(self, fsid, name, sync_type, version, when, data):
    self._session.add(SyncObject(
        fsid=fsid,
        cluster_name=name,
        sync_type=sync_type,
        version=version,
        when=when,
        data=msgpack.packb(data)))

    # Time-limited FIFO: drop rows for this (fsid, sync_type) pair that are
    # older than the retention window.
    threshold = now() - CLUSTER_MAP_RETENTION
    self._session.query(SyncObject).filter(
        SyncObject.when < threshold,
        SyncObject.fsid == fsid,
        SyncObject.sync_type == sync_type).delete()
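# _update_sync_object assumes a SyncObject SQLAlchemy model defined elsewhere.
# A hedged sketch of what such a model could look like -- the table name and
# column sizes here are illustrative assumptions, not the project's schema:
from sqlalchemy import Column, DateTime, Integer, LargeBinary, String
from sqlalchemy.ext.declarative import declarative_base

Base = declarative_base()


class SyncObject(Base):
    __tablename__ = 'sync_object'  # hypothetical table name

    id = Column(Integer, primary_key=True)
    fsid = Column(String(40))                  # cluster FSID (UUID string)
    cluster_name = Column(String(64))
    sync_type = Column(String(32))             # e.g. 'osd_map'
    version = Column(Integer, nullable=True)   # None for hash-versioned maps
    when = Column(DateTime)                    # used for retention pruning
    data = Column(LargeBinary)                 # msgpack-packed map payload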
def on_tick(self):
    """
    Periodically call this to drive non-event-driven events (i.e. things
    which are based on walltime checks).
    """
    log.debug("Eventer.on_tick")

    now_utc = now()

    for fqdn, server_state in self._manager.servers.servers.items():
        if not server_state.managed:
            # We don't expect messages from unmanaged servers, so don't
            # worry about whether they sent us one recently.
            continue

        if len(server_state.clusters) == 1:
            # Because Events can only be associated with one FSID, we only make this
            # association for servers with exactly one cluster.  This is a bit cheeky and
            # kind of an unnecessary limitation in the Event DB schema.
            fsid = server_state.clusters[0]
        else:
            fsid = None

        contact_threshold = CONTACT_THRESHOLD_FACTOR * self._manager.servers.get_contact_period(fqdn)
        if now_utc - server_state.last_contact > datetime.timedelta(seconds=contact_threshold):
            if fqdn not in self._servers_complained:
                self._emit(WARNING, "Server {fqdn} is late reporting in, last report at {last}".format(
                    fqdn=fqdn, last=server_state.last_contact
                ), fqdn=fqdn, fsid=fsid)
                self._servers_complained.add(fqdn)
        else:
            if fqdn in self._servers_complained:
                self._emit(RECOVERY, "Server {fqdn} regained contact".format(fqdn=fqdn),
                           fqdn=fqdn, fsid=fsid)
                self._servers_complained.discard(fqdn)

    for fsid, cluster_monitor in self._manager.clusters.items():
        if cluster_monitor.update_time is None or now_utc - cluster_monitor.update_time > datetime.timedelta(
                seconds=CLUSTER_CONTACT_THRESHOLD):
            if fsid not in self._clusters_complained:
                self._clusters_complained.add(fsid)
                self._emit(WARNING, "Cluster '{name}' is late reporting in".format(name=cluster_monitor.name),
                           fsid=fsid)
        else:
            if fsid in self._clusters_complained:
                self._emit(RECOVERY, "Cluster '{name}' regained contact".format(name=cluster_monitor.name),
                           fsid=fsid)
                self._clusters_complained.discard(fsid)

    self._flush()
def _submit(self):
    client = LocalClient(config.get('cthulhu', 'salt_config_path'))
    pub_data = client.run_job(self._minion_id, self._cmd, self._args)
    if not pub_data:
        # FIXME: LocalClient uses 'print' to record the
        # details of what went wrong :-(
        raise PublishError("Failed to publish job")

    self.log.info("Request %s started job %s" % (self.id, pub_data['jid']))
    self.alive_at = now()
    self.jid = pub_data['jid']

    return self.jid
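# The failure path above raises PublishError, which is defined elsewhere. A
# minimal sketch, assuming it is just a plain Exception subclass:
class PublishError(Exception):
    """Raised when a job could not be published to the salt minions."""
    pass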
def tick(self):
    """
    For walltime-based monitoring of running requests.  Long-running
    requests get a periodic call to saltutil.running to verify that things
    really are still happening.
    """
    if not self._by_jid:
        return
    else:
        log.debug("RequestCollection.tick: %s JIDs underway" % len(self._by_jid))

    # Identify JIDs which haven't had a saltutil.running response for too long.
    # Kill requests in a separate phase because request:JID is not 1:1.
    stale_jobs = set()
    _now = now()
    for request in self._by_jid.values():
        if _now - request.alive_at > datetime.timedelta(seconds=TICK_PERIOD * 3):
            log.error("Request %s JID %s stale: now=%s, alive_at=%s" % (
                request.id, request.jid, _now, request.alive_at
            ))
            stale_jobs.add(request)

    # Any identified stale jobs are errored out.
    for request in stale_jobs:
        with self._update_index(request):
            request.set_error("Lost contact")
            request.jid = None
            request.complete()

    # Identify minions associated with JIDs in flight
    query_minions = set()
    for jid, request in self._by_jid.items():
        query_minions.add(request.minion_id)

    # Attempt to emit a saltutil.running to ping jobs; next tick we will see
    # if we got updates to the alive_at attribute to indicate non-staleness.
    if query_minions:
        log.info("RequestCollection.tick: sending saltutil.running to {0}".format(query_minions))
        client = LocalClient(config.get('cthulhu', 'salt_config_path'))
        pub_data = client.run_job(list(query_minions), 'saltutil.running', [], expr_form="list")
        if not pub_data:
            log.warning("Failed to publish saltutil.running to {0}".format(query_minions))
def _emit(self, severity, message, **associations):
    """
    :param severity: One of the defined severity values
    :param message: One-line human-readable string
    :param associations: Optional extra attributes to associate the event
                         with a particular cluster/server/service
    """
    now_utc = now()
    log.info("Eventer._emit: %s/%s/%s" % (now_utc, severity_str(severity), message))

    self._events.append(Event(
        when=now_utc,
        message=message,
        severity=severity,
        **associations
    ))
def on_tick_response(self, minion_id, jobs):
    """
    Update the alive_at parameter of requests to record that they are
    still running remotely.

    :param jobs: The response from a saltutil.running
    """
    log.debug("RequestCollection.on_tick_response: %s from %s" % (len(jobs), minion_id))
    for job in jobs:
        try:
            request = self._by_jid[job['jid']]
        except KeyError:
            # Not one of mine, ignore it
            pass
        else:
            request.alive_at = now()
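# For reference, a hypothetical saltutil.running payload as consumed above;
# only the 'jid' key is actually read, and the other fields (plus the
# request_collection name) are illustrative assumptions:
example_jobs = [
    {'jid': '20140911101112345678', 'fun': 'ceph.rados_commands', 'pid': 4242},
]
request_collection.on_tick_response('mon0.example.com', example_jobs)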
def _submit(self, commands):
    self.log.debug("Request._submit: %s/%s/%s" % (self._minion_id, self._cluster_name, commands))

    client = LocalClient(config.get('cthulhu', 'salt_config_path'))
    pub_data = client.run_job(self._minion_id, 'ceph.rados_commands',
                              [self._fsid, self._cluster_name, commands])
    if not pub_data:
        # FIXME: LocalClient uses 'print' to record the
        # details of what went wrong :-(
        raise PublishError("Failed to publish job")

    self.log.info("Request %s started job %s" % (self.id, pub_data['jid']))
    self.alive_at = now()
    self.jid = pub_data['jid']

    return self.jid
def _submit(self, commands=None): if commands is None: commands = self._commands self.log.debug("%s._submit: %s/%s/%s" % (self.__class__.__name__, self._minion_id, self._cluster_name, commands)) self.jid = remote.run_job(self._minion_id, 'ceph.rados_commands', {'fsid': self.fsid, 'cluster_name': self._cluster_name, 'commands': commands}) self.log.info("Request %s started job %s" % (self.id, self.jid)) self.alive_at = now() return self.jid
def on_tick(self):
    # This procedure is to catch the annoying case of AES key changes (#7836), which are otherwise
    # ignored by minions which are doing only minion->master messaging.  To ensure they
    # pick up on key changes, we actively send them something (doesn't matter what).  To
    # avoid doing this constantly, we only send things to minions which seem to be a little
    # late.

    # After this length of time, doubt a minion enough to send it a message in case
    # it needs a kick to update its key
    def _ping_period(fqdn):
        return datetime.timedelta(seconds=self.get_contact_period(fqdn) * 2)

    t = now()
    late_servers = [s.fqdn for s in self.servers.values()
                    if s.last_contact and (t - s.last_contact) > _ping_period(s.fqdn)]
    log.debug("late servers: %s" % late_servers)
    if late_servers:
        client = LocalClient(config.get('cthulhu', 'salt_config_path'))
        pub = client.pub(late_servers, "test.ping", expr_form='list')
        log.debug(pub)
def fetch(self, minion_id, sync_type):
    log.debug("SyncObjects.fetch: %s/%s" % (minion_id, sync_type))
    if minion_id is None:
        # We're probably being replayed to from the database
        log.warn("SyncObjects.fetch called with minion_id=None")
        return

    self._fetching_at[sync_type] = now()
    try:
        # TODO clean up unused 'since' argument
        jid = remote.run_job(minion_id, 'ceph.get_cluster_object',
                             {'cluster_name': self._cluster_name,
                              'sync_type': sync_type.str,
                              'since': None})
    except Unavailable:
        # Don't throw an exception, because if a fetch fails we should end up
        # issuing another on the next heartbeat
        log.error("Failed to start fetch job %s/%s" % (minion_id, sync_type))
    else:
        log.debug("SyncObjects.fetch: jid=%s" % jid)
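# This remote-flavoured fetch relies on an Unavailable exception from the
# remote layer. A minimal sketch, assuming it simply signals an unreachable
# target minion:
class Unavailable(Exception):
    """Raised by the remote layer when the target minion cannot be reached."""
    pass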
def on_sync_object(self, minion_id, data):
    if minion_id != self._favorite_mon:
        log.debug("Ignoring map from %s, it is not my favourite (%s)" % (minion_id, self._favorite_mon))
        return

    assert data['fsid'] == self.fsid

    sync_object = data['data']

    sync_type = SYNC_OBJECT_STR_TYPE[data['type']]
    new_object = self.inject_sync_object(minion_id, data['type'], data['version'], sync_object)
    if new_object:
        self._requests.on_map(self.fsid, sync_type, new_object)
        self._persister.update_sync_object(
            self.fsid,
            self.name,
            sync_type.str,
            new_object.version if isinstance(new_object.version, int) else None,
            now(),
            sync_object)
    else:
        log.warn("ClusterMonitor.on_sync_object: stale object received from %s" % minion_id)
def _submit(self, commands=None): if commands is None: commands = self._commands self.log.debug("%s._submit: %s/%s/%s" % (self.__class__.__name__, self._minion_id, self._cluster_name, commands)) client = LocalClient(config.get('cthulhu', 'salt_config_path')) pub_data = client.run_job(self._minion_id, 'ceph.rados_commands', [self.fsid, self._cluster_name, commands]) if not pub_data: # FIXME: LocalClient uses 'print' to record the # details of what went wrong :-( raise PublishError("Failed to publish job") self.log.info("Request %s started job %s" % (self.id, pub_data['jid'])) self.alive_at = now() self.jid = pub_data['jid'] return self.jid
def on_server_heartbeat(self, fqdn, server_heartbeat):
    """
    Called when a ceph.service message is received from a salt minion.

    This is actually a fairly simple operation of updating the in-memory
    ServerState to reflect what is in the message, but it's convoluted
    because we may be seeing a new server, a known server, or a server
    which was known but unmanaged.
    """
    log.debug("ServerMonitor.on_server_heartbeat: %s" % fqdn)
    new_server = True
    newly_managed_server = False
    try:
        server_state = self.servers[fqdn]
        new_server = False
    except KeyError:
        # Look up the grains for this server; we need to know its hostname in order
        # to resolve this vs. the OSD map.
        hostname = self._get_grains(fqdn)['host']

        if hostname in self.hostname_to_server:
            server_state = self.hostname_to_server[hostname]
            if not server_state.managed:
                # Take over a ServerState that was created from the OSD map
                server_state.managed = True
                old_fqdn = server_state.fqdn
                # OSD map servers would have faked up FQDN as hostname, so clear that out
                del self.servers[old_fqdn]
                server_state.fqdn = fqdn
                self.servers[server_state.fqdn] = server_state
                self._persister.update_server(old_fqdn, fqdn=fqdn, managed=True)

                new_server = False
                log.info("Server %s went from unmanaged to managed" % fqdn)
                newly_managed_server = True
            else:
                # We will go on to treat these as distinct servers even though
                # they have the same hostname
                log.warn("Hostname clash: FQDNs '%s' and '%s' both have hostname %s" % (
                    fqdn, server_state.fqdn, hostname
                ))
    else:
        # The case where hostname == FQDN: we may already have this FQDN in our
        # map from an unmanaged server being reported by hostname.
        if not server_state.managed:
            newly_managed_server = True
            server_state.managed = True
            self._persister.update_server(server_state.fqdn, managed=True)
            log.info("Server %s went from unmanaged to managed" % fqdn)

    boot_time = datetime.datetime.fromtimestamp(server_heartbeat['boot_time'], tz=tz.tzutc())

    if new_server:
        hostname = self._get_grains(fqdn)['host']
        server_state = ServerState(fqdn, hostname, managed=True,
                                   last_contact=now(), boot_time=boot_time,
                                   ceph_version=server_heartbeat['ceph_version'])
        self.inject_server(server_state)
        self._persister.create_server(Server(
            fqdn=server_state.fqdn,
            hostname=server_state.hostname,
            managed=server_state.managed,
            last_contact=server_state.last_contact
        ))
        log.info("Saw server %s for the first time" % server_state)

    server_state.last_contact = now()
    self._persister.update_server(server_state.fqdn, last_contact=server_state.last_contact)

    if server_state.boot_time != boot_time:
        log.warn("{0} boot time changed, old {1} new {2}".format(
            server_state.fqdn, server_state.boot_time, boot_time
        ))
        old_boot_time = server_state.boot_time
        server_state.boot_time = boot_time
        self._persister.update_server(server_state.fqdn, boot_time=server_state.boot_time)

        if old_boot_time is not None:  # i.e. a reboot, not an unmanaged->managed transition
            if server_state.boot_time < old_boot_time:
                log.warn("Server boot time went backwards")
            elif server_state.boot_time - old_boot_time < REBOOT_THRESHOLD:
                log.warn("Server boot time changed, but only a little")
            else:
                # A substantial forward change in boot time, that's a reboot: emit
                # a user-visible event
                log.warn("{0} rebooted!".format(fqdn))
                self._eventer.on_reboot(server_state, False)

    if server_state.ceph_version != server_heartbeat['ceph_version']:
        # Interpret "no package installed but some services running" as meaning
        # we're in the process of upgrading.
        upgrading = server_heartbeat['ceph_version'] is None and server_heartbeat['services']
        if upgrading:
            # Ignore version=None while upgrading to avoid generating spurious
            # "ceph uninstalled" events
            pass
        else:
            server_state.ceph_version = server_heartbeat['ceph_version']
            self._persister.update_server(server_state.fqdn, ceph_version=server_state.ceph_version)
            if not (new_server or newly_managed_server):
                self._eventer.on_new_version(server_state)

    seen_id_tuples = set()
    for service_name, service in server_heartbeat['services'].items():
        id_tuple = ServiceId(service['fsid'], service['type'], service['id'])
        seen_id_tuples.add(id_tuple)
        self._register_service(server_state, id_tuple, running=True, status=service['status'])

    # For any service which was last reported on this server but
    # is now gone, mark it as not running
    for unseen_id_tuple in set(server_state.services.keys()) ^ seen_id_tuples:
        service_state = self.services[unseen_id_tuple]
        if service_state.running:
            log.info("Service %s stopped on server %s" % (service_state, server_state))
            service_state.running = False

    if new_server or newly_managed_server:
        # We do this at the end so that by the time we emit the event
        # the ServiceState objects have been created
        self._eventer.on_server(server_state)
def __init__(self, severity, message, **associations):
    self.severity = severity
    self.message = message
    self.associations = associations
    self.when = now()
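# Eventer._emit and Eventer.on_tick above refer to severity constants
# (WARNING, RECOVERY) and a severity_str() lookup defined elsewhere. A hedged
# sketch of those pieces -- the exact names and integer values in the real
# module are assumptions here:
RECOVERY, INFO, WARNING, ERROR = range(1, 5)

_SEVERITY_NAMES = {
    RECOVERY: 'RECOVERY',
    INFO: 'INFO',
    WARNING: 'WARNING',
    ERROR: 'ERROR',
}


def severity_str(severity):
    # Human-readable name for a severity constant, as used in log lines
    return _SEVERITY_NAMES[severity]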