def _run(self): log.info("%s running" % self.__class__.__name__) event = SaltEventSource(salt_config) while not self._complete.is_set(): # No salt tag filtering: https://github.com/saltstack/salt/issues/11582 ev = event.get_event(full=True) if ev is not None: tag = ev['tag'] data = ev['data'] try: if tag.startswith("ceph/cluster/"): cluster_data = data['data'] if not cluster_data['fsid'] in self._manager.clusters: self._manager.on_discovery(data['id'], cluster_data) else: log.debug("%s: heartbeat from existing cluster %s" % ( self.__class__.__name__, cluster_data['fsid'])) else: # This does not concern us, ignore it pass except: log.debug("Message content: %s" % data) log.exception("Exception handling message") log.info("%s complete" % self.__class__.__name__)
def _run(self): log.info("%s running" % self.__class__.__name__) event = SaltEventSource(log, salt_config) while not self._complete.is_set(): # No salt tag filtering: https://github.com/saltstack/salt/issues/11582 ev = event.get_event(full=True) if ev is not None and 'tag' in ev: tag = ev['tag'] data = ev['data'] try: if tag.startswith("ceph/cluster/"): cluster_data = data['data'] if not cluster_data['fsid'] in self._manager.clusters: self._manager.on_discovery(data['id'], cluster_data) else: log.debug("%s: heartbeat from existing cluster %s" % ( self.__class__.__name__, cluster_data['fsid'])) elif re.match("^salt/job/\d+/ret/[^/]+$", tag): if data['fun'] == 'saltutil.running': self._manager.requests.on_tick_response(data['id'], data['return']) else: self._manager.requests.on_completion(data) else: # This does not concern us, ignore it log.debug("TopLevelEvents: ignoring %s" % tag) pass except: log.exception("Exception handling message tag=%s" % tag) log.info("%s complete" % self.__class__.__name__)
def _run(self): log.info("Running {0}".format(self.__class__.__name__)) while not self._complete.is_set(): # self._emit_stats() self._complete.wait(self.MONITOR_PERIOD) self._close()
def update(self, node_id, attributes):
    # TODO report Not Modified http://tracker.ceph.com/issues/9764
    current_node = self.osd_map.get_tree_node(node_id)
    parent = self.osd_map.parent_bucket_by_node_id.get(node_id, None)
    name, bucket_type, items = [attributes[key] for key in ('name', 'bucket_type', 'items')]
    commands = []

    # TODO change to use rename-bucket when #9526 lands in ceph 0.89
    if name != current_node['name'] or bucket_type != current_node['type_name']:
        commands.append(add_bucket(name, bucket_type))
        if parent is not None:
            commands.append(move_bucket(name, parent['name'], parent['type']))

    to_remove = [item for item in current_node['items'] if item not in items]
    commands += self._remove_items(name, bucket_type, to_remove)

    for c in self._add_items(name, bucket_type, items):
        if c not in commands:
            commands.append(c)

    if name != current_node['name'] or bucket_type != current_node['type_name']:
        commands.append(remove_bucket(current_node['name'], None))

    log.info("Updating CRUSH node {c} parent {p} version {v}".format(
        c=commands, p=parent, v=self.osd_map.version))

    message = "Updating CRUSH node in {cluster_name}".format(cluster_name=self._cluster_monitor.name)
    return OsdMapModifyingRequest(message, self._cluster_monitor.fsid, self._cluster_monitor.name, commands)
def main():
    log.info('calamari-list: starting')
    complete = gevent.event.Event()

    ceph_argparse = None
    while not ceph_argparse:
        try:
            import ceph_argparse
        except ImportError:
            log.error('Cannot import ceph_argparse module -- please install ceph')
            complete.wait(timeout=50)

    from cthulhu.manager.manager import Manager

    carbon = ShallowCarbonCache()
    carbon.start()

    cthulhu = Manager()
    cthulhu_started = False
    while not cthulhu_started:
        try:
            if not cthulhu_started:
                cthulhu_started = cthulhu.start()
        except Exception, e:
            log.exception('It borked')
            log.error(str(e))
            complete.wait(timeout=5)
def on_fetch_complete(self, minion_id, sync_type, version, data):
    """
    :return A SyncObject if this version was new to us, else None
    """
    log.debug("SyncObjects.on_fetch_complete %s/%s/%s" % (minion_id, sync_type.str, version))
    self._fetching_at[sync_type] = None

    # A fetch might give us a newer version than we knew we had asked for
    if sync_type.cmp(version, self._known_versions[sync_type]) > 0:
        self._known_versions[sync_type] = version

    # Don't store this if we already got something newer
    if sync_type.cmp(version, self.get_version(sync_type)) <= 0:
        log.warn("Ignoring outdated update %s/%s from %s" % (sync_type.str, version, minion_id))
        new_object = None
    else:
        log.info("Got new version %s/%s" % (sync_type.str, version))
        new_object = self.set_map(sync_type, version, data)

    # This might not be the latest: if it's not, send out another fetch
    # right away
    if sync_type.cmp(self._known_versions[sync_type], version) > 0:
        self.fetch(minion_id, sync_type)

    return new_object
def _run(self): log.info("Starting %s" % self.__class__.__name__) threads = [gevent.spawn(self.run_plugin, name, status_processor.run, status_processor.period) for name, status_processor in self.load_plugins()] gevent.joinall(threads)
def stop(self): log.info("%s stopping" % self.__class__.__name__) for monitor in self.clusters.values(): monitor.stop() self._rpc_thread.stop() self._discovery_thread.stop() self._process_monitor.stop() self.eventer.stop() self._request_ticker.stop()
def filter_errors(self, check_data, salt_name):
    filtered_output = {}
    for node, results in check_data.iteritems():
        if results == '"%s" is not available.' % salt_name:
            log.info(node + results)
        else:
            filtered_output[node] = results
    return filtered_output
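# Illustrative sketch (not part of the original module): the expected behaviour
# of filter_errors() for a typical salt return dict. The node names, payloads,
# and the "graphite" module name below are made up for this example.
def _filter_errors_example():
    check_data = {
        'node1.example.com': {'status': 'OK'},
        'node2.example.com': '"graphite" is not available.',
    }
    salt_name = 'graphite'
    # Minions whose module is missing are logged and dropped; everything else
    # passes through unchanged.
    filtered = dict((node, results) for node, results in check_data.items()
                    if results != '"%s" is not available.' % salt_name)
    assert filtered == {'node1.example.com': {'status': 'OK'}}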
def _emit(self, severity, message, **associations):
    """
    :param severity: One of the defined severity values
    :param message: One line human readable string
    :param associations: Optional extra attributes to associate the event
                         with a particular cluster/server/service
    """
    log.info("Eventer._emit: %s/%s" % (severity_str(severity), message))
    self._events.append(Event(severity, message, **associations))
def _run(self): log.info("%s running" % self.__class__.__name__) remote = get_remote() remote.listen(self._complete, on_heartbeat=self.on_heartbeat, on_job=self.on_job, on_running_jobs=self._manager.requests.on_tick_response) log.info("%s complete" % self.__class__.__name__)
def join(self): log.info("%s joining" % self.__class__.__name__) self._rpc_thread.join() self._discovery_thread.join() self._process_monitor.join() self.persister.join() self.eventer.join() self._request_ticker.join() self.servers.join() for monitor in self.clusters.values(): monitor.join()
def _run(self):
    assert self._bound

    while not self._complete.is_set():
        try:
            log.info("%s run..." % self.__class__.__name__)
            self._server.run()
        except:
            log.error(traceback.format_exc())
            self._complete.wait(self.EXCEPTION_BACKOFF)

    log.info("%s complete..." % self.__class__.__name__)
def start(self): log.info("%s starting" % self.__class__.__name__) self._rpc_thread.bind() self._rpc_thread.start() self._discovery_thread.start() self._process_monitor.start() self.persister.start() self.eventer.start() self._request_ticker.start() self.servers.start() return True
def on_version(self, reported_by, sync_type, new_version):
    """
    Notify me that a particular version of a particular map exists.

    I may choose to initiate RPC to retrieve the map
    """
    log.debug("SyncObjects.on_version %s/%s/%s" % (reported_by, sync_type.str, new_version))
    old_version = self.get_version(sync_type)
    if sync_type.cmp(new_version, old_version) > 0:
        known_version = self._known_versions[sync_type]
        if sync_type.cmp(new_version, known_version) > 0:
            # We are out of date: request an up to date copy
            log.info("Advanced known version %s/%s %s->%s" % (
                self._cluster_name, sync_type.str, known_version, new_version))
            self._known_versions[sync_type] = new_version
        else:
            log.info("on_version: %s is newer than %s" % (new_version, old_version))

        # If we already have a request out for this type of map, then consider
        # cancelling it if we've already waited for a while.
        if self._fetching_at[sync_type] is not None:
            if now() - self._fetching_at[sync_type] < self.FETCH_TIMEOUT:
                log.info("Fetch already underway for %s" % sync_type.str)
                return
            else:
                log.warn("Abandoning fetch for %s started at %s" % (
                    sync_type.str, self._fetching_at[sync_type]))

        log.info("on_version: fetching %s/%s from %s, currently got %s, know %s" % (
            sync_type, new_version, reported_by, old_version, known_version))
        self.fetch(reported_by, sync_type)
def on_discovery(self, minion_id, heartbeat_data):
    log.info("on_discovery: {0}/{1}".format(minion_id, heartbeat_data['fsid']))
    cluster_monitor = ClusterMonitor(heartbeat_data['fsid'], heartbeat_data['name'],
                                     self.persister, self.servers, self.eventer, self.requests)
    self.clusters[heartbeat_data['fsid']] = cluster_monitor

    # Run before passing on the heartbeat, because otherwise the
    # syncs resulting from the heartbeat might not be received
    # by the monitor.
    cluster_monitor.start()
    # Wait for ClusterMonitor to start accepting events before asking it
    # to do anything
    cluster_monitor.ready()
    cluster_monitor.on_heartbeat(minion_id, heartbeat_data)
def _run(self): self._plugin_monitor.start() self._ready.set() log.debug("ClusterMonitor._run: ready") remote.listen(self._complete, on_heartbeat=self.on_heartbeat, fsid=self.fsid, on_job=self.on_job_complete) log.info("%s complete" % self.__class__.__name__) self._plugin_monitor.stop() self._plugin_monitor.join() self.done.set()
def _run(self): log.info("Persister listening") while not self._complete.is_set(): try: data = self._queue.get(block=True, timeout=1) except gevent.queue.Empty: continue else: try: data.fn(*data.args, **data.kwargs) self._session.commit() except Exception: # Catch-all because all kinds of things can go wrong and our # behaviour is the same: log the exception, the data that # caused it, then try to go back to functioning. log.exception("Persister exception persisting data: %s" % (data.fn,)) self._session.rollback()
def tick(self):
    """
    For walltime-based monitoring of running requests.  Long-running requests
    get a periodic call to saltutil.running to verify that things really are
    still happening.
    """
    if not self._by_jid:
        return
    else:
        log.debug("RequestCollection.tick: %s JIDs underway" % len(self._by_jid))

    # Identify JIDs who haven't had a saltutil.running response for too long.
    # Kill requests in a separate phase because request:JID is not 1:1
    stale_jobs = set()
    _now = now()
    for request in self._by_jid.values():
        if _now - request.alive_at > datetime.timedelta(seconds=TICK_PERIOD * 3):
            log.error("Request %s JID %s stale: now=%s, alive_at=%s" % (
                request.id, request.jid, _now, request.alive_at))
            stale_jobs.add(request)

    # Any identified stale jobs are errored out.
    for request in stale_jobs:
        with self._update_index(request):
            request.set_error("Lost contact")
            request.jid = None
            request.complete()

    # Identify minions associated with JIDs in flight
    query_minions = set()
    for jid, request in self._by_jid.items():
        query_minions.add(request.minion_id)

    # Attempt to emit a saltutil.running to ping jobs, next tick we
    # will see if we got updates to the alive_at attribute to indicate non-staleness
    if query_minions:
        log.info("RequestCollection.tick: sending saltutil.running to {0}".format(query_minions))
        client = LocalClient(config.get('cthulhu', 'salt_config_path'))
        pub_data = client.run_job(list(query_minions), 'saltutil.running', [], expr_form="list")
        if not pub_data:
            log.warning("Failed to publish saltutil.running to {0}".format(query_minions))
def _pool_min_size(self, req_size, req_min_size):
    '''
    Find an appropriate "min_size" parameter for a pool create operation

    req_size is requested pool size; 0 means "use osd_pool_default_size"
    req_min_size is requested min size

    Used in both create and update
    '''
    ceph_config = self._cluster_monitor.get_sync_object_data(Config)
    size = req_size or int(ceph_config.get('osd_pool_default_size'), 0)
    min_size = req_min_size or \
        int(ceph_config.get('osd_pool_default_min_size'), 0)
    if min_size:
        ret_min_size = min(min_size, size)
    else:
        ret_min_size = size - size / 2
    log.info('_pool_min_size: size %d, min_size %d, ret %d' %
             (size, min_size, ret_min_size))
    return ret_min_size
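# Worked example (illustrative only, mirroring the fallback rule used by
# _pool_min_size above): with a pool size of 3 and no explicit or configured
# min_size, "size - size / 2" yields 2, and an explicit min_size is always
# capped at size. The numbers below are made up for the example.
def _pool_min_size_example():
    size = 3
    # No min_size requested and no osd_pool_default_min_size configured:
    assert size - size // 2 == 2
    # Requested min_size larger than size is clamped down to size:
    assert min(5, size) == 3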
def start(self): log.info("%s starting" % self.__class__.__name__) # Before we start listening to the outside world, recover # our last known state from persistent storage try: self._recover() except: log.exception("Recovery failed") os._exit(-1) self._rpc_thread.bind() self._rpc_thread.start() self._discovery_thread.start() self._process_monitor.start() self.notifier.start() self.persister.start() self.eventer.start() self.servers.start()
def _emit_stats(self):
    try:
        if not self._socket:
            log.info("Opening carbon socket {0}:{1}".format(self.CARBON_HOST, self.CARBON_PORT))
            self._socket = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
            self._socket.connect((self.CARBON_HOST, self.CARBON_PORT))

        carbon_data = ""
        t = int(time.time())
        usage = resource.getrusage(resource.RUSAGE_SELF)
        for usage_field in ("utime", "stime", "maxrss", "ixrss", "idrss", "isrss",
                            "minflt", "majflt", "nswap", "inblock", "oublock",
                            "msgsnd", "msgrcv", "nsignals", "nvcsw", "nivcsw"):
            val = getattr(usage, "ru_{0}".format(usage_field))
            log.debug("{0}: {1}".format(usage_field, val))
            carbon_data += "calamari.cthulhu.ru_{0} {1} {2}\n".format(usage_field, val, t)

        self._socket.sendall(carbon_data)
    except (socket.gaierror, resource.error):
        log.exception("Failed to send debugging statistics")
        self._close()
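# For reference: each line accumulated into carbon_data above follows the
# Graphite plaintext protocol, "<metric path> <value> <timestamp>\n".
# A minimal sketch of framing a single datapoint; the metric name and value
# are made up for the example.
def _carbon_line_example():
    import time
    metric, value = "calamari.cthulhu.ru_maxrss", 104857
    return "{0} {1} {2}\n".format(metric, value, int(time.time()))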
def load_plugins(self):
    """
    Try to load a status_processor from each module in plugin_path, store keyed by module_name
    """
    loaded_plugins = []

    # FIXME this assumes that plugin_path has been added to PYTHONPATH and/or is in site-packages
    plugin_path = config.get('cthulhu', 'plugin_path')
    if os.path.exists(plugin_path):
        for plugin in os.listdir(plugin_path):
            plugin = plugin.split('.')[0]
            if plugin in ('__init__', 'README'):
                continue

            status_processor = None
            try:
                plugin_module = importlib.import_module('.'.join((plugin, 'status_processor')))
                status_processor = plugin_module.StatusProcessor()
            except ImportError, e:
                log.info("Error importing plugin %s %s" % (plugin, str(e)))

            if status_processor is not None:
                loaded_plugins.append((plugin, status_processor))

    return loaded_plugins
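# Illustrative sketch of the naming convention load_plugins() relies on: each
# entry under plugin_path is expected to be an importable package <plugin>
# providing a status_processor module that defines StatusProcessor (whose
# run() and period are consumed by the PluginMonitor run loop above).
# "myplugin" is a hypothetical package name used only for this example.
def _plugin_module_name_example():
    plugin = 'myplugin'
    module_name = '.'.join((plugin, 'status_processor'))
    assert module_name == 'myplugin.status_processor'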
def main():
    parser = argparse.ArgumentParser(description='Calamari management service')
    parser.add_argument('--debug', dest='debug', action='store_true',
                        default=False, help='print log to stdout')
    args = parser.parse_args()
    if args.debug:
        handler = logging.StreamHandler(sys.stdout)
        handler.setFormatter(logging.Formatter(cthulhu.log.FORMAT))
        log.addHandler(handler)

    # Instruct salt to use the gevent version of ZMQ
    import zmq.green
    import salt.utils.event
    salt.utils.event.zmq = zmq.green

    if sqlalchemy is not None:
        # Set up gevent compatibility in psycopg2
        import psycogreen.gevent
        psycogreen.gevent.patch_psycopg()

    if manhole is not None:
        # Enable manhole for debugging.  Use oneshot mode
        # for gevent compatibility
        manhole.cry = lambda message: log.info("MANHOLE: %s" % message)
        manhole.install(oneshot_on=signal.SIGUSR1)

    m = Manager()
    m.start()

    complete = gevent.event.Event()

    def shutdown():
        log.info("Signal handler: stopping")
        complete.set()

    gevent.signal(signal.SIGTERM, shutdown)
    gevent.signal(signal.SIGINT, shutdown)

    while not complete.is_set():
        complete.wait(timeout=1)
def on_version(self, reported_by, sync_type, new_version):
    """
    Notify me that a particular version of a particular map exists.

    I may choose to initiate RPC to retrieve the map
    """
    log.debug("SyncObjects.on_version %s/%s/%s" % (reported_by, sync_type.str, new_version))
    old_version = self.get_version(sync_type)
    if sync_type.cmp(new_version, old_version) > 0:
        known_version = self._known_versions[sync_type]
        if sync_type.cmp(new_version, known_version) > 0:
            # We are out of date: request an up to date copy
            log.info("Advanced known version %s/%s %s->%s" % (
                self._cluster_name, sync_type.str, known_version, new_version))
            self._known_versions[sync_type] = new_version
        else:
            log.info("on_version: %s is newer than %s" % (new_version, old_version))

        # If we already have a request out for this type of map, then consider
        # cancelling it if we've already waited for a while.
        if self._fetching_at[sync_type] is not None:
            if now() - self._fetching_at[sync_type] < self.FETCH_TIMEOUT:
                log.info("Fetch already underway for %s" % sync_type.str)
                return
            else:
                log.warn("Abandoning fetch for %s started at %s" % (
                    sync_type.str, self._fetching_at[sync_type]))

        log.info("on_version: fetching %s/%s from %s, currently got %s, know %s" % (
            sync_type, new_version, reported_by, old_version, known_version))
        self.fetch(reported_by, sync_type)
def on_completion(self, data):
    """
    Callback for when a salt/job/<jid>/ret event is received, in which we
    find the UserRequest that created the job, and inform it of completion
    so that it can progress.
    """
    with self._lock:
        jid = data['jid']
        result = data['return']
        log.debug("on_completion: jid=%s data=%s" % (jid, data))

        try:
            request = self.get_by_jid(jid)
            log.debug("on_completion: jid %s belongs to request %s" % (jid, request.id))
        except KeyError:
            log.warning("on_completion: unknown jid {0}".format(jid))
            return

        if not data['success']:
            # This indicates a failure at the salt level, i.e. job threw an exception
            log.error("Remote execution failed for request %s: %s" % (request.id, result))
            if isinstance(result, dict):
                # Handler ran and recorded an error for us
                request.set_error(result['error_status'])
            else:
                # An exception, probably, stringized by salt for us
                request.set_error(result)
            request.complete()
        elif result['error']:
            # This indicates a failure within ceph.rados_commands which was caught
            # by our code, like one of our Ceph commands returned an error code.
            # NB in future there may be UserRequest subclasses which want to receive
            # and handle these errors themselves, so this branch would be refactored
            # to allow that.
            log.error("Request %s experienced an error: %s" % (request.id, result['error_status']))
            request.jid = None
            request.set_error(result['error_status'])
            request.complete()
        else:
            if request.state != UserRequest.SUBMITTED:
                # Unexpected, ignore.
                log.error("Received completion for request %s/%s in state %s" % (
                    request.id, request.jid, request.state))
                return

            try:
                with self._update_index(request):
                    old_jid = request.jid
                    request.complete_jid(result)
                    assert request.jid != old_jid

                    # After a jid completes, requests may start waiting for cluster
                    # map updates, we ask ClusterMonitor to hurry up and get them on
                    # behalf of the request.
                    if request.awaiting_versions:
                        for sync_type, version in request.awaiting_versions.items():
                            if version is not None:
                                log.debug("Notifying SyncObjects of awaited version %s/%s" % (sync_type.str, version))
                                self._sync_objects.on_version(data['id'], sync_type, version)

                        # The request may be waiting for an epoch that we already have, if so
                        # give it to the request right away
                        for sync_type, want_version in request.awaiting_versions.items():
                            got_version = self._sync_objects.get_version(sync_type)
                            if want_version and sync_type.cmp(got_version, want_version) >= 0:
                                log.info("Awaited %s %s is immediately available" % (sync_type, want_version))
                                request.on_map(sync_type, self._sync_objects)
            except Exception as e:
                # Ensure that a misbehaving piece of code in a UserRequest subclass
                # results in a terminated job, not a zombie job
                log.exception("Calling complete_jid for %s/%s" % (request.id, request.jid))
                request.jid = None
                request.set_error("Internal error %s" % e)
                request.complete()

        if request.state == UserRequest.COMPLETE:
            self._eventer.on_user_request_complete(request)
def _run(self): self._plugin_monitor.start() self._ready.set() log.debug("ClusterMonitor._run: ready") event = SaltEventSource(log, salt_config) while not self._complete.is_set(): # No salt tag filtering: https://github.com/saltstack/salt/issues/11582 ev = event.get_event(full=True) if ev is not None: data = ev['data'] tag = ev['tag'] log.debug("_run.ev: %s/tag=%s" % (data['id'] if 'id' in data else None, tag)) # I am interested in the following tags: # - salt/job/<jid>/ret/<minion id> where jid is one that I started # (this includes ceph.rados_command and ceph.get_cluster_object) # - ceph/cluster/<fsid> where fsid is my fsid try: if tag.startswith("ceph/cluster/{0}".format(self.fsid)): # A ceph.heartbeat beacon self.on_heartbeat(data['id'], data['data']) elif re.match("^salt/job/\d+/ret/[^/]+$", tag): if data['fun'] == "saltutil.running": # Update on what jobs are running # It would be nice to filter these down to those which really are for # this cluster, but as long as N_clusters and N_jobs are reasonably small # it's not an efficiency problem. self._requests.on_tick_response(data['id'], data['return']) # It would be much nicer to put the FSID at the start of # the tag, if salt would only let us add custom tags to our jobs. # Instead we enforce a convention that all calamari jobs must include # fsid in their return value. if (not isinstance(data, dict)) or not isinstance(data['return'], dict): # Something not formatted for ClusterMonitor log.warning("Ignoring event %s" % tag) continue if 'fsid' not in data['return'] or data['return']['fsid'] != self.fsid: # Something for a different ClusterMonitor log.debug("Ignoring job return, not for my FSID") continue if data['fun'] == 'ceph.get_cluster_object': # A ceph.get_cluster_object response if not data['success']: log.error("on_sync_object: failure from %s: %s" % (data['id'], data['return'])) continue self.on_sync_object(data['id'], data['return']) else: log.warning("Unexpected function '%s' (%s)" % (data['fun'], tag)) else: # This does not concern us, ignore it pass except: # Because this is our main event handling loop, swallow exceptions # instead of letting them end the world. log.exception("Exception handling message with tag %s" % tag) log.debug("Message content: %s" % data) log.info("%s complete" % self.__class__.__name__) self._plugin_monitor.stop() self._plugin_monitor.join() self.done.set()
def stop(self): log.info("%s stopping" % self.__class__.__name__) self._complete.set()
def on_completion(self, data):
    """
    Callback for when a salt/job/<jid>/ret event is received, in which we
    find the UserRequest that created the job, and inform it of completion
    so that it can progress.
    """
    with self._lock:
        jid = data['jid']
        result = data['return']
        log.debug("on_completion: jid=%s data=%s" % (jid, data))

        try:
            request = self.get_by_jid(jid)
            log.debug("on_completion: jid %s belongs to request %s" % (jid, request.id))
        except KeyError:
            log.warning("on_completion: unknown jid {0}".format(jid))
            return

        if not data['success']:
            # This indicates a failure at the salt level, i.e. job threw an exception
            log.error("Remote execution failed for request %s: %s" % (request.id, result))
            if isinstance(result, dict):
                # Handler ran and recorded an error for us
                request.set_error(result['error_status'])
            else:
                # An exception, probably, stringized by salt for us
                request.set_error(result)
            request.complete()
        elif result['error']:
            # This indicates a failure within ceph.rados_commands which was caught
            # by our code, like one of our Ceph commands returned an error code.
            # NB in future there may be UserRequest subclasses which want to receive
            # and handle these errors themselves, so this branch would be refactored
            # to allow that.
            log.error("Request %s experienced an error: %s" % (request.id, result['error_status']))
            request.jid = None
            request.set_error(result['error_status'])
            request.complete()
        else:
            if request.state != UserRequest.SUBMITTED:
                # Unexpected, ignore.
                log.error("Received completion for request %s/%s in state %s" % (
                    request.id, request.jid, request.state))
                return

            try:
                with self._update_index(request):
                    old_jid = request.jid
                    request.complete_jid(result)
                    assert request.jid != old_jid

                    # After a jid completes, requests may start waiting for cluster
                    # map updates, we ask ClusterMonitor to hurry up and get them on
                    # behalf of the request.
                    if request.awaiting_versions:
                        for sync_type, version in request.awaiting_versions.items():
                            if version is not None:
                                log.debug("Notifying SyncObjects of awaited version %s/%s" % (sync_type.str, version))
                                self._sync_objects.on_version(data['id'], sync_type, version)

                        # The request may be waiting for an epoch that we already have, if so
                        # give it to the request right away
                        for sync_type, want_version in request.awaiting_versions.items():
                            got_version = self._sync_objects.get_version(sync_type)
                            if want_version and sync_type.cmp(got_version, want_version) >= 0:
                                log.info("Awaited %s %s is immediately available" % (sync_type, want_version))
                                request.on_map(sync_type, self._sync_objects)
            except Exception as e:
                # Ensure that a misbehaving piece of code in a UserRequest subclass
                # results in a terminated job, not a zombie job
                log.exception("Calling complete_jid for %s/%s" % (request.id, request.jid))
                request.jid = None
                request.set_error("Internal error %s" % e)
                request.complete()

        if request.state == UserRequest.COMPLETE:
            self._eventer.on_user_request_complete(request)
def bind(self): log.info("%s bind..." % self.__class__.__name__) self._server.bind(config.get('cthulhu', 'rpc_url')) self._bound = True
def shutdown(): log.info("Signal handler: stopping") complete.set()
def _recover(self): if sqlalchemy is None: return session = Session() for server in session.query(Server).all(): log.debug("Recovered server %s" % server.fqdn) assert server.boot_time is None or server.boot_time.tzinfo is not None # expect timezone-aware DB backend self.servers.inject_server( ServerState(fqdn=server.fqdn, hostname=server.hostname, managed=server.managed, last_contact=server.last_contact, boot_time=server.boot_time, ceph_version=server.ceph_version)) for service in session.query(Service).all(): if service.server: server = session.query(Server).get(service.server) else: server = None log.debug("Recovered service %s/%s/%s on %s" % (service.fsid, service.service_type, service.service_id, server.fqdn if server else None)) self.servers.inject_service( ServiceState(fsid=service.fsid, service_type=service.service_type, service_id=service.service_id), server.fqdn if server else None) # I want the most recent version of every sync_object fsids = [(row[0], row[1]) for row in session.query( SyncObject.fsid, SyncObject.cluster_name).distinct(SyncObject.fsid) ] for fsid, name in fsids: cluster_monitor = ClusterMonitor(fsid, name, self.notifier, self.persister, self.servers, self.eventer, self.requests) self.clusters[fsid] = cluster_monitor object_types = [ row[0] for row in session.query(SyncObject.sync_type).filter_by( fsid=fsid).distinct() ] for sync_type in object_types: latest_record = session.query(SyncObject).filter_by( fsid=fsid, sync_type=sync_type).order_by(SyncObject.version.desc(), SyncObject.when.desc())[0] # FIXME: bit of a hack because records persisted only store their 'version' # if it's a real counter version, underlying problem is that we have # underlying data (health, pg_brief) without usable version counters. def md5(raw): hasher = hashlib.md5() hasher.update(raw) return hasher.hexdigest() if latest_record.version: version = latest_record.version else: version = md5(latest_record.data) when = latest_record.when when = when.replace(tzinfo=tzutc()) if cluster_monitor.update_time is None or when > cluster_monitor.update_time: cluster_monitor.update_time = when cluster_monitor.inject_sync_object( None, sync_type, version, msgpack.unpackb(latest_record.data)) for monitor in self.clusters.values(): log.info("Recovery: Cluster %s with update time %s" % (monitor.fsid, monitor.update_time)) monitor.start()
def _recover(self): if sqlalchemy is None: return session = Session() for server in session.query(Server).all(): log.debug("Recovered server %s" % server.fqdn) assert server.boot_time is None or server.boot_time.tzinfo is not None # expect timezone-aware DB backend self.servers.inject_server(ServerState( fqdn=server.fqdn, hostname=server.hostname, managed=server.managed, last_contact=server.last_contact, boot_time=server.boot_time, ceph_version=server.ceph_version )) for service in session.query(Service).all(): if service.server: server = session.query(Server).get(service.server) else: server = None log.debug("Recovered service %s/%s/%s on %s" % ( service.fsid, service.service_type, service.service_id, server.fqdn if server else None )) self.servers.inject_service(ServiceState( fsid=service.fsid, service_type=service.service_type, service_id=service.service_id ), server.fqdn if server else None) # I want the most recent version of every sync_object fsids = [(row[0], row[1]) for row in session.query(SyncObject.fsid, SyncObject.cluster_name).distinct(SyncObject.fsid)] for fsid, name in fsids: cluster_monitor = ClusterMonitor(fsid, name, self.persister, self.servers, self.eventer, self.requests) self.clusters[fsid] = cluster_monitor object_types = [row[0] for row in session.query(SyncObject.sync_type).filter_by(fsid=fsid).distinct()] for sync_type in object_types: latest_record = session.query(SyncObject).filter_by( fsid=fsid, sync_type=sync_type).order_by( SyncObject.version.desc(), SyncObject.when.desc())[0] # FIXME: bit of a hack because records persisted only store their 'version' # if it's a real counter version, underlying problem is that we have # underlying data (health, pg_brief) without usable version counters. def md5(raw): hasher = hashlib.md5() hasher.update(raw) return hasher.hexdigest() if latest_record.version: version = latest_record.version else: version = md5(latest_record.data) when = latest_record.when when = when.replace(tzinfo=tzutc()) if cluster_monitor.update_time is None or when > cluster_monitor.update_time: cluster_monitor.update_time = when cluster_monitor.inject_sync_object(None, sync_type, version, msgpack.unpackb(latest_record.data)) for monitor in self.clusters.values(): log.info("Recovery: Cluster %s with update time %s" % (monitor.fsid, monitor.update_time)) monitor.start()