def on_fetch_complete(self, minion_id, sync_type, version, data):
    """
    Handle the result of a completed map fetch from a minion.

    :return A SyncObject if this version was new to us, else None
    """
    log.debug("SyncObjects.on_fetch_complete %s/%s/%s" % (minion_id, sync_type.str, version))
    # The outstanding fetch for this type is now resolved.
    self._fetching_at[sync_type] = None

    # The fetch may have handed back a version newer than the one we knew
    # we had asked for: advance our known-version watermark if so.
    if sync_type.cmp(version, self._known_versions[sync_type]) > 0:
        self._known_versions[sync_type] = version

    new_object = None
    if sync_type.cmp(version, self.get_version(sync_type)) > 0:
        # Strictly newer than what we currently hold: store it.
        log.info("Got new version %s/%s" % (sync_type.str, version))
        new_object = self.set_map(sync_type, version, data)
    else:
        # Stale relative to something we already stored: drop it.
        log.warn("Ignoring outdated update %s/%s from %s" % (sync_type.str, version, minion_id))

    # If we know of a version newer still than the one we just received,
    # chase it with another fetch immediately.
    if sync_type.cmp(self._known_versions[sync_type], version) > 0:
        self.fetch(minion_id, sync_type)

    return new_object
def _run(self):
    """Idle loop: wake every MONITOR_PERIOD until stop() sets the event, then close."""
    log.info("Running {0}".format(self.__class__.__name__))
    while True:
        if self._complete.is_set():
            break
        # Stats emission is currently disabled.
        # self._emit_stats()
        self._complete.wait(self.MONITOR_PERIOD)
    self._close()
def _emit_stats(self):
    """
    Send this process's getrusage() counters to Carbon in plaintext protocol.

    Lazily opens (and caches) a TCP socket to CARBON_HOST:CARBON_PORT; on a
    resolution or resource error the socket is closed so the next call retries.
    """
    try:
        if not self._socket:
            log.info("Opening carbon socket {0}:{1}".format(
                self.CARBON_HOST, self.CARBON_PORT))
            self._socket = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
            self._socket.connect((self.CARBON_HOST, self.CARBON_PORT))
        carbon_data = ""
        t = int(time.time())
        usage = resource.getrusage(resource.RUSAGE_SELF)
        for usage_field in ("utime", "stime", "maxrss", "ixrss", "idrss", "isrss",
                            "minflt", "majflt", "nswap", "inblock", "oublock",
                            "msgsnd", "msgrcv", "nsignals", "nvcsw", "nivcsw"):
            val = getattr(usage, "ru_{0}".format(usage_field))
            log.debug("{0}: {1}".format(usage_field, val))
            carbon_data += "calamari.rlyeh.ru_{0} {1} {2}\n".format(
                usage_field, val, t)
        self._socket.sendall(carbon_data)
    except (socket.gaierror, resource.error):
        # BUG FIX: the original `except socket.gaierror, resource.error:` is
        # Python 2 "catch gaierror and bind the exception to resource.error",
        # silently clobbering an attribute of the resource module and never
        # catching resource.error at all. The tuple form catches both types.
        log.exception("Failed to send debugging statistics")
        self._close()
def stop(self):
    """Request shutdown of every cluster monitor and service thread (non-blocking; pair with join())."""
    log.info("%s stopping" % self.__class__.__name__)
    # Cluster monitors first, then the service threads, preserving the
    # original shutdown order.
    for monitor in self.clusters.values():
        monitor.stop()
    for svc in (self._rpc_thread,
                self._discovery_thread,
                self._process_monitor,
                self.eventer,
                self._request_ticker):
        svc.stop()
def _run(self):
    """Block in the remote listener, dispatching heartbeat/job callbacks, until self._complete is set."""
    name = self.__class__.__name__
    log.info("%s running" % name)
    get_remote().listen(
        self._complete,
        on_heartbeat=self.on_heartbeat,
        on_job=self.on_job,
        on_running_jobs=self._manager.requests.on_tick_response)
    log.info("%s complete" % name)
def join(self):
    """Wait for every service thread, then every cluster monitor, to finish."""
    log.info("%s joining" % self.__class__.__name__)
    # Same join order as before: service threads first, monitors last.
    for waitee in (self._rpc_thread,
                   self._discovery_thread,
                   self._process_monitor,
                   self.persister,
                   self.eventer,
                   self._request_ticker,
                   self.servers):
        waitee.join()
    for monitor in self.clusters.values():
        monitor.join()
def _run(self):
    """
    Run the RPC server until stop() sets the completion event.

    bind() must have been called first. If the server crashes, the
    exception is logged and the server is restarted after a backoff.
    """
    assert self._bound
    while not self._complete.is_set():
        try:
            log.info("%s run..." % self.__class__.__name__)
            self._server.run()
        except Exception:
            # Narrowed from a bare `except:` so that SystemExit,
            # KeyboardInterrupt and greenlet-kill exceptions can propagate
            # instead of being swallowed and retried forever; ordinary
            # crashes are still logged and the server restarted.
            log.error(traceback.format_exc())
            self._complete.wait(self.EXCEPTION_BACKOFF)
    log.info("%s complete..." % self.__class__.__name__)
def start(self):
    """Bind the RPC endpoint, start all service threads, and return True."""
    log.info("%s starting" % self.__class__.__name__)
    # The RPC thread must be bound before anything starts running.
    self._rpc_thread.bind()
    for svc in (self._rpc_thread,
                self._discovery_thread,
                self._process_monitor,
                self.persister,
                self.eventer,
                self._request_ticker,
                self.servers):
        svc.start()
    return True
def on_discovery(self, minion_id, heartbeat_data):
    """First heartbeat from a previously-unknown cluster: create, register and start its monitor."""
    log.info("on_discovery: {0}/{1}".format(minion_id, heartbeat_data['fsid']))
    fsid = heartbeat_data['fsid']
    monitor = ClusterMonitor(fsid, heartbeat_data['name'],
                             self.persister, self.servers,
                             self.eventer, self.requests)
    self.clusters[fsid] = monitor

    # Start the monitor, and wait until it is accepting events, before
    # forwarding the heartbeat: otherwise the syncs the heartbeat triggers
    # could be missed by a not-yet-listening monitor.
    monitor.start()
    monitor.ready()
    monitor.on_heartbeat(minion_id, heartbeat_data)
def _run(self):
    """Main loop: run the plugin monitor, listen for this cluster's remote events, then tear down."""
    plugins = self._plugin_monitor
    plugins.start()

    # Signal ready() callers before blocking in the listener.
    self._ready.set()
    log.debug("ClusterMonitor._run: ready")

    remote.listen(self._complete,
                  on_heartbeat=self.on_heartbeat,
                  fsid=self.fsid,
                  on_job=self.on_job_complete)

    log.info("%s complete" % self.__class__.__name__)
    plugins.stop()
    plugins.join()
    self.done.set()
def _emit_stats(self):
    """
    Send this process's getrusage() counters to Carbon in plaintext protocol.

    Lazily opens (and caches) a TCP socket to CARBON_HOST:CARBON_PORT; on a
    resolution or resource error the socket is closed so the next call retries.
    """
    try:
        if not self._socket:
            log.info("Opening carbon socket {0}:{1}".format(self.CARBON_HOST, self.CARBON_PORT))
            self._socket = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
            self._socket.connect((self.CARBON_HOST, self.CARBON_PORT))
        carbon_data = ""
        t = int(time.time())
        usage = resource.getrusage(resource.RUSAGE_SELF)
        for usage_field in ("utime", "stime", "maxrss", "ixrss", "idrss", "isrss",
                            "minflt", "majflt", "nswap", "inblock", "oublock",
                            "msgsnd", "msgrcv", "nsignals", "nvcsw", "nivcsw"):
            val = getattr(usage, "ru_{0}".format(usage_field))
            log.debug("{0}: {1}".format(usage_field, val))
            carbon_data += "calamari.rlyeh.ru_{0} {1} {2}\n".format(usage_field, val, t)
        self._socket.sendall(carbon_data)
    except (socket.gaierror, resource.error):
        # BUG FIX: `except socket.gaierror, resource.error:` is Python 2
        # "catch gaierror and bind it to the name resource.error" — it
        # clobbers the resource module's attribute and never catches
        # resource.error. The parenthesized tuple catches both types.
        log.exception("Failed to send debugging statistics")
        self._close()
def main():
    """Entry point: configure logging, patch gevent compatibility, run the Manager until signalled."""
    arg_parser = argparse.ArgumentParser(description='Calamari management service')
    arg_parser.add_argument('--debug', dest='debug', action='store_true',
                            default=False, help='print log to stdout')
    options = arg_parser.parse_args()
    if options.debug:
        stream_handler = logging.StreamHandler(sys.stdout)
        stream_handler.setFormatter(logging.Formatter(rlyeh.log.FORMAT))
        log.addHandler(stream_handler)

    # Instruct salt to use the gevent version of ZMQ
    import zmq.green
    import salt.utils.event
    salt.utils.event.zmq = zmq.green

    if sqlalchemy is not None:
        # Set up gevent compatibility in psycopg2
        import psycogreen.gevent
        psycogreen.gevent.patch_psycopg()

    if manhole is not None:
        # Enable manhole for debugging. Use oneshot mode
        # for gevent compatibility
        manhole.cry = lambda msg: log.info("MANHOLE: %s" % msg)
        manhole.install(oneshot_on=signal.SIGUSR1)

    manager = Manager()
    manager.start()

    complete = gevent.event.Event()

    def shutdown():
        log.info("Signal handler: stopping")
        complete.set()

    gevent.signal(signal.SIGTERM, shutdown)
    gevent.signal(signal.SIGINT, shutdown)

    # Wait with a timeout so signal handlers get a chance to run.
    while not complete.is_set():
        complete.wait(timeout=1)
def on_version(self, reported_by, sync_type, new_version):
    """
    Notify me that a particular version of a particular map exists.

    I may choose to initiate RPC to retrieve the map
    """
    log.debug("SyncObjects.on_version %s/%s/%s" % (reported_by, sync_type.str, new_version))
    old_version = self.get_version(sync_type)
    # NOTE(review): nesting reconstructed from collapsed source; known_version
    # is only bound inside this branch, so the fetch logic below must live
    # here too — confirm against upstream if in doubt.
    if sync_type.cmp(new_version, old_version) > 0:
        known_version = self._known_versions[sync_type]
        if sync_type.cmp(new_version, known_version) > 0:
            # We are out of date: request an up to date copy
            log.info("Advanced known version %s/%s %s->%s" % (
                self._cluster_name, sync_type.str, known_version, new_version))
            self._known_versions[sync_type] = new_version
        else:
            log.info("on_version: %s is newer than %s" % (new_version, old_version))

        # If we already have a request out for this type of map, then consider
        # cancelling it if we've already waited for a while.
        if self._fetching_at[sync_type] is not None:
            if now() - self._fetching_at[sync_type] < self.FETCH_TIMEOUT:
                # A fetch is already in flight and still fresh: do nothing.
                log.info("Fetch already underway for %s" % sync_type.str)
                return
            else:
                # The in-flight fetch has exceeded FETCH_TIMEOUT: give up on
                # it and issue a new one below.
                log.warn("Abandoning fetch for %s started at %s" % (
                    sync_type.str, self._fetching_at[sync_type]))

        log.info(
            "on_version: fetching %s/%s from %s, currently got %s, know %s"
            % (sync_type, new_version, reported_by, old_version, known_version))
        self.fetch(reported_by, sync_type)
def _recover(self):
    """
    Rebuild in-memory state from the database after a restart: servers,
    services, and the most recent SyncObject for each cluster/type, then
    start a ClusterMonitor per recovered cluster.

    No-op when sqlalchemy is unavailable (persistence disabled).
    """
    if sqlalchemy is None:
        return

    session = Session()
    # Recover known servers.
    for server in session.query(Server).all():
        log.debug("Recovered server %s" % server.fqdn)
        assert server.boot_time is None or server.boot_time.tzinfo is not None  # expect timezone-aware DB backend
        self.servers.inject_server(ServerState(
            fqdn=server.fqdn,
            hostname=server.hostname,
            managed=server.managed,
            last_contact=server.last_contact,
            boot_time=server.boot_time,
            ceph_version=server.ceph_version
        ))

    # Recover known services, re-attaching each to its server if it had one.
    for service in session.query(Service).all():
        if service.server:
            server = session.query(Server).get(service.server)
        else:
            server = None
        log.debug("Recovered service %s/%s/%s on %s" % (
            service.fsid, service.service_type, service.service_id,
            server.fqdn if server else None
        ))
        self.servers.inject_service(ServiceState(
            fsid=service.fsid,
            service_type=service.service_type,
            service_id=service.service_id
        ), server.fqdn if server else None)

    # I want the most recent version of every sync_object
    fsids = [(row[0], row[1]) for row in session.query(
        SyncObject.fsid, SyncObject.cluster_name).distinct(SyncObject.fsid)]
    for fsid, name in fsids:
        cluster_monitor = ClusterMonitor(fsid, name, self.persister, self.servers,
                                         self.eventer, self.requests)
        self.clusters[fsid] = cluster_monitor

        object_types = [row[0] for row in session.query(
            SyncObject.sync_type).filter_by(fsid=fsid).distinct()]
        for sync_type in object_types:
            # Latest row for this (fsid, sync_type), preferring higher version
            # then most recent timestamp.
            latest_record = session.query(SyncObject).filter_by(
                fsid=fsid, sync_type=sync_type).order_by(
                SyncObject.version.desc(), SyncObject.when.desc())[0]

            # FIXME: bit of a hack because records persisted only store their 'version'
            # if it's a real counter version, underlying problem is that we have
            # underlying data (health, pg_brief) without usable version counters.
            def md5(raw):
                # Content hash used as a surrogate version for counter-less maps.
                hasher = hashlib.md5()
                hasher.update(raw)
                return hasher.hexdigest()

            if latest_record.version:
                version = latest_record.version
            else:
                version = md5(latest_record.data)

            # DB timestamps come back naive; re-attach UTC before comparing.
            when = latest_record.when
            when = when.replace(tzinfo=tzutc())
            if cluster_monitor.update_time is None or when > cluster_monitor.update_time:
                cluster_monitor.update_time = when
            cluster_monitor.inject_sync_object(None, sync_type, version,
                                               msgpack.unpackb(latest_record.data))

    # Only start monitors once all recovered state has been injected.
    for monitor in self.clusters.values():
        log.info("Recovery: Cluster %s with update time %s" % (
            monitor.fsid, monitor.update_time))
        monitor.start()
def _recover(self):
    """
    Rebuild in-memory state from the database after a restart: servers,
    services, and the most recent SyncObject for each cluster/type, then
    start a ClusterMonitor per recovered cluster.

    No-op when sqlalchemy is unavailable (persistence disabled).

    NOTE(review): this appears to duplicate an earlier _recover in this file —
    confirm whether both copies are needed.
    """
    if sqlalchemy is None:
        return

    session = Session()
    # Recover known servers.
    for server in session.query(Server).all():
        log.debug("Recovered server %s" % server.fqdn)
        assert server.boot_time is None or server.boot_time.tzinfo is not None  # expect timezone-aware DB backend
        self.servers.inject_server(
            ServerState(fqdn=server.fqdn,
                        hostname=server.hostname,
                        managed=server.managed,
                        last_contact=server.last_contact,
                        boot_time=server.boot_time,
                        ceph_version=server.ceph_version))

    # Recover known services, re-attaching each to its server if it had one.
    for service in session.query(Service).all():
        if service.server:
            server = session.query(Server).get(service.server)
        else:
            server = None
        log.debug("Recovered service %s/%s/%s on %s" %
                  (service.fsid, service.service_type, service.service_id,
                   server.fqdn if server else None))
        self.servers.inject_service(
            ServiceState(fsid=service.fsid,
                         service_type=service.service_type,
                         service_id=service.service_id),
            server.fqdn if server else None)

    # I want the most recent version of every sync_object
    fsids = [(row[0], row[1]) for row in session.query(
        SyncObject.fsid, SyncObject.cluster_name).distinct(SyncObject.fsid)]
    for fsid, name in fsids:
        cluster_monitor = ClusterMonitor(fsid, name, self.persister,
                                         self.servers, self.eventer,
                                         self.requests)
        self.clusters[fsid] = cluster_monitor

        object_types = [
            row[0] for row in session.query(SyncObject.sync_type).filter_by(
                fsid=fsid).distinct()
        ]
        for sync_type in object_types:
            # Latest row for this (fsid, sync_type), preferring higher version
            # then most recent timestamp.
            latest_record = session.query(SyncObject).filter_by(
                fsid=fsid, sync_type=sync_type).order_by(
                SyncObject.version.desc(), SyncObject.when.desc())[0]

            # FIXME: bit of a hack because records persisted only store their 'version'
            # if it's a real counter version, underlying problem is that we have
            # underlying data (health, pg_brief) without usable version counters.
            def md5(raw):
                # Content hash used as a surrogate version for counter-less maps.
                hasher = hashlib.md5()
                hasher.update(raw)
                return hasher.hexdigest()

            if latest_record.version:
                version = latest_record.version
            else:
                version = md5(latest_record.data)

            # DB timestamps come back naive; re-attach UTC before comparing.
            when = latest_record.when
            when = when.replace(tzinfo=tzutc())
            if cluster_monitor.update_time is None or when > cluster_monitor.update_time:
                cluster_monitor.update_time = when
            cluster_monitor.inject_sync_object(
                None, sync_type, version, msgpack.unpackb(latest_record.data))

    # Only start monitors once all recovered state has been injected.
    for monitor in self.clusters.values():
        log.info("Recovery: Cluster %s with update time %s" %
                 (monitor.fsid, monitor.update_time))
        monitor.start()
def shutdown():
    # Signal-handler callback: log and set the completion event so the
    # enclosing wait loop exits (closes over `complete`).
    log.info("Signal handler: stopping")
    complete.set()
def bind(self):
    """Bind the RPC server to the URL from config ('rlyeh'/'rpc_url'); must precede _run()."""
    log.info("%s bind..." % self.__class__.__name__)
    rpc_url = config.get('rlyeh', 'rpc_url')
    self._server.bind(rpc_url)
    self._bound = True
def stop(self):
    """Flag completion and stop the RPC server if one has been created."""
    log.info("%s stopping" % self.__class__.__name__)
    self._complete.set()
    server = self._server
    if server:
        server.stop()
def stop(self):
    # Request shutdown: set the completion event; the run loop observes it
    # and exits on its own (no join here).
    log.info("%s stopping" % self.__class__.__name__)
    self._complete.set()