def get_sync_object(self, fs_id, object_type, path=None):
    """
    Get one of the objects that ClusterMonitor keeps a copy of from the mon,
    such as the cluster maps.

    :param fs_id: The fsid of a cluster
    :param object_type: String, one of SYNC_OBJECT_TYPES
    :param path: List, optional, a path within the object to return instead
                 of the whole thing
    :return: the requested data, or None if it was not found; if any element
             of ``path`` is missing, NotFound is raised
    """
    if path:
        obj = self._fs_resolve(fs_id).get_sync_object(SYNC_OBJECT_STR_TYPE[object_type])
        try:
            for part in path:
                if isinstance(obj, dict):
                    obj = obj[part]
                else:
                    obj = getattr(obj, part)
        except (AttributeError, KeyError) as e:
            log.exception("Exception %s traversing %s: obj=%s" % (e, path, obj))
            raise NotFound(object_type, path)
        return obj
    else:
        return self._fs_resolve(fs_id).get_sync_object_data(SYNC_OBJECT_STR_TYPE[object_type])

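# Hedged illustration (not part of the original module): a self-contained
# sketch of the ``path`` traversal above, showing that dict levels are
# indexed by key and any other object by attribute. _ExampleOsd and the
# sample map are invented for demonstration only.
class _ExampleOsd(object):
    def __init__(self, up):
        self.up = up


def _traverse(obj, path):
    # Same walk as get_sync_object: dicts by key, other objects by attribute
    for part in path:
        if isinstance(obj, dict):
            obj = obj[part]
        else:
            obj = getattr(obj, part)
    return obj


assert _traverse({"osds": {0: _ExampleOsd(up=True)}}, ["osds", 0, "up"]) is True
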
def main():
    log.info('calamari-list: starting')
    complete = gevent.event.Event()

    ceph_argparse = None
    while not ceph_argparse:
        try:
            import ceph_argparse
        except ImportError:
            log.error('Cannot import ceph_argparse module -- please install ceph')
            complete.wait(timeout=50)

    from cthulhu.manager.manager import Manager

    carbon = ShallowCarbonCache()
    carbon.start()

    cthulhu = Manager()
    cthulhu_started = False
    while not cthulhu_started:
        try:
            if not cthulhu_started:
                cthulhu_started = cthulhu.start()
        except Exception as e:
            log.exception('It borked')
            log.error(str(e))
            complete.wait(timeout=5)

def on_map(self, sync_type, sync_objects):
    """
    Callback for when a new cluster map is available, in which we notify
    any interested ongoing UserRequests of the new map so that they can
    progress if they were waiting for it.
    """
    with self._lock:
        requests = self.get_all(state=UserRequest.SUBMITTED)
        for request in requests:
            try:
                # If this is one of the types that this request
                # is waiting for, invoke on_map.
                for awaited_type in request.awaiting_versions.keys():
                    if awaited_type == sync_type:
                        with self._update_index(request):
                            request.on_map(sync_type, sync_objects)
            except Exception as e:
                log.exception("Request %s threw exception in on_map", request.id)
                if request.jid:
                    log.error("Abandoning job {0}".format(request.jid))
                    request.jid = None
                request.set_error("Internal error %s" % e)
                request.complete()

            if request.state == UserRequest.COMPLETE:
                self._eventer.on_user_request_complete(request)

def _run(self):
    log.info("%s running" % self.__class__.__name__)

    event = SaltEventSource(log, salt_config)

    while not self._complete.is_set():
        # No salt tag filtering: https://github.com/saltstack/salt/issues/11582
        ev = event.get_event(full=True)

        if ev is not None and 'tag' in ev:
            tag = ev['tag']
            data = ev['data']
            try:
                if tag.startswith("ceph/cluster/"):
                    cluster_data = data['data']
                    if not cluster_data['fsid'] in self._manager.clusters:
                        self._manager.on_discovery(data['id'], cluster_data)
                    else:
                        log.debug("%s: heartbeat from existing cluster %s" % (
                            self.__class__.__name__, cluster_data['fsid']))
                elif re.match(r"^salt/job/\d+/ret/[^/]+$", tag):
                    if data['fun'] == 'saltutil.running':
                        self._manager.requests.on_tick_response(data['id'], data['return'])
                    else:
                        self._manager.requests.on_completion(data)
                else:
                    # This does not concern us, ignore it
                    log.debug("TopLevelEvents: ignoring %s" % tag)
            except Exception:
                log.exception("Exception handling message tag=%s" % tag)

    log.info("%s complete" % self.__class__.__name__)

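# Hedged illustration (not from the original source): the two tag families
# the loop above reacts to. The fsid, jid and minion id below are invented
# to show the expected shapes; "ceph/cluster/<fsid>" carries heartbeats and
# "salt/job/<jid>/ret/<minion id>" carries job returns.
import re

assert "ceph/cluster/0123abcd-0000-0000-0000-000000000000".startswith("ceph/cluster/")
assert re.match(r"^salt/job/\d+/ret/[^/]+$", "salt/job/20140101000000000000/ret/minion-1")
assert not re.match(r"^salt/job/\d+/ret/[^/]+$", "salt/auth")
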
def _run(self):
    log.info("%s running" % self.__class__.__name__)

    event = SaltEventSource(salt_config)

    while not self._complete.is_set():
        # No salt tag filtering: https://github.com/saltstack/salt/issues/11582
        ev = event.get_event(full=True)

        if ev is not None:
            tag = ev['tag']
            data = ev['data']
            try:
                if tag.startswith("ceph/cluster/"):
                    cluster_data = data['data']
                    if not cluster_data['fsid'] in self._manager.clusters:
                        self._manager.on_discovery(data['id'], cluster_data)
                    else:
                        log.debug("%s: heartbeat from existing cluster %s" % (
                            self.__class__.__name__, cluster_data['fsid']))
                else:
                    # This does not concern us, ignore it
                    pass
            except Exception:
                log.debug("Message content: %s" % data)
                log.exception("Exception handling message")

    log.info("%s complete" % self.__class__.__name__)

def _emit_stats(self):
    try:
        if not self._socket:
            log.info("Opening carbon socket {0}:{1}".format(self.CARBON_HOST, self.CARBON_PORT))
            self._socket = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
            self._socket.connect((self.CARBON_HOST, self.CARBON_PORT))

        carbon_data = ""
        t = int(time.time())
        usage = resource.getrusage(resource.RUSAGE_SELF)
        for usage_field in ("utime", "stime", "maxrss", "ixrss", "idrss", "isrss",
                            "minflt", "majflt", "nswap", "inblock", "oublock",
                            "msgsnd", "msgrcv", "nsignals", "nvcsw", "nivcsw"):
            val = getattr(usage, "ru_{0}".format(usage_field))
            log.debug("{0}: {1}".format(usage_field, val))
            carbon_data += "calamari.cthulhu.ru_{0} {1} {2}\n".format(usage_field, val, t)

        self._socket.sendall(carbon_data)
    except (socket.gaierror, resource.error):
        log.exception("Failed to send debugging statistics")
        self._close()

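# Hedged illustration (not part of the original module): the string built by
# _emit_stats follows carbon's plaintext protocol, one
# "<metric path> <value> <unix timestamp>" line per sample.
# _format_carbon_line is an invented helper mirroring the formatting above.
import time


def _format_carbon_line(usage_field, val, t=None):
    if t is None:
        t = int(time.time())
    return "calamari.cthulhu.ru_{0} {1} {2}\n".format(usage_field, val, t)


assert _format_carbon_line("maxrss", 123456, 1700000000) == \
    "calamari.cthulhu.ru_maxrss 123456 1700000000\n"
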
def wrap(*args, **kwargs):
    log.debug("RpcInterface >> %s(%s, %s)" % (item, args, kwargs))
    try:
        rc = attr(*args, **kwargs)
        log.debug("RpcInterface << %s" % item)
    except:
        log.exception("RpcInterface !! %s" % item)
        raise
    return rc

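# Hedged sketch (invented names, not the original RpcInterface): a closure
# like ``wrap`` is typically returned from a __getattr__ on a proxy object,
# so every RPC method call gets the same entry/exit/exception logging.
class _LoggingProxy(object):
    def __init__(self, target):
        self._target = target

    def __getattr__(self, item):
        attr = getattr(self._target, item)

        def wrap(*args, **kwargs):
            log.debug("RpcInterface >> %s(%s, %s)" % (item, args, kwargs))
            try:
                rc = attr(*args, **kwargs)
                log.debug("RpcInterface << %s" % item)
            except:
                log.exception("RpcInterface !! %s" % item)
                raise
            return rc

        return wrap
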
def _run(self):
    log.info("Persister listening")

    while not self._complete.is_set():
        try:
            data = self._queue.get(block=True, timeout=1)
        except gevent.queue.Empty:
            continue
        else:
            try:
                data.fn(*data.args, **data.kwargs)
                self._session.commit()
            except Exception:
                # Catch-all because all kinds of things can go wrong and our
                # behaviour is the same: log the exception, the data that
                # caused it, then try to go back to functioning.
                log.exception("Persister exception persisting data: %s" % (data.fn,))
                self._session.rollback()

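# Hedged sketch (invented names, not the original Persister API): each queue
# item consumed above carries a callable and its arguments, so a producer
# side could look like this.
from collections import namedtuple

DeferredCall = namedtuple("DeferredCall", ["fn", "args", "kwargs"])


def _defer(queue, fn, *args, **kwargs):
    # The persister greenlet pops this and runs fn(*args, **kwargs) inside
    # its own database session (see _run above).
    queue.put(DeferredCall(fn, args, kwargs))
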
def start(self):
    log.info("%s starting" % self.__class__.__name__)

    # Before we start listening to the outside world, recover
    # our last known state from persistent storage
    try:
        self._recover()
    except:
        log.exception("Recovery failed")
        os._exit(-1)

    self._rpc_thread.bind()
    self._rpc_thread.start()
    self._discovery_thread.start()
    self._process_monitor.start()
    self.notifier.start()
    self.persister.start()
    self.eventer.start()
    self.servers.start()

def start(self):
    log.info("%s starting" % self.__class__.__name__)

    # Before we start listening to the outside world, recover
    # our last known state from persistent storage
    try:
        self._recover()
    except:
        log.exception("Recovery failed")
        os._exit(-1)

    self._rpc_thread.bind()
    self._rpc_thread.start()
    self._discovery_thread.start()
    self._process_monitor.start()
    self.persister.start()
    self.eventer.start()
    self._request_ticker.start()
    self.servers.start()

def on_completion(self, data):
    """
    Callback for when a salt/job/<jid>/ret event is received, in which we
    find the UserRequest that created the job, and inform it of completion
    so that it can progress.
    """
    with self._lock:
        jid = data['jid']
        result = data['return']
        log.debug("on_completion: jid=%s data=%s" % (jid, data))

        try:
            request = self.get_by_jid(jid)
            log.debug("on_completion: jid %s belongs to request %s" % (jid, request.id))
        except KeyError:
            log.warning("on_completion: unknown jid {0}".format(jid))
            return

        if not data['success']:
            # This indicates a failure at the salt level, i.e. the job threw an exception
            log.error("Remote execution failed for request %s: %s" % (request.id, result))
            if isinstance(result, dict):
                # Handler ran and recorded an error for us
                request.set_error(result['error_status'])
            else:
                # An exception, probably, stringized by salt for us
                request.set_error(result)
            request.complete()
        elif result['error']:
            # This indicates a failure within ceph.rados_commands which was caught
            # by our code, like one of our Ceph commands returned an error code.
            # NB in future there may be UserRequest subclasses which want to receive
            # and handle these errors themselves, so this branch would be refactored
            # to allow that.
            log.error("Request %s experienced an error: %s" % (request.id, result['error_status']))
            request.jid = None
            request.set_error(result['error_status'])
            request.complete()
        else:
            if request.state != UserRequest.SUBMITTED:
                # Unexpected, ignore.
                log.error("Received completion for request %s/%s in state %s" % (
                    request.id, request.jid, request.state))
                return

            try:
                with self._update_index(request):
                    old_jid = request.jid
                    request.complete_jid(result)
                    assert request.jid != old_jid

                    # After a jid completes, requests may start waiting for cluster
                    # map updates; we ask ClusterMonitor to hurry up and get them on
                    # behalf of the request.
                    if request.awaiting_versions:
                        for sync_type, version in request.awaiting_versions.items():
                            if version is not None:
                                log.debug("Notifying SyncObjects of awaited version %s/%s" % (
                                    sync_type.str, version))
                                self._sync_objects.on_version(data['id'], sync_type, version)

                        # The request may be waiting for an epoch that we already have;
                        # if so, give it to the request right away.
                        for sync_type, want_version in request.awaiting_versions.items():
                            got_version = self._sync_objects.get_version(sync_type)
                            if want_version and sync_type.cmp(got_version, want_version) >= 0:
                                log.info("Awaited %s %s is immediately available" % (sync_type, want_version))
                                request.on_map(sync_type, self._sync_objects)
            except Exception as e:
                # Ensure that a misbehaving piece of code in a UserRequest subclass
                # results in a terminated job, not a zombie job
                log.exception("Calling complete_jid for %s/%s" % (request.id, request.jid))
                request.jid = None
                request.set_error("Internal error %s" % e)
                request.complete()

        if request.state == UserRequest.COMPLETE:
            self._eventer.on_user_request_complete(request)

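# Hedged illustration (shape inferred only from the fields read above,
# values invented): the minimal salt job-return payload on_completion
# expects to receive.
_example_completion = {
    'jid': '20140101000000000000',
    'id': 'minion-1',            # minion that ran the job
    'success': True,             # False => salt-level failure
    'return': {
        'error': False,          # True => error caught by our handler
        'error_status': '',
    },
}
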
def _run(self):
    self._plugin_monitor.start()

    self._ready.set()
    log.debug("ClusterMonitor._run: ready")

    event = SaltEventSource(log, salt_config)

    while not self._complete.is_set():
        # No salt tag filtering: https://github.com/saltstack/salt/issues/11582
        ev = event.get_event(full=True)

        if ev is not None:
            data = ev['data']
            tag = ev['tag']
            log.debug("_run.ev: %s/tag=%s" % (data['id'] if 'id' in data else None, tag))

            # I am interested in the following tags:
            # - salt/job/<jid>/ret/<minion id> where jid is one that I started
            #   (this includes ceph.rados_command and ceph.get_cluster_object)
            # - ceph/cluster/<fsid> where fsid is my fsid
            try:
                if tag.startswith("ceph/cluster/{0}".format(self.fsid)):
                    # A ceph.heartbeat beacon
                    self.on_heartbeat(data['id'], data['data'])
                elif re.match(r"^salt/job/\d+/ret/[^/]+$", tag):
                    if data['fun'] == "saltutil.running":
                        # Update on what jobs are running.
                        # It would be nice to filter these down to those which really are for
                        # this cluster, but as long as N_clusters and N_jobs are reasonably small
                        # it's not an efficiency problem.
                        self._requests.on_tick_response(data['id'], data['return'])

                    # It would be much nicer to put the FSID at the start of
                    # the tag, if salt would only let us add custom tags to our jobs.
                    # Instead we enforce a convention that all calamari jobs must include
                    # fsid in their return value.
                    if (not isinstance(data, dict)) or not isinstance(data['return'], dict):
                        # Something not formatted for ClusterMonitor
                        log.warning("Ignoring event %s" % tag)
                        continue

                    if 'fsid' not in data['return'] or data['return']['fsid'] != self.fsid:
                        # Something for a different ClusterMonitor
                        log.debug("Ignoring job return, not for my FSID")
                        continue

                    if data['fun'] == 'ceph.get_cluster_object':
                        # A ceph.get_cluster_object response
                        if not data['success']:
                            log.error("on_sync_object: failure from %s: %s" % (data['id'], data['return']))
                            continue

                        self.on_sync_object(data['id'], data['return'])
                    else:
                        log.warning("Unexpected function '%s' (%s)" % (data['fun'], tag))
                else:
                    # This does not concern us, ignore it
                    pass
            except Exception:
                # Because this is our main event handling loop, swallow exceptions
                # instead of letting them end the world.
                log.exception("Exception handling message with tag %s" % tag)
                log.debug("Message content: %s" % data)

    log.info("%s complete" % self.__class__.__name__)
    self._plugin_monitor.stop()
    self._plugin_monitor.join()
    self.done.set()
