Ejemplo n.º 1
0
    def get_sync_object(self, fs_id, object_type, path=None):
        """
        Get one of the objects that ClusterMonitor keeps a copy of from the mon, such
        as the cluster maps.

        :param fs_id: The fsid of a cluster
        :param object_type: String, one of SYNC_OBJECT_TYPES
        :param path: List, optional, a path within the object to return instead of the whole thing

        :return: the requested data, or None if it was not found (including if any element of ``path``
                 was not found)
        """

        if path:
            obj = self._fs_resolve(fs_id).get_sync_object(SYNC_OBJECT_STR_TYPE[object_type])
            try:
                for part in path:
                    if isinstance(obj, dict):
                        obj = obj[part]
                    else:
                        obj = getattr(obj, part)
            except (AttributeError, KeyError) as e:
                log.exception("Exception %s traversing %s: obj=%s" % (e, path, obj))
                raise NotFound(object_type, path)
            return obj
        else:
            return self._fs_resolve(fs_id).get_sync_object_data(SYNC_OBJECT_STR_TYPE[object_type])
Ejemplo n.º 2
0
def main():

    log.info('calamari-list: starting')
    complete = gevent.event.Event()
    ceph_argparse = None
    while not ceph_argparse:
        try:
            import ceph_argparse
        except ImportError:
            log.error(
                'Cannot import ceph_arg_parse module -- please install ceph')
            complete.wait(timeout=50)

    from cthulhu.manager.manager import Manager

    carbon = ShallowCarbonCache()
    carbon.start()

    cthulhu = Manager()
    cthulhu_started = False

    while not cthulhu_started:
        try:
            if not cthulhu_started:
                cthulhu_started = cthulhu.start()

        except Exception, e:
            log.exception('It borked')
            log.error(str(e))
            complete.wait(timeout=5)
Ejemplo n.º 3
0
    def on_map(self, sync_type, sync_objects):
        """
        Callback for when a new cluster map is available, in which
        we notify any interested ongoing UserRequests of the new map
        so that they can progress if they were waiting for it.
        """
        with self._lock:
            requests = self.get_all(state=UserRequest.SUBMITTED)
            for request in requests:
                try:
                    # If this is one of the types that this request
                    # is waiting for, invoke on_map.
                    for awaited_type in request.awaiting_versions.keys():
                        if awaited_type == sync_type:
                            with self._update_index(request):
                                request.on_map(sync_type, sync_objects)
                except Exception as e:
                    log.exception("Request %s threw exception in on_map", request.id)
                    if request.jid:
                        log.error("Abandoning job {0}".format(request.jid))
                        request.jid = None

                    request.set_error("Internal error %s" % e)
                    request.complete()

                if request.state == UserRequest.COMPLETE:
                    self._eventer.on_user_request_complete(request)
Ejemplo n.º 4
0
    def _run(self):
        log.info("%s running" % self.__class__.__name__)

        event = SaltEventSource(log, salt_config)
        while not self._complete.is_set():
            # No salt tag filtering: https://github.com/saltstack/salt/issues/11582
            ev = event.get_event(full=True)
            if ev is not None and 'tag' in ev:
                tag = ev['tag']
                data = ev['data']
                try:
                    if tag.startswith("ceph/cluster/"):
                        cluster_data = data['data']
                        if not cluster_data['fsid'] in self._manager.clusters:
                            self._manager.on_discovery(data['id'],
                                                       cluster_data)
                        else:
                            log.debug(
                                "%s: heartbeat from existing cluster %s" %
                                (self.__class__.__name__,
                                 cluster_data['fsid']))
                    elif re.match("^salt/job/\d+/ret/[^/]+$", tag):
                        if data['fun'] == 'saltutil.running':
                            self._manager.requests.on_tick_response(
                                data['id'], data['return'])
                        else:
                            self._manager.requests.on_completion(data)
                    else:
                        # This does not concern us, ignore it
                        log.debug("TopLevelEvents: ignoring %s" % tag)
                        pass
                except:
                    log.exception("Exception handling message tag=%s" % tag)

        log.info("%s complete" % self.__class__.__name__)
Ejemplo n.º 5
0
    def on_map(self, sync_type, sync_objects):
        """
        Callback for when a new cluster map is available, in which
        we notify any interested ongoing UserRequests of the new map
        so that they can progress if they were waiting for it.
        """
        with self._lock:
            requests = self.get_all(state=UserRequest.SUBMITTED)
            for request in requests:
                try:
                    # If this is one of the types that this request
                    # is waiting for, invoke on_map.
                    for awaited_type in request.awaiting_versions.keys():
                        if awaited_type == sync_type:
                            with self._update_index(request):
                                request.on_map(sync_type, sync_objects)
                except Exception as e:
                    log.exception("Request %s threw exception in on_map",
                                  request.id)
                    if request.jid:
                        log.error("Abandoning job {0}".format(request.jid))
                        request.jid = None

                    request.set_error("Internal error %s" % e)
                    request.complete()

                if request.state == UserRequest.COMPLETE:
                    self._eventer.on_user_request_complete(request)
Ejemplo n.º 6
0
def main():

    log.info('calamari-list: starting')
    complete = gevent.event.Event()
    ceph_argparse = None
    while not ceph_argparse:
        try:
            import ceph_argparse
        except ImportError:
            log.error('Cannot import ceph_arg_parse module -- please install ceph')
            complete.wait(timeout=50)

    from cthulhu.manager.manager import Manager

    carbon = ShallowCarbonCache()
    carbon.start()

    cthulhu = Manager()
    cthulhu_started = False

    while not cthulhu_started:
        try:
            if not cthulhu_started:
                cthulhu_started = cthulhu.start()

        except Exception, e:
            log.exception('It borked')
            log.error(str(e))
            complete.wait(timeout=5)
Ejemplo n.º 7
0
    def get_sync_object(self, fs_id, object_type, path=None):
        """
        Get one of the objects that ClusterMonitor keeps a copy of from the mon, such
        as the cluster maps.

        :param fs_id: The fsid of a cluster
        :param object_type: String, one of SYNC_OBJECT_TYPES
        :param path: List, optional, a path within the object to return instead of the whole thing

        :return: the requested data, or None if it was not found (including if any element of ``path``
                 was not found)
        """

        if path:
            obj = self._fs_resolve(fs_id).get_sync_object(
                SYNC_OBJECT_STR_TYPE[object_type])
            try:
                for part in path:
                    if isinstance(obj, dict):
                        obj = obj[part]
                    else:
                        obj = getattr(obj, part)
            except (AttributeError, KeyError) as e:
                log.exception("Exception %s traversing %s: obj=%s" %
                              (e, path, obj))
                raise NotFound(object_type, path)
            return obj
        else:
            return self._fs_resolve(fs_id).get_sync_object_data(
                SYNC_OBJECT_STR_TYPE[object_type])
Ejemplo n.º 8
0
    def _run(self):
        log.info("%s running" % self.__class__.__name__)

        event = SaltEventSource(salt_config)
        while not self._complete.is_set():
            # No salt tag filtering: https://github.com/saltstack/salt/issues/11582
            ev = event.get_event(full=True)
            if ev is not None:
                tag = ev['tag']
                data = ev['data']
                try:
                    if tag.startswith("ceph/cluster/"):
                        cluster_data = data['data']
                        if not cluster_data['fsid'] in self._manager.clusters:
                            self._manager.on_discovery(data['id'], cluster_data)
                        else:
                            log.debug("%s: heartbeat from existing cluster %s" % (
                                self.__class__.__name__, cluster_data['fsid']))
                    else:
                        # This does not concern us, ignore it
                        pass
                except:
                    log.debug("Message content: %s" % data)
                    log.exception("Exception handling message")

        log.info("%s complete" % self.__class__.__name__)
Ejemplo n.º 9
0
    def _emit_stats(self):
        try:
            if not self._socket:
                log.info("Opening carbon socket {0}:{1}".format(
                    self.CARBON_HOST, self.CARBON_PORT))
                self._socket = socket.socket(socket.AF_INET,
                                             socket.SOCK_STREAM)
                self._socket.connect((self.CARBON_HOST, self.CARBON_PORT))

            carbon_data = ""
            t = int(time.time())
            usage = resource.getrusage(resource.RUSAGE_SELF)
            for usage_field in ("utime", "stime", "maxrss", "ixrss", "idrss",
                                "isrss", "minflt", "majflt", "nswap",
                                "inblock", "oublock", "msgsnd", "msgrcv",
                                "nsignals", "nvcsw", "nivcsw"):
                val = getattr(usage, "ru_{0}".format(usage_field))
                log.debug("{0}: {1}".format(usage_field, val))
                carbon_data += "calamari.cthulhu.ru_{0} {1} {2}\n".format(
                    usage_field, val, t)

            self._socket.sendall(carbon_data)
        except socket.gaierror, resource.error:
            log.exception("Failed to send debugging statistics")
            self._close()
Ejemplo n.º 10
0
    def _run(self):
        log.info("%s running" % self.__class__.__name__)

        event = SaltEventSource(log, salt_config)
        while not self._complete.is_set():
            # No salt tag filtering: https://github.com/saltstack/salt/issues/11582
            ev = event.get_event(full=True)
            if ev is not None and 'tag' in ev:
                tag = ev['tag']
                data = ev['data']
                try:
                    if tag.startswith("ceph/cluster/"):
                        cluster_data = data['data']
                        if not cluster_data['fsid'] in self._manager.clusters:
                            self._manager.on_discovery(data['id'], cluster_data)
                        else:
                            log.debug("%s: heartbeat from existing cluster %s" % (
                                self.__class__.__name__, cluster_data['fsid']))
                    elif re.match("^salt/job/\d+/ret/[^/]+$", tag):
                        if data['fun'] == 'saltutil.running':
                            self._manager.requests.on_tick_response(data['id'], data['return'])
                        else:
                            self._manager.requests.on_completion(data)
                    else:
                        # This does not concern us, ignore it
                        log.debug("TopLevelEvents: ignoring %s" % tag)
                        pass
                except:
                    log.exception("Exception handling message tag=%s" % tag)

        log.info("%s complete" % self.__class__.__name__)
Ejemplo n.º 11
0
    def _run(self):
        log.info("%s running" % self.__class__.__name__)

        event = SaltEventSource(salt_config)
        while not self._complete.is_set():
            # No salt tag filtering: https://github.com/saltstack/salt/issues/11582
            ev = event.get_event(full=True)
            if ev is not None:
                tag = ev['tag']
                data = ev['data']
                try:
                    if tag.startswith("ceph/cluster/"):
                        cluster_data = data['data']
                        if not cluster_data['fsid'] in self._manager.clusters:
                            self._manager.on_discovery(data['id'],
                                                       cluster_data)
                        else:
                            log.debug(
                                "%s: heartbeat from existing cluster %s" %
                                (self.__class__.__name__,
                                 cluster_data['fsid']))
                    else:
                        # This does not concern us, ignore it
                        pass
                except:
                    log.debug("Message content: %s" % data)
                    log.exception("Exception handling message")

        log.info("%s complete" % self.__class__.__name__)
Ejemplo n.º 12
0
 def wrap(*args, **kwargs):
     log.debug("RpcInterface >> %s(%s, %s)" % (item, args, kwargs))
     try:
         rc = attr(*args, **kwargs)
         log.debug("RpcInterface << %s" % item)
     except:
         log.exception("RpcInterface !! %s" % item)
         raise
     return rc
Ejemplo n.º 13
0
 def wrap(*args, **kwargs):
     log.debug("RpcInterface >> %s(%s, %s)" % (item, args, kwargs))
     try:
         rc = attr(*args, **kwargs)
         log.debug("RpcInterface << %s" % item)
     except:
         log.exception("RpcInterface !! %s" % item)
         raise
     return rc
Ejemplo n.º 14
0
    def _run(self):
        log.info("Persister listening")

        while not self._complete.is_set():
            try:
                data = self._queue.get(block=True, timeout=1)
            except gevent.queue.Empty:
                continue
            else:
                try:
                    data.fn(*data.args, **data.kwargs)
                    self._session.commit()
                except Exception:
                    # Catch-all because all kinds of things can go wrong and our
                    # behaviour is the same: log the exception, the data that
                    # caused it, then try to go back to functioning.
                    log.exception("Persister exception persisting data: %s" % (data.fn,))

                    self._session.rollback()
Ejemplo n.º 15
0
    def start(self):
        log.info("%s starting" % self.__class__.__name__)

        # Before we start listening to the outside world, recover
        # our last known state from persistent storage
        try:
            self._recover()
        except:
            log.exception("Recovery failed")
            os._exit(-1)

        self._rpc_thread.bind()
        self._rpc_thread.start()
        self._discovery_thread.start()
        self._process_monitor.start()
        self.notifier.start()
        self.persister.start()
        self.eventer.start()

        self.servers.start()
Ejemplo n.º 16
0
    def _run(self):
        log.info("Persister listening")

        while not self._complete.is_set():
            try:
                data = self._queue.get(block=True, timeout=1)
            except gevent.queue.Empty:
                continue
            else:
                try:
                    data.fn(*data.args, **data.kwargs)
                    self._session.commit()
                except Exception:
                    # Catch-all because all kinds of things can go wrong and our
                    # behaviour is the same: log the exception, the data that
                    # caused it, then try to go back to functioning.
                    log.exception("Persister exception persisting data: %s" %
                                  (data.fn, ))

                    self._session.rollback()
Ejemplo n.º 17
0
    def start(self):
        log.info("%s starting" % self.__class__.__name__)

        # Before we start listening to the outside world, recover
        # our last known state from persistent storage
        try:
            self._recover()
        except:
            log.exception("Recovery failed")
            os._exit(-1)

        self._rpc_thread.bind()
        self._rpc_thread.start()
        self._discovery_thread.start()
        self._process_monitor.start()
        self.persister.start()
        self.eventer.start()
        self._request_ticker.start()

        self.servers.start()
Ejemplo n.º 18
0
    def _emit_stats(self):
        try:
            if not self._socket:
                log.info("Opening carbon socket {0}:{1}".format(self.CARBON_HOST, self.CARBON_PORT))
                self._socket = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
                self._socket.connect((self.CARBON_HOST, self.CARBON_PORT))

            carbon_data = ""
            t = int(time.time())
            usage = resource.getrusage(resource.RUSAGE_SELF)
            for usage_field in ("utime", "stime", "maxrss", "ixrss", "idrss", "isrss", "minflt", "majflt",
                                "nswap", "inblock", "oublock", "msgsnd", "msgrcv", "nsignals", "nvcsw", "nivcsw"):
                val = getattr(usage, "ru_{0}".format(usage_field))
                log.debug("{0}: {1}".format(usage_field, val))
                carbon_data += "calamari.cthulhu.ru_{0} {1} {2}\n".format(usage_field, val, t)

            self._socket.sendall(carbon_data)
        except socket.gaierror, resource.error:
            log.exception("Failed to send debugging statistics")
            self._close()
Ejemplo n.º 19
0
    def on_completion(self, data):
        """
        Callback for when a salt/job/<jid>/ret event is received, in which
        we find the UserRequest that created the job, and inform it of
        completion so that it can progress.
        """
        with self._lock:
            jid = data['jid']
            result = data['return']
            log.debug("on_completion: jid=%s data=%s" % (jid, data))

            try:
                request = self.get_by_jid(jid)
                log.debug("on_completion: jid %s belongs to request %s" % (jid, request.id))
            except KeyError:
                log.warning("on_completion: unknown jid {0}".format(jid))
                return

            if not data['success']:
                # This indicates a failure at the salt level, i.e. job threw an exception
                log.error("Remote execution failed for request %s: %s" % (request.id, result))
                if isinstance(result, dict):
                    # Handler ran and recorded an error for us
                    request.set_error(result['error_status'])
                else:
                    # An exception, probably, stringized by salt for us
                    request.set_error(result)
                request.complete()
            elif result['error']:
                # This indicates a failure within ceph.rados_commands which was caught
                # by our code, like one of our Ceph commands returned an error code.
                # NB in future there may be UserRequest subclasses which want to receive
                # and handle these errors themselves, so this branch would be refactored
                # to allow that.
                log.error("Request %s experienced an error: %s" % (request.id, result['error_status']))
                request.jid = None
                request.set_error(result['error_status'])
                request.complete()
            else:
                if request.state != UserRequest.SUBMITTED:
                    # Unexpected, ignore.
                    log.error("Received completion for request %s/%s in state %s" % (
                        request.id, request.jid, request.state
                    ))
                    return

                try:
                    with self._update_index(request):
                        old_jid = request.jid
                        request.complete_jid(result)
                        assert request.jid != old_jid

                        # After a jid completes, requests may start waiting for cluster
                        # map updates, we ask ClusterMonitor to hurry up and get them on
                        # behalf of the request.
                        if request.awaiting_versions:
                            for sync_type, version in request.awaiting_versions.items():
                                if version is not None:
                                    log.debug("Notifying SyncObjects of awaited version %s/%s" % (sync_type.str, version))
                                    self._sync_objects.on_version(data['id'], sync_type, version)

                            # The request may be waiting for an epoch that we already have, if so
                            # give it to the request right away
                            for sync_type, want_version in request.awaiting_versions.items():
                                got_version = self._sync_objects.get_version(sync_type)
                                if want_version and sync_type.cmp(got_version, want_version) >= 0:
                                    log.info("Awaited %s %s is immediately available" % (sync_type, want_version))
                                    request.on_map(sync_type, self._sync_objects)

                except Exception as e:
                    # Ensure that a misbehaving piece of code in a UserRequest subclass
                    # results in a terminated job, not a zombie job
                    log.exception("Calling complete_jid for %s/%s" % (request.id, request.jid))
                    request.jid = None
                    request.set_error("Internal error %s" % e)
                    request.complete()

        if request.state == UserRequest.COMPLETE:
            self._eventer.on_user_request_complete(request)
Ejemplo n.º 20
0
    def _run(self):
        self._plugin_monitor.start()

        self._ready.set()
        log.debug("ClusterMonitor._run: ready")

        event = SaltEventSource(log, salt_config)

        while not self._complete.is_set():
            # No salt tag filtering: https://github.com/saltstack/salt/issues/11582
            ev = event.get_event(full=True)

            if ev is not None:
                data = ev['data']
                tag = ev['tag']
                log.debug("_run.ev: %s/tag=%s" % (data['id'] if 'id' in data else None, tag))

                # I am interested in the following tags:
                # - salt/job/<jid>/ret/<minion id> where jid is one that I started
                #   (this includes ceph.rados_command and ceph.get_cluster_object)
                # - ceph/cluster/<fsid> where fsid is my fsid

                try:
                    if tag.startswith("ceph/cluster/{0}".format(self.fsid)):
                        # A ceph.heartbeat beacon
                        self.on_heartbeat(data['id'], data['data'])
                    elif re.match("^salt/job/\d+/ret/[^/]+$", tag):
                        if data['fun'] == "saltutil.running":
                            # Update on what jobs are running
                            # It would be nice to filter these down to those which really are for
                            # this cluster, but as long as N_clusters and N_jobs are reasonably small
                            # it's not an efficiency problem.
                            self._requests.on_tick_response(data['id'], data['return'])

                        # It would be much nicer to put the FSID at the start of
                        # the tag, if salt would only let us add custom tags to our jobs.
                        # Instead we enforce a convention that all calamari jobs must include
                        # fsid in their return value.
                        if (not isinstance(data, dict)) or not isinstance(data['return'], dict):
                            # Something not formatted for ClusterMonitor
                            log.warning("Ignoring event %s" % tag)
                            continue

                        if 'fsid' not in data['return'] or data['return']['fsid'] != self.fsid:
                            # Something for a different ClusterMonitor
                            log.debug("Ignoring job return, not for my FSID")
                            continue

                        if data['fun'] == 'ceph.get_cluster_object':
                            # A ceph.get_cluster_object response
                            if not data['success']:
                                log.error("on_sync_object: failure from %s: %s" % (data['id'], data['return']))
                                continue

                            self.on_sync_object(data['id'], data['return'])
                        else:
                            log.warning("Unexpected function '%s' (%s)" % (data['fun'], tag))
                    else:
                        # This does not concern us, ignore it
                        pass
                except:
                    # Because this is our main event handling loop, swallow exceptions
                    # instead of letting them end the world.
                    log.exception("Exception handling message with tag %s" % tag)
                    log.debug("Message content: %s" % data)

        log.info("%s complete" % self.__class__.__name__)
        self._plugin_monitor.stop()
        self._plugin_monitor.join()
        self.done.set()
Ejemplo n.º 21
0
    def on_completion(self, data):
        """
        Callback for when a salt/job/<jid>/ret event is received, in which
        we find the UserRequest that created the job, and inform it of
        completion so that it can progress.
        """
        with self._lock:
            jid = data['jid']
            result = data['return']
            log.debug("on_completion: jid=%s data=%s" % (jid, data))

            try:
                request = self.get_by_jid(jid)
                log.debug("on_completion: jid %s belongs to request %s" %
                          (jid, request.id))
            except KeyError:
                log.warning("on_completion: unknown jid {0}".format(jid))
                return

            if not data['success']:
                # This indicates a failure at the salt level, i.e. job threw an exception
                log.error("Remote execution failed for request %s: %s" %
                          (request.id, result))
                if isinstance(result, dict):
                    # Handler ran and recorded an error for us
                    request.set_error(result['error_status'])
                else:
                    # An exception, probably, stringized by salt for us
                    request.set_error(result)
                request.complete()
            elif result['error']:
                # This indicates a failure within ceph.rados_commands which was caught
                # by our code, like one of our Ceph commands returned an error code.
                # NB in future there may be UserRequest subclasses which want to receive
                # and handle these errors themselves, so this branch would be refactored
                # to allow that.
                log.error("Request %s experienced an error: %s" %
                          (request.id, result['error_status']))
                request.jid = None
                request.set_error(result['error_status'])
                request.complete()
            else:
                if request.state != UserRequest.SUBMITTED:
                    # Unexpected, ignore.
                    log.error(
                        "Received completion for request %s/%s in state %s" %
                        (request.id, request.jid, request.state))
                    return

                try:
                    with self._update_index(request):
                        old_jid = request.jid
                        request.complete_jid(result)
                        assert request.jid != old_jid

                        # After a jid completes, requests may start waiting for cluster
                        # map updates, we ask ClusterMonitor to hurry up and get them on
                        # behalf of the request.
                        if request.awaiting_versions:
                            for sync_type, version in request.awaiting_versions.items(
                            ):
                                if version is not None:
                                    log.debug(
                                        "Notifying SyncObjects of awaited version %s/%s"
                                        % (sync_type.str, version))
                                    self._sync_objects.on_version(
                                        data['id'], sync_type, version)

                            # The request may be waiting for an epoch that we already have, if so
                            # give it to the request right away
                            for sync_type, want_version in request.awaiting_versions.items(
                            ):
                                got_version = self._sync_objects.get_version(
                                    sync_type)
                                if want_version and sync_type.cmp(
                                        got_version, want_version) >= 0:
                                    log.info(
                                        "Awaited %s %s is immediately available"
                                        % (sync_type, want_version))
                                    request.on_map(sync_type,
                                                   self._sync_objects)

                except Exception as e:
                    # Ensure that a misbehaving piece of code in a UserRequest subclass
                    # results in a terminated job, not a zombie job
                    log.exception("Calling complete_jid for %s/%s" %
                                  (request.id, request.jid))
                    request.jid = None
                    request.set_error("Internal error %s" % e)
                    request.complete()

        if request.state == UserRequest.COMPLETE:
            self._eventer.on_user_request_complete(request)