Example #1
    def _run(self):
        log.info("%s running" % self.__class__.__name__)

        event = SaltEventSource(salt_config)
        while not self._complete.is_set():
            # No salt tag filtering: https://github.com/saltstack/salt/issues/11582
            ev = event.get_event(full=True)
            if ev is not None:
                tag = ev['tag']
                data = ev['data']
                try:
                    if tag.startswith("ceph/cluster/"):
                        cluster_data = data['data']
                        if not cluster_data['fsid'] in self._manager.clusters:
                            self._manager.on_discovery(data['id'], cluster_data)
                        else:
                            log.debug("%s: heartbeat from existing cluster %s" % (
                                self.__class__.__name__, cluster_data['fsid']))
                    else:
                        # This does not concern us, ignore it
                        pass
                except:
                    log.debug("Message content: %s" % data)
                    log.exception("Exception handling message")

        log.info("%s complete" % self.__class__.__name__)
Example #2
    def _run(self):
        log.info("%s running" % self.__class__.__name__)

        event = SaltEventSource(log, salt_config)
        while not self._complete.is_set():
            # No salt tag filtering: https://github.com/saltstack/salt/issues/11582
            ev = event.get_event(full=True)
            if ev is not None and 'tag' in ev:
                tag = ev['tag']
                data = ev['data']
                try:
                    if tag.startswith("ceph/cluster/"):
                        cluster_data = data['data']
                        if not cluster_data['fsid'] in self._manager.clusters:
                            self._manager.on_discovery(data['id'], cluster_data)
                        else:
                            log.debug("%s: heartbeat from existing cluster %s" % (
                                self.__class__.__name__, cluster_data['fsid']))
                    elif re.match("^salt/job/\d+/ret/[^/]+$", tag):
                        if data['fun'] == 'saltutil.running':
                            self._manager.requests.on_tick_response(data['id'], data['return'])
                        else:
                            self._manager.requests.on_completion(data)
                    else:
                        # This does not concern us, ignore it
                        log.debug("TopLevelEvents: ignoring %s" % tag)
                        pass
                except:
                    log.exception("Exception handling message tag=%s" % tag)

        log.info("%s complete" % self.__class__.__name__)
Example #3
    def _run(self):
        log.info("Running {0}".format(self.__class__.__name__))
        while not self._complete.is_set():
            # self._emit_stats()
            self._complete.wait(self.MONITOR_PERIOD)

        self._close()
Example #4
    def update(self, node_id, attributes):
        # TODO report Not Modified http://tracker.ceph.com/issues/9764
        current_node = self.osd_map.get_tree_node(node_id)
        parent = self.osd_map.parent_bucket_by_node_id.get(node_id, None)
        name, bucket_type, items = [attributes[key] for key in ('name', 'bucket_type', 'items')]
        commands = []

        # TODO change to use rename-bucket when #9526 lands in ceph 0.89
        if name != current_node['name'] or bucket_type != current_node['type_name']:
            commands.append(add_bucket(name, bucket_type))
            if parent is not None:
                commands.append(move_bucket(name, parent['name'], parent['type']))

        to_remove = [item for item in current_node['items'] if item not in items]
        commands += self._remove_items(name, bucket_type, to_remove)
        for c in self._add_items(name, bucket_type, items):
            if c not in commands:
                commands.append(c)

        if name != current_node['name'] or bucket_type != current_node['type_name']:
            commands.append(remove_bucket(current_node['name'], None))

        log.info("Updating CRUSH node {c} parent {p} version {v}".format(c=commands, p=parent, v=self.osd_map.version))
        message = "Updating CRUSH node in {cluster_name}".format(cluster_name=self._cluster_monitor.name)
        return OsdMapModifyingRequest(message, self._cluster_monitor.fsid, self._cluster_monitor.name, commands)
Example #5
def main():

    log.info('calamari-list: starting')
    complete = gevent.event.Event()
    ceph_argparse = None
    while not ceph_argparse:
        try:
            import ceph_argparse
        except ImportError:
            log.error('Cannot import ceph_argparse module -- please install ceph')
            complete.wait(timeout=50)

    from cthulhu.manager.manager import Manager

    carbon = ShallowCarbonCache()
    carbon.start()

    cthulhu = Manager()
    cthulhu_started = False

    while not cthulhu_started:
        try:
            if not cthulhu_started:
                cthulhu_started = cthulhu.start()

        except Exception, e:
            log.exception('It borked')
            log.error(str(e))
            complete.wait(timeout=5)
Example #6
    def on_fetch_complete(self, minion_id, sync_type, version, data):
        """
        :return A SyncObject if this version was new to us, else None
        """
        log.debug("SyncObjects.on_fetch_complete %s/%s/%s" % (minion_id, sync_type.str, version))
        self._fetching_at[sync_type] = None

        # A fetch might give us a newer version than we knew we had asked for
        if sync_type.cmp(version, self._known_versions[sync_type]) > 0:
            self._known_versions[sync_type] = version

        # Don't store this if we already got something newer
        if sync_type.cmp(version, self.get_version(sync_type)) <= 0:
            log.warn("Ignoring outdated update %s/%s from %s" % (sync_type.str, version, minion_id))
            new_object = None
        else:
            log.info("Got new version %s/%s" % (sync_type.str, version))
            new_object = self.set_map(sync_type, version, data)

        # This might not be the latest: if it's not, send out another fetch
        # right away
        if sync_type.cmp(self._known_versions[sync_type], version) > 0:
            self.fetch(minion_id, sync_type)

        return new_object
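The branching above relies on the comparison convention of sync_type.cmp, which is not shown here. As a rough mental model (an assumption, not the real SyncObject API), it behaves like a three-way compare on version counters:

# Rough stand-in for sync_type.cmp, assuming plain integer epochs: positive
# means the first argument is newer, zero equal, negative older.
def cmp_versions(a, b):
    return (a > b) - (a < b)

known, fetched = 5, 7
if cmp_versions(fetched, known) > 0:
    known = fetched      # a fetch can come back newer than what was asked for
if cmp_versions(known, fetched) > 0:
    pass                 # still behind: another fetch would be kicked off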
Example #7
    def _run(self):
        log.info("Starting %s" % self.__class__.__name__)
        threads = [gevent.spawn(self.run_plugin,
                                name,
                                status_processor.run,
                                status_processor.period) for name, status_processor in self.load_plugins()]

        gevent.joinall(threads)
Example #8
 def stop(self):
     log.info("%s stopping" % self.__class__.__name__)
     for monitor in self.clusters.values():
         monitor.stop()
     self._rpc_thread.stop()
     self._discovery_thread.stop()
     self._process_monitor.stop()
     self.eventer.stop()
     self._request_ticker.stop()
Example #9
    def filter_errors(self, check_data, salt_name):
        filtered_output = {}
        for node, results in check_data.iteritems():
            if results == '"%s" is not available.' % salt_name:
                log.info(node + results)
            else:
                filtered_output[node] = results

        return filtered_output
Example #10
    def _emit(self, severity, message, **associations):
        """
        :param severity: One of the defined severity values
        :param message: One line human readable string
        :param associations: Optional extra attributes to associate
                             the event with a particular cluster/server/service
        """
        log.info("Eventer._emit: %s/%s" % (severity_str(severity), message))

        self._events.append(Event(severity, message, **associations))
Example #11
    def _run(self):
        log.info("%s running" % self.__class__.__name__)

        remote = get_remote()
        remote.listen(self._complete,
                      on_heartbeat=self.on_heartbeat,
                      on_job=self.on_job,
                      on_running_jobs=self._manager.requests.on_tick_response)

        log.info("%s complete" % self.__class__.__name__)
Example #12
 def join(self):
     log.info("%s joining" % self.__class__.__name__)
     self._rpc_thread.join()
     self._discovery_thread.join()
     self._process_monitor.join()
     self.persister.join()
     self.eventer.join()
     self._request_ticker.join()
     self.servers.join()
     for monitor in self.clusters.values():
         monitor.join()
Example #13
    def _run(self):
        assert self._bound

        while not self._complete.is_set():
            try:
                log.info("%s run..." % self.__class__.__name__)
                self._server.run()
            except:
                log.error(traceback.format_exc())
                self._complete.wait(self.EXCEPTION_BACKOFF)

        log.info("%s complete..." % self.__class__.__name__)
Example #14
    def start(self):
        log.info("%s starting" % self.__class__.__name__)

        self._rpc_thread.bind()
        self._rpc_thread.start()
        self._discovery_thread.start()
        self._process_monitor.start()
        self.persister.start()
        self.eventer.start()
        self._request_ticker.start()

        self.servers.start()
        return True
Example #15
    def on_version(self, reported_by, sync_type, new_version):
        """
        Notify me that a particular version of a particular map exists.

        I may choose to initiate RPC to retrieve the map
        """
        log.debug("SyncObjects.on_version %s/%s/%s" % (reported_by, sync_type.str, new_version))
        old_version = self.get_version(sync_type)
        if sync_type.cmp(new_version, old_version) > 0:
            known_version = self._known_versions[sync_type]
            if sync_type.cmp(new_version, known_version) > 0:
                # We are out of date: request an up to date copy
                log.info("Advanced known version %s/%s %s->%s" % (
                    self._cluster_name, sync_type.str, known_version, new_version))
                self._known_versions[sync_type] = new_version
            else:
                log.info("on_version: %s is newer than %s" % (new_version, old_version))

            # If we already have a request out for this type of map, then consider
            # cancelling it if we've already waited for a while.
            if self._fetching_at[sync_type] is not None:
                if now() - self._fetching_at[sync_type] < self.FETCH_TIMEOUT:
                    log.info("Fetch already underway for %s" % sync_type.str)
                    return
                else:
                    log.warn("Abandoning fetch for %s started at %s" % (
                        sync_type.str, self._fetching_at[sync_type]))

            log.info("on_version: fetching %s/%s from %s, currently got %s, know %s" % (
                sync_type, new_version, reported_by, old_version, known_version
            ))
            self.fetch(reported_by, sync_type)
Example #16
    def on_discovery(self, minion_id, heartbeat_data):
        log.info("on_discovery: {0}/{1}".format(minion_id, heartbeat_data['fsid']))
        cluster_monitor = ClusterMonitor(heartbeat_data['fsid'], heartbeat_data['name'],
                                         self.persister, self.servers, self.eventer, self.requests)
        self.clusters[heartbeat_data['fsid']] = cluster_monitor

        # Run before passing on the heartbeat, because otherwise the
        # syncs resulting from the heartbeat might not be received
        # by the monitor.
        cluster_monitor.start()
        # Wait for ClusterMonitor to start accepting events before asking it
        # to do anything
        cluster_monitor.ready()
        cluster_monitor.on_heartbeat(minion_id, heartbeat_data)
Example #17
    def _run(self):
        self._plugin_monitor.start()

        self._ready.set()
        log.debug("ClusterMonitor._run: ready")

        remote.listen(self._complete,
                      on_heartbeat=self.on_heartbeat,
                      fsid=self.fsid,
                      on_job=self.on_job_complete)

        log.info("%s complete" % self.__class__.__name__)
        self._plugin_monitor.stop()
        self._plugin_monitor.join()
        self.done.set()
Example #18
    def _run(self):
        log.info("Persister listening")

        while not self._complete.is_set():
            try:
                data = self._queue.get(block=True, timeout=1)
            except gevent.queue.Empty:
                continue
            else:
                try:
                    data.fn(*data.args, **data.kwargs)
                    self._session.commit()
                except Exception:
                    # Catch-all because all kinds of things can go wrong and our
                    # behaviour is the same: log the exception, the data that
                    # caused it, then try to go back to functioning.
                    log.exception("Persister exception persisting data: %s" % (data.fn,))

                    self._session.rollback()
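The queue items are consumed as data.fn(*data.args, **data.kwargs), so each one is simply a deferred call bundled with its arguments. A minimal stand-in (not the real Persister class) showing the shape such an item would need:

from collections import namedtuple

# Hypothetical work item: the real class is not shown here, but anything with
# fn/args/kwargs attributes would satisfy the loop above.
DeferredCall = namedtuple('DeferredCall', ['fn', 'args', 'kwargs'])

def save_server(fqdn):
    print("would persist %s" % fqdn)

item = DeferredCall(fn=save_server, args=('host-1.example.com',), kwargs={})
item.fn(*item.args, **item.kwargs)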
Example #19
    def tick(self):
        """
        For walltime-based monitoring of running requests.  Long-running requests
        get a periodic call to saltutil.running to verify that things really
        are still happening.
        """

        if not self._by_jid:
            return
        else:
            log.debug("RequestCollection.tick: %s JIDs underway" % len(self._by_jid))

        # Identify JIDs that haven't had a saltutil.running response for too long.
        # Kill requests in a separate phase because request:JID is not 1:1
        stale_jobs = set()
        _now = now()
        for request in self._by_jid.values():
            if _now - request.alive_at > datetime.timedelta(seconds=TICK_PERIOD * 3):
                log.error("Request %s JID %s stale: now=%s, alive_at=%s" % (
                    request.id, request.jid, _now, request.alive_at
                ))
                stale_jobs.add(request)

        # Any identified stale jobs are errored out.
        for request in stale_jobs:
            with self._update_index(request):
                request.set_error("Lost contact")
                request.jid = None
                request.complete()

        # Identify minions associated with JIDs in flight
        query_minions = set()
        for jid, request in self._by_jid.items():
            query_minions.add(request.minion_id)

        # Attempt to emit a saltutil.running to ping jobs, next tick we
        # will see if we got updates to the alive_at attribute to indicate non-staleness
        if query_minions:
            log.info("RequestCollection.tick: sending saltutil.running to {0}".format(query_minions))
            client = LocalClient(config.get('cthulhu', 'salt_config_path'))
            pub_data = client.run_job(list(query_minions), 'saltutil.running', [], expr_form="list")
            if not pub_data:
                log.warning("Failed to publish saltutil.running to {0}".format(query_minions))
Example #20
    def _pool_min_size(self, req_size, req_min_size):
        '''
        Find an appropriate "min_size" parameter for a pool create operation
        req_size is requested pool size; 0 means "use osd_pool_default_size"
        req_min_size is requested min size

        Used in both create and update
        '''
        ceph_config = self._cluster_monitor.get_sync_object_data(Config)
        size = req_size or int(ceph_config.get('osd_pool_default_size'), 0)
        min_size = req_min_size or \
            int(ceph_config.get('osd_pool_default_min_size'), 0)
        if min_size:
            ret_min_size = min(min_size, size)
        else:
            ret_min_size = size - size / 2
        log.info('_pool_min_size: size %d, min_size %d, ret %d' %
                 (size, min_size, ret_min_size))
        return ret_min_size
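The fallback rule is easy to check by hand. Below is a standalone sketch of the same arithmetic with the Config lookup replaced by a plain dict (the default values are invented); note the original is Python 2, where size - size / 2 is already integer division:

def pool_min_size(req_size, req_min_size, defaults):
    # Explicit request wins, otherwise fall back to the cluster defaults;
    # with no usable min_size, use "just over half of size".
    size = req_size or int(defaults.get('osd_pool_default_size', 0))
    min_size = req_min_size or int(defaults.get('osd_pool_default_min_size', 0))
    if min_size:
        return min(min_size, size)
    return size - size // 2

defaults = {'osd_pool_default_size': 3, 'osd_pool_default_min_size': 0}
assert pool_min_size(0, 0, defaults) == 2   # 3 - 3 // 2
assert pool_min_size(4, 0, defaults) == 2   # 4 - 4 // 2
assert pool_min_size(4, 3, defaults) == 3   # explicit min_size, capped at size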
Example #21
    def start(self):
        log.info("%s starting" % self.__class__.__name__)

        # Before we start listening to the outside world, recover
        # our last known state from persistent storage
        try:
            self._recover()
        except:
            log.exception("Recovery failed")
            os._exit(-1)

        self._rpc_thread.bind()
        self._rpc_thread.start()
        self._discovery_thread.start()
        self._process_monitor.start()
        self.notifier.start()
        self.persister.start()
        self.eventer.start()

        self.servers.start()
Example #22
    def _emit_stats(self):
        try:
            if not self._socket:
                log.info("Opening carbon socket {0}:{1}".format(self.CARBON_HOST, self.CARBON_PORT))
                self._socket = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
                self._socket.connect((self.CARBON_HOST, self.CARBON_PORT))

            carbon_data = ""
            t = int(time.time())
            usage = resource.getrusage(resource.RUSAGE_SELF)
            for usage_field in ("utime", "stime", "maxrss", "ixrss", "idrss", "isrss", "minflt", "majflt",
                                "nswap", "inblock", "oublock", "msgsnd", "msgrcv", "nsignals", "nvcsw", "nivcsw"):
                val = getattr(usage, "ru_{0}".format(usage_field))
                log.debug("{0}: {1}".format(usage_field, val))
                carbon_data += "calamari.cthulhu.ru_{0} {1} {2}\n".format(usage_field, val, t)

            self._socket.sendall(carbon_data)
        except (socket.gaierror, resource.error):
            log.exception("Failed to send debugging statistics")
            self._close()
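The string being assembled follows Graphite's plaintext protocol: one "<metric path> <value> <unix timestamp>" line per sample, newline-terminated. A tiny sketch of what one flush might contain (values invented):

import time

# Illustrative only: two rusage fields formatted the way _emit_stats sends them.
t = int(time.time())
carbon_data = ""
for usage_field, val in (("utime", 1.42), ("maxrss", 52428)):
    carbon_data += "calamari.cthulhu.ru_{0} {1} {2}\n".format(usage_field, val, t)
# e.g. "calamari.cthulhu.ru_utime 1.42 1700000000\n"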
Example #23
    def load_plugins(self):

        """
        Try to load a status_processor from each module in plugin_path, store keyed by module_name
        """
        loaded_plugins = []
        # FIXME this assumes that plugin_path has been added to PYTHONPATH and/or is in site-packages
        plugin_path = config.get('cthulhu', 'plugin_path')

        if os.path.exists(plugin_path):
            for plugin in os.listdir(plugin_path):
                plugin = plugin.split('.')[0]
                if plugin in ('__init__', 'README'):
                    continue

                status_processor = None
                try:
                    plugin_module = importlib.import_module('.'.join((plugin, 'status_processor')))
                    status_processor = plugin_module.StatusProcessor()
                except ImportError, e:
                    log.info("Error importing plugin %s %s" % (plugin, str(e)))

                if status_processor is not None:
                    loaded_plugins.append((plugin, status_processor))

        return loaded_plugins
Example #24
def main():
    parser = argparse.ArgumentParser(description='Calamari management service')
    parser.add_argument('--debug',
                        dest='debug',
                        action='store_true',
                        default=False,
                        help='print log to stdout')

    args = parser.parse_args()
    if args.debug:
        handler = logging.StreamHandler(sys.stdout)
        handler.setFormatter(logging.Formatter(cthulhu.log.FORMAT))
        log.addHandler(handler)

    # Instruct salt to use the gevent version of ZMQ
    import zmq.green
    import salt.utils.event
    salt.utils.event.zmq = zmq.green

    if sqlalchemy is not None:
        # Set up gevent compatibility in psycopg2
        import psycogreen.gevent
        psycogreen.gevent.patch_psycopg()

    if manhole is not None:
        # Enable manhole for debugging.  Use oneshot mode
        # for gevent compatibility
        manhole.cry = lambda message: log.info("MANHOLE: %s" % message)
        manhole.install(oneshot_on=signal.SIGUSR1)

    m = Manager()
    m.start()

    complete = gevent.event.Event()

    def shutdown():
        log.info("Signal handler: stopping")
        complete.set()

    gevent.signal(signal.SIGTERM, shutdown)
    gevent.signal(signal.SIGINT, shutdown)

    while not complete.is_set():
        complete.wait(timeout=1)
Example #25
def main():
    parser = argparse.ArgumentParser(description='Calamari management service')
    parser.add_argument('--debug', dest='debug', action='store_true',
                        default=False, help='print log to stdout')

    args = parser.parse_args()
    if args.debug:
        handler = logging.StreamHandler(sys.stdout)
        handler.setFormatter(logging.Formatter(cthulhu.log.FORMAT))
        log.addHandler(handler)

    # Instruct salt to use the gevent version of ZMQ
    import zmq.green
    import salt.utils.event
    salt.utils.event.zmq = zmq.green

    if sqlalchemy is not None:
        # Set up gevent compatibility in psycopg2
        import psycogreen.gevent
        psycogreen.gevent.patch_psycopg()

    if manhole is not None:
        # Enable manhole for debugging.  Use oneshot mode
        # for gevent compatibility
        manhole.cry = lambda message: log.info("MANHOLE: %s" % message)
        manhole.install(oneshot_on=signal.SIGUSR1)

    m = Manager()
    m.start()

    complete = gevent.event.Event()

    def shutdown():
        log.info("Signal handler: stopping")
        complete.set()

    gevent.signal(signal.SIGTERM, shutdown)
    gevent.signal(signal.SIGINT, shutdown)

    while not complete.is_set():
        complete.wait(timeout=1)
Example #26
    def on_version(self, reported_by, sync_type, new_version):
        """
        Notify me that a particular version of a particular map exists.

        I may choose to initiate RPC to retrieve the map
        """
        log.debug("SyncObjects.on_version %s/%s/%s" %
                  (reported_by, sync_type.str, new_version))
        old_version = self.get_version(sync_type)
        if sync_type.cmp(new_version, old_version) > 0:
            known_version = self._known_versions[sync_type]
            if sync_type.cmp(new_version, known_version) > 0:
                # We are out of date: request an up to date copy
                log.info("Advanced known version %s/%s %s->%s" %
                         (self._cluster_name, sync_type.str, known_version,
                          new_version))
                self._known_versions[sync_type] = new_version
            else:
                log.info("on_version: %s is newer than %s" %
                         (new_version, old_version))

            # If we already have a request out for this type of map, then consider
            # cancelling it if we've already waited for a while.
            if self._fetching_at[sync_type] is not None:
                if now() - self._fetching_at[sync_type] < self.FETCH_TIMEOUT:
                    log.info("Fetch already underway for %s" % sync_type.str)
                    return
                else:
                    log.warn("Abandoning fetch for %s started at %s" %
                             (sync_type.str, self._fetching_at[sync_type]))

            log.info(
                "on_version: fetching %s/%s from %s, currently got %s, know %s"
                % (sync_type, new_version, reported_by, old_version,
                   known_version))
            self.fetch(reported_by, sync_type)
Example #27
    def on_completion(self, data):
        """
        Callback for when a salt/job/<jid>/ret event is received, in which
        we find the UserRequest that created the job, and inform it of
        completion so that it can progress.
        """
        with self._lock:
            jid = data['jid']
            result = data['return']
            log.debug("on_completion: jid=%s data=%s" % (jid, data))

            try:
                request = self.get_by_jid(jid)
                log.debug("on_completion: jid %s belongs to request %s" %
                          (jid, request.id))
            except KeyError:
                log.warning("on_completion: unknown jid {0}".format(jid))
                return

            if not data['success']:
                # This indicates a failure at the salt level, i.e. job threw an exception
                log.error("Remote execution failed for request %s: %s" %
                          (request.id, result))
                if isinstance(result, dict):
                    # Handler ran and recorded an error for us
                    request.set_error(result['error_status'])
                else:
                    # An exception, probably, stringized by salt for us
                    request.set_error(result)
                request.complete()
            elif result['error']:
                # This indicates a failure within ceph.rados_commands which was caught
                # by our code, like one of our Ceph commands returned an error code.
                # NB in future there may be UserRequest subclasses which want to receive
                # and handle these errors themselves, so this branch would be refactored
                # to allow that.
                log.error("Request %s experienced an error: %s" %
                          (request.id, result['error_status']))
                request.jid = None
                request.set_error(result['error_status'])
                request.complete()
            else:
                if request.state != UserRequest.SUBMITTED:
                    # Unexpected, ignore.
                    log.error(
                        "Received completion for request %s/%s in state %s" %
                        (request.id, request.jid, request.state))
                    return

                try:
                    with self._update_index(request):
                        old_jid = request.jid
                        request.complete_jid(result)
                        assert request.jid != old_jid

                        # After a jid completes, requests may start waiting for cluster
                        # map updates, we ask ClusterMonitor to hurry up and get them on
                        # behalf of the request.
                        if request.awaiting_versions:
                            for sync_type, version in request.awaiting_versions.items():
                                if version is not None:
                                    log.debug(
                                        "Notifying SyncObjects of awaited version %s/%s"
                                        % (sync_type.str, version))
                                    self._sync_objects.on_version(
                                        data['id'], sync_type, version)

                            # The request may be waiting for an epoch that we already have, if so
                            # give it to the request right away
                            for sync_type, want_version in request.awaiting_versions.items():
                                got_version = self._sync_objects.get_version(
                                    sync_type)
                                if want_version and sync_type.cmp(
                                        got_version, want_version) >= 0:
                                    log.info(
                                        "Awaited %s %s is immediately available"
                                        % (sync_type, want_version))
                                    request.on_map(sync_type,
                                                   self._sync_objects)

                except Exception as e:
                    # Ensure that a misbehaving piece of code in a UserRequest subclass
                    # results in a terminated job, not a zombie job
                    log.exception("Calling complete_jid for %s/%s" %
                                  (request.id, request.jid))
                    request.jid = None
                    request.set_error("Internal error %s" % e)
                    request.complete()

        if request.state == UserRequest.COMPLETE:
            self._eventer.on_user_request_complete(request)
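The fields read above outline the payload this callback expects from a salt job-return event; roughly (all values invented for illustration, not a real capture):

# Approximate shape of the event data consumed by on_completion, limited to
# the fields the code above actually reads.
data = {
    'jid': '20140101000000000000',   # salt job id
    'id': 'minion-1',                # minion that ran the job
    'success': True,                 # False if the remote function raised
    'return': {                      # result recorded by the remote handler
        'error': False,
        'error_status': None,
        'fsid': 'some-fsid',         # convention: calamari jobs echo their fsid
    },
}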
Example #28
    def _run(self):
        self._plugin_monitor.start()

        self._ready.set()
        log.debug("ClusterMonitor._run: ready")

        event = SaltEventSource(log, salt_config)

        while not self._complete.is_set():
            # No salt tag filtering: https://github.com/saltstack/salt/issues/11582
            ev = event.get_event(full=True)

            if ev is not None:
                data = ev['data']
                tag = ev['tag']
                log.debug("_run.ev: %s/tag=%s" % (data['id'] if 'id' in data else None, tag))

                # I am interested in the following tags:
                # - salt/job/<jid>/ret/<minion id> where jid is one that I started
                #   (this includes ceph.rados_command and ceph.get_cluster_object)
                # - ceph/cluster/<fsid> where fsid is my fsid

                try:
                    if tag.startswith("ceph/cluster/{0}".format(self.fsid)):
                        # A ceph.heartbeat beacon
                        self.on_heartbeat(data['id'], data['data'])
                    elif re.match("^salt/job/\d+/ret/[^/]+$", tag):
                        if data['fun'] == "saltutil.running":
                            # Update on what jobs are running
                            # It would be nice to filter these down to those which really are for
                            # this cluster, but as long as N_clusters and N_jobs are reasonably small
                            # it's not an efficiency problem.
                            self._requests.on_tick_response(data['id'], data['return'])

                        # It would be much nicer to put the FSID at the start of
                        # the tag, if salt would only let us add custom tags to our jobs.
                        # Instead we enforce a convention that all calamari jobs must include
                        # fsid in their return value.
                        if (not isinstance(data, dict)) or not isinstance(data['return'], dict):
                            # Something not formatted for ClusterMonitor
                            log.warning("Ignoring event %s" % tag)
                            continue

                        if 'fsid' not in data['return'] or data['return']['fsid'] != self.fsid:
                            # Something for a different ClusterMonitor
                            log.debug("Ignoring job return, not for my FSID")
                            continue

                        if data['fun'] == 'ceph.get_cluster_object':
                            # A ceph.get_cluster_object response
                            if not data['success']:
                                log.error("on_sync_object: failure from %s: %s" % (data['id'], data['return']))
                                continue

                            self.on_sync_object(data['id'], data['return'])
                        else:
                            log.warning("Unexpected function '%s' (%s)" % (data['fun'], tag))
                    else:
                        # This does not concern us, ignore it
                        pass
                except:
                    # Because this is our main event handling loop, swallow exceptions
                    # instead of letting them end the world.
                    log.exception("Exception handling message with tag %s" % tag)
                    log.debug("Message content: %s" % data)

        log.info("%s complete" % self.__class__.__name__)
        self._plugin_monitor.stop()
        self._plugin_monitor.join()
        self.done.set()
Example #29
 def stop(self):
     log.info("%s stopping" % self.__class__.__name__)
     self._complete.set()
Example #30
    def on_completion(self, data):
        """
        Callback for when a salt/job/<jid>/ret event is received, in which
        we find the UserRequest that created the job, and inform it of
        completion so that it can progress.
        """
        with self._lock:
            jid = data['jid']
            result = data['return']
            log.debug("on_completion: jid=%s data=%s" % (jid, data))

            try:
                request = self.get_by_jid(jid)
                log.debug("on_completion: jid %s belongs to request %s" % (jid, request.id))
            except KeyError:
                log.warning("on_completion: unknown jid {0}".format(jid))
                return

            if not data['success']:
                # This indicates a failure at the salt level, i.e. job threw an exception
                log.error("Remote execution failed for request %s: %s" % (request.id, result))
                if isinstance(result, dict):
                    # Handler ran and recorded an error for us
                    request.set_error(result['error_status'])
                else:
                    # An exception, probably, stringized by salt for us
                    request.set_error(result)
                request.complete()
            elif result['error']:
                # This indicates a failure within ceph.rados_commands which was caught
                # by our code, like one of our Ceph commands returned an error code.
                # NB in future there may be UserRequest subclasses which want to receive
                # and handle these errors themselves, so this branch would be refactored
                # to allow that.
                log.error("Request %s experienced an error: %s" % (request.id, result['error_status']))
                request.jid = None
                request.set_error(result['error_status'])
                request.complete()
            else:
                if request.state != UserRequest.SUBMITTED:
                    # Unexpected, ignore.
                    log.error("Received completion for request %s/%s in state %s" % (
                        request.id, request.jid, request.state
                    ))
                    return

                try:
                    with self._update_index(request):
                        old_jid = request.jid
                        request.complete_jid(result)
                        assert request.jid != old_jid

                        # After a jid completes, requests may start waiting for cluster
                        # map updates, we ask ClusterMonitor to hurry up and get them on
                        # behalf of the request.
                        if request.awaiting_versions:
                            for sync_type, version in request.awaiting_versions.items():
                                if version is not None:
                                    log.debug("Notifying SyncObjects of awaited version %s/%s" % (sync_type.str, version))
                                    self._sync_objects.on_version(data['id'], sync_type, version)

                            # The request may be waiting for an epoch that we already have, if so
                            # give it to the request right away
                            for sync_type, want_version in request.awaiting_versions.items():
                                got_version = self._sync_objects.get_version(sync_type)
                                if want_version and sync_type.cmp(got_version, want_version) >= 0:
                                    log.info("Awaited %s %s is immediately available" % (sync_type, want_version))
                                    request.on_map(sync_type, self._sync_objects)

                except Exception as e:
                    # Ensure that a misbehaving piece of code in a UserRequest subclass
                    # results in a terminated job, not a zombie job
                    log.exception("Calling complete_jid for %s/%s" % (request.id, request.jid))
                    request.jid = None
                    request.set_error("Internal error %s" % e)
                    request.complete()

        if request.state == UserRequest.COMPLETE:
            self._eventer.on_user_request_complete(request)
Example #31
 def bind(self):
     log.info("%s bind..." % self.__class__.__name__)
     self._server.bind(config.get('cthulhu', 'rpc_url'))
     self._bound = True
Example #32
 def shutdown():
     log.info("Signal handler: stopping")
     complete.set()
Example #33
    def _recover(self):
        if sqlalchemy is None:
            return

        session = Session()
        for server in session.query(Server).all():
            log.debug("Recovered server %s" % server.fqdn)
            assert server.boot_time is None or server.boot_time.tzinfo is not None  # expect timezone-aware DB backend
            self.servers.inject_server(
                ServerState(fqdn=server.fqdn,
                            hostname=server.hostname,
                            managed=server.managed,
                            last_contact=server.last_contact,
                            boot_time=server.boot_time,
                            ceph_version=server.ceph_version))

        for service in session.query(Service).all():
            if service.server:
                server = session.query(Server).get(service.server)
            else:
                server = None
            log.debug("Recovered service %s/%s/%s on %s" %
                      (service.fsid, service.service_type, service.service_id,
                       server.fqdn if server else None))
            self.servers.inject_service(
                ServiceState(fsid=service.fsid,
                             service_type=service.service_type,
                             service_id=service.service_id),
                server.fqdn if server else None)

        # I want the most recent version of every sync_object
        fsids = [(row[0], row[1]) for row in session.query(
            SyncObject.fsid, SyncObject.cluster_name).distinct(SyncObject.fsid)
                 ]
        for fsid, name in fsids:
            cluster_monitor = ClusterMonitor(fsid, name, self.notifier,
                                             self.persister, self.servers,
                                             self.eventer, self.requests)
            self.clusters[fsid] = cluster_monitor

            object_types = [
                row[0]
                for row in session.query(SyncObject.sync_type).filter_by(
                    fsid=fsid).distinct()
            ]
            for sync_type in object_types:
                latest_record = session.query(SyncObject).filter_by(
                    fsid=fsid,
                    sync_type=sync_type).order_by(SyncObject.version.desc(),
                                                  SyncObject.when.desc())[0]

                # FIXME: bit of a hack because records persisted only store their 'version'
                # if it's a real counter version, underlying problem is that we have
                # underlying data (health, pg_brief) without usable version counters.
                def md5(raw):
                    hasher = hashlib.md5()
                    hasher.update(raw)
                    return hasher.hexdigest()

                if latest_record.version:
                    version = latest_record.version
                else:
                    version = md5(latest_record.data)

                when = latest_record.when
                when = when.replace(tzinfo=tzutc())
                if cluster_monitor.update_time is None or when > cluster_monitor.update_time:
                    cluster_monitor.update_time = when

                cluster_monitor.inject_sync_object(
                    None, sync_type, version,
                    msgpack.unpackb(latest_record.data))

        for monitor in self.clusters.values():
            log.info("Recovery: Cluster %s with update time %s" %
                     (monitor.fsid, monitor.update_time))
            monitor.start()
Example #34
 def bind(self):
     log.info("%s bind..." % self.__class__.__name__)
     self._server.bind(config.get('cthulhu', 'rpc_url'))
     self._bound = True
Example #35
    def _recover(self):
        if sqlalchemy is None:
            return

        session = Session()
        for server in session.query(Server).all():
            log.debug("Recovered server %s" % server.fqdn)
            assert server.boot_time is None or server.boot_time.tzinfo is not None  # expect timezone-aware DB backend
            self.servers.inject_server(ServerState(
                fqdn=server.fqdn,
                hostname=server.hostname,
                managed=server.managed,
                last_contact=server.last_contact,
                boot_time=server.boot_time,
                ceph_version=server.ceph_version
            ))

        for service in session.query(Service).all():
            if service.server:
                server = session.query(Server).get(service.server)
            else:
                server = None
            log.debug("Recovered service %s/%s/%s on %s" % (
                service.fsid, service.service_type, service.service_id, server.fqdn if server else None
            ))
            self.servers.inject_service(ServiceState(
                fsid=service.fsid,
                service_type=service.service_type,
                service_id=service.service_id
            ), server.fqdn if server else None)

        # I want the most recent version of every sync_object
        fsids = [(row[0], row[1]) for row in session.query(SyncObject.fsid, SyncObject.cluster_name).distinct(SyncObject.fsid)]
        for fsid, name in fsids:
            cluster_monitor = ClusterMonitor(fsid, name, self.persister, self.servers,
                                             self.eventer, self.requests)
            self.clusters[fsid] = cluster_monitor

            object_types = [row[0] for row in session.query(SyncObject.sync_type).filter_by(fsid=fsid).distinct()]
            for sync_type in object_types:
                latest_record = session.query(SyncObject).filter_by(
                    fsid=fsid, sync_type=sync_type).order_by(
                    SyncObject.version.desc(), SyncObject.when.desc())[0]

                # FIXME: bit of a hack because records persisted only store their 'version'
                # if it's a real counter version, underlying problem is that we have
                # underlying data (health, pg_brief) without usable version counters.
                def md5(raw):
                    hasher = hashlib.md5()
                    hasher.update(raw)
                    return hasher.hexdigest()

                if latest_record.version:
                    version = latest_record.version
                else:
                    version = md5(latest_record.data)

                when = latest_record.when
                when = when.replace(tzinfo=tzutc())
                if cluster_monitor.update_time is None or when > cluster_monitor.update_time:
                    cluster_monitor.update_time = when

                cluster_monitor.inject_sync_object(None, sync_type, version, msgpack.unpackb(latest_record.data))

        for monitor in self.clusters.values():
            log.info("Recovery: Cluster %s with update time %s" % (monitor.fsid, monitor.update_time))
            monitor.start()
Example #36
 def shutdown():
     log.info("Signal handler: stopping")
     complete.set()