Example #1
    def fetch(self, minion_id, sync_type):
        log.debug("SyncObjects.fetch: %s/%s" % (minion_id, sync_type))
        if minion_id is None:
            # We're probably being replayed to from the database
            log.warn("SyncObjects.fetch called with minion_id=None")
            return

        self._fetching_at[sync_type] = now()
        client = LocalClient(config.get('cthulhu', 'salt_config_path'))
        # TODO clean up unused 'since' argument
        pub_data = client.run_job(
            minion_id, 'ceph.get_cluster_object',
            condition_kwarg(
                [], {
                    'cluster_name': self._cluster_name,
                    'sync_type': sync_type.str,
                    'since': None
                }))
        if not pub_data:
            log.error("Failed to start fetch job %s/%s" %
                      (minion_id, sync_type))
            # Don't throw an exception because if a fetch fails we should end up
            # issuing another on next heartbeat
        else:
            log.debug("SyncObjects.fetch: jid=%s minions=%s" %
                      (pub_data['jid'], pub_data['minions']))
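A note on the snippet above: condition_kwarg is not defined in the example; it is the helper that old Salt releases shipped as salt.utils.condition_kwarg, which folds keyword arguments into the single argument list that LocalClient.run_job expects. A minimal sketch of that helper, assuming it matches the historical Salt implementation:

    # Sketch of old Salt's condition_kwarg (assumed to match the
    # historical salt.utils implementation; removed in modern Salt).
    def condition_kwarg(arg, kwarg):
        if isinstance(kwarg, dict):
            # Keyword arguments travel as a trailing dict tagged with
            # '__kwarg__' so the minion can tell them from positionals.
            kw_ = {'__kwarg__': True}
            kw_.update(kwarg)
            return list(arg) + [kw_]
        return arg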
Example #2
    def on_sync_object(self, fsid, sync_type, new, old):
        """
        Notification that a newer version of a SyncObject is available, or
        the first version of a SyncObject is available at startup (wherein
        old will be a null SyncObject)

        :param fsid: The FSID of the cluster to which the object belongs
        :param sync_type: A SyncObject subclass
        :param new: A SyncObject
        :param old: A SyncObject (same type as new)
        """
        log.debug("Eventer.on_sync_object: %s" % sync_type.str)

        if old.data is None:
            return

        if sync_type == OsdMap:
            self._on_pool_status(fsid, new, old)
            self._on_osd_map(fsid, new, old)
        elif sync_type == Health:
            self._on_health(fsid, new, old)
        elif sync_type == MonStatus:
            self._on_mon_status(fsid, new, old)
        elif sync_type == QuorumStatus:
            self._on_quorum_status(fsid, new, old)

        self._flush()
Example #3
    def on_fetch_complete(self, minion_id, sync_type, version, data):
        """
        :return: A SyncObject if this version was new to us, else None
        """
        log.debug("SyncObjects.on_fetch_complete %s/%s/%s" % (minion_id, sync_type.str, version))
        self._fetching_at[sync_type] = None

        # A fetch might give us a newer version than we knew we had asked for
        if sync_type.cmp(version, self._known_versions[sync_type]) > 0:
            self._known_versions[sync_type] = version

        # Don't store this if we already got something newer
        if sync_type.cmp(version, self.get_version(sync_type)) <= 0:
            log.warn("Ignoring outdated update %s/%s from %s" % (sync_type.str, version, minion_id))
            new_object = None
        else:
            log.info("Got new version %s/%s" % (sync_type.str, version))
            new_object = self.set_map(sync_type, version, data)

        # This might not be the latest: if it's not, send out another fetch
        # right away
        if sync_type.cmp(self._known_versions[sync_type], version) > 0:
            self.fetch(minion_id, sync_type)

        return new_object
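The sync_type.cmp calls above assume each SyncObject subclass knows how to order two versions of itself. A hedged sketch of that contract (illustrative, not Calamari's verbatim classes): epoch-versioned maps compare numerically, while hash-versioned data can only report "changed or not":

    # Assumed shape of the version comparison used by on_fetch_complete.
    class VersionedSyncObject(object):
        @classmethod
        def cmp(cls, a, b):
            # Monotonic integer epochs (e.g. the OSD map) order naturally.
            return cmp(a, b)  # Python 2 builtin, matching the era of this code

    class UnversionedSyncObject(object):
        @classmethod
        def cmp(cls, a, b):
            # Hash-style versions (e.g. health) are unordered: report 1
            # for any difference, 0 for equality.
            return 0 if a == b else 1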
Example #4
    def _run(self):
        log.info("%s running" % self.__class__.__name__)

        event = SaltEventSource(log, salt_config)
        while not self._complete.is_set():
            # No salt tag filtering: https://github.com/saltstack/salt/issues/11582
            ev = event.get_event(full=True)
            if ev is not None and 'tag' in ev:
                tag = ev['tag']
                data = ev['data']
                try:
                    if tag.startswith("ceph/cluster/"):
                        cluster_data = data['data']
                        if cluster_data['fsid'] not in self._manager.clusters:
                            self._manager.on_discovery(data['id'],
                                                       cluster_data)
                        else:
                            log.debug(
                                "%s: heartbeat from existing cluster %s" %
                                (self.__class__.__name__,
                                 cluster_data['fsid']))
                    elif re.match(r"^salt/job/\d+/ret/[^/]+$", tag):
                        if data['fun'] == 'saltutil.running':
                            self._manager.requests.on_tick_response(
                                data['id'], data['return'])
                        else:
                            self._manager.requests.on_completion(data)
                    else:
                        # This does not concern us, ignore it
                        log.debug("TopLevelEvents: ignoring %s" % tag)
                except:
                    log.exception("Exception handling message tag=%s" % tag)

        log.info("%s complete" % self.__class__.__name__)
Example #5
    def _run(self):
        log.info("%s running" % self.__class__.__name__)

        event = SaltEventSource(salt_config)
        while not self._complete.is_set():
            # No salt tag filtering: https://github.com/saltstack/salt/issues/11582
            ev = event.get_event(full=True)
            if ev is not None:
                tag = ev['tag']
                data = ev['data']
                try:
                    if tag.startswith("ceph/cluster/"):
                        cluster_data = data['data']
                        if cluster_data['fsid'] not in self._manager.clusters:
                            self._manager.on_discovery(data['id'],
                                                       cluster_data)
                        else:
                            log.debug(
                                "%s: heartbeat from existing cluster %s" %
                                (self.__class__.__name__,
                                 cluster_data['fsid']))
                    else:
                        # This does not concern us, ignore it
                        pass
                except:
                    log.debug("Message content: %s" % data)
                    log.exception("Exception handling message")

        log.info("%s complete" % self.__class__.__name__)
Example #6
    def _run(self):
        log.info("%s running" % self.__class__.__name__)

        event = SaltEventSource(salt_config)
        while not self._complete.is_set():
            # No salt tag filtering: https://github.com/saltstack/salt/issues/11582
            ev = event.get_event(full=True)
            if ev is not None:
                tag = ev['tag']
                data = ev['data']
                try:
                    if tag.startswith("ceph/cluster/"):
                        cluster_data = data['data']
                        if cluster_data['fsid'] not in self._manager.clusters:
                            self._manager.on_discovery(data['id'], cluster_data)
                        else:
                            log.debug("%s: heartbeat from existing cluster %s" % (
                                self.__class__.__name__, cluster_data['fsid']))
                    else:
                        # This does not concern us, ignore it
                        pass
                except:
                    log.debug("Message content: %s" % data)
                    log.exception("Exception handling message")

        log.info("%s complete" % self.__class__.__name__)
Example #7
    def on_sync_object(self, fsid, sync_type, new, old):
        """
        Notification that a newer version of a SyncObject is available, or
        the first version of a SyncObject is available at startup (wherein
        old will be a null SyncObject)

        :param fsid: The FSID of the cluster to which the object belongs
        :param sync_type: A SyncObject subclass
        :param new: A SyncObject
        :param old: A SyncObject (same type as new)
        """
        log.debug("Eventer.on_sync_object: %s" % sync_type.str)

        if old.data is None:
            return

        if sync_type == OsdMap:
            self._on_pool_status(fsid, new, old)
            self._on_osd_map(fsid, new, old)
        elif sync_type == Health:
            self._on_health(fsid, new, old)
        elif sync_type == MonStatus:
            self._on_mon_status(fsid, new, old)
        elif sync_type == QuorumStatus:
            self._on_quorum_status(fsid, new, old)

        self._flush()
Example #8
    def _run(self):
        log.info("%s running" % self.__class__.__name__)

        event = SaltEventSource(log, salt_config)
        while not self._complete.is_set():
            # No salt tag filtering: https://github.com/saltstack/salt/issues/11582
            ev = event.get_event(full=True)
            if ev is not None and 'tag' in ev:
                tag = ev['tag']
                data = ev['data']
                try:
                    if tag.startswith("ceph/cluster/"):
                        cluster_data = data['data']
                        if cluster_data['fsid'] not in self._manager.clusters:
                            self._manager.on_discovery(data['id'], cluster_data)
                        else:
                            log.debug("%s: heartbeat from existing cluster %s" % (
                                self.__class__.__name__, cluster_data['fsid']))
                    elif re.match(r"^salt/job/\d+/ret/[^/]+$", tag):
                        if data['fun'] == 'saltutil.running':
                            self._manager.requests.on_tick_response(data['id'], data['return'])
                        else:
                            self._manager.requests.on_completion(data)
                    else:
                        # This does not concern us, ignore it
                        log.debug("TopLevelEvents: ignoring %s" % tag)
                except:
                    log.exception("Exception handling message tag=%s" % tag)

        log.info("%s complete" % self.__class__.__name__)
Example #9
    def _is_favorite(self, minion_id):
        """
        Check if this minion is the one which we are currently treating
        as the primary source of updates, and promote it to be the
        favourite if the favourite has not sent a heartbeat within
        cthulhu->favorite_timeout_s.

        :return: True if this minion was the favorite or has just been
                 promoted.
        """
        t_now = now()
        self._last_heartbeat[minion_id] = t_now

        if self._favorite_mon is None:
            log.debug("%s is my new favourite" % minion_id)
            self._set_favorite(minion_id)
            return True
        elif minion_id != self._favorite_mon:
            # Consider whether this minion should become my new favourite: has it been
            # too long since my current favourite reported in?
            time_since = t_now - self._last_heartbeat[self._favorite_mon]
            favorite_timeout_s = self._servers.get_contact_period(self._favorite_mon) * FAVORITE_TIMEOUT_FACTOR
            if time_since > datetime.timedelta(seconds=favorite_timeout_s):
                log.debug("My old favourite, %s, has not sent a heartbeat for %s: %s is my new favourite" % (
                    self._favorite_mon, time_since, minion_id
                ))
                self._set_favorite(minion_id)

        return minion_id == self._favorite_mon
Example #10
    def on_version(self, reported_by, sync_type, new_version):
        """
        Notify me that a particular version of a particular map exists.

        I may choose to initiate RPC to retrieve the map
        """
        log.debug("SyncObjects.on_version %s/%s/%s" % (reported_by, sync_type.str, new_version))
        old_version = self.get_version(sync_type)
        if sync_type.cmp(new_version, old_version) > 0:
            known_version = self._known_versions[sync_type]
            if sync_type.cmp(new_version, known_version) > 0:
                # We are out of date: request an up to date copy
                log.info("Advanced known version %s/%s %s->%s" % (
                    self._cluster_name, sync_type.str, known_version, new_version))
                self._known_versions[sync_type] = new_version
            else:
                log.info("on_version: %s is newer than %s" % (new_version, old_version))

            # If we already have a request out for this type of map, then consider
            # cancelling it if we've already waited for a while.
            if self._fetching_at[sync_type] is not None:
                if now() - self._fetching_at[sync_type] < self.FETCH_TIMEOUT:
                    log.info("Fetch already underway for %s" % sync_type.str)
                    return
                else:
                    log.warn("Abandoning fetch for %s started at %s" % (
                        sync_type.str, self._fetching_at[sync_type]))

            log.info("on_version: fetching %s/%s from %s, currently got %s, know %s" % (
                sync_type, new_version, reported_by, old_version, known_version
            ))
            self.fetch(reported_by, sync_type)
Example #11
    def create(self, attributes):
        commands = [('osd pool create', {'pool': attributes['name'], 'pg_num': attributes['pg_num']})]

        # Calculate appropriate min_size, including default if none given
        req_size = attributes.get('size', 0)
        req_min_size = attributes.get('min_size', 0)
        attributes['min_size'] = self._pool_min_size(req_size, req_min_size)

        # Which attributes must we set after the initial create?
        post_create_attrs = attributes.copy()
        del post_create_attrs['name']
        del post_create_attrs['pg_num']
        if 'pgp_num' in post_create_attrs:
            del post_create_attrs['pgp_num']

        commands.extend(self._pool_attribute_commands(
            attributes['name'],
            post_create_attrs
        ))

        log.debug("Post-create attributes: %s" % post_create_attrs)
        log.debug("Commands: %s" % commands)

        return PoolCreatingRequest(
            "Creating pool '{name}'".format(name=attributes['name']),
            self._cluster_monitor.fsid, self._cluster_monitor.name,
            attributes['name'], commands)
Example #12
    def _emit_stats(self):
        try:
            if not self._socket:
                log.info("Opening carbon socket {0}:{1}".format(
                    self.CARBON_HOST, self.CARBON_PORT))
                self._socket = socket.socket(socket.AF_INET,
                                             socket.SOCK_STREAM)
                self._socket.connect((self.CARBON_HOST, self.CARBON_PORT))

            carbon_data = ""
            t = int(time.time())
            usage = resource.getrusage(resource.RUSAGE_SELF)
            for usage_field in ("utime", "stime", "maxrss", "ixrss", "idrss",
                                "isrss", "minflt", "majflt", "nswap",
                                "inblock", "oublock", "msgsnd", "msgrcv",
                                "nsignals", "nvcsw", "nivcsw"):
                val = getattr(usage, "ru_{0}".format(usage_field))
                log.debug("{0}: {1}".format(usage_field, val))
                carbon_data += "calamari.cthulhu.ru_{0} {1} {2}\n".format(
                    usage_field, val, t)

            self._socket.sendall(carbon_data)
        except (socket.gaierror, resource.error):
            log.exception("Failed to send debugging statistics")
            self._close()
Example #13
 def wrap(*args, **kwargs):
     log.debug("RpcInterface >> %s(%s, %s)" % (item, args, kwargs))
     try:
         rc = attr(*args, **kwargs)
         log.debug("RpcInterface << %s" % item)
     except:
         log.exception("RpcInterface !! %s" % item)
         raise
     return rc
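In this example item and attr are free variables: wrap is the closure built by a logging proxy's __getattr__. A minimal sketch of the enclosing class, assuming the usual shape of such a proxy (the example shows only the inner function):

    # Assumed enclosing proxy; only wrap() itself appears in the example.
    class RpcInterface(object):
        def __init__(self, real):
            self._real = real

        def __getattr__(self, item):
            attr = getattr(self._real, item)
            if not callable(attr):
                return attr

            def wrap(*args, **kwargs):
                log.debug("RpcInterface >> %s(%s, %s)" % (item, args, kwargs))
                try:
                    rc = attr(*args, **kwargs)
                    log.debug("RpcInterface << %s" % item)
                except:
                    log.exception("RpcInterface !! %s" % item)
                    raise
                return rc

            return wrap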
Example #14
 def wrap(*args, **kwargs):
     log.debug("RpcInterface >> %s(%s, %s)" % (item, args, kwargs))
     try:
         rc = attr(*args, **kwargs)
         log.debug("RpcInterface << %s" % item)
     except:
         log.exception("RpcInterface !! %s" % item)
         raise
     return rc
Example #15
    def _run(self):
        self._emit(INFO, "Calamari server started")
        self._flush()

        self._complete.wait(GRACE_PERIOD)
        while not self._complete.is_set():
            self.on_tick()
            self._complete.wait(TICK_SECONDS)
        log.debug("Eventer complete")
Example #16
    def _run(self):
        self._emit(INFO, "Calamari server started")
        self._flush()

        self._complete.wait(GRACE_PERIOD)
        while not self._complete.is_set():
            self.on_tick()
            self._complete.wait(TICK_SECONDS)
        log.debug("Eventer complete")
Example #17
    def _run(self):
        log.debug("Eventer running")
        self._emit(INFO, "Calamari server started")
        self._emit_to_salt_bus(SEVERITIES[INFO], "Calamari server started", "ceph/calamari/started")
        self._flush()

        self._complete.wait(GRACE_PERIOD)
        while not self._complete.is_set():
            self.on_tick()
            self._complete.wait(TICK_SECONDS)
        log.debug("Eventer complete")
Example #18
    def _run(self):
        log.debug("Eventer running")
        self._emit(INFO, "Calamari server started")
        self._emit_to_salt_bus(SEVERITIES[INFO], "Calamari server started",
                               "ceph/calamari/started")
        self._flush()

        self._complete.wait(GRACE_PERIOD)
        while not self._complete.is_set():
            self.on_tick()
            self._complete.wait(TICK_SECONDS)
        log.debug("Eventer complete")
Example #19
    def on_tick(self):
        """
        Periodically call this to drive non-event-driven events (i.e. things
        which are based on walltime checks)
        """
        log.debug("Eventer.on_tick")

        now_utc = now()

        for fqdn, server_state in self._manager.servers.servers.items():
            if not server_state.managed:
                # We don't expect messages from unmanaged servers so don't
                # worry about whether they sent us one recently.
                continue

            if len(server_state.clusters) == 1:
                # Because Events can only be associated with one FSID, we only make this
                # association for servers with exactly one cluster.  This is a bit cheeky and
                # kind of an unnecessary limitation in the Event DB schema.
                fsid = server_state.clusters[0]
            else:
                fsid = None

            contact_threshold = CONTACT_THRESHOLD_FACTOR * self._manager.servers.get_contact_period(fqdn)
            if now_utc - server_state.last_contact > datetime.timedelta(seconds=contact_threshold):
                if fqdn not in self._servers_complained:
                    self._emit(WARNING, "Server {fqdn} is late reporting in, last report at {last}".format(
                        fqdn=fqdn, last=server_state.last_contact
                    ), fqdn=fqdn, fsid=fsid)
                    self._servers_complained.add(fqdn)
            else:
                if fqdn in self._servers_complained:
                    self._emit(RECOVERY, "Server {fqdn} regained contact".format(fqdn=fqdn),
                               fqdn=fqdn, fsid=fsid)
                    self._servers_complained.discard(fqdn)

        for fsid, cluster_monitor in self._manager.clusters.items():
            if cluster_monitor.update_time is None or now_utc - cluster_monitor.update_time > datetime.timedelta(
                    seconds=CLUSTER_CONTACT_THRESHOLD):
                if fsid not in self._clusters_complained:
                    self._clusters_complained.add(fsid)
                    self._emit(WARNING, "Cluster '{name}' is late reporting in".format(name=cluster_monitor.name),
                               fsid=fsid)
            else:
                if fsid in self._clusters_complained:
                    self._emit(RECOVERY, "Cluster '{name}' regained contact".format(name=cluster_monitor.name),
                               fsid=fsid)
                    self._clusters_complained.discard(fsid)

        self._flush()
Example #20
    def _run(self):
        self._plugin_monitor.start()

        self._ready.set()
        log.debug("ClusterMonitor._run: ready")

        remote.listen(self._complete,
                      on_heartbeat=self.on_heartbeat,
                      fsid=self.fsid,
                      on_job=self.on_job_complete)

        log.info("%s complete" % self.__class__.__name__)
        self._plugin_monitor.stop()
        self._plugin_monitor.join()
        self.done.set()
Example #21
    def _run(self):
        self._plugin_monitor.start()

        self._ready.set()
        log.debug("ClusterMonitor._run: ready")

        remote.listen(self._complete,
                      on_heartbeat=self.on_heartbeat,
                      fsid=self.fsid,
                      on_job=self.on_job_complete)

        log.info("%s complete" % self.__class__.__name__)
        self._plugin_monitor.stop()
        self._plugin_monitor.join()
        self.done.set()
Example #22
    def tick(self):
        """
        For walltime-based monitoring of running requests.  Long-running requests
        get a periodic call to saltutil.running to verify that things really
        are still happening.
        """

        if not self._by_jid:
            return
        else:
            log.debug("RequestCollection.tick: %s JIDs underway" %
                      len(self._by_jid))

        # Identify JIDs that haven't had a saltutil.running response for too long.
        # Kill requests in a separate phase because request:JID is not 1:1
        stale_jobs = set()
        _now = now()
        for request in self._by_jid.values():
            if _now - request.alive_at > datetime.timedelta(
                    seconds=TICK_PERIOD * 3):
                log.error("Request %s JID %s stale: now=%s, alive_at=%s" %
                          (request.id, request.jid, _now, request.alive_at))
                stale_jobs.add(request)

        # Any identified stale jobs are errored out.
        for request in stale_jobs:
            with self._update_index(request):
                request.set_error("Lost contact")
                request.jid = None
                request.complete()

        # Identify minions associated with JIDs in flight
        query_minions = set()
        for jid, request in self._by_jid.items():
            query_minions.add(request.minion_id)

        # Attempt to emit a saltutil.running to ping jobs; next tick we
        # will check whether alive_at was updated to indicate non-staleness
        if query_minions:
            log.info("RequestCollection.tick: sending saltutil.running to {0}".
                     format(query_minions))
            client = LocalClient(config.get('cthulhu', 'salt_config_path'))
            pub_data = client.run_job(list(query_minions),
                                      'saltutil.running', [],
                                      expr_form="list")
            if not pub_data:
                log.warning("Failed to publish saltutil.running to {0}".format(
                    query_minions))
Example #23
    def on_tick_response(self, minion_id, jobs):
        """
        Update the alive_at parameter of requests to record that they
        are still running remotely.

        :param jobs: The response from a saltutil.running
        """
        log.debug("RequestCollection.on_tick_response: %s from %s" % (len(jobs), minion_id))
        for job in jobs:
            try:
                request = self._by_jid[job['jid']]
            except KeyError:
                # Not one of mine, ignore it
                pass
            else:
                request.alive_at = now()
Example #24
    def on_tick_response(self, minion_id, jobs):
        """
        Update the alive_at parameter of requests to record that they
        are still running remotely.

        :param jobs: The response from a saltutil.running
        """
        log.debug("RequestCollection.on_tick_response: %s from %s" %
                  (len(jobs), minion_id))
        for job in jobs:
            try:
                request = self._by_jid[job['jid']]
            except KeyError:
                # Not one of mine, ignore it
                pass
            else:
                request.alive_at = now()
Example #25
    def on_job_complete(self, fqdn, jid, success, result, cmd, args):
        # It would be much nicer to put the FSID at the start of
        # the tag, if salt would only let us add custom tags to our jobs.
        # Instead we enforce a convention that calamari jobs include
        # fsid in their return value.
        if 'fsid' not in result or result['fsid'] != self.fsid:
            # Something for a different ClusterMonitor
            log.debug("Ignoring job return, not for my FSID")
            return

        if cmd == 'ceph.get_cluster_object':
            # A ceph.get_cluster_object response
            if not success:
                log.error("on_sync_object: failure from %s: %s" % (fqdn, result))
                return

            self.on_sync_object(fqdn, result)
        else:
            log.warning("Unexpected function '%s' (%s)" % (cmd, cmd))
Example #26
    def tick(self):
        """
        For walltime-based monitoring of running requests.  Long-running requests
        get a periodic call to saltutil.running to verify that things really
        are still happening.
        """

        if not self._by_jid:
            return
        else:
            log.debug("RequestCollection.tick: %s JIDs underway" % len(self._by_jid))

        # Identify JIDs that haven't had a saltutil.running response for too long.
        # Kill requests in a separate phase because request:JID is not 1:1
        stale_jobs = set()
        _now = now()
        for request in self._by_jid.values():
            if _now - request.alive_at > datetime.timedelta(seconds=TICK_PERIOD * 3):
                log.error("Request %s JID %s stale: now=%s, alive_at=%s" % (
                    request.id, request.jid, _now, request.alive_at
                ))
                stale_jobs.add(request)

        # Any identified stale jobs are errored out.
        for request in stale_jobs:
            with self._update_index(request):
                request.set_error("Lost contact")
                request.jid = None
                request.complete()

        # Identify minions associated with JIDs in flight
        query_minions = set()
        for jid, request in self._by_jid.items():
            query_minions.add(request.minion_id)

        # Attempt to emit a saltutil.running to ping jobs; next tick we
        # will check whether alive_at was updated to indicate non-staleness
        if query_minions:
            log.info("RequestCollection.tick: sending saltutil.running to {0}".format(query_minions))
            client = LocalClient(config.get('cthulhu', 'salt_config_path'))
            pub_data = client.run_job(list(query_minions), 'saltutil.running', [], expr_form="list")
            if not pub_data:
                log.warning("Failed to publish saltutil.running to {0}".format(query_minions))
Example #27
    def run_plugin(self, plugin_name, status_processor, period):
        # Slice off some time for the checks, leaving some for the status_processor
        check_timeout = int(period * .75)
        salt_name = '.'.join((plugin_name, 'status_check'))

        while not self._complete.is_set():
            start = int(time.time())
            timeout_at = start + period
            servers = [s.fqdn for s in self._servers.get_all()]
            check_data = self.filter_errors(self._remote_run_cmd_async(servers,
                                                                       salt_name,
                                                                       timeout=check_timeout),
                                            salt_name)

            self.plugin_results[plugin_name] = status_processor(check_data)
            log.debug("processed " + str(plugin_name) + str(check_data))

            time_left = timeout_at - int(time.time())
            gevent.sleep(max(0, time_left))
Example #28
    def run_plugin(self, plugin_name, status_processor, period):
        # Slice off some time for the checks, leaving some for the status_processor
        check_timeout = int(period * .75)
        salt_name = '.'.join((plugin_name, 'status_check'))

        while not self._complete.is_set():
            start = int(time.time())
            timeout_at = start + period
            servers = [s.fqdn for s in self._servers.get_all()]
            check_data = self.filter_errors(
                self._remote_run_cmd_async(servers,
                                           salt_name,
                                           timeout=check_timeout), salt_name)

            self.plugin_results[plugin_name] = status_processor(check_data)
            log.debug("processed " + str(plugin_name) + str(check_data))

            time_left = timeout_at - int(time.time())
            gevent.sleep(max(0, time_left))
Example #29
    def fetch(self, minion_id, sync_type):
        log.debug("SyncObjects.fetch: %s/%s" % (minion_id, sync_type))
        if minion_id is None:
            # We're probably being replayed to from the database
            log.warn("SyncObjects.fetch called with minion_id=None")
            return

        self._fetching_at[sync_type] = now()
        client = LocalClient(config.get('cthulhu', 'salt_config_path'))
        # TODO clean up unused 'since' argument
        pub_data = client.run_job(minion_id, 'ceph.get_cluster_object',
                                  condition_kwarg([], {'cluster_name': self._cluster_name,
                                                       'sync_type': sync_type.str,
                                                       'since': None}))
        if not pub_data:
            log.error("Failed to start fetch job %s/%s" % (minion_id, sync_type))
            # Don't throw an exception because if a fetch fails we should end up
            # issuing another on next heartbeat
        else:
            log.debug("SyncObjects.fetch: jid=%s minions=%s" % (pub_data['jid'], pub_data['minions']))
Example #30
    def __init__(self, manager):
        super(Eventer, self).__init__()
        self._manager = manager

        self._complete = gevent.event.Event()

        # Flags for things we have complained about being out of contact
        # with, to avoid generating the same events repeatedly
        self._servers_complained = set()
        self._clusters_complained = set()

        # Check the config to decide if events have to be pushed to the salt
        # event bus. If so, initialize the salt Caller object used to push events.
        if EMIT_EVENTS_TO_SALT_EVENT_BUS:
            log.debug("Events will be emitted to salt event bus")
            __opts__ = salt.config.minion_config(MINION_CONFIG)
            __opts__['file_client'] = 'local'
            self.caller = salt.client.Caller(mopts=__opts__)

        self._events = []
Example #31
    def fetch(self, minion_id, sync_type):
        log.debug("SyncObjects.fetch: %s/%s" % (minion_id, sync_type))
        if minion_id is None:
            # We're probably being replayed to from the database
            log.warn("SyncObjects.fetch called with minion_id=None")
            return

        self._fetching_at[sync_type] = now()
        try:
            # TODO clean up unused 'since' argument
            jid = remote.run_job(minion_id, 'ceph.get_cluster_object',
                                 {'cluster_name': self._cluster_name,
                                  'sync_type': sync_type.str,
                                  'since': None})
        except Unavailable:
            # Don't throw an exception because if a fetch fails we should end up
            # issuing another on next heartbeat
            log.error("Failed to start fetch job %s/%s" % (minion_id, sync_type))
        else:
            log.debug("SyncObjects.fetch: jid=%s" % jid)
Example #32
    def __init__(self, manager):
        super(Eventer, self).__init__()
        self._manager = manager

        self._complete = gevent.event.Event()

        # Flags for things we have complained about being out of contact
        # with, to avoid generating the same events repeatedly
        self._servers_complained = set()
        self._clusters_complained = set()

        # Check the config to decide if events have to be pushed to the salt
        # event bus. If so, initialize the salt Caller object used to push events.
        if EMIT_EVENTS_TO_SALT_EVENT_BUS:
            log.debug("Events will be emitted to salt event bus")
            __opts__ = salt.config.minion_config(MINION_CONFIG)
            __opts__['file_client'] = 'local'
            self.caller = salt.client.Caller(mopts=__opts__)

        self._events = []
Example #33
    def on_sync_object(self, minion_id, data):
        if minion_id != self._favorite_mon:
            log.debug("Ignoring map from %s, it is not my favourite (%s)" % (minion_id, self._favorite_mon))

        assert data['fsid'] == self.fsid

        sync_object = data['data']

        sync_type = SYNC_OBJECT_STR_TYPE[data['type']]
        new_object = self.inject_sync_object(minion_id, data['type'], data['version'], sync_object)
        if new_object:
            self._requests.on_map(self.fsid, sync_type, new_object)
            self._persister.update_sync_object(
                self.fsid,
                self.name,
                sync_type.str,
                new_object.version if isinstance(new_object.version, int) else None,
                now(), sync_object)
        else:
            log.warn("ClusterMonitor.on_sync_object: stale object received from %s" % minion_id)
Example #34
    def _emit_stats(self):
        try:
            if not self._socket:
                log.info("Opening carbon socket {0}:{1}".format(self.CARBON_HOST, self.CARBON_PORT))
                self._socket = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
                self._socket.connect((self.CARBON_HOST, self.CARBON_PORT))

            carbon_data = ""
            t = int(time.time())
            usage = resource.getrusage(resource.RUSAGE_SELF)
            for usage_field in ("utime", "stime", "maxrss", "ixrss", "idrss", "isrss", "minflt", "majflt",
                                "nswap", "inblock", "oublock", "msgsnd", "msgrcv", "nsignals", "nvcsw", "nivcsw"):
                val = getattr(usage, "ru_{0}".format(usage_field))
                log.debug("{0}: {1}".format(usage_field, val))
                carbon_data += "calamari.cthulhu.ru_{0} {1} {2}\n".format(usage_field, val, t)

            self._socket.sendall(carbon_data)
        except (socket.gaierror, resource.error):
            log.exception("Failed to send debugging statistics")
            self._close()
Example #35
    def on_job_complete(self, fqdn, jid, success, result, cmd, args):
        # It would be much nicer to put the FSID at the start of
        # the tag, if salt would only let us add custom tags to our jobs.
        # Instead we enforce a convention that calamari jobs include
        # fsid in their return value.
        if 'fsid' not in result or result['fsid'] != self.fsid:
            # Something for a different ClusterMonitor
            log.debug("Ignoring job return, not for my FSID")
            return

        if cmd == 'ceph.get_cluster_object':
            # A ceph.get_cluster_object response
            if not success:
                log.error("on_sync_object: failure from %s: %s" %
                          (fqdn, result))
                return

            self.on_sync_object(fqdn, result)
        else:
            log.warning("Unexpected function '%s' (%s)" % (cmd, cmd))
Example #36
    def create(self, attributes):
        commands = [('osd pool create', {'pool': attributes['name'], 'pg_num': attributes['pg_num']})]

        # Which attributes must we set after the initial create?
        post_create_attrs = attributes.copy()
        del post_create_attrs['name']
        del post_create_attrs['pg_num']
        if 'pgp_num' in post_create_attrs:
            del post_create_attrs['pgp_num']

        commands.extend(self._pool_attribute_commands(
            attributes['name'],
            post_create_attrs
        ))

        log.debug("Post-create attributes: %s" % post_create_attrs)
        log.debug("Commands: %s" % post_create_attrs)

        return PoolCreatingRequest(
            "Creating pool '{name}'".format(name=attributes['name']),
            self._cluster_monitor.fsid, self._cluster_monitor.name,
            attributes['name'], commands)
Example #37
    def on_heartbeat(self, minion_id, cluster_data):
        """
        Handle a ceph.heartbeat from a minion.

        Heartbeats come from all servers, but we're mostly interested in those
        which come from a mon (and therefore have the 'clusters' attribute populated)
        as these tell us whether there are any new versions of cluster maps
        for us to fetch.
        """

        if not self._is_favorite(minion_id):
            log.debug('Ignoring cluster data from %s, it is not my favourite (%s)' % (minion_id, self._favorite_mon))
            return

        self.update_time = datetime.datetime.utcnow().replace(tzinfo=utc)

        log.debug('Checking for version increments in heartbeat from %s' % minion_id)
        for sync_type in SYNC_OBJECT_TYPES:
            self._sync_objects.on_version(
                minion_id,
                sync_type,
                cluster_data['versions'][sync_type.str])
Example #38
    def fetch(self, minion_id, sync_type):
        log.debug("SyncObjects.fetch: %s/%s" % (minion_id, sync_type))
        if minion_id is None:
            # We're probably being replayed to from the database
            log.warn("SyncObjects.fetch called with minion_id=None")
            return

        self._fetching_at[sync_type] = now()
        try:
            # TODO clean up unused 'since' argument
            jid = remote.run_job(
                minion_id, 'ceph.get_cluster_object', {
                    'cluster_name': self._cluster_name,
                    'sync_type': sync_type.str,
                    'since': None
                })
        except Unavailable:
            # Don't throw an exception because if a fetch fails we should end up
            # issuing another on next heartbeat
            log.error("Failed to start fetch job %s/%s" %
                      (minion_id, sync_type))
        else:
            log.debug("SyncObjects.fetch: jid=%s" % jid)
Example #39
    def inject_sync_object(self, minion_id, sync_type, version, data):
        sync_type = SYNC_OBJECT_STR_TYPE[sync_type]
        old_object = self._sync_objects.get(sync_type)
        new_object = self._sync_objects.on_fetch_complete(minion_id, sync_type, version, data)

        if new_object:
            # The ServerMonitor is interested in cluster maps, do this prior
            # to updating any derived objects so that derived generators have
            # access to latest view of server state
            if sync_type == OsdMap:
                self._servers.on_osd_map(data)
            elif sync_type == MonMap:
                self._servers.on_mon_map(data)
            elif sync_type == MdsMap:
                self._servers.on_mds_map(self.fsid, data)

            # The frontend would like us to maintain some derived objects that
            # munge together the PG and OSD maps into an easier-to-consume form.
            for generator in derived.generators:
                if sync_type in generator.depends:
                    dependency_data = {}
                    for t in generator.depends:
                        obj = self._sync_objects.get(t)
                        if obj is not None:
                            dependency_data[t] = obj.data
                        else:
                            dependency_data[t] = None

                    if None not in dependency_data.values():
                        log.debug("Updating %s" % generator.__name__)
                        derived_objects = generator.generate(self, self._servers, dependency_data)
                        self._derived_objects.update(derived_objects)

            self._eventer.on_sync_object(self.fsid, sync_type, new_object, old_object)

        return new_object
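The derived.generators loop above implies a small plugin contract: each generator declares the sync types it depends on and builds a dict of derived objects from their data. A hedged sketch of a conforming generator (names are illustrative, not from the Calamari source):

    # Illustrative generator: 'depends' lists SyncObject subclasses,
    # generate() receives {sync_type: data} and returns {name: object}.
    class OsdSummary(object):
        depends = [OsdMap]

        @classmethod
        def generate(cls, cluster_monitor, servers, dependency_data):
            osd_map = dependency_data[OsdMap]
            up = [o for o in osd_map['osds'] if o['up']]
            return {'osd_summary': {'total': len(osd_map['osds']),
                                    'up': len(up)}}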
Example #40
    def _emit_to_salt_bus(self, severity, message, tag, **tags):
        """
        This function emits events to salt event bus, if the config
        value "emit_events_to_salt_event_bus" is set to true.
        """
        log.debug("Eventer running _emit_salt")
        if not EMIT_EVENTS_TO_SALT_EVENT_BUS:
            return

        log.debug("Eventer running _emit_salt")
        res = {}
        res["message"] = message
        res["severity"] = severity
        res["tags"] = tags
        tag = EVENT_TAG_PREFIX + tag

        log.debug("Eventer._emit_to_salt_bus: Tag:%s | Data: %s" %
                  (str(tag), str(res)))

        self.caller.sminion.functions['event.send'](tag, res)
Example #41
    def _emit_to_salt_bus(self, severity, message, tag, **tags):
        """
        This function emits events to salt event bus, if the config
        value "emit_events_to_salt_event_bus" is set to true.
        """
        log.debug("Eventer running _emit_salt")
        if not EMIT_EVENTS_TO_SALT_EVENT_BUS:
            return

        log.debug("Eventer running _emit_salt")
        res = {}
        res["message"] = message
        res["severity"] = severity
        res["tags"] = tags
        tag = EVENT_TAG_PREFIX + tag

        log.debug("Eventer._emit_to_salt_bus: Tag:%s | Data: %s" % (str(tag), str(res)))

        self.caller.sminion.functions['event.send'](
            tag,
            res
        )
Example #42
 def on_heartbeat(self, fqdn, data):
     if data['fsid'] not in self._manager.clusters:
         self._manager.on_discovery(fqdn, data)
     else:
         log.debug("%s: heartbeat from existing cluster %s" % (
             self.__class__.__name__, data['fsid']))
Example #43
 def list_server_logs(self, fqdn):
     client = LocalClient(config.get('cthulhu', 'salt_config_path'))
     results = client.cmd(fqdn, "log_tail.list_logs", ["."])
     log.debug('list_server_log result !!! {results}'.format(
         results=str(results)))
     return results
Example #44
 def stop(self):
     log.debug("Eventer stopping")
     self._complete.set()
Example #45
    def _run(self):
        self._plugin_monitor.start()

        self._ready.set()
        log.debug("ClusterMonitor._run: ready")

        event = SaltEventSource(log, salt_config)

        while not self._complete.is_set():
            # No salt tag filtering: https://github.com/saltstack/salt/issues/11582
            ev = event.get_event(full=True)

            if ev is not None:
                data = ev['data']
                tag = ev['tag']
                log.debug("_run.ev: %s/tag=%s" % (data['id'] if 'id' in data else None, tag))

                # I am interested in the following tags:
                # - salt/job/<jid>/ret/<minion id> where jid is one that I started
                #   (this includes ceph.rados_command and ceph.get_cluster_object)
                # - ceph/cluster/<fsid> where fsid is my fsid

                try:
                    if tag.startswith("ceph/cluster/{0}".format(self.fsid)):
                        # A ceph.heartbeat beacon
                        self.on_heartbeat(data['id'], data['data'])
                    elif re.match(r"^salt/job/\d+/ret/[^/]+$", tag):
                        if data['fun'] == "saltutil.running":
                            # Update on what jobs are running
                            # It would be nice to filter these down to those which really are for
                            # this cluster, but as long as N_clusters and N_jobs are reasonably small
                            # it's not an efficiency problem.
                            self._requests.on_tick_response(data['id'], data['return'])

                        # It would be much nicer to put the FSID at the start of
                        # the tag, if salt would only let us add custom tags to our jobs.
                        # Instead we enforce a convention that all calamari jobs must include
                        # fsid in their return value.
                        if (not isinstance(data, dict)) or not isinstance(data['return'], dict):
                            # Something not formatted for ClusterMonitor
                            log.warning("Ignoring event %s" % tag)
                            continue

                        if 'fsid' not in data['return'] or data['return']['fsid'] != self.fsid:
                            # Something for a different ClusterMonitor
                            log.debug("Ignoring job return, not for my FSID")
                            continue

                        if data['fun'] == 'ceph.get_cluster_object':
                            # A ceph.get_cluster_object response
                            if not data['success']:
                                log.error("on_sync_object: failure from %s: %s" % (data['id'], data['return']))
                                continue

                            self.on_sync_object(data['id'], data['return'])
                        else:
                            log.warning("Unexpected function '%s' (%s)" % (data['fun'], tag))
                    else:
                        # This does not concern us, ignore it
                        pass
                except:
                    # Because this is our main event handling loop, swallow exceptions
                    # instead of letting them end the world.
                    log.exception("Exception handling message with tag %s" % tag)
                    log.debug("Message content: %s" % data)

        log.info("%s complete" % self.__class__.__name__)
        self._plugin_monitor.stop()
        self._plugin_monitor.join()
        self.done.set()
Example #46
 def reset_event_sink(self):
     if EMIT_EVENTS_TO_SALT_EVENT_BUS:
         log.debug("resetting minion")
         __opts__ = salt.config.minion_config(MINION_CONFIG)
         __opts__['file_client'] = 'local'
         self.caller = salt.client.Caller(mopts=__opts__)
Example #47
    def _recover(self):
        if sqlalchemy is None:
            return

        session = Session()
        for server in session.query(Server).all():
            log.debug("Recovered server %s" % server.fqdn)
            assert server.boot_time is None or server.boot_time.tzinfo is not None  # expect timezone-aware DB backend
            self.servers.inject_server(
                ServerState(fqdn=server.fqdn,
                            hostname=server.hostname,
                            managed=server.managed,
                            last_contact=server.last_contact,
                            boot_time=server.boot_time,
                            ceph_version=server.ceph_version))

        for service in session.query(Service).all():
            if service.server:
                server = session.query(Server).get(service.server)
            else:
                server = None
            log.debug("Recovered service %s/%s/%s on %s" %
                      (service.fsid, service.service_type, service.service_id,
                       server.fqdn if server else None))
            self.servers.inject_service(
                ServiceState(fsid=service.fsid,
                             service_type=service.service_type,
                             service_id=service.service_id),
                server.fqdn if server else None)

        # I want the most recent version of every sync_object
        fsids = [(row[0], row[1]) for row in session.query(
            SyncObject.fsid, SyncObject.cluster_name).distinct(SyncObject.fsid)
                 ]
        for fsid, name in fsids:
            cluster_monitor = ClusterMonitor(fsid, name, self.notifier,
                                             self.persister, self.servers,
                                             self.eventer, self.requests)
            self.clusters[fsid] = cluster_monitor

            object_types = [
                row[0]
                for row in session.query(SyncObject.sync_type).filter_by(
                    fsid=fsid).distinct()
            ]
            for sync_type in object_types:
                latest_record = session.query(SyncObject).filter_by(
                    fsid=fsid,
                    sync_type=sync_type).order_by(SyncObject.version.desc(),
                                                  SyncObject.when.desc())[0]

                # FIXME: bit of a hack because persisted records only store their
                # 'version' if it's a real counter version; the underlying problem
                # is that some data (health, pg_brief) has no usable version counters.
                def md5(raw):
                    hasher = hashlib.md5()
                    hasher.update(raw)
                    return hasher.hexdigest()

                if latest_record.version:
                    version = latest_record.version
                else:
                    version = md5(latest_record.data)

                when = latest_record.when
                when = when.replace(tzinfo=tzutc())
                if cluster_monitor.update_time is None or when > cluster_monitor.update_time:
                    cluster_monitor.update_time = when

                cluster_monitor.inject_sync_object(
                    None, sync_type, version,
                    msgpack.unpackb(latest_record.data))

        for monitor in self.clusters.values():
            log.info("Recovery: Cluster %s with update time %s" %
                     (monitor.fsid, monitor.update_time))
            monitor.start()
Example #48
 def stop(self):
     log.debug("Eventer stopping")
     self._complete.set()
Example #49
    def _recover(self):
        if sqlalchemy is None:
            return

        session = Session()
        for server in session.query(Server).all():
            log.debug("Recovered server %s" % server.fqdn)
            assert server.boot_time is None or server.boot_time.tzinfo is not None  # expect timezone-aware DB backend
            self.servers.inject_server(ServerState(
                fqdn=server.fqdn,
                hostname=server.hostname,
                managed=server.managed,
                last_contact=server.last_contact,
                boot_time=server.boot_time,
                ceph_version=server.ceph_version
            ))

        for service in session.query(Service).all():
            if service.server:
                server = session.query(Server).get(service.server)
            else:
                server = None
            log.debug("Recovered service %s/%s/%s on %s" % (
                service.fsid, service.service_type, service.service_id, server.fqdn if server else None
            ))
            self.servers.inject_service(ServiceState(
                fsid=service.fsid,
                service_type=service.service_type,
                service_id=service.service_id
            ), server.fqdn if server else None)

        # I want the most recent version of every sync_object
        fsids = [(row[0], row[1]) for row in session.query(SyncObject.fsid, SyncObject.cluster_name).distinct(SyncObject.fsid)]
        for fsid, name in fsids:
            cluster_monitor = ClusterMonitor(fsid, name, self.persister, self.servers,
                                             self.eventer, self.requests)
            self.clusters[fsid] = cluster_monitor

            object_types = [row[0] for row in session.query(SyncObject.sync_type).filter_by(fsid=fsid).distinct()]
            for sync_type in object_types:
                latest_record = session.query(SyncObject).filter_by(
                    fsid=fsid, sync_type=sync_type).order_by(
                    SyncObject.version.desc(), SyncObject.when.desc())[0]

                # FIXME: bit of a hack because persisted records only store their
                # 'version' if it's a real counter version; the underlying problem
                # is that some data (health, pg_brief) has no usable version counters.
                def md5(raw):
                    hasher = hashlib.md5()
                    hasher.update(raw)
                    return hasher.hexdigest()

                if latest_record.version:
                    version = latest_record.version
                else:
                    version = md5(latest_record.data)

                when = latest_record.when
                when = when.replace(tzinfo=tzutc())
                if cluster_monitor.update_time is None or when > cluster_monitor.update_time:
                    cluster_monitor.update_time = when

                cluster_monitor.inject_sync_object(None, sync_type, version, msgpack.unpackb(latest_record.data))

        for monitor in self.clusters.values():
            log.info("Recovery: Cluster %s with update time %s" % (monitor.fsid, monitor.update_time))
            monitor.start()
Example #50
 def list_server_logs(self, fqdn):
     client = LocalClient(config.get('cthulhu', 'salt_config_path'))
     results = client.cmd(fqdn, "log_tail.list_logs", ["."])
     log.debug('list_server_log result !!! {results}'.format(results=str(results)))
     return results