Example #1
    def tick(self):
        """
        For walltime-based monitoring of running requests.  Long-running requests
        get a periodic call to saltutil.running to verify that things really
        are still happening.
        """

        if not self._by_jid:
            return
        else:
            log.debug("RequestCollection.tick: %s JIDs underway" %
                      len(self._by_jid))

        # Identify JIDs that haven't had a saltutil.running response for too long.
        # Kill requests in a separate phase because request:JID is not 1:1
        stale_jobs = set()
        _now = now()
        for request in self._by_jid.values():
            if _now - request.alive_at > datetime.timedelta(
                    seconds=TICK_PERIOD * 3):
                log.error("Request %s JID %s stale: now=%s, alive_at=%s" %
                          (request.id, request.jid, _now, request.alive_at))
                stale_jobs.add(request)

        # Any identified stale jobs are errored out.
        for request in stale_jobs:
            with self._update_index(request):
                request.set_error("Lost contact")
                request.jid = None
                request.complete()

        # Identify minions associated with JIDs in flight
        query_minions = set()
        for jid, request in self._by_jid.items():
            query_minions.add(request.minion_id)

        # Attempt to emit a saltutil.running to ping jobs; on the next tick we
        # check whether alive_at was updated, which indicates non-staleness.
        if query_minions:
            log.info("RequestCollection.tick: sending saltutil.running to {0}".
                     format(query_minions))
            client = LocalClient(config.get('cthulhu', 'salt_config_path'))
            pub_data = client.run_job(list(query_minions),
                                      'saltutil.running', [],
                                      expr_form="list")
            if not pub_data:
                log.warning("Failed to publish saltutil.running to {0}".format(
                    query_minions))
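
A minimal sketch, not from the source, of how tick() is typically driven: the owner of the collection runs a periodic greenlet that calls tick() every TICK_PERIOD seconds, so the 3 * TICK_PERIOD staleness window above amounts to roughly three missed pings. This assumes gevent (used elsewhere in this codebase); tick_loop and the TICK_PERIOD value are hypothetical:

    import gevent

    TICK_PERIOD = 20  # hypothetical value standing in for the real constant

    def tick_loop(requests, complete):
        # Call RequestCollection.tick() on a fixed cadence until shutdown is
        # signalled via the `complete` gevent.event.Event.
        while not complete.is_set():
            requests.tick()
            gevent.sleep(TICK_PERIOD)

    # e.g. gevent.spawn(tick_loop, self._requests, self._complete)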
Example #2
    def get_event(self, *args, **kwargs):
        """
        Wrap MasterEvent.get_event
        """
        ev = self._master_event.get_event(self.POLL_TIMEOUT, *args, **kwargs)
        if ev is None:
            self._silence_counter += self.POLL_TIMEOUT
            if self._silence_counter > self.SILENCE_TIMEOUT:
                log.warning("Re-opening connection to salt-master")

                self._silence_counter = 0
                # Re-open the connection as a precaution, in case the lack of
                # messages is a symptom of a connection that has gone bad.
                old_ev = self._master_event
                gevent.spawn(lambda: self._destroy_conn(old_ev))
                self._master_event = salt.utils.event.MasterEvent(self._config['sock_dir'])
        else:
            self._silence_counter = 0
            return ev
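
The POLL_TIMEOUT and SILENCE_TIMEOUT class constants are not shown in this snippet. A minimal sketch of how they relate, and of a loop consuming the wrapper; the constant values and the handle() callback are hypothetical:

    class SaltEventSource(object):
        POLL_TIMEOUT = 5       # seconds to block in each MasterEvent.get_event call
        SILENCE_TIMEOUT = 30   # seconds of silence tolerated before reconnecting

    def consume(source, handle):
        # A None return just means the poll timed out; any reconnection has
        # already been handled inside get_event, so simply poll again.
        while True:
            ev = source.get_event(full=True)
            if ev is not None:
                handle(ev['tag'], ev['data'])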
Example #3
    def on_job_complete(self, fqdn, jid, success, result, cmd, args):
        # It would be much nicer to put the FSID at the start of
        # the tag, if salt would only let us add custom tags to our jobs.
        # Instead we enforce a convention that calamari jobs include
        # fsid in their return value.
        if 'fsid' not in result or result['fsid'] != self.fsid:
            # Something for a different ClusterMonitor
            log.debug("Ignoring job return, not for my FSID")
            return

        if cmd == 'ceph.get_cluster_object':
            # A ceph.get_cluster_object response
            if not success:
                log.error("on_sync_object: failure from %s: %s" % (fqdn, result))
                return

            self.on_sync_object(fqdn, result)
        else:
            log.warning("Unexpected function '%s' (%s)" % (cmd, cmd))
Example #4
    def _run(self):
        self._plugin_monitor.start()

        self._ready.set()
        log.debug("ClusterMonitor._run: ready")

        event = SaltEventSource(log, salt_config)

        while not self._complete.is_set():
            # No salt tag filtering: https://github.com/saltstack/salt/issues/11582
            ev = event.get_event(full=True)

            if ev is not None:
                data = ev['data']
                tag = ev['tag']
                log.debug("_run.ev: %s/tag=%s" % (data['id'] if 'id' in data else None, tag))

                # I am interested in the following tags:
                # - salt/job/<jid>/ret/<minion id> where jid is one that I started
                #   (this includes ceph.rados_command and ceph.get_cluster_object)
                # - ceph/cluster/<fsid> where fsid is my fsid

                try:
                    if tag.startswith("ceph/cluster/{0}".format(self.fsid)):
                        # A ceph.heartbeat beacon
                        self.on_heartbeat(data['id'], data['data'])
                    elif re.match(r"^salt/job/\d+/ret/[^/]+$", tag):
                        if data['fun'] == "saltutil.running":
                            # Update on what jobs are running
                            # It would be nice to filter these down to those which really are for
                            # this cluster, but as long as N_clusters and N_jobs are reasonably small
                            # it's not an efficiency problem.
                            self._requests.on_tick_response(data['id'], data['return'])

                        # It would be much nicer to put the FSID at the start of
                        # the tag, if salt would only let us add custom tags to our jobs.
                        # Instead we enforce a convention that all calamari jobs must include
                        # fsid in their return value.
                        if not isinstance(data, dict) or not isinstance(data.get('return'), dict):
                            # Something not formatted for ClusterMonitor
                            log.warning("Ignoring event %s" % tag)
                            continue

                        if 'fsid' not in data['return'] or data['return']['fsid'] != self.fsid:
                            # Something for a different ClusterMonitor
                            log.debug("Ignoring job return, not for my FSID")
                            continue

                        if data['fun'] == 'ceph.get_cluster_object':
                            # A ceph.get_cluster_object response
                            if not data['success']:
                                log.error("on_sync_object: failure from %s: %s" % (data['id'], data['return']))
                                continue

                            self.on_sync_object(data['id'], data['return'])
                        else:
                            log.warning("Unexpected function '%s' (%s)" % (data['fun'], tag))
                    else:
                        # This does not concern us, ignore it
                        pass
                except Exception:
                    # Because this is our main event handling loop, swallow exceptions
                    # instead of letting them end the world.
                    log.exception("Exception handling message with tag %s" % tag)
                    log.debug("Message content: %s" % data)

        log.info("%s complete" % self.__class__.__name__)
        self._plugin_monitor.stop()
        self._plugin_monitor.join()
        self.done.set()
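
The two tag families that _run cares about can be checked in isolation. The regex below is the one used above (as a raw string); the sample tags and fsid are made up:

    import re

    JOB_RET_RE = re.compile(r"^salt/job/\d+/ret/[^/]+$")

    assert JOB_RET_RE.match("salt/job/20140401010203456789/ret/minion-1")
    assert not JOB_RET_RE.match("salt/job/20140401010203456789/new")

    fsid = "de305d54-75b4-431b-adb2-eb6b9e546014"
    assert "ceph/cluster/{0}/heartbeat".format(fsid).startswith(
        "ceph/cluster/{0}".format(fsid))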
Example #5
    def on_completion(self, data):
        """
        Callback for when a salt/job/<jid>/ret event is received, in which
        we find the UserRequest that created the job, and inform it of
        completion so that it can progress.
        """
        with self._lock:
            jid = data['jid']
            result = data['return']
            log.debug("on_completion: jid=%s data=%s" % (jid, data))

            try:
                request = self.get_by_jid(jid)
                log.debug("on_completion: jid %s belongs to request %s" % (jid, request.id))
            except KeyError:
                log.warning("on_completion: unknown jid {0}".format(jid))
                return

            if not data['success']:
                # This indicates a failure at the salt level, i.e. job threw an exception
                log.error("Remote execution failed for request %s: %s" % (request.id, result))
                if isinstance(result, dict):
                    # Handler ran and recorded an error for us
                    request.set_error(result['error_status'])
                else:
                    # An exception, probably, stringized by salt for us
                    request.set_error(result)
                request.complete()
            elif result['error']:
                # This indicates a failure within ceph.rados_commands which was caught
                # by our code, like one of our Ceph commands returned an error code.
                # NB in future there may be UserRequest subclasses which want to receive
                # and handle these errors themselves, so this branch would be refactored
                # to allow that.
                log.error("Request %s experienced an error: %s" % (request.id, result['error_status']))
                request.jid = None
                request.set_error(result['error_status'])
                request.complete()
            else:
                if request.state != UserRequest.SUBMITTED:
                    # Unexpected, ignore.
                    log.error("Received completion for request %s/%s in state %s" % (
                        request.id, request.jid, request.state
                    ))
                    return

                try:
                    with self._update_index(request):
                        old_jid = request.jid
                        request.complete_jid(result)
                        assert request.jid != old_jid

                        # After a jid completes, requests may start waiting for
                        # cluster map updates; we ask ClusterMonitor to hurry up
                        # and fetch them on behalf of the request.
                        if request.awaiting_versions:
                            for sync_type, version in request.awaiting_versions.items():
                                if version is not None:
                                    log.debug("Notifying SyncObjects of awaited version %s/%s" % (sync_type.str, version))
                                    self._sync_objects.on_version(data['id'], sync_type, version)

                            # The request may be waiting for an epoch that we
                            # already have; if so, give it to the request right away.
                            for sync_type, want_version in request.awaiting_versions.items():
                                got_version = self._sync_objects.get_version(sync_type)
                                if want_version and sync_type.cmp(got_version, want_version) >= 0:
                                    log.info("Awaited %s %s is immediately available" % (sync_type, want_version))
                                    request.on_map(sync_type, self._sync_objects)

                except Exception as e:
                    # Ensure that a misbehaving piece of code in a UserRequest subclass
                    # results in a terminated job, not a zombie job
                    log.exception("Calling complete_jid for %s/%s" % (request.id, request.jid))
                    request.jid = None
                    request.set_error("Internal error %s" % e)
                    request.complete()

        if request.state == UserRequest.COMPLETE:
            self._eventer.on_user_request_complete(request)
Ejemplo n.º 8
0
    def on_completion(self, data):
        """
        Callback for when a salt/job/<jid>/ret event is received, in which
        we find the UserRequest that created the job, and inform it of
        completion so that it can progress.
        """
        with self._lock:
            jid = data['jid']
            result = data['return']
            log.debug("on_completion: jid=%s data=%s" % (jid, data))

            try:
                request = self.get_by_jid(jid)
                log.debug("on_completion: jid %s belongs to request %s" %
                          (jid, request.id))
            except KeyError:
                log.warning("on_completion: unknown jid {0}".format(jid))
                return

            if not data['success']:
                # This indicates a failure at the salt level, i.e. job threw an exception
                log.error("Remote execution failed for request %s: %s" %
                          (request.id, result))
                if isinstance(result, dict):
                    # Handler ran and recorded an error for us
                    request.set_error(result['error_status'])
                else:
                    # An exception, probably, stringized by salt for us
                    request.set_error(result)
                request.complete()
            elif result['error']:
                # This indicates a failure within ceph.rados_commands which was caught
                # by our code, like one of our Ceph commands returned an error code.
                # NB in future there may be UserRequest subclasses which want to receive
                # and handle these errors themselves, so this branch would be refactored
                # to allow that.
                log.error("Request %s experienced an error: %s" %
                          (request.id, result['error_status']))
                request.jid = None
                request.set_error(result['error_status'])
                request.complete()
            else:
                if request.state != UserRequest.SUBMITTED:
                    # Unexpected, ignore.
                    log.error(
                        "Received completion for request %s/%s in state %s" %
                        (request.id, request.jid, request.state))
                    return

                try:
                    with self._update_index(request):
                        old_jid = request.jid
                        request.complete_jid(result)
                        assert request.jid != old_jid

                        # After a jid completes, requests may start waiting for cluster
                        # map updates, we ask ClusterMonitor to hurry up and get them on
                        # behalf of the request.
                        if request.awaiting_versions:
                            for sync_type, version in request.awaiting_versions.items(
                            ):
                                if version is not None:
                                    log.debug(
                                        "Notifying SyncObjects of awaited version %s/%s"
                                        % (sync_type.str, version))
                                    self._sync_objects.on_version(
                                        data['id'], sync_type, version)

                            # The request may be waiting for an epoch that we already have, if so
                            # give it to the request right away
                            for sync_type, want_version in request.awaiting_versions.items(
                            ):
                                got_version = self._sync_objects.get_version(
                                    sync_type)
                                if want_version and sync_type.cmp(
                                        got_version, want_version) >= 0:
                                    log.info(
                                        "Awaited %s %s is immediately available"
                                        % (sync_type, want_version))
                                    request.on_map(sync_type,
                                                   self._sync_objects)

                except Exception as e:
                    # Ensure that a misbehaving piece of code in a UserRequest subclass
                    # results in a terminated job, not a zombie job
                    log.exception("Calling complete_jid for %s/%s" %
                                  (request.id, request.jid))
                    request.jid = None
                    request.set_error("Internal error %s" % e)
                    request.complete()

        if request.state == UserRequest.COMPLETE:
            self._eventer.on_user_request_complete(request)