Example #1
0
    def cancel(self, request_id):
        """
        Immediately mark a request as cancelled, and in the background
        try and cancel any outstanding JID for it.
        """
        request = self._by_request_id[request_id]

        # Idempotent behaviour: no-op if already cancelled
        if request.state == request.COMPLETE:
            return

        with self._update_index(request):
            # I will take over cancelling the JID from the request
            cancel_jid = request.jid
            request.jid = None

            # Request is now done, no further calls
            request.set_error("Cancelled")
            request.complete()

            # In the background, try to cancel the request's JID on a best-effort basis
            if cancel_jid:
                client = LocalClient(config.get('cthulhu', 'salt_config_path'))
                client.run_job(request.minion_id, 'saltutil.kill_job',
                               [cancel_jid])
Example #2
0
    def fetch(self, minion_id, sync_type):
        log.debug("SyncObjects.fetch: %s/%s" % (minion_id, sync_type))
        if minion_id is None:
            # We're probably being replayed to from the database
            log.warn("SyncObjects.fetch called with minion_id=None")
            return

        self._fetching_at[sync_type] = now()
        client = LocalClient(config.get('cthulhu', 'salt_config_path'))
        # TODO clean up unused 'since' argument
        pub_data = client.run_job(
            minion_id, 'ceph.get_cluster_object',
            condition_kwarg(
                [], {
                    'cluster_name': self._cluster_name,
                    'sync_type': sync_type.str,
                    'since': None
                }))
        if not pub_data:
            log.error("Failed to start fetch job %s/%s" %
                      (minion_id, sync_type))
            # Don't throw an exception because if a fetch fails we should always
        else:
            log.debug("SyncObjects.fetch: jid=%s minions=%s" %
                      (pub_data['jid'], pub_data['minions']))
Example #3
0
 def get_running(self, minions):
     client = LocalClient(config.get('cthulhu', 'salt_config_path'))
     pub_data = client.run_job(minions,
                               'saltutil.running', [],
                               expr_form="list")
     if not pub_data:
         log.warning(
             "Failed to publish saltutil.running to {0}".format(minions))
Example #4
0
 def run_job(self, fqdn, cmd, args):
     client = LocalClient(config.get('cthulhu', 'salt_config_path'))
     pub_data = client.run_job(fqdn, cmd, condition_kwarg([], args))
     if not pub_data:
         # FIXME: LocalClient uses 'print' to record the
         # details of what went wrong :-(
         raise Unavailable()
     else:
         return pub_data['jid']
Example #5
0
 def run_job(self, fqdn, cmd, args):
     client = LocalClient(config.get('cthulhu', 'salt_config_path'))
     pub_data = client.run_job(fqdn, cmd, condition_kwarg([], args))
     if not pub_data:
         # FIXME: LocalClient uses 'print' to record the
         # details of what went wrong :-(
         raise Unavailable()
     else:
         return pub_data['jid']
Example #6
0
    def _submit(self):
        client = LocalClient(config.get('cthulhu', 'salt_config_path'))
        pub_data = client.run_job(self._minion_id, self._cmd, self._args)
        if not pub_data:
            # FIXME: LocalClient uses 'print' to record the
            # details of what went wrong :-(
            raise PublishError("Failed to publish job")

        self.log.info("Request %s started job %s" % (self.id, pub_data['jid']))

        self.alive_at = now()
        self.jid = pub_data['jid']

        return self.jid
Example #7
0
    def tick(self):
        """
        For walltime-based monitoring of running requests.  Long-running requests
        get a periodic call to saltutil.running to verify that things really
        are still happening.
        """

        if not self._by_jid:
            return
        else:
            log.debug("RequestCollection.tick: %s JIDs underway" %
                      len(self._by_jid))

        # Identify JIDs who haven't had a saltutil.running reponse for too long.
        # Kill requests in a separate phase because request:JID is not 1:1
        stale_jobs = set()
        _now = now()
        for request in self._by_jid.values():
            if _now - request.alive_at > datetime.timedelta(
                    seconds=TICK_PERIOD * 3):
                log.error("Request %s JID %s stale: now=%s, alive_at=%s" %
                          (request.id, request.jid, _now, request.alive_at))
                stale_jobs.add(request)

        # Any identified stale jobs are errored out.
        for request in stale_jobs:
            with self._update_index(request):
                request.set_error("Lost contact")
                request.jid = None
                request.complete()

        # Identify minions associated with JIDs in flight
        query_minions = set()
        for jid, request in self._by_jid.items():
            query_minions.add(request.minion_id)

        # Attempt to emit a saltutil.running to ping jobs, next tick we
        # will see if we got updates to the alive_at attribute to indicate non-staleness
        if query_minions:
            log.info("RequestCollection.tick: sending saltutil.running to {0}".
                     format(query_minions))
            client = LocalClient(config.get('cthulhu', 'salt_config_path'))
            pub_data = client.run_job(list(query_minions),
                                      'saltutil.running', [],
                                      expr_form="list")
            if not pub_data:
                log.warning("Failed to publish saltutil.running to {0}".format(
                    query_minions))
Example #8
0
    def fetch(self, minion_id, sync_type):
        log.debug("SyncObjects.fetch: %s/%s" % (minion_id, sync_type))
        if minion_id is None:
            # We're probably being replayed to from the database
            log.warn("SyncObjects.fetch called with minion_id=None")
            return

        self._fetching_at[sync_type] = now()
        client = LocalClient(config.get('cthulhu', 'salt_config_path'))
        # TODO clean up unused 'since' argument
        pub_data = client.run_job(minion_id, 'ceph.get_cluster_object',
                                  condition_kwarg([], {'cluster_name': self._cluster_name,
                                                       'sync_type': sync_type.str,
                                                       'since': None}))
        if not pub_data:
            log.error("Failed to start fetch job %s/%s" % (minion_id, sync_type))
            # Don't throw an exception because if a fetch fails we should always
        else:
            log.debug("SyncObjects.fetch: jid=%s minions=%s" % (pub_data['jid'], pub_data['minions']))
Example #9
0
    def tick(self):
        """
        For walltime-based monitoring of running requests.  Long-running requests
        get a periodic call to saltutil.running to verify that things really
        are still happening.
        """

        if not self._by_jid:
            return
        else:
            log.debug("RequestCollection.tick: %s JIDs underway" % len(self._by_jid))

        # Identify JIDs who haven't had a saltutil.running reponse for too long.
        # Kill requests in a separate phase because request:JID is not 1:1
        stale_jobs = set()
        _now = now()
        for request in self._by_jid.values():
            if _now - request.alive_at > datetime.timedelta(seconds=TICK_PERIOD * 3):
                log.error("Request %s JID %s stale: now=%s, alive_at=%s" % (
                    request.id, request.jid, _now, request.alive_at
                ))
                stale_jobs.add(request)

        # Any identified stale jobs are errored out.
        for request in stale_jobs:
            with self._update_index(request):
                request.set_error("Lost contact")
                request.jid = None
                request.complete()

        # Identify minions associated with JIDs in flight
        query_minions = set()
        for jid, request in self._by_jid.items():
            query_minions.add(request.minion_id)

        # Attempt to emit a saltutil.running to ping jobs, next tick we
        # will see if we got updates to the alive_at attribute to indicate non-staleness
        if query_minions:
            log.info("RequestCollection.tick: sending saltutil.running to {0}".format(query_minions))
            client = LocalClient(config.get('cthulhu', 'salt_config_path'))
            pub_data = client.run_job(list(query_minions), 'saltutil.running', [], expr_form="list")
            if not pub_data:
                log.warning("Failed to publish saltutil.running to {0}".format(query_minions))
Example #10
0
    def _submit(self, commands=None):
        if commands is None:
            commands = self._commands

        self.log.debug("%s._submit: %s/%s/%s" % (self.__class__.__name__,
                                                 self._minion_id, self._cluster_name, commands))

        client = LocalClient(config.get('cthulhu', 'salt_config_path'))
        pub_data = client.run_job(self._minion_id, 'ceph.rados_commands',
                                  [self.fsid, self._cluster_name, commands])
        if not pub_data:
            # FIXME: LocalClient uses 'print' to record the
            # details of what went wrong :-(
            raise PublishError("Failed to publish job")

        self.log.info("Request %s started job %s" % (self.id, pub_data['jid']))

        self.alive_at = now()
        self.jid = pub_data['jid']

        return self.jid
Example #11
0
 def cancel(self, fqdn, jid):
     client = LocalClient(config.get('cthulhu', 'salt_config_path'))
     client.run_job(fqdn, 'saltutil.kill_job', [jid])
Example #12
0
 def cancel(self, fqdn, jid):
     client = LocalClient(config.get('cthulhu', 'salt_config_path'))
     client.run_job(fqdn, 'saltutil.kill_job', [jid])
Example #13
0
 def get_running(self, minions):
     client = LocalClient(config.get('cthulhu', 'salt_config_path'))
     pub_data = client.run_job(minions, 'saltutil.running', [], expr_form="list")
     if not pub_data:
         log.warning("Failed to publish saltutil.running to {0}".format(minions))