class Job(Task):
    """
    Base class for jobs.
    """
    abstract = True  # Job class itself is not registered.
    initialize_timeout = 30  # seconds
    _runner_thread = None
    _aborter_thread = None
    _result_queue = Queue.Queue()
    _log = None
    _aborted_tasks = set()
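    # Celery's acks_late: acknowledge the message only after the task has
    # run, so an unfinished job is re-delivered if the worker dies mid-run.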
    acks_late = True
    _origsigtermhandler = None

    @classmethod
    def getJobType(cls):
        """
        """
        return cls.name

    @classmethod
    def getJobDescription(cls, *args, **kwargs):
        """
        This is expected to be overridden in subclasses for nice descriptions.
        """
        raise NotImplementedError

    @classmethod
    def makeSubJob(cls, args=None, kwargs=None, description=None, **options):
        """
        Return a SubJob instance that wraps the given job and its arguments
        and options.
        """
        job = current_app.tasks[cls.name]
        return SubJob(job,
                      args=args,
                      kwargs=kwargs,
                      description=description,
                      options=options)

    def setProperties(self, **properties):
        self.app.backend.update(self.request.id, **properties)

    def _get_config(self, key, default=_MARKER):
        opts = getattr(self.app, 'db_options', None)
        sanitized_key = key.replace("-", "_")
        value = getattr(opts, sanitized_key, default)
        if value is _MARKER:
            raise ValueError("Config option %s is not defined" % key)
        return value

    @property
    def log(self):
        if self._log is None:
            # Get log directory, ensure it exists
            logdir = self._get_config('job-log-path')
            try:
                os.makedirs(logdir)
            except OSError as e:
                if e.errno != errno.EEXIST:
                    raise
            # Make the logfile path and store it in the backend for later
            # retrieval
            logfile = os.path.join(logdir, '%s.log' % self.request.id)
            self.setProperties(logfile=logfile)
            self._log = get_task_logger(self.request.id)
            self._log.setLevel(self._get_config('logseverity'))
            handler = logging.FileHandler(logfile)
            handler.setFormatter(
                logging.Formatter(
                    "%(asctime)s %(levelname)s zen.Job: %(message)s"))
            self._log.handlers = [handler]
        return self._log

    @property
    def dmd(self):
        """
        Gets the dmd object from the backend
        """
        return self.app.backend.dmd

    def _wait_for_pending_job(self, job_id):
        i = 0
        # Exactly one job executes at a time, so it's fine to block while
        # waiting for the pending job record to appear in the database.
        jmgr = self.dmd.JobManager
        while i < self.initialize_timeout:
            try:
                jmgr._p_jar.sync()
                return jmgr.getJob(job_id)
            except NoSuchJobException:
                i += 1
                time.sleep(1)
        raise NoSuchJobException(job_id)

    def _check_aborted(self, job_id):
        try:
            while True:
                self.dmd._p_jar.sync()
                try:
                    status = self.app.backend.get_status(job_id)
                except NoSuchJobException:
                    status = states.ABORTED
                if status == states.ABORTED and \
                        self._runner_thread is not None:
                    self.log.info("Job %s is aborted", job_id)
                    # Sometimes the runner thread is about to commit
                    # before it can be interrupted.  self._aborted_tasks
                    # is an in-memory set shared between threads, so the
                    # runner can check it before committing.
                    self._aborted_tasks.add(job_id)
                    self._runner_thread.interrupt(JobAborted)
                    break
                time.sleep(0.25)
        finally:
            # Release the database connection acquired by the self.dmd
            # reference earlier in this method.
            self.backend.reset()

    def _do_run(self, request, args=None, kwargs=None):
        # This method runs in a separate thread.
        args = args or ()
        kwargs = kwargs or {}
        job_id = request.id
        job_record = self.dmd.JobManager.getJob(job_id)
        # Log in as the job's user
        self.log.debug("Logging in as %s", job_record.user)
        utool = getToolByName(self.dmd.getPhysicalRoot(), 'acl_users')
        user = utool.getUserById(job_record.user)
        if user is None:
            user = self.dmd.zport.acl_users.getUserById(job_record.user)
        if user is None:
            # Users can't be looked up by ID when using Auth0 at this
            # time; fall back to zenoss_system.
            user = self.dmd.zport.acl_users.getUserById("zenoss_system")
        user = user.__of__(utool)
        newSecurityManager(None, user)

        @transact
        def _runjob():
            result = self._run(*args, **kwargs)
            if job_id in self._aborted_tasks:
                raise JobAborted("Job %s aborted" % job_id)
            return result

        # Run it!
        self.log.info("Starting job %s (%s)", job_id, self.name)
        try:
            # Make request available to self.request property
            # (because self.request is thread local)
            self.request_stack.push(request)
            try:
                result = _runjob()
                self.log.info("Job %s finished with result %s", job_id, result)
                self._result_queue.put(result)
            except JobAborted:
                self.log.warning("Job %s aborted.", job_id)
                transaction.abort()
                # re-raise JobAborted to allow celery to perform job
                # failure and clean-up work.  A monkeypatch has been
                # installed to prevent this exception from being written to
                # the log.
                raise
        except Exception as e:
            e.exc_info = sys.exc_info()
            self._result_queue.put(e)
        finally:
            # Remove the request
            self.request_stack.pop()
            # Log out; probably unnecessary but can't hurt
            noSecurityManager()
            self._aborted_tasks.discard(job_id)
            # Release the database connection acquired by the self.dmd
            # reference earlier in this method.
            self.backend.reset()

    def run(self, *args, **kwargs):
        job_id = self.request.id
        self.log.info("Job %s (%s) received", job_id, self.name)
        self.log.debug("Waiting for job %s to appear in database", job_id)
        try:
            # Wait for the job to appear in the database.
            self._wait_for_pending_job(job_id)
        except NoSuchJobException:
            # Timed out waiting for job.
            try:
                # This may also fail because the job was deleted before
                # being read from the queue.
                self.update_state(state=states.ABORTED)
            except Exception:
                self.log.debug("No such job %s found in database", job_id)
            return
        self.log.debug("Job %s found in database", job_id)

        self._aborter_thread = InterruptableThread(target=self._check_aborted,
                                                   args=(job_id, ))
        # Forward the request to the thread because the self.request
        # property is a thread-local value.
        self._runner_thread = InterruptableThread(target=self._do_run,
                                                  args=(self.request, ),
                                                  kwargs={
                                                      'args': args,
                                                      'kwargs': kwargs
                                                  })

        try:
            # Install a SIGTERM handler so that the 'runner_thread' can be
            # interrupted/aborted when the TERM signal is received.
            self._origsigtermhandler = signal.signal(signal.SIGTERM,
                                                     self._sigtermhandler)

            self._runner_thread.start()
            self._aborter_thread.start()

            # A blocking join() call also blocks the thread from calling
            # signal handlers, so use a timeout join and loop until the
            # thread exits to allow the thread an opportunity to call
            # signal handlers.
            self.log.debug("Monitoring _runner_thread existence")
            while self._runner_thread.is_alive():
                self._runner_thread.join(0.01)
            self.log.debug("_runner_thread has exited")

            result = self._result_queue.get_nowait()
            if isinstance(result, Exception):
                cls, instance, tb = result.exc_info[0:3]
                if not isinstance(result, JobAborted):
                    self.log.error("Job %s failed with an exception" % job_id)
                    self.log.error(tb)
                links = []
                if self.request.callbacks:
                    for callback in self.request.callbacks:
                        links.extend(callback.flatten_links())
                for link in links:
                    link.type.update_state(task_id=link.options['task_id'],
                                           state=states.ABORTED)
                if links:
                    self.log.info(
                        "Dependent job(s) %s aborted",
                        ', '.join(link.options['task_id'] for link in links))
                raise cls, instance, tb

            return result
        except Queue.Empty:
            return None
        finally:
            # Remove our signal handler and re-install the original handler
            if signal.getsignal(signal.SIGTERM) == self._sigtermhandler:
                signal.signal(signal.SIGTERM, self._origsigtermhandler)
            # Kill the aborter
            try:
                self._aborter_thread.kill()
                self._aborter_thread.join(0.5)
            except ValueError:
                pass
            # Clean up the logger
            try:
                del self._log.logger.manager.loggerDict[self.request.id]
            except (AttributeError, KeyError):
                pass
            for handler in self._log.handlers:
                handler.close()
            self._log = None

    def on_failure(self, exc, task_id, args, kwargs, einfo):
        # Because JobAborted is an exception, celery will change the state to
        # FAILURE once the task completes. Since we want it to remain ABORTED,
        # we'll set it back here.
        if isinstance(exc, JobAborted):
            self.update_state(state=states.ABORTED)

    def _run(self, *args, **kwargs):
        raise NotImplementedError("_run must be implemented")

    def _sigtermhandler(self, signum, frame):
        self.log.debug("%s received signal %s", self, signum)
        # Interrupt the runner_thread.
        self._runner_thread.interrupt(JobAborted)
        # Wait for the runner_thread to exit.
        while self._runner_thread.is_alive():
            time.sleep(0.01)
        # Re-install the original SIGTERM handler.
        signal.signal(signal.SIGTERM, self._origsigtermhandler)
        # Send this process a SIGTERM signal
        os.kill(os.getpid(), signal.SIGTERM)
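
# ---------------------------------------------------------------------------
# Usage sketch (not part of the original source): a minimal, hypothetical
# subclass, assuming the Job base class above is importable.  Subclasses
# supply a task name, a description, and a _run() body; the base class
# handles logging, abort handling, and transaction management.
class EchoJob(Job):
    """Toy job that logs and returns its argument."""
    name = "zen.EchoJob"

    @classmethod
    def getJobDescription(cls, *args, **kwargs):
        return "Echo %r" % (args,)

    def _run(self, message):
        self.log.info("echoing %s", message)
        return message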
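
# ---------------------------------------------------------------------------
# Illustration (an assumption, not the original InterruptableThread): the
# Job class relies on InterruptableThread.interrupt(ExcType) to raise an
# exception inside the runner thread.  A common CPython technique for this
# is PyThreadState_SetAsyncExc; a minimal sketch:
import ctypes
import threading


class SimpleInterruptableThread(threading.Thread):
    def interrupt(self, exc_type):
        # Ask the interpreter to raise exc_type in this thread at the next
        # bytecode boundary.  A thread blocked in a C call (e.g. a long
        # time.sleep) won't see the exception until that call returns.
        ctypes.pythonapi.PyThreadState_SetAsyncExc(
            ctypes.c_long(self.ident), ctypes.py_object(exc_type))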
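
# ---------------------------------------------------------------------------
# Sketch of a @transact-style decorator (an assumption; the real decorator
# comes from the surrounding codebase and may also retry on ZODB conflict
# errors).  _runjob() above is wrapped in @transact so that the job body
# commits on success and aborts on failure:
import functools

import transaction


def transact(fn):
    @functools.wraps(fn)
    def wrapper(*args, **kwargs):
        try:
            result = fn(*args, **kwargs)
            transaction.commit()
            return result
        except Exception:
            transaction.abort()
            raise
    return wrapper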