Exemple #1
0
class JobRunner(object):
    def __init__(self, scheduled_jobs):
        """Set up runner state, the SIGUSR1 handler and the job queue.

        ``scheduled_jobs`` is handed unchanged to ``JobQueue`` together with
        the module-level ``db`` and ``logger`` objects.
        """
        # Plain bookkeeping state; none of these assignments has side effects.
        self.timer_wait = self.sleep_to = None
        self._should_quit = self._should_kill = False
        self.queue_paused_at = self.queue_killed_at = 0
        self._last_pause_warn = 0

        # Pid of the process that created this runner.
        self.my_pid = os.getpid()

        # SIGUSR1 gets the logging-only handler defined on this class.
        signal.signal(signal.SIGUSR1, JobRunner.sig_general_handler)

        self.job_queue = JobQueue(scheduled_jobs, db, logger)

    @staticmethod
    def sig_general_handler(signum, frame):
        """Signal handler that only logs the number of the received signal.

        Intended for places where the code waits in ``signal.pause()``.
        ``frame`` is accepted to satisfy the handler signature but is unused.
        """
        message = "siggeneral_handler(%s)" % (signum,)
        logger.debug2(message)

    def signal_sleep(self, seconds):
        """Start a timer that calls ``self.wake_runner_signal`` after a delay.

        If no timer is registered in ``self.timer_wait``, a daemon
        ``threading.Timer`` is created, stored there and started, and the
        expected wake-up time is recorded in ``self.sleep_to``.  If a timer
        is already registered, nothing is changed apart from a debug log line.

        The whole operation runs with ``runner_cw`` held; the lock is released
        even if logging or starting the timer thread raises.

        :param seconds: delay in seconds before the timer fires.
        """
        # SIGALRM is already used by the SocketThread, so we arrange
        # for a SIGUSR1 to be delivered instead
        runner_cw.acquire()
        try:
            if not self.timer_wait:  # Only have one signal-sleep thread
                logger.debug("Signalling sleep: %s seconds" % str(seconds))
                self.timer_wait = threading.Timer(
                    seconds, self.wake_runner_signal)
                # Daemon thread: a pending timer must not keep the process
                # alive on shutdown.
                self.timer_wait.setDaemon(True)
                self.timer_wait.start()
                self.sleep_to = time.time() + seconds
            else:
                logger.debug("already doing a signal sleep")
        finally:
            # Without this, an exception above would leave runner_cw held
            # forever and block every other user of the lock.
            runner_cw.release()

    def handle_completed_jobs(self):
        """Handle any completed jobs (only jobs that has
        call != None).  Will block if any of the jobs has wait=1"""
        did_wait = False

        logger.debug("handle_completed_jobs: ")
        for job in self.job_queue.get_running_jobs():
            try:
                ret = job['call'].cond_wait(job['pid'])
            except OSError, msg:
                if not str(msg).startswith("[Errno 4]"):
                    # 4 = "Interrupted system call", which we may get
                    # as we catch SIGCHLD
                    # TODO: We need to filter out false positives from being
                    # logged:
                    logger.error("error (%s): %s" % (job['name'], msg))
                time.sleep(1)
                continue
            logger.debug2("cond_wait(%s) = %s" % (job['name'], ret))
            if ret is None:          # Job not completed
                job_def = self.job_queue.get_known_job(job['name'])
                if job_def.max_duration is not None:
                    run_for = time.time() - job['started']
                    if run_for > job_def.max_duration:
                        # We sleep a little so that we don't risk entering
                        # a tight loop with lots of logging
                        time.sleep(1)
                        logger.error("%s (pid %d) has run for %d seconds, "
                                     "sending SIGTERM" %
                                     (job['name'], job['pid'], run_for))
                        try:
                            os.kill(job['pid'], signal.SIGTERM)
                            # By setting did_wait to True, the main loop
                            # will immediately call this function again to
                            # reap the job we just killed.  (If we don't,
                            # the SIGCHLD may be delivered before we reach
                            # sigpause)
                            did_wait = True
                        except OSError, msg:
                            # Don't die if we're not allowed to kill
                            # the job. The reason is probably that the
                            # process is run by root (sudo)
                            logger.error("Couldn't kill job %s (pid %d): %s" %
                                         (job['name'], job['pid'], msg))
            else:
Exemple #2
0
class JobRunner(object):
    def __init__(self, scheduled_jobs):
        """Set up runner state, the SIGUSR1 handler and the job queue.

        ``scheduled_jobs`` is handed unchanged to ``JobQueue`` together with
        the module-level ``db`` and ``logger`` objects.
        """
        # Plain bookkeeping state; none of these assignments has side effects.
        self.timer_wait = self.sleep_to = None
        self._should_quit = self._should_kill = False
        self.queue_paused_at = self.queue_killed_at = 0
        self._last_pause_warn = 0

        # Pid of the process that created this runner.
        self.my_pid = os.getpid()

        # SIGUSR1 gets the logging-only handler defined on this class.
        signal.signal(signal.SIGUSR1, JobRunner.sig_general_handler)

        self.job_queue = JobQueue(scheduled_jobs, db, logger)

    @staticmethod
    def sig_general_handler(signum, frame):
        """Signal handler that only logs the number of the received signal.

        Intended for places where the code waits in ``signal.pause()``.
        ``frame`` is accepted to satisfy the handler signature but is unused.
        """
        message = "siggeneral_handler(%s)" % (signum,)
        logger.debug2(message)

    def signal_sleep(self, seconds):
        """Start a timer that calls ``self.wake_runner_signal`` after a delay.

        If no timer is registered in ``self.timer_wait``, a daemon
        ``threading.Timer`` is created, stored there and started, and the
        expected wake-up time is recorded in ``self.sleep_to``.  If a timer
        is already registered, nothing is changed apart from a debug log line.

        The whole operation runs with ``runner_cw`` held; the lock is released
        even if logging or starting the timer thread raises.

        :param seconds: delay in seconds before the timer fires.
        """
        # SIGALRM is already used by the SocketThread, so we arrange
        # for a SIGUSR1 to be delivered instead
        runner_cw.acquire()
        try:
            if not self.timer_wait:  # Only have one signal-sleep thread
                logger.debug("Signalling sleep: %s seconds" % str(seconds))
                self.timer_wait = threading.Timer(
                    seconds, self.wake_runner_signal)
                # Daemon thread: a pending timer must not keep the process
                # alive on shutdown.
                self.timer_wait.setDaemon(True)
                self.timer_wait.start()
                self.sleep_to = time.time() + seconds
            else:
                logger.debug("already doing a signal sleep")
        finally:
            # Without this, an exception above would leave runner_cw held
            # forever and block every other user of the lock.
            runner_cw.release()

    def handle_completed_jobs(self):
        """Handle any completed jobs (only jobs that has
        call != None).  Will block if any of the jobs has wait=1"""
        did_wait = False

        logger.debug("handle_completed_jobs: ")
        for job in self.job_queue.get_running_jobs():
            try:
                ret = job['call'].cond_wait(job['pid'])
            except OSError, msg:
                if not str(msg).startswith("[Errno 4]"):
                    # 4 = "Interrupted system call", which we may get
                    # as we catch SIGCHLD
                    # TODO: We need to filter out false positives from being
                    # logged:
                    logger.error("error (%s): %s" % (job['name'], msg))
                time.sleep(1)
                continue
            logger.debug2("cond_wait(%s) = %s" % (job['name'], ret))
            if ret is None:  # Job not completed
                job_def = self.job_queue.get_known_job(job['name'])
                if job_def.max_duration is not None:
                    run_for = time.time() - job['started']
                    if run_for > job_def.max_duration:
                        # We sleep a little so that we don't risk entering
                        # a tight loop with lots of logging
                        time.sleep(1)
                        logger.error("%s (pid %d) has run for %d seconds, "
                                     "sending SIGTERM" %
                                     (job['name'], job['pid'], run_for))
                        try:
                            os.kill(job['pid'], signal.SIGTERM)
                            # By setting did_wait to True, the main loop
                            # will immediately call this function again to
                            # reap the job we just killed.  (If we don't,
                            # the SIGCHLD may be delivered before we reach
                            # sigpause)
                            did_wait = True
                        except OSError, msg:
                            # Don't die if we're not allowed to kill
                            # the job. The reason is probably that the
                            # process is run by root (sudo)
                            logger.error("Couldn't kill job %s (pid %d): %s" %
                                         (job['name'], job['pid'], msg))
            else: