def __init__(self, scheduled_jobs):
    """Set up runner state, the SIGUSR1 handler and the job queue.

    scheduled_jobs is handed unchanged to JobQueue together with the
    module-level db and logger objects.
    """
    self.my_pid = os.getpid()

    # Nothing is pending at start-up: no sleep timer and no wake-up time.
    self.timer_wait = self.sleep_to = None

    # Neither a quit nor a kill has been requested yet.
    self._should_quit = self._should_kill = False

    # Bookkeeping values for pause/kill handling all start at zero.
    self.queue_paused_at = self.queue_killed_at = self._last_pause_warn = 0

    # Install the handler before the queue is built, as the original did.
    signal.signal(signal.SIGUSR1, JobRunner.sig_general_handler)
    self.job_queue = JobQueue(scheduled_jobs, db, logger)
def main():
    """Command-line entry point: act as a client, or start the server.

    Options are processed in the order given on the command line:

    * --reload, --quit, --status, --pause, --resume, --kill, --run=JOB and
      --show-job=JOB send one command to the server socket, print the
      response and exit with status 0 (also when the request times out).
    * --config=FILE loads FILE as the job-definition module instead of
      the default ``scheduled_jobs`` module.
    * --dump=N calls JobQueue.dump_jobs() on the job definitions and
      exits with status 0.
    * --with-deps and --quiet are modifiers and apply wherever they
      appear on the command line.

    If no option terminates the program, the server is started. The
    process exits with status 1 if a server already answers a ping, or
    if the master lock exists although nothing answered.
    """
    try:
        opts, args = getopt.getopt(sys.argv[1:], '', [
            'reload', 'quit', 'status', 'config=', 'dump=', 'run=', 'pause',
            'quiet', 'resume', 'show-job=', 'with-deps', 'kill'])
    except getopt.GetoptError:
        usage(1)

    # First pass: pick up the modifiers, so that they take effect even
    # when they are given after the option they modify.
    with_deps = quiet = False
    for opt, val in opts:
        if opt == '--with-deps':
            with_deps = True
        elif opt == '--quiet':
            quiet = True

    # Options that map directly to a fixed server command.
    fixed_commands = {
        '--reload': 'RELOAD',
        '--quit': 'QUIT',
        '--status': 'STATUS',
        '--pause': 'PAUSE',
        '--resume': 'RESUME',
        '--kill': 'KILL',
    }

    # None means "no --config seen yet"; the default module is imported
    # only when it is actually needed.
    scheduled_jobs = None

    for opt, val in opts:
        if opt == '--config':
            # Import the file by module name from its own directory.
            # That directory is deliberately left on sys.path: the
            # original notes that removing it makes reload(module) load
            # another file.
            conf_dir, conf_file = os.path.split(val)
            sys.path.insert(0, conf_dir or '.')
            name = os.path.splitext(conf_file)[0]
            # __import__ returns the top-level package for dotted names,
            # so fetch the module itself from sys.modules.
            __import__(name)
            scheduled_jobs = sys.modules[name]
        elif opt == '--dump':
            # Fall back to the default definitions when no --config
            # preceded --dump; the name used to be unbound here.
            if scheduled_jobs is None:
                import scheduled_jobs
            JobQueue.dump_jobs(scheduled_jobs, int(val))
            sys.exit(0)
        else:
            if opt in fixed_commands:
                cmd = fixed_commands[opt]
            elif opt == '--run':
                cmd = 'RUNJOB %s %i' % (val, with_deps)
            elif opt == '--show-job':
                cmd = 'SHOWJOB %s' % val
            else:
                # --with-deps / --quiet were handled in the first pass.
                continue
            sock = SocketHandling(logger)
            try:
                print("Response: %s" % sock.send_cmd(cmd))
            except SocketHandling.Timeout:
                print("Timout contacting server, is it running?")
            sys.exit(0)

    if scheduled_jobs is None:
        import scheduled_jobs

    # Server mode from here on.
    sock = SocketHandling(logger)
    ca = CallableAction()
    ca.set_id("master_jr_lock")
    try:
        if sock.ping_server():
            if not quiet:
                print("Server already running")
            sys.exit(1)
        try:
            ca.check_lockfile()
        except LockExists:
            logger.error(
                ("%s: Master lock exists, but jr-socket didn't respond to " +
                 "ping. This should be a very rare error!") %
                ca.lockfile_name)
            sys.exit(1)
        ca.make_lockfile()
    except SocketHandling.Timeout:
        # Assuming that previous run aborted without removing socket
        # NOTE(review): make_lockfile() is not called on this path, yet
        # free_lock() is still called at shutdown -- confirm intended.
        logger.warn("Socket timeout, assuming server is dead")
        try:
            os.unlink(cereconf.JOB_RUNNER_SOCKET)
        except OSError:
            # Nothing to remove (or not removable); carry on regardless.
            pass

    jr = JobRunner(scheduled_jobs)
    # Serve socket commands from a daemon thread while the job loop runs
    # in this thread.
    socket_thread = threading.Thread(target=sock.start_listener, args=(jr,))
    socket_thread.setDaemon(True)
    socket_thread.setName("socket_thread")
    socket_thread.start()

    jr.run_job_loop()
    logger.debug("bye")
    sock.cleanup()
    ca.free_lock()
class JobRunner(object): def __init__(self, scheduled_jobs): self.my_pid = os.getpid() self.timer_wait = None signal.signal(signal.SIGUSR1, JobRunner.sig_general_handler) self.job_queue = JobQueue(scheduled_jobs, db, logger) self._should_quit = False self._should_kill = False self.sleep_to = None self.queue_paused_at = 0 self.queue_killed_at = 0 self._last_pause_warn = 0 def sig_general_handler(signum, frame): """General signal handler, for places where we use signal.pause()""" logger.debug2("siggeneral_handler(%s)" % (str(signum))) sig_general_handler = staticmethod(sig_general_handler) def signal_sleep(self, seconds): # SIGALRM is already used by the SocketThread, se we arrange # for a SIGUSR1 to be delivered instead runner_cw.acquire() if not self.timer_wait: # Only have one signal-sleep thread logger.debug("Signalling sleep: %s seconds" % str(seconds)) self.timer_wait = threading.Timer(seconds, self.wake_runner_signal) self.timer_wait.setDaemon(True) self.timer_wait.start() self.sleep_to = time.time() + seconds else: logger.debug("already doing a signal sleep") runner_cw.release() def handle_completed_jobs(self): """Handle any completed jobs (only jobs that has call != None). 
Will block if any of the jobs has wait=1""" did_wait = False logger.debug("handle_completed_jobs: ") for job in self.job_queue.get_running_jobs(): try: ret = job['call'].cond_wait(job['pid']) except OSError, msg: if not str(msg).startswith("[Errno 4]"): # 4 = "Interrupted system call", which we may get # as we catch SIGCHLD # TODO: We need to filter out false positives from being # logged: logger.error("error (%s): %s" % (job['name'], msg)) time.sleep(1) continue logger.debug2("cond_wait(%s) = %s" % (job['name'], ret)) if ret is None: # Job not completed job_def = self.job_queue.get_known_job(job['name']) if job_def.max_duration is not None: run_for = time.time() - job['started'] if run_for > job_def.max_duration: # We sleep a little so that we don't risk entering # a tight loop with lots of logging time.sleep(1) logger.error("%s (pid %d) has run for %d seconds, " "sending SIGTERM" % (job['name'], job['pid'], run_for)) try: os.kill(job['pid'], signal.SIGTERM) # By setting did_wait to True, the main loop # will immediately call this function again to # reap the job we just killed. (If we don't, # the SIGCHLD may be delivered before we reach # sigpause) did_wait = True except OSError, msg: # Don't die if we're not allowed to kill # the job. The reason is probably that the # process is run by root (sudo) logger.error("Couldn't kill job %s (pid %d): %s" % (job['name'], job['pid'], msg)) else:
def main():
    """Command-line entry point: act as a client, or start the server.

    Options are processed in the order given on the command line:

    * --reload, --quit, --status, --pause, --resume, --kill, --run=JOB and
      --show-job=JOB send one command to the server socket, print the
      response and exit with status 0 (also when the request times out).
    * --config=FILE loads FILE as the job-definition module instead of
      the default ``scheduled_jobs`` module.
    * --dump=N calls JobQueue.dump_jobs() on the job definitions and
      exits with status 0.
    * --with-deps and --quiet are modifiers and apply wherever they
      appear on the command line.

    If no option terminates the program, the server is started. The
    process exits with status 1 if a server already answers a ping, or
    if the master lock exists although nothing answered.
    """
    try:
        opts, args = getopt.getopt(sys.argv[1:], '', [
            'reload', 'quit', 'status', 'config=', 'dump=', 'run=', 'pause',
            'quiet', 'resume', 'show-job=', 'with-deps', 'kill'])
    except getopt.GetoptError:
        usage(1)

    # First pass: pick up the modifiers, so that they take effect even
    # when they are given after the option they modify.
    with_deps = quiet = False
    for opt, val in opts:
        if opt == '--with-deps':
            with_deps = True
        elif opt == '--quiet':
            quiet = True

    # Options that map directly to a fixed server command.
    fixed_commands = {
        '--reload': 'RELOAD',
        '--quit': 'QUIT',
        '--status': 'STATUS',
        '--pause': 'PAUSE',
        '--resume': 'RESUME',
        '--kill': 'KILL',
    }

    # None means "no --config seen yet"; the default module is imported
    # only when it is actually needed.
    scheduled_jobs = None

    for opt, val in opts:
        if opt == '--config':
            # Import the file by module name from its own directory.
            # That directory is deliberately left on sys.path: the
            # original notes that removing it makes reload(module) load
            # another file.
            conf_dir, conf_file = os.path.split(val)
            sys.path.insert(0, conf_dir or '.')
            name = os.path.splitext(conf_file)[0]
            # __import__ returns the top-level package for dotted names,
            # so fetch the module itself from sys.modules.
            __import__(name)
            scheduled_jobs = sys.modules[name]
        elif opt == '--dump':
            # Fall back to the default definitions when no --config
            # preceded --dump; the name used to be unbound here.
            if scheduled_jobs is None:
                import scheduled_jobs
            JobQueue.dump_jobs(scheduled_jobs, int(val))
            sys.exit(0)
        else:
            if opt in fixed_commands:
                cmd = fixed_commands[opt]
            elif opt == '--run':
                cmd = 'RUNJOB %s %i' % (val, with_deps)
            elif opt == '--show-job':
                cmd = 'SHOWJOB %s' % val
            else:
                # --with-deps / --quiet were handled in the first pass.
                continue
            sock = SocketHandling(logger)
            try:
                print("Response: %s" % sock.send_cmd(cmd))
            except SocketHandling.Timeout:
                print("Timout contacting server, is it running?")
            sys.exit(0)

    if scheduled_jobs is None:
        import scheduled_jobs

    # Server mode from here on.
    sock = SocketHandling(logger)
    ca = CallableAction()
    ca.set_id("master_jr_lock")
    try:
        if sock.ping_server():
            if not quiet:
                print("Server already running")
            sys.exit(1)
        try:
            ca.check_lockfile()
        except LockExists:
            logger.error(
                ("%s: Master lock exists, but jr-socket didn't respond to " +
                 "ping. This should be a very rare error!") %
                ca.lockfile_name)
            sys.exit(1)
        ca.make_lockfile()
    except SocketHandling.Timeout:
        # Assuming that previous run aborted without removing socket
        # NOTE(review): make_lockfile() is not called on this path, yet
        # free_lock() is still called at shutdown -- confirm intended.
        logger.warn("Socket timeout, assuming server is dead")
        try:
            os.unlink(cereconf.JOB_RUNNER_SOCKET)
        except OSError:
            # Nothing to remove (or not removable); carry on regardless.
            pass

    jr = JobRunner(scheduled_jobs)
    # Serve socket commands from a daemon thread while the job loop runs
    # in this thread.
    socket_thread = threading.Thread(target=sock.start_listener, args=(jr,))
    socket_thread.setDaemon(True)
    socket_thread.setName("socket_thread")
    socket_thread.start()

    jr.run_job_loop()
    logger.debug("bye")
    sock.cleanup()
    ca.free_lock()