def __init__(self, log_dir, daemon_id_for_log, buffer_data):
    """Remember where logs go and start with no open files.

    log_dir -- directory for log output; an Exception is raised if it
               does not exist.
    daemon_id_for_log -- stored on the instance; when it is not None the
               daemon log file name is announced through log.info.
    buffer_data -- stored on the instance unchanged.
    """
    log_dir_exists = os.path.exists(log_dir)
    if not log_dir_exists:
        raise Exception("log_dir '%s' not found!" % (log_dir))

    self.__log_dir__ = log_dir
    self.daemon_id_for_log = daemon_id_for_log
    if daemon_id_for_log is not None:
        # both attributes above are already set, so the file name helper can use them
        daemon_log_file = self.__get_daemon_log_file_name__()
        log.info("TMSD stderr & stdout will be in '%s'" % (daemon_log_file))

    self.buffer_data = buffer_data
    self.open_files = {}
def interrupt(self):
    """Interrupt this Task by sending SIGINT to its process.

    Raises AssertionError if the subprocess was never started.
    Raises Exception if the process is no longer running; the message
    states whether it already succeeded (exit status 0) or failed
    (any other exit status, which is included in the message).
    """
    assert self.__subprocess__ is not None, "Cannot interrupt process not started"
    # A bit of interpretive dance to get this to replicate what's much easier in the 2.6 version
    if self.is_running():
        # task is still running; interrupt it!
        # TODO kill it? (would be signal.SIGKILL)
        pid = self.get_pid()
        log.info("sending SIGINT to pid:%s, task:%s" % (pid, self.get_task().get_id()))
        os.kill(pid, signal.SIGINT)
        return

    # The process has already exited.  An exit status of 0 is success;
    # testing the status for truthiness would report the two outcomes
    # the wrong way round.
    exit_status = self.get_exit_status()
    if exit_status == 0:
        raise Exception("Task cannot be interrupted. It has already succeeded.")
    raise Exception("Task cannot be interrupted. It has failed with status %s." % (exit_status,))
def do_run(self, tmsd_status):
    """What's actually called by the daemon to run the Message. Don't override!

    Sets the run status to STATUS_RUNNING (passing tmsd_status along),
    calls self.run(), and sets STATUS_SUCCESS when run() returns a true
    value.  A false return value is turned into an Exception.  Any
    Exception other than SystemExit is logged and the run status is set
    to STATUS_ERROR; SystemExit is re-raised.
    """
    # NOTE(review): no except/finally clause for this outer try is visible
    # in this excerpt -- the rest of the method appears to be cut off.
    # Confirm against the full file before relying on this layout.
    try:
        try:
            self.__set_run_status__(SQSTaskRunStatus.STATUS_RUNNING, tmsd_status=tmsd_status)
            log.info("Running SQS Task '%s'" % (self))
            success = self.run()
            if success:
                self.__set_run_status__(SQSTaskRunStatus.STATUS_SUCCESS)
                log.info("SQS Task '%s' succeeded.\n\n" % (self))
            else:
                # funnel a failure return value into the same error path as a raised exception
                raise Exception("SQS Task returned failure status. See log for details.")
        except SystemExit, se:
            # in python 2.4, SystemExit extends Exception, this is changed in 2.5 to
            # extend BaseException, specifically so this check isn't necessary. But
            # we're using 2.4; upon upgrade, this check will be unnecessary but ignorable.
            raise se
        except Exception, e:
            log.error("SQS Task failed!", e)
            log.error("\n\n", noalteration=True)
            self.__set_run_status__(SQSTaskRunStatus.STATUS_ERROR)
def clear_queue(c, queue_name, use_api):
    """Empty the queue called queue_name on connection c.

    use_api -- when true, call the queue's own clear(); otherwise delete
               the queue, wait, and recreate it with the visibility
               timeout it had before.

    Raises Exception if no queue by that name exists.
    """
    queue = c.get_queue(queue_name)
    if queue is None:
        raise Exception("No queue exists by name '%s'" % (queue_name))

    if not use_api:
        # clearing is slow & unreliable for some reason. Just delete it and recreate it.
        log.info("Clearing q using deletion '%s' (had %s messages)" % (queue_name, queue.count()))
        saved_timeout = queue.get_timeout()
        delete_queue(c, queue_name)
        pause_seconds = 65
        log.info("Waiting %s seconds before recreating queue" % (pause_seconds))
        # amazon forces us to wait 1 minute before creating a queue by the same name
        time.sleep(pause_seconds)
        create_queue(c, queue_name, visibility_timeout=saved_timeout)
        return

    log.info("Clearing q '%s' using method recommended in API (had %s messages)" % (queue_name, queue.count()))
    queue.clear()
task = a_task break if task == None: sys.exit("ERROR: Task '%s' not found in job %s!" % (options.task_name, job.get_name())) # status = determine_status(options) if options.delete: trs = core.TaskRunStatus.get_all_statuses(task, iteration) if trs == None: trs = [] else: trs = trs.filter(status=status) if len(trs) >= 1: for tr in trs: log.info("Deleting status ID %s for %s %s w/ status %s" % (tr.get_id(), job.get_name(), task.get_name(), status)) tr.delete() else: sys.exit("ERROR: There are %s statuses for %s %s w/ status %s" \ % (len(trs), job.get_name(), task.get_name(), status)) else: log.info("Setting %s %s to %s" % (job.get_name(), task.get_name(), status)) task.__set_run_status__(iteration, status) if __name__ == '__main__': main() #
def create_queue(c, queue_name, visibility_timeout=None):
    """Create a queue called queue_name on connection c.

    visibility_timeout -- passed straight through to c.create_queue().

    Raises Exception if a queue by that name already exists.
    """
    existing = c.get_queue(queue_name)
    if existing is not None:
        raise Exception("Queue by name '%s' already exists!" % (queue_name))

    details = (queue_name, visibility_timeout)
    log.info("Creating queue '%s' with visibility timeout %s" % details)
    c.create_queue(queue_name, visibility_timeout=visibility_timeout)
def delete_queue(c, queue_name):
    """Delete the queue called queue_name on connection c.

    Raises Exception if no queue by that name exists.
    """
    doomed = c.get_queue(queue_name)
    if doomed is None:
        raise Exception("No queue exists by name '%s'" % (queue_name))

    details = (queue_name, doomed.count())
    log.info("Deleting q '%s' (had %s messages)" % details)
    doomed.delete()
def main():
    """Command-line entry point: inspect or change the state of a daemon.

    Parses the command line, optionally changes the status of one daemon
    (pause / stop / kill / salvage / delete), optionally waits for a
    requested shutdown to complete, and finally prints status and/or
    details reports.

    Raises Exception carrying the usage text when no action, or a
    conflicting set of actions, is given; raises Exception when the
    requested state change is not valid for the daemon's current status.
    Calls sys.exit(1) when --wait_seconds elapses before the daemon is done.
    """
    global WAIT_POLL_SECONDS
    parser = OptionParser(usage())
    parser.add_option("--status", action="store_true",
        help="show status of all running norc daemons.")
    parser.add_option("--details", action="store", type="int",
        help="show details for tmsd given by id.")
    parser.add_option("--filter_status", action="store", default="interesting",
        help="if showing status, limit to this set. Defaults to 'interesting', which is active+errored.")
    parser.add_option("--salvage", action="store", type="int",
        help="don't exit tms daemon as requested; leave it running.")
    parser.add_option("--pause", action="store", type="int",
        help="pause the tms daemon of given ID so no more tasks are run")
    parser.add_option("--stop", action="store", type="int",
        help="stop the tms daemon of given ID after all currently running tasks have finished")
    parser.add_option("--kill", action="store", type="int",
        help="immediately kill the tms daemon of given ID")
    parser.add_option("--delete", action="store", type="int",
        help="mark tms daemon of given ID as deleted for convenience. Only changes DB.")
    parser.add_option("--wait_seconds", action="store", default=0, type="int",
        help="wait for N seconds for tmsd to stop after kill or stop is issued. Default is 0")
    parser.add_option("--force", action="store_true",
        help="overrides some safety checks. Use carefully by trying not to use it first.")
    parser.add_option("--due_to_run", action="store", type="int",
        help="show a max # of Tasks due to run (currently an expensive DB call)")
    parser.add_option("--debug", action="store_true", help="more messages")
    (options, args) = parser.parse_args()

    if options.debug:
        log.set_logging_debug(options.debug)

    # At least one action is required.  The usage text is wrapped in an
    # Exception because raising a bare string is rejected by Python 2.6+.
    if not options.status and not options.details \
            and not options.pause and not options.stop and not options.kill \
            and not options.salvage and not options.delete:
        raise Exception(usage())
    # These actions are mutually exclusive.
    if (options.stop and (options.kill or options.salvage or options.details or options.pause)
            or options.kill and (options.stop or options.salvage or options.details or options.pause)
            or options.details and (options.kill or options.stop or options.salvage or options.pause)
            or options.pause and (options.kill or options.stop or options.salvage or options.details)):
        raise Exception(usage())

    #
    # edit a tmsd
    #
    tds_id = None
    tds = None
    if options.pause:
        tds_id = options.pause
    elif options.stop:
        tds_id = options.stop
    elif options.kill:
        tds_id = options.kill
    elif options.salvage:
        tds_id = options.salvage
    elif options.delete:
        tds_id = options.delete
    elif options.details:
        tds_id = options.details

    if tds_id is not None:
        tds = get_tds(tds_id)
        # The parentheses matter: without them a pending pause request on
        # the daemon rejects every action, not just --pause.
        if options.pause and (tds.is_paused() or tds.is_pause_requested()):
            raise Exception("tmsd %s is already paused or pause has been requested." % (tds.id))
        if options.stop and tds.is_stop_requested():
            raise Exception("tmsd %s is already scheduled to stop. You can also try --kill <id>." % (tds.id))
        elif options.kill and tds.is_kill_requested():
            raise Exception("tmsd %s is already scheduled to be killed. The only thing more severe is $kill -9 %s." % (tds.id, tds.pid))
        elif options.salvage and (not tds.is_stop_requested() and not tds.is_kill_requested() and not tds.is_paused()):
            raise Exception("tmsd %s cannot be salvaged. Its status is not paused, stop- or kill- requested" % (tds.id))

        if options.delete:
            if not options.force and not tds.is_done_with_error():
                raise Exception("tmsd %s cannot be deleted because it has status %s. Use --force to override." % (tds.id, tds.get_status()))
            log.info("Deleting tmsd %s" % (tds))
            tds.set_status(tms_models.NorcDaemonStatus.STATUS_DELETED)
        elif options.salvage:
            log.info("Salvaging tmsd %s" % (tds))
            tds.set_status(tms_models.NorcDaemonStatus.STATUS_RUNNING)
        elif options.pause or options.stop or options.kill:
            if tds.is_done():
                raise Exception("tmsd %s is not running. It cannot be shutdown or paused." % (tds.id))
            if options.pause:
                log.info("Sending pause request to tmsd %s" % (tds))
                tds.set_status(tms_models.NorcDaemonStatus.STATUS_PAUSEREQUESTED)
            elif options.stop:
                log.info("Sending stop request to tmsd %s" % (tds))
                tds.set_status(tms_models.NorcDaemonStatus.STATUS_STOPREQUESTED)
            elif options.kill:
                log.info("Sending kill request to tmsd %s" % (tds))
                tds.set_status(tms_models.NorcDaemonStatus.STATUS_KILLREQUESTED)
            #
            if options.wait_seconds:
                # Poll the daemon's status until it is done or the time budget runs out.
                seconds_waited = 0
                timeout = False
                while True:
                    if seconds_waited >= options.wait_seconds:
                        timeout = True
                        break
                    tds = get_tds(tds_id)  # re-fetch to observe the daemon's current status
                    if tds.is_shutting_down():
                        log.info("Waiting for shutdown of tmsd %s. It's been %s seconds." % (tds.id, seconds_waited), indent_chars=4)
                    elif tds.is_done():
                        log.info("tmsd %s is done with status '%s'" % (tds.id, tds.get_status()))
                        break
                    else:
                        raise Exception("tmsd %s shutdown was requested but not honored or was overwritten in DB. This is bad, but try \"kill <pid>\" directly." % (tds.id))
                    time.sleep(WAIT_POLL_SECONDS)
                    seconds_waited += WAIT_POLL_SECONDS
                if timeout:
                    log.info("Timeout reached waiting for tmsd %s to finish. Check process id %s on host '%s'" % (tds.id, tds.pid, tds.host))
                    sys.exit(1)

    #
    # report on status
    #
    if options.status and tds is not None:
        report_tmsd_status(options.filter_status, [tds], max_tasks_due_to_run=options.due_to_run)
    elif options.status:
        report_tmsd_status(options.filter_status, max_tasks_due_to_run=options.due_to_run)
    if options.details:
        daemon_type = tds.get_daemon_type()
        if daemon_type == tms_models.NorcDaemonStatus.DAEMON_TYPE_TMS:
            report_tmsd_details(options.filter_status, tds)
        elif daemon_type == tms_models.NorcDaemonStatus.DAEMON_TYPE_SQS:
            report_sqsd_details(options.filter_status, tds)
        else:
            raise Exception("Unknown daemon_type '%s'" % (daemon_type))
def start_task(self, task, iteration):
    """Run the given Task for the given Iteration in a new thread."""
    job_name = task.get_job().get_name()
    task_name = task.get_name()
    log.info('"%s:%s" starting in new thread' % (job_name, task_name))
    # the thread is handed sys.stdout along with the daemon's status
    worker = TaskInThread(task, iteration, self.get_daemon_status(), sys.stdout)
    worker.start()
def start_task(self, task, iteration):
    """Run the given Task for the given Iteration in a new process and track it."""
    job_name = task.get_job().get_name()
    task_name = task.get_name()
    log.info('"%s:%s" starting in new process' % (job_name, task_name))
    child = TaskInProcess(task, iteration, self.get_daemon_status(), self.__log_dir__)
    child.run()
    # register the task only after run() has been called on it
    self.__add_running_task__(child)
def get_running_tasks(self):
    """Returns list of currently running RunnableTask's

    Tasks found to be no longer running are logged according to their
    exit status and then removed from self.__running_tasks__.

    Raises Exception when a finished task's exit status is 126 or 127,
    i.e. TaskInProcess.RUN_TASK_EXE is not executable or could not be
    found.  The exception is raised before any cleanup, so finished
    tasks seen during that call stay in self.__running_tasks__.
    """
    # exit statuses that only warrant an informational log line
    finished_messages = {
        0: '"%s" succeeded',
        130: '"%s" timed out.',
        131: '"%s" was interrupted.',
        132: '"%s" was killed.',
        133: '"%s" did not run.',
        134: '"%s" ended without a status.',
    }
    # exit statuses that mean the task executable itself is unusable.
    # Each message is a single literal so no source indentation leaks
    # into the exception text.
    fatal_messages = {
        127: "\"%s\" failed b/c of internal error. TaskInProcess.RUN_TASK_EXE '%s' could not be found! BAD!",
        126: "\"%s\" failed b/c of internal error. TaskInProcess.RUN_TASK_EXE '%s' is not executable! BAD!",
    }

    running_tasks = []
    to_cleanup = []
    for running_task in self.__running_tasks__:
        if running_task.is_running():
            running_tasks.append(running_task)
            continue

        to_cleanup.append(running_task)
        # no longer running; log that fact for convenience
        exit_status = running_task.get_exit_status()
        label = self.__get_task_label__(running_task)
        if exit_status in finished_messages:
            log.info(finished_messages[exit_status] % (label,))
        elif exit_status in fatal_messages:
            raise Exception(fatal_messages[exit_status] % (label, TaskInProcess.RUN_TASK_EXE))
        else:
            log.info('"%s" failed with exit status %s!' % (label, exit_status))

    # removal is deferred so the list is not mutated while it is being iterated
    for no_longer_running in to_cleanup:
        self.__running_tasks__.remove(no_longer_running)
    return running_tasks
def request_kill(self):
    """Mark the daemon status as kill-requested and set the
    __break_tasks_to_run_loop__ flag on this instance."""
    log.info("tmsd Sending kill request...")
    daemon_status = self.get_daemon_status()
    daemon_status.set_status(tms_models.NorcDaemonStatus.STATUS_KILLREQUESTED)
    self.__break_tasks_to_run_loop__ = True
def __do_run__(self):
    """Main daemon loop

    Sets the daemon status to STATUS_RUNNING and then loops, reacting to
    the current daemon status on each pass:

    * stop requested / being stopped: set STATUS_STOPINPROGRESS; once no
      tasks are running, set STATUS_ENDEDGRACEFULLY and return True.
    * kill requested / being killed: with no running tasks, set
      STATUS_ENDEDGRACEFULLY and return True; otherwise interrupt every
      running task, set STATUS_KILLED and return False.
    * pause requested: set STATUS_PAUSED.

    NOTE(review): no handling of other statuses and no sleep between
    passes is visible in this excerpt -- the loop body may continue
    beyond what is shown here; confirm against the full file.
    """
    log.info("%s %s..." % (self.get_name(), str(self.get_daemon_status())))
    if settings.DEBUG:
        log.info("WARNING: settings.DEBUG is True: daemon will gobble up memory b/c django stores SQL queries.")
    self.get_daemon_status().set_status(tms_models.NorcDaemonStatus.STATUS_RUNNING)
    last_status = self.get_daemon_status().get_status()
    while True:
        # log every status transition exactly once
        if not last_status == self.get_daemon_status().get_status():
            log.info("tmsd state changed: %s -> %s" % (last_status, self.get_daemon_status().get_status()))
            last_status = self.get_daemon_status().get_status()
        # replace the held status object with whatever thwart_cache() returns
        self.__set_daemon_status__(self.get_daemon_status().thwart_cache()) # see note in this method definition
        if self.get_daemon_status().is_stop_requested() or self.get_daemon_status().is_being_stopped():
            # don't kick off more tasks, but wait for those running to finish on their own
            self.get_daemon_status().set_status(tms_models.NorcDaemonStatus.STATUS_STOPINPROGRESS)
            num_running_tasks = self.get_num_running_tasks()
            if num_running_tasks == 0:
                log.info("tmsd stop requested and no more tasks. Ending gracefully.")
                self.get_daemon_status().set_status(tms_models.NorcDaemonStatus.STATUS_ENDEDGRACEFULLY)
                return True
            else:
                log.info("tmsd stop requested; waiting for %s task(s) to finish." % (num_running_tasks))
        elif self.get_daemon_status().is_kill_requested() or self.get_daemon_status().is_being_killed():
            running_tasks = self.get_running_tasks()
            if len(running_tasks) == 0:
                log.info("tmsd kill requested but no tasks running. Ending gracefully.")
                self.get_daemon_status().set_status(tms_models.NorcDaemonStatus.STATUS_ENDEDGRACEFULLY)
                return True
            else:
                log.info(
                    "tmsd kill requested; interrupting %s task(s) and stopping immediately."
                    % (len(running_tasks))
                )
                self.get_daemon_status().set_status(tms_models.NorcDaemonStatus.STATUS_KILLINPROGRESS)
                for running_task in running_tasks:
                    # There's no way to actually interrupt python threads
                    # mark the task as ended in error, and leave it up to
                    # main() to call SIGKILL on this process.
                    log.info("interrupting task '%s'." % (running_task), indent_chars=4)
                    try:
                        running_task.interrupt()
                    except Exception, e:
                        # a task that cannot be interrupted is logged; the remaining tasks are still tried
                        log.error("Could not interrupt Task '%s'" % (running_task), e)
                self.get_daemon_status().set_status(tms_models.NorcDaemonStatus.STATUS_KILLED)
                return False
        elif self.get_daemon_status().is_pause_requested():
            log.info("tmsd pause requested. Will just sit here.")
            self.get_daemon_status().set_status(tms_models.NorcDaemonStatus.STATUS_PAUSED)
def start_task(self):
    """Run the next SQS Task in a new process and track it."""
    log.info("Starting the next SQS Task in new process")
    daemon_status = self.get_daemon_status()
    child = SQSTaskInProcess(daemon_status, self.__log_dir__)
    child.run()
    # register the task only after run() has been called on it
    self.__add_running_task__(child)