def run_batch(self): tasks_to_run = tms_manage.get_tasks_allowed_to_run(end_completed_iterations=True, max_to_return=10) num_running_tasks = self.get_num_running_tasks() log.debug("tmsd running %s task(s), at least %s task(s) due to run" % (num_running_tasks, len(tasks_to_run))) need_resource_types = [] for (task, iteration) in tasks_to_run: if self.__break_tasks_to_run_loop__: # some other thread (request_stop) doesn't want me to continue. Stop here. break # check that there are currently sufficient resources to prevent # erroneously thinking this task can be run when it cannot. # There will be occasional cases where race conditions mean a task is not run when # it could be, but there are many more cases when this will save threads. if type(task) in need_resource_types: # A Task of this type already returned unavailable resources; don't check again. # This should be an efficiency gain for the running of Tasks to prevent # excessive polling of the resources table when there are likely no new resources. # log.info("Assuming no resources avail for Task type '%s'" % (type(task))) pass elif task.resources_available_to_run(self.get_daemon_status().get_region()): try: self.start_task(task, iteration) except Exception, e: log.error("Could not run Task '%s'" % (task), e) else: need_resource_types.append(type(task))
def __handle_timeout__(*args):
    """Handle a timeout of the module-level `task`.

    Any positional arguments are accepted and ignored.  If no task has been
    set yet an error is logged; if the task reports that it has no timeout
    an Exception is raised.  In every non-raising case control is handed to
    __handle_signal__ with the 'TIMEOUT' name, exit code 130 and timeout=True.
    """
    global task
    sig_name, exit_code = 'TIMEOUT', 130

    if task == None:
        not_started_msg = "Received %s but Task not started yet. Stopping with exit code %s." % (sig_name, exit_code)
        log.error(not_started_msg)
    elif not task.has_timeout():
        raise Exception("Task %s doesn't handle timeouts. How did you get here? BUG!" % (task))

    __handle_signal__(sig_name, exit_code, True)
def run(self): try: self.__logger__.start_redirect() ended_gracefully = NorcDaemon.run(self) self.__logger__.stop_redirect() return ended_gracefully except Exception, e: log.error("Error running daemon!", e) return False
def run(self): try: try: self.get_task().do_run(self.get_iteration(), self.get_daemon_status()) except Exception, e: log.error("Exception propegated from task.do_run(). BAD! Bug?", e) except: log.error("Poorly thrown exception propegated from task.do_run(). BAD! Bug?") traceback.print_exc()
def run(self): """Start this daemon""" try: ended_gracefully = self.__do_run__() return ended_gracefully except Exception, e: self.get_daemon_status().set_status(tms_models.NorcDaemonStatus.STATUS_ERROR) log.error("tmsd suffered an internal error. BAD!", e) return False
def rpt_queues(c): all_queues = c.get_all_queues() print "%s AWS SQS Queue(s) as of %s" % (len(all_queues), datetime.datetime.now()) sys.stdout.write("\n") table_data = [] header1 = ["Name", "~ #", "Timeout"] header2 = ["-", "-", "-"] table_data.append(header1) table_data.append(header2) for q in all_queues: try: row = [get_name(q), q.count(), q.get_timeout()] table_data.append(row) except SQSError, sqse: log.error("Internal SQS error (it generates ignorable errors sometimes)" + str(sqse))
def __do_run__(self):
    """Main daemon loop.

    Marks the daemon status as RUNNING, then loops, reacting to the requests
    recorded on the daemon status:

    * stop requested / being stopped: the status becomes STOPINPROGRESS; once
      no tasks are running it is set to ENDEDGRACEFULLY and True is returned.
    * kill requested / being killed: with no running tasks the daemon ends
      gracefully (returns True); otherwise every running task is interrupted,
      the status is set to KILLED and False is returned.
    * pause requested: the status is set to PAUSED.

    NOTE(review): no branch for the normal running state and no delay between
    loop passes is visible in this block -- confirm against the full source.
    """
    log.info("%s %s..." % (self.get_name(), str(self.get_daemon_status())))
    if settings.DEBUG:
        log.info("WARNING: settings.DEBUG is True: daemon will gobble up memory b/c django stores SQL queries.")
    self.get_daemon_status().set_status(tms_models.NorcDaemonStatus.STATUS_RUNNING)
    last_status = self.get_daemon_status().get_status()
    while True:
        # Log status transitions once, when they are first observed.
        if not last_status == self.get_daemon_status().get_status():
            log.info("tmsd state changed: %s -> %s" % (last_status, self.get_daemon_status().get_status()))
            last_status = self.get_daemon_status().get_status()

        # Replace the held status object with the one returned by thwart_cache().
        self.__set_daemon_status__(self.get_daemon_status().thwart_cache())  # see note in this method definition

        if self.get_daemon_status().is_stop_requested() or self.get_daemon_status().is_being_stopped():
            # don't kick off more tasks, but wait for those running to finish on their own
            self.get_daemon_status().set_status(tms_models.NorcDaemonStatus.STATUS_STOPINPROGRESS)
            num_running_tasks = self.get_num_running_tasks()
            if num_running_tasks == 0:
                log.info("tmsd stop requested and no more tasks. Ending gracefully.")
                self.get_daemon_status().set_status(tms_models.NorcDaemonStatus.STATUS_ENDEDGRACEFULLY)
                return True
            else:
                log.info("tmsd stop requested; waiting for %s task(s) to finish." % (num_running_tasks))
        elif self.get_daemon_status().is_kill_requested() or self.get_daemon_status().is_being_killed():
            running_tasks = self.get_running_tasks()
            if len(running_tasks) == 0:
                log.info("tmsd kill requested but no tasks running. Ending gracefully.")
                self.get_daemon_status().set_status(tms_models.NorcDaemonStatus.STATUS_ENDEDGRACEFULLY)
                return True
            else:
                log.info(
                    "tmsd kill requested; interrupting %s task(s) and stopping immediately."
                    % (len(running_tasks))
                )
                self.get_daemon_status().set_status(tms_models.NorcDaemonStatus.STATUS_KILLINPROGRESS)
                for running_task in running_tasks:
                    # There's no way to actually interrupt python threads
                    # mark the task as ended in error, and leave it up to
                    # main() to call SIGKILL on this process.
                    log.info("interrupting task '%s'." % (running_task), indent_chars=4)
                    try:
                        running_task.interrupt()
                    except Exception, e:
                        # A task that cannot be interrupted is logged; the
                        # remaining tasks are still processed.
                        log.error("Could not interrupt Task '%s'" % (running_task), e)
                self.get_daemon_status().set_status(tms_models.NorcDaemonStatus.STATUS_KILLED)
                return False
        elif self.get_daemon_status().is_pause_requested():
            log.info("tmsd pause requested. Will just sit here.")
            self.get_daemon_status().set_status(tms_models.NorcDaemonStatus.STATUS_PAUSED)
def do_run(self, tmsd_status):
    """What's actually called by the daemon to run the Message. Don't override!

    Marks the run status RUNNING (recording `tmsd_status`), calls self.run()
    and records STATUS_SUCCESS when run() returns a true value.  A false
    return is turned into an Exception which, like any other Exception, is
    logged and recorded as STATUS_ERROR.  SystemExit is re-raised.
    """
    # NOTE(review): the handlers belonging to this outer try are not visible
    # in this block -- confirm against the full source.
    try:
        try:
            self.__set_run_status__(SQSTaskRunStatus.STATUS_RUNNING, tmsd_status=tmsd_status)
            log.info("Running SQS Task '%s'" % (self))
            success = self.run()
            if success:
                self.__set_run_status__(SQSTaskRunStatus.STATUS_SUCCESS)
                log.info("SQS Task '%s' succeeded.\n\n" % (self))
            else:
                raise Exception("SQS Task returned failure status. See log for details.")
        except SystemExit, se:
            # in python 2.4, SystemExit extends Exception, this is changed in 2.5 to
            # extend BaseException, specifically so this check isn't necessary. But
            # we're using 2.4; upon upgrade, this check will be unnecessary but ignorable.
            raise se
        except Exception, e:
            log.error("SQS Task failed!", e)
            log.error("\n\n", noalteration=True)
            self.__set_run_status__(SQSTaskRunStatus.STATUS_ERROR)
def get_tasks_allowed_to_run(asof=None, end_completed_iterations=False, max_to_return=None):
    """
    Get all tasks that are allowed to run, regardless of resources available.
    Includes all interfaces.

    TODO Currently this is EXTREMELY expensive to run.  Use max_to_return or beware the sloooowness!
    *Slowness is due to having to independently query for each Task's latest status and parent's status.
    One approach is to query for statuses, then tasks with no statuses, then merge the two lists.
    But this only satisfies some of the criteria that this slow way uses.
    Another approach: the daemon should ask for one task at a time, like a proper queue.

    Arguments:
      asof -- time handed to each task's is_allowed_to_run(); defaults to
              datetime.datetime.utcnow() evaluated at call time.
      end_completed_iterations -- when true, an ephemeral iteration in which
              no task was allowed to run and no unfinished task was found is
              marked done via iteration.set_done().
      max_to_return -- stop collecting once this many [task, iteration] pairs
              have been gathered; None means no limit.

    NOTE(review): no return statement is visible in this block although the
    pairs are accumulated in `to_run` -- confirm the full source returns it.
    """
    if asof == None:  # need to do this here and not in arg so it updates w/ each call
        asof = datetime.datetime.utcnow()
    to_run = []  # [[Task, Iteration]...]
    for iteration in core.Iteration.get_running_iterations():
        tasks = iteration.get_job().get_tasks()
        # Stays True only while no task of this iteration has been found
        # runnable or (when end_completed_iterations is set) unfinished.
        iteration_is_done = True
        for a_task in tasks:
            try:
                if not max_to_return == None and len(to_run) >= max_to_return:
                    # Limit reached: stop scanning this iteration's tasks.
                    break
                elif a_task.is_allowed_to_run(iteration, asof=asof):
                    to_run.append([a_task, iteration])
                    iteration_is_done = False
                elif iteration_is_done and end_completed_iterations and not __status_is_finished__(a_task, iteration):
                    iteration_is_done = False
            except Exception, e:
                # One bad task is logged and skipped; the scan continues.
                log.error(
                    "Could not check if task type '%s' is due to run. Skipping. \ BAD! Maybe DB is in an inconsistent state or software bug?" % (a_task.__class__.__name__),
                    e,
                )
        # TODO there's a bug here! iterations end when tasks are sitting in failed state
        if iteration_is_done and end_completed_iterations and iteration.is_ephemeral():
            # this iteration has completed and should be set as such
            iteration.set_done()
        if not max_to_return == None and len(to_run) >= max_to_return:
            # Limit reached: don't look at further iterations.
            break
                self.__set_run_status__(SQSTaskRunStatus.STATUS_SUCCESS)
                log.info("SQS Task '%s' succeeded.\n\n" % (self))
            else:
                raise Exception("SQS Task returned failure status. See log for details.")
        except SystemExit, se:
            # in python 2.4, SystemExit extends Exception, this is changed in 2.5 to
            # extend BaseException, specifically so this check isn't necessary. But
            # we're using 2.4; upon upgrade, this check will be unnecessary but ignorable.
            raise se
        except Exception, e:
            log.error("SQS Task failed!", e)
            log.error("\n\n", noalteration=True)
            self.__set_run_status__(SQSTaskRunStatus.STATUS_ERROR)
    except:
        # if the error thrown doesn't use Exception(...), ie just throws a string
        log.error("Task failed with poorly thrown exception!")
        traceback.print_exc()
        log.error("\n\n", noalteration=True)
        self.__set_run_status__(SQSTaskRunStatus.STATUS_ERROR)
    finally:
        pass

def get_log_file(self):
    """Return the log path: settings.TMS_LOG_DIR/<queue name>/<id>."""
    #f = "%s.%s" % (self.get_id(), self.get_date_enqueued().strftime('%Y%m%d_%H%M%S'))
    fp = os.path.join(settings.TMS_LOG_DIR, self.get_queue_name(), str(self.get_id()))
    return fp

def get_date_enqueued(self):
    """Return the stored date_enqueued attribute."""
    return self.date_enqueued

def get_id(self):
    """Not implemented here; always raises NotImplementedError.

    NOTE(review): presumably meant to be overridden by subclasses -- confirm.
    """
    raise NotImplementedError

def get_queue_name(self):
def __handle_signal__(sig_name, exit_code, timeout):
    """Log the received signal, record how the Task ended, and exit.

    If the module-level task, iteration and region are all set, the task is
    marked as ended on timeout (when `timeout` is true) or ended on error.
    If any of them is unset only the log message is written.  Either way
    the process then exits via sys.exit(exit_code).
    """
    global task, iteration, region

    not_started = task == None or iteration == None or region == None
    if not_started:
        message = "Received %s but Task not started yet. Stopping with exit code %s." % (sig_name, exit_code)
    else:
        message = "Received %s! TMS Stopping Task with exit code %s." % (sig_name, exit_code)

    # The message is framed by unaltered blank lines in both cases.
    log.error("\n", noalteration=True)
    log.error(message)
    log.error("\n", noalteration=True)

    if not not_started:
        if timeout:
            task.set_ended_on_timeout(iteration, region)
        else:
            task.set_ended_on_error(iteration, region)

    # We call the normal sys.exit(), even though it trusts that whatever
    # try: ... except block is currently executing will propagate the
    # SystemExit exception instead of handling it.  In Python 2.5 SystemExit
    # does not extend Exception, so only a catch-all (try: ... except:) would
    # be a problem.  But we're using Python 2.4, so *all* catchers of
    # Exception need to distinguish between Exception & SystemExit.
    sys.exit(exit_code)
def __run_task__(task, iteration, daemon_status): # sanity check that this Task is allowed to run if not task.is_active(): raise Exception("Cannot run task '%s' b/c it does not need to be run!" % (task)) # run the Task! try: __start_timeout_timer__() task.do_run(iteration, daemon_status) __stop_timeout_timer__() except SystemExit, se: # in python 2.4, SystemExit extends Exception, this is changed in 2.5 to # extend BaseException, specifically so this check isn't necessary. But # we're using 2.4; upon upgrade, this check will be unecessary but ignorable. raise se except Exception, e: log.error("Exception propegated from task.do_run(). BAD! Bug?", e) raise e except: log.error("Poorly thrown exception propegated from task.do_run(). BAD! Bug?") traceback.print_exc() raise Exception("Poorly handled exception propegated from task.do_run(). BAD! Bug?") # def main(): global task, iteration, region parser = OptionParser("%prog --daemon_status_id <id> --iteration_id <id> \ --task_library <lib> --task_id <id> [--nice 5] [--stdout <file_name>] [--stderr <file_name>|STDOUT>] [--debug]") parser.add_option("--daemon_status_id", action="store", type="int" , help="The id of the daemon status that launched this Task") parser.add_option("--iteration_id", action="store", type="int" , help="The id of the iteration in which this Task runs")
log.info('"%s:%s" starting in new process' % (task.get_job().get_name(), task.get_name())) tp = TaskInProcess(task, iteration, self.get_daemon_status(), self.__log_dir__) tp.run() self.__add_running_task__(tp) def run(self): try: self.__logger__.start_redirect() ended_gracefully = NorcDaemon.run(self) self.__logger__.stop_redirect() return ended_gracefully except Exception, e: log.error("Error running daemon!", e) return False except: log.error("Error running daemon & it was poorly thrown!", e) return False # # # class TaskInThread(RunnableTask, threading.Thread): __logger__ = None def __init__(self, task, iteration, daemon_status, logger): self.__logger__ = logger RunnableTask.__init__(self, task, iteration, daemon_status)