Ejemplo n.º 1
0
Archivo: tmsd.py Proyecto: bdotdub/norc
 def __init__(self, log_dir, daemon_id_for_log, buffer_data):
     """Remember where log files go and start with no files open.

     log_dir           -- directory for log output; it must already exist.
     daemon_id_for_log -- stored as-is; when it is not None the daemon log
                          file name is announced through log.info.
     buffer_data       -- stored as-is on the instance.

     Raises Exception when log_dir does not exist.
     """
     if not os.path.exists(log_dir):
         raise Exception("log_dir '%s' not found!" % (log_dir))
     self.__log_dir__ = log_dir
     self.daemon_id_for_log = daemon_id_for_log
     # Identity test: None is a singleton, and '== None' can be fooled by
     # objects that define their own __eq__.
     if daemon_id_for_log is not None:
         log.info("TMSD stderr & stdout will be in '%s'" % (self.__get_daemon_log_file_name__()))
     self.buffer_data = buffer_data
     # starts out empty; nothing is opened by the constructor itself
     self.open_files = {}
Ejemplo n.º 2
0
Archivo: tmsd.py Proyecto: bdotdub/norc
 def interrupt(self):
     """Interrupt this Task by sending SIGINT to its process.

     Raises AssertionError if the process was never started, and
     Exception if the process has already finished (the message says
     whether it succeeded or failed).
     """
     assert self.__subprocess__ is not None, "Cannot interrupt process not started"
     # A bit of interpretive dance to get this to replicate what's much easier in the 2.6 version
     if self.is_running():
         # task is still running; interrupt it!
         # TODO kill it? (would be signal.SIGKILL)
         log.info("sending SIGINT to pid:%s, task:%s" % (self.get_pid(), self.get_task().get_id()))
         os.kill(self.get_pid(), signal.SIGINT)
         return

     # The process has already ended.  An exit status of 0 means success;
     # anything else (including a missing status) is reported as failure.
     exit_status = self.get_exit_status()
     if exit_status == 0:
         raise Exception("Task cannot be interrupted. It has already succeeded.")
     raise Exception("Task cannot be interrupted. It has failed with status %s." % (exit_status))
Ejemplo n.º 3
0
 def do_run(self, tmsd_status):
     """What's actually called by the daemon to run the Message. Don't override!

     Marks the task as running (passing tmsd_status along), calls
     self.run(), and records success when run() returns a true value.
     A false return value is turned into an Exception so that it is
     handled by the same error path as any other failure: the error is
     logged and the run status is set to STATUS_ERROR.  SystemExit is
     re-raised untouched.
     """
     # NOTE(review): the outer try has no except/finally clause in this
     # excerpt; its handler presumably follows the visible code -- confirm.
     try:
         try:
             self.__set_run_status__(SQSTaskRunStatus.STATUS_RUNNING, tmsd_status=tmsd_status)
             log.info("Running SQS Task '%s'" % (self))
             success = self.run()
             if success:
                 self.__set_run_status__(SQSTaskRunStatus.STATUS_SUCCESS)
                 log.info("SQS Task '%s' succeeded.\n\n" % (self))
             else:
                 # funnel a "soft" failure into the generic error handler below
                 raise Exception("SQS Task returned failure status. See log for details.")
         except SystemExit, se:
             # in python 2.4, SystemExit extends Exception, this is changed in 2.5 to
             # extend BaseException, specifically so this check isn't necessary. But
             # we're using 2.4; upon upgrade, this check will be unnecessary but ignorable.
             raise se
         except Exception, e:
             # any other failure: log it and record the error status
             log.error("SQS Task failed!", e)
             log.error("\n\n", noalteration=True)
             self.__set_run_status__(SQSTaskRunStatus.STATUS_ERROR)
Ejemplo n.º 4
0
def clear_queue(c, queue_name, use_api, wait_seconds=65):
    """Remove every message from the named queue.

    c            -- connection object providing get_queue().
    queue_name   -- name of the queue to empty.
    use_api      -- when true, call the queue's own clear(); otherwise
                    delete the queue and recreate it with the same
                    visibility timeout.
    wait_seconds -- pause between deleting and recreating the queue
                    (only used when use_api is false).  Defaults to 65.

    Raises Exception when no queue by that name exists.
    """
    q = c.get_queue(queue_name)
    if q is None:
        raise Exception("No queue exists by name '%s'" % (queue_name))
    if use_api:
        log.info("Clearing q '%s' using method recommended in API (had %s messages)" % (queue_name, q.count()))
        q.clear()
    else:
        # clearing is slow & unreliable for some reason.  Just delete it and recreate it.
        log.info("Clearing q using deletion '%s' (had %s messages)" % (queue_name, q.count()))
        # remember the timeout before the queue disappears so it can be restored
        visibility_timeout = q.get_timeout()
        delete_queue(c, queue_name)
        log.info("Waiting %s seconds before recreating queue" % (wait_seconds))
        time.sleep(wait_seconds)  # amazon forces us to wait 1 minute before creating a queue by the same name
        create_queue(c, queue_name, visibility_timeout=visibility_timeout)
Ejemplo n.º 5
0
            task = a_task
            break
    if task == None:
        sys.exit("ERROR: Task '%s' not found in job %s!" % (options.task_name, job.get_name()))
    
    #
    status = determine_status(options)
    if options.delete:
        trs = core.TaskRunStatus.get_all_statuses(task, iteration)
        if trs == None:
            trs = []
        else:
            trs = trs.filter(status=status)
        if len(trs) >= 1:
            for tr in trs:
                log.info("Deleting status ID %s for %s %s w/ status %s" 
                    % (tr.get_id(), job.get_name(), task.get_name(), status))
                tr.delete()
        else:
            sys.exit("ERROR: There are %s statuses for %s %s w/ status %s" \
                % (len(trs), job.get_name(), task.get_name(), status))
    else:
        log.info("Setting %s %s to %s" % (job.get_name(), task.get_name(), status))
        task.__set_run_status__(iteration, status)
    

# Script entry point: call main() only when this file is executed directly.
if __name__ == '__main__':
    main()

#
Ejemplo n.º 6
0
def create_queue(c, queue_name, visibility_timeout=None):
    """Create a queue, refusing to touch one that already exists.

    c                  -- connection object providing get_queue() and
                          create_queue().
    queue_name         -- name of the queue to create.
    visibility_timeout -- passed through unchanged to c.create_queue().

    Raises Exception when a queue by that name already exists.
    """
    q = c.get_queue(queue_name)
    # identity test: None is a singleton and '==' may be overridden
    if q is not None:
        raise Exception("Queue by name '%s' already exists!" % (queue_name))
    log.info("Creating queue '%s' with visibility timeout %s" % (queue_name, visibility_timeout))
    c.create_queue(queue_name, visibility_timeout=visibility_timeout)
Ejemplo n.º 7
0
def delete_queue(c, queue_name):
    """Delete the named queue.

    c          -- connection object providing get_queue().
    queue_name -- name of the queue to delete.

    Raises Exception when no queue by that name exists.
    """
    q = c.get_queue(queue_name)
    # identity test: None is a singleton and '==' may be overridden
    if q is None:
        raise Exception("No queue exists by name '%s'" % (queue_name))
    log.info("Deleting q '%s' (had %s messages)" % (queue_name, q.count()))
    q.delete()
Ejemplo n.º 8
0
def main():
    """Command-line entry point: inspect or control a norc daemon (tmsd).

    Parses the command line, optionally requests a state change (pause,
    stop, kill, salvage or delete) on the daemon whose ID was given,
    optionally polls every WAIT_POLL_SECONDS until a requested shutdown
    has completed, and finally prints status and/or detail reports.

    Exits via parser.error() when no action is given or when conflicting
    actions are combined, and with status 1 when --wait_seconds elapses
    before the daemon is done.  Raises Exception when the requested
    change is not valid for the daemon's current status.
    """
    global WAIT_POLL_SECONDS
    
    parser = OptionParser(usage())
    parser.add_option("--status", action="store_true"
        , help="show status of all running norc daemons.")
    parser.add_option("--details", action="store", type="int"
        , help="show details for tmsd given by id.")
    parser.add_option("--filter_status", action="store", default="interesting"
        , help="if showing status, limit to this set. Defaults to 'interesting', which is active+errored.")
    parser.add_option("--salvage", action="store"
        , type="int", help="don't exit tms daemon as requested; leave it running.")
    parser.add_option("--pause", action="store", type="int"
        , help="pause the tms daemon of given ID so no more tasks are run")
    parser.add_option("--stop", action="store", type="int"
        , help="stop the tms daemon of given ID after all currently running tasks have finished")
    parser.add_option("--kill", action="store"
        , type="int", help="immediately kill the tms daemon of given ID")
    parser.add_option("--delete", action="store"
        , type="int", help="mark tms daemon of given ID as deleted for convenience. Only changes DB.")
    parser.add_option("--wait_seconds", action="store", default=0
        , type="int", help="wait for N seconds for tmsd to stop after kill or stop is issued. Default is 0")
    parser.add_option("--force", action="store_true", help="overrides some safety checks. Use carefully by trying not to use it first.")
    parser.add_option("--due_to_run", action="store", type="int"
        , help="show a max # of Tasks due to run (currently an expensive DB call)")
    parser.add_option("--debug", action="store_true", help="more messages")
    (options, args) = parser.parse_args()
    
    if options.debug:
        log.set_logging_debug(options.debug)
    
    # At least one action is required.  parser.error() prints the usage text
    # plus the message and exits; raising the usage string itself is not a
    # valid way to raise an exception.
    if not options.status and not options.details \
        and not options.pause and not options.stop and not options.kill \
        and not options.salvage and not options.delete:
        parser.error("one of --status, --details, --pause, --stop, --kill, --salvage or --delete is required")
    # --stop, --kill, --details and --pause are mutually exclusive, and none
    # of them may be combined with --salvage.
    exclusive = [opt for opt in (options.stop, options.kill, options.details, options.pause) if opt]
    if len(exclusive) > 1 or (exclusive and options.salvage):
        parser.error("--stop, --kill, --details and --pause cannot be combined with each other or with --salvage")
    
    #
    # edit a tmsd
    #
    tds_id = None; tds = None
    if options.pause:
        tds_id = options.pause
    elif options.stop:
        tds_id = options.stop
    elif options.kill:
        tds_id = options.kill
    elif options.salvage:
        tds_id = options.salvage
    elif options.delete:
        tds_id = options.delete
    elif options.details:
        tds_id = options.details
    
    if tds_id is not None:
        tds = get_tds(tds_id)
        # Parenthesised on purpose: without the grouping, 'and' binds tighter
        # than 'or' and every action on a pause-requested daemon was rejected.
        if options.pause and (tds.is_paused() or tds.is_pause_requested()):
            raise Exception("tmsd %s is already paused or pause has been requested." % (tds.id))
        if options.stop and tds.is_stop_requested():
            raise Exception("tmsd %s is already scheduled to stop. You can also try --kill <id>." % (tds.id))
        elif options.kill and tds.is_kill_requested():
            raise Exception("tmsd %s is already scheduled to be killed. The only thing more severe is $kill -9 %s." % (tds.id, tds.pid))
        elif options.salvage and (not tds.is_stop_requested() and not tds.is_kill_requested() and not tds.is_paused()):
            raise Exception("tmsd %s cannot be salvaged.  Its status is not paused, stop- or kill- requested" % (tds.id))
    
    if options.delete:
        if not options.force and not tds.is_done_with_error():
            raise Exception("tmsd %s cannot be deleted because it has status %s. Use --force to override." % (tds.id, tds.get_status()))
        log.info("Deleting tmsd %s" % (tds))
        tds.set_status(tms_models.NorcDaemonStatus.STATUS_DELETED)
    elif options.salvage:
        log.info("Salvaging tmsd %s" % (tds))
        tds.set_status(tms_models.NorcDaemonStatus.STATUS_RUNNING)
    elif options.pause or options.stop or options.kill:
        if tds.is_done():
            raise Exception("tmsd %s is not running.  It cannot be shutdown or paused." % (tds.id))
        if options.pause:
            log.info("Sending pause request to tmsd %s" % (tds))
            tds.set_status(tms_models.NorcDaemonStatus.STATUS_PAUSEREQUESTED)
        elif options.stop:
            log.info("Sending stop request to tmsd %s" % (tds))
            tds.set_status(tms_models.NorcDaemonStatus.STATUS_STOPREQUESTED)
        elif options.kill:
            log.info("Sending kill request to tmsd %s" % (tds))
            tds.set_status(tms_models.NorcDaemonStatus.STATUS_KILLREQUESTED)
        #
        # optionally poll until the daemon reports that it is done
        #
        if options.wait_seconds:
            seconds_waited = 0
            timeout = False
            while True:
                if seconds_waited >= options.wait_seconds:
                    timeout = True
                    break
                # re-read the daemon status on every poll
                tds = get_tds(tds_id)
                if tds.is_shutting_down():
                    log.info("Waiting for shutdown of tmsd %s.  It's been %s seconds." % (tds.id, seconds_waited), indent_chars=4)
                elif tds.is_done():
                    log.info("tmsd %s is done with status '%s'" % (tds.id, tds.get_status()))
                    break
                else:
                    raise Exception("tmsd %s shutdown was requested but not honored or was overwritten in DB. This is bad, but try \"kill <pid>\" directly." % (tds.id))
                time.sleep(WAIT_POLL_SECONDS)
                seconds_waited += WAIT_POLL_SECONDS
            if timeout:
                log.info("Timeout reached waiting for tmsd %s to finish.  Check process id %s on host '%s'" % (tds.id, tds.pid, tds.host))
                sys.exit(1)
    
    #
    # report on status
    #
    
    if options.status and tds is not None:
        # a specific daemon was addressed: report on just that one
        report_tmsd_status(options.filter_status, [tds], max_tasks_due_to_run=options.due_to_run)
    elif options.status:
        report_tmsd_status(options.filter_status, max_tasks_due_to_run=options.due_to_run)
    if options.details:
        daemon_type = tds.get_daemon_type()
        if daemon_type == tms_models.NorcDaemonStatus.DAEMON_TYPE_TMS:
            report_tmsd_details(options.filter_status, tds)
        elif daemon_type == tms_models.NorcDaemonStatus.DAEMON_TYPE_SQS:
            report_sqsd_details(options.filter_status, tds)
        else:
            raise Exception("Unknown daemon_type '%s'" % (daemon_type))
Ejemplo n.º 9
0
Archivo: tmsd.py Proyecto: bdotdub/norc
 def start_task(self, task, iteration):
     """Run the given Task for the given Iteration in a new thread."""
     job_name = task.get_job().get_name()
     task_name = task.get_name()
     log.info('"%s:%s" starting in new thread' % (job_name, task_name))
     # the thread wrapper is handed the daemon status and sys.stdout
     worker = TaskInThread(task, iteration, self.get_daemon_status(), sys.stdout)
     worker.start()
Ejemplo n.º 10
0
Archivo: tmsd.py Proyecto: bdotdub/norc
 def start_task(self, task, iteration):
     """Run the given Task for the given Iteration in a new process and
     register it with this daemon's running tasks."""
     names = (task.get_job().get_name(), task.get_name())
     log.info('"%s:%s" starting in new process' % names)
     # the process wrapper is handed the daemon status and the log directory
     runner = TaskInProcess(task, iteration, self.get_daemon_status(), self.__log_dir__)
     runner.run()
     self.__add_running_task__(runner)
Ejemplo n.º 11
0
Archivo: tmsd.py Proyecto: bdotdub/norc
    def get_running_tasks(self):
        """Return the tasks in self.__running_tasks__ that are still running.

        Tasks that have stopped are logged according to their exit status
        and removed from self.__running_tasks__.  Exit statuses 127 and 126
        raise an Exception about TaskInProcess.RUN_TASK_EXE; in that case
        the method stops immediately and nothing is removed.
        """
        # exit statuses that only get an informational log line
        outcome_formats = {
            0: '"%s" succeeded',
            130: '"%s" timed out.',
            131: '"%s" was interrupted.',
            132: '"%s" was killed.',
            133: '"%s" did not run.',
            134: '"%s" ended without a status.',
        }
        # exit statuses that are reported as an internal error
        fatal_formats = {
            127: '"%s" failed b/c of internal error.  '
                 "TaskInProcess.RUN_TASK_EXE '%s' could not be found! BAD!",
            126: '"%s" failed b/c of internal error.  '
                 "TaskInProcess.RUN_TASK_EXE '%s' is not executable! BAD!",
        }

        still_running = []
        finished = []
        for candidate in self.__running_tasks__:
            if candidate.is_running():
                still_running.append(candidate)
                continue

            # no longer running; log that fact for convenience
            finished.append(candidate)
            status = candidate.get_exit_status()
            label = self.__get_task_label__(candidate)
            if status in outcome_formats:
                log.info(outcome_formats[status] % (label))
            elif status in fatal_formats:
                raise Exception(fatal_formats[status] % (label, TaskInProcess.RUN_TASK_EXE))
            else:
                log.info('"%s" failed with exit status %s!' % (label, status))

        # removal is deferred so the list is not modified while iterating it
        for done in finished:
            self.__running_tasks__.remove(done)

        return still_running
Ejemplo n.º 12
0
Archivo: tmsd.py Proyecto: bdotdub/norc
 def request_kill(self):
     """Set the daemon status to STATUS_KILLREQUESTED and raise the
     __break_tasks_to_run_loop__ flag on this instance."""
     log.info("tmsd Sending kill request...")
     daemon_status = self.get_daemon_status()
     daemon_status.set_status(tms_models.NorcDaemonStatus.STATUS_KILLREQUESTED)
     self.__break_tasks_to_run_loop__ = True
Ejemplo n.º 13
0
Archivo: tmsd.py Proyecto: bdotdub/norc
 def __do_run__(self):
     """Main daemon loop

     Sets the daemon status to STATUS_RUNNING, then loops: logs any
     status change, refreshes the daemon status object, and reacts to
     stop, kill and pause requests.

     Returns True when the daemon ends gracefully (stop requested with no
     tasks left, or kill requested with no tasks running) and False after
     running tasks were interrupted because of a kill request.
     """
     # NOTE(review): the loop body shown here ends with the pause branch;
     # the rest of the loop is presumably outside this excerpt -- confirm.
     log.info("%s %s..." % (self.get_name(), str(self.get_daemon_status())))
     if settings.DEBUG:
         log.info("WARNING: settings.DEBUG is True: daemon will gobble up memory b/c django stores SQL queries.")
     self.get_daemon_status().set_status(tms_models.NorcDaemonStatus.STATUS_RUNNING)
     # remembered so that status transitions can be logged once per change
     last_status = self.get_daemon_status().get_status()
     while True:
         if not last_status == self.get_daemon_status().get_status():
             log.info("tmsd state changed: %s -> %s" % (last_status, self.get_daemon_status().get_status()))
             last_status = self.get_daemon_status().get_status()
         # replace the held status object with whatever thwart_cache() returns
         self.__set_daemon_status__(self.get_daemon_status().thwart_cache())  # see note in this method definition
         if self.get_daemon_status().is_stop_requested() or self.get_daemon_status().is_being_stopped():
             # don't kick off more tasks, but wait for those running to finish on their own
             self.get_daemon_status().set_status(tms_models.NorcDaemonStatus.STATUS_STOPINPROGRESS)
             num_running_tasks = self.get_num_running_tasks()
             if num_running_tasks == 0:
                 log.info("tmsd stop requested and no more tasks. Ending gracefully.")
                 self.get_daemon_status().set_status(tms_models.NorcDaemonStatus.STATUS_ENDEDGRACEFULLY)
                 return True
             else:
                 log.info("tmsd stop requested; waiting for %s task(s) to finish." % (num_running_tasks))
         elif self.get_daemon_status().is_kill_requested() or self.get_daemon_status().is_being_killed():
             running_tasks = self.get_running_tasks()
             if len(running_tasks) == 0:
                 # nothing to interrupt, so this counts as a graceful end
                 log.info("tmsd kill requested but no tasks running. Ending gracefully.")
                 self.get_daemon_status().set_status(tms_models.NorcDaemonStatus.STATUS_ENDEDGRACEFULLY)
                 return True
             else:
                 log.info(
                     "tmsd kill requested; interrupting %s task(s) and stopping immediately." % (len(running_tasks))
                 )
                 self.get_daemon_status().set_status(tms_models.NorcDaemonStatus.STATUS_KILLINPROGRESS)
                 for running_task in running_tasks:
                     # There's no way to actually interrupt python threads
                     # mark the task as ended in error, and leave it up to
                     # main() to call SIGKILL on this process.
                     log.info("interrupting task '%s'." % (running_task), indent_chars=4)
                     try:
                         running_task.interrupt()
                     except Exception, e:
                         # best effort: keep going with the remaining tasks
                         log.error("Could not interrupt Task '%s'" % (running_task), e)
                 self.get_daemon_status().set_status(tms_models.NorcDaemonStatus.STATUS_KILLED)
                 return False
         elif self.get_daemon_status().is_pause_requested():
             log.info("tmsd pause requested.  Will just sit here.")
             self.get_daemon_status().set_status(tms_models.NorcDaemonStatus.STATUS_PAUSED)
Ejemplo n.º 14
0
 def start_task(self):
     """Run the next SQS Task in a new process and register it with this
     daemon's running tasks."""
     log.info("Starting the next SQS Task in new process")
     daemon_status = self.get_daemon_status()
     runner = SQSTaskInProcess(daemon_status, self.__log_dir__)
     runner.run()
     self.__add_running_task__(runner)