def run(self):
    """Main run loop of the Scheduler.

    Each pass: handle any pending request; if RUNNING, claim up to
    SCHEDULER_LIMIT unclaimed cron and simple schedules; then wait and
    re-read the request field from the database.  After the loop exits,
    release every schedule this scheduler still claims.
    """
    self.timer.start()
    while not Status.is_final(self.status):
        if self.request:
            self.handle_request()
        if self.status == Status.RUNNING:
            # Clean up orphaned schedules and undead schedulers.
            # Schedule.objects.orphaned().update(scheduler=None)
            # CronSchedule.objects.orphaned().update(scheduler=None)
            cron = CronSchedule.objects.unclaimed()[:SCHEDULER_LIMIT]
            simple = Schedule.objects.unclaimed()[:SCHEDULER_LIMIT]
            for schedule in itertools.chain(cron, simple):
                self.log.info('Claiming %s.' % schedule)
                schedule.scheduler = self
                schedule.save()
                self.add(schedule)
        if not Status.is_final(self.status):
            self.wait()
            # Re-read the request from the DB so external processes
            # can signal this scheduler.
            self.request = Scheduler.objects.get(pk=self.pk).request
    # Shutdown: release all schedules still claimed by this scheduler
    # so another scheduler can pick them up.
    cron = self.cronschedules.all()
    simple = self.schedules.all()
    claimed_count = cron.count() + simple.count()
    if claimed_count > 0:
        self.log.info('Cleaning up %s schedules.' % claimed_count)
        cron.update(scheduler=None)
        simple.update(scheduler=None)
def run(self):
    """Core executor function: the Executor's main loop.

    Each pass: handle any pending request; if RUNNING, start queued
    instances up to the concurrency limit; if STOPPING and idle, end;
    reap finished child processes; then wait and re-read the request
    field from the database.
    """
    if settings.BACKUP_SYSTEM:
        # One extra thread so log backups don't block instance work.
        self.pool = ThreadPool(self.concurrent + 1)
    self.log.info("%s is now running on host %s." % (self, self.host))
    if self.log.debug_on:
        self.resource_reporter = Thread(target=self.report_resources)
        self.resource_reporter.daemon = True
        self.resource_reporter.start()
    # Main loop.
    while not Status.is_final(self.status):
        if self.request:
            self.handle_request()
        if self.status == Status.RUNNING:
            # Fill up to the concurrency limit from the queue.
            while len(self.processes) < self.concurrent:
                instance = self.queue.pop()
                if instance:
                    self.start_instance(instance)
                else:
                    break
        elif self.status == Status.STOPPING and len(self.processes) == 0:
            self.set_status(Status.ENDED)
            self.save(safe=True)
        # Clean up completed tasks before iterating.  Copy the items
        # (list(...) works on both Python 2 and 3, unlike .items()[:])
        # so entries can be deleted during iteration.
        for pid, p in list(self.processes.items()):
            p.poll()
        self.log.debug("Checking pid %s: return code %s." %
            (pid, p.returncode))
        if p.returncode is not None:  # BUGFIX: was `not ... == None`.
            i = type(p.instance).objects.get(pk=p.instance.pk)
            if i.status == Status.CREATED:
                # Never even began running; suspend this executor to
                # avoid burning through more instances.
                self.log.info(
                    ("%s failed to initialize properly; " +
                    "entering suspension to avoid more errors.") % i)
                self.set_status(Status.SUSPENDED)
                self.save()
            if not Status.is_final(i.status):
                self.log.info(("%s ended with invalid " +
                    "status %s, changing to ERROR.") %
                    (i, Status.name(i.status)))
                i.status = Status.ERROR
                i.save()
            self.log.info("%s ended with status %s." %
                (i, Status.name(i.status)))
            del self.processes[pid]
            if settings.BACKUP_SYSTEM:
                self.pool.queueTask(self.backup_instance_log, [i])
        if not Status.is_final(self.status):
            self.wait(EXECUTOR_PERIOD)
            # Pick up externally-made requests from the database.
            self.request = Executor.objects.get(pk=self.pk).request
def run(self):
    """Core executor function: the Executor's main loop.

    Handles requests, launches queued instances up to the concurrency
    limit, reaps finished child processes, then waits and re-reads its
    request field from the database.
    """
    if settings.BACKUP_SYSTEM:
        # Extra thread keeps log backups from starving instance work.
        self.pool = ThreadPool(self.concurrent + 1)
    self.log.info("%s is now running on host %s." % (self, self.host))
    if self.log.debug_on:
        self.resource_reporter = Thread(target=self.report_resources)
        self.resource_reporter.daemon = True
        self.resource_reporter.start()
    # Main loop.
    while not Status.is_final(self.status):
        if self.request:
            self.handle_request()
        if self.status == Status.RUNNING:
            # Start instances until the limit is hit or queue is empty.
            while len(self.processes) < self.concurrent:
                instance = self.queue.pop()
                if instance:
                    self.start_instance(instance)
                else:
                    break
        elif self.status == Status.STOPPING and len(self.processes) == 0:
            self.set_status(Status.ENDED)
            self.save(safe=True)
        # Clean up completed tasks before iterating.  Copy the items
        # (list(...) is Python 2/3 safe, unlike .items()[:]) so
        # entries can be removed while iterating.
        for pid, p in list(self.processes.items()):
            p.poll()
            self.log.debug(
                "Checking pid %s: return code %s." % (pid, p.returncode))
            if p.returncode is not None:  # BUGFIX: was `not ... == None`.
                i = type(p.instance).objects.get(pk=p.instance.pk)
                if i.status == Status.CREATED:
                    # Never began running; suspend to avoid more errors.
                    self.log.info(("%s failed to initialize properly; " +
                        "entering suspension to avoid more errors.") % i)
                    self.set_status(Status.SUSPENDED)
                    self.save()
                if not Status.is_final(i.status):
                    self.log.info(("%s ended with invalid " +
                        "status %s, changing to ERROR.") %
                        (i, Status.name(i.status)))
                    i.status = Status.ERROR
                    i.save()
                self.log.info("%s ended with status %s." %
                    (i, Status.name(i.status)))
                del self.processes[pid]
                if settings.BACKUP_SYSTEM:
                    self.pool.queueTask(self.backup_instance_log, [i])
        if not Status.is_final(self.status):
            self.wait(EXECUTOR_PERIOD)
            # Pick up externally-made requests from the database.
            self.request = Executor.objects.get(pk=self.pk).request
def make_status_color(status, alive):
    """Map a status name (and a liveness string) to a CSS class name.

    `alive` is expected to be a string like "True"/"False" (or empty);
    it only matters for the RUNNING status.
    """
    if status in map(Status.name, Status.GROUPS("error")):
        return "status_error"
    if status in map(Status.name, Status.GROUPS("succeeded")):
        return "status_good"
    if status == "RUNNING":
        # Equivalent to the longer `not alive or (alive and ...)` form.
        good = not alive or alive == "True"
        return "status_good" if good else "status_error"
    if status in map(Status.name, Status.GROUPS("active")):
        return "status_good"
    return "status_error"
def tearDown(self):
    """Kill the executor if it is still running, then verify that both
    its worker thread and heart thread have terminated."""
    executor = self._executor
    if not Status.is_final(executor.status):
        executor.make_request(Request.KILL)
    # Give each thread up to 7 seconds to wind down.
    self.thread.join(7)
    executor.heart.join(7)
    for t in (self.thread, executor.heart):
        assert not t.isAlive()
def report_resources(self):
    """Log resource usage every 10 seconds until this daemon reaches a
    final status.  Intended to run in a background (debug) thread."""
    while not Status.is_final(self.status):
        time.sleep(10)
        for who in (resource.RUSAGE_SELF, resource.RUSAGE_CHILDREN):
            self.log.debug(resource.getrusage(who))
def cleanup(self):
    """Cleanup code that should be executed last."""
    self.ended = datetime.utcnow()
    self.save()
    final_name = Status.name(self.status)
    self.log.info("Task ended with status %s." % final_name)
    self.log.stop_redirect()
def test_kill(self):
    """A running executor should end up KILLED after a KILL request."""
    self.thread.start()
    is_running = lambda: self.executor.status == Status.RUNNING
    wait_until(is_running, 3)
    self.assertEqual(self.executor.status, Status.RUNNING)
    self.executor.make_request(Request.KILL)
    is_done = lambda: Status.is_final(self.executor.status)
    wait_until(is_done, 5)
    self.assertEqual(self.executor.status, Status.KILLED)
def tearDown(self):
    """Kill the executor if it is still running, then verify that its
    threads terminate.

    BUGFIX: removed a stray Python 2 debug `print` of make_request()'s
    return value, which polluted test output and was inconsistent with
    the sibling tearDown implementation.
    """
    if not Status.is_final(self._executor.status):
        self._executor.make_request(Request.KILL)
    self.thread.join(7)
    self._executor.heart.join(7)
    assert not self.thread.isAlive()
    assert not self._executor.heart.isAlive()
class executors(BaseReport):
    """Report definition over Executor daemons."""

    # Object lookup helpers used by the report framework.
    get = lambda id: get_object(Executor, id=id)
    get_all = lambda: Executor.objects.all()
    # How 'since' filtering and ordering apply to the base queryset.
    since_filter = date_ended_since
    order_by = date_ended_order
    # Drill-down data sets, keyed by detail name.
    details = {
        'instances': lambda id, since=None, status=None, **kws:
            executors.get(id).instances.since(since).status_in(status),
    }
    headers = ['ID', 'Queue', 'Queue Type', 'Host', 'PID', 'Running',
        'Succeeded', 'Failed', 'Started', 'Ended', 'Status']
    # Per-column value extractors.
    data = {
        'queue': lambda obj, **kws: obj.queue.name,
        'queue_type': lambda obj, **kws: obj.queue.__class__.__name__,
        'running': lambda obj, since, **kws:
            obj.instances.since(since).status_in('running').count(),
        'succeeded': lambda obj, since, **kws:
            obj.instances.since(since).status_in('succeeded').count(),
        'failed': lambda obj, since, **kws:
            obj.instances.since(since).status_in('failed').count(),
        'status': lambda obj, **kws: Status.name(obj.status),
        'ended': date_ended_getter,
        'heartbeat': lambda obj, **kws: obj.heartbeat,
        'alive': lambda obj, **kws: str(obj.is_alive()),
    }
def run(self, instance):
    """Enqueue instances for all nodes that don't have dependencies.

    Creates a JobNodeInstance for every node, immediately queues the
    ones that can run, then polls once per second until either some
    node instance fails (returns False) or every node instance has
    finished (returns True).
    """
    for node in self.nodes.all():
        node_instance = JobNodeInstance.objects.create(
            node=node, job_instance=instance)
        if node_instance.can_run():
            instance.schedule.queue.push(node_instance)
    # Busy-wait for completion; dependent nodes are enqueued elsewhere
    # as their parents finish.
    while True:
        complete = True
        for ni in instance.nodis.all():
            if not Status.is_final(ni.status):
                complete = False
            elif Status.is_failure(ni.status):
                # A finished node failed: the whole job fails.
                return False
        if complete and instance.nodis.count() == self.nodes.count():
            return True
        time.sleep(1)
def test_start_stop(self):
    """Executor should go CREATED -> RUNNING -> ENDED on a STOP."""
    self.assertEqual(self.executor.status, Status.CREATED)
    self.thread.start()
    is_running = lambda: self.executor.status == Status.RUNNING
    wait_until(is_running, 3)
    self.assertEqual(self.executor.status, Status.RUNNING)
    self.executor.make_request(Request.STOP)
    is_done = lambda: Status.is_final(self.executor.status)
    wait_until(is_done, 5)
    self.assertEqual(self.executor.status, Status.ENDED)
def is_alive(self):
    """Whether the Daemon is still alive.

    A Daemon counts as alive when its status is not final and its last
    heartbeat occurred within the last HEARTBEAT_FAILED seconds.
    """
    if Status.is_final(self.status):
        return False
    cutoff = datetime.utcnow() - timedelta(seconds=HEARTBEAT_FAILED)
    # Truthy-chain preserved: a missing heartbeat short-circuits.
    return self.heartbeat and self.heartbeat > cutoff
def make_request(self, request):
    """This method is how the request field should always be set.

    Returns True if the request was recorded, False otherwise.

    BUGFIX: invalid requests are now rejected by returning False
    instead of an `assert` — asserts are stripped under `python -O`,
    and the other make_request implementation in this codebase already
    uses the return-False convention.
    """
    if request not in self.VALID_REQUESTS:
        return False
    if not Status.is_final(self.status):
        self.request = request
        self.save()
        # Wake the daemon's wait loop so it sees the request promptly.
        self.flag.set()
        return True
    else:
        return False
def start(self):
    """Run this node instance, then enqueue any runnable children.

    After the underlying instance finishes (even if it raised), each
    child node of this node is checked; children whose dependencies
    are now satisfied are pushed onto the job's queue.
    """
    try:
        AbstractInstance.start(self)
    finally:
        ji = self.job_instance
        # Only propagate to children if this node did not fail.
        if not Status.is_failure(self.status):
            for sub_dep in self.node.sub_deps.all():
                sub_node = sub_dep.child
                ni = sub_node.nis.get(job_instance=ji)
                if ni.can_run():
                    self.job_instance.schedule.queue.push(ni)
def make_request(self, request):
    """This method is how the request field should always be set."""
    # Reject anything outside the whitelist, and refuse to set a
    # request on a daemon that has already reached a final status.
    valid = request in self.VALID_REQUESTS
    if valid and not Status.is_final(self.status):
        self.request = request
        self.save()
        self.flag.set()
        return True
    return False
def run(self):
    """Main run loop of the Scheduler.

    Each pass: handle any pending request; if RUNNING, claim up to
    SCHEDULER_LIMIT unclaimed cron and simple schedules; then wait and
    re-read the request field from the database.
    """
    self.timer.start()
    while not Status.is_final(self.status):
        if self.request:
            self.handle_request()
        if self.status == Status.RUNNING:
            # Clean up orphaned schedules and undead schedulers.
            # Schedule.objects.orphaned().update(scheduler=None)
            # CronSchedule.objects.orphaned().update(scheduler=None)
            cron = CronSchedule.objects.unclaimed()[:SCHEDULER_LIMIT]
            simple = Schedule.objects.unclaimed()[:SCHEDULER_LIMIT]
            for schedule in itertools.chain(cron, simple):
                self.log.info('Claiming %s.' % schedule)
                schedule.scheduler = self
                schedule.save()
                self.add(schedule)
        if not Status.is_final(self.status):
            self.wait()
            # Re-read the request from the DB so external processes
            # can signal this scheduler.
            self.request = Scheduler.objects.get(pk=self.pk).request
def heart_run(self):
    """Method to be run by the heart thread."""
    while not Status.is_final(self.status):
        began = time.time()
        self.heartbeat = datetime.utcnow()
        self.save(safe=True)
        # Compensate for slow saves so beats stay roughly
        # HEARTBEAT_PERIOD apart even when the database lags.
        remaining = HEARTBEAT_PERIOD - (time.time() - began)
        if remaining > 0:
            self.heart.flag.wait(remaining)
        self.heart.flag.clear()
class instances(BaseReport):
    """Report over task instances, across every instance model."""

    get = _parse_content_ids
    # Union of all instance models' objects.
    get_all = lambda: MultiQuerySet(*[i.objects.all()
        for i in INSTANCE_MODELS])
    since_filter = date_ended_since
    order_by = date_ended_order
    headers = ['ID#', 'Type', 'Source', 'Started', 'Ended', 'Status']
    # Per-column value extractors.
    data = {
        # Globally unique id: "<content_type_id>_<object_id>".
        'id': lambda obj, **kws: '%s_%s' %
            (ContentType.objects.get_for_model(obj).id, obj.id),
        'id#': lambda obj, **kws: obj.id,
        'type': lambda obj, **kws: type(obj).__name__,
        'source': lambda i, **kws: i.source or 'n/a',
        # i.source if hasattr(i, 'source') else 'n/a',
        'status': lambda obj, **kws: Status.name(obj.status),
    }
def start(self):
    """Initialize the instance, run it, and exit the process.

    Sets up signal handlers and an optional timeout alarm, marks the
    instance RUNNING, calls run(), translates the outcome into a final
    status, then exits with code 0 on SUCCESS, 1 otherwise.

    BUGFIX: the specific NorcInterruptException / NorcTimeoutException
    handlers were listed AFTER `except Exception`, which makes them
    unreachable whenever those exceptions subclass Exception.  Specific
    handlers now come first (safe under either class hierarchy).
    """
    if not hasattr(self, 'log'):
        self.log = make_log(self.log_path)
    if self.status != Status.CREATED:
        self.log.error("Can't start an instance more than once.")
        return
    # Signal setup fails in non-main threads (e.g. unit tests).
    try:
        for signum in [signal.SIGINT, signal.SIGTERM]:
            signal.signal(signum, self.kill_handler)
    except ValueError:
        pass
    if self.timeout > 0:
        signal.signal(signal.SIGALRM, self.timeout_handler)
        signal.alarm(self.timeout)
    self.log.info('Starting %s.' % self)
    self.log.start_redirect()
    self.status = Status.RUNNING
    self.started = datetime.utcnow()
    self.save()
    try:
        success = self.run()
    except NorcInterruptException:
        self.log.error("Interrupt signal received!")
        self.status = Status.INTERRUPTED
    except NorcTimeoutException:
        self.log.info("Task timed out! Ceasing execution.")
        self.status = Status.TIMEDOUT
    except Exception:
        self.log.error("Task failed with an exception!", trace=True)
        self.status = Status.ERROR
    else:
        # A None return counts as success.
        if success or success is None:
            self.status = Status.SUCCESS
        else:
            self.status = Status.FAILURE
    finally:
        self.ended = datetime.utcnow()
        self.save()
        self.log.info("Task ended with status %s." %
            Status.name(self.status))
        self.log.stop_redirect()
        self.log.close()
        sys.exit(0 if self.status == Status.SUCCESS else 1)
class schedulers(BaseReport):
    """Report definition over Scheduler daemons."""

    get = lambda id: get_object(Scheduler, id=id)
    get_all = lambda: Scheduler.objects.all()
    since_filter = date_ended_since
    # Default ordering: most recently started first.
    order_by = lambda data, o: data.order_by(o if o else '-started')
    # Drill-down data sets, keyed by detail name.
    details = {
        'schedules': lambda id, **kws: Schedule.objects.filter(scheduler__id=id)
    }
    headers = ['ID', 'Host', "PID", "Claimed", 'Started', 'Ended', "Status"]
    # Per-column value extractors.
    data = {
        # Total schedules (simple + cron) currently claimed.
        "claimed": lambda obj, **kws:
            obj.schedules.count() + obj.cronschedules.count(),
        'ended': date_ended_getter,
        'alive': lambda obj, **kws: str(obj.is_alive()),
        'status': lambda obj, **kws: Status.name(obj.status),
    }
def set_status(self, status):
    """Sets the status with a log message. Does not save."""
    old_name = Status.name(self.status)
    new_name = Status.name(status)
    self.log.info("Changing state from %s to %s." % (old_name, new_name))
    self.status = status
class Scheduler(AbstractDaemon):
    """Scheduling process for handling Schedules.

    Takes unclaimed Schedules from the database and adds their next
    instance to a timer.  At the appropriate time, the instance is
    added to its queue and the Schedule is updated.

    Idea: Split this up into two threads, one which continuously handles
    already claimed schedules, the other which periodically polls the DB
    for new schedules.
    """

    class Meta:
        app_label = 'core'
        db_table = 'norc_scheduler'

    objects = QuerySetManager()

    class QuerySet(AbstractDaemon.QuerySet):
        """Custom manager/query set for Scheduler."""

        def undead(self):
            """Schedulers that are active but the heart isn't beating."""
            cutoff = datetime.utcnow() - timedelta(seconds=HEARTBEAT_FAILED)
            return self.status_in("active").filter(heartbeat__lt=cutoff)

    # All the statuses Schedulers can have.  See constants.py.
    VALID_STATUSES = [
        Status.CREATED,
        Status.RUNNING,
        Status.PAUSED,
        Status.ENDED,
        Status.ERROR,
    ]

    VALID_REQUESTS = [
        Request.STOP,
        Request.KILL,
        Request.PAUSE,
        Request.RESUME,
        Request.RELOAD,
    ]

    # The status of this scheduler.
    status = PositiveSmallIntegerField(default=Status.CREATED,
        choices=[(s, Status.name(s)) for s in VALID_STATUSES])

    # A state-change request.
    request = PositiveSmallIntegerField(null=True,
        choices=[(r, Request.name(r)) for r in VALID_REQUESTS])

    def __init__(self, *args, **kwargs):
        AbstractDaemon.__init__(self, *args, **kwargs)
        # Timer that fires _enqueue at each schedule's next run time.
        self.timer = MultiTimer()
        # In-memory set of schedules currently held by the timer.
        self.set = set()

    def start(self):
        """Starts the Scheduler."""
        # Temporary check until multiple schedulers is supported fully.
        if Scheduler.objects.alive().count() > 0:
            print "Cannot run more than one scheduler at a time."
            return
        AbstractDaemon.start(self)

    def run(self):
        """Main run loop of the Scheduler.

        Handles requests, claims up to SCHEDULER_LIMIT unclaimed cron
        and simple schedules per pass, then waits and re-reads its
        request field from the database.
        """
        self.timer.start()
        while not Status.is_final(self.status):
            if self.request:
                self.handle_request()
            if self.status == Status.RUNNING:
                # Clean up orphaned schedules and undead schedulers.
                # Schedule.objects.orphaned().update(scheduler=None)
                # CronSchedule.objects.orphaned().update(scheduler=None)
                cron = CronSchedule.objects.unclaimed()[:SCHEDULER_LIMIT]
                simple = Schedule.objects.unclaimed()[:SCHEDULER_LIMIT]
                for schedule in itertools.chain(cron, simple):
                    self.log.info('Claiming %s.' % schedule)
                    schedule.scheduler = self
                    schedule.save()
                    self.add(schedule)
            if not Status.is_final(self.status):
                self.wait()
                # Re-read the request so external processes can signal
                # this scheduler through the database.
                self.request = Scheduler.objects.get(pk=self.pk).request

    def wait(self):
        """Waits on the flag."""
        AbstractDaemon.wait(self, SCHEDULER_PERIOD)

    def clean_up(self):
        # Stop the timer thread, then release all claimed schedules so
        # another scheduler can pick them up.
        self.timer.cancel()
        self.timer.join()
        cron = self.cronschedules.all()
        simple = self.schedules.all()
        claimed_count = cron.count() + simple.count()
        if claimed_count > 0:
            self.log.info("Cleaning up %s schedules." % claimed_count)
            cron.update(scheduler=None)
            simple.update(scheduler=None)

    def handle_request(self):
        """Called when a request is found."""
        # Clear request immediately.
        request = self.request
        self.request = None
        self.save()
        self.log.info("Request received: %s" % Request.name(request))
        if request == Request.PAUSE:
            self.set_status(Status.PAUSED)
        elif request == Request.RESUME:
            if self.status != Status.PAUSED:
                self.log.info("Must be paused to resume; clearing request.")
            else:
                self.set_status(Status.RUNNING)
        elif request == Request.STOP:
            self.set_status(Status.ENDED)
        elif request == Request.KILL:
            # NOTE(review): Status.KILLED is not in VALID_STATUSES above
            # — confirm this is intentional.
            self.set_status(Status.KILLED)
        elif request == Request.RELOAD:
            changed = MultiQuerySet(Schedule, CronSchedule)
            # NOTE(review): `unfinished` is accessed without calling it;
            # presumably a property on the queryset — confirm.
            changed = changed.objects.unfinished.filter(changed=True,
                scheduler=self)
            # Drop timer tasks whose schedule changed; each is re-added
            # below with its fresh database state.
            for item in self.timer.tasks:
                s = item[2][0]
                if s in changed:
                    self.log.info("Removing outdated: %s" % s)
                    self.timer.tasks.remove(item)
                    self.set.remove(s)
                    s = type(s).objects.get(pk=s.pk)
            for s in changed:
                self.log.info("Adding updated: %s" % s)
                self.add(s)
            changed.update(changed=False)

    def add(self, schedule):
        """Adds the schedule to the timer."""
        try:
            if schedule in self.set:
                self.log.error(
                    "%s has already been added to this Scheduler." % schedule)
                return
            self.log.debug('Adding %s to timer for %s.'
                % (schedule, schedule.next))
            self.timer.add_task(schedule.next, self._enqueue, [schedule])
            self.set.add(schedule)
        except:
            # Anything wrong with the schedule (e.g. a bad next time)
            # gets it soft-deleted rather than crashing the scheduler.
            self.log.error("Invalid schedule %s found, deleting." % schedule)
            schedule.soft_delete()

    def _enqueue(self, schedule):
        """Called by the timer to add an instance to the queue."""
        # Re-fetch to pick up concurrent edits or deletion.
        updated_schedule = get_object(type(schedule), pk=schedule.pk)
        self.set.remove(schedule)
        if updated_schedule == None or updated_schedule.deleted:
            self.log.info('%s was removed.' % schedule)
            if updated_schedule != None:
                updated_schedule.scheduler = None
                updated_schedule.save()
            return
        schedule = updated_schedule
        if not schedule.scheduler == self:
            self.log.info("%s is no longer tied to this Scheduler." % schedule)
            # self.set.remove(schedule)
            return
        instance = Instance.objects.create(task=schedule.task,
            schedule=schedule)
        self.log.info('Enqueuing %s.' % instance)
        schedule.queue.push(instance)
        schedule.enqueued()
        # Re-arm the timer unless this schedule has no runs left.
        if not schedule.finished():
            self.add(schedule)
        else:
            schedule.scheduler = None
            schedule.save()

    @property
    def log_path(self):
        return 'schedulers/scheduler-%s' % self.id
class Executor(AbstractDaemon):
    """Executors are responsible for the running of instances.

    Executors have a single queue that they pull instances from.  There
    can (and in many cases should) be more than one Executor running
    for a single queue.
    """

    class Meta:
        app_label = 'core'
        db_table = 'norc_executor'

    objects = QuerySetManager()

    class QuerySet(AbstractDaemon.QuerySet):

        def for_queue(self, q):
            """Executors pulling from the given queue."""
            return self.filter(queue_id=q.id,
                queue_type=ContentType.objects.get_for_model(q).id)

    @property
    def instances(self):
        """A custom implementation of the Django related manager pattern."""
        return MultiQuerySet(*[i.objects.filter(executor=self.pk)
            for i in INSTANCE_MODELS])

    # All the statuses executors can have.  See constants.py.
    VALID_STATUSES = [
        Status.CREATED,
        Status.RUNNING,
        Status.PAUSED,
        Status.STOPPING,
        Status.ENDED,
        Status.ERROR,
        Status.KILLED,
        Status.SUSPENDED,
    ]

    VALID_REQUESTS = [
        Request.STOP,
        Request.KILL,
        Request.PAUSE,
        Request.RESUME,
    ]

    # The status of this executor.
    status = PositiveSmallIntegerField(default=Status.CREATED,
        choices=[(s, Status.name(s)) for s in VALID_STATUSES])

    # A state-change request.
    request = PositiveSmallIntegerField(null=True,
        choices=[(r, Request.name(r)) for r in VALID_REQUESTS])

    # The queue this executor draws task instances from.
    queue_type = ForeignKey(ContentType)
    queue_id = PositiveIntegerField()
    queue = GenericForeignKey('queue_type', 'queue_id')

    # The number of things that can be run concurrently.
    concurrent = IntegerField()

    @property
    def alive(self):
        # Alive == RUNNING with a sufficiently recent heartbeat.
        return self.status == Status.RUNNING and self.heartbeat > \
            datetime.utcnow() - timedelta(seconds=HEARTBEAT_FAILED)

    def __init__(self, *args, **kwargs):
        AbstractDaemon.__init__(self, *args, **kwargs)
        # Maps child pid -> Popen object (with .instance attached).
        self.processes = {}

    def run(self):
        """Core executor function.

        Handles requests, launches queued instances up to the
        concurrency limit, reaps finished child processes, then waits
        and re-reads its request field from the database.
        """
        if settings.BACKUP_SYSTEM:
            self.pool = ThreadPool(self.concurrent + 1)
        self.log.info("%s is now running on host %s." % (self, self.host))
        if self.log.debug_on:
            self.resource_reporter = Thread(target=self.report_resources)
            self.resource_reporter.daemon = True
            self.resource_reporter.start()
        # Main loop.
        while not Status.is_final(self.status):
            if self.request:
                self.handle_request()
            if self.status == Status.RUNNING:
                while len(self.processes) < self.concurrent:
                    # self.log.debug("Popping instance...")
                    instance = self.queue.pop()
                    if instance:
                        # self.log.debug("Popped %s" % instance)
                        self.start_instance(instance)
                    else:
                        # self.log.debug("No instance in queue.")
                        break
            elif self.status == Status.STOPPING and len(self.processes) == 0:
                self.set_status(Status.ENDED)
                self.save(safe=True)
            # Clean up completed tasks before iterating.
            for pid, p in self.processes.items()[:]:
                p.poll()
                self.log.debug("Checking pid %s: return code %s." %
                    (pid, p.returncode))
                if not p.returncode == None:
                    # Re-fetch the instance to see its final DB state.
                    i = type(p.instance).objects.get(pk=p.instance.pk)
                    if i.status == Status.CREATED:
                        # The child process exited before the instance
                        # ever started running.
                        self.log.info(
                            ("%s fail to initialize properly; " +
                            "entering suspension to avoid more errors.") % i)
                        self.set_status(Status.SUSPENDED)
                        self.save()
                    if not Status.is_final(i.status):
                        self.log.info(("%s ended with invalid " +
                            "status %s, changing to ERROR.") %
                            (i, Status.name(i.status)))
                        i.status = Status.ERROR
                        i.save()
                    self.log.info("%s ended with status %s." %
                        (i, Status.name(i.status)))
                    del self.processes[pid]
                    if settings.BACKUP_SYSTEM:
                        self.pool.queueTask(self.backup_instance_log, [i])
            if not Status.is_final(self.status):
                self.wait(EXECUTOR_PERIOD)
                # Pick up externally-made requests from the database.
                self.request = Executor.objects.get(pk=self.pk).request

    def clean_up(self):
        # Wait for any in-flight log backups before exiting.
        if settings.BACKUP_SYSTEM:
            self.pool.joinAll()

    def report_resources(self):
        # Debug-only thread: log rusage every 10 seconds.
        while not Status.is_final(self.status):
            time.sleep(10)
            rself = resource.getrusage(resource.RUSAGE_SELF)
            self.log.debug(rself)
            rchildren = resource.getrusage(resource.RUSAGE_CHILDREN)
            self.log.debug(rchildren)

    def start_instance(self, instance):
        """Starts a given instance in a new process."""
        instance.executor = self
        instance.save()
        self.log.info("Starting %s..." % instance)
        # p = Process(target=self.execute, args=[instance.start])
        # p.start()
        ct = ContentType.objects.get_for_model(instance)
        # The child's stdout/stderr go straight to the instance's log.
        f = make_log(instance.log_path).file
        p = Popen('norc_taskrunner --ct_pk %s --target_pk %s' %
            (ct.pk, instance.pk), stdout=f, stderr=STDOUT, shell=True)
        p.instance = instance
        self.processes[p.pid] = p

    # This should be used in 2.6, but with subprocess it's not possible.
    # def execute(self, func):
    #     """Calls a function, then sets the flag after its execution."""
    #     try:
    #         func()
    #     finally:
    #         self.flag.set()

    def handle_request(self):
        """Called when a request is found."""
        # Clear request immediately.
        request = self.request
        self.request = None
        self.save()
        self.log.info("Request received: %s" % Request.name(request))
        if request == Request.PAUSE:
            self.set_status(Status.PAUSED)
        elif request == Request.RESUME:
            if self.status not in (Status.PAUSED, Status.SUSPENDED):
                self.log.info("Must be paused or suspended to resume; " +
                    "clearing request.")
            else:
                self.set_status(Status.RUNNING)
        elif request == Request.STOP:
            # Graceful: stop pulling work, end once children finish.
            self.set_status(Status.STOPPING)
        elif request == Request.KILL:
            # for p in self.processes.values():
            #     p.terminate()
            for pid, p in self.processes.iteritems():
                self.log.info("Killing process for %s." % p.instance)
                os.kill(pid, signal.SIGTERM)
            self.set_status(Status.KILLED)

    def backup_instance_log(self, instance):
        self.log.info("Attempting upload of log for %s..." % instance)
        if backup_log(instance.log_path):
            self.log.info("Completed upload of log for %s." % instance)
        else:
            self.log.info("Failed to upload log for %s." % instance)

    @property
    def log_path(self):
        return 'executors/executor-%s' % self.id
class AbstractInstance(Model):
    """One instance (run) of a Task."""

    __metaclass__ = MetaInstance

    class Meta:
        app_label = 'core'
        abstract = True

    class QuerySet(query.QuerySet):

        def since(self, since):
            # Accepts a parseable string or a datetime; falsy disables.
            if type(since) == str:
                since = parse_since(since)
            return self.exclude(ended__lt=since) if since else self

        def status_in(self, statuses):
            # Accepts a status-group name or an iterable of statuses.
            if isinstance(statuses, basestring):
                statuses = Status.GROUPS(statuses)
            return self.filter(status__in=statuses) if statuses else self

        def from_queue(self, q):
            # Instances whose executor pulls from queue q.
            return self.filter(executor__queue_id=q.id,
                executor__queue_type=ContentType.objects.get_for_model(q).id)

    VALID_STATUSES = [
        Status.CREATED,
        Status.RUNNING,
        Status.SUCCESS,
        Status.FAILURE,
        Status.HANDLED,
        Status.ERROR,
        Status.TIMEDOUT,
        Status.INTERRUPTED,
    ]

    # The status of the execution.
    status = PositiveSmallIntegerField(default=Status.CREATED,
        choices=[(s, Status.name(s)) for s in VALID_STATUSES])

    # When the instance was added to a queue.
    enqueued = DateTimeField(default=datetime.utcnow)

    # When the instance started.
    started = DateTimeField(null=True)

    # When the instance ended.
    ended = DateTimeField(null=True)

    # The executor of this instance.
    executor = ForeignKey('core.Executor', null=True,
        related_name='_%(class)ss')

    # Code revision the instance ran under (see get_revision()).
    revision = ForeignKey('core.Revision', null=True,
        related_name='_%(class)ss')

    def start(self):
        """Performs initialization before calling run().

        Sets up signal handlers and an optional timeout alarm, marks
        the instance RUNNING, runs it, records the final status, then
        exits the process (0 on SUCCESS, 1 otherwise).
        """
        if not hasattr(self, 'log'):
            self.log = make_log(self.log_path)
        if self.status != Status.CREATED:
            self.log.error("Can't start an instance more than once.")
            return
        # Signal setup fails in non-main threads (e.g. unit tests).
        try:
            for signum in [signal.SIGINT, signal.SIGTERM]:
                signal.signal(signum, self.kill_handler)
        except ValueError:
            pass
        if self.timeout > 0:
            signal.signal(signal.SIGALRM, self.timeout_handler)
            signal.alarm(self.timeout)
        self.log.info('Starting %s.' % self)
        self.log.start_redirect()
        self.status = Status.RUNNING
        self.revision = self.get_revision()
        self.started = datetime.utcnow()
        self.save()
        try:
            success = self.run()
        except Exception:
            self.log.error("Task failed with an exception!", trace=True)
            self.status = Status.FAILURE
        else:
            # A None return counts as success.
            if success or success == None:
                self.status = Status.SUCCESS
            else:
                self.status = Status.FAILURE
        finally:
            self.run_finally()
            self.cleanup()
            sys.exit(0 if self.status == Status.SUCCESS else 1)

    def run_finally(self):
        # Cancel any pending timeout alarm before the final block.
        signal.alarm(0)
        if hasattr(self, "finally_") and callable(self.finally_):
            # Give the final block its own bounded alarm.
            signal.signal(signal.SIGALRM, self.finally_timeout_handler)
            signal.alarm(FINALLY_TIMEOUT)
            self.log.info("Executing final block...")
            self.finally_()
            signal.alarm(0)

    def cleanup(self):
        """Cleanup code that should be executed last."""
        self.ended = datetime.utcnow()
        self.save()
        self.log.info("Task ended with status %s." % Status.name(self.status))
        self.log.stop_redirect()

    def run(self):
        """Runs the instance."""
        raise NotImplementedError

    def kill_handler(self, *args, **kwargs):
        # SIGINT/SIGTERM: record INTERRUPTED and hard-exit.
        self.log.info("Interrupt signal received!")
        self.status = Status.INTERRUPTED
        self.run_finally()
        self.cleanup()
        self._nuke()

    def timeout_handler(self, *args, **kwargs):
        # SIGALRM from the task timeout: record TIMEDOUT and hard-exit.
        self.log.error("Task timed out!")
        self.status = Status.TIMEDOUT
        self.run_finally()
        self.cleanup()
        self._nuke()

    def finally_timeout_handler(self, *args, **kwargs):
        # SIGALRM while the final block itself was running.
        self.log.error("Final block timed out!")
        self.status = Status.TIMEDOUT
        self.cleanup()
        self._nuke()

    def _nuke(self, *args, **kwargs):
        # os._exit skips Python-level cleanup; used from signal context.
        self.log.info("Ceasing execution.")
        os._exit(1)

    def get_revision(self):
        """Hook to provide revision tracking functionality for instances.

        Defaults to None because other instances implementations might
        not have task attributes.
        """
        return None

    @property
    def timeout(self):
        # Subclasses override; 0 disables the alarm in start().
        return 0

    @property
    def source(self):
        return None

    @property
    def queue(self):
        # None when this instance has no executor (AttributeError path).
        try:
            return self.executor.queue
        except AttributeError:
            return None

    @property
    def log_path(self):
        return "instances/%s/%s" % (type(self).__name__, self.id)

    @property
    def log_url(self):
        return ('/logs/instances/%s_%s/' %
            (ContentType.objects.get_for_model(self).id, self.id))

    def __unicode__(self):
        return u"[%s #%s]" % (type(self).__name__, self.id)

    __repr__ = __unicode__
def main():
    """Entry point for the norc_control command-line tool.

    Sends a state-change request (or marks an error as handled) to an
    executor, a scheduler, or every active daemon on a host.
    """
    usage = "norc_control [executor | scheduler | host] <id | host> " + \
        "--[stop | kill | pause | resume | reload | handle] [--wait]"

    def bad_args(message):
        # Report the problem plus usage, then exit with status 2.
        print message
        print usage
        sys.exit(2)

    parser = OptionParser(usage)
    parser.add_option("-s", "--stop", action="store_true", default=False,
        help="Send a stop request.")
    parser.add_option("-k", "--kill", action="store_true", default=False,
        help="Send a kill request.")
    parser.add_option("-p", "--pause", action="store_true", default=False,
        help="Send a pause request.")
    parser.add_option("-u", "--resume", action="store_true", default=False,
        help="Send an resume request.")
    parser.add_option("-r", "--reload", action="store_true", default=False,
        help="Send an reload request to a Scheduler.")
    parser.add_option("--handle", action="store_true", default=False,
        help="Change the object's status to HANDLED.")
    parser.add_option("-f", "--force", action="store_true", default=False,
        help="Force the request to be made..")
    parser.add_option("-w", "--wait", action="store_true", default=False,
        help="Wait until the request has been responded to.")
    options, args = parser.parse_args()
    if len(args) != 2:
        bad_args("Invalid number of arguments.")
    # Exactly one action flag (a request or --handle) must be given.
    requests = filter(lambda a: getattr(options, a.lower()),
        Request.NAMES.values())
    if len(requests) + (1 if options.handle else 0) != 1:
        bad_args("Must request exactly one action.")
    if not options.handle:
        request = requests[0]
        req = getattr(Request, request)
    cls = None
    if args[0] in EXECUTOR_KEYWORDS:
        cls = Executor
    elif args[0] in SCHEDULER_KEYWORDS:
        cls = Scheduler
    elif args[0] in HOST_KEYWORDS:
        # Host mode: broadcast the request to all active daemons on
        # the named host.
        if options.handle:
            bad_args("Can't perform handle operation on multiple daemons.")
        daemons = MultiQuerySet(Executor, Scheduler).objects.all()
        daemons = daemons.filter(host=args[1]).status_in("active")
        if not options.force:
            # Skip daemons that already have a pending request.
            daemons = daemons.filter(request=None)
        for d in daemons:
            if req in d.VALID_REQUESTS:
                d.make_request(req)
                print "%s was sent a %s request." % (d, request)
        if options.wait:
            _wait(daemons, req)
    else:
        bad_args("Invalid keyword '%s'." % args[0])
    if cls:
        # Single-daemon mode: look up by integer id.
        name = cls.__name__
        try:
            obj_id = int(args[1])
        except ValueError:
            bad_args("Invalid id '%s'; must be an integer." % args[1])
        try:
            d = cls.objects.get(id=obj_id)
        except cls.DoesNotExist:
            print "Could not find a(n) %s with id=%s" % (name, obj_id)
        else:
            if options.handle:
                if controls.handle(d):
                    print "The error state of %s was marked as handled." % d
                else:
                    print "%s isn't in an error state." % d
            elif Status.is_final(d.status) and not options.force:
                print "%s is already in a final state." % d
            elif d.request == None or options.force:
                d.make_request(req)
                print "%s was sent a %s request." % (d, request)
                if options.wait:
                    _wait([d], req)
            else:
                print "%s already has request %s." % \
                    (d, Request.name(d.request))
def status_in(self, statuses):
    """Filter by status group. Takes a string or iterable."""
    # A string names a predefined group; resolve it to statuses.
    if isinstance(statuses, basestring):
        statuses = Status.GROUPS(statuses)
    if not statuses:
        return self
    return self.filter(status__in=statuses)
def start(self):
    """Starts the daemon. Does initialization then calls run().

    Guards against re-running, installs signal handlers, starts the
    heart thread, runs the main loop, then performs an orderly
    shutdown (clean_up, heart join, final save, optional log backup).
    """
    # Refuse to reuse a daemon object that has already been run.
    if self.status != Status.CREATED:
        print "Can't start a %s that's already been run." \
            % type(self).__name__
        return
    # Ensure a database id and a log exist before anything else.
    if not hasattr(self, 'id'):
        self.save()
    if not hasattr(self, 'log'):
        self.log = make_log(self.log_path)
    if settings.DEBUG:
        self.log.info(
            "WARNING, DEBUG is True, which means Django " +
            "will gobble memory as it stores all database queries.")
    # This try block is needed because the unit tests run daemons
    # in threads, which breaks signals.
    try:
        for signum in (signal.SIGINT, signal.SIGTERM):
            signal.signal(signum, self.signal_handler)
    except ValueError:
        pass
    self.log.start_redirect()
    self.log.info("%s initialized; starting..." % self)
    self.status = Status.RUNNING
    self.heartbeat = self.started = datetime.utcnow()
    self.save()
    self.heart.start()
    try:
        self.run()
    except Exception:
        self.set_status(Status.ERROR)
        self.log.error("An internal error occured!", trace=True)
    else:
        # A clean exit from run() still needs a final status.
        if not Status.is_final(self.status):
            self.set_status(Status.ENDED)
    finally:
        self.log.info("Shutting down...")
        try:
            self.clean_up()
        except:
            self.log.error("Clean up function failed.", trace=True)
            if not Status.is_final(self.status):
                self.set_status(Status.ERROR)
        # Wake and join the heart thread so it stops beating.
        self.heart.flag.set()
        self.heart.join()
        self.ended = datetime.utcnow()
        self.save()
        if settings.BACKUP_SYSTEM:
            self.log.info('Backing up log file...')
            try:
                if backup_log(self.log_path):
                    self.log.info('Completed log backup.')
                else:
                    self.log.error('Failed to backup log.')
            except:
                self.log.error('Failed to backup log.', trace=True)
        self.log.info('%s has been shut down successfully.' % self)
        self.log.stop_redirect()
        self.log.close()
def tearDown(self):
    """Kill the scheduler if still running and verify its threads stop."""
    sched = self._scheduler
    if not Status.is_final(sched.status):
        sched.make_request(Request.KILL)
    self.thread.join(15)
    for t in (self.thread, sched.timer):
        assert not t.isAlive()
def start(self):
    """Starts the daemon. Does initialization then calls run().

    Guards against re-running, installs signal handlers, starts the
    heart thread, runs the main loop, then performs an orderly
    shutdown (clean_up, heart join, final save, optional log backup).
    """
    # Refuse to reuse a daemon object that has already been run.
    if self.status != Status.CREATED:
        print "Can't start a %s that's already been run." \
            % type(self).__name__
        return
    # Ensure a database id and a log exist before anything else.
    if not hasattr(self, 'id'):
        self.save()
    if not hasattr(self, 'log'):
        self.log = make_log(self.log_path)
    if settings.DEBUG:
        self.log.info("WARNING, DEBUG is True, which means Django " +
            "will gobble memory as it stores all database queries.")
    # This try block is needed because the unit tests run daemons
    # in threads, which breaks signals.
    try:
        for signum in (signal.SIGINT, signal.SIGTERM):
            signal.signal(signum, self.signal_handler)
    except ValueError:
        pass
    self.log.start_redirect()
    self.log.info("%s initialized; starting..." % self)
    self.status = Status.RUNNING
    self.heartbeat = self.started = datetime.utcnow()
    self.save()
    self.heart.start()
    try:
        self.run()
    except Exception:
        self.set_status(Status.ERROR)
        self.log.error("An internal error occured!", trace=True)
    else:
        # A clean exit from run() still needs a final status.
        if not Status.is_final(self.status):
            self.set_status(Status.ENDED)
    finally:
        self.log.info("Shutting down...")
        try:
            self.clean_up()
        except:
            self.log.error("Clean up function failed.", trace=True)
            if not Status.is_final(self.status):
                self.set_status(Status.ERROR)
        # Wake and join the heart thread so it stops beating.
        self.heart.flag.set()
        self.heart.join()
        self.ended = datetime.utcnow()
        self.save()
        if settings.BACKUP_SYSTEM:
            self.log.info('Backing up log file...')
            try:
                if backup_log(self.log_path):
                    self.log.info('Completed log backup.')
                else:
                    self.log.error('Failed to backup log.')
            except:
                self.log.error('Failed to backup log.', trace=True)
        self.log.info('%s has been shut down successfully.' % self)
        self.log.stop_redirect()
        self.log.close()
def status_in(self, statuses):
    """Restrict the queryset to the given statuses.

    Accepts a status-group name (string) or an iterable of statuses;
    a falsy value leaves the queryset unchanged.
    """
    group = Status.GROUPS(statuses) \
        if isinstance(statuses, basestring) else statuses
    if not group:
        return self
    return self.filter(status__in=group)