Example #1
0
 def run(self):
     """Main run loop of the Scheduler.
     
     Repeatedly handles pending requests, claims unclaimed schedules
     while RUNNING, then waits and re-reads its request field from the
     database.  On exit, releases every schedule it still holds.
     """
     self.timer.start()
     
     while not Status.is_final(self.status):
         if self.request:
             self.handle_request()
         
         if self.status == Status.RUNNING:
             # Clean up orphaned schedules and undead schedulers.
             # Schedule.objects.orphaned().update(scheduler=None)
             # CronSchedule.objects.orphaned().update(scheduler=None)
             
             # Claim up to SCHEDULER_LIMIT of each schedule type.
             cron = CronSchedule.objects.unclaimed()[:SCHEDULER_LIMIT]
             simple = Schedule.objects.unclaimed()[:SCHEDULER_LIMIT]
             for schedule in itertools.chain(cron, simple):
                 self.log.info('Claiming %s.' % schedule)
                 schedule.scheduler = self
                 schedule.save()
                 self.add(schedule)
         
         if not Status.is_final(self.status):
             self.wait()
             # Re-read the request from the DB so requests made by
             # other processes are picked up.
             self.request = Scheduler.objects.get(pk=self.pk).request
     
     # Shutdown: release any schedules still claimed by this scheduler.
     cron = self.cronschedules.all()
     simple = self.schedules.all()
     claimed_count = cron.count() + simple.count()
     if claimed_count > 0:
         self.log.info('Cleaning up %s schedules.' % claimed_count)
         cron.update(scheduler=None)
         simple.update(scheduler=None)
Example #2
0
    def run(self):
        """Core executor function.

        Main daemon loop: handles state-change requests, fills open
        process slots from the queue while RUNNING, reaps finished
        child processes, and refreshes its request field from the
        database before each wait.
        """
        if settings.BACKUP_SYSTEM:
            # One extra pool worker beyond the concurrency limit,
            # used for uploading instance logs.
            self.pool = ThreadPool(self.concurrent + 1)
        self.log.info("%s is now running on host %s." % (self, self.host))

        if self.log.debug_on:
            # Background daemon thread that periodically logs rusage.
            self.resource_reporter = Thread(target=self.report_resources)
            self.resource_reporter.daemon = True
            self.resource_reporter.start()

        # Main loop.
        while not Status.is_final(self.status):
            if self.request:
                self.handle_request()

            if self.status == Status.RUNNING:
                # Fill every open slot with an instance from the queue.
                while len(self.processes) < self.concurrent:
                    # self.log.debug("Popping instance...")
                    instance = self.queue.pop()
                    if instance:
                        # self.log.debug("Popped %s" % instance)
                        self.start_instance(instance)
                    else:
                        # self.log.debug("No instance in queue.")
                        break

            elif self.status == Status.STOPPING and len(self.processes) == 0:
                # All children finished; the stop can now complete.
                self.set_status(Status.ENDED)
                self.save(safe=True)

            # Clean up completed tasks before iterating.
            # (items() returns a list in Python 2; [:] copies it so we
            # can delete from self.processes while iterating.)
            for pid, p in self.processes.items()[:]:
                p.poll()
                self.log.debug("Checking pid %s: return code %s." %
                               (pid, p.returncode))
                # NOTE(review): prefer `p.returncode is not None`.
                if not p.returncode == None:
                    # Re-fetch the instance for an up-to-date status.
                    i = type(p.instance).objects.get(pk=p.instance.pk)
                    if i.status == Status.CREATED:
                        # Child died before marking itself RUNNING.
                        self.log.info(
                            ("%s fail to initialize properly; " +
                             "entering suspension to avoid more errors.") % i)
                        self.set_status(Status.SUSPENDED)
                        self.save()
                    if not Status.is_final(i.status):
                        self.log.info(("%s ended with invalid " +
                                       "status %s, changing to ERROR.") %
                                      (i, Status.name(i.status)))
                        i.status = Status.ERROR
                        i.save()
                    self.log.info("%s ended with status %s." %
                                  (i, Status.name(i.status)))
                    del self.processes[pid]
                    if settings.BACKUP_SYSTEM:
                        self.pool.queueTask(self.backup_instance_log, [i])

            if not Status.is_final(self.status):
                self.wait(EXECUTOR_PERIOD)
                # Pick up externally-made requests from the database.
                self.request = Executor.objects.get(pk=self.pk).request
Example #3
0
 def run(self):
     """Core executor function.
     
     Main daemon loop: handles state-change requests, fills open
     process slots from the queue while RUNNING, reaps finished child
     processes, and refreshes its request field from the database
     before each wait.
     """
     if settings.BACKUP_SYSTEM:
         # One extra pool worker beyond the concurrency limit, used
         # for uploading instance logs.
         self.pool = ThreadPool(self.concurrent + 1)
     self.log.info("%s is now running on host %s." % (self, self.host))
     
     if self.log.debug_on:
         # Background daemon thread that periodically logs rusage.
         self.resource_reporter = Thread(target=self.report_resources)
         self.resource_reporter.daemon = True
         self.resource_reporter.start()
     
     # Main loop.
     while not Status.is_final(self.status):
         if self.request:
             self.handle_request()
         
         if self.status == Status.RUNNING:
             # Fill every open slot with an instance from the queue.
             while len(self.processes) < self.concurrent:
                 instance = self.queue.pop()
                 if instance:
                     self.start_instance(instance)
                 else:
                     # Queue is empty; try again next iteration.
                     break
         
         elif self.status == Status.STOPPING and len(self.processes) == 0:
             # All children finished; the stop can now complete.
             self.set_status(Status.ENDED)
             self.save(safe=True)
         
         # Clean up completed tasks before iterating.  items() returns
         # a list in Python 2; [:] copies it so entries can be deleted
         # from self.processes while iterating.
         for pid, p in self.processes.items()[:]:
             p.poll()
             self.log.debug(
                 "Checking pid %s: return code %s." % (pid, p.returncode))
             # Fixed: identity comparison with None (was `not ... == None`).
             if p.returncode is not None:
                 # Re-fetch the instance for an up-to-date status.
                 i = type(p.instance).objects.get(pk=p.instance.pk)
                 if i.status == Status.CREATED:
                     # Child died before marking itself RUNNING.
                     self.log.info(("%s fail to initialize properly; " +
                         "entering suspension to avoid more errors.") % i)
                     self.set_status(Status.SUSPENDED)
                     self.save()
                 if not Status.is_final(i.status):
                     self.log.info(("%s ended with invalid " +
                         "status %s, changing to ERROR.") %
                         (i, Status.name(i.status)))
                     i.status = Status.ERROR
                     i.save()
                 self.log.info("%s ended with status %s." %
                     (i, Status.name(i.status)))
                 del self.processes[pid]
                 if settings.BACKUP_SYSTEM:
                     self.pool.queueTask(self.backup_instance_log, [i])
         
         if not Status.is_final(self.status):
             self.wait(EXECUTOR_PERIOD)
             # Pick up externally-made requests from the database.
             self.request = Executor.objects.get(pk=self.pk).request
Example #4
0
def make_status_color(status, alive):
    """Map a status name (plus an aliveness flag) to a CSS class.

    status: a Status name string, e.g. "RUNNING".
    alive: falsy when liveness is unknown, otherwise the string "True"
        or "False" (string comparison, not boolean) -- TODO confirm
        the caller always passes a string here.

    Returns "status_good" or "status_error".
    """
    if status in map(Status.name, Status.GROUPS("error")):
        return "status_error"
    elif status in map(Status.name, Status.GROUPS("succeeded")):
        return "status_good"
    elif status == "RUNNING":
        # RUNNING is only "good" if the daemon is alive or liveness is
        # unknown.  Simplified from `not alive or (alive and alive ==
        # "True")`: the second operand already implied `alive`.
        if not alive or alive == "True":
            return "status_good"
        else:
            return "status_error"
    elif status in map(Status.name, Status.GROUPS("active")):
        return "status_good"
    else:
        return "status_error"
Example #5
0
 def tearDown(self):
     """Stop the executor if it is still active and join its threads."""
     # Only request a kill if the executor hasn't already finished.
     if not Status.is_final(self._executor.status):
         self._executor.make_request(Request.KILL)
     # Give each thread up to 7 seconds to exit cleanly.
     self.thread.join(7)
     self._executor.heart.join(7)
     assert not self.thread.isAlive()
     assert not self._executor.heart.isAlive()
Example #6
0
File: executor.py Project: tml/norc
 def report_resources(self):
     """Log resource usage every ten seconds until a final status.
     
     Debug-logs getrusage() for this process and for its reaped
     children; intended to run in a background daemon thread.
     """
     while not Status.is_final(self.status):
         time.sleep(10)
         # Same order as before: self first, then children.
         for who in (resource.RUSAGE_SELF, resource.RUSAGE_CHILDREN):
             self.log.debug(resource.getrusage(who))
Example #7
0
 def cleanup(self):
     """Cleanup code that should be executed last.
     
     Records the end time, persists it, logs the final status, and
     stops log redirection.
     """
     self.ended = datetime.utcnow()
     self.save()
     final_name = Status.name(self.status)
     self.log.info("Task ended with status %s." % final_name)
     self.log.stop_redirect()
Example #8
0
 def report_resources(self):
     """Log resource usage every ten seconds until a final status."""
     while not Status.is_final(self.status):
         time.sleep(10)
         # Usage for this process itself...
         rself = resource.getrusage(resource.RUSAGE_SELF)
         self.log.debug(rself)
         # ...and for its reaped child processes.
         rchildren = resource.getrusage(resource.RUSAGE_CHILDREN)
         self.log.debug(rchildren)
Example #9
0
 def test_kill(self):
     """A KILL request should end a running executor with status KILLED."""
     self.thread.start()
     # Wait (up to 3s) for the executor to come up.
     wait_until(lambda: self.executor.status == Status.RUNNING, 3)
     self.assertEqual(self.executor.status, Status.RUNNING)
     self.executor.make_request(Request.KILL)
     # The kill should drive the status to a final value within 5s.
     wait_until(lambda: Status.is_final(self.executor.status), 5)
     self.assertEqual(self.executor.status, Status.KILLED)
Example #10
0
 def tearDown(self):
     """Kill the executor if still active and join its threads."""
     if not Status.is_final(self._executor.status):
         # Python 2 print statement: echoes whether the kill request
         # was accepted (make_request returns True/False).
         print self._executor.make_request(Request.KILL)
     # Give each thread up to 7 seconds to exit cleanly.
     self.thread.join(7)
     self._executor.heart.join(7)
     assert not self.thread.isAlive()
     assert not self._executor.heart.isAlive()
Example #11
0
 def test_kill(self):
     """KILL request on a running executor should yield status KILLED."""
     self.thread.start()
     # Block (up to 3s) until the executor reports RUNNING.
     wait_until(lambda: self.executor.status == Status.RUNNING, 3)
     self.assertEqual(self.executor.status, Status.RUNNING)
     self.executor.make_request(Request.KILL)
     # A final status should be reached within 5s of the kill.
     wait_until(lambda: Status.is_final(self.executor.status), 5)
     self.assertEqual(self.executor.status, Status.KILLED)
Example #12
0
class executors(BaseReport):
    """Report definition for Executor daemons (consumed by BaseReport)."""
    
    # Single-object and whole-table accessors used by the report machinery.
    get = lambda id: get_object(Executor, id=id)
    get_all = lambda: Executor.objects.all()
    
    # Filtering/ordering hooks shared with the other reports.
    since_filter = date_ended_since
    order_by = date_ended_order
    
    # Drill-down: instances run by a given executor, optionally
    # filtered by time window and status group.
    details = {
        'instances': lambda id, since=None, status=None, **kws:
            executors.get(id).instances.since(since).status_in(status),
    }
    headers = ['ID', 'Queue', 'Queue Type', 'Host', 'PID', 'Running',
        'Succeeded', 'Failed', 'Started', 'Ended', 'Status']
    # Per-column value getters; each takes the object plus report kwargs.
    data = {
        'queue': lambda obj, **kws: obj.queue.name,
        'queue_type': lambda obj, **kws: obj.queue.__class__.__name__,
        'running': lambda obj, since, **kws:
            obj.instances.since(since).status_in('running').count(),
        'succeeded': lambda obj, since, **kws:
            obj.instances.since(since).status_in('succeeded').count(),
        'failed': lambda obj, since, **kws:
            obj.instances.since(since).status_in('failed').count(),
        'status': lambda obj, **kws: Status.name(obj.status),
        'ended': date_ended_getter,
        'heartbeat': lambda obj, **kws: obj.heartbeat,
        'alive': lambda obj, **kws: str(obj.is_alive()),
    }
Example #13
0
 def run(self, instance):
     """Enqueue instances for all nodes that don't have dependencies."""
     # Create a node instance per node, queueing those ready to run.
     for job_node in self.nodes.all():
         ni = JobNodeInstance.objects.create(
             node=job_node, job_instance=instance)
         if ni.can_run():
             instance.schedule.queue.push(ni)
     # Poll until every node instance has reached a final status.
     while True:
         all_final = True
         for node_inst in instance.nodis.all():
             if not Status.is_final(node_inst.status):
                 all_final = False
             elif Status.is_failure(node_inst.status):
                 # Any failed node fails the whole job.
                 return False
         if all_final and instance.nodis.count() == self.nodes.count():
             return True
         time.sleep(1)
Example #14
0
 def test_start_stop(self):    
     """A STOP request should take the executor from RUNNING to ENDED."""
     self.assertEqual(self.executor.status, Status.CREATED)
     self.thread.start()
     # Wait (up to 3s) for the executor to come up.
     wait_until(lambda: self.executor.status == Status.RUNNING, 3)
     self.assertEqual(self.executor.status, Status.RUNNING)
     self.executor.make_request(Request.STOP)
     # A clean stop should reach a final status within 5s.
     wait_until(lambda: Status.is_final(self.executor.status), 5)
     self.assertEqual(self.executor.status, Status.ENDED)
Example #15
0
 def test_start_stop(self):    
     """STOP on a running executor should end with status ENDED."""
     self.assertEqual(self.executor.status, Status.CREATED)
     self.thread.start()
     # Block (up to 3s) until the executor reports RUNNING.
     wait_until(lambda: self.executor.status == Status.RUNNING, 3)
     self.assertEqual(self.executor.status, Status.RUNNING)
     self.executor.make_request(Request.STOP)
     # A final status should be reached within 5s of the stop.
     wait_until(lambda: Status.is_final(self.executor.status), 5)
     self.assertEqual(self.executor.status, Status.ENDED)
Example #16
0
File: job.py Project: tml/norc
 def run(self, instance):
     """Enqueue instances for all nodes that don't have dependencies."""
     # One JobNodeInstance per node; push those that can run now.
     for node in self.nodes.all():
         node_instance = JobNodeInstance.objects.create(
             node=node,
             job_instance=instance)
         if node_instance.can_run():
             instance.schedule.queue.push(node_instance)
     # Poll until every node instance reaches a final status.
     while True:
         complete = True
         for ni in instance.nodis.all():
             if not Status.is_final(ni.status):
                 complete = False
             elif Status.is_failure(ni.status):
                 # Any failed node fails the whole job.
                 return False
         if complete and instance.nodis.count() == self.nodes.count():
             return True
         time.sleep(1)
Example #17
0
 def is_alive(self):
     """Whether the Daemon is still alive.
     
     A Daemon is defined as alive if its status is not final and its
     last heartbeat was within the last HEARTBEAT_FAILED seconds.
     
     """
     if Status.is_final(self.status):
         return False
     if not self.heartbeat:
         # Preserve the original `and`-chain result for a falsy
         # heartbeat (e.g. None rather than False).
         return self.heartbeat
     cutoff = datetime.utcnow() - timedelta(seconds=HEARTBEAT_FAILED)
     return self.heartbeat > cutoff
Example #18
0
 def is_alive(self):
     """Whether the Daemon is still alive.
     
     A Daemon is defined as alive if its status is not final and its
     last heartbeat was within the last HEARTBEAT_FAILED seconds.
     
     """
     # Note: the and-chain returns self.heartbeat itself (possibly
     # None) when the heartbeat is falsy; callers rely on truthiness.
     return not Status.is_final(self.status) \
         and self.heartbeat and self.heartbeat > \
         datetime.utcnow() - timedelta(seconds=HEARTBEAT_FAILED)
Example #19
0
File: daemon.py Project: tml/norc
 def make_request(self, request):
     """This method is how the request field should always be set.
     
     Records the request, saves, and wakes the daemon via its flag.
     Returns True on success; False if the request is invalid or the
     daemon has already reached a final status.  Validation previously
     used `assert`, which is silently stripped under `python -O`; an
     explicit check (matching the project's other make_request
     implementation) is used instead.
     """
     if request not in self.VALID_REQUESTS:
         return False
     if not Status.is_final(self.status):
         self.request = request
         self.save()
         self.flag.set()
         return True
     else:
         return False
Example #20
0
File: job.py Project: tml/norc
 def start(self):
     """Run this node instance, then queue dependents that become runnable."""
     try:
         AbstractInstance.start(self)
     finally:
         # Whatever happened during the run, release child nodes --
         # unless this node's status is an outright failure.
         ji = self.job_instance
         if not Status.is_failure(self.status):
             for sub_dep in self.node.sub_deps.all():
                 sub_node = sub_dep.child
                 ni = sub_node.nis.get(job_instance=ji)
                 if ni.can_run():
                     self.job_instance.schedule.queue.push(ni)
Example #21
0
 def make_request(self, request):
     """This method is how the request field should always be set.
     
     Records the request, saves, and wakes the daemon via its flag.
     Returns True on success; False if the request is invalid or the
     daemon has already reached a final status.
     """
     # Idiomatic membership test (was `if not request in ...`).
     if request not in self.VALID_REQUESTS:
         return False
     if not Status.is_final(self.status):
         self.request = request
         self.save()
         self.flag.set()
         return True
     else:
         return False
Example #22
0
 def start(self):
     """Run this node instance; afterwards push any runnable children."""
     try:
         AbstractInstance.start(self)
     finally:
         # Runs even if start() raised: release dependent nodes unless
         # this node's status is an outright failure.
         ji = self.job_instance
         if not Status.is_failure(self.status):
             for sub_dep in self.node.sub_deps.all():
                 sub_node = sub_dep.child
                 ni = sub_node.nis.get(job_instance=ji)
                 if ni.can_run():
                     self.job_instance.schedule.queue.push(ni)
Example #23
0
 def make_request(self, request):
     """This method is how the request field should always be set.
     
     Returns True if the request was recorded; False if it was invalid
     or the daemon has already reached a final status.
     """
     if not request in self.VALID_REQUESTS:
         return False
     if not Status.is_final(self.status):
         self.request = request
         self.save()
         # Wake the daemon's wait() so the request is handled promptly.
         self.flag.set()
         return True
     else:
         return False
Example #24
0
    def run(self):
        """Main run loop of the Scheduler.

        Handles requests, claims unclaimed schedules while RUNNING,
        then waits and refreshes its request field from the database.
        """
        self.timer.start()

        while not Status.is_final(self.status):
            if self.request:
                self.handle_request()

            if self.status == Status.RUNNING:
                # Clean up orphaned schedules and undead schedulers.
                # Schedule.objects.orphaned().update(scheduler=None)
                # CronSchedule.objects.orphaned().update(scheduler=None)

                # Claim up to SCHEDULER_LIMIT of each schedule type.
                cron = CronSchedule.objects.unclaimed()[:SCHEDULER_LIMIT]
                simple = Schedule.objects.unclaimed()[:SCHEDULER_LIMIT]
                for schedule in itertools.chain(cron, simple):
                    self.log.info('Claiming %s.' % schedule)
                    schedule.scheduler = self
                    schedule.save()
                    self.add(schedule)
            if not Status.is_final(self.status):
                self.wait()
                # Pick up externally-made requests from the database.
                self.request = Scheduler.objects.get(pk=self.pk).request
Example #25
0
    def heart_run(self):
        """Method to be run by the heart thread.

        Persists a fresh heartbeat timestamp roughly once every
        HEARTBEAT_PERIOD seconds until the daemon reaches a final
        status.
        """
        while not Status.is_final(self.status):
            began = time.time()

            self.heartbeat = datetime.utcnow()
            self.save(safe=True)

            # In case the database is slow and saving takes longer
            # than HEARTBEAT_PERIOD to complete.
            remaining = HEARTBEAT_PERIOD - (time.time() - began)
            if remaining > 0:
                self.heart.flag.wait(remaining)
                self.heart.flag.clear()
Example #26
0
 def heart_run(self):
     """Method to be run by the heart thread.
     
     Saves a heartbeat timestamp roughly every HEARTBEAT_PERIOD
     seconds until the daemon reaches a final status.
     """
     while not Status.is_final(self.status):
         start = time.time()
         
         self.heartbeat = datetime.utcnow()
         self.save(safe=True)
         
         # In case the database is slow and saving takes longer
         # than HEARTBEAT_PERIOD to complete.
         wait = HEARTBEAT_PERIOD - (time.time() - start)
         if wait > 0:
             self.heart.flag.wait(wait)
             self.heart.flag.clear()
Example #27
0
class instances(BaseReport):
    """Report over every instance model, merged via MultiQuerySet."""
    
    # NOTE(review): get uses _parse_content_ids -- presumably parses
    # "<content_type_id>_<object_id>" keys like 'id' below; confirm.
    get = _parse_content_ids
    get_all = lambda: MultiQuerySet(*[i.objects.all()
        for i in INSTANCE_MODELS])
    since_filter = date_ended_since
    order_by = date_ended_order
    
    headers = ['ID#', 'Type', 'Source', 'Started', 'Ended', 'Status']
    # Per-column value getters; each takes the object plus report kwargs.
    data = {
        # Globally unique key: "<content_type_id>_<object_id>".
        'id': lambda obj, **kws: '%s_%s' %
            (ContentType.objects.get_for_model(obj).id, obj.id),
        'id#': lambda obj, **kws: obj.id,
        'type': lambda obj, **kws: type(obj).__name__,
        'source': lambda i, **kws: i.source or 'n/a',
            # i.source if hasattr(i, 'source') else 'n/a',
        'status': lambda obj, **kws: Status.name(obj.status),
    }
Example #28
0
File: task.py Project: tml/norc
 def start(self):
     """Run this instance, handling signals, status, and log redirection.
     
     Installs kill/timeout signal handlers, marks the instance
     RUNNING, invokes run(), records the resulting status, and finally
     exits the process: 0 on SUCCESS, 1 otherwise.
     """
     if not hasattr(self, 'log'):
         self.log = make_log(self.log_path)
     if self.status != Status.CREATED:
         self.log.error("Can't start an instance more than once.")
         return
     try:
         # Signal handlers can only be set in the main thread.
         for signum in [signal.SIGINT, signal.SIGTERM]:
             signal.signal(signum, self.kill_handler)
     except ValueError:
         pass
     if self.timeout > 0:
         signal.signal(signal.SIGALRM, self.timeout_handler)
         signal.alarm(self.timeout)
     self.log.info('Starting %s.' % self)
     self.log.start_redirect()
     self.status = Status.RUNNING
     self.started = datetime.utcnow()
     self.save()
     try:
         success = self.run()
     # Fixed: the specific Norc exceptions must be caught before the
     # generic Exception handler; they subclass Exception, so listing
     # `except Exception` first made these handlers unreachable.
     except NorcInterruptException:
         self.log.error("Interrupt signal received!")
         self.status = Status.INTERRUPTED
     except NorcTimeoutException:
         self.log.info("Task timed out!  Ceasing execution.")
         self.status = Status.TIMEDOUT
     except Exception:
         self.log.error("Task failed with an exception!", trace=True)
         self.status = Status.ERROR
     else:
         # A run() that returns nothing counts as success.
         if success or success is None:
             self.status = Status.SUCCESS
         else:
             self.status = Status.FAILURE
     finally:
         self.ended = datetime.utcnow()
         self.save()
         self.log.info("Task ended with status %s." %
             Status.name(self.status))
         self.log.stop_redirect()
         self.log.close()
         sys.exit(0 if self.status == Status.SUCCESS else 1)
Example #29
0
class schedulers(BaseReport):
    """Report definition for Scheduler daemons (consumed by BaseReport)."""
    
    # Single-object and whole-table accessors used by the report machinery.
    get = lambda id: get_object(Scheduler, id=id)
    get_all = lambda: Scheduler.objects.all()
    
    since_filter = date_ended_since
    # Default ordering: most recently started first.
    order_by = lambda data, o: data.order_by(o if o else '-started')
    
    # Drill-down: simple schedules claimed by a given scheduler.
    details = {
        'schedules': lambda id, **kws:
            Schedule.objects.filter(scheduler__id=id)
    }
    headers = ['ID', 'Host', "PID", "Claimed", 'Started', 'Ended', "Status"]
    # Per-column value getters; each takes the object plus report kwargs.
    data = {
        # Total schedules (simple + cron) currently claimed.
        "claimed": lambda obj, **kws:
            obj.schedules.count() + obj.cronschedules.count(),
        'ended': date_ended_getter,    
        'alive': lambda obj, **kws: str(obj.is_alive()),
        'status': lambda obj, **kws: Status.name(obj.status),
    }
Example #30
0
 def set_status(self, status):
     """Sets the status with a log message.  Does not save."""
     old_name, new_name = Status.name(self.status), Status.name(status)
     self.log.info("Changing state from %s to %s." % (old_name, new_name))
     self.status = status
Example #31
0
class Scheduler(AbstractDaemon):
    """Scheduling process for handling Schedules.
    
    Takes unclaimed Schedules from the database and adds their next
    instance to a timer.  At the appropriate time, the instance is
    added to its queue and the Schedule is updated.
    
    Idea: Split this up into two threads, one which continuously handles
    already claimed schedules, the other which periodically polls the DB
    for new schedules.
    
    """
    class Meta:
        app_label = 'core'
        db_table = 'norc_scheduler'

    objects = QuerySetManager()

    class QuerySet(AbstractDaemon.QuerySet):
        """Custom manager/query set for Scheduler."""
        def undead(self):
            """Schedulers that are active but the heart isn't beating."""
            cutoff = datetime.utcnow() - timedelta(seconds=HEARTBEAT_FAILED)
            return self.status_in("active").filter(heartbeat__lt=cutoff)

    # All the statuses Schedulers can have.  See constants.py.
    VALID_STATUSES = [
        Status.CREATED,
        Status.RUNNING,
        Status.PAUSED,
        Status.ENDED,
        Status.ERROR,
    ]

    # All the requests a Scheduler will respond to.
    VALID_REQUESTS = [
        Request.STOP,
        Request.KILL,
        Request.PAUSE,
        Request.RESUME,
        Request.RELOAD,
    ]

    # The status of this scheduler.
    status = PositiveSmallIntegerField(default=Status.CREATED,
                                       choices=[(s, Status.name(s))
                                                for s in VALID_STATUSES])

    # A state-change request.
    request = PositiveSmallIntegerField(null=True,
                                        choices=[(r, Request.name(r))
                                                 for r in VALID_REQUESTS])

    def __init__(self, *args, **kwargs):
        """Initialize the non-model runtime state (timer, claimed set)."""
        AbstractDaemon.__init__(self, *args, **kwargs)
        self.timer = MultiTimer()
        self.set = set()

    def start(self):
        """Starts the Scheduler."""
        # Temporary check until multiple schedulers is supported fully.
        if Scheduler.objects.alive().count() > 0:
            print "Cannot run more than one scheduler at a time."
            return
        AbstractDaemon.start(self)

    def run(self):
        """Main run loop of the Scheduler.

        Handles requests, claims unclaimed schedules while RUNNING,
        then waits and refreshes its request field from the database.
        """
        self.timer.start()

        while not Status.is_final(self.status):
            if self.request:
                self.handle_request()

            if self.status == Status.RUNNING:
                # Clean up orphaned schedules and undead schedulers.
                # Schedule.objects.orphaned().update(scheduler=None)
                # CronSchedule.objects.orphaned().update(scheduler=None)

                # Claim up to SCHEDULER_LIMIT of each schedule type.
                cron = CronSchedule.objects.unclaimed()[:SCHEDULER_LIMIT]
                simple = Schedule.objects.unclaimed()[:SCHEDULER_LIMIT]
                for schedule in itertools.chain(cron, simple):
                    self.log.info('Claiming %s.' % schedule)
                    schedule.scheduler = self
                    schedule.save()
                    self.add(schedule)
            if not Status.is_final(self.status):
                self.wait()
                # Pick up externally-made requests from the database.
                self.request = Scheduler.objects.get(pk=self.pk).request

    def wait(self):
        """Waits on the flag."""
        AbstractDaemon.wait(self, SCHEDULER_PERIOD)

    def clean_up(self):
        """Stop the timer and release all schedules claimed by this daemon."""
        self.timer.cancel()
        self.timer.join()
        cron = self.cronschedules.all()
        simple = self.schedules.all()
        claimed_count = cron.count() + simple.count()
        if claimed_count > 0:
            self.log.info("Cleaning up %s schedules." % claimed_count)
            cron.update(scheduler=None)
            simple.update(scheduler=None)

    def handle_request(self):
        """Called when a request is found."""

        # Clear request immediately.
        request = self.request
        self.request = None
        self.save()

        self.log.info("Request received: %s" % Request.name(request))

        if request == Request.PAUSE:
            self.set_status(Status.PAUSED)

        elif request == Request.RESUME:
            if self.status != Status.PAUSED:
                self.log.info("Must be paused to resume; clearing request.")
            else:
                self.set_status(Status.RUNNING)

        elif request == Request.STOP:
            self.set_status(Status.ENDED)

        elif request == Request.KILL:
            self.set_status(Status.KILLED)

        elif request == Request.RELOAD:
            # Re-add any claimed schedule whose definition has changed:
            # drop the stale timer entry, then enqueue the fresh copy.
            changed = MultiQuerySet(Schedule, CronSchedule)
            changed = changed.objects.unfinished.filter(changed=True,
                                                        scheduler=self)
            for item in self.timer.tasks:
                # item[2][0] is the schedule passed to the timer task.
                s = item[2][0]
                if s in changed:
                    self.log.info("Removing outdated: %s" % s)
                    self.timer.tasks.remove(item)
                    self.set.remove(s)
                s = type(s).objects.get(pk=s.pk)
            for s in changed:
                self.log.info("Adding updated: %s" % s)
                self.add(s)
            changed.update(changed=False)

    def add(self, schedule):
        """Adds the schedule to the timer."""
        try:
            if schedule in self.set:
                self.log.error("%s has already been added to this Scheduler." %
                               schedule)
                return
            self.log.debug('Adding %s to timer for %s.' %
                           (schedule, schedule.next))
            self.timer.add_task(schedule.next, self._enqueue, [schedule])
            self.set.add(schedule)
        # NOTE(review): bare except treats *any* error here as an
        # invalid schedule and soft-deletes it; consider narrowing.
        except:
            self.log.error("Invalid schedule %s found, deleting." % schedule)
            schedule.soft_delete()

    def _enqueue(self, schedule):
        """Called by the timer to add an instance to the queue."""
        # Re-fetch so we act on the current DB state, not a stale copy.
        updated_schedule = get_object(type(schedule), pk=schedule.pk)
        self.set.remove(schedule)
        if updated_schedule == None or updated_schedule.deleted:
            self.log.info('%s was removed.' % schedule)
            if updated_schedule != None:
                updated_schedule.scheduler = None
                updated_schedule.save()
            return
        schedule = updated_schedule

        if not schedule.scheduler == self:
            self.log.info("%s is no longer tied to this Scheduler." % schedule)
            # self.set.remove(schedule)
            return
        instance = Instance.objects.create(task=schedule.task,
                                           schedule=schedule)
        self.log.info('Enqueuing %s.' % instance)
        schedule.queue.push(instance)
        schedule.enqueued()
        # Re-add for the next occurrence, or release when finished.
        if not schedule.finished():
            self.add(schedule)
        else:
            schedule.scheduler = None
            schedule.save()

    @property
    def log_path(self):
        """Relative log file path for this scheduler."""
        return 'schedulers/scheduler-%s' % self.id
Example #32
0
class Executor(AbstractDaemon):
    """Executors are responsible for the running of instances.
    
    Executors have a single queue that they pull instances from.  There
    can (and in many cases should) be more than one Executor running for
    a single queue.
    
    """
    class Meta:
        app_label = 'core'
        db_table = 'norc_executor'

    objects = QuerySetManager()

    class QuerySet(AbstractDaemon.QuerySet):
        def for_queue(self, q):
            """Executors pulling from the given queue."""
            return self.filter(
                queue_id=q.id,
                queue_type=ContentType.objects.get_for_model(q).id)

    @property
    def instances(self):
        """A custom implementation of the Django related manager pattern."""
        return MultiQuerySet(
            *[i.objects.filter(executor=self.pk) for i in INSTANCE_MODELS])

    # All the statuses executors can have.  See constants.py.
    VALID_STATUSES = [
        Status.CREATED,
        Status.RUNNING,
        Status.PAUSED,
        Status.STOPPING,
        Status.ENDED,
        Status.ERROR,
        Status.KILLED,
        Status.SUSPENDED,
    ]

    # All the requests an Executor will respond to.
    VALID_REQUESTS = [
        Request.STOP,
        Request.KILL,
        Request.PAUSE,
        Request.RESUME,
    ]

    # The status of this executor.
    status = PositiveSmallIntegerField(default=Status.CREATED,
                                       choices=[(s, Status.name(s))
                                                for s in VALID_STATUSES])

    # A state-change request.
    request = PositiveSmallIntegerField(null=True,
                                        choices=[(r, Request.name(r))
                                                 for r in VALID_REQUESTS])

    # The queue this executor draws task instances from.
    queue_type = ForeignKey(ContentType)
    queue_id = PositiveIntegerField()
    queue = GenericForeignKey('queue_type', 'queue_id')

    # The number of things that can be run concurrently.
    concurrent = IntegerField()

    @property
    def alive(self):
        """RUNNING with a heartbeat within the last HEARTBEAT_FAILED secs."""
        return self.status == Status.RUNNING and self.heartbeat > \
            datetime.utcnow() - timedelta(seconds=HEARTBEAT_FAILED)

    def __init__(self, *args, **kwargs):
        """Initialize non-model runtime state: pid -> Popen process map."""
        AbstractDaemon.__init__(self, *args, **kwargs)
        self.processes = {}

    def run(self):
        """Core executor function.

        Main daemon loop: handles requests, fills open process slots
        from the queue while RUNNING, reaps finished children, and
        refreshes its request field from the database.
        """
        if settings.BACKUP_SYSTEM:
            # One extra pool worker beyond the concurrency limit,
            # used for uploading instance logs.
            self.pool = ThreadPool(self.concurrent + 1)
        self.log.info("%s is now running on host %s." % (self, self.host))

        if self.log.debug_on:
            # Background daemon thread that periodically logs rusage.
            self.resource_reporter = Thread(target=self.report_resources)
            self.resource_reporter.daemon = True
            self.resource_reporter.start()

        # Main loop.
        while not Status.is_final(self.status):
            if self.request:
                self.handle_request()

            if self.status == Status.RUNNING:
                # Fill every open slot with an instance from the queue.
                while len(self.processes) < self.concurrent:
                    # self.log.debug("Popping instance...")
                    instance = self.queue.pop()
                    if instance:
                        # self.log.debug("Popped %s" % instance)
                        self.start_instance(instance)
                    else:
                        # self.log.debug("No instance in queue.")
                        break

            elif self.status == Status.STOPPING and len(self.processes) == 0:
                # All children finished; the stop can now complete.
                self.set_status(Status.ENDED)
                self.save(safe=True)

            # Clean up completed tasks before iterating.
            # (items() is a list in Python 2; [:] copies it so entries
            # can be deleted from self.processes while iterating.)
            for pid, p in self.processes.items()[:]:
                p.poll()
                self.log.debug("Checking pid %s: return code %s." %
                               (pid, p.returncode))
                # NOTE(review): prefer `p.returncode is not None`.
                if not p.returncode == None:
                    # Re-fetch the instance for an up-to-date status.
                    i = type(p.instance).objects.get(pk=p.instance.pk)
                    if i.status == Status.CREATED:
                        # Child died before marking itself RUNNING.
                        self.log.info(
                            ("%s fail to initialize properly; " +
                             "entering suspension to avoid more errors.") % i)
                        self.set_status(Status.SUSPENDED)
                        self.save()
                    if not Status.is_final(i.status):
                        self.log.info(("%s ended with invalid " +
                                       "status %s, changing to ERROR.") %
                                      (i, Status.name(i.status)))
                        i.status = Status.ERROR
                        i.save()
                    self.log.info("%s ended with status %s." %
                                  (i, Status.name(i.status)))
                    del self.processes[pid]
                    if settings.BACKUP_SYSTEM:
                        self.pool.queueTask(self.backup_instance_log, [i])

            if not Status.is_final(self.status):
                self.wait(EXECUTOR_PERIOD)
                # Pick up externally-made requests from the database.
                self.request = Executor.objects.get(pk=self.pk).request

    def clean_up(self):
        """Final cleanup: wait for any pending log-backup tasks."""
        if settings.BACKUP_SYSTEM:
            self.pool.joinAll()

    def report_resources(self):
        """Log resource usage every ten seconds until a final status."""
        while not Status.is_final(self.status):
            time.sleep(10)
            rself = resource.getrusage(resource.RUSAGE_SELF)
            self.log.debug(rself)
            rchildren = resource.getrusage(resource.RUSAGE_CHILDREN)
            self.log.debug(rchildren)

    def start_instance(self, instance):
        """Starts a given instance in a new process."""
        instance.executor = self
        instance.save()
        self.log.info("Starting %s..." % instance)
        # p = Process(target=self.execute, args=[instance.start])
        # p.start()
        ct = ContentType.objects.get_for_model(instance)
        # Child output goes straight to the instance's own log file.
        f = make_log(instance.log_path).file
        p = Popen('norc_taskrunner --ct_pk %s --target_pk %s' %
                  (ct.pk, instance.pk),
                  stdout=f,
                  stderr=STDOUT,
                  shell=True)
        # Attach the instance so the reap loop can find it later.
        p.instance = instance
        self.processes[p.pid] = p

    # This should be used in 2.6, but with subprocess it's not possible.
    # def execute(self, func):
    #     """Calls a function, then sets the flag after its execution."""
    #     try:
    #         func()
    #     finally:
    #         self.flag.set()

    def handle_request(self):
        """Called when a request is found."""

        # Clear request immediately.
        request = self.request
        self.request = None
        self.save()

        self.log.info("Request received: %s" % Request.name(request))

        if request == Request.PAUSE:
            self.set_status(Status.PAUSED)

        elif request == Request.RESUME:
            if self.status not in (Status.PAUSED, Status.SUSPENDED):
                self.log.info("Must be paused or suspended to resume; " +
                              "clearing request.")
            else:
                self.set_status(Status.RUNNING)

        elif request == Request.STOP:
            # STOPPING lets running children finish before ENDED.
            self.set_status(Status.STOPPING)

        elif request == Request.KILL:
            # for p in self.processes.values():
            #     p.terminate()
            for pid, p in self.processes.iteritems():
                self.log.info("Killing process for %s." % p.instance)
                os.kill(pid, signal.SIGTERM)
            self.set_status(Status.KILLED)

    def backup_instance_log(self, instance):
        """Upload an instance's log file; logs success or failure."""
        self.log.info("Attempting upload of log for %s..." % instance)
        if backup_log(instance.log_path):
            self.log.info("Completed upload of log for %s." % instance)
        else:
            self.log.info("Failed to upload log for %s." % instance)

    @property
    def log_path(self):
        """Relative log file path for this executor."""
        return 'executors/executor-%s' % self.id
Example #33
0
class AbstractInstance(Model):
    """One instance (run) of a Task.
    
    Manages the full lifecycle of a single execution: signal and
    timeout handler setup, status bookkeeping, the optional finally_
    hook, and cleanup.  Once run() is attempted, start() ends the
    process via sys.exit and never returns normally.
    """
    
    __metaclass__ = MetaInstance
    
    class Meta:
        app_label = 'core'
        abstract = True
    
    class QuerySet(query.QuerySet):
        
        def since(self, since):
            """Exclude instances that ended before *since*.
            
            *since* may be a datetime or a string understood by
            parse_since; a falsy value disables the filter.
            """
            # Was type(since) == str, which silently skipped unicode
            # strings; basestring matches status_in below.
            if isinstance(since, basestring):
                since = parse_since(since)
            return self.exclude(ended__lt=since) if since else self
        
        def status_in(self, statuses):
            """Filter by status group name or an iterable of statuses."""
            if isinstance(statuses, basestring):
                statuses = Status.GROUPS(statuses)
            return self.filter(status__in=statuses) if statuses else self
        
        def from_queue(self, q):
            """Filter to instances whose executor drew from queue *q*."""
            return self.filter(executor__queue_id=q.id,
                executor__queue_type=ContentType.objects.get_for_model(q).id)
        
    
    # The statuses an instance is allowed to take.
    VALID_STATUSES = [
        Status.CREATED,
        Status.RUNNING,
        Status.SUCCESS,
        Status.FAILURE,
        Status.HANDLED,
        Status.ERROR,
        Status.TIMEDOUT,
        Status.INTERRUPTED,
    ]
    
    # The status of the execution.
    status = PositiveSmallIntegerField(default=Status.CREATED,
        choices=[(s, Status.name(s)) for s in VALID_STATUSES])
    
    # When the instance was added to a queue.
    enqueued = DateTimeField(default=datetime.utcnow)
    
    # When the instance started.
    started = DateTimeField(null=True)
    
    # When the instance ended.
    ended = DateTimeField(null=True)
    
    # The executor of this instance.
    executor = ForeignKey('core.Executor', null=True,
        related_name='_%(class)ss')
    
    # The code revision this instance ran against (see get_revision).
    revision = ForeignKey('core.Revision', null=True,
        related_name='_%(class)ss')
    
    def start(self):
        """Performs initialization before calling run().
        
        Installs signal/timeout handlers, marks the instance running,
        invokes run(), then exits the process with status 0 on success
        and 1 otherwise.
        """
        
        if not hasattr(self, 'log'):
            self.log = make_log(self.log_path)
        if self.status != Status.CREATED:
            self.log.error("Can't start an instance more than once.")
            return
        # signal.signal raises ValueError outside the main thread
        # (e.g. when tests run instances in threads); safe to ignore.
        try:
            for signum in [signal.SIGINT, signal.SIGTERM]:
                signal.signal(signum, self.kill_handler)
        except ValueError:
            pass
        if self.timeout > 0:
            signal.signal(signal.SIGALRM, self.timeout_handler)
            signal.alarm(self.timeout)
        self.log.info('Starting %s.' % self)
        self.log.start_redirect()
        self.status = Status.RUNNING
        self.revision = self.get_revision()
        self.started = datetime.utcnow()
        self.save()
        try:
            success = self.run()
        except Exception:
            self.log.error("Task failed with an exception!", trace=True)
            self.status = Status.FAILURE
        else:
            # A None return counts as success (was: success == None).
            if success or success is None:
                self.status = Status.SUCCESS
            else:
                self.status = Status.FAILURE
        finally:
            self.run_finally()
            self.cleanup()
            # The process ends here; callers never regain control.
            sys.exit(0 if self.status == Status.SUCCESS else 1)
    
    def run_finally(self):
        """Run the optional finally_ hook under its own alarm timeout."""
        # Cancel any outstanding task timeout first.
        signal.alarm(0)
        if hasattr(self, "finally_") and callable(self.finally_):
            signal.signal(signal.SIGALRM, self.finally_timeout_handler)
            signal.alarm(FINALLY_TIMEOUT)
            self.log.info("Executing final block...")
            self.finally_()
            signal.alarm(0)
    
    def cleanup(self):
        """Cleanup code that should be executed last."""
        self.ended = datetime.utcnow()
        self.save()
        self.log.info("Task ended with status %s." %
            Status.name(self.status))
        self.log.stop_redirect()
    
    def run(self):
        """Runs the instance.  Subclasses must override this."""
        raise NotImplementedError
    
    def kill_handler(self, *args, **kwargs):
        """SIGINT/SIGTERM handler: mark interrupted, clean up, exit hard."""
        self.log.info("Interrupt signal received!")
        self.status = Status.INTERRUPTED
        self.run_finally()
        self.cleanup()
        self._nuke()
    
    def timeout_handler(self, *args, **kwargs):
        """SIGALRM handler for the main task timeout."""
        self.log.error("Task timed out!")
        self.status = Status.TIMEDOUT
        self.run_finally()
        self.cleanup()
        self._nuke()
    
    def finally_timeout_handler(self, *args, **kwargs):
        """SIGALRM handler for a finally_ hook that ran too long."""
        self.log.error("Final block timed out!")
        self.status = Status.TIMEDOUT
        self.cleanup()
        self._nuke()
    
    def _nuke(self, *args, **kwargs):
        """Terminate the process immediately, bypassing any further cleanup."""
        self.log.info("Ceasing execution.")
        os._exit(1)
    
    def get_revision(self):
        """ Hook to provide revision tracking functionality for instances.
        
        Defaults to None because other instances implementations might not
        have task attributes.
        
        """
        return None
    
    @property
    def timeout(self):
        # No timeout by default; subclasses may override.
        return 0
    
    @property
    def source(self):
        # No source by default; subclasses may override.
        return None
    
    @property
    def queue(self):
        # EAFP: executor may be None, in which case there is no queue.
        try:
            return self.executor.queue
        except AttributeError:
            return None
    
    @property
    def log_path(self):
        return "instances/%s/%s" % (type(self).__name__, self.id)
    
    @property
    def log_url(self):
        return ('/logs/instances/%s_%s/' %
            (ContentType.objects.get_for_model(self).id, self.id))
    
    def __unicode__(self):
        return u"[%s #%s]" % (type(self).__name__, self.id)
    
    __repr__ = __unicode__
Example #34
0
 def set_status(self, status):
     """Sets the status with a log message.  Does not save."""
     old_name = Status.name(self.status)
     new_name = Status.name(status)
     self.log.info("Changing state from %s to %s." % (old_name, new_name))
     self.status = status
Example #35
0
def main():
    usage = "norc_control [executor | scheduler | host] <id | host> " + \
        "--[stop | kill | pause | resume | reload | handle] [--wait]"
    
    def bad_args(message):
        print message
        print usage
        sys.exit(2)
    
    parser = OptionParser(usage)
    parser.add_option("-s", "--stop", action="store_true", default=False,
        help="Send a stop request.")
    parser.add_option("-k", "--kill", action="store_true", default=False,
        help="Send a kill request.")
    parser.add_option("-p", "--pause", action="store_true", default=False,
        help="Send a pause request.")
    parser.add_option("-u", "--resume", action="store_true", default=False,
        help="Send an resume request.")
    parser.add_option("-r", "--reload", action="store_true", default=False,
        help="Send an reload request to a Scheduler.")
    parser.add_option("--handle", action="store_true", default=False,
        help="Change the object's status to HANDLED.")
    parser.add_option("-f", "--force", action="store_true", default=False,
        help="Force the request to be made..")
    parser.add_option("-w", "--wait", action="store_true", default=False,
        help="Wait until the request has been responded to.")
    
    options, args = parser.parse_args()
    
    if len(args) != 2:
        bad_args("Invalid number of arguments.")
    
    
    requests = filter(lambda a: getattr(options, a.lower()),
        Request.NAMES.values())
    if  len(requests) + (1 if options.handle else 0) != 1:
        bad_args("Must request exactly one action.")
    if not options.handle:
        request = requests[0]
        req = getattr(Request, request)
    
    cls = None
    if args[0] in EXECUTOR_KEYWORDS:
        cls = Executor
    elif args[0] in SCHEDULER_KEYWORDS:
        cls = Scheduler
    elif args[0] in HOST_KEYWORDS:
        if options.handle:
            bad_args("Can't perform handle operation on multiple daemons.")
        daemons = MultiQuerySet(Executor, Scheduler).objects.all()
        daemons = daemons.filter(host=args[1]).status_in("active")
        if not options.force:
            daemons = daemons.filter(request=None)
        for d in daemons:
            if req in d.VALID_REQUESTS:
                d.make_request(req)
                print "%s was sent a %s request." % (d, request)
        if options.wait:
            _wait(daemons, req)
    else:
        bad_args("Invalid keyword '%s'." % args[0])
    
    if cls:
        name = cls.__name__
        try:
            obj_id = int(args[1])
        except ValueError:
            bad_args("Invalid id '%s'; must be an integer." % args[1])
        try:
            d = cls.objects.get(id=obj_id)
        except cls.DoesNotExist:
            print "Could not find a(n) %s with id=%s" % (name, obj_id)
        else:
            if options.handle:
                if controls.handle(d):
                    print "The error state of %s was marked as handled." % d
                else:
                    print "%s isn't in an error state." % d
            elif Status.is_final(d.status) and not options.force:
                print "%s is already in a final state." % d
            elif d.request == None or options.force:
                d.make_request(req)
                print "%s was sent a %s request." % (d, request)
                if options.wait:
                    _wait([d], req)
            else:
                print "%s already has request %s." % \
                    (d, Request.name(d.request))
Example #36
0
 def status_in(self, statuses):
     """Filter by status group. Takes a string or iterable."""
     if isinstance(statuses, basestring):
         statuses = Status.GROUPS(statuses)
     if not statuses:
         return self
     return self.filter(status__in=statuses)
Example #37
0
    def start(self):
        """Starts the daemon.  Does initialization then calls run()."""

        if self.status != Status.CREATED:
            print "Can't start a %s that's already been run." \
                % type(self).__name__
            return

        if not hasattr(self, 'id'):
            self.save()
        if not hasattr(self, 'log'):
            self.log = make_log(self.log_path)

        if settings.DEBUG:
            self.log.info(
                "WARNING, DEBUG is True, which means Django " +
                "will gobble memory as it stores all database queries.")

        # This try block is needed because the unit tests run daemons
        # in threads, which breaks signals.
        try:
            for signum in (signal.SIGINT, signal.SIGTERM):
                signal.signal(signum, self.signal_handler)
        except ValueError:
            pass

        self.log.start_redirect()
        self.log.info("%s initialized; starting..." % self)

        self.status = Status.RUNNING
        self.heartbeat = self.started = datetime.utcnow()
        self.save()
        self.heart.start()

        try:
            self.run()
        except Exception:
            self.set_status(Status.ERROR)
            self.log.error("An internal error occured!", trace=True)
        else:
            if not Status.is_final(self.status):
                self.set_status(Status.ENDED)
        finally:
            self.log.info("Shutting down...")
            try:
                self.clean_up()
            except:
                self.log.error("Clean up function failed.", trace=True)
            if not Status.is_final(self.status):
                self.set_status(Status.ERROR)
            self.heart.flag.set()
            self.heart.join()
            self.ended = datetime.utcnow()
            self.save()
            if settings.BACKUP_SYSTEM:
                self.log.info('Backing up log file...')
                try:
                    if backup_log(self.log_path):
                        self.log.info('Completed log backup.')
                    else:
                        self.log.error('Failed to backup log.')
                except:
                    self.log.error('Failed to backup log.', trace=True)
            self.log.info('%s has been shut down successfully.' % self)
            self.log.stop_redirect()
            self.log.close()
Example #38
0
 def tearDown(self):
     """Kill the scheduler if still live, then verify a clean shutdown."""
     scheduler = self._scheduler
     if not Status.is_final(scheduler.status):
         scheduler.make_request(Request.KILL)
     self.thread.join(15)
     assert not self.thread.isAlive()
     assert not scheduler.timer.isAlive()
Example #39
0
 def start(self):
     """Starts the daemon.  Does initialization then calls run()."""
     
     if self.status != Status.CREATED:
         print "Can't start a %s that's already been run." \
             % type(self).__name__
         return
     
     if not hasattr(self, 'id'):
         self.save()
     if not hasattr(self, 'log'):
         self.log = make_log(self.log_path)
     
     if settings.DEBUG:
         self.log.info("WARNING, DEBUG is True, which means Django " +
             "will gobble memory as it stores all database queries.")
     
     # This try block is needed because the unit tests run daemons
     # in threads, which breaks signals.
     try:
         for signum in (signal.SIGINT, signal.SIGTERM):
             signal.signal(signum, self.signal_handler)
     except ValueError:
         pass
     
     self.log.start_redirect()
     self.log.info("%s initialized; starting..." % self)
     
     self.status = Status.RUNNING
     self.heartbeat = self.started = datetime.utcnow()
     self.save()
     self.heart.start()
     
     try:
         self.run()
     except Exception:
         self.set_status(Status.ERROR)
         self.log.error("An internal error occured!", trace=True)
     else:
         if not Status.is_final(self.status):
             self.set_status(Status.ENDED)
     finally:    
         self.log.info("Shutting down...")
         try:
             self.clean_up()
         except:
             self.log.error("Clean up function failed.", trace=True)
         if not Status.is_final(self.status):
             self.set_status(Status.ERROR)
         self.heart.flag.set()
         self.heart.join()
         self.ended = datetime.utcnow()
         self.save()
         if settings.BACKUP_SYSTEM:
             self.log.info('Backing up log file...')
             try:
                 if backup_log(self.log_path):
                     self.log.info('Completed log backup.')
                 else:
                     self.log.error('Failed to backup log.')
             except:
                 self.log.error('Failed to backup log.', trace=True)
         self.log.info('%s has been shut down successfully.' % self)
         self.log.stop_redirect()
         self.log.close()
Example #40
0
def main():
    usage = "norc_control [executor | scheduler | host] <id | host> " + \
        "--[stop | kill | pause | resume | reload | handle] [--wait]"

    def bad_args(message):
        print message
        print usage
        sys.exit(2)

    parser = OptionParser(usage)
    parser.add_option("-s",
                      "--stop",
                      action="store_true",
                      default=False,
                      help="Send a stop request.")
    parser.add_option("-k",
                      "--kill",
                      action="store_true",
                      default=False,
                      help="Send a kill request.")
    parser.add_option("-p",
                      "--pause",
                      action="store_true",
                      default=False,
                      help="Send a pause request.")
    parser.add_option("-u",
                      "--resume",
                      action="store_true",
                      default=False,
                      help="Send an resume request.")
    parser.add_option("-r",
                      "--reload",
                      action="store_true",
                      default=False,
                      help="Send an reload request to a Scheduler.")
    parser.add_option("--handle",
                      action="store_true",
                      default=False,
                      help="Change the object's status to HANDLED.")
    parser.add_option("-f",
                      "--force",
                      action="store_true",
                      default=False,
                      help="Force the request to be made..")
    parser.add_option("-w",
                      "--wait",
                      action="store_true",
                      default=False,
                      help="Wait until the request has been responded to.")

    options, args = parser.parse_args()

    if len(args) != 2:
        bad_args("Invalid number of arguments.")

    requests = filter(lambda a: getattr(options, a.lower()),
                      Request.NAMES.values())
    if len(requests) + (1 if options.handle else 0) != 1:
        bad_args("Must request exactly one action.")
    if not options.handle:
        request = requests[0]
        req = getattr(Request, request)

    cls = None
    if args[0] in EXECUTOR_KEYWORDS:
        cls = Executor
    elif args[0] in SCHEDULER_KEYWORDS:
        cls = Scheduler
    elif args[0] in HOST_KEYWORDS:
        if options.handle:
            bad_args("Can't perform handle operation on multiple daemons.")
        daemons = MultiQuerySet(Executor, Scheduler).objects.all()
        daemons = daemons.filter(host=args[1]).status_in("active")
        if not options.force:
            daemons = daemons.filter(request=None)
        for d in daemons:
            if req in d.VALID_REQUESTS:
                d.make_request(req)
                print "%s was sent a %s request." % (d, request)
        if options.wait:
            _wait(daemons, req)
    else:
        bad_args("Invalid keyword '%s'." % args[0])

    if cls:
        name = cls.__name__
        try:
            obj_id = int(args[1])
        except ValueError:
            bad_args("Invalid id '%s'; must be an integer." % args[1])
        try:
            d = cls.objects.get(id=obj_id)
        except cls.DoesNotExist:
            print "Could not find a(n) %s with id=%s" % (name, obj_id)
        else:
            if options.handle:
                if controls.handle(d):
                    print "The error state of %s was marked as handled." % d
                else:
                    print "%s isn't in an error state." % d
            elif Status.is_final(d.status) and not options.force:
                print "%s is already in a final state." % d
            elif d.request == None or options.force:
                d.make_request(req)
                print "%s was sent a %s request." % (d, request)
                if options.wait:
                    _wait([d], req)
            else:
                print "%s already has request %s." % \
                    (d, Request.name(d.request))
Example #41
0
 def status_in(self, statuses):
     """Filter by status group. Takes a string or iterable."""
     group = Status.GROUPS(statuses) \
         if isinstance(statuses, basestring) else statuses
     return self.filter(status__in=group) if group else self
Example #42
0
 def tearDown(self):
     """Ensure the scheduler is dead and its threads have exited."""
     if not Status.is_final(self._scheduler.status):
         # Still running: request a hard kill before joining.
         self._scheduler.make_request(Request.KILL)
     self.thread.join(15)
     for live in (self.thread, self._scheduler.timer):
         assert not live.isAlive()