Example No. 1
 def work(self):
     # We should probably open up our own redis client
     self.client = qless.client(url=self.host)
     self.queues = [self.client.queues[q] for q in self.queues]
     
     if not os.path.isdir(self.sandbox):
         os.makedirs(self.sandbox)
     self.clean()
     # First things first, we should clear out any jobs that
     # we're responsible for off-hand
     while len(self.jids):
         try:
             job = self.client.jobs[self.jids.pop(0)]
             # If we still have access to it, then we should process it
             if job.heartbeat():
                 logger.info('Resuming %s' % job.jid)
                 self.setproctitle('Working %s (%s)' % (job.jid, job.klass_name))
                 job.process()
                 self.clean()
             else:
                 logger.warn('Lost heart on would-be resumed job %s' % job.jid)
         except KeyboardInterrupt:
             return
     
     sleep_cycles = 0
     while True:
         try:
             for queue in self.queues:
                 job = queue.pop()
                 if job:
                     sleep_cycles = -1
                     self.setproctitle('Working %s (%s)' % (job.jid, job.klass_name))
                     job.process()
                     self.clean()
             
             if self.stop_on_idle and sleep_cycles >= 2:
                 logger.info("Idle for too long, quitting")
                 import sys
                 sys.exit(self.IDLE_EXIT_STATUS)
             if sleep_cycles >= 0:
                 self.setproctitle('sleeping...')
                 logger.debug('Sleeping for %fs' % self.interval)
                 time.sleep(self.interval)
                 sleep_cycles += 1
             else:
                 sleep_cycles = 0
         except KeyboardInterrupt:
             return
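The `sleep_cycles` bookkeeping above is compact but subtle: `-1` marks a pass that found work, and the idle exit only fires after two consecutive idle passes have slept. The same logic unrolled, as a sketch (`poll_queues_once` is a hypothetical stand-in for the queue-popping loop):

    sleep_cycles = 0
    while True:
        worked = poll_queues_once()  # hypothetical: True if any job ran
        if worked:
            sleep_cycles = 0         # found work; reset the idle counter
        else:
            if stop_on_idle and sleep_cycles >= 2:
                break                # already slept twice while idle; quit
            time.sleep(interval)
            sleep_cycles += 1        # one more idle pass completed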
Example No. 2
    def stop(self, sig=signal.SIGINT):
        '''Stop all the workers, and then wait for them'''
        for cpid in self.sandboxes.keys():
            logger.warn('Stopping %i...' % cpid)
            os.kill(cpid, sig)

        # While we still have children running, wait for them. Copy the
        # keys, since we remove entries from the dict as children exit
        for cpid in list(self.sandboxes.keys()):
            try:
                logger.info('Waiting for %i...' % cpid)
                pid, status = os.waitpid(cpid, 0)
                logger.warn('%i stopped with status %i' % (pid, status >> 8))
            except OSError:  # pragma: no cover
                logger.exception('Error waiting for %i...' % cpid)
            finally:
                self.sandboxes.pop(cpid, None)
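This stop sequence signals every child and then reaps each one with `os.waitpid` so no zombies are left behind; it is plain POSIX process management. A minimal self-contained sketch of the same idea, independent of qless (the child count and signal choice are illustrative):

    import os
    import signal
    import time

    children = []
    for _ in range(2):
        pid = os.fork()
        if pid == 0:
            # Child: sleep until interrupted, then exit cleanly
            try:
                time.sleep(60)
            except KeyboardInterrupt:
                pass
            os._exit(0)
        children.append(pid)

    # Signal every child, then wait on each so none become zombies
    for pid in children:
        os.kill(pid, signal.SIGINT)
    for pid in children:
        reaped, status = os.waitpid(pid, 0)
        print('%i exited with status %i' % (reaped, status >> 8))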
Example No. 3
    def run(self):
        '''Run this worker'''
        self.signals(('TERM', 'INT', 'QUIT'))
        # Divide the jobs we have to resume evenly among the workers. This
        # produces evenly-sized groups of jobs
        resume = self.divide(self.resume, self.count)
        for index in range(self.count):
            # The sandbox for the child worker
            sandbox = os.path.join(os.getcwd(), 'qless-py-workers',
                                   'sandbox-%s' % index)
            cpid = os.fork()
            if cpid:
                logger.info('Spawned worker %i' % cpid)
                self.sandboxes[cpid] = sandbox
            else:  # pragma: no cover
                # Move to the sandbox as the current working directory
                with Worker.sandbox(sandbox):
                    os.chdir(sandbox)
                    try:
                        self.spawn(resume=resume[index], sandbox=sandbox).run()
                    except:
                        logger.exception('Exception in spawned worker')
                    finally:
                        os._exit(0)

        try:
            while not self.shutdown:
                pid, status = os.wait()
                logger.warn('Worker %i died with status %i from signal %i' %
                            (pid, status >> 8, status & 0xff))
                sandbox = self.sandboxes.pop(pid)
                cpid = os.fork()
                if cpid:
                    logger.info('Spawned replacement worker %i' % cpid)
                    self.sandboxes[cpid] = sandbox
                else:  # pragma: no cover
                    with Worker.sandbox(sandbox):
                        os.chdir(sandbox)
                        try:
                            self.spawn(sandbox=sandbox).run()
                        except:
                            logger.exception('Exception in spawned worker')
                        finally:
                            os._exit(0)
        finally:
            self.stop(signal.SIGKILL)
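These snippets never show `divide` itself; per the comment, it just needs to split the resumable jobs into `count` evenly-sized groups. A round-robin version might look like this (a sketch, not necessarily the library's actual implementation):

    def divide(jobs, count):
        '''Deal jobs out round-robin into count evenly-sized groups'''
        groups = [[] for _ in range(count)]
        for index, job in enumerate(jobs):
            groups[index % count].append(job)
        return groups

For instance, divide(['a', 'b', 'c', 'd', 'e'], 2) returns [['a', 'c', 'e'], ['b', 'd']], so no two groups differ in size by more than one job.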
Example No. 4
    def work(self):
        # We should probably open up our own redis client
        self.client = qless.client(self.host, self.port)
        self.queues = [self.client.queues[q] for q in self.queues]

        if not os.path.isdir(self.sandbox):
            os.makedirs(self.sandbox)
        self.clean()
        # First things first, we should clear out any jobs that
        # we're responsible for off-hand
        while len(self.jids):
            try:
                job = self.client.jobs[self.jids.pop(0)]
                # If we still have access to it, then we should process it
                if job.heartbeat():
                    logger.info('Resuming %s' % job.jid)
                    self.setproctitle('Working %s (%s)' %
                                      (job.jid, job.klass_name))
                    job.process()
                    self.clean()
                else:
                    logger.warn('Lost heart on would-be resumed job %s' %
                                job.jid)
            except KeyboardInterrupt:
                return

        while True:
            try:
                seen = False
                for queue in self.queues:
                    job = queue.pop()
                    if job:
                        seen = True
                        self.setproctitle('Working %s (%s)' %
                                          (job.jid, job.klass_name))
                        job.process()
                        self.clean()

                if not seen:
                    self.setproctitle('sleeping...')
                    logger.debug('Sleeping for %fs' % self.interval)
                    time.sleep(self.interval)
            except KeyboardInterrupt:
                return
Example No. 5
    def stop(self, sig=signal.SIGINT):
        '''Stop all the workers, and then wait for them'''
        for cpid in self.sandboxes.keys():
            logger.warn('Stopping %i...' % cpid)
            try:
                os.kill(cpid, sig)
            except OSError:  # pragma: no cover
                logger.exception('Error stopping %s...' % cpid)

        # While we still have children running, wait for them. Copy the
        # keys, since we remove entries from the dict as children exit
        for cpid in list(self.sandboxes.keys()):
            try:
                logger.info('Waiting for %i...' % cpid)
                pid, status = os.waitpid(cpid, 0)
                logger.warn('%i stopped with status %i' % (pid, status >> 8))
            except OSError:  # pragma: no cover
                logger.exception('Error waiting for %i...' % cpid)
            finally:
                self.sandboxes.pop(cpid, None)
Example No. 6
    def stop(self, sig=signal.SIGINT):
        '''Stop all the workers, and then wait for them'''
        for cpid in self.sandboxes:
            logger.warn('Stopping %i...' % cpid)
            try:
                os.kill(cpid, sig)
            except OSError:  # pragma: no cover
                logger.exception('Error stopping %s...' % cpid)

        # While we still have children running, wait for them
        # We edit the dictionary during the loop, so we need to copy its keys
        for cpid in list(self.sandboxes):
            try:
                logger.info('Waiting for %i...' % cpid)
                pid, status = os.waitpid(cpid, 0)
                logger.warn('%i stopped with status %i' % (pid, status >> 8))
            except OSError:  # pragma: no cover
                logger.exception('Error waiting for %i...' % cpid)
            finally:
                self.sandboxes.pop(cpid, None)
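The comment about copying the keys is load-bearing: under Python 3, removing entries from a dict while iterating over it raises RuntimeError. A quick illustration:

    sandboxes = {101: '/tmp/s1', 102: '/tmp/s2'}
    try:
        for cpid in sandboxes:         # iterating the live dict...
            sandboxes.pop(cpid, None)  # ...while mutating it
    except RuntimeError as exc:
        print(exc)  # dictionary changed size during iteration

    sandboxes = {101: '/tmp/s1', 102: '/tmp/s2'}
    for cpid in list(sandboxes):       # iterate over a snapshot instead
        sandboxes.pop(cpid, None)      # safe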
Example No. 7
    def fail(self, group, message):
        """Mark the particular job as failed, with the provided group, and a
        more specific message. By `group`, we mean some phrase that might be
        one of several categorical modes of failure. The `message` is
        something more job-specific, like perhaps a traceback.

        This method should __not__ be used to note that a job has been dropped
        or has failed in a transient way. This method __should__ be used to
        note that a job has something really wrong with it that must be
        remedied.

        The motivation behind the `group` is so that similar errors can be
        grouped together. Optionally, updated data can be provided for the job.
        A job in any state can be marked as failed. If it has been given to a
        worker as a job, then its subsequent requests to heartbeat or complete
        that job will fail. Failed jobs are kept until they are canceled or
        completed. __Returns__ the id of the failed job if successful, or
        `False` on failure."""
        logger.warn("Failing %s (%s): %s" % (self.jid, group, message))
        return self.client("fail", self.jid, self.client.worker_name, group,
                           message, json.dumps(self.data)) or False
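In practice, `fail` is what job code calls when it hits a permanent, non-transient problem. A hypothetical job class (the class, exception, and group names are illustrative, not part of qless):

    import traceback

    class ImportAccount(object):
        @staticmethod
        def process(job):
            try:
                import_account(job.data['account_id'])  # hypothetical work
                job.complete()
            except MissingCredentials:
                # Permanently broken job; the group name buckets similar
                # failures together when browsing failed jobs
                job.fail('missing-credentials', traceback.format_exc())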
Example No. 8
 def work(self):
     # We should probably open up our own redis client
     self.client = qless.client(self.host, self.port, password=self.password)
     self.queues = [self.client.queues[q] for q in self.queues]
     
     if not os.path.isdir(self.sandbox):
         os.makedirs(self.sandbox)
     self.clean()
     # First things first, we should clear out any jobs that
     # we're responsible for off-hand
     while len(self.jids):
         try:
             job = self.client.jobs[self.jids.pop(0)]
             # If we still have access to it, then we should process it
             if job.heartbeat():
                 logger.info('Resuming %s' % job.jid)
                 self.setproctitle('Working %s (%s)' % (job.jid, job.klass_name))
                 job.process()
                 self.clean()
             else:
                 logger.warn('Lost heart on would-be resumed job %s' % job.jid)
         except KeyboardInterrupt:
             return
     
     while True:
         try:
             seen = False
             for queue in self.queues:
                 job = queue.pop()
                 if job:
                     seen = True
                     self.setproctitle('Working %s (%s)' % (job.jid, job.klass_name))
                     job.process()
                     self.clean()
             
             if not seen:
                 self.setproctitle('sleeping...')
                 logger.debug('Sleeping for %fs' % self.interval)
                 time.sleep(self.interval)
         except KeyboardInterrupt:
             return
Example No. 9
 def handler(self, signum, frame):  # pragma: no cover
     '''Signal handler for this process'''
     if signum == signal.SIGQUIT:
         # QUIT - Finish processing, but don't do any more work after that
         self.stop()
     elif signum == signal.SIGUSR1:
         # USR1 - Print the backtrace
         message = ''.join(traceback.format_stack(frame))
         message = 'Signaled traceback for %s:\n%s' % (os.getpid(), message)
         print(message, file=sys.stderr)
         logger.warn(message)
     elif signum == signal.SIGUSR2:
         # USR2 - Enter a debugger
         # Much thanks to http://stackoverflow.com/questions/132058
         data = {'_frame': frame}  # Allow access to frame object.
         data.update(frame.f_globals)  # Unless shadowed by global
         data.update(frame.f_locals)
         # Build up a message with a traceback
         message = ''.join(traceback.format_stack(frame))
         message = 'Traceback:\n%s' % message
         code.InteractiveConsole(data).interact(message)
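The `signals` helper that the `run` examples call is not shown in these snippets; registering this handler is ordinary `signal` module usage. One plausible shape (a sketch, assuming the handler above):

    import signal

    def signals(self, names=('QUIT', 'USR1', 'USR2')):
        '''Register self.handler for each named signal'''
        for name in names:
            signal.signal(getattr(signal, 'SIG' + name), self.handler)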
Example No. 10
    def fail(self, group, message):
        '''Mark the particular job as failed, with the provided group, and a
        more specific message. By `group`, we mean some phrase that might be
        one of several categorical modes of failure. The `message` is
        something more job-specific, like perhaps a traceback.

        This method should __not__ be used to note that a job has been dropped
        or has failed in a transient way. This method __should__ be used to
        note that a job has something really wrong with it that must be
        remedied.

        The motivation behind the `group` is so that similar errors can be
        grouped together. Optionally, updated data can be provided for the job.
        A job in any state can be marked as failed. If it has been given to a
        worker as a job, then its subsequent requests to heartbeat or complete
        that job will fail. Failed jobs are kept until they are canceled or
        completed. __Returns__ the id of the failed job if successful, or
        `False` on failure.'''
        logger.warn('Failing %s (%s): %s', self.jid, group, message)
        return self.client('fail', self.jid, self.client.worker_name, group,
                           message, json.dumps(self.data)) or False
Example No. 11
    def _import(klass):
        '''1) Get a reference to the module
           2) Check the file that module's imported from
           3) If that file's been updated, force a reload of that module
              and return it'''
        mod = __import__(klass.rpartition('.')[0])
        for segment in klass.split('.')[1:-1]:
            mod = getattr(mod, segment)

        # Alright, now check the file associated with it. Note that classes
        # defined in __main__ don't have a __file__ attribute
        if klass not in BaseJob._loaded:
            BaseJob._loaded[klass] = time.time()
        if hasattr(mod, '__file__'):
            try:
                mtime = os.stat(mod.__file__).st_mtime
                if BaseJob._loaded[klass] < mtime:
                    mod = reload_module(mod)
            except OSError:
                logger.warn('Could not check modification time of %s',
                            mod.__file__)

        return getattr(mod, klass.rpartition('.')[2])
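Given a dotted path, `_import` resolves the module, reloads it if its source file changed since it was last seen, and returns the named attribute. Usage might look like this (the module path is hypothetical):

    # Re-imports myapp.jobs if its file was modified on disk, then
    # returns the ImportAccount class from the fresh module
    klass = BaseJob._import('myapp.jobs.ImportAccount')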
Example No. 12
 def stop(self):
     # Stop all the workers, and then wait for them
     for cpid in self.sandboxes.keys():
         logger.warn('Stopping %i...' % cpid)
         os.kill(cpid, signal.SIGINT)
     
     while True:
         try:
             pid, status = os.wait()
             self.sandboxes.pop(pid, None)
             logger.warn('Worker %i stopped.' % pid)
         except OSError:
             break
     
     for cpid in self.sandboxes.keys():
         logger.warn('Could not wait for %i' % cpid)
Example No. 13
 def kill(self, jid):
     '''Stop the greenlet processing the provided jid'''
     greenlet = self.greenlets.get(jid)
     if greenlet is not None:
         logger.warn('Lost ownership of %s' % jid)
         greenlet.kill()
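For `kill` to have anything to do, something must maintain `self.greenlets`, a mapping from each jid to the greenlet running it. In a gevent-based worker that registry might be kept roughly like this (a sketch, assuming gevent; not necessarily the library's actual bookkeeping):

    import gevent

    def process(self, job):
        '''Run a job in its own greenlet, tracked by jid'''
        greenlet = gevent.spawn(job.process)
        self.greenlets[job.jid] = greenlet
        try:
            greenlet.join()
        finally:
            self.greenlets.pop(job.jid, None)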
Example No. 14
    def run(self):
        # If this worker is meant to be resumable, then we should find out
        # what jobs this worker was working on beforehand.
        if self.resume:
            jids_to_resume = self.client.workers[self.client.worker_name]['jobs']
        else:
            jids_to_resume = []
        
        pids = []
        for i in range(self.count):
            slot = {
                'worker_id': i,
                'sandbox': os.path.join(
                    self.workdir, 'qless-py-workers', 'sandbox-%i' % i),
            }
            cpid = os.fork()
            if cpid:
                logger.info('Spawned worker %i' % cpid)
                self.sandboxes[cpid] = slot
                pids.append(str(cpid))
            else:
                # Set the value of the metadata so that jobs can detect
                # what worker they're running on
                import qless.worker
                qless.worker.meta = slot
                # Make note that we're not the master, and then save our
                # sandbox and worker id for reference
                self.master    = False
                self.sandbox   = slot['sandbox']
                self.worker_id = slot['worker_id']
                # Also, we should take our share of the jobs that we want
                # to resume, if any. Use integer division so the slice
                # bounds stay valid indices under Python 3.
                start = (i * len(jids_to_resume)) // self.count
                end = ((i + 1) * len(jids_to_resume)) // self.count
                self.jids = jids_to_resume[start:end]
                return self.work()
        
        with open(os.path.join(self.workdir, 'workers-pid.txt'), 'w') as f:
            f.write(str(os.getpid()))
            f.write('\n')
            for pid in pids:
                f.write(pid)
                f.write('\n')

        while self.master:
            try:
                pid, status = os.wait()
                logger.warn('Worker %i died with status %i from signal %i' %
                            (pid, status >> 8, status & 0xff))
                slot = self.sandboxes.pop(pid)
                cpid = os.fork()
                if cpid:
                    logger.info('Spawned replacement worker %i' % cpid)
                    self.sandboxes[cpid] = slot
                else:
                    # Set the value of the metadata so that jobs can detect
                    # what worker they're running on
                    import qless.worker
                    qless.worker.meta = slot
                    # Make note that we're not the master, and then save our
                    # sandbox and worker id for reference
                    self.master    = False
                    self.sandbox   = slot['sandbox']
                    self.worker_id = slot['worker_id']
                    # NOTE: If the worker died, we assume something about
                    # the job(s) it was working on caused the exit, so we
                    # ignore any jobs we might have been working on rather
                    # than resume them. Resuming them correctly would also
                    # be significantly harder than the startup case above
                    # of distributing work to brand-new workers.
                    return self.work()
            except KeyboardInterrupt:
                break
        
        if self.master:
            self.stop()
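The start/end arithmetic in the child branch hands each of the count workers a contiguous, near-even slice of the resumable jids. A worked example with 5 jids and 3 workers (illustrative values):

    jids = ['a', 'b', 'c', 'd', 'e']
    count = 3
    for i in range(count):
        start = (i * len(jids)) // count      # 0, 1, 3
        end = ((i + 1) * len(jids)) // count  # 1, 3, 5
        print(i, jids[start:end])  # ['a'], ['b', 'c'], ['d', 'e']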