Example #1
0
    def update_worker(self):
        """
        Maintain this worker's liveness record.

        On the first call, register this worker in the database; afterwards,
        refresh its row. If some other process has already marked this worker
        as LOST, terminate all running tasks and begin shutdown. Otherwise
        update the heartbeat timestamp and sweep for other lost workers so
        their tasks get re-queued.
        """
        if not self.worker:
            # first invocation: create our own worker row with DB-side timestamps
            self.worker = Worker.objects.create(
                name=self.name,
                start_timestamp=functions.Now(),
                last_heartbeat=functions.Now())
        else:
            # pick up status changes made by other workers (e.g. LOST)
            self.worker.refresh_from_db()

        if self.worker.status == Worker.LOST:
            # someone else marked us as lost, terminate all tasks and exit
            logger.warning('Marked as lost, committing harakiri')
            self.state = State.terminate
            self.executor.mark_terminated(self.executor.get_running_ids())
            return

        # update our timestamp so no one marks us as lost
        self.worker.last_heartbeat = functions.Now()
        self.worker.save()

        # look for lost workers and re-queue their tasks
        Worker.find_lost(self.timeout)
Example #2
0
 def handle_sigint(*args):
     # First Ctrl-C: request a graceful shutdown (let running tasks finish),
     # then rebind SIGINT to the SIGTERM handler so a second Ctrl-C
     # kills running tasks and exits immediately.
     self.state = State.shutdown
     logger.warning(
         'Received SIGINT, shutting down after all tasks have finished.'
     )
     logger.warning('Press CTL-C again to shut down immediately.')
     signal.signal(signal.SIGINT, handle_sigterm)
Example #3
0
    def run(self):
        """
        Main event loop.

        Runs one pass of worker bookkeeping (heartbeat, scheduling, task
        start/termination) per iteration while running, drains executor
        output, and throttles to at most one iteration per second.

        Why not threads/gevent/asyncio?
            Because this is simple and maybe more efficient
        """
        self.handle_signals()
        self.executor = Manager()
        self.state = State.running
        loop_start = time.time()
        logger.warning('Starting YAWN worker with concurrency=%s',
                       self.concurrency)

        while True:

            if self.state == State.running:

                # update own status and check for lost workers
                self.update_worker()

                # check for tasks that should be queued
                self.schedule_workflows()

                # run queued tasks
                self.start_tasks()

                self.mark_terminated()

            elif not self.executor.get_running_ids():
                # shutdown: exit loop once all tasks are finished
                break

            # check status on running tasks
            self.results.extend(self.executor.read_output())
            self.save_results()

            # don't run the loop more than once per second
            loop_duration = time.time() - loop_start
            time.sleep(max(1 - loop_duration, 0))
            # Fix: restart the clock *after* sleeping. The original
            # `loop_start += loop_duration` left loop_start at the moment the
            # sleep began, so the next iteration's duration included the sleep
            # and its own sleep collapsed to ~0 — iterations ran in
            # back-to-back pairs instead of once per second.
            loop_start = time.time()

        logger.warning('Exiting')
        self.worker.status = Worker.EXITED
        self.worker.save()
Example #4
0
    def find_lost(timeout):
        """
        Find ACTIVE workers whose heartbeat is older than `timeout` seconds,
        mark them LOST, and mark each of their RUNNING executions as lost so
        the tasks can be re-queued.
        """
        from yawn.task.models import Execution

        # Make a sparse index so looking up active workers is fast:
        # CREATE INDEX yawn_worker_active ON yawn_worker (status) WHERE status = 'active'
        cutoff = functions.Now() - timedelta(seconds=timeout)
        stale_workers = Worker.objects.filter(
            status=Worker.ACTIVE, last_heartbeat__lt=cutoff)

        for stale in stale_workers:
            logger.warning('Marking %r as lost', stale)
            stale.status = Worker.LOST
            stale.save()

            # any execution still flagged RUNNING on a dead worker is lost too
            for execution in stale.execution_set.filter(status=Execution.RUNNING):
                logger.warning('Marking %r as lost', execution)
                execution.mark_finished(lost=True)
Example #5
0
    def read_output(self, timeout=0.1) -> typing.List[Result]:
        """
        Read from all ready subprocess file descriptors.

        Decodes each chunk as UTF-8, reading up to 3 extra bytes to complete
        a partial trailing character; undecodable chunks are discarded. Also
        kills executions past their deadline and cleans up finished
        processes once all of their pipes are closed.

        :param timeout: in seconds, provided for testing
        :return: one Result per execution that produced output or exited
        """
        all_results = {}
        read_set = self.pipes.keys()

        try:
            # 1/10 second timeout so we return control to the calling event loop
            # if no file descriptors are ready to read
            read_ready, _, _ = select.select(read_set, [], [], timeout)
        except select.error as exc:
            # a signal could interrupt. fixed in python 3.5:
            # https://www.python.org/dev/peps/pep-0475/
            if exc.args[0] != errno.EINTR:
                raise
            logger.warning('Received select.error: %s', exc)
            read_ready = []

        # read from each ready file descriptor
        for fd in read_ready:
            execution = self.pipes[fd]
            data = fd.read(1024)
            if data == b'':
                # the pipe is closed
                fd.close()
                del self.pipes[fd]
                continue

            # keep reading up to 3 more bytes until we get a full UTF-8 character
            for _ in range(3):
                try:
                    data = data.decode('utf-8')
                    break
                except UnicodeDecodeError:
                    data += fd.read1(1)
            else:
                logger.error('Unable to decode byte data! Throwing it away.')
                # Fix: actually discard the bytes. The original fell through
                # with `data` still a bytes object, storing bytes where
                # callers of Result expect a str.
                data = ''

            result = all_results.setdefault(execution.id, Result(execution.id))
            if fd == execution.process.stdout.raw:
                result.stdout = data
            else:
                result.stderr = data

        # check if each running process needs cleanup
        for execution in list(self.running.values()):
            # check if the process should be killed
            if execution.deadline and execution.deadline < time.monotonic():
                # kill the process group, in an attempt to kill any children it has spawned
                try:
                    # setpgrp above sets the PGID to the subprocess' PID
                    os.killpg(execution.process.pid, signal.SIGKILL)
                    logger.info('Terminated execution #%s', execution.id)
                except (ProcessLookupError, PermissionError) as exc:
                    logger.info('Execution #%s was marked to kill but has already exited (%s)',
                                execution.id, exc.__class__.__name__)

            # check if the process has exited
            exit_code = execution.process.poll()
            if exit_code is None:
                continue  # we'll check again later

            # we may not have read everything available, so only cleanup after all pipes are closed
            open_pipes = (
                {execution.process.stdout.raw, execution.process.stderr.raw}
                & set(self.pipes.keys())
            )
            if not open_pipes:
                result = all_results.setdefault(execution.id, Result(execution.id))
                result.returncode = execution.process.returncode
                del self.running[execution.id]

        return list(all_results.values())
Example #6
0
 def handle_sigterm(*args):
     # Immediate shutdown: flag the main loop to terminate and ask the
     # executor to kill everything it currently has running.
     self.state = State.terminate
     # kill running tasks
     self.executor.mark_terminated(self.executor.get_running_ids())
     logger.warning(
         'Received SIGTERM, killing running tasks and exiting.')