def update_worker(self):
    """
    Register this worker, or refresh its heartbeat. If another worker
    has marked us as lost, terminate all running tasks and exit.
    Otherwise, look for lost workers and re-queue their tasks.
    """
    if not self.worker:
        self.worker = Worker.objects.create(
            name=self.name,
            start_timestamp=functions.Now(),
            last_heartbeat=functions.Now())
    else:
        self.worker.refresh_from_db()
        if self.worker.status == Worker.LOST:
            # someone else marked us as lost, terminate all tasks and exit
            logger.warning('Marked as lost, committing harakiri')
            self.state = State.terminate
            self.executor.mark_terminated(self.executor.get_running_ids())
            return
    # update our timestamp so no one marks us as lost
    self.worker.last_heartbeat = functions.Now()
    self.worker.save()
    # look for lost workers and re-queue their tasks
    Worker.find_lost(self.timeout)
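# For context, a minimal sketch of the Worker model that update_worker() and
# find_lost() rely on. The field names and status constants are inferred from
# their usage here; the actual definition in yawn.worker.models may differ.
# Execution (in yawn.task.models) presumably has a ForeignKey to Worker,
# which provides the execution_set used in find_lost().

from django.db import models

class Worker(models.Model):
    ACTIVE = 'active'
    LOST = 'lost'
    EXITED = 'exited'
    STATUS_CHOICES = [(status, status) for status in (ACTIVE, LOST, EXITED)]

    name = models.TextField()
    status = models.TextField(choices=STATUS_CHOICES, default=ACTIVE)
    start_timestamp = models.DateTimeField()
    last_heartbeat = models.DateTimeField()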
def handle_sigint(*args):
    self.state = State.shutdown
    logger.warning(
        'Received SIGINT, shutting down after all tasks have finished.')
    logger.warning('Press CTRL-C again to shut down immediately.')
    # a second SIGINT should kill running tasks and exit, like SIGTERM
    signal.signal(signal.SIGINT, handle_sigterm)
def run(self):
    """
    Main event loop. Why not threads/gevent/asyncio?
    Because this is simple, and maybe more efficient.
    """
    self.handle_signals()
    self.executor = Manager()
    self.state = State.running
    loop_start = time.time()
    logger.warning('Starting YAWN worker with concurrency=%s', self.concurrency)
    while True:
        if self.state == State.running:
            # update own status and check for lost workers
            self.update_worker()
            # check for tasks that should be queued
            self.schedule_workflows()
            # run queued tasks
            self.start_tasks()
            self.mark_terminated()
        elif not self.executor.get_running_ids():
            # shutdown: exit the loop once all tasks are finished
            break
        # check status on running tasks
        self.results.extend(self.executor.read_output())
        self.save_results()
        # don't run the loop more than once per second
        loop_duration = time.time() - loop_start
        time.sleep(max(1 - loop_duration, 0))
        loop_start = time.time()  # restart the timer after sleeping
    logger.warning('Exiting')
    self.worker.status = Worker.EXITED
    self.worker.save()
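# A minimal sketch of the State flag referenced throughout. Its actual
# definition lives elsewhere in the package, so the member values are an
# assumption; only the names running/shutdown/terminate appear in this code.

import enum

class State(enum.Enum):
    running = 1    # normal operation
    shutdown = 2   # finish running tasks, then exit
    terminate = 3  # kill running tasks and exit immediately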
def find_lost(timeout):
    from yawn.task.models import Execution
    # Use a partial index so looking up active workers stays fast:
    # CREATE INDEX yawn_worker_active ON yawn_worker (status) WHERE status = 'active'
    lost = Worker.objects.filter(
        status=Worker.ACTIVE,
        last_heartbeat__lt=functions.Now() - timedelta(seconds=timeout))
    for worker in lost:
        logger.warning('Marking %r as lost', worker)
        worker.status = Worker.LOST
        worker.save()
        executions = worker.execution_set.filter(status=Execution.RUNNING)
        for execution in executions:
            logger.warning('Marking %r as lost', execution)
            execution.mark_finished(lost=True)
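# The partial index above could be created in a migration with RunSQL. A
# hedged sketch, assuming PostgreSQL, the yawn_worker table name, and a
# hypothetical migration dependency -- not the project's actual history:

from django.db import migrations

class Migration(migrations.Migration):
    dependencies = [('worker', '0001_initial')]  # hypothetical
    operations = [
        migrations.RunSQL(
            sql="CREATE INDEX yawn_worker_active ON yawn_worker (status) "
                "WHERE status = 'active'",
            reverse_sql='DROP INDEX yawn_worker_active',
        ),
    ]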
def read_output(self, timeout=0.1) -> typing.List[Result]:
    """
    Read from all ready subprocess file descriptors.

    Output is decoded before it is returned: if a 1024-byte read ends in
    a partial UTF-8 character, up to 3 more bytes are read to complete it.

    :param timeout: in seconds, provided for testing
    """
    all_results = {}
    read_set = self.pipes.keys()
    try:
        # 1/10 second timeout so we return control to the calling event loop
        # if no file descriptors are ready to read
        read_ready, _, _ = select.select(read_set, [], [], timeout)
    except select.error as exc:
        # a signal could interrupt. fixed in python 3.5:
        # https://www.python.org/dev/peps/pep-0475/
        if exc.args[0] != errno.EINTR:
            raise
        logger.warning('Received select.error: %s', exc)
        read_ready = []

    # read from each ready file descriptor
    for fd in read_ready:
        execution = self.pipes[fd]
        data = fd.read(1024)
        if data == b'':
            # the pipe is closed
            fd.close()
            del self.pipes[fd]
            continue

        # keep reading up to 3 more bytes until we get a full UTF-8 character
        for _ in range(3):
            try:
                data = data.decode('utf-8')
                break
            except UnicodeDecodeError:
                data += fd.read1(1)
        else:
            logger.error('Unable to decode byte data! Throwing it away.')
            continue

        result = all_results.setdefault(execution.id, Result(execution.id))
        if fd == execution.process.stdout.raw:
            result.stdout = data
        else:
            result.stderr = data

    # check if each running process needs cleanup
    for execution in list(self.running.values()):

        # check if the process should be killed
        if execution.deadline and execution.deadline < time.monotonic():
            # kill the process group, in an attempt to kill any children it has spawned
            try:
                # setpgrp at launch sets the PGID to the subprocess' PID
                os.killpg(execution.process.pid, signal.SIGKILL)
                logger.info('Terminated execution #%s', execution.id)
            except (ProcessLookupError, PermissionError) as exc:
                logger.info('Execution #%s was marked to kill but has already exited (%s)',
                            execution.id, exc.__class__.__name__)

        # check if the process has exited
        exit_code = execution.process.poll()
        if exit_code is None:
            continue  # still running, check again later

        # we may not have read everything available, so only clean up
        # after all of this execution's pipes are closed
        open_pipes = (
            {execution.process.stdout.raw, execution.process.stderr.raw}
            & set(self.pipes.keys())
        )
        if not open_pipes:
            result = all_results.setdefault(execution.id, Result(execution.id))
            result.returncode = execution.process.returncode
            del self.running[execution.id]

    return list(all_results.values())
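# Why "up to 3 more bytes" is enough above: a UTF-8 character occupies at most
# 4 bytes, so a fixed-size read that splits one mid-character can be at most
# 3 bytes short. A standalone illustration of the retry loop (the stream
# contents here are just an example):

import io

def demo_partial_utf8():
    stream = io.BytesIO('snowman: ☃'.encode('utf-8'))  # the snowman is 3 bytes
    data = stream.read(10)  # stops after the snowman's first byte
    for _ in range(3):
        try:
            text = data.decode('utf-8')
            break
        except UnicodeDecodeError:
            data += stream.read(1)  # fetch one more byte and retry
    else:
        raise AssertionError('not valid UTF-8 even after 3 extra bytes')
    print(text)  # prints 'snowman: ☃'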
def handle_sigterm(*args):
    self.state = State.terminate
    # kill running tasks
    self.executor.mark_terminated(self.executor.get_running_ids())
    logger.warning('Received SIGTERM, killing running tasks and exiting.')
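# For reference, a sketch of the handle_signals() that run() calls. Because
# handle_sigint and handle_sigterm close over `self` (and handle_sigint
# references handle_sigterm), both are presumably nested inside it; the
# actual layout in the worker module may differ.

def handle_signals(self):
    signal.signal(signal.SIGINT, handle_sigint)
    signal.signal(signal.SIGTERM, handle_sigterm)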