def cancel_job(job: Job) -> Job:
    """
    Cancel a job, if it is currently active.

    A compound job is cancelled by recursively cancelling each of its
    children. Any other job is revoked through Celery with the `terminate`
    option, which kills the worker child process. That is not normally
    recommended, but is OK here because there is only one task per process.
    See `worker/worker.py` for the reasoning for using `SIGUSR1`.

    See https://docs.celeryproject.org/en/stable/userguide/workers.html#revoke-revoking-tasks

    Returns the job passed in; a job that is not active is returned untouched.
    """
    # Guard: an inactive job needs no cancelling
    if not job.is_active:
        return job

    if not JobMethod.is_compound(job.method):
        # A simple job: ask Celery to terminate the task
        app.control.revoke(str(job.id), terminate=True, signal="SIGUSR1")
    else:
        # A compound job: cancel each of the children in turn
        for child_job in job.children.all():
            cancel_job(child_job)

    # Record the cancellation and drop the secrets, which are no longer needed
    job.status = JobStatus.CANCELLED.value
    job.is_active = False
    job.secrets = None
    job.save()

    return job
def dispatch_job(job: Job) -> Job:
    """
    Send a job to a queue.

    Decides which queue a job should be sent to and sends it.
    The queue can depend upon both the project and the account (either the
    account that the project is linked to, or the default account of the job
    creator).

    Compound jobs are not sent to a queue themselves; instead their children
    are dispatched (all at once for parallel jobs, otherwise only the first,
    with the rest marked as WAITING).

    Returns the job, after it has been saved.

    Raises:
        ValueError: if the job's method is not a known job method.
        PermissionDenied: if the method is staff-only and the job has no
            creator, or the creator is not staff.
    """
    if not JobMethod.is_member(job.method):
        raise ValueError("Unknown job method '{}'".format(job.method))

    if job.method in settings.JOB_METHODS_STAFF_ONLY and (
        not job.creator or not job.creator.is_staff
    ):
        raise PermissionDenied

    if JobMethod.is_compound(job.method):
        children = job.children.all().order_by("id")
        if len(children) == 0:
            # If there are no children (e.g. a pull job for a project with no sources)
            # then job is immediately finished
            job.runtime = 0
            job.is_active = False
            job.status = JobStatus.SUCCESS.value
        else:
            if job.method == JobMethod.parallel.value:
                # Dispatch all child jobs simultaneously
                for child in children:
                    dispatch_job(child)
            else:
                # Dispatch the first child; subsequent children
                # will be status WAITING and will get dispatched later
                # on update of the parent.
                for index, child in enumerate(children):
                    if index == 0:
                        dispatch_job(child)
                    else:
                        child.is_active = True
                        child.status = JobStatus.WAITING.value
                        child.save()

            job.is_active = True
            job.status = JobStatus.DISPATCHED.value
    else:
        # Find queues that have active workers on them, ordered by ascending
        # `priority` so that index 0 is the lowest priority queue (which is
        # what the indexing further below relies on).
        # `distinct()` is needed because filtering across the `workers`
        # relation can otherwise yield the same queue once per matching
        # worker, which would skew the priority-based indexing.
        # NOTE(review): assumes `workers` is a multi-valued relation - confirm.
        queues = list(
            Queue.objects.filter(
                workers__in=Worker.objects.filter(
                    # Has not finished
                    finished__isnull=True,
                    # Has been updated in the last x minutes
                    updated__gte=timezone.now() - datetime.timedelta(minutes=15),
                ),
            )
            .order_by("priority")
            .distinct()
        )

        # Fallback to the default Stencila queue
        # Apart from anything else having this fallback is useful in development
        # because it means that the `overseer` service does not need to be running
        # in order to keep track of the numbers of workers listening on each queue
        # (during development `worker`s listen to the default queue)
        if len(queues) == 0:
            logger.warning("No queues found with active workers")
            queue, _ = Queue.get_or_create(account_name="stencila", queue_name="default")
        else:
            if job.creator is None or job.project is None:
                # Jobs created by anonymous users go on the lowest
                # priority queue
                priority = 1
            else:
                # The priority of other jobs is determined by the
                # account tier of the project
                priority = job.project.account.tier.id
            # Cap at the number of available queues so that a priority higher
            # than the number of queues still maps to the last one
            queue = queues[min(len(queues), priority) - 1]

        # Add the job's project id, key and secrets to its kwargs.
        # Doing this here ensures it is done for all jobs
        # and avoids putting the secrets in the job's `params` field.
        kwargs = dict(**job.params) if job.params else {}
        kwargs["project"] = job.project.id if job.project else None
        kwargs["key"] = job.key
        kwargs["secrets"] = job.secrets

        # Send the job to the queue
        task = signature(
            job.method,
            kwargs=kwargs,
            queue=queue.name,
            task_id=str(job.id),
            app=app,
        )
        task.apply_async()

        job.queue = queue
        job.is_active = True
        job.status = JobStatus.DISPATCHED.value

    job.save()
    return job
def update_job(job: Job, data=None, force: bool = False) -> Job:
    """
    Update a job.

    This method is triggered by a PATCH request from the `overseer` service.
    It updates the status, and other fields of the job, and if the job has a
    parent, updates its status too.

    See https://stackoverflow.com/a/38267978 for important considerations
    in using AsyncResult.

    Parameters:
        job: The job to update.
        data: Mapping of field names to new values. For a job that is not
            compound it must contain a truthy `status`. Not used for
            compound jobs, whose state is derived from their children.
        force: Update the job even if it is not currently active.

    Returns the job. It is returned unsaved if it is inactive (and `force`
    is false), or if the new status is of lower rank than the existing one.
    """
    # Use a fresh dict per call rather than a shared mutable default
    if data is None:
        data = {}

    # Avoid unnecessary update
    if not job.is_active and not force:
        return job

    was_active = job.is_active

    if JobMethod.is_compound(job.method):
        # Update the status of compound jobs based on children
        status = job.status
        is_active = False
        all_previous_succeeded = True
        any_previous_failed = False
        for child in job.get_children():
            # If the child has a 'higher' status then update the
            # status of the compound job
            status = JobStatus.highest([status, child.status])

            # If the child is still waiting then...
            if child.status == JobStatus.WAITING.value:
                # If all previous have succeeded, dispatch it
                if all_previous_succeeded:
                    dispatch_job(child)
                # If any previous have failed, cancel it
                elif any_previous_failed:
                    cancel_job(child)

            if child.status != JobStatus.SUCCESS.value:
                all_previous_succeeded = False
            if child.status == JobStatus.FAILURE.value:
                any_previous_failed = True

            # If the child is still active then the compound job is active
            if child.is_active:
                is_active = True

        job.is_active = is_active
        job.status = JobStatus.RUNNING.value if is_active else status
    else:
        status = data.get("status")
        assert status

        # Do not do anything if the new status is lower rank than the
        # existing status. This can exist for example when a job is
        # terminated (the SUCCESS state is sent after TERMINATED)
        if JobStatus.rank(status) < JobStatus.rank(job.status):
            return job

        # Update fields sent by `overseer` service, including `status`
        for key, value in data.items():
            setattr(job, key, value)

        def async_result():
            return AsyncResult(str(job.id), app=app)

        # If job succeeded then get the result if we haven't already
        if status == JobStatus.SUCCESS.value and job.result is None:
            response = None
            attempts = 0
            while not response and attempts < 5:
                try:
                    response = async_result().get(timeout=30)
                except Exception:
                    # Catch all errors, but log them. Occasional
                    # errors encountered in prod include ResponseError and TimeoutError
                    logger.warning(
                        "Error getting async result",
                        exc_info=True,
                        extra=dict(id=job.id, method=job.method, attempts=attempts),
                    )
                    time.sleep(1)
                # Count every attempt, not only those that raised, so that a
                # falsy response can never cause this loop to spin forever
                attempts += 1

            if response:
                job.result = response.get("result")
                job.log = response.get("log")
            else:
                logger.error(
                    "Unable to get async result",
                    extra=dict(id=job.id, method=job.method, attempts=attempts),
                )
                job.status = JobStatus.FAILURE.value
                job.error = dict(type="RuntimeError", message="Unable to get result of job")

        # If job failed then get the error
        # For FAILURE, `info` is the raised Exception
        elif status == JobStatus.FAILURE.value:
            info = async_result().info
            if info:
                job.error = dict(type=type(info).__name__, message=str(info))

        # If the job has just ended then mark it as inactive
        if JobStatus.has_ended(status):
            job.is_active = False

    # If the job is no longer active clear its secrets and run its callback
    if was_active and not job.is_active:
        job.secrets = None
        job.run_callback()

    # Save before updating parent (and then this again)
    job.save()

    # If the job has a parent then update it too
    if job.parent:
        update_job(job.parent)

    return job