def update_job(job: Job) -> Job:
    """
    Update a job based on its result.

    Refreshes the job's status (and, when available, its `result`, `log`,
    `url` or `error`) from the job's `AsyncResult`. It is called when
    (1) the job is retrieved (i.e. GET) and (2) when it is updated with
    other information (i.e. PATCH).

    See https://stackoverflow.com/a/38267978 for important considerations
    in using AsyncResult.

    Args:
        job: The job to refresh. It is modified in place and saved when a
            refresh takes place.

    Returns:
        The same `job` instance that was passed in.
    """
    # Get an async result from the backend if the job is not recorded as
    # ended, or if it has no stored result, or no stored error.
    # NOTE(review): because the three tests are OR-ed, the refresh is only
    # skipped when the job has ended AND has both a result and an error;
    # confirm that this is the intended condition.
    if not JobStatus.has_ended(job.status) or job.result is None or job.error is None:
        async_result = AsyncResult(str(job.id), app=celery)

        # Read `status` and `info` once each and work from those snapshots.
        # Accessing an attribute of an `AsyncResult` can query the backend
        # again (see the link in the docstring), so re-reading `info` for
        # the type check could inspect a different value than the one the
        # fields are copied from below.
        status = async_result.status
        info = async_result.info

        job.status = status

        if status in (
            JobStatus.RUNNING.value,
            JobStatus.SUCCESS.value,
        ) and isinstance(info, dict):
            # For RUNNING, `info` is the `meta` kwarg passed to
            # `Job.update_state()` call in the worker process.
            # For SUCCESS, `info` is the value returned
            # by the `Job.success()` method in the worker process.
            for field in ("result", "log", "url"):
                if field in info:
                    setattr(job, field, info[field])

        if status == JobStatus.FAILURE.value:
            # For FAILURE, `info` is the raised Exception
            job.error = dict(type=type(info).__name__, message=str(info))

        # Let the parent job (if any) re-evaluate itself
        if job.parent is not None:
            check_job(job.parent)

        job.save()

    return job
def update_job(job: Job, data=None, force: bool = False) -> Job:
    """
    Update a job.

    This method is triggered by a PATCH request from the `overseer`
    service. It updates the status, and other fields of the job, and if
    the job has a parent, updates its status too.

    See https://stackoverflow.com/a/38267978 for important considerations
    in using AsyncResult.

    Args:
        job: The job to update. It is modified in place and saved.
        data: Mapping of job field names to new values. For a job whose
            method is not compound it must contain a truthy `"status"`.
            Defaults to an empty mapping. It is only read, never modified.
        force: Update the job even if it is no longer active.

    Returns:
        The same `job` instance that was passed in.

    Raises:
        AssertionError: If the job's method is not compound and `data`
            has no truthy `"status"`.
    """
    # Use a fresh mapping per call rather than a shared mutable default
    if data is None:
        data = {}

    # Avoid unnecessary update
    if not job.is_active and not force:
        return job

    was_active = job.is_active

    if JobMethod.is_compound(job.method):
        # Update the status of compound jobs based on children
        status = job.status
        is_active = False
        all_previous_succeeded = True
        any_previous_failed = False
        for child in job.get_children():
            # If the child has a 'higher' status then update the
            # status of the compound job
            status = JobStatus.highest([status, child.status])

            # If the child is still waiting then...
            if child.status == JobStatus.WAITING.value:
                # If all previous have succeeded, dispatch it
                if all_previous_succeeded:
                    dispatch_job(child)
                # If any previous have failed, cancel it
                elif any_previous_failed:
                    cancel_job(child)

            # Record this child's outcome for the children that follow it
            if child.status != JobStatus.SUCCESS.value:
                all_previous_succeeded = False
            if child.status == JobStatus.FAILURE.value:
                any_previous_failed = True

            # If the child is still active then the compound job is active
            if child.is_active:
                is_active = True

        job.is_active = is_active
        job.status = JobStatus.RUNNING.value if is_active else status

    else:
        # A status is required when updating a non-compound job
        status = data.get("status")
        assert status

        # Do not do anything if the new status is lower rank than the
        # existing status. This can exist for example when a job is
        # terminated (the SUCCESS state is sent after TERMINATED)
        if JobStatus.rank(status) < JobStatus.rank(job.status):
            return job

        # Update fields sent by `overseer` service, including `status`
        for key, value in data.items():
            setattr(job, key, value)

        def async_result():
            return AsyncResult(str(job.id), app=app)

        # If job succeeded then get the result if we haven't already
        if status == JobStatus.SUCCESS.value and job.result is None:
            response = None
            attempts = 0
            while not response and attempts < 5:
                try:
                    response = async_result().get(timeout=30)
                except Exception:
                    # Catch all errors, but log them. Occasional
                    # errors encountered in prod include ResponseError and TimeoutError
                    logger.warning(
                        "Error getting async result",
                        exc_info=True,
                        extra=dict(id=job.id, method=job.method, attempts=attempts),
                    )
                    time.sleep(1)
                # Count every attempt, not only those that raised, so that
                # an empty (falsy) response returned without an exception
                # cannot keep this loop running forever.
                attempts += 1

            if response:
                job.result = response.get("result")
                job.log = response.get("log")
            else:
                logger.error(
                    "Unable to get async result",
                    extra=dict(id=job.id, method=job.method, attempts=attempts),
                )
                job.status = JobStatus.FAILURE.value
                job.error = dict(type="RuntimeError", message="Unable to get result of job")

        # If job failed then get the error
        # For FAILURE, `info` is the raised Exception
        elif status == JobStatus.FAILURE.value:
            info = async_result().info
            if info:
                job.error = dict(type=type(info).__name__, message=str(info))

        # If the job has just ended then mark it as inactive
        if JobStatus.has_ended(status):
            job.is_active = False

    # If the job is no longer active clear its secrets and run its callback
    if was_active and not job.is_active:
        job.secrets = None
        job.run_callback()

    # Save before updating parent (and then this again)
    job.save()

    # If the job has a parent then update it too
    if job.parent:
        update_job(job.parent)

    return job