def create_run_task(run_id, inputs, output_directory=None):
    logger.info(format_log("Creating and validating run", obj_id=run_id))
    run = RunObjectFactory.from_definition(run_id, inputs)
    run.ready()
    run.to_db()
    submit_job.delay(run_id, output_directory)
    logger.info(format_log("Run is ready", obj=run))

def job_processor(job_id):
    logger.info(format_log("ETL Creating job", obj_id=job_id))
    job = JobObject(job_id)
    logger.info(format_log("ETL Processing job with args %s" % str(job.job.args), obj=job.job))
    job.process()

def fetch_requests_lims():
    logger.info("ETL fetching requestIDs")
    running = Job.objects.filter(
        run=TYPES["DELIVERY"],
        status__in=(JobStatus.CREATED, JobStatus.IN_PROGRESS, JobStatus.WAITING_FOR_CHILDREN),
    )
    if len(running) > 0:
        logger.info(format_log("ETL job already in progress", obj=running.first()))
        return
    latest = Job.objects.filter(run=TYPES["DELIVERY"]).order_by("-created_date").first()
    timestamp = None
    if latest:
        timestamp = int(latest.created_date.timestamp()) * 1000
    else:
        timestamp = int((datetime.datetime.now() - datetime.timedelta(hours=120)).timestamp()) * 1000
    job = Job(
        run="beagle_etl.jobs.lims_etl_jobs.fetch_new_requests_lims",
        args={"timestamp": timestamp},
        status=JobStatus.CREATED,
        max_retry=3,
        children=[],
    )
    job.save()
    logger.info(format_log("ETL fetch_new_requests_lims job created", obj=job))

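# Illustrative helper, not part of the original module: fetch_requests_lims (above) and
# check_missing_requests (below) both pass epoch timestamps in milliseconds to the
# fetch_new_requests_lims job. datetime.timestamp() returns seconds, hence the "* 1000".
# The helper name is hypothetical and nothing calls it.
def _example_lims_timestamp_ms(hours_back=120):
    import datetime

    # mirrors the fallback window used when no previous DELIVERY job exists
    cutoff = datetime.datetime.now() - datetime.timedelta(hours=hours_back)
    return int(cutoff.timestamp()) * 1000
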
def abort_job_on_ridgeback(job_id):
    response = requests.get(settings.RIDGEBACK_URL + "/v0/jobs/%s/abort/" % job_id)
    if response.status_code == 200:
        logger.info(format_log("Job aborted", obj_id=job_id))
        return True
    logger.error(format_log("Failed to abort job", obj_id=job_id))
    return None

def scheduler():
    jobs = get_pending_jobs()
    logger.info("Pending jobs: %s" % jobs)
    for job in jobs:
        j = Job.objects.get(id=job.id)
        if not j.is_locked:
            j.lock_job()
            logger.info(format_log("ETL submitting job", obj=job))
            job_processor.delay(j.id)
        else:
            logger.info(format_log("ETL job already locked", obj=job))

def create_jobs_from_request(request_id, operator_id, job_group_id, job_group_notifier_id=None, pipeline=None):
    logger.info(
        format_log("Creating operator with %s" % operator_id, job_group_id=job_group_id, request_id=request_id)
    )
    operator_model = Operator.objects.get(id=operator_id)
    if not job_group_notifier_id:
        try:
            job_group = JobGroup.objects.get(id=job_group_id)
        except JobGroup.DoesNotExist:
            logger.info(
                format_log(
                    "Job group does not exist for operator %s" % operator_id,
                    job_group_id=job_group_id,
                    request_id=request_id,
                )
            )
            return
        try:
            job_group_notifier_id = notifier_start(job_group, request_id, operator=operator_model)
            request_obj = Request.objects.filter(request_id=request_id).first()
            if request_obj:
                delivery_date_event = SetDeliveryDateFieldEvent(
                    job_group_notifier_id, str(request_obj.delivery_date)
                ).to_dict()
                send_notification.delay(delivery_date_event)
        except Exception as e:
            logger.info(
                format_log(
                    "Failed to instantiate notifier for operator %s" % operator_id,
                    job_group_id=job_group_id,
                    request_id=request_id,
                )
            )
    operator = OperatorFactory.get_by_model(
        operator_model,
        job_group_id=job_group_id,
        job_group_notifier_id=job_group_notifier_id,
        request_id=request_id,
        pipeline=pipeline,
    )
    _set_link_to_run_ticket(request_id, job_group_notifier_id)
    generate_description(job_group_id, job_group_notifier_id, request_id)
    generate_label(job_group_notifier_id, request_id)
    create_jobs_from_operator(operator, job_group_id, job_group_notifier_id)

def fail_job(self, run_id, error_message, lsf_log_location=None, input_json_location=None):
    lock_id = "run_lock_%s" % run_id
    with memcache_task_lock(lock_id, self.app.oid) as acquired:
        if acquired:
            run = RunObjectFactory.from_db(run_id)
            if run.run_obj.is_failed:
                logger.info(format_log("Run Fail already processed", obj=run.run_obj))
                return
            restart_run = run.run_obj.set_for_restart()
            if not restart_run:
                run.fail(error_message)
                run.to_db()
                job_group_notifier = run.job_group_notifier
                job_group_notifier_id = str(job_group_notifier.id) if job_group_notifier else None
                ci_review = SetCIReviewEvent(job_group_notifier_id).to_dict()
                send_notification.delay(ci_review)
                _upload_qc_report(run.run_obj)
                _job_finished_notify(run, lsf_log_location, input_json_location)
            else:
                run_id, output_directory, execution_id = restart_run
                submit_job.delay(run_id, output_directory, execution_id)
        else:
            logger.warning("Run %s is being processed by another worker" % run_id)

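# Illustrative sketch, not part of the original module: fail_job, complete_job and abort_job all
# guard run state transitions with memcache_task_lock keyed by "run_lock_<run_id>", so only one
# worker processes a given run at a time. The real implementation lives elsewhere in this
# codebase; a minimal version of the same idea, assuming Django's cache framework, could look
# roughly like this (memcache_task_lock_sketch is a hypothetical name):
from contextlib import contextmanager

from django.core.cache import cache


@contextmanager
def memcache_task_lock_sketch(lock_id, owner_id, timeout=60 * 5):
    # cache.add is atomic and only succeeds if the key does not already exist,
    # which makes it usable as a simple distributed lock
    acquired = cache.add(lock_id, owner_id, timeout)
    try:
        yield acquired
    finally:
        if acquired:
            # only the worker that acquired the lock releases it
            cache.delete(lock_id)
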
def process(self):
    if self.job.status == JobStatus.CREATED:
        self.job.status = JobStatus.IN_PROGRESS
    elif self.job.status == JobStatus.IN_PROGRESS:
        self.job.retry_count = self.job.retry_count + 1
        try:
            self._process()
            self.job.status = JobStatus.WAITING_FOR_CHILDREN
        except Exception as e:
            if isinstance(e, ETLExceptions):
                message = {"message": str(e), "code": e.code}
            else:
                message = {"message": str(e)}
            if self.job.retry_count == self.job.max_retry:
                self.job.status = JobStatus.FAILED
                self.job.message = message
                self._job_failed()
    elif self.job.status == JobStatus.WAITING_FOR_CHILDREN:
        self._check_children()
    logger.info(format_log("ETL job in status: %s" % JobStatus(self.job.status).name, obj=self.job))
    self._unlock()
    self._save()

def submit_job(run_id, output_directory=None, execution_id=None):
    resume = None
    try:
        run = Run.objects.get(id=run_id)
    except Run.DoesNotExist:
        raise Exception("Failed to submit a run")
    run1 = RunObjectFactory.from_db(run_id)
    if run.resume:
        run2 = RunObjectFactory.from_db(run.resume)
        if run1.equal(run2):
            logger.info(format_log("Resuming run with execution id %s" % run2.run_obj.execution_id, obj=run))
            resume = str(run2.run_obj.execution_id)
        else:
            logger.info(format_log("Failed to resume run; run is not equal to run %s" % str(run2), obj=run))
    if execution_id:
        resume = execution_id
    if not output_directory:
        output_directory = os.path.join(run.app.output_directory, str(run_id))
    job = run1.dump_job(output_directory=output_directory)
    logger.info(format_log("Job ready for submitting", obj=run))
    if resume:
        url = urljoin(settings.RIDGEBACK_URL, "/v0/jobs/{id}/resume/".format(id=resume))
        job = {"root_dir": output_directory}
    else:
        url = settings.RIDGEBACK_URL + "/v0/jobs/"
    if run.app.walltime:
        job["walltime"] = run.app.walltime
    if run.app.memlimit:
        job["memlimit"] = run.app.memlimit
    response = requests.post(url, json=job)
    if response.status_code == 201:
        run.execution_id = response.json()["id"]
        logger.info(format_log("Job successfully submitted", obj=run))
        run.save()
    else:
        raise Exception("Failed to submit job %s" % run_id)

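# For reference only, inferred from submit_job above rather than from a published Ridgeback API
# schema: a fresh submission POSTs the job dumped by RunObjectFactory to /v0/jobs/, while a
# resume POSTs just the new output directory to /v0/jobs/<execution_id>/resume/. Optional
# scheduler limits are merged in when set on run.app. All values below are hypothetical
# placeholders for illustration.
_EXAMPLE_RIDGEBACK_RESUME_PAYLOAD = {
    "root_dir": "/work/output/<run_id>",  # hypothetical output directory
}
_EXAMPLE_RIDGEBACK_SUBMIT_EXTRAS = {
    "walltime": 4320,  # illustrative value for run.app.walltime
    "memlimit": "8G",  # illustrative value for run.app.memlimit
}
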
def abort_job(self, run_id):
    lock_id = "run_lock_%s" % run_id
    with memcache_task_lock(lock_id, self.app.oid) as acquired:
        if acquired:
            run = Run.objects.get(id=run_id)
            logger.info(format_log("Transition to state ABORTED", obj=run))
            if run.status != RunStatus.ABORTED:
                run.status = RunStatus.ABORTED
                run.save()
        else:
            logger.warning("Run %s is being processed by another worker" % run_id)

def complete_job(self, run_id, outputs, lsf_log_location=None, inputs_json_location=None):
    lock_id = "run_lock_%s" % run_id
    with memcache_task_lock(lock_id, self.app.oid) as acquired:
        if acquired:
            run = RunObjectFactory.from_db(run_id)
            if run.run_obj.is_completed:
                logger.info(format_log("Run Complete already processed", obj=run.run_obj))
                return
            logger.info(format_log("Completing Run", obj=run.run_obj))
            try:
                run.complete(outputs)
            except Exception as e:
                fail_job(run_id, str(e))
                return
            run.to_db()
            job_group = run.job_group
            job_group_id = str(job_group.id) if job_group else None
            _job_finished_notify(run, lsf_log_location, inputs_json_location)
            for trigger in run.run_obj.operator_run.operator.from_triggers.filter(
                run_type=TriggerRunType.INDIVIDUAL
            ):
                create_jobs_from_chaining.delay(
                    trigger.to_operator_id,
                    trigger.from_operator_id,
                    [run_id],
                    job_group_id=job_group_id,
                    parent=str(run.run_obj.operator_run.id) if run.run_obj.operator_run else None,
                )
        else:
            logger.warning("Run %s is being processed by another worker" % run_id)

def _check_children(self):
    finished = True
    failed = []
    permission_denied = False
    recipe = None
    for child_id in self.job.children:
        try:
            child_job = Job.objects.get(id=child_id)
        except Job.DoesNotExist:
            failed.append(child_id)
            continue
        if child_job.status == JobStatus.FAILED:
            failed.append(child_id)
            if isinstance(child_job.message, dict) and child_job.message.get("code", 0) == 108:
                logger.error(format_log("ETL job failed because of permission denied error", obj=self.job))
                recipe = child_job.args.get("request_metadata", {}).get("recipe")
                permission_denied = True
        if child_job.status in (JobStatus.IN_PROGRESS, JobStatus.CREATED, JobStatus.WAITING_FOR_CHILDREN):
            finished = False
            break
    if finished:
        if failed:
            self.job.status = JobStatus.FAILED
            self.job.message = {"details": "Child jobs %s failed" % ", ".join(failed)}
            self._job_failed(permission_denied, recipe)
        else:
            self.job.status = JobStatus.COMPLETED
            self._job_successful()
        if self.job.callback:
            job = Job(
                run=self.job.callback,
                args=self.job.callback_args,
                status=JobStatus.CREATED,
                max_retry=1,
                children=[],
                job_group=self.job.job_group,
            )
            job.save()

def create_jobs_from_chaining(
    to_operator_id, from_operator_id, run_ids=[], job_group_id=None, job_group_notifier_id=None, parent=None
):
    logger.info(
        format_log(
            "Creating operator id %s from chaining: %s" % (to_operator_id, from_operator_id),
            job_group_id=job_group_id,
        )
    )
    operator_model = Operator.objects.get(id=to_operator_id)
    operator = OperatorFactory.get_by_model(
        operator_model,
        job_group_id=job_group_id,
        job_group_notifier_id=job_group_notifier_id,
        run_ids=run_ids,
    )
    create_jobs_from_operator(operator, job_group_id, job_group_notifier_id, parent)

def check_missing_requests():
    """
    Implemented because some requests in LIMS can show up with a date in the past
    """
    logger.info("ETL Check for missing requests")
    timestamp = int((datetime.datetime.now() - datetime.timedelta(hours=12)).timestamp()) * 1000
    job = Job(
        run="beagle_etl.jobs.lims_etl_jobs.fetch_new_requests_lims",
        args={"timestamp": timestamp, "redelivery": False},
        status=JobStatus.CREATED,
        max_retry=3,
        children=[],
    )
    job.save()
    logger.info(format_log("ETL fetch_new_requests_lims job created", obj=job))

def create_operator_run_from_jobs(operator, jobs, job_group_id=None, job_group_notifier_id=None, parent=None):
    jg = None
    jgn = None
    if not jobs:
        logger.info("Could not create operator run because no jobs were passed")
        return
    try:
        jg = JobGroup.objects.get(id=job_group_id)
    except JobGroup.DoesNotExist:
        logger.info(format_log("Job group not set", job_group_id=job_group_id))
    try:
        jgn = JobGroupNotifier.objects.get(id=job_group_notifier_id)
    except JobGroupNotifier.DoesNotExist:
        logger.info(format_log("Job group notifier not set", job_group_id=job_group_id))
    valid_jobs, invalid_jobs = [], []
    for job in jobs:
        if job.is_valid():
            valid_jobs.append(job)
        else:
            invalid_jobs.append(job)
    try:
        operator_run_parent = OperatorRun.objects.get(id=parent)
    except OperatorRun.DoesNotExist:
        operator_run_parent = None
    operator_run = OperatorRun.objects.create(
        operator=operator.model,
        num_total_runs=len(valid_jobs),
        job_group=jg,
        job_group_notifier=jgn,
        parent=operator_run_parent,
    )
    run_ids = []
    pipeline_id = None
    try:
        pipeline_id = operator.get_pipeline_id()
        p = Pipeline.objects.get(id=pipeline_id)
        pipeline_name = p.name
        pipeline_version = p.version
        pipeline_link = p.pipeline_link
    except Pipeline.DoesNotExist:
        pipeline_name = ""
        pipeline_link = ""
        pipeline_version = ""
    pipeline_description_event = AddPipelineToDescriptionEvent(
        job_group_notifier_id, pipeline_name, pipeline_version, pipeline_link
    ).to_dict()
    send_notification.delay(pipeline_description_event)
    set_pipeline_field = SetPipelineFieldEvent(job_group_notifier_id, pipeline_name).to_dict()
    send_notification.delay(set_pipeline_field)
    for job in valid_jobs:
        logger.info(format_log("Creating run", obj=job))
        job.operator_run_id = str(operator_run.id)
        job.job_group_id = str(job_group_id) if job_group_id else job_group_id
        job.job_group_notifier_id = str(job_group_notifier_id) if job_group_notifier_id else job_group_notifier_id
        run = job.create()
        logger.info(format_log("Run created", obj=run))
        run_ids.append({"run_id": str(run.id), "tags": run.tags, "output_directory": run.output_directory})
        output_directory = run.output_directory
        if not pipeline_name and not pipeline_link:
            logger.error(
                format_log(
                    "Run failed, could not find pipeline %s" % pipeline_id,
                    obj=run,
                    job_group_id=job_group_id,
                    operator_run_id=operator_run.id,
                )
            )
            error_message = dict(details="Pipeline [ id: %s ] was not found." % pipeline_id)
            fail_job(run.id, error_message)
        else:
            create_run_task.delay(str(run.id), job.inputs, output_directory)
    if job_group_id:
        event = OperatorRunEvent(
            job_group_notifier_id,
            operator.request_id,
            pipeline_name,
            pipeline_link,
            run_ids,
            str(operator_run.id),
        ).to_dict()
        send_notification.delay(event)
    # TODO: Report this to JIRA ticket also
    for job in invalid_jobs:
        logger.error(
            format_log(
                "Job invalid %s" % job.errors,
                obj=job,
                job_group_id=job_group_id,
                operator_run_id=operator_run.id,
            )
        )
    operator_run.status = RunStatus.RUNNING
    operator_run.save()

def check_jobs_status():
    runs_queryset = Run.objects.filter(
        status__in=(RunStatus.RUNNING, RunStatus.READY), execution_id__isnull=False
    )
    limit = 800
    i = 0
    while True:
        runs = runs_queryset[i:i + limit]
        i += limit
        if not runs:
            return
        remote_statuses = check_statuses_on_ridgeback(list(runs.values_list("execution_id")))
        if not remote_statuses:
            continue
        for run in runs:
            logger.info(format_log("Checking status for run", obj=run))
            if str(run.execution_id) not in remote_statuses:
                logger.info(format_log("Requested job status from executor that was not returned", obj=run))
                continue
            status = remote_statuses[str(run.execution_id)]
            if status["started"] and not run.started:
                run.started = status["started"]
            if status["submitted"] and not run.submitted:
                run.submitted = status["submitted"]
            if status["commandlinetooljob_set"]:
                update_commandline_job_status(run, status["commandlinetooljob_set"])
            if status["status"] == "FAILED":
                logger.error(format_log("Job failed", obj=run))
                message = dict(details=status.get("message"))
                lsf_log_location = status.get("message", {}).get("log")
                inputs_location = None
                if lsf_log_location:
                    inputs_location = lsf_log_location.replace("lsf.log", "input.json")
                fail_job.delay(str(run.id), message, lsf_log_location, inputs_location)
                continue
            if status["status"] == "COMPLETED":
                logger.info(format_log("Job completed", obj=run))
                lsf_log_location = status.get("message", {}).get("log")
                inputs_location = None
                if lsf_log_location:
                    inputs_location = lsf_log_location.replace("lsf.log", "input.json")
                complete_job.delay(str(run.id), status["outputs"], lsf_log_location, inputs_location)
                continue
            if status["status"] == "CREATED":
                logger.info(format_log("Job created", obj=run))
                continue
            if status["status"] == "PENDING":
                logger.info(format_log("Job pending", obj=run))
                continue
            if status["status"] == "RUNNING":
                logger.info(format_log("Job running", obj=run))
                running_job.delay(str(run.id))
                continue
            if status["status"] == "ABORTED":
                logger.info(format_log("Job aborted", obj=run))
                abort_job.delay(str(run.id))
            else:
                logger.info("Run lock not acquired for run: %s" % str(run.id))

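# The remote status dict consumed above exposes at least the keys referenced in check_jobs_status.
# The example below is illustrative only, inferred from that usage rather than from a published
# Ridgeback schema; the values are hypothetical placeholders.
_EXAMPLE_REMOTE_STATUS = {
    "status": "COMPLETED",  # CREATED / PENDING / RUNNING / COMPLETED / FAILED / ABORTED
    "started": "2023-01-01T00:00:00",
    "submitted": "2023-01-01T00:00:00",
    "commandlinetooljob_set": [],  # forwarded to update_commandline_job_status
    "message": {"log": "/work/logs/lsf.log"},  # the input.json location is derived from this path
    "outputs": {},  # forwarded to complete_job on success
}
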
def process_triggers():
    operator_runs = OperatorRun.objects.prefetch_related("runs", "operator__from_triggers").exclude(
        status__in=[RunStatus.COMPLETED, RunStatus.FAILED]
    )
    for operator_run in operator_runs:
        created_chained_job = False
        job_group = operator_run.job_group
        job_group_id = str(job_group.id) if job_group else None
        job_group_notifier = operator_run.job_group_notifier
        job_group_notifier_id = str(job_group_notifier.id) if job_group_notifier else None
        try:
            for trigger in operator_run.operator.from_triggers.all():
                trigger_type = trigger.run_type
                if trigger_type == TriggerRunType.AGGREGATE:
                    condition = trigger.aggregate_condition
                    if condition == TriggerAggregateConditionType.ALL_RUNS_SUCCEEDED:
                        if operator_run.percent_runs_succeeded == 100.0:
                            created_chained_job = True
                            create_jobs_from_chaining.delay(
                                trigger.to_operator_id,
                                trigger.from_operator_id,
                                list(operator_run.runs.order_by("id").values_list("id", flat=True)),
                                job_group_id=job_group_id,
                                job_group_notifier_id=job_group_notifier_id,
                                parent=str(operator_run.id),
                            )
                            continue
                    elif condition == TriggerAggregateConditionType.NINTY_PERCENT_SUCCEEDED:
                        if operator_run.percent_runs_succeeded >= 90.0:
                            created_chained_job = True
                            create_jobs_from_chaining.delay(
                                trigger.to_operator_id,
                                trigger.from_operator_id,
                                list(operator_run.runs.order_by("id").values_list("id", flat=True)),
                                job_group_id=job_group_id,
                                job_group_notifier_id=job_group_notifier_id,
                                parent=str(operator_run.id),
                            )
                            continue
                    if operator_run.percent_runs_finished == 100.0:
                        logger.info(
                            format_log(
                                "Conditions never met",
                                operator_run_id=operator_run.id,
                                job_group_id=job_group_id,
                            )
                        )
                elif trigger_type == TriggerRunType.INDIVIDUAL:
                    if operator_run.percent_runs_finished == 100.0:
                        operator_run.complete()
            if operator_run.percent_runs_finished == 100.0:
                if operator_run.percent_runs_succeeded == 100.0:
                    operator_run.complete()
                    if not created_chained_job and job_group_notifier_id:
                        completed_event = SetPipelineCompletedEvent(job_group_notifier_id).to_dict()
                        send_notification.delay(completed_event)
                else:
                    operator_run.fail()
                    if job_group_notifier_id:
                        e = OperatorRequestEvent(
                            job_group_notifier_id,
                            "[CIReviewEvent] Operator Run %s failed" % str(operator_run.id),
                        ).to_dict()
                        send_notification.delay(e)
                        ci_review_event = SetCIReviewEvent(job_group_notifier_id).to_dict()
                        send_notification.delay(ci_review_event)
        except Exception as e:
            logger.info(
                format_log(
                    "Trigger failed %s" % str(e),
                    operator_run_id=operator_run.id,
                    job_group_id=job_group_id,
                )
            )
            operator_run.fail()

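# Summary of the trigger handling above: AGGREGATE triggers chain the next operator once their
# condition is met (ALL_RUNS_SUCCEEDED at 100% succeeded, NINTY_PERCENT_SUCCEEDED at >= 90%);
# INDIVIDUAL triggers are chained per run from complete_job and here only mark the operator run
# complete once all runs have finished. When every run has finished, the operator run is either
# completed (emitting SetPipelineCompletedEvent when nothing was chained) or failed with
# CI-review notifications.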