Example #1
def create_run_task(run_id, inputs, output_directory=None):
    logger.info(format_log("Creating and validating run", obj_id=run_id))
    run = RunObjectFactory.from_definition(run_id, inputs)
    run.ready()
    run.to_db()
    submit_job.delay(run_id, output_directory)
    logger.info(format_log("Run is ready", obj=run))
Example #2
def job_processor(job_id):
    logger.info(format_log("ETL Creating job", obj_id=job_id))
    job = JobObject(job_id)
    logger.info(
        format_log("ETL Processing job with args %s" % str(job.job.args),
                   obj=job.job))
    job.process()
Example #3
def fetch_requests_lims():
    logger.info("ETL fetching requestIDs")
    running = Job.objects.filter(run=TYPES["DELIVERY"],
                                 status__in=(JobStatus.CREATED,
                                             JobStatus.IN_PROGRESS,
                                             JobStatus.WAITING_FOR_CHILDREN))
    if running.exists():
        logger.info(
            format_log("ETL job already in progress", obj=running.first()))
        return
    latest = Job.objects.filter(
        run=TYPES["DELIVERY"]).order_by("-created_date").first()
    if latest:
        timestamp = int(latest.created_date.timestamp()) * 1000
    else:
        timestamp = int((datetime.datetime.now() -
                         datetime.timedelta(hours=120)).timestamp()) * 1000
    job = Job(
        run="beagle_etl.jobs.lims_etl_jobs.fetch_new_requests_lims",
        args={"timestamp": timestamp},
        status=JobStatus.CREATED,
        max_retry=3,
        children=[],
    )
    job.save()
    logger.info(format_log("ETL fetch_new_requests_lims job created", obj=job))
Example #4
def abort_job_on_ridgeback(job_id):
    response = requests.get(settings.RIDGEBACK_URL +
                            "/v0/jobs/%s/abort/" % job_id)
    if response.status_code == 200:
        logger.info(format_log("Job aborted", obj_id=job_id))
        return True
    logger.error(format_log("Failed to abort job", obj_id=job_id))
    return None
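Note that `requests.get` here has no timeout, and it raises `requests.exceptions.ConnectionError` rather than returning a status code when Ridgeback is unreachable. A caller that wants a plain boolean either way could wrap the call like this (a sketch, not part of the original code; `try_abort` is a hypothetical name):

import requests
from django.conf import settings


def try_abort(job_id, timeout=10):
    """Best-effort abort: False on any HTTP error or network failure."""
    try:
        response = requests.get(settings.RIDGEBACK_URL +
                                "/v0/jobs/%s/abort/" % job_id,
                                timeout=timeout)
    except requests.RequestException:
        return False
    return response.status_code == 200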
Example #5
def scheduler():
    jobs = get_pending_jobs()
    logger.info("Pending jobs: %s" % jobs)
    for job in jobs:
        j = Job.objects.get(id=job.id)
        if not j.is_locked:
            j.lock_job()
            logger.info(format_log("ETL submitting job", obj=job))
            job_processor.delay(j.id)
        else:
            logger.info(format_log("ETL job already locked", obj=job))
Example #6
def create_jobs_from_request(request_id,
                             operator_id,
                             job_group_id,
                             job_group_notifier_id=None,
                             pipeline=None):
    logger.info(
        format_log("Creating operator %s" % operator_id,
                   job_group_id=job_group_id,
                   request_id=request_id))
    operator_model = Operator.objects.get(id=operator_id)

    if not job_group_notifier_id:
        try:
            job_group = JobGroup.objects.get(id=job_group_id)
        except JobGroup.DoesNotExist:
            logger.info(
                format_log("Job group does not exist",
                           job_group_id=job_group_id,
                           request_id=request_id))
            return
        try:
            job_group_notifier_id = notifier_start(job_group,
                                                   request_id,
                                                   operator=operator_model)
            request_obj = Request.objects.filter(request_id=request_id).first()
            if request_obj:
                delivery_date_event = SetDeliveryDateFieldEvent(
                    job_group_notifier_id,
                    str(request_obj.delivery_date)).to_dict()
                send_notification.delay(delivery_date_event)
        except Exception as e:
            logger.info(
                format_log("Failed to instantiate notifier: %s" % str(e),
                           job_group_id=job_group_id,
                           request_id=request_id))

    operator = OperatorFactory.get_by_model(
        operator_model,
        job_group_id=job_group_id,
        job_group_notifier_id=job_group_notifier_id,
        request_id=request_id,
        pipeline=pipeline,
    )

    _set_link_to_run_ticket(request_id, job_group_notifier_id)

    generate_description(job_group_id, job_group_notifier_id, request_id)
    generate_label(job_group_notifier_id, request_id)
    create_jobs_from_operator(operator, job_group_id, job_group_notifier_id)
Example #7
def fail_job(self,
             run_id,
             error_message,
             lsf_log_location=None,
             input_json_location=None):
    lock_id = "run_lock_%s" % run_id
    with memcache_task_lock(lock_id, self.app.oid) as acquired:
        if acquired:
            run = RunObjectFactory.from_db(run_id)
            if run.run_obj.is_failed:
                logger.info(
                    format_log("Run Fail already processed", obj=run.run_obj))
                return

            restart_run = run.run_obj.set_for_restart()

            if not restart_run:
                run.fail(error_message)
                run.to_db()

                job_group_notifier = run.job_group_notifier
                job_group_notifier_id = str(
                    job_group_notifier.id) if job_group_notifier else None

                ci_review = SetCIReviewEvent(job_group_notifier_id).to_dict()
                send_notification.delay(ci_review)

                _upload_qc_report(run.run_obj)
                _job_finished_notify(run, lsf_log_location,
                                     input_json_location)
            else:
                run_id, output_directory, execution_id = restart_run
                submit_job.delay(run_id, output_directory, execution_id)
        else:
            logger.warning("Run %s is processing by another worker" % run_id)
Example #8
    def process(self):
        if self.job.status == JobStatus.CREATED:
            self.job.status = JobStatus.IN_PROGRESS

        elif self.job.status == JobStatus.IN_PROGRESS:
            # Every pass through IN_PROGRESS counts as one attempt
            self.job.retry_count = self.job.retry_count + 1
            try:
                self._process()
                self.job.status = JobStatus.WAITING_FOR_CHILDREN
            except Exception as e:
                if isinstance(e, ETLExceptions):
                    message = {"message": str(e), "code": e.code}
                else:
                    message = {"message": str(e)}
                # Stay IN_PROGRESS (and be retried) until max_retry is exhausted
                if self.job.retry_count == self.job.max_retry:
                    self.job.status = JobStatus.FAILED
                    self.job.message = message
                    self._job_failed()

        elif self.job.status == JobStatus.WAITING_FOR_CHILDREN:
            self._check_children()

        logger.info(
            format_log("ETL job in status: %s" %
                       JobStatus(self.job.status).name,
                       obj=self.job))
        self._unlock()
        self._save()
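The transitions above only make sense against the `JobStatus` values used throughout these examples. A plausible minimal definition, assuming an integer-backed enum (the real one may define more states and different values):

from enum import IntEnum


class JobStatus(IntEnum):
    # Hypothetical reconstruction of the statuses referenced in these examples
    CREATED = 0
    IN_PROGRESS = 1
    WAITING_FOR_CHILDREN = 2
    COMPLETED = 3
    FAILED = 4

An IntEnum round-trips from the stored integer, which is what makes the `JobStatus(self.job.status).name` logging expression work.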
Example #9
def submit_job(run_id, output_directory=None, execution_id=None):
    resume = None
    try:
        run = Run.objects.get(id=run_id)
    except Run.DoesNotExist:
        raise Exception("Failed to submit run %s: Run does not exist" % run_id)

    run1 = RunObjectFactory.from_db(run_id)
    if run.resume:
        run2 = RunObjectFactory.from_db(run.resume)

        if run1.equal(run2):
            logger.info(
                format_log("Resuming run with execution id %s" %
                           run2.run_obj.execution_id,
                           obj=run))
            resume = str(run2.run_obj.execution_id)
        else:
            logger.info(
                format_log(
                    "Failed to resume: run is not equal to previous run %s" %
                    str(run2),
                    obj=run))
    if execution_id:
        resume = execution_id
    if not output_directory:
        output_directory = os.path.join(run.app.output_directory, str(run_id))
    job = run1.dump_job(output_directory=output_directory)
    logger.info(format_log("Job ready for submitting", obj=run))
    if resume:
        url = urljoin(settings.RIDGEBACK_URL,
                      "/v0/jobs/{id}/resume/".format(id=resume))
        job = {"root_dir": output_directory}
    else:
        url = settings.RIDGEBACK_URL + "/v0/jobs/"
    if run.app.walltime:
        job["walltime"] = run.app.walltime
    if run.app.memlimit:
        job["memlimit"] = run.app.memlimit
    response = requests.post(url, json=job)
    if response.status_code == 201:
        run.execution_id = response.json()["id"]
        logger.info(format_log("Job successfully submitted", obj=run))
        run.save()
    else:
        raise Exception("Failed to submit job %s" % run_id)
Example #10
def abort_job(self, run_id):
    lock_id = "run_lock_%s" % run_id
    with memcache_task_lock(lock_id, self.app.oid) as acquired:
        if acquired:
            run = Run.objects.get(id=run_id)
            logger.info(format_log("Transition to state ABORTED", obj=run))
            if run.status != RunStatus.ABORTED:
                run.status = RunStatus.ABORTED
                run.save()
        else:
            logger.warning("Run %s is processing by another worker" % run_id)
Example #11
def complete_job(self,
                 run_id,
                 outputs,
                 lsf_log_location=None,
                 inputs_json_location=None):
    lock_id = "run_lock_%s" % run_id
    with memcache_task_lock(lock_id, self.app.oid) as acquired:
        if acquired:
            run = RunObjectFactory.from_db(run_id)
            if run.run_obj.is_completed:
                logger.info(
                    format_log("Run Complete already processed",
                               obj=run.run_obj))
                return

            logger.info(format_log("Completing Run", obj=run.run_obj))

            try:
                run.complete(outputs)
            except Exception as e:
                fail_job(run_id, str(e))
                return

            run.to_db()
            job_group = run.job_group
            job_group_id = str(job_group.id) if job_group else None

            _job_finished_notify(run, lsf_log_location, inputs_json_location)

            for trigger in run.run_obj.operator_run.operator.from_triggers.filter(
                    run_type=TriggerRunType.INDIVIDUAL):
                create_jobs_from_chaining.delay(
                    trigger.to_operator_id,
                    trigger.from_operator_id,
                    [run_id],
                    job_group_id=job_group_id,
                    parent=str(run.run_obj.operator_run.id)
                    if run.run_obj.operator_run else None,
                )
        else:
            logger.warning("Run %s is processing by another worker" % run_id)
Example #12
    def _check_children(self):
        finished = True
        failed = []
        permission_denied = False
        recipe = None
        for child_id in self.job.children:
            try:
                child_job = Job.objects.get(id=child_id)
            except Job.DoesNotExist:
                failed.append(child_id)
                continue
            if child_job.status == JobStatus.FAILED:
                failed.append(child_id)
                if isinstance(
                        child_job.message, dict) and child_job.message.get(
                            "code", 0) == 108:
                    logger.error(
                        format_log(
                            "ETL job failed because of permission denied error",
                            obj=self.job))
                    recipe = child_job.args.get("request_metadata",
                                                {}).get("recipe")
                    permission_denied = True
            elif child_job.status in (JobStatus.IN_PROGRESS, JobStatus.CREATED,
                                      JobStatus.WAITING_FOR_CHILDREN):
                finished = False
                break
        if finished:
            if failed:
                self.job.status = JobStatus.FAILED
                self.job.message = {
                    "details": "Child jobs %s failed" % ", ".join(failed)
                }
                self._job_failed(permission_denied, recipe)
            else:
                self.job.status = JobStatus.COMPLETED
                self._job_successful()
            if self.job.callback:
                job = Job(
                    run=self.job.callback,
                    args=self.job.callback_args,
                    status=JobStatus.CREATED,
                    max_retry=1,
                    children=[],
                    job_group=self.job.job_group,
                )
                job.save()
Example #13
def create_jobs_from_chaining(to_operator_id,
                              from_operator_id,
                              run_ids=None,
                              job_group_id=None,
                              job_group_notifier_id=None,
                              parent=None):
    # None sentinel instead of a mutable default; see the note after this example
    run_ids = run_ids if run_ids is not None else []
    logger.info(
        format_log("Creating operator id %s from chaining: %s" %
                   (to_operator_id, from_operator_id),
                   job_group_id=job_group_id))
    operator_model = Operator.objects.get(id=to_operator_id)
    operator = OperatorFactory.get_by_model(
        operator_model,
        job_group_id=job_group_id,
        job_group_notifier_id=job_group_notifier_id,
        run_ids=run_ids)
    create_jobs_from_operator(operator, job_group_id, job_group_notifier_id,
                              parent)
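A detail worth noting in the signature above: `run_ids` defaults to `None` rather than `[]`, because a mutable default is created once at function definition time and shared across all calls. The pitfall in isolation:

def broken(run_ids=[]):  # the default list is created once and shared
    run_ids.append("run")
    return run_ids


broken()  # ['run']
broken()  # ['run', 'run'] -- state leaked between calls


def fixed(run_ids=None):  # None sentinel, fresh list per call
    run_ids = run_ids if run_ids is not None else []
    run_ids.append("run")
    return run_ids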
Example #14
def check_missing_requests():
    """
    Method implemented because some requests on LIMS can show up with the date from the past
    """
    logger.info("ETL Check for missing requests")
    timestamp = int((datetime.datetime.now() -
                     datetime.timedelta(hours=12)).timestamp()) * 1000

    job = Job(
        run="beagle_etl.jobs.lims_etl_jobs.fetch_new_requests_lims",
        args={
            "timestamp": timestamp,
            "redelivery": False
        },
        status=JobStatus.CREATED,
        max_retry=3,
        children=[],
    )
    job.save()
    logger.info(format_log("ETL fetch_new_requests_lims job created", obj=job))
Example #15
def create_operator_run_from_jobs(operator,
                                  jobs,
                                  job_group_id=None,
                                  job_group_notifier_id=None,
                                  parent=None):
    jg = None
    jgn = None

    if not jobs:
        logger.info(
            "Could not create operator run because no jobs were passed")
        return

    try:
        jg = JobGroup.objects.get(id=job_group_id)
    except JobGroup.DoesNotExist:
        logger.info(format_log("Job group not set", job_group_id=job_group_id))
    try:
        jgn = JobGroupNotifier.objects.get(id=job_group_notifier_id)
    except JobGroupNotifier.DoesNotExist:
        logger.info(
            format_log("Job group notifier not set",
                       job_group_id=job_group_id))
    valid_jobs, invalid_jobs = [], []

    for job in jobs:
        if job.is_valid():
            valid_jobs.append(job)
        else:
            invalid_jobs.append(job)

    try:
        operator_run_parent = OperatorRun.objects.get(id=parent)
    except OperatorRun.DoesNotExist:
        operator_run_parent = None

    operator_run = OperatorRun.objects.create(
        operator=operator.model,
        num_total_runs=len(valid_jobs),
        job_group=jg,
        job_group_notifier=jgn,
        parent=operator_run_parent,
    )
    run_ids = []
    pipeline_id = None

    try:
        pipeline_id = operator.get_pipeline_id()
        p = Pipeline.objects.get(id=pipeline_id)
        pipeline_name = p.name
        pipeline_version = p.version
        pipeline_link = p.pipeline_link
    except Pipeline.DoesNotExist:
        pipeline_name = ""
        pipeline_link = ""
        pipeline_version = ""

    pipeline_description_event = AddPipelineToDescriptionEvent(
        job_group_notifier_id, pipeline_name, pipeline_version,
        pipeline_link).to_dict()
    send_notification.delay(pipeline_description_event)

    set_pipeline_field = SetPipelineFieldEvent(job_group_notifier_id,
                                               pipeline_name).to_dict()
    send_notification.delay(set_pipeline_field)

    for job in valid_jobs:
        logger.info(format_log("Creating run", obj=job))
        job.operator_run_id = str(operator_run.id)
        job.job_group_id = str(job_group_id) if job_group_id else job_group_id
        job.job_group_notifier_id = str(
            job_group_notifier_id
        ) if job_group_notifier_id else job_group_notifier_id
        run = job.create()
        logger.info(format_log("Run created", obj=run))

        run_ids.append({
            "run_id": str(run.id),
            "tags": run.tags,
            "output_directory": run.output_directory
        })
        output_directory = run.output_directory
        if not pipeline_name and not pipeline_link:
            logger.error(
                format_log(
                    "Run failed, could not find pipeline %s" % pipeline_id,
                    obj=run,
                    job_group_id=job_group_id,
                    operator_run_id=operator_run.id,
                ))
            error_message = dict(
                details="Pipeline [ id: %s ] was not found." % pipeline_id)
            fail_job(run.id, error_message)
        else:
            create_run_task.delay(str(run.id), job.inputs, output_directory)

    if job_group_id:
        event = OperatorRunEvent(job_group_notifier_id, operator.request_id,
                                 pipeline_name, pipeline_link, run_ids,
                                 str(operator_run.id)).to_dict()
        send_notification.delay(event)

    for job in invalid_jobs:
        # TODO: Report this to JIRA ticket also
        logger.error(
            format_log("Job invalid %s" % job.errors,
                       obj=job,
                       job_group_id=job_group_id,
                       operator_run_id=operator_run.id))

    operator_run.status = RunStatus.RUNNING
    operator_run.save()
Example #16
def check_jobs_status():
    runs_queryset = Run.objects.filter(status__in=(RunStatus.RUNNING,
                                                   RunStatus.READY),
                                       execution_id__isnull=False)

    limit = 800
    i = 0
    while True:
        runs = runs_queryset[i:i + limit]
        i += limit
        if not runs:
            return

        remote_statuses = check_statuses_on_ridgeback(
            list(runs.values_list("execution_id", flat=True)))
        if not remote_statuses:
            continue

        for run in runs:
            logger.info(format_log("Checking status for run", obj=run))
            if str(run.execution_id) not in remote_statuses:
                logger.info(
                    format_log(
                        "Executor returned no status for this run", obj=run))
                continue

            status = remote_statuses[str(run.execution_id)]
            if status["started"] and not run.started:
                run.started = status["started"]
            if status["submitted"] and not run.submitted:
                run.submitted = status["submitted"]

            if status["commandlinetooljob_set"]:
                update_commandline_job_status(run,
                                              status["commandlinetooljob_set"])
            if status["status"] == "FAILED":
                logger.error(format_log("Job failed ", obj=run))
                message = dict(details=status.get("message"))
                lsf_log_location = status.get("message", {}).get("log")
                inputs_location = None
                if lsf_log_location:
                    inputs_location = lsf_log_location.replace(
                        "lsf.log", "input.json")
                fail_job.delay(str(run.id), message, lsf_log_location,
                               inputs_location)
                continue
            if status["status"] == "COMPLETED":
                logger.info(format_log("Job completed", obj=run))
                lsf_log_location = status.get("message", {}).get("log")
                inputs_location = None
                if lsf_log_location:
                    inputs_location = lsf_log_location.replace(
                        "lsf.log", "input.json")
                complete_job.delay(str(run.id), status["outputs"],
                                   lsf_log_location, inputs_location)
                continue
            if status["status"] == "CREATED":
                logger.info(format_log("Job created", obj=run))
                continue
            if status["status"] == "PENDING":
                logger.info(format_log("Job pending", obj=run))
                continue
            if status["status"] == "RUNNING":
                logger.info(format_log("Job running", obj=run))
                running_job.delay(str(run.id))
                continue
            if status["status"] == "ABORTED":
                logger.info(format_log("Job aborted", obj=run))
                abort_job.delay(str(run.id))
            else:
                logger.info("Run lock not acquired for run: %s" % str(run.id))
Example #17
def process_triggers():
    operator_runs = OperatorRun.objects.prefetch_related(
        "runs", "operator__from_triggers").exclude(
            status__in=[RunStatus.COMPLETED, RunStatus.FAILED])

    for operator_run in operator_runs:
        created_chained_job = False
        job_group = operator_run.job_group
        job_group_id = str(job_group.id) if job_group else None
        job_group_notifier = operator_run.job_group_notifier
        job_group_notifier_id = str(
            job_group_notifier.id) if job_group_notifier else None
        try:
            for trigger in operator_run.operator.from_triggers.all():
                trigger_type = trigger.run_type

                if trigger_type == TriggerRunType.AGGREGATE:
                    condition = trigger.aggregate_condition
                    if condition == TriggerAggregateConditionType.ALL_RUNS_SUCCEEDED:
                        if operator_run.percent_runs_succeeded == 100.0:
                            created_chained_job = True
                            create_jobs_from_chaining.delay(
                                trigger.to_operator_id,
                                trigger.from_operator_id,
                                list(
                                    operator_run.runs.order_by(
                                        "id").values_list("id", flat=True)),
                                job_group_id=job_group_id,
                                job_group_notifier_id=job_group_notifier_id,
                                parent=str(operator_run.id),
                            )
                            continue
                    elif condition == TriggerAggregateConditionType.NINTY_PERCENT_SUCCEEDED:
                        if operator_run.percent_runs_succeeded >= 90.0:
                            created_chained_job = True
                            create_jobs_from_chaining.delay(
                                trigger.to_operator_id,
                                trigger.from_operator_id,
                                list(
                                    operator_run.runs.order_by(
                                        "id").values_list("id", flat=True)),
                                job_group_id=job_group_id,
                                job_group_notifier_id=job_group_notifier_id,
                                parent=str(operator_run.id),
                            )
                            continue

                    if operator_run.percent_runs_finished == 100.0:
                        logger.info(
                            format_log("Conditions never met",
                                       operator_run_id=operator_run.id,
                                       job_group_id=job_group_id))

                elif trigger_type == TriggerRunType.INDIVIDUAL:
                    if operator_run.percent_runs_finished == 100.0:
                        operator_run.complete()

            if operator_run.percent_runs_finished == 100.0:
                if operator_run.percent_runs_succeeded == 100.0:
                    operator_run.complete()
                    if not created_chained_job and job_group_notifier_id:
                        completed_event = SetPipelineCompletedEvent(
                            job_group_notifier_id).to_dict()
                        send_notification.delay(completed_event)
                else:
                    operator_run.fail()
                    if job_group_notifier_id:
                        e = OperatorRequestEvent(
                            job_group_notifier_id,
                            "[CIReviewEvent] Operator Run %s failed" %
                            str(operator_run.id)).to_dict()
                        send_notification.delay(e)
                        ci_review_event = SetCIReviewEvent(
                            job_group_notifier_id).to_dict()
                        send_notification.delay(ci_review_event)

        except Exception as e:
            logger.info(
                format_log("Trigger failed: %s" % str(e),
                           operator_run_id=operator_run.id,
                           job_group_id=job_group_id))
            operator_run.fail()
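Functions like `scheduler` (Example #5), `check_jobs_status` (Example #16), and this `process_triggers` are natural periodic tasks. A sketch of how they might be wired up with Celery beat; the task module paths and intervals here are assumptions, not configuration taken from the project:

from datetime import timedelta

# Hypothetical beat schedule; module paths and intervals are illustrative
CELERY_BEAT_SCHEDULE = {
    "etl-scheduler": {
        "task": "beagle_etl.tasks.scheduler",
        "schedule": timedelta(seconds=30),
    },
    "check-jobs-status": {
        "task": "runner.tasks.check_jobs_status",
        "schedule": timedelta(minutes=1),
    },
    "process-triggers": {
        "task": "runner.tasks.process_triggers",
        "schedule": timedelta(minutes=1),
    },
}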