def retry_failed_survey_jobs() -> None:
    """Handle survey jobs that were marked as a failure."""
    failed_jobs = SurveyJob.failed_objects.filter(
        created_at__gt=utils.JOB_CREATED_AT_CUTOFF
    ).order_by("pk")

    paginator = Paginator(failed_jobs, 200)
    page = paginator.page()
    page_count = 0

    if len(page.object_list) <= 0:
        # No failed jobs, nothing to do!
        return

    queue_capacity = get_capacity_for_jobs()

    while queue_capacity > 0:
        logger.info(
            "Handling page %d of failed (explicitly-marked-as-failure) survey jobs!", page_count
        )
        handle_survey_jobs(page.object_list, queue_capacity)

        if page.has_next():
            page = paginator.page(page.next_page_number())
            page_count = page_count + 1
            queue_capacity = get_capacity_for_jobs()
        else:
            break
def retry_unqueued_survey_jobs() -> None:
    """Retry survey jobs that never made it into the Batch job queue."""
    potentially_lost_jobs = SurveyJob.unqueued_objects.filter(
        created_at__gt=utils.JOB_CREATED_AT_CUTOFF
    ).order_by("created_at")

    paginator = Paginator(potentially_lost_jobs, utils.PAGE_SIZE, "created_at")
    database_page = paginator.page()
    database_page_count = 0

    if len(database_page.object_list) <= 0:
        # No unqueued jobs, nothing to do!
        return

    queue_capacity = get_capacity_for_jobs()

    if queue_capacity <= 0:
        logger.info(
            "Not handling unqueued survey jobs because there is no capacity for them."
        )

    while queue_capacity > 0:
        for survey_job in database_page.object_list:
            if send_job(SurveyJobTypes.SURVEYOR, job=survey_job, is_dispatch=True):
                queue_capacity -= 1
            else:
                # Can't communicate with Batch just now, leave the job for a later loop.
                break

        if database_page.has_next():
            database_page = paginator.page(database_page.next_page_number())
            database_page_count += 1
            queue_capacity = get_capacity_for_jobs()
        else:
            break
def retry_failed_processor_jobs() -> None:
    """Handle processor jobs that were marked as a failure.

    Ignores Janitor jobs since they are queued every half hour anyway."""
    failed_jobs = (
        ProcessorJob.failed_objects.filter(created_at__gt=utils.JOB_CREATED_AT_CUTOFF)
        .exclude(pipeline_applied="JANITOR")
        .order_by("created_at")
        .prefetch_related("original_files__samples")
    )

    paginator = Paginator(failed_jobs, 200, "created_at")
    page = paginator.page()
    page_count = 0

    if len(page.object_list) <= 0:
        # No failed jobs, nothing to do!
        return

    queue_capacity = get_capacity_for_jobs()

    if queue_capacity <= 0:
        logger.info(
            "Not handling failed (explicitly-marked-as-failure) processor jobs "
            "because there is no capacity for them."
        )

    while queue_capacity > 0:
        logger.info(
            "Handling page %d of failed (explicitly-marked-as-failure) processor jobs!",
            page_count,
        )
        handle_processor_jobs(page.object_list, queue_capacity)

        if page.has_next():
            page = paginator.page(page.next_page_number())
            page_count = page_count + 1
            queue_capacity = get_capacity_for_jobs()
        else:
            break
def retry_hung_processor_jobs() -> None:
    """Retry processor jobs that were started but never finished."""
    potentially_hung_jobs = (
        ProcessorJob.hung_objects.filter(created_at__gt=utils.JOB_CREATED_AT_CUTOFF)
        .order_by("created_at")
        .prefetch_related("original_files__samples")
    )

    paginator = Paginator(potentially_hung_jobs, utils.PAGE_SIZE, "created_at")
    database_page = paginator.page()
    database_page_count = 0

    if len(database_page.object_list) <= 0:
        # No hung jobs, nothing to do!
        return

    queue_capacity = get_capacity_for_jobs()

    if queue_capacity <= 0:
        logger.info(
            "Not handling hung (started-but-never-finished) processor jobs "
            "because there is no capacity for them."
        )

    while queue_capacity > 0:
        hung_jobs = utils.check_hung_jobs(database_page.object_list)

        if hung_jobs:
            logger.info(
                "Handling page %d of hung (started-but-never-finished) processor jobs!",
                database_page_count,
                jobs_count=len(hung_jobs),
            )
            handle_processor_jobs(hung_jobs)

        if database_page.has_next():
            database_page = paginator.page(database_page.next_page_number())
            database_page_count += 1
            queue_capacity = get_capacity_for_jobs()
        else:
            break
def retry_lost_survey_jobs() -> None:
    """Retry survey jobs that never even got started."""
    potentially_lost_jobs = SurveyJob.lost_objects.filter(
        created_at__gt=utils.JOB_CREATED_AT_CUTOFF
    ).order_by("created_at")

    paginator = Paginator(potentially_lost_jobs, utils.PAGE_SIZE, "created_at")
    database_page = paginator.page()
    database_page_count = 0

    if len(database_page.object_list) <= 0:
        # No lost jobs, nothing to do!
        return

    queue_capacity = get_capacity_for_jobs()

    if queue_capacity <= 0:
        logger.info(
            "Not handling lost (never-started) survey jobs "
            "because there is no capacity for them."
        )

    while queue_capacity > 0:
        lost_jobs = utils.check_lost_jobs(database_page.object_list)

        if lost_jobs:
            logger.info(
                "Handling page %d of lost (never-started) survey jobs!",
                database_page_count,
                jobs_count=len(lost_jobs),
            )
            handle_survey_jobs(lost_jobs)

        if database_page.has_next():
            database_page = paginator.page(database_page.next_page_number())
            database_page_count += 1
            queue_capacity = get_capacity_for_jobs()
        else:
            break
def handle(self, *args, **options):
    """Requeues all unprocessed RNA-Seq samples for an organism."""
    if options["organism_name"] is None:
        logger.error("You must specify an organism-name.")
        sys.exit(1)
    else:
        organism_name = options["organism_name"]

    organism = Organism.objects.get(name=organism_name)

    prioritized_job_list = build_prioritized_jobs_list(organism)

    if not len(prioritized_job_list):
        logger.info("Found no samples that need to be processed. I guess I'm done!")
        sys.exit(0)

    logger.info(
        "Found %d samples that need to be processed. Beginning to queue jobs!",
        len(prioritized_job_list),
    )

    while len(prioritized_job_list) > 0:
        job_capacity = get_capacity_for_jobs()

        if job_capacity > 0:
            for i in range(job_capacity):
                if len(prioritized_job_list) > 0:
                    requeue_job(prioritized_job_list.pop(0))

        # Wait 5 minutes in between queuing additional work to
        # give it time to actually get done.
        if len(prioritized_job_list) > 0:
            logger.info("Sleeping for 5 minutes while jobs get done.")
            time.sleep(300)

    logger.info(
        "Successfully requeued all jobs for unprocessed %s samples.", organism_name
    )
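# For reference, a minimal sketch of the `add_arguments` hook this `handle`
# method assumes, based on the `options["organism_name"]` lookup above. Django
# maps the `--organism-name` flag to the `organism_name` key; the help text and
# any additional flags on the real command are assumptions.
def add_arguments(self, parser):
    parser.add_argument(
        "--organism-name",
        type=str,
        default=None,
        help="Name of the organism whose unprocessed RNA-Seq samples should be requeued.",
    )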
def handle_survey_jobs(jobs: List[SurveyJob], queue_capacity: int = None) -> None:
    """For each job in jobs, either retry it or log it.

    No more than queue_capacity jobs will be retried.
    """
    if queue_capacity is None:
        queue_capacity = get_capacity_for_jobs()

    jobs_dispatched = 0
    for count, job in enumerate(jobs):
        if jobs_dispatched >= queue_capacity:
            logger.info(
                "We hit the maximum total jobs ceiling,"
                " so we're not handling any more survey jobs now."
            )
            return

        if job.num_retries < utils.MAX_NUM_RETRIES:
            if requeue_survey_job(job):
                jobs_dispatched = jobs_dispatched + 1
        else:
            utils.handle_repeated_failure(job)
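# `handle_processor_jobs`, called by the processor-job retry functions above,
# is not shown in this section. Below is a minimal sketch assuming it mirrors
# `handle_survey_jobs`; the `requeue_processor_job` helper is a hypothetical
# stand-in for whatever requeue call the real function uses.
def handle_processor_jobs(jobs: List[ProcessorJob], queue_capacity: int = None) -> None:
    """For each job in jobs, either retry it or log it.

    No more than queue_capacity jobs will be retried.
    """
    if queue_capacity is None:
        queue_capacity = get_capacity_for_jobs()

    jobs_dispatched = 0
    for job in jobs:
        if jobs_dispatched >= queue_capacity:
            logger.info(
                "We hit the maximum total jobs ceiling,"
                " so we're not handling any more processor jobs now."
            )
            return

        if job.num_retries < utils.MAX_NUM_RETRIES:
            # Hypothetical helper; assumed to return True when the job was requeued.
            if requeue_processor_job(job):
                jobs_dispatched += 1
        else:
            utils.handle_repeated_failure(job)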