def retry_failed_survey_jobs() -> None:
    """Handle survey jobs that were marked as a failure."""
    failed_jobs = SurveyJob.failed_objects.filter(
        created_at__gt=utils.JOB_CREATED_AT_CUTOFF
    ).order_by("pk")

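    # Page through the failed jobs so the whole queryset never has to be
    # held in memory at once.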
    paginator = Paginator(failed_jobs, 200)
    page = paginator.page()
    page_count = 0

    if len(page.object_list) <= 0:
        # No failed jobs, nothing to do!
        return

    queue_capacity = get_capacity_for_jobs()

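    # Dispatch page by page, re-checking queue capacity after each page so
    # we never requeue more work than the queue can hold.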
    while queue_capacity > 0:
        logger.info(
            "Handling page %d of failed (explicitly-marked-as-failure) survey jobs!", page_count
        )
        handle_survey_jobs(page.object_list, queue_capacity)

        if page.has_next():
            page = paginator.page(page.next_page_number())
            page_count = page_count + 1
            queue_capacity = get_capacity_for_jobs()
        else:
            break


def retry_unqueued_survey_jobs() -> None:
    """Retry survey jobs that never made it into the Batch job queue."""
    unqueued_jobs = SurveyJob.unqueued_objects.filter(
        created_at__gt=utils.JOB_CREATED_AT_CUTOFF
    ).order_by("created_at")
    paginator = Paginator(unqueued_jobs, utils.PAGE_SIZE, "created_at")
    database_page = paginator.page()
    database_page_count = 0

    if len(database_page.object_list) <= 0:
        # No unqueued jobs, nothing to do!
        return

    queue_capacity = get_capacity_for_jobs()

    if queue_capacity <= 0:
        logger.info("Not handling unqueued survey jobs " "because there is no capacity for them.")

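    # Requeue jobs one at a time until capacity runs out, the pages run out,
    # or Batch stops accepting dispatches.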
    while queue_capacity > 0:
        for survey_job in database_page.object_list:
            if send_job(SurveyJobTypes.SURVEYOR, job=survey_job, is_dispatch=True):
                queue_capacity -= 1
            else:
                # Can't communicate with Batch just now; leave the job for a later loop.
                break

        if database_page.has_next():
            database_page = paginator.page(database_page.next_page_number())
            database_page_count += 1
            queue_capacity = get_capacity_for_jobs()
        else:
            break


def retry_failed_processor_jobs() -> None:
    """Handle processor jobs that were marked as a failure.

    Ignores Janitor jobs since they are queued every half hour anyway."""
    failed_jobs = (
        ProcessorJob.failed_objects.filter(created_at__gt=utils.JOB_CREATED_AT_CUTOFF)
        .exclude(pipeline_applied="JANITOR")
        .order_by("created_at")
        .prefetch_related("original_files__samples")
    )

    paginator = Paginator(failed_jobs, 200, "created_at")
    page = paginator.page()
    page_count = 0

    if len(page.object_list) <= 0:
        # No failed jobs, nothing to do!
        return

    queue_capacity = get_capacity_for_jobs()

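    # Dispatch page by page while there is still room in the queue; capacity
    # is re-checked after every page.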
    if queue_capacity <= 0:
        logger.info(
            "Not handling failed (explicitly-marked-as-failure) processor jobs "
            "because there is no capacity for them."
        )

    while queue_capacity > 0:
        logger.info(
            "Handling page %d of failed (explicitly-marked-as-failure) processor jobs!",
            page_count,
        )
        handle_processor_jobs(page.object_list, queue_capacity)

        if page.has_next():
            page = paginator.page(page.next_page_number())
            page_count = page_count + 1
            queue_capacity = get_capacity_for_jobs()
        else:
            break


def retry_hung_processor_jobs() -> None:
    """Retry processor jobs that were started but never finished."""
    potentially_hung_jobs = (
        ProcessorJob.hung_objects.filter(created_at__gt=utils.JOB_CREATED_AT_CUTOFF)
        .order_by("created_at")
        .prefetch_related("original_files__samples")
    )
    paginator = Paginator(potentially_hung_jobs, utils.PAGE_SIZE, "created_at")
    database_page = paginator.page()
    database_page_count = 0

    if len(database_page.object_list) <= 0:
        # No hung jobs, nothing to do!
        return

    queue_capacity = get_capacity_for_jobs()

    if queue_capacity <= 0:
        logger.info(
            "Not handling hung (started-but-never-finished) processor jobs "
            "because there is no capacity for them.")

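    # utils.check_hung_jobs presumably narrows each page to jobs whose Batch
    # job has stopped running without recording a result; only those retry.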
    while queue_capacity > 0:
        hung_jobs = utils.check_hung_jobs(database_page.object_list)

        if hung_jobs:
            logger.info(
                "Handling page %d of hung (started-but-never-finished) processor jobs!",
                database_page_count,
                jobs_count=len(hung_jobs),
            )
            handle_processor_jobs(hung_jobs)

        if database_page.has_next():
            database_page = paginator.page(database_page.next_page_number())
            database_page_count += 1
            queue_capacity = get_capacity_for_jobs()
        else:
            break


def retry_lost_survey_jobs() -> None:
    """Retry survey jobs that were started but never finished."""
    potentially_lost_jobs = SurveyJob.lost_objects.filter(
        created_at__gt=utils.JOB_CREATED_AT_CUTOFF
    ).order_by("created_at")
    paginator = Paginator(potentially_lost_jobs, utils.PAGE_SIZE, "created_at")
    database_page = paginator.page()
    database_page_count = 0

    if len(database_page.object_list) <= 0:
        # No lost jobs, nothing to do!
        return

    queue_capacity = get_capacity_for_jobs()

    if queue_capacity <= 0:
        logger.info(
            "Not handling lost (never-started) survey jobs "
            "because there is no capacity for them."
        )
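
    # utils.check_lost_jobs presumably narrows each page to jobs that Batch
    # never picked up; only those get requeued.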
    while queue_capacity > 0:
        lost_jobs = utils.check_lost_jobs(database_page.object_list)

        if lost_jobs:
            logger.info(
                "Handling page %d of lost (never-started) survey jobs!",
                database_page_count,
                jobs_count=len(lost_jobs),
            )
            handle_survey_jobs(lost_jobs)

        if database_page.has_next():
            database_page = paginator.page(database_page.next_page_number())
            database_page_count += 1
            queue_capacity = get_capacity_for_jobs()
        else:
            break


    def handle(self, *args, **options):
        """Requeues all unprocessed RNA-Seq samples for an organism."""
        if options["organism_name"] is None:
            logger.error("You must specify an organism-name.")
            sys.exit(1)
        else:
            organism_name = options["organism_name"]

        organism = Organism.objects.get(name=organism_name)

        prioritized_job_list = build_prioritized_jobs_list(organism)

        if not prioritized_job_list:
            logger.info("Found no samples that need to be processed. I guess I'm done!")
            sys.exit(0)

        logger.info(
            "Found %d samples that need to be processed. Beginning to queue jobs!",
            len(prioritized_job_list),
        )

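        # Drain the prioritized list in capacity-sized batches, sleeping
        # between rounds so the queued work has time to finish.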
        while len(prioritized_job_list) > 0:
            job_capacity = get_capacity_for_jobs()

            if job_capacity > 0:
                for i in range(job_capacity):
                    if len(prioritized_job_list) > 0:
                        requeue_job(prioritized_job_list.pop(0))

            # Wait 5 minutes in between queuing additional work to
            # give it time to actually get done.
            if len(prioritized_job_list) > 0:
                logger.info("Sleeping for 5 minutes while jobs get done.")
                time.sleep(300)

        logger.info(
            "Successfully requeued all jobs for unprocessed %s samples.",
            organism_name,
        )


def handle_survey_jobs(jobs: List[SurveyJob], queue_capacity: int = None) -> None:
    """For each job in jobs, either retry it or log it.

    No more than queue_capacity jobs will be retried.
    """
    if queue_capacity is None:
        queue_capacity = get_capacity_for_jobs()

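    # Requeue at most queue_capacity jobs; jobs that have exhausted their
    # retries are handed to utils.handle_repeated_failure instead.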
    jobs_dispatched = 0
    for job in jobs:
        if jobs_dispatched >= queue_capacity:
            logger.info(
                "We hit the maximum total jobs ceiling,"
                " so we're not handling any more survey jobs now."
            )
            return

        if job.num_retries < utils.MAX_NUM_RETRIES:
            if requeue_survey_job(job):
                jobs_dispatched = jobs_dispatched + 1
        else:
            utils.handle_repeated_failure(job)
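

# A minimal sketch of how these retry helpers might be composed into a single
# monitoring pass. The name retry_all_jobs and the idea of running it on a
# fixed cadence are illustrative assumptions, not part of the code above.
def retry_all_jobs() -> None:
    """Run one pass over every category of job that may need requeueing."""
    retry_failed_survey_jobs()
    retry_unqueued_survey_jobs()
    retry_lost_survey_jobs()
    retry_failed_processor_jobs()
    retry_hung_processor_jobs()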