def retry_failed_survey_jobs() -> None:
    """Handle survey jobs that were marked as a failure."""
    failed_jobs = SurveyJob.failed_objects.filter(
        created_at__gt=utils.JOB_CREATED_AT_CUTOFF
    ).order_by("pk")

    paginator = Paginator(failed_jobs, 200)
    page = paginator.page()
    page_count = 0

    if len(page.object_list) <= 0:
        # No failed jobs, nothing to do!
        return

    queue_capacity = get_capacity_for_jobs()

    while queue_capacity > 0:
        logger.info(
            "Handling page %d of failed (explicitly-marked-as-failure) survey jobs!", page_count
        )
        handle_survey_jobs(page.object_list, queue_capacity)

        if page.has_next():
            page = paginator.page(page.next_page_number())
            page_count = page_count + 1
            queue_capacity = get_capacity_for_jobs()
        else:
            break
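
# The capacity-gated pagination loop above recurs throughout this module. A minimal
# sketch of the same idea as a reusable generator (a hypothetical helper, not part of
# the original codebase); it assumes a paginator exposing page()/has_next()/
# next_page_number() like PerformantPaginator, and a callable returning the current
# queue capacity such as get_capacity_for_jobs.
def iterate_pages_while_capacity(paginator, get_capacity):
    """Yield one page of objects at a time while there is queue capacity left."""
    page = paginator.page()
    while get_capacity() > 0:
        yield page.object_list
        if not page.has_next():
            break
        page = paginator.page(page.next_page_number())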
def run_tximport_for_all_eligible_experiments(dispatch_jobs=True):
    """Creates a tximport job for all eligible experiments.
    """
    eligible_experiments = (Experiment.objects.annotate(
        num_organisms=Count("organisms")).filter(
            num_organisms=1, technology="RNA-SEQ",
            num_processed_samples=0).prefetch_related("samples__results"))

    paginator = Paginator(eligible_experiments, PAGE_SIZE)
    page = paginator.page()

    # Next we need to figure out how many samples were processed for
    # each experiment. We should be able to reuse the salmon code,
    # since it already does this.
    created_jobs = []

    while True:
        creation_count = 0

        for experiment in page.object_list:
            processor_job = run_tximport_if_eligible(experiment)
            if processor_job:
                creation_count += 1
                created_jobs.append(processor_job)

        logger.info(
            "Created %d tximport jobs for experiments past the thresholds.",
            creation_count)

        if not page.has_next():
            break
        else:
            page = paginator.page(page.next_page_number())

    return created_jobs
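
# Hypothetical usage sketch (names taken from the function above): collect the
# eligible tximport jobs without dispatching them, e.g. for a dry run. This assumes
# dispatch_jobs is threaded through to run_tximport_if_eligible in the full codebase,
# which isn't shown here.
def count_pending_tximport_jobs():
    created_jobs = run_tximport_for_all_eligible_experiments(dispatch_jobs=False)
    print(f"{len(created_jobs)} experiments are ready for tximport.")
    return len(created_jobs)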
def retry_unqueued_survey_jobs() -> None:
    """Retry survey jobs that never made it into the Batch job queue."""
    potentially_lost_jobs = SurveyJob.unqueued_objects.filter(
        created_at__gt=utils.JOB_CREATED_AT_CUTOFF
    ).order_by("created_at")
    paginator = Paginator(potentially_lost_jobs, utils.PAGE_SIZE, "created_at")
    database_page = paginator.page()
    database_page_count = 0

    if len(database_page.object_list) <= 0:
        # No unqueued jobs, nothing to do!
        return

    queue_capacity = get_capacity_for_jobs()

    if queue_capacity <= 0:
        logger.info("Not handling unqueued survey jobs " "because there is no capacity for them.")

    while queue_capacity > 0:
        for survey_job in database_page.object_list:
            if send_job(SurveyJobTypes.SURVEYOR, job=survey_job, is_dispatch=True):
                queue_capacity -= 1
            else:
                # Can't communicate with Batch just now, leave the job for a later loop.
                break

        if database_page.has_next():
            database_page = paginator.page(database_page.next_page_number())
            database_page_count += 1
            queue_capacity = get_capacity_for_jobs()
        else:
            break
def requeue_samples(sample_queryset, dry_run=False):
    paginator = PerformantPaginator(sample_queryset, PAGE_SIZE)
    page = paginator.page()

    # Loop through the samples one by one to see if they've been
    # erroneously marked as processed. If so, mark them as
    # unprocessed and kick off a new job so they can get
    # processed correctly.
    # Do this before deleting the computed files in case we get
    # interrupted; otherwise it would be harder to tell which samples
    # were erroneously marked as processed.
    while True:
        counter = 0
        for sample in page.object_list:
            if requeue_sample(sample, dry_run):
                counter += 1
            # requeue_sample makes database calls, so it's not a good
            # idea to call it in a tight loop without sleeping.
            time.sleep(1)

        print(f"Requeued {counter} samples in that page.")

        if not page.has_next():
            break
        else:
            page = paginator.page(page.next_page_number())
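
# Hypothetical usage sketch: preview which erroneously-processed samples would be
# requeued without writing anything, assuming dry_run skips the database writes as
# the function above suggests. The queryset filter is illustrative only.
def preview_requeue():
    stuck_samples = Sample.processed_objects.filter(results__isnull=True)
    requeue_samples(stuck_samples, dry_run=True)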
def run_tximport():
    """Creates a tximport job for all eligible experiments."""
    eligible_experiments = (Experiment.objects.annotate(
        num_organisms=Count("organisms")).filter(
            num_organisms=1, technology="RNA-SEQ",
            num_processed_samples=0).prefetch_related("samples__results"))

    paginator = Paginator(eligible_experiments, PAGE_SIZE)
    page = paginator.page()

    # Next we need to figure out how many samples were processed for
    # each experiment. We should be able to reuse the salmon code,
    # since it already does this.
    tximport_pipeline = ProcessorPipeline.TXIMPORT

    while True:
        creation_count = 0

        for experiment in page.object_list:
            quant_results = get_quant_results_for_experiment(experiment)

            if should_run_tximport(experiment, quant_results, True):
                processor_job = ProcessorJob()
                processor_job.pipeline_applied = tximport_pipeline.value
                processor_job.ram_amount = 8192
                # This job doesn't need to run on a specific volume
                # but it uses the same Nomad job as Salmon jobs which
                # do require the volume index.
                processor_job.volume_index = random.choice(
                    list(get_active_volumes()))
                processor_job.save()

                assoc = ProcessorJobOriginalFileAssociation()
                # Any original file linked to any sample of the
                # experiment will work. Tximport is somewhat special
                # in that it doesn't actually use original files, so
                # this is just used to point to the experiment.
                assoc.original_file = experiment.samples.all()[0].original_files.all()[0]
                assoc.processor_job = processor_job
                assoc.save()

                creation_count += 1

                try:
                    send_job(tximport_pipeline, processor_job)
                except Exception:
                    # If we cannot queue the job now the Foreman will do
                    # it later.
                    pass

        logger.info(
            "Created %d tximport jobs for experiments past the thresholds.",
            creation_count)

        if not page.has_next():
            break
        else:
            page = paginator.page(page.next_page_number())
def queryset_page_iterator(queryset, page_size=2000):
    """ use the performant paginator to iterate over each page in a queryset """
    paginator = PerformantPaginator(queryset, page_size)
    page = paginator.page()
    while True:
        yield page.object_list

        if not page.has_next():
            break
        else:
            page = paginator.page(page.next_page_number())
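
# Hypothetical usage sketch: stream a large queryset page by page without holding
# the whole thing in memory. Sample is the model used elsewhere in this module.
def count_samples_in_pages():
    total = 0
    for object_list in queryset_page_iterator(Sample.objects.all(), page_size=1000):
        total += len(object_list)
    return total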
    def handle(self, *args, **options):
        """ Requeues downloader jobs for samples that haven't been processed and their original files
        have no no downloader jobs associated with them
        """
        supported_microarray_platforms = [
            x["platform_accession"] for x in get_supported_microarray_platforms()
        ]
        supported_rnaseq_platforms = [x.replace(" ", "") for x in get_supported_rnaseq_platforms()]
        all_supported_platforms = (
            supported_microarray_platforms + supported_rnaseq_platforms
        )  # https://www.postgresql.org/docs/9.1/functions-array.html

        # Ensure selected samples have valid platforms
        samples_without_downloader = (
            Sample.objects.all()
            .filter(platform_accession_code__in=all_supported_platforms)
            .annotate(
                original_files_count=Count("original_files"),
                downloader_job_count=Count("original_files__downloader_jobs"),
            )
            .filter(is_processed=False, original_files_count__gt=0, downloader_job_count=0)
        )
        if options.get("created_after", None):
            samples_without_downloader = samples_without_downloader.filter(
                created_at__gt=options["created_after"]
            )

        samples_without_downloader = samples_without_downloader.prefetch_related("original_files")

        logger.info(
            "Found %d samples without downloader jobs, starting to create them now.",
            samples_without_downloader.count(),
        )

        paginator = Paginator(samples_without_downloader, PAGE_SIZE)
        page = paginator.page()

        while True:
            for sample in page.object_list:
                logger.debug("Creating downloader job for a sample.", sample=sample.accession_code)
                create_downloader_job(sample.original_files.all())

            logger.info(
                "Created %d new downloader jobs because their samples didn't have any.",
                len(page.object_list),
            )

            if not page.has_next():
                break

            page = paginator.page(page.next_page_number())
    def handle(self, *args, **options):
        samples = Sample.processed_objects.all()
        paginator = PerformantPaginator(samples, PAGE_SIZE)
        page = paginator.page()
        counter = 0
        while True:
            for sample in page.object_list:
                counter += 1
                if sample.results.count() == 0:
                    print(sample.accession_code)
            if not page.has_next():
                break
            else:
                page = paginator.page(page.next_page_number())

            if counter % 10000 == 0:
                print("Checked another 10,000 samples.")
def requeue_samples(eligible_samples):
    paginator = Paginator(eligible_samples, PAGE_SIZE)
    page = paginator.page()

    creation_count = 0
    while True:
        for sample in page.object_list:
            if create_downloader_job(sample.original_files.all(), force=True):
                creation_count += 1

        if not page.has_next():
            break
        else:
            page = paginator.page(page.next_page_number())

        logger.info("Creating new downloader jobs. %d so far", creation_count)

        # 2000 samples queued up every five minutes should be fast
        # enough and also not thrash the DB.
        time.sleep(60 * 5)

    return creation_count
def retry_hung_downloader_jobs() -> None:
    """Retry downloader jobs that were started but never finished."""
    potentially_hung_jobs = (
        DownloaderJob.hung_objects.filter(created_at__gt=utils.JOB_CREATED_AT_CUTOFF)
        .order_by("created_at")
        .prefetch_related("original_files__samples")
    )
    paginator = Paginator(potentially_hung_jobs, utils.PAGE_SIZE, "created_at")
    database_page = paginator.page()
    database_page_count = 0

    if len(database_page.object_list) <= 0:
        # No hung jobs, nothing to do!
        return

    queue_capacity = get_capacity_for_downloader_jobs()

    if queue_capacity <= 0:
        logger.info(
            "Not handling hung (started-but-never-finished) downloader jobs "
            "because there is no capacity for them."
        )
    while queue_capacity > 0:
        hung_jobs = utils.check_hung_jobs(database_page.object_list)

        if hung_jobs:
            logger.info(
                "Handling page %d of hung (started-but-never-finished) downloader jobs!",
                database_page_count,
                jobs_count=len(hung_jobs),
            )
            handle_downloader_jobs(hung_jobs)

        if database_page.has_next():
            database_page = paginator.page(database_page.next_page_number())
            database_page_count += 1
            queue_capacity = get_capacity_for_downloader_jobs()
        else:
            break
def retry_failed_processor_jobs() -> None:
    """Handle processor jobs that were marked as a failure.

    Ignores Janitor jobs since they are queued every half hour anyway."""
    failed_jobs = (ProcessorJob.failed_objects.filter(
        created_at__gt=utils.JOB_CREATED_AT_CUTOFF).exclude(
            pipeline_applied="JANITOR").order_by(
                "created_at").prefetch_related("original_files__samples"))

    paginator = Paginator(failed_jobs, 200, "created_at")
    page = paginator.page()
    page_count = 0

    if len(page.object_list) <= 0:
        # No failed jobs, nothing to do!
        return

    queue_capacity = get_capacity_for_jobs()

    if queue_capacity <= 0:
        logger.info(
            "Not handling failed (explicitly-marked-as-failure) processor jobs "
            "because there is no capacity for them.")

    while queue_capacity > 0:
        logger.info(
            "Handling page %d of failed (explicitly-marked-as-failure) processor jobs!",
            page_count,
        )
        handle_processor_jobs(page.object_list, queue_capacity)

        if page.has_next():
            page = paginator.page(page.next_page_number())
            page_count = page_count + 1
            queue_capacity = get_capacity_for_jobs()
        else:
            break
    def handle(self, *args, **options):
        timed_out_jobs = ProcessorJob.objects.filter(
            success="f",
            failure_reason="Salmon timed out because it failed to complete within 3 hours.",
            retried_job_id__isnull=True,  # Only get jobs that weren't retried.
        )

        total = 0
        i = 0
        paginator = Paginator(timed_out_jobs, 1000)
        page = paginator.page()
        while page:
            for processor_job in page.object_list:
                # Reset the job's RAM so it starts back at the base amount
                # (since it'll go up from here).
                processor_job.ram_amount = 8192
                # And so it'll be retried twice more.
                processor_job.num_retries = 0
                processor_job.retried = False

                # We don't actually have to send this off to Batch ourselves.
                # The Foreman will find it and requeue it for us!
                processor_job.save()

                total += 1
                i += 1
                # Only queue 300 of these an hour so we don't overload ENA.
                if i == 300:
                    logger.info(
                        "Requeued 300 more jobs (total %d). Sleeping for 1 hour.",
                        total)
                    time.sleep(60 * 60)
                    i = 0

            if page.has_next():
                page = paginator.page(page.next_page_number())
            else:
                break
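
# The "sleep after every N items" throttle above could be factored into a reusable
# generator. A minimal sketch (hypothetical helper, not from the original codebase);
# time is assumed to already be imported, as elsewhere in this module.
def throttled(items, batch_size=300, pause_seconds=60 * 60):
    """Yield items, sleeping after every batch_size of them so upstream services aren't overloaded."""
    for index, item in enumerate(items, start=1):
        yield item
        if index % batch_size == 0:
            time.sleep(pause_seconds)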
def retry_lost_survey_jobs() -> None:
    """Retry survey jobs that were started but never finished."""
    potentially_lost_jobs = SurveyJob.lost_objects.filter(
        created_at__gt=utils.JOB_CREATED_AT_CUTOFF
    ).order_by("created_at")
    paginator = Paginator(potentially_lost_jobs, utils.PAGE_SIZE, "created_at")
    database_page = paginator.page()
    database_page_count = 0

    if len(database_page.object_list) <= 0:
        # No lost jobs, nothing to do!
        return

    queue_capacity = get_capacity_for_jobs()

    if queue_capacity <= 0:
        logger.info(
            "Not handling lost (never-started) survey jobs "
            "because there is no capacity for them."
        )
    while queue_capacity > 0:
        lost_jobs = utils.check_lost_jobs(database_page.object_list)

        if lost_jobs:
            logger.info(
                "Handling page %d of lost (never-started) survey jobs!",
                database_page_count,
                jobs_count=len(lost_jobs),
            )
            handle_survey_jobs(lost_jobs)

        if database_page.has_next():
            database_page = paginator.page(database_page.next_page_number())
            database_page_count += 1
            queue_capacity = get_capacity_for_jobs()
        else:
            break
def retry_lost_processor_jobs() -> None:
    """Retry processor jobs that were started but never finished."""
    potentially_lost_jobs = (ProcessorJob.lost_objects.filter(
        created_at__gt=utils.JOB_CREATED_AT_CUTOFF).exclude(
            pipeline_applied="JANITOR").order_by(
                "created_at").prefetch_related("original_files__samples"))
    paginator = Paginator(potentially_lost_jobs, utils.PAGE_SIZE, "created_at")
    database_page = paginator.page()
    database_page_count = 0

    if len(database_page.object_list) <= 0:
        # No lost jobs, nothing to do!
        return

    queue_capacity = get_capacity_for_jobs()

    if queue_capacity <= 0:
        logger.info("Not handling lost (never-started) processor jobs "
                    "because there is no capacity for them.")

    while queue_capacity > 0:
        lost_jobs = utils.check_lost_jobs(database_page.object_list)

        if lost_jobs:
            logger.info(
                "Handling page %d of lost (never-started) processor jobs!",
                database_page_count,
                jobs_count=len(lost_jobs),
            )
            handle_processor_jobs(lost_jobs)

        if database_page.has_next():
            database_page = paginator.page(database_page.next_page_number())
            database_page_count += 1
            queue_capacity = get_capacity_for_jobs()
        else:
            break
def retry_failed_downloader_jobs() -> None:
    """Handle downloader jobs that were marked as a failure."""
    failed_jobs = (
        DownloaderJob.failed_objects.filter(created_at__gt=utils.JOB_CREATED_AT_CUTOFF)
        .order_by("created_at")
        .prefetch_related("original_files__samples")
    )

    paginator = Paginator(failed_jobs, utils.PAGE_SIZE, "created_at")
    page = paginator.page()
    page_count = 0

    if len(page.object_list) <= 0:
        # No failed jobs, nothing to do!
        return

    queue_capacity = get_capacity_for_downloader_jobs()

    if queue_capacity <= 0:
        logger.info(
            "Not handling failed (explicitly-marked-as-failure) downloader jobs "
            "because there is no capacity for them."
        )

    while queue_capacity > 0:
        logger.info(
            "Handling page %d of failed (explicitly-marked-as-failure) downloader jobs!", page_count
        )

        handle_downloader_jobs(page.object_list)

        if page.has_next():
            page = paginator.page(page.next_page_number())
            page_count = page_count + 1
            queue_capacity = get_capacity_for_downloader_jobs()
        else:
            break
    def handle(self, *args, **options):
        """Refreshes the metadata for all experiments, or experiments from a specific database
        """
        possible_source_databases = ["ARRAY_EXPRESS", "GEO", "SRA"]

        if options.get("source_database", None) is None:
            experiments = Experiment.objects.all()
        elif options["source_database"] in possible_source_databases:
            source_database = options["source_database"]
            experiments = Experiment.objects.filter(
                source_database=source_database)
        else:
            logger.error(
                'Invalid source database "{}"'.format(options["source_database"])
                + "\nPossible source databases: {}".format(", ".join(possible_source_databases))
            )
            sys.exit(1)

        paginator = PerformantPaginator(experiments, PAGE_SIZE)
        page = paginator.page()

        while True:
            for experiment in page.object_list:
                logger.debug("Refreshing metadata for an experiment.",
                             experiment=experiment.accession_code)
                try:
                    if experiment.source_database == "SRA":
                        metadata = SraSurveyor.gather_all_metadata(
                            experiment.samples.first().accession_code)
                        SraSurveyor._apply_metadata_to_experiment(
                            experiment, metadata)

                    elif experiment.source_database == "GEO":
                        gse = GEOparse.get_GEO(
                            experiment.accession_code,
                            destdir="/tmp/management",
                            silent=True,
                        )

                        GeoSurveyor._apply_metadata_to_experiment(
                            experiment, gse)

                    elif experiment.source_database == "ARRAY_EXPRESS":
                        request_url = EXPERIMENTS_URL + experiment.accession_code
                        experiment_request = utils.requests_retry_session(
                        ).get(request_url, timeout=60)
                        try:
                            parsed_json = experiment_request.json(
                            )["experiments"]["experiment"][0]
                        except KeyError:
                            logger.error(
                                "Remote experiment has no Experiment data!",
                                experiment_accession_code=experiment.
                                accession_code,
                                survey_job=self.survey_job.id,
                            )
                            continue
                        ArrayExpressSurveyor._apply_metadata_to_experiment(
                            experiment, parsed_json)

                    experiment.save()

                # If there are any errors, just continue. It's likely that it's
                # just a problem with this experiment.
                except Exception:
                    logger.exception(
                        "Exception caught while updating metadata for {}".format(
                            experiment.accession_code
                        )
                    )

            if not page.has_next():
                break
            else:
                page = paginator.page(page.next_page_number())

            # Refreshing one page of experiments every five minutes should be
            # fast enough and also not thrash the DB.
            time.sleep(60 * 5)
    def handle(self, *args, **options):
        """Re-surveys GEO experiments containing samples with incorrect platform information.
        """
        # Check against CDF corrected accessions table to prevent recorrection of the same samples.
        corrected_experiments = CdfCorrectedAccession.objects.all().values(
            "accession_code")

        gse_experiments = Experiment.objects.filter(
            source_database="GEO").exclude(
                accession_code__in=corrected_experiments)

        paginator = Paginator(gse_experiments, PAGE_SIZE)
        page = paginator.page()

        while True:
            for experiment in page.object_list:
                try:
                    gse = GEOparse.get_GEO(experiment.accession_code,
                                           destdir=GEO_TEMP_DIR,
                                           how="brief",
                                           silent=True)

                    sample_accessions = list(gse.gsms.keys())
                    samples = Sample.objects.filter(
                        accession_code__in=sample_accessions)

                    wrong_platform = False
                    for sample in samples:
                        gpl = gse.gsms[sample.accession_code].metadata["platform_id"][0]
                        internal_accession = get_internal_microarray_accession(gpl)
                        if internal_accession != sample.platform_accession_code:
                            wrong_platform = True
                            break

                    if wrong_platform:
                        if options["dry_run"]:
                            logger.info(
                                "Would have re-surveyed experiment with accession code %s",
                                experiment.accession_code,
                            )
                        else:
                            logger.info(
                                "Re-surveying experiment with accession code %s",
                                experiment.accession_code,
                            )

                            purge_experiment(experiment.accession_code)

                            queue_surveyor_for_accession(
                                experiment.accession_code)

                    current_time = timezone.now()
                    CdfCorrectedAccession(
                        accession_code=experiment.accession_code,
                        created_at=current_time).save()
                except Exception:
                    logger.exception("Caught an exception with %s!",
                                     experiment.accession_code)
                finally:
                    # GEOparse downloads files here and never cleans them up! Grrrr!
                    download_path = GEO_TEMP_DIR + experiment.accession_code + "_family.soft.gz"
                    # os.remove will raise if the file is missing or if GEOparse
                    # left a directory here, hence the try/except below.
                    try:
                        os.remove(download_path)
                    except Exception:
                        # Don't let anything interrupt this, like, say,
                        # GEOparse downloading a directory instead of
                        # a file...
                        logger.exception("Failed to delete an archive.")

            if not page.has_next():
                break

            page = paginator.page(page.next_page_number())
    def handle(self, *args, **options):
        """Refreshes the metadata for all samples, or samples from a specific database
        """
        possible_source_databases = ["ARRAY_EXPRESS", "GEO", "SRA"]

        if options.get("source_database", None) is None:
            samples = Sample.objects.all()
        elif options["source_database"] in possible_source_databases:
            source_database = options["source_database"]
            samples = Sample.objects.filter(source_database=source_database)
        else:
            logger.error(
                'Invalid source database "{}"'.format(options["source_database"])
                + "\nPossible source databases: {}".format(", ".join(possible_source_databases))
            )
            sys.exit(1)

        paginator = PerformantPaginator(samples, PAGE_SIZE)
        page = paginator.page()

        while True:
            for sample in page.object_list:
                logger.debug("Refreshing metadata for a sample.",
                             sample=sample.accession_code)
                if sample.source_database == "SRA":
                    metadata = SraSurveyor.gather_all_metadata(
                        sample.accession_code)
                    SraSurveyor._apply_harmonized_metadata_to_sample(
                        sample, metadata)
                elif sample.source_database == "GEO":
                    gse = GEOparse.get_GEO(
                        sample.experiments.first().accession_code,
                        destdir="/tmp/management",
                        how="brief",
                        silent=True,
                    )
                    preprocessed_samples = harmony.preprocess_geo(
                        gse.gsms.items())
                    harmonized_samples = harmony.harmonize(
                        preprocessed_samples)
                    GeoSurveyor._apply_harmonized_metadata_to_sample(
                        sample, harmonized_samples[sample.title])
                elif sample.source_database == "ARRAY_EXPRESS":
                    SDRF_URL_TEMPLATE = (
                        "https://www.ebi.ac.uk/arrayexpress/files/{code}/{code}.sdrf.txt"
                    )
                    sdrf_url = SDRF_URL_TEMPLATE.format(
                        code=sample.experiments.first().accession_code)
                    sdrf_samples = harmony.parse_sdrf(sdrf_url)
                    harmonized_samples = harmony.harmonize(sdrf_samples)
                    ArrayExpressSurveyor._apply_harmonized_metadata_to_sample(
                        sample, harmonized_samples[sample.title])

                sample.save()

            if not page.has_next():
                break
            else:
                page = paginator.page(page.next_page_number())

            # Refreshing 2000 samples every five minutes should be fast
            # enough and also not thrash the DB.
            time.sleep(60 * 5)