Example #1
0
def _create_transcriptome_index_job(downloader_job, files_to_process,
                                    pipeline_name, failure_message):
    """Create, associate, and queue one transcriptome-index processor job.

    Builds a ProcessorJob for `pipeline_name` linked to `downloader_job`,
    associates every original file in `files_to_process` with it, and
    submits it. Submission failures are logged with `failure_message` but
    not raised, because the foreman will requeue unsent jobs later.
    """
    processor_job = ProcessorJob()
    processor_job.downloader_job = downloader_job
    processor_job.pipeline_applied = pipeline_name
    processor_job.ram_amount = 4096
    processor_job.save()

    for original_file in files_to_process:
        assoc = ProcessorJobOriginalFileAssociation()
        assoc.original_file = original_file
        assoc.processor_job = processor_job
        assoc.save()

    try:
        send_job(ProcessorPipeline[processor_job.pipeline_applied],
                 processor_job)
    except Exception:
        # This is fine, the foreman will requeue these later.
        logger.exception(failure_message)


def create_long_and_short_processor_jobs(downloader_job, long_files_to_process,
                                         short_files_to_process):
    """ Creates two processor jobs for the files needed for this transcriptome"""

    _create_transcriptome_index_job(
        downloader_job, long_files_to_process, "TRANSCRIPTOME_INDEX_LONG",
        "Problem with submitting a long transcriptome index job.")

    # NOTE: the short job previously logged the "long" failure message on
    # submission errors; the message now correctly says "short".
    _create_transcriptome_index_job(
        downloader_job, short_files_to_process, "TRANSCRIPTOME_INDEX_SHORT",
        "Problem with submitting a short transcriptome index job.")
Example #2
0
def create_processor_job_for_original_files(
    original_files: List[OriginalFile], downloader_job: DownloaderJob = None,
):
    """
    Create a processor job and queue a processor task for sample related to an experiment.
    """
    # No acceptable original files means there are no jobs left to create.
    if not original_files:
        return

    # Prefer an original file carrying raw data; fall back to the first one.
    original_file_to_use = original_files[0]
    for candidate in original_files:
        if candidate.is_blacklisted():
            logger.debug(
                "Original file had a blacklisted extension of %s, skipping",
                extension=candidate.get_extension(),
                original_file=candidate.id,
            )
            candidate.delete_local_file()
            continue

        if candidate.has_raw:
            original_file_to_use = candidate

    # For anything that has raw data there should only be one Sample per OriginalFile.
    sample_object = original_file_to_use.samples.first()
    pipeline_to_apply = determine_processor_pipeline(sample_object, original_file_to_use)

    if pipeline_to_apply == ProcessorPipeline.NONE:
        # Nothing can process this sample; clean up the local copies.
        logger.info(
            "No valid processor pipeline found to apply to sample.",
            sample=sample_object.id,
            original_file=original_file_to_use.id,
        )
        for original_file in original_files:
            original_file.delete_local_file()
        return

    processor_job = ProcessorJob()
    processor_job.downloader_job = downloader_job
    processor_job.pipeline_applied = pipeline_to_apply.value
    processor_job.ram_amount = determine_ram_amount(processor_job, sample_object)

    processor_job.save()

    # Link every original file to the new processor job.
    for original_file in original_files:
        assoc = ProcessorJobOriginalFileAssociation()
        assoc.original_file = original_file
        assoc.processor_job = processor_job
        assoc.save()

    logger.debug(
        "Queuing processor job.",
        processor_job=processor_job.id,
        downloader_job=downloader_job.id if downloader_job else None,
    )

    try:
        send_job(pipeline_to_apply, processor_job)
    except Exception:
        # If we cannot queue the job now the Foreman will do
        # it later.
        pass
Example #3
0
    def test_create_missing_jobs(self):
        """Tests that files which should have processor jobs get them created.

        Specifically files that fall into this category are files that
        had successful downloader jobs but for some reason do not have
        processor jobs. It's not yet known why this is happening, but
        part of this management command is logging about them to get a
        grasp of how many there are.

        We want this test to cover both Microarray and RNA-Seq. We
        also need to test both that files which need processor jobs
        have them created, but also that files which don't need them
        don't get them created.

        Therefore we need at least 4 original files:
          * Microarray needing processor job.
          * Microarray not needing processor job.
          * RNA-Seq needing processor job.
          * RNA-Seq not needing processor job.

        However Microarray can have files which shouldn't get
        processor jobs, so we're going to make one of those as
        well. Also Microarray jobs can download multiple files which
        get a processor job each, so we're going to make an additional
        Microarray file and associate it with the same downloader job
        so we can make sure two processor jobs are created based on
        that one downloader job.
        """
        # Microarray File/Samples/Jobs
        ma_og_doesnt_need_processor = OriginalFile()
        ma_og_doesnt_need_processor.filename = "processed.CEL"
        ma_og_doesnt_need_processor.source_filename = "processed.CEL"
        ma_og_doesnt_need_processor.is_downloaded = True
        ma_og_doesnt_need_processor.is_archive = False
        ma_og_doesnt_need_processor.save()

        ma_sample_doesnt_need_processor = Sample()
        ma_sample_doesnt_need_processor.accession_code = "MA_doesnt_need_processor"
        ma_sample_doesnt_need_processor.technology = "MICROARRAY"
        ma_sample_doesnt_need_processor.source_database = "ARRAY_EXPRESS"
        ma_sample_doesnt_need_processor.save()

        OriginalFileSampleAssociation.objects.get_or_create(
            sample=ma_sample_doesnt_need_processor,
            original_file=ma_og_doesnt_need_processor)

        ma_dl_job_doesnt_need_processor = DownloaderJob()
        ma_dl_job_doesnt_need_processor.success = True
        ma_dl_job_doesnt_need_processor.worker_id = "worker_1"
        ma_dl_job_doesnt_need_processor.volume_index = "1"
        ma_dl_job_doesnt_need_processor.save()

        DownloaderJobOriginalFileAssociation.objects.get_or_create(
            downloader_job=ma_dl_job_doesnt_need_processor,
            original_file=ma_og_doesnt_need_processor,
        )

        # This file already has a successful processor job, so the
        # command should not create another one for it.
        ma_processor_job = ProcessorJob()
        ma_processor_job.downloader_job = ma_dl_job_doesnt_need_processor
        ma_processor_job.success = True
        ma_processor_job.worker_id = "worker_1"
        ma_processor_job.volume_index = "1"
        ma_processor_job.save()

        ProcessorJobOriginalFileAssociation.objects.get_or_create(
            processor_job=ma_processor_job,
            original_file=ma_og_doesnt_need_processor)

        ma_og_needs_processor_1 = OriginalFile()
        ma_og_needs_processor_1.filename = "something.CEL"
        ma_og_needs_processor_1.source_filename = "something.CEL"
        ma_og_needs_processor_1.is_downloaded = True
        ma_og_needs_processor_1.is_archive = False
        ma_og_needs_processor_1.save()

        ma_og_needs_processor_2 = OriginalFile()
        ma_og_needs_processor_2.filename = "something_else.CEL"
        ma_og_needs_processor_2.source_filename = "something_else.CEL"
        ma_og_needs_processor_2.is_downloaded = True
        ma_og_needs_processor_2.is_archive = False
        ma_og_needs_processor_2.save()

        # Archive files should never get processor jobs themselves.
        ma_og_archive = OriginalFile()
        ma_og_archive.filename = "tarball.gz"
        ma_og_archive.source_filename = "tarball.gz"
        ma_og_archive.is_downloaded = True
        ma_og_archive.is_archive = True
        ma_og_archive.save()

        ma_sample_needs_processor_1 = Sample()
        ma_sample_needs_processor_1.accession_code = "MA_needs_processor_1"
        ma_sample_needs_processor_1.technology = "MICROARRAY"
        ma_sample_needs_processor_1.source_database = "ARRAY_EXPRESS"
        ma_sample_needs_processor_1.save()

        OriginalFileSampleAssociation.objects.get_or_create(
            sample=ma_sample_needs_processor_1,
            original_file=ma_og_needs_processor_1)
        OriginalFileSampleAssociation.objects.get_or_create(
            sample=ma_sample_needs_processor_1, original_file=ma_og_archive)

        ma_sample_needs_processor_2 = Sample()
        ma_sample_needs_processor_2.accession_code = "MA_needs_processor_2"
        ma_sample_needs_processor_2.technology = "MICROARRAY"
        ma_sample_needs_processor_2.source_database = "ARRAY_EXPRESS"
        ma_sample_needs_processor_2.save()

        OriginalFileSampleAssociation.objects.get_or_create(
            sample=ma_sample_needs_processor_2,
            original_file=ma_og_needs_processor_2)
        OriginalFileSampleAssociation.objects.get_or_create(
            sample=ma_sample_needs_processor_2, original_file=ma_og_archive)

        ma_dl_job_needs_processor = DownloaderJob()
        ma_dl_job_needs_processor.success = True
        ma_dl_job_needs_processor.worker_id = "worker_1"
        # Fixed copy-paste bug: this previously set volume_index on
        # ma_dl_job_doesnt_need_processor instead of the job being built here.
        ma_dl_job_needs_processor.volume_index = "1"
        ma_dl_job_needs_processor.save()

        DownloaderJobOriginalFileAssociation.objects.get_or_create(
            downloader_job=ma_dl_job_needs_processor,
            original_file=ma_og_needs_processor_1)
        DownloaderJobOriginalFileAssociation.objects.get_or_create(
            downloader_job=ma_dl_job_needs_processor,
            original_file=ma_og_needs_processor_2)
        DownloaderJobOriginalFileAssociation.objects.get_or_create(
            downloader_job=ma_dl_job_needs_processor,
            original_file=ma_og_archive)

        # RNA-Seq File/Samples/Jobs
        rna_og_doesnt_need_processor = OriginalFile()
        rna_og_doesnt_need_processor.filename = "processed.fastq"
        rna_og_doesnt_need_processor.source_filename = "processed.fastq"
        rna_og_doesnt_need_processor.is_downloaded = True
        rna_og_doesnt_need_processor.is_archive = False
        rna_og_doesnt_need_processor.save()

        rna_sample_doesnt_need_processor = Sample()
        rna_sample_doesnt_need_processor.accession_code = "RNA_doesnt_need_processor"
        rna_sample_doesnt_need_processor.technology = "RNA-SEQ"
        rna_sample_doesnt_need_processor.source_database = "SRA"
        rna_sample_doesnt_need_processor.save()

        OriginalFileSampleAssociation.objects.get_or_create(
            sample=rna_sample_doesnt_need_processor,
            original_file=rna_og_doesnt_need_processor)

        rna_dl_job_doesnt_need_processor = DownloaderJob()
        rna_dl_job_doesnt_need_processor.success = True
        rna_dl_job_doesnt_need_processor.worker_id = "worker_1"
        rna_dl_job_doesnt_need_processor.volume_index = "1"
        rna_dl_job_doesnt_need_processor.save()

        DownloaderJobOriginalFileAssociation.objects.get_or_create(
            downloader_job=rna_dl_job_doesnt_need_processor,
            original_file=rna_og_doesnt_need_processor,
        )

        rna_processor_job = ProcessorJob()
        rna_processor_job.downloader_job = rna_dl_job_doesnt_need_processor
        # Failed ProcessorJobs will be retried, so they still count.
        rna_processor_job.success = False
        rna_processor_job.worker_id = "worker_1"
        # Fixed copy-paste bug: this previously (re)set volume_index on the
        # downloader job instead of on the processor job being built here.
        rna_processor_job.volume_index = "1"
        rna_processor_job.save()

        ProcessorJobOriginalFileAssociation.objects.get_or_create(
            processor_job=rna_processor_job,
            original_file=rna_og_doesnt_need_processor)

        rna_og_needs_processor = OriginalFile()
        rna_og_needs_processor.filename = "something.fastq"
        rna_og_needs_processor.source_filename = "something.fastq"
        rna_og_needs_processor.is_downloaded = True
        rna_og_needs_processor.is_archive = False
        rna_og_needs_processor.save()

        rna_sample_needs_processor = Sample()
        rna_sample_needs_processor.accession_code = "RNA_needs_processor"
        rna_sample_needs_processor.technology = "RNA-SEQ"
        rna_sample_needs_processor.source_database = "SRA"
        rna_sample_needs_processor.save()

        OriginalFileSampleAssociation.objects.get_or_create(
            sample=rna_sample_needs_processor,
            original_file=rna_og_needs_processor)

        rna_dl_job_needs_processor = DownloaderJob()
        rna_dl_job_needs_processor.success = True
        rna_dl_job_needs_processor.worker_id = "worker_1"
        # Fixed copy-paste bug: this previously set volume_index on
        # rna_dl_job_doesnt_need_processor instead of the job being built here.
        rna_dl_job_needs_processor.volume_index = "1"
        rna_dl_job_needs_processor.save()

        DownloaderJobOriginalFileAssociation.objects.get_or_create(
            downloader_job=rna_dl_job_needs_processor,
            original_file=rna_og_needs_processor)

        # Setup is done, actually run the command.
        command = Command()
        command.handle()

        # Test Microarray was handled correctly.
        ## Test that a missing processor job was created.
        self.assertEqual(
            1,
            ProcessorJobOriginalFileAssociation.objects.filter(
                original_file=ma_og_needs_processor_1).count(),
        )
        self.assertEqual(
            1,
            ProcessorJobOriginalFileAssociation.objects.filter(
                original_file=ma_og_needs_processor_2).count(),
        )
        self.assertEqual(
            0,
            ProcessorJobOriginalFileAssociation.objects.filter(
                original_file=ma_og_archive).count(),
        )

        ## Test that a processor job that wasn't missing wasn't created.
        ## Of course, we created one in test setup, so we're really
        ## checking that it's still only 1.
        self.assertEqual(
            1,
            ProcessorJobOriginalFileAssociation.objects.filter(
                original_file=ma_og_doesnt_need_processor).count(),
        )

        # Test RNA-Seq was handled correctly.
        ## Test that the missing processor job was created.
        self.assertEqual(
            1,
            ProcessorJobOriginalFileAssociation.objects.filter(
                original_file=rna_og_needs_processor).count(),
        )

        ## Test that a processor job that wasn't missing wasn't created.
        ## Of course, we created one in test setup, so we're really
        ## checking that it's still only 1.
        self.assertEqual(
            1,
            ProcessorJobOriginalFileAssociation.objects.filter(
                original_file=rna_og_doesnt_need_processor).count(),
        )