Example 1
def prepare_job():
    pj = ProcessorJob()
    pj.pipeline_applied = "AFFY_TO_PCL"
    pj.save()

    original_file = OriginalFile()
    original_file.source_filename = "ftp://ftp.ebi.ac.uk/pub/databases/microarray/data/experiment/GEOD/E-GEOD-59071/E-GEOD-59071.raw.3.zip"
    original_file.filename = "GSM1426071_CD_colon_active_1.CEL"
    original_file.absolute_file_path = "/home/user/data_store/raw/TEST/CEL/GSM1426071_CD_colon_active_1.CEL"
    original_file.save()

    assoc1 = ProcessorJobOriginalFileAssociation()
    assoc1.original_file = original_file
    assoc1.processor_job = pj
    assoc1.save()

    c_elegans = Organism.get_object_for_name("CAENORHABDITIS_ELEGANS")

    sample = Sample()
    sample.title = "Heyo"
    sample.organism = c_elegans
    sample.is_processed = False
    sample.save()

    ogsa = OriginalFileSampleAssociation()
    ogsa.sample = sample
    ogsa.original_file = original_file
    ogsa.save()

    return pj
Example 2
def create_qn_target(organism, platform, create_results=True):
    sample_codes_results = Sample.processed_objects.filter(
        platform_accession_code=platform,
        has_raw=True,
        technology="MICROARRAY",
        organism=organism,
        is_processed=True,
    ).values("accession_code")
    sample_codes = [res["accession_code"] for res in sample_codes_results]

    dataset = Dataset()
    dataset.data = {organism.name + "_(" + platform + ")": sample_codes}
    dataset.aggregate_by = "ALL"
    dataset.scale_by = "NONE"
    dataset.quantile_normalize = False
    dataset.save()

    job = ProcessorJob()
    job.pipeline_applied = "QN_REFERENCE"
    job.save()

    pjda = ProcessorJobDatasetAssociation()
    pjda.processor_job = job
    pjda.dataset = dataset
    pjda.save()

    return qn_reference.create_qn_reference(job.pk, create_results=create_results)
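A hedged usage sketch for the helper above; the organism lookup mirrors the other examples, while the platform accession is a placeholder, not a value taken from a real test.

# Hypothetical invocation (the platform accession is illustrative only).
homo_sapiens = Organism.get_object_for_name("HOMO_SAPIENS")
final_context = create_qn_target(homo_sapiens, platform="A-AFFY-141",
                                 create_results=False)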
Example 3
    def create_processor_job(self, pipeline="AFFY_TO_PCL", ram_amount=2048):
        job = ProcessorJob(
            pipeline_applied=pipeline,
            nomad_job_id="PROCESSOR/dispatch-1528945054-e8eaf540",
            ram_amount=ram_amount,
            num_retries=0,
            volume_index="1",
            success=None)
        job.save()

        og_file = OriginalFile()
        og_file.source_filename = "doesn't matter"
        og_file.filename = "this either"
        og_file.absolute_file_path = "nor this"
        og_file.save()

        assoc1 = ProcessorJobOriginalFileAssociation()
        assoc1.original_file = og_file
        assoc1.processor_job = job
        assoc1.save()

        og_file = OriginalFile()
        og_file.source_filename = "doesn't matter"
        og_file.filename = "this either"
        og_file.absolute_file_path = "nor this"
        og_file.save()

        assoc = ProcessorJobOriginalFileAssociation()
        assoc.original_file = og_file
        assoc.processor_job = job
        assoc.save()

        return job
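A minimal sketch of calling this factory from a test method; the pipeline and RAM values are illustrative, and the association count simply reflects the two OriginalFiles created above.

# Hypothetical usage inside a test case (argument values are illustrative).
job = self.create_processor_job(pipeline="SALMON", ram_amount=4096)
assocs = ProcessorJobOriginalFileAssociation.objects.filter(processor_job=job)
self.assertEqual(assocs.count(), 2)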
Example 4
    def test_filter_rnaseq_matrix_drop_row_sums(self):
        job = ProcessorJob()
        job.pipeline_applied = ProcessorPipeline.CREATE_COMPENDIA.value
        job.save()

        samples = list(str(i) for i in range(0, 10))
        df = pd.DataFrame(columns=samples)
        for i in range(1, 101):
            df.loc[str(i)] = {idx: i for idx in samples}

        job_context = {"rnaseq_matrix": df, "job": job}

        final_job_context = create_compendia._filter_rnaseq_matrix(job_context)

        filtered_matrix = final_job_context["filtered_rnaseq_matrix"]

        # Make sure that we are getting rid of intermediate results
        # appropriately. Because these matrices can be pretty heavy, the input
        # should not stick around in the job context like this.
        self.assertNotIn("rnaseq_matrix", final_job_context.keys())

        # We drop all rows below the 10th percentile in row sum, so we would
        # expect to drop rows 1 through 10 that we created above
        self.assertEqual(set(filtered_matrix.index),
                         set(str(i) for i in range(11, 101)))
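The filter these assertions expect can be reproduced in a few lines of pandas on the df built in this test; this is only a sketch of the presumed behavior, not the actual _filter_rnaseq_matrix implementation.

# Assumed behavior: drop rows whose sum is at or below the 10th percentile.
row_sums = df.sum(axis=1)          # row "i" sums to 10 * i
cutoff = row_sums.quantile(0.1)    # 109 for the matrix built above
filtered = df[row_sums > cutoff]   # keeps rows "11" through "100"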
Example 5
    def test_failure(self):
        """Fails because there are no files for the job."""
        processor_job = ProcessorJob()
        processor_job.save()

        job_context = utils.start_job({"job": processor_job})
        self.assertFalse(job_context["success"])
Example 6
    def test_notify(self):

        ds = Dataset()
        ds.data = {'GSM1487313': ['GSM1487313'], 'SRS332914': ['SRS332914']}
        ds.aggregate_by = 'SPECIES'
        ds.scale_by = 'STANDARD'
        ds.email_address = "*****@*****.**"
        ds.quantile_normalize = False
        ds.save()

        pj = ProcessorJob()
        pj.pipeline_applied = "SMASHER"
        pj.save()

        pjda = ProcessorJobDatasetAssociation()
        pjda.processor_job = pj
        pjda.dataset = ds
        pjda.save()

        job_context = {
            'job': pj,
            'dataset': ds,
            'upload': True,
            'result_url': 'https://s3.amazonaws.com/data-refinery-test-assets/all_the_things.jpg',
        }

        final_context = smasher._notify(job_context)
        self.assertTrue(final_context.get('success', True))
Example 7
def create_processor_job(pipeline="AFFY_TO_PCL",
                         ram_amount=2048,
                         start_time=None):
    og_file_1 = OriginalFile()
    og_file_1.source_filename = "doesn't matter"
    og_file_1.filename = "this either"
    og_file_1.absolute_file_path = "nor this"
    og_file_1.save()

    og_file_2 = OriginalFile()
    og_file_2.source_filename = "doesn't matter"
    og_file_2.filename = "this either"
    og_file_2.absolute_file_path = "nor this"
    og_file_2.save()

    downloader_job = None
    if pipeline == "AFFY_TO_PCL":
        downloader_job = DownloaderJob(
            downloader_task="SRA",
            batch_job_id="DEFAULT",
            num_retries=0,
            accession_code="NUNYA",
            success=None,
        )
        downloader_job.save()

        assoc = DownloaderJobOriginalFileAssociation()
        assoc.original_file = og_file_2
        assoc.downloader_job = downloader_job
        assoc.save()

        assoc1 = DownloaderJobOriginalFileAssociation()
        assoc1.original_file = og_file_1
        assoc1.downloader_job = downloader_job
        assoc1.save()

    processor_job = ProcessorJob(
        downloader_job=downloader_job,
        pipeline_applied=pipeline,
        batch_job_id="PROCESSOR/dispatch-1528945054-e8eaf540",
        ram_amount=ram_amount,
        num_retries=0,
        success=None,
        start_time=start_time,
    )
    processor_job.save()

    assoc1 = ProcessorJobOriginalFileAssociation()
    assoc1.original_file = og_file_1
    assoc1.processor_job = processor_job
    assoc1.save()

    assoc = ProcessorJobOriginalFileAssociation()
    assoc.original_file = og_file_2
    assoc.processor_job = processor_job
    assoc.save()

    return processor_job
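A hedged example of calling the factory above; timezone.now() is standard Django, and the SALMON pipeline string appears elsewhere in these examples.

# Hypothetical usage: create a SALMON job that already has a start time.
from django.utils import timezone

job = create_processor_job(pipeline="SALMON", ram_amount=4096,
                           start_time=timezone.now())
assert job.downloader_job is None  # only the AFFY_TO_PCL branch creates one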
Example 8
def prepare_illumina_job(organism):
    pj = ProcessorJob()
    pj.pipeline_applied = "ILLUMINA_TO_PCL"
    pj.save()

    og_file = OriginalFile()
    og_file.source_filename = "ftp://ftp.ncbi.nlm.nih.gov/geo/series/GSE22nnn/GSE22427/suppl/GSE22427%5Fnon%2Dnormalized%2Etxt.gz"
    og_file.filename = "GSE22427_non-normalized.txt"
    og_file.absolute_file_path = (
        "/home/user/data_store/raw/TEST/ILLUMINA/GSE22427_non-normalized.txt")
    og_file.is_downloaded = True
    og_file.save()

    assoc1 = ProcessorJobOriginalFileAssociation()
    assoc1.original_file = og_file
    assoc1.processor_job = pj
    assoc1.save()

    sample_names = [
        "LV-C&si-Control-1",
        "LV-C&si-Control-2",
        "LV-C&si-Control-3",
        "LV-C&si-EZH2-1",
        "LV-C&si-EZH2-2",
        "LV-C&si-EZH2-3",
        "LV-EZH2&si-EZH2-1",
        "LV-EZH2&si-EZH2-2",
        "LV-EZH2&si-EZH2-3",
        "LV-T350A&si-EZH2-1",
        "LV-T350A&si-EZH2-2",
        "LV-T350A&si-EZH2-3",
    ]

    for name in sample_names:
        sample = Sample()
        sample.accession_code = name
        sample.title = name
        sample.organism = organism
        sample.save()

        sa = SampleAnnotation()
        sa.sample = sample
        sa.data = {"description": [name]}
        sa.is_ccdl = False
        sa.save()

        sample_assoc = OriginalFileSampleAssociation()
        sample_assoc.original_file = og_file
        sample_assoc.sample = sample
        sample_assoc.save()

    sample = Sample.objects.get(title="LV-T350A&si-EZH2-3")
    sample.title = "ignoreme_for_description"
    sample.accession_code = "ignoreme_for_description"
    sample.save()

    return pj
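One plausible way to consume this fixture, assuming the Organism constructor from the later examples and the illumina.illumina_to_pcl entry point shown in Example 16:

organism = Organism(name="HOMO_SAPIENS", taxonomy_id=9606,
                    is_scientific_name=True)
organism.save()

pj = prepare_illumina_job(organism)
final_context = illumina.illumina_to_pcl(pj.pk)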
Example 9
    def test_fail(self):
        """ Test our ability to fail """

        result = ComputationalResult()
        result.save()

        sample = Sample()
        sample.accession_code = 'XXX'
        sample.title = 'XXX'
        sample.organism = Organism.get_object_for_name("HOMO_SAPIENS")
        sample.save()

        sra = SampleResultAssociation()
        sra.sample = sample
        sra.result = result
        sra.save()

        computed_file = ComputedFile()
        computed_file.filename = "NOT_REAL.PCL"
        computed_file.absolute_file_path = "/home/user/data_store/PCL/" + computed_file.filename
        computed_file.result = result
        computed_file.size_in_bytes = 123
        computed_file.is_smashable = True
        computed_file.save()

        assoc = SampleComputedFileAssociation()
        assoc.sample = sample
        assoc.computed_file = computed_file
        assoc.save()

        ds = Dataset()
        ds.data = {'GSE51081': ['XXX']}
        ds.aggregate_by = 'EXPERIMENT'
        ds.scale_by = 'MINMAX'
        ds.email_address = "*****@*****.**"
        ds.quantile_normalize = False
        ds.save()
        dsid = ds.id

        job = ProcessorJob()
        job.pipeline_applied = "SMASHER"
        job.save()

        pjda = ProcessorJobDatasetAssociation()
        pjda.processor_job = job
        pjda.dataset = ds
        pjda.save()

        final_context = smasher.smash(job.pk, upload=False)
        ds = Dataset.objects.get(id=dsid)
        print(ds.failure_reason)
        print(final_context['dataset'].failure_reason)
        self.assertNotEqual(final_context['unsmashable_files'], [])
Example 10
    def handle(self, *args, **options):

        pj = ProcessorJob()
        pj.pipeline_applied = "JANITOR"
        pj.save()

        final_context = run_janitor(pj.pk)

        print("Removed: ")
        for item in final_context['deleted_items']:
            print('\t - ' + item)

        sys.exit(0)
Example 11
def prepare_job():
    pj = ProcessorJob()
    pj.pipeline_applied = "SALMON"
    pj.save()

    c_elegans = Organism.get_object_for_name("CAENORHABDITIS_ELEGANS")

    samp = Sample()
    samp.accession_code = "SALMON" # So the test files go to the right place
    samp.organism = c_elegans
    samp.source_database = 'SRA'
    samp.technology = 'RNA-SEQ'
    samp.save()

    prepare_organism_indices()

    og_file = OriginalFile()
    og_file.source_filename = "ERR1562482_1.fastq.gz"
    og_file.filename = "ERR1562482_1.fastq.gz"
    og_file.absolute_file_path = "/home/user/data_store/raw/TEST/SALMON/ERR1562482_1.fastq.gz"
    og_file.is_downloaded = True
    og_file.save()

    og_file2 = OriginalFile()
    og_file2.source_filename = "ERR1562482_2.fastq.gz"
    og_file2.filename = "ERR1562482_2.fastq.gz"
    og_file2.absolute_file_path = "/home/user/data_store/raw/TEST/SALMON/ERR1562482_2.fastq.gz"
    og_file2.is_downloaded = True
    og_file2.save()

    og_file_samp_assoc = OriginalFileSampleAssociation()
    og_file_samp_assoc.original_file = og_file
    og_file_samp_assoc.sample = samp
    og_file_samp_assoc.save()

    og_file_samp_assoc2 = OriginalFileSampleAssociation()
    og_file_samp_assoc2.original_file = og_file2
    og_file_samp_assoc2.sample = samp
    og_file_samp_assoc2.save()

    assoc1 = ProcessorJobOriginalFileAssociation()
    assoc1.original_file = og_file
    assoc1.processor_job = pj
    assoc1.save()

    assoc2 = ProcessorJobOriginalFileAssociation()
    assoc2.original_file = og_file2
    assoc2.processor_job = pj
    assoc2.save()

    return pj, [og_file, og_file2]
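A sketch of running the prepared job; salmon.salmon as the public entry point is an assumption here, since these examples only show the private salmon._run_* helpers.

# Assumed entry point (not shown elsewhere in these examples).
pj, og_files = prepare_job()
final_context = salmon.salmon(pj.pk)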
Example 12
def prepare_illumina_job(job_info: Dict) -> ProcessorJob:
    pj = ProcessorJob()
    pj.pipeline_applied = "ILLUMINA_TO_PCL"
    pj.save()

    og_file = OriginalFile()
    og_file.source_filename = job_info["source_filename"]
    og_file.filename = job_info["filename"]
    og_file.absolute_file_path = job_info["absolute_file_path"]
    og_file.is_downloaded = True
    og_file.save()

    assoc1 = ProcessorJobOriginalFileAssociation()
    assoc1.original_file = og_file
    assoc1.processor_job = pj
    assoc1.save()

    for s in job_info["samples"]:
        # For convenience, if you give a list of strings we'll just use the
        # strings as both titles and accessions.
        annotation = None
        if type(s) == str:
            accession_code = s
            title = s
        elif type(s) == tuple and list(map(type, s)) == [str, str]:
            accession_code, title = s
        elif type(s) == tuple and list(map(type, s)) == [str, str, dict]:
            accession_code, title, annotation = s
        else:
            raise ValueError(f"Invalid sample type for sample {s}")

        sample = Sample()
        sample.accession_code = accession_code
        sample.title = title
        sample.organism = job_info["organism"]
        sample.save()

        sa = SampleAnnotation()
        sa.sample = sample
        sa.data = annotation if annotation is not None else {
            "description": [title]
        }
        sa.is_ccdl = False
        sa.save()

        sample_assoc = OriginalFileSampleAssociation()
        sample_assoc.original_file = og_file
        sample_assoc.sample = sample
        sample_assoc.save()

    return pj
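A hypothetical call exercising all three accepted sample formats; the paths and accessions are placeholders, and organism is assumed to be an Organism created as in the other examples.

pj = prepare_illumina_job({
    "source_filename": "ftp://example.invalid/GSE00000_non-normalized.txt.gz",
    "filename": "GSE00000_non-normalized.txt",
    "absolute_file_path": "/home/user/data_store/raw/TEST/ILLUMINA/GSE00000_non-normalized.txt",
    "organism": organism,
    "samples": [
        "GSM0000001",                                  # accession == title
        ("GSM0000002", "control-1"),                   # (accession, title)
        ("GSM0000003", "treated-1", {"note": ["x"]}),  # plus an annotation dict
    ],
})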
Example 13
def _try_sanitizing_file(file: str) -> str:
    pj = ProcessorJob()
    pj.pipeline_applied = "ILLUMINA_TO_PCL"
    pj.save()

    og_file = _make_original_file_with_contents(file)

    job_context = illumina._prepare_files({
        "job_id": pj.pk,
        "original_files": [og_file],
        "job": pj
    })
    job_context = illumina._detect_encoding(job_context)
    return illumina._sanitize_input_file(job_context)
Example 14
def create_job_for_organism(organism: Organism):
    """Returns a quantpendia job for the provided organism."""
    job = ProcessorJob()
    job.pipeline_applied = ProcessorPipeline.CREATE_QUANTPENDIA.value
    job.save()

    dset = Dataset()
    dset.data = build_dataset(organism)
    dset.scale_by = "NONE"
    dset.aggregate_by = "EXPERIMENT"
    dset.quantile_normalize = False
    dset.quant_sf_only = True
    dset.svd_algorithm = "NONE"
    dset.save()

    pjda = ProcessorJobDatasetAssociation()
    pjda.processor_job = job
    pjda.dataset = dset
    pjda.save()

    # Have to call this after setting the dataset since it's used in
    # the calculation.
    job.ram_amount = determine_ram_amount(job)
    job.save()

    return job
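Once created, the job would presumably be dispatched the same way Example 20 dispatches its janitor job; the organism name here is a placeholder.

organism = Organism.get_object_for_name("DANIO_RERIO")
job = create_job_for_organism(organism)
send_job(ProcessorPipeline.CREATE_QUANTPENDIA, job=job, is_dispatch=True)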
Example 15
    def test_convert_processed_illumina(self):
        job = ProcessorJob()
        job.pipeline_applied = "NO_OP"
        job.save()

        # ex:
        # Reporter Identifier VALUE   Detection Pval
        # ILMN_1343291    14.943602   0
        # ILMN_1343295    13.528082   0
        og_file = OriginalFile()
        og_file.source_filename = "https://www.ebi.ac.uk/arrayexpress/experiments/E-GEOD-22433/"
        og_file.filename = "GSM557500_sample_table.txt"
        og_file.absolute_file_path = (
            "/home/user/data_store/raw/TEST/NO_OP/GSM557500_sample_table.txt")
        og_file.is_downloaded = True
        og_file.save()

        homo_sapiens = Organism(name="HOMO_SAPIENS",
                                taxonomy_id=9606,
                                is_scientific_name=True)
        homo_sapiens.save()

        sample = Sample()
        sample.accession_code = "GSM557500"
        sample.title = "GSM557500"
        sample.platform_accession_code = "A-MEXP-1171"
        sample.manufacturer = "ILLUMINA"
        sample.organism = homo_sapiens
        sample.save()

        assoc = OriginalFileSampleAssociation()
        assoc.original_file = og_file
        assoc.sample = sample
        assoc.save()

        assoc1 = ProcessorJobOriginalFileAssociation()
        assoc1.original_file = og_file
        assoc1.processor_job = job
        assoc1.save()

        # To:
        # ENSG00000156508 14.943602
        # ENSG00000111640 13.528082
        final_context = no_op.no_op_processor(job.pk)
        self.assertTrue(final_context["success"])
        self.assertTrue(os.path.exists(final_context["output_file_path"]))
        self.assertEqual(os.path.getsize(final_context["output_file_path"]),
                         920374)
        self.assertTrue(
            no_op.check_output_quality(final_context["output_file_path"]))
Example 16
    def test_good_detection(self):
        """GSE54661 appears to be mislabled (illuminaHumanv4) on GEO. Shows our detector works. """
        from data_refinery_workers.processors import illumina

        pj = ProcessorJob()
        pj.pipeline_applied = "ILLUMINA_TO_PCL"
        pj.save()

        og_file = OriginalFile()
        og_file.source_filename = "ftp://ftp.ncbi.nlm.nih.gov/geo/series/GSE54nnn/GSE54661/suppl/GSE54661%5Fnon%5Fnormalized%2Etxt%2Egz"
        og_file.filename = "GSE54661_non_normalized.txt"
        og_file.absolute_file_path = (
            "/home/user/data_store/raw/TEST/ILLUMINA/GSE54661_non_normalized.txt"
        )
        og_file.is_downloaded = True
        og_file.save()

        assoc1 = ProcessorJobOriginalFileAssociation()
        assoc1.original_file = og_file
        assoc1.processor_job = pj
        assoc1.save()

        organism = Organism(name="HOMO_SAPIENS",
                            taxonomy_id=9606,
                            is_scientific_name=True)
        organism.save()

        sample = Sample()
        sample.accession_code = "ABCD-1234"
        sample.title = "hypoxia_Signal"
        sample.organism = organism
        sample.save()

        sample_assoc = OriginalFileSampleAssociation()
        sample_assoc.original_file = og_file
        sample_assoc.sample = sample
        sample_assoc.save()

        final_context = illumina.illumina_to_pcl(pj.pk)
        self.assertEqual(final_context["platform"], "illuminaHumanv3")

        annotations = final_context["samples"][0].sampleannotation_set.all()
        for key in annotations[0].data.keys():
            self.assertIn(key, [
                "detected_platform", "detection_percentage",
                "mapped_percentage"
            ])

        # Clean up after the job, since that won't happen automatically
        # when we aren't running in the cloud.
        shutil.rmtree(final_context["work_dir"], ignore_errors=True)
Example 17
    def test_jobs_sanity(self):
        """Just makes sure creating Jobs doesn't fail"""

        s_job = SurveyJob()
        s_job.save()

        processor_job = ProcessorJob()
        processor_job.pipeline_applied = "test0"
        processor_job.save()

        dl_job = DownloaderJob()
        dl_job.downloader_task = "XYZ"
        dl_job.accession_code = "123"
        dl_job.save()
Example 18
def create_job_for_organism(organisms: List[Organism], svd_algorithm="ARPACK"):
    """Returns a compendia job for the provided organism.

    Fetch all of the experiments and compile large but normally formated Dataset.
    """
    job = ProcessorJob()
    job.pipeline_applied = ProcessorPipeline.CREATE_COMPENDIA.value
    job.save()

    dataset = Dataset()
    dataset.data = get_dataset(organisms)
    dataset.scale_by = "NONE"
    dataset.aggregate_by = "SPECIES"
    dataset.quantile_normalize = True
    dataset.quant_sf_only = False
    dataset.svd_algorithm = svd_algorithm
    dataset.save()

    pjda = ProcessorJobDatasetAssociation()
    pjda.processor_job = job
    pjda.dataset = dataset
    pjda.save()

    # Have to call this after setting the dataset since it's used in
    # the calculation.
    job.ram_amount = determine_ram_amount(job)
    job.save()

    return job
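A matching sketch for this compendia variant, which takes a list of organisms; the svd_algorithm value is the signature's default and the dispatch mirrors Example 20.

organisms = [Organism.get_object_for_name("HOMO_SAPIENS")]
job = create_job_for_organism(organisms, svd_algorithm="ARPACK")
send_job(ProcessorPipeline.CREATE_COMPENDIA, job=job, is_dispatch=True)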
Example 19
    def test_convert_illumina_no_header(self):
        job = ProcessorJob()
        job.pipeline_applied = "NO_OP"
        job.save()

        # ex:
        # ILMN_1885639    10.0000 0.7931
        # ILMN_2209417    10.0000 0.2029
        # ILMN_1765401    152.0873    0.0000
        og_file = OriginalFile()
        og_file.source_filename = (
            "https://github.com/AlexsLemonade/refinebio/files/2255178/GSM1089291-tbl-1.txt"
        )
        og_file.filename = "GSM1089291-tbl-1.txt"
        og_file.absolute_file_path = "/home/user/data_store/raw/TEST/NO_OP/GSM1089291-tbl-1.txt"
        og_file.is_downloaded = True
        og_file.save()

        homo_sapiens = Organism(name="HOMO_SAPIENS",
                                taxonomy_id=9606,
                                is_scientific_name=True)
        homo_sapiens.save()

        sample = Sample()
        sample.accession_code = "GSM557500"
        sample.title = "GSM557500"
        sample.platform_accession_code = "A-MEXP-1171"
        sample.manufacturer = "ILLUMINA"
        sample.organism = homo_sapiens
        sample.save()

        assoc = OriginalFileSampleAssociation()
        assoc.original_file = og_file
        assoc.sample = sample
        assoc.save()

        assoc1 = ProcessorJobOriginalFileAssociation()
        assoc1.original_file = og_file
        assoc1.processor_job = job
        assoc1.save()

        # To:
        # ENSG00000105675 10
        # ENSG00000085721 152.0873
        # ENSG00000278494 152.0873
        final_context = no_op.no_op_processor(job.pk)
        self.assertTrue(final_context["success"])
        self.assertTrue(os.path.exists(final_context["output_file_path"]))
        self.assertEqual(os.path.getsize(final_context["output_file_path"]),
                         786207)
Example 20
def send_janitor_jobs():
    """Dispatch a Janitor job for each job queue.

    TODO: make this dispatch janitor jobs for all job queues.
    https://github.com/AlexsLemonade/refinebio/issues/2789
    """
    new_job = ProcessorJob(num_retries=0, pipeline_applied="JANITOR", ram_amount=2048)
    new_job.save()
    logger.info("Sending Janitor Job.", job_id=new_job.id)
    try:
        send_job(ProcessorPipeline["JANITOR"], job=new_job, is_dispatch=True)
    except Exception:
        # If we can't dispatch this job, something else has gone wrong;
        # we can try again on the next loop.
        return
Example 21
    def test_convert_illumina_bad_cols(self):
        """
        In the future, this test may be deprecated. For now it just alerts us that this case needs attention.
        """
        job = ProcessorJob()
        job.pipeline_applied = "NO_OP"
        job.save()

        # ex:
        # ILMN_1885639    10.0000 0.7931  11.0000 0.123
        # ILMN_2209417    10.0000 0.2029  11.1234 0.543
        # LMN_1765401    152.0873    0.0000  99.999  0.19
        og_file = OriginalFile()
        og_file.source_filename = (
            "https://github.com/AlexsLemonade/refinebio/files/2255178/GSM1089291-tbl-1-modified.txt"
        )
        og_file.filename = "GSM1089291-tbl-1-modified.txt"
        og_file.absolute_file_path = (
            "/home/user/data_store/raw/TEST/NO_OP/GSM1089291-tbl-1-modified.txt"
        )
        og_file.is_downloaded = True
        og_file.save()

        homo_sapiens = Organism(name="HOMO_SAPIENS",
                                taxonomy_id=9606,
                                is_scientific_name=True)
        homo_sapiens.save()

        sample = Sample()
        sample.accession_code = "GSM557500"
        sample.title = "GSM557500"
        sample.platform_accession_code = "A-MEXP-1171"
        sample.manufacturer = "ILLUMINA"
        sample.organism = homo_sapiens
        sample.save()

        assoc = OriginalFileSampleAssociation()
        assoc.original_file = og_file
        assoc.sample = sample
        assoc.save()

        assoc1 = ProcessorJobOriginalFileAssociation()
        assoc1.original_file = og_file
        assoc1.processor_job = job
        assoc1.save()

        final_context = no_op.no_op_processor(job.pk)
        self.assertFalse(final_context["success"])
        self.assertTrue("Tell Rich!" in final_context["job"].failure_reason)
Example 22
def prepare_job(length):

    pj = ProcessorJob()
    pj.pipeline_applied = "TRANSCRIPTOME_INDEX_" + length.upper()
    pj.save()

    homo_sapiens = Organism.get_object_for_name("HOMO_SAPIENS",
                                                taxonomy_id=1001)

    samp = Sample()
    samp.organism = homo_sapiens
    samp.accession_code = "derp" + length
    samp.save()

    og_file = OriginalFile()
    og_file.source_filename = "aegilops_tauschii_" + length + ".fa.gz"
    og_file.filename = "aegilops_tauschii_" + length + ".fa.gz"
    og_file.absolute_file_path = "/home/user/data_store/raw/TEST/TRANSCRIPTOME_INDEX/AEGILOPS_TAUSCHII/aegilops_tauschii_short.fa.gz"
    og_file.is_downloaded = True
    og_file.save()

    og_file2 = OriginalFile()
    og_file2.source_filename = "aegilops_tauschii_" + length + ".gtf.gz"
    og_file2.filename = "aegilops_tauschii_" + length + ".gtf.gz"
    og_file2.absolute_file_path = "/home/user/data_store/raw/TEST/TRANSCRIPTOME_INDEX/AEGILOPS_TAUSCHII/aegilops_tauschii_short.gtf.gz"
    og_file2.is_downloaded = True
    og_file2.save()

    og_file_samp_assoc = OriginalFileSampleAssociation()
    og_file_samp_assoc.original_file = og_file
    og_file_samp_assoc.sample = samp
    og_file_samp_assoc.save()

    og_file_samp_assoc2 = OriginalFileSampleAssociation()
    og_file_samp_assoc2.original_file = og_file2
    og_file_samp_assoc2.sample = samp
    og_file_samp_assoc2.save()

    assoc1 = ProcessorJobOriginalFileAssociation()
    assoc1.original_file = og_file
    assoc1.processor_job = pj
    assoc1.save()

    assoc2 = ProcessorJobOriginalFileAssociation()
    assoc2.original_file = og_file2
    assoc2.processor_job = pj
    assoc2.save()

    return pj
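A hedged sketch of driving this fixture; build_transcriptome_index as the public entry point is an assumption, since these examples only show the private transcriptome_index._* helpers.

pj = prepare_job("short")
# Assumed entry point and signature (not shown in these examples).
final_context = transcriptome_index.build_transcriptome_index(pj.pk,
                                                              length="short")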
Example 23
    def test_value_passing(self):
        """The keys added to job_context and returned by processors will be
        passed through to other processors.
        """
        batch, _ = init_objects()
        processor_job = ProcessorJob.create_job_and_relationships(
            batches=[batch])

        mock_processor = MagicMock()
        mock_context = {
            "something_to_pass_along": True,
            "job": processor_job,
            "batches": [batch]
        }
        mock_processor.return_value = mock_context

        def processor_function(job_context):
            self.assertTrue(job_context["something_to_pass_along"])
            return job_context

        test_processor = MagicMock(side_effect=processor_function)

        utils.run_pipeline(
            {"job_id": processor_job.id},
            [utils.start_job, mock_processor, test_processor, utils.end_job])

        processor_job.refresh_from_db()
        self.assertTrue(processor_job.success)
        self.assertIsNotNone(processor_job.end_time)

        batch.refresh_from_db()
        self.assertEqual(batch.status, BatchStatuses.PROCESSED.value)
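The pass-through contract this test verifies means a custom processor only needs to mutate and return the dict; a minimal sketch:

# Minimal sketch of a processor relying on the pass-through behavior.
def tag_processor(job_context):
    job_context["tagged"] = True
    return job_context

utils.run_pipeline({"job_id": processor_job.id},
                   [utils.start_job, tag_processor, utils.end_job])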
Example 24
    def test_single_read(self):
        """Test outputs when the sample has one read only."""
        job_context = {
            'job_id': 456,
            'job': ProcessorJob(),
            'pipeline': Pipeline(name="Salmon"),
            'input_file_path': self.test_dir + 'single_input/single_read.fastq',
            'output_directory': self.test_dir + 'single_output/',
            'salmontools_directory': self.test_dir + 'single_salmontools/',
            'salmontools_archive': self.test_dir + 'salmontools-result.tar.gz',
            'computed_files': []
        }
        os.makedirs(job_context["salmontools_directory"], exist_ok=True)

        homo_sapiens = Organism.get_object_for_name("HOMO_SAPIENS")
        sample = Sample()
        sample.organism = homo_sapiens
        sample.save()
        job_context["sample"] = sample

        salmon._run_salmontools(job_context)

        # Confirm job status
        self.assertTrue(job_context["success"])

        # Unpack result for checking
        os.system('gunzip ' + job_context['salmontools_directory'] + "*.gz")

        # Check output file
        output_file = job_context['salmontools_directory'] + 'unmapped_by_salmon.fa'
        expected_output_file = self.test_dir + 'expected_single_output/unmapped_by_salmon.fa'
        self.assertTrue(identical_checksum(output_file, expected_output_file))
Example 25
    # Note: `mocked_subprocess` is injected by a @patch decorator that is
    # not shown in this excerpt.
    def test_create_index_failure(self, mocked_subprocess):
        # Initialize mock and test objects
        mocked_subprocess.return_value = CompletedProcess(
            [], 1, stdout=None, stderr="Error: something went wrong.")
        batch, gtf_file, fasta_file = init_objects()
        processor_job = ProcessorJob.create_job_and_relationships(
            batches=[batch])

        # Mock out the job_context with everything the function under
        # test will expect
        job_context = utils.start_job({
            "job": processor_job,
            "job_id": processor_job.id,
            "gtf_file": gtf_file,
            "gtf_file_path": "dummy",
            "fasta_file": fasta_file,
            "fasta_file_path": "dummy",
            "genes_to_transcripts_path": "dummy"
        })
        job_context = transcriptome_index._set_job_prefix(job_context)

        # The function being tested.
        job_context = transcriptome_index._create_index(job_context)

        self.assertFalse(job_context["success"])
        self.assertEqual(
            processor_job.failure_reason,
            ("Shell call to rsem-prepare-reference failed because: "
             "Error: something went wrong."))
        self.assertFalse(os.path.isfile(batch.files[0].get_temp_pre_path()))
Example 26
    def test_zip_index_failure(self):
        # Initialize test objects
        batch, gtf_file, _ = init_objects()
        processor_job = ProcessorJob.create_job_and_relationships(
            batches=[batch])

        # Mock out the job_context with everything the function under
        # test will expect
        job_context = utils.start_job({
            "job": processor_job,
            "job_id": processor_job.id,
            "gtf_file": gtf_file,
            "output_dir": "missing/index"
        })
        job_context = transcriptome_index._set_job_prefix(job_context)

        # The function being tested.
        job_context = transcriptome_index._zip_index(job_context)

        self.assertFalse(job_context["success"])
        self.assertEqual(
            processor_job.failure_reason,
            ("Exception caught while zipping index directory /home/user"
             "/data_store/temp/EnsemblPlants/TRANSCRIPTOME_INDEX/{}"
             "/aegilops_tauschii_short.tar.gz").format(
                 job_context["job_dir_prefix"]))
        self.assertFalse(os.path.isfile(batch.files[0].get_temp_pre_path()))
Example 27
    def test_success(self):
        batch = init_objects()
        file = batch.files[0]
        batch.platform_accession_code = "TEST"
        batch.save()
        file.internal_location = "TEST/AFFY_TO_PCL"
        file.name = "GSM1426186_UC_colon_inactive_201.CEL"
        file.save()

        file.remove_temp_directory = MagicMock()

        processor_job = ProcessorJob.create_job_and_relationships(batches=[batch])

        # We have a test file in the repo, but it needs to be in the
        # correct location which depends on the ID of the Batch, which
        # changes based on the order tests are run in.
        test_file_path = "/home/user/data_store/temp/TEST/AFFY_TO_PCL/" + file.name
        input_file_path = file.get_temp_pre_path()
        os.makedirs(file.get_temp_dir(), exist_ok=True)
        shutil.copyfile(test_file_path, input_file_path)

        job_context = {"job_id": processor_job.id,
                       "job": processor_job,
                       "input_file_path": input_file_path}

        job_context = array_express._determine_brainarray_package(job_context)

        self.assertEqual(job_context["brainarray_package"], "hugene10sthsentrezgprobe")
        file.remove_temp_directory.assert_not_called()

        # Clean up the copied file
        os.remove(input_file_path)
Example 28
    def test_run_salmon_failure(self):
        batch, first_fastq_file, second_fastq_file = init_objects()
        processor_job = ProcessorJob.create_job_and_relationships(
            batches=[batch])

        # Mock out the job_context with everything the function under
        # test will expect
        input_file_path_1 = first_fastq_file.get_temp_pre_path("dummy")
        input_file_path_2 = second_fastq_file.get_temp_pre_path("dummy")
        job_context = utils.start_job({
            "job": processor_job,
            "job_id": processor_job.id,
            "job_dir_prefix": "dummy",
            "batches": [batch],
            "index_directory": "missing",
            "input_file_path": input_file_path_1,
            "input_file_path_2": input_file_path_2,
            "output_directory": "blah"
        })

        # The function being tested.
        job_context = salmon._run_salmon(job_context)

        self.assertFalse(job_context["success"])
        self.assertNotEqual(processor_job.failure_reason, None)
        self.assertFalse(os.path.isfile(batch.files[0].get_temp_pre_path()))
Example 29
    def test_failure(self):
        batch = init_objects()
        batch.platform_accession_code = "TEST2"
        batch.save()
        file = batch.files[0]
        file.internal_location = "TEST2/AFFY_TO_PCL"
        file.name = "dummy"
        file.save()

        processor_job = ProcessorJob.create_job_and_relationships(batches=[batch])

        output_file_path = file.get_processed_path()
        job_context = {"job_id": processor_job.id,
                       "job": processor_job,
                       "batches": [batch]}

        # If output_file exists, remove it first.
        if os.path.isfile(output_file_path):
            os.remove(output_file_path)

        job_context = no_op._no_op_processor_fn(job_context)

        # success is only populated by this function on an error
        self.assertFalse(job_context["success"])
        self.assertFalse(os.path.isfile(output_file_path))
        self.assertEqual(processor_job.failure_reason,
                         "Exception caught while moving file dummy")
Example 30
    # Note: `mock_file_objects` is injected by a @patch decorator that is
    # not shown in this excerpt.
    def test_success(self, mock_file_objects):
        batch = init_objects()

        # Prevent the test file from getting removed.
        batch.files[0].remove_raw_files = MagicMock()

        processor_job = ProcessorJob.create_job_and_relationships(batches=[batch])

        output_file_path = batch.files[0].get_processed_path()
        job_context = {"job_id": processor_job.id,
                       "job": processor_job,
                       "batches": [batch]}

        # If output_file exists, remove it first.
        if os.path.isfile(output_file_path):
            os.remove(output_file_path)

        job_context = no_op._no_op_processor_fn(job_context)

        # success is only populated by this function on an error
        self.assertTrue(job_context["success"])
        self.assertTrue(os.path.isfile(output_file_path))

        # Clean up the processed file
        os.remove(output_file_path)