Example #1
    def _generate_files(self, species: Dict) -> List[OriginalFile]:
        url_builder = ensembl_url_builder_factory(species)
        fasta_download_url = url_builder.build_transcriptome_url()
        gtf_download_url = url_builder.build_gtf_url()

        # Getting the object will ensure it is created in the DB.
        Organism.get_or_create_object_for_id(url_builder.taxonomy_id)

        all_new_files = []

        fasta_filename = url_builder.filename_species + ".fa.gz"
        original_file = OriginalFile()
        original_file.source_filename = fasta_filename
        original_file.source_url = fasta_download_url
        original_file.is_archive = True
        original_file.is_downloaded = False
        original_file.save()
        all_new_files.append(original_file)

        gtf_filename = url_builder.filename_species + ".gtf.gz"
        original_file = OriginalFile()
        original_file.source_filename = gtf_filename
        original_file.source_url = gtf_download_url
        original_file.is_archive = True
        original_file.is_downloaded = False
        original_file.save()
        all_new_files.append(original_file)

        return all_new_files
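
A note on the Organism.get_or_create_object_for_id call above: fetching the object is what guarantees the row exists in the DB. Below is a minimal sketch of that idiom, assuming Organism is a Django model with a unique taxonomy_id field; the _name_for_taxonomy_id helper is hypothetical, not taken from the source.

def get_or_create_object_for_id(taxonomy_id: int) -> Organism:
    # Django's get_or_create returns (object, created): the lookup inserts
    # the organism when it is missing, which is all the surveyor needs here.
    organism, _created = Organism.objects.get_or_create(
        taxonomy_id=taxonomy_id,
        defaults={"name": _name_for_taxonomy_id(taxonomy_id)},  # hypothetical helper
    )
    return organism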
Example #2
    def test_processor_and_organism_in_sample(self):
        sample = Sample.objects.create(accession_code="ACCESSION", title="fake sample")
        homo_sapiens = Organism(name="HOMO_SAPIENS", taxonomy_id=9606, is_scientific_name=True)
        homo_sapiens.save()
        transcriptome_result = ComputationalResult.objects.create()
        organism_index = OrganismIndex.objects.create(
            organism=homo_sapiens, result=transcriptome_result, index_type="TRANSCRIPTOME_LONG"
        )
        result = ComputationalResult.objects.create(
            processor=self.salmon_quant_proc, organism_index=organism_index
        )
        SampleResultAssociation.objects.create(sample=sample, result=result)

        response = self.client.get(
            reverse(
                "samples_detail",
                kwargs={"accession_code": sample.accession_code, "version": API_VERSION},
            )
        )
        self.assertEqual(response.status_code, status.HTTP_200_OK)

        processor = response.json()["results"][0]["processor"]
        self.assertEqual(processor["name"], self.salmon_quant_proc.name)
        self.assertEqual(
            processor["environment"]["os_pkg"]["python3"],
            self.salmon_quant_proc.environment["os_pkg"]["python3"],
        )

        organism_index = response.json()["results"][0]["organism_index"]
        self.assertEqual(organism_index["result_id"], transcriptome_result.id)
        self.assertEqual(organism_index["index_type"], "TRANSCRIPTOME_LONG")
Example #3
    def test_illumina_id_ref_column_with_whitespace(self):
        """This test case tests the issue brought up in
        https://github.com/alexslemonade/refinebio/issues/1560
        where an ID_REF column would not be detected because the column name had a trailing space
        """

        organism = Organism(name="HOMO_SAPIENS",
                            taxonomy_id=9606,
                            is_scientific_name=True)
        organism.save()

        pj = prepare_illumina_job({
            "source_filename": "ftp://ftp.ncbi.nlm.nih.gov/geo/series/GSE100nnn/GSE100301/suppl/GSE100301%5Fnon%2Dnormalized%2Etxt%2Egz",
            "filename": "GSE100301_non-normalized.txt",
            "absolute_file_path": "/home/user/data_store/raw/TEST/ILLUMINA/GSE100301_non-normalized.txt",
            "organism": organism,
            "samples": [
                (
                    "GSM2677583",
                    "22Rv1-tetO-Gal4, replicate 1",
                    {"description": ["SAMPLE 1"]},
                ),
            ],
        })

        final_context = illumina.illumina_to_pcl(pj.pk, cleanup=False)
        self.assertSucceeded(pj)
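
The trailing-whitespace bug this test guards against is easy to see in isolation. Here is a toy, whitespace-tolerant header matcher (an illustration of the fix's idea, not refinebio's actual detection code):

def find_id_ref_column(headers):
    # "ID_REF " (note the trailing space) must still match, so strip each
    # header before comparing instead of testing raw equality.
    for index, header in enumerate(headers):
        if header.strip().upper() == "ID_REF":
            return index
    raise ValueError("No ID_REF column found")

assert find_id_ref_column(["ID_REF ", "VALUE", "Detection Pval"]) == 0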
Example #4
    def setUp(self):
        # Insert human organism into the database so the model doesn't call the
        # taxonomy API to populate it.
        organism = Organism(name="HOMO_SAPIENS",
                            taxonomy_id=9606,
                            is_scientific_name=True)
        organism.save()
Example #5
    def test_illumina_no_pvalue(self):
        """This experiment should fail because it has no p-value columns, so
        make sure it fails at that stage of the processing"""
        organism = Organism(name="HOMO_SAPIENS",
                            taxonomy_id=9606,
                            is_scientific_name=True)
        organism.save()

        pj = prepare_illumina_job({
            "source_filename": "ftp://ftp.ncbi.nlm.nih.gov/geo/series/GSE41nnn/GSE41355/suppl/GSE41355%5Fnon%2Dnormalized%2Etxt%2Egz",
            "filename": "GSE41355_non-normalized.txt",
            "absolute_file_path": "/home/user/data_store/raw/TEST/ILLUMINA/GSE41355_non-normalized.txt",
            "organism": organism,
            "samples": [
                ("GSM1015436", "IRF3/7 DKO 2"),
            ],
        })

        final_context = illumina.illumina_to_pcl(pj.pk, cleanup=False)

        self.assertFailed(pj, "Could not detect PValue column!")
Example #6
    def test_illumina_rows_starting_with_whitespace(self):
        organism = Organism(name="HOMO_SAPIENS",
                            taxonomy_id=9606,
                            is_scientific_name=True)
        organism.save()

        pj = prepare_illumina_job({
            "source_filename": "ftp://ftp.ncbi.nlm.nih.gov/geo/series/GSE112nnn/GSE112517/suppl/GSE112517_non-normalized.txt.gz",
            "filename": "GSE112517_non-normalized.txt",
            "absolute_file_path": "/home/user/data_store/raw/TEST/ILLUMINA/GSE112517_non-normalized.txt",
            "organism": organism,
            "samples": [
                (
                    "GSM3071991",
                    "MCF-7 KLHDC7B siRNA knockdown control",
                    {"description": ["SAMPLE 1"]},
                ),
                (
                    "GSM3071992",
                    "MCF-7 KLHDC7B siRNA knockdown",
                    {"description": ["SAMPLE 2"]},
                ),
            ],
        })

        final_context = illumina.illumina_to_pcl(pj.pk, cleanup=False)
        self.assertSucceeded(pj)
Example #7
    def test_calls_survey(self, mock_get):
        """If source_type is supported calls the appropriate survey method."""
        mock_get.side_effect = mocked_requests_get

        # Prevent a call being made to NCBI's API to determine
        # organism name/id.
        organism = Organism(name="H**O SAPIENS", taxonomy_id=9606, is_scientific_name=True)
        organism.save()

        survey_job = SurveyJob(source_type="ARRAY_EXPRESS")
        survey_job.save()
        key_value_pair = SurveyJobKeyValue(survey_job=survey_job,
                                           key="experiment_accession_code",
                                           value="E-GEOD-22166")
        key_value_pair.save()

        surveyor.run_job(survey_job)
        logger.info("Started Survey Job %d, waiting for it to complete.", survey_job.id)
        survey_job = wait_for_job(survey_job, SurveyJob)
        self.assertTrue(survey_job.success)

        batch = Batch.objects.filter(survey_job=survey_job).get()

        downloader_job = batch.downloaderjob_set.get()
        logger.info("Survey Job finished, waiting for Downloader Job %d to complete.",
                    downloader_job.id)
        downloader_job = wait_for_job(downloader_job, DownloaderJob)
        self.assertTrue(downloader_job.success)

        processor_job = batch.processorjob_set.get()
        logger.info("Downloader Job finished, waiting for processor Job %d to complete.",
                    processor_job.id)
        processor_job = wait_for_job(processor_job, ProcessorJob)
        self.assertTrue(processor_job.success)
Example #8
    def test_illumina_to_pcl(self):
        """Most basic Illumina to PCL test"""

        organism = Organism(name="HOMO_SAPIENS",
                            taxonomy_id=9606,
                            is_scientific_name=True)
        organism.save()

        job = prepare_illumina_job({**GSE22427, "organism": organism})

        # Remove the title of one of the samples to make sure that we can still
        # find its detection column using the description given as an annotation
        sample = Sample.objects.get(title="LV-T350A&si-EZH2-3")
        sample.title = "ignoreme_for_description"
        sample.accession_code = "ignoreme_for_description"
        sample.save()

        final_context = illumina.illumina_to_pcl(job.pk, cleanup=False)
        self.assertSucceeded(job)

        for sample in final_context["samples"]:
            smashme = sample.get_most_recent_smashable_result_file()
            self.assertTrue(os.path.exists(smashme.absolute_file_path))
            os.remove(smashme.absolute_file_path)

        # Clean up after the job, since it won't clean up after itself
        # when we aren't running in the cloud.
        shutil.rmtree(final_context["work_dir"], ignore_errors=True)
Example #9
    def setUp(self):
        self.homo_sapiens = Organism(name="HOMO_SAPIENS",
                                     taxonomy_id=9606,
                                     is_scientific_name=True)
        self.homo_sapiens.save()
        self.danio_rerio = Organism(name="DANIO_RERIO",
                                    taxonomy_id=1337,
                                    is_scientific_name=True)
        self.danio_rerio.save()
Example #10
    def test_convert_processed_illumina(self):
        job = ProcessorJob()
        job.pipeline_applied = "NO_OP"
        job.save()

        # ex:
        # Reporter Identifier VALUE   Detection Pval
        # ILMN_1343291    14.943602   0
        # ILMN_1343295    13.528082   0
        og_file = OriginalFile()
        og_file.source_filename = "https://www.ebi.ac.uk/arrayexpress/experiments/E-GEOD-22433/"
        og_file.filename = "GSM557500_sample_table.txt"
        og_file.absolute_file_path = (
            "/home/user/data_store/raw/TEST/NO_OP/GSM557500_sample_table.txt")
        og_file.is_downloaded = True
        og_file.save()

        homo_sapiens = Organism(name="HOMO_SAPIENS",
                                taxonomy_id=9606,
                                is_scientific_name=True)
        homo_sapiens.save()

        sample = Sample()
        sample.accession_code = "GSM557500"
        sample.title = "GSM557500"
        sample.platform_accession_code = "A-MEXP-1171"
        sample.manufacturer = "ILLUMINA"
        sample.organism = homo_sapiens
        sample.save()

        assoc = OriginalFileSampleAssociation()
        assoc.original_file = og_file
        assoc.sample = sample
        assoc.save()

        assoc1 = ProcessorJobOriginalFileAssociation()
        assoc1.original_file = og_file
        assoc1.processor_job = job
        assoc1.save()

        # To:
        # ENSG00000156508 14.943602
        # ENSG00000111640 13.528082
        final_context = no_op.no_op_processor(job.pk)
        self.assertTrue(final_context["success"])
        self.assertTrue(os.path.exists(final_context["output_file_path"]))
        self.assertEqual(os.path.getsize(final_context["output_file_path"]), 920374)
        self.assertTrue(no_op.check_output_quality(final_context["output_file_path"]))
Example #11
    def test_convert_illumina_no_header(self):
        job = ProcessorJob()
        job.pipeline_applied = "NO_OP"
        job.save()

        # ex:
        # ILMN_1885639    10.0000 0.7931
        # ILMN_2209417    10.0000 0.2029
        # ILMN_1765401    152.0873    0.0000
        og_file = OriginalFile()
        og_file.source_filename = (
            "https://github.com/AlexsLemonade/refinebio/files/2255178/GSM1089291-tbl-1.txt"
        )
        og_file.filename = "GSM1089291-tbl-1.txt"
        og_file.absolute_file_path = "/home/user/data_store/raw/TEST/NO_OP/GSM1089291-tbl-1.txt"
        og_file.is_downloaded = True
        og_file.save()

        homo_sapiens = Organism(name="HOMO_SAPIENS",
                                taxonomy_id=9606,
                                is_scientific_name=True)
        homo_sapiens.save()

        sample = Sample()
        sample.accession_code = "GSM557500"
        sample.title = "GSM557500"
        sample.platform_accession_code = "A-MEXP-1171"
        sample.manufacturer = "ILLUMINA"
        sample.organism = homo_sapiens
        sample.save()

        assoc = OriginalFileSampleAssociation()
        assoc.original_file = og_file
        assoc.sample = sample
        assoc.save()

        assoc1 = ProcessorJobOriginalFileAssociation()
        assoc1.original_file = og_file
        assoc1.processor_job = job
        assoc1.save()

        # To:
        # ENSG00000105675 10
        # ENSG00000085721 152.0873
        # ENSG00000278494 152.0873
        final_context = no_op.no_op_processor(job.pk)
        self.assertTrue(final_context["success"])
        self.assertTrue(os.path.exists(final_context["output_file_path"]))
        self.assertEqual(os.path.getsize(final_context["output_file_path"]), 786207)
Example #12
    def test_good_detection(self):
        """GSE54661 appears to be mislabled (illuminaHumanv4) on GEO. Shows our detector works. """
        from data_refinery_workers.processors import illumina

        pj = ProcessorJob()
        pj.pipeline_applied = "ILLUMINA_TO_PCL"
        pj.save()

        og_file = OriginalFile()
        og_file.source_filename = "ftp://ftp.ncbi.nlm.nih.gov/geo/series/GSE54nnn/GSE54661/suppl/GSE54661%5Fnon%5Fnormalized%2Etxt%2Egz"
        og_file.filename = "GSE54661_non_normalized.txt"
        og_file.absolute_file_path = (
            "/home/user/data_store/raw/TEST/ILLUMINA/GSE54661_non_normalized.txt"
        )
        og_file.is_downloaded = True
        og_file.save()

        assoc1 = ProcessorJobOriginalFileAssociation()
        assoc1.original_file = og_file
        assoc1.processor_job = pj
        assoc1.save()

        organism = Organism(name="HOMO_SAPIENS",
                            taxonomy_id=9606,
                            is_scientific_name=True)
        organism.save()

        sample = Sample()
        sample.accession_code = "ABCD-1234"
        sample.title = "hypoxia_Signal"
        sample.organism = organism
        sample.save()

        sample_assoc = OriginalFileSampleAssociation()
        sample_assoc.original_file = og_file
        sample_assoc.sample = sample
        sample_assoc.save()

        final_context = illumina.illumina_to_pcl(pj.pk)
        self.assertEqual(final_context["platform"], "illuminaHumanv3")

        for key in final_context["samples"][0].sampleannotation_set.all(
        )[0].data.keys():
            self.assertTrue(key in [
                "detected_platform", "detection_percentage",
                "mapped_percentage"
            ])

        # Clean up after the job, since it won't clean up after itself
        # when we aren't running in the cloud.
        shutil.rmtree(final_context["work_dir"], ignore_errors=True)
Example #13
    def test_bad_illumina_detection(self):
        """With the wrong species, this will fail the platform detection threshold."""

        organism = Organism(name="RATTUS_NORVEGICUS",
                            taxonomy_id=9606,
                            is_scientific_name=True)
        organism.save()

        job = prepare_illumina_job({**GSE22427, "organism": organism})
        final_context = illumina.illumina_to_pcl(job.pk, cleanup=False)
        self.assertTrue(final_context["abort"])

        # Clean up after the job, since it won't clean up after itself
        # when we aren't running in the cloud.
        shutil.rmtree(final_context["work_dir"], ignore_errors=True)
Example #14
    def test_convert_illumina_bad_cols(self):
        """
        In the future, this test may be deprecated. For now it just alerts us that it needs attention.
        """
        job = ProcessorJob()
        job.pipeline_applied = "NO_OP"
        job.save()

        # ex:
        # ILMN_1885639    10.0000 0.7931  11.0000 0.123
        # ILMN_2209417    10.0000 0.2029  11.1234 0.543
        # LMN_1765401    152.0873    0.0000  99.999  0.19
        og_file = OriginalFile()
        og_file.source_filename = (
            "https://github.com/AlexsLemonade/refinebio/files/2255178/GSM1089291-tbl-1-modified.txt"
        )
        og_file.filename = "GSM1089291-tbl-1-modified.txt"
        og_file.absolute_file_path = (
            "/home/user/data_store/raw/TEST/NO_OP/GSM1089291-tbl-1-modified.txt"
        )
        og_file.is_downloaded = True
        og_file.save()

        homo_sapiens = Organism(name="HOMO_SAPIENS",
                                taxonomy_id=9606,
                                is_scientific_name=True)
        homo_sapiens.save()

        sample = Sample()
        sample.accession_code = "GSM557500"
        sample.title = "GSM557500"
        sample.platform_accession_code = "A-MEXP-1171"
        sample.manufacturer = "ILLUMINA"
        sample.organism = homo_sapiens
        sample.save()

        assoc = OriginalFileSampleAssociation()
        assoc.original_file = og_file
        assoc.sample = sample
        assoc.save()

        assoc1 = ProcessorJobOriginalFileAssociation()
        assoc1.original_file = og_file
        assoc1.processor_job = job
        assoc1.save()

        final_context = no_op.no_op_processor(job.pk)
        self.assertFalse(final_context["success"])
        self.assertTrue("Tell Rich!" in final_context["job"].failure_reason)
Example #15
    def setUp(self):
        survey_job = SurveyJob(source_type="SRA")
        survey_job.save()
        self.survey_job = survey_job

        key_value_pair = SurveyJobKeyValue(survey_job=survey_job,
                                           key="experiment_accession_code",
                                           value="DRR002116")
        key_value_pair.save()

        # Insert the organism into the database so the model doesn't call the
        # taxonomy API to populate it.
        organism = Organism(name="HOMO_SAPIENS",
                            taxonomy_id=9606,
                            is_scientific_name=True)
        organism.save()
Example #16
def prepare_organism_indices():
    c_elegans = Organism.get_object_for_name("CAENORHABDITIS_ELEGANS")

    # This is a lie, but this image doesn't have the dependencies for TRANSCRIPTOME_INDEX
    computational_result_short = ComputationalResult(processor=utils.find_processor('SALMON_QUANT'))
    computational_result_short.save()

    organism_index = OrganismIndex()
    organism_index.index_type = "TRANSCRIPTOME_SHORT"
    organism_index.organism = c_elegans
    organism_index.result = computational_result_short
    organism_index.absolute_directory_path = "/home/user/data_store/salmon_tests/TRANSCRIPTOME_INDEX/SHORT"
    organism_index.save()

    comp_file = ComputedFile()
    # This path will not be used because we already have the files extracted.
    comp_file.absolute_file_path = "/home/user/data_store/salmon_tests/TRANSCRIPTOME_INDEX/SHORT/celgans_short.tar.gz"
    comp_file.result = computational_result_short
    comp_file.size_in_bytes = 1337
    comp_file.sha1 = "ABC"
    comp_file.save()

    # This is a lie, but this image doesn't have the dependencies for TX_IMPORT
    computational_result_long = ComputationalResult(processor=utils.find_processor('SALMON_QUANT'))
    computational_result_long.save()
Example #17
def _prepare_input(job_context: Dict) -> Dict:
    start_time = log_state("prepare input", job_context["job"].id)

    # We'll store here all sample accession codes that didn't make it into the compendia
    # with the reason why not.
    job_context["filtered_samples"] = {}

    job_context = smashing_utils.prepare_files(job_context)

    # Compendia jobs only run for one organism, so we know the only
    # key will be the organism name, unless of course we've already failed.
    if job_context["job"].success is not False:
        job_context["organism_name"] = job_context["group_by_keys"][0]

        # TEMPORARY for iterating on compendia more quickly. Rather
        # than downloading the data from S3 each run we're just gonna
        # use the same directory every job.
        job_context["old_work_dir"] = job_context["work_dir"]
        job_context[
            "work_dir"] = SMASHING_DIR + job_context["organism_name"] + "/"
        if not os.path.exists(job_context["work_dir"]):
            os.makedirs(job_context["work_dir"])

    job_context["organism_object"] = Organism.get_object_for_name(
        job_context["organism_name"])
    job_context["compendium_version"] = (CompendiumResult.objects.filter(
        primary_organism=job_context["organism_object"],
        quant_sf_only=False).count() + 1)

    job_context["all_organisms"] = job_context["samples"].keys()
    all_samples = list(itertools.chain(*job_context["samples"].values()))
    job_context["samples"] = {job_context["organism_name"]: all_samples}

    log_state("prepare input done", job_context["job"].id, start_time)
    return job_context
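
The tail of _prepare_input flattens the per-organism sample mapping into a single group keyed by the compendium's organism. The same reshaping, run standalone on toy values:

import itertools

samples = {"HOMO_SAPIENS": ["s1", "s2"], "DANIO_RERIO": ["s3"]}
organism_name = "HOMO_SAPIENS"

# Record every organism, then merge all samples under the one organism key.
all_organisms = samples.keys()
all_samples = list(itertools.chain(*samples.values()))
samples = {organism_name: all_samples}

assert samples == {"HOMO_SAPIENS": ["s1", "s2", "s3"]}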
Example #18
def prepare_job():
    pj = ProcessorJob()
    pj.pipeline_applied = "AFFY_TO_PCL"
    pj.save()

    original_file = OriginalFile()
    original_file.source_filename = "ftp://ftp.ebi.ac.uk/pub/databases/microarray/data/experiment/GEOD/E-GEOD-59071/E-GEOD-59071.raw.3.zip"
    original_file.filename = "GSM1426071_CD_colon_active_1.CEL"
    original_file.absolute_file_path = "/home/user/data_store/raw/TEST/CEL/GSM1426071_CD_colon_active_1.CEL"
    original_file.save()

    assoc1 = ProcessorJobOriginalFileAssociation()
    assoc1.original_file = original_file
    assoc1.processor_job = pj
    assoc1.save()

    c_elegans = Organism.get_object_for_name("CAENORHABDITIS_ELEGANS")

    sample = Sample()
    sample.title = "Heyo"
    sample.organism = c_elegans
    sample.is_processed = False
    sample.save()

    ogsa = OriginalFileSampleAssociation()
    ogsa.sample = sample
    ogsa.original_file = original_file
    ogsa.save()

    return pj
Example #19
def get_organism_with_qn_target():
    result = ComputationalResult()
    result.save()

    qn_target = ComputedFile()
    qn_target.filename = "danio_target.tsv"
    qn_target.absolute_file_path = "/home/user/data_store/QN/danio_target.tsv"
    qn_target.is_qn_target = True
    qn_target.size_in_bytes = "12345"
    qn_target.sha1 = "aabbccddeeff"
    qn_target.result = result
    qn_target.save()

    danio_rerio = Organism(name="DANIO_RERIO", taxonomy_id=1, qn_target=result)
    danio_rerio.save()
    return danio_rerio
Example #20
    def test_single_read(self):
        """Test outputs when the sample has one read only."""
        job_context = {
            'job_id': 456,
            'job': ProcessorJob(),
            'pipeline': Pipeline(name="Salmon"),
            'input_file_path': self.test_dir + 'single_input/single_read.fastq',
            'output_directory': self.test_dir + 'single_output/',
            'salmontools_directory': self.test_dir + 'single_salmontools/',
            'salmontools_archive': self.test_dir + 'salmontools-result.tar.gz',
            'computed_files': []
        }
        os.makedirs(job_context["salmontools_directory"], exist_ok=True)

        homo_sapiens = Organism.get_object_for_name("HOMO_SAPIENS")
        sample = Sample()
        sample.organism = homo_sapiens
        sample.save()
        job_context["sample"] = sample

        salmon._run_salmontools(job_context)

        # Confirm job status
        self.assertTrue(job_context["success"])

        # Unpack result for checking
        os.system('gunzip ' + job_context['salmontools_directory'] + "*.gz")

        # Check output file
        output_file = job_context['salmontools_directory'] + 'unmapped_by_salmon.fa'
        expected_output_file = self.test_dir + 'expected_single_output/unmapped_by_salmon.fa'
        self.assertTrue(identical_checksum(output_file, expected_output_file))
Example #21
    def test_salmontools_with_bad_processor(self):
        """Test salmontools with a bad processor key."""
        test_dir = '/home/user/data_store/salmontools/'
        job_context = {
            'job_id': 123,
            'job': ProcessorJob.objects.create(),
            'pipeline': Pipeline(name="Salmon"),
            'input_file_path': test_dir + 'double_input/reads_1.fastq',
            'input_file_path_2': test_dir + 'double_input/reads_2.fastq',
            'salmontools_directory': test_dir + 'double_salmontools/',
            'salmontools_archive': test_dir + 'salmontools-result.tar.gz',
            'output_directory': test_dir + 'double_output/'
        }
        os.makedirs(job_context["salmontools_directory"], exist_ok=True)
        homo_sapiens = Organism.get_object_for_name("HOMO_SAPIENS")
        sample = Sample()
        sample.organism = homo_sapiens
        sample.save()
        job_context["sample"] = sample

        # Set the wrong yml filename on purpose to mess up Salmontools processor
        original_yml_file = utils.ProcessorEnum['SALMONTOOLS'].value['yml_file']
        utils.ProcessorEnum['SALMONTOOLS'].value['yml_file'] = 'foobar.yml'

        salmon._run_salmontools(job_context)
        self.assertEqual(job_context["success"], False)
        self.assertTrue(job_context["job"].failure_reason.startswith('Failed to set processor:'))

        # Change yml filename back
        utils.ProcessorEnum['SALMONTOOLS'].value['yml_file'] = original_yml_file
Example #22
def prepare_dotsra_job(filename="ERR1562482.sra"):
    pj = ProcessorJob()
    pj.pipeline_applied = "SALMON"
    pj.id = random.randint(111, 999999)
    pj.save()

    c_elegans = Organism.get_object_for_name("CAENORHABDITIS_ELEGANS")

    samp = Sample()
    samp.accession_code = "SALMON" # So the test files go to the right place
    samp.organism = c_elegans
    samp.save()

    prepare_organism_indices()

    og_file = OriginalFile()
    og_file.source_filename = filename
    og_file.filename = filename
    og_file.absolute_file_path = "/home/user/data_store/raw/TEST/SALMON/" + filename
    og_file.save()

    og_file_samp_assoc = OriginalFileSampleAssociation()
    og_file_samp_assoc.original_file = og_file
    og_file_samp_assoc.sample = samp
    og_file_samp_assoc.save()

    assoc1 = ProcessorJobOriginalFileAssociation()
    assoc1.original_file = og_file
    assoc1.processor_job = pj
    assoc1.save()

    return pj, [og_file]
Example #23
    def test_survey(self, mock_get, mock_urlopen, mock_send_job):
        json_file_path = os.path.join(os.path.dirname(__file__),
                                      "test_transcriptome_species.json")
        with open(json_file_path, "r") as json_file:
            species_json = json.load(json_file)

        # Insert the organisms into the database so the model doesn't call the
        # taxonomy API to populate them.
        for species in species_json:
            # Account for the subtle difference between the API for
            # the main Ensembl division and the API for the rest of
            # them.
            name_key = "common_name" if "common_name" in species else "name"
            taxonomy_key = "taxonomy_id" if "taxonomy_id" in species else "taxon_id"
            organism = Organism(name=species[name_key].upper(),
                                taxonomy_id=species[taxonomy_key],
                                is_scientific_name=True)
            organism.save()

        mock_get.return_value = Mock(ok=True)
        mock_get.return_value.json.return_value = species_json

        # There are two possible file locations. The correct one is
        # determined by making a request to one to see if it
        # exists. This URLError simulates it not existing.
        mock_urlopen.side_effect = URLError("404 or something")

        surveyor = TranscriptomeIndexSurveyor(self.survey_job)
        surveyor.survey()

        downloader_jobs = DownloaderJob.objects.order_by("id").all()
        self.assertEqual(downloader_jobs.count(), len(species_json))
        send_job_calls = []
        for downloader_job in downloader_jobs:
            send_job_calls.append(
                call(Downloaders.TRANSCRIPTOME_INDEX, downloader_job.id))

        mock_send_job.assert_has_calls(send_job_calls)

        # There should be 2 Batches for each species (long and short
        # transcriptome lengths).
        batches = Batch.objects.all()
        self.assertEqual(batches.count(), len(species_json) * 2)
        # And each batch has two files: fasta and gtf
        for batch in batches:
            self.assertEqual(len(batch.files), 2)
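
The name_key/taxonomy_key juggling in the test above exists because the main Ensembl division and the other divisions name their fields differently. Toy records (field shapes inferred from the comment, not copied from the fixture) show how the lookup adapts:

main_division_species = {"common_name": "human", "taxonomy_id": 9606}
other_division_species = {"name": "caenorhabditis_elegans", "taxon_id": 6239}

for species in (main_division_species, other_division_species):
    name_key = "common_name" if "common_name" in species else "name"
    taxonomy_key = "taxonomy_id" if "taxonomy_id" in species else "taxon_id"
    print(species[name_key].upper(), species[taxonomy_key])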
Example #24
    def test_unmated_reads(self):
        """Survey, download, then process a sample we know is SRA and has unmated reads.

        This test uses VCR to remove the dependence upon NCBI's
        servers, but the downloader job hits ENA's FTP and aspera
        servers. Unfortunately there's not much that can be done to
        avoid that behavior from here because the downloader jobs
        always check ENA's FTP server to see if the file has an
        unmated read. For now we'll just have to be content with the
        fact that NCBI going down won't affect this test.
        """
        # Clear out pre-existing work dirs so there are no conflicts:
        self.env = EnvironmentVarGuard()
        self.env.set("RUNING_IN_CLOUD", "False")
        with self.env:
            for work_dir in glob.glob(LOCAL_ROOT_DIR + "/processor_job_*"):
                shutil.rmtree(work_dir)

            # prevent a call being made to NCBI's API to determine
            # organism name/id.
            organism = Organism(name="HOMO_SAPIENS",
                                taxonomy_id=9606,
                                is_scientific_name=True)
            organism.save()

            # Survey just a single run to make things faster!
            # This sample has unmated reads!
            survey_job = surveyor.survey_experiment("SRR1603661", "SRA")

            self.assertTrue(survey_job.success)

            # Let's give the downloader a little bit to get started
            # and to update the OriginalFiles' source_urls.
            time.sleep(60)

            downloader_jobs = DownloaderJob.objects.all()
            self.assertEqual(downloader_jobs.count(), 1)
            downloader_job = downloader_jobs.first()

            self.assertIsNotNone(downloader_job.start_time)

            for original_file in downloader_job.original_files.all():
                self.assertTrue(".fastq.gz" in original_file.source_url)

            # The downloader job will take a while to complete. Let's not wait.
            print(downloader_job.kill_nomad_job())
Example #25
    def test_salmon_quant_one_sample_double_reads(self):
        """Test `salmon quant` on a sample that has double reads."""
        # Set up organism index database objects.
        prepare_organism_indices()

        # Create an Experiment that includes two samples.
        # (The first sample has test data available, but the second does not.)
        experiment_accession = 'test_experiment'
        experiment = Experiment.objects.create(accession_code=experiment_accession)

        c_elegans = Organism.get_object_for_name("CAENORHABDITIS_ELEGANS")

        # test_sample record
        sample_accession = 'test_sample'
        test_sample = Sample.objects.create(accession_code=sample_accession,
                                            organism=c_elegans,
                                            source_database='SRA',
                                            technology='RNA-SEQ')
        ExperimentSampleAssociation.objects.create(experiment=experiment, sample=test_sample)
        # fake_sample record (created to prevent tximport step in this experiment)
        fake_sample = Sample.objects.create(accession_code='fake_sample',
                                            source_database='SRA',
                                            technology='RNA-SEQ')
        ExperimentSampleAssociation.objects.create(experiment=experiment, sample=fake_sample)

        experiment_dir = '/home/user/data_store/salmon_tests/test_experiment'

        og_read_1 = OriginalFile()
        og_read_1.absolute_file_path = os.path.join(experiment_dir, 'raw/reads_1.fastq')
        og_read_1.filename = "reads_1.fastq"
        og_read_1.save()

        OriginalFileSampleAssociation.objects.create(original_file=og_read_1, sample=test_sample)

        og_read_2 = OriginalFile()
        og_read_2.absolute_file_path = os.path.join(experiment_dir, "raw/reads_2.fastq")
        og_read_2.filename = "reads_1.fastq"
        og_read_2.save()

        OriginalFileSampleAssociation.objects.create(original_file=og_read_2, sample=test_sample)

        sample_dir = os.path.join(experiment_dir, 'test_sample')

        job_context = salmon._prepare_files({"job_dir_prefix": "TEST",
                                             "job_id": "TEST",
                                             "job": ProcessorJob(),
                                             'pipeline': Pipeline(name="Salmon"),
                                             'computed_files': [],
                                             "original_files": [og_read_1, og_read_2]})

        # Run salmon.
        self.check_salmon_quant(job_context, sample_dir)

        # Confirm that this experiment is not ready for tximport yet,
        # because `salmon quant` is not run on 'fake_sample'.
        experiments_ready = salmon.get_tximport_inputs(job_context)['tximport_inputs']
        self.assertEqual(len(experiments_ready), 0)
Example #26
class APITestCases(APITestCase):
    def setUp(self):
        self.homo_sapiens = Organism(name="HOMO_SAPIENS",
                                     taxonomy_id=9606,
                                     is_scientific_name=True)
        self.homo_sapiens.save()
        self.danio_rerio = Organism(name="DANIO_RERIO",
                                    taxonomy_id=1337,
                                    is_scientific_name=True)
        self.danio_rerio.save()

    def tearDown(self):
        Organism.objects.all().delete()

    def test_qn_endpoints(self):
        # create two qn endpoints

        result = ComputationalResult()
        result.commands.append("create_qn_target.py")
        result.is_ccdl = True
        result.is_public = True
        result.processor = None
        result.save()

        cra = ComputationalResultAnnotation()
        cra.result = result
        cra.data = {
            "organism_id": self.danio_rerio.id,  # Danio
            "is_qn": True,
            "platform_accession_code": "zebrafish",
            "samples": [],
            "geneset": str(["RWWJ000001", "RWWJ000002"]),
        }
        cra.save()
        cra = ComputationalResultAnnotation()
        cra.result = result
        cra.data = {
            "organism_id": self.homo_sapiens.id,  # IDK
            "is_qn": True,
            "platform_accession_code": "zebrafishplusone",
            "samples": [],
            "geneset": str(["RWWJ000003", "RWWJ000004"]),
        }
        cra.save()

        self.homo_sapiens.qn_target = result
        self.homo_sapiens.save()
        self.danio_rerio.qn_target = result
        self.danio_rerio.save()

        response = self.client.get(
            reverse("qn_targets_available", kwargs={"version": API_VERSION}))

        self.assertEqual(len(response.json()), 2)
Example #27
    def test_illumina_to_pcl(self):
        """ Most basic Illumina to PCL test """
        from data_refinery_workers.processors import illumina

        organism = Organism(name="HOMO_SAPIENS",
                            taxonomy_id=9606,
                            is_scientific_name=True)
        organism.save()

        job = prepare_illumina_job(organism)
        final_context = illumina.illumina_to_pcl(job.pk)

        for sample in final_context["samples"]:
            smashme = sample.get_most_recent_smashable_result_file()
            self.assertTrue(os.path.exists(smashme.absolute_file_path))
            os.remove(smashme.absolute_file_path)

        # Clean up after the job, since it won't clean up after itself
        # when we aren't running in the cloud.
        shutil.rmtree(final_context["work_dir"], ignore_errors=True)
Example #28
    def test_detect_columns(self):
        organism = Organism(name="HOMO_SAPIENS",
                            taxonomy_id=9606,
                            is_scientific_name=True)
        organism.save()

        job = prepare_illumina_job({**GSE22427, "organism": organism})

        pipeline = Pipeline(name=PipelineEnum.ILLUMINA.value)

        final_context = utils.run_pipeline(
            {
                "job_id": job.id,
                "pipeline": pipeline
            },
            [
                utils.start_job,
                illumina._prepare_files,
                illumina._detect_encoding,
                illumina._sanitize_input_file,
                illumina._convert_sanitized_to_tsv,
                illumina._detect_columns,
            ],
        )

        self.assertNotEqual(final_context.get("success"), False)

        # For this experiment, the probe ID is the first column
        self.assertEqual(final_context.get("probeId"), GSE22427_HEADER[0])

        expected_column_ids = ",".join(
            map(
                lambda t: str(t[0]),
                filter(
                    # For this header file, the samples all have the prefix LV-
                    lambda t: t[1].startswith("LV-"),
                    # We use start=1 here because the column IDs are formatted
                    # for R code so they treat the header as a 1-indexed list
                    enumerate(GSE22427_HEADER, start=1),
                ),
            ))
        self.assertEqual(final_context.get("columnIds"), expected_column_ids)
Example #29
def prepare_job(job_info: dict) -> ProcessorJob:
    job = ProcessorJob()
    job.pipeline_applied = "NO_OP"
    job.save()

    og_file = OriginalFile()
    og_file.source_filename = job_info["source_filename"]
    og_file.filename = job_info["filename"]
    og_file.absolute_file_path = job_info["absolute_file_path"]
    og_file.is_downloaded = True
    og_file.save()

    sample = Sample()
    sample.accession_code = job_info["accession_code"]
    sample.title = job_info["accession_code"]
    sample.platform_accession_code = job_info["platform_accession_code"]

    manufacturer = job_info.get("manufacturer", None)
    if manufacturer is not None:
        sample.manufacturer = manufacturer

    # The illumina samples need the human organism
    if manufacturer == "ILLUMINA":
        homo_sapiens = Organism(name="HOMO_SAPIENS",
                                taxonomy_id=9606,
                                is_scientific_name=True)
        homo_sapiens.save()
        sample.organism = homo_sapiens

    sample.save()

    assoc = OriginalFileSampleAssociation()
    assoc.original_file = og_file
    assoc.sample = sample
    assoc.save()

    assoc1 = ProcessorJobOriginalFileAssociation()
    assoc1.original_file = og_file
    assoc1.processor_job = job
    assoc1.save()

    return job
Example #30
    def test_illumina_quoted_row_names(self):
        organism = Organism(name="HOMO_SAPIENS",
                            taxonomy_id=9606,
                            is_scientific_name=True)
        organism.save()

        pj = prepare_illumina_job({
            "source_filename": "ftp://ftp.ncbi.nlm.nih.gov/geo/series/GSE33nnn/GSE33814/suppl/GSE33814%5Fnon%2Dnormalized%2Etxt%2Egz",
            # Some of the columns are trimmed to save space and time
            "filename": "GSE33814_trimmed_non-normalized.txt",
            "absolute_file_path": "/home/user/data_store/raw/TEST/ILLUMINA/GSE33814_trimmed_non-normalized.txt",
            "organism": organism,
            "samples": [
                ("GSM836222", "IMGUS_32"),
                ("GSM836223", "IMGUS_33"),
            ],
        })

        final_context = illumina.illumina_to_pcl(pj.pk, cleanup=False)
        self.assertSucceeded(pj)

        # Make sure that the row names are no longer quoted after sanitizing the file
        def assertNotQuoted(string: str):
            string = string.strip()
            self.assertNotEqual(string[0], '"')
            self.assertNotEqual(string[-1], '"')

        with open(final_context["sanitized_file_path"], "r") as f:
            reader = csv.reader(f, delimiter="\t")

            headers = next(reader)
            for header in headers:
                assertNotQuoted(header)

            # Also make sure the probe IDs aren't quoted
            first_row = next(reader)
            assertNotQuoted(first_row[0])
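
For reference, the quoting problem the sanitizer removes looks like this. A toy unquoting pass (an assumption about the general approach, not the actual sanitizer):

def unquote(field: str) -> str:
    # '"ILMN_1343291"' -> 'ILMN_1343291'; unquoted fields pass through.
    field = field.strip()
    if len(field) >= 2 and field[0] == '"' and field[-1] == '"':
        return field[1:-1]
    return field

assert unquote('"ILMN_1343291"') == "ILMN_1343291"
assert unquote("ILMN_1343291") == "ILMN_1343291"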