def _generate_files(self, species: Dict) -> list:
    """Create the OriginalFile records for one species' transcriptome data.

    Builds the FASTA and GTF download URLs for ``species`` and saves one
    archived, not-yet-downloaded ``OriginalFile`` for each of them.

    Args:
        species: Species description handed to
            ``ensembl_url_builder_factory``.

    Returns:
        The two saved ``OriginalFile`` objects: FASTA first, then GTF.
    """
    url_builder = ensembl_url_builder_factory(species)
    fasta_download_url = url_builder.build_transcriptome_url()
    gtf_download_url = url_builder.build_gtf_url()

    # Getting the object will ensure it is created in the DB.
    Organism.get_or_create_object_for_id(url_builder.taxonomy_id)

    # The two records only differ in their extension and source URL.
    downloads = (
        (".fa.gz", fasta_download_url),
        (".gtf.gz", gtf_download_url),
    )

    all_new_files = []
    for extension, download_url in downloads:
        original_file = OriginalFile()
        original_file.source_filename = url_builder.filename_species + extension
        original_file.source_url = download_url
        original_file.is_archive = True
        original_file.is_downloaded = False
        original_file.save()
        all_new_files.append(original_file)

    return all_new_files
def test_processor_and_organism_in_sample(self):
    """The sample detail endpoint reports the processor and organism index of a result."""
    sample = Sample.objects.create(accession_code="ACCESSION", title="fake sample")

    human = Organism(name="HOMO_SAPIENS", taxonomy_id=9606, is_scientific_name=True)
    human.save()

    # The index is backed by its own (transcriptome) result ...
    index_result = ComputationalResult.objects.create()
    index = OrganismIndex.objects.create(
        organism=human, result=index_result, index_type="TRANSCRIPTOME_LONG"
    )

    # ... while the quantification result references that index.
    quant_result = ComputationalResult.objects.create(
        processor=self.salmon_quant_proc, organism_index=index
    )
    SampleResultAssociation.objects.create(sample=sample, result=quant_result)

    detail_url = reverse(
        "samples_detail",
        kwargs={"accession_code": sample.accession_code, "version": API_VERSION},
    )
    response = self.client.get(detail_url)
    self.assertEqual(response.status_code, status.HTTP_200_OK)

    first_result = response.json()["results"][0]

    processor = first_result["processor"]
    self.assertEqual(processor["name"], self.salmon_quant_proc.name)
    self.assertEqual(
        processor["environment"]["os_pkg"]["python3"],
        self.salmon_quant_proc.environment["os_pkg"]["python3"],
    )

    reported_index = first_result["organism_index"]
    self.assertEqual(reported_index["result_id"], index_result.id)
    self.assertEqual(reported_index["index_type"], "TRANSCRIPTOME_LONG")
def test_illumina_id_ref_column_with_whitespace(self):
    """Regression test for an ID_REF column name with a trailing space.

    See https://github.com/alexslemonade/refinebio/issues/1560, where such
    a column would not be detected.
    """
    human = Organism(name="HOMO_SAPIENS", taxonomy_id=9606, is_scientific_name=True)
    human.save()

    job_info = {
        "source_filename": "ftp://ftp.ncbi.nlm.nih.gov/geo/series/GSE100nnn/GSE100301/suppl/GSE100301%5Fnon%2Dnormalized%2Etxt%2Egz",
        "filename": "GSE100301_non-normalized.txt",
        "absolute_file_path": "/home/user/data_store/raw/TEST/ILLUMINA/GSE100301_non-normalized.txt",
        "organism": human,
        "samples": [
            (
                "GSM2677583",
                "22Rv1-tetO-Gal4, replicate 1",
                {"description": ["SAMPLE 1"]},
            ),
        ],
    }
    pj = prepare_illumina_job(job_info)

    # Only the job's final state matters here, not the returned context.
    illumina.illumina_to_pcl(pj.pk, cleanup=False)
    self.assertSucceeded(pj)
def setUp(self):
    """Seed the database with the human organism.

    Having the row present up front keeps the model from calling the
    taxonomy API to populate it.
    """
    human = Organism(name="HOMO_SAPIENS", taxonomy_id=9606, is_scientific_name=True)
    human.save()
def test_illumina_no_pvalue(self):
    """A file without p-value columns must fail at the detection stage."""
    human = Organism(name="HOMO_SAPIENS", taxonomy_id=9606, is_scientific_name=True)
    human.save()

    job_info = {
        "source_filename": "ftp://ftp.ncbi.nlm.nih.gov/geo/series/GSE41nnn/GSE41355/suppl/GSE41355%5Fnon%2Dnormalized%2Etxt%2Egz",
        "filename": "GSE41355_non-normalized.txt",
        "absolute_file_path": "/home/user/data_store/raw/TEST/ILLUMINA/GSE41355_non-normalized.txt",
        "organism": human,
        "samples": [("GSM1015436", "IRF3/7 DKO 2")],
    }
    pj = prepare_illumina_job(job_info)

    # Only the failure recorded on the job is checked, not the context.
    illumina.illumina_to_pcl(pj.pk, cleanup=False)
    self.assertFailed(pj, "Could not detect PValue column!")
def test_illumina_rows_starting_with_whitespace(self):
    """The Illumina pipeline should succeed on the GSE112517 fixture."""
    human = Organism(name="HOMO_SAPIENS", taxonomy_id=9606, is_scientific_name=True)
    human.save()

    samples = [
        (
            "GSM3071991",
            "MCF-7 KLHDC7B siRNA knockdown control",
            {"description": ["SAMPLE 1"]},
        ),
        (
            "GSM3071992",
            "MCF-7 KLHDC7B siRNA knockdown",
            {"description": ["SAMPLE 2"]},
        ),
    ]
    job_info = {
        "source_filename": "ftp://ftp.ncbi.nlm.nih.gov/geo/series/GSE112nnn/GSE112517/suppl/GSE112517_non-normalized.txt.gz",
        "filename": "GSE112517_non-normalized.txt",
        "absolute_file_path": "/home/user/data_store/raw/TEST/ILLUMINA/GSE112517_non-normalized.txt",
        "organism": human,
        "samples": samples,
    }
    pj = prepare_illumina_job(job_info)

    # Only the job's final state matters here, not the returned context.
    illumina.illumina_to_pcl(pj.pk, cleanup=False)
    self.assertSucceeded(pj)
def test_calls_survey(self, mock_get):
    """If source_type is supported calls the appropriate survey method.

    Runs a survey job for one ArrayExpress experiment and then waits for
    the survey, downloader and processor jobs in turn, asserting that each
    one succeeded.
    """
    mock_get.side_effect = mocked_requests_get

    # Prevent a call being made to NCBI's API to determine
    # organism name/id.
    organism = Organism(name="HOMO SAPIENS", taxonomy_id=9606, is_scientific_name=True)
    organism.save()

    survey_job = SurveyJob(source_type="ARRAY_EXPRESS")
    survey_job.save()
    key_value_pair = SurveyJobKeyValue(
        survey_job=survey_job, key="experiment_accession_code", value="E-GEOD-22166"
    )
    key_value_pair.save()

    surveyor.run_job(survey_job)
    logger.info("Started Survey Job %d, waiting for it to complete.", survey_job.id)
    survey_job = wait_for_job(survey_job, SurveyJob)
    self.assertTrue(survey_job.success)

    # Look the batch up through the survey job; `.get()` raises unless
    # exactly one batch matches.
    batch = Batch.objects.filter(survey_job=survey_job).get()

    downloader_job = batch.downloaderjob_set.get()
    logger.info(
        "Survey Job finished, waiting for Downloader Job %d to complete.", downloader_job.id
    )
    downloader_job = wait_for_job(downloader_job, DownloaderJob)
    self.assertTrue(downloader_job.success)

    processor_job = batch.processorjob_set.get()
    logger.info(
        "Downloader Job finished, waiting for processor Job %d to complete.", processor_job.id
    )
    processor_job = wait_for_job(processor_job, ProcessorJob)
    self.assertTrue(processor_job.success)
def test_illumina_to_pcl(self):
    """Run the basic Illumina to PCL conversion on the GSE22427 fixture."""
    human = Organism(name="HOMO_SAPIENS", taxonomy_id=9606, is_scientific_name=True)
    human.save()

    job = prepare_illumina_job({**GSE22427, "organism": human})

    # Remove the title of one of the samples to make sure that we can still
    # find its detection column using the description given as an annotation
    untitled = Sample.objects.get(title="LV-T350A&si-EZH2-3")
    untitled.title = "ignoreme_for_description"
    untitled.accession_code = "ignoreme_for_description"
    untitled.save()

    final_context = illumina.illumina_to_pcl(job.pk, cleanup=False)
    self.assertSucceeded(job)

    # Every sample must have a smashable result file on disk; delete each
    # one once it has been checked.
    for processed_sample in final_context["samples"]:
        result_file = processed_sample.get_most_recent_smashable_result_file()
        self.assertTrue(os.path.exists(result_file.absolute_file_path))
        os.remove(result_file.absolute_file_path)

    # The job was run with cleanup=False, so remove its work dir ourselves.
    shutil.rmtree(final_context["work_dir"], ignore_errors=True)
def setUp(self):
    """Create the human and zebrafish organisms and keep them on the test case."""
    human = Organism(name="HOMO_SAPIENS", taxonomy_id=9606, is_scientific_name=True)
    human.save()
    self.homo_sapiens = human

    zebrafish = Organism(name="DANIO_RERIO", taxonomy_id=1337, is_scientific_name=True)
    zebrafish.save()
    self.danio_rerio = zebrafish
def test_convert_processed_illumina(self):
    """NO_OP should convert an already-processed Illumina sample table.

    Input rows look like:
        Reporter Identifier VALUE Detection Pval
        ILMN_1343291 14.943602 0
        ILMN_1343295 13.528082 0

    and are expected to come out as:
        ENSG00000156508 14.943602
        ENSG00000111640 13.528082
    """
    processor_job = ProcessorJob()
    processor_job.pipeline_applied = "NO_OP"
    processor_job.save()

    input_file = OriginalFile()
    input_file.source_filename = "https://www.ebi.ac.uk/arrayexpress/experiments/E-GEOD-22433/"
    input_file.filename = "GSM557500_sample_table.txt"
    input_file.absolute_file_path = (
        "/home/user/data_store/raw/TEST/NO_OP/GSM557500_sample_table.txt"
    )
    input_file.is_downloaded = True
    input_file.save()

    human = Organism(name="HOMO_SAPIENS", taxonomy_id=9606, is_scientific_name=True)
    human.save()

    illumina_sample = Sample()
    illumina_sample.accession_code = "GSM557500"
    illumina_sample.title = "GSM557500"
    illumina_sample.platform_accession_code = "A-MEXP-1171"
    illumina_sample.manufacturer = "ILLUMINA"
    illumina_sample.organism = human
    illumina_sample.save()

    # Link the file to its sample, then to the job that will process it.
    file_sample_link = OriginalFileSampleAssociation()
    file_sample_link.original_file = input_file
    file_sample_link.sample = illumina_sample
    file_sample_link.save()

    job_file_link = ProcessorJobOriginalFileAssociation()
    job_file_link.original_file = input_file
    job_file_link.processor_job = processor_job
    job_file_link.save()

    final_context = no_op.no_op_processor(processor_job.pk)
    output_path = final_context["output_file_path"]

    self.assertTrue(final_context["success"])
    self.assertTrue(os.path.exists(output_path))
    self.assertEqual(os.path.getsize(output_path), 920374)
    self.assertTrue(no_op.check_output_quality(output_path))
def test_convert_illumina_no_header(self):
    """NO_OP should convert an Illumina table that has no header row.

    Input rows look like:
        ILMN_1885639 10.0000 0.7931
        ILMN_2209417 10.0000 0.2029
        ILMN_1765401 152.0873 0.0000

    and are expected to come out as:
        ENSG00000105675 10
        ENSG00000085721 152.0873
        ENSG00000278494 152.0873
    """
    processor_job = ProcessorJob()
    processor_job.pipeline_applied = "NO_OP"
    processor_job.save()

    input_file = OriginalFile()
    input_file.source_filename = (
        "https://github.com/AlexsLemonade/refinebio/files/2255178/GSM1089291-tbl-1.txt"
    )
    input_file.filename = "GSM1089291-tbl-1.txt"
    input_file.absolute_file_path = "/home/user/data_store/raw/TEST/NO_OP/GSM1089291-tbl-1.txt"
    input_file.is_downloaded = True
    input_file.save()

    human = Organism(name="HOMO_SAPIENS", taxonomy_id=9606, is_scientific_name=True)
    human.save()

    illumina_sample = Sample()
    illumina_sample.accession_code = "GSM557500"
    illumina_sample.title = "GSM557500"
    illumina_sample.platform_accession_code = "A-MEXP-1171"
    illumina_sample.manufacturer = "ILLUMINA"
    illumina_sample.organism = human
    illumina_sample.save()

    # Link the file to its sample, then to the job that will process it.
    file_sample_link = OriginalFileSampleAssociation()
    file_sample_link.original_file = input_file
    file_sample_link.sample = illumina_sample
    file_sample_link.save()

    job_file_link = ProcessorJobOriginalFileAssociation()
    job_file_link.original_file = input_file
    job_file_link.processor_job = processor_job
    job_file_link.save()

    final_context = no_op.no_op_processor(processor_job.pk)
    output_path = final_context["output_file_path"]

    self.assertTrue(final_context["success"])
    self.assertTrue(os.path.exists(output_path))
    self.assertEqual(os.path.getsize(output_path), 786207)
def test_good_detection(self):
    """Platform detection should report illuminaHumanv3 for GSE54661.

    GSE54661 appears to be mislabeled (illuminaHumanv4) on GEO, so this
    shows that our detector works.
    """
    from data_refinery_workers.processors import illumina

    processor_job = ProcessorJob()
    processor_job.pipeline_applied = "ILLUMINA_TO_PCL"
    processor_job.save()

    input_file = OriginalFile()
    input_file.source_filename = "ftp://ftp.ncbi.nlm.nih.gov/geo/series/GSE54nnn/GSE54661/suppl/GSE54661%5Fnon%5Fnormalized%2Etxt%2Egz"
    input_file.filename = "GSE54661_non_normalized.txt"
    input_file.absolute_file_path = (
        "/home/user/data_store/raw/TEST/ILLUMINA/GSE54661_non_normalized.txt"
    )
    input_file.is_downloaded = True
    input_file.save()

    job_file_link = ProcessorJobOriginalFileAssociation()
    job_file_link.original_file = input_file
    job_file_link.processor_job = processor_job
    job_file_link.save()

    human = Organism(name="HOMO_SAPIENS", taxonomy_id=9606, is_scientific_name=True)
    human.save()

    hypoxia_sample = Sample()
    hypoxia_sample.accession_code = "ABCD-1234"
    hypoxia_sample.title = "hypoxia_Signal"
    hypoxia_sample.organism = human
    hypoxia_sample.save()

    file_sample_link = OriginalFileSampleAssociation()
    file_sample_link.original_file = input_file
    file_sample_link.sample = hypoxia_sample
    file_sample_link.save()

    final_context = illumina.illumina_to_pcl(processor_job.pk)
    self.assertEqual(final_context["platform"], "illuminaHumanv3")

    # The first annotation of the first sample may only carry these keys.
    allowed_keys = ["detected_platform", "detection_percentage", "mapped_percentage"]
    first_annotation = final_context["samples"][0].sampleannotation_set.all()[0]
    for key in first_annotation.data.keys():
        self.assertTrue(key in allowed_keys)

    # Cleanup after the job since it won't since we aren't running in cloud.
    shutil.rmtree(final_context["work_dir"], ignore_errors=True)
def test_bad_illumina_detection(self):
    """With the wrong species, the job should abort on the platform detection threshold."""
    # NOTE(review): taxonomy_id 9606 is the id the other tests in this file
    # pair with HOMO_SAPIENS; presumably only the name matters here - confirm.
    wrong_species = Organism(name="RATTUS_NORVEGICUS", taxonomy_id=9606, is_scientific_name=True)
    wrong_species.save()

    job = prepare_illumina_job({**GSE22427, "organism": wrong_species})
    final_context = illumina.illumina_to_pcl(job.pk, cleanup=False)

    self.assertTrue(final_context["abort"])

    # The job was run with cleanup=False, so remove its work dir ourselves.
    shutil.rmtree(final_context["work_dir"], ignore_errors=True)
def test_convert_illumina_bad_cols(self):
    """NO_OP should fail on an Illumina table with unexpected extra columns.

    In future, this test may be deprecated. For now it just alerts that
    it needs attention.

    Input rows look like:
        ILMN_1885639 10.0000 0.7931 11.0000 0.123
        ILMN_2209417 10.0000 0.2029 11.1234 0.543
        LMN_1765401 152.0873 0.0000 99.999 0.19
    """
    processor_job = ProcessorJob()
    processor_job.pipeline_applied = "NO_OP"
    processor_job.save()

    input_file = OriginalFile()
    input_file.source_filename = (
        "https://github.com/AlexsLemonade/refinebio/files/2255178/GSM1089291-tbl-1-modified.txt"
    )
    input_file.filename = "GSM1089291-tbl-1-modified.txt"
    input_file.absolute_file_path = (
        "/home/user/data_store/raw/TEST/NO_OP/GSM1089291-tbl-1-modified.txt"
    )
    input_file.is_downloaded = True
    input_file.save()

    human = Organism(name="HOMO_SAPIENS", taxonomy_id=9606, is_scientific_name=True)
    human.save()

    illumina_sample = Sample()
    illumina_sample.accession_code = "GSM557500"
    illumina_sample.title = "GSM557500"
    illumina_sample.platform_accession_code = "A-MEXP-1171"
    illumina_sample.manufacturer = "ILLUMINA"
    illumina_sample.organism = human
    illumina_sample.save()

    # Link the file to its sample, then to the job that will process it.
    file_sample_link = OriginalFileSampleAssociation()
    file_sample_link.original_file = input_file
    file_sample_link.sample = illumina_sample
    file_sample_link.save()

    job_file_link = ProcessorJobOriginalFileAssociation()
    job_file_link.original_file = input_file
    job_file_link.processor_job = processor_job
    job_file_link.save()

    final_context = no_op.no_op_processor(processor_job.pk)

    self.assertFalse(final_context["success"])
    self.assertTrue("Tell Rich!" in final_context["job"].failure_reason)
def setUp(self):
    """Create an SRA survey job for DRR002116 and seed the human organism."""
    self.survey_job = SurveyJob(source_type="SRA")
    self.survey_job.save()

    accession_pair = SurveyJobKeyValue(
        survey_job=self.survey_job, key="experiment_accession_code", value="DRR002116"
    )
    accession_pair.save()

    # Insert the organism into the database so the model doesn't call the
    # taxonomy API to populate it.
    human = Organism(name="HOMO_SAPIENS", taxonomy_id=9606, is_scientific_name=True)
    human.save()
def prepare_organism_indices():
    """Create the database fixtures for the C. elegans transcriptome index.

    Saves a short-index result with its OrganismIndex and ComputedFile,
    followed by a second ComputationalResult for the long index.
    """
    worm = Organism.get_object_for_name("CAENORHABDITIS_ELEGANS")

    # This is a lie, but this image doesn't have the dependencies for TRANSCRIPTOME_INDEX
    short_result = ComputationalResult(processor=utils.find_processor('SALMON_QUANT'))
    short_result.save()

    short_index = OrganismIndex()
    short_index.index_type = "TRANSCRIPTOME_SHORT"
    short_index.organism = worm
    short_index.result = short_result
    short_index.absolute_directory_path = "/home/user/data_store/salmon_tests/TRANSCRIPTOME_INDEX/SHORT"
    short_index.save()

    index_archive = ComputedFile()
    # This path will not be used because we already have the files extracted.
    index_archive.absolute_file_path = "/home/user/data_store/salmon_tests/TRANSCRIPTOME_INDEX/SHORT/celgans_short.tar.gz"
    index_archive.result = short_result
    index_archive.size_in_bytes = 1337
    index_archive.sha1 = "ABC"
    index_archive.save()

    # This is a lie, but this image doesn't have the dependencies for TX_IMPORT
    long_result = ComputationalResult(processor=utils.find_processor('SALMON_QUANT'))
    long_result.save()
def _prepare_input(job_context: Dict) -> Dict:
    """Prepare the job context for building a compendium.

    Runs ``smashing_utils.prepare_files`` and, unless the job has already
    failed, records the organism, points the work dir at a per-organism
    directory under ``SMASHING_DIR``, computes the compendium version and
    regroups every sample under the single organism name.

    Args:
        job_context: The job's context dict; it must contain ``"job"``.

    Returns:
        The updated job context. If the job is already marked as failed,
        none of the organism-dependent keys are added.
    """
    start_time = log_state("prepare input", job_context["job"].id)

    # We'll store here all sample accession codes that didn't make it into the compendia
    # with the reason why not.
    job_context["filtered_samples"] = {}

    job_context = smashing_utils.prepare_files(job_context)

    # Everything below depends on the organism name, so stop here when the
    # job has already failed instead of reading keys that were never set.
    if job_context["job"].success is False:
        log_state("prepare input done", job_context["job"].id, start_time)
        return job_context

    # Compendia jobs only run for one organism, so we know the only
    # key will be the organism name.
    organism_name = job_context["group_by_keys"][0]
    job_context["organism_name"] = organism_name

    # TEMPORARY for iterating on compendia more quickly. Rather
    # than downloading the data from S3 each run we're just gonna
    # use the same directory every job.
    job_context["old_work_dir"] = job_context["work_dir"]
    job_context["work_dir"] = SMASHING_DIR + organism_name + "/"
    # exist_ok avoids the check-then-create race between jobs that share
    # this directory.
    os.makedirs(job_context["work_dir"], exist_ok=True)

    job_context["organism_object"] = Organism.get_object_for_name(organism_name)

    # The next version is one more than the number of existing compendia
    # for this organism that are not quant.sf-only.
    existing_compendia = CompendiumResult.objects.filter(
        primary_organism=job_context["organism_object"], quant_sf_only=False
    )
    job_context["compendium_version"] = existing_compendia.count() + 1

    # Remember the original organism keys before collapsing every sample
    # under the single organism name.
    job_context["all_organisms"] = job_context["samples"].keys()
    all_samples = list(itertools.chain(*job_context["samples"].values()))
    job_context["samples"] = {organism_name: all_samples}

    log_state("prepare input done", job_context["job"].id, start_time)
    return job_context
def prepare_job():
    """Create and return an AFFY_TO_PCL processor job with one CEL file and sample."""
    processor_job = ProcessorJob()
    processor_job.pipeline_applied = "AFFY_TO_PCL"
    processor_job.save()

    cel_file = OriginalFile()
    cel_file.source_filename = "ftp://ftp.ebi.ac.uk/pub/databases/microarray/data/experiment/GEOD/E-GEOD-59071/E-GEOD-59071.raw.3.zip"
    cel_file.filename = "GSM1426071_CD_colon_active_1.CEL"
    cel_file.absolute_file_path = "/home/user/data_store/raw/TEST/CEL/GSM1426071_CD_colon_active_1.CEL"
    cel_file.save()

    job_file_link = ProcessorJobOriginalFileAssociation()
    job_file_link.original_file = cel_file
    job_file_link.processor_job = processor_job
    job_file_link.save()

    worm = Organism.get_object_for_name("CAENORHABDITIS_ELEGANS")

    unprocessed_sample = Sample()
    unprocessed_sample.title = "Heyo"
    unprocessed_sample.organism = worm
    unprocessed_sample.is_processed = False
    unprocessed_sample.save()

    file_sample_link = OriginalFileSampleAssociation()
    file_sample_link.sample = unprocessed_sample
    file_sample_link.original_file = cel_file
    file_sample_link.save()

    return processor_job
def get_organism_with_qn_target():
    """Create and return a DANIO_RERIO organism whose qn_target result has a target file."""
    target_result = ComputationalResult()
    target_result.save()

    target_file = ComputedFile()
    target_file.filename = "danio_target.tsv"
    target_file.absolute_file_path = "/home/user/data_store/QN/danio_target.tsv"
    target_file.is_qn_target = True
    target_file.size_in_bytes = "12345"
    target_file.sha1 = "aabbccddeeff"
    target_file.result = target_result
    target_file.save()

    zebrafish = Organism(name="DANIO_RERIO", taxonomy_id=1, qn_target=target_result)
    zebrafish.save()
    return zebrafish
def test_single_read(self):
    """Salmontools on a single-read sample should reproduce the expected output file."""
    salmontools_dir = self.test_dir + 'single_salmontools/'
    job_context = {
        'job_id': 456,
        'job': ProcessorJob(),
        'pipeline': Pipeline(name="Salmon"),
        'input_file_path': self.test_dir + 'single_input/single_read.fastq',
        'output_directory': self.test_dir + 'single_output/',
        'salmontools_directory': salmontools_dir,
        'salmontools_archive': self.test_dir + 'salmontools-result.tar.gz',
        'computed_files': [],
    }
    os.makedirs(salmontools_dir, exist_ok=True)

    human = Organism.get_object_for_name("HOMO_SAPIENS")
    human_sample = Sample()
    human_sample.organism = human
    human_sample.save()
    job_context["sample"] = human_sample

    salmon._run_salmontools(job_context)

    # Confirm job status
    self.assertTrue(job_context["success"])

    # Unpack result for checking
    os.system('gunzip ' + job_context['salmontools_directory'] + "*.gz")

    # Compare the unpacked output against the expected fixture.
    actual_path = job_context['salmontools_directory'] + 'unmapped_by_salmon.fa'
    expected_path = self.test_dir + 'expected_single_output/unmapped_by_salmon.fa'
    self.assertTrue(identical_checksum(actual_path, expected_path))
def test_salmontools_with_bad_processor(self):
    """Test salmontools with a bad processor key.

    Temporarily points the SALMONTOOLS processor at a bogus yml file and
    checks that the job fails with a "Failed to set processor:" reason.
    """
    test_dir = '/home/user/data_store/salmontools/'
    job_context = {
        'job_id': 123,
        'job': ProcessorJob.objects.create(),
        'pipeline': Pipeline(name="Salmon"),
        'input_file_path': test_dir + 'double_input/reads_1.fastq',
        'input_file_path_2': test_dir + 'double_input/reads_2.fastq',
        'salmontools_directory': test_dir + 'double_salmontools/',
        'salmontools_archive': test_dir + 'salmontools-result.tar.gz',
        'output_directory': test_dir + 'double_output/'
    }
    os.makedirs(job_context["salmontools_directory"], exist_ok=True)

    homo_sapiens = Organism.get_object_for_name("HOMO_SAPIENS")
    sample = Sample()
    sample.organism = homo_sapiens
    sample.save()
    job_context["sample"] = sample

    # Set the wrong yml filename on purpose to mess up Salmontools processor
    original_yml_file = utils.ProcessorEnum['SALMONTOOLS'].value['yml_file']
    utils.ProcessorEnum['SALMONTOOLS'].value['yml_file'] = 'foobar.yml'
    try:
        salmon._run_salmontools(job_context)

        self.assertEqual(job_context["success"], False)
        self.assertTrue(job_context["job"].failure_reason.startswith('Failed to set processor:'))
    finally:
        # Change yml filename back even when the call or an assertion
        # fails, so the patched value cannot leak into other tests.
        utils.ProcessorEnum['SALMONTOOLS'].value['yml_file'] = original_yml_file
def prepare_dotsra_job(filename="ERR1562482.sra"):
    """Create a SALMON processor job for one .sra file.

    Args:
        filename: Name of the file under /home/user/data_store/raw/TEST/SALMON/.

    Returns:
        A ``(processor_job, [original_file])`` tuple.
    """
    processor_job = ProcessorJob()
    processor_job.pipeline_applied = "SALMON"
    processor_job.id = random.randint(111, 999999)
    processor_job.save()

    worm = Organism.get_object_for_name("CAENORHABDITIS_ELEGANS")
    worm_sample = Sample()
    worm_sample.accession_code = "SALMON"  # So the test files go to the right place
    worm_sample.organism = worm
    worm_sample.save()

    prepare_organism_indices()

    sra_file = OriginalFile()
    sra_file.source_filename = filename
    sra_file.filename = filename
    sra_file.absolute_file_path = "/home/user/data_store/raw/TEST/SALMON/" + filename
    sra_file.save()

    file_sample_link = OriginalFileSampleAssociation()
    file_sample_link.original_file = sra_file
    file_sample_link.sample = worm_sample
    file_sample_link.save()

    job_file_link = ProcessorJobOriginalFileAssociation()
    job_file_link.original_file = sra_file
    job_file_link.processor_job = processor_job
    job_file_link.save()

    return processor_job, [sra_file]
def test_survey(self, mock_get, mock_urlopen, mock_send_job):
    """Surveying the species fixture should queue one downloader job per species."""
    fixture_path = os.path.join(os.path.dirname(__file__), "test_transcriptome_species.json")
    with open(fixture_path, "r") as fixture:
        species_json = json.load(fixture)

    # Insert the organisms into the database so the model doesn't call the
    # taxonomy API to populate them.
    for entry in species_json:
        # Account for the subtle difference between the API for
        # the main Ensembl division and the API for the rest of
        # them.
        if "common_name" in entry:
            name_key = "common_name"
        else:
            name_key = "name"
        if "taxonomy_id" in entry:
            taxonomy_key = "taxonomy_id"
        else:
            taxonomy_key = "taxon_id"

        Organism(
            name=entry[name_key].upper(),
            taxonomy_id=entry[taxonomy_key],
            is_scientific_name=True,
        ).save()

    mock_get.return_value = Mock(ok=True)
    mock_get.return_value.json.return_value = species_json

    # There are two possible file locations. The correct one is
    # determined by making a request to one to see if it
    # exists. This URLError simulates it not existing.
    mock_urlopen.side_effect = URLError("404 or something")

    TranscriptomeIndexSurveyor(self.survey_job).survey()

    downloader_jobs = DownloaderJob.objects.order_by("id").all()
    self.assertEqual(downloader_jobs.count(), len(species_json))

    expected_calls = [
        call(Downloaders.TRANSCRIPTOME_INDEX, queued_job.id) for queued_job in downloader_jobs
    ]
    mock_send_job.assert_has_calls(expected_calls)

    # There should be 2 Batches for each species (long and short
    # transcriptome lengths).
    batches = Batch.objects.all()
    self.assertEqual(batches.count(), len(species_json) * 2)

    # And each batch has two files: fasta and gtf
    for batch in batches:
        self.assertEqual(len(batch.files), 2)
def test_unmated_reads(self):
    """Survey and start downloading an SRA sample known to have unmated reads.

    This test uses VCR to remove the dependence upon NCBI's servers, but
    the downloader job hits ENA's FTP and aspera servers. Unfortunately
    there's not much that can be done to avoid that from here, because the
    downloader jobs always check ENA's FTP server to see if the file has an
    unmated read. For now we'll just have to be content with the fact that
    NCBI going down won't affect this test.
    """
    # NOTE(review): "RUNING_IN_CLOUD" looks like a misspelling of
    # RUNNING_IN_CLOUD - confirm which name the settings actually read.
    self.env = EnvironmentVarGuard()
    self.env.set("RUNING_IN_CLOUD", "False")
    with self.env:
        # Clear out pre-existing work dirs so there's no conflicts:
        stale_dirs = glob.glob(LOCAL_ROOT_DIR + "/processor_job_*")
        for stale_dir in stale_dirs:
            shutil.rmtree(stale_dir)

        # prevent a call being made to NCBI's API to determine
        # organism name/id.
        human = Organism(name="HOMO_SAPIENS", taxonomy_id=9606, is_scientific_name=True)
        human.save()

        # Survey just a single run to make things faster!
        # This sample has unmated reads!
        survey_job = surveyor.survey_experiment("SRR1603661", "SRA")
        self.assertTrue(survey_job.success)

        # Let's give the downloader a little bit to get started
        # and to update the OriginalFiles' source_urls.
        time.sleep(60)

        downloader_jobs = DownloaderJob.objects.all()
        self.assertEqual(downloader_jobs.count(), 1)

        only_job = downloader_jobs.first()
        self.assertIsNotNone(only_job.start_time)

        for downloaded in only_job.original_files.all():
            self.assertTrue(".fastq.gz" in downloaded.source_url)

        # The downloader job will take a while to complete. Let's not wait.
        print(only_job.kill_nomad_job())
def test_salmon_quant_one_sample_double_reads(self):
    """Test `salmon quant` on a sample that has double reads.

    The experiment also contains a second sample without any data, so it
    must not be reported as ready for tximport afterwards.
    """
    # Set up organism index database objects.
    prepare_organism_indices()

    # Create an Experiment that includes two samples.
    # (The first sample has test data available, but the second does not.)
    experiment_accession = 'test_experiment'
    experiment = Experiment.objects.create(accession_code=experiment_accession)

    c_elegans = Organism.get_object_for_name("CAENORHABDITIS_ELEGANS")

    # test_sample record
    sample_accession = 'test_sample'
    test_sample = Sample.objects.create(accession_code=sample_accession,
                                        organism=c_elegans,
                                        source_database='SRA',
                                        technology='RNA-SEQ')
    ExperimentSampleAssociation.objects.create(experiment=experiment, sample=test_sample)

    # fake_sample record (created to prevent tximport step in this experiment)
    fake_sample = Sample.objects.create(accession_code='fake_sample',
                                        source_database='SRA',
                                        technology='RNA-SEQ')
    ExperimentSampleAssociation.objects.create(experiment=experiment, sample=fake_sample)

    experiment_dir = '/home/user/data_store/salmon_tests/test_experiment'

    og_read_1 = OriginalFile()
    og_read_1.absolute_file_path = os.path.join(experiment_dir, 'raw/reads_1.fastq')
    og_read_1.filename = "reads_1.fastq"
    og_read_1.save()
    # objects.create() already persists the row, so no extra save() call.
    OriginalFileSampleAssociation.objects.create(original_file=og_read_1, sample=test_sample)

    og_read_2 = OriginalFile()
    og_read_2.absolute_file_path = os.path.join(experiment_dir, "raw/reads_2.fastq")
    # The filename has to match the second read's path above.
    og_read_2.filename = "reads_2.fastq"
    og_read_2.save()
    OriginalFileSampleAssociation.objects.create(original_file=og_read_2, sample=test_sample)

    sample_dir = os.path.join(experiment_dir, 'test_sample')

    job_context = salmon._prepare_files({"job_dir_prefix": "TEST",
                                         "job_id": "TEST",
                                         "job": ProcessorJob(),
                                         'pipeline': Pipeline(name="Salmon"),
                                         'computed_files': [],
                                         "original_files": [og_read_1, og_read_2]})

    # Run salmon.
    self.check_salmon_quant(job_context, sample_dir)

    # Confirm that this experiment is not ready for tximport yet,
    # because `salmon quant` is not run on 'fake_sample'.
    experiments_ready = salmon.get_tximport_inputs(job_context)['tximport_inputs']
    self.assertEqual(len(experiments_ready), 0)
class APITestCases(APITestCase):
    """API tests that run against a human and a zebrafish organism."""

    def setUp(self):
        """Create the two organisms used by the tests."""
        human = Organism(name="HOMO_SAPIENS", taxonomy_id=9606, is_scientific_name=True)
        human.save()
        self.homo_sapiens = human

        zebrafish = Organism(name="DANIO_RERIO", taxonomy_id=1337, is_scientific_name=True)
        zebrafish.save()
        self.danio_rerio = zebrafish

    def tearDown(self):
        """Remove every organism again."""
        Organism.objects.all().delete()

    def test_qn_endpoints(self):
        """Both organisms with a QN target should be listed as available."""
        # One shared result carries both QN annotations.
        qn_result = ComputationalResult()
        qn_result.commands.append("create_qn_target.py")
        qn_result.is_ccdl = True
        qn_result.is_public = True
        qn_result.processor = None
        qn_result.save()

        # (organism, platform accession code, gene set) - zebrafish first.
        annotation_specs = (
            (self.danio_rerio, "zebrafish", ["RWWJ000001", "RWWJ000002"]),
            (self.homo_sapiens, "zebrafishplusone", ["RWWJ000003", "RWWJ000004"]),
        )
        for organism, platform, genes in annotation_specs:
            annotation = ComputationalResultAnnotation()
            annotation.result = qn_result
            annotation.data = {
                "organism_id": organism.id,
                "is_qn": True,
                "platform_accession_code": platform,
                "samples": [],
                "geneset": str(genes),
            }
            annotation.save()

        # Point both organisms at the result as their QN target.
        for organism in (self.homo_sapiens, self.danio_rerio):
            organism.qn_target = qn_result
            organism.save()

        targets_url = reverse("qn_targets_available", kwargs={"version": API_VERSION})
        response = self.client.get(targets_url)
        self.assertEqual(len(response.json()), 2)
def test_illumina_to_pcl(self):
    """Run the basic Illumina to PCL conversion and check each sample's result file."""
    from data_refinery_workers.processors import illumina

    human = Organism(name="HOMO_SAPIENS", taxonomy_id=9606, is_scientific_name=True)
    human.save()

    job = prepare_illumina_job(human)
    final_context = illumina.illumina_to_pcl(job.pk)

    # Every sample must have a smashable result file on disk; delete each
    # one once it has been checked.
    for processed_sample in final_context["samples"]:
        result_file = processed_sample.get_most_recent_smashable_result_file()
        self.assertTrue(os.path.exists(result_file.absolute_file_path))
        os.remove(result_file.absolute_file_path)

    # Cleanup after the job since it won't since we aren't running in cloud.
    shutil.rmtree(final_context["work_dir"], ignore_errors=True)
def test_detect_columns(self):
    """Column detection should find the probe ID column and every LV- sample column."""
    human = Organism(name="HOMO_SAPIENS", taxonomy_id=9606, is_scientific_name=True)
    human.save()

    job = prepare_illumina_job({**GSE22427, "organism": human})

    # Run the pipeline only up to (and including) column detection.
    steps = [
        utils.start_job,
        illumina._prepare_files,
        illumina._detect_encoding,
        illumina._sanitize_input_file,
        illumina._convert_sanitized_to_tsv,
        illumina._detect_columns,
    ]
    initial_context = {"job_id": job.id, "pipeline": Pipeline(name=PipelineEnum.ILLUMINA.value)}
    final_context = utils.run_pipeline(initial_context, steps)

    self.assertNotEqual(final_context.get("success"), False)

    # For this experiment, the probe ID is the first column
    self.assertEqual(final_context.get("probeId"), GSE22427_HEADER[0])

    # For this header file, the samples all have the prefix LV-.
    # We use start=1 here because the column IDs are formatted
    # for R code so they treat the header as a 1-indexed list
    expected_column_ids = ",".join(
        str(position)
        for position, column_name in enumerate(GSE22427_HEADER, start=1)
        if column_name.startswith("LV-")
    )
    self.assertEqual(final_context.get("columnIds"), expected_column_ids)
def prepare_job(job_info: dict) -> ProcessorJob:
    """Create a NO_OP processor job with one original file and one sample.

    Args:
        job_info: Must contain "source_filename", "filename",
            "absolute_file_path", "accession_code" and
            "platform_accession_code"; "manufacturer" is optional.

    Returns:
        The saved ProcessorJob, associated with the original file.
    """
    processor_job = ProcessorJob()
    processor_job.pipeline_applied = "NO_OP"
    processor_job.save()

    input_file = OriginalFile()
    input_file.source_filename = job_info["source_filename"]
    input_file.filename = job_info["filename"]
    input_file.absolute_file_path = job_info["absolute_file_path"]
    input_file.is_downloaded = True
    input_file.save()

    new_sample = Sample()
    new_sample.accession_code = job_info["accession_code"]
    new_sample.title = job_info["accession_code"]
    new_sample.platform_accession_code = job_info["platform_accession_code"]

    manufacturer = job_info.get("manufacturer")
    if manufacturer is not None:
        new_sample.manufacturer = manufacturer

    # The illumina samples need the human organism
    if manufacturer == "ILLUMINA":
        human = Organism(name="HOMO_SAPIENS", taxonomy_id=9606, is_scientific_name=True)
        human.save()
        new_sample.organism = human

    new_sample.save()

    file_sample_link = OriginalFileSampleAssociation()
    file_sample_link.original_file = input_file
    file_sample_link.sample = new_sample
    file_sample_link.save()

    job_file_link = ProcessorJobOriginalFileAssociation()
    job_file_link.original_file = input_file
    job_file_link.processor_job = processor_job
    job_file_link.save()

    return processor_job
def test_illumina_quoted_row_names(self):
    """Headers and probe IDs must no longer be quoted in the sanitized file."""
    human = Organism(name="HOMO_SAPIENS", taxonomy_id=9606, is_scientific_name=True)
    human.save()

    job_info = {
        "source_filename": "ftp://ftp.ncbi.nlm.nih.gov/geo/series/GSE33nnn/GSE33814/suppl/GSE33814%5Fnon%2Dnormalized%2Etxt%2Egz",
        # Some of the columns are trimmed to save space and time
        "filename": "GSE33814_trimmed_non-normalized.txt",
        "absolute_file_path": "/home/user/data_store/raw/TEST/ILLUMINA/GSE33814_trimmed_non-normalized.txt",
        "organism": human,
        "samples": [
            ("GSM836222", "IMGUS_32"),
            ("GSM836223", "IMGUS_33"),
        ],
    }
    pj = prepare_illumina_job(job_info)

    final_context = illumina.illumina_to_pcl(pj.pk, cleanup=False)
    self.assertSucceeded(pj)

    def assertNotQuoted(string: str):
        """Assert that the stripped value neither starts nor ends with a double quote."""
        stripped = string.strip()
        self.assertNotEqual(stripped[0], '"')
        self.assertNotEqual(stripped[-1], '"')

    # Make sure that the row names are no longer quoted after sanitizing the file
    with open(final_context["sanitized_file_path"], "r") as sanitized:
        rows = csv.reader(sanitized, delimiter="\t")

        for header in next(rows):
            assertNotQuoted(header)

        # Also make sure the probe IDs aren't quoted
        first_data_row = next(rows)
        assertNotQuoted(first_data_row[0])