def test_salmon_quant_one_sample_double_reads(self):
    """Test `salmon quant` on a sample that has double reads.

    Also confirms the experiment is NOT ready for tximport afterwards,
    because the second (fake) sample has no `salmon quant` results.
    """
    # Set up organism index database objects.
    prepare_organism_indices()

    # Create an Experiment that includes two samples.
    # (The first sample has test data available, but the second does not.)
    experiment_accession = 'test_experiment'
    experiment = Experiment.objects.create(accession_code=experiment_accession)

    c_elegans = Organism.get_object_for_name("CAENORHABDITIS_ELEGANS")

    # test_sample record
    sample_accession = 'test_sample'
    test_sample = Sample.objects.create(accession_code=sample_accession,
                                        organism=c_elegans,
                                        source_database='SRA',
                                        technology='RNA-SEQ')
    ExperimentSampleAssociation.objects.create(experiment=experiment, sample=test_sample)

    # fake_sample record (created to prevent tximport step in this experiment)
    fake_sample = Sample.objects.create(accession_code='fake_sample',
                                        source_database='SRA',
                                        technology='RNA-SEQ')
    ExperimentSampleAssociation.objects.create(experiment=experiment, sample=fake_sample)

    experiment_dir = '/home/user/data_store/salmon_tests/test_experiment'

    og_read_1 = OriginalFile()
    og_read_1.absolute_file_path = os.path.join(experiment_dir, 'raw/reads_1.fastq')
    og_read_1.filename = "reads_1.fastq"
    og_read_1.save()

    OriginalFileSampleAssociation.objects.create(
        original_file=og_read_1, sample=test_sample).save()

    og_read_2 = OriginalFile()
    og_read_2.absolute_file_path = os.path.join(experiment_dir, "raw/reads_2.fastq")
    # BUGFIX: this previously set the filename to "reads_1.fastq", which
    # did not match the second read's absolute_file_path above.
    og_read_2.filename = "reads_2.fastq"
    og_read_2.save()

    OriginalFileSampleAssociation.objects.create(
        original_file=og_read_2, sample=test_sample).save()

    sample_dir = os.path.join(experiment_dir, 'test_sample')

    job_context = salmon._prepare_files({"job_dir_prefix": "TEST",
                                         "job_id": "TEST",
                                         "job": ProcessorJob(),
                                         'pipeline': Pipeline(name="Salmon"),
                                         'computed_files': [],
                                         "original_files": [og_read_1, og_read_2]})

    # Run salmon.
    self.check_salmon_quant(job_context, sample_dir)

    # Confirm that this experiment is not ready for tximport yet,
    # because `salmon quant` is not run on 'fake_sample'.
    experiments_ready = salmon.get_tximport_inputs(job_context)['tximport_inputs']
    self.assertEqual(len(experiments_ready), 0)
def test_salmon_determine_index_length_double_read(self):
    """A double-read sample yields raw length 41, i.e. a 'short' index."""
    job, original_files = prepare_job()

    # Build the job context the same way the processor pipeline would.
    context = salmon._set_job_prefix({'original_files': original_files,
                                      'job_id': job})
    context = salmon._prepare_files(context)

    outcome = salmon._determine_index_length(context)

    self.assertEqual(41, outcome['index_length_raw'])
    self.assertEqual('short', outcome['index_length'])
def test_prepare_files_failure(self):
    """`_prepare_files` marks the job failed when a raw file is missing."""
    batch, _, _ = init_objects()
    job = ProcessorJob.create_job_and_relationships(batches=[batch])

    context = utils.start_job({
        "job": job,
        "job_id": job.id
    })
    context = salmon._prepare_files(context)

    self.assertFalse(context["success"])
    self.assertEqual(
        job.failure_reason,
        "Exception caught while retrieving raw file ERR003000_2.fastq.gz")
    # The temp copy of the raw file must not have been left behind.
    self.assertFalse(os.path.isfile(batch.files[0].get_temp_pre_path()))
def test_download_index_not_found(self):
    """`_download_index` fails when no index exists for the kmer_size."""
    batch, _, _ = init_objects()
    processor_job = ProcessorJob.create_job_and_relationships(
        batches=[batch])
    job_context = utils.start_job({
        "job": processor_job,
        "job_id": processor_job.id,
        "kmer_size": "23"
    })
    job_context = salmon._prepare_files(job_context)

    # Function we're testing.
    salmon._download_index(job_context)

    self.assertFalse(job_context["success"])
    # BUGFIX: the expected organism name was garbled ("H**O SAPIENS",
    # an artifact of text masking); restored to the upper-cased
    # scientific name used elsewhere in these tests.
    self.assertEqual(
        processor_job.failure_reason,
        "Failed to find an index for organism HOMO SAPIENS with kmer_size of 23."
    )
def test_fastqc(self):
    """FastQC/MultiQC succeed on real files and fail on an empty file list."""
    job, og_files = prepare_job()

    good_context = {
        'job': job,
        'job_id': 789,
        'job_dir_prefix': "processor_job_789",
        'pipeline': Pipeline(name="Salmon"),
        'qc_directory': "/home/user/data_store/raw/TEST/SALMON/qc",
        'original_files': og_files,
        'input_file_path': og_files[0],
        'input_file_path_2': og_files[1],
        "computed_files": [],
        'success': True
    }

    # Ensure clean testdir
    shutil.rmtree(good_context['qc_directory'], ignore_errors=True)
    os.makedirs(good_context['qc_directory'], exist_ok=True)

    good_context = salmon._prepare_files(good_context)

    outcome = salmon._run_fastqc(good_context)
    self.assertTrue(outcome['success'])
    outcome = salmon._run_multiqc(good_context)
    self.assertTrue(outcome['success'])

    for qc_file in outcome['qc_files']:
        self.assertTrue(os.path.isfile(qc_file.absolute_file_path))

    bad_context = {
        'job': job,
        'job_id': 'hippityhoppity',
        'pipeline': Pipeline(name="Salmon"),
        'qc_directory': "/home/user/data_store/raw/TEST/SALMON/derp",
        'original_files': [],
        'success': True,
        'computed_files': []
    }
    outcome = salmon._run_fastqc(bad_context)
    self.assertFalse(outcome['success'])
def test_download_index_missing(self, mock_download_processed_file):
    """`_download_index` fails when the index tarball cannot be downloaded.

    NOTE(review): `mock_download_processed_file` is injected by a patch
    decorator that is outside this excerpt — presumably patching the
    file-download helper used by `_download_index`; confirm at the decorator.
    """
    batch, _, _ = init_objects()
    _insert_salmon_index()
    processor_job = ProcessorJob.create_job_and_relationships(
        batches=[batch])
    job_context = utils.start_job({
        "job": processor_job,
        "job_id": processor_job.id,
        "job_dir_prefix": "dummy",
        "kmer_size": "23"
    })
    job_context = salmon._prepare_files(job_context)

    # Simulate the download helper raising because the file is absent.
    mock_download_processed_file.side_effect = FileNotFoundError()

    # The function being testing.
    salmon._download_index(job_context)

    # The job must be marked failed with a descriptive reason.
    self.assertFalse(job_context["success"])
    self.assertEqual(
        processor_job.failure_reason,
        "Failed to download and extract index tarball Homo_sapiens_short.gtf.gz"
    )
def test_salmon_quant_two_samples_single_read(self):
    """Test `salmon quant` outputs on two samples that have single
    read and that belong to same experiment.

    tximport should only run after BOTH samples have quant results.
    """
    prepare_organism_indices()

    # Create one experiment and two related samples, based on:
    # https://www.ncbi.nlm.nih.gov/sra/?term=SRP040623
    # (For testing purpose, only two of the four samples' data are included.)
    experiment_accession = 'PRJNA242809'
    experiment = Experiment.objects.create(accession_code=experiment_accession)

    c_elegans = Organism.get_object_for_name("CAENORHABDITIS_ELEGANS")

    ## Sample 1
    sample1_accession = 'SRR1206053'
    sample1 = Sample.objects.create(accession_code=sample1_accession,
                                    organism=c_elegans)
    ExperimentSampleAssociation.objects.create(experiment=experiment, sample=sample1)

    experiment_dir = "/home/user/data_store/salmon_tests/PRJNA242809"
    og_file_1 = OriginalFile()
    og_file_1.absolute_file_path = os.path.join(experiment_dir, "raw/SRR1206053.fastq.gz")
    og_file_1.filename = "SRR1206053.fastq.gz"
    og_file_1.save()

    OriginalFileSampleAssociation.objects.create(
        original_file=og_file_1, sample=sample1).save()

    ## Sample 2
    sample2_accession = 'SRR1206054'
    sample2 = Sample.objects.create(accession_code=sample2_accession,
                                    organism=c_elegans)
    ExperimentSampleAssociation.objects.create(experiment=experiment, sample=sample2)

    og_file_2 = OriginalFile()
    og_file_2.absolute_file_path = os.path.join(experiment_dir, "raw/SRR1206054.fastq.gz")
    og_file_2.filename = "SRR1206054.fastq.gz"
    og_file_2.save()

    OriginalFileSampleAssociation.objects.create(
        original_file=og_file_2, sample=sample2).save()

    # Test `salmon quant` on sample1 (SRR1206053)
    sample1_dir = os.path.join(experiment_dir, sample1_accession)
    job1_context = salmon._prepare_files({"job_dir_prefix": "TEST",
                                          "job_id": "TEST",
                                          'pipeline': Pipeline(name="Salmon"),
                                          'computed_files': [],
                                          "original_files": [og_file_1]})

    # Check quant.sf in `salmon quant` output dir of sample1
    self.check_salmon_quant(job1_context, sample1_dir)

    # Confirm that this experiment is not ready for tximport yet.
    experiments_ready = salmon._get_tximport_inputs(job1_context)
    self.assertEqual(len(experiments_ready), 0)
    # This job should not have produced any tximport output
    # because the other sample isn't ready yet.
    self.assertFalse(os.path.exists(os.path.join(job1_context["work_dir"], 'txi_out.RDS')))

    # Now run `salmon quant` on sample2 (SRR1206054) too
    sample2_dir = os.path.join(experiment_dir, sample2_accession)
    job2_context = salmon._prepare_files({"job_dir_prefix": "TEST2",
                                          "job_id": "TEST2",
                                          'pipeline': Pipeline(name="Salmon"),
                                          'computed_files': [],
                                          "original_files": [og_file_2]})

    # Clean up tximport output:
    rds_filename = os.path.join(job2_context["work_dir"], 'txi_out.RDS')
    if os.path.isfile(rds_filename):
        os.remove(rds_filename)

    # Check quant.sf in `salmon quant` output dir of sample2
    self.check_salmon_quant(job2_context, sample2_dir)

    # rds_filename should have been generated by tximport at this point.
    # Note: `tximport` step is launched by subprocess module in Python.
    # If input "quant.sf" files are too large, we may have to wait for
    # a few seconds before testing the existence of rds_filename.
    self.assertTrue(os.path.exists(rds_filename))

    for computed_file in job2_context['computed_files']:
        if computed_file.filename[-4:] == '.RDS':
            rds_file_path = computed_file.absolute_file_path

    cmd_tokens = [
        "/usr/bin/Rscript", "--vanilla",
        "/home/user/data_refinery_workers/processors/test_tximport.R",
        "--txi_out", rds_file_path,
        "--gene2txmap", job2_context["genes_to_transcripts_path"]
    ]
    tximport_test_result = subprocess.run(cmd_tokens,
                                          stdout=subprocess.PIPE,
                                          stderr=subprocess.PIPE)
    # BUGFIX: was `if returncode != 0: self.assertTrue(False)`; assert on
    # the exit code directly so a failure reports something meaningful.
    self.assertEqual(tximport_test_result.returncode, 0,
                     "tximport validation script failed")

    # Check the individual files.
    # BUGFIX: this was `self.assertTrue(len(...), 2)`, which passes for ANY
    # non-empty list because the 2 is treated as the assertion *message*.
    self.assertEqual(len(job2_context['individual_files']), 2)
    for file in job2_context['individual_files']:
        self.assertTrue(os.path.isfile(file.absolute_file_path))
def test_success(self):
    """Tests the successful path of the module under test."""
    logger.info("STARTING SALMON SUCCESS TEST!!!!!!!!")

    # Set up test environment.
    batch, first_file, second_file = init_objects()
    _insert_salmon_index()

    # Change the batch/files to point to test-specific locations
    batch.platform_accession_code = "TEST"
    batch.save()
    first_file.internal_location = "TEST/SALMON"
    first_file.save()
    second_file.internal_location = "TEST/SALMON"
    second_file.save()

    processor_job = ProcessorJob.create_job_and_relationships(
        batches=[batch])
    processor_job.save()
    job_context = utils.start_job({
        "job": processor_job,
        "job_id": processor_job.id
    })
    job_context = salmon._set_job_prefix(job_context)

    # Ensure temp dir isn't leftover from a previous test.
    temp_dir = first_file.get_temp_dir(job_context["job_dir_prefix"])
    shutil.rmtree(temp_dir, ignore_errors=True)

    # One of the functions being tested:
    job_context = salmon._prepare_files(job_context)

    # Both input read files must exist on disk after preparation.
    input_file_path = job_context["input_file_path"]
    self.assertIsInstance(input_file_path, str)
    self.assertTrue(os.path.isfile(input_file_path))
    input_file_path_2 = job_context["input_file_path_2"]
    self.assertIsInstance(input_file_path_2, str)
    self.assertTrue(os.path.isfile(input_file_path_2))
    output_directory_path = job_context["output_directory"]
    self.assertIsInstance(output_directory_path, str)
    self.assertTrue(os.path.isdir(output_directory_path))

    job_context = salmon._determine_index_length(job_context)

    # The 'kmer_size' key has been added to job_context with the
    # correct value.
    self.assertEqual(job_context["kmer_size"], "23")

    # Another function being tested
    job_context = salmon._download_index(job_context)
    self.assertTrue(job_context["success"])
    self.assertTrue("index_directory" in job_context)
    self.assertTrue(os.path.isdir(job_context["index_directory"]))
    # NOTE(review): 9 is the expected number of files in an extracted
    # salmon index directory for this fixture — confirm against the tarball.
    self.assertEqual(9, len(os.listdir(job_context["index_directory"])))

    # Another function being tested
    job_context = salmon._run_salmon(job_context)
    self.assertTrue(job_context["success"])
    self.assertGreater(len(os.listdir(output_directory_path)), 1)

    # The last function to test
    job_context = salmon._zip_and_upload(job_context)
    self.assertTrue(job_context["success"])
    self.assertTrue(os.path.exists(first_file.get_processed_path()))

    # Clean up both input and output files
    first_file.remove_temp_directory()
    shutil.rmtree(first_file.get_processed_dir())

    logger.info("ENDING SALMON SUCCESS TEST!!!!!!!!")