def prepare_job():
    pj = ProcessorJob()
    pj.pipeline_applied = "AFFY_TO_PCL"
    pj.save()

    original_file = OriginalFile()
    original_file.source_filename = "ftp://ftp.ebi.ac.uk/pub/databases/microarray/data/experiment/GEOD/E-GEOD-59071/E-GEOD-59071.raw.3.zip"
    original_file.filename = "GSM1426071_CD_colon_active_1.CEL"
    original_file.absolute_file_path = "/home/user/data_store/raw/TEST/CEL/GSM1426071_CD_colon_active_1.CEL"
    original_file.save()

    assoc1 = ProcessorJobOriginalFileAssociation()
    assoc1.original_file = original_file
    assoc1.processor_job = pj
    assoc1.save()

    c_elegans = Organism.get_object_for_name("CAENORHABDITIS_ELEGANS")

    sample = Sample()
    sample.title = "Heyo"
    sample.organism = c_elegans
    sample.is_processed = False
    sample.save()

    ogsa = OriginalFileSampleAssociation()
    ogsa.sample = sample
    ogsa.original_file = original_file
    ogsa.save()

    return pj

def create_qn_target(organism, platform, create_results=True):
    sample_codes_results = Sample.processed_objects.filter(
        platform_accession_code=platform,
        has_raw=True,
        technology="MICROARRAY",
        organism=organism,
        is_processed=True,
    ).values("accession_code")
    sample_codes = [res["accession_code"] for res in sample_codes_results]

    dataset = Dataset()
    dataset.data = {organism.name + "_(" + platform + ")": sample_codes}
    dataset.aggregate_by = "ALL"
    dataset.scale_by = "NONE"
    dataset.quantile_normalize = False
    dataset.save()

    job = ProcessorJob()
    job.pipeline_applied = "QN_REFERENCE"
    job.save()

    pjda = ProcessorJobDatasetAssociation()
    pjda.processor_job = job
    pjda.dataset = dataset
    pjda.save()

    return qn_reference.create_qn_reference(job.pk, create_results=create_results)

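# A hedged usage sketch for create_qn_target above: it assumes the database
# already holds processed microarray samples for the organism/platform pair.
# The organism name and platform accession here are hypothetical placeholders.
organism = Organism.get_object_for_name("DANIO_RERIO")
final_context = create_qn_target(organism, platform="zebgene11st", create_results=True)
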
def create_processor_job(self, pipeline="AFFY_TO_PCL", ram_amount=2048):
    job = ProcessorJob(
        pipeline_applied=pipeline,
        nomad_job_id="PROCESSOR/dispatch-1528945054-e8eaf540",
        ram_amount=ram_amount,
        num_retries=0,
        volume_index="1",
        success=None,
    )
    job.save()

    og_file = OriginalFile()
    og_file.source_filename = "doesn't matter"
    og_file.filename = "this either"
    og_file.absolute_file_path = "nor this"
    og_file.save()

    assoc1 = ProcessorJobOriginalFileAssociation()
    assoc1.original_file = og_file
    assoc1.processor_job = job
    assoc1.save()

    og_file = OriginalFile()
    og_file.source_filename = "doesn't matter"
    og_file.filename = "this either"
    og_file.absolute_file_path = "nor this"
    og_file.save()

    assoc = ProcessorJobOriginalFileAssociation()
    assoc.original_file = og_file
    assoc.processor_job = job
    assoc.save()

    return job

def test_filter_rnaseq_matrix_drop_row_sums(self):
    job = ProcessorJob()
    job.pipeline_applied = ProcessorPipeline.CREATE_COMPENDIA.value
    job.save()

    samples = list(str(i) for i in range(0, 10))
    df = pd.DataFrame(columns=samples)
    for i in range(1, 101):
        df.loc[str(i)] = {idx: i for idx in samples}

    job_context = {"rnaseq_matrix": df, "job": job}

    final_job_context = create_compendia._filter_rnaseq_matrix(job_context)
    filtered_matrix = final_job_context["filtered_rnaseq_matrix"]

    # Make sure that we are getting rid of intermediate results
    # appropriately. Because these matrices can be pretty heavy, the input
    # should not stick around in the job context like this.
    self.assertNotIn("rnaseq_matrix", final_job_context.keys())

    # We drop all rows below the 10th percentile in row sum, so we would
    # expect to drop rows 1 through 10 that we created above.
    self.assertEqual(set(filtered_matrix.index), set(str(i) for i in range(11, 101)))

def test_failure(self):
    """Fails because there are no files for the job."""
    processor_job = ProcessorJob()
    processor_job.save()
    job_context = utils.start_job({"job": processor_job})
    self.assertFalse(job_context["success"])

def test_notify(self):
    ds = Dataset()
    ds.data = {'GSM1487313': ['GSM1487313'], 'SRS332914': ['SRS332914']}
    ds.aggregate_by = 'SPECIES'
    ds.scale_by = 'STANDARD'
    ds.email_address = "*****@*****.**"
    ds.quantile_normalize = False
    ds.save()

    pj = ProcessorJob()
    pj.pipeline_applied = "SMASHER"
    pj.save()

    pjda = ProcessorJobDatasetAssociation()
    pjda.processor_job = pj
    pjda.dataset = ds
    pjda.save()

    job_context = {}
    job_context['job'] = pj
    job_context['dataset'] = ds
    job_context['upload'] = True
    job_context['result_url'] = 'https://s3.amazonaws.com/data-refinery-test-assets/all_the_things.jpg'

    final_context = smasher._notify(job_context)
    self.assertTrue(final_context.get('success', True))

def create_processor_job(pipeline="AFFY_TO_PCL", ram_amount=2048, start_time=None):
    og_file_1 = OriginalFile()
    og_file_1.source_filename = "doesn't matter"
    og_file_1.filename = "this either"
    og_file_1.absolute_file_path = "nor this"
    og_file_1.save()

    og_file_2 = OriginalFile()
    og_file_2.source_filename = "doesn't matter"
    og_file_2.filename = "this either"
    og_file_2.absolute_file_path = "nor this"
    og_file_2.save()

    downloader_job = None
    if pipeline == "AFFY_TO_PCL":
        downloader_job = DownloaderJob(
            downloader_task="SRA",
            batch_job_id="DEFAULT",
            num_retries=0,
            accession_code="NUNYA",
            success=None,
        )
        downloader_job.save()

        assoc = DownloaderJobOriginalFileAssociation()
        assoc.original_file = og_file_2
        assoc.downloader_job = downloader_job
        assoc.save()

        assoc1 = DownloaderJobOriginalFileAssociation()
        assoc1.original_file = og_file_1
        assoc1.downloader_job = downloader_job
        assoc1.save()

    processor_job = ProcessorJob(
        downloader_job=downloader_job,
        pipeline_applied=pipeline,
        batch_job_id="PROCESSOR/dispatch-1528945054-e8eaf540",
        ram_amount=ram_amount,
        num_retries=0,
        success=None,
        start_time=start_time,
    )
    processor_job.save()

    assoc1 = ProcessorJobOriginalFileAssociation()
    assoc1.original_file = og_file_1
    assoc1.processor_job = processor_job
    assoc1.save()

    assoc = ProcessorJobOriginalFileAssociation()
    assoc.original_file = og_file_2
    assoc.processor_job = processor_job
    assoc.save()

    return processor_job

def prepare_illumina_job(organism):
    pj = ProcessorJob()
    pj.pipeline_applied = "ILLUMINA_TO_PCL"
    pj.save()

    og_file = OriginalFile()
    og_file.source_filename = "ftp://ftp.ncbi.nlm.nih.gov/geo/series/GSE22nnn/GSE22427/suppl/GSE22427%5Fnon%2Dnormalized%2Etxt.gz"
    og_file.filename = "GSE22427_non-normalized.txt"
    og_file.absolute_file_path = "/home/user/data_store/raw/TEST/ILLUMINA/GSE22427_non-normalized.txt"
    og_file.is_downloaded = True
    og_file.save()

    assoc1 = ProcessorJobOriginalFileAssociation()
    assoc1.original_file = og_file
    assoc1.processor_job = pj
    assoc1.save()

    sample_names = [
        "LV-C&si-Control-1",
        "LV-C&si-Control-2",
        "LV-C&si-Control-3",
        "LV-C&si-EZH2-1",
        "LV-C&si-EZH2-2",
        "LV-C&si-EZH2-3",
        "LV-EZH2&si-EZH2-1",
        "LV-EZH2&si-EZH2-2",
        "LV-EZH2&si-EZH2-3",
        "LV-T350A&si-EZH2-1",
        "LV-T350A&si-EZH2-2",
        "LV-T350A&si-EZH2-3",
    ]

    for name in sample_names:
        sample = Sample()
        sample.accession_code = name
        sample.title = name
        sample.organism = organism
        sample.save()

        sa = SampleAnnotation()
        sa.sample = sample
        sa.data = {"description": [name]}
        sa.is_ccdl = False
        sa.save()

        sample_assoc = OriginalFileSampleAssociation()
        sample_assoc.original_file = og_file
        sample_assoc.sample = sample
        sample_assoc.save()

    sample = Sample.objects.get(title="LV-T350A&si-EZH2-3")
    sample.title = "ignoreme_for_description"
    sample.accession_code = "ignoreme_for_description"
    sample.save()

    return pj

def test_fail(self):
    """Test our ability to fail."""
    result = ComputationalResult()
    result.save()

    sample = Sample()
    sample.accession_code = 'XXX'
    sample.title = 'XXX'
    sample.organism = Organism.get_object_for_name("HOMO_SAPIENS")
    sample.save()

    sra = SampleResultAssociation()
    sra.sample = sample
    sra.result = result
    sra.save()

    computed_file = ComputedFile()
    computed_file.filename = "NOT_REAL.PCL"
    computed_file.absolute_file_path = "/home/user/data_store/PCL/" + computed_file.filename
    computed_file.result = result
    computed_file.size_in_bytes = 123
    computed_file.is_smashable = True
    computed_file.save()

    assoc = SampleComputedFileAssociation()
    assoc.sample = sample
    assoc.computed_file = computed_file
    assoc.save()

    ds = Dataset()
    ds.data = {'GSE51081': ['XXX']}
    ds.aggregate_by = 'EXPERIMENT'
    ds.scale_by = 'MINMAX'
    ds.email_address = "*****@*****.**"
    ds.quantile_normalize = False
    ds.save()
    dsid = ds.id

    job = ProcessorJob()
    job.pipeline_applied = "SMASHER"
    job.save()

    pjda = ProcessorJobDatasetAssociation()
    pjda.processor_job = job
    pjda.dataset = ds
    pjda.save()

    final_context = smasher.smash(job.pk, upload=False)

    ds = Dataset.objects.get(id=dsid)
    print(ds.failure_reason)
    print(final_context['dataset'].failure_reason)
    self.assertNotEqual(final_context['unsmashable_files'], [])

def handle(self, *args, **options):
    pj = ProcessorJob()
    pj.pipeline_applied = "JANITOR"
    pj.save()

    final_context = run_janitor(pj.pk)

    print("Removed: ")
    for item in final_context['deleted_items']:
        print('\t - ' + item)

    sys.exit(0)

def prepare_job():
    pj = ProcessorJob()
    pj.pipeline_applied = "SALMON"
    pj.save()

    c_elegans = Organism.get_object_for_name("CAENORHABDITIS_ELEGANS")

    samp = Sample()
    samp.accession_code = "SALMON"  # So the test files go to the right place
    samp.organism = c_elegans
    samp.source_database = 'SRA'
    samp.technology = 'RNA-SEQ'
    samp.save()

    prepare_organism_indices()

    og_file = OriginalFile()
    og_file.source_filename = "ERR1562482_1.fastq.gz"
    og_file.filename = "ERR1562482_1.fastq.gz"
    og_file.absolute_file_path = "/home/user/data_store/raw/TEST/SALMON/ERR1562482_1.fastq.gz"
    og_file.is_downloaded = True
    og_file.save()

    og_file2 = OriginalFile()
    og_file2.source_filename = "ERR1562482_2.fastq.gz"
    og_file2.filename = "ERR1562482_2.fastq.gz"
    og_file2.absolute_file_path = "/home/user/data_store/raw/TEST/SALMON/ERR1562482_2.fastq.gz"
    og_file2.is_downloaded = True
    og_file2.save()

    og_file_samp_assoc = OriginalFileSampleAssociation()
    og_file_samp_assoc.original_file = og_file
    og_file_samp_assoc.sample = samp
    og_file_samp_assoc.save()

    og_file_samp_assoc2 = OriginalFileSampleAssociation()
    og_file_samp_assoc2.original_file = og_file2
    og_file_samp_assoc2.sample = samp
    og_file_samp_assoc2.save()

    assoc1 = ProcessorJobOriginalFileAssociation()
    assoc1.original_file = og_file
    assoc1.processor_job = pj
    assoc1.save()

    assoc1 = ProcessorJobOriginalFileAssociation()
    assoc1.original_file = og_file2
    assoc1.processor_job = pj
    assoc1.save()

    return pj, [og_file, og_file2]

def prepare_illumina_job(job_info: Dict) -> ProcessorJob:
    pj = ProcessorJob()
    pj.pipeline_applied = "ILLUMINA_TO_PCL"
    pj.save()

    og_file = OriginalFile()
    og_file.source_filename = job_info["source_filename"]
    og_file.filename = job_info["filename"]
    og_file.absolute_file_path = job_info["absolute_file_path"]
    og_file.is_downloaded = True
    og_file.save()

    assoc1 = ProcessorJobOriginalFileAssociation()
    assoc1.original_file = og_file
    assoc1.processor_job = pj
    assoc1.save()

    for s in job_info["samples"]:
        # For convenience, if you give a list of strings we'll just use the
        # strings as both titles and accessions.
        annotation = None
        if type(s) == str:
            accession_code = s
            title = s
        elif type(s) == tuple and list(map(type, s)) == [str, str]:
            accession_code, title = s
        elif type(s) == tuple and list(map(type, s)) == [str, str, dict]:
            accession_code, title, annotation = s
        else:
            raise ValueError(f"Invalid sample type for sample {s}")

        sample = Sample()
        sample.accession_code = accession_code
        sample.title = title
        sample.organism = job_info["organism"]
        sample.save()

        sa = SampleAnnotation()
        sa.sample = sample
        sa.data = annotation if annotation is not None else {"description": [title]}
        sa.is_ccdl = False
        sa.save()

        sample_assoc = OriginalFileSampleAssociation()
        sample_assoc.original_file = og_file
        sample_assoc.sample = sample
        sample_assoc.save()

    return pj

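# A minimal usage sketch for prepare_illumina_job above. All accessions and
# paths here are hypothetical placeholders; samples may be plain accession
# strings, (accession, title) tuples, or (accession, title, annotation_dict)
# tuples, matching the three branches parsed in the function.
pj = prepare_illumina_job({
    "source_filename": "ftp://example.org/GSE00000_non-normalized.txt.gz",
    "filename": "GSE00000_non-normalized.txt",
    "absolute_file_path": "/home/user/data_store/raw/TEST/ILLUMINA/GSE00000_non-normalized.txt",
    "organism": organism,
    "samples": [
        "GSM0000001",
        ("GSM0000002", "control-2"),
        ("GSM0000003", "treated-1", {"description": ["treated replicate 1"]}),
    ],
})
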
def _try_sanitizing_file(file: str) -> str:
    pj = ProcessorJob()
    pj.pipeline_applied = "ILLUMINA_TO_PCL"
    pj.save()

    og_file = _make_original_file_with_contents(file)

    job_context = illumina._prepare_files({
        "job_id": pj.pk,
        "original_files": [og_file],
        "job": pj,
    })
    job_context = illumina._detect_encoding(job_context)
    return illumina._sanitize_input_file(job_context)

def create_job_for_organism(organism: Organism):
    """Returns a quantpendia job for the provided organism."""
    job = ProcessorJob()
    job.pipeline_applied = ProcessorPipeline.CREATE_QUANTPENDIA.value
    job.save()

    dset = Dataset()
    dset.data = build_dataset(organism)
    dset.scale_by = "NONE"
    dset.aggregate_by = "EXPERIMENT"
    dset.quantile_normalize = False
    dset.quant_sf_only = True
    dset.svd_algorithm = "NONE"
    dset.save()

    pjda = ProcessorJobDatasetAssociation()
    pjda.processor_job = job
    pjda.dataset = dset
    pjda.save()

    # Have to call this after setting the dataset since it's used in
    # the calculation.
    job.ram_amount = determine_ram_amount(job)
    job.save()

    return job

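# A hedged dispatch sketch: create_job_for_organism only builds and saves the
# job, so a caller still has to queue it. This reuses send_job as it appears
# in send_janitor_jobs below; the organism lookup is a placeholder.
organism = Organism.get_object_for_name("HOMO_SAPIENS")
quantpendia_job = create_job_for_organism(organism)
send_job(ProcessorPipeline.CREATE_QUANTPENDIA, job=quantpendia_job, is_dispatch=True)
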
def test_convert_processed_illumina(self):
    job = ProcessorJob()
    job.pipeline_applied = "NO_OP"
    job.save()

    # ex:
    # Reporter Identifier    VALUE        Detection Pval
    # ILMN_1343291           14.943602    0
    # ILMN_1343295           13.528082    0
    og_file = OriginalFile()
    og_file.source_filename = "https://www.ebi.ac.uk/arrayexpress/experiments/E-GEOD-22433/"
    og_file.filename = "GSM557500_sample_table.txt"
    og_file.absolute_file_path = "/home/user/data_store/raw/TEST/NO_OP/GSM557500_sample_table.txt"
    og_file.is_downloaded = True
    og_file.save()

    homo_sapiens = Organism(name="HOMO_SAPIENS", taxonomy_id=9606, is_scientific_name=True)
    homo_sapiens.save()

    sample = Sample()
    sample.accession_code = "GSM557500"
    sample.title = "GSM557500"
    sample.platform_accession_code = "A-MEXP-1171"
    sample.manufacturer = "ILLUMINA"
    sample.organism = homo_sapiens
    sample.save()

    assoc = OriginalFileSampleAssociation()
    assoc.original_file = og_file
    assoc.sample = sample
    assoc.save()

    assoc1 = ProcessorJobOriginalFileAssociation()
    assoc1.original_file = og_file
    assoc1.processor_job = job
    assoc1.save()

    # To:
    # ENSG00000156508    14.943602
    # ENSG00000111640    13.528082
    final_context = no_op.no_op_processor(job.pk)
    self.assertTrue(final_context["success"])
    self.assertTrue(os.path.exists(final_context["output_file_path"]))
    self.assertEqual(os.path.getsize(final_context["output_file_path"]), 920374)
    self.assertTrue(no_op.check_output_quality(final_context["output_file_path"]))

def test_good_detection(self):
    """GSE54661 appears to be mislabeled (illuminaHumanv4) on GEO.

    Shows that our detector works.
    """
    from data_refinery_workers.processors import illumina

    pj = ProcessorJob()
    pj.pipeline_applied = "ILLUMINA_TO_PCL"
    pj.save()

    og_file = OriginalFile()
    og_file.source_filename = "ftp://ftp.ncbi.nlm.nih.gov/geo/series/GSE54nnn/GSE54661/suppl/GSE54661%5Fnon%5Fnormalized%2Etxt%2Egz"
    og_file.filename = "GSE54661_non_normalized.txt"
    og_file.absolute_file_path = "/home/user/data_store/raw/TEST/ILLUMINA/GSE54661_non_normalized.txt"
    og_file.is_downloaded = True
    og_file.save()

    assoc1 = ProcessorJobOriginalFileAssociation()
    assoc1.original_file = og_file
    assoc1.processor_job = pj
    assoc1.save()

    organism = Organism(name="HOMO_SAPIENS", taxonomy_id=9606, is_scientific_name=True)
    organism.save()

    sample = Sample()
    sample.accession_code = "ABCD-1234"
    sample.title = "hypoxia_Signal"
    sample.organism = organism
    sample.save()

    sample_assoc = OriginalFileSampleAssociation()
    sample_assoc.original_file = og_file
    sample_assoc.sample = sample
    sample_assoc.save()

    final_context = illumina.illumina_to_pcl(pj.pk)
    self.assertEqual(final_context["platform"], "illuminaHumanv3")

    for key in final_context["samples"][0].sampleannotation_set.all()[0].data.keys():
        self.assertTrue(key in ["detected_platform", "detection_percentage", "mapped_percentage"])

    # Clean up after the job, since that won't happen automatically when we
    # aren't running in the cloud.
    shutil.rmtree(final_context["work_dir"], ignore_errors=True)

def test_jobs_sanity(self):
    """Just makes sure creating Jobs doesn't fail."""
    s_job = SurveyJob()
    s_job.save()

    processor_job = ProcessorJob()
    processor_job.pipeline_applied = "test0"
    processor_job.save()

    dl_job = DownloaderJob()
    dl_job.downloader_task = "XYZ"
    dl_job.accession_code = "123"
    dl_job.save()

def create_job_for_organism(organisms: List[Organism], svd_algorithm="ARPACK"):
    """Returns a compendia job for the provided organisms.

    Fetches all of the experiments and compiles a large but normally
    formatted Dataset.
    """
    job = ProcessorJob()
    job.pipeline_applied = ProcessorPipeline.CREATE_COMPENDIA.value
    job.save()

    dataset = Dataset()
    dataset.data = get_dataset(organisms)
    dataset.scale_by = "NONE"
    dataset.aggregate_by = "SPECIES"
    dataset.quantile_normalize = True
    dataset.quant_sf_only = False
    dataset.svd_algorithm = svd_algorithm
    dataset.save()

    pjda = ProcessorJobDatasetAssociation()
    pjda.processor_job = job
    pjda.dataset = dataset
    pjda.save()

    # Have to call this after setting the dataset since it's used in
    # the calculation.
    job.ram_amount = determine_ram_amount(job)
    job.save()

    return job

def test_convert_illumina_no_header(self):
    job = ProcessorJob()
    job.pipeline_applied = "NO_OP"
    job.save()

    # ex:
    # ILMN_1885639    10.0000     0.7931
    # ILMN_2209417    10.0000     0.2029
    # ILMN_1765401    152.0873    0.0000
    og_file = OriginalFile()
    og_file.source_filename = (
        "https://github.com/AlexsLemonade/refinebio/files/2255178/GSM1089291-tbl-1.txt"
    )
    og_file.filename = "GSM1089291-tbl-1.txt"
    og_file.absolute_file_path = "/home/user/data_store/raw/TEST/NO_OP/GSM1089291-tbl-1.txt"
    og_file.is_downloaded = True
    og_file.save()

    homo_sapiens = Organism(name="HOMO_SAPIENS", taxonomy_id=9606, is_scientific_name=True)
    homo_sapiens.save()

    sample = Sample()
    sample.accession_code = "GSM557500"
    sample.title = "GSM557500"
    sample.platform_accession_code = "A-MEXP-1171"
    sample.manufacturer = "ILLUMINA"
    sample.organism = homo_sapiens
    sample.save()

    assoc = OriginalFileSampleAssociation()
    assoc.original_file = og_file
    assoc.sample = sample
    assoc.save()

    assoc1 = ProcessorJobOriginalFileAssociation()
    assoc1.original_file = og_file
    assoc1.processor_job = job
    assoc1.save()

    # To:
    # ENSG00000105675    10
    # ENSG00000085721    152.0873
    # ENSG00000278494    152.0873
    final_context = no_op.no_op_processor(job.pk)
    self.assertTrue(final_context["success"])
    self.assertTrue(os.path.exists(final_context["output_file_path"]))
    self.assertEqual(os.path.getsize(final_context["output_file_path"]), 786207)

def send_janitor_jobs():
    """Dispatch a Janitor job for each job queue.

    TODO: make this dispatch janitor jobs for all job queues.
    https://github.com/AlexsLemonade/refinebio/issues/2789
    """
    new_job = ProcessorJob(num_retries=0, pipeline_applied="JANITOR", ram_amount=2048)
    new_job.save()
    logger.info("Sending Janitor Job.", job_id=new_job.id)
    try:
        send_job(ProcessorPipeline["JANITOR"], job=new_job, is_dispatch=True)
    except Exception:
        # If we can't dispatch this job, something else has gone wrong;
        # we can get it on the next loop.
        return

def test_convert_illumina_bad_cols(self):
    """In the future, this test may be deprecated. For now it just alerts
    that it needs attention.
    """
    job = ProcessorJob()
    job.pipeline_applied = "NO_OP"
    job.save()

    # ex:
    # ILMN_1885639    10.0000     0.7931    11.0000    0.123
    # ILMN_2209417    10.0000     0.2029    11.1234    0.543
    # LMN_1765401     152.0873    0.0000    99.999     0.19
    og_file = OriginalFile()
    og_file.source_filename = (
        "https://github.com/AlexsLemonade/refinebio/files/2255178/GSM1089291-tbl-1-modified.txt"
    )
    og_file.filename = "GSM1089291-tbl-1-modified.txt"
    og_file.absolute_file_path = "/home/user/data_store/raw/TEST/NO_OP/GSM1089291-tbl-1-modified.txt"
    og_file.is_downloaded = True
    og_file.save()

    homo_sapiens = Organism(name="HOMO_SAPIENS", taxonomy_id=9606, is_scientific_name=True)
    homo_sapiens.save()

    sample = Sample()
    sample.accession_code = "GSM557500"
    sample.title = "GSM557500"
    sample.platform_accession_code = "A-MEXP-1171"
    sample.manufacturer = "ILLUMINA"
    sample.organism = homo_sapiens
    sample.save()

    assoc = OriginalFileSampleAssociation()
    assoc.original_file = og_file
    assoc.sample = sample
    assoc.save()

    assoc1 = ProcessorJobOriginalFileAssociation()
    assoc1.original_file = og_file
    assoc1.processor_job = job
    assoc1.save()

    final_context = no_op.no_op_processor(job.pk)
    self.assertFalse(final_context["success"])
    self.assertTrue("Tell Rich!" in final_context["job"].failure_reason)

def prepare_job(length):
    pj = ProcessorJob()
    pj.pipeline_applied = "TRANSCRIPTOME_INDEX_" + length.upper()
    pj.save()

    homo_sapiens = Organism.get_object_for_name("HOMO_SAPIENS", taxonomy_id=1001)

    samp = Sample()
    samp.organism = homo_sapiens
    samp.accession_code = "derp" + length
    samp.save()

    og_file = OriginalFile()
    og_file.source_filename = "aegilops_tauschii_" + length + ".fa.gz"
    og_file.filename = "aegilops_tauschii_" + length + ".fa.gz"
    og_file.absolute_file_path = "/home/user/data_store/raw/TEST/TRANSCRIPTOME_INDEX/AEGILOPS_TAUSCHII/aegilops_tauschii_short.fa.gz"
    og_file.is_downloaded = True
    og_file.save()

    og_file2 = OriginalFile()
    og_file2.source_filename = "aegilops_tauschii_" + length + ".gtf.gz"
    og_file2.filename = "aegilops_tauschii_" + length + ".gtf.gz"
    og_file2.absolute_file_path = "/home/user/data_store/raw/TEST/TRANSCRIPTOME_INDEX/AEGILOPS_TAUSCHII/aegilops_tauschii_short.gtf.gz"
    og_file2.is_downloaded = True
    og_file2.save()

    og_file_samp_assoc = OriginalFileSampleAssociation()
    og_file_samp_assoc.original_file = og_file
    og_file_samp_assoc.sample = samp
    og_file_samp_assoc.save()

    og_file_samp_assoc2 = OriginalFileSampleAssociation()
    og_file_samp_assoc2.original_file = og_file2
    og_file_samp_assoc2.sample = samp
    og_file_samp_assoc2.save()

    assoc1 = ProcessorJobOriginalFileAssociation()
    assoc1.original_file = og_file
    assoc1.processor_job = pj
    assoc1.save()

    assoc2 = ProcessorJobOriginalFileAssociation()
    assoc2.original_file = og_file2
    assoc2.processor_job = pj
    assoc2.save()

    return pj

def test_value_passing(self):
    """The keys added to job_context and returned by processors will be
    passed through to other processors.
    """
    batch, _ = init_objects()
    processor_job = ProcessorJob.create_job_and_relationships(batches=[batch])

    mock_processor = MagicMock()
    mock_context = {
        "something_to_pass_along": True,
        "job": processor_job,
        "batches": [batch],
    }
    mock_processor.return_value = mock_context

    def processor_function(job_context):
        self.assertTrue(job_context["something_to_pass_along"])
        return job_context

    test_processor = MagicMock(side_effect=processor_function)

    utils.run_pipeline(
        {"job_id": processor_job.id},
        [utils.start_job, mock_processor, test_processor, utils.end_job],
    )

    processor_job.refresh_from_db()
    self.assertTrue(processor_job.success)
    self.assertIsNotNone(processor_job.end_time)

    batch.refresh_from_db()
    self.assertEqual(batch.status, BatchStatuses.PROCESSED.value)

def test_single_read(self):
    """Test outputs when the sample has one read only."""
    job_context = {
        'job_id': 456,
        'job': ProcessorJob(),
        'pipeline': Pipeline(name="Salmon"),
        'input_file_path': self.test_dir + 'single_input/single_read.fastq',
        'output_directory': self.test_dir + 'single_output/',
        'salmontools_directory': self.test_dir + 'single_salmontools/',
        'salmontools_archive': self.test_dir + 'salmontools-result.tar.gz',
        'computed_files': [],
    }
    os.makedirs(job_context["salmontools_directory"], exist_ok=True)

    homo_sapiens = Organism.get_object_for_name("HOMO_SAPIENS")
    sample = Sample()
    sample.organism = homo_sapiens
    sample.save()
    job_context["sample"] = sample

    salmon._run_salmontools(job_context)

    # Confirm job status.
    self.assertTrue(job_context["success"])

    # Unpack the result for checking.
    os.system('gunzip ' + job_context['salmontools_directory'] + "*.gz")

    # Check the output file.
    output_file = job_context['salmontools_directory'] + 'unmapped_by_salmon.fa'
    expected_output_file = self.test_dir + 'expected_single_output/unmapped_by_salmon.fa'
    self.assertTrue(identical_checksum(output_file, expected_output_file))

def test_create_index_failure(self, mocked_subprocess):
    # Initialize the mock and test objects.
    mocked_subprocess.return_value = CompletedProcess(
        [], 1, stdout=None, stderr="Error: something went wrong.")
    batch, gtf_file, fasta_file = init_objects()
    processor_job = ProcessorJob.create_job_and_relationships(batches=[batch])

    # Mock out the job_context with everything the function under
    # test will expect.
    job_context = utils.start_job({
        "job": processor_job,
        "job_id": processor_job.id,
        "gtf_file": gtf_file,
        "gtf_file_path": "dummy",
        "fasta_file": fasta_file,
        "fasta_file_path": "dummy",
        "genes_to_transcripts_path": "dummy",
    })
    job_context = transcriptome_index._set_job_prefix(job_context)

    # The function being tested.
    job_context = transcriptome_index._create_index(job_context)

    self.assertFalse(job_context["success"])
    self.assertEqual(
        processor_job.failure_reason,
        ("Shell call to rsem-prepare-reference failed because: "
         "Error: something went wrong."))
    self.assertFalse(os.path.isfile(batch.files[0].get_temp_pre_path()))

def test_zip_index_failure(self):
    # Initialize test objects.
    batch, gtf_file, _ = init_objects()
    processor_job = ProcessorJob.create_job_and_relationships(batches=[batch])

    # Mock out the job_context with everything the function under
    # test will expect.
    job_context = utils.start_job({
        "job": processor_job,
        "job_id": processor_job.id,
        "gtf_file": gtf_file,
        "output_dir": "missing/index",
    })
    job_context = transcriptome_index._set_job_prefix(job_context)

    # The function being tested.
    job_context = transcriptome_index._zip_index(job_context)

    self.assertFalse(job_context["success"])
    self.assertEqual(
        processor_job.failure_reason,
        ("Exception caught while zipping index directory /home/user"
         "/data_store/temp/EnsemblPlants/TRANSCRIPTOME_INDEX/{}"
         "/aegilops_tauschii_short.tar.gz").format(job_context["job_dir_prefix"]))
    self.assertFalse(os.path.isfile(batch.files[0].get_temp_pre_path()))

def test_success(self):
    batch = init_objects()
    file = batch.files[0]
    batch.platform_accession_code = "TEST"
    batch.save()
    file.internal_location = "TEST/AFFY_TO_PCL"
    file.name = "GSM1426186_UC_colon_inactive_201.CEL"
    file.save()
    file.remove_temp_directory = MagicMock()
    processor_job = ProcessorJob.create_job_and_relationships(batches=[batch])

    # We have a test file in the repo, but it needs to be in the
    # correct location, which depends on the ID of the Batch, which
    # changes based on the order tests are run in.
    test_file_path = "/home/user/data_store/temp/TEST/AFFY_TO_PCL/" + file.name
    input_file_path = file.get_temp_pre_path()
    os.makedirs(file.get_temp_dir(), exist_ok=True)
    shutil.copyfile(test_file_path, input_file_path)

    job_context = {"job_id": processor_job.id,
                   "job": processor_job,
                   "input_file_path": input_file_path}
    job_context = array_express._determine_brainarray_package(job_context)

    self.assertEqual(job_context["brainarray_package"], "hugene10sthsentrezgprobe")
    file.remove_temp_directory.assert_not_called()

    # Clean up the copied file.
    os.remove(input_file_path)

def test_run_salmon_failure(self):
    batch, first_fastq_file, second_fastq_file = init_objects()
    processor_job = ProcessorJob.create_job_and_relationships(batches=[batch])

    # Mock out the job_context with everything the function under
    # test will expect.
    input_file_path_1 = first_fastq_file.get_temp_pre_path("dummy")
    input_file_path_2 = second_fastq_file.get_temp_pre_path("dummy")
    job_context = utils.start_job({
        "job": processor_job,
        "job_id": processor_job.id,
        "job_dir_prefix": "dummy",
        "batches": [batch],
        "index_directory": "missing",
        "input_file_path": input_file_path_1,
        "input_file_path_2": input_file_path_2,
        "output_directory": "blah",
    })

    # The function being tested.
    job_context = salmon._run_salmon(job_context)

    self.assertFalse(job_context["success"])
    self.assertNotEqual(processor_job.failure_reason, None)
    self.assertFalse(os.path.isfile(batch.files[0].get_temp_pre_path()))

def test_failure(self):
    batch = init_objects()
    batch.platform_accession_code = "TEST2"
    batch.save()
    file = batch.files[0]
    file.internal_location = "TEST2/AFFY_TO_PCL"
    file.name = "dummy"
    file.save()
    processor_job = ProcessorJob.create_job_and_relationships(batches=[batch])
    output_file_path = file.get_processed_path()
    job_context = {"job_id": processor_job.id,
                   "job": processor_job,
                   "batches": [batch]}

    # If the output file exists, remove it first.
    if os.path.isfile(output_file_path):
        os.remove(output_file_path)

    job_context = no_op._no_op_processor_fn(job_context)

    # success is only populated by this function on an error.
    self.assertFalse(job_context["success"])
    self.assertFalse(os.path.isfile(output_file_path))
    self.assertEqual(processor_job.failure_reason,
                     "Exception caught while moving file dummy")

def test_success(self, mock_file_objects):
    batch = init_objects()
    # Prevent the test file from getting removed.
    batch.files[0].remove_raw_files = MagicMock()
    processor_job = ProcessorJob.create_job_and_relationships(batches=[batch])
    output_file_path = batch.files[0].get_processed_path()
    job_context = {"job_id": processor_job.id,
                   "job": processor_job,
                   "batches": [batch]}

    # If the output file exists, remove it first.
    if os.path.isfile(output_file_path):
        os.remove(output_file_path)

    job_context = no_op._no_op_processor_fn(job_context)

    # success is only populated by this function on an error.
    self.assertTrue(job_context["success"])
    self.assertTrue(os.path.isfile(output_file_path))

    # Clean up the processed file.
    os.remove(output_file_path)