def test_value_passing(self):
    """The keys added to job_context and returned by processors will be
    passed through to other processors.
    """
    batch, _ = init_objects()
    processor_job = ProcessorJob.create_job_and_relationships(batches=[batch])

    mock_processor = MagicMock()
    mock_context = {
        "something_to_pass_along": True,
        "job": processor_job,
        "batches": [batch],
    }
    mock_processor.return_value = mock_context

    def processor_function(job_context):
        self.assertTrue(job_context["something_to_pass_along"])
        return job_context

    test_processor = MagicMock(side_effect=processor_function)

    utils.run_pipeline(
        {"job_id": processor_job.id},
        [utils.start_job, mock_processor, test_processor, utils.end_job],
    )

    processor_job.refresh_from_db()
    self.assertTrue(processor_job.success)
    self.assertIsNotNone(processor_job.end_time)

    batch.refresh_from_db()
    self.assertEqual(batch.status, BatchStatuses.PROCESSED.value)
def affy_to_pcl(job_id: int) -> None:
    pipeline = Pipeline(name=utils.PipelineEnum.ARRAY_EXPRESS.value)
    utils.run_pipeline(
        {"job_id": job_id, "pipeline": pipeline},
        [
            utils.start_job,
            _prepare_files,
            _determine_brainarray_package,
            _run_scan_upc,
            _create_result_objects,
            utils.end_job,
        ],
    )
def agilent_twocolor_to_pcl(job_id: int) -> None:
    pipeline = Pipeline(name=utils.PipelineEnum.AGILENT_TWOCOLOR.value)
    utils.run_pipeline(
        {"job_id": job_id, "pipeline": pipeline},
        [
            utils.start_job,
            _prepare_files,
            _run_scan_twocolor,
            _create_result_objects,
            utils.end_job,
        ],
    )
def salmon(job_id: int) -> None:
    """Main processor function for the Salmon Processor.

    Runs the salmon quant command-line tool, specifying either a long
    or short read length.
    """
    utils.run_pipeline(
        {"job_id": job_id},
        [
            utils.start_job,
            _set_job_prefix,
            _prepare_files,
            _determine_index_length,
            _download_index,
            _run_salmon,
            _zip_and_upload,
            utils.cleanup_raw_files,
            utils.end_job,
        ],
    )
def salmon(job_id: int) -> None:
    """Main processor function for the Salmon Processor.

    Runs the salmon quant command-line tool, specifying either a long
    or short read length. Also runs Salmontools and Tximport.
    """
    pipeline = Pipeline(name=PipelineEnum.SALMON.value)
    final_context = utils.run_pipeline(
        {"job_id": job_id, "pipeline": pipeline},
        [
            utils.start_job,
            _set_job_prefix,
            _prepare_files,
            _determine_index_length,
            _find_or_download_index,
            _run_salmon,
            get_tximport_inputs,
            tximport,
            _run_salmontools,
            utils.end_job,
        ],
    )
    return final_context
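# The salmon docstrings above mention that `salmon quant` is run against an
# index chosen for long or short reads. A minimal sketch of that idea outside
# the pipeline follows; the helper name `run_salmon_quant`, the paths, and the
# 75 bp cutoff are illustrative assumptions, not the project's actual
# _determine_index_length/_run_salmon implementation.
import subprocess


def run_salmon_quant(read_length: int, reads_1: str, reads_2: str, output_dir: str) -> None:
    # Hypothetical convention: shorter reads use a short-k-mer index directory.
    index_dir = "/work/index_short" if read_length <= 75 else "/work/index_long"
    subprocess.run(
        [
            "salmon", "quant",
            "-i", index_dir,  # transcriptome index built by `salmon index`
            "-l", "A",        # let salmon infer the library type
            "-1", reads_1,
            "-2", reads_2,
            "-o", output_dir,
        ],
        check=True,
    )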
def create_compendia(job_id: int) -> None:
    pipeline = Pipeline(name=PipelineEnum.CREATE_COMPENDIA.value)
    job_context = utils.run_pipeline(
        {"job_id": job_id, "pipeline": pipeline}, COMPENDIA_PIPELINE
    )
    return job_context
def build_transcriptome_index(job_id: int, length="long") -> None:
    """The main function for the Transcriptome Index Processor.

    The steps in this process are as follows:
      * First, files are retrieved from Temporary Storage.
      * Next, they are prepared by removing pseudogenes from the gtf file.
      * Next, the tool RSEM's prepare-reference is run.
      * Finally, the salmon index command is run.

    The output of salmon index is a directory which is pushed in full
    to Permanent Storage.
    """
    pipeline = Pipeline(name=PipelineEnum.TX_INDEX.value)
    return utils.run_pipeline(
        {"job_id": job_id, "length": length, "pipeline": pipeline},
        [
            utils.start_job,
            _compute_paths,
            _prepare_files,
            _extract_assembly_information,
            _process_gtf,
            _create_index,
            _zip_index,
            _populate_index_object,
            utils.end_job,
        ],
    )
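# The docstring above names two external commands: RSEM's prepare-reference and
# `salmon index`. A rough, self-contained sketch of those two calls follows; the
# helper name `build_index`, the paths, and the k-mer size are assumptions for
# illustration rather than the project's actual _create_index step.
import subprocess


def build_index(genome_fasta: str, gtf_file: str, work_dir: str) -> str:
    # RSEM extracts transcript sequences from the genome using the GTF annotation.
    subprocess.run(
        ["rsem-prepare-reference", "--gtf", gtf_file, genome_fasta, f"{work_dir}/rsem_ref"],
        check=True,
    )

    # salmon builds its quantification index from the extracted transcripts.
    index_dir = f"{work_dir}/salmon_index"
    subprocess.run(
        [
            "salmon", "index",
            "-t", f"{work_dir}/rsem_ref.transcripts.fa",  # produced by rsem-prepare-reference
            "-i", index_dir,
            "-k", "31",  # assumed k-mer size; a "short" index would use a smaller k
        ],
        check=True,
    )
    return index_dir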
def run_janitor(job_id: int) -> None:
    pipeline = Pipeline(name=utils.PipelineEnum.JANITOR.value)
    job_context = utils.run_pipeline(
        {"job_id": job_id, "pipeline": pipeline},
        [utils.start_job, _find_and_remove_expired_jobs, utils.end_job],
    )
    return job_context
def no_op_processor(job_id: int) -> None:
    pipeline = Pipeline(name=utils.PipelineEnum.NO_OP.value)
    return utils.run_pipeline(
        {"job_id": job_id, "pipeline": pipeline},
        [utils.start_job, _prepare_files, _convert_genes, _create_result, utils.end_job],
    )
def create_compendia(job_id: int) -> None:
    pipeline = Pipeline(name=utils.PipelineEnum.COMPENDIA.value)
    job_context = utils.run_pipeline(
        {"job_id": job_id, "pipeline": pipeline},
        [
            utils.start_job,
            _prepare_input,
            _perform_imputation,
            _create_result_objects,
            utils.end_job,
        ],
    )
    return job_context
def illumina_to_pcl(job_id: int) -> None:
    pipeline = Pipeline(name=utils.PipelineEnum.ILLUMINA.value)
    return utils.run_pipeline(
        {"job_id": job_id, "pipeline": pipeline},
        [
            utils.start_job,
            _prepare_files,
            _detect_columns,
            _detect_platform,
            _run_illumina,
            _create_result_objects,
            utils.end_job,
        ],
    )
def create_qn_reference(job_id: int) -> None:
    pipeline = Pipeline(name=utils.PipelineEnum.QN_REFERENCE.value)
    job_context = utils.run_pipeline(
        {"job_id": job_id, "pipeline": pipeline},
        [
            utils.start_job,
            _prepare_input,
            _quantile_normalize,
            _verify_result,
            _create_result_objects,
            utils.end_job,
        ],
    )
    return job_context
def test_processor_failure(self):
    processor_job = ProcessorJob()
    processor_job.save()
    job_context = {
        "job_id": processor_job.id,
        "job": processor_job,
        "batches": [],
    }

    mock_processor = MagicMock()
    mock_processor.__name__ = "Fake processor."
    return_context = copy.copy(job_context)
    return_context["success"] = False
    mock_processor.return_value = return_context

    utils.run_pipeline(job_context, [mock_processor])
    self.assertEqual(mock_processor.call_count, 1)

    processor_job.refresh_from_db()
    self.assertFalse(processor_job.success)
    self.assertIsNotNone(processor_job.end_time)
def build_transcriptome_index(job_id: int) -> None:
    """The main function for the Transcriptome Index Processor.

    The steps in this process are as follows:
      * First, files are retrieved from Temporary Storage.
      * Next, they are prepared by removing pseudogenes from the gtf file.
      * Next, the tool RSEM's prepare-reference is run.
      * Finally, the salmon index command is run.

    The output of salmon index is a directory which is pushed in full
    to Permanent Storage.
    """
    utils.run_pipeline(
        {"job_id": job_id},
        [
            utils.start_job,
            _set_job_prefix,
            _prepare_files,
            _process_gtf,
            _create_index,
            _zip_index,
            utils.upload_processed_files,
            utils.cleanup_raw_files,
            utils.end_job,
        ],
    )
def smash(job_id: int, upload=True) -> None:
    """Main Smasher interface."""
    pipeline = Pipeline(name=utils.PipelineEnum.SMASHER.value)
    return utils.run_pipeline(
        {"job_id": job_id, "upload": upload, "pipeline": pipeline},
        [
            utils.start_job,
            _prepare_files,
            _smash,
            _upload,
            _notify,
            _update_result_objects,
            utils.end_job,
        ],
    )
def create_qn_reference(job_id: int, create_results=True) -> None:
    pipeline = Pipeline(name=PipelineEnum.QN_REFERENCE.value)
    job_context = utils.run_pipeline(
        {"job_id": job_id, "pipeline": pipeline, "create_results": create_results},
        [
            utils.start_job,
            _prepare_input,
            _build_qn_target,
            _create_result_objects,
            _update_caches,
            utils.end_job,
        ],
    )
    return job_context
def tximport(job_id: int) -> None:
    """Main processor function for the Tximport Processor.

    Runs the tximport command-line tool on an experiment.
    """
    pipeline = Pipeline(name=utils.PipelineEnum.TXIMPORT.value)
    final_context = utils.run_pipeline(
        {"job_id": job_id, "pipeline": pipeline},
        [
            utils.start_job,
            _set_job_prefix,
            _prepare_files,
            salmon._find_or_download_index,
            salmon.tximport,
            utils.end_job,
        ],
    )
    return final_context
def _perform_imputation(job_context: Dict) -> Dict:
    """Take the inputs and perform the primary imputation.

    Via https://github.com/AlexsLemonade/refinebio/issues/508#issuecomment-435879283:
      - Calculate the sum of the lengthScaledTPM values for each row (gene) of the
        rnaseq_expression_matrix (rnaseq_row_sums).
      - Calculate the 10th percentile of rnaseq_row_sums.
      - Drop all rows in rnaseq_expression_matrix with a row sum < 10th percentile
        of rnaseq_row_sums; this is now filtered_rnaseq_matrix.
      - log2(x + 1) transform filtered_rnaseq_matrix; this is now log2_rnaseq_matrix.
      - Set all zero values in log2_rnaseq_matrix to NA, but make sure to keep track
        of where these zeroes are.
      - Perform a full outer join of microarray_expression_matrix and
        log2_rnaseq_matrix; this is now combined_matrix.
      - Remove genes (rows) with >30% missing values in combined_matrix.
      - Remove samples (columns) with >50% missing values in combined_matrix.
      - "Reset" zero values that were set to NA in RNA-seq samples (i.e., make these
        zero again) in combined_matrix.
      - Transpose combined_matrix; this is now transposed_matrix.
      - Perform imputation of missing values with IterativeSVD (rank=10) on the
        transposed_matrix, using the specified SVD algorithm (or skip it); this is
        now imputed_matrix.
      - Untranspose imputed_matrix (genes are now rows, samples are now columns).
    """
    imputation_start = log_state("start perform imputation", job_context["job"].id)
    job_context["time_start"] = timezone.now()

    job_context = utils.run_pipeline(
        job_context,
        [
            _filter_rnaseq_matrix,
            _log2_transform_matrix,
            _cached_remove_zeroes,
            _full_outer_join_gene_matrices,
            _filter_rows_and_columns,
            _reset_zero_values,
            _run_iterativesvd,
        ],
    )

    job_context["time_end"] = timezone.now()
    job_context["formatted_command"] = ["create_compendia.py"]
    log_state("end perform imputation", job_context["job"].id, imputation_start)
    return job_context
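# As a concrete illustration of the first few docstring steps above (row-sum
# filtering, the log2(x + 1) transform, and masking zeroes as NA), here is a
# minimal pandas sketch. The helper name `filter_and_log2` and its standalone
# shape are assumptions for illustration; in the pipeline this work is done by
# _filter_rnaseq_matrix, _log2_transform_matrix, and _cached_remove_zeroes.
from typing import Tuple

import numpy as np
import pandas as pd


def filter_and_log2(rnaseq_expression_matrix: pd.DataFrame) -> Tuple[pd.DataFrame, pd.DataFrame]:
    # Sum of lengthScaledTPM values per gene (row), then the 10th-percentile cutoff.
    rnaseq_row_sums = rnaseq_expression_matrix.sum(axis=1)
    cutoff = rnaseq_row_sums.quantile(0.1)

    # Drop rows whose sum falls below the cutoff, then log2(x + 1) transform.
    filtered_rnaseq_matrix = rnaseq_expression_matrix[rnaseq_row_sums >= cutoff]
    log2_rnaseq_matrix = np.log2(filtered_rnaseq_matrix + 1)

    # Mask zeroes as NA, remembering where they were so they can be reset later.
    zero_mask = log2_rnaseq_matrix == 0
    return log2_rnaseq_matrix.mask(zero_mask), zero_mask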
def create_quantpendia(job_id: int) -> None:
    pipeline = Pipeline(name=PipelineEnum.CREATE_QUANTPENDIA.value)
    job_context = utils.run_pipeline(
        {"job_id": job_id, "pipeline": pipeline},
        [
            utils.start_job,
            _make_dirs,
            _download_files,
            _add_metadata,
            _make_archive,
            _create_result_objects,
            _remove_job_dir,
            utils.end_job,
        ],
    )
    return job_context
def create_compendia(job_id: int) -> None:
    pipeline = Pipeline(name=PipelineEnum.CREATE_COMPENDIA.value)
    job_context = utils.run_pipeline(
        {"job_id": job_id, "pipeline": pipeline},
        [
            utils.start_job,
            _prepare_input,
            _prepare_frames,
            _perform_imputation,
            smashing_utils.write_non_data_files,
            _create_result_objects,
            utils.end_job,
        ],
    )
    return job_context
def test_detect_columns(self):
    organism = Organism(name="HOMO_SAPIENS", taxonomy_id=9606, is_scientific_name=True)
    organism.save()

    job = prepare_illumina_job({**GSE22427, "organism": organism})

    pipeline = Pipeline(name=PipelineEnum.ILLUMINA.value)
    final_context = utils.run_pipeline(
        {"job_id": job.id, "pipeline": pipeline},
        [
            utils.start_job,
            illumina._prepare_files,
            illumina._detect_encoding,
            illumina._sanitize_input_file,
            illumina._convert_sanitized_to_tsv,
            illumina._detect_columns,
        ],
    )

    self.assertNotEqual(final_context.get("success"), False)

    # For this experiment, the probe ID is the first column.
    self.assertEqual(final_context.get("probeId"), GSE22427_HEADER[0])

    expected_column_ids = ",".join(
        map(
            lambda t: str(t[0]),
            filter(
                # For this header file, the samples all have the prefix LV-.
                lambda t: t[1].startswith("LV-"),
                # We use start=1 here because the column IDs are formatted
                # for R code, so they treat the header as a 1-indexed list.
                enumerate(GSE22427_HEADER, start=1),
            ),
        )
    )
    self.assertEqual(final_context.get("columnIds"), expected_column_ids)
def smash(job_id: int, upload=True) -> None:
    """Main Smasher interface."""
    pipeline = Pipeline(name=PipelineEnum.SMASHER.value)
    job_context = utils.run_pipeline(
        {"job_id": job_id, "upload": upload, "pipeline": pipeline},
        [
            utils.start_job,
            smashing_utils.prepare_files,
            _smash_all,
            _upload,
            _update_result_objects,
            utils.end_job,
        ],
    )
    # Ensure that `_notify` is always called, so that users get an email whether
    # processing succeeds or fails.
    job_context = _notify(job_context)
    return job_context
def salmon(job_id: int) -> None:
    """Main processor function for the Salmon Processor.

    Runs the salmon quant command-line tool, specifying either a long
    or short read length. Also runs FastQC, MultiQC, and Salmontools.
    """
    pipeline = Pipeline(name=utils.PipelineEnum.SALMON.value)
    final_context = utils.run_pipeline(
        {"job_id": job_id, "pipeline": pipeline},
        [
            utils.start_job,
            _set_job_prefix,
            _prepare_files,
            _extract_sra,
            _determine_index_length,
            _find_or_download_index,
            _run_fastqc,
            _run_salmon,
            _run_salmontools,
            _run_multiqc,
            utils.end_job,
        ],
    )
    return final_context
def test_imputation(self):
    job = ProcessorJob()
    job.pipeline_applied = ProcessorPipeline.CREATE_COMPENDIA.value
    job.save()

    # MICROARRAY TECH
    experiment = Experiment()
    experiment.accession_code = "GSE1234"
    experiment.save()

    result = ComputationalResult()
    result.save()

    qn_target = ComputedFile()
    qn_target.filename = "danio_target.tsv"
    qn_target.absolute_file_path = "/home/user/data_store/QN/danio_target.tsv"
    qn_target.is_qn_target = True
    qn_target.size_in_bytes = "12345"
    qn_target.sha1 = "aabbccddeeff"
    qn_target.result = result
    qn_target.save()

    danio_rerio = Organism(name="DANIO_RERIO", taxonomy_id=1, qn_target=result)
    danio_rerio.save()

    cra = ComputationalResultAnnotation()
    cra.data = {}
    cra.data["organism_id"] = danio_rerio.id
    cra.data["is_qn"] = True
    cra.result = result
    cra.save()

    result = ComputationalResult()
    result.save()

    micros = []
    for file in os.listdir("/home/user/data_store/raw/TEST/MICROARRAY/"):
        if "microarray.txt" in file:
            continue

        create_sample_for_experiment(
            {
                "organism": danio_rerio,
                "accession_code": file,
                "technology": "MICROARRAY",
                "filename": file,
                "data_dir": "/home/user/data_store/raw/TEST/MICROARRAY/",
            },
            experiment,
        )

        micros.append(file)

    experiment = Experiment()
    experiment.accession_code = "GSE5678"
    experiment.save()

    result = ComputationalResult()
    result.save()

    rnas = []
    for file in os.listdir("/home/user/data_store/raw/TEST/RNASEQ/"):
        if "rnaseq.txt" in file:
            continue

        create_sample_for_experiment(
            {
                "organism": danio_rerio,
                "accession_code": file,
                "technology": "RNA-SEQ",
                "filename": file,
                "data_dir": "/home/user/data_store/raw/TEST/RNASEQ/",
            },
            experiment,
        )

        rnas.append(file)

    # Missing sample that will be filtered
    sample = create_sample_for_experiment(
        {
            "organism": danio_rerio,
            "accession_code": "GSM1487222",
            "title": "this sample will be filtered",
            "technology": "RNA-SEQ",
            "filename": None,
        },
        experiment,
    )
    rnas.append(sample.accession_code)

    dset = Dataset()
    dset.data = {"GSE1234": micros, "GSE5678": rnas}
    dset.scale_by = "NONE"
    dset.aggregate_by = "SPECIES"
    dset.svd_algorithm = "ARPACK"
    dset.quantile_normalize = True
    dset.save()

    pjda = ProcessorJobDatasetAssociation()
    pjda.processor_job = job
    pjda.dataset = dset
    pjda.save()

    imputation_index = create_compendia.COMPENDIA_PIPELINE.index(
        create_compendia._perform_imputation
    )

    pipeline = Pipeline(name=PipelineEnum.CREATE_COMPENDIA.value)
    job_context = utils.run_pipeline(
        {"job_id": job.id, "pipeline": pipeline},
        create_compendia.COMPENDIA_PIPELINE[:imputation_index],
    )

    # First, run the imputation step without removing anything to get a baseline
    expected_context = utils.run_pipeline(
        job_context.copy(), [create_compendia.COMPENDIA_PIPELINE[imputation_index]]
    )

    # Now pick some rows to remove according to the instructions from
    # https://github.com/AlexsLemonade/refinebio/pull/2879#issuecomment-895143336
    random.seed(42)

    # Select some rows randomly and mask a little bit less than 30% of the values
    rare_rows = random.sample(list(job_context["microarray_matrix"].index), k=25)
    rare_genes = {}
    for row in rare_rows:
        cols = random.sample(
            list(job_context["microarray_matrix"].columns),
            # There are around 840 samples, and we want to pick a little bit
            # less than 30% of them
            k=int(0.28 * 840),
        )
        rare_genes[row] = cols
        for col in cols:
            job_context["microarray_matrix"].loc[row, col] = np.nan

    # Now randomly select some entries from the other rows to mask
    individual_indices = random.sample(
        list(
            itertools.product(
                set(job_context["microarray_matrix"].index) - set(rare_rows),
                job_context["microarray_matrix"].columns,
            )
        ),
        k=1000,
    )
    for row, col in individual_indices:
        job_context["microarray_matrix"].loc[row, col] = np.nan

    final_context = utils.run_pipeline(
        job_context, [create_compendia.COMPENDIA_PIPELINE[imputation_index]]
    )
    self.assertDidNotFail(job)

    index = set(final_context["merged_no_qn"].index) & set(
        expected_context["merged_no_qn"].index
    )
    columns = set(final_context["merged_no_qn"].columns) & set(
        expected_context["merged_no_qn"].columns
    )

    # Calculate the Root-Mean-Square Error (RMSE) of the imputed values.
    # See https://en.wikipedia.org/wiki/Root-mean-square_deviation
    # for a description of the formula.
    N = 0
    squared_error = 0
    affected_entries = {
        *individual_indices,
        *((row, col) for row, cols in rare_genes.items() for col in cols),
    }
    for row, col in affected_entries:
        if row in index and col in columns:
            actual = final_context["merged_no_qn"].loc[row, col]
            expected = expected_context["merged_no_qn"].loc[row, col]

            N += 1
            squared_error += (actual - expected) ** 2

    rmse = math.sqrt(squared_error / N)

    # The results of a previous run plus a little bit of leeway
    self.assertLess(abs(rmse - 0.2868600293662542), 0.05)
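# For reference, the quantity computed in the RMSE loop above, taken over the N
# masked entries that survive the row/column filtering, is the standard
# root-mean-square error:
#
#     RMSE = sqrt( (1 / N) * sum_{(row, col)} (actual[row, col] - expected[row, col])^2 )
#
# where `actual` comes from the run on the masked matrix and `expected` from the
# baseline run without masking.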
def affy_to_pcl(job_id: int) -> None:
    utils.run_pipeline(
        {"job_id": job_id},
        [
            utils.start_job,
            _prepare_files,
            _determine_brainarray_package,
            _run_scan_upc,
            utils.upload_processed_files,
            utils.cleanup_raw_files,
            utils.end_job,
        ],
    )
def no_op_processor(job_id: int) -> None:
    utils.run_pipeline(
        {"job_id": job_id},
        [utils.start_job, _no_op_processor_fn, utils.end_job],
    )
def test_no_job(self):
    mock_processor = MagicMock()
    utils.run_pipeline({"job_id": 100}, [mock_processor])
    mock_processor.assert_not_called()