def _create_result_objects(job_context: Dict) -> Dict:
    """Build the ComputationalResult and ComputedFile records once a
    two-color SCAN run has finished, and associate them with every sample.
    """
    result = ComputationalResult()
    result.commands.append("SCAN.UPC::SCAN_TwoColor")
    result.is_ccdl = True
    result.is_public = True
    result.time_start = job_context["time_start"]
    result.time_end = job_context["time_end"]

    try:
        processor_key = "AGILENT_TWOCOLOR"
        result.processor = utils.find_processor(processor_key)
    except Exception as e:
        return utils.handle_processor_exception(job_context, processor_key, e)

    result.save()
    job_context["pipeline"].steps.append(result.id)

    # Create a ComputedFile for the sample, sync it S3 and save it.
    try:
        output_path = job_context["output_file_path"]
        computed_file = ComputedFile()
        computed_file.absolute_file_path = output_path
        computed_file.filename = os.path.basename(output_path)
        computed_file.calculate_sha1()
        computed_file.calculate_size()
        computed_file.result = result
        computed_file.is_smashable = True
        computed_file.is_qc = False
        computed_file.save()
        job_context["computed_files"].append(computed_file)
    except Exception:
        logger.exception(
            "Exception caught while moving file %s to S3",
            computed_file.filename,
            processor_job=job_context["job_id"],
        )
        job_context["job"].failure_reason = "Exception caught while moving file to S3"
        job_context["success"] = False
        return job_context

    # Tie the result (and the single computed file) to each input sample.
    for sample in job_context["samples"]:
        association = SampleResultAssociation()
        association.sample = sample
        association.result = result
        association.save()

        SampleComputedFileAssociation.objects.get_or_create(
            sample=sample, computed_file=computed_file
        )

    logger.info("Created %s", result)
    job_context["success"] = True
    return job_context
def _create_result_objects(job_context: Dict) -> Dict:
    """Persist the QN reference target: its ComputationalResult, the target
    ComputedFile, and an annotation describing the inputs that produced it.
    """
    result = ComputationalResult()
    result.commands.append(" ".join(job_context["formatted_command"]))
    result.is_ccdl = True
    result.is_public = True
    result.time_start = job_context["time_start"]
    result.time_end = job_context["time_end"]

    try:
        processor_key = "QN_REFERENCE"
        result.processor = utils.find_processor(processor_key)
    except Exception as e:
        return utils.handle_processor_exception(job_context, processor_key, e)

    result.save()

    target_path = job_context["target_file"]
    computed_file = ComputedFile()
    computed_file.absolute_file_path = target_path
    computed_file.filename = target_path.split("/")[-1]
    computed_file.calculate_sha1()
    computed_file.calculate_size()
    computed_file.is_smashable = False
    computed_file.is_qn_target = True
    computed_file.result = result
    computed_file.save()

    # Record which organism/platform/samples this QN target was built from.
    all_samples = job_context["samples"]["ALL"]
    annotation = ComputationalResultAnnotation()
    annotation.result = result
    annotation.data = {
        "organism_id": all_samples[0].organism_id,
        "is_qn": True,
        "platform_accession_code": all_samples[0].platform_accession_code,
        "samples": [sample.accession_code for sample in all_samples],
        "geneset": str(job_context["geneset"]),
        "num_valid_inputs": job_context["num_valid_inputs"],
    }
    annotation.save()

    # TODO: upload this to a public read bucket.
    # https://github.com/AlexsLemonade/refinebio/issues/586
    job_context["result"] = result
    job_context["computed_files"] = [computed_file]
    job_context["annotation"] = annotation
    job_context["success"] = True
    return job_context
def _create_result_objects(job_context: Dict) -> Dict:
    """Build the ComputationalResult and ComputedFile records once a
    SCANfast run has finished, and associate them with every sample.
    """
    result = ComputationalResult()
    result.commands.append("SCAN.UPC::SCANfast")
    result.is_ccdl = True
    result.is_public = True
    result.time_start = job_context["time_start"]
    result.time_end = job_context["time_end"]

    try:
        processor_key = "AFFYMETRIX_SCAN"
        result.processor = utils.find_processor(processor_key)
    except Exception as e:
        return utils.handle_processor_exception(job_context, processor_key, e)

    result.save()
    job_context["pipeline"].steps.append(result.id)

    # Create a ComputedFile for the sample
    output_path = job_context["output_file_path"]
    computed_file = ComputedFile()
    computed_file.absolute_file_path = output_path
    computed_file.filename = os.path.split(output_path)[-1]
    computed_file.calculate_sha1()
    computed_file.calculate_size()
    computed_file.result = result
    computed_file.is_smashable = True
    computed_file.is_qc = False
    computed_file.save()
    job_context["computed_files"].append(computed_file)

    # Tie the result (and the single computed file) to each input sample.
    for sample in job_context["samples"]:
        association = SampleResultAssociation()
        association.sample = sample
        association.result = result
        association.save()

        SampleComputedFileAssociation.objects.get_or_create(
            sample=sample, computed_file=computed_file
        )

    logger.debug("Created %s", result, processor_job=job_context["job_id"])
    job_context["success"] = True
    return job_context
def _create_result_objects(job_context: Dict) -> Dict:
    """Persist the QN reference target unless result creation is disabled.

    When ``create_results`` is falsy the job context is returned untouched;
    otherwise the ComputationalResult, target ComputedFile and an annotation
    describing the inputs are created and saved.
    """
    if not job_context["create_results"]:
        return job_context

    result = ComputationalResult()
    result.commands.append(" ".join(job_context["formatted_command"]))
    result.is_ccdl = True
    result.is_public = True
    result.time_start = job_context["time_start"]
    result.time_end = job_context["time_end"]

    try:
        processor_key = "QN_REFERENCE"
        result.processor = utils.find_processor(processor_key)
    except Exception as e:
        return utils.handle_processor_exception(job_context, processor_key, e)

    result.save()

    target_path = job_context["target_file"]
    computed_file = ComputedFile()
    computed_file.absolute_file_path = target_path
    computed_file.filename = target_path.split("/")[-1]
    computed_file.calculate_sha1()
    computed_file.calculate_size()
    computed_file.is_smashable = False
    computed_file.is_qn_target = True
    computed_file.result = result
    computed_file.save()

    # Record which organism/platform/samples this QN target was built from.
    all_samples = job_context["samples"]["ALL"]
    annotation = ComputationalResultAnnotation()
    annotation.result = result
    annotation.data = {
        "organism_id": all_samples[0].organism_id,
        "is_qn": True,
        "platform_accession_code": all_samples[0].platform_accession_code,
        "samples": [sample.accession_code for sample in all_samples],
        "geneset": str(job_context["geneset"]),
        "num_valid_inputs": job_context["num_valid_inputs"],
    }
    annotation.save()

    job_context["result"] = result
    job_context["computed_files"] = [computed_file]
    job_context["annotation"] = annotation
    job_context["success"] = True
    return job_context
def _create_result(job_context: Dict) -> Dict:
    """Record a ComputationalResult for submitter-processed data.

    The processing itself is a NO-OP, but we make a ComputationalResult
    regardless so the provenance of the file is tracked.
    """
    result = ComputationalResult()
    result.commands.append(job_context["script_name"])
    result.is_ccdl = True

    try:
        processor_key = "SUBMITTER_PROCESSED"
        result.processor = utils.find_processor(processor_key)
    except Exception as e:
        return utils.handle_processor_exception(job_context, processor_key, e)

    result.save()
    job_context["pipeline"].steps.append(result.id)

    # Create a ComputedFile for the computed file, sync it S3 and save it.
    output_path = job_context["output_file_path"]
    computed_file = ComputedFile()
    computed_file.absolute_file_path = output_path
    computed_file.filename = output_path.split("/")[-1]
    computed_file.calculate_sha1()
    computed_file.calculate_size()
    computed_file.result = result
    computed_file.is_smashable = True
    computed_file.is_qc = False
    computed_file.save()

    # utils.end_job will sync this to S3 for us.
    job_context["computed_files"] = [computed_file]

    # Tie the result (and the single computed file) to each input sample.
    for sample in job_context["samples"]:
        association = SampleResultAssociation()
        association.sample = sample
        association.result = result
        association.save()

        SampleComputedFileAssociation.objects.get_or_create(
            sample=sample, computed_file=computed_file
        )

    logger.debug("Created %s", result)
    job_context["success"] = True
    return job_context
def test_qn_endpoints(self):
    """Adding one result with two QN annotations yields three available
    QN targets (one more is created in this test case's setup method)."""
    # create two additional qn endpoints
    result = ComputationalResult()
    result.commands.append("create_qn_target.py")
    result.is_ccdl = True
    result.is_public = True
    result.processor = None
    result.save()

    annotation_specs = [
        (self.danio_rerio.id, "zebrafish", ["RWWJ000001", "RWWJ000002"]),
        (self.homo_sapiens.id, "zebrafishplusone", ["RWWJ000003", "RWWJ000004"]),
    ]
    for organism_id, platform_code, geneset in annotation_specs:
        cra = ComputationalResultAnnotation()
        cra.result = result
        cra.data = {
            "organism_id": organism_id,
            "is_qn": True,
            "platform_accession_code": platform_code,
            "samples": [],
            "geneset": str(geneset),
        }
        cra.save()

    # Point both organisms' qn_target at the new result.
    for organism in (self.homo_sapiens, self.danio_rerio):
        organism.qn_target = result
        organism.save()

    response = self.client.get(
        reverse("qn_targets_available", kwargs={"version": API_VERSION}))
    # there's another qn endpoint that is created in the setup method of this test case
    self.assertEqual(len(response.json()), 3)
def _create_result_objects(job_context: Dict) -> Dict:
    """Record the result of an Illumina SCAN run and split the combined
    output TSV into one smashable per-sample ComputedFile."""
    result = ComputationalResult()
    result.commands.append(job_context["formatted_command"])
    result.is_ccdl = True
    result.is_public = True
    result.time_start = job_context["time_start"]
    result.time_end = job_context["time_end"]

    try:
        processor_key = "ILLUMINA_SCAN"
        result.processor = utils.find_processor(processor_key)
    except Exception as e:
        return utils.handle_processor_exception(job_context, processor_key, e)

    result.save()
    job_context["pipeline"].steps.append(result.id)

    # Split the result into smashable subfiles, one column per sample.
    big_tsv = job_context["output_file_path"]
    data = pd.read_csv(big_tsv, sep="\t", header=0, index_col=0)

    individual_files = []
    for frame in np.split(data, len(data.columns), axis=1):
        column_title = frame.columns.values[0]

        # Strip characters that would be problematic in a filename.
        sanitized_title = column_title
        for bad_char in ("&", "*", ";"):
            sanitized_title = sanitized_title.replace(bad_char, "")

        frame_path = job_context["work_dir"] + sanitized_title + ".tsv"
        frame.to_csv(frame_path, sep="\t", encoding="utf-8")

        # This needs to be the same as the ones in the job context!
        sample = _get_sample_for_column(column_title, job_context)
        if sample is None:
            job_context["job"].failure_reason = (
                "Could not find sample for column "
                + column_title
                + " while splitting Illumina file "
                + big_tsv
            )
            job_context["success"] = False
            job_context["job"].no_retry = True
            return job_context

        computed_file = ComputedFile()
        computed_file.absolute_file_path = frame_path
        computed_file.filename = frame_path.split("/")[-1]
        computed_file.result = result
        computed_file.is_smashable = True
        computed_file.is_qc = False
        computed_file.is_public = True
        computed_file.calculate_sha1()
        computed_file.calculate_size()
        computed_file.save()
        job_context["computed_files"].append(computed_file)

        SampleResultAssociation.objects.get_or_create(sample=sample, result=result)
        SampleComputedFileAssociation.objects.get_or_create(
            sample=sample, computed_file=computed_file
        )

        individual_files.append(computed_file)

    logger.debug("Created %s", result)
    job_context["success"] = True
    job_context["individual_files"] = individual_files
    job_context["result"] = result
    return job_context
def setup_experiment(new_version_accessions: List[str],
                     old_version_accessions: List[str]) -> Dict:
    """Create an experiment where some samples were processed with the newest
    version of salmon and others with an older one.

    Every accession in either input list becomes a Sample attached to the
    experiment. Samples from `old_version_accessions` get a quant
    ComputationalResult pointing at a salmon 0.9.1 OrganismIndex; samples
    from `new_version_accessions` point at a salmon 0.13.1 index.
    Returns the created Experiment.
    """
    # Create the experiment
    experiment_accession = "SRP095529"
    data_dir = "/home/user/data_store/"
    experiment_dir = data_dir + experiment_accession
    experiment = Experiment.objects.create(accession_code=experiment_accession,
                                           technology="RNA-SEQ")

    zebrafish = Organism.get_object_for_name("DANIO_RERIO")

    # Create the transcriptome processor and result:
    transcriptome_processor = Processor()
    transcriptome_processor.name = "Transcriptome"
    transcriptome_processor.version = "salmon 0.9.1"
    transcriptome_processor.docker_image = "dr_transcriptome"
    transcriptome_processor.environment = '{"some": "environment"}'
    transcriptome_processor.save()

    computational_result_short = ComputationalResult(
        processor=transcriptome_processor)
    computational_result_short.save()

    # The OLD organism index (salmon 0.9.1) used by the first batch of samples.
    organism_index = OrganismIndex()
    organism_index.index_type = "TRANSCRIPTOME_SHORT"
    organism_index.organism = zebrafish
    organism_index.result = computational_result_short
    organism_index.absolute_directory_path = "/home/user/data_store/ZEBRAFISH_INDEX/SHORT"
    organism_index.salmon_version = "salmon 0.9.1"
    organism_index.save()

    comp_file = ComputedFile()
    # This path will not be used because we already have the files extracted.
    comp_file.absolute_file_path = (
        "/home/user/data_store/ZEBRAFISH_INDEX/SHORT/zebrafish_short.tar.gz")
    comp_file.result = computational_result_short
    comp_file.size_in_bytes = 1337
    comp_file.sha1 = "ABC"
    comp_file.s3_key = "key"
    comp_file.s3_bucket = "bucket"
    comp_file.save()

    quant_processor = Processor()
    quant_processor.name = "Salmon Quant"
    quant_processor.version = "salmon 0.9.1"
    quant_processor.docker_image = "dr_salmon"
    quant_processor.environment = '{"some": "environment"}'
    quant_processor.save()

    for accession_code in old_version_accessions:
        sample = Sample.objects.create(
            accession_code=accession_code,
            organism=zebrafish,
            source_database="SRA",
            technology="RNA-SEQ",
            platform_accession_code="IlluminaHiSeq1000",
        )
        ExperimentSampleAssociation.objects.create(experiment=experiment,
                                                   sample=sample)

        original_file = OriginalFile()
        original_file.filename = accession_code + ".SRA"
        original_file.source_filename = accession_code + ".SRA"
        original_file.save()

        OriginalFileSampleAssociation.objects.get_or_create(
            original_file=original_file, sample=sample)

        # Create and associate quant result and files.
        quant_result = ComputationalResult()
        quant_result.is_ccdl = True
        quant_result.processor = quant_processor
        quant_result.organism_index = organism_index  # associate with OLD organism index
        quant_result.save()

        kv = ComputationalResultAnnotation()
        kv.data = {"index_length": "short"}
        kv.result = quant_result
        kv.is_public = True
        kv.save()

        # In prod the filename pattern will involve the timestamp
        # but here we're using the accession code so we can find
        # the archive file for the current sample.
        archive_filename = "result-" + accession_code + ".tar.gz"
        archive_file = ComputedFile()
        archive_file.filename = archive_filename
        archive_file.absolute_file_path = os.path.join(experiment_dir, archive_filename)
        archive_file.is_public = False
        archive_file.is_smashable = False
        archive_file.is_qc = False
        archive_file.result = quant_result
        archive_file.size_in_bytes = 12345
        archive_file.save()

        quant_file = ComputedFile()
        quant_file.filename = "quant.sf"
        quant_file.absolute_file_path = (experiment_dir + "/quant_files/" +
                                         accession_code + "_output/quant.sf")
        quant_file.is_public = False
        quant_file.is_smashable = False
        quant_file.is_qc = False
        quant_file.result = quant_result
        quant_file.size_in_bytes = 12345
        quant_file.s3_bucket = "bucket"
        quant_file.s3_key = "key"
        quant_file.save()

        SampleResultAssociation.objects.get_or_create(sample=sample,
                                                      result=quant_result)

    # Create another OrganismIndex with a newer version of salmon.
    transcriptome_processor = Processor()
    transcriptome_processor.name = "Transcriptome"
    transcriptome_processor.version = "salmon 0.13.1"
    transcriptome_processor.docker_image = "dr_transcriptome"
    transcriptome_processor.environment = '{"some": "environment"}'
    transcriptome_processor.save()

    computational_result_short = ComputationalResult(
        processor=transcriptome_processor)
    computational_result_short.save()

    organism_index = OrganismIndex()
    organism_index.index_type = "TRANSCRIPTOME_SHORT"
    organism_index.organism = zebrafish
    organism_index.result = computational_result_short
    organism_index.absolute_directory_path = "/home/user/data_store/ZEBRAFISH_INDEX/SHORT"
    organism_index.salmon_version = "salmon 0.13.1"  # DIFFERENT SALMON VERSION
    organism_index.save()

    comp_file = ComputedFile()
    # This path will not be used because we already have the files extracted.
    comp_file.absolute_file_path = (
        "/home/user/data_store/ZEBRAFISH_INDEX/SHORT/zebrafish_short.tar.gz")
    comp_file.result = computational_result_short
    comp_file.size_in_bytes = 1337
    comp_file.sha1 = "ABC"
    comp_file.s3_key = "key"
    comp_file.s3_bucket = "bucket"
    comp_file.save()

    for accession_code in new_version_accessions:
        sample = Sample.objects.create(
            accession_code=accession_code,
            organism=zebrafish,
            source_database="SRA",
            technology="RNA-SEQ",
            platform_accession_code="IlluminaHiSeq1000",
        )
        ExperimentSampleAssociation.objects.create(experiment=experiment,
                                                   sample=sample)

        original_file = OriginalFile()
        original_file.filename = accession_code + ".SRA"
        original_file.source_filename = accession_code + ".SRA"
        original_file.save()

        OriginalFileSampleAssociation.objects.get_or_create(
            original_file=original_file, sample=sample)

        # Create and associate quant result and files.
        quant_result = ComputationalResult()
        quant_result.is_ccdl = True
        quant_result.processor = quant_processor
        quant_result.organism_index = organism_index  # NEWER VERSION
        quant_result.save()

        kv = ComputationalResultAnnotation()
        kv.data = {"index_length": "short"}
        kv.result = quant_result
        kv.is_public = True
        kv.save()

        # In prod the filename pattern will involve the timestamp
        # but here we're using the accession code so we can find
        # the archive file for the current sample.
        archive_filename = "result-" + accession_code + ".tar.gz"
        archive_file = ComputedFile()
        archive_file.filename = archive_filename
        archive_file.absolute_file_path = os.path.join(experiment_dir, archive_filename)
        archive_file.is_public = False
        archive_file.is_smashable = False
        archive_file.is_qc = False
        archive_file.result = quant_result
        archive_file.size_in_bytes = 12345
        archive_file.save()

        quant_file = ComputedFile()
        quant_file.filename = "quant.sf"
        quant_file.absolute_file_path = (experiment_dir + "/quant_files/" +
                                         accession_code + "_output/quant.sf")
        quant_file.is_public = False
        quant_file.is_smashable = False
        quant_file.is_qc = False
        quant_file.result = quant_result
        quant_file.size_in_bytes = 12345
        quant_file.s3_bucket = "bucket"
        quant_file.s3_key = "key"
        quant_file.save()

        SampleResultAssociation.objects.get_or_create(sample=sample,
                                                      result=quant_result)

    return experiment
def prep_tximport_at_progress_point(complete_accessions: List[str],
                                    incomplete_accessions: List[str]) -> Dict:
    """Create an experiment and associated objects that tximport needs to run on it.

    Creates a sample for each accession contained in either input list.
    The samples in complete_accessions will be simulated as already having
    salmon quant run on them. The samples in incomplete_accessions won't.
    """
    # Create the experiment
    experiment_accession = "SRP095529"
    data_dir = "/home/user/data_store/"
    experiment_dir = data_dir + experiment_accession
    experiment = Experiment.objects.create(accession_code=experiment_accession,
                                           technology="RNA-SEQ")

    zebrafish = Organism.get_object_for_name("DANIO_RERIO")
    ExperimentOrganismAssociation.objects.get_or_create(experiment=experiment,
                                                        organism=zebrafish)

    # Create the transcriptome processor and result:
    transcriptome_processor = Processor()
    transcriptome_processor.name = "Transcriptome"
    transcriptome_processor.version = "salmon 0.13.1"
    transcriptome_processor.docker_image = "dr_transcriptome"
    transcriptome_processor.environment = '{"some": "environment"}'
    transcriptome_processor.save()

    computational_result_short = ComputationalResult(
        processor=transcriptome_processor)
    computational_result_short.save()

    organism_index = OrganismIndex()
    organism_index.index_type = "TRANSCRIPTOME_SHORT"
    organism_index.organism = zebrafish
    organism_index.result = computational_result_short
    organism_index.absolute_directory_path = "/home/user/data_store/ZEBRAFISH_INDEX/SHORT"
    organism_index.salmon_version = "salmon 0.13.1"
    organism_index.save()

    comp_file = ComputedFile()
    # This path will not be used because we already have the files extracted.
    comp_file.absolute_file_path = (
        "/home/user/data_store/ZEBRAFISH_INDEX/SHORT/zebrafish_short.tar.gz")
    comp_file.result = computational_result_short
    comp_file.size_in_bytes = 1337
    comp_file.sha1 = "ABC"
    comp_file.s3_key = "key"
    comp_file.s3_bucket = "bucket"
    comp_file.save()

    # The not-yet-processed samples only get original files — no quant results.
    for accession_code in incomplete_accessions:
        sample = Sample.objects.create(
            accession_code=accession_code,
            organism=zebrafish,
            source_database="SRA",
            technology="RNA-SEQ",
        )
        ExperimentSampleAssociation.objects.create(experiment=experiment,
                                                   sample=sample)

        original_file = OriginalFile()
        original_file.filename = accession_code + ".SRA"
        original_file.source_filename = accession_code + ".SRA"
        original_file.save()

        OriginalFileSampleAssociation.objects.get_or_create(
            original_file=original_file, sample=sample)

    quant_processor = Processor()
    quant_processor.name = "Salmon Quant"
    quant_processor.version = "salmon 0.13.1"
    quant_processor.docker_image = "dr_salmon"
    quant_processor.environment = '{"some": "environment"}'
    quant_processor.save()

    tximport_processor = Processor()
    tximport_processor.name = "Tximport"
    tximport_processor.version = "salmon 0.13.1"
    tximport_processor.docker_image = "dr_salmon"
    tximport_processor.environment = '{"some": "environment"}'
    tximport_processor.save()

    # Create the already processed samples along with their
    # ComputationalResults and ComputedFiles. They don't need
    # original files for this test because we aren't going to run
    # salmon quant on them.
    for accession_code in complete_accessions:
        sample = Sample.objects.create(
            accession_code=accession_code,
            organism=zebrafish,
            source_database="SRA",
            technology="RNA-SEQ",
        )
        ExperimentSampleAssociation.objects.create(experiment=experiment,
                                                   sample=sample)

        original_file = OriginalFile()
        original_file.filename = accession_code + ".SRA"
        original_file.source_filename = accession_code + ".SRA"
        original_file.save()

        OriginalFileSampleAssociation.objects.get_or_create(
            original_file=original_file, sample=sample)

        # Create and associate quant result and files.
        quant_result = ComputationalResult()
        quant_result.is_ccdl = True
        quant_result.processor = quant_processor
        quant_result.organism_index = organism_index
        quant_result.save()

        kv = ComputationalResultAnnotation()
        kv.data = {"index_length": "short"}
        kv.result = quant_result
        kv.is_public = True
        kv.save()

        # In prod the filename pattern will involve the timestamp
        # but here we're using the accession code so we can find
        # the archive file for the current sample.
        archive_filename = "result-" + accession_code + ".tar.gz"
        archive_file = ComputedFile()
        archive_file.filename = archive_filename
        archive_file.absolute_file_path = os.path.join(experiment_dir, archive_filename)
        archive_file.is_public = False
        archive_file.is_smashable = False
        archive_file.is_qc = False
        archive_file.result = quant_result
        archive_file.size_in_bytes = 12345
        archive_file.save()

        quant_file = ComputedFile()
        quant_file.filename = "quant.sf"
        quant_file.absolute_file_path = (experiment_dir + "/quant_files/" +
                                         accession_code + "_output/quant.sf")
        quant_file.is_public = False
        quant_file.is_smashable = False
        quant_file.is_qc = False
        quant_file.result = quant_result
        quant_file.size_in_bytes = 12345
        quant_file.s3_bucket = "bucket"
        quant_file.s3_key = "key"
        quant_file.save()

        sample.most_recent_quant_file = quant_file
        sample.save()

        SampleResultAssociation.objects.get_or_create(sample=sample,
                                                      result=quant_result)
    # NOTE(review): despite the `-> Dict` annotation and the docstring, no
    # value is returned here — confirm the tail of this function was not
    # truncated and the declared return type is accurate.
def _create_result_objects(job_context: Dict) -> Dict:
    """Store and host the compendia result as a ComputationalResult object.

    Creates the ComputationalResult, a TSV ComputedFile for the merged
    compendia dataframe, an annotation describing the inputs, a metadata
    ComputedFile, and a versioned zip-archive ComputedFile which is
    uploaded to S3.
    """
    result = ComputationalResult()
    result.commands.append(" ".join(job_context['formatted_command']))
    result.is_ccdl = True
    result.is_public = True
    result.time_start = job_context['time_start']
    result.time_end = job_context['time_end']
    try:
        processor_key = "COMPENDIA"
        result.processor = utils.find_processor(processor_key)
    except Exception as e:
        return utils.handle_processor_exception(job_context, processor_key, e)
    result.save()

    # Write the compendia dataframe to a file, overwriting the previous smash
    job_context['merged_qn'].to_csv(job_context['smash_outfile'], sep='\t',
                                    encoding='utf-8')
    compendia_tsv_computed_file = ComputedFile()
    compendia_tsv_computed_file.absolute_file_path = job_context['smash_outfile']
    compendia_tsv_computed_file.filename = job_context['smash_outfile'].split('/')[-1]
    compendia_tsv_computed_file.calculate_sha1()
    compendia_tsv_computed_file.calculate_size()
    compendia_tsv_computed_file.is_smashable = False
    compendia_tsv_computed_file.is_qn_target = False
    compendia_tsv_computed_file.result = result
    compendia_tsv_computed_file.save()

    # Annotate the result with what went into this compendia.
    organism_key = list(job_context['samples'].keys())[0]
    annotation = ComputationalResultAnnotation()
    annotation.result = result
    annotation.data = {
        "organism_id": job_context['samples'][organism_key][0].organism_id,
        "organism_name": job_context['samples'][organism_key][0].organism.name,
        "is_qn": False,
        "is_compendia": True,
        "samples": [sample.accession_code
                    for sample in job_context["samples"][organism_key]],
        "num_samples": len(job_context["samples"][organism_key]),
        "experiment_accessions": [e.accession_code
                                  for e in job_context['experiments']]
    }
    annotation.save()

    # Save the related metadata file
    metadata_computed_file = ComputedFile()
    metadata_computed_file.absolute_file_path = job_context['metadata_tsv_paths'][0]
    metadata_computed_file.filename = job_context['metadata_tsv_paths'][0].split('/')[-1]
    metadata_computed_file.calculate_sha1()
    metadata_computed_file.calculate_size()
    metadata_computed_file.is_smashable = False
    metadata_computed_file.is_qn_target = False
    metadata_computed_file.result = result
    metadata_computed_file.save()

    # Create the resulting archive
    final_zip_base = "/home/user/data_store/smashed/" + \
        str(job_context["dataset"].pk) + "_compendia"
    archive_path = shutil.make_archive(final_zip_base, 'zip',
                                       job_context["output_dir"])

    # Determine the next compendia version for this organism.
    # BUGFIX: the previous code did `.order_by('-compendia_version')[-1]`.
    # Django querysets do not support negative indexing (it raises), and the
    # broad `except` then silently fell back to version 1 every time — even
    # conceptually, [-1] on a descending ordering would pick the OLDEST
    # version. Use .first() on the descending ordering instead.
    organism = job_context['samples'][organism_key][0].organism
    last_compendia = (
        ComputedFile.objects
        .filter(is_compendia=True, compendia_organism=organism)
        .order_by('-compendia_version')
        .first()
    )
    if last_compendia is not None:
        compendia_version = last_compendia.compendia_version + 1
    else:
        # This is the first compendia for this Organism
        compendia_version = 1

    archive_computed_file = ComputedFile()
    archive_computed_file.absolute_file_path = archive_path
    archive_computed_file.filename = archive_path.split('/')[-1]
    archive_computed_file.calculate_sha1()
    archive_computed_file.calculate_size()
    archive_computed_file.is_smashable = False
    archive_computed_file.is_qn_target = False
    archive_computed_file.result = result
    archive_computed_file.is_compendia = True
    archive_computed_file.compendia_organism = organism
    archive_computed_file.compendia_version = compendia_version
    archive_computed_file.save()

    logger.info("Compendia created!",
                archive_path=archive_path,
                organism_name=organism.name
                )

    # Upload the result to S3
    key = organism.name + "_" + str(compendia_version) + "_" + \
        str(int(time.time())) + ".zip"
    archive_computed_file.sync_to_s3(S3_COMPENDIA_BUCKET_NAME, key)

    job_context['result'] = result
    job_context['computed_files'] = [compendia_tsv_computed_file,
                                     metadata_computed_file,
                                     archive_computed_file]
    job_context['success'] = True
    return job_context
def test_make_experiment_result_associations(self):
    """Tests that the correct associations are made.

    Setup: tximport has been run for one experiment, associating its
    samples (but not the experiment itself) with a ComputationalResult.
    A second experiment shares one of those samples but also contains a
    sample tximport has not yet been run on.

    Expectation: make_experiment_result_associations() links the tximport
    result to the fully-processed experiment and ONLY to that experiment.
    """
    # Get an organism to set on samples:
    homo_sapiens = Organism.get_object_for_name("HOMO_SAPIENS", taxonomy_id=9606)

    # Create the tximport processor and result:
    processor = Processor()
    processor.name = "Tximport"
    processor.version = "v9.9.9"
    processor.docker_image = "dr_salmon"
    processor.environment = '{"some": "environment"}'
    processor.save()

    result = ComputationalResult()
    result.commands.append("tximport invocation")
    result.is_ccdl = True
    result.processor = processor
    result.save()

    def make_result_sample(accession):
        # Creates a saved sample already associated with the tximport result.
        sample = Sample()
        sample.accession_code = accession
        sample.title = accession
        sample.organism = homo_sapiens
        sample.save()
        SampleResultAssociation.objects.create(sample=sample, result=result)
        return sample

    def link(experiment, sample):
        # Attaches a sample to an experiment.
        ExperimentSampleAssociation.objects.create(experiment=experiment,
                                                   sample=sample)

    # Create the first experiment and its samples:
    processed_experiment = Experiment()
    processed_experiment.accession_code = "SRP12345"
    processed_experiment.save()

    processed_sample_one = make_result_sample("SRX12345")
    link(processed_experiment, processed_sample_one)

    processed_sample_two = make_result_sample("SRX12346")
    link(processed_experiment, processed_sample_two)

    # Create the second experiment and its additional sample.
    unprocessed_experiment = Experiment()
    unprocessed_experiment.accession_code = "SRP6789"
    unprocessed_experiment.save()

    unprocessed_sample = make_result_sample("SRX6789")
    link(unprocessed_experiment, unprocessed_sample)
    link(unprocessed_experiment, processed_sample_two)

    # Run the function we're testing:
    make_experiment_result_associations()

    # Test that only one association was created and that it was
    # to the processed experiment:
    eras = ExperimentResultAssociation.objects.all()
    self.assertEqual(len(eras), 1)
    self.assertEqual(eras.first().experiment, processed_experiment)
def _create_result_objects(job_context: Dict) -> Dict:
    """Record the result of an Illumina SCAN run and split the combined
    output TSV into one smashable per-sample ComputedFile.

    Columns whose sample cannot be found are logged and skipped."""
    result = ComputationalResult()
    result.commands.append(job_context["formatted_command"])
    result.is_ccdl = True
    result.is_public = True
    result.time_start = job_context["time_start"]
    result.time_end = job_context["time_end"]

    try:
        processor_key = "ILLUMINA_SCAN"
        result.processor = utils.find_processor(processor_key)
    except Exception as e:
        return utils.handle_processor_exception(job_context, processor_key, e)

    result.save()
    job_context["pipeline"].steps.append(result.id)

    # Split the result into smashable subfiles, one column per sample.
    big_tsv = job_context["output_file_path"]
    data = pd.read_csv(big_tsv, sep="\t", header=0, index_col=0)

    individual_files = []
    for frame in np.split(data, len(data.columns), axis=1):
        column_title = frame.columns.values[0]

        # Strip characters that would be problematic in a filename.
        sanitized_title = column_title
        for bad_char in ("&", "*", ";"):
            sanitized_title = sanitized_title.replace(bad_char, "")

        frame_path = job_context["work_dir"] + sanitized_title + ".tsv"
        frame.to_csv(frame_path, sep="\t", encoding="utf-8")

        # This needs to be the same as the ones in the job context!
        try:
            sample = job_context["samples"].get(title=column_title)
        except Sample.DoesNotExist:
            logger.error(
                "Could not find sample for column while splitting Illumina file.",
                title=column_title,
                processor_job=job_context["job_id"],
                file_path=big_tsv,
            )
            continue

        computed_file = ComputedFile()
        computed_file.absolute_file_path = frame_path
        computed_file.filename = frame_path.split("/")[-1]
        computed_file.result = result
        computed_file.is_smashable = True
        computed_file.is_qc = False
        computed_file.is_public = True
        computed_file.calculate_sha1()
        computed_file.calculate_size()
        computed_file.save()
        job_context["computed_files"].append(computed_file)

        SampleResultAssociation.objects.get_or_create(sample=sample, result=result)
        SampleComputedFileAssociation.objects.get_or_create(
            sample=sample, computed_file=computed_file)

        individual_files.append(computed_file)

    logger.debug("Created %s", result)
    job_context["success"] = True
    job_context["individual_files"] = individual_files
    job_context["result"] = result
    return job_context
def _extract_sra(job_context: Dict) -> Dict:
    """If this is a .sra file, run `fasterq-dump` to get our desired fastq files.

    No-op for non-SRA inputs. On success, rewrites
    job_context["input_file_path"] (and "input_file_path_2" for paired
    reads) to point at the extracted fastq file(s) and records the run as
    a ComputationalResult. On failure, sets job_context["success"] = False
    and records a failure reason on the job.
    """
    if ".sra" not in job_context["input_file_path"]:
        return job_context

    if not os.path.exists(job_context["input_file_path"]):
        logger.error("Was told to SRA-extract a non-existent file - why did this happen?",
                     input_file_path=job_context["input_file_path"],
                     processor_job=job_context["job_id"]
                     )
        job_context["job"].failure_reason = "Missing SRA file: " + str(job_context["input_file_path"])
        job_context["success"] = False
        return job_context

    # What the heck. Copy the file to work_dir, but remove the `.sra` extention.
    # https://github.com/ncbi/sra-tools/issues/150#issuecomment-422529894
    job_context['work_file'] = job_context['work_dir'] + job_context['sample_accession_code']
    shutil.copyfile(job_context["input_file_path"], job_context['work_file'])

    time_start = timezone.now()

    # This can be improved with: " -e " + str(multiprocessing.cpu_count())
    # but it seems to cause time to increase if there are too many jobs calling it at once.
    formatted_command = "fasterq-dump " + job_context['work_file'] \
        + " -O " + job_context['work_dir'] \
        + " --temp " + job_context["temp_dir"]
    logger.debug("Running fasterq-dump using the following shell command: %s",
                 formatted_command,
                 processor_job=job_context["job_id"])
    try:
        completed_command = subprocess.run(formatted_command.split(),
                                           stdout=subprocess.PIPE,
                                           stderr=subprocess.PIPE,
                                           timeout=1200)
    except subprocess.TimeoutExpired as e:
        logger.exception("Shell call to fasterq-dump failed with timeout",
                         processor_job=job_context["job_id"],
                         file=job_context["input_file_path"])
        job_context["job"].failure_reason = str(e)
        job_context["success"] = False
        return job_context

    stderr = completed_command.stderr.decode().strip()
    # BUGFIX: this previously decoded completed_command.stderr a second time,
    # so the "err:" check below never actually looked at standard output and
    # the error log's stdout= field just duplicated stderr.
    stdout = completed_command.stdout.decode().strip()

    # fasterq-dump doesn't respect return codes, so scan its output too.
    # We check both streams since the old (buggy) behavior effectively
    # scanned stderr; this is a strict superset of both intents.
    # Related: https://github.com/ncbi/sra-tools/issues/146
    if (completed_command.returncode != 0) or "err:" in stdout or "err:" in stderr:
        logger.error("Shell call to fasterq-dump failed with error message: %s",
                     stderr,
                     stdout=stdout,
                     processor_job=job_context["job_id"],
                     file=job_context["input_file_path"])
        job_context["job"].failure_reason = stderr
        job_context["success"] = False
        return job_context

    result = ComputationalResult()
    result.commands.append(formatted_command)
    result.time_start = time_start
    result.time_end = timezone.now()
    result.is_ccdl = True
    try:
        processor_key = "FASTERQ_DUMP"
        result.processor = utils.find_processor(processor_key)
    except Exception as e:
        return utils.handle_processor_exception(job_context, processor_key, e)
    result.save()
    job_context['pipeline'].steps.append(result.id)

    # Overwrite our current input_file_path with our newly extracted files
    # We either want the one created file or _just_ _1
    new_files = glob.glob(job_context['work_dir'] + '*.fastq')
    if len(new_files) == 1:
        job_context['input_file_path'] = new_files[0]
    else:
        for new_file in new_files:
            # We only care about '_1' and '_2', unmated reads can skeddadle
            if '_1.fast' in new_file:
                job_context['input_file_path'] = new_file
                continue
            if '_2.fast' in new_file:
                job_context['input_file_path_2'] = new_file
                continue

    return job_context
def _run_salmontools(job_context: Dict) -> Dict:
    """Run Salmontools to extract unmapped reads from the Salmon output.

    Builds a `salmontools extract-unmapped` command (single- or paired-end
    depending on whether "input_file_path_2" is present), runs it, and on
    success archives the salmontools output directory into a QC
    ComputedFile associated with the sample. Writes "result", "success",
    and appends to "computed_files" in job_context.
    """
    logger.debug("Running SalmonTools ...")
    unmapped_filename = job_context[
        "output_directory"] + "aux_info/unmapped_names.txt"

    command_str = "salmontools extract-unmapped -u {unmapped_file} -o {output} "
    output_prefix = job_context["salmontools_directory"] + "unmapped_by_salmon"
    command_str = command_str.format(unmapped_file=unmapped_filename,
                                     output=output_prefix)
    # Paired-end samples pass both reads; single-end passes -r.
    if "input_file_path_2" in job_context:
        command_str += "-1 {input_1} -2 {input_2}"
        command_str = command_str.format(
            input_1=job_context["input_file_path"],
            input_2=job_context["input_file_path_2"])
    else:
        command_str += "-r {input_1}"
        command_str = command_str.format(
            input_1=job_context["input_file_path"])

    start_time = timezone.now()
    logger.debug(
        "Running the following SalmonTools command: %s",
        command_str,
        processor_job=job_context["job_id"],
    )
    completed_command = subprocess.run(command_str.split(),
                                       stdout=subprocess.PIPE,
                                       stderr=subprocess.PIPE)
    end_time = timezone.now()

    # As of SalmonTools 0.1.0, completed_command.returncode is always 0,
    # (even if error happens). completed_command.stderr is not totally
    # reliable either, because it will output the following line even
    # when the execution succeeds:
    #   "There were <N> unmapped reads\n"
    # in which "<N>" is the number of lines in input unmapped_names.txt.
    #
    # As a workaround, we are using a regular expression here to test
    # the status of SalmonTools execution.  Any text in stderr that is
    # not in the above format is treated as error message.
    status_str = completed_command.stderr.decode().strip()
    success_pattern = r"^There were \d+ unmapped reads$"
    if re.match(success_pattern, status_str):
        # Zip up the output of salmontools
        try:
            with tarfile.open(job_context["salmontools_archive"],
                              "w:gz") as tar:
                tar.add(job_context["salmontools_directory"], arcname=os.sep)
        except Exception:
            logger.exception(
                "Exception caught while zipping processed directory %s",
                job_context["salmontools_directory"],
                processor_job=job_context["job_id"],
            )
            failure_template = "Exception caught while zipping salmontools directory {}"
            job_context["job"].failure_reason = failure_template.format(
                job_context["salmontools_archive"])
            job_context["success"] = False
            return job_context

        result = ComputationalResult()
        result.commands.append(command_str)
        result.time_start = start_time
        result.time_end = end_time
        result.is_ccdl = True
        try:
            processor_key = "SALMONTOOLS"
            result.processor = utils.find_processor(processor_key)
        except Exception as e:
            return utils.handle_processor_exception(job_context, processor_key,
                                                    e)
        result.save()
        job_context["pipeline"].steps.append(result.id)

        assoc = SampleResultAssociation()
        assoc.sample = job_context["sample"]
        assoc.result = result
        assoc.save()

        # The archive is a QC artifact, not smashable expression data.
        computed_file = ComputedFile()
        computed_file.filename = job_context["salmontools_archive"].split(
            "/")[-1]
        computed_file.absolute_file_path = job_context["salmontools_archive"]
        computed_file.calculate_sha1()
        computed_file.calculate_size()
        computed_file.is_public = True
        computed_file.is_smashable = False
        computed_file.is_qc = True
        computed_file.result = result
        computed_file.save()
        job_context["computed_files"].append(computed_file)

        assoc = SampleComputedFileAssociation()
        assoc.sample = job_context["sample"]
        assoc.computed_file = computed_file
        assoc.save()

        job_context["result"] = result
        job_context["success"] = True
    else:  # error in salmontools
        logger.error(
            "Shell call to salmontools failed with error message: %s",
            status_str,
            processor_job=job_context["job_id"],
        )
        job_context["job"].failure_reason = (
            "Shell call to salmontools failed because: " + status_str)
        job_context["success"] = False

    return job_context
def setUp(self):
    """Build the test fixture: experiments, organisms (with a QN target),
    samples, original files, jobs, results, and a short transcriptome index.

    Creation order matters: organisms exist before the samples that point
    at them, and the QN-target result exists before it is assigned to
    ailuropoda.qn_target.
    """
    # Saving this for if we have protected endpoints
    # self.superuser = User.objects.create_superuser('john', '*****@*****.**', 'johnpassword')
    # self.client.login(username='******', password='******')
    # self.user = User.objects.create(username="******")
    experiment = Experiment()
    experiment.accession_code = "GSE000"
    experiment.alternate_accession_code = "E-GEOD-000"
    experiment.title = "NONONONO"
    experiment.description = "Boooooourns. Wasabi."
    experiment.technology = "RNA-SEQ"
    experiment.save()

    # Second experiment is the one kept on self for the tests.
    experiment = Experiment()
    experiment.accession_code = "GSE123"
    experiment.title = "Hey Ho Let's Go"
    experiment.description = (
        "This is a very exciting test experiment. Faygo soda. Blah blah blah."
    )
    experiment.technology = "MICROARRAY"
    experiment.save()
    self.experiment = experiment

    experiment_annotation = ExperimentAnnotation()
    experiment_annotation.data = {"hello": "world", "123": 456}
    experiment_annotation.experiment = experiment
    experiment_annotation.save()

    # Create 26 test organisms numbered 0-25 for pagination test, so there should be 29 organisms total (with the 3 others below)
    for i in range(26):
        Organism(name=("TEST_ORGANISM_{}".format(i)),
                 taxonomy_id=(1234 + i)).save()

    ailuropoda = Organism(name="AILUROPODA_MELANOLEUCA",
                          taxonomy_id=9646,
                          is_scientific_name=True)
    ailuropoda.save()
    self.homo_sapiens = Organism(name="HOMO_SAPIENS",
                                 taxonomy_id=9606,
                                 is_scientific_name=True)
    self.homo_sapiens.save()
    self.danio_rerio = Organism(name="DANIO_RERIO",
                                taxonomy_id=1337,
                                is_scientific_name=True)
    self.danio_rerio.save()

    sample = Sample()
    sample.title = "123"
    sample.accession_code = "123"
    sample.is_processed = True
    sample.organism = ailuropoda
    sample.save()

    # Second sample is the one kept on self for the tests.
    sample = Sample()
    sample.title = "789"
    sample.accession_code = "789"
    sample.is_processed = True
    sample.organism = ailuropoda
    sample.save()
    self.sample = sample

    # add qn target for sample organism
    result = ComputationalResult()
    result.commands.append("create_qn_target.py")
    result.is_ccdl = True
    result.is_public = True
    result.processor = None
    result.save()

    cra = ComputationalResultAnnotation()
    cra.result = result
    cra.data = {"organism_id": ailuropoda.id, "is_qn": True}
    cra.save()

    ailuropoda.qn_target = result
    ailuropoda.save()

    sample_annotation = SampleAnnotation()
    sample_annotation.data = {"goodbye": "world", "789": 123}
    sample_annotation.sample = sample
    sample_annotation.save()

    original_file = OriginalFile()
    original_file.save()

    original_file_sample_association = OriginalFileSampleAssociation()
    original_file_sample_association.sample = sample
    original_file_sample_association.original_file = original_file
    original_file_sample_association.save()

    downloader_job = DownloaderJob()
    downloader_job.save()

    download_assoc = DownloaderJobOriginalFileAssociation()
    download_assoc.original_file = original_file
    download_assoc.downloader_job = downloader_job
    download_assoc.save()

    processor_job = ProcessorJob()
    processor_job.save()

    processor_assoc = ProcessorJobOriginalFileAssociation()
    processor_assoc.original_file = original_file
    processor_assoc.processor_job = processor_job
    processor_assoc.save()

    experiment_sample_association = ExperimentSampleAssociation()
    experiment_sample_association.sample = sample
    experiment_sample_association.experiment = experiment
    experiment_sample_association.save()
    experiment.num_total_samples = 1
    experiment.num_processed_samples = 1
    experiment.save()

    # Two bare results associated with the sample.
    result = ComputationalResult()
    result.save()

    sra = SampleResultAssociation()
    sra.sample = sample
    sra.result = result
    sra.save()

    result = ComputationalResult()
    result.save()

    sra = SampleResultAssociation()
    sra.sample = sample
    sra.result = result
    sra.save()

    processor = Processor()
    processor.name = "Salmon Quant"
    processor.version = "v9.9.9"
    processor.docker_image = "dr_salmon"
    processor.environment = '{"some": "environment"}'
    processor.save()

    computational_result_short = ComputationalResult(processor=processor)
    computational_result_short.save()

    organism_index = OrganismIndex()
    organism_index.index_type = "TRANSCRIPTOME_SHORT"
    organism_index.organism = self.danio_rerio
    organism_index.result = computational_result_short
    organism_index.absolute_directory_path = (
        "/home/user/data_store/salmon_tests/TRANSCRIPTOME_INDEX/SHORT")
    organism_index.is_public = True
    organism_index.s3_url = "not_blank"
    organism_index.save()
    return
def _run_salmon(job_context: Dict) -> Dict:
    """Runs Salmon Quant.

    Builds the salmon command differently for SRA inputs (streamed through
    fifos via a background fastq-dump) vs. already-extracted fastq files
    (single- or paired-end). Runs it with a 3-hour timeout, then on success
    archives the output directory, records the quant.sf ComputedFile
    (uploaded to S3 when running in the cloud), and stores annotations
    parsed from Salmon's JSON outputs.
    """
    logger.debug("Running Salmon..")

    # Salmon needs to be run differently for different sample types.
    # SRA files also get processed differently as we don't want to use fasterq-dump to extract
    # them to disk.
    if job_context.get("sra_input_file_path", None):
        # Single reads
        if job_context["sra_num_reads"] == 1:
            fifo = "/tmp/barney"
            os.mkfifo(fifo)
            # Background fastq-dump streams reads into the fifo while salmon
            # reads from it.
            dump_str = "fastq-dump --stdout {input_sra_file} > {fifo} &"
            formatted_dump_command = dump_str.format(
                input_sra_file=job_context["sra_input_file_path"], fifo=fifo)
            subprocess.Popen(formatted_dump_command,
                             shell=True,
                             stdout=subprocess.PIPE,
                             stderr=subprocess.STDOUT)
            command_str = (
                "salmon --no-version-check quant -l A -i {index} "
                "-r {fifo} -p 16 -o {output_directory} --seqBias --dumpEq --writeUnmappedNames"
            )
            formatted_command = command_str.format(
                index=job_context["index_directory"],
                input_sra_file=job_context["sra_input_file_path"],
                fifo=fifo,
                output_directory=job_context["output_directory"],
            )
        # Paired are trickier
        else:
            # Okay, for some reason I can't explain, this only works in the temp directory,
            # otherwise the `tee` part will only output to one or the other of the streams (non-deterministically),
            # but not both. This doesn't appear to happen if the fifos are in tmp.
            alpha = "/tmp/alpha"
            os.mkfifo(alpha)
            beta = "/tmp/beta"
            os.mkfifo(beta)
            # tee + grep splits the interleaved dump into read-1/read-2
            # streams, one per fifo; requires bash for process substitution.
            dump_str = "fastq-dump --stdout --split-files -I {input_sra_file} | tee >(grep '@.*\.1\s' -A3 --no-group-separator > {fifo_alpha}) >(grep '@.*\.2\s' -A3 --no-group-separator > {fifo_beta}) > /dev/null &"
            formatted_dump_command = dump_str.format(
                input_sra_file=job_context["sra_input_file_path"],
                fifo_alpha=alpha,
                fifo_beta=beta)
            subprocess.Popen(
                formatted_dump_command,
                shell=True,
                executable="/bin/bash",
                stdout=subprocess.PIPE,
                stderr=subprocess.STDOUT,
            )
            command_str = (
                "salmon --no-version-check quant -l A -i {index} "
                "-1 {fifo_alpha} -2 {fifo_beta} -p 16 -o {output_directory} --seqBias --dumpEq --writeUnmappedNames"
            )
            formatted_command = command_str.format(
                index=job_context["index_directory"],
                input_sra_file=job_context["sra_input_file_path"],
                fifo_alpha=alpha,
                fifo_beta=beta,
                output_directory=job_context["output_directory"],
            )
    else:
        if "input_file_path_2" in job_context:
            second_read_str = " -2 {}".format(job_context["input_file_path_2"])

            # Rob recommends 16 threads/process, which fits snugly on an x1 at 8GB RAM per Salmon container:
            # (2 threads/core * 16 cores/socket * 64 vCPU) / (1TB/8GB) = ~17
            command_str = (
                "salmon --no-version-check quant -l A --biasSpeedSamp 5 -i {index}"
                " -1 {input_one}{second_read_str} -p 16 -o {output_directory}"
                " --seqBias --gcBias --dumpEq --writeUnmappedNames")
            formatted_command = command_str.format(
                index=job_context["index_directory"],
                input_one=job_context["input_file_path"],
                second_read_str=second_read_str,
                output_directory=job_context["output_directory"],
            )
        else:
            # Related: https://github.com/COMBINE-lab/salmon/issues/83
            command_str = ("salmon --no-version-check quant -l A -i {index}"
                           " -r {input_one} -p 16 -o {output_directory}"
                           " --seqBias --dumpEq --writeUnmappedNames")
            formatted_command = command_str.format(
                index=job_context["index_directory"],
                input_one=job_context["input_file_path"],
                output_directory=job_context["output_directory"],
            )

    logger.debug(
        "Running Salmon Quant using the following shell command: %s",
        formatted_command,
        processor_job=job_context["job_id"],
    )

    # Salmon probably shouldn't take longer than three hours.
    timeout = 60 * 60 * 3
    job_context["time_start"] = timezone.now()
    try:
        completed_command = subprocess.run(
            formatted_command.split(),
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            timeout=timeout,
        )
    except subprocess.TimeoutExpired:
        failure_reason = "Salmon timed out because it failed to complete within 3 hours."
        logger.error(
            failure_reason,
            sample_accesion_code=job_context["sample"].accession_code,
            processor_job=job_context["job_id"],
        )
        job_context["job"].failure_reason = failure_reason
        job_context["job"].no_retry = True
        job_context["success"] = False
        return job_context

    job_context["time_end"] = timezone.now()

    # NOTE(review): only exit code 1 is treated as failure here (not any
    # non-zero code); the sibling implementation in this file documents that
    # salmon seems to return negative codes on success — confirm.
    if completed_command.returncode == 1:
        stderr = completed_command.stderr.decode().strip()
        error_start = stderr.upper().find("ERROR:")
        error_start = error_start if error_start != -1 else 0
        logger.error(
            "Shell call to salmon failed with error message: %s",
            stderr[error_start:],
            processor_job=job_context["job_id"],
        )

        # If salmon has an error exit code then we don't want to retry it.
        job_context["job"].no_retry = True
        job_context["job"].failure_reason = (
            "Shell call to salmon failed because: " + stderr[error_start:])
        job_context["success"] = False
    else:
        result = ComputationalResult()
        result.commands.append(formatted_command)
        result.time_start = job_context["time_start"]
        result.time_end = job_context["time_end"]
        result.organism_index = job_context["organism_index"]
        result.is_ccdl = True
        try:
            processor_key = "SALMON_QUANT"
            result.processor = utils.find_processor(processor_key)
        except Exception as e:
            return utils.handle_processor_exception(job_context, processor_key,
                                                    e)

        # Zip up the output of Salmon Quant
        try:
            with tarfile.open(job_context["output_archive"], "w:gz") as tar:
                tar.add(job_context["output_directory"], arcname=os.sep)
        except Exception:
            logger.exception(
                "Exception caught while zipping processed directory %s",
                job_context["output_directory"],
                processor_job=job_context["job_id"],
            )
            failure_template = "Exception caught while zipping processed directory {}"
            job_context["job"].failure_reason = failure_template.format(
                job_context["output_archive"])
            job_context["success"] = False
            return job_context

        salmon_quant_archive = ComputedFile()
        salmon_quant_archive.absolute_file_path = job_context["output_archive"]
        salmon_quant_archive.filename = os.path.split(
            job_context["output_archive"])[-1]
        salmon_quant_archive.calculate_sha1()
        salmon_quant_archive.calculate_size()
        salmon_quant_archive.is_public = True
        salmon_quant_archive.is_smashable = False
        salmon_quant_archive.is_qc = False

        quant_file = ComputedFile()
        quant_file.s3_bucket = S3_BUCKET_NAME
        # Timestamped key keeps re-runs of the same sample from colliding.
        timestamp = str(timezone.now().timestamp()).split(".")[0]
        quant_file.s3_key = "quant_files/sample_{0}_{1}_quant.sf".format(
            job_context["sample"].id, timestamp)
        quant_file.filename = "quant.sf"
        quant_file.absolute_file_path = job_context[
            "output_directory"] + "quant.sf"
        quant_file.is_public = False
        quant_file.is_smashable = False
        quant_file.is_qc = False
        quant_file.calculate_sha1()
        quant_file.calculate_size()

        # If we're running in the cloud we need to upload the quant.sf
        # file so that it can be used by a job running on any machine
        # to run tximport. We can't use sync_to_s3 though because we
        # have to sync it before we can save the file so it cannot be
        # discovered by other jobs before it is uploaded.
        if settings.RUNNING_IN_CLOUD:
            try:
                S3.upload_file(
                    quant_file.absolute_file_path,
                    quant_file.s3_bucket,
                    quant_file.s3_key,
                    ExtraArgs={
                        "ACL": "public-read",
                        "StorageClass": "STANDARD_IA"
                    },
                )
            except Exception as e:
                logger.exception(e,
                                 processor_job=job_context["job_id"],
                                 sample=job_context["sample"].id)
                failure_template = "Exception caught while uploading quantfile to S3: {}"
                job_context["job"].failure_reason = failure_template.format(
                    quant_file.absolute_file_path)
                job_context["success"] = False
                return job_context

        # Here select_for_update() is used as a mutex that forces multiple
        # jobs to execute this block of code in serial manner. See:
        # https://docs.djangoproject.com/en/1.11/ref/models/querysets/#select-for-update
        # Theorectically any rows in any table can be locked here, we're
        # locking all existing rows in ComputationalResult table.
        # NOTE(review): select_for_update() returns a lazy queryset; without
        # evaluating it no rows are actually locked — confirm this mutex
        # behaves as intended.
        with transaction.atomic():
            ComputationalResult.objects.select_for_update()
            result.save()
            job_context["quant_result"] = result
            quant_file.result = result
            quant_file.save()

        job_context["result"] = result
        job_context["pipeline"].steps.append(result.id)

        SampleResultAssociation.objects.get_or_create(
            sample=job_context["sample"], result=result)

        salmon_quant_archive.result = result
        salmon_quant_archive.save()
        job_context["computed_files"].append(salmon_quant_archive)

        kv = ComputationalResultAnnotation()
        kv.data = {
            "index_length": job_context["index_length"],
            "index_length_get": job_context.get("index_length_raw", None),
        }
        kv.result = result
        kv.is_public = True
        kv.save()

        # Best-effort annotations from Salmon's JSON outputs; parse failures
        # are logged but do not fail the job.
        try:
            with open(
                    os.path.join(job_context["output_directory"],
                                 "lib_format_counts.json")) as lfc_file:
                format_count_data = json.load(lfc_file)
            kv = ComputationalResultAnnotation()
            kv.data = format_count_data
            kv.result = result
            kv.is_public = True
            kv.save()
        except Exception:
            # See: https://github.com/AlexsLemonade/refinebio/issues/1167
            logger.exception(
                "Error parsing Salmon lib_format_counts JSON output!",
                processor_job=job_context["job_id"],
            )

        try:
            with open(
                    os.path.join(job_context["output_directory"], "aux_info",
                                 "meta_info.json")) as mi_file:
                meta_info = json.load(mi_file)
            kv = ComputationalResultAnnotation()
            kv.data = meta_info
            kv.result = result
            kv.is_public = True
            kv.save()
        except Exception:
            # See: https://github.com/AlexsLemonade/refinebio/issues/1167
            logger.exception("Error parsing Salmon meta_info JSON output!",
                             processor_job=job_context["job_id"])

        job_context["success"] = True

    return job_context
def _run_tximport_for_experiment(job_context: Dict, experiment: Experiment,
                                 quant_files: List[ComputedFile]) -> Dict:
    """Run tximport.R over all of an experiment's quant.sf files and split
    the resulting TPM matrix into per-sample smashable ComputedFiles.

    Raises utils.ProcessorJobError if the R subprocess cannot be run or
    exits non-zero, or if the TXIMPORT processor cannot be found.
    """
    # Download all the quant.sf fles for this experiment. Write all
    # their paths to a file so we can pass a path to that to
    # tximport.R rather than having to pass in one argument per
    # sample.
    tximport_path_list_file = job_context["work_dir"] + "tximport_inputs.txt"
    quant_file_paths = {}
    with open(tximport_path_list_file, "w") as input_list:
        for quant_file in quant_files:
            # We create a directory in the work directory for each (quant.sf) file, as
            # tximport assigns column names based on the parent directory name,
            # and we need those names so that we can reassociate withe samples later.
            # ex., a file with absolute_file_path: /processor_job_1/SRR123_output/quant.sf
            #      downloads to: /processor_job_2/SRR123_output/quant.sf
            # So the result file has frame "SRR123_output", which we can associate with sample SRR123
            sample_output = (
                job_context["work_dir"] +
                str(quant_file.absolute_file_path.split("/")[-2]) + "/")
            os.makedirs(sample_output, exist_ok=True)
            quant_work_path = sample_output + quant_file.filename
            quant_file_path = quant_file.get_synced_file_path(
                path=quant_work_path)
            input_list.write(quant_file_path + "\n")
            # Sizes are kept only for error reporting below.
            quant_file_paths[quant_file_path] = os.stat(
                quant_file_path).st_size

    rds_filename = "txi_out.RDS"
    rds_file_path = job_context["work_dir"] + rds_filename
    tpm_filename = "gene_lengthScaledTPM.tsv"
    tpm_file_path = job_context["work_dir"] + tpm_filename

    result = ComputationalResult()
    cmd_tokens = [
        "/usr/bin/Rscript",
        "--vanilla",
        "/home/user/data_refinery_workers/processors/tximport.R",
        "--file_list",
        tximport_path_list_file,
        "--gene2txmap",
        job_context["genes_to_transcripts_path"],
        "--rds_file",
        rds_file_path,
        "--tpm_file",
        tpm_file_path,
    ]
    result.time_start = timezone.now()
    logger.debug(
        "Running tximport with: %s",
        str(cmd_tokens),
        processor_job=job_context["job_id"],
        experiment=experiment.id,
    )

    try:
        tximport_result = subprocess.run(cmd_tokens,
                                         stdout=subprocess.PIPE,
                                         stderr=subprocess.PIPE)
    except Exception as e:
        raise utils.ProcessorJobError(
            "Encountered error in R code while running tximport.R: {}".format(
                str(e)),
            success=False,
            experiment=experiment.id,
        )

    if tximport_result.returncode != 0:
        raise utils.ProcessorJobError(
            "Found non-zero exit code from R code while running tximport.R: {}"
            .format(tximport_result.stderr.decode().strip()),
            success=False,
            experiment=experiment.id,
            quant_files=quant_files,
            cmd_tokens=cmd_tokens,
            quant_file_paths=quant_file_paths,
        )

    result.time_end = timezone.now()
    result.commands.append(" ".join(cmd_tokens))
    result.is_ccdl = True
    try:
        processor_key = "TXIMPORT"
        result.processor = utils.find_processor(processor_key)
    except Exception as e:
        raise utils.ProcessorJobError("Failed to set processor: {}".format(e),
                                      success=False,
                                      processor_key=processor_key)
    result.save()
    job_context["pipeline"].steps.append(result.id)

    # The RDS file is kept as a non-smashable artifact of the whole run.
    rds_file = ComputedFile()
    rds_file.absolute_file_path = rds_file_path
    rds_file.filename = rds_filename
    rds_file.result = result
    rds_file.is_smashable = False
    rds_file.is_qc = False
    rds_file.is_public = True
    rds_file.calculate_sha1()
    rds_file.calculate_size()
    rds_file.save()
    job_context["computed_files"].append(rds_file)

    # Split the tximport result into smashable subfiles
    data = pd.read_csv(tpm_file_path, sep="\t", header=0, index_col=0)
    individual_files = []
    frames = np.split(data, len(data.columns), axis=1)
    for frame in frames:
        # Create sample-specific TPM file.
        sample_file_name = frame.columns.values[0] + "_" + tpm_filename
        frame_path = os.path.join(job_context["work_dir"], sample_file_name)
        frame.to_csv(frame_path, sep="\t", encoding="utf-8")

        # The frame column header is based off of the path, which includes _output.
        sample_accession_code = frame.columns.values[0].replace("_output", "")
        sample = Sample.objects.get(accession_code=sample_accession_code)

        computed_file = ComputedFile()
        computed_file.absolute_file_path = frame_path
        computed_file.filename = sample_file_name
        computed_file.result = result
        computed_file.is_smashable = True
        computed_file.is_qc = False
        computed_file.is_public = True
        computed_file.calculate_sha1()
        computed_file.calculate_size()
        computed_file.save()
        job_context["computed_files"].append(computed_file)
        job_context["smashable_files"].append(computed_file)

        SampleResultAssociation.objects.get_or_create(sample=sample,
                                                      result=result)

        # Create association with the RDS file.
        SampleComputedFileAssociation.objects.get_or_create(
            sample=sample, computed_file=rds_file)

        # Create association with TPM file.
        SampleComputedFileAssociation.objects.get_or_create(
            sample=sample, computed_file=computed_file)

        individual_files.append(computed_file)
        job_context["samples"].append(sample)

    # Salmon-processed samples aren't marked as is_processed
    # until they are fully tximported, this value sets that
    # for the end_job function.
    job_context["tximported"] = True
    job_context["individual_files"] = individual_files
    return job_context
def _run_multiqc(job_context: Dict) -> Dict:
    """Runs the `MultiQC` package to generate the QC report.

    Produces two QC ComputedFiles (multiqc_data.zip and
    multiqc_report.html), records the run as a ComputationalResult, and
    stores them in job_context["qc_files"] / "computed_files".

    TODO: These seem to consume a lot of RAM, even for small files. We should
    consider tuning these or breaking them out into their
    own processors. JVM settings may reduce RAM footprint.
    """
    command_str = ("multiqc {input_directory} --outdir {qc_directory} --zip-data-dir")
    formatted_command = command_str.format(input_directory=job_context["qc_input_directory"],
                                           qc_directory=job_context["qc_directory"])

    logger.debug("Running MultiQC using the following shell command: %s",
                 formatted_command,
                 processor_job=job_context["job_id"])

    qc_env = os.environ.copy()
    # MultiQC requires a UTF-8 locale to run.
    qc_env["LC_ALL"] = "C.UTF-8"
    qc_env["LANG"] = "C.UTF-8"

    time_start = timezone.now()
    completed_command = subprocess.run(formatted_command.split(),
                                       stdout=subprocess.PIPE,
                                       stderr=subprocess.PIPE,
                                       env=qc_env)
    time_end = timezone.now()

    if completed_command.returncode != 0:
        stderr = completed_command.stderr.decode().strip()
        error_start = stderr.upper().find("ERROR:")
        error_start = error_start if error_start != -1 else 0
        logger.error("Shell call to MultiQC failed with error message: %s",
                     stderr[error_start:],
                     processor_job=job_context["job_id"])
        job_context["job"].failure_reason = ("Shell call to MultiQC failed because: "
                                             + stderr[error_start:])
        job_context["success"] = False
        # NOTE(review): there is no early return here, so the result and
        # computed-file objects below are still created even when MultiQC
        # failed — confirm this is intended.

    result = ComputationalResult()
    result.commands.append(formatted_command)
    result.time_start = time_start
    result.time_end = time_end
    result.is_ccdl = True
    try:
        processor_key = "MULTIQC"
        result.processor = utils.find_processor(processor_key)
    except Exception as e:
        return utils.handle_processor_exception(job_context, processor_key, e)
    result.save()
    job_context['pipeline'].steps.append(result.id)

    assoc = SampleResultAssociation()
    assoc.sample = job_context["sample"]
    assoc.result = result
    assoc.save()

    job_context['qc_result'] = result

    data_file = ComputedFile()
    data_file.filename = "multiqc_data.zip"  # This is deterministic
    data_file.absolute_file_path = os.path.join(job_context["qc_directory"], data_file.filename)
    data_file.calculate_sha1()
    data_file.calculate_size()
    data_file.is_public = True
    data_file.result = job_context['qc_result']
    data_file.is_smashable = False
    data_file.is_qc = True
    data_file.save()
    job_context['computed_files'].append(data_file)

    SampleComputedFileAssociation.objects.get_or_create(
        sample=job_context["sample"], computed_file=data_file)

    report_file = ComputedFile()
    report_file.filename = "multiqc_report.html"  # This is deterministic
    report_file.absolute_file_path = os.path.join(job_context["qc_directory"], report_file.filename)
    report_file.calculate_sha1()
    report_file.calculate_size()
    report_file.is_public = True
    report_file.is_smashable = False
    report_file.is_qc = True
    report_file.result = job_context['qc_result']
    report_file.save()
    job_context['computed_files'].append(report_file)

    job_context['qc_files'] = [data_file, report_file]

    return job_context
def _run_salmon(job_context: Dict) -> Dict:
    """Runs Salmon Quant.

    Builds a paired-end or single-end salmon command depending on whether
    "input_file_path_2" is present in job_context, runs it, and on success
    records the ComputationalResult, archives the output directory,
    optionally uploads quant.sf to S3, and triggers tximport for any
    experiments that are ready for it.
    """
    logger.debug("Running Salmon..")

    # Salmon needs to be run differently for different sample types.
    if "input_file_path_2" in job_context:
        second_read_str = " -2 {}".format(job_context["input_file_path_2"])

        # Rob recommends 16 threads/process, which fits snugly on an x1 at 8GB RAM per Salmon container:
        # (2 threads/core * 16 cores/socket * 64 vCPU) / (1TB/8GB) = ~17
        command_str = ("salmon --no-version-check quant -l A --biasSpeedSamp 5 -i {index}"
                       " -1 {input_one}{second_read_str} -p 16 -o {output_directory}"
                       " --seqBias --gcBias --dumpEq --writeUnmappedNames")
        formatted_command = command_str.format(index=job_context["index_directory"],
                                               input_one=job_context["input_file_path"],
                                               second_read_str=second_read_str,
                                               output_directory=job_context["output_directory"])
    else:
        # Related: https://github.com/COMBINE-lab/salmon/issues/83
        command_str = ("salmon --no-version-check quant -l A -i {index}"
                       " -r {input_one} -p 16 -o {output_directory}"
                       " --seqBias --dumpEq --writeUnmappedNames")
        formatted_command = command_str.format(index=job_context["index_directory"],
                                               input_one=job_context["input_file_path"],
                                               output_directory=job_context["output_directory"])

    logger.debug("Running Salmon Quant using the following shell command: %s",
                 formatted_command,
                 processor_job=job_context["job_id"])

    job_context['time_start'] = timezone.now()
    completed_command = subprocess.run(formatted_command.split(),
                                       stdout=subprocess.PIPE,
                                       stderr=subprocess.PIPE)
    job_context['time_end'] = timezone.now()

    ## To me, this looks broken: error codes are anything non-zero.
    ## However, Salmon (seems) to output with negative status codes
    ## even with successful executions.
    ## Possibly related: https://github.com/COMBINE-lab/salmon/issues/55
    if completed_command.returncode == 1:
        stderr = completed_command.stderr.decode().strip()
        # Trim stderr down to the first "ERROR:" marker if one exists.
        error_start = stderr.upper().find("ERROR:")
        error_start = error_start if error_start != -1 else 0
        logger.error("Shell call to salmon failed with error message: %s",
                     stderr[error_start:],
                     processor_job=job_context["job_id"])

        job_context["job"].failure_reason = ("Shell call to salmon failed because: "
                                             + stderr[error_start:])
        job_context["success"] = False
    else:
        result = ComputationalResult()
        result.commands.append(formatted_command)
        result.time_start = job_context['time_start']
        result.time_end = job_context['time_end']
        result.organism_index = job_context["organism_index"]
        result.is_ccdl = True

        try:
            processor_key = "SALMON_QUANT"
            result.processor = utils.find_processor(processor_key)
        except Exception as e:
            return utils.handle_processor_exception(job_context, processor_key, e)

        # Zip up the output of Salmon Quant
        try:
            with tarfile.open(job_context['output_archive'], "w:gz") as tar:
                tar.add(job_context["output_directory"], arcname=os.sep)
        except Exception:
            logger.exception("Exception caught while zipping processed directory %s",
                             job_context["output_directory"],
                             processor_job=job_context["job_id"]
                             )
            failure_template = "Exception caught while zipping processed directory {}"
            job_context["job"].failure_reason = failure_template.format(job_context['output_archive'])
            job_context["success"] = False
            return job_context

        salmon_quant_archive = ComputedFile()
        salmon_quant_archive.absolute_file_path = job_context["output_archive"]
        salmon_quant_archive.filename = os.path.split(job_context["output_archive"])[-1]
        salmon_quant_archive.calculate_sha1()
        salmon_quant_archive.calculate_size()
        salmon_quant_archive.is_public = True
        salmon_quant_archive.is_smashable = False
        salmon_quant_archive.is_qc = False

        quant_file = ComputedFile()
        quant_file.s3_bucket = S3_BUCKET_NAME
        quant_file.s3_key = "quant_files/sample_" + str(job_context["sample"].id) + "_quant.sf"
        quant_file.filename = "quant.sf"
        quant_file.absolute_file_path = job_context["output_directory"] + "quant.sf"
        quant_file.is_public = False
        quant_file.is_smashable = False
        quant_file.is_qc = False
        quant_file.calculate_sha1()
        quant_file.calculate_size()

        # If we're running in the cloud we need to upload the quant.sf
        # file so that it can be used by a job running on any machine
        # to run tximport. We can't use sync_to_s3 though because we
        # have to sync it before we can save the file so it cannot be
        # discovered by other jobs before it is uploaded.
        if settings.RUNNING_IN_CLOUD:
            try:
                S3.upload_file(
                    quant_file.absolute_file_path,
                    quant_file.s3_bucket,
                    quant_file.s3_key,
                    ExtraArgs={
                        'ACL': 'public-read',
                        'StorageClass': 'STANDARD_IA'
                    }
                )
            except Exception as e:
                logger.exception(e, processor_job=job_context["job_id"],
                                 sample=job_context["sample"].id)
                failure_template = "Exception caught while uploading quantfile to S3: {}"
                job_context["job"].failure_reason = failure_template.format(quant_file.absolute_file_path)
                job_context["success"] = False
                return job_context

        # Here select_for_update() is used as a mutex that forces multiple
        # jobs to execute this block of code in serial manner. See:
        # https://docs.djangoproject.com/en/1.11/ref/models/querysets/#select-for-update
        # Theorectically any rows in any table can be locked here, we're
        # locking all existing rows in ComputationalResult table.
        with transaction.atomic():
            ComputationalResult.objects.select_for_update()
            result.save()
            job_context["quant_result"] = result
            quant_file.result = result
            quant_file.save()

        job_context["result"] = result
        job_context['pipeline'].steps.append(result.id)

        SampleResultAssociation.objects.get_or_create(sample=job_context['sample'],
                                                      result=result)

        salmon_quant_archive.result = result
        salmon_quant_archive.save()
        job_context['computed_files'].append(salmon_quant_archive)

        tximport_inputs = _get_tximport_inputs(job_context)

        # tximport analysis is done outside of the transaction so that
        # the mutex wouldn't hold the other jobs too long.
        for experiment, quant_files in tximport_inputs.items():
            _tximport(job_context, experiment, quant_files)
            # If `tximport` on any related experiment fails, exit immediately.
            if not job_context["success"]:
                return job_context

        # Record the index length plus salmon's own metadata files
        # (lib_format_counts.json, aux_info/meta_info.json) as annotations
        # on the result.
        kv = ComputationalResultAnnotation()
        kv.data = {"index_length": job_context["index_length"]}
        kv.result = result
        kv.is_public = True
        kv.save()

        with open(os.path.join(job_context['output_directory'],
                               'lib_format_counts.json')) as lfc_file:
            format_count_data = json.load(lfc_file)
        kv = ComputationalResultAnnotation()
        kv.data = format_count_data
        kv.result = result
        kv.is_public = True
        kv.save()

        with open(os.path.join(job_context['output_directory'],
                               'aux_info', 'meta_info.json')) as mi_file:
            meta_info = json.load(mi_file)
        kv = ComputationalResultAnnotation()
        kv.data = meta_info
        kv.result = result
        kv.is_public = True
        kv.save()

        job_context["success"] = True

    return job_context
def _tximport(job_context: Dict, experiment: Experiment, quant_files: List[ComputedFile]) -> Dict:
    """Run tximport R script based on input quant files and the path
    of genes_to_transcripts.txt.

    On success this records a TXIMPORT ComputationalResult, saves the RDS
    output plus one smashable per-sample TPM file, associates them with
    every sample in the experiment, and deletes the temporary quant.sf
    copies (local and S3).
    """
    # Download all the quant.sf fles for this experiment. Write all
    # their paths to a file so we can pass a path to that to
    # tximport.R rather than having to pass in one argument per
    # sample.
    tximport_path_list_file = job_context["work_dir"] + "tximport_inputs.txt"
    with open(tximport_path_list_file, "w") as input_list:
        for quant_file in quant_files:
            input_list.write(quant_file.get_synced_file_path() + "\n")

    rds_filename = "txi_out.RDS"
    rds_file_path = job_context["work_dir"] + rds_filename
    tpm_filename = "gene_lengthScaledTPM.tsv"
    tpm_file_path = job_context["work_dir"] + tpm_filename
    result = ComputationalResult()
    cmd_tokens = [
        "/usr/bin/Rscript", "--vanilla",
        "/home/user/data_refinery_workers/processors/tximport.R",
        "--file_list", tximport_path_list_file,
        "--gene2txmap", job_context["genes_to_transcripts_path"],
        "--rds_file", rds_file_path,
        "--tpm_file", tpm_file_path
    ]
    result.time_start = timezone.now()

    logger.debug("Running tximport with: %s",
                 str(cmd_tokens),
                 processor_job=job_context['job_id'],
                 experiment=experiment.id)

    try:
        tximport_result = subprocess.run(cmd_tokens, stdout=subprocess.PIPE,
                                         stderr=subprocess.PIPE)
    except Exception as e:
        error_template = ("Encountered error in R code while running tximport.R: {}")
        error_message = error_template.format(str(e))
        logger.error(error_message, processor_job=job_context["job_id"],
                     experiment=experiment.id)
        job_context["job"].failure_reason = error_message
        job_context["success"] = False
        return job_context

    if tximport_result.returncode != 0:
        error_template = ("Found non-zero exit code from R code while running tximport.R: {}")
        error_message = error_template.format(tximport_result.stderr.decode().strip())
        logger.error(error_message, processor_job=job_context["job_id"],
                     experiment=experiment.id)
        job_context["job"].failure_reason = error_message
        job_context["success"] = False
        return job_context

    result.time_end = timezone.now()
    result.commands.append(" ".join(cmd_tokens))
    result.is_ccdl = True
    try:
        processor_key = "TXIMPORT"
        result.processor = utils.find_processor(processor_key)
    except Exception as e:
        return utils.handle_processor_exception(job_context, processor_key, e)

    result.save()
    job_context['pipeline'].steps.append(result.id)

    # Associate this result with all samples in this experiment.
    # TODO: This may not be completely sensible, because `tximport` is
    # done at experiment level, not at sample level.
    # Could be very problematic if SRA's data model allows many
    # Experiments to one Run.
    # https://github.com/AlexsLemonade/refinebio/issues/297
    for sample in experiment.samples.all():
        s_r = SampleResultAssociation(sample=sample, result=result)
        s_r.save()

    rds_file = ComputedFile()
    rds_file.absolute_file_path = rds_file_path
    rds_file.filename = rds_filename
    rds_file.result = result
    rds_file.is_smashable = False
    rds_file.is_qc = False
    rds_file.is_public = True
    rds_file.calculate_sha1()
    rds_file.calculate_size()
    rds_file.save()
    job_context['computed_files'].append(rds_file)

    # Split the tximport result into smashable subfiles
    # (one single-column frame per sample/column of the TPM matrix).
    data = pd.read_csv(tpm_file_path, sep='\t', header=0, index_col=0)
    individual_files = []
    frames = np.split(data, len(data.columns), axis=1)
    for frame in frames:
        # Create sample-specific TPM file.
        sample_file_name = frame.columns.values[0] + '_' + tpm_filename
        frame_path = os.path.join(job_context["work_dir"], sample_file_name)
        frame.to_csv(frame_path, sep='\t', encoding='utf-8')

        # The frame column header is based off of the path, which includes _output.
        sample = Sample.objects.get(accession_code=frame.columns.values[0].replace("_output", ""))

        computed_file = ComputedFile()
        computed_file.absolute_file_path = frame_path
        computed_file.filename = sample_file_name
        computed_file.result = result
        computed_file.is_smashable = True
        computed_file.is_qc = False
        computed_file.is_public = True
        computed_file.calculate_sha1()
        computed_file.calculate_size()
        computed_file.save()
        job_context['computed_files'].append(computed_file)
        job_context['smashable_files'].append(computed_file)

        SampleResultAssociation.objects.get_or_create(
            sample=sample,
            result=result)

        # Create association with the RDS file.
        SampleComputedFileAssociation.objects.get_or_create(
            sample=sample,
            computed_file=rds_file)

        # Create association with TPM file.
        SampleComputedFileAssociation.objects.get_or_create(
            sample=sample,
            computed_file=computed_file)

        individual_files.append(computed_file)
        job_context['samples'].append(sample)

    # Clean up quant.sf files that were created just for this.
    for quant_file in quant_files:
        quant_file.delete_s3_file()

        # It's only okay to delete the local file because the full
        # output directory has already been zipped up.
        quant_file.delete_local_file()
        quant_file.delete()

    # Salmon-processed samples aren't marked as is_processed
    # until they are fully tximported, this value sets that
    # for the end_job function.
    job_context['tximported'] = True
    job_context['individual_files'] = individual_files
    return job_context
def _create_result_objects(job_context: Dict) -> Dict:
    """ Store and host the result as a ComputationalResult object.

    Writes the merged/QN'd compendia dataframe to a TSV, annotates the
    result, zips the output directory (with README and LICENSE), records
    a versioned CompendiumResult, and uploads the archive to S3.
    """
    result_start = log_state("start create result object", job_context["job"].id)
    result = ComputationalResult()
    result.commands.append(" ".join(job_context["formatted_command"]))
    result.is_ccdl = True
    # Temporary until we re-enable the QN test step.
    result.is_public = False
    result.time_start = job_context["time_start"]
    result.time_end = job_context["time_end"]
    try:
        processor_key = "CREATE_COMPENDIA"
        result.processor = utils.find_processor(processor_key)
    except Exception as e:
        return utils.handle_processor_exception(job_context, processor_key, e)
    result.save()

    # Write the compendia dataframe to a file
    job_context["csv_outfile"] = job_context["output_dir"] + job_context[
        "organism_name"] + ".tsv"
    job_context["merged_qn"].to_csv(job_context["csv_outfile"],
                                    sep="\t",
                                    encoding="utf-8")

    organism_key = list(job_context["samples"].keys())[0]
    annotation = ComputationalResultAnnotation()
    annotation.result = result
    annotation.data = {
        "organism_id": job_context["samples"][organism_key][0].organism_id,
        "organism_name": job_context["organism_name"],
        "is_qn": False,
        "is_compendia": True,
        "samples": [
            sample.accession_code
            for sample in job_context["samples"][organism_key]
        ],
        "num_samples": len(job_context["samples"][organism_key]),
        "experiment_accessions":
        [e.accession_code for e in job_context["experiments"]],
        "total_percent_imputed": job_context["total_percent_imputed"],
    }
    annotation.save()

    # Create the resulting archive
    final_zip_base = SMASHING_DIR + str(
        job_context["dataset"].pk) + "_compendia"
    # Copy LICENSE.txt and correct README.md files.
    if job_context["dataset"].quant_sf_only:
        readme_file = "/home/user/README_QUANT.md"
    else:
        readme_file = "/home/user/README_NORMALIZED.md"
    shutil.copy(readme_file, job_context["output_dir"] + "/README.md")
    shutil.copy("/home/user/LICENSE_DATASET.txt",
                job_context["output_dir"] + "/LICENSE.TXT")
    archive_path = shutil.make_archive(final_zip_base, "zip",
                                       job_context["output_dir"])

    archive_computed_file = ComputedFile()
    archive_computed_file.absolute_file_path = archive_path
    archive_computed_file.filename = archive_path.split("/")[-1]
    archive_computed_file.calculate_sha1()
    archive_computed_file.calculate_size()
    archive_computed_file.is_smashable = False
    archive_computed_file.is_qn_target = False
    archive_computed_file.result = result
    archive_computed_file.save()

    # Compendia Result Helpers
    primary_organism = Organism.get_object_for_name(
        job_context["primary_organism"])
    organisms = [
        Organism.get_object_for_name(organism)
        for organism in job_context["all_organisms"]
    ]
    # Version number is one more than the count of existing normalized
    # compendia for this organism.
    compendium_version = (CompendiumResult.objects.filter(
        primary_organism=primary_organism, quant_sf_only=False).count() + 1)
    # Save Compendia Result
    compendium_result = CompendiumResult()
    compendium_result.quant_sf_only = job_context["dataset"].quant_sf_only
    compendium_result.svd_algorithm = job_context["dataset"].svd_algorithm
    compendium_result.compendium_version = compendium_version
    compendium_result.result = result
    compendium_result.primary_organism = primary_organism
    compendium_result.save()

    # create relations to all organisms contained in the compendia
    compendium_result_organism_associations = []
    for compendium_organism in organisms:
        compendium_result_organism_association = CompendiumResultOrganismAssociation(
        )
        compendium_result_organism_association.compendium_result = compendium_result
        compendium_result_organism_association.organism = compendium_organism
        compendium_result_organism_associations.append(
            compendium_result_organism_association)

    CompendiumResultOrganismAssociation.objects.bulk_create(
        compendium_result_organism_associations)

    job_context["compendium_result"] = compendium_result

    logger.info("Compendium created!",
                archive_path=archive_path,
                organism_name=job_context["organism_name"])

    # Upload the result to S3
    timestamp = str(int(time.time()))
    key = job_context["organism_name"] + "_" + str(
        compendium_version) + "_" + timestamp + ".zip"
    uploaded_to_s3 = archive_computed_file.sync_to_s3(S3_COMPENDIA_BUCKET_NAME,
                                                      key)

    if not uploaded_to_s3:
        raise utils.ProcessorJobError(
            "Failed to upload compendia to S3",
            success=False,
            computed_file_id=archive_computed_file.id,
        )

    if settings.RUNNING_IN_CLOUD:
        archive_computed_file.delete_local_file()

    job_context["result"] = result
    job_context["success"] = True

    log_state("end create result object", job_context["job"].id, result_start)

    # TEMPORARY for iterating on compendia more quickly.
    # Reset this so the end_job does clean up the job's non-input-data stuff.
    job_context["work_dir"] = job_context["old_work_dir"]

    return job_context
def _create_result_objects(job_context: Dict) -> Dict:
    """ Store and host the result as a ComputationalResult object.

    Quantpendia variant: wraps the pre-built archive at
    job_context["archive_path"] in a ComputedFile, records a versioned
    quant-sf-only CompendiumResult, and uploads the archive to S3.
    """
    archive_path = job_context["archive_path"]
    compendia_organism = _get_organisms(job_context["samples"]).first()
    compendia_version = _get_next_compendia_version(compendia_organism)

    result = ComputationalResult()
    result.commands.append(" ".join(job_context["formatted_command"]))
    result.is_ccdl = True
    result.is_public = True
    result.time_start = job_context["time_start"]
    result.time_end = job_context["time_end"]
    try:
        processor_key = "CREATE_QUANTPENDIA"
        result.processor = utils.find_processor(processor_key)
    except Exception as e:
        return utils.handle_processor_exception(job_context, processor_key, e)
    result.save()

    archive_computed_file = ComputedFile()
    archive_computed_file.absolute_file_path = archive_path
    archive_computed_file.filename = FileUtils.get_filename(archive_path)
    archive_computed_file.calculate_sha1()
    archive_computed_file.calculate_size()
    archive_computed_file.is_smashable = False
    archive_computed_file.is_qn_target = False
    archive_computed_file.result = result
    archive_computed_file.is_compendia = True
    archive_computed_file.quant_sf_only = True
    archive_computed_file.compendia_organism = compendia_organism
    archive_computed_file.compendia_version = compendia_version
    archive_computed_file.save()

    compendium_result = CompendiumResult()
    compendium_result.quant_sf_only = True
    compendium_result.result = result
    compendium_result.primary_organism = compendia_organism
    compendium_result.compendium_version = compendia_version
    compendium_result.save()

    logger.info(
        "Quantpendia created! Uploading to S3.",
        job_id=job_context["job_id"],
        archive_path=archive_path,
        organism_name=compendia_organism.name,
        **get_process_stats()
    )

    # Upload the result to S3
    timestamp = str(int(time.time()))
    s3_key = compendia_organism.name + "_" + str(compendia_version) + "_" + timestamp + ".zip"
    uploaded_to_s3 = archive_computed_file.sync_to_s3(S3_COMPENDIA_BUCKET_NAME, s3_key)

    if not uploaded_to_s3:
        raise utils.ProcessorJobError(
            "Failed to upload compendia to S3",
            success=False,
            computed_file_id=archive_computed_file.id,
        )

    # The archive lives in S3 now; keep disk usage down on cloud workers.
    if settings.RUNNING_IN_CLOUD:
        archive_computed_file.delete_local_file()

    job_context["result"] = result
    job_context["success"] = True
    return job_context
def _populate_index_object(job_context: Dict) -> Dict:
    """Create the ComputationalResult, ComputedFile and OrganismIndex
    records for a freshly built transcriptome index, and upload the
    index archive to the transcriptome-index S3 bucket when configured.
    """
    result = ComputationalResult()
    result.commands.append(job_context["salmon_formatted_command"])
    try:
        processor_key = "TX_INDEX"
        result.processor = utils.find_processor(processor_key)
    except Exception as e:
        return utils.handle_processor_exception(job_context, processor_key, e)

    result.is_ccdl = True
    result.time_start = job_context["time_start"]
    result.time_end = job_context["time_end"]
    result.save()
    job_context['pipeline'].steps.append(result.id)

    computed_file = ComputedFile()
    computed_file.absolute_file_path = job_context["computed_archive"]
    computed_file.filename = os.path.split(job_context["computed_archive"])[-1]
    computed_file.calculate_sha1()
    computed_file.calculate_size()
    computed_file.result = result
    computed_file.is_smashable = False
    computed_file.is_qc = False
    computed_file.save()

    organism_object = Organism.get_object_for_name(job_context['organism_name'])
    index_object = OrganismIndex()
    index_object.organism = organism_object
    index_object.source_version = job_context["assembly_version"]
    index_object.assembly_name = job_context["assembly_name"]
    index_object.salmon_version = job_context["salmon_version"]
    index_object.index_type = "TRANSCRIPTOME_" + job_context['length'].upper()
    # This is where the index will be extracted to.
    index_object.absolute_directory_path = LOCAL_ROOT_DIR + "/TRANSCRIPTOME_INDEX/" \
                                           + organism_object.name + "/" + job_context['length']
    index_object.result = result

    if S3_TRANSCRIPTOME_INDEX_BUCKET_NAME:
        logger.info("Uploading %s %s to s3",
                    job_context['organism_name'],
                    job_context['length'],
                    processor_job=job_context["job_id"])
        timestamp = str(timezone.now().timestamp()).split('.')[0]
        s3_key = organism_object.name + '_' + index_object.index_type + "_" + timestamp + '.tar.gz'
        sync_result = computed_file.sync_to_s3(S3_TRANSCRIPTOME_INDEX_BUCKET_NAME, s3_key)
        # Only drop the local copy once the upload actually succeeded.
        if sync_result:
            computed_file.delete_local_file()
    else:
        logger.warn("S3_TRANSCRIPTOME_INDEX_BUCKET_NAME not configured, therefore %s %s will not be uploaded.",
                    job_context['organism_name'],
                    job_context['length'],
                    processor_job=job_context["job_id"])

    index_object.save()

    # We uploaded the file ourselves since we wanted it to go to a
    # different bucket than end_job would put it in, therefore empty
    # this list so end_job doesn't try to upload it again.
    job_context['computed_files'] = []
    job_context['result'] = result
    job_context['computed_file'] = computed_file
    job_context['index'] = index_object

    # If there's not a long and a short index for this organism yet,
    # don't delete the input.
    # XXX: This will break once we introduce additional versions of these.
    short_indices = OrganismIndex.objects.filter(organism=organism_object,
                                                 index_type="TRANSCRIPTOME_SHORT",
                                                 source_version=job_context["assembly_version"])
    long_indices = OrganismIndex.objects.filter(organism=organism_object,
                                                index_type="TRANSCRIPTOME_LONG",
                                                source_version=job_context["assembly_version"])
    if short_indices.count() < 1 or long_indices.count() < 1:
        # utils.end_job deletes these, so remove them so it doesn't.
        job_context["original_files"] = []

    return job_context
def run_tximport_at_progress_point(complete_accessions: List[str],
                                   incomplete_accessions: List[str]) -> Dict:
    """Create an experiment and associated objects and run tximport on it.

    Creates a sample for each accession contained in either input
    list. The samples in complete_accessions will be simlulated as
    already having salmon quant run on them. The samples in
    incomplete_accessions won't.

    NOTE(review): this test helper hard-codes accession SRR5125622 as
    the "current" sample and expects pre-extracted fixture data under
    /home/user/data_store/salmon_tests/ — verify fixtures exist before
    reusing it elsewhere.
    """
    # Create the experiment
    experiment_accession = 'SRP095529'
    data_dir = '/home/user/data_store/salmon_tests/'
    experiment_dir = data_dir + experiment_accession
    experiment = Experiment.objects.create(accession_code=experiment_accession)

    zebrafish = Organism.get_object_for_name("DANIO_RERIO")

    # This is a lie, but this image doesn't have the dependencies for TRANSCRIPTOME_INDEX
    computational_result_short = ComputationalResult(processor=utils.find_processor('SALMON_QUANT'))
    computational_result_short.save()

    organism_index = OrganismIndex()
    organism_index.index_type = "TRANSCRIPTOME_SHORT"
    organism_index.organism = zebrafish
    organism_index.result = computational_result_short
    organism_index.absolute_directory_path = "/home/user/data_store/salmon_tests/ZEBRAFISH_INDEX/SHORT"
    organism_index.save()

    comp_file = ComputedFile()
    # This path will not be used because we already have the files extracted.
    comp_file.absolute_file_path = "/home/user/data_store/salmon_tests/ZEBRAFISH_INDEX/SHORT/zebrafish_short.tar.gz"
    comp_file.result = computational_result_short
    comp_file.size_in_bytes=1337
    comp_file.sha1="ABC"
    comp_file.s3_key = "key"
    comp_file.s3_bucket = "bucket"
    comp_file.save()

    # Unprocessed samples: just the Sample row, no quant results.
    for accession_code in incomplete_accessions:
        last_sample = Sample.objects.create(
            accession_code=accession_code,
            organism=zebrafish,
            source_database='SRA',
            technology='RNA-SEQ'
        )
        ExperimentSampleAssociation.objects.create(experiment=experiment, sample=last_sample)

    # Create tximport result and files
    quant_processor = utils.find_processor("SALMON_QUANT")
    tximport_processor = utils.find_processor("TXIMPORT")

    # Create the already processed samples along with their
    # ComputationalResults and ComputedFiles. They don't need
    # original files for this test because we aren't going to run
    # salmon quant on them.
    for accession_code in complete_accessions:
        sample = Sample.objects.create(
            accession_code=accession_code,
            organism=zebrafish,
            source_database='SRA',
            technology='RNA-SEQ'
        )
        ExperimentSampleAssociation.objects.create(experiment=experiment, sample=sample)

        if accession_code == "SRR5125622":
            current_sample = sample

        # Create and associate quant result and files.
        quant_result = ComputationalResult()
        quant_result.is_ccdl = True
        quant_result.processor = quant_processor
        quant_result.save()

        # In prod the filename pattern will involve the timestamp
        # but here we're using the accession code so we can find
        # the archive file for the current sample.
        archive_filename = "result-" + accession_code + ".tar.gz"
        archive_file = ComputedFile()
        archive_file.filename = archive_filename
        archive_file.absolute_file_path = os.path.join(experiment_dir, archive_filename)
        archive_file.is_public = False
        archive_file.is_smashable = False
        archive_file.is_qc = False
        archive_file.result = quant_result
        archive_file.size_in_bytes = 12345
        archive_file.save()

        quant_file = ComputedFile()
        quant_file.filename = "quant.sf"
        quant_file.absolute_file_path = experiment_dir + "/quant_files/" + accession_code + "_output/quant.sf"
        quant_file.is_public = False
        quant_file.is_smashable = False
        quant_file.is_qc = False
        quant_file.result = quant_result
        quant_file.size_in_bytes = 12345
        quant_file.s3_bucket = "bucket"
        quant_file.s3_key = "key"
        quant_file.save()

        SampleResultAssociation.objects.get_or_create(
            sample=sample,
            result=quant_result
        )

    # Processor jobs need at least one original file associated with
    # them so they know what they're processing.
    current_og = OriginalFile()
    current_og.absolute_file_path = os.path.join(experiment_dir, 'SRR5125622.fastq.gz')
    current_og.filename = "SRR5125622.fastq.gz"
    current_og.save()

    OriginalFileSampleAssociation.objects.create(original_file=current_og, sample=current_sample).save()

    pj = ProcessorJob()
    pj.pipeline_applied = "TXIMPORT"
    pj.save()

    assoc1 = ProcessorJobOriginalFileAssociation()
    assoc1.original_file = current_og
    assoc1.processor_job = pj
    assoc1.save()

    # Prep our job context
    job_context = tximport._prepare_files({"job_dir_prefix": "TEST3",
                                           "job_id": "TEST3",
                                           "job": pj,
                                           "index_directory": organism_index.absolute_directory_path,
                                           "pipeline": Pipeline(name="Salmon"),
                                           "computed_files": [],
                                           "original_files": [current_og]})

    # We don't have the raw file to run _determine_index_length so
    # just pick one, it doesn't matter that much because we aren't
    # checking the output data.
    job_context["index_length"] = "short"

    job_context = salmon._find_or_download_index(job_context)
    job_context = salmon.get_tximport_inputs(job_context)
    job_context = salmon.tximport(job_context)

    return job_context