def _create_result_objects(job_context: Dict) -> Dict:
    """ Create the ComputationalResult objects after a Scan run is complete """
    result = ComputationalResult()
    result.commands.append("SCAN.UPC::SCAN_TwoColor")
    result.is_ccdl = True
    result.is_public = True
    result.time_start = job_context["time_start"]
    result.time_end = job_context["time_end"]

    try:
        processor_key = "AGILENT_TWOCOLOR"
        result.processor = utils.find_processor(processor_key)
    except Exception as e:
        return utils.handle_processor_exception(job_context, processor_key, e)

    result.save()
    job_context["pipeline"].steps.append(result.id)

    # Create a ComputedFile for the sample,
    # sync it to S3 and save it.
    try:
        computed_file = ComputedFile()
        computed_file.absolute_file_path = job_context["output_file_path"]
        computed_file.filename = os.path.split(job_context["output_file_path"])[-1]
        computed_file.calculate_sha1()
        computed_file.calculate_size()
        computed_file.result = result
        computed_file.is_smashable = True
        computed_file.is_qc = False
        computed_file.save()
        job_context["computed_files"].append(computed_file)
    except Exception:
        logger.exception(
            "Exception caught while moving file %s to S3",
            computed_file.filename,
            processor_job=job_context["job_id"],
        )
        failure_reason = "Exception caught while moving file to S3"
        job_context["job"].failure_reason = failure_reason
        job_context["success"] = False
        return job_context

    for sample in job_context["samples"]:
        assoc = SampleResultAssociation()
        assoc.sample = sample
        assoc.result = result
        assoc.save()

        SampleComputedFileAssociation.objects.get_or_create(
            sample=sample, computed_file=computed_file)

    logger.info("Created %s", result)
    job_context["success"] = True

    return job_context
def _create_result_objects(job_context: Dict) -> Dict:
    result = ComputationalResult()
    result.commands.append(" ".join(job_context['formatted_command']))
    result.is_ccdl = True
    result.is_public = True
    result.time_start = job_context['time_start']
    result.time_end = job_context['time_end']

    try:
        processor_key = "QN_REFERENCE"
        result.processor = utils.find_processor(processor_key)
    except Exception as e:
        return utils.handle_processor_exception(job_context, processor_key, e)

    result.save()

    computed_file = ComputedFile()
    computed_file.absolute_file_path = job_context['target_file']
    computed_file.filename = job_context['target_file'].split('/')[-1]
    computed_file.calculate_sha1()
    computed_file.calculate_size()
    computed_file.is_smashable = False
    computed_file.is_qn_target = True
    computed_file.result = result
    computed_file.save()

    annotation = ComputationalResultAnnotation()
    annotation.result = result
    annotation.data = {
        "organism_id": job_context['samples']['ALL'][0].organism_id,
        "is_qn": True,
        "platform_accession_code": job_context['samples']['ALL'][0].platform_accession_code,
        "samples": [sample.accession_code for sample in job_context["samples"]["ALL"]],
        "geneset": str(job_context["geneset"]),
        "num_valid_inputs": job_context["num_valid_inputs"]
    }
    annotation.save()

    # TODO: upload this to a public read bucket.
    # https://github.com/AlexsLemonade/refinebio/issues/586
    job_context['result'] = result
    job_context['computed_files'] = [computed_file]
    job_context['annotation'] = annotation
    job_context['success'] = True
    return job_context
def _create_result_objects(job_context: Dict) -> Dict:
    if not job_context["create_results"]:
        return job_context

    result = ComputationalResult()
    result.commands.append(" ".join(job_context["formatted_command"]))
    result.is_ccdl = True
    result.is_public = True
    result.time_start = job_context["time_start"]
    result.time_end = job_context["time_end"]

    try:
        processor_key = "QN_REFERENCE"
        result.processor = utils.find_processor(processor_key)
    except Exception as e:
        return utils.handle_processor_exception(job_context, processor_key, e)

    result.save()

    computed_file = ComputedFile()
    computed_file.absolute_file_path = job_context["target_file"]
    computed_file.filename = job_context["target_file"].split("/")[-1]
    computed_file.calculate_sha1()
    computed_file.calculate_size()
    computed_file.is_smashable = False
    computed_file.is_qn_target = True
    computed_file.result = result
    computed_file.save()

    annotation = ComputationalResultAnnotation()
    annotation.result = result
    annotation.data = {
        "organism_id": job_context["samples"]["ALL"][0].organism_id,
        "is_qn": True,
        "platform_accession_code": job_context["samples"]["ALL"][0].platform_accession_code,
        "samples": [sample.accession_code for sample in job_context["samples"]["ALL"]],
        "geneset": str(job_context["geneset"]),
        "num_valid_inputs": job_context["num_valid_inputs"],
    }
    annotation.save()

    job_context["result"] = result
    job_context["computed_files"] = [computed_file]
    job_context["annotation"] = annotation
    job_context["success"] = True
    return job_context
def _create_result_objects(job_context: Dict) -> Dict:
    """ Create the ComputationalResult objects after a Scan run is complete """
    result = ComputationalResult()
    result.commands.append('SCAN.UPC::SCANfast')
    result.is_ccdl = True
    result.is_public = True
    result.time_start = job_context['time_start']
    result.time_end = job_context['time_end']

    try:
        processor_key = "AFFYMETRIX_SCAN"
        result.processor = utils.find_processor(processor_key)
    except Exception as e:
        return utils.handle_processor_exception(job_context, processor_key, e)

    result.save()
    job_context['pipeline'].steps.append(result.id)

    # Create a ComputedFile for the sample
    computed_file = ComputedFile()
    computed_file.absolute_file_path = job_context["output_file_path"]
    computed_file.filename = os.path.split(job_context["output_file_path"])[-1]
    computed_file.calculate_sha1()
    computed_file.calculate_size()
    computed_file.result = result
    computed_file.is_smashable = True
    computed_file.is_qc = False
    computed_file.save()
    job_context['computed_files'].append(computed_file)

    for sample in job_context['samples']:
        assoc = SampleResultAssociation()
        assoc.sample = sample
        assoc.result = result
        assoc.save()

        SampleComputedFileAssociation.objects.get_or_create(
            sample=sample, computed_file=computed_file)

    logger.debug("Created %s", result, processor_job=job_context["job_id"])
    job_context["success"] = True

    return job_context
def _create_result(job_context: Dict) -> Dict:
    """ Create the actual Result object"""
    # This is a NO-OP, but we make a ComputationalResult regardless.
    result = ComputationalResult()
    result.commands.append(job_context["script_name"])
    result.is_ccdl = True

    try:
        processor_key = "SUBMITTER_PROCESSED"
        result.processor = utils.find_processor(processor_key)
    except Exception as e:
        return utils.handle_processor_exception(job_context, processor_key, e)

    result.save()
    job_context["pipeline"].steps.append(result.id)

    # Create a ComputedFile for the computed file,
    # sync it to S3 and save it.
    computed_file = ComputedFile()
    computed_file.absolute_file_path = job_context["output_file_path"]
    computed_file.filename = job_context["output_file_path"].split("/")[-1]
    computed_file.calculate_sha1()
    computed_file.calculate_size()
    computed_file.result = result
    computed_file.is_smashable = True
    computed_file.is_qc = False
    computed_file.save()

    # utils.end_job will sync this to S3 for us.
    job_context["computed_files"] = [computed_file]

    for sample in job_context["samples"]:
        assoc = SampleResultAssociation()
        assoc.sample = sample
        assoc.result = result
        assoc.save()

        SampleComputedFileAssociation.objects.get_or_create(
            sample=sample, computed_file=computed_file
        )

    logger.debug("Created %s", result)
    job_context["success"] = True

    return job_context
def _create_result_objects(job_context: Dict) -> Dict:
    """ Store and host the result as a ComputationalResult object. """
    result = ComputationalResult()
    result.commands.append(" ".join(job_context['formatted_command']))
    result.is_ccdl = True
    result.is_public = True
    result.time_start = job_context['time_start']
    result.time_end = job_context['time_end']

    try:
        processor_key = "COMPENDIA"
        result.processor = utils.find_processor(processor_key)
    except Exception as e:
        return utils.handle_processor_exception(job_context, processor_key, e)

    result.save()

    # Write the compendia dataframe to a file, overwriting the previous smash
    job_context['merged_qn'].to_csv(job_context['smash_outfile'], sep='\t', encoding='utf-8')

    compendia_tsv_computed_file = ComputedFile()
    compendia_tsv_computed_file.absolute_file_path = job_context['smash_outfile']
    compendia_tsv_computed_file.filename = job_context['smash_outfile'].split('/')[-1]
    compendia_tsv_computed_file.calculate_sha1()
    compendia_tsv_computed_file.calculate_size()
    compendia_tsv_computed_file.is_smashable = False
    compendia_tsv_computed_file.is_qn_target = False
    compendia_tsv_computed_file.result = result
    compendia_tsv_computed_file.save()

    organism_key = list(job_context['samples'].keys())[0]

    annotation = ComputationalResultAnnotation()
    annotation.result = result
    annotation.data = {
        "organism_id": job_context['samples'][organism_key][0].organism_id,
        "organism_name": job_context['samples'][organism_key][0].organism.name,
        "is_qn": False,
        "is_compendia": True,
        "samples": [sample.accession_code for sample in job_context["samples"][organism_key]],
        "num_samples": len(job_context["samples"][organism_key]),
        "experiment_accessions": [e.accession_code for e in job_context['experiments']]
    }
    annotation.save()

    # Save the related metadata file
    metadata_computed_file = ComputedFile()
    metadata_computed_file.absolute_file_path = job_context['metadata_tsv_paths'][0]
    metadata_computed_file.filename = job_context['metadata_tsv_paths'][0].split('/')[-1]
    metadata_computed_file.calculate_sha1()
    metadata_computed_file.calculate_size()
    metadata_computed_file.is_smashable = False
    metadata_computed_file.is_qn_target = False
    metadata_computed_file.result = result
    metadata_computed_file.save()

    # Create the resulting archive
    final_zip_base = "/home/user/data_store/smashed/" + str(job_context["dataset"].pk) + "_compendia"
    archive_path = shutil.make_archive(final_zip_base, 'zip', job_context["output_dir"])

    # Determine the next compendia version for this organism.
    organism = job_context['samples'][organism_key][0].organism
    try:
        # order_by('-compendia_version') sorts descending, so index 0 is the
        # latest version. (Negative indexing is not supported on querysets.)
        last_compendia = ComputedFile.objects.filter(
            is_compendia=True, compendia_organism=organism).order_by('-compendia_version')[0]
        compendia_version = last_compendia.compendia_version + 1
    except Exception:
        # This is the first compendia for this Organism
        compendia_version = 1

    archive_computed_file = ComputedFile()
    archive_computed_file.absolute_file_path = archive_path
    archive_computed_file.filename = archive_path.split('/')[-1]
    archive_computed_file.calculate_sha1()
    archive_computed_file.calculate_size()
    archive_computed_file.is_smashable = False
    archive_computed_file.is_qn_target = False
    archive_computed_file.result = result
    archive_computed_file.is_compendia = True
    archive_computed_file.compendia_organism = organism
    archive_computed_file.compendia_version = compendia_version
    archive_computed_file.save()

    logger.info("Compendia created!",
                archive_path=archive_path,
                organism_name=organism.name)

    # Upload the result to S3
    key = organism.name + "_" + str(compendia_version) + "_" + str(int(time.time())) + ".zip"
    archive_computed_file.sync_to_s3(S3_COMPENDIA_BUCKET_NAME, key)

    job_context['result'] = result
    job_context['computed_files'] = [compendia_tsv_computed_file,
                                     metadata_computed_file,
                                     archive_computed_file]
    job_context['success'] = True
    return job_context
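
# --- Illustrative sketch (not part of the pipeline) ---------------------
# The version bump above, isolated into a hypothetical helper: take the
# highest existing compendia_version for the organism and add one, defaulting
# to 1 for the first compendium. Using .first() on a descending order_by
# means an empty queryset yields None instead of raising, so no broad
# try/except is needed.
def _next_compendia_version_sketch(organism) -> int:
    last = (ComputedFile.objects
            .filter(is_compendia=True, compendia_organism=organism)
            .order_by('-compendia_version')
            .first())
    return last.compendia_version + 1 if last else 1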
def _create_result_objects(job_context: Dict) -> Dict:
    """ Create the ComputationalResult objects after a Scan run is complete """
    result = ComputationalResult()
    result.commands.append(job_context['formatted_command'])
    result.is_ccdl = True
    result.is_public = True
    result.time_start = job_context['time_start']
    result.time_end = job_context['time_end']

    try:
        processor_key = "ILLUMINA_SCAN"
        result.processor = utils.find_processor(processor_key)
    except Exception as e:
        return utils.handle_processor_exception(job_context, processor_key, e)

    result.save()
    job_context['pipeline'].steps.append(result.id)

    # Split the result into smashable subfiles
    big_tsv = job_context["output_file_path"]
    data = pd.read_csv(big_tsv, sep='\t', header=0, index_col=0)
    individual_files = []
    frames = np.split(data, len(data.columns), axis=1)
    for frame in frames:
        filename = frame.columns.values[0].replace('&', '').replace(
            "*", '').replace(";", '') + '.tsv'
        frame_path = job_context["work_dir"] + filename
        frame.to_csv(frame_path, sep='\t', encoding='utf-8')

        # This needs to be the same as the ones in the job context!
        try:
            sample = job_context['samples'].get(title=frame.columns.values[0])
        except Sample.DoesNotExist:
            logger.error(
                "Could not find sample for column while splitting Illumina file.",
                title=frame.columns.values[0],
                processor_job=job_context["job_id"],
                file_path=big_tsv,
            )
            continue

        computed_file = ComputedFile()
        computed_file.absolute_file_path = frame_path
        computed_file.filename = frame_path.split('/')[-1]
        computed_file.result = result
        computed_file.is_smashable = True
        computed_file.is_qc = False
        computed_file.is_public = True
        computed_file.calculate_sha1()
        computed_file.calculate_size()
        computed_file.save()
        job_context['computed_files'].append(computed_file)

        SampleResultAssociation.objects.get_or_create(sample=sample, result=result)
        SampleComputedFileAssociation.objects.get_or_create(
            sample=sample, computed_file=computed_file)

        individual_files.append(computed_file)

    logger.debug("Created %s", result)
    job_context["success"] = True
    job_context["individual_files"] = individual_files
    job_context["result"] = result

    return job_context
def _run_salmon(job_context: Dict) -> Dict:
    """Runs Salmon Quant."""
    logger.debug("Running Salmon..")

    # Salmon needs to be run differently for different sample types.
    if "input_file_path_2" in job_context:
        second_read_str = " -2 {}".format(job_context["input_file_path_2"])

        # Rob recommends 16 threads/process, which fits snugly on an x1 at
        # 8GB RAM per Salmon container:
        # (2 threads/core * 16 cores/socket * 64 vCPU) / (1TB/8GB) = ~17
        command_str = ("salmon --no-version-check quant -l A --biasSpeedSamp 5 -i {index}"
                       " -1 {input_one}{second_read_str} -p 16 -o {output_directory}"
                       " --seqBias --gcBias --dumpEq --writeUnmappedNames")
        formatted_command = command_str.format(index=job_context["index_directory"],
                                               input_one=job_context["input_file_path"],
                                               second_read_str=second_read_str,
                                               output_directory=job_context["output_directory"])
    else:
        # Related: https://github.com/COMBINE-lab/salmon/issues/83
        command_str = ("salmon --no-version-check quant -l A -i {index}"
                       " -r {input_one} -p 16 -o {output_directory}"
                       " --seqBias --dumpEq --writeUnmappedNames")
        formatted_command = command_str.format(index=job_context["index_directory"],
                                               input_one=job_context["input_file_path"],
                                               output_directory=job_context["output_directory"])

    logger.debug("Running Salmon Quant using the following shell command: %s",
                 formatted_command,
                 processor_job=job_context["job_id"])

    job_context['time_start'] = timezone.now()
    completed_command = subprocess.run(formatted_command.split(),
                                       stdout=subprocess.PIPE,
                                       stderr=subprocess.PIPE)
    job_context['time_end'] = timezone.now()

    ## To me, this looks broken: error codes are anything non-zero.
    ## However, Salmon (seems) to output with negative status codes
    ## even with successful executions.
    ## Possibly related: https://github.com/COMBINE-lab/salmon/issues/55
    if completed_command.returncode == 1:
        stderr = completed_command.stderr.decode().strip()
        error_start = stderr.upper().find("ERROR:")
        error_start = error_start if error_start != -1 else 0
        logger.error("Shell call to salmon failed with error message: %s",
                     stderr[error_start:],
                     processor_job=job_context["job_id"])
        job_context["job"].failure_reason = ("Shell call to salmon failed because: "
                                             + stderr[error_start:])
        job_context["success"] = False
    else:
        result = ComputationalResult()
        result.commands.append(formatted_command)
        result.time_start = job_context['time_start']
        result.time_end = job_context['time_end']
        result.organism_index = job_context["organism_index"]
        result.is_ccdl = True

        try:
            processor_key = "SALMON_QUANT"
            result.processor = utils.find_processor(processor_key)
        except Exception as e:
            return utils.handle_processor_exception(job_context, processor_key, e)

        # Zip up the output of Salmon Quant
        try:
            with tarfile.open(job_context['output_archive'], "w:gz") as tar:
                tar.add(job_context["output_directory"], arcname=os.sep)
        except Exception:
            logger.exception("Exception caught while zipping processed directory %s",
                             job_context["output_directory"],
                             processor_job=job_context["job_id"])
            failure_template = "Exception caught while zipping processed directory {}"
            job_context["job"].failure_reason = failure_template.format(
                job_context['output_archive'])
            job_context["success"] = False
            return job_context

        salmon_quant_archive = ComputedFile()
        salmon_quant_archive.absolute_file_path = job_context["output_archive"]
        salmon_quant_archive.filename = os.path.split(job_context["output_archive"])[-1]
        salmon_quant_archive.calculate_sha1()
        salmon_quant_archive.calculate_size()
        salmon_quant_archive.is_public = True
        salmon_quant_archive.is_smashable = False
        salmon_quant_archive.is_qc = False

        quant_file = ComputedFile()
        quant_file.s3_bucket = S3_BUCKET_NAME
        quant_file.s3_key = "quant_files/sample_" + str(job_context["sample"].id) + "_quant.sf"
        quant_file.filename = "quant.sf"
        quant_file.absolute_file_path = job_context["output_directory"] + "quant.sf"
        quant_file.is_public = False
        quant_file.is_smashable = False
        quant_file.is_qc = False
        quant_file.calculate_sha1()
        quant_file.calculate_size()

        # If we're running in the cloud we need to upload the quant.sf
        # file so that it can be used by a job running on any machine
        # to run tximport. We can't use sync_to_s3 though because we
        # have to sync it before we can save the file so it cannot be
        # discovered by other jobs before it is uploaded.
        if settings.RUNNING_IN_CLOUD:
            try:
                S3.upload_file(
                    quant_file.absolute_file_path,
                    quant_file.s3_bucket,
                    quant_file.s3_key,
                    ExtraArgs={
                        'ACL': 'public-read',
                        'StorageClass': 'STANDARD_IA'
                    }
                )
            except Exception as e:
                logger.exception(e,
                                 processor_job=job_context["job_id"],
                                 sample=job_context["sample"].id)
                failure_template = "Exception caught while uploading quantfile to S3: {}"
                job_context["job"].failure_reason = failure_template.format(
                    quant_file.absolute_file_path)
                job_context["success"] = False
                return job_context

        # Here select_for_update() is used as a mutex that forces multiple
        # jobs to execute this block of code in a serial manner. See:
        # https://docs.djangoproject.com/en/1.11/ref/models/querysets/#select-for-update
        # Theoretically any rows in any table can be locked here; we're
        # locking all existing rows in the ComputationalResult table.
        with transaction.atomic():
            ComputationalResult.objects.select_for_update()
            result.save()
            job_context["quant_result"] = result
            quant_file.result = result
            quant_file.save()

            job_context["result"] = result
            job_context['pipeline'].steps.append(result.id)

            SampleResultAssociation.objects.get_or_create(sample=job_context['sample'],
                                                          result=result)

            salmon_quant_archive.result = result
            salmon_quant_archive.save()
            job_context['computed_files'].append(salmon_quant_archive)

            tximport_inputs = _get_tximport_inputs(job_context)

        # tximport analysis is done outside of the transaction so that
        # the mutex wouldn't hold the other jobs too long.
        for experiment, quant_files in tximport_inputs.items():
            _tximport(job_context, experiment, quant_files)
            # If `tximport` on any related experiment fails, exit immediately.
            if not job_context["success"]:
                return job_context

        kv = ComputationalResultAnnotation()
        kv.data = {"index_length": job_context["index_length"]}
        kv.result = result
        kv.is_public = True
        kv.save()

        with open(os.path.join(job_context['output_directory'],
                               'lib_format_counts.json')) as lfc_file:
            format_count_data = json.load(lfc_file)
            kv = ComputationalResultAnnotation()
            kv.data = format_count_data
            kv.result = result
            kv.is_public = True
            kv.save()

        with open(os.path.join(job_context['output_directory'],
                               'aux_info', 'meta_info.json')) as mi_file:
            meta_info = json.load(mi_file)
            kv = ComputationalResultAnnotation()
            kv.data = meta_info
            kv.result = result
            kv.is_public = True
            kv.save()

        job_context["success"] = True

    return job_context
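
# --- Illustrative sketch (not part of the pipeline) ---------------------
# How the select_for_update() mutex above behaves, as a hypothetical helper.
# Note that Django querysets are lazy: the SELECT ... FOR UPDATE statement is
# only issued when the queryset is evaluated, so this sketch forces
# evaluation with list() to actually take the row locks.
def _serialized_save_sketch(result):
    from django.db import transaction

    with transaction.atomic():
        # Concurrent transactions entering this block wait here until the
        # current lock holder commits or rolls back.
        list(ComputationalResult.objects.select_for_update())
        result.save()  # the critical section runs serially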
def _create_result_objects(job_context: Dict) -> Dict:
    """ Store and host the result as a ComputationalResult object. """
    result_start = log_state("start create result object", job_context["job"].id)
    result = ComputationalResult()
    result.commands.append(" ".join(job_context["formatted_command"]))
    result.is_ccdl = True
    # Temporary until we re-enable the QN test step.
    result.is_public = False
    result.time_start = job_context["time_start"]
    result.time_end = job_context["time_end"]

    try:
        processor_key = "CREATE_COMPENDIA"
        result.processor = utils.find_processor(processor_key)
    except Exception as e:
        return utils.handle_processor_exception(job_context, processor_key, e)

    result.save()

    # Write the compendia dataframe to a file
    job_context["csv_outfile"] = job_context["output_dir"] + job_context["organism_name"] + ".tsv"
    job_context["merged_qn"].to_csv(job_context["csv_outfile"], sep="\t", encoding="utf-8")

    organism_key = list(job_context["samples"].keys())[0]

    annotation = ComputationalResultAnnotation()
    annotation.result = result
    annotation.data = {
        "organism_id": job_context["samples"][organism_key][0].organism_id,
        "organism_name": job_context["organism_name"],
        "is_qn": False,
        "is_compendia": True,
        "samples": [sample.accession_code for sample in job_context["samples"][organism_key]],
        "num_samples": len(job_context["samples"][organism_key]),
        "experiment_accessions": [e.accession_code for e in job_context["experiments"]],
        "total_percent_imputed": job_context["total_percent_imputed"],
    }
    annotation.save()

    # Create the resulting archive
    final_zip_base = SMASHING_DIR + str(job_context["dataset"].pk) + "_compendia"

    # Copy LICENSE.txt and correct README.md files.
    if job_context["dataset"].quant_sf_only:
        readme_file = "/home/user/README_QUANT.md"
    else:
        readme_file = "/home/user/README_NORMALIZED.md"

    shutil.copy(readme_file, job_context["output_dir"] + "/README.md")
    shutil.copy("/home/user/LICENSE_DATASET.txt", job_context["output_dir"] + "/LICENSE.TXT")

    archive_path = shutil.make_archive(final_zip_base, "zip", job_context["output_dir"])

    archive_computed_file = ComputedFile()
    archive_computed_file.absolute_file_path = archive_path
    archive_computed_file.filename = archive_path.split("/")[-1]
    archive_computed_file.calculate_sha1()
    archive_computed_file.calculate_size()
    archive_computed_file.is_smashable = False
    archive_computed_file.is_qn_target = False
    archive_computed_file.result = result
    archive_computed_file.save()

    # Compendia Result Helpers
    primary_organism = Organism.get_object_for_name(job_context["primary_organism"])
    organisms = [
        Organism.get_object_for_name(organism)
        for organism in job_context["all_organisms"]
    ]
    compendium_version = (CompendiumResult.objects.filter(
        primary_organism=primary_organism, quant_sf_only=False).count() + 1)

    # Save Compendia Result
    compendium_result = CompendiumResult()
    compendium_result.quant_sf_only = job_context["dataset"].quant_sf_only
    compendium_result.svd_algorithm = job_context["dataset"].svd_algorithm
    compendium_result.compendium_version = compendium_version
    compendium_result.result = result
    compendium_result.primary_organism = primary_organism
    compendium_result.save()

    # Create relations to all organisms contained in the compendia.
    compendium_result_organism_associations = []
    for compendium_organism in organisms:
        compendium_result_organism_association = CompendiumResultOrganismAssociation()
        compendium_result_organism_association.compendium_result = compendium_result
        compendium_result_organism_association.organism = compendium_organism
        compendium_result_organism_associations.append(compendium_result_organism_association)

    CompendiumResultOrganismAssociation.objects.bulk_create(
        compendium_result_organism_associations)

    job_context["compendium_result"] = compendium_result

    logger.info("Compendium created!",
                archive_path=archive_path,
                organism_name=job_context["organism_name"])

    # Upload the result to S3
    timestamp = str(int(time.time()))
    key = job_context["organism_name"] + "_" + str(compendium_version) + "_" + timestamp + ".zip"
    uploaded_to_s3 = archive_computed_file.sync_to_s3(S3_COMPENDIA_BUCKET_NAME, key)

    if not uploaded_to_s3:
        raise utils.ProcessorJobError(
            "Failed to upload compendia to S3",
            success=False,
            computed_file_id=archive_computed_file.id,
        )

    if settings.RUNNING_IN_CLOUD:
        archive_computed_file.delete_local_file()

    job_context["result"] = result
    job_context["success"] = True
    log_state("end create result object", job_context["job"].id, result_start)

    # TEMPORARY for iterating on compendia more quickly.
    # Reset this so the end_job does clean up the job's non-input-data stuff.
    job_context["work_dir"] = job_context["old_work_dir"]

    return job_context
def _create_result_objects(job_context: Dict) -> Dict:
    """ Create the ComputationalResult objects after a Scan run is complete """
    result = ComputationalResult()
    result.commands.append(job_context["formatted_command"])
    result.is_ccdl = True
    result.is_public = True
    result.time_start = job_context["time_start"]
    result.time_end = job_context["time_end"]

    try:
        processor_key = "ILLUMINA_SCAN"
        result.processor = utils.find_processor(processor_key)
    except Exception as e:
        return utils.handle_processor_exception(job_context, processor_key, e)

    result.save()
    job_context["pipeline"].steps.append(result.id)

    # Split the result into smashable subfiles
    big_tsv = job_context["output_file_path"]
    data = pd.read_csv(big_tsv, sep="\t", header=0, index_col=0)
    individual_files = []
    frames = np.split(data, len(data.columns), axis=1)
    for frame in frames:
        filename = (
            frame.columns.values[0].replace("&", "").replace("*", "").replace(";", "") + ".tsv"
        )
        frame_path = job_context["work_dir"] + filename
        frame.to_csv(frame_path, sep="\t", encoding="utf-8")

        # This needs to be the same as the ones in the job context!
        sample = _get_sample_for_column(frame.columns.values[0], job_context)
        if sample is None:
            job_context["job"].failure_reason = (
                "Could not find sample for column "
                + frame.columns.values[0]
                + " while splitting Illumina file "
                + big_tsv
            )
            job_context["success"] = False
            job_context["job"].no_retry = True
            return job_context

        computed_file = ComputedFile()
        computed_file.absolute_file_path = frame_path
        computed_file.filename = frame_path.split("/")[-1]
        computed_file.result = result
        computed_file.is_smashable = True
        computed_file.is_qc = False
        computed_file.is_public = True
        computed_file.calculate_sha1()
        computed_file.calculate_size()
        computed_file.save()
        job_context["computed_files"].append(computed_file)

        SampleResultAssociation.objects.get_or_create(sample=sample, result=result)
        SampleComputedFileAssociation.objects.get_or_create(
            sample=sample, computed_file=computed_file
        )

        individual_files.append(computed_file)

    logger.debug("Created %s", result)
    job_context["success"] = True
    job_context["individual_files"] = individual_files
    job_context["result"] = result

    return job_context
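
# --- Illustrative sketch (not part of the pipeline) ---------------------
# The per-column split used above, in isolation: np.split along axis=1 turns
# an N-column expression matrix into N single-column DataFrames, each keeping
# the shared row index (probe/gene IDs) and its original column header, which
# is what lets each subfile be traced back to a sample.
import numpy as np
import pandas as pd

def _split_columns_sketch(df: pd.DataFrame) -> list:
    return np.split(df, len(df.columns), axis=1)

# e.g. a matrix with columns ["Sample_A", "Sample_B"] yields two one-column
# DataFrames whose headers name the originating samples.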
def _run_salmontools(job_context: Dict) -> Dict:
    """ Run Salmontools to extract unmapped genes. """
    logger.debug("Running SalmonTools ...")

    unmapped_filename = job_context["output_directory"] + "aux_info/unmapped_names.txt"
    command_str = "salmontools extract-unmapped -u {unmapped_file} -o {output} "
    output_prefix = job_context["salmontools_directory"] + "unmapped_by_salmon"
    command_str = command_str.format(unmapped_file=unmapped_filename,
                                     output=output_prefix)
    if "input_file_path_2" in job_context:
        command_str += "-1 {input_1} -2 {input_2}"
        command_str = command_str.format(input_1=job_context["input_file_path"],
                                         input_2=job_context["input_file_path_2"])
    else:
        command_str += "-r {input_1}"
        command_str = command_str.format(input_1=job_context["input_file_path"])

    start_time = timezone.now()
    logger.debug(
        "Running the following SalmonTools command: %s",
        command_str,
        processor_job=job_context["job_id"],
    )
    completed_command = subprocess.run(command_str.split(),
                                       stdout=subprocess.PIPE,
                                       stderr=subprocess.PIPE)
    end_time = timezone.now()

    # As of SalmonTools 0.1.0, completed_command.returncode is always 0,
    # (even if error happens). completed_command.stderr is not totally
    # reliable either, because it will output the following line even
    # when the execution succeeds:
    #   "There were <N> unmapped reads\n"
    # in which "<N>" is the number of lines in input unmapped_names.txt.
    #
    # As a workaround, we are using a regular expression here to test
    # the status of SalmonTools execution. Any text in stderr that is
    # not in the above format is treated as error message.
    status_str = completed_command.stderr.decode().strip()
    success_pattern = r"^There were \d+ unmapped reads$"
    if re.match(success_pattern, status_str):
        # Zip up the output of salmontools
        try:
            with tarfile.open(job_context["salmontools_archive"], "w:gz") as tar:
                tar.add(job_context["salmontools_directory"], arcname=os.sep)
        except Exception:
            logger.exception(
                "Exception caught while zipping processed directory %s",
                job_context["salmontools_directory"],
                processor_job=job_context["job_id"],
            )
            failure_template = "Exception caught while zipping salmontools directory {}"
            job_context["job"].failure_reason = failure_template.format(
                job_context["salmontools_archive"])
            job_context["success"] = False
            return job_context

        result = ComputationalResult()
        result.commands.append(command_str)
        result.time_start = start_time
        result.time_end = end_time
        result.is_ccdl = True

        try:
            processor_key = "SALMONTOOLS"
            result.processor = utils.find_processor(processor_key)
        except Exception as e:
            return utils.handle_processor_exception(job_context, processor_key, e)

        result.save()
        job_context["pipeline"].steps.append(result.id)

        assoc = SampleResultAssociation()
        assoc.sample = job_context["sample"]
        assoc.result = result
        assoc.save()

        computed_file = ComputedFile()
        computed_file.filename = job_context["salmontools_archive"].split("/")[-1]
        computed_file.absolute_file_path = job_context["salmontools_archive"]
        computed_file.calculate_sha1()
        computed_file.calculate_size()
        computed_file.is_public = True
        computed_file.is_smashable = False
        computed_file.is_qc = True
        computed_file.result = result
        computed_file.save()
        job_context["computed_files"].append(computed_file)

        assoc = SampleComputedFileAssociation()
        assoc.sample = job_context["sample"]
        assoc.computed_file = computed_file
        assoc.save()

        job_context["result"] = result
        job_context["success"] = True
    else:  # error in salmontools
        logger.error(
            "Shell call to salmontools failed with error message: %s",
            status_str,
            processor_job=job_context["job_id"],
        )
        job_context["job"].failure_reason = ("Shell call to salmontools failed because: "
                                             + status_str)
        job_context["success"] = False

    return job_context
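
# --- Illustrative sketch (not part of the pipeline) ---------------------
# The stderr-based success check above, in isolation: SalmonTools 0.1.0 exits
# 0 even on failure, so the only reliable success signal is stderr matching
# the "There were <N> unmapped reads" line exactly.
import re

def _salmontools_succeeded_sketch(stderr_text: str) -> bool:
    return re.match(r"^There were \d+ unmapped reads$", stderr_text.strip()) is not None

# _salmontools_succeeded_sketch("There were 42 unmapped reads")  -> True
# _salmontools_succeeded_sketch("Error: couldn't open index")    -> False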
def _run_salmon(job_context: Dict) -> Dict:
    """Runs Salmon Quant."""
    logger.debug("Running Salmon..")

    # Salmon needs to be run differently for different sample types.
    # SRA files also get processed differently as we don't want to use fasterq-dump
    # to extract them to disk.
    if job_context.get("sra_input_file_path", None):
        # Single reads
        if job_context["sra_num_reads"] == 1:
            fifo = "/tmp/barney"
            os.mkfifo(fifo)

            dump_str = "fastq-dump --stdout {input_sra_file} > {fifo} &"
            formatted_dump_command = dump_str.format(
                input_sra_file=job_context["sra_input_file_path"], fifo=fifo)
            subprocess.Popen(formatted_dump_command,
                             shell=True,
                             stdout=subprocess.PIPE,
                             stderr=subprocess.STDOUT)

            command_str = (
                "salmon --no-version-check quant -l A -i {index} "
                "-r {fifo} -p 16 -o {output_directory} --seqBias --dumpEq --writeUnmappedNames"
            )
            formatted_command = command_str.format(
                index=job_context["index_directory"],
                fifo=fifo,
                output_directory=job_context["output_directory"],
            )
        # Paired are trickier
        else:
            # Okay, for some reason I can't explain, this only works in the temp
            # directory, otherwise the `tee` part will only output to one or the
            # other of the streams (non-deterministically), but not both. This
            # doesn't appear to happen if the fifos are in tmp.
            alpha = "/tmp/alpha"
            os.mkfifo(alpha)
            beta = "/tmp/beta"
            os.mkfifo(beta)

            dump_str = (
                "fastq-dump --stdout --split-files -I {input_sra_file}"
                r" | tee >(grep '@.*\.1\s' -A3 --no-group-separator > {fifo_alpha})"
                r" >(grep '@.*\.2\s' -A3 --no-group-separator > {fifo_beta}) > /dev/null &"
            )
            formatted_dump_command = dump_str.format(
                input_sra_file=job_context["sra_input_file_path"],
                fifo_alpha=alpha,
                fifo_beta=beta)
            subprocess.Popen(
                formatted_dump_command,
                shell=True,
                executable="/bin/bash",
                stdout=subprocess.PIPE,
                stderr=subprocess.STDOUT,
            )

            command_str = (
                "salmon --no-version-check quant -l A -i {index} "
                "-1 {fifo_alpha} -2 {fifo_beta} -p 16 -o {output_directory} "
                "--seqBias --dumpEq --writeUnmappedNames"
            )
            formatted_command = command_str.format(
                index=job_context["index_directory"],
                fifo_alpha=alpha,
                fifo_beta=beta,
                output_directory=job_context["output_directory"],
            )
    else:
        if "input_file_path_2" in job_context:
            second_read_str = " -2 {}".format(job_context["input_file_path_2"])

            # Rob recommends 16 threads/process, which fits snugly on an x1 at
            # 8GB RAM per Salmon container:
            # (2 threads/core * 16 cores/socket * 64 vCPU) / (1TB/8GB) = ~17
            command_str = (
                "salmon --no-version-check quant -l A --biasSpeedSamp 5 -i {index}"
                " -1 {input_one}{second_read_str} -p 16 -o {output_directory}"
                " --seqBias --gcBias --dumpEq --writeUnmappedNames")
            formatted_command = command_str.format(
                index=job_context["index_directory"],
                input_one=job_context["input_file_path"],
                second_read_str=second_read_str,
                output_directory=job_context["output_directory"],
            )
        else:
            # Related: https://github.com/COMBINE-lab/salmon/issues/83
            command_str = ("salmon --no-version-check quant -l A -i {index}"
                           " -r {input_one} -p 16 -o {output_directory}"
                           " --seqBias --dumpEq --writeUnmappedNames")
            formatted_command = command_str.format(
                index=job_context["index_directory"],
                input_one=job_context["input_file_path"],
                output_directory=job_context["output_directory"],
            )

    logger.debug(
        "Running Salmon Quant using the following shell command: %s",
        formatted_command,
        processor_job=job_context["job_id"],
    )

    # Salmon probably shouldn't take longer than three hours.
    timeout = 60 * 60 * 3
    job_context["time_start"] = timezone.now()
    try:
        completed_command = subprocess.run(
            formatted_command.split(),
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            timeout=timeout,
        )
    except subprocess.TimeoutExpired:
        failure_reason = "Salmon timed out because it failed to complete within 3 hours."
        logger.error(
            failure_reason,
            sample_accession_code=job_context["sample"].accession_code,
            processor_job=job_context["job_id"],
        )
        job_context["job"].failure_reason = failure_reason
        job_context["job"].no_retry = True
        job_context["success"] = False
        return job_context

    job_context["time_end"] = timezone.now()

    if completed_command.returncode == 1:
        stderr = completed_command.stderr.decode().strip()
        error_start = stderr.upper().find("ERROR:")
        error_start = error_start if error_start != -1 else 0
        logger.error(
            "Shell call to salmon failed with error message: %s",
            stderr[error_start:],
            processor_job=job_context["job_id"],
        )

        # If salmon has an error exit code then we don't want to retry it.
        job_context["job"].no_retry = True
        job_context["job"].failure_reason = ("Shell call to salmon failed because: "
                                             + stderr[error_start:])
        job_context["success"] = False
    else:
        result = ComputationalResult()
        result.commands.append(formatted_command)
        result.time_start = job_context["time_start"]
        result.time_end = job_context["time_end"]
        result.organism_index = job_context["organism_index"]
        result.is_ccdl = True

        try:
            processor_key = "SALMON_QUANT"
            result.processor = utils.find_processor(processor_key)
        except Exception as e:
            return utils.handle_processor_exception(job_context, processor_key, e)

        # Zip up the output of Salmon Quant
        try:
            with tarfile.open(job_context["output_archive"], "w:gz") as tar:
                tar.add(job_context["output_directory"], arcname=os.sep)
        except Exception:
            logger.exception(
                "Exception caught while zipping processed directory %s",
                job_context["output_directory"],
                processor_job=job_context["job_id"],
            )
            failure_template = "Exception caught while zipping processed directory {}"
            job_context["job"].failure_reason = failure_template.format(
                job_context["output_archive"])
            job_context["success"] = False
            return job_context

        salmon_quant_archive = ComputedFile()
        salmon_quant_archive.absolute_file_path = job_context["output_archive"]
        salmon_quant_archive.filename = os.path.split(job_context["output_archive"])[-1]
        salmon_quant_archive.calculate_sha1()
        salmon_quant_archive.calculate_size()
        salmon_quant_archive.is_public = True
        salmon_quant_archive.is_smashable = False
        salmon_quant_archive.is_qc = False

        quant_file = ComputedFile()
        quant_file.s3_bucket = S3_BUCKET_NAME
        timestamp = str(timezone.now().timestamp()).split(".")[0]
        quant_file.s3_key = "quant_files/sample_{0}_{1}_quant.sf".format(
            job_context["sample"].id, timestamp)
        quant_file.filename = "quant.sf"
        quant_file.absolute_file_path = job_context["output_directory"] + "quant.sf"
        quant_file.is_public = False
        quant_file.is_smashable = False
        quant_file.is_qc = False
        quant_file.calculate_sha1()
        quant_file.calculate_size()

        # If we're running in the cloud we need to upload the quant.sf
        # file so that it can be used by a job running on any machine
        # to run tximport. We can't use sync_to_s3 though because we
        # have to sync it before we can save the file so it cannot be
        # discovered by other jobs before it is uploaded.
        if settings.RUNNING_IN_CLOUD:
            try:
                S3.upload_file(
                    quant_file.absolute_file_path,
                    quant_file.s3_bucket,
                    quant_file.s3_key,
                    ExtraArgs={
                        "ACL": "public-read",
                        "StorageClass": "STANDARD_IA"
                    },
                )
            except Exception as e:
                logger.exception(e,
                                 processor_job=job_context["job_id"],
                                 sample=job_context["sample"].id)
                failure_template = "Exception caught while uploading quantfile to S3: {}"
                job_context["job"].failure_reason = failure_template.format(
                    quant_file.absolute_file_path)
                job_context["success"] = False
                return job_context

        # Here select_for_update() is used as a mutex that forces multiple
        # jobs to execute this block of code in a serial manner. See:
        # https://docs.djangoproject.com/en/1.11/ref/models/querysets/#select-for-update
        # Theoretically any rows in any table can be locked here; we're
        # locking all existing rows in the ComputationalResult table.
        with transaction.atomic():
            ComputationalResult.objects.select_for_update()
            result.save()
            job_context["quant_result"] = result
            quant_file.result = result
            quant_file.save()

            job_context["result"] = result
            job_context["pipeline"].steps.append(result.id)

            SampleResultAssociation.objects.get_or_create(
                sample=job_context["sample"], result=result)

            salmon_quant_archive.result = result
            salmon_quant_archive.save()
            job_context["computed_files"].append(salmon_quant_archive)

        kv = ComputationalResultAnnotation()
        kv.data = {
            "index_length": job_context["index_length"],
            "index_length_get": job_context.get("index_length_raw", None),
        }
        kv.result = result
        kv.is_public = True
        kv.save()

        try:
            with open(os.path.join(job_context["output_directory"],
                                   "lib_format_counts.json")) as lfc_file:
                format_count_data = json.load(lfc_file)
                kv = ComputationalResultAnnotation()
                kv.data = format_count_data
                kv.result = result
                kv.is_public = True
                kv.save()
        except Exception:
            # See: https://github.com/AlexsLemonade/refinebio/issues/1167
            logger.exception(
                "Error parsing Salmon lib_format_counts JSON output!",
                processor_job=job_context["job_id"],
            )

        try:
            with open(os.path.join(job_context["output_directory"], "aux_info",
                                   "meta_info.json")) as mi_file:
                meta_info = json.load(mi_file)
                kv = ComputationalResultAnnotation()
                kv.data = meta_info
                kv.result = result
                kv.is_public = True
                kv.save()
        except Exception:
            # See: https://github.com/AlexsLemonade/refinebio/issues/1167
            logger.exception("Error parsing Salmon meta_info JSON output!",
                             processor_job=job_context["job_id"])

        job_context["success"] = True

    return job_context
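
# --- Illustrative sketch (not part of the pipeline) ---------------------
# The named-pipe streaming used above, reduced to its essentials: a producer
# process writes into a FIFO while a reader consumes it, so the decompressed
# data never lands on disk. The command and path below are examples only.
import os
import subprocess

def _stream_through_fifo_sketch(fifo_path: str = "/tmp/example_fifo") -> bytes:
    if not os.path.exists(fifo_path):
        os.mkfifo(fifo_path)
    # The producer blocks on opening the FIFO until a reader attaches.
    producer = subprocess.Popen("echo streamed-data > " + fifo_path, shell=True)
    with open(fifo_path, "rb") as reader:
        data = reader.read()  # reads until the producer closes its end
    producer.wait()
    os.remove(fifo_path)
    return data  # b"streamed-data\n"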
def _run_tximport_for_experiment(job_context: Dict,
                                 experiment: Experiment,
                                 quant_files: List[ComputedFile]) -> Dict:
    # Download all the quant.sf files for this experiment. Write all
    # their paths to a file so we can pass a path to that to
    # tximport.R rather than having to pass in one argument per
    # sample.
    tximport_path_list_file = job_context["work_dir"] + "tximport_inputs.txt"
    quant_file_paths = {}
    with open(tximport_path_list_file, "w") as input_list:
        for quant_file in quant_files:
            # We create a directory in the work directory for each (quant.sf) file, as
            # tximport assigns column names based on the parent directory name,
            # and we need those names so that we can reassociate with the samples later.
            # e.g., a file with absolute_file_path: /processor_job_1/SRR123_output/quant.sf
            # downloads to: /processor_job_2/SRR123_output/quant.sf
            # So the result file has frame "SRR123_output", which we can associate
            # with sample SRR123.
            sample_output = (job_context["work_dir"]
                             + str(quant_file.absolute_file_path.split("/")[-2]) + "/")
            os.makedirs(sample_output, exist_ok=True)
            quant_work_path = sample_output + quant_file.filename
            quant_file_path = quant_file.get_synced_file_path(path=quant_work_path)
            input_list.write(quant_file_path + "\n")
            quant_file_paths[quant_file_path] = os.stat(quant_file_path).st_size

    rds_filename = "txi_out.RDS"
    rds_file_path = job_context["work_dir"] + rds_filename
    tpm_filename = "gene_lengthScaledTPM.tsv"
    tpm_file_path = job_context["work_dir"] + tpm_filename

    result = ComputationalResult()
    cmd_tokens = [
        "/usr/bin/Rscript", "--vanilla",
        "/home/user/data_refinery_workers/processors/tximport.R",
        "--file_list", tximport_path_list_file,
        "--gene2txmap", job_context["genes_to_transcripts_path"],
        "--rds_file", rds_file_path,
        "--tpm_file", tpm_file_path,
    ]
    result.time_start = timezone.now()

    logger.debug(
        "Running tximport with: %s",
        str(cmd_tokens),
        processor_job=job_context["job_id"],
        experiment=experiment.id,
    )

    try:
        tximport_result = subprocess.run(cmd_tokens,
                                         stdout=subprocess.PIPE,
                                         stderr=subprocess.PIPE)
    except Exception as e:
        raise utils.ProcessorJobError(
            "Encountered error in R code while running tximport.R: {}".format(str(e)),
            success=False,
            experiment=experiment.id,
        )

    if tximport_result.returncode != 0:
        raise utils.ProcessorJobError(
            "Found non-zero exit code from R code while running tximport.R: {}".format(
                tximport_result.stderr.decode().strip()),
            success=False,
            experiment=experiment.id,
            quant_files=quant_files,
            cmd_tokens=cmd_tokens,
            quant_file_paths=quant_file_paths,
        )

    result.time_end = timezone.now()
    result.commands.append(" ".join(cmd_tokens))
    result.is_ccdl = True

    try:
        processor_key = "TXIMPORT"
        result.processor = utils.find_processor(processor_key)
    except Exception as e:
        raise utils.ProcessorJobError("Failed to set processor: {}".format(e),
                                      success=False,
                                      processor_key=processor_key)

    result.save()
    job_context["pipeline"].steps.append(result.id)

    rds_file = ComputedFile()
    rds_file.absolute_file_path = rds_file_path
    rds_file.filename = rds_filename
    rds_file.result = result
    rds_file.is_smashable = False
    rds_file.is_qc = False
    rds_file.is_public = True
    rds_file.calculate_sha1()
    rds_file.calculate_size()
    rds_file.save()
    job_context["computed_files"].append(rds_file)

    # Split the tximport result into smashable subfiles
    data = pd.read_csv(tpm_file_path, sep="\t", header=0, index_col=0)
    individual_files = []
    frames = np.split(data, len(data.columns), axis=1)
    for frame in frames:
        # Create sample-specific TPM file.
        sample_file_name = frame.columns.values[0] + "_" + tpm_filename
        frame_path = os.path.join(job_context["work_dir"], sample_file_name)
        frame.to_csv(frame_path, sep="\t", encoding="utf-8")

        # The frame column header is based off of the path, which includes _output.
        sample_accession_code = frame.columns.values[0].replace("_output", "")
        sample = Sample.objects.get(accession_code=sample_accession_code)

        computed_file = ComputedFile()
        computed_file.absolute_file_path = frame_path
        computed_file.filename = sample_file_name
        computed_file.result = result
        computed_file.is_smashable = True
        computed_file.is_qc = False
        computed_file.is_public = True
        computed_file.calculate_sha1()
        computed_file.calculate_size()
        computed_file.save()
        job_context["computed_files"].append(computed_file)
        job_context["smashable_files"].append(computed_file)

        SampleResultAssociation.objects.get_or_create(sample=sample, result=result)

        # Create association with the RDS file.
        SampleComputedFileAssociation.objects.get_or_create(
            sample=sample, computed_file=rds_file)

        # Create association with TPM file.
        SampleComputedFileAssociation.objects.get_or_create(
            sample=sample, computed_file=computed_file)

        individual_files.append(computed_file)
        job_context["samples"].append(sample)

    # Salmon-processed samples aren't marked as is_processed
    # until they are fully tximported; this value sets that
    # for the end_job function.
    job_context["tximported"] = True
    job_context["individual_files"] = individual_files
    return job_context
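
# --- Illustrative sketch (not part of the pipeline) ---------------------
# How tximport column names map back to samples, as a hypothetical helper:
# each quant.sf is synced into a "<accession>_output/" directory, tximport
# names the output column after that parent directory, and stripping the
# "_output" suffix recovers the sample accession code.
def _accession_from_tximport_column_sketch(column_name: str) -> str:
    return column_name.replace("_output", "")

# _accession_from_tximport_column_sketch("SRR123_output") -> "SRR123"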
def _run_multiqc(job_context: Dict) -> Dict:
    """Runs the `MultiQC` package to generate the QC report.

    TODO: These seem to consume a lot of RAM, even for small files.
    We should consider tuning these or breaking them out into their
    own processors. JVM settings may reduce RAM footprint.
    """
    command_str = "multiqc {input_directory} --outdir {qc_directory} --zip-data-dir"
    formatted_command = command_str.format(input_directory=job_context["qc_input_directory"],
                                           qc_directory=job_context["qc_directory"])

    logger.debug("Running MultiQC using the following shell command: %s",
                 formatted_command,
                 processor_job=job_context["job_id"])

    qc_env = os.environ.copy()
    qc_env["LC_ALL"] = "C.UTF-8"
    qc_env["LANG"] = "C.UTF-8"

    time_start = timezone.now()
    completed_command = subprocess.run(formatted_command.split(),
                                       stdout=subprocess.PIPE,
                                       stderr=subprocess.PIPE,
                                       env=qc_env)
    time_end = timezone.now()

    if completed_command.returncode != 0:
        stderr = completed_command.stderr.decode().strip()
        error_start = stderr.upper().find("ERROR:")
        error_start = error_start if error_start != -1 else 0
        logger.error("Shell call to MultiQC failed with error message: %s",
                     stderr[error_start:],
                     processor_job=job_context["job_id"])
        job_context["job"].failure_reason = ("Shell call to MultiQC failed because: "
                                             + stderr[error_start:])
        job_context["success"] = False
        # Bail out: the QC files below won't exist if MultiQC failed.
        return job_context

    result = ComputationalResult()
    result.commands.append(formatted_command)
    result.time_start = time_start
    result.time_end = time_end
    result.is_ccdl = True

    try:
        processor_key = "MULTIQC"
        result.processor = utils.find_processor(processor_key)
    except Exception as e:
        return utils.handle_processor_exception(job_context, processor_key, e)

    result.save()
    job_context['pipeline'].steps.append(result.id)

    assoc = SampleResultAssociation()
    assoc.sample = job_context["sample"]
    assoc.result = result
    assoc.save()

    job_context['qc_result'] = result

    data_file = ComputedFile()
    data_file.filename = "multiqc_data.zip"  # This is deterministic
    data_file.absolute_file_path = os.path.join(job_context["qc_directory"], data_file.filename)
    data_file.calculate_sha1()
    data_file.calculate_size()
    data_file.is_public = True
    data_file.result = job_context['qc_result']
    data_file.is_smashable = False
    data_file.is_qc = True
    data_file.save()
    job_context['computed_files'].append(data_file)

    SampleComputedFileAssociation.objects.get_or_create(
        sample=job_context["sample"], computed_file=data_file)

    report_file = ComputedFile()
    report_file.filename = "multiqc_report.html"  # This is deterministic
    report_file.absolute_file_path = os.path.join(job_context["qc_directory"],
                                                  report_file.filename)
    report_file.calculate_sha1()
    report_file.calculate_size()
    report_file.is_public = True
    report_file.is_smashable = False
    report_file.is_qc = True
    report_file.result = job_context['qc_result']
    report_file.save()
    job_context['computed_files'].append(report_file)

    job_context['qc_files'] = [data_file, report_file]

    return job_context
def _create_result_objects(job_context: Dict) -> Dict:
    """ Store and host the result as a ComputationalResult object. """
    archive_path = job_context["archive_path"]
    compendia_organism = _get_organisms(job_context["samples"]).first()
    compendia_version = _get_next_compendia_version(compendia_organism)

    result = ComputationalResult()
    result.commands.append(" ".join(job_context["formatted_command"]))
    result.is_ccdl = True
    result.is_public = True
    result.time_start = job_context["time_start"]
    result.time_end = job_context["time_end"]

    try:
        processor_key = "CREATE_QUANTPENDIA"
        result.processor = utils.find_processor(processor_key)
    except Exception as e:
        return utils.handle_processor_exception(job_context, processor_key, e)

    result.save()

    archive_computed_file = ComputedFile()
    archive_computed_file.absolute_file_path = archive_path
    archive_computed_file.filename = FileUtils.get_filename(archive_path)
    archive_computed_file.calculate_sha1()
    archive_computed_file.calculate_size()
    archive_computed_file.is_smashable = False
    archive_computed_file.is_qn_target = False
    archive_computed_file.result = result
    archive_computed_file.is_compendia = True
    archive_computed_file.quant_sf_only = True
    archive_computed_file.compendia_organism = compendia_organism
    archive_computed_file.compendia_version = compendia_version
    archive_computed_file.save()

    compendium_result = CompendiumResult()
    compendium_result.quant_sf_only = True
    compendium_result.result = result
    compendium_result.primary_organism = compendia_organism
    compendium_result.compendium_version = compendia_version
    compendium_result.save()

    logger.info(
        "Quantpendia created! Uploading to S3.",
        job_id=job_context["job_id"],
        archive_path=archive_path,
        organism_name=compendia_organism.name,
        **get_process_stats()
    )

    # Upload the result to S3
    timestamp = str(int(time.time()))
    s3_key = compendia_organism.name + "_" + str(compendia_version) + "_" + timestamp + ".zip"
    uploaded_to_s3 = archive_computed_file.sync_to_s3(S3_COMPENDIA_BUCKET_NAME, s3_key)

    if not uploaded_to_s3:
        raise utils.ProcessorJobError(
            "Failed to upload compendia to S3",
            success=False,
            computed_file_id=archive_computed_file.id,
        )

    if settings.RUNNING_IN_CLOUD:
        archive_computed_file.delete_local_file()

    job_context["result"] = result
    job_context["success"] = True
    return job_context
def _populate_index_object(job_context: Dict) -> Dict:
    """ """
    result = ComputationalResult()
    result.commands.append(job_context["salmon_formatted_command"])

    try:
        processor_key = "TX_INDEX"
        result.processor = utils.find_processor(processor_key)
    except Exception as e:
        return utils.handle_processor_exception(job_context, processor_key, e)

    result.is_ccdl = True
    result.time_start = job_context["time_start"]
    result.time_end = job_context["time_end"]
    result.save()
    job_context['pipeline'].steps.append(result.id)

    computed_file = ComputedFile()
    computed_file.absolute_file_path = job_context["computed_archive"]
    computed_file.filename = os.path.split(job_context["computed_archive"])[-1]
    computed_file.calculate_sha1()
    computed_file.calculate_size()
    computed_file.result = result
    computed_file.is_smashable = False
    computed_file.is_qc = False
    computed_file.save()

    organism_object = Organism.get_object_for_name(job_context['organism_name'])
    index_object = OrganismIndex()
    index_object.organism = organism_object
    index_object.source_version = job_context["assembly_version"]
    index_object.assembly_name = job_context["assembly_name"]
    index_object.salmon_version = job_context["salmon_version"]
    index_object.index_type = "TRANSCRIPTOME_" + job_context['length'].upper()
    # This is where the index will be extracted to.
    index_object.absolute_directory_path = (LOCAL_ROOT_DIR + "/TRANSCRIPTOME_INDEX/"
                                            + organism_object.name + "/"
                                            + job_context['length'])
    index_object.result = result

    if S3_TRANSCRIPTOME_INDEX_BUCKET_NAME:
        logger.info("Uploading %s %s to s3",
                    job_context['organism_name'],
                    job_context['length'],
                    processor_job=job_context["job_id"])
        timestamp = str(timezone.now().timestamp()).split('.')[0]
        s3_key = organism_object.name + '_' + index_object.index_type + "_" + timestamp + '.tar.gz'
        sync_result = computed_file.sync_to_s3(S3_TRANSCRIPTOME_INDEX_BUCKET_NAME, s3_key)
        if sync_result:
            computed_file.delete_local_file()
    else:
        logger.warn("S3_TRANSCRIPTOME_INDEX_BUCKET_NAME not configured, "
                    "therefore %s %s will not be uploaded.",
                    job_context['organism_name'],
                    job_context['length'],
                    processor_job=job_context["job_id"])

    index_object.save()

    # We uploaded the file ourselves since we wanted it to go to a
    # different bucket than end_job would put it in, therefore empty
    # this list so end_job doesn't try to upload it again.
    job_context['computed_files'] = []

    job_context['result'] = result
    job_context['computed_file'] = computed_file
    job_context['index'] = index_object

    # If there's not a long and a short index for this organism yet,
    # don't delete the input.
    # XXX: This will break once we introduce additional versions of these.
    short_indices = OrganismIndex.objects.filter(organism=organism_object,
                                                 index_type="TRANSCRIPTOME_SHORT",
                                                 source_version=job_context["assembly_version"])
    long_indices = OrganismIndex.objects.filter(organism=organism_object,
                                                index_type="TRANSCRIPTOME_LONG",
                                                source_version=job_context["assembly_version"])
    if short_indices.count() < 1 or long_indices.count() < 1:
        # utils.end_job deletes these, so remove them so it doesn't.
        job_context["original_files"] = []

    return job_context
def _tximport(job_context: Dict, experiment: Experiment,
              quant_files: List[ComputedFile]) -> Dict:
    """Run tximport R script based on input quant files and the path
    of genes_to_transcripts.txt.
    """
    # Download all the quant.sf files for this experiment. Write all
    # their paths to a file so we can pass a path to that to
    # tximport.R rather than having to pass in one argument per
    # sample.
    tximport_path_list_file = job_context["work_dir"] + "tximport_inputs.txt"
    with open(tximport_path_list_file, "w") as input_list:
        for quant_file in quant_files:
            input_list.write(quant_file.get_synced_file_path() + "\n")

    rds_filename = "txi_out.RDS"
    rds_file_path = job_context["work_dir"] + rds_filename
    tpm_filename = "gene_lengthScaledTPM.tsv"
    tpm_file_path = job_context["work_dir"] + tpm_filename

    result = ComputationalResult()
    cmd_tokens = [
        "/usr/bin/Rscript", "--vanilla",
        "/home/user/data_refinery_workers/processors/tximport.R",
        "--file_list", tximport_path_list_file,
        "--gene2txmap", job_context["genes_to_transcripts_path"],
        "--rds_file", rds_file_path,
        "--tpm_file", tpm_file_path
    ]
    result.time_start = timezone.now()

    logger.debug("Running tximport with: %s",
                 str(cmd_tokens),
                 processor_job=job_context['job_id'],
                 experiment=experiment.id)

    try:
        tximport_result = subprocess.run(cmd_tokens,
                                         stdout=subprocess.PIPE,
                                         stderr=subprocess.PIPE)
    except Exception as e:
        error_template = "Encountered error in R code while running tximport.R: {}"
        error_message = error_template.format(str(e))
        logger.error(error_message,
                     processor_job=job_context["job_id"],
                     experiment=experiment.id)
        job_context["job"].failure_reason = error_message
        job_context["success"] = False
        return job_context

    if tximport_result.returncode != 0:
        error_template = "Found non-zero exit code from R code while running tximport.R: {}"
        error_message = error_template.format(tximport_result.stderr.decode().strip())
        logger.error(error_message,
                     processor_job=job_context["job_id"],
                     experiment=experiment.id)
        job_context["job"].failure_reason = error_message
        job_context["success"] = False
        return job_context

    result.time_end = timezone.now()
    result.commands.append(" ".join(cmd_tokens))
    result.is_ccdl = True

    try:
        processor_key = "TXIMPORT"
        result.processor = utils.find_processor(processor_key)
    except Exception as e:
        return utils.handle_processor_exception(job_context, processor_key, e)

    result.save()
    job_context['pipeline'].steps.append(result.id)

    # Associate this result with all samples in this experiment.
    # TODO: This may not be completely sensible, because `tximport` is
    # done at experiment level, not at sample level.
    # Could be very problematic if SRA's data model allows many
    # Experiments to one Run.
    # https://github.com/AlexsLemonade/refinebio/issues/297
    for sample in experiment.samples.all():
        s_r = SampleResultAssociation(sample=sample, result=result)
        s_r.save()

    rds_file = ComputedFile()
    rds_file.absolute_file_path = rds_file_path
    rds_file.filename = rds_filename
    rds_file.result = result
    rds_file.is_smashable = False
    rds_file.is_qc = False
    rds_file.is_public = True
    rds_file.calculate_sha1()
    rds_file.calculate_size()
    rds_file.save()
    job_context['computed_files'].append(rds_file)

    # Split the tximport result into smashable subfiles
    data = pd.read_csv(tpm_file_path, sep='\t', header=0, index_col=0)
    individual_files = []
    frames = np.split(data, len(data.columns), axis=1)
    for frame in frames:
        # Create sample-specific TPM file.
        sample_file_name = frame.columns.values[0] + '_' + tpm_filename
        frame_path = os.path.join(job_context["work_dir"], sample_file_name)
        frame.to_csv(frame_path, sep='\t', encoding='utf-8')

        # The frame column header is based off of the path, which includes _output.
        sample = Sample.objects.get(
            accession_code=frame.columns.values[0].replace("_output", ""))

        computed_file = ComputedFile()
        computed_file.absolute_file_path = frame_path
        computed_file.filename = sample_file_name
        computed_file.result = result
        computed_file.is_smashable = True
        computed_file.is_qc = False
        computed_file.is_public = True
        computed_file.calculate_sha1()
        computed_file.calculate_size()
        computed_file.save()
        job_context['computed_files'].append(computed_file)
        job_context['smashable_files'].append(computed_file)

        SampleResultAssociation.objects.get_or_create(sample=sample, result=result)

        # Create association with the RDS file.
        SampleComputedFileAssociation.objects.get_or_create(
            sample=sample, computed_file=rds_file)

        # Create association with TPM file.
        SampleComputedFileAssociation.objects.get_or_create(
            sample=sample, computed_file=computed_file)

        individual_files.append(computed_file)
        job_context['samples'].append(sample)

    # Clean up quant.sf files that were created just for this.
    for quant_file in quant_files:
        quant_file.delete_s3_file()
        # It's only okay to delete the local file because the full
        # output directory has already been zipped up.
        quant_file.delete_local_file()
        quant_file.delete()

    # Salmon-processed samples aren't marked as is_processed
    # until they are fully tximported; this value sets that
    # for the end_job function.
    job_context['tximported'] = True
    job_context['individual_files'] = individual_files
    return job_context