def _smash_all(job_context: Dict) -> Dict:
    """Perform smashing on all species/experiments in the dataset."""
    start_smash = log_state("start smash", job_context["job"].id)

    job_context["unsmashable_files"] = []
    job_context["num_samples"] = 0

    # Smash all of the sample sets
    logger.debug(
        "About to smash!",
        dataset_count=len(job_context["dataset"].data),
        job_id=job_context["job"].id,
    )

    try:
        # Once again, `key` is either a species name or an experiment accession
        for key, input_files in job_context.pop("input_files").items():
            job_context = _smash_key(job_context, key, input_files)
    except Exception as e:
        raise utils.ProcessorJobError(
            "Could not smash dataset: " + str(e),
            success=False,
            dataset_id=job_context["dataset"].id,
            num_input_files=job_context["num_input_files"],
        )

    smashing_utils.write_non_data_files(job_context)

    # Finally, compress all files into a zip
    final_zip_base = "/home/user/data_store/smashed/" + str(job_context["dataset"].pk)
    try:
        shutil.make_archive(final_zip_base, "zip", job_context["output_dir"])
    except OSError:
        raise utils.ProcessorJobError("Smash Error while generating zip file", success=False)

    job_context["output_file"] = final_zip_base + ".zip"

    job_context["dataset"].success = True
    job_context["dataset"].save()

    logger.debug("Created smash output!", archive_location=job_context["output_file"])

    log_state("end smash", job_context["job"].id, start_smash)

    return job_context


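# `_smash_key` is called above but defined elsewhere. A rough, hypothetical
# sketch of its responsibility is below, assuming it delegates frame-building
# to smashing_utils.process_frames_for_key and then writes one TSV per key;
# the function name, the `all_frames` key, and the output layout are
# illustrative assumptions, not the module's actual implementation.
def _smash_key_sketch(job_context: Dict, key: str, input_files: List) -> Dict:
    # Build the per-sample expression frames for this species/experiment key.
    job_context = smashing_utils.process_frames_for_key(key, input_files, job_context)

    # Merge whatever frames were produced for this key and write them out.
    merged = pd.concat(job_context["all_frames"], axis=1, join="outer")  # assumed key name
    key_output_dir = job_context["output_dir"] + key + "/"
    os.makedirs(key_output_dir, exist_ok=True)
    merged.to_csv(key_output_dir + key + ".tsv", sep="\t", encoding="utf-8")

    job_context["num_samples"] += len(merged.columns)
    return job_context

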
def _upload(job_context: Dict) -> Dict:
    """Uploads the result file to S3 and notifies the user."""
    if not job_context.get("upload", True) or not settings.RUNNING_IN_CLOUD:
        return job_context

    s3_client = boto3.client("s3")
    output_filename = job_context["output_file"].split("/")[-1]

    try:
        # Note that file expiry is handled by the S3 object lifecycle,
        # managed by terraform.
        s3_client.upload_file(
            job_context["output_file"],
            RESULTS_BUCKET,
            output_filename,
        )
    except Exception:
        raise utils.ProcessorJobError(
            "Failed to upload smash result file.",
            success=False,
            file=job_context["output_file"],
        )

    result_url = "https://s3.amazonaws.com/" + RESULTS_BUCKET + "/" + output_filename
    job_context["result_url"] = result_url

    logger.debug("Result uploaded!", result_url=job_context["result_url"])

    return job_context


def write_non_data_files(job_context: Dict) -> Dict:
    """Writes the files that are not the actual data of the dataset.

    These include the LICENSE.txt and README.md files and the metadata.

    Adds the key `metadata` to job_context and populates it with all
    the metadata that needs to be written.
    """
    job_context["metadata"] = compile_metadata(job_context)

    shutil.copy("README_DATASET.md", job_context["output_dir"] + "README.md")
    shutil.copy("LICENSE_DATASET.txt", job_context["output_dir"] + "LICENSE.TXT")

    # Write samples metadata to TSV
    try:
        write_tsv_json(job_context)

        # Metadata to JSON
        job_context["metadata"]["created_at"] = timezone.now().strftime("%Y-%m-%dT%H:%M:%S")
        aggregated_metadata_path = os.path.join(
            job_context["output_dir"], "aggregated_metadata.json"
        )
        with open(aggregated_metadata_path, "w", encoding="utf-8") as metadata_file:
            json.dump(job_context["metadata"], metadata_file, indent=4, sort_keys=True)

        if job_context["filtered_samples"]:
            # Generate the filtered samples file only if some samples were skipped.
            filtered_samples_path = os.path.join(
                job_context["output_dir"], "filtered_samples_metadata.json"
            )
            with open(filtered_samples_path, "w", encoding="utf-8") as metadata_file:
                json.dump(job_context["filtered_samples"], metadata_file, indent=4, sort_keys=True)

            columns = get_tsv_columns(job_context["filtered_samples"])
            filtered_samples_tsv_path = os.path.join(
                job_context["output_dir"], "filtered_samples_metadata.tsv"
            )
            with open(filtered_samples_tsv_path, "w", encoding="utf-8") as tsv_file:
                dw = csv.DictWriter(tsv_file, columns, delimiter="\t", extrasaction="ignore")
                dw.writeheader()
                for sample_metadata in job_context["filtered_samples"].values():
                    dw.writerow(get_tsv_row_data(sample_metadata, job_context["dataset"].data))
    except Exception:
        raise utils.ProcessorJobError("Failed to write metadata TSV!", success=False)

    return job_context


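# `get_tsv_columns` and `get_tsv_row_data` are used above but defined elsewhere.
# A minimal, hypothetical sketch of the column-collection step, assuming each
# sample's metadata is a flat dict keyed by field name and that the accession
# field is called "refinebio_accession_code" (both are assumptions here):
def get_tsv_columns_sketch(samples_metadata: Dict) -> List[str]:
    columns = set()
    for sample_metadata in samples_metadata.values():
        columns.update(sample_metadata.keys())

    # Keep the accession code first so the TSV is easy to scan; sort the rest.
    ordered = sorted(columns - {"refinebio_accession_code"})
    return ["refinebio_accession_code"] + ordered

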
def prepare_files(job_context: Dict) -> Dict:
    """Fetches and prepares the files to smash."""
    start_prepare_files = log_state("start prepare files", job_context["job"].id)
    found_files = False

    job_context["filtered_samples"] = {}
    job_context["input_files"] = {}
    # `key` can either be the species name or experiment accession.
    for key, samples in job_context["samples"].items():
        smashable_files = []
        seen_files = set()
        for sample in samples:
            if job_context["dataset"].quant_sf_only:
                # For quant.sf only jobs, just check that they have a quant.sf file
                smashable_file = sample.get_most_recent_quant_sf_file()
            else:
                smashable_file = sample.get_most_recent_smashable_result_file()

            if smashable_file is not None and smashable_file not in seen_files:
                smashable_files = smashable_files + [(smashable_file, sample)]
                seen_files.add(smashable_file)
                found_files = True
            else:
                sample_metadata = sample.to_metadata_dict()
                job_context["filtered_samples"][sample.accession_code] = {
                    **sample_metadata,
                    "reason": "This sample did not have a processed file associated with it in our database.",
                    "experiment_accession_code": get_experiment_accession(
                        sample.accession_code, job_context["dataset"].data
                    ),
                }

        job_context["input_files"][key] = smashable_files

    job_context["num_input_files"] = len(job_context["input_files"])
    job_context["group_by_keys"] = list(job_context["input_files"].keys())

    if not found_files:
        raise utils.ProcessorJobError(
            "Couldn't get any files to smash for Smash job!!",
            success=False,
            dataset_id=job_context["dataset"].id,
            num_samples=len(job_context["samples"]),
        )

    dataset_id = str(job_context["dataset"].pk)
    job_context["work_dir"] = "/home/user/data_store/smashed/" + dataset_id + "/"
    # Ensure we have a fresh smash directory
    shutil.rmtree(job_context["work_dir"], ignore_errors=True)
    os.makedirs(job_context["work_dir"])

    job_context["output_dir"] = job_context["work_dir"] + "output/"
    os.makedirs(job_context["output_dir"])

    log_state("end prepare files", job_context["job"].id, start_prepare_files)
    return job_context


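# `get_experiment_accession` is referenced above but lives elsewhere. A minimal
# sketch under the assumption that `dataset_data` maps experiment accessions to
# lists of sample accession codes (the shape implied by `dataset.data` here);
# the function name and fallback value are illustrative:
def get_experiment_accession_sketch(sample_accession_code: str, dataset_data: Dict) -> str:
    for experiment_accession, sample_codes in dataset_data.items():
        if sample_accession_code in sample_codes:
            return experiment_accession
    # The sample should always belong to some experiment in the dataset, but
    # don't blow up metadata writing if it somehow doesn't.
    return ""

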
def quantile_normalize(job_context: Dict, ks_stat=0.001) -> Dict:
    """Apply quantile normalization."""
    # Prepare our QN target file
    organism = job_context["organism"]

    if not organism.qn_target:
        raise utils.ProcessorJobError(
            "Could not find QN target for Organism: " + str(organism),
            success=False,
            organism=organism,
            dataset_id=job_context["dataset"].id,
        )

    qn_target_path = organism.qn_target.computedfile_set.latest().sync_from_s3()
    qn_target_frame = pd.read_csv(
        qn_target_path, sep="\t", header=None, index_col=None, error_bad_lines=False
    )

    # Prepare our RPy2 bridge
    pandas2ri.activate()

    # Remove the un-quantile-normalized matrix from job_context
    # because we no longer need it.
    merged_no_qn = job_context.pop("merged_no_qn")

    # Perform the actual QN
    new_merged = _quantile_normalize_matrix(qn_target_frame[0], merged_no_qn)

    # And add the quantile-normalized matrix to job_context.
    job_context["merged_qn"] = new_merged

    ks_res = _test_qn(new_merged)
    if ks_res:
        for (statistic, pvalue) in ks_res:
            job_context["ks_statistic"] = statistic
            job_context["ks_pvalue"] = pvalue

            # We're unsure of how stringent to be about
            # the pvalue just yet, so we're extra lax
            # rather than failing tons of tests. This may need tuning.
            if statistic > ks_stat or pvalue < 0.8:
                job_context["ks_warning"] = (
                    "Failed Kolmogorov-Smirnov test! Stat: "
                    + str(statistic)
                    + ", PVal: "
                    + str(pvalue)
                )
    else:
        logger.warning(
            "Not enough columns to perform KS test - either bad smash or single sample smash.",
            dataset_id=job_context["dataset"].id,
        )

    return job_context


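# `_quantile_normalize_matrix` is called above but not shown. The production
# path goes through rpy2 (note the pandas2ri.activate() call), but the idea is
# simply "force every column onto the target distribution". A rough,
# pure-pandas sketch follows; it assumes a complete (no-NaN) matrix with one
# target value per row, and is illustrative rather than the real implementation.
def _quantile_normalize_matrix_sketch(qn_target: pd.Series, matrix: pd.DataFrame) -> pd.DataFrame:
    sorted_target = np.sort(qn_target.values)
    normalized = matrix.copy()
    for column in normalized.columns:
        # Rank the column's values, then substitute the target value with the same rank.
        order = normalized[column].rank(method="first").astype(int) - 1
        normalized[column] = sorted_target[order.values]
    return normalized

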
def _notify(job_context: Dict) -> Dict:
    """Use AWS SES to notify a user of a smash result."""
    if not job_context.get("upload", True) or not settings.RUNNING_IN_CLOUD:
        return job_context

    # Send a notification to Slack when a dataset fails to be processed
    if job_context["job"].success is False:
        try:
            _notify_slack_failed_dataset(job_context)
        except Exception as e:
            logger.warn(e)  # It doesn't really matter if this didn't work

    # Don't send an email if we don't have an address or the user doesn't want an email.
    if job_context["dataset"].email_address and job_context["dataset"].notify_me:
        # Try to send the email.
        try:
            _notify_send_email(job_context)

        # Display an error if something goes wrong.
        except ClientError as e:
            raise utils.ProcessorJobError(
                "ClientError while notifying",
                success=False,
                exc_info=1,
                client_error_message=e.response["Error"]["Message"],
            )
        except Exception:
            raise utils.ProcessorJobError(
                "General failure when trying to send email.",
                success=False,
                exc_info=1,
                result_url=job_context["result_url"],
            )

    # We don't want to retry this dataset after we send a notification to users
    # https://github.com/alexslemonade/refinebio/issues/1944
    job_context["job"].no_retry = True
    job_context["job"].save()

    return job_context


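# Each step above takes and returns the shared `job_context` dict, so a smash
# job is just a chain of these functions. A minimal sketch of that chaining
# follows; the function name and the exact step list are illustrative
# assumptions rather than this module's real entry point.
def run_smash_steps_sketch(job_context: Dict) -> Dict:
    steps = [
        prepare_files,  # collect smashable files per species/experiment key
        _smash_all,     # build matrices, write metadata, zip the output dir
        _upload,        # push the zip to S3 (only when running in the cloud)
        _notify,        # email the user / post Slack failures
    ]
    for step in steps:
        job_context = step(job_context)
    return job_context

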
def set_tximport_inputs(job_context: Dict) -> Dict:
    """Adds to the job_context a mapping from experiments to a list of their quant files.

    Checks all the experiments which contain a sample from the current
    experiment. If any of them are fully processed (at least with
    salmon-quant) then the return dict will include the experiment
    mapping to a list of paths to the quant.sf file for each sample in
    that experiment.
    """
    experiments = job_context["sample"].experiments.all()

    quantified_experiments = {}
    for experiment in experiments:
        # We only want to consider samples that we actually can run salmon on.
        eligible_samples = experiment.samples.filter(source_database="SRA", technology="RNA-SEQ")

        if not eligible_samples.exists():
            continue

        is_tximport_job = "is_tximport_only" in job_context and job_context["is_tximport_only"]
        salmon_quant_files = get_tximport_inputs_if_eligible(experiment, is_tximport_job)

        if is_tximport_job and salmon_quant_files:
            # If the job is only running tximport, then index_length
            # hasn't been set on the job context because we don't have
            # a raw file to run it on. Therefore pull it from one of
            # the result annotations.
            index_length = salmon_quant_files[0].result.get_index_length()
            if index_length:
                job_context["index_length"] = index_length
            elif "index_length" not in job_context:
                raise utils.ProcessorJobError(
                    (
                        "Found quant result without an annotation specifying its index length."
                        " Why did this happen?!?"
                    ),
                    success=False,
                    no_retry=True,
                )

        if salmon_quant_files:
            quantified_experiments[experiment] = salmon_quant_files

    job_context["tximport_inputs"] = quantified_experiments

    return job_context


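# `get_tximport_inputs_if_eligible` is imported from elsewhere. A simplified,
# hypothetical sketch of the eligibility rule it implements: only hand an
# experiment to tximport once every eligible sample has a quant.sf result.
# (The real rule also uses `is_tximport_job` to allow an "early" run past a
# completion threshold; that part is omitted here.)
def get_tximport_inputs_if_eligible_sketch(
    experiment: Experiment, is_tximport_job: bool
) -> List[ComputedFile]:
    eligible_samples = experiment.samples.filter(source_database="SRA", technology="RNA-SEQ")

    quant_files = []
    for sample in eligible_samples:
        quant_file = sample.get_most_recent_quant_sf_file()
        if quant_file is None:
            # At least one sample still needs salmon quant, so don't run tximport yet.
            # An empty list reads as "not eligible" to the caller above.
            return []
        quant_files.append(quant_file)

    return quant_files

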
def _prepare_frames(job_context: Dict) -> Dict:
    """Takes the inputs and places them into matrices.

    - Combine all microarray samples with a full join to form a DataFrame
      `microarray_expression_matrix`.
    - Combine all RNA-seq samples (lengthScaledTPM) with a full outer join
      to form a DataFrame `rnaseq_expression_matrix`.
    - Adds unsmashable files to `job_context["unsmashable_files"]`
    """
    start_prepare_frames = log_state("start _prepare_frames", job_context["job"].id)

    job_context["unsmashable_files"] = []
    job_context["num_samples"] = 0

    # Smash all of the sample sets
    logger.debug(
        "About to smash!",
        dataset_count=len(job_context["dataset"].data),
        job_id=job_context["job"].id,
    )

    try:
        # Once again, `key` is either a species name or an experiment accession
        for key, input_files in job_context.pop("input_files").items():
            job_context = smashing_utils.process_frames_for_key(key, input_files, job_context)

        # if len(job_context['all_frames']) < 1:  # TODO: Enable this check?
    except Exception:
        raise utils.ProcessorJobError(
            "Could not prepare frames for compendia.",
            success=False,
            dataset_id=job_context["dataset"].id,
            processor_job_id=job_context["job_id"],
            num_input_files=job_context["num_input_files"],
        )

    job_context["dataset"].success = True
    job_context["dataset"].save()

    log_state("end _prepare_frames", job_context["job"].id, start_prepare_frames)
    return job_context


def tximport(job_context: Dict) -> Dict:
    """Run the tximport R script based on the input quant files and the path
    of genes_to_transcripts.txt.
    """
    tximport_inputs = job_context["tximport_inputs"]

    quantified_experiments = 0
    for experiment, quant_files in tximport_inputs.items():
        job_context = _run_tximport_for_experiment(job_context, experiment, quant_files)
        quantified_experiments += 1

    if (
        quantified_experiments == 0
        and "is_tximport_only" in job_context
        and job_context["is_tximport_only"]
    ):
        raise utils.ProcessorJobError(
            "Tximport job ran on no experiments... Why?!?!?", success=False, no_retry=True
        )

    return job_context


def _create_result_objects(job_context: Dict) -> Dict:
    """Store and host the result as a ComputationalResult object."""
    archive_path = job_context["archive_path"]
    compendia_organism = _get_organisms(job_context["samples"]).first()
    compendia_version = _get_next_compendia_version(compendia_organism)

    result = ComputationalResult()
    result.commands.append(" ".join(job_context["formatted_command"]))
    result.is_ccdl = True
    result.is_public = True
    result.time_start = job_context["time_start"]
    result.time_end = job_context["time_end"]
    try:
        processor_key = "CREATE_QUANTPENDIA"
        result.processor = utils.find_processor(processor_key)
    except Exception as e:
        return utils.handle_processor_exception(job_context, processor_key, e)
    result.save()

    archive_computed_file = ComputedFile()
    archive_computed_file.absolute_file_path = archive_path
    archive_computed_file.filename = FileUtils.get_filename(archive_path)
    archive_computed_file.calculate_sha1()
    archive_computed_file.calculate_size()
    archive_computed_file.is_smashable = False
    archive_computed_file.is_qn_target = False
    archive_computed_file.result = result
    archive_computed_file.is_compendia = True
    archive_computed_file.quant_sf_only = True
    archive_computed_file.compendia_organism = compendia_organism
    archive_computed_file.compendia_version = compendia_version
    archive_computed_file.save()

    compendium_result = CompendiumResult()
    compendium_result.quant_sf_only = True
    compendium_result.result = result
    compendium_result.primary_organism = compendia_organism
    compendium_result.compendium_version = compendia_version
    compendium_result.save()

    logger.info(
        "Quantpendia created! Uploading to S3.",
        job_id=job_context["job_id"],
        archive_path=archive_path,
        organism_name=compendia_organism.name,
        **get_process_stats()
    )

    # Upload the result to S3
    timestamp = str(int(time.time()))
    s3_key = compendia_organism.name + "_" + str(compendia_version) + "_" + timestamp + ".zip"
    uploaded_to_s3 = archive_computed_file.sync_to_s3(S3_COMPENDIA_BUCKET_NAME, s3_key)

    if not uploaded_to_s3:
        raise utils.ProcessorJobError(
            "Failed to upload compendia to S3",
            success=False,
            computed_file_id=archive_computed_file.id,
        )

    if settings.RUNNING_IN_CLOUD:
        archive_computed_file.delete_local_file()

    job_context["result"] = result
    job_context["success"] = True

    return job_context


def _create_result_objects(job_context: Dict) -> Dict:
    """Store and host the result as a ComputationalResult object."""
    result_start = log_state("start create result object", job_context["job"].id)

    result = ComputationalResult()
    result.commands.append(" ".join(job_context["formatted_command"]))
    result.is_ccdl = True
    # Temporary until we re-enable the QN test step.
    result.is_public = False
    result.time_start = job_context["time_start"]
    result.time_end = job_context["time_end"]
    try:
        processor_key = "CREATE_COMPENDIA"
        result.processor = utils.find_processor(processor_key)
    except Exception as e:
        return utils.handle_processor_exception(job_context, processor_key, e)
    result.save()

    # Write the compendia dataframe to a file
    job_context["csv_outfile"] = job_context["output_dir"] + job_context["organism_name"] + ".tsv"
    job_context["merged_qn"].to_csv(job_context["csv_outfile"], sep="\t", encoding="utf-8")

    organism_key = list(job_context["samples"].keys())[0]
    annotation = ComputationalResultAnnotation()
    annotation.result = result
    annotation.data = {
        "organism_id": job_context["samples"][organism_key][0].organism_id,
        "organism_name": job_context["organism_name"],
        "is_qn": False,
        "is_compendia": True,
        "samples": [sample.accession_code for sample in job_context["samples"][organism_key]],
        "num_samples": len(job_context["samples"][organism_key]),
        "experiment_accessions": [e.accession_code for e in job_context["experiments"]],
        "total_percent_imputed": job_context["total_percent_imputed"],
    }
    annotation.save()

    # Create the resulting archive
    final_zip_base = SMASHING_DIR + str(job_context["dataset"].pk) + "_compendia"
    # Copy LICENSE.txt and the correct README.md file.
    if job_context["dataset"].quant_sf_only:
        readme_file = "/home/user/README_QUANT.md"
    else:
        readme_file = "/home/user/README_NORMALIZED.md"
    shutil.copy(readme_file, job_context["output_dir"] + "/README.md")
    shutil.copy("/home/user/LICENSE_DATASET.txt", job_context["output_dir"] + "/LICENSE.TXT")
    archive_path = shutil.make_archive(final_zip_base, "zip", job_context["output_dir"])

    archive_computed_file = ComputedFile()
    archive_computed_file.absolute_file_path = archive_path
    archive_computed_file.filename = archive_path.split("/")[-1]
    archive_computed_file.calculate_sha1()
    archive_computed_file.calculate_size()
    archive_computed_file.is_smashable = False
    archive_computed_file.is_qn_target = False
    archive_computed_file.result = result
    archive_computed_file.save()

    # Compendia result helpers
    primary_organism = Organism.get_object_for_name(job_context["primary_organism"])
    organisms = [
        Organism.get_object_for_name(organism) for organism in job_context["all_organisms"]
    ]
    compendium_version = (
        CompendiumResult.objects.filter(
            primary_organism=primary_organism, quant_sf_only=False
        ).count()
        + 1
    )

    # Save the compendium result
    compendium_result = CompendiumResult()
    compendium_result.quant_sf_only = job_context["dataset"].quant_sf_only
    compendium_result.svd_algorithm = job_context["dataset"].svd_algorithm
    compendium_result.compendium_version = compendium_version
    compendium_result.result = result
    compendium_result.primary_organism = primary_organism
    compendium_result.save()

    # Create relations to all organisms contained in the compendia
    compendium_result_organism_associations = []
    for compendium_organism in organisms:
        compendium_result_organism_association = CompendiumResultOrganismAssociation()
        compendium_result_organism_association.compendium_result = compendium_result
        compendium_result_organism_association.organism = compendium_organism
        compendium_result_organism_associations.append(compendium_result_organism_association)

    CompendiumResultOrganismAssociation.objects.bulk_create(
        compendium_result_organism_associations
    )

    job_context["compendium_result"] = compendium_result

    logger.info(
        "Compendium created!",
        archive_path=archive_path,
        organism_name=job_context["organism_name"],
    )

    # Upload the result to S3
    timestamp = str(int(time.time()))
    key = job_context["organism_name"] + "_" + str(compendium_version) + "_" + timestamp + ".zip"
    uploaded_to_s3 = archive_computed_file.sync_to_s3(S3_COMPENDIA_BUCKET_NAME, key)

    if not uploaded_to_s3:
        raise utils.ProcessorJobError(
            "Failed to upload compendia to S3",
            success=False,
            computed_file_id=archive_computed_file.id,
        )

    if settings.RUNNING_IN_CLOUD:
        archive_computed_file.delete_local_file()

    job_context["result"] = result
    job_context["success"] = True

    log_state("end create result object", job_context["job"].id, result_start)

    # TEMPORARY for iterating on compendia more quickly.
    # Reset this so end_job does clean up the job's non-input-data stuff.
    job_context["work_dir"] = job_context["old_work_dir"]

    return job_context


def _run_tximport_for_experiment(
    job_context: Dict, experiment: Experiment, quant_files: List[ComputedFile]
) -> Dict:
    # Download all the quant.sf files for this experiment. Write all
    # their paths to a file so we can pass a path to that to
    # tximport.R rather than having to pass in one argument per
    # sample.
    tximport_path_list_file = job_context["work_dir"] + "tximport_inputs.txt"
    quant_file_paths = {}

    with open(tximport_path_list_file, "w") as input_list:
        for quant_file in quant_files:
            # We create a directory in the work directory for each (quant.sf) file, as
            # tximport assigns column names based on the parent directory name,
            # and we need those names so that we can reassociate them with the samples later.
            # e.g., a file with absolute_file_path: /processor_job_1/SRR123_output/quant.sf
            # downloads to: /processor_job_2/SRR123_output/quant.sf
            # So the result file has frame "SRR123_output", which we can associate with sample SRR123.
            sample_output = (
                job_context["work_dir"] + str(quant_file.absolute_file_path.split("/")[-2]) + "/"
            )
            os.makedirs(sample_output, exist_ok=True)

            quant_work_path = sample_output + quant_file.filename
            quant_file_path = quant_file.get_synced_file_path(path=quant_work_path)
            input_list.write(quant_file_path + "\n")
            quant_file_paths[quant_file_path] = os.stat(quant_file_path).st_size

    rds_filename = "txi_out.RDS"
    rds_file_path = job_context["work_dir"] + rds_filename
    tpm_filename = "gene_lengthScaledTPM.tsv"
    tpm_file_path = job_context["work_dir"] + tpm_filename

    result = ComputationalResult()
    cmd_tokens = [
        "/usr/bin/Rscript",
        "--vanilla",
        "/home/user/data_refinery_workers/processors/tximport.R",
        "--file_list",
        tximport_path_list_file,
        "--gene2txmap",
        job_context["genes_to_transcripts_path"],
        "--rds_file",
        rds_file_path,
        "--tpm_file",
        tpm_file_path,
    ]
    result.time_start = timezone.now()

    logger.debug(
        "Running tximport with: %s",
        str(cmd_tokens),
        processor_job=job_context["job_id"],
        experiment=experiment.id,
    )

    try:
        tximport_result = subprocess.run(cmd_tokens, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    except Exception as e:
        raise utils.ProcessorJobError(
            "Encountered error in R code while running tximport.R: {}".format(str(e)),
            success=False,
            experiment=experiment.id,
        )

    if tximport_result.returncode != 0:
        raise utils.ProcessorJobError(
            "Found non-zero exit code from R code while running tximport.R: {}".format(
                tximport_result.stderr.decode().strip()
            ),
            success=False,
            experiment=experiment.id,
            quant_files=quant_files,
            cmd_tokens=cmd_tokens,
            quant_file_paths=quant_file_paths,
        )

    result.time_end = timezone.now()
    result.commands.append(" ".join(cmd_tokens))
    result.is_ccdl = True
    try:
        processor_key = "TXIMPORT"
        result.processor = utils.find_processor(processor_key)
    except Exception as e:
        raise utils.ProcessorJobError(
            "Failed to set processor: {}".format(e), success=False, processor_key=processor_key
        )

    result.save()
    job_context["pipeline"].steps.append(result.id)

    rds_file = ComputedFile()
    rds_file.absolute_file_path = rds_file_path
    rds_file.filename = rds_filename
    rds_file.result = result
    rds_file.is_smashable = False
    rds_file.is_qc = False
    rds_file.is_public = True
    rds_file.calculate_sha1()
    rds_file.calculate_size()
    rds_file.save()
    job_context["computed_files"].append(rds_file)

    # Split the tximport result into smashable subfiles
    data = pd.read_csv(tpm_file_path, sep="\t", header=0, index_col=0)
    individual_files = []
    frames = np.split(data, len(data.columns), axis=1)
    for frame in frames:
        # Create a sample-specific TPM file.
        sample_file_name = frame.columns.values[0] + "_" + tpm_filename
        frame_path = os.path.join(job_context["work_dir"], sample_file_name)
        frame.to_csv(frame_path, sep="\t", encoding="utf-8")

        # The frame column header is based off of the path, which includes "_output".
        sample_accession_code = frame.columns.values[0].replace("_output", "")
        sample = Sample.objects.get(accession_code=sample_accession_code)

        computed_file = ComputedFile()
        computed_file.absolute_file_path = frame_path
        computed_file.filename = sample_file_name
        computed_file.result = result
        computed_file.is_smashable = True
        computed_file.is_qc = False
        computed_file.is_public = True
        computed_file.calculate_sha1()
        computed_file.calculate_size()
        computed_file.save()
        job_context["computed_files"].append(computed_file)
        job_context["smashable_files"].append(computed_file)

        SampleResultAssociation.objects.get_or_create(sample=sample, result=result)

        # Create an association with the RDS file.
        SampleComputedFileAssociation.objects.get_or_create(sample=sample, computed_file=rds_file)

        # Create an association with the TPM file.
        SampleComputedFileAssociation.objects.get_or_create(
            sample=sample, computed_file=computed_file
        )

        individual_files.append(computed_file)
        job_context["samples"].append(sample)

    # Salmon-processed samples aren't marked as is_processed
    # until they are fully tximported; this value sets that
    # for the end_job function.
    job_context["tximported"] = True
    job_context["individual_files"] = individual_files
    return job_context


def _populate_index_object(job_context: Dict) -> Dict:
    """Create the ComputationalResult, ComputedFile, and OrganismIndex for the built index."""
    result = ComputationalResult()
    result.commands.append(job_context["salmon_formatted_command"])
    try:
        processor_key = "TX_INDEX"
        result.processor = utils.find_processor(processor_key)
    except Exception as e:
        return utils.handle_processor_exception(job_context, processor_key, e)

    result.is_ccdl = True
    result.time_start = job_context["time_start"]
    result.time_end = job_context["time_end"]
    result.save()
    job_context["pipeline"].steps.append(result.id)

    computed_file = ComputedFile()
    computed_file.absolute_file_path = job_context["computed_archive"]
    computed_file.filename = os.path.split(job_context["computed_archive"])[-1]
    computed_file.calculate_sha1()
    computed_file.calculate_size()
    computed_file.result = result
    computed_file.is_smashable = False
    computed_file.is_qc = False
    computed_file.save()

    organism_object = Organism.get_object_for_name(job_context["organism_name"])
    index_object = OrganismIndex()
    index_object.organism = organism_object
    index_object.database_name = job_context["database_name"]
    index_object.release_version = job_context["assembly_version"]
    index_object.assembly_name = job_context["assembly_name"]
    index_object.salmon_version = job_context["salmon_version"]
    index_object.index_type = "TRANSCRIPTOME_" + job_context["length"].upper()
    # This is where the index will be extracted to.
    index_object.absolute_directory_path = (
        LOCAL_ROOT_DIR
        + "/TRANSCRIPTOME_INDEX/"
        + organism_object.name
        + "/"
        + job_context["length"]
    )
    index_object.result = result

    if S3_TRANSCRIPTOME_INDEX_BUCKET_NAME:
        logger.info(
            "Uploading %s %s to s3",
            job_context["organism_name"],
            job_context["length"],
            processor_job=job_context["job_id"],
        )
        timestamp = str(timezone.now().timestamp()).split(".")[0]
        s3_key = organism_object.name + "_" + index_object.index_type + "_" + timestamp + ".tar.gz"
        sync_result = computed_file.sync_to_s3(
            S3_TRANSCRIPTOME_INDEX_BUCKET_NAME, s3_key, public=True
        )
        if sync_result:
            computed_file.delete_local_file()
        else:
            computed_file.delete()
            raise utils.ProcessorJobError(
                "Failed to upload transcriptome index to S3",
                success=False,
                computed_file_id=computed_file.id,
            )
    else:
        logger.warn(
            "S3_TRANSCRIPTOME_INDEX_BUCKET_NAME not configured, therefore %s %s will not be uploaded.",
            job_context["organism_name"],
            job_context["length"],
            processor_job=job_context["job_id"],
        )

    index_object.save()

    # We uploaded the file ourselves since we wanted it to go to a
    # different bucket than end_job would put it in, therefore empty
    # this list so end_job doesn't try to upload it again.
    job_context["computed_files"] = []

    job_context["result"] = result
    job_context["computed_file"] = computed_file
    job_context["index"] = index_object

    # If there's not a long and a short index for this organism yet,
    # don't delete the input.
    # XXX: This will break once we introduce additional versions of these.
    short_indices = OrganismIndex.objects.filter(
        organism=organism_object,
        index_type="TRANSCRIPTOME_SHORT",
        release_version=job_context["assembly_version"],
    )
    long_indices = OrganismIndex.objects.filter(
        organism=organism_object,
        index_type="TRANSCRIPTOME_LONG",
        release_version=job_context["assembly_version"],
    )
    if short_indices.count() < 1 or long_indices.count() < 1:
        # utils.end_job deletes these, so remove them so it doesn't.
        job_context["original_files"] = []

    return job_context